aom-3.12.1/.clang-format

---
Language: Cpp
BasedOnStyle: Google
AllowShortCaseLabelsOnASingleLine: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
Cpp11BracedListStyle: false
DerivePointerAlignment: false
PointerAlignment: Right
SortIncludes: false

aom-3.12.1/.cmake-format.py

# Generated with cmake-format 0.5.1

# How wide to allow formatted cmake files
line_width = 80

# How many spaces to tab for indent
tab_size = 2

# If arglists are longer than this, break them always
max_subargs_per_line = 10

# If true, separate flow control names from their parentheses with a space
separate_ctrl_name_with_space = False

# If true, separate function names from parentheses with a space
separate_fn_name_with_space = False

# If a statement is wrapped to more than one line, then dangle the closing
# parenthesis on its own line
dangle_parens = False

# What character to use for bulleted lists
bullet_char = '*'

# What character to use as punctuation after numerals in an enumerated list
enum_char = '.'

# What style line endings to use in the output.
line_ending = u'unix'

# Format command names consistently as 'lower' or 'upper' case
command_case = u'lower'

# Format keywords consistently as 'lower' or 'upper' case
keyword_case = u'unchanged'

# Specify structure for custom cmake functions
additional_commands = {
    "foo": {
        "flags": ["BAR", "BAZ"],
        "kwargs": {
            "HEADERS": "*",
            "DEPENDS": "*",
            "SOURCES": "*"
        }
    }
}

# A list of command names which should always be wrapped
always_wrap = []

# Specify the order of wrapping algorithms during successive reflow attempts
algorithm_order = [0, 1, 2, 3, 4]

# If true, the argument lists which are known to be sortable will be sorted
# lexicographically
autosort = False

# enable comment markup parsing and reflow
enable_markup = True

# If comment markup is enabled, don't reflow the first comment block in
# each listfile. Use this to preserve formatting of your
# copyright/license statements.
first_comment_is_literal = True

# If comment markup is enabled, don't reflow any comment block which matches
# this (regex) pattern. Default is `None` (disabled).
literal_comment_pattern = None

# Regular expression to match preformat fences in comments
# default=r'^\s*([`~]{3}[`~]*)(.*)$'
fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$'

# Regular expression to match rulers in comments
# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'

# If true, emit the unicode byte-order mark (BOM) at the start of the file
emit_byteorder_mark = False

# If a comment line starts with at least this many consecutive hash characters,
# then don't lstrip() them off. This allows for lazy hash rulers where the first
# hash char is not separated by space
hashruler_min_length = 10

# If true, then insert a space between the first hash char and remaining hash
# chars in a hash ruler, and normalize its length to fill the column
canonicalize_hashrulers = True

# Specify the encoding of the input file. Defaults to utf-8.
input_encoding = u'utf-8'

# Specify the encoding of the output file. Defaults to utf-8.
Note that cmake # only claims to support utf-8 so be careful when using anything else output_encoding = u'utf-8' # A dictionary containing any per-command configuration overrides. Currently # only `command_case` is supported. per_command = {} aom-3.12.1/.gitattributes000066400000000000000000000007001477627663500152710ustar00rootroot00000000000000*.[chs] filter=fixtabswsp *.[ch]pp filter=fixtabswsp *.[ch]xx filter=fixtabswsp *.asm filter=fixtabswsp *.php filter=fixtabswsp *.pl filter=fixtabswsp *.sh filter=fixtabswsp *.txt filter=fixwsp [Mm]akefile filter=fixwsp *.mk filter=fixwsp *.rc -crlf *.ds[pw] -crlf *.bat -crlf *.mmp -crlf *.dpj -crlf *.pjt -crlf *.vcp -crlf *.inf -crlf aom-3.12.1/.gitignore000066400000000000000000000001101477627663500143610ustar00rootroot00000000000000tags cmbuild .DS_Store .cproject .idea .project .vscode .gitignore TAGS aom-3.12.1/.mailmap000066400000000000000000000133021477627663500140210ustar00rootroot00000000000000Aasaipriya Chandran Aasaipriya Chandran Aasaipriya C <100778@ittiam.com> Adrian Grange Adrian Grange Alexander Bokov Alexis Ballier Alpha Lam Andrey Norkin Angie Chiang Arild Fuldseth Arild Fuldseth Aℓex Converse Aℓex Converse Aasaipriya Chandran Aasaipriya Chandran Aasaipriya C <100778@ittiam.com> Apurve Pandey Apurve Kumar Pandey Apurve Pandey Bohan Li Changjun Yang Chi Yo Tsai Chi Yo Tsai Chm Damon Shen Daniele Castagna Deb Mukherjee Elliott Karpilovsky Emil Keyder Erik Niemeyer Frederic Barbier Fyodor Kyslov Grant Hsu Guillaume Martres Guillaume Martres Guillaume Martres Guillaume Martres Hangyu Kuang Hangyu Kuang Hui Su Iole Moccagatta Jacky Chen James Zern Jean-Marc Valin Jian Zhou Jim Bankoski Johann Koenig Johann Koenig Johann Koenig Johann Koenig Johann Koenig John Koleszar Joshua Litt Kyle Siefring Kyle Siefring Lin Zheng Logan Goldberg Lokeshwar Reddy B Luc Trudeau Luc Trudeau Marco Paniconi Marco Paniconi Michael Bebenita Michael Horowitz Mingliang Chen Monty Montgomery Mudassir Galaganath Narayan Kalaburgi Mudassir Galaganath Mudassir Galagnath Nathan E. Egge Nathan E. Egge Onur Guleryuz Pascal Massimino Pascal Massimino Paul Wilkins Peng Bin Peng Bin Peter de Rivaz Rachel Barker David Barker Ralph Giles Ralph Giles Remya Prakasan Roger Zhou Ronald S. Bultje Ryan Lei Ryan Lei Ryan Lei Sachin Kumar Garg Sai Deng Sami Pietilä Sarah Parker Susanna D'Souza Tamar Levy Tamar Levy Tero Rintaluoma Thomas Davies Thomas Timothy B. Terriberry Timothy B. Terriberry Timothy B. Terriberry Tim Terriberry Tom Finegan Tom Finegan Tristan Matthews Venkat Sanampudi Vignesh Venkatasubramanian Vitalii Dziumenko Wei-Ting Lin Wei-Ting Lin Wenyao Liu Will Bresnahan Yaowu Xu Yaowu Xu Yaowu Xu Yaowu Xu Yaowu Xu Yaowu Xu Zhipin Deng Zoe Liu aom-3.12.1/AUTHORS000066400000000000000000000302461477627663500134560ustar00rootroot00000000000000# This file is automatically generated from the git commit history # by tools/gen_authors.sh. 
Aamir Anis Aaron Watry Aasaipriya Chandran Abo Talib Mahfoodh Adrian Grange Ahmad Sharif Akshata Jadhav Alexander Bokov Alexander Voronov Aℓex Converse Alexis Ballier Alex Peterson Alok Ahuja Alpha Lam A.Mahfoodh Ami Fischman Andoni Morales Alastruey Andres Mejia Andrew Russell Andrey Norkin Angie Chiang Aniket Dhok Aniket Wanare Ankur Saxena Anupam Pandey Apurve Kumar Pandey Aras Pranckevicius Arild Fuldseth Aron Rosenberg Arpad Panyik Arun Singh Negi Athulya Raj Raji Mohini Attila Nagy Balaji Anandapadmanaban Bohan Li Brennan Shacklett Brion Vibber Bruno Berthier Casey Smalley Changjun Yang Charles 'Buck' Krasic Cheng Chen Cherma Rajan A Chethan Kumar R E Chi Yo Tsai Chm Christian Duvivier Christopher Degawa Cyril Concolato Dake He Damon Shen Dandan Ding Daniel Cheng Daniele Castagna Daniel Kang Daniel Max Valenzuela Danil Chapovalov David Major David Michael Barr David Turner Deb Mukherjee Deepa K G Denis Nikitin Di Chen Diksha Singh Dim Temp Dmitry Kovalev Dominic Symes Dragan Mrdjan Ed Baker Edward Hervey Ehsan Akhgari Elliott Karpilovsky Emil Keyder Erik Niemeyer Fabio Pedretti Fangwen Fu Fergus Simpson Frank Bossen Frank Galligan Frederic Barbier Fredrik Söderquist Fritz Koenig Fyodor Kyslov Gaute Strokkenes George Steed Gerda Zsejke More Geza Lore Ghislain MARY Giuseppe Scrivano Gordana Cmiljanovic Grant Hsu Guillaume Martres Guillermo Ballester Valor Hamsalekha S Hangyu Kuang Hanno Böck Hari Limaye Harish Mahendrakar Henrik Lundin Hien Ho Hirokazu Honda Hui Su Ilie Halip Ilya Brailovskiy Imdad Sardharwalla Iole Moccagatta Ivan Krasin Ivan Maltz Ivan Rosales Jacek Caban Jack Haughton Jacky Chen James Berry James Yu James Zern Jan Gerber Jan Kratochvil Janne Salonen Jayasanker J Jayashri Murugan Jean-Marc Valin Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau Jerome Jiang jerry Jia Jia Jian Zhou Jim Bankoski Jingning Han Joe Young Joey Parrish Johann Koenig John Koleszar Johnny Klonaris John Stark Jonathan Matthews Jonathan Wright Joshua Bleecher Snyder Joshua Litt Josh Verdejo Julia Robson Julio Barba Justin Clift Justin Lebar Katsuhisa Yuasa Kavi Ramamurthy KO Myung-Hun Konstantinos Margaritis Krishna Malladi Kwanghoon Son Kyle Siefring Larisa Markeeva Lauren Partin Lawrence Velázquez leolzhao Leon Kollar L. E. Segovia Lester Lu liang zhao Linfeng Zhang Link.Meng Lin Zheng Logan Goldberg Lokeshwar Reddy B Lou Quillio Luca Barbato Luca Versari Luc Trudeau Madhu Peringassery Krishnan Mahesh Madhav Makoto Kato Mans Rullgard Marco Paniconi Mark Horvath Mark Mentovai Mark Wachsler Martin Ettl Martin Storsjo Maryla Matthew Heaney Matthieu Vaudano Mattias Hansson Maxym Dmytrychenko Michael Bebenita Michael Horowitz Michael Kohler Michelle Findlay-Olynyk Mike Frysinger Mike Hommey Mikhal Shemer Minghai Shang Mingliang Chen Mirko Bonadei Monty Montgomery Morton Jonuschat Mudassir Galaganath Mufaddal Chakera Narayan Kalaburgi Nathan E. Egge Neeraj Gadgil Neha Mary Thomas Neil Birkbeck Nico Weber Nithya V S Ola Hugosson Oleg Nalivayko Onur Guleryuz Parag Salasakar Pascal Massimino Patrik Westin Paul Wilkins Pavel Frolov Pavol Rusnak Paweł Hajdan Peng Bin Pengchong Jin Peter Boström Peter de Rivaz Peter Kasting Philip Jägenstedt Philippe Antoine Pradeep Kumar Priit Laes Qiu Jianlin Rachel Barker Rafael Ávila de Espíndola Rafaël Carré Ralph Giles Ranjit Kumar Tulabandu Ravi Chaudhary Remya Prakasan Remy Foray Rob Bradford Robert-André Mauchin Robert Chin Roger Zhou Rohit Athavale Ronald S. 
Bultje Rostislav Pehlivanov Ruiling Song Rui Ueyama Ruoyu Zhong Rupert Swarbrick
Ryan Lei Ryan Overbeck Sachin Kumar Garg Sai Deng Salome Thirot Sami Boukortt
Sami Pietilä Samuel Thibault Samuthirika S Sarah Parker Sasi Inguva
Satheesh Kumar Satish Kumar Suman Scott Graham Scott LaVarnway Sean DuBois
Sean McGovern Sean Purser-Haskell Sebastien Alaiwan Sergey Kolomenkin
Sergey Ulanov S Hamsalekha Shimon Doodkin Shunyao Li SmilingWolf Soo-Chul Han
Stanislav Vitvitskyy Stefan Holmer Steinar Midtskogen Steve Lhomme
Suman Sunkara Susanna D'Souza Taekhyun Kim Takanori MATSUURA Takuto Ikuta
Tamar Levy Tao Bai Tarek AMARA Tarundeep Singh Tero Rintaluoma Thijs Vermeir
Thomas Daede Thomas Davies Thomas Tim Kopp Timothy B. Terriberry Timo Witte
Todd Nguyen Tom Anderson Tom Finegan Tristan Matthews Umang Saini Urvang Joshi
Venkat Sanampudi Victoria Zhislina Vignesh Venkatasubramanian Vikas Prasad
Vincent Rabaud Vishesh Vishnu Teja Manyam Vitalii Dziumenko Wan-Teh Chang
Wei-Ting Lin Wenyao Liu Will Bresnahan Xiaoqing Zhu Xing Jin Xin Zhao
Yannis Guyon Yaowu Xu Yeqing Wu Yi Luo Yingying Ma Yongzhe Wang Yuan Tong
Yu-Chen (Eric) Sun Yue Chen Yunqing Wang Yury Gitman Yushin Cho Zhaoliang Ma
Zhijie Yang Zhipin Deng Zoe Liu

aom-3.12.1/CHANGELOG

2025-04-11 v3.12.1
  This release includes several bug fixes. This release is ABI compatible
  with the last release.

  See https://aomedia.googlesource.com/aom/+log/v3.12.0..v3.12.1 for all the
  commits in this release.

  - Bug Fixes
    * b:396169342: Assertion `av1_is_subpelmv_in_range(&ms_params.mv_limits,
      start_mv)' failed.
    * b:401671154: typo in void init_src_params(...)
    * Coverity defect 323670: Uninitialized scalar variable in
      encode_with_and_without_superres()
    * cmake: bump minimum version to 3.16
    * cfl_ppc: fix subtract_average_vsx
    * Fix an incorrect index in av1_highbd_pixel_proj_error_neon

2025-02-10 v3.12.0
  This release includes new codec interfaces, compression efficiency and
  perceptual improvements, speedup and memory optimizations, and bug fixes.
  This release is ABI compatible with the last release.

  Five internal functions (aom_free, aom_malloc, aom_wb_bytes_written,
  aom_wb_write_bit, aom_wb_write_literal) that were exported by mistake are
  no longer exported from the libaom shared library. The removal of these
  internal functions from the ABI is a bug fix and does not break ABI
  compatibility.

  Acknowledgments: The image quality optimizations in the new tuning mode
  AOM_TUNE_IQ were originally developed for SVT-AV1-PSY by Cole Ogaard,
  Gianni Rosato, Julio Barba, and Zakaria Djebrouni.

  - New Features
    * New tuning mode AOM_TUNE_IQ (image quality) for the AOME_SET_TUNING
      codec control (--tune=iq) in all-intra mode. The feature detection
      macro AOM_HAVE_TUNE_IQ, if defined, indicates that AOM_TUNE_IQ is
      available. The image quality optimizations in AOM_TUNE_IQ were
      developed by using the SSIMULACRA 2 metric for guidance and validated
      with subjective visual quality checks.
    * New value 6 for the AV1E_SET_DELTAQ_MODE codec control (--deltaq-mode):
      use modulation for all intra using Variance Boost. Variance Boost is a
      variance adaptive quantization implementation that modulates qindex
      depending on the ratio of low-variance to high-variance 8x8 subblocks
      within a 64x64 superblock, as well as the actual variance of the
      subblocks themselves.
    * New value 3 for the AV1E_SET_ENABLE_CDEF codec control (--enable-cdef):
      Enable CDEF adaptively based on frame qindex.
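  The three controls above are ordinary aom_codec_control() settings. A
  minimal sketch of how they might be requested on an all-intra encoder
  follows; it assumes an already-initialized encoder context, and the helper
  name and call ordering are illustrative rather than taken from libaom
  itself.

    #include <aom/aom_codec.h>
    #include <aom/aom_encoder.h>
    #include <aom/aomcx.h>

    /* Sketch: request the v3.12.0 tuning/quantization controls on an
     * all-intra encoder. Assumes `codec` was initialized from a config
     * obtained with aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
     * AOM_USAGE_ALL_INTRA). */
    static aom_codec_err_t apply_v3_12_controls(aom_codec_ctx_t *codec) {
      aom_codec_err_t res = AOM_CODEC_OK;
    #if defined(AOM_HAVE_TUNE_IQ)
      /* --tune=iq: image-quality tuning (all-intra mode only). */
      res = aom_codec_control(codec, AOME_SET_TUNING, AOM_TUNE_IQ);
      if (res != AOM_CODEC_OK) return res;
    #endif
      /* --deltaq-mode=6: Variance Boost qindex modulation for all intra. */
      res = aom_codec_control(codec, AV1E_SET_DELTAQ_MODE, 6);
      if (res != AOM_CODEC_OK) return res;
      /* --enable-cdef=3: enable CDEF adaptively based on frame qindex. */
      return aom_codec_control(codec, AV1E_SET_ENABLE_CDEF, 3);
    }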
* In all-intra mode, the AOME_SET_SHARPNESS codec control now also sets the loop_filter_sharpness syntax element in the bitstream. Larger values increasingly reduce how much the filtering can change the sample values on block edges to favor perceived sharpness. * In all-intra mode, the default value of the AV1E_SET_QM_MIN codec control is decreased to 4, and the default value of the AV1E_SET_QM_MAX codec control is increased to 10. The default values in good-quality and realtime modes remain unchanged (5 and 9, respectively). - Compression Efficiency Improvements * Tuning mode AOM_TUNE_IQ improves image compression efficiency on the CLIC dataset by up to 12% for the same SSIMULACRA 2 score, up to 14% for the same DSSIM score, and up to 17% for the same Butteraugli score. * ~3% BD-rate gains for speed 11 VGA camera mode. * ~5% BD-rate gains for speed 11 on scroll clips screen mode. - Perceptual Quality Improvements * Adjust temporal filter strength for better visual quality. * RTC screen: visual quality improvements for scrolling and for scene/slide changes. * RTC camera mode: visual quality improvements for speed 11 VGA. - Speedup and Memory Optimizations * Optimize the Arm Neon implementation of the loop filter functions with an average uplift of 15 - 25% in microbenchmarks. * Add the CDEF optimization for RISC-V. * Help the compiler generate better vectorized code for variance calculation and warped motion in generic CPU builds. * Make several arrays const. - Other Improvements * Binary size reduction: 1 - 2% compared with last release, with CONFIG_REALTIME_ONLY enabled, CONFIG_AV1_DECODER and CONFIG_AV1_HIGHBITDEPTH disabled. * Build: compile source files in parallel under MSVC. - Bug Fixes * Fix bug where metadata added with aom_img_add_metadata was lost when frame scaling was used. * Bug b:383306740: RTC: Fix to issues with scrolling for screen content. * Bug b:382465458: RTC: Fix to artifact for grayscale input. * Bug b:380247338: RTC: Fix to encode_time spikes on scene/slide changes. * RTC: Fix to rate correction factor update for VBR screen mode. https://groups.google.com/a/aomedia.org/g/av1-discuss/c/nJxECdg-7P8 * Bug b:378401081: RTC: Fix to cyclic refresh update for external RC (rate control). 2024-10-24 v3.11.0 This release includes perceptual quality improvements, binary size reduction under certain configurations and many bug fixes. This release changes the default encoder configuration for the AOM_USAGE_REALTIME mode. This release is ABI compatible with the last release. - Perceptual Quality Improvements * Visual quality improvements for RTC screen content * Higher quality on scene or slide changes * Faster quality ramp-up for static content after scene change * Quality improvements for screen content with active maps - Speedup * Added and improved Neon SIMD paths for dynamic frame scaling with ~1.5% overall encoding speedup. - Other Improvements * Binary size reduction: 10% compared with last release, with CONFIG_REALTIME_ONLY enabled, CONFIG_AV1_DECODER and CONFIG_AV1_HIGHBITDEPTH disabled. 
    * Update default_extra_cfg for CONFIG_REALTIME_ONLY to provide proper
      RTC default settings
    * Change the default values of the following encoder config options in
      the AOM_USAGE_REALTIME mode:
      * rc_overshoot_pct and rc_undershoot_pct are changed from 25 to 50
      * rc_buf_sz is changed from 6000 to 1000
      * rc_buf_initial_sz is changed from 4000 to 600
      * rc_buf_optimal_sz is changed from 5000 to 600

  - Bug Fixes
    * aomedia:363016123: rtc: Fix setting of intra-only frame for
      set_ref_frame_config and add checks
    * aomedia:42302583: rtc: Fix for artifacts for screen with active_maps
    * b:365088425: rtc: Allow for lower-QP on scene/slide changes
    * b:367285382: Fix to encoder quality max-out too early for screen
    * b:362936830: rtc: Allow QP to decrease more aggressively for static
      content
    * b:361617762: Clamp the calculation of sb64_target_rate to INT_MAX
    * chromium:362123224: rtc-svc: Reset ref_map_idx for references not used
    * chromium:367892770: Fix to possible integer overflow in reset_rc
    * webrtc:369633254: rtc-svc: Fix to reset ref_idx for svc
    * Fix exit condition in rate correction update

2024-08-27 v3.10.0
  This release includes new codec interfaces, compression efficiency and
  perceptual improvements, speedup and memory optimizations and many bug
  fixes. This release is ABI compatible with the last release.

  The definitions of the internal macros AOM_INLINE and AOM_FORCE_INLINE
  have been removed from the public header aom/aom_integer.h.

  - New Features
    * New codec controls:
      * AV1E_SET_AUTO_TILES
      * AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC
      * AV1E_SET_POSTENCODE_DROP_RTC: Post encode frame drop feature.
      * AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR
    * New key-value pair for aom_codec_set_option():
      * "auto-tiles": equivalent to the new codec control AV1E_SET_AUTO_TILES.

  - Deprecated Features
    * Deprecated codec control:
      * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Use the new codec control
        AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR instead.
    * The sframe_mode field in the aom_codec_enc_cfg_t struct is not
      implemented.

  - Compression Efficiency Improvements
    * BD-rate gain of 0.7 - 1.3% (by enabling global motion tool) for speed 5
      and speed 6 with ~5% encode time increase.
    * RTC speed 11 video: ~3-5% BD-rate gain for VGA and QVGA.

  - Perceptual Quality Improvements
    * RTC quality improvements for slide changes and scrolling content.

  - Speedup and Memory Optimizations
    * RTC screen content speedups:
      * ~2x speedup for high motion content for speed 11.
      * ~2x speedup on key frame coding for speed >= 10.
    * Arm: Significant uplifts in speed in this release (vs v3.9.1) have come
      from tuning the various convolutions according to filter size (doing
      8-tap when only 2-tap is required is inefficient) and also deploying
      Armv8.6 USMMLA instructions in 6-tap and 12-tap standard bitdepth
      convolutions.
      * Standard bitdepth RTC:
        * speed 5: +5%
        * speed 6: +4%
        * speed 7: +5%
        * speed 8: +4%
        * speed 9: +6%
        * speed 10: +6%
      * Standard bitdepth VoD:
        * speed 0: +9%
        * speed 1: +12%
        * speed 2: +9%
        * speed 3: +3%
        * speed 4: +3%
        * speed 5: -9% (expected due to global motion changes)
        * speed 6: -3% (expected due to global motion changes)
      * High bitdepth VoD:
        * speed 0: +4%
        * speed 1: +19%
        * speed 2: +23%
        * speed 3: +1%
        * speed 4: +1%
        * speed 5: -8% (expected due to global motion changes)
        * speed 6: -3% (expected due to global motion changes)
      * Standard bitdepth 2x1 horizontal super-resolution/scaling encoding:
        +101%

  - Other Improvements
    * Reduce bit rate overshoot on slide content.

  - Bug Fixes
    * rtc: Bug fix for active_maps with sb_size=128.
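  A small configuration sketch tying together two items above: the
  AOM_USAGE_REALTIME rate-control buffer defaults changed in v3.11.0 and the
  "auto-tiles" key added in v3.10.0. The helper name and the choice to pin
  the pre-3.11.0 buffer values are illustrative assumptions; the struct
  fields and calls themselves are the standard libaom encoder interface.

    #include <aom/aom_codec.h>
    #include <aom/aom_encoder.h>
    #include <aom/aomcx.h>

    /* Sketch: start from the realtime defaults, then explicitly pin the
     * rate-control buffering fields whose defaults changed in v3.11.0 (the
     * values written here are the older, pre-3.11.0 defaults quoted in the
     * changelog, shown only to make the fields visible). */
    static aom_codec_err_t init_rtc_encoder(aom_codec_ctx_t *codec,
                                            unsigned int width,
                                            unsigned int height) {
      aom_codec_enc_cfg_t cfg;
      aom_codec_err_t res = aom_codec_enc_config_default(
          aom_codec_av1_cx(), &cfg, AOM_USAGE_REALTIME);
      if (res != AOM_CODEC_OK) return res;
      cfg.g_w = width;
      cfg.g_h = height;
      cfg.rc_overshoot_pct = 25;    /* v3.11.0 default: 50 */
      cfg.rc_undershoot_pct = 25;   /* v3.11.0 default: 50 */
      cfg.rc_buf_sz = 6000;         /* v3.11.0 default: 1000 */
      cfg.rc_buf_initial_sz = 4000; /* v3.11.0 default: 600 */
      cfg.rc_buf_optimal_sz = 5000; /* v3.11.0 default: 600 */
      res = aom_codec_enc_init(codec, aom_codec_av1_cx(), &cfg, 0);
      if (res != AOM_CODEC_OK) return res;
      /* v3.10.0: "auto-tiles" is equivalent to AV1E_SET_AUTO_TILES. */
      return aom_codec_set_option(codec, "auto-tiles", "1");
    }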
* b:343429036: rtc: Fix source_sad setting near boundary. * Fix to QP for temporal enhancement after key frame. * b:343429192: rtc: Condition QP adjustment on rc->q_1/2_frame > 0. 2024-06-07 v3.8.3 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.8.2..v3.8.3 for all the commits in this release. - Bug Fixes * aomedia:2754, aomedia:3567: Ensure thread stack size is at least 256 KB * aomedia:3382, chromium:339877165: update codec config after svc/scale controls (CVE-2024-5493) * aomedia:3561: libaom-3.8.2 armv7 Android build failed * aomedia:3580: Allow g_timebase.num to be greater than g_timebase.den * Arm SVE build fixes. * av1_block_error_lp_neon: fix block_size param type 2024-06-05 v3.9.1 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.9.0..v3.9.1 for all the commits in this release. - Bug Fixes * aomedia:2754, aomedia:3567: Ensure thread stack size is at least 256 KB * b:330639949, oss-fuzz:68195: Increase scaling in linsolve_wiener * Fix high target data rate overflow. * aomedia:3509: Fix two UBSan errors in av1_rc_update_framerate() * aomedia:3382, chromium:339877165: update codec config after svc/scale controls (CVE-2024-5493) * aomedia:3561: libaom-3.8.2 armv7 Android build failed * aomedia:3571: {,highbd_}intrapred_neon.c: Avoid over-reads in z1 and z3 preds * aomedia:3578: libaom-3.9.0 undefined reference to `aom_sub_pixel_variance16xh_ssse3' * aomedia:3579: Use round for RC calculations in cyclic_refresh * aomedia:3580: Allow g_timebase.num to be greater than g_timebase.den * oss-fuzz:68774: libaom:av1_dec_fuzzer: Segv on unknown address in od_ec_dec_init * Arm SVE build fixes. * av1_block_error_lp_neon: fix block_size param type * av1_block_error_lp_sve: fix block_size param type 2024-04-09 v3.9.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup for RTC for both video and screen content, and many bug fixes. This release is ABI compatible with the previous release. - New Features * New codec control * AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to only drop spatial layers or the whole superframe. * Active Map is fixed and tested for RTC. * CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when both are disabled. * libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4. - Compression Efficiency Improvements * RTC encoding improvements * 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate gain on scrolling content. - Perceptual Quality Improvements * For RTC screen content * Reduced color artifacts for RTC screen content * Visual quality improved for scene changes for SVC with quality layers. * Removed visual artifacts for speed 11 - Speedups: * RTC Speed 11: aggressive speedup setting added for video mode, resolutions <= VGA: ~30% faster than speed 10. * 5-9% speed up for high bit-depth encoding with good mode on Arm, half of which comes from SVE/SVE2 optimizations. - Other improvements * Further improvements to global motion estimation. * Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm. * Remove unneeded SIMD functions, saving >100 KiB from binary size. * Cleaned up and improved pattern_search. * Added end-to-end c vs SIMD bit-exactness test. 
* Added config flag to calc psnr using libvmaf peak: use a slightly different peak value for PSNR (1020 and 2040 for 10- and 12-bit) - Bug Fixes * Fuzzing bug fixes * b/329485898 Null-dereference WRITE in av1_cdef_frame_mt * b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16 * b/329813868 Ill in av1_cdef_frame_mt * chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row * b/330014723 Null-dereference WRITE in cdef_copy_rect8_16bit_to_16bit_avx2 * b/310455204 Null-dereference WRITE in prepare_enc_workers * b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2 * oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <= (pbi->output_frame_width_in_tiles_minus_1 + 1) * oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w * oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h * oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize * oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in od_ec_decode_bool_q15 * oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init * oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_normalize * oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in get_ls_tile_buffers * libaom library * aomedia:3510 Large value of duration could cause encoder overflow * chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx in Win ARM64 builds * aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after 59c592bb8 * aomedia:3531 Exception encountered with PSNR calculation * aomedia:3541 Can not compile correctly by CYGWIN * chromium:41482688 heap-buffer-overflow write in vpx_img_read() (tools_common.c) with VPX_IMG_FMT_NV12 * aomedia:3521 Assertion failures on Arm in CNNTest.* in av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon * aomedia:3486 C vs NEON mismatch in AV1 encoder * aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon() * aomedia:3276 Significant progress on ensuring all allocations are checked * aomedia:3491 heap-buffer-overflow encoding frames of size 256x256, 512x512 in good quality usage mode using 4 threads * aomedia:3322 PSNR number discrepancy * aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni * aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on aom/av1/encoder/motion_search_facade.c * aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case 2024-03-08 v3.8.2 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.8.1..v3.8.2 for all the commits in this release. - Bug Fixes * aomedia:3523: SIGFPE in av1_twopass_postencode_update() pass2_strategy.c:4261. * aomedia:3535, b/317646516: Over reads in aom_convolve_copy_neon(). * aomedia:3543: invalid feature modifier when compiling aom_dsp/arm/aom_convolve8_neon_i8mm.c on Debian 10 with arm64 architecture. * aomedia:3545: Failed to parse configurations due to inconsistent elements between two arrays "av1_ctrl_args" and "av1_arg_ctrl_map" in aomenc.c. * oss-fuzz:66474, b/319140742: Integer-overflow in search_wiener. * Zero initialize an array in cdef search. 2024-01-17 v3.8.1 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.8.0..v3.8.1 for all the commits in this release. - Bug Fixes * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' failed. 
* aomedia:3526: alloc_compressor_data() is called during every aom_codec_control() call on the encoder. * aomedia:3527: aom/av1/encoder/mcomp.c:1810: av1_full_pixel_search: Assertion `ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width' failed. * aomedia:3534: libaom encoder crashed by AOM_USAGE_ALL_INTRA and AOM_EFLAG_NO_REF_LAST flags. * b/310455204: Recreate workers if necessary. * b/310548198: Update frame size in actual encoding. * b/314858909: Do not use adaptive error estimate. * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. 2024-01-18 v3.7.2 This release includes three bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.7.1..v3.7.2 for all the commits in this release. - Bug Fixes * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' failed. * aomedia:3526: alloc_compressor_data() is called during every aom_codec_control() call on the encoder. Note that this partially reverts the fix for bug aomedia:3349. * b/310457427 and b/310766628: Only use rec_sse in CBR mode. * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. 2023-11-30 v3.8.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug fixes. This release is ABI compatible with the last release. - New Features * New codec controls: * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Set the maximum number of consecutive frame drops allowed for the frame dropper in 1 pass CBR mode. * Run-time CPU feature detection for all Arm platforms: CRC, DotProd, I8MM and SVE CPU feature presence is detected at run time and code paths making use of these features are selected dynamically. These code paths provide meaningful performance gains for standard bitdepth RTC and VoD encoding: up to 10% and 20% respectively, over the Armv8.0-A baseline build. * RTC: Frame-dropper support added to the rate control library. * RTC Rate control improvements for low bitrate and for SVC. - Compression Efficiency Improvements * Improved accuracy of cost estimation for loop restoration and global motion. * Improved selection of loop restoration unit size - full search up to (non-realtime) speed 2, retuned static selection at higher speeds. * RTC Screen content mode: 3-5% bdrate gains across speeds 7 - 10. * Good-quality mode: 0.2 - 0.5% bdrate gains across speeds 1 - 4. - Perceptual Quality Improvements * RTC Screen: Improved visual quality for scrolling. * RTC: Improved color quality for both screen and video mode. - Speedup and Memory Optimizations * Good-quality, single-thread encoder speedups: o 15% improvement for speed 5. o 12% improvement for speed 6. * Arm standard bitdepth VoD (--good): o 8% speedup for speeds 0 and 1. o 20% speedup for speed 2. o 27% speedup for speed 3. o 30% speedup for speed 4. o 38% speedup for speeds 5 and 6. * Arm high bitdepth VoD (--good): o 206% speedup for speeds 0 and 1. o 180% speedup for speed 2. o 51% speedup for speeds 3 and 4. o 68% speedup for speed 5. o 72% speedup for speed 6. * RTC Screen content: 2-6% speedup across speeds 7-10. * RTC: 2-3% speedup for temporal layers. * RTC: Speedups to reference scaling in nonrd pickmode. * Good-quality mode: Simplified global motion estimation, saving ~1200 lines of code and 1KB of tables while improving quality. - Bug Fixes * Fixes to improve libaom stability in case of memory allocation failures. * Fixes to SIMD functions (x86 AVX2/SSE2 and ARM Neon). 
* b/310457427, b/310766628: Bug fixes to only use rec_sse in CBR mode. 2023-11-17 v3.7.1 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the commits in this release. - Bug Fixes * aomedia:3349: heap overflow when increasing resolution * aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on aom/av1/encoder/motion_search_facade.c * aomedia:3489: Detect encoder and image high bit depth mismatch * aomedia:3491: heap-buffer-overflow on frame size change (CVE-2023-6879) * b/303023614: Segfault at encoding time for high bit depth images 2023-08-10 v3.7.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug fixes. This release is ABI compatible with the last release. - New Features * New codec controls: * AV1E_SET_QUANTIZER_ONE_PASS: Set quantizer for each frame. * AV1E_ENABLE_RATE_GUIDE_DELTAQ: enable the rate distribution guided delta quantization in all intra mode. The "enable-rate-guide-deltaq" option is added for this control. * AV1E_SET_RATE_DISTRIBUTION_INFO: set the input file for rate distribution used in all intra mode. The "rate-distribution-info" option is added for this control. * AV1E_GET_LUMA_CDEF_STRENGTH * AV1E_SET_BITRATE_ONE_PASS_CBR * AOM_SCALING_MODE is extended to include 2/3 and 1/3 scaling. * aom_tune_metric is extended to include AOM_TUNE_VMAF_SALIENCY_MAP. The "tune" option is extended to include "vmaf_saliency_map". * SVC example encoder svc_encoder_rtc is able to use the rate control library. * Loopfilter level and CDEF filter level is supported by RTC rate control library. * New speed (--cpu-used) 11, intended for RTC screen sharing, added for faster encoding with ~3% bdrate loss with 16% IC (instruction count) speedup compared to speed 10. - Compression Efficiency Improvements * Improved VoD encoding performance * 0.1-0.6% BDrate gains for encoding speeds 2 to 6 * Rate control accuracy improvement in VBR mode * RTC encoding improvements * Screen content mode: 10-19% BDrate gains for speeds 6 - 10 * Temporal layers video mode, for speed 10: * 2 temporal layers on low resolutions: 13-15% BDrate gain * 3 temporal layers on VGA/HD: 3-4% BDrate gain - Perceptual Quality Improvements * Fixed multiple block and color artifacts for RTC screen content by * Incorporating color into RD cost for IDTX * Reducing thresholds for palette mode in non RD mode * Allowing more palette mode testing * Improved color sensitivity for altref in non-RD mode. * Reduced video flickering for temporal layer encoding. - Speedup and Memory Optimizations * Speed up the VoD encoder * 2-5% for encoding speed 2 to 4 * 9-15% for encoding speed 5 to 6 * ARM * Standard bitdepth * speed 5: +31% * speed 4: +2% * speed 3: +9% * speed 2: +157% * High bitdepth * speed 5: +85% * RTC speedups * Screen content mode * 15% IC speedup for speeds 6-8 * ARM: 7% for speed 9, 3% for speed 10 * Temporal layers video mode * 7% speedup for 3 temporal layers on VGA/HD, for speed 10 * Single layer video * x86: 2% IC speedup for speeds 7-10 * ARM: 2-4% speedup across speeds 5-10 - Other improvements * VoD: Major improvements to global motion estimation, now enabled up to speed 4 * RTC * Fixes to make lossless coding work. * Fixes to make frame dropper (--drop_frames) work for single and temporal layers. * Improvements to RPS (reference picture selection) recovery frames. 
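  The speed 11 mode added in v3.7.0 above is selected through the ordinary
  --cpu-used control. A short sketch for an RTC screen-sharing encoder
  follows; pairing it with AV1E_SET_TUNE_CONTENT and AOM_CONTENT_SCREEN is
  this example's assumption (the changelog only states that speed 11 targets
  RTC screen sharing), and the helper name is made up for illustration.

    #include <aom/aom_codec.h>
    #include <aom/aomcx.h>

    /* Sketch: assumes `codec` is an encoder created with AOM_USAGE_REALTIME. */
    static aom_codec_err_t use_rtc_screen_speed11(aom_codec_ctx_t *codec) {
      aom_codec_err_t res =
          aom_codec_control(codec, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
      if (res != AOM_CODEC_OK) return res;
      /* Speed (--cpu-used) 11 is documented as intended for screen sharing. */
      return aom_codec_control(codec, AOME_SET_CPUUSED, 11);
    }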
* Improvements to rate control for temporal layers. * libwebm is updated to libwebm-1.0.0.29-9-g1930e3c - Bug Fixes * aomedia:3261 Assertion failed when encoding av1 with film grain and '--monochrome' flag * aomedia:3276 ensure all allocations are checked (partial fix) * aomedia:3451 The libaom library calls exit() * aomedia:3450 enable -Wshadow for C++ sources * aomedia:3449 Test Seg Faults After b459af3e345be402db052a143fcc5383d4b74cbd * aomedia:3416 prune unused symbols / restrict symbol visibility * aomedia:3443 Jenkins failure: UninstantiatedParameterizedTestSuite * aomedia:3434 realtime failures with CONFIG_BITSTREAM_DEBUG=1 * aomedia:3433 DeltaqModeTest crash w/row_mt=0 * aomedia:3429 Encoder crash when turn on both ExternalResize and g_threads > 2 * aomedia:3438 Build failure with `-DSANITIZE=address -DBUILD_SHARED_LIBS=ON` when using clang. * aomedia:3435 Block artifacts when scrolling with AV1 in screen sharing scenarios * aomedia:3170 vmaf tune presets produce extreme glitches in one scene * aomedia:3401 Building shared libaom with MSVC results in a race condition with the export library * aomedia:3420 Floating point exception in av1_tpl_get_frame_importance() * aomedia:3424 heap-buffer-overflow in ScaleFilterCols_16_C() (SIGABRT) * aomedia:3417 examples/svc_encoder_rtc.c is using internal macros and functions * aomedia:3372 SEGV in assign_frame_buffer_p av1_common_int.h * aomedia:3130 'cpu-features.h' file not found on Android NDK 22 * aomedia:3415 Encoder/decoder mismatch for svc_encoder_rtc running 1 SL 3 TL * aomedia:3412 Lossless Mode Fails Loopback Bit Test * aomedia:3409 The use of AV1_VAR_OFFS in av1/encoder/var_based_part.c is incorrect for high bit depths * aomedia:3403 test_libaom fails with error message "feenableexcept() failed" on Linux arm * aomedia:3370 Random color block at fast motion area * aomedia:3393 Assertion failure in av1_convolve_2d_sr_c() * aomedia:3392 Strong artifacting for high bit-depth real-time * aomedia:3376 aomenc --threads=10 --deltaq-mode=3 crashes after "Allintra: multi-threading of calculating differential contrast" * aomedia:3380 Crashes and ASan and TSan errors in deltaq-mode=3 multithreading code * chromium:1410766 heap-buffer-overflow in aom_yv12_copy_v_c * Cannot set level via AV1E_SET_TARGET_SEQ_LEVEL_IDX * Encoding failure due to the use of loop restoration with unintended use of lossless mode. * Signed integer overflow in scan_past_frames * Signed integer overflow in update_a_sep_sym * Flickering in AV1 1440p/2160p HDR transcodes * Fixed artifacts with screen share at encoder speed 10 * Fixed prediction setup for IDTX 2023-05-08 v3.6.1 This release includes several bug fixes. This release is ABI compatible with the last release. See https://aomedia.googlesource.com/aom/+log/v3.6.0..v3.6.1 for all the commits in this release. - Bug Fixes * aomedia:2871: Guard the support of the 7.x and 8.x levels for AV1 under the CONFIG_CWG_C013 config flag, and only output the 7.x and 8.x levels when explicitly requested. * aomedia:3382: Choose sb_size by ppi instead of svc. * aomedia:3384: Fix fullpel search limits. * aomedia:3388: Replace left shift of xq_active by multiplication. * aomedia:3389: Fix MV clamping in av1_mv_pred. * aomedia:3390: set_ld_layer_depth: cap max_layer_depth to MAX_ARF_LAYERS. * aomedia:3418: Fix MV clamping in av1_int_pro_motion_estimation. * aomedia:3429: Move lpf thread data init to lpf_pipeline_mt_init(). * b:266719111: Fix undefined behavior in Arm Neon code. * b:269840681: nonrd_opt: align scan tables. 
* rtc: Fix is_key_frame setting in variance partition. * Build: Fix build with clang-cl and Visual Studio. * Build: Fix module definition file for MinGW/MSYS. 2023-02-03 v3.6.0 This release includes compression efficiency and perceptual quality improvements, speedup and memory optimizations, and some new features. This release is ABI compatible with the last release. - New Features * New values 20-27 (corresponding to levels 7.0-7.3 and 8.0-8.3) for the encoder control AV1E_SET_TARGET_SEQ_LEVEL_IDX (note that the proposal to add the new levels are still in draft status). The original special value 24 (keep level stats only for level monitoring) is renumbered as 32. * New encoder control AV1E_SET_SKIP_POSTPROC_FILTERING to skip the application of post-processing filters on reconstructed frame in all intra mode. * New encoder option "kf-max-pyr-height": Maximum height of pyramid structure used for the GOP starting with a key frame (-1 to 5). * Make SVC work for screen content. * Rate control improvements to reduce frame-size spikes for screen content coding. * RISC-V architecture support with gcc toolchain. - Compression Efficiency Improvements * Peak compression efficiency in VOD setting is improved by 1%. * 0.7% - 2.2% RTC encoding BDrate gains for real time speed 8 to 10. * 15% RTC encoding BDrate gains for screen content speed 10. - Perceptual Quality Improvements * Resolved a visual quality issue that was reported for high resolution clips (2K) for speed 4 and above in VOD use case. * Visual quality improvements to screen content coding. * Quality improvements to temporal layer RTC coding. - Speedup and Memory Optimizations * RTC single-thread encoder speedup: o ~6% instruction count reduction for speed 5 and 6. o ~15% instruction count reduction for speed 7. o ~10% instruction count reduction for speed 8 to 10 (>=360p resolutions). * RTC multi-thread encoder speedup (beyond single-thread speedup): o 5-8% encode time reduction for speed 7 to 10. * RTC screen-content encoder speedup: o 11% instruction count reduction for speed 9 and 10 (>=720p resolutions). * ~5% reduction in heap memory requirements for RTC, speed 6 to 10. * AVIF: o 4-5% speedup for speed 9 in still-picture encoding mode. o 3-4% heap memory reduction in still-picture encoding mode for 360p-720p resolutions with multiple threads. - Bug Fixes * Added a workaround for an AV1 specification bug which makes TRANSLATION type global motion models unusable. * Fixed AddressSanitizer global-buffer-overflow errors in av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c. * Fixed AddressSanitizer heap-buffer-overflow error in av1_wiener_convolve_add_src_neon(). * chromium:1393384 Avoid scene detection on spatial resize. * aomedia:3308 Remove color artifacts under high motion. * aomedia:3310 Avoid out of memory failures with Visual Studio 2017, 2019, and 2022 for Win32 x86 builds. * aomedia:3346 Make SVC work properly for screen content. * aomedia:3348 Fix a bug where an uninitialized search_site is used. * aomedia:3365 Work around what seems like a Visual Studio 2022 compiler optimization bug. * aomedia:3369 Incorrect PSNR values reported by libaom for 12-bit encode. 2022-08-31 v3.5.0 This release is ABI compatible with the last one, including speedup and memory optimizations, and new APIs and features. - New Features * Support for frame parallel encode for larger number of threads. --fp-mt flag is available for all build configurations. 
* New codec control AV1E_GET_NUM_OPERATING_POINTS - Speedup and Memory Optimizations * Speed-up multithreaded encoding for good quality mode for larger number of threads through frame parallel encoding: o 30-34% encode time reduction for 1080p, 16 threads, 1x1 tile configuration (tile_rows x tile_columns) o 18-28% encode time reduction for 1080p, 16 threads, 2x4 tile configuration o 18-20% encode time reduction for 2160p, 32 threads, 2x4 tile configuration * 16-20% speed-up for speed=6 to 8 in still-picture encoding mode * 5-6% heap memory reduction for speed=6 to 10 in real-time encoding mode * Improvements to the speed for speed=7, 8 in real-time encoding mode * Improvements to the speed for speed=9, 10 in real-time screen encoding mode * Optimizations to improve multi-thread efficiency in real-time encoding mode * 10-15% speed up for SVC with temporal layers * SIMD optimizations: o Improve av1_quantize_fp_32x32_neon() 1.05x to 1.24x faster o Add aom_highbd_quantize_b{,_32x32,_64x64}_adaptive_neon() 3.15x to 5.6x faster than "C" o Improve av1_quantize_fp_64x64_neon() 1.17x to 1.66x faster o Add aom_quantize_b_avx2() 1.4x to 1.7x faster than aom_quantize_b_avx() o Add aom_quantize_b_32x32_avx2() 1.4x to 2.3x faster than aom_quantize_b_32x32_avx() o Add aom_quantize_b_64x64_avx2() 2.0x to 2.4x faster than aom_quantize_b_64x64_ssse3() o Add aom_highbd_quantize_b_32x32_avx2() 9.0x to 10.5x faster than aom_highbd_quantize_b_32x32_c() o Add aom_highbd_quantize_b_64x64_avx2() 7.3x to 9.7x faster than aom_highbd_quantize_b_64x64_c() o Improve aom_highbd_quantize_b_avx2() 1.07x to 1.20x faster o Improve av1_quantize_fp_avx2() 1.13x to 1.49x faster o Improve av1_quantize_fp_32x32_avx2() 1.07x to 1.54x faster o Improve av1_quantize_fp_64x64_avx2() 1.03x to 1.25x faster o Improve av1_quantize_lp_avx2() 1.07x to 1.16x faster - Bug fixes including but not limited to * aomedia:3206 Assert that skip_width > 0 for deconvolve function * aomedia:3278 row_mt enc: Delay top-right sync when intraBC is enabled * aomedia:3282 blend_a64_*_neon: fix bus error in armv7 * aomedia:3283 FRAME_PARALLEL: Propagate border size to all cpis * aomedia:3283 RESIZE_MODE: Fix incorrect strides being used for motion search * aomedia:3286 rtc-svc: Fix to dynamic_enable spatial layers * aomedia:3289 rtc-screen: Fix to skipping inter-mode test in nonrd * aomedia:3289 rtc-screen: Fix for skip newmv on flat blocks * aomedia:3299 Fix build failure with CONFIG_TUNE_VMAF=1 * aomedia:3296 Fix the conflict --enable-tx-size-search=0 with nonrd mode --enable-tx-size-search will be ignored in non-rd pick mode * aomedia:3304 Fix off-by-one error of max w/h in validate_config * aomedia:3306 Do not use pthread_setname_np on GNU/Hurd * aomedia:3325 row-multithreading produces invalid bitstream in some cases * chromium:1346938, chromium:1338114 * compiler_flags.cmake: fix flag detection w/cmake 3.17-3.18.2 * tools/*.py: update to python3 * aom_configure.cmake: detect PIE and set CONFIG_PIC * test/simd_cmp_impl: use explicit types w/CompareSimd* * rtc: Fix to disable segm for aq-mode=3 * rtc: Fix to color_sensitivity in variance partition * rtc-screen: Fix bsize in model rd computation for intra chroma * Fixes to ensure the correct behavior of the encoder algorithms (like segmentation, computation of statistics, etc.) 2022-06-17 v3.4.0 This release includes compression efficiency and perceptual quality improvements, speedup and memory optimizations, and some new features. There are no ABI or API breaking changes in this release. 
- New Features * New --dist-metric flag with "qm-psnr" value to use quantization matrices in the distortion computation for RD search. The default value is "psnr". * New command line option "--auto-intra-tools-off=1" to make all-intra encoding faster for high bit rate under "--deltaq-mode=3" mode. * New rate control library aom_av1_rc for real-time hardware encoders. Supports CBR for both one spatial layer and SVC. * New image format AOM_IMG_FMT_NV12 can be used as input to the encoder. The presence of AOM_IMG_FMT_NV12 can be detected at compile time by checking if the macro AOM_HAVE_IMG_FMT_NV12 is defined. * New codec controls for the encoder: o AV1E_SET_AUTO_INTRA_TOOLS_OFF. Only in effect if --deltaq-mode=3. o AV1E_SET_RTC_EXTERNAL_RC o AV1E_SET_FP_MT. Only supported if libaom is built with -DCONFIG_FRAME_PARALLEL_ENCODE=1. o AV1E_GET_TARGET_SEQ_LEVEL_IDX * New key-value pairs for the key-value API: o --auto-intra-tools-off=0 (default) or 1. Only in effect if --deltaq-mode=3. o --strict-level-conformance=0 (default) or 1 o --fp-mt=0 (default) or 1. Only supported if libaom is built with -DCONFIG_FRAME_PARALLEL_ENCODE=1. * New aomenc options (not supported by the key-value API): o --nv12 - Compression Efficiency Improvements * Correctly calculate SSE for high bitdepth in skip mode, 0.2% to 0.6% coding gain. * RTC at speed 9/10: BD-rate gain of ~4/5% * RTC screen content coding: many improvements for real-time screen at speed 10 (quality, speedup, and rate control), up to high resolutions (1080p). * RTC-SVC: fixes to make intra-only frames work for spatial layers. * RTC-SVC: quality improvements for temporal layers. * AV1 RT: A new passive rate control strategy for screen content, an average of 7.5% coding gain, with some clips of 20+%. The feature is turned off by default due to higher bit rate variation. - Perceptual Quality Improvements * RTC: Visual quality improvements for high speeds (9/10) * Improvements in coding quality for all intra mode - Speedup and Memory Optimizations * ~10% speedup in good quality mode encoding. * ~7% heap memory reduction in good quality encoding mode for speed 5 and 6. * Ongoing improvements to intra-frame encoding performance on Arm * Faster encoding speed for "--deltaq-mode=3" mode. * ~10% speedup for speed 5/6, ~15% speedup for speed 7/8, and ~10% speedup for speed 9/10 in real time encoding mode * ~20% heap memory reduction in still-picture encoding mode for 360p-720p resolutions with multiple threads * ~13% speedup for speed 6 and ~12% speedup for speed 9 in still-picture encoding mode. * Optimizations to improve multi-thread efficiency for still-picture encoding mode. 
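  The AOM_IMG_FMT_NV12 input format and its AOM_HAVE_IMG_FMT_NV12 feature
  macro from the v3.4.0 notes above can be probed at compile time. A minimal
  allocation sketch follows; the I420 fallback and the 32-byte alignment are
  arbitrary choices for this example, not libaom recommendations.

    #include <stddef.h>
    #include <aom/aom_image.h>

    /* Sketch: allocate an encoder input image, preferring NV12 when the
     * feature macro is defined and falling back to I420 otherwise. */
    static aom_image_t *alloc_input_image(unsigned int w, unsigned int h) {
    #if defined(AOM_HAVE_IMG_FMT_NV12)
      aom_img_fmt_t fmt = AOM_IMG_FMT_NV12;
    #else
      aom_img_fmt_t fmt = AOM_IMG_FMT_I420;
    #endif
      return aom_img_alloc(NULL, fmt, w, h, 32);
    }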
- Bug Fixes * b/204460717: README.md: replace master with main * b/210677928: libaom disable_order is surprising for max_reference_frames=3 * b/222461449: -DCONFIG_TUNE_BUTTERAUGLI=1 broken * b/227207606: write_greyscale writes incorrect chroma in highbd mode * b/229955363: Integer-overflow in linsolve_wiener * https://crbug.com/aomedia/2032 * https://crbug.com/aomedia/2397 * https://crbug.com/aomedia/2563 * https://crbug.com/aomedia/2815 * https://crbug.com/aomedia/3009 * https://crbug.com/aomedia/3018 * https://crbug.com/aomedia/3045 * https://crbug.com/aomedia/3101 * https://crbug.com/aomedia/3130 * https://crbug.com/aomedia/3173 * https://crbug.com/aomedia/3184 * https://crbug.com/aomedia/3187 * https://crbug.com/aomedia/3190 * https://crbug.com/aomedia/3195 * https://crbug.com/aomedia/3197 * https://crbug.com/aomedia/3201 * https://crbug.com/aomedia/3202 * https://crbug.com/aomedia/3204 * https://crbug.com/aomedia/3205 * https://crbug.com/aomedia/3207 * https://crbug.com/aomedia/3208 * https://crbug.com/aomedia/3209 * https://crbug.com/aomedia/3213 * https://crbug.com/aomedia/3214 * https://crbug.com/aomedia/3219 * https://crbug.com/aomedia/3222 * https://crbug.com/aomedia/3223 * https://crbug.com/aomedia/3225 * https://crbug.com/aomedia/3226 * https://crbug.com/aomedia/3228 * https://crbug.com/aomedia/3232 * https://crbug.com/aomedia/3236 * https://crbug.com/aomedia/3237 * https://crbug.com/aomedia/3238 * https://crbug.com/aomedia/3240 * https://crbug.com/aomedia/3243 * https://crbug.com/aomedia/3244 * https://crbug.com/aomedia/3246 * https://crbug.com/aomedia/3248 * https://crbug.com/aomedia/3250 * https://crbug.com/aomedia/3251 * https://crbug.com/aomedia/3252 * https://crbug.com/aomedia/3255 * https://crbug.com/aomedia/3257 * https://crbug.com/aomedia/3259 * https://crbug.com/aomedia/3260 * https://crbug.com/aomedia/3267 * https://crbug.com/aomedia/3268 * https://crbug.com/aomedia/3269 * https://crbug.com/aomedia/3276 * https://crbug.com/aomedia/3278 * https://crbug.com/chromium/1290068 * https://crbug.com/chromium/1303237 * https://crbug.com/chromium/1304990 * https://crbug.com/chromium/1321141 * https://crbug.com/chromium/1321388 * https://crbug.com/oss-fuzz/44846 * https://crbug.com/oss-fuzz/44856 * https://crbug.com/oss-fuzz/44862 * https://crbug.com/oss-fuzz/44904 * https://crbug.com/oss-fuzz/45056 2022-01-28 v3.3.0 This release includes compression efficiency and perceptual quality improvements, speedup and memory optimizations, some new features, and several bug fixes. - New Features * AV1 RT: Introducing CDEF search level 5 * Changed real time speed 4 to behave the same as real time speed 5 * Add --deltaq-strength * rtc: Allow scene-change and overshoot detection for svc * rtc: Intra-only frame for svc * AV1 RT: Option 2 for codec control AV1E_SET_ENABLE_CDEF to disable CDEF on non-ref frames * New codec controls AV1E_SET_LOOPFILTER_CONTROL and AOME_GET_LOOPFILTER_LEVEL * Improvements to three pass encoding - Compression Efficiency Improvements * Overall compression gains: 0.6% - Perceptual Quality Improvements * Improves the perceptual quality of high QP encoding for delta-q mode 4 * Auto select noise synthesis level for all intra - Speedup and Memory Optimizations * Added many SSE2 optimizations. 
* Good quality 2-pass encoder speedups: o Speed 2: 9% o Speed 3: 12.5% o Speed 4: 8% o Speed 5: 3% o Speed 6: 4% * Real time mode encoder speedups: o Speed 5: 2.6% BDRate gain, 4% speedup o Speed 6: 3.5% BDRate gain, 4% speedup o Speed 9: 1% BDRate gain, 3% speedup o Speed 10: 3% BDRate gain, neutral speedup * All intra encoding speedups (AVIF): o Single thread - speed 6: 8% o Single thread - speed 9: 15% o Multi thread(8) - speed 6: 14% o Multi thread(8) - speed 9: 34% - Bug Fixes * Issue 3163: Segmentation fault when using --enable-keyframe-filtering=2 * Issue 2436: Integer overflow in av1_warp_affine_c() * Issue 3226: armv7 build failure due to gcc-11 * Issue 3195: Bug report on libaom (AddressSanitizer: heap-buffer-overflow) * Issue 3191: Bug report on libaom (AddressSanitizer: SEGV on unknown address) * Issue 3176: Some SSE2/SADx4AvgTest.* tests fail on Windows * Issue 3175: Some SSE2/SADSkipTest.* tests fail on Windows 2021-10-13 v3.2.0 This release includes compression efficiency and perceptual quality improvements, speedup and memory optimizations, as well as some new features. - New Features * Introduced speeds 7, 8, and 9 for all intra mode. * Introduced speed 10 for real time mode. * Introduced an API that allows external partition decisions. * SVC: added support for compound prediction. * SVC: added support for fixed SVC modes. - Compression Efficiency Improvements * Intra-mode search improvement. * Improved real time (RT) mode BDrate savings by ~5% (RT speed 5) and ~12% (RT speed 6). The improvement was measured on the video conference set. * Improved real time mode for nonrd path (speed 7, 8, 9): BDrate gains of ~3-5%. * Rate control and RD adjustments based on ML research in VP9. Gains of ~0.5-1.0% for HD. - Perceptual Quality Improvements * Added a new mode --deltaq-mode=3 to improve perceptual quality based on a differential contrast model for still images. * Added a new mode --deltaq-mode=4 to improve perceptual quality based on user rated cq_level data set for still images. * Weighting of some intra mode and partition size choices to better manage and retain texture. - Speedup and Memory Optimizations * Further improved 2-pass good quality encoder speed: o Speed 2 speedup: 18% o Speed 3 speedup: 22% o Speed 4 speedup: 37% o Speed 5 speedup: 30% o Speed 6 speedup: 20% * Optimized the real time encoder (measured on the video conference set): o RT speed 5 speedup: 110% o RT speed 6 speedup: 77% - Bug Fixes * Issue 3069: Fix one-pass mode keyframe placement off-by-one error. * Issue 3156: Fix a bug in av1_quantize_lp AVX2 optimization. 2021-09-29 v3.1.3 This release includes several bug fixes. - Bug fixes: The following four cmake changes should help the people building libaom using MSVC. 1. exports: use CMAKE_SHARED_LIBRARY_PREFIX to determine lib name https://aomedia-review.googlesource.com/c/aom/+/142342 2. aom_install: Install lib dlls to bindir https://aomedia-review.googlesource.com/c/aom/+/146546 3. aom_install: use relpath for install https://aomedia-review.googlesource.com/c/aom/+/146550 4. 
aom_install: don't exclude msvc from install https://aomedia-review.googlesource.com/c/aom/+/146547 aom/aom_encoder.h: remove configure option reference https://aomedia-review.googlesource.com/c/aom/+/146743 Issue 3113: Tests for detecting chroma subsampling in av1_copy_and_extend_frame() do not work when y_width or y_height is 1 Issue 3115: image2yuvconfig() should calculate uv_crop_width and uv_crop_height from y_crop_width and y_crop_height Issue 3140: rc_overshoot_pct is documented as having a range of 0-1000, but is range checked against 0-100 Issue 3147: Build failure on Apple M1 arm64 2021-07-20 v3.1.2 This release includes several bug fixes. - Bug fixes: exports.cmake: use APPLE and WIN32 and use def for mingw-w64 https://aomedia-review.googlesource.com/c/aom/+/139882 Issue 2993: Incorrect spatial_id when decoding base layer of multi-layer stream Issue 3080: Chroma Resampling by Encoder on Y4M Inputs Files Tagged as C420mpeg2 Issue 3081: Use of uninitialized value $version_extra in concatenation (.) or string at aom/build/cmake/version.pl line 88. 2021-06-08 v3.1.1 This release includes several bug fixes. - Bug fixes: Issue 2965: Cherry-picked the following four commits for the tune=butteraugli mode. 1. Add libjxl to pkg_config if enabled: https://aomedia-review.googlesource.com/c/aom/+/136044 2. Declare set_mb_butteraugli_rdmult_scaling static: https://aomedia-review.googlesource.com/c/aom/+/134506 3. Add color range detection in tune=butteraugli mode: https://aomedia-review.googlesource.com/c/aom/+/135521 4. Enable tune=butteraugli in all-intra mode: https://aomedia-review.googlesource.com/c/aom/+/136082 Issue 3021: Fix vmaf model initialization error when not set to tune=vmaf Issue 3050: Compilation fails with -DCONFIG_TUNE_VMAF=1 Issue 3054: Consistent crash on near-static screen content, keyframe related 2021-05-03 v3.1.0 This release adds an "all intra" mode to the encoder, which significantly speeds up the encoding of AVIF still images at speed 6. - Upgrading: All intra mode for encoding AVIF still images and AV1 all intra videos: AOM_USAGE_ALL_INTRA (2) can be passed as the 'usage' argument to aom_codec_enc_config_default(). New encoder control IDs added: - AV1E_SET_ENABLE_DIAGONAL_INTRA: Enable diagonal (D45 to D203) intra prediction modes (0: false, 1: true (default)). Also available as "enable-diagonal-intra" for the aom_codec_set_option() function. New aom_tune_metric enum value: AOM_TUNE_BUTTERAUGLI. The new aomenc option --tune=butteraugli was added to optimize the encoder's perceptual quality by optimizing the Butteraugli metric. Install libjxl (JPEG XL) and then pass -DCONFIG_TUNE_BUTTERAUGLI=1 to the cmake command to enable it. Addition of support for libvmaf 2.x. - Enhancements: Heap memory consumption for encoding AVIF still images is significantly reduced. - Bug fixes: Issue 2601: third_party/libaom fails licensecheck Issue 2950: Conditional expression for rc->this_key_frame_forced is always true in find_next_key_frame() Issue 2988: "make install" installs the aom.h header twice Issue 2992: Incorrectly printing the temporal_id twice in dump_obu tool Issue 2998: Issue 2999: Issue 3000: 2021-02-24 v3.0.0 This release includes compression efficiency improvement, speed improvement for realtime mode, as well as some new APIs. - Upgrading: Support for PSNR calculation based on stream bit-depth. 
New encoder control IDs added: - AV1E_SET_ENABLE_RECT_TX - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP - AV1E_GET_BASELINE_GF_INTERVAL - AV1E_SET_ENABLE_DNL_DENOISING New decoder control IDs added: - AOMD_GET_FWD_KF_PRESENT - AOMD_GET_FRAME_FLAGS - AOMD_GET_ALTREF_PRESENT - AOMD_GET_TILE_INFO - AOMD_GET_SCREEN_CONTENT_TOOLS_INFO - AOMD_GET_STILL_PICTURE - AOMD_GET_SB_SIZE - AOMD_GET_SHOW_EXISTING_FRAME_FLAG - AOMD_GET_S_FRAME_INFO New aom_tune_content enum value: AOM_CONTENT_FILM New aom_tune_metric enum value: AOM_TUNE_VMAF_NEG_MAX_GAIN Coefficient and mode update can be turned off via AV1E_SET_{COEFF/MODE}_COST_UPD_FREQ. New key & value API added, available with aom_codec_set_option() function. Scaling API expanded to include 1/4, 3/4 and 1/8. - Enhancements: Better multithreading performance with realtime mode. New speed 9 setting for faster realtime encoding. Smaller binary size with low bitdepth and realtime only build. Temporal denoiser and its optimizations on x86 and Neon. Optimizations for scaling. Faster encoding with speed settings 2 to 6 for good encoding mode. Improved documentation throughout the library, with function level documentation, tree view and support for the dot tool. - Bug fixes: Aside from those mentioned in v2.0.1 and v2.0.2, this release includes the following bug fixes: Issue 2940: Segfault when encoding with --use-16bit-internal and --limit > 1 Issue 2941: Decoder mismatch with --rt --bit-depth=10 and --cpu-used=8 Issue 2895: mingw-w64 i686 gcc fails to build Issue 2874: Separate ssse3 functions from sse2 file. 2021-02-09 v2.0.2 This release includes several bug fixes. - Bug fixes: Issue 2643: Modify the assertion in temporal filter intrinsics. Issue 2648: Fix unit test ThreadTestLarge.EncoderResultTest/49 assertion failure. Issue 2869: Add -Wimplicit-function-declaration as C flag only. Issue 2878: Avoid memset in the av1_filter_intra_predictor module functions. Issue 2903: Fix a typo bug in apply_temporal_filter_planewise. Call av1_setup_frame_size() when dropping a frame in the encode_frame_to_data_rate() function in av1/encoder/encoder.c. 2020-11-25 v2.0.1 This release includes two bug fixes. - Bug fixes: Issue 2723: Fix crash in chroma_check() when generating a monochrome encoded stream in real-time mode. Issue 2833: Fix crash on some input when reduced still picture header is used in real-time mode and speed >=7. 2020-05-07 v2.0.0 "Applejack" First official release of libaom. This release includes new real-time mode and SVC support. - Upgrading: AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are removed. AOM_SET_DBG_* is removed. Multi-resolution encoding is removed. put_frame and put_slice callbacks are removed. - Enhancements: Full-sweep document update for codec controls. 2018-06-28 v1.0.0 AOMedia Codec Workgroup Approved version 1.0 2016-04-07 v0.1.0 "AOMedia Codec 1" This release is the first Alliance for Open Media codec. aom-3.12.1/CMakeLists.txt000066400000000000000000001252471477627663500151540ustar00rootroot00000000000000# # Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# cmake_minimum_required(VERSION 3.16) set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}") message( FATAL_ERROR "Building from within the aom source tree is not supported.\n" "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n" "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n" "And re-run CMake from the aom_build directory.") endif() project(AOM C CXX) # GENERATED source property global visibility. if(POLICY CMP0118) cmake_policy(SET CMP0118 NEW) endif() if(NOT EMSCRIPTEN) if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" FORCE) endif() endif() if(MSVC AND MSVC_VERSION LESS 1920) message( WARNING "MSVC versions prior to 2019 (v16) are not supported and may generate" " incorrect code!") endif() # Library version info. Update LT_CURRENT, LT_REVISION and LT_AGE when making a # public release by following the guidelines in the libtool document: # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info # # c=, r=, a= # # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is # passed to libtool. # # We set SO_FILE_VERSION = [c-a].a.r set(LT_CURRENT 15) set(LT_REVISION 1) set(LT_AGE 12) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") unset(LT_CURRENT) unset(LT_REVISION) unset(LT_AGE) # Enable generators like Xcode and Visual Studio to place projects in folders. set_property(GLOBAL PROPERTY USE_FOLDERS TRUE) include("${AOM_ROOT}/build/cmake/aom_configure.cmake") if(CONFIG_THREE_PASS) include("${AOM_ROOT}/common/ivf_dec.cmake") endif() include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake") include("${AOM_ROOT}/aom_mem/aom_mem.cmake") include("${AOM_ROOT}/aom_ports/aom_ports.cmake") include("${AOM_ROOT}/aom_scale/aom_scale.cmake") include("${AOM_ROOT}/aom_util/aom_util.cmake") include("${AOM_ROOT}/av1/av1.cmake") include("${AOM_ROOT}/build/cmake/aom_install.cmake") include("${AOM_ROOT}/build/cmake/sanitizers.cmake") include("${AOM_ROOT}/build/cmake/util.cmake") include("${AOM_ROOT}/test/test.cmake") list(APPEND AOM_RTCD_SOURCES "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" "${AOM_CONFIG_DIR}/config/av1_rtcd.h" "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" "${AOM_ROOT}/av1/common/av1_rtcd.c" "${AOM_ROOT}/build/cmake/rtcd.pl") list(APPEND AOM_LIBWEBM_SOURCES "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc" "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h" "${AOM_ROOT}/third_party/libwebm/common/webmids.h" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc" "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h" "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc" "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h" "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc" "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h") list(APPEND AOM_LIBYUV_SOURCES 
"${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h" "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h" "${AOM_ROOT}/third_party/libyuv/source/convert_argb.cc" "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc" "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc" "${AOM_ROOT}/third_party/libyuv/source/row_any.cc" "${AOM_ROOT}/third_party/libyuv/source/row_common.cc" "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc" "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc" "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc" "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc" "${AOM_ROOT}/third_party/libyuv/source/row_win.cc" "${AOM_ROOT}/third_party/libyuv/source/scale.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc" "${AOM_ROOT}/third_party/libyuv/source/scale_uv.cc") list(APPEND AOM_SOURCES "${AOM_CONFIG_DIR}/config/aom_config.c" "${AOM_CONFIG_DIR}/config/aom_config.h" "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_decoder.h" "${AOM_ROOT}/aom/aom_encoder.h" "${AOM_ROOT}/aom/aom_external_partition.h" "${AOM_ROOT}/aom/aom_frame_buffer.h" "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h" "${AOM_ROOT}/aom/aomcx.h" "${AOM_ROOT}/aom/aomdx.h" "${AOM_ROOT}/aom/internal/aom_codec_internal.h" "${AOM_ROOT}/aom/internal/aom_image_internal.h" "${AOM_ROOT}/aom/src/aom_codec.c" "${AOM_ROOT}/aom/src/aom_decoder.c" "${AOM_ROOT}/aom/src/aom_encoder.c" "${AOM_ROOT}/aom/src/aom_image.c" "${AOM_ROOT}/aom/src/aom_integer.c") list(APPEND AOM_COMMON_APP_UTIL_SOURCES "${AOM_ROOT}/av1/arg_defs.c" "${AOM_ROOT}/av1/arg_defs.h" "${AOM_ROOT}/common/args_helper.c" "${AOM_ROOT}/common/args_helper.h" "${AOM_ROOT}/common/args.c" "${AOM_ROOT}/common/args.h" "${AOM_ROOT}/common/av1_config.c" "${AOM_ROOT}/common/av1_config.h" "${AOM_ROOT}/common/md5_utils.c" "${AOM_ROOT}/common/md5_utils.h" "${AOM_ROOT}/common/tools_common.c" "${AOM_ROOT}/common/tools_common.h" "${AOM_ROOT}/common/video_common.h" "${AOM_ROOT}/common/rawenc.c" "${AOM_ROOT}/common/rawenc.h" "${AOM_ROOT}/common/y4menc.c" "${AOM_ROOT}/common/y4menc.h" "${AOM_ROOT}/common/ivfdec.c" "${AOM_ROOT}/common/ivfdec.h") list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/obudec.c" "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c" "${AOM_ROOT}/common/video_reader.h") list(APPEND AOM_ENCODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfenc.c" "${AOM_ROOT}/common/ivfenc.h" "${AOM_ROOT}/common/video_writer.c" "${AOM_ROOT}/common/video_writer.h" "${AOM_ROOT}/common/warnings.c" "${AOM_ROOT}/common/warnings.h" "${AOM_ROOT}/common/y4minput.c" "${AOM_ROOT}/common/y4minput.h" "${AOM_ROOT}/examples/encoder_util.h" "${AOM_ROOT}/examples/encoder_util.c" "${AOM_ROOT}/examples/multilayer_metadata.h" 
"${AOM_ROOT}/examples/multilayer_metadata.cc") list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" "${AOM_ROOT}/stats/rate_hist.h") list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h") list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc" "${AOM_ROOT}/common/webmdec.h") list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc" "${AOM_ROOT}/common/webmenc.h") include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats) # Targets add_library(aom_version ${AOM_VERSION_SOURCES}) add_no_op_source_file_to_target(aom_version c) add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h" COMMAND ${CMAKE_COMMAND} ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -DGIT_EXECUTABLE=${GIT_EXECUTABLE} -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P "${AOM_ROOT}/build/cmake/version.cmake" COMMENT "Writing aom_version.h" VERBATIM) add_custom_target(aom_version_check COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -DGIT_EXECUTABLE=${GIT_EXECUTABLE} -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P "${AOM_ROOT}/build/cmake/version.cmake" COMMENT "Updating version info if necessary." VERBATIM) if(BUILD_SHARED_LIBS AND NOT MSVC) # Generate version file immediately for non-MSVC shared builds: The version # string is needed for the aom target. execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -DGIT_EXECUTABLE=${GIT_EXECUTABLE} -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P "${AOM_ROOT}/build/cmake/version.cmake") endif() add_dependencies(aom_version aom_version_check) # TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd # source. add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd") add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd") add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" "${AOM_CONFIG_DIR}/config/av1_rtcd.h" "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd") add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES}) add_dependencies(aom_rtcd aom_version) if(ENABLE_EXAMPLES) add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats) endif() # Xcode generator cannot take a library composed solely of objects. See # https://gitlab.kitware.com/cmake/cmake/-/issues/17500 if(XCODE) set(target_objs_aom ${AOM_SOURCES}) else() add_library(aom_obj OBJECT ${AOM_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_obj) set(target_objs_aom $) endif() add_library(aom ${target_objs_aom} $) if(BUILD_SHARED_LIBS) add_library(aom_static STATIC ${target_objs_aom} $) set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom) if(MSVC OR (WIN32 AND NOT MINGW)) # Fix race condition between the import library and the static library. # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter # sets MSVC and MINGW both to FALSE). set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll") endif() if(NOT MSVC) # Extract version string and set VERSION/SOVERSION for the aom target. extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version_triple) # Strip any trailing version information, if present. 
string(FIND "${aom_version_triple}" "-" dash_pos) if(NOT dash_pos EQUAL -1) string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple) endif() # cmake-format: off # VERSION is embedded in the .so file name. # libaom.so -> libaom.so.SOVERSION # libaom.so.SOVERSION -> libaom.so.VERSION # libaom.so.VERSION # cmake-format: on set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION}) set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION}) endif() endif() if(NOT WIN32 AND NOT APPLE) target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m) if(BUILD_SHARED_LIBS) target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m) endif() endif() if(CONFIG_AV1_ENCODER) list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h" "${AOM_ROOT}/av1/ratectrl_rtc.cc") add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES}) # aom_av1_rc calls libaom's internal functions, so it must be linked with the # libaom static library. if(BUILD_SHARED_LIBS) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom_static) # TODO: https://aomedia.issues.chromium.org/391715078 - This condition can # be removed after aom_av1_rc restricts its symbol visibility. if(CYGWIN OR MINGW) target_link_options(aom_av1_rc ${AOM_LIB_LINK_TYPE} LINKER:--allow-multiple-definition) endif() else() target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) endif() if(BUILD_SHARED_LIBS) # On Windows, global symbols are not exported from a DLL by default. Enable # the WINDOWS_EXPORT_ALL_SYMBOLS property to export all global symbols from # the aom_av1_rc DLL on Windows, to match the default behavior on other # platforms. set_target_properties(aom_av1_rc PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) # The aom_av1_rc library and its header "av1/ratectrl_rtc.h" are not # installed by the "install" command, so we don't need to worry about # versioning the aom_av1_rc shared library. If we start to install the # aom_av1_rc library, the library should be versioned. endif() if(NOT WIN32 AND NOT APPLE) target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m) endif() set_target_properties(aom_av1_rc PROPERTIES LINKER_LANGUAGE CXX) endif() # List of object and static library targets. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) if(CONFIG_AV1_ENCODER) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc) endif() if(BUILD_SHARED_LIBS) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static) endif() # Setup dependencies. if(CONFIG_THREE_PASS) setup_ivf_dec_targets() endif() setup_aom_dsp_targets() setup_aom_mem_targets() setup_aom_ports_targets() setup_aom_util_targets() setup_aom_scale_targets() setup_av1_targets() # Make all library targets depend on aom_rtcd to make sure it builds first. foreach(aom_lib ${AOM_LIB_TARGETS}) if(NOT "${aom_lib}" STREQUAL "aom_rtcd") add_dependencies(${aom_lib} aom_rtcd) endif() endforeach() # Generate a C file containing the function usage_exit(). Users of the # aom_common_app_util library must define this function. This is a convenience # to allow omission of the function from applications that might want to use # other pieces of the util support without defining usage_exit(). file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "#include \n\n#include \"common/tools_common.h\"\n\n" "void usage_exit(void) { exit(EXIT_FAILURE); }\n") # # Application and application support targets. 
# if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) add_library(aom_usage_exit OBJECT "${AOM_GEN_SRC_DIR}/usage_exit.c") set_property(TARGET ${example} PROPERTY FOLDER examples) if(CONFIG_AV1_DECODER) add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) set_property(TARGET ${example} PROPERTY FOLDER examples) # obudec depends on internal headers that require *rtcd.h add_dependencies(aom_decoder_app_util aom_rtcd) endif() if(CONFIG_AV1_ENCODER) add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES}) set_property(TARGET ${example} PROPERTY FOLDER examples) endif() endif() if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c" $ $) add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c" $ $) add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c" $ $) add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c" $ $) add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c" $ $) if(CONFIG_ANALYZER) add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc" $ $) target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES}) list(APPEND AOM_APP_TARGETS analyzer) list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer) endif() if(CONFIG_INSPECTION) add_executable(inspect "${AOM_ROOT}/examples/inspect.c" $ $) list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect) if(EMSCRIPTEN) add_preproc_definition(_POSIX_SOURCE) append_link_flag_to_target("inspect" "--emrun") append_link_flag_to_target("inspect" "-s USE_PTHREADS=0") append_link_flag_to_target("inspect" "-s WASM=1") append_link_flag_to_target("inspect" "-s MODULARIZE=1") append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1") append_link_flag_to_target( "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'") append_link_flag_to_target("inspect" "-s EXPORT_NAME=\"\'DecoderModule\'\"") append_link_flag_to_target("inspect" "--memory-init-file 0") if("${CMAKE_BUILD_TYPE}" STREQUAL "") # Default to -O3 when no build type is specified. append_compiler_flag("-O3") endif() em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js") endif() endif() # Maintain a list of decoder example targets. list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops scalable_decoder simple_decoder) # Add decoder examples to the app targets list. list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}) endif() if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI) add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) if(NOT MSVC) target_compile_options(yuv PRIVATE -Wno-shadow) # Many functions in libyuv trigger this warning when enabled with gcc and # clang. is_flag_present(AOM_CXX_FLAGS "-Wmissing-declarations" flag_present) if(flag_present) target_compile_options(yuv PRIVATE -Wno-missing-declarations) endif() # Many functions in libyuv trigger this warning when enabled with clang. 
is_flag_present(AOM_CXX_FLAGS "-Wmissing-prototypes" flag_present) if(flag_present) target_compile_options(yuv PRIVATE -Wno-missing-prototypes) endif() endif() include_directories("${AOM_ROOT}/third_party/libyuv/include") endif() if(CONFIG_AV1_ENCODER) if(ENABLE_EXAMPLES) add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c" $ $ $) add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c" $ $) add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c" $ $) add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c" $ $) add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c" $ $) if(NOT BUILD_SHARED_LIBS AND NOT CONFIG_REALTIME_ONLY) add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c" $ $) add_executable(photon_noise_table "${AOM_ROOT}/examples/photon_noise_table.c" $ $) endif() add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" $ $) add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" $ $) target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) # Maintain a list of encoder example targets. list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder set_maps simple_encoder scalable_encoder svc_encoder_rtc twopass_encoder) if(NOT BUILD_SHARED_LIBS AND NOT CONFIG_REALTIME_ONLY) list(APPEND AOM_ENCODER_EXAMPLE_TARGETS noise_model photon_noise_table) endif() endif() if(ENABLE_TOOLS) if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS) # TODO(tomfinegan): Sort out why a simple link command with # aom_entropy_optimizer.c won't work on macos, but dragging in all the # helper machinery allows the link to succeed. add_executable(aom_entropy_optimizer "${AOM_ROOT}/tools/aom_entropy_optimizer.c" $ $ $) # Maintain a list of encoder tool targets. list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) endif() endif() # Add encoder examples and tools to the targets list. list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS} ${AOM_ENCODER_TOOL_TARGETS}) if(CONFIG_TUNE_BUTTERAUGLI) find_package(PkgConfig) # Use find_library() with STATIC_LINK_JXL for static build since # pkg_check_modules() with LIBJXL_STATIC is not working. 
if(STATIC_LINK_JXL OR NOT PKG_CONFIG_FOUND) find_library(LIBJXL_LIBRARIES libjxl.a) find_library(LIBHWY_LIBRARIES libhwy.a) find_library(LIBSKCMS_LIBRARIES libskcms.a) find_library(LIBBROTLICOMMON_LIBRARIES libbrotlicommon-static.a) find_library(LIBBROTLIENC_LIBRARIES libbrotlienc-static.a) find_library(LIBBROTLIDEC_LIBRARIES libbrotlidec-static.a) find_path(LIBJXL_INCLUDE_DIRS butteraugli.h PATH_SUFFIXES jxl) if(LIBJXL_LIBRARIES AND LIBHWY_LIBRARIES AND LIBSKCMS_LIBRARIES AND LIBBROTLICOMMON_LIBRARIES AND LIBBROTLIENC_LIBRARIES AND LIBBROTLIDEC_LIBRARIES AND LIBJXL_INCLUDE_DIRS) message(STATUS "Found JXL library: ${LIBJXL_LIBRARIES} " "${LIBHWY_LIBRARIES} ${LIBSKCMS_LIBRARIES} " "${LIBBROTLICOMMON_LIBRARIES} ${LIBBROTLIENC_LIBRARIES}" "${LIBBROTLIDEC_LIBRARIES}") message(STATUS "Found JXL include: ${LIBJXL_INCLUDE_DIRS}") else() message(FATAL_ERROR "JXL library not found.") endif() target_link_libraries(aom PRIVATE ${LIBJXL_LIBRARIES} ${LIBHWY_LIBRARIES} ${LIBSKCMS_LIBRARIES} ${LIBBROTLIENC_LIBRARIES} ${LIBBROTLIDEC_LIBRARIES} ${LIBBROTLICOMMON_LIBRARIES}) target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) else() pkg_check_modules(LIBJXL REQUIRED libjxl) target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) if(LIBJXL_CFLAGS) append_compiler_flag("${LIBJXL_CFLAGS}") endif() pkg_check_modules(LIBHWY REQUIRED libhwy) target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${LIBLIBHWY_INCLUDE_DIRS}) if(LIBHWY_CFLAGS) append_compiler_flag("${LIBHWY_CFLAGS}") endif() endif() set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) if(BUILD_SHARED_LIBS) set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) endif() list(APPEND AOM_LIB_TARGETS yuv) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endif() if(CONFIG_TFLITE) include(FetchContent) set(TFLITE_TAG "v2.6.1") message(STATUS "Fetching TFLite ${TFLITE_TAG}...") # static linking makes life with TFLite much easier set(TFLITE_C_BUILD_SHARED_LIBS OFF) # We don't care about comparing against these delegates (yet), and disabling # it reduces compile time meaningfully set(TFLITE_ENABLE_RUY OFF) set(TFLITE_ENABLE_XNNPACK OFF) fetchcontent_declare(tflite GIT_REPOSITORY https://github.com/tensorflow/tensorflow GIT_TAG ${TFLITE_TAG} GIT_SHALLOW TRUE) fetchcontent_getproperties(tflite) if(NOT tflite_POPULATED) fetchcontent_populate(tflite) # Some of the subprojects (e.g. Eigen) are very noisy and emit status # messages all the time. Temporary ignore status messages while adding # this to silence it. Ugly but effective. 
set(OLD_CMAKE_MESSAGE_LOG_LEVEL ${CMAKE_MESSAGE_LOG_LEVEL}) set(CMAKE_MESSAGE_LOG_LEVEL WARNING) add_subdirectory(${tflite_SOURCE_DIR}/tensorflow/lite/c ${tflite_BINARY_DIR}) set(CMAKE_MESSAGE_LOG_LEVEL ${OLD_CMAKE_MESSAGE_LOG_LEVEL}) endif() # Disable some noisy warnings in tflite target_compile_options(tensorflow-lite PRIVATE -w) # tensorflowlite_c is implicitly declared by this FetchContent include_directories(${tflite_SOURCE_DIR}) target_link_libraries(aom PRIVATE tensorflow-lite) endif() if(CONFIG_TUNE_VMAF) find_package(PkgConfig) if(PKG_CONFIG_FOUND) pkg_check_modules(VMAF REQUIRED libvmaf) if(BUILD_SHARED_LIBS) target_link_libraries(aom_static PRIVATE ${VMAF_LDFLAGS}) endif() target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS}) target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS}) if(VMAF_CFLAGS) foreach(flag "${VMAF_CFLAGS}") append_compiler_flag("${flag}") endforeach() endif() else() message(FATAL_ERROR "CONFIG_TUNE_VMAF error: pkg-config not found.") endif() set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) if(BUILD_SHARED_LIBS) set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) endif() endif() endif() if(ENABLE_EXAMPLES) # Maintain a separate variable listing only the examples to facilitate # installation of example programs into an examples sub directory of # $AOM_DIST_DIR/bin when building the dist target. list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} ${AOM_ENCODER_EXAMPLE_TARGETS}) endif() if(ENABLE_TOOLS) if(CONFIG_AV1_DECODER) add_executable(dump_obu "${AOM_ROOT}/tools/dump_obu.cc" "${AOM_ROOT}/tools/obu_parser.cc" "${AOM_ROOT}/tools/obu_parser.h" $ $ $) list(APPEND AOM_TOOL_TARGETS dump_obu) list(APPEND AOM_APP_TARGETS dump_obu) # Maintain a separate variable listing only the examples to facilitate # installation of example programs into an tools sub directory of # $AOM_DIST_DIR/bin when building the dist target. list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} ${AOM_ENCODER_TOOL_TARGETS}) endif() endif() if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c" $ $) list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref) list(APPEND AOM_APP_TARGETS aom_cx_set_ref) endif() if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER) add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c" $ $) list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder) list(APPEND AOM_APP_TARGETS lightfield_encoder) endif() if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) add_executable(lightfield_tile_list_decoder "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c" $ $) list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder) list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder) endif() if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c" $ $) list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder) list(APPEND AOM_APP_TARGETS lightfield_decoder) endif() if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER) add_executable(lightfield_bitstream_parsing "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c" $ $ $) list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing) list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing) endif() foreach(aom_app ${AOM_APP_TARGETS}) target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom) endforeach() if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) if(CONFIG_LIBYUV) # Add to existing targets. 
foreach(aom_app ${AOM_APP_TARGETS}) target_sources(${aom_app} PRIVATE $) set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) endforeach() endif() if(CONFIG_WEBM_IO) add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES}) include_directories("${AOM_ROOT}/third_party/libwebm") target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS) target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS) if(NOT MSVC) target_compile_options(webm PRIVATE -Wno-shadow) endif() # Add to existing targets. if(CONFIG_AV1_DECODER) target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES}) endif() if(CONFIG_AV1_ENCODER) target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES}) endif() foreach(aom_app ${AOM_APP_TARGETS}) target_sources(${aom_app} PRIVATE $) set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) endforeach() endif() endif() if(ENABLE_TESTS) # Create test_libaom target and the targets it depends on. setup_aom_test_targets() endif() if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD) find_package(Threads) target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads) if(BUILD_SHARED_LIBS) target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads) endif() endif() if(XCODE) # TODO(tomfinegan): Make sure target has no C++ files before doing this as # it's not necessary in that case. if(CONFIG_LIBYUV OR CONFIG_WEBM_IO) # The Xcode generator does not obey LINKER_LANGUAGE. Because of the issue # what looks like a C++ file needs to be in any target that Xcode will link # when the target contains a C++ dependency. Without this Xcode will try to # link with the C linker, which always ends badly when a dependency actually # includes C++. # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched # here, it really is the Xcode generator's fault, or just a deficiency in # Xcode itself. foreach(aom_app ${AOM_APP_TARGETS}) add_no_op_source_file_to_target("${aom_app}" "cc") endforeach() endif() endif() if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$") # For historical purposes place the example binaries in the example directory. file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples") foreach(target ${AOM_EXAMPLE_TARGETS}) if(NOT "${target}" MATCHES "aomdec\|aomenc") set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${AOM_CONFIG_DIR}/examples") endif() endforeach() if(ENABLE_TOOLS AND AOM_TOOL_TARGETS) # The same expectation is true for tool targets. file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools") set_target_properties(${AOM_TOOL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${AOM_CONFIG_DIR}/tools") endif() endif() if(BUILD_SHARED_LIBS) # Don't use -Wl,-z,defs with Clang's sanitizers. # # Clang's AddressSanitizer documentation says "When linking shared libraries, # the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link # errors (don't use it with AddressSanitizer)." See # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see # https://clang.llvm.org/docs/MemorySanitizer.html#usage. if(NOT (APPLE OR CYGWIN OR WIN32) AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) # The -z defs linker option reports unresolved symbol references from object # files when building a shared library. if("${CMAKE_VERSION}" VERSION_LESS "3.13") # target_link_options() is not available before CMake 3.13. 
target_link_libraries(aom PRIVATE -Wl,-z,defs) else() target_link_options(aom PRIVATE LINKER:-z,defs) endif() endif() include("${AOM_ROOT}/build/cmake/exports.cmake") setup_exports_target() endif() # Handle user supplied compile and link flags last to ensure they're obeyed. set_user_flags() # Aomedia documentation rule. set(DOXYGEN_VERSION_VALUE 0) if(ENABLE_DOCS) find_package(Doxygen) if(DOXYGEN_FOUND) # Check if Doxygen version is >= minimum required version(i.e. 1.8.10). set(MINIMUM_DOXYGEN_VERSION 1008010) if(DOXYGEN_VERSION) # Strip SHA1 from version string if present. string(REGEX REPLACE "^([0-9]+\\.[0-9]+\\.[0-9]+).*" "\\1" DOXYGEN_VERSION ${DOXYGEN_VERSION}) # Replace dots with semicolons to create a list. string(REGEX REPLACE "\\." ";" DOXYGEN_VERSION_LIST ${DOXYGEN_VERSION}) # Parse version components from the list. list(GET DOXYGEN_VERSION_LIST 0 DOXYGEN_MAJOR) list(GET DOXYGEN_VERSION_LIST 1 DOXYGEN_MINOR) list(GET DOXYGEN_VERSION_LIST 2 DOXYGEN_PATCH) endif() # Construct a version value for comparison. math(EXPR DOXYGEN_MAJOR "${DOXYGEN_MAJOR}*1000000") math(EXPR DOXYGEN_MINOR "${DOXYGEN_MINOR}*1000") math(EXPR DOXYGEN_VERSION_VALUE "${DOXYGEN_MAJOR} + ${DOXYGEN_MINOR} + ${DOXYGEN_PATCH}") if(${DOXYGEN_VERSION_VALUE} LESS ${MINIMUM_DOXYGEN_VERSION}) set(DOXYGEN_FOUND NO) endif() endif() if(DOXYGEN_FOUND) include("${AOM_ROOT}/docs.cmake") setup_documentation_targets() else() message( "--- Cannot find doxygen(version 1.8.10 or newer), ENABLE_DOCS turned off." ) set(ENABLE_DOCS OFF) endif() endif() # Aomedia dist rule. if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) list(APPEND AOM_DIST_APPS $) endif() if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES) list(APPEND AOM_DIST_APPS $) endif() if(ENABLE_EXAMPLES) foreach(example ${AOM_EXAMPLE_TARGETS}) list(APPEND AOM_DIST_EXAMPLES $) set_property(TARGET ${example} PROPERTY FOLDER examples) endforeach() endif() if(ENABLE_TOOLS) foreach(tool ${AOM_TOOL_TARGETS}) list(APPEND AOM_DIST_TOOLS $) set_property(TARGET ${tool} PROPERTY FOLDER tools) endforeach() endif() if(NOT AOM_DIST_DIR) set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist") endif() add_custom_target(dist COMMAND ${CMAKE_COMMAND} -DAOM_ROOT=${AOM_ROOT} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_DIST_DIR=${AOM_DIST_DIR} -DAOM_DIST_APPS="${AOM_DIST_APPS}" -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}" -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}" -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}" -DAOM_DIST_LIBS=$ -DENABLE_DOCS=${ENABLE_DOCS} -P "${AOM_ROOT}/build/cmake/dist.cmake" DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS} ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS} ${AOM_TOOL_TARGETS}) if(ENABLE_DOCS) add_dependencies(dist docs) endif() # Collect all variables containing libaom source files. get_cmake_property(all_cmake_vars VARIABLES) foreach(var ${all_cmake_vars}) if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST" AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER") list(APPEND aom_source_vars ${var}) endif() endforeach() if(NOT CONFIG_AV1_DECODER) list(FILTER aom_source_vars EXCLUDE REGEX "_DECODER_") endif() # Libaom_srcs.txt generation. set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt") file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n") # Static source file list first. 
foreach(aom_source_var ${aom_source_vars}) foreach(file ${${aom_source_var}}) if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") string(REPLACE "${AOM_ROOT}/" "" file "${file}") if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder") continue() endif() file(APPEND "${libaom_srcs_txt_file}" "${file}\n") endif() endforeach() endforeach() file(APPEND "${libaom_srcs_txt_file}" "# Files below this line are generated by the libaom build system.\n") foreach(aom_source_var ${aom_source_vars}) foreach(file ${${aom_source_var}}) if("${file}" MATCHES "${AOM_CONFIG_DIR}") string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}") file(APPEND "${libaom_srcs_txt_file}" "${file}\n") endif() endforeach() endforeach() # Libaom_srcs.gni generation. set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni") file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n") foreach(aom_source_var ${aom_source_vars}) if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") string(TOLOWER ${aom_source_var} aom_source_var_lowercase) file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n") endif() foreach(file ${${aom_source_var}}) if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file "${file}") if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder") continue() endif() file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") endif() endforeach() if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") file(APPEND "${libaom_srcs_gni_file}" "]\n") endif() endforeach() file(APPEND "${libaom_srcs_gni_file}" "\n# Files below this line are generated by the libaom build system.\n") foreach(aom_source_var ${aom_source_vars}) if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") string(TOLOWER ${aom_source_var} aom_source_var_lowercase) file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase}_gen = [\n") endif() foreach(file ${${aom_source_var}}) if(NOT "${file}" MATCHES "${AOM_ROOT}") string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom" file "${file}") file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") endif() endforeach() if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") file(APPEND "${libaom_srcs_gni_file}" "]\n") endif() endforeach() # Generate aom.pc and setup install rule. setup_aom_install_targets() aom-3.12.1/LICENSE000066400000000000000000000024441477627663500134120ustar00rootroot00000000000000Copyright (c) 2016, Alliance for Open Media. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. aom-3.12.1/PATENTS000066400000000000000000000131051477627663500134420ustar00rootroot00000000000000Alliance for Open Media Patent License 1.0 1. License Terms. 1.1. Patent License. Subject to the terms and conditions of this License, each Licensor, on behalf of itself and successors in interest and assigns, grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as expressly stated in this License) patent license to its Necessary Claims to make, use, sell, offer for sale, import or distribute any Implementation. 1.2. Conditions. 1.2.1. Availability. As a condition to the grant of rights to Licensee to make, sell, offer for sale, import or distribute an Implementation under Section 1.1, Licensee must make its Necessary Claims available under this License, and must reproduce this License with any Implementation as follows: a. For distribution in source code, by including this License in the root directory of the source code with its Implementation. b. For distribution in any other form (including binary, object form, and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, GDSII, etc.)), by including this License in the documentation, legal notices, and/or other written materials provided with the Implementation. 1.2.2. Additional Conditions. This license is directly from Licensor to Licensee. Licensee acknowledges as a condition of benefiting from it that no rights from Licensor are received from suppliers, distributors, or otherwise in connection with this License. 1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents initiates patent litigation or files, maintains, or voluntarily participates in a lawsuit against another entity or any person asserting that any Implementation infringes Necessary Claims, any patent licenses granted under this License directly to the Licensee are immediately terminated as of the date of the initiation of action unless 1) that suit was in response to a corresponding suit regarding an Implementation first brought against an initiating entity, or 2) that suit was brought to enforce the terms of this License (including intervention in a third-party action by a Licensee). 1.4. Disclaimers. The Reference Implementation and Specification are provided "AS IS" and without warranty. The entire risk as to implementing or otherwise using the Reference Implementation or Specification is assumed by the implementer and user. Licensor expressly disclaims any warranties (express, implied, or otherwise), including implied warranties of merchantability, non-infringement, fitness for a particular purpose, or title, related to the material. 
IN NO EVENT WILL LICENSOR BE LIABLE TO ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2. Definitions. 2.1. Affiliate. "Affiliate" means an entity that directly or indirectly Controls, is Controlled by, or is under common Control of that party. 2.2. Control. "Control" means direct or indirect control of more than 50% of the voting power to elect directors of that corporation, or for any other entity, the power to direct management of such entity. 2.3. Decoder. "Decoder" means any decoder that conforms fully with all non-optional portions of the Specification. 2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can be decoded by a Decoder only to the extent it produces such a bitstream. 2.5. Final Deliverable. "Final Deliverable" means the final version of a deliverable approved by the Alliance for Open Media as a Final Deliverable. 2.6. Implementation. "Implementation" means any implementation, including the Reference Implementation, that is an Encoder and/or a Decoder. An Implementation also includes components of an Implementation only to the extent they are used as part of an Implementation. 2.7. License. "License" means this license. 2.8. Licensee. "Licensee" means any person or entity who exercises patent rights granted under this License. 2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers for sale, imports or distributes any Implementation, or (ii) a person or entity that has a licensing obligation to the Implementation as a result of its membership and/or participation in the Alliance for Open Media working group that developed the Specification. 2.10. Necessary Claims. "Necessary Claims" means all claims of patents or patent applications, (a) that currently or at any time in the future, are owned or controlled by the Licensor, and (b) (i) would be an Essential Claim as defined by the W3C Policy as of February 5, 2004 (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) as if the Specification was a W3C Recommendation; or (ii) are infringed by the Reference Implementation. 2.11. Reference Implementation. "Reference Implementation" means an Encoder and/or Decoder released by the Alliance for Open Media as a Final Deliverable. 2.12. Specification. "Specification" means the specification designated by the Alliance for Open Media as a Final Deliverable for which this License was issued. aom-3.12.1/README.md000066400000000000000000000617241477627663500136720ustar00rootroot00000000000000README.md {#LREADME} ========= # AV1 Codec Library ## Contents 1. [Building the lib and applications](#building-the-library-and-applications) - [Prerequisites](#prerequisites) - [Get the code](#get-the-code) - [Basics](#basic-build) - [Configuration options](#configuration-options) - [Dylib builds](#dylib-builds) - [Debugging](#debugging) - [Cross compiling](#cross-compiling) - [Sanitizer support](#sanitizers) - [MSVC builds](#microsoft-visual-studio-builds) - [Xcode builds](#xcode-builds) - [Emscripten builds](#emscripten-builds) - [Extra Build Flags](#extra-build-flags) - [Build with VMAF support](#build-with-vmaf) 2. 
[Testing the library](#testing-the-av1-codec) - [Basics](#testing-basics) - [Unit tests](#unit-tests) - [Example tests](#example-tests) - [Encoder tests](#encoder-tests) - [IDE hosted tests](#ide-hosted-tests) - [Downloading test data](#downloading-the-test-data) - [Adding a new test data file](#adding-a-new-test-data-file) - [Additional test data](#additional-test-data) - [Sharded testing](#sharded-testing) - [Running tests directly](#running-test_libaom-directly) - [Running tests via CMake](#running-the-tests-via-the-cmake-build) 3. [Coding style](#coding-style) 4. [License header](#license-header) 5. [Submitting patches](#submitting-patches) - [Login cookie](#login-cookie) - [Contributor agreement](#contributor-agreement) - [Testing your code](#testing-your-code) - [Commit message hook](#commit-message-hook) - [Upload your change](#upload-your-change) - [Incorporating Reviewer Comments](#incorporating-reviewer-comments) - [Submitting your change](#submitting-your-change) - [Viewing change status](#viewing-the-status-of-uploaded-changes) 6. [Support](#support) 7. [Bug reports](#bug-reports) ## Building the library and applications {#building-the-library-and-applications} ### Prerequisites {#prerequisites} 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version required. 2. [Git](https://git-scm.com/). 3. A modern C compiler. gcc 6+, clang 7+, Microsoft Visual Studio 2019+ or the latest version of MinGW-w64 (clang64 or ucrt toolchains) are recommended. A C++ compiler is necessary to build the unit tests and some features contained in the examples. 4. [Perl](https://www.perl.org/). 5. For x86 targets, [yasm](http://yasm.tortall.net/) or a recent version (2.14 or later) of [nasm](http://www.nasm.us/). (If both yasm and nasm are present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to select nasm.) If you download yasm with the intention to work with Visual Studio, please download win32.exe or win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe. The MSYS2 version of the yasm binary can also be used and avoids an issue caused by a missing Visual C++ Redistributable install (Visual Studio 2010, MSVCR100.dll). 6. Building the documentation requires [doxygen version 1.8.10 or newer](http://doxygen.org). 7. Emscripten builds require the portable [EMSDK](https://kripken.github.io/emscripten-site/index.html). ### Get the code {#get-the-code} The AV1 library source code is stored in the Alliance for Open Media Git repository: ~~~ $ git clone https://aomedia.googlesource.com/aom # By default, the above command stores the source in the aom directory: $ cd aom ~~~ ### Basic build {#basic-build} CMake replaces the configure step typical of many projects. Running CMake will produce configuration and build files for the currently selected CMake generator. For most systems the default generator is Unix Makefiles. The basic form of a makefile build is the following: ~~~ $ cmake path/to/aom $ make ~~~ The above will generate a makefile build that produces the AV1 library and applications for the current host system after the make step completes successfully. The compiler chosen varies by host platform, but a general rule applies: On systems where cc and c++ are present in $PATH at the time CMake is run the generated build will use cc and c++ by default. ### Configuration options {#configuration-options} The AV1 codec library has a great many configuration options. These come in two varieties: 1. Build system configuration options. 
These have the form `ENABLE_FEATURE`. 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`. Both types of options are set at the time CMake is run. The following example enables ccache and disables the AV1 encoder: ~~~ $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0 $ make ~~~ The available configuration options are too numerous to list here. Build system configuration options can be found at the top of the CMakeLists.txt file found in the root of the AV1 repository, and AV1 codec configuration options can currently be found in the file `build/cmake/aom_config_defaults.cmake`. ### Dylib builds {#dylib-builds} A dylib (shared object) build of the AV1 codec library can be enabled via the CMake built-in variable `BUILD_SHARED_LIBS`: ~~~ $ cmake path/to/aom -DBUILD_SHARED_LIBS=1 $ make ~~~ This is currently only supported on non-Windows targets. ### Debugging {#debugging} Depending on the generator used there are multiple ways of going about debugging AV1 components. For single configuration generators like the Unix Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient: ~~~ $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug ~~~ For Xcode, mainly because configuration controls for Xcode builds are buried two configuration windows deep and must be set for each subproject within the Xcode IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug: ~~~ $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug ~~~ For Visual Studio the in-IDE configuration controls should be used. Simply set the IDE project configuration to Debug to allow for stepping through the code. In addition to the above it can sometimes be useful to debug only C and C++ code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to generic at generation time: ~~~ $ cmake path/to/aom -DAOM_TARGET_CPU=generic ~~~ ### Cross compiling {#cross-compiling} For the purposes of building the AV1 codec and applications and relative to the scope of this guide, all builds for architectures differing from the native host architecture will be considered cross compiles. The AV1 CMake build handles cross compiling via the use of toolchain files included in the AV1 repository. The toolchain files available at the time of this writing are: - arm64-ios.cmake - arm64-linux-clang.cmake - arm64-linux-gcc.cmake - arm64-mingw-gcc.cmake - armv7-ios.cmake - armv7-linux-gcc.cmake - armv7-mingw-gcc.cmake - armv7s-ios.cmake - ppc-linux-gcc.cmake - riscv-linux-gcc.cmake - x86-ios-simulator.cmake - x86-linux.cmake - x86-macos.cmake - x86-mingw-gcc.cmake - x86\_64-ios-simulator.cmake - x86\_64-mingw-gcc.cmake The following example demonstrates use of the x86-macos.cmake toolchain file on an x86\_64 MacOS host: ~~~ $ cmake path/to/aom \ -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake $ make ~~~ To build for an unlisted target, creating a new toolchain file is the best solution. The existing toolchain files can be used as a starting point for a new toolchain file since each one exposes the basic requirements for toolchain files as used in the AV1 codec build. As a temporary workaround, an unoptimized AV1 configuration that builds only C and C++ sources can be produced using the following commands: ~~~ $ cmake path/to/aom -DAOM_TARGET_CPU=generic $ make ~~~ In addition to the above it's important to note that the toolchain files suffixed with gcc behave differently than the others. These toolchain files attempt to obey the $CROSS environment variable.
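For example, a cross compile for a 64-bit Arm Linux target using one of the
gcc toolchain files might look like the following. This is only a sketch: it
assumes an aarch64 GNU cross toolchain whose binaries carry the
`aarch64-linux-gnu-` prefix is installed and present in $PATH; adjust the
prefix to match your toolchain.

~~~
$ export CROSS=aarch64-linux-gnu-
$ cmake path/to/aom \
  -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
$ make
~~~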
### Sanitizers {#sanitizers} Sanitizer integration is built-in to the CMake build system. To enable a sanitizer, add `-DSANITIZE=` to the CMake command line. For example, to enable address sanitizer: ~~~ $ cmake path/to/aom -DSANITIZE=address $ make ~~~ Sanitizers available vary by platform, target, and compiler. Consult your compiler documentation to determine which, if any, are available. ### Microsoft Visual Studio builds {#microsoft-visual-studio-builds} Building the AV1 codec library in Microsoft Visual Studio is supported. Visual Studio 2019 (16.0) or later is required. The following example demonstrates generating projects and a solution for the Microsoft IDE: ~~~ # This does not require a bash shell; Command Prompt (cmd.exe) is fine. # This assumes the build host is a Windows x64 computer. # To create a Visual Studio 2022 solution for the x64 target: $ cmake path/to/aom -G "Visual Studio 17 2022" # To create a Visual Studio 2022 solution for the 32-bit x86 target: $ cmake path/to/aom -G "Visual Studio 17 2022" -A Win32 # To create a Visual Studio 2019 solution for the x64 target: $ cmake path/to/aom -G "Visual Studio 16 2019" # To create a Visual Studio 2019 solution for the 32-bit x86 target: $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32 # To build the solution: $ cmake --build . ~~~ NOTE: The build system targets Windows 7 or later by compiling files with `-D_WIN32_WINNT=0x0601`. ### Xcode builds {#xcode-builds} Building the AV1 codec library in Xcode is supported. The following example demonstrates generating an Xcode project: ~~~ $ cmake path/to/aom -G Xcode ~~~ ### Emscripten builds {#emscripten-builds} Building the AV1 codec library with Emscripten is supported. Typically this is used to hook into the AOMAnalyzer GUI application. These instructions focus on using the inspector with AOMAnalyzer, but all tools can be built with Emscripten. It is assumed here that you have already downloaded and installed the EMSDK, installed and activated at least one toolchain, and setup your environment appropriately using the emsdk\_env script. 1. Build [AOM Analyzer](https://github.com/xiph/aomanalyzer). 2. Configure the build: ~~~ $ cmake path/to/aom \ -DENABLE_CCACHE=1 \ -DAOM_TARGET_CPU=generic \ -DENABLE_DOCS=0 \ -DENABLE_TESTS=0 \ -DCONFIG_ACCOUNTING=1 \ -DCONFIG_INSPECTION=1 \ -DCONFIG_MULTITHREAD=0 \ -DCONFIG_RUNTIME_CPU_DETECT=0 \ -DCONFIG_WEBM_IO=0 \ -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake ~~~ 3. Build it: run make if that's your generator of choice: ~~~ $ make inspect ~~~ 4. Run the analyzer: ~~~ # inspect.js is in the examples sub directory of the directory in which you # executed cmake. $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file ~~~ ### Extra build flags {#extra-build-flags} Three variables allow for passing of additional flags to the build system. - AOM\_EXTRA\_C\_FLAGS - AOM\_EXTRA\_CXX\_FLAGS - AOM\_EXTRA\_EXE\_LINKER\_FLAGS The build system attempts to ensure the flags passed through the above variables are passed to tools last in order to allow for override of default behavior. 
These flags can be used, for example, to enable asserts in a release build: ~~~ $ cmake path/to/aom \ -DCMAKE_BUILD_TYPE=Release \ -DAOM_EXTRA_C_FLAGS=-UNDEBUG \ -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG ~~~ ### Build with VMAF support {#build-with-vmaf} After installing [libvmaf.a](https://github.com/Netflix/vmaf/tree/master/libvmaf), you can use it with the encoder: ~~~ $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1 ~~~ Please note that the default VMAF model ("/usr/local/share/model/vmaf_v0.6.1.json") will be used unless you set the following flag when running the encoder: ~~~ # --vmaf-model-path=path/to/model ~~~ ## Testing the AV1 codec {#testing-the-av1-codec} ### Testing basics {#testing-basics} There are several methods of testing the AV1 codec. All of these methods require the presence of the AV1 source code and a working build of the AV1 library and applications. #### 1. Unit tests: {#unit-tests} The unit tests can be run at build time: ~~~ # Before running the make command the LIBAOM_TEST_DATA_PATH environment # variable should be set to avoid downloading the test files to the # cmake build configuration directory. $ cmake path/to/aom # Note: The AV1 CMake build creates many test targets. Running make # with multiple jobs will speed up the test run significantly. $ make runtests ~~~ #### 2. Example tests: {#example-tests} The example tests require a bash shell and can be run in the following manner: ~~~ # See the note above about LIBAOM_TEST_DATA_PATH above. $ cmake path/to/aom $ make # It's best to build the testdata target using many make jobs. # Running it like this will verify and download (if necessary) # one at a time, which takes a while. $ make testdata $ path/to/aom/test/examples.sh --bin-path examples ~~~ #### 3. Encoder tests: {#encoder-tests} When making a change to the encoder run encoder tests to confirm that your change has a positive or negligible impact on encode quality. When running these tests the build configuration should be changed to enable internal encoder statistics: ~~~ $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1 $ make ~~~ The repository contains scripts intended to make running these tests as simple as possible. The following example demonstrates creating a set of baseline clips for comparison to results produced after making your change to libaom: ~~~ # This will encode all Y4M files in the current directory using the # settings specified to create the encoder baseline statistical data: $ cd path/to/test/inputs # This command line assumes that run_encodes.sh, its helper script # best_encode.sh, and the aomenc you intend to test are all within a # directory in your PATH. $ run_encodes.sh 200 500 50 baseline ~~~ After making your change and creating the baseline clips, you'll need to run encodes that include your change(s) to confirm that things are working as intended: ~~~ # This will encode all Y4M files in the current directory using the # settings specified to create the statistical data for your change: $ cd path/to/test/inputs # This command line assumes that run_encodes.sh, its helper script # best_encode.sh, and the aomenc you intend to test are all within a # directory in your PATH. $ run_encodes.sh 200 500 50 mytweak ~~~ After creating both data sets you can use `test/visual_metrics.py` to generate a report that can be viewed in a web browser: ~~~ $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \ > mytweak.html ~~~ You can view the report by opening mytweak.html in a web browser. 
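When making encoder changes it can also be handy to re-run just the
encoder-related unit tests rather than the full suite. gtest's standard
filtering flag makes this straightforward; the following is only a sketch,
since the exact test names available depend on the build configuration:

~~~
# From the build directory, run only tests whose names match the filter:
$ ./test_libaom --gtest_filter='*Datarate*'
~~~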
### IDE hosted tests {#ide-hosted-tests} By default the generated projects files created by CMake will not include the runtests and testdata rules when generating for IDEs like Microsoft Visual Studio and Xcode. This is done to avoid intolerably long build cycles in the IDEs-- IDE behavior is to build all targets when selecting the build project options in MSVS and Xcode. To enable the test rules in IDEs the `ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time: ~~~ # This example uses Xcode. To get a list of the generators # available, run cmake with the -G argument missing its # value. $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode ~~~ ### Downloading the test data {#downloading-the-test-data} The fastest and easiest way to obtain the test data is to use CMake to generate a build using the Unix Makefiles generator, and then to build only the testdata rule. By default the test files will be downloaded to the current directory. The `LIBAOM_TEST_DATA_PATH` environment variable can be used to set a custom one. ~~~ $ cmake path/to/aom -G "Unix Makefiles" # 28 is used because there are 28 test files as of this writing. $ make -j28 testdata ~~~ The above make command will only download and verify the test data. ### Adding a new test data file {#adding-a-new-test-data-file} First, add the new test data file to the `aom-test-data` bucket of the `aomedia-testing` project on Google Cloud Platform. You may need to ask someone with the necessary access permissions to do this for you. NOTE: When a new test data file is added to the `aom-test-data` bucket, its "Public access" is initially "Not public". We need to change its "Public access" to "Public" by using the following [`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command: ~~~ $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name ~~~ This command grants the `AllUsers` group READ access to the file named "test-data-file-name" in the `aom-test-data` bucket. Once the new test data file has been added to `aom-test-data`, create a CL to add the name of the new test data file to `test/test_data_util.cmake` and add the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1 checksum of a file can be calculated by running the `sha1sum` command on the file.) ### Additional test data {#additional-test-data} The test data mentioned above is strictly intended for unit testing. Additional input data for testing the encoder can be obtained from: https://media.xiph.org/video/derf/ ### Sharded testing {#sharded-testing} The AV1 codec library unit tests are built upon gtest which supports sharding of test jobs. Sharded test runs can be achieved in a couple of ways. #### 1. Running test\_libaom directly: {#running-test_libaom-directly} ~~~ # Set the environment variable GTEST_TOTAL_SHARDS to control the number of # shards. $ export GTEST_TOTAL_SHARDS=10 # (GTEST shard indexing is 0 based). $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \ | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom ~~~ To create a test shard for each CPU core available on the current system set `GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one. #### 2. Running the tests via the CMake build: {#running-the-tests-via-the-cmake-build} ~~~ # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See # the IDE hosted tests section above for more information. If the IDE # supports building targets concurrently tests will be sharded by default. 
# For make and ninja builds the -j parameter controls the number of shards # at test run time. This example will run the tests using 10 shards via # make. $ make -j10 runtests ~~~ The maximum number of test targets that can run concurrently is determined by the number of CPUs on the system where the build is configured as detected by CMake. A system with 24 cores can run 24 test shards using a value of 24 with the `-j` parameter. When CMake is unable to detect the number of cores, 10 shards is the default maximum value. ## Coding style {#coding-style} We are using the Google C Coding Style defined by the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). The coding style used by this project is enforced with clang-format using the configuration contained in the [.clang-format](https://chromium.googlesource.com/webm/aom/+/main/.clang-format) file in the root of the repository. You can download clang-format using your system's package manager, or directly from [llvm.org](http://llvm.org/releases/download.html). You can also view the [documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org. Output from clang-format varies by clang-format version; for best results your version should match the one used on Jenkins. You can find the clang-format version by reading the comment in the `.clang-format` file linked above. Before pushing changes for review you can format your code with: ~~~ # Apply clang-format to modified .c, .h and .cc files $ clang-format -i --style=file \ $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') ~~~ Check the .clang-format file for the version used to generate it if there is any difference between your local formatting and the review system. Some Git installations have clang-format integration. Here are some examples: ~~~ # Apply clang-format to all staged changes: $ git clang-format # Clang format all staged and unstaged changes: $ git clang-format -f # Clang format all staged and unstaged changes interactively: $ git clang-format -f -p ~~~ ## License header {#license-header} Use the following comment block in new C/C++ source files, replacing "${year}" with the current year. The same comment should be added to other file types, adjusting the comment syntax as necessary. ``` /* * Copyright (c) ${year}, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ ``` ## Submitting patches {#submitting-patches} We manage the submission of patches using the [Gerrit](https://www.gerritcodereview.com/) code review tool. This tool implements a workflow on top of the Git version control system to ensure that all changes get peer reviewed and tested prior to their distribution. ### Login cookie {#login-cookie} Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and log in with your account (Gmail credentials, for example). Next, follow the `Generate Password` link at the top of the page. You’ll be given instructions for creating a cookie to use with our Git repos. You must also have a Gerrit account associated with your Google account.
To do this, visit the [Gerrit review server](https://aomedia-review.googlesource.com) and click "Sign in" (top right). ### Contributor agreement {#contributor-agreement} You will be required to execute a [contributor agreement](http://aomedia.org/license) to ensure that the AOMedia Project has the right to distribute your changes. Note: If you are pushing changes on behalf of an Alliance for Open Media member organization, this step is not necessary. ### Testing your code {#testing-your-code} The testing basics are covered in the [testing section](#testing-the-av1-codec) above. In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run through Jenkins instances upon upload to Gerrit. ### Commit message hook {#commit-message-hook} Gerrit requires that each submission include a unique Change-Id. You can assign one manually using git commit --amend, but it’s easier to automate it with the commit-msg hook provided by Gerrit. Copy commit-msg to the `.git/hooks` directory of your local repo. Here's an example: ~~~ $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg # Next, ensure that the downloaded commit-msg script is executable: $ chmod u+x aom/.git/hooks/commit-msg ~~~ See the Gerrit [documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html) for more information. ### Upload your change {#upload-your-change} The command line to upload your patch looks like this: ~~~ $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/main ~~~ ### Incorporating reviewer comments {#incorporating-reviewer-comments} If you previously uploaded a change to Gerrit and the Approver has asked for changes, follow these steps: 1. Edit the files to make the changes the reviewer has requested. 2. Recommit your edits using the --amend flag, for example: ~~~ $ git commit -a --amend ~~~ 3. Use the same git push command as above to upload to Gerrit again for another review cycle. In general, you should not rebase your changes when doing updates in response to review. Doing so can make it harder to follow the evolution of your change in the diff view. ### Submitting your change {#submitting-your-change} Once your change has been Approved and Verified, you can “submit” it through the Gerrit UI. This will usually automatically rebase your change onto the branch specified. Sometimes this can’t be done automatically. If you run into this problem, you must rebase your changes manually: ~~~ $ git fetch $ git rebase origin/branchname ~~~ If there are any conflicts, resolve them as you normally would with Git. When you’re done, reupload your change. ### Viewing the status of uploaded changes {#viewing-the-status-of-uploaded-changes} To check the status of a change that you uploaded, open [Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My > Changes. ## Support {#support} This library is an open source project supported by its community. Please email aomediacodec@jointdevelopment.kavi.com for help. ## Bug reports {#bug-reports} Bug reports can be filed in the Alliance for Open Media [issue tracker](https://aomedia.issues.chromium.org/). For security reports, select 'Security report' from the Template dropdown. aom-3.12.1/Sample.cfg000066400000000000000000000046001477627663500143030ustar00rootroot00000000000000#sample config file super_block_size = 128 # super block size.
0, 64 or 128 max_partition_size = 128 # max partition size(8, 16, 32, 64, 128) min_partition_size = 4 # min partition size(4, 8, 16, 32, 64) disable_rect_partition_type = 0 # disable rectangle partition type disable_ab_partition_type = 0 # disable AB partition type disable_1to4_partition_type = 0 # disable 1 to 4 and 4 to 1 partition type disable_intra_angle_delta = 0 # disable intra angle delta disable_paeth_intra = 0 # disable paeth intra disable_smooth_intra = 0 # disable intra smooth mode disable_intra_edge_filter = 0 # disable intra edge filter disable_filter_intra = 0 # disable filter intra disable_intrabc = 0 # disable Intra Block Copy disable_cfl = 0 # disable chroma from luma prediction disable_palette = 0 # disable Palette disable_flip_idtx = 0 # disable flip and identity transform disable_tx_64x64 = 0 # disable 64x64 transform reduced_tx_type_set = 0 # use reduced transform type set reduced_reference_set = 0 # use reduced reference frame set disable_obmc = 0 # disable OBMC disable_warp_motion = 0 # disable Warped Motion disable_global_motion = 0 # disable global motion disable_ref_frame_mv = 0 # disable ref mv disable_dual_filter = 0 # disable dual interpolation filter disable_one_sided_comp = 0 # disable one sided compound mode disable_masked_comp = 0 # disable masked compound prediction disable_diff_wtd_comp = 0 # disable difference weighted compound mode disable_inter_inter_wedge = 0 # disable inter/inter wedge comp disable_dist_wtd_comp = 0 # disable distant weighted compound mode disable_inter_intra_comp = 0 # disable inter/intra compound mode. disable_inter_intra_wedge = 0 # disable inter/intra wedge comp disable_smooth_inter_intra = 0 # disable smooth inter/intra disable_cdef = 0 # disable CDEF filter disable_lr = 0 # disable Loop Restoration Filter disable_trellis_quant = 0 # disable trellis quantizationaom-3.12.1/aom/000077500000000000000000000000001477627663500131555ustar00rootroot00000000000000aom-3.12.1/aom/aom.h000066400000000000000000000074421477627663500141110ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\defgroup aom AOM * \ingroup codecs * AOM is aom's newest video compression algorithm that uses motion * compensated prediction, Discrete Cosine Transform (DCT) coding of the * prediction error signal and context dependent entropy coding techniques * based on arithmetic principles. It features: * - YUV 4:2:0 image format * - Macro-block based coding (16x16 luma plus two 8x8 chroma) * - 1/4 (1/8) pixel accuracy motion compensated prediction * - 4x4 DCT transform * - 128 level linear quantizer * - In loop deblocking filter * - Context-based entropy coding * * @{ */ /*!\file * \brief Provides controls common to both the AOM encoder and decoder. */ #ifndef AOM_AOM_AOM_H_ #define AOM_AOM_AOM_H_ #include "aom/aom_codec.h" #include "aom/aom_image.h" #ifdef __cplusplus extern "C" { #endif /*!\brief Control functions * * The set of macros define the control functions of AOM interface * The range for common control IDs is 230-255(max). 
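 *
 * A minimal usage sketch (an assumption for illustration: `ctx` is an
 * initialized decoder context that has just decoded a frame):
 *     aom_image_t img;
 *     if (aom_codec_control(&ctx, AV1_GET_NEW_FRAME_IMAGE, &img) ==
 *         AOM_CODEC_OK) {
 *       // img now describes the most recently decoded frame.
 *     }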
*/ enum aom_com_control_id { /*!\brief Codec control function to get a pointer to a reference frame * * av1_ref_frame_t* parameter */ AV1_GET_REFERENCE = 230, /*!\brief Codec control function to write a frame into a reference buffer * * av1_ref_frame_t* parameter */ AV1_SET_REFERENCE = 231, /*!\brief Codec control function to get a copy of reference frame from the * decoder * * av1_ref_frame_t* parameter */ AV1_COPY_REFERENCE = 232, /*!\brief Codec control function to get a pointer to the new frame * * aom_image_t* parameter */ AV1_GET_NEW_FRAME_IMAGE = 233, /*!\brief Codec control function to copy the new frame to an external buffer * * aom_image_t* parameter */ AV1_COPY_NEW_FRAME_IMAGE = 234, /*!\brief Start point of control IDs for aom_dec_control_id. * Any new common control IDs should be added above. */ AOM_DECODER_CTRL_ID_START = 256 // No common control IDs should be added after AOM_DECODER_CTRL_ID_START. }; /*!\brief AV1 specific reference frame data struct * * Define the data struct to access av1 reference frames. */ typedef struct av1_ref_frame { int idx; /**< frame index to get (input) */ int use_external_ref; /**< Directly use external ref buffer(decoder only) */ aom_image_t img; /**< img structure to populate (output) */ } av1_ref_frame_t; /*!\cond */ /*!\brief aom decoder control function parameter type * * Defines the data type for each of AOM decoder control function requires. * * \note For each control ID "X", a macro-define of * AOM_CTRL_X is provided. It is used at compile time to determine * if the control ID is supported by the libaom library available, * when the libaom version cannot be controlled. */ AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *) #define AOM_CTRL_AV1_GET_REFERENCE AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *) #define AOM_CTRL_AV1_SET_REFERENCE AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *) #define AOM_CTRL_AV1_COPY_REFERENCE AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *) #define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *) #define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE /*!\endcond */ /*! @} - end defgroup aom */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_AOM_H_ aom-3.12.1/aom/aom_codec.h000066400000000000000000000515221477627663500152440ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /////////////////////////////////////////////////////////////////////////////// // Internal implementation details /////////////////////////////////////////////////////////////////////////////// // // There are two levels of interfaces used to access the AOM codec: the // aom_codec_iface and the aom_codec_ctx. // // 1. aom_codec_iface_t // (Related files: aom/aom_codec.h, aom/src/aom_codec.c, // aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c, // av1/av1_dx_iface.c) // // Used to initialize the codec context, which contains the configuration for // for modifying the encoder/decoder during run-time. See the other // documentation in this header file for more details. 
For the most part, // users will call helper functions, such as aom_codec_iface_name, // aom_codec_get_caps, etc., to interact with it. // // The main purpose of the aom_codec_iface_t is to provide a way to generate // a default codec config, find out what capabilities the implementation has, // and create an aom_codec_ctx_t (which is actually used to interact with the // codec). // // Note that the implementations for the AV1 algorithm are located in // av1/av1_cx_iface.c and av1/av1_dx_iface.c // // // 2. aom_codec_ctx_t // (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c, // aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c) // // The actual interface between user code and the codec. It stores the name // of the codec, a pointer back to the aom_codec_iface_t that initialized it, // initialization flags, a config for either encoder or the decoder, and a // pointer to internal data. // // The codec is configured / queried through calls to aom_codec_control, // which takes a control ID (listed in aomcx.h and aomdx.h) and a parameter. // In the case of "getter" control IDs, the parameter is modified to have // the requested value; in the case of "setter" control IDs, the codec's // configuration is changed based on the parameter. Note that a aom_codec_err_t // is returned, which indicates if the operation was successful or not. // // Note that for the encoder, the aom_codec_alg_priv_t points to the // the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder, // the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored // here and also used in the core algorithm. // // At the end, aom_codec_destroy should be called for each initialized // aom_codec_ctx_t. /*!\defgroup codec Common Algorithm Interface * This abstraction allows applications to easily support multiple video * formats with minimal code duplication. This section describes the interface * common to all codecs (both encoders and decoders). * @{ */ /*!\file * \brief Describes the codec algorithm interface to applications. * * This file describes the interface between an application and a * video codec algorithm. * * An application instantiates a specific codec instance by using * aom_codec_dec_init() or aom_codec_enc_init() and a pointer to the * algorithm's interface structure: *
 *     my_app.c:
 *       extern aom_codec_iface_t my_codec;
 *       {
 *           aom_codec_ctx_t algo;
 *           int threads = 4;
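 *           // Initializer order follows aom_codec_dec_cfg_t:
 *           // { threads, w, h, allow_lowbitdepth }.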
 *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
 *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
 *       }
 *     
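 * For AV1 specifically, the interface pointers are obtained with
 * aom_codec_av1_dx() (declared in aom/aomdx.h) and aom_codec_av1_cx()
 * (declared in aom/aomcx.h).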
* * Once initialized, the instance is managed using other functions from * the aom_codec_* family. */ #ifndef AOM_AOM_AOM_CODEC_H_ #define AOM_AOM_AOM_CODEC_H_ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_image.h" #include "aom/aom_integer.h" /*!\brief Decorator indicating a function is deprecated */ #ifndef AOM_DEPRECATED #if defined(__GNUC__) #define AOM_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define AOM_DEPRECATED #else #define AOM_DEPRECATED #endif #endif /* AOM_DEPRECATED */ #ifndef AOM_DECLSPEC_DEPRECATED #if defined(__GNUC__) #define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ #elif defined(_MSC_VER) /*!\brief \copydoc #AOM_DEPRECATED */ #define AOM_DECLSPEC_DEPRECATED __declspec(deprecated) #else #define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ #endif #endif /* AOM_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ #ifdef AOM_UNUSED #elif defined(__GNUC__) || defined(__clang__) #define AOM_UNUSED __attribute__((unused)) #else #define AOM_UNUSED #endif /*!\brief Decorator indicating that given struct/union/enum is packed */ #ifndef ATTRIBUTE_PACKED #if defined(__GNUC__) #define ATTRIBUTE_PACKED __attribute__((packed)) #elif defined(_MSC_VER) #define ATTRIBUTE_PACKED #else #define ATTRIBUTE_PACKED #endif #endif /* ATTRIBUTE_PACKED */ /*!\brief Current ABI version number * * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ #define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { /*!\brief Operation completed without error */ AOM_CODEC_OK, /*!\brief Unspecified error */ AOM_CODEC_ERROR, /*!\brief Memory operation failed */ AOM_CODEC_MEM_ERROR, /*!\brief ABI version mismatch */ AOM_CODEC_ABI_MISMATCH, /*!\brief Algorithm does not have required capability */ AOM_CODEC_INCAPABLE, /*!\brief The given bitstream is not supported. * * The bitstream was unable to be parsed at the highest level. The decoder * is unable to proceed. This error \ref SHOULD be treated as fatal to the * stream. */ AOM_CODEC_UNSUP_BITSTREAM, /*!\brief Encoded bitstream uses an unsupported feature * * The decoder does not implement a feature required by the encoder. This * return code should only be used for features that prevent future * pictures from being properly decoded. This error \ref MAY be treated as * fatal to the stream or \ref MAY be treated as fatal to the current GOP. */ AOM_CODEC_UNSUP_FEATURE, /*!\brief The coded data for this stream is corrupt or incomplete * * There was a problem decoding the current frame. This return code * should only be used for failures that prevent future pictures from * being properly decoded. This error \ref MAY be treated as fatal to the * stream or \ref MAY be treated as fatal to the current GOP. If decoding * is continued for the current GOP, artifacts may be present. */ AOM_CODEC_CORRUPT_FRAME, /*!\brief An application-supplied parameter is not valid. * */ AOM_CODEC_INVALID_PARAM, /*!\brief An iterator reached the end of list. * */ AOM_CODEC_LIST_END } aom_codec_err_t; /*! \brief Codec capabilities bitfield * * Each codec advertises the capabilities it supports as part of its * ::aom_codec_iface_t interface structure. 
Capabilities are extra interfaces * or functionality, and are not required to be supported. * * The available flags are specified by AOM_CODEC_CAP_* defines. */ typedef long aom_codec_caps_t; #define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow for * proper memory allocation. * * The available flags are specified by AOM_CODEC_USE_* defines. The bits are * allocated as follows: * 0x1 - 0x80: codec (common to decoder and encoder) * 0x100 - 0x8000: decoder * 0x10000 - 0x800000: encoder */ typedef long aom_codec_flags_t; // Experimental feature policy // // New features may be marked as experimental. Experimental features are not // part of the stable API and may be modified or removed in a future release. // Experimental features are made available only if you pass the // AOM_CODEC_USE_EXPERIMENTAL flag to the codec init function. // // If you use experimental features, you must rebuild your code whenever you // update to a new libaom release, and you must be prepared to modify your code // when an experimental feature you use is modified or removed. If you are not // sure, DO NOT use experimental features. #define AOM_CODEC_USE_EXPERIMENTAL 0x1 /**< Enables experimental features */ /*!\brief Time Stamp Type * * An integer, which when multiplied by the stream's time base, provides * the absolute time of a sample. */ typedef int64_t aom_codec_pts_t; /*!\brief Codec interface structure. * * Contains function pointers and other data private to the codec * implementation. This structure is opaque to the application. Common * functions used with this structure: * - aom_codec_iface_name(aom_codec_iface_t *iface): get the * name of the codec * - aom_codec_get_caps(aom_codec_iface_t *iface): returns * the capabilities of the codec * - aom_codec_enc_config_default: generate the default config for * initializing the encoder (see documentation in aom_encoder.h) * - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context * structure (see documentation on aom_codec_ctx). * * To get access to the AV1 encoder and decoder, use aom_codec_av1_cx() and * aom_codec_av1_dx(). */ typedef const struct aom_codec_iface aom_codec_iface_t; /*!\brief Codec private data structure. * * Contains data private to the codec implementation. This structure is opaque * to the application. */ typedef struct aom_codec_priv aom_codec_priv_t; /*!\brief Compressed Frame Flags * * This type represents a bitfield containing information about a compressed * frame that may be useful to an application. The most significant 16 bits * can be used by an algorithm to provide additional detail, for example to * support frame types that are codec specific (MPEG-1 D-frames for example) */ typedef uint32_t aom_codec_frame_flags_t; #define AOM_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ /*!\brief frame can be dropped without affecting the stream (no future frame * depends on this one) */ #define AOM_FRAME_IS_DROPPABLE 0x2u /*!\brief this is an INTRA_ONLY frame */ #define AOM_FRAME_IS_INTRAONLY 0x10u /*!\brief this is an S-frame */ #define AOM_FRAME_IS_SWITCH 0x20u /*!\brief this is an error-resilient frame */ #define AOM_FRAME_IS_ERROR_RESILIENT 0x40u /*!\brief this is a key-frame dependent recovery-point frame */ #define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80u /*!\brief Iterator * * Opaque storage used for iterating over lists. 
*/ typedef const void *aom_codec_iter_t; /*!\brief Codec context structure * * All codecs \ref MUST support this context structure fully. In general, * this data should be considered private to the codec algorithm, and * not be manipulated or examined by the calling application. Applications * may reference the 'name' member to get a printable description of the * algorithm. */ typedef struct aom_codec_ctx { const char *name; /**< Printable interface name */ aom_codec_iface_t *iface; /**< Interface pointers */ aom_codec_err_t err; /**< Last returned error */ const char *err_detail; /**< Detailed info, if available */ aom_codec_flags_t init_flags; /**< Flags passed at init time */ union { /**< Decoder Configuration Pointer */ const struct aom_codec_dec_cfg *dec; /**< Encoder Configuration Pointer */ const struct aom_codec_enc_cfg *enc; const void *raw; } config; /**< Configuration pointer aliasing union */ aom_codec_priv_t *priv; /**< Algorithm private storage */ } aom_codec_ctx_t; /*!\brief Bit depth for codec * * * This enumeration determines the bit depth of the codec. */ typedef enum aom_bit_depth { AOM_BITS_8 = 8, /**< 8 bits */ AOM_BITS_10 = 10, /**< 10 bits */ AOM_BITS_12 = 12, /**< 12 bits */ } aom_bit_depth_t; /*!\brief Superblock size selection. * * Defines the superblock size used for encoding. The superblock size can * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically * selected by the encoder for each frame. */ typedef enum aom_superblock_size { AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ } aom_superblock_size_t; /* * Library Version Number Interface * * For example, see the following sample return values: * aom_codec_version() (1<<16 | 2<<8 | 3) * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" * aom_codec_version_extra_str() "rc1-16-gec6a1ba" */ /*!\brief Return the version information (as an integer) * * Returns a packed encoding of the library version number. This will only * include the major.minor.patch component of the version number. Note that this * encoded value should be accessed through the macros provided, as the encoding * may change in the future. * */ int aom_codec_version(void); /*!\brief Return the major version number */ #define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff) /*!\brief Return the minor version number */ #define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff) /*!\brief Return the patch version number */ #define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff) /*!\brief Return the version information (as a string) * * Returns a printable string containing the full library version number. This * may contain additional text following the three digit version number, as to * indicate release candidates, pre-release versions, etc. * */ const char *aom_codec_version_str(void); /*!\brief Return the version information (as a string) * * Returns a printable "extra string". This is the component of the string * returned by aom_codec_version_str() following the three digit version number. * */ const char *aom_codec_version_extra_str(void); /*!\brief Return the build configuration * * Returns a printable string containing an encoded version of the build * configuration. This may be useful to aom support. 
* */ const char *aom_codec_build_config(void); /*!\brief Return the name for a given interface * * Returns a human readable string for name of the given codec interface. * * \param[in] iface Interface pointer * */ const char *aom_codec_iface_name(aom_codec_iface_t *iface); /*!\brief Convert error number to printable string * * Returns a human readable string for the last error returned by the * algorithm. The returned error will be one line and will not contain * any newline characters. * * * \param[in] err Error number. * */ const char *aom_codec_err_to_string(aom_codec_err_t err); /*!\brief Retrieve error synopsis for codec context * * Returns a human readable string for the last error returned by the * algorithm. The returned error will be one line and will not contain * any newline characters. * * * \param[in] ctx Pointer to this instance's context. * */ const char *aom_codec_error(const aom_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about * the last error. The returned string is only valid until the next * aom_codec_* function call (except aom_codec_error and * aom_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * * \retval NULL * No detailed information is available. */ const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * * The following functions are required to be implemented for all codecs. * They represent the base case functionality expected of all codecs. */ /*!\brief Destroy a codec instance * * Destroys a codec context, freeing any associated memory buffers. * * \param[in] ctx Pointer to this instance's context * * \retval #AOM_CODEC_OK * The codec instance has been destroyed. * \retval #AOM_CODEC_INVALID_PARAM * ctx is a null pointer. * \retval #AOM_CODEC_ERROR * Codec context not initialized. */ aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx); /*!\brief Get the capabilities of an algorithm. * * Retrieves the capabilities bitfield from the algorithm's interface. * * \param[in] iface Pointer to the algorithm interface * */ aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface); /*!\name Codec Control * * The aom_codec_control function exchanges algorithm specific data with the * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is * provided, which will type-check the parameter against the control ID before * calling aom_codec_control - note that this macro requires the control ID * to be directly encoded in it, e.g., * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8). * * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id). * @{ */ /*!\brief Algorithm Control * * aom_codec_control takes a context, a control ID, and a third parameter * (with varying type). If the context is non-null and an error occurs, * ctx->err will be set to the same value as the return value. * * \param[in] ctx Pointer to this instance's context * \param[in] ctrl_id Algorithm specific control identifier. * Must be nonzero. * * \retval #AOM_CODEC_OK * The control request was processed. * \retval #AOM_CODEC_ERROR * The control request was not processed. * \retval #AOM_CODEC_INVALID_PARAM * The control ID was zero, or the data was not valid. 
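 *
 * A minimal usage sketch (an assumption for illustration: `ctx` is an
 * initialized encoder context and AOME_SET_CPUUSED is the control ID from
 * aom/aomcx.h):
 *     if (aom_codec_control(&ctx, AOME_SET_CPUUSED, 4) != AOM_CODEC_OK) {
 *       // The failure reason is available via aom_codec_error(&ctx).
 *     }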
*/ aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...); /*!\brief Key & Value API * * aom_codec_set_option() takes a context, a key (option name) and a value. If * the context is non-null and an error occurs, ctx->err will be set to the same * value as the return value. * * \param[in] ctx Pointer to this instance's context * \param[in] name The name of the option (key) * \param[in] value The value of the option * * \retval #AOM_CODEC_OK * The value of the option was set. * \retval #AOM_CODEC_INVALID_PARAM * The data was not valid. * \retval #AOM_CODEC_ERROR * The option was not successfully set. */ aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name, const char *value); /*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible) * * This macro allows for type safe conversions across the variadic parameter * to aom_codec_control(). However, it requires the explicit control ID * be passed in (it cannot be passed in via a variable) -- otherwise a compiler * error will occur. After the type checking, it calls aom_codec_control. */ #define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \ aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/ /*!\brief Creates type checking mechanisms for aom_codec_control * * It defines a static function with the correctly typed arguments as a wrapper * to the type-unsafe aom_codec_control function. It also creates a typedef * for each type. */ #define AOM_CTRL_USE_TYPE(id, typ) \ static aom_codec_err_t aom_codec_control_typechecked_##id( \ aom_codec_ctx_t *, int, typ) AOM_UNUSED; \ static aom_codec_err_t aom_codec_control_typechecked_##id( \ aom_codec_ctx_t *ctx, int ctrl, typ data) { \ return aom_codec_control(ctx, ctrl, data); \ } /**<\hideinitializer*/ \ typedef typ aom_codec_control_type_##id; /*!@} end Codec Control group */ /*!\brief OBU types. */ typedef enum ATTRIBUTE_PACKED { OBU_SEQUENCE_HEADER = 1, OBU_TEMPORAL_DELIMITER = 2, OBU_FRAME_HEADER = 3, OBU_TILE_GROUP = 4, OBU_METADATA = 5, OBU_FRAME = 6, OBU_REDUNDANT_FRAME_HEADER = 7, OBU_TILE_LIST = 8, OBU_PADDING = 15, } OBU_TYPE; /*!\brief OBU metadata types. */ typedef enum { OBU_METADATA_TYPE_AOM_RESERVED_0 = 0, OBU_METADATA_TYPE_HDR_CLL = 1, OBU_METADATA_TYPE_HDR_MDCV = 2, OBU_METADATA_TYPE_SCALABILITY = 3, OBU_METADATA_TYPE_ITUT_T35 = 4, OBU_METADATA_TYPE_TIMECODE = 5, } OBU_METADATA_TYPE; /*!\brief Returns string representation of OBU_TYPE. * * \param[in] type The OBU_TYPE to convert to string. */ const char *aom_obu_type_to_string(OBU_TYPE type); /*!@} - end defgroup codec*/ #ifdef __cplusplus } #endif #endif // AOM_AOM_AOM_CODEC_H_ aom-3.12.1/aom/aom_decoder.h000066400000000000000000000243241477627663500155740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOM_DECODER_H_ #define AOM_AOM_AOM_DECODER_H_ /*!\defgroup decoder Decoder Algorithm Interface * \ingroup codec * This abstraction allows applications using this decoder to easily support * multiple video formats with minimal code duplication. 
This section describes * the interface common to all decoders. * @{ */ /*!\file * \brief Describes the decoder algorithm interface to applications. * * This file describes the interface between an application and a * video decoder algorithm. * */ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_frame_buffer.h" /*!\brief Current ABI version number * * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ #define AOM_DECODER_ABI_VERSION \ (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Decoder capabilities bitfield * * Each decoder advertises the capabilities it supports as part of its * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces * or functionality, and are not required to be supported by a decoder. * * The available flags are specified by AOM_CODEC_CAP_* defines. */ /*!brief Can support external frame buffers */ #define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000 /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow for * proper memory allocation. * * The available flags are specified by AOM_CODEC_USE_* defines. */ /*!\brief Stream properties * * This structure is used to query or set properties of the decoded * stream. */ typedef struct aom_codec_stream_info { unsigned int w; /**< Width (or 0 for unknown/default) */ unsigned int h; /**< Height (or 0 for unknown/default) */ unsigned int is_kf; /**< Current frame is a keyframe */ unsigned int number_spatial_layers; /**< Number of spatial layers */ unsigned int number_temporal_layers; /**< Number of temporal layers */ unsigned int is_annexb; /**< Is Bitstream in Annex-B format */ } aom_codec_stream_info_t; /* REQUIRED FUNCTIONS * * The following functions are required to be implemented for all decoders. * They represent the base case functionality expected of all decoders. */ /*!\brief Initialization Configurations * * This structure is used to pass init time configuration options to the * decoder. */ typedef struct aom_codec_dec_cfg { unsigned int threads; /**< Maximum number of threads to use, default 1 */ unsigned int w; /**< Width */ unsigned int h; /**< Height */ unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */ } aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */ /*!\brief Initialize a decoder instance * * Initializes a decoder context using the given interface. Applications * should call the aom_codec_dec_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. * * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this * call is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. * \param[in] flags Bitfield of AOM_CODEC_USE_* flags * \param[in] ver ABI version number. Must be set to * AOM_DECODER_ABI_VERSION * \retval #AOM_CODEC_OK * The decoder algorithm has been initialized. * \retval #AOM_CODEC_MEM_ERROR * Memory allocation failed. 
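 *
 * A minimal initialization sketch (an assumption for illustration:
 * aom/aomdx.h is included so that aom_codec_av1_dx() is available; the
 * aom_codec_dec_init() convenience macro below supplies the ABI version):
 *     aom_codec_ctx_t ctx;
 *     aom_codec_dec_cfg_t cfg = { 4, 0, 0, 1 };  // threads, w, h, allow_lowbitdepth
 *     if (aom_codec_dec_init(&ctx, aom_codec_av1_dx(), &cfg, 0) !=
 *         AOM_CODEC_OK) {
 *       // Initialization failed; see aom_codec_error(&ctx).
 *     }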
*/ aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, const aom_codec_dec_cfg_t *cfg, aom_codec_flags_t flags, int ver); /*!\brief Convenience macro for aom_codec_dec_init_ver() * * Ensures the ABI version parameter is properly set. */ #define aom_codec_dec_init(ctx, iface, cfg, flags) \ aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION) /*!\brief Parse stream info from a buffer * * Performs high level parsing of the bitstream. Construction of a decoder * context is not necessary. Can be used to determine if the bitstream is * of the proper format, and to extract information from the stream. * * \param[in] iface Pointer to the algorithm interface * \param[in] data Pointer to a block of data to parse * \param[in] data_sz Size of the data buffer * \param[in,out] si Pointer to stream info to update. The is_annexb * member \ref MUST be properly initialized. This * function sets the rest of the members. * * \retval #AOM_CODEC_OK * Bitstream is parsable and stream information updated. * \retval #AOM_CODEC_INVALID_PARAM * One of the arguments is invalid, for example a NULL pointer. * \retval #AOM_CODEC_UNSUP_BITSTREAM * The decoder didn't recognize the coded data, or the * buffer was too short. */ aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si); /*!\brief Return information about the current stream. * * Returns information about the stream that has been parsed during decoding. * * \param[in] ctx Pointer to this instance's context * \param[in,out] si Pointer to stream info to update. * * \retval #AOM_CODEC_OK * Bitstream is parsable and stream information updated. * \retval #AOM_CODEC_INVALID_PARAM * One of the arguments is invalid, for example a NULL pointer. * \retval #AOM_CODEC_UNSUP_BITSTREAM * The decoder couldn't parse the submitted data. */ aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, aom_codec_stream_info_t *si); /*!\brief Decode data * * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS * (decode time stamp) order. Frames produced will always be in PTS * (presentation time stamp) order. * * \param[in] ctx Pointer to this instance's context * \param[in] data Pointer to this block of new coded data. * \param[in] data_sz Size of the coded data, in bytes. * \param[in] user_priv Application specific data to associate with * this frame. * * \return Returns #AOM_CODEC_OK if the coded data was processed completely * and future pictures can be decoded without error. Otherwise, * see the descriptions of the other error codes in ::aom_codec_err_t * for recoverability capabilities. */ aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv); /*!\brief Decoded frames iterator * * Iterates over a list of the frames available for display. The iterator * storage should be initialized to NULL to start the iteration. Iteration is * complete when this function returns NULL. * * The list of available frames becomes valid upon completion of the * aom_codec_decode call, and remains valid until the next call to * aom_codec_decode. * * \param[in] ctx Pointer to this instance's context * \param[in,out] iter Iterator storage, initialized to NULL * * \return Returns a pointer to an image, if one is ready for display. Frames * produced will always be in PTS (presentation time stamp) order. 
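 *
 * A typical decode-and-drain sketch (an assumption for illustration: `ctx`
 * is an initialized decoder context and `buf`/`data_sz` hold one buffer of
 * coded data):
 *     if (aom_codec_decode(&ctx, buf, data_sz, NULL) == AOM_CODEC_OK) {
 *       aom_codec_iter_t iter = NULL;
 *       aom_image_t *img;
 *       while ((img = aom_codec_get_frame(&ctx, &iter)) != NULL) {
 *         // Consume img before the next call to aom_codec_decode().
 *       }
 *     }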
*/ aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter); /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions * * The following function is required to be implemented for all decoders * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. * Calling this function for codecs that don't advertise this capability * will result in an error code being returned, usually AOM_CODEC_INCAPABLE. * @{ */ /*!\brief Pass in external frame buffers for the decoder to use. * * Registers functions to be called when libaom needs a frame buffer * to decode the current frame and a function to be called when libaom does * not internally reference the frame buffer. This set function must * be called before the first call to decode or libaom will assume the * default behavior of allocating frame buffers internally. * * \param[in] ctx Pointer to this instance's context * \param[in] cb_get Pointer to the get callback function * \param[in] cb_release Pointer to the release callback function * \param[in] cb_priv Callback's private data * * \retval #AOM_CODEC_OK * External frame buffers will be used by libaom. * \retval #AOM_CODEC_INVALID_PARAM * One or more of the callbacks were NULL. * \retval #AOM_CODEC_ERROR * Decoder context not initialized. * \retval #AOM_CODEC_INCAPABLE * Algorithm not capable of using external frame buffers. * * \note * When decoding AV1, the application may be required to pass in at least * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers. */ aom_codec_err_t aom_codec_set_frame_buffer_functions( aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); /*!@} - end defgroup cap_external_frame_buffer */ /*!@} - end defgroup decoder*/ #ifdef __cplusplus } #endif #endif // AOM_AOM_AOM_DECODER_H_ aom-3.12.1/aom/aom_encoder.h000066400000000000000000001217661477627663500156160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOM_ENCODER_H_ #define AOM_AOM_AOM_ENCODER_H_ /*!\defgroup encoder Encoder Algorithm Interface * \ingroup codec * This abstraction allows applications using this encoder to easily support * multiple video formats with minimal code duplication. This section describes * the interface common to all encoders. * @{ */ /*!\file * \brief Describes the encoder algorithm interface to applications. * * This file describes the interface between an application and a * video encoder algorithm. * */ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_external_partition.h" /*!\brief Current ABI version number * * \hideinitializer * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures * * Note: In the definition of AOM_ENCODER_ABI_VERSION, 3 is the value of * AOM_EXT_PART_ABI_VERSION in libaom v3.2.0. 
The old value of * AOM_EXT_PART_ABI_VERSION is used so as to not break the ABI version check in * aom_codec_enc_init_ver() when an application compiled against libaom v3.2.0 * passes the old value of AOM_ENCODER_ABI_VERSION to aom_codec_enc_init_ver(). * The external partition API is still experimental. When it is declared stable, * we will replace 3 with AOM_EXT_PART_ABI_VERSION in the definition of * AOM_ENCODER_ABI_VERSION. */ #define AOM_ENCODER_ABI_VERSION \ (10 + AOM_CODEC_ABI_VERSION + /*AOM_EXT_PART_ABI_VERSION=*/3) /*! \brief Encoder capabilities bitfield * * Each encoder advertises the capabilities it supports as part of its * ::aom_codec_iface_t interface structure. Capabilities are extra * interfaces or functionality, and are not required to be supported * by an encoder. * * The available flags are specified by AOM_CODEC_CAP_* defines. */ #define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ /*! Can support input images at greater than 8 bitdepth. */ #define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000 /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow * for proper memory allocation. * * The available flags are specified by AOM_CODEC_USE_* defines. */ #define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ #define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ // 0x80000 was used for the experimental feature AOM_CODEC_USE_PRESET during // libaom v3.11.0 development but was removed before the release. /*!\brief Generic fixed size buffer structure * * This structure is able to hold a reference to any fixed size buffer. */ typedef struct aom_fixed_buf { void *buf; /**< Pointer to the data. Does NOT own the data! */ size_t sz; /**< Length of the buffer, in chars */ } aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */ /*!\brief Error Resilient flags * * These flags define which error resilient features to enable in the * encoder. The flags are specified through the * aom_codec_enc_cfg::g_error_resilient variable. */ typedef uint32_t aom_codec_er_flags_t; /*!\brief Improve resiliency against losses of whole frames */ #define AOM_ERROR_RESILIENT_DEFAULT 0x1 /*!\brief Encoder output packet variants * * This enumeration lists the different kinds of data packets that can be * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY * extend this list to provide additional functionality. */ enum aom_codec_cx_pkt_kind { AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ }; /*!\brief Encoder output packet * * This structure contains the different kinds of output data the encoder * may produce while compressing a frame. */ typedef struct aom_codec_cx_pkt { enum aom_codec_cx_pkt_kind kind; /**< packet variant */ union { struct { void *buf; /**< compressed data buffer */ size_t sz; /**< length of compressed data */ /*!\brief time stamp to show frame (in timebase units) */ aom_codec_pts_t pts; /*!\brief duration to show frame (in timebase units) */ unsigned long duration; aom_codec_frame_flags_t flags; /**< flags for this frame */ /*!\brief the partition id defines the decoding order of the partitions. * Only applicable when "output partition" mode is enabled. 
First * partition has id 0.*/ int partition_id; /*!\brief size of the visible frame in this packet */ size_t vis_frame_size; } frame; /**< data for compressed frame packet */ aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */ aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ struct aom_psnr_pkt { unsigned int samples[4]; /**< Number of samples, total/y/u/v */ uint64_t sse[4]; /**< sum squared error, total/y/u/v */ double psnr[4]; /**< PSNR, total/y/u/v */ /*!\brief Number of samples, total/y/u/v when * input bit-depth < stream bit-depth.*/ unsigned int samples_hbd[4]; /*!\brief sum squared error, total/y/u/v when * input bit-depth < stream bit-depth.*/ uint64_t sse_hbd[4]; /*!\brief PSNR, total/y/u/v when * input bit-depth < stream bit-depth.*/ double psnr_hbd[4]; } psnr; /**< data for PSNR packet */ aom_fixed_buf_t raw; /**< data for arbitrary packets */ } data; /**< packet data */ } aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ /*!\brief Rational Number * * This structure holds a fractional value. */ typedef struct aom_rational { int num; /**< fraction numerator */ int den; /**< fraction denominator */ } aom_rational_t; /**< alias for struct aom_rational */ /*!\brief Multi-pass Encoding Pass * * AOM_RC_LAST_PASS is kept for backward compatibility. * If passes is not given and pass==2, the codec will assume passes=2. * For new code, it is recommended to use AOM_RC_SECOND_PASS and set * the "passes" member to 2 via the key & val API for two-pass encoding. */ enum aom_enc_pass { AOM_RC_ONE_PASS = 0, /**< Single pass mode */ AOM_RC_FIRST_PASS = 1, /**< First pass of multi-pass mode */ AOM_RC_SECOND_PASS = 2, /**< Second pass of multi-pass mode */ AOM_RC_THIRD_PASS = 3, /**< Third pass of multi-pass mode */ AOM_RC_LAST_PASS = 2, /**< Final pass of two-pass mode */ }; /*!\brief Rate control mode */ enum aom_rc_mode { AOM_VBR, /**< Variable Bit Rate (VBR) mode */ AOM_CBR, /**< Constant Bit Rate (CBR) mode */ AOM_CQ, /**< Constrained Quality (CQ) mode */ AOM_Q, /**< Constant Quality (Q) mode */ }; /*!\brief Keyframe placement mode. * * This enumeration determines whether keyframes are placed automatically by * the encoder or whether this behavior is disabled. Older releases of this * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled. * This name is confusing for this behavior, so the new symbols to be used * are AOM_KF_AUTO and AOM_KF_DISABLED. */ enum aom_kf_mode { AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */ AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */ AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ }; /*!\brief Frame super-resolution mode. */ typedef enum { /**< Frame super-resolution is disabled for all frames. */ AOM_SUPERRES_NONE, /**< All frames are coded at the specified scale and super-resolved. */ AOM_SUPERRES_FIXED, /**< All frames are coded at a random scale and super-resolved. */ AOM_SUPERRES_RANDOM, /**< Super-resolution scale for each frame is determined based on the q index of that frame. */ AOM_SUPERRES_QTHRESH, /**< Full-resolution or super-resolution and the scale (in case of super-resolution) are automatically selected for each frame. */ AOM_SUPERRES_AUTO, } aom_superres_mode; /*!\brief Encoder Config Options * * This type allows to enumerate and control flags defined for encoder control * via config file at runtime. 
*/ typedef struct cfg_options { /*!\brief Indicate init by cfg file * 0 or 1 */ unsigned int init_by_cfg_file; /*!\brief Superblock size * 0, 64 or 128 */ unsigned int super_block_size; /*!\brief max partition size * 8, 16, 32, 64, 128 */ unsigned int max_partition_size; /*!\brief min partition size * 8, 16, 32, 64, 128 */ unsigned int min_partition_size; /*!\brief disable AB Shape partition type * */ unsigned int disable_ab_partition_type; /*!\brief disable rectangular partition type * */ unsigned int disable_rect_partition_type; /*!\brief disable 1:4/4:1 partition type * */ unsigned int disable_1to4_partition_type; /*!\brief disable flip and identity transform type * */ unsigned int disable_flip_idtx; /*!\brief disable CDEF filter * */ unsigned int disable_cdef; /*!\brief disable Loop Restoration Filter * */ unsigned int disable_lr; /*!\brief disable OBMC * */ unsigned int disable_obmc; /*!\brief disable Warped Motion * */ unsigned int disable_warp_motion; /*!\brief disable global motion * */ unsigned int disable_global_motion; /*!\brief disable dist weighted compound * */ unsigned int disable_dist_wtd_comp; /*!\brief disable diff weighted compound * */ unsigned int disable_diff_wtd_comp; /*!\brief disable inter/intra compound * */ unsigned int disable_inter_intra_comp; /*!\brief disable masked compound * */ unsigned int disable_masked_comp; /*!\brief disable one sided compound * */ unsigned int disable_one_sided_comp; /*!\brief disable Palette * */ unsigned int disable_palette; /*!\brief disable Intra Block Copy * */ unsigned int disable_intrabc; /*!\brief disable chroma from luma * */ unsigned int disable_cfl; /*!\brief disable intra smooth mode * */ unsigned int disable_smooth_intra; /*!\brief disable filter intra * */ unsigned int disable_filter_intra; /*!\brief disable dual filter * */ unsigned int disable_dual_filter; /*!\brief disable intra angle delta * */ unsigned int disable_intra_angle_delta; /*!\brief disable intra edge filter * */ unsigned int disable_intra_edge_filter; /*!\brief disable 64x64 transform * */ unsigned int disable_tx_64x64; /*!\brief disable smooth inter/intra * */ unsigned int disable_smooth_inter_intra; /*!\brief disable inter/inter wedge comp * */ unsigned int disable_inter_inter_wedge; /*!\brief disable inter/intra wedge comp * */ unsigned int disable_inter_intra_wedge; /*!\brief disable paeth intra * */ unsigned int disable_paeth_intra; /*!\brief disable trellis quantization * */ unsigned int disable_trellis_quant; /*!\brief disable ref frame MV * */ unsigned int disable_ref_frame_mv; /*!\brief use reduced reference frame set * */ unsigned int reduced_reference_set; /*!\brief use reduced transform type set * */ unsigned int reduced_tx_type_set; } cfg_options_t; /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to aom_codec_encode(), defining * per-frame boolean values. By convention, bits common to all codecs will be * named AOM_EFLAG_*, and bits specific to an algorithm will be named * /algo/_eflag_*. The lower order 16 bits are reserved for common use. */ typedef long aom_enc_frame_flags_t; /*!\brief Force this frame to be a keyframe */ #define AOM_EFLAG_FORCE_KF (1 << 0) /*!\brief Encoder configuration structure * * This structure contains the encoder settings that have common representations * across all codecs. This doesn't imply that all codecs support all features, * however. 
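 *
 * A minimal configuration sketch (an assumption for illustration:
 * aom/aomcx.h is included so that aom_codec_av1_cx() is available, and
 * aom_codec_enc_config_default() and AOM_USAGE_GOOD_QUALITY are taken from
 * elsewhere in this header):
 *     aom_codec_enc_cfg_t cfg;
 *     aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
 *                                  AOM_USAGE_GOOD_QUALITY);
 *     cfg.g_w = 640;
 *     cfg.g_h = 480;
 *     cfg.g_timebase.num = 1;
 *     cfg.g_timebase.den = 30;
 *     cfg.rc_target_bitrate = 800;  // kilobits per second
 *     aom_codec_ctx_t ctx;
 *     aom_codec_enc_init(&ctx, aom_codec_av1_cx(), &cfg, 0);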
*/ typedef struct aom_codec_enc_cfg { /* * generic settings (g) */ /*!\brief Algorithm specific "usage" value * * Algorithms may define multiple values for usage, which may convey the * intent of how the application intends to use the stream. If this value * is non-zero, consult the documentation for the codec to determine its * meaning. */ unsigned int g_usage; /*!\brief Maximum number of threads to use * * For multi-threaded implementations, use no more than this number of * threads. The codec may use fewer threads than allowed. The value * 0 is equivalent to the value 1. */ unsigned int g_threads; /*!\brief Bitstream profile to use * * Some codecs support a notion of multiple bitstream profiles. Typically * this maps to a set of features that are turned on or off. Often the * profile to use is determined by the features of the intended decoder. * Consult the documentation for the codec to determine the valid values * for this parameter, or set to zero for a sane default. */ unsigned int g_profile; /**< profile of bitstream to use */ /*!\brief Width of the frame * * This value identifies the presentation resolution of the frame, * in pixels. Note that the frames passed as input to the encoder must * have this resolution. Frames will be presented by the decoder in this * resolution, independent of any spatial resampling the encoder may do. */ unsigned int g_w; /*!\brief Height of the frame * * This value identifies the presentation resolution of the frame, * in pixels. Note that the frames passed as input to the encoder must * have this resolution. Frames will be presented by the decoder in this * resolution, independent of any spatial resampling the encoder may do. */ unsigned int g_h; /*!\brief Max number of frames to encode * * If force video mode is off (the default) and g_limit is 1, the encoder * will encode a still picture (still_picture is set to 1 in the sequence * header OBU). If in addition full_still_picture_hdr is 0 (the default), * the encoder will use a reduced header (reduced_still_picture_header is * set to 1 in the sequence header OBU) for the still picture. */ unsigned int g_limit; /*!\brief Forced maximum width of the frame * * If this value is non-zero then it is used to force the maximum frame * width written in write_sequence_header(). */ unsigned int g_forced_max_frame_width; /*!\brief Forced maximum height of the frame * * If this value is non-zero then it is used to force the maximum frame * height written in write_sequence_header(). */ unsigned int g_forced_max_frame_height; /*!\brief Bit-depth of the codec * * This value identifies the bit_depth of the codec, * Only certain bit-depths are supported as identified in the * aom_bit_depth_t enum. */ aom_bit_depth_t g_bit_depth; /*!\brief Bit-depth of the input frames * * This value identifies the bit_depth of the input frames in bits. * Note that the frames passed as input to the encoder must have * this bit-depth. */ unsigned int g_input_bit_depth; /*!\brief Stream timebase units * * Indicates the smallest interval of time, in seconds, used by the stream. * For fixed frame rate material, or variable frame rate material where * frames are timed at a multiple of a given clock (ex: video capture), * the \ref RECOMMENDED method is to set the timebase to the reciprocal * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the * pts to correspond to the frame number, which can be handy. 
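 *
 * For example (an illustrative sketch; `cfg`, `codec`, `img` and `n` are
 * hypothetical and error handling is omitted):
 *
 *     cfg.g_timebase.num = 1001;   // 29.970 Hz NTSC material
 *     cfg.g_timebase.den = 30000;
 *     // Frame number n can then be passed directly as the pts, with a
 *     // duration of 1 timebase unit (one frame):
 *     aom_codec_encode(&codec, img, n, 1, 0);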
For * re-encoding video from containers with absolute time timestamps, the * \ref RECOMMENDED method is to set the timebase to that of the parent * container or multimedia framework (ex: 1/1000 for ms, as in FLV). */ struct aom_rational g_timebase; /*!\brief Enable error resilient modes. * * The error resilient bitfield indicates to the encoder which features * it should enable to take measures for streaming over lossy or noisy * links. */ aom_codec_er_flags_t g_error_resilient; /*!\brief Multi-pass Encoding Mode * * This value should be set to the current phase for multi-pass encoding. * For single pass, set to #AOM_RC_ONE_PASS. */ enum aom_enc_pass g_pass; /*!\brief Allow lagged encoding * * If set, this value allows the encoder to consume a number of input * frames before producing output frames. This allows the encoder to * base decisions for the current frame on future frames. This does * increase the latency of the encoding pipeline, so it is not appropriate * in all situations (ex: realtime encoding). * * Note that this is a maximum value -- the encoder may produce frames * sooner than the given limit. Set this value to 0 to disable this * feature. */ unsigned int g_lag_in_frames; /* * rate control settings (rc) */ /*!\brief Temporal resampling configuration, if supported by the codec. * * Temporal resampling allows the codec to "drop" frames as a strategy to * meet its target data rate. This can cause temporal discontinuities in * the encoded video, which may appear as stuttering during playback. This * trade-off is often acceptable, but for many applications is not. It can * be disabled in these cases. * * Note that not all codecs support this feature. All aom AVx codecs do. * For other codecs, consult the documentation for that algorithm. * * This threshold is described as a percentage of the target data buffer. * When the data buffer falls below this percentage of fullness, a * dropped frame is indicated. Set the threshold to zero (0) to disable * this feature. */ unsigned int rc_dropframe_thresh; /*!\brief Mode for spatial resampling, if supported by the codec. * * Spatial resampling allows the codec to compress a lower resolution * version of the frame, which is then upscaled by the decoder to the * correct presentation resolution. This increases visual quality at * low data rates, at the expense of CPU time on the encoder/decoder. */ unsigned int rc_resize_mode; /*!\brief Frame resize denominator. * * The denominator for resize to use, assuming 8 as the numerator. * * Valid denominators are 8 - 16 for now. */ unsigned int rc_resize_denominator; /*!\brief Keyframe resize denominator. * * The denominator for resize to use, assuming 8 as the numerator. * * Valid denominators are 8 - 16 for now. */ unsigned int rc_resize_kf_denominator; /*!\brief Frame super-resolution scaling mode. * * Similar to spatial resampling, frame super-resolution integrates * upscaling after the encode/decode process. Taking control of upscaling and * using restoration filters should allow it to outperform normal resizing. */ aom_superres_mode rc_superres_mode; /*!\brief Frame super-resolution denominator. * * The denominator for superres to use. If fixed it will only change if the * cumulative scale change over resizing and superres is greater than 1/2; * this forces superres to reduce scaling. * * Valid denominators are 8 to 16. * * Used only by AOM_SUPERRES_FIXED. */ unsigned int rc_superres_denominator; /*!\brief Keyframe super-resolution denominator. * * The denominator for superres to use. 
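 *
 * As a brief illustrative aside (not part of the original documentation,
 * with `cfg` a hypothetical aom_codec_enc_cfg_t), a fixed super-resolution
 * setup might look like:
 *
 *     cfg.rc_superres_mode = AOM_SUPERRES_FIXED;
 *     cfg.rc_superres_denominator = 12;      // code at 8/12 of full width
 *     cfg.rc_superres_kf_denominator = 10;   // scale key frames less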
If fixed it will only change if the * cumulative scale change over resizing and superres is greater than 1/2; * this forces superres to reduce scaling. * * Valid denominators are 8 - 16 for now. */ unsigned int rc_superres_kf_denominator; /*!\brief Frame super-resolution q threshold. * * The q level threshold after which superres is used. * Valid values are 1 to 63. * * Used only by AOM_SUPERRES_QTHRESH */ unsigned int rc_superres_qthresh; /*!\brief Keyframe super-resolution q threshold. * * The q level threshold after which superres is used for key frames. * Valid values are 1 to 63. * * Used only by AOM_SUPERRES_QTHRESH */ unsigned int rc_superres_kf_qthresh; /*!\brief Rate control algorithm to use. * * Indicates whether the end usage of this stream is to be streamed over * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) * mode should be used, or whether it will be played back on a high * bandwidth link, as from a local disk, where higher variations in * bitrate are acceptable. */ enum aom_rc_mode rc_end_usage; /*!\brief Two-pass stats buffer. * * A buffer containing all of the stats packets produced in the first * pass, concatenated. */ aom_fixed_buf_t rc_twopass_stats_in; /*!\brief first pass mb stats buffer. * * A buffer containing all of the first pass mb stats packets produced * in the first pass, concatenated. */ aom_fixed_buf_t rc_firstpass_mb_stats_in; /*!\brief Target data rate * * Target bitrate to use for this stream, in kilobits per second. * Max allowed value is 2000000 */ unsigned int rc_target_bitrate; /* * quantizer settings */ /*!\brief Minimum (Best Quality) Quantizer * * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the * values to use. To determine the range programmatically, call * aom_codec_enc_config_default() with a usage value of 0. */ unsigned int rc_min_quantizer; /*!\brief Maximum (Worst Quality) Quantizer * * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the * values to use. To determine the range programmatically, call * aom_codec_enc_config_default() with a usage value of 0. */ unsigned int rc_max_quantizer; /* * bitrate tolerance */ /*!\brief Rate control adaptation undershoot control * * This value, controls the tolerance of the VBR algorithm to undershoot * and is used as a trigger threshold for more aggressive adaptation of Q. * * Valid values in the range 0-100. */ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * * This value, controls the tolerance of the VBR algorithm to overshoot * and is used as a trigger threshold for more aggressive adaptation of Q. * * Valid values in the range 0-100. */ unsigned int rc_overshoot_pct; /* * decoder buffer model parameters */ /*!\brief Decoder Buffer Size * * This value indicates the amount of data that may be buffered by the * decoding application. Note that this value is expressed in units of * time (milliseconds). For example, a value of 5000 indicates that the * client will buffer (at least) 5000ms worth of encoded data. Use the * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if * necessary. 
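 *
 * As an illustrative sketch (values are examples only, not recommendations;
 * `cfg` is a hypothetical aom_codec_enc_cfg_t), a CBR stream with a simple
 * decoder buffer model could be configured together with the two related
 * fields below:
 *
 *     cfg.rc_end_usage      = AOM_CBR;
 *     cfg.rc_target_bitrate = 1000;   // kilobits per second
 *     cfg.rc_buf_sz         = 6000;   // client buffers up to 6000 ms
 *     cfg.rc_buf_initial_sz = 4000;   // 4000 ms buffered before playback
 *     cfg.rc_buf_optimal_sz = 5000;   // encoder aims to keep about 5000 ms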
*/ unsigned int rc_buf_sz; /*!\brief Decoder Buffer Initial Size * * This value indicates the amount of data that will be buffered by the * decoding application prior to beginning playback. This value is * expressed in units of time (milliseconds). Use the target bitrate * (#rc_target_bitrate) to convert to bits/bytes, if necessary. */ unsigned int rc_buf_initial_sz; /*!\brief Decoder Buffer Optimal Size * * This value indicates the amount of data that the encoder should try * to maintain in the decoder's buffer. This value is expressed in units * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) * to convert to bits/bytes, if necessary. */ unsigned int rc_buf_optimal_sz; /* * 2 pass rate control parameters */ /*!\brief Two-pass mode CBR/VBR bias * * Bias, expressed on a scale of 0 to 100, for determining target size * for the current frame. The value 0 indicates the optimal CBR mode * value should be used. The value 100 indicates the optimal VBR mode * value should be used. Values in between indicate which way the * encoder should "lean." */ unsigned int rc_2pass_vbr_bias_pct; /*!\brief Two-pass mode per-GOP minimum bitrate * * This value, expressed as a percentage of the target bitrate, indicates * the minimum bitrate to be used for a single GOP (aka "section") */ unsigned int rc_2pass_vbr_minsection_pct; /*!\brief Two-pass mode per-GOP maximum bitrate * * This value, expressed as a percentage of the target bitrate, indicates * the maximum bitrate to be used for a single GOP (aka "section") */ unsigned int rc_2pass_vbr_maxsection_pct; /* * keyframing settings (kf) */ /*!\brief Option to enable forward reference key frame * */ int fwd_kf_enabled; /*!\brief Keyframe placement mode * * This value indicates whether the encoder should place keyframes at a * fixed interval, or determine the optimal placement automatically * (as governed by the #kf_min_dist and #kf_max_dist parameters) */ enum aom_kf_mode kf_mode; /*!\brief Keyframe minimum interval * * This value, expressed as a number of frames, prevents the encoder from * placing a keyframe nearer than kf_min_dist to the previous keyframe. At * least kf_min_dist frames non-keyframes will be coded before the next * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. */ unsigned int kf_min_dist; /*!\brief Keyframe maximum interval * * This value, expressed as a number of frames, forces the encoder to code * a keyframe if one has not been coded in the last kf_max_dist frames. * A value of 0 implies all frames will be keyframes. Set kf_min_dist * equal to kf_max_dist for a fixed interval. */ unsigned int kf_max_dist; /*!\brief sframe interval * * This value, expressed as a number of frames, forces the encoder to code * an S-Frame every sframe_dist frames. */ unsigned int sframe_dist; /*!\brief sframe insertion mode * * This value must be set to 1 or 2, and tells the encoder how to insert * S-Frames. It will only have an effect if sframe_dist != 0. * * If altref is enabled: * - if sframe_mode == 1, the considered frame will be made into an * S-Frame only if it is an altref frame * - if sframe_mode == 2, the next altref frame will be made into an * S-Frame. * * Otherwise: the considered frame will be made into an S-Frame. * * \attention Not implemented. */ unsigned int sframe_mode; /*!\brief Tile coding mode * * This value indicates the tile coding mode. * A value of 0 implies a normal non-large-scale tile coding. A value of 1 * implies a large-scale tile coding. 
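 *
 * Looking back at the keyframing fields above, an illustrative (not
 * normative) configuration, with `cfg` a hypothetical aom_codec_enc_cfg_t:
 *
 *     cfg.kf_mode = AOM_KF_AUTO;
 *     cfg.kf_min_dist = 0;     // keyframes may be placed anywhere ...
 *     cfg.kf_max_dist = 150;   // ... but at most 150 frames apart
 *     cfg.fwd_kf_enabled = 1;  // also allow forward reference key frames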
*/ unsigned int large_scale_tile; /*!\brief Monochrome mode * * If this is nonzero, the encoder will generate a monochrome stream * with no chroma planes. */ unsigned int monochrome; /*!\brief full_still_picture_hdr * * If this is nonzero, the encoder will generate a full header * (reduced_still_picture_header is set to 0 in the sequence header OBU) even * for still picture encoding. If this is zero (the default), a reduced * header (reduced_still_picture_header is set to 1 in the sequence header * OBU) is used for still picture encoding. This flag has no effect when a * regular video with more than a single frame is encoded. */ unsigned int full_still_picture_hdr; /*!\brief Bitstream syntax mode * * This value indicates the bitstream syntax mode. * A value of 0 indicates bitstream is saved as Section 5 bitstream. A value * of 1 indicates the bitstream is saved in Annex-B format */ unsigned int save_as_annexb; /*!\brief Number of explicit tile widths specified * * This value indicates the number of tile widths specified * A value of 0 implies no tile widths are specified. * Tile widths are given in the array tile_widths[] */ int tile_width_count; /*!\brief Number of explicit tile heights specified * * This value indicates the number of tile heights specified * A value of 0 implies no tile heights are specified. * Tile heights are given in the array tile_heights[] */ int tile_height_count; /*!\brief Maximum number of tile widths in tile widths array * * This define gives the maximum number of elements in the tile_widths array. */ #define MAX_TILE_WIDTHS 64 // maximum tile width array length /*!\brief Array of specified tile widths * * This array specifies tile widths (and may be empty) * The number of widths specified is given by tile_width_count */ int tile_widths[MAX_TILE_WIDTHS]; /*!\brief Maximum number of tile heights in tile heights array. * * This define gives the maximum number of elements in the tile_heights array. */ #define MAX_TILE_HEIGHTS 64 // maximum tile height array length /*!\brief Array of specified tile heights * * This array specifies tile heights (and may be empty) * The number of heights specified is given by tile_height_count */ int tile_heights[MAX_TILE_HEIGHTS]; /*!\brief Whether encoder should use fixed QP offsets. * * If a value of 1 is provided, encoder will use fixed QP offsets for frames * at different levels of the pyramid. * If a value of 0 is provided, encoder will NOT use fixed QP offsets. * Note: This option is only relevant for --end-usage=q. */ unsigned int use_fixed_qp_offsets; /*!\brief Deprecated and ignored. DO NOT USE. * * TODO(aomedia:3269): Remove fixed_qp_offsets in libaom v4.0.0. */ int fixed_qp_offsets[5]; /*!\brief Options defined per config file * */ cfg_options_t encoder_cfg; } aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */ /*!\brief Initialize an encoder instance * * Initializes an encoder context using the given interface. Applications * should call the aom_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. * * If the library was configured with -DCONFIG_MULTITHREAD=0, this call * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * * If aom_codec_enc_init_ver() fails, it is not necessary to call * aom_codec_destroy() on the encoder context. * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. 
* \param[in] cfg Configuration to use, if known. * \param[in] flags Bitfield of AOM_CODEC_USE_* flags * \param[in] ver ABI version number. Must be set to * AOM_ENCODER_ABI_VERSION * \retval #AOM_CODEC_OK * The encoder algorithm has been initialized. * \retval #AOM_CODEC_MEM_ERROR * Memory allocation failed. */ aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, const aom_codec_enc_cfg_t *cfg, aom_codec_flags_t flags, int ver); /*!\brief Convenience macro for aom_codec_enc_init_ver() * * Ensures the ABI version parameter is properly set. */ #define aom_codec_enc_init(ctx, iface, cfg, flags) \ aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION) /*!\brief Get the default configuration for a usage. * * Initializes an encoder configuration structure with default values. Supports * the notion of "usages" so that an algorithm may offer different default * settings depending on the user's intended goal. This function \ref SHOULD * be called by all applications to initialize the configuration structure * before specializing the configuration with application specific values. * * \param[in] iface Pointer to the algorithm interface to use. * \param[out] cfg Configuration buffer to populate. * \param[in] usage Algorithm specific usage value. For AV1, must be * set to AOM_USAGE_GOOD_QUALITY (0), * AOM_USAGE_REALTIME (1), or AOM_USAGE_ALL_INTRA (2). * * \retval #AOM_CODEC_OK * The configuration was populated. * \retval #AOM_CODEC_INCAPABLE * Interface is not an encoder interface. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, or the usage value was not recognized. */ aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg, unsigned int usage); /*!\brief Set or change configuration * * Reconfigures an encoder instance according to the given configuration. * * \param[in] ctx Pointer to this instance's context * \param[in] cfg Configuration buffer to use * * \retval #AOM_CODEC_OK * The configuration was populated. * \retval #AOM_CODEC_INCAPABLE * Interface is not an encoder interface. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, or the usage value was not recognized. */ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, const aom_codec_enc_cfg_t *cfg); /*!\brief Get global stream headers * * Retrieves a stream level global header packet, if supported by the codec. * Calls to this function should be deferred until all configuration information * has been passed to libaom. Otherwise the global header data may be * invalidated by additional configuration changes. * * The AV1 implementation of this function returns an OBU. The OBU returned is * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is * set, and the buffer contains the obu_size field for the returned OBU. * * \param[in] ctx Pointer to this instance's context * * \retval NULL * Encoder does not support global header, or an error occurred while * generating the global header. * * \retval Non-NULL * Pointer to buffer containing global header packet. The caller owns the * memory associated with this buffer, and must free the 'buf' member of the * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned * must be freed via call to free(). */ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx); /*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */ #define AOM_USAGE_GOOD_QUALITY 0u /*!\brief usage parameter analogous to AV1 REALTIME mode. 
*/ #define AOM_USAGE_REALTIME 1u /*!\brief usage parameter analogous to AV1 all intra mode. */ #define AOM_USAGE_ALL_INTRA 2u /*!\brief Encode a frame * * Encodes a video frame at the given "presentation time." The presentation * time stamp (PTS) \ref MUST be strictly increasing. * * When the last frame has been passed to the encoder, this function should * continue to be called in a loop, with the img parameter set to NULL. This * will signal the end-of-stream condition to the encoder and allow it to * encode any held buffers. Encoding is complete when aom_codec_encode() is * called with img set to NULL and aom_codec_get_cx_data() returns no data. * * \param[in] ctx Pointer to this instance's context * \param[in] img Image data to encode, NULL to flush. * Encoding sample values outside the range * [0..(1<bit_depth)-1] is undefined behavior. * Note: Although img is declared as a const pointer, * if AV1E_SET_DENOISE_NOISE_LEVEL is set to a nonzero * value aom_codec_encode() modifies (denoises) the * samples in img->planes[i] . * \param[in] pts Presentation time stamp, in timebase units. If img * is NULL, pts is ignored. * \param[in] duration Duration to show frame, in timebase units. If img * is not NULL, duration must be nonzero. If img is * NULL, duration is ignored. * \param[in] flags Flags to use for encoding this frame. * * \retval #AOM_CODEC_OK * The configuration was populated. * \retval #AOM_CODEC_INCAPABLE * Interface is not an encoder interface. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. * * \note * `duration` is of the unsigned long type, which can be 32 or 64 bits. * `duration` must be less than or equal to UINT32_MAX so that its range is * independent of the size of unsigned long. */ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, aom_enc_frame_flags_t flags); /*!\brief Set compressed data output buffer * * Sets the buffer that the codec should output the compressed data * into. This call effectively sets the buffer pointer returned in the * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be * appended into this buffer. The buffer is preserved across frames, * so applications must periodically call this function after flushing * the accumulated compressed data to disk or to the network to reset * the pointer to the buffer's head. * * `pad_before` bytes will be skipped before writing the compressed * data, and `pad_after` bytes will be appended to the packet. The size * of the packet will be the sum of the size of the actual compressed * data, pad_before, and pad_after. The padding bytes will be preserved * (not overwritten). * * Note that calling this function does not guarantee that the returned * compressed data will be placed into the specified buffer. In the * event that the encoded data will not fit into the buffer provided, * the returned packet \ref MAY point to an internal buffer, as it would * if this call were never used. In this event, the output packet will * NOT have any padding, and the application must free space and copy it * to the proper place. This is of particular note in configurations * that may output multiple packets for a single encoded frame (e.g., lagged * encoding) or if the application does not reset the buffer periodically. * * Applications may restore the default behavior of the codec providing * the compressed data buffer by calling this function with a NULL * buffer. 
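 *
 * For orientation, a minimal encode loop that simply relies on this default
 * buffering is sketched below. It is illustrative only and not part of the
 * original header: read_frame(), want_kf(), write_to_muxer() and num_frames
 * are hypothetical, aom_codec_av1_cx() comes from aomcx.h, and error
 * checking is omitted.
 *
 *     aom_codec_iface_t *iface = aom_codec_av1_cx();
 *     aom_codec_enc_cfg_t cfg;
 *     aom_codec_ctx_t codec;
 *     aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME);
 *     cfg.g_w = 640;
 *     cfg.g_h = 480;
 *     aom_codec_enc_init(&codec, iface, &cfg, 0);
 *
 *     for (int n = 0; n < num_frames; ++n) {
 *       const aom_image_t *img = read_frame(n);   // fills an aom_image_t
 *       aom_codec_encode(&codec, img, n, 1,
 *                        want_kf(n) ? AOM_EFLAG_FORCE_KF : 0);
 *       aom_codec_iter_t iter = NULL;
 *       const aom_codec_cx_pkt_t *pkt;
 *       while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
 *         if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
 *           write_to_muxer(pkt->data.frame.buf, pkt->data.frame.sz);
 *       }
 *     }
 *
 *     // Flush: pass img == NULL until no more packets are produced.
 *     int got_data;
 *     do {
 *       got_data = 0;
 *       aom_codec_encode(&codec, NULL, -1, 1, 0);
 *       aom_codec_iter_t iter = NULL;
 *       const aom_codec_cx_pkt_t *pkt;
 *       while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
 *         got_data = 1;
 *         if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
 *           write_to_muxer(pkt->data.frame.buf, pkt->data.frame.sz);
 *       }
 *     } while (got_data);
 *     aom_codec_destroy(&codec);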
* * Applications \ref MUSTNOT call this function during iteration of * aom_codec_get_cx_data(). * * \param[in] ctx Pointer to this instance's context * \param[in] buf Buffer to store compressed data into * \param[in] pad_before Bytes to skip before writing compressed data * \param[in] pad_after Bytes to skip after writing compressed data * * \retval #AOM_CODEC_OK * The buffer was set successfully. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. */ aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, const aom_fixed_buf_t *buf, unsigned int pad_before, unsigned int pad_after); /*!\brief Encoded data iterator * * Iterates over a list of data packets to be passed from the encoder to the * application. The different kinds of packets available are enumerated in * #aom_codec_cx_pkt_kind. * * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's * muxer. Multiple compressed frames may be in the list. * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer. * * The application \ref MUST silently ignore any packet kinds that it does * not recognize or support. * * The data buffers returned from this function are only guaranteed to be * valid until the application makes another call to any aom_codec_* function. * * \param[in] ctx Pointer to this instance's context * \param[in,out] iter Iterator storage, initialized to NULL * * \return Returns a pointer to an output data packet (compressed frame data, * two-pass statistics, etc.) or NULL to signal end-of-list. * */ const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter); /*!\brief Get Preview Frame * * Returns an image that can be used as a preview. Shows the image as it would * exist at the decompressor. The application \ref MUST NOT write into this * image buffer. * * \param[in] ctx Pointer to this instance's context * * \return Returns a pointer to a preview image, or NULL if no image is * available. * */ const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx); /*!@} - end defgroup encoder*/ #ifdef __cplusplus } #endif #endif // AOM_AOM_AOM_ENCODER_H_ aom-3.12.1/aom/aom_external_partition.h000066400000000000000000000427411477627663500201050ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_ #define AOM_AOM_AOM_EXTERNAL_PARTITION_H_ /*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder * \ingroup aom * * @{ */ #include /*!\file * \brief Provides function pointer definitions for the external partition. * * \note The external partition API should be considered experimental. Until the * external partition API is declared stable, breaking changes may be made to * this API in a future libaom release. */ /*!\brief Current ABI version number * * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. 
*/ #define AOM_EXT_PART_ABI_VERSION 8 #ifdef __cplusplus extern "C" { #endif /*!\brief Abstract external partition model handler */ typedef void *aom_ext_part_model_t; /*!\brief Number of features to determine whether to skip partition none and * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT". */ #define AOM_EXT_PART_SIZE_DIRECT_SPLIT 17 /*!\brief Number of features to use simple motion search to prune out * rectangular partition in some direction. The same as * "FEATURE_SIZE_SMS_PRUNE_PART". */ #define AOM_EXT_PART_SIZE_PRUNE_PART 25 /*!\brief Number of features to prune split and rectangular partition * after PARTITION_NONE. */ #define AOM_EXT_PART_SIZE_PRUNE_NONE 4 /*!\brief Number of features to terminates partition after partition none using * simple_motion_search features and the rate, distortion, and rdcost of * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE". */ #define AOM_EXT_PART_SIZE_TERM_NONE 28 /*!\brief Number of features to terminates partition after partition split. */ #define AOM_EXT_PART_SIZE_TERM_SPLIT 31 /*!\brief Number of features to prune rectangular partition using stats * collected after partition split. */ #define AOM_EXT_PART_SIZE_PRUNE_RECT 9 /*!\brief Number of features to prune AB partition using stats * collected after rectangular partition.. */ #define AOM_EXT_PART_SIZE_PRUNE_AB 10 /*!\brief Number of features to prune 4-way partition using stats * collected after AB partition. */ #define AOM_EXT_PART_SIZE_PRUNE_4_WAY 18 /*!\brief Decision mode of the external partition model. * AOM_EXT_PART_WHOLE_TREE: the external partition model should provide the * whole partition tree for the superblock. * * AOM_EXT_PART_RECURSIVE: the external partition model provides the partition * decision of the current block only. The decision process starts from * the superblock size, down to the smallest block size (4x4) recursively. */ typedef enum aom_ext_part_decision_mode { AOM_EXT_PART_WHOLE_TREE = 0, AOM_EXT_PART_RECURSIVE = 1, } aom_ext_part_decision_mode_t; /*!\brief Config information sent to the external partition model. * * For example, the maximum superblock size determined by the sequence header. */ typedef struct aom_ext_part_config { int superblock_size; ///< super block size (either 64x64 or 128x128) } aom_ext_part_config_t; /*!\brief Features pass to the external model to make partition decisions. * Specifically, features collected before NONE partition. * Features "f" are used to determine: * partition_none_allowed, partition_horz_allowed, partition_vert_allowed, * do_rectangular_split, do_square_split * Features "f_part2" are used to determine: * prune_horz, prune_vert. */ typedef struct aom_partition_features_before_none { /*! features to determine whether skip partition none and do split directly */ float f[AOM_EXT_PART_SIZE_DIRECT_SPLIT]; /*! features to determine whether to prune rectangular partition */ float f_part2[AOM_EXT_PART_SIZE_PRUNE_PART]; } aom_partition_features_before_none_t; /*!\brief Features pass to the external model to make partition decisions. * Specifically, features collected after NONE partition. */ typedef struct aom_partition_features_none { /*! features to prune split and rectangular partition */ float f[AOM_EXT_PART_SIZE_PRUNE_NONE]; /*! features to determine termination of partition */ float f_terminate[AOM_EXT_PART_SIZE_TERM_NONE]; } aom_partition_features_none_t; /*!\brief Features pass to the external model to make partition decisions. * Specifically, features collected after SPLIT partition. 
*/ typedef struct aom_partition_features_split { /*! features to determine termination of partition */ float f_terminate[AOM_EXT_PART_SIZE_TERM_SPLIT]; /*! features to determine pruning rect partition */ float f_prune_rect[AOM_EXT_PART_SIZE_PRUNE_RECT]; } aom_partition_features_split_t; /*!\brief Features pass to the external model to make partition decisions. * Specifically, features collected after RECTANGULAR partition. */ typedef struct aom_partition_features_rect { /*! features to determine pruning AB partition */ float f[AOM_EXT_PART_SIZE_PRUNE_AB]; } aom_partition_features_rect_t; /*!\brief Features pass to the external model to make partition decisions. * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A, * VERT_B. */ typedef struct aom_partition_features_ab { /*! features to determine pruning 4-way partition */ float f[AOM_EXT_PART_SIZE_PRUNE_4_WAY]; } aom_partition_features_ab_t; /*!\brief Feature id to tell the external model the current stage in partition * pruning and what features to use to make decisions accordingly. */ typedef enum { AOM_EXT_PART_FEATURE_BEFORE_NONE, AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2, AOM_EXT_PART_FEATURE_AFTER_NONE, AOM_EXT_PART_FEATURE_AFTER_NONE_PART2, AOM_EXT_PART_FEATURE_AFTER_SPLIT, AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2, AOM_EXT_PART_FEATURE_AFTER_RECT, AOM_EXT_PART_FEATURE_AFTER_AB } AOM_EXT_PART_FEATURE_ID; /*!\brief Features collected from the tpl process. * * The tpl process collects information that help measure the inter-frame * dependency. * The tpl process is computed in the unit of tpl_bsize_1d (16x16). * Therefore, the max number of units inside a superblock is * 128x128 / (16x16) = 64. Change it if the tpl process changes. */ typedef struct aom_sb_tpl_features { int available; ///< If tpl stats are available int tpl_unit_length; ///< The block length of tpl process int num_units; ///< The number of units inside the current superblock int64_t intra_cost[64]; ///< The intra cost of each unit int64_t inter_cost[64]; ///< The inter cost of each unit int64_t mc_dep_cost[64]; ///< The motion compensated dependency cost } aom_sb_tpl_features_t; /*!\brief Features collected from the simple motion process. * * The simple motion process collects information by applying motion compensated * prediction on each block. * The block size is 16x16, which could be changed. If it is changed, update * comments and the array size here. */ typedef struct aom_sb_simple_motion_features { int unit_length; ///< The block length of the simple motion process int num_units; ///< The number of units inside the current superblock int block_sse[64]; ///< Sum of squared error of each unit int block_var[64]; ///< Variance of each unit } aom_sb_simple_motion_features_t; /*!\brief Features of each super block. * * Features collected for each super block before partition search. */ typedef struct aom_sb_features { /*! Features from motion search */ aom_sb_simple_motion_features_t motion_features; /*! Features from tpl process */ aom_sb_tpl_features_t tpl_features; } aom_sb_features_t; /*!\brief Features pass to the external model to make partition decisions. * * The encoder sends these features to the external model through * "func()" defined in ..... * * NOTE: new member variables may be added to this structure in the future. * Once new features are finalized, bump the major version of libaom. */ typedef struct aom_partition_features { // Features for the current supervised multi-stage ML model. /*! 
Feature ID to indicate active features */ AOM_EXT_PART_FEATURE_ID id; /*! Features collected before NONE partition */ aom_partition_features_before_none_t before_part_none; /*! Features collected after NONE partition */ aom_partition_features_none_t after_part_none; /*! Features collected after SPLIT partition */ aom_partition_features_split_t after_part_split; /*! Features collected after RECTANGULAR partition */ aom_partition_features_rect_t after_part_rect; /*! Features collected after AB partition */ aom_partition_features_ab_t after_part_ab; // Features for a new ML model. aom_sb_features_t sb_features; ///< Features collected for the super block int mi_row; ///< Mi_row position of the block int mi_col; ///< Mi_col position of the block int frame_width; ///< Frame width int frame_height; ///< Frame height int block_size; ///< As "BLOCK_SIZE" in av1/common/enums.h /*! * Valid partition types. A bitmask is used. "1" represents the * corresponding type is valid. The bitmask follows the enum order for * PARTITION_TYPE in "enums.h" to represent one partition type at a bit. * For example, 0x01 stands for only PARTITION_NONE is valid, * 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid. */ int valid_partition_types; int update_type; ///< Frame update type, defined in ratectrl.h int qindex; ///< Quantization index, range: [0, 255] int rdmult; ///< Rate-distortion multiplier int pyramid_level; ///< The level of this frame in the hierarchical structure int has_above_block; ///< Has above neighbor block int above_block_width; ///< Width of the above block, -1 if not exist int above_block_height; ///< Height of the above block, -1 if not exist int has_left_block; ///< Has left neighbor block int left_block_width; ///< Width of the left block, -1 if not exist int left_block_height; ///< Height of the left block, -1 if not exist /*! * The following parameters are collected from applying simple motion search. * Sum of squared error (SSE) and variance of motion compensated residual * are good indicators of block partitioning. * If a block is a square, we also apply motion search for its 4 sub blocks. * If not a square, their values are -1. * If a block is able to split horizontally, we apply motion search and get * stats for horizontal blocks. If not, their values are -1. * If a block is able to split vertically, we apply motion search and get * stats for vertical blocks. If not, their values are -1. */ unsigned int block_sse; ///< SSE of motion compensated residual unsigned int block_var; ///< Variance of motion compensated residual unsigned int sub_block_sse[4]; ///< SSE of sub blocks. unsigned int sub_block_var[4]; ///< Variance of sub blocks. unsigned int horz_block_sse[2]; ///< SSE of horz sub blocks unsigned int horz_block_var[2]; ///< Variance of horz sub blocks unsigned int vert_block_sse[2]; ///< SSE of vert sub blocks unsigned int vert_block_var[2]; ///< Variance of vert sub blocks /*! * The following parameters are calculated from tpl model. * If tpl model is not available, their values are -1. */ int64_t tpl_intra_cost; ///< Intra cost, ref to "TplDepStats" in tpl_model.h int64_t tpl_inter_cost; ///< Inter cost in tpl model int64_t tpl_mc_dep_cost; ///< Motion compensated dependency cost in tpl model } aom_partition_features_t; /*!\brief Partition decisions received from the external model. * * The encoder receives partition decisions and encodes the superblock * with the given partition type. * The encoder receives it from "func()" define in .... 
* * NOTE: new member variables may be added to this structure in the future. * Once new features are finalized, bump the major version of libaom. */ typedef struct aom_partition_decision { // Decisions for directly set partition types int is_final_decision; ///< The flag whether it's the final decision int num_nodes; ///< The number of leaf nodes int partition_decision[2048]; ///< Partition decisions int current_decision; ///< Partition decision for the current block // Decisions for partition type pruning int terminate_partition_search; ///< Terminate further partition search int partition_none_allowed; ///< Allow partition none type int partition_rect_allowed[2]; ///< Allow rectangular partitions int do_rectangular_split; ///< Try rectangular split partition int do_square_split; ///< Try square split partition int prune_rect_part[2]; ///< Prune rectangular partition int horza_partition_allowed; ///< Allow HORZ_A partition int horzb_partition_allowed; ///< Allow HORZ_B partition int verta_partition_allowed; ///< Allow VERT_A partition int vertb_partition_allowed; ///< Allow VERT_B partition int partition_horz4_allowed; ///< Allow HORZ4 partition int partition_vert4_allowed; ///< Allow VERT4 partition } aom_partition_decision_t; /*!\brief Encoding stats for the given partition decision. * * The encoding stats collected by encoding the superblock with the * given partition types. * The encoder sends the stats to the external model for training * or inference through "func()" defined in .... */ typedef struct aom_partition_stats { int rate; ///< Rate cost of the block int64_t dist; ///< Distortion of the block int64_t rdcost; ///< Rate-distortion cost of the block } aom_partition_stats_t; /*!\brief Enum for return status. */ typedef enum aom_ext_part_status { AOM_EXT_PART_OK = 0, ///< Status of success AOM_EXT_PART_ERROR = 1, ///< Status of failure AOM_EXT_PART_TEST = 2, ///< Status used for tests } aom_ext_part_status_t; /*!\brief Callback of creating an external partition model. * * The callback is invoked by the encoder to create an external partition * model. * * \param[in] priv Callback's private data * \param[in] part_config Config information pointer for model creation * \param[out] ext_part_model Pointer to the model */ typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)( void *priv, const aom_ext_part_config_t *part_config, aom_ext_part_model_t *ext_part_model); /*!\brief Callback of sending features to the external partition model. * * The callback is invoked by the encoder to send features to the external * partition model. * * \param[in] ext_part_model The external model * \param[in] part_features Pointer to the features */ typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)( aom_ext_part_model_t ext_part_model, const aom_partition_features_t *part_features); /*!\brief Callback of receiving partition decisions from the external * partition model. * * The callback is invoked by the encoder to receive partition decisions from * the external partition model. * * \param[in] ext_part_model The external model * \param[in] ext_part_decision Pointer to the partition decisions */ typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)( aom_ext_part_model_t ext_part_model, aom_partition_decision_t *ext_part_decision); /*!\brief Callback of sending stats to the external partition model. * * The callback is invoked by the encoder to send encoding stats to * the external partition model. 
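 *
 * As an illustrative sketch of how the whole callback set is wired up (the
 * my_* callbacks and model_state are hypothetical, and the argument type
 * taken by the control is an assumption rather than something stated in
 * this header), the callbacks are bundled in the aom_ext_part_funcs_t
 * structure defined below and registered with the encoder through
 * AV1E_SET_EXTERNAL_PARTITION_MODEL:
 *
 *     aom_ext_part_funcs_t funcs;
 *     funcs.create_model = my_create_model;
 *     funcs.send_features = my_send_features;
 *     funcs.get_partition_decision = my_get_decision;
 *     funcs.send_partition_stats = my_send_stats;
 *     funcs.delete_model = my_delete_model;
 *     funcs.decision_mode = AOM_EXT_PART_RECURSIVE;
 *     funcs.priv = model_state;
 *     aom_codec_control(&codec, AV1E_SET_EXTERNAL_PARTITION_MODEL, &funcs);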
* * \param[in] ext_part_model The external model * \param[in] ext_part_stats Pointer to the encoding stats */ typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)( aom_ext_part_model_t ext_part_model, const aom_partition_stats_t *ext_part_stats); /*!\brief Callback of deleting the external partition model. * * The callback is invoked by the encoder to delete the external partition * model. * * \param[in] ext_part_model The external model */ typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)( aom_ext_part_model_t ext_part_model); /*!\brief Callback function set for external partition model. * * Uses can enable external partition model by registering a set of * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL */ typedef struct aom_ext_part_funcs { /*! * Create an external partition model. */ aom_ext_part_create_model_fn_t create_model; /*! * Send features to the external partition model to make partition decisions. */ aom_ext_part_send_features_fn_t send_features; /*! * Get partition decisions from the external partition model. */ aom_ext_part_get_decision_fn_t get_partition_decision; /*! * Send stats of the current partition to the external model. */ aom_ext_part_send_partition_stats_fn_t send_partition_stats; /*! * Delete the external partition model. */ aom_ext_part_delete_model_fn_t delete_model; /*! * The decision mode of the model. */ aom_ext_part_decision_mode_t decision_mode; /*! * Private data for the external partition model. */ void *priv; } aom_ext_part_funcs_t; /*!@} - end defgroup aom_encoder*/ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_AOM_EXTERNAL_PARTITION_H_ aom-3.12.1/aom/aom_frame_buffer.h000066400000000000000000000063771477627663500166220ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOM_FRAME_BUFFER_H_ #define AOM_AOM_AOM_FRAME_BUFFER_H_ /*!\file * \brief Describes the decoder external frame buffer interface. */ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_integer.h" /*!\brief The maximum number of work buffers used by libaom. * Support maximum 4 threads to decode video in parallel. * Each thread will use one work buffer. * TODO(hkuang): Add support to set number of worker threads dynamically. */ #define AOM_MAXIMUM_WORK_BUFFERS 8 /*!\brief The maximum number of reference buffers that a AV1 encoder may use. */ #define AOM_MAXIMUM_REF_BUFFERS 8 /*!\brief External frame buffer * * This structure holds allocated frame buffers used by the decoder. */ typedef struct aom_codec_frame_buffer { uint8_t *data; /**< Pointer to the data buffer */ size_t size; /**< Size of data in bytes */ void *priv; /**< Frame's private data */ } aom_codec_frame_buffer_t; /*!\brief get frame buffer callback prototype * * This callback is invoked by the decoder to retrieve data for the frame * buffer in order for the decode call to complete. The callback must * allocate at least min_size in bytes and assign it to fb->data. The callback * must zero out all the data allocated. 
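 *
 * As an illustrative aside (not part of the original documentation), a
 * minimal conforming pair of callbacks, using calloc/free from <stdlib.h>
 * and registered on the decoder side with
 * aom_codec_set_frame_buffer_functions() from aom_decoder.h, might look
 * like:
 *
 *     static int my_get_fb(void *priv, size_t min_size,
 *                          aom_codec_frame_buffer_t *fb) {
 *       (void)priv;
 *       fb->data = (uint8_t *)calloc(min_size, 1);  // zero-initialized
 *       if (!fb->data) return -1;
 *       fb->size = min_size;
 *       fb->priv = NULL;
 *       return 0;
 *     }
 *
 *     static int my_release_fb(void *priv, aom_codec_frame_buffer_t *fb) {
 *       (void)priv;
 *       free(fb->data);
 *       fb->data = NULL;
 *       return 0;
 *     }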
Then the callback must set fb->size * to the allocated size. The application does not need to align the allocated * data. The callback is triggered when the decoder needs a frame buffer to * decode a compressed image into. This function may be called more than once * for every call to aom_codec_decode. The application may set fb->priv to * some data which will be passed back in the aom_image_t and the release * function call. |fb| is guaranteed to not be NULL. On success the callback * must return 0. Any failure the callback must return a value less than 0. * * \param[in] priv Callback's private data * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to aom_codec_frame_buffer_t */ typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, aom_codec_frame_buffer_t *fb); /*!\brief release frame buffer callback prototype * * This callback is invoked by the decoder when the frame buffer is not * referenced by any other buffers. |fb| is guaranteed to not be NULL. On * success the callback must return 0. Any failure the callback must return * a value less than 0. * * \param[in] priv Callback's private data * \param[in] fb Pointer to aom_codec_frame_buffer_t */ typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv, aom_codec_frame_buffer_t *fb); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_AOM_FRAME_BUFFER_H_ aom-3.12.1/aom/aom_image.h000066400000000000000000000472561477627663500152620ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Describes the aom image descriptor and associated operations * */ #ifndef AOM_AOM_AOM_IMAGE_H_ #define AOM_AOM_AOM_IMAGE_H_ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_integer.h" /*!\brief Current ABI version number * * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ #define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/ #define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ #define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ /** 0x400 used to signal alpha channel, skipping for backwards compatibility. */ #define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */ /*!\brief List of supported image formats */ typedef enum aom_img_fmt { AOM_IMG_FMT_NONE, AOM_IMG_FMT_YV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2, AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with aom color space */ AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4, AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5, AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6, /*!\brief Allows detection of the presence of AOM_IMG_FMT_NV12 at compile time. 
*/ #define AOM_HAVE_IMG_FMT_NV12 1 AOM_IMG_FMT_NV12 = AOM_IMG_FMT_PLANAR | 7, /**< 4:2:0 with U and V interleaved */ AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH, AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH, AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH, } aom_img_fmt_t; /**< alias for enum aom_img_fmt */ /*!\brief List of supported color primaries */ typedef enum aom_color_primaries { AOM_CICP_CP_RESERVED_0 = 0, /**< For future use */ AOM_CICP_CP_BT_709 = 1, /**< BT.709 */ AOM_CICP_CP_UNSPECIFIED = 2, /**< Unspecified */ AOM_CICP_CP_RESERVED_3 = 3, /**< For future use */ AOM_CICP_CP_BT_470_M = 4, /**< BT.470 System M (historical) */ AOM_CICP_CP_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ AOM_CICP_CP_BT_601 = 6, /**< BT.601 */ AOM_CICP_CP_SMPTE_240 = 7, /**< SMPTE 240 */ AOM_CICP_CP_GENERIC_FILM = 8, /**< Generic film (color filters using illuminant C) */ AOM_CICP_CP_BT_2020 = 9, /**< BT.2020, BT.2100 */ AOM_CICP_CP_XYZ = 10, /**< SMPTE 428 (CIE 1921 XYZ) */ AOM_CICP_CP_SMPTE_431 = 11, /**< SMPTE RP 431-2 */ AOM_CICP_CP_SMPTE_432 = 12, /**< SMPTE EG 432-1 */ AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */ AOM_CICP_CP_EBU_3213 = 22, /**< EBU Tech. 3213-E */ AOM_CICP_CP_RESERVED_23 = 23 /**< For future use (values 23 - 255) */ } aom_color_primaries_t; /**< alias for enum aom_color_primaries */ /*!\brief List of supported transfer functions */ typedef enum aom_transfer_characteristics { AOM_CICP_TC_RESERVED_0 = 0, /**< For future use */ AOM_CICP_TC_BT_709 = 1, /**< BT.709 */ AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */ AOM_CICP_TC_RESERVED_3 = 3, /**< For future use */ AOM_CICP_TC_BT_470_M = 4, /**< BT.470 System M (historical) */ AOM_CICP_TC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ AOM_CICP_TC_BT_601 = 6, /**< BT.601 */ AOM_CICP_TC_SMPTE_240 = 7, /**< SMPTE 240 M */ AOM_CICP_TC_LINEAR = 8, /**< Linear */ AOM_CICP_TC_LOG_100 = 9, /**< Logarithmic (100 : 1 range) */ AOM_CICP_TC_LOG_100_SQRT10 = 10, /**< Logarithmic (100 * Sqrt(10) : 1 range) */ AOM_CICP_TC_IEC_61966 = 11, /**< IEC 61966-2-4 */ AOM_CICP_TC_BT_1361 = 12, /**< BT.1361 */ AOM_CICP_TC_SRGB = 13, /**< sRGB or sYCC*/ AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */ AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */ AOM_CICP_TC_SMPTE_2084 = 16, /**< SMPTE ST 2084, ITU BT.2100 PQ */ AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ } aom_transfer_characteristics_t; /**< alias for enum aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { AOM_CICP_MC_IDENTITY = 0, /**< Identity matrix */ AOM_CICP_MC_BT_709 = 1, /**< BT.709 */ AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */ AOM_CICP_MC_RESERVED_3 = 3, /**< For future use */ AOM_CICP_MC_FCC = 4, /**< US FCC 73.628 */ AOM_CICP_MC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ AOM_CICP_MC_BT_601 = 6, /**< BT.601 */ AOM_CICP_MC_SMPTE_240 = 7, /**< SMPTE 240 M */ AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */ AOM_CICP_MC_BT_2020_NCL = 9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */ AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */ AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */ AOM_CICP_MC_CHROMAT_NCL = 12, /**< Chromaticity-derived non-constant luminance */ 
AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ } aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { AOM_CR_STUDIO_RANGE = 0, /**<- Y [16..235], UV [16..240] (bit depth 8) */ /**<- Y [64..940], UV [64..960] (bit depth 10) */ /**<- Y [256..3760], UV [256..3840] (bit depth 12) */ AOM_CR_FULL_RANGE = 1 /**<- YUV/RGB [0..255] (bit depth 8) */ /**<- YUV/RGB [0..1023] (bit depth 10) */ /**<- YUV/RGB [0..4095] (bit depth 12) */ } aom_color_range_t; /**< alias for enum aom_color_range */ /*!\brief List of chroma sample positions */ typedef enum aom_chroma_sample_position { AOM_CSP_UNKNOWN = 0, /**< Unknown */ AOM_CSP_VERTICAL = 1, /**< Horizontally co-located with luma(0, 0)*/ /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ } aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position */ /*!\brief List of insert flags for Metadata * * These flags control how the library treats metadata during encode. * * While encoding, when metadata is added to an aom_image via * aom_img_add_metadata(), the flag passed along with the metadata will * determine where the metadata OBU will be placed in the encoded OBU stream. * Metadata will be emitted into the output stream within the next temporal unit * if it satisfies the specified insertion flag. * * During decoding, when the library encounters a metadata OBU, it is always * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image. */ typedef enum aom_metadata_insert_flags { AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not keyframe */ AOM_MIF_KEY_FRAME = 1, /**< Adds metadata only if it's a keyframe */ AOM_MIF_ANY_FRAME = 2 /**< Adds metadata to any type of frame */ } aom_metadata_insert_flags_t; /*!\brief Array of aom_metadata structs for an image. */ typedef struct aom_metadata_array aom_metadata_array_t; /*!\brief Metadata payload. */ typedef struct aom_metadata { uint32_t type; /**< Metadata type */ uint8_t *payload; /**< Metadata payload data */ size_t sz; /**< Metadata payload size */ aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */ } aom_metadata_t; /**\brief Image Descriptor */ typedef struct aom_image { aom_img_fmt_t fmt; /**< Image Format */ aom_color_primaries_t cp; /**< CICP Color Primaries */ aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */ aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */ int monochrome; /**< Whether image is monochrome */ aom_chroma_sample_position_t csp; /**< chroma sample position */ aom_color_range_t range; /**< Color Range */ /* Image storage dimensions */ unsigned int w; /**< Stored image width */ unsigned int h; /**< Stored image height */ unsigned int bit_depth; /**< Stored image bit-depth */ /* Image display dimensions */ unsigned int d_w; /**< Displayed image width */ unsigned int d_h; /**< Displayed image height */ /* Image intended rendering dimensions */ unsigned int r_w; /**< Intended rendering image width */ unsigned int r_h; /**< Intended rendering image height */ /* Chroma subsampling info */ unsigned int x_chroma_shift; /**< subsampling order, X */ unsigned int y_chroma_shift; /**< subsampling order, Y */ /* Image data pointers. 
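 * For example (an illustrative note, assuming `img` points at a valid
 * aom_image_t), row y of the luma plane starts at
 *
 *     img->planes[AOM_PLANE_Y] + (ptrdiff_t)y * img->stride[AOM_PLANE_Y]
 *
 * and each sample occupies two bytes for AOM_IMG_FMT_HIGHBITDEPTH formats;
 * aom_img_plane_width() and aom_img_plane_height(), declared further below,
 * give the per-plane dimensions.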
*/ #define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */ #define AOM_PLANE_Y 0 /**< Y (Luminance) plane */ #define AOM_PLANE_U 1 /**< U (Chroma) plane */ #define AOM_PLANE_V 2 /**< V (Chroma) plane */ /* planes[AOM_PLANE_V] = NULL and stride[AOM_PLANE_V] = 0 when fmt == * AOM_IMG_FMT_NV12 */ unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */ int stride[3]; /**< stride between rows for each plane */ size_t sz; /**< data size */ int bps; /**< bits per sample (for packed formats) */ int temporal_id; /**< Temporal layer Id of image */ int spatial_id; /**< Spatial layer Id of image */ /*!\brief The following member may be set by the application to associate * data with this image. */ void *user_priv; /* The following members should be treated as private. */ unsigned char *img_data; /**< private */ int img_data_owner; /**< private */ int self_allocd; /**< private */ aom_metadata_array_t *metadata; /**< Metadata payloads associated with the image. */ void *fb_priv; /**< Frame buffer data associated with the image. */ } aom_image_t; /**< alias for struct aom_image */ /*!\brief Open a descriptor, allocating storage for the underlying image * * Returns a descriptor for storing an image of the given format. The * storage for the image is allocated on the heap. * * \param[in] img Pointer to storage for descriptor. If this parameter * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image * \param[in] d_w Width of the image. Must not exceed 0x08000000 * (2^27). * \param[in] d_h Height of the image. Must not exceed 0x08000000 * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and * each row in the image (stride). Must not exceed * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. */ aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align); /*!\brief Open a descriptor, using existing storage for the underlying image * * Returns a descriptor for storing an image of the given format. The * storage for the image has been allocated elsewhere, and a descriptor is * desired to "wrap" that storage. * * \param[in] img Pointer to storage for descriptor. If this parameter * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image * \param[in] d_w Width of the image. Must not exceed 0x08000000 * (2^27). * \param[in] d_h Height of the image. Must not exceed 0x08000000 * (2^27). * \param[in] align Alignment, in bytes, of each row in the image * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image. The storage must * outlive the returned image descriptor; it can be * disposed of after calling aom_img_free(). * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. */ aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align, unsigned char *img_data); /*!\brief Open a descriptor, allocating storage for the underlying image with a * border * * Returns a descriptor for storing an image of the given format and its * borders. The storage for the image is allocated on the heap. * * \param[in] img Pointer to storage for descriptor. 
If this parameter * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image * \param[in] d_w Width of the image. Must not exceed 0x08000000 * (2^27). * \param[in] d_h Height of the image. Must not exceed 0x08000000 * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and * each row in the image (stride). Must not exceed * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. */ aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align, unsigned int size_align, unsigned int border); /*!\brief Set the rectangle identifying the displayed portion of the image * * Updates the displayed rectangle (aka viewport) on the image surface to * match the specified coordinates and size. Specifically, sets img->d_w, * img->d_h, and elements of the img->planes[] array. * * \param[in] img Image descriptor * \param[in] x leftmost column * \param[in] y topmost row * \param[in] w width * \param[in] h height * \param[in] border A border that is padded on four sides of the image. * * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise. */ int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h, unsigned int border); /*!\brief Flip the image vertically (top for bottom) * * Adjusts the image descriptor's pointers and strides to make the image * be referenced upside-down. * * \param[in] img Image descriptor */ void aom_img_flip(aom_image_t *img); /*!\brief Close an image descriptor * * Frees all allocated storage associated with an image descriptor. * * \param[in] img Image descriptor */ void aom_img_free(aom_image_t *img); /*!\brief Get the width of a plane * * Get the width of a plane of an image * * \param[in] img Image descriptor * \param[in] plane Plane index */ int aom_img_plane_width(const aom_image_t *img, int plane); /*!\brief Get the height of a plane * * Get the height of a plane of an image * * \param[in] img Image descriptor * \param[in] plane Plane index */ int aom_img_plane_height(const aom_image_t *img, int plane); /*!\brief Add metadata to image. * * Adds metadata to aom_image_t. * Function makes a copy of the provided data parameter. * Metadata insertion point is controlled by insert_flag. * * \param[in] img Image descriptor * \param[in] type Metadata type * \param[in] data Metadata contents * \param[in] sz Metadata contents size * \param[in] insert_flag Metadata insert flag * * \return Returns 0 on success. If img or data is NULL, sz is 0, or memory * allocation fails, it returns -1. */ int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag); /*!\brief Return a metadata payload stored within the image metadata array. * * Gets the metadata (aom_metadata_t) at the indicated index in the image * metadata array. * * \param[in] img Pointer to image descriptor to get metadata from * \param[in] index Metadata index to get from metadata array * * \return Returns a const pointer to the selected metadata, if img and/or index * is invalid, it returns NULL. 
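/*
 * Usage sketch for the allocation and plane helpers documented above. This is
 * an illustrative example, not part of the API: it allocates an 8-bit I420
 * image, fills every plane with a mid-gray value (an arbitrary choice), and
 * releases the buffer. The 32-byte row alignment is likewise an example value.
 */
#include <string.h>
#include "aom/aom_image.h"

static int fill_gray_i420(unsigned int w, unsigned int h) {
  aom_image_t *img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, w, h, 32);
  if (!img) return -1; /* allocation failed */
  for (int plane = 0; plane < 3; ++plane) {
    const int plane_w = aom_img_plane_width(img, plane);
    const int plane_h = aom_img_plane_height(img, plane);
    unsigned char *row = img->planes[plane];
    for (int y = 0; y < plane_h; ++y) {
      memset(row, 128, (size_t)plane_w); /* 8-bit samples: 1 byte per pixel */
      row += img->stride[plane];
    }
  }
  aom_img_free(img);
  return 0;
}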
*/ const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, size_t index); /*!\brief Return the number of metadata blocks within the image. * * Gets the number of metadata blocks contained within the provided image * metadata array. * * \param[in] img Pointer to image descriptor to get metadata number * from. * * \return Returns the size of the metadata array. If img or metadata is NULL, * it returns 0. */ size_t aom_img_num_metadata(const aom_image_t *img); /*!\brief Remove metadata from image. * * Removes all metadata in image metadata list and sets metadata list pointer * to NULL. * * \param[in] img Image descriptor */ void aom_img_remove_metadata(aom_image_t *img); /*!\brief Allocate memory for aom_metadata struct. * * Allocates storage for the metadata payload, sets its type and copies the * payload data into the aom_metadata struct. A metadata payload buffer of size * sz is allocated and sz bytes are copied from data into the payload buffer. * * \param[in] type Metadata type * \param[in] data Metadata data pointer * \param[in] sz Metadata size * \param[in] insert_flag Metadata insert flag * * \return Returns the newly allocated aom_metadata struct. If data is NULL, * sz is 0, or memory allocation fails, it returns NULL. */ aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag); /*!\brief Free metadata struct. * * Free metadata struct and its buffer. * * \param[in] metadata Metadata struct pointer */ void aom_img_metadata_free(aom_metadata_t *metadata); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_AOM_IMAGE_H_ aom-3.12.1/aom/aom_integer.h000066400000000000000000000041661477627663500156260ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOM_INTEGER_H_ #define AOM_AOM_AOM_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include // IWYU pragma: export /* Assume platforms have the C99 standard integer types. */ #if defined(__cplusplus) #if !defined(__STDC_FORMAT_MACROS) #define __STDC_FORMAT_MACROS #endif #if !defined(__STDC_LIMIT_MACROS) #define __STDC_LIMIT_MACROS #endif #endif // __cplusplus #include // IWYU pragma: export #include // IWYU pragma: export #if defined(__cplusplus) extern "C" { #endif // __cplusplus // Returns size of uint64_t when encoded using LEB128. size_t aom_uleb_size_in_bytes(uint64_t value); // Returns 0 on success, -1 on decode failure. // On success, 'value' stores the decoded LEB128 value and 'length' stores // the number of bytes decoded. int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, size_t *length); // Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure. int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, size_t *coded_size); // Encodes LEB128 integer to size specified. Returns 0 when successful, and -1 // upon failure. // Note: This will write exactly pad_to_size bytes; if the value cannot be // encoded in this many bytes, then this will fail. 
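/*
 * Usage sketch for the metadata helpers documented above (illustrative only).
 * The payload bytes are dummy values, and the metadata type 4 is assumed to
 * correspond to the ITU-T T.35 metadata OBU type in the AV1 specification.
 */
#include <stdint.h>
#include "aom/aom_image.h"

static int metadata_round_trip(aom_image_t *img) {
  const uint8_t payload[] = { 0xB5, 0x00, 0x3C, 0x01 }; /* dummy payload */
  if (aom_img_add_metadata(img, 4 /* assumed ITU-T T.35 */, payload,
                           sizeof(payload), AOM_MIF_ANY_FRAME) != 0) {
    return -1;
  }
  const size_t n = aom_img_num_metadata(img);
  for (size_t i = 0; i < n; ++i) {
    const aom_metadata_t *md = aom_img_get_metadata(img, i);
    if (!md || md->sz == 0) return -1;
  }
  aom_img_remove_metadata(img); /* frees every attached payload */
  return 0;
}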
int aom_uleb_encode_fixed_size(uint64_t value, size_t available, size_t pad_to_size, uint8_t *coded_value, size_t *coded_size); #if defined(__cplusplus) } // extern "C" #endif // __cplusplus #endif // AOM_AOM_AOM_INTEGER_H_ aom-3.12.1/aom/aomcx.h000066400000000000000000002222511477627663500144410ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_AOMCX_H_ #define AOM_AOM_AOMCX_H_ /*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder * \ingroup aom * * @{ */ #include "aom/aom.h" #include "aom/aom_encoder.h" #include "aom/aom_external_partition.h" /*!\file * \brief Provides definitions for using AOM or AV1 encoder algorithm within the * aom Codec Interface. * * Several interfaces are excluded with CONFIG_REALTIME_ONLY build: * Global motion * Warped motion * OBMC * TPL model * Loop restoration * * The following features are also disabled with CONFIG_REALTIME_ONLY: * AV1E_SET_QUANT_B_ADAPT * CNN * 4X rectangular blocks * 4X rectangular transform in intra prediction */ #ifdef __cplusplus extern "C" { #endif /*!\name Algorithm interface for AV1 * * This interface provides the capability to encode raw AV1 streams. *@{ */ /*!\brief A single instance of the AV1 encoder. *\deprecated This access mechanism is provided for backwards compatibility; * prefer aom_codec_av1_cx(). */ extern aom_codec_iface_t aom_codec_av1_cx_algo; /*!\brief The interface to the AV1 encoder. */ extern aom_codec_iface_t *aom_codec_av1_cx(void); /*!@} - end algorithm interface member group */ /* * Algorithm Flags */ /*!\brief Don't reference the last frame * * When this flag is set, the encoder will not use the last frame as a * predictor. When not set, the encoder will choose whether to use the * last frame or not automatically. */ #define AOM_EFLAG_NO_REF_LAST (1 << 16) /*!\brief Don't reference the last2 frame * * When this flag is set, the encoder will not use the last2 frame as a * predictor. When not set, the encoder will choose whether to use the * last2 frame or not automatically. */ #define AOM_EFLAG_NO_REF_LAST2 (1 << 17) /*!\brief Don't reference the last3 frame * * When this flag is set, the encoder will not use the last3 frame as a * predictor. When not set, the encoder will choose whether to use the * last3 frame or not automatically. */ #define AOM_EFLAG_NO_REF_LAST3 (1 << 18) /*!\brief Don't reference the golden frame * * When this flag is set, the encoder will not use the golden frame as a * predictor. When not set, the encoder will choose whether to use the * golden frame or not automatically. */ #define AOM_EFLAG_NO_REF_GF (1 << 19) /*!\brief Don't reference the alternate reference frame * * When this flag is set, the encoder will not use the alt ref frame as a * predictor. When not set, the encoder will choose whether to use the * alt ref frame or not automatically. */ #define AOM_EFLAG_NO_REF_ARF (1 << 20) /*!\brief Don't reference the bwd reference frame * * When this flag is set, the encoder will not use the bwd ref frame as a * predictor. 
When not set, the encoder will choose whether to use the * bwd ref frame or not automatically. */ #define AOM_EFLAG_NO_REF_BWD (1 << 21) /*!\brief Don't reference the alt2 reference frame * * When this flag is set, the encoder will not use the alt2 ref frame as a * predictor. When not set, the encoder will choose whether to use the * alt2 ref frame or not automatically. */ #define AOM_EFLAG_NO_REF_ARF2 (1 << 22) /*!\brief Don't update the last frame * * When this flag is set, the encoder will not update the last frame with * the contents of the current frame. */ #define AOM_EFLAG_NO_UPD_LAST (1 << 23) /*!\brief Don't update the golden frame * * When this flag is set, the encoder will not update the golden frame with * the contents of the current frame. */ #define AOM_EFLAG_NO_UPD_GF (1 << 24) /*!\brief Don't update the alternate reference frame * * When this flag is set, the encoder will not update the alt ref frame with * the contents of the current frame. */ #define AOM_EFLAG_NO_UPD_ARF (1 << 25) /*!\brief Disable entropy update * * When this flag is set, the encoder will not update its internal entropy * model based on the entropy of this frame. */ #define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26) /*!\brief Disable ref frame mvs * * When this flag is set, the encoder will not allow frames to * be encoded using mfmv. */ #define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27) /*!\brief Enable error resilient frame * * When this flag is set, the encoder will code frames as error * resilient. */ #define AOM_EFLAG_ERROR_RESILIENT (1 << 28) /*!\brief Enable s frame mode * * When this flag is set, the encoder will code frames as an * s frame. */ #define AOM_EFLAG_SET_S_FRAME (1 << 29) /*!\brief Force primary_ref_frame to PRIMARY_REF_NONE * * When this flag is set, the encoder will set a frame's primary_ref_frame * to PRIMARY_REF_NONE */ #define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30) /*!\brief AVx encoder control functions * * This set of macros define the control functions available for AVx * encoder interface. * The range of encode control ID is 7-229(max). * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) */ enum aome_enc_control_id { /*!\brief Codec control function to set which reference frame encoder can use, * int parameter. */ AOME_USE_REFERENCE = 7, /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t* * parameter. */ AOME_SET_ROI_MAP = 8, /*!\brief Codec control function to pass an Active map to encoder, * aom_active_map_t* parameter. */ AOME_SET_ACTIVEMAP = 9, /* NOTE: enum 10 unused */ /*!\brief Codec control function to set encoder scaling mode for the next * frame to be coded, aom_scaling_mode_t* parameter. */ AOME_SET_SCALEMODE = 11, /*!\brief Codec control function to set encoder spatial layer id, int * parameter. */ AOME_SET_SPATIAL_LAYER_ID = 12, /*!\brief Codec control function to set encoder internal speed settings, * int parameter * * Changes in this value influences the complexity of algorithms used in * encoding process, values greater than 0 will increase encoder speed at * the expense of quality. * * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest; * quality improves as speed decreases (since more compression * possibilities are explored). * * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, * AOM_USAGE_REALTIME treats 0..4 the same as 5. 
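/*
 * Usage sketch (illustrative only): the AOM_EFLAG_* bits defined above are
 * passed as the flags argument of aom_codec_encode(). This assumes an already
 * initialized encoder context and input image; the duration of 1 is an
 * arbitrary example value.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static aom_codec_err_t encode_non_updating_frame(aom_codec_ctx_t *enc,
                                                 const aom_image_t *img,
                                                 aom_codec_pts_t pts) {
  /* Prevent this frame from refreshing any reference buffer. */
  const aom_enc_frame_flags_t flags =
      AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
  return aom_codec_encode(enc, img, pts, 1 /* duration */, flags);
}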
*/ AOME_SET_CPUUSED = 13, /*!\brief Codec control function to enable automatic set and use alf frames, * unsigned int parameter * * - 0 = disable * - 1 = enable (default) */ AOME_SET_ENABLEAUTOALTREF = 14, /* NOTE: enum 15 unused */ /*!\brief Codec control function to set the sharpness parameter, * unsigned int parameter. * * This parameter controls the level at which rate-distortion optimization of * transform coefficients favors sharpness in the block. * * Valid range: 0..7. The default is 0. * * Values 1-7 will avoid eob and skip block optimization and will change * rdmult in favor of block sharpness. * * In all-intra mode: it also sets the `loop_filter_sharpness` syntax element * in the bitstream. Larger values increasingly reduce how much the filtering * can change the sample values on block edges to favor perceived sharpness. */ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16 /*!\brief Codec control function to set the threshold for MBs treated static, * unsigned int parameter */ AOME_SET_STATIC_THRESHOLD = 17, /* NOTE: enum 18 unused */ /*!\brief Codec control function to get last quantizer chosen by the encoder, * int* parameter * * Return value uses internal quantizer scale defined by the codec. */ AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2, // 19 /*!\brief Codec control function to get last quantizer chosen by the encoder, * int* parameter * * Return value uses the 0..63 scale as used by the rc_*_quantizer config * parameters. */ AOME_GET_LAST_QUANTIZER_64 = 20, /*!\brief Codec control function to set the max no of frames to create arf, * unsigned int parameter */ AOME_SET_ARNR_MAXFRAMES = 21, /*!\brief Codec control function to set the filter strength for the arf, * unsigned int parameter */ AOME_SET_ARNR_STRENGTH = 22, /* NOTE: enum 23 unused */ /*!\brief Codec control function to set visual tuning, aom_tune_metric (int) * parameter * * The default is AOM_TUNE_PSNR. */ AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2, // 24 /*!\brief Codec control function to set constrained / constant quality level, * unsigned int parameter * * Valid range: 0..63 * * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage * must be set to #AOM_CQ or #AOM_Q. */ AOME_SET_CQ_LEVEL = 25, /*!\brief Codec control function to set max data rate for intra frames, * unsigned int parameter * * This value controls additional clamping on the maximum size of a * keyframe. It is expressed as a percentage of the average * per-frame bitrate, with the special (and default) value 0 meaning * unlimited, or no additional clamping beyond the codec's built-in * algorithm. * * For example, to allocate no more than 4.5 frames worth of bitrate * to a keyframe, set this to 450. */ AOME_SET_MAX_INTRA_BITRATE_PCT = 26, /*!\brief Codec control function to set number of spatial layers, int * parameter */ AOME_SET_NUMBER_SPATIAL_LAYERS = 27, /*!\brief Codec control function to set max data rate for inter frames, * unsigned int parameter * * This value controls additional clamping on the maximum size of an * inter frame. It is expressed as a percentage of the average * per-frame bitrate, with the special (and default) value 0 meaning * unlimited, or no additional clamping beyond the codec's built-in * algorithm. * * For example, to allow no more than 4.5 frames worth of bitrate * to an inter frame, set this to 450. 
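/*
 * Usage sketch (illustrative only) for AOME_SET_CPUUSED and AOME_SET_CQ_LEVEL:
 * configure a constant-quality encode by selecting AOM_Q rate control and then
 * applying the two controls. Speed 6 and cq-level 30 are example values, not
 * recommendations.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static aom_codec_err_t init_cq_encoder(aom_codec_ctx_t *enc, unsigned int w,
                                       unsigned int h) {
  aom_codec_iface_t *iface = aom_codec_av1_cx();
  aom_codec_enc_cfg_t cfg;
  aom_codec_err_t res =
      aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY);
  if (res != AOM_CODEC_OK) return res;
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.rc_end_usage = AOM_Q; /* required for AOME_SET_CQ_LEVEL to take effect */
  res = aom_codec_enc_init(enc, iface, &cfg, 0);
  if (res != AOM_CODEC_OK) return res;
  aom_codec_control(enc, AOME_SET_CPUUSED, 6);
  aom_codec_control(enc, AOME_SET_CQ_LEVEL, 30);
  return AOM_CODEC_OK;
}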
*/ AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2, // 28 /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int * parameter * * This value controls the amount of boost given to Golden Frame in * CBR mode. It is expressed as a percentage of the average * per-frame bitrate, with the special (and default) value 0 meaning * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * * For example, to allow 100% more bits, i.e, 2X, in a golden frame * than average frame, set this to 100. */ AV1E_SET_GF_CBR_BOOST_PCT = 29, /* NOTE: enum 30 unused */ /*!\brief Codec control function to set lossless encoding mode, unsigned int * parameter * * AV1 can operate in lossless encoding mode, in which the bitstream * produced will be able to decode and reconstruct a perfect copy of * input source. * * - 0 = normal coding mode, may be lossy (default) * - 1 = lossless coding mode */ AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2, // 31 /*!\brief Codec control function to enable the row based multi-threading * of the encoder, unsigned int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ROW_MT = 32, /*!\brief Codec control function to set number of tile columns. unsigned int * parameter * * In encoding and decoding, AV1 allows an input image frame be partitioned * into separate vertical tile columns, which can be encoded or decoded * independently. This enables easy implementation of parallel encoding and * decoding. The parameter for this control describes the number of tile * columns (in log2 units), which has a valid range of [0, 6]: * \verbatim 0 = 1 tile column 1 = 2 tile columns 2 = 4 tile columns ..... n = 2**n tile columns \endverbatim * By default, the value is 0, i.e. one single column tile for entire image. */ AV1E_SET_TILE_COLUMNS = 33, /*!\brief Codec control function to set number of tile rows, unsigned int * parameter * * In encoding and decoding, AV1 allows an input image frame be partitioned * into separate horizontal tile rows, which can be encoded or decoded * independently. The parameter for this control describes the number of tile * rows (in log2 units), which has a valid range of [0, 6]: * \verbatim 0 = 1 tile row 1 = 2 tile rows 2 = 4 tile rows ..... n = 2**n tile rows \endverbatim * By default, the value is 0, i.e. one single row tile for entire image. */ AV1E_SET_TILE_ROWS = 34, /*!\brief Codec control function to enable RDO modulated by frame temporal * dependency, unsigned int parameter * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. */ AV1E_SET_ENABLE_TPL_MODEL = 35, /*!\brief Codec control function to enable temporal filtering on key frame, * unsigned int parameter * * - 0 = disable * - 1 = enable without overlay (default) * - 2 = enable with overlay */ AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36, /*!\brief Codec control function to enable frame parallel decoding feature, * unsigned int parameter * * AV1 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one * video frames in the decoder. This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. 
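/*
 * Usage sketch (illustrative only) for the tiling controls above. The tile
 * controls take log2 values, so these calls request 4 tile columns and 2 tile
 * rows, and enable row-based multi-threading. Assumes an initialized encoder
 * context.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void configure_tiles(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_TILE_COLUMNS, 2); /* 2^2 = 4 columns */
  aom_codec_control(enc, AV1E_SET_TILE_ROWS, 1);    /* 2^1 = 2 rows */
  aom_codec_control(enc, AV1E_SET_ROW_MT, 1);       /* row multi-threading */
}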
* * - 0 = disable (default) * - 1 = enable */ AV1E_SET_FRAME_PARALLEL_DECODING = 37, /*!\brief Codec control function to enable error_resilient_mode, int parameter * * AV1 has a bitstream feature to guarantee parsability of a frame * by turning on the error_resilient_decoding mode, even though the * reference buffers are unreliable or not received. * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_ERROR_RESILIENT_MODE = 38, /*!\brief Codec control function to enable s_frame_mode, int parameter * * AV1 has a bitstream feature to designate certain frames as S-frames, * from where we can switch to a different stream, * even though the reference buffers may not be exactly identical. * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_S_FRAME_MODE = 39, /*!\brief Codec control function to set adaptive quantization mode, unsigned * int parameter * * AV1 has a segment based feature that allows encoder to adaptively change * quantization parameter for each segment within a frame to improve the * subjective quality. This control makes encoder operate in one of the * several AQ modes supported. * * - 0 = disable (default) * - 1 = variance * - 2 = complexity * - 3 = cyclic refresh */ AV1E_SET_AQ_MODE = 40, /*!\brief Codec control function to enable/disable periodic Q boost, unsigned * int parameter * * One AV1 encoder speed feature is to enable quality boost by lowering * frame level Q periodically. This control function provides a means to * turn on/off this feature. * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_FRAME_PERIODIC_BOOST = 41, /*!\brief Codec control function to set noise sensitivity, unsigned int * parameter * * - 0 = disable (default) * - 1 = enable (Y only) */ AV1E_SET_NOISE_SENSITIVITY = 42, /*!\brief Codec control function to set content type, aom_tune_content * parameter * * - AOM_CONTENT_DEFAULT = Regular video content (default) * - AOM_CONTENT_SCREEN = Screen capture content * - AOM_CONTENT_FILM = Film content */ AV1E_SET_TUNE_CONTENT = 43, /*!\brief Codec control function to set CDF update mode, unsigned int * parameter * * - 0: no update * - 1: update on every frame (default) * - 2: selectively update */ AV1E_SET_CDF_UPDATE_MODE = 44, /*!\brief Codec control function to set color space info, int parameter * * - 0 = For future use * - 1 = BT.709 * - 2 = Unspecified (default) * - 3 = For future use * - 4 = BT.470 System M (historical) * - 5 = BT.470 System B, G (historical) * - 6 = BT.601 * - 7 = SMPTE 240 * - 8 = Generic film (color filters using illuminant C) * - 9 = BT.2020, BT.2100 * - 10 = SMPTE 428 (CIE 1921 XYZ) * - 11 = SMPTE RP 431-2 * - 12 = SMPTE EG 432-1 * - 13..21 = For future use * - 22 = EBU Tech. 
3213-E * - 23 = For future use */ AV1E_SET_COLOR_PRIMARIES = 45, /*!\brief Codec control function to set transfer function info, int parameter * * - 0 = For future use * - 1 = BT.709 * - 2 = Unspecified (default) * - 3 = For future use * - 4 = BT.470 System M (historical) * - 5 = BT.470 System B, G (historical) * - 6 = BT.601 * - 7 = SMPTE 240 M * - 8 = Linear * - 9 = Logarithmic (100 : 1 range) * - 10 = Logarithmic (100 * Sqrt(10) : 1 range) * - 11 = IEC 61966-2-4 * - 12 = BT.1361 * - 13 = sRGB or sYCC * - 14 = BT.2020 10-bit systems * - 15 = BT.2020 12-bit systems * - 16 = SMPTE ST 2084, ITU BT.2100 PQ * - 17 = SMPTE ST 428 * - 18 = BT.2100 HLG, ARIB STD-B67 * - 19 = For future use */ AV1E_SET_TRANSFER_CHARACTERISTICS = 46, /*!\brief Codec control function to set transfer function info, int parameter * * - 0 = Identity matrix * - 1 = BT.709 * - 2 = Unspecified (default) * - 3 = For future use * - 4 = US FCC 73.628 * - 5 = BT.470 System B, G (historical) * - 6 = BT.601 * - 7 = SMPTE 240 M * - 8 = YCgCo * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr * - 10 = BT.2020 constant luminance * - 11 = SMPTE ST 2085 YDzDx * - 12 = Chromaticity-derived non-constant luminance * - 13 = Chromaticity-derived constant luminance * - 14 = BT.2100 ICtCp * - 15 = For future use */ AV1E_SET_MATRIX_COEFFICIENTS = 47, /*!\brief Codec control function to set chroma 4:2:0 sample position info, * aom_chroma_sample_position_t parameter * * AOM_CSP_UNKNOWN is default */ AV1E_SET_CHROMA_SAMPLE_POSITION = 48, /*!\brief Codec control function to set minimum interval between GF/ARF * frames, unsigned int parameter * * By default the value is set as 4. */ AV1E_SET_MIN_GF_INTERVAL = 49, /*!\brief Codec control function to set minimum interval between GF/ARF * frames, unsigned int parameter * * By default the value is set as 16. */ AV1E_SET_MAX_GF_INTERVAL = 50, /*!\brief Codec control function to get an active map back from the encoder, aom_active_map_t* parameter */ AV1E_GET_ACTIVEMAP = 51, /*!\brief Codec control function to set color range bit, int parameter * * - 0 = Limited range, 16..235 or HBD equivalent (default) * - 1 = Full range, 0..255 or HBD equivalent */ AV1E_SET_COLOR_RANGE = 52, /*!\brief Codec control function to set intended rendering image size, * int32_t[2] parameter * * By default, this is identical to the image size in pixels. */ AV1E_SET_RENDER_SIZE = 53, /*!\brief Control to set target sequence level index for a certain operating * point (OP), int parameter * Possible values are in the form of "ABxy". * - AB: OP index. * - xy: Target level index for the OP. Possible values are: * + 0~27: corresponding to level 2.0 ~ 8.3. Note: * > Levels 2.2 (2), 2.3 (3), 3.2 (6), 3.3 (7), 4.2 (10) & 4.3 (11) are * undefined. * > Levels 7.x and 8.x (20~27) are in draft status, available under the * config flag CONFIG_CWG_C013. * + 31: maximum parameters level, no level-based constraints. * + 32: keep level stats only for level monitoring. * * E.g.: * - "0" means target level index 0 (2.0) for the 0th OP; * - "109" means target level index 9 (4.1) for the 1st OP; * - "1019" means target level index 19 (6.3) for the 10th OP. * * If the target level is not specified for an OP, the maximum parameters * level of 31 is used as default. */ AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54, /*!\brief Codec control function to get sequence level index for each * operating point. int* parameter. There can be at most 32 operating points. * The results will be written into a provided integer array of sufficient * size. 
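/*
 * Usage sketch (illustrative only): signalling BT.709 studio-range color with
 * the CICP controls above. The AOM_CICP_* and AOM_CR_* constants come from
 * aom/aom_image.h. Assumes an initialized encoder context.
 */
#include "aom/aom_encoder.h"
#include "aom/aom_image.h"
#include "aom/aomcx.h"

static void signal_bt709_studio_range(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_COLOR_PRIMARIES, AOM_CICP_CP_BT_709);
  aom_codec_control(enc, AV1E_SET_TRANSFER_CHARACTERISTICS, AOM_CICP_TC_BT_709);
  aom_codec_control(enc, AV1E_SET_MATRIX_COEFFICIENTS, AOM_CICP_MC_BT_709);
  aom_codec_control(enc, AV1E_SET_COLOR_RANGE, AOM_CR_STUDIO_RANGE);
}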
*/ AV1E_GET_SEQ_LEVEL_IDX = 55, /*!\brief Codec control function to set intended superblock size, unsigned int * parameter * * By default, the superblock size is determined separately for each * frame by the encoder. */ AV1E_SET_SUPERBLOCK_SIZE = 56, /*!\brief Codec control function to enable automatic set and use of * bwd-pred frames, unsigned int parameter * * - 0 = disable (default) * - 1 = enable */ AOME_SET_ENABLEAUTOBWDREF = 57, /*!\brief Codec control function to encode with CDEF, unsigned int parameter * * CDEF is the constrained directional enhancement filter which is an * in-loop filter aiming to remove coding artifacts * * - 0 = disable * - 1 = enable for all frames (default) * - 2 = disable for non-reference frames * - 3 = enable adaptively based on frame qindex */ AV1E_SET_ENABLE_CDEF = 58, /*!\brief Codec control function to encode with Loop Restoration Filter, * unsigned int parameter * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. */ AV1E_SET_ENABLE_RESTORATION = 59, /*!\brief Codec control function to force video mode, unsigned int parameter * * - 0 = do not force video mode (default) * - 1 = force video mode even for a single frame */ AV1E_SET_FORCE_VIDEO_MODE = 60, /*!\brief Codec control function to predict with OBMC mode, unsigned int * parameter * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. */ AV1E_SET_ENABLE_OBMC = 61, /*!\brief Codec control function to encode without trellis quantization, * unsigned int parameter * * - 0 = apply trellis quantization (default) * - 1 = do not apply trellis quantization * - 2 = disable trellis quantization in rd search * - 3 = disable trellis quantization in estimate yrd */ AV1E_SET_DISABLE_TRELLIS_QUANT = 62, /*!\brief Codec control function to encode with quantisation matrices, * unsigned int parameter * * AOM can operate with default quantisation matrices dependent on * quantisation level and block type. * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_ENABLE_QM = 63, /*!\brief Codec control function to set the min quant matrix flatness, * unsigned int parameter * * AOM can operate with different ranges of quantisation matrices. * As quantisation levels increase, the matrices get flatter. This * control sets the minimum level of flatness from which the matrices * are determined. * * By default, the encoder sets this minimum at level 5 (4 in all intra * mode). */ AV1E_SET_QM_MIN = 64, /*!\brief Codec control function to set the max quant matrix flatness, * unsigned int parameter * * AOM can operate with different ranges of quantisation matrices. * As quantisation levels increase, the matrices get flatter. This * control sets the maximum level of flatness possible. * * By default, the encoder sets this maximum at level 9 (10 in all intra * mode) */ AV1E_SET_QM_MAX = 65, /*!\brief Codec control function to set the min quant matrix flatness, * unsigned int parameter * * AOM can operate with different ranges of quantisation matrices. * As quantisation levels increase, the matrices get flatter. This * control sets the flatness for luma (Y). * * By default, the encoder sets this minimum at half the available * range. */ AV1E_SET_QM_Y = 66, /*!\brief Codec control function to set the min quant matrix flatness, * unsigned int parameter * * AOM can operate with different ranges of quantisation matrices. * As quantisation levels increase, the matrices get flatter. This * control sets the flatness for chroma (U). 
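/*
 * Usage sketch (illustrative only): enabling quantisation matrices and
 * narrowing the allowed flatness range with the controls above. The values 2
 * and 10 are example settings (the same ones listed later for AOM_TUNE_IQ),
 * not defaults. Assumes an initialized encoder context.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void enable_quant_matrices(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_ENABLE_QM, 1);
  aom_codec_control(enc, AV1E_SET_QM_MIN, 2);
  aom_codec_control(enc, AV1E_SET_QM_MAX, 10);
}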
* * By default, the encoder sets this minimum at half the available * range. */ AV1E_SET_QM_U = 67, /*!\brief Codec control function to set the min quant matrix flatness, * unsigned int parameter * * AOM can operate with different ranges of quantisation matrices. * As quantisation levels increase, the matrices get flatter. This * control sets the flatness for chrome (V). * * By default, the encoder sets this minimum at half the available * range. */ AV1E_SET_QM_V = 68, /* NOTE: enum 69 unused */ /*!\brief Codec control function to set a maximum number of tile groups, * unsigned int parameter * * This will set the maximum number of tile groups. This will be * overridden if an MTU size is set. The default value is 1. */ AV1E_SET_NUM_TG = 70, /*!\brief Codec control function to set an MTU size for a tile group, unsigned * int parameter * * This will set the maximum number of bytes in a tile group. This can be * exceeded only if a single tile is larger than this amount. * * By default, the value is 0, in which case a fixed number of tile groups * is used. */ AV1E_SET_MTU = 71, /* NOTE: enum 72 unused */ /*!\brief Codec control function to enable/disable rectangular partitions, int * parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_RECT_PARTITIONS = 73, /*!\brief Codec control function to enable/disable AB partitions, int * parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_AB_PARTITIONS = 74, /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int * parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_1TO4_PARTITIONS = 75, /*!\brief Codec control function to set min partition size, int parameter * * min_partition_size is applied to both width and height of the partition. * i.e, both width and height of a partition can not be smaller than * the min_partition_size, except the partition at the picture boundary. * * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for * 4x4. */ AV1E_SET_MIN_PARTITION_SIZE = 76, /*!\brief Codec control function to set max partition size, int parameter * * max_partition_size is applied to both width and height of the partition. * i.e, both width and height of a partition can not be larger than * the max_partition_size. * * Valid values:[4, 8, 16, 32, 64, 128] The default value is 128 for * 128x128. */ AV1E_SET_MAX_PARTITION_SIZE = 77, /*!\brief Codec control function to turn on / off intra edge filter * at sequence level, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78, /*!\brief Codec control function to turn on / off frame order hint (int * parameter). Affects: joint compound mode, motion field motion vector, * ref frame sign bias * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_ORDER_HINT = 79, /*!\brief Codec control function to turn on / off 64-length transforms, int * parameter * * This will enable or disable usage of length 64 transforms in any * direction. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_TX64 = 80, /*!\brief Codec control function to turn on / off flip and identity * transforms, int parameter * * This will enable or disable usage of flip and identity transform * types in any direction. 
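/*
 * Usage sketch (illustrative only): constraining the partition search to block
 * sizes between 8x8 and 64x64 with the min/max partition controls above. The
 * bounds are example values within the documented [4, 128] range. Assumes an
 * initialized encoder context.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void limit_partition_sizes(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_MIN_PARTITION_SIZE, 8);
  aom_codec_control(enc, AV1E_SET_MAX_PARTITION_SIZE, 64);
}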
If enabled, this includes: * - FLIPADST_DCT * - DCT_FLIPADST * - FLIPADST_FLIPADST * - ADST_FLIPADST * - FLIPADST_ADST * - IDTX * - V_DCT * - H_DCT * - V_ADST * - H_ADST * - V_FLIPADST * - H_FLIPADST * * Valid values: * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_FLIP_IDTX = 81, /*!\brief Codec control function to turn on / off rectangular transforms, int * parameter * * This will enable or disable usage of rectangular transforms. NOTE: * Rectangular transforms only enabled when corresponding rectangular * partitions are. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_RECT_TX = 82, /*!\brief Codec control function to turn on / off dist-wtd compound mode * at sequence level, int parameter * * This will enable or disable distance-weighted compound mode. * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced * to 0. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_DIST_WTD_COMP = 83, /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage * at sequence level, int parameter * * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced * to 0. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_REF_FRAME_MVS = 84, /*!\brief Codec control function to set temporal mv prediction * enabling/disabling at frame level, int parameter * * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is * forced to 0. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ALLOW_REF_FRAME_MVS = 85, /*!\brief Codec control function to turn on / off dual interpolation filter * for a sequence, int parameter * * - 0 = disable * - 1 = enable */ AV1E_SET_ENABLE_DUAL_FILTER = 86, /*!\brief Codec control function to turn on / off delta quantization in chroma * planes for a sequence, int parameter * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_ENABLE_CHROMA_DELTAQ = 87, /*!\brief Codec control function to turn on / off masked compound usage * (wedge and diff-wtd compound modes) for a sequence, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_MASKED_COMP = 88, /*!\brief Codec control function to turn on / off one sided compound usage * for a sequence, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_ONESIDED_COMP = 89, /*!\brief Codec control function to turn on / off interintra compound * for a sequence, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_INTERINTRA_COMP = 90, /*!\brief Codec control function to turn on / off smooth inter-intra * mode for a sequence, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91, /*!\brief Codec control function to turn on / off difference weighted * compound, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_DIFF_WTD_COMP = 92, /*!\brief Codec control function to turn on / off interinter wedge * compound, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_INTERINTER_WEDGE = 93, /*!\brief Codec control function to turn on / off interintra wedge * compound, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94, /*!\brief Codec control function to turn on / off global motion usage * for a sequence, int parameter * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. 
*/ AV1E_SET_ENABLE_GLOBAL_MOTION = 95, /*!\brief Codec control function to turn on / off warped motion usage * at sequence level, int parameter * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. */ AV1E_SET_ENABLE_WARPED_MOTION = 96, /*!\brief Codec control function to turn on / off warped motion usage * at frame level, int parameter * * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is * forced to 0. * * - 0 = disable * - 1 = enable (default) * * \note Excluded from CONFIG_REALTIME_ONLY build. */ AV1E_SET_ALLOW_WARPED_MOTION = 97, /*!\brief Codec control function to turn on / off filter intra usage at * sequence level, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_FILTER_INTRA = 98, /*!\brief Codec control function to turn on / off smooth intra modes usage, * int parameter * * This will enable or disable usage of smooth, smooth_h and smooth_v intra * modes. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_SMOOTH_INTRA = 99, /*!\brief Codec control function to turn on / off Paeth intra mode usage, int * parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_PAETH_INTRA = 100, /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int * parameter * * This will enable or disable usage of chroma-from-luma intra mode. * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_CFL_INTRA = 101, /*!\brief Codec control function to turn on / off frame superresolution, int * parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_SUPERRES = 102, /*!\brief Codec control function to turn on / off overlay frames for * filtered ALTREF frames, int parameter * * This will enable or disable coding of overlay frames for filtered ALTREF * frames. When set to 0, overlay frames are not used but show existing frame * is used to display the filtered ALTREF frame as is. As a result the decoded * frame rate remains the same as the display frame rate. The default is 1. */ AV1E_SET_ENABLE_OVERLAY = 103, /*!\brief Codec control function to turn on/off palette mode, int parameter */ AV1E_SET_ENABLE_PALETTE = 104, /*!\brief Codec control function to turn on/off intra block copy mode, int parameter */ AV1E_SET_ENABLE_INTRABC = 105, /*!\brief Codec control function to turn on/off intra angle delta, int parameter */ AV1E_SET_ENABLE_ANGLE_DELTA = 106, /*!\brief Codec control function to set the delta q mode, unsigned int * parameter * * AV1 supports a delta q mode feature, that allows modulating q per * superblock. * * - 0 = deltaq signaling off * - 1 = use modulation to maximize objective quality (default) * - 2 = use modulation for local test * - 3 = use modulation for key frame perceptual quality optimization * - 4 = use modulation for user rating based perceptual quality optimization * - 5 = use modulation for HDR video * - 6 = use modulation for all intra using Variance Boost */ AV1E_SET_DELTAQ_MODE = 107, /*!\brief Codec control function to turn on/off loopfilter modulation * when delta q modulation is enabled, unsigned int parameter. * * \attention AV1 only supports loopfilter modulation when delta q * modulation is enabled as well. */ AV1E_SET_DELTALF_MODE = 108, /*!\brief Codec control function to set the single tile decoding mode, * unsigned int parameter * * \attention Only applicable if large scale tiling is on. 
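/*
 * Usage sketch (illustrative only): a screen-capture oriented configuration
 * combining the palette and intra block copy switches above with the screen
 * content tuning documented earlier (AV1E_SET_TUNE_CONTENT). Assumes an
 * initialized encoder context.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void tune_for_screen_content(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
  aom_codec_control(enc, AV1E_SET_ENABLE_PALETTE, 1);
  aom_codec_control(enc, AV1E_SET_ENABLE_INTRABC, 1);
}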
* * - 0 = single tile decoding is off * - 1 = single tile decoding is on (default) */ AV1E_SET_SINGLE_TILE_DECODING = 109, /*!\brief Codec control function to enable the extreme motion vector unit * test, unsigned int parameter * * - 0 = off * - 1 = MAX_EXTREME_MV * - 2 = MIN_EXTREME_MV * * \note This is only used in motion vector unit test. */ AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110, /*!\brief Codec control function to signal picture timing info in the * bitstream, aom_timing_info_type_t parameter. Default is * AOM_TIMING_UNSPECIFIED. */ AV1E_SET_TIMING_INFO_TYPE = 111, /*!\brief Codec control function to add film grain parameters (one of several * preset types) info in the bitstream, int parameter * Valid range: 0..16, 0 is unknown, 1..16 are test vectors */ AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112, /*!\brief Codec control function to set the path to the film grain parameters, * const char* parameter */ AV1E_SET_FILM_GRAIN_TABLE = 113, /*!\brief Sets the noise level, int parameter */ AV1E_SET_DENOISE_NOISE_LEVEL = 114, /*!\brief Sets the denoisers block size, unsigned int parameter */ AV1E_SET_DENOISE_BLOCK_SIZE = 115, /*!\brief Sets the chroma subsampling x value, unsigned int parameter */ AV1E_SET_CHROMA_SUBSAMPLING_X = 116, /*!\brief Sets the chroma subsampling y value, unsigned int parameter */ AV1E_SET_CHROMA_SUBSAMPLING_Y = 117, /*!\brief Control to use a reduced tx type set, int parameter */ AV1E_SET_REDUCED_TX_TYPE_SET = 118, /*!\brief Control to use dct only for intra modes, int parameter */ AV1E_SET_INTRA_DCT_ONLY = 119, /*!\brief Control to use dct only for inter modes, int parameter */ AV1E_SET_INTER_DCT_ONLY = 120, /*!\brief Control to use default tx type only for intra modes, int parameter */ AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121, /*!\brief Control to use adaptive quantize_b, int parameter */ AV1E_SET_QUANT_B_ADAPT = 122, /*!\brief Control to select maximum height for the GF group pyramid structure, * unsigned int parameter * * Valid range: 0..5 */ AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123, /*!\brief Control to select maximum reference frames allowed per frame, int * parameter * * Valid range: 3..7 */ AV1E_SET_MAX_REFERENCE_FRAMES = 124, /*!\brief Control to use reduced set of single and compound references, int parameter */ AV1E_SET_REDUCED_REFERENCE_SET = 125, /*!\brief Control to set frequency of the cost updates for coefficients, * unsigned int parameter * * - 0 = update at SB level (default) * - 1 = update at SB row level in tile * - 2 = update at tile level * - 3 = turn off */ AV1E_SET_COEFF_COST_UPD_FREQ = 126, /*!\brief Control to set frequency of the cost updates for mode, unsigned int * parameter * * - 0 = update at SB level (default) * - 1 = update at SB row level in tile * - 2 = update at tile level * - 3 = turn off */ AV1E_SET_MODE_COST_UPD_FREQ = 127, /*!\brief Control to set frequency of the cost updates for motion vectors, * unsigned int parameter * * - 0 = update at SB level (default) * - 1 = update at SB row level in tile * - 2 = update at tile level * - 3 = turn off */ AV1E_SET_MV_COST_UPD_FREQ = 128, /*!\brief Control to set bit mask that specifies which tier each of the 32 * possible operating points conforms to, unsigned int parameter * * - 0 = main tier (default) * - 1 = high tier */ AV1E_SET_TIER_MASK = 129, /*!\brief Control to set minimum compression ratio, unsigned int parameter * Take integer values. If non-zero, encoder will try to keep the compression * ratio of each frame to be higher than the given value divided by 100. * E.g. 
850 means minimum compression ratio of 8.5. */ AV1E_SET_MIN_CR = 130, /* NOTE: enums 145-149 unused */ /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* * parameter */ AV1E_SET_SVC_LAYER_ID = 131, /*!\brief Codec control function to set SVC parameters, aom_svc_params_t* * parameter */ AV1E_SET_SVC_PARAMS = 132, /*!\brief Codec control function to set the reference frame config, * aom_svc_ref_frame_config_t* parameter */ AV1E_SET_SVC_REF_FRAME_CONFIG = 133, /*!\brief Codec control function to set the path to the VMAF model used when * tuning the encoder for VMAF, const char* parameter */ AV1E_SET_VMAF_MODEL_PATH = 134, /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, * unsigned int parameter * * - 0 = disable (default) * - 1 = enable * * \note This is only used in lightfield example test. */ AV1E_ENABLE_EXT_TILE_DEBUG = 135, /*!\brief Codec control function to enable the superblock multipass unit test * in AV1 to ensure that the encoder does not leak state between different * passes. unsigned int parameter. * * - 0 = disable (default) * - 1 = enable * * \note This is only used in sb_multipass unit test. */ AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136, /*!\brief Control to select minimum height for the GF group pyramid structure, * unsigned int parameter * * Valid values: 0..5 */ AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137, /*!\brief Control to set average complexity of the corpus in the case of * single pass vbr based on LAP, unsigned int parameter */ AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138, /*!\brief Control to get baseline gf interval */ AV1E_GET_BASELINE_GF_INTERVAL = 139, /*\brief Control to set encoding the denoised frame from denoise-noise-level * * - 0 = disabled/encode the original frame * - 1 = enabled/encode the denoised frame (default) */ AV1E_SET_ENABLE_DNL_DENOISING = 140, /*!\brief Codec control function to turn on / off D45 to D203 intra mode * usage, int parameter * * This will enable or disable usage of D45 to D203 intra modes, which are a * subset of directional modes. This control has no effect if directional * modes are disabled (AV1E_SET_ENABLE_DIRECTIONAL_INTRA set to 0). * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_DIAGONAL_INTRA = 141, /*!\brief Control to set frequency of the cost updates for intrabc motion * vectors, unsigned int parameter * * - 0 = update at SB level (default) * - 1 = update at SB row level in tile * - 2 = update at tile level * - 3 = turn off */ AV1E_SET_DV_COST_UPD_FREQ = 142, /*!\brief Codec control to set the path for partition stats read and write. * const char * parameter. */ AV1E_SET_PARTITION_INFO_PATH = 143, /*!\brief Codec control to use an external partition model * A set of callback functions is passed through this control * to let the encoder encode with given partitions. */ AV1E_SET_EXTERNAL_PARTITION = 144, /*!\brief Codec control function to turn on / off directional intra mode * usage, int parameter * * - 0 = disable * - 1 = enable (default) */ AV1E_SET_ENABLE_DIRECTIONAL_INTRA = 145, /*!\brief Control to turn on / off transform size search. * Note: it can not work with non RD pick mode in real-time encoding, * where the max transform size is only 16x16. * It will be ignored if non RD pick mode is set. * * - 0 = disable, transforms always have the largest possible size * - 1 = enable, search for the best transform size for each block (default) */ AV1E_SET_ENABLE_TX_SIZE_SEARCH = 146, /*!\brief Codec control function to set reference frame compound prediction. 
* aom_svc_ref_frame_comp_pred_t* parameter */ AV1E_SET_SVC_REF_FRAME_COMP_PRED = 147, /*!\brief Set --deltaq-mode strength. * * Valid range: [0, 1000] */ AV1E_SET_DELTAQ_STRENGTH = 148, /*!\brief Codec control to control loop filter * * - 0 = Loop filter is disabled for all frames * - 1 = Loop filter is enabled for all frames * - 2 = Loop filter is disabled for non-reference frames * - 3 = Loop filter is disabled for the frames with low motion */ AV1E_SET_LOOPFILTER_CONTROL = 149, /*!\brief Codec control function to get the loopfilter chosen by the encoder, * int* parameter */ AOME_GET_LOOPFILTER_LEVEL = 150, /*!\brief Codec control to automatically turn off several intra coding tools, * unsigned int parameter * - 0 = do not use the feature * - 1 = enable the automatic decision to turn off several intra tools */ AV1E_SET_AUTO_INTRA_TOOLS_OFF = 151, /*!\brief Codec control function to set flag for rate control used by external * encoders. * - 1 = Enable rate control for external encoders. This will disable content * dependency in rate control and cyclic refresh. * - 0 = Default. Disable rate control for external encoders. */ AV1E_SET_RTC_EXTERNAL_RC = 152, /*!\brief Codec control function to enable frame parallel multi-threading * of the encoder, unsigned int parameter * * - 0 = disable (default) * - 1 = enable */ AV1E_SET_FP_MT = 153, /*!\brief Codec control to enable actual frame parallel encode or * simulation of frame parallel encode in FPMT unit test, unsigned int * parameter * * - 0 = simulate frame parallel encode * - 1 = actual frame parallel encode (default) * * \note This is only used in FPMT unit test. */ AV1E_SET_FP_MT_UNIT_TEST = 154, /*!\brief Codec control function to get the target sequence level index for * each operating point. int* parameter. There can be at most 32 operating * points. The results will be written into a provided integer array of * sufficient size. If a target level is not set, the result will be 31. * Please refer to https://aomediacodec.github.io/av1-spec/#levels for more * details on level definitions and indices. */ AV1E_GET_TARGET_SEQ_LEVEL_IDX = 155, /*!\brief Codec control function to get the number of operating points. int* * parameter. */ AV1E_GET_NUM_OPERATING_POINTS = 156, /*!\brief Codec control function to skip the application of post-processing * filters on reconstructed frame, unsigned int parameter * * - 0 = disable (default) * - 1 = enable * * \attention For this value to be used aom_codec_enc_cfg_t::g_usage * must be set to AOM_USAGE_ALL_INTRA. */ AV1E_SET_SKIP_POSTPROC_FILTERING = 157, /*!\brief Codec control function to enable the superblock level * qp sweep in AV1 to ensure that end-to-end test runs well, * unsigned int parameter. * * - 0 = disable (default) * - 1 = enable * * \note This is only used in sb_qp_sweep unit test. */ AV1E_ENABLE_SB_QP_SWEEP = 158, /*!\brief Codec control to set quantizer for the next frame, int parameter. * * - Valid range [0, 63] * * This will turn off cyclic refresh. Only applicable to 1-pass. */ AV1E_SET_QUANTIZER_ONE_PASS = 159, /*!\brief Codec control to enable the rate distribution guided delta * quantization in all intra mode, unsigned int parameter * * - 0 = disable (default) * - 1 = enable * * \attention This feature requires --deltaq-mode=3, also an input file * which contains rate distribution for each 16x16 block, * passed in by --rate-distribution-info=rate_distribution.txt. 
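/*
 * Usage sketch (illustrative only) for the loop filter controls above:
 * restrict filtering to reference frames, then read back the level the
 * encoder chose. Assumes an initialized encoder context that has already
 * encoded at least one frame.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static int query_loopfilter_level(aom_codec_ctx_t *enc) {
  aom_codec_control(enc, AV1E_SET_LOOPFILTER_CONTROL, 2); /* skip non-ref */
  int level = 0;
  if (aom_codec_control(enc, AOME_GET_LOOPFILTER_LEVEL, &level) != AOM_CODEC_OK)
    return -1;
  return level;
}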
*/ AV1E_ENABLE_RATE_GUIDE_DELTAQ = 160, /*!\brief Codec control to set the input file for rate distribution used * in all intra mode, const char * parameter * The input should be the name of a text file, which * contains (rows x cols) float values separated by space. * Each float value represent the number of bits for each 16x16 block. * rows = (frame_height + 15) / 16 * cols = (frame_width + 15) / 16 * * \attention This feature requires --enable-rate-guide-deltaq=1. */ AV1E_SET_RATE_DISTRIBUTION_INFO = 161, /*!\brief Codec control to get the CDEF strength for Y / luma plane, * int * parameter. * Returns an integer array of CDEF_MAX_STRENGTHS elements. */ AV1E_GET_LUMA_CDEF_STRENGTH = 162, /*!\brief Codec control to set the target bitrate in kilobits per second, * unsigned int parameter. For 1 pass CBR mode, single layer encoding. * This controls replaces the call aom_codec_enc_config_set(&codec, &cfg) * when only target bitrate is changed, and so is much cheaper as it * bypasses a lot of unneeded code checks. */ AV1E_SET_BITRATE_ONE_PASS_CBR = 163, /*!\brief Codec control to set the maximum number of consecutive frame drops, * in units of frames, allowed for the frame dropper in 1 pass * CBR mode, int parameter. Value of zero has no effect. * \deprecated Use the new control AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR. */ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, /*!\brief Codec control to set the frame drop mode for SVC, * unsigned int parameter. The valid values are constants of the * AOM_SVC_FRAME_DROP_MODE enum: AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP. */ AV1E_SET_SVC_FRAME_DROP_MODE = 165, /*!\brief Codec control to set auto tiling, unsigned int parameter. * Value of 1 means encoder will set number of tile_columns and tile_rows, * based on the number of threads and resolution. This will override any * settings set via SET_TILE_COLUMNS/ROWS. If the value is 0 no change is * done, the previous setting (if any) for tile_columns/rows is preserved. */ AV1E_SET_AUTO_TILES = 166, /*!\brief Codec control to get the high motion content flag, used for * screen content realtime (RTC) encoding, int * parameter. * Returns an integer. * 1 means high motion content flag is set to 1, 0 means set to 0. */ AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC = 167, /*!\brief Codec control to enable post encode frame drop for RTC encoding, * int parameter. * * Value of 1 means encoder will enable post encode drop. Default is 0 (not * enabled). Post encode drop is only allowed when frame dropping is enabled * (aom_codec_enc_cfg::rc_dropframe_thresh > 0). */ AV1E_SET_POSTENCODE_DROP_RTC = 168, /*!\brief Codec control to set the maximum number of consecutive frame drops, * in units of time (milliseconds), allowed for the frame dropper in 1 pass * CBR mode, int parameter. Value of zero has no effect. */ AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR = 169, // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. }; /*!\brief aom 1-D scaling mode * * This set of constants define 1-D aom scaling modes */ typedef enum aom_scaling_mode_1d { AOME_NORMAL = 0, AOME_FOURFIVE = 1, AOME_THREEFIVE = 2, AOME_THREEFOUR = 3, AOME_ONEFOUR = 4, AOME_ONEEIGHT = 5, AOME_ONETWO = 6, AOME_TWOTHREE = 7, AOME_ONETHREE = 8 } AOM_SCALING_MODE; /*!\brief Max number of segments * * This is the limit of number of segments allowed within a frame. * * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports. 
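/*
 * Usage sketch (illustrative only): updating the target bitrate mid-stream for
 * 1-pass CBR single-layer encoding with AV1E_SET_BITRATE_ONE_PASS_CBR instead
 * of a full aom_codec_enc_config_set() round trip. Assumes an initialized
 * realtime CBR encoder context.
 */
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static void update_target_bitrate_kbps(aom_codec_ctx_t *enc,
                                       unsigned int kbps) {
  aom_codec_control(enc, AV1E_SET_BITRATE_ONE_PASS_CBR, kbps);
}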
* */ #define AOM_MAX_SEGMENTS 8 /*!\brief aom region of interest map * * These defines the data structures for the region of interest map * * TODO(yaowu): create a unit test for ROI map related APIs * */ typedef struct aom_roi_map { /*! An id between 0 and 7 for each 8x8 region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */ int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */ /*! Static breakout threshold for each segment. */ unsigned int static_threshold[AOM_MAX_SEGMENTS]; } aom_roi_map_t; /*!\brief aom active region map * * These defines the data structures for active region map * */ typedef struct aom_active_map { /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */ unsigned char *active_map; unsigned int rows; /**< number of rows */ unsigned int cols; /**< number of cols */ } aom_active_map_t; /*!\brief aom image scaling mode * * This defines the data structure for image scaling mode * */ typedef struct aom_scaling_mode { AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ } aom_scaling_mode_t; /*!brief AV1 encoder content type */ typedef enum { AOM_CONTENT_DEFAULT, AOM_CONTENT_SCREEN, AOM_CONTENT_FILM, AOM_CONTENT_INVALID } aom_tune_content; /*!brief AV1 encoder timing info type signaling */ typedef enum { AOM_TIMING_UNSPECIFIED, AOM_TIMING_EQUAL, AOM_TIMING_DEC_MODEL } aom_timing_info_type_t; /*!\brief Model tuning parameters * * Changes the encoder to tune for certain types of input material. * * \note * AOM_TUNE_IQ is restricted to all intra mode (AOM_USAGE_ALL_INTRA). Setting * the tuning option to AOM_TUNE_IQ causes the following options to be set * (expressed as command-line options): * * --enable-qm=1 * * --qm-min=2 * * --qm-max=10 * * --sharpness=7 * * --dist-metric=qm-psnr * * --enable-cdef=3 * * --enable-chroma-deltaq=1 * * --deltaq-mode=6 */ typedef enum { AOM_TUNE_PSNR = 0, AOM_TUNE_SSIM = 1, /* NOTE: enums 2 and 3 unused */ AOM_TUNE_VMAF_WITH_PREPROCESSING = 4, AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5, AOM_TUNE_VMAF_MAX_GAIN = 6, AOM_TUNE_VMAF_NEG_MAX_GAIN = 7, AOM_TUNE_BUTTERAUGLI = 8, AOM_TUNE_VMAF_SALIENCY_MAP = 9, /*!\brief Allows detection of the presence of AOM_TUNE_IQ at compile time. */ #define AOM_HAVE_TUNE_IQ 1 /* Image quality (or intra quality). Increases image quality and consistency, * guided by the SSIMULACRA 2 metric and subjective quality checks. Shares * the rdmult code with AOM_TUNE_SSIM. */ AOM_TUNE_IQ = 10, } aom_tune_metric; /*!\brief Distortion metric to use for RD optimization. * * Changes the encoder to use a different distortion metric for RD search. Note * that this value operates on a "lower level" compared to aom_tune_metric - it * affects the distortion metric inside a block, while aom_tune_metric only * affects RD across blocks. * */ typedef enum { // Use PSNR for in-block rate-distortion optimization. AOM_DIST_METRIC_PSNR, // Use quantization matrix-weighted PSNR for in-block rate-distortion // optimization. If --enable-qm=1 is not specified, this falls back to // behaving in the same way as AOM_DIST_METRIC_PSNR. 
AOM_DIST_METRIC_QM_PSNR, } aom_dist_metric; #define AOM_MAX_LAYERS 32 /**< Max number of layers */ #define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */ #define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */ /*!brief Struct for spatial and temporal layer ID */ typedef struct aom_svc_layer_id { int spatial_layer_id; /**< Spatial layer ID */ int temporal_layer_id; /**< Temporal layer ID */ } aom_svc_layer_id_t; /*!brief Parameter type for SVC * * In the arrays of size AOM_MAX_LAYERS, the index for spatial layer `sl` and * temporal layer `tl` is sl * number_temporal_layers + tl. * */ typedef struct aom_svc_params { int number_spatial_layers; /**< Number of spatial layers */ int number_temporal_layers; /**< Number of temporal layers */ int max_quantizers[AOM_MAX_LAYERS]; /**< Max Q for each layer */ int min_quantizers[AOM_MAX_LAYERS]; /**< Min Q for each layer */ int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */ int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */ /*! Target bitrate for each layer, in kilobits per second */ int layer_target_bitrate[AOM_MAX_LAYERS]; /*! Frame rate factor for each temporal layer */ int framerate_factor[AOM_MAX_TS_LAYERS]; } aom_svc_params_t; /*!brief Parameters for setting ref frame config */ typedef struct aom_svc_ref_frame_config { // Three arrays need to be set: reference[], ref_id[], refresh[]. // reference[i]: is a boolean flag to indicate which of the 7 possible // references are used for prediction. Values are 0 (not used as reference) // or 1 (use as reference). The index 0 - 6 refers to the references: // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6). // ref_idx[i]: maps a reference to one of the 8 buffers slots, values are // 0 - 7. The ref_idx for a unused reference (reference[i] = 1, and not used // for refresh, see below) can be set to the ref_idx of the first reference // used (usually LAST). // refresh[i] is a boolean flag to indicate if a buffer is updated/refreshed // with the current encoded frame. Values are 0 (no refresh) or 1 (refresh). // The refresh is done internally by looking at the ref_idx[j], for j = 0 - 6, // so to refresh a buffer slot (i) a reference must be mapped to that slot // (i = ref_idx[j]). // Examples for usage (for RTC encoding) are in: examples/svc_encoder_rtc.c. int reference[7]; /**< Reference flag for each of the 7 references. */ /*! Buffer slot index (0..7) for each of 7 references indexed above. */ int ref_idx[7]; int refresh[8]; /**< Refresh flag for each of the 8 buffer slots. */ } aom_svc_ref_frame_config_t; /*!brief Parameters for setting ref frame compound prediction */ typedef struct aom_svc_ref_frame_comp_pred { // Use compound prediction for the ref_frame pairs GOLDEN_LAST (0), // LAST2_LAST (1), and ALTREF_LAST (2). int use_comp_pred[3]; /**= 256. * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) 
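 *
 * A brief usage sketch: "get" style controls in this enum take a pointer
 * out-parameter, and success is reported through the returned
 * aom_codec_err_t. Assuming an initialized decoder context named `decoder`
 * (the name is illustrative only):
 *
 *     int corrupted = 0;
 *     if (aom_codec_control(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted) ==
 *             AOM_CODEC_OK &&
 *         corrupted) {
 *       // The last decoded frame was flagged as corrupt.
 *     }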
*/ enum aom_dec_control_id { /*!\brief Codec control function to get info on which reference frames were * updated by the last decode, int* parameter */ AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START, /*!\brief Codec control function to check if the indicated frame is corrupted, int* parameter */ AOMD_GET_FRAME_CORRUPTED, /*!\brief Codec control function to get info on which reference frames were * used by the last decode, int* parameter */ AOMD_GET_LAST_REF_USED, /*!\brief Codec control function to get the dimensions that the current * frame is decoded at, int* parameter * * This may be different to the intended display size for the frame as * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */ AV1D_GET_FRAME_SIZE, /*!\brief Codec control function to get the current frame's intended display * dimensions (as specified in the wrapper or frame header), int* parameter * * This may be different to the decoded dimensions of this frame (see * AV1D_GET_FRAME_SIZE). */ AV1D_GET_DISPLAY_SIZE, /*!\brief Codec control function to get the bit depth of the stream, * unsigned int* parameter */ AV1D_GET_BIT_DEPTH, /*!\brief Codec control function to get the image format of the stream, * aom_img_fmt_t* parameter */ AV1D_GET_IMG_FORMAT, /*!\brief Codec control function to get the width and height (in pixels) of * the tiles in a tile list, unsigned int* parameter * * Tile width is in the high 16 bits of the output value, and tile height is * in the low 16 bits of the output value. */ AV1D_GET_TILE_SIZE, /*!\brief Codec control function to get the tile count in a tile list, * unsigned int* parameter */ AV1D_GET_TILE_COUNT, /*!\brief Codec control function to set the byte alignment of the planes in * the reference buffers, int parameter * * Valid values are power of 2, from 32 to 1024. A value of 0 sets * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly * follows Y plane, and V plane directly follows U plane. Default value is 0. */ AV1_SET_BYTE_ALIGNMENT, /*!\brief Codec control function to invert the decoding order to from right to * left, int parameter * * The function is used in a test to confirm the decoding independence of tile * columns. The function may be used in application where this order * of decoding is desired. int parameter * * TODO(yaowu): Rework the unit test that uses this control, and in a future * release, this test-only control shall be removed. */ AV1_INVERT_TILE_DECODE_ORDER, /*!\brief Codec control function to set the skip loop filter flag, int * parameter * * Valid values are integers. The decoder will skip the loop filter * when its value is set to nonzero. If the loop filter is skipped the * decoder may accumulate decode artifacts. The default value is 0. */ AV1_SET_SKIP_LOOP_FILTER, /*!\brief Codec control function to retrieve a pointer to the Accounting * struct, takes Accounting** as parameter * * If called before a frame has been decoded, this returns AOM_CODEC_ERROR. * The caller should ensure that AOM_CODEC_OK is returned before attempting * to dereference the Accounting pointer. * * \attention When configured with -DCONFIG_ACCOUNTING=0, the default, this * returns AOM_CODEC_INCAPABLE. */ AV1_GET_ACCOUNTING, /*!\brief Codec control function to get last decoded frame quantizer, * int* parameter * * Returned value uses internal quantizer scale defined by the codec. 
*/ AOMD_GET_LAST_QUANTIZER, /*!\brief Codec control function to set the range of tile decoding, int * parameter * * A value that is greater and equal to zero indicates only the specific * row/column is decoded. A value that is -1 indicates the whole row/column * is decoded. A special case is both values are -1 that means the whole * frame is decoded. */ AV1_SET_DECODE_TILE_ROW, AV1_SET_DECODE_TILE_COL, /*!\brief Codec control function to set the tile coding mode, unsigned int * parameter * * - 0 = tiles are coded in normal tile mode * - 1 = tiles are coded in large-scale tile mode */ AV1_SET_TILE_MODE, /*!\brief Codec control function to get the frame header information of an * encoded frame, aom_tile_data* parameter */ AV1D_GET_FRAME_HEADER_INFO, /*!\brief Codec control function to get the start address and size of a * tile in the coded bitstream, aom_tile_data* parameter. */ AV1D_GET_TILE_DATA, /*!\brief Codec control function to set the external references' pointers in * the decoder, av1_ext_ref_frame_t* parameter. * * This is used while decoding the tile list OBU in large-scale tile coding * mode. */ AV1D_SET_EXT_REF_PTR, /*!\brief Codec control function to enable the ext-tile software debug and * testing code in the decoder, unsigned int parameter */ AV1D_EXT_TILE_DEBUG, /*!\brief Codec control function to enable the row based multi-threading of * decoding, unsigned int parameter * * - 0 = disabled * - 1 = enabled (default) */ AV1D_SET_ROW_MT, /*!\brief Codec control function to indicate whether bitstream is in * Annex-B format, unsigned int parameter */ AV1D_SET_IS_ANNEXB, /*!\brief Codec control function to indicate which operating point to use, * int parameter * * A scalable stream may define multiple operating points, each of which * defines a set of temporal and spatial layers to be processed. The * operating point index may take a value between 0 and * operating_points_cnt_minus_1 (which is at most 31). */ AV1D_SET_OPERATING_POINT, /*!\brief Codec control function to indicate whether to output one frame per * temporal unit (the default), or one frame per spatial layer, int parameter * * In a scalable stream, each temporal unit corresponds to a single "frame" * of video, and within a temporal unit there may be multiple spatial layers * with different versions of that frame. * For video playback, only the highest-quality version (within the * selected operating point) is needed, but for some use cases it is useful * to have access to multiple versions of a frame when they are available. */ AV1D_SET_OUTPUT_ALL_LAYERS, /*!\brief Codec control function to set an aom_inspect_cb callback that is * invoked each time a frame is decoded, aom_inspect_init* parameter * * \attention When configured with -DCONFIG_INSPECTION=0, the default, this * returns AOM_CODEC_INCAPABLE. */ AV1_SET_INSPECTION_CALLBACK, /*!\brief Codec control function to set the skip film grain flag, int * parameter * * Valid values are integers. The decoder will skip the film grain when its * value is set to nonzero. The default value is 0. */ AV1D_SET_SKIP_FILM_GRAIN, /*!\brief Codec control function to check the presence of forward key frames, * int* parameter */ AOMD_GET_FWD_KF_PRESENT, /*!\brief Codec control function to get the frame flags of the previous frame * decoded, int* parameter * * This will return a flag of type aom_codec_frame_flags_t. 
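 *
 * A sketch of testing one of those flags after a decode call (`decoder` is an
 * illustrative context name; AOM_FRAME_IS_KEY is one of the frame flags
 * declared in aom/aom_codec.h):
 *
 *     int frame_flags = 0;
 *     if (aom_codec_control(&decoder, AOMD_GET_FRAME_FLAGS, &frame_flags) ==
 *             AOM_CODEC_OK &&
 *         (frame_flags & AOM_FRAME_IS_KEY)) {
 *       // The previously decoded frame was a key frame.
 *     }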
*/ AOMD_GET_FRAME_FLAGS, /*!\brief Codec control function to check the presence of altref frames, int* * parameter */ AOMD_GET_ALTREF_PRESENT, /*!\brief Codec control function to get tile information of the previous frame * decoded, aom_tile_info* parameter * * This will return a struct of type aom_tile_info. */ AOMD_GET_TILE_INFO, /*!\brief Codec control function to get screen content tools information, * aom_screen_content_tools_info* parameter * * It returns a struct of type aom_screen_content_tools_info, which contains * the header flags allow_screen_content_tools, allow_intrabc, and * force_integer_mv. */ AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, /*!\brief Codec control function to get the still picture coding information, * aom_still_picture_info* parameter */ AOMD_GET_STILL_PICTURE, /*!\brief Codec control function to get superblock size, * aom_superblock_size_t* parameter * * It returns an enum, indicating the superblock size read from the sequence * header(0 for BLOCK_64X64 and 1 for BLOCK_128X128) */ AOMD_GET_SB_SIZE, /*!\brief Codec control function to check if the previous frame * decoded has show existing frame flag set, int* parameter */ AOMD_GET_SHOW_EXISTING_FRAME_FLAG, /*!\brief Codec control function to get the S_FRAME coding information, * aom_s_frame_info* parameter */ AOMD_GET_S_FRAME_INFO, /*!\brief Codec control function to get the show frame flag, int* parameter */ AOMD_GET_SHOW_FRAME_FLAG, /*!\brief Codec control function to get the base q index of a frame, int* * parameter */ AOMD_GET_BASE_Q_IDX, /*!\brief Codec control function to get the order hint of a frame, unsigned * int* parameter */ AOMD_GET_ORDER_HINT, /*!\brief Codec control function to get the info of a 4x4 block. * Parameters: int mi_row, int mi_col, and MB_MODE_INFO*. * * \note This only returns a shallow copy, so all pointer members should not * be used. */ AV1D_GET_MI_INFO, }; /*!\cond */ /*!\brief AOM decoder control function parameter type * * Defines the data types that AOMD control functions take. * * \note Additional common controls are defined in aom.h. * * \note For each control ID "X", a macro-define of * AOM_CTRL_X is provided. It is used at compile time to determine * if the control ID is supported by the libaom library available, * when the libaom version cannot be controlled. 
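 *
 * A sketch of such a compile-time check (the control named here is only an
 * example):
 *
 *     #ifdef AOM_CTRL_AOMD_GET_SHOW_FRAME_FLAG
 *       // This libaom exposes AOMD_GET_SHOW_FRAME_FLAG, so it may be used.
 *     #endif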
*/ AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *) #define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *) #define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *) #define AOM_CTRL_AOMD_GET_LAST_REF_USED AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *) #define AOM_CTRL_AV1D_GET_FRAME_SIZE AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *) #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *) #define AOM_CTRL_AV1D_GET_BIT_DEPTH AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *) #define AOM_CTRL_AV1D_GET_IMG_FORMAT AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *) #define AOM_CTRL_AV1D_GET_TILE_SIZE AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *) #define AOM_CTRL_AV1D_GET_TILE_COUNT AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int) #define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER AOM_CTRL_USE_TYPE(AV1_SET_SKIP_LOOP_FILTER, int) #define AOM_CTRL_AV1_SET_SKIP_LOOP_FILTER AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **) #define AOM_CTRL_AV1_GET_ACCOUNTING AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *) #define AOM_CTRL_AOMD_GET_LAST_QUANTIZER AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int) #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int) #define AOM_CTRL_AV1_SET_DECODE_TILE_COL AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int) #define AOM_CTRL_AV1_SET_TILE_MODE AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *) #define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *) #define AOM_CTRL_AV1D_GET_TILE_DATA AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *) #define AOM_CTRL_AV1D_SET_EXT_REF_PTR AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int) #define AOM_CTRL_AV1D_EXT_TILE_DEBUG AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int) #define AOM_CTRL_AV1D_SET_ROW_MT AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int) #define AOM_CTRL_AV1D_SET_IS_ANNEXB AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int) #define AOM_CTRL_AV1D_SET_OPERATING_POINT AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int) #define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *) #define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int) #define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN AOM_CTRL_USE_TYPE(AOMD_GET_FWD_KF_PRESENT, int *) #define AOM_CTRL_AOMD_GET_FWD_KF_PRESENT AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_FLAGS, int *) #define AOM_CTRL_AOMD_GET_FRAME_FLAGS AOM_CTRL_USE_TYPE(AOMD_GET_ALTREF_PRESENT, int *) #define AOM_CTRL_AOMD_GET_ALTREF_PRESENT AOM_CTRL_USE_TYPE(AOMD_GET_TILE_INFO, aom_tile_info *) #define AOM_CTRL_AOMD_GET_TILE_INFO AOM_CTRL_USE_TYPE(AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, aom_screen_content_tools_info *) #define AOM_CTRL_AOMD_GET_SCREEN_CONTENT_TOOLS_INFO AOM_CTRL_USE_TYPE(AOMD_GET_STILL_PICTURE, aom_still_picture_info *) #define AOM_CTRL_AOMD_GET_STILL_PICTURE AOM_CTRL_USE_TYPE(AOMD_GET_SB_SIZE, aom_superblock_size_t *) #define AOMD_CTRL_AOMD_GET_SB_SIZE AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_EXISTING_FRAME_FLAG, int *) #define AOMD_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG AOM_CTRL_USE_TYPE(AOMD_GET_S_FRAME_INFO, aom_s_frame_info *) #define AOMD_CTRL_AOMD_GET_S_FRAME_INFO AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_FRAME_FLAG, int *) #define AOM_CTRL_AOMD_GET_SHOW_FRAME_FLAG AOM_CTRL_USE_TYPE(AOMD_GET_BASE_Q_IDX, int *) #define AOM_CTRL_AOMD_GET_BASE_Q_IDX AOM_CTRL_USE_TYPE(AOMD_GET_ORDER_HINT, unsigned int *) #define 
AOM_CTRL_AOMD_GET_ORDER_HINT // The AOM_CTRL_USE_TYPE macro can't be used with AV1D_GET_MI_INFO because // AV1D_GET_MI_INFO takes more than one parameter. #define AOM_CTRL_AV1D_GET_MI_INFO /*!\endcond */ /*! @} - end defgroup aom_decoder */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_AOMDX_H_ aom-3.12.1/aom/exports_com000066400000000000000000000015131477627663500154420ustar00rootroot00000000000000text aom_codec_build_config text aom_codec_control text aom_codec_destroy text aom_codec_err_to_string text aom_codec_error text aom_codec_error_detail text aom_codec_get_caps text aom_codec_iface_name text aom_codec_set_option text aom_codec_version text aom_codec_version_extra_str text aom_codec_version_str text aom_img_add_metadata text aom_img_alloc text aom_img_alloc_with_border text aom_img_flip text aom_img_free text aom_img_get_metadata text aom_img_metadata_free text aom_img_metadata_alloc text aom_img_num_metadata text aom_img_plane_height text aom_img_plane_width text aom_img_remove_metadata text aom_img_set_rect text aom_img_wrap text aom_rb_bytes_read text aom_rb_read_bit text aom_rb_read_literal text aom_rb_read_uvlc text aom_uleb_decode text aom_uleb_encode text aom_uleb_encode_fixed_size text aom_uleb_size_in_bytes aom-3.12.1/aom/exports_dec000066400000000000000000000003511477627663500154160ustar00rootroot00000000000000text aom_codec_dec_init_ver text aom_codec_decode text aom_codec_get_frame text aom_codec_get_stream_info text aom_codec_peek_stream_info text aom_codec_set_frame_buffer_functions text aom_obu_type_to_string text aom_read_obu_header aom-3.12.1/aom/exports_enc000066400000000000000000000003571477627663500154360ustar00rootroot00000000000000text aom_codec_enc_config_default text aom_codec_enc_config_set text aom_codec_enc_init_ver text aom_codec_encode text aom_codec_get_cx_data text aom_codec_get_global_headers text aom_codec_get_preview_frame text aom_codec_set_cx_data_buf aom-3.12.1/aom/internal/000077500000000000000000000000001477627663500147715ustar00rootroot00000000000000aom-3.12.1/aom/internal/aom_codec_internal.h000066400000000000000000000405511477627663500207540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Describes the decoder algorithm interface for algorithm * implementations. * * This file defines the private structures and data types that are only * relevant to implementing an algorithm, as opposed to using it. * * To create a decoder algorithm class, an interface structure is put * into the global namespace: *
 *     my_codec.c:
 *       aom_codec_iface_t my_codec = {
 *           "My Codec v1.0",
 *           AOM_CODEC_ALG_ABI_VERSION,
 *           ...
 *       };
 *     
* * An application instantiates a specific decoder instance by using * aom_codec_dec_init() and a pointer to the algorithm's interface structure: *
 *     my_app.c:
 *       extern aom_codec_iface_t my_codec;
 *       {
 *           aom_codec_ctx_t algo;
 *           int threads = 4;
 *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
 *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
 *       }
 *     
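 *
 * Continuing that sketch (error handling elided; `data` and `data_sz` stand
 * in for a buffer of coded data), decoding then proceeds through the same
 * generic wrappers:
 *
 *       aom_codec_decode(&algo, data, data_sz, NULL);
 *       aom_codec_iter_t iter = NULL;
 *       aom_image_t *frame;
 *       while ((frame = aom_codec_get_frame(&algo, &iter)) != NULL) {
 *         // Consume the decoded frame.
 *       }
 *       aom_codec_destroy(&algo);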
* * Once initialized, the instance is managed using other functions from * the aom_codec_* family. */ #ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ #define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ #include "../aom_decoder.h" #include "../aom_encoder.h" #include "common/args_helper.h" #include #ifdef __cplusplus extern "C" { #endif /*!\brief Current ABI version number * * \internal * If this file is altered in any way that changes the ABI, this value * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ #define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/ typedef struct aom_codec_alg_priv aom_codec_alg_priv_t; /*!\brief init function pointer prototype * * Performs algorithm-specific initialization of the decoder context. This * function is called by aom_codec_dec_init() and aom_codec_enc_init(), so * plugins implementing this interface may trust the input parameters to be * properly initialized. * * \param[in] ctx Pointer to this instance's context * \retval #AOM_CODEC_OK * The input stream was recognized and decoder initialized. * \retval #AOM_CODEC_MEM_ERROR * Memory operation failed. */ typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx); /*!\brief destroy function pointer prototype * * Performs algorithm-specific destruction of the decoder context. This * function is called by the generic aom_codec_destroy() wrapper function, * so plugins implementing this interface may trust the input parameters * to be properly initialized. * * \param[in] ctx Pointer to this instance's context * \retval #AOM_CODEC_OK * The input stream was recognized and decoder initialized. * \retval #AOM_CODEC_MEM_ERROR * Memory operation failed. */ typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx); /*!\brief parse stream info function pointer prototype * * Performs high level parsing of the bitstream. This function is called by the * generic aom_codec_peek_stream_info() wrapper function, so plugins * implementing this interface may trust the input parameters to be properly * initialized. * * \param[in] data Pointer to a block of data to parse * \param[in] data_sz Size of the data buffer * \param[in,out] si Pointer to stream info to update. The is_annexb * member \ref MUST be properly initialized. This * function sets the rest of the members. * * \retval #AOM_CODEC_OK * Bitstream is parsable and stream information updated */ typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si); /*!\brief Return information about the current stream. * * Returns information about the stream that has been parsed during decoding. * * \param[in] ctx Pointer to this instance's context * \param[in,out] si Pointer to stream info to update * * \retval #AOM_CODEC_OK * Bitstream is parsable and stream information updated */ typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx, aom_codec_stream_info_t *si); /*!\brief control function pointer prototype * * This function is used to exchange algorithm specific data with the decoder * instance. This can be used to implement features specific to a particular * algorithm. * * This function is called by the generic aom_codec_control() wrapper * function, so plugins implementing this interface may trust the input * parameters to be properly initialized. 
However, this interface does not * provide type safety for the exchanged data or assign meanings to the * control IDs. Those details should be specified in the algorithm's * header file. In particular, the ctrl_id parameter is guaranteed to exist * in the algorithm's control mapping table, and the data parameter may be NULL. * * * \param[in] ctx Pointer to this instance's context * \param[in] ctrl_id Algorithm specific control identifier * \param[in,out] data Data to exchange with algorithm instance. * * \retval #AOM_CODEC_OK * The internal state data was deserialized. */ typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx, va_list ap); /*!\brief codec option setter function pointer prototype * This function is used to set a codec option using a key (option name) & value * pair. * * \param[in] ctx Pointer to this instance's context * \param[in] name A string of the option's name (key) * \param[in] value A string of the value to be set to * * \retval #AOM_CODEC_OK * The option is successfully set to the value * \retval #AOM_CODEC_INVALID_PARAM * The data was not valid. */ typedef aom_codec_err_t (*aom_codec_set_option_fn_t)(aom_codec_alg_priv_t *ctx, const char *name, const char *value); /*!\brief control function pointer mapping * * This structure stores the mapping between control identifiers and * implementing functions. Each algorithm provides a list of these * mappings. This list is searched by the aom_codec_control() * function to determine which function to invoke. The special * value defined by CTRL_MAP_END is used to indicate end-of-list, and must be * present. It can be tested with the at_ctrl_map_end function. Note that * ctrl_id values \ref MUST be non-zero. */ typedef const struct aom_codec_ctrl_fn_map { int ctrl_id; aom_codec_control_fn_t fn; } aom_codec_ctrl_fn_map_t; #define CTRL_MAP_END \ { 0, NULL } static inline int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) { return e->ctrl_id == 0 && e->fn == NULL; } /*!\brief decode data function pointer prototype * * Processes a buffer of coded data. This function is called by the generic * aom_codec_decode() wrapper function, so plugins implementing this interface * may trust the input parameters to be properly initialized. * * \param[in] ctx Pointer to this instance's context * \param[in] data Pointer to this block of new coded data. * \param[in] data_sz Size of the coded data, in bytes. * * \return Returns #AOM_CODEC_OK if the coded data was processed completely * and future pictures can be decoded without error. Otherwise, * see the descriptions of the other error codes in ::aom_codec_err_t * for recoverability capabilities. */ typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv); /*!\brief Decoded frames iterator * * Iterates over a list of the frames available for display. The iterator * storage should be initialized to NULL to start the iteration. Iteration is * complete when this function returns NULL. * * The list of available frames becomes valid upon completion of the * aom_codec_decode call, and remains valid until the next call to * aom_codec_decode. * * \param[in] ctx Pointer to this instance's context * \param[in out] iter Iterator storage, initialized to NULL * * \return Returns a pointer to an image, if one is ready for display. Frames * produced will always be in PTS (presentation time stamp) order. 
*/ typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter); /*!\brief Pass in external frame buffers for the decoder to use. * * Registers functions to be called when libaom needs a frame buffer * to decode the current frame and a function to be called when libaom does * not internally reference the frame buffer. This set function must * be called before the first call to decode or libaom will assume the * default behavior of allocating frame buffers internally. * * \param[in] ctx Pointer to this instance's context * \param[in] cb_get Pointer to the get callback function * \param[in] cb_release Pointer to the release callback function * \param[in] cb_priv Callback's private data * * \retval #AOM_CODEC_OK * External frame buffers will be used by libaom. * \retval #AOM_CODEC_INVALID_PARAM * One or more of the callbacks were NULL. * \retval #AOM_CODEC_ERROR * Decoder context not initialized, or algorithm not capable of * using external frame buffers. * * \note * When decoding AV1, the application may be required to pass in at least * #AOM_MAXIMUM_WORK_BUFFERS external frame * buffers. */ typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)( aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, aom_enc_frame_flags_t flags); typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)( aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter); typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)( aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg); typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)( aom_codec_alg_priv_t *ctx); typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)( aom_codec_alg_priv_t *ctx); /*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ struct aom_codec_iface { const char *name; /**< Identification String */ int abi_version; /**< Implemented ABI version */ aom_codec_caps_t caps; /**< Decoder capabilities */ aom_codec_init_fn_t init; /**< \copydoc ::aom_codec_init_fn_t */ aom_codec_destroy_fn_t destroy; /**< \copydoc ::aom_codec_destroy_fn_t */ aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */ struct aom_codec_dec_iface { aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */ aom_codec_get_si_fn_t get_si; /**< \copydoc ::aom_codec_get_si_fn_t */ aom_codec_decode_fn_t decode; /**< \copydoc ::aom_codec_decode_fn_t */ aom_codec_get_frame_fn_t get_frame; /**< \copydoc ::aom_codec_get_frame_fn_t */ aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */ } dec; struct aom_codec_enc_iface { int cfg_count; const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */ aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */ aom_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */ aom_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */ aom_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */ aom_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */ } enc; aom_codec_set_option_fn_t set_option; }; /*!\brief Instance private storage * * This structure is allocated by the algorithm's init function. 
It can be * extended in one of two ways. First, a second, algorithm specific structure * can be allocated and the priv member pointed to it. Alternatively, this * structure can be made the first member of the algorithm specific structure, * and the pointer cast to the proper type. */ struct aom_codec_priv { const char *err_detail; aom_codec_flags_t init_flags; struct { aom_fixed_buf_t cx_data_dst_buf; unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; aom_codec_cx_pkt_t cx_data_pkt; } enc; }; #define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id) /* Internal Utility Functions * * The following functions are intended to be used inside algorithms as * utilities for manipulating aom_codec_* data structures. */ struct aom_codec_pkt_list { unsigned int cnt; unsigned int max; struct aom_codec_cx_pkt pkts[1]; }; #define aom_codec_pkt_list_decl(n) \ union { \ struct aom_codec_pkt_list head; \ struct { \ struct aom_codec_pkt_list head; \ struct aom_codec_cx_pkt pkts[n]; \ } alloc; \ } #define aom_codec_pkt_list_init(m) \ (m)->alloc.head.cnt = 0, \ (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) int aom_codec_pkt_list_add(struct aom_codec_pkt_list *, const struct aom_codec_cx_pkt *); const aom_codec_cx_pkt_t *aom_codec_pkt_list_get( struct aom_codec_pkt_list *list, aom_codec_iter_t *iter); #include #include struct aom_internal_error_info { aom_codec_err_t error_code; int has_detail; char detail[ARG_ERR_MSG_MAX_LEN]; int setjmp; // Boolean: whether 'jmp' is valid. jmp_buf jmp; }; #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) #undef CLANG_ANALYZER_NORETURN #define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn)) #endif #endif // Tells the compiler to perform `printf` format string checking if the // compiler supports it; see the 'format' attribute in // . #define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) #if defined(__has_attribute) #if __has_attribute(format) #undef LIBAOM_FORMAT_PRINTF #define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) \ __attribute__((__format__(__printf__, string_index, first_to_check))) #endif #endif // Records the error code and error message. Does not call longjmp(). void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4); void aom_internal_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; // Calls aom_internal_error() with the error code and error message in `src`. // `info` and `src` must not point to the same struct, i.e., self copy is // prohibited. void aom_internal_error_copy(struct aom_internal_error_info *info, const struct aom_internal_error_info *src) CLANG_ANALYZER_NORETURN; void aom_merge_corrupted_flag(int *corrupted, int value); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ aom-3.12.1/aom/internal/aom_image_internal.h000066400000000000000000000072261477627663500207630ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Describes the internal functions associated with the aom image * descriptor. * */ #ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ #define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ #include "aom/aom_image.h" #ifdef __cplusplus extern "C" { #endif /*!\brief Array of aom_metadata structs for an image. */ struct aom_metadata_array { size_t sz; /* Number of metadata structs in the list */ aom_metadata_t **metadata_array; /* Array of metadata structs */ }; /*!\brief Alloc memory for aom_metadata_array struct. * * Allocate memory for aom_metadata_array struct. * If sz is 0 the aom_metadata_array struct's internal buffer list will be * NULL, but the aom_metadata_array struct itself will still be allocated. * Returns a pointer to the allocated struct or NULL on failure. * * \param[in] sz Size of internal metadata list buffer */ aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz); /*!\brief Free metadata array struct. * * Free metadata array struct and all metadata structs inside. * * \param[in] arr Metadata array struct pointer */ void aom_img_metadata_array_free(aom_metadata_array_t *arr); typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size); /*!\brief Open a descriptor, allocating storage for the underlying image by * using the provided callback function. * * Returns a descriptor for storing an image of the given format. The storage * for the image is allocated by using the provided callback function. Unlike * aom_img_alloc(), the returned descriptor does not own the storage for the * image. The caller is responsible for freeing the storage for the image. * * Note: If the callback function is invoked and succeeds, * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if * aom_img_alloc_with_cb() fails, the caller is assured that no storage was * allocated. * * \param[in] img Pointer to storage for descriptor. If this parameter * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image * \param[in] d_w Width of the image * \param[in] d_h Height of the image * \param[in] align Alignment, in bytes, of the image buffer and * each row in the image (stride). * \param[in] alloc_cb Callback function used to allocate storage for the * image. * \param[in] cb_priv The first argument ('priv') for the callback * function. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. */ aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align, aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ aom-3.12.1/aom/src/000077500000000000000000000000001477627663500137445ustar00rootroot00000000000000aom-3.12.1/aom/src/aom_codec.c000066400000000000000000000137121477627663500160250ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * */ #include #include #include #include "config/aom_config.h" #include "config/aom_version.h" #include "aom/aom_integer.h" #include "aom/internal/aom_codec_internal.h" int aom_codec_version(void) { return VERSION_PACKED; } const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; } const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; } const char *aom_codec_iface_name(aom_codec_iface_t *iface) { return iface ? iface->name : ""; } const char *aom_codec_err_to_string(aom_codec_err_t err) { switch (err) { case AOM_CODEC_OK: return "Success"; case AOM_CODEC_ERROR: return "Unspecified internal error"; case AOM_CODEC_MEM_ERROR: return "Memory allocation error"; case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch"; case AOM_CODEC_INCAPABLE: return "Codec does not implement requested capability"; case AOM_CODEC_UNSUP_BITSTREAM: return "Bitstream not supported by this decoder"; case AOM_CODEC_UNSUP_FEATURE: return "Bitstream required feature not supported by this decoder"; case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected"; case AOM_CODEC_INVALID_PARAM: return "Invalid parameter"; case AOM_CODEC_LIST_END: return "End of iterated list"; } return "Unrecognized error code"; } const char *aom_codec_error(const aom_codec_ctx_t *ctx) { return (ctx) ? aom_codec_err_to_string(ctx->err) : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM); } const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; return NULL; } aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) { if (!ctx) { return AOM_CODEC_INVALID_PARAM; } if (!ctx->iface || !ctx->priv) { ctx->err = AOM_CODEC_ERROR; return AOM_CODEC_ERROR; } ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv); ctx->iface = NULL; ctx->name = NULL; ctx->priv = NULL; ctx->err = AOM_CODEC_OK; return AOM_CODEC_OK; } aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) { return iface ? iface->caps : 0; } aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) { if (!ctx) { return AOM_CODEC_INVALID_PARAM; } // Control ID must be non-zero. if (!ctrl_id) { ctx->err = AOM_CODEC_INVALID_PARAM; return AOM_CODEC_INVALID_PARAM; } if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) { ctx->err = AOM_CODEC_ERROR; return AOM_CODEC_ERROR; } // "ctrl_maps" is an array of (control ID, function pointer) elements, // with CTRL_MAP_END as a sentinel. 
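  // Each handler extracts its arguments from 'ap' with va_arg() (see the CAST
  // macro in aom_codec_internal.h), using the argument type declared by the
  // control's AOM_CTRL_USE_TYPE entry in the public header.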
for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps; !at_ctrl_map_end(entry); ++entry) { if (entry->ctrl_id == ctrl_id) { va_list ap; va_start(ap, ctrl_id); ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap); va_end(ap); return ctx->err; } } ctx->err = AOM_CODEC_ERROR; ctx->priv->err_detail = "Invalid control ID"; return AOM_CODEC_ERROR; } aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name, const char *value) { if (!ctx) { return AOM_CODEC_INVALID_PARAM; } if (!ctx->iface || !ctx->priv || !ctx->iface->set_option) { ctx->err = AOM_CODEC_ERROR; return AOM_CODEC_ERROR; } ctx->err = ctx->iface->set_option((aom_codec_alg_priv_t *)ctx->priv, name, value); return ctx->err; } LIBAOM_FORMAT_PRINTF(3, 0) static void set_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, va_list ap) { info->error_code = error; info->has_detail = 0; if (fmt) { size_t sz = sizeof(info->detail); info->has_detail = 1; vsnprintf(info->detail, sz - 1, fmt, ap); info->detail[sz - 1] = '\0'; } } void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); set_error(info, error, fmt, ap); va_end(ap); assert(!info->setjmp); } void aom_internal_error(struct aom_internal_error_info *info, aom_codec_err_t error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); set_error(info, error, fmt, ap); va_end(ap); if (info->setjmp) longjmp(info->jmp, info->error_code); } void aom_internal_error_copy(struct aom_internal_error_info *info, const struct aom_internal_error_info *src) { assert(info != src); assert(!src->setjmp); if (!src->has_detail) { aom_internal_error(info, src->error_code, NULL); } else { aom_internal_error(info, src->error_code, "%s", src->detail); } } void aom_merge_corrupted_flag(int *corrupted, int value) { *corrupted |= value; } const char *aom_obu_type_to_string(OBU_TYPE type) { switch (type) { case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER"; case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER"; case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER"; case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER"; case OBU_FRAME: return "OBU_FRAME"; case OBU_TILE_GROUP: return "OBU_TILE_GROUP"; case OBU_METADATA: return "OBU_METADATA"; case OBU_TILE_LIST: return "OBU_TILE_LIST"; case OBU_PADDING: return "OBU_PADDING"; default: break; } return ""; } aom-3.12.1/aom/src/aom_decoder.c000066400000000000000000000077401477627663500163610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * */ #include #include "aom/internal/aom_codec_internal.h" #define SAVE_STATUS(ctx, var) (ctx ? 
(ctx->err = var) : var) static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { return (aom_codec_alg_priv_t *)ctx->priv; } aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, const aom_codec_dec_cfg_t *cfg, aom_codec_flags_t flags, int ver) { aom_codec_err_t res; if (ver != AOM_DECODER_ABI_VERSION) res = AOM_CODEC_ABI_MISMATCH; else if (!ctx || !iface) res = AOM_CODEC_INVALID_PARAM; else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) res = AOM_CODEC_ABI_MISMATCH; else if (!(iface->caps & AOM_CODEC_CAP_DECODER)) res = AOM_CODEC_INCAPABLE; else { memset(ctx, 0, sizeof(*ctx)); ctx->iface = iface; ctx->name = iface->name; ctx->priv = NULL; ctx->init_flags = flags; ctx->config.dec = cfg; res = ctx->iface->init(ctx); if (res) { ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; aom_codec_destroy(ctx); } } return SAVE_STATUS(ctx, res); } aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si) { aom_codec_err_t res; if (!iface || !data || !data_sz || !si) { res = AOM_CODEC_INVALID_PARAM; } else { /* Set default/unknown values */ si->w = 0; si->h = 0; res = iface->dec.peek_si(data, data_sz, si); } return res; } aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, aom_codec_stream_info_t *si) { aom_codec_err_t res; if (!ctx || !si) { res = AOM_CODEC_INVALID_PARAM; } else if (!ctx->iface || !ctx->priv) { res = AOM_CODEC_ERROR; } else { /* Set default/unknown values */ si->w = 0; si->h = 0; res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); } return SAVE_STATUS(ctx, res); } aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv) { aom_codec_err_t res; if (!ctx) res = AOM_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = AOM_CODEC_ERROR; else { res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); } return SAVE_STATUS(ctx, res); } aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) { aom_image_t *img; if (!ctx || !iter || !ctx->iface || !ctx->priv) img = NULL; else img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); return img; } aom_codec_err_t aom_codec_set_frame_buffer_functions( aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { aom_codec_err_t res; if (!ctx || !cb_get || !cb_release) { res = AOM_CODEC_INVALID_PARAM; } else if (!ctx->iface || !ctx->priv) { res = AOM_CODEC_ERROR; } else if (!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { res = AOM_CODEC_INCAPABLE; } else { res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, cb_priv); } return SAVE_STATUS(ctx, res); } aom-3.12.1/aom/src/aom_encoder.c000066400000000000000000000260441477627663500163710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Provides the high level interface to wrap encoder algorithms. 
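 *
 * A condensed sketch of how these wrappers are typically driven (context
 * initialization, error handling, and input setup elided; variable names are
 * illustrative):
 *
 *     aom_codec_encode(&encoder, img, pts, duration, 0);
 *     aom_codec_iter_t iter = NULL;
 *     const aom_codec_cx_pkt_t *pkt;
 *     while ((pkt = aom_codec_get_cx_data(&encoder, &iter)) != NULL) {
 *       if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
 *         // Write pkt->data.frame.buf (pkt->data.frame.sz bytes) to output.
 *       }
 *     }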
* */ #include "config/aom_config.h" #if HAVE_FEXCEPT #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #endif #include #include #include #include "aom/aom_encoder.h" #include "aom/internal/aom_codec_internal.h" #define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { return (aom_codec_alg_priv_t *)ctx->priv; } aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, const aom_codec_enc_cfg_t *cfg, aom_codec_flags_t flags, int ver) { aom_codec_err_t res; // The value of AOM_ENCODER_ABI_VERSION in libaom v3.0.0 and v3.1.0 - v3.1.3. // // We are compatible with these older libaom releases. AOM_ENCODER_ABI_VERSION // was incremented after these releases for two reasons: // 1. AOM_ENCODER_ABI_VERSION takes contribution from // AOM_EXT_PART_ABI_VERSION. The external partition API is still // experimental, so it should not be considered as part of the stable ABI. // fd9ed8366 External partition: Define APIs // https://aomedia-review.googlesource.com/c/aom/+/135663 // 2. As a way to detect the presence of speeds 7-9 in all-intra mode. I (wtc) // suggested this change because I misunderstood how // AOM_ENCODER_ABI_VERSION was used. // bbdfa68d1 AllIntra: Redefine all-intra mode speed features for speed 7+ // https://aomedia-review.googlesource.com/c/aom/+/140624 const int aom_encoder_abi_version_25 = 25; // TODO(bug aomedia:3228): Remove the check for aom_encoder_abi_version_25 in // libaom v4.0.0. if (ver != AOM_ENCODER_ABI_VERSION && ver != aom_encoder_abi_version_25) res = AOM_CODEC_ABI_MISMATCH; else if (!ctx || !iface || !cfg) res = AOM_CODEC_INVALID_PARAM; else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) res = AOM_CODEC_ABI_MISMATCH; else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) res = AOM_CODEC_INCAPABLE; else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR)) res = AOM_CODEC_INCAPABLE; else if ((flags & AOM_CODEC_USE_HIGHBITDEPTH) && !(iface->caps & AOM_CODEC_CAP_HIGHBITDEPTH)) { res = AOM_CODEC_INCAPABLE; } else if (cfg->g_bit_depth > 8 && (flags & AOM_CODEC_USE_HIGHBITDEPTH) == 0) { res = AOM_CODEC_INVALID_PARAM; ctx->err_detail = "High bit-depth used without the AOM_CODEC_USE_HIGHBITDEPTH flag."; } else { ctx->iface = iface; ctx->name = iface->name; ctx->priv = NULL; ctx->init_flags = flags; ctx->config.enc = cfg; res = ctx->iface->init(ctx); if (res) { // IMPORTANT: ctx->priv->err_detail must be null or point to a string // that remains valid after ctx->priv is destroyed, such as a C string // literal. This makes it safe to call aom_codec_error_detail() after // aom_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? 
ctx->priv->err_detail : NULL; aom_codec_destroy(ctx); } } return SAVE_STATUS(ctx, res); } aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg, unsigned int usage) { aom_codec_err_t res; if (!iface || !cfg) res = AOM_CODEC_INVALID_PARAM; else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) res = AOM_CODEC_INCAPABLE; else { res = AOM_CODEC_INVALID_PARAM; for (int i = 0; i < iface->enc.cfg_count; ++i) { if (iface->enc.cfgs[i].g_usage == usage) { *cfg = iface->enc.cfgs[i]; res = AOM_CODEC_OK; /* default values */ memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg)); cfg->encoder_cfg.super_block_size = 0; // Dynamic cfg->encoder_cfg.max_partition_size = 128; cfg->encoder_cfg.min_partition_size = 4; cfg->encoder_cfg.disable_trellis_quant = 3; break; } } } return res; } #if AOM_ARCH_X86 || AOM_ARCH_X86_64 /* On X86, disable the x87 unit's internal 80 bit precision for better * consistency with the SSE unit's 64 bit precision. */ #include "aom_ports/x86.h" #define FLOATING_POINT_SET_PRECISION \ unsigned short x87_orig_mode = x87_set_double_precision(); #define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode); #else #define FLOATING_POINT_SET_PRECISION #define FLOATING_POINT_RESTORE_PRECISION #endif // AOM_ARCH_X86 || AOM_ARCH_X86_64 #if HAVE_FEXCEPT && CONFIG_DEBUG #define FLOATING_POINT_SET_EXCEPTIONS \ const int float_excepts = \ feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW); #define FLOATING_POINT_RESTORE_EXCEPTIONS \ if (float_excepts != -1) { \ fedisableexcept(FE_ALL_EXCEPT); \ feenableexcept(float_excepts); \ } #else #define FLOATING_POINT_SET_EXCEPTIONS #define FLOATING_POINT_RESTORE_EXCEPTIONS #endif // HAVE_FEXCEPT && CONFIG_DEBUG /* clang-format off */ #define FLOATING_POINT_INIT \ do { \ FLOATING_POINT_SET_PRECISION \ FLOATING_POINT_SET_EXCEPTIONS #define FLOATING_POINT_RESTORE \ FLOATING_POINT_RESTORE_EXCEPTIONS \ FLOATING_POINT_RESTORE_PRECISION \ } while (0); /* clang-format on */ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, aom_enc_frame_flags_t flags) { aom_codec_err_t res = AOM_CODEC_OK; if (!ctx || (img && !duration)) res = AOM_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = AOM_CODEC_ERROR; else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) res = AOM_CODEC_INCAPABLE; else if (img && ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) != ((ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) != 0)) { res = AOM_CODEC_INVALID_PARAM; #if ULONG_MAX > UINT32_MAX } else if (duration > UINT32_MAX) { res = AOM_CODEC_INVALID_PARAM; #endif } else { /* Execute in a normalized floating point environment, if the platform * requires it. */ FLOATING_POINT_INIT res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags); FLOATING_POINT_RESTORE } return SAVE_STATUS(ctx, res); } const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) { const aom_codec_cx_pkt_t *pkt = NULL; if (ctx) { if (!iter) ctx->err = AOM_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) ctx->err = AOM_CODEC_ERROR; else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) ctx->err = AOM_CODEC_INCAPABLE; else pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); } if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) { // If the application has specified a destination area for the // compressed data, and the codec has not placed the data there, // and it fits, copy it. 
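    // The copy below honors any padding the application requested through
    // aom_codec_set_cx_data_buf() (cx_data_pad_before / cx_data_pad_after).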
aom_codec_priv_t *const priv = ctx->priv; char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; if (dst_buf && pkt->data.raw.buf != dst_buf && pkt->data.raw.sz + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) { aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, pkt->data.raw.sz); *modified_pkt = *pkt; modified_pkt->data.raw.buf = dst_buf; modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; pkt = modified_pkt; } if (dst_buf == pkt->data.raw.buf) { priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; } } return pkt; } aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, const aom_fixed_buf_t *buf, unsigned int pad_before, unsigned int pad_after) { if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM; if (buf) { ctx->priv->enc.cx_data_dst_buf = *buf; ctx->priv->enc.cx_data_pad_before = pad_before; ctx->priv->enc.cx_data_pad_after = pad_after; } else { ctx->priv->enc.cx_data_dst_buf.buf = NULL; ctx->priv->enc.cx_data_dst_buf.sz = 0; ctx->priv->enc.cx_data_pad_before = 0; ctx->priv->enc.cx_data_pad_after = 0; } return AOM_CODEC_OK; } const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) { aom_image_t *img = NULL; if (ctx) { if (!ctx->iface || !ctx->priv) ctx->err = AOM_CODEC_ERROR; else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) ctx->err = AOM_CODEC_INCAPABLE; else if (!ctx->iface->enc.get_preview) ctx->err = AOM_CODEC_INCAPABLE; else img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); } return img; } aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) { aom_fixed_buf_t *buf = NULL; if (ctx) { if (!ctx->iface || !ctx->priv) ctx->err = AOM_CODEC_ERROR; else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) ctx->err = AOM_CODEC_INCAPABLE; else if (!ctx->iface->enc.get_glob_hdrs) ctx->err = AOM_CODEC_INCAPABLE; else buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); } return buf; } aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, const aom_codec_enc_cfg_t *cfg) { aom_codec_err_t res; if (!ctx || !ctx->iface || !ctx->priv || !cfg) res = AOM_CODEC_INVALID_PARAM; else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) res = AOM_CODEC_INCAPABLE; else res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); return SAVE_STATUS(ctx, res); } int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list, const struct aom_codec_cx_pkt *pkt) { if (list->cnt < list->max) { list->pkts[list->cnt++] = *pkt; return 0; } return 1; } const aom_codec_cx_pkt_t *aom_codec_pkt_list_get( struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) { const aom_codec_cx_pkt_t *pkt; if (!(*iter)) { *iter = list->pkts; } pkt = (const aom_codec_cx_pkt_t *)*iter; if ((size_t)(pkt - list->pkts) < list->cnt) *iter = pkt + 1; else pkt = NULL; return pkt; } aom-3.12.1/aom/src/aom_image.c000066400000000000000000000331121477627663500160260ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom/internal/aom_image_internal.h" #include "aom_mem/aom_mem.h" static inline unsigned int align_image_dimension(unsigned int d, unsigned int subsampling, unsigned int size_align) { unsigned int align; align = (1 << subsampling) - 1; align = (size_align - 1 > align) ? (size_align - 1) : align; return ((d + align) & ~align); } static aom_image_t *img_alloc_helper( aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int buf_align, unsigned int stride_align, unsigned int size_align, unsigned int border, unsigned char *img_data, aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) { /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */ unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); if (fmt == AOM_IMG_FMT_NONE) goto fail; /* Impose maximum values on input parameters so that this function can * perform arithmetic operations without worrying about overflows. */ if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || stride_align > 65536 || size_align > 65536 || border > 65536) { goto fail; } /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; /* Validate alignment (must be power of 2) */ if (buf_align & (buf_align - 1)) goto fail; /* Treat align==0 like align==1 */ if (!stride_align) stride_align = 1; /* Validate alignment (must be power of 2) */ if (stride_align & (stride_align - 1)) goto fail; /* Treat align==0 like align==1 */ if (!size_align) size_align = 1; /* Validate alignment (must be power of 2) */ if (size_align & (size_align - 1)) goto fail; /* Get sample size for this format */ switch (fmt) { case AOM_IMG_FMT_I420: case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: case AOM_IMG_FMT_AOMI420: case AOM_IMG_FMT_AOMYV12: bps = 12; break; case AOM_IMG_FMT_I422: bps = 16; break; case AOM_IMG_FMT_I444: bps = 24; break; case AOM_IMG_FMT_YV1216: case AOM_IMG_FMT_I42016: bps = 24; break; case AOM_IMG_FMT_I42216: bps = 32; break; case AOM_IMG_FMT_I44416: bps = 48; break; default: bps = 16; break; } bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; /* Get chroma shift values for this format */ switch (fmt) { case AOM_IMG_FMT_I420: case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: case AOM_IMG_FMT_AOMI420: case AOM_IMG_FMT_AOMYV12: case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I42016: case AOM_IMG_FMT_YV1216: case AOM_IMG_FMT_I42216: xcs = 1; break; default: xcs = 0; break; } switch (fmt) { case AOM_IMG_FMT_I420: case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: case AOM_IMG_FMT_AOMI420: case AOM_IMG_FMT_AOMYV12: case AOM_IMG_FMT_YV1216: case AOM_IMG_FMT_I42016: ycs = 1; break; default: ycs = 0; break; } /* Calculate storage sizes given the chroma subsampling */ const unsigned int w = align_image_dimension(d_w, xcs, size_align); assert(d_w <= w); const unsigned int h = align_image_dimension(d_h, ycs, size_align); assert(d_h <= h); uint64_t s = (uint64_t)w + 2 * border; s = (fmt & AOM_IMG_FMT_PLANAR) ? 
s : s * bps / bit_depth; s = s * bit_depth / 8; s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); if (s > INT_MAX) goto fail; const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { img = (aom_image_t *)calloc(1, sizeof(aom_image_t)); if (!img) goto fail; img->self_allocd = 1; } img->img_data = img_data; if (!img_data) { const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR) ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth : (uint64_t)(h + 2 * border) * stride_in_bytes; if (alloc_size != (size_t)alloc_size) goto fail; if (alloc_cb) { const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1; img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size); if (img->img_data) { img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align); } img->img_data_owner = 0; } else { img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size); img->img_data_owner = 1; } img->sz = (size_t)alloc_size; } if (!img->img_data) goto fail; img->fmt = fmt; img->bit_depth = bit_depth; // aligned width and aligned height img->w = w; img->h = h; img->x_chroma_shift = xcs; img->y_chroma_shift = ycs; img->bps = bps; /* Calculate strides */ img->stride[AOM_PLANE_Y] = stride_in_bytes; img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs; if (fmt == AOM_IMG_FMT_NV12) { // Each row is a row of U and a row of V interleaved, so the stride is twice // as long. img->stride[AOM_PLANE_U] *= 2; img->stride[AOM_PLANE_V] = 0; } /* Default viewport to entire image. (This aom_img_set_rect call always * succeeds.) */ int ret = aom_img_set_rect(img, 0, 0, d_w, d_h, border); assert(ret == 0); (void)ret; return img; fail: aom_img_free(img); return NULL; } aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align) { return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL, NULL); } aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align, aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) { return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, alloc_cb, cb_priv); } aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int stride_align, unsigned char *img_data) { /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is * not NULL. */ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data, NULL, NULL); } aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int align, unsigned int size_align, unsigned int border) { return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border, NULL, NULL, NULL); } int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h, unsigned int border) { if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && y + h <= img->h) { img->d_w = w; img->d_h = h; x += border; y += border; /* Calculate plane pointers */ if (!(img->fmt & AOM_IMG_FMT_PLANAR)) { img->planes[AOM_PLANE_PACKED] = img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED]; } else { const int bytes_per_sample = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; unsigned char *data = img->img_data; img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; unsigned int uv_y = y >> img->y_chroma_shift; if (img->fmt == AOM_IMG_FMT_NV12) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample * 2 + uv_y * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = NULL; } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; } } return 0; } return -1; } void aom_img_flip(aom_image_t *img) { /* Note: In the calculation pointer adjustment calculation, we want the * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 * standard indicates that if the adjustment parameter is unsigned, the * stride parameter will be promoted to unsigned, causing errors when * the lhs is a larger type than the rhs. */ img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y]; img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y]; img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * img->stride[AOM_PLANE_U]; img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * img->stride[AOM_PLANE_V]; img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V]; } void aom_img_free(aom_image_t *img) { if (img) { aom_img_remove_metadata(img); if (img->img_data && img->img_data_owner) aom_free(img->img_data); if (img->self_allocd) free(img); } } int aom_img_plane_width(const aom_image_t *img, int plane) { if (plane > 0) return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { if (plane > 0) return (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; } aom_metadata_t *aom_img_metadata_alloc( uint32_t type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag) { if (!data || sz == 0) return NULL; aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t)); if (!metadata) return NULL; metadata->type = type; metadata->payload = (uint8_t *)malloc(sz); if (!metadata->payload) { free(metadata); return NULL; } memcpy(metadata->payload, data, sz); metadata->sz = sz; metadata->insert_flag = insert_flag; return metadata; } void aom_img_metadata_free(aom_metadata_t *metadata) { if (metadata) { if (metadata->payload) free(metadata->payload); free(metadata); } } aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) { aom_metadata_array_t *arr = (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t)); if (!arr) return NULL; if (sz > 0) { arr->metadata_array = (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *)); if (!arr->metadata_array) { aom_img_metadata_array_free(arr); return NULL; } arr->sz = sz; } return arr; } void 
aom_img_metadata_array_free(aom_metadata_array_t *arr) { if (arr) { if (arr->metadata_array) { for (size_t i = 0; i < arr->sz; i++) { aom_img_metadata_free(arr->metadata_array[i]); } free(arr->metadata_array); } free(arr); } } int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag) { if (!img) return -1; if (!img->metadata) { img->metadata = aom_img_metadata_array_alloc(0); if (!img->metadata) return -1; } aom_metadata_t *metadata = aom_img_metadata_alloc(type, data, sz, insert_flag); if (!metadata) return -1; aom_metadata_t **metadata_array = (aom_metadata_t **)realloc(img->metadata->metadata_array, (img->metadata->sz + 1) * sizeof(metadata)); if (!metadata_array) { aom_img_metadata_free(metadata); return -1; } img->metadata->metadata_array = metadata_array; img->metadata->metadata_array[img->metadata->sz] = metadata; img->metadata->sz++; return 0; } void aom_img_remove_metadata(aom_image_t *img) { if (img && img->metadata) { aom_img_metadata_array_free(img->metadata); img->metadata = NULL; } } const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, size_t index) { if (!img) return NULL; const aom_metadata_array_t *array = img->metadata; if (array && index < array->sz) { return array->metadata_array[index]; } return NULL; } size_t aom_img_num_metadata(const aom_image_t *img) { if (!img || !img->metadata) return 0; return img->metadata->sz; } aom-3.12.1/aom/src/aom_integer.c000066400000000000000000000062501477627663500164040ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" static const size_t kMaximumLeb128Size = 8; static const uint8_t kLeb128ByteMask = 0x7f; // Binary: 01111111 // Disallow values larger than 32-bits to ensure consistent behavior on 32 and // 64 bit targets: value is typically used to determine buffer allocation size // when decoded. static const uint64_t kMaximumLeb128Value = UINT32_MAX; size_t aom_uleb_size_in_bytes(uint64_t value) { size_t size = 0; do { ++size; } while ((value >>= 7) != 0); return size; } int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, size_t *length) { if (buffer && value) { *value = 0; for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) { const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask; *value |= ((uint64_t)decoded_byte) << (i * 7); if ((*(buffer + i) >> 7) == 0) { if (length) { *length = i + 1; } // Fail on values larger than 32-bits to ensure consistent behavior on // 32 and 64 bit targets: value is typically used to determine buffer // allocation size. 
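// As a concrete illustration of the LEB128 layout: the value 300 (0x12C)
// is coded LSB-first in 7-bit groups as the two bytes 0xAC 0x02. The first
// byte carries the low seven bits (0x2C) with the continuation bit set, the
// second carries the remaining bits (0x02) with the continuation bit clear,
// so decoding reconstructs 0x2C | (0x02 << 7) == 300.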
if (*value > UINT32_MAX) return -1; return 0; } } } // If we get here, either the buffer/value pointers were invalid, // or we ran over the available space return -1; } int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, size_t *coded_size) { const size_t leb_size = aom_uleb_size_in_bytes(value); if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size || leb_size > available || !coded_value || !coded_size) { return -1; } for (size_t i = 0; i < leb_size; ++i) { uint8_t byte = value & 0x7f; value >>= 7; if (value != 0) byte |= 0x80; // Signal that more bytes follow. *(coded_value + i) = byte; } *coded_size = leb_size; return 0; } int aom_uleb_encode_fixed_size(uint64_t value, size_t available, size_t pad_to_size, uint8_t *coded_value, size_t *coded_size) { if (value > kMaximumLeb128Value || !coded_value || !coded_size || available < pad_to_size || pad_to_size > kMaximumLeb128Size) { return -1; } const uint64_t limit = 1ULL << (7 * pad_to_size); if (value >= limit) { // Can't encode 'value' within 'pad_to_size' bytes return -1; } for (size_t i = 0; i < pad_to_size; ++i) { uint8_t byte = value & 0x7f; value >>= 7; if (i < pad_to_size - 1) byte |= 0x80; // Signal that more bytes follow. *(coded_value + i) = byte; } assert(value == 0); *coded_size = pad_to_size; return 0; } aom-3.12.1/aom_dsp/000077500000000000000000000000001477627663500140235ustar00rootroot00000000000000aom-3.12.1/aom_dsp/aom_convolve.c000066400000000000000000000234741477627663500166700ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } static inline int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; return sum; } static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h) { src -= SUBPEL_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; for (int x = 0; x < w; ++x) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; const int sum = horz_scalar_product(src_x, x_filter); dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); x_q4 += x_step_q4; } src += src_stride; dst += dst_stride; } } static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h) { src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; for (int y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; const int sum = vert_scalar_product(src_y, src_stride, y_filter); dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); y_q4 += y_step_q4; } ++src; ++dst; } } static const InterpKernel *get_filter_base(const int16_t *filter) { // NOTE: This assumes that the filter table is 256-byte aligned. return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); } static int get_filter_offset(const int16_t *f, const InterpKernel *base) { return (int)((const InterpKernel *)(intptr_t)f - base); } void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h); } void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h); } void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the temp buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 
// --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. uint8_t temp[64 * 135]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height); convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { for (int r = h; r > 0; --r) { memmove(dst, src, w); src += src_stride; dst += dst_stride; } } #if CONFIG_AV1_HIGHBITDEPTH static inline int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; return sum; } static inline int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; for (int x = 0; x < w; ++x) { const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; const int sum = highbd_horz_scalar_product(src_x, x_filter); dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_q4 += x_step_q4; } src += src_stride; dst += dst_stride; } } static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; for (int y = 0; y < h; ++y) { const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); y_q4 += y_step_q4; } ++src; ++dst; } } void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, w, h, bd); } void aom_highbd_convolve8_vert_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h, bd); } void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { for (int y = 0; y < h; ++y) { memmove(dst, src, w * sizeof(src[0])); src += src_stride; dst += dst_stride; } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/aom_dsp.cmake000066400000000000000000000562711477627663500164620ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_AOM_DSP_AOM_DSP_CMAKE_) return() endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/aom_convolve.c" "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" "${AOM_ROOT}/aom_dsp/aom_filter.h" "${AOM_ROOT}/aom_dsp/aom_simd.h" "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/blend.h" "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" "${AOM_ROOT}/aom_dsp/entcode.c" "${AOM_ROOT}/aom_dsp/entcode.h" "${AOM_ROOT}/aom_dsp/grain_params.h" "${AOM_ROOT}/aom_dsp/intrapred.c" "${AOM_ROOT}/aom_dsp/intrapred_common.h" "${AOM_ROOT}/aom_dsp/loopfilter.c" "${AOM_ROOT}/aom_dsp/odintrin.c" "${AOM_ROOT}/aom_dsp/odintrin.h" "${AOM_ROOT}/aom_dsp/prob.h" "${AOM_ROOT}/aom_dsp/recenter.h" "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" "${AOM_ROOT}/aom_dsp/txfm_common.h" "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h") list(APPEND AOM_DSP_COMMON_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm") if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_COMMON_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm") endif() list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h" 
"${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h") list(APPEND AOM_DSP_COMMON_ASM_SSSE3 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/convolve_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_sse4.c" "${AOM_ROOT}/aom_dsp/x86/intrapred_utils.h") list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c" "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h") if(CONFIG_SVT_AV1) list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h" "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h" "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h" "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h" "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h") endif() list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c" "${AOM_ROOT}/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c") if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c") list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c") list(APPEND AOM_DSP_COMMON_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_sve.c") endif() if(CONFIG_AV1_DECODER) list(APPEND AOM_DSP_DECODER_SOURCES "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" "${AOM_ROOT}/aom_dsp/bitreader.c" "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h") endif() if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/avg.c" "${AOM_ROOT}/aom_dsp/bitwriter.c" "${AOM_ROOT}/aom_dsp/bitwriter.h" "${AOM_ROOT}/aom_dsp/blk_sse_sum.c" "${AOM_ROOT}/aom_dsp/entenc.c" 
"${AOM_ROOT}/aom_dsp/entenc.h" "${AOM_ROOT}/aom_dsp/fft.c" "${AOM_ROOT}/aom_dsp/fft_common.h" "${AOM_ROOT}/aom_dsp/fwd_txfm.c" "${AOM_ROOT}/aom_dsp/psnr.c" "${AOM_ROOT}/aom_dsp/psnr.h" "${AOM_ROOT}/aom_dsp/quantize.c" "${AOM_ROOT}/aom_dsp/quantize.h" "${AOM_ROOT}/aom_dsp/sad.c" "${AOM_ROOT}/aom_dsp/sad_av1.c" "${AOM_ROOT}/aom_dsp/subtract.c" "${AOM_ROOT}/aom_dsp/sse.c" "${AOM_ROOT}/aom_dsp/ssim.c" "${AOM_ROOT}/aom_dsp/ssim.h" "${AOM_ROOT}/aom_dsp/sum_squares.c" "${AOM_ROOT}/aom_dsp/variance.c" "${AOM_ROOT}/aom_dsp/variance.h") # Flow estimation library and grain/noise table/model. if(NOT CONFIG_REALTIME_ONLY) list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/pyramid.c" "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" "${AOM_ROOT}/aom_dsp/flow_estimation/corner_detect.c" "${AOM_ROOT}/aom_dsp/flow_estimation/corner_match.c" "${AOM_ROOT}/aom_dsp/flow_estimation/disflow.c" "${AOM_ROOT}/aom_dsp/flow_estimation/flow_estimation.c" "${AOM_ROOT}/aom_dsp/grain_table.c" "${AOM_ROOT}/aom_dsp/grain_table.h" "${AOM_ROOT}/aom_dsp/noise_model.c" "${AOM_ROOT}/aom_dsp/noise_model.h" "${AOM_ROOT}/aom_dsp/noise_util.c" "${AOM_ROOT}/aom_dsp/noise_util.h" "${AOM_ROOT}/aom_dsp/flow_estimation/ransac.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_sse4.c" "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c" "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_sve.c") endif() list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 "${AOM_ROOT}/aom_dsp/x86/subpel_variance_ssse3.asm") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX 
"${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/variance_ssse3.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c" "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c" "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/sse_neon.c" "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c" "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c" "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c" "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c") if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c") list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c" "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c") endif() if(CONFIG_INTERNAL_STATS) 
list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" "${AOM_ROOT}/aom_dsp/psnrhvs.c") endif() if(CONFIG_TUNE_VMAF) list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c" "${AOM_ROOT}/aom_dsp/vmaf.h") endif() if(CONFIG_TUNE_BUTTERAUGLI) list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/butteraugli.c" "${AOM_ROOT}/aom_dsp/butteraugli.h") endif() if(CONFIG_REALTIME_ONLY) list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c") list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c") list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c") endif() endif() # Creates aom_dsp build targets. Must not be called until after libaom target # has been created. function(setup_aom_dsp_targets) add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_dsp_common) create_no_op_source_file("aom_av1" "c" "no_op_source_file") add_library(aom_dsp OBJECT "${no_op_source_file}") target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() list(APPEND AOM_LIB_TARGETS aom_dsp) # Not all generators support libraries consisting only of object files. Add a # source file to the aom_dsp target. 
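# (Typically this matters for the Xcode generator, which will not build a
# library target whose only inputs are object files; the generated no-op
# source added below works around that.)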
add_no_op_source_file_to_target("aom_dsp" "c") if(CONFIG_AV1_DECODER) add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endif() if(CONFIG_AV1_ENCODER) add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() if(CONFIG_TUNE_VMAF) target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS}) endif() endif() if(HAVE_SSE2) add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSE2") if(CONFIG_AV1_ENCODER) if("${AOM_TARGET_CPU}" STREQUAL "x86_64") list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64}) endif() add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSE2") endif() endif() if(HAVE_SSSE3) add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3") add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSSE3") if(CONFIG_AV1_ENCODER) if("${AOM_TARGET_CPU}" STREQUAL "x86_64") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) endif() add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3") add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSSE3") endif() endif() if(HAVE_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSE4_1") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSE4_1") endif() endif() if(HAVE_AVX) if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_AVX") endif() endif() if(HAVE_AVX2) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_AVX2") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_AVX2") endif() endif() if(HAVE_NEON) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_NEON") endif() endif() if(HAVE_NEON_DOTPROD) add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD") endif() endif() if(HAVE_NEON_I8MM) add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON_I8MM") endif() if(HAVE_SVE) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SVE") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SVE") endif() endif() target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. 
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction() aom-3.12.1/aom_dsp/aom_dsp_common.h000066400000000000000000000070021477627663500171650ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ #define AOM_AOM_DSP_AOM_DSP_COMMON_H_ #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { #endif #if defined(_MSC_VER) #define AOM_FORCE_INLINE __forceinline #else #define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) #endif #define PI 3.141592653589793238462643383279502884 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) #define AOMSIGN(x) ((x) < 0 ? -1 : 0) #define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0])) #define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') #define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) /* Left shifting a negative value became undefined behavior in C99 (downgraded from merely implementation-defined in C89). This should still compile to the correct thing on any two's-complement machine, but avoid ubsan warnings.*/ #define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) // These can be used to give a hint about branch outcomes. // This can have an effect, even if your target processor has a // good branch predictor, as these hints can affect basic block // ordering by the compiler. #ifdef __GNUC__ #define LIKELY(v) __builtin_expect(v, 1) #define UNLIKELY(v) __builtin_expect(v, 0) #else #define LIKELY(v) (v) #define UNLIKELY(v) (v) #endif typedef uint8_t qm_val_t; #define AOM_QM_BITS 5 // Note: // tran_low_t is the datatype used for final transform coefficients. // tran_high_t is the datatype used for intermediate transform stages. typedef int64_t tran_high_t; typedef int32_t tran_low_t; static inline uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } static inline int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } static inline int64_t clamp64(int64_t value, int64_t low, int64_t high) { return value < low ? low : (value > high ? high : value); } static inline double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } static inline uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: default: return (uint16_t)clamp(val, 0, 255); case 10: return (uint16_t)clamp(val, 0, 1023); case 12: return (uint16_t)clamp(val, 0, 4095); } } // The result of this branchless code is equivalent to (value < 0 ? 0 : value) // or max(0, value) and might be faster in some cases. // Care should be taken since the behavior of right shifting signed type // negative value is undefined by C standards and implementation defined, static inline unsigned int negative_to_zero(int value) { return value & ~(value >> (sizeof(value) * 8 - 1)); } // Returns the saturating cast of a double value to int. 
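// For example, saturate_cast_double_to_int(1e12) returns INT_MAX, and
// saturate_cast_double_to_int(42.9) returns 42 because the conversion to int
// truncates the fractional part toward zero.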
static inline int saturate_cast_double_to_int(double d) { if (d > INT_MAX) return INT_MAX; return (int)d; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_ aom-3.12.1/aom_dsp/aom_dsp_rtcd.c000066400000000000000000000012771477627663500166340ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #define RTCD_C #include "config/aom_dsp_rtcd.h" #include "aom_ports/aom_once.h" void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); } aom-3.12.1/aom_dsp/aom_dsp_rtcd_defs.pl000077500000000000000000002744241477627663500200370ustar00rootroot00000000000000## ## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## sub aom_dsp_forward_decls() { print <=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); if ((aom_config("CONFIG_REALTIME_ONLY") ne "yes") || (aom_config("CONFIG_AV1_DECODER") eq "yes")) { push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w)); } # !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } @pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/; # # Intra prediction # foreach (@tx_sizes) { ($w, $h) = @$_; foreach $pred_name (@pred_names) { add_proto "void", "aom_${pred_name}_predictor_${w}x${h}", "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}", "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; } } } specialize qw/aom_dc_top_predictor_4x4 neon sse2/; specialize qw/aom_dc_top_predictor_4x8 neon sse2/; specialize qw/aom_dc_top_predictor_8x4 neon sse2/; specialize qw/aom_dc_top_predictor_8x8 neon sse2/; specialize qw/aom_dc_top_predictor_8x16 neon sse2/; specialize qw/aom_dc_top_predictor_16x8 neon sse2/; specialize qw/aom_dc_top_predictor_16x16 neon sse2/; specialize qw/aom_dc_top_predictor_16x32 neon sse2/; specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/; specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/; specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/; specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/; specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_4x4 neon sse2/; specialize qw/aom_dc_left_predictor_4x8 neon sse2/; specialize qw/aom_dc_left_predictor_8x4 neon sse2/; specialize qw/aom_dc_left_predictor_8x8 neon sse2/; specialize qw/aom_dc_left_predictor_8x16 neon sse2/; specialize qw/aom_dc_left_predictor_16x8 neon sse2/; specialize qw/aom_dc_left_predictor_16x16 neon sse2/; 
specialize qw/aom_dc_left_predictor_16x32 neon sse2/; specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_4x4 neon sse2/; specialize qw/aom_dc_128_predictor_4x8 neon sse2/; specialize qw/aom_dc_128_predictor_8x4 neon sse2/; specialize qw/aom_dc_128_predictor_8x8 neon sse2/; specialize qw/aom_dc_128_predictor_8x16 neon sse2/; specialize qw/aom_dc_128_predictor_16x8 neon sse2/; specialize qw/aom_dc_128_predictor_16x16 neon sse2/; specialize qw/aom_dc_128_predictor_16x32 neon sse2/; specialize qw/aom_dc_128_predictor_32x16 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/; specialize qw/aom_v_predictor_4x4 neon sse2/; specialize qw/aom_v_predictor_4x8 neon sse2/; specialize qw/aom_v_predictor_8x4 neon sse2/; specialize qw/aom_v_predictor_8x8 neon sse2/; specialize qw/aom_v_predictor_8x16 neon sse2/; specialize qw/aom_v_predictor_16x8 neon sse2/; specialize qw/aom_v_predictor_16x16 neon sse2/; specialize qw/aom_v_predictor_16x32 neon sse2/; specialize qw/aom_v_predictor_32x16 neon sse2 avx2/; specialize qw/aom_v_predictor_32x32 neon sse2 avx2/; specialize qw/aom_v_predictor_32x64 neon sse2 avx2/; specialize qw/aom_v_predictor_64x32 neon sse2 avx2/; specialize qw/aom_v_predictor_64x64 neon sse2 avx2/; specialize qw/aom_h_predictor_4x4 neon sse2/; specialize qw/aom_h_predictor_4x8 neon sse2/; specialize qw/aom_h_predictor_8x4 neon sse2/; specialize qw/aom_h_predictor_8x8 neon sse2/; specialize qw/aom_h_predictor_8x16 neon sse2/; specialize qw/aom_h_predictor_16x8 neon sse2/; specialize qw/aom_h_predictor_16x16 neon sse2/; specialize qw/aom_h_predictor_16x32 neon sse2/; specialize qw/aom_h_predictor_32x16 neon sse2/; specialize qw/aom_h_predictor_32x32 neon sse2 avx2/; specialize qw/aom_h_predictor_32x64 neon sse2/; specialize qw/aom_h_predictor_64x32 neon sse2/; specialize qw/aom_h_predictor_64x64 neon sse2/; specialize qw/aom_paeth_predictor_4x4 ssse3 neon/; specialize qw/aom_paeth_predictor_4x8 ssse3 neon/; specialize qw/aom_paeth_predictor_8x4 ssse3 neon/; specialize qw/aom_paeth_predictor_8x8 ssse3 neon/; specialize qw/aom_paeth_predictor_8x16 ssse3 neon/; specialize qw/aom_paeth_predictor_16x8 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_16x16 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_16x32 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_32x16 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_32x32 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_32x64 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_64x32 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_64x64 ssse3 avx2 neon/; specialize qw/aom_smooth_predictor_4x4 neon ssse3/; specialize qw/aom_smooth_predictor_4x8 neon ssse3/; specialize qw/aom_smooth_predictor_8x4 neon ssse3/; specialize qw/aom_smooth_predictor_8x8 neon ssse3/; specialize qw/aom_smooth_predictor_8x16 neon ssse3/; specialize qw/aom_smooth_predictor_16x8 neon ssse3/; specialize qw/aom_smooth_predictor_16x16 neon ssse3/; specialize qw/aom_smooth_predictor_16x32 neon ssse3/; specialize qw/aom_smooth_predictor_32x16 neon ssse3/; specialize qw/aom_smooth_predictor_32x32 neon 
ssse3/; specialize qw/aom_smooth_predictor_32x64 neon ssse3/; specialize qw/aom_smooth_predictor_64x32 neon ssse3/; specialize qw/aom_smooth_predictor_64x64 neon ssse3/; specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/; specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/; specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/; specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/; specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/; specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/; specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/; specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/; specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/; specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/; specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/; specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/; specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/; specialize qw/aom_smooth_h_predictor_4x4 neon ssse3/; specialize qw/aom_smooth_h_predictor_4x8 neon ssse3/; specialize qw/aom_smooth_h_predictor_8x4 neon ssse3/; specialize qw/aom_smooth_h_predictor_8x8 neon ssse3/; specialize qw/aom_smooth_h_predictor_8x16 neon ssse3/; specialize qw/aom_smooth_h_predictor_16x8 neon ssse3/; specialize qw/aom_smooth_h_predictor_16x16 neon ssse3/; specialize qw/aom_smooth_h_predictor_16x32 neon ssse3/; specialize qw/aom_smooth_h_predictor_32x16 neon ssse3/; specialize qw/aom_smooth_h_predictor_32x32 neon ssse3/; specialize qw/aom_smooth_h_predictor_32x64 neon ssse3/; specialize qw/aom_smooth_h_predictor_64x32 neon ssse3/; specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/; # TODO(yunqingwang): optimize rectangular DC_PRED to replace division # by multiply and shift. specialize qw/aom_dc_predictor_4x4 neon sse2/; specialize qw/aom_dc_predictor_4x8 neon sse2/; specialize qw/aom_dc_predictor_8x4 neon sse2/; specialize qw/aom_dc_predictor_8x8 neon sse2/; specialize qw/aom_dc_predictor_8x16 neon sse2/; specialize qw/aom_dc_predictor_16x8 neon sse2/; specialize qw/aom_dc_predictor_16x16 neon sse2/; specialize qw/aom_dc_predictor_16x32 neon sse2/; specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/; specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/; specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/; specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/; specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/; if ((aom_config("CONFIG_REALTIME_ONLY") ne "yes") || (aom_config("CONFIG_AV1_DECODER") eq "yes")) { specialize qw/aom_dc_top_predictor_4x16 neon sse2/; specialize qw/aom_dc_top_predictor_8x32 neon sse2/; specialize qw/aom_dc_top_predictor_16x4 neon sse2/; specialize qw/aom_dc_top_predictor_16x64 neon sse2/; specialize qw/aom_dc_top_predictor_32x8 neon sse2/; specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/; specialize qw/aom_dc_left_predictor_4x16 neon sse2/; specialize qw/aom_dc_left_predictor_8x32 neon sse2/; specialize qw/aom_dc_left_predictor_16x4 neon sse2/; specialize qw/aom_dc_left_predictor_16x64 neon sse2/; specialize qw/aom_dc_left_predictor_32x8 neon sse2/; specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/; specialize qw/aom_dc_128_predictor_4x16 neon sse2/; specialize qw/aom_dc_128_predictor_8x32 neon sse2/; specialize qw/aom_dc_128_predictor_16x4 neon sse2/; specialize qw/aom_dc_128_predictor_16x64 neon sse2/; specialize qw/aom_dc_128_predictor_32x8 neon sse2/; specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/; specialize qw/aom_v_predictor_4x16 neon sse2/; specialize qw/aom_v_predictor_8x32 neon sse2/; specialize qw/aom_v_predictor_16x4 neon 
sse2/; specialize qw/aom_v_predictor_16x64 neon sse2/; specialize qw/aom_v_predictor_32x8 neon sse2/; specialize qw/aom_v_predictor_64x16 neon sse2 avx2/; specialize qw/aom_h_predictor_4x16 neon sse2/; specialize qw/aom_h_predictor_8x32 neon sse2/; specialize qw/aom_h_predictor_16x4 neon sse2/; specialize qw/aom_h_predictor_16x64 neon sse2/; specialize qw/aom_h_predictor_32x8 neon sse2/; specialize qw/aom_h_predictor_64x16 neon sse2/; specialize qw/aom_paeth_predictor_4x16 ssse3 neon/; specialize qw/aom_paeth_predictor_8x32 ssse3 neon/; specialize qw/aom_paeth_predictor_16x4 ssse3 neon/; specialize qw/aom_paeth_predictor_16x64 ssse3 avx2 neon/; specialize qw/aom_paeth_predictor_32x8 ssse3 neon/; specialize qw/aom_paeth_predictor_64x16 ssse3 avx2 neon/; specialize qw/aom_smooth_predictor_4x16 neon ssse3/; specialize qw/aom_smooth_predictor_8x32 neon ssse3/; specialize qw/aom_smooth_predictor_16x4 neon ssse3/; specialize qw/aom_smooth_predictor_16x64 neon ssse3/; specialize qw/aom_smooth_predictor_32x8 neon ssse3/; specialize qw/aom_smooth_predictor_64x16 neon ssse3/; specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/; specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/; specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/; specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/; specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/; specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/; specialize qw/aom_smooth_h_predictor_4x16 neon ssse3/; specialize qw/aom_smooth_h_predictor_8x32 neon ssse3/; specialize qw/aom_smooth_h_predictor_16x4 neon ssse3/; specialize qw/aom_smooth_h_predictor_16x64 neon ssse3/; specialize qw/aom_smooth_h_predictor_32x8 neon ssse3/; specialize qw/aom_smooth_h_predictor_64x16 neon ssse3/; specialize qw/aom_dc_predictor_4x16 neon sse2/; specialize qw/aom_dc_predictor_8x32 neon sse2/; specialize qw/aom_dc_predictor_16x4 neon sse2/; specialize qw/aom_dc_predictor_16x64 neon sse2/; specialize qw/aom_dc_predictor_32x8 neon sse2/; specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/; } # !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_v_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_v_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_v_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_v_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_v_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_v_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_v_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_v_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_v_predictor_32x64 neon/; specialize qw/aom_highbd_v_predictor_64x32 neon/; specialize qw/aom_highbd_v_predictor_64x64 neon/; # TODO(yunqingwang): optimize rectangular DC_PRED to replace division # by multiply and shift. 
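# A standard way to do this (a sketch only, not the committed implementation):
# for a rectangular block the pixel count is a small constant such as 12 or 24,
# and an integer divide by 12 can be replaced by a reciprocal multiply plus a
# shift, e.g. x / 12 == (x * 2731) >> 15, which is exact for non-negative x
# below roughly 8000.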
specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_dc_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_dc_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_dc_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_dc_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_dc_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_dc_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_dc_predictor_32x64 neon/; specialize qw/aom_highbd_dc_predictor_64x32 neon/; specialize qw/aom_highbd_dc_predictor_64x64 neon/; specialize qw/aom_highbd_h_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_h_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_h_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_h_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_h_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_h_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_h_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_h_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_h_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_h_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_h_predictor_32x64 neon/; specialize qw/aom_highbd_h_predictor_64x32 neon/; specialize qw/aom_highbd_h_predictor_64x64 neon/; specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_dc_128_predictor_32x64 neon/; specialize qw/aom_highbd_dc_128_predictor_64x32 neon/; specialize qw/aom_highbd_dc_128_predictor_64x64 neon/; specialize qw/aom_highbd_dc_left_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_dc_left_predictor_32x64 neon/; specialize qw/aom_highbd_dc_left_predictor_64x32 neon/; specialize qw/aom_highbd_dc_left_predictor_64x64 neon/; specialize qw/aom_highbd_dc_top_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_4x8 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_8x4 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_8x8 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_8x16 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_16x8 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_16x32 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_32x16 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_32x32 sse2 neon/; specialize qw/aom_highbd_dc_top_predictor_32x64 
neon/; specialize qw/aom_highbd_dc_top_predictor_64x32 neon/; specialize qw/aom_highbd_dc_top_predictor_64x64 neon/; specialize qw/aom_highbd_paeth_predictor_4x4 neon/; specialize qw/aom_highbd_paeth_predictor_4x8 neon/; specialize qw/aom_highbd_paeth_predictor_8x4 neon/; specialize qw/aom_highbd_paeth_predictor_8x8 neon/; specialize qw/aom_highbd_paeth_predictor_8x16 neon/; specialize qw/aom_highbd_paeth_predictor_16x8 neon/; specialize qw/aom_highbd_paeth_predictor_16x16 neon/; specialize qw/aom_highbd_paeth_predictor_16x32 neon/; specialize qw/aom_highbd_paeth_predictor_32x16 neon/; specialize qw/aom_highbd_paeth_predictor_32x32 neon/; specialize qw/aom_highbd_paeth_predictor_32x64 neon/; specialize qw/aom_highbd_paeth_predictor_64x32 neon/; specialize qw/aom_highbd_paeth_predictor_64x64 neon/; specialize qw/aom_highbd_smooth_predictor_4x4 neon/; specialize qw/aom_highbd_smooth_predictor_4x8 neon/; specialize qw/aom_highbd_smooth_predictor_8x4 neon/; specialize qw/aom_highbd_smooth_predictor_8x8 neon/; specialize qw/aom_highbd_smooth_predictor_8x16 neon/; specialize qw/aom_highbd_smooth_predictor_16x8 neon/; specialize qw/aom_highbd_smooth_predictor_16x16 neon/; specialize qw/aom_highbd_smooth_predictor_16x32 neon/; specialize qw/aom_highbd_smooth_predictor_32x16 neon/; specialize qw/aom_highbd_smooth_predictor_32x32 neon/; specialize qw/aom_highbd_smooth_predictor_32x64 neon/; specialize qw/aom_highbd_smooth_predictor_64x32 neon/; specialize qw/aom_highbd_smooth_predictor_64x64 neon/; specialize qw/aom_highbd_smooth_v_predictor_4x4 neon/; specialize qw/aom_highbd_smooth_v_predictor_4x8 neon/; specialize qw/aom_highbd_smooth_v_predictor_8x4 neon/; specialize qw/aom_highbd_smooth_v_predictor_8x8 neon/; specialize qw/aom_highbd_smooth_v_predictor_8x16 neon/; specialize qw/aom_highbd_smooth_v_predictor_16x8 neon/; specialize qw/aom_highbd_smooth_v_predictor_16x16 neon/; specialize qw/aom_highbd_smooth_v_predictor_16x32 neon/; specialize qw/aom_highbd_smooth_v_predictor_32x16 neon/; specialize qw/aom_highbd_smooth_v_predictor_32x32 neon/; specialize qw/aom_highbd_smooth_v_predictor_32x64 neon/; specialize qw/aom_highbd_smooth_v_predictor_64x32 neon/; specialize qw/aom_highbd_smooth_v_predictor_64x64 neon/; specialize qw/aom_highbd_smooth_h_predictor_4x4 neon/; specialize qw/aom_highbd_smooth_h_predictor_4x8 neon/; specialize qw/aom_highbd_smooth_h_predictor_8x4 neon/; specialize qw/aom_highbd_smooth_h_predictor_8x8 neon/; specialize qw/aom_highbd_smooth_h_predictor_8x16 neon/; specialize qw/aom_highbd_smooth_h_predictor_16x8 neon/; specialize qw/aom_highbd_smooth_h_predictor_16x16 neon/; specialize qw/aom_highbd_smooth_h_predictor_16x32 neon/; specialize qw/aom_highbd_smooth_h_predictor_32x16 neon/; specialize qw/aom_highbd_smooth_h_predictor_32x32 neon/; specialize qw/aom_highbd_smooth_h_predictor_32x64 neon/; specialize qw/aom_highbd_smooth_h_predictor_64x32 neon/; specialize qw/aom_highbd_smooth_h_predictor_64x64 neon/; if ((aom_config("CONFIG_REALTIME_ONLY") ne "yes") || (aom_config("CONFIG_AV1_DECODER") eq "yes")) { specialize qw/aom_highbd_v_predictor_4x16 neon/; specialize qw/aom_highbd_v_predictor_8x32 neon/; specialize qw/aom_highbd_v_predictor_16x4 neon/; specialize qw/aom_highbd_v_predictor_16x64 neon/; specialize qw/aom_highbd_v_predictor_32x8 neon/; specialize qw/aom_highbd_v_predictor_64x16 neon/; specialize qw/aom_highbd_dc_predictor_4x16 neon/; specialize qw/aom_highbd_dc_predictor_8x32 neon/; specialize qw/aom_highbd_dc_predictor_16x4 neon/; specialize 
qw/aom_highbd_dc_predictor_16x64 neon/; specialize qw/aom_highbd_dc_predictor_32x8 neon/; specialize qw/aom_highbd_dc_predictor_64x16 neon/; specialize qw/aom_highbd_h_predictor_4x16 neon/; specialize qw/aom_highbd_h_predictor_8x32 neon/; specialize qw/aom_highbd_h_predictor_16x4 neon/; specialize qw/aom_highbd_h_predictor_16x64 neon/; specialize qw/aom_highbd_h_predictor_32x8 neon/; specialize qw/aom_highbd_h_predictor_64x16 neon/; specialize qw/aom_highbd_dc_128_predictor_4x16 neon/; specialize qw/aom_highbd_dc_128_predictor_8x32 neon/; specialize qw/aom_highbd_dc_128_predictor_16x4 neon/; specialize qw/aom_highbd_dc_128_predictor_16x64 neon/; specialize qw/aom_highbd_dc_128_predictor_32x8 neon/; specialize qw/aom_highbd_dc_128_predictor_64x16 neon/; specialize qw/aom_highbd_dc_left_predictor_4x16 neon/; specialize qw/aom_highbd_dc_left_predictor_8x32 neon/; specialize qw/aom_highbd_dc_left_predictor_16x4 neon/; specialize qw/aom_highbd_dc_left_predictor_16x64 neon/; specialize qw/aom_highbd_dc_left_predictor_32x8 neon/; specialize qw/aom_highbd_dc_left_predictor_64x16 neon/; specialize qw/aom_highbd_dc_top_predictor_4x16 neon/; specialize qw/aom_highbd_dc_top_predictor_8x32 neon/; specialize qw/aom_highbd_dc_top_predictor_16x4 neon/; specialize qw/aom_highbd_dc_top_predictor_16x64 neon/; specialize qw/aom_highbd_dc_top_predictor_32x8 neon/; specialize qw/aom_highbd_dc_top_predictor_64x16 neon/; specialize qw/aom_highbd_paeth_predictor_4x16 neon/; specialize qw/aom_highbd_paeth_predictor_8x32 neon/; specialize qw/aom_highbd_paeth_predictor_16x4 neon/; specialize qw/aom_highbd_paeth_predictor_16x64 neon/; specialize qw/aom_highbd_paeth_predictor_32x8 neon/; specialize qw/aom_highbd_paeth_predictor_64x16 neon/; specialize qw/aom_highbd_smooth_predictor_4x16 neon/; specialize qw/aom_highbd_smooth_predictor_8x32 neon/; specialize qw/aom_highbd_smooth_predictor_16x4 neon/; specialize qw/aom_highbd_smooth_predictor_16x64 neon/; specialize qw/aom_highbd_smooth_predictor_32x8 neon/; specialize qw/aom_highbd_smooth_predictor_64x16 neon/; specialize qw/aom_highbd_smooth_v_predictor_4x16 neon/; specialize qw/aom_highbd_smooth_v_predictor_8x32 neon/; specialize qw/aom_highbd_smooth_v_predictor_16x4 neon/; specialize qw/aom_highbd_smooth_v_predictor_16x64 neon/; specialize qw/aom_highbd_smooth_v_predictor_32x8 neon/; specialize qw/aom_highbd_smooth_v_predictor_64x16 neon/; specialize qw/aom_highbd_smooth_h_predictor_4x16 neon/; specialize qw/aom_highbd_smooth_h_predictor_8x32 neon/; specialize qw/aom_highbd_smooth_h_predictor_16x4 neon/; specialize qw/aom_highbd_smooth_h_predictor_16x64 neon/; specialize qw/aom_highbd_smooth_h_predictor_32x8 neon/; specialize qw/aom_highbd_smooth_h_predictor_64x16 neon/; } # !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } # # Sub Pixel Filters # add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h"; add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy neon sse2 avx2/; specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; specialize qw/aom_convolve8_vert neon 
neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/aom_scaled_2d ssse3 neon neon_dotprod neon_i8mm/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h"; specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/; add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon sve/; add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon sve/; } # # Loopfilter # add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_14 sse2 neon/; add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_14_dual sse2 neon/; add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_vertical_14_quad avx2 sse2 neon/; add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_6 sse2 neon/; add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_8 sse2 neon/; add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_8_dual sse2 neon/; add_proto qw/void aom_lpf_vertical_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_vertical_8_quad sse2 neon/; add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_4 sse2 neon/; add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_4_dual sse2 neon/; add_proto qw/void aom_lpf_vertical_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_vertical_4_quad sse2 neon/; add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_horizontal_14 sse2 neon/; add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t 
*thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_14_dual sse2 neon/; add_proto qw/void aom_lpf_horizontal_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_horizontal_14_quad sse2 avx2 neon/; add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_horizontal_6 sse2 neon/; add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_6_dual sse2 neon/; add_proto qw/void aom_lpf_horizontal_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_horizontal_6_quad sse2 avx2 neon/; add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_horizontal_8 sse2 neon/; add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_8_dual sse2 neon/; add_proto qw/void aom_lpf_horizontal_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_horizontal_8_quad sse2 avx2 neon/; add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_horizontal_4 sse2 neon/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_4_dual sse2 neon/; add_proto qw/void aom_lpf_horizontal_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_horizontal_4_quad sse2 neon/; add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_6_dual sse2 neon/; add_proto qw/void aom_lpf_vertical_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; specialize qw/aom_lpf_vertical_6_quad sse2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_14 neon sse2/; add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_vertical_14_dual neon sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_8 neon sse2/; add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t 
*blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_vertical_8_dual neon sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_6 neon sse2/; add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_vertical_6_dual neon sse2/; add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_4 neon sse2/; add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_vertical_4_dual neon sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_14 neon sse2/; add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd"; specialize qw/aom_highbd_lpf_horizontal_14_dual neon sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_6 neon sse2/; add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_horizontal_6_dual neon sse2/; add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_8 neon sse2/; add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_horizontal_8_dual neon sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_4 neon sse2/; add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_horizontal_4_dual neon sse2 avx2/; } # # Encoder functions. 
# # # Forward transform # if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct4x4 neon sse2/; add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride"; specialize qw/aom_fdct4x4_lp neon sse2/; if (aom_config("CONFIG_INTERNAL_STATS") eq "yes"){ # 8x8 DCT transform for psnr-hvs. Unlike other transforms isn't compatible # with av1 scan orders, because it does two transposes. add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64"; # High bit depth if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct8x8 sse2/; } } # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; specialize qw/aom_fft4x4_float sse2/; add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; specialize qw/aom_fft8x8_float avx2 sse2/; add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; specialize qw/aom_fft16x16_float avx2 sse2/; add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; specialize qw/aom_fft32x32_float avx2 sse2/; add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; specialize qw/aom_ifft4x4_float sse2/; add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; specialize qw/aom_ifft8x8_float avx2 sse2/; add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; specialize qw/aom_ifft16x16_float avx2 sse2/; add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; specialize qw/aom_ifft32x32_float avx2 sse2/; } # CONFIG_AV1_ENCODER # # Quantization # if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b sse2 neon avx avx2/, "$ssse3_x86_64"; add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_32x32 neon avx avx2/, "$ssse3_x86_64"; add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_64x64 neon ssse3 avx2/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void 
aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_adaptive sse2 avx2/; add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_32x32_adaptive sse2/; add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_64x64_adaptive sse2/; } } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b sse2 avx2 neon/; add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_32x32 sse2 avx2 neon/; add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_64x64 sse2 avx2 neon/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2 neon/; add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2 neon/; add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t 
*quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2 neon/; } } # CONFIG_AV1_ENCODER # # Alpha blending with mask # add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params"; specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/; specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd"; add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd"; specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/; specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/; specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/; } if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Block subtraction # add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/aom_subtract_block neon sse2 avx2/; add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/; add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum"; specialize qw/aom_get_blk_sse_sum sse2 avx2 neon sve/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t 
diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/aom_highbd_subtract_block sse2 neon/; add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; specialize qw/aom_highbd_sse sse4_1 avx2 neon sve/; } # # Sum of Squares # add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon sve/; add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; specialize qw/aom_sum_squares_i16 sse2 neon sve/; add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; specialize qw/aom_var_2d_u16 sse2 avx2 neon sve/; } # # Single block SAD / Single block Avg SAD # foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; if ($h >= 16) { add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; } if ($w != 4 && $h != 4) { add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; } } add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum"; specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2 sve/; specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x32 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x32 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x16 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x32 sse2 neon neon_dotprod/; specialize qw/aom_sad16x16 sse2 neon neon_dotprod/; specialize qw/aom_sad16x8 sse2 neon neon_dotprod/; specialize qw/aom_sad8x16 sse2 neon/; specialize qw/aom_sad8x8 sse2 neon/; specialize qw/aom_sad8x4 sse2 neon/; specialize qw/aom_sad4x8 sse2 neon/; specialize qw/aom_sad4x4 sse2 neon/; specialize qw/aom_sad4x16 sse2 neon/; specialize qw/aom_sad16x4 sse2 neon neon_dotprod/; specialize qw/aom_sad8x32 sse2 neon/; specialize qw/aom_sad32x8 sse2 neon neon_dotprod/; specialize qw/aom_sad16x64 sse2 neon neon_dotprod/; specialize qw/aom_sad64x16 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_128x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_128x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x32 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x32 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x16 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x32 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x16 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x8 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_8x16 sse2 neon/; 
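#
# Editor's note (illustrative sketch, not part of the upstream definitions):
# the foreach loop above declares three single-block SAD flavours for each
# encoder block size: aom_sadWxH, aom_sad_skip_WxH (only when H >= 16) and
# aom_sadWxH_avg. A rough C sketch of the behaviour those prototypes suggest
# is given below; the name example_sad and all details are assumptions for
# illustration only, not the library's actual implementation.
#
#   #include <stdint.h>
#
#   /* Plain sum of absolute differences over a w x h block. */
#   static unsigned int example_sad(const uint8_t *src, int src_stride,
#                                   const uint8_t *ref, int ref_stride,
#                                   int w, int h) {
#     unsigned int sad = 0;
#     for (int y = 0; y < h; ++y) {
#       for (int x = 0; x < w; ++x) {
#         const int d = src[x] - ref[x];
#         sad += (d < 0) ? -d : d;
#       }
#       src += src_stride;
#       ref += ref_stride;
#     }
#     return sad;
#   }
#
# The "_skip" variants are understood to subsample rows and scale the result
# back up as a cheaper estimate for motion search, and the "_avg" variants
# compare the source block against the average of the reference block and a
# second predictor, as used by compound prediction.
#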
specialize qw/aom_sad_skip_4x16 sse2 neon/; specialize qw/aom_sad_skip_8x32 sse2 neon/; specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/; specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/; specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/; specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/; specialize qw/aom_sad8x16_avg sse2 neon/; specialize qw/aom_sad8x8_avg sse2 neon/; specialize qw/aom_sad8x32_avg sse2 neon/; specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/; specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/; specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; if ($h >= 16) { add_proto qw/unsigned int/, "aom_highbd_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; } if ($w != 4 && $h != 4) { add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; } if ($w != 128 && $h != 128 && $w != 4) { specialize "aom_highbd_sad${w}x${h}", qw/sse2/; specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; } } specialize qw/aom_highbd_sad128x128 avx2 neon/; specialize qw/aom_highbd_sad128x64 avx2 neon/; specialize qw/aom_highbd_sad64x128 avx2 neon/; specialize qw/aom_highbd_sad64x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad64x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad32x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad32x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad32x16 avx2 sse2 neon/; specialize qw/aom_highbd_sad16x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad16x16 avx2 sse2 neon/; specialize qw/aom_highbd_sad16x8 avx2 sse2 neon/; specialize qw/aom_highbd_sad8x16 sse2 neon/; specialize qw/aom_highbd_sad8x8 sse2 neon/; specialize qw/aom_highbd_sad8x4 sse2 neon/; specialize qw/aom_highbd_sad4x8 sse2 neon/; specialize qw/aom_highbd_sad4x4 sse2 neon/; specialize qw/aom_highbd_sad4x16 sse2 neon/; specialize qw/aom_highbd_sad16x4 avx2 sse2 neon/; specialize qw/aom_highbd_sad8x32 sse2 neon/; specialize qw/aom_highbd_sad32x8 avx2 sse2 neon/; specialize qw/aom_highbd_sad16x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad64x16 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_128x128 avx2 neon/; specialize qw/aom_highbd_sad_skip_128x64 avx2 neon/; specialize qw/aom_highbd_sad_skip_64x128 avx2 neon/; specialize qw/aom_highbd_sad_skip_64x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_64x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x16 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_16x32 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_16x16 avx2 sse2 neon/; specialize 
qw/aom_highbd_sad_skip_8x16 sse2 neon/; specialize qw/aom_highbd_sad_skip_4x16 sse2 neon/; specialize qw/aom_highbd_sad_skip_8x32 sse2 neon/; specialize qw/aom_highbd_sad_skip_16x64 avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_64x16 avx2 sse2 neon/; specialize qw/aom_highbd_sad128x128_avg avx2 neon/; specialize qw/aom_highbd_sad128x64_avg avx2 neon/; specialize qw/aom_highbd_sad64x128_avg avx2 neon/; specialize qw/aom_highbd_sad64x64_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad64x32_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad32x64_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad32x32_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad32x16_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad16x32_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad16x16_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad16x8_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad8x16_avg neon/; specialize qw/aom_highbd_sad8x8_avg neon/; specialize qw/aom_highbd_sad8x32_avg sse2 neon/; specialize qw/aom_highbd_sad16x64_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad32x8_avg avx2 sse2 neon/; specialize qw/aom_highbd_sad64x16_avg avx2 sse2 neon/; } # # Masked SAD # foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2 neon/; } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/; } } # # OBMC SAD # if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/; } } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! 
(($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/; } } } } # # Multi-block SAD, comparing a reference to N independent blocks # foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; if ($h >= 16) { add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; } } specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad128x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x128x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x8x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad8x16x4d sse2 neon/; specialize qw/aom_sad8x8x4d sse2 neon/; specialize qw/aom_sad8x4x4d sse2 neon/; specialize qw/aom_sad4x8x4d sse2 neon/; specialize qw/aom_sad4x4x4d sse2 neon/; specialize qw/aom_sad64x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad32x8x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad16x4x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad8x32x4d sse2 neon/; specialize qw/aom_sad4x16x4d sse2 neon/; specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_sad_skip_4x16x4d sse2 neon/; specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/; specialize qw/aom_sad128x64x3d avx2 neon neon_dotprod/; specialize qw/aom_sad64x128x3d avx2 neon neon_dotprod/; specialize qw/aom_sad64x64x3d avx2 neon neon_dotprod/; specialize qw/aom_sad64x32x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x64x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x32x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x32x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x8x3d avx2 neon neon_dotprod/; specialize 
qw/aom_sad8x16x3d neon/; specialize qw/aom_sad8x8x3d neon/; specialize qw/aom_sad8x4x3d neon/; specialize qw/aom_sad4x8x3d neon/; specialize qw/aom_sad4x4x3d neon/; specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x4x3d avx2 neon neon_dotprod/; specialize qw/aom_sad8x32x3d neon/; specialize qw/aom_sad4x16x3d neon/; # # Multi-block SAD, comparing a reference to N independent blocks # if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; if ($h >= 16) { add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; } if ($w != 128 && $h != 128) { specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; } } specialize qw/aom_highbd_sad128x128x4d avx2 neon/; specialize qw/aom_highbd_sad128x64x4d avx2 neon/; specialize qw/aom_highbd_sad64x128x4d avx2 neon/; specialize qw/aom_highbd_sad64x64x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad64x32x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad32x64x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad32x32x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad32x16x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad16x32x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad16x16x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad16x8x4d sse2 avx2 neon/; specialize qw/aom_highbd_sad8x16x4d sse2 neon/; specialize qw/aom_highbd_sad8x8x4d sse2 neon/; specialize qw/aom_highbd_sad8x4x4d sse2 neon/; specialize qw/aom_highbd_sad4x8x4d sse2 neon/; specialize qw/aom_highbd_sad4x4x4d sse2 neon/; specialize qw/aom_highbd_sad4x16x4d sse2 neon/; specialize qw/aom_highbd_sad16x4x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad8x32x4d sse2 neon/; specialize qw/aom_highbd_sad32x8x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad16x64x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad64x16x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_128x128x4d avx2 neon/; specialize qw/aom_highbd_sad_skip_128x64x4d avx2 neon/; specialize qw/aom_highbd_sad_skip_64x128x4d avx2 neon/; specialize qw/aom_highbd_sad_skip_64x64x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_64x32x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x64x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x32x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_32x16x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_16x32x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_16x16x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_highbd_sad_skip_4x16x4d sse2 neon/; specialize qw/aom_highbd_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2 neon/; specialize qw/aom_highbd_sad128x128x3d avx2 neon/; specialize qw/aom_highbd_sad128x64x3d avx2 neon/; specialize qw/aom_highbd_sad64x128x3d avx2 neon/; specialize qw/aom_highbd_sad64x64x3d avx2 neon/; specialize qw/aom_highbd_sad64x32x3d avx2 neon/; specialize qw/aom_highbd_sad32x64x3d avx2 neon/; specialize 
qw/aom_highbd_sad32x32x3d avx2 neon/; specialize qw/aom_highbd_sad32x16x3d avx2 neon/; specialize qw/aom_highbd_sad16x32x3d avx2 neon/; specialize qw/aom_highbd_sad16x16x3d avx2 neon/; specialize qw/aom_highbd_sad16x8x3d avx2 neon/; specialize qw/aom_highbd_sad8x16x3d neon/; specialize qw/aom_highbd_sad8x8x3d neon/; specialize qw/aom_highbd_sad8x4x3d neon/; specialize qw/aom_highbd_sad4x8x3d neon/; specialize qw/aom_highbd_sad4x4x3d neon/; specialize qw/aom_highbd_sad64x16x3d avx2 neon/; specialize qw/aom_highbd_sad32x8x3d avx2 neon/; specialize qw/aom_highbd_sad16x64x3d avx2 neon/; specialize qw/aom_highbd_sad16x4x3d avx2 neon/; specialize qw/aom_highbd_sad8x32x3d neon/; specialize qw/aom_highbd_sad4x16x3d neon/; } # # Avg # add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; specialize qw/aom_avg_8x8 sse2 neon/; add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; specialize qw/aom_avg_4x4 sse2 neon/; add_proto qw/void aom_avg_8x8_quad/, "const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg"; specialize qw/aom_avg_8x8_quad avx2 sse2 neon/; add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; specialize qw/aom_minmax_8x8 sse2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p"; specialize qw/aom_highbd_avg_8x8 neon/; add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p"; specialize qw/aom_highbd_avg_4x4 neon/; add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; specialize qw/aom_highbd_minmax_8x8 neon/; } add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor"; specialize qw/aom_int_pro_row avx2 sse2 neon/; add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor"; specialize qw/aom_int_pro_col avx2 sse2 neon/; add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; specialize qw/aom_vector_var avx2 sse4_1 neon sve/; # # Hadamard transform and SATD for implementing temporal dependency model # add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_hadamard_4x4 sse2 neon/; add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_hadamard_8x8 sse2 neon/; add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_hadamard_16x16 avx2 sse2 neon/; add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_hadamard_32x32 avx2 sse2 neon/; add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/aom_hadamard_lp_8x8 sse2 neon/; add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/; add_proto qw/void aom_hadamard_lp_8x8_dual/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/aom_hadamard_lp_8x8_dual sse2 avx2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize
qw/aom_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/aom_highbd_hadamard_32x32 avx2 neon/; } add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length"; specialize qw/aom_satd neon sse2 avx2/; add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length"; specialize qw/aom_satd_lp sse2 avx2 neon/; # # Structured Similarity (SSIM) # add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; } } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Specialty Variance # add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8"; specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/; add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16"; specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/; add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/; specialize qw/aom_mse16x8 sse2 neon neon_dotprod/; specialize qw/aom_mse8x16 sse2 neon neon_dotprod/; specialize qw/aom_mse8x8 sse2 neon neon_dotprod/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach $bd (8, 10, 12) { add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; if ($bd eq 8) { specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon neon_dotprod/; specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; specialize 
"aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; } elsif ($bd eq 10) { specialize "aom_highbd_${bd}_mse16x16", qw/avx2 sse2 neon sve/; specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; } else { specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; } } } if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; specialize qw/aom_get_mb_ss sse2 neon/; } # # Variance / Subpixel Variance / Subpixel Avg Variance # add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/; add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h"; specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/; foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/; specialize qw/aom_variance8x16 sse2 neon neon_dotprod/; specialize qw/aom_variance8x8 sse2 neon neon_dotprod/; specialize qw/aom_variance8x4 sse2 neon neon_dotprod/; specialize qw/aom_variance4x8 sse2 neon neon_dotprod/; specialize qw/aom_variance4x4 sse2 neon neon_dotprod/; specialize qw/aom_sub_pixel_variance128x128 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance128x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance64x128 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance64x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance64x32 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance32x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance32x32 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance32x16 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance16x32 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance16x16 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance16x8 avx2 neon ssse3/; specialize qw/aom_sub_pixel_variance8x16 neon ssse3/; specialize qw/aom_sub_pixel_variance8x8 neon ssse3/; specialize qw/aom_sub_pixel_variance8x4 neon ssse3/; specialize qw/aom_sub_pixel_variance4x8 neon ssse3/; specialize 
qw/aom_sub_pixel_variance4x4 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance16x32 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance16x16 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance16x8 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance8x16 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance8x8 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance8x4 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance4x8 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance4x4 neon ssse3/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { specialize qw/aom_variance4x16 neon neon_dotprod sse2/; specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance8x32 neon neon_dotprod sse2/; specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/; specialize qw/aom_sub_pixel_variance4x16 neon ssse3/; specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/; specialize qw/aom_sub_pixel_variance8x32 neon ssse3/; specialize qw/aom_sub_pixel_variance32x8 neon ssse3/; specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/; specialize qw/aom_sub_pixel_variance64x16 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance32x8 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance16x64 neon ssse3/; specialize qw/aom_sub_pixel_avg_variance64x16 neon ssse3/; } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach $bd (8, 10, 12) { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } } specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/; specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/; specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/; specialize qw/aom_highbd_12_variance64x64 sse2 neon sve/; specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/; specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/; specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/; specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/; specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/; specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/; specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/; specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/; specialize 
qw/aom_highbd_12_variance8x8 sse2 neon sve/; specialize qw/aom_highbd_12_variance8x4 neon sve/; specialize qw/aom_highbd_12_variance4x8 neon sve/; specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/; specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/; specialize qw/aom_highbd_10_variance8x4 neon sve/; specialize qw/aom_highbd_10_variance4x8 neon sve/; specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/; specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/; specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/; specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/; specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/; specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/; specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/; specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/; specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/; specialize qw/aom_highbd_8_variance16x32 sse2 neon sve/; specialize qw/aom_highbd_8_variance16x16 sse2 neon sve/; specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/; specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/; specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/; specialize qw/aom_highbd_8_variance8x4 neon sve/; specialize qw/aom_highbd_8_variance4x8 neon sve/; specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach $bd (8, 10, 12) { my $avx2 = ($bd == 10) ? 
"avx2" : ""; specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/; specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/; specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/; specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/; specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/; specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/; } } specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/; specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/; specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/; specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/; specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/; specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/; if (aom_config("CONFIG_REALTIME_ONLY") 
ne "yes") { foreach $bd (8, 10, 12) { specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/; } } specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/; specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/; specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; specialize 
qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/; specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach $bd (8, 10, 12) { specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/; specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/; } } } # # Masked Variance / Masked Subpixel Variance # foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/; } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach $bd ("_8_", "_10_", "_12_") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/; } } } # # OBMC Variance / OBMC Subpixel Variance # if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2 neon/; specialize "aom_obmc_sub_pixel_variance${w}x${h}", qw/sse4_1 neon/; } if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { foreach $bd ("_8_", "_10_", "_12_") { foreach (@encoder_block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/; specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/; } } } } # # Comp Avg # add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; specialize qw/aom_comp_avg_pred avx2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; specialize qw/aom_highbd_comp_avg_pred neon/; add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/; } add_proto qw/void aom_comp_mask_pred/, 
"uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; specialize qw/aom_comp_mask_pred ssse3 avx2 neon/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/; } # Flow estimation library if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/bool aom_compute_mean_stddev/, "const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev"; specialize qw/aom_compute_mean_stddev sse4_1 avx2/; add_proto qw/double aom_compute_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2"; specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon sve/; } } # CONFIG_AV1_ENCODER 1; aom-3.12.1/aom_dsp/aom_filter.h000066400000000000000000000033551477627663500163230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_AOM_FILTER_H_ #define AOM_AOM_DSP_AOM_FILTER_H_ #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif #define FILTER_BITS 7 #define SUBPEL_BITS 4 #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) #define SUBPEL_TAPS 8 #define SCALE_SUBPEL_BITS 10 #define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS) #define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1) #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS) #define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2) #define RS_SUBPEL_BITS 6 #define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1) #define RS_SCALE_SUBPEL_BITS 14 #define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1) #define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS) #define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1)) typedef int16_t InterpKernel[SUBPEL_TAPS]; #define BIL_SUBPEL_BITS 3 #define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) // 2 tap bilinear filters static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_AOM_FILTER_H_ aom-3.12.1/aom_dsp/aom_simd.h000066400000000000000000000022541477627663500157670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_AOM_SIMD_H_ #define AOM_AOM_DSP_AOM_SIMD_H_ #include #if defined(_WIN32) #include #endif #include "config/aom_config.h" #include "aom_dsp/aom_simd_inline.h" #define SIMD_CHECK 1 // Sanity checks in C equivalents // VS compiling for 32 bit targets does not support vector types in // structs as arguments, which makes the v256 type of the intrinsics // hard to support, so optimizations for this target are disabled. #if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) #include "simd/v256_intrinsics_x86.h" #else #include "simd/v256_intrinsics.h" #endif #endif // AOM_AOM_DSP_AOM_SIMD_H_ aom-3.12.1/aom_dsp/aom_simd_inline.h000066400000000000000000000015401477627663500173220ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ #define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ #include "aom_dsp/aom_dsp_common.h" #ifndef SIMD_INLINE #define SIMD_INLINE static AOM_FORCE_INLINE #endif #define SIMD_CLAMP(value, min, max) \ ((value) > (max) ? (max) : (value) < (min) ? (min) : (value)) #endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_ aom-3.12.1/aom_dsp/arm/000077500000000000000000000000001477627663500146025ustar00rootroot00000000000000aom-3.12.1/aom_dsp/arm/aom_convolve8_neon.c000066400000000000000000000376761477627663500205670ustar00rootroot00000000000000/* * Copyright (c) 2014 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" static inline void convolve8_horiz_8tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { // All filter values are even so halve them to reduce intermediate precision // requirements. 
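// Halving the taps also halves the accumulated sum, so the narrowing shifts
// below use FILTER_BITS - 1 instead of FILTER_BITS to compensate.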
const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_x), 1); if (h == 4) { uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); src += 7; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); transpose_elems_inplace_u8_4x4(&d01, &d23); store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src += 4; dst += 4; w -= 4; } while (w != 0); } else { if (w == 4) { do { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0); store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1); store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2); store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { do { int width = w; const uint8_t *s = src; uint8_t *d = dst; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, 
&t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); uint8x8_t d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); uint8x8_t d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; width -= 8; } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } } } static inline void convolve8_horiz_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { // All filter values are even, halve to reduce intermediate precision // requirements. 
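// The 4-tap path only uses the middle taps of the 8-tap array (the outer taps
// are zero in this case), so the load below starts at filter_x + 2.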
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); if (w == 4) { do { uint8x8_t t01[4]; t01[0] = load_unaligned_u8(src + 0, (int)src_stride); t01[1] = load_unaligned_u8(src + 1, (int)src_stride); t01[2] = load_unaligned_u8(src + 2, (int)src_stride); t01[3] = load_unaligned_u8(src + 3, (int)src_stride); int16x8_t s01[4]; s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 0); } else { do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x8_t t0[4], t1[4]; load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); int16x8_t s0[4], s1[4]; s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter); uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter); store_u8_8x2(d, dst_stride, d0, d1); s += 8; d += 8; width -= 8; } while (width != 0); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 0); } } void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)x_step_q4; (void)filter_y; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1); int filter_taps = get_filter_taps_convolve8(filter_x); if (filter_taps == 2) { convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, h); } else if (filter_taps == 4) { convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, h); } else { convolve8_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h); } } static inline void convolve8_vert_8tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { // All filter values are even so halve them to reduce intermediate precision // requirements. 
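// The vertical path keeps a sliding window of seven input rows (s0..s6) and
// loads four new rows per iteration, producing four output rows per pass.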
const int16x8_t filter = vshrq_n_s16(vld1q_s16(filter_y), 1); if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); src += 7 * src_stride; do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int height = h; const uint8_t *s = src + 7 * src_stride; uint8_t *d = dst; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)filter_x; (void)x_step_q4; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; int filter_taps = get_filter_taps_convolve8(filter_y); if (filter_taps == 2) { convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, 
dst_stride, filter_y, w, h); } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else { convolve8_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h); } } aom-3.12.1/aom_dsp/arm/aom_convolve8_neon.h000066400000000000000000000265261477627663500205640ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ #define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ #include #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "config/aom_config.h" static inline int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s3, filter_lo, 3); sum = vmla_lane_s16(sum, s4, filter_hi, 0); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); return sum; } static inline uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve8_horiz_2tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { // Bilinear filter values are all positive. 
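// Only filter_x[3] and filter_x[4] are non-zero in the 2-tap (bilinear) case,
// and both fit in a uint8_t, so plain unsigned widening multiply-accumulates
// (vmull_u8 / vmlal_u8) are sufficient.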
const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]); const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]); if (w == 4) { do { uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride); uint8x8_t s1 = load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride); uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride); uint8x8_t s3 = load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride); uint16x8_t sum0 = vmull_u8(s0, f0); sum0 = vmlal_u8(sum0, s1, f1); uint16x8_t sum1 = vmull_u8(s2, f0); sum1 = vmlal_u8(sum1, s3, f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else if (w == 8) { do { uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0); uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1); uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0); uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1); uint16x8_t sum0 = vmull_u8(s0, f0); sum0 = vmlal_u8(sum0, s1, f1); uint16x8_t sum1 = vmull_u8(s2, f0); sum1 = vmlal_u8(sum1, s3, f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); vst1_u8(dst + 0 * dst_stride, d0); vst1_u8(dst + 1 * dst_stride, d1); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 0); } else { do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0 = vld1q_u8(s + 0); uint8x16_t s1 = vld1q_u8(s + 1); uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); vst1q_u8(d, vcombine_u8(d0, d1)); s += 16; d += 16; width -= 16; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h > 0); } } static inline uint8x8_t convolve4_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter) { int16x8_t sum = vmulq_lane_s16(s0, filter, 0); sum = vmlaq_lane_s16(sum, s1, filter, 1); sum = vmlaq_lane_s16(sum, s2, filter, 2); sum = vmlaq_lane_s16(sum, s3, filter, 3); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve8_vert_4tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { // All filter values are even, halve to reduce intermediate precision // requirements. 
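// As in the horizontal 4-tap path, only the middle taps filter_y[2..5] are
// used; convolve4_8 compensates for the halving with its FILTER_BITS - 1
// shift.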
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); if (w == 4) { uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); src += 2 * src_stride; do { uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter); uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); s01 = s45; s12 = s56; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { uint8x8_t t0, t1, t2; load_u8_8x3(src, src_stride, &t0, &t1, &t2); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int height = h; const uint8_t *s = src + 3 * src_stride; uint8_t *d = dst; do { uint8x8_t t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter); uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter); uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter); uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } static inline void convolve8_vert_2tap_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { // Bilinear filter values are all positive. 
uint8x8_t f0 = vdup_n_u8((uint8_t)filter_y[3]); uint8x8_t f1 = vdup_n_u8((uint8_t)filter_y[4]); if (w == 4) { do { uint8x8_t s0 = load_unaligned_u8(src + 0 * src_stride, (int)src_stride); uint8x8_t s1 = load_unaligned_u8(src + 1 * src_stride, (int)src_stride); uint8x8_t s2 = load_unaligned_u8(src + 2 * src_stride, (int)src_stride); uint8x8_t s3 = load_unaligned_u8(src + 3 * src_stride, (int)src_stride); uint16x8_t sum0 = vmull_u8(s0, f0); sum0 = vmlal_u8(sum0, s1, f1); uint16x8_t sum1 = vmull_u8(s2, f0); sum1 = vmlal_u8(sum1, s3, f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else if (w == 8) { do { uint8x8_t s0, s1, s2; load_u8_8x3(src, src_stride, &s0, &s1, &s2); uint16x8_t sum0 = vmull_u8(s0, f0); sum0 = vmlal_u8(sum0, s1, f1); uint16x8_t sum1 = vmull_u8(s1, f0); sum1 = vmlal_u8(sum1, s2, f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); vst1_u8(dst + 0 * dst_stride, d0); vst1_u8(dst + 1 * dst_stride, d1); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 0); } else { do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0 = vld1q_u8(s + 0 * src_stride); uint8x16_t s1 = vld1q_u8(s + 1 * src_stride); uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0); sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1); uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0); sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1); uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS); uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS); vst1q_u8(d, vcombine_u8(d0, d1)); s += 16; d += 16; width -= 16; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h > 0); } } #endif // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_ aom-3.12.1/aom_dsp/arm/aom_convolve8_neon_dotprod.c000066400000000000000000000536601477627663500223110ustar00rootroot00000000000000/* * Copyright (c) 2014 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" // Filter values always sum to 128. #define FILTER_WEIGHT 128 DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 
1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; static inline int16x4_t convolve8_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; // Accumulate into 128 * FILTER_WEIGHT to account for range transform. int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]), vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; // Accumulate into 128 * FILTER_WEIGHT to account for range transform. int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); // First 4 output values. int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); // Second 4 output values. int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); // Narrow and re-pack. 
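// vqmovn_s32 narrows each 32-bit sum to 16 bits with saturation; the final
// vqrshrun_n_s16 then applies the rounded FILTER_BITS shift and clamps the
// result to 8 bits.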
int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve8_horiz_8tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } } static inline int16x4_t convolve4_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); // Accumulate into 128 * FILTER_WEIGHT to account for range transform. // (Divide by 2 since we halved the filter values.) int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } static inline uint8x8_t convolve4_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; // Accumulate into 128 * FILTER_WEIGHT to account for range transform. // (Divide by 2 since we halved the filter values.) int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT / 2); // First 4 output values. int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); // Second 4 output values. int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); // We halved the filter values so -1 from right shift. 
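// The accumulator seed above (128 * FILTER_WEIGHT / 2) was likewise halved so
// the range-transform compensation stays consistent with the halved taps.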
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve8_horiz_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); if (width == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int w = width; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)x_step_q4; (void)filter_y; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1); int filter_taps = get_filter_taps_convolve8(filter_x); if (filter_taps == 2) { convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, h); } else if (filter_taps == 4) { convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, filter_x, w, h); } else { convolve8_horiz_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_x, w, h); } } static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, XX, XX, XX, XX // a1: 10, 11, 12, 13, XX, XX, XX, XX // a2: 20, 21, 22, 23, XX, XX, XX, XX // a3: 30, 31, 32, 33, XX, XX, XX, XX // // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; int16x8_t a0123 = vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; *b = vreinterpretq_s8_s16(a0123); } static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, int8x16_t *b1) { // Transpose 8-bit elements 
and concatenate result rows as follows: // a0: 00, 01, 02, 03, 04, 05, 06, 07 // a1: 10, 11, 12, 13, 14, 15, 16, 17 // a2: 20, 21, 22, 23, 24, 25, 26, 27 // a3: 30, 31, 32, 33, 34, 35, 36, 37 // // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; int16x8x2_t a0123 = vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); *b0 = vreinterpretq_s8_s16(a0123.val[0]); *b1 = vreinterpretq_s8_s16(a0123.val[1]); } static inline int16x4_t convolve8_4_v(const int8x16_t samples_lo, const int8x16_t samples_hi, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 * FILTER_WEIGHT to account for range transform. int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); sum = vdotq_lane_s32(sum, samples_hi, filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, const int8x16_t samples0_hi, const int8x16_t samples1_lo, const int8x16_t samples1_hi, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 * FILTER_WEIGHT to account for range transform. int32x4_t acc = vdupq_n_s32(128 * FILTER_WEIGHT); // First 4 output values. int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); // Second 4 output values. int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve8_vert_8tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); int8x16x2_t samples_LUT; if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; // Clamp sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. 
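// Each of s0123..s3456 holds four consecutive input rows for four output
// columns, interleaved so that one vdotq_lane_s32 applies four vertical taps
// to four output pixels at a time.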
int8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s78910; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src; uint8_t *d = dst; uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; // Clamp sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. 
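// The vqtbl2q_s8 lookups rebuild s4567..s6789 from the previous s3456 block
// and the freshly transposed s78910 block, so only the four new rows need a
// full transpose each iteration.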
samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); uint8x8_t d0 = convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); uint8x8_t d1 = convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); uint8x8_t d2 = convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); uint8x8_t d3 = convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s78910_lo; s3456_hi = s78910_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)filter_x; (void)x_step_q4; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; int filter_taps = get_filter_taps_convolve8(filter_y); if (filter_taps == 2) { convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else { convolve8_vert_8tap_neon_dotprod(src, src_stride, dst, dst_stride, filter_y, w, h); } } aom-3.12.1/aom_dsp/arm/aom_convolve8_neon_i8mm.c000066400000000000000000000466341477627663500215130ustar00rootroot00000000000000/* * Copyright (c) 2014 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { // clang-format off 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 // clang-format on }; DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, // Shift left and insert three new columns in transposed 4x4 block. 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; static inline int16x4_t convolve8_4_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]), vqtbl1q_u8(samples, permute_tbl.val[2]) }; // First 4 output values. int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); // Second 4 output values. int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); // Narrow and re-pack. 
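// vqmovn_s32 saturating-narrows each 32-bit accumulator to 16 bits;
// vqrshrun_n_s16(sum, FILTER_BITS) then adds the rounding constant
// (1 << (FILTER_BITS - 1)), shifts right by FILTER_BITS (the filter taps sum
// to 1 << FILTER_BITS = 128) and saturates to the unsigned 8-bit pixel range.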
int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve8_horiz_8tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); if (w == 4) { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve8_4_h(s0, filter, perm_tbl); int16x4_t d1 = convolve8_4_h(s1, filter, perm_tbl); int16x4_t d2 = convolve8_4_h(s2, filter, perm_tbl); int16x4_t d3 = convolve8_4_h(s3, filter, perm_tbl); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint8x16x3_t perm_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_8_h(s0, filter, perm_tbl); uint8x8_t d1 = convolve8_8_h(s1, filter, perm_tbl); uint8x8_t d2 = convolve8_8_h(s2, filter, perm_tbl); uint8x8_t d3 = convolve8_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); } } static inline int16x4_t convolve6_4_h(const uint8x16_t samples, const int8x16_t filter, const uint8x16_t permute_tbl) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(vdupq_n_s32(0), perm_samples, filter); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } static inline uint8x8_t convolve6_8_h(const uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t permute_tbl) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter); int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve8_horiz_6tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height) { // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1); // Stagger the filter for use with the matrix multiply instructions. 
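// USMMLA multiplies the 2x8 matrix of permuted samples (two rows, offset by
// two pixels) by the 8x2 staggered filter (two columns, offset by one pixel),
// so a single instruction yields four adjacent 6-tap outputs: each int32
// result is the dot product of one sample row with one filter column.
// The staggered filter layout is: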
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t filter = vcombine_s8(vext_s8(x_filter, x_filter, 1), x_filter); if (width == 4) { const uint8x16_t perm_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t t0 = convolve6_4_h(s0, filter, perm_tbl); int16x4_t t1 = convolve6_4_h(s1, filter, perm_tbl); int16x4_t t2 = convolve6_4_h(s2, filter, perm_tbl); int16x4_t t3 = convolve6_4_h(s3, filter, perm_tbl); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } else { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { int w = width; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve6_8_h(s0, filter, perm_tbl); uint8x8_t d1 = convolve6_8_h(s1, filter, perm_tbl); uint8x8_t d2 = convolve6_8_h(s2, filter, perm_tbl); uint8x8_t d3 = convolve6_8_h(s3, filter, perm_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)x_step_q4; (void)filter_y; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1); int filter_taps = get_filter_taps_convolve8(filter_x); if (filter_taps == 2) { convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, h); } else if (filter_taps <= 6) { convolve8_horiz_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, filter_x, w, h); } else { convolve8_horiz_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_x, w, h); } } static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, XX, XX, XX, XX // a1: 10, 11, 12, 13, XX, XX, XX, XX // a2: 20, 21, 22, 23, XX, XX, XX, XX // a3: 30, 31, 32, 33, XX, XX, XX, XX // // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; uint16x8_t a0123 = vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; *b = vreinterpretq_u8_u16(a0123); } static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b0, uint8x16_t *b1) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, 04, 05, 06, 07 // a1: 10, 11, 12, 13, 14, 15, 16, 17 // a2: 20, 21, 22, 23, 24, 25, 26, 27 // a3: 30, 31, 32, 33, 34, 35, 36, 37 // // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 uint8x16_t a0q = 
vcombine_u8(a0, vdup_n_u8(0)); uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; uint16x8x2_t a0123 = vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); *b0 = vreinterpretq_u8_u16(a0123.val[0]); *b1 = vreinterpretq_u8_u16(a0123.val[1]); } static inline int16x4_t convolve8_4_v(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { // Sample permutation is performed by the caller. int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, const uint8x16_t samples0_hi, const uint8x16_t samples1_lo, const uint8x16_t samples1_hi, const int8x8_t filters) { // Sample permutation is performed by the caller. // First 4 output values. int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); // Second 4 output values. int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve8_vert_8tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int w, int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); uint8x16x2_t samples_LUT; if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456; samples_LUT.val[1] = s78910; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
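// Four output rows are produced per iteration, so the four most recent
// transposed blocks become the starting blocks of the next iteration and only
// rows s7..s10 need to be loaded and transposed afresh.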
s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s78910; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src; uint8_t *d = dst; uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. samples_LUT.val[0] = s3456_lo; samples_LUT.val[1] = s78910_lo; s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); samples_LUT.val[0] = s3456_hi; samples_LUT.val[1] = s78910_hi; s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); uint8x8_t d0 = convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); uint8x8_t d1 = convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); uint8x8_t d2 = convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); uint8x8_t d3 = convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s78910_lo; s3456_hi = s78910_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { assert((intptr_t)dst % 4 == 0); assert(dst_stride % 4 == 0); (void)filter_x; (void)x_step_q4; (void)y_step_q4; src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; int filter_taps = get_filter_taps_convolve8(filter_y); if (filter_taps == 2) { convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else if (filter_taps == 4) { convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h); } else { convolve8_vert_8tap_neon_i8mm(src, src_stride, dst, dst_stride, filter_y, w, h); } } aom-3.12.1/aom_dsp/arm/aom_convolve_copy_neon.c000066400000000000000000000103551477627663500215120ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { const uint8_t *src1; uint8_t *dst1; int y; if (!(w & 0x0F)) { for (y = 0; y < h; ++y) { src1 = src; dst1 = dst; for (int x = 0; x < (w >> 4); ++x) { vst1q_u8(dst1, vld1q_u8(src1)); src1 += 16; dst1 += 16; } src += src_stride; dst += dst_stride; } } else if (!(w & 0x07)) { for (y = 0; y < h; ++y) { vst1_u8(dst, vld1_u8(src)); src += src_stride; dst += dst_stride; } } else if (!(w & 0x03)) { for (y = 0; y < h; ++y) { memcpy(dst, src, sizeof(uint32_t)); src += src_stride; dst += dst_stride; } } else if (!(w & 0x01)) { for (y = 0; y < h; ++y) { memcpy(dst, src, sizeof(uint16_t)); src += src_stride; dst += dst_stride; } } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { if (w < 4) { // copy2 do { memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h != 0); } else if (w == 4) { // copy4 uint16x4_t s0, s1; do { s0 = vld1_u16(src); src += src_stride; s1 = vld1_u16(src); src += src_stride; vst1_u16(dst, s0); dst += dst_stride; vst1_u16(dst, s1); dst += dst_stride; h -= 2; } while (h != 0); } else if (w == 8) { // copy8 uint16x8_t s0, s1; do { s0 = vld1q_u16(src); src += src_stride; s1 = vld1q_u16(src); src += src_stride; vst1q_u16(dst, s0); dst += dst_stride; vst1q_u16(dst, s1); dst += dst_stride; h -= 2; } while (h != 0); } else if (w < 32) { // copy16 uint16x8_t s0, s1, s2, s3; do { s0 = vld1q_u16(src); s1 = vld1q_u16(src + 8); src += src_stride; s2 = vld1q_u16(src); s3 = vld1q_u16(src + 8); src += src_stride; vst1q_u16(dst, s0); vst1q_u16(dst + 8, s1); dst += dst_stride; vst1q_u16(dst, s2); vst1q_u16(dst + 8, s3); dst += dst_stride; h -= 2; } while (h != 0); } else if (w == 32) { // copy32 uint16x8_t s0, s1, s2, s3; do { s0 = vld1q_u16(src); s1 = vld1q_u16(src + 8); s2 = vld1q_u16(src + 16); s3 = vld1q_u16(src + 24); src += src_stride; vst1q_u16(dst, s0); vst1q_u16(dst + 8, s1); vst1q_u16(dst + 16, s2); vst1q_u16(dst + 24, s3); dst += dst_stride; } while (--h != 0); } else { // copy64 uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { const uint16_t *s = src; uint16_t *d = dst; int width = w; do { s0 = vld1q_u16(s); s1 = vld1q_u16(s + 8); s2 = vld1q_u16(s + 16); s3 = vld1q_u16(s + 24); s4 = vld1q_u16(s + 32); s5 = vld1q_u16(s + 40); s6 = vld1q_u16(s + 48); s7 = vld1q_u16(s + 56); vst1q_u16(d, s0); vst1q_u16(d + 8, s1); vst1q_u16(d + 16, s2); vst1q_u16(d + 24, s3); vst1q_u16(d + 32, s4); vst1q_u16(d + 40, s5); vst1q_u16(d + 48, s6); vst1q_u16(d + 56, s7); s += 64; d += 64; width -= 64; } while (width > 0); src += src_stride; dst += dst_stride; } while (--h != 0); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/arm/aom_filter.h000066400000000000000000000017061477627663500171000ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_AOM_FILTER_H_ #define AOM_AOM_DSP_ARM_AOM_FILTER_H_ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline int get_filter_taps_convolve8(const int16_t *filter) { if (filter[0] | filter[7]) { return 8; } if (filter[1] | filter[6]) { return 6; } if (filter[2] | filter[5]) { return 4; } return 2; } #endif // AOM_AOM_DSP_ARM_AOM_FILTER_H_ aom-3.12.1/aom_dsp/arm/aom_neon_sve2_bridge.h000066400000000000000000000031361477627663500210240ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ #define AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ #include #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" // We can access instructions exclusive to the SVE2 instruction set from a // predominantly Neon context by making use of the Neon-SVE bridge intrinsics // to reinterpret Neon vectors as SVE vectors - with the high part of the SVE // vector (if it's longer than 128 bits) being "don't care". // While sub-optimal on machines that have SVE vector length > 128-bit - as the // remainder of the vector is unused - this approach is still beneficial when // compared to a Neon-only solution. static inline int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1, uint16x8_t tbl) { svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), svset_neonq_s16(svundef_s16(), s1)); return svget_neonq_s16( svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); } #endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ aom-3.12.1/aom_dsp/arm/aom_neon_sve_bridge.h000066400000000000000000000051001477627663500207330ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ #define AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ #include #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" // We can access instructions exclusive to the SVE instruction set from a // predominantly Neon context by making use of the Neon-SVE bridge intrinsics // to reinterpret Neon vectors as SVE vectors - with the high part of the SVE // vector (if it's longer than 128 bits) being "don't care". // While sub-optimal on machines that have SVE vector length > 128-bit - as the // remainder of the vector is unused - this approach is still beneficial when // compared to a Neon-only solution. static inline uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y) { return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), svset_neonq_u16(svundef_u16(), x), svset_neonq_u16(svundef_u16(), y))); } static inline int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), svset_neonq_s16(svundef_s16(), x), svset_neonq_s16(svundef_s16(), y))); } #define aom_svdot_lane_s16(sum, s0, f, lane) \ svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \ svset_neonq_s16(svundef_s16(), s0), \ svset_neonq_s16(svundef_s16(), f), lane)) static inline uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) { return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s), svset_neonq_u16(svundef_u16(), tbl))); } static inline int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) { return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s), svset_neonq_u16(svundef_u16(), tbl))); } #endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ aom-3.12.1/aom_dsp/arm/aom_scaled_convolve8_neon.c000066400000000000000000000311611477627663500220610ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" static inline void scaled_convolve_horiz_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); if (w == 4) { do { int x_q4 = x0_q4; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. 
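// The halving is lossless because every coefficient in the sub-pixel filters
// is even; it keeps the int16 intermediate sums inside convolve8_4 /
// convolve8_8 from overflowing without needing saturating instructions, and
// is undone later by shifting right by FILTER_BITS - 1 instead of FILTER_BITS.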
const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); // We halved the filter values so -1 from right shift. uint8x8_t d0 = vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); store_u8_4x1(&temp[4 * r], d0); } else { // Memcpy for non-subpel locations. s += SUBPEL_TAPS / 2 - 1; for (int c = 0; c < 4; ++c) { temp[r * 4 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } // Transpose the 4x4 result tile and store. uint8x8_t d01 = vld1_u8(temp + 0); uint8x8_t d23 = vld1_u8(temp + 8); transpose_elems_inplace_u8_4x4(&d01, &d23); store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); return; } // w >= 8 do { int x_q4 = x0_q4; uint8_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(&temp[r * 8], d0); } else { // Memcpy for non-subpel locations. s += SUBPEL_TAPS / 2 - 1; for (int c = 0; c < 8; ++c) { temp[r * 8 + c] = s[c * src_stride]; } } x_q4 += x_step_q4; } // Transpose the 8x8 result tile and store. 
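// Each row r of temp[] holds the eight outputs of one destination column
// (every column uses its own filter phase as x_q4 steps along), so the tile
// is transposed back to row order before being stored to the destination.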
uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } static inline void scaled_convolve_vert_neon( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; if (w == 4) { do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int16x8_t filter = vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); // We halved the filter values so -1 from right shift. uint8x8_t d0 = vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); store_u8_4x1(dst, d0); } else { // Memcpy for non-subpel locations. memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); return; } if (w == 8) { do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int16x8_t filter = vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(dst, d0); } else { // Memcpy for non-subpel locations. memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); return; } // w >= 16 do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; uint8_t *d = dst; int width = w; if (y_q4 & SUBPEL_MASK) { do { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. 
const int16x8_t filter = vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], s6[0], s7[0], filter); uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], s6[1], s7[1], filter); vst1q_u8(d, vcombine_u8(d0, d1)); s += 16; d += 16; width -= 16; } while (width != 0); } else { // Memcpy for non-subpel locations. s += (SUBPEL_TAPS / 2 - 1) * src_stride; do { uint8x16_t s0 = vld1q_u8(s); vst1q_u8(d, s0); s += 16; d += 16; width -= 16; } while (width != 0); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); } void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 // lines post both horizontally and vertically. 
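// With SUBPEL_TAPS = 8 the source pointer is moved up and left by 3 samples,
// so the horizontal pass fills im_block starting 3 lines above the block and
// im_height rows deep, giving the vertical pass the full 8-tap window for
// every output row.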
const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; scaled_convolve_horiz_neon(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, filter, x0_q4, x_step_q4, w, im_height); scaled_convolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } aom-3.12.1/aom_dsp/arm/aom_scaled_convolve8_neon_dotprod.c000066400000000000000000000325531477627663500236220ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" static inline uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); // Accumulate into 128 << (FILTER_BITS - 1) / 2 to account for range // transform. const int32x4_t acc = vdupq_n_s32((128 << (FILTER_BITS - 1)) / 2); int32x4_t sum01 = vdotq_s32(acc, s01_128, filter_x2); int32x4_t sum23 = vdotq_s32(acc, s23_128, filter_x2); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vdup_n_s16(0)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); uint8x16_t s45 = vcombine_u8(s4, s5); uint8x16_t s67 = vcombine_u8(s6, s7); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); int8x16_t s45_128 = vreinterpretq_s8_u8(vsubq_u8(s45, vdupq_n_u8(128))); int8x16_t s67_128 = vreinterpretq_s8_u8(vsubq_u8(s67, vdupq_n_u8(128))); // Accumulate into 128 << (FILTER_BITS - 1) / 2 to account for range // transform. const int32x4_t acc = vdupq_n_s32((128 << (FILTER_BITS - 1)) / 2); int32x4_t sum01 = vdotq_s32(acc, s01_128, filter_x2); int32x4_t sum23 = vdotq_s32(acc, s23_128, filter_x2); int32x4_t sum45 = vdotq_s32(acc, s45_128, filter_x2); int32x4_t sum67 = vdotq_s32(acc, s67_128, filter_x2); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int32x4_t sum4567 = vpaddq_s32(sum45, sum67); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. 
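// Note on the accumulator bias above: the range transform subtracts 128 from
// every sample and the halved filter taps sum to 64, so a full 8-tap dot
// product is offset by -128 * 64 = -8192. Because the bias is added to both
// 4-tap partial sums before vpaddq_s32 combines them, each partial carries
// half of it: (128 << (FILTER_BITS - 1)) / 2 = 4096.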
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scaled_convolve_horiz_neon_dotprod( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); if (w == 4) { do { int x_q4 = x0_q4; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; uint8x8_t s0, s1, s2, s3; load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_4_h(s0, s1, s2, s3, filter); store_u8_4x1(&temp[4 * r], d0); x_q4 += x_step_q4; } // Transpose the 4x4 result tile and store. uint8x8_t d01 = vld1_u8(temp + 0); uint8x8_t d23 = vld1_u8(temp + 8); transpose_elems_inplace_u8_4x4(&d01, &d23); store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); return; } // w >= 8 do { int x_q4 = x0_q4; uint8_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(&temp[r * 8], d0); x_q4 += x_step_q4; } // Transpose the 8x8 result tile and store. uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } static inline uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { uint8x16_t s01 = vcombine_u8(vzip1_u8(s0, s1), vdup_n_u8(0)); uint8x16_t s23 = vcombine_u8(vzip1_u8(s2, s3), vdup_n_u8(0)); uint8x16_t s45 = vcombine_u8(vzip1_u8(s4, s5), vdup_n_u8(0)); uint8x16_t s67 = vcombine_u8(vzip1_u8(s6, s7), vdup_n_u8(0)); uint8x16_t s0123 = vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))); uint8x16_t s4567 = vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s0123_128 = vreinterpretq_s8_u8(vsubq_u8(s0123, vdupq_n_u8(128))); int8x16_t s4567_128 = vreinterpretq_s8_u8(vsubq_u8(s4567, vdupq_n_u8(128))); // Accumulate into 128 << (FILTER_BITS - 1) to account for range transform. int32x4_t sum = vdupq_n_s32(128 << (FILTER_BITS - 1)); sum = vdotq_lane_s32(sum, s0123_128, filter, 0); sum = vdotq_lane_s32(sum, s4567_128, filter, 1); // We halved the filter values so -1 from right shift. 
return vqrshrun_n_s16(vcombine_s16(vmovn_s32(sum), vdup_n_s16(0)), FILTER_BITS - 1); } static inline uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { uint8x16_t s01 = vzip1q_u8(vcombine_u8(s0, vdup_n_u8(0)), vcombine_u8(s1, vdup_n_u8(0))); uint8x16_t s23 = vzip1q_u8(vcombine_u8(s2, vdup_n_u8(0)), vcombine_u8(s3, vdup_n_u8(0))); uint8x16_t s45 = vzip1q_u8(vcombine_u8(s4, vdup_n_u8(0)), vcombine_u8(s5, vdup_n_u8(0))); uint8x16_t s67 = vzip1q_u8(vcombine_u8(s6, vdup_n_u8(0)), vcombine_u8(s7, vdup_n_u8(0))); uint8x16_t s0123[2] = { vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))), vreinterpretq_u8_u16( vzip2q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))) }; uint8x16_t s4567[2] = { vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))), vreinterpretq_u8_u16( vzip2q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))) }; // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s0123_128[2] = { vreinterpretq_s8_u8(vsubq_u8(s0123[0], vdupq_n_u8(128))), vreinterpretq_s8_u8(vsubq_u8(s0123[1], vdupq_n_u8(128))) }; int8x16_t s4567_128[2] = { vreinterpretq_s8_u8(vsubq_u8(s4567[0], vdupq_n_u8(128))), vreinterpretq_s8_u8(vsubq_u8(s4567[1], vdupq_n_u8(128))) }; // Accumulate into 128 << (FILTER_BITS - 1) to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << (FILTER_BITS - 1)); int32x4_t sum0123 = vdotq_lane_s32(acc, s0123_128[0], filter, 0); sum0123 = vdotq_lane_s32(sum0123, s4567_128[0], filter, 1); int32x4_t sum4567 = vdotq_lane_s32(acc, s0123_128[1], filter, 0); sum4567 = vdotq_lane_s32(sum4567, s4567_128[1], filter, 1); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scaled_convolve_vert_neon_dotprod( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; if (w == 4) { do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); store_u8_4x1(dst, d0); } else { // Memcpy for non-subpel locations. memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); return; } // w >= 8 do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; uint8_t *d = dst; int width = w; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); do { uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); } else { // Memcpy for non-subpel locations. 
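// When y_q4 has no fractional part the kernel phase is the identity filter
// (a single unit tap at the centre), so the output row is a straight copy of
// the source row (SUBPEL_TAPS / 2 - 1) lines into the 8-tap window.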
s += (SUBPEL_TAPS / 2 - 1) * src_stride; do { uint8x8_t s0 = vld1_u8(s); vst1_u8(d, s0); s += 8; d += 8; width -= 8; } while (width != 0); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); } void aom_scaled_2d_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 // lines post both horizontally and vertically. const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; scaled_convolve_horiz_neon_dotprod(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, filter, x0_q4, x_step_q4, w, im_height); scaled_convolve_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } aom-3.12.1/aom_dsp/arm/aom_scaled_convolve8_neon_i8mm.c000066400000000000000000000272151477627663500230200ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/aom_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "config/aom_dsp_rtcd.h" static inline uint8x8_t convolve8_4_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2); int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vdup_n_s16(0)); // We halved the filter values so -1 from right shift. 
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline uint8x8_t convolve8_8_h(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { int8x16_t filter_x2 = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); uint8x16_t s45 = vcombine_u8(s4, s5); uint8x16_t s67 = vcombine_u8(s6, s7); int32x4_t sum01 = vusdotq_s32(vdupq_n_s32(0), s01, filter_x2); int32x4_t sum23 = vusdotq_s32(vdupq_n_s32(0), s23, filter_x2); int32x4_t sum45 = vusdotq_s32(vdupq_n_s32(0), s45, filter_x2); int32x4_t sum67 = vusdotq_s32(vdupq_n_s32(0), s67, filter_x2); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int32x4_t sum4567 = vpaddq_s32(sum45, sum67); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scaled_convolve_horiz_neon_i8mm( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filter, const int x0_q4, const int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); if (w == 4) { do { int x_q4 = x0_q4; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; uint8x8_t s0, s1, s2, s3; load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_4_h(s0, s1, s2, s3, filter); store_u8_4x1(&temp[4 * r], d0); x_q4 += x_step_q4; } // Transpose the 4x4 result tile and store. uint8x8_t d01 = vld1_u8(temp + 0); uint8x8_t d23 = vld1_u8(temp + 8); transpose_elems_inplace_u8_4x4(&d01, &d23); store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 0); return; } // w >= 8 do { int x_q4 = x0_q4; uint8_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(&temp[r * 8], d0); x_q4 += x_step_q4; } // Transpose the 8x8 result tile and store. 
uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } while (h > 0); } static inline uint8x8_t convolve8_4_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { uint8x16_t s01 = vcombine_u8(vzip1_u8(s0, s1), vdup_n_u8(0)); uint8x16_t s23 = vcombine_u8(vzip1_u8(s2, s3), vdup_n_u8(0)); uint8x16_t s45 = vcombine_u8(vzip1_u8(s4, s5), vdup_n_u8(0)); uint8x16_t s67 = vcombine_u8(vzip1_u8(s6, s7), vdup_n_u8(0)); uint8x16_t s0123 = vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))); uint8x16_t s4567 = vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))); int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0123, filter, 0); sum = vusdotq_lane_s32(sum, s4567, filter, 1); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(vcombine_s16(vmovn_s32(sum), vdup_n_s16(0)), FILTER_BITS - 1); } static inline uint8x8_t convolve8_8_v(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2, uint8x8_t s3, uint8x8_t s4, uint8x8_t s5, uint8x8_t s6, uint8x8_t s7, int8x8_t filter) { uint8x16_t s01 = vzip1q_u8(vcombine_u8(s0, vdup_n_u8(0)), vcombine_u8(s1, vdup_n_u8(0))); uint8x16_t s23 = vzip1q_u8(vcombine_u8(s2, vdup_n_u8(0)), vcombine_u8(s3, vdup_n_u8(0))); uint8x16_t s45 = vzip1q_u8(vcombine_u8(s4, vdup_n_u8(0)), vcombine_u8(s5, vdup_n_u8(0))); uint8x16_t s67 = vzip1q_u8(vcombine_u8(s6, vdup_n_u8(0)), vcombine_u8(s7, vdup_n_u8(0))); uint8x16_t s0123[2] = { vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))), vreinterpretq_u8_u16( vzip2q_u16(vreinterpretq_u16_u8(s01), vreinterpretq_u16_u8(s23))) }; uint8x16_t s4567[2] = { vreinterpretq_u8_u16( vzip1q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))), vreinterpretq_u8_u16( vzip2q_u16(vreinterpretq_u16_u8(s45), vreinterpretq_u16_u8(s67))) }; int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[0], filter, 0); sum0123 = vusdotq_lane_s32(sum0123, s4567[0], filter, 1); int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0123[1], filter, 0); sum4567 = vusdotq_lane_s32(sum4567, s4567[1], filter, 1); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scaled_convolve_vert_neon_i8mm( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filter, const int y0_q4, const int y_step_q4, int w, int h) { int y_q4 = y0_q4; if (w == 4) { do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); store_u8_4x1(dst, d0); } else { // Memcpy for non-subpel locations. 
memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); return; } // w >= 8 do { const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; uint8_t *d = dst; int width = w; if (y_q4 & SUBPEL_MASK) { // Halve filter values (all even) to avoid the need for saturating // arithmetic in convolution kernels. const int8x8_t filter = vshrn_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); do { uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); } else { // Memcpy for non-subpel locations. s += (SUBPEL_TAPS / 2 - 1) * src_stride; do { uint8x8_t s0 = vld1_u8(s); vst1_u8(d, s0); s += 8; d += 8; width -= 8; } while (width != 0); } y_q4 += y_step_q4; dst += dst_stride; } while (--h != 0); } void aom_scaled_2d_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer, im_block, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the im_block buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); const int im_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; const ptrdiff_t im_stride = 64; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 // lines post both horizontally and vertically. const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; scaled_convolve_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, filter, x0_q4, x_step_q4, w, im_height); scaled_convolve_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } aom-3.12.1/aom_dsp/arm/avg_neon.c000066400000000000000000000237711477627663500165540ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) { const uint8x8_t s0 = load_unaligned_u8(p, stride); const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride); const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1)); return (sum + (1 << 3)) >> 4; } unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) { uint8x8_t s0 = vld1_u8(p); p += stride; uint8x8_t s1 = vld1_u8(p); p += stride; uint16x8_t acc = vaddl_u8(s0, s1); int i = 0; do { const uint8x8_t si = vld1_u8(p); p += stride; acc = vaddw_u8(acc, si); } while (++i < 6); const uint32_t sum = horizontal_add_u16x8(acc); return (sum + (1 << 5)) >> 6; } void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg) { avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p); avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p); avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p); avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p); } int aom_satd_lp_neon(const int16_t *coeff, int length) { int16x8_t s0 = vld1q_s16(coeff); int16x8_t s1 = vld1q_s16(coeff + 8); int16x8_t abs0 = vabsq_s16(s0); int16x8_t abs1 = vabsq_s16(s1); int32x4_t acc0 = vpaddlq_s16(abs0); int32x4_t acc1 = vpaddlq_s16(abs1); length -= 16; coeff += 16; while (length != 0) { s0 = vld1q_s16(coeff); s1 = vld1q_s16(coeff + 8); abs0 = vabsq_s16(s0); abs1 = vabsq_s16(s1); acc0 = vpadalq_s16(acc0, abs0); acc1 = vpadalq_s16(acc1, abs1); length -= 16; coeff += 16; } int32x4_t accum = vaddq_s32(acc0, acc1); return horizontal_add_s32x4(accum); } void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { assert(width % 16 == 0); assert(height % 4 == 0); const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); uint16x8_t sum_lo[2], sum_hi[2]; int w = 0; do { const uint8_t *r = ref + w; uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride); uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride); uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride); uint8x16_t r3 = vld1q_u8(r + 3 * ref_stride); sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); r += 4 * ref_stride; for (int h = height - 4; h != 0; h -= 4) { r0 = vld1q_u8(r + 0 * ref_stride); r1 = vld1q_u8(r + 1 * ref_stride); r2 = vld1q_u8(r + 2 * ref_stride); r3 = vld1q_u8(r + 3 * ref_stride); uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo); sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi); sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo); sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi); r += 4 * ref_stride; } sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); const int16x8_t avg0 = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); const int16x8_t avg1 = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); vst1q_s16(hbuf + w, avg0); vst1q_s16(hbuf + w + 8, avg1); w += 16; } while (w < 
width); } void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { assert(width % 16 == 0); assert(height % 4 == 0); const int16x4_t neg_norm_factor = vdup_n_s16(-norm_factor); uint16x8_t sum[4]; int h = 0; do { sum[0] = vpaddlq_u8(vld1q_u8(ref + 0 * ref_stride)); sum[1] = vpaddlq_u8(vld1q_u8(ref + 1 * ref_stride)); sum[2] = vpaddlq_u8(vld1q_u8(ref + 2 * ref_stride)); sum[3] = vpaddlq_u8(vld1q_u8(ref + 3 * ref_stride)); for (int w = 16; w < width; w += 16) { sum[0] = vpadalq_u8(sum[0], vld1q_u8(ref + 0 * ref_stride + w)); sum[1] = vpadalq_u8(sum[1], vld1q_u8(ref + 1 * ref_stride + w)); sum[2] = vpadalq_u8(sum[2], vld1q_u8(ref + 2 * ref_stride + w)); sum[3] = vpadalq_u8(sum[3], vld1q_u8(ref + 3 * ref_stride + w)); } uint16x4_t sum_4d = vmovn_u32(horizontal_add_4d_u16x8(sum)); int16x4_t avg = vshl_s16(vreinterpret_s16_u16(sum_4d), neg_norm_factor); vst1_s16(vbuf + h, avg); ref += 4 * ref_stride; h += 4; } while (h < height); } // coeff: 20 bits, dynamic range [-524287, 524287]. // length: value range {16, 32, 64, 128, 256, 512, 1024}. int aom_satd_neon(const tran_low_t *coeff, int length) { const int32x4_t zero = vdupq_n_s32(0); int32x4_t s0 = vld1q_s32(&coeff[0]); int32x4_t s1 = vld1q_s32(&coeff[4]); int32x4_t s2 = vld1q_s32(&coeff[8]); int32x4_t s3 = vld1q_s32(&coeff[12]); int32x4_t accum0 = vabsq_s32(s0); int32x4_t accum1 = vabsq_s32(s2); accum0 = vabaq_s32(accum0, s1, zero); accum1 = vabaq_s32(accum1, s3, zero); length -= 16; coeff += 16; while (length != 0) { s0 = vld1q_s32(&coeff[0]); s1 = vld1q_s32(&coeff[4]); s2 = vld1q_s32(&coeff[8]); s3 = vld1q_s32(&coeff[12]); accum0 = vabaq_s32(accum0, s0, zero); accum1 = vabaq_s32(accum1, s1, zero); accum0 = vabaq_s32(accum0, s2, zero); accum1 = vabaq_s32(accum1, s3, zero); length -= 16; coeff += 16; } // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024] return horizontal_add_s32x4(vaddq_s32(accum0, accum1)); } int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) { assert(bwl >= 2 && bwl <= 5); int width = 4 << bwl; int16x8_t r = vld1q_s16(ref); int16x8_t s = vld1q_s16(src); // diff: dynamic range [-510, 510] 10 (signed) bits. int16x8_t diff = vsubq_s16(r, s); // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. int16x8_t v_mean = diff; // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. int32x4_t v_sse[2]; v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff)); v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff)); ref += 8; src += 8; width -= 8; do { r = vld1q_s16(ref); s = vld1q_s16(src); diff = vsubq_s16(r, s); v_mean = vaddq_s16(v_mean, diff); v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff)); v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff)); ref += 8; src += 8; width -= 8; } while (width != 0); // Dynamic range [0, 65280], 16 (unsigned) bits. const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean)); const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1])); // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits. return sse - ((mean_abs * mean_abs) >> (bwl + 2)); } void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int *min, int *max) { // Load and concatenate. 
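  // (Overview: the 8x8 blocks a and b are loaded two rows at a time, the
  // per-pixel absolute differences |a - b| are formed with vabdq_u8, and the
  // results are reduced with vector max/min operations; only the final scalar
  // maximum and minimum are written back to *max and *min.)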
const uint8x16_t a01 = load_u8_8x2(a + 0 * a_stride, a_stride); const uint8x16_t a23 = load_u8_8x2(a + 2 * a_stride, a_stride); const uint8x16_t a45 = load_u8_8x2(a + 4 * a_stride, a_stride); const uint8x16_t a67 = load_u8_8x2(a + 6 * a_stride, a_stride); const uint8x16_t b01 = load_u8_8x2(b + 0 * b_stride, b_stride); const uint8x16_t b23 = load_u8_8x2(b + 2 * b_stride, b_stride); const uint8x16_t b45 = load_u8_8x2(b + 4 * b_stride, b_stride); const uint8x16_t b67 = load_u8_8x2(b + 6 * b_stride, b_stride); // Absolute difference. const uint8x16_t ab01_diff = vabdq_u8(a01, b01); const uint8x16_t ab23_diff = vabdq_u8(a23, b23); const uint8x16_t ab45_diff = vabdq_u8(a45, b45); const uint8x16_t ab67_diff = vabdq_u8(a67, b67); // Max values between the Q vectors. const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); #if AOM_ARCH_AARCH64 *min = *max = 0; // Clear high bits *((uint8_t *)max) = vmaxvq_u8(ab07_max); *((uint8_t *)min) = vminvq_u8(ab07_min); #else // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); *min = *max = 0; // Clear high bits // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); #endif } aom-3.12.1/aom_dsp/arm/avg_pred_neon.c000066400000000000000000000103611477627663500175550ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/dist_wtd_avg_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { do { const uint8_t *pred_ptr = pred; const uint8_t *ref_ptr = ref; uint8_t *comp_pred_ptr = comp_pred; int w = width; do { const uint8x16_t p = vld1q_u8(pred_ptr); const uint8x16_t r = vld1q_u8(ref_ptr); const uint8x16_t avg = vrhaddq_u8(p, r); vst1q_u8(comp_pred_ptr, avg); ref_ptr += 16; pred_ptr += 16; comp_pred_ptr += 16; w -= 16; } while (w != 0); ref += ref_stride; pred += width; comp_pred += width; } while (--height != 0); } else if (width == 8) { int h = height / 2; do { const uint8x16_t p = vld1q_u8(pred); const uint8x16_t r = load_u8_8x2(ref, ref_stride); const uint8x16_t avg = vrhaddq_u8(p, r); vst1q_u8(comp_pred, avg); ref += 2 * ref_stride; pred += 16; comp_pred += 16; } while (--h != 0); } else { int h = height / 4; assert(width == 4); do { const uint8x16_t p = vld1q_u8(pred); const uint8x16_t r = load_unaligned_u8q(ref, ref_stride); const uint8x16_t avg = vrhaddq_u8(p, r); vst1q_u8(comp_pred, avg); ref += 4 * ref_stride; pred += 16; comp_pred += 16; } while (--h != 0); } } void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { const uint8_t *src0 = invert_mask ? pred : ref; const uint8_t *src1 = invert_mask ? ref : pred; const int src_stride0 = invert_mask ? width : ref_stride; const int src_stride1 = invert_mask ? ref_stride : width; if (width > 8) { do { const uint8_t *src0_ptr = src0; const uint8_t *src1_ptr = src1; const uint8_t *mask_ptr = mask; uint8_t *comp_pred_ptr = comp_pred; int w = width; do { const uint8x16_t s0 = vld1q_u8(src0_ptr); const uint8x16_t s1 = vld1q_u8(src1_ptr); const uint8x16_t m0 = vld1q_u8(mask_ptr); uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1); vst1q_u8(comp_pred_ptr, blend_u8); src0_ptr += 16; src1_ptr += 16; mask_ptr += 16; comp_pred_ptr += 16; w -= 16; } while (w != 0); src0 += src_stride0; src1 += src_stride1; mask += mask_stride; comp_pred += width; } while (--height != 0); } else if (width == 8) { do { const uint8x8_t s0 = vld1_u8(src0); const uint8x8_t s1 = vld1_u8(src1); const uint8x8_t m0 = vld1_u8(mask); uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1); vst1_u8(comp_pred, blend_u8); src0 += src_stride0; src1 += src_stride1; mask += mask_stride; comp_pred += 8; } while (--height != 0); } else { int h = height / 2; assert(width == 4); do { const uint8x8_t s0 = load_unaligned_u8(src0, src_stride0); const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1); const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1); vst1_u8(comp_pred, blend_u8); src0 += 2 * src_stride0; src1 += 2 * src_stride1; mask += 2 * mask_stride; comp_pred += 8; } while (--h != 0); } } aom-3.12.1/aom_dsp/arm/avg_sve.c000066400000000000000000000043721477627663500164060ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) { assert(bwl >= 2 && bwl <= 5); int width = 4 << bwl; int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; do { int16x8_t r0 = vld1q_s16(ref); int16x8_t s0 = vld1q_s16(src); // diff: dynamic range [-510, 510] 10 (signed) bits. int16x8_t diff0 = vsubq_s16(r0, s0); // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. v_mean[0] = vaddq_s16(v_mean[0], diff0); // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0); int16x8_t r1 = vld1q_s16(ref + 8); int16x8_t s1 = vld1q_s16(src + 8); // diff: dynamic range [-510, 510] 10 (signed) bits. int16x8_t diff1 = vsubq_s16(r1, s1); // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. v_mean[1] = vaddq_s16(v_mean[1], diff1); // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1); ref += 16; src += 16; width -= 16; } while (width != 0); // Dynamic range [0, 65280], 16 (unsigned) bits. const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1]))); const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits. return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2))); } aom-3.12.1/aom_dsp/arm/blend_a64_mask_neon.c000066400000000000000000000370521477627663500205450ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" static uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b, uint16x8_t round_offset) { const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a)); uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a)); blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); blend_u32_hi = vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS); uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS); uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi); res = vqsubq_u16(res, round_offset); return vqrshrn_n_u16(res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); } void aom_lowbd_blend_a64_d16_mask_neon( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { (void)conv_params; const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const uint16x8_t offset_vec = vdupq_n_u16(round_offset); assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { if (w >= 8) { do { int i = 0; do { uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); vst1_u8(dst + i, blend); i += 8; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if (subw == 1 && subh == 1) { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i); uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8); uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); vst1_u8(dst + i, blend); i += 8; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); 
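        // (The four mask rows loaded above are at twice the output resolution
        // in both directions (subw == 1, subh == 1); avg_blend_pairwise_u8x8_4
        // below averages each 2x2 mask neighbourhood, with rounding, to give
        // one alpha value in [0, 64] per output pixel.)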
uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if (subw == 1 && subh == 0) { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 2 * i); uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); vst1_u8(dst + i, blend); i += 8; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); vst1_u8(dst + i, blend); i += 8; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0_2 = load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); uint8x8_t m1_3 = load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } } void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if ((subw | subh) == 0) { if (w > 8) { do { int i = 0; do { uint8x16_t m0 = vld1q_u8(mask + i); uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { do { uint8x8_t m0 = vld1_u8(mask); uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, 
s1); vst1_u8(dst, blend); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride); uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if ((subw & subh) == 1) { if (w > 8) { do { int i = 0; do { uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16); uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16); uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3); uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8); uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8); uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); vst1_u8(dst, blend); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if (subw == 1 && subh == 0) { if (w > 8) { do { int i = 0; do { uint8x16_t m0 = vld1q_u8(mask + 2 * i); uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16); uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1); uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { do { uint8x8_t m0 = vld1_u8(mask); uint8x8_t m1 = vld1_u8(mask + 8); uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); vst1_u8(dst, blend); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); 
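        // (alpha_blend_a64_u8x8() above computes the usual A64 blend,
        // (m * s0 + (64 - m) * s1 + 32) >> 6, with the mask value m in
        // [0, 64].)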
store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else { if (w > 8) { do { int i = 0; do { uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i); uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i); uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t m_avg = avg_blend_u8x16(m0, m1); uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t m_avg = avg_blend_u8x8(m0, m1); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); vst1_u8(dst, blend); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0_2 = load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); uint8x8_t m1_3 = load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } } aom-3.12.1/aom_dsp/arm/blend_neon.h000066400000000000000000000113111477627663500170530ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_ #define AOM_AOM_DSP_ARM_BLEND_NEON_H_ #include #include "aom_dsp/blend.h" static inline uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a, uint8x16_t b) { const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a)); uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a)); blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b)); blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b)); uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS); uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS); return vcombine_u8(blend_u8_lo, blend_u8_hi); } static inline uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a, uint8x8_t b) { const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); uint16x8_t blend_u16 = vmull_u8(m, a); blend_u16 = vmlal_u8(blend_u16, m_inv, b); return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS); } #if CONFIG_AV1_HIGHBITDEPTH static inline uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b) { uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m)); uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m)); blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv)); blend_u32_hi = vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv)); uint16x4_t blend_u16_lo = vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS); uint16x4_t blend_u16_hi = vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS); return vcombine_u16(blend_u16_lo, blend_u16_hi); } static inline uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a, uint16x4_t b) { const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); uint32x4_t blend_u16 = vmull_u16(m, a); blend_u16 = vmlal_u16(blend_u16, m_inv, b); return vrshrn_n_u32(blend_u16, AOM_BLEND_A64_ROUND_BITS); } #endif // CONFIG_AV1_HIGHBITDEPTH static inline uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); } static inline uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); } static inline uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) { return vrshr_n_u8(vpadd_u8(a, b), 1); } static inline uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) { #if AOM_ARCH_AARCH64 return vrshrq_n_u8(vpaddq_u8(a, b), 1); #else uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b)); return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1); #endif // AOM_ARCH_AARCH64 } static inline uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b, uint8x8_t c, uint8x8_t d) { uint8x8_t a_c = vpadd_u8(a, c); uint8x8_t b_d = vpadd_u8(b, d); return vrshr_n_u8(vqadd_u8(a_c, b_d), 2); } static inline uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b, uint8x16_t c, uint8x16_t d) { #if AOM_ARCH_AARCH64 uint8x16_t a_c = vpaddq_u8(a, c); uint8x16_t b_d = vpaddq_u8(b, d); return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2); #else uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b)); uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c)); uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d)); 
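  // (Assuming AOM_BLEND_A64 mask inputs, i.e. values <= 64: a 2x2 sum can
  // reach 256, which does not fit in a uint8. The saturating add below clamps
  // that single case to 255, and the rounding shift still produces the correct
  // average of 64, since (255 + 2) >> 2 == 64.)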
uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c); uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d); return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2); #endif // AOM_ARCH_AARCH64 } #endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_ aom-3.12.1/aom_dsp/arm/blk_sse_sum_neon.c000066400000000000000000000074771477627663500203120ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int i = bh; int32x4_t sum = vdupq_n_s32(0); int32x4_t sse = vdupq_n_s32(0); do { int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride)); sum = vpadalq_s16(sum, d); sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d)); sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d)); data += 2 * stride; i -= 2; } while (i != 0); *x_sum = horizontal_add_s32x4(sum); *x2_sum = horizontal_long_add_s32x4(sse); } static inline void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int i = bh; int32x4_t sum = vdupq_n_s32(0); int32x4_t sse = vdupq_n_s32(0); // Input is 12-bit wide, so we can add up to 127 squared elements in a signed // 32-bits element. Since we're accumulating into an int32x4_t and the maximum // value for bh is 32, we don't have to worry about sse overflowing. do { int16x8_t d = vld1q_s16(data); sum = vpadalq_s16(sum, d); sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d)); sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d)); data += stride; } while (--i != 0); *x_sum = horizontal_add_s32x4(sum); *x2_sum = horizontal_long_add_s32x4(sse); } static inline void get_blk_sse_sum_large_neon(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum = vdupq_n_s32(0); int64x2_t sse = vdupq_n_s64(0); // Input is 12-bit wide, so we can add up to 127 squared elements in a signed // 32-bits element. Since we're accumulating into an int32x4_t vector that // means we can process up to (127*4)/bw rows before we need to widen to // 64 bits. int i_limit = (127 * 4) / bw; int i_tmp = bh > i_limit ? 
i_limit : bh; int i = 0; do { int32x4_t sse_s32 = vdupq_n_s32(0); do { int j = bw; const int16_t *data_ptr = data; do { int16x8_t d = vld1q_s16(data_ptr); sum = vpadalq_s16(sum, d); sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d)); sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d)); data_ptr += 8; j -= 8; } while (j != 0); data += stride; i++; } while (i < i_tmp && i < bh); sse = vpadalq_s32(sse, sse_s32); i_tmp += i_limit; } while (i < bh); *x_sum = horizontal_add_s32x4(sum); *x2_sum = horizontal_add_s64x2(sse); } void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { if (bw == 4) { get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum); } else if (bw == 8) { get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum); } else { assert(bw % 8 == 0); get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum); } } aom-3.12.1/aom_dsp/arm/blk_sse_sum_sve.c000066400000000000000000000063131477627663500201340ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" static inline void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum = vdupq_n_s32(0); int64x2_t sse = vdupq_n_s64(0); do { int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride)); sum = vpadalq_s16(sum, d); sse = aom_sdotq_s16(sse, d, d); data += 2 * stride; bh -= 2; } while (bh != 0); *x_sum = vaddvq_s32(sum); *x2_sum = vaddvq_s64(sse); } static inline void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { int16x8_t d0 = vld1q_s16(data); int16x8_t d1 = vld1q_s16(data + stride); sum[0] = vpadalq_s16(sum[0], d0); sum[1] = vpadalq_s16(sum[1], d1); sse[0] = aom_sdotq_s16(sse[0], d0, d0); sse[1] = aom_sdotq_s16(sse[1], d1, d1); data += 2 * stride; bh -= 2; } while (bh != 0); *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); } static inline void get_blk_sse_sum_large_sve(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { int j = bw; const int16_t *data_ptr = data; do { int16x8_t d0 = vld1q_s16(data_ptr); int16x8_t d1 = vld1q_s16(data_ptr + 8); sum[0] = vpadalq_s16(sum[0], d0); sum[1] = vpadalq_s16(sum[1], d1); sse[0] = aom_sdotq_s16(sse[0], d0, d0); sse[1] = aom_sdotq_s16(sse[1], d1, d1); data_ptr += 16; j -= 16; } while (j != 0); data += stride; } while (--bh != 0); *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); } void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { if (bw == 4) { 
get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum); } else if (bw == 8) { get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum); } else { assert(bw % 16 == 0); get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum); } } aom-3.12.1/aom_dsp/arm/dist_wtd_avg_neon.h000066400000000000000000000050611477627663500204520ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ #define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ #include #include "aom_dsp/aom_dsp_common.h" #include "av1/common/enums.h" static inline uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b, uint8x8_t wta, uint8x8_t wtb) { uint16x8_t wtd_sum = vmull_u8(a, wta); wtd_sum = vmlal_u8(wtd_sum, b, wtb); return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS); } static inline uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b, uint16x4_t wta, uint16x4_t wtb) { uint32x4_t wtd_sum = vmull_u16(a, wta); wtd_sum = vmlal_u16(wtd_sum, b, wtb); return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS); } static inline uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b, uint8x16_t wta, uint8x16_t wtb) { uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta)); uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta)); wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb)); wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb)); uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS); uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS); return vcombine_u8(wtd_avg_lo, wtd_avg_hi); } static inline uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b, uint16x8_t wta, uint16x8_t wtb) { uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta)); uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta)); wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb)); wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb)); uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS); uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS); return vcombine_u16(wtd_avg_lo, wtd_avg_hi); } #endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ aom-3.12.1/aom_dsp/arm/fwd_txfm_neon.c000066400000000000000000000334571477627663500176170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" static void aom_fdct4x4_helper(const int16_t *input, int stride, int16x4_t *input_0, int16x4_t *input_1, int16x4_t *input_2, int16x4_t *input_3) { *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); // If the very first value != 0, then add 1. if (input[0] != 0) { const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); *input_0 = vadd_s16(*input_0, one); } for (int i = 0; i < 2; ++i) { const int16x8_t input_01 = vcombine_s16(*input_0, *input_1); const int16x8_t input_32 = vcombine_s16(*input_3, *input_2); // in_0 +/- in_3, in_1 +/- in_2 const int16x8_t s_01 = vaddq_s16(input_01, input_32); const int16x8_t s_32 = vsubq_s16(input_01, input_32); // step_0 +/- step_1, step_2 +/- step_3 const int16x4_t s_0 = vget_low_s16(s_01); const int16x4_t s_1 = vget_high_s16(s_01); const int16x4_t s_2 = vget_high_s16(s_32); const int16x4_t s_3 = vget_low_s16(s_32); // (s_0 +/- s_1) * cospi_16_64 // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64); const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64); // fdct_round_shift int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); // s_3 * cospi_8_64 + s_2 * cospi_24_64 // s_3 * cospi_24_64 - s_2 * cospi_8_64 const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64); const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64); const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64); const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64); // fdct_round_shift int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); // Only transpose the first pass if (i == 0) { transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3); } *input_0 = out_0; *input_1 = out_1; *input_2 = out_2; *input_3 = out_3; } } void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { // input[M * stride] * 16 int16x4_t input_0, input_1, input_2, input_3; aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); int16x8_t out_01 = vcombine_s16(input_0, input_1); int16x8_t out_23 = vcombine_s16(input_2, input_3); out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); store_s16q_to_tran_low(final_output + 0 * 8, out_01); store_s16q_to_tran_low(final_output + 1 * 8, out_23); } void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output, int stride) { // input[M * stride] * 16 int16x4_t input_0, input_1, input_2, input_3; aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); // Not quite a rounding shift. Only add 1 despite shifting by 2. 
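  // (That is, each output is (x + 1) >> 2 rather than the rounding form
  // (x + 2) >> 2; e.g. x = 6 gives (6 + 1) >> 2 = 1 here instead of 2,
  // keeping the result consistent with the scalar 4x4 fDCT reference.)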
const int16x8_t one = vdupq_n_s16(1); int16x8_t out_01 = vcombine_s16(input_0, input_1); int16x8_t out_23 = vcombine_s16(input_2, input_3); out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); vst1q_s16(final_output + 0 * 8, out_01); vst1q_s16(final_output + 1 * 8, out_23); } #if CONFIG_INTERNAL_STATS void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // stage 1 int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); for (int i = 0; i < 2; ++i) { int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; const int16x8_t v_s0 = vaddq_s16(input_0, input_7); const int16x8_t v_s1 = vaddq_s16(input_1, input_6); const int16x8_t v_s2 = vaddq_s16(input_2, input_5); const int16x8_t v_s3 = vaddq_s16(input_3, input_4); const int16x8_t v_s4 = vsubq_s16(input_3, input_4); const int16x8_t v_s5 = vsubq_s16(input_2, input_5); const int16x8_t v_s6 = vsubq_s16(input_1, input_6); const int16x8_t v_s7 = vsubq_s16(input_0, input_7); // fdct4(step, step); int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); // fdct4(step, step); int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 } // Stage 2 v_x0 = vsubq_s16(v_s6, v_s5); v_x1 = vaddq_s16(v_s6, 
v_s5); v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); const int16x8_t ab = vcombine_s16(a, b); const int16x8_t cd = vcombine_s16(c, d); // Stage 3 v_x0 = vaddq_s16(v_s4, ab); v_x1 = vsubq_s16(v_s4, ab); v_x2 = vsubq_s16(v_s7, cd); v_x3 = vaddq_s16(v_s7, cd); } // Stage 4 v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 } // transpose 8x8 { // 00 01 02 03 40 41 42 43 // 10 11 12 13 50 51 52 53 // 20 21 22 23 60 61 62 63 // 30 31 32 33 70 71 72 73 // 04 05 06 07 44 45 46 47 // 14 15 16 17 54 55 56 57 // 24 25 26 27 64 65 66 67 // 34 35 36 37 74 75 76 77 const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); const int16x8x2_t r01_s16 = vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), vreinterpretq_s16_s32(r13_s32.val[0])); const int16x8x2_t r23_s16 = vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), vreinterpretq_s16_s32(r13_s32.val[1])); const int16x8x2_t r45_s16 = vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), vreinterpretq_s16_s32(r57_s32.val[0])); const int16x8x2_t r67_s16 = 
vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), vreinterpretq_s16_s32(r57_s32.val[1])); input_0 = r01_s16.val[0]; input_1 = r01_s16.val[1]; input_2 = r23_s16.val[0]; input_3 = r23_s16.val[1]; input_4 = r45_s16.val[0]; input_5 = r45_s16.val[1]; input_6 = r67_s16.val[0]; input_7 = r67_s16.val[1]; // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // for { // from aom_dct_sse2.c // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); input_0 = vhsubq_s16(input_0, sign_in0); input_1 = vhsubq_s16(input_1, sign_in1); input_2 = vhsubq_s16(input_2, sign_in2); input_3 = vhsubq_s16(input_3, sign_in3); input_4 = vhsubq_s16(input_4, sign_in4); input_5 = vhsubq_s16(input_5, sign_in5); input_6 = vhsubq_s16(input_6, sign_in6); input_7 = vhsubq_s16(input_7, sign_in7); // store results vst1q_s16(&final_output[0 * 8], input_0); vst1q_s16(&final_output[1 * 8], input_1); vst1q_s16(&final_output[2 * 8], input_2); vst1q_s16(&final_output[3 * 8], input_3); vst1q_s16(&final_output[4 * 8], input_4); vst1q_s16(&final_output[5 * 8], input_5); vst1q_s16(&final_output[6 * 8], input_6); vst1q_s16(&final_output[7 * 8], input_7); } } #endif // CONFIG_INTERNAL_STATS aom-3.12.1/aom_dsp/arm/hadamard_neon.c000066400000000000000000000274341477627663500175400ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" static inline void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { const int16x4_t b0 = vhadd_s16(*a0, *a1); const int16x4_t b1 = vhsub_s16(*a0, *a1); const int16x4_t b2 = vhadd_s16(*a2, *a3); const int16x4_t b3 = vhsub_s16(*a2, *a3); *a0 = vadd_s16(b0, b2); *a1 = vadd_s16(b1, b3); *a2 = vsub_s16(b0, b2); *a3 = vsub_s16(b1, b3); } void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x4_t a0 = vld1_s16(src_diff); int16x4_t a1 = vld1_s16(src_diff + src_stride); int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride); int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride); hadamard_4x4_one_pass(&a0, &a1, &a2, &a3); transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3); hadamard_4x4_one_pass(&a0, &a1, &a2, &a3); store_s16_to_tran_low(coeff, a0); store_s16_to_tran_low(coeff + 4, a1); store_s16_to_tran_low(coeff + 8, a2); store_s16_to_tran_low(coeff + 12, a3); } static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { const int16x8_t b0 = vaddq_s16(*a0, *a1); const int16x8_t b1 = vsubq_s16(*a0, *a1); const int16x8_t b2 = vaddq_s16(*a2, *a3); const int16x8_t b3 = vsubq_s16(*a2, *a3); const int16x8_t b4 = vaddq_s16(*a4, *a5); const int16x8_t b5 = vsubq_s16(*a4, *a5); const int16x8_t b6 = vaddq_s16(*a6, *a7); const int16x8_t b7 = vsubq_s16(*a6, *a7); const int16x8_t c0 = vaddq_s16(b0, b2); const int16x8_t c1 = vaddq_s16(b1, b3); const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); const int16x8_t c4 = vaddq_s16(b4, b6); const int16x8_t c5 = vaddq_s16(b5, b7); const int16x8_t c6 = vsubq_s16(b4, b6); const int16x8_t c7 = vsubq_s16(b5, b7); *a0 = vaddq_s16(c0, c4); *a1 = vsubq_s16(c2, c6); *a2 = vsubq_s16(c0, c4); *a3 = vaddq_s16(c2, c6); *a4 = vaddq_s16(c3, c7); *a5 = vsubq_s16(c3, c7); *a6 = vsubq_s16(c1, c5); *a7 = vaddq_s16(c1, c5); } void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); // Skip the second transpose because it is not required. 
store_s16q_to_tran_low(coeff + 0, a0); store_s16q_to_tran_low(coeff + 8, a1); store_s16q_to_tran_low(coeff + 16, a2); store_s16q_to_tran_low(coeff + 24, a3); store_s16q_to_tran_low(coeff + 32, a4); store_s16q_to_tran_low(coeff + 40, a5); store_s16q_to_tran_low(coeff + 48, a6); store_s16q_to_tran_low(coeff + 56, a7); } void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); // Skip the second transpose because it is not required. vst1q_s16(coeff + 0, a0); vst1q_s16(coeff + 8, a1); vst1q_s16(coeff + 16, a2); vst1q_s16(coeff + 24, a3); vst1q_s16(coeff + 32, a4); vst1q_s16(coeff + 40, a5); vst1q_s16(coeff + 48, a6); vst1q_s16(coeff + 56, a7); } void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { for (int i = 0; i < 2; i++) { aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64)); } } void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { /* Rearrange 16x16 to 8x32 and remove stride. * Top left first. */ aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); /* Top right. */ aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); /* Bottom left. */ aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); /* Bottom right. */ aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (int i = 0; i < 64; i += 8) { const int16x8_t a0 = vld1q_s16(coeff + 0); const int16x8_t a1 = vld1q_s16(coeff + 64); const int16x8_t a2 = vld1q_s16(coeff + 128); const int16x8_t a3 = vld1q_s16(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); const int16x8_t b2 = vhaddq_s16(a2, a3); const int16x8_t b3 = vhsubq_s16(a2, a3); const int16x8_t c0 = vaddq_s16(b0, b2); const int16x8_t c1 = vaddq_s16(b1, b3); const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); vst1q_s16(coeff + 0, c0); vst1q_s16(coeff + 64, c1); vst1q_s16(coeff + 128, c2); vst1q_s16(coeff + 192, c3); coeff += 8; } } void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { /* Rearrange 16x16 to 8x32 and remove stride. * Top left first. */ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); /* Top right. */ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); /* Bottom left. */ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); /* Bottom right. */ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); // Each iteration of the loop operates on entire rows (16 samples each) // because we need to swap the second and third quarters of every row in the // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for // loop at the end of aom_hadamard_16x16_c. 
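// Concretely, each 16-wide row is handled as four 4-wide quarters, and the
// stores at the bottom of the loop write them back in the order 0, 2, 1, 3,
// which swaps the second and third quarters as described above.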
for (int i = 0; i < 64; i += 16) { const int32x4_t a00 = vld1q_s32(coeff + 0); const int32x4_t a01 = vld1q_s32(coeff + 64); const int32x4_t a02 = vld1q_s32(coeff + 128); const int32x4_t a03 = vld1q_s32(coeff + 192); const int32x4_t b00 = vhaddq_s32(a00, a01); const int32x4_t b01 = vhsubq_s32(a00, a01); const int32x4_t b02 = vhaddq_s32(a02, a03); const int32x4_t b03 = vhsubq_s32(a02, a03); const int32x4_t c00 = vaddq_s32(b00, b02); const int32x4_t c01 = vaddq_s32(b01, b03); const int32x4_t c02 = vsubq_s32(b00, b02); const int32x4_t c03 = vsubq_s32(b01, b03); const int32x4_t a10 = vld1q_s32(coeff + 4 + 0); const int32x4_t a11 = vld1q_s32(coeff + 4 + 64); const int32x4_t a12 = vld1q_s32(coeff + 4 + 128); const int32x4_t a13 = vld1q_s32(coeff + 4 + 192); const int32x4_t b10 = vhaddq_s32(a10, a11); const int32x4_t b11 = vhsubq_s32(a10, a11); const int32x4_t b12 = vhaddq_s32(a12, a13); const int32x4_t b13 = vhsubq_s32(a12, a13); const int32x4_t c10 = vaddq_s32(b10, b12); const int32x4_t c11 = vaddq_s32(b11, b13); const int32x4_t c12 = vsubq_s32(b10, b12); const int32x4_t c13 = vsubq_s32(b11, b13); const int32x4_t a20 = vld1q_s32(coeff + 8 + 0); const int32x4_t a21 = vld1q_s32(coeff + 8 + 64); const int32x4_t a22 = vld1q_s32(coeff + 8 + 128); const int32x4_t a23 = vld1q_s32(coeff + 8 + 192); const int32x4_t b20 = vhaddq_s32(a20, a21); const int32x4_t b21 = vhsubq_s32(a20, a21); const int32x4_t b22 = vhaddq_s32(a22, a23); const int32x4_t b23 = vhsubq_s32(a22, a23); const int32x4_t c20 = vaddq_s32(b20, b22); const int32x4_t c21 = vaddq_s32(b21, b23); const int32x4_t c22 = vsubq_s32(b20, b22); const int32x4_t c23 = vsubq_s32(b21, b23); const int32x4_t a30 = vld1q_s32(coeff + 12 + 0); const int32x4_t a31 = vld1q_s32(coeff + 12 + 64); const int32x4_t a32 = vld1q_s32(coeff + 12 + 128); const int32x4_t a33 = vld1q_s32(coeff + 12 + 192); const int32x4_t b30 = vhaddq_s32(a30, a31); const int32x4_t b31 = vhsubq_s32(a30, a31); const int32x4_t b32 = vhaddq_s32(a32, a33); const int32x4_t b33 = vhsubq_s32(a32, a33); const int32x4_t c30 = vaddq_s32(b30, b32); const int32x4_t c31 = vaddq_s32(b31, b33); const int32x4_t c32 = vsubq_s32(b30, b32); const int32x4_t c33 = vsubq_s32(b31, b33); vst1q_s32(coeff + 0 + 0, c00); vst1q_s32(coeff + 0 + 4, c20); vst1q_s32(coeff + 0 + 8, c10); vst1q_s32(coeff + 0 + 12, c30); vst1q_s32(coeff + 64 + 0, c01); vst1q_s32(coeff + 64 + 4, c21); vst1q_s32(coeff + 64 + 8, c11); vst1q_s32(coeff + 64 + 12, c31); vst1q_s32(coeff + 128 + 0, c02); vst1q_s32(coeff + 128 + 4, c22); vst1q_s32(coeff + 128 + 8, c12); vst1q_s32(coeff + 128 + 12, c32); vst1q_s32(coeff + 192 + 0, c03); vst1q_s32(coeff + 192 + 4, c23); vst1q_s32(coeff + 192 + 8, c13); vst1q_s32(coeff + 192 + 12, c33); coeff += 16; } } void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { /* Top left first. */ aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); /* Top right. */ aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, coeff + 256); /* Bottom left. */ aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, coeff + 512); /* Bottom right. 
*/ aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, coeff + 768); for (int i = 0; i < 256; i += 4) { const int32x4_t a0 = vld1q_s32(coeff); const int32x4_t a1 = vld1q_s32(coeff + 256); const int32x4_t a2 = vld1q_s32(coeff + 512); const int32x4_t a3 = vld1q_s32(coeff + 768); const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); const int32x4_t c0 = vaddq_s32(b0, b2); const int32x4_t c1 = vaddq_s32(b1, b3); const int32x4_t c2 = vsubq_s32(b0, b2); const int32x4_t c3 = vsubq_s32(b1, b3); vst1q_s32(coeff + 0, c0); vst1q_s32(coeff + 256, c1); vst1q_s32(coeff + 512, c2); vst1q_s32(coeff + 768, c3); coeff += 4; } } aom-3.12.1/aom_dsp/arm/highbd_avg_neon.c000066400000000000000000000111721477627663500200510ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_ports/mem.h" uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); uint16x4_t sum, a0, a1, a2, a3; load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3); sum = vadd_u16(a0, a1); sum = vadd_u16(sum, a2); sum = vadd_u16(sum, a3); return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4; } uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); sum = vaddq_u16(a0, a1); sum = vaddq_u16(sum, a2); sum = vaddq_u16(sum, a3); sum = vaddq_u16(sum, a4); sum = vaddq_u16(sum, a5); sum = vaddq_u16(sum, a6); sum = vaddq_u16(sum, a7); return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6; } void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max) { const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); const uint16x8_t 
abs_diff1 = vabdq_u16(a1, b1); const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); const uint16x8_t max0123 = vmaxq_u16(max01, max23); const uint16x8_t max4567 = vmaxq_u16(max45, max67); const uint16x8_t max07 = vmaxq_u16(max0123, max4567); const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); const uint16x8_t min0123 = vminq_u16(min01, min23); const uint16x8_t min4567 = vminq_u16(min45, min67); const uint16x8_t min07 = vminq_u16(min0123, min4567); #if AOM_ARCH_AARCH64 *max = (int)vmaxvq_u16(max07); *min = (int)vminvq_u16(min07); #else // Split into 64-bit vectors and execute pairwise min/max. uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u16(ab_max, ab_max); ab_min = vpmin_u16(ab_min, ab_min); ab_max = vpmax_u16(ab_max, ab_max); ab_min = vpmin_u16(ab_min, ab_min); ab_max = vpmax_u16(ab_max, ab_max); ab_min = vpmin_u16(ab_min, ab_min); *min = *max = 0; // Clear high bits // Store directly to avoid costly neon->gpr transfer. vst1_lane_u16((uint16_t *)max, ab_max, 0); vst1_lane_u16((uint16_t *)min, ab_min, 0); #endif } aom-3.12.1/aom_dsp/arm/highbd_avg_pred_neon.c000066400000000000000000000074351477627663500210720ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/dist_wtd_avg_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride) { const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); int i = height; if (width > 8) { do { int j = 0; do { const uint16x8_t p = vld1q_u16(pred + j); const uint16x8_t r = vld1q_u16(ref + j); uint16x8_t avg = vrhaddq_u16(p, r); vst1q_u16(comp_pred + j, avg); j += 8; } while (j < width); comp_pred += width; pred += width; ref += ref_stride; } while (--i != 0); } else if (width == 8) { do { const uint16x8_t p = vld1q_u16(pred); const uint16x8_t r = vld1q_u16(ref); uint16x8_t avg = vrhaddq_u16(p, r); vst1q_u16(comp_pred, avg); comp_pred += width; pred += width; ref += ref_stride; } while (--i != 0); } else { assert(width == 4); do { const uint16x4_t p = vld1_u16(pred); const uint16x4_t r = vld1_u16(ref); uint16x4_t avg = vrhadd_u16(p, r); vst1_u16(comp_pred, avg); comp_pred += width; pred += width; ref += ref_stride; } while (--i != 0); } } void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int src_stride0 = invert_mask ? width : ref_stride; const int src_stride1 = invert_mask ? ref_stride : width; if (width >= 8) { do { int j = 0; do { const uint16x8_t s0 = vld1q_u16(src0 + j); const uint16x8_t s1 = vld1q_u16(src1 + j); const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j)); uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1); vst1q_u16(comp_pred + j, blend_u16); j += 8; } while (j < width); src0 += src_stride0; src1 += src_stride1; mask += mask_stride; comp_pred += width; } while (--height != 0); } else { assert(width == 4); do { const uint16x4_t s0 = vld1_u16(src0); const uint16x4_t s1 = vld1_u16(src1); const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask))); uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1); vst1_u16(comp_pred, blend_u16); src0 += src_stride0; src1 += src_stride1; mask += mask_stride; comp_pred += 4; } while (--height != 0); } } aom-3.12.1/aom_dsp/arm/highbd_blend_a64_hmask_neon.c000066400000000000000000000061041477627663500222140ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { (void)bd; const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if (w >= 8) { do { int i = 0; do { uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 4) { const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask)); do { uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 2 && h >= 8) { const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask))); do { uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride); uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride); uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); store_u16x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, w, h, bd); } } aom-3.12.1/aom_dsp/arm/highbd_blend_a64_mask_neon.c000066400000000000000000000623071477627663500220530ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" #define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \ static inline uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \ const uint16x8_t m_inv = \ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \ \ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \ vget_low_u16(m), vget_low_u16(a)); \ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \ vget_high_u16(m), vget_high_u16(a)); \ \ blend_u32_lo = \ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \ blend_u32_hi = \ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \ \ uint16x4_t blend_u16_lo = \ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \ round0_bits - COMPOUND_ROUND1_BITS); \ uint16x4_t blend_u16_hi = \ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \ round0_bits - COMPOUND_ROUND1_BITS); \ \ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \ \ return blend_u16; \ } \ \ static inline void highbd_##bd##_blend_a64_d16_mask_neon( \ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \ int subh) { \ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \ int32x4_t offset = \ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \ \ if ((subw | subh) == 0) { \ if (w >= 8) { \ do { \ int i = 0; \ do { \ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \ uint16x8_t s0 = vld1q_u16(src0 + i); \ uint16x8_t s1 = vld1q_u16(src1 + i); \ \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ \ vst1q_u16(dst + i, blend); \ i += 8; \ } while (i < w); \ \ mask += mask_stride; \ src0 += src0_stride; \ src1 += src1_stride; \ dst += dst_stride; \ } while (--h != 0); \ } else { \ do { \ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ \ store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ src1 += 2 * src1_stride; \ dst += 2 * dst_stride; \ h -= 2; \ } while (h != 0); \ } \ } else if ((subw & subh) == 1) { \ if (w >= 8) { \ do { \ int i = 0; \ do { \ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \ uint16x8_t s0 = vld1q_u16(src0 + i); \ uint16x8_t s1 = vld1q_u16(src1 + i); \ \ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \ vget_high_u8(m1))); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ vst1q_u16(dst + i, blend); \ i += 8; \ } while (i < w); \ \ mask += 2 * mask_stride; \ src0 += src0_stride; \ src1 += src1_stride; \ dst += dst_stride; \ } while (--h != 0); \ } else { \ do { \ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); 
\ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ \ uint16x8_t m_avg = \ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ src1 += 2 * src1_stride; \ dst += 2 * dst_stride; \ h -= 2; \ } while (h != 0); \ } \ } else if (subw == 1 && subh == 0) { \ if (w >= 8) { \ do { \ int i = 0; \ do { \ uint8x8_t m0 = vld1_u8(mask + 2 * i); \ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \ uint16x8_t s0 = vld1q_u16(src0 + i); \ uint16x8_t s1 = vld1q_u16(src1 + i); \ \ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ vst1q_u16(dst + i, blend); \ i += 8; \ } while (i < w); \ \ mask += mask_stride; \ src0 += src0_stride; \ src1 += src1_stride; \ dst += dst_stride; \ } while (--h != 0); \ } else { \ do { \ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ \ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ src1 += 2 * src1_stride; \ dst += 2 * dst_stride; \ h -= 2; \ } while (h != 0); \ } \ } else { \ if (w >= 8) { \ do { \ int i = 0; \ do { \ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \ uint16x8_t s0 = vld1q_u16(src0 + i); \ uint16x8_t s1 = vld1q_u16(src1 + i); \ \ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ vst1q_u16(dst + i, blend); \ i += 8; \ } while (i < w); \ \ mask += 2 * mask_stride; \ src0 += src0_stride; \ src1 += src1_stride; \ dst += dst_stride; \ } while (--h != 0); \ } else { \ do { \ uint8x8_t m0_2 = \ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \ uint8x8_t m1_3 = \ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ \ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ src1 += 2 * src1_stride; \ dst += 2 * dst_stride; \ h -= 2; \ } while (h != 0); \ } \ } \ } // 12 bitdepth HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2)) // 10 bitdepth HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS) // 8 bitdepth HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS) void aom_highbd_blend_a64_d16_mask_neon( uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd) { (void)conv_params; assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); 
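// The three bit-depth specializations called below are instantiated by the
// HBD_BLEND_A64_D16_MASK macro above; they differ in the rounding offset,
// the narrowing shift and the clamp maximum, all derived from bd.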
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); if (bd == 12) { highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh); } else if (bd == 10) { highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh); } else { highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh); } } void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd) { (void)bd; const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if ((subw | subh) == 0) { if (w >= 8) { do { int i = 0; do { uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if ((subw & subh) == 1) { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i); uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8); uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else if (subw == 1 && subh == 0) { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 2 * i); uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = 
vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } else { if (w >= 8) { do { int i = 0; do { uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else { do { uint8x8_t m0_2 = load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); uint8x8_t m1_3 = load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } } } aom-3.12.1/aom_dsp/arm/highbd_blend_a64_vmask_neon.c000066400000000000000000000065201477627663500222340ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { (void)bd; const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if (w >= 8) { do { uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0])); int i = 0; do { uint16x8_t s0 = vld1q_u16(src0 + i); uint16x8_t s1 = vld1q_u16(src1 + i); uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); vst1q_u16(dst + i, blend); i += 8; } while (i < w); mask += 1; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 4) { do { uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]); uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]); uint16x8_t m = vcombine_u16(m1, m2); uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 2 && h >= 8) { do { uint16x4_t m0 = vdup_n_u16(0); m0 = vld1_lane_u16((uint16_t *)mask, m0, 0); uint8x8_t m0_zip = vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0]; m0 = vget_low_u16(vmovl_u8(m0_zip)); uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride); uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride); uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); store_u16x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, w, h, bd); } } aom-3.12.1/aom_dsp/arm/highbd_convolve8_neon.c000066400000000000000000000341451477627663500212240ustar00rootroot00000000000000/* * Copyright (c) 2014 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" static inline uint16x4_t highbd_convolve8_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const uint16x4_t max) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int32x4_t sum = vmull_lane_s16(s0, filter_lo, 0); sum = vmlal_lane_s16(sum, s1, filter_lo, 1); sum = vmlal_lane_s16(sum, s2, filter_lo, 2); sum = vmlal_lane_s16(sum, s3, filter_lo, 3); sum = vmlal_lane_s16(sum, s4, filter_hi, 0); sum = vmlal_lane_s16(sum, s5, filter_hi, 1); sum = vmlal_lane_s16(sum, s6, filter_hi, 2); sum = vmlal_lane_s16(sum, s7, filter_hi, 3); uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const uint16x8_t max) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_lo, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_hi, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_hi, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_lo, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_hi, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_hi, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), vqrshrun_n_s32(sum1, FILTER_BITS)); return vminq_u16(res, max); } static void highbd_convolve_horiz_8tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x4_t s0[8], s1[8], s2[8], s3[8]; load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], 
s0[5], s0[6], s0[7], x_filter, max); uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, max); uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, max); uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, max); uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, max); uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, max); uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 0); } } static void highbd_convolve_horiz_4tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], x_filter, max); uint16x4_t d1 = highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], x_filter, max); uint16x4_t d2 = highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], x_filter, max); uint16x4_t d3 = highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], x_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], x_filter, max); uint16x8_t d1 = highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], x_filter, max); uint16x8_t d2 = highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], 
x_filter, max); uint16x8_t d3 = highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], x_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 0); } } void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { (void)filter_y; (void)y_step_q4; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; const int filter_taps = get_filter_taps_convolve8(filter_x); if (filter_taps == 2) { highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w, h, bd); } else if (filter_taps == 4) { highbd_convolve_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w, h, bd); } else { highbd_convolve_horiz_8tap_neon(src, src_stride, dst, dst_stride, filter_x, w, h, bd); } } } static void highbd_convolve_vert_8tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height > 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 0); } } void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, 
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { (void)filter_x; (void)x_step_q4; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= (SUBPEL_TAPS / 2 - 1) * src_stride; const int filter_taps = get_filter_taps_convolve8(filter_y); if (filter_taps == 2) { highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, filter_y, w, h, bd); } else if (filter_taps == 4) { highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, w, h, bd); } else { highbd_convolve_vert_8tap_neon(src, src_stride, dst, dst_stride, filter_y, w, h, bd); } } } aom-3.12.1/aom_dsp/arm/highbd_convolve8_neon.h000066400000000000000000000224451477627663500212310ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ #define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ #include #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" static inline void highbd_convolve8_horiz_2tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { // Bilinear filter values are all positive and multiples of 8. Divide by 8 to // reduce intermediate precision requirements and allow the use of non // widening multiply. const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { do { uint16x8_t s0 = load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride); uint16x8_t s2 = load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride); uint16x8_t s3 = load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride); uint16x8_t sum01 = vmulq_u16(s0, f0); sum01 = vmlaq_u16(sum01, s1, f1); uint16x8_t sum23 = vmulq_u16(s2, f0); sum23 = vmlaq_u16(sum23, s3, f1); // We divided filter taps by 8 so subtract 3 from right shift. sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); sum01 = vminq_u16(sum01, max); sum23 = vminq_u16(sum23, max); store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 0); } else { do { int width = w; const uint16_t *s = src_ptr; uint16_t *d = dst_ptr; do { uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0); uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1); uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0); uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1); uint16x8_t sum01 = vmulq_u16(s0, f0); sum01 = vmlaq_u16(sum01, s1, f1); uint16x8_t sum23 = vmulq_u16(s2, f0); sum23 = vmlaq_u16(sum23, s3, f1); // We divided filter taps by 8 so subtract 3 from right shift. 
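// Scaling the taps down by 2^3 keeps the products within 16 bits for the
// non-widening multiplies above; rounding with FILTER_BITS - 3 compensates
// for that scaling.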
sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); sum01 = vminq_u16(sum01, max); sum23 = vminq_u16(sum23, max); vst1q_u16(d + 0 * dst_stride, sum01); vst1q_u16(d + 1 * dst_stride, sum23); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 2 * src_stride; dst_ptr += 2 * dst_stride; h -= 2; } while (h > 0); } } static inline uint16x4_t highbd_convolve4_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t filter, const uint16x4_t max) { int32x4_t sum = vmull_lane_s16(s0, filter, 0); sum = vmlal_lane_s16(sum, s1, filter, 1); sum = vmlal_lane_s16(sum, s2, filter, 2); sum = vmlal_lane_s16(sum, s3, filter, 3); uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve4_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const uint16x8_t max) { int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filter, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filter, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), vqrshrun_n_s32(sum1, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve8_vert_4tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *y_filter_ptr, int w, int h, int bd) { assert(w >= 4 && h >= 4); const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, max); uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, max); uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, max); uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, max); uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, max); uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, max); uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height > 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 0); } } static inline void highbd_convolve8_vert_2tap_neon( const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) { // Bilinear filter 
values are all positive and multiples of 8. Divide by 8 to // reduce intermediate precision requirements and allow the use of non // widening multiply. const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8); const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { do { uint16x8_t s0 = load_unaligned_u16_4x2(src_ptr + 0 * src_stride, (int)src_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src_ptr + 1 * src_stride, (int)src_stride); uint16x8_t s2 = load_unaligned_u16_4x2(src_ptr + 2 * src_stride, (int)src_stride); uint16x8_t s3 = load_unaligned_u16_4x2(src_ptr + 3 * src_stride, (int)src_stride); uint16x8_t sum01 = vmulq_u16(s0, f0); sum01 = vmlaq_u16(sum01, s1, f1); uint16x8_t sum23 = vmulq_u16(s2, f0); sum23 = vmlaq_u16(sum23, s3, f1); // We divided filter taps by 8 so subtract 3 from right shift. sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); sum01 = vminq_u16(sum01, max); sum23 = vminq_u16(sum23, max); store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01); store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 0); } else { do { int width = w; const uint16_t *s = src_ptr; uint16_t *d = dst_ptr; do { uint16x8_t s0, s1, s2; load_u16_8x3(s, src_stride, &s0, &s1, &s2); uint16x8_t sum01 = vmulq_u16(s0, f0); sum01 = vmlaq_u16(sum01, s1, f1); uint16x8_t sum23 = vmulq_u16(s1, f0); sum23 = vmlaq_u16(sum23, s2, f1); // We divided filter taps by 8 so subtract 3 from right shift. sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3); sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3); sum01 = vminq_u16(sum01, max); sum23 = vminq_u16(sum23, max); vst1q_u16(d + 0 * dst_stride, sum01); vst1q_u16(d + 1 * dst_stride, sum23); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 2 * src_stride; dst_ptr += 2 * dst_stride; h -= 2; } while (h > 0); } } #endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ aom-3.12.1/aom_dsp/arm/highbd_convolve8_sve.c000066400000000000000000000524431477627663500210630ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/aom_filter.h" #include "aom_dsp/arm/highbd_convolve8_neon.h" #include "aom_dsp/arm/mem_neon.h" static inline uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, uint16x4_t max) { int64x2_t sum[4]; sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, uint16x8_t max) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter); sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter); sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter); sum[7] = aom_sdotq_s16(vdupq_n_s64(0), s[7], filter); int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), vqrshrun_n_s32(sum4567, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve8_horiz_8tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, int bd) { const int16x8_t filter = vld1q_s16(filter_x); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src; uint16_t *d = dst; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max); uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max); uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max); uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height > 0); } else { do { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max); uint16x8_t d1 = highbd_convolve8_8_h(s1, 
filter, max); uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max); uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } // clang-format off DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[16]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, }; DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { 0, 2, 4, 6, 1, 3, 5, 7, }; // clang-format on static inline uint16x4_t highbd_convolve4_4_h(int16x8_t s, int16x8_t filter, uint16x8x2_t permute_tbl, uint16x4_t max) { int16x8_t permuted_samples0 = aom_tbl_s16(s, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s, permute_tbl.val[1]); int64x2_t sum0 = aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples0, filter, 0); int64x2_t sum1 = aom_svdot_lane_s16(vdupq_n_s64(0), permuted_samples1, filter, 0); int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve4_8_h(int16x8_t s[4], int16x8_t filter, uint16x8_t idx, uint16x8_t max) { int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s[3], filter, 0); int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), vqrshrun_n_s32(res1, FILTER_BITS)); res = aom_tbl_u16(res, idx); return vminq_u16(res, max); } static inline void highbd_convolve8_horiz_4tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int width, int height, int bd) { const int16x8_t filter = vcombine_s16(vld1_s16(filter_x + 2), vdup_n_s16(0)); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)src; uint16_t *d = dst; do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = highbd_convolve4_4_h(s0, filter, permute_tbl, max); uint16x4_t d1 = highbd_convolve4_4_h(s1, filter, permute_tbl, max); uint16x4_t d2 = highbd_convolve4_4_h(s2, filter, permute_tbl, max); uint16x4_t d3 = highbd_convolve4_4_h(s3, filter, permute_tbl, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height > 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_convolve4_8_h(s0, filter, idx, max); uint16x8_t d1 = highbd_convolve4_8_h(s1, filter, idx, max); uint16x8_t d2 = highbd_convolve4_8_h(s2, filter, idx, max); uint16x8_t d3 = highbd_convolve4_8_h(s3, filter, idx, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } 
while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int width, int height, int bd) { assert(x_step_q4 == 16); assert(width >= 4 && height >= 4); (void)filter_y; (void)x_step_q4; (void)y_step_q4; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; const int filter_taps = get_filter_taps_convolve8(filter_x); if (filter_taps == 2) { highbd_convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, width, height, bd); } else if (filter_taps == 4) { highbd_convolve8_horiz_4tap_sve(src + 2, src_stride, dst, dst_stride, filter_x, width, height, bd); } else { highbd_convolve8_horiz_8tap_sve(src, src_stride, dst, dst_stride, filter_x, width, height, bd); } } DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25, // Shift left and insert two new columns in transposed 4x4 block. 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27, // Shift left and insert three new columns in transposed 4x4 block. 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29 }; static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, int16x8_t res[2]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03 // s1: 10, 11, 12, 13 // s2: 20, 21, 22, 23 // s3: 30, 31, 32, 33 // // res[0]: 00 10 20 30 01 11 21 31 // res[1]: 02 12 22 32 03 13 23 33 int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); int32x4x2_t s0123 = vzipq_s32(s01, s23); res[0] = vreinterpretq_s16_s32(s0123.val[0]); res[1] = vreinterpretq_s16_s32(s0123.val[1]); } static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t s3, int16x8_t res[4]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03, 04, 05, 06, 07 // s1: 10, 11, 12, 13, 14, 15, 16, 17 // s2: 20, 21, 22, 23, 24, 25, 26, 27 // s3: 30, 31, 32, 33, 34, 35, 36, 37 // // res_lo[0]: 00 10 20 30 01 11 21 31 // res_lo[1]: 02 12 22 32 03 13 23 33 // res_hi[0]: 04 14 24 34 05 15 25 35 // res_hi[1]: 06 16 26 36 07 17 27 37 int16x8x2_t tr01_16 = vzipq_s16(s0, s1); int16x8x2_t tr23_16 = vzipq_s16(s2, s3); int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), vreinterpretq_s32_s16(tr23_16.val[0])); int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), vreinterpretq_s32_s16(tr23_16.val[1])); res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); } static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], uint8x16_t tbl, int16x8_t res[4]) { int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), vreinterpretq_s8_s16(t1[0]) }; int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), vreinterpretq_s8_s16(t1[1]) }; int8x16x2_t samples2 = { 
vreinterpretq_s8_s16(t0[2]), vreinterpretq_s8_s16(t1[2]) }; int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]), vreinterpretq_s8_s16(t1[3]) }; res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl)); res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl)); } static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], uint8x16_t tbl, int16x8_t res[2]) { int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), vreinterpretq_s8_s16(t1[0]) }; int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), vreinterpretq_s8_s16(t1[1]) }; res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); } static inline uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, uint16x4_t max) { int64x2_t sum[2]; sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, uint16x8_t max) { int64x2_t sum[4]; sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1); sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1); int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3])); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), vqrshrun_n_s32(res1, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve8_vert_8tap_sve( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_y, int width, int height, int bd) { const int16x8_t y_filter = vld1q_s16(filter_y); uint8x16_t merge_block_tbl[3]; merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s78910[2]; // Transpose and shuffle the 4 lines that were loaded. 
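    // The kDotProdMergeBlockTbl lookups that follow slide the previous
    // transposed block (s3456) left by one, two or three 16-bit columns and
    // splice in the leading columns of this newly transposed block, so only
    // one 4x4 transpose is needed per four output rows.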
transpose_concat_4x4(s7, s8, s9, s10, s78910); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567); aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678); aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789); uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max); uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max); uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max); uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s78910[0]; s3456[1] = s78910[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s78910[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s78910); // Merge new data into block from previous iteration. aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567); aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678); aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789); uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max); uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max); uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max); uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s78910[0]; s3456[1] = s78910[1]; s3456[2] = s78910[2]; s3456[3] = s78910[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int width, int height, int bd) { assert(y_step_q4 == 16); assert(width >= 4 && height >= 4); (void)filter_x; (void)y_step_q4; (void)x_step_q4; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= (SUBPEL_TAPS / 2 - 1) * src_stride; const int filter_taps = get_filter_taps_convolve8(filter_y); if (filter_taps == 2) { highbd_convolve8_vert_2tap_neon(src + 3 * src_stride, src_stride, dst, dst_stride, filter_y, width, height, bd); } else if (filter_taps == 4) { highbd_convolve8_vert_4tap_neon(src + 2 * src_stride, src_stride, dst, dst_stride, filter_y, width, height, bd); } else { highbd_convolve8_vert_8tap_sve(src, src_stride, dst, dst_stride, filter_y, width, height, bd); } } aom-3.12.1/aom_dsp/arm/highbd_hadamard_neon.c000066400000000000000000000172251477627663500210420ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#include <arm_neon.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_ports/mem.h"

static inline void hadamard_highbd_col8_first_pass(
    int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4,
    int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) {
  int16x8_t b0 = vaddq_s16(*a0, *a1);
  int16x8_t b1 = vsubq_s16(*a0, *a1);
  int16x8_t b2 = vaddq_s16(*a2, *a3);
  int16x8_t b3 = vsubq_s16(*a2, *a3);
  int16x8_t b4 = vaddq_s16(*a4, *a5);
  int16x8_t b5 = vsubq_s16(*a4, *a5);
  int16x8_t b6 = vaddq_s16(*a6, *a7);
  int16x8_t b7 = vsubq_s16(*a6, *a7);

  int16x8_t c0 = vaddq_s16(b0, b2);
  int16x8_t c2 = vsubq_s16(b0, b2);
  int16x8_t c1 = vaddq_s16(b1, b3);
  int16x8_t c3 = vsubq_s16(b1, b3);
  int16x8_t c4 = vaddq_s16(b4, b6);
  int16x8_t c6 = vsubq_s16(b4, b6);
  int16x8_t c5 = vaddq_s16(b5, b7);
  int16x8_t c7 = vsubq_s16(b5, b7);

  *a0 = vaddq_s16(c0, c4);
  *a2 = vsubq_s16(c0, c4);
  *a7 = vaddq_s16(c1, c5);
  *a6 = vsubq_s16(c1, c5);
  *a3 = vaddq_s16(c2, c6);
  *a1 = vsubq_s16(c2, c6);
  *a4 = vaddq_s16(c3, c7);
  *a5 = vsubq_s16(c3, c7);
}

static inline void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
                                                    int16x4_t a2, int16x4_t a3,
                                                    int16x4_t a4, int16x4_t a5,
                                                    int16x4_t a6, int16x4_t a7,
                                                    tran_low_t *coeff) {
  int32x4_t b0 = vaddl_s16(a0, a1);
  int32x4_t b1 = vsubl_s16(a0, a1);
  int32x4_t b2 = vaddl_s16(a2, a3);
  int32x4_t b3 = vsubl_s16(a2, a3);
  int32x4_t b4 = vaddl_s16(a4, a5);
  int32x4_t b5 = vsubl_s16(a4, a5);
  int32x4_t b6 = vaddl_s16(a6, a7);
  int32x4_t b7 = vsubl_s16(a6, a7);

  int32x4_t c0 = vaddq_s32(b0, b2);
  int32x4_t c2 = vsubq_s32(b0, b2);
  int32x4_t c1 = vaddq_s32(b1, b3);
  int32x4_t c3 = vsubq_s32(b1, b3);
  int32x4_t c4 = vaddq_s32(b4, b6);
  int32x4_t c6 = vsubq_s32(b4, b6);
  int32x4_t c5 = vaddq_s32(b5, b7);
  int32x4_t c7 = vsubq_s32(b5, b7);

  int32x4_t d0 = vaddq_s32(c0, c4);
  int32x4_t d2 = vsubq_s32(c0, c4);
  int32x4_t d7 = vaddq_s32(c1, c5);
  int32x4_t d6 = vsubq_s32(c1, c5);
  int32x4_t d3 = vaddq_s32(c2, c6);
  int32x4_t d1 = vsubq_s32(c2, c6);
  int32x4_t d4 = vaddq_s32(c3, c7);
  int32x4_t d5 = vsubq_s32(c3, c7);

  vst1q_s32(coeff + 0, d0);
  vst1q_s32(coeff + 4, d1);
  vst1q_s32(coeff + 8, d2);
  vst1q_s32(coeff + 12, d3);
  vst1q_s32(coeff + 16, d4);
  vst1q_s32(coeff + 20, d5);
  vst1q_s32(coeff + 24, d6);
  vst1q_s32(coeff + 28, d7);
}

void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff,
                                  ptrdiff_t src_stride, tran_low_t *coeff) {
  int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;

  int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
  int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
  int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
  int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
  int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
  int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
  int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
  int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);

  // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
  hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

  transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

  // For the second pass we need to widen to 32-bit elements, so we're
  // processing 4 columns at a time.
  // Skip the second transpose because it is not required.
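  // (Leaving the coefficients in transposed order appears acceptable here,
  // since the consumers of these outputs - absolute-value/SATD-style sums and
  // the uniform 16x16/32x32 combines below - treat every coefficient position
  // the same way, so the ordering does not change the result.)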
b0 = vget_low_s16(s0); b1 = vget_low_s16(s1); b2 = vget_low_s16(s2); b3 = vget_low_s16(s3); b4 = vget_low_s16(s4); b5 = vget_low_s16(s5); b6 = vget_low_s16(s6); b7 = vget_low_s16(s7); hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); b0 = vget_high_s16(s0); b1 = vget_high_s16(s1); b2 = vget_high_s16(s2); b3 = vget_high_s16(s3); b4 = vget_high_s16(s4); b5 = vget_high_s16(s5); b6 = vget_high_s16(s6); b7 = vget_high_s16(s7); hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); } void aom_highbd_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { // Rearrange 16x16 to 8x32 and remove stride. // Top left first. aom_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); // Top right. aom_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); // Bottom left. aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, coeff + 128); // Bottom right. aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, coeff + 192); for (int i = 0; i < 16; i++) { int32x4_t a0 = vld1q_s32(coeff + 4 * i); int32x4_t a1 = vld1q_s32(coeff + 4 * i + 64); int32x4_t a2 = vld1q_s32(coeff + 4 * i + 128); int32x4_t a3 = vld1q_s32(coeff + 4 * i + 192); int32x4_t b0 = vhaddq_s32(a0, a1); int32x4_t b1 = vhsubq_s32(a0, a1); int32x4_t b2 = vhaddq_s32(a2, a3); int32x4_t b3 = vhsubq_s32(a2, a3); int32x4_t c0 = vaddq_s32(b0, b2); int32x4_t c1 = vaddq_s32(b1, b3); int32x4_t c2 = vsubq_s32(b0, b2); int32x4_t c3 = vsubq_s32(b1, b3); vst1q_s32(coeff + 4 * i, c0); vst1q_s32(coeff + 4 * i + 64, c1); vst1q_s32(coeff + 4 * i + 128, c2); vst1q_s32(coeff + 4 * i + 192, c3); } } void aom_highbd_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { // Rearrange 32x32 to 16x64 and remove stride. // Top left first. aom_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); // Top right. aom_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); // Bottom left. aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, coeff + 512); // Bottom right. aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, coeff + 768); for (int i = 0; i < 64; i++) { int32x4_t a0 = vld1q_s32(coeff + 4 * i); int32x4_t a1 = vld1q_s32(coeff + 4 * i + 256); int32x4_t a2 = vld1q_s32(coeff + 4 * i + 512); int32x4_t a3 = vld1q_s32(coeff + 4 * i + 768); int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); int32x4_t c0 = vaddq_s32(b0, b2); int32x4_t c1 = vaddq_s32(b1, b3); int32x4_t c2 = vsubq_s32(b0, b2); int32x4_t c3 = vsubq_s32(b1, b3); vst1q_s32(coeff + 4 * i, c0); vst1q_s32(coeff + 4 * i + 256, c1); vst1q_s32(coeff + 4 * i + 512, c2); vst1q_s32(coeff + 4 * i + 768, c3); } } aom-3.12.1/aom_dsp/arm/highbd_intrapred_neon.c000066400000000000000000003601711477627663500212720ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- // DC static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x4_t dc) { for (int i = 0; i < h; ++i) { vst1_u16(dst + i * stride, dc); } } static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); } } static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); vst1q_u16(dst + i * stride + 8, dc); } } static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); vst1q_u16(dst + i * stride + 8, dc); vst1q_u16(dst + i * stride + 16, dc); vst1q_u16(dst + i * stride + 24, dc); } } static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h, uint16x8_t dc) { for (int i = 0; i < h; ++i) { vst1q_u16(dst + i * stride, dc); vst1q_u16(dst + i * stride + 8, dc); vst1q_u16(dst + i * stride + 16, dc); vst1q_u16(dst + i * stride + 24, dc); vst1q_u16(dst + i * stride + 32, dc); vst1q_u16(dst + i * stride + 40, dc); vst1q_u16(dst + i * stride + 48, dc); vst1q_u16(dst + i * stride + 56, dc); } } static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) { // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so // promote first. const uint32x4_t b = vpaddlq_u16(a); #if AOM_ARCH_AARCH64 const uint32x4_t c = vpaddq_u32(b, b); return vpaddq_u32(c, c); #else const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b)); const uint32x2_t d = vpadd_u32(c, c); return vcombine_u32(d, d); #endif } static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) { // Nothing to do since sum is already one vector, but saves needing to // special case w=4 or h=4 cases. The combine will be zero cost for a sane // compiler since vld1 already sets the top half of a vector to zero as part // of the operation. return vcombine_u16(vld1_u16(left), vdup_n_u16(0)); } static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) { // Nothing to do since sum is already one vector, but saves needing to // special case w=8 or h=8 cases. 
return vld1q_u16(left); } static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); return vaddq_u16(a0, a1); // up to 13 bits } static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); const uint16x8_t a2 = vld1q_u16(left + 16); const uint16x8_t a3 = vld1q_u16(left + 24); const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits const uint16x8_t b1 = vaddq_u16(a2, a3); return vaddq_u16(b0, b1); // up to 14 bits } static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) { const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits const uint16x8_t a1 = vld1q_u16(left + 8); const uint16x8_t a2 = vld1q_u16(left + 16); const uint16x8_t a3 = vld1q_u16(left + 24); const uint16x8_t a4 = vld1q_u16(left + 32); const uint16x8_t a5 = vld1q_u16(left + 40); const uint16x8_t a6 = vld1q_u16(left + 48); const uint16x8_t a7 = vld1q_u16(left + 56); const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits const uint16x8_t b1 = vaddq_u16(a2, a3); const uint16x8_t b2 = vaddq_u16(a4, a5); const uint16x8_t b3 = vaddq_u16(a6, a7); const uint16x8_t c0 = vaddq_u16(b0, b1); // up to 14 bits const uint16x8_t c1 = vaddq_u16(b2, b3); return vaddq_u16(c0, c1); // up to 15 bits } #define HIGHBD_DC_PREDICTOR(w, h, shift) \ void aom_highbd_dc_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \ const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \ const uint32x4_t sum = \ horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \ const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \ highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \ } void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { // In the rectangular cases we simply extend the shorter vector to uint16x8 // in order to accumulate, however in the 4x4 case there is no shorter vector // to extend so it is beneficial to do the whole calculation in uint16x4 // instead. 
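  // Concretely: dc = (above[0] + ... + above[3] + left[0] + ... + left[3] + 4)
  // >> 3. With at most 12-bit input the eight-sample sum is <= 8 * 4095 =
  // 32760, so the uint16 pairwise additions below cannot overflow.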
(void)bd; const uint16x4_t a = vld1_u16(above); // up to 12 bits const uint16x4_t l = vld1_u16(left); uint16x4_t sum = vpadd_u16(a, l); // up to 13 bits sum = vpadd_u16(sum, sum); // up to 14 bits sum = vpadd_u16(sum, sum); const uint16x4_t dc = vrshr_n_u16(sum, 3); highbd_dc_store_4xh(dst, stride, 4, dc); } HIGHBD_DC_PREDICTOR(8, 8, 4) HIGHBD_DC_PREDICTOR(16, 16, 5) HIGHBD_DC_PREDICTOR(32, 32, 6) HIGHBD_DC_PREDICTOR(64, 64, 7) #undef HIGHBD_DC_PREDICTOR static inline int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; } #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB #define HIGHBD_DC_MULTIPLIER_1X4 0x6667 #define HIGHBD_DC_SHIFT2 17 static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1, uint32_t multiplier) { return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); } #undef HIGHBD_DC_SHIFT2 #define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \ void aom_highbd_dc_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \ uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \ uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \ int sum = horizontal_add_u16x8(sum_vec); \ int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4) HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2) #else HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2) HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_DC_PREDICTOR_RECT #undef HIGHBD_DC_MULTIPLIER_1X2 #undef HIGHBD_DC_MULTIPLIER_1X4 // ----------------------------------------------------------------------------- // DC_128 #define HIGHBD_DC_PREDICTOR_128(w, h, q) \ void aom_highbd_dc_128_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)above; \ (void)bd; \ (void)left; \ highbd_dc_store_##w##xh(dst, stride, (h), \ vdup##q##_n_u16(0x80 << (bd - 8))); \ 
} #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_DC_PREDICTOR_128(4, 4, ) HIGHBD_DC_PREDICTOR_128(4, 8, ) HIGHBD_DC_PREDICTOR_128(4, 16, ) HIGHBD_DC_PREDICTOR_128(8, 4, q) HIGHBD_DC_PREDICTOR_128(8, 8, q) HIGHBD_DC_PREDICTOR_128(8, 16, q) HIGHBD_DC_PREDICTOR_128(8, 32, q) HIGHBD_DC_PREDICTOR_128(16, 4, q) HIGHBD_DC_PREDICTOR_128(16, 8, q) HIGHBD_DC_PREDICTOR_128(16, 16, q) HIGHBD_DC_PREDICTOR_128(16, 32, q) HIGHBD_DC_PREDICTOR_128(16, 64, q) HIGHBD_DC_PREDICTOR_128(32, 8, q) HIGHBD_DC_PREDICTOR_128(32, 16, q) HIGHBD_DC_PREDICTOR_128(32, 32, q) HIGHBD_DC_PREDICTOR_128(32, 64, q) HIGHBD_DC_PREDICTOR_128(64, 16, q) HIGHBD_DC_PREDICTOR_128(64, 32, q) HIGHBD_DC_PREDICTOR_128(64, 64, q) #else HIGHBD_DC_PREDICTOR_128(4, 4, ) HIGHBD_DC_PREDICTOR_128(4, 8, ) HIGHBD_DC_PREDICTOR_128(8, 4, q) HIGHBD_DC_PREDICTOR_128(8, 8, q) HIGHBD_DC_PREDICTOR_128(8, 16, q) HIGHBD_DC_PREDICTOR_128(16, 8, q) HIGHBD_DC_PREDICTOR_128(16, 16, q) HIGHBD_DC_PREDICTOR_128(16, 32, q) HIGHBD_DC_PREDICTOR_128(32, 16, q) HIGHBD_DC_PREDICTOR_128(32, 32, q) HIGHBD_DC_PREDICTOR_128(32, 64, q) HIGHBD_DC_PREDICTOR_128(64, 32, q) HIGHBD_DC_PREDICTOR_128(64, 64, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_DC_PREDICTOR_128 // ----------------------------------------------------------------------------- // DC_LEFT static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) { const uint16x4_t a = vld1_u16(left); // up to 12 bits const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0)); } static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left)); } static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_16(left)); } static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_32(left)); } static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) { return horizontal_add_and_broadcast_long_u16x8( highbd_dc_load_partial_sum_64(left)); } #define DC_PREDICTOR_LEFT(w, h, shift, q) \ void aom_highbd_dc_left_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)above; \ (void)bd; \ const uint32x4_t sum = highbd_dc_load_sum_##h(left); \ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_LEFT(4, 4, 2, ) DC_PREDICTOR_LEFT(4, 8, 3, ) DC_PREDICTOR_LEFT(4, 16, 4, ) DC_PREDICTOR_LEFT(8, 4, 2, q) DC_PREDICTOR_LEFT(8, 8, 3, q) DC_PREDICTOR_LEFT(8, 16, 4, q) DC_PREDICTOR_LEFT(8, 32, 5, q) DC_PREDICTOR_LEFT(16, 4, 2, q) DC_PREDICTOR_LEFT(16, 8, 3, q) DC_PREDICTOR_LEFT(16, 16, 4, q) DC_PREDICTOR_LEFT(16, 32, 5, q) DC_PREDICTOR_LEFT(16, 64, 6, q) DC_PREDICTOR_LEFT(32, 8, 3, q) DC_PREDICTOR_LEFT(32, 16, 4, q) DC_PREDICTOR_LEFT(32, 32, 5, q) DC_PREDICTOR_LEFT(32, 64, 6, q) DC_PREDICTOR_LEFT(64, 16, 4, q) DC_PREDICTOR_LEFT(64, 32, 5, q) DC_PREDICTOR_LEFT(64, 64, 6, q) #else DC_PREDICTOR_LEFT(4, 4, 2, ) DC_PREDICTOR_LEFT(4, 8, 3, ) DC_PREDICTOR_LEFT(8, 4, 2, q) DC_PREDICTOR_LEFT(8, 8, 3, q) DC_PREDICTOR_LEFT(8, 16, 4, q) DC_PREDICTOR_LEFT(16, 8, 3, q) DC_PREDICTOR_LEFT(16, 16, 4, q) DC_PREDICTOR_LEFT(16, 32, 5, q) DC_PREDICTOR_LEFT(32, 16, 4, q) DC_PREDICTOR_LEFT(32, 32, 5, q) DC_PREDICTOR_LEFT(32, 64, 6, q) 
DC_PREDICTOR_LEFT(64, 32, 5, q) DC_PREDICTOR_LEFT(64, 64, 6, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef DC_PREDICTOR_LEFT // ----------------------------------------------------------------------------- // DC_TOP #define DC_PREDICTOR_TOP(w, h, shift, q) \ void aom_highbd_dc_top_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ (void)left; \ const uint32x4_t sum = highbd_dc_load_sum_##w(above); \ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_TOP(4, 4, 2, ) DC_PREDICTOR_TOP(4, 8, 2, ) DC_PREDICTOR_TOP(4, 16, 2, ) DC_PREDICTOR_TOP(8, 4, 3, q) DC_PREDICTOR_TOP(8, 8, 3, q) DC_PREDICTOR_TOP(8, 16, 3, q) DC_PREDICTOR_TOP(8, 32, 3, q) DC_PREDICTOR_TOP(16, 4, 4, q) DC_PREDICTOR_TOP(16, 8, 4, q) DC_PREDICTOR_TOP(16, 16, 4, q) DC_PREDICTOR_TOP(16, 32, 4, q) DC_PREDICTOR_TOP(16, 64, 4, q) DC_PREDICTOR_TOP(32, 8, 5, q) DC_PREDICTOR_TOP(32, 16, 5, q) DC_PREDICTOR_TOP(32, 32, 5, q) DC_PREDICTOR_TOP(32, 64, 5, q) DC_PREDICTOR_TOP(64, 16, 6, q) DC_PREDICTOR_TOP(64, 32, 6, q) DC_PREDICTOR_TOP(64, 64, 6, q) #else DC_PREDICTOR_TOP(4, 4, 2, ) DC_PREDICTOR_TOP(4, 8, 2, ) DC_PREDICTOR_TOP(8, 4, 3, q) DC_PREDICTOR_TOP(8, 8, 3, q) DC_PREDICTOR_TOP(8, 16, 3, q) DC_PREDICTOR_TOP(16, 8, 4, q) DC_PREDICTOR_TOP(16, 16, 4, q) DC_PREDICTOR_TOP(16, 32, 4, q) DC_PREDICTOR_TOP(32, 16, 5, q) DC_PREDICTOR_TOP(32, 32, 5, q) DC_PREDICTOR_TOP(32, 64, 5, q) DC_PREDICTOR_TOP(64, 32, 6, q) DC_PREDICTOR_TOP(64, 64, 6, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef DC_PREDICTOR_TOP // ----------------------------------------------------------------------------- // V_PRED #define HIGHBD_V_NXM(W, H) \ void aom_highbd_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)left; \ (void)bd; \ vertical##W##xh_neon(dst, stride, above, H); \ } static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { uint16x8x2_t x; // Clang/gcc uses ldp here. x.val[0] = vld1q_u16(ptr); x.val[1] = vld1q_u16(ptr + 8); return x; } static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); } static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x4_t row = vld1_u16(above); int y = height; do { vst1_u16(dst, row); vst1_u16(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8_t row = vld1q_u16(above); int y = height; do { vst1q_u16(dst, row); vst1q_u16(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x2_t row = load_uint16x8x2(above); int y = height; do { store_uint16x8x2(dst, row); store_uint16x8x2(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { uint16x8x4_t x; // Clang/gcc uses ldp here. 
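  // Each vld1q_u16 below is a 16-byte load, so the compiler is expected to
  // fuse adjacent pairs into two LDP (load-pair) instructions covering the 32
  // contiguous values.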
x.val[0] = vld1q_u16(ptr); x.val[1] = vld1q_u16(ptr + 8); x.val[2] = vld1q_u16(ptr + 16); x.val[3] = vld1q_u16(ptr + 24); return x; } static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { vst1q_u16(ptr, x.val[0]); vst1q_u16(ptr + 8, x.val[1]); vst1q_u16(ptr + 16, x.val[2]); vst1q_u16(ptr + 24, x.val[3]); } static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { const uint16x8x4_t row = load_uint16x8x4(above); int y = height; do { store_uint16x8x4(dst, row); store_uint16x8x4(dst + stride, row); dst += stride << 1; y -= 2; } while (y != 0); } static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const above, int height) { uint16_t *dst32 = dst + 32; const uint16x8x4_t row = load_uint16x8x4(above); const uint16x8x4_t row32 = load_uint16x8x4(above + 32); int y = height; do { store_uint16x8x4(dst, row); store_uint16x8x4(dst32, row32); store_uint16x8x4(dst + stride, row); store_uint16x8x4(dst32 + stride, row32); dst += stride << 1; dst32 += stride << 1; y -= 2; } while (y != 0); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_V_NXM(4, 4) HIGHBD_V_NXM(4, 8) HIGHBD_V_NXM(4, 16) HIGHBD_V_NXM(8, 4) HIGHBD_V_NXM(8, 8) HIGHBD_V_NXM(8, 16) HIGHBD_V_NXM(8, 32) HIGHBD_V_NXM(16, 4) HIGHBD_V_NXM(16, 8) HIGHBD_V_NXM(16, 16) HIGHBD_V_NXM(16, 32) HIGHBD_V_NXM(16, 64) HIGHBD_V_NXM(32, 8) HIGHBD_V_NXM(32, 16) HIGHBD_V_NXM(32, 32) HIGHBD_V_NXM(32, 64) HIGHBD_V_NXM(64, 16) HIGHBD_V_NXM(64, 32) HIGHBD_V_NXM(64, 64) #else HIGHBD_V_NXM(4, 4) HIGHBD_V_NXM(4, 8) HIGHBD_V_NXM(8, 4) HIGHBD_V_NXM(8, 8) HIGHBD_V_NXM(8, 16) HIGHBD_V_NXM(16, 8) HIGHBD_V_NXM(16, 16) HIGHBD_V_NXM(16, 32) HIGHBD_V_NXM(32, 16) HIGHBD_V_NXM(32, 32) HIGHBD_V_NXM(32, 64) HIGHBD_V_NXM(64, 32) HIGHBD_V_NXM(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // H_PRED static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0)); vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1)); vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2)); vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3)); } static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0)); vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1)); vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2)); vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3)); } static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); } static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); } static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); vst1q_u16(dst + 16, left); vst1q_u16(dst + 24, left); } static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); } 
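// For reference, every H_PRED helper in this section implements the same
// trivial rule: each output row is filled with its corresponding left-column
// sample. A minimal scalar sketch of that rule is shown below; it is purely
// illustrative (the function name is invented for this example and the block
// is compiled out so it cannot affect the build).
#if 0
static void highbd_h_predictor_scalar_sketch(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left, int w,
                                             int h) {
  for (int r = 0; r < h; ++r) {
    // Replicate left[r] across the whole row.
    for (int c = 0; c < w; ++c) dst[c] = left[r];
    dst += stride;
  }
}
#endif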
static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) { vst1q_u16(dst + 0, left); vst1q_u16(dst + 8, left); vst1q_u16(dst + 16, left); vst1q_u16(dst + 24, left); vst1q_u16(dst + 32, left); vst1q_u16(dst + 40, left); vst1q_u16(dst + 48, left); vst1q_u16(dst + 56, left); } static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride, uint16x4_t left) { highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); } void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; highbd_h_store_4x4(dst, stride, vld1_u16(left)); } void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; uint16x8_t l = vld1q_u16(left); highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l)); highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l)); } void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; highbd_h_store_8x4(dst, stride, vld1_u16(left)); } void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; uint16x8_t l = vld1q_u16(left); highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l)); highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; highbd_h_store_16x4(dst, stride, vld1_u16(left)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; uint16x8_t l = vld1q_u16(left); highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l)); highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; uint16x8_t l = vld1q_u16(left); highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l)); highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // For cases where height >= 16 we use pairs of loads to get LDP instructions. 
#define HIGHBD_H_WXH_LARGE(w, h) \ void aom_highbd_h_predictor_##w##x##h##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)above; \ (void)bd; \ for (int i = 0; i < (h) / 16; ++i) { \ uint16x8_t l0 = vld1q_u16(left + 0); \ uint16x8_t l1 = vld1q_u16(left + 8); \ highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0)); \ highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0)); \ highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1)); \ highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \ left += 16; \ dst += 16 * stride; \ } \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_H_WXH_LARGE(4, 16) HIGHBD_H_WXH_LARGE(8, 16) HIGHBD_H_WXH_LARGE(8, 32) HIGHBD_H_WXH_LARGE(16, 16) HIGHBD_H_WXH_LARGE(16, 32) HIGHBD_H_WXH_LARGE(16, 64) HIGHBD_H_WXH_LARGE(32, 16) HIGHBD_H_WXH_LARGE(32, 32) HIGHBD_H_WXH_LARGE(32, 64) HIGHBD_H_WXH_LARGE(64, 16) HIGHBD_H_WXH_LARGE(64, 32) HIGHBD_H_WXH_LARGE(64, 64) #else HIGHBD_H_WXH_LARGE(8, 16) HIGHBD_H_WXH_LARGE(16, 16) HIGHBD_H_WXH_LARGE(16, 32) HIGHBD_H_WXH_LARGE(32, 16) HIGHBD_H_WXH_LARGE(32, 32) HIGHBD_H_WXH_LARGE(32, 64) HIGHBD_H_WXH_LARGE(64, 32) HIGHBD_H_WXH_LARGE(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_H_WXH_LARGE // ----------------------------------------------------------------------------- // PAETH static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint16x8_t top; if (width == 4) { top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0)); } else { // width == 8 top = vld1q_u16(top_row); } for (int y = 0; y < height; ++y) { const uint16x8_t left = vdupq_n_u16(left_column[y]); const uint16x8_t left_dist = vabdq_u16(top, top_left); const uint16x8_t top_dist = vabdq_u16(left, top_left); const uint16x8_t top_left_dist = vabdq_u16(vaddq_u16(top, left), top_left_x2); const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. uint16x8_t result = vbslq_u16(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. 
const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); // else // dest[x] = top_left; result = vbslq_u16(left_or_top_mask, result, top_left); if (width == 4) { vst1_u16(dest, vget_low_u16(result)); } else { // width == 8 vst1q_u16(dest, result); } dest += stride; } } #define HIGHBD_PAETH_NXM(W, H) \ void aom_highbd_paeth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_PAETH_NXM(4, 4) HIGHBD_PAETH_NXM(4, 8) HIGHBD_PAETH_NXM(4, 16) HIGHBD_PAETH_NXM(8, 4) HIGHBD_PAETH_NXM(8, 8) HIGHBD_PAETH_NXM(8, 16) HIGHBD_PAETH_NXM(8, 32) #else HIGHBD_PAETH_NXM(4, 4) HIGHBD_PAETH_NXM(4, 8) HIGHBD_PAETH_NXM(8, 4) HIGHBD_PAETH_NXM(8, 8) HIGHBD_PAETH_NXM(8, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Select the closest values and collect them. static inline uint16x8_t select_paeth(const uint16x8_t top, const uint16x8_t left, const uint16x8_t top_left, const uint16x8_t left_le_top, const uint16x8_t left_le_top_left, const uint16x8_t top_le_top_left) { // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. const uint16x8_t result = vbslq_u16(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); // else // dest[x] = top_left; return vbslq_u16(left_or_top_mask, result, top_left); } #define PAETH_PREDICTOR(num) \ do { \ const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \ const uint16x8_t top_left_dist = \ vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \ const uint16x8_t result = \ select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \ top_le_top_left); \ vst1q_u16(dest + (num * 8), result); \ } while (0) #define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8)) static inline void highbd_paeth16_plus_x_h_neon( uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, int width, int height) { const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint16x8_t top[8]; top[0] = LOAD_TOP_ROW(0); top[1] = LOAD_TOP_ROW(1); if (width > 16) { top[2] = LOAD_TOP_ROW(2); top[3] = LOAD_TOP_ROW(3); if (width == 64) { top[4] = LOAD_TOP_ROW(4); top[5] = LOAD_TOP_ROW(5); top[6] = LOAD_TOP_ROW(6); top[7] = LOAD_TOP_ROW(7); } } for (int y = 0; y < height; ++y) { const uint16x8_t left = vdupq_n_u16(left_column[y]); const uint16x8_t top_dist = vabdq_u16(left, top_left); PAETH_PREDICTOR(0); PAETH_PREDICTOR(1); if (width > 16) { PAETH_PREDICTOR(2); PAETH_PREDICTOR(3); if (width == 64) { PAETH_PREDICTOR(4); PAETH_PREDICTOR(5); PAETH_PREDICTOR(6); PAETH_PREDICTOR(7); } } dest += stride; } } #define HIGHBD_PAETH_NXM_WIDE(W, H) \ void aom_highbd_paeth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ 
highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_PAETH_NXM_WIDE(16, 4) HIGHBD_PAETH_NXM_WIDE(16, 8) HIGHBD_PAETH_NXM_WIDE(16, 16) HIGHBD_PAETH_NXM_WIDE(16, 32) HIGHBD_PAETH_NXM_WIDE(16, 64) HIGHBD_PAETH_NXM_WIDE(32, 8) HIGHBD_PAETH_NXM_WIDE(32, 16) HIGHBD_PAETH_NXM_WIDE(32, 32) HIGHBD_PAETH_NXM_WIDE(32, 64) HIGHBD_PAETH_NXM_WIDE(64, 16) HIGHBD_PAETH_NXM_WIDE(64, 32) HIGHBD_PAETH_NXM_WIDE(64, 64) #else HIGHBD_PAETH_NXM_WIDE(16, 8) HIGHBD_PAETH_NXM_WIDE(16, 16) HIGHBD_PAETH_NXM_WIDE(16, 32) HIGHBD_PAETH_NXM_WIDE(32, 16) HIGHBD_PAETH_NXM_WIDE(32, 32) HIGHBD_PAETH_NXM_WIDE(32, 64) HIGHBD_PAETH_NXM_WIDE(64, 32) HIGHBD_PAETH_NXM_WIDE(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // SMOOTH // 256 - v = vneg_s8(v) static inline uint16x4_t negate_s8(const uint16x4_t v) { return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); } static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[3]; const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_v = vld1_u16(top_row); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16); const uint16x4_t scaled_weights_x = negate_s8(weights_x_v); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { // Each variable in the running summation is named for the last item to be // accumulated. const uint32x4_t weighted_top = vmlal_n_u16(weighted_tr, top_v, weights_y[y]); const uint32x4_t weighted_left = vmlal_n_u16(weighted_top, weights_x_v, left_column[y]); const uint32x4_t weighted_bl = vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]); const uint16x4_t pred = vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst, pred); dst += stride; } } // Common code between 8xH and [16|32|64]xH. static inline void highbd_calculate_pred8( uint16_t *dst, const uint32x4_t weighted_corners_low, const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals, const uint16x4x2_t weights_x, const uint16_t left_y, const uint16_t weight_y) { // Each variable in the running summation is named for the last item to be // accumulated. 
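  // In full, each output pixel computed below is:
  //   pred = (weight_y * top + (256 - weight_y) * bottom_left +
  //           weights_x * left_y + (256 - weights_x) * top_right + 256) >> 9
  // where the two (256 - weight) corner products arrive pre-summed in
  // weighted_corners_low/high.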
const uint32x4_t weighted_top_low = vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y); const uint32x4_t weighted_edges_low = vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y); const uint16x4_t pred_low = vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst, pred_low); const uint32x4_t weighted_top_high = vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y); const uint32x4_t weighted_edges_high = vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y); const uint16x4_t pred_high = vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1); vst1_u16(dst + 4, pred_high); } static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[7]; const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4x2_t top_vals = { { vld1_u16(top_row), vld1_u16(top_row + 4) } }; const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), vld1_u16(smooth_weights_u16 + 8) } }; const uint32x4_t weighted_tr_low = vmull_n_u16(negate_s8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = vmull_n_u16(negate_s8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_corners_low = vaddq_u32(weighted_bl, weighted_tr_low); const uint32x4_t weighted_corners_high = vaddq_u32(weighted_bl, weighted_tr_high); highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high, top_vals, weights_x, left_column[y], weights_y[y]); dst += stride; } } #define HIGHBD_SMOOTH_NXM(W, H) \ void aom_highbd_smooth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_NXM(4, 4) HIGHBD_SMOOTH_NXM(4, 8) HIGHBD_SMOOTH_NXM(8, 4) HIGHBD_SMOOTH_NXM(8, 8) HIGHBD_SMOOTH_NXM(4, 16) HIGHBD_SMOOTH_NXM(8, 16) HIGHBD_SMOOTH_NXM(8, 32) #else HIGHBD_SMOOTH_NXM(4, 4) HIGHBD_SMOOTH_NXM(4, 8) HIGHBD_SMOOTH_NXM(8, 4) HIGHBD_SMOOTH_NXM(8, 8) HIGHBD_SMOOTH_NXM(8, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_NXM // For width 16 and above. #define HIGHBD_SMOOTH_PREDICTOR(W) \ static void highbd_smooth_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ const uint16_t bottom_left = left_column[height - 1]; \ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ \ /* Precompute weighted values that don't vary with |y|. 
*/ \ uint32x4_t weighted_tr_low[(W) >> 3]; \ uint32x4_t weighted_tr_high[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ const uint16x4_t weights_x_low = \ vld1_u16(smooth_weights_u16 + (W)-4 + x); \ weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \ const uint16x4_t weights_x_high = \ vld1_u16(smooth_weights_u16 + (W) + x); \ weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \ } \ \ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ for (int y = 0; y < height; ++y) { \ const uint32x4_t weighted_bl = \ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ uint16_t *dst_x = dst; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \ vld1_u16(top_row + x + 4) } }; \ const uint32x4_t weighted_corners_low = \ vaddq_u32(weighted_bl, weighted_tr_low[i]); \ const uint32x4_t weighted_corners_high = \ vaddq_u32(weighted_bl, weighted_tr_high[i]); \ /* Accumulate weighted edge values and store. */ \ const uint16x4x2_t weights_x = { \ { vld1_u16(smooth_weights_u16 + (W)-4 + x), \ vld1_u16(smooth_weights_u16 + (W) + x) } \ }; \ highbd_calculate_pred8(dst_x, weighted_corners_low, \ weighted_corners_high, top_vals, weights_x, \ left_column[y], weights_y[y]); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_PREDICTOR(16) HIGHBD_SMOOTH_PREDICTOR(32) HIGHBD_SMOOTH_PREDICTOR(64) #undef HIGHBD_SMOOTH_PREDICTOR #define HIGHBD_SMOOTH_NXM_WIDE(W, H) \ void aom_highbd_smooth_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_NXM_WIDE(16, 4) HIGHBD_SMOOTH_NXM_WIDE(16, 8) HIGHBD_SMOOTH_NXM_WIDE(16, 16) HIGHBD_SMOOTH_NXM_WIDE(16, 32) HIGHBD_SMOOTH_NXM_WIDE(16, 64) HIGHBD_SMOOTH_NXM_WIDE(32, 8) HIGHBD_SMOOTH_NXM_WIDE(32, 16) HIGHBD_SMOOTH_NXM_WIDE(32, 32) HIGHBD_SMOOTH_NXM_WIDE(32, 64) HIGHBD_SMOOTH_NXM_WIDE(64, 16) HIGHBD_SMOOTH_NXM_WIDE(64, 32) HIGHBD_SMOOTH_NXM_WIDE(64, 64) #else HIGHBD_SMOOTH_NXM_WIDE(16, 8) HIGHBD_SMOOTH_NXM_WIDE(16, 16) HIGHBD_SMOOTH_NXM_WIDE(16, 32) HIGHBD_SMOOTH_NXM_WIDE(32, 16) HIGHBD_SMOOTH_NXM_WIDE(32, 32) HIGHBD_SMOOTH_NXM_WIDE(32, 64) HIGHBD_SMOOTH_NXM_WIDE(64, 32) HIGHBD_SMOOTH_NXM_WIDE(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_NXM_WIDE static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_v = vld1_u16(top_row); const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_top = vmlal_n_u16(weighted_bl, top_v, weights_y[y]); vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t bottom_left = left_column[height - 1]; const uint16_t *const weights_y = smooth_weights_u16 + height - 4; const uint16x4_t top_low = vld1_u16(top_row); const uint16x4_t top_high = vld1_u16(top_row + 4); const uint16x4_t bottom_left_v = 
vdup_n_u16(bottom_left); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_bl = vmull_n_u16(bottom_left_v, 256 - weights_y[y]); const uint32x4_t weighted_top_low = vmlal_n_u16(weighted_bl, top_low, weights_y[y]); vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); const uint32x4_t weighted_top_high = vmlal_n_u16(weighted_bl, top_high, weights_y[y]); vst1_u16(dst + 4, vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } #define HIGHBD_SMOOTH_V_NXM(W, H) \ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_V_NXM(4, 4) HIGHBD_SMOOTH_V_NXM(4, 8) HIGHBD_SMOOTH_V_NXM(4, 16) HIGHBD_SMOOTH_V_NXM(8, 4) HIGHBD_SMOOTH_V_NXM(8, 8) HIGHBD_SMOOTH_V_NXM(8, 16) HIGHBD_SMOOTH_V_NXM(8, 32) #else HIGHBD_SMOOTH_V_NXM(4, 4) HIGHBD_SMOOTH_V_NXM(4, 8) HIGHBD_SMOOTH_V_NXM(8, 4) HIGHBD_SMOOTH_V_NXM(8, 8) HIGHBD_SMOOTH_V_NXM(8, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_V_NXM // For width 16 and above. #define HIGHBD_SMOOTH_V_PREDICTOR(W) \ static void highbd_smooth_v_##W##xh_neon( \ uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t bottom_left = left_column[height - 1]; \ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ \ uint16x4x2_t top_vals[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ top_vals[i].val[0] = vld1_u16(top_row + x); \ top_vals[i].val[1] = vld1_u16(top_row + x + 4); \ } \ \ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ for (int y = 0; y < height; ++y) { \ const uint32x4_t weighted_bl = \ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ \ uint16_t *dst_x = dst; \ for (int i = 0; i < (W) >> 3; ++i) { \ const uint32x4_t weighted_top_low = \ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \ vst1_u16(dst_x, \ vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ \ const uint32x4_t weighted_top_high = \ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \ vst1_u16(dst_x + 4, \ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_V_PREDICTOR(16) HIGHBD_SMOOTH_V_PREDICTOR(32) HIGHBD_SMOOTH_V_PREDICTOR(64) #undef HIGHBD_SMOOTH_V_PREDICTOR #define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_V_NXM_WIDE(16, 4) HIGHBD_SMOOTH_V_NXM_WIDE(16, 8) HIGHBD_SMOOTH_V_NXM_WIDE(16, 16) HIGHBD_SMOOTH_V_NXM_WIDE(16, 32) HIGHBD_SMOOTH_V_NXM_WIDE(16, 64) HIGHBD_SMOOTH_V_NXM_WIDE(32, 8) HIGHBD_SMOOTH_V_NXM_WIDE(32, 16) HIGHBD_SMOOTH_V_NXM_WIDE(32, 32) HIGHBD_SMOOTH_V_NXM_WIDE(32, 64) HIGHBD_SMOOTH_V_NXM_WIDE(64, 16) HIGHBD_SMOOTH_V_NXM_WIDE(64, 32) HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) #else HIGHBD_SMOOTH_V_NXM_WIDE(16, 8) HIGHBD_SMOOTH_V_NXM_WIDE(16, 16) HIGHBD_SMOOTH_V_NXM_WIDE(16, 32) HIGHBD_SMOOTH_V_NXM_WIDE(32, 16) HIGHBD_SMOOTH_V_NXM_WIDE(32, 32) HIGHBD_SMOOTH_V_NXM_WIDE(32, 64) HIGHBD_SMOOTH_V_NXM_WIDE(64, 32) HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) #endif // !CONFIG_REALTIME_ONLY || 
CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_V_NXM_WIDE static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[3]; const uint16x4_t weights_x = vld1_u16(smooth_weights_u16); const uint16x4_t scaled_weights_x = negate_s8(weights_x); const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); for (int y = 0; y < height; ++y) { const uint32x4_t weighted_left = vmlal_n_u16(weighted_tr, weights_x, left_column[y]); vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, const uint16_t *const left_column, const int height) { const uint16_t top_right = top_row[7]; const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), vld1_u16(smooth_weights_u16 + 8) } }; const uint32x4_t weighted_tr_low = vmull_n_u16(negate_s8(weights_x.val[0]), top_right); const uint32x4_t weighted_tr_high = vmull_n_u16(negate_s8(weights_x.val[1]), top_right); for (int y = 0; y < height; ++y) { const uint16_t left_y = left_column[y]; const uint32x4_t weighted_left_low = vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y); vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); const uint32x4_t weighted_left_high = vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y); vst1_u16(dst + 4, vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); dst += stride; } } #define HIGHBD_SMOOTH_H_NXM(W, H) \ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_H_NXM(4, 4) HIGHBD_SMOOTH_H_NXM(4, 8) HIGHBD_SMOOTH_H_NXM(4, 16) HIGHBD_SMOOTH_H_NXM(8, 4) HIGHBD_SMOOTH_H_NXM(8, 8) HIGHBD_SMOOTH_H_NXM(8, 16) HIGHBD_SMOOTH_H_NXM(8, 32) #else HIGHBD_SMOOTH_H_NXM(4, 4) HIGHBD_SMOOTH_H_NXM(4, 8) HIGHBD_SMOOTH_H_NXM(8, 4) HIGHBD_SMOOTH_H_NXM(8, 8) HIGHBD_SMOOTH_H_NXM(8, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_H_NXM // For width 16 and above. 
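// As a rough scalar sketch of what the generated functions below compute,
// each output pixel is the same blend used by the 4- and 8-wide versions
// above:
//   pred[y][x] = (w[x] * left[y] + (256 - w[x]) * top_right
//                 + (1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)))
//                >> SMOOTH_WEIGHT_LOG2_SCALE
// where w[] are the smooth_weights_u16 entries for this block width and the
// (256 - w[x]) * top_right products are hoisted out of the row loop as
// per-lane constants.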
#define HIGHBD_SMOOTH_H_PREDICTOR(W) \ static void highbd_smooth_h_##W##xh_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ const uint16_t *const left_column, const int height) { \ const uint16_t top_right = top_row[(W)-1]; \ \ uint16x4_t weights_x_low[(W) >> 3]; \ uint16x4_t weights_x_high[(W) >> 3]; \ uint32x4_t weighted_tr_low[(W) >> 3]; \ uint32x4_t weighted_tr_high[(W) >> 3]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const int x = i << 3; \ weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \ weighted_tr_low[i] = \ vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \ weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \ weighted_tr_high[i] = \ vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \ } \ \ for (int y = 0; y < height; ++y) { \ uint16_t *dst_x = dst; \ const uint16_t left_y = left_column[y]; \ for (int i = 0; i < (W) >> 3; ++i) { \ const uint32x4_t weighted_left_low = \ vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \ vst1_u16(dst_x, \ vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ \ const uint32x4_t weighted_left_high = \ vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \ vst1_u16(dst_x + 4, \ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ dst_x += 8; \ } \ dst += stride; \ } \ } HIGHBD_SMOOTH_H_PREDICTOR(16) HIGHBD_SMOOTH_H_PREDICTOR(32) HIGHBD_SMOOTH_H_PREDICTOR(64) #undef HIGHBD_SMOOTH_H_PREDICTOR #define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ (void)bd; \ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_SMOOTH_H_NXM_WIDE(16, 4) HIGHBD_SMOOTH_H_NXM_WIDE(16, 8) HIGHBD_SMOOTH_H_NXM_WIDE(16, 16) HIGHBD_SMOOTH_H_NXM_WIDE(16, 32) HIGHBD_SMOOTH_H_NXM_WIDE(16, 64) HIGHBD_SMOOTH_H_NXM_WIDE(32, 8) HIGHBD_SMOOTH_H_NXM_WIDE(32, 16) HIGHBD_SMOOTH_H_NXM_WIDE(32, 32) HIGHBD_SMOOTH_H_NXM_WIDE(32, 64) HIGHBD_SMOOTH_H_NXM_WIDE(64, 16) HIGHBD_SMOOTH_H_NXM_WIDE(64, 32) HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) #else HIGHBD_SMOOTH_H_NXM_WIDE(16, 8) HIGHBD_SMOOTH_H_NXM_WIDE(16, 16) HIGHBD_SMOOTH_H_NXM_WIDE(16, 32) HIGHBD_SMOOTH_H_NXM_WIDE(32, 16) HIGHBD_SMOOTH_H_NXM_WIDE(32, 32) HIGHBD_SMOOTH_H_NXM_WIDE(32, 64) HIGHBD_SMOOTH_H_NXM_WIDE(64, 32) HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_SMOOTH_H_NXM_WIDE // ----------------------------------------------------------------------------- // Z1 static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 }; static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0, uint16x4_t a1, int shift) { // The C implementation of the z1 predictor uses (32 - shift) and a right // shift by 5, however we instead double shift to avoid an unnecessary right // shift by 1. 
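// Worked identity for the doubling (here `shift` is twice the 5-bit fraction
// used by the C code, e.g. x & 0x3e rather than (x & 0x3f) >> 1):
//   C reference: (a0 * (32 - s) + a1 * s + 16) >> 5
//   this helper: (a0 * (64 - 2 * s) + a1 * 2 * s + 32) >> 6
// The second numerator is exactly twice the first, so both rounding shifts
// give the same result.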
uint32x4_t res = vmull_n_u16(a1, shift); res = vmlal_n_u16(res, a0, 64 - shift); return vrshrn_n_u32(res, 6); } static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0, uint16x8_t a1, int shift) { return vcombine_u16( highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift), highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift)); } // clang-format off static const uint8_t kLoadMaxShuffles[] = { 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; // clang-format on static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr, int shuffle_idx) { uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr)); #if AOM_ARCH_AARCH64 return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle)); #else uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); return vreinterpretq_u16_u8(vcombine_u8(lo, hi)); #endif } static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, int dx) { assert(bw % 4 == 0); assert(bh % 4 == 0); assert(dx > 0); const int max_base_x = (bw + bh) - 1; const int above_max = above[max_base_x]; const int16x8_t iota1x8 = vld1q_s16(iota1_s16); const int16x4_t iota1x4 = vget_low_s16(iota1x8); int x = dx; int r = 0; do { const int base = x >> 6; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { aom_memset16(dst, above_max, bw); dst += stride; } return; } // The C implementation of the z1 predictor when not upsampling uses: // ((x & 0x3f) >> 1) // The right shift is unnecessary here since we instead shift by +1 later, // so adjust the mask to 0x3e to ensure we don't consider the extra bit. 
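// In the wide path below, lanes that would read at or beyond max_base_x are
// produced by zn_load_masked_neon: it loads the last eight valid samples,
// above[max_base_x - 7] .. above[max_base_x], and a tbl shuffle from
// kLoadMaxShuffles both aligns them to the current column and replicates
// above[max_base_x] into every out-of-range lane.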
const int shift = x & 0x3e; if (bw == 4) { const uint16x4_t a0 = vld1_u16(&above[base]); const uint16x4_t a1 = vld1_u16(&above[base + 1]); const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift); const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4); const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); vst1_u16(dst, res); } else { int c = 0; do { uint16x8_t a0; uint16x8_t a1; if (base + c >= max_base_x) { a0 = a1 = vdupq_n_u16(above_max); } else { if (base + c + 7 >= max_base_x) { int shuffle_idx = max_base_x - base - c; a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); } else { a0 = vld1q_u16(above + base + c); } if (base + c + 8 >= max_base_x) { int shuffle_idx = max_base_x - base - c - 1; a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx); } else { a1 = vld1q_u16(above + base + c + 1); } } vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift)); c += 8; } while (c < bw); } dst += stride; x += dx; } while (++r < bh); } static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, int dx) { assert(bw % 4 == 0); assert(bh % 4 == 0); assert(dx > 0); const int max_base_x = ((bw + bh) - 1) << 1; const int above_max = above[max_base_x]; const int16x8_t iota2x8 = vld1q_s16(iota2_s16); const int16x4_t iota2x4 = vget_low_s16(iota2x8); int x = dx; int r = 0; do { const int base = x >> 5; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { aom_memset16(dst, above_max, bw); dst += stride; } return; } // The C implementation of the z1 predictor when upsampling uses: // (((x << 1) & 0x3f) >> 1) // The right shift is unnecessary here since we instead shift by +1 later, // so adjust the mask to 0x3e to ensure we don't consider the extra bit. const int shift = (x << 1) & 0x3e; if (bw == 4) { const uint16x4x2_t a01 = vld2_u16(&above[base]); const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift); const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4); const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); vst1_u16(dst, res); } else { int c = 0; do { const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]); const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift); const uint16x8_t cmp = vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8); const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); vst1q_u16(dst + c, res); c += 8; } while (c < bw); } dst += stride; x += dx; } while (++r < bh); } // Directional prediction, zone 1: 0 < angle < 90 void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd) { (void)left; (void)dy; (void)bd; assert(dy == 1); if (upsample_above) { highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx); } else { highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx); } } // ----------------------------------------------------------------------------- // Z2 #if AOM_ARCH_AARCH64 // Incrementally shift more elements from `above` into the result, merging with // existing `left` elements. 
// X0, X1, X2, X3 // Y0, X0, X1, X2 // Y0, Y1, X0, X1 // Y0, Y1, Y2, X0 // Y0, Y1, Y2, Y3 // clang-format off static const uint8_t z2_merge_shuffles_u16x4[5][8] = { { 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 1, 8, 9, 10, 11, 12, 13 }, { 0, 1, 2, 3, 8, 9, 10, 11 }, { 0, 1, 2, 3, 4, 5, 8, 9 }, { 0, 1, 2, 3, 4, 5, 6, 7 }, }; // clang-format on // Incrementally shift more elements from `above` into the result, merging with // existing `left` elements. // X0, X1, X2, X3, X4, X5, X6, X7 // Y0, X0, X1, X2, X3, X4, X5, X6 // Y0, Y1, X0, X1, X2, X3, X4, X5 // Y0, Y1, Y2, X0, X1, X2, X3, X4 // Y0, Y1, Y2, Y3, X0, X1, X2, X3 // Y0, Y1, Y2, Y3, Y4, X0, X1, X2 // Y0, Y1, Y2, Y3, Y4, Y5, X0, X1 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 // clang-format off static const uint8_t z2_merge_shuffles_u16x8[9][16] = { { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }, { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 }, { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, }; // clang-format on // clang-format off static const uint16_t z2_y_iter_masks_u16x4[5][4] = { { 0U, 0U, 0U, 0U }, { 0xffffU, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU }, }; // clang-format on // clang-format off static const uint16_t z2_y_iter_masks_u16x8[9][8] = { { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U }, { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU }, }; // clang-format on static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8( const uint16x8_t left_data, const int16x4_t indices, int base, int n) { // Need to adjust indices to operate on 0-based indices rather than // `base`-based indices and then adjust from uint16x4 indices to uint8x8 // indices so we can use a tbl instruction (which only operates on bytes). uint8x8_t left_indices = vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); left_indices = vtrn1_u8(left_indices, left_indices); left_indices = vadd_u8(left_indices, left_indices); left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); const uint16x4_t ret = vreinterpret_u16_u8( vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices)); return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); } static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16( const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) { // Need to adjust indices to operate on 0-based indices rather than // `base`-based indices and then adjust from uint16x4 indices to uint8x8 // indices so we can use a tbl instruction (which only operates on bytes). 
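// Worked example of the widening below: a 16-bit index k in a lane becomes
// the byte pair (2k, 2k + 1). vtrn1 duplicates the low byte, the add doubles
// it, and adding 0x0100 bumps only the second byte of each pair, so for
// k = 3 the tbl fetches bytes 6 and 7, i.e. 16-bit element 3 of the table.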
uint8x8_t left_indices = vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); left_indices = vtrn1_u8(left_indices, left_indices); left_indices = vadd_u8(left_indices, left_indices); left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), vreinterpretq_u8_u16(left_data.val[1]) } }; const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices)); return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); } static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8( const uint16x8_t left_data, const int16x8_t indices, int base, int n) { // Need to adjust indices to operate on 0-based indices rather than // `base`-based indices and then adjust from uint16x4 indices to uint8x8 // indices so we can use a tbl instruction (which only operates on bytes). uint8x16_t left_indices = vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); left_indices = vtrn1q_u8(left_indices, left_indices); left_indices = vaddq_u8(left_indices, left_indices); left_indices = vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); const uint16x8_t ret = vreinterpretq_u16_u8( vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices)); return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); } static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16( const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) { // Need to adjust indices to operate on 0-based indices rather than // `base`-based indices and then adjust from uint16x4 indices to uint8x8 // indices so we can use a tbl instruction (which only operates on bytes). uint8x16_t left_indices = vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); left_indices = vtrn1q_u8(left_indices, left_indices); left_indices = vaddq_u8(left_indices, left_indices); left_indices = vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), vreinterpretq_u8_u16(left_data.val[1]) } }; const uint16x8_t ret = vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices)); return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); } #endif // AOM_ARCH_AARCH64 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( const uint16_t *left, const int16x4_t indices, int n) { assert(n > 0); assert(n <= 4); // Load two elements at a time and then uzp them into separate vectors, to // reduce the number of memory accesses. uint32x2_t ret0_u32 = vdup_n_u32(0); uint32x2_t ret1_u32 = vdup_n_u32(0); // Use a single vget_lane_u64 to minimize vector to general purpose register // transfers and then mask off the bits we actually want. const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0); const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); // At time of writing both Clang and GCC produced better code with these // nested if-statements compared to a switch statement with fallthrough. 
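// Each load below pulls the adjacent pair (left[idx], left[idx + 1]) into one
// 32-bit lane; the trailing vuzp then splits the pairs so that, for the lanes
// actually requested, val[0] holds left[idx] and val[1] holds left[idx + 1],
// i.e. the two interpolation taps for every lane.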
load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0); if (n > 1) { load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1); if (n > 2) { load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0); if (n > 3) { load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1); } } } return vuzp_u16(vreinterpret_u16_u32(ret0_u32), vreinterpret_u16_u32(ret1_u32)); } static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8( const uint16_t *left, const int16x8_t indices, int n) { assert(n > 0); assert(n <= 8); // Load two elements at a time and then uzp them into separate vectors, to // reduce the number of memory accesses. uint32x4_t ret0_u32 = vdupq_n_u32(0); uint32x4_t ret1_u32 = vdupq_n_u32(0); // Use a pair of vget_lane_u64 to minimize vector to general purpose register // transfers and then mask off the bits we actually want. const uint64_t indices0123 = vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0); const uint64_t indices4567 = vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1); const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU); const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU); const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU); const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU); // At time of writing both Clang and GCC produced better code with these // nested if-statements compared to a switch statement with fallthrough. load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0); if (n > 1) { load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1); if (n > 2) { load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2); if (n > 3) { load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3); if (n > 4) { load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0); if (n > 5) { load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1); if (n > 6) { load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2); if (n > 7) { load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3); } } } } } } } return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32), vreinterpretq_u16_u32(ret1_u32)); } static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4( uint16x4_t out_x, uint16x4_t out_y, int base_shift) { assert(base_shift >= 0); assert(base_shift <= 4); // On AArch64 we can permute the data from the `above` and `left` vectors // into a single vector in a single load (of the permute vector) + tbl. #if AOM_ARCH_AARCH64 const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y), vreinterpret_u8_u16(out_x) } }; return vreinterpret_u16_u8( vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift]))); #else uint16x4_t out = out_y; for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) { out[c2] = out_x[x_idx]; } return out; #endif } static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8( uint16x8_t out_x, uint16x8_t out_y, int base_shift) { assert(base_shift >= 0); assert(base_shift <= 8); // On AArch64 we can permute the data from the `above` and `left` vectors // into a single vector in a single load (of the permute vector) + tbl. 
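// For example, base_shift == 2 selects { 0, 1, 2, 3, 8, 9, 10, 11 } from
// z2_merge_shuffles_u16x4, which keeps the first two 16-bit lanes of out_y
// and then the first two lanes of out_x, matching the Y0, Y1, X0, X1 row of
// the pattern documented beside the shuffle tables.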
#if AOM_ARCH_AARCH64 const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y), vreinterpretq_u8_u16(out_x) } }; return vreinterpretq_u16_u8( vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift]))); #else uint16x8_t out = out_y; for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) { out[c2] = out_x[x_idx]; } return out; #endif } static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4( uint16x4_t a0, uint16x4_t a1, int16x4_t shift) { uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift)); res = vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift))); return vrshrn_n_u32(res, 5); } static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8( uint16x8_t a0, uint16x8_t a1, int16x8_t shift) { return vcombine_u16( highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), vget_low_s16(shift)), highbd_dr_prediction_z2_apply_shift_x4( vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift))); } static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4( const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1, const uint16_t *left, int dx, int dy, int r, int c) { const int16x4_t iota = vld1_s16(iota1_s16); const int x0 = (c << 6) - (r + 1) * dx; const int y0 = (r << 6) - (c + 1) * dy; const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6)); const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy)); const int16x4_t shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); const int16x4_t shift_y0123 = vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1); const int16x4_t base_y0123 = vshr_n_s16(y0123, 6); const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; // Based on the value of `base_shift` there are three possible cases to // compute the result: // 1) base_shift <= 0: We can load and operate entirely on data from the // `above` input vector. // 2) base_shift < vl: We can load from `above[-1]` and shift // `vl - base_shift` elements across to the end of the // vector, then compute the remainder from `left`. // 3) base_shift >= vl: We can load and operate entirely on data from the // `left` input vector. if (base_shift <= 0) { const int base_x = x0 >> 6; const uint16x4_t a0 = vld1_u16(above + base_x); const uint16x4_t a1 = vld1_u16(above + base_x + 1); return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); } else if (base_shift < 4) { const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4( left + 1, base_y0123, base_shift); const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4( l01.val[0], l01.val[1], shift_y0123); // No need to reload from above in the loop, just use pre-loaded constants. 
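// Whichever source a lane comes from, the arithmetic is the same blend
// (sketch): pred = (p0 * (32 - s) + p1 * s + 16) >> 5, with (p0, p1, s)
// taken from `above` and shift_x0123 for the lanes at and after base_shift,
// and from `left` and shift_y0123 for the lanes before it.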
const uint16x4_t out16_x = highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123); return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift); } else { const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4); return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1], shift_y0123); } } static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8( const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1, const uint16_t *left, int dx, int dy, int r, int c) { const int16x8_t iota = vld1q_s16(iota1_s16); const int x0 = (c << 6) - (r + 1) * dx; const int y0 = (r << 6) - (c + 1) * dy; const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6)); const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy)); const int16x8_t shift_x01234567 = vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1); const int16x8_t shift_y01234567 = vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1); const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6); const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; // Based on the value of `base_shift` there are three possible cases to // compute the result: // 1) base_shift <= 0: We can load and operate entirely on data from the // `above` input vector. // 2) base_shift < vl: We can load from `above[-1]` and shift // `vl - base_shift` elements across to the end of the // vector, then compute the remainder from `left`. // 3) base_shift >= vl: We can load and operate entirely on data from the // `left` input vector. if (base_shift <= 0) { const int base_x = x0 >> 6; const uint16x8_t a0 = vld1q_u16(above + base_x); const uint16x8_t a1 = vld1q_u16(above + base_x + 1); return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); } else if (base_shift < 8) { const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8( left + 1, base_y01234567, base_shift); const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8( l01.val[0], l01.val[1], shift_y01234567); // No need to reload from above in the loop, just use pre-loaded constants. const uint16x8_t out16_x = highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567); return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift); } else { const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8); return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1], shift_y01234567); } } // Left array is accessed from -1 through `bh - 1` inclusive. // Above array is accessed from -1 through `bw - 1` inclusive. 
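// In the generated functions below, left[-1] .. left[bh - 1] is copied into a
// local array once and above[-1], above[0] are kept in registers across the
// row loop: rows that mix both sources always take their `above` taps from
// these fixed positions, while rows sourced purely from `above` reload at
// their own base_x inside the step helpers.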
#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \ static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int upsample_above, int upsample_left, int dx, \ int dy, int bd) { \ (void)bd; \ (void)upsample_above; \ (void)upsample_left; \ assert(!upsample_above); \ assert(!upsample_left); \ assert(bw % 4 == 0); \ assert(bh % 4 == 0); \ assert(dx > 0); \ assert(dy > 0); \ \ uint16_t left_data[bh + 1]; \ memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \ \ uint16x8_t a0, a1; \ if (bw == 4) { \ a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \ a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \ } else { \ a0 = vld1q_u16(above - 1); \ a1 = vld1q_u16(above + 0); \ } \ \ int r = 0; \ do { \ if (bw == 4) { \ vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \ above, vget_low_u16(a0), vget_low_u16(a1), \ left_data, dx, dy, r, 0)); \ } else { \ int c = 0; \ do { \ vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \ above, a0, a1, left_data, dx, dy, r, c)); \ c += 8; \ } while (c < bw); \ } \ dst += stride; \ } while (++r < bh); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64) #else HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef HIGHBD_DR_PREDICTOR_Z2_WXH typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd); static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; const int min_base_x = -(1 << (upsample_above + frac_bits_x)); // if `upsample_left` then we need -2 through 6 inclusive from `left`. // else we only need -1 through 3 inclusive. 
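// On AArch64 those ranges are pre-loaded into left_data0/left_data1 below so
// each row can fetch its `left` taps with a tbl permute
// (highbd_dr_prediction_z2_tbl_left_x4_from_x8) rather than the per-lane
// gathers used when AOM_ARCH_AARCH64 is not defined.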
#if AOM_ARCH_AARCH64 uint16x8_t left_data0, left_data1; if (upsample_left) { left_data0 = vld1q_u16(left - 2); left_data1 = vld1q_u16(left - 1); } else { left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); } #endif const int16x4_t iota0123 = vld1_s16(iota1_s16); const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); for (int r = 0; r < 4; ++r) { const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; const int x0 = (r + 1) * dx; const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); const int base_x0 = (-x0) >> frac_bits_x; if (base_shift <= 0) { uint16x4_t a0, a1; int16x4_t shift_x0123; if (upsample_above) { const uint16x4x2_t a01 = vld2_u16(above + base_x0); a0 = a01.val[0]; a1 = a01.val[1]; shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); } else { a0 = vld1_u16(above + base_x0); a1 = vld1_u16(above + base_x0 + 1); shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); } vst1_u16(dst, highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); } else if (base_shift < 4) { // Calculate Y component from `left`. const int y_iters = base_shift; const int16x4_t y0123 = vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); const int16x4_t shift_y0123 = vshr_n_s16( vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); uint16x4_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, left_data_base, y_iters); l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, left_data_base, y_iters); #else const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); l0 = l01.val[0]; l1 = l01.val[1]; #endif const uint16x4_t out_y = highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); // Calculate X component from `above`. const int16x4_t shift_x0123 = vshr_n_s16( vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)), 1); uint16x4_t a0, a1; if (upsample_above) { const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); a0 = a01.val[0]; a1 = a01.val[1]; } else { a0 = vld1_u16(above - 1); a1 = vld1_u16(above + 0); } const uint16x4_t out_x = highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); // Combine X and Y vectors. const uint16x4_t out = highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); vst1_u16(dst, out); } else { const int16x4_t y0123 = vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); const int16x4_t shift_y0123 = vshr_n_s16( vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); uint16x4_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? 
-2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, left_data_base, 4); l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, left_data_base, 4); #else const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); l0 = l01.val[0]; l1 = l01.val[1]; #endif vst1_u16(dst, highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); } dst += stride; } } static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; const int min_base_x = -(1 << (upsample_above + frac_bits_x)); // if `upsample_left` then we need -2 through 14 inclusive from `left`. // else we only need -1 through 6 inclusive. #if AOM_ARCH_AARCH64 uint16x8x2_t left_data0, left_data1; if (upsample_left) { left_data0 = vld1q_u16_x2(left - 2); left_data1 = vld1q_u16_x2(left - 1); } else { left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; } #endif const int16x4_t iota0123 = vld1_s16(iota1_s16); const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); for (int r = 0; r < 8; ++r) { const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; const int x0 = (r + 1) * dx; const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); const int base_x0 = (-x0) >> frac_bits_x; if (base_shift <= 0) { uint16x4_t a0, a1; int16x4_t shift_x0123; if (upsample_above) { const uint16x4x2_t a01 = vld2_u16(above + base_x0); a0 = a01.val[0]; a1 = a01.val[1]; shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); } else { a0 = vld1_u16(above + base_x0); a1 = vld1_u16(above + base_x0 + 1); shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); } vst1_u16(dst, highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); } else if (base_shift < 4) { // Calculate Y component from `left`. const int y_iters = base_shift; const int16x4_t y0123 = vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); const int16x4_t shift_y0123 = vshr_n_s16( vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); uint16x4_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( left_data0, base_y0123, left_data_base, y_iters); l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( left_data1, base_y0123, left_data_base, y_iters); #else const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); l0 = l01.val[0]; l1 = l01.val[1]; #endif const uint16x4_t out_y = highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); // Calculate X component from `above`. uint16x4_t a0, a1; int16x4_t shift_x0123; if (upsample_above) { const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); a0 = a01.val[0]; a1 = a01.val[1]; shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); } else { a0 = vld1_u16(above - 1); a1 = vld1_u16(above + 0); shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); } const uint16x4_t out_x = highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); // Combine X and Y vectors. 
const uint16x4_t out = highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); vst1_u16(dst, out); } else { const int16x4_t y0123 = vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); const int16x4_t shift_y0123 = vshr_n_s16( vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); uint16x4_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123, left_data_base, 4); l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123, left_data_base, 4); #else const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); l0 = l01.val[0]; l1 = l01.val[1]; #endif vst1_u16(dst, highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); } dst += stride; } } static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; const int min_base_x = -(1 << (upsample_above + frac_bits_x)); // if `upsample_left` then we need -2 through 6 inclusive from `left`. // else we only need -1 through 3 inclusive. #if AOM_ARCH_AARCH64 uint16x8_t left_data0, left_data1; if (upsample_left) { left_data0 = vld1q_u16(left - 2); left_data1 = vld1q_u16(left - 1); } else { left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); } #endif const int16x8_t iota01234567 = vld1q_s16(iota1_s16); const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); for (int r = 0; r < 4; ++r) { const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; const int x0 = (r + 1) * dx; const int16x8_t x01234567 = vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); const int base_x0 = (-x0) >> frac_bits_x; if (base_shift <= 0) { uint16x8_t a0, a1; int16x8_t shift_x01234567; if (upsample_above) { const uint16x8x2_t a01 = vld2q_u16(above + base_x0); a0 = a01.val[0]; a1 = a01.val[1]; shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); } else { a0 = vld1q_u16(above + base_x0); a1 = vld1q_u16(above + base_x0 + 1); shift_x01234567 = vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); } vst1q_u16( dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); } else if (base_shift < 8) { // Calculate Y component from `left`. const int y_iters = base_shift; const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); const int16x8_t base_y01234567 = vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); const int16x8_t shift_y01234567 = vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), vdupq_n_s16(0x3F)), 1); uint16x8_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( left_data0, base_y01234567, left_data_base, y_iters); l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( left_data1, base_y01234567, left_data_base, y_iters); #else const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); l0 = l01.val[0]; l1 = l01.val[1]; #endif const uint16x8_t out_y = highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); // Calculate X component from `above`. 
uint16x8_t a0, a1; int16x8_t shift_x01234567; if (upsample_above) { const uint16x8x2_t a01 = vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); a0 = a01.val[0]; a1 = a01.val[1]; shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); } else { a0 = vld1q_u16(above - 1); a1 = vld1q_u16(above + 0); shift_x01234567 = vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); } const uint16x8_t out_x = highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); // Combine X and Y vectors. const uint16x8_t out = highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); vst1q_u16(dst, out); } else { const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); const int16x8_t base_y01234567 = vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); const int16x8_t shift_y01234567 = vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), vdupq_n_s16(0x3F)), 1); uint16x8_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( left_data0, base_y01234567, left_data_base, 8); l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( left_data1, base_y01234567, left_data_base, 8); #else const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); l0 = l01.val[0]; l1 = l01.val[1]; #endif vst1q_u16( dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); } dst += stride; } } static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; const int min_base_x = -(1 << (upsample_above + frac_bits_x)); // if `upsample_left` then we need -2 through 14 inclusive from `left`. // else we only need -1 through 6 inclusive. #if AOM_ARCH_AARCH64 uint16x8x2_t left_data0, left_data1; if (upsample_left) { left_data0 = vld1q_u16_x2(left - 2); left_data1 = vld1q_u16_x2(left - 1); } else { left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; } #endif const int16x8_t iota01234567 = vld1q_s16(iota1_s16); const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); for (int r = 0; r < 8; ++r) { const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; const int x0 = (r + 1) * dx; const int16x8_t x01234567 = vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); const int base_x0 = (-x0) >> frac_bits_x; if (base_shift <= 0) { uint16x8_t a0, a1; int16x8_t shift_x01234567; if (upsample_above) { const uint16x8x2_t a01 = vld2q_u16(above + base_x0); a0 = a01.val[0]; a1 = a01.val[1]; shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); } else { a0 = vld1q_u16(above + base_x0); a1 = vld1q_u16(above + base_x0 + 1); shift_x01234567 = vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); } vst1q_u16( dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); } else if (base_shift < 8) { // Calculate Y component from `left`. 
const int y_iters = base_shift; const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); const int16x8_t base_y01234567 = vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); const int16x8_t shift_y01234567 = vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), vdupq_n_s16(0x3F)), 1); uint16x8_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? -2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( left_data0, base_y01234567, left_data_base, y_iters); l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( left_data1, base_y01234567, left_data_base, y_iters); #else const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); l0 = l01.val[0]; l1 = l01.val[1]; #endif const uint16x8_t out_y = highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); // Calculate X component from `above`. uint16x8_t a0, a1; int16x8_t shift_x01234567; if (upsample_above) { const uint16x8x2_t a01 = vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); a0 = a01.val[0]; a1 = a01.val[1]; shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); } else { a0 = vld1q_u16(above - 1); a1 = vld1q_u16(above + 0); shift_x01234567 = vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); } const uint16x8_t out_x = highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); // Combine X and Y vectors. const uint16x8_t out = highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); vst1q_u16(dst, out); } else { const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); const int16x8_t base_y01234567 = vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); const int16x8_t shift_y01234567 = vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), vdupq_n_s16(0x3F)), 1); uint16x8_t l0, l1; #if AOM_ARCH_AARCH64 const int left_data_base = upsample_left ? 
-2 : -1; l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( left_data0, base_y01234567, left_data_base, 8); l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( left_data1, base_y01234567, left_data_base, 8); #else const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); l0 = l01.val[0]; l1 = l01.val[1]; #endif vst1q_u16( dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); } dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = { { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon, &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL, NULL }, { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon, &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, &highbd_dr_prediction_z2_8x32_neon, NULL }, { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon, &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon }, { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon, &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon, &highbd_dr_prediction_z2_32x64_neon }, { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon, &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon }, }; #else static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = { { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon, &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL }, { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon, &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL, NULL }, { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon, NULL }, { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon, &highbd_dr_prediction_z2_32x64_neon }, { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon }, }; #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Directional prediction, zone 2: 90 < angle < 180 void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { highbd_dr_prediction_z2_ptr f = dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)]; assert(f != NULL); f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd); } // ----------------------------------------------------------------------------- // Z3 // Both the lane to the use and the shift amount must be immediates. 
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \ lane, shift) \ do { \ uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \ val = vmlal_lane_u16(val, (in1), (s1), (lane)); \ const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \ const uint16x4_t res = vrshrn_n_u32(val, (shift)); \ *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \ vdup_n_u16(left_max)); \ } while (0) #define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \ lane, shift) \ do { \ uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \ val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \ uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \ val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \ *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \ vrshrn_n_u32(val_hi, (shift))); \ } while (0) static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs, int max_ofs) { uint16x8_t r0; uint16x8_t r1; if (ofs + 7 >= max_ofs) { int shuffle_idx = max_ofs - ofs; r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx); } else { r0 = vld1q_u16(left0 + ofs); } if (ofs + 8 >= max_ofs) { int shuffle_idx = max_ofs - ofs - 1; r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx); } else { r1 = vld1q_u16(left0 + ofs + 1); } return (uint16x8x2_t){ { r0, r1 } }; } static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *left, int dy) { assert(bw % 4 == 0); assert(bh % 4 == 0); assert(dy > 0); // Factor out left + 1 to give the compiler a better chance of recognising // that the offsets used for the loads from left and left + 1 are otherwise // identical. const uint16_t *left1 = left + 1; const int max_base_y = (bw + bh - 1); const int left_max = left[max_base_y]; const int frac_bits = 6; const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16)); const uint16x4_t iota1x4 = vget_low_u16(iota1x8); // The C implementation of the z3 predictor when not upsampling uses: // ((y & 0x3f) >> 1) // The right shift is unnecessary here since we instead shift by +1 later, // so adjust the mask to 0x3e to ensure we don't consider the extra bit. const uint16x4_t shift_mask = vdup_n_u16(0x3e); if (bh == 4) { int y = dy; int c = 0; do { // Fully unroll the 4x4 block to allow us to use immediate lane-indexed // multiply instructions. 
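// Sketch of one unrolled step: lane i of shifts1 is the interpolation
// fraction for output column c + i, and each HIGHBD_DR_PREDICTOR_Z3_STEP_X4
// below evaluates (l0 * (64 - s) + l1 * s + 32) >> 6 for that column (plus
// the clamp against max_base_y), using an immediate lane index to select s;
// the 4x4 transpose then turns the four column vectors into the four rows
// that are stored.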
const uint16x4_t shifts1 = vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); const int base0 = (y + 0 * dy) >> frac_bits; const int base1 = (y + 1 * dy) >> frac_bits; const int base2 = (y + 2 * dy) >> frac_bits; const int base3 = (y + 3 * dy) >> frac_bits; uint16x4_t out[4]; if (base0 >= max_base_y) { out[0] = vdup_n_u16(left_max); } else { const uint16x4_t l00 = vld1_u16(left + base0); const uint16x4_t l01 = vld1_u16(left1 + base0); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01, shifts0, shifts1, 0, 6); } if (base1 >= max_base_y) { out[1] = vdup_n_u16(left_max); } else { const uint16x4_t l10 = vld1_u16(left + base1); const uint16x4_t l11 = vld1_u16(left1 + base1); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11, shifts0, shifts1, 1, 6); } if (base2 >= max_base_y) { out[2] = vdup_n_u16(left_max); } else { const uint16x4_t l20 = vld1_u16(left + base2); const uint16x4_t l21 = vld1_u16(left1 + base2); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21, shifts0, shifts1, 2, 6); } if (base3 >= max_base_y) { out[3] = vdup_n_u16(left_max); } else { const uint16x4_t l30 = vld1_u16(left + base3); const uint16x4_t l31 = vld1_u16(left1 + base3); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31, shifts0, shifts1, 3, 6); } transpose_array_inplace_u16_4x4(out); for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + r2 * stride + c, out[r2]); } y += 4 * dy; c += 4; } while (c < bw); } else { int y = dy; int c = 0; do { int r = 0; do { // Fully unroll the 4x4 block to allow us to use immediate lane-indexed // multiply instructions. const uint16x4_t shifts1 = vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); const int base0 = ((y + 0 * dy) >> frac_bits) + r; const int base1 = ((y + 1 * dy) >> frac_bits) + r; const int base2 = ((y + 2 * dy) >> frac_bits) + r; const int base3 = ((y + 3 * dy) >> frac_bits) + r; uint16x8_t out[4]; if (base0 >= max_base_y) { out[0] = vdupq_n_u16(left_max); } else { const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0], l0.val[1], shifts0, shifts1, 0, 6); } if (base1 >= max_base_y) { out[1] = vdupq_n_u16(left_max); } else { const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0], l1.val[1], shifts0, shifts1, 1, 6); } if (base2 >= max_base_y) { out[2] = vdupq_n_u16(left_max); } else { const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0], l2.val[1], shifts0, shifts1, 2, 6); } if (base3 >= max_base_y) { out[3] = vdupq_n_u16(left_max); } else { const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0], l3.val[1], shifts0, shifts1, 3, 6); } transpose_array_inplace_u16_4x8(out); for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); } for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); } r += 8; } while (r < bh); y += 4 * dy; c += 4; } while (c < bw); } } static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *left, int dy) { assert(bw % 4 == 0); assert(bh % 4 == 0); assert(dy > 0); const int max_base_y = (bw + bh - 1) << 1; 
const int left_max = left[max_base_y]; const int frac_bits = 5; const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16)); const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16)); const uint16x4_t iota2x4 = vget_low_u16(iota2x8); // The C implementation of the z3 predictor when upsampling uses: // (((x << 1) & 0x3f) >> 1) // The two shifts are unnecessary here since the lowest bit is guaranteed to // be zero when the mask is applied, so adjust the mask to 0x1f to avoid // needing the shifts at all. const uint16x4_t shift_mask = vdup_n_u16(0x1F); if (bh == 4) { int y = dy; int c = 0; do { // Fully unroll the 4x4 block to allow us to use immediate lane-indexed // multiply instructions. const uint16x4_t shifts1 = vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); const int base0 = (y + 0 * dy) >> frac_bits; const int base1 = (y + 1 * dy) >> frac_bits; const int base2 = (y + 2 * dy) >> frac_bits; const int base3 = (y + 3 * dy) >> frac_bits; const uint16x4x2_t l0 = vld2_u16(left + base0); const uint16x4x2_t l1 = vld2_u16(left + base1); const uint16x4x2_t l2 = vld2_u16(left + base2); const uint16x4x2_t l3 = vld2_u16(left + base3); uint16x4_t out[4]; HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0], l0.val[1], shifts0, shifts1, 0, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0], l1.val[1], shifts0, shifts1, 1, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0], l2.val[1], shifts0, shifts1, 2, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0], l3.val[1], shifts0, shifts1, 3, 5); transpose_array_inplace_u16_4x4(out); for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + r2 * stride + c, out[r2]); } y += 4 * dy; c += 4; } while (c < bw); } else { assert(bh % 8 == 0); int y = dy; int c = 0; do { int r = 0; do { // Fully unroll the 4x8 block to allow us to use immediate lane-indexed // multiply instructions. 
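        // Each step down a column moves two entries through the upsampled
        // |left| array, hence the r * 2 term in the base offsets below
        // (the non-upsampled path above adds r instead).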
const uint16x4_t shifts1 = vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2); const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2); const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2); const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2); const uint16x8x2_t l0 = vld2q_u16(left + base0); const uint16x8x2_t l1 = vld2q_u16(left + base1); const uint16x8x2_t l2 = vld2q_u16(left + base2); const uint16x8x2_t l3 = vld2q_u16(left + base3); uint16x8_t out[4]; HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0], l0.val[1], shifts0, shifts1, 0, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0], l1.val[1], shifts0, shifts1, 1, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0], l2.val[1], shifts0, shifts1, 2, 5); HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0], l3.val[1], shifts0, shifts1, 3, 5); transpose_array_inplace_u16_4x8(out); for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); } for (int r2 = 0; r2 < 4; ++r2) { vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); } r += 8; } while (r < bh); y += 4 * dy; c += 4; } while (c < bw); } } // Directional prediction, zone 3: 180 < angle < 270 void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd) { (void)above; (void)dx; (void)bd; assert(bw % 4 == 0); assert(bh % 4 == 0); assert(dx == 1); assert(dy > 0); if (upsample_left) { highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy); } else { highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy); } } #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4 #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8 aom-3.12.1/aom_dsp/arm/highbd_loopfilter_neon.c000066400000000000000000001551361477627663500214640ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#include <arm_neon.h>

#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "mem_neon.h"

static inline int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
                                  const int16x4_t high) {
  return vmin_s16(vmax_s16(val, low), high);
}

static inline uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val,
                                                       int bitdepth) {
  const int16x8_t low = vdupq_n_s16(0);
  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
}

// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
static inline uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1,
                             const uint16_t thresh) {
  const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
  return vorr_u16(vget_low_u16(a), vget_high_u16(a));
}

// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
static inline uint16x4_t outer_threshold(const uint16x4_t p1,
                                         const uint16x4_t p0,
                                         const uint16x4_t q0,
                                         const uint16x4_t q1,
                                         const uint16_t outer_thresh) {
  const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
  const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
  const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
  const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
  const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
  return vcle_u16(sum, vdup_n_u16(outer_thresh));
}

// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// outer_threshold()
static inline uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1,
                                       const uint16_t inner_thresh,
                                       const uint16x4_t outer_mask) {
  const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
  const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
  return vand_u16(inner_mask, outer_mask);
}

// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
// outer_threshold()
static inline uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1,
                                       const uint16x8_t abd_p1p2_q1q2,
                                       const uint16_t inner_thresh,
                                       const uint16x4_t outer_mask) {
  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
  const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
  const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
  return vand_u16(inner_mask, outer_mask);
}

// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
// outer_threshold()
static inline uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1,
                                       const uint16x8_t abd_p1p2_q1q2,
                                       const uint16x8_t abd_p2p3_q2q3,
                                       const uint16_t inner_thresh,
                                       const uint16x4_t outer_mask) {
  const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
  const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
  const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
  const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
  return vand_u16(inner_mask, outer_mask);
}

// -----------------------------------------------------------------------------
// filterN_masks functions.

static inline void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
                                 const uint16_t hev_thresh,
                                 const uint16x4_t outer_mask,
                                 const uint16_t inner_thresh,
                                 uint16x4_t *const hev_mask,
                                 uint16x4_t *const needs_filter4_mask) {
  const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
  // This includes cases where needs_filter4() is not true and so filter2()
  // will not be applied.
const uint16x4_t hev_tmp_mask = hev(p0p1_q0q1, hev_thresh); *needs_filter4_mask = needs_filter4(p0p1_q0q1, inner_thresh, outer_mask); // filter2() will only be applied if both needs_filter4() and hev() are true. *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); } // abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && // abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh // |flat_thresh| == 4 for 10 bit decode. static inline uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1, const uint16x8_t abd_p0p2_q0q2, const int bitdepth) { const int flat_thresh = 1 << (bitdepth - 8); const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); return vand_u16(vget_low_u16(b), vget_high_u16(b)); } static inline void filter6_masks( const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, const uint16_t hev_thresh, const uint16x4_t outer_mask, const uint16_t inner_thresh, const int bitdepth, uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask, uint16x4_t *const hev_mask) { const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); *hev_mask = hev(abd_p0p1_q0q1, hev_thresh); *is_flat3_mask = is_flat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), bitdepth); *needs_filter6_mask = needs_filter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), inner_thresh, outer_mask); } // is_flat4 uses N=1, IsFlatOuter4 uses N=4. // abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && // abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && // abs(p[N+2] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh // |flat_thresh| == 4 for 10 bit decode. static inline uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0, const uint16x8_t abd_pn1p0_qn1q0, const uint16x8_t abd_pn2p0_qn2q0, const int bitdepth) { const int flat_thresh = 1 << (bitdepth - 8); const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); return vand_u16(vget_low_u16(c), vget_high_u16(c)); } static inline void filter8_masks( const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, const uint16_t hev_thresh, const uint16x4_t outer_mask, const uint16_t inner_thresh, const int bitdepth, uint16x4_t *const needs_filter8_mask, uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) { const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); *hev_mask = hev(abd_p0p1_q0q1, hev_thresh); const uint16x4_t v_is_flat4 = is_flat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), vabdq_u16(p0q0, p3q3), bitdepth); *needs_filter8_mask = needs_filter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), inner_thresh, outer_mask); // |is_flat4_mask| is used to decide where to use the result of filter8. // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, // overriding the question of whether to use filter8. Because filter4 doesn't // apply to p2q2, |is_flat4_mask| chooses directly between filter8 and the // source value. To be correct, the mask must account for this override. *is_flat4_mask = vand_u16(v_is_flat4, *needs_filter8_mask); } // ----------------------------------------------------------------------------- // filterN functions. // Calculate filter4() or filter2() based on |hev_mask|. 
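// For orientation, a rough scalar equivalent of what filter4() computes below
// (helper names here are illustrative, not taken from this file):
//   a   = 3 * (q0 - p0) + (hev ? clamp_signed(p1 - q1) : 0);
//   f1  = clamp_signed(a + 4) >> 3;  // subtracted from q0
//   f2  = clamp_signed(a + 3) >> 3;  // added to p0
//   a3  = (f1 + 1) >> 1;             // p1/q1 adjustment, used only when !hev
//   p0' = clamp_pixel(p0 + f2);  q0' = clamp_pixel(q0 - f1);
//   p1' = clamp_pixel(p1 + a3);  q1' = clamp_pixel(q1 - a3);
// where clamp_signed() saturates to the signed pixel range for the bitdepth
// and clamp_pixel() to [0, (1 << bd) - 1]. The !hev selection for p1/q1 is
// applied by the callers via |hev_mask|.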
static inline void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, const uint16x8_t p1q1, const uint16x4_t hev_mask, int bitdepth, uint16x8_t *const p1q1_result, uint16x8_t *const p0q0_result) { const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); // q0mp0 means "q0 minus p0". const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); // If this is for filter2() then include |p1mq1|. Otherwise zero it. const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1))); const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1); const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); const int16x4_t p1mq1_saturated = clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel); const int16x4_t hev_option = vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); const int16x4_t a = vadd_s16(q0mp0_3, hev_option); // Need to figure out what's going on here because there are some unnecessary // tricks to accommodate 8x8 as smallest 8bpp vector // We can not shift with rounding because the clamp comes *before* the // shifting. a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; const int16x4_t plus_four = clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); const int16x4_t plus_three = clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); const int16x4_t a1 = vshr_n_s16(plus_four, 3); const int16x4_t a2 = vshr_n_s16(plus_three, 3); // a3 = (a1 + 1) >> 1; const int16x4_t a3 = vrshr_n_s16(a1, 1); const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); // Need to shift the second term or we end up with a2_ma2. const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth); *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth); } void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { uint16x4_t src[4]; load_u16_4x4(s - 2 * pitch, pitch, &src[0], &src[1], &src[2], &src[3]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[0], src[1], src[2], src[3], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter4_mask; const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter4_mask_8 = vcombine_u16(needs_filter4_mask, needs_filter4_mask); uint16x8_t f_p1q1; uint16x8_t f_p0q0; const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0); // Already integrated the hev mask when calculating the filtered values. 
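  // vbslq_u16(mask, a, b) takes bits from |a| where |mask| is set and from |b|
  // elsewhere, so lanes that failed needs_filter4() keep their source pixels.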
const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); // p1/q1 are unmodified if only hev() is true. This works because it was and'd // with |needs_filter4_mask| previously. const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_4_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { // Offset by 2 uint16_t values to load from first p1 position. uint16x4_t src[4]; load_u16_4x4(s - 2, pitch, &src[0], &src[1], &src[2], &src[3]); transpose_array_inplace_u16_4x4(src); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[0], src[1], src[2], src[3], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter4_mask; const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, &needs_filter4_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter4_mask), 0) == 0) { // None of the values will be filtered. return; } // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter4_mask_8 = vcombine_u16(needs_filter4_mask, needs_filter4_mask); uint16x8_t f_p1q1; uint16x8_t f_p0q0; const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0); // Already integrated the hev mask when calculating the filtered values. const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); // p1/q1 are unmodified if only hev() is true. This works because it was and'd // with |needs_filter4_mask| previously. const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); uint16x4_t output[4] = { vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), }; transpose_array_inplace_u16_4x4(output); store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_4_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } static inline void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p1q1_output, uint16x8_t *const p0q0_output) { // Sum p1 and q1 output from opposite directions. // The formula is regrouped to allow 3 doubling operations to be combined. 
// // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 // ^^^^^^^^ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) // ^^^^^^^^ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^^^^^^ uint16x8_t sum = vaddq_u16(p2q2, p1q1); // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^ sum = vaddq_u16(sum, p0q0); // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^ ^^^^^^ // Should dual issue with the left shift. const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 // ^^^^^^^^^^^ ^^^^ sum = vmlaq_n_u16(outer_sum, sum, 2); *p1q1_output = vrshrq_n_u16(sum, 3); // Convert to p0 and q0 output: // p0 = p1 - (2 * p2) + q0 + q1 // q0 = q1 - (2 * q2) + p0 + p1 // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 // ^^^^^^^^^^^^^^^^^ sum = vmlsq_n_u16(sum, p2q2, 2); const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); *p0q0_output = vrshrq_n_u16(sum, 3); } void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { uint16x4_t src[6]; load_u16_4x6(s - 3 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], &src[4], &src[5]); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[1], src[2], src[3], src[4], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat3_mask; const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } uint16x8_t p0q0_output, p1q1_output; uint16x8_t f6_p1q1, f6_p0q0; // Not needing filter4() at all is a very common case, so isolate it to avoid // needlessly computing filter4(). if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); p1q1_output = f6_p1q1; p0q0_output = f6_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or // filter6. Therefore if it is false when |needs_filter_mask| is true, // filter6 output is not used. const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); if (vget_lane_u64(need_filter6, 0) == 0) { // filter6() does not apply, but filter4() applies to one or more values. 
p0q0_output = p0q0; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } store_u16_4x4(s - 2 * pitch, pitch, vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output)); } void aom_highbd_lpf_horizontal_6_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { // Overread by 2 values. These overreads become the high halves of src_raw[2] // and src_raw[3] after transpose. uint16x8_t src_raw[4]; load_u16_8x4(s - 3, pitch, &src_raw[0], &src_raw[1], &src_raw[2], &src_raw[3]); transpose_array_inplace_u16_4x8(src_raw); // p2, p1, p0, q0, q1, q2 const uint16x4_t src[6] = { vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), }; // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[1], src[2], src[3], src[4], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat3_mask; const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat3_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } uint16x8_t p0q0_output, p1q1_output; // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6 // output is not used. uint16x8_t f6_p1q1, f6_p0q0; const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); // Not needing filter4() at all is a very common case, so isolate it to avoid // needlessly computing filter4(). if (vget_lane_s64(vreinterpret_s64_u16(is_flat3_mask), 0) == -1 && vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); p1q1_output = f6_p1q1; p0q0_output = f6_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; // ZIP1 p0q0, p1q1 may perform better here. 
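    // |p0q1| pairs p0 with q1 so that filter4() can form both (q0 - p0) and
    // (p1 - q1) with a single vector subtraction against {q0, p1}.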
const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); if (vget_lane_u64(need_filter6, 0) == 0) { // filter6() does not apply, but filter4() applies to one or more values. p0q0_output = p0q0; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } uint16x4_t output[4] = { vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), }; transpose_array_inplace_u16_4x4(output); store_u16_4x4(s - 2, pitch, output[0], output[1], output[2], output[3]); } void aom_highbd_lpf_vertical_6_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } static inline void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output, uint16x8_t *const p0q0_output) { // Sum p2 and q2 output from opposite directions. // The formula is regrouped to allow 2 doubling operations to be combined. // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 // ^^^^^^^^ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) // ^^^^^^^^ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^^^^^^ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); // Add two other terms to make dual issue with shift more likely. // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^^^^^^ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^ ^^^^^^^^^^^^^ uint16x8_t sum = vmlaq_n_u16(p01q01, p23q23, 2); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^ sum = vaddq_u16(sum, p3q3); // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 // ^^^^^^ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); sum = vaddq_u16(sum, q0p0); *p2q2_output = vrshrq_n_u16(sum, 3); // Convert to p1 and q1 output: // p1 = p2 - p3 - p2 + p1 + q1 // q1 = q2 - q3 - q2 + q0 + p1 sum = vsubq_u16(sum, p23q23); const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); *p1q1_output = vrshrq_n_u16(sum, 3); // Convert to p0 and q0 output: // p0 = p1 - p3 - p1 + p0 + q2 // q0 = q1 - q3 - q1 + q0 + p2 sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4); sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); *p0q0_output = vrshrq_n_u16(sum, 3); } void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { uint16x4_t src[8]; load_u16_4x8(s - 4 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], &src[4], &src[5], &src[6], &src[7]); // Adjust thresholds to bitdepth. 
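  // The 8-bit limit values are rescaled to the working bitdepth, e.g. a limit
  // of 16 becomes 16 << 2 = 64 for bd == 10.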
const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[2], src[3], src[4], src[5], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat4_mask; const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } uint16x8_t p0q0_output, p1q1_output, p2q2_output; uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; // Not needing filter4() at all is a very common case, so isolate it to avoid // needlessly computing filter4(). if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p2q2_output = f8_p2q2; p1q1_output = f8_p1q1; p0q0_output = f8_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or // filter8. Therefore if it is false when |needs_filter_mask| is true, // filter8 output is not used. const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); if (vget_lane_u64(need_filter8, 0) == 0) { // filter8() does not apply, but filter4() applies to one or more values. 
p2q2_output = p2q2; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { const uint16x8_t is_flat4_mask_8 = vcombine_u16(is_flat4_mask, is_flat4_mask); filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } store_u16_4x6(s - 3 * pitch, pitch, vget_low_u16(p2q2_output), vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), vget_high_u16(p2q2_output)); } void aom_highbd_lpf_horizontal_8_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } static inline uint16x8_t reverse_low_half(const uint16x8_t a) { return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); } void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. // To get desired pairs after transpose, one half should be reversed. uint16x8_t src[4]; load_u16_8x4(s - 4, pitch, &src[0], &src[1], &src[2], &src[3]); // src[0] = p0q0 // src[1] = p1q1 // src[2] = p2q2 // src[3] = p3q3 loop_filter_transpose_u16_4x8q(src); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold( vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), vget_high_u16(src[1]), outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat4_mask; const uint16x8_t p0q0 = src[0]; const uint16x8_t p1q1 = src[1]; const uint16x8_t p2q2 = src[2]; const uint16x8_t p3q3 = src[3]; filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } uint16x8_t p0q0_output, p1q1_output, p2q2_output; uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; // Not needing filter4() at all is a very common case, so isolate it to avoid // needlessly computing filter4(). if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && vget_lane_s64(vreinterpret_s64_u16(needs_filter_mask), 0) == -1) { filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p2q2_output = f8_p2q2; p1q1_output = f8_p1q1; p0q0_output = f8_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. 
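    // The masks were computed on 4 lanes (one per pixel along the edge);
    // duplicating them into both halves lets one vbslq_u16 select the p and q
    // halves of the combined p/q vectors in a single operation.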
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or // filter8. Therefore if it is false when |needs_filter_mask| is true, // filter8 output is not used. const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); if (vget_lane_u64(need_filter8, 0) == 0) { // filter8() does not apply, but filter4() applies to one or more values. p2q2_output = p2q2; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { const uint16x8_t is_flat4_mask_8 = vcombine_u16(is_flat4_mask, is_flat4_mask); filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 }; // After transpose, |output| will contain rows of the form: // p0 p1 p2 p3 q0 q1 q2 q3 transpose_array_inplace_u16_4x8(output); // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 store_u16_8x4(s - 4, pitch, reverse_low_half(output[0]), reverse_low_half(output[1]), reverse_low_half(output[2]), reverse_low_half(output[3])); } void aom_highbd_lpf_vertical_8_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } static inline void filter14( const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4, const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, uint16x8_t *const p5q5_output, uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output, uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output, uint16x8_t *const p0q0_output) { // Sum p5 and q5 output from opposite directions. 
// p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^^ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^^^^^^^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^^^^^^^^^^^^^ const uint16x8_t p45q45 = vaddq_u16(p5q5, p4q4); uint16x8_t sum = vmlaq_n_u16(p6q6_x7, p45q45, 2); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^^^^^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^^^^^^ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 // ^^ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) // ^^ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); sum = vaddq_u16(sum, q0p0); *p5q5_output = vrshrq_n_u16(sum, 4); // Convert to p4 and q4 output: // p4 = p5 - (2 * p6) + p3 + q1 // q4 = q5 - (2 * q6) + q3 + p1 sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); *p4q4_output = vrshrq_n_u16(sum, 4); // Convert to p3 and q3 output: // p3 = p4 - p6 - p5 + p2 + q2 // q3 = q4 - q6 - q5 + q2 + p2 sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4); sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); *p3q3_output = vrshrq_n_u16(sum, 4); // Convert to p2 and q2 output: // p2 = p3 - p6 - p4 + p1 + q3 // q2 = q3 - q6 - q4 + q1 + p3 sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4); sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); *p2q2_output = vrshrq_n_u16(sum, 4); // Convert to p1 and q1 output: // p1 = p2 - p6 - p3 + p0 + q4 // q1 = q2 - q6 - q3 + q0 + p4 sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4); sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); *p1q1_output = vrshrq_n_u16(sum, 4); // Convert to p0 and q0 output: // p0 = p1 - p6 - p2 + q0 + q5 // q0 = q1 - q6 - q2 + p0 + p5 sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4); sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); *p0q0_output = vrshrq_n_u16(sum, 4); } void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { uint16x4_t src[14]; load_u16_4x14(s - 7 * pitch, pitch, &src[0], &src[1], &src[2], &src[3], &src[4], &src[5], &src[6], &src[7], &src[8], &src[9], &src[10], &src[11], &src[12], &src[13]); // Adjust thresholds to bitdepth. 
const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold(src[5], src[6], src[7], src[8], outer_thresh); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat4_mask; const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); // Mask to choose between the outputs of filter8 and filter14. // As with the derivation of |is_flat4_mask|, the question of whether to use // filter14 is only raised where |is_flat4_mask| is true. const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { // filter14() applies to all values. filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); p5q5_output = f14_p5q5; p4q4_output = f14_p4q4; p3q3_output = f14_p3q3; p2q2_output = f14_p2q2; p1q1_output = f14_p1q1; p0q0_output = f14_p0q0; } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { // filter8() applies to all values. filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = f8_p2q2; p1q1_output = f8_p1q1; p0q0_output = f8_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or // filter8. Therefore if it is false when |needs_filter_mask| is true, // filter8 output is not used. const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); if (vget_lane_u64(need_filter8, 0) == 0) { // filter8() and filter14() do not apply, but filter4() applies to one or // more values. 
p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = p2q2; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { const uint16x8_t use_filter8_mask = vcombine_u16(is_flat4_mask, is_flat4_mask); filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); if (vget_lane_u64(need_filter14, 0) == 0) { // filter14() does not apply, but filter8() and filter4() apply to one // or more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } else { // All filters may contribute values to final outputs. const uint16x8_t use_filter14_mask = vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } } store_u16_4x12(s - 6 * pitch, pitch, vget_low_u16(p5q5_output), vget_low_u16(p4q4_output), vget_low_u16(p3q3_output), vget_low_u16(p2q2_output), vget_low_u16(p1q1_output), vget_low_u16(p0q0_output), vget_high_u16(p0q0_output), vget_high_u16(p1q1_output), vget_high_u16(p2q2_output), vget_high_u16(p3q3_output), vget_high_u16(p4q4_output), vget_high_u16(p5q5_output)); } void aom_highbd_lpf_horizontal_14_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); } static inline uint16x8x2_t permute_acdb64(const uint16x8_t ab, const uint16x8_t cd) { uint16x8x2_t acdb; #if AOM_ARCH_AARCH64 // a[b] <- [c]d acdb.val[0] = vreinterpretq_u16_u64( vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); // [a]b <- c[d] acdb.val[1] = vreinterpretq_u16_u64( vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); #else // a[b] <- [c]d acdb.val[0] = vreinterpretq_u16_u64( vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), vreinterpretq_u64_u16(ab), 1)); // [a]b <- c[d] acdb.val[1] = vreinterpretq_u16_u64( vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), vreinterpretq_u64_u16(ab), 0)); #endif // AOM_ARCH_AARCH64 return acdb; } void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, 
int bd) { // Low halves: p7 p6 p5 p4 // High halves: p3 p2 p1 p0 uint16x8_t src_p[4]; load_u16_8x4(s - 8, pitch, &src_p[0], &src_p[1], &src_p[2], &src_p[3]); // p7 will be the low half of src_p[0]. Not used until the end. transpose_array_inplace_u16_4x8(src_p); // Low halves: q0 q1 q2 q3 // High halves: q4 q5 q6 q7 uint16x8_t src_q[4]; load_u16_8x4(s, pitch, &src_q[0], &src_q[1], &src_q[2], &src_q[3]); // q7 will be the high half of src_q[3]. Not used until the end. transpose_array_inplace_u16_4x8(src_q); // Adjust thresholds to bitdepth. const int outer_thresh = *blimit << (bd - 8); const int inner_thresh = *limit << (bd - 8); const int hev_thresh = *thresh << (bd - 8); const uint16x4_t outer_mask = outer_threshold( vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), vget_low_u16(src_q[1]), outer_thresh); const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); uint16x4_t hev_mask; uint16x4_t needs_filter_mask; uint16x4_t is_flat4_mask; filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); if (vget_lane_u64(vreinterpret_u64_u16(needs_filter_mask), 0) == 0) { // None of the values will be filtered. return; } const uint16x8_t p4q4 = vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); const uint16x8_t p5q5 = vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); const uint16x8_t p6q6 = vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); const uint16x8_t p7q7 = vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); // Mask to choose between the outputs of filter8 and filter14. // As with the derivation of |is_flat4_mask|, the question of whether to use // filter14 is only raised where |is_flat4_mask| is true. const uint16x4_t is_flat4_outer_mask = vand_u16( is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), vabdq_u16(p0q0, p6q6), bd)); uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, p5q5_output; uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_outer_mask), 0) == -1) { // filter14() applies to all values. filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); p5q5_output = f14_p5q5; p4q4_output = f14_p4q4; p3q3_output = f14_p3q3; p2q2_output = f14_p2q2; p1q1_output = f14_p1q1; p0q0_output = f14_p0q0; } else if (vget_lane_s64(vreinterpret_s64_u16(is_flat4_mask), 0) == -1 && vget_lane_u64(vreinterpret_u64_u16(is_flat4_outer_mask), 0) == 0) { // filter8() applies to all values. filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = f8_p2q2; p1q1_output = f8_p1q1; p0q0_output = f8_p0q0; } else { // Copy the masks to the high bits for packed comparisons later. 
const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); const uint16x8_t needs_filter_mask_8 = vcombine_u16(needs_filter_mask, needs_filter_mask); uint16x8_t f4_p1q1; uint16x8_t f4_p0q0; const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); // Because we did not return after testing |needs_filter_mask| we know it is // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or // filter8. Therefore if it is false when |needs_filter_mask| is true, // filter8 output is not used. const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); if (vget_lane_u64(need_filter8, 0) == 0) { // filter8() and filter14() do not apply, but filter4() applies to one or // more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = p2q2; p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); } else { const uint16x8_t use_filter8_mask = vcombine_u16(is_flat4_mask, is_flat4_mask); filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); if (vget_lane_u64(need_filter14, 0) == 0) { // filter14() does not apply, but filter8() and filter4() apply to one // or more values. p5q5_output = p5q5; p4q4_output = p4q4; p3q3_output = p3q3; p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } else { // All filters may contribute values to final outputs. 
const uint16x8_t use_filter14_mask = vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); } } } // To get the correctly ordered rows from the transpose, we need: // p7p3 p6p2 p5p1 p4p0 // q0q4 q1q5 q2q6 q3q7 const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output); const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output); const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output); const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output); uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0], p5p1_q1q5.val[0], p4p0_q0q4.val[0] }; uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1], p6p2_q2q6.val[1], p7p3_q3q7.val[1] }; transpose_array_inplace_u16_4x8(output_p); transpose_array_inplace_u16_4x8(output_q); // Reverse p values to produce original order: // p3 p2 p1 p0 q0 q1 q2 q3 store_u16_8x4(s - 8, pitch, output_p[0], output_p[1], output_p[2], output_p[3]); store_u16_8x4(s, pitch, output_q[0], output_q[1], output_q[2], output_q[3]); } void aom_highbd_lpf_vertical_14_dual_neon( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } aom-3.12.1/aom_dsp/arm/highbd_masked_sad_neon.c000066400000000000000000000311101477627663500213610ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/blend.h"

static inline uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
                                             const uint16_t *src,
                                             const uint16_t *a,
                                             const uint16_t *b,
                                             const uint8_t *m) {
  const uint16x8_t s0 = vld1q_u16(src);
  const uint16x8_t a0 = vld1q_u16(a);
  const uint16x8_t b0 = vld1q_u16(b);
  const uint16x8_t m0 = vmovl_u8(vld1_u8(m));

  uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);

  return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
}

static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
                                              const uint16_t *src,
                                              const uint16_t *a,
                                              const uint16_t *b,
                                              const uint8_t *m) {
  sad = masked_sad_8x1_neon(sad, src, a, b, m);
  return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
}

static inline uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
                                              const uint16_t *src,
                                              const uint16_t *a,
                                              const uint16_t *b,
                                              const uint8_t *m) {
  sad = masked_sad_16x1_neon(sad, src, a, b, m);
  return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
}

static inline unsigned int masked_sad_128xh_large_neon(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
    int height) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };

  do {
    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
    for (int h = 0; h < 4; ++h) {
      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
      sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
      sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);

      src += src_stride;
      a += a_stride;
      b += b_stride;
      m += m_stride;
    }

    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
    sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
    sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);

    height -= 4;
  } while (height != 0);

  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
  sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
  sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);

  return horizontal_add_u32x4(sad_u32[0]);
}

static inline unsigned int masked_sad_64xh_large_neon(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
    int height) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };

  do {
    uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
    for (int h = 0; h < 4; ++h) {
      sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
      sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);

      src += src_stride;
      a += a_stride;
      b += b_stride;
      m += m_stride;
    }

    sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
    sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);

    height -= 4;
  } while (height != 0);

  return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
}

static inline unsigned int masked_sad_32xh_large_neon(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
    int height) {
  const uint16_t *src =
CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint32x4_t sad_u32 = vdupq_n_u32(0); do { uint16x8_t sad = vdupq_n_u16(0); for (int h = 0; h < 4; ++h) { sad = masked_sad_32x1_neon(sad, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } sad_u32 = vpadalq_u16(sad_u32, sad); height -= 4; } while (height != 0); return horizontal_add_u32x4(sad_u32); } static inline unsigned int masked_sad_16xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint32x4_t sad_u32 = vdupq_n_u32(0); do { uint16x8_t sad_u16 = vdupq_n_u16(0); for (int h = 0; h < 8; ++h) { sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } sad_u32 = vpadalq_u16(sad_u32, sad_u16); height -= 8; } while (height != 0); return horizontal_add_u32x4(sad_u32); } #if !CONFIG_REALTIME_ONLY static inline unsigned int masked_sad_8xh_large_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint32x4_t sad_u32 = vdupq_n_u32(0); do { uint16x8_t sad_u16 = vdupq_n_u16(0); for (int h = 0; h < 16; ++h) { sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } sad_u32 = vpadalq_u16(sad_u32, sad_u16); height -= 16; } while (height != 0); return horizontal_add_u32x4(sad_u32); } #endif // !CONFIG_REALTIME_ONLY static inline unsigned int masked_sad_16xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { // For 12-bit data, we can only accumulate up to 128 elements in the // uint16x8_t type sad accumulator, so we can only process up to 8 rows // before we have to accumulate into 32-bit elements. assert(height <= 8); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint16x8_t sad = vdupq_n_u16(0); do { sad = masked_sad_16x1_neon(sad, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } while (--height != 0); return horizontal_add_u16x8(sad); } static inline unsigned int masked_sad_8xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { // For 12-bit data, we can only accumulate up to 128 elements in the // uint16x8_t type sad accumulator, so we can only process up to 16 rows // before we have to accumulate into 32-bit elements. 
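// (Worked example: with 12-bit input the largest per-element absolute
// difference is 4095, and each 8-wide row adds one element to every uint16
// lane, so 16 rows accumulate at most 16 * 4095 = 65520 per lane, just below
// the UINT16_MAX limit of 65535.)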
assert(height <= 16); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint16x8_t sad = vdupq_n_u16(0); do { sad = masked_sad_8x1_neon(sad, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } while (--height != 0); return horizontal_add_u16x8(sad); } static inline unsigned int masked_sad_4xh_small_neon( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int height) { // For 12-bit data, we can only accumulate up to 64 elements in the // uint16x4_t type sad accumulator, so we can only process up to 16 rows // before we have to accumulate into 32-bit elements. assert(height <= 16); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); uint16x4_t sad = vdup_n_u16(0); do { uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m))); uint16x4_t a0 = load_unaligned_u16_4x1(a); uint16x4_t b0 = load_unaligned_u16_4x1(b); uint16x4_t s0 = load_unaligned_u16_4x1(src); uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0); sad = vadd_u16(sad, vabd_u16(blend_u16, s0)); src += src_stride; a += a_stride; b += b_stride; m += m_stride; } while (--height != 0); return horizontal_add_u16x4(sad); } #define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \ second_pred, w, msk, msk_stride, \ h); \ else \ return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \ ref, ref_stride, msk, msk_stride, \ h); \ } #define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \ second_pred, w, msk, msk_stride, \ h); \ else \ return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \ ref, ref_stride, msk, msk_stride, \ h); \ } HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32) HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8) HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_obmc_sad_neon.c000066400000000000000000000167411477627663500210520ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open 
Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref, const int32_t *mask, const int32_t *wsrc, uint32x4_t *sum) { int16x8_t ref_s16 = vreinterpretq_s16_u16(ref); int32x4_t wsrc_lo = vld1q_s32(wsrc); int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); int32x4_t mask_lo = vld1q_s32(mask); int32x4_t mask_hi = vld1q_s32(mask + 4); int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi)); int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16)); int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16)); uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo)); uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi)); *sum = vrsraq_n_u32(*sum, abs_lo, 12); *sum = vrsraq_n_u32(*sum, abs_hi, 12); } static inline unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int height) { const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); uint32x4_t sum = vdupq_n_u32(0); int h = height / 2; do { uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum); ref_ptr += 2 * ref_stride; wsrc += 8; mask += 8; } while (--h != 0); return horizontal_add_u32x4(sum); } static inline unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int height) { const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); uint32x4_t sum = vdupq_n_u32(0); do { uint16x8_t r = vld1q_u16(ref_ptr); highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum); ref_ptr += ref_stride; wsrc += 8; mask += 8; } while (--height != 0); return horizontal_add_u32x4(sum); } static inline unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; do { int i = 0; do { uint16x8_t r0 = vld1q_u16(ref_ptr + i); highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]); uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8); highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]); wsrc += 16; mask += 16; i += 16; } while (i < width); ref_ptr += ref_stride; } while (--height != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } static inline unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h); } static inline unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int height) { uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); do { uint16x8_t r0 = vld1q_u16(ref_ptr); uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
uint16x8_t r2 = vld1q_u16(ref_ptr + 16); uint16x8_t r3 = vld1q_u16(ref_ptr + 24); highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]); highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]); highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]); highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]); wsrc += 32; mask += 32; ref_ptr += ref_stride; } while (--height != 0); sum[0] = vaddq_u32(sum[0], sum[1]); sum[2] = vaddq_u32(sum[2], sum[3]); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2])); } static inline unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h); } static inline unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h); } #define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \ unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ const int32_t *mask) { \ return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \ } HIGHBD_OBMC_SAD_WXH_NEON(4, 4) HIGHBD_OBMC_SAD_WXH_NEON(4, 8) HIGHBD_OBMC_SAD_WXH_NEON(8, 4) HIGHBD_OBMC_SAD_WXH_NEON(8, 8) HIGHBD_OBMC_SAD_WXH_NEON(8, 16) HIGHBD_OBMC_SAD_WXH_NEON(16, 8) HIGHBD_OBMC_SAD_WXH_NEON(16, 16) HIGHBD_OBMC_SAD_WXH_NEON(16, 32) HIGHBD_OBMC_SAD_WXH_NEON(32, 16) HIGHBD_OBMC_SAD_WXH_NEON(32, 32) HIGHBD_OBMC_SAD_WXH_NEON(32, 64) HIGHBD_OBMC_SAD_WXH_NEON(64, 32) HIGHBD_OBMC_SAD_WXH_NEON(64, 64) HIGHBD_OBMC_SAD_WXH_NEON(64, 128) HIGHBD_OBMC_SAD_WXH_NEON(128, 64) HIGHBD_OBMC_SAD_WXH_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_OBMC_SAD_WXH_NEON(4, 16) HIGHBD_OBMC_SAD_WXH_NEON(8, 32) HIGHBD_OBMC_SAD_WXH_NEON(16, 4) HIGHBD_OBMC_SAD_WXH_NEON(16, 64) HIGHBD_OBMC_SAD_WXH_NEON(32, 8) HIGHBD_OBMC_SAD_WXH_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_obmc_variance_neon.c000066400000000000000000000343361477627663500220730ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre, const int32_t *wsrc, const int32_t *mask, uint32x4_t *sse, int32x4_t *sum) { int16x8_t pre_s16 = vreinterpretq_s16_u16(pre); int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]); int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]); int32x4_t mask_lo = vld1q_s32(&mask[0]); int32x4_t mask_hi = vld1q_s32(&mask[4]); int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi)); int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16)); int32x4_t diff_hi = vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16)); diff_lo = vsubq_s32(wsrc_lo, diff_lo); diff_hi = vsubq_s32(wsrc_hi, diff_hi); // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. // This difference only affects the bit patterns at the rounding breakpoints // exactly, so we can add -1 to all negative numbers to move the breakpoint // one value across and into the correct rounding region. diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31); diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31); int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12); int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12); *sum = vaddq_s32(*sum, round_lo); *sum = vaddq_s32(*sum, round_hi); *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo), vreinterpretq_u32_s32(round_lo)); *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi), vreinterpretq_u32_s32(round_hi)); } // For 12-bit data, we can only accumulate up to 256 elements in the unsigned // 32-bit elements (4095*4095*256 = 4292870400) before we have to accumulate // into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64, 64x128, // 128x64, 128x128 are processed in a different helper function. static inline void highbd_obmc_variance_xlarge_neon( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int h, int h_limit, uint64_t *sse, int64_t *sum) { uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); int32x4_t sum_s32 = vdupq_n_s32(0); uint64x2_t sse_u64 = vdupq_n_u64(0); // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit // accumulator overflows. After hitting this limit we accumulate into 64-bit // elements. int h_tmp = h > h_limit ? 
h_limit : h; do { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int j = 0; do { int i = 0; do { uint16x8_t pre0 = vld1q_u16(pre_ptr + i); highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0], &sum_s32); uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8); highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1], &sum_s32); i += 16; wsrc += 16; mask += 16; } while (i < width); pre_ptr += pre_stride; j++; } while (j < h_tmp); sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]); sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]); h -= h_tmp; } while (h != 0); *sse = horizontal_add_u64x2(sse_u64); *sum = horizontal_long_add_s32x4(sum_s32); } static inline void highbd_obmc_variance_xlarge_neon_128xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse, sum); } static inline void highbd_obmc_variance_xlarge_neon_64xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse, sum); } static inline void highbd_obmc_variance_xlarge_neon_32xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse, sum); } static inline void highbd_obmc_variance_large_neon( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) { uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); uint32x4_t sse_u32 = vdupq_n_u32(0); int32x4_t sum_s32 = vdupq_n_s32(0); do { int i = 0; do { uint16x8_t pre0 = vld1q_u16(pre_ptr + i); highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32); uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8); highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32, &sum_s32); i += 16; wsrc += 16; mask += 16; } while (i < width); pre_ptr += pre_stride; } while (--h != 0); *sse = horizontal_long_add_u32x4(sse_u32); *sum = horizontal_long_add_s32x4(sum_s32); } static inline void highbd_obmc_variance_neon_128xh( const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum); } static inline void highbd_obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum); } static inline void highbd_obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum); } static inline void highbd_obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum); } static inline void highbd_obmc_variance_neon_8xh(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); uint32x4_t sse_u32 = vdupq_n_u32(0); int32x4_t sum_s32 = vdupq_n_s32(0); do { uint16x8_t pre_u16 = vld1q_u16(pre); highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, 
mask, &sse_u32, &sum_s32); pre += pre_stride; wsrc += 8; mask += 8; } while (--h != 0); *sse = horizontal_long_add_u32x4(sse_u32); *sum = horizontal_long_add_s32x4(sum_s32); } static inline void highbd_obmc_variance_neon_4xh(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { assert(h % 2 == 0); uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); uint32x4_t sse_u32 = vdupq_n_u32(0); int32x4_t sum_s32 = vdupq_n_s32(0); do { uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride); highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32); pre += 2 * pre_stride; wsrc += 8; mask += 8; h -= 2; } while (h != 0); *sse = horizontal_long_add_u32x4(sse_u32); *sum = horizontal_long_add_s32x4(sum_s32); } static inline void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)sum64; *sse = (unsigned int)sse64; } static inline void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } static inline void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64, int *sum, unsigned int *sse) { *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } #define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t sum64; \ uint64_t sse64; \ highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \ &sum64); \ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \ } #define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t sum64; \ uint64_t sse64; \ highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \ &sse64, &sum64); \ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \ } // 8-bit HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8) HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8) // 10-bit HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10) 
HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10) HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10) // 12-bit HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12) HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12) HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12) aom-3.12.1/aom_dsp/arm/highbd_quantize_neon.c000066400000000000000000000453111477627663500211360ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/quantize.h" static inline uint32_t sum_abs_coeff(const uint32x4_t a) { #if AOM_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b)); return (uint32_t)vget_lane_u64(c, 0); #endif } static inline uint16x4_t quantize_4( const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32, int32x4_t v_quant_shift_s32, int log_scale) { const int32x4_t v_coeff = vld1q_s32(coeff_ptr); const int32x4_t v_coeff_sign = vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); // if (abs_coeff < zbins[rc != 0]), const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32); const int32x4_t v_log_scale = vdupq_n_s32(log_scale); // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); // const int32_t tmpw32 = tmp * wt; const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS))); // const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16); const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32); // const int32_t tmp3 = // ((((tmp2 + tmpw32)<< log_scale) * (int64_t)(quant_shift << 15)) >> 32); const int32x4_t v_tmp3 = vqdmulhq_s32( vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32); // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0; const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask), vshrq_n_s32(v_tmp3, AOM_QM_BITS)); // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale; // vshlq_s32 will shift right if shift value is negative. const int32x4_t v_abs_dqcoeff = vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale)); // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const int32x4_t v_qcoeff = vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); const int32x4_t v_dqcoeff = vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); vst1q_s32(qcoeff_ptr, v_qcoeff); vst1q_s32(dqcoeff_ptr, v_dqcoeff); // Used to find eob. 
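// The comparison below marks lanes whose quantized coefficient is nonzero;
// the narrowed 16-bit mask is returned to the caller, which combines it with
// the iscan values (see get_max_lane_eob) to track the end-of-block position.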
const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0)); return vmovn_u32(nz_qcoeff_mask); } static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); return vmaxq_s16(v_eobmax, v_nz_iscan); } #if !CONFIG_REALTIME_ONLY static inline void get_min_max_lane_eob(const int16_t *iscan, int16x8_t *v_eobmin, int16x8_t *v_eobmax, uint16x8_t v_mask, intptr_t n_coeffs) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1)); #if SKIP_EOB_FACTOR_ADJUST const int16x8_t v_nz_iscan_min = vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs)); *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min); #else (void)v_eobmin; #endif *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max); } #endif // !CONFIG_REALTIME_ONLY static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); #endif } #if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY static inline uint16_t get_min_eob(int16x8_t v_eobmin) { #if AOM_ARCH_AARCH64 return (uint16_t)vminvq_s16(v_eobmin); #else const int16x4_t v_eobmin_3210 = vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin)); const int64x1_t v_eobmin_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmin_3210), 32); const int16x4_t v_eobmin_tmp = vmin_s16(v_eobmin_3210, vreinterpret_s16_s64(v_eobmin_xx32)); const int64x1_t v_eobmin_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmin_tmp), 16); const int16x4_t v_eobmin_final = vmin_s16(v_eobmin_tmp, vreinterpret_s16_s64(v_eobmin_xxx3)); return (uint16_t)vget_lane_s16(v_eobmin_final, 0); #endif } #endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY static void highbd_quantize_b_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale) { (void)scan; const int16x4_t v_quant = vld1_s16(quant_ptr); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); const int16x4_t v_round_no_scale = vld1_s16(round_ptr); const int16x4_t v_round_log_scale = vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); const int16x4_t v_round = vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr); const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr); const int16x4_t v_zbin_log_scale = vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale))); const int16x4_t v_zbin = vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale); int32x4_t v_round_s32 = 
vmovl_s16(v_round); int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15); int32x4_t v_dequant_s32 = vmovl_s16(v_dequant); int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15); int32x4_t v_zbin_s32 = vmovl_s16(v_zbin); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); intptr_t non_zero_count = n_coeffs; assert(n_coeffs > 8); // Pre-scan pass const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); intptr_t i = n_coeffs; do { const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4); const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8); const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a); const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b); const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x); const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x); // If the coefficient is in the base ZBIN range, then discard. if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) { non_zero_count -= 8; } else { break; } i -= 8; } while (i > 0); const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; memset(qcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*dqcoeff_ptr)); // DC and first 3 AC v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); // overwrite the DC constants with AC constants v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1); v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); // 4 more AC v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); intptr_t count = non_zero_count - 8; for (; count > 0; count -= 8) { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); // Find the max lane eob for 8 coeffs. 
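// Nonzero lanes contribute iscan + 1 and zero lanes contribute 0, so the
// running maximum (reduced across lanes by get_max_eob after the loop) equals
// the scan index of the last nonzero coefficient plus one, or 0 when the
// block quantizes to all zeros.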
v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); } *eob_ptr = get_max_eob(v_eobmax); } void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); } void aom_highbd_quantize_b_32x32_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } void aom_highbd_quantize_b_64x64_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2); } #if !CONFIG_REALTIME_ONLY static void highbd_quantize_b_adaptive_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale) { (void)scan; const int16x4_t v_quant = vld1_s16(quant_ptr); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); const int16x4_t v_round_no_scale = vld1_s16(round_ptr); const int16x4_t v_round_log_scale = vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); const int16x4_t v_round = vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr); const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr); const int16x4_t v_zbin_log_scale = vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale))); const int16x4_t v_zbin = vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale); int32x4_t v_round_s32 = vmovl_s16(v_round); int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15); int32x4_t v_dequant_s32 = vmovl_s16(v_dequant); int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15); int32x4_t v_zbin_s32 = vmovl_s16(v_zbin); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs); assert(n_coeffs > 8); // Pre-scan pass const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); const int prescan_add_1 = ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS); const int32x4_t v_zbin_prescan = vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1)); intptr_t non_zero_count = n_coeffs; intptr_t i = n_coeffs; do { const int32x4_t v_coeff_a = 
vld1q_s32(coeff_ptr + i - 4); const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8); const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a); const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b); const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan); const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan); // If the coefficient is in the base ZBIN range, then discard. if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) { non_zero_count -= 8; } else { break; } i -= 8; } while (i > 0); const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; memset(qcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*dqcoeff_ptr)); // DC and first 3 AC v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); // overwrite the DC constants with AC constants v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1); v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); // 4 more AC v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs); intptr_t count = non_zero_count - 8; for (; count > 0; count -= 8) { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs); } int eob = get_max_eob(v_eobmax); #if SKIP_EOB_FACTOR_ADJUST const int first = get_min_eob(v_eobmin); if (eob >= 0 && first == eob) { const int rc = scan[eob]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; const qm_val_t wt = (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; eob = -1; } } } #endif // SKIP_EOB_FACTOR_ADJUST *eob_ptr = eob + 1; } void aom_highbd_quantize_b_adaptive_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_adaptive_neon( coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); } void aom_highbd_quantize_b_32x32_adaptive_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const 
int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_adaptive_neon( coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } void aom_highbd_quantize_b_64x64_adaptive_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { highbd_quantize_b_adaptive_neon( coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2); } #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_sad_neon.c000066400000000000000000000404561477627663500200520ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); uint32x4_t sum = vdupq_n_u32(0); int i = h; do { uint16x4_t s = vld1_u16(src16_ptr); uint16x4_t r = vld1_u16(ref16_ptr); sum = vabal_u16(sum, s, r); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); return horizontal_add_u32x4(sum); } static inline uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); sum = vabaq_u16(sum, s, r); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); return horizontal_add_u16x8(sum); } #if !CONFIG_REALTIME_ONLY static inline uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); uint32x4_t sum_u32 = vdupq_n_u32(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); uint16x8_t sum_u16 = vabdq_u16(s, r); sum_u32 = vpadalq_u16(sum_u32, sum_u16); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); return horizontal_add_u32x4(sum_u32); } #endif // !CONFIG_REALTIME_ONLY static inline uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { uint16x8_t s0 = vld1q_u16(src16_ptr); uint16x8_t r0 = vld1q_u16(ref16_ptr); uint16x8_t diff0 = vabdq_u16(s0, r0); sum[0] = vpadalq_u16(sum[0], diff0); uint16x8_t s1 = vld1q_u16(src16_ptr + 8); uint16x8_t r1 = vld1q_u16(ref16_ptr + 8); uint16x8_t diff1 = vabdq_u16(s1, r1); sum[1] = vpadalq_u16(sum[1], diff1); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); sum[0] = vaddq_u32(sum[0], sum[1]); return horizontal_add_u32x4(sum[0]); } static inline uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src16_ptr + j); uint16x8_t r0 = vld1q_u16(ref16_ptr + j); uint16x8_t diff0 = vabdq_u16(s0, r0); sum[0] = vpadalq_u16(sum[0], diff0); uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8); uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8); uint16x8_t diff1 = vabdq_u16(s1, r1); sum[1] = vpadalq_u16(sum[1], diff1); uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16); uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16); uint16x8_t diff2 = vabdq_u16(s2, r2); sum[2] = vpadalq_u16(sum[2], diff2); uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24); uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24); uint16x8_t diff3 = vabdq_u16(s3, r3); sum[3] = vpadalq_u16(sum[3], diff3); j += 32; } while (j < w); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); sum[0] = vaddq_u32(sum[0], sum[1]); sum[2] = vaddq_u32(sum[2], sum[3]); sum[0] = vaddq_u32(sum[0], sum[2]); return horizontal_add_u32x4(sum[0]); } static inline unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h); } static inline unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); } static inline unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); } #define HBD_SAD_WXH_SMALL_NEON(w, h) \ unsigned int aom_highbd_sad##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \ (h)); \ } #define HBD_SAD_WXH_LARGE_NEON(w, h) \ unsigned int aom_highbd_sad##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \ (h)); \ } HBD_SAD_WXH_SMALL_NEON(4, 4) HBD_SAD_WXH_SMALL_NEON(4, 8) HBD_SAD_WXH_SMALL_NEON(8, 4) HBD_SAD_WXH_SMALL_NEON(8, 8) HBD_SAD_WXH_SMALL_NEON(8, 16) HBD_SAD_WXH_LARGE_NEON(16, 8) HBD_SAD_WXH_LARGE_NEON(16, 16) HBD_SAD_WXH_LARGE_NEON(16, 32) HBD_SAD_WXH_LARGE_NEON(32, 16) HBD_SAD_WXH_LARGE_NEON(32, 32) HBD_SAD_WXH_LARGE_NEON(32, 64) HBD_SAD_WXH_LARGE_NEON(64, 32) HBD_SAD_WXH_LARGE_NEON(64, 64) 
HBD_SAD_WXH_LARGE_NEON(64, 128) HBD_SAD_WXH_LARGE_NEON(128, 64) HBD_SAD_WXH_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_WXH_SMALL_NEON(4, 16) HBD_SAD_WXH_LARGE_NEON(8, 32) HBD_SAD_WXH_LARGE_NEON(16, 4) HBD_SAD_WXH_LARGE_NEON(16, 64) HBD_SAD_WXH_LARGE_NEON(32, 8) HBD_SAD_WXH_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h) \ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \ 2 * ref_stride, (h) / 2); \ } #define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h) \ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \ 2 * ref_stride, (h) / 2); \ } HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16) HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16) HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32) HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16) HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32) HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64) HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32) HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64) HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128) HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64) HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16) HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32) HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64) HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); uint32x4_t sum = vdupq_n_u32(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); uint16x8_t p = vld1q_u16(pred16_ptr); uint16x8_t avg = vrhaddq_u16(r, p); uint16x8_t diff = vabdq_u16(s, avg); sum = vpadalq_u16(sum, diff); src16_ptr += src_stride; ref16_ptr += ref_stride; pred16_ptr += 8; } while (--i != 0); return horizontal_add_u32x4(sum); } static inline uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { uint16x8_t s0, s1, r0, r1, p0, p1; uint16x8_t avg0, avg1, diff0, diff1; s0 = vld1q_u16(src16_ptr); r0 = vld1q_u16(ref16_ptr); p0 = vld1q_u16(pred16_ptr); avg0 = vrhaddq_u16(r0, p0); diff0 = vabdq_u16(s0, avg0); sum[0] = vpadalq_u16(sum[0], diff0); s1 = vld1q_u16(src16_ptr + 8); r1 = vld1q_u16(ref16_ptr + 8); p1 = vld1q_u16(pred16_ptr + 8); avg1 = vrhaddq_u16(r1, p1); diff1 = vabdq_u16(s1, avg1); sum[1] = vpadalq_u16(sum[1], diff1); src16_ptr += src_stride; ref16_ptr += ref_stride; pred16_ptr += 16; } while (--i != 0); sum[0] = vaddq_u32(sum[0], sum[1]); return horizontal_add_u32x4(sum[0]); } static inline uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h, const uint8_t *second_pred) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); const uint16_t *pred16_ptr = 
CONVERT_TO_SHORTPTR(second_pred); uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { int j = 0; do { uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; s0 = vld1q_u16(src16_ptr + j); r0 = vld1q_u16(ref16_ptr + j); p0 = vld1q_u16(pred16_ptr + j); avg0 = vrhaddq_u16(r0, p0); diff0 = vabdq_u16(s0, avg0); sum[0] = vpadalq_u16(sum[0], diff0); s1 = vld1q_u16(src16_ptr + j + 8); r1 = vld1q_u16(ref16_ptr + j + 8); p1 = vld1q_u16(pred16_ptr + j + 8); avg1 = vrhaddq_u16(r1, p1); diff1 = vabdq_u16(s1, avg1); sum[1] = vpadalq_u16(sum[1], diff1); s2 = vld1q_u16(src16_ptr + j + 16); r2 = vld1q_u16(ref16_ptr + j + 16); p2 = vld1q_u16(pred16_ptr + j + 16); avg2 = vrhaddq_u16(r2, p2); diff2 = vabdq_u16(s2, avg2); sum[2] = vpadalq_u16(sum[2], diff2); s3 = vld1q_u16(src16_ptr + j + 24); r3 = vld1q_u16(ref16_ptr + j + 24); p3 = vld1q_u16(pred16_ptr + j + 24); avg3 = vrhaddq_u16(r3, p3); diff3 = vabdq_u16(s3, avg3); sum[3] = vpadalq_u16(sum[3], diff3); j += 32; } while (j < w); src16_ptr += src_stride; ref16_ptr += ref_stride; pred16_ptr += w; } while (--i != 0); sum[0] = vaddq_u32(sum[0], sum[1]); sum[2] = vaddq_u32(sum[2], sum[3]); sum[0] = vaddq_u32(sum[0], sum[2]); return horizontal_add_u32x4(sum[0]); } static inline unsigned int highbd_sad128xh_avg_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, h, second_pred); } static inline unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, second_pred); } static inline unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, second_pred); } #define HBD_SAD_WXH_AVG_NEON(w, h) \ uint32_t aom_highbd_sad##w##x##h##_avg_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ second_pred); \ } HBD_SAD_WXH_AVG_NEON(8, 8) HBD_SAD_WXH_AVG_NEON(8, 16) HBD_SAD_WXH_AVG_NEON(16, 8) HBD_SAD_WXH_AVG_NEON(16, 16) HBD_SAD_WXH_AVG_NEON(16, 32) HBD_SAD_WXH_AVG_NEON(32, 16) HBD_SAD_WXH_AVG_NEON(32, 32) HBD_SAD_WXH_AVG_NEON(32, 64) HBD_SAD_WXH_AVG_NEON(64, 32) HBD_SAD_WXH_AVG_NEON(64, 64) HBD_SAD_WXH_AVG_NEON(64, 128) HBD_SAD_WXH_AVG_NEON(128, 64) HBD_SAD_WXH_AVG_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_WXH_AVG_NEON(8, 32) HBD_SAD_WXH_AVG_NEON(16, 64) HBD_SAD_WXH_AVG_NEON(32, 8) HBD_SAD_WXH_AVG_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_sadxd_neon.c000066400000000000000000000622651477627663500204100ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = 0; do { uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); sum[0] = vabal_u16(sum[0], s, r0); sum[1] = vabal_u16(sum[1], s, r1); sum[2] = vabal_u16(sum[2], s, r2); sum[3] = vabal_u16(sum[3], s, r3); } while (++i < h); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } static inline void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint32x4_t sum_u32[4]; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); } while (++i < h); sum_u32[0] = vpaddlq_u16(sum[0]); sum_u32[1] = vpaddlq_u16(sum[1]); sum_u32[2] = vpaddlq_u16(sum[2]); sum_u32[3] = vpaddlq_u16(sum[3]); vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); } static inline void sad8_neon(uint16x8_t src, uint16x8_t ref, uint32x4_t *const sad_sum) { uint16x8_t abs_diff = vabdq_u16(src, ref); *sad_sum = vpadalq_u16(*sad_sum, abs_diff); } #if !CONFIG_REALTIME_ONLY static inline void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); 
sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); } while (++i < h); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } #endif // !CONFIG_REALTIME_ONLY static inline void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum[4]; int i = 0; do { uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride); sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8); sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); } while (++i < h); sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } static inline void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum[4]; int i = 0; do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j); sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), &sum_lo[0]); sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), &sum_lo[1]); sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), &sum_lo[2]); sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), &sum_lo[3]); uint16x8_t s3 = 
vld1q_u16(src16_ptr + i * src_stride + j + 24); sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), &sum_hi[0]); sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), &sum_hi[1]); sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), &sum_hi[2]); sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), &sum_hi[3]); j += 32; } while (j < w); } while (++i < h); sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } static inline void highbd_sad128xhx4d_large_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 128, h); } static inline void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); } static inline void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); } #define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \ void aom_highbd_sad##w##x##h##x4d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \ sad_array, (h)); \ } #define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \ void aom_highbd_sad##w##x##h##x4d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \ sad_array, (h)); \ } HBD_SAD_WXH_4D_SMALL_NEON(4, 4) HBD_SAD_WXH_4D_SMALL_NEON(4, 8) HBD_SAD_WXH_4D_SMALL_NEON(8, 4) HBD_SAD_WXH_4D_SMALL_NEON(8, 8) HBD_SAD_WXH_4D_SMALL_NEON(8, 16) HBD_SAD_WXH_4D_LARGE_NEON(16, 8) HBD_SAD_WXH_4D_LARGE_NEON(16, 16) HBD_SAD_WXH_4D_LARGE_NEON(16, 32) HBD_SAD_WXH_4D_LARGE_NEON(32, 16) HBD_SAD_WXH_4D_LARGE_NEON(32, 32) HBD_SAD_WXH_4D_LARGE_NEON(32, 64) HBD_SAD_WXH_4D_LARGE_NEON(64, 32) HBD_SAD_WXH_4D_LARGE_NEON(64, 64) HBD_SAD_WXH_4D_LARGE_NEON(64, 128) HBD_SAD_WXH_4D_LARGE_NEON(128, 64) HBD_SAD_WXH_4D_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_WXH_4D_SMALL_NEON(4, 16) HBD_SAD_WXH_4D_LARGE_NEON(8, 32) HBD_SAD_WXH_4D_LARGE_NEON(16, 4) HBD_SAD_WXH_4D_LARGE_NEON(16, 64) HBD_SAD_WXH_4D_LARGE_NEON(32, 8) HBD_SAD_WXH_4D_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \ 2 * ref_stride, sad_array, ((h) >> 1)); \ sad_array[0] <<= 1; \ sad_array[1] <<= 1; \ sad_array[2] <<= 1; \ sad_array[3] <<= 1; \ } #define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \ 2 * ref_stride, sad_array, ((h) >> 1)); \ sad_array[0] <<= 1; \ sad_array[1] <<= 1; \ sad_array[2] <<= 1; \ sad_array[3] 
<<= 1; \ } HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16) HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64) HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = 0; do { uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); sum[0] = vabal_u16(sum[0], s, r0); sum[1] = vabal_u16(sum[1], s, r1); sum[2] = vabal_u16(sum[2], s, r2); } while (++i < h); res[0] = horizontal_add_u32x4(sum[0]); res[1] = horizontal_add_u32x4(sum[1]); res[2] = horizontal_add_u32x4(sum[2]); } static inline void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); } while (++i < h); res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0])); res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1])); res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2])); } #if !CONFIG_REALTIME_ONLY static inline void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride); uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride); uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride); sad8_neon(s, r0, &sum[0]); sad8_neon(s, r1, &sum[1]); sad8_neon(s, r2, &sum[2]); } while (++i < h); res[0] = horizontal_add_u32x4(sum[0]); res[1] = horizontal_add_u32x4(sum[1]); res[2] = horizontal_add_u32x4(sum[2]); } #endif // !CONFIG_REALTIME_ONLY static inline 
void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int i = 0; do { uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride); sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8); sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); } while (++i < h); res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0])); res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1])); res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); } static inline void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum[3]; int i = 0; do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j); sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), &sum_lo[0]); sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), &sum_lo[1]); sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), &sum_lo[2]); uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), &sum_hi[0]); sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), &sum_hi[1]); sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), &sum_hi[2]); j += 32; } while (j < w); } while (++i < h); sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); res[0] = horizontal_add_u32x4(sum[0]); res[1] = horizontal_add_u32x4(sum[1]); res[2] = horizontal_add_u32x4(sum[2]); } static inline void highbd_sad128xhx3d_large_neon( const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx3d_large_neon(src_ptr, 
src_stride, ref_ptr, ref_stride, res, 128, h); } static inline void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); } static inline void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], int h) { highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); } #define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \ void aom_highbd_sad##w##x##h##x3d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \ sad_array, (h)); \ } #define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \ void aom_highbd_sad##w##x##h##x3d_neon( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \ sad_array, (h)); \ } HBD_SAD_WXH_3D_SMALL_NEON(4, 4) HBD_SAD_WXH_3D_SMALL_NEON(4, 8) HBD_SAD_WXH_3D_SMALL_NEON(8, 4) HBD_SAD_WXH_3D_SMALL_NEON(8, 8) HBD_SAD_WXH_3D_SMALL_NEON(8, 16) HBD_SAD_WXH_3D_LARGE_NEON(16, 8) HBD_SAD_WXH_3D_LARGE_NEON(16, 16) HBD_SAD_WXH_3D_LARGE_NEON(16, 32) HBD_SAD_WXH_3D_LARGE_NEON(32, 16) HBD_SAD_WXH_3D_LARGE_NEON(32, 32) HBD_SAD_WXH_3D_LARGE_NEON(32, 64) HBD_SAD_WXH_3D_LARGE_NEON(64, 32) HBD_SAD_WXH_3D_LARGE_NEON(64, 64) HBD_SAD_WXH_3D_LARGE_NEON(64, 128) HBD_SAD_WXH_3D_LARGE_NEON(128, 64) HBD_SAD_WXH_3D_LARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY HBD_SAD_WXH_3D_SMALL_NEON(4, 16) HBD_SAD_WXH_3D_LARGE_NEON(8, 32) HBD_SAD_WXH_3D_LARGE_NEON(16, 4) HBD_SAD_WXH_3D_LARGE_NEON(16, 64) HBD_SAD_WXH_3D_LARGE_NEON(32, 8) HBD_SAD_WXH_3D_LARGE_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_sse_neon.c000066400000000000000000000253351477627663500200740ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/sum_neon.h" static inline void highbd_sse_8x1_init_neon(const uint16_t *src, const uint16_t *ref, uint32x4_t *sse_acc0, uint32x4_t *sse_acc1) { uint16x8_t s = vld1q_u16(src); uint16x8_t r = vld1q_u16(ref); uint16x8_t abs_diff = vabdq_u16(s, r); uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); } static inline void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, uint32x4_t *sse_acc0, uint32x4_t *sse_acc1) { uint16x8_t s = vld1q_u16(src); uint16x8_t r = vld1q_u16(ref); uint16x8_t abs_diff = vabdq_u16(s, r); uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); } static inline int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[16]; highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); src += src_stride; ref += ref_stride; while (--height != 0) { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4_x16(sse); } static inline int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int 
height) { uint32x4_t sse[8]; highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); src += src_stride; ref += ref_stride; while (--height != 0) { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4_x8(sse); } static inline int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[8]; highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); src += src_stride; ref += ref_stride; while (--height != 0) { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4_x8(sse); } static inline int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[4]; highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); src += src_stride; ref += ref_stride; while (--height != 0) { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4_x4(sse); } static inline int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint32x4_t sse[2]; highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); src += src_stride; ref += ref_stride; while (--height != 0) { highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4_x2(sse); } static inline int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { // Peel the first loop iteration. 
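  // Starting the accumulator with vmull_u16 on the peeled row avoids zeroing
  // it and issuing an extra vmlal_u16 for the first iteration.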
uint16x4_t s = vld1_u16(src); uint16x4_t r = vld1_u16(ref); uint16x4_t abs_diff = vabd_u16(s, r); uint32x4_t sse = vmull_u16(abs_diff, abs_diff); src += src_stride; ref += ref_stride; while (--height != 0) { s = vld1_u16(src); r = vld1_u16(ref); abs_diff = vabd_u16(s, r); sse = vmlal_u16(sse, abs_diff, abs_diff); src += src_stride; ref += ref_stride; } return horizontal_long_add_u32x4(sse); } static inline int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int width, int height) { // { 0, 1, 2, 3, 4, 5, 6, 7 } uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); uint64_t sse = 0; do { int w = width; int offset = 0; do { uint16x8_t s = vld1q_u16(src + offset); uint16x8_t r = vld1q_u16(ref + offset); if (w < 8) { // Mask out-of-range elements. s = vandq_u16(s, remainder_mask); r = vandq_u16(r, remainder_mask); } uint16x8_t abs_diff = vabdq_u16(s, r); uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); sse += horizontal_long_add_u32x4(sse_u32); offset += 8; w -= 8; } while (w > 0); src += src_stride; ref += ref_stride; } while (--height != 0); return sse; } int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, int width, int height) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); switch (width) { case 4: return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); case 8: return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); case 16: return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); case 32: return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); case 128: return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); } } aom-3.12.1/aom_dsp/arm/highbd_sse_sve.c000066400000000000000000000172201477627663500177240ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "config/aom_dsp_rtcd.h" static inline void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, uint64x2_t *sse) { uint16x8_t s = vld1q_u16(src); uint16x8_t r = vld1q_u16(ref); uint16x8_t abs_diff = vabdq_u16(s, r); *sse = aom_udotq_u16(*sse, abs_diff, abs_diff); } static inline int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0) }; do { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]); highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]); highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]); highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]); highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]); highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]); highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]); highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[3]); src += src_stride; ref += ref_stride; } while (--height != 0); sse[0] = vaddq_u64(sse[0], sse[1]); sse[2] = vaddq_u64(sse[2], sse[3]); sse[0] = vaddq_u64(sse[0], sse[2]); return vaddvq_u64(sse[0]); } static inline int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0) }; do { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); src += src_stride; ref += ref_stride; } while (--height != 0); sse[0] = vaddq_u64(sse[0], sse[1]); sse[2] = vaddq_u64(sse[2], sse[3]); sse[0] = vaddq_u64(sse[0], sse[2]); return vaddvq_u64(sse[0]); } static inline int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0) }; do { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); src += src_stride; ref += ref_stride; } while (--height != 0); sse[0] = vaddq_u64(sse[0], sse[1]); sse[2] = vaddq_u64(sse[2], sse[3]); sse[0] = vaddq_u64(sse[0], sse[2]); return vaddvq_u64(sse[0]); } static inline int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; do { highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); src += src_stride; ref += 
ref_stride; } while (--height != 0); return vaddvq_u64(vaddq_u64(sse[0], sse[1])); } static inline int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; do { highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]); highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]); src += 2 * src_stride; ref += 2 * ref_stride; height -= 2; } while (height != 0); return vaddvq_u64(vaddq_u64(sse[0], sse[1])); } static inline int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int height) { uint64x2_t sse = vdupq_n_u64(0); do { uint16x8_t s = load_unaligned_u16_4x2(src, src_stride); uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride); uint16x8_t abs_diff = vabdq_u16(s, r); sse = aom_udotq_u16(sse, abs_diff, abs_diff); src += 2 * src_stride; ref += 2 * ref_stride; height -= 2; } while (height != 0); return vaddvq_u64(sse); } static inline int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int width, int height) { svuint64_t sse = svdup_n_u64(0); uint64_t step = svcnth(); do { int w = 0; const uint16_t *src_ptr = src; const uint16_t *ref_ptr = ref; do { svbool_t pred = svwhilelt_b16_u32(w, width); svuint16_t s = svld1_u16(pred, src_ptr); svuint16_t r = svld1_u16(pred, ref_ptr); svuint16_t abs_diff = svabd_u16_z(pred, s, r); sse = svdot_u64(sse, abs_diff, abs_diff); src_ptr += step; ref_ptr += step; w += step; } while (w < width); src += src_stride; ref += ref_stride; } while (--height != 0); return svaddv_u64(svptrue_b64(), sse); } int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, int width, int height) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); switch (width) { case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height); case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height); case 16: return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height); case 32: return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height); case 64: return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height); case 128: return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height); default: return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width, height); } } aom-3.12.1/aom_dsp/arm/highbd_subpel_variance_neon.c000066400000000000000000001600361477627663500224420ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/variance.h" // The bilinear filters look like this: // // {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, // { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} // // We can factor out the highest common multiple, such that the sum of both // weights will be 8 instead of 128. The benefits of this are two-fold: // // 1) We can infer the filter values from the filter_offset parameter in the // bilinear filter functions below - we don't have to actually load the values // from memory: // f0 = 8 - filter_offset // f1 = filter_offset // // 2) Scaling the pixel values by 8, instead of 128 enables us to operate on // 16-bit data types at all times, rather than widening out to 32-bit and // requiring double the number of data processing instructions. (12-bit * 8 = // 15-bit.) // Process a block exactly 4 wide and any height. static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr); uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step); uint16x4_t blend = vmul_u16(s0, f0); blend = vmla_u16(blend, s1, f1); blend = vrshr_n_u16(blend, 3); vst1_u16(dst_ptr, blend); src_ptr += src_stride; dst_ptr += 4; } while (--i != 0); } // Process a block which is a multiple of 8 and any height. static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, int filter_offset) { const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); const uint16x8_t f1 = vdupq_n_u16(filter_offset); int i = dst_height; do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src_ptr + j); uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); uint16x8_t blend = vmulq_u16(s0, f0); blend = vmlaq_u16(blend, s1, f1); blend = vrshrq_n_u16(blend, 3); vst1q_u16(dst_ptr + j, blend); j += 8; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset); } static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset); } static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset); } static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset); } static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { 
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset); } static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height) { int i = dst_height; // We only specialize on the filter values for large block sizes (>= 16x16.) assert(dst_width >= 16 && dst_width % 16 == 0); do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src_ptr + j); uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); uint16x8_t avg = vrhaddq_u16(s0, s1); vst1q_u16(dst_ptr + j, avg); j += 8; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } #define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse) { \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ \ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse) { \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ if (xoffset == 0) { \ if (yoffset == 0) { \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ h); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ src_stride, h, yoffset); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ 
return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ } // 8-bit HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 10-bit HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 12-bit HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) 
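// Illustrative scalar sketch (hypothetical helper, not part of the library
// and not called anywhere): this is the arithmetic that the
// highbd_var_filter_block2d_bil_* helpers above vectorize. Each pass blends
// two pixels that are pixel_step apart with weights (8 - offset) and offset,
// then applies a rounding shift by 3, mirroring vmul/vmla + vrshr_n_u16(_, 3).
static inline void highbd_var_filter_block2d_bil_scalar_sketch(
    const uint16_t *src, uint16_t *dst, int src_stride, int pixel_step,
    int width, int height, int offset) {
  const int f0 = 8 - offset;
  const int f1 = offset;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int blend = f0 * src[i * src_stride + j] +
                        f1 * src[i * src_stride + j + pixel_step];
      // Rounding shift by 3: matches vrshr_n_u16(blend, 3).
      dst[i * width + j] = (uint16_t)((blend + 4) >> 3);
    }
  }
}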
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) #endif // !CONFIG_REALTIME_ONLY // Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having // width 4. static void highbd_avg_pred_var_filter_block2d_bil_w4( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); const uint16x4_t f1 = vdup_n_u16(filter_offset); int i = dst_height; do { uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr); uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step); uint16x4_t p = vld1_u16(second_pred); uint16x4_t blend = vmul_u16(s0, f0); blend = vmla_u16(blend, s1, f1); blend = vrshr_n_u16(blend, 3); vst1_u16(dst_ptr, vrhadd_u16(blend, p)); src_ptr += src_stride; dst_ptr += 4; second_pred += 4; } while (--i != 0); } // Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks. static void highbd_avg_pred_var_filter_block2d_bil_large( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, int filter_offset, const uint16_t *second_pred) { const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); const uint16x8_t f1 = vdupq_n_u16(filter_offset); int i = dst_height; do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src_ptr + j); uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); uint16x8_t p = vld1q_u16(second_pred); uint16x8_t blend = vmulq_u16(s0, f0); blend = vmlaq_u16(blend, s1, f1); blend = vrshrq_n_u16(blend, 3); vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); j += 8; second_pred += 8; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } static void highbd_avg_pred_var_filter_block2d_bil_w8( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset, second_pred); } static void highbd_avg_pred_var_filter_block2d_bil_w16( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset, second_pred); } static void highbd_avg_pred_var_filter_block2d_bil_w32( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset, second_pred); } static void highbd_avg_pred_var_filter_block2d_bil_w64( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset, second_pred); } static void highbd_avg_pred_var_filter_block2d_bil_w128( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint16_t *second_pred) { highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset, second_pred); 
} // Combine averaging subpel filter with aom_highbd_comp_avg_pred. static void highbd_avg_pred_var_filter_block2d_avg( const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, const uint16_t *second_pred) { int i = dst_height; // We only specialize on the filter values for large block sizes (>= 16x16.) assert(dst_width >= 16 && dst_width % 16 == 0); do { int j = 0; do { uint16x8_t s0 = vld1q_u16(src_ptr + j); uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); uint16x8_t avg = vrhaddq_u16(s0, s1); uint16x8_t p = vld1q_u16(second_pred); avg = vrhaddq_u16(avg, p); vst1q_u16(dst_ptr + j, avg); j += 8; second_pred += 8; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } // Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16. static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int dst_width, int dst_height, const uint16_t *second_pred) { int i = dst_height; // We only specialize on the filter values for large block sizes (>= 16x16.) assert(dst_width >= 16 && dst_width % 16 == 0); do { int j = 0; do { uint16x8_t s = vld1q_u16(src_ptr + j); uint16x8_t p = vld1q_u16(second_pred); uint16x8_t avg = vrhaddq_u16(s, p); vst1q_u16(dst_ptr + j, avg); j += 8; second_pred += 8; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } #define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ \ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ \ if (xoffset == 0) { \ uint16_t tmp[w * h]; \ if (yoffset == 0) { \ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp, source_stride, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp, source_stride, source_stride, h, yoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, 
ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ } // 8-bit HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 10-bit HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) 
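// Illustrative scalar sketch (hypothetical helper, not part of the library):
// the *_avg_variance paths above differ from the plain sub-pixel paths only
// in that the filtered prediction is combined with second_pred using a
// rounded halving add before the variance is measured, mirroring
// vrhaddq_u16. Strides are omitted here for brevity.
static inline void highbd_avg_pred_scalar_sketch(const uint16_t *pred,
                                                 const uint16_t *second_pred,
                                                 uint16_t *dst, int width,
                                                 int height) {
  for (int i = 0; i < width * height; ++i) {
    // Rounded average: (a + b + 1) >> 1, as computed by vrhadd.
    dst[i] = (uint16_t)((pred[i] + second_pred[i] + 1) >> 1);
  }
}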
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 12-bit HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16) HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8) HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) #endif // !CONFIG_REALTIME_ONLY #define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int \ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * (h + 1)]; \ uint16_t tmp2[w * h]; \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \ w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int \ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ if (xoffset == 0) { \ uint16_t tmp0[w * h]; \ if (yoffset == 0) { \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \ w, h, src, src_stride, msk, msk_stride, \ invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 
src_stride, \ w, h); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \ src_stride, h, yoffset); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ uint16_t tmp2[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ uint16_t tmp2[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } else { \ if (yoffset == 0) { \ uint16_t tmp0[w * h]; \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ uint16_t tmp2[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * (h + 1)]; \ uint16_t tmp2[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } \ } // 8-bit HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) 
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 10-bit HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) #endif // !CONFIG_REALTIME_ONLY // 12-bit HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) #if !CONFIG_REALTIME_ONLY HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) 
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) #endif // !CONFIG_REALTIME_ONLY #if !CONFIG_REALTIME_ONLY #define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int \ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \ uint16_t tmp0[w * (h + 1)]; \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \ xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ } #define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ unsigned int \ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \ if (xoffset == 0) { \ if (yoffset == 0) { \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ pre, pre_stride, wsrc, mask, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \ h); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \ pre_stride, h, yoffset); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ } \ } else { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \ xoffset); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \ h + 1, xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \ h + 1, xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ CONVERT_TO_BYTEPTR(tmp1), 
w, wsrc, mask, sse); \ } \ } \ } // 8-bit HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) // 10-bit HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) // 12-bit HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) 
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/arm/highbd_variance_neon.c000066400000000000000000000467441477627663500211010ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/variance.h" // Process a block of width 4 two rows at a time. static inline void highbd_variance_4xh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, uint64_t *sse, int64_t *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_s32 = vdupq_n_s32(0); int i = h; do { const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride); const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); sum_s16 = vaddq_s16(sum_s16, diff); sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); src_ptr += 2 * src_stride; ref_ptr += 2 * ref_stride; i -= 2; } while (i != 0); *sum = horizontal_add_s16x8(sum_s16); *sse = horizontal_add_s32x4(sse_s32); } // For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all // block sizes can be processed in 32-bit elements (1023*1023*128*32 = // 4286582784 for a 128x128 block). 
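/* An illustrative, self-contained sketch (not part of libaom) that spells out
 * the overflow bound quoted in the comment above. The helper name and the
 * run-time assert are assumptions for demonstration only; the vector kernels
 * below rely on this arithmetic implicitly rather than checking it. Assumes
 * <stdint.h> and <assert.h> are available. */
static inline void check_highbd_sse_accumulator_bound(void) {
  /* Largest per-element squared difference for 10-bit input: 1023 * 1023. */
  const uint64_t max_sq_diff = 1023ULL * 1023ULL;  /* 1046529 */
  /* Worst case quoted above for a 128x128 block: 1023*1023*128*32. */
  const uint64_t worst_case = max_sq_diff * 128ULL * 32ULL;  /* 4286582784 */
  assert(worst_case <= UINT32_MAX);  /* still fits an unsigned 32-bit lane */
}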
static inline void highbd_variance_large_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32 = vdupq_n_s32(0); int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int i = h; do { int j = 0; do { const uint16x8_t s = vld1q_u16(src_ptr + j); const uint16x8_t r = vld1q_u16(ref_ptr + j); const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); sum_s32 = vpadalq_s16(sum_s32, diff); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); j += 8; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); *sum = horizontal_add_s32x4(sum_s32); *sse = horizontal_long_add_u32x4(vaddq_u32( vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); } static inline void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); } static inline void highbd_variance_16xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } static inline void highbd_variance_32xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); } static inline void highbd_variance_64xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); } static inline void highbd_variance_128xh_neon(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse, sum); } // For 12-bit data, we can only accumulate up to 128 elements in the sum of // squares (4095*4095*128 = 2146435200), and because we're using two int32x4 // accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) // or 16 64-element rows before we have to accumulate into 64-bit elements. // Therefore blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are // processed in a different helper function. // Process a block of any size where the width is divisible by 8, with // accumulation into 64-bit elements. static inline void highbd_variance_xlarge_neon( const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32 = vdupq_n_s32(0); int64x2_t sse_s64 = vdupq_n_s64(0); // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit // accumulator overflows. After hitting this limit we accumulate into 64-bit // elements. int h_tmp = h > h_limit ? 
h_limit : h; int i = 0; do { int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { int j = 0; do { const uint16x8_t s0 = vld1q_u16(src_ptr + j); const uint16x8_t r0 = vld1q_u16(ref_ptr + j); const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); sum_s32 = vpadalq_s16(sum_s32, diff); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); j += 8; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; i++; } while (i < h_tmp); sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); h_tmp += h_limit; } while (i < h); *sum = horizontal_add_s32x4(sum_s32); *sse = (uint64_t)horizontal_add_s64x2(sse_s64); } static inline void highbd_variance_32xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, sum); } static inline void highbd_variance_64xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, sum); } static inline void highbd_variance_128xh_xlarge_neon( const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 128, h, 8, sse, sum); } #define HBD_VARIANCE_WXH_8_NEON(w, h) \ uint32_t aom_highbd_8_variance##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)sse_long; \ sum = (int)sum_long; \ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ } #define HBD_VARIANCE_WXH_10_NEON(w, h) \ uint32_t aom_highbd_10_variance##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ return (var >= 0) ? (uint32_t)var : 0; \ } #define HBD_VARIANCE_WXH_12_NEON(w, h) \ uint32_t aom_highbd_12_variance##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } #define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ uint32_t aom_highbd_12_variance##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ return (var >= 0) ? (uint32_t)var : 0; \ } // 8-bit HBD_VARIANCE_WXH_8_NEON(4, 4) HBD_VARIANCE_WXH_8_NEON(4, 8) HBD_VARIANCE_WXH_8_NEON(8, 4) HBD_VARIANCE_WXH_8_NEON(8, 8) HBD_VARIANCE_WXH_8_NEON(8, 16) HBD_VARIANCE_WXH_8_NEON(16, 8) HBD_VARIANCE_WXH_8_NEON(16, 16) HBD_VARIANCE_WXH_8_NEON(16, 32) HBD_VARIANCE_WXH_8_NEON(32, 16) HBD_VARIANCE_WXH_8_NEON(32, 32) HBD_VARIANCE_WXH_8_NEON(32, 64) HBD_VARIANCE_WXH_8_NEON(64, 32) HBD_VARIANCE_WXH_8_NEON(64, 64) HBD_VARIANCE_WXH_8_NEON(64, 128) HBD_VARIANCE_WXH_8_NEON(128, 64) HBD_VARIANCE_WXH_8_NEON(128, 128) // 10-bit HBD_VARIANCE_WXH_10_NEON(4, 4) HBD_VARIANCE_WXH_10_NEON(4, 8) HBD_VARIANCE_WXH_10_NEON(8, 4) HBD_VARIANCE_WXH_10_NEON(8, 8) HBD_VARIANCE_WXH_10_NEON(8, 16) HBD_VARIANCE_WXH_10_NEON(16, 8) HBD_VARIANCE_WXH_10_NEON(16, 16) HBD_VARIANCE_WXH_10_NEON(16, 32) HBD_VARIANCE_WXH_10_NEON(32, 16) HBD_VARIANCE_WXH_10_NEON(32, 32) HBD_VARIANCE_WXH_10_NEON(32, 64) HBD_VARIANCE_WXH_10_NEON(64, 32) HBD_VARIANCE_WXH_10_NEON(64, 64) HBD_VARIANCE_WXH_10_NEON(64, 128) HBD_VARIANCE_WXH_10_NEON(128, 64) HBD_VARIANCE_WXH_10_NEON(128, 128) // 12-bit HBD_VARIANCE_WXH_12_NEON(4, 4) HBD_VARIANCE_WXH_12_NEON(4, 8) HBD_VARIANCE_WXH_12_NEON(8, 4) HBD_VARIANCE_WXH_12_NEON(8, 8) HBD_VARIANCE_WXH_12_NEON(8, 16) HBD_VARIANCE_WXH_12_NEON(16, 8) HBD_VARIANCE_WXH_12_NEON(16, 16) HBD_VARIANCE_WXH_12_NEON(16, 32) HBD_VARIANCE_WXH_12_NEON(32, 16) HBD_VARIANCE_WXH_12_NEON(32, 32) HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 128) HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 64) HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 128) #if !CONFIG_REALTIME_ONLY // 8-bit HBD_VARIANCE_WXH_8_NEON(4, 16) HBD_VARIANCE_WXH_8_NEON(8, 32) HBD_VARIANCE_WXH_8_NEON(16, 4) HBD_VARIANCE_WXH_8_NEON(16, 64) HBD_VARIANCE_WXH_8_NEON(32, 8) HBD_VARIANCE_WXH_8_NEON(64, 16) // 10-bit HBD_VARIANCE_WXH_10_NEON(4, 16) HBD_VARIANCE_WXH_10_NEON(8, 32) HBD_VARIANCE_WXH_10_NEON(16, 4) HBD_VARIANCE_WXH_10_NEON(16, 64) HBD_VARIANCE_WXH_10_NEON(32, 8) HBD_VARIANCE_WXH_10_NEON(64, 16) // 12-bit HBD_VARIANCE_WXH_12_NEON(4, 16) HBD_VARIANCE_WXH_12_NEON(8, 32) HBD_VARIANCE_WXH_12_NEON(16, 4) HBD_VARIANCE_WXH_12_NEON(16, 64) HBD_VARIANCE_WXH_12_NEON(32, 8) HBD_VARIANCE_WXH_12_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, unsigned int *sse) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { int j = 0; do { uint16x8_t s = vld1q_u16(src_ptr + j); uint16x8_t r = vld1q_u16(ref_ptr + j); uint16x8_t diff = vabdq_u16(s, r); sse_u32[0] = vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); sse_u32[1] = vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); j += 8; } while (j < w); src_ptr += src_stride; ref_ptr 
+= ref_stride; } while (--i != 0); *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); return *sse; } #define HIGHBD_MSE_WXH_NEON(w, h) \ uint32_t aom_highbd_8_mse##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ return *sse; \ } \ \ uint32_t aom_highbd_10_mse##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ *sse = ROUND_POWER_OF_TWO(*sse, 4); \ return *sse; \ } \ \ uint32_t aom_highbd_12_mse##w##x##h##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ *sse = ROUND_POWER_OF_TWO(*sse, 8); \ return *sse; \ } HIGHBD_MSE_WXH_NEON(16, 16) HIGHBD_MSE_WXH_NEON(16, 8) HIGHBD_MSE_WXH_NEON(8, 16) HIGHBD_MSE_WXH_NEON(8, 8) #undef HIGHBD_MSE_WXH_NEON static inline uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0, uint16x8_t s1, uint16x8_t d0, uint16x8_t d1) { uint16x8_t e0 = vabdq_u16(s0, d0); uint16x8_t e1 = vabdq_u16(s1, d1); uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0)); mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0)); mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1)); mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1)); return vpadalq_u32(sum, mse); } uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4)); uint64x2_t sum = vdupq_n_u64(0); if (w == 8) { do { uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); uint16x8_t s0 = vld1q_u16(src + 0 * sstride); uint16x8_t s1 = vld1q_u16(src + 1 * sstride); sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1); dst += 2 * dstride; src += 2 * sstride; h -= 2; } while (h != 0); } else { // w == 4 do { uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride); uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1); dst += 4 * dstride; src += 4 * sstride; h -= 4; } while (h != 0); } return horizontal_add_u64x2(sum); } aom-3.12.1/aom_dsp/arm/highbd_variance_neon_dotprod.c000066400000000000000000000067111477627663500226220ustar00rootroot00000000000000/* * Copyright (c) 2023 The WebM project authors. All rights reserved. * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom_dsp/arm/sum_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, unsigned int *sse) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h / 2; do { uint16x8_t s0 = vld1q_u16(src_ptr); src_ptr += src_stride; uint16x8_t s1 = vld1q_u16(src_ptr); src_ptr += src_stride; uint16x8_t r0 = vld1q_u16(ref_ptr); ref_ptr += ref_stride; uint16x8_t r1 = vld1q_u16(ref_ptr); ref_ptr += ref_stride; uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); uint8x16_t diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, diff, diff); } while (--i != 0); *sse = horizontal_add_u32x4(sse_u32); return *sse; } static inline uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, unsigned int *sse) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + 8); uint16x8_t r0 = vld1q_u16(ref_ptr); uint16x8_t r1 = vld1q_u16(ref_ptr + 8); uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); uint8x16_t diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, diff, diff); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); *sse = horizontal_add_u32x4(sse_u32); return *sse; } #define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \ sse); \ return *sse; \ } HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) #undef HIGHBD_MSE_WXH_NEON_DOTPROD aom-3.12.1/aom_dsp/arm/highbd_variance_sve.c000066400000000000000000000361061477627663500207260ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/variance.h" // Process a block of width 4 two rows at a time. 
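/* A rough scalar equivalent (illustrative only, not a libaom API) of the
 * two-row, width-4 accumulation performed by the SVE helper below. The
 * function name and parameters here are assumptions made for this sketch;
 * the real kernel keeps the running sum in an int16x8_t and uses
 * aom_sdotq_s16 to accumulate the squared differences directly into 64-bit
 * lanes. Assumes <stdint.h> is available. */
static inline void scalar_variance_4x2(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       int64_t *sum, uint64_t *sse) {
  for (int r = 0; r < 2; ++r) {
    for (int c = 0; c < 4; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      *sum += d;                            /* signed sum of differences */
      *sse += (uint64_t)(d * (int64_t)d);   /* sum of squared differences */
    }
  }
}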
static inline void highbd_variance_4xh_sve(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, uint64_t *sse, int64_t *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); int64x2_t sse_s64 = vdupq_n_s64(0); do { const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride); const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); sum_s16 = vaddq_s16(sum_s16, diff); sse_s64 = aom_sdotq_s16(sse_s64, diff, diff); src_ptr += 2 * src_stride; ref_ptr += 2 * ref_stride; h -= 2; } while (h != 0); *sum = vaddlvq_s16(sum_s16); *sse = vaddvq_s64(sse_s64); } static inline void variance_8x1_sve(const uint16_t *src, const uint16_t *ref, int32x4_t *sum, int64x2_t *sse) { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); *sum = vpadalq_s16(*sum, diff); *sse = aom_sdotq_s16(*sse, diff, diff); } static inline void highbd_variance_8xh_sve(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32 = vdupq_n_s32(0); int64x2_t sse_s64 = vdupq_n_s64(0); do { variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64); src_ptr += src_stride; ref_ptr += ref_stride; } while (--h != 0); *sum = vaddlvq_s32(sum_s32); *sse = vaddvq_s64(sse_s64); } static inline void highbd_variance_16xh_sve(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int h, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]); variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]); src_ptr += src_stride; ref_ptr += ref_stride; } while (--h != 0); *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1])); *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); } static inline void highbd_variance_large_sve(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, uint64_t *sse, int64_t *sum) { int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) }; int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; do { int j = 0; do { variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]); variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1], &sse_s64[1]); variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2], &sse_s64[2]); variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3], &sse_s64[3]); j += 32; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; } while (--h != 0); sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[2])); sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2])); } static inline void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); } static inline void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); } static inline 
void highbd_variance_128xh_sve(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int h, uint64_t *sse, int64_t *sum) { highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum); } #define HBD_VARIANCE_WXH_8_SVE(w, h) \ uint32_t aom_highbd_8_variance##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)sse_long; \ sum = (int)sum_long; \ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ } #define HBD_VARIANCE_WXH_10_SVE(w, h) \ uint32_t aom_highbd_10_variance##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ return (var >= 0) ? (uint32_t)var : 0; \ } #define HBD_VARIANCE_WXH_12_SVE(w, h) \ uint32_t aom_highbd_12_variance##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint64_t sse_long = 0; \ int64_t sum_long = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ &sse_long, &sum_long); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } // 8-bit HBD_VARIANCE_WXH_8_SVE(4, 4) HBD_VARIANCE_WXH_8_SVE(4, 8) HBD_VARIANCE_WXH_8_SVE(8, 4) HBD_VARIANCE_WXH_8_SVE(8, 8) HBD_VARIANCE_WXH_8_SVE(8, 16) HBD_VARIANCE_WXH_8_SVE(16, 8) HBD_VARIANCE_WXH_8_SVE(16, 16) HBD_VARIANCE_WXH_8_SVE(16, 32) HBD_VARIANCE_WXH_8_SVE(32, 16) HBD_VARIANCE_WXH_8_SVE(32, 32) HBD_VARIANCE_WXH_8_SVE(32, 64) HBD_VARIANCE_WXH_8_SVE(64, 32) HBD_VARIANCE_WXH_8_SVE(64, 64) HBD_VARIANCE_WXH_8_SVE(64, 128) HBD_VARIANCE_WXH_8_SVE(128, 64) HBD_VARIANCE_WXH_8_SVE(128, 128) // 10-bit HBD_VARIANCE_WXH_10_SVE(4, 4) HBD_VARIANCE_WXH_10_SVE(4, 8) HBD_VARIANCE_WXH_10_SVE(8, 4) HBD_VARIANCE_WXH_10_SVE(8, 8) HBD_VARIANCE_WXH_10_SVE(8, 16) HBD_VARIANCE_WXH_10_SVE(16, 8) HBD_VARIANCE_WXH_10_SVE(16, 16) HBD_VARIANCE_WXH_10_SVE(16, 32) HBD_VARIANCE_WXH_10_SVE(32, 16) HBD_VARIANCE_WXH_10_SVE(32, 32) HBD_VARIANCE_WXH_10_SVE(32, 64) HBD_VARIANCE_WXH_10_SVE(64, 32) HBD_VARIANCE_WXH_10_SVE(64, 64) HBD_VARIANCE_WXH_10_SVE(64, 128) HBD_VARIANCE_WXH_10_SVE(128, 64) HBD_VARIANCE_WXH_10_SVE(128, 128) // 12-bit HBD_VARIANCE_WXH_12_SVE(4, 4) HBD_VARIANCE_WXH_12_SVE(4, 8) HBD_VARIANCE_WXH_12_SVE(8, 4) HBD_VARIANCE_WXH_12_SVE(8, 8) HBD_VARIANCE_WXH_12_SVE(8, 16) HBD_VARIANCE_WXH_12_SVE(16, 8) HBD_VARIANCE_WXH_12_SVE(16, 16) HBD_VARIANCE_WXH_12_SVE(16, 32) HBD_VARIANCE_WXH_12_SVE(32, 16) HBD_VARIANCE_WXH_12_SVE(32, 32) HBD_VARIANCE_WXH_12_SVE(32, 64) HBD_VARIANCE_WXH_12_SVE(64, 32) HBD_VARIANCE_WXH_12_SVE(64, 64) HBD_VARIANCE_WXH_12_SVE(64, 128) HBD_VARIANCE_WXH_12_SVE(128, 64) HBD_VARIANCE_WXH_12_SVE(128, 128) #if !CONFIG_REALTIME_ONLY // 8-bit HBD_VARIANCE_WXH_8_SVE(4, 16) HBD_VARIANCE_WXH_8_SVE(8, 32) HBD_VARIANCE_WXH_8_SVE(16, 4) HBD_VARIANCE_WXH_8_SVE(16, 64) HBD_VARIANCE_WXH_8_SVE(32, 8) HBD_VARIANCE_WXH_8_SVE(64, 16) // 10-bit HBD_VARIANCE_WXH_10_SVE(4, 16) HBD_VARIANCE_WXH_10_SVE(8, 32) HBD_VARIANCE_WXH_10_SVE(16, 4) HBD_VARIANCE_WXH_10_SVE(16, 64) HBD_VARIANCE_WXH_10_SVE(32, 8) HBD_VARIANCE_WXH_10_SVE(64, 16) // 12-bit HBD_VARIANCE_WXH_12_SVE(4, 16) HBD_VARIANCE_WXH_12_SVE(8, 32) HBD_VARIANCE_WXH_12_SVE(16, 4) HBD_VARIANCE_WXH_12_SVE(16, 64) HBD_VARIANCE_WXH_12_SVE(32, 8) HBD_VARIANCE_WXH_12_SVE(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef HBD_VARIANCE_WXH_8_SVE #undef HBD_VARIANCE_WXH_10_SVE #undef HBD_VARIANCE_WXH_12_SVE static inline uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, unsigned int *sse) { uint64x2_t sse_u64 = vdupq_n_u64(0); do { int j = 0; do { uint16x8_t s = vld1q_u16(src_ptr + j); uint16x8_t r = vld1q_u16(ref_ptr + j); uint16x8_t diff = vabdq_u16(s, r); sse_u64 = aom_udotq_u16(sse_u64, diff, diff); j += 8; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; } while (--h != 0); *sse = (uint32_t)vaddvq_u64(sse_u64); return *sse; } #define HIGHBD_MSE_WXH_SVE(w, h) \ uint32_t aom_highbd_10_mse##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ *sse = ROUND_POWER_OF_TWO(*sse, 4); \ return *sse; \ } \ \ uint32_t aom_highbd_12_mse##w##x##h##_sve( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ *sse = ROUND_POWER_OF_TWO(*sse, 
8); \ return *sse; \ } HIGHBD_MSE_WXH_SVE(16, 16) HIGHBD_MSE_WXH_SVE(16, 8) HIGHBD_MSE_WXH_SVE(8, 16) HIGHBD_MSE_WXH_SVE(8, 8) #undef HIGHBD_MSE_WXH_SVE uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4)); uint64x2_t sum = vdupq_n_u64(0); if (w == 8) { do { uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); uint16x8_t s0 = vld1q_u16(src + 0 * sstride); uint16x8_t s1 = vld1q_u16(src + 1 * sstride); uint16x8_t abs_diff0 = vabdq_u16(s0, d0); uint16x8_t abs_diff1 = vabdq_u16(s1, d1); sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); dst += 2 * dstride; src += 2 * sstride; h -= 2; } while (h != 0); } else { // w == 4 do { uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride); uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); uint16x8_t abs_diff0 = vabdq_u16(s0, d0); uint16x8_t abs_diff1 = vabdq_u16(s1, d1); sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); dst += 4 * dstride; src += 4 * sstride; h -= 4; } while (h != 0); } return vaddvq_u64(sum); } aom-3.12.1/aom_dsp/arm/intrapred_neon.c000066400000000000000000004071401477627663500177630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/reinterpret_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" //------------------------------------------------------------------------------ // DC 4x4 static inline uint16x8_t dc_load_sum_4(const uint8_t *in) { const uint8x8_t a = load_u8_4x1(in); const uint16x4_t p0 = vpaddl_u8(a); const uint16x4_t p1 = vpadd_u16(p0, p0); return vcombine_u16(p1, vdup_n_u16(0)); } static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t dc) { for (int i = 0; i < h; ++i) { store_u8_4x1(dst + i * stride, dc); } } void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_4(above); const uint16x8_t sum_left = dc_load_sum_4(left); const uint16x8_t sum = vaddq_u16(sum_left, sum_top); const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0)); } void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_left = dc_load_sum_4(left); const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2); (void)above; dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0)); } void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_4(above); const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2); (void)left; dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0)); } void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t dc0 = vdup_n_u8(0x80); (void)above; (void)left; dc_store_4xh(dst, stride, 4, dc0); } //------------------------------------------------------------------------------ // DC 8x8 static inline uint16x8_t dc_load_sum_8(const uint8_t *in) { // This isn't used in the case where we want to load both above and left // vectors, since we want to avoid performing the reduction twice. const uint8x8_t a = vld1_u8(in); const uint16x4_t p0 = vpaddl_u8(a); const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); return vcombine_u16(p2, vdup_n_u16(0)); } static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) { #if AOM_ARCH_AARCH64 // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an // instruction, however the addv instruction is usually slightly more // expensive than a pairwise addition, so the need for immediately // broadcasting the result again seems to negate any benefit. 
const uint16x8_t b = vpaddq_u16(a, a); const uint16x8_t c = vpaddq_u16(b, b); return vpaddq_u16(c, c); #else const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a)); const uint16x4_t c = vpadd_u16(b, b); const uint16x4_t d = vpadd_u16(c, c); return vcombine_u16(d, d); #endif } static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t dc) { for (int i = 0; i < h; ++i) { vst1_u8(dst + i * stride, dc); } } void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t sum_top = vld1_u8(above); const uint8x8_t sum_left = vld1_u8(left); uint16x8_t sum = vaddl_u8(sum_left, sum_top); sum = horizontal_add_and_broadcast_u16x8(sum); const uint8x8_t dc0 = vrshrn_n_u16(sum, 4); dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); } void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_left = dc_load_sum_8(left); const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3); (void)above; dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); } void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_8(above); const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3); (void)left; dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); } void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t dc0 = vdup_n_u8(0x80); (void)above; (void)left; dc_store_8xh(dst, stride, 8, dc0); } //------------------------------------------------------------------------------ // DC 16x16 static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) { const uint8x16_t a = vld1q_u8(in); // delay the remainder of the reduction until // horizontal_add_and_broadcast_u16x8, since we want to do it once rather // than twice in the case we are loading both above and left. 
return vpaddlq_u8(a); } static inline uint16x8_t dc_load_sum_16(const uint8_t *in) { return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in)); } static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t dc) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + i * stride, dc); } } void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_partial_sum_16(above); const uint16x8_t sum_left = dc_load_partial_sum_16(left); uint16x8_t sum = vaddq_u16(sum_left, sum_top); sum = horizontal_add_and_broadcast_u16x8(sum); const uint8x8_t dc0 = vrshrn_n_u16(sum, 5); dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); } void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_left = dc_load_sum_16(left); const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4); (void)above; dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); } void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_16(above); const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4); (void)left; dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); } void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t dc0 = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16xh(dst, stride, 16, dc0); } //------------------------------------------------------------------------------ // DC 32x32 static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) { const uint8x16_t a0 = vld1q_u8(in); const uint8x16_t a1 = vld1q_u8(in + 16); // delay the remainder of the reduction until // horizontal_add_and_broadcast_u16x8, since we want to do it once rather // than twice in the case we are loading both above and left. 
return vpadalq_u8(vpaddlq_u8(a0), a1); } static inline uint16x8_t dc_load_sum_32(const uint8_t *in) { return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in)); } static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t dc) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); } } void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_partial_sum_32(above); const uint16x8_t sum_left = dc_load_partial_sum_32(left); uint16x8_t sum = vaddq_u16(sum_left, sum_top); sum = horizontal_add_and_broadcast_u16x8(sum); const uint8x8_t dc0 = vrshrn_n_u16(sum, 6); dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); } void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_left = dc_load_sum_32(left); const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5); (void)above; dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); } void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_32(above); const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5); (void)left; dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); } void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t dc0 = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32xh(dst, stride, 32, dc0); } //------------------------------------------------------------------------------ // DC 64x64 static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) { const uint8x16_t a0 = vld1q_u8(in); const uint8x16_t a1 = vld1q_u8(in + 16); const uint8x16_t a2 = vld1q_u8(in + 32); const uint8x16_t a3 = vld1q_u8(in + 48); const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1); const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3); // delay the remainder of the reduction until // horizontal_add_and_broadcast_u16x8, since we want to do it once rather // than twice in the case we are loading both above and left. 
return vaddq_u16(p01, p23); } static inline uint16x8_t dc_load_sum_64(const uint8_t *in) { return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in)); } static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t dc) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + i * stride, dc); vst1q_u8(dst + i * stride + 16, dc); vst1q_u8(dst + i * stride + 32, dc); vst1q_u8(dst + i * stride + 48, dc); } } void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_partial_sum_64(above); const uint16x8_t sum_left = dc_load_partial_sum_64(left); uint16x8_t sum = vaddq_u16(sum_left, sum_top); sum = horizontal_add_and_broadcast_u16x8(sum); const uint8x8_t dc0 = vrshrn_n_u16(sum, 7); dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); } void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_left = dc_load_sum_64(left); const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6); (void)above; dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); } void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint16x8_t sum_top = dc_load_sum_64(above); const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6); (void)left; dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); } void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t dc0 = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_64xh(dst, stride, 64, dc0); } //------------------------------------------------------------------------------ // DC rectangular cases #define DC_MULTIPLIER_1X2 0x5556 #define DC_MULTIPLIER_1X4 0x3334 #define DC_SHIFT2 16 static inline int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; } static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum, int shift1, int multiplier) { const int expected_dc = divide_using_multiply_shift( sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); assert(expected_dc < (1 << 8)); return expected_dc; } #undef DC_SHIFT2 void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t a = load_u8_4x1(above); uint8x8_t l = vld1_u8(left); uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l)); uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2); dc_store_4xh(dst, stride, 8, vdup_n_u8(dc)); } void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t a = vld1_u8(above); uint8x8_t l = load_u8_4x1(left); uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l)); uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2); dc_store_8xh(dst, stride, 4, vdup_n_u8(dc)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t a = load_u8_4x1(above); uint8x16_t l = vld1q_u8(left); uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4); dc_store_4xh(dst, stride, 16, vdup_n_u8(dc)); } void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x16_t a = vld1q_u8(above); uint8x8_t l = 
load_u8_4x1(left); uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4); dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t a = vld1_u8(above); uint8x16_t l = vld1q_u8(left); uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2); dc_store_8xh(dst, stride, 16, vdup_n_u8(dc)); } void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x16_t a = vld1q_u8(above); uint8x8_t l = vld1_u8(left); uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2); dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t a = vld1_u8(above); uint16x8_t sum_left = dc_load_partial_sum_32(left); uint16x8_t sum_al = vaddw_u8(sum_left, a); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4); dc_store_8xh(dst, stride, 32, vdup_n_u8(dc)); } void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_top = dc_load_partial_sum_32(above); uint8x8_t l = vld1_u8(left); uint16x8_t sum_al = vaddw_u8(sum_top, l); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4); dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_16(above); uint16x8_t sum_left = dc_load_partial_sum_32(left); uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2); dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc)); } void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_32(above); uint16x8_t sum_left = dc_load_partial_sum_16(left); uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2); dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_16(above); uint16x8_t sum_left = dc_load_partial_sum_64(left); uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4); dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc)); } void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_64(above); uint16x8_t sum_left = dc_load_partial_sum_16(left); uint16x8_t sum_al = 
vaddq_u16(sum_above, sum_left); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4); dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_32(above); uint16x8_t sum_left = dc_load_partial_sum_64(left); uint16x8_t sum_al = vaddq_u16(sum_above, sum_left); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2); dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc)); } void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint16x8_t sum_above = dc_load_partial_sum_64(above); uint16x8_t sum_left = dc_load_partial_sum_32(left); uint16x8_t sum_al = vaddq_u16(sum_above, sum_left); uint32_t sum = horizontal_add_u16x8(sum_al); uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2); dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc)); } #undef DC_MULTIPLIER_1X2 #undef DC_MULTIPLIER_1X4 #define DC_PREDICTOR_128(w, h, q) \ void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *above, \ const uint8_t *left) { \ (void)above; \ (void)left; \ dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_128(4, 16, ) DC_PREDICTOR_128(8, 32, ) DC_PREDICTOR_128(16, 4, q) DC_PREDICTOR_128(16, 64, q) DC_PREDICTOR_128(32, 8, q) DC_PREDICTOR_128(64, 16, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_128(4, 8, ) DC_PREDICTOR_128(8, 4, ) DC_PREDICTOR_128(8, 16, ) DC_PREDICTOR_128(16, 8, q) DC_PREDICTOR_128(16, 32, q) DC_PREDICTOR_128(32, 16, q) DC_PREDICTOR_128(32, 64, q) DC_PREDICTOR_128(64, 32, q) #undef DC_PREDICTOR_128 #define DC_PREDICTOR_LEFT(w, h, shift, q) \ void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *above, \ const uint8_t *left) { \ (void)above; \ const uint16x8_t sum = dc_load_sum_##h(left); \ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \ } DC_PREDICTOR_LEFT(4, 8, 3, ) DC_PREDICTOR_LEFT(8, 4, 2, ) DC_PREDICTOR_LEFT(8, 16, 4, ) DC_PREDICTOR_LEFT(16, 8, 3, q) DC_PREDICTOR_LEFT(16, 32, 5, q) DC_PREDICTOR_LEFT(32, 16, 4, q) DC_PREDICTOR_LEFT(32, 64, 6, q) DC_PREDICTOR_LEFT(64, 32, 5, q) #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_LEFT(4, 16, 4, ) DC_PREDICTOR_LEFT(16, 4, 2, q) DC_PREDICTOR_LEFT(8, 32, 5, ) DC_PREDICTOR_LEFT(32, 8, 3, q) DC_PREDICTOR_LEFT(16, 64, 6, q) DC_PREDICTOR_LEFT(64, 16, 4, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef DC_PREDICTOR_LEFT #define DC_PREDICTOR_TOP(w, h, shift, q) \ void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *above, \ const uint8_t *left) { \ (void)left; \ const uint16x8_t sum = dc_load_sum_##w(above); \ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_TOP(8, 32, 3, ) DC_PREDICTOR_TOP(4, 16, 2, ) DC_PREDICTOR_TOP(16, 4, 4, q) DC_PREDICTOR_TOP(16, 64, 4, q) DC_PREDICTOR_TOP(32, 8, 5, q) DC_PREDICTOR_TOP(64, 16, 6, q) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER DC_PREDICTOR_TOP(4, 8, 2, ) DC_PREDICTOR_TOP(8, 4, 3, ) DC_PREDICTOR_TOP(8, 16, 
3, ) DC_PREDICTOR_TOP(16, 8, 4, q) DC_PREDICTOR_TOP(16, 32, 4, q) DC_PREDICTOR_TOP(32, 16, 5, q) DC_PREDICTOR_TOP(32, 64, 5, q) DC_PREDICTOR_TOP(64, 32, 6, q) #undef DC_PREDICTOR_TOP // ----------------------------------------------------------------------------- static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t d0) { for (int i = 0; i < h; ++i) { store_u8_4x1(dst + i * stride, d0); } } static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t d0) { for (int i = 0; i < h; ++i) { vst1_u8(dst + i * stride, d0); } } static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t d0) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + i * stride, d0); } } static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t d0, uint8x16_t d1) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + 0, d0); vst1q_u8(dst + 16, d1); dst += stride; } } static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x16_t d0, uint8x16_t d1, uint8x16_t d2, uint8x16_t d3) { for (int i = 0; i < h; ++i) { vst1q_u8(dst + 0, d0); vst1q_u8(dst + 16, d1); vst1q_u8(dst + 32, d2); vst1q_u8(dst + 48, d3); dst += stride; } } void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_4xh(dst, stride, 4, load_u8_4x1(above)); } void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_8xh(dst, stride, 8, vld1_u8(above)); } void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_16xh(dst, stride, 16, vld1q_u8(above)); } void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); (void)left; v_store_32xh(dst, stride, 32, d0, d1); } void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_4xh(dst, stride, 8, load_u8_4x1(above)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_4xh(dst, stride, 16, load_u8_4x1(above)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_8xh(dst, stride, 4, vld1_u8(above)); } void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_8xh(dst, stride, 16, vld1_u8(above)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_8xh(dst, stride, 32, vld1_u8(above)); } void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_16xh(dst, stride, 4, vld1q_u8(above)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_16xh(dst, stride, 8, vld1q_u8(above)); } void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_16xh(dst, stride, 32, vld1q_u8(above)); } #if !CONFIG_REALTIME_ONLY || 
CONFIG_AV1_DECODER void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_store_16xh(dst, stride, 64, vld1q_u8(above)); } void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); (void)left; v_store_32xh(dst, stride, 8, d0, d1); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); (void)left; v_store_32xh(dst, stride, 16, d0, d1); } void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); (void)left; v_store_32xh(dst, stride, 64, d0, d1); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); const uint8x16_t d2 = vld1q_u8(above + 32); const uint8x16_t d3 = vld1q_u8(above + 48); (void)left; v_store_64xh(dst, stride, 16, d0, d1, d2, d3); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); const uint8x16_t d2 = vld1q_u8(above + 32); const uint8x16_t d3 = vld1q_u8(above + 48); (void)left; v_store_64xh(dst, stride, 32, d0, d1, d2, d3); } void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(above); const uint8x16_t d1 = vld1q_u8(above + 16); const uint8x16_t d2 = vld1q_u8(above + 32); const uint8x16_t d3 = vld1q_u8(above + 48); (void)left; v_store_64xh(dst, stride, 64, d0, d1, d2, d3); } // ----------------------------------------------------------------------------- static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4)); store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5)); store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6)); store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7)); } static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0)); vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1)); vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2)); vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3)); vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4)); vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5)); vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6)); vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7)); } static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3)); vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4)); vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5)); vst1q_u8(dst + 6 * 
stride, vdupq_lane_u8(d0, 6)); vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7)); } static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7)); } static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6)); dst += stride; vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7)); vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7)); vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7)); vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7)); } void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = load_u8_4x1(left); (void)above; store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); } void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = vld1_u8(left); (void)above; h_store_8x8(dst, stride, d0); } void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); (void)above; h_store_16x8(dst, stride, vget_low_u8(d0)); h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); } void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); const uint8x16_t d1 = vld1q_u8(left + 16); (void)above; h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); 
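// Each h_store_32x8 call broadcasts eight left pixels, one per output row, across a full 32-byte row, so the remaining three calls below cover rows 8-31.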
h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1)); h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1)); } void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = vld1_u8(left); (void)above; h_store_4x8(dst, stride, d0); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); (void)above; h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = load_u8_4x1(left); (void)above; vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0)); vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1)); vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2)); vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3)); } void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); (void)above; h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); const uint8x16_t d1 = vld1q_u8(left + 16); (void)above; h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0)); h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1)); h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1)); } void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = load_u8_4x1(left); (void)above; vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0)); vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1)); vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2)); vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = vld1_u8(left); (void)above; h_store_16x8(dst, stride, d0); } void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); const uint8x16_t d1 = vld1q_u8(left + 16); (void)above; h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1)); h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); const uint8x16_t d1 = vld1q_u8(left + 16); const uint8x16_t d2 = vld1q_u8(left + 32); const uint8x16_t d3 = vld1q_u8(left + 48); (void)above; h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1)); h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1)); h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2)); h_store_16x8(dst + 40 * stride, 
stride, vget_high_u8(d2)); h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3)); h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3)); } void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = vld1_u8(left); (void)above; h_store_32x8(dst, stride, d0); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); (void)above; h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); } void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left + 0); const uint8x16_t d1 = vld1q_u8(left + 16); const uint8x16_t d2 = vld1q_u8(left + 32); const uint8x16_t d3 = vld1q_u8(left + 48); (void)above; h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1)); h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1)); h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2)); h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2)); h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3)); h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3)); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t d0 = vld1q_u8(left); (void)above; h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; for (int i = 0; i < 2; ++i) { const uint8x16_t d0 = vld1q_u8(left); h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); left += 16; dst += 16 * stride; } } void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; for (int i = 0; i < 4; ++i) { const uint8x16_t d0 = vld1q_u8(left); h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); left += 16; dst += 16 * stride; } } /* ---------------------P R E D I C T I O N Z 1--------------------------- */ // Low bit depth functions static DECLARE_ALIGNED(32, const uint8_t, BaseMask[33][32]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, }; static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64( int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((W + H) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]); int x = dx; for (int r = 0; r < W; r++) { int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { for (int i = r; i < W; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } if (base_max_diff > H) base_max_diff = H; uint8x8x2_t a01_128; uint16x8_t shift; if (upsample_above) { a01_128 = vld2_u8(above + base); shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1); } else { a01_128.val[0] = vld1_u8(above + base); a01_128.val[1] = vld1_u8(above + base + 1); shift = vdupq_n_u16((x & 0x3f) >> 1); } uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]); uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32)); uint16x8_t res = vmlaq_u16(a32, diff, shift); uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]); dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x); x += dx; } } static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { uint8x8_t dstvec[16]; dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { vst1_lane_u32((uint32_t *)(dst + stride * i), vreinterpret_u32_u8(dstvec[i]), 0); } } static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { uint8x8_t dstvec[32]; dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { vst1_u8(dst + stride * i, dstvec[i]); } } static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon( int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((W + H) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); int x = dx; for (int r = 0; r < W; r++) { int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { for (int i = r; i < W; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } if (base_max_diff > H) base_max_diff = H; uint16x8_t shift; uint8x16_t a0_128, a1_128; if (upsample_above) { uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base); a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]); a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8); shift = vdupq_n_u16(x & 0x1f); } else { a0_128 = 
vld1q_u8(above + base); a1_128 = vld1q_u8(above + base + 1); shift = vdupq_n_u16((x & 0x3f) >> 1); } uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); uint8x16_t v_temp = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]); dst[r] = vbslq_u8(mask, v_temp, a_mbase_x); x += dx; } } static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { uint8x16_t dstvec[64]; dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { vst1q_u8(dst + stride * i, dstvec[i]); } } static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon( int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); int x = dx; for (int r = 0; r < N; r++) { int base = x >> frac_bits; int base_max_diff = (max_base_x - base); if (base_max_diff <= 0) { for (int i = r; i < N; ++i) { dstvec[i].val[0] = a_mbase_x; // save 32 values dstvec[i].val[1] = a_mbase_x; } return; } if (base_max_diff > 32) base_max_diff = 32; uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); uint8x16_t res16[2]; for (int j = 0, jj = 0; j < 32; j += 16, jj++) { int mdiff = base_max_diff - j; if (mdiff <= 0) { res16[jj] = a_mbase_x; } else { uint8x16_t a0_128 = vld1q_u8(above + base + j); uint8x16_t a1_128 = vld1q_u8(above + base + j + 1); uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); } } uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]); uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16); dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x); dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x); x += dx; } } static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int dx) { uint8x16x2_t dstvec[64]; dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx); for (int i = 0; i < N; i++) { vst1q_u8(dst + stride * i, dstvec[i].val[0]); vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]); } } // clang-format off static const uint8_t kLoadMaxShuffles[] = { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 10, 11, 12, 13, 14, 
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; // clang-format on static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr, int shuffle_idx) { uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]); uint8x16_t src = vld1q_u8(ptr); #if AOM_ARCH_AARCH64 return vqtbl1q_u8(src, shuffle); #else uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } }; uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle)); uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle)); return vcombine_u8(lo, hi); #endif } static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); int x = dx; for (int r = 0; r < N; r++, dst += stride) { int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { vst1q_u8(dst, a_mbase_x); vst1q_u8(dst + 16, a_mbase_x); vst1q_u8(dst + 32, a_mbase_x); vst1q_u8(dst + 48, a_mbase_x); dst += stride; } return; } uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); uint8x16_t base_inc128 = vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100), vcreate_u8(0x0F0E0D0C0B0A0908))); for (int j = 0; j < 64; j += 16) { if (base + j >= max_base_x) { vst1q_u8(dst + j, a_mbase_x); } else { uint8x16_t a0_128; uint8x16_t a1_128; if (base + j + 15 >= max_base_x) { int shuffle_idx = max_base_x - base - j; a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); } else { a0_128 = vld1q_u8(above + base + j); } if (base + j + 16 >= max_base_x) { int shuffle_idx = max_base_x - base - j - 1; a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx); } else { a1_128 = vld1q_u8(above + base + j + 1); } uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); vst1q_u8(dst + j, vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5))); base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16)); } } x += dx; } } // Directional prediction, zone 1: 0 < angle < 90 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy) { (void)left; (void)dy; switch (bw) { case 4: dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx); break; case 8: dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx); break; case 16: 
dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx); break; case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break; case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break; default: break; } } /* ---------------------P R E D I C T I O N Z 2--------------------------- */ #if !AOM_ARCH_AARCH64 static DECLARE_ALIGNED(16, const uint8_t, LoadMaskz2[4][16]) = { { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }; #endif // !AOM_ARCH_AARCH64 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon( const uint8_t *above, int upsample_above, int dx, int base_x, int y, uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) { uint16x4_t r6 = vcreate_u16(0x00C0008000400000); uint16x4_t ydx = vdup_n_u16(y * dx); if (upsample_above) { // Cannot use LD2 here since we only want to load eight bytes, but LD2 can // only load either 16 or 32. uint8x8_t v_tmp = vld1_u8(above + base_x); *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0]; *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1]; *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f)); } else { *a0_x = load_unaligned_u8_4x1(above + base_x); *a1_x = load_unaligned_u8_4x1(above + base_x + 1); *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f)); } } static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon( #if AOM_ARCH_AARCH64 uint8x16x2_t left_vals, #else const uint8_t *left, #endif int upsample_left, int dy, int r, int min_base_y, int frac_bits_y, uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) { int16x4_t dy64 = vdup_n_s16(dy); int16x4_t v_1234 = vcreate_s16(0x0004000300020001); int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); int16x4_t min_base_y64 = vdup_n_s16(min_base_y); int16x4_t v_r6 = vdup_n_s16(r << 6); int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); // Values in base_y_c64 range from -2 through 14 inclusive. 
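// Clamp to min_base_y so the smallest index is -2 (or -1 without upsampling); the +2/+3 bias applied below then maps it onto element 0 of left_vals, the table the AArch64 caller builds starting at left[-2].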
base_y_c64 = vmax_s16(base_y_c64, min_base_y64); #if AOM_ARCH_AARCH64 uint8x8_t left_idx0 = vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] uint8x8_t left_idx1 = vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0)); *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1)); #else // !AOM_ARCH_AARCH64 DECLARE_ALIGNED(32, int16_t, base_y_c[4]); vst1_s16(base_y_c, base_y_c64); uint8x8_t a0_y_u8 = vdup_n_u8(0); a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0); a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2); a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4); a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6); base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); vst1_s16(base_y_c, base_y_c64); uint8x8_t a1_y_u8 = vdup_n_u8(0); a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0); a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2); a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4); a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6); *a0_y = vreinterpret_u16_u8(a0_y_u8); *a1_y = vreinterpret_u16_u8(a1_y_u8); #endif // AOM_ARCH_AARCH64 if (upsample_left) { *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f)); } else { *shift1 = vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f)); } } static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon( const uint8_t *above, int upsample_above, int dx, int base_x, int y) { uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), vcreate_u16(0x0008000700060005)); uint16x8_t ydx = vdupq_n_u16(y * dx); uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6); uint16x8_t shift0; uint8x8_t a0_x0; uint8x8_t a1_x0; if (upsample_above) { uint8x8x2_t v_tmp = vld2_u8(above + base_x); a0_x0 = v_tmp.val[0]; a1_x0 = v_tmp.val[1]; shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); } else { a0_x0 = vld1_u8(above + base_x); a1_x0 = vld1_u8(above + base_x + 1); shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); } uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 uint16x8_t res = vmlaq_u16(a32, diff0, shift0); return vshrn_n_u16(res, 5); } static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon( #if AOM_ARCH_AARCH64 uint8x16x3_t left_vals, #else const uint8_t *left, #endif int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) { int16x8_t v_r6 = vdupq_n_s16(r << 6); int16x8_t dy128 = vdupq_n_s16(dy); int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), vcreate_u16(0x0008000700060005)); int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); // Values in base_y_c128 range from -2 through 31 inclusive. 
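// Same clamp and +2/+3 bias as the Nx4 path; on AArch64 the two biased index vectors are packed with vuzp1q so a single vqtbl3q lookup over the 48-byte left table returns left[base] in its low half and left[base + 1] in its high half.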
base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128); #if AOM_ARCH_AARCH64 uint8x16_t left_idx0 = vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] uint8x16_t left_idx1 = vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); uint8x8_t a0_x1 = vget_low_u8(a01_x); uint8x8_t a1_x1 = vget_high_u8(a01_x); #else // !AOM_ARCH_AARCH64 uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128); uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128); #endif // AOM_ARCH_AARCH64 uint16x8_t shift1; if (upsample_left) { shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f)); } else { shift1 = vshrq_n_u16( vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1); } uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1); uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32)); uint16x8_t res = vmlaq_u16(a32, diff1, shift1); return vshrn_n_u16(res, 5); } static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon( const uint8_t *above, int dx, int base_x, int y, int j) { uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), vcreate_u16(0x0007000600050004)), vcombine_u16(vcreate_u16(0x000B000A00090008), vcreate_u16(0x000F000E000D000C)) } }; uint16x8_t j256 = vdupq_n_u16(j); uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j); const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1); uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); uint16x8_t shift0 = vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1); uint16x8_t shift1 = vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1); // a[x+1] - a[x] uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128)); uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128)); // a[x] * 32 + 16 uint16x8_t a32_0 = vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32)); uint16x8_t a32_1 = vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32)); uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0); uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1); return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); } static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon( #if AOM_ARCH_AARCH64 uint8x16x4_t left_vals0, uint8x16x4_t left_vals1, #else const uint8_t *left, #endif int dy, int r, int j) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_y = -1; int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1); int16x8_t dy256 = vdupq_n_s16(dy); uint16x8_t j256 = vdupq_n_u16(j); uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), vcreate_u16(0x0007000600050004)), vcombine_u16(vcreate_u16(0x000B000A00090008), vcreate_u16(0x000F000E000D000C)) } }; uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)), vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } }; int16x8_t v_r6 = vdupq_n_s16(r << 6); int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0])); int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1])); int16x8_t mul16_lo = vreinterpretq_s16_u16( vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)), vreinterpretq_u16_s16(half_min_base_y256))); int16x8_t mul16_hi = vreinterpretq_s16_u16( 
vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)), vreinterpretq_u16_s16(half_min_base_y256))); int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo); int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi); int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6); int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6); base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo); base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi); #if !AOM_ARCH_AARCH64 int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7); int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0); int16_t offset_diff = max_y - min_y; uint8x8_t a0_y0; uint8x8_t a0_y1; uint8x8_t a1_y0; uint8x8_t a1_y1; if (offset_diff < 16) { // Avoid gathers where the data we want is close together in memory. // We don't need this for AArch64 since we can already use TBL to cover the // full range of possible values. assert(offset_diff >= 0); int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3); int16x8x2_t base_y_offset; base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256); base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256); int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]), vqmovn_s16(base_y_offset.val[1])); uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); uint8x16_t a0_y128 = vld1q_u8(left + min_y); uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1); a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); uint8x8_t v_index_high = vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); uint8x8x2_t v_tmp, v_res; v_tmp.val[0] = vget_low_u8(a0_y128); v_tmp.val[1] = vget_high_u8(a0_y128); v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); v_tmp.val[0] = vget_low_u8(a1_y128); v_tmp.val[1] = vget_high_u8(a1_y128); v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); a0_y0 = vget_low_u8(a0_y128); a0_y1 = vget_high_u8(a0_y128); a1_y0 = vget_low_u8(a1_y128); a1_y1 = vget_high_u8(a1_y128); } else { a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo); a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi); a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo); a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi); } #else // Values in left_idx{0,1} range from 0 through 63 inclusive. 
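// The bias here is only +1 since min_base_y is -1 (no upsampling in this path); the same packed indices are looked up in two 64-byte tables: left_vals0 starts at left[-1] and yields left[base], while left_vals1 starts at left[0] and yields left[base + 1].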
uint8x16_t left_idx0 = vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1))); uint8x16_t left_idx1 = vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1))); uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); uint8x8_t a0_y0 = vget_low_u8(a0_y01); uint8x8_t a0_y1 = vget_high_u8(a0_y01); uint8x8_t a1_y0 = vget_low_u8(a1_y01); uint8x8_t a1_y1 = vget_high_u8(a1_y01); #endif // !AOM_ARCH_AARCH64 uint16x8_t shifty_lo = vshrq_n_u16( vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1); uint16x8_t shifty_hi = vshrq_n_u16( vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1); // a[x+1] - a[x] uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0); uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1); // a[x] * 32 + 16 uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32)); uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32)); uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo); uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi); return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); } static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 14 directly to avoid over-read. 
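// The two vectors below form the 32-byte TBL table consumed by dr_prediction_z2_Nx4_left_neon; anchoring it at left[-2] keeps the biased indices computed there non-negative.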
const uint8x16_t left_m2 = vld1q_u8(left - 2); const uint8x16_t left_0 = vld1q_u8(left); const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14); const uint8x16x2_t left_vals = { { left_m2, left_14 } }; #define LEFT left_vals #else // !AOM_ARCH_AARCH64 #define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; const int base_min_diff = (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >> upsample_above; if (base_min_diff <= 0) { uint8x8_t a0_x_u8, a1_x_u8; uint16x4_t shift0; dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, &a0_x_u8, &a1_x_u8, &shift0); uint8x8_t a0_x = a0_x_u8; uint8x8_t a1_x = a1_x_u8; uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x] uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16 uint16x8_t res = vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0))); uint8x8_t resx = vshrn_n_u16(res, 5); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0); } else if (base_min_diff < 4) { uint8x8_t a0_x_u8, a1_x_u8; uint16x4_t shift0; dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, &a0_x_u8, &a1_x_u8, &shift0); uint16x8_t a0_x = vmovl_u8(a0_x_u8); uint16x8_t a1_x = vmovl_u8(a1_x_u8); uint16x4_t a0_y; uint16x4_t a1_y; uint16x4_t shift1; dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, frac_bits_y, &a0_y, &a1_y, &shift1); a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y); a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y); uint16x8_t shift = vcombine_u16(shift0, shift1); uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] uint16x8_t a32 = vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16 uint16x8_t res = vmlaq_u16(a32, diff, shift); uint8x8_t resx = vshrn_n_u16(res, 5); uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4); uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); } else { uint16x4_t a0_y, a1_y; uint16x4_t shift1; dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, frac_bits_y, &a0_y, &a1_y, &shift1); uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x] uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16 uint16x4_t res = vmla_u16(a32, diff, shift1); uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0); } dst += stride; } #undef LEFT } static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 30 directly to avoid over-read. 
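// These three vectors form the 48-byte TBL table consumed by dr_prediction_z2_Nx8_left_neon, again anchored at left[-2] so the biased indices stay non-negative.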
const uint8x16_t left_m2 = vld1q_u8(left - 2); const uint8x16_t left_0 = vld1q_u8(left + 0); const uint8x16_t left_16 = vld1q_u8(left + 16); const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14); const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14); const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } }; #define LEFT left_vals #else // !AOM_ARCH_AARCH64 #define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff <= 0) { uint8x8_t resx = dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); vst1_u8(dst, resx); } else if (base_min_diff < 8) { uint8x8_t resx = dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); uint8x8_t resxy = vbsl_u8(mask, resy, resx); vst1_u8(dst, resxy); } else { uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); vst1_u8(dst, resy); } dst += stride; } #undef LEFT } static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; #if AOM_ARCH_AARCH64 const uint8x16_t left_m1 = vld1q_u8(left - 1); const uint8x16_t left_0 = vld1q_u8(left + 0); const uint8x16_t left_16 = vld1q_u8(left + 16); const uint8x16_t left_32 = vld1q_u8(left + 32); const uint8x16_t left_48 = vld1q_u8(left + 48); const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15); const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15); const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15); const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } }; const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } }; #define LEFT left_vals0, left_vals1 #else // !AOM_ARCH_AARCH64 #define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < H; r++) { int y = r + 1; int base_x = (-y * dx) >> 6; for (int j = 0; j < W; j += 16) { const int base_min_diff = min_base_x - base_x - j; if (base_min_diff <= 0) { uint8x16_t resx = dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); vst1q_u8(dst + j, resx); } else if (base_min_diff < 16) { uint8x16_t resx = dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); uint8x16_t resxy = vbslq_u8(mask, resy, resx); vst1q_u8(dst + j, resxy); } else { uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); vst1q_u8(dst + j, resy); } } // for j dst += stride; } #undef LEFT } // Directional prediction, zone 2: 90 < angle < 180 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { assert(dx > 0); assert(dy > 0); switch (bw) { case 4: dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; case 8: dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; default: dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy); break; } } /* ---------------------P R E D I C T I O N Z 
3--------------------------- */ #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x, uint8x16x2_t *d) { uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), vreinterpretq_u16_u8(w1.val[0]))); d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), vreinterpretq_u16_u8(w1.val[1]))); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x, uint8x8x2_t *d) { uint8x8x2_t w0 = vzip_u8(x[0], x[1]); uint8x8x2_t w1 = vzip_u8(x[2], x[3]); *d = aom_reinterpret_u8_u16_x2( vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); } static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x, uint8x8x2_t *d) { uint8x8x2_t w0 = vzip_u8(x[0], x[1]); uint8x8x2_t w1 = vzip_u8(x[2], x[3]); d[0] = aom_reinterpret_u8_u16_x2( vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); d[1] = aom_reinterpret_u8_u16_x2( vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]))); } static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, ptrdiff_t pitchDst) { // The same as the normal transposes in transpose_neon.h, but with a stride // between consecutive vectors of elements. uint8x16_t r[16]; uint8x16_t d[16]; for (int i = 0; i < 16; i++) { r[i] = vld1q_u8(src + i * pitchSrc); } transpose_arrays_u8_16x16(r, d); for (int i = 0; i < 16; i++) { vst1q_u8(dst + i * pitchDst, d[i]); } } static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, ptrdiff_t pitchDst, int width, int height) { for (int j = 0; j < height; j += 16) { for (int i = 0; i < width; i += 16) { z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i, pitchDst); } } } static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; uint8x8x2_t dest; dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy); z3_transpose_arrays_u8_4x4(dstvec, &dest); store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]); store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]); } static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy); transpose_arrays_u8_8x8(dstvec, d); store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); } static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; uint8x8x2_t d[2]; dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy); z3_transpose_arrays_u8_8x4(dstvec, d); store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]); store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]); store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]); store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]); } static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy); transpose_arrays_u8_8x8(dstvec, d); store_u8_8x4(dst, stride, 
d[0], d[1], d[2], d[3]); } static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[8]; uint8x8_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy); transpose_arrays_u8_16x8(dstvec, d); for (int i = 0; i < 16; i++) { vst1_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy); transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 8; i++) { vst1q_u8(dst + i * stride, d[i]); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[4]; uint8x16x2_t d[2]; dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy); z3_transpose_arrays_u8_16x4(dstvec, d); store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]); store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]); store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]); store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]); } static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy); transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 4; i++) { vst1q_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8x16x2_t dstvec[16]; uint8x16_t d[32]; uint8x16_t v_zero = vdupq_n_u8(0); dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy); for (int i = 8; i < 16; i++) { dstvec[i].val[0] = v_zero; dstvec[i].val[1] = v_zero; } transpose_arrays_u8_32x16(dstvec, d); for (int i = 0; i < 32; i++) { vst1_u8(dst + i * stride, vget_low_u8(d[i])); } } static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[32]; uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy); transpose_arrays_u8_8x16(dstvec, d); transpose_arrays_u8_8x16(dstvec + 16, d + 8); for (int i = 0; i < 8; i++) { vst1q_u8(dst + i * stride, d[i]); vst1q_u8(dst + i * stride + 16, d[i + 8]); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[16]; uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy); transpose_arrays_u8_16x16(dstvec, d); for (int i = 0; i < 16; i++) { vst1q_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8x16x2_t dstvec[32]; uint8x16_t d[64]; dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy); transpose_arrays_u8_32x16(dstvec, d); transpose_arrays_u8_32x16(dstvec + 16, d + 32); for (int i = 0; i < 32; i++) { vst1q_u8(dst + i * stride, d[i]); vst1q_u8(dst + i * stride + 16, d[i + 32]); } } static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; 
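  // Zone 3 (180 < angle < 270) reads only the left edge, in the same pattern
  // that zone 1 reads the top edge, so run the zone 1 kernel into a 64x64
  // temporary buffer and then transpose the result into the destination.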
DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy); z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64); } static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8x16x2_t dstvec[16]; uint8x16_t d[32]; dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy); transpose_arrays_u8_32x16(dstvec, d); for (int i = 0; i < 16; i++) { vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]); vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]); } } static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[32]; dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy); for (int i = 0; i < 32; i += 16) { uint8x16_t d[16]; transpose_arrays_u8_16x16(dstvec + i, d); for (int j = 0; j < 16; j++) { vst1q_u8(dst + j * stride + i, d[j]); } } } static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8_t dstT[64 * 32]; dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy); z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64); } static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8_t dstT[32 * 64]; dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy); z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { (void)upsample_left; uint8_t dstT[64 * 16]; dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy); z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64); } static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[64]; dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy); for (int i = 0; i < 64; i += 16) { uint8x16_t d[16]; transpose_arrays_u8_16x16(dstvec + i, d); for (int j = 0; j < 16; ++j) { vst1q_u8(dst + j * stride + i, d[j]); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy); #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = { { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, NULL, NULL, NULL }, { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon, dr_prediction_z3_8x16_neon, NULL, NULL }, { NULL, NULL, NULL, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon, NULL }, { NULL, NULL, NULL, NULL, dr_prediction_z3_32x16_neon, dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon }, { NULL, NULL, NULL, NULL, NULL, dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon }, }; #else static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = { { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, dr_prediction_z3_4x16_neon, NULL, NULL }, { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon, dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, 
NULL }, { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon, dr_prediction_z3_16x64_neon }, { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon, dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon }, { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon, dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon }, }; #endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { (void)above; (void)dx; assert(dx == 1); assert(dy > 0); dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)]; assert(f != NULL); f(dst, stride, left, upsample_left, dy); } // ----------------------------------------------------------------------------- // SMOOTH_PRED // 256 - v = vneg_s8(v) static inline uint8x8_t negate_s8(const uint8x8_t v) { return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); } static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, const uint8_t *const left_column, const int height) { const uint8_t top_right = top_row[3]; const uint8_t bottom_left = left_column[height - 1]; const uint8_t *const weights_y = smooth_weights + height - 4; uint8x8_t top_v = load_u8_4x1(top_row); const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); uint8x8_t weights_x_v = load_u8_4x1(smooth_weights); const uint8x8_t scaled_weights_x = negate_s8(weights_x_v); const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); assert(height > 0); int y = 0; do { const uint8x8_t left_v = vdup_n_u8(left_column[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); const uint16x8_t weighted_top_bl = vmlal_u8(weighted_bl, weights_y_v, top_v); const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x_v, left_v); // Maximum value of each parameter: 0xFF00 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0); dst += stride; } while (++y != height); } static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl, const uint16x8_t weighted_left_tr) { // Maximum value of each parameter: 0xFF00 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE); } static inline uint8x8_t calculate_weights_and_pred( const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, const uint8x8_t bottom_left, const uint8x8_t weights_x, const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { const uint16x8_t weighted_top = vmull_u8(weights_y, top); const uint16x8_t weighted_top_bl = vmlal_u8(weighted_top, scaled_weights_y, bottom_left); const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); return calculate_pred(weighted_top_bl, weighted_left_tr); } static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, const uint8_t *const left_column, const int height) { const uint8_t top_right = top_row[7]; const uint8_t bottom_left = left_column[height - 1]; const uint8_t *const weights_y = smooth_weights + height - 4; const uint8x8_t top_v = 
vld1_u8(top_row); const uint8x8_t top_right_v = vdup_n_u8(top_right); const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4); const uint8x8_t scaled_weights_x = negate_s8(weights_x_v); const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); assert(height > 0); int y = 0; do { const uint8x8_t left_v = vdup_n_u8(left_column[y]); const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); const uint8x8_t result = calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v, weights_x_v, scaled_weights_y, weights_y_v); vst1_u8(dst, result); dst += stride; } while (++y != height); } #define SMOOTH_NXM(W, H) \ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \ const uint8_t *above, \ const uint8_t *left) { \ smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } SMOOTH_NXM(4, 4) SMOOTH_NXM(4, 8) SMOOTH_NXM(8, 4) SMOOTH_NXM(8, 8) SMOOTH_NXM(8, 16) #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_NXM(4, 16) SMOOTH_NXM(8, 32) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #undef SMOOTH_NXM static inline uint8x16_t calculate_weights_and_predq( const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, const uint8x8_t weights_y, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { const uint16x8_t weighted_top_bl_low = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); const uint16x8_t weighted_left_tr_low = vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); const uint8x8_t result_low = calculate_pred(weighted_top_bl_low, weighted_left_tr_low); const uint16x8_t weighted_top_bl_high = vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); const uint16x8_t weighted_left_tr_high = vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t result_high = calculate_pred(weighted_top_bl_high, weighted_left_tr_high); return vcombine_u8(result_low, result_high); } // 256 - v = vneg_s8(v) static inline uint8x16_t negate_s8q(const uint8x16_t v) { return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); } // For width 16 and above. 
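// The SMOOTH_PREDICTOR macro below applies the same per-pixel arithmetic as
// the 4- and 8-wide versions above, 16 pixels at a time. For reference, a
// minimal scalar sketch of that arithmetic (illustrative helper only, not
// used by the build; w_y and w_x are the smooth_weights entries for the
// current row and column):
static inline uint8_t smooth_pixel_scalar_sketch(uint8_t top, uint8_t left,
                                                 uint8_t top_right,
                                                 uint8_t bottom_left,
                                                 uint8_t w_y, uint8_t w_x) {
  // Vertical and horizontal interpolations, each pair of weights summing to
  // 256 (1 << SMOOTH_WEIGHT_LOG2_SCALE).
  const uint32_t vert =
      (uint32_t)w_y * top + (uint32_t)(256 - w_y) * bottom_left;
  const uint32_t horz =
      (uint32_t)w_x * left + (uint32_t)(256 - w_x) * top_right;
  // Rounded average of the two, i.e. >> (SMOOTH_WEIGHT_LOG2_SCALE + 1). The
  // NEON code reaches the same value with vhaddq_u16 + vrshrn_n_u16.
  return (uint8_t)((vert + horz + 256) >> 9);
}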
#define SMOOTH_PREDICTOR(W) \ static void smooth_##W##xh_neon( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ const uint8_t *const left_column, const int height) { \ const uint8_t top_right = top_row[(W)-1]; \ const uint8_t bottom_left = left_column[height - 1]; \ const uint8_t *const weights_y = smooth_weights + height - 4; \ \ uint8x16_t top_v[4]; \ top_v[0] = vld1q_u8(top_row); \ if ((W) > 16) { \ top_v[1] = vld1q_u8(top_row + 16); \ if ((W) == 64) { \ top_v[2] = vld1q_u8(top_row + 32); \ top_v[3] = vld1q_u8(top_row + 48); \ } \ } \ \ const uint8x8_t top_right_v = vdup_n_u8(top_right); \ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ \ uint8x16_t weights_x_v[4]; \ weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \ if ((W) > 16) { \ weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \ if ((W) == 64) { \ weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \ weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \ } \ } \ \ uint8x16_t scaled_weights_x[4]; \ scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \ if ((W) > 16) { \ scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \ if ((W) == 64) { \ scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \ scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \ } \ } \ \ for (int y = 0; y < height; ++y) { \ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ const uint16x8_t weighted_bl = \ vmull_u8(scaled_weights_y, bottom_left_v); \ \ vst1q_u8(dst, calculate_weights_and_predq( \ top_v[0], left_v, top_right_v, weights_y_v, \ weights_x_v[0], scaled_weights_x[0], weighted_bl)); \ \ if ((W) > 16) { \ vst1q_u8(dst + 16, \ calculate_weights_and_predq( \ top_v[1], left_v, top_right_v, weights_y_v, \ weights_x_v[1], scaled_weights_x[1], weighted_bl)); \ if ((W) == 64) { \ vst1q_u8(dst + 32, \ calculate_weights_and_predq( \ top_v[2], left_v, top_right_v, weights_y_v, \ weights_x_v[2], scaled_weights_x[2], weighted_bl)); \ vst1q_u8(dst + 48, \ calculate_weights_and_predq( \ top_v[3], left_v, top_right_v, weights_y_v, \ weights_x_v[3], scaled_weights_x[3], weighted_bl)); \ } \ } \ \ dst += stride; \ } \ } SMOOTH_PREDICTOR(16) SMOOTH_PREDICTOR(32) SMOOTH_PREDICTOR(64) #undef SMOOTH_PREDICTOR #define SMOOTH_NXM_WIDE(W, H) \ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \ const uint8_t *above, \ const uint8_t *left) { \ smooth_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_NXM_WIDE(16, 4) SMOOTH_NXM_WIDE(16, 64) SMOOTH_NXM_WIDE(32, 8) SMOOTH_NXM_WIDE(64, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_NXM_WIDE(16, 8) SMOOTH_NXM_WIDE(16, 16) SMOOTH_NXM_WIDE(16, 32) SMOOTH_NXM_WIDE(32, 16) SMOOTH_NXM_WIDE(32, 32) SMOOTH_NXM_WIDE(32, 64) SMOOTH_NXM_WIDE(64, 32) SMOOTH_NXM_WIDE(64, 64) #undef SMOOTH_NXM_WIDE // ----------------------------------------------------------------------------- // SMOOTH_V_PRED // For widths 4 and 8. 
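// SMOOTH_V drops the horizontal term of the SMOOTH predictor: each output is
// Round2(w_y * top[c] + (256 - w_y) * bottom_left, SMOOTH_WEIGHT_LOG2_SCALE),
// so only the per-row weight weights_y[y] is needed.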
#define SMOOTH_V_PREDICTOR(W) \ static void smooth_v_##W##xh_neon( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ const uint8_t *const left_column, const int height) { \ const uint8_t bottom_left = left_column[height - 1]; \ const uint8_t *const weights_y = smooth_weights + height - 4; \ \ uint8x8_t top_v; \ if ((W) == 4) { \ top_v = load_u8_4x1(top_row); \ } else { /* width == 8 */ \ top_v = vld1_u8(top_row); \ } \ \ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ \ assert(height > 0); \ int y = 0; \ do { \ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ \ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \ const uint16x8_t weighted_top_bl = \ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \ const uint8x8_t pred = \ vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \ \ if ((W) == 4) { \ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \ } else { /* width == 8 */ \ vst1_u8(dst, pred); \ } \ dst += stride; \ } while (++y != height); \ } SMOOTH_V_PREDICTOR(4) SMOOTH_V_PREDICTOR(8) #undef SMOOTH_V_PREDICTOR #define SMOOTH_V_NXM(W, H) \ void aom_smooth_v_predictor_##W##x##H##_neon( \ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ const uint8_t *left) { \ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_V_NXM(4, 16) SMOOTH_V_NXM(8, 32) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_V_NXM(4, 4) SMOOTH_V_NXM(4, 8) SMOOTH_V_NXM(8, 4) SMOOTH_V_NXM(8, 8) SMOOTH_V_NXM(8, 16) #undef SMOOTH_V_NXM static inline uint8x16_t calculate_vertical_weights_and_pred( const uint8x16_t top, const uint8x8_t weights_y, const uint16x8_t weighted_bl) { const uint16x8_t pred_low = vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); const uint16x8_t pred_high = vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); const uint8x8_t pred_scaled_low = vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE); const uint8x8_t pred_scaled_high = vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE); return vcombine_u8(pred_scaled_low, pred_scaled_high); } // For width 16 and above. 
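// The wide variant computes the bottom-left contribution once per row and
// reuses it for every 16-pixel chunk via calculate_vertical_weights_and_pred.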
#define SMOOTH_V_PREDICTOR(W) \ static void smooth_v_##W##xh_neon( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ const uint8_t *const left_column, const int height) { \ const uint8_t bottom_left = left_column[height - 1]; \ const uint8_t *const weights_y = smooth_weights + height - 4; \ \ uint8x16_t top_v[4]; \ top_v[0] = vld1q_u8(top_row); \ if ((W) > 16) { \ top_v[1] = vld1q_u8(top_row + 16); \ if ((W) == 64) { \ top_v[2] = vld1q_u8(top_row + 32); \ top_v[3] = vld1q_u8(top_row + 48); \ } \ } \ \ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ \ assert(height > 0); \ int y = 0; \ do { \ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ const uint16x8_t weighted_bl = \ vmull_u8(scaled_weights_y, bottom_left_v); \ \ const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \ top_v[0], weights_y_v, weighted_bl); \ vst1q_u8(dst, pred_0); \ \ if ((W) > 16) { \ const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \ top_v[1], weights_y_v, weighted_bl); \ vst1q_u8(dst + 16, pred_1); \ \ if ((W) == 64) { \ const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \ top_v[2], weights_y_v, weighted_bl); \ vst1q_u8(dst + 32, pred_2); \ \ const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \ top_v[3], weights_y_v, weighted_bl); \ vst1q_u8(dst + 48, pred_3); \ } \ } \ \ dst += stride; \ } while (++y != height); \ } SMOOTH_V_PREDICTOR(16) SMOOTH_V_PREDICTOR(32) SMOOTH_V_PREDICTOR(64) #undef SMOOTH_V_PREDICTOR #define SMOOTH_V_NXM_WIDE(W, H) \ void aom_smooth_v_predictor_##W##x##H##_neon( \ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ const uint8_t *left) { \ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_V_NXM_WIDE(16, 4) SMOOTH_V_NXM_WIDE(32, 8) SMOOTH_V_NXM_WIDE(64, 16) SMOOTH_V_NXM_WIDE(16, 64) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_V_NXM_WIDE(16, 8) SMOOTH_V_NXM_WIDE(16, 16) SMOOTH_V_NXM_WIDE(16, 32) SMOOTH_V_NXM_WIDE(32, 16) SMOOTH_V_NXM_WIDE(32, 32) SMOOTH_V_NXM_WIDE(32, 64) SMOOTH_V_NXM_WIDE(64, 32) SMOOTH_V_NXM_WIDE(64, 64) #undef SMOOTH_V_NXM_WIDE // ----------------------------------------------------------------------------- // SMOOTH_H_PRED // For widths 4 and 8. #define SMOOTH_H_PREDICTOR(W) \ static void smooth_h_##W##xh_neon( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ const uint8_t *const left_column, const int height) { \ const uint8_t top_right = top_row[(W)-1]; \ \ const uint8x8_t top_right_v = vdup_n_u8(top_right); \ /* Over-reads for 4xN but still within the array. 
*/ \ const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \ const uint8x8_t scaled_weights_x = negate_s8(weights_x); \ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \ \ assert(height > 0); \ int y = 0; \ do { \ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ const uint16x8_t weighted_left_tr = \ vmlal_u8(weighted_tr, weights_x, left_v); \ const uint8x8_t pred = \ vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \ \ if ((W) == 4) { \ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \ } else { /* width == 8 */ \ vst1_u8(dst, pred); \ } \ dst += stride; \ } while (++y != height); \ } SMOOTH_H_PREDICTOR(4) SMOOTH_H_PREDICTOR(8) #undef SMOOTH_H_PREDICTOR #define SMOOTH_H_NXM(W, H) \ void aom_smooth_h_predictor_##W##x##H##_neon( \ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ const uint8_t *left) { \ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_H_NXM(4, 16) SMOOTH_H_NXM(8, 32) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_H_NXM(4, 4) SMOOTH_H_NXM(4, 8) SMOOTH_H_NXM(8, 4) SMOOTH_H_NXM(8, 8) SMOOTH_H_NXM(8, 16) #undef SMOOTH_H_NXM static inline uint8x16_t calculate_horizontal_weights_and_pred( const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, const uint8x16_t scaled_weights_x) { const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); const uint16x8_t weighted_left_tr_low = vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_low = vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE); const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); const uint16x8_t weighted_left_tr_high = vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); const uint8x8_t pred_scaled_high = vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE); return vcombine_u8(pred_scaled_low, pred_scaled_high); } // For width 16 and above. 
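// SMOOTH_H keeps only the horizontal term: each output is
// Round2(w_x * left[y] + (256 - w_x) * top_right, SMOOTH_WEIGHT_LOG2_SCALE).
// The wide variant below pre-negates the weight vectors once per block and
// then produces each row 16 pixels at a time.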
#define SMOOTH_H_PREDICTOR(W) \ static void smooth_h_##W##xh_neon( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ const uint8_t *const left_column, const int height) { \ const uint8_t top_right = top_row[(W)-1]; \ \ const uint8x8_t top_right_v = vdup_n_u8(top_right); \ \ uint8x16_t weights_x[4]; \ weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \ if ((W) > 16) { \ weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \ if ((W) == 64) { \ weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \ weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \ } \ } \ \ uint8x16_t scaled_weights_x[4]; \ scaled_weights_x[0] = negate_s8q(weights_x[0]); \ if ((W) > 16) { \ scaled_weights_x[1] = negate_s8q(weights_x[1]); \ if ((W) == 64) { \ scaled_weights_x[2] = negate_s8q(weights_x[2]); \ scaled_weights_x[3] = negate_s8q(weights_x[3]); \ } \ } \ \ assert(height > 0); \ int y = 0; \ do { \ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ \ const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \ left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \ vst1q_u8(dst, pred_0); \ \ if ((W) > 16) { \ const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \ left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \ vst1q_u8(dst + 16, pred_1); \ \ if ((W) == 64) { \ const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \ left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \ vst1q_u8(dst + 32, pred_2); \ \ const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \ left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \ vst1q_u8(dst + 48, pred_3); \ } \ } \ dst += stride; \ } while (++y != height); \ } SMOOTH_H_PREDICTOR(16) SMOOTH_H_PREDICTOR(32) SMOOTH_H_PREDICTOR(64) #undef SMOOTH_H_PREDICTOR #define SMOOTH_H_NXM_WIDE(W, H) \ void aom_smooth_h_predictor_##W##x##H##_neon( \ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ const uint8_t *left) { \ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_H_NXM_WIDE(16, 4) SMOOTH_H_NXM_WIDE(16, 64) SMOOTH_H_NXM_WIDE(32, 8) SMOOTH_H_NXM_WIDE(64, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER SMOOTH_H_NXM_WIDE(16, 8) SMOOTH_H_NXM_WIDE(16, 16) SMOOTH_H_NXM_WIDE(16, 32) SMOOTH_H_NXM_WIDE(32, 16) SMOOTH_H_NXM_WIDE(32, 32) SMOOTH_H_NXM_WIDE(32, 64) SMOOTH_H_NXM_WIDE(64, 32) SMOOTH_H_NXM_WIDE(64, 64) #undef SMOOTH_H_NXM_WIDE // ----------------------------------------------------------------------------- // PAETH static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride, const uint8_t *const top_row, const uint8_t *const left_column, int width, int height) { const uint8x8_t top_left = vdup_n_u8(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint8x8_t top; if (width == 4) { top = load_u8_4x1(top_row); } else { // width == 8 top = vld1_u8(top_row); } assert(height > 0); int y = 0; do { const uint8x8_t left = vdup_n_u8(left_column[y]); const uint8x8_t left_dist = vabd_u8(top, top_left); const uint8x8_t top_dist = vabd_u8(left, top_left); const uint16x8_t top_left_dist = vabdq_u16(vaddl_u8(top, left), top_left_x2); const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist); const uint8x8_t left_le_top_left = vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist)); const uint8x8_t top_le_top_left = vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist)); // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint8x8_t left_mask = vand_u8(left_le_top, 
left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. uint8x8_t result = vbsl_u8(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left); // else // dest[x] = top_left; result = vbsl_u8(left_or_top_mask, result, top_left); if (width == 4) { store_u8_4x1(dest, result); } else { // width == 8 vst1_u8(dest, result); } dest += stride; } while (++y != height); } #define PAETH_NXM(W, H) \ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *above, \ const uint8_t *left) { \ paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER PAETH_NXM(4, 16) PAETH_NXM(8, 32) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER PAETH_NXM(4, 4) PAETH_NXM(4, 8) PAETH_NXM(8, 4) PAETH_NXM(8, 8) PAETH_NXM(8, 16) // Calculate X distance <= TopLeft distance and pack the resulting mask into // uint8x8_t. static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist, const uint16x8_t top_left_dist_low, const uint16x8_t top_left_dist_high) { const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low), vqmovn_u16(top_left_dist_high)); return vcleq_u8(x_dist, top_left_dist); } // Select the closest values and collect them. static inline uint8x16_t select_paeth(const uint8x16_t top, const uint8x16_t left, const uint8x16_t top_left, const uint8x16_t left_le_top, const uint8x16_t left_le_top_left, const uint8x16_t top_le_top_left) { // if (left_dist <= top_dist && left_dist <= top_left_dist) const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left); // dest[x] = left_column[y]; // Fill all the unused spaces with 'top'. They will be overwritten when // the positions for top_left are known. uint8x16_t result = vbslq_u8(left_mask, left, top); // else if (top_dist <= top_left_dist) // dest[x] = top_row[x]; // Add these values to the mask. They were already set. const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left); // else // dest[x] = top_left; return vbslq_u8(left_or_top_mask, result, top_left); } // Generate numbered and high/low versions of top_left_dist. #define TOP_LEFT_DIST(num) \ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2) // Generate numbered versions of XLeTopLeft with x = left. #define LEFT_LE_TOP_LEFT(num) \ const uint8x16_t left_le_top_left_##num = \ x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \ top_left_##num##_dist_high) // Generate numbered versions of XLeTopLeft with x = top. 
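// Together these macros expand the distance terms consumed by select_paeth().
// The underlying Paeth rule: with base = top + left - top_left, predict with
// whichever of left, top and top_left is closest to base (ties prefer left,
// then top). The distances reduce to
//   |base - left|     == |top - top_left|
//   |base - top|      == |left - top_left|
//   |base - top_left| == |top + left - 2 * top_left|
// which is what the vabd/vabdq comparisons below compute.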
#define TOP_LE_TOP_LEFT(num) \ const uint8x16_t top_le_top_left_##num = x_le_top_left( \ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high) static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride, const uint8_t *const top_row, const uint8_t *const left_column, int width, int height) { const uint8x16_t top_left = vdupq_n_u8(top_row[-1]); const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); uint8x16_t top[4]; top[0] = vld1q_u8(top_row); if (width > 16) { top[1] = vld1q_u8(top_row + 16); if (width == 64) { top[2] = vld1q_u8(top_row + 32); top[3] = vld1q_u8(top_row + 48); } } assert(height > 0); int y = 0; do { const uint8x16_t left = vdupq_n_u8(left_column[y]); const uint8x16_t top_dist = vabdq_u8(left, top_left); const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left); TOP_LEFT_DIST(0); const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist); LEFT_LE_TOP_LEFT(0); TOP_LE_TOP_LEFT(0); const uint8x16_t result_0 = select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0, top_le_top_left_0); vst1q_u8(dest, result_0); if (width > 16) { const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left); TOP_LEFT_DIST(1); const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist); LEFT_LE_TOP_LEFT(1); TOP_LE_TOP_LEFT(1); const uint8x16_t result_1 = select_paeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1, top_le_top_left_1); vst1q_u8(dest + 16, result_1); if (width == 64) { const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left); TOP_LEFT_DIST(2); const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist); LEFT_LE_TOP_LEFT(2); TOP_LE_TOP_LEFT(2); const uint8x16_t result_2 = select_paeth(top[2], left, top_left, left_2_le_top, left_le_top_left_2, top_le_top_left_2); vst1q_u8(dest + 32, result_2); const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left); TOP_LEFT_DIST(3); const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist); LEFT_LE_TOP_LEFT(3); TOP_LE_TOP_LEFT(3); const uint8x16_t result_3 = select_paeth(top[3], left, top_left, left_3_le_top, left_le_top_left_3, top_le_top_left_3); vst1q_u8(dest + 48, result_3); } } dest += stride; } while (++y != height); } #define PAETH_NXM_WIDE(W, H) \ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \ const uint8_t *above, \ const uint8_t *left) { \ paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \ } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER PAETH_NXM_WIDE(16, 4) PAETH_NXM_WIDE(16, 64) PAETH_NXM_WIDE(32, 8) PAETH_NXM_WIDE(64, 16) #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER PAETH_NXM_WIDE(16, 8) PAETH_NXM_WIDE(16, 16) PAETH_NXM_WIDE(16, 32) PAETH_NXM_WIDE(32, 16) PAETH_NXM_WIDE(32, 32) PAETH_NXM_WIDE(32, 64) PAETH_NXM_WIDE(64, 32) PAETH_NXM_WIDE(64, 64) aom-3.12.1/aom_dsp/arm/loopfilter_neon.c000066400000000000000000001156231477627663500201540ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" static inline uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0, const uint8_t blimit, const uint8_t limit) { // Calculate mask values for four samples uint32x2x2_t p0q0_p1q1; uint16x8_t temp_16x8; uint16x4_t temp0_16x4, temp1_16x4; uint8x8_t mask_8x8, temp_8x8; const uint8x8_t limit_8x8 = vdup_n_u8(limit); const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); mask_8x8 = vabd_u8(p3q3, p2q2); mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); mask_8x8 = vcle_u8(mask_8x8, limit_8x8); temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); mask_8x8 = vand_u8(mask_8x8, temp_8x8); p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), vreinterpret_u8_u32(p0q0_p1q1.val[1])); temp_16x8 = vmovl_u8(temp_8x8); temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); mask_8x8 = vand_u8(mask_8x8, temp_8x8); return mask_8x8; } static inline uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, const uint8_t blimit, const uint8_t limit) { uint32x2x2_t p0q0_p1q1; uint16x8_t temp_16x8; uint16x4_t temp0_16x4, temp1_16x4; const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); const uint8x8_t limit_8x8 = vdup_n_u8(limit); uint8x8_t mask_8x8, temp_8x8; mask_8x8 = vabd_u8(p1q1, p0q0); mask_8x8 = vcle_u8(mask_8x8, limit_8x8); temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); mask_8x8 = vand_u8(mask_8x8, temp_8x8); p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), vreinterpret_u8_u32(p0q0_p1q1.val[1])); temp_16x8 = vmovl_u8(temp_8x8); temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); mask_8x8 = vand_u8(mask_8x8, temp_8x8); return mask_8x8; } static inline uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0) { const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 uint8x8_t flat_8x8, temp_8x8; flat_8x8 = vabd_u8(p1q1, p0q0); flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); flat_8x8 = vand_u8(flat_8x8, temp_8x8); return flat_8x8; } static inline uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0) { const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 uint8x8_t flat_8x8, temp_8x8; flat_8x8 = vabd_u8(p1q1, p0q0); flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); flat_8x8 = vand_u8(flat_8x8, temp_8x8); return flat_8x8; } static inline uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0, const uint8_t blimit, const uint8_t limit) { // 
Calculate mask3 values for four samples uint32x2x2_t p0q0_p1q1; uint16x8_t temp_16x8; uint16x4_t temp0_16x4, temp1_16x4; uint8x8_t mask_8x8, temp_8x8; const uint8x8_t limit_8x8 = vdup_n_u8(limit); const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); mask_8x8 = vabd_u8(p2q2, p1q1); mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); mask_8x8 = vcle_u8(mask_8x8, limit_8x8); temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); mask_8x8 = vand_u8(mask_8x8, temp_8x8); p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), vreinterpret_u8_u32(p0q0_p1q1.val[1])); temp_16x8 = vmovl_u8(temp_8x8); temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); mask_8x8 = vand_u8(mask_8x8, temp_8x8); return mask_8x8; } static inline void filter4(const uint8x8_t p0q0, const uint8x8_t p1q1, uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, uint8x8_t mask_8x8, const uint8_t thresh) { const uint8x8_t thresh_f4 = vdup_n_u8(thresh); const int8x8_t sign_mask = vdup_n_s8(0x80); const int8x8_t val_4 = vdup_n_s8(4); const int8x8_t val_3 = vdup_n_s8(3); int8x8_t pq_s0 = veor_s8(vreinterpret_s8_u8(p0q0), sign_mask); int8x8_t pq_s1 = veor_s8(vreinterpret_s8_u8(p1q1), sign_mask); int32x2x2_t ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); int32x2x2_t ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); int8x8_t ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); int8x8_t qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); int8x8_t ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); int8x8_t qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); // hev_mask uint8x8_t temp0_8x8 = vcgt_u8(vabd_u8(p0q0, p1q1), thresh_f4); uint8x8_t temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); int8x8_t hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); // add outer taps if we have high edge variance int8x8_t filter_s8 = vqsub_s8(ps1_s8, qs1_s8); filter_s8 = vand_s8(filter_s8, hev_8x8); // inner taps int8x8_t temp_s8 = vqsub_s8(qs0_s8, ps0_s8); int16x8_t filter_s16 = vmovl_s8(filter_s8); filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); filter_s8 = vqmovn_s16(filter_s16); filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); int8x8_t filter1_s8 = vqadd_s8(filter_s8, val_4); int8x8_t filter2_s8 = vqadd_s8(filter_s8, val_3); filter1_s8 = vshr_n_s8(filter1_s8, 3); filter2_s8 = vshr_n_s8(filter2_s8, 3); int8x8_t oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); int8x8_t op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); filter_s8 = vrshr_n_s8(filter1_s8, 1); filter_s8 = vbic_s8(filter_s8, hev_8x8); int8x8_t oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); int8x8_t op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); *p0q0_output = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); *p1q1_output = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); } static inline void filter8(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8_t p2q2, const uint8x8_t p3q3, uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, uint8x8_t *p2q2_output) { // Reverse p and q. 
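  // The reversed copies let p- and q-side taps be summed on the combined
  // p|q double vectors. Each output is the AV1 7-tap smoothing filter with a
  // rounding shift by 3, e.g. on the p side:
  //   op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
  //   op1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
  //   op0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
  // with the q side mirrored; both halves are produced together below.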
uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); uint16x8_t p2q2_p3q3 = vaddl_u8(p3q3, p2q2); uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); uint16x8_t q0p0_p3q3 = vaddl_u8(q0p0, p3q3); uint16x8_t out_q0p0_p3q3 = vaddq_u16(out, q0p0_p3q3); uint16x8_t out_pq2 = vaddq_u16(out_q0p0_p3q3, p2q2_p3q3); uint16x8_t p1q1_q1p1 = vaddl_u8(p1q1, q1p1); uint16x8_t out_pq1 = vaddq_u16(out_q0p0_p3q3, p1q1_q1p1); uint16x8_t q0p0_p0q0 = vaddl_u8(q0p0, p0q0); uint16x8_t q1p1_q2p2 = vaddl_u8(q1p1, q2p2); uint16x8_t out_pq0 = vaddq_u16(q0p0_p0q0, q1p1_q2p2); out_pq0 = vaddq_u16(out_pq0, out); *p0q0_output = vrshrn_n_u16(out_pq0, 3); *p1q1_output = vrshrn_n_u16(out_pq1, 3); *p2q2_output = vrshrn_n_u16(out_pq2, 3); } static inline void filter14(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8_t p2q2, const uint8x8_t p3q3, const uint8x8_t p4q4, const uint8x8_t p5q5, const uint8x8_t p6q6, uint8x8_t *p0q0_output, uint8x8_t *p1q1_output, uint8x8_t *p2q2_output, uint8x8_t *p3q3_output, uint8x8_t *p4q4_output, uint8x8_t *p5q5_output) { // Reverse p and q. uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); uint8x8_t q1p1 = vext_u8(p1q1, p1q1, 4); uint8x8_t q2p2 = vext_u8(p2q2, p2q2, 4); uint8x8_t q3p3 = vext_u8(p3q3, p3q3, 4); uint8x8_t q4p4 = vext_u8(p4q4, p4q4, 4); uint8x8_t q5p5 = vext_u8(p5q5, p5q5, 4); uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); uint16x8_t p2q2_p3q3 = vaddl_u8(p2q2, p3q3); uint16x8_t out = vaddq_u16(p0q0_p1q1, p2q2_p3q3); uint16x8_t q0p0_p4q4 = vaddl_u8(q0p0, p4q4); uint16x8_t p5q5_p6q6 = vaddl_u8(p5q5, p6q6); uint16x8_t tmp = vaddq_u16(q0p0_p4q4, p5q5_p6q6); // This offset removes the need for a rounding shift at the end. uint16x8_t tmp_offset = vaddq_u16(tmp, vdupq_n_u16(1 << 3)); out = vaddq_u16(out, tmp_offset); uint16x8_t out_pq5 = vaddw_u8(out, p4q4); uint16x8_t out_pq4 = vaddw_u8(out_pq5, p3q3); uint16x8_t out_pq3 = vaddw_u8(out_pq4, p2q2); out_pq5 = vaddw_u8(out_pq5, p5q5); uint16x8_t out_pq0 = vaddw_u8(out, p1q1); uint16x8_t out_pq1 = vaddw_u8(out_pq0, p2q2); uint16x8_t out_pq2 = vaddw_u8(out_pq1, p3q3); uint16x8_t p0q0_q0p0 = vaddl_u8(p0q0, q0p0); out_pq0 = vaddq_u16(out_pq0, p0q0_q0p0); uint16x8_t p0q0_p6q6 = vaddl_u8(p0q0, p6q6); out_pq1 = vaddq_u16(out_pq1, p0q0_p6q6); uint16x8_t p5q5_q1p1 = vaddl_u8(p5q5, q1p1); out_pq4 = vaddq_u16(out_pq4, p5q5_q1p1); uint16x8_t p6q6_p6q6 = vaddl_u8(p6q6, p6q6); out_pq2 = vaddq_u16(out_pq2, p6q6_p6q6); uint16x8_t p6q6_temp = vaddw_u8(p6q6_p6q6, p6q6); out_pq3 = vaddq_u16(out_pq3, p6q6_temp); p6q6_temp = vaddw_u8(p6q6_temp, p6q6); out_pq4 = vaddq_u16(out_pq4, p6q6_temp); p6q6_temp = vaddq_u16(p6q6_temp, p6q6_p6q6); out_pq5 = vaddq_u16(out_pq5, p6q6_temp); uint16x8_t qp_sum = vaddl_u8(q2p2, q1p1); out_pq3 = vaddq_u16(out_pq3, qp_sum); qp_sum = vaddw_u8(qp_sum, q3p3); out_pq2 = vaddq_u16(out_pq2, qp_sum); qp_sum = vaddw_u8(qp_sum, q4p4); out_pq1 = vaddq_u16(out_pq1, qp_sum); qp_sum = vaddw_u8(qp_sum, q5p5); out_pq0 = vaddq_u16(out_pq0, qp_sum); *p0q0_output = vshrn_n_u16(out_pq0, 4); *p1q1_output = vshrn_n_u16(out_pq1, 4); *p2q2_output = vshrn_n_u16(out_pq2, 4); *p3q3_output = vshrn_n_u16(out_pq3, 4); *p4q4_output = vshrn_n_u16(out_pq4, 4); *p5q5_output = vshrn_n_u16(out_pq5, 4); } static inline void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, const uint8_t limit, const uint8_t thresh) { uint8x8_t out_f14_pq0, out_f14_pq1, 
out_f14_pq2, out_f14_pq3, out_f14_pq4, out_f14_pq5; uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; uint8x8_t out_f4_pq0, out_f4_pq1; // Calculate filter masks. uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); uint8x8_t flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); // No filtering. if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { return; } uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); uint8x8_t filter4_cond = vmvn_u8(filter8_cond); uint8x8_t filter14_cond = vand_u8(filter8_cond, flat2_8x8); if (vget_lane_s64(vreinterpret_s64_u8(filter14_cond), 0) == -1) { // Only filter14() applies. filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, &out_f14_pq5); *p0q0 = out_f14_pq0; *p1q1 = out_f14_pq1; *p2q2 = out_f14_pq2; *p3q3 = out_f14_pq3; *p4q4 = out_f14_pq4; *p5q5 = out_f14_pq5; } else if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { // Only filter8() applies. filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); *p0q0 = out_f7_pq0; *p1q1 = out_f7_pq1; *p2q2 = out_f7_pq2; } else { filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0 && vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { // filter8() and filter14() do not apply, but filter4() applies to one or // more values. *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); } else { filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); if (vget_lane_u64(vreinterpret_u64_u8(filter14_cond), 0) == 0) { // filter14() does not apply, but filter8() and filter4() apply to one // or more values. filter4 outputs *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); // filter8 outputs *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); } else { // All filters may contribute values to final outputs. filter14(*p0q0, *p1q1, *p2q2, *p3q3, *p4q4, *p5q5, *p6q6, &out_f14_pq0, &out_f14_pq1, &out_f14_pq2, &out_f14_pq3, &out_f14_pq4, &out_f14_pq5); // filter4 outputs *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); // filter8 outputs *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); // filter14 outputs *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); } } } } static inline void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, const uint8_t limit, const uint8_t thresh) { uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; uint8x8_t out_f4_pq0, out_f4_pq1; // Calculate filter masks. uint8x8_t mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); uint8x8_t flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); // No filtering. 
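  // mask_8x8 holds the per-row edge test against limit/blimit, and flat_8x8
  // the flatness test that selects the wide filter8() over filter4(); if no
  // row passes the edge test the pixels are left untouched.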
if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { return; } uint8x8_t filter8_cond = vand_u8(flat_8x8, mask_8x8); uint8x8_t filter4_cond = vmvn_u8(filter8_cond); // Not needing filter4() at all is a very common case, so isolate it to avoid // needlessly computing filter4(). if (vget_lane_s64(vreinterpret_s64_u8(filter8_cond), 0) == -1) { filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); *p0q0 = out_f7_pq0; *p1q1 = out_f7_pq1; *p2q2 = out_f7_pq2; } else { filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); if (vget_lane_u64(vreinterpret_u64_u8(filter8_cond), 0) == 0) { // filter8() does not apply, but filter4() applies to one or more values. *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); } else { filter8(*p0q0, *p1q1, *p2q2, *p3q3, &out_f7_pq0, &out_f7_pq1, &out_f7_pq2); // filter4 outputs *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); // filter8 outputs *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); } } } static inline void filter6(const uint8x8_t p0q0, const uint8x8_t p1q1, const uint8x8_t p2q2, uint8x8_t *p0q0_output, uint8x8_t *p1q1_output) { uint8x8_t q0p0 = vext_u8(p0q0, p0q0, 4); uint16x8_t p0q0_p1q1 = vaddl_u8(p0q0, p1q1); uint16x8_t out = vaddq_u16(p0q0_p1q1, p0q0_p1q1); uint16x8_t q0p0_p2q2 = vaddl_u8(q0p0, p2q2); out = vaddq_u16(out, q0p0_p2q2); uint16x8_t q0p0_q1p1 = vextq_u16(p0q0_p1q1, p0q0_p1q1, 4); uint16x8_t out_pq0 = vaddq_u16(out, q0p0_q1p1); uint16x8_t p2q2_p2q2 = vaddl_u8(p2q2, p2q2); uint16x8_t out_pq1 = vaddq_u16(out, p2q2_p2q2); *p0q0_output = vrshrn_n_u16(out_pq0, 3); *p1q1_output = vrshrn_n_u16(out_pq1, 3); } static inline void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, const uint8_t limit, const uint8_t thresh) { uint8x8_t out_f6_pq0, out_f6_pq1; uint8x8_t out_f4_pq0, out_f4_pq1; // Calculate filter masks. uint8x8_t mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); uint8x8_t flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); // No filtering. if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { return; } uint8x8_t filter6_cond = vand_u8(flat_8x8, mask_8x8); uint8x8_t filter4_cond = vmvn_u8(filter6_cond); // Not needing filter4 at all is a very common case, so isolate it to avoid // needlessly computing filter4. if (vget_lane_s64(vreinterpret_s64_u8(filter6_cond), 0) == -1) { filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); *p0q0 = out_f6_pq0; *p1q1 = out_f6_pq1; } else { filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); if (vget_lane_u64(vreinterpret_u64_u8(filter6_cond), 0) == 0) { // filter6 does not apply, but filter4 applies to one or more values. *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); } else { // All filters may contribute to the final output. 
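      // filter6() results are selected for rows whose flat mask is set and
      // filter4() results for the remaining rows; rows that fail the edge
      // mask pass through unchanged because filter4() zeroes its adjustment
      // under the mask.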
filter6(*p0q0, *p1q1, *p2q2, &out_f6_pq0, &out_f6_pq1); // filter4 outputs *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); // filter6 outputs *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); } } } static inline void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, const uint8_t limit, const uint8_t thresh) { uint8x8_t out_f4_pq0, out_f4_pq1; // Calculate filter mask uint8x8_t mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); // No filtering. if (vget_lane_u64(vreinterpret_u64_u8(mask_8x8), 0) == 0) { return; } filter4(*p0q0, *p1q1, &out_f4_pq0, &out_f4_pq1, mask_8x8, thresh); *p0q0 = out_f4_pq0; *p1q1 = out_f4_pq1; } void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x16_t row0, row1, row2, row3; uint8x8_t pxp3, p6p2, p5p1, p4p0; uint8x8_t q0q4, q1q5, q2q6, q3qy; uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; uint32x2_t pq_rev; uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y load_u8_16x4(src - 8, stride, &row0, &row1, &row2, &row3); pxp3 = vget_low_u8(row0); p6p2 = vget_low_u8(row1); p5p1 = vget_low_u8(row2); p4p0 = vget_low_u8(row3); transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); q0q4 = vget_high_u8(row0); q1q5 = vget_high_u8(row1); q2q6 = vget_high_u8(row2); q3qy = vget_high_u8(row3); transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); row0 = 
vcombine_u8(pxp3, q0q4); row1 = vcombine_u8(p6p2, q1q5); row2 = vcombine_u8(p5p1, q2q6); row3 = vcombine_u8(p4p0, q3qy); store_u8_16x4(src - 8, stride, row0, row1, row2, row3); } void aom_lpf_vertical_14_dual_neon( uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_vertical_14_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint32x2x2_t p2q2_p1q1, p3q3_p0q0; uint32x2_t pq_rev; uint8x8_t p3q0, p2q1, p1q2, p0q3; uint8x8_t p0q0, p1q1, p2q2, p3q3; // row0: p3 p2 p1 p0 | q0 q1 q2 q3 // row1: p3 p2 p1 p0 | q0 q1 q2 q3 // row2: p3 p2 p1 p0 | q0 q1 q2 q3 // row3: p3 p2 p1 p0 | q0 q1 q2 q3 load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); } void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_vertical_8_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint32x2x2_t p2q2_p1q1, pxqy_p0q0; uint32x2_t pq_rev; uint8x8_t pxq0, p2q1, p1q2, p0qy; uint8x8_t p0q0, p1q1, p2q2, pxqy; // row0: px p2 p1 p0 | q0 q1 q2 qy // row1: px p2 p1 p0 | q0 q1 q2 qy // row2: px p2 p1 p0 | q0 q1 q2 qy // row3: px p2 p1 p0 | q0 q1 q2 qy load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); pq_rev = 
vrev64_u32(vreinterpret_u32_u8(p0qy)); pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); } void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_vertical_6_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; uint32x2_t pq_rev; uint8x8_t p1p0, q0q1; uint8x8_t p0q0, p1q1; // row0: p1 p0 | q0 q1 // row1: p1 p0 | q0 q1 // row2: p1 p0 | q0 q1 // row3: p1 p0 | q0 q1 load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1); transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); pq_rev = vrev64_u32(p1q0_p0q1.val[1]); p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); store_u8x4_strided_x2(src - 2, 2 * stride, p1p0); store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1); } void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_vertical_4_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p6q6 = load_u8_4x2(src 
- 7 * stride, 13 * stride); uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride); uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride); uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride); uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride); uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride); uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride); lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2); store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3); store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4); store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5); } void aom_lpf_horizontal_14_dual_neon( uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1); } // TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed // up. void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_horizontal_14_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p0q0, p1q1, p2q2, p3q3; p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1)); p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1)); p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1)); p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1)); lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); } void aom_lpf_horizontal_8_dual_neon( uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1); } // TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better 
speed // up. void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_horizontal_8_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p0q0, p1q1, p2q2; p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1)); p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1)); p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1)); lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); } void aom_lpf_horizontal_6_dual_neon( uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1); } // TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed // up. void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_horizontal_6_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, blimit, limit, thresh); } void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride); uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride); lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); } void aom_lpf_horizontal_4_dual_neon( uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0); aom_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1); } // TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed // up. 
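// The _quad path filters a 16-pixel-wide edge by running the 8-pixel _dual
// filter twice, 2 * MI_SIZE apart (8 columns, with AV1's 4x4 mode-info units).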
void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, thresh); aom_lpf_horizontal_4_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, blimit, limit, thresh); } aom-3.12.1/aom_dsp/arm/masked_sad_neon.c000066400000000000000000000221031477627663500200560ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/blend.h" static inline uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, const uint8_t *src, const uint8_t *a, const uint8_t *b, const uint8_t *m) { uint8x16_t m0 = vld1q_u8(m); uint8x16_t a0 = vld1q_u8(a); uint8x16_t b0 = vld1q_u8(b); uint8x16_t s0 = vld1q_u8(src); uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0); return vpadalq_u8(sad, vabdq_u8(blend_u8, s0)); } static inline unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // Eight accumulator vectors are required to avoid overflow in the 128x128 // case. assert(height <= 128); uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; do { sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]); sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]); sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]); sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]); sad[4] = masked_sad_16x1_neon(sad[4], &src[64], &a[64], &b[64], &m[64]); sad[5] = masked_sad_16x1_neon(sad[5], &src[80], &a[80], &b[80], &m[80]); sad[6] = masked_sad_16x1_neon(sad[6], &src[96], &a[96], &b[96], &m[96]); sad[7] = masked_sad_16x1_neon(sad[7], &src[112], &a[112], &b[112], &m[112]); src += src_stride; a += a_stride; b += b_stride; m += m_stride; height--; } while (height != 0); return horizontal_long_add_u16x8(sad[0], sad[1]) + horizontal_long_add_u16x8(sad[2], sad[3]) + horizontal_long_add_u16x8(sad[4], sad[5]) + horizontal_long_add_u16x8(sad[6], sad[7]); } static inline unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // Four accumulator vectors are required to avoid overflow in the 64x128 case. 
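// Worked bound: each masked_sad_16x1_neon call adds at most 2 * 255 = 510 to a
// uint16 lane (vpadalq_u8 pairwise-adds two 8-bit absolute differences). Each
// accumulator handles one 16-pixel block per row, so after 128 rows a lane
// holds at most 128 * 510 = 65280, which still fits in a uint16.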
assert(height <= 128); uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; do { sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]); sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]); sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]); sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]); src += src_stride; a += a_stride; b += b_stride; m += m_stride; height--; } while (height != 0); return horizontal_long_add_u16x8(sad[0], sad[1]) + horizontal_long_add_u16x8(sad[2], sad[3]); } static inline unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // We could use a single accumulator up to height=64 without overflow. assert(height <= 64); uint16x8_t sad = vdupq_n_u16(0); do { sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]); sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]); src += src_stride; a += a_stride; b += b_stride; m += m_stride; height--; } while (height != 0); return horizontal_add_u16x8(sad); } static inline unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // We could use a single accumulator up to height=128 without overflow. assert(height <= 128); uint16x8_t sad = vdupq_n_u16(0); do { sad = masked_sad_16x1_neon(sad, src, a, b, m); src += src_stride; a += a_stride; b += b_stride; m += m_stride; height--; } while (height != 0); return horizontal_add_u16x8(sad); } static inline unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // We could use a single accumulator up to height=128 without overflow. assert(height <= 128); uint16x4_t sad = vdup_n_u16(0); do { uint8x8_t m0 = vld1_u8(m); uint8x8_t a0 = vld1_u8(a); uint8x8_t b0 = vld1_u8(b); uint8x8_t s0 = vld1_u8(src); uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0); sad = vpadal_u8(sad, vabd_u8(blend_u8, s0)); src += src_stride; a += a_stride; b += b_stride; m += m_stride; height--; } while (height != 0); return horizontal_add_u16x4(sad); } static inline unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int height) { // Process two rows per loop iteration. assert(height % 2 == 0); // We could use a single accumulator up to height=256 without overflow. 
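// Two rows of 4 pixels add at most 2 * 255 = 510 per uint16 lane per
// iteration; 256 rows = 128 iterations, so the worst case is 65280 < 65535.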
assert(height <= 256); uint16x4_t sad = vdup_n_u16(0); do { uint8x8_t m0 = load_unaligned_u8(m, m_stride); uint8x8_t a0 = load_unaligned_u8(a, a_stride); uint8x8_t b0 = load_unaligned_u8(b, b_stride); uint8x8_t s0 = load_unaligned_u8(src, src_stride); uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0); sad = vpadal_u8(sad, vabd_u8(blend_u8, s0)); src += 2 * src_stride; a += 2 * a_stride; b += 2 * b_stride; m += 2 * m_stride; height -= 2; } while (height != 0); return horizontal_add_u16x4(sad); } #define MASKED_SAD_WXH_NEON(width, height) \ unsigned aom_masked_sad##width##x##height##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return masked_sad_##width##xh_neon(src, src_stride, ref, ref_stride, \ second_pred, width, msk, msk_stride, \ height); \ else \ return masked_sad_##width##xh_neon(src, src_stride, second_pred, width, \ ref, ref_stride, msk, msk_stride, \ height); \ } MASKED_SAD_WXH_NEON(4, 4) MASKED_SAD_WXH_NEON(4, 8) MASKED_SAD_WXH_NEON(8, 4) MASKED_SAD_WXH_NEON(8, 8) MASKED_SAD_WXH_NEON(8, 16) MASKED_SAD_WXH_NEON(16, 8) MASKED_SAD_WXH_NEON(16, 16) MASKED_SAD_WXH_NEON(16, 32) MASKED_SAD_WXH_NEON(32, 16) MASKED_SAD_WXH_NEON(32, 32) MASKED_SAD_WXH_NEON(32, 64) MASKED_SAD_WXH_NEON(64, 32) MASKED_SAD_WXH_NEON(64, 64) MASKED_SAD_WXH_NEON(64, 128) MASKED_SAD_WXH_NEON(128, 64) MASKED_SAD_WXH_NEON(128, 128) #if !CONFIG_REALTIME_ONLY MASKED_SAD_WXH_NEON(4, 16) MASKED_SAD_WXH_NEON(16, 4) MASKED_SAD_WXH_NEON(8, 32) MASKED_SAD_WXH_NEON(32, 8) MASKED_SAD_WXH_NEON(16, 64) MASKED_SAD_WXH_NEON(64, 16) #endif aom-3.12.1/aom_dsp/arm/mem_neon.h000066400000000000000000001366771477627663500165740ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_ #define AOM_AOM_DSP_ARM_MEM_NEON_H_ #include #include #include "aom_dsp/aom_dsp_common.h" // Support for xN Neon intrinsics is lacking in some compilers. #if defined(__arm__) || defined(_M_ARM) #define ARM_32_BIT #endif // DEFICIENT_CLANG_32_BIT includes clang-cl. #if defined(__clang__) && defined(ARM_32_BIT) && \ (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7)) #define DEFICIENT_CLANG_32_BIT // This includes clang-cl. 
#endif #if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT) #define GCC_32_BIT #endif #if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT) static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), vld1q_u8(ptr + 2 * 16) } }; return res; } static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; return res; } static inline uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) { uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } }; return res; } static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; return res; } static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; return res; } static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; return res; } static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { vst1_u8(ptr + 0 * 8, a.val[0]); vst1_u8(ptr + 1 * 8, a.val[1]); } static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { vst1_u8(ptr + 0 * 8, a.val[0]); vst1_u8(ptr + 1 * 8, a.val[1]); vst1_u8(ptr + 2 * 8, a.val[2]); vst1_u8(ptr + 3 * 8, a.val[3]); } static inline void vst1q_u16_x2(uint16_t *ptr, uint16x8x2_t a) { vst1q_u16(ptr + 0 * 8, a.val[0]); vst1q_u16(ptr + 1 * 8, a.val[1]); } static inline void vst1q_u16_x4(uint16_t *ptr, uint16x8x4_t a) { vst1q_u16(ptr + 0 * 8, a.val[0]); vst1q_u16(ptr + 1 * 8, a.val[1]); vst1q_u16(ptr + 2 * 8, a.val[2]); vst1q_u16(ptr + 3 * 8, a.val[3]); } #elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit. 
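// The guarded blocks below supply plain-load/store fallbacks for the xN
// intrinsics (vld1q_u8_x2, vld1q_u16_x4, vst1_u8_x2, ...) on GCC versions that
// lack them; the version checks appear to track when GCC gained each
// intrinsic.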
#if __GNUC__ < 8 static inline uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; return res; } static inline int16x8x2_t vld1q_s16_x2(const int16_t *ptr) { int16x8x2_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8) } }; return res; } #endif // __GNUC__ < 8 #if __GNUC__ < 9 static inline uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), vld1q_u8(ptr + 2 * 16) } }; return res; } #endif // __GNUC__ < 9 #if ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 static inline uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; return res; } static inline int16x8x4_t vld1q_s16_x4(const int16_t *ptr) { int16x8x4_t res = { { vld1q_s16(ptr + 0 * 8), vld1q_s16(ptr + 1 * 8), vld1q_s16(ptr + 2 * 8), vld1q_s16(ptr + 3 * 8) } }; return res; } static inline void vst1_u8_x2(uint8_t *ptr, uint8x8x2_t a) { vst1_u8(ptr + 0 * 8, a.val[0]); vst1_u8(ptr + 1 * 8, a.val[1]); } static inline void vst1_u8_x4(uint8_t *ptr, uint8x8x4_t a) { vst1_u8(ptr + 0 * 8, a.val[0]); vst1_u8(ptr + 1 * 8, a.val[1]); vst1_u8(ptr + 2 * 8, a.val[2]); vst1_u8(ptr + 3 * 8, a.val[3]); } #endif // ((__GNUC__ << 8) | __GNUC_MINOR__) < 0x805 #endif // defined(__GNUC__) && !defined(__clang__) static inline void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1) { vst1_u8(s, s0); s += p; vst1_u8(s, s1); s += p; } static inline uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) { return vcombine_u8(vld1_u8(s), vld1_u8(s + p)); } // Load four bytes into the low half of a uint8x8_t, zero the upper half. static inline uint8x8_t load_u8_4x1(const uint8_t *p) { uint8x8_t ret = vdup_n_u8(0); ret = vreinterpret_u8_u32( vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0)); return ret; } static inline uint8x8_t load_u8_4x2(const uint8_t *p, int stride) { uint8x8_t ret = vdup_n_u8(0); ret = vreinterpret_u8_u32( vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0)); p += stride; ret = vreinterpret_u8_u32( vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1)); return ret; } static inline uint16x4_t load_u16_2x2(const uint16_t *p, int stride) { uint16x4_t ret = vdup_n_u16(0); ret = vreinterpret_u16_u32( vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0)); p += stride; ret = vreinterpret_u16_u32( vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1)); return ret; } static inline void load_u8_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, uint8x8_t *const s4, uint8x8_t *const s5, uint8x8_t *const s6, uint8x8_t *const s7) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); s += p; *s4 = vld1_u8(s); s += p; *s5 = vld1_u8(s); s += p; *s6 = vld1_u8(s); s += p; *s7 = vld1_u8(s); } static inline void load_u8_8x7(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, uint8x8_t *const s4, uint8x8_t *const s5, uint8x8_t *const s6) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); s += p; *s4 = vld1_u8(s); s += p; *s5 = vld1_u8(s); s += p; *s6 = vld1_u8(s); } static inline void load_u8_8x6(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, 
uint8x8_t *const s4, uint8x8_t *const s5) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); s += p; *s4 = vld1_u8(s); s += p; *s5 = vld1_u8(s); } static inline void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); } static inline void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); } static inline void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; } static inline void load_u16_4x6(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4, uint16x4_t *const s5) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; *s5 = vld1_u16(s); } static inline void load_u16_4x7(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4, uint16x4_t *const s5, uint16x4_t *const s6) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; *s5 = vld1_u16(s); s += p; *s6 = vld1_u16(s); } static inline void load_u16_4x8(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4, uint16x4_t *const s5, uint16x4_t *const s6, uint16x4_t *const s7) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; *s5 = vld1_u16(s); s += p; *s6 = vld1_u16(s); s += p; *s7 = vld1_u16(s); } static inline void load_u16_4x14(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4, uint16x4_t *const s5, uint16x4_t *const s6, uint16x4_t *const s7, uint16x4_t *const s8, uint16x4_t *const s9, uint16x4_t *const s10, uint16x4_t *const s11, uint16x4_t *const s12, uint16x4_t *const s13) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; *s5 = vld1_u16(s); s += p; *s6 = vld1_u16(s); s += p; *s7 = vld1_u16(s); s += p; *s8 = vld1_u16(s); s += p; *s9 = vld1_u16(s); s += p; *s10 = vld1_u16(s); s += p; *s11 = vld1_u16(s); s += p; *s12 = vld1_u16(s); s += p; *s13 = vld1_u16(s); } static inline void load_s16_8x2(const int16_t *s, const ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); } static inline void load_u16_8x2(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); } static inline void load_u16_8x3(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); } static inline void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t 
*const s2, uint16x8_t *const s3) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); s += p; *s3 = vld1q_u16(s); s += p; } static inline void load_s16_4x12(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4, int16x4_t *const s5, int16x4_t *const s6, int16x4_t *const s7, int16x4_t *const s8, int16x4_t *const s9, int16x4_t *const s10, int16x4_t *const s11) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); s += p; *s5 = vld1_s16(s); s += p; *s6 = vld1_s16(s); s += p; *s7 = vld1_s16(s); s += p; *s8 = vld1_s16(s); s += p; *s9 = vld1_s16(s); s += p; *s10 = vld1_s16(s); s += p; *s11 = vld1_s16(s); } static inline void load_s16_4x11(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4, int16x4_t *const s5, int16x4_t *const s6, int16x4_t *const s7, int16x4_t *const s8, int16x4_t *const s9, int16x4_t *const s10) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); s += p; *s5 = vld1_s16(s); s += p; *s6 = vld1_s16(s); s += p; *s7 = vld1_s16(s); s += p; *s8 = vld1_s16(s); s += p; *s9 = vld1_s16(s); s += p; *s10 = vld1_s16(s); } static inline void load_u16_4x11(const uint16_t *s, ptrdiff_t p, uint16x4_t *const s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4, uint16x4_t *const s5, uint16x4_t *const s6, uint16x4_t *const s7, uint16x4_t *const s8, uint16x4_t *const s9, uint16x4_t *const s10) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; *s5 = vld1_u16(s); s += p; *s6 = vld1_u16(s); s += p; *s7 = vld1_u16(s); s += p; *s8 = vld1_u16(s); s += p; *s9 = vld1_u16(s); s += p; *s10 = vld1_u16(s); } static inline void load_s16_4x8(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4, int16x4_t *const s5, int16x4_t *const s6, int16x4_t *const s7) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); s += p; *s5 = vld1_s16(s); s += p; *s6 = vld1_s16(s); s += p; *s7 = vld1_s16(s); } static inline void load_s16_4x7(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4, int16x4_t *const s5, int16x4_t *const s6) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); s += p; *s5 = vld1_s16(s); s += p; *s6 = vld1_s16(s); } static inline void load_s16_4x6(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4, int16x4_t *const s5) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); s += p; *s5 = vld1_s16(s); } static inline void load_s16_4x5(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3, int16x4_t *const s4) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); s += p; *s4 = vld1_s16(s); } static inline void load_u16_4x5(const uint16_t *s, const ptrdiff_t p, uint16x4_t *const 
s0, uint16x4_t *const s1, uint16x4_t *const s2, uint16x4_t *const s3, uint16x4_t *const s4) { *s0 = vld1_u16(s); s += p; *s1 = vld1_u16(s); s += p; *s2 = vld1_u16(s); s += p; *s3 = vld1_u16(s); s += p; *s4 = vld1_u16(s); s += p; } static inline void load_u8_8x5(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, uint8x8_t *const s4) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); s += p; *s4 = vld1_u8(s); } static inline void load_u16_8x5(const uint16_t *s, const ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3, uint16x8_t *const s4) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); s += p; *s3 = vld1q_u16(s); s += p; *s4 = vld1q_u16(s); s += p; } static inline void load_s16_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2, int16x4_t *const s3) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); s += p; *s3 = vld1_s16(s); } static inline void load_s16_4x3(const int16_t *s, ptrdiff_t p, int16x4_t *const s0, int16x4_t *const s1, int16x4_t *const s2) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); s += p; *s2 = vld1_s16(s); } static inline void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, const uint8x8_t s5, const uint8x8_t s6, const uint8x8_t s7) { vst1_u8(s, s0); s += p; vst1_u8(s, s1); s += p; vst1_u8(s, s2); s += p; vst1_u8(s, s3); s += p; vst1_u8(s, s4); s += p; vst1_u8(s, s5); s += p; vst1_u8(s, s6); s += p; vst1_u8(s, s7); } static inline void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3) { vst1_u8(s, s0); s += p; vst1_u8(s, s1); s += p; vst1_u8(s, s2); s += p; vst1_u8(s, s3); } static inline void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const uint8x16_t s3) { vst1q_u8(s, s0); s += p; vst1q_u8(s, s1); s += p; vst1q_u8(s, s2); s += p; vst1q_u8(s, s3); } static inline void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2, const uint16x8_t s3, const uint16x8_t s4, const uint16x8_t s5, const uint16x8_t s6, const uint16x8_t s7) { vst1q_u16(s, s0); s += dst_stride; vst1q_u16(s, s1); s += dst_stride; vst1q_u16(s, s2); s += dst_stride; vst1q_u16(s, s3); s += dst_stride; vst1q_u16(s, s4); s += dst_stride; vst1q_u16(s, s5); s += dst_stride; vst1q_u16(s, s6); s += dst_stride; vst1q_u16(s, s7); } static inline void store_u16_4x3(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2) { vst1_u16(s, s0); s += dst_stride; vst1_u16(s, s1); s += dst_stride; vst1_u16(s, s2); } static inline void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3) { vst1_u16(s, s0); s += dst_stride; vst1_u16(s, s1); s += dst_stride; vst1_u16(s, s2); s += dst_stride; vst1_u16(s, s3); } static inline void store_u16_4x6(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3, const uint16x4_t s4, const uint16x4_t s5) { vst1_u16(s, s0); s += dst_stride; vst1_u16(s, s1); s += dst_stride; vst1_u16(s, s2); s += dst_stride; vst1_u16(s, s3); s += dst_stride; vst1_u16(s, s4); s += dst_stride; vst1_u16(s, s5); 
} static inline void store_u16_4x12(uint16_t *s, ptrdiff_t dst_stride, const uint16x4_t s0, const uint16x4_t s1, const uint16x4_t s2, const uint16x4_t s3, const uint16x4_t s4, const uint16x4_t s5, const uint16x4_t s6, const uint16x4_t s7, const uint16x4_t s8, const uint16x4_t s9, const uint16x4_t s10, const uint16x4_t s11) { vst1_u16(s, s0); s += dst_stride; vst1_u16(s, s1); s += dst_stride; vst1_u16(s, s2); s += dst_stride; vst1_u16(s, s3); s += dst_stride; vst1_u16(s, s4); s += dst_stride; vst1_u16(s, s5); s += dst_stride; vst1_u16(s, s6); s += dst_stride; vst1_u16(s, s7); s += dst_stride; vst1_u16(s, s8); s += dst_stride; vst1_u16(s, s9); s += dst_stride; vst1_u16(s, s10); s += dst_stride; vst1_u16(s, s11); s += dst_stride; } static inline void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1) { vst1q_u16(s, s0); s += dst_stride; vst1q_u16(s, s1); } static inline void store_u16_8x3(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2) { vst1q_u16(s, s0); s += dst_stride; vst1q_u16(s, s1); s += dst_stride; vst1q_u16(s, s2); } static inline void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, const uint16x8_t s0, const uint16x8_t s1, const uint16x8_t s2, const uint16x8_t s3) { vst1q_u16(s, s0); s += dst_stride; vst1q_u16(s, s1); s += dst_stride; vst1q_u16(s, s2); s += dst_stride; vst1q_u16(s, s3); } static inline void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7) { vst1q_s16(s, s0); s += dst_stride; vst1q_s16(s, s1); s += dst_stride; vst1q_s16(s, s2); s += dst_stride; vst1q_s16(s, s3); s += dst_stride; vst1q_s16(s, s4); s += dst_stride; vst1q_s16(s, s5); s += dst_stride; vst1q_s16(s, s6); s += dst_stride; vst1q_s16(s, s7); } static inline void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3) { vst1_s16(s, s0); s += dst_stride; vst1_s16(s, s1); s += dst_stride; vst1_s16(s, s2); s += dst_stride; vst1_s16(s, s3); } static inline void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride, const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7) { vst1_s16(s, s0); s += dst_stride; vst1_s16(s, s1); s += dst_stride; vst1_s16(s, s2); s += dst_stride; vst1_s16(s, s3); s += dst_stride; vst1_s16(s, s4); s += dst_stride; vst1_s16(s, s5); s += dst_stride; vst1_s16(s, s6); s += dst_stride; vst1_s16(s, s7); } static inline void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3) { vst1q_s16(s, s0); s += dst_stride; vst1q_s16(s, s1); s += dst_stride; vst1q_s16(s, s2); s += dst_stride; vst1q_s16(s, s3); } static inline void store_s16_8x2(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1) { vst1q_s16(s, s0); s += dst_stride; vst1q_s16(s, s1); } static inline void load_u8_8x11(const uint8_t *s, ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3, uint8x8_t *const s4, uint8x8_t *const s5, uint8x8_t *const s6, uint8x8_t *const s7, uint8x8_t *const s8, uint8x8_t *const s9, uint8x8_t *const s10) { *s0 = vld1_u8(s); s += p; *s1 = vld1_u8(s); s += p; *s2 = vld1_u8(s); s += p; *s3 = vld1_u8(s); s += p; *s4 = vld1_u8(s); s += p; *s5 = vld1_u8(s); s += p; *s6 = 
vld1_u8(s); s += p; *s7 = vld1_u8(s); s += p; *s8 = vld1_u8(s); s += p; *s9 = vld1_u8(s); s += p; *s10 = vld1_u8(s); } static inline void load_s16_8x10(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7, int16x8_t *const s8, int16x8_t *const s9) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); s += p; *s6 = vld1q_s16(s); s += p; *s7 = vld1q_s16(s); s += p; *s8 = vld1q_s16(s); s += p; *s9 = vld1q_s16(s); } static inline void load_s16_8x11(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7, int16x8_t *const s8, int16x8_t *const s9, int16x8_t *const s10) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); s += p; *s6 = vld1q_s16(s); s += p; *s7 = vld1q_s16(s); s += p; *s8 = vld1q_s16(s); s += p; *s9 = vld1q_s16(s); s += p; *s10 = vld1q_s16(s); } static inline void load_s16_8x12(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7, int16x8_t *const s8, int16x8_t *const s9, int16x8_t *const s10, int16x8_t *const s11) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); s += p; *s6 = vld1q_s16(s); s += p; *s7 = vld1q_s16(s); s += p; *s8 = vld1q_s16(s); s += p; *s9 = vld1q_s16(s); s += p; *s10 = vld1q_s16(s); s += p; *s11 = vld1q_s16(s); } static inline void load_u16_8x11(const uint16_t *s, ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3, uint16x8_t *const s4, uint16x8_t *const s5, uint16x8_t *const s6, uint16x8_t *const s7, uint16x8_t *const s8, uint16x8_t *const s9, uint16x8_t *const s10) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); s += p; *s3 = vld1q_u16(s); s += p; *s4 = vld1q_u16(s); s += p; *s5 = vld1q_u16(s); s += p; *s6 = vld1q_u16(s); s += p; *s7 = vld1q_u16(s); s += p; *s8 = vld1q_u16(s); s += p; *s9 = vld1q_u16(s); s += p; *s10 = vld1q_u16(s); } static inline void load_s16_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); s += p; *s6 = vld1q_s16(s); s += p; *s7 = vld1q_s16(s); } static inline void load_u16_8x7(const uint16_t *s, ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3, uint16x8_t *const s4, uint16x8_t *const s5, uint16x8_t *const s6) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); s += p; *s3 = vld1q_u16(s); s += p; *s4 = vld1q_u16(s); s += p; *s5 = vld1q_u16(s); s += p; *s6 = vld1q_u16(s); } static inline void load_s16_8x7(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t 
*const s4, int16x8_t *const s5, int16x8_t *const s6) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); s += p; *s6 = vld1q_s16(s); } static inline void load_s16_8x6(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, int16x8_t *const s5) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); s += p; *s5 = vld1q_s16(s); } static inline void load_s16_8x5(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); s += p; *s4 = vld1q_s16(s); } static inline void load_s16_8x4(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2, int16x8_t *const s3) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); s += p; *s3 = vld1q_s16(s); } static inline void load_s16_8x3(const int16_t *s, ptrdiff_t p, int16x8_t *const s0, int16x8_t *const s1, int16x8_t *const s2) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); s += p; *s2 = vld1q_s16(s); } #if AOM_ARCH_AARCH64 #define load_unaligned_u32_2x1_lane(v, p, lane) \ do { \ (v) = vld1_lane_u32((const uint32_t *)(p), (v), (lane)); \ } while (0) #define load_unaligned_u32_4x1_lane(v, p, lane) \ do { \ (v) = vld1q_lane_u32((const uint32_t *)(p), (v), (lane)); \ } while (0) #else #define load_unaligned_u32_2x1_lane(v, p, lane) \ do { \ uint32_t tmp; \ memcpy(&tmp, (p), 4); \ (v) = vset_lane_u32(tmp, (v), (lane)); \ } while (0) #define load_unaligned_u32_4x1_lane(v, p, lane) \ do { \ uint32_t tmp; \ memcpy(&tmp, (p), 4); \ (v) = vsetq_lane_u32(tmp, (v), (lane)); \ } while (0) #endif // Load 2 sets of 4 bytes when alignment is not guaranteed. static inline uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; memcpy(&a, buf, 4); buf += stride; uint32x2_t a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } // Load 4 sets of 4 bytes when alignment is not guaranteed. 
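// These unaligned helpers copy via memcpy into a scalar and then insert it
// with vdup/vset(q)_lane rather than dereferencing a casted uint32_t pointer,
// which would be undefined behaviour for misaligned addresses.
// load_unaligned_u8q also short-circuits to a single vld1q_u8 when stride == 4,
// since the four rows are then contiguous in memory.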
static inline uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { uint32_t a; uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } static inline uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) { uint16_t a; uint16x4_t a_u16; memcpy(&a, buf, 2); buf += stride; a_u16 = vdup_n_u16(a); memcpy(&a, buf, 2); a_u16 = vset_lane_u16(a, a_u16, 1); return vreinterpret_u8_u16(a_u16); } static inline uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { uint32_t a; uint32x2_t a_u32; memcpy(&a, buf, 4); a_u32 = vdup_n_u32(0); a_u32 = vset_lane_u32(a, a_u32, 0); return vreinterpret_u8_u32(a_u32); } static inline uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) { uint32_t a; uint32x2_t a_u32; memcpy(&a, buf, 4); a_u32 = vdup_n_u32(a); return vreinterpret_u8_u32(a_u32); } static inline uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) { uint16_t a; uint16x4_t a_u32; memcpy(&a, buf, 2); a_u32 = vdup_n_u16(a); return vreinterpret_u8_u16(a_u32); } static inline uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) { uint32_t a; uint32x2_t a_u32; memcpy(&a, buf, 4); buf += stride; a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } static inline void load_unaligned_u8_4x4(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1) { *tu0 = load_unaligned_u8_4x2(buf, stride); buf += 2 * stride; *tu1 = load_unaligned_u8_4x2(buf, stride); } static inline void load_unaligned_u8_3x8(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1, uint8x8_t *tu2) { load_unaligned_u8_4x4(buf, stride, tu0, tu1); buf += 4 * stride; *tu2 = load_unaligned_u8_4x2(buf, stride); } static inline void load_unaligned_u8_4x8(const uint8_t *buf, int stride, uint8x8_t *tu0, uint8x8_t *tu1, uint8x8_t *tu2, uint8x8_t *tu3) { load_unaligned_u8_4x4(buf, stride, tu0, tu1); buf += 4 * stride; load_unaligned_u8_4x4(buf, stride, tu2, tu3); } static inline void load_u8_16x8(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3, uint8x16_t *const s4, uint8x16_t *const s5, uint8x16_t *const s6, uint8x16_t *const s7) { *s0 = vld1q_u8(s); s += p; *s1 = vld1q_u8(s); s += p; *s2 = vld1q_u8(s); s += p; *s3 = vld1q_u8(s); s += p; *s4 = vld1q_u8(s); s += p; *s5 = vld1q_u8(s); s += p; *s6 = vld1q_u8(s); s += p; *s7 = vld1q_u8(s); } static inline void load_u8_16x5(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3, uint8x16_t *const s4) { *s0 = vld1q_u8(s); s += p; *s1 = vld1q_u8(s); s += p; *s2 = vld1q_u8(s); s += p; *s3 = vld1q_u8(s); s += p; *s4 = vld1q_u8(s); } static inline void load_u8_16x4(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { *s0 = vld1q_u8(s); s += p; *s1 = vld1q_u8(s); s += p; *s2 = vld1q_u8(s); s += p; *s3 = vld1q_u8(s); } static inline void load_u8_16x3(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2) { *s0 = vld1q_u8(s); s += p; *s1 = vld1q_u8(s); s += p; *s2 = vld1q_u8(s); } static inline void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, uint16x8_t *s0, 
uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, uint16x8_t *s7) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); s += p; *s2 = vld1q_u16(s); s += p; *s3 = vld1q_u16(s); s += p; *s4 = vld1q_u16(s); s += p; *s5 = vld1q_u16(s); s += p; *s6 = vld1q_u16(s); s += p; *s7 = vld1q_u16(s); } static inline void load_u16_16x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *const s0, uint16x8_t *const s1, uint16x8_t *const s2, uint16x8_t *const s3, uint16x8_t *const s4, uint16x8_t *const s5, uint16x8_t *const s6, uint16x8_t *const s7) { *s0 = vld1q_u16(s); *s1 = vld1q_u16(s + 8); s += p; *s2 = vld1q_u16(s); *s3 = vld1q_u16(s + 8); s += p; *s4 = vld1q_u16(s); *s5 = vld1q_u16(s + 8); s += p; *s6 = vld1q_u16(s); *s7 = vld1q_u16(s + 8); } static inline uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, int stride) { uint32_t a; uint32x2_t a_u32; memcpy(&a, buf, 4); buf += stride; a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u16_u32(a_u32); } static inline uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { uint64_t a; uint64x1_t a_u64 = vdup_n_u64(0); memcpy(&a, buf, 8); a_u64 = vset_lane_u64(a, a_u64, 0); return vreinterpret_u16_u64(a_u64); } static inline uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, uint32_t stride) { uint64_t a; uint64x2_t a_u64; memcpy(&a, buf, 8); buf += stride; a_u64 = vdupq_n_u64(0); a_u64 = vsetq_lane_u64(a, a_u64, 0); memcpy(&a, buf, 8); buf += stride; a_u64 = vsetq_lane_u64(a, a_u64, 1); return vreinterpretq_u16_u64(a_u64); } static inline int16x8_t load_unaligned_s16_4x2(const int16_t *buf, uint32_t stride) { int64_t a; int64x2_t a_s64; memcpy(&a, buf, 8); buf += stride; a_s64 = vdupq_n_s64(0); a_s64 = vsetq_lane_s64(a, a_s64, 0); memcpy(&a, buf, 8); buf += stride; a_s64 = vsetq_lane_s64(a, a_s64, 1); return vreinterpretq_s16_s64(a_s64); } static inline void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, uint16x8_t *tu0, uint16x8_t *tu1) { *tu0 = load_unaligned_u16_4x2(buf, stride); buf += 2 * stride; *tu1 = load_unaligned_u16_4x2(buf, stride); } static inline void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { *s1 = vld1q_s32(s); s += p; *s2 = vld1q_s32(s); s += p; *s3 = vld1q_s32(s); s += p; *s4 = vld1q_s32(s); } static inline void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, int32x4_t s2, int32x4_t s3, int32x4_t s4) { vst1q_s32(s, s1); s += p; vst1q_s32(s, s2); s += p; vst1q_s32(s, s3); s += p; vst1q_s32(s, s4); } static inline void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, uint32x4_t *s2, uint32x4_t *s3, uint32x4_t *s4) { *s1 = vld1q_u32(s); s += p; *s2 = vld1q_u32(s); s += p; *s3 = vld1q_u32(s); s += p; *s4 = vld1q_u32(s); } static inline void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { vst1q_u32(s, s1); s += p; vst1q_u32(s, s2); s += p; vst1q_u32(s, s3); s += p; vst1q_u32(s, s4); } static inline int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { const int32x4_t v0 = vld1q_s32(buf); const int32x4_t v1 = vld1q_s32(buf + 4); const int16x4_t s0 = vmovn_s32(v0); const int16x4_t s1 = vmovn_s32(v1); return vcombine_s16(s0, s1); } static inline void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); vst1q_s32(buf, v0); vst1q_s32(buf + 4, v1); } static inline void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t 
a) { const int32x4_t v0 = vmovl_s16(a); vst1q_s32(buf, v0); } static inline uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, int16x8_t indices) { // Recent Clang and GCC versions correctly identify that this zero-broadcast // is redundant. Alternatively we could load and broadcast the zeroth element // and then replace the other lanes, however this is slower than loading a // single element without broadcast on some micro-architectures. uint8x8_t ret = vdup_n_u8(0); ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0); ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1); ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2); ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3); ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4); ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5); ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6); ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7); return ret; } // The `lane` parameter here must be an immediate. #define store_u8_2x1_lane(dst, src, lane) \ do { \ uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ memcpy(dst, &a, 2); \ } while (0) #define store_u8_4x1_lane(dst, src, lane) \ do { \ uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ memcpy(dst, &a, 4); \ } while (0) #define store_u16_2x1_lane(dst, src, lane) \ do { \ uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ memcpy(dst, &a, 4); \ } while (0) #define store_u16_4x1_lane(dst, src, lane) \ do { \ uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ memcpy(dst, &a, 8); \ } while (0) #define store_s16_4x1_lane(dst, src, lane) \ do { \ int64_t a = vgetq_lane_s64(vreinterpretq_s64_s16(src), lane); \ memcpy(dst, &a, 8); \ } while (0) // Store the low 16-bits from a single vector. static inline void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); } // Store the low 32-bits from a single vector. static inline void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { store_u8_4x1_lane(dst, src, 0); } // Store two blocks of 16-bits from a single vector. static inline void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); dst += dst_stride; store_u8_2x1_lane(dst, src, 1); } static inline void store_u8x2_strided_x4(uint8_t *dst, uint32_t dst_stride, uint8x8_t src) { store_u8_2x1_lane(dst, src, 0); dst += dst_stride; store_u8_2x1_lane(dst, src, 1); dst += dst_stride; store_u8_2x1_lane(dst, src, 2); dst += dst_stride; store_u8_2x1_lane(dst, src, 3); } // Store two blocks of 32-bits from a single vector. static inline void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, uint8x8_t src) { store_u8_4x1_lane(dst, src, 0); dst += stride; store_u8_4x1_lane(dst, src, 1); } // Store four blocks of 32-bits from a single vector. static inline void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, uint8x16_t src) { store_u8_4x1_lane(dst, vget_low_u8(src), 0); dst += stride; store_u8_4x1_lane(dst, vget_low_u8(src), 1); dst += stride; store_u8_4x1_lane(dst, vget_high_u8(src), 0); dst += stride; store_u8_4x1_lane(dst, vget_high_u8(src), 1); } // Store the low 32-bits from a single vector. static inline void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { store_u16_2x1_lane(dst, src, 0); } // Store two blocks of 32-bits from a single vector. 
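// For example, with src = { a, b, c, d } this writes { a, b } to dst and
// { c, d } to dst + dst_stride.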
static inline void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, uint16x4_t src) { store_u16_2x1_lane(dst, src, 0); dst += dst_stride; store_u16_2x1_lane(dst, src, 1); } // Store two blocks of 64-bits from a single vector. static inline void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, uint16x8_t src) { store_u16_4x1_lane(dst, src, 0); dst += dst_stride; store_u16_4x1_lane(dst, src, 1); } // Store two blocks of 64-bits from a single vector. static inline void store_s16x4_strided_x2(int16_t *dst, int32_t dst_stride, int16x8_t src) { store_s16_4x1_lane(dst, src, 0); dst += dst_stride; store_s16_4x1_lane(dst, src, 1); } #undef store_u8_2x1_lane #undef store_u8_4x1_lane #undef store_u16_2x1_lane #undef store_u16_4x1_lane #undef store_s16_4x1_lane #endif // AOM_AOM_DSP_ARM_MEM_NEON_H_ aom-3.12.1/aom_dsp/arm/obmc_sad_neon.c000066400000000000000000000213631477627663500175410ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "mem_neon.h" #include "sum_neon.h" static inline void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask, const int32_t *wsrc, uint32x4_t *sum) { int32x4_t wsrc_lo = vld1q_s32(wsrc); int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); int32x4_t mask_lo = vld1q_s32(mask); int32x4_t mask_hi = vld1q_s32(mask + 4); int16x8_t mask_s16 = vuzpq_s16(vreinterpretq_s16_s32(mask_lo), vreinterpretq_s16_s32(mask_hi)) .val[0]; int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16)); int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16)); uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo)); uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi)); *sum = vrsraq_n_u32(*sum, abs_lo, 12); *sum = vrsraq_n_u32(*sum, abs_hi, 12); } #if AOM_ARCH_AARCH64 // Use tbl for doing a double-width zero extension from 8->32 bits since we can // do this in one instruction rather than two (indices out of range (255 here) // are set to zero by tbl). 
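// For example, indexing r = { r0, r1, ..., r15 } with the first 16 entries of
// this table ({ 0, 255, 255, 255, 1, 255, ... }) gives
// { r0, 0, 0, 0, r1, 0, 0, 0, r2, 0, 0, 0, r3, 0, 0, 0 }, i.e. the four
// uint32 lanes { r0, r1, r2, r3 }.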
DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255, 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255, 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255 }; static inline void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo, uint32x4_t ref_u32_hi, const int32_t *mask, const int32_t *wsrc, uint32x4_t sum[2]) { int32x4_t wsrc_lo = vld1q_s32(wsrc); int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); int32x4_t mask_lo = vld1q_s32(mask); int32x4_t mask_hi = vld1q_s32(mask + 4); int32x4_t pre_lo = vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), mask_lo); int32x4_t pre_hi = vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), mask_hi); uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo)); uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi)); sum[0] = vrsraq_n_u32(sum[0], abs_lo, 12); sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12); } static inline unsigned int obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; // Use tbl for doing a double-width zero extension from 8->32 bits since we // can do this in one instruction rather than two. uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]); uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]); uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]); uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]); int h = height; do { int w = width; const uint8_t *ref_ptr = ref; do { uint8x16_t r = vld1q_u8(ref_ptr); uint32x4_t ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx0)); uint32x4_t ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx1)); obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask, wsrc, sum); ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx2)); ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx3)); obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask + 8, wsrc + 8, sum); ref_ptr += 16; wsrc += 16; mask += 16; w -= 16; } while (w != 0); ref += ref_stride; } while (--h != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } #else // !AOM_ARCH_AARCH64 static inline unsigned int obmc_sad_large_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { uint32x4_t sum = vdupq_n_u32(0); int h = height; do { int w = width; const uint8_t *ref_ptr = ref; do { uint8x16_t r = vld1q_u8(ref_ptr); int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r))); obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r))); obmc_sad_8x1_s16_neon(ref_s16, mask + 8, wsrc + 8, &sum); ref_ptr += 16; wsrc += 16; mask += 16; w -= 16; } while (w != 0); ref += ref_stride; } while (--h != 0); return horizontal_add_u32x4(sum); } #endif // AOM_ARCH_AARCH64 static inline unsigned int obmc_sad_128xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h); } static inline unsigned int obmc_sad_64xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h); } static inline unsigned int obmc_sad_32xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, 
int h) { return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h); } static inline unsigned int obmc_sad_16xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int h) { return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h); } static inline unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int height) { uint32x4_t sum = vdupq_n_u32(0); int h = height; do { uint8x8_t r = vld1_u8(ref); int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r)); obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); ref += ref_stride; wsrc += 8; mask += 8; } while (--h != 0); return horizontal_add_u32x4(sum); } static inline unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride, const int32_t *wsrc, const int32_t *mask, int height) { uint32x4_t sum = vdupq_n_u32(0); int h = height / 2; do { uint8x8_t r = load_unaligned_u8(ref, ref_stride); int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r)); obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); ref += 2 * ref_stride; wsrc += 8; mask += 8; } while (--h != 0); return horizontal_add_u32x4(sum); } #define OBMC_SAD_WXH_NEON(w, h) \ unsigned int aom_obmc_sad##w##x##h##_neon( \ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ const int32_t *mask) { \ return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \ } OBMC_SAD_WXH_NEON(4, 4) OBMC_SAD_WXH_NEON(4, 8) OBMC_SAD_WXH_NEON(4, 16) OBMC_SAD_WXH_NEON(8, 4) OBMC_SAD_WXH_NEON(8, 8) OBMC_SAD_WXH_NEON(8, 16) OBMC_SAD_WXH_NEON(8, 32) OBMC_SAD_WXH_NEON(16, 4) OBMC_SAD_WXH_NEON(16, 8) OBMC_SAD_WXH_NEON(16, 16) OBMC_SAD_WXH_NEON(16, 32) OBMC_SAD_WXH_NEON(16, 64) OBMC_SAD_WXH_NEON(32, 8) OBMC_SAD_WXH_NEON(32, 16) OBMC_SAD_WXH_NEON(32, 32) OBMC_SAD_WXH_NEON(32, 64) OBMC_SAD_WXH_NEON(64, 16) OBMC_SAD_WXH_NEON(64, 32) OBMC_SAD_WXH_NEON(64, 64) OBMC_SAD_WXH_NEON(64, 128) OBMC_SAD_WXH_NEON(128, 64) OBMC_SAD_WXH_NEON(128, 128) aom-3.12.1/aom_dsp/arm/obmc_variance_neon.c000066400000000000000000000265171477627663500205700ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "mem_neon.h" #include "sum_neon.h" static inline void obmc_variance_8x1_s16_neon(int16x8_t pre_s16, const int32_t *wsrc, const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) { // For 4xh and 8xh we observe it is faster to avoid the double-widening of // pre. Instead we do a single widening step and narrow the mask to 16-bits // to allow us to perform a widening multiply. Widening multiply // instructions have better throughput on some micro-architectures but for // the larger block sizes this benefit is outweighed by the additional // instruction needed to first narrow the mask vectors. 
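// Illustrative scalar equivalent of this kernel (one element at a time):
//   diff = wsrc[i] - mask[i] * pre[i];
//   rounded = ROUND_POWER_OF_TWO_SIGNED(diff, 12);
//   *sumv += rounded;
//   *ssev += rounded * rounded;
// The only subtlety is the tie case handled below: e.g. for diff = -2048 the
// reference rounds away from zero to -1, whereas a plain rounding shift
// (VRSHR) would give 0, hence the extra -1 adjustment applied to negative
// values before shifting.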
int32x4_t wsrc_s32_lo = vld1q_s32(&wsrc[0]); int32x4_t wsrc_s32_hi = vld1q_s32(&wsrc[4]); int16x8_t mask_s16 = vuzpq_s16(vreinterpretq_s16_s32(vld1q_s32(&mask[0])), vreinterpretq_s16_s32(vld1q_s32(&mask[4]))) .val[0]; int32x4_t diff_s32_lo = vmlsl_s16(wsrc_s32_lo, vget_low_s16(pre_s16), vget_low_s16(mask_s16)); int32x4_t diff_s32_hi = vmlsl_s16(wsrc_s32_hi, vget_high_s16(pre_s16), vget_high_s16(mask_s16)); // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. // This difference only affects the bit patterns at the rounding breakpoints // exactly, so we can add -1 to all negative numbers to move the breakpoint // one value across and into the correct rounding region. diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31); diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31); int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12); int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12); *sumv = vrsraq_n_s32(*sumv, diff_s32_lo, 12); *sumv = vrsraq_n_s32(*sumv, diff_s32_hi, 12); *ssev = vmlaq_s32(*ssev, round_s32_lo, round_s32_lo); *ssev = vmlaq_s32(*ssev, round_s32_hi, round_s32_hi); } #if AOM_ARCH_AARCH64 // Use tbl for doing a double-width zero extension from 8->32 bits since we can // do this in one instruction rather than two (indices out of range (255 here) // are set to zero by tbl). DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255, 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255, 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255 }; static inline void obmc_variance_8x1_s32_neon( int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc, const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) { int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]); int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]); int32x4_t mask_lo = vld1q_s32(&mask[0]); int32x4_t mask_hi = vld1q_s32(&mask[4]); int32x4_t diff_lo = vmlsq_s32(wsrc_lo, pre_lo, mask_lo); int32x4_t diff_hi = vmlsq_s32(wsrc_hi, pre_hi, mask_hi); // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from // zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This // difference only affects the bit patterns at the rounding breakpoints // exactly, so we can add -1 to all negative numbers to move the breakpoint // one value across and into the correct rounding region. diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31); diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31); int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12); int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12); *sumv = vrsraq_n_s32(*sumv, diff_lo, 12); *sumv = vrsraq_n_s32(*sumv, diff_hi, 12); *ssev = vmlaq_s32(*ssev, round_lo, round_lo); *ssev = vmlaq_s32(*ssev, round_hi, round_hi); } static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) { assert(width % 16 == 0); // Use tbl for doing a double-width zero extension from 8->32 bits since we // can do this in one instruction rather than two. 
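// A brief sketch of how the table is consumed here: pre_idx0..pre_idx3
// (loaded below) cover bytes 0-3, 4-7, 8-11 and 12-15 of each 16-byte load of
// 'pre', so a single vld1q_u8 plus four TBL lookups yields the sixteen pixels
// already zero-extended to 32 bits, ready for the 32-bit multiply against the
// mask.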
uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]); uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]); uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]); uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]); int32x4_t ssev = vdupq_n_s32(0); int32x4_t sumv = vdupq_n_s32(0); int h = height; do { int w = width; do { uint8x16_t pre_u8 = vld1q_u8(pre); int32x4_t pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx0)); int32x4_t pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx1)); obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[0], &mask[0], &ssev, &sumv); pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx2)); pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx3)); obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[8], &mask[8], &ssev, &sumv); wsrc += 16; mask += 16; pre += 16; w -= 16; } while (w != 0); pre += pre_stride - width; } while (--h != 0); *sse = horizontal_add_s32x4(ssev); *sum = horizontal_add_s32x4(sumv); } #else // !AOM_ARCH_AARCH64 static inline void obmc_variance_large_neon(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height, unsigned *sse, int *sum) { // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the // widening version of the core kernel instead. assert(width % 16 == 0); int32x4_t ssev = vdupq_n_s32(0); int32x4_t sumv = vdupq_n_s32(0); int h = height; do { int w = width; do { uint8x16_t pre_u8 = vld1q_u8(pre); int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pre_u8))); obmc_variance_8x1_s16_neon(pre_s16, &wsrc[0], &mask[0], &ssev, &sumv); pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pre_u8))); obmc_variance_8x1_s16_neon(pre_s16, &wsrc[8], &mask[8], &ssev, &sumv); wsrc += 16; mask += 16; pre += 16; w -= 16; } while (w != 0); pre += pre_stride - width; } while (--h != 0); *sse = horizontal_add_s32x4(ssev); *sum = horizontal_add_s32x4(sumv); } #endif // AOM_ARCH_AARCH64 static inline void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, unsigned *sse, int *sum) { obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum); } static inline void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, unsigned *sse, int *sum) { obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum); } static inline void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, unsigned *sse, int *sum) { obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum); } static inline void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, unsigned *sse, int *sum) { obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum); } static inline void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int h, unsigned *sse, int *sum) { int32x4_t ssev = vdupq_n_s32(0); int32x4_t sumv = vdupq_n_s32(0); do { uint8x8_t pre_u8 = vld1_u8(pre); int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8)); obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv); pre += pre_stride; wsrc += 8; mask += 8; } while (--h != 0); *sse = horizontal_add_s32x4(ssev); *sum = horizontal_add_s32x4(sumv); } static inline void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t 
*mask, int h, unsigned *sse, int *sum) { assert(h % 2 == 0); int32x4_t ssev = vdupq_n_s32(0); int32x4_t sumv = vdupq_n_s32(0); do { uint8x8_t pre_u8 = load_unaligned_u8(pre, pre_stride); int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8)); obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv); pre += 2 * pre_stride; wsrc += 8; mask += 8; h -= 2; } while (h != 0); *sse = horizontal_add_s32x4(ssev); *sum = horizontal_add_s32x4(sumv); } #define OBMC_VARIANCE_WXH_NEON(W, H) \ unsigned aom_obmc_variance##W##x##H##_neon( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned *sse) { \ int sum; \ obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, &sum); \ return *sse - (unsigned)(((int64_t)sum * sum) / (W * H)); \ } OBMC_VARIANCE_WXH_NEON(4, 4) OBMC_VARIANCE_WXH_NEON(4, 8) OBMC_VARIANCE_WXH_NEON(8, 4) OBMC_VARIANCE_WXH_NEON(8, 8) OBMC_VARIANCE_WXH_NEON(8, 16) OBMC_VARIANCE_WXH_NEON(16, 8) OBMC_VARIANCE_WXH_NEON(16, 16) OBMC_VARIANCE_WXH_NEON(16, 32) OBMC_VARIANCE_WXH_NEON(32, 16) OBMC_VARIANCE_WXH_NEON(32, 32) OBMC_VARIANCE_WXH_NEON(32, 64) OBMC_VARIANCE_WXH_NEON(64, 32) OBMC_VARIANCE_WXH_NEON(64, 64) OBMC_VARIANCE_WXH_NEON(64, 128) OBMC_VARIANCE_WXH_NEON(128, 64) OBMC_VARIANCE_WXH_NEON(128, 128) OBMC_VARIANCE_WXH_NEON(4, 16) OBMC_VARIANCE_WXH_NEON(16, 4) OBMC_VARIANCE_WXH_NEON(8, 32) OBMC_VARIANCE_WXH_NEON(32, 8) OBMC_VARIANCE_WXH_NEON(16, 64) OBMC_VARIANCE_WXH_NEON(64, 16) aom-3.12.1/aom_dsp/arm/reinterpret_neon.h000066400000000000000000000031511477627663500203350ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ #define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ #include #include "aom_dsp/aom_dsp_common.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" #define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \ static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \ aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \ const u##int##from_sz##x##from_count##x##n##_t src) { \ u##int##to_sz##x##to_count##x##n##_t ret; \ for (int i = 0; i < (n); ++i) { \ ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \ } \ return ret; \ } REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t #endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ aom-3.12.1/aom_dsp/arm/sad_neon.c000066400000000000000000000410031477627663500165320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { // We use 8 accumulators to prevent overflow for large values of 'h', as well // as enabling optimal UADALP instruction throughput on CPUs that have either // 2 or 4 Neon pipes. uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7; uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); diff0 = vabdq_u8(s0, r0); sum[0] = vpadalq_u8(sum[0], diff0); s1 = vld1q_u8(src_ptr + 16); r1 = vld1q_u8(ref_ptr + 16); diff1 = vabdq_u8(s1, r1); sum[1] = vpadalq_u8(sum[1], diff1); s2 = vld1q_u8(src_ptr + 32); r2 = vld1q_u8(ref_ptr + 32); diff2 = vabdq_u8(s2, r2); sum[2] = vpadalq_u8(sum[2], diff2); s3 = vld1q_u8(src_ptr + 48); r3 = vld1q_u8(ref_ptr + 48); diff3 = vabdq_u8(s3, r3); sum[3] = vpadalq_u8(sum[3], diff3); s4 = vld1q_u8(src_ptr + 64); r4 = vld1q_u8(ref_ptr + 64); diff4 = vabdq_u8(s4, r4); sum[4] = vpadalq_u8(sum[4], diff4); s5 = vld1q_u8(src_ptr + 80); r5 = vld1q_u8(ref_ptr + 80); diff5 = vabdq_u8(s5, r5); sum[5] = vpadalq_u8(sum[5], diff5); s6 = vld1q_u8(src_ptr + 96); r6 = vld1q_u8(ref_ptr + 96); diff6 = vabdq_u8(s6, r6); sum[6] = vpadalq_u8(sum[6], diff6); s7 = vld1q_u8(src_ptr + 112); r7 = vld1q_u8(ref_ptr + 112); diff7 = vabdq_u8(s7, r7); sum[7] = vpadalq_u8(sum[7], diff7); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); sum_u32 = vpadalq_u16(sum_u32, sum[1]); sum_u32 = vpadalq_u16(sum_u32, sum[2]); sum_u32 = vpadalq_u16(sum_u32, sum[3]); sum_u32 = vpadalq_u16(sum_u32, sum[4]); sum_u32 = vpadalq_u16(sum_u32, sum[5]); sum_u32 = vpadalq_u16(sum_u32, sum[6]); sum_u32 = vpadalq_u16(sum_u32, sum[7]); return horizontal_add_u32x4(sum_u32); } static inline unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; uint8x16_t diff0, diff1, diff2, diff3; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); diff0 = vabdq_u8(s0, r0); sum[0] = vpadalq_u8(sum[0], diff0); s1 = vld1q_u8(src_ptr + 16); r1 = vld1q_u8(ref_ptr + 16); diff1 = vabdq_u8(s1, r1); sum[1] = vpadalq_u8(sum[1], diff1); s2 = vld1q_u8(src_ptr + 32); r2 = vld1q_u8(ref_ptr + 32); diff2 = vabdq_u8(s2, r2); sum[2] = vpadalq_u8(sum[2], diff2); s3 = vld1q_u8(src_ptr + 48); r3 = vld1q_u8(ref_ptr + 48); diff3 = vabdq_u8(s3, r3); sum[3] = vpadalq_u8(sum[3], diff3); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); sum_u32 = vpadalq_u16(sum_u32, sum[1]); sum_u32 = vpadalq_u16(sum_u32, sum[2]); sum_u32 = vpadalq_u16(sum_u32, sum[3]); return horizontal_add_u32x4(sum_u32); } static inline unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0 = vld1q_u8(src_ptr); uint8x16_t r0 = vld1q_u8(ref_ptr); uint8x16_t diff0 = vabdq_u8(s0, r0); sum[0] = 
vpadalq_u8(sum[0], diff0); uint8x16_t s1 = vld1q_u8(src_ptr + 16); uint8x16_t r1 = vld1q_u8(ref_ptr + 16); uint8x16_t diff1 = vabdq_u8(s1, r1); sum[1] = vpadalq_u8(sum[1], diff1); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1])); } static inline unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint8x16_t s = vld1q_u8(src_ptr); uint8x16_t r = vld1q_u8(ref_ptr); uint8x16_t diff = vabdq_u8(s, r); sum = vpadalq_u8(sum, diff); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u16x8(sum); } static inline unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint8x8_t s = vld1_u8(src_ptr); uint8x8_t r = vld1_u8(ref_ptr); sum = vabal_u8(sum, s, r); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u16x8(sum); } static inline unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint16x8_t sum = vdupq_n_u16(0); int i = h / 2; do { uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); sum = vabal_u8(sum, s, r); src_ptr += 2 * src_stride; ref_ptr += 2 * ref_stride; } while (--i != 0); return horizontal_add_u16x8(sum); } #define SAD_WXH_NEON(w, h) \ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } SAD_WXH_NEON(4, 4) SAD_WXH_NEON(4, 8) SAD_WXH_NEON(8, 4) SAD_WXH_NEON(8, 8) SAD_WXH_NEON(8, 16) SAD_WXH_NEON(16, 8) SAD_WXH_NEON(16, 16) SAD_WXH_NEON(16, 32) SAD_WXH_NEON(32, 16) SAD_WXH_NEON(32, 32) SAD_WXH_NEON(32, 64) SAD_WXH_NEON(64, 32) SAD_WXH_NEON(64, 64) SAD_WXH_NEON(64, 128) SAD_WXH_NEON(128, 64) SAD_WXH_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_NEON(4, 16) SAD_WXH_NEON(8, 32) SAD_WXH_NEON(16, 4) SAD_WXH_NEON(16, 64) SAD_WXH_NEON(32, 8) SAD_WXH_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_NEON #define SAD_SKIP_WXH_NEON(w, h) \ unsigned int aom_sad_skip_##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * \ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ } SAD_SKIP_WXH_NEON(8, 16) SAD_SKIP_WXH_NEON(16, 16) SAD_SKIP_WXH_NEON(16, 32) SAD_SKIP_WXH_NEON(32, 16) SAD_SKIP_WXH_NEON(32, 32) SAD_SKIP_WXH_NEON(32, 64) SAD_SKIP_WXH_NEON(64, 32) SAD_SKIP_WXH_NEON(64, 64) SAD_SKIP_WXH_NEON(64, 128) SAD_SKIP_WXH_NEON(128, 64) SAD_SKIP_WXH_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_SKIP_WXH_NEON(4, 16) SAD_SKIP_WXH_NEON(8, 32) SAD_SKIP_WXH_NEON(16, 64) SAD_SKIP_WXH_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_SKIP_WXH_NEON static inline unsigned int sad128xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { // We use 8 accumulators to prevent overflow for large values of 'h', as well // as enabling optimal UADALP instruction throughput on CPUs that have either // 2 or 4 Neon pipes. 
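// A rough bound on why 16-bit accumulators are sufficient here: each
// vpadalq_u8 adds the pairwise sums of sixteen absolute differences into
// eight uint16 lanes, i.e. at most 2 * 255 = 510 per lane per row. With one
// accumulator per 16-byte column chunk and h no larger than 128 for the
// 128-wide block sizes, a lane reaches at most 510 * 128 = 65280 < 65536,
// so widening to 32 bits can safely be deferred until after the loop.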
uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7; uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7; uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7; uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); p0 = vld1q_u8(second_pred); avg0 = vrhaddq_u8(r0, p0); diff0 = vabdq_u8(s0, avg0); sum[0] = vpadalq_u8(sum[0], diff0); s1 = vld1q_u8(src_ptr + 16); r1 = vld1q_u8(ref_ptr + 16); p1 = vld1q_u8(second_pred + 16); avg1 = vrhaddq_u8(r1, p1); diff1 = vabdq_u8(s1, avg1); sum[1] = vpadalq_u8(sum[1], diff1); s2 = vld1q_u8(src_ptr + 32); r2 = vld1q_u8(ref_ptr + 32); p2 = vld1q_u8(second_pred + 32); avg2 = vrhaddq_u8(r2, p2); diff2 = vabdq_u8(s2, avg2); sum[2] = vpadalq_u8(sum[2], diff2); s3 = vld1q_u8(src_ptr + 48); r3 = vld1q_u8(ref_ptr + 48); p3 = vld1q_u8(second_pred + 48); avg3 = vrhaddq_u8(r3, p3); diff3 = vabdq_u8(s3, avg3); sum[3] = vpadalq_u8(sum[3], diff3); s4 = vld1q_u8(src_ptr + 64); r4 = vld1q_u8(ref_ptr + 64); p4 = vld1q_u8(second_pred + 64); avg4 = vrhaddq_u8(r4, p4); diff4 = vabdq_u8(s4, avg4); sum[4] = vpadalq_u8(sum[4], diff4); s5 = vld1q_u8(src_ptr + 80); r5 = vld1q_u8(ref_ptr + 80); p5 = vld1q_u8(second_pred + 80); avg5 = vrhaddq_u8(r5, p5); diff5 = vabdq_u8(s5, avg5); sum[5] = vpadalq_u8(sum[5], diff5); s6 = vld1q_u8(src_ptr + 96); r6 = vld1q_u8(ref_ptr + 96); p6 = vld1q_u8(second_pred + 96); avg6 = vrhaddq_u8(r6, p6); diff6 = vabdq_u8(s6, avg6); sum[6] = vpadalq_u8(sum[6], diff6); s7 = vld1q_u8(src_ptr + 112); r7 = vld1q_u8(ref_ptr + 112); p7 = vld1q_u8(second_pred + 112); avg7 = vrhaddq_u8(r7, p7); diff7 = vabdq_u8(s7, avg7); sum[7] = vpadalq_u8(sum[7], diff7); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 128; } while (--i != 0); uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); sum_u32 = vpadalq_u16(sum_u32, sum[1]); sum_u32 = vpadalq_u16(sum_u32, sum[2]); sum_u32 = vpadalq_u16(sum_u32, sum[3]); sum_u32 = vpadalq_u16(sum_u32, sum[4]); sum_u32 = vpadalq_u16(sum_u32, sum[5]); sum_u32 = vpadalq_u16(sum_u32, sum[6]); sum_u32 = vpadalq_u16(sum_u32, sum[7]); return horizontal_add_u32x4(sum_u32); } static inline unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); p0 = vld1q_u8(second_pred); avg0 = vrhaddq_u8(r0, p0); diff0 = vabdq_u8(s0, avg0); sum[0] = vpadalq_u8(sum[0], diff0); s1 = vld1q_u8(src_ptr + 16); r1 = vld1q_u8(ref_ptr + 16); p1 = vld1q_u8(second_pred + 16); avg1 = vrhaddq_u8(r1, p1); diff1 = vabdq_u8(s1, avg1); sum[1] = vpadalq_u8(sum[1], diff1); s2 = vld1q_u8(src_ptr + 32); r2 = vld1q_u8(ref_ptr + 32); p2 = vld1q_u8(second_pred + 32); avg2 = vrhaddq_u8(r2, p2); diff2 = vabdq_u8(s2, avg2); sum[2] = vpadalq_u8(sum[2], diff2); s3 = vld1q_u8(src_ptr + 48); r3 = vld1q_u8(ref_ptr + 48); p3 = vld1q_u8(second_pred + 48); avg3 = vrhaddq_u8(r3, p3); diff3 = vabdq_u8(s3, avg3); sum[3] = vpadalq_u8(sum[3], diff3); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 64; } while (--i != 0); uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); sum_u32 = 
vpadalq_u16(sum_u32, sum[1]); sum_u32 = vpadalq_u16(sum_u32, sum[2]); sum_u32 = vpadalq_u16(sum_u32, sum[3]); return horizontal_add_u32x4(sum_u32); } static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; int i = h; do { uint8x16_t s0 = vld1q_u8(src_ptr); uint8x16_t r0 = vld1q_u8(ref_ptr); uint8x16_t p0 = vld1q_u8(second_pred); uint8x16_t avg0 = vrhaddq_u8(r0, p0); uint8x16_t diff0 = vabdq_u8(s0, avg0); sum[0] = vpadalq_u8(sum[0], diff0); uint8x16_t s1 = vld1q_u8(src_ptr + 16); uint8x16_t r1 = vld1q_u8(ref_ptr + 16); uint8x16_t p1 = vld1q_u8(second_pred + 16); uint8x16_t avg1 = vrhaddq_u8(r1, p1); uint8x16_t diff1 = vabdq_u8(s1, avg1); sum[1] = vpadalq_u8(sum[1], diff1); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; } while (--i != 0); return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1])); } static inline unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint8x16_t s = vld1q_u8(src_ptr); uint8x16_t r = vld1q_u8(ref_ptr); uint8x16_t p = vld1q_u8(second_pred); uint8x16_t avg = vrhaddq_u8(r, p); uint8x16_t diff = vabdq_u8(s, avg); sum = vpadalq_u8(sum, diff); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 16; } while (--i != 0); return horizontal_add_u16x8(sum); } static inline unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint8x8_t s = vld1_u8(src_ptr); uint8x8_t r = vld1_u8(ref_ptr); uint8x8_t p = vld1_u8(second_pred); uint8x8_t avg = vrhadd_u8(r, p); sum = vabal_u8(sum, s, avg); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 8; } while (--i != 0); return horizontal_add_u16x8(sum); } #define SAD_WXH_AVG_NEON(w, h) \ unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ second_pred); \ } SAD_WXH_AVG_NEON(8, 8) SAD_WXH_AVG_NEON(8, 16) SAD_WXH_AVG_NEON(16, 8) SAD_WXH_AVG_NEON(16, 16) SAD_WXH_AVG_NEON(16, 32) SAD_WXH_AVG_NEON(32, 16) SAD_WXH_AVG_NEON(32, 32) SAD_WXH_AVG_NEON(32, 64) SAD_WXH_AVG_NEON(64, 32) SAD_WXH_AVG_NEON(64, 64) SAD_WXH_AVG_NEON(64, 128) SAD_WXH_AVG_NEON(128, 64) SAD_WXH_AVG_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_AVG_NEON(8, 32) SAD_WXH_AVG_NEON(16, 64) SAD_WXH_AVG_NEON(32, 8) SAD_WXH_AVG_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_AVG_NEON aom-3.12.1/aom_dsp/arm/sad_neon_dotprod.c000066400000000000000000000233101477627663500202660ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h) { // Only two accumulators are required for optimal instruction throughput of // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { int j = 0; do { uint8x16_t s0, s1, r0, r1, diff0, diff1; s0 = vld1q_u8(src_ptr + j); r0 = vld1q_u8(ref_ptr + j); diff0 = vabdq_u8(s0, r0); sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); s1 = vld1q_u8(src_ptr + j + 16); r1 = vld1q_u8(ref_ptr + j + 16); diff1 = vabdq_u8(s1, r1); sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); j += 32; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } static inline unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h); } static inline unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); } static inline unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); } static inline unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h / 2; do { uint8x16_t s0, s1, r0, r1, diff0, diff1; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); diff0 = vabdq_u8(s0, r0); sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); src_ptr += src_stride; ref_ptr += ref_stride; s1 = vld1q_u8(src_ptr); r1 = vld1q_u8(ref_ptr); diff1 = vabdq_u8(s1, r1); sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } #define SAD_WXH_NEON_DOTPROD(w, h) \ unsigned int aom_sad##w##x##h##_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ } SAD_WXH_NEON_DOTPROD(16, 8) SAD_WXH_NEON_DOTPROD(16, 16) SAD_WXH_NEON_DOTPROD(16, 32) SAD_WXH_NEON_DOTPROD(32, 16) SAD_WXH_NEON_DOTPROD(32, 32) SAD_WXH_NEON_DOTPROD(32, 64) SAD_WXH_NEON_DOTPROD(64, 32) SAD_WXH_NEON_DOTPROD(64, 64) SAD_WXH_NEON_DOTPROD(64, 128) SAD_WXH_NEON_DOTPROD(128, 64) SAD_WXH_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_NEON_DOTPROD(16, 4) SAD_WXH_NEON_DOTPROD(16, 64) SAD_WXH_NEON_DOTPROD(32, 8) SAD_WXH_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_NEON_DOTPROD #define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ 2 * ref_stride, (h) / 2); \ } SAD_SKIP_WXH_NEON_DOTPROD(16, 16) SAD_SKIP_WXH_NEON_DOTPROD(16, 32) SAD_SKIP_WXH_NEON_DOTPROD(32, 16) SAD_SKIP_WXH_NEON_DOTPROD(32, 32) SAD_SKIP_WXH_NEON_DOTPROD(32, 64) 
SAD_SKIP_WXH_NEON_DOTPROD(64, 32) SAD_SKIP_WXH_NEON_DOTPROD(64, 64) SAD_SKIP_WXH_NEON_DOTPROD(64, 128) SAD_SKIP_WXH_NEON_DOTPROD(128, 64) SAD_SKIP_WXH_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_SKIP_WXH_NEON_DOTPROD(16, 64) SAD_SKIP_WXH_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_SKIP_WXH_NEON_DOTPROD static inline unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int w, int h, const uint8_t *second_pred) { // Only two accumulators are required for optimal instruction throughput of // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { int j = 0; do { uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; s0 = vld1q_u8(src_ptr + j); r0 = vld1q_u8(ref_ptr + j); p0 = vld1q_u8(second_pred); avg0 = vrhaddq_u8(r0, p0); diff0 = vabdq_u8(s0, avg0); sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); s1 = vld1q_u8(src_ptr + j + 16); r1 = vld1q_u8(ref_ptr + j + 16); p1 = vld1q_u8(second_pred + 16); avg1 = vrhaddq_u8(r1, p1); diff1 = vabdq_u8(s1, avg1); sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); j += 32; second_pred += 32; } while (j < w); src_ptr += src_stride; ref_ptr += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } static inline unsigned int sad128xh_avg_neon_dotprod( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h, second_pred); } static inline unsigned int sad64xh_avg_neon_dotprod( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, second_pred); } static inline unsigned int sad32xh_avg_neon_dotprod( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, second_pred); } static inline unsigned int sad16xh_avg_neon_dotprod( const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h, const uint8_t *second_pred) { uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h / 2; do { uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; s0 = vld1q_u8(src_ptr); r0 = vld1q_u8(ref_ptr); p0 = vld1q_u8(second_pred); avg0 = vrhaddq_u8(r0, p0); diff0 = vabdq_u8(s0, avg0); sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 16; s1 = vld1q_u8(src_ptr); r1 = vld1q_u8(ref_ptr); p1 = vld1q_u8(second_pred); avg1 = vrhaddq_u8(r1, p1); diff1 = vabdq_u8(s1, avg1); sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 16; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); } #define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ second_pred); \ } SAD_WXH_AVG_NEON_DOTPROD(16, 8) SAD_WXH_AVG_NEON_DOTPROD(16, 16) SAD_WXH_AVG_NEON_DOTPROD(16, 32) SAD_WXH_AVG_NEON_DOTPROD(32, 16) SAD_WXH_AVG_NEON_DOTPROD(32, 32) SAD_WXH_AVG_NEON_DOTPROD(32, 64) 
SAD_WXH_AVG_NEON_DOTPROD(64, 32) SAD_WXH_AVG_NEON_DOTPROD(64, 64) SAD_WXH_AVG_NEON_DOTPROD(64, 128) SAD_WXH_AVG_NEON_DOTPROD(128, 64) SAD_WXH_AVG_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_AVG_NEON_DOTPROD(16, 64) SAD_WXH_AVG_NEON_DOTPROD(32, 8) SAD_WXH_AVG_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_AVG_NEON_DOTPROD aom-3.12.1/aom_dsp/arm/sadxd_neon.c000066400000000000000000000424171477627663500171000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void sad16_neon(uint8x16_t src, uint8x16_t ref, uint16x8_t *const sad_sum) { uint8x16_t abs_diff = vabdq_u8(src, ref); *sad_sum = vpadalq_u8(*sad_sum, abs_diff); } static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int w, int h, int h_overflow) { uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int h_limit = h > h_overflow ? h_overflow : h; int ref_offset = 0; int i = 0; do { uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; do { int j = 0; do { const uint8x16_t s0 = vld1q_u8(src + j); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); const uint8x16_t s1 = vld1q_u8(src + j + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); j += 32; } while (j < w); src += src_stride; ref_offset += ref_stride; } while (++i < h_limit); sum[0] = vpadalq_u16(sum[0], sum_lo[0]); sum[0] = vpadalq_u16(sum[0], sum_hi[0]); sum[1] = vpadalq_u16(sum[1], sum_lo[1]); sum[1] = vpadalq_u16(sum[1], sum_hi[1]); sum[2] = vpadalq_u16(sum[2], sum_lo[2]); sum[2] = vpadalq_u16(sum[2], sum_hi[2]); h_limit += h_overflow; } while (i < h); res[0] = horizontal_add_u32x4(sum[0]); res[1] = horizontal_add_u32x4(sum[1]); res[2] = horizontal_add_u32x4(sum[2]); } static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); } static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); } static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int 
ref_offset = 0; int i = h; do { const uint8x16_t s0 = vld1q_u8(src); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); const uint8x16_t s1 = vld1q_u8(src + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); src += src_stride; ref_offset += ref_stride; } while (--i != 0); res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]); res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]); res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]); } static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int ref_offset = 0; int i = h; do { const uint8x16_t s = vld1q_u8(src); sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); src += src_stride; ref_offset += ref_stride; } while (--i != 0); res[0] = horizontal_add_u16x8(sum[0]); res[1] = horizontal_add_u16x8(sum[1]); res[2] = horizontal_add_u16x8(sum[2]); } static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { uint16x8_t sum[3]; uint8x8_t s = vld1_u8(src); sum[0] = vabdl_u8(s, vld1_u8(ref[0])); sum[1] = vabdl_u8(s, vld1_u8(ref[1])); sum[2] = vabdl_u8(s, vld1_u8(ref[2])); src += src_stride; int ref_offset = ref_stride; int i = h - 1; do { s = vld1_u8(src); sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); src += src_stride; ref_offset += ref_stride; } while (--i != 0); res[0] = horizontal_add_u16x8(sum[0]); res[1] = horizontal_add_u16x8(sum[1]); res[2] = horizontal_add_u16x8(sum[2]); } static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[3], int ref_stride, uint32_t res[3], int h) { assert(h % 2 == 0); uint16x8_t sum[3]; uint8x8_t s = load_unaligned_u8(src, src_stride); uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); sum[0] = vabdl_u8(s, r0); sum[1] = vabdl_u8(s, r1); sum[2] = vabdl_u8(s, r2); src += 2 * src_stride; int ref_offset = 2 * ref_stride; int i = (h / 2) - 1; do { s = load_unaligned_u8(src, src_stride); r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); sum[0] = vabal_u8(sum[0], s, r0); sum[1] = vabal_u8(sum[1], s, r1); sum[2] = vabal_u8(sum[2], s, r2); src += 2 * src_stride; ref_offset += 2 * ref_stride; } while (--i != 0); res[0] = horizontal_add_u16x8(sum[0]); res[1] = horizontal_add_u16x8(sum[1]); res[2] = horizontal_add_u16x8(sum[2]); } #define SAD_WXH_3D_NEON(w, h) \ void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h)); \ } SAD_WXH_3D_NEON(4, 4) SAD_WXH_3D_NEON(4, 8) SAD_WXH_3D_NEON(8, 4) SAD_WXH_3D_NEON(8, 8) SAD_WXH_3D_NEON(8, 16) SAD_WXH_3D_NEON(16, 8) 
SAD_WXH_3D_NEON(16, 16) SAD_WXH_3D_NEON(16, 32) SAD_WXH_3D_NEON(32, 16) SAD_WXH_3D_NEON(32, 32) SAD_WXH_3D_NEON(32, 64) SAD_WXH_3D_NEON(64, 32) SAD_WXH_3D_NEON(64, 64) SAD_WXH_3D_NEON(64, 128) SAD_WXH_3D_NEON(128, 64) SAD_WXH_3D_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_3D_NEON(4, 16) SAD_WXH_3D_NEON(8, 32) SAD_WXH_3D_NEON(16, 4) SAD_WXH_3D_NEON(16, 64) SAD_WXH_3D_NEON(32, 8) SAD_WXH_3D_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_3D_NEON static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int w, int h, int h_overflow) { uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int h_limit = h > h_overflow ? h_overflow : h; int ref_offset = 0; int i = 0; do { uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; do { int j = 0; do { const uint8x16_t s0 = vld1q_u8(src + j); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); const uint8x16_t s1 = vld1q_u8(src + j + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); j += 32; } while (j < w); src += src_stride; ref_offset += ref_stride; } while (++i < h_limit); sum[0] = vpadalq_u16(sum[0], sum_lo[0]); sum[0] = vpadalq_u16(sum[0], sum_hi[0]); sum[1] = vpadalq_u16(sum[1], sum_lo[1]); sum[1] = vpadalq_u16(sum[1], sum_hi[1]); sum[2] = vpadalq_u16(sum[2], sum_lo[2]); sum[2] = vpadalq_u16(sum[2], sum_hi[2]); sum[3] = vpadalq_u16(sum[3], sum_lo[3]); sum[3] = vpadalq_u16(sum[3], sum_hi[3]); h_limit += h_overflow; } while (i < h); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); } static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); } static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; int ref_offset = 0; int i = h; do { const uint8x16_t s0 = vld1q_u8(src); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]); const uint8x16_t s1 = vld1q_u8(src + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]); src += src_stride; ref_offset += ref_stride; } while (--i 
!= 0); vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); } static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; uint32x4_t sum_u32[4]; int ref_offset = 0; int i = h; do { const uint8x16_t s = vld1q_u8(src); sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]); sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]); sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]); sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]); src += src_stride; ref_offset += ref_stride; } while (--i != 0); sum_u32[0] = vpaddlq_u16(sum_u16[0]); sum_u32[1] = vpaddlq_u16(sum_u16[1]); sum_u32[2] = vpaddlq_u16(sum_u16[2]); sum_u32[3] = vpaddlq_u16(sum_u16[3]); vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); } static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint16x8_t sum[4]; uint8x8_t s = vld1_u8(src); sum[0] = vabdl_u8(s, vld1_u8(ref[0])); sum[1] = vabdl_u8(s, vld1_u8(ref[1])); sum[2] = vabdl_u8(s, vld1_u8(ref[2])); sum[3] = vabdl_u8(s, vld1_u8(ref[3])); src += src_stride; int ref_offset = ref_stride; int i = h - 1; do { s = vld1_u8(src); sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset)); src += src_stride; ref_offset += ref_stride; } while (--i != 0); vst1q_u32(res, horizontal_add_4d_u16x8(sum)); } static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint16x8_t sum[4]; uint8x8_t s = load_unaligned_u8(src, src_stride); uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride); sum[0] = vabdl_u8(s, r0); sum[1] = vabdl_u8(s, r1); sum[2] = vabdl_u8(s, r2); sum[3] = vabdl_u8(s, r3); src += 2 * src_stride; int ref_offset = 2 * ref_stride; int i = h / 2; while (--i != 0) { s = load_unaligned_u8(src, src_stride); r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); sum[0] = vabal_u8(sum[0], s, r0); sum[1] = vabal_u8(sum[1], s, r1); sum[2] = vabal_u8(sum[2], s, r2); sum[3] = vabal_u8(sum[3], s, r3); src += 2 * src_stride; ref_offset += 2 * ref_stride; } vst1q_u32(res, horizontal_add_4d_u16x8(sum)); } #define SAD_WXH_4D_NEON(w, h) \ void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ } SAD_WXH_4D_NEON(4, 4) SAD_WXH_4D_NEON(4, 8) SAD_WXH_4D_NEON(8, 4) SAD_WXH_4D_NEON(8, 8) SAD_WXH_4D_NEON(8, 16) SAD_WXH_4D_NEON(16, 8) SAD_WXH_4D_NEON(16, 16) SAD_WXH_4D_NEON(16, 32) SAD_WXH_4D_NEON(32, 16) SAD_WXH_4D_NEON(32, 32) SAD_WXH_4D_NEON(32, 64) SAD_WXH_4D_NEON(64, 32) SAD_WXH_4D_NEON(64, 64) SAD_WXH_4D_NEON(64, 128) SAD_WXH_4D_NEON(128, 64) SAD_WXH_4D_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_4D_NEON(4, 16) SAD_WXH_4D_NEON(8, 32) SAD_WXH_4D_NEON(16, 4) SAD_WXH_4D_NEON(16, 64) 
SAD_WXH_4D_NEON(32, 8) SAD_WXH_4D_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_4D_NEON #define SAD_SKIP_WXH_4D_NEON(w, h) \ void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \ ((h) >> 1)); \ res[0] <<= 1; \ res[1] <<= 1; \ res[2] <<= 1; \ res[3] <<= 1; \ } SAD_SKIP_WXH_4D_NEON(8, 16) SAD_SKIP_WXH_4D_NEON(16, 16) SAD_SKIP_WXH_4D_NEON(16, 32) SAD_SKIP_WXH_4D_NEON(32, 16) SAD_SKIP_WXH_4D_NEON(32, 32) SAD_SKIP_WXH_4D_NEON(32, 64) SAD_SKIP_WXH_4D_NEON(64, 32) SAD_SKIP_WXH_4D_NEON(64, 64) SAD_SKIP_WXH_4D_NEON(64, 128) SAD_SKIP_WXH_4D_NEON(128, 64) SAD_SKIP_WXH_4D_NEON(128, 128) #if !CONFIG_REALTIME_ONLY SAD_SKIP_WXH_4D_NEON(4, 16) SAD_SKIP_WXH_4D_NEON(8, 32) SAD_SKIP_WXH_4D_NEON(16, 64) SAD_SKIP_WXH_4D_NEON(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_SKIP_WXH_4D_NEON aom-3.12.1/aom_dsp/arm/sadxd_neon_dotprod.c000066400000000000000000000257021477627663500206310ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void sad16_neon(uint8x16_t src, uint8x16_t ref, uint32x4_t *const sad_sum) { uint8x16_t abs_diff = vabdq_u8(src, ref); *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); } static inline void sadwxhx3d_large_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int w, int h) { uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int ref_offset = 0; int i = h; do { int j = 0; do { const uint8x16_t s0 = vld1q_u8(src + j); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); const uint8x16_t s1 = vld1q_u8(src + j + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); j += 32; } while (j < w); src += src_stride; ref_offset += ref_stride; } while (--i != 0); res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0])); res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1])); res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); } static inline void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); } static inline void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); } static 
inline void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); } static inline void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int ref_offset = 0; int i = h; do { const uint8x16_t s = vld1q_u8(src); sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); src += src_stride; ref_offset += ref_stride; } while (--i != 0); res[0] = horizontal_add_u32x4(sum[0]); res[1] = horizontal_add_u32x4(sum[1]); res[2] = horizontal_add_u32x4(sum[2]); } #define SAD_WXH_3D_NEON_DOTPROD(w, h) \ void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ } SAD_WXH_3D_NEON_DOTPROD(16, 8) SAD_WXH_3D_NEON_DOTPROD(16, 16) SAD_WXH_3D_NEON_DOTPROD(16, 32) SAD_WXH_3D_NEON_DOTPROD(32, 16) SAD_WXH_3D_NEON_DOTPROD(32, 32) SAD_WXH_3D_NEON_DOTPROD(32, 64) SAD_WXH_3D_NEON_DOTPROD(64, 32) SAD_WXH_3D_NEON_DOTPROD(64, 64) SAD_WXH_3D_NEON_DOTPROD(64, 128) SAD_WXH_3D_NEON_DOTPROD(128, 64) SAD_WXH_3D_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_3D_NEON_DOTPROD(16, 4) SAD_WXH_3D_NEON_DOTPROD(16, 64) SAD_WXH_3D_NEON_DOTPROD(32, 8) SAD_WXH_3D_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_3D_NEON_DOTPROD static inline void sadwxhx4d_large_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int w, int h) { uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint32x4_t sum[4]; int ref_offset = 0; int i = h; do { int j = 0; do { const uint8x16_t s0 = vld1q_u8(src + j); sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); const uint8x16_t s1 = vld1q_u8(src + j + 16); sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); j += 32; } while (j < w); src += src_stride; ref_offset += ref_stride; } while (--i != 0); sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } static inline void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); } static inline void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); } static inline void sad32xhx4d_neon_dotprod(const uint8_t *src, int 
src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); } static inline void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; int ref_offset = 0; int i = h; do { const uint8x16_t s = vld1q_u8(src); sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]); src += src_stride; ref_offset += ref_stride; } while (--i != 0); vst1q_u32(res, horizontal_add_4d_u32x4(sum)); } #define SAD_WXH_4D_NEON_DOTPROD(w, h) \ void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ } SAD_WXH_4D_NEON_DOTPROD(16, 8) SAD_WXH_4D_NEON_DOTPROD(16, 16) SAD_WXH_4D_NEON_DOTPROD(16, 32) SAD_WXH_4D_NEON_DOTPROD(32, 16) SAD_WXH_4D_NEON_DOTPROD(32, 32) SAD_WXH_4D_NEON_DOTPROD(32, 64) SAD_WXH_4D_NEON_DOTPROD(64, 32) SAD_WXH_4D_NEON_DOTPROD(64, 64) SAD_WXH_4D_NEON_DOTPROD(64, 128) SAD_WXH_4D_NEON_DOTPROD(128, 64) SAD_WXH_4D_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_WXH_4D_NEON_DOTPROD(16, 4) SAD_WXH_4D_NEON_DOTPROD(16, 64) SAD_WXH_4D_NEON_DOTPROD(32, 8) SAD_WXH_4D_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_WXH_4D_NEON_DOTPROD #define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \ ((h) >> 1)); \ res[0] <<= 1; \ res[1] <<= 1; \ res[2] <<= 1; \ res[3] <<= 1; \ } SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128) SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64) SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128) #if !CONFIG_REALTIME_ONLY SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64) SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16) #endif // !CONFIG_REALTIME_ONLY #undef SAD_SKIP_WXH_4D_NEON_DOTPROD aom-3.12.1/aom_dsp/arm/sse_neon.c000066400000000000000000000150771477627663500165710ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, uint32x4_t *sse) { uint8x16_t s = vld1q_u8(src); uint8x16_t r = vld1q_u8(ref); uint8x16_t abs_diff = vabdq_u8(s, r); uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); } static inline void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, uint32x4_t *sse) { uint8x8_t s = vld1_u8(src); uint8x8_t r = vld1_u8(ref); uint8x8_t abs_diff = vabd_u8(s, r); *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); } static inline void sse_4x2_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32x4_t *sse) { uint8x8_t s = load_unaligned_u8(src, src_stride); uint8x8_t r = load_unaligned_u8(ref, ref_stride); uint8x8_t abs_diff = vabd_u8(s, r); *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); } static inline uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height) { uint32x4_t sse = vdupq_n_u32(0); if ((width & 0x07) && ((width & 0x07) < 5)) { int i = height; do { int j = 0; do { sse_8x1_neon(src + j, ref + j, &sse); sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); j += 8; } while (j + 4 < width); sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); } else { int i = height; do { int j = 0; do { sse_8x1_neon(src + j, ref + j, &sse); j += 8; } while (j < width); src += src_stride; ref += ref_stride; } while (--i != 0); } return horizontal_add_u32x4(sse); } static inline uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon(src, ref, &sse[0]); sse_16x1_neon(src + 16, ref + 16, &sse[1]); sse_16x1_neon(src + 32, ref + 32, &sse[0]); sse_16x1_neon(src + 48, ref + 48, &sse[1]); sse_16x1_neon(src + 64, ref + 64, &sse[0]); sse_16x1_neon(src + 80, ref + 80, &sse[1]); sse_16x1_neon(src + 96, ref + 96, &sse[0]); sse_16x1_neon(src + 112, ref + 112, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon(src, ref, &sse[0]); sse_16x1_neon(src + 16, ref + 16, &sse[1]); sse_16x1_neon(src + 32, ref + 32, &sse[0]); sse_16x1_neon(src + 48, ref + 48, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon(src, ref, &sse[0]); sse_16x1_neon(src + 16, ref + 16, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do {
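// Editor's note (added, not in the original source): this loop handles two
// rows per iteration, accumulating each row of the pair into its own
// uint32x4_t accumulator (sse[0] and sse[1]) so the vpadalq_u16 operations
// form two independent dependency chains; the accumulators are merged once
// after the loop.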
sse_16x1_neon(src, ref, &sse[0]); src += src_stride; ref += ref_stride; sse_16x1_neon(src, ref, &sse[1]); src += src_stride; ref += ref_stride; i -= 2; } while (i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse = vdupq_n_u32(0); int i = height; do { sse_8x1_neon(src, ref, &sse); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(sse); } static inline uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse = vdupq_n_u32(0); int i = height; do { sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); return horizontal_add_u32x4(sse); } int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height) { switch (width) { case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); } } aom-3.12.1/aom_dsp/arm/sse_neon_dotprod.c000066400000000000000000000165101477627663500203150ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, uint32x4_t *sse) { uint8x16_t s = vld1q_u8(src); uint8x16_t r = vld1q_u8(ref); uint8x16_t abs_diff = vabdq_u8(s, r); *sse = vdotq_u32(*sse, abs_diff, abs_diff); } static inline void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, uint32x2_t *sse) { uint8x8_t s = vld1_u8(src); uint8x8_t r = vld1_u8(ref); uint8x8_t abs_diff = vabd_u8(s, r); *sse = vdot_u32(*sse, abs_diff, abs_diff); } static inline void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32x2_t *sse) { uint8x8_t s = load_unaligned_u8(src, src_stride); uint8x8_t r = load_unaligned_u8(ref, ref_stride); uint8x8_t abs_diff = vabd_u8(s, r); *sse = vdot_u32(*sse, abs_diff, abs_diff); } static inline uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height) { uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; if ((width & 0x07) && ((width & 0x07) < 5)) { int i = height; do { int j = 0; do { sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, &sse[1]); j += 8; } while (j + 4 < width); sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); } else { int i = height; do { int j = 0; do { sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, &sse[1]); j += 8; } while (j < width); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); } return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1])); } static inline uint32_t sse_128xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon_dotprod(src, ref, &sse[0]); sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon_dotprod(src, ref, &sse[0]); sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon_dotprod(src, ref, &sse[0]); sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); src += src_stride; ref += ref_stride; } while (--i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_16xh_neon_dotprod(const uint8_t
*src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = height; do { sse_16x1_neon_dotprod(src, ref, &sse[0]); src += src_stride; ref += ref_stride; sse_16x1_neon_dotprod(src, ref, &sse[1]); src += src_stride; ref += ref_stride; i -= 2; } while (i != 0); return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); } static inline uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; int i = height; do { sse_8x1_neon_dotprod(src, ref, &sse[0]); src += src_stride; ref += ref_stride; sse_8x1_neon_dotprod(src, ref, &sse[1]); src += src_stride; ref += ref_stride; i -= 2; } while (i != 0); return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1])); } static inline uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int height) { uint32x2_t sse = vdup_n_u32(0); int i = height; do { sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); return horizontal_add_u32x2(sse); } int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height) { switch (width) { case 4: return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 8: return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 16: return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 32: return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 64: return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); case 128: return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); default: return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, height); } } aom-3.12.1/aom_dsp/arm/subpel_variance_neon.c000066400000000000000000001167511477627663500211420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/variance.h" #include "aom_dsp/arm/mem_neon.h" static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); uint16x8_t blend = vmull_u8(s0, f0); blend = vmlal_u8(blend, s1, f1); uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); vst1_u8(dst_ptr, blend_u8); src_ptr += 2 * src_stride; dst_ptr += 2 * 4; i -= 2; } while (i != 0); } static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { uint8x8_t s0 = vld1_u8(src_ptr); uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); uint16x8_t blend = vmull_u8(s0, f0); blend = vmlal_u8(blend, s1, f1); uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); vst1_u8(dst_ptr, blend_u8); src_ptr += src_stride; dst_ptr += 8; } while (--i != 0); } static void var_filter_block2d_bil_large(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, int filter_offset) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { int j = 0; do { uint8x16_t s0 = vld1q_u8(src_ptr + j); uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0); blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1); uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0); blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1); uint8x16_t blend_u8 = vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); vst1q_u8(dst_ptr + j, blend_u8); j += 16; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset); } static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset); } static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset); } static void var_filter_block2d_bil_w128(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset) { var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset); } static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height) { // We only specialise on the filter values for large block sizes (>= 16x16.)
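// Editor's note (added, not in the original source): this helper covers the
// half-pel case. A filter_offset of 4 gives two equal bilinear taps (4/8 and
// 4/8), so the filter collapses to a rounding average and vrhaddq_u8
// computes (s0 + s1 + 1) >> 1 directly, with no multiply or shift needed.
// pixel_step selects the direction: 1 averages horizontal neighbours,
// src_stride averages vertical neighbours.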
assert(dst_width >= 16 && dst_width % 16 == 0); int i = dst_height; do { int j = 0; do { uint8x16_t s0 = vld1q_u8(src_ptr + j); uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); uint8x16_t avg = vrhaddq_u8(s0, s1); vst1q_u8(dst_ptr + j, avg); j += 16; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse) { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } #define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse) { \ if (xoffset == 0) { \ if (yoffset == 0) { \ return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp[w * h]; \ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp[w * h]; \ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ yoffset); \ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) 
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) // Realtime mode doesn't use 4x rectangular blocks. #if !CONFIG_REALTIME_ONLY SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1) #endif // !CONFIG_REALTIME_ONLY #undef SUBPEL_VARIANCE_WXH_NEON #undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON // Combine bilinear filter with aom_comp_avg_pred for blocks having width 4. static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); uint16x8_t blend = vmull_u8(s0, f0); blend = vmlal_u8(blend, s1, f1); uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); uint8x8_t p = vld1_u8(second_pred); uint8x8_t avg = vrhadd_u8(blend_u8, p); vst1_u8(dst_ptr, avg); src_ptr += 2 * src_stride; dst_ptr += 2 * 4; second_pred += 2 * 4; i -= 2; } while (i != 0); } // Combine bilinear filter with aom_comp_avg_pred for blocks having width 8. static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { uint8x8_t s0 = vld1_u8(src_ptr); uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); uint16x8_t blend = vmull_u8(s0, f0); blend = vmlal_u8(blend, s1, f1); uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); uint8x8_t p = vld1_u8(second_pred); uint8x8_t avg = vrhadd_u8(blend_u8, p); vst1_u8(dst_ptr, avg); src_ptr += src_stride; dst_ptr += 8; second_pred += 8; } while (--i > 0); } // Combine bilinear filter with aom_comp_avg_pred for large blocks. static void avg_pred_var_filter_block2d_bil_large( const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, int filter_offset, const uint8_t *second_pred) { const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); const uint8x8_t f1 = vdup_n_u8(filter_offset); int i = dst_height; do { int j = 0; do { uint8x16_t s0 = vld1q_u8(src_ptr + j); uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0); blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1); uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0); blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1); uint8x16_t blend_u8 = vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); uint8x16_t p = vld1q_u8(second_pred); uint8x16_t avg = vrhaddq_u8(blend_u8, p); vst1q_u8(dst_ptr + j, avg); j += 16; second_pred += 16; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } // Combine bilinear filter with aom_comp_avg_pred for blocks having width 16. 
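// Editor's note (added): the w16/w32/w64/w128 wrappers below simply forward
// to avg_pred_var_filter_block2d_bil_large with the width baked in, so the
// inner 16-pixel-wide loop can be shared across all large block widths.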
static void avg_pred_var_filter_block2d_bil_w16( const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset, second_pred); } // Combine bilinear filter with aom_comp_avg_pred for blocks having width 32. static void avg_pred_var_filter_block2d_bil_w32( const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset, second_pred); } // Combine bilinear filter with aom_comp_avg_pred for blocks having width 64. static void avg_pred_var_filter_block2d_bil_w64( const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset, second_pred); } // Combine bilinear filter with aom_comp_avg_pred for blocks having width 128. static void avg_pred_var_filter_block2d_bil_w128( const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_height, int filter_offset, const uint8_t *second_pred) { avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset, second_pred); } // Combine averaging subpel filter with aom_comp_avg_pred. static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, int dst_width, int dst_height, const uint8_t *second_pred) { // We only specialise on the filter values for large block sizes (>= 16x16.) assert(dst_width >= 16 && dst_width % 16 == 0); int i = dst_height; do { int j = 0; do { uint8x16_t s0 = vld1q_u8(src_ptr + j); uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); uint8x16_t avg = vrhaddq_u8(s0, s1); uint8x16_t p = vld1q_u8(second_pred); avg = vrhaddq_u8(avg, p); vst1q_u8(dst_ptr + j, avg); j += 16; second_pred += 16; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } // Implementation of aom_comp_avg_pred for blocks having width >= 16. static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int dst_width, int dst_height, const uint8_t *second_pred) { // We only specialise on the filter values for large block sizes (>= 16x16.) 
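// Editor's note (added): this is the no-filter case of the compound
// prediction path. Each output pixel is the rounding average
// (src + second_pred + 1) >> 1 -- e.g. src = 100 and second_pred = 103
// give 102 -- computed 16 pixels at a time with vrhaddq_u8, mirroring what
// aom_comp_avg_pred does.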
assert(dst_width >= 16 && dst_width % 16 == 0); int i = dst_height; do { int j = 0; do { uint8x16_t s = vld1q_u8(src_ptr + j); uint8x16_t p = vld1q_u8(second_pred); uint8x16_t avg = vrhaddq_u8(s, p); vst1q_u8(dst_ptr + j, avg); j += 16; second_pred += 16; } while (j < dst_width); src_ptr += src_stride; dst_ptr += dst_width; } while (--i != 0); } #define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } #define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \ const uint8_t *src, int source_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, unsigned int *sse, \ const uint8_t *second_pred) { \ if (xoffset == 0) { \ uint8_t tmp[w * h]; \ if (yoffset == 0) { \ avg_pred(src, tmp, source_stride, w, h, second_pred); \ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ source_stride, w, h, second_pred); \ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ avg_pred_var_filter_block2d_bil_w##w( \ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ second_pred); \ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ xoffset, second_pred); \ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) 
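// Editor's note (added, illustrative only): block sizes of 16x16 and above
// use the specialized macro below, which adds fast paths when xoffset or
// yoffset is 0 (integer-pel, no filtering) or 4 (half-pel, plain averaging);
// all other offsets fall back to the two-pass bilinear filter. The smaller
// blocks instantiated above always take the generic two-pass path. As a
// rough sketch, the generated 16x16 function behaves like:
//   tmp0 = bilinear_filter_horizontal(src, xoffset);     // h + 1 rows
//   tmp1 = avg_with_second_pred(bilinear_filter_vertical(tmp0, yoffset));
//   return aom_variance16x16(tmp1, 16, ref, ref_stride, sse);
// where the two helper names are descriptive placeholders, not the real
// symbols used in this file.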
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1) #if !CONFIG_REALTIME_ONLY SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2) SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1) SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1) SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1) #endif // !CONFIG_REALTIME_ONLY #undef SUBPEL_AVG_VARIANCE_WXH_NEON #undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON #if !CONFIG_REALTIME_ONLY #define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse); \ } #define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ if (xoffset == 0) { \ if (yoffset == 0) { \ return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \ sse); \ } else if (yoffset == 4) { \ uint8_t tmp[w * h]; \ var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h); \ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \ } else { \ uint8_t tmp[w * h]; \ var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h, \ yoffset); \ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h); \ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset); \ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ xoffset); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, 
w, w, h, yoffset); \ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ } \ } \ } OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) #undef OBMC_SUBPEL_VARIANCE_WXH_NEON #undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON #endif // !CONFIG_REALTIME_ONLY #define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * h]; \ uint8_t tmp2[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \ invert_mask); \ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ } #define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ if (xoffset == 0) { \ uint8_t tmp0[w * h]; \ if (yoffset == 0) { \ aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \ yoffset); \ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ uint8_t tmp2[w * h]; \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + 
padding)); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ uint8_t tmp2[w * h]; \ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ } \ } else { \ if (yoffset == 0) { \ uint8_t tmp0[w * h]; \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * h]; \ uint8_t tmp2[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp0[w * (h + padding)]; \ uint8_t tmp1[w * (h + padding)]; \ uint8_t tmp2[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ xoffset); \ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ msk_stride, invert_mask); \ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ } \ } \ } MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) // Realtime mode doesn't use 4x rectangular blocks. #if !CONFIG_REALTIME_ONLY MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1) #endif // !CONFIG_REALTIME_ONLY #undef MASKED_SUBPEL_VARIANCE_WXH_NEON #undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON aom-3.12.1/aom_dsp/arm/subtract_neon.c000066400000000000000000000145121477627663500176170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" void aom_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { if (cols > 16) { int r = rows; do { int c = 0; do { const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); c += 32; } while (c < cols); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } else if (cols > 8) { int r = rows; do { const uint8x16_t v_src = vld1q_u8(&src[0]); const uint8x16_t v_pred = vld1q_u8(&pred[0]); const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } else if (cols > 4) { int r = rows; do { const uint8x8_t v_src = vld1_u8(&src[0]); const uint8x8_t v_pred = vld1_u8(&pred[0]); const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } else { int r = rows; do { int c = 0; do { diff[c] = src[c] - pred[c]; } while (++c < cols); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); if (cols > 16) { int r = rows; do { int c = 0; do { const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]); const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]); const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00); const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]); const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]); const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00)); vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08)); c += 16; } while (c < cols); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } else if (cols > 8) { int r = rows; do { const uint16x8_t v_src_00 = vld1q_u16(&src[0]); const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]); const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
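// Editor's note (added): high bit-depth pixels are at most 12 bits, so the
// unsigned 16-bit subtraction above wraps to exactly the signed difference
// once the result is reinterpreted as int16_t below; no widening subtract
// is required.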
const uint16x8_t v_src_08 = vld1q_u16(&src[8]); const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]); const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08)); diff += diff_stride; pred += pred_stride; src += src_stride; } while (--r != 0); } else if (cols > 4) { int r = rows; do { const uint16x8_t v_src_r0 = vld1q_u16(&src[0]); const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]); const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]); const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]); const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0); const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0)); vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1)); diff += diff_stride << 1; pred += pred_stride << 1; src += src_stride << 1; r -= 2; } while (r != 0); } else { int r = rows; do { const uint16x4_t v_src_r0 = vld1_u16(&src[0]); const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]); const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]); const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]); const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0); const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1); vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0)); vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1)); diff += diff_stride << 1; pred += pred_stride << 1; src += src_stride << 1; r -= 2; } while (r != 0); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/arm/sum_neon.h000066400000000000000000000241071477627663500166020ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_ #define AOM_AOM_DSP_ARM_SUM_NEON_H_ #include "config/aom_dsp_rtcd.h" #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" static inline int horizontal_add_u8x8(const uint8x8_t a) { #if AOM_ARCH_AARCH64 return vaddlv_u8(a); #else uint16x4_t b = vpaddl_u8(a); uint32x2_t c = vpaddl_u16(b); return vget_lane_u32(c, 0) + vget_lane_u32(c, 1); #endif } static inline int horizontal_add_s16x8(const int16x8_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_s16(a); #else const int32x4_t b = vpaddlq_s16(a); const int64x2_t c = vpaddlq_s32(b); const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), vreinterpret_s32_s64(vget_high_s64(c))); return vget_lane_s32(d, 0); #endif } static inline int horizontal_add_s32x4(const int32x4_t a) { #if AOM_ARCH_AARCH64 return vaddvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), vreinterpret_s32_s64(vget_high_s64(b))); return vget_lane_s32(c, 0); #endif } static inline int64_t horizontal_add_s64x2(const int64x2_t a) { #if AOM_ARCH_AARCH64 return vaddvq_s64(a); #else return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); #endif } static inline uint64_t horizontal_add_u64x2(const uint64x2_t a) { #if AOM_ARCH_AARCH64 return vaddvq_u64(a); #else return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); #endif } static inline uint64_t horizontal_long_add_u32x4(const uint32x4_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); #endif } static inline int64_t horizontal_long_add_s32x4(const int32x4_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1); #endif } static inline uint32_t horizontal_add_u32x4(const uint32x4_t a) { #if AOM_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); #endif } static inline uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) { #if AOM_ARCH_AARCH64 uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); return vpaddq_u32(res01, res23); #else uint32x4_t res = vdupq_n_u32(0); res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0); res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1); res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2); res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3); return res; #endif } static inline int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) { #if AOM_ARCH_AARCH64 int32x4_t res01 = vpaddq_s32(sum[0], sum[1]); int32x4_t res23 = vpaddq_s32(sum[2], sum[3]); return vpaddq_s32(res01, res23); #else int32x4_t res = vdupq_n_s32(0); res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0); res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1); res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2); res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3); return res; #endif } static inline uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { #if AOM_ARCH_AARCH64 return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); #else const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); const uint32x4_t a = vaddq_u32(vec_l_lo, 
vec_l_hi); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); #endif } static inline uint32x4_t horizontal_long_add_4d_u16x8( const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); #if AOM_ARCH_AARCH64 const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); return vpaddq_u32(c0, c1); #else const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); return vcombine_u32(d0, d1); #endif } static inline uint32_t horizontal_add_u16x8(const uint16x8_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_u16(a); #else const uint32x4_t b = vpaddlq_u16(a); const uint64x2_t c = vpaddlq_u32(b); const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), vreinterpret_u32_u64(vget_high_u64(c))); return vget_lane_u32(d, 0); #endif } static inline uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) { #if AOM_ARCH_AARCH64 const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint16x8_t b0 = vpaddq_u16(a0, a1); return vpaddlq_u16(b0); #else const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); const uint16x4_t b0 = vpadd_u16(a0, a1); const uint16x4_t b1 = vpadd_u16(a2, a3); return vpaddlq_u16(vcombine_u16(b0, b1)); #endif } static inline int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) { #if AOM_ARCH_AARCH64 const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]); const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]); const int16x8_t b0 = vpaddq_s16(a0, a1); return vpaddlq_s16(b0); #else const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0])); const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1])); const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2])); const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3])); const int16x4_t b0 = vpadd_s16(a0, a1); const int16x4_t b1 = vpadd_s16(a2, a3); return vpaddlq_s16(vcombine_s16(b0, b1)); #endif } static inline uint32_t horizontal_add_u32x2(const uint32x2_t a) { #if AOM_ARCH_AARCH64 return vaddv_u32(a); #else const uint64x1_t b = vpaddl_u32(a); return vget_lane_u32(vreinterpret_u32_u64(b), 0); #endif } static inline uint64_t horizontal_long_add_u32x2(const uint32x2_t a) { #if AOM_ARCH_AARCH64 return vaddlv_u32(a); #else const uint64x1_t b = vpaddl_u32(a); return vget_lane_u64(b, 0); #endif } static inline uint32_t horizontal_add_u16x4(const uint16x4_t a) { #if AOM_ARCH_AARCH64 return vaddlv_u16(a); #else const uint32x2_t b = vpaddl_u16(a); const uint64x1_t c = 
vpaddl_u32(b); return vget_lane_u32(vreinterpret_u32_u64(c), 0); #endif } static inline int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) { #if AOM_ARCH_AARCH64 return vpaddq_s32(a, b); #else const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); return vcombine_s32(a0, b0); #endif } static inline int32x2_t add_pairwise_s32x4(int32x4_t a) { #if AOM_ARCH_AARCH64 return vget_low_s32(vpaddq_s32(a, a)); #else return vpadd_s32(vget_low_s32(a), vget_high_s32(a)); #endif } static inline uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) { return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]); } static inline uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) { uint64x2_t sum = vpaddlq_u32(a[0]); sum = vpadalq_u32(sum, a[1]); sum = vpadalq_u32(sum, a[2]); sum = vpadalq_u32(sum, a[3]); return horizontal_add_u64x2(sum); } static inline uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) { uint64x2_t sum[2]; sum[0] = vpaddlq_u32(a[0]); sum[1] = vpaddlq_u32(a[1]); sum[0] = vpadalq_u32(sum[0], a[2]); sum[1] = vpadalq_u32(sum[1], a[3]); sum[0] = vpadalq_u32(sum[0], a[4]); sum[1] = vpadalq_u32(sum[1], a[5]); sum[0] = vpadalq_u32(sum[0], a[6]); sum[1] = vpadalq_u32(sum[1], a[7]); return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1])); } static inline uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) { uint64x2_t sum[2]; sum[0] = vpaddlq_u32(a[0]); sum[1] = vpaddlq_u32(a[1]); sum[0] = vpadalq_u32(sum[0], a[2]); sum[1] = vpadalq_u32(sum[1], a[3]); sum[0] = vpadalq_u32(sum[0], a[4]); sum[1] = vpadalq_u32(sum[1], a[5]); sum[0] = vpadalq_u32(sum[0], a[6]); sum[1] = vpadalq_u32(sum[1], a[7]); sum[0] = vpadalq_u32(sum[0], a[8]); sum[1] = vpadalq_u32(sum[1], a[9]); sum[0] = vpadalq_u32(sum[0], a[10]); sum[1] = vpadalq_u32(sum[1], a[11]); sum[0] = vpadalq_u32(sum[0], a[12]); sum[1] = vpadalq_u32(sum[1], a[13]); sum[0] = vpadalq_u32(sum[0], a[14]); sum[1] = vpadalq_u32(sum[1], a[15]); return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1])); } #endif // AOM_AOM_DSP_ARM_SUM_NEON_H_ aom-3.12.1/aom_dsp/arm/sum_squares_neon.c000066400000000000000000000437631477627663500203510ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> #include <assert.h> #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) { int16x4_t s0 = vld1_s16(src + 0 * stride); int16x4_t s1 = vld1_s16(src + 1 * stride); int16x4_t s2 = vld1_s16(src + 2 * stride); int16x4_t s3 = vld1_s16(src + 3 * stride); int32x4_t sum_squares = vmull_s16(s0, s0); sum_squares = vmlal_s16(sum_squares, s1, s1); sum_squares = vmlal_s16(sum_squares, s2, s2); sum_squares = vmlal_s16(sum_squares, s3, s3); return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares)); } static inline uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride, int height) { int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int h = height; do { int16x4_t s0 = vld1_s16(src + 0 * stride); int16x4_t s1 = vld1_s16(src + 1 * stride); int16x4_t s2 = vld1_s16(src + 2 * stride); int16x4_t s3 = vld1_s16(src + 3 * stride); sum_squares[0] = vmlal_s16(sum_squares[0], s0, s0); sum_squares[0] = vmlal_s16(sum_squares[0], s1, s1); sum_squares[1] = vmlal_s16(sum_squares[1], s2, s2); sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3); src += 4 * stride; h -= 4; } while (h != 0); return horizontal_long_add_u32x4( vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1]))); } static inline uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride, int width, int height) { uint64x2_t sum_squares = vdupq_n_u64(0); int h = height; do { int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int w = 0; do { const int16_t *s = src + w; int16x8_t s0 = vld1q_s16(s + 0 * stride); int16x8_t s1 = vld1q_s16(s + 1 * stride); int16x8_t s2 = vld1q_s16(s + 2 * stride); int16x8_t s3 = vld1q_s16(s + 3 * stride); ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0)); ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1)); ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2)); ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3)); ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0)); ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1)); ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2)); ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3)); w += 8; } while (w < width); sum_squares = vpadalq_u32( sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1]))); src += 4 * stride; h -= 4; } while (h != 0); return horizontal_add_u64x2(sum_squares); } uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width, int height) { // 4 elements per row only requires half an SIMD register, so this // must be a special case, but also note that over 75% of all calls // are with size == 4, so it is also the common case.
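// Editor's note (added, illustrative only): a typical caller sums the
// squared residual of a transform block, e.g.
//   int16_t diff[4 * 4];
//   // ... fill diff with source - prediction ...
//   uint64_t ssq = aom_sum_squares_2d_i16_neon(diff, 4, 4, 4);
// which the dispatch below routes to the dedicated 4x4 kernel; other shapes
// fall through to the 4xN, NxN or C implementations.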
if (LIKELY(width == 4 && height == 4)) { return aom_sum_squares_2d_i16_4x4_neon(src, stride); } else if (LIKELY(width == 4 && (height & 3) == 0)) { return aom_sum_squares_2d_i16_4xn_neon(src, stride, height); } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height); } else { return aom_sum_squares_2d_i16_c(src, stride, width, height); } } static inline uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src, int stride, int *sum) { int16x4_t s0 = vld1_s16(src + 0 * stride); int16x4_t s1 = vld1_s16(src + 1 * stride); int16x4_t s2 = vld1_s16(src + 2 * stride); int16x4_t s3 = vld1_s16(src + 3 * stride); int32x4_t sse = vmull_s16(s0, s0); sse = vmlal_s16(sse, s1, s1); sse = vmlal_s16(sse, s2, s2); sse = vmlal_s16(sse, s3, s3); int32x4_t sum_01 = vaddl_s16(s0, s1); int32x4_t sum_23 = vaddl_s16(s2, s3); *sum += horizontal_add_s32x4(vaddq_s32(sum_01, sum_23)); return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse)); } static inline uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src, int stride, int height, int *sum) { int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) }; int h = height; do { int16x4_t s0 = vld1_s16(src + 0 * stride); int16x4_t s1 = vld1_s16(src + 1 * stride); int16x4_t s2 = vld1_s16(src + 2 * stride); int16x4_t s3 = vld1_s16(src + 3 * stride); sse[0] = vmlal_s16(sse[0], s0, s0); sse[0] = vmlal_s16(sse[0], s1, s1); sse[1] = vmlal_s16(sse[1], s2, s2); sse[1] = vmlal_s16(sse[1], s3, s3); sum_acc[0] = vpadal_s16(sum_acc[0], s0); sum_acc[0] = vpadal_s16(sum_acc[0], s1); sum_acc[1] = vpadal_s16(sum_acc[1], s2); sum_acc[1] = vpadal_s16(sum_acc[1], s3); src += 4 * stride; h -= 4; } while (h != 0); *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1])); return horizontal_long_add_u32x4( vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1]))); } static inline uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src, int stride, int width, int height, int *sum) { uint64x2_t sse = vdupq_n_u64(0); int32x4_t sum_acc = vdupq_n_s32(0); int h = height; do { int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int w = 0; do { const int16_t *s = src + w; int16x8_t s0 = vld1q_s16(s + 0 * stride); int16x8_t s1 = vld1q_s16(s + 1 * stride); int16x8_t s2 = vld1q_s16(s + 2 * stride); int16x8_t s3 = vld1q_s16(s + 3 * stride); sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0)); sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1)); sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2)); sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3)); sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0)); sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1)); sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2)); sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3)); sum_acc = vpadalq_s16(sum_acc, s0); sum_acc = vpadalq_s16(sum_acc, s1); sum_acc = vpadalq_s16(sum_acc, s2); sum_acc = vpadalq_s16(sum_acc, s3); w += 8; } while (w < width); sse = vpadalq_u32(sse, vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1]))); src += 4 * stride; h -= 4; } while (h != 0); *sum += horizontal_add_s32x4(sum_acc); return horizontal_add_u64x2(sse); } uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width, int height, int *sum) { uint64_t sse; if (LIKELY(width == 4 && height == 4)) { sse = 
aom_sum_sse_2d_i16_4x4_neon(src, stride, sum); } else if (LIKELY(width == 4 && (height & 3) == 0)) { // width = 4, height is a multiple of 4. sse = aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum); } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case - width is multiple of 8, height is multiple of 4. sse = aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum); } else { sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); } return sse; } static inline uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src, uint32_t n) { uint64x2_t sum_u64 = vdupq_n_u64(0); int i = n; do { uint32x4_t sum; int16x4_t s0 = vld1_s16(src); sum = vreinterpretq_u32_s32(vmull_s16(s0, s0)); sum_u64 = vpadalq_u32(sum_u64, sum); src += 4; i -= 4; } while (i >= 4); if (i > 0) { return horizontal_add_u64x2(sum_u64) + aom_sum_squares_i16_c(src, i); } return horizontal_add_u64x2(sum_u64); } static inline uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src, uint32_t n) { uint64x2_t sum_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; int i = n; do { uint32x4_t sum[2]; int16x8_t s0 = vld1q_s16(src); sum[0] = vreinterpretq_u32_s32(vmull_s16(vget_low_s16(s0), vget_low_s16(s0))); sum[1] = vreinterpretq_u32_s32(vmull_s16(vget_high_s16(s0), vget_high_s16(s0))); sum_u64[0] = vpadalq_u32(sum_u64[0], sum[0]); sum_u64[1] = vpadalq_u32(sum_u64[1], sum[1]); src += 8; i -= 8; } while (i >= 8); if (i > 0) { return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])) + aom_sum_squares_i16_c(src, i); } return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])); } uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) { // This function seems to be called only for values of N >= 64. See // av1/encoder/compound_type.c. if (LIKELY(n >= 8)) { return aom_sum_squares_i16_8xn_neon(src, n); } if (n >= 4) { return aom_sum_squares_i16_4xn_neon(src, n); } return aom_sum_squares_i16_c(src, n); } static inline uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x2_t sum_u32 = vdup_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit // element before we need to accumulate to 32-bit elements. Since we're // accumulating in uint16x4_t vectors, this means we can accumulate up to 4 // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4 // * 256) / width. int h_limit = (4 * 256) / width; int h_tmp = height > h_limit ? h_limit : height; int h = 0; do { uint16x4_t sum_u16 = vdup_n_u16(0); do { uint8_t *src_ptr = src; int w = width; do { uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); sum_u16 = vpadal_u8(sum_u16, s0); uint16x8_t sse_u16 = vmull_u8(s0, s0); sse_u32 = vpadalq_u16(sse_u32, sse_u16); src_ptr += 8; w -= 8; } while (w >= 8); // Process remaining columns in the row using C. 
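      // Worked numbers for the headroom bound above (illustrative only): with
      // width == 4, h_limit = (4 * 256) / 4 = 256 rows, i.e. at most 1024
      // pixels per flush, or 256 pixels per 16-bit lane, and 256 * 255 = 65280
      // still fits in a uint16_t, so sum_u16 cannot overflow before it is
      // widened into sum_u32 below.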
while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += 2 * src_stride; h += 2; } while (h < h_tmp && h < height); sum_u32 = vpadal_u16(sum_u32, sum_u16); h_tmp += h_limit; } while (h < height); sum += horizontal_long_add_u32x2(sum_u32); sse += horizontal_long_add_u32x4(sse_u32); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x2_t sum_u32 = vdup_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit // element before we need to accumulate to 32-bit elements. Since we're // accumulating in uint16x4_t vectors, this means we can accumulate up to 4 // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4 // * 256) / width. int h_limit = (4 * 256) / width; int h_tmp = height > h_limit ? h_limit : height; int h = 0; do { uint16x4_t sum_u16 = vdup_n_u16(0); do { uint8_t *src_ptr = src; int w = width; do { uint8x8_t s0 = vld1_u8(src_ptr); sum_u16 = vpadal_u8(sum_u16, s0); uint16x8_t sse_u16 = vmull_u8(s0, s0); sse_u32 = vpadalq_u16(sse_u32, sse_u16); src_ptr += 8; w -= 8; } while (w >= 8); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += src_stride; ++h; } while (h < h_tmp && h < height); sum_u32 = vpadal_u16(sum_u32, sum_u16); h_tmp += h_limit; } while (h < height); sum += horizontal_long_add_u32x2(sum_u32); sse += horizontal_long_add_u32x4(sse_u32); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32 = vdupq_n_u32(0); uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit // element before we need to accumulate to 32-bit elements. Since we're // accumulating in uint16x8_t vectors, this means we can accumulate up to 8 // rows of 256 elements. Therefore the limit can be computed as: h_limit = (8 // * 256) / width. int h_limit = (8 * 256) / width; int h_tmp = height > h_limit ? h_limit : height; int h = 0; do { uint16x8_t sum_u16 = vdupq_n_u16(0); do { int w = width; uint8_t *src_ptr = src; do { uint8x16_t s0 = vld1q_u8(src_ptr); sum_u16 = vpadalq_u8(sum_u16, s0); uint16x8_t sse_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(s0)); uint16x8_t sse_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(s0)); sse_u32[0] = vpadalq_u16(sse_u32[0], sse_u16_lo); sse_u32[1] = vpadalq_u16(sse_u32[1], sse_u16_hi); src_ptr += 16; w -= 16; } while (w >= 16); // Process remaining columns in the row using C. 
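      // For example (hypothetical width of 24): the vector loop above covers
      // columns 0..15 of this row, and the scalar loop below handles the
      // remaining 8 columns before moving on to the next row.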
while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += src_stride; ++h; } while (h < h_tmp && h < height); sum_u32 = vpadalq_u16(sum_u32, sum_u16); h_tmp += h_limit; } while (h < height); sum += horizontal_long_add_u32x4(sum_u32); sse += horizontal_long_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); return sse - sum * sum / (width * height); } uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width, int height) { if (width >= 16) { return aom_var_2d_u8_16xh_neon(src, src_stride, width, height); } if (width >= 8) { return aom_var_2d_u8_8xh_neon(src, src_stride, width, height); } if (width >= 4 && height % 2 == 0) { return aom_var_2d_u8_4xh_neon(src, src_stride, width, height); } return aom_var_2d_u8_c(src, src_stride, width, height); } #if CONFIG_AV1_HIGHBITDEPTH static inline uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x2_t sum_u32 = vdup_n_u32(0); uint64x2_t sse_u64 = vdupq_n_u64(0); int h = height; do { int w = width; uint16_t *src_ptr = src_u16; do { uint16x4_t s0 = vld1_u16(src_ptr); sum_u32 = vpadal_u16(sum_u32, s0); uint32x4_t sse_u32 = vmull_u16(s0, s0); sse_u64 = vpadalq_u32(sse_u64, sse_u32); src_ptr += 4; w -= 4; } while (w >= 4); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint16_t v = src_u16[idx]; sum += v; sse += v * v; w--; } src_u16 += src_stride; } while (--h != 0); sum += horizontal_long_add_u32x2(sum_u32); sse += horizontal_add_u64x2(sse_u64); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32 = vdupq_n_u32(0); uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; int h = height; do { int w = width; uint16_t *src_ptr = src_u16; do { uint16x8_t s0 = vld1q_u16(src_ptr); sum_u32 = vpadalq_u16(sum_u32, s0); uint32x4_t sse_u32_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0)); uint32x4_t sse_u32_hi = vmull_u16(vget_high_u16(s0), vget_high_u16(s0)); sse_u64[0] = vpadalq_u32(sse_u64[0], sse_u32_lo); sse_u64[1] = vpadalq_u32(sse_u64[1], sse_u32_hi); src_ptr += 8; w -= 8; } while (w >= 8); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint16_t v = src_u16[idx]; sum += v; sse += v * v; w--; } src_u16 += src_stride; } while (--h != 0); sum += horizontal_long_add_u32x4(sum_u32); sse += horizontal_add_u64x2(vaddq_u64(sse_u64[0], sse_u64[1])); return sse - sum * sum / (width * height); } uint64_t aom_var_2d_u16_neon(uint8_t *src, int src_stride, int width, int height) { if (width >= 8) { return aom_var_2d_u16_8xh_neon(src, src_stride, width, height); } if (width >= 4) { return aom_var_2d_u16_4xh_neon(src, src_stride, width, height); } return aom_var_2d_u16_c(src, src_stride, width, height); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/arm/sum_squares_neon_dotprod.c000066400000000000000000000101151477627663500220650ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "config/aom_dsp_rtcd.h" static inline uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x2_t sum_u32 = vdup_n_u32(0); uint32x2_t sse_u32 = vdup_n_u32(0); int h = height / 2; do { int w = width; uint8_t *src_ptr = src; do { uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1)); sse_u32 = vdot_u32(sse_u32, s0, s0); src_ptr += 8; w -= 8; } while (w >= 8); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += 2 * src_stride; } while (--h != 0); sum += horizontal_long_add_u32x2(sum_u32); sse += horizontal_long_add_u32x2(sse_u32); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x2_t sum_u32 = vdup_n_u32(0); uint32x2_t sse_u32 = vdup_n_u32(0); int h = height; do { int w = width; uint8_t *src_ptr = src; do { uint8x8_t s0 = vld1_u8(src_ptr); sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1)); sse_u32 = vdot_u32(sse_u32, s0, s0); src_ptr += 8; w -= 8; } while (w >= 8); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += src_stride; } while (--h != 0); sum += horizontal_long_add_u32x2(sum_u32); sse += horizontal_long_add_u32x2(sse_u32); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src, int src_stride, int width, int height) { uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32 = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); int h = height; do { int w = width; uint8_t *src_ptr = src; do { uint8x16_t s0 = vld1q_u8(src_ptr); sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1)); sse_u32 = vdotq_u32(sse_u32, s0, s0); src_ptr += 16; w -= 16; } while (w >= 16); // Process remaining columns in the row using C. while (w > 0) { int idx = width - w; const uint8_t v = src[idx]; sum += v; sse += v * v; w--; } src += src_stride; } while (--h != 0); sum += horizontal_long_add_u32x4(sum_u32); sse += horizontal_long_add_u32x4(sse_u32); return sse - sum * sum / (width * height); } uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width, int height) { if (width >= 16) { return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height); } if (width >= 8) { return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height); } if (width >= 4 && height % 2 == 0) { return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height); } return aom_var_2d_u8_c(src, src_stride, width, height); } aom-3.12.1/aom_dsp/arm/sum_squares_sve.c000066400000000000000000000300051477627663500201700ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src, int stride, int height) { int64x2_t sum_squares = vdupq_n_s64(0); do { int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); sum_squares = aom_sdotq_s16(sum_squares, s, s); src += 2 * stride; height -= 2; } while (height != 0); return (uint64_t)vaddvq_s64(sum_squares); } static inline uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src, int stride, int height) { int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { int16x8_t s0 = vld1q_s16(src + 0 * stride); int16x8_t s1 = vld1q_s16(src + 1 * stride); sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); src += 2 * stride; height -= 2; } while (height != 0); sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); return (uint64_t)vaddvq_s64(sum_squares[0]); } static inline uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src, int stride, int width, int height) { int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { const int16_t *src_ptr = src; int w = width; do { int16x8_t s0 = vld1q_s16(src_ptr); int16x8_t s1 = vld1q_s16(src_ptr + 8); sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); src_ptr += 16; w -= 16; } while (w != 0); src += stride; } while (--height != 0); sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); return (uint64_t)vaddvq_s64(sum_squares[0]); } static inline uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src, int stride, int width, int height) { svint64_t sum_squares = svdup_n_s64(0); uint64_t step = svcnth(); do { const int16_t *src_ptr = src; int w = 0; do { svbool_t pred = svwhilelt_b16_u32(w, width); svint16_t s0 = svld1_s16(pred, src_ptr); sum_squares = svdot_s64(sum_squares, s0, s0); src_ptr += step; w += step; } while (w < width); src += stride; } while (--height != 0); return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares); } uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width, int height) { if (width == 4) { return aom_sum_squares_2d_i16_4xh_sve(src, stride, height); } if (width == 8) { return aom_sum_squares_2d_i16_8xh_sve(src, stride, height); } if (width % 16 == 0) { return aom_sum_squares_2d_i16_large_sve(src, stride, width, height); } return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height); } uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) { // This function seems to be called only for values of N >= 64. See // av1/encoder/compound_type.c. Additionally, because N = width x height for // width and height between the standard block sizes, N will also be a // multiple of 64. 
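  // For instance, n = 64 runs the unrolled loop below exactly twice (32 int16
  // elements per iteration), so no tail handling is needed on this path; any
  // n that is not a multiple of 64 falls back to aom_sum_squares_i16_c.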
if (LIKELY(n % 64 == 0)) { int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; do { int16x8_t s0 = vld1q_s16(src); int16x8_t s1 = vld1q_s16(src + 8); int16x8_t s2 = vld1q_s16(src + 16); int16x8_t s3 = vld1q_s16(src + 24); sum[0] = aom_sdotq_s16(sum[0], s0, s0); sum[1] = aom_sdotq_s16(sum[1], s1, s1); sum[2] = aom_sdotq_s16(sum[2], s2, s2); sum[3] = aom_sdotq_s16(sum[3], s3, s3); src += 32; n -= 32; } while (n != 0); sum[0] = vaddq_s64(sum[0], sum[1]); sum[2] = vaddq_s64(sum[2], sum[3]); sum[0] = vaddq_s64(sum[0], sum[2]); return vaddvq_s64(sum[0]); } return aom_sum_squares_i16_c(src, n); } static inline uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src, int stride, int height, int *sum) { int64x2_t sse = vdupq_n_s64(0); int32x4_t sum_s32 = vdupq_n_s32(0); do { int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); sse = aom_sdotq_s16(sse, s, s); sum_s32 = vpadalq_s16(sum_s32, s); src += 2 * stride; height -= 2; } while (height != 0); *sum += vaddvq_s32(sum_s32); return vaddvq_s64(sse); } static inline uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src, int stride, int height, int *sum) { int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { int16x8_t s0 = vld1q_s16(src); int16x8_t s1 = vld1q_s16(src + stride); sse[0] = aom_sdotq_s16(sse[0], s0, s0); sse[1] = aom_sdotq_s16(sse[1], s1, s1); sum_acc[0] = vpadalq_s16(sum_acc[0], s0); sum_acc[1] = vpadalq_s16(sum_acc[1], s1); src += 2 * stride; height -= 2; } while (height != 0); *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); return vaddvq_s64(vaddq_s64(sse[0], sse[1])); } static inline uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src, int stride, int width, int height, int *sum) { int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { int w = 0; do { int16x8_t s0 = vld1q_s16(src + w); int16x8_t s1 = vld1q_s16(src + w + 8); sse[0] = aom_sdotq_s16(sse[0], s0, s0); sse[1] = aom_sdotq_s16(sse[1], s1, s1); sum_acc[0] = vpadalq_s16(sum_acc[0], s0); sum_acc[1] = vpadalq_s16(sum_acc[1], s1); w += 16; } while (w < width); src += stride; } while (--height != 0); *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); return vaddvq_s64(vaddq_s64(sse[0], sse[1])); } uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width, int height, int *sum) { uint64_t sse; if (width == 4) { sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum); } else if (width == 8) { sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum); } else if (width % 16 == 0) { sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum); } else { sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); } return sse; } #if CONFIG_AV1_HIGHBITDEPTH static inline uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32 = vdupq_n_u32(0); uint64x2_t sse_u64 = vdupq_n_u64(0); int h = height; do { uint16x8_t s0 = vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride)); sum_u32 = vpadalq_u16(sum_u32, s0); sse_u64 = aom_udotq_u16(sse_u64, s0, s0); src_u16 += 2 * src_stride; h -= 2; } while (h != 0); sum += vaddlvq_u32(sum_u32); sse += vaddvq_u64(sse_u64); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = 
CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32 = vdupq_n_u32(0); uint64x2_t sse_u64 = vdupq_n_u64(0); int h = height; do { int w = width; uint16_t *src_ptr = src_u16; do { uint16x8_t s0 = vld1q_u16(src_ptr); sum_u32 = vpadalq_u16(sum_u32, s0); sse_u64 = aom_udotq_u16(sse_u64, s0, s0); src_ptr += 8; w -= 8; } while (w != 0); src_u16 += src_stride; } while (--h != 0); sum += vaddlvq_u32(sum_u32); sse += vaddvq_u64(sse_u64); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; int h = height; do { int w = width; uint16_t *src_ptr = src_u16; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + 8); sum_u32[0] = vpadalq_u16(sum_u32[0], s0); sum_u32[1] = vpadalq_u16(sum_u32[1], s1); sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); src_ptr += 16; w -= 16; } while (w != 0); src_u16 += src_stride; } while (--h != 0); sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); sum += vaddlvq_u32(sum_u32[0]); sse += vaddvq_u64(sse_u64[0]); return sse - sum * sum / (width * height); } static inline uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride, int width, int height) { uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); uint64_t sum = 0; uint64_t sse = 0; uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0) }; int h = height; do { int w = width; uint16_t *src_ptr = src_u16; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + 8); uint16x8_t s2 = vld1q_u16(src_ptr + 16); uint16x8_t s3 = vld1q_u16(src_ptr + 24); sum_u32[0] = vpadalq_u16(sum_u32[0], s0); sum_u32[1] = vpadalq_u16(sum_u32[1], s1); sum_u32[2] = vpadalq_u16(sum_u32[2], s2); sum_u32[3] = vpadalq_u16(sum_u32[3], s3); sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2); sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3); src_ptr += 32; w -= 32; } while (w != 0); src_u16 += src_stride; } while (--h != 0); sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]); sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]); sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]); sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]); sum += vaddlvq_u32(sum_u32[0]); sse += vaddvq_u64(sse_u64[0]); return sse - sum * sum / (width * height); } uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width, int height) { if (width == 4) { return aom_var_2d_u16_4xh_sve(src, src_stride, width, height); } if (width == 8) { return aom_var_2d_u16_8xh_sve(src, src_stride, width, height); } if (width == 16) { return aom_var_2d_u16_16xh_sve(src, src_stride, width, height); } if (width % 32 == 0) { return aom_var_2d_u16_large_sve(src, src_stride, width, height); } return aom_var_2d_u16_neon(src, src_stride, width, height); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/arm/transpose_neon.h000066400000000000000000001471061477627663500200210ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_ #define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_ #include #include "aom_dsp/aom_dsp_common.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" static inline void transpose_elems_u8_8x8( uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, uint8x8_t *o7) { // Swap 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // a4: 40 41 42 43 44 45 46 47 // a5: 50 51 52 53 54 55 56 57 // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5)); const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7)); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0])); const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1])); // Unzip 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } static inline void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, uint8x8_t *a6, uint8x8_t *a7) { transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3, a4, a5, a6, a7); } static inline void transpose_arrays_u8_8x8(const uint8x8_t *in, uint8x8_t *out) { transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], &out[0], &out[1], &out[2], &out[3], &out[4], &out[5], &out[6], &out[7]); } static 
AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x, uint8x16_t *d) { uint8x8x2_t w0 = vzip_u8(x[0], x[1]); uint8x8x2_t w1 = vzip_u8(x[2], x[3]); uint8x8x2_t w2 = vzip_u8(x[4], x[5]); uint8x8x2_t w3 = vzip_u8(x[6], x[7]); uint8x8x2_t w8 = vzip_u8(x[8], x[9]); uint8x8x2_t w9 = vzip_u8(x[10], x[11]); uint8x8x2_t w10 = vzip_u8(x[12], x[13]); uint8x8x2_t w11 = vzip_u8(x[14], x[15]); uint16x4x2_t w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); uint16x4x2_t w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); uint16x4x2_t w12 = vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), vreinterpret_u16_u8(w11.val[0])); uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), vreinterpret_u32_u16(w5.val[0])); uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), vreinterpret_u32_u16(w5.val[1])); uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), vreinterpret_u32_u16(w13.val[0])); uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), vreinterpret_u32_u16(w13.val[1])); // Store first 4-line result d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); w12 = vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), vreinterpret_u16_u8(w11.val[1])); w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), vreinterpret_u32_u16(w5.val[0])); w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), vreinterpret_u32_u16(w5.val[1])); w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), vreinterpret_u32_u16(w13.val[0])); w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), vreinterpret_u32_u16(w13.val[1])); // Store second 4-line result d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); } static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x, uint8x8_t *d) { uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), vreinterpretq_u16_u8(w1.val[0])); uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), vreinterpretq_u16_u8(w3.val[0])); uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), vreinterpretq_u16_u8(w1.val[1])); uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), vreinterpretq_u16_u8(w3.val[1])); uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), vreinterpretq_u32_u16(w5.val[0])); uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), vreinterpretq_u32_u16(w7.val[0])); uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), vreinterpretq_u32_u16(w5.val[1])); uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), vreinterpretq_u32_u16(w7.val[1])); d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0])); d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0])); d[2] = 
vreinterpret_u8_u32(vget_low_u32(w8.val[1])); d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1])); d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0])); d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0])); d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1])); d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1])); d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0])); d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0])); d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1])); d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1])); d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0])); d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0])); d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1])); d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1])); } static inline uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_u16_u64( vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); b0.val[1] = vreinterpretq_u16_u64( vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); #else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); #endif return b0; } static inline void transpose_arrays_u8_16x16(const uint8x16_t *x, uint8x16_t *d) { uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); uint8x16x2_t w4 = vzipq_u8(x[8], x[9]); uint8x16x2_t w5 = vzipq_u8(x[10], x[11]); uint8x16x2_t w6 = vzipq_u8(x[12], x[13]); uint8x16x2_t w7 = vzipq_u8(x[14], x[15]); uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), vreinterpretq_u16_u8(w1.val[0])); uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), vreinterpretq_u16_u8(w3.val[0])); uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), vreinterpretq_u16_u8(w5.val[0])); uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), vreinterpretq_u16_u8(w7.val[0])); uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), vreinterpretq_u32_u16(w9.val[0])); uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), vreinterpretq_u32_u16(w11.val[0])); uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), vreinterpretq_u32_u16(w9.val[1])); uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), vreinterpretq_u32_u16(w11.val[1])); uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); d[0] = vreinterpretq_u8_u16(d01.val[0]); d[1] = vreinterpretq_u8_u16(d01.val[1]); uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); d[2] = vreinterpretq_u8_u16(d23.val[0]); d[3] = vreinterpretq_u8_u16(d23.val[1]); uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); d[4] = vreinterpretq_u8_u16(d45.val[0]); d[5] = vreinterpretq_u8_u16(d45.val[1]); uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); d[6] = vreinterpretq_u8_u16(d67.val[0]); d[7] = vreinterpretq_u8_u16(d67.val[1]); // upper half w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), vreinterpretq_u16_u8(w1.val[1])); w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), vreinterpretq_u16_u8(w3.val[1])); w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), vreinterpretq_u16_u8(w5.val[1])); w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), vreinterpretq_u16_u8(w7.val[1])); w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), vreinterpretq_u32_u16(w9.val[0])); w13 = 
vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), vreinterpretq_u32_u16(w11.val[0])); w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), vreinterpretq_u32_u16(w9.val[1])); w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), vreinterpretq_u32_u16(w11.val[1])); d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); d[8] = vreinterpretq_u8_u16(d01.val[0]); d[9] = vreinterpretq_u8_u16(d01.val[1]); d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); d[10] = vreinterpretq_u8_u16(d23.val[0]); d[11] = vreinterpretq_u8_u16(d23.val[1]); d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); d[12] = vreinterpretq_u8_u16(d45.val[0]); d[13] = vreinterpretq_u8_u16(d45.val[1]); d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); d[14] = vreinterpretq_u8_u16(d67.val[0]); d[15] = vreinterpretq_u8_u16(d67.val[1]); } static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x, uint8x16_t *d) { uint8x16_t x2[32]; for (int i = 0; i < 16; ++i) { x2[i] = x[i].val[0]; x2[i + 16] = x[i].val[1]; } transpose_arrays_u8_16x16(x2, d); transpose_arrays_u8_16x16(x2 + 16, d + 16); } static inline void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3) { // Swap 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); *a0 = vreinterpret_u8_u16(c0.val[0]); *a1 = vreinterpret_u8_u16(c1.val[0]); *a2 = vreinterpret_u8_u16(c0.val[1]); *a3 = vreinterpret_u8_u16(c1.val[1]); } static inline void transpose_elems_inplace_u8_16x4(uint8x16_t *a0, uint8x16_t *a1, uint8x16_t *a2, uint8x16_t *a3) { // Swap 8 bit elements. 
Goes from: // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015 // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115 // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215 // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114 // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115 // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314 // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315 const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1); const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312 // c0.val[1]: 02 12 22 32 06 16 26 36 09 19 29 39 013 113 213 313 // c1.val[0]: 01 11 21 31 05 15 25 35 010 110 210 310 014 114 214 314 // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315 const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), vreinterpretq_u16_u8(b1.val[0])); const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), vreinterpretq_u16_u8(b1.val[1])); *a0 = vreinterpretq_u8_u16(c0.val[0]); *a1 = vreinterpretq_u8_u16(c1.val[0]); *a2 = vreinterpretq_u8_u16(c0.val[1]); *a3 = vreinterpretq_u8_u16(c1.val[1]); } static inline void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 10 11 12 13 // a1: 20 21 22 23 30 31 32 33 // to: // b0.val[0]: 00 01 20 21 10 11 30 31 // b0.val[1]: 02 03 22 23 12 13 32 33 const uint16x4x2_t b0 = vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); // Swap 32 bit elements resulting in: // c0.val[0]: 00 01 20 21 02 03 22 23 // c0.val[1]: 10 11 30 31 12 13 32 33 const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), vreinterpret_u32_u16(b0.val[1])); // Swap 8 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 const uint8x8x2_t d0 = vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); *a0 = d0.val[0]; *a1 = d0.val[1]; } static inline void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, uint8x8_t *o2, uint8x8_t *o3) { // Swap 32 bit elements. 
Goes from: // a0: 00 01 02 03 XX XX XX XX // a1: 10 11 12 13 XX XX XX XX // a2: 20 21 22 23 XX XX XX XX // a3; 30 31 32 33 XX XX XX XX // a4: 40 41 42 43 XX XX XX XX // a5: 50 51 52 53 XX XX XX XX // a6: 60 61 62 63 XX XX XX XX // a7: 70 71 72 73 XX XX XX XX // to: // b0.val[0]: 00 01 02 03 40 41 42 43 // b1.val[0]: 10 11 12 13 50 51 52 53 // b2.val[0]: 20 21 22 23 60 61 62 63 // b3.val[0]: 30 31 32 33 70 71 72 73 const uint32x2x2_t b0 = vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4)); const uint32x2x2_t b1 = vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5)); const uint32x2x2_t b2 = vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6)); const uint32x2x2_t b3 = vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7)); // Swap 16 bit elements resulting in: // c0.val[0]: 00 01 20 21 40 41 60 61 // c0.val[1]: 02 03 22 23 42 43 62 63 // c1.val[0]: 10 11 30 31 50 51 70 71 // c1.val[1]: 12 13 32 33 52 53 72 73 const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), vreinterpret_u16_u32(b2.val[0])); const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), vreinterpret_u16_u32(b3.val[0])); // Swap 8 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 01 11 21 31 41 51 61 71 // d1.val[0]: 02 12 22 32 42 52 62 72 // d1.val[1]: 03 13 23 33 43 53 63 73 const uint8x8x2_t d0 = vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); const uint8x8x2_t d1 = vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); *o0 = d0.val[0]; *o1 = d0.val[1]; *o2 = d1.val[0]; *o3 = d1.val[1]; } static inline void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) { // Input: // 00 01 02 03 // 10 11 12 13 // 20 21 22 23 // 30 31 32 33 // b: // 00 10 02 12 // 01 11 03 13 const uint16x4x2_t b = vtrn_u16(a[0], a[1]); // c: // 20 30 22 32 // 21 31 23 33 const uint16x4x2_t c = vtrn_u16(a[2], a[3]); // d: // 00 10 20 30 // 02 12 22 32 const uint32x2x2_t d = vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0])); // e: // 01 11 21 31 // 03 13 23 33 const uint32x2x2_t e = vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1])); // Output: // 00 10 20 30 // 01 11 21 31 // 02 12 22 32 // 03 13 23 33 a[0] = vreinterpret_u16_u32(d.val[0]); a[1] = vreinterpret_u16_u32(e.val[0]); a[2] = vreinterpret_u16_u32(d.val[1]); a[3] = vreinterpret_u16_u32(e.val[1]); } static inline void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) { // 4x8 Input: // a[0]: 00 01 02 03 04 05 06 07 // a[1]: 10 11 12 13 14 15 16 17 // a[2]: 20 21 22 23 24 25 26 27 // a[3]: 30 31 32 33 34 35 36 37 // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]); const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]); // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0])); const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1])); // 8x4 Output: // a[0]: 00 10 20 30 04 14 24 34 // a[1]: 01 11 21 31 05 15 25 35 // a[2]: 02 12 22 32 06 16 26 36 // a[3]: 03 13 23 33 07 17 27 37 a[0] = vreinterpretq_u16_u32(c0.val[0]); a[1] = vreinterpretq_u16_u32(c1.val[0]); a[2] = vreinterpretq_u16_u32(c0.val[1]); a[3] = vreinterpretq_u16_u32(c1.val[1]); } // Special transpose for loop 
filter. // 4x8 Input: // p_q: p3 p2 p1 p0 q0 q1 q2 q3 // a[0]: 00 01 02 03 04 05 06 07 // a[1]: 10 11 12 13 14 15 16 17 // a[2]: 20 21 22 23 24 25 26 27 // a[3]: 30 31 32 33 34 35 36 37 // 8x4 Output: // a[0]: 03 13 23 33 04 14 24 34 p0q0 // a[1]: 02 12 22 32 05 15 25 35 p1q1 // a[2]: 01 11 21 31 06 16 26 36 p2q2 // a[3]: 00 10 20 30 07 17 27 37 p3q3 // Direct reapplication of the function will reset the high halves, but // reverse the low halves: // p_q: p0 p1 p2 p3 q0 q1 q2 q3 // a[0]: 33 32 31 30 04 05 06 07 // a[1]: 23 22 21 20 14 15 16 17 // a[2]: 13 12 11 10 24 25 26 27 // a[3]: 03 02 01 00 34 35 36 37 // Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but // reverse the high halves. // The standard transpose_u16_4x8q will produce the same reversals, but with the // order of the low halves also restored relative to the high halves. This is // preferable because it puts all values from the same source row back together, // but some post-processing is inevitable. static inline void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) { // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]); const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]); // Reverse odd vectors to bring the appropriate items to the front of zips. // b0.val[0]: 00 10 02 12 04 14 06 16 // r0 : 03 13 01 11 07 17 05 15 // b1.val[0]: 20 30 22 32 24 34 26 36 // r1 : 23 33 21 31 27 37 25 35 const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1])); const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1])); // Zip to complete the halves. // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1 // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2 // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2 // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1 const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0])); const uint32x4x2_t c1 = vzipq_u32(r0, r1); // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3 // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1 // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0 // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2 const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]); // The third row of c comes first here to swap p2 with q0. const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]); // 8x4 Output: // a[0]: 03 13 23 33 04 14 24 34 p0q0 // a[1]: 02 12 22 32 05 15 25 35 p1q1 // a[2]: 01 11 21 31 06 16 26 36 p2q2 // a[3]: 00 10 20 30 07 17 27 37 p3q3 a[0] = d1.val[0]; // p0q0 a[1] = d0.val[1]; // p1q1 a[2] = d1.val[1]; // p2q2 a[3] = d0.val[0]; // p3q3 } static inline void transpose_elems_u16_4x8( const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2, const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5, const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1, uint16x8_t *o2, uint16x8_t *o3) { // Combine rows. 
Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 // a3: 30 31 32 33 // a4: 40 41 42 43 // a5: 50 51 52 53 // a6: 60 61 62 63 // a7: 70 71 72 73 // to: // b0: 00 01 02 03 40 41 42 43 // b1: 10 11 12 13 50 51 52 53 // b2: 20 21 22 23 60 61 62 63 // b3: 30 31 32 33 70 71 72 73 const uint16x8_t b0 = vcombine_u16(a0, a4); const uint16x8_t b1 = vcombine_u16(a1, a5); const uint16x8_t b2 = vcombine_u16(a2, a6); const uint16x8_t b3 = vcombine_u16(a3, a7); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 02 12 40 50 42 52 // c0.val[1]: 01 11 03 13 41 51 43 53 // c1.val[0]: 20 30 22 32 60 70 62 72 // c1.val[1]: 21 31 23 33 61 71 63 73 const uint16x8x2_t c0 = vtrnq_u16(b0, b1); const uint16x8x2_t c1 = vtrnq_u16(b2, b3); // Swap 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 02 12 22 32 42 52 62 72 // d1.val[0]: 01 11 21 31 41 51 61 71 // d1.val[1]: 03 13 23 33 43 53 63 73 const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); *o0 = vreinterpretq_u16_u32(d0.val[0]); *o1 = vreinterpretq_u16_u32(d1.val[0]); *o2 = vreinterpretq_u16_u32(d0.val[1]); *o3 = vreinterpretq_u16_u32(d1.val[1]); } static inline void transpose_elems_s16_4x8( const int16x4_t a0, const int16x4_t a1, const int16x4_t a2, const int16x4_t a3, const int16x4_t a4, const int16x4_t a5, const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1, int16x8_t *o2, int16x8_t *o3) { // Combine rows. Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 // a3: 30 31 32 33 // a4: 40 41 42 43 // a5: 50 51 52 53 // a6: 60 61 62 63 // a7: 70 71 72 73 // to: // b0: 00 01 02 03 40 41 42 43 // b1: 10 11 12 13 50 51 52 53 // b2: 20 21 22 23 60 61 62 63 // b3: 30 31 32 33 70 71 72 73 const int16x8_t b0 = vcombine_s16(a0, a4); const int16x8_t b1 = vcombine_s16(a1, a5); const int16x8_t b2 = vcombine_s16(a2, a6); const int16x8_t b3 = vcombine_s16(a3, a7); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 02 12 40 50 42 52 // c0.val[1]: 01 11 03 13 41 51 43 53 // c1.val[0]: 20 30 22 32 60 70 62 72 // c1.val[1]: 21 31 23 33 61 71 63 73 const int16x8x2_t c0 = vtrnq_s16(b0, b1); const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 02 12 22 32 42 52 62 72 // d1.val[0]: 01 11 21 31 41 51 61 71 // d1.val[1]: 03 13 23 33 43 53 63 73 const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), vreinterpretq_s32_s16(c1.val[0])); const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), vreinterpretq_s32_s16(c1.val[1])); *o0 = vreinterpretq_s16_s32(d0.val[0]); *o1 = vreinterpretq_s16_s32(d1.val[0]); *o2 = vreinterpretq_s16_s32(d0.val[1]); *o3 = vreinterpretq_s16_s32(d1.val[1]); } static inline void transpose_elems_inplace_u16_8x8( uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3, uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) { // Swap 16 bit elements. 
Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // a4: 40 41 42 43 44 45 46 47 // a5: 50 51 52 53 54 55 56 57 // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 // b2.val[0]: 40 50 42 52 44 54 46 56 // b2.val[1]: 41 51 43 53 45 55 47 57 // b3.val[0]: 60 70 62 72 64 74 66 76 // b3.val[1]: 61 71 63 73 65 75 67 77 const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 // c2.val[0]: 40 50 60 70 44 54 64 74 // c2.val[1]: 42 52 62 72 46 56 66 76 // c3.val[0]: 41 51 61 71 45 55 65 75 // c3.val[1]: 43 53 63 73 47 57 67 77 const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), vreinterpretq_u32_u16(b1.val[0])); const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), vreinterpretq_u32_u16(b1.val[1])); const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), vreinterpretq_u32_u16(b3.val[0])); const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), vreinterpretq_u32_u16(b3.val[1])); // Swap 64 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 04 14 24 34 44 54 64 74 // d1.val[0]: 01 11 21 31 41 51 61 71 // d1.val[1]: 05 15 25 35 45 55 65 75 // d2.val[0]: 02 12 22 32 42 52 62 72 // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; *a2 = d2.val[0]; *a3 = d3.val[0]; *a4 = d0.val[1]; *a5 = d1.val[1]; *a6 = d2.val[1]; *a7 = d3.val[1]; } static inline int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_s16_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s16_s64( vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); #else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); #endif return b0; } static inline void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { // Swap 16 bit elements. 
Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // a4: 40 41 42 43 44 45 46 47 // a5: 50 51 52 53 54 55 56 57 // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 // b2.val[0]: 40 50 42 52 44 54 46 56 // b2.val[1]: 41 51 43 53 45 55 47 57 // b3.val[0]: 60 70 62 72 64 74 66 76 // b3.val[1]: 61 71 63 73 65 75 67 77 const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 // c2.val[0]: 40 50 60 70 44 54 64 74 // c2.val[1]: 42 52 62 72 46 56 66 76 // c3.val[0]: 41 51 61 71 45 55 65 75 // c3.val[1]: 43 53 63 73 47 57 67 77 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0])); const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), vreinterpretq_s32_s16(b1.val[1])); const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), vreinterpretq_s32_s16(b3.val[0])); const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), vreinterpretq_s32_s16(b3.val[1])); // Swap 64 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 04 14 24 34 44 54 64 74 // d1.val[0]: 01 11 21 31 41 51 61 71 // d1.val[1]: 05 15 25 35 45 55 65 75 // d2.val[0]: 02 12 22 32 42 52 62 72 // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; *a2 = d2.val[0]; *a3 = d3.val[0]; *a4 = d0.val[1]; *a5 = d1.val[1]; *a6 = d2.val[1]; *a7 = d3.val[1]; } static inline void transpose_arrays_s16_8x8(const int16x8_t *a, int16x8_t *out) { // Swap 16 bit elements. 
Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // a4: 40 41 42 43 44 45 46 47 // a5: 50 51 52 53 54 55 56 57 // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 // b2.val[0]: 40 50 42 52 44 54 46 56 // b2.val[1]: 41 51 43 53 45 55 47 57 // b3.val[0]: 60 70 62 72 64 74 66 76 // b3.val[1]: 61 71 63 73 65 75 67 77 const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 // c2.val[0]: 40 50 60 70 44 54 64 74 // c2.val[1]: 42 52 62 72 46 56 66 76 // c3.val[0]: 41 51 61 71 45 55 65 75 // c3.val[1]: 43 53 63 73 47 57 67 77 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0])); const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), vreinterpretq_s32_s16(b1.val[1])); const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), vreinterpretq_s32_s16(b3.val[0])); const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), vreinterpretq_s32_s16(b3.val[1])); // Swap 64 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 // d0.val[1]: 04 14 24 34 44 54 64 74 // d1.val[0]: 01 11 21 31 41 51 61 71 // d1.val[1]: 05 15 25 35 45 55 65 75 // d2.val[0]: 02 12 22 32 42 52 62 72 // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); out[0] = d0.val[0]; out[1] = d1.val[0]; out[2] = d2.val[0]; out[3] = d3.val[0]; out[4] = d0.val[1]; out[5] = d1.val[1]; out[6] = d2.val[1]; out[7] = d3.val[1]; } static inline void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 // a3: 30 31 32 33 34 35 36 37 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 01 11 21 31 05 15 25 35 // c1.val[0]: 02 12 22 32 06 16 26 36 // c1.val[1]: 03 13 23 33 07 17 27 37 const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0])); const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), vreinterpretq_s32_s16(b1.val[1])); *a0 = vreinterpretq_s16_s32(c0.val[0]); *a1 = vreinterpretq_s16_s32(c1.val[0]); *a2 = vreinterpretq_s16_s32(c0.val[1]); *a3 = vreinterpretq_s16_s32(c1.val[1]); } static inline void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, uint16x4_t *a1, uint16x4_t *a2, uint16x4_t *a3) { // Swap 16 bit elements. 
Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 // a3: 30 31 32 33 // to: // b0.val[0]: 00 10 02 12 // b0.val[1]: 01 11 03 13 // b1.val[0]: 20 30 22 32 // b1.val[1]: 21 31 23 33 const uint16x4x2_t b0 = vtrn_u16(*a0, *a1); const uint16x4x2_t b1 = vtrn_u16(*a2, *a3); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 // c0.val[1]: 02 12 22 32 // c1.val[0]: 01 11 21 31 // c1.val[1]: 03 13 23 33 const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), vreinterpret_u32_u16(b1.val[0])); const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), vreinterpret_u32_u16(b1.val[1])); *a0 = vreinterpret_u16_u32(c0.val[0]); *a1 = vreinterpret_u16_u32(c1.val[0]); *a2 = vreinterpret_u16_u32(c0.val[1]); *a3 = vreinterpret_u16_u32(c1.val[1]); } static inline void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 // a3: 30 31 32 33 // to: // b0.val[0]: 00 10 02 12 // b0.val[1]: 01 11 03 13 // b1.val[0]: 20 30 22 32 // b1.val[1]: 21 31 23 33 const int16x4x2_t b0 = vtrn_s16(*a0, *a1); const int16x4x2_t b1 = vtrn_s16(*a2, *a3); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 // c0.val[1]: 02 12 22 32 // c1.val[0]: 01 11 21 31 // c1.val[1]: 03 13 23 33 const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), vreinterpret_s32_s16(b1.val[0])); const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), vreinterpret_s32_s16(b1.val[1])); *a0 = vreinterpret_s16_s32(c0.val[0]); *a1 = vreinterpret_s16_s32(c1.val[0]); *a2 = vreinterpret_s16_s32(c0.val[1]); *a3 = vreinterpret_s16_s32(c1.val[1]); } static inline int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; #if AOM_ARCH_AARCH64 b0.val[0] = vreinterpretq_s32_s64( vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); b0.val[1] = vreinterpretq_s32_s64( vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); #else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); #endif return b0; } static inline void transpose_elems_s32_4x4(const int32x4_t a0, const int32x4_t a1, const int32x4_t a2, const int32x4_t a3, int32x4_t *o0, int32x4_t *o1, int32x4_t *o2, int32x4_t *o3) { // Swap 32 bit elements. 
Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 // a3: 30 31 32 33 // to: // b0.val[0]: 00 10 02 12 // b0.val[1]: 01 11 03 13 // b1.val[0]: 20 30 22 32 // b1.val[1]: 21 31 23 33 const int32x4x2_t b0 = vtrnq_s32(a0, a1); const int32x4x2_t b1 = vtrnq_s32(a2, a3); // Swap 64 bit elements resulting in: // c0.val[0]: 00 10 20 30 // c0.val[1]: 02 12 22 32 // c1.val[0]: 01 11 21 31 // c1.val[1]: 03 13 23 33 const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); *o0 = c0.val[0]; *o1 = c1.val[0]; *o2 = c0.val[1]; *o3 = c1.val[1]; } static inline void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1, int32x4_t *a2, int32x4_t *a3) { transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3); } static inline void transpose_arrays_s32_4x4(const int32x4_t *in, int32x4_t *out) { transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2], &out[3]); } static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in, int32x4_t *out, const int width, const int height) { const int h = height >> 2; const int w = width >> 2; for (int j = 0; j < w; j++) { for (int i = 0; i < h; i++) { transpose_arrays_s32_4x4(in + j * height + i * 4, out + i * width + j * 4); } } } #define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \ static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \ const int32x4_t *in, int32x4_t *out) { \ transpose_arrays_s32_4nx4n(in, out, w, h); \ } TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8) TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16) TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4) TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8) TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16) TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32) TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8) TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16) TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32) TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64) TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8) TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16) TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32) TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64) TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16) TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32) #undef TRANSPOSE_ARRAYS_S32_WXH_NEON static inline int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) { #if AOM_ARCH_AARCH64 return vtrn1q_s64(a, b); #else return vcombine_s64(vget_low_s64(a), vget_low_s64(b)); #endif } static inline int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) { #if AOM_ARCH_AARCH64 return vtrn2q_s64(a, b); #else return vcombine_s64(vget_high_s64(a), vget_high_s64(b)); #endif } static inline void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1, int32x4_t a2, int32x4_t a3, int32x4_t a4, int32x4_t a5, int32x4_t a6, int32x4_t a7, int32x4x2_t *o0, int32x4x2_t *o1, int32x4x2_t *o2, int32x4x2_t *o3) { // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4 // matrix transpose implementation: // [ A ]^T => [ A^T B^T ] // [ B ] transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3); // A^T transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7); // B^T o0->val[0] = a0; o1->val[0] = a1; o2->val[0] = a2; o3->val[0] = a3; o0->val[1] = a4; o1->val[1] = a5; o2->val[1] = a6; o3->val[1] = a7; } static inline void transpose_elems_inplace_s32_8x8( int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3, int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) { // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4 // matrix transpose implementation: // [ A B ]^T => [ A^T C^T ] // [ C D ] [ B^T D^T ] int32x4_t q0_v1 = a0->val[0]; int32x4_t q0_v2 = a1->val[0]; 
int32x4_t q0_v3 = a2->val[0]; int32x4_t q0_v4 = a3->val[0]; int32x4_t q1_v1 = a0->val[1]; int32x4_t q1_v2 = a1->val[1]; int32x4_t q1_v3 = a2->val[1]; int32x4_t q1_v4 = a3->val[1]; int32x4_t q2_v1 = a4->val[0]; int32x4_t q2_v2 = a5->val[0]; int32x4_t q2_v3 = a6->val[0]; int32x4_t q2_v4 = a7->val[0]; int32x4_t q3_v1 = a4->val[1]; int32x4_t q3_v2 = a5->val[1]; int32x4_t q3_v3 = a6->val[1]; int32x4_t q3_v4 = a7->val[1]; transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4); // A^T transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4); // B^T transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4); // C^T transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4); // D^T a0->val[0] = q0_v1; a1->val[0] = q0_v2; a2->val[0] = q0_v3; a3->val[0] = q0_v4; a0->val[1] = q2_v1; a1->val[1] = q2_v2; a2->val[1] = q2_v3; a3->val[1] = q2_v4; a4->val[0] = q1_v1; a5->val[0] = q1_v2; a6->val[0] = q1_v3; a7->val[0] = q1_v4; a4->val[1] = q3_v1; a5->val[1] = q3_v2; a6->val[1] = q3_v3; a7->val[1] = q3_v4; } static inline void transpose_arrays_s16_4x4(const int16x4_t *const in, int16x4_t *const out) { int16x4_t a0 = in[0]; int16x4_t a1 = in[1]; int16x4_t a2 = in[2]; int16x4_t a3 = in[3]; transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3); out[0] = a0; out[1] = a1; out[2] = a2; out[3] = a3; } static inline void transpose_arrays_s16_4x8(const int16x4_t *const in, int16x8_t *const out) { #if AOM_ARCH_AARCH64 const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)), vcombine_s16(in[1], vdup_n_s16(0))); const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)), vcombine_s16(in[3], vdup_n_s16(0))); const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)), vcombine_s16(in[5], vdup_n_s16(0))); const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)), vcombine_s16(in[7], vdup_n_s16(0))); #else int16x4x2_t temp; temp = vzip_s16(in[0], in[1]); const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]); temp = vzip_s16(in[2], in[3]); const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]); temp = vzip_s16(in[4], in[5]); const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]); temp = vzip_s16(in[6], in[7]); const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]); #endif const int32x4x2_t b02 = vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1)); const int32x4x2_t b13 = vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3)); #if AOM_ARCH_AARCH64 out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]), vreinterpretq_s64_s32(b13.val[0]))); out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]), vreinterpretq_s64_s32(b13.val[0]))); out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]), vreinterpretq_s64_s32(b13.val[1]))); out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]), vreinterpretq_s64_s32(b13.val[1]))); #else out[0] = vreinterpretq_s16_s32( vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2)); out[2] = vreinterpretq_s16_s32( vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2)); out[1] = vreinterpretq_s16_s32( vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2)); out[3] = vreinterpretq_s16_s32( vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2)); #endif } static inline void transpose_arrays_s16_8x4(const int16x8_t *const in, int16x4_t *const out) { // Swap 16 bit elements. 
Goes from: // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 // to: // b0.val[0]: 00 10 02 12 04 14 06 16 // b0.val[1]: 01 11 03 13 05 15 07 17 // b1.val[0]: 20 30 22 32 24 34 26 36 // b1.val[1]: 21 31 23 33 25 35 27 37 const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]); const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]); // Swap 32 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 // c0.val[1]: 02 12 22 32 06 16 26 36 // c1.val[0]: 01 11 21 31 05 15 25 35 // c1.val[1]: 03 13 23 33 07 17 27 37 const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]), vreinterpretq_u32_s16(b1.val[0])); const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]), vreinterpretq_u32_s16(b1.val[1])); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 // out[1]: 01 11 21 31 // out[2]: 02 12 22 32 // out[3]: 03 13 23 33 // out[4]: 04 14 24 34 // out[5]: 05 15 25 35 // out[6]: 06 16 26 36 // out[7]: 07 17 27 37 out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0])); out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0])); out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1])); out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1])); out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0])); out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0])); out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1])); out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1])); } static inline void transpose_arrays_s64_4x4(const int64x2_t *in, int64x2_t *out) { // Perform a 4x4 matrix transpose going from: // in[0] = 00 01 // in[1] = 02 03 // in[2] = 10 11 // in[3] = 12 13 // in[4] = 20 21 // in[5] = 22 23 // in[6] = 30 31 // in[7] = 32 33 // // to: // out[0] = 00 10 // out[1] = 20 30 // out[2] = 01 11 // out[3] = 21 31 // out[4] = 02 12 // out[5] = 22 32 // out[6] = 03 13 // out[7] = 23 33 out[0] = aom_vtrn1q_s64(in[0], in[2]); out[1] = aom_vtrn1q_s64(in[4], in[6]); out[2] = aom_vtrn2q_s64(in[0], in[2]); out[3] = aom_vtrn2q_s64(in[4], in[6]); out[4] = aom_vtrn1q_s64(in[1], in[3]); out[5] = aom_vtrn1q_s64(in[5], in[7]); out[6] = aom_vtrn2q_s64(in[1], in[3]); out[7] = aom_vtrn2q_s64(in[5], in[7]); } #endif // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_ aom-3.12.1/aom_dsp/arm/variance_neon.c000066400000000000000000000413031477627663500175560ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_ports/mem.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline void variance_4xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_s32 = vdupq_n_s32(0); // Number of rows we can process before 'sum_s16' overflows: // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. 
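  // Worked bound (illustrative): each of the eight int16 accumulator lanes
  // receives one difference of magnitude <= 255 for every pair of 4-wide rows,
  // so after h rows a lane holds at most (h / 2) * 255; for h = 256 that is
  // 128 * 255 = 32640, which still fits in int16 (<= 32767).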
assert(h <= 256); int i = h; do { uint8x8_t s = load_unaligned_u8(src, src_stride); uint8x8_t r = load_unaligned_u8(ref, ref_stride); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); sum_s16 = vaddq_s16(sum_s16, diff); sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); *sum = horizontal_add_s16x8(sum_s16); *sse = (uint32_t)horizontal_add_s32x4(sse_s32); } static inline void variance_8xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; // Number of rows we can process before 'sum_s16' overflows: // 32767 / 255 ~= 128 assert(h <= 128); int i = h; do { uint8x8_t s = vld1_u8(src); uint8x8_t r = vld1_u8(ref); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); sum_s16 = vaddq_s16(sum_s16, diff); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); src += src_stride; ref += ref_stride; } while (--i != 0); *sum = horizontal_add_s16x8(sum_s16); *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } static inline void variance_16xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; // Number of rows we can process before 'sum_s16' accumulators overflow: // 32767 / 255 ~= 128, so 128 16-wide rows. assert(h <= 128); int i = h; do { uint8x16_t s = vld1q_u8(src); uint8x16_t r = vld1q_u8(ref); int16x8_t diff_l = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); int16x8_t diff_h = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); src += src_stride; ref += ref_stride; } while (--i != 0); *sum = horizontal_add_s16x8(vaddq_s16(sum_s16[0], sum_s16[1])); *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } static inline void variance_large_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, int h_limit, uint32_t *sse, int *sum) { int32x4_t sum_s32 = vdupq_n_s32(0); int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit // accumulator overflows. After hitting this limit we accumulate into 32-bit // elements. int h_tmp = h > h_limit ? 
h_limit : h; int i = 0; do { int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; do { int j = 0; do { uint8x16_t s = vld1q_u8(src + j); uint8x16_t r = vld1q_u8(ref + j); int16x8_t diff_l = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); int16x8_t diff_h = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); sse_s32[1] = vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); j += 16; } while (j < w); src += src_stride; ref += ref_stride; i++; } while (i < h_tmp); sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); h_tmp += h_limit; } while (i < h); *sum = horizontal_add_s32x4(sum_s32); *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } static inline void variance_32xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); } static inline void variance_64xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } static inline void variance_128xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum); } #define VARIANCE_WXH_NEON(w, h, shift) \ unsigned int aom_variance##w##x##h##_neon( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ int sum; \ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } VARIANCE_WXH_NEON(4, 4, 4) VARIANCE_WXH_NEON(4, 8, 5) VARIANCE_WXH_NEON(8, 4, 5) VARIANCE_WXH_NEON(8, 8, 6) VARIANCE_WXH_NEON(8, 16, 7) VARIANCE_WXH_NEON(16, 8, 7) VARIANCE_WXH_NEON(16, 16, 8) VARIANCE_WXH_NEON(16, 32, 9) VARIANCE_WXH_NEON(32, 16, 9) VARIANCE_WXH_NEON(32, 32, 10) VARIANCE_WXH_NEON(32, 64, 11) VARIANCE_WXH_NEON(64, 32, 11) VARIANCE_WXH_NEON(64, 64, 12) VARIANCE_WXH_NEON(64, 128, 13) VARIANCE_WXH_NEON(128, 64, 13) VARIANCE_WXH_NEON(128, 128, 14) #if !CONFIG_REALTIME_ONLY VARIANCE_WXH_NEON(4, 16, 6) VARIANCE_WXH_NEON(8, 32, 8) VARIANCE_WXH_NEON(16, 4, 6) VARIANCE_WXH_NEON(16, 64, 10) VARIANCE_WXH_NEON(32, 8, 8) VARIANCE_WXH_NEON(64, 16, 10) #endif #undef VARIANCE_WXH_NEON // TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of // AVX2. Also, implement the NEON for variance computation present in this // function. void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { // Loop over four 8x8 blocks. Process one 8x32 block. 
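  // The per-block variance computed at the end of this function follows the
  // usual identity (spelled out here for clarity): var = E[d^2] - E[d]^2,
  // i.e. sse - sum * sum / 64 for an 8x8 block, with the division by the 64
  // pixels implemented as the '>> 6'.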
for (int k = 0; k < 4; k++) { variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8, &sse8x8[k], &sum8x8[k]); } *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; for (int i = 0; i < 4; i++) { var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); } } void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { int sum16x16[2] = { 0 }; // Loop over two 16x16 blocks. Process one 16x32 block. for (int k = 0; k < 2; k++) { variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride, 16, &sse16x16[k], &sum16x16[k]); } *tot_sse += sse16x16[0] + sse16x16[1]; *tot_sum += sum16x16[0] + sum16x16[1]; for (int i = 0; i < 2; i++) { var16x16[i] = sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); } } static inline unsigned int mse8xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint8x8_t s[2], r[2]; int16x4_t diff_lo[2], diff_hi[2]; uint16x8_t diff[2]; int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; int i = h; do { s[0] = vld1_u8(src); src += src_stride; s[1] = vld1_u8(src); src += src_stride; r[0] = vld1_u8(ref); ref += ref_stride; r[1] = vld1_u8(ref); ref += ref_stride; diff[0] = vsubl_u8(s[0], r[0]); diff[1] = vsubl_u8(s[1], r[1]); diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]); sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]); diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]); sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]); i -= 2; } while (i != 0); sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]); *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); } static inline unsigned int mse16xh_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint8x16_t s[2], r[2]; int16x4_t diff_lo[4], diff_hi[4]; uint16x8_t diff[4]; int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) }; int i = h; do { s[0] = vld1q_u8(src); src += src_stride; s[1] = vld1q_u8(src); src += src_stride; r[0] = vld1q_u8(ref); ref += ref_stride; r[1] = vld1q_u8(ref); ref += ref_stride; diff[0] = vsubl_u8(vget_low_u8(s[0]), vget_low_u8(r[0])); diff[1] = vsubl_u8(vget_high_u8(s[0]), vget_high_u8(r[0])); diff[2] = vsubl_u8(vget_low_u8(s[1]), vget_low_u8(r[1])); diff[3] = vsubl_u8(vget_high_u8(s[1]), vget_high_u8(r[1])); diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]); sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]); diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); sse_s32[2] = vmlal_s16(sse_s32[2], diff_lo[2], diff_lo[2]); sse_s32[3] = vmlal_s16(sse_s32[3], diff_lo[3], diff_lo[3]); diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]); sse_s32[1] = 
vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]); diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); sse_s32[2] = vmlal_s16(sse_s32[2], diff_hi[2], diff_hi[2]); sse_s32[3] = vmlal_s16(sse_s32[3], diff_hi[3], diff_hi[3]); i -= 2; } while (i != 0); sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]); sse_s32[2] = vaddq_s32(sse_s32[2], sse_s32[3]); sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[2]); *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); } #define MSE_WXH_NEON(w, h) \ unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \ } MSE_WXH_NEON(8, 8) MSE_WXH_NEON(8, 16) MSE_WXH_NEON(16, 8) MSE_WXH_NEON(16, 16) #undef MSE_WXH_NEON static inline uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum, uint16x8_t s0, uint16x8_t s1, uint8x8_t d0, uint8x8_t d1) { int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0)); int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1)); int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0)); mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0)); mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1)); mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1)); return vpadalq_u32(sum, vreinterpretq_u32_s32(mse)); } static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4)); uint64x2_t sum = vdupq_n_u64(0); if (w == 8) { do { uint8x8_t d0 = vld1_u8(dst + 0 * dstride); uint8x8_t d1 = vld1_u8(dst + 1 * dstride); uint16x8_t s0 = vld1q_u16(src + 0 * sstride); uint16x8_t s1 = vld1q_u16(src + 1 * sstride); sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1); dst += 2 * dstride; src += 2 * sstride; h -= 2; } while (h != 0); } else { do { uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride); uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride); uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1); dst += 4 * dstride; src += 4 * sstride; h -= 4; } while (h != 0); } return sum; } // Computes mse for a given block size. This function gets called for specific // block sizes, which are 8x8, 8x4, 4x8 and 4x4. 
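// A plain-C model of the computation described above (an illustrative sketch;
// the helper name is hypothetical and the block is deliberately kept disabled,
// it is not part of the library build):
#if 0
static uint64_t mse_wxh_16bit_scalar_model(const uint8_t *dst, int dstride,
                                           const uint16_t *src, int sstride,
                                           int w, int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      // 'src' holds 16-bit samples, 'dst' 8-bit samples; accumulate the
      // squared difference in 64 bits so overflow is not a concern.
      const int64_t e = (int64_t)src[i * sstride + j] - dst[i * dstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}
#endif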
uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h)); } #if !CONFIG_REALTIME_ONLY uint32_t aom_get_mb_ss_neon(const int16_t *a) { int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; for (int i = 0; i < 256; i = i + 8) { int16x8_t a_s16 = vld1q_s16(a + i); sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16)); sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16)); } return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1])); } #endif // !CONFIG_REALTIME_ONLY uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src, int w, int h) { uint64x2_t sum = vdupq_n_u64(0); int num_blks = 16 / w; do { sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h)); dst += w; src += w * h; } while (--num_blks != 0); return horizontal_add_u64x2(sum); } aom-3.12.1/aom_dsp/arm/variance_neon_dotprod.c000066400000000000000000000263351477627663500213210ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_ports/mem.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { uint8x16_t s = load_unaligned_u8q(src, src_stride); uint8x16_t r = load_unaligned_u8q(ref, ref_stride); src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); src += 4 * src_stride; ref += 4 * ref_stride; i -= 4; } while (i != 0); int32x4_t sum_diff = vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); *sum = horizontal_add_s32x4(sum_diff); *sse = horizontal_add_u32x4(sse_u32); } static inline void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride)); uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride)); src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); int32x4_t sum_diff = vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); *sum = horizontal_add_s32x4(sum_diff); *sse = horizontal_add_u32x4(sse_u32); } static inline void variance_16xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, 
uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { uint8x16_t s = vld1q_u8(src); uint8x16_t r = vld1q_u8(ref); src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); src += src_stride; ref += ref_stride; } while (--i != 0); int32x4_t sum_diff = vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); *sum = horizontal_add_s32x4(sum_diff); *sse = horizontal_add_u32x4(sse_u32); } static inline void variance_large_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, uint32_t *sse, int *sum) { uint32x4_t src_sum = vdupq_n_u32(0); uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { int j = 0; do { uint8x16_t s = vld1q_u8(src + j); uint8x16_t r = vld1q_u8(ref + j); src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); j += 16; } while (j < w); src += src_stride; ref += ref_stride; } while (--i != 0); int32x4_t sum_diff = vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); *sum = horizontal_add_s32x4(sum_diff); *sse = horizontal_add_u32x4(sse_u32); } static inline void variance_32xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, sum); } static inline void variance_64xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, sum); } static inline void variance_128xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int h, uint32_t *sse, int *sum) { variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse, sum); } #define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ unsigned int aom_variance##w##x##h##_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ int sum; \ variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) VARIANCE_WXH_NEON_DOTPROD(64, 128, 13) VARIANCE_WXH_NEON_DOTPROD(128, 64, 13) VARIANCE_WXH_NEON_DOTPROD(128, 128, 14) #if !CONFIG_REALTIME_ONLY VARIANCE_WXH_NEON_DOTPROD(4, 16, 6) VARIANCE_WXH_NEON_DOTPROD(8, 32, 8) VARIANCE_WXH_NEON_DOTPROD(16, 4, 6) VARIANCE_WXH_NEON_DOTPROD(16, 64, 10) VARIANCE_WXH_NEON_DOTPROD(32, 8, 8) VARIANCE_WXH_NEON_DOTPROD(64, 16, 10) #endif #undef VARIANCE_WXH_NEON_DOTPROD void aom_get_var_sse_sum_8x8_quad_neon_dotprod( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int 
*tot_sse, int *tot_sum, uint32_t *var8x8) { // Loop over four 8x8 blocks. Process one 8x32 block. for (int k = 0; k < 4; k++) { variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8, &sse8x8[k], &sum8x8[k]); } *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; for (int i = 0; i < 4; i++) { var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); } } void aom_get_var_sse_sum_16x16_dual_neon_dotprod( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { int sum16x16[2] = { 0 }; // Loop over two 16x16 blocks. Process one 16x32 block. for (int k = 0; k < 2; k++) { variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16), ref_stride, 16, &sse16x16[k], &sum16x16[k]); } *tot_sse += sse16x16[0] + sse16x16[1]; *tot_sum += sum16x16[0] + sum16x16[1]; for (int i = 0; i < 2; i++) { var16x16[i] = sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); } } static inline unsigned int mse8xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint32x4_t sse_u32 = vdupq_n_u32(0); int i = h; do { uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride)); uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride)); uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); *sse = horizontal_add_u32x4(sse_u32); return horizontal_add_u32x4(sse_u32); } static inline unsigned int mse16xh_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse, int h) { uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int i = h; do { uint8x16_t s0 = vld1q_u8(src); uint8x16_t s1 = vld1q_u8(src + src_stride); uint8x16_t r0 = vld1q_u8(ref); uint8x16_t r1 = vld1q_u8(ref + ref_stride); uint8x16_t abs_diff0 = vabdq_u8(s0, r0); uint8x16_t abs_diff1 = vabdq_u8(s1, r1); sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0); sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1); src += 2 * src_stride; ref += 2 * ref_stride; i -= 2; } while (i != 0); *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } #define MSE_WXH_NEON_DOTPROD(w, h) \ unsigned int aom_mse##w##x##h##_neon_dotprod( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \ } MSE_WXH_NEON_DOTPROD(8, 8) MSE_WXH_NEON_DOTPROD(8, 16) MSE_WXH_NEON_DOTPROD(16, 8) MSE_WXH_NEON_DOTPROD(16, 16) #undef MSE_WXH_NEON_DOTPROD aom-3.12.1/aom_dsp/avg.c000066400000000000000000000431371477627663500147540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { int i, j; *min = 255; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { int diff = abs(s[j] - d[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? diff : *max; } } } unsigned int aom_avg_4x4_c(const uint8_t *s, int p) { int i, j; int sum = 0; for (i = 0; i < 4; ++i, s += p) for (j = 0; j < 4; sum += s[j], ++j) { } return (sum + 8) >> 4; } unsigned int aom_avg_8x8_c(const uint8_t *s, int p) { int i, j; int sum = 0; for (i = 0; i < 8; ++i, s += p) for (j = 0; j < 8; sum += s[j], ++j) { } return (sum + 32) >> 6; } void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg) { for (int k = 0; k < 4; k++) { const int x8_idx = x16_idx + ((k & 1) << 3); const int y8_idx = y16_idx + ((k >> 1) << 3); const uint8_t *s_tmp = s + y8_idx * p + x8_idx; avg[k] = aom_avg_8x8_c(s_tmp, p); } } #if CONFIG_AV1_HIGHBITDEPTH unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) { int i, j; int sum = 0; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); for (i = 0; i < 8; ++i, s += p) for (j = 0; j < 8; sum += s[j], ++j) { } return (sum + 32) >> 6; } unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) { int i, j; int sum = 0; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); for (i = 0; i < 4; ++i, s += p) for (j = 0; j < 4; sum += s[j], ++j) { } return (sum + 8) >> 4; } void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max) { int i, j; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8); *min = 65535; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { int diff = abs(s[j] - d[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? diff : *max; } } } #endif // CONFIG_AV1_HIGHBITDEPTH static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1; int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1; int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1; int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1; coeff[0] = b0 + b2; coeff[1] = b1 + b3; coeff[2] = b0 - b2; coeff[3] = b1 - b3; } void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; int16_t buffer[16]; int16_t buffer2[16]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 4; ++idx) { hadamard_col4(src_diff, src_stride, tmp_buf); // src_diff: 9 bit // dynamic range [-255, 255] tmp_buf += 4; ++src_diff; } tmp_buf = &buffer[0]; for (idx = 0; idx < 4; ++idx) { hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx); // tmp_buf: 12 bit // dynamic range [-2040, 2040] // buffer2: 15 bit // dynamic range [-16320, 16320] ++tmp_buf; } // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_4x4_sse2). 
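  // The loop below copies buffer2 column-major, i.e. coeff becomes the
  // transpose of buffer2, so the C output uses the same coefficient ordering
  // as the SIMD version named above.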
for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i]; } } } // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; int16_t c0 = b0 + b2; int16_t c1 = b1 + b3; int16_t c2 = b0 - b2; int16_t c3 = b1 - b3; int16_t c4 = b4 + b6; int16_t c5 = b5 + b7; int16_t c6 = b4 - b6; int16_t c7 = b5 - b7; coeff[0] = c0 + c4; coeff[7] = c1 + c5; coeff[3] = c2 + c6; coeff[4] = c3 + c7; coeff[2] = c0 - c4; coeff[6] = c1 - c5; coeff[1] = c2 - c6; coeff[5] = c3 - c7; } void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; int16_t buffer[64]; int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit // dynamic range [-255, 255] tmp_buf += 8; ++src_diff; } tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit // dynamic range [-2040, 2040] // buffer2: 15 bit // dynamic range [-16320, 16320] ++tmp_buf; } // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_8x8_sse2). for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i]; } } } void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t buffer[64]; int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (int idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit // dynamic range [-255, 255] tmp_buf += 8; ++src_diff; } tmp_buf = &buffer[0]; for (int idx = 0; idx < 8; ++idx) { hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit // dynamic range [-2040, 2040] // buffer2: 15 bit // dynamic range [-16320, 16320] ++tmp_buf; } for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx]; // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_lp_8x8_sse2). 
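  // Range check (illustrative arithmetic): the first pass sums 8 inputs of
  // magnitude <= 255, giving <= 2040; the second pass sums 8 values of
  // magnitude <= 2040, giving <= 16320. Both buffer2 and the transposed copy
  // written below therefore fit comfortably in int16.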
for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { coeff[i * 8 + j] = buffer2[j * 8 + i]; } } } void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { for (int i = 0; i < 2; i++) { aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride, (int16_t *)coeff + (i * 64)); } } // In place 16x16 2D Hadamard transform void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[64]; tran_low_t a2 = coeff[128]; tran_low_t a3 = coeff[192]; tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] tran_low_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; coeff[128] = b0 - b2; coeff[192] = b1 - b3; ++coeff; } coeff -= 64; // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2). // Note that to match SSE2 output, it does not need this step. for (int i = 0; i < 16; i++) { for (int j = 0; j < 4; j++) { tran_low_t temp = coeff[i * 16 + 4 + j]; coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j]; coeff[i * 16 + 8 + j] = temp; } } } void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { for (int idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64); } for (int idx = 0; idx < 64; ++idx) { int16_t a0 = coeff[0]; int16_t a1 = coeff[64]; int16_t a2 = coeff[128]; int16_t a3 = coeff[192]; int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] int16_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; coeff[128] = b0 - b2; coeff[192] = b1 - b3; ++coeff; } } void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } // coeff: 16 bit, dynamic range [-32768, 32767] for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] tran_low_t b3 = (a2 - a3) >> 2; coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; ++coeff; } } #if CONFIG_AV1_HIGHBITDEPTH static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; int16_t b3 = 
src_diff[2 * src_stride] - src_diff[3 * src_stride]; int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; int16_t c0 = b0 + b2; int16_t c1 = b1 + b3; int16_t c2 = b0 - b2; int16_t c3 = b1 - b3; int16_t c4 = b4 + b6; int16_t c5 = b5 + b7; int16_t c6 = b4 - b6; int16_t c7 = b5 - b7; coeff[0] = c0 + c4; coeff[7] = c1 + c5; coeff[3] = c2 + c6; coeff[4] = c3 + c7; coeff[2] = c0 - c4; coeff[6] = c1 - c5; coeff[1] = c2 - c6; coeff[5] = c3 - c7; } // src_diff: 16 bit, dynamic range [-32760, 32760] // coeff: 19 bit static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, ptrdiff_t src_stride, int32_t *coeff) { int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; int32_t c0 = b0 + b2; int32_t c1 = b1 + b3; int32_t c2 = b0 - b2; int32_t c3 = b1 - b3; int32_t c4 = b4 + b6; int32_t c5 = b5 + b7; int32_t c6 = b4 - b6; int32_t c7 = b5 - b7; coeff[0] = c0 + c4; coeff[7] = c1 + c5; coeff[3] = c2 + c6; coeff[4] = c3 + c7; coeff[2] = c0 - c4; coeff[6] = c1 - c5; coeff[1] = c2 - c6; coeff[5] = c3 - c7; } // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
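// (Skipping the transpose is safe because consumers of these coefficients,
// e.g. aom_satd_c further below, sum absolute values, which does not depend
// on the order of the coefficients.)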
void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; int16_t buffer[64]; int32_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { // src_diff: 13 bit // buffer: 16 bit, dynamic range [-32760, 32760] hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); tmp_buf += 8; ++src_diff; } tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { // buffer: 16 bit // buffer2: 19 bit, dynamic range [-262080, 262080] hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); ++tmp_buf; } for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; } // In place 16x16 2D Hadamard transform void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 13 bit, dynamic range [-4095, 4095] const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } // coeff: 19 bit, dynamic range [-262080, 262080] for (idx = 0; idx < 64; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[64]; tran_low_t a2 = coeff[128]; tran_low_t a3 = coeff[192]; tran_low_t b0 = (a0 + a1) >> 1; tran_low_t b1 = (a0 - a1) >> 1; tran_low_t b2 = (a2 + a3) >> 1; tran_low_t b3 = (a2 - a3) >> 1; // new coeff dynamic range: 20 bit coeff[0] = b0 + b2; coeff[64] = b1 + b3; coeff[128] = b0 - b2; coeff[192] = b1 - b3; ++coeff; } } void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 13 bit, dynamic range [-4095, 4095] const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } // coeff: 20 bit for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; tran_low_t b0 = (a0 + a1) >> 2; tran_low_t b1 = (a0 - a1) >> 2; tran_low_t b2 = (a2 + a3) >> 2; tran_low_t b3 = (a2 - a3) >> 2; // new coeff dynamic range: 20 bit coeff[0] = b0 + b2; coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; ++coeff; } } #endif // CONFIG_AV1_HIGHBITDEPTH // coeff: 20 bits, dynamic range [-524287, 524287]. // length: value range {16, 32, 64, 128, 256, 512, 1024}. int aom_satd_c(const tran_low_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024] return satd; } int aom_satd_lp_c(const int16_t *coeff, int length) { int satd = 0; for (int i = 0; i < length; ++i) satd += abs(coeff[i]); // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return satd; } // Integer projection onto row vectors. // height: value range {16, 32, 64, 128}. void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { assert(height >= 2); for (int idx = 0; idx < width; ++idx) { hbuf[idx] = 0; // hbuf[idx]: 14 bit, dynamic range [0, 32640]. for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; // hbuf[idx]: 9 bit, dynamic range [0, 1020]. hbuf[idx] >>= norm_factor; ++ref; } } // width: value range {16, 32, 64, 128}. 
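// Worked bound (illustrative): each vbuf entry below is the sum of up to 128
// pixels of value <= 255, i.e. at most 128 * 255 = 32640 (14 bits), which is
// then scaled down by 'norm_factor' before being stored.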
void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { for (int ht = 0; ht < height; ++ht) { int16_t sum = 0; // sum: 14 bit, dynamic range [0, 32640] for (int idx = 0; idx < width; ++idx) sum += ref[idx]; vbuf[ht] = sum >> norm_factor; ref += ref_stride; } } // ref: [0 - 510] // src: [0 - 510] // bwl: {2, 3, 4, 5} int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) { int i; int width = 4 << bwl; int sse = 0, mean = 0, var; for (i = 0; i < width; ++i) { int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. mean += diff; // mean: dynamic range 16 bits. sse += diff * diff; // sse: dynamic range 26 bits. } // (mean * mean): dynamic range 31 bits. // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~= // 31.99, so it needs to be casted to unsigned int to compute its square. const unsigned int mean_abs = abs(mean); var = sse - ((mean_abs * mean_abs) >> (bwl + 2)); return var; } aom-3.12.1/aom_dsp/binary_codes_reader.c000066400000000000000000000037721477627663500201630ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/binary_codes_reader.h" #include "aom_dsp/recenter.h" #define read_primitive_quniform(r, n, ACCT_STR_NAME) \ read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) #define read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) static uint16_t read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); } // Decode finite subexponential code that for a symbol v in [0, n-1] with // parameter k static uint16_t read_primitive_subexpfin_(aom_reader *r, uint16_t n, uint16_t k ACCT_STR_PARAM) { int i = 0; int mk = 0; while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { return read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; } if (!aom_read_bit(r, ACCT_STR_NAME)) { return aom_read_literal(r, b, ACCT_STR_NAME) + mk; } i = i + 1; mk += a; } assert(0); return 0; } uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM) { return inv_recenter_finite_nonneg( n, ref, read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); } aom-3.12.1/aom_dsp/binary_codes_reader.h000066400000000000000000000022631477627663500201620ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_ #define AOM_AOM_DSP_BINARY_CODES_READER_H_ #ifdef __cplusplus extern "C" { #endif #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" #define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BINARY_CODES_READER_H_ aom-3.12.1/aom_dsp/binary_codes_writer.c000066400000000000000000000062071477627663500202310ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/bitwriter.h" #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/recenter.h" #include "aom_ports/bitops.h" // Encodes a value v in [0, n-1] quasi-uniformly static void write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { if (n <= 1) return; const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_write_literal(w, v, l - 1); } else { aom_write_literal(w, m + ((v - m) >> 1), l - 1); aom_write_bit(w, (v - m) & 1); } } static int count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; const int l = get_msb(n) + 1; const int m = (1 << l) - n; return v < m ? l - 1 : l; } // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k static void write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, uint16_t v) { int i = 0; int mk = 0; while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { write_primitive_quniform(w, n - mk, v - mk); break; } else { int t = (v >= mk + a); aom_write_bit(w, t); if (t) { i = i + 1; mk += a; } else { aom_write_literal(w, v - mk, b); break; } } } } static int count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { int count = 0; int i = 0; int mk = 0; while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { count += count_primitive_quniform(n - mk, v - mk); break; } else { int t = (v >= mk + a); count++; if (t) { i = i + 1; mk += a; } else { count += b; break; } } } return count; } // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k // based on a reference ref also in [0, n-1]. // Recenters symbol around r first and then uses a finite subexponential code. 
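// Worked example (illustrative): with n = 8 and k = 0 the plain
// subexponential code spends 1 bit on v = 0, 2 bits on v = 1, 4 bits on
// v = 2..3 and 5 bits on v = 4..7 (the final bucket falls back to the
// quasi-uniform code above). Recentering around 'ref' first therefore gives
// the short codewords to values close to the reference.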
void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); } int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { return count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); } int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, int16_t v) { ref += n - 1; v += n - 1; const uint16_t scaled_n = (n << 1) - 1; return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); } aom-3.12.1/aom_dsp/binary_codes_writer.h000066400000000000000000000027171477627663500202400ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_ #define AOM_AOM_DSP_BINARY_CODES_WRITER_H_ #ifdef __cplusplus extern "C" { #endif #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitwriter.h" #include "aom_dsp/bitwriter_buffer.h" // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k // based on a reference ref also in [0, n-1]. void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, uint16_t ref, uint16_t v); // Functions that counts bits for the above primitives int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v); int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, int16_t v); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ aom-3.12.1/aom_dsp/bitreader.c000066400000000000000000000025661477627663500161410ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "aom_dsp/bitreader.h" int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) { if (size && !buffer) { return 1; } r->buffer_end = buffer + size; r->buffer = buffer; od_ec_dec_init(&r->ec, buffer, (uint32_t)size); #if CONFIG_ACCOUNTING r->accounting = NULL; #endif return 0; } const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; } const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; } uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); } uint32_t aom_reader_tell_frac(const aom_reader *r) { return od_ec_dec_tell_frac(&r->ec); } int aom_reader_has_overflowed(const aom_reader *r) { const uint32_t tell_bits = aom_reader_tell(r); const uint32_t tell_bytes = (tell_bits + 7) >> 3; return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); } aom-3.12.1/aom_dsp/bitreader.h000066400000000000000000000160021477627663500161340ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BITREADER_H_ #define AOM_AOM_DSP_BITREADER_H_ #include #include #include "config/aom_config.h" #include "aom/aomdx.h" #include "aom/aom_integer.h" #include "aom_dsp/entdec.h" #include "aom_dsp/odintrin.h" #include "aom_dsp/prob.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG #if CONFIG_ACCOUNTING #include "av1/decoder/accounting.h" #define ACCT_STR_NAME acct_str #define ACCT_STR_PARAM , const char *ACCT_STR_NAME #define ACCT_STR_ARG(s) , s #else #define ACCT_STR_PARAM #define ACCT_STR_ARG(s) #endif #define aom_read(r, prob, ACCT_STR_NAME) \ aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_bit(r, ACCT_STR_NAME) \ aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \ aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_literal(r, bits, ACCT_STR_NAME) \ aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \ aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) #ifdef __cplusplus extern "C" { #endif struct aom_reader { const uint8_t *buffer; const uint8_t *buffer_end; od_ec_dec ec; #if CONFIG_ACCOUNTING Accounting *accounting; #endif uint8_t allow_update_cdf; }; typedef struct aom_reader aom_reader; int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size); const uint8_t *aom_reader_find_begin(aom_reader *r); const uint8_t *aom_reader_find_end(aom_reader *r); // Returns true if the bit reader has tried to decode more data from the buffer // than was actually provided. int aom_reader_has_overflowed(const aom_reader *r); // Returns the position in the bit reader in bits. uint32_t aom_reader_tell(const aom_reader *r); // Returns the position in the bit reader in 1/8th bits. 
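// (That is, aom_reader_tell() scaled to 1/8th-bit units via
// od_ec_dec_tell_frac().)
//
// Minimal decode-side usage sketch (illustrative only: the payload layout and
// the consume() call are made-up placeholders, and error handling is elided):
//
//   aom_reader r;
//   if (aom_reader_init(&r, data, data_size)) return;  // non-zero on failure
//   const int flag = aom_read_bit(&r, NULL);
//   const int level = aom_read_literal(&r, 4, NULL);   // a 4-bit field
//   consume(flag, level, aom_reader_tell(&r));
//
// The fractional-precision position query is declared next: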
uint32_t aom_reader_tell_frac(const aom_reader *r); #if CONFIG_ACCOUNTING static inline void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { if (r->accounting != NULL) { uint32_t tell_frac; tell_frac = aom_reader_tell_frac(r); aom_accounting_record(r->accounting, ACCT_STR_NAME, tell_frac - r->accounting->last_tell_frac); r->accounting->last_tell_frac = tell_frac; } } static inline void aom_update_symb_counts(const aom_reader *r, int is_binary) { if (r->accounting != NULL) { r->accounting->syms.num_multi_syms += !is_binary; r->accounting->syms.num_binary_syms += !!is_binary; } } #endif static inline int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { int p = (0x7FFFFF - (prob << 15) + prob) >> 8; int bit = od_ec_decode_bool_q15(&r->ec, p); #if CONFIG_BITSTREAM_DEBUG { int i; int ref_bit, ref_nsymbs; aom_cdf_prob ref_cdf[16]; const int queue_r = bitstream_queue_get_read(); const int frame_idx = aom_bitstream_queue_get_frame_read(); bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); if (ref_nsymbs != 2) { fprintf(stderr, "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " "%d queue_r %d\n", frame_idx, 2, ref_nsymbs, queue_r); assert(0); } if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || (ref_cdf[1] != 32767)) { fprintf(stderr, "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", frame_idx, p, 32767, ref_cdf[0]); for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); fprintf(stderr, "} queue_r %d\n", queue_r); assert(0); } if (bit != ref_bit) { fprintf(stderr, "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " "queue_r %d\n", frame_idx, bit, ref_bit, queue_r); assert(0); } } #endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); aom_update_symb_counts(r, 1); #endif return bit; } static inline int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { int ret; ret = aom_read(r, 128, NULL); // aom_prob_half #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); #endif return ret; } static inline int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { int literal = 0, bit; for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); #endif return literal; } static inline int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int symb; assert(cdf != NULL); symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); #if CONFIG_BITSTREAM_DEBUG { int i; int cdf_error = 0; int ref_symb, ref_nsymbs; aom_cdf_prob ref_cdf[16]; const int queue_r = bitstream_queue_get_read(); const int frame_idx = aom_bitstream_queue_get_frame_read(); bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); if (nsymbs != ref_nsymbs) { fprintf(stderr, "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " "queue_r %d\n", frame_idx, nsymbs, ref_nsymbs, queue_r); cdf_error = 0; assert(0); } else { for (i = 0; i < nsymbs; ++i) if (cdf[i] != ref_cdf[i]) cdf_error = 1; } if (cdf_error) { fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, cdf[0]); for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); fprintf(stderr, "} queue_r %d\n", queue_r); assert(0); } if (symb != ref_symb) { fprintf( stderr, "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n", frame_idx, symb, ref_symb, queue_r); assert(0); } } 
#endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); aom_update_symb_counts(r, (nsymbs == 2)); #endif return symb; } static inline int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs); return ret; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BITREADER_H_ aom-3.12.1/aom_dsp/bitreader_buffer.c000066400000000000000000000072461477627663500174720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/recenter.h" #include "aom_ports/bitops.h" size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) { return (rb->bit_offset + 7) >> 3; } int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { const uint32_t off = rb->bit_offset; const uint32_t p = off >> 3; const int q = 7 - (int)(off & 0x7); if (rb->bit_buffer + p < rb->bit_buffer_end) { const int bit = (rb->bit_buffer[p] >> q) & 1; rb->bit_offset = off + 1; return bit; } else { if (rb->error_handler) rb->error_handler(rb->error_handler_data); return 0; } } int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { assert(bits <= 31); int value = 0, bit; for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; return value; } #if CONFIG_AV1_DECODER uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits) { assert(bits <= 32); uint32_t value = 0; int bit; for (bit = bits - 1; bit >= 0; bit--) value |= (uint32_t)aom_rb_read_bit(rb) << bit; return value; } int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { const int nbits = sizeof(unsigned) * 8 - bits - 1; const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; return ((int)value) >> nbits; } #endif // CONFIG_AV1_DECODER uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { int leading_zeros = 0; while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros; // Maximum 32 bits. if (leading_zeros == 32) return UINT32_MAX; const uint32_t base = (1u << leading_zeros) - 1; const uint32_t value = aom_rb_read_literal(rb, leading_zeros); return base + value; } #if CONFIG_AV1_DECODER static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, uint16_t n) { if (n <= 1) return 0; const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_rb_read_literal(rb, l - 1); return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); } static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k) { int i = 0; int mk = 0; while (1) { int b = (i ? 
k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { return aom_rb_read_primitive_quniform(rb, n - mk) + mk; } if (!aom_rb_read_bit(rb)) { return aom_rb_read_literal(rb, b) + mk; } i = i + 1; mk += a; } assert(0); return 0; } static uint16_t aom_rb_read_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) { return inv_recenter_finite_nonneg(n, ref, aom_rb_read_primitive_subexpfin(rb, n, k)); } int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { ref += n - 1; const uint16_t scaled_n = (n << 1) - 1; return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; } #endif // CONFIG_AV1_DECODER aom-3.12.1/aom_dsp/bitreader_buffer.h000066400000000000000000000031541477627663500174710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_ #define AOM_AOM_DSP_BITREADER_BUFFER_H_ #include #include "aom/aom_integer.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif typedef void (*aom_rb_error_handler)(void *data); struct aom_read_bit_buffer { const uint8_t *bit_buffer; const uint8_t *bit_buffer_end; uint32_t bit_offset; void *error_handler_data; aom_rb_error_handler error_handler; }; size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb); int aom_rb_read_bit(struct aom_read_bit_buffer *rb); int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits); uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); #if CONFIG_AV1_DECODER uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); #endif // CONFIG_AV1_DECODER #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BITREADER_BUFFER_H_ aom-3.12.1/aom_dsp/bitwriter.c000066400000000000000000000021711477627663500162030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom_dsp/bitwriter.h" void aom_start_encode(aom_writer *w, uint8_t *source) { w->buffer = source; w->pos = 0; od_ec_enc_init(&w->ec, 62025); } int aom_stop_encode(aom_writer *w) { int nb_bits; uint32_t bytes; unsigned char *data; data = od_ec_enc_done(&w->ec, &bytes); if (!data) { od_ec_enc_clear(&w->ec); return -1; } nb_bits = od_ec_enc_tell(&w->ec); memcpy(w->buffer, data, bytes); w->pos = bytes; od_ec_enc_clear(&w->ec); return nb_bits; } int aom_tell_size(aom_writer *w) { const int nb_bits = od_ec_enc_tell(&w->ec); return nb_bits; } aom-3.12.1/aom_dsp/bitwriter.h000066400000000000000000000055221477627663500162130ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BITWRITER_H_ #define AOM_AOM_DSP_BITWRITER_H_ #include #include "config/aom_config.h" #include "aom_dsp/entenc.h" #include "aom_dsp/prob.h" #if CONFIG_RD_DEBUG #include "av1/common/blockd.h" #include "av1/encoder/cost.h" #endif #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { #endif struct aom_writer { unsigned int pos; uint8_t *buffer; od_ec_enc ec; uint8_t allow_update_cdf; }; typedef struct aom_writer aom_writer; typedef struct TOKEN_STATS { int cost; #if CONFIG_RD_DEBUG int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; #endif } TOKEN_STATS; static inline void init_token_stats(TOKEN_STATS *token_stats) { #if CONFIG_RD_DEBUG int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { token_stats->txb_coeff_cost_map[r][c] = 0; } } #endif token_stats->cost = 0; } void aom_start_encode(aom_writer *w, uint8_t *buffer); // Returns a negative number on error. Caller must check the return value and // handle error. 
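// (The failure case corresponds to od_ec_enc_done() returning NULL inside
// aom_stop_encode(); on success the return value is the bit count and w->pos
// holds the number of bytes copied into the buffer passed to
// aom_start_encode().)
//
// Minimal encode-side usage sketch (illustrative only: the 64-byte buffer and
// the values written are arbitrary examples, and error handling is elided):
//
//   uint8_t buf[64];
//   aom_writer w;
//   w.allow_update_cdf = 0;
//   aom_start_encode(&w, buf);
//   aom_write_bit(&w, 1);
//   aom_write_literal(&w, 0xA, 4);
//   const int nb_bits = aom_stop_encode(&w);  // negative on error
//
// The finalization and size queries are declared next: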
int aom_stop_encode(aom_writer *w); int aom_tell_size(aom_writer *w); static inline void aom_write(aom_writer *w, int bit, int probability) { int p = (0x7FFFFF - (probability << 15) + probability) >> 8; #if CONFIG_BITSTREAM_DEBUG aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; bitstream_queue_push(bit, cdf, 2); #endif od_ec_encode_bool_q15(&w->ec, bit, p); } static inline void aom_write_bit(aom_writer *w, int bit) { aom_write(w, bit, 128); // aom_prob_half } static inline void aom_write_literal(aom_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); } static inline void aom_write_cdf(aom_writer *w, int symb, const aom_cdf_prob *cdf, int nsymbs) { #if CONFIG_BITSTREAM_DEBUG bitstream_queue_push(symb, cdf, nsymbs); #endif od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs); } static inline void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, int nsymbs) { aom_write_cdf(w, symb, cdf, nsymbs); if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BITWRITER_H_ aom-3.12.1/aom_dsp/bitwriter_buffer.c000066400000000000000000000106011477627663500175310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "aom_dsp/bitwriter_buffer.h" #include "aom_dsp/recenter.h" #include "aom_ports/bitops.h" int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) { return (wb->bit_offset % CHAR_BIT == 0); } uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); } void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) { const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; if (q == CHAR_BIT - 1) { // Zero next char and write bit wb->bit_buffer[p] = bit << q; } else { wb->bit_buffer[p] &= ~(1 << q); wb->bit_buffer[p] |= bit << q; } wb->bit_offset = off + 1; } static void overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { // Do not zero bytes but overwrite exisiting values const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; wb->bit_buffer[p] &= ~(1 << q); wb->bit_buffer[p] |= bit << q; wb->bit_offset = off + 1; } void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { assert(bits <= 31); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, uint32_t data, int bits) { assert(bits <= 32); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) overwrite_bit(wb, (data >> bit) & 1); } void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits) { aom_wb_write_literal(wb, data, bits + 1); 
} void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { int64_t shift_val = ++v; int leading_zeroes = 1; assert(shift_val > 0); while (shift_val >>= 1) leading_zeroes += 2; aom_wb_write_literal(wb, 0, leading_zeroes >> 1); aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); } static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t v) { if (n <= 1) return; const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_wb_write_literal(wb, v, l - 1); } else { aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); aom_wb_write_bit(wb, (v - m) & 1); } } static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t k, uint16_t v) { int i = 0; int mk = 0; while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { wb_write_primitive_quniform(wb, n - mk, v - mk); break; } else { int t = (v >= mk + a); aom_wb_write_bit(wb, t); if (t) { i = i + 1; mk += a; } else { aom_wb_write_literal(wb, v - mk, b); break; } } } } static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); } void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t k, int16_t ref, int16_t v) { ref += n - 1; v += n - 1; const uint16_t scaled_n = (n << 1) - 1; wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); } aom-3.12.1/aom_dsp/bitwriter_buffer.h000066400000000000000000000034161477627663500175440ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ #define AOM_AOM_DSP_BITWRITER_BUFFER_H_ #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif struct aom_write_bit_buffer { uint8_t *bit_buffer; uint32_t bit_offset; }; int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, uint32_t data, int bits); void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, int bits); void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits); void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t k, int16_t ref, int16_t v); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ aom-3.12.1/aom_dsp/blend.h000066400000000000000000000034111477627663500152570ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BLEND_H_ #define AOM_AOM_DSP_BLEND_H_ #include "aom_ports/mem.h" // Various blending functions and macros. // See also the aom_blend_* functions in aom_dsp_rtcd.h // Alpha blending with alpha values from the range [0, 64], where 64 // means use the first input and 0 means use the second input. #define AOM_BLEND_A64_ROUND_BITS 6 #define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 #define AOM_BLEND_A64(a, v0, v1) \ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ AOM_BLEND_A64_ROUND_BITS) // Alpha blending with alpha values from the range [0, 256], where 256 // means use the first input and 0 means use the second input. #define AOM_BLEND_A256_ROUND_BITS 8 #define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 #define AOM_BLEND_A256(a, v0, v1) \ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ AOM_BLEND_A256_ROUND_BITS) // Blending by averaging. #define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) #define DIFF_FACTOR_LOG2 4 #define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) #endif // AOM_AOM_DSP_BLEND_H_ aom-3.12.1/aom_dsp/blend_a64_hmask.c000066400000000000000000000044761477627663500171230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "config/aom_dsp_rtcd.h" void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { dst[i * dst_stride + j] = AOM_BLEND_A64( mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); (void)bd; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { dst[i * dst_stride + j] = AOM_BLEND_A64( mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #endif aom-3.12.1/aom_dsp/blend_a64_mask.c000066400000000000000000000322021477627663500167370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/blend.h" #include "aom_dsp/aom_dsp_common.h" #include "config/aom_dsp_rtcd.h" // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can // be the same as dst, or dst can be different from both sources. // NOTE(rachelbarker): The input and output of aom_blend_a64_d16_mask_c() are // in a higher intermediate precision, and will later be rounded down to pixel // precision. // Thus, in order to avoid double-rounding, we want to use normal right shifts // within this function, not ROUND_POWER_OF_TWO. // This works because of the identity: // ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z) // // In contrast, the output of the non-d16 functions will not be further rounded, // so we *should* use ROUND_POWER_OF_TWO there. 
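//
// A quick check of that identity (illustrative): taking ROUND_POWER_OF_TWO(x, n)
// as (x + (1 << (n - 1))) >> n for n >= 1, write x = q * 2^y + r with
// 0 <= r < 2^y. Then ROUND_POWER_OF_TWO(x, y + z) equals
// floor((q + 2^(z-1) + r / 2^y) / 2^z), and since 0 <= r / 2^y < 1 the
// fractional term cannot move the floor, so it matches
// ROUND_POWER_OF_TWO(x >> y, z). For example, x = 13, y = 1, z = 2 gives 2 on
// both sides.
//
// The intermediate-precision (d16) blend follows: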
void aom_lowbd_blend_a64_d16_mask_c( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { int i, j; const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { int32_t res; const int m = mask[i * mask_stride + j]; res = ((m * (int32_t)src0[i * src0_stride + j] + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)src1[i * src1_stride + j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; dst[i * dst_stride + j] = clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { int32_t res; const int m = ROUND_POWER_OF_TWO( mask[(2 * i) * mask_stride + (2 * j)] + mask[(2 * i + 1) * mask_stride + (2 * j)] + mask[(2 * i) * mask_stride + (2 * j + 1)] + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], 2); res = ((m * (int32_t)src0[i * src0_stride + j] + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)src1[i * src1_stride + j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; dst[i * dst_stride + j] = clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else if (subw == 1 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { int32_t res; const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); res = ((m * (int32_t)src0[i * src0_stride + j] + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)src1[i * src1_stride + j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; dst[i * dst_stride + j] = clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { int32_t res; const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + (AOM_BLEND_A64_MAX_ALPHA - m) * (int32_t)src1[i * src1_stride + j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; dst[i * dst_stride + j] = clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_d16_mask_c( uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd) { const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); // excerpt from clip_pixel_highbd() // set saturation_value to (1 << bd) - 1 unsigned 
int saturation_value; switch (bd) { case 8: default: saturation_value = 255; break; case 10: saturation_value = 1023; break; case 12: saturation_value = 4095; break; } if (subw == 0 && subh == 0) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int32_t res; const int m = mask[j]; res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> AOM_BLEND_A64_ROUND_BITS); res -= round_offset; unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); dst[j] = AOMMIN(v, saturation_value); } mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } } else if (subw == 1 && subh == 1) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int32_t res; const int m = ROUND_POWER_OF_TWO( mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + mask[mask_stride + 2 * j + 1], 2); res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> AOM_BLEND_A64_ROUND_BITS; res -= round_offset; unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); dst[j] = AOMMIN(v, saturation_value); } mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } } else if (subw == 1 && subh == 0) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int32_t res; const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> AOM_BLEND_A64_ROUND_BITS; res -= round_offset; unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); dst[j] = AOMMIN(v, saturation_value); } mask += mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int32_t res; const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> AOM_BLEND_A64_ROUND_BITS; res -= round_offset; unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); dst[j] = AOMMIN(v, saturation_value); } mask += 2 * mask_stride; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } } } #endif // CONFIG_AV1_HIGHBITDEPTH // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can // be the same as dst, or dst can be different from both sources. 
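//
// Worked example of the per-pixel formula (illustrative): AOM_BLEND_A64(m, v0, v1)
// expands to ROUND_POWER_OF_TWO(m * v0 + (64 - m) * v1, 6), so m = 48,
// v0 = 200, v1 = 100 gives (48 * 200 + 16 * 100 + 32) >> 6 = 11232 >> 6 = 175,
// i.e. three quarters of the way from v1 towards v0.
//
// The pixel-domain blend follows: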
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = ROUND_POWER_OF_TWO( mask[(2 * i) * mask_stride + (2 * j)] + mask[(2 * i + 1) * mask_stride + (2 * j)] + mask[(2 * i) * mask_stride + (2 * j + 1)] + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], 2); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); (void)bd; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if (subw == 0 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = ROUND_POWER_OF_TWO( mask[(2 * i) * mask_stride + (2 * j)] + mask[(2 * i + 1) * mask_stride + (2 * j)] + mask[(2 * i) * mask_stride + (2 * j + 1)] + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], 2); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } } #endif // CONFIG_AV1_HIGHBITDEPTH 
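// Usage sketch (illustrative only; the 16x16 block size, the strides and the
// buffer contents are made-up example values):
//
//   uint8_t dst[16 * 16], src0[16 * 16], src1[16 * 16];
//   uint8_t mask[16 * 16];  // per-pixel weights in [0, 64]
//   // ... fill src0, src1 and mask ...
//   aom_blend_a64_mask_c(dst, 16, src0, 16, src1, 16, mask, 16,
//                        16, 16, 0, 0);  // subw = 0, subh = 0
//
// With subw == 1 and/or subh == 1 the mask is sampled at twice the block
// resolution in that dimension and averaged, as in the branches above.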
aom-3.12.1/aom_dsp/blend_a64_vmask.c000066400000000000000000000046561477627663500171410ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "config/aom_dsp_rtcd.h" void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); (void)bd; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #endif aom-3.12.1/aom_dsp/blk_sse_sum.c000066400000000000000000000016141477627663500164770ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_dsp_rtcd.h" void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { *x_sum = 0; *x2_sum = 0; for (int i = 0; i < bh; ++i) { for (int j = 0; j < bw; ++j) { const int val = data[j]; *x_sum += val; *x2_sum += val * val; } data += stride; } } aom-3.12.1/aom_dsp/butteraugli.c000066400000000000000000000106311477627663500165170ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/butteraugli.h" #include "aom_mem/aom_mem.h" #include "third_party/libyuv/include/libyuv/convert_argb.h" int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, aom_matrix_coefficients_t matrix_coefficients, aom_color_range_t color_range, float *dist_map) { (void)bit_depth; assert(bit_depth == 8); const int width = source->y_crop_width; const int height = source->y_crop_height; const int ss_x = source->subsampling_x; const int ss_y = source->subsampling_y; const struct YuvConstants *yuv_constants; if (matrix_coefficients == AOM_CICP_MC_BT_709) { if (color_range == AOM_CR_FULL_RANGE) return 0; yuv_constants = &kYuvH709Constants; } else { yuv_constants = color_range == AOM_CR_FULL_RANGE ? &kYuvJPEGConstants : &kYuvI601Constants; } const int stride_argb = width * 4; const size_t buffer_size = (size_t)height * stride_argb; uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size); uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size); if (!src_argb || !distorted_argb) { aom_free(src_argb); aom_free(distorted_argb); return 0; } if (ss_x == 1 && ss_y == 1) { I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, source->uv_stride, source->v_buffer, source->uv_stride, src_argb, stride_argb, yuv_constants, width, height); I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride, distorted->u_buffer, distorted->uv_stride, distorted->v_buffer, distorted->uv_stride, distorted_argb, stride_argb, yuv_constants, width, height); } else if (ss_x == 1 && ss_y == 0) { I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, source->uv_stride, source->v_buffer, source->uv_stride, src_argb, stride_argb, yuv_constants, width, height); I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride, distorted->u_buffer, distorted->uv_stride, distorted->v_buffer, distorted->uv_stride, distorted_argb, stride_argb, yuv_constants, width, height); } else if (ss_x == 0 && ss_y == 0) { I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, source->uv_stride, source->v_buffer, source->uv_stride, src_argb, stride_argb, yuv_constants, width, height); I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride, distorted->u_buffer, distorted->uv_stride, distorted->v_buffer, distorted->uv_stride, distorted_argb, stride_argb, yuv_constants, width, height); } else { aom_free(src_argb); aom_free(distorted_argb); return 0; } JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL); JxlButteraugliApiSetHFAsymmetry(api, 0.8f); JxlButteraugliResult *result = JxlButteraugliCompute( api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format, distorted_argb, buffer_size); const float *distmap = NULL; uint32_t row_stride; JxlButteraugliResultGetDistmap(result, &distmap, &row_stride); if (distmap == NULL) { JxlButteraugliApiDestroy(api); JxlButteraugliResultDestroy(result); aom_free(src_argb); aom_free(distorted_argb); return 0; } for (int j = 0; j < height; ++j) { for (int i = 0; i < width; ++i) { dist_map[j * width + i] = distmap[j * row_stride + i]; } } JxlButteraugliApiDestroy(api); 
JxlButteraugliResultDestroy(result); aom_free(src_argb); aom_free(distorted_argb); return 1; } aom-3.12.1/aom_dsp/butteraugli.h000066400000000000000000000017631477627663500165320ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_BUTTERAUGLI_H_ #define AOM_AOM_DSP_BUTTERAUGLI_H_ #include "aom_scale/yv12config.h" // Returns a boolean that indicates success/failure. int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, aom_matrix_coefficients_t matrix_coefficients, aom_color_range_t color_range, float *dist_map); #endif // AOM_AOM_DSP_BUTTERAUGLI_H_ aom-3.12.1/aom_dsp/entcode.c000066400000000000000000000043721477627663500156160ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/entcode.h" /*Given the current total integer number of bits used and the current value of rng, computes the fraction number of bits used to OD_BITRES precision. This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). nbits_total: The number of whole bits currently used, i.e., the value returned by od_ec_enc_tell() or od_ec_dec_tell(). rng: The current value of rng from either the encoder or decoder state. Return: The number of bits scaled by 2**OD_BITRES. This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { uint32_t nbits; int l; int i; /*To handle the non-integral number of bits still left in the encoder/decoder state, we compute the worst-case number of bits of val that must be encoded to ensure that the value is inside the range for any possible subsequent bits. The computation here is independent of val itself (the decoder does not even track that value), even though the real number of bits used after od_ec_enc_done() may be 1 smaller if rng is a power of two and the corresponding trailing bits of val are all zeros. If we did try to track that special case, then coding a value with a probability of 1/(1 << n) might sometimes appear to use more than n bits. This may help explain the surprising result that a newly initialized encoder or decoder claims to have used 1 bit.*/ nbits = nbits_total << OD_BITRES; l = 0; for (i = OD_BITRES; i-- > 0;) { int b; rng = rng * rng >> 15; b = (int)(rng >> 16); l = l << 1 | b; rng >>= b; } return nbits - l; } aom-3.12.1/aom_dsp/entcode.h000066400000000000000000000026121477627663500156160ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ENTCODE_H_ #define AOM_AOM_DSP_ENTCODE_H_ #include #include #include "aom_dsp/odintrin.h" #include "aom_dsp/prob.h" #define EC_PROB_SHIFT 6 #define EC_MIN_PROB 4 // must be <= (1< 1/8th bits.*/ #define OD_BITRES (3) #define OD_ICDF AOM_ICDF /*See entcode.c for further documentation.*/ OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng); #endif // AOM_AOM_DSP_ENTCODE_H_ aom-3.12.1/aom_dsp/entdec.c000066400000000000000000000223271477627663500154370ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/entdec.h" #include "aom_dsp/prob.h" /*A range decoder. This is an entropy decoder based upon \cite{Mar79}, which is itself a rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. It is very similar to arithmetic encoding, except that encoding is done with digits in any base, instead of with bits, and so it is faster when using larger bases (i.e.: a byte). The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ is the base, longer than the theoretical optimum, but to my knowledge there is no published justification for this claim. This only seems true when using near-infinite precision arithmetic so that the process is carried out with no rounding errors. An excellent description of implementation details is available at http://www.arturocampos.com/ac_range.html A recent work \cite{MNW98} which proposes several changes to arithmetic encoding for efficiency actually re-discovers many of the principles behind range encoding, and presents a good theoretical analysis of them. End of stream is handled by writing out the smallest number of bits that ensures that the stream will be correctly decoded regardless of the value of any subsequent bits. od_ec_dec_tell() can be used to determine how many bits were needed to decode all the symbols thus far; other data can be packed in the remaining bits of the input buffer. @PHDTHESIS{Pas76, author="Richard Clark Pasco", title="Source coding algorithms for fast data compression", school="Dept. of Electrical Engineering, Stanford University", address="Stanford, CA", month=May, year=1976, URL="http://www.richpasco.org/scaffdc.pdf" } @INPROCEEDINGS{Mar79, author="Martin, G.N.N.", title="Range encoding: an algorithm for removing redundancy from a digitised message", booktitle="Video & Data Recording Conference", year=1979, address="Southampton", month=Jul, URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" } @ARTICLE{MNW98, author="Alistair Moffat and Radford Neal and Ian H. 
Witten", title="Arithmetic Coding Revisited", journal="{ACM} Transactions on Information Systems", year=1998, volume=16, number=3, pages="256--294", month=Jul, URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" }*/ /*This is meant to be a large, positive constant that can still be efficiently loaded as an immediate (on platforms like ARM, for example). Even relatively modest values like 100 would work fine.*/ #define OD_EC_LOTS_OF_BITS (0x4000) /*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill call.*/ static void od_ec_dec_refill(od_ec_dec *dec) { int s; od_ec_window dif; int16_t cnt; const unsigned char *bptr; const unsigned char *end; dif = dec->dif; cnt = dec->cnt; bptr = dec->bptr; end = dec->end; s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); for (; s >= 0 && bptr < end; s -= 8, bptr++) { /*Each time a byte is inserted into the window (dif), bptr advances and cnt is incremented by 8, so the total number of consumed bits (the return value of od_ec_dec_tell) does not change.*/ assert(s <= OD_EC_WINDOW_SIZE - 8); dif ^= (od_ec_window)bptr[0] << s; cnt += 8; } if (bptr >= end) { /*We've reached the end of the buffer. It is perfectly valid for us to need to fill the window with additional bits past the end of the buffer (and this happens in normal operation). These bits should all just be taken as zero. But we cannot increment bptr past 'end' (this is undefined behavior), so we start to increment dec->tell_offs. We also don't want to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS and adjust dec->tell_offs so that the total number of unconsumed bits in the window (dec->cnt - dec->tell_offs) does not change. This effectively puts lots of zero bits into the window, and means we won't try to refill it from the buffer for a very long time (at which point we'll put lots of zero bits into the window again).*/ dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; cnt = OD_EC_LOTS_OF_BITS; } dec->dif = dif; dec->cnt = cnt; dec->bptr = bptr; } /*Takes updated dif and range values, renormalizes them so that 32768 <= rng < 65536 (reading more bytes from the stream into dif if necessary), and stores them back in the decoder context. dif: The new value of dif. rng: The new value of the range. ret: The value to return. Return: ret. This allows the compiler to jump to this function via a tail-call.*/ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, int ret) { int d; assert(rng <= 65535U); /*The number of leading zeros in the 16-bit binary representation of rng.*/ d = 16 - OD_ILOG_NZ(rng); /*d bits in dec->dif are consumed.*/ dec->cnt -= d; /*This is equivalent to shifting in 1's instead of 0's.*/ dec->dif = ((dif + 1) << d) - 1; dec->rng = rng << d; if (dec->cnt < 0) od_ec_dec_refill(dec); return ret; } /*Initializes the decoder. buf: The input buffer to use. storage: The size in bytes of the input buffer.*/ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) { dec->buf = buf; dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); dec->end = buf + storage; dec->bptr = buf; dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; dec->rng = 0x8000; dec->cnt = -15; od_ec_dec_refill(dec); } /*Decode a single binary value. f: The probability that the bit is one, scaled by 32768. 
Return: The value decoded (0 or 1).*/ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { od_ec_window dif; od_ec_window vw; unsigned r; unsigned r_new; unsigned v; int ret; assert(0 < f); assert(f < 32768U); dif = dec->dif; r = dec->rng; assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); assert(32768U <= r); v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); v += EC_MIN_PROB; vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); ret = 1; r_new = v; if (dif >= vw) { r_new = r - v; dif -= vw; ret = 0; } return od_ec_dec_normalize(dec, dif, r_new, ret); } /*Decodes a symbol given an inverse cumulative distribution function (CDF) table in Q15. icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). The values must be monotonically non-increasing, and icdf[nsyms - 1] must be 0. nsyms: The number of symbols in the alphabet. This should be at most 16. Return: The decoded symbol s.*/ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { od_ec_window dif; unsigned r; unsigned c; unsigned u; unsigned v; int ret; (void)nsyms; dif = dec->dif; r = dec->rng; const int N = nsyms - 1; assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); assert(32768U <= r); assert(7 - EC_PROB_SHIFT >= 0); c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); v = r; ret = -1; do { u = v; v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); v += EC_MIN_PROB * (N - ret); } while (c < v); assert(v < u); assert(u <= r); r = u - v; dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); return od_ec_dec_normalize(dec, dif, r, ret); } /*Returns the number of bits "used" by the decoded symbols so far. This same number can be computed in either the encoder or the decoder, and is suitable for making coding decisions. Return: The number of bits. This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ int od_ec_dec_tell(const od_ec_dec *dec) { /*There is a window of bits stored in dec->dif. The difference (dec->bptr - dec->buf) tells us how many bytes have been read into this window. The difference (dec->cnt - dec->tell_offs) tells us how many of the bits in that window remain unconsumed.*/ return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); } /*Returns the number of bits "used" by the decoded symbols so far. This same number can be computed in either the encoder or the decoder, and is suitable for making coding decisions. Return: The number of bits scaled by 2**OD_BITRES. This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); } aom-3.12.1/aom_dsp/entdec.h000066400000000000000000000056011477627663500154400ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_ENTDEC_H_ #define AOM_AOM_DSP_ENTDEC_H_ #include #include "aom_dsp/entcode.h" #ifdef __cplusplus extern "C" { #endif typedef struct od_ec_dec od_ec_dec; #if defined(OD_ACCOUNTING) && OD_ACCOUNTING #define OD_ACC_STR , char *acc_str #define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) #else #define OD_ACC_STR #define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) #endif /*The entropy decoder context.*/ struct od_ec_dec { /*The start of the current input buffer.*/ const unsigned char *buf; /*An offset used to keep track of tell after reaching the end of the stream. This is constant throughout most of the decoding process, but becomes important once we hit the end of the buffer and stop incrementing bptr (and instead pretend cnt has lots of bits).*/ int32_t tell_offs; /*The end of the current input buffer.*/ const unsigned char *end; /*The read pointer for the entropy-coded bits.*/ const unsigned char *bptr; /*The difference between the high end of the current range, (low + rng), and the coded value, minus 1. This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the decoder only uses the top 16 bits of the window to decode the next symbol. As we shift up during renormalization, if we don't have enough bits left in the window to fill the top 16, we'll read in more bits of the coded value.*/ od_ec_window dif; /*The number of values in the current range.*/ uint16_t rng; /*The number of bits of data in the current value.*/ int16_t cnt; }; /*See entdec.c for further documentation.*/ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) OD_ARG_NONNULL(1); OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) OD_ARG_NONNULL(1); OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) OD_ARG_NONNULL(1); OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) OD_ARG_NONNULL(1); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_ENTDEC_H_ aom-3.12.1/aom_dsp/entenc.c000066400000000000000000000261051477627663500154470ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom_dsp/entenc.h" #include "aom_dsp/prob.h" #if OD_MEASURE_EC_OVERHEAD #if !defined(M_LOG2E) #define M_LOG2E (1.4426950408889634073599246810019) #endif #define OD_LOG2(x) (M_LOG2E * log(x)) #endif // OD_MEASURE_EC_OVERHEAD /*A range encoder. See entdec.c and the references for implementation details \cite{Mar79,MNW98}. 
@INPROCEEDINGS{Mar79, author="Martin, G.N.N.", title="Range encoding: an algorithm for removing redundancy from a digitised message", booktitle="Video \& Data Recording Conference", year=1979, address="Southampton", month=Jul, URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" } @ARTICLE{MNW98, author="Alistair Moffat and Radford Neal and Ian H. Witten", title="Arithmetic Coding Revisited", journal="{ACM} Transactions on Information Systems", year=1998, volume=16, number=3, pages="256--294", month=Jul, URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" }*/ /*Takes updated low and range values, renormalizes them so that 32768 <= rng < 65536 (flushing bytes from low to the output buffer if necessary), and stores them back in the encoder context. low: The new value of low. rng: The new value of the range.*/ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low, unsigned rng) { int d; int c; int s; if (enc->error) return; c = enc->cnt; assert(rng <= 65535U); /*The number of leading zeros in the 16-bit binary representation of rng.*/ d = 16 - OD_ILOG_NZ(rng); s = c + d; /* We flush every time "low" cannot safely and efficiently accommodate any more data. Overall, c must not exceed 63 at the time of byte flush out. To facilitate this, "s" cannot exceed 56-bits because we have to keep 1 byte for carry. Also, we need to subtract 16 because we want to keep room for the next symbol worth "d"-bits (max 15). An alternate condition would be if (e < d), where e = number of leading zeros in "low", indicating there is not enough rooom to accommodate "rng" worth of "d"-bits in "low". However, this approach needs additional computations: (i) compute "e", (ii) push the leading 0x00's as a special case. */ if (s >= 40) { // 56 - 16 unsigned char *out = enc->buf; uint32_t storage = enc->storage; uint32_t offs = enc->offs; if (offs + 8 > storage) { storage = 2 * storage + 8; out = (unsigned char *)realloc(out, sizeof(*out) * storage); if (out == NULL) { enc->error = -1; return; } enc->buf = out; enc->storage = storage; } // Need to add 1 byte here since enc->cnt always counts 1 byte less // (enc->cnt = -9) to ensure correct operation uint8_t num_bytes_ready = (s >> 3) + 1; // Update "c" to contain the number of non-ready bits in "low". Since "low" // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take // off the number of ready bits. c += 24 - (num_bytes_ready << 3); // Prepare "output" and update "low" uint64_t output = low >> c; low = low & (((uint64_t)1 << c) - 1); // Prepare data and carry mask uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3); uint64_t carry = output & mask; mask = mask - 0x01; output = output & mask; // Write data in a single operation write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs, num_bytes_ready); // Update state of the encoder: enc->cnt to contain the number of residual // bits s = c + d - 24; } enc->low = low << d; enc->rng = rng << d; enc->cnt = s; } /*Initializes the encoder. 
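A minimal usage sketch (the 1024-byte initial size is arbitrary; the buffer
 grows as needed, and od_ec_enc_done() returns NULL on any encoding or
 allocation error):
   od_ec_enc enc;
   od_ec_enc_init(&enc, 1024);
   od_ec_encode_bool_q15(&enc, 1, 16384);
   uint32_t nbytes;
   unsigned char *data = od_ec_enc_done(&enc, &nbytes);
   if (data != NULL) {
     // ... consume nbytes bytes starting at data ...
   }
   od_ec_enc_clear(&enc);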
size: The initial size of the buffer, in bytes.*/ void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { od_ec_enc_reset(enc); enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); enc->storage = size; if (size > 0 && enc->buf == NULL) { enc->storage = 0; enc->error = -1; } } /*Reinitializes the encoder.*/ void od_ec_enc_reset(od_ec_enc *enc) { enc->offs = 0; enc->low = 0; enc->rng = 0x8000; /*This is initialized to -9 so that it crosses zero after we've accumulated one byte + one carry bit.*/ enc->cnt = -9; enc->error = 0; #if OD_MEASURE_EC_OVERHEAD enc->entropy = 0; enc->nb_symbols = 0; #endif } /*Frees the buffers used by the encoder.*/ void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); } /*Encodes a symbol given its frequency in Q15. fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come before the one to be encoded. fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and including the one to be encoded.*/ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, int nsyms) { od_ec_enc_window l; unsigned r; unsigned u; unsigned v; l = enc->low; r = enc->rng; assert(32768U <= r); assert(fh <= fl); assert(fl <= 32768U); assert(7 - EC_PROB_SHIFT >= 0); const int N = nsyms - 1; if (fl < CDF_PROB_TOP) { u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB * (N - (s - 1)); v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB * (N - (s + 0)); l += r - u; r = u - v; } else { r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB * (N - (s + 0)); } od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.); enc->nb_symbols++; #endif } /*Encode a single binary value. val: The value to encode (0 or 1). f: The probability that the val is one, scaled by 32768.*/ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { od_ec_enc_window l; unsigned r; unsigned v; assert(0 < f); assert(f < 32768U); l = enc->low; r = enc->rng; assert(32768U <= r); v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); v += EC_MIN_PROB; if (val) l += r - v; r = val ? v : r - v; od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); enc->nb_symbols++; #endif } /*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. s: The index of the symbol to encode. icdf: 32768 minus the CDF, such that symbol s falls in the range [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). The values must be monotonically decreasing, and icdf[nsyms - 1] must be 0. nsyms: The number of symbols in the alphabet. This should be at most 16.*/ void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, int nsyms) { (void)nsyms; assert(s >= 0); assert(s < nsyms); assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); } #if OD_MEASURE_EC_OVERHEAD #include #endif /*Indicates that there are no more symbols to encode. All remaining output bytes are flushed to the output buffer. od_ec_enc_reset() should be called before using the encoder again. bytes: Returns the size of the encoded data in the returned buffer. 
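(The returned pointer is the encoder's own internal buffer, not a separate allocation; it is released by od_ec_enc_clear().)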
Return: A pointer to the start of the final buffer, or NULL if there was an encoding error.*/ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { unsigned char *out; uint32_t storage; uint32_t offs; od_ec_enc_window m; od_ec_enc_window e; od_ec_enc_window l; int c; int s; if (enc->error) return NULL; #if OD_MEASURE_EC_OVERHEAD { uint32_t tell; /* Don't count the 1 bit we lose to raw bits as overhead. */ tell = od_ec_enc_tell(enc) - 1; fprintf(stderr, "overhead: %f%%\n", 100 * (tell - enc->entropy) / enc->entropy); fprintf(stderr, "efficiency: %f bits/symbol\n", (double)tell / enc->nb_symbols); } #endif l = enc->low; c = enc->cnt; s = 10; m = 0x3FFF; e = ((l + m) & ~m) | (m + 1); s += c; offs = enc->offs; /*Make sure there's enough room for the entropy-coded bits.*/ out = enc->buf; storage = enc->storage; const int s_bits = (s + 7) >> 3; int b = OD_MAXI(s_bits, 0); if (offs + b > storage) { storage = offs + b; out = (unsigned char *)realloc(out, sizeof(*out) * storage); if (out == NULL) { enc->error = -1; return NULL; } enc->buf = out; enc->storage = storage; } /*We output the minimum number of bits that ensures that the symbols encoded thus far will be decoded correctly regardless of the bits that follow.*/ if (s > 0) { uint64_t n; n = ((uint64_t)1 << (c + 16)) - 1; do { assert(offs < storage); uint16_t val = (uint16_t)(e >> (c + 16)); out[offs] = (unsigned char)(val & 0x00FF); if (val & 0x0100) { assert(offs > 0); propagate_carry_bwd(out, offs - 1); } offs++; e &= n; s -= 8; c -= 8; n >>= 8; } while (s > 0); } *nbytes = offs; return out; } /*Returns the number of bits "used" by the encoded symbols so far. This same number can be computed in either the encoder or the decoder, and is suitable for making coding decisions. Warning: The value returned by this function can decrease compared to an earlier call, even after encoding more data, if there is an encoding error (i.e., a failure to allocate enough space for the output buffer). Return: The number of bits. This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ int od_ec_enc_tell(const od_ec_enc *enc) { /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra bit, which we reserve for terminating the stream.*/ return (enc->cnt + 10) + enc->offs * 8; } /*Returns the number of bits "used" by the encoded symbols so far. This same number can be computed in either the encoder or the decoder, and is suitable for making coding decisions. Warning: The value returned by this function can decrease compared to an earlier call, even after encoding more data, if there is an encoding error (i.e., a failure to allocate enough space for the output buffer). Return: The number of bits scaled by 2**OD_BITRES. This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); } aom-3.12.1/aom_dsp/entenc.h000066400000000000000000000065471477627663500154640ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_ENTENC_H_ #define AOM_AOM_DSP_ENTENC_H_ #include #include "aom_dsp/entcode.h" #include "aom_util/endian_inl.h" #ifdef __cplusplus extern "C" { #endif typedef uint64_t od_ec_enc_window; typedef struct od_ec_enc od_ec_enc; #define OD_MEASURE_EC_OVERHEAD (0) /*The entropy encoder context.*/ struct od_ec_enc { /*Buffered output. This contains only the raw bits until the final call to od_ec_enc_done(), where all the arithmetic-coded data gets prepended to it.*/ unsigned char *buf; /*The size of the buffer.*/ uint32_t storage; /*The offset at which the next entropy-coded byte will be written.*/ uint32_t offs; /*The low end of the current range.*/ od_ec_enc_window low; /*The number of values in the current range.*/ uint16_t rng; /*The number of bits of data in the current value.*/ int16_t cnt; /*Nonzero if an error occurred.*/ int error; #if OD_MEASURE_EC_OVERHEAD double entropy; int nb_symbols; #endif }; /*See entenc.c for further documentation.*/ void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) OD_ARG_NONNULL(1); void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) OD_ARG_NONNULL(1); OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) OD_ARG_NONNULL(1); OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) OD_ARG_NONNULL(1); // buf is the frame bitbuffer, offs is where carry to be added static inline void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { uint16_t sum, carry = 1; do { sum = (uint16_t)buf[offs] + 1; buf[offs--] = (unsigned char)sum; carry = sum >> 8; } while (carry); } // Convert to big-endian byte order and write data to buffer adding the // carry-bit static inline void write_enc_data_to_out_buf(unsigned char *out, uint32_t offs, uint64_t output, uint64_t carry, uint32_t *enc_offs, uint8_t num_bytes_ready) { const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3)); memcpy(&out[offs], ®, 8); // Propagate carry backwards if exists if (carry) { assert(offs > 0); propagate_carry_bwd(out, offs - 1); } *enc_offs = offs + num_bytes_ready; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_ENTENC_H_ aom-3.12.1/aom_dsp/fastssim.c000066400000000000000000000377511477627663500160350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. * * This code was originally written by: Nathan E. Egge, at the Daala * project. 
*/ #include #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/ssim.h" typedef struct fs_level fs_level; typedef struct fs_ctx fs_ctx; #define SSIM_C1 (255 * 255 * 0.01 * 0.01) #define SSIM_C2 (255 * 255 * 0.03 * 0.03) #define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) #define MAX_SSIM_DB 100.0 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) struct fs_level { uint32_t *im1; uint32_t *im2; double *ssim; int w; int h; }; struct fs_ctx { fs_level *level; int nlevels; unsigned *col_buf; }; static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { unsigned char *data; size_t data_size; int lw; int lh; int l; lw = (_w + 1) >> 1; lh = (_h + 1) >> 1; data_size = _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); for (l = 0; l < _nlevels; l++) { size_t im_size; size_t level_size; im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); level_size += im_size; level_size *= sizeof(*_ctx->level[l].ssim); data_size += level_size; lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } data = (unsigned char *)malloc(data_size); if (!data) return -1; _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); lw = (_w + 1) >> 1; lh = (_h + 1) >> 1; for (l = 0; l < _nlevels; l++) { size_t im_size; size_t level_size; _ctx->level[l].w = lw; _ctx->level[l].h = lh; im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); level_size *= sizeof(*_ctx->level[l].ssim); _ctx->level[l].im1 = (uint32_t *)data; _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; data += level_size; _ctx->level[l].ssim = (double *)data; data += im_size * sizeof(*_ctx->level[l].ssim); lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } _ctx->col_buf = (unsigned *)data; return 0; } static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } static void fs_downsample_level(fs_ctx *_ctx, int _l) { const uint32_t *src1; const uint32_t *src2; uint32_t *dst1; uint32_t *dst2; int w2; int h2; int w; int h; int i; int j; w = _ctx->level[_l].w; h = _ctx->level[_l].h; dst1 = _ctx->level[_l].im1; dst2 = _ctx->level[_l].im2; w2 = _ctx->level[_l - 1].w; h2 = _ctx->level[_l - 1].h; src1 = _ctx->level[_l - 1].im1; src2 = _ctx->level[_l - 1].im2; for (j = 0; j < h; j++) { int j0offs; int j1offs; j0offs = 2 * j * w2; j1offs = FS_MINI(2 * j + 1, h2) * w2; for (i = 0; i < w; i++) { int i0; int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, w2); dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + src1[j1offs + i0] + src1[j1offs + i1]; dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + src2[j1offs + i0] + src2[j1offs + i1]; } } } static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, int _s1ystride, const uint8_t *_src2, int _s2ystride, int _w, int _h, uint32_t shift, int buf_is_hbd) { uint32_t *dst1; uint32_t *dst2; int w; int h; int i; int j; w = _ctx->level[0].w; h = _ctx->level[0].h; dst1 = _ctx->level[0].im1; dst2 = _ctx->level[0].im2; for (j = 0; j < h; j++) { int j0; int j1; j0 = 2 * j; j1 = FS_MINI(j0 + 1, _h); for (i = 0; i < w; i++) { int i0; int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, _w); if (!buf_is_hbd) { 
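/*Low bit-depth path: each output sample is the unnormalized sum of a 2x2 block of input samples, with the second row/column index clamped at the frame edge via FS_MINI. The high-bit-depth path below does the same after shifting each sample down by 'shift'.*/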
dst1[j * w + i] = _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; dst2[j * w + i] = _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; } else { uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + (src1s[j0 * _s1ystride + i1] >> shift) + (src1s[j1 * _s1ystride + i0] >> shift) + (src1s[j1 * _s1ystride + i1] >> shift); dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + (src2s[j0 * _s2ystride + i1] >> shift) + (src2s[j1 * _s2ystride + i0] >> shift) + (src2s[j1 * _s2ystride + i1] >> shift); } } } } static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { unsigned *col_sums_x; unsigned *col_sums_y; uint32_t *im1; uint32_t *im2; double *ssim; double c1; int w; int h; int j0offs; int j1offs; int i; int j; double ssim_c1 = SSIM_C1; if (bit_depth == 10) ssim_c1 = SSIM_C1_10; if (bit_depth == 12) ssim_c1 = SSIM_C1_12; w = _ctx->level[_l].w; h = _ctx->level[_l].h; col_sums_x = _ctx->col_buf; col_sums_y = col_sums_x + w; im1 = _ctx->level[_l].im1; im2 = _ctx->level[_l].im2; for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; for (j = 1; j < 4; j++) { j1offs = FS_MINI(j, h - 1) * w; for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } ssim = _ctx->level[_l].ssim; c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { unsigned mux; unsigned muy; int i0; int i1; mux = 5 * col_sums_x[0]; muy = 5 * col_sums_y[0]; for (i = 1; i < 4; i++) { i1 = FS_MINI(i, w - 1); mux += col_sums_x[i1]; muy += col_sums_y[i1]; } for (i = 0; i < w; i++) { ssim[j * w + i] *= (2 * mux * (double)muy + c1) / (mux * (double)mux + muy * (double)muy + c1); if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); mux += col_sums_x[i1] - col_sums_x[i0]; muy += col_sums_x[i1] - col_sums_x[i0]; } } if (j + 1 < h) { j0offs = FS_MAXI(0, j - 4) * w; for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } } } #define FS_COL_SET(_col, _joffs, _ioffs) \ do { \ unsigned gx; \ unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ col_sums_gx2[(_col)] = gx * (double)gx; \ col_sums_gy2[(_col)] = gy * (double)gy; \ col_sums_gxgy[(_col)] = gx * (double)gy; \ } while (0) #define FS_COL_ADD(_col, _joffs, _ioffs) \ do { \ unsigned gx; \ unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ col_sums_gx2[(_col)] += gx * (double)gx; \ col_sums_gy2[(_col)] += gy * (double)gy; \ col_sums_gxgy[(_col)] += gx * (double)gy; \ } while (0) #define FS_COL_SUB(_col, _joffs, _ioffs) \ do { \ unsigned gx; \ unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ col_sums_gx2[(_col)] -= gx * (double)gx; \ col_sums_gy2[(_col)] -= gy * (double)gy; \ col_sums_gxgy[(_col)] -= gx * (double)gy; \ } while (0) #define FS_COL_COPY(_col1, _col2) \ do { \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ col_sums_gy2[(_col1)] = 
col_sums_gy2[(_col2)]; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ } while (0) #define FS_COL_HALVE(_col1, _col2) \ do { \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ } while (0) #define FS_COL_DOUBLE(_col1, _col2) \ do { \ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ } while (0) static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { uint32_t *im1; uint32_t *im2; unsigned *gx_buf; unsigned *gy_buf; double *ssim; double col_sums_gx2[8]; double col_sums_gy2[8]; double col_sums_gxgy[8]; double c2; int stride; int w; int h; int i; int j; double ssim_c2 = SSIM_C2; if (bit_depth == 10) ssim_c2 = SSIM_C2_10; if (bit_depth == 12) ssim_c2 = SSIM_C2_12; w = _ctx->level[_l].w; h = _ctx->level[_l].h; im1 = _ctx->level[_l].im1; im2 = _ctx->level[_l].im2; ssim = _ctx->level[_l].ssim; gx_buf = _ctx->col_buf; stride = w + 8; gy_buf = gx_buf + 8 * stride; memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; for (j = 0; j < h + 4; j++) { if (j < h - 1) { for (i = 0; i < w - 1; i++) { unsigned g1; unsigned g2; unsigned gx; unsigned gy; g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); gx_buf[(j & 7) * stride + i + 4] = gx; gy_buf[(j & 7) * stride + i + 4] = gy; } } else { memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); } if (j >= 4) { int k; col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = col_sums_gxgy[0] = 0; for (i = 4; i < 8; i++) { FS_COL_SET(i, -1, 0); FS_COL_ADD(i, 0, 0); for (k = 1; k < 8 - i; k++) { FS_COL_DOUBLE(i, i); FS_COL_ADD(i, -k - 1, 0); FS_COL_ADD(i, k, 0); } } for (i = 0; i < w; i++) { double mugx2; double mugy2; double mugxgy; mugx2 = col_sums_gx2[0]; for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; mugy2 = col_sums_gy2[0]; for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; mugxgy = col_sums_gxgy[0]; for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); if (i + 1 < w) { FS_COL_SET(0, -1, 1); FS_COL_ADD(0, 0, 1); FS_COL_SUB(2, -3, 2); FS_COL_SUB(2, 2, 2); FS_COL_HALVE(1, 2); FS_COL_SUB(3, -4, 3); FS_COL_SUB(3, 3, 3); FS_COL_HALVE(2, 3); FS_COL_COPY(3, 4); FS_COL_DOUBLE(4, 5); FS_COL_ADD(4, -4, 5); FS_COL_ADD(4, 3, 5); FS_COL_DOUBLE(5, 6); FS_COL_ADD(5, -3, 6); FS_COL_ADD(5, 2, 6); FS_COL_DOUBLE(6, 7); FS_COL_ADD(6, -2, 7); FS_COL_ADD(6, 1, 7); FS_COL_SET(7, -1, 8); FS_COL_ADD(7, 0, 8); } } } } } #define FS_NLEVELS (4) /*These weights were derived from the default weights found in Wang's original Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. 
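(Wang's complete default set is {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}; the list above appears to omit the 0.3001 third-scale term, and the four renormalized values below correspond to dropping only the 0.0448 finest-scale weight.)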
We drop the finest scale and renormalize the rest to sum to 1.*/ static const double FS_WEIGHTS[FS_NLEVELS] = { 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 }; static double fs_average(fs_ctx *_ctx, int _l) { double *ssim; double ret; int w; int h; int i; int j; w = _ctx->level[_l].w; h = _ctx->level[_l].h; ssim = _ctx->level[_l].ssim; ret = 0; for (j = 0; j < h; j++) for (i = 0; i < w; i++) ret += ssim[j * w + i]; return pow(ret / (w * h), FS_WEIGHTS[_l]); } static double convert_ssim_db(double _ssim, double _weight) { assert(_weight >= _ssim); if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; return 10 * (log10(_weight) - log10(_weight - _ssim)); } static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, int _dystride, int _w, int _h, uint32_t _bd, uint32_t _shift, int buf_is_hbd) { fs_ctx ctx; double ret; int l; ret = 1; if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, buf_is_hbd); for (l = 0; l < FS_NLEVELS - 1; l++) { fs_calc_structure(&ctx, l, _bd); ret *= fs_average(&ctx, l); fs_downsample_level(&ctx, l + 1); } fs_calc_structure(&ctx, l, _bd); fs_apply_luminance(&ctx, l, _bd); ret *= fs_average(&ctx, l); fs_ctx_clear(&ctx); return ret; } double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, uint32_t bd, uint32_t in_bd) { double ssimv; uint32_t bd_shift = 0; assert(bd >= in_bd); assert(source->flags == dest->flags); int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; bd_shift = bd - in_bd; *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, dest->y_stride, source->y_crop_width, source->y_crop_height, in_bd, bd_shift, buf_is_hbd); *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, dest->uv_stride, source->uv_crop_width, source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, dest->uv_stride, source->uv_crop_width, source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); return convert_ssim_db(ssimv, 1.0); } aom-3.12.1/aom_dsp/fft.c000066400000000000000000000212461477627663500147530ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" #include "config/aom_dsp_rtcd.h" static inline void simple_transpose(const float *A, float *B, int n) { for (int y = 0; y < n; y++) { for (int x = 0; x < n; x++) { B[y * n + x] = A[x * n + y]; } } } // The 1d transform is real to complex and packs the complex results in // a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real // components, followed by the n/2 - 1 imaginary components). After the // transform is done on the rows, the first n/2 + 1 columns are real, and // the remaining are the imaginary components. 
After the transform on the // columns, the region of [0, n/2]x[0, n/2] contains the real part of // fft of the real columns. The real part of the 2d fft also includes the // imaginary part of transformed imaginary columns. This function assembles // the correct outputs while putting the real and imaginary components // next to each other. static inline void unpack_2d_output(const float *col_fft, float *output, int n) { for (int y = 0; y <= n / 2; ++y) { const int y2 = y + n / 2; const int y_extra = y2 > n / 2 && y2 < n; for (int x = 0; x <= n / 2; ++x) { const int x2 = x + n / 2; const int x_extra = x2 > n / 2 && x2 < n; output[2 * (y * n + x)] = col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + (x_extra ? col_fft[y * n + x2] : 0); if (y_extra) { output[2 * ((n - y) * n + x)] = col_fft[y * n + x] + (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); output[2 * ((n - y) * n + x) + 1] = -(y_extra ? col_fft[y2 * n + x] : 0) + (x_extra ? col_fft[y * n + x2] : 0); } } } } void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, aom_fft_unpack_func_t unpack, int vec_size) { for (int x = 0; x < n; x += vec_size) { tform(input + x, output + x, n); } transpose(output, temp, n); for (int x = 0; x < n; x += vec_size) { tform(temp + x, output + x, n); } transpose(output, temp, n); unpack(temp, output, n); } static inline void store_float(float *output, float input) { *output = input; } static inline float add_float(float a, float b) { return a + b; } static inline float sub_float(float a, float b) { return a - b; } static inline float mul_float(float a, float b) { return a * b; } GEN_FFT_2(void, float, float, float, *, store_float) GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, sub_float) GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) void aom_fft2x2_float_c(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, unpack_2d_output, 1); } void aom_fft4x4_float_c(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, unpack_2d_output, 1); } void aom_fft8x8_float_c(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, unpack_2d_output, 1); } void aom_fft16x16_float_c(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, unpack_2d_output, 1); } void aom_fft32x32_float_c(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, unpack_2d_output, 1); } void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, aom_fft_1d_func_t ifft_multi, aom_fft_transpose_func_t transpose, int vec_size) { // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft // and get real outputs. 
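// To do so, the packed spectra of those two columns are first copied side by
// side (reals in rows 0..n/2, imaginary parts in the remaining rows) so the
// strided inverse transform below can process both.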
for (int y = 0; y <= n / 2; ++y) { output[y * n] = input[2 * y * n]; output[y * n + 1] = input[2 * (y * n + n / 2)]; } for (int y = n / 2 + 1; y < n; ++y) { output[y * n] = input[2 * (y - n / 2) * n + 1]; output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; } for (int i = 0; i < 2; i += vec_size) { ifft_multi(output + i, temp + i, n); } // For the other columns, since we don't have a full ifft for complex inputs // we have to split them into the real and imaginary counterparts. // Pack the real component, then the imaginary components. for (int y = 0; y < n; ++y) { for (int x = 1; x < n / 2; ++x) { output[y * n + (x + 1)] = input[2 * (y * n + x)]; } for (int x = 1; x < n / 2; ++x) { output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; } } for (int y = 2; y < vec_size; y++) { fft_single(output + y, temp + y, n); } // This is the part that can be sped up with SIMD for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { fft_multi(output + y, temp + y, n); } // Put the 0 and n/2 th results in the correct place. for (int x = 0; x < n; ++x) { output[x] = temp[x * n]; output[(n / 2) * n + x] = temp[x * n + 1]; } // This rearranges and transposes. for (int y = 1; y < n / 2; ++y) { // Fill in the real columns for (int x = 0; x <= n / 2; ++x) { output[x + y * n] = temp[(y + 1) + x * n] + ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); } for (int x = n / 2 + 1; x < n; ++x) { output[x + y * n] = temp[(y + 1) + (n - x) * n] - temp[(y + n / 2) + ((n - x) + n / 2) * n]; } // Fill in the imag columns for (int x = 0; x <= n / 2; ++x) { output[x + (y + n / 2) * n] = temp[(y + n / 2) + x * n] - ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); } for (int x = n / 2 + 1; x < n; ++x) { output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + temp[(y + n / 2) + (n - x) * n]; } } for (int y = 0; y < n; y += vec_size) { ifft_multi(output + y, temp + y, n); } transpose(temp, output, n); } GEN_IFFT_2(static void, float, float, float, *, store_float) GEN_IFFT_4(static void, float, float, float, *, store_float, (float), add_float, sub_float) GEN_IFFT_8(static void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) GEN_IFFT_16(static void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) GEN_IFFT_32(static void, float, float, float, *, store_float, (float), add_float, sub_float, mul_float) void aom_ifft2x2_float_c(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float, aom_ifft1d_2_float, simple_transpose, 1); } void aom_ifft4x4_float_c(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float, aom_ifft1d_4_float, simple_transpose, 1); } void aom_ifft8x8_float_c(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float, aom_ifft1d_8_float, simple_transpose, 1); } void aom_ifft16x16_float_c(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1); } void aom_ifft32x32_float_c(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); } aom-3.12.1/aom_dsp/fft_common.h000066400000000000000000002304571477627663500163360ustar00rootroot00000000000000/* * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_FFT_COMMON_H_ #define AOM_AOM_DSP_FFT_COMMON_H_ #ifdef __cplusplus extern "C" { #endif /*!\brief A function pointer for computing 1d fft and ifft. * * The function will point to an implementation for a specific transform size, * and may perform the transforms using vectorized instructions. * * For a non-vectorized forward transforms of size n, the input and output * buffers will be size n. The output takes advantage of conjugate symmetry and * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where * (r_{j}, i_{j}) is the complex output for index j. * * An inverse transform will assume that the complex "input" is packed * similarly. Its output will be real. * * Non-vectorized transforms (e.g., on a single row) would use a stride = 1. * * Vectorized implementations are parallelized along the columns so that the fft * can be performed on multiple columns at a time. In such cases the data block * for input and output is typically square (n x n) and the stride will * correspond to the spacing between rows. At minimum, the input size must be * n x simd_vector_length. * * \param[in] input Input buffer. See above for size restrictions. * \param[out] output Output buffer. See above for size restrictions. * \param[in] stride The spacing in number of elements between rows * (or elements) */ typedef void (*aom_fft_1d_func_t)(const float *input, float *output, int stride); // Declare some of the forward non-vectorized transforms which are used in some // of the vectorized implementations void aom_fft1d_2_float(const float *input, float *output, int stride); void aom_fft1d_4_float(const float *input, float *output, int stride); void aom_fft1d_8_float(const float *input, float *output, int stride); void aom_fft1d_16_float(const float *input, float *output, int stride); void aom_fft1d_32_float(const float *input, float *output, int stride); /**\!brief Function pointer for transposing a matrix of floats. * * \param[in] input Input buffer (size n x n) * \param[out] output Output buffer (size n x n) * \param[in] n Extent of one dimension of the square matrix. */ typedef void (*aom_fft_transpose_func_t)(const float *input, float *output, int n); /**\!brief Function pointer for re-arranging intermediate 2d transform results. * * After re-arrangement, the real and imaginary components will be packed * tightly next to each other. * * \param[in] input Input buffer (size n x n) * \param[out] output Output buffer (size 2 x n x n) * \param[in] n Extent of one dimension of the square matrix. */ typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n); /*!\brief Performs a 2d fft with the given functions. * * This generator function allows for multiple different implementations of 2d * fft with different vector operations, without having to redefine the main * body multiple times. 
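 * Internally the data is transformed one dimension at a time: a pass of the 1d
 * transform, a transpose, a second 1d pass, a second transpose, and finally the
 * unpack step, which rearranges the packed 1d outputs into interleaved
 * real/imaginary pairs.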
* * \param[in] input Input buffer to run the transform on (size n x n) * \param[out] temp Working buffer for computing the transform (size n x n) * \param[out] output Output buffer (size 2 x n x n) * \param[in] tform Forward transform function * \param[in] transpose Transpose function (for n x n matrix) * \param[in] unpack Unpack function used to massage outputs to correct form * \param[in] vec_size Vector size (the transform is done vec_size units at * a time) */ void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, aom_fft_unpack_func_t unpack, int vec_size); /*!\brief Perform a 2d inverse fft with the given helper functions * * \param[in] input Input buffer to run the transform on (size 2 x n x n) * \param[out] temp Working buffer for computations (size 2 x n x n) * \param[out] output Output buffer (size n x n) * \param[in] fft_single Forward transform function (non vectorized) * \param[in] fft_multi Forward transform function (vectorized) * \param[in] ifft_multi Inverse transform function (vectorized) * \param[in] transpose Transpose function (for n x n matrix) * \param[in] vec_size Vector size (the transform is done vec_size * units at a time) */ void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, aom_fft_1d_func_t ifft_multi, aom_fft_transpose_func_t transpose, int vec_size); #ifdef __cplusplus } #endif // The macros below define 1D fft/ifft for different data types and for // different simd vector intrinsic types. #define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ store(output + 0 * stride, i0 + i1); \ store(output + 1 * stride, i0 - i1); \ } #define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC w0 = add(i0, i2); \ const T_VEC w1 = sub(i0, i2); \ const T_VEC w2 = add(i1, i3); \ const T_VEC w3 = sub(i1, i3); \ store(output + 0 * stride, add(w0, w2)); \ store(output + 1 * stride, w1); \ store(output + 2 * stride, sub(w0, w2)); \ store(output + 3 * stride, sub(kWeight0, w3)); \ } #define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC w0 = add(i0, i4); \ const T_VEC w1 = sub(i0, i4); \ const T_VEC w2 = add(i2, i6); \ const T_VEC w3 = sub(i2, i6); \ const T_VEC w4 = add(w0, w2); \ const T_VEC w5 = sub(w0, w2); \ const T_VEC w7 = add(i1, i5); \ const T_VEC w8 = sub(i1, i5); \ const T_VEC w9 = add(i3, i7); \ const T_VEC w10 = sub(i3, i7); \ const T_VEC w11 = add(w7, w9); \ const T_VEC w12 = sub(w7, w9); \ 
store(output + 0 * stride, add(w4, w11)); \ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \ store(output + 2 * stride, w5); \ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \ store(output + 4 * stride, sub(w4, w11)); \ store(output + 5 * stride, \ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \ store(output + 6 * stride, sub(kWeight0, w12)); \ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \ } #define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ mul) \ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC kWeight3 = constant(0.92388f); \ const T_VEC kWeight4 = constant(0.382683f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC i8 = load(input + 8 * stride); \ const T_VEC i9 = load(input + 9 * stride); \ const T_VEC i10 = load(input + 10 * stride); \ const T_VEC i11 = load(input + 11 * stride); \ const T_VEC i12 = load(input + 12 * stride); \ const T_VEC i13 = load(input + 13 * stride); \ const T_VEC i14 = load(input + 14 * stride); \ const T_VEC i15 = load(input + 15 * stride); \ const T_VEC w0 = add(i0, i8); \ const T_VEC w1 = sub(i0, i8); \ const T_VEC w2 = add(i4, i12); \ const T_VEC w3 = sub(i4, i12); \ const T_VEC w4 = add(w0, w2); \ const T_VEC w5 = sub(w0, w2); \ const T_VEC w7 = add(i2, i10); \ const T_VEC w8 = sub(i2, i10); \ const T_VEC w9 = add(i6, i14); \ const T_VEC w10 = sub(i6, i14); \ const T_VEC w11 = add(w7, w9); \ const T_VEC w12 = sub(w7, w9); \ const T_VEC w14 = add(w4, w11); \ const T_VEC w15 = sub(w4, w11); \ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ sub(sub(kWeight0, w3), \ mul(kWeight2, add(w10, w8))) }; \ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ sub(w3, mul(kWeight2, add(w10, w8))) }; \ const T_VEC w19 = add(i1, i9); \ const T_VEC w20 = sub(i1, i9); \ const T_VEC w21 = add(i5, i13); \ const T_VEC w22 = sub(i5, i13); \ const T_VEC w23 = add(w19, w21); \ const T_VEC w24 = sub(w19, w21); \ const T_VEC w26 = add(i3, i11); \ const T_VEC w27 = sub(i3, i11); \ const T_VEC w28 = add(i7, i15); \ const T_VEC w29 = sub(i7, i15); \ const T_VEC w30 = add(w26, w28); \ const T_VEC w31 = sub(w26, w28); \ const T_VEC w33 = add(w23, w30); \ const T_VEC w34 = sub(w23, w30); \ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ sub(sub(kWeight0, w22), \ mul(kWeight2, add(w29, w27))) }; \ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ sub(w22, mul(kWeight2, add(w29, w27))) }; \ store(output + 0 * stride, add(w14, w33)); \ store(output + 1 * stride, \ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \ store(output + 3 * stride, \ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \ store(output + 4 * stride, w15); \ store(output + 5 * stride, \ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \ mul(kWeight3, w37[1])))); \ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \ store(output + 7 * stride, \ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \ mul(kWeight4, 
w35[1])))); \ store(output + 8 * stride, sub(w14, w33)); \ store(output + 9 * stride, \ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \ store(output + 10 * stride, \ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \ store(output + 11 * stride, \ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \ store(output + 12 * stride, sub(kWeight0, w34)); \ store(output + 13 * stride, \ sub(sub(kWeight0, w18[1]), \ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \ store(output + 15 * stride, \ sub(sub(kWeight0, w16[1]), \ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \ } #define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ mul) \ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC kWeight3 = constant(0.92388f); \ const T_VEC kWeight4 = constant(0.382683f); \ const T_VEC kWeight5 = constant(0.980785f); \ const T_VEC kWeight6 = constant(0.19509f); \ const T_VEC kWeight7 = constant(0.83147f); \ const T_VEC kWeight8 = constant(0.55557f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC i8 = load(input + 8 * stride); \ const T_VEC i9 = load(input + 9 * stride); \ const T_VEC i10 = load(input + 10 * stride); \ const T_VEC i11 = load(input + 11 * stride); \ const T_VEC i12 = load(input + 12 * stride); \ const T_VEC i13 = load(input + 13 * stride); \ const T_VEC i14 = load(input + 14 * stride); \ const T_VEC i15 = load(input + 15 * stride); \ const T_VEC i16 = load(input + 16 * stride); \ const T_VEC i17 = load(input + 17 * stride); \ const T_VEC i18 = load(input + 18 * stride); \ const T_VEC i19 = load(input + 19 * stride); \ const T_VEC i20 = load(input + 20 * stride); \ const T_VEC i21 = load(input + 21 * stride); \ const T_VEC i22 = load(input + 22 * stride); \ const T_VEC i23 = load(input + 23 * stride); \ const T_VEC i24 = load(input + 24 * stride); \ const T_VEC i25 = load(input + 25 * stride); \ const T_VEC i26 = load(input + 26 * stride); \ const T_VEC i27 = load(input + 27 * stride); \ const T_VEC i28 = load(input + 28 * stride); \ const T_VEC i29 = load(input + 29 * stride); \ const T_VEC i30 = load(input + 30 * stride); \ const T_VEC i31 = load(input + 31 * stride); \ const T_VEC w0 = add(i0, i16); \ const T_VEC w1 = sub(i0, i16); \ const T_VEC w2 = add(i8, i24); \ const T_VEC w3 = sub(i8, i24); \ const T_VEC w4 = add(w0, w2); \ const T_VEC w5 = sub(w0, w2); \ const T_VEC w7 = add(i4, i20); \ const T_VEC w8 = sub(i4, i20); \ const T_VEC w9 = add(i12, i28); \ const T_VEC w10 = sub(i12, i28); \ const T_VEC w11 = add(w7, w9); \ const T_VEC w12 = sub(w7, w9); \ const T_VEC w14 = add(w4, w11); \ const T_VEC w15 = sub(w4, w11); \ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ sub(sub(kWeight0, w3), \ mul(kWeight2, add(w10, w8))) }; \ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ sub(w3, mul(kWeight2, add(w10, w8))) }; \ const T_VEC w19 = add(i2, i18); \ const T_VEC w20 = sub(i2, i18); \ const T_VEC w21 = add(i10, i26); \ const T_VEC w22 = sub(i10, i26); \ const T_VEC w23 = add(w19, w21); \ 
const T_VEC w24 = sub(w19, w21); \ const T_VEC w26 = add(i6, i22); \ const T_VEC w27 = sub(i6, i22); \ const T_VEC w28 = add(i14, i30); \ const T_VEC w29 = sub(i14, i30); \ const T_VEC w30 = add(w26, w28); \ const T_VEC w31 = sub(w26, w28); \ const T_VEC w33 = add(w23, w30); \ const T_VEC w34 = sub(w23, w30); \ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ sub(sub(kWeight0, w22), \ mul(kWeight2, add(w29, w27))) }; \ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ sub(w22, mul(kWeight2, add(w29, w27))) }; \ const T_VEC w38 = add(w14, w33); \ const T_VEC w39 = sub(w14, w33); \ const T_VEC w40[2] = { \ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ }; \ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ sub(sub(kWeight0, w12), \ mul(kWeight2, add(w31, w24))) }; \ const T_VEC w42[2] = { \ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ }; \ const T_VEC w44[2] = { \ add(w18[0], \ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ sub(sub(kWeight0, w18[1]), \ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ }; \ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ sub(w12, mul(kWeight2, add(w31, w24))) }; \ const T_VEC w46[2] = { \ add(w16[0], \ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ sub(sub(kWeight0, w16[1]), \ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ }; \ const T_VEC w47 = add(i1, i17); \ const T_VEC w48 = sub(i1, i17); \ const T_VEC w49 = add(i9, i25); \ const T_VEC w50 = sub(i9, i25); \ const T_VEC w51 = add(w47, w49); \ const T_VEC w52 = sub(w47, w49); \ const T_VEC w54 = add(i5, i21); \ const T_VEC w55 = sub(i5, i21); \ const T_VEC w56 = add(i13, i29); \ const T_VEC w57 = sub(i13, i29); \ const T_VEC w58 = add(w54, w56); \ const T_VEC w59 = sub(w54, w56); \ const T_VEC w61 = add(w51, w58); \ const T_VEC w62 = sub(w51, w58); \ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ sub(sub(kWeight0, w50), \ mul(kWeight2, add(w57, w55))) }; \ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ sub(w50, mul(kWeight2, add(w57, w55))) }; \ const T_VEC w66 = add(i3, i19); \ const T_VEC w67 = sub(i3, i19); \ const T_VEC w68 = add(i11, i27); \ const T_VEC w69 = sub(i11, i27); \ const T_VEC w70 = add(w66, w68); \ const T_VEC w71 = sub(w66, w68); \ const T_VEC w73 = add(i7, i23); \ const T_VEC w74 = sub(i7, i23); \ const T_VEC w75 = add(i15, i31); \ const T_VEC w76 = sub(i15, i31); \ const T_VEC w77 = add(w73, w75); \ const T_VEC w78 = sub(w73, w75); \ const T_VEC w80 = add(w70, w77); \ const T_VEC w81 = sub(w70, w77); \ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ sub(sub(kWeight0, w69), \ mul(kWeight2, add(w76, w74))) }; \ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ sub(w69, mul(kWeight2, add(w76, w74))) }; \ const T_VEC w85 = add(w61, w80); \ const T_VEC w86 = sub(w61, w80); \ const T_VEC w87[2] = { \ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ }; \ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ sub(sub(kWeight0, w59), \ mul(kWeight2, add(w78, w71))) }; \ const T_VEC w89[2] = { \ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ }; \ const T_VEC w91[2] = { \ 
add(w65[0], \ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ sub(sub(kWeight0, w65[1]), \ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ }; \ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ sub(w59, mul(kWeight2, add(w78, w71))) }; \ const T_VEC w93[2] = { \ add(w63[0], \ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ sub(sub(kWeight0, w63[1]), \ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ }; \ store(output + 0 * stride, add(w38, w85)); \ store(output + 1 * stride, \ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ store(output + 2 * stride, \ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ store(output + 3 * stride, \ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ store(output + 5 * stride, \ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \ store(output + 6 * stride, \ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ store(output + 7 * stride, \ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ store(output + 8 * stride, w39); \ store(output + 9 * stride, \ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ mul(kWeight5, w93[1])))); \ store(output + 10 * stride, \ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ mul(kWeight3, w92[1])))); \ store(output + 11 * stride, \ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ mul(kWeight7, w91[1])))); \ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ store(output + 13 * stride, \ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ mul(kWeight8, w89[1])))); \ store(output + 14 * stride, \ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ mul(kWeight4, w88[1])))); \ store(output + 15 * stride, \ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ mul(kWeight6, w87[1])))); \ store(output + 16 * stride, sub(w38, w85)); \ store(output + 17 * stride, \ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ store(output + 18 * stride, \ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ store(output + 19 * stride, \ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ store(output + 20 * stride, \ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ store(output + 21 * stride, \ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ store(output + 22 * stride, \ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ store(output + 23 * stride, \ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \ store(output + 24 * stride, sub(kWeight0, w86)); \ store(output + 25 * stride, \ sub(sub(kWeight0, w46[1]), \ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \ store(output + 26 * stride, \ sub(sub(kWeight0, w45[1]), \ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \ store(output + 27 * stride, \ sub(sub(kWeight0, w44[1]), \ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \ store(output + 29 * stride, \ sub(sub(kWeight0, w42[1]), \ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \ store(output + 30 * stride, \ sub(sub(kWeight0, w41[1]), \ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \ store(output + 31 * stride, \ sub(sub(kWeight0, w40[1]), \ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \ } #define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \ ret 
aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ store(output + 0 * stride, i0 + i1); \ store(output + 1 * stride, i0 - i1); \ } #define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC w2 = add(i0, i2); \ const T_VEC w3 = sub(i0, i2); \ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \ store(output + 0 * stride, add(w2, w4[0])); \ store(output + 1 * stride, add(w3, w5[1])); \ store(output + 2 * stride, sub(w2, w4[0])); \ store(output + 3 * stride, sub(w3, w5[1])); \ } #define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ mul) \ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC w6 = add(i0, i4); \ const T_VEC w7 = sub(i0, i4); \ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ store(output + 0 * stride, add(w10[0], w18[0])); \ store(output + 1 * stride, \ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \ store(output + 2 * stride, add(w11[0], w19[1])); \ store(output + 3 * stride, \ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ store(output + 4 * stride, sub(w10[0], w18[0])); \ store(output + 5 * stride, \ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ mul(kWeight2, w20[1])))); \ store(output + 6 * stride, sub(w11[0], w19[1])); \ store(output + 7 * stride, \ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ } #define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ mul) \ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC kWeight3 = constant(0.92388f); \ const T_VEC kWeight4 = constant(0.382683f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = 
load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC i8 = load(input + 8 * stride); \ const T_VEC i9 = load(input + 9 * stride); \ const T_VEC i10 = load(input + 10 * stride); \ const T_VEC i11 = load(input + 11 * stride); \ const T_VEC i12 = load(input + 12 * stride); \ const T_VEC i13 = load(input + 13 * stride); \ const T_VEC i14 = load(input + 14 * stride); \ const T_VEC i15 = load(input + 15 * stride); \ const T_VEC w14 = add(i0, i8); \ const T_VEC w15 = sub(i0, i8); \ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ const T_VEC w33[2] = { add(w20[0], \ sub(sub(kWeight0, mul(kWeight2, w28[0])), \ mul(kWeight2, w28[1]))), \ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ const T_VEC 
w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ const T_VEC w57[2] = { add(w44[0], \ sub(sub(kWeight0, mul(kWeight2, w52[0])), \ mul(kWeight2, w52[1]))), \ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ store(output + 0 * stride, add(w30[0], w54[0])); \ store(output + 1 * stride, \ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ store(output + 2 * stride, \ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ store(output + 3 * stride, \ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ store(output + 4 * stride, add(w31[0], w55[1])); \ store(output + 5 * stride, \ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ store(output + 6 * stride, \ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ store(output + 7 * stride, \ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ store(output + 8 * stride, sub(w30[0], w54[0])); \ store(output + 9 * stride, \ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ mul(kWeight4, w56[1])))); \ store(output + 10 * stride, \ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ mul(kWeight2, w58[1])))); \ store(output + 11 * stride, \ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ mul(kWeight3, w60[1])))); \ store(output + 12 * stride, sub(w31[0], w55[1])); \ store(output + 13 * stride, \ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ store(output + 14 * stride, \ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ store(output + 15 * stride, \ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ } #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ mul) \ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ const T_VEC kWeight0 = constant(0.0f); \ const T_VEC kWeight2 = constant(0.707107f); \ const T_VEC kWeight3 = constant(0.92388f); \ const T_VEC kWeight4 = constant(0.382683f); \ const T_VEC kWeight5 = constant(0.980785f); \ const T_VEC kWeight6 = constant(0.19509f); \ const T_VEC kWeight7 = constant(0.83147f); \ const T_VEC kWeight8 = constant(0.55557f); \ const T_VEC i0 = load(input + 0 * stride); \ const T_VEC i1 = load(input + 1 * stride); \ const T_VEC i2 = load(input + 2 * stride); \ const T_VEC i3 = load(input + 3 * stride); \ const T_VEC i4 = load(input + 4 * stride); \ const T_VEC i5 = load(input + 5 * stride); \ const T_VEC i6 = load(input + 6 * stride); \ const T_VEC i7 = load(input + 7 * stride); \ const T_VEC i8 = load(input + 8 * stride); \ const T_VEC i9 = load(input + 9 * stride); \ const T_VEC i10 = load(input + 10 * stride); \ const T_VEC i11 = load(input + 11 * stride); \ const T_VEC i12 = load(input + 12 * stride); \ const T_VEC i13 = load(input + 13 * stride); \ const T_VEC i14 = load(input + 14 * stride); \ const T_VEC i15 = load(input + 15 * stride); \ const T_VEC i16 = load(input + 16 * stride); \ const T_VEC i17 = load(input + 17 * stride); \ const T_VEC i18 = load(input + 18 * stride); \ const T_VEC i19 = load(input + 19 * stride); \ const T_VEC i20 = load(input + 20 * stride); \ const T_VEC i21 = load(input + 21 * 
stride); \ const T_VEC i22 = load(input + 22 * stride); \ const T_VEC i23 = load(input + 23 * stride); \ const T_VEC i24 = load(input + 24 * stride); \ const T_VEC i25 = load(input + 25 * stride); \ const T_VEC i26 = load(input + 26 * stride); \ const T_VEC i27 = load(input + 27 * stride); \ const T_VEC i28 = load(input + 28 * stride); \ const T_VEC i29 = load(input + 29 * stride); \ const T_VEC i30 = load(input + 30 * stride); \ const T_VEC i31 = load(input + 31 * stride); \ const T_VEC w30 = add(i0, i16); \ const T_VEC w31 = sub(i0, i16); \ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ const T_VEC w49[2] = { add(w36[0], \ sub(sub(kWeight0, mul(kWeight2, w44[0])), \ mul(kWeight2, w44[1]))), \ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ add(w60[1], 
mul(kWeight2, sub(w68[1], w68[0]))) }; \ const T_VEC w73[2] = { add(w60[0], \ sub(sub(kWeight0, mul(kWeight2, w68[0])), \ mul(kWeight2, w68[1]))), \ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ const T_VEC w80[2] = { \ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ }; \ const T_VEC w81[2] = { \ add(w48[0], \ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ }; \ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ const T_VEC w83[2] = { add(w50[0], \ sub(sub(kWeight0, mul(kWeight2, w74[0])), \ mul(kWeight2, w74[1]))), \ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ const T_VEC w84[2] = { \ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ }; \ const T_VEC w85[2] = { \ add(w52[0], \ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ }; \ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ const T_VEC w88[2] = { \ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ add(w49[1], \ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ }; \ const T_VEC w89[2] = { \ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ }; \ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ const T_VEC w92[2] = { \ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ add(w53[1], \ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ }; \ const T_VEC w93[2] = { \ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ }; \ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; 
\ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ const T_VEC w112[2] = { \ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ }; \ const T_VEC w113[2] = { \ add(w100[0], \ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ }; \ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ const T_VEC w116[2] = { \ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ }; \ const T_VEC w117[2] = { \ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ }; \ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ const T_VEC w136[2] = { \ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ }; \ const T_VEC w137[2] = { \ add(w124[0], \ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ }; \ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ const T_VEC w140[2] = { \ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ }; \ const T_VEC w141[2] = { \ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ }; \ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ const T_VEC w144[2] = { \ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ }; \ const T_VEC w145[2] = { \ add(w112[0], \ sub(sub(kWeight0, mul(kWeight3, 
w136[0])), mul(kWeight4, w136[1]))), \ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ }; \ const T_VEC w146[2] = { \ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ }; \ const T_VEC w147[2] = { \ add(w114[0], \ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ }; \ const T_VEC w148[2] = { \ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ }; \ const T_VEC w149[2] = { \ add(w116[0], \ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ }; \ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ const T_VEC w152[2] = { \ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ add(w113[1], \ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ }; \ const T_VEC w153[2] = { \ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ }; \ const T_VEC w154[2] = { \ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ }; \ const T_VEC w155[2] = { \ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ }; \ const T_VEC w156[2] = { \ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ add(w117[1], \ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ }; \ const T_VEC w157[2] = { \ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ }; \ store(output + 0 * stride, add(w78[0], w142[0])); \ store(output + 1 * stride, \ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ store(output + 2 * stride, \ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ store(output + 3 * stride, \ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ store(output + 4 * stride, \ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ store(output + 5 * stride, \ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ store(output + 6 * stride, \ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ store(output + 7 * stride, \ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ store(output + 8 * stride, add(w79[0], w143[1])); \ store(output + 9 * stride, \ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ store(output + 10 * stride, \ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ store(output + 11 * stride, \ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ store(output + 12 * stride, \ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ store(output + 13 * stride, \ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ store(output + 14 * stride, \ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ store(output + 15 * stride, \ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ store(output + 16 * stride, sub(w78[0], w142[0])); \ store(output + 17 * stride, \ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ mul(kWeight6, w144[1])))); \ store(output + 18 * stride, \ 
add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ mul(kWeight4, w146[1])))); \ store(output + 19 * stride, \ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ mul(kWeight8, w148[1])))); \ store(output + 20 * stride, \ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ mul(kWeight2, w150[1])))); \ store(output + 21 * stride, \ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ mul(kWeight7, w152[1])))); \ store(output + 22 * stride, \ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ mul(kWeight3, w154[1])))); \ store(output + 23 * stride, \ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ mul(kWeight5, w156[1])))); \ store(output + 24 * stride, sub(w79[0], w143[1])); \ store(output + 25 * stride, \ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ store(output + 26 * stride, \ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ store(output + 27 * stride, \ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ store(output + 28 * stride, \ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ store(output + 29 * stride, \ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ store(output + 30 * stride, \ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ store(output + 31 * stride, \ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ } #endif // AOM_AOM_DSP_FFT_COMMON_H_ aom-3.12.1/aom_dsp/flow_estimation/000077500000000000000000000000001477627663500172265ustar00rootroot00000000000000aom-3.12.1/aom_dsp/flow_estimation/arm/000077500000000000000000000000001477627663500200055ustar00rootroot00000000000000aom-3.12.1/aom_dsp/flow_estimation/arm/disflow_neon.c000066400000000000000000000261471477627663500226510ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/flow_estimation/disflow.h" #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. static inline void compute_flow_error(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, int16_t *dt) { // Split offset into integer and fractional parts, and compute cubic // interpolation kernels const int u_int = (int)floor(u); const int v_int = (int)floor(v); const double u_frac = u - floor(u); const double v_frac = v - floor(v); int h_kernel[4]; int v_kernel[4]; get_cubic_kernel_int(u_frac, h_kernel); get_cubic_kernel_int(v_frac, v_kernel); int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; // Clamp coordinates so that all pixels we fetch will remain within the // allocated border region, but allow them to go far enough out that // the border pixels' values do not change. 
// Since we are calculating an 8x8 block, the bottom-right pixel // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic // interpolation has 4 taps, meaning that the output of pixel // (x_w, y_w) depends on the pixels in the range // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). // // Thus the most extreme coordinates which will be fetched are // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). const int x0 = clamp(x + u_int, -9, width); const int y0 = clamp(y + v_int, -9, height); // Horizontal convolution. const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel)); for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { uint8x16_t r = vld1q_u8(ref_start + i * stride); uint16x8_t r0 = vmovl_u8(vget_low_u8(r)); uint16x8_t r1 = vmovl_u8(vget_high_u8(r)); int16x8_t s0 = vreinterpretq_s16_u16(r0); int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1)); int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2)); int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3)); int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3); int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3); // 6 is the maximum allowable number of extra bits which will avoid // the intermediate values overflowing an int16_t. The most extreme // intermediate value occurs when: // * The input pixels are [0, 255, 255, 0] // * u_frac = 0.5 // In this case, the un-scaled output is 255 * 1.125 = 286.875. // As an integer with 6 fractional bits, that is 18360, which fits // in an int16_t. But with 7 fractional bits it would be 36720, // which is too large. int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6), vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6)); vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum); } // Vertical convolution. int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); uint8x8_t s = vld1_u8(src + (i + y) * stride + x); int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); // This time, we have to round off the 6 extra bits which were kept // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits // of precision to match the scale of the dx and dy arrays. 
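    // Note: the source pixels above are widened with a 3-bit left shift
    // (vshll_n_u8(s, 3)), so the subtraction below is only consistent if
    // DISFLOW_DERIV_SCALE_LOG2 == 3. As a worked example, if
    // DISFLOW_INTERP_BITS were 14, the rounding shift applied here would be
    // 14 + 6 - 3 = 17 bits.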
sum_lo = vrshrq_n_s32(sum_lo, DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); sum_hi = vrshrq_n_s32(sum_hi, DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); } } // Computes the components of the system of equations used to solve for // a flow vector. // // The flow equations are a least-squares system, derived as follows: // // For each pixel in the patch, we calculate the current error `dt`, // and the x and y gradients `dx` and `dy` of the source patch. // This means that, to first order, the squared error for this pixel is // // (dt + u * dx + v * dy)^2 // // where (u, v) are the incremental changes to the flow vector. // // We then want to find the values of u and v which minimize the sum // of the squared error across all pixels. Conveniently, this fits exactly // into the form of a least squares problem, with one equation // // u * dx + v * dy = -dt // // for each pixel. // // Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, // and absorbing the - sign elsewhere, this results in the least squares system // // M = |sum(dx * dx) sum(dx * dy)| // |sum(dx * dy) sum(dy * dy)| // // b = |sum(dx * dt)| // |sum(dy * dt)| static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M_inv) { int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { int16x8_t x = vld1q_s16(dx + i * dx_stride); int16x8_t y = vld1q_s16(dy + i * dy_stride); sum[0] = vmlal_s16(sum[0], vget_low_s16(x), vget_low_s16(x)); sum[0] = vmlal_s16(sum[0], vget_high_s16(x), vget_high_s16(x)); sum[1] = vmlal_s16(sum[1], vget_low_s16(x), vget_low_s16(y)); sum[1] = vmlal_s16(sum[1], vget_high_s16(x), vget_high_s16(y)); sum[3] = vmlal_s16(sum[3], vget_low_s16(y), vget_low_s16(y)); sum[3] = vmlal_s16(sum[3], vget_high_s16(y), vget_high_s16(y)); } sum[2] = sum[1]; int32x4_t res = horizontal_add_4d_s32x4(sum); // Apply regularization // We follow the standard regularization method of adding `k * I` before // inverting. This ensures that the matrix will be invertible. // // Setting the regularization strength k to 1 seems to work well here, as // typical values coming from the other equations are very large (1e5 to // 1e6, with an upper limit of around 6e7, at the time of writing). // It also preserves the property that all matrix values are whole numbers, // which is convenient for integerized SIMD implementation. double M0 = (double)vgetq_lane_s32(res, 0) + 1; double M1 = (double)vgetq_lane_s32(res, 1); double M2 = (double)vgetq_lane_s32(res, 2); double M3 = (double)vgetq_lane_s32(res, 3) + 1; // Invert matrix M. 
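  // For the 2x2 matrix M = | M0 M1 |, the closed-form inverse is
  //                        | M2 M3 |
  //
  //   M^-1 = (1 / det) * |  M3 -M1 |  with det = M0 * M3 - M1 * M2,
  //                      | -M2  M0 |
  //
  // which is what the code below computes.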
double det = (M0 * M3) - (M1 * M2); assert(det >= 1); const double det_inv = 1 / det; M_inv[0] = M3 * det_inv; M_inv[1] = -M1 * det_inv; M_inv[2] = -M2 * det_inv; M_inv[3] = M0 * det_inv; } static inline void compute_flow_vector(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, const int16_t *dt, int dt_stride, int *b) { int32x4_t b_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); b_s32[0] = vmlal_s16(b_s32[0], vget_low_s16(dx16), vget_low_s16(dt16)); b_s32[0] = vmlal_s16(b_s32[0], vget_high_s16(dx16), vget_high_s16(dt16)); b_s32[1] = vmlal_s16(b_s32[1], vget_low_s16(dy16), vget_low_s16(dt16)); b_s32[1] = vmlal_s16(b_s32[1], vget_high_s16(dy16), vget_high_s16(dt16)); } int32x4_t b_red = horizontal_add_2d_s32(b_s32[0], b_s32[1]); vst1_s32(b, add_pairwise_s32x4(b_red)); } void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { double M_inv[4]; int b[2]; int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt, DISFLOW_PATCH_SIZE, b); // Solve flow equations to find a better estimate for the flow vector // at this point const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { // Stop iteration when we're close to convergence break; } } } aom-3.12.1/aom_dsp/flow_estimation/arm/disflow_neon.h000066400000000000000000000111541477627663500226460ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ #include "aom_dsp/flow_estimation/disflow.h" #include #include #include "aom_dsp/arm/mem_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static inline void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. 
This is fine, // and we still interpolate correctly if we allow x = 1. assert(0 <= x && x <= 1); double x2 = x * x; double x3 = x2 * x; kernel[0] = -0.5 * x + x2 - 0.5 * x3; kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; kernel[3] = -0.5 * x2 + 0.5 * x3; } static inline void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); } static inline void sobel_filter_x(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride) { int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; // Horizontal filter, using kernel {1, 0, -1}. const uint8_t *src_start = src - 1 * src_stride - 1; for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { uint8x16_t s = vld1q_u8(src_start + i * src_stride); uint8x8_t s0 = vget_low_u8(s); uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); // Given that the kernel is {1, 0, -1} the convolution is a simple // subtraction. int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); } // Vertical filter, using kernel {1, 2, 1}. // This kernel can be split into two 2-taps kernels of value {1, 1}. // That way we need only 3 add operations to perform the convolution, one of // which can be reused for the next line. int16x8_t s0 = vld1q_s16(tmp); int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); int16x8_t sum01 = vaddq_s16(s0, s1); for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); int16x8_t sum12 = vaddq_s16(s1, s2); int16x8_t sum = vaddq_s16(sum01, sum12); vst1q_s16(dst + i * dst_stride, sum); sum01 = sum12; s1 = s2; } } static inline void sobel_filter_y(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride) { int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; // Horizontal filter, using kernel {1, 2, 1}. // This kernel can be split into two 2-taps kernels of value {1, 1}. // That way we need only 3 add operations to perform the convolution. const uint8_t *src_start = src - 1 * src_stride - 1; for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { uint8x16_t s = vld1q_u8(src_start + i * src_stride); uint8x8_t s0 = vget_low_u8(s); uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); uint16x8_t sum01 = vaddl_u8(s0, s1); uint16x8_t sum12 = vaddl_u8(s1, s2); uint16x8_t sum = vaddq_u16(sum01, sum12); vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); } // Vertical filter, using kernel {1, 0, -1}. // Load the whole block at once to avoid redundant loads during convolution. int16x8_t t[10]; load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], &t[5], &t[6], &t[7], &t[8], &t[9]); for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { // Given that the kernel is {1, 0, -1} the convolution is a simple // subtraction. int16x8_t diff = vsubq_s16(t[i], t[i + 2]); vst1q_s16(dst + i * dst_stride, diff); } } #endif // AOM_AOM_DSP_FLOW_ESTIMATION_ARM_DISFLOW_NEON_H_ aom-3.12.1/aom_dsp/flow_estimation/arm/disflow_sve.c000066400000000000000000000260341477627663500225020ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/flow_estimation/disflow.h" #include #include #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/flow_estimation/arm/disflow_neon.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { 0, 2, 4, 6, 1, 3, 5, 7, }; // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. static inline void compute_flow_error(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, int16_t *dt) { // Split offset into integer and fractional parts, and compute cubic // interpolation kernels const int u_int = (int)floor(u); const int v_int = (int)floor(v); const double u_frac = u - floor(u); const double v_frac = v - floor(v); int h_kernel[4]; int v_kernel[4]; get_cubic_kernel_int(u_frac, h_kernel); get_cubic_kernel_int(v_frac, v_kernel); int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; // Clamp coordinates so that all pixels we fetch will remain within the // allocated border region, but allow them to go far enough out that // the border pixels' values do not change. // Since we are calculating an 8x8 block, the bottom-right pixel // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic // interpolation has 4 taps, meaning that the output of pixel // (x_w, y_w) depends on the pixels in the range // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). // // Thus the most extreme coordinates which will be fetched are // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). const int x0 = clamp(x + u_int, -9, width); const int y0 = clamp(y + v_int, -9, height); // Horizontal convolution. 
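  // The horizontal pass produces a full row of 8 outputs at a time: four
  // overlapping 8-pixel loads (at byte offsets 0..3) are each dotted against
  // the 4-tap kernel, with each 64-bit dot-product lane covering 4
  // consecutive pixels. The outputs therefore come back in the order
  // {0, 4, 1, 5, 2, 6, 3, 7}, and kDeinterleaveTbl is used to restore the
  // natural order before storing.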
const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); const int16x4_t h_kernel_s16 = vmovn_s32(vld1q_s32(h_kernel)); const int16x8_t h_filter = vcombine_s16(h_kernel_s16, vdup_n_s16(0)); const uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { svuint16_t r0 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 0); svuint16_t r1 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 1); svuint16_t r2 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 2); svuint16_t r3 = svld1ub_u16(svptrue_b16(), ref_start + i * stride + 3); int16x8_t s0 = vreinterpretq_s16_u16(svget_neonq_u16(r0)); int16x8_t s1 = vreinterpretq_s16_u16(svget_neonq_u16(r1)); int16x8_t s2 = vreinterpretq_s16_u16(svget_neonq_u16(r2)); int16x8_t s3 = vreinterpretq_s16_u16(svget_neonq_u16(r3)); int64x2_t sum04 = aom_svdot_lane_s16(vdupq_n_s64(0), s0, h_filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(vdupq_n_s64(0), s1, h_filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(vdupq_n_s64(0), s2, h_filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(vdupq_n_s64(0), s3, h_filter, 0); int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); // 6 is the maximum allowable number of extra bits which will avoid // the intermediate values overflowing an int16_t. The most extreme // intermediate value occurs when: // * The input pixels are [0, 255, 255, 0] // * u_frac = 0.5 // In this case, the un-scaled output is 255 * 1.125 = 286.875. // As an integer with 6 fractional bits, that is 18360, which fits // in an int16_t. But with 7 fractional bits it would be 36720, // which is too large. int16x8_t res = vcombine_s16(vrshrn_n_s32(res0, DISFLOW_INTERP_BITS - 6), vrshrn_n_s32(res1, DISFLOW_INTERP_BITS - 6)); res = aom_tbl_s16(res, idx); vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, res); } // Vertical convolution. int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); uint8x8_t s = vld1_u8(src + (i + y) * stride + x); int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); // This time, we have to round off the 6 extra bits which were kept // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits // of precision to match the scale of the dx and dy arrays. 
sum_lo = vrshrq_n_s32(sum_lo, DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); sum_hi = vrshrq_n_s32(sum_hi, DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); } } // Computes the components of the system of equations used to solve for // a flow vector. // // The flow equations are a least-squares system, derived as follows: // // For each pixel in the patch, we calculate the current error `dt`, // and the x and y gradients `dx` and `dy` of the source patch. // This means that, to first order, the squared error for this pixel is // // (dt + u * dx + v * dy)^2 // // where (u, v) are the incremental changes to the flow vector. // // We then want to find the values of u and v which minimize the sum // of the squared error across all pixels. Conveniently, this fits exactly // into the form of a least squares problem, with one equation // // u * dx + v * dy = -dt // // for each pixel. // // Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, // and absorbing the - sign elsewhere, this results in the least squares system // // M = |sum(dx * dx) sum(dx * dy)| // |sum(dx * dy) sum(dy * dy)| // // b = |sum(dx * dt)| // |sum(dy * dt)| static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M_inv) { int64x2_t sum[3] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { int16x8_t x = vld1q_s16(dx + i * dx_stride); int16x8_t y = vld1q_s16(dy + i * dy_stride); sum[0] = aom_sdotq_s16(sum[0], x, x); sum[1] = aom_sdotq_s16(sum[1], x, y); sum[2] = aom_sdotq_s16(sum[2], y, y); } sum[0] = vpaddq_s64(sum[0], sum[1]); sum[2] = vpaddq_s64(sum[1], sum[2]); int32x4_t res = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); // Apply regularization // We follow the standard regularization method of adding `k * I` before // inverting. This ensures that the matrix will be invertible. // // Setting the regularization strength k to 1 seems to work well here, as // typical values coming from the other equations are very large (1e5 to // 1e6, with an upper limit of around 6e7, at the time of writing). // It also preserves the property that all matrix values are whole numbers, // which is convenient for integerized SIMD implementation. double M0 = (double)vgetq_lane_s32(res, 0) + 1; double M1 = (double)vgetq_lane_s32(res, 1); double M2 = (double)vgetq_lane_s32(res, 2); double M3 = (double)vgetq_lane_s32(res, 3) + 1; // Invert matrix M. 
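  // Note: M is a sum of outer products (dx, dy)^T * (dx, dy) and is therefore
  // positive semidefinite; adding 1 to each diagonal entry makes all of its
  // eigenvalues at least 1, so det = M0 * M3 - M1 * M2 >= 1 and the assert
  // below always holds.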
double det = (M0 * M3) - (M1 * M2); assert(det >= 1); const double det_inv = 1 / det; M_inv[0] = M3 * det_inv; M_inv[1] = -M1 * det_inv; M_inv[2] = -M2 * det_inv; M_inv[3] = M0 * det_inv; } static inline void compute_flow_vector(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, const int16_t *dt, int dt_stride, int *b) { int64x2_t b_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); b_s64[0] = aom_sdotq_s16(b_s64[0], dx16, dt16); b_s64[1] = aom_sdotq_s16(b_s64[1], dy16, dt16); } b_s64[0] = vpaddq_s64(b_s64[0], b_s64[1]); vst1_s32(b, vmovn_s64(b_s64[0])); } void aom_compute_flow_at_point_sve(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { double M_inv[4]; int b[2]; int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt, DISFLOW_PATCH_SIZE, b); // Solve flow equations to find a better estimate for the flow vector // at this point const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { // Stop iteration when we're close to convergence break; } } } aom-3.12.1/aom_dsp/flow_estimation/corner_detect.c000066400000000000000000000131321477627663500222120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include #include "third_party/fastfeat/fast.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_mem/aom_mem.h" #include "aom_util/aom_pthread.h" #include "av1/common/common.h" #define FAST_BARRIER 18 size_t av1_get_corner_list_size(void) { return sizeof(CornerList); } CornerList *av1_alloc_corner_list(void) { CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners)); if (!corners) { return NULL; } corners->valid = false; #if CONFIG_MULTITHREAD pthread_mutex_init(&corners->mutex, NULL); #endif // CONFIG_MULTITHREAD return corners; } static bool compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, int downsample_level, CornerList *corners) { ImagePyramid *pyr = frame->y_pyramid; const int layers = aom_compute_pyramid(frame, bit_depth, downsample_level + 1, pyr); if (layers < 0) { return false; } // Clamp downsampling ratio base on max number of layers allowed // for this frame size downsample_level = layers - 1; const uint8_t *buf = pyr->layers[downsample_level].buffer; int width = pyr->layers[downsample_level].width; int height = pyr->layers[downsample_level].height; int stride = pyr->layers[downsample_level].stride; int *scores = NULL; int num_corners; xy *const frame_corners_xy = aom_fast9_detect_nonmax( buf, width, height, stride, FAST_BARRIER, &scores, &num_corners); if (num_corners < 0) return false; if (num_corners <= MAX_CORNERS) { // Use all detected corners for (int i = 0; i < num_corners; i++) { corners->corners[2 * i + 0] = frame_corners_xy[i].x * (1 << downsample_level); corners->corners[2 * i + 1] = frame_corners_xy[i].y * (1 << downsample_level); } corners->num_corners = num_corners; } else { // There are more than MAX_CORNERS corners avilable, so pick out a subset // of the sharpest corners, as these will be the most useful for flow // estimation int histogram[256]; av1_zero(histogram); for (int i = 0; i < num_corners; i++) { assert(FAST_BARRIER <= scores[i] && scores[i] <= 255); histogram[scores[i]] += 1; } int threshold = -1; int found_corners = 0; for (int bucket = 255; bucket >= 0; bucket--) { if (found_corners + histogram[bucket] > MAX_CORNERS) { // Set threshold here threshold = bucket; break; } found_corners += histogram[bucket]; } assert(threshold != -1 && "Failed to select a valid threshold"); int copied_corners = 0; for (int i = 0; i < num_corners; i++) { if (scores[i] > threshold) { assert(copied_corners < MAX_CORNERS); corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x * (1 << downsample_level); corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y * (1 << downsample_level); copied_corners += 1; } } assert(copied_corners == found_corners); corners->num_corners = copied_corners; } free(scores); free(frame_corners_xy); return true; } bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, int downsample_level, CornerList *corners) { assert(corners); #if CONFIG_MULTITHREAD pthread_mutex_lock(&corners->mutex); #endif // CONFIG_MULTITHREAD if (!corners->valid) { corners->valid = compute_corner_list(frame, bit_depth, downsample_level, corners); } bool valid = corners->valid; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&corners->mutex); #endif // CONFIG_MULTITHREAD return valid; } #ifndef NDEBUG // Check if a corner list has already been computed. 
// This is mostly a debug helper - as it is necessary to hold corners->mutex // while reading the valid flag, we cannot just write: // assert(corners->valid); // This function allows the check to be correctly written as: // assert(aom_is_corner_list_valid(corners)); bool aom_is_corner_list_valid(CornerList *corners) { assert(corners); // Per the comments in the CornerList struct, we must take this mutex // before reading or writing the "valid" flag, and hold it while computing // the pyramid, to ensure proper behaviour if multiple threads call this // function simultaneously #if CONFIG_MULTITHREAD pthread_mutex_lock(&corners->mutex); #endif // CONFIG_MULTITHREAD bool valid = corners->valid; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&corners->mutex); #endif // CONFIG_MULTITHREAD return valid; } #endif void av1_invalidate_corner_list(CornerList *corners) { if (corners) { #if CONFIG_MULTITHREAD pthread_mutex_lock(&corners->mutex); #endif // CONFIG_MULTITHREAD corners->valid = false; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&corners->mutex); #endif // CONFIG_MULTITHREAD } } void av1_free_corner_list(CornerList *corners) { if (corners) { #if CONFIG_MULTITHREAD pthread_mutex_destroy(&corners->mutex); #endif // CONFIG_MULTITHREAD aom_free(corners); } } aom-3.12.1/aom_dsp/flow_estimation/corner_detect.h000066400000000000000000000052371477627663500222260ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ #include #include #include #include #include "aom_dsp/pyramid.h" #include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { #endif #define MAX_CORNERS 4096 typedef struct corner_list { #if CONFIG_MULTITHREAD // Mutex which is used to prevent the corner list from being computed twice // at the same time // // Semantics: // * This mutex must be held whenever reading or writing the `valid` flag // // * This mutex must also be held while computing the image pyramid, // to ensure that only one thread may do so at a time. // // * However, once you have read the valid flag and seen a true value, // it is safe to drop the mutex and read from the remaining fields. // This is because, once the image pyramid is computed, its contents // will not be changed until the parent frame buffer is recycled, // which will not happen until there are no more outstanding references // to the frame buffer. pthread_mutex_t mutex; #endif // CONFIG_MULTITHREAD // Flag indicating whether the corner list contains valid data bool valid; // Number of corners found int num_corners; // (x, y) coordinates of each corner int corners[2 * MAX_CORNERS]; } CornerList; size_t av1_get_corner_list_size(void); CornerList *av1_alloc_corner_list(void); bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, int downsample_level, CornerList *corners); #ifndef NDEBUG // Check if a corner list has already been computed. 
// This is mostly a debug helper - as it is necessary to hold corners->mutex // while reading the valid flag, we cannot just write: // assert(corners->valid); // This function allows the check to be correctly written as: // assert(aom_is_corner_list_valid(corners)); bool aom_is_corner_list_valid(CornerList *corners); #endif void av1_invalidate_corner_list(CornerList *corners); void av1_free_corner_list(CornerList *corners); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ aom-3.12.1/aom_dsp/flow_estimation/corner_match.c000066400000000000000000000265161477627663500220500ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" #include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_scale/yv12config.h" #define THRESHOLD_NCC 0.75 /* Compute mean and standard deviation of pixels in a window of size MATCH_SZ by MATCH_SZ centered at (x, y). Store results into *mean and *one_over_stddev Note: The output of this function is scaled by MATCH_SZ, as in *mean = MATCH_SZ * and *one_over_stddev = 1 / (MATCH_SZ * ) Combined with the fact that we return 1/stddev rather than the standard deviation itself, this allows us to completely avoid divisions in aom_compute_correlation, which is much hotter than this function is. Returns true if this feature point is usable, false otherwise. */ bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev) { int sum = 0; int sumsq = 0; for (int i = 0; i < MATCH_SZ; ++i) { for (int j = 0; j < MATCH_SZ; ++j) { sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } } *mean = (double)sum / MATCH_SZ; const double variance = sumsq - (*mean) * (*mean); if (variance < MIN_FEATURE_VARIANCE) { *one_over_stddev = 0.0; return false; } *one_over_stddev = 1.0 / sqrt(variance); return true; } /* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. To save on computation, the mean and (1 divided by the) standard deviation of the window in each frame are precomputed and passed into this function as arguments. 
*/ double aom_compute_correlation_c(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2) { int v1, v2; int cross = 0; for (int i = 0; i < MATCH_SZ; ++i) { for (int j = 0; j < MATCH_SZ; ++j) { v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; cross += v1 * v2; } } // Note: In theory, the calculations here "should" be // covariance = cross / N^2 - mean1 * mean2 // correlation = covariance / (stddev1 * stddev2). // // However, because of the scaling in aom_compute_mean_stddev, the // lines below actually calculate // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) // // ie. we have removed the need for a division, and still end up with the // correct unscaled correlation (ie, in the range [-1, +1]) double covariance = cross - mean1 * mean2; double correlation = covariance * (one_over_stddev1 * one_over_stddev2); return correlation; } static int is_eligible_point(int pointx, int pointy, int width, int height) { return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); } static int is_eligible_distance(int point1x, int point1y, int point2x, int point2y, int width, int height) { const int thresh = (width < height ? height : width) >> 4; return ((point1x - point2x) * (point1x - point2x) + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; } typedef struct { int x; int y; double mean; double one_over_stddev; int best_match_idx; double best_match_corr; } PointInfo; static int determine_correspondence(const unsigned char *src, const int *src_corners, int num_src_corners, const unsigned char *ref, const int *ref_corners, int num_ref_corners, int width, int height, int src_stride, int ref_stride, Correspondence *correspondences) { PointInfo *src_point_info = NULL; PointInfo *ref_point_info = NULL; int num_correspondences = 0; src_point_info = (PointInfo *)aom_calloc(num_src_corners, sizeof(*src_point_info)); if (!src_point_info) { goto finished; } ref_point_info = (PointInfo *)aom_calloc(num_ref_corners, sizeof(*ref_point_info)); if (!ref_point_info) { goto finished; } // First pass (linear): // Filter corner lists and compute per-patch means and standard deviations, // for the src and ref frames independently int src_point_count = 0; for (int i = 0; i < num_src_corners; i++) { int src_x = src_corners[2 * i]; int src_y = src_corners[2 * i + 1]; if (!is_eligible_point(src_x, src_y, width, height)) continue; PointInfo *point = &src_point_info[src_point_count]; point->x = src_x; point->y = src_y; point->best_match_corr = THRESHOLD_NCC; if (!aom_compute_mean_stddev(src, src_stride, src_x, src_y, &point->mean, &point->one_over_stddev)) continue; src_point_count++; } if (src_point_count == 0) { goto finished; } int ref_point_count = 0; for (int j = 0; j < num_ref_corners; j++) { int ref_x = ref_corners[2 * j]; int ref_y = ref_corners[2 * j + 1]; if (!is_eligible_point(ref_x, ref_y, width, height)) continue; PointInfo *point = &ref_point_info[ref_point_count]; point->x = ref_x; point->y = ref_y; point->best_match_corr = THRESHOLD_NCC; if (!aom_compute_mean_stddev(ref, ref_stride, ref_x, ref_y, &point->mean, &point->one_over_stddev)) continue; ref_point_count++; } if (ref_point_count == 0) { goto finished; } // Second pass 
(quadratic): // For each pair of points, compute correlation, and use this to determine // the best match of each corner, in both directions for (int i = 0; i < src_point_count; ++i) { PointInfo *src_point = &src_point_info[i]; for (int j = 0; j < ref_point_count; ++j) { PointInfo *ref_point = &ref_point_info[j]; if (!is_eligible_distance(src_point->x, src_point->y, ref_point->x, ref_point->y, width, height)) continue; double corr = aom_compute_correlation( src, src_stride, src_point->x, src_point->y, src_point->mean, src_point->one_over_stddev, ref, ref_stride, ref_point->x, ref_point->y, ref_point->mean, ref_point->one_over_stddev); if (corr > src_point->best_match_corr) { src_point->best_match_idx = j; src_point->best_match_corr = corr; } if (corr > ref_point->best_match_corr) { ref_point->best_match_idx = i; ref_point->best_match_corr = corr; } } } // Third pass (linear): // Scan through source corners, generating a correspondence for each corner // iff ref_best_match[src_best_match[i]] == i // Then refine the generated correspondences using optical flow for (int i = 0; i < src_point_count; i++) { PointInfo *point = &src_point_info[i]; // Skip corners which were not matched, or which didn't find // a good enough match if (point->best_match_corr < THRESHOLD_NCC) continue; PointInfo *match_point = &ref_point_info[point->best_match_idx]; if (match_point->best_match_idx == i) { // Refine match using optical flow and store const int sx = point->x; const int sy = point->y; const int rx = match_point->x; const int ry = match_point->y; double u = (double)(rx - sx); double v = (double)(ry - sy); const int patch_tl_x = sx - DISFLOW_PATCH_CENTER; const int patch_tl_y = sy - DISFLOW_PATCH_CENTER; aom_compute_flow_at_point(src, ref, patch_tl_x, patch_tl_y, width, height, src_stride, &u, &v); Correspondence *correspondence = &correspondences[num_correspondences]; correspondence->x = (double)sx; correspondence->y = (double)sy; correspondence->rx = (double)sx + u; correspondence->ry = (double)sy + v; num_correspondences++; } } finished: aom_free(src_point_info); aom_free(ref_point_info); return num_correspondences; } bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { int num_correspondences; Correspondence *correspondences; ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; CornerList *ref_corners = ref->corners; // Precompute information we will need about each frame if (aom_compute_pyramid(src, bit_depth, 1, src_pyramid) < 0) { *mem_alloc_failed = true; return false; } if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } if (aom_compute_pyramid(ref, bit_depth, 1, ref_pyramid) < 0) { *mem_alloc_failed = true; return false; } if (!av1_compute_corner_list(ref, bit_depth, downsample_level, ref_corners)) { *mem_alloc_failed = true; return false; } const uint8_t *src_buffer = src_pyramid->layers[0].buffer; const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; const int src_stride = src_pyramid->layers[0].stride; const uint8_t *ref_buffer = ref_pyramid->layers[0].buffer; assert(ref_pyramid->layers[0].width == src_width); assert(ref_pyramid->layers[0].height == src_height); const int ref_stride = ref_pyramid->layers[0].stride; //
find correspondences between the two images correspondences = (Correspondence *)aom_malloc(src_corners->num_corners * sizeof(*correspondences)); if (!correspondences) { *mem_alloc_failed = true; return false; } num_correspondences = determine_correspondence( src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer, ref_corners->corners, ref_corners->num_corners, src_width, src_height, src_stride, ref_stride, correspondences); bool result = ransac(correspondences, num_correspondences, type, motion_models, num_motion_models, mem_alloc_failed); aom_free(correspondences); return result; } aom-3.12.1/aom_dsp/flow_estimation/corner_match.h000066400000000000000000000031431477627663500220440ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ #include #include #include #include #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus extern "C" { #endif #define MATCH_SZ 16 #define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) // Minimum threshold for the variance of a patch, in order for it to be // considered useful for matching. // This is evaluated against the scaled variance MATCH_SZ_SQ * sigma^2, // so a setting of 1 * MATCH_SZ_SQ corresponds to an unscaled variance of 1 #define MIN_FEATURE_VARIANCE (1 * MATCH_SZ_SQ) bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ aom-3.12.1/aom_dsp/flow_estimation/disflow.c000066400000000000000000000774441477627663500210610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Dense Inverse Search flow algorithm // Paper: https://arxiv.org/abs/1603.03590 #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "config/aom_dsp_rtcd.h" // Amount to downsample the flow field by. 
// e.g., DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate // one flow point for each 4x4 pixel region of the frame // Must be a power of 2 #define DOWNSAMPLE_SHIFT 3 #define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT) // Filters used when upscaling the flow field from one pyramid level // to another. See upscale_flow_component for details on kernel selection #define FLOW_UPSCALE_TAPS 4 // Number of outermost flow field entries (on each edge) which can't be // computed, because the patch they correspond to extends outside of the // frame // The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is // (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries #define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) // Number of extra padding entries on each side of the flow field. // These samples are added so that we do not need to apply clamping when // interpolating or upsampling the flow field #define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2) // When downsampling the flow field, each flow field entry covers a square // region of pixels in the image pyramid. This value is equal to the position // of the center of that region, as an offset from the top/left edge. // // Note: Using ((DOWNSAMPLE_FACTOR - 1) / 2) is equivalent to the more // natural expression ((DOWNSAMPLE_FACTOR / 2) - 1), // unless DOWNSAMPLE_FACTOR == 1 (ie, no downsampling), in which case // this gives the correct offset of 0 instead of -1. #define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2) static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively { -3 / 128., 29 / 128., 111 / 128., -9 / 128. }, { -9 / 128., 111 / 128., 29 / 128., -3 / 128. } }; static inline void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. 
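// For illustration: these match the standard Catmull-Rom cubic weights. At
// x = 0.5 the four taps below evaluate to [-1/16, 9/16, 9/16, -1/16], and for
// any x in [0, 1] the taps sum to exactly 1, so interpolating a constant
// signal returns that constant.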
assert(0 <= x && x <= 1); double x2 = x * x; double x3 = x2 * x; kernel[0] = -0.5 * x + x2 - 0.5 * x3; kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; kernel[3] = -0.5 * x2 + 0.5 * x3; } static inline void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); } static inline double get_cubic_value_dbl(const double *p, const double kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } static inline int get_cubic_value_int(const int *p, const int kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } static inline double bicubic_interp_one(const double *arr, int stride, const double h_kernel[4], const double v_kernel[4]) { double tmp[1 * 4]; // Horizontal convolution for (int i = -1; i < 3; ++i) { tmp[i + 1] = get_cubic_value_dbl(&arr[i * stride - 1], h_kernel); } // Vertical convolution return get_cubic_value_dbl(tmp, v_kernel); } static int determine_disflow_correspondence(const ImagePyramid *src_pyr, const ImagePyramid *ref_pyr, CornerList *corners, const FlowField *flow, Correspondence *correspondences) { const int width = flow->width; const int height = flow->height; const int stride = flow->stride; int num_correspondences = 0; for (int i = 0; i < corners->num_corners; ++i) { const int x0 = corners->corners[2 * i]; const int y0 = corners->corners[2 * i + 1]; // Offset points, to compensate for the fact that (say) a flow field entry // at horizontal index i, is nominally associated with the pixel at // horizontal coordinate (i << DOWNSAMPLE_FACTOR) + UPSAMPLE_CENTER_OFFSET // This offset must be applied before we split the coordinate into integer // and fractional parts, in order for the interpolation to be correct. const int x = x0 - UPSAMPLE_CENTER_OFFSET; const int y = y0 - UPSAMPLE_CENTER_OFFSET; // Split the pixel coordinates into integer flow field coordinates and // an offset for interpolation const int flow_x = x >> DOWNSAMPLE_SHIFT; const double flow_sub_x = (x & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; const int flow_y = y >> DOWNSAMPLE_SHIFT; const double flow_sub_y = (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; // Exclude points which would sample from the outer border of the flow // field, as this would give lower-quality results. // // Note: As we never read from the border region at pyramid level 0, we // can skip filling it in. If the conditions here are removed, or any // other logic is added which reads from this border region, then // compute_flow_field() will need to be modified to call // fill_flow_field_borders() at pyramid level 0 to set up the correct // border data. 
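// For example, with DOWNSAMPLE_SHIFT == 3, a corner at x0 = 100 gives
// x = 100 - UPSAMPLE_CENTER_OFFSET = 97, flow_x = 97 >> 3 = 12 and
// flow_sub_x = (97 & 7) / 8.0 = 0.125. The 4-tap bicubic interpolation below
// reads flow field entries at flow_x - 1 .. flow_x + 2 (and likewise in y),
// which is why flow_x must lie in [1, width - 3] and flow_y in [1, height - 3].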
if (flow_x < 1 || (flow_x + 2) >= width) continue; if (flow_y < 1 || (flow_y + 2) >= height) continue; double h_kernel[4]; double v_kernel[4]; get_cubic_kernel_dbl(flow_sub_x, h_kernel); get_cubic_kernel_dbl(flow_sub_y, v_kernel); double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], stride, h_kernel, v_kernel); double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], stride, h_kernel, v_kernel); // Refine the interpolated flow vector one last time const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER; const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER; aom_compute_flow_at_point( src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x, patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height, src_pyr->layers[0].stride, &flow_u, &flow_v); // Use original points (without offsets) when filling in correspondence // array correspondences[num_correspondences].x = x0; correspondences[num_correspondences].y = y0; correspondences[num_correspondences].rx = x0 + flow_u; correspondences[num_correspondences].ry = y0 + flow_v; num_correspondences++; } return num_correspondences; } // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, int *b) { memset(b, 0, 2 * sizeof(*b)); // Split offset into integer and fractional parts, and compute cubic // interpolation kernels const int u_int = (int)floor(u); const int v_int = (int)floor(v); const double u_frac = u - floor(u); const double v_frac = v - floor(v); int h_kernel[4]; int v_kernel[4]; get_cubic_kernel_int(u_frac, h_kernel); get_cubic_kernel_int(v_frac, v_kernel); // Storage for intermediate values between the two convolution directions int tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; int *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the // allocated border region, but allow them to go far enough out that // the border pixels' values do not change. // Since we are calculating an 8x8 block, the bottom-right pixel // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic // interpolation has 4 taps, meaning that the output of pixel // (x_w, y_w) depends on the pixels in the range // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). // // Thus the most extreme coordinates which will be fetched are // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). const int x0 = clamp(x + u_int, -9, width); const int y0 = clamp(y + v_int, -9, height); // Horizontal convolution for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) { const int y_w = y0 + i; for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { const int x_w = x0 + j; int arr[4]; arr[0] = (int)ref[y_w * stride + (x_w - 1)]; arr[1] = (int)ref[y_w * stride + (x_w + 0)]; arr[2] = (int)ref[y_w * stride + (x_w + 1)]; arr[3] = (int)ref[y_w * stride + (x_w + 2)]; // Apply kernel and round, keeping 6 extra bits of precision. // // 6 is the maximum allowable number of extra bits which will avoid // the intermediate values overflowing an int16_t. The most extreme // intermediate value occurs when: // * The input pixels are [0, 255, 255, 0] // * u_frac = 0.5 // In this case, the un-scaled output is 255 * 1.125 = 286.875. 
// As an integer with 6 fractional bits, that is 18360, which fits // in an int16_t. But with 7 fractional bits it would be 36720, // which is too large. tmp[i * DISFLOW_PATCH_SIZE + j] = ROUND_POWER_OF_TWO( get_cubic_value_int(arr, h_kernel), DISFLOW_INTERP_BITS - 6); } } // Vertical convolution for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { const int *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; const int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], p[2 * DISFLOW_PATCH_SIZE] }; const int result = get_cubic_value_int(arr, v_kernel); // Apply kernel and round. // This time, we have to round off the 6 extra bits which were kept // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits // of precision to match the scale of the dx and dy arrays. const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; const int warped = ROUND_POWER_OF_TWO(result, round_bits); const int src_px = src[(x + j) + (y + i) * stride] << 3; const int dt = warped - src_px; b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt; b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt; } } } static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int dir) { int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Sobel filter kernel // This must have an overall scale factor equal to DISFLOW_DERIV_SCALE, // in order to produce correctly scaled outputs. // To work out the scale factor, we multiply two factors: // // * For the derivative filter (sobel_a), comparing our filter // image[x - 1] - image[x + 1] // to the standard form // d/dx image[x] = image[x+1] - image[x] // tells us that we're actually calculating -2 * d/dx image[2] // // * For the smoothing filter (sobel_b), all coefficients are positive // so the scale factor is just the sum of the coefficients // // Thus we need to make sure that DISFLOW_DERIV_SCALE = 2 * sum(sobel_b) // (and take care of the - sign from sobel_a elsewhere) static const int16_t sobel_a[3] = { 1, 0, -1 }; static const int16_t sobel_b[3] = { 1, 2, 1 }; const int taps = 3; // horizontal filter const int16_t *h_kernel = dir ? sobel_a : sobel_b; for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { int sum = 0; for (int k = 0; k < taps; ++k) { sum += h_kernel[k] * src[y * src_stride + (x + k - 1)]; } tmp[y * DISFLOW_PATCH_SIZE + x] = sum; } } // vertical filter const int16_t *v_kernel = dir ? sobel_b : sobel_a; for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { int sum = 0; for (int k = 0; k < taps; ++k) { sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; } dst[y * dst_stride + x] = sum; } } } // Computes the components of the system of equations used to solve for // a flow vector. // // The flow equations are a least-squares system, derived as follows: // // For each pixel in the patch, we calculate the current error `dt`, // and the x and y gradients `dx` and `dy` of the source patch. // This means that, to first order, the squared error for this pixel is // // (dt + u * dx + v * dy)^2 // // where (u, v) are the incremental changes to the flow vector. // // We then want to find the values of u and v which minimize the sum // of the squared error across all pixels. Conveniently, this fits exactly // into the form of a least squares problem, with one equation // // u * dx + v * dy = -dt // // for each pixel. 
// // Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, // and absorbing the - sign elsewhere, this results in the least squares system // // M = |sum(dx * dx) sum(dx * dy)| // |sum(dx * dy) sum(dy * dy)| // // b = |sum(dx * dt)| // |sum(dy * dt)| static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { int tmp[4] = { 0 }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; // Don't compute tmp[2], as it should be equal to tmp[1] tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; } } // Apply regularization // We follow the standard regularization method of adding `k * I` before // inverting. This ensures that the matrix will be invertible. // // Setting the regularization strength k to 1 seems to work well here, as // typical values coming from the other equations are very large (1e5 to // 1e6, with an upper limit of around 6e7, at the time of writing). // It also preserves the property that all matrix values are whole numbers, // which is convenient for integerized SIMD implementation. tmp[0] += 1; tmp[3] += 1; tmp[2] = tmp[1]; M[0] = (double)tmp[0]; M[1] = (double)tmp[1]; M[2] = (double)tmp[2]; M[3] = (double)tmp[3]; } // Try to invert the matrix M // Note: Due to the nature of how a least-squares matrix is constructed, all of // the eigenvalues will be >= 0, and therefore det M >= 0 as well. // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; M_inv[0] = M[3] * det_inv; M_inv[1] = -M[1] * det_inv; M_inv[2] = -M[2] * det_inv; M_inv[3] = M[0] * det_inv; } void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { double M[4]; double M_inv[4]; int b[2]; int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; sobel_filter(src_patch, stride, dx, DISFLOW_PATCH_SIZE, 1); sobel_filter(src_patch, stride, dy, DISFLOW_PATCH_SIZE, 0); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, b); // Solve flow equations to find a better estimate for the flow vector // at this point const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { // Stop iteration when we're close to convergence break; } } } static void fill_flow_field_borders(double *flow, int width, int height, int stride) { // Calculate the bounds of the rectangle which was filled in by // compute_flow_field() before calling this function. // These indices are inclusive on both ends. 
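// For example, with the constants currently used (DISFLOW_PATCH_SIZE == 8
// from disflow.h, DOWNSAMPLE_SHIFT == 3 and FLOW_UPSCALE_TAPS == 4),
// FLOW_BORDER_INNER evaluates to 0 and FLOW_BORDER_OUTER to 2, so the loops
// below simply replicate the outermost computed row/column into the two
// padding entries on each side of the flow field.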
const int left_index = FLOW_BORDER_INNER; const int right_index = (width - FLOW_BORDER_INNER - 1); const int top_index = FLOW_BORDER_INNER; const int bottom_index = (height - FLOW_BORDER_INNER - 1); // Left area for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double left = row[left_index]; for (int j = -FLOW_BORDER_OUTER; j < left_index; j++) { row[j] = left; } } // Right area for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double right = row[right_index]; for (int j = right_index + 1; j < width + FLOW_BORDER_OUTER; j++) { row[j] = right; } } // Top area const double *top_row = flow + top_index * stride - FLOW_BORDER_OUTER; for (int i = -FLOW_BORDER_OUTER; i < top_index; i++) { double *row = flow + i * stride - FLOW_BORDER_OUTER; size_t length = width + 2 * FLOW_BORDER_OUTER; memcpy(row, top_row, length * sizeof(*row)); } // Bottom area const double *bottom_row = flow + bottom_index * stride - FLOW_BORDER_OUTER; for (int i = bottom_index + 1; i < height + FLOW_BORDER_OUTER; i++) { double *row = flow + i * stride - FLOW_BORDER_OUTER; size_t length = width + 2 * FLOW_BORDER_OUTER; memcpy(row, bottom_row, length * sizeof(*row)); } } // Upscale one component of the flow field, from a size of // cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing // the result back into the same buffer. This function also scales the flow // vector by 2, so that when we move to the next pyramid level down, the implied // motion vector is the same. // // The temporary buffer tmpbuf must be large enough to hold an intermediate // array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and // below. In other words, indices from -FLOW_BORDER_OUTER * stride to // (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid. // // Note that the same stride is used for u before and after upscaling // and for the temporary buffer, for simplicity. // // A note on phasing: // // The flow fields at two adjacent pyramid levels are offset from each other, // and we need to account for this in the construction of the interpolation // kernels. // // Consider an 8x8 pixel patch at pyramid level n. This is split into four // patches at pyramid level n-1. Bringing these patches back up to pyramid level // n, each sub-patch covers 4x4 pixels, and between them they cover the same // 8x8 region. // // Therefore, at pyramid level n, two adjacent patches look like this: // // + - - - - - - - + - - - - - - - + // | | | // | x x | x x | // | | | // | # | # | // | | | // | x x | x x | // | | | // + - - - - - - - + - - - - - - - + // // where # marks the center of a patch at pyramid level n (the input to this // function), and x marks the center of a patch at pyramid level n-1 (the output // of this function). // // By counting pixels (marked by +, -, and |), we can see that the flow vectors // at pyramid level n-1 are offset relative to the flow vectors at pyramid // level n, by 1/4 of the larger (input) patch size. Therefore, our // interpolation kernels need to have phases of 0.25 and 0.75. // // In addition, in order to handle the frame edges correctly, we need to // generate one output vector to the left and one to the right of each input // vector, even though these must be interpolated using different source points. 
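// As an illustrative sketch of the phase-0.75 / phase-0.25 kernels described
// above (this is not the function used by the library -- that is
// upscale_flow_component() below), a 1-D version of a single upscaling pass
// could look like the following. Border handling is reduced to simple
// clamping here, whereas the real code relies on the FLOW_BORDER_OUTER
// padding filled in by fill_flow_field_borders().
#if 0  // Illustration only, kept disabled in the same spirit as
       // ALLOW_TRANSLATION_MODELS in ransac.c
static void upscale_1d_sketch(const double *in, int n, double *out /*[2*n]*/) {
  for (int j = 0; j < n; j++) {
    double left = 0.0;   // output sample 0.25 units to the left of in[j]
    double right = 0.0;  // output sample 0.25 units to the right of in[j]
    for (int k = -2; k < 2; k++) {
      const int idx = AOMMAX(0, AOMMIN(n - 1, j + k));
      left += in[idx] * flow_upscale_filter[0][k + 2];
    }
    for (int k = -1; k < 3; k++) {
      const int idx = AOMMAX(0, AOMMIN(n - 1, j + k));
      right += in[idx] * flow_upscale_filter[1][k + 1];
    }
    // Scale by 2, as the real code does, so that flow vectors measured at
    // the coarser level have the correct magnitude at the finer level
    out[2 * j + 0] = 2.0 * left;
    out[2 * j + 1] = 2.0 * right;
  }
}
#endif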
static void upscale_flow_component(double *flow, int cur_width, int cur_height, int stride, double *tmpbuf) { const int half_len = FLOW_UPSCALE_TAPS / 2; // Check that the outer border is large enough to avoid needing to clamp // the source locations assert(half_len <= FLOW_BORDER_OUTER); // Horizontal upscale and multiply by 2 for (int i = 0; i < cur_height; i++) { for (int j = 0; j < cur_width; j++) { double left = 0; for (int k = -half_len; k < half_len; k++) { left += flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len]; } tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left; // Right output pixel is 0.25 units to the right of the input pixel double right = 0; for (int k = -(half_len - 1); k < (half_len + 1); k++) { right += flow[i * stride + (j + k)] * flow_upscale_filter[1][k + (half_len - 1)]; } tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right; } } // Fill in top and bottom borders of tmpbuf const double *top_row = &tmpbuf[0]; for (int i = -FLOW_BORDER_OUTER; i < 0; i++) { double *row = &tmpbuf[i * stride]; memcpy(row, top_row, 2 * cur_width * sizeof(*row)); } const double *bottom_row = &tmpbuf[(cur_height - 1) * stride]; for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) { double *row = &tmpbuf[i * stride]; memcpy(row, bottom_row, 2 * cur_width * sizeof(*row)); } // Vertical upscale int upscaled_width = cur_width * 2; for (int i = 0; i < cur_height; i++) { for (int j = 0; j < upscaled_width; j++) { double top = 0; for (int k = -half_len; k < half_len; k++) { top += tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len]; } flow[(2 * i) * stride + j] = top; double bottom = 0; for (int k = -(half_len - 1); k < (half_len + 1); k++) { bottom += tmpbuf[(i + k) * stride + j] * flow_upscale_filter[1][k + (half_len - 1)]; } flow[(2 * i + 1) * stride + j] = bottom; } } } // make sure flow_u and flow_v start at 0 static bool compute_flow_field(const ImagePyramid *src_pyr, const ImagePyramid *ref_pyr, int n_levels, FlowField *flow) { bool mem_status = true; double *flow_u = flow->u; double *flow_v = flow->v; double *tmpbuf0; double *tmpbuf; if (n_levels < 2) { // tmpbuf not needed tmpbuf0 = NULL; tmpbuf = NULL; } else { // This line must match the calculation of cur_flow_height below const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT; const size_t tmpbuf_size = (layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride; tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0)); if (!tmpbuf0) { mem_status = false; goto free_tmpbuf; } tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride; } // Compute flow field from coarsest to finest level of the pyramid // // Note: We stop after refining pyramid level 1 and interpolating it to // generate an initial flow field at level 0. We do *not* refine the dense // flow field at level 0. Instead, we wait until we have generated // correspondences by interpolating this flow field, and then refine the // correspondences themselves. This is both faster and gives better output // compared to refining the flow field at level 0 and then interpolating. 
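// For example, with three pyramid levels the loop below runs for level 2 and
// then level 1. Each pass refines the flow field at that resolution, after
// which upscale_flow_component() doubles both the field's dimensions and the
// vector magnitudes, so a 1-pixel motion estimated at level k corresponds to
// a 2-pixel motion at level k - 1.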
for (int level = n_levels - 1; level >= 1; --level) { const PyramidLayer *cur_layer = &src_pyr->layers[level]; const int cur_width = cur_layer->width; const int cur_height = cur_layer->height; const int cur_stride = cur_layer->stride; const uint8_t *src_buffer = cur_layer->buffer; const uint8_t *ref_buffer = ref_pyr->layers[level].buffer; const int cur_flow_width = cur_width >> DOWNSAMPLE_SHIFT; const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT; const int cur_flow_stride = flow->stride; for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER; i += 1) { for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER; j += 1) { const int flow_field_idx = i * cur_flow_stride + j; // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels, // which is centered on the region covered by this flow field entry const int patch_center_x = (j << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels const int patch_center_y = (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels const int patch_tl_x = patch_center_x - DISFLOW_PATCH_CENTER; const int patch_tl_y = patch_center_y - DISFLOW_PATCH_CENTER; assert(patch_tl_x >= 0); assert(patch_tl_y >= 0); aom_compute_flow_at_point(src_buffer, ref_buffer, patch_tl_x, patch_tl_y, cur_width, cur_height, cur_stride, &flow_u[flow_field_idx], &flow_v[flow_field_idx]); } } // Fill in the areas which we haven't explicitly computed, with copies // of the outermost values which we did compute fill_flow_field_borders(flow_u, cur_flow_width, cur_flow_height, cur_flow_stride); fill_flow_field_borders(flow_v, cur_flow_width, cur_flow_height, cur_flow_stride); if (level > 0) { const int upscale_flow_width = cur_flow_width << 1; const int upscale_flow_height = cur_flow_height << 1; const int upscale_stride = flow->stride; upscale_flow_component(flow_u, cur_flow_width, cur_flow_height, cur_flow_stride, tmpbuf); upscale_flow_component(flow_v, cur_flow_width, cur_flow_height, cur_flow_stride, tmpbuf); // If we didn't fill in the rightmost column or bottommost row during // upsampling (in order to keep the ratio to exactly 2), fill them // in here by copying the next closest column/row const PyramidLayer *next_layer = &src_pyr->layers[level - 1]; const int next_flow_width = next_layer->width >> DOWNSAMPLE_SHIFT; const int next_flow_height = next_layer->height >> DOWNSAMPLE_SHIFT; // Rightmost column if (next_flow_width > upscale_flow_width) { assert(next_flow_width == upscale_flow_width + 1); for (int i = 0; i < upscale_flow_height; i++) { const int index = i * upscale_stride + upscale_flow_width; flow_u[index] = flow_u[index - 1]; flow_v[index] = flow_v[index - 1]; } } // Bottommost row if (next_flow_height > upscale_flow_height) { assert(next_flow_height == upscale_flow_height + 1); for (int j = 0; j < next_flow_width; j++) { const int index = upscale_flow_height * upscale_stride + j; flow_u[index] = flow_u[index - upscale_stride]; flow_v[index] = flow_v[index - upscale_stride]; } } } } free_tmpbuf: aom_free(tmpbuf0); return mem_status; } static FlowField *alloc_flow_field(int frame_width, int frame_height) { FlowField *flow = (FlowField *)aom_malloc(sizeof(FlowField)); if (flow == NULL) return NULL; // Calculate the size of the bottom (largest) layer of the flow pyramid flow->width = frame_width >> DOWNSAMPLE_SHIFT; flow->height = frame_height >> DOWNSAMPLE_SHIFT; flow->stride = flow->width + 2 * FLOW_BORDER_OUTER; const size_t flow_size = flow->stride * (size_t)(flow->height + 2 * FLOW_BORDER_OUTER); flow->buf0 = 
aom_calloc(2 * flow_size, sizeof(*flow->buf0)); if (!flow->buf0) { aom_free(flow); return NULL; } flow->u = flow->buf0 + FLOW_BORDER_OUTER * flow->stride + FLOW_BORDER_OUTER; flow->v = flow->u + flow_size; return flow; } static void free_flow_field(FlowField *flow) { aom_free(flow->buf0); aom_free(flow); } // Compute flow field between `src` and `ref`, and then use that flow to // compute a global motion model relating the two frames. // // Following the convention in flow_estimation.h, the flow vectors are computed // at fixed points in `src` and point to the corresponding locations in `ref`, // regardless of the temporal ordering of the frames. bool av1_compute_global_motion_disflow( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { // Precompute information we will need about each frame ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; const int src_layers = aom_compute_pyramid(src, bit_depth, DISFLOW_PYRAMID_LEVELS, src_pyramid); const int ref_layers = aom_compute_pyramid(ref, bit_depth, DISFLOW_PYRAMID_LEVELS, ref_pyramid); if (src_layers < 0 || ref_layers < 0) { *mem_alloc_failed = true; return false; } if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } assert(src_layers == ref_layers); const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; assert(ref_pyramid->layers[0].width == src_width); assert(ref_pyramid->layers[0].height == src_height); FlowField *flow = alloc_flow_field(src_width, src_height); if (!flow) { *mem_alloc_failed = true; return false; } if (!compute_flow_field(src_pyramid, ref_pyramid, src_layers, flow)) { *mem_alloc_failed = true; free_flow_field(flow); return false; } // find correspondences between the two images using the flow field Correspondence *correspondences = aom_malloc(src_corners->num_corners * sizeof(*correspondences)); if (!correspondences) { *mem_alloc_failed = true; free_flow_field(flow); return false; } const int num_correspondences = determine_disflow_correspondence( src_pyramid, ref_pyramid, src_corners, flow, correspondences); bool result = ransac(correspondences, num_correspondences, type, motion_models, num_motion_models, mem_alloc_failed); aom_free(correspondences); free_flow_field(flow); return result; } aom-3.12.1/aom_dsp/flow_estimation/disflow.h000066400000000000000000000074071477627663500210560ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ #include #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus extern "C" { #endif // Number of pyramid levels in disflow computation #define DISFLOW_PYRAMID_LEVELS 12 // Size of square patches in the disflow dense grid // Must be a power of 2 #define DISFLOW_PATCH_SIZE_LOG2 3 #define DISFLOW_PATCH_SIZE (1 << DISFLOW_PATCH_SIZE_LOG2) // Center point of square patch #define DISFLOW_PATCH_CENTER ((DISFLOW_PATCH_SIZE / 2) - 1) // Overall scale of the `dx`, `dy` and `dt` arrays in the disflow code // In other words, the various derivatives are calculated with an internal // precision of (8 + DISFLOW_DERIV_SCALE_LOG2) bits, from an 8-bit input. // // This must be carefully synchronized with the code in sobel_filter() // (which fills the dx and dy arrays) and compute_flow_error() (which // fills dt); see the comments in those functions for more details #define DISFLOW_DERIV_SCALE_LOG2 3 #define DISFLOW_DERIV_SCALE (1 << DISFLOW_DERIV_SCALE_LOG2) // Scale factor applied to each step in the main refinement loop // // This should be <= 1.0 to avoid overshoot. Values below 1.0 // may help in some cases, but slow convergence overall, so // will require careful tuning. // TODO(rachelbarker): Tune this value #define DISFLOW_STEP_SIZE 1.0 // Step size at which we should terminate iteration // The idea here is that, if we take a step which is much smaller than 1px in // size, then the values won't change much from iteration to iteration, so // many future steps will also be small, and that won't have much effect // on the ultimate result. So we can terminate early. // // To look at it another way, when we take a small step, that means that // either we're near to convergence (so can stop), or we're stuck in a // shallow valley and will take many iterations to get unstuck. // // Solving the latter properly requires fancier methods, such as "gradient // descent with momentum". For now, we terminate to avoid wasting a ton of // time on points which are either nearly-converged or stuck. // // Terminating at 1/8 px seems to give good results for global motion estimation #define DISFLOW_STEP_SIZE_THRESOLD (1. / 8.) // Max number of iterations if warp convergence is not found #define DISFLOW_MAX_ITR 4 // Internal precision of cubic interpolation filters // The limiting factor here is that: // * Before integerizing, the maximum value of any kernel tap is 1.0 // * After integerizing, each tap must fit into an int16_t. // Thus the largest multiplier we can get away with is 2^14 = 16384, // as 2^15 = 32768 is too large to fit in an int16_t. #define DISFLOW_INTERP_BITS 14 typedef struct { // Start of allocation for u and v buffers double *buf0; // x and y directions of flow, per patch double *u; double *v; // Sizes of the above arrays int width; int height; int stride; } FlowField; bool av1_compute_global_motion_disflow( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ aom-3.12.1/aom_dsp/flow_estimation/flow_estimation.c000066400000000000000000000043111477627663500225740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" #include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" // clang-format off const double kIdentityParams[MAX_PARAMDIM] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 }; // clang-format on // Compute a global motion model between the given source and ref frames. // // As is standard for video codecs, the resulting model maps from (x, y) // coordinates in `src` to the corresponding points in `ref`, regardless // of the temporal order of the two frames. // // Returns true if global motion estimation succeeded, false if not. // The output models should only be used if this function succeeds. bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { switch (gm_method) { case GLOBAL_MOTION_METHOD_FEATURE_MATCH: return av1_compute_global_motion_feature_match( type, src, ref, bit_depth, downsample_level, motion_models, num_motion_models, mem_alloc_failed); case GLOBAL_MOTION_METHOD_DISFLOW: return av1_compute_global_motion_disflow( type, src, ref, bit_depth, downsample_level, motion_models, num_motion_models, mem_alloc_failed); default: assert(0 && "Unknown global motion estimation type"); } return false; } aom-3.12.1/aom_dsp/flow_estimation/flow_estimation.h000066400000000000000000000061471477627663500226120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_H_ #include "aom_dsp/pyramid.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus extern "C" { #endif #define MAX_PARAMDIM 6 #define MIN_INLIER_PROB 0.1 /* clang-format off */ enum { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter TRANS_TYPES, } UENUM1BYTE(TransformationType); /* clang-format on */ // number of parameters used by each transformation in TransformationTypes static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; // Available methods which can be used for global motion estimation typedef enum { GLOBAL_MOTION_METHOD_FEATURE_MATCH, GLOBAL_MOTION_METHOD_DISFLOW, GLOBAL_MOTION_METHOD_LAST = GLOBAL_MOTION_METHOD_DISFLOW, GLOBAL_MOTION_METHODS } GlobalMotionMethod; typedef struct { double params[MAX_PARAMDIM]; int *inliers; int num_inliers; } MotionModel; // Data structure to store a single correspondence point during global // motion search. // // A correspondence (x, y) -> (rx, ry) means that point (x, y) in the // source frame corresponds to point (rx, ry) in the ref frame. typedef struct { double x, y; double rx, ry; } Correspondence; // Which global motion method should we use in practice? // Disflow is both faster and gives better results than feature matching in // practically all cases, so we use disflow by default static const GlobalMotionMethod default_global_motion_method = GLOBAL_MOTION_METHOD_DISFLOW; extern const double kIdentityParams[MAX_PARAMDIM]; // Compute a global motion model between the given source and ref frames. // // As is standard for video codecs, the resulting model maps from (x, y) // coordinates in `src` to the corresponding points in `ref`, regardless // of the temporal order of the two frames. // // Returns true if global motion estimation succeeded, false if not. // The output models should only be used if this function succeeds. bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_FLOW_ESTIMATION_H_ aom-3.12.1/aom_dsp/flow_estimation/ransac.c000066400000000000000000000424201477627663500206430ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include #include #include #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/mathutils.h" #include "aom_mem/aom_mem.h" // TODO(rachelbarker): Remove dependence on code in av1/encoder/ #include "av1/encoder/random.h" #define MAX_MINPTS 4 #define MINPTS_MULTIPLIER 5 #define INLIER_THRESHOLD 1.25 #define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) // Number of initial models to generate #define NUM_TRIALS 20 // Number of times to refine the best model found #define NUM_REFINES 5 // Flag to enable functions for finding TRANSLATION type models. // // These modes are not considered currently due to a spec bug (see comments // in gm_get_motion_vector() in av1/common/mv.h). Thus we don't need to compile // the corresponding search functions, but it is nice to keep the source around // but disabled, for completeness. #define ALLOW_TRANSLATION_MODELS 0 typedef struct { int num_inliers; double sse; // Sum of squared errors of inliers int *inlier_indices; } RANSAC_MOTION; //////////////////////////////////////////////////////////////////////////////// // ransac typedef bool (*FindTransformationFunc)(const Correspondence *points, const int *indices, int num_indices, double *params); typedef void (*ScoreModelFunc)(const double *mat, const Correspondence *points, int num_points, RANSAC_MOTION *model); // vtable-like structure which stores all of the information needed by RANSAC // for a particular model type typedef struct { FindTransformationFunc find_transformation; ScoreModelFunc score_model; // The minimum number of points which can be passed to find_transformation // to generate a model. // // This should be set as small as possible. This is due to an observation // from section 4 of "Optimal Ransac" by A. Hast, J. Nysjö and // A. Marchetti (https://dspace5.zcu.cz/bitstream/11025/6869/1/Hast.pdf): // using the minimum possible number of points in the initial model maximizes // the chance that all of the selected points are inliers. // // That paper proposes a method which can deal with models which are // contaminated by outliers, which helps in cases where the inlier fraction // is low. However, for our purposes, global motion only gives significant // gains when the inlier fraction is high. // // So we do not use the method from this paper, but we do find that // minimizing the number of points used for initial model fitting helps // make the best use of the limited number of models we consider. 
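// As a concrete illustration: each correspondence contributes two linear
// equations (one for x, one for y), so the 4-parameter ROTZOOM fit in
// find_rotzoom() needs at least 2 point pairs, and the 6-parameter AFFINE
// fit in find_affine() needs at least 3.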
int minpts; } RansacModelInfo; #if ALLOW_TRANSLATION_MODELS static void score_translation(const double *mat, const Correspondence *points, int num_points, RANSAC_MOTION *model) { model->num_inliers = 0; model->sse = 0.0; for (int i = 0; i < num_points; ++i) { const double x1 = points[i].x; const double y1 = points[i].y; const double x2 = points[i].rx; const double y2 = points[i].ry; const double proj_x = x1 + mat[0]; const double proj_y = y1 + mat[1]; const double dx = proj_x - x2; const double dy = proj_y - y2; const double sse = dx * dx + dy * dy; if (sse < INLIER_THRESHOLD_SQUARED) { model->inlier_indices[model->num_inliers++] = i; model->sse += sse; } } } #endif // ALLOW_TRANSLATION_MODELS static void score_affine(const double *mat, const Correspondence *points, int num_points, RANSAC_MOTION *model) { model->num_inliers = 0; model->sse = 0.0; for (int i = 0; i < num_points; ++i) { const double x1 = points[i].x; const double y1 = points[i].y; const double x2 = points[i].rx; const double y2 = points[i].ry; const double proj_x = mat[2] * x1 + mat[3] * y1 + mat[0]; const double proj_y = mat[4] * x1 + mat[5] * y1 + mat[1]; const double dx = proj_x - x2; const double dy = proj_y - y2; const double sse = dx * dx + dy * dy; if (sse < INLIER_THRESHOLD_SQUARED) { model->inlier_indices[model->num_inliers++] = i; model->sse += sse; } } } #if ALLOW_TRANSLATION_MODELS static bool find_translation(const Correspondence *points, const int *indices, int num_indices, double *params) { double sumx = 0; double sumy = 0; for (int i = 0; i < num_indices; ++i) { int index = indices[i]; const double sx = points[index].x; const double sy = points[index].y; const double dx = points[index].rx; const double dy = points[index].ry; sumx += dx - sx; sumy += dy - sy; } params[0] = sumx / np; params[1] = sumy / np; params[2] = 1; params[3] = 0; params[4] = 0; params[5] = 1; return true; } #endif // ALLOW_TRANSLATION_MODELS static bool find_rotzoom(const Correspondence *points, const int *indices, int num_indices, double *params) { const int n = 4; // Size of least-squares problem double mat[4 * 4]; // Accumulator for A'A double y[4]; // Accumulator for A'b double a[4]; // Single row of A double b; // Single element of b least_squares_init(mat, y, n); for (int i = 0; i < num_indices; ++i) { int index = indices[i]; const double sx = points[index].x; const double sy = points[index].y; const double dx = points[index].rx; const double dy = points[index].ry; a[0] = 1; a[1] = 0; a[2] = sx; a[3] = sy; b = dx; least_squares_accumulate(mat, y, a, b, n); a[0] = 0; a[1] = 1; a[2] = sy; a[3] = -sx; b = dy; least_squares_accumulate(mat, y, a, b, n); } // Fill in params[0] .. params[3] with output model if (!least_squares_solve(mat, y, params, n)) { return false; } // Fill in remaining parameters params[4] = -params[3]; params[5] = params[2]; return true; } static bool find_affine(const Correspondence *points, const int *indices, int num_indices, double *params) { // Note: The least squares problem for affine models is 6-dimensional, // but it splits into two independent 3-dimensional subproblems. // Solving these two subproblems separately and recombining at the end // results in less total computation than solving the 6-dimensional // problem directly. // // The two subproblems correspond to all the parameters which contribute // to the x output of the model, and all the parameters which contribute // to the y output, respectively. 
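// Concretely, the x subproblem below fits dx ~= c0 + c1 * sx + c2 * sy and
// the y subproblem fits dy ~= d0 + d1 * sx + d2 * sy. The solutions are then
// rearranged so that (c0, c1, c2) become params[0], params[2], params[3] and
// (d0, d1, d2) become params[1], params[4], params[5], matching the
// projection used in score_affine().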
const int n = 3; // Size of each least-squares problem double mat[2][3 * 3]; // Accumulator for A'A double y[2][3]; // Accumulator for A'b double x[2][3]; // Output vector double a[2][3]; // Single row of A double b[2]; // Single element of b least_squares_init(mat[0], y[0], n); least_squares_init(mat[1], y[1], n); for (int i = 0; i < num_indices; ++i) { int index = indices[i]; const double sx = points[index].x; const double sy = points[index].y; const double dx = points[index].rx; const double dy = points[index].ry; a[0][0] = 1; a[0][1] = sx; a[0][2] = sy; b[0] = dx; least_squares_accumulate(mat[0], y[0], a[0], b[0], n); a[1][0] = 1; a[1][1] = sx; a[1][2] = sy; b[1] = dy; least_squares_accumulate(mat[1], y[1], a[1], b[1], n); } if (!least_squares_solve(mat[0], y[0], x[0], n)) { return false; } if (!least_squares_solve(mat[1], y[1], x[1], n)) { return false; } // Rearrange least squares result to form output model params[0] = x[0][0]; params[1] = x[1][0]; params[2] = x[0][1]; params[3] = x[0][2]; params[4] = x[1][1]; params[5] = x[1][2]; return true; } // Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. static int compare_motions(const void *arg_a, const void *arg_b) { const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b; if (motion_a->num_inliers > motion_b->num_inliers) return -1; if (motion_a->num_inliers < motion_b->num_inliers) return 1; if (motion_a->sse < motion_b->sse) return -1; if (motion_a->sse > motion_b->sse) return 1; return 0; } static bool is_better_motion(const RANSAC_MOTION *motion_a, const RANSAC_MOTION *motion_b) { return compare_motions(motion_a, motion_b) < 0; } // Returns true on success, false on error static bool ransac_internal(const Correspondence *matched_points, int npoints, MotionModel *motion_models, int num_desired_motions, const RansacModelInfo *model_info, bool *mem_alloc_failed) { assert(npoints >= 0); int i = 0; int minpts = model_info->minpts; bool ret_val = true; unsigned int seed = (unsigned int)npoints; int indices[MAX_MINPTS] = { 0 }; // Store information for the num_desired_motions best transformations found // and the worst motion among them, as well as the motion currently under // consideration. RANSAC_MOTION *motions, *worst_kept_motion = NULL; RANSAC_MOTION current_motion; // Store the parameters and the indices of the inlier points for the motion // currently under consideration. 
double params_this_motion[MAX_PARAMDIM]; // Initialize output models, as a fallback in case we can't find a model for (i = 0; i < num_desired_motions; i++) { memcpy(motion_models[i].params, kIdentityParams, MAX_PARAMDIM * sizeof(*(motion_models[i].params))); motion_models[i].num_inliers = 0; } if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return false; } int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts); motions = (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION)); // Allocate one large buffer which will be carved up to store the inlier // indices for the current motion plus the num_desired_motions many // output models // This allows us to keep the allocation/deallocation logic simple, without // having to (for example) check that `motions` is non-null before allocating // the inlier arrays int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints * (num_desired_motions + 1)); if (!(motions && inlier_buffer)) { ret_val = false; *mem_alloc_failed = true; goto finish_ransac; } // Once all our allocations are known-good, we can fill in our structures worst_kept_motion = motions; for (i = 0; i < num_desired_motions; ++i) { motions[i].inlier_indices = inlier_buffer + i * npoints; } memset(&current_motion, 0, sizeof(current_motion)); current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints; for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) { lcg_pick(npoints, minpts, indices, &seed); if (!model_info->find_transformation(matched_points, indices, minpts, params_this_motion)) { continue; } model_info->score_model(params_this_motion, matched_points, npoints, &current_motion); if (current_motion.num_inliers < min_inliers) { // Reject models with too few inliers continue; } if (is_better_motion(&current_motion, worst_kept_motion)) { // This motion is better than the worst currently kept motion. Remember // the inlier points and sse. The parameters for each kept motion // will be recomputed later using only the inliers. worst_kept_motion->num_inliers = current_motion.num_inliers; worst_kept_motion->sse = current_motion.sse; // Rather than copying the (potentially many) inlier indices from // current_motion.inlier_indices to worst_kept_motion->inlier_indices, // we can swap the underlying pointers. // // This is okay because the next time current_motion.inlier_indices // is used will be in the next trial, where we ignore its previous // contents anyway. And both arrays will be deallocated together at the // end of this function, so there are no lifetime issues. int *tmp = worst_kept_motion->inlier_indices; worst_kept_motion->inlier_indices = current_motion.inlier_indices; current_motion.inlier_indices = tmp; // Determine the new worst kept motion and its num_inliers and sse. for (i = 0; i < num_desired_motions; ++i) { if (is_better_motion(worst_kept_motion, &motions[i])) { worst_kept_motion = &motions[i]; } } } } // Sort the motions, best first. qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); // Refine each of the best N models using iterative estimation. // // The idea here is loosely based on the iterative method from // "Locally Optimized RANSAC" by O. Chum, J.
Matas and Josef Kittler: // https://cmp.felk.cvut.cz/ftp/articles/matas/chum-dagm03.pdf // // However, we implement a simpler version than their proposal, and simply // refit the model repeatedly until the number of inliers stops increasing, // with a cap on the number of iterations to defend against edge cases which // only improve very slowly. for (i = 0; i < num_desired_motions; ++i) { if (motions[i].num_inliers <= 0) { // Output model has already been initialized to the identity model, // so just skip setup continue; } bool bad_model = false; for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) { int num_inliers = motions[i].num_inliers; assert(num_inliers >= min_inliers); if (!model_info->find_transformation(matched_points, motions[i].inlier_indices, num_inliers, params_this_motion)) { // In the unlikely event that this model fitting fails, we don't have a // good fallback. So leave this model set to the identity model bad_model = true; break; } // Score the newly generated model model_info->score_model(params_this_motion, matched_points, npoints, &current_motion); // At this point, there are three possibilities: // 1) If we found more inliers, keep refining. // 2) If we found the same number of inliers but a lower SSE, we want to // keep the new model, but further refinement is unlikely to gain much. // So commit to this new model // 3) It is possible, but very unlikely, that the new model will have // fewer inliers. If it does happen, we probably just lost a few // borderline inliers. So treat the same as case (2). if (current_motion.num_inliers > motions[i].num_inliers) { motions[i].num_inliers = current_motion.num_inliers; motions[i].sse = current_motion.sse; int *tmp = motions[i].inlier_indices; motions[i].inlier_indices = current_motion.inlier_indices; current_motion.inlier_indices = tmp; } else { // Refined model is no better, so stop // This shouldn't be significantly worse than the previous model, // so it's fine to use the parameters in params_this_motion. // This saves us from having to cache the previous iteration's params.
break; } } if (bad_model) continue; // Fill in output struct memcpy(motion_models[i].params, params_this_motion, MAX_PARAMDIM * sizeof(*motion_models[i].params)); for (int j = 0; j < motions[i].num_inliers; j++) { int index = motions[i].inlier_indices[j]; const Correspondence *corr = &matched_points[index]; motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); } motion_models[i].num_inliers = motions[i].num_inliers; } finish_ransac: aom_free(inlier_buffer); aom_free(motions); return ret_val; } static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { // IDENTITY { NULL, NULL, 0 }, // TRANSLATION #if ALLOW_TRANSLATION_MODELS { find_translation, score_translation, 1 }, #else { NULL, NULL, 0 }, #endif // ROTZOOM { find_rotzoom, score_affine, 2 }, // AFFINE { find_affine, score_affine, 3 }, }; // Returns true on success, false on error bool ransac(const Correspondence *matched_points, int npoints, TransformationType type, MotionModel *motion_models, int num_desired_motions, bool *mem_alloc_failed) { #if ALLOW_TRANSLATION_MODELS assert(type > IDENTITY && type < TRANS_TYPES); #else assert(type > TRANSLATION && type < TRANS_TYPES); #endif // ALLOW_TRANSLATION_MODELS return ransac_internal(matched_points, npoints, motion_models, num_desired_motions, &ransac_model_info[type], mem_alloc_failed); } aom-3.12.1/aom_dsp/flow_estimation/ransac.h000066400000000000000000000020761477627663500206530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ #define AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ #include #include #include #include #include #include "aom_dsp/flow_estimation/flow_estimation.h" #ifdef __cplusplus extern "C" { #endif bool ransac(const Correspondence *matched_points, int npoints, TransformationType type, MotionModel *motion_models, int num_desired_motions, bool *mem_alloc_failed); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ aom-3.12.1/aom_dsp/flow_estimation/x86/000077500000000000000000000000001477627663500176535ustar00rootroot00000000000000aom-3.12.1/aom_dsp/flow_estimation/x86/corner_match_avx2.c000066400000000000000000000125021477627663500234230ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; #if MATCH_SZ != 16 #error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif /* Compute mean and standard deviation of pixels in a window of size MATCH_SZ by MATCH_SZ centered at (x, y). Store results into *mean and *one_over_stddev Note: The output of this function is scaled by MATCH_SZ, as in *mean = MATCH_SZ * and *one_over_stddev = 1 / (MATCH_SZ * ) Combined with the fact that we return 1/stddev rather than the standard deviation itself, this allows us to completely avoid divisions in aom_compute_correlation, which is much hotter than this function is. Returns true if this feature point is usable, false otherwise. */ bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev) { __m256i sum_vec = _mm256_setzero_si256(); __m256i sumsq_vec = _mm256_setzero_si256(); frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); for (int i = 0; i < MATCH_SZ; ++i) { const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); sum_vec = _mm256_add_epi16(sum_vec, v); sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); frame += stride; } // Reduce sum_vec and sumsq_vec into single values // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 // additions, sum vertically to do 4 more, then the last 2 in scalar code. const __m256i ones = _mm256_load_si256((__m256i *)ones_array); const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), _mm256_extracti128_si256(tmp_8x32, 1)); const int sum = _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); const int sumsq = _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); *mean = (double)sum / MATCH_SZ; const double variance = sumsq - (*mean) * (*mean); if (variance < MIN_FEATURE_VARIANCE) { *one_over_stddev = 0.0; return false; } *one_over_stddev = 1.0 / sqrt(variance); return true; } /* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. To save on computation, the mean and (1 divided by the) standard deviation of the window in each frame are precomputed and passed into this function as arguments. 
*/ double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2) { __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); for (int i = 0; i < MATCH_SZ; ++i) { const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); frame1 += stride1; frame2 += stride2; } // Sum cross_vec into a single value const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), _mm256_extracti128_si256(cross_vec, 1)); const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); // Note: In theory, the calculations here "should" be // covariance = cross / N^2 - mean1 * mean2 // correlation = covariance / (stddev1 * stddev2). // // However, because of the scaling in aom_compute_mean_stddev, the // lines below actually calculate // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) // // ie. we have removed the need for a division, and still end up with the // correct unscaled correlation (ie, in the range [-1, +1]) const double covariance = cross - mean1 * mean2; const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); return correlation; } aom-3.12.1/aom_dsp/flow_estimation/x86/corner_match_sse4.c000066400000000000000000000136461477627663500234330ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" DECLARE_ALIGNED(16, static const uint16_t, ones_array[8]) = { 1, 1, 1, 1, 1, 1, 1, 1 }; #if MATCH_SZ != 16 #error "Need to apply pixel mask in corner_match_sse4.c if MATCH_SZ != 16" #endif /* Compute mean and standard deviation of pixels in a window of size MATCH_SZ by MATCH_SZ centered at (x, y). Store results into *mean and *one_over_stddev Note: The output of this function is scaled by MATCH_SZ, as in *mean = MATCH_SZ * and *one_over_stddev = 1 / (MATCH_SZ * ) Combined with the fact that we return 1/stddev rather than the standard deviation itself, this allows us to completely avoid divisions in aom_compute_correlation, which is much hotter than this function is. Returns true if this feature point is usable, false otherwise. */ bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev) { // 8 16-bit partial sums of pixels // Each lane sums at most 2*MATCH_SZ pixels, which can have values up to 255, // and is therefore at most 2*MATCH_SZ*255, which is > 2^8 but < 2^16. 
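// (Concretely, with MATCH_SZ == 16 this worst case is 2 * 16 * 255 = 8160,
// which also fits comfortably within a signed 16-bit lane.)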
// Thus this value is safe to store in 16 bits. __m128i sum_vec = _mm_setzero_si128(); // 8 32-bit partial sums of squares __m128i sumsq_vec_l = _mm_setzero_si128(); __m128i sumsq_vec_r = _mm_setzero_si128(); frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); for (int i = 0; i < MATCH_SZ; ++i) { const __m128i v = _mm_loadu_si128((__m128i *)frame); const __m128i v_l = _mm_cvtepu8_epi16(v); const __m128i v_r = _mm_cvtepu8_epi16(_mm_srli_si128(v, 8)); sum_vec = _mm_add_epi16(sum_vec, _mm_add_epi16(v_l, v_r)); sumsq_vec_l = _mm_add_epi32(sumsq_vec_l, _mm_madd_epi16(v_l, v_l)); sumsq_vec_r = _mm_add_epi32(sumsq_vec_r, _mm_madd_epi16(v_r, v_r)); frame += stride; } // Reduce sum_vec and sumsq_vec into single values // Start by reducing each vector to 4x32-bit values, hadd() to perform four // additions, then perform the last two additions in scalar code. const __m128i ones = _mm_load_si128((__m128i *)ones_array); const __m128i partial_sum = _mm_madd_epi16(sum_vec, ones); const __m128i partial_sumsq = _mm_add_epi32(sumsq_vec_l, sumsq_vec_r); const __m128i tmp = _mm_hadd_epi32(partial_sum, partial_sumsq); const int sum = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1); const int sumsq = _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); *mean = (double)sum / MATCH_SZ; const double variance = sumsq - (*mean) * (*mean); if (variance < MIN_FEATURE_VARIANCE) { *one_over_stddev = 0.0; return false; } *one_over_stddev = 1.0 / sqrt(variance); return true; } /* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. To save on computation, the mean and (1 divided by the) standard deviation of the window in each frame are precomputed and passed into this function as arguments. */ double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2) { // 8 32-bit partial sums of products __m128i cross_vec_l = _mm_setzero_si128(); __m128i cross_vec_r = _mm_setzero_si128(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); for (int i = 0; i < MATCH_SZ; ++i) { const __m128i v1 = _mm_loadu_si128((__m128i *)frame1); const __m128i v2 = _mm_loadu_si128((__m128i *)frame2); const __m128i v1_l = _mm_cvtepu8_epi16(v1); const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); const __m128i v2_l = _mm_cvtepu8_epi16(v2); const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); cross_vec_l = _mm_add_epi32(cross_vec_l, _mm_madd_epi16(v1_l, v2_l)); cross_vec_r = _mm_add_epi32(cross_vec_r, _mm_madd_epi16(v1_r, v2_r)); frame1 += stride1; frame2 += stride2; } // Sum cross_vec into a single value const __m128i tmp = _mm_add_epi32(cross_vec_l, cross_vec_r); const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); // Note: In theory, the calculations here "should" be // covariance = cross / N^2 - mean1 * mean2 // correlation = covariance / (stddev1 * stddev2). // // However, because of the scaling in aom_compute_mean_stddev, the // lines below actually calculate // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) // // ie. 
we have removed the need for a division, and still end up with the // correct unscaled correlation (ie, in the range [-1, +1]) const double covariance = cross - mean1 * mean2; const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); return correlation; } aom-3.12.1/aom_dsp/flow_estimation/x86/disflow_avx2.c000066400000000000000000000445351477627663500224410ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "config/aom_dsp_rtcd.h" #if DISFLOW_PATCH_SIZE != 8 #error "Need to change disflow_avx2.c if DISFLOW_PATCH_SIZE != 8" #endif // Compute horizontal and vertical kernels and return them packed into a // register. The coefficient ordering is: // h0, h1, v0, v1, h2, h3, v2, v3 // This is chosen because it takes less work than fully separating the kernels, // but it is separated enough that we can pick out each coefficient pair in the // main compute_flow_at_point function static inline __m128i compute_cubic_kernels(double u, double v) { const __m128d x = _mm_set_pd(v, u); const __m128d x2 = _mm_mul_pd(x, x); const __m128d x3 = _mm_mul_pd(x2, x); // Macro to multiply a value v by a constant coefficient c #define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) // Compute floating-point kernel // Note: To ensure results are bit-identical to the C code, we need to perform // exactly the same sequence of operations here as in the C code. __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); __m128d k1 = _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); __m128d k2 = _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); #undef MULC // Integerize __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); k0 = _mm_round_pd(_mm_mul_pd(k0, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k1 = _mm_round_pd(_mm_mul_pd(k1, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k2 = _mm_round_pd(_mm_mul_pd(k2, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k3 = _mm_round_pd(_mm_mul_pd(k3, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); const __m128i c0 = _mm_cvtpd_epi32(k0); const __m128i c1 = _mm_cvtpd_epi32(k1); const __m128i c2 = _mm_cvtpd_epi32(k2); const __m128i c3 = _mm_cvtpd_epi32(k3); // Rearrange results and convert down to 16 bits, giving the target output // ordering const __m128i c01 = _mm_unpacklo_epi32(c0, c1); const __m128i c23 = _mm_unpacklo_epi32(c2, c3); return _mm_packs_epi32(c01, c23); } // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. 
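// (In this implementation the comparison is not returned directly; instead the
// per-pixel error dt between the warped reference patch and the source patch
// is accumulated, up to fixed-point scaling, into b[0] ~ sum(dx * dt) and
// b[1] ~ sum(dy * dt) over the 8x8 patch.)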
// // TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation // instad of bicubic interpolation static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, int *b) { const __m256i zero = _mm256_setzero_si256(); // Accumulate 8 32-bit partial sums for each element of b // These will be flattened at the end. __m256i b0_acc = _mm256_setzero_si256(); __m256i b1_acc = _mm256_setzero_si256(); // Split offset into integer and fractional parts, and compute cubic // interpolation kernels const int u_int = (int)floor(u); const int v_int = (int)floor(v); const double u_frac = u - floor(u); const double v_frac = v - floor(v); const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions // In the AVX2 implementation, this needs a dummy row at the end, because // we generate 2 rows at a time but the total number of rows is odd. // So we generate one more row than we actually need. DECLARE_ALIGNED(32, int16_t, tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 4)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the // allocated border region, but allow them to go far enough out that // the border pixels' values do not change. // Since we are calculating an 8x8 block, the bottom-right pixel // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic // interpolation has 4 taps, meaning that the output of pixel // (x_w, y_w) depends on the pixels in the range // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). // // Thus the most extreme coordinates which will be fetched are // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). const int x0 = clamp(x + u_int, -9, width); const int y0 = clamp(y + v_int, -9, height); // Horizontal convolution // Prepare the kernel vectors // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 __m256i h_kernel_01 = _mm256_broadcastd_epi32(kernels); __m256i h_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 8)); __m256i round_const_h = _mm256_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; i += 2) { const int y_w = y0 + i; const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; // Load this row of pixels. // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, // for a total of 11 pixels. Here we load 16 pixels, but only use // the first 11. __m256i row = yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); // Expand pixels to int16s // We must use unpacks here, as we have one row in each 128-bit lane // and want to handle each of those independently. // This is in contrast to _mm256_cvtepu8_epi16(), which takes a single // 128-bit input and widens it to 256 bits. 
__m256i px_0to7_i16 = _mm256_unpacklo_epi8(row, zero); __m256i px_4to10_i16 = _mm256_unpacklo_epi8(_mm256_srli_si256(row, 4), zero); // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 __m256i px0 = _mm256_unpacklo_epi16(px_0to7_i16, _mm256_srli_si256(px_0to7_i16, 2)); // input pixels 2, 3, 3, 4, 4, 5, 5, 6 // * kernel 2, 3, 2, 3, 2, 3, 2, 3 __m256i px1 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_0to7_i16, 4), _mm256_srli_si256(px_0to7_i16, 6)); // Convolve with kernel and sum 2x2 boxes to form first 4 outputs __m256i sum0 = _mm256_add_epi32(_mm256_madd_epi16(px0, h_kernel_01), _mm256_madd_epi16(px1, h_kernel_23)); __m256i out0 = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_h), DISFLOW_INTERP_BITS - 6); // Compute second four outputs __m256i px2 = _mm256_unpacklo_epi16(px_4to10_i16, _mm256_srli_si256(px_4to10_i16, 2)); __m256i px3 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_4to10_i16, 4), _mm256_srli_si256(px_4to10_i16, 6)); __m256i sum1 = _mm256_add_epi32(_mm256_madd_epi16(px2, h_kernel_01), _mm256_madd_epi16(px3, h_kernel_23)); // Round by just enough bits that the result is // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 // as it does now // This means shifting down so we have 6 extra bits, for a maximum value // of +18360, which can occur if u_frac == 0.5 and the input pixels are // {0, 255, 255, 0}. __m256i out1 = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_h), DISFLOW_INTERP_BITS - 6); _mm256_storeu_si256((__m256i *)tmp_row, _mm256_packs_epi32(out0, out1)); } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m256i round_const_v = _mm256_set1_epi32(1 << (round_bits - 1)); __m256i v_kernel_01 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 4)); __m256i v_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 12)); for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; // Load 5 rows of 8 x 16-bit values, and pack into 4 registers // holding rows {0, 1}, {1, 2}, {2, 3}, {3, 4} __m128i row0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); __m128i row1 = _mm_loadu_si128((__m128i *)tmp_row); __m128i row2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); __m128i row3 = _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); __m128i row4 = _mm_loadu_si128((__m128i *)(tmp_row + 3 * DISFLOW_PATCH_SIZE)); __m256i px0 = _mm256_set_m128i(row1, row0); __m256i px1 = _mm256_set_m128i(row2, row1); __m256i px2 = _mm256_set_m128i(row3, row2); __m256i px3 = _mm256_set_m128i(row4, row3); // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , // but each multiply expands its output to 32 bits. 
So we need to be // a little clever about how we do this __m256i sum0 = _mm256_add_epi32( _mm256_madd_epi16(_mm256_unpacklo_epi16(px0, px1), v_kernel_01), _mm256_madd_epi16(_mm256_unpacklo_epi16(px2, px3), v_kernel_23)); __m256i sum1 = _mm256_add_epi32( _mm256_madd_epi16(_mm256_unpackhi_epi16(px0, px1), v_kernel_01), _mm256_madd_epi16(_mm256_unpackhi_epi16(px2, px3), v_kernel_23)); __m256i sum0_rounded = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_v), round_bits); __m256i sum1_rounded = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_v), round_bits); __m256i warped = _mm256_packs_epi32(sum0_rounded, sum1_rounded); __m128i src_pixels_u8 = xx_loadu_2x64(&src[(y + i + 1) * stride + x], &src[(y + i) * stride + x]); __m256i src_pixels = _mm256_slli_epi16(_mm256_cvtepu8_epi16(src_pixels_u8), 3); // Calculate delta from the target patch __m256i dt = _mm256_sub_epi16(warped, src_pixels); // Load 2x8 elements each of dx and dt, to pair with the 2x8 elements of dt // that we have just computed. Then compute 2x8 partial sums of dx * dt // and dy * dt, implicitly sum to give 2x4 partial sums of each, and // accumulate. __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * DISFLOW_PATCH_SIZE]); __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm256_add_epi32(b0_acc, _mm256_madd_epi16(dx_row, dt)); b1_acc = _mm256_add_epi32(b1_acc, _mm256_madd_epi16(dy_row, dt)); } // Flatten the two sets of partial sums to find the final value of b // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). // We need to do 14 additions in total; a `hadd` instruction can take care // of eight of them, then a vertical sum can do four more, leaving two // scalar additions. __m256i partial_sum_256 = _mm256_hadd_epi32(b0_acc, b1_acc); __m128i partial_sum = _mm_add_epi32(_mm256_extracti128_si256(partial_sum_256, 0), _mm256_extracti128_si256(partial_sum_256, 1)); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); } // Compute the x and y gradients of the source patch in a single pass, // and store into dx and dy respectively. 
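// Viewed as 3x3 kernels (and up to this implementation's sign convention),
// the two gradients form the usual separable Sobel pair:
//   dx: horizontal difference [1 0 -1] smoothed vertically by [1 2 1]
//   dy: horizontal smoothing  [1 2 1] differenced vertically by [1 0 -1]
// which is why the code below computes the horizontal halves once per row and
// then combines neighbouring rows vertically.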
static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, int16_t *dy) { const __m256i zero = _mm256_setzero_si256(); // Loop setup: Load the first two rows (of 10 input rows) and apply // the horizontal parts of the two filters __m256i row_m1_0 = yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); __m256i row_m1_0_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); __m256i row_m1_0_c = _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 2), zero); __m256i row_m1_0_hsmooth = _mm256_add_epi16(_mm256_add_epi16(row_m1_0_a, row_m1_0_c), _mm256_slli_epi16(row_m1_0_b, 1)); __m256i row_m1_0_hdiff = _mm256_sub_epi16(row_m1_0_a, row_m1_0_c); // Main loop: For each pair of output rows (i, i+1): // * Load rows (i+1, i+2) and apply both horizontal filters // * Apply vertical filters and store results // * Shift rows for next iteration for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { // Load rows (i+1, i+2) and apply both horizontal filters const __m256i row_p1_p2 = yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), (__m128i *)(src + (i + 1) * src_stride - 1)); const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); const __m256i row_p1_p2_b = _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); const __m256i row_p1_p2_c = _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 2), zero); const __m256i row_p1_p2_hsmooth = _mm256_add_epi16(_mm256_add_epi16(row_p1_p2_a, row_p1_p2_c), _mm256_slli_epi16(row_p1_p2_b, 1)); const __m256i row_p1_p2_hdiff = _mm256_sub_epi16(row_p1_p2_a, row_p1_p2_c); // Apply vertical filters and store results // dx = vertical smooth(horizontal diff(input)) // dy = vertical diff(horizontal smooth(input)) const __m256i row_0_p1_hdiff = _mm256_permute2x128_si256(row_m1_0_hdiff, row_p1_p2_hdiff, 0x21); const __m256i dx_row = _mm256_add_epi16(_mm256_add_epi16(row_m1_0_hdiff, row_p1_p2_hdiff), _mm256_slli_epi16(row_0_p1_hdiff, 1)); const __m256i dy_row = _mm256_sub_epi16(row_m1_0_hsmooth, row_p1_p2_hsmooth); _mm256_storeu_si256((__m256i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); _mm256_storeu_si256((__m256i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); // Shift rows for next iteration // This allows a lot of work to be reused, reducing the number of // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 row_m1_0_hsmooth = row_p1_p2_hsmooth; row_m1_0_hdiff = row_p1_p2_hdiff; } } static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { __m256i acc[4] = { 0 }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * dx_stride]); __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * dy_stride]); acc[0] = _mm256_add_epi32(acc[0], _mm256_madd_epi16(dx_row, dx_row)); acc[1] = _mm256_add_epi32(acc[1], _mm256_madd_epi16(dx_row, dy_row)); // Don't compute acc[2], as it should be equal to acc[1] acc[3] = _mm256_add_epi32(acc[3], _mm256_madd_epi16(dy_row, dy_row)); } // Condense sums __m256i partial_sum_0 = _mm256_hadd_epi32(acc[0], acc[1]); __m256i partial_sum_1 = _mm256_hadd_epi32(acc[1], acc[3]); __m256i result_256 = _mm256_hadd_epi32(partial_sum_0, partial_sum_1); __m128i result = _mm_add_epi32(_mm256_extracti128_si256(result_256, 0), _mm256_extracti128_si256(result_256, 1)); // Apply regularization // We follow the standard regularization method of adding `k * I` before // inverting. This ensures that the matrix will be invertible. 
// // Setting the regularization strength k to 1 seems to work well here, as // typical values coming from the other equations are very large (1e5 to // 1e6, with an upper limit of around 6e7, at the time of writing). // It also preserves the property that all matrix values are whole numbers, // which is convenient for integerized SIMD implementation. result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); // Convert results to doubles and store _mm256_storeu_pd(M, _mm256_cvtepi32_pd(result)); } // Try to invert the matrix M // Note: Due to the nature of how a least-squares matrix is constructed, all of // the eigenvalues will be >= 0, and therefore det M >= 0 as well. // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; M_inv[0] = M[3] * det_inv; M_inv[1] = -M[1] * det_inv; M_inv[2] = -M[2] * det_inv; M_inv[3] = M[0] * det_inv; } void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { DECLARE_ALIGNED(32, double, M[4]); DECLARE_ALIGNED(32, double, M_inv[4]); DECLARE_ALIGNED(32, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); DECLARE_ALIGNED(32, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, b); // Solve flow equations to find a better estimate for the flow vector // at this point const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { // Stop iteration when we're close to convergence break; } } } aom-3.12.1/aom_dsp/flow_estimation/x86/disflow_sse4.c000066400000000000000000000415101477627663500224250ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/x86/synonyms.h" #include "config/aom_dsp_rtcd.h" #if DISFLOW_PATCH_SIZE != 8 #error "Need to change disflow_sse4.c if DISFLOW_PATCH_SIZE != 8" #endif // Compute horizontal and vertical kernels and return them packed into a // register. 
The coefficient ordering is: // h0, h1, v0, v1, h2, h3, v2, v3 // This is chosen because it takes less work than fully separating the kernels, // but it is separated enough that we can pick out each coefficient pair in the // main compute_flow_at_point function static inline __m128i compute_cubic_kernels(double u, double v) { const __m128d x = _mm_set_pd(v, u); const __m128d x2 = _mm_mul_pd(x, x); const __m128d x3 = _mm_mul_pd(x2, x); // Macro to multiply a value v by a constant coefficient c #define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) // Compute floating-point kernel // Note: To ensure results are bit-identical to the C code, we need to perform // exactly the same sequence of operations here as in the C code. __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); __m128d k1 = _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); __m128d k2 = _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); #undef MULC // Integerize __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); k0 = _mm_round_pd(_mm_mul_pd(k0, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k1 = _mm_round_pd(_mm_mul_pd(k1, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k2 = _mm_round_pd(_mm_mul_pd(k2, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); k3 = _mm_round_pd(_mm_mul_pd(k3, prec), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); const __m128i c0 = _mm_cvtpd_epi32(k0); const __m128i c1 = _mm_cvtpd_epi32(k1); const __m128i c2 = _mm_cvtpd_epi32(k2); const __m128i c3 = _mm_cvtpd_epi32(k3); // Rearrange results and convert down to 16 bits, giving the target output // ordering const __m128i c01 = _mm_unpacklo_epi32(c0, c1); const __m128i c23 = _mm_unpacklo_epi32(c2, c3); return _mm_packs_epi32(c01, c23); } // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. // This function returns the sum of squared pixel differences between // the two regions. // // TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation // instad of bicubic interpolation static inline void compute_flow_vector(const uint8_t *src, const uint8_t *ref, int width, int height, int stride, int x, int y, double u, double v, const int16_t *dx, const int16_t *dy, int *b) { // This function is written to do 8x8 convolutions only assert(DISFLOW_PATCH_SIZE == 8); // Accumulate 4 32-bit partial sums for each element of b // These will be flattened at the end. __m128i b0_acc = _mm_setzero_si128(); __m128i b1_acc = _mm_setzero_si128(); // Split offset into integer and fractional parts, and compute cubic // interpolation kernels const int u_int = (int)floor(u); const int v_int = (int)floor(v); const double u_frac = u - floor(u); const double v_frac = v - floor(v); const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions DECLARE_ALIGNED(16, int16_t, tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the // allocated border region, but allow them to go far enough out that // the border pixels' values do not change. // Since we are calculating an 8x8 block, the bottom-right pixel // in the block has coordinates (x0 + 7, y0 + 7). 
Then, the cubic // interpolation has 4 taps, meaning that the output of pixel // (x_w, y_w) depends on the pixels in the range // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). // // Thus the most extreme coordinates which will be fetched are // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). const int x0 = clamp(x + u_int, -9, width); const int y0 = clamp(y + v_int, -9, height); // Horizontal convolution // Prepare the kernel vectors // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0)); __m128i h_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2)); __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) { const int y_w = y0 + i; const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; // Load this row of pixels. // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, // for a total of 11 pixels. Here we load 16 pixels, but only use // the first 11. __m128i row = _mm_loadu_si128((__m128i *)ref_row); // Expand pixels to int16s __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 __m128i px0 = _mm_unpacklo_epi16(px_0to7_i16, _mm_srli_si128(px_0to7_i16, 2)); // input pixels 2, 3, 3, 4, 4, 5, 5, 6 // * kernel 2, 3, 2, 3, 2, 3, 2, 3 __m128i px1 = _mm_unpacklo_epi16(_mm_srli_si128(px_0to7_i16, 4), _mm_srli_si128(px_0to7_i16, 6)); // Convolve with kernel and sum 2x2 boxes to form first 4 outputs __m128i sum0 = _mm_add_epi32(_mm_madd_epi16(px0, h_kernel_01), _mm_madd_epi16(px1, h_kernel_23)); __m128i out0 = _mm_srai_epi32(_mm_add_epi32(sum0, round_const_h), DISFLOW_INTERP_BITS - 6); // Compute second four outputs __m128i px2 = _mm_unpacklo_epi16(px_4to10_i16, _mm_srli_si128(px_4to10_i16, 2)); __m128i px3 = _mm_unpacklo_epi16(_mm_srli_si128(px_4to10_i16, 4), _mm_srli_si128(px_4to10_i16, 6)); __m128i sum1 = _mm_add_epi32(_mm_madd_epi16(px2, h_kernel_01), _mm_madd_epi16(px3, h_kernel_23)); // Round by just enough bits that the result is // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 // as it does now // This means shifting down so we have 6 extra bits, for a maximum value // of +18360, which can occur if u_frac == 0.5 and the input pixels are // {0, 255, 255, 0}. __m128i out1 = _mm_srai_epi32(_mm_add_epi32(sum1, round_const_h), DISFLOW_INTERP_BITS - 6); _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1)); __m128i v_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3)); for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; // Load 4 rows of 8 x 16-bit values __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); __m128i px3 = _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... 
, // but each multiply expands its output to 32 bits. So we need to be // a little clever about how we do this __m128i sum0 = _mm_add_epi32( _mm_madd_epi16(_mm_unpacklo_epi16(px0, px1), v_kernel_01), _mm_madd_epi16(_mm_unpacklo_epi16(px2, px3), v_kernel_23)); __m128i sum1 = _mm_add_epi32( _mm_madd_epi16(_mm_unpackhi_epi16(px0, px1), v_kernel_01), _mm_madd_epi16(_mm_unpackhi_epi16(px2, px3), v_kernel_23)); __m128i sum0_rounded = _mm_srai_epi32(_mm_add_epi32(sum0, round_const_v), round_bits); __m128i sum1_rounded = _mm_srai_epi32(_mm_add_epi32(sum1, round_const_v), round_bits); __m128i warped = _mm_packs_epi32(sum0_rounded, sum1_rounded); __m128i src_pixels_u8 = _mm_loadl_epi64((__m128i *)&src[(y + i) * stride + x]); __m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3); // Calculate delta from the target patch __m128i dt = _mm_sub_epi16(warped, src_pixels); // Load 8 elements each of dx and dt, to pair with the 8 elements of dt // that we have just computed. Then compute 8 partial sums of dx * dt // and dy * dt, implicitly sum to give 4 partial sums of each, and // accumulate. __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]); __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); } // Flatten the two sets of partial sums to find the final value of b // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). // We need to do 6 additions in total; a `hadd` instruction can take care // of four of them, leaving two scalar additions. __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); } // Compute the x and y gradients of the source patch in a single pass, // and store into dx and dy respectively. 
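// Note that the 8 output rows require 10 input rows in total: output row i
// depends on input rows i-1, i and i+1, so rows -1 through 8 of the patch are
// read, and the loop below keeps a sliding window of the two previously
// filtered rows.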
static inline void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, int16_t *dy) { // Loop setup: Load the first two rows (of 10 input rows) and apply // the horizontal parts of the two filters __m128i row_m1 = _mm_loadu_si128((__m128i *)(src - src_stride - 1)); __m128i row_m1_a = _mm_cvtepu8_epi16(row_m1); __m128i row_m1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 1)); __m128i row_m1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 2)); __m128i row_m1_hsmooth = _mm_add_epi16(_mm_add_epi16(row_m1_a, row_m1_c), _mm_slli_epi16(row_m1_b, 1)); __m128i row_m1_hdiff = _mm_sub_epi16(row_m1_a, row_m1_c); __m128i row = _mm_loadu_si128((__m128i *)(src - 1)); __m128i row_a = _mm_cvtepu8_epi16(row); __m128i row_b = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); __m128i row_c = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); __m128i row_hsmooth = _mm_add_epi16(_mm_add_epi16(row_a, row_c), _mm_slli_epi16(row_b, 1)); __m128i row_hdiff = _mm_sub_epi16(row_a, row_c); // Main loop: For each of the 8 output rows: // * Load row i+1 and apply both horizontal filters // * Apply vertical filters and store results // * Shift rows for next iteration for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { // Load row i+1 and apply both horizontal filters const __m128i row_p1 = _mm_loadu_si128((__m128i *)(src + (i + 1) * src_stride - 1)); const __m128i row_p1_a = _mm_cvtepu8_epi16(row_p1); const __m128i row_p1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 1)); const __m128i row_p1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 2)); const __m128i row_p1_hsmooth = _mm_add_epi16( _mm_add_epi16(row_p1_a, row_p1_c), _mm_slli_epi16(row_p1_b, 1)); const __m128i row_p1_hdiff = _mm_sub_epi16(row_p1_a, row_p1_c); // Apply vertical filters and store results // dx = vertical smooth(horizontal diff(input)) // dy = vertical diff(horizontal smooth(input)) const __m128i dx_row = _mm_add_epi16(_mm_add_epi16(row_m1_hdiff, row_p1_hdiff), _mm_slli_epi16(row_hdiff, 1)); const __m128i dy_row = _mm_sub_epi16(row_m1_hsmooth, row_p1_hsmooth); _mm_storeu_si128((__m128i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); _mm_storeu_si128((__m128i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); // Shift rows for next iteration // This allows a lot of work to be reused, reducing the number of // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 row_m1_hsmooth = row_hsmooth; row_m1_hdiff = row_hdiff; row_hsmooth = row_p1_hsmooth; row_hdiff = row_p1_hdiff; } } static inline void compute_flow_matrix(const int16_t *dx, int dx_stride, const int16_t *dy, int dy_stride, double *M) { __m128i acc[4] = { 0 }; for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]); __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]); acc[0] = _mm_add_epi32(acc[0], _mm_madd_epi16(dx_row, dx_row)); acc[1] = _mm_add_epi32(acc[1], _mm_madd_epi16(dx_row, dy_row)); // Don't compute acc[2], as it should be equal to acc[1] acc[3] = _mm_add_epi32(acc[3], _mm_madd_epi16(dy_row, dy_row)); } // Condense sums __m128i partial_sum_0 = _mm_hadd_epi32(acc[0], acc[1]); __m128i partial_sum_1 = _mm_hadd_epi32(acc[1], acc[3]); __m128i result = _mm_hadd_epi32(partial_sum_0, partial_sum_1); // Apply regularization // We follow the standard regularization method of adding `k * I` before // inverting. This ensures that the matrix will be invertible. 
// // Setting the regularization strength k to 1 seems to work well here, as // typical values coming from the other equations are very large (1e5 to // 1e6, with an upper limit of around 6e7, at the time of writing). // It also preserves the property that all matrix values are whole numbers, // which is convenient for integerized SIMD implementation. result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); // Convert results to doubles and store _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); } // Try to invert the matrix M // Note: Due to the nature of how a least-squares matrix is constructed, all of // the eigenvalues will be >= 0, and therefore det M >= 0 as well. // The regularization term `+ k * I` further ensures that det M >= k^2. // As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. // So we don't have to worry about non-invertible matrices here. static inline void invert_2x2(const double *M, double *M_inv) { double det = (M[0] * M[3]) - (M[1] * M[2]); assert(det >= 1); const double det_inv = 1 / det; M_inv[0] = M[3] * det_inv; M_inv[1] = -M[1] * det_inv; M_inv[2] = -M[2] * det_inv; M_inv[3] = M[0] * det_inv; } void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { DECLARE_ALIGNED(16, double, M[4]); DECLARE_ALIGNED(16, double, M_inv[4]); DECLARE_ALIGNED(16, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); DECLARE_ALIGNED(16, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, b); // Solve flow equations to find a better estimate for the flow vector // at this point const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { // Stop iteration when we're close to convergence break; } } } aom-3.12.1/aom_dsp/fwd_txfm.c000066400000000000000000000201461477627663500160100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/txfm_common.h" #include "config/aom_dsp_rtcd.h" void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. // We need an intermediate buffer between passes. 
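//
// Each 1D pass is a 4-point DCT butterfly: the even-frequency outputs are
// (step[0] +/- step[1]) * cospi_16_64, and the odd-frequency outputs mix
// step[2] and step[3] with cospi_8_64 and cospi_24_64, where the cospi_k_64
// constants (from txfm_common.h) are fixed-point approximations of
// cos(k * pi / 64) scaled by 2^14, and fdct_round_shift() removes that
// scaling again.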
tran_low_t intermediate[4 * 4]; const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform passes for (int pass = 0; pass < 2; ++pass) { tran_high_t in_high[4]; // canbe16 tran_high_t step[4]; // canbe16 tran_low_t temp[4]; for (int i = 0; i < 4; ++i) { // Load inputs. if (pass == 0) { in_high[0] = input[0 * stride] * 16; in_high[1] = input[1 * stride] * 16; in_high[2] = input[2 * stride] * 16; in_high[3] = input[3 * stride] * 16; if (i == 0 && in_high[0]) { ++in_high[0]; } ++input; // Next column } else { assert(in_low != NULL); in_high[0] = in_low[0 * 4]; in_high[1] = in_low[1 * 4]; in_high[2] = in_low[2 * 4]; in_high[3] = in_low[3 * 4]; ++in_low; // Next column (which is a transposed row) } // Transform. step[0] = in_high[0] + in_high[3]; step[1] = in_high[1] + in_high[2]; step[2] = in_high[1] - in_high[2]; step[3] = in_high[0] - in_high[3]; temp[0] = (tran_low_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64); temp[2] = (tran_low_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64); temp[1] = (tran_low_t)fdct_round_shift(step[2] * cospi_24_64 + step[3] * cospi_8_64); temp[3] = (tran_low_t)fdct_round_shift(-step[2] * cospi_8_64 + step[3] * cospi_24_64); // Only transpose the first pass. if (pass == 0) { out[0] = temp[0]; out[1] = temp[1]; out[2] = temp[2]; out[3] = temp[3]; out += 4; } else { out[0 * 4] = temp[0]; out[1 * 4] = temp[1]; out[2 * 4] = temp[2]; out[3 * 4] = temp[3]; ++out; } } // Setup in/out for next pass. in_low = intermediate; out = output; } for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; } } void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. // We need an intermediate buffer between passes. int16_t intermediate[4 * 4]; const int16_t *in_low = NULL; int16_t *out = intermediate; // Do the two transform passes for (int pass = 0; pass < 2; ++pass) { int32_t in_high[4]; // canbe16 int32_t step[4]; // canbe16 int16_t temp[4]; for (int i = 0; i < 4; ++i) { // Load inputs. if (pass == 0) { in_high[0] = input[0 * stride] * 16; in_high[1] = input[1 * stride] * 16; in_high[2] = input[2 * stride] * 16; in_high[3] = input[3 * stride] * 16; ++input; if (i == 0 && in_high[0]) { ++in_high[0]; } } else { assert(in_low != NULL); in_high[0] = in_low[0 * 4]; in_high[1] = in_low[1 * 4]; in_high[2] = in_low[2 * 4]; in_high[3] = in_low[3 * 4]; ++in_low; } // Transform. step[0] = in_high[0] + in_high[3]; step[1] = in_high[1] + in_high[2]; step[2] = in_high[1] - in_high[2]; step[3] = in_high[0] - in_high[3]; temp[0] = (int16_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64); temp[2] = (int16_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64); temp[1] = (int16_t)fdct_round_shift(step[2] * cospi_24_64 + step[3] * cospi_8_64); temp[3] = (int16_t)fdct_round_shift(-step[2] * cospi_8_64 + step[3] * cospi_24_64); // Only transpose the first pass. if (pass == 0) { out[0] = temp[0]; out[1] = temp[1]; out[2] = temp[2]; out[3] = temp[3]; out += 4; } else { out[0 * 4] = temp[0]; out[1 * 4] = temp[1]; out[2 * 4] = temp[2]; out[3 * 4] = temp[3]; ++out; } } // Setup in/out for next pass. 
in_low = intermediate; out = output; } for (int i = 0; i < 4; ++i) { for (int j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; } } #if CONFIG_INTERNAL_STATS void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { int i, j; tran_low_t intermediate[64]; int pass; tran_low_t *output = intermediate; const tran_low_t *in = NULL; // Transform columns for (pass = 0; pass < 2; ++pass) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { s0 = (input[0 * stride] + input[7 * stride]) * 4; s1 = (input[1 * stride] + input[6 * stride]) * 4; s2 = (input[2 * stride] + input[5 * stride]) * 4; s3 = (input[3 * stride] + input[4 * stride]) * 4; s4 = (input[3 * stride] - input[4 * stride]) * 4; s5 = (input[2 * stride] - input[5 * stride]) * 4; s6 = (input[1 * stride] - input[6 * stride]) * 4; s7 = (input[0 * stride] - input[7 * stride]) * 4; ++input; } else { s0 = in[0 * 8] + in[7 * 8]; s1 = in[1 * 8] + in[6 * 8]; s2 = in[2 * 8] + in[5 * 8]; s3 = in[3 * 8] + in[4 * 8]; s4 = in[3 * 8] - in[4 * 8]; s5 = in[2 * 8] - in[5 * 8]; s6 = in[1 * 8] - in[6 * 8]; s7 = in[0 * 8] - in[7 * 8]; ++in; } // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; x3 = s0 - s3; t0 = (x0 + x1) * cospi_16_64; t1 = (x0 - x1) * cospi_16_64; t2 = x2 * cospi_24_64 + x3 * cospi_8_64; t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; output[0] = (tran_low_t)fdct_round_shift(t0); output[2] = (tran_low_t)fdct_round_shift(t2); output[4] = (tran_low_t)fdct_round_shift(t1); output[6] = (tran_low_t)fdct_round_shift(t3); // Stage 2 t0 = (s6 - s5) * cospi_16_64; t1 = (s6 + s5) * cospi_16_64; t2 = fdct_round_shift(t0); t3 = fdct_round_shift(t1); // Stage 3 x0 = s4 + t2; x1 = s4 - t2; x2 = s7 - t3; x3 = s7 + t3; // Stage 4 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; output[1] = (tran_low_t)fdct_round_shift(t0); output[3] = (tran_low_t)fdct_round_shift(t2); output[5] = (tran_low_t)fdct_round_shift(t1); output[7] = (tran_low_t)fdct_round_shift(t3); output += 8; } in = intermediate; output = final_output; } // Rows for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } } #endif // CONFIG_INTERNAL_STATS #if CONFIG_AV1_HIGHBITDEPTH && CONFIG_INTERNAL_STATS void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { aom_fdct8x8_c(input, final_output, stride); } #endif aom-3.12.1/aom_dsp/grain_params.h000066400000000000000000000114171477627663500166430ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ /*!\file * \brief Describes film grain parameters * */ #ifndef AOM_AOM_DSP_GRAIN_PARAMS_H_ #define AOM_AOM_DSP_GRAIN_PARAMS_H_ #ifdef __cplusplus extern "C" { #endif #include <stdint.h> #include <string.h> #include "config/aom_config.h" /*!\brief Structure containing film grain synthesis parameters for a frame * * This structure contains input parameters for film grain synthesis */ typedef struct { // This structure is compared element-by-element in the function // aom_check_grain_params_equiv: this function must be updated if any changes // are made to this structure. int apply_grain; int update_parameters; // 8 bit values int scaling_points_y[14][2]; int num_y_points; // value: 0..14 // 8 bit values int scaling_points_cb[10][2]; int num_cb_points; // value: 0..10 // 8 bit values int scaling_points_cr[10][2]; int num_cr_points; // value: 0..10 int scaling_shift; // values : 8..11 int ar_coeff_lag; // values: 0..3 // 8 bit values int ar_coeffs_y[24]; int ar_coeffs_cb[25]; int ar_coeffs_cr[25]; // Shift value: AR coeffs range // 6: [-2, 2) // 7: [-1, 1) // 8: [-0.5, 0.5) // 9: [-0.25, 0.25) int ar_coeff_shift; // values : 6..9 int cb_mult; // 8 bits int cb_luma_mult; // 8 bits int cb_offset; // 9 bits int cr_mult; // 8 bits int cr_luma_mult; // 8 bits int cr_offset; // 9 bits int overlap_flag; int clip_to_restricted_range; unsigned int bit_depth; // video bit depth int chroma_scaling_from_luma; int grain_scale_shift; uint16_t random_seed; // This structure is compared element-by-element in the function // aom_check_grain_params_equiv: this function must be updated if any changes // are made to this structure. } aom_film_grain_t; /*!\brief Check if two film grain parameters structs are equivalent * * Check if two film grain parameters are equal, except for the * update_parameters and random_seed elements which are ignored.
 * * \param[in] pa The first set of parameters to compare * \param[in] pb The second set of parameters to compare * \return Returns 1 if the params are equivalent, 0 otherwise */ static inline int aom_check_grain_params_equiv( const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) { if (pa->apply_grain != pb->apply_grain) return 0; // Don't compare update_parameters if (pa->num_y_points != pb->num_y_points) return 0; if (memcmp(pa->scaling_points_y, pb->scaling_points_y, pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0) return 0; if (pa->num_cb_points != pb->num_cb_points) return 0; if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb, pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0) return 0; if (pa->num_cr_points != pb->num_cr_points) return 0; if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr, pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0) return 0; if (pa->scaling_shift != pb->scaling_shift) return 0; if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0; const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1); if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y, num_pos * sizeof(*pa->ar_coeffs_y)) != 0) return 0; if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb, num_pos * sizeof(*pa->ar_coeffs_cb)) != 0) return 0; if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr, num_pos * sizeof(*pa->ar_coeffs_cr)) != 0) return 0; if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0; if (pa->cb_mult != pb->cb_mult) return 0; if (pa->cb_luma_mult != pb->cb_luma_mult) return 0; if (pa->cb_offset != pb->cb_offset) return 0; if (pa->cr_mult != pb->cr_mult) return 0; if (pa->cr_luma_mult != pb->cr_luma_mult) return 0; if (pa->cr_offset != pb->cr_offset) return 0; if (pa->overlap_flag != pb->overlap_flag) return 0; if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0; if (pa->bit_depth != pb->bit_depth) return 0; if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0; if (pa->grain_scale_shift != pb->grain_scale_shift) return 0; return 1; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_GRAIN_PARAMS_H_ aom-3.12.1/aom_dsp/grain_table.c000066400000000000000000000316451477627663500164470ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief This file has the implementation details of the grain table. * * The file format is an ascii representation for readability and * editability. Array parameters are separated from the non-array * parameters and prefixed with a few characters to make for easy * localization with a parameter set. Each entry is prefixed with "E" * and the other parameters are only specified if "update-parms" is * non-zero. * * filmgrn1 * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms> * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ... * sY <num_y_points> <point_0_x> <point_0_y> ... * sCb <num_cb_points> <point_0_x> <point_0_y> ... * sCr <num_cr_points> <point_0_x> <point_0_y> ... * cY <ar_coeff_y_0> .... * cCb <ar_coeff_cb_0> .... * cCr <ar_coeff_cr_0> .... * E <start-time> ...
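 *
 * For illustration only (all numbers below are made up), a table holding a
 * single entry that uses a lag-1 AR model could look like:
 *
 *   filmgrn1
 *   E 0 417083 1 7391 1
 *       p 1 6 0 8 0 1 128 192 256 128 192 256
 *       sY 2  0 20 255 20
 *       sCb 0
 *       sCr 0
 *       cY 0 0 0 0
 *       cCb 0 0 0 0 0
 *       cCr 0 0 0 0 0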
*/ #include <stdio.h> #include <string.h> #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/grain_table.h" #include "aom_mem/aom_mem.h" static const char kFileMagic[8] = "filmgrn1"; static void grain_table_entry_read(FILE *file, struct aom_internal_error_info *error_info, aom_film_grain_table_entry_t *entry) { aom_film_grain_t *pars = &entry->params; int num_read = fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time, &entry->end_time, &pars->apply_grain, &pars->random_seed, &pars->update_parameters); if (num_read == 0 && feof(file)) return; if (num_read != 5) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read entry header. Read %d != 5", num_read); return; } if (pars->update_parameters) { num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n", &pars->ar_coeff_lag, &pars->ar_coeff_shift, &pars->grain_scale_shift, &pars->scaling_shift, &pars->chroma_scaling_from_luma, &pars->overlap_flag, &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset, &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset); if (num_read != 12) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read entry params. Read %d != 12", num_read); return; } if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read num y points"); return; } for (int i = 0; i < pars->num_y_points; ++i) { if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], &pars->scaling_points_y[i][1])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read y scaling points"); return; } } if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read num cb points"); return; } for (int i = 0; i < pars->num_cb_points; ++i) { if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], &pars->scaling_points_cb[i][1])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read cb scaling points"); return; } } if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read num cr points"); return; } for (int i = 0; i < pars->num_cr_points; ++i) { if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], &pars->scaling_points_cr[i][1])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read cr scaling points"); return; } } if (fscanf(file, "\n\tcY")) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Y coeffs header (cY)"); return; } const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); for (int i = 0; i < n; ++i) { if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Y coeffs"); return; } } if (fscanf(file, "\n\tcCb")) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Cb coeffs header (cCb)"); return; } for (int i = 0; i <= n; ++i) { if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Cb coeffs"); return; } } if (fscanf(file, "\n\tcCr")) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Cr coeffs header (cCr)"); return; } for (int i = 0; i <= n; ++i) { if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read Cr coeffs"); return; } } (void)fscanf(file, "\n"); } } static void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) { const aom_film_grain_t *pars = &entry->params; fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n",
entry->start_time, entry->end_time, pars->apply_grain, pars->random_seed, pars->update_parameters); if (pars->update_parameters) { fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, pars->scaling_shift, pars->chroma_scaling_from_luma, pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, pars->cr_offset); fprintf(file, "\tsY %d ", pars->num_y_points); for (int i = 0; i < pars->num_y_points; ++i) { fprintf(file, " %d %d", pars->scaling_points_y[i][0], pars->scaling_points_y[i][1]); } fprintf(file, "\n\tsCb %d", pars->num_cb_points); for (int i = 0; i < pars->num_cb_points; ++i) { fprintf(file, " %d %d", pars->scaling_points_cb[i][0], pars->scaling_points_cb[i][1]); } fprintf(file, "\n\tsCr %d", pars->num_cr_points); for (int i = 0; i < pars->num_cr_points; ++i) { fprintf(file, " %d %d", pars->scaling_points_cr[i][0], pars->scaling_points_cr[i][1]); } fprintf(file, "\n\tcY"); const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); for (int i = 0; i < n; ++i) { fprintf(file, " %d", pars->ar_coeffs_y[i]); } fprintf(file, "\n\tcCb"); for (int i = 0; i <= n; ++i) { fprintf(file, " %d", pars->ar_coeffs_cb[i]); } fprintf(file, "\n\tcCr"); for (int i = 0; i <= n; ++i) { fprintf(file, " %d", pars->ar_coeffs_cr[i]); } fprintf(file, "\n"); } } // TODO(https://crbug.com/aomedia/3228): Update this function to return an // integer status. void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp, int64_t end_time, const aom_film_grain_t *grain) { if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) { aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail)); if (!new_tail) return; memset(new_tail, 0, sizeof(*new_tail)); if (t->tail) t->tail->next = new_tail; if (!t->head) t->head = new_tail; t->tail = new_tail; new_tail->start_time = time_stamp; new_tail->end_time = end_time; new_tail->params = *grain; } else { t->tail->end_time = AOMMAX(t->tail->end_time, end_time); t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp); } } int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, int64_t end_time, int erase, aom_film_grain_t *grain) { aom_film_grain_table_entry_t *entry = t->head; aom_film_grain_table_entry_t *prev_entry = NULL; uint16_t random_seed = grain ? 
grain->random_seed : 0; if (grain) memset(grain, 0, sizeof(*grain)); while (entry) { aom_film_grain_table_entry_t *next = entry->next; if (time_stamp >= entry->start_time && time_stamp < entry->end_time) { if (grain) { *grain = entry->params; if (time_stamp != 0) grain->random_seed = random_seed; } if (!erase) return 1; const int64_t entry_end_time = entry->end_time; if (time_stamp <= entry->start_time && end_time >= entry->end_time) { if (t->tail == entry) t->tail = prev_entry; if (prev_entry) { prev_entry->next = entry->next; } else { t->head = entry->next; } aom_free(entry); } else if (time_stamp <= entry->start_time && end_time < entry->end_time) { entry->start_time = end_time; } else if (time_stamp > entry->start_time && end_time >= entry->end_time) { entry->end_time = time_stamp; } else { aom_film_grain_table_entry_t *new_entry = aom_malloc(sizeof(*new_entry)); if (!new_entry) return 0; new_entry->next = entry->next; new_entry->start_time = end_time; new_entry->end_time = entry->end_time; new_entry->params = entry->params; entry->next = new_entry; entry->end_time = time_stamp; if (t->tail == entry) t->tail = new_entry; } // If segments aren't aligned, delete from the beginning of subsequent // segments if (end_time > entry_end_time) { // Ignoring the return value here is safe since we're erasing from the // beginning of subsequent entries. aom_film_grain_table_lookup(t, entry_end_time, end_time, /*erase=*/1, NULL); } return 1; } prev_entry = entry; entry = next; } return 0; } aom_codec_err_t aom_film_grain_table_read( aom_film_grain_table_t *t, const char *filename, struct aom_internal_error_info *error_info) { FILE *file = fopen(filename, "rb"); if (!file) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s", filename); return error_info->error_code; } error_info->error_code = AOM_CODEC_OK; // Read in one extra character as there should be white space after // the header. 
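  // (The writer emits the 8-byte magic followed by a newline, so reading 9
  // bytes here leaves the stream positioned at the first "E" record.)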
char magic[9]; if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to read (or invalid) file magic"); fclose(file); return error_info->error_code; } aom_film_grain_table_entry_t *prev_entry = NULL; while (!feof(file)) { aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); if (!entry) { aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Unable to allocate grain table entry"); break; } memset(entry, 0, sizeof(*entry)); grain_table_entry_read(file, error_info, entry); entry->next = NULL; if (prev_entry) prev_entry->next = entry; if (!t->head) t->head = entry; t->tail = entry; prev_entry = entry; if (error_info->error_code != AOM_CODEC_OK) break; } fclose(file); return error_info->error_code; } aom_codec_err_t aom_film_grain_table_write( const aom_film_grain_table_t *t, const char *filename, struct aom_internal_error_info *error_info) { error_info->error_code = AOM_CODEC_OK; FILE *file = fopen(filename, "wb"); if (!file) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", filename); return error_info->error_code; } if (!fwrite(kFileMagic, 8, 1, file)) { aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to write file magic"); fclose(file); return error_info->error_code; } fprintf(file, "\n"); aom_film_grain_table_entry_t *entry = t->head; while (entry) { grain_table_entry_write(file, entry); entry = entry->next; } fclose(file); return error_info->error_code; } void aom_film_grain_table_free(aom_film_grain_table_t *t) { aom_film_grain_table_entry_t *entry = t->head; while (entry) { aom_film_grain_table_entry_t *next = entry->next; aom_free(entry); entry = next; } memset(t, 0, sizeof(*t)); } aom-3.12.1/aom_dsp/grain_table.h000066400000000000000000000073141477627663500164500ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief A table mapping from time to corresponding film grain parameters. * * In order to apply grain synthesis in the decoder, the film grain parameters * need to be signalled in the encoder. The film grain parameters are time * varying, and for two-pass encoding (and denoiser implementation flexibility) * it is common to denoise the video and do parameter estimation before encoding * the denoised video. * * The film grain table is used to provide this flexibility and is used as a * parameter that is passed to the encoder. * * Further, if regraining is to be done in say a single pass mode, or in two * pass within the encoder (before frames are added to the lookahead buffer), * this data structure can be used to keep track of on-the-fly estimated grain * parameters, that are then extracted from the table before the encoded frame * is written. 
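 *
 * A minimal usage sketch (hypothetical parameter values, error handling
 * omitted):
 *
 *   aom_film_grain_table_t table = { 0 };
 *   aom_film_grain_table_append(&table, 0, 417083, &estimated_params);
 *   aom_film_grain_t grain;
 *   if (aom_film_grain_table_lookup(&table, 0, 417083, 1, &grain)) {
 *     // grain now holds the parameters covering [0, 417083); passing
 *     // erase = 1 removes that segment from the table.
 *   }
 *   aom_film_grain_table_free(&table);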
*/ #ifndef AOM_AOM_DSP_GRAIN_TABLE_H_ #define AOM_AOM_DSP_GRAIN_TABLE_H_ #ifdef __cplusplus extern "C" { #endif #include "aom_dsp/grain_params.h" #include "aom/internal/aom_codec_internal.h" typedef struct aom_film_grain_table_entry_t { aom_film_grain_t params; int64_t start_time; int64_t end_time; struct aom_film_grain_table_entry_t *next; } aom_film_grain_table_entry_t; typedef struct { aom_film_grain_table_entry_t *head; aom_film_grain_table_entry_t *tail; } aom_film_grain_table_t; /*!\brief Add a mapping from [time_stamp, end_time) to the given grain * parameters * * \param[in,out] table The grain table * \param[in] time_stamp The start time stamp * \param[in] end_time The end time stamp * \param[in] grain The grain parameters */ void aom_film_grain_table_append(aom_film_grain_table_t *table, int64_t time_stamp, int64_t end_time, const aom_film_grain_t *grain); /*!\brief Look-up (and optionally erase) the grain parameters for the given time * * \param[in] table The grain table * \param[in] time_stamp The start time stamp * \param[in] end_time The end time stamp * \param[in] erase Whether the time segment can be deleted * \param[out] grain The output grain parameters */ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, int64_t end_time, int erase, aom_film_grain_t *grain); /*!\brief Reads the grain table from a file. * * \param[out] table The grain table * \param[in] filename The file to read from * \param[in] error_info Error info for tracking errors */ aom_codec_err_t aom_film_grain_table_read( aom_film_grain_table_t *table, const char *filename, struct aom_internal_error_info *error_info); /*!\brief Writes the grain table to a file. * * \param[in] table The grain table * \param[in] filename The file to write to * \param[in] error_info Error info for tracking errors */ aom_codec_err_t aom_film_grain_table_write( const aom_film_grain_table_t *t, const char *filename, struct aom_internal_error_info *error_info); void aom_film_grain_table_free(aom_film_grain_table_t *t); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_GRAIN_TABLE_H_ aom-3.12.1/aom_dsp/intrapred.c000066400000000000000000000741751477627663500161710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include <math.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/intrapred_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" static inline void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)left; for (r = 0; r < bh; r++) { memcpy(dst, above, bw); dst += stride; } } static inline void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)above; for (r = 0; r < bh; r++) { memset(dst, left[r], bw); dst += stride; } } static inline int abs_diff(int a, int b) { return (a > b) ?
a - b : b - a; } static inline uint16_t paeth_predictor_single(uint16_t left, uint16_t top, uint16_t top_left) { const int base = top + left - top_left; const int p_left = abs_diff(base, left); const int p_top = abs_diff(base, top); const int p_top_left = abs_diff(base, top_left); // Return nearest to base of left, top and top_left. return (p_left <= p_top && p_left <= p_top_left) ? left : (p_top <= p_top_left) ? top : top_left; } static inline void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r, c; const uint8_t ytop_left = above[-1]; for (r = 0; r < bh; r++) { for (c = 0; c < bw; c++) dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); dst += stride; } } // Some basic checks on weights for smooth predictor. #define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ pred_scale) \ assert(weights_w[0] < weights_scale); \ assert(weights_h[0] < weights_scale); \ assert(weights_scale - weights_w[bw - 1] < weights_scale); \ assert(weights_scale - weights_h[bh - 1] < weights_scale); \ assert(pred_scale < 31) // ensures no overflow when calculating predictor. #define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) static inline void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights_w = smooth_weights + bw - 4; const uint8_t *const sm_weights_h = smooth_weights + bh - 4; // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; ++r) { int c; for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], sm_weights_w[c], scale - sm_weights_w[c] }; uint32_t this_pred = 0; int i; assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint8_t *const sm_weights = smooth_weights + bh - 4; // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights, sm_weights, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; r++) { int c; for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { above[c], below_pred }; const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; uint32_t this_pred = 0; assert(scale >= sm_weights[r]); int i; for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights = smooth_weights + bw - 4; // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE const int 
log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights, sm_weights, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; r++) { int c; for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { left[r], right_pred }; const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; uint32_t this_pred = 0; assert(scale >= sm_weights[c]); int i; for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)above; (void)left; for (r = 0; r < bh; r++) { memset(dst, 128, bw); dst += stride; } } static inline void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; (void)above; for (i = 0; i < bh; i++) sum += left[i]; expected_dc = (sum + (bh >> 1)) / bh; for (r = 0; r < bh; r++) { memset(dst, expected_dc, bw); dst += stride; } } static inline void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; (void)left; for (i = 0; i < bw; i++) sum += above[i]; expected_dc = (sum + (bw >> 1)) / bw; for (r = 0; r < bh; r++) { memset(dst, expected_dc, bw); dst += stride; } } static inline void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; const int count = bw + bh; for (i = 0; i < bw; i++) { sum += above[i]; } for (i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (sum + (count >> 1)) / count; for (r = 0; r < bh; r++) { memset(dst, expected_dc, bw); dst += stride; } } static inline int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; } // The constants (multiplier and shifts) for a given block size are obtained // as follows: // - Let sum_w_h = block width + block height. // - Shift 'sum_w_h' right until we reach an odd number. Let the number of // shifts for that block size be called 'shift1' (see the parameter in // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect // block]. // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, // using the "Algorithm 1" in: // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd // shift will be 16, regardless of the block size. // Note: For low bitdepth, assembly code may be optimized by using smaller // constants for smaller block sizes, where the range of the 'sum' is // restricted to fewer bits. 
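// Worked example: a 4x8 block has sum_w_h = 12 = 3 << 2, so shift1 = 2 and
// d = 3. dc_predictor_rect() below then computes
//   expected_dc = ((sum + 6) >> 2) * 0x5556 >> 16
// which matches integer division (sum + 6) / 12 for every sum reachable with
// 8-bit pixels (at most 12 * 255 = 3060), because 0x5556 / 2^16 exceeds 1/3 by
// less than 2^-16.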
#define DC_MULTIPLIER_1X2 0x5556 #define DC_MULTIPLIER_1X4 0x3334 #define DC_SHIFT2 16 static inline void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int shift1, int multiplier) { int sum = 0; for (int i = 0; i < bw; i++) { sum += above[i]; } for (int i = 0; i < bh; i++) { sum += left[i]; } const int expected_dc = divide_using_multiply_shift( sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); assert(expected_dc < (1 << 8)); for (int r = 0; r < bh; r++) { memset(dst, expected_dc, bw); dst += stride; } } #undef DC_SHIFT2 void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); } void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); } void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); } void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); } void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); } void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); } void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); } void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); } #undef DC_MULTIPLIER_1X2 #undef DC_MULTIPLIER_1X4 #if CONFIG_AV1_HIGHBITDEPTH static inline void 
highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; (void)left; (void)bd; for (r = 0; r < bh; r++) { memcpy(dst, above, bw * sizeof(uint16_t)); dst += stride; } } static inline void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; (void)above; (void)bd; for (r = 0; r < bh; r++) { aom_memset16(dst, left[r], bw); dst += stride; } } static inline void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; const uint16_t ytop_left = above[-1]; (void)bd; for (r = 0; r < bh; r++) { for (c = 0; c < bw; c++) dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); dst += stride; } } static inline void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights_w = smooth_weights + bw - 4; const uint8_t *const sm_weights_h = smooth_weights + bh - 4; // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; ++r) { int c; for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], sm_weights_w[c], scale - sm_weights_w[c] }; uint32_t this_pred = 0; int i; assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint8_t *const sm_weights = smooth_weights + bh - 4; // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights, sm_weights, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; r++) { int c; for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { above[c], below_pred }; const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; uint32_t this_pred = 0; assert(scale >= sm_weights[r]); int i; for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights = smooth_weights + bw - 4; // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); sm_weights_sanity_checks(sm_weights, sm_weights, scale, log2_scale + sizeof(*dst)); int r; for (r = 0; r < bh; r++) { int c; for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { left[r], right_pred }; const uint8_t weights[] = { 
sm_weights[c], scale - sm_weights[c] }; uint32_t this_pred = 0; assert(scale >= sm_weights[c]); int i; for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } static inline void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; (void)above; (void)left; for (r = 0; r < bh; r++) { aom_memset16(dst, 128 << (bd - 8), bw); dst += stride; } } static inline void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; (void)above; (void)bd; for (i = 0; i < bh; i++) sum += left[i]; expected_dc = (sum + (bh >> 1)) / bh; for (r = 0; r < bh; r++) { aom_memset16(dst, expected_dc, bw); dst += stride; } } static inline void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; (void)left; (void)bd; for (i = 0; i < bw; i++) sum += above[i]; expected_dc = (sum + (bw >> 1)) / bw; for (r = 0; r < bh; r++) { aom_memset16(dst, expected_dc, bw); dst += stride; } } static inline void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; const int count = bw + bh; (void)bd; for (i = 0; i < bw; i++) { sum += above[i]; } for (i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (sum + (count >> 1)) / count; for (r = 0; r < bh; r++) { aom_memset16(dst, expected_dc, bw); dst += stride; } } // Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but // assume 2nd shift of 17 bits instead of 16. // Note: Strictly speaking, 2nd shift needs to be 17 only when: // - bit depth == 12, and // - bw + bh is divisible by 5 (as opposed to divisible by 3). // All other cases can use half the multipliers with a shift of 16 instead. // This special optimization can be used when writing assembly code. #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB // Note: This constant is odd, but a smaller even constant (0x199a) with the // appropriate shift should work for neon in 8/10-bit. 
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 #define HIGHBD_DC_SHIFT2 17 static inline void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd, int shift1, uint32_t multiplier) { int sum = 0; (void)bd; for (int i = 0; i < bw; i++) { sum += above[i]; } for (int i = 0; i < bh; i++) { sum += left[i]; } const int expected_dc = divide_using_multiply_shift( sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); assert(expected_dc < (1 << bd)); for (int r = 0; r < bh; r++) { aom_memset16(dst, expected_dc, bw); dst += stride; } } #undef HIGHBD_DC_SHIFT2 void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X4); } #endif // !CONFIG_REALTIME_ONLY || 
CONFIG_AV1_DECODER void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, HIGHBD_DC_MULTIPLIER_1X2); } #undef HIGHBD_DC_MULTIPLIER_1X2 #undef HIGHBD_DC_MULTIPLIER_1X4 #endif // CONFIG_AV1_HIGHBITDEPTH // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary // above and left are not necessarily used all the time. #define intra_pred_sized(type, width, height) \ void aom_##type##_predictor_##width##x##height##_c( \ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ const uint8_t *left) { \ type##_predictor(dst, stride, width, height, above, left); \ } #if CONFIG_AV1_HIGHBITDEPTH #define intra_pred_highbd_sized(type, width, height) \ void aom_highbd_##type##_predictor_##width##x##height##_c( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ const uint16_t *left, int bd) { \ highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ } #else // !CONFIG_AV1_HIGHBITDEPTH #define intra_pred_highbd_sized(type, width, height) #endif // CONFIG_AV1_HIGHBITDEPTH /* clang-format off */ #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER #define intra_pred_rectangular(type) \ intra_pred_sized(type, 4, 8) \ intra_pred_sized(type, 8, 4) \ intra_pred_sized(type, 8, 16) \ intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ intra_pred_sized(type, 32, 16) \ intra_pred_sized(type, 32, 64) \ intra_pred_sized(type, 64, 32) \ intra_pred_highbd_sized(type, 4, 8) \ intra_pred_highbd_sized(type, 8, 4) \ intra_pred_highbd_sized(type, 8, 16) \ intra_pred_highbd_sized(type, 16, 8) \ intra_pred_highbd_sized(type, 16, 32) \ intra_pred_highbd_sized(type, 32, 16) \ intra_pred_highbd_sized(type, 32, 64) \ intra_pred_highbd_sized(type, 64, 32) #else #define intra_pred_rectangular(type) \ intra_pred_sized(type, 4, 8) \ intra_pred_sized(type, 8, 4) \ intra_pred_sized(type, 8, 16) \ intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ intra_pred_sized(type, 32, 16) \ intra_pred_sized(type, 32, 64) \ intra_pred_sized(type, 64, 32) \ intra_pred_sized(type, 4, 16) \ intra_pred_sized(type, 16, 4) \ intra_pred_sized(type, 8, 32) \ intra_pred_sized(type, 32, 8) \ intra_pred_sized(type, 16, 64) \ intra_pred_sized(type, 64, 16) \ intra_pred_highbd_sized(type, 4, 8) \ intra_pred_highbd_sized(type, 8, 4) \ intra_pred_highbd_sized(type, 8, 16) \ intra_pred_highbd_sized(type, 16, 8) \ intra_pred_highbd_sized(type, 16, 32) \ intra_pred_highbd_sized(type, 32, 16) \ intra_pred_highbd_sized(type, 32, 64) \ intra_pred_highbd_sized(type, 64, 32) \ intra_pred_highbd_sized(type, 4, 16) \ intra_pred_highbd_sized(type, 16, 4) \ intra_pred_highbd_sized(type, 8, 32) \ intra_pred_highbd_sized(type, 32, 8) \ intra_pred_highbd_sized(type, 16, 64) \ intra_pred_highbd_sized(type, 64, 16) #endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ intra_pred_sized(type, 32, 32) \ intra_pred_sized(type, 64, 64) \ intra_pred_highbd_sized(type, 4, 4) \ intra_pred_highbd_sized(type, 8, 8) \ intra_pred_highbd_sized(type, 16, 16) \ intra_pred_highbd_sized(type, 32, 32) \ 
intra_pred_highbd_sized(type, 64, 64) \ intra_pred_rectangular(type) #define intra_pred_allsizes(type) \ intra_pred_sized(type, 4, 4) \ intra_pred_above_4x4(type) #define intra_pred_square(type) \ intra_pred_sized(type, 4, 4) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ intra_pred_sized(type, 32, 32) \ intra_pred_sized(type, 64, 64) \ intra_pred_highbd_sized(type, 4, 4) \ intra_pred_highbd_sized(type, 8, 8) \ intra_pred_highbd_sized(type, 16, 16) \ intra_pred_highbd_sized(type, 32, 32) \ intra_pred_highbd_sized(type, 64, 64) intra_pred_allsizes(v) intra_pred_allsizes(h) intra_pred_allsizes(smooth) intra_pred_allsizes(smooth_v) intra_pred_allsizes(smooth_h) intra_pred_allsizes(paeth) intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) intra_pred_square(dc) /* clang-format on */ #undef intra_pred_allsizes aom-3.12.1/aom_dsp/intrapred_common.h000066400000000000000000000047301477627663500175400ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ #define AOM_AOM_DSP_INTRAPRED_COMMON_H_ #include "config/aom_config.h" // Weights are quadratic from '1' to '1 / block_size', scaled by // 2^SMOOTH_WEIGHT_LOG2_SCALE. #define SMOOTH_WEIGHT_LOG2_SCALE 8 // Note these arrays are aligned to ensure NEON loads using a cast to uint32_t* // have sufficient alignment. Using 8 preserves the potential for an alignment // hint in load_weight_w8(). For that case, this could be increased to 16 to // allow an aligned load in x86. 
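// For reference, smooth_predictor() in intrapred.c combines these weights as
//   (w_h * above[c] + (256 - w_h) * left[bh - 1] +
//    w_w * left[r] + (256 - w_w) * above[bw - 1] + 256) >> 9
// per pixel, where w_h = smooth_weights[bh - 4 + r] and
// w_w = smooth_weights[bw - 4 + c] (SMOOTH_WEIGHT_LOG2_SCALE == 8, so the two
// weight pairs each sum to 256 and the +256 term rounds the final shift by 9).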
DECLARE_ALIGNED(8, static const uint8_t, smooth_weights[]) = { // bs = 4 255, 149, 85, 64, // bs = 8 255, 197, 146, 105, 73, 50, 37, 32, // bs = 16 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, // bs = 32 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, // bs = 64 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 }; DECLARE_ALIGNED(8, static const uint16_t, smooth_weights_u16[]) = { // block dimension = 4 255, 149, 85, 64, // block dimension = 8 255, 197, 146, 105, 73, 50, 37, 32, // block dimension = 16 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, // block dimension = 32 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, // block dimension = 64 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 }; #endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ aom-3.12.1/aom_dsp/loopfilter.c000066400000000000000000001233141477627663500163520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" static inline int8_t signed_char_clamp(int t) { return (int8_t)clamp(t, -128, 127); } #if CONFIG_AV1_HIGHBITDEPTH static inline int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); case 8: default: return (int16_t)clamp(t, -128, 128 - 1); } } #endif // should we apply any filter at all: 11111111 yes, 00000000 no static inline int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t mask = 0; mask |= (abs(p1 - p0) > limit) * -1; mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } static inline int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p3 - p2) > limit) * -1; mask |= (abs(p2 - p1) > limit) * -1; mask |= (abs(p1 - p0) > limit) * -1; mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } static inline int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2) { int8_t mask = 0; mask |= (abs(p2 - p1) > limit) * -1; mask |= (abs(p1 - p0) > limit) * -1; mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } static inline int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2) { int8_t mask = 0; mask |= (abs(p1 - p0) > thresh) * -1; mask |= (abs(q1 - q0) > thresh) * -1; mask |= (abs(p2 - p0) > thresh) * -1; mask |= (abs(q2 - q0) > thresh) * -1; return ~mask; } static inline int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p1 - p0) > thresh) * -1; mask |= (abs(q1 - q0) > thresh) * -1; mask |= (abs(p2 - p0) > thresh) * -1; mask |= (abs(q2 - q0) > thresh) * -1; mask |= (abs(p3 - p0) > thresh) * -1; mask |= (abs(q3 - q0) > thresh) * -1; return ~mask; } // is there high edge variance internal edge: 11111111 yes, 00000000 no static inline int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; hev |= (abs(q1 - q0) > thresh) * -1; return hev; } static inline void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; const int8_t ps1 = (int8_t)(*op1 ^ 0x80); const int8_t ps0 = (int8_t)(*op0 ^ 0x80); const int8_t qs0 = (int8_t)(*oq0 ^ 0x80); const int8_t qs1 = (int8_t)(*oq1 ^ 0x80); const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance int8_t filter = signed_char_clamp(ps1 - qs1) & hev; // inner taps filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; // save bottom 3 bits so that we round one side +4 and the other +3 // if it equals 4 we'll set to adjust by -1 to account for the fact // we'd round 3 the other way filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; *oq0 = 
(uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); // outer tap adjustments filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); } void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { const uint8_t p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } } void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); } void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_4_c(s + 4, p, blimit0, limit0, thresh0); aom_lpf_horizontal_4_c(s + 8, p, blimit0, limit0, thresh0); aom_lpf_horizontal_4_c(s + 12, p, blimit0, limit0, thresh0); } void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { const uint8_t p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } } void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_4_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); } static inline void filter6(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { if (flat && mask) { const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; // 5-tap filter [1, 2, 2, 2, 1] *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); } else { filter4(mask, thresh, op1, op0, oq0, oq1); } } static inline void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { if (flat && mask) { const uint8_t p3 = *op3, p2 = *op2, p1 = 
*op1, p0 = *op0; const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; // 7-tap filter [1, 1, 1, 2, 1, 1, 1] *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { filter4(mask, thresh, op1, op0, oq0, oq1); } } void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); ++s; } } void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); } void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_6_c(s + 4, p, blimit0, limit0, thresh0); aom_lpf_horizontal_6_c(s + 8, p, blimit0, limit0, thresh0); aom_lpf_horizontal_6_c(s + 12, p, blimit0, limit0, thresh0); } void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
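  // --------------------------------------------------------------------------
  // Editor's note (illustrative, not from the original source): the smoothing
  // taps of filter6 and filter8 above sum to 8, so ROUND_POWER_OF_TWO(sum, 3)
  // is a rounded divide by 8 and perfectly flat regions pass through
  // unchanged. For example, with every sample equal to 100, filter8 gives
  //   *op2 = ROUND_POWER_OF_TWO(100 * 8, 3) = (800 + 4) >> 3 = 100.
  // --------------------------------------------------------------------------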
for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); } void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); aom_lpf_horizontal_8_c(s + 4, p, blimit0, limit0, thresh0); aom_lpf_horizontal_8_c(s + 8, p, blimit0, limit0, thresh0); aom_lpf_horizontal_8_c(s + 12, p, blimit0, limit0, thresh0); } void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; for (i = 0; i < count; ++i) { const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; const int8_t mask = filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); s += pitch; } } void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_6_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_6_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); } void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; int count = 4; for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3); s += pitch; } } void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); 
aom_lpf_vertical_8_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); } static inline void filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { if (flat2 && flat && mask) { const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5, q6 = *oq6; // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, 4); *op4 = ROUND_POWER_OF_TWO( p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); *op3 = ROUND_POWER_OF_TWO( p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); *op2 = ROUND_POWER_OF_TWO( p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, 4); *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4, 4); *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5, 4); *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + q3 + q4 + q5 + q6, 4); *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + q2 * 2 + q3 + q4 + q5 + q6 * 2, 4); *oq2 = ROUND_POWER_OF_TWO( p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, 4); *oq3 = ROUND_POWER_OF_TWO( p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); *oq4 = ROUND_POWER_OF_TWO( p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, 4); } else { filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; int step = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
for (i = 0; i < step * count; ++i) { const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); ++s; } } void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); } void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); } void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); mb_lpf_horizontal_edge_w(s + 4, p, blimit0, limit0, thresh0, 1); mb_lpf_horizontal_edge_w(s + 8, p, blimit0, limit0, thresh0, 1); mb_lpf_horizontal_edge_w(s + 12, p, blimit0, limit0, thresh0, 1); } static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; for (i = 0; i < count; ++i) { const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], q5 = s[5], q6 = s[6]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); s += p; } } void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); } void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); } void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0) { mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit0, limit0, thresh0, 4); mb_lpf_vertical_edge_w(s + 8 * pitch, pitch, blimit0, limit0, thresh0, 4); mb_lpf_vertical_edge_w(s + 12 * pitch, pitch, blimit0, limit0, thresh0, 4); } #if CONFIG_AV1_HIGHBITDEPTH // Should we apply any filter at all: 11111111 yes, 00000000 no ? 
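// ----------------------------------------------------------------------------
// Editor's note: the following is a disabled, illustrative usage sketch, not
// part of libaom. It shows how the 8-bit loop-filter entry points above are
// typically invoked on a frame buffer. The buffer `frame`, its `stride`, and
// the three one-byte thresholds are hypothetical values chosen for the
// example; in libaom proper they are derived from the frame's loop-filter
// level. The high-bit-depth variants below take uint16_t samples plus a bit
// depth, but the thresholds stay 8-bit and are scaled by (bd - 8) internally.
#if 0
static void example_lpf_usage(uint8_t *frame, int stride) {
  const uint8_t blimit = 16, limit = 3, thresh = 1;  // example thresholds
  // Filter the horizontal edge between rows 7 and 8 for columns 16..19.
  // `s` points at the first pixel below the edge (q0); the pitch steps rows.
  aom_lpf_horizontal_4_c(frame + 8 * stride + 16, stride, &blimit, &limit,
                         &thresh);
  // Filter the vertical edge between columns 15 and 16 for rows 8..11.
  aom_lpf_vertical_4_c(frame + 8 * stride + 16, stride, &blimit, &limit,
                       &thresh);
  // The _dual_ and _quad_ wrappers simply apply the same filter to 8 or 16
  // consecutive positions along the edge.
}
#endif
// ----------------------------------------------------------------------------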
static inline int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int bd) { int8_t mask = 0; int16_t limit16 = (uint16_t)limit << (bd - 8); int16_t blimit16 = (uint16_t)blimit << (bd - 8); mask |= (abs(p1 - p0) > limit16) * -1; mask |= (abs(q1 - q0) > limit16) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } // Should we apply any filter at all: 11111111 yes, 00000000 no ? static inline int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { int8_t mask = 0; int16_t limit16 = (uint16_t)limit << (bd - 8); int16_t blimit16 = (uint16_t)blimit << (bd - 8); mask |= (abs(p3 - p2) > limit16) * -1; mask |= (abs(p2 - p1) > limit16) * -1; mask |= (abs(p1 - p0) > limit16) * -1; mask |= (abs(q1 - q0) > limit16) * -1; mask |= (abs(q2 - q1) > limit16) * -1; mask |= (abs(q3 - q2) > limit16) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } static inline int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, int bd) { int8_t mask = 0; int16_t limit16 = (uint16_t)limit << (bd - 8); int16_t blimit16 = (uint16_t)blimit << (bd - 8); mask |= (abs(p2 - p1) > limit16) * -1; mask |= (abs(p1 - p0) > limit16) * -1; mask |= (abs(q1 - q0) > limit16) * -1; mask |= (abs(q2 - q1) > limit16) * -1; mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } static inline int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, int bd) { int8_t mask = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); mask |= (abs(p1 - p0) > thresh16) * -1; mask |= (abs(q1 - q0) > thresh16) * -1; mask |= (abs(p2 - p0) > thresh16) * -1; mask |= (abs(q2 - q0) > thresh16) * -1; return ~mask; } static inline int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { int8_t mask = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); mask |= (abs(p1 - p0) > thresh16) * -1; mask |= (abs(q1 - q0) > thresh16) * -1; mask |= (abs(p2 - p0) > thresh16) * -1; mask |= (abs(q2 - q0) > thresh16) * -1; mask |= (abs(p3 - p0) > thresh16) * -1; mask |= (abs(q3 - q0) > thresh16) * -1; return ~mask; } // Is there high edge variance internal edge: // 11111111_11111111 yes, 00000000_00000000 no ? static inline int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, int bd) { int16_t hev = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); hev |= (abs(p1 - p0) > thresh16) * -1; hev |= (abs(q1 - q0) > thresh16) * -1; return hev; } static inline void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, int bd) { int16_t filter1, filter2; // ^0x80 equivalent to subtracting 0x80 from the values to turn them // into -128 to +127 instead of 0 to 255. int shift = bd - 8; const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); // Add outer taps if we have high edge variance. int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; // Inner taps. 
filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; // Save bottom 3 bits so that we round one side +4 and the other +3 // if it equals 4 we'll set to adjust by -1 to account for the fact // we'd round 3 the other way. filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); // Outer tap adjustments. filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); } void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { const uint16_t p1 = s[-2 * p]; const uint16_t p0 = s[-p]; const uint16_t q0 = s[0 * p]; const uint16_t q1 = s[1 * p]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); ++s; } } void aom_highbd_lpf_horizontal_4_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
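  // --------------------------------------------------------------------------
  // Editor's note (illustrative, not from the original source): in the
  // high-bit-depth path the blimit/limit/thresh values keep their 8-bit scale
  // and are shifted up internally by (bd - 8). For bd = 10, a limit of 3
  // becomes limit16 = 3 << 2 = 12, and the 0x80 offset used by highbd_filter4
  // becomes 0x80 << 2 = 512, the mid-point of the [0, 1023] sample range.
  // --------------------------------------------------------------------------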
for (i = 0; i < count; ++i) { const uint16_t p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); s += pitch; } } void aom_highbd_lpf_vertical_4_dual_c( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } static inline void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, int bd) { if (flat && mask) { const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; // 5-tap filter [1, 2, 2, 2, 1] *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); } else { highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); } } static inline void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { if (flat && mask) { const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; // 7-tap filter [1, 1, 1, 2, 1, 1, 1] *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); } } void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); ++s; } } void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
for (i = 0; i < count; ++i) { const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, bd); ++s; } } void aom_highbd_lpf_horizontal_6_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_horizontal_8_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; for (i = 0; i < count; ++i) { const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; const int8_t mask = highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, bd); s += pitch; } } void aom_highbd_lpf_vertical_6_dual_c( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; int count = 4; for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, bd); s += pitch; } } void aom_highbd_lpf_vertical_8_dual_c( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } static inline void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, int bd) { if (flat2 && flat && mask) { const uint16_t p6 = *op6; const uint16_t p5 = *op5; const uint16_t p4 = *op4; const uint16_t p3 = *op3; const uint16_t p2 = *op2; const uint16_t p1 = *op1; const uint16_t p0 = *op0; const uint16_t q0 = *oq0; 
const uint16_t q1 = *oq1; const uint16_t q2 = *oq2; const uint16_t q3 = *oq3; const uint16_t q4 = *oq4; const uint16_t q5 = *oq5; const uint16_t q6 = *oq6; // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, 4); *op4 = ROUND_POWER_OF_TWO( p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); *op3 = ROUND_POWER_OF_TWO( p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); *op2 = ROUND_POWER_OF_TWO( p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, 4); *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + q0 + q1 + q2 + q3 + q4, 4); *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + q2 + q3 + q4 + q5, 4); *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + q3 + q4 + q5 + q6, 4); *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + q2 * 2 + q3 + q4 + q5 + q6 * 2, 4); *oq2 = ROUND_POWER_OF_TWO( p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, 4); *oq3 = ROUND_POWER_OF_TWO( p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); *oq4 = ROUND_POWER_OF_TWO( p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, 4); } else { highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, bd); } } static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd) { int i; int step = 4; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < step * count; ++i) { const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; const uint16_t p1 = s[-2 * p]; const uint16_t p0 = s[-p]; const uint16_t q0 = s[0 * p]; const uint16_t q1 = s[1 * p]; const uint16_t q2 = s[2 * p]; const uint16_t q3 = s[3 * p]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat2 = highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], bd); highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); ++s; } } void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } void aom_highbd_lpf_horizontal_14_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); } static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd) { int i; for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4]; const uint16_t p2 = s[-3]; const uint16_t p1 = s[-2]; const uint16_t p0 = s[-1]; const uint16_t q0 = s[0]; const uint16_t q1 = s[1]; const uint16_t q2 = s[2]; const uint16_t q3 = s[3]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, 
                                           p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 = highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0,
                                           s[4], s[5], s[6], bd);

    highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
                    s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
                    s + 6, bd);
    s += p;
  }
}

void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
}

void aom_highbd_lpf_vertical_14_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
                                4, bd);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
aom-3.12.1/aom_dsp/mathutils.h000066400000000000000000000110261477627663500162060ustar00rootroot00000000000000/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_MATHUTILS_H_
#define AOM_AOM_DSP_MATHUTILS_H_

#include <assert.h>
#include <math.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"

static const double TINY_NEAR_ZERO = 1.0E-16;

// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
static inline int linsolve(int n, double *A, int stride, double *b,
                           double *x) {
  int i, j, k;
  double c;
  // Forward elimination
  for (k = 0; k < n - 1; k++) {
    // Bring the largest magnitude to the diagonal position
    for (i = n - 1; i > k; i--) {
      if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
        for (j = 0; j < n; j++) {
          c = A[i * stride + j];
          A[i * stride + j] = A[(i - 1) * stride + j];
          A[(i - 1) * stride + j] = c;
        }
        c = b[i];
        b[i] = b[i - 1];
        b[i - 1] = c;
      }
    }
    for (i = k; i < n - 1; i++) {
      if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
      c = A[(i + 1) * stride + k] / A[k * stride + k];
      for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
      b[i + 1] -= c * b[k];
    }
  }
  // Backward substitution
  for (i = n - 1; i >= 0; i--) {
    if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
    c = 0;
    for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
    x[i] = (b[i] - c) / A[i * stride + i];
  }
  return 1;
}

////////////////////////////////////////////////////////////////////////////////
// Least-squares
// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
// The solution is simply x = (A'A)^-1 A'b or simply the solution for
// the system: A'A x = A'b
//
// This process is split into three steps in order to avoid needing to
// explicitly allocate the A matrix, which may be very large if there
// are many equations to solve.
//
// The process for using this is (in pseudocode):
//
// Allocate mat (size n*n), y (size n), a (size n), x (size n)
// least_squares_init(mat, y, n)
// for each equation a .
x = b { // least_squares_accumulate(mat, y, a, b, n) // } // least_squares_solve(mat, y, x, n) // // where: // * mat, y are accumulators for the values A'A and A'b respectively, // * a, b are the coefficients of each individual equation, // * x is the result vector // * and n is the problem size static inline void least_squares_init(double *mat, double *y, int n) { memset(mat, 0, n * n * sizeof(double)); memset(y, 0, n * sizeof(double)); } // Round the given positive value to nearest integer static AOM_FORCE_INLINE int iroundpf(float x) { assert(x >= 0.0); return (int)(x + 0.5f); } static inline void least_squares_accumulate(double *mat, double *y, const double *a, double b, int n) { for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { mat[i * n + j] += a[i] * a[j]; } } for (int i = 0; i < n; i++) { y[i] += a[i] * b; } } static inline int least_squares_solve(double *mat, double *y, double *x, int n) { return linsolve(n, mat, n, y, x); } // Matrix multiply static inline void multiply_mat(const double *m1, const double *m2, double *res, const int m1_rows, const int inner_dim, const int m2_cols) { double sum; int row, col, inner; for (row = 0; row < m1_rows; ++row) { for (col = 0; col < m2_cols; ++col) { sum = 0; for (inner = 0; inner < inner_dim; ++inner) sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col]; *(res++) = sum; } } } static inline float approx_exp(float y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. #define C 60801 // Magic number controls the accuracy of approximation union { float as_float; int32_t as_int32; } container; container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C); return container.as_float; #undef A #undef B #undef C } #endif // AOM_AOM_DSP_MATHUTILS_H_ aom-3.12.1/aom_dsp/noise_model.c000066400000000000000000002063131477627663500164710ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/mathutils.h"
#include "aom_dsp/noise_model.h"
#include "aom_dsp/noise_util.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_scale/yv12config.h"

#define kLowPolyNumParams 3

static const int kMaxLag = 4;

// Defines a function that can be used to obtain the mean of a block for the
// provided data type (uint8_t, or uint16_t)
#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                      \
  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h,   \
                                        int stride, int x_o, int y_o,         \
                                        int block_size) {                     \
    const int max_h = AOMMIN(h - y_o, block_size);                            \
    const int max_w = AOMMIN(w - x_o, block_size);                            \
    double block_mean = 0;                                                    \
    for (int y = 0; y < max_h; ++y) {                                         \
      for (int x = 0; x < max_w; ++x) {                                       \
        block_mean += data[(y_o + y) * stride + x_o + x];                     \
      }                                                                       \
    }                                                                         \
    return block_mean / (max_w * max_h);                                      \
  }

GET_BLOCK_MEAN(uint8_t, lowbd)
GET_BLOCK_MEAN(uint16_t, highbd)

static inline double get_block_mean(const uint8_t *data, int w, int h,
                                    int stride, int x_o, int y_o,
                                    int block_size, int use_highbd) {
  if (use_highbd)
    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o,
                                 y_o, block_size);
  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
}

// Defines a function that can be used to obtain the variance of a block
// for the provided data type (uint8_t, or uint16_t)
#define GET_NOISE_VAR(INT_TYPE, suffix)                                       \
  static double get_noise_var_##suffix(                                       \
      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w,      \
      int h, int x_o, int y_o, int block_size_x, int block_size_y) {          \
    const int max_h = AOMMIN(h - y_o, block_size_y);                          \
    const int max_w = AOMMIN(w - x_o, block_size_x);                          \
    double noise_var = 0;                                                     \
    double noise_mean = 0;                                                    \
    for (int y = 0; y < max_h; ++y) {                                         \
      for (int x = 0; x < max_w; ++x) {                                       \
        double noise = (double)data[(y_o + y) * stride + x_o + x] -           \
                       denoised[(y_o + y) * stride + x_o + x];                \
        noise_mean += noise;                                                  \
        noise_var += noise * noise;                                           \
      }                                                                       \
    }                                                                         \
    noise_mean /= (max_w * max_h);                                            \
    return noise_var / (max_w * max_h) - noise_mean * noise_mean;             \
  }

GET_NOISE_VAR(uint8_t, lowbd)
GET_NOISE_VAR(uint16_t, highbd)

static inline double get_noise_var(const uint8_t *data,
                                   const uint8_t *denoised, int w, int h,
                                   int stride, int x_o, int y_o,
                                   int block_size_x, int block_size_y,
                                   int use_highbd) {
  if (use_highbd)
    return get_noise_var_highbd((const uint16_t *)data,
                                (const uint16_t *)denoised, w, h, stride, x_o,
                                y_o, block_size_x, block_size_y);
  return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
                             block_size_x, block_size_y);
}

static void equation_system_clear(aom_equation_system_t *eqns) {
  const int n = eqns->n;
  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
  memset(eqns->x, 0, sizeof(*eqns->x) * n);
  memset(eqns->b, 0, sizeof(*eqns->b) * n);
}

static void equation_system_copy(aom_equation_system_t *dst,
                                 const aom_equation_system_t *src) {
  const int n = dst->n;
  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
}

static int equation_system_init(aom_equation_system_t *eqns, int n) {
  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
  eqns->n = n;
  if (!eqns->A || !eqns->b || !eqns->x) {
    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
    aom_free(eqns->A);
    aom_free(eqns->b);
    aom_free(eqns->x);
    memset(eqns, 0, sizeof(*eqns));
    return 0;
  }
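  // --------------------------------------------------------------------------
  // Editor's note (illustrative, not from the original source): these
  // aom_equation_system_t helpers accumulate A and b; equation_system_solve()
  // below copies them into scratch buffers and calls linsolve() from
  // aom_dsp/mathutils.h to solve A x = b. A minimal sketch with hypothetical
  // values:
  //
  //   double A[4] = { 2, 1, 1, 3 };  // row-major 2x2 matrix
  //   double b[2] = { 5, 10 };
  //   double x[2];
  //   if (linsolve(/*n=*/2, A, /*stride=*/2, b, x)) {
  //     // x[0] == 1.0 and x[1] == 3.0 solve {2x + y = 5, x + 3y = 10}.
  //   }
  // --------------------------------------------------------------------------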
equation_system_clear(eqns); return 1; } static int equation_system_solve(aom_equation_system_t *eqns) { const int n = eqns->n; double *b = (double *)aom_malloc(sizeof(*b) * n); double *A = (double *)aom_malloc(sizeof(*A) * n * n); int ret = 0; if (A == NULL || b == NULL) { fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n); aom_free(b); aom_free(A); return 0; } memcpy(A, eqns->A, sizeof(*eqns->A) * n * n); memcpy(b, eqns->b, sizeof(*eqns->b) * n); ret = linsolve(n, A, eqns->n, b, eqns->x); aom_free(b); aom_free(A); if (ret == 0) { return 0; } return 1; } static void equation_system_add(aom_equation_system_t *dest, aom_equation_system_t *src) { const int n = dest->n; int i, j; for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { dest->A[i * n + j] += src->A[i * n + j]; } dest->b[i] += src->b[i]; } } static void equation_system_free(aom_equation_system_t *eqns) { if (!eqns) return; aom_free(eqns->A); aom_free(eqns->b); aom_free(eqns->x); memset(eqns, 0, sizeof(*eqns)); } static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) { equation_system_clear(&solver->eqns); solver->num_equations = 0; solver->total = 0; } static void noise_strength_solver_add(aom_noise_strength_solver_t *dest, aom_noise_strength_solver_t *src) { equation_system_add(&dest->eqns, &src->eqns); dest->num_equations += src->num_equations; dest->total += src->total; } // Return the number of coefficients required for the given parameters static int num_coeffs(const aom_noise_model_params_t params) { const int n = 2 * params.lag + 1; switch (params.shape) { case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1); case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2; } return 0; } static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) { const int kNumBins = 20; if (!equation_system_init(&state->eqns, n)) { fprintf(stderr, "Failed initialization noise state with size %d\n", n); return 0; } state->ar_gain = 1.0; state->num_observations = 0; return aom_noise_strength_solver_init(&state->strength_solver, kNumBins, bit_depth); } static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { const double kTolerance = 1e-6; const int last = eqns->n - 1; // Set all of the AR coefficients to zero, but try to solve for correlation // with the luma channel memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n); if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) { eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last]; } } int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { if (!lut) return 0; if (num_points <= 0) return 0; lut->num_points = 0; lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); if (!lut->points) return 0; lut->num_points = num_points; memset(lut->points, 0, sizeof(*lut->points) * num_points); return 1; } void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { if (!lut) return; aom_free(lut->points); memset(lut, 0, sizeof(*lut)); } double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, double x) { int i = 0; // Constant extrapolation for x < x_0. 
if (x < lut->points[0][0]) return lut->points[0][1]; for (i = 0; i < lut->num_points - 1; ++i) { if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { const double a = (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); } } // Constant extrapolation for x > x_{n-1} return lut->points[lut->num_points - 1][1]; } static double noise_strength_solver_get_bin_index( const aom_noise_strength_solver_t *solver, double value) { const double val = fclamp(value, solver->min_intensity, solver->max_intensity); const double range = solver->max_intensity - solver->min_intensity; return (solver->num_bins - 1) * (val - solver->min_intensity) / range; } static double noise_strength_solver_get_value( const aom_noise_strength_solver_t *solver, double x) { const double bin = noise_strength_solver_get_bin_index(solver, x); const int bin_i0 = (int)floor(bin); const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); const double a = bin - bin_i0; return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; } void aom_noise_strength_solver_add_measurement( aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { const double bin = noise_strength_solver_get_bin_index(solver, block_mean); const int bin_i0 = (int)floor(bin); const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); const double a = bin - bin_i0; const int n = solver->num_bins; solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); solver->eqns.A[bin_i1 * n + bin_i1] += a * a; solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; solver->eqns.b[bin_i1] += a * noise_std; solver->total += noise_std; solver->num_equations++; } int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { // Add regularization proportional to the number of constraints const int n = solver->num_bins; const double kAlpha = 2.0 * (double)(solver->num_equations) / n; int result = 0; double mean = 0; // Do this in a non-destructive manner so it is not confusing to the caller double *old_A = solver->eqns.A; double *A = (double *)aom_malloc(sizeof(*A) * n * n); if (!A) { fprintf(stderr, "Unable to allocate copy of A\n"); return 0; } memcpy(A, old_A, sizeof(*A) * n * n); for (int i = 0; i < n; ++i) { const int i_lo = AOMMAX(0, i - 1); const int i_hi = AOMMIN(n - 1, i + 1); A[i * n + i_lo] -= kAlpha; A[i * n + i] += 2 * kAlpha; A[i * n + i_hi] -= kAlpha; } // Small regularization to give average noise strength mean = solver->total / solver->num_equations; for (int i = 0; i < n; ++i) { A[i * n + i] += 1.0 / 8192.; solver->eqns.b[i] += mean / 8192.; } solver->eqns.A = A; result = equation_system_solve(&solver->eqns); solver->eqns.A = old_A; aom_free(A); return result; } int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, int num_bins, int bit_depth) { if (!solver) return 0; memset(solver, 0, sizeof(*solver)); solver->num_bins = num_bins; solver->min_intensity = 0; solver->max_intensity = (1 << bit_depth) - 1; solver->total = 0; solver->num_equations = 0; return equation_system_init(&solver->eqns, num_bins); } void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) { if (!solver) return; equation_system_free(&solver->eqns); } double aom_noise_strength_solver_get_center( const aom_noise_strength_solver_t *solver, int i) { const double range = solver->max_intensity - solver->min_intensity; const int n = 
solver->num_bins; return ((double)i) / (n - 1) * range + solver->min_intensity; } // Computes the residual if a point were to be removed from the lut. This is // calculated as the area between the output of the solver and the line segment // that would be formed between [x_{i - 1}, x_{i + 1}). static void update_piecewise_linear_residual( const aom_noise_strength_solver_t *solver, const aom_noise_strength_lut_t *lut, double *residual, int start, int end) { const double dx = 255. / solver->num_bins; for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) { const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index( solver, lut->points[i - 1][0]))); const int upper = AOMMIN(solver->num_bins - 1, (int)ceil(noise_strength_solver_get_bin_index( solver, lut->points[i + 1][0]))); double r = 0; for (int j = lower; j <= upper; ++j) { const double x = aom_noise_strength_solver_get_center(solver, j); if (x < lut->points[i - 1][0]) continue; if (x >= lut->points[i + 1][0]) continue; const double y = solver->eqns.x[j]; const double a = (x - lut->points[i - 1][0]) / (lut->points[i + 1][0] - lut->points[i - 1][0]); const double estimate_y = lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a; r += fabs(y - estimate_y); } residual[i] = r * dx; } } int aom_noise_strength_solver_fit_piecewise( const aom_noise_strength_solver_t *solver, int max_output_points, aom_noise_strength_lut_t *lut) { // The tolerance is normalized to be give consistent results between // different bit-depths. const double kTolerance = solver->max_intensity * 0.00625 / 255.0; if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { fprintf(stderr, "Failed to init lut\n"); return 0; } for (int i = 0; i < solver->num_bins; ++i) { lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); lut->points[i][1] = solver->eqns.x[i]; } if (max_output_points < 0) { max_output_points = solver->num_bins; } double *residual = (double *)aom_malloc(solver->num_bins * sizeof(*residual)); if (!residual) { aom_noise_strength_lut_free(lut); return 0; } memset(residual, 0, sizeof(*residual) * solver->num_bins); update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); // Greedily remove points if there are too many or if it doesn't hurt local // approximation (never remove the end points) while (lut->num_points > 2) { int min_index = 1; for (int j = 1; j < lut->num_points - 1; ++j) { if (residual[j] < residual[min_index]) { min_index = j; } } const double dx = lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; const double avg_residual = residual[min_index] / dx; if (lut->num_points <= max_output_points && avg_residual > kTolerance) { break; } const int num_remaining = lut->num_points - min_index - 1; memmove(lut->points + min_index, lut->points + min_index + 1, sizeof(lut->points[0]) * num_remaining); lut->num_points--; update_piecewise_linear_residual(solver, lut, residual, min_index - 1, min_index + 1); } aom_free(residual); return 1; } int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, int block_size, int bit_depth, int use_highbd) { const int n = block_size * block_size; aom_equation_system_t eqns; double *AtA_inv = 0; double *A = 0; int x = 0, y = 0, i = 0, j = 0; block_finder->A = NULL; block_finder->AtA_inv = NULL; if (!equation_system_init(&eqns, kLowPolyNumParams)) { fprintf(stderr, "Failed to init equation system for block_size=%d\n", block_size); return 0; } AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams * 
sizeof(*AtA_inv)); A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); if (AtA_inv == NULL || A == NULL) { fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", block_size); aom_free(AtA_inv); aom_free(A); equation_system_free(&eqns); return 0; } block_finder->A = A; block_finder->AtA_inv = AtA_inv; block_finder->block_size = block_size; block_finder->normalization = (1 << bit_depth) - 1; block_finder->use_highbd = use_highbd; for (y = 0; y < block_size; ++y) { const double yd = ((double)y - block_size / 2.) / (block_size / 2.); for (x = 0; x < block_size; ++x) { const double xd = ((double)x - block_size / 2.) / (block_size / 2.); const double coords[3] = { yd, xd, 1 }; const int row = y * block_size + x; A[kLowPolyNumParams * row + 0] = yd; A[kLowPolyNumParams * row + 1] = xd; A[kLowPolyNumParams * row + 2] = 1; for (i = 0; i < kLowPolyNumParams; ++i) { for (j = 0; j < kLowPolyNumParams; ++j) { eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; } } } } // Lazy inverse using existing equation solver. for (i = 0; i < kLowPolyNumParams; ++i) { memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams); eqns.b[i] = 1; equation_system_solve(&eqns); for (j = 0; j < kLowPolyNumParams; ++j) { AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j]; } } equation_system_free(&eqns); return 1; } void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) { if (!block_finder) return; aom_free(block_finder->A); aom_free(block_finder->AtA_inv); memset(block_finder, 0, sizeof(*block_finder)); } void aom_flat_block_finder_extract_block( const aom_flat_block_finder_t *block_finder, const uint8_t *const data, int w, int h, int stride, int offsx, int offsy, double *plane, double *block) { const int block_size = block_finder->block_size; const int n = block_size * block_size; const double *A = block_finder->A; const double *AtA_inv = block_finder->AtA_inv; double plane_coords[kLowPolyNumParams]; double AtA_inv_b[kLowPolyNumParams]; int xi, yi, i; if (block_finder->use_highbd) { const uint16_t *const data16 = (const uint16_t *const)data; for (yi = 0; yi < block_size; ++yi) { const int y = clamp(offsy + yi, 0, h - 1); for (xi = 0; xi < block_size; ++xi) { const int x = clamp(offsx + xi, 0, w - 1); block[yi * block_size + xi] = ((double)data16[y * stride + x]) / block_finder->normalization; } } } else { for (yi = 0; yi < block_size; ++yi) { const int y = clamp(offsy + yi, 0, h - 1); for (xi = 0; xi < block_size; ++xi) { const int x = clamp(offsx + xi, 0, w - 1); block[yi * block_size + xi] = ((double)data[y * stride + x]) / block_finder->normalization; } } } multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams); multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams, kLowPolyNumParams, 1); multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1); for (i = 0; i < n; ++i) { block[i] -= plane[i]; } } typedef struct { int index; float score; } index_and_score_t; static int compare_scores(const void *a, const void *b) { const float diff = ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score; if (diff < 0) return -1; else if (diff > 0) return 1; return 0; } int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, const uint8_t *const data, int w, int h, int stride, uint8_t *flat_blocks) { // The gradient-based features used in this code are based on: // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise // correlation for improved video denoising," 2012 19th, ICIP. 
// The thresholds are more lenient to allow for correct grain modeling // if extreme cases. const int block_size = block_finder->block_size; const int n = block_size * block_size; const double kTraceThreshold = 0.15 / (32 * 32); const double kRatioThreshold = 1.25; const double kNormThreshold = 0.08 / (32 * 32); const double kVarThreshold = 0.005 / (double)n; const int num_blocks_w = (w + block_size - 1) / block_size; const int num_blocks_h = (h + block_size - 1) / block_size; int num_flat = 0; double *plane = (double *)aom_malloc(n * sizeof(*plane)); double *block = (double *)aom_malloc(n * sizeof(*block)); index_and_score_t *scores = (index_and_score_t *)aom_malloc( num_blocks_w * num_blocks_h * sizeof(*scores)); if (plane == NULL || block == NULL || scores == NULL) { fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); aom_free(plane); aom_free(block); aom_free(scores); return -1; } #ifdef NOISE_MODEL_LOG_SCORE fprintf(stderr, "score = ["); #endif for (int by = 0; by < num_blocks_h; ++by) { for (int bx = 0; bx < num_blocks_w; ++bx) { // Compute gradient covariance matrix. aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, bx * block_size, by * block_size, plane, block); double Gxx = 0, Gxy = 0, Gyy = 0; double mean = 0; double var = 0; for (int yi = 1; yi < block_size - 1; ++yi) { for (int xi = 1; xi < block_size - 1; ++xi) { const double gx = (block[yi * block_size + xi + 1] - block[yi * block_size + xi - 1]) / 2; const double gy = (block[yi * block_size + xi + block_size] - block[yi * block_size + xi - block_size]) / 2; Gxx += gx * gx; Gxy += gx * gy; Gyy += gy * gy; const double value = block[yi * block_size + xi]; mean += value; var += value * value; } } mean /= (block_size - 2) * (block_size - 2); // Normalize gradients by block_size. Gxx /= ((block_size - 2) * (block_size - 2)); Gxy /= ((block_size - 2) * (block_size - 2)); Gyy /= ((block_size - 2) * (block_size - 2)); var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; { const double trace = Gxx + Gyy; const double det = Gxx * Gyy - Gxy * Gxy; const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; const double norm = e1; // Spectral norm const double ratio = (e1 / AOMMAX(e2, 1e-6)); const int is_flat = (trace < kTraceThreshold) && (ratio < kRatioThreshold) && (norm < kNormThreshold) && (var > kVarThreshold); // The following weights are used to combine the above features to give // a sigmoid score for flatness. If the input was normalized to [0,100] // the magnitude of these values would be close to 1 (e.g., weights // corresponding to variance would be a factor of 10000x smaller). // The weights are given in the following order: // [{var}, {ratio}, {trace}, {norm}, offset] // with one of the most discriminative being simply the variance. const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; double sum_weights = weights[0] * var + weights[1] * ratio + weights[2] * trace + weights[3] * norm + weights[4]; // clamp the value to [-25.0, 100.0] to prevent overflow sum_weights = fclamp(sum_weights, -25.0, 100.0); const float score = (float)(1.0 / (1 + exp(-sum_weights))); flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; scores[by * num_blocks_w + bx].score = var > kVarThreshold ? 
score : 0; scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx; #ifdef NOISE_MODEL_LOG_SCORE fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm, is_flat); #endif num_flat += is_flat; } } #ifdef NOISE_MODEL_LOG_SCORE fprintf(stderr, "\n"); #endif } #ifdef NOISE_MODEL_LOG_SCORE fprintf(stderr, "];\n"); #endif // Find the top-scored blocks (most likely to be flat) and set the flat blocks // be the union of the thresholded results and the top 10th percentile of the // scored results. qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores); const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100; const float score_threshold = scores[top_nth_percentile].score; for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) { if (scores[i].score >= score_threshold) { num_flat += flat_blocks[scores[i].index] == 0; flat_blocks[scores[i].index] |= 1; } } aom_free(block); aom_free(plane); aom_free(scores); return num_flat; } int aom_noise_model_init(aom_noise_model_t *model, const aom_noise_model_params_t params) { const int n = num_coeffs(params); const int lag = params.lag; const int bit_depth = params.bit_depth; int x = 0, y = 0, i = 0, c = 0; memset(model, 0, sizeof(*model)); if (params.lag < 1) { fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag); return 0; } if (params.lag > kMaxLag) { fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag, kMaxLag); return 0; } if (!(params.bit_depth == 8 || params.bit_depth == 10 || params.bit_depth == 12)) { return 0; } memcpy(&model->params, ¶ms, sizeof(params)); for (c = 0; c < 3; ++c) { if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) { fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); aom_noise_model_free(model); return 0; } if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) { fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); aom_noise_model_free(model); return 0; } } model->n = n; model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n); if (!model->coords) { aom_noise_model_free(model); return 0; } for (y = -lag; y <= 0; ++y) { const int max_x = y == 0 ? -1 : lag; for (x = -lag; x <= max_x; ++x) { switch (params.shape) { case AOM_NOISE_SHAPE_DIAMOND: if (abs(x) <= y + lag) { model->coords[i][0] = x; model->coords[i][1] = y; ++i; } break; case AOM_NOISE_SHAPE_SQUARE: model->coords[i][0] = x; model->coords[i][1] = y; ++i; break; default: fprintf(stderr, "Invalid shape\n"); aom_noise_model_free(model); return 0; } } } assert(i == n); return 1; } void aom_noise_model_free(aom_noise_model_t *model) { int c = 0; if (!model) return; aom_free(model->coords); for (c = 0; c < 3; ++c) { equation_system_free(&model->latest_state[c].eqns); equation_system_free(&model->combined_state[c].eqns); equation_system_free(&model->latest_state[c].strength_solver.eqns); equation_system_free(&model->combined_state[c].strength_solver.eqns); } memset(model, 0, sizeof(*model)); } // Extracts the neighborhood defined by coords around point (x, y) from // the difference between the data and denoised images. Also extracts the // entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma). 
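// ----------------------------------------------------------------------------
// Editor's note: disabled, illustrative calling-sequence sketch; it is not
// part of libaom. It chains the helpers defined above: find flat blocks in a
// frame, feed per-block (mean, noise std) measurements into the strength
// solver, then fit a compact piecewise-linear lookup table. The frame buffer,
// the constant noise std of 1.5, and the other parameter choices are
// hypothetical placeholders for illustration only.
#if 0
static void example_noise_strength_flow(const uint8_t *frame, int w, int h,
                                        int stride) {
  const int block_size = 32;
  const int num_blocks_w = (w + block_size - 1) / block_size;
  const int num_blocks_h = (h + block_size - 1) / block_size;
  uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
  aom_flat_block_finder_t block_finder;
  aom_noise_strength_solver_t solver;
  aom_noise_strength_lut_t lut;
  if (!flat_blocks) return;
  if (!aom_flat_block_finder_init(&block_finder, block_size, /*bit_depth=*/8,
                                  /*use_highbd=*/0)) {
    aom_free(flat_blocks);
    return;
  }
  if (!aom_noise_strength_solver_init(&solver, /*num_bins=*/20,
                                      /*bit_depth=*/8)) {
    aom_flat_block_finder_free(&block_finder);
    aom_free(flat_blocks);
    return;
  }
  aom_flat_block_finder_run(&block_finder, frame, w, h, stride, flat_blocks);
  for (int by = 0; by < num_blocks_h; ++by) {
    for (int bx = 0; bx < num_blocks_w; ++bx) {
      if (!flat_blocks[by * num_blocks_w + bx]) continue;
      const double block_mean =
          get_block_mean(frame, w, h, stride, bx * block_size, by * block_size,
                         block_size, /*use_highbd=*/0);
      // 1.5 is a placeholder; the real pipeline measures the std of
      // (source - denoised) per block, as add_noise_std_observations() below
      // does.
      aom_noise_strength_solver_add_measurement(&solver, block_mean, 1.5);
    }
  }
  if (solver.num_equations > 0 && aom_noise_strength_solver_solve(&solver) &&
      aom_noise_strength_solver_fit_piecewise(&solver, /*max_output_points=*/6,
                                              &lut)) {
    const double sigma_mid_gray = aom_noise_strength_lut_eval(&lut, 128);
    (void)sigma_mid_gray;  // e.g., noise std near mid-gray intensity
    aom_noise_strength_lut_free(&lut);
  }
  aom_noise_strength_solver_free(&solver);
  aom_flat_block_finder_free(&block_finder);
  aom_free(flat_blocks);
}
#endif
// ----------------------------------------------------------------------------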
#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ static double extract_ar_row_##suffix( \ int(*coords)[2], int num_coords, const INT_TYPE *const data, \ const INT_TYPE *const denoised, int stride, int sub_log2[2], \ const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ int alt_stride, int x, int y, double *buffer) { \ for (int i = 0; i < num_coords; ++i) { \ const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ buffer[i] = \ (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ } \ const double val = \ (double)data[y * stride + x] - denoised[y * stride + x]; \ \ if (alt_data && alt_denoised) { \ double avg_data = 0, avg_denoised = 0; \ int num_samples = 0; \ for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ const int y_up = (y << sub_log2[1]) + dy_i; \ for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ const int x_up = (x << sub_log2[0]) + dx_i; \ avg_data += alt_data[y_up * alt_stride + x_up]; \ avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ num_samples++; \ } \ } \ buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ } \ return val; \ } EXTRACT_AR_ROW(uint8_t, lowbd) EXTRACT_AR_ROW(uint16_t, highbd) static int add_block_observations( aom_noise_model_t *noise_model, int c, const uint8_t *const data, const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], const uint8_t *const alt_data, const uint8_t *const alt_denoised, int alt_stride, const uint8_t *const flat_blocks, int block_size, int num_blocks_w, int num_blocks_h) { const int lag = noise_model->params.lag; const int num_coords = noise_model->n; const double normalization = (1 << noise_model->params.bit_depth) - 1; double *A = noise_model->latest_state[c].eqns.A; double *b = noise_model->latest_state[c].eqns.b; double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); const int n = noise_model->latest_state[c].eqns.n; if (!buffer) { fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); return 0; } for (int by = 0; by < num_blocks_h; ++by) { const int y_o = by * (block_size >> sub_log2[1]); for (int bx = 0; bx < num_blocks_w; ++bx) { const int x_o = bx * (block_size >> sub_log2[0]); if (!flat_blocks[by * num_blocks_w + bx]) { continue; } int y_start = (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; int x_start = (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), block_size >> sub_log2[1]); int x_end = AOMMIN( (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) ? (block_size >> sub_log2[0]) : ((block_size >> sub_log2[0]) - lag)); for (int y = y_start; y < y_end; ++y) { for (int x = x_start; x < x_end; ++x) { const double val = noise_model->params.use_highbd ? 
extract_ar_row_highbd(noise_model->coords, num_coords, (const uint16_t *const)data, (const uint16_t *const)denoised, stride, sub_log2, (const uint16_t *const)alt_data, (const uint16_t *const)alt_denoised, alt_stride, x + x_o, y + y_o, buffer) : extract_ar_row_lowbd(noise_model->coords, num_coords, data, denoised, stride, sub_log2, alt_data, alt_denoised, alt_stride, x + x_o, y + y_o, buffer); for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { A[i * n + j] += (buffer[i] * buffer[j]) / (normalization * normalization); } b[i] += (buffer[i] * val) / (normalization * normalization); } noise_model->latest_state[c].num_observations++; } } } } aom_free(buffer); return 1; } static void add_noise_std_observations( aom_noise_model_t *noise_model, int c, const double *coeffs, const uint8_t *const data, const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, const uint8_t *const flat_blocks, int block_size, int num_blocks_w, int num_blocks_h) { const int num_coords = noise_model->n; aom_noise_strength_solver_t *noise_strength_solver = &noise_model->latest_state[c].strength_solver; const aom_noise_strength_solver_t *noise_strength_luma = &noise_model->latest_state[0].strength_solver; const double luma_gain = noise_model->latest_state[0].ar_gain; const double noise_gain = noise_model->latest_state[c].ar_gain; for (int by = 0; by < num_blocks_h; ++by) { const int y_o = by * (block_size >> sub_log2[1]); for (int bx = 0; bx < num_blocks_w; ++bx) { const int x_o = bx * (block_size >> sub_log2[0]); if (!flat_blocks[by * num_blocks_w + bx]) { continue; } const int num_samples_h = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), block_size >> sub_log2[1]); const int num_samples_w = AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), (block_size >> sub_log2[0])); // Make sure that we have a reasonable amount of samples to consider the // block if (num_samples_w * num_samples_h > block_size) { const double block_mean = get_block_mean( alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, x_o << sub_log2[0], y_o << sub_log2[1], block_size, noise_model->params.use_highbd); const double noise_var = get_noise_var( data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], noise_model->params.use_highbd); // We want to remove the part of the noise that came from being // correlated with luma. Note that the noise solver for luma must // have already been run. const double luma_strength = c > 0 ? luma_gain * noise_strength_solver_get_value( noise_strength_luma, block_mean) : 0; const double corr = c > 0 ? coeffs[num_coords] : 0; // Chroma noise: // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) // The uncorrelated component: // uncorr_var = noise_var - (corr * luma_strength)^2 // But don't allow fully correlated noise (hence the max), since the // synthesis cannot model it. const double uncorr_std = sqrt( AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); // After we've removed correlation with luma, undo the gain that will // come from running the IIR filter. const double adjusted_strength = uncorr_std / noise_gain; aom_noise_strength_solver_add_measurement( noise_strength_solver, block_mean, adjusted_strength); } } } } // Return true if the noise estimate appears to be different from the combined // (multi-frame) estimate. 
The difference is measured by checking whether the // AR coefficients have diverged (using a threshold on normalized cross // correlation), or whether the noise strength has changed. static int is_noise_model_different(aom_noise_model_t *const noise_model) { // These thresholds are kind of arbitrary and will likely need further tuning // (or exported as parameters). The threshold on noise strength is a weighted // difference between the noise strength histograms. const double kCoeffThreshold = 0.9; const double kStrengthThreshold = 0.005 * (1 << (noise_model->params.bit_depth - 8)); for (int c = 0; c < 1; ++c) { const double corr = aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x, noise_model->combined_state[c].eqns.x, noise_model->combined_state[c].eqns.n); if (corr < kCoeffThreshold) return 1; const double dx = 1.0 / noise_model->latest_state[c].strength_solver.num_bins; const aom_equation_system_t *latest_eqns = &noise_model->latest_state[c].strength_solver.eqns; const aom_equation_system_t *combined_eqns = &noise_model->combined_state[c].strength_solver.eqns; double diff = 0; double total_weight = 0; for (int j = 0; j < latest_eqns->n; ++j) { double weight = 0; for (int i = 0; i < latest_eqns->n; ++i) { weight += latest_eqns->A[i * latest_eqns->n + j]; } weight = sqrt(weight); diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]); total_weight += weight; } if (diff * dx / total_weight > kStrengthThreshold) return 1; } return 0; } static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) { const int ret = equation_system_solve(&state->eqns); state->ar_gain = 1.0; if (!ret) return ret; // Update the AR gain from the equation system as it will be used to fit // the noise strength as a function of intensity. In the Yule-Walker // equations, the diagonal should be the variance of the correlated noise. // In the case of the least squares estimate, there will be some variability // in the diagonal. So use the mean of the diagonal as the estimate of // overall variance (this works for least squares or Yule-Walker formulation). double var = 0; const int n = state->eqns.n; for (int i = 0; i < (state->eqns.n - is_chroma); ++i) { var += state->eqns.A[i * n + i] / state->num_observations; } var /= (n - is_chroma); // Keep track of E(Y^2) = <b, x> + E(X^2) // In the case that we are using chroma and have an estimate of correlation // with luma we adjust that estimate slightly to remove the correlated bits by // subtracting out the last column of A scaled by our correlation estimate // from b. E(y^2) = <b - A(:, end) * x(end), x> double sum_covar = 0; for (int i = 0; i < state->eqns.n - is_chroma; ++i) { double bi = state->eqns.b[i]; if (is_chroma) { bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1]; } sum_covar += (bi * state->eqns.x[i]) / state->num_observations; } // Now, get an estimate of the variance of uncorrelated noise signal and use // it to determine the gain of the AR filter.
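/* Editor's illustrative example (not part of the upstream sources): if the
 * mean diagonal variance is var = 4.0 and the AR prediction explains
 * sum_covar = 3.0 of it, then noise_var = 1.0 and ar_gain = sqrt(4.0 / 1.0) = 2,
 * i.e. the synthesized grain comes out roughly twice as strong as the
 * white-noise excitation. add_noise_std_observations() therefore divides the
 * measured strengths by this gain before fitting the strength curve. */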
const double noise_var = AOMMAX(var - sum_covar, 1e-6); state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); return ret; } aom_noise_status_t aom_noise_model_update( aom_noise_model_t *const noise_model, const uint8_t *const data[3], const uint8_t *const denoised[3], int w, int h, int stride[3], int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { const int num_blocks_w = (w + block_size - 1) / block_size; const int num_blocks_h = (h + block_size - 1) / block_size; int y_model_different = 0; int num_blocks = 0; int i = 0, channel = 0; if (block_size <= 1) { fprintf(stderr, "block_size = %d must be > 1\n", block_size); return AOM_NOISE_STATUS_INVALID_ARGUMENT; } if (block_size < noise_model->params.lag * 2 + 1) { fprintf(stderr, "block_size = %d must be >= %d\n", block_size, noise_model->params.lag * 2 + 1); return AOM_NOISE_STATUS_INVALID_ARGUMENT; } // Clear the latest equation system for (i = 0; i < 3; ++i) { equation_system_clear(&noise_model->latest_state[i].eqns); noise_model->latest_state[i].num_observations = 0; noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); } // Check that we have enough flat blocks for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { if (flat_blocks[i]) { num_blocks++; } } if (num_blocks <= 1) { fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; } for (channel = 0; channel < 3; ++channel) { int no_subsampling[2] = { 0, 0 }; const uint8_t *alt_data = channel > 0 ? data[0] : 0; const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling; const int is_chroma = channel != 0; if (!data[channel] || !denoised[channel]) break; if (!add_block_observations(noise_model, channel, data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, alt_denoised, stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h)) { fprintf(stderr, "Adding block observation failed\n"); return AOM_NOISE_STATUS_INTERNAL_ERROR; } if (!ar_equation_system_solve(&noise_model->latest_state[channel], is_chroma)) { if (is_chroma) { set_chroma_coefficient_fallback_soln( &noise_model->latest_state[channel].eqns); } else { fprintf(stderr, "Solving latest noise equation system failed %d!\n", channel); return AOM_NOISE_STATUS_INTERNAL_ERROR; } } add_noise_std_observations( noise_model, channel, noise_model->latest_state[channel].eqns.x, data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); if (!aom_noise_strength_solver_solve( &noise_model->latest_state[channel].strength_solver)) { fprintf(stderr, "Solving latest noise strength failed!\n"); return AOM_NOISE_STATUS_INTERNAL_ERROR; } // Check noise characteristics and return if error. if (channel == 0 && noise_model->combined_state[channel].strength_solver.num_equations > 0 && is_noise_model_different(noise_model)) { y_model_different = 1; } // Don't update the combined stats if the y model is different. 
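/* Editor's note (illustrative, not part of the upstream sources): in that case
 * this function returns AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE and the caller
 * can either keep the old combined estimate or adopt the latest one via
 * aom_noise_model_save_latest(), as aom_denoise_and_model_run() does. */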
if (y_model_different) continue; noise_model->combined_state[channel].num_observations += noise_model->latest_state[channel].num_observations; equation_system_add(&noise_model->combined_state[channel].eqns, &noise_model->latest_state[channel].eqns); if (!ar_equation_system_solve(&noise_model->combined_state[channel], is_chroma)) { if (is_chroma) { set_chroma_coefficient_fallback_soln( &noise_model->combined_state[channel].eqns); } else { fprintf(stderr, "Solving combined noise equation system failed %d!\n", channel); return AOM_NOISE_STATUS_INTERNAL_ERROR; } } noise_strength_solver_add( &noise_model->combined_state[channel].strength_solver, &noise_model->latest_state[channel].strength_solver); if (!aom_noise_strength_solver_solve( &noise_model->combined_state[channel].strength_solver)) { fprintf(stderr, "Solving combined noise strength failed!\n"); return AOM_NOISE_STATUS_INTERNAL_ERROR; } } return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE : AOM_NOISE_STATUS_OK; } void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { for (int c = 0; c < 3; c++) { equation_system_copy(&noise_model->combined_state[c].eqns, &noise_model->latest_state[c].eqns); equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, &noise_model->latest_state[c].strength_solver.eqns); noise_model->combined_state[c].strength_solver.num_equations = noise_model->latest_state[c].strength_solver.num_equations; noise_model->combined_state[c].num_observations = noise_model->latest_state[c].num_observations; noise_model->combined_state[c].ar_gain = noise_model->latest_state[c].ar_gain; } } int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, aom_film_grain_t *film_grain) { if (noise_model->params.lag > 3) { fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); return 0; } uint16_t random_seed = film_grain->random_seed; memset(film_grain, 0, sizeof(*film_grain)); film_grain->random_seed = random_seed; film_grain->apply_grain = 1; film_grain->update_parameters = 1; film_grain->ar_coeff_lag = noise_model->params.lag; // Convert the scaling functions to 8 bit values aom_noise_strength_lut_t scaling_points[3]; if (!aom_noise_strength_solver_fit_piecewise( &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0)) { return 0; } if (!aom_noise_strength_solver_fit_piecewise( &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1)) { aom_noise_strength_lut_free(scaling_points + 0); return 0; } if (!aom_noise_strength_solver_fit_piecewise( &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2)) { aom_noise_strength_lut_free(scaling_points + 0); aom_noise_strength_lut_free(scaling_points + 1); return 0; } // Both the domain and the range of the scaling functions in the film_grain // are normalized to 8-bit (e.g., they are implicitly scaled during grain // synthesis). 
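/* Editor's illustrative example (not part of the upstream sources), assuming
 * 10-bit input: strength_divisor = 1 << (10 - 8) = 4. If the largest scaled
 * point is then max_scaling_value = 12.3, max_scaling_value_log2 =
 * clamp(floor(log2(12.3) + 1), 2, 5) = 4, so scaling_shift = 5 + (8 - 4) = 9
 * (within the stated [8, 11] range) and every strength value is multiplied by
 * scale_factor = 1 << (8 - 4) = 16 before being rounded and clamped to
 * [0, 255]. */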
const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); double max_scaling_value = 1e-4; for (int c = 0; c < 3; ++c) { for (int i = 0; i < scaling_points[c].num_points; ++i) { scaling_points[c].points[i][0] = AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); scaling_points[c].points[i][1] = AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); max_scaling_value = AOMMAX(scaling_points[c].points[i][1], max_scaling_value); } } // Scaling_shift values are in the range [8,11] const int max_scaling_value_log2 = clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); const double scale_factor = 1 << (8 - max_scaling_value_log2); film_grain->num_y_points = scaling_points[0].num_points; film_grain->num_cb_points = scaling_points[1].num_points; film_grain->num_cr_points = scaling_points[2].num_points; int(*film_grain_scaling[3])[2] = { film_grain->scaling_points_y, film_grain->scaling_points_cb, film_grain->scaling_points_cr, }; for (int c = 0; c < 3; c++) { for (int i = 0; i < scaling_points[c].num_points; ++i) { film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); film_grain_scaling[c][i][1] = clamp( (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); } } aom_noise_strength_lut_free(scaling_points + 0); aom_noise_strength_lut_free(scaling_points + 1); aom_noise_strength_lut_free(scaling_points + 2); // Convert the ar_coeffs into 8-bit values const int n_coeff = noise_model->combined_state[0].eqns.n; double max_coeff = 1e-4, min_coeff = -1e-4; double y_corr[2] = { 0, 0 }; double avg_luma_strength = 0; for (int c = 0; c < 3; c++) { aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; for (int i = 0; i < n_coeff; ++i) { max_coeff = AOMMAX(max_coeff, eqns->x[i]); min_coeff = AOMMIN(min_coeff, eqns->x[i]); } // Since the correlation between luma/chroma was computed in an already // scaled space, we adjust it in the un-scaled space. aom_noise_strength_solver_t *solver = &noise_model->combined_state[c].strength_solver; // Compute a weighted average of the strength for the channel. double average_strength = 0, total_weight = 0; for (int i = 0; i < solver->eqns.n; ++i) { double w = 0; for (int j = 0; j < solver->eqns.n; ++j) { w += solver->eqns.A[i * solver->eqns.n + j]; } w = sqrt(w); average_strength += solver->eqns.x[i] * w; total_weight += w; } if (total_weight == 0) average_strength = 1; else average_strength /= total_weight; if (c == 0) { avg_luma_strength = average_strength; } else { y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); } } // Shift value: AR coeffs range (values 6-9) // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) film_grain->ar_coeff_shift = clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), 6, 9); double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; int *ar_coeffs[3] = { film_grain->ar_coeffs_y, film_grain->ar_coeffs_cb, film_grain->ar_coeffs_cr, }; for (int c = 0; c < 3; ++c) { aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; for (int i = 0; i < n_coeff; ++i) { ar_coeffs[c][i] = clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); } if (c > 0) { ar_coeffs[c][n_coeff] = clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); } } // At the moment, the noise modeling code assumes that the chroma scaling // functions are a function of luma. 
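/* Editor's note (illustrative, not part of the upstream sources): the chroma
 * blend constants below are fixed defaults rather than estimated quantities;
 * the estimated chroma behaviour is carried by the cb/cr scaling points above
 * and by the extra luma-correlation AR coefficient, and
 * chroma_scaling_from_luma is left disabled. */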
film_grain->cb_mult = 128; // 8 bits film_grain->cb_luma_mult = 192; // 8 bits film_grain->cb_offset = 256; // 9 bits film_grain->cr_mult = 128; // 8 bits film_grain->cr_luma_mult = 192; // 8 bits film_grain->cr_offset = 256; // 9 bits film_grain->chroma_scaling_from_luma = 0; film_grain->grain_scale_shift = 0; film_grain->overlap_flag = 1; return 1; } static void pointwise_multiply(const float *a, float *b, int n) { for (int i = 0; i < n; ++i) { b[i] *= a[i]; } } static float *get_half_cos_window(int block_size) { float *window_function = (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); if (!window_function) return NULL; for (int y = 0; y < block_size; ++y) { const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); for (int x = 0; x < block_size; ++x) { const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); window_function[y * block_size + x] = (float)(cos_yd * cos_xd); } } return window_function; } #define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ static void dither_and_quantize_##suffix( \ float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ float block_normalization) { \ for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ const int result_idx = \ (y + (block_size >> chroma_sub_h)) * result_stride + x + \ (block_size >> chroma_sub_w); \ INT_TYPE new_val = (INT_TYPE)AOMMIN( \ AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ block_normalization); \ const float err = \ -(((float)new_val) / block_normalization - result[result_idx]); \ denoised[y * stride + x] = new_val; \ if (x + 1 < (w >> chroma_sub_w)) { \ result[result_idx + 1] += err * 7.0f / 16.0f; \ } \ if (y + 1 < (h >> chroma_sub_h)) { \ if (x > 0) { \ result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ } \ result[result_idx + result_stride] += err * 5.0f / 16.0f; \ if (x + 1 < (w >> chroma_sub_w)) { \ result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ } \ } \ } \ } \ } DITHER_AND_QUANTIZE(uint8_t, lowbd) DITHER_AND_QUANTIZE(uint16_t, highbd) int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], int w, int h, int stride[3], int chroma_sub[2], float *noise_psd[3], int block_size, int bit_depth, int use_highbd) { float *plane = NULL, *block = NULL, *window_full = NULL, *window_chroma = NULL; double *block_d = NULL, *plane_d = NULL; struct aom_noise_tx_t *tx_full = NULL; struct aom_noise_tx_t *tx_chroma = NULL; const int num_blocks_w = (w + block_size - 1) / block_size; const int num_blocks_h = (h + block_size - 1) / block_size; const int result_stride = (num_blocks_w + 2) * block_size; const int result_height = (num_blocks_h + 2) * block_size; float *result = NULL; int init_success = 1; aom_flat_block_finder_t block_finder_full; aom_flat_block_finder_t block_finder_chroma; const float kBlockNormalization = (float)((1 << bit_depth) - 1); if (chroma_sub[0] != chroma_sub[1]) { fprintf(stderr, "aom_wiener_denoise_2d doesn't handle different chroma " "subsampling\n"); return 0; } init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, bit_depth, use_highbd); result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * sizeof(*result)); plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); block = (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); plane_d = (double 
*)aom_malloc(block_size * block_size * sizeof(*plane_d)); window_full = get_half_cos_window(block_size); tx_full = aom_noise_tx_malloc(block_size); if (chroma_sub[0] != 0) { init_success &= aom_flat_block_finder_init(&block_finder_chroma, block_size >> chroma_sub[0], bit_depth, use_highbd); window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); } else { window_chroma = window_full; tx_chroma = tx_full; } init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && (plane_d != NULL) && (block != NULL) && (block_d != NULL) && (window_full != NULL) && (window_chroma != NULL) && (result != NULL); for (int c = init_success ? 0 : 3; c < 3; ++c) { float *window_function = c == 0 ? window_full : window_chroma; aom_flat_block_finder_t *block_finder = &block_finder_full; const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; struct aom_noise_tx_t *tx = (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; if (!data[c] || !denoised[c]) continue; if (c > 0 && chroma_sub[0] != 0) { block_finder = &block_finder_chroma; } memset(result, 0, sizeof(*result) * result_stride * result_height); // Do overlapped block processing (half overlapped). The block rows can // easily be done in parallel for (int offsy = 0; offsy < (block_size >> chroma_sub_h); offsy += (block_size >> chroma_sub_h) / 2) { for (int offsx = 0; offsx < (block_size >> chroma_sub_w); offsx += (block_size >> chroma_sub_w) / 2) { // Pad the boundary when processing each block-set. for (int by = -1; by < num_blocks_h; ++by) { for (int bx = -1; bx < num_blocks_w; ++bx) { const int pixels_per_block = (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); aom_flat_block_finder_extract_block( block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, stride[c], bx * (block_size >> chroma_sub_w) + offsx, by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); for (int j = 0; j < pixels_per_block; ++j) { block[j] = (float)block_d[j]; plane[j] = (float)plane_d[j]; } pointwise_multiply(window_function, block, pixels_per_block); aom_noise_tx_forward(tx, block); aom_noise_tx_filter(tx, noise_psd[c]); aom_noise_tx_inverse(tx, block); // Apply window function to the plane approximation (we will apply // it to the sum of plane + block when composing the results). 
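/* Editor's note (illustrative, not part of the upstream sources): the
 * statement below windows `plane` once, mirroring the windowing `block`
 * received before the forward transform, and the accumulation then multiplies
 * (block + plane) by the window again, so each pixel is added with weight
 * w(x)^2 * w(y)^2. Because the half-cosine window satisfies
 *   w(t)^2 + w(t + N/2)^2 = sin^2 + cos^2 = 1,
 * the four half-overlapped passes accumulate to unit weight and `result`
 * needs no further normalization before dithering. */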
pointwise_multiply(window_function, plane, pixels_per_block); for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { const int y_result = y + (by + 1) * (block_size >> chroma_sub_h) + offsy; for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { const int x_result = x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; result[y_result * result_stride + x_result] += (block[y * (block_size >> chroma_sub_w) + x] + plane[y * (block_size >> chroma_sub_w) + x]) * window_function[y * (block_size >> chroma_sub_w) + x]; } } } } } } if (use_highbd) { dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], w, h, stride[c], chroma_sub_w, chroma_sub_h, block_size, kBlockNormalization); } else { dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, stride[c], chroma_sub_w, chroma_sub_h, block_size, kBlockNormalization); } } aom_free(result); aom_free(plane); aom_free(block); aom_free(plane_d); aom_free(block_d); aom_free(window_full); aom_noise_tx_free(tx_full); aom_flat_block_finder_free(&block_finder_full); if (chroma_sub[0] != 0) { aom_flat_block_finder_free(&block_finder_chroma); aom_free(window_chroma); aom_noise_tx_free(tx_chroma); } return init_success; } struct aom_denoise_and_model_t { int block_size; int bit_depth; float noise_level; // Size of current denoised buffer and flat_block buffer int width; int height; int y_stride; int uv_stride; int num_blocks_w; int num_blocks_h; // Buffers for image and noise_psd allocated on the fly float *noise_psd[3]; uint8_t *denoised[3]; uint8_t *flat_blocks; aom_flat_block_finder_t flat_block_finder; aom_noise_model_t noise_model; }; struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, int block_size, float noise_level) { struct aom_denoise_and_model_t *ctx = (struct aom_denoise_and_model_t *)aom_malloc( sizeof(struct aom_denoise_and_model_t)); if (!ctx) { fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); return NULL; } memset(ctx, 0, sizeof(*ctx)); ctx->block_size = block_size; ctx->noise_level = noise_level; ctx->bit_depth = bit_depth; ctx->noise_psd[0] = (float *)aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); ctx->noise_psd[1] = (float *)aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); ctx->noise_psd[2] = (float *)aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { fprintf(stderr, "Unable to allocate noise PSD buffers\n"); aom_denoise_and_model_free(ctx); return NULL; } return ctx; } void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { aom_free(ctx->flat_blocks); for (int i = 0; i < 3; ++i) { aom_free(ctx->denoised[i]); aom_free(ctx->noise_psd[i]); } aom_noise_model_free(&ctx->noise_model); aom_flat_block_finder_free(&ctx->flat_block_finder); aom_free(ctx); } static int denoise_and_model_realloc_if_necessary( struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd) { if (ctx->width == sd->y_width && ctx->height == sd->y_height && ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) return 1; const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; const int block_size = ctx->block_size; ctx->width = sd->y_width; ctx->height = sd->y_height; ctx->y_stride = sd->y_stride; ctx->uv_stride = sd->uv_stride; for (int i = 0; i < 3; ++i) { aom_free(ctx->denoised[i]); ctx->denoised[i] = NULL; } aom_free(ctx->flat_blocks); ctx->flat_blocks = NULL; ctx->denoised[0] = (uint8_t *)aom_malloc((sd->y_stride * sd->y_height) << 
use_highbd); ctx->denoised[1] = (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); ctx->denoised[2] = (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) { fprintf(stderr, "Unable to allocate denoise buffers\n"); return 0; } ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size; ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size; ctx->flat_blocks = (uint8_t *)aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h); if (!ctx->flat_blocks) { fprintf(stderr, "Unable to allocate flat_blocks buffer\n"); return 0; } aom_flat_block_finder_free(&ctx->flat_block_finder); if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size, ctx->bit_depth, use_highbd)) { fprintf(stderr, "Unable to init flat block finder\n"); return 0; } const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, ctx->bit_depth, use_highbd }; aom_noise_model_free(&ctx->noise_model); if (!aom_noise_model_init(&ctx->noise_model, params)) { fprintf(stderr, "Unable to init noise model\n"); return 0; } // Simply use a flat PSD (although we could use the flat blocks to estimate // PSD) those to estimate an actual noise PSD) const float y_noise_level = aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level); const float uv_noise_level = aom_noise_psd_get_default_value( ctx->block_size >> sd->subsampling_x, ctx->noise_level); for (int i = 0; i < block_size * block_size; ++i) { ctx->noise_psd[0][i] = y_noise_level; ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level; } return 1; } // TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer // are null pointers) correctly. int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd, aom_film_grain_t *film_grain, int apply_denoise) { const int block_size = ctx->block_size; const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; uint8_t *raw_data[3] = { use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer, use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer, use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer, }; const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] }; int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride }; int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y }; if (!denoise_and_model_realloc_if_necessary(ctx, sd)) { fprintf(stderr, "Unable to realloc buffers\n"); return 0; } aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width, sd->y_height, strides[0], ctx->flat_blocks); if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->noise_psd, block_size, ctx->bit_depth, use_highbd)) { fprintf(stderr, "Unable to denoise image\n"); return 0; } const aom_noise_status_t status = aom_noise_model_update( &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised, sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks, block_size); int have_noise_estimate = 0; if (status == AOM_NOISE_STATUS_OK) { have_noise_estimate = 1; } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { aom_noise_model_save_latest(&ctx->noise_model); have_noise_estimate = 1; } else { // Unable to update noise model; proceed if we have a previous estimate. 
have_noise_estimate = (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0); } film_grain->apply_grain = 0; if (have_noise_estimate) { if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) { fprintf(stderr, "Unable to get grain parameters.\n"); return 0; } if (!film_grain->random_seed) { film_grain->random_seed = 7391; } if (apply_denoise) { memcpy(raw_data[0], ctx->denoised[0], (strides[0] * sd->y_height) << use_highbd); if (!sd->monochrome) { memcpy(raw_data[1], ctx->denoised[1], (strides[1] * sd->uv_height) << use_highbd); memcpy(raw_data[2], ctx->denoised[2], (strides[2] * sd->uv_height) << use_highbd); } } } return 1; } aom-3.12.1/aom_dsp/noise_model.h000066400000000000000000000324111477627663500164720ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_NOISE_MODEL_H_ #define AOM_AOM_DSP_NOISE_MODEL_H_ #ifdef __cplusplus extern "C" { #endif // __cplusplus #include #include "aom_dsp/grain_params.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" /*!\brief Wrapper of data required to represent linear system of eqns and soln. */ typedef struct { double *A; double *b; double *x; int n; } aom_equation_system_t; /*!\brief Representation of a piecewise linear curve * * Holds n points as (x, y) pairs, that store the curve. */ typedef struct { double (*points)[2]; int num_points; } aom_noise_strength_lut_t; /*!\brief Init the noise strength lut with the given number of points*/ int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points); /*!\brief Frees the noise strength lut. */ void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut); /*!\brief Evaluate the lut at the point x. * * \param[in] lut The lut data. * \param[in] x The coordinate to evaluate the lut. */ double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, double x); /*!\brief Helper struct to model noise strength as a function of intensity. * * Internally, this structure holds a representation of a linear system * of equations that models noise strength (standard deviation) as a * function of intensity. The mapping is initially stored using a * piecewise representation with evenly spaced bins that cover the entire * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a * constraint of the form: * y_{i} (1 - a) + y_{i+1} a = y * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding * normal equations. * * As there may be missing data, the solution is regularized to get a * complete set of values for the bins. A reduced representation after * solving can be obtained by getting the corresponding noise_strength_lut_t. */ typedef struct { aom_equation_system_t eqns; double min_intensity; double max_intensity; int num_bins; int num_equations; double total; } aom_noise_strength_solver_t; /*!\brief Initializes the noise solver with the given number of bins. * * Returns 0 if initialization fails. 
* * \param[in] solver The noise solver to be initialized. * \param[in] num_bins Number of bins to use in the internal representation. * \param[in] bit_depth The bit depth used to derive {min,max}_intensity. */ int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, int num_bins, int bit_depth); void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver); /*!\brief Gets the x coordinate of bin i. * * \param[in] i The bin whose coordinate to query. */ double aom_noise_strength_solver_get_center( const aom_noise_strength_solver_t *solver, int i); /*!\brief Add an observation of the block mean intensity to its noise strength. * * \param[in] block_mean The average block intensity, * \param[in] noise_std The observed noise strength. */ void aom_noise_strength_solver_add_measurement( aom_noise_strength_solver_t *solver, double block_mean, double noise_std); /*!\brief Solves the current set of equations for the noise strength. */ int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver); /*!\brief Fits a reduced piecewise linear lut to the internal solution * * \param[in] max_num_points The maximum number of output points * \param[out] lut The output piecewise linear lut. */ int aom_noise_strength_solver_fit_piecewise( const aom_noise_strength_solver_t *solver, int max_num_points, aom_noise_strength_lut_t *lut); /*!\brief Helper for holding precomputed data for finding flat blocks. * * Internally a block is modeled with a low-order polynomial model. A * planar model would be a bunch of equations like: * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i * for each point in the block. The system matrix A with row i as [y_i x_i 1] * is maintained as is the inverse, inv(A'*A), so that the plane parameters * can be fit for each block. */ typedef struct { double *AtA_inv; double *A; int num_params; // The number of parameters used for internal low-order model int block_size; // The block size the finder was initialized with double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) int use_highbd; // Whether input data should be interpreted as uint16 } aom_flat_block_finder_t; /*!\brief Init the block_finder with the given block size, bit_depth */ int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, int block_size, int bit_depth, int use_highbd); void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); /*!\brief Helper to extract a block and low order "planar" model. */ void aom_flat_block_finder_extract_block( const aom_flat_block_finder_t *block_finder, const uint8_t *const data, int w, int h, int stride, int offsx, int offsy, double *plane, double *block); /*!\brief Runs the flat block finder on the input data. * * Find flat blocks in the input image data. Returns a map of * flat_blocks, where the value of flat_blocks map will be non-zero * when a block is determined to be flat. A higher value indicates a bigger * confidence in the decision. */ int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, const uint8_t *const data, int w, int h, int stride, uint8_t *flat_blocks); // The noise shape indicates the allowed coefficients in the AR model. enum { AOM_NOISE_SHAPE_DIAMOND = 0, AOM_NOISE_SHAPE_SQUARE = 1 } UENUM1BYTE(aom_noise_shape); // The parameters of the noise model include the shape type, lag, the // bit depth of the input images provided, and whether the input images // will be using uint16 (or uint8) representation. 
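/* Editor's note (illustrative, not part of the upstream sources): for the
 * causal neighborhood built by aom_noise_model_init(), the square shape gives
 * n = 2 * lag * (lag + 1) AR coefficients (e.g. 24 for lag = 3) and the
 * diamond shape gives n = lag * (lag + 1) (e.g. 12 for lag = 3); chroma
 * channels internally carry one extra coefficient for correlation with luma.
 * A minimal usage sketch, assuming an 8-bit source:
 *
 *   aom_noise_model_t model;
 *   const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, 8,
 *                                             0 };
 *   if (aom_noise_model_init(&model, params)) {
 *     // ... feed frames with aom_noise_model_update() ...
 *     aom_noise_model_free(&model);
 *   }
 */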
typedef struct { aom_noise_shape shape; int lag; int bit_depth; int use_highbd; } aom_noise_model_params_t; /*!\brief State of a noise model estimate for a single channel. * * This contains a system of equations that can be used to solve * for the auto-regressive coefficients as well as a noise strength * solver that can be used to model noise strength as a function of * intensity. */ typedef struct { aom_equation_system_t eqns; aom_noise_strength_solver_t strength_solver; int num_observations; // The number of observations in the eqn system double ar_gain; // The gain of the current AR filter } aom_noise_state_t; /*!\brief Complete model of noise for a planar video * * This includes a noise model for the latest frame and an aggregated * estimate over all previous frames that had similar parameters. */ typedef struct { aom_noise_model_params_t params; aom_noise_state_t combined_state[3]; // Combined state per channel aom_noise_state_t latest_state[3]; // Latest state per channel int (*coords)[2]; // Offsets (x,y) of the coefficient samples int n; // Number of parameters (size of coords) int bit_depth; } aom_noise_model_t; /*!\brief Result of a noise model update. */ enum { AOM_NOISE_STATUS_OK = 0, AOM_NOISE_STATUS_INVALID_ARGUMENT, AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, AOM_NOISE_STATUS_INTERNAL_ERROR, } UENUM1BYTE(aom_noise_status_t); /*!\brief Initializes a noise model with the given parameters. * * Returns 0 on failure. */ int aom_noise_model_init(aom_noise_model_t *model, const aom_noise_model_params_t params); void aom_noise_model_free(aom_noise_model_t *model); /*!\brief Updates the noise model with a new frame observation. * * Updates the noise model with measurements from the given input frame and a * denoised variant of it. Noise is sampled from flat blocks using the flat * block map. * * Returns a noise_status indicating if the update was successful. If the * Update was successful, the combined_state is updated with measurements from * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise * state will be updated with measurements from the provided frame. * * \param[in,out] noise_model The noise model to be updated * \param[in] data Raw frame data * \param[in] denoised Denoised frame data. * \param[in] w Frame width * \param[in] h Frame height * \param[in] strides Stride of the planes * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. * \param[in] flat_blocks A map to blocks that have been determined flat * \param[in] block_size The size of blocks. */ aom_noise_status_t aom_noise_model_update( aom_noise_model_t *const noise_model, const uint8_t *const data[3], const uint8_t *const denoised[3], int w, int h, int strides[3], int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size); /*\brief Save the "latest" estimate into the "combined" estimate. * * This is meant to be called when the noise modeling detected a change * in parameters (or for example, if a user wanted to reset estimation at * a shot boundary). */ void aom_noise_model_save_latest(aom_noise_model_t *noise_model); /*!\brief Converts the noise_model parameters to the corresponding * grain_parameters. * * The noise structs in this file are suitable for estimation (e.g., using * floats), but the grain parameters in the bitstream are quantized. This * function does the conversion by selecting the correct quantization levels. 
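 * (Editor's illustrative example, not part of the upstream sources: if the
 * fitted AR coefficients lie in [-0.4, 0.9], ar_coeff_shift is chosen as
 * clamp(7 - max(1 + floor(log2(0.9)), ceil(log2(0.4))), 6, 9) = 7, i.e. a
 * representable range of [-1, 1) in steps of 1/128, so 0.9 is stored as
 * round(128 * 0.9) = 115.)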
*/ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, aom_film_grain_t *film_grain); /*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd. * * \param[in] data Raw frame data * \param[out] denoised Denoised frame data * \param[in] w Frame width * \param[in] h Frame height * \param[in] stride Stride of the planes * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. * \param[in] noise_psd The power spectral density of the noise * \param[in] block_size The size of blocks * \param[in] bit_depth Bit depth of the image * \param[in] use_highbd If true, uint8 pointers are interpreted as * uint16 and stride is measured in uint16. * This must be true when bit_depth >= 10. */ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], int w, int h, int stride[3], int chroma_sub_log2[2], float *noise_psd[3], int block_size, int bit_depth, int use_highbd); struct aom_denoise_and_model_t; /*!\brief Denoise the buffer and model the residual noise. * * This is meant to be called sequentially on input frames. The input buffer * is denoised and the residual noise is modelled. The current noise estimate * is populated in film_grain. Returns true on success. The grain.apply_grain * parameter will be true when the input buffer was successfully denoised and * grain was modelled. Returns false on error. * * \param[in] ctx Struct allocated with * aom_denoise_and_model_alloc that holds some * buffers for denoising and the current noise * estimate. * \param[in,out] sd The raw input buffer to be denoised. * \param[out] grain Output film grain parameters * \param[in] apply_denoise Whether or not to apply the denoising to the * frame that will be encoded */ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd, aom_film_grain_t *grain, int apply_denoise); /*!\brief Allocates a context that can be used for denoising and noise modeling. * * \param[in] bit_depth Bit depth of buffers this will be run on. * \param[in] block_size Block size for noise modeling and flat block * estimation * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for * higher levels of noise) */ struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, int block_size, float noise_level); /*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc */ void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // AOM_AOM_DSP_NOISE_MODEL_H_ aom-3.12.1/aom_dsp/noise_util.c000066400000000000000000000155301477627663500163450ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include "aom_dsp/noise_util.h" #include "aom_dsp/fft_common.h" #include "aom_mem/aom_mem.h" #include "config/aom_dsp_rtcd.h" float aom_noise_psd_get_default_value(int block_size, float factor) { return (factor * factor / 10000) * block_size * block_size / 8; } // Internal representation of noise transform. It keeps track of the // transformed data and a temporary working buffer to use during the // transform. struct aom_noise_tx_t { float *tx_block; float *temp; int block_size; void (*fft)(const float *, float *, float *); void (*ifft)(const float *, float *, float *); }; struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { struct aom_noise_tx_t *noise_tx = (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); if (!noise_tx) return NULL; memset(noise_tx, 0, sizeof(*noise_tx)); switch (block_size) { case 2: noise_tx->fft = aom_fft2x2_float; noise_tx->ifft = aom_ifft2x2_float; break; case 4: noise_tx->fft = aom_fft4x4_float; noise_tx->ifft = aom_ifft4x4_float; break; case 8: noise_tx->fft = aom_fft8x8_float; noise_tx->ifft = aom_ifft8x8_float; break; case 16: noise_tx->fft = aom_fft16x16_float; noise_tx->ifft = aom_ifft16x16_float; break; case 32: noise_tx->fft = aom_fft32x32_float; noise_tx->ifft = aom_ifft32x32_float; break; default: aom_free(noise_tx); fprintf(stderr, "Unsupported block size %d\n", block_size); return NULL; } noise_tx->block_size = block_size; noise_tx->tx_block = (float *)aom_memalign( 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); noise_tx->temp = (float *)aom_memalign( 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); if (!noise_tx->tx_block || !noise_tx->temp) { aom_noise_tx_free(noise_tx); return NULL; } // Clear the buffers up front. Some outputs of the forward transform are // real only (the imaginary component will never be touched) memset(noise_tx->tx_block, 0, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); memset(noise_tx->temp, 0, 2 * sizeof(*noise_tx->temp) * block_size * block_size); return noise_tx; } void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); } void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { const int block_size = noise_tx->block_size; const float kBeta = 1.1f; const float kEps = 1e-6f; for (int y = 0; y < block_size; ++y) { for (int x = 0; x < block_size; ++x) { int i = y * block_size + x; float *c = noise_tx->tx_block + 2 * i; const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f); const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f); const float p = c0 * c0 + c1 * c1; if (p > kBeta * psd[i] && p > 1e-6) { noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); } else { noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; } } } } void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { const int n = noise_tx->block_size * noise_tx->block_size; noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); for (int i = 0; i < n; ++i) { data[i] /= n; } } void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, float *psd) { const int block_size = noise_tx->block_size; for (int yb = 0; yb < block_size; ++yb) { for (int xb = 0; xb <= block_size / 2; ++xb) { float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; } } } void 
aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { if (!noise_tx) return; aom_free(noise_tx->tx_block); aom_free(noise_tx->temp); aom_free(noise_tx); } double aom_normalized_cross_correlation(const double *a, const double *b, int n) { double c = 0; double a_len = 0; double b_len = 0; for (int i = 0; i < n; ++i) { a_len += a[i] * a[i]; b_len += b[i] * b[i]; c += a[i] * b[i]; } return c / (sqrt(a_len) * sqrt(b_len)); } int aom_noise_data_validate(const double *data, int w, int h) { const double kVarianceThreshold = 2; const double kMeanThreshold = 2; int x = 0, y = 0; int ret_value = 1; double var = 0, mean = 0; double *mean_x, *mean_y, *var_x, *var_y; // Check that noise variance is not increasing in x or y // and that the data is zero mean. mean_x = (double *)aom_calloc(w, sizeof(*mean_x)); var_x = (double *)aom_calloc(w, sizeof(*var_x)); mean_y = (double *)aom_calloc(h, sizeof(*mean_x)); var_y = (double *)aom_calloc(h, sizeof(*var_y)); if (!(mean_x && var_x && mean_y && var_y)) { aom_free(mean_x); aom_free(mean_y); aom_free(var_x); aom_free(var_y); return 0; } for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { const double d = data[y * w + x]; var_x[x] += d * d; var_y[y] += d * d; mean_x[x] += d; mean_y[y] += d; var += d * d; mean += d; } } mean /= (w * h); var = var / (w * h) - mean * mean; for (y = 0; y < h; ++y) { mean_y[y] /= h; var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y]; if (fabs(var_y[y] - var) >= kVarianceThreshold) { fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var); ret_value = 0; break; } if (fabs(mean_y[y] - mean) >= kMeanThreshold) { fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean); ret_value = 0; break; } } for (x = 0; x < w; ++x) { mean_x[x] /= w; var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x]; if (fabs(var_x[x] - var) >= kVarianceThreshold) { fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var); ret_value = 0; break; } if (fabs(mean_x[x] - mean) >= kMeanThreshold) { fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean); ret_value = 0; break; } } aom_free(mean_x); aom_free(mean_y); aom_free(var_x); aom_free(var_y); return ret_value; } aom-3.12.1/aom_dsp/noise_util.h000066400000000000000000000057011477627663500163510ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_NOISE_UTIL_H_ #define AOM_AOM_DSP_NOISE_UTIL_H_ #ifdef __cplusplus extern "C" { #endif // __cplusplus // aom_noise_tx_t is an abstraction of a transform that is used for denoising. // It is meant to be lightweight and does hold the transformed data (as // the user should not be manipulating the transformed data directly). struct aom_noise_tx_t; // Allocates and returns a aom_noise_tx_t useful for denoising the given // block_size. The resulting aom_noise_tx_t should be free'd with // aom_noise_tx_free. struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size); void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx); // Transforms the internal data and holds it in the aom_noise_tx's internal // buffer. 
For compatibility with existing SIMD implementations, "data" must // be 32-byte aligned. void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx, const float *data); // Filters aom_noise_tx's internal data using the provided noise power spectral // density. The PSD must be at least block_size * block_size and should be // populated with a constant or via estimates taken from // aom_noise_tx_add_energy. void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd); // Performs an inverse transform using the internal transform data. // For compatibility with existing SIMD implementations, "data" must be 32-byte // aligned. void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data); // Aggregates the power of the buffered transform data into the psd buffer. void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx, float *psd); // Returns a default value suitable for denosing a transform of the given // block_size. The noise "factor" determines the strength of the noise to // be removed. A value of about 2.5 can be used for moderate denoising, // where a value of 5.0 can be used for a high level of denoising. float aom_noise_psd_get_default_value(int block_size, float factor); // Computes normalized cross correlation of two vectors a and b of length n. double aom_normalized_cross_correlation(const double *a, const double *b, int n); // Validates the correlated noise in the data buffer of size (w, h). int aom_noise_data_validate(const double *data, int w, int h); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // AOM_AOM_DSP_NOISE_UTIL_H_ aom-3.12.1/aom_dsp/odintrin.c000066400000000000000000000665151477627663500160320ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* clang-format off */ #include "aom_dsp/odintrin.h" /*Constants for use with OD_DIVU_SMALL(). See \cite{Rob05} for details on computing these constants. @INPROCEEDINGS{Rob05, author="Arch D. Robison", title="{N}-bit Unsigned Division via {N}-bit Multiply-Add", booktitle="Proc. 
of the 17th IEEE Symposium on Computer Arithmetic (ARITH'05)", pages="131--139", address="Cape Cod, MA", month=Jun, year=2005 }*/ uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = { { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 }, { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 }, { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 }, { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 }, { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 }, { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 }, { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 }, { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 }, { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 }, { 0x8D3DCB09, 0 }, { 0x88888889, 0 }, { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 }, { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 }, { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 }, { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 }, { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 }, { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 }, { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED }, { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 }, { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 }, { 0x8AD8F2FC, 0 }, { 0x88888889, 0 }, { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 }, { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 }, { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 }, { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 }, { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 }, { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 }, { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 }, { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 }, { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 }, { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 }, { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 }, { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 }, { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 }, { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 }, { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 }, { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 }, { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 }, { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 }, { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 }, { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 }, { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED }, { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F }, { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 }, { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE }, { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 }, { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 }, { 0x89AE408A, 0 }, { 0x88888889, 0 }, { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 }, { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 }, { 0x83126E98, 0 }, { 0x82082082, 0x82082082 }, { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 }, { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 }, { 0xF6603D99, 0 }, { 0xF4898D60, 0 }, { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 }, { 0xEF2EB720, 0 }, { 0xED7303B6, 0 }, { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 }, { 0xE525982B, 0 }, { 0xE38E38E4, 0 }, { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C }, { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, { 0xDBEB61EF, 0 }, { 
0xDA740DA8, 0 }, { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 }, { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 }, { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 }, { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 }, { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 }, { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 }, { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 }, { 0xC6980C6A, 0 }, { 0xC565C87C, 0 }, { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 }, { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 }, { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 }, { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 }, { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 }, { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 }, { 0xB30F6353, 0 }, { 0xB21642C9, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 }, { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 }, { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 }, { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 }, { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 }, { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 }, { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 }, { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 }, { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B }, { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 }, { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 }, { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 }, { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C }, { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 }, { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED }, { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C }, { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F }, { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 }, { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 }, { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE }, { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D }, { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 }, { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C }, { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 }, { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 }, { 0x891AC73B, 0 }, { 0x88888889, 0 }, { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F }, { 0x86D90545, 0 }, { 0x864B8A7E, 0 }, { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 }, { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 }, { 0x83993052, 0x83993052 }, { 0x83126E98, 0 }, { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 }, { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 }, { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFF00FF01, 0 }, { 0xFE03F810, 0 }, { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 }, { 0xFB188566, 0 }, { 0xFA232CF3, 0 }, { 0xF92FB222, 0 }, { 0xF83E0F84, 0 }, { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 }, { 0xF57403D6, 0 }, { 0xF4898D60, 0 }, { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 }, { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 }, { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 }, { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 }, { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 }, { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE9396520, 0 }, { 0xE865AC7C, 0 }, { 0xE79372E3, 0 }, { 0xE6C2B449, 0 }, { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 }, { 0xE45932D8, 0 }, { 0xE38E38E4, 0 }, { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 }, { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C }, { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 }, { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 }, { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 }, { 0xD9BA4257, 0 }, { 0xD901B204, 0 }, { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 }, { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 }, { 0xD578E97D, 0 }, { 0xD4C77B04, 0 }, { 0xD417328A, 0 }, { 0xD3680D37, 0 }, { 
0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 }, { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 }, { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 }, { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 }, { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 }, { 0xCC29786D, 0 }, { 0xCB8727C1, 0 }, { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 }, { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 }, { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 }, { 0xC73293D8, 0 }, { 0xC6980C6A, 0 }, { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 }, { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 }, { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 }, { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 }, { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 }, { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 }, { 0xBCDD535E, 0 }, { 0xBC52640C, 0 }, { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 }, { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 }, { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA }, { 0xB89BC36D, 0 }, { 0xB81702E1, 0 }, { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 }, { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 }, { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 }, { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 }, { 0xB2927C2A, 0 }, { 0xB21642C9, 0 }, { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 }, { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 }, { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 }, { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 }, { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 }, { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 }, { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 }, { 0xAA392F36, 0 }, { 0xA9C84A48, 0 }, { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 }, { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 }, { 0xA79C7B17, 0 }, { 0xA72F053A, 0 }, { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 }, { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 }, { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 }, { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 }, { 0xA36E71A3, 0 }, { 0xA3065E40, 0 }, { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B }, { 0xA1D13986, 0 }, { 0xA16B312F, 0 }, { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 }, { 0xA03C1689, 0 }, { 0x9FD809FE, 0 }, { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 }, { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 }, { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 }, { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 }, { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C }, { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 }, { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 }, { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 }, { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 }, { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED }, { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 }, { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C }, { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 }, { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F }, { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 }, { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 }, { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 }, { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 }, { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 }, { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 }, { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 }, { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE }, { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 }, { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D }, { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 }, { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 }, { 
0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 }, { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C }, { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A }, { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 }, { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 }, { 0x89F8746A, 0 }, { 0x89AE408A, 0 }, { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 }, { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 }, { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 }, { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F }, { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 }, { 0x869222B2, 0 }, { 0x864B8A7E, 0 }, { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 }, { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 }, { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 }, { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 }, { 0x8355ACE4, 0 }, { 0x83126E98, 0 }, { 0x82CF7504, 0 }, { 0x828CBFBF, 0 }, { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 }, { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 }, { 0x814327E4, 0 }, { 0x81020408, 0x81020408 }, { 0x80C121B3, 0 }, { 0x80808081, 0 }, { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 }, { 0xFE823CA6, 0 }, { 0xFE03F810, 0 }, { 0xFD863087, 0 }, { 0xFD08E551, 0 }, { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 }, { 0xFB93E673, 0 }, { 0xFB188566, 0 }, { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 }, { 0xF9A9342D, 0 }, { 0xF92FB222, 0 }, { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 }, { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 }, { 0xF6D7054E, 0 }, { 0xF6603D99, 0 }, { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 }, { 0xF4FE9083, 0 }, { 0xF4898D60, 0 }, { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 }, { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 }, { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 }, { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 }, { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 }, { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 }, { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 }, { 0xEDE155F4, 0 }, { 0xED7303B6, 0 }, { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 }, { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 }, { 0xEB5159A0, 0 }, { 0xEAE56404, 0 }, { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 }, { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 }, { 0xE7FC600F, 0 }, { 0xE79372E3, 0 }, { 0xE72AE476, 0 }, { 0xE6C2B449, 0 }, { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE58C544A, 0 }, { 0xE525982B, 0 }, { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 }, { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 }, { 0xE32942FF, 0 }, { 0xE2C4A689, 0 }, { 0xE260630B, 0 }, { 0xE1FC780F, 0 }, { 0xE198E520, 0 }, { 0xE135A9CA, 0 }, { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C }, { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 }, { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 }, { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 }, { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 }, { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 }, { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 }, { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 }, { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 }, { 0xD95DD300, 0 }, { 0xD901B204, 0 }, { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 }, { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 }, { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 }, { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 }, { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 }, { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 }, { 0xD46F3235, 0 }, { 0xD417328A, 0 }, { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 }, { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 }, { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 }, { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E }, { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 }, { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 }, { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 }, { 
0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 }, { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 }, { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 }, { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 }, { 0xCC7B0200, 0 }, { 0xCC29786D, 0 }, { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 }, { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA95906C, 0 }, { 0xCA4587E7, 0 }, { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 }, { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 }, { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 }, { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 }, { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 }, { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 }, { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 }, { 0xC5B200C6, 0 }, { 0xC565C87C, 0 }, { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 }, { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 }, { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 }, { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 }, { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C }, { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 }, { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 }, { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBF589280, 0 }, { 0xBF112A8B, 0 }, { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 }, { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 }, { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 }, { 0xBD231803, 0 }, { 0xBCDD535E, 0 }, { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 }, { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 }, { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 }, { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 }, { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 }, { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A }, { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA }, { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 }, { 0xB8594B41, 0 }, { 0xB81702E1, 0 }, { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 }, { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 }, { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 }, { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 }, { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 }, { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 }, { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 }, { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB34E1884, 0 }, { 0xB30F6353, 0 }, { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 }, { 0xB25449D7, 0 }, { 0xB21642C9, 0 }, { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 }, { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 }, { 0xB068BE31, 0 }, { 0xB02C0B03, 0 }, { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 }, { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 }, { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 }, { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 }, { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 }, { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 }, { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 }, { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 }, { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 }, { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 }, { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 }, { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 }, { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 }, { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 }, { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 }, { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 }, { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 }, { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 }, { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 }, { 0xA765AE44, 0 }, { 0xA72F053A, 0 }, { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 }, { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 }, { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA5B4449D, 0 }, { 0xA57EB503, 0 }, { 
0xA54947FE, 0 }, { 0xA513FD6C, 0 }, { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 }, { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 }, { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 }, { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 }, { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 }, { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 }, { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B }, { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 }, { 0xA19E2540, 0 }, { 0xA16B312F, 0 }, { 0xA1385D35, 0 }, { 0xA105A933, 0 }, { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 }, { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 }, { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 }, { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 }, { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 }, { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 }, { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 }, { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 }, { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 }, { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B }, { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C }, { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 }, { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 }, { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 }, { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 }, { 0x9A056A31, 0 }, { 0x99D722DB, 0 }, { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B }, { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 }, { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 }, { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 }, { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 }, { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED }, { 0x97874039, 0 }, { 0x975A7510, 0 }, { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 }, { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 }, { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C }, { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 }, { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 }, { 0x95747844, 0 }, { 0x9548E498, 0 }, { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F }, { 0x94C6C187, 0 }, { 0x949B92DE, 0 }, { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 }, { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 }, { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 }, { 0x9370049C, 0 }, { 0x93459BE7, 0 }, { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 }, { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 }, { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 }, { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 }, { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 }, { 0x917952AF, 0 }, { 0x91500915, 0x91500915 }, { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 }, { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 }, { 0x9031910A, 0 }, { 0x90090090, 0x90090090 }, { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE }, { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 }, { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 }, { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 }, { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D }, { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 }, { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 }, { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 }, { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 }, { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF }, { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 }, { 0x8C7C057D, 0 }, { 0x8C55841D, 0 }, { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C }, { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 }, { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A }, { 0x8B4A451A, 0 }, { 0x8B246A88, 0 }, { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 }, { 0x8AB355E0, 0x8AB355E0 }, { 
0x8A8DCD20, 0 }, { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 }, { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 }, { 0x89D3507D, 0 }, { 0x89AE408A, 0 }, { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F }, { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 }, { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD }, { 0x88ACFAEE, 0 }, { 0x88888889, 0 }, { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 }, { 0x881BA59E, 0 }, { 0x87F78088, 0 }, { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 }, { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F }, { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC }, { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 }, { 0x86B58AA8, 0 }, { 0x869222B2, 0 }, { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 }, { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 }, { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 }, { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 }, { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 }, { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 }, { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, { 0x8487A2D1, 0 }, { 0x84655D9C, 0 }, { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 }, { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 }, { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 }, { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 }, { 0x83340520, 0x83340520 }, { 0x83126E98, 0 }, { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 }, { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 }, { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 }, { 0x82292F08, 0 }, { 0x82082082, 0x82082082 }, { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC }, { 0x81A55963, 0 }, { 0x81848DA9, 0 }, { 0x8163D283, 0 }, { 0x814327E4, 0 }, { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 }, { 0x80E18AB3, 0 }, { 0x80C121B3, 0 }, { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 }, { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 }, { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF } }; aom-3.12.1/aom_dsp/odintrin.h000066400000000000000000000047661477627663500160370ustar00rootroot00000000000000/* * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* clang-format off */ #ifndef AOM_AOM_DSP_ODINTRIN_H_ #define AOM_AOM_DSP_ODINTRIN_H_ #include #include #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/bitops.h" #ifdef __cplusplus extern "C" { #endif typedef int od_coeff; #define OD_DIVU_DMAX (1024) extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; /*Translate unsigned division by small divisors into multiplications.*/ #define OD_DIVU_SMALL(_x, _d) \ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \ 32) >> \ (OD_ILOG_NZ(_d) - 1)) #define OD_DIVU(_x, _d) \ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d))) #define OD_MINI AOMMIN #define OD_MAXI AOMMAX #define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max))) /*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer. 
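  (Added example for illustration; it is not part of the original comment.
  OD_ILOG_NZ() supplies the final shift used by OD_DIVU_SMALL above: for example
  OD_ILOG_NZ(3) == 2, so OD_DIVU_SMALL(x, 3) expands to
    (uint32_t)((OD_DIVU_SMALL_CONSTS[2][0] * (uint64_t)(x) +
                OD_DIVU_SMALL_CONSTS[2][1]) >> 32) >> 1,
  i.e. a multiply by a precomputed 32-bit reciprocal, a shift right by 32, and a
  final shift right by OD_ILOG_NZ(3) - 1 == 1.)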
OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/ #define OD_ILOG_NZ(x) (1 + get_msb(x)) /*Enable special features for gcc and compatible compilers.*/ #if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) #define OD_GNUC_PREREQ(maj, min, pat) \ ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \ ((maj) << 16) + ((min) << 8) + pat) // NOLINT #else #define OD_GNUC_PREREQ(maj, min, pat) (0) #endif #if OD_GNUC_PREREQ(3, 4, 0) #define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) #else #define OD_WARN_UNUSED_RESULT #endif #if OD_GNUC_PREREQ(3, 4, 0) #define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x))) #else #define OD_ARG_NONNULL(x) #endif /*All of these macros should expect floats as arguments.*/ # define OD_SIGNMASK(a) (-((a) < 0)) # define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_ODINTRIN_H_ aom-3.12.1/aom_dsp/prob.h000066400000000000000000000147661477627663500151540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_PROB_H_ #define AOM_AOM_DSP_PROB_H_ #include #include #include "config/aom_config.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/entcode.h" #include "aom_ports/bitops.h" #include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { #endif typedef uint16_t aom_cdf_prob; #define CDF_SIZE(x) ((x) + 1) #define CDF_PROB_BITS 15 #define CDF_PROB_TOP (1 << CDF_PROB_BITS) /*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative probability (an "inverse" CDF). 
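  (Worked example, added for illustration: with CDF_PROB_BITS == 15 we have
  CDF_PROB_TOP == 32768, so a cumulative probability of one half (16384) is stored
  as AOM_ICDF(16384) == 16384, and a cumulative probability of one (32768) is
  stored as AOM_ICDF(32768) == 0.)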
This function converts from one representation to the other (and is its own inverse).*/ #define AOM_ICDF(x) (CDF_PROB_TOP - (x)) #define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF4(a0, a1, a2) \ AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF5(a0, a1, a2, a3) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF6(a0, a1, a2, a3, a4) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 #define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ a14) \ AOM_ICDF(a0) \ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ AOM_ICDF(CDF_PROB_TOP), 0 static inline uint8_t get_prob(unsigned int num, unsigned int den) { assert(den != 0); { const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); // (p > 255) ? 255 : (p < 1) ? 
1 : p; const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); return (uint8_t)clipped_prob; } } static inline void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { assert(nsymbs < 17); const int count = cdf[nsymbs]; // rate is computed in the spec as: // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) // In this case cdf[N] is |count|. // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all // nsymbs > 3. So the equation becomes: // 4 + (count > 15) + (count > 31) + (nsymbs > 3). // Note that the largest value for count is 32 (it is not incremented beyond // 32). So using that information: // count >> 4 is 0 for count from 0 to 15. // count >> 4 is 1 for count from 16 to 31. // count >> 4 is 2 for count == 31. // Now, the equation becomes: // 4 + (count >> 4) + (nsymbs > 3). const int rate = 4 + (count >> 4) + (nsymbs > 3); int i = 0; do { if (i < val) { cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate; } else { cdf[i] -= cdf[i] >> rate; } } while (++i < nsymbs - 1); cdf[nsymbs] += (count < 32); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_PROB_H_ aom-3.12.1/aom_dsp/psnr.c000066400000000000000000000375031477627663500151610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/psnr.h" #include "aom_scale/yv12config.h" #if CONFIG_INTERNAL_STATS #define STATIC #else #define STATIC static #endif // CONFIG_INTERNAL_STATS STATIC double aom_sse_to_psnr(double samples, double peak, double sse) { if (sse > 0.0) { const double psnr = 10.0 * log10(samples * peak * peak / sse); return psnr > MAX_PSNR ? 
MAX_PSNR : psnr; } else { return MAX_PSNR; } } #undef STATIC static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h) { int i, j; int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; sse += diff * diff; } a += a_stride; b += b_stride; } return sse; } #if CONFIG_AV1_HIGHBITDEPTH static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h) { const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); int64_t sse = 0; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { const int diff = a[j] - b[j]; sse += diff * diff; } a += a_stride; b += b_stride; } return sse; } #endif // CONFIG_AV1_HIGHBITDEPTH static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; int x, y; if (dw > 0) { total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, height); } if (dh > 0) { total_sse += encoder_sse(&a[(height - dh) * a_stride], a_stride, &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { total_sse += aom_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } a += 16 * a_stride; b += 16 * b_stride; } return total_sse; } #if CONFIG_AV1_HIGHBITDEPTH static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height, unsigned int input_shift) { const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); int64_t total_sse = 0; int x, y; for (y = 0; y < height; ++y) { for (x = 0; x < width; ++x) { int64_t diff; diff = (a[x] >> input_shift) - (b[x] >> input_shift); total_sse += diff * diff; } a += a_stride; b += b_stride; } return total_sse; } static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int64_t total_sse = 0; int x, y; const int dw = width % 16; const int dh = height % 16; if (dw > 0) { total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, height); } if (dh > 0) { total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { total_sse += aom_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } a += 16 * a_stride; b += 16 * b_stride; } return total_sse; } #endif // CONFIG_AV1_HIGHBITDEPTH uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, width, height) / (width * height); } uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, width, height) / (width * height); } uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, width, height) / (width * height); } int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int 
height) { return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); } int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, a->y_crop_width, a->y_crop_height); } int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, width, height); } int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, a->uv_crop_width, a->uv_crop_height); } int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, width, height); } int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, a->uv_crop_width, a->uv_crop_height); } #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, width, height) / (width * height); } uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, width, height) / (width * height); } uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height) { return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, width, height) / (width * height); } int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { return highbd_get_sse( a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); } int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, a->y_crop_width, a->y_crop_height); } int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, width, height); } int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) 
!= 0); return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, a->uv_crop_width, a->uv_crop_height); } int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, width, height); } int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, a->uv_crop_width, a->uv_crop_height); } #endif // CONFIG_AV1_HIGHBITDEPTH int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int plane, int highbd) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { switch (plane) { case 0: return aom_highbd_get_y_sse(a, b); case 1: return aom_highbd_get_u_sse(a, b); case 2: return aom_highbd_get_v_sse(a, b); default: assert(plane >= 0 && plane <= 2); return 0; } } else { switch (plane) { case 0: return aom_get_y_sse(a, b); case 1: return aom_get_u_sse(a, b); case 2: return aom_get_v_sse(a, b); default: assert(plane >= 0 && plane <= 2); return 0; } } #else (void)highbd; switch (plane) { case 0: return aom_get_y_sse(a, b); case 1: return aom_get_u_sse(a, b); case 2: return aom_get_v_sse(a, b); default: assert(plane >= 0 && plane <= 2); return 0; } #endif } #if CONFIG_AV1_HIGHBITDEPTH void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, uint32_t bit_depth, uint32_t in_bit_depth) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; uint32_t total_samples = 0; #if CONFIG_LIBVMAF_PSNR_PEAK double peak = (double)(255 << (in_bit_depth - 8)); #else double peak = (double)((1 << in_bit_depth) - 1); #endif // CONFIG_LIBVMAF_PSNR_PEAK const unsigned int input_shift = bit_depth - in_bit_depth; for (i = 0; i < 3; ++i) { const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; uint64_t sse; if (a->flags & YV12_FLAG_HIGHBITDEPTH) { if (input_shift) { sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h, input_shift); } else { sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); } } else { sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); } psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); total_sse += sse; total_samples += samples; } psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; psnr->psnr[0] = aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); // Compute PSNR based on stream bit depth if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { #if CONFIG_LIBVMAF_PSNR_PEAK peak = (double)(255 << (bit_depth - 8)); #else peak = (double)((1 << 
bit_depth) - 1); #endif // CONFIG_LIBVMAF_PSNR_PEAK total_sse = 0; total_samples = 0; for (i = 0; i < 3; ++i) { const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; uint64_t sse; sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); psnr->sse_hbd[1 + i] = sse; psnr->samples_hbd[1 + i] = samples; psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); total_sse += sse; total_samples += samples; } psnr->sse_hbd[0] = total_sse; psnr->samples_hbd[0] = total_samples; psnr->psnr_hbd[0] = aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); } } #endif void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert(a->uv_crop_width == b->uv_crop_width); assert(a->uv_crop_height == b->uv_crop_height); static const double peak = 255.0; const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; uint32_t total_samples = 0; for (i = 0; i < 3; ++i) { const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; const uint64_t sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); total_sse += sse; total_samples += samples; } psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; psnr->psnr[0] = aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); } aom-3.12.1/aom_dsp/psnr.h000066400000000000000000000113311477627663500151550ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_PSNR_H_ #define AOM_AOM_DSP_PSNR_H_ #include "aom_scale/yv12config.h" #include "config/aom_config.h" #define MAX_PSNR 100.0 #ifdef __cplusplus extern "C" { #endif typedef struct { double psnr[4]; // total/y/u/v uint64_t sse[4]; // total/y/u/v uint32_t samples[4]; // total/y/u/v double psnr_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth uint64_t sse_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth uint32_t samples_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth } PSNR_STATS; #if CONFIG_INTERNAL_STATS /*!\brief Converts SSE to PSNR * * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). 
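 *
 * (Added note for this excerpt, not in the original header: the implementation in
 * psnr.c evaluates 10 * log10(samples * peak * peak / sse), clamps the result to
 * MAX_PSNR (100.0), and returns MAX_PSNR when sse is zero. For example, a 16x16
 * 8-bit block (samples == 256, peak == 255.0) with sse == 256 gives
 * 10 * log10(255 * 255) ~= 48.13 dB.)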
* * \param[in] samples Number of samples * \param[in] peak Max sample value * \param[in] sse Sum of squared errors */ double aom_sse_to_psnr(double samples, double peak, double sse); #endif // CONFIG_INTERNAL_STATS uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int plane, int highbd); #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, unsigned int bit_depth, unsigned int in_bit_depth); #endif void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr); double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *phvs_y, double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_PSNR_H_ aom-3.12.1/aom_dsp/psnrhvs.c000066400000000000000000000273571477627663500157100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. * * This code was originally written by: Gregory Maxwell, at the Daala * project. 
*/ #include #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/psnr.h" #include "aom_dsp/ssim.h" static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; (void)xstride; aom_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) for (j = 0; j < 8; j++) *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } #if CONFIG_AV1_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; (void)xstride; aom_highbd_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) for (j = 0; j < 8; j++) *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } #endif // CONFIG_AV1_HIGHBITDEPTH /* Normalized inverse quantization matrix for 8x8 DCT at the point of * transparency. This is not the JPEG based matrix from the paper, this one gives a slightly higher MOS agreement.*/ static const double csf_y[8][8] = { { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, 0.678296995242, 0.466224900598, 0.3265091542 }, { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, 0.868920337363, 0.61280991668, 0.436405793551 }, { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, 0.670882927016, 0.501731932449, 0.372504254596 }, { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565 }, { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204 }, { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } }; static const double csf_cb420[8][8] = { { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, 0.898018824055, 0.74725392039, 0.615105596242 }, { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, 1.17428548929, 0.996404342439, 0.830890433625 }, { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, 0.960060382087, 0.849823426169, 0.731221236837 }, { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374 }, { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034 }, { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } }; static const double csf_cr420[8][8] = { { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, 0.867069376285, 0.721500455585, 0.593906509971 }, { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, 1.13381474809, 0.962064122248, 0.802254508198 }, { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, 
0.725539939514, 0.661776842059, 0.587716619023 }, { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273 }, { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } }; static double convert_score_db(double _score, double _weight, int16_t pix_max) { assert(_score * _weight >= 0.0); if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); } static double calc_psnrhvs(const unsigned char *src, int _systride, const unsigned char *dst, int _dystride, double _par, int _w, int _h, int _step, const double _csf[8][8], uint32_t _shift, int buf_is_hbd, int16_t pix_max, int luma) { double ret; const uint8_t *_src8 = src; const uint8_t *_dst8 = dst; const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); double mask[8][8]; int pixels; int x; int y; float sum1; float sum2; float delt; (void)_par; ret = pixels = 0; sum1 = sum2 = delt = 0.0f; for (y = 0; y < _h; y++) { for (x = 0; x < _w; x++) { if (!buf_is_hbd) { sum1 += _src8[y * _systride + x]; sum2 += _dst8[y * _dystride + x]; } else { sum1 += _src16[y * _systride + x] >> _shift; sum2 += _dst16[y * _dystride + x] >> _shift; } } } if (luma) delt = (sum1 - sum2) / (_w * _h); /*In the PSNR-HVS-M paper[1] the authors describe the construction of their masking table as "we have used the quantization table for the color component Y of JPEG [6] that has been also obtained on the basis of CSF. Note that the values in quantization table JPEG have been normalized and then squared." Their CSF matrix (from PSNR-HVS) was also constructed from the JPEG matrices. I can not find any obvious scheme of normalizing to produce their table, but if I multiply their CSF by 0.3885746225901003 and square the result I get their masking table. I have no idea where this constant comes from, but deviating from it too greatly hurts MOS agreement. [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking of DCT basis functions", CD-ROM Proceedings of the Third International Workshop on Video Processing and Quality Metrics for Consumer Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. Suggested in aomedia issue#2363: 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) of the old JPEG based matrix from the paper. Since you are not using that, divide by actual maximum coefficient. 
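  (Added clarification for this excerpt, not part of the original comment: the loop
  that follows implements exactly that normalization, computing
    mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]),
  where _csf[1][0] is the largest entry of each of the csf_y, csf_cb420 and
  csf_cr420 tables above, e.g. 2.2901594831 for csf_y.)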
*/ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i; int j; int n = 0; double s_gx = 0; double s_gy = 0; double g = 0; double s_gmean = 0; double s_gvar = 0; double s_mask = 0; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { if (!buf_is_hbd) { dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; } else { dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; } dct_d[i * 8 + j] += (int)(delt + 0.5f); } } for (i = 1; i < 7; i++) { for (j = 1; j < 7; j++) { s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 - dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 - dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 - dct_s[(i + 1) * 8 + j + 1] * 3) / (pix_max * 16.f); s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 - dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 - dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 - dct_s[(i + 1) * 8 + j + 1] * 3) / (pix_max * 16.f); g = sqrt(s_gx * s_gx + s_gy * s_gy); if (g > 0.1f) n++; s_gmean += g; } } s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; #if CONFIG_AV1_HIGHBITDEPTH if (!buf_is_hbd) { od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); } else { hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); } #else od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); #endif // CONFIG_AV1_HIGHBITDEPTH for (i = 0; i < 8; i++) for (j = (i == 0); j < 8; j++) s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; s_mask = sqrt(s_mask * s_gvar) / 8.f; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { double err; err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); if (i != 0 || j != 0) err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j]; ret += (err * _csf[i][j]) * (err * _csf[i][j]); pixels++; } } } } if (pixels <= 0) return 0; ret /= pixels; ret += 0.04 * delt * delt; return ret; } double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, uint32_t bd, uint32_t in_bd) { double psnrhvs; const double par = 1.0; const int step = 7; uint32_t bd_shift = 0; assert(bd == 8 || bd == 10 || bd == 12); assert(bd >= in_bd); assert(src->flags == dst->flags); const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; int16_t pix_max = 255; if (in_bd == 10) pix_max = 1023; else if (in_bd == 12) pix_max = 4095; bd_shift = bd - in_bd; *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par, src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd, pix_max, 1); *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, par, src->uv_crop_width, src->uv_crop_height, step, csf_cb420, bd_shift, buf_is_hbd, pix_max, 0); *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, par, src->uv_crop_width, src->uv_crop_height, step, csf_cr420, bd_shift, buf_is_hbd, pix_max, 0); psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); return convert_score_db(psnrhvs, 1.0, pix_max); } aom-3.12.1/aom_dsp/pyramid.c000066400000000000000000000441651477627663500156460ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" #include "aom_util/aom_pthread.h" // TODO(rachelbarker): Move needed code from av1/ to aom_dsp/ #include "av1/common/resize.h" #include #include // Lifecycle: // * Frame buffer alloc code calls aom_get_pyramid_alloc_size() // to work out how much space is needed for a given number of pyramid // levels. This is counted in the size checked against the max allocation // limit // * Then calls aom_alloc_pyramid() to actually create the pyramid // * Pyramid is initially marked as containing no valid data // * Each pyramid layer is computed on-demand, the first time it is requested // * Whenever frame buffer is reused, reset the counter of filled levels. // This invalidates all of the existing pyramid levels. // * Whenever frame buffer is resized, reallocate pyramid size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit) { // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); size_t alloc_size = 0; alloc_size += sizeof(ImagePyramid); alloc_size += n_levels * sizeof(PyramidLayer); // Calculate how much memory is needed for downscaled frame buffers size_t buffer_size = 0; // Work out if we need to allocate a few extra bytes for alignment. // aom_memalign() will ensure that the start of the allocation is aligned // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel // to be aligned, not the first byte of the allocation. // // In the loop below, we ensure that the stride of every image is a multiple // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the // first pixel in the first pyramid layer aligned properly, that will // automatically mean that the first pixel of every row of every layer is // properly aligned too. // // Thus all we need to consider is the first pixel in the first layer. // This is located at offset // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING // bytes into the buffer. Since level_stride is a multiple of // PYRAMID_ALIGNMENT, we can ignore that. So we need // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT // // To solve this, we can round PYRAMID_PADDING up to the next multiple // of PYRAMID_ALIGNMENT, then subtract the orginal value to calculate // how many extra bytes are needed. size_t first_px_offset = (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); size_t extra_bytes = first_px_offset - PYRAMID_PADDING; buffer_size += extra_bytes; // If the original image is stored in an 8-bit buffer, then we can point the // lowest pyramid level at that buffer rather than allocating a new one. int first_allocated_level = image_is_16bit ? 
0 : 1; for (int level = first_allocated_level; level < n_levels; level++) { int level_width = width >> level; int level_height = height >> level; // Allocate padding for each layer int padded_width = level_width + 2 * PYRAMID_PADDING; int padded_height = level_height + 2 * PYRAMID_PADDING; // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT // This ensures that, as long as the top-left pixel in this pyramid level is // properly aligned, then so will the leftmost pixel in every row of the // pyramid level. int level_stride = (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); buffer_size += level_stride * padded_height; } alloc_size += buffer_size; return alloc_size; } ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit) { // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr)); if (!pyr) { return NULL; } pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers)); if (!pyr->layers) { aom_free(pyr); return NULL; } pyr->max_levels = n_levels; pyr->filled_levels = 0; // Compute sizes and offsets for each pyramid level // These are gathered up first, so that we can allocate all pyramid levels // in a single buffer size_t buffer_size = 0; size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets)); if (!layer_offsets) { aom_free(pyr->layers); aom_free(pyr); return NULL; } // Work out if we need to allocate a few extra bytes for alignment. // aom_memalign() will ensure that the start of the allocation is aligned // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel // to be aligned, not the first byte of the allocation. // // In the loop below, we ensure that the stride of every image is a multiple // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the // first pixel in the first pyramid layer aligned properly, that will // automatically mean that the first pixel of every row of every layer is // properly aligned too. // // Thus all we need to consider is the first pixel in the first layer. // This is located at offset // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING // bytes into the buffer. Since level_stride is a multiple of // PYRAMID_ALIGNMENT, we can ignore that. So we need // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT // // To solve this, we can round PYRAMID_PADDING up to the next multiple // of PYRAMID_ALIGNMENT, then subtract the orginal value to calculate // how many extra bytes are needed. size_t first_px_offset = (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); size_t extra_bytes = first_px_offset - PYRAMID_PADDING; buffer_size += extra_bytes; // If the original image is stored in an 8-bit buffer, then we can point the // lowest pyramid level at that buffer rather than allocating a new one. int first_allocated_level = image_is_16bit ? 
0 : 1; for (int level = first_allocated_level; level < n_levels; level++) { PyramidLayer *layer = &pyr->layers[level]; int level_width = width >> level; int level_height = height >> level; // Allocate padding for each layer int padded_width = level_width + 2 * PYRAMID_PADDING; int padded_height = level_height + 2 * PYRAMID_PADDING; // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT // This ensures that, as long as the top-left pixel in this pyramid level is // properly aligned, then so will the leftmost pixel in every row of the // pyramid level. int level_stride = (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); size_t level_alloc_start = buffer_size; size_t level_start = level_alloc_start + PYRAMID_PADDING * level_stride + PYRAMID_PADDING; buffer_size += level_stride * padded_height; layer_offsets[level] = level_start; layer->width = level_width; layer->height = level_height; layer->stride = level_stride; } pyr->buffer_alloc = aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc)); if (!pyr->buffer_alloc) { aom_free(pyr->layers); aom_free(pyr); aom_free(layer_offsets); return NULL; } // Fill in pointers for each level // If image is 8-bit, then the lowest level is left unconfigured for now, // and will be set up properly when the pyramid is filled in for (int level = first_allocated_level; level < n_levels; level++) { PyramidLayer *layer = &pyr->layers[level]; layer->buffer = pyr->buffer_alloc + layer_offsets[level]; } #if CONFIG_MULTITHREAD pthread_mutex_init(&pyr->mutex, NULL); #endif // CONFIG_MULTITHREAD aom_free(layer_offsets); return pyr; } // Fill the border region of a pyramid frame. // This must be called after the main image area is filled out. // `img_buf` should point to the first pixel in the image area, // ie. it should be pyr->level_buffer + pyr->level_loc[level]. static inline void fill_border(uint8_t *img_buf, const int width, const int height, const int stride) { // Fill left and right areas for (int row = 0; row < height; row++) { uint8_t *row_start = &img_buf[row * stride]; uint8_t left_pixel = row_start[0]; memset(row_start - PYRAMID_PADDING, left_pixel, PYRAMID_PADDING); uint8_t right_pixel = row_start[width - 1]; memset(row_start + width, right_pixel, PYRAMID_PADDING); } // Fill top area for (int row = -PYRAMID_PADDING; row < 0; row++) { uint8_t *row_start = &img_buf[row * stride]; memcpy(row_start - PYRAMID_PADDING, img_buf - PYRAMID_PADDING, width + 2 * PYRAMID_PADDING); } // Fill bottom area uint8_t *last_row_start = &img_buf[(height - 1) * stride]; for (int row = height; row < height + PYRAMID_PADDING; row++) { uint8_t *row_start = &img_buf[row * stride]; memcpy(row_start - PYRAMID_PADDING, last_row_start - PYRAMID_PADDING, width + 2 * PYRAMID_PADDING); } } // Compute downsampling pyramid for a frame // // This function will ensure that the first `n_levels` levels of the pyramid // are filled, unless the frame is too small to have this many levels. // In that case, we will fill all available levels and then stop. // // Returns the actual number of levels filled, capped at n_levels, // or -1 on error. 
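//
// For illustration only (using MIN_PYRAMID_SIZE_LOG2 == 3 from pyramid.h):
// a 1920x1080 frame gives get_msb(AOMMIN(1920, 1080)) == 10, so
// max_levels == 10 - 3 == 7, with layer sizes
//   level 0: 1920x1080, level 1: 960x540, ..., level 6: 30x16.
// Requests for more levels than this are capped by the caller
// (aom_compute_pyramid).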
// // This must only be called while holding frame_pyr->mutex static inline int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int n_levels, ImagePyramid *frame_pyr) { int already_filled_levels = frame_pyr->filled_levels; // This condition should already be enforced by aom_compute_pyramid assert(n_levels <= frame_pyr->max_levels); if (already_filled_levels >= n_levels) { return n_levels; } const int frame_width = frame->y_crop_width; const int frame_height = frame->y_crop_height; const int frame_stride = frame->y_stride; assert((frame_width >> n_levels) >= 0); assert((frame_height >> n_levels) >= 0); if (already_filled_levels == 0) { // Fill in largest level from the original image PyramidLayer *first_layer = &frame_pyr->layers[0]; if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits assert(first_layer->width == frame_width); assert(first_layer->height == frame_height); uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); uint8_t *pyr_buffer = first_layer->buffer; int pyr_stride = first_layer->stride; for (int y = 0; y < frame_height; y++) { uint16_t *frame_row = frame_buffer + y * frame_stride; uint8_t *pyr_row = pyr_buffer + y * pyr_stride; for (int x = 0; x < frame_width; x++) { pyr_row[x] = frame_row[x] >> (bit_depth - 8); } } fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); } else { // For frames stored in an 8-bit buffer, we don't need to copy anything - // we can just reference the original image buffer first_layer->buffer = frame->y_buffer; first_layer->width = frame_width; first_layer->height = frame_height; first_layer->stride = frame_stride; } already_filled_levels = 1; } // Fill in the remaining levels through progressive downsampling for (int level = already_filled_levels; level < n_levels; ++level) { bool mem_status = false; PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; PyramidLayer *this_layer = &frame_pyr->layers[level]; uint8_t *this_buffer = this_layer->buffer; int this_width = this_layer->width; int this_height = this_layer->height; int this_stride = this_layer->stride; // The width and height of the previous layer that needs to be considered to // derive the current layer frame. const int input_layer_width = this_width << 1; const int input_layer_height = this_height << 1; // Compute the this pyramid level by downsampling the current level. // // We downsample by a factor of exactly 2, clipping the rightmost and // bottommost pixel off of the current level if needed. We do this for // two main reasons: // // 1) In the disflow code, when stepping from a higher pyramid level to a // lower pyramid level, we need to not just interpolate the flow field // but also to scale each flow vector by the upsampling ratio. // So it is much more convenient if this ratio is simply 2. // // 2) Up/downsampling by a factor of 2 can be implemented much more // efficiently than up/downsampling by a generic ratio. // TODO(rachelbarker): Use optimized downsample-by-2 function // SIMD support has been added specifically for cases where the downsample // factor is exactly 2. In such instances, horizontal and vertical resizing // is performed utilizing the down2_symeven() function, which considers the // even dimensions of the input layer. 
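//
// Worked example of the clipping mentioned above: if the previous layer is
// 31 pixels wide, then this_width == 15 and input_layer_width == 30, so the
// rightmost column of the previous layer is simply never read when
// producing this layer.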
if (should_resize_by_half(input_layer_height, input_layer_width, this_height, this_width)) { assert(input_layer_height % 2 == 0 && input_layer_width % 2 == 0 && "Input width or height cannot be odd."); mem_status = av1_resize_plane_to_half( prev_buffer, input_layer_height, input_layer_width, prev_stride, this_buffer, this_height, this_width, this_stride); } else { mem_status = av1_resize_plane(prev_buffer, input_layer_height, input_layer_width, prev_stride, this_buffer, this_height, this_width, this_stride); } // Terminate early in cases of memory allocation failure. if (!mem_status) { frame_pyr->filled_levels = n_levels; return -1; } fill_border(this_buffer, this_width, this_height, this_stride); } frame_pyr->filled_levels = n_levels; return n_levels; } // Fill out a downsampling pyramid for a given frame. // // The top level (index 0) will always be an 8-bit copy of the input frame, // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // // This function will ensure that the first `n_levels` levels of the pyramid // are filled, unless the frame is too small to have this many levels. // In that case, we will fill all available levels and then stop. // No matter how small the frame is, at least one level is guaranteed // to be filled. // // Returns the actual number of levels filled, capped at n_levels, // or -1 on error. int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int n_levels, ImagePyramid *pyr) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex // before reading or writing the filled_levels field, and hold it while // computing any additional pyramid levels, to ensure proper behaviour // when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD n_levels = AOMMIN(n_levels, pyr->max_levels); int result = n_levels; if (pyr->filled_levels < n_levels) { // Compute any missing levels that we need result = fill_pyramid(frame, bit_depth, n_levels, pyr); } // At this point, as long as result >= 0, the requested number of pyramid // levels are guaranteed to be valid, and can be safely read from without // holding the mutex any further assert(IMPLIES(result >= 0, pyr->filled_levels >= n_levels)); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD return result; } #ifndef NDEBUG // Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex // while reading the number of already-computed levels, we cannot just write: // assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: // assert(aom_is_pyramid_valid(pyr, n_levels)); // // Note: This deliberately does not restrict n_levels based on the maximum // number of permitted levels for the frame size. 
This allows the check to // catch cases where the caller forgets to handle the case where // max_levels is less than the requested number of levels bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex // before reading or writing the filled_levels field, to ensure proper // behaviour when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD bool result = (pyr->filled_levels >= n_levels); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD return result; } #endif // Mark a pyramid as no longer containing valid data. // This must be done whenever the corresponding frame buffer is reused void aom_invalidate_pyramid(ImagePyramid *pyr) { if (pyr) { #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD pyr->filled_levels = 0; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD } } // Release the memory associated with a pyramid void aom_free_pyramid(ImagePyramid *pyr) { if (pyr) { #if CONFIG_MULTITHREAD pthread_mutex_destroy(&pyr->mutex); #endif // CONFIG_MULTITHREAD aom_free(pyr->buffer_alloc); aom_free(pyr->layers); aom_free(pyr); } } aom-3.12.1/aom_dsp/pyramid.h000066400000000000000000000121341477627663500156420ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_PYRAMID_H_ #define AOM_AOM_DSP_PYRAMID_H_ #include #include #include #include "config/aom_config.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { #endif // Minimum dimensions of a downsampled image #define MIN_PYRAMID_SIZE_LOG2 3 #define MIN_PYRAMID_SIZE (1 << MIN_PYRAMID_SIZE_LOG2) // Size of border around each pyramid image, in pixels // Similarly to the border around regular image buffers, this border is filled // with copies of the outermost pixels of the frame, to allow for more efficient // convolution code // TODO(rachelbarker): How many pixels do we actually need here? // I think we only need 9 for disflow, but how many for corner matching? #define PYRAMID_PADDING 16 // Byte alignment of each line within the image pyramids. // That is, the first pixel inside the image (ie, not in the border region), // on each row of each pyramid level, is aligned to this byte alignment. // This value must be a power of 2. #define PYRAMID_ALIGNMENT 32 typedef struct { uint8_t *buffer; int width; int height; int stride; } PyramidLayer; // Struct for an image pyramid typedef struct image_pyramid { #if CONFIG_MULTITHREAD // Mutex which is used to prevent the pyramid being computed twice at the // same time // // Semantics: // * This mutex must be held whenever reading or writing the // `filled_levels` field // // * This mutex must also be held while computing the image pyramid, // to ensure that only one thread may do so at a time. 
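//
//   (Illustrative sketch of the expected locking pattern, as followed by
//    aom_compute_pyramid() in pyramid.c:
//      pthread_mutex_lock(&pyr->mutex);
//      if (pyr->filled_levels < n_levels) { /* compute missing levels */ }
//      pthread_mutex_unlock(&pyr->mutex);
//    )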
// // * However, once you have read the filled_levels field and observed // a value N, it is safe to drop the mutex and read from the remaining // fields, including the first N pyramid levels (but no higher). // Note that filled_levels must be read once and cached in a local variable // in order for this to be safe - it cannot be re-read without retaking // the mutex. // // This works because, once the image pyramid is computed, its contents // will not be changed until the parent frame buffer is recycled, // which will not happen until there are no more outstanding references // to the frame buffer. pthread_mutex_t mutex; #endif // Maximum number of levels for the given frame size // We always allocate enough memory for this many levels, as the memory // cost of higher levels of the pyramid is minimal. int max_levels; // Number of levels which currently hold valid data int filled_levels; // Pointer to allocated buffer uint8_t *buffer_alloc; // Data for each level // The `buffer` pointers inside this array point into the region which // is stored in the `buffer_alloc` field here PyramidLayer *layers; } ImagePyramid; size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit); ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit); // Fill out a downsampling pyramid for a given frame. // // The top level (index 0) will always be an 8-bit copy of the input frame, // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // // This function will ensure that the first `n_levels` levels of the pyramid // are filled, unless the frame is too small to have this many levels. // In that case, we will fill all available levels and then stop. // // Returns the actual number of levels filled, capped at n_levels, // or -1 on error. int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, int n_levels, ImagePyramid *pyr); #ifndef NDEBUG // Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex // while reading the number of already-computed levels, we cannot just write: // assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: // assert(aom_is_pyramid_valid(pyr, n_levels)); // // Note: This deliberately does not restrict n_levels based on the maximum // number of permitted levels for the frame size. This allows the check to // catch cases where the caller forgets to handle the case where // max_levels is less than the requested number of levels bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels); #endif // Mark a pyramid as no longer containing valid data. // This must be done whenever the corresponding frame buffer is reused void aom_invalidate_pyramid(ImagePyramid *pyr); // Release the memory associated with a pyramid void aom_free_pyramid(ImagePyramid *pyr); #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_PYRAMID_H_ aom-3.12.1/aom_dsp/quantize.c000066400000000000000000000522661477627663500160420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "config/aom_dsp_rtcd.h" #if !CONFIG_REALTIME_ONLY void aom_quantize_b_adaptive_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int i, non_zero_count = (int)n_coeffs, eob = -1; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); int prescan_add[2]; for (i = 0; i < 2; ++i) prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); // Pre-scan pass for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; const int prescan_add_val = prescan_add[rc != 0]; if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) non_zero_count--; else break; } // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // SKIP_EOB_FACTOR_ADJUST for (i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { int64_t tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), INT16_MIN, INT16_MAX); tmp *= wt; tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); // quantization qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) { eob = i; #if SKIP_EOB_FACTOR_ADJUST if (first == -1) first = i; #endif // SKIP_EOB_FACTOR_ADJUST } } } #if SKIP_EOB_FACTOR_ADJUST if (eob >= 0 && first == eob) { const int rc = scan[eob]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; eob = -1; } } } #endif // SKIP_EOB_FACTOR_ADJUST *eob_ptr = eob + 1; } #endif // !CONFIG_REALTIME_ONLY void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int i, non_zero_count = (int)n_coeffs, eob = -1; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); // Pre-scan pass for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) non_zero_count--; else break; } // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. for (i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { int64_t tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), INT16_MIN, INT16_MAX); tmp *= wt; tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); // quantization qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) eob = i; } } *eob_ptr = eob + 1; } #if CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY void aom_highbd_quantize_b_adaptive_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; int i, non_zero_count = (int)n_coeffs, eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); int prescan_add[2]; for (i = 0; i < 2; ++i) prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); // Pre-scan pass for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; const int prescan_add_val = prescan_add[rc != 0]; if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) non_zero_count--; else break; } // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // SKIP_EOB_FACTOR_ADJUST for (i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); const int64_t tmpw = tmp1 * wt; const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) { eob = i; #if SKIP_EOB_FACTOR_ADJUST if (first == -1) first = eob; #endif // SKIP_EOB_FACTOR_ADJUST } } } #if SKIP_EOB_FACTOR_ADJUST if (eob >= 0 && first == eob) { const int rc = scan[eob]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; eob = -1; } } } #endif // SKIP_EOB_FACTOR_ADJUST *eob_ptr = eob + 1; } #endif // !CONFIG_REALTIME_ONLY void aom_highbd_quantize_b_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { int i, eob = -1; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int dequant; int idx_arr[4096]; (void)iscan; int idx = 0; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); // Pre-scan pass for (i = 0; i < n_coeffs; i++) { const int rc = scan[i]; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; // If the coefficient is out of the base ZBIN range, keep it for // quantization. if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) idx_arr[idx++] = i; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. for (i = 0; i < idx; i++) { const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); const int64_t tmpw = tmp1 * wt; const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } #endif // CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY /* These functions should only be called when quantisation matrices are not used. 
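   These wrappers forward to aom_quantize_b_adaptive_helper_c() (or the
   aom_highbd_ equivalent) with qm_ptr and iqm_ptr set to NULL, and with
   log_scale 0, 1 or 2 for the base, 32x32 and 64x64 variants respectively.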
*/ void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_quantize_b_32x32_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_quantize_b_64x64_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_quantize_b_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_highbd_quantize_b_64x64_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } #endif // CONFIG_AV1_HIGHBITDEPTH #endif // !CONFIG_REALTIME_ONLY void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { 
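  // Generic (non-matrix) path: forward to the helper with
  // qm_ptr == iqm_ptr == NULL and log_scale == 0. The 32x32 and 64x64
  // variants below differ only in passing log_scale 1 and 2.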
aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_highbd_quantize_b_64x64_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/quantize.h000066400000000000000000000140301477627663500160320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_QUANTIZE_H_ #define AOM_AOM_DSP_QUANTIZE_H_ #include "config/aom_config.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif #define EOB_FACTOR 325 #define SKIP_EOB_FACTOR_ADJUST 200 void aom_quantize_b_adaptive_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void aom_quantize_b_32x32_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void aom_quantize_b_64x64_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_quantize_b_adaptive_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_highbd_quantize_b_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void aom_highbd_quantize_b_32x32_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void aom_highbd_quantize_b_64x64_adaptive_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #endif // CONFIG_AV1_HIGHBITDEPTH void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_quantize_b_c(const 
tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_quantize_b_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #endif // CONFIG_AV1_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_QUANTIZE_H_ aom-3.12.1/aom_dsp/recenter.h000066400000000000000000000035731477627663500160130ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_RECENTER_H_ #define AOM_AOM_DSP_RECENTER_H_ #include "config/aom_config.h" #include "aom/aom_integer.h" // Inverse recenters a non-negative literal v around a reference r static inline uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { if (v > (r << 1)) return v; else if ((v & 1) == 0) return (v >> 1) + r; else return r - ((v + 1) >> 1); } // Inverse recenters a non-negative literal v in [0, n-1] around a // reference r also in [0, n-1] static inline uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { if ((r << 1) <= n) { return inv_recenter_nonneg(r, v); } else { return n - 1 - inv_recenter_nonneg(n - 1 - r, v); } } // Recenters a non-negative literal v around a reference r static inline uint16_t recenter_nonneg(uint16_t r, uint16_t v) { if (v > (r << 1)) return v; else if (v >= r) return ((v - r) << 1); else return ((r - v) << 1) - 1; } // Recenters a non-negative literal v in [0, n-1] around a // reference r also in [0, n-1] static inline uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { if ((r << 1) <= n) { return recenter_nonneg(r, v); } else { return recenter_nonneg(n - 1 - r, n - 1 - v); } } #endif // AOM_AOM_DSP_RECENTER_H_ aom-3.12.1/aom_dsp/sad.c000066400000000000000000000316271477627663500147470ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/blend.h" /* Sum the difference between every corresponding element of the buffers. */ static inline unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { sad += abs(a[x] - b[x]); } a += a_stride; b += b_stride; } return sad; } #define SADMXN(m, n) \ unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ return sad(src, src_stride, ref, ref_stride, m, n); \ } #define SADMXN_AVG(m, n) \ unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ uint8_t comp_pred[m * n]; \ aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ } #define SADMXN_SKIP(m, n) \ unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, \ int ref_stride) { \ return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define SADMXN_NO_SKIP(m, n) \ SADMXN(m, n) \ SADMXN_AVG(m, n) #define SADMXN_NO_AVG(m, n) \ SADMXN(m, n) \ SADMXN_SKIP(m, n) #define SADMXN_ALL(m, n) \ SADMXN(m, n) \ SADMXN_AVG(m, n) \ SADMXN_SKIP(m, n) // Calculate sad against 4 reference locations and store each in sad_array #define SAD_MXNX4D_NO_SKIP(m, n) \ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) { \ sad_array[i] = \ aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ } \ } #define SAD_MXNX4D(m, n) \ SAD_MXNX4D_NO_SKIP(m, n) \ void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) { \ sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i], \ 2 * ref_stride, (m), (n / 2)); \ } \ } // Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable. 
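// (aom_sad##m##x##n##x4d below, without the _c suffix, is the dispatch name
// defined via aom_dsp_rtcd.h, so this fallback picks up any available SIMD
// x4d implementation.)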
#define SAD_MXNX3D(m, n) \ void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \ } // 128x128 SADMXN_ALL(128, 128) SAD_MXNX4D(128, 128) SAD_MXNX3D(128, 128) // 128x64 SADMXN_ALL(128, 64) SAD_MXNX4D(128, 64) SAD_MXNX3D(128, 64) // 64x128 SADMXN_ALL(64, 128) SAD_MXNX4D(64, 128) SAD_MXNX3D(64, 128) // 64x64 SADMXN_ALL(64, 64) SAD_MXNX4D(64, 64) SAD_MXNX3D(64, 64) // 64x32 SADMXN_ALL(64, 32) SAD_MXNX4D(64, 32) SAD_MXNX3D(64, 32) // 32x64 SADMXN_ALL(32, 64) SAD_MXNX4D(32, 64) SAD_MXNX3D(32, 64) // 32x32 SADMXN_ALL(32, 32) SAD_MXNX4D(32, 32) SAD_MXNX3D(32, 32) // 32x16 SADMXN_ALL(32, 16) SAD_MXNX4D(32, 16) SAD_MXNX3D(32, 16) // 16x32 SADMXN_ALL(16, 32) SAD_MXNX4D(16, 32) SAD_MXNX3D(16, 32) // 16x16 SADMXN_ALL(16, 16) SAD_MXNX4D(16, 16) SAD_MXNX3D(16, 16) // 16x8 SADMXN_NO_SKIP(16, 8) SAD_MXNX4D_NO_SKIP(16, 8) SAD_MXNX3D(16, 8) // 8x16 SADMXN_ALL(8, 16) SAD_MXNX4D(8, 16) SAD_MXNX3D(8, 16) // 8x8 SADMXN_NO_SKIP(8, 8) SAD_MXNX4D_NO_SKIP(8, 8) SAD_MXNX3D(8, 8) // 8x4 SADMXN(8, 4) SAD_MXNX4D_NO_SKIP(8, 4) SAD_MXNX3D(8, 4) // 4x8 SADMXN(4, 8) SAD_MXNX4D_NO_SKIP(4, 8) SAD_MXNX3D(4, 8) // 4x4 SADMXN(4, 4) SAD_MXNX4D_NO_SKIP(4, 4) SAD_MXNX3D(4, 4) #if !CONFIG_REALTIME_ONLY SADMXN_NO_AVG(4, 16) SAD_MXNX4D(4, 16) SADMXN(16, 4) SAD_MXNX4D_NO_SKIP(16, 4) SADMXN_ALL(8, 32) SAD_MXNX4D(8, 32) SADMXN_NO_SKIP(32, 8) SAD_MXNX4D_NO_SKIP(32, 8) SADMXN_ALL(16, 64) SAD_MXNX4D(16, 64) SADMXN_ALL(64, 16) SAD_MXNX4D(64, 16) SAD_MXNX3D(4, 16) SAD_MXNX3D(16, 4) SAD_MXNX3D(8, 32) SAD_MXNX3D(32, 8) SAD_MXNX3D(16, 64) SAD_MXNX3D(64, 16) #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH static inline unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { sad += abs(a[x] - b[x]); } a += a_stride; b += b_stride; } return sad; } static inline unsigned int highbd_sadb(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { sad += abs(a[x] - b[x]); } a += a_stride; b += b_stride; } return sad; } #define HIGHBD_SADMXN(m, n) \ unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, \ int ref_stride) { \ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ } #define HIGHBD_SADMXN_AVG(m, n) \ unsigned int aom_highbd_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ uint16_t comp_pred[m * n]; \ uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ } #define HIGHBD_SADMXN_SKIP(m, n) \ unsigned int aom_highbd_sad_skip_##m##x##n##_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * \ highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define HIGHBD_SADMXN_NO_SKIP(m, n) \ HIGHBD_SADMXN(m, n) \ HIGHBD_SADMXN_AVG(m, n) #define HIGHBD_SADMXN_NO_AVG(m, n) \ HIGHBD_SADMXN(m, n) \ 
HIGHBD_SADMXN_SKIP(m, n) #define HIGHBD_SADMXN_ALL(m, n) \ HIGHBD_SADMXN(m, n) \ HIGHBD_SADMXN_AVG(m, n) \ HIGHBD_SADMXN_SKIP(m, n) #define HIGHBD_SAD_MXNX4D_NO_SKIP(m, n) \ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) { \ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ ref_array[i], ref_stride); \ } \ } #define HIGHBD_SAD_MXNX4D(m, n) \ HIGHBD_SAD_MXNX4D_NO_SKIP(m, n) \ void aom_highbd_sad_skip_##m##x##n##x4d_c( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) { \ sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \ 2 * ref_stride, (m), (n / 2)); \ } \ } // Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable. #define HIGHBD_SAD_MXNX3D(m, n) \ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \ sad_array); \ } // 128x128 HIGHBD_SADMXN_ALL(128, 128) HIGHBD_SAD_MXNX4D(128, 128) HIGHBD_SAD_MXNX3D(128, 128) // 128x64 HIGHBD_SADMXN_ALL(128, 64) HIGHBD_SAD_MXNX4D(128, 64) HIGHBD_SAD_MXNX3D(128, 64) // 64x128 HIGHBD_SADMXN_ALL(64, 128) HIGHBD_SAD_MXNX4D(64, 128) HIGHBD_SAD_MXNX3D(64, 128) // 64x64 HIGHBD_SADMXN_ALL(64, 64) HIGHBD_SAD_MXNX4D(64, 64) HIGHBD_SAD_MXNX3D(64, 64) // 64x32 HIGHBD_SADMXN_ALL(64, 32) HIGHBD_SAD_MXNX4D(64, 32) HIGHBD_SAD_MXNX3D(64, 32) // 32x64 HIGHBD_SADMXN_ALL(32, 64) HIGHBD_SAD_MXNX4D(32, 64) HIGHBD_SAD_MXNX3D(32, 64) // 32x32 HIGHBD_SADMXN_ALL(32, 32) HIGHBD_SAD_MXNX4D(32, 32) HIGHBD_SAD_MXNX3D(32, 32) // 32x16 HIGHBD_SADMXN_ALL(32, 16) HIGHBD_SAD_MXNX4D(32, 16) HIGHBD_SAD_MXNX3D(32, 16) // 16x32 HIGHBD_SADMXN_ALL(16, 32) HIGHBD_SAD_MXNX4D(16, 32) HIGHBD_SAD_MXNX3D(16, 32) // 16x16 HIGHBD_SADMXN_ALL(16, 16) HIGHBD_SAD_MXNX4D(16, 16) HIGHBD_SAD_MXNX3D(16, 16) // 16x8 HIGHBD_SADMXN_NO_SKIP(16, 8) HIGHBD_SAD_MXNX4D_NO_SKIP(16, 8) HIGHBD_SAD_MXNX3D(16, 8) // 8x16 HIGHBD_SADMXN_ALL(8, 16) HIGHBD_SAD_MXNX4D(8, 16) HIGHBD_SAD_MXNX3D(8, 16) // 8x8 HIGHBD_SADMXN_NO_SKIP(8, 8) HIGHBD_SAD_MXNX4D_NO_SKIP(8, 8) HIGHBD_SAD_MXNX3D(8, 8) // 8x4 HIGHBD_SADMXN(8, 4) HIGHBD_SAD_MXNX4D_NO_SKIP(8, 4) HIGHBD_SAD_MXNX3D(8, 4) // 4x8 HIGHBD_SADMXN(4, 8) HIGHBD_SAD_MXNX4D_NO_SKIP(4, 8) HIGHBD_SAD_MXNX3D(4, 8) // 4x4 HIGHBD_SADMXN(4, 4) HIGHBD_SAD_MXNX4D_NO_SKIP(4, 4) HIGHBD_SAD_MXNX3D(4, 4) #if !CONFIG_REALTIME_ONLY HIGHBD_SADMXN_NO_AVG(4, 16) HIGHBD_SAD_MXNX4D(4, 16) HIGHBD_SADMXN(16, 4) HIGHBD_SAD_MXNX4D_NO_SKIP(16, 4) HIGHBD_SADMXN_ALL(8, 32) HIGHBD_SAD_MXNX4D(8, 32) HIGHBD_SADMXN_NO_SKIP(32, 8) HIGHBD_SAD_MXNX4D_NO_SKIP(32, 8) HIGHBD_SADMXN_ALL(16, 64) HIGHBD_SAD_MXNX4D(16, 64) HIGHBD_SADMXN_ALL(64, 16) HIGHBD_SAD_MXNX4D(64, 16) HIGHBD_SAD_MXNX3D(4, 16) HIGHBD_SAD_MXNX3D(16, 4) HIGHBD_SAD_MXNX3D(8, 32) HIGHBD_SAD_MXNX3D(32, 8) HIGHBD_SAD_MXNX3D(16, 64) HIGHBD_SAD_MXNX3D(64, 16) #endif // !CONFIG_REALTIME_ONLY #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/sad_av1.c000066400000000000000000000177371477627663500155240ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/blend.h" static inline unsigned int masked_sad(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); sad += abs(pred - src[x]); } src += src_stride; a += a_stride; b += b_stride; m += m_stride; } return sad; } #define MASKSADMxN(m, n) \ unsigned int aom_masked_sad##m##x##n##_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ msk_stride, m, n); \ else \ return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ msk_stride, m, n); \ } /* clang-format off */ MASKSADMxN(128, 128) MASKSADMxN(128, 64) MASKSADMxN(64, 128) MASKSADMxN(64, 64) MASKSADMxN(64, 32) MASKSADMxN(32, 64) MASKSADMxN(32, 32) MASKSADMxN(32, 16) MASKSADMxN(16, 32) MASKSADMxN(16, 16) MASKSADMxN(16, 8) MASKSADMxN(8, 16) MASKSADMxN(8, 8) MASKSADMxN(8, 4) MASKSADMxN(4, 8) MASKSADMxN(4, 4) #if !CONFIG_REALTIME_ONLY MASKSADMxN(4, 16) MASKSADMxN(16, 4) MASKSADMxN(8, 32) MASKSADMxN(32, 8) MASKSADMxN(16, 64) MASKSADMxN(64, 16) #endif // !CONFIG_REALTIME_ONLY /* clang-format on */ #if CONFIG_AV1_HIGHBITDEPTH static inline unsigned int highbd_masked_sad( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); sad += abs(pred - src[x]); } src += src_stride; a += a_stride; b += b_stride; m += m_stride; } return sad; } #define HIGHBD_MASKSADMXN(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_c( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ second_pred8, m, msk, msk_stride, m, n); \ else \ return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ ref_stride, msk, msk_stride, m, n); \ } HIGHBD_MASKSADMXN(128, 128) HIGHBD_MASKSADMXN(128, 64) HIGHBD_MASKSADMXN(64, 128) HIGHBD_MASKSADMXN(64, 64) HIGHBD_MASKSADMXN(64, 32) HIGHBD_MASKSADMXN(32, 64) HIGHBD_MASKSADMXN(32, 32) HIGHBD_MASKSADMXN(32, 16) HIGHBD_MASKSADMXN(16, 32) HIGHBD_MASKSADMXN(16, 16) HIGHBD_MASKSADMXN(16, 8) HIGHBD_MASKSADMXN(8, 16) HIGHBD_MASKSADMXN(8, 8) HIGHBD_MASKSADMXN(8, 4) HIGHBD_MASKSADMXN(4, 8) HIGHBD_MASKSADMXN(4, 4) #if !CONFIG_REALTIME_ONLY HIGHBD_MASKSADMXN(4, 16) HIGHBD_MASKSADMXN(16, 4) HIGHBD_MASKSADMXN(8, 32) HIGHBD_MASKSADMXN(32, 8) 
HIGHBD_MASKSADMXN(16, 64) HIGHBD_MASKSADMXN(64, 16) #endif // !CONFIG_REALTIME_ONLY #endif // CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY // pre: predictor being evaluated // wsrc: target weighted prediction (has been *4096 to keep precision) // mask: 2d weights (scaled by 4096) static inline unsigned int obmc_sad(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); pre += pre_stride; wsrc += width; mask += width; } return sad; } #define OBMCSADMxN(m, n) \ unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ const int32_t *wsrc, \ const int32_t *mask) { \ return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ } /* clang-format off */ OBMCSADMxN(128, 128) OBMCSADMxN(128, 64) OBMCSADMxN(64, 128) OBMCSADMxN(64, 64) OBMCSADMxN(64, 32) OBMCSADMxN(32, 64) OBMCSADMxN(32, 32) OBMCSADMxN(32, 16) OBMCSADMxN(16, 32) OBMCSADMxN(16, 16) OBMCSADMxN(16, 8) OBMCSADMxN(8, 16) OBMCSADMxN(8, 8) OBMCSADMxN(8, 4) OBMCSADMxN(4, 8) OBMCSADMxN(4, 4) OBMCSADMxN(4, 16) OBMCSADMxN(16, 4) OBMCSADMxN(8, 32) OBMCSADMxN(32, 8) OBMCSADMxN(16, 64) OBMCSADMxN(64, 16) /* clang-format on */ #if CONFIG_AV1_HIGHBITDEPTH static inline unsigned int highbd_obmc_sad( const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); pre += pre_stride; wsrc += width; mask += width; } return sad; } #define HIGHBD_OBMCSADMXN(m, n) \ unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ const int32_t *mask) { \ return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ } /* clang-format off */ HIGHBD_OBMCSADMXN(128, 128) HIGHBD_OBMCSADMXN(128, 64) HIGHBD_OBMCSADMXN(64, 128) HIGHBD_OBMCSADMXN(64, 64) HIGHBD_OBMCSADMXN(64, 32) HIGHBD_OBMCSADMXN(32, 64) HIGHBD_OBMCSADMXN(32, 32) HIGHBD_OBMCSADMXN(32, 16) HIGHBD_OBMCSADMXN(16, 32) HIGHBD_OBMCSADMXN(16, 16) HIGHBD_OBMCSADMXN(16, 8) HIGHBD_OBMCSADMXN(8, 16) HIGHBD_OBMCSADMXN(8, 8) HIGHBD_OBMCSADMXN(8, 4) HIGHBD_OBMCSADMXN(4, 8) HIGHBD_OBMCSADMXN(4, 4) HIGHBD_OBMCSADMXN(4, 16) HIGHBD_OBMCSADMXN(16, 4) HIGHBD_OBMCSADMXN(8, 32) HIGHBD_OBMCSADMXN(32, 8) HIGHBD_OBMCSADMXN(16, 64) HIGHBD_OBMCSADMXN(64, 16) /* clang-format on */ #endif // CONFIG_AV1_HIGHBITDEPTH #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/simd/000077500000000000000000000000001477627663500147575ustar00rootroot00000000000000aom-3.12.1/aom_dsp/simd/v128_intrinsics.h000066400000000000000000000315771477627663500201120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ #define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ #include #include #include #include "aom_dsp/simd/v128_intrinsics_c.h" #include "aom_dsp/simd/v64_intrinsics.h" /* Fallback to plain, unoptimised C. */ typedef c_v128 v128; SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); } SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); } SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); } SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) { return c_v128_from_64(hi, lo); } SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) { return c_v128_from_v64(hi, lo); } SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return c_v128_from_32(a, b, c, d); } SIMD_INLINE v128 v128_load_unaligned(const void *p) { return c_v128_load_unaligned(p); } SIMD_INLINE v128 v128_load_aligned(const void *p) { return c_v128_load_aligned(p); } SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { c_v128_store_unaligned(p, a); } SIMD_INLINE void v128_store_aligned(void *p, v128 a) { c_v128_store_aligned(p, a); } SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { return c_v128_align(a, b, c); } SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); } SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); } SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) { return c_v128_sad_u8_init(); } SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) { return c_v128_sad_u8(s, a, b); } SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) { return c_v128_sad_u8_sum(s); } SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) { return c_v128_ssd_u8_init(); } SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) { return c_v128_ssd_u8(s, a, b); } SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) { return c_v128_ssd_u8_sum(s); } SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { return c_v128_dotp_su8(a, b); } SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { return c_v128_dotp_s16(a, b); } SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { return c_v128_dotp_s32(a, b); } SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); } SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); } SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); } SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return 
c_v128_ssub_u8(a, b); } SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { return c_v128_mullo_s16(a, b); } SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { return c_v128_mulhi_s16(a, b); } SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { return c_v128_mullo_s32(a, b); } SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { return c_v128_blend_8(a, b, c); } SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { return c_v128_rdavg_u16(a, b); } SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { return c_v128_unziplo_8(a, b); } SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { return c_v128_unziphi_8(a, b); } SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { return c_v128_unziplo_16(a, b); } SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { return c_v128_unziphi_16(a, b); } SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { return c_v128_unziplo_32(a, b); } SIMD_INLINE v128 
v128_unziphi_32(v128 a, v128 b) { return c_v128_unziphi_32(a, b); } SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { return c_v128_unpacklo_u8_s16(a); } SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { return c_v128_unpackhi_u8_s16(a); } SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { return c_v128_unpacklo_s8_s16(a); } SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { return c_v128_unpackhi_s8_s16(a); } SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { return c_v128_pack_s32_s16(a, b); } SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { return c_v128_pack_s32_u16(a, b); } SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { return c_v128_pack_s16_u8(a, b); } SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { return c_v128_pack_s16_s8(a, b); } SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { return c_v128_unpacklo_u16_s32(a); } SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { return c_v128_unpacklo_s16_s32(a); } SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { return c_v128_unpackhi_u16_s32(a); } SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { return c_v128_unpackhi_s16_s32(a); } SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { return c_v128_shuffle_8(a, pattern); } SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { return c_v128_cmpgt_s16(a, b); } SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { return c_v128_cmplt_s16(a, b); } SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { return c_v128_cmpgt_s32(a, b); } SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { return c_v128_cmplt_s32(a, b); } SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { return c_v128_shl_8(a, c); } SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { return c_v128_shr_u8(a, c); } SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { return c_v128_shr_s8(a, c); } SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { return c_v128_shl_16(a, c); } SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { return c_v128_shr_u16(a, c); } SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { return c_v128_shr_s16(a, c); } SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { return c_v128_shl_32(a, c); } SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { return c_v128_shr_u32(a, c); } SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { return c_v128_shr_s32(a, c); } SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { return c_v128_shl_64(a, c); } SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { return c_v128_shr_u64(a, c); } SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { return c_v128_shr_s64(a, c); } SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { return c_v128_shr_n_byte(a, n); } SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { return c_v128_shl_n_byte(a, n); } SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { return c_v128_shl_n_8(a, n); } 
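/* Illustrative usage sketch (not part of the libaom API): the v128_* wrappers
 * in this header simply forward to the plain-C c_v128_* implementations, so
 * portable code can accumulate a 16-byte-wide SAD roughly as follows. The
 * names src, ref, stride and rows are hypothetical caller-supplied values, and
 * per the notes above the accumulator must be finalised with
 * v128_sad_u8_sum() before more than 32 v128_sad_u8() calls have been made:
 *
 *   c_sad128_internal acc = v128_sad_u8_init();
 *   for (int r = 0; r < rows; r++)
 *     acc = v128_sad_u8(acc, v128_load_unaligned(src + r * stride),
 *                       v128_load_unaligned(ref + r * stride));
 *   unsigned int sad = v128_sad_u8_sum(acc);
 */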
SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { return c_v128_shl_n_16(a, n); } SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { return c_v128_shl_n_32(a, n); } SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { return c_v128_shl_n_64(a, n); } SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { return c_v128_shr_n_u8(a, n); } SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { return c_v128_shr_n_u16(a, n); } SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { return c_v128_shr_n_u32(a, n); } SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { return c_v128_shr_n_u64(a, n); } SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { return c_v128_shr_n_s8(a, n); } SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { return c_v128_shr_n_s16(a, n); } SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { return c_v128_shr_n_s32(a, n); } SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { return c_v128_shr_n_s64(a, n); } typedef uint32_t sad128_internal_u16; SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return c_v128_sad_u16_init(); } SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, v128 b) { return c_v128_sad_u16(s, a, b); } SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { return c_v128_sad_u16_sum(s); } typedef uint64_t ssd128_internal_s16; SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return c_v128_ssd_s16_init(); } SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, v128 b) { return c_v128_ssd_s16(s, a, b); } SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return c_v128_ssd_s16_sum(s); } #endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ aom-3.12.1/aom_dsp/simd/v128_intrinsics_c.h000066400000000000000000000666671477627663500204240ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ #define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ #include #include #include #include "config/aom_config.h" #include "aom_dsp/simd/v64_intrinsics_c.h" typedef union { uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; int8_t s8[16]; int16_t s16[8]; int32_t s32[4]; int64_t s64[2]; c_v64 v64[2]; } c_v128; SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; } SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; } SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; } SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) { c_v128 t; t.u64[1] = hi; t.u64[0] = lo; return t; } SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) { c_v128 t; t.v64[1] = hi; t.v64[0] = lo; return t; } SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { c_v128 t; t.u32[3] = a; t.u32[2] = b; t.u32[1] = c; t.u32[0] = d; return t; } SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) { c_v128 t; memcpy(&t, p, 16); return t; } SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) { if (SIMD_CHECK && (uintptr_t)p & 15) { fprintf(stderr, "Error: unaligned v128 load at %p\n", p); abort(); } return c_v128_load_unaligned(p); } SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) { memcpy(p, &a, 16); } SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) { if (SIMD_CHECK && (uintptr_t)p & 15) { fprintf(stderr, "Error: unaligned v128 store at %p\n", p); abort(); } c_v128_store_unaligned(p, a); } SIMD_INLINE c_v128 c_v128_zero(void) { c_v128 t; t.u64[1] = t.u64[0] = 0; return t; } SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) { c_v128 t; t.v64[1] = t.v64[0] = c_v64_dup_8(x); return t; } SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) { c_v128 t; t.v64[1] = t.v64[0] = c_v64_dup_16(x); return t; } SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { c_v128 t; t.v64[1] = t.v64[0] = c_v64_dup_32(x); return t; } SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) { c_v128 t; t.u64[1] = t.u64[0] = x; return t; } SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { return c_v64_dotp_su8(a.v64[1], b.v64[1]) + c_v64_dotp_su8(a.v64[0], b.v64[0]); } SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { return c_v64_dotp_s16(a.v64[1], b.v64[1]) + c_v64_dotp_s16(a.v64[0], b.v64[0]); } SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { // 32 bit products, 64 bit sum return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); } SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); } typedef struct { uint32_t val; int count; } c_sad128_internal; SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) { c_sad128_internal t; t.val = t.count = 0; return t; } /* Implementation dependent return value. Result must be finalised with * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is * undefined. */ SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, c_v128 b) { int c; for (c = 0; c < 16; c++) s.val += a.u8[c] > b.u8[c] ? 
a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; s.count++; if (SIMD_CHECK && s.count > 32) { fprintf(stderr, "Error: sad called 32 times returning an undefined result\n"); abort(); } return s; } SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; } typedef uint32_t c_ssd128_internal; SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_u8_sum(). */ SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, c_v128 b) { int c; for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); return s; } SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), c_v64_or(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), c_v64_xor(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), c_v64_and(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), c_v64_andn(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), c_v64_add_8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), c_v64_add_16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), c_v64_sadd_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), c_v64_sadd_s8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), c_v64_sadd_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), c_v64_add_32(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) { // Two complement overflow (silences sanitizers) return c_v128_from_64( a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1 : a.v64[1].u64 + b.v64[1].u64, a.v64[0].u64 > ~b.v64[0].u64 ? 
a.v64[0].u64 - ~b.v64[0].u64 - 1 : a.v64[0].u64 + b.v64[0].u64); } SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { c_v128 t; t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; return t; } SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) { c_v128 t; t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1]; t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3]; t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5]; t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7]; t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9]; t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11]; t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13]; t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15]; return t; } SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), c_v64_sub_8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]), c_v64_ssub_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]), c_v64_ssub_s8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]), c_v64_sub_16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]), c_v64_ssub_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]), c_v64_ssub_u16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]), c_v64_sub_32(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) { // Two complement underflow (silences sanitizers) return c_v128_from_64( a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1 : a.v64[1].u64 - b.v64[1].u64, a.v64[0].u64 < b.v64[0].u64 ? 
a.v64[0].u64 + ~b.v64[0].u64 + 1 : a.v64[0].u64 - b.v64[0].u64); } SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); } SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); } SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { c_v64 lo_bits = c_v64_mullo_s16(a, b); c_v64 hi_bits = c_v64_mulhi_s16(a, b); return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), c_v64_ziplo_16(hi_bits, lo_bits)); } SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), c_v64_mullo_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), c_v64_mulhi_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), c_v64_mullo_s32(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), c_v64_madd_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), c_v64_madd_us8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), c_v64_avg_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), c_v64_rdavg_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), c_v64_rdavg_u16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), c_v64_avg_u16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), c_v64_min_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), c_v64_max_u8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), c_v64_min_s8(a.v64[0], b.v64[0])); } SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | ((a.s8[0] < 0) << 0); } SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { c_v128 t; for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; return t; } SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), c_v64_max_s8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), c_v64_min_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), c_v64_max_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { c_v128 t; int c; for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; return t; } SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { c_v128 t; int c; for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; return t; } SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), c_v64_ziplo_8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), c_v64_ziplo_8(a.v64[1], b.v64[1])); } SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), c_v64_ziplo_16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), c_v64_ziplo_16(a.v64[1], b.v64[1])); } SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), c_v64_ziplo_32(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), c_v64_ziplo_32(a.v64[1], b.v64[1])); } SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { return c_v128_from_v64(a.v64[0], b.v64[0]); } SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { return c_v128_from_v64(a.v64[1], b.v64[1]); } SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); } SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); } SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); } SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { c_v128 t; if (mode) { t.u8[15] = b.u8[15]; t.u8[14] = b.u8[13]; t.u8[13] = b.u8[11]; t.u8[12] = b.u8[9]; t.u8[11] = b.u8[7]; t.u8[10] = b.u8[5]; t.u8[9] = b.u8[3]; t.u8[8] = b.u8[1]; t.u8[7] = a.u8[15]; t.u8[6] = a.u8[13]; t.u8[5] = a.u8[11]; t.u8[4] = a.u8[9]; t.u8[3] = a.u8[7]; t.u8[2] = a.u8[5]; t.u8[1] = a.u8[3]; t.u8[0] = a.u8[1]; } else { t.u8[15] = a.u8[14]; t.u8[14] = a.u8[12]; t.u8[13] = a.u8[10]; t.u8[12] = a.u8[8]; t.u8[11] = a.u8[6]; t.u8[10] = a.u8[4]; t.u8[9] = a.u8[2]; t.u8[8] = a.u8[0]; t.u8[7] = b.u8[14]; t.u8[6] = b.u8[12]; t.u8[5] = b.u8[10]; t.u8[4] = b.u8[8]; t.u8[3] = b.u8[6]; t.u8[2] = b.u8[4]; t.u8[1] = b.u8[2]; t.u8[0] = b.u8[0]; } return t; } SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) : _c_v128_unzip_8(a, b, 0); } SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_8(b, a, 0) : _c_v128_unzip_8(b, a, 1); } SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { c_v128 t; if (mode) { t.u16[7] = b.u16[7]; t.u16[6] = b.u16[5]; t.u16[5] = b.u16[3]; t.u16[4] = b.u16[1]; t.u16[3] = a.u16[7]; t.u16[2] = a.u16[5]; t.u16[1] = a.u16[3]; t.u16[0] = a.u16[1]; } else { t.u16[7] = a.u16[6]; t.u16[6] = a.u16[4]; t.u16[5] = a.u16[2]; t.u16[4] = a.u16[0]; t.u16[3] = b.u16[6]; t.u16[2] = b.u16[4]; t.u16[1] = b.u16[2]; t.u16[0] = b.u16[0]; } return t; } SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) : _c_v128_unzip_16(a, b, 0); } SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) : _c_v128_unzip_16(b, a, 1); } SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { c_v128 t; if (mode) { t.u32[3] = b.u32[3]; t.u32[2] = b.u32[1]; t.u32[1] = a.u32[3]; t.u32[0] = a.u32[1]; } else { t.u32[3] = a.u32[2]; t.u32[2] = a.u32[0]; t.u32[1] = b.u32[2]; t.u32[0] = b.u32[0]; } return t; } SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) : _c_v128_unzip_32(a, b, 0); } SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0) : _c_v128_unzip_32(b, a, 1); } SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); } SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), c_v64_unpacklo_u8_s16(a.v64[0])); } SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), c_v64_unpacklo_u8_s16(a.v64[1])); } SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); } SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), c_v64_unpacklo_s8_s16(a.v64[0])); } SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), c_v64_unpacklo_s8_s16(a.v64[1])); } SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), c_v64_pack_s32_s16(b.v64[1], b.v64[0])); } SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), c_v64_pack_s32_u16(b.v64[1], b.v64[0])); } SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), c_v64_pack_s16_u8(b.v64[1], b.v64[0])); } SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), c_v64_pack_s16_s8(b.v64[1], b.v64[0])); } SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); } SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); } SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), c_v64_unpacklo_u16_s32(a.v64[0])); } SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), c_v64_unpacklo_s16_s32(a.v64[0])); } SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), 
c_v64_unpacklo_u16_s32(a.v64[1])); } SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), c_v64_unpacklo_s16_s32(a.v64[1])); } SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { c_v128 t; int c; for (c = 0; c < 16; c++) t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) : pattern.u8[c] & 15]; return t; } SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), c_v64_cmpgt_s8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), c_v64_cmplt_s8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), c_v64_cmpeq_8(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), c_v64_cmpgt_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), c_v64_cmplt_s16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), c_v64_cmpeq_16(a.v64[0], b.v64[0])); } SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { c_v128 t; int c; for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); return t; } SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { c_v128 t; int c; for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); return t; } SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { c_v128 t; int c; for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); return t; } SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { if (n == 0) return a; if (n < 8) return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), c_v64_shr_n_byte(a.v64[0], 8 - n)), c_v64_shl_n_byte(a.v64[0], n)); else return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); } SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { if (n == 0) return a; if (n < 8) return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), c_v64_or(c_v64_shr_n_byte(a.v64[0], n), c_v64_shl_n_byte(a.v64[1], 8 - n))); else return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); } SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { if (SIMD_CHECK && c > 15) { fprintf(stderr, "Error: undefined alignment %d\n", c); abort(); } return c ? 
c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) : b; } SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), c_v64_shr_u16(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), c_v64_shr_s16(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), c_v64_shr_u32(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), c_v64_shr_s32(a.v64[0], c)); } SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { a.v64[1].u64 <<= c; a.v64[0].u64 <<= c; return c_v128_from_v64(a.v64[1], a.v64[0]); } SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { a.v64[1].u64 >>= c; a.v64[0].u64 >>= c; return c_v128_from_v64(a.v64[1], a.v64[0]); } SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { a.v64[1].s64 >>= c; a.v64[0].s64 >>= c; return c_v128_from_v64(a.v64[1], a.v64[0]); } SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) { return c_v128_shl_8(a, n); } SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { return c_v128_shl_16(a, n); } SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { return c_v128_shl_32(a, n); } SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { return c_v128_shl_64(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { return c_v128_shr_u8(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { return c_v128_shr_u16(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { return c_v128_shr_u32(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { return c_v128_shr_u64(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { return c_v128_shr_s8(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { return c_v128_shr_s16(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { return c_v128_shr_s32(a, n); } SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { return c_v128_shr_s64(a, n); } typedef uint32_t c_sad128_internal_u16; SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_sad_u16_sum(). */ SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s, c_v128 a, c_v128 b) { int c; for (c = 0; c < 8; c++) s += a.u16[c] > b.u16[c] ? 
a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; return s; } SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; } typedef uint64_t c_ssd128_internal_s16; SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_s16_sum(). */ SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, c_v128 a, c_v128 b) { int c; for (c = 0; c < 8; c++) s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * (int32_t)(int16_t)(a.s16[c] - b.s16[c]); return s; } SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } #endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ aom-3.12.1/aom_dsp/simd/v128_intrinsics_x86.h000066400000000000000000000506431477627663500206120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ #define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ #include #include "aom_dsp/simd/v64_intrinsics_x86.h" typedef __m128i v128; SIMD_INLINE uint32_t v128_low_u32(v128 a) { return (uint32_t)_mm_cvtsi128_si32(a); } SIMD_INLINE v64 v128_low_v64(v128 a) { return _mm_unpacklo_epi64(a, v64_zero()); } SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return _mm_unpacklo_epi64(b, a); } SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { return v128_from_v64(v64_from_64(a), v64_from_64(b)); } SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { return _mm_set_epi32((int)a, (int)b, (int)c, (int)d); } SIMD_INLINE v128 v128_load_aligned(const void *p) { return _mm_load_si128((__m128i *)p); } SIMD_INLINE v128 v128_load_unaligned(const void *p) { #if defined(__SSSE3__) return _mm_lddqu_si128((__m128i *)p); #else return _mm_loadu_si128((__m128i *)p); #endif } SIMD_INLINE void v128_store_aligned(void *p, v128 a) { _mm_store_si128((__m128i *)p, a); } SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { _mm_storeu_si128((__m128i *)p, a); } // The following function requires an immediate. // Some compilers will check this during optimisation, others wont. #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) #if defined(__SSSE3__) SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) { return c ? _mm_alignr_epi8(a, b, c) : b; } #else #define v128_align(a, b, c) \ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) #endif #else #if defined(__SSSE3__) #define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b)) #else #define v128_align(a, b, c) \ ((c) ? 
_mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) #endif #endif SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); } SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); } SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); } SIMD_INLINE v128 v128_dup_64(uint64_t x) { // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32), (int32_t)x); } SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } SIMD_INLINE v128 v128_padd_s16(v128 a) { return _mm_madd_epi16(a, _mm_set1_epi16(1)); } SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } SIMD_INLINE v128 v128_abs_s16(v128 a) { #if defined(__SSSE3__) return _mm_abs_epi16(a); #else return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); #endif } SIMD_INLINE v128 v128_abs_s8(v128 a) { #if defined(__SSSE3__) return _mm_abs_epi8(a); #else v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); #endif } SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return _mm_unpacklo_epi8(b, a); } SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return _mm_unpackhi_epi8(b, a); } SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return _mm_unpacklo_epi16(b, a); } SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return _mm_unpackhi_epi16(b, a); } SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return _mm_unpacklo_epi32(b, a); } SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return _mm_unpackhi_epi32(b, a); } SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return _mm_unpacklo_epi64(b, a); } SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return _mm_unpackhi_epi64(b, a); } SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); } SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { #if defined(__SSSE3__) #ifdef __x86_64__ v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); #else v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); #endif return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), _mm_shuffle_epi8(a, order)); #else return 
v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); #endif } SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); } SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { #if defined(__SSSE3__) #ifdef __x86_64__ v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); #else v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); #endif return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), _mm_shuffle_epi8(a, order)); #else return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); #endif } SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { return _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); } SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { return _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); } SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return _mm_unpacklo_epi8(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { return _mm_unpacklo_epi8(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { return _mm_unpackhi_epi8(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); } SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { return _mm_packs_epi32(b, a); } SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_packus_epi32(b, a); #else return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); #endif } SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { return _mm_packus_epi16(b, a); } SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { return _mm_packs_epi16(b, a); } SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return _mm_unpacklo_epi16(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); } SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { return _mm_unpacklo_epi16(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); } SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { return _mm_unpackhi_epi16(a, _mm_setzero_si128()); } SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); } SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { #if defined(__SSSE3__) return _mm_shuffle_epi8(x, pattern); #else v128 output; unsigned char *input = (unsigned char *)&x; unsigned char *index = (unsigned char *)&pattern; unsigned char *selected = (unsigned char *)&output; int counter; for (counter = 0; counter < 16; counter++) { selected[counter] = input[index[counter] & 15]; } return output; #endif } SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); v128 t = v128_add_32(t1, t2); t = v128_add_32(t, _mm_srli_si128(t, 8)); t = v128_add_32(t, _mm_srli_si128(t, 4)); return (int32_t)v128_low_u32(t); } SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { v128 r = _mm_madd_epi16(a, b); #if defined(__SSE4_1__) && defined(__x86_64__) v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), 
_mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); #else return (int64_t)_mm_cvtsi128_si32(r) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); #endif } SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); } typedef v128 sad128_internal; SIMD_INLINE sad128_internal v128_sad_u8_init(void) { return _mm_setzero_si128(); } /* Implementation dependent return value. Result must be finalised with v128_sad_sum(). The result for more than 32 v128_sad_u8() calls is undefined. */ SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { return _mm_add_epi64(s, _mm_sad_epu8(a, b)); } SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); } typedef int32_t ssd128_internal; SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_sum(). */ SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { v128 z = _mm_setzero_si128(); v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z)); v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z)); v128 rl = _mm_madd_epi16(l, l); v128 rh = _mm_madd_epi16(h, h); v128 r = _mm_add_epi32(rl, rh); r = _mm_add_epi32(r, _mm_srli_si128(r, 8)); r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); return s + _mm_cvtsi128_si32(r); } SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; } SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { v64 lo_bits = v64_mullo_s16(a, b); v64 hi_bits = v64_mulhi_s16(a, b); return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), v64_ziplo_16(hi_bits, lo_bits)); } SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { return _mm_mullo_epi16(a, b); } SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { return _mm_mulhi_epi16(a, b); } SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_mullo_epi32(a, b); #else return _mm_unpacklo_epi32( _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), _mm_shuffle_epi32( _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); #endif } SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { v128 r = v128_mullo_s32(a, b); return (int64_t)_mm_cvtsi128_si32(r) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); } SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { #if defined(__SSSE3__) return _mm_maddubs_epi16(a, b); #else return _mm_packs_epi32( _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); #endif } SIMD_INLINE v128 v128_padd_u8(v128 a) { return v128_madd_us8(a, _mm_set1_epi8(1)); } SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } SIMD_INLINE v128 
v128_rdavg_u8(v128 a, v128 b) { return _mm_sub_epi8(_mm_avg_epu8(a, b), _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); } SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { return _mm_sub_epi16(_mm_avg_epu16(a, b), _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); } SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_min_epi8(a, b); #else v128 mask = _mm_cmplt_epi8(a, b); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { #if defined(__SSE4_1__) return _mm_blendv_epi8(a, b, c); #else c = _mm_cmplt_epi8(c, v128_zero()); return v128_or(v128_and(b, c), v128_andn(a, c)); #endif } SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_max_epi8(a, b); #else v128 mask = _mm_cmplt_epi8(b, a); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_min_epi32(a, b); #else v128 mask = _mm_cmplt_epi32(a, b); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_max_epi32(a, b); #else v128 mask = _mm_cmplt_epi32(b, a); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { return _mm_cmpgt_epi16(a, b); } SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { return _mm_cmplt_epi16(a, b); } SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { return _mm_cmpgt_epi32(a, b); } SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { return _mm_cmplt_epi32(a, b); } SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)), _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { __m128i x = _mm_cvtsi32_si128((int)(c + 8)); return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); } SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 
v128_shr_u32(v128 a, unsigned int c) { return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { // _mm_sra_epi64 is missing in gcc? return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c), (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c)); // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c)); } /* These intrinsics require immediate values, so we must use #defines to enforce that. */ #define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) #define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) #define v128_shl_n_8(a, c) \ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c)) #define v128_shr_n_u8(a, c) \ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c)) #define v128_shr_n_s8(a, c) \ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) #define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) #define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) #define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) #define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) #define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) #define v128_shr_n_s64(a, c) \ v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? typedef v128 sad128_internal_u16; SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_sad_u16_sum(). */ SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, v128 b) { #if defined(__SSE4_1__) v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); #else v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), v128_xor(b, v128_dup_16(32768))); t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), v128_or(v128_and(a, t), v128_andn(b, t))); #endif return v128_add_32( s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); } SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + v128_low_u32(v128_shr_n_byte(s, 8)) + v128_low_u32(v128_shr_n_byte(s, 12)); } typedef v128 ssd128_internal_s16; SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_ssd_s16_sum(). */ SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, v128 b) { v128 d = v128_sub_16(a, b); d = v128_madd_s16(d, d); return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()), _mm_unpacklo_epi32(d, v128_zero()))); } SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); } #endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ aom-3.12.1/aom_dsp/simd/v256_intrinsics.h000066400000000000000000000334501477627663500201040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ #include <stdio.h> #include <stdlib.h> #include <string.h> #include "aom_dsp/simd/v256_intrinsics_c.h" #include "aom_dsp/simd/v128_intrinsics.h" #include "aom_dsp/simd/v64_intrinsics.h" /* Fallback to plain, unoptimised C. */ typedef c_v256 v256; SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); } SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { return c_v256_from_v128(hi, lo); } SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { return c_v256_from_64(a, b, c, d); } SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { return c_v256_from_v64(a, b, c, d); } SIMD_INLINE v256 v256_load_unaligned(const void *p) { return c_v256_load_unaligned(p); } SIMD_INLINE v256 v256_load_aligned(const void *p) { return c_v256_load_aligned(p); } SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { c_v256_store_unaligned(p, a); } SIMD_INLINE void v256_store_aligned(void *p, v256 a) { c_v256_store_aligned(p, a); } SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) { return c_v256_align(a, b, c); } SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); } SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); } SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) { return c_v256_sad_u8_init(); } SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) { return c_v256_sad_u8(s, a, b); } SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) { return c_v256_sad_u8_sum(s); } SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) { return c_v256_ssd_u8_init(); } SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) { return c_v256_ssd_u8(s, a, b); } SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) { return c_v256_ssd_u8_sum(s); } SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) { return c_v256_ssd_s16_init(); } SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a, v256 b) { return c_v256_ssd_s16(s, a, b); } SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return c_v256_ssd_s16_sum(s); } SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { return c_v256_dotp_su8(a, b); } SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { return c_v256_dotp_s16(a, b); } SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { return c_v256_dotp_s32(a, b); } SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
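/* Editorial usage sketch (added in editing; not part of the upstream libaom
   header). The SAD/SSD intrinsics above follow an init / accumulate /
   finalise pattern. A minimal sketch, assuming `src` and `ref` each point to
   at least 32 readable bytes; the helper name v256_example_sad_u8_32 is
   hypothetical and only illustrates the calling convention. */
SIMD_INLINE uint32_t v256_example_sad_u8_32(const uint8_t *src,
                                            const uint8_t *ref) {
  /* Start a fresh accumulator. */
  c_sad256_internal acc = v256_sad_u8_init();
  /* Accumulate one 32-byte block; call repeatedly for larger regions,
     respecting the per-implementation limit on the number of calls. */
  acc = v256_sad_u8(acc, v256_load_unaligned(src), v256_load_unaligned(ref));
  /* Finalise into a plain 32-bit sum of absolute differences. */
  return v256_sad_u8_sum(acc);
}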
SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { return c_v256_mullo_s16(a, b); } SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { return c_v256_mulhi_s16(a, b); } SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { return c_v256_mullo_s32(a, b); } SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { return c_v256_blend_8(a, b, c); } SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { return c_v256_rdavg_u16(a, b); } SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 
b) { return c_v256_ziplo_32(a, b); } SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { return c_v256_ziplo_128(a, b); } SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { return c_v256_ziphi_128(a, b); } SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { return c_v256_unziplo_8(a, b); } SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { return c_v256_unziphi_8(a, b); } SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { return c_v256_unziplo_16(a, b); } SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { return c_v256_unziphi_16(a, b); } SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { return c_v256_unziplo_32(a, b); } SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { return c_v256_unziphi_32(a, b); } SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { return c_v256_unziplo_64(a, b); } SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { return c_v256_unziphi_64(a, b); } SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { return c_v256_unpacklo_u8_s16(a); } SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { return c_v256_unpackhi_u8_s16(a); } SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { return c_v256_unpacklo_s8_s16(a); } SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { return c_v256_unpackhi_s8_s16(a); } SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { return c_v256_pack_s32_s16(a, b); } SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { return c_v256_pack_s32_u16(a, b); } SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { return c_v256_pack_s16_u8(a, b); } SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { return c_v256_pack_s16_s8(a, b); } SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { return c_v256_unpack_u16_s32(a); } SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { return c_v256_unpack_s16_s32(a); } SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { return c_v256_unpacklo_u16_s32(a); } SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { return c_v256_unpacklo_s16_s32(a); } SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { return c_v256_unpackhi_u16_s32(a); } SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { return c_v256_unpackhi_s16_s32(a); } SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { return c_v256_shuffle_8(a, pattern); } SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { return c_v256_wideshuffle_8(a, b, pattern); } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { return c_v256_pshuffle_8(a, pattern); } SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { return c_v256_cmpgt_s16(a, b); } SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { return c_v256_cmplt_s16(a, b); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { 
return c_v256_cmpeq_32(a, b); } SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { return c_v256_cmpgt_s32(a, b); } SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { return c_v256_cmplt_s32(a, b); } SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { return c_v256_shl_8(a, c); } SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { return c_v256_shr_u8(a, c); } SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { return c_v256_shr_s8(a, c); } SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { return c_v256_shl_16(a, c); } SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { return c_v256_shr_u16(a, c); } SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { return c_v256_shr_s16(a, c); } SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { return c_v256_shl_32(a, c); } SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { return c_v256_shr_u32(a, c); } SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { return c_v256_shr_s32(a, c); } SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { return c_v256_shl_64(a, c); } SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { return c_v256_shr_u64(a, c); } SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { return c_v256_shr_s64(a, c); } SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { return c_v256_shr_n_byte(a, n); } SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { return c_v256_shl_n_byte(a, n); } SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { return c_v256_shl_n_8(a, n); } SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { return c_v256_shl_n_16(a, n); } SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { return c_v256_shl_n_32(a, n); } SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { return c_v256_shl_n_64(a, n); } SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { return c_v256_shr_n_u8(a, n); } SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { return c_v256_shr_n_u16(a, n); } SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { return c_v256_shr_n_u32(a, n); } SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { return c_v256_shr_n_u64(a, n); } SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { return c_v256_shr_n_s8(a, n); } SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { return c_v256_shr_n_s16(a, n); } SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { return c_v256_shr_n_s32(a, n); } SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { return c_v256_shr_n_s64(a, n); } SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { return c_v256_shr_n_word(a, n); } SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { return c_v256_shl_n_word(a, n); } typedef uint32_t sad256_internal_u16; SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return c_v256_sad_u16_init(); } SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, v256 b) { return c_v256_sad_u16(s, a, b); } SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { return c_v256_sad_u16_sum(s); } #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ aom-3.12.1/aom_dsp/simd/v256_intrinsics_c.h000066400000000000000000000744601477627663500204140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ #include <stdio.h> #include <stdlib.h> #include <string.h> #include "config/aom_config.h" #include "aom_dsp/simd/v128_intrinsics_c.h" typedef union { uint8_t u8[32]; uint16_t u16[16]; uint32_t u32[8]; uint64_t u64[4]; int8_t s8[32]; int16_t s16[16]; int32_t s32[8]; int64_t s64[4]; c_v64 v64[4]; c_v128 v128[2]; } c_v256; SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; } SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { c_v256 t; t.v128[1] = hi; t.v128[0] = lo; return t; } SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { c_v256 t; t.u64[3] = a; t.u64[2] = b; t.u64[1] = c; t.u64[0] = d; return t; } SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) { c_v256 t; t.u64[3] = a.u64; t.u64[2] = b.u64; t.u64[1] = c.u64; t.u64[0] = d.u64; return t; } SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { c_v256 t; memcpy(&t, p, 32); return t; } SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) { if (SIMD_CHECK && (uintptr_t)p & 31) { fprintf(stderr, "Error: unaligned v256 load at %p\n", p); abort(); } return c_v256_load_unaligned(p); } SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { memcpy(p, &a, 32); } SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) { if (SIMD_CHECK && (uintptr_t)p & 31) { fprintf(stderr, "Error: unaligned v256 store at %p\n", p); abort(); } c_v256_store_unaligned(p, a); } SIMD_INLINE c_v256 c_v256_zero(void) { c_v256 t; t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; return t; } SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) { c_v256 t; t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x); return t; } SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) { c_v256 t; t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x); return t; } SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { c_v256 t; t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x); return t; } SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) { c_v256 t; t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x; return t; } SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) { return c_v128_dotp_su8(a.v128[1], b.v128[1]) + c_v128_dotp_su8(a.v128[0], b.v128[0]); } SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { return c_v128_dotp_s16(a.v128[1], b.v128[1]) + c_v128_dotp_s16(a.v128[0], b.v128[0]); } SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) { return c_v128_dotp_s32(a.v128[1], b.v128[1]) + c_v128_dotp_s32(a.v128[0], b.v128[0]); } SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); } typedef struct { uint32_t val; int count; } c_sad256_internal; SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) { c_sad256_internal t; t.val = t.count = 0; return t; } /* Implementation dependent return value. Result must be finalised with v256_sad_u8_sum(). The result for more than 16 v256_sad_u8() calls is undefined. */ SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, c_v256 b) { int c; for (c = 0; c < 32; c++) s.val += a.u8[c] > b.u8[c] ?
a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; s.count++; if (SIMD_CHECK && s.count > 32) { fprintf(stderr, "Error: sad called 32 times returning an undefined result\n"); abort(); } return s; } SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; } typedef uint32_t c_ssd256_internal; SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_u8_sum(). */ SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, c_v256 b) { int c; for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); return s; } SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), c_v128_or(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), c_v128_xor(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), c_v128_and(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), c_v128_andn(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), c_v128_add_8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), c_v128_add_16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), c_v128_sadd_s8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), c_v128_sadd_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), c_v128_sadd_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), c_v128_add_32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), c_v128_add_64(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), c_v128_sub_64(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { c_v256 t; for (int i = 0; i < 16; i++) t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; return t; } SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { c_v256 t; t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13]; t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; return t; } SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), c_v128_sub_8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), c_v128_ssub_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, 
c_v256 b) { return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), c_v128_ssub_s8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), c_v128_sub_16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), c_v128_ssub_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), c_v128_ssub_u16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), c_v128_sub_32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); } SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); } SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { c_v128 lo_bits = c_v128_mullo_s16(a, b); c_v128 hi_bits = c_v128_mulhi_s16(a, b); return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), c_v128_ziplo_16(hi_bits, lo_bits)); } SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), c_v128_mullo_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), c_v128_mulhi_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), c_v128_mullo_s32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), c_v128_madd_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), c_v128_madd_us8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), c_v128_avg_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), c_v128_rdavg_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), c_v128_rdavg_u16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), c_v128_avg_u16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), c_v128_min_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), c_v128_max_u8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), c_v128_min_s8(a.v128[0], b.v128[0])); } SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | ((a.s8[19] < 0) << 
19) | ((a.s8[18] < 0) << 18) | ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | ((a.s8[0] < 0) << 0); } SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { c_v256 t; for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; return t; } SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), c_v128_max_s8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), c_v128_min_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), c_v128_max_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), c_v128_min_s32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), c_v128_max_s32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), c_v128_ziplo_8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), c_v128_ziplo_8(a.v128[1], b.v128[1])); } SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), c_v128_ziplo_16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), c_v128_ziplo_16(a.v128[1], b.v128[1])); } SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), c_v128_ziplo_32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), c_v128_ziplo_32(a.v128[1], b.v128[1])); } SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), c_v128_ziplo_64(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), c_v128_ziplo_64(a.v128[1], b.v128[1])); } SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { return c_v256_from_v128(a.v128[0], b.v128[0]); } SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { return c_v256_from_v128(a.v128[1], b.v128[1]); } SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); } SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); } SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); } SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { c_v256 t; int i; if (mode) { for (i = 0; i < 16; i++) { t.u8[i] = a.u8[i * 2 + 1]; t.u8[i + 16] = b.u8[i * 2 + 1]; } } else { for (i = 0; i < 16; i++) { t.u8[i] = b.u8[i * 2]; t.u8[i + 16] 
= a.u8[i * 2]; } } return t; } SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) : _c_v256_unzip_8(a, b, 0); } SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) : _c_v256_unzip_8(b, a, 1); } SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { c_v256 t; int i; if (mode) { for (i = 0; i < 8; i++) { t.u16[i] = a.u16[i * 2 + 1]; t.u16[i + 8] = b.u16[i * 2 + 1]; } } else { for (i = 0; i < 8; i++) { t.u16[i] = b.u16[i * 2]; t.u16[i + 8] = a.u16[i * 2]; } } return t; } SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) : _c_v256_unzip_16(a, b, 0); } SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) : _c_v256_unzip_16(b, a, 1); } SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { c_v256 t; if (mode) { t.u32[7] = b.u32[7]; t.u32[6] = b.u32[5]; t.u32[5] = b.u32[3]; t.u32[4] = b.u32[1]; t.u32[3] = a.u32[7]; t.u32[2] = a.u32[5]; t.u32[1] = a.u32[3]; t.u32[0] = a.u32[1]; } else { t.u32[7] = a.u32[6]; t.u32[6] = a.u32[4]; t.u32[5] = a.u32[2]; t.u32[4] = a.u32[0]; t.u32[3] = b.u32[6]; t.u32[2] = b.u32[4]; t.u32[1] = b.u32[2]; t.u32[0] = b.u32[0]; } return t; } SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) : _c_v256_unzip_32(a, b, 0); } SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) : _c_v256_unzip_32(b, a, 1); } SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { c_v256 t; if (mode) { t.u64[3] = b.u64[3]; t.u64[2] = b.u64[1]; t.u64[1] = a.u64[3]; t.u64[0] = a.u64[1]; } else { t.u64[3] = a.u64[2]; t.u64[2] = a.u64[0]; t.u64[1] = b.u64[2]; t.u64[0] = b.u64[0]; } return t; } SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) : _c_v256_unzip_64(a, b, 0); } SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_64(b, a, 0) : _c_v256_unzip_64(b, a, 1); } SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); } SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]), c_v128_unpacklo_u8_s16(a.v128[0])); } SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]), c_v128_unpacklo_u8_s16(a.v128[1])); } SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) { return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a)); } SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]), c_v128_unpacklo_s8_s16(a.v128[0])); } SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]), c_v128_unpacklo_s8_s16(a.v128[1])); } SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]), c_v128_pack_s32_s16(b.v128[1], b.v128[0])); } SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]), c_v128_pack_s32_u16(b.v128[1], b.v128[0])); } SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), c_v128_pack_s16_u8(b.v128[1], b.v128[0])); } SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]), c_v128_pack_s16_s8(b.v128[1], b.v128[0])); } SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) { return c_v256_from_v128(c_v128_unpackhi_u16_s32(a), c_v128_unpacklo_u16_s32(a)); } SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) { return c_v256_from_v128(c_v128_unpackhi_s16_s32(a), c_v128_unpacklo_s16_s32(a)); } SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]), c_v128_unpacklo_u16_s32(a.v128[0])); } SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]), c_v128_unpacklo_s16_s32(a.v128[0])); } SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]), c_v128_unpacklo_u16_s32(a.v128[1])); } SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]), c_v128_unpacklo_s16_s32(a.v128[1])); } SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { c_v256 t; int c; for (c = 0; c < 32; c++) t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) : pattern.u8[c] & 31]; return t; } SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) { c_v256 t; int c; for (c = 0; c < 32; c++) t.u8[c] = (pattern.u8[c] < 32 ? b.u8 : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) : pattern.u8[c] & 31]; return t; } // Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { return c_v256_from_v128( c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); } SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), c_v128_cmpgt_s8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), c_v128_cmplt_s8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), c_v128_cmpeq_8(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), c_v128_cmpgt_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), c_v128_cmplt_s16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), c_v128_cmpeq_16(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), c_v128_cmpgt_s32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), c_v128_cmplt_s32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), c_v128_cmpeq_32(a.v128[0], b.v128[0])); } SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { if (n == 0) return a; if (n < 16) return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), c_v128_shr_n_byte(a.v128[0], 16 - n)), c_v128_shl_n_byte(a.v128[0], n)); else if (n > 16) return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), c_v128_zero()); else return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); } SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { if (n == 0) return a; if (n < 16) return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), c_v128_or(c_v128_shr_n_byte(a.v128[0], n), c_v128_shl_n_byte(a.v128[1], 16 - n))); else if (n > 16) return c_v256_from_v128(c_v128_zero(), c_v128_shr_n_byte(a.v128[1], n - 16)); else return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); } SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { if (SIMD_CHECK && c > 31) { fprintf(stderr, "Error: undefined alignment %d\n", c); abort(); } return c ? 
c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) : b; } SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), c_v128_shl_8(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), c_v128_shr_u8(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), c_v128_shr_s8(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), c_v128_shl_16(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), c_v128_shr_u16(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), c_v128_shr_s16(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), c_v128_shl_32(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), c_v128_shr_u32(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), c_v128_shr_s32(a.v128[0], c)); } SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) { c_v256 t; if (SIMD_CHECK && n > 63) { fprintf(stderr, "Error: undefined s64 shift right %d\n", n); abort(); } t.s64[3] = a.s64[3] >> n; t.s64[2] = a.s64[2] >> n; t.s64[1] = a.s64[1] >> n; t.s64[0] = a.s64[0] >> n; return t; } SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) { c_v256 t; if (SIMD_CHECK && n > 63) { fprintf(stderr, "Error: undefined s64 shift right %d\n", n); abort(); } t.u64[3] = a.u64[3] >> n; t.u64[2] = a.u64[2] >> n; t.u64[1] = a.u64[1] >> n; t.u64[0] = a.u64[0] >> n; return t; } SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) { c_v256 t; if (SIMD_CHECK && n > 63) { fprintf(stderr, "Error: undefined s64 shift right %d\n", n); abort(); } t.u64[3] = a.u64[3] << n; t.u64[2] = a.u64[2] << n; t.u64[1] = a.u64[1] << n; t.u64[0] = a.u64[0] << n; return t; } SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { return c_v256_shl_8(a, n); } SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { return c_v256_shl_16(a, n); } SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { return c_v256_shl_32(a, n); } SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) { return c_v256_shl_64(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { return c_v256_shr_u8(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { return c_v256_shr_u16(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { return c_v256_shr_u32(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) { return c_v256_shr_u64(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { return c_v256_shr_s8(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { return c_v256_shr_s16(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { return c_v256_shr_s32(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) { return c_v256_shr_s64(a, n); } SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { return c_v256_shr_n_byte(a, 2 * n); } SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned 
int n) { return c_v256_shl_n_byte(a, 2 * n); } typedef uint32_t c_sad256_internal_u16; SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with v256_sad_u16_sum(). */ SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, c_v256 a, c_v256 b) { int c; for (c = 0; c < 16; c++) s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; return s; } SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } typedef uint64_t c_ssd256_internal_s16; SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_s16_sum(). */ SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, c_v256 a, c_v256 b) { int c; for (c = 0; c < 16; c++) s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * (int32_t)(int16_t)(a.s16[c] - b.s16[c]); return s; } SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ aom-3.12.1/aom_dsp/simd/v256_intrinsics_v128.h000066400000000000000000000664241477627663500206730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ #include "config/aom_config.h" #if HAVE_NEON #error "Do not use this file for Neon" #endif #if HAVE_SSE2 #include "aom_dsp/simd/v128_intrinsics_x86.h" #else #include "aom_dsp/simd/v128_intrinsics.h" #endif typedef struct { v128 val[2]; } v256; SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { v256 t; t.val[1] = hi; t.val[0] = lo; return t; } SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); } SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); } SIMD_INLINE v256 v256_load_unaligned(const void *p) { return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), v128_load_unaligned(p)); } SIMD_INLINE v256 v256_load_aligned(const void *p) { return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), v128_load_aligned(p)); } SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { v128_store_unaligned(p, a.val[0]); v128_store_unaligned((uint8_t *)p + 16, a.val[1]); } SIMD_INLINE void v256_store_aligned(void *p, v256 a) { v128_store_aligned(p, a.val[0]); v128_store_aligned((uint8_t *)p + 16, a.val[1]); } SIMD_INLINE v256 v256_zero(void) { return v256_from_v128(v128_zero(), v128_zero()); } SIMD_INLINE v256 v256_dup_8(uint8_t x) { v128 t = v128_dup_8(x); return 
v256_from_v128(t, t); } SIMD_INLINE v256 v256_dup_16(uint16_t x) { v128 t = v128_dup_16(x); return v256_from_v128(t, t); } SIMD_INLINE v256 v256_dup_32(uint32_t x) { v128 t = v128_dup_32(x); return v256_from_v128(t, t); } SIMD_INLINE v256 v256_dup_64(uint64_t x) { v128 t = v128_dup_64(x); return v256_from_v128(t, t); } SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); } SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); } SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); } SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); } typedef struct { sad128_internal val[2]; } sad256_internal; SIMD_INLINE sad256_internal v256_sad_u8_init(void) { sad256_internal t; t.val[1] = v128_sad_u8_init(); t.val[0] = v128_sad_u8_init(); return t; } /* Implementation dependent return value. Result must be finalised with v256_sad_u8_sum(). The result for more than 16 v256_sad_u8() calls is undefined. */ SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { sad256_internal t; t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); } typedef struct { ssd128_internal val[2]; } ssd256_internal; SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { ssd256_internal t; t.val[1] = v128_ssd_u8_init(); t.val[0] = v128_ssd_u8_init(); return t; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_u8_sum(). 
*/ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { ssd256_internal t; t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); } SIMD_INLINE v256 v256_or(v256 a, v256 b) { return v256_from_v128(v128_or(a.val[1], b.val[1]), v128_or(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return v256_from_v128(v128_xor(a.val[1], b.val[1]), v128_xor(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_and(v256 a, v256 b) { return v256_from_v128(v128_and(a.val[1], b.val[1]), v128_and(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return v256_from_v128(v128_andn(a.val[1], b.val[1]), v128_andn(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return v256_from_v128(v128_add_8(a.val[1], b.val[1]), v128_add_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return v256_from_v128(v128_add_16(a.val[1], b.val[1]), v128_add_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), v128_sadd_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), v128_sadd_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), v128_sadd_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return v256_from_v128(v128_add_32(a.val[1], b.val[1]), v128_add_32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return v256_from_v128(v128_add_64(a.val[1], b.val[1]), v128_add_64(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_padd_u8(v256 a) { return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); } SIMD_INLINE v256 v256_padd_s16(v256 a) { return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); } SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), v128_sub_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), v128_ssub_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), v128_ssub_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), v128_sub_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), v128_ssub_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), v128_ssub_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), v128_sub_32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), v128_sub_64(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_abs_s16(v256 a) { return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); } SIMD_INLINE v256 v256_abs_s8(v256 a) { return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); } SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { v128 lo_bits = v128_mullo_s16(a, b); v128 hi_bits = v128_mulhi_s16(a, b); return 
v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), v128_ziplo_16(hi_bits, lo_bits)); } SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), v128_mullo_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), v128_mulhi_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), v128_mullo_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), v128_madd_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), v128_madd_us8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), v128_avg_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), v128_rdavg_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), v128_rdavg_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), v128_avg_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), v128_min_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), v128_max_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), v128_min_s8(a.val[0], b.val[0])); } SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return (v128_movemask_8(v256_high_v128(a)) << 16) | v128_movemask_8(v256_low_v128(a)); } SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), v128_blend_8(a.val[0], b.val[0], c.val[0])); } SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), v128_max_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), v128_min_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), v128_max_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), v128_min_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), v128_max_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), v128_ziplo_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), v128_ziplo_8(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), v128_ziplo_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), v128_ziplo_16(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), v128_ziplo_32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_32(v256 
a, v256 b) { return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), v128_ziplo_32(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), v128_ziplo_64(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), v128_ziplo_64(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { return v256_from_v128(a.val[0], b.val[0]); } SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { return v256_from_v128(a.val[1], b.val[1]); } SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); } SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); } SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); } SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), v128_unziplo_8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), v128_unziphi_8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), v128_unziplo_16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), v128_unziphi_16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), v128_unziplo_32(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), v128_unziphi_32(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { #if HAVE_SSE2 return v256_from_v128( _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), _mm_castsi128_pd(a.val[1]), 0)), _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), _mm_castsi128_pd(b.val[1]), 0))); #else return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); #endif } SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { #if HAVE_SSE2 return v256_from_v128( _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), _mm_castsi128_pd(a.val[1]), 3)), _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), _mm_castsi128_pd(b.val[1]), 3))); #else return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); #endif } SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), v128_unpacklo_u8_s16(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), v128_unpacklo_u8_s16(a.val[1])); } SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); } SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), v128_unpacklo_s8_s16(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), v128_unpacklo_s8_s16(a.val[1])); } SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), 
v128_pack_s32_s16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), v128_pack_s32_u16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), v128_pack_s16_u8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), v128_pack_s16_s8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); } SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); } SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), v128_unpacklo_u16_s32(a.val[0])); } SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), v128_unpacklo_s16_s32(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), v128_unpacklo_u16_s32(a.val[1])); } SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), v128_unpacklo_s16_s32(a.val[1])); } SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), v128_cmpgt_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), v128_cmplt_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), v128_cmpeq_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), v128_cmpgt_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), v128_cmplt_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), v128_cmpeq_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), v128_cmpgt_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), v128_cmplt_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), v128_cmpeq_32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { v128 c16 = v128_dup_8(16); v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); v128 masklo = v128_cmplt_s8(pattern.val[0], c16); return v256_from_v128( v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); } SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { v128 c16 = v128_dup_8(16); v128 c32 = v128_dup_8(32); v128 c48 = v128_dup_8(48); v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); v256 r1 = v256_from_v128( v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), v128_shuffle_8(x.val[0], 
v128_sub_8(pattern.val[1], c32)), maskhi48), v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), masklo48)); v256 r2 = v256_from_v128( v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { return v256_from_v128( v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); } SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); } SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); } SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); } SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); } SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); } SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); } SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); } SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); } SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); } SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); } SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); } SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); } /* These intrinsics require immediate values, so we must use #defines to enforce that. */ #define v256_shl_n_byte(a, n) \ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ v128_shr_n_byte(a.val[0], 16 - (n))), \ v128_shl_n_byte(a.val[0], (n))) \ : v256_from_v128( \ (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ v128_zero())) #define v256_shr_n_byte(a, n) \ (n == 0 \ ? a \ : ((n) < 16 \ ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ v128_or(v128_shr_n_byte(a.val[0], n), \ v128_shl_n_byte(a.val[1], 16 - (n)))) \ : v256_from_v128( \ v128_zero(), \ (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))) #define v256_align(a, b, c) \ ((c) ? 
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) #define v256_shl_n_8(a, n) \ v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) #define v256_shl_n_16(a, n) \ v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) #define v256_shl_n_32(a, n) \ v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) #define v256_shl_n_64(a, n) \ v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) #define v256_shr_n_u8(a, n) \ v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) #define v256_shr_n_u16(a, n) \ v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) #define v256_shr_n_u32(a, n) \ v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) #define v256_shr_n_u64(a, n) \ v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) #define v256_shr_n_s8(a, n) \ v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) #define v256_shr_n_s16(a, n) \ v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) #define v256_shr_n_s32(a, n) \ v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) #define v256_shr_n_s64(a, n) \ v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) #define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) #define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) typedef struct { sad128_internal_u16 val[2]; } sad256_internal_u16; SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { sad256_internal_u16 t; t.val[1] = v128_sad_u16_init(); t.val[0] = v128_sad_u16_init(); return t; } /* Implementation dependent return value. Result must be finalised with v256_sad_u16_sum(). The result for more than 16 v256_sad_u16() calls is undefined. */ SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, v256 b) { sad256_internal_u16 t; t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); } typedef struct { ssd128_internal_s16 val[2]; } ssd256_internal_s16; SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { ssd256_internal_s16 t; t.val[1] = v128_ssd_s16_init(); t.val[0] = v128_ssd_s16_init(); return t; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_s16_sum(). */ SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, v256 b) { ssd256_internal_s16 t; t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]); t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); } #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ aom-3.12.1/aom_dsp/simd/v256_intrinsics_x86.h000066400000000000000000000636041477627663500206150ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ #define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ #if !defined(__AVX2__) #include "aom_dsp/simd/v256_intrinsics_v128.h" #else // The _m256i type seems to cause problems for g++'s mangling prior to // version 5, but adding -fabi-version=0 fixes this. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \ defined(__AVX2__) && defined(__cplusplus) #pragma GCC optimize "-fabi-version=0" #endif #include #include "aom_dsp/simd/v128_intrinsics_x86.h" typedef __m256i v256; SIMD_INLINE uint32_t v256_low_u32(v256 a) { return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0)); } SIMD_INLINE v64 v256_low_v64(v256 a) { return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); } SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); } SIMD_INLINE v128 v256_high_v128(v256 a) { return _mm256_extracti128_si256(a, 1); } SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { // gcc seems to be missing _mm256_set_m128i() return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1); } SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); } SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d); } SIMD_INLINE v256 v256_load_aligned(const void *p) { return _mm256_load_si256((const __m256i *)p); } SIMD_INLINE v256 v256_load_unaligned(const void *p) { return _mm256_loadu_si256((const __m256i *)p); } SIMD_INLINE void v256_store_aligned(void *p, v256 a) { _mm256_store_si256((__m256i *)p, a); } SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { _mm256_storeu_si256((__m256i *)p, a); } SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); } SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); } SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); } SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); } SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x((int64_t)x); } SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return _mm256_adds_epi16(a, b); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } SIMD_INLINE v256 v256_padd_u8(v256 a) { return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); } SIMD_INLINE v256 v256_padd_s16(v256 a) { return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); } SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return _mm256_subs_epi16(a, b); } SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return _mm256_subs_epu16(a, b); } SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } SIMD_INLINE 
v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } // AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit // lanes of lower or upper halves of a 256bit vector because the // unpack/pack intrinsics operate on the 256 bit input vector as 2 // independent 128 bit vectors. SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return _mm256_unpackhi_epi8( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return _mm256_unpackhi_epi16( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return _mm256_unpacklo_epi32( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return _mm256_unpackhi_epi32( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return _mm256_unpacklo_epi64( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return _mm256_unpackhi_epi64( _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { return _mm256_permute2x128_si256(a, b, 0x02); } SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { return _mm256_permute2x128_si256(a, b, 0x13); } SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); } SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); } SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); } SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); } SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); } SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _MM_SHUFFLE(3, 1, 3, 1))), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _MM_SHUFFLE(2, 0, 2, 0))), _MM_SHUFFLE(3, 1, 2, 0)); 
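/* A worked illustration of the lane reordering used by the zip functions
   above (values and lane labels are illustrative only):
   _MM_SHUFFLE(3, 1, 2, 0) asks _mm256_permute4x64_epi64 to reorder the four
   64-bit quadwords [q3 q2 q1 q0] into [q3 q1 q2 q0], i.e. quadwords 0 and 2
   land in the low 128-bit lane and quadwords 1 and 3 in the high lane.
   The per-lane unpack intrinsic applied afterwards then yields a full-width
   result; for v256_ziplo_8:

     low  lane: interleave of bytes 0..7  of b and a
     high lane: interleave of bytes 8..15 of b and a

   which is exactly the 256-bit "zip low" of the two inputs. */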
} SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 15)), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { return _mm256_permute4x64_epi64( _mm256_castpd_si256( _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { return _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { return _mm256_unpackhi_epi8( _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); } SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { return _mm256_srai_epi16( _mm256_unpacklo_epi8( a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), 8); } SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { return _mm256_srai_epi16( _mm256_unpackhi_epi8( a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), 8); } SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { return _mm256_cvtepu16_epi32(a); } SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { return _mm256_cvtepi16_epi32(a); } SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { return _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { return _mm256_srai_epi32( _mm256_unpacklo_epi16( a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), 16); } SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { return _mm256_unpackhi_epi16( _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { return _mm256_srai_epi32( _mm256_unpackhi_epi16( a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), 16); } SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { return _mm256_blendv_epi8( _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); } SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { v256 c32 = v256_dup_8(32); v256 p32 = v256_sub_8(pattern, c32); v256 r1 = _mm256_blendv_epi8( _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), _mm256_cmpgt_epi8(v256_dup_8(48), pattern)); v256 r2 = _mm256_blendv_epi8( _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), _mm256_shuffle_epi8( _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); 
return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern)); } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { return _mm256_shuffle_epi8(a, pattern); } SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); t1 = _mm256_add_epi32(t1, t2); v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), _mm256_extracti128_si256(t1, 1)); t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); return (int32_t)v128_low_u32(t); } SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { v256 r = _mm256_madd_epi16(a, b); #if defined(__x86_64__) v128 t; r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), _mm256_cvtepi32_epi64(v256_low_v128(r))); t = v256_low_v128(_mm256_add_epi64( r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); #else v128 l = v256_low_v128(r); v128 h = v256_high_v128(r); return (int64_t)_mm_cvtsi128_si32(l) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + (int64_t)_mm_cvtsi128_si32(h) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); #endif } SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { v256 r = _mm256_mullo_epi32(a, b); #if defined(__x86_64__) v128 t; r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), _mm256_cvtepi32_epi64(v256_low_v128(r))); t = v256_low_v128(_mm256_add_epi64( r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); #else v128 l = v256_low_v128(r); v128 h = v256_high_v128(r); return (int64_t)_mm_cvtsi128_si32(l) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + (int64_t)_mm_cvtsi128_si32(h) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); #endif } SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); v128 lo = v256_low_v128(t); v128 hi = v256_high_v128(t); lo = v128_add_32(lo, hi); return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); } typedef v256 sad256_internal; SIMD_INLINE sad256_internal v256_sad_u8_init(void) { return _mm256_setzero_si256(); } /* Implementation dependent return value. Result must be finalised with v256_sad_u8_sum(). The result for more than 32 v256_sad_u8() calls is undefined. */ SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); } SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); } typedef v256 ssd256_internal; SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { return _mm256_setzero_si256(); } /* Implementation dependent return value. Result must be finalised with * v256_ssd_u8_sum(). 
*/ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); v256 rl = _mm256_madd_epi16(l, l); v256 rh = _mm256_madd_epi16(h, h); v128 c = _mm_cvtsi32_si128(32); rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); return _mm256_add_epi64( s, _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); } SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); } SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { v128 lo_bits = v128_mullo_s16(a, b); v128 hi_bits = v128_mulhi_s16(a, b); return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), v128_ziplo_16(hi_bits, lo_bits)); } SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { return _mm256_mullo_epi16(a, b); } SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { return _mm256_mulhi_epi16(a, b); } SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { return _mm256_mullo_epi32(a, b); } SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return _mm256_madd_epi16(a, b); } SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return _mm256_maddubs_epi16(a, b); } SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return _mm256_sub_epi8( _mm256_avg_epu8(a, b), _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); } SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { return _mm256_sub_epi16( _mm256_avg_epu16(a, b), _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); } SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return (uint32_t)_mm256_movemask_epi8(a); } SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { return _mm256_blendv_epi8(a, b, c); } SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return _mm256_cmpgt_epi8(a, b); } SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return _mm256_cmpgt_epi8(b, a); } SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return _mm256_cmpeq_epi8(a, b); } SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { return _mm256_cmpgt_epi16(a, b); } SIMD_INLINE v256 
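/* A minimal usage sketch of the accumulator pattern used by v256_sad_u8()
   and v256_ssd_u8() above.  The src/ref pointers and strides are
   illustrative names, not part of this API, and per the comment above the
   result of more than 32 v256_sad_u8() accumulations is undefined:

     sad256_internal acc = v256_sad_u8_init();
     for (int row = 0; row < 16; row++) {
       v256 s = v256_load_unaligned(src + row * src_stride);
       v256 r = v256_load_unaligned(ref + row * ref_stride);
       acc = v256_sad_u8(acc, s, r);
     }
     uint32_t sad = v256_sad_u8_sum(acc);
*/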
v256_cmplt_s16(v256 a, v256 b) { return _mm256_cmpgt_epi16(b, a); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return _mm256_cmpeq_epi16(a, b); } SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { return _mm256_cmpgt_epi32(a, b); } SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { return _mm256_cmpgt_epi32(b, a); } SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return _mm256_cmpeq_epi32(a, b); } SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)), _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)), _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { __m128i x = _mm_cvtsi32_si128((int)(c + 8)); return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); } SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { #if defined(__AVX512VL__) return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c)); #else return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), v128_shr_s64(v256_low_v128(a), c)); #endif } /* These intrinsics require immediate values, so we must use #defines to enforce that. */ // _mm256_slli_si256 works on 128 bit lanes and can't be used #define v256_shl_n_byte(a, n) \ ((n) < 16 ? v256_from_v128( \ v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ v128_shl_n_byte(v256_low_v128(a), n)) \ : _mm256_inserti128_si256( \ _mm256_setzero_si256(), \ v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) // _mm256_srli_si256 works on 128 bit lanes and can't be used #define v256_shr_n_byte(a, n) \ ((n) < 16 \ ? _mm256_alignr_epi8( \ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \ : _mm256_inserti128_si256( \ _mm256_setzero_si256(), \ v128_shr_n_byte(v256_high_v128(a), (n)-16), 0))) // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used #define v256_align(a, b, c) \ ((c) ? 
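/* The _n_ shift forms below are macros because the underlying AVX2 shift
   intrinsics take the count as a compile-time immediate, so the count
   argument must be an integer constant expression; the function forms above
   accept a run-time count via _mm_cvtsi32_si128.  A small illustrative
   sketch (get_shift() is a hypothetical caller-provided function):

     v256 a = v256_shl_n_16(x, 2);          immediate form: 2 must be a literal
     unsigned int k = get_shift();
     v256 b = v256_shl_16(x, k);            run-time form
*/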
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) #define v256_shl_n_8(a, c) \ _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \ _mm256_slli_epi16(a, c)) #define v256_shr_n_u8(a, c) \ _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \ _mm256_srli_epi16(a, c)) #define v256_shr_n_s8(a, c) \ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) #define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) #define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) #define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) #define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) #define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) #define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) #define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) #define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) #define v256_shr_n_s64(a, c) \ v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? #define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) #define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) typedef v256 sad256_internal_u16; SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); } /* Implementation dependent return value. Result must be finalised with * v256_sad_u16_sum(). */ SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, v256 b) { #if defined(__SSE4_1__) v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); #else v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), v256_xor(b, v256_dup_16(32768))); t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), v256_or(v256_and(a, t), v256_andn(b, t))); #endif return v256_add_32( s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); } SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + v128_low_u32(v128_shr_n_byte(t, 8)) + v128_low_u32(v128_shr_n_byte(t, 12)); } typedef v256 ssd256_internal_s16; SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); } /* Implementation dependent return value. Result must be finalised with * v256_ssd_s16_sum(). */ SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, v256 b) { v256 d = v256_sub_16(a, b); d = v256_madd_s16(d, d); return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), _mm256_unpacklo_epi32(d, v256_zero()))); } SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); } #endif #endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ aom-3.12.1/aom_dsp/simd/v64_intrinsics.h000066400000000000000000000231671477627663500200250ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ #define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ #include #include #include "aom_dsp/simd/v64_intrinsics_c.h" /* Fallback to plain, unoptimised C. */ typedef c_v64 v64; SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); } SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); } SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); } SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); } SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { return c_v64_from_32(x, y); } SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); } SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); } SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { return c_v64_from_16(a, b, c, d); } SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { return c_u32_load_unaligned(p); } SIMD_INLINE uint32_t u32_load_aligned(const void *p) { return c_u32_load_aligned(p); } SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { c_u32_store_unaligned(p, a); } SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { c_u32_store_aligned(p, a); } SIMD_INLINE v64 v64_load_unaligned(const void *p) { return c_v64_load_unaligned(p); } SIMD_INLINE v64 v64_load_aligned(const void *p) { return c_v64_load_aligned(p); } SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { c_v64_store_unaligned(p, a); } SIMD_INLINE void v64_store_aligned(void *p, v64 a) { c_v64_store_aligned(p, a); } SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { return c_v64_align(a, b, c); } SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); } SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); } SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); } SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); } SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); } SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); } SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); } SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); } SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); } SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); } SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); } SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); } SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); } SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); } SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); } SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); } SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); } SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); } SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } SIMD_INLINE v64 
v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { return c_v64_pack_s32_s16(a, b); } SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { return c_v64_pack_s32_u16(a, b); } SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { return c_v64_pack_s16_u8(a, b); } SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { return c_v64_pack_s16_s8(a, b); } SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { return c_v64_unpacklo_u16_s32(a); } SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { return c_v64_unpacklo_s16_s32(a); } SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { return c_v64_unpackhi_u16_s32(a); } SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { return c_v64_unpackhi_s16_s32(a); } SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { return c_v64_shuffle_8(a, pattern); } SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) { return c_v64_sad_u8_init(); } SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) { return c_v64_sad_u8(s, a, b); } SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) { return c_v64_sad_u8_sum(s); } SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) { return c_v64_ssd_u8_init(); } SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) { return c_v64_ssd_u8(s, a, b); } SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) { return c_v64_ssd_u8_sum(s); } SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } SIMD_INLINE v64 
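/* In this fallback header every v64 operation simply forwards to its c_v64
   reference implementation, so no platform SIMD is required and results
   match the plain C code.  A small illustrative sketch (constant values
   chosen arbitrarily):

     v64 a = v64_from_16(1, 2, 3, 4);
     v64 b = v64_dup_16(3);
     v64 s = v64_add_16(a, b);        each 16-bit lane increased by 3
     uint64_t bits = v64_u64(s);      raw 64-bit contents of the result
*/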
v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { return c_v64_shr_u16(a, n); } SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { return c_v64_shr_s16(a, n); } SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { return c_v64_shr_u32(a, n); } SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { return c_v64_shr_s32(a, n); } SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { return c_v64_shr_n_byte(a, n); } SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { return c_v64_shl_n_byte(a, n); } SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return c_v64_shl_n_8(a, c); } SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return c_v64_shr_n_u8(a, c); } SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return c_v64_shr_n_s8(a, c); } SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return c_v64_shl_n_16(a, c); } SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { return c_v64_shr_n_u16(a, c); } SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { return c_v64_shr_n_s16(a, c); } SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return c_v64_shl_n_32(a, c); } SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { return c_v64_shr_n_u32(a, c); } SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { return c_v64_shr_n_s32(a, c); } #endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ aom-3.12.1/aom_dsp/simd/v64_intrinsics_c.h000066400000000000000000000556341477627663500203330ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ #define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ /* Note: This implements the intrinsics in plain, unoptimised C. Intended for reference, porting or debugging. 
*/ #include #include #include "config/aom_config.h" typedef union { uint8_t u8[8]; uint16_t u16[4]; uint32_t u32[2]; uint64_t u64; int8_t s8[8]; int16_t s16[4]; int32_t s32[2]; int64_t s64; } c_v64; SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[!!CONFIG_BIG_ENDIAN]; } SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { return a.u32[!CONFIG_BIG_ENDIAN]; } SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[!!CONFIG_BIG_ENDIAN]; } SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { return a.s32[!CONFIG_BIG_ENDIAN]; } SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { c_v64 t; t.u32[!CONFIG_BIG_ENDIAN] = x; t.u32[!!CONFIG_BIG_ENDIAN] = y; return t; } SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { c_v64 t; t.u64 = x; return t; } SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { c_v64 t; if (CONFIG_BIG_ENDIAN) { t.u16[0] = a; t.u16[1] = b; t.u16[2] = c; t.u16[3] = d; } else { t.u16[3] = a; t.u16[2] = b; t.u16[1] = c; t.u16[0] = d; } return t; } SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { uint32_t t; uint8_t *pp = (uint8_t *)p; uint8_t *q = (uint8_t *)&t; int c; for (c = 0; c < 4; c++) q[c] = pp[c]; return t; } SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { uint8_t *pp = (uint8_t *)p; uint8_t *q = (uint8_t *)&a; int c; for (c = 0; c < 4; c++) pp[c] = q[c]; } SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { if (SIMD_CHECK && (uintptr_t)p & 3) { fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); abort(); } return c_u32_load_unaligned(p); } SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { if (SIMD_CHECK && (uintptr_t)p & 3) { fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); abort(); } c_u32_store_unaligned(p, a); } SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { c_v64 t; uint8_t *pp = (uint8_t *)p; uint8_t *q = (uint8_t *)&t; int c; for (c = 0; c < 8; c++) q[c] = pp[c]; return t; } SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { if (SIMD_CHECK && (uintptr_t)p & 7) { fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); abort(); } return c_v64_load_unaligned(p); } SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { uint8_t *q = (uint8_t *)p; uint8_t *r = (uint8_t *)&a; int c; for (c = 0; c < 8; c++) q[c] = r[c]; } SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { if (SIMD_CHECK && (uintptr_t)p & 7) { fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); abort(); } c_v64_store_unaligned(p, a); } SIMD_INLINE c_v64 c_v64_zero(void) { c_v64 t; t.u64 = 0; return t; } SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { c_v64 t; t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = t.u8[7] = x; return t; } SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { c_v64 t; t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; return t; } SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { c_v64 t; t.u32[0] = t.u32[1] = x; return t; } SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]); return t; } SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]); return t; } SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255); return t; } SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + 
(int16_t)b.s8[c], -128, 127); return t; } SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767); return t; } SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { c_v64 t; t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); return t; } SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]); return t; } SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; return t; } SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) { int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; t.s8[c] = SIMD_CLAMP(d, -128, 127); } return t; } SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]); return t; } SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767); return t; } SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; return t; } SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { c_v64 t; t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); return t; } SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]); return t; } SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]); return t; } SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { c_v64 t; if (mode) { t.u8[7] = a.u8[7]; t.u8[6] = b.u8[7]; t.u8[5] = a.u8[6]; t.u8[4] = b.u8[6]; t.u8[3] = a.u8[5]; t.u8[2] = b.u8[5]; t.u8[1] = a.u8[4]; t.u8[0] = b.u8[4]; } else { t.u8[7] = a.u8[3]; t.u8[6] = b.u8[3]; t.u8[5] = a.u8[2]; t.u8[4] = b.u8[2]; t.u8[3] = a.u8[1]; t.u8[2] = b.u8[1]; t.u8[1] = a.u8[0]; t.u8[0] = b.u8[0]; } return t; } SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); } SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); } SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { c_v64 t; if (mode) { t.u16[3] = a.u16[3]; t.u16[2] = b.u16[3]; t.u16[1] = a.u16[2]; t.u16[0] = b.u16[2]; } else { t.u16[3] = a.u16[1]; t.u16[2] = b.u16[1]; t.u16[1] = a.u16[0]; t.u16[0] = b.u16[0]; } return t; } SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); } SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); } SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { c_v64 t; if (mode) { t.u32[1] = a.u32[1]; t.u32[0] = b.u32[1]; } else { t.u32[1] = a.u32[0]; t.u32[0] = b.u32[0]; } return t; } SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); } SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? 
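/* Lane indexing in this reference code is written against the c_v64 union,
   with CONFIG_BIG_ENDIAN selecting which union element is the "low" half;
   the c_v64_ziplo_*() and c_v64_ziphi_*() wrappers above swap their
   arguments and the hi/lo selector on big-endian targets so the observable
   results stay the same.  A small illustration (values arbitrary), valid on
   either endianness:

     c_v64 t = c_v64_from_32(0x11223344, 0x55667788);
     then c_v64_high_u32(t) == 0x11223344 and c_v64_low_u32(t) == 0x55667788,
     and on a little-endian build t.u64 == 0x1122334455667788.
*/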
_c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); } SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { c_v64 t; if (mode) { t.u8[7] = b.u8[7]; t.u8[6] = b.u8[5]; t.u8[5] = b.u8[3]; t.u8[4] = b.u8[1]; t.u8[3] = a.u8[7]; t.u8[2] = a.u8[5]; t.u8[1] = a.u8[3]; t.u8[0] = a.u8[1]; } else { t.u8[7] = a.u8[6]; t.u8[6] = a.u8[4]; t.u8[5] = a.u8[2]; t.u8[4] = a.u8[0]; t.u8[3] = b.u8[6]; t.u8[2] = b.u8[4]; t.u8[1] = b.u8[2]; t.u8[0] = b.u8[0]; } return t; } SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); } SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); } SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { c_v64 t; if (mode) { t.u16[3] = b.u16[3]; t.u16[2] = b.u16[1]; t.u16[1] = a.u16[3]; t.u16[0] = a.u16[1]; } else { t.u16[3] = a.u16[2]; t.u16[2] = a.u16[0]; t.u16[1] = b.u16[2]; t.u16[0] = b.u16[0]; } return t; } SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) : _c_v64_unzip_16(a, b, 0); } SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0) : _c_v64_unzip_16(b, a, 1); } SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { c_v64 t; int endian = !!CONFIG_BIG_ENDIAN * 4; t.s16[3] = (int16_t)a.u8[3 + endian]; t.s16[2] = (int16_t)a.u8[2 + endian]; t.s16[1] = (int16_t)a.u8[1 + endian]; t.s16[0] = (int16_t)a.u8[0 + endian]; return t; } SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { c_v64 t; int endian = !!CONFIG_BIG_ENDIAN * 4; t.s16[3] = (int16_t)a.u8[7 - endian]; t.s16[2] = (int16_t)a.u8[6 - endian]; t.s16[1] = (int16_t)a.u8[5 - endian]; t.s16[0] = (int16_t)a.u8[4 - endian]; return t; } SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { c_v64 t; int endian = !!CONFIG_BIG_ENDIAN * 4; t.s16[3] = (int16_t)a.s8[3 + endian]; t.s16[2] = (int16_t)a.s8[2 + endian]; t.s16[1] = (int16_t)a.s8[1 + endian]; t.s16[0] = (int16_t)a.s8[0 + endian]; return t; } SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { c_v64 t; int endian = !!CONFIG_BIG_ENDIAN * 4; t.s16[3] = (int16_t)a.s8[7 - endian]; t.s16[2] = (int16_t)a.s8[6 - endian]; t.s16[1] = (int16_t)a.s8[5 - endian]; t.s16[0] = (int16_t)a.s8[4 - endian]; return t; } SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { c_v64 t; if (CONFIG_BIG_ENDIAN) { c_v64 u = a; a = b; b = u; } t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767); t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767); t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767); t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767); return t; } SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { c_v64 t; if (CONFIG_BIG_ENDIAN) { c_v64 u = a; a = b; b = u; } t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535); t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535); t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535); t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535); return t; } SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { c_v64 t; if (CONFIG_BIG_ENDIAN) { c_v64 u = a; a = b; b = u; } t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255); t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255); t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255); t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255); t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255); t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255); t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255); t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255); return t; } SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { c_v64 t; if (CONFIG_BIG_ENDIAN) { c_v64 u = a; a = b; b = u; } t.s8[7] = 
SIMD_CLAMP(a.s16[3], -128, 127); t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127); t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127); t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127); t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127); t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127); t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127); t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127); return t; } SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { c_v64 t; t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; return t; } SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { c_v64 t; t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; return t; } SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { c_v64 t; t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; return t; } SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { c_v64 t; t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; return t; } SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { c_v64 t; int c; for (c = 0; c < 8; c++) { if (SIMD_CHECK && (pattern.u8[c] & ~7)) { fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", pattern.u8[c], c); abort(); } t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; } return t; } SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; } SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); } SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + a.u8[0]; } SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; } typedef struct { uint32_t val; int count; } c_sad64_internal; SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) { c_sad64_internal t; t.val = t.count = 0; return t; } /* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is undefined. */ SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, c_v64 b) { int c; for (c = 0; c < 8; c++) s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; s.count++; if (SIMD_CHECK && s.count > 32) { fprintf(stderr, "Error: sad called 32 times returning an undefined result\n"); abort(); } return s; } SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; } typedef uint32_t c_ssd64_internal; /* Implementation dependent return value. Result must be finalised with * v64_ssd_u8_sum(). 
*/ SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; } SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, c_v64 b) { int c; for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); return s; } SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { c_v64 t; t.u64 = a.u64 | b.u64; return t; } SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { c_v64 t; t.u64 = a.u64 ^ b.u64; return t; } SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { c_v64 t; t.u64 = a.u64 & b.u64; return t; } SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { c_v64 t; t.u64 = a.u64 & ~b.u64; return t; } SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); return t; } SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; return t; } SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { c_v64 t; t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); return t; } SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { c_v64 t; t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; return t; } SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { c_v64 t; int32_t u; u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; t.s16[0] = SIMD_CLAMP(u, -32768, 32767); u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; t.s16[1] = SIMD_CLAMP(u, -32768, 32767); u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; t.s16[2] = SIMD_CLAMP(u, -32768, 32767); u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; t.s16[3] = SIMD_CLAMP(u, -32768, 32767); return t; } SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; return t; } SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; return t; } SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; return t; } SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; return t; } SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; return t; } SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; return t; } SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; return t; } SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; return t; } SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; return t; } SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
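/* The two averaging variants above differ only in rounding: c_v64_avg_u8()
   computes (a + b + 1) >> 1 per lane (ties round up), while c_v64_rdavg_u8()
   computes (a + b) >> 1 (ties round down).  For example, with lane values
   a = 1 and b = 2:

     avg   : (1 + 2 + 1) >> 1 == 2
     rdavg : (1 + 2) >> 1     == 1

   The 16-bit variants c_v64_avg_u16() and c_v64_rdavg_u16() follow the same
   pattern. */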
a.s16[c] : b.s16[c]; return t; } SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); return t; } SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); return t; } SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); return t; } SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); return t; } SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); return t; } SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { c_v64 t; int c; for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); return t; } SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 7) { fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); abort(); } for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n); return t; } SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 7) { fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); abort(); } for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; return t; } SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 7) { fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); abort(); } for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; return t; } SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 15) { fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); abort(); } for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n); return t; } SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 15) { fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); abort(); } for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; return t; } SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { c_v64 t; int c; if (SIMD_CHECK && n > 15) { fprintf(stderr, "Error: undefined s16 shift right %d\n", n); abort(); } for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; return t; } SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { c_v64 t; if (SIMD_CHECK && n > 31) { fprintf(stderr, "Error: undefined u32 shift left %d\n", n); abort(); } t.u32[1] = a.u32[1] << n; t.u32[0] = a.u32[0] << n; return t; } SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { c_v64 t; if (SIMD_CHECK && n > 31) { fprintf(stderr, "Error: undefined u32 shift right %d\n", n); abort(); } t.u32[1] = a.u32[1] >> n; t.u32[0] = a.u32[0] >> n; return t; } SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { c_v64 t; if (SIMD_CHECK && n > 31) { fprintf(stderr, "Error: undefined s32 shift right %d\n", n); abort(); } t.s32[1] = a.s32[1] >> n; t.s32[0] = a.s32[0] >> n; return t; } SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { c_v64 t; t.u64 = x.u64 >> i * 8; return t; } SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { c_v64 t; t.u64 = x.u64 << i * 8; return t; } SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { if (SIMD_CHECK && c > 7) { fprintf(stderr, "Error: undefined alignment %d\n", c); abort(); } return c ? 
c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; } SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { return c_v64_shl_8(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { return c_v64_shr_u8(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { return c_v64_shr_s8(a, c); } SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { return c_v64_shl_16(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { return c_v64_shr_u16(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { return c_v64_shr_s16(a, c); } SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { return c_v64_shl_32(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { return c_v64_shr_u32(a, c); } SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { return c_v64_shr_s32(a, c); } #endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ aom-3.12.1/aom_dsp/simd/v64_intrinsics_x86.h000066400000000000000000000355201477627663500205260ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ #define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ #include #if defined(__SSSE3__) #include #endif #if defined(__SSE4_1__) #include #endif typedef __m128i v64; SIMD_INLINE uint32_t v64_low_u32(v64 a) { return (uint32_t)_mm_cvtsi128_si32(a); } SIMD_INLINE uint32_t v64_high_u32(v64 a) { return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); } SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } SIMD_INLINE int32_t v64_high_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); } SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { return _mm_packs_epi32( _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), _mm_setzero_si128()); } SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y); } SIMD_INLINE v64 v64_from_64(uint64_t x) { #ifdef __x86_64__ return _mm_cvtsi64_si128((int64_t)x); #else return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x); #endif } SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); } SIMD_INLINE uint32_t u32_load_aligned(const void *p) { return *((uint32_t *)p); } SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { return *((uint32_t *)p); } SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { *((uint32_t *)p) = a; } SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { *((uint32_t *)p) = a; } SIMD_INLINE v64 v64_load_aligned(const void *p) { return _mm_loadl_epi64((__m128i *)p); } SIMD_INLINE v64 v64_load_unaligned(const void *p) { return _mm_loadl_epi64((__m128i *)p); } SIMD_INLINE void v64_store_aligned(void *p, v64 a) { _mm_storel_epi64((__m128i *)p, a); } SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { _mm_storel_epi64((__m128i *)p, a); } #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) #define v64_align(a, b, c) \ ((c) ? 
_mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) #else #define v64_align(a, b, c) \ ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ : (b)) #endif SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); } SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); } SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); } SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } SIMD_INLINE v64 v64_abs_s16(v64 a) { #if defined(__SSSE3__) return _mm_abs_epi16(a); #else return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); #endif } SIMD_INLINE v64 v64_abs_s8(v64 a) { #if defined(__SSSE3__) return _mm_abs_epi8(a); #else v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); #endif } SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); } SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); } SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); } SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { __m128i t = _mm_unpacklo_epi64(b, a); return _mm_packs_epi32(t, t); } SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { #if defined(__SSE4_1__) __m128i t = _mm_unpacklo_epi64(b, a); return _mm_packus_epi32(t, t); #else const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535); const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535); const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535); const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535); return v64_from_16(ah, al, bh, bl); #endif } SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { __m128i t = _mm_unpacklo_epi64(b, a); return _mm_packus_epi16(t, t); } SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { __m128i t = _mm_unpacklo_epi64(b, a); return _mm_packs_epi16(t, t); } SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { #if defined(__SSSE3__) return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), v64_from_64(0x0f0d0b0907050301LL)); #else return _mm_packus_epi16( _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), _mm_setzero_si128()); #endif } SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { #if defined(__SSSE3__) return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), v64_from_64(0x0e0c0a0806040200LL)); #else return 
v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); #endif } SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { #if defined(__SSSE3__) return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), v64_from_64(0x0f0e0b0a07060302LL)); #else return _mm_packs_epi32( _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), _mm_setzero_si128()); #endif } SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { #if defined(__SSSE3__) return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), v64_from_64(0x0d0c090805040100LL)); #else return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); #endif } SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return _mm_unpacklo_epi8(a, _mm_setzero_si128()); } SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); } SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); } SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); } SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { return _mm_unpacklo_epi16(a, _mm_setzero_si128()); } SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); } SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); } SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { return _mm_srli_si128( _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); } SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { #if defined(__SSSE3__) return _mm_shuffle_epi8(x, pattern); #else v64 output; unsigned char *input = (unsigned char *)&x; unsigned char *index = (unsigned char *)&pattern; unsigned char *selected = (unsigned char *)&output; int counter; for (counter = 0; counter < 8; counter++) { selected[counter] = input[index[counter]]; } return output; #endif } SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), _mm_unpacklo_epi8(b, _mm_setzero_si128())); t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); return (int32_t)v64_low_u32(t); } SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { __m128i r = _mm_madd_epi16(a, b); #if defined(__SSE4_1__) && defined(__x86_64__) __m128i x = _mm_cvtepi32_epi64(r); return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); #else return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + (int64_t)_mm_cvtsi128_si32(r); #endif } SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); } SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return v64_dotp_s16(a, v64_dup_16(1)); } typedef v64 sad64_internal; SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); } /* Implementation dependent return value. Result must be finalised with v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is undefined. */ SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { return _mm_add_epi64(s, _mm_sad_epu8(a, b)); } SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } typedef v64 ssd64_internal; SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); } /* Implementation dependent return value. Result must be finalised with * v64_ssd_u8_sum(). 
*/ SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); return _mm_add_epi64( s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); } SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { #if defined(__SSE4_1__) return _mm_mullo_epi32(a, b); #else return _mm_unpacklo_epi32( _mm_mul_epu32(a, b), _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); #endif } SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { #if defined(__SSSE3__) return _mm_maddubs_epi16(a, b); #else __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); return _mm_packs_epi32(t, t); #endif } SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return _mm_sub_epi8(_mm_avg_epu8(a, b), _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); } SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return _mm_sub_epi16(_mm_avg_epu16(a, b), _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); } SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { #if defined(__SSE4_1__) return _mm_min_epi8(a, b); #else v64 mask = _mm_cmplt_epi8(a, b); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { #if defined(__SSE4_1__) return _mm_max_epi8(a, b); #else v64 mask = _mm_cmplt_epi8(b, a); return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); #endif } SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)), _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c))); } SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { return _mm_packs_epi16( _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))), a); } SIMD_INLINE v64 
v64_shl_16(v64 a, unsigned int c) { return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c)); } SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c)); } /* These intrinsics require immediate values, so we must use #defines to enforce that. */ #define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) #define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) #define v64_shl_n_8(a, c) \ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c)) #define v64_shr_n_u8(a, c) \ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c)) #define v64_shr_n_s8(a, c) \ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) #define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) #define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) #define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) #define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) #define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) #define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) #endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ aom-3.12.1/aom_dsp/sse.c000066400000000000000000000031201477627663500147550ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* * Sum the square of the difference between every corresponding element of the * buffers. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y, x; int64_t sse = 0; for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { const int32_t diff = abs(a[x] - b[x]); sse += diff * diff; } a += a_stride; b += b_stride; } return sse; } #if CONFIG_AV1_HIGHBITDEPTH int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); sse += diff * diff; } a += a_stride; b += b_stride; } return sse; } #endif aom-3.12.1/aom_dsp/ssim.c000066400000000000000000000420131477627663500151420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/ssim.h" #include "aom_ports/mem.h" void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; *sum_r += r[j]; *sum_sq_s += s[j] * s[j]; *sum_sq_r += r[j] * r[j]; *sum_sxr += s[j] * r[j]; } } } static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { double ssim_n, ssim_d; int64_t c1 = 0, c2 = 0; if (bd == 8) { // scale the constants by number of pixels c1 = (cc1 * count * count) >> 12; c2 = (cc2 * count * count) >> 12; } else if (bd == 10) { c1 = (cc1_10 * count * count) >> 12; c2 = (cc2_10 * count * count) >> 12; } else if (bd == 12) { c1 = (cc1_12 * count * count) >> 12; c2 = (cc2_12 * count * count) >> 12; } else { assert(0); // Return similarity as zero for unsupported bit-depth values. return 0; } ssim_n = (2.0 * sum_s * sum_r + c1) * (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * ((double)count * sum_sq_s - (double)sum_s * sum_s + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
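// As implemented in the loops of aom_ssim2() below, a width x height plane
// contributes ((width - 8) / 4 + 1) * ((height - 8) / 4 + 1) overlapping 8x8
// windows, and the returned score is the mean of the per-window SSIM values.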
double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, int height) { int i, j; int samples = 0; double ssim_total = 0; // sample point start with each 4x4 location for (i = 0; i <= height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); ssim_total += v; samples++; } } ssim_total /= samples; return ssim_total; } #if CONFIG_INTERNAL_STATS void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, double *fast_ssim) { double abc[3]; for (int i = 0; i < 3; ++i) { const int is_uv = i > 0; abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], source->strides[is_uv], dest->strides[is_uv], source->crop_widths[is_uv], source->crop_heights[is_uv]); } *weight = 1; *fast_ssim = abc[0] * .8 + .1 * (abc[1] + abc[2]); } // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity // // Re working out the math -> // // ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / // ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) // // mean(x) = sum(x) / n // // cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) // // var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) // // ssim(x,y) = // (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / // (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * // ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ // (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) // // factoring out n*n // // ssim(x,y) = // (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / // (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * // (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) // // Replace c1 with n*n * c1 for the final step that leads to this code: // The final step scales by 12 bits so we don't lose precision in the constants. static double ssimv_similarity(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. const int64_t c1 = (cc1 * n * n) >> 12; const int64_t c2 = (cc2 * n * n) >> 12; const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); // Since these variables are unsigned sums, convert to double so // math is done in double arithmetic. const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } // The first term of the ssim metric is a luminance factor. // // (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) // // This luminance factor is super sensitive to the dark side of luminance // values and completely insensitive on the white side. check out 2 sets // (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 // 2*250*252/ (250^2+252^2) => .99999997 // // As a result in this tweaked version of the calculation in which the // luminance is taken as percentage off from peak possible. // // 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count // static double ssimv_similarity2(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. const int64_t c1 = (cc1 * n * n) >> 12; const int64_t c2 = (cc2 * n * n) >> 12; const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); // Since these variables are unsigned, sums convert to double so // math is done in double arithmetic. 
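  // As in ssimv_similarity() above, v below is the combined contrast/structure
  // term, (2*cov(x,y) + c2) / (var(x) + var(y) + c2), with every quantity
  // scaled by n*n as worked out in the derivation comment earlier in this file.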
const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, Ssimv *sv) { aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); } double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, int width, int height, Ssimv *sv2, Metrics *m, int do_inconsistency) { double dssim_total = 0; double ssim_total = 0; double ssim2_total = 0; double inconsistency_total = 0; int i, j; int c = 0; double norm; double old_ssim_total = 0; // We can sample points as frequently as we like start with 1 per 4x4. for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; uint32_t var_new; uint32_t var_old; uint32_t mean_new; uint32_t mean_old; double ssim_new; double ssim_old; // Not sure there's a great way to handle the edge pixels // in ssim when using a window. Seems biased against edge pixels // however you handle this. This uses only samples that are // fully in the frame. if (j + 8 <= width && i + 8 <= height) { ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); } ssim = ssimv_similarity(&sv, 64); ssim2 = ssimv_similarity2(&sv, 64); sv.ssim = ssim2; // dssim is calculated to use as an actual error metric and // is scaled up to the same range as sum square error. // Since we are subsampling every 16th point maybe this should be // *16 ? dssim = 255 * 255 * (1 - ssim2) / 2; // Here I introduce a new error metric: consistency-weighted // SSIM-inconsistency. This metric isolates frames where the // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much // sharper or blurrier than the others. Higher values indicate a // temporally inconsistent SSIM. There are two ideas at work: // // 1) 'SSIM-inconsistency': the total inconsistency value // reflects how much SSIM values are changing between this // source / reference frame pair and the previous pair. // // 2) 'consistency-weighted': weights de-emphasize areas in the // frame where the scene content has changed. Changes in scene // content are detected via changes in local variance and local // mean. // // Thus the overall measure reflects how inconsistent the SSIM // values are, over consistent regions of the frame. // // The metric has three terms: // // term 1 -> uses change in scene Variance to weight error score // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) // larger changes from one frame to the next mean we care // less about consistency. // // term 2 -> uses change in local scene luminance to weight error // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) // larger changes from one frame to the next mean we care // less about consistency. // // term3 -> measures inconsistency in ssim scores between frames // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). // // This term compares the ssim score for the same location in 2 // subsequent frames. var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; mean_new = sv.sum_s; mean_old = sv2[c].sum_s; ssim_new = sv.ssim; ssim_old = sv2[c].ssim; if (do_inconsistency) { // We do the metric once for every 4x4 block in the image. 
Since // we are scaling the error to SSE for use in a psnr calculation // 1.0 = 4x4x255x255 the worst error we can possibly have. static const double kScaling = 4. * 4 * 255 * 255; // The constants have to be non 0 to avoid potential divide by 0 // issues other than that they affect kind of a weighting between // the terms. No testing of what the right terms should be has been // done. static const double c1 = 1, c2 = 1, c3 = 1; // This measures how much consistent variance is in two consecutive // source frames. 1.0 means they have exactly the same variance. const double variance_term = (2.0 * var_old * var_new + c1) / (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); // This measures how consistent the local mean are between two // consecutive frames. 1.0 means they have exactly the same mean. const double mean_term = (2.0 * mean_old * mean_new + c2) / (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); // This measures how consistent the ssims of two // consecutive frames is. 1.0 means they are exactly the same. double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) / (ssim_old * ssim_old + ssim_new * ssim_new + c3), 5); double this_inconsistency; // Floating point math sometimes makes this > 1 by a tiny bit. // We want the metric to scale between 0 and 1.0 so we can convert // it to an snr scaled value. if (ssim_term > 1) ssim_term = 1; // This converts the consistency metric to an inconsistency metric // ( so we can scale it like psnr to something like sum square error. // The reason for the variance and mean terms is the assumption that // if there are big changes in the source we shouldn't penalize // inconsistency in ssim scores a bit less as it will be less visible // to the user. this_inconsistency = (1 - ssim_term) * variance_term * mean_term; this_inconsistency *= kScaling; inconsistency_total += this_inconsistency; } sv2[c] = sv; ssim_total += ssim; ssim2_total += ssim2; dssim_total += dssim; old_ssim_total += ssim_old; } old_ssim_total += 0; } norm = 1. 
/ (width / 4) / (height / 4); ssim_total *= norm; ssim2_total *= norm; m->ssim2 = ssim2_total; m->ssim = ssim_total; if (old_ssim_total == 0) inconsistency_total = 0; m->ssimc = inconsistency_total; m->dssim = dssim_total; return inconsistency_total; } #endif // CONFIG_INTERNAL_STATS #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; *sum_r += r[j]; *sum_sq_s += s[j] * s[j]; *sum_sq_r += r[j] * r[j]; *sum_sxr += s[j] * r[j]; } } } static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t bd, uint32_t shift) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); } double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, int height, uint32_t bd, uint32_t shift) { int i, j; int samples = 0; double ssim_total = 0; // sample point start with each 4x4 location for (i = 0; i <= height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, shift); ssim_total += v; samples++; } } ssim_total /= samples; return ssim_total; } #if CONFIG_INTERNAL_STATS void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim) { assert(bd >= in_bd); uint32_t shift = bd - in_bd; double abc[3]; for (int i = 0; i < 3; ++i) { const int is_uv = i > 0; abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], source->strides[is_uv], dest->strides[is_uv], source->crop_widths[is_uv], source->crop_heights[is_uv], in_bd, shift); } weight[0] = 1; fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]); if (bd > in_bd) { // Compute SSIM based on stream bit depth shift = 0; for (int i = 0; i < 3; ++i) { const int is_uv = i > 0; abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], source->strides[is_uv], dest->strides[is_uv], source->crop_widths[is_uv], source->crop_heights[is_uv], bd, shift); } weight[1] = 1; fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]); } } #endif // CONFIG_INTERNAL_STATS #endif // CONFIG_AV1_HIGHBITDEPTH #if CONFIG_INTERNAL_STATS void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, const uint32_t in_bit_depth, int is_hbd, double *weight, double *frame_ssim2) { #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, frame_ssim2); return; } #else (void)bit_depth; (void)in_bit_depth; (void)is_hbd; #endif // CONFIG_AV1_HIGHBITDEPTH aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2); } #endif // CONFIG_INTERNAL_STATS aom-3.12.1/aom_dsp/ssim.h000066400000000000000000000063201477627663500151500ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_SSIM_H_ #define AOM_AOM_DSP_SSIM_H_ #ifdef __cplusplus extern "C" { #endif #include "config/aom_config.h" #if CONFIG_INTERNAL_STATS #include "aom_scale/yv12config.h" // metrics used for calculating ssim, ssim2, dssim, and ssimc typedef struct { // source sum ( over 8x8 region ) uint32_t sum_s; // reference sum (over 8x8 region ) uint32_t sum_r; // source sum squared ( over 8x8 region ) uint32_t sum_sq_s; // reference sum squared (over 8x8 region ) uint32_t sum_sq_r; // sum of source times reference (over 8x8 region) uint32_t sum_sxr; // calculated ssim score between source and reference double ssim; } Ssimv; // metrics collected on a frame basis typedef struct { // ssim consistency error metric ( see code for explanation ) double ssimc; // standard ssim double ssim; // revised ssim ( see code for explanation) double ssim2; // ssim restated as an error metric like sse double dssim; // dssim converted to decibels double dssimd; // ssimc converted to decibels double ssimcd; } Metrics; double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, int width, int height, Ssimv *sv2, Metrics *m, int do_inconsistency); void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, double *fast_ssim); double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, uint32_t bd, uint32_t in_bd); #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim); #endif // CONFIG_AV1_HIGHBITDEPTH void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, const uint32_t in_bit_depth, int is_hbd, double *weight, double *frame_ssim2); #endif // CONFIG_INTERNAL_STATS double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, int height); #if CONFIG_AV1_HIGHBITDEPTH double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, int height, uint32_t bd, uint32_t shift); #endif // CONFIG_AV1_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_SSIM_H_ aom-3.12.1/aom_dsp/subtract.c000066400000000000000000000032601477627663500160170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" void aom_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; src += src_stride; } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { int r, c; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { diff[c] = src[c] - pred[c]; } diff += diff_stride; pred += pred_stride; src += src_stride; } } #endif aom-3.12.1/aom_dsp/sum_squares.c000066400000000000000000000043371477627663500165450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width, int height) { int r, c; uint64_t ss = 0; for (r = 0; r < height; r++) { for (c = 0; c < width; c++) { const int16_t v = src[c]; ss += v * v; } src += src_stride; } return ss; } uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) { uint64_t ss = 0; do { const int16_t v = *src++; ss += v * v; } while (--n); return ss; } uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) { int r, c; uint64_t ss = 0, s = 0; for (r = 0; r < height; r++) { for (c = 0; c < width; c++) { const uint8_t v = src[c]; ss += v * v; s += v; } src += src_stride; } return (ss - s * s / (width * height)); } #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) { uint16_t *srcp = CONVERT_TO_SHORTPTR(src); int r, c; uint64_t ss = 0, s = 0; for (r = 0; r < height; r++) { for (c = 0; c < width; c++) { const uint16_t v = srcp[c]; ss += v * v; s += v; } srcp += src_stride; } return (ss - s * s / (width * height)); } #endif // CONFIG_AV1_HIGHBITDEPTH uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width, int height, int *sum) { int r, c; int16_t *srcp = (int16_t *)src; int64_t ss = 0; for (r = 0; r < height; r++) { for (c = 0; c < width; c++) { const int16_t v = srcp[c]; ss += v * v; *sum += v; } srcp += src_stride; } return ss; } aom-3.12.1/aom_dsp/txfm_common.h000066400000000000000000000136201477627663500165240ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_TXFM_COMMON_H_ #define AOM_AOM_DSP_TXFM_COMMON_H_ #include "aom_dsp/aom_dsp_common.h" // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) #define UNIT_QUANT_SHIFT 2 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) // block transform size enum { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform TX_32X32, // 32x32 transform TX_64X64, // 64x64 transform TX_4X8, // 4x8 transform TX_8X4, // 8x4 transform TX_8X16, // 8x16 transform TX_16X8, // 16x8 transform TX_16X32, // 16x32 transform TX_32X16, // 32x16 transform TX_32X64, // 32x64 transform TX_64X32, // 64x32 transform TX_4X16, // 4x16 transform TX_16X4, // 16x4 transform TX_8X32, // 8x32 transform TX_32X8, // 32x8 transform TX_16X64, // 16x64 transform TX_64X16, // 64x16 transform TX_SIZES_ALL, // Includes rectangular transforms TX_SIZES = TX_4X8, // Does NOT include rectangular transforms TX_SIZES_LARGEST = TX_64X64, TX_INVALID = 255 // Invalid transform size } UENUM1BYTE(TX_SIZE); enum { DCT_DCT, // DCT in both horizontal and vertical ADST_DCT, // ADST in vertical, DCT in horizontal DCT_ADST, // DCT in vertical, ADST in horizontal ADST_ADST, // ADST in both directions FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal FLIPADST_FLIPADST, // FLIPADST in both directions ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal IDTX, // Identity in both directions V_DCT, // DCT in vertical, identity in horizontal H_DCT, // Identity in vertical, DCT in horizontal V_ADST, // ADST in vertical, identity in horizontal H_ADST, // Identity in vertical, ADST in horizontal V_FLIPADST, // FLIPADST in vertical, identity in horizontal H_FLIPADST, // Identity in vertical, FLIPADST in horizontal TX_TYPES, DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction TX_TYPE_INVALID = 255, // Invalid transform type } UENUM1BYTE(TX_TYPE); enum { // DCT only EXT_TX_SET_DCTONLY, // DCT + Identity only EXT_TX_SET_DCT_IDTX, // Discrete Trig transforms w/o flip (4) + Identity (1) EXT_TX_SET_DTT4_IDTX, // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) EXT_TX_SET_DTT4_IDTX_1DDCT, // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2) EXT_TX_SET_DTT9_IDTX_1DDCT, // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) EXT_TX_SET_ALL16, EXT_TX_SET_TYPES } UENUM1BYTE(TxSetType); typedef struct txfm_param { // for both forward and inverse transforms TX_TYPE tx_type; TX_SIZE tx_size; int lossless; int bd; // are the pixel buffers octets or shorts? This should collapse to // bd==8 implies !is_hbd, but that's not certain right now. 
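  // (For reference: the high bitdepth code paths elsewhere in this library
  // access such buffers as uint16_t via CONVERT_TO_SHORTPTR.)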
int is_hbd; TxSetType tx_set_type; // for inverse transforms only int eob; } TxfmParam; // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) static const tran_high_t cospi_1_64 = 16364; static const tran_high_t cospi_2_64 = 16305; static const tran_high_t cospi_3_64 = 16207; static const tran_high_t cospi_4_64 = 16069; static const tran_high_t cospi_5_64 = 15893; static const tran_high_t cospi_6_64 = 15679; static const tran_high_t cospi_7_64 = 15426; static const tran_high_t cospi_8_64 = 15137; static const tran_high_t cospi_9_64 = 14811; static const tran_high_t cospi_10_64 = 14449; static const tran_high_t cospi_11_64 = 14053; static const tran_high_t cospi_12_64 = 13623; static const tran_high_t cospi_13_64 = 13160; static const tran_high_t cospi_14_64 = 12665; static const tran_high_t cospi_15_64 = 12140; static const tran_high_t cospi_16_64 = 11585; static const tran_high_t cospi_17_64 = 11003; static const tran_high_t cospi_18_64 = 10394; static const tran_high_t cospi_19_64 = 9760; static const tran_high_t cospi_20_64 = 9102; static const tran_high_t cospi_21_64 = 8423; static const tran_high_t cospi_22_64 = 7723; static const tran_high_t cospi_23_64 = 7005; static const tran_high_t cospi_24_64 = 6270; static const tran_high_t cospi_25_64 = 5520; static const tran_high_t cospi_26_64 = 4756; static const tran_high_t cospi_27_64 = 3981; static const tran_high_t cospi_28_64 = 3196; static const tran_high_t cospi_29_64 = 2404; static const tran_high_t cospi_30_64 = 1606; static const tran_high_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 static const tran_high_t sinpi_1_9 = 5283; static const tran_high_t sinpi_2_9 = 9929; static const tran_high_t sinpi_3_9 = 13377; static const tran_high_t sinpi_4_9 = 15212; // 16384 * sqrt(2) static const tran_high_t Sqrt2 = 23170; static const tran_high_t InvSqrt2 = 11585; static inline tran_high_t fdct_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); return rv; } #endif // AOM_AOM_DSP_TXFM_COMMON_H_ aom-3.12.1/aom_dsp/variance.c000066400000000000000000001417641477627663500157740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/blend.h" #include "aom_dsp/variance.h" #include "av1/common/filter.h" #include "av1/common/reconinter.h" #if !CONFIG_REALTIME_ONLY uint32_t aom_get_mb_ss_c(const int16_t *a) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { sum += a[i] * a[i]; } return sum; } #endif // !CONFIG_REALTIME_ONLY static void variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, uint32_t *sse, int *sum) { int i, j; int tsum = 0; uint32_t tsse = 0; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int diff = a[j] - b[j]; tsum += diff; tsse += diff * diff; } a += a_stride; b += b_stride; } *sum = tsum; *sse = tsse; } uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h) { uint32_t sse; int sum; variance(a, a_stride, b, b_stride, w, h, &sse, &sum); return sse; } // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement // the first-pass of 2-D separable filter. // // Produces int16_t output to retain precision for the next pass. Two filter // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. static void var_filter_block2d_bil_first_pass_c( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { b[j] = ROUND_POWER_OF_TWO( (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } a += src_pixels_per_line - output_width; b += output_width; } } // Applies a 1-D 2-tap bilinear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement // the second-pass of 2-D separable filter. // // Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two // filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. 
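// Worked example (illustrative values, assuming FILTER_BITS == 7 so the two
// taps sum to 128): for taps {96, 32} and inputs a[0] = 100,
// a[pixel_step] = 104, ROUND_POWER_OF_TWO(100 * 96 + 104 * 32, 7)
// = (12928 + 64) >> 7 = 101, i.e. linear interpolation one quarter of the way
// toward the second sample.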
static void var_filter_block2d_bil_second_pass_c( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { b[j] = ROUND_POWER_OF_TWO( (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } a += src_pixels_per_line - output_width; b += output_width; } } #define VAR(W, H) \ uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ uint32_t *sse) { \ int sum; \ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } #define SUBPIX_VAR(W, H) \ uint32_t aom_sub_pixel_variance##W##x##H##_c( \ const uint8_t *a, int a_stride, int xoffset, int yoffset, \ const uint8_t *b, int b_stride, uint32_t *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ \ var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ bilinear_filters_2t[xoffset]); \ var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ const uint8_t *a, int a_stride, int xoffset, int yoffset, \ const uint8_t *b, int b_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ bilinear_filters_2t[xoffset]); \ var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ } void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { // Loop over 4 8x8 blocks. Process one 8x32 block. for (int k = 0; k < 4; k++) { variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k], &sum8x8[k]); } // Calculate variance at 8x8 level and total sse, sum of 8x32 block. *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; for (int i = 0; i < 4; i++) var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); } void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { int sum16x16[2] = { 0 }; // Loop over two consecutive 16x16 blocks and process as one 16x32 block. for (int k = 0; k < 2; k++) { variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride, 16, 16, &sse16x16[k], &sum16x16[k]); } // Calculate variance at 16x16 level and total sse, sum of 16x32 block. *tot_sse += sse16x16[0] + sse16x16[1]; *tot_sum += sum16x16[0] + sum16x16[1]; for (int i = 0; i < 2; i++) var16x16[i] = sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. 
*/ #define MSE(W, H) \ uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ uint32_t *sse) { \ int sum; \ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse; \ } /* All three forms of the variance are available in the same sizes. */ #define VARIANCES(W, H) \ VAR(W, H) \ SUBPIX_VAR(W, H) \ SUBPIX_AVG_VAR(W, H) VARIANCES(128, 128) VARIANCES(128, 64) VARIANCES(64, 128) VARIANCES(64, 64) VARIANCES(64, 32) VARIANCES(32, 64) VARIANCES(32, 32) VARIANCES(32, 16) VARIANCES(16, 32) VARIANCES(16, 16) VARIANCES(16, 8) VARIANCES(8, 16) VARIANCES(8, 8) VARIANCES(8, 4) VARIANCES(4, 8) VARIANCES(4, 4) // Realtime mode doesn't use rectangular blocks. #if !CONFIG_REALTIME_ONLY VARIANCES(4, 16) VARIANCES(16, 4) VARIANCES(8, 32) VARIANCES(32, 8) VARIANCES(16, 64) VARIANCES(64, 16) #endif MSE(16, 16) MSE(16, 8) MSE(8, 16) MSE(8, 8) void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int i, j; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } comp_pred += width; pred += width; ref += ref_stride; } } #if CONFIG_AV1_HIGHBITDEPTH static void highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint64_t *sse, int64_t *sum) { const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); int64_t tsum = 0; uint64_t tsse = 0; for (int i = 0; i < h; ++i) { int32_t lsum = 0; for (int j = 0; j < w; ++j) { const int diff = a[j] - b[j]; lsum += diff; tsse += (uint32_t)(diff * diff); } tsum += lsum; a += a_stride; b += b_stride; } *sum = tsum; *sse = tsse; } uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h) { uint64_t sse; int64_t sum; highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); return sse; } static void highbd_8_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); *sse = (uint32_t)sse_long; *sum = (int)sum_long; } static void highbd_10_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } static void highbd_12_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } #define HIGHBD_VAR(W, H) \ uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ uint32_t *sse) { \ int sum; \ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } \ \ uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ uint32_t *sse) { \ int sum; \ int64_t var; \ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W 
* H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ uint32_t *sse) { \ int sum; \ int64_t var; \ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } #define HIGHBD_MSE(W, H) \ uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ uint32_t *sse) { \ int sum; \ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } \ \ uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ uint32_t *sse) { \ int sum; \ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } \ \ uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ uint32_t *sse) { \ int sum; \ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } void aom_highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { output_ptr[j] = ROUND_POWER_OF_TWO( (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], FILTER_BITS); ++src_ptr; } // Next row... src_ptr += src_pixels_per_line - output_width; output_ptr += output_width; } } void aom_highbd_var_filter_block2d_bil_second_pass( const uint16_t *src_ptr, uint16_t *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { output_ptr[j] = ROUND_POWER_OF_TWO( (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], FILTER_BITS); ++src_ptr; } src_ptr += src_pixels_per_line - output_width; output_ptr += output_width; } } #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ dst, dst_stride, sse); \ } \ \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ dst, dst_stride, sse); \ } \ \ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int 
yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ dst, dst_stride, sse); \ } #define HIGHBD_SUBPIX_AVG_VAR(W, H) \ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ } \ \ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ } \ \ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ } \ \ /* All three forms of the variance are available in the same sizes. */ #define HIGHBD_VARIANCES(W, H) \ HIGHBD_VAR(W, H) \ HIGHBD_SUBPIX_VAR(W, H) \ HIGHBD_SUBPIX_AVG_VAR(W, H) HIGHBD_VARIANCES(128, 128) HIGHBD_VARIANCES(128, 64) HIGHBD_VARIANCES(64, 128) HIGHBD_VARIANCES(64, 64) HIGHBD_VARIANCES(64, 32) HIGHBD_VARIANCES(32, 64) HIGHBD_VARIANCES(32, 32) HIGHBD_VARIANCES(32, 16) HIGHBD_VARIANCES(16, 32) HIGHBD_VARIANCES(16, 16) HIGHBD_VARIANCES(16, 8) HIGHBD_VARIANCES(8, 16) HIGHBD_VARIANCES(8, 8) HIGHBD_VARIANCES(8, 4) HIGHBD_VARIANCES(4, 8) HIGHBD_VARIANCES(4, 4) // Realtime mode doesn't use 4x rectangular blocks. 
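// The high-bitdepth 1:4/4:1 sizes below (4x16 through 64x16) are therefore only instantiated when the non-realtime partition search is compiled in, mirroring the low-bitdepth block above.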
#if !CONFIG_REALTIME_ONLY HIGHBD_VARIANCES(4, 16) HIGHBD_VARIANCES(16, 4) HIGHBD_VARIANCES(8, 32) HIGHBD_VARIANCES(32, 8) HIGHBD_VARIANCES(16, 64) HIGHBD_VARIANCES(64, 16) #endif HIGHBD_MSE(16, 16) HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } comp_pred += width; pred += width; ref += ref_stride; } } #endif // CONFIG_AV1_HIGHBITDEPTH void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i, j; const uint8_t *src0 = invert_mask ? pred : ref; const uint8_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? ref_stride : width; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); } comp_pred += width; src0 += stride0; src1 += stride1; mask += mask_stride; } } #define MASK_SUBPIX_VAR(W, H) \ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \ bilinear_filters_2t[xoffset]); \ var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ invert_mask); \ return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } MASK_SUBPIX_VAR(4, 4) MASK_SUBPIX_VAR(4, 8) MASK_SUBPIX_VAR(8, 4) MASK_SUBPIX_VAR(8, 8) MASK_SUBPIX_VAR(8, 16) MASK_SUBPIX_VAR(16, 8) MASK_SUBPIX_VAR(16, 16) MASK_SUBPIX_VAR(16, 32) MASK_SUBPIX_VAR(32, 16) MASK_SUBPIX_VAR(32, 32) MASK_SUBPIX_VAR(32, 64) MASK_SUBPIX_VAR(64, 32) MASK_SUBPIX_VAR(64, 64) MASK_SUBPIX_VAR(64, 128) MASK_SUBPIX_VAR(128, 64) MASK_SUBPIX_VAR(128, 128) // Realtime mode doesn't use 4x rectangular blocks. 
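// Likewise, the masked sub-pixel variance variants for the 1:4/4:1 sizes are only compiled when the non-realtime partition search, which is what uses these shapes, is enabled.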
#if !CONFIG_REALTIME_ONLY MASK_SUBPIX_VAR(4, 16) MASK_SUBPIX_VAR(16, 4) MASK_SUBPIX_VAR(8, 32) MASK_SUBPIX_VAR(32, 8) MASK_SUBPIX_VAR(16, 64) MASK_SUBPIX_VAR(64, 16) #endif #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { if (!invert_mask) comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); else comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); } comp_pred += width; pred += width; ref += ref_stride; mask += mask_stride; } } #define HIGHBD_MASK_SUBPIX_VAR(W, H) \ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ ref, ref_stride, sse); \ } \ \ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ ref, ref_stride, sse); \ } \ \ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ ref, ref_stride, sse); \ } HIGHBD_MASK_SUBPIX_VAR(4, 4) HIGHBD_MASK_SUBPIX_VAR(4, 8) HIGHBD_MASK_SUBPIX_VAR(8, 4) 
HIGHBD_MASK_SUBPIX_VAR(8, 8) HIGHBD_MASK_SUBPIX_VAR(8, 16) HIGHBD_MASK_SUBPIX_VAR(16, 8) HIGHBD_MASK_SUBPIX_VAR(16, 16) HIGHBD_MASK_SUBPIX_VAR(16, 32) HIGHBD_MASK_SUBPIX_VAR(32, 16) HIGHBD_MASK_SUBPIX_VAR(32, 32) HIGHBD_MASK_SUBPIX_VAR(32, 64) HIGHBD_MASK_SUBPIX_VAR(64, 32) HIGHBD_MASK_SUBPIX_VAR(64, 64) HIGHBD_MASK_SUBPIX_VAR(64, 128) HIGHBD_MASK_SUBPIX_VAR(128, 64) HIGHBD_MASK_SUBPIX_VAR(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_MASK_SUBPIX_VAR(4, 16) HIGHBD_MASK_SUBPIX_VAR(16, 4) HIGHBD_MASK_SUBPIX_VAR(8, 32) HIGHBD_MASK_SUBPIX_VAR(32, 8) HIGHBD_MASK_SUBPIX_VAR(16, 64) HIGHBD_MASK_SUBPIX_VAR(64, 16) #endif #endif // CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY static inline void obmc_variance(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int i, j; unsigned int tsse = 0; int tsum = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); tsum += diff; tsse += diff * diff; } pre += pre_stride; wsrc += w; mask += w; } *sse = tsse; *sum = tsum; } #define OBMC_VAR(W, H) \ unsigned int aom_obmc_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } #define OBMC_SUBPIX_VAR(W, H) \ unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ \ var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \ bilinear_filters_2t[xoffset]); \ var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ } OBMC_VAR(4, 4) OBMC_SUBPIX_VAR(4, 4) OBMC_VAR(4, 8) OBMC_SUBPIX_VAR(4, 8) OBMC_VAR(8, 4) OBMC_SUBPIX_VAR(8, 4) OBMC_VAR(8, 8) OBMC_SUBPIX_VAR(8, 8) OBMC_VAR(8, 16) OBMC_SUBPIX_VAR(8, 16) OBMC_VAR(16, 8) OBMC_SUBPIX_VAR(16, 8) OBMC_VAR(16, 16) OBMC_SUBPIX_VAR(16, 16) OBMC_VAR(16, 32) OBMC_SUBPIX_VAR(16, 32) OBMC_VAR(32, 16) OBMC_SUBPIX_VAR(32, 16) OBMC_VAR(32, 32) OBMC_SUBPIX_VAR(32, 32) OBMC_VAR(32, 64) OBMC_SUBPIX_VAR(32, 64) OBMC_VAR(64, 32) OBMC_SUBPIX_VAR(64, 32) OBMC_VAR(64, 64) OBMC_SUBPIX_VAR(64, 64) OBMC_VAR(64, 128) OBMC_SUBPIX_VAR(64, 128) OBMC_VAR(128, 64) OBMC_SUBPIX_VAR(128, 64) OBMC_VAR(128, 128) OBMC_SUBPIX_VAR(128, 128) OBMC_VAR(4, 16) OBMC_SUBPIX_VAR(4, 16) OBMC_VAR(16, 4) OBMC_SUBPIX_VAR(16, 4) OBMC_VAR(8, 32) OBMC_SUBPIX_VAR(8, 32) OBMC_VAR(32, 8) OBMC_SUBPIX_VAR(32, 8) OBMC_VAR(16, 64) OBMC_SUBPIX_VAR(16, 64) OBMC_VAR(64, 16) OBMC_SUBPIX_VAR(64, 16) #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, uint64_t *sse, int64_t *sum) { int i, j; uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); uint64_t tsse = 0; int64_t tsum = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); tsum += diff; tsse += diff * diff; } pre += pre_stride; wsrc += w; mask += w; } *sse = tsse; *sum = tsum; } static inline void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t 
sum64; uint64_t sse64; highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); *sum = (int)sum64; *sse = (unsigned int)sse64; } static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } #define HIGHBD_OBMC_VAR(W, H) \ unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } \ \ unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t var; \ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ \ unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t var; \ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } #define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, wsrc, mask, sse); \ } \ \ unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, wsrc, mask, sse); \ } \ \ unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint16_t temp2[H * W]; \ \ aom_highbd_var_filter_block2d_bil_first_pass( \ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, wsrc, mask, sse); \ } HIGHBD_OBMC_VAR(4, 4) HIGHBD_OBMC_SUBPIX_VAR(4, 4) HIGHBD_OBMC_VAR(4, 8) HIGHBD_OBMC_SUBPIX_VAR(4, 8) HIGHBD_OBMC_VAR(8, 4) HIGHBD_OBMC_SUBPIX_VAR(8, 4) HIGHBD_OBMC_VAR(8, 8) HIGHBD_OBMC_SUBPIX_VAR(8, 8) HIGHBD_OBMC_VAR(8, 16) HIGHBD_OBMC_SUBPIX_VAR(8, 16) HIGHBD_OBMC_VAR(16, 8) HIGHBD_OBMC_SUBPIX_VAR(16, 8) HIGHBD_OBMC_VAR(16, 16) HIGHBD_OBMC_SUBPIX_VAR(16, 16) HIGHBD_OBMC_VAR(16, 32) HIGHBD_OBMC_SUBPIX_VAR(16, 32) HIGHBD_OBMC_VAR(32, 16) HIGHBD_OBMC_SUBPIX_VAR(32, 16) HIGHBD_OBMC_VAR(32, 32) HIGHBD_OBMC_SUBPIX_VAR(32, 32) HIGHBD_OBMC_VAR(32, 64) HIGHBD_OBMC_SUBPIX_VAR(32, 64) HIGHBD_OBMC_VAR(64, 32) HIGHBD_OBMC_SUBPIX_VAR(64, 32) HIGHBD_OBMC_VAR(64, 64) HIGHBD_OBMC_SUBPIX_VAR(64, 64) HIGHBD_OBMC_VAR(64, 128) HIGHBD_OBMC_SUBPIX_VAR(64, 128) HIGHBD_OBMC_VAR(128, 64) HIGHBD_OBMC_SUBPIX_VAR(128, 64) HIGHBD_OBMC_VAR(128, 128) HIGHBD_OBMC_SUBPIX_VAR(128, 128) HIGHBD_OBMC_VAR(4, 16) HIGHBD_OBMC_SUBPIX_VAR(4, 16) HIGHBD_OBMC_VAR(16, 4) HIGHBD_OBMC_SUBPIX_VAR(16, 4) HIGHBD_OBMC_VAR(8, 32) HIGHBD_OBMC_SUBPIX_VAR(8, 32) HIGHBD_OBMC_VAR(32, 8) HIGHBD_OBMC_SUBPIX_VAR(32, 8) HIGHBD_OBMC_VAR(16, 64) HIGHBD_OBMC_SUBPIX_VAR(16, 64) HIGHBD_OBMC_VAR(64, 16) HIGHBD_OBMC_SUBPIX_VAR(64, 16) #endif // CONFIG_AV1_HIGHBITDEPTH #endif // !CONFIG_REALTIME_ONLY uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { uint64_t sum = 0; for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j]; sum += e * e; } } return sum; } uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w, int h) { uint16_t *src_temp = src; uint8_t *dst_temp = dst; const int num_blks = 16 / w; int64_t sum = 0; for (int i 
= 0; i < num_blks; i++) { sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h); dst_temp += w; src_temp += (w * h); } return sum; } #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { uint64_t sum = 0; for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { int e = dst[i * dstride + j] - src[i * sstride + j]; sum += e * e; } } return sum; } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/variance.h000066400000000000000000000124761477627663500157760ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_VARIANCE_H_ #define AOM_AOM_DSP_VARIANCE_H_ #include "config/aom_config.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif #define FILTER_BITS 7 #define FILTER_WEIGHT 128 typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *second_pred); typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, int b_stride, int n); typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, const uint8_t *const b_array[], int b_stride, unsigned int *sad_array); typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse); typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse); typedef unsigned int (*aom_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred); typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)( const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask); typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk); typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk, unsigned int *sse); typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( const uint8_t *pred, int pred_stride, int xoffset, int yoffset, const int32_t *wsrc, 
const int32_t *msk, unsigned int *sse); typedef struct aom_variance_vtable { aom_sad_fn_t sdf; // Same as normal sad, but downsample the rows by a factor of 2. aom_sad_fn_t sdsf; aom_sad_avg_fn_t sdaf; aom_variance_fn_t vf; aom_subpixvariance_fn_t svf; aom_subp_avg_variance_fn_t svaf; aom_sad_multi_d_fn_t sdx4df; aom_sad_multi_d_fn_t sdx3df; // Same as sadx4, but downsample the rows by a factor of 2. aom_sad_multi_d_fn_t sdsx4df; aom_masked_sad_fn_t msdf; aom_masked_subpixvariance_fn_t msvf; aom_obmc_sad_fn_t osdf; aom_obmc_variance_fn_t ovf; aom_obmc_subpixvariance_fn_t osvf; } aom_variance_fn_ptr_t; void aom_highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); void aom_highbd_var_filter_block2d_bil_second_pass( const uint16_t *src_ptr, uint16_t *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h); uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_VARIANCE_H_ aom-3.12.1/aom_dsp/vmaf.c000066400000000000000000000145601477627663500151260ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/vmaf.h" #include #include #include #include #ifdef _WIN32 #include #else #include #endif #include "aom_dsp/blend.h" static void vmaf_fatal_error(const char *message) { fprintf(stderr, "Fatal error: %s\n", message); exit(EXIT_FAILURE); } void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) { if (*vmaf_model != NULL) return; VmafModelConfig model_cfg; model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP; model_cfg.name = "vmaf"; if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) { vmaf_fatal_error("Failed to load VMAF model."); } } void aom_close_vmaf_model(VmafModel *vmaf_model) { vmaf_model_destroy(vmaf_model); } static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src, VmafPicture *dst) { const int width = src->y_width; const int height = src->y_height; if (bit_depth > 8) { uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src->y_buffer); uint16_t *dst_ptr = dst->data[0]; for (int row = 0; row < height; ++row) { memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0])); src_ptr += src->y_stride; dst_ptr += dst->stride[0] / 2; } } else { uint8_t *src_ptr = src->y_buffer; uint8_t *dst_ptr = (uint8_t *)dst->data[0]; for (int row = 0; row < height; ++row) { memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0])); src_ptr += src->y_stride; dst_ptr += dst->stride[0]; } } } void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, bool cal_vmaf_neg) { // TODO(sdeng): make them CLI arguments. 
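// For now the context uses a fixed configuration: logging disabled, with the thread-count, subsampling and cpumask fields simply left at zero.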
VmafConfiguration cfg; cfg.log_level = VMAF_LOG_LEVEL_NONE; cfg.n_threads = 0; cfg.n_subsample = 0; cfg.cpumask = 0; if (vmaf_init(vmaf_context, cfg)) { vmaf_fatal_error("Failed to init VMAF context."); } if (cal_vmaf_neg) { VmafFeatureDictionary *vif_feature = NULL; if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", "1.0")) { vmaf_fatal_error("Failed to set vif_enhn_gain_limit."); } if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) { vmaf_fatal_error("Failed to use feature float_vif."); } VmafFeatureDictionary *adm_feature = NULL; if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", "1.0")) { vmaf_fatal_error("Failed to set adm_enhn_gain_limit."); } if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) { vmaf_fatal_error("Failed to use feature float_adm."); } } VmafFeatureDictionary *motion_force_zero = NULL; if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", "1")) { vmaf_fatal_error("Failed to set motion_force_zero."); } if (vmaf_model_feature_overload(vmaf_model, "float_motion", motion_force_zero)) { vmaf_fatal_error("Failed to use feature float_motion."); } if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { vmaf_fatal_error("Failed to load feature extractors from VMAF model."); } } void aom_close_vmaf_context(VmafContext *vmaf_context) { if (vmaf_close(vmaf_context)) { vmaf_fatal_error("Failed to close VMAF context."); } } void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, bool cal_vmaf_neg, double *vmaf) { VmafContext *vmaf_context; aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg); const int frame_index = 0; VmafPicture ref, dist; if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height) || vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height)) { vmaf_fatal_error("Failed to alloc VMAF pictures."); } copy_picture(bit_depth, source, &ref); copy_picture(bit_depth, distorted, &dist); if (vmaf_read_pictures(vmaf_context, &ref, &dist, /*picture index=*/frame_index)) { vmaf_fatal_error("Failed to read VMAF pictures."); } if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { vmaf_fatal_error("Failed to flush context."); } vmaf_picture_unref(&ref); vmaf_picture_unref(&dist); vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index); aom_close_vmaf_context(vmaf_context); } void aom_read_vmaf_image(VmafContext *vmaf_context, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, int frame_index) { VmafPicture ref, dist; if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height) || vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height)) { vmaf_fatal_error("Failed to alloc VMAF pictures."); } copy_picture(bit_depth, source, &ref); copy_picture(bit_depth, distorted, &dist); if (vmaf_read_pictures(vmaf_context, &ref, &dist, /*picture index=*/frame_index)) { vmaf_fatal_error("Failed to read VMAF pictures."); } vmaf_picture_unref(&ref); vmaf_picture_unref(&dist); } double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, int frame_index) { double vmaf; if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) { vmaf_fatal_error("Failed to calc VMAF scores."); } return vmaf; } void aom_flush_vmaf_context(VmafContext *vmaf_context) { if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { 
vmaf_fatal_error("Failed to flush context."); } } aom-3.12.1/aom_dsp/vmaf.h000066400000000000000000000031371477627663500151310ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_VMAF_H_ #define AOM_AOM_DSP_VMAF_H_ #include #include #include "aom_scale/yv12config.h" void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, bool cal_vmaf_neg); void aom_close_vmaf_context(VmafContext *vmaf_context); void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path); void aom_close_vmaf_model(VmafModel *vmaf_model); void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, bool cal_vmaf_neg, double *vmaf); void aom_read_vmaf_image(VmafContext *vmaf_context, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, int frame_index); double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, int frame_index); void aom_flush_vmaf_context(VmafContext *vmaf_context); #endif // AOM_AOM_DSP_VMAF_H_ aom-3.12.1/aom_dsp/x86/000077500000000000000000000000001477627663500144505ustar00rootroot00000000000000aom-3.12.1/aom_dsp/x86/adaptive_quantize_avx2.c000066400000000000000000000235171477627663500213010ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" static inline void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, __m256i *shift) { *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); *round = _mm256_permute4x64_epi64(*round, 0x54); *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } static inline void update_mask1_avx2(__m256i *cmp_mask, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i temp_mask = _mm256_setzero_si256(); if (_mm256_movemask_epi8(*cmp_mask)) { __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); temp_mask = _mm256_and_si256(*cmp_mask, iscan); *is_found = 1; } *mask = _mm256_max_epi16(temp_mask, *mask); } static inline void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i zero = _mm256_setzero_si256(); __m256i coeff[2], cmp_mask0, cmp_mask1; coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero); coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero); coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS); cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS); cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); cmp_mask0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); } static inline void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, const __m256i *quant, const __m256i *shift) { __m256i tmp, qcoeff; qcoeff = _mm256_adds_epi16(*coeff, *round); tmp = _mm256_mulhi_epi16(qcoeff, *quant); qcoeff = _mm256_add_epi16(tmp, qcoeff); *coeff = _mm256_mulhi_epi16(qcoeff, *shift); } static inline __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { return _mm256_mullo_epi16(qcoeff, dequant); } static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo); _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); } void aom_quantize_b_adaptive_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, 
const int16_t *iscan) { int index = 16; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m256i zero = _mm256_setzero_si256(); __m256i zbin, round, quant, dequant, shift; __m256i coeff, qcoeff; __m256i cmp_mask, mask0 = zero, mask1 = zero; __m128i temp_mask0, temp_mask1; int prescan_add[2]; int thresh[2]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; } __m256i threshold[2]; threshold[0] = _mm256_set1_epi32(thresh[0]); threshold[1] = _mm256_set1_epi32(thresh[1]); threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); // Do DC and first 15 AC. coeff = load_coefficients_avx2(coeff_ptr); qcoeff = _mm256_abs_epi16(coeff); update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); zbin = _mm256_unpackhi_epi64(zbin, zbin); cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); round = _mm256_unpackhi_epi64(round, round); quant = _mm256_unpackhi_epi64(quant, quant); shift = _mm256_unpackhi_epi64(shift, shift); dequant = _mm256_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); round = _mm256_unpackhi_epi64(round, round); quant = _mm256_unpackhi_epi64(quant, quant); shift = _mm256_unpackhi_epi64(shift, shift); // Reinsert signs qcoeff = _mm256_sign_epi16(qcoeff, coeff); // Mask out zbin threshold coeffs qcoeff = _mm256_and_si256(qcoeff, temp0); store_coefficients_avx2(qcoeff, qcoeff_ptr); coeff = calculate_dqcoeff_avx2(qcoeff, dequant); dequant = _mm256_unpackhi_epi64(dequant, dequant); store_coefficients_avx2(coeff, dqcoeff_ptr); } // AC only loop. 
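// Each iteration below quantizes 16 coefficients with the AC values (DC was handled above); a group whose coefficients all fall below the zero-bin is cleared and skipped in one step.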
while (index < n_coeffs) { coeff = load_coefficients_avx2(coeff_ptr + index); qcoeff = _mm256_abs_epi16(coeff); update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); index += 16; continue; } calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); qcoeff = _mm256_sign_epi16(qcoeff, coeff); qcoeff = _mm256_and_si256(qcoeff, temp0); store_coefficients_avx2(qcoeff, qcoeff_ptr + index); coeff = calculate_dqcoeff_avx2(qcoeff, dequant); store_coefficients_avx2(coeff, dqcoeff_ptr + index); index += 16; } if (is_found0) { temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), _mm256_extracti128_si256(mask0, 1)); non_zero_count = calculate_non_zero_count(temp_mask0); } if (is_found1) { temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), _mm256_extracti128_si256(mask1, 1)); non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); } for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff0 = qcoeff_ptr[rc]; if (qcoeff0) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff0 = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff0); const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } aom-3.12.1/aom_dsp/x86/adaptive_quantize_sse2.c000066400000000000000000000561121477627663500212720ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" void aom_quantize_b_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 16; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i zero = _mm_setzero_si128(); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); // Do DC and first 15 AC. coeff0 = load_coefficients(coeff_ptr); coeff1 = load_coefficients(coeff_ptr + 8); // Poor man's abs(). coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff(&qcoeff0, round, quant, shift); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff(&qcoeff1, round, quant, shift); // Reinsert signs qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr); store_coefficients(qcoeff1, qcoeff_ptr + 8); coeff0 = calculate_dqcoeff(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = calculate_dqcoeff(qcoeff1, dequant); store_coefficients(coeff0, dqcoeff_ptr); store_coefficients(coeff1, dqcoeff_ptr + 8); } // AC only 
loop. while (index < n_coeffs) { coeff0 = load_coefficients(coeff_ptr + index); coeff1 = load_coefficients(coeff_ptr + index + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); index += 16; continue; } calculate_qcoeff(&qcoeff0, round, quant, shift); calculate_qcoeff(&qcoeff1, round, quant, shift); qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr + index); store_coefficients(qcoeff1, qcoeff_ptr + index + 8); coeff0 = calculate_dqcoeff(qcoeff0, dequant); coeff1 = calculate_dqcoeff(qcoeff1, dequant); store_coefficients(coeff0, dqcoeff_ptr + index); store_coefficients(coeff1, dqcoeff_ptr + index + 8); index += 16; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } void aom_quantize_b_32x32_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 16; const int log_scale = 1; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i zero = _mm_setzero_si128(); const 
__m128i one = _mm_set1_epi16(1); const __m128i log_scale_vec = _mm_set1_epi16(log_scale); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); // Shift with rounding. zbin = _mm_add_epi16(zbin, log_scale_vec); round = _mm_add_epi16(round, log_scale_vec); zbin = _mm_srli_epi16(zbin, log_scale); round = _mm_srli_epi16(round, log_scale); zbin = _mm_sub_epi16(zbin, one); // Do DC and first 15 AC. coeff0 = load_coefficients(coeff_ptr); coeff1 = load_coefficients(coeff_ptr + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); // Reinsert signs qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr); store_coefficients(qcoeff1, qcoeff_ptr + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, &log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + 8, &log_scale); } // AC only loop. 
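// Remaining 32x32 coefficients are processed 16 per iteration (two 8-lane halves), using the log_scale = 1 quantize/dequantize helpers set up above.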
while (index < n_coeffs) { coeff0 = load_coefficients(coeff_ptr + index); coeff1 = load_coefficients(coeff_ptr + index + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); index += 16; continue; } calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr + index); store_coefficients(qcoeff1, qcoeff_ptr + index + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr + index, &log_scale); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8, &log_scale); index += 16; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } void aom_quantize_b_64x64_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 16; const int log_scale = 2; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i 
zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i log_scale_vec = _mm_set1_epi16(log_scale); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); // Shift with rounding. zbin = _mm_add_epi16(zbin, log_scale_vec); round = _mm_add_epi16(round, log_scale_vec); zbin = _mm_srli_epi16(zbin, log_scale); round = _mm_srli_epi16(round, log_scale); zbin = _mm_sub_epi16(zbin, one); // Do DC and first 15 AC. coeff0 = load_coefficients(coeff_ptr); coeff1 = load_coefficients(coeff_ptr + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); // Reinsert signs qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr); store_coefficients(qcoeff1, qcoeff_ptr + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, &log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + 8, &log_scale); } // AC only loop. 
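  // The adaptive quantizers build their prescan threshold as the rescaled zbin
  // times the flat quant-matrix weight plus a dequant-based margin, minus one
  // so the strict _mm_cmpgt compare can stand in for ">=". A scalar
  // restatement under the same definitions (ROUND_POWER_OF_TWO and EOB_FACTOR
  // as used above), kept under #if 0 as reference only; the helper name is
  // illustrative and not part of libaom.
#if 0
  static inline int sketch_prescan_thresh(int zbin, int dequant, int log_scale,
                                          int wt /* 1 << AOM_QM_BITS */,
                                          int eob_factor /* EOB_FACTOR */) {
    const int zbin_scaled = ROUND_POWER_OF_TWO(zbin, log_scale);
    const int prescan_add = ROUND_POWER_OF_TWO(dequant * eob_factor, 7);
    return zbin_scaled * wt + prescan_add - 1;  // "> thresh" behaves like ">="
  }
#endif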
while (index < n_coeffs) { coeff0 = load_coefficients(coeff_ptr + index); coeff1 = load_coefficients(coeff_ptr + index + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); index += 16; continue; } calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr + index); store_coefficients(qcoeff1, qcoeff_ptr + index + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr + index, &log_scale); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8, &log_scale); index += 16; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } aom-3.12.1/aom_dsp/x86/aom_convolve_copy_avx2.c000066400000000000000000000203761477627663500213050ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" static inline void copy_128(const uint8_t *src, uint8_t *dst) { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); } void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { // The w == 16 case uses _mm_store_si128(), which requires its output address // be aligned on a 16-byte boundary. if (w == 16) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 16)); } if (w == 2) { do { memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 8) { do { __m128i s[2]; s[0] = _mm_loadl_epi64((__m128i *)src); src += src_stride; s[1] = _mm_loadl_epi64((__m128i *)src); src += src_stride; _mm_storel_epi64((__m128i *)dst, s[0]); dst += dst_stride; _mm_storel_epi64((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 16) { do { __m128i s[2]; s[0] = _mm_loadu_si128((__m128i *)src); src += src_stride; s[1] = _mm_loadu_si128((__m128i *)src); src += src_stride; _mm_store_si128((__m128i *)dst, s[0]); dst += dst_stride; _mm_store_si128((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 32) { do { __m256i s[2]; s[0] = _mm256_loadu_si256((__m256i *)src); src += src_stride; s[1] = _mm256_loadu_si256((__m256i *)src); src += src_stride; _mm256_storeu_si256((__m256i *)dst, s[0]); dst += dst_stride; _mm256_storeu_si256((__m256i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 64) { do { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); src += src_stride; s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); src += src_stride; _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); dst += dst_stride; _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); dst += dst_stride; h -= 2; } while (h); } else { do { copy_128(src, dst); src += src_stride; dst += dst_stride; copy_128(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); 
_mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); } static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { __m256i s[8]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); } void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { // The w == 8 case uses _mm_store_si128(), which requires its output address // be aligned on a 16-byte boundary. if (w == 8) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 8)); } if (w == 2) { do { memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { __m128i s[2]; s[0] = _mm_loadl_epi64((__m128i *)src); src += src_stride; s[1] = _mm_loadl_epi64((__m128i *)src); src += src_stride; _mm_storel_epi64((__m128i *)dst, s[0]); dst += dst_stride; _mm_storel_epi64((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 8) { do { __m128i s[2]; s[0] = _mm_loadu_si128((__m128i *)src); src += src_stride; s[1] = _mm_loadu_si128((__m128i *)src); src += src_stride; _mm_store_si128((__m128i *)dst, s[0]); dst += dst_stride; _mm_store_si128((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 16) { do { __m256i s[2]; s[0] = _mm256_loadu_si256((__m256i *)src); src += src_stride; s[1] = _mm256_loadu_si256((__m256i *)src); src += src_stride; _mm256_storeu_si256((__m256i *)dst, s[0]); dst += dst_stride; _mm256_storeu_si256((__m256i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 32) { do { __m256i s[4]; s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); src += src_stride; s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); src += src_stride; _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); dst += dst_stride; _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); dst += dst_stride; h -= 2; } while (h); } else if (w == 64) { do { highbd_copy_64(src, dst); src += src_stride; dst += dst_stride; highbd_copy_64(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else { assert(w == 128); do { highbd_copy_128(src, dst); src += src_stride; dst += dst_stride; highbd_copy_128(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/aom_convolve_copy_sse2.c000066400000000000000000000262021477627663500212730ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for 
Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" static inline void copy_128(const uint8_t *src, uint8_t *dst) { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); } void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { // The w >= 16 cases use _mm_store_si128(), which requires its output address // be aligned on a 16-byte boundary. if (w >= 16) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 16)); } if (w == 2) { do { memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 8) { do { __m128i s[2]; s[0] = _mm_loadl_epi64((__m128i *)src); src += src_stride; s[1] = _mm_loadl_epi64((__m128i *)src); src += src_stride; _mm_storel_epi64((__m128i *)dst, s[0]); dst += dst_stride; _mm_storel_epi64((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 16) { do { __m128i s[2]; s[0] = _mm_loadu_si128((__m128i *)src); src += src_stride; s[1] = _mm_loadu_si128((__m128i *)src); src += src_stride; _mm_store_si128((__m128i *)dst, s[0]); dst += dst_stride; _mm_store_si128((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 32) { do { __m128i s[4]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); src += src_stride; s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); src += src_stride; _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); dst += dst_stride; _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); dst += dst_stride; h -= 2; } while (h); } else if (w == 64) { do { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); src += src_stride; s[4] = _mm_loadu_si128((__m128i *)(src + 0 
* 16)); s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); src += src_stride; _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); dst += dst_stride; _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); dst += dst_stride; h -= 2; } while (h); } else { do { copy_128(src, dst); src += src_stride; dst += dst_stride; copy_128(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); } static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) { __m128i s[16]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); } void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h) { // The w >= 8 cases use 
_mm_store_si128(), which requires its output address // be aligned on a 16-byte boundary. if (w >= 8) { assert(!((intptr_t)dst % 16)); assert(!(dst_stride % 8)); } if (w == 2) { do { __m128i s = _mm_loadl_epi64((__m128i *)src); *(int *)dst = _mm_cvtsi128_si32(s); src += src_stride; dst += dst_stride; s = _mm_loadl_epi64((__m128i *)src); *(int *)dst = _mm_cvtsi128_si32(s); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { __m128i s[2]; s[0] = _mm_loadl_epi64((__m128i *)src); src += src_stride; s[1] = _mm_loadl_epi64((__m128i *)src); src += src_stride; _mm_storel_epi64((__m128i *)dst, s[0]); dst += dst_stride; _mm_storel_epi64((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 8) { do { __m128i s[2]; s[0] = _mm_loadu_si128((__m128i *)src); src += src_stride; s[1] = _mm_loadu_si128((__m128i *)src); src += src_stride; _mm_store_si128((__m128i *)dst, s[0]); dst += dst_stride; _mm_store_si128((__m128i *)dst, s[1]); dst += dst_stride; h -= 2; } while (h); } else if (w == 16) { do { __m128i s[4]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); src += src_stride; s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); src += src_stride; _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); dst += dst_stride; _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); dst += dst_stride; h -= 2; } while (h); } else if (w == 32) { do { __m128i s[8]; s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); src += src_stride; s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); src += src_stride; _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); dst += dst_stride; _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); dst += dst_stride; h -= 2; } while (h); } else if (w == 64) { do { highbd_copy_64(src, dst); src += src_stride; dst += dst_stride; highbd_copy_64(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else { do { highbd_copy_128(src, dst); src += src_stride; dst += dst_stride; highbd_copy_128(src, dst); src += src_stride; dst += dst_stride; h -= 2; } while (h); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm000066400000000000000000000400741477627663500222140ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
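// All of the copy kernels above (SSE2 and AVX2, 8-bit and high bit depth) are
// wide versions of a plain strided block copy. A scalar reference sketch,
// assuming only the argument convention used above (element strides, a w x h
// block, h even); the function name is illustrative and not an aom API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void sketch_convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride, int w,
                                 int h) {
  do {
    memcpy(dst, src, (size_t)w);  // copy one row of w pixels
    src += src_stride;
    dst += dst_stride;
  } while (--h);
}

// The SIMD variants unroll two rows per iteration, and the widths that use
// aligned stores assert up front that dst and dst_stride are suitably aligned.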
; ; %include "aom_ports/x86_abi_support.asm" ;Note: tap3 and tap4 have to be applied and added after other taps to avoid ;overflow. %macro HIGH_GET_FILTERS_4 0 mov rdx, arg(5) ;filter ptr mov rcx, 0x00000040 movdqa xmm7, [rdx] ;load filters pshuflw xmm0, xmm7, 0b ;k0 pshuflw xmm1, xmm7, 01010101b ;k1 pshuflw xmm2, xmm7, 10101010b ;k2 pshuflw xmm3, xmm7, 11111111b ;k3 psrldq xmm7, 8 pshuflw xmm4, xmm7, 0b ;k4 pshuflw xmm5, xmm7, 01010101b ;k5 pshuflw xmm6, xmm7, 10101010b ;k6 pshuflw xmm7, xmm7, 11111111b ;k7 punpcklwd xmm0, xmm6 punpcklwd xmm2, xmm5 punpcklwd xmm3, xmm4 punpcklwd xmm1, xmm7 movdqa k0k6, xmm0 movdqa k2k5, xmm2 movdqa k3k4, xmm3 movdqa k1k7, xmm1 movq xmm6, rcx pshufd xmm6, xmm6, 0 movdqa krd, xmm6 ;Compute max and min values of a pixel mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b movdqa xmm2, xmm0 psllw xmm0, xmm1 psubw xmm0, xmm2 pxor xmm1, xmm1 movdqa max, xmm0 ;max value (for clamping) movdqa min, xmm1 ;min value (for clamping) %endm %macro HIGH_APPLY_FILTER_4 1 punpcklwd xmm0, xmm6 ;two row in one register punpcklwd xmm1, xmm7 punpcklwd xmm2, xmm5 punpcklwd xmm3, xmm4 pmaddwd xmm0, k0k6 ;multiply the filter factors pmaddwd xmm1, k1k7 pmaddwd xmm2, k2k5 pmaddwd xmm3, k3k4 paddd xmm0, xmm1 ;sum paddd xmm0, xmm2 paddd xmm0, xmm3 paddd xmm0, krd ;rounding psrad xmm0, 7 ;shift packssdw xmm0, xmm0 ;pack to word ;clamp the values pminsw xmm0, max pmaxsw xmm0, min %if %1 movq xmm1, [rdi] pavgw xmm0, xmm1 %endif movq [rdi], xmm0 %endm %macro HIGH_GET_FILTERS 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov rcx, 0x00000040 movdqa xmm7, [rdx] ;load filters pshuflw xmm0, xmm7, 0b ;k0 pshuflw xmm1, xmm7, 01010101b ;k1 pshuflw xmm2, xmm7, 10101010b ;k2 pshuflw xmm3, xmm7, 11111111b ;k3 pshufhw xmm4, xmm7, 0b ;k4 pshufhw xmm5, xmm7, 01010101b ;k5 pshufhw xmm6, xmm7, 10101010b ;k6 pshufhw xmm7, xmm7, 11111111b ;k7 punpcklqdq xmm2, xmm2 punpcklqdq xmm3, xmm3 punpcklwd xmm0, xmm1 punpckhwd xmm6, xmm7 punpckhwd xmm2, xmm5 punpckhwd xmm3, xmm4 movdqa k0k1, xmm0 ;store filter factors on stack movdqa k6k7, xmm6 movdqa k2k5, xmm2 movdqa k3k4, xmm3 movq xmm6, rcx pshufd xmm6, xmm6, 0 movdqa krd, xmm6 ;rounding ;Compute max and min values of a pixel mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b movdqa xmm2, xmm0 psllw xmm0, xmm1 psubw xmm0, xmm2 pxor xmm1, xmm1 movdqa max, xmm0 ;max value (for clamping) movdqa min, xmm1 ;min value (for clamping) %endm %macro LOAD_VERT_8 1 movdqu xmm0, [rsi + %1] ;0 movdqu xmm1, [rsi + rax + %1] ;1 movdqu xmm6, [rsi + rdx * 2 + %1] ;6 lea rsi, [rsi + rax] movdqu xmm7, [rsi + rdx * 2 + %1] ;7 movdqu xmm2, [rsi + rax + %1] ;2 movdqu xmm3, [rsi + rax * 2 + %1] ;3 movdqu xmm4, [rsi + rdx + %1] ;4 movdqu xmm5, [rsi + rax * 4 + %1] ;5 %endm %macro HIGH_APPLY_FILTER_8 2 movdqu temp, xmm4 movdqa xmm4, xmm0 punpcklwd xmm0, xmm1 punpckhwd xmm4, xmm1 movdqa xmm1, xmm6 punpcklwd xmm6, xmm7 punpckhwd xmm1, xmm7 movdqa xmm7, xmm2 punpcklwd xmm2, xmm5 punpckhwd xmm7, xmm5 movdqu xmm5, temp movdqu temp, xmm4 movdqa xmm4, xmm3 punpcklwd xmm3, xmm5 punpckhwd xmm4, xmm5 movdqu xmm5, temp pmaddwd xmm0, k0k1 pmaddwd xmm5, k0k1 pmaddwd xmm6, k6k7 pmaddwd xmm1, k6k7 pmaddwd xmm2, k2k5 pmaddwd xmm7, k2k5 pmaddwd xmm3, k3k4 pmaddwd xmm4, k3k4 paddd xmm0, xmm6 paddd xmm0, xmm2 paddd xmm0, xmm3 paddd xmm5, xmm1 paddd xmm5, xmm7 paddd xmm5, xmm4 paddd xmm0, krd ;rounding paddd xmm5, krd psrad xmm0, 7 ;shift psrad xmm5, 7 packssdw 
xmm0, xmm5 ;pack back to word ;clamp the values pminsw xmm0, max pmaxsw xmm0, min %if %1 movdqu xmm1, [rdi + %2] pavgw xmm0, xmm1 %endif movdqu [rdi + %2], xmm0 %endm SECTION .text ;void aom_highbd_filter_block1d4_v8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d4_v8_sse2) sym(aom_highbd_filter_block1d4_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi push rbx ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 7 %define k0k6 [rsp + 16 * 0] %define k2k5 [rsp + 16 * 1] %define k3k4 [rsp + 16 * 2] %define k1k7 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define max [rsp + 16 * 5] %define min [rsp + 16 * 6] HIGH_GET_FILTERS_4 mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rbx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rbx, [rbx + rbx] lea rdx, [rax + rax * 2] movsxd rcx, DWORD PTR arg(4) ;output_height .loop: movq xmm0, [rsi] ;load src: row 0 movq xmm1, [rsi + rax] ;1 movq xmm6, [rsi + rdx * 2] ;6 lea rsi, [rsi + rax] movq xmm7, [rsi + rdx * 2] ;7 movq xmm2, [rsi + rax] ;2 movq xmm3, [rsi + rax * 2] ;3 movq xmm4, [rsi + rdx] ;4 movq xmm5, [rsi + rax * 4] ;5 HIGH_APPLY_FILTER_4 0 lea rdi, [rdi + rbx] dec rcx jnz .loop add rsp, 16 * 7 pop rsp pop rbx ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void aom_highbd_filter_block1d8_v8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d8_v8_sse2) sym(aom_highbd_filter_block1d8_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi push rbx ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 8 %define k0k1 [rsp + 16 * 0] %define k6k7 [rsp + 16 * 1] %define k2k5 [rsp + 16 * 2] %define k3k4 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define temp [rsp + 16 * 5] %define max [rsp + 16 * 6] %define min [rsp + 16 * 7] HIGH_GET_FILTERS movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rbx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rbx, [rbx + rbx] lea rdx, [rax + rax * 2] movsxd rcx, DWORD PTR arg(4) ;output_height .loop: LOAD_VERT_8 0 HIGH_APPLY_FILTER_8 0, 0 lea rdi, [rdi + rbx] dec rcx jnz .loop add rsp, 16 * 8 pop rsp pop rbx ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void aom_highbd_filter_block1d16_v8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d16_v8_sse2) sym(aom_highbd_filter_block1d16_v8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi push rbx ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 8 %define k0k1 [rsp + 16 * 0] %define k6k7 [rsp + 16 * 1] %define k2k5 [rsp + 16 * 2] %define k3k4 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define temp [rsp + 16 * 5] %define max [rsp + 16 * 6] %define min [rsp + 16 * 7] HIGH_GET_FILTERS movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rbx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rbx, [rbx + rbx] lea rdx, [rax + rax * 2] movsxd rcx, DWORD PTR arg(4) ;output_height .loop: LOAD_VERT_8 0 HIGH_APPLY_FILTER_8 0, 0 sub rsi, rax LOAD_VERT_8 16 
HIGH_APPLY_FILTER_8 0, 16 add rdi, rbx dec rcx jnz .loop add rsp, 16 * 8 pop rsp pop rbx ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void aom_highbd_filter_block1d4_h8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d4_h8_sse2) sym(aom_highbd_filter_block1d4_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 7 %define k0k6 [rsp + 16 * 0] %define k2k5 [rsp + 16 * 1] %define k3k4 [rsp + 16 * 2] %define k1k7 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define max [rsp + 16 * 5] %define min [rsp + 16 * 6] HIGH_GET_FILTERS_4 mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rdx, [rdx + rdx] movsxd rcx, DWORD PTR arg(4) ;output_height .loop: movdqu xmm0, [rsi - 6] ;load src movdqu xmm4, [rsi + 2] movdqa xmm1, xmm0 movdqa xmm6, xmm4 movdqa xmm7, xmm4 movdqa xmm2, xmm0 movdqa xmm3, xmm0 movdqa xmm5, xmm4 psrldq xmm1, 2 psrldq xmm6, 4 psrldq xmm7, 6 psrldq xmm2, 4 psrldq xmm3, 6 psrldq xmm5, 2 HIGH_APPLY_FILTER_4 0 lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx jnz .loop add rsp, 16 * 7 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void aom_highbd_filter_block1d8_h8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d8_h8_sse2) sym(aom_highbd_filter_block1d8_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 8 %define k0k1 [rsp + 16 * 0] %define k6k7 [rsp + 16 * 1] %define k2k5 [rsp + 16 * 2] %define k3k4 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define temp [rsp + 16 * 5] %define max [rsp + 16 * 6] %define min [rsp + 16 * 7] HIGH_GET_FILTERS movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rdx, [rdx + rdx] movsxd rcx, DWORD PTR arg(4) ;output_height .loop: movdqu xmm0, [rsi - 6] ;load src movdqu xmm1, [rsi - 4] movdqu xmm2, [rsi - 2] movdqu xmm3, [rsi] movdqu xmm4, [rsi + 2] movdqu xmm5, [rsi + 4] movdqu xmm6, [rsi + 6] movdqu xmm7, [rsi + 8] HIGH_APPLY_FILTER_8 0, 0 lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx jnz .loop add rsp, 16 * 8 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void aom_highbd_filter_block1d16_h8_sse2 ;( ; const uint16_t *src_ptr, ; const ptrdiff_t src_pitch, ; uint16_t *output_ptr, ; ptrdiff_t out_pitch, ; unsigned int output_height, ; const int16_t *filter, ; int bd ;) globalsym(aom_highbd_filter_block1d16_h8_sse2) sym(aom_highbd_filter_block1d16_h8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 7 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 8 %define k0k1 [rsp + 16 * 0] %define k6k7 [rsp + 16 * 1] %define k2k5 [rsp + 16 * 2] %define k3k4 [rsp + 16 * 3] %define krd [rsp + 16 * 4] %define temp [rsp + 16 * 5] %define max [rsp + 16 * 6] %define min [rsp + 16 * 7] HIGH_GET_FILTERS movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch lea rax, [rax + rax] ;bytes per line lea rdx, [rdx + rdx] movsxd rcx, DWORD PTR arg(4) ;output_height 
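; A scalar sketch (C-style, illustrative names only) of what each output pixel
; of the 8-tap filters in this file reduces to; krd holds 64 because the filter
; precision is 7 bits, and max/min are the clamp bounds built from the bit depth:
;
;   int32_t sum = 0;
;   for (int k = 0; k < 8; ++k) sum += (int32_t)src[k] * filter[k];  ; 8 taps along the filter direction
;   int32_t val = (sum + 64) >> 7;            ; paddd krd, then psrad 7
;   out = clamp(val, 0, (1 << bd) - 1);       ; pminsw max, pmaxsw min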
.loop: movdqu xmm0, [rsi - 6] ;load src movdqu xmm1, [rsi - 4] movdqu xmm2, [rsi - 2] movdqu xmm3, [rsi] movdqu xmm4, [rsi + 2] movdqu xmm5, [rsi + 4] movdqu xmm6, [rsi + 6] movdqu xmm7, [rsi + 8] HIGH_APPLY_FILTER_8 0, 0 movdqu xmm0, [rsi + 10] ;load src movdqu xmm1, [rsi + 12] movdqu xmm2, [rsi + 14] movdqu xmm3, [rsi + 16] movdqu xmm4, [rsi + 18] movdqu xmm5, [rsi + 20] movdqu xmm6, [rsi + 22] movdqu xmm7, [rsi + 24] HIGH_APPLY_FILTER_8 0, 16 lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx jnz .loop add rsp, 16 * 8 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm000066400000000000000000000216651477627663500234530ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "aom_ports/x86_abi_support.asm" %macro HIGH_GET_PARAM_4 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov rcx, 0x00000040 movdqa xmm3, [rdx] ;load filters pshuflw xmm4, xmm3, 11111111b ;k3 psrldq xmm3, 8 pshuflw xmm3, xmm3, 0b ;k4 punpcklwd xmm4, xmm3 ;k3k4 movq xmm3, rcx ;rounding pshufd xmm3, xmm3, 0 mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps movq xmm5, rdx movq xmm2, rcx pshufd xmm5, xmm5, 0b movdqa xmm1, xmm5 psllw xmm5, xmm2 psubw xmm5, xmm1 ;max value (for clamping) pxor xmm2, xmm2 ;min value (for clamping) movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro HIGH_APPLY_FILTER_4 1 punpcklwd xmm0, xmm1 ;two row in one register pmaddwd xmm0, xmm4 ;multiply the filter factors paddd xmm0, xmm3 ;rounding psrad xmm0, 7 ;shift packssdw xmm0, xmm0 ;pack to word ;clamp the values pminsw xmm0, xmm5 pmaxsw xmm0, xmm2 %if %1 movq xmm1, [rdi] pavgw xmm0, xmm1 %endif movq [rdi], xmm0 lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] dec rcx %endm %macro HIGH_GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov rcx, 0x00000040 movdqa xmm6, [rdx] ;load filters pshuflw xmm7, xmm6, 11111111b ;k3 pshufhw xmm6, xmm6, 0b ;k4 psrldq xmm6, 8 punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 movq xmm4, rcx ;rounding pshufd xmm4, xmm4, 0 mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps movq xmm3, rdx movq xmm5, rcx pshufd xmm3, xmm3, 0b movdqa xmm1, xmm3 psllw xmm3, xmm5 psubw xmm3, xmm1 ;max value (for clamping) pxor xmm5, xmm5 ;min value (for clamping) movdqa max, xmm3 movdqa min, xmm5 movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro HIGH_APPLY_FILTER_8 1 movdqa xmm6, xmm0 punpckhwd xmm6, xmm1 punpcklwd xmm0, xmm1 pmaddwd xmm6, xmm7 pmaddwd xmm0, xmm7 paddd xmm6, xmm4 ;rounding paddd xmm0, xmm4 ;rounding psrad xmm6, 7 ;shift psrad xmm0, 7 ;shift packssdw xmm0, xmm6 ;pack back to word ;clamp the values pminsw xmm0, max pmaxsw xmm0, min %if %1 movdqu xmm1, [rdi] pavgw xmm0, xmm1 %endif movdqu [rdi], xmm0 ;store the result lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] dec rcx %endm %macro 
HIGH_APPLY_FILTER_16 1 movdqa xmm5, xmm0 movdqa xmm6, xmm2 punpckhwd xmm5, xmm1 punpckhwd xmm6, xmm3 punpcklwd xmm0, xmm1 punpcklwd xmm2, xmm3 pmaddwd xmm5, xmm7 pmaddwd xmm6, xmm7 pmaddwd xmm0, xmm7 pmaddwd xmm2, xmm7 paddd xmm5, xmm4 ;rounding paddd xmm6, xmm4 paddd xmm0, xmm4 paddd xmm2, xmm4 psrad xmm5, 7 ;shift psrad xmm6, 7 psrad xmm0, 7 psrad xmm2, 7 packssdw xmm0, xmm5 ;pack back to word packssdw xmm2, xmm6 ;pack back to word ;clamp the values pminsw xmm0, max pmaxsw xmm0, min pminsw xmm2, max pmaxsw xmm2, min %if %1 movdqu xmm1, [rdi] movdqu xmm3, [rdi + 16] pavgw xmm0, xmm1 pavgw xmm2, xmm3 %endif movdqu [rdi], xmm0 ;store the result movdqu [rdi + 16], xmm2 ;store the result lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] dec rcx %endm SECTION .text globalsym(aom_highbd_filter_block1d4_v2_sse2) sym(aom_highbd_filter_block1d4_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 push rsi push rdi ; end prolog HIGH_GET_PARAM_4 .loop: movq xmm0, [rsi] ;load src movq xmm1, [rsi + 2*rax] HIGH_APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(aom_highbd_filter_block1d8_v2_sse2) sym(aom_highbd_filter_block1d8_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 8 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 2 %define max [rsp + 16 * 0] %define min [rsp + 16 * 1] HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 movdqu xmm1, [rsi + 2*rax] ;1 HIGH_APPLY_FILTER_8 0 jnz .loop add rsp, 16 * 2 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_highbd_filter_block1d16_v2_sse2) sym(aom_highbd_filter_block1d16_v2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 9 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 2 %define max [rsp + 16 * 0] %define min [rsp + 16 * 1] HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 movdqu xmm2, [rsi + 16] movdqu xmm1, [rsi + 2*rax] ;1 movdqu xmm3, [rsi + 2*rax + 16] HIGH_APPLY_FILTER_16 0 jnz .loop add rsp, 16 * 2 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_highbd_filter_block1d4_h2_sse2) sym(aom_highbd_filter_block1d4_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 push rsi push rdi ; end prolog HIGH_GET_PARAM_4 .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 2 HIGH_APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(aom_highbd_filter_block1d8_h2_sse2) sym(aom_highbd_filter_block1d8_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 8 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 2 %define max [rsp + 16 * 0] %define min [rsp + 16 * 1] HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqu xmm1, [rsi + 2] HIGH_APPLY_FILTER_8 0 jnz .loop add rsp, 16 * 2 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_highbd_filter_block1d16_h2_sse2) sym(aom_highbd_filter_block1d16_h2_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 SAVE_XMM 9 push rsi push rdi ; end prolog ALIGN_STACK 16, rax sub rsp, 16 * 2 %define max [rsp + 16 * 0] %define min [rsp + 16 * 1] HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqu xmm1, [rsi + 2] movdqu xmm2, [rsi + 16] movdqu xmm3, [rsi + 18] HIGH_APPLY_FILTER_16 0 jnz .loop add rsp, 16 * 2 pop rsp ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/aom_dsp/x86/aom_quantize_avx.c000066400000000000000000000244761477627663500202030ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for 
Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_dsp/x86/quantize_x86.h"

static inline void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
                                               tran_low_t *dqcoeff) {
  const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
  const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
  const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
  const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
}

void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
                        const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
                        const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;
  (void)scan;
  *eob_ptr = 0;
  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);
  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);
  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);
  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
    if (n_coeffs == 16) return;
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);
    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);
    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
    eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0,
                       zero);
  }
  // AC only loop.
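  // calculate_dqcoeff_and_store() above rebuilds the full signed 32-bit
  // product from its 16-bit halves: _mm_mullo_epi16 supplies the low 16 bits
  // and _mm_mulhi_epi16 the high 16 bits of qcoeff * dequant, and the
  // unpacklo/unpackhi pair interleaves them back into 32-bit lanes. Scalar
  // sketch of what ends up in each stored value (reference only):
  //
  //   dqcoeff_ptr[rc] = (tran_low_t)((int32_t)qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
  //
  // The AC-only loop below applies the same quantize/dequantize steps to the
  // remaining groups of 16 coefficients.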
for (index = 16; index < n_coeffs; index += 16) { coeff0 = load_tran_low(coeff_ptr + index); coeff1 = load_tran_low(coeff_ptr + index + 8); qcoeff0 = _mm_abs_epi16(coeff0); qcoeff1 = _mm_abs_epi16(coeff1); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_test_all_zeros(all_zero, all_zero)) { _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); continue; } calculate_qcoeff(&qcoeff0, round, quant, shift); calculate_qcoeff(&qcoeff1, round, quant, shift); qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; const int log_scale = 1; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1; __m128i all_zero; __m128i eob = zero, eob0; (void)scan; // Setup global values. // The 32x32 halves zbin and round. zbin = _mm_load_si128((const __m128i *)zbin_ptr); // Shift with rounding. zbin = _mm_add_epi16(zbin, one); zbin = _mm_srli_epi16(zbin, 1); // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so // it is a strict "greater" comparison. zbin = _mm_sub_epi16(zbin, one); round = _mm_load_si128((const __m128i *)round_ptr); round = _mm_add_epi16(round, one); round = _mm_srli_epi16(round, 1); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); coeff1 = load_tran_low(coeff_ptr + 8); qcoeff0 = _mm_abs_epi16(coeff0); qcoeff1 = _mm_abs_epi16(coeff1); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
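  // The 32x32 setup above halves zbin and round with round-to-nearest
  // (ROUND_POWER_OF_TWO(x, 1) == (x + 1) >> 1) and, as the comment notes,
  // subtracts 1 from zbin because there is no 16-bit "greater or equal"
  // compare: for integers, abs_coeff >= zbin_q is the same as
  // abs_coeff > zbin_q - 1. Scalar sketch of the dead-zone test the cmp_mask
  // logic implements (reference only):
  //
  //   const int zbin_q = (zbin_ptr[rc != 0] + 1) >> 1;   // halved for 32x32
  //   if (!(abs(coeff_ptr[rc]) > zbin_q - 1))            // i.e. abs < zbin_q
  //     qcoeff_ptr[rc] = dqcoeff_ptr[rc] = 0;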
cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_test_all_zeros(all_zero, all_zero)) { _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); // Reinsert signs. qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); // Mask out zbin threshold coeffs. qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, &log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + 8, &log_scale); eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. for (index = 16; index < n_coeffs; index += 16) { coeff0 = load_tran_low(coeff_ptr + index); coeff1 = load_tran_low(coeff_ptr + index + 8); qcoeff0 = _mm_abs_epi16(coeff0); qcoeff1 = _mm_abs_epi16(coeff1); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_test_all_zeros(all_zero, all_zero)) { _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); continue; } calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr + index, &log_scale); calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8, &log_scale); eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } aom-3.12.1/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c000066400000000000000000001666211477627663500222550ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_ports/mem.h"

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||       \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||     \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||   \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else  // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__

static inline void xx_storeu2_epi32(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
  *((int *)(output_ptr + stride)) =
      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
}

static inline __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
  return a;
}

static inline void xx_storeu2_epi64(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_storel_epi64((__m128i *)(output_ptr + stride),
                   _mm256_extractf128_si256(*a, 1));
}

static inline void xx_store2_mi128(const uint8_t *output_ptr,
                                   const ptrdiff_t stride, const __m256i *a) {
  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_store_si128((__m128i *)(output_ptr + stride),
                  _mm256_extractf128_si256(*a, 1));
}

static void aom_filter_block1d4_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
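  // Halving the taps (the _mm_srai_epi16 by 1 above) keeps the packed signed
  // bytes and their pairwise maddubs sums within range; to preserve the
  // overall scale, the usual (sum + 64) >> 7 rounding becomes (sum + 32) >> 6,
  // which is why addFilterReg32 holds 32 and the results are shifted right
  // by 6. Scalar sketch of one output pixel under that convention
  // (illustrative names; tap[] is the four nonzero taps, src_window[] the four
  // source pixels they apply to, and clamp8 saturates to [0, 255] like packus):
  //
  //   int sum = 0;
  //   for (int k = 0; k < 4; ++k) sum += (tap[k] >> 1) * (int)src_window[k];
  //   out[x] = clamp8((sum + 32) >> 6);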
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b1_1 = _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); src_ptr += src_stride; xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 4 bytes if (i > 0) { __m128i srcReg1, srcRegFilt1_1; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); // multiply 4 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); // save 4 bytes *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); } } static void aom_filter_block1d4_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32, filt1Reg, filt2Reg; __m256i firstFilters, secondFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2; __m256i srcReg32b1; unsigned int i; ptrdiff_t src_stride, dst_stride; src_ptr -= 3; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 32 bits firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); // duplicate only the second 32 bits secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); // filter the source buffer srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); // multiply 4 adjacent elements with the filter and add the result srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); srcRegFilt32b1_1 = _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); src_ptr += src_stride; xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. 
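// Loop shape shared by these AVX2 kernels, shown schematically (illustrative
// only): two output rows travel together, one per 128-bit lane of a ymm
// register, so the row counter steps by two and a single leftover row is
// handled with 128-bit code after the loop when output_height is odd.
#if 0
for (i = output_height; i > 1; i -= 2) {
  // rows n and n + 1 filtered together, one per 128-bit lane
}
if (i > 0) {
  // odd height: the last row is filtered on its own with __m128i ops
}
#endif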
// process only 4 bytes if (i > 0) { __m128i srcReg1, srcRegFilt1_1; __m128i srcRegFilt2; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); // multiply 4 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); // filter the source buffer srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); // multiply 4 adjacent elements with the filter and add the result srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); // save 4 bytes *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); } } static void aom_filter_block1d8_h4_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32, filt2Reg, filt3Reg; __m256i secondFilters, thirdFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; src_ptr -= 3; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
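// Note: the *_h4 and *_v4 kernels only apply the middle four taps (bytes
// 2..5 of the packed coefficient register built below); the outer taps f0,
// f1, f6 and f7 are never multiplied in, so these kernels presumably serve
// filters whose outer taps are zero. Tap layout, schematically:
//   8-tap filter: f0 f1 f2 f3 f4 f5 f6 f7
//   4-tap case:    0  0 c0 c1 c2 c3  0  0   -> only c0..c3 are broadcast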
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); // multiply the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); src_ptr += src_stride; xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 8 bytes if (i > 0) { __m128i srcReg1, srcRegFilt1_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); // shrink to 8 bit each 16 bits srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); // save 8 bytes _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); } } static void aom_filter_block1d8_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1; unsigned int i; ptrdiff_t src_stride, dst_stride; src_ptr -= 3; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
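// In the 8-tap horizontal path below, filt1Reg..filt4Reg gather the source
// byte pairs at offsets 0/1, 2/3, 4/5 and 6/7 for every output pixel, each
// _mm256_maddubs_epi16 applies one coefficient pair, and the saturating adds
// combine outer pair with outer pair and inner with inner before the final
// sum. In scalar terms (illustrative only; f0..f7 are the halved taps):
#if 0
sum = ((f0 * s[0] + f1 * s[1]) + (f6 * s[6] + f7 * s[7]))   // outer pairs
    + ((f2 * s[2] + f3 * s[3]) + (f4 * s[4] + f5 * s[5]));  // inner pairs
#endif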
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); src_ptr += src_stride; xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. 
// process only 8 bytes if (i > 0) { __m128i srcReg1, srcRegFilt1_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); // save 8 bytes _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); } } static void aom_filter_block1d16_h4_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32, filt2Reg, filt3Reg; __m256i secondFilters, thirdFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; src_ptr -= 3; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); // multiply the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_stride; xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. 
// process only 16 bytes if (i > 0) { __m256i srcReg1, srcReg12; __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); // filter the source buffer srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); // add and saturate the results together srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); // save 16 bytes _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcRegFilt1_1)); } } static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; src_ptr -= 3; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
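// For a 16-pixel output row an 8-tap horizontal filter touches 23 source
// bytes (src_ptr was already moved back by 3, so pixels 0..15 read bytes
// 0..22). The loop below therefore issues a second unaligned 16-byte load at
// src_ptr + 8 that overlaps the first load by 8 bytes.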
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source srcReg32b1 = yy_loadu2_128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = yy_loadu2_128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16( srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = 
_mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_stride; xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // shift by 6 bit each 16 bit srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, 
srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); } } static void aom_filter_block1d8_v4_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i filtersReg32, addFilterReg32; __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; __m256i srcReg23_34_lo, srcReg45_56_lo; __m256i resReg23_34_lo, resReg45_56_lo; __m256i resReglo, resReg; __m256i secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg4x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); // have consecutive loads on the same 256 register srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg5x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); srcReg45 = _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); srcReg6x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); srcReg56 = _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); // merge every two consecutive registers srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); // multiply 2 adjacent elements with the filter and add the result resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); // add and saturate the results together resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); // shift by 6 bit each 16 bit resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); resReglo = _mm256_srai_epi16(resReglo, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg = _mm256_packus_epi16(resReglo, resReglo); src_ptr += src_stride; xx_storeu2_epi64(output_ptr, out_pitch, &resReg); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4x = srcReg6x; } } static void aom_filter_block1d8_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i 
srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); srcReg32b3 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg32b5 = xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i -= 2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // shift by 6 
bit each 16 bit srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); src_ptr += src_stride; xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); output_ptr += dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b11 = srcReg32b2; srcReg32b2 = srcReg32b4; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); // shift by 6 bit each 16 bit srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); // save 8 bytes _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); } } static void aom_filter_block1d16_v4_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i filtersReg32, addFilterReg32; __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; __m256i resReglo, resReghi, resReg; __m256i secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
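// The vertical kernels keep a sliding window of interleaved row pairs in
// registers: the rows needed for the first iteration are loaded and unpacked
// ahead of the loop, each iteration loads only the next two rows, and the
// "save part of the registers for next strides" block at the bottom of the
// loop rotates the window so no row is re-read from memory.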
filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg23 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg4x = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); // have consecutive loads on the same 256 register srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg5x = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg45 = _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); srcReg6x = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); srcReg56 = _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); // merge every two consecutive registers srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); // multiply 2 adjacent elements with the filter and add the result resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); // add and saturate the results together resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); // add and saturate the results together resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); // shift by 6 bit each 16 bit resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); resReglo = _mm256_srai_epi16(resReglo, 6); resReghi = _mm256_srai_epi16(resReghi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg = _mm256_packus_epi16(resReglo, resReghi); src_ptr += src_stride; xx_store2_mi128(output_ptr, out_pitch, &resReg); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg23_34_hi = srcReg45_56_hi; srcReg4x = srcReg6x; } } static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg32; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = 
_mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = yy_loadu2_128(src_ptr + src_pitch, src_ptr); srcReg32b3 = yy_loadu2_128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg32b5 = yy_loadu2_128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i -= 2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // multiply 2 
adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // shift by 6 bit each 16 bit srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr += src_stride; xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); output_ptr += dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); // shift by 6 bit each 16 bit srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // 
result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); } } static void aom_filter_block1d4_v4_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i filtersReg32, addFilterReg32; __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; __m256i srcReg23_34_lo, srcReg45_56_lo; __m256i srcReg2345_3456_lo; __m256i resReglo, resReg; __m256i firstFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); srcReg4x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); // have consecutive loads on the same 256 register srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { // load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg5x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); srcReg45 = _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); srcReg6x = _mm256_castsi128_si256( _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); srcReg56 = _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); // merge every two consecutive registers srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); // shift by 6 bit each 16 bit resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); resReglo = _mm256_srai_epi16(resReglo, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg = _mm256_packus_epi16(resReglo, resReglo); src_ptr += src_stride; xx_storeu2_epi32(output_ptr, out_pitch, &resReg); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4x = srcReg6x; } } #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; filter8_1dfunction aom_filter_block1d8_h2_ssse3; filter8_1dfunction aom_filter_block1d4_v2_ssse3; filter8_1dfunction aom_filter_block1d4_h2_ssse3; #define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 #define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 #define aom_filter_block1d16_h2_avx2 
aom_filter_block1d16_h2_ssse3 #define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 #define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 #define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 #define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 // void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); // void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2) FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2) #endif // HAVE_AVX2 && HAVE_SSSE3 aom-3.12.1/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c000066400000000000000000001010731477627663500224230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <tmmintrin.h> #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_ssse3.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/transpose_sse2.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; static void aom_filter_block1d4_h4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; unsigned int i; src_ptr -= 3; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register.
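// The SSSE3 kernels in this file use the same coefficient-halving and
// "+ 32, >> 6" rounding scheme as the AVX2 file above, but the horizontal
// variants below produce one output row per loop iteration with 128-bit
// registers, while the vertical variants still pair up two rows.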
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); for (i = output_height; i > 0; i -= 1) { // load the 2 strides of source srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); src_ptr += src_pixels_per_line; *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); output_ptr += output_pitch; } } static void aom_filter_block1d4_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32; __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, srcReg6, srcReg56; __m128i srcReg23_34_lo, srcReg45_56_lo; __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; __m128i resReglo, resReghi; __m128i firstFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); // merge every two consecutive registers srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); // shift by 6 bit each 16 bit resReglo = _mm_adds_epi16(resReglo, addFilterReg32); resReghi = _mm_adds_epi16(resReghi, addFilterReg32); resReglo = _mm_srai_epi16(resReglo, 6); resReghi = _mm_srai_epi16(resReghi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReglo = _mm_packus_epi16(resReglo, resReglo); resReghi = _mm_packus_epi16(resReghi, resReghi); src_ptr += src_stride; *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4 = srcReg6; } } static void aom_filter_block1d8_h4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32, filt2Reg, filt3Reg; __m128i secondFilters, thirdFilters; __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; __m128i srcReg32b1; unsigned int i; src_ptr -= 3; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); for (i = output_height; i > 0; i -= 1) { srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); // filter the source buffer srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); src_ptr += src_pixels_per_line; _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); output_ptr += output_pitch; } } static void aom_filter_block1d8_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23, srcReg34, srcReg45, srcReg56; __m128i resReg23, resReg34, resReg45, resReg56; __m128i resReg23_45, resReg34_56; __m128i addFilterReg32, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
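// In the vertical 4-tap path below, two consecutive rows are interleaved
// byte by byte so that one _mm_maddubs_epi16 applies a vertical coefficient
// pair per 16-bit lane. In scalar terms, per output column i (illustrative
// only; c2..c5 are the halved middle taps, r2..r6 the rows at
// src_ptr + src_pitch * 2 .. * 6, and clamp8 stands for the final packus
// clamp to 0..255):
#if 0
out0[i] = clamp8(((c2 * r2[i] + c3 * r3[i]) + (c4 * r4[i] + c5 * r5[i]) + 32) >> 6);
out1[i] = clamp8(((c2 * r3[i] + c3 * r4[i]) + (c4 * r5[i] + c5 * r6[i]) + 32) >> 6);
#endif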
filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); // add and saturate the results together resReg23_45 = _mm_adds_epi16(resReg23, resReg45); resReg34_56 = _mm_adds_epi16(resReg34, resReg56); // shift by 6 bit each 16 bit resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); resReg23_45 = _mm_srai_epi16(resReg23_45, 6); resReg34_56 = _mm_srai_epi16(resReg34_56, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); src_ptr += src_stride; _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); output_ptr += dst_stride; // save part of the registers for next strides srcReg23 = srcReg45; srcReg34 = srcReg56; srcReg4 = srcReg6; } } static void aom_filter_block1d16_h4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32, filt2Reg, filt3Reg; __m128i secondFilters, thirdFilters; __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m128i srcReg32b1, srcReg32b2; unsigned int i; src_ptr -= 3; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
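  // The pshufb masks used below broadcast one pair of byte taps into every
  // 16-bit lane so that pmaddubsw applies the same two taps to each pixel
  // pair. Illustration with hypothetical packed filter bytes {f0, f1, ..., f7}:
  //   _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)) ->
  //       { f2, f3, f2, f3, ..., f2, f3 }   (secondFilters)
  //   _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)) ->
  //       { f4, f5, f4, f5, ..., f4, f5 }   (thirdFilters)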
filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); for (i = output_height; i > 0; i -= 1) { srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); // filter the source buffer srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // reading stride of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_pixels_per_line; _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); output_ptr += output_pitch; } } static void aom_filter_block1d16_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; __m128i resReg23_45, resReg34_56; __m128i addFilterReg32, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
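  // The 16-wide vertical kernel follows the same scheme as the 8-wide one,
  // except each row pair is kept as two unpacked halves (the *_lo / *_hi
  // registers), so a single 16-byte load per new row feeds both halves of
  // the convolution.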
filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); // add and saturate the results together resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); // multiply 2 adjacent elements with the filter and add the result resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); // add and saturate the results together resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); // shift by 6 bit each 16 bit resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); src_ptr += src_stride; _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_lo = srcReg45_lo; srcReg34_lo = srcReg56_lo; srcReg23_hi = srcReg45_hi; srcReg34_hi = srcReg56_hi; srcReg4 = srcReg6; } } static inline __m128i shuffle_filter_convolve8_8_ssse3( const __m128i *const s, const int16_t *const filter) { 
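  // Helper for the scaled-convolve paths below: shuffle_filter_ssse3() is
  // expected to split the 8 taps into the four byte-pair constants that
  // convolve8_8_ssse3() consumes for its pmaddubsw / saturating-add /
  // round-and-shift sequence (both helpers live in the shared SSSE3 convolve
  // header).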
__m128i f[4]; shuffle_filter_ssse3(filter, f); return convolve8_8_ssse3(s, f); } static void filter_horiz_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const int16_t *const x_filter) { __m128i s[8], ss[4], temp; load_8bit_8x8(src, src_stride, s); // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 transpose_16bit_4x8(s, ss); temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } static void transpose8x8_to_dst(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const ptrdiff_t dst_stride) { __m128i s[8]; load_8bit_8x8(src, src_stride, s); transpose_8bit_8x8(s, s); store_8bit_8x8(s, dst, dst_stride); } static void scaledconvolve_horiz_w8(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filters, const int x0_q4, const int x_step_q4, const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. y = h + (8 - (h & 0x7)); do { int x_q4 = x0_q4; for (x = 0; x < w; x += 8) { // process 8 src_x steps for (z = 0; z < 8; ++z) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; if (x_q4 & SUBPEL_MASK) { filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); } else { int i; for (i = 0; i < 8; ++i) { temp[z * 8 + i] = src_x[i * src_stride + 3]; } } x_q4 += x_step_q4; } // transpose the 8x8 filters values back to dst transpose8x8_to_dst(temp, 8, dst + x, dst_stride); } src += src_stride * 8; dst += dst_stride * 8; } while (y -= 8); } static void filter_horiz_w4_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const int16_t *const filter) { __m128i s[4]; __m128i temp; load_8bit_8x4(src, src_stride, s); transpose_16bit_4x4(s, s); temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } static void transpose4x4_to_dst(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const ptrdiff_t dst_stride) { __m128i s[4]; load_8bit_4x4(src, src_stride, s); s[0] = transpose_8bit_4x4(s); s[1] = _mm_srli_si128(s[0], 4); s[2] = _mm_srli_si128(s[0], 8); s[3] = _mm_srli_si128(s[0], 12); store_8bit_4x4(s, dst, dst_stride); } static void scaledconvolve_horiz_w4(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const InterpKernel *const x_filters, const int x0_q4, const int x_step_q4, const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; y += 4) { int x_q4 = x0_q4; for (x = 0; x < w; x += 4) { // process 4 src_x steps for (z = 0; z < 4; ++z) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; if (x_q4 & SUBPEL_MASK) { filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); } else { int i; for (i = 0; i < 4; ++i) { temp[z * 4 + i] = src_x[i * src_stride + 3]; } } x_q4 += 
x_step_q4; } // transpose the 4x4 filters values back to dst transpose4x4_to_dst(temp, 4, dst + x, dst_stride); } src += src_stride * 4; dst += dst_stride * 4; } } static __m128i filter_vert_kernel(const __m128i *const s, const int16_t *const filter) { __m128i ss[4]; __m128i temp; // 00 10 01 11 02 12 03 13 ss[0] = _mm_unpacklo_epi8(s[0], s[1]); // 20 30 21 31 22 32 23 33 ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // 40 50 41 51 42 52 43 53 ss[2] = _mm_unpacklo_epi8(s[4], s[5]); // 60 70 61 71 62 72 63 73 ss[3] = _mm_unpacklo_epi8(s[6], s[7]); temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits return _mm_packus_epi16(temp, temp); } static void filter_vert_w4_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const int16_t *const filter) { __m128i s[8]; __m128i temp; load_8bit_4x8(src, src_stride, s); temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } static void scaledconvolve_vert_w4( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filters, const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; if (y_q4 & SUBPEL_MASK) { filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); } else { memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); } y_q4 += y_step_q4; } } static void filter_vert_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, uint8_t *const dst, const int16_t *const filter) { __m128i s[8], temp; load_8bit_8x8(src, src_stride, s); temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } static void scaledconvolve_vert_w8( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filters, const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; if (y_q4 & SUBPEL_MASK) { filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); } else { memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); } y_q4 += y_step_q4; } } static void filter_vert_w16_ssse3(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, const int16_t *const filter, const int w) { int i; __m128i f[4]; shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; loadu_8bit_16x8(src, src_stride, s); // merge the result together s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); temp_lo = convolve8_8_ssse3(s_lo, f); temp_hi = convolve8_8_ssse3(s_hi, f); // shrink to 8 bit each 16 bits, the first lane contain the first convolve // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); src += 16; // save 16 bytes 
convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } static void scaledconvolve_vert_w16( const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, const ptrdiff_t dst_stride, const InterpKernel *const y_filters, const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; if (y_q4 & SUBPEL_MASK) { filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, w); } else { memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); } y_q4 += y_step_q4; } } void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the temp buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 64x64 pixels. // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. // When calling in frame scaling function, the smallest scaling factor is x1/4 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. 
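  // Working the bound through explicitly: with h = 64 and y_step_q4 = 32 the
  // 64 output rows span (64 - 1) * 32 + 15 = 2031 sixteenths of a source row,
  // i.e. 127 source rows after rounding up, and the 8-tap filter needs
  // SUBPEL_TAPS = 8 further rows: 127 + 8 = 135. The horizontal pass below may
  // write up to 8 extra rows because of its 8x8 transpose tail, hence
  // (135 + 8) rows of intermediate storage.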
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, y0_q4, y_step_q4, w, h); } } filter8_1dfunction aom_filter_block1d16_v8_ssse3; filter8_1dfunction aom_filter_block1d16_h8_ssse3; filter8_1dfunction aom_filter_block1d8_v8_ssse3; filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; filter8_1dfunction aom_filter_block1d8_h2_ssse3; filter8_1dfunction aom_filter_block1d4_v2_ssse3; filter8_1dfunction aom_filter_block1d4_h2_ssse3; // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); // void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3) FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3) aom-3.12.1/aom_dsp/x86/aom_subpixel_8t_ssse3.asm000066400000000000000000000732371477627663500214100ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_64: times 8 dw 64 even_byte_mask: times 8 dw 0x00ff ; %define USE_PMULHRSW ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss ; when using this instruction. ; ; The add order below (based on ffav1) must be followed to prevent outranges. ; x = k0k1 + k4k5 ; y = k2k3 + k6k7 ; z = signed SAT(x + y) SECTION .text %define LOCAL_VARS_SIZE 16*6 %macro SETUP_LOCAL_VARS 0 ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 + ; pmaddubsw has a higher latency on some platforms, this might be eased by ; interleaving the instructions. 
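; SETUP_LOCAL_VARS expects the eight 16-bit taps in m4 and spills them as the
; four byte-pair constants k0k1 .. k6k7 plus the rounding constant krd (= 64
; per word). Following the add order noted at the top of the file,
; (k0k1 + k4k5) and (k2k3 + k6k7), keeps the two large centre-tap pairs (k2k3
; and k4k5) in different intermediate sums, so each partial sum stays inside
; int16 and only the final add relies on signed saturation.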
%define k0k1 [rsp + 16*0] %define k2k3 [rsp + 16*1] %define k4k5 [rsp + 16*2] %define k6k7 [rsp + 16*3] packsswb m4, m4 ; TODO(slavarnway): multiple pshufb instructions had a higher latency on ; some platforms. pshuflw m0, m4, 0b ;k0_k1 pshuflw m1, m4, 01010101b ;k2_k3 pshuflw m2, m4, 10101010b ;k4_k5 pshuflw m3, m4, 11111111b ;k6_k7 punpcklqdq m0, m0 punpcklqdq m1, m1 punpcklqdq m2, m2 punpcklqdq m3, m3 mova k0k1, m0 mova k2k3, m1 mova k4k5, m2 mova k6k7, m3 %if AOM_ARCH_X86_64 %define krd m12 %define tmp0 [rsp + 16*4] %define tmp1 [rsp + 16*5] mova krd, [GLOBAL(pw_64)] %else %define krd [rsp + 16*4] %if CONFIG_PIC=0 mova m6, [GLOBAL(pw_64)] %else ; build constants without accessing global memory pcmpeqb m6, m6 ;all ones psrlw m6, 15 psllw m6, 6 ;aka pw_64 %endif mova krd, m6 %endif %endm ;------------------------------------------------------------------------------- %if AOM_ARCH_X86_64 %define LOCAL_VARS_SIZE_H4 0 %else %define LOCAL_VARS_SIZE_H4 16*4 %endif %macro SUBPIX_HFILTER4 1 cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] packsswb m4, m4 %if AOM_ARCH_X86_64 %define k0k1k4k5 m8 %define k2k3k6k7 m9 %define krd m10 mova krd, [GLOBAL(pw_64)] pshuflw k0k1k4k5, m4, 0b ;k0_k1 pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 %else %define k0k1k4k5 [rsp + 16*0] %define k2k3k6k7 [rsp + 16*1] %define krd [rsp + 16*2] pshuflw m6, m4, 0b ;k0_k1 pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 pshuflw m7, m4, 01010101b ;k2_k3 pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 %if CONFIG_PIC=0 mova m1, [GLOBAL(pw_64)] %else ; build constants without accessing global memory pcmpeqb m1, m1 ;all ones psrlw m1, 15 psllw m1, 6 ;aka pw_64 %endif mova k0k1k4k5, m6 mova k2k3k6k7, m7 mova krd, m1 %endif dec heightd .loop: ;Do two rows at once movu m4, [srcq - 3] movu m5, [srcq + sstrideq - 3] punpckhbw m1, m4, m4 punpcklbw m4, m4 punpckhbw m3, m5, m5 punpcklbw m5, m5 palignr m0, m1, m4, 1 pmaddubsw m0, k0k1k4k5 palignr m1, m4, 5 pmaddubsw m1, k2k3k6k7 palignr m2, m3, m5, 1 pmaddubsw m2, k0k1k4k5 palignr m3, m5, 5 pmaddubsw m3, k2k3k6k7 punpckhqdq m4, m0, m2 punpcklqdq m0, m2 punpckhqdq m5, m1, m3 punpcklqdq m1, m3 paddsw m0, m4 paddsw m1, m5 %ifidn %1, h8_avg movd m4, [dstq] movd m5, [dstq + dstrideq] %endif paddsw m0, m1 paddsw m0, krd psraw m0, 7 %ifidn %1, h8_add_src pxor m3, m3 movu m4, [srcq] movu m5, [srcq + sstrideq] punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 punpcklbw m4, m3 paddsw m0, m4 %endif packuswb m0, m0 psrldq m1, m0, 4 %ifidn %1, h8_avg pavgb m0, m4 pavgb m1, m5 %endif movd [dstq], m0 movd [dstq + dstrideq], m1 lea srcq, [srcq + sstrideq ] prefetcht0 [srcq + 4 * sstrideq - 3] lea srcq, [srcq + sstrideq ] lea dstq, [dstq + 2 * dstrideq ] prefetcht0 [srcq + 2 * sstrideq - 3] sub heightd, 2 jg .loop ; Do last row if output_height is odd jne .done movu m4, [srcq - 3] punpckhbw m1, m4, m4 punpcklbw m4, m4 palignr m0, m1, m4, 1 palignr m1, m4, 5 pmaddubsw m0, k0k1k4k5 pmaddubsw m1, k2k3k6k7 psrldq m2, m0, 8 psrldq m3, m1, 8 paddsw m0, m2 paddsw m1, m3 paddsw m0, m1 paddsw m0, krd psraw m0, 7 %ifidn %1, h8_add_src pxor m3, m3 movu m4, [srcq] punpcklbw m4, m3 paddsw m0, m4 %endif packuswb m0, m0 %ifidn %1, h8_avg movd m4, [dstq] pavgb m0, m4 %endif movd [dstq], m0 .done: REP_RET %endm ;------------------------------------------------------------------------------- %macro SUBPIX_HFILTER8 1 cglobal filter_block1d8_%1, 6, 6, 14, 
LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS dec heightd .loop: ;Do two rows at once movu m0, [srcq - 3] movu m4, [srcq + sstrideq - 3] punpckhbw m1, m0, m0 punpcklbw m0, m0 palignr m5, m1, m0, 13 pmaddubsw m5, k6k7 palignr m2, m1, m0, 5 palignr m3, m1, m0, 9 palignr m1, m0, 1 pmaddubsw m1, k0k1 punpckhbw m6, m4, m4 punpcklbw m4, m4 pmaddubsw m2, k2k3 pmaddubsw m3, k4k5 palignr m7, m6, m4, 13 palignr m0, m6, m4, 5 pmaddubsw m7, k6k7 paddsw m1, m3 paddsw m2, m5 paddsw m1, m2 %ifidn %1, h8_avg movh m2, [dstq] movhps m2, [dstq + dstrideq] %endif palignr m5, m6, m4, 9 palignr m6, m4, 1 pmaddubsw m0, k2k3 pmaddubsw m6, k0k1 paddsw m1, krd pmaddubsw m5, k4k5 psraw m1, 7 paddsw m0, m7 paddsw m6, m5 paddsw m6, m0 paddsw m6, krd psraw m6, 7 %ifidn %1, h8_add_src pxor m3, m3 movu m4, [srcq] movu m5, [srcq + sstrideq] punpcklbw m4, m3 punpcklbw m5, m3 paddsw m1, m4 paddsw m6, m5 %endif packuswb m1, m6 %ifidn %1, h8_avg pavgb m1, m2 %endif movh [dstq], m1 movhps [dstq + dstrideq], m1 lea srcq, [srcq + sstrideq ] prefetcht0 [srcq + 4 * sstrideq - 3] lea srcq, [srcq + sstrideq ] lea dstq, [dstq + 2 * dstrideq ] prefetcht0 [srcq + 2 * sstrideq - 3] sub heightd, 2 jg .loop ; Do last row if output_height is odd jne .done movu m0, [srcq - 3] punpckhbw m3, m0, m0 punpcklbw m0, m0 palignr m1, m3, m0, 1 palignr m2, m3, m0, 5 palignr m4, m3, m0, 13 palignr m3, m0, 9 pmaddubsw m1, k0k1 pmaddubsw m2, k2k3 pmaddubsw m3, k4k5 pmaddubsw m4, k6k7 paddsw m1, m3 paddsw m4, m2 paddsw m1, m4 paddsw m1, krd psraw m1, 7 %ifidn %1, h8_add_src pxor m6, m6 movu m5, [srcq] punpcklbw m5, m6 paddsw m1, m5 %endif packuswb m1, m1 %ifidn %1, h8_avg movh m0, [dstq] pavgb m1, m0 %endif movh [dstq], m1 .done: REP_RET %endm ;------------------------------------------------------------------------------- %macro SUBPIX_HFILTER16 1 cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS .loop: prefetcht0 [srcq + 2 * sstrideq -3] movu m0, [srcq - 3] movu m4, [srcq - 2] pmaddubsw m0, k0k1 pmaddubsw m4, k0k1 movu m1, [srcq - 1] movu m5, [srcq + 0] pmaddubsw m1, k2k3 pmaddubsw m5, k2k3 movu m2, [srcq + 1] movu m6, [srcq + 2] pmaddubsw m2, k4k5 pmaddubsw m6, k4k5 movu m3, [srcq + 3] movu m7, [srcq + 4] pmaddubsw m3, k6k7 pmaddubsw m7, k6k7 paddsw m0, m2 paddsw m1, m3 paddsw m0, m1 paddsw m4, m6 paddsw m5, m7 paddsw m4, m5 paddsw m0, krd paddsw m4, krd psraw m0, 7 psraw m4, 7 %ifidn %1, h8_add_src %if AOM_ARCH_X86=1 && CONFIG_PIC=1 pcmpeqb m2, m2 ;all ones psrlw m2, 8 ;even_byte_mask %else mova m2, [GLOBAL(even_byte_mask)] %endif movu m5, [srcq] mova m7, m5 pand m5, m2 psrlw m7, 8 paddsw m0, m5 paddsw m4, m7 %endif packuswb m0, m0 packuswb m4, m4 punpcklbw m0, m4 %ifidn %1, h8_avg pavgb m0, [dstq] %endif lea srcq, [srcq + sstrideq] mova [dstq], m0 lea dstq, [dstq + dstrideq] dec heightd jnz .loop REP_RET %endm INIT_XMM ssse3 SUBPIX_HFILTER16 h8 SUBPIX_HFILTER8 h8 SUBPIX_HFILTER4 h8 ;------------------------------------------------------------------------------- ; TODO(Linfeng): Detect cpu type and choose the code with better performance. 
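; The SUBPIX_VFILTER* macros below come in two flavours: a two-source-pointer
; version that produces two rows per iteration with relatively few xmm
; registers (always used on 32-bit x86), and an x86-64-only version that keeps
; many unpacked rows resident in registers. The define below forces the former
; even on x86-64, which was reportedly the faster choice on older
; (Celeron-class) cores; set it to 0 to get the register-heavy path instead.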
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 %if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON %define NUM_GENERAL_REG_USED 9 %else %define NUM_GENERAL_REG_USED 6 %endif %macro SUBPIX_VFILTER 2 cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS %ifidn %2, 8 %define movx movh %else %define movx movd %endif dec heightd %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON %if AOM_ARCH_X86_64 %define src1q r7 %define sstride6q r8 %define dst_stride dstrideq %else %define src1q filterq %define sstride6q dstrideq %define dst_stride dstridemp %endif mov src1q, srcq add src1q, sstrideq lea sstride6q, [sstrideq + sstrideq * 4] add sstride6q, sstrideq ;pitch * 6 .loop: ;Do two rows at once movx m0, [srcq ] ;A movx m1, [src1q ] ;B punpcklbw m0, m1 ;A B movx m2, [srcq + sstrideq * 2 ] ;C pmaddubsw m0, k0k1 mova m6, m2 movx m3, [src1q + sstrideq * 2] ;D punpcklbw m2, m3 ;C D pmaddubsw m2, k2k3 movx m4, [srcq + sstrideq * 4 ] ;E mova m7, m4 movx m5, [src1q + sstrideq * 4] ;F punpcklbw m4, m5 ;E F pmaddubsw m4, k4k5 punpcklbw m1, m6 ;A B next iter movx m6, [srcq + sstride6q ] ;G punpcklbw m5, m6 ;E F next iter punpcklbw m3, m7 ;C D next iter pmaddubsw m5, k4k5 movx m7, [src1q + sstride6q ] ;H punpcklbw m6, m7 ;G H pmaddubsw m6, k6k7 pmaddubsw m3, k2k3 pmaddubsw m1, k0k1 paddsw m0, m4 paddsw m2, m6 movx m6, [srcq + sstrideq * 8 ] ;H next iter punpcklbw m7, m6 pmaddubsw m7, k6k7 paddsw m0, m2 paddsw m0, krd psraw m0, 7 paddsw m1, m5 %ifidn %1, v8_add_src pxor m6, m6 movu m4, [srcq] punpcklbw m4, m6 paddsw m0, m4 %endif packuswb m0, m0 paddsw m3, m7 paddsw m1, m3 paddsw m1, krd psraw m1, 7 %ifidn %1, v8_add_src movu m4, [src1q] punpcklbw m4, m6 paddsw m1, m4 %endif lea srcq, [srcq + sstrideq * 2 ] lea src1q, [src1q + sstrideq * 2] packuswb m1, m1 %ifidn %1, v8_avg movx m2, [dstq] pavgb m0, m2 %endif movx [dstq], m0 add dstq, dst_stride %ifidn %1, v8_avg movx m3, [dstq] pavgb m1, m3 %endif movx [dstq], m1 add dstq, dst_stride sub heightd, 2 jg .loop ; Do last row if output_height is odd jne .done movx m0, [srcq ] ;A movx m1, [srcq + sstrideq ] ;B movx m6, [srcq + sstride6q ] ;G punpcklbw m0, m1 ;A B movx m7, [src1q + sstride6q ] ;H pmaddubsw m0, k0k1 movx m2, [srcq + sstrideq * 2 ] ;C punpcklbw m6, m7 ;G H movx m3, [src1q + sstrideq * 2] ;D pmaddubsw m6, k6k7 movx m4, [srcq + sstrideq * 4 ] ;E punpcklbw m2, m3 ;C D movx m5, [src1q + sstrideq * 4] ;F punpcklbw m4, m5 ;E F pmaddubsw m2, k2k3 pmaddubsw m4, k4k5 paddsw m2, m6 paddsw m0, m4 paddsw m0, m2 paddsw m0, krd psraw m0, 7 %ifidn %1, v8_add_src pxor m6, m6 movu m4, [srcq] punpcklbw m4, m6 paddsw m0, m4 %endif packuswb m0, m0 %ifidn %1, v8_avg movx m1, [dstq] pavgb m0, m1 %endif movx [dstq], m0 %else ; AOM_ARCH_X86_64 movx m0, [srcq ] ;A movx m1, [srcq + sstrideq ] ;B lea srcq, [srcq + sstrideq * 2 ] movx m2, [srcq] ;C movx m3, [srcq + sstrideq] ;D lea srcq, [srcq + sstrideq * 2 ] movx m4, [srcq] ;E movx m5, [srcq + sstrideq] ;F lea srcq, [srcq + sstrideq * 2 ] movx m6, [srcq] ;G punpcklbw m0, m1 ;A B punpcklbw m1, m2 ;A B next iter punpcklbw m2, m3 ;C D punpcklbw m3, m4 ;C D next iter punpcklbw m4, m5 ;E F punpcklbw m5, m6 ;E F next iter .loop: ;Do two rows at once movx m7, [srcq + sstrideq] ;H lea srcq, [srcq + sstrideq * 2 ] movx m14, [srcq] ;H next iter punpcklbw m6, m7 ;G H punpcklbw m7, m14 ;G H next iter pmaddubsw m8, m0, k0k1 pmaddubsw m9, m1, k0k1 mova m0, m2 mova m1, m3 pmaddubsw m10, m2, k2k3 pmaddubsw m11, m3, k2k3 mova 
m2, m4 mova m3, m5 pmaddubsw m4, k4k5 pmaddubsw m5, k4k5 paddsw m8, m4 paddsw m9, m5 mova m4, m6 mova m5, m7 pmaddubsw m6, k6k7 pmaddubsw m7, k6k7 paddsw m10, m6 paddsw m11, m7 paddsw m8, m10 paddsw m9, m11 mova m6, m14 paddsw m8, krd paddsw m9, krd psraw m8, 7 psraw m9, 7 %ifidn %2, 4 packuswb m8, m8 packuswb m9, m9 %else packuswb m8, m9 %endif %ifidn %1, v8_avg movx m7, [dstq] %ifidn %2, 4 movx m10, [dstq + dstrideq] pavgb m9, m10 %else movhpd m7, [dstq + dstrideq] %endif pavgb m8, m7 %endif movx [dstq], m8 %ifidn %2, 4 movx [dstq + dstrideq], m9 %else movhpd [dstq + dstrideq], m8 %endif lea dstq, [dstq + dstrideq * 2 ] sub heightd, 2 jg .loop ; Do last row if output_height is odd jne .done movx m7, [srcq + sstrideq] ;H punpcklbw m6, m7 ;G H pmaddubsw m0, k0k1 pmaddubsw m2, k2k3 pmaddubsw m4, k4k5 pmaddubsw m6, k6k7 paddsw m0, m4 paddsw m2, m6 paddsw m0, m2 paddsw m0, krd psraw m0, 7 packuswb m0, m0 %ifidn %1, v8_avg movx m1, [dstq] pavgb m0, m1 %endif movx [dstq], m0 %endif ; AOM_ARCH_X86_64 .done: REP_RET %endm ;------------------------------------------------------------------------------- %macro SUBPIX_VFILTER16 1 cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] SETUP_LOCAL_VARS %if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON %if AOM_ARCH_X86_64 %define src1q r7 %define sstride6q r8 %define dst_stride dstrideq %else %define src1q filterq %define sstride6q dstrideq %define dst_stride dstridemp %endif lea src1q, [srcq + sstrideq] lea sstride6q, [sstrideq + sstrideq * 4] add sstride6q, sstrideq ;pitch * 6 .loop: movh m0, [srcq ] ;A movh m1, [src1q ] ;B movh m2, [srcq + sstrideq * 2 ] ;C movh m3, [src1q + sstrideq * 2] ;D movh m4, [srcq + sstrideq * 4 ] ;E movh m5, [src1q + sstrideq * 4] ;F punpcklbw m0, m1 ;A B movh m6, [srcq + sstride6q] ;G punpcklbw m2, m3 ;C D movh m7, [src1q + sstride6q] ;H punpcklbw m4, m5 ;E F pmaddubsw m0, k0k1 movh m3, [srcq + 8] ;A pmaddubsw m2, k2k3 punpcklbw m6, m7 ;G H movh m5, [srcq + sstrideq + 8] ;B pmaddubsw m4, k4k5 punpcklbw m3, m5 ;A B movh m7, [srcq + sstrideq * 2 + 8] ;C pmaddubsw m6, k6k7 movh m5, [src1q + sstrideq * 2 + 8] ;D punpcklbw m7, m5 ;C D paddsw m2, m6 pmaddubsw m3, k0k1 movh m1, [srcq + sstrideq * 4 + 8] ;E paddsw m0, m4 pmaddubsw m7, k2k3 movh m6, [src1q + sstrideq * 4 + 8] ;F punpcklbw m1, m6 ;E F paddsw m0, m2 paddsw m0, krd movh m2, [srcq + sstride6q + 8] ;G pmaddubsw m1, k4k5 movh m5, [src1q + sstride6q + 8] ;H psraw m0, 7 punpcklbw m2, m5 ;G H pmaddubsw m2, k6k7 paddsw m7, m2 paddsw m3, m1 paddsw m3, m7 paddsw m3, krd psraw m3, 7 %ifidn %1, v8_add_src pxor m6, m6 movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down mova m5, m4 punpcklbw m4, m6 punpckhbw m5, m6 paddsw m0, m4 paddsw m3, m5 %endif packuswb m0, m3 add srcq, sstrideq add src1q, sstrideq %ifidn %1, v8_avg pavgb m0, [dstq] %endif mova [dstq], m0 add dstq, dst_stride dec heightd jnz .loop REP_RET %else ; AOM_ARCH_X86_64 dec heightd movu m1, [srcq ] ;A movu m3, [srcq + sstrideq ] ;B lea srcq, [srcq + sstrideq * 2] punpcklbw m0, m1, m3 ;A B punpckhbw m1, m3 ;A B movu m5, [srcq] ;C punpcklbw m2, m3, m5 ;A B next iter punpckhbw m3, m5 ;A B next iter mova tmp0, m2 ;store to stack mova tmp1, m3 ;store to stack movu m7, [srcq + sstrideq] ;D lea srcq, [srcq + sstrideq * 2] punpcklbw m4, m5, m7 ;C D punpckhbw m5, m7 ;C D movu m9, [srcq] ;E punpcklbw m6, m7, m9 ;C D next iter punpckhbw m7, m9 ;C D next iter movu m11, [srcq + sstrideq] ;F lea srcq, [srcq + sstrideq * 2] punpcklbw 
m8, m9, m11 ;E F punpckhbw m9, m11 ;E F movu m2, [srcq] ;G punpcklbw m10, m11, m2 ;E F next iter punpckhbw m11, m2 ;E F next iter .loop: ;Do two rows at once pmaddubsw m13, m0, k0k1 mova m0, m4 pmaddubsw m14, m8, k4k5 pmaddubsw m15, m4, k2k3 mova m4, m8 paddsw m13, m14 movu m3, [srcq + sstrideq] ;H lea srcq, [srcq + sstrideq * 2] punpcklbw m14, m2, m3 ;G H mova m8, m14 pmaddubsw m14, k6k7 paddsw m15, m14 paddsw m13, m15 paddsw m13, krd psraw m13, 7 pmaddubsw m14, m1, k0k1 pmaddubsw m1, m9, k4k5 pmaddubsw m15, m5, k2k3 paddsw m14, m1 mova m1, m5 mova m5, m9 punpckhbw m2, m3 ;G H mova m9, m2 pmaddubsw m2, k6k7 paddsw m15, m2 paddsw m14, m15 paddsw m14, krd psraw m14, 7 packuswb m13, m14 %ifidn %1, v8_avg pavgb m13, [dstq] %endif mova [dstq], m13 ; next iter pmaddubsw m15, tmp0, k0k1 pmaddubsw m14, m10, k4k5 pmaddubsw m13, m6, k2k3 paddsw m15, m14 mova tmp0, m6 mova m6, m10 movu m2, [srcq] ;G next iter punpcklbw m14, m3, m2 ;G H next iter mova m10, m14 pmaddubsw m14, k6k7 paddsw m13, m14 paddsw m15, m13 paddsw m15, krd psraw m15, 7 pmaddubsw m14, tmp1, k0k1 mova tmp1, m7 pmaddubsw m13, m7, k2k3 mova m7, m11 pmaddubsw m11, k4k5 paddsw m14, m11 punpckhbw m3, m2 ;G H next iter mova m11, m3 pmaddubsw m3, k6k7 paddsw m13, m3 paddsw m14, m13 paddsw m14, krd psraw m14, 7 packuswb m15, m14 %ifidn %1, v8_avg pavgb m15, [dstq + dstrideq] %endif mova [dstq + dstrideq], m15 lea dstq, [dstq + dstrideq * 2] sub heightd, 2 jg .loop ; Do last row if output_height is odd jne .done movu m3, [srcq + sstrideq] ;H punpcklbw m6, m2, m3 ;G H punpckhbw m2, m3 ;G H pmaddubsw m0, k0k1 pmaddubsw m1, k0k1 pmaddubsw m4, k2k3 pmaddubsw m5, k2k3 pmaddubsw m8, k4k5 pmaddubsw m9, k4k5 pmaddubsw m6, k6k7 pmaddubsw m2, k6k7 paddsw m0, m8 paddsw m1, m9 paddsw m4, m6 paddsw m5, m2 paddsw m0, m4 paddsw m1, m5 paddsw m0, krd paddsw m1, krd psraw m0, 7 psraw m1, 7 packuswb m0, m1 %ifidn %1, v8_avg pavgb m0, [dstq] %endif mova [dstq], m0 .done: REP_RET %endif ; AOM_ARCH_X86_64 %endm INIT_XMM ssse3 SUBPIX_VFILTER16 v8 SUBPIX_VFILTER v8, 8 SUBPIX_VFILTER v8, 4 aom-3.12.1/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm000066400000000000000000000142241477627663500226310ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
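; This file provides the 2-tap (bilinear) variants of the sub-pixel filters:
; only the two centre taps (k3/k4) of the 8-tap filter array are applied, and
; rounding is done with pmulhrsw against 0x0100 (= 256) in every 16-bit lane.
; pmulhrsw computes (x * 256 * 2 + 0x8000) >> 16, which works out to the usual
; (x + 64) >> 7 rounding in a single instruction.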
; ; %include "aom_ports/x86_abi_support.asm" %macro GET_PARAM_4 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov ecx, 0x01000100 movdqa xmm3, [rdx] ;load filters psrldq xmm3, 6 packsswb xmm3, xmm3 pshuflw xmm3, xmm3, 0b ;k3_k4 movd xmm2, ecx ;rounding_shift pshufd xmm2, xmm2, 0 movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro APPLY_FILTER_4 1 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm3 pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) packuswb xmm0, xmm0 ;pack to byte %if %1 movd xmm1, [rdi] pavgb xmm0, xmm1 %endif movd [rdi], xmm0 lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm %macro GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr mov ecx, 0x01000100 movdqa xmm7, [rdx] ;load filters psrldq xmm7, 6 packsswb xmm7, xmm7 pshuflw xmm7, xmm7, 0b ;k3_k4 punpcklwd xmm7, xmm7 movd xmm6, ecx ;rounding_shift pshufd xmm6, xmm6, 0 movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height %endm %macro APPLY_FILTER_8 1 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm7 pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) packuswb xmm0, xmm0 ;pack back to byte %if %1 movq xmm1, [rdi] pavgb xmm0, xmm1 %endif movq [rdi], xmm0 ;store the result lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm %macro APPLY_FILTER_16 1 punpcklbw xmm0, xmm1 punpckhbw xmm2, xmm1 pmaddubsw xmm0, xmm7 pmaddubsw xmm2, xmm7 pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) pmulhrsw xmm2, xmm6 packuswb xmm0, xmm2 ;pack back to byte %if %1 movdqu xmm1, [rdi] pavgb xmm0, xmm1 %endif movdqu [rdi], xmm0 ;store the result lea rsi, [rsi + rax] lea rdi, [rdi + rdx] dec rcx %endm SECTION .text globalsym(aom_filter_block1d4_v2_ssse3) sym(aom_filter_block1d4_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movd xmm0, [rsi] ;load src movd xmm1, [rsi + rax] APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(aom_filter_block1d8_v2_ssse3) sym(aom_filter_block1d8_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movq xmm0, [rsi] ;0 movq xmm1, [rsi + rax] ;1 APPLY_FILTER_8 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_filter_block1d16_v2_ssse3) sym(aom_filter_block1d16_v2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;0 movdqu xmm1, [rsi + rax] ;1 movdqa xmm2, xmm0 APPLY_FILTER_16 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_filter_block1d4_h2_ssse3) sym(aom_filter_block1d4_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 push rsi push rdi ; end prolog GET_PARAM_4 .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_4 0 jnz .loop ; begin epilog pop rdi pop rsi UNSHADOW_ARGS pop rbp ret globalsym(aom_filter_block1d8_h2_ssse3) sym(aom_filter_block1d8_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqa xmm1, xmm0 psrldq xmm1, 1 APPLY_FILTER_8 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret globalsym(aom_filter_block1d16_h2_ssse3) sym(aom_filter_block1d16_h2_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 
SAVE_XMM 7 push rsi push rdi ; end prolog GET_PARAM .loop: movdqu xmm0, [rsi] ;load src movdqu xmm1, [rsi + 1] movdqa xmm2, xmm0 APPLY_FILTER_16 0 jnz .loop ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/aom_dsp/x86/avg_intrin_avx2.c000066400000000000000000001104221477627663500177140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/bitdepth_conversion_avx2.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" static inline void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, __m256i *out_lo, __m256i *out_hi) { const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); *out_lo = _mm256_unpacklo_epi16(in, sign_bits); *out_hi = _mm256_unpackhi_epi16(in, sign_bits); } static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; __m256i a2 = in[2]; __m256i a3 = in[3]; __m256i a4 = in[4]; __m256i a5 = in[5]; __m256i a6 = in[6]; __m256i a7 = in[7]; __m256i b0 = _mm256_add_epi16(a0, a1); __m256i b1 = _mm256_sub_epi16(a0, a1); __m256i b2 = _mm256_add_epi16(a2, a3); __m256i b3 = _mm256_sub_epi16(a2, a3); __m256i b4 = _mm256_add_epi16(a4, a5); __m256i b5 = _mm256_sub_epi16(a4, a5); __m256i b6 = _mm256_add_epi16(a6, a7); __m256i b7 = _mm256_sub_epi16(a6, a7); a0 = _mm256_add_epi16(b0, b2); a1 = _mm256_add_epi16(b1, b3); a2 = _mm256_sub_epi16(b0, b2); a3 = _mm256_sub_epi16(b1, b3); a4 = _mm256_add_epi16(b4, b6); a5 = _mm256_add_epi16(b5, b7); a6 = _mm256_sub_epi16(b4, b6); a7 = _mm256_sub_epi16(b5, b7); if (iter == 0) { b0 = _mm256_add_epi16(a0, a4); b7 = _mm256_add_epi16(a1, a5); b3 = _mm256_add_epi16(a2, a6); b4 = _mm256_add_epi16(a3, a7); b2 = _mm256_sub_epi16(a0, a4); b6 = _mm256_sub_epi16(a1, a5); b1 = _mm256_sub_epi16(a2, a6); b5 = _mm256_sub_epi16(a3, a7); a0 = _mm256_unpacklo_epi16(b0, b1); a1 = _mm256_unpacklo_epi16(b2, b3); a2 = _mm256_unpackhi_epi16(b0, b1); a3 = _mm256_unpackhi_epi16(b2, b3); a4 = _mm256_unpacklo_epi16(b4, b5); a5 = _mm256_unpacklo_epi16(b6, b7); a6 = _mm256_unpackhi_epi16(b4, b5); a7 = _mm256_unpackhi_epi16(b6, b7); b0 = _mm256_unpacklo_epi32(a0, a1); b1 = _mm256_unpacklo_epi32(a4, a5); b2 = _mm256_unpackhi_epi32(a0, a1); b3 = _mm256_unpackhi_epi32(a4, a5); b4 = _mm256_unpacklo_epi32(a2, a3); b5 = _mm256_unpacklo_epi32(a6, a7); b6 = _mm256_unpackhi_epi32(a2, a3); b7 = _mm256_unpackhi_epi32(a6, a7); in[0] = _mm256_unpacklo_epi64(b0, b1); in[1] = _mm256_unpackhi_epi64(b0, b1); in[2] = _mm256_unpacklo_epi64(b2, b3); in[3] = _mm256_unpackhi_epi64(b2, b3); in[4] = _mm256_unpacklo_epi64(b4, b5); in[5] = _mm256_unpackhi_epi64(b4, b5); in[6] = _mm256_unpacklo_epi64(b6, b7); in[7] = _mm256_unpackhi_epi64(b6, b7); } else { in[0] = _mm256_add_epi16(a0, a4); in[7] = _mm256_add_epi16(a1, a5); in[3] = _mm256_add_epi16(a2, a6); in[4] = _mm256_add_epi16(a3, a7); in[2] = _mm256_sub_epi16(a0, a4); in[6] = _mm256_sub_epi16(a1, a5); in[1] = _mm256_sub_epi16(a2, a6); in[5] = _mm256_sub_epi16(a3, a7); } } void 
aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m256i src[8]; src[0] = _mm256_loadu_si256((const __m256i *)src_diff); src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[0], src[1], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[2], src[3], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[4], src[5], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[6], src[7], 0x20)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[0], src[1], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[2], src[3], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[4], src[5], 0x31)); coeff += 16; _mm256_storeu_si256((__m256i *)coeff, _mm256_permute2x128_si256(src[6], src[7], 0x31)); } static inline void hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); int16_t *t_coeff = temp_coeff; int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 2; ++idx) { const int16_t *src_ptr = src_diff + idx * 8 * src_stride; aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); } for (idx = 0; idx < 64; idx += 16) { const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); __m256i b0 = _mm256_add_epi16(coeff0, coeff1); __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); __m256i b2 = _mm256_add_epi16(coeff2, coeff3); __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); b0 = _mm256_srai_epi16(b0, 1); b1 = _mm256_srai_epi16(b1, 1); b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); if (is_final) { store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); coeff += 16; } else { _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); coeff16 += 16; } t_coeff += 16; } } void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); } void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t *t_coeff = coeff; for (int idx = 0; idx < 2; ++idx) { const int16_t *src_ptr = src_diff + idx * 8 * src_stride; 
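    // Each pass of this loop transforms one 8x16 stripe: the "dual" 8x8
    // kernel handles two horizontally adjacent 8x8 blocks per call, and the
    // second loop below combines the four 8x8 results column-wise with a
    // >> 1 normalization.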
aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); } for (int idx = 0; idx < 64; idx += 16) { const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); __m256i b0 = _mm256_add_epi16(coeff0, coeff1); __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); __m256i b2 = _mm256_add_epi16(coeff2, coeff3); __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); b0 = _mm256_srai_epi16(b0, 1); b1 = _mm256_srai_epi16(b1, 1); b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); coeff += 16; t_coeff += 16; } } void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { // For high bitdepths, it is unnecessary to store_tran_low // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the // next stage. Output to an intermediate buffer first, then store_tran_low() // in the final stage. DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); int16_t *t_coeff = temp_coeff; int idx; __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, b3_lo; __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, b3_hi; __m256i b0, b1, b2, b3; const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; hadamard_16x16_avx2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 256), 0); } for (idx = 0; idx < 256; idx += 16) { const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); // Sign extend 16 bit to 32 bit. 
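    // With 9-bit residuals the 16x16 stage above can produce coefficients of
    // magnitude up to roughly 255 * 256 / 2 = 32640, so adding two of them
    // directly could exceed the int16 range (32767). Widening to 32 bits for
    // the add and the >> 2, then packing back with saturation, keeps the
    // final 16-bit combine below safe (its inputs are back under ~16320).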
sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); b0_lo = _mm256_srai_epi32(b0_lo, 2); b1_lo = _mm256_srai_epi32(b1_lo, 2); b2_lo = _mm256_srai_epi32(b2_lo, 2); b3_lo = _mm256_srai_epi32(b3_lo, 2); b0_hi = _mm256_srai_epi32(b0_hi, 2); b1_hi = _mm256_srai_epi32(b1_hi, 2); b2_hi = _mm256_srai_epi32(b2_hi, 2); b3_hi = _mm256_srai_epi32(b3_hi, 2); b0 = _mm256_packs_epi32(b0_lo, b0_hi); b1 = _mm256_packs_epi32(b1_lo, b1_hi); b2 = _mm256_packs_epi32(b2_lo, b2_hi); b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); coeff += 16; t_coeff += 16; } } #if CONFIG_AV1_HIGHBITDEPTH static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; __m256i a2 = in[2]; __m256i a3 = in[3]; __m256i a4 = in[4]; __m256i a5 = in[5]; __m256i a6 = in[6]; __m256i a7 = in[7]; __m256i b0 = _mm256_add_epi32(a0, a1); __m256i b1 = _mm256_sub_epi32(a0, a1); __m256i b2 = _mm256_add_epi32(a2, a3); __m256i b3 = _mm256_sub_epi32(a2, a3); __m256i b4 = _mm256_add_epi32(a4, a5); __m256i b5 = _mm256_sub_epi32(a4, a5); __m256i b6 = _mm256_add_epi32(a6, a7); __m256i b7 = _mm256_sub_epi32(a6, a7); a0 = _mm256_add_epi32(b0, b2); a1 = _mm256_add_epi32(b1, b3); a2 = _mm256_sub_epi32(b0, b2); a3 = _mm256_sub_epi32(b1, b3); a4 = _mm256_add_epi32(b4, b6); a5 = _mm256_add_epi32(b5, b7); a6 = _mm256_sub_epi32(b4, b6); a7 = _mm256_sub_epi32(b5, b7); if (iter == 0) { b0 = _mm256_add_epi32(a0, a4); b7 = _mm256_add_epi32(a1, a5); b3 = _mm256_add_epi32(a2, a6); b4 = _mm256_add_epi32(a3, a7); b2 = _mm256_sub_epi32(a0, a4); b6 = _mm256_sub_epi32(a1, a5); b1 = _mm256_sub_epi32(a2, a6); b5 = _mm256_sub_epi32(a3, a7); a0 = _mm256_unpacklo_epi32(b0, b1); a1 = _mm256_unpacklo_epi32(b2, b3); a2 = _mm256_unpackhi_epi32(b0, b1); a3 = _mm256_unpackhi_epi32(b2, b3); a4 = _mm256_unpacklo_epi32(b4, b5); a5 = _mm256_unpacklo_epi32(b6, b7); a6 = _mm256_unpackhi_epi32(b4, b5); a7 = _mm256_unpackhi_epi32(b6, b7); b0 = _mm256_unpacklo_epi64(a0, a1); b1 = _mm256_unpacklo_epi64(a4, a5); b2 = _mm256_unpackhi_epi64(a0, a1); b3 = _mm256_unpackhi_epi64(a4, a5); b4 = _mm256_unpacklo_epi64(a2, a3); b5 = _mm256_unpacklo_epi64(a6, a7); b6 = _mm256_unpackhi_epi64(a2, a3); b7 = _mm256_unpackhi_epi64(a6, a7); in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); } else { in[0] = _mm256_add_epi32(a0, a4); in[7] = _mm256_add_epi32(a1, a5); in[3] = _mm256_add_epi32(a2, a6); in[4] = _mm256_add_epi32(a3, a7); 
in[2] = _mm256_sub_epi32(a0, a4); in[6] = _mm256_sub_epi32(a1, a5); in[1] = _mm256_sub_epi32(a2, a6); in[5] = _mm256_sub_epi32(a3, a7); } } void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { __m128i src16[8]; __m256i src32[8]; src16[0] = _mm_loadu_si128((const __m128i *)src_diff); src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); src32[0] = _mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); src32[2] = _mm256_cvtepi16_epi32(src16[2]); src32[3] = _mm256_cvtepi16_epi32(src16[3]); src32[4] = _mm256_cvtepi16_epi32(src16[4]); src32[5] = _mm256_cvtepi16_epi32(src16[5]); src32[6] = _mm256_cvtepi16_epi32(src16[6]); src32[7] = _mm256_cvtepi16_epi32(src16[7]); highbd_hadamard_col8_avx2(src32, 0); highbd_hadamard_col8_avx2(src32, 1); _mm256_storeu_si256((__m256i *)coeff, src32[0]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[1]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[2]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[3]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[4]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[5]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[6]); coeff += 8; _mm256_storeu_si256((__m256i *)coeff, src32[7]); } void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; tran_low_t *t_coeff = coeff; for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); } for (idx = 0; idx < 64; idx += 8) { __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); __m256i b0 = _mm256_add_epi32(coeff0, coeff1); __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); __m256i b2 = _mm256_add_epi32(coeff2, coeff3); __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); b0 = _mm256_srai_epi32(b0, 1); b1 = _mm256_srai_epi32(b1, 1); b2 = _mm256_srai_epi32(b2, 1); b3 = _mm256_srai_epi32(b3, 1); coeff0 = _mm256_add_epi32(b0, b2); coeff1 = _mm256_add_epi32(b1, b3); coeff2 = _mm256_sub_epi32(b0, b2); coeff3 = _mm256_sub_epi32(b1, b3); _mm256_storeu_si256((__m256i *)coeff, coeff0); _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); coeff += 8; t_coeff += 8; } } void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; tran_low_t *t_coeff = coeff; for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); } for (idx = 0; idx < 256; idx += 8) { __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); __m256i coeff1 = _mm256_loadu_si256((const __m256i 
*)(t_coeff + 256)); __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); __m256i b0 = _mm256_add_epi32(coeff0, coeff1); __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); __m256i b2 = _mm256_add_epi32(coeff2, coeff3); __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); b0 = _mm256_srai_epi32(b0, 2); b1 = _mm256_srai_epi32(b1, 2); b2 = _mm256_srai_epi32(b2, 2); b3 = _mm256_srai_epi32(b3, 2); coeff0 = _mm256_add_epi32(b0, b2); coeff1 = _mm256_add_epi32(b1, b3); coeff2 = _mm256_sub_epi32(b0, b2); coeff3 = _mm256_sub_epi32(b1, b3); _mm256_storeu_si256((__m256i *)coeff, coeff0); _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); coeff += 8; t_coeff += 8; } } #endif // CONFIG_AV1_HIGHBITDEPTH int aom_satd_avx2(const tran_low_t *coeff, int length) { __m256i accum = _mm256_setzero_si256(); int i; for (i = 0; i < length; i += 8, coeff += 8) { const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); const __m256i abs = _mm256_abs_epi32(src_line); accum = _mm256_add_epi32(accum, abs); } { // 32 bit horizontal add const __m256i a = _mm256_srli_si256(accum, 8); const __m256i b = _mm256_add_epi32(accum, a); const __m256i c = _mm256_srli_epi64(b, 32); const __m256i d = _mm256_add_epi32(b, c); const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), _mm256_extractf128_si256(d, 1)); return _mm_cvtsi128_si32(accum_128); } } int aom_satd_lp_avx2(const int16_t *coeff, int length) { const __m256i one = _mm256_set1_epi16(1); __m256i accum = _mm256_setzero_si256(); for (int i = 0; i < length; i += 16) { const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); const __m256i abs = _mm256_abs_epi16(src_line); const __m256i sum = _mm256_madd_epi16(abs, one); accum = _mm256_add_epi32(accum, sum); coeff += 16; } { // 32 bit horizontal add const __m256i a = _mm256_srli_si256(accum, 8); const __m256i b = _mm256_add_epi32(accum, a); const __m256i c = _mm256_srli_epi64(b, 32); const __m256i d = _mm256_add_epi32(b, c); const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), _mm256_extractf128_si256(d, 1)); return _mm_cvtsi128_si32(accum_128); } } void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg) { const uint8_t *s_y0 = s + y16_idx * p + x16_idx; const uint8_t *s_y1 = s_y0 + 8 * p; __m256i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm256_setzero_si256(); s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1, s_y0), u0); s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + p, s_y0 + p), u0); s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 2 * p, s_y0 + 2 * p), u0); s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 3 * p, s_y0 + 3 * p), u0); sum0 = _mm256_add_epi16(s0, s1); sum1 = _mm256_add_epi16(s2, s3); s0 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 4 * p, s_y0 + 4 * p), u0); s1 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 5 * p, s_y0 + 5 * p), u0); s2 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 6 * p, s_y0 + 6 * p), u0); s3 = _mm256_sad_epu8(yy_loadu2_128(s_y1 + 7 * p, s_y0 + 7 * p), u0); sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1)); sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3)); sum0 = _mm256_add_epi16(sum0, sum1); // (avg + 32) >> 6 __m256i rounding = _mm256_set1_epi32(32); sum0 = _mm256_add_epi32(sum0, rounding); sum0 = _mm256_srli_epi32(sum0, 6); __m128i lo = _mm256_castsi256_si128(sum0); __m128i hi = _mm256_extracti128_si256(sum0, 1); avg[0] = _mm_cvtsi128_si32(lo); avg[1] = 
_mm_extract_epi32(lo, 2); avg[2] = _mm_cvtsi128_si32(hi); avg[3] = _mm_extract_epi32(hi, 2); } void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { // SIMD implementation assumes width and height to be multiple of 16 and 2 // respectively. For any odd width or height, SIMD support needs to be added. assert(width % 16 == 0 && height % 2 == 0); if (width % 32 == 0) { const __m256i zero = _mm256_setzero_si256(); for (int wd = 0; wd < width; wd += 32) { const uint8_t *ref_tmp = ref + wd; int16_t *hbuf_tmp = hbuf + wd; __m256i s0 = zero; __m256i s1 = zero; int idx = 0; do { __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp); __m256i t0 = _mm256_unpacklo_epi8(src_line, zero); __m256i t1 = _mm256_unpackhi_epi8(src_line, zero); s0 = _mm256_add_epi16(s0, t0); s1 = _mm256_add_epi16(s1, t1); ref_tmp += ref_stride; src_line = _mm256_loadu_si256((const __m256i *)ref_tmp); t0 = _mm256_unpacklo_epi8(src_line, zero); t1 = _mm256_unpackhi_epi8(src_line, zero); s0 = _mm256_add_epi16(s0, t0); s1 = _mm256_add_epi16(s1, t1); ref_tmp += ref_stride; idx += 2; } while (idx < height); s0 = _mm256_srai_epi16(s0, norm_factor); s1 = _mm256_srai_epi16(s1, norm_factor); _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0)); _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1)); _mm_storeu_si128((__m128i *)(hbuf_tmp + 16), _mm256_extractf128_si256(s0, 1)); _mm_storeu_si128((__m128i *)(hbuf_tmp + 24), _mm256_extractf128_si256(s1, 1)); } } else if (width % 16 == 0) { aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor); } } static inline void load_from_src_buf(const uint8_t *ref1, __m256i *src, const int stride) { src[0] = _mm256_loadu_si256((const __m256i *)ref1); src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride)); src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride))); src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride))); } #define CALC_TOT_SAD_AND_STORE \ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \ \ const __m128i results0 = _mm_add_epi16( \ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \ const __m128i results1 = \ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor)); static inline void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int height, int norm_factor) { const __m256i zero = _mm256_setzero_si256(); int ht = 0; // Post sad operation, the data is present in lower 16-bit of each 64-bit lane // and higher 16-bits are Zero. Here, we are processing 8 rows at a time to // utilize the higher 16-bits efficiently. 
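// Roughly: each _mm256_sad_epu8 in the loop yields two partial sums per 16-wide row (bytes 0..7 and 8..15); the unpack/add pairs then place the full sums of eight consecutive rows side by side, so one shifted 128-bit store writes eight vbuf entries (one per source row) at once.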
do { __m256i src_00 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref))); src_00 = _mm256_inserti128_si256( src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1); __m256i src_01 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1))); src_01 = _mm256_inserti128_si256( src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1); __m256i src_10 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2))); src_10 = _mm256_inserti128_si256( src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1); __m256i src_11 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3))); src_11 = _mm256_inserti128_si256( src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1); // s00 x x x s01 x x x | s40 x x x s41 x x x const __m256i s0 = _mm256_sad_epu8(src_00, zero); // s10 x x x s11 x x x | s50 x x x s51 x x x const __m256i s1 = _mm256_sad_epu8(src_01, zero); // s20 x x x s21 x x x | s60 x x x s61 x x x const __m256i s2 = _mm256_sad_epu8(src_10, zero); // s30 x x x s31 x x x | s70 x x x s71 x x x const __m256i s3 = _mm256_sad_epu8(src_11, zero); // s00 s10 x x x x x x | s40 s50 x x x x x x const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1); // s01 s11 x x x x x x | s41 s51 x x x x x x const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1); // s20 s30 x x x x x x | s60 s70 x x x x x x const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3); // s21 s31 x x x x x x | s61 s71 x x x x x x const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3); // s0 s1 x x x x x x | s4 s5 x x x x x x const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi); // s2 s3 x x x x x x | s6 s7 x x x x x x const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi); // s1 s1 s2 s3 s4 s5 s6 s7 const __m128i results = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08)); _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor)); vbuf += 8; ref += (ref_stride << 3); ht += 8; } while (ht < height); } void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { assert(width % 16 == 0); if (width == 128) { const __m256i zero = _mm256_setzero_si256(); for (int ht = 0; ht < height; ht += 4) { __m256i src[16]; // Load source data. 
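// (four rows of 128 pixels: four load_from_src_buf() calls, one per 32-pixel column chunk, filling all 16 registers)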
load_from_src_buf(ref, &src[0], ref_stride); load_from_src_buf(ref + 32, &src[4], ref_stride); load_from_src_buf(ref + 64, &src[8], ref_stride); load_from_src_buf(ref + 96, &src[12], ref_stride); // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero), _mm256_sad_epu8(src[4], zero)); const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero), _mm256_sad_epu8(src[12], zero)); const __m256i r0 = _mm256_add_epi16(s0, s1); // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero), _mm256_sad_epu8(src[5], zero)); const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero), _mm256_sad_epu8(src[13], zero)); const __m256i r1 = _mm256_add_epi16(s2, s3); // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero), _mm256_sad_epu8(src[6], zero)); const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero), _mm256_sad_epu8(src[14], zero)); const __m256i r2 = _mm256_add_epi16(s4, s5); // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero), _mm256_sad_epu8(src[7], zero)); const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero), _mm256_sad_epu8(src[15], zero)); const __m256i r3 = _mm256_add_epi16(s6, s7); CALC_TOT_SAD_AND_STORE vbuf += 4; ref += ref_stride << 2; } } else if (width == 64) { const __m256i zero = _mm256_setzero_si256(); for (int ht = 0; ht < height; ht += 4) { __m256i src[8]; // Load source data. load_from_src_buf(ref, &src[0], ref_stride); load_from_src_buf(ref + 32, &src[4], ref_stride); // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x const __m256i s0 = _mm256_sad_epu8(src[0], zero); const __m256i s1 = _mm256_sad_epu8(src[4], zero); const __m256i r0 = _mm256_add_epi16(s0, s1); // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x const __m256i s2 = _mm256_sad_epu8(src[1], zero); const __m256i s3 = _mm256_sad_epu8(src[5], zero); const __m256i r1 = _mm256_add_epi16(s2, s3); // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x const __m256i s4 = _mm256_sad_epu8(src[2], zero); const __m256i s5 = _mm256_sad_epu8(src[6], zero); const __m256i r2 = _mm256_add_epi16(s4, s5); // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x const __m256i s6 = _mm256_sad_epu8(src[3], zero); const __m256i s7 = _mm256_sad_epu8(src[7], zero); const __m256i r3 = _mm256_add_epi16(s6, s7); CALC_TOT_SAD_AND_STORE vbuf += 4; ref += ref_stride << 2; } } else if (width == 32) { assert(height % 2 == 0); const __m256i zero = _mm256_setzero_si256(); for (int ht = 0; ht < height; ht += 4) { __m256i src[4]; // Load source data. 
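// (four rows of 32 pixels: a single load_from_src_buf() call fills src[0..3])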
load_from_src_buf(ref, &src[0], ref_stride); // s00 x x x s01 x x x s02 x x x s03 x x x const __m256i r0 = _mm256_sad_epu8(src[0], zero); // s10 x x x s11 x x x s12 x x x s13 x x x const __m256i r1 = _mm256_sad_epu8(src[1], zero); // s20 x x x s21 x x x s22 x x x s23 x x x const __m256i r2 = _mm256_sad_epu8(src[2], zero); // s30 x x x s31 x x x s32 x x x s33 x x x const __m256i r3 = _mm256_sad_epu8(src[3], zero); CALC_TOT_SAD_AND_STORE vbuf += 4; ref += ref_stride << 2; } } else if (width == 16) { aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor); } } static inline void calc_vector_mean_sse_64wd(const int16_t *ref, const int16_t *src, __m256i *mean, __m256i *sse) { const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src); const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16)); const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(src + 32)); const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(src + 48)); const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref); const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); const __m256i ref_line2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); const __m256i ref_line3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0); const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1); const __m256i diff2 = _mm256_sub_epi16(ref_line2, src_line2); const __m256i diff3 = _mm256_sub_epi16(ref_line3, src_line3); const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0); const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1); const __m256i diff_sqr2 = _mm256_madd_epi16(diff2, diff2); const __m256i diff_sqr3 = _mm256_madd_epi16(diff3, diff3); *mean = _mm256_add_epi16(*mean, _mm256_add_epi16(diff0, diff1)); *mean = _mm256_add_epi16(*mean, diff2); *mean = _mm256_add_epi16(*mean, diff3); *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(diff_sqr0, diff_sqr1)); *sse = _mm256_add_epi32(*sse, diff_sqr2); *sse = _mm256_add_epi32(*sse, diff_sqr3); } #define CALC_VAR_FROM_MEAN_SSE(mean, sse) \ { \ mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \ mean = _mm256_hadd_epi32(mean, sse); \ mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \ const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \ _mm256_extractf128_si256(mean, 1)); \ /*(mean * mean): dynamic range 31 bits.*/ \ const int mean_int = _mm_extract_epi32(result, 0); \ const int sse_int = _mm_extract_epi32(result, 2); \ const unsigned int mean_abs = abs(mean_int); \ var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \ } // ref: [0 - 510] // src: [0 - 510] // bwl: {2, 3, 4, 5} int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) { const int width = 4 << bwl; assert(width % 16 == 0 && width <= 128); int var = 0; // Instead of having a loop over width 16, considered loop unrolling to avoid // some addition operations. 
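// In scalar terms each branch below computes, for width = 4 << bwl: // mean = sum(ref[i] - src[i]), sse = sum((ref[i] - src[i])^2), // var = sse - ((|mean| * |mean|) >> (bwl + 2)), i.e. sse - mean^2 / width, the unnormalised variance of the difference vector.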
if (width == 128) { __m256i mean = _mm256_setzero_si256(); __m256i sse = _mm256_setzero_si256(); calc_vector_mean_sse_64wd(src, ref, &mean, &sse); calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse); CALC_VAR_FROM_MEAN_SSE(mean, sse) } else if (width == 64) { __m256i mean = _mm256_setzero_si256(); __m256i sse = _mm256_setzero_si256(); calc_vector_mean_sse_64wd(src, ref, &mean, &sse); CALC_VAR_FROM_MEAN_SSE(mean, sse) } else if (width == 32) { const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src); const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref); const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16)); const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0); const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1); const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0); const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1); const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1); __m256i mean = _mm256_add_epi16(diff0, diff1); CALC_VAR_FROM_MEAN_SSE(mean, sse) } else if (width == 16) { const __m256i src_line = _mm256_loadu_si256((const __m256i *)src); const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref); __m256i mean = _mm256_sub_epi16(ref_line, src_line); const __m256i sse = _mm256_madd_epi16(mean, mean); CALC_VAR_FROM_MEAN_SSE(mean, sse) } return var; } aom-3.12.1/aom_dsp/x86/avg_intrin_sse2.c000066400000000000000000000627071477627663500177240ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <emmintrin.h> #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/bitdepth_conversion_sse2.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" static inline void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, __m128i *out_lo, __m128i *out_hi) { const __m128i sign_bits = _mm_cmplt_epi16(in, zero); *out_lo = _mm_unpacklo_epi16(in, sign_bits); *out_hi = _mm_unpackhi_epi16(in, sign_bits); } static inline __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi32(a, sign); } void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; u0 = _mm_setzero_si128(); // Row 0 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff0 = _mm_max_epi16(diff, negdiff); // Row 1 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(absdiff0, absdiff); minabsdiff = _mm_min_epi16(absdiff0, absdiff); // Row 2 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); minabsdiff = _mm_min_epi16(minabsdiff, absdiff); // Row 3 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); minabsdiff = _mm_min_epi16(minabsdiff, absdiff); // Row 4 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); minabsdiff = _mm_min_epi16(minabsdiff, absdiff); // Row 5 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); minabsdiff = _mm_min_epi16(minabsdiff, absdiff); // Row 6 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); minabsdiff = _mm_min_epi16(minabsdiff, absdiff); // Row 7 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); diff = _mm_subs_epi16(s0, d0); negdiff = _mm_subs_epi16(u0, diff); absdiff = _mm_max_epi16(diff, negdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
minabsdiff = _mm_min_epi16(minabsdiff, absdiff); maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); *max = _mm_extract_epi16(maxabsdiff, 0); minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); *min = _mm_extract_epi16(minabsdiff, 0); } unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { __m128i sum0, sum1, s0, s1, s2, s3, u0; unsigned int avg = 0; u0 = _mm_setzero_si128(); s0 = loadh_epi64((const __m128i *)(s + p), _mm_loadl_epi64((const __m128i *)(s))); s1 = loadh_epi64((const __m128i *)(s + 3 * p), _mm_loadl_epi64((const __m128i *)(s + 2 * p))); s2 = loadh_epi64((const __m128i *)(s + 5 * p), _mm_loadl_epi64((const __m128i *)(s + 4 * p))); s3 = loadh_epi64((const __m128i *)(s + 7 * p), _mm_loadl_epi64((const __m128i *)(s + 6 * p))); s0 = _mm_sad_epu8(s0, u0); s1 = _mm_sad_epu8(s1, u0); s2 = _mm_sad_epu8(s2, u0); s3 = _mm_sad_epu8(s3, u0); sum0 = _mm_add_epi16(s0, s1); sum1 = _mm_add_epi16(s2, s3); sum0 = _mm_add_epi16(sum0, sum1); sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8)); avg = _mm_cvtsi128_si32(sum0); return (avg + 32) >> 6; } static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { __m128i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm_setzero_si128(); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0); s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0); s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0); sum0 = _mm_add_epi16(s0, s1); sum1 = _mm_add_epi16(s2, s3); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0); s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0); s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0); s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0); sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1)); sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3)); sum0 = _mm_add_epi16(sum0, sum1); // (avg + 32) >> 6 __m128i rounding = _mm_set1_epi32(32); sum0 = _mm_add_epi32(sum0, rounding); sum0 = _mm_srli_epi32(sum0, 6); avg[0] = _mm_cvtsi128_si32(sum0); avg[1] = _mm_extract_epi16(sum0, 4); } void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg) { const uint8_t *s_ptr = s + y16_idx * p + x16_idx; for (int k = 0; k < 2; k++) { calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2); s_ptr += 8 * p; } } unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; u0 = _mm_setzero_si128(); s0 = _mm_unpacklo_epi32(xx_loadl_32(s), xx_loadl_32(s + p)); s1 = _mm_unpacklo_epi32(xx_loadl_32(s + p * 2), xx_loadl_32(s + p * 3)); s0 = _mm_sad_epu8(s0, u0); s1 = _mm_sad_epu8(s1, u0); s0 = _mm_add_epi16(s0, s1); avg = _mm_cvtsi128_si32(s0); return (avg + 8) >> 4; } static inline void hadamard_col4_sse2(__m128i *in, int iter) { const __m128i a0 = in[0]; const __m128i a1 = in[1]; const __m128i a2 = in[2]; const __m128i a3 = in[3]; const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1); const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1); const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1); const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1); in[0] = _mm_add_epi16(b0, b2); in[1] = 
_mm_add_epi16(b1, b3); in[2] = _mm_sub_epi16(b0, b2); in[3] = _mm_sub_epi16(b1, b3); if (iter == 0) { const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]); const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]); const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc); const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc); in[0] = dcba_lo; in[1] = _mm_srli_si128(dcba_lo, 8); in[2] = dcba_hi; in[3] = _mm_srli_si128(dcba_hi, 8); } } void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { __m128i src[4]; src[0] = _mm_loadl_epi64((const __m128i *)src_diff); src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride)); src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride)); src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride)); hadamard_col4_sse2(src, 0); hadamard_col4_sse2(src, 1); store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff); coeff += 8; store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff); } static inline void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; __m128i a2 = in[2]; __m128i a3 = in[3]; __m128i a4 = in[4]; __m128i a5 = in[5]; __m128i a6 = in[6]; __m128i a7 = in[7]; __m128i b0 = _mm_add_epi16(a0, a1); __m128i b1 = _mm_sub_epi16(a0, a1); __m128i b2 = _mm_add_epi16(a2, a3); __m128i b3 = _mm_sub_epi16(a2, a3); __m128i b4 = _mm_add_epi16(a4, a5); __m128i b5 = _mm_sub_epi16(a4, a5); __m128i b6 = _mm_add_epi16(a6, a7); __m128i b7 = _mm_sub_epi16(a6, a7); a0 = _mm_add_epi16(b0, b2); a1 = _mm_add_epi16(b1, b3); a2 = _mm_sub_epi16(b0, b2); a3 = _mm_sub_epi16(b1, b3); a4 = _mm_add_epi16(b4, b6); a5 = _mm_add_epi16(b5, b7); a6 = _mm_sub_epi16(b4, b6); a7 = _mm_sub_epi16(b5, b7); if (iter == 0) { b0 = _mm_add_epi16(a0, a4); b7 = _mm_add_epi16(a1, a5); b3 = _mm_add_epi16(a2, a6); b4 = _mm_add_epi16(a3, a7); b2 = _mm_sub_epi16(a0, a4); b6 = _mm_sub_epi16(a1, a5); b1 = _mm_sub_epi16(a2, a6); b5 = _mm_sub_epi16(a3, a7); a0 = _mm_unpacklo_epi16(b0, b1); a1 = _mm_unpacklo_epi16(b2, b3); a2 = _mm_unpackhi_epi16(b0, b1); a3 = _mm_unpackhi_epi16(b2, b3); a4 = _mm_unpacklo_epi16(b4, b5); a5 = _mm_unpacklo_epi16(b6, b7); a6 = _mm_unpackhi_epi16(b4, b5); a7 = _mm_unpackhi_epi16(b6, b7); b0 = _mm_unpacklo_epi32(a0, a1); b1 = _mm_unpacklo_epi32(a4, a5); b2 = _mm_unpackhi_epi32(a0, a1); b3 = _mm_unpackhi_epi32(a4, a5); b4 = _mm_unpacklo_epi32(a2, a3); b5 = _mm_unpacklo_epi32(a6, a7); b6 = _mm_unpackhi_epi32(a2, a3); b7 = _mm_unpackhi_epi32(a6, a7); in[0] = _mm_unpacklo_epi64(b0, b1); in[1] = _mm_unpackhi_epi64(b0, b1); in[2] = _mm_unpacklo_epi64(b2, b3); in[3] = _mm_unpackhi_epi64(b2, b3); in[4] = _mm_unpacklo_epi64(b4, b5); in[5] = _mm_unpackhi_epi64(b4, b5); in[6] = _mm_unpacklo_epi64(b6, b7); in[7] = _mm_unpackhi_epi64(b6, b7); } else { in[0] = _mm_add_epi16(a0, a4); in[7] = _mm_add_epi16(a1, a5); in[3] = _mm_add_epi16(a2, a6); in[4] = _mm_add_epi16(a3, a7); in[2] = _mm_sub_epi16(a0, a4); in[6] = _mm_sub_epi16(a1, a5); in[1] = _mm_sub_epi16(a2, a6); in[5] = _mm_sub_epi16(a3, a7); } } static inline void hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += 
src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); if (is_final) { store_tran_low(src[0], coeff); coeff += 8; store_tran_low(src[1], coeff); coeff += 8; store_tran_low(src[2], coeff); coeff += 8; store_tran_low(src[3], coeff); coeff += 8; store_tran_low(src[4], coeff); coeff += 8; store_tran_low(src[5], coeff); coeff += 8; store_tran_low(src[6], coeff); coeff += 8; store_tran_low(src[7], coeff); } else { int16_t *coeff16 = (int16_t *)coeff; _mm_store_si128((__m128i *)coeff16, src[0]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[1]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[2]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[3]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[4]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[5]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[6]); coeff16 += 8; _mm_store_si128((__m128i *)coeff16, src[7]); } } void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); } static inline void hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); _mm_store_si128((__m128i *)coeff, src[0]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[1]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[2]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[3]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[4]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[5]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[6]); coeff += 8; _mm_store_si128((__m128i *)coeff, src[7]); } void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { hadamard_lp_8x8_sse2(src_diff, src_stride, coeff); } void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { for (int i = 0; i < 2; i++) { hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64)); } } void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { for (int idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); } int16_t *t_coeff = coeff; for (int idx = 0; idx < 64; idx += 8) { __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); __m128i b2 = _mm_add_epi16(coeff2, coeff3); __m128i b3 = _mm_sub_epi16(coeff2, coeff3); b0 = _mm_srai_epi16(b0, 1); b1 = _mm_srai_epi16(b1, 1); b2 = 
_mm_srai_epi16(b2, 1); b3 = _mm_srai_epi16(b3, 1); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); _mm_store_si128((__m128i *)t_coeff, coeff0); _mm_store_si128((__m128i *)(t_coeff + 64), coeff1); _mm_store_si128((__m128i *)(t_coeff + 128), coeff2); _mm_store_si128((__m128i *)(t_coeff + 192), coeff3); t_coeff += 8; } } static inline void hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff, int is_final) { // For high bitdepths, it is unnecessary to store_tran_low // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the // next stage. Output to an intermediate buffer first, then store_tran_low() // in the final stage. DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); int16_t *t_coeff = temp_coeff; int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), 0); } for (idx = 0; idx < 64; idx += 8) { __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); __m128i b2 = _mm_add_epi16(coeff2, coeff3); __m128i b3 = _mm_sub_epi16(coeff2, coeff3); b0 = _mm_srai_epi16(b0, 1); b1 = _mm_srai_epi16(b1, 1); b2 = _mm_srai_epi16(b2, 1); b3 = _mm_srai_epi16(b3, 1); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); if (is_final) { store_tran_low_offset_4(coeff0, coeff); store_tran_low_offset_4(coeff1, coeff + 64); store_tran_low_offset_4(coeff2, coeff + 128); store_tran_low_offset_4(coeff3, coeff + 192); coeff += 4; } else { _mm_store_si128((__m128i *)coeff16, coeff0); _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); coeff16 += 8; } t_coeff += 8; // Increment the pointer additionally by 0 and 8 in alternate // iterations(instead of 8) to ensure the coherency with the implementation // of store_tran_low_offset_4() coeff += (((idx >> 3) & 1) << 3); } } void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); } void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { // For high bitdepths, it is unnecessary to store_tran_low // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the // next stage. Output to an intermediate buffer first, then store_tran_low() // in the final stage. 
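// Rough flow, mirroring the 16x16 path above: four 16x16 Hadamard sub-transforms are written to temp_coeff as int16_t; in the loop below the first add/sub stage and the >> 2 normalisation are done in 32 bits (sign-extend, add/sub, shift, pack) to avoid intermediate overflow, the second add/sub stage runs on the packed 16-bit values, and store_tran_low_offset_4() writes the results.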
DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); int16_t *t_coeff = temp_coeff; int idx; __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, b3_lo; __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, b3_hi; __m128i b0, b1, b2, b3; const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; hadamard_16x16_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 256), 0); } for (idx = 0; idx < 256; idx += 8) { __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); // Sign extend 16 bit to 32 bit. sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); b0_lo = _mm_srai_epi32(b0_lo, 2); b1_lo = _mm_srai_epi32(b1_lo, 2); b2_lo = _mm_srai_epi32(b2_lo, 2); b3_lo = _mm_srai_epi32(b3_lo, 2); b0_hi = _mm_srai_epi32(b0_hi, 2); b1_hi = _mm_srai_epi32(b1_hi, 2); b2_hi = _mm_srai_epi32(b2_hi, 2); b3_hi = _mm_srai_epi32(b3_hi, 2); b0 = _mm_packs_epi32(b0_lo, b0_hi); b1 = _mm_packs_epi32(b1_lo, b1_hi); b2 = _mm_packs_epi32(b2_lo, b2_hi); b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); store_tran_low_offset_4(coeff0, coeff); store_tran_low_offset_4(coeff1, coeff + 256); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); store_tran_low_offset_4(coeff2, coeff + 512); store_tran_low_offset_4(coeff3, coeff + 768); // Increment the pointer by 4 and 12 in alternate iterations(instead of 8) // to ensure the coherency with the implementation of // store_tran_low_offset_4() coeff += (4 + (((idx >> 3) & 1) << 3)); t_coeff += 8; } } int aom_satd_sse2(const tran_low_t *coeff, int length) { int i; const __m128i zero = _mm_setzero_si128(); __m128i accum = zero; for (i = 0; i < length; i += 4) { const __m128i src_line = _mm_load_si128((const __m128i *)coeff); const __m128i coeff_sign = _mm_srai_epi32(src_line, 31); const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign); accum = _mm_add_epi32(accum, abs_coeff); coeff += 4; } { // cascading summation of accum __m128i hi = _mm_srli_si128(accum, 8); accum = _mm_add_epi32(accum, hi); hi = _mm_srli_epi64(accum, 32); accum = _mm_add_epi32(accum, hi); } return _mm_cvtsi128_si32(accum); } int aom_satd_lp_sse2(const int16_t *coeff, int length) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i accum = zero; for (int i = 0; i < length; i += 16) { const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff); const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8)); const __m128i inv0 = _mm_sub_epi16(zero, src_line0); const __m128i inv1 = _mm_sub_epi16(zero, src_line1); const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // 
abs(src_line) const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line) const __m128i sum0 = _mm_madd_epi16(abs0, one); const __m128i sum1 = _mm_madd_epi16(abs1, one); accum = _mm_add_epi32(accum, sum0); accum = _mm_add_epi32(accum, sum1); coeff += 16; } { // cascading summation of accum __m128i hi = _mm_srli_si128(accum, 8); accum = _mm_add_epi32(accum, hi); hi = _mm_srli_epi64(accum, 32); accum = _mm_add_epi32(accum, hi); } return _mm_cvtsi128_si32(accum); } void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { // SIMD implementation assumes width and height to be multiple of 16 and 2 // respectively. For any odd width or height, SIMD support needs to be added. assert(width % 16 == 0 && height % 2 == 0); __m128i zero = _mm_setzero_si128(); for (int wd = 0; wd < width; wd += 16) { const uint8_t *ref_tmp = ref + wd; int16_t *hbuf_tmp = hbuf + wd; __m128i s0 = zero; __m128i s1 = zero; int idx = 0; do { __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp); __m128i t0 = _mm_unpacklo_epi8(src_line, zero); __m128i t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_add_epi16(s0, t0); s1 = _mm_add_epi16(s1, t1); ref_tmp += ref_stride; src_line = _mm_loadu_si128((const __m128i *)ref_tmp); t0 = _mm_unpacklo_epi8(src_line, zero); t1 = _mm_unpackhi_epi8(src_line, zero); s0 = _mm_add_epi16(s0, t0); s1 = _mm_add_epi16(s1, t1); ref_tmp += ref_stride; idx += 2; } while (idx < height); s0 = _mm_srai_epi16(s0, norm_factor); s1 = _mm_srai_epi16(s1, norm_factor); _mm_storeu_si128((__m128i *)(hbuf_tmp), s0); _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1); } } void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor) { // SIMD implementation assumes width to be multiple of 16. assert(width % 16 == 0); for (int ht = 0; ht < height; ht++) { const uint8_t *ref_tmp = ref + (ht * ref_stride); __m128i zero = _mm_setzero_si128(); __m128i s0 = zero; __m128i s1, src_line; for (int i = 0; i < width; i += 16) { src_line = _mm_loadu_si128((const __m128i *)ref_tmp); s1 = _mm_sad_epu8(src_line, zero); s0 = _mm_add_epi16(s0, s1); ref_tmp += 16; } s1 = _mm_srli_si128(s0, 8); s0 = _mm_add_epi16(s0, s1); vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor; } } aom-3.12.1/aom_dsp/x86/avg_intrin_sse4.c000066400000000000000000000042071477627663500177150ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <smmintrin.h> #include "config/aom_dsp_rtcd.h" // ref: [0 - 510] // src: [0 - 510] // bwl: {2, 3, 4, 5} int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) { const int width = 4 << bwl; assert(width % 16 == 0); const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); __m128i mean = _mm_setzero_si128(); __m128i sse = _mm_setzero_si128(); for (int i = 0; i < width; i += 16) { const __m128i src_line = _mm_loadu_si128((const __m128i *)src); const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref); const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8)); const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8)); __m128i diff = _mm_sub_epi16(ref_line, src_line); const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2); __m128i diff_sqr = _mm_madd_epi16(diff, diff); const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2); diff = _mm_add_epi16(diff, diff2); diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2); sse = _mm_add_epi32(sse, diff_sqr); mean = _mm_add_epi16(mean, diff); src += 16; ref += 16; } // m0 m1 m2 m3 mean = _mm_madd_epi16(mean, k_one_epi16); // m0+m1 m2+m3 s0+s1 s2+s3 __m128i result = _mm_hadd_epi32(mean, sse); // m0+m1+m2+m3 s0+s1+s2+s3 x x result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4)); // (mean * mean): dynamic range 31 bits. const int mean_int = _mm_extract_epi32(result, 0); const int sse_int = _mm_extract_epi32(result, 2); const unsigned int mean_abs = abs(mean_int); const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); return var; } aom-3.12.1/aom_dsp/x86/bitdepth_conversion_avx2.h000066400000000000000000000024331477627663500216330ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <immintrin.h> #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" static inline __m256i load_tran_low(const tran_low_t *a) { const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); return _mm256_packs_epi32(a_low, a_high); } static inline void store_tran_low(__m256i a, tran_low_t *b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_hi = _mm256_mulhi_epi16(a, one); const __m256i a_lo = _mm256_mullo_epi16(a, one); const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); _mm256_storeu_si256((__m256i *)b, a_1); _mm256_storeu_si256((__m256i *)(b + 8), a_2); } aom-3.12.1/aom_dsp/x86/bitdepth_conversion_sse2.h000066400000000000000000000035521477627663500216320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software.
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" // Load 8 16 bit values. If the source is 32 bits then pack down with // saturation. static inline __m128i load_tran_low(const tran_low_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); } static inline void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) { const __m128i one = _mm_set1_epi16(1); const __m128i a_hi = _mm_mulhi_epi16(a, one); const __m128i a_lo = _mm_mullo_epi16(a, one); *a_1 = _mm_unpacklo_epi16(a_lo, a_hi); *a_2 = _mm_unpackhi_epi16(a_lo, a_hi); } // Store 8 16 bit values. If the destination is 32 bits then sign extend the // values by multiplying by 1. static inline void store_tran_low(__m128i a, tran_low_t *b) { __m128i a_1, a_2; unpack_trans(a, &a_1, &a_2); _mm_store_si128((__m128i *)(b), a_1); _mm_store_si128((__m128i *)(b + 4), a_2); } // Stores the second result at an offset of 8 (instead of 4) to match the output // with that of AVX2 implementation and the function is similar to // store_tran_low(). static inline void store_tran_low_offset_4(__m128i a, tran_low_t *b) { __m128i a_1, a_2; unpack_trans(a, &a_1, &a_2); _mm_store_si128((__m128i *)(b), a_1); _mm_store_si128((__m128i *)(b + 8), a_2); } aom-3.12.1/aom_dsp/x86/blend_a64_hmask_sse4.c000066400000000000000000000031061477627663500204730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom/aom_integer.h" #include "config/aom_dsp_rtcd.h" // To start out, just dispatch to the function using the 2D mask and // pass mask stride as 0. This can be improved upon if necessary. void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 0, w, h, 0, 0); } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, 0, w, h, 0, 0, bd); } #endif aom-3.12.1/aom_dsp/x86/blend_a64_mask_avx2.c000066400000000000000000001614161477627663500203360ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <smmintrin.h> // SSE4.1 #include <immintrin.h> // AVX2 #include <assert.h> #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/blend_sse4.h" #include "aom_dsp/x86/blend_mask_sse4.h" #include "config/aom_dsp_rtcd.h" static inline void blend_a64_d16_mask_w16_avx2( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, int shift) { const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); const __m256i s0_0 = yy_loadu_256(src0); const __m256i s1_0 = yy_loadu_256(src1); __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), _mm256_unpacklo_epi16(*m0, max_minus_m0)); __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), _mm256_unpackhi_epi16(*m0, max_minus_m0)); res0_lo = _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); res0_hi = _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); __m256i res = _mm256_packus_epi16(res0, res0); res = _mm256_permute4x64_epi64(res, 0xd8); _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); } static inline void blend_a64_d16_mask_w32_avx2( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, const __m256i *v_maxval, int shift) { const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); const __m256i s0_0 = yy_loadu_256(src0); const __m256i s0_1 = yy_loadu_256(src0 + 16); const __m256i s1_0 = yy_loadu_256(src1); const __m256i s1_1 = yy_loadu_256(src1 + 16); __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), _mm256_unpacklo_epi16(*m0, max_minus_m0)); __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), _mm256_unpackhi_epi16(*m0, max_minus_m0)); __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), _mm256_unpacklo_epi16(*m1, max_minus_m1)); __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), _mm256_unpackhi_epi16(*m1, max_minus_m1)); res0_lo = _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); res0_hi = _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); res1_lo = _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); res1_hi = _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); __m256i res = _mm256_packus_epi16(res0, res1); res = _mm256_permute4x64_epi64(res, 0xd8); _mm256_storeu_si256((__m256i *)(dst), res); } static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); for (int i = 0; i < h; ++i) { const __m128i m = xx_loadu_128(mask); const __m256i m0 = _mm256_cvtepu8_epi16(m); blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, shift);
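// Advance one row; with subw == 0 and subh == 0 the mask is not subsampled, so it steps at the same rate as the source and destination buffers.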
mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 32) { const __m256i m = yy_loadu_256(mask + j); const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m256i one_b = _mm256_set1_epi8(1); const __m256i two_w = _mm256_set1_epi16(2); for (int i = 0; i < h; ++i) { const __m256i m_i00 = yy_loadu_256(mask); const __m256i m_i10 = yy_loadu_256(mask + mask_stride); const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, shift); mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m256i one_b = _mm256_set1_epi8(1); const __m256i two_w = _mm256_set1_epi16(2); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 32) { const __m256i m_i00 = yy_loadu_256(mask + 2 * j); const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m256i one_b = 
_mm256_set1_epi8(1); const __m256i zeros = _mm256_setzero_si256(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m256i m_i00 = yy_loadu_256(mask + 2 * j); const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, round_offset, &v_maxval, shift); } mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m256i one_b = _mm256_set1_epi8(1); const __m256i zeros = _mm256_setzero_si256(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 32) { const __m256i m_i00 = yy_loadu_256(mask + 2 * j); const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m128i m_i00 = xx_loadu_128(mask + j); const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, round_offset, &v_maxval, shift); } mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m256i *round_offset, int shift) { const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m256i zeros = _mm256_setzero_si256(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 32) { const __m256i m_i00 = yy_loadu_256(mask + j); const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); const __m256i m_ac = _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } void aom_lowbd_blend_a64_d16_mask_avx2( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { const int bd = 8; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int round_offset = ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - (1 << (round_bits - 1))) << AOM_BLEND_A64_ROUND_BITS; const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); const __m128i v_round_offset = _mm_set1_epi32(round_offset); const __m256i y_round_offset = _mm256_set1_epi32(round_offset); if (subw == 0 && subh == 0) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 16: lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &y_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; } } else if (subw == 1 && subh == 1) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 16: lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &y_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; } } else if (subw == 1 && subh == 0) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 16: lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; } } else { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 16: lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( dst, dst_stride, src0, src0_stride, src1, 
src1_stride, mask, mask_stride, h, w, &y_round_offset, shift); break; } } } static inline __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, const __m256i *v_m0_b, const __m256i *v_m1_b, const int32_t bits) { const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); const __m256i v_p0_w = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); return v_res; } static inline __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, const __m256i *v_m0_b, const __m256i *v_m1_b, const int32_t bits) { const __m256i v_s0_b = yy_loadu_256(src0); const __m256i v_s1_b = yy_loadu_256(src1); const __m256i v_p0_w = _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); const __m256i v_p1_w = _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); return v_res; } static inline void blend_a64_mask_sx_sy_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); do { const __m256i v_ral_b = yy_loadu_256(mask); const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); const __m256i v_rvsbl_w = _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static inline void blend_a64_mask_sx_sy_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); do { int c; for (c = 0; c < w; c += 32) { const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); const __m256i v_rvsbl_w 
= _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); const __m256i v_rvsbh_w = _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); const __m256i v_m0_b = _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i v_res_b = blend_32_u8_avx2( src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); yy_storeu_256(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static inline void blend_a64_mask_sx_sy_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); switch (w) { case 4: do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); break; case 8: do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); break; case 16: blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h); break; default: blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); break; } } static inline void blend_a64_mask_sx_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m256i v_zmask_b = _mm256_set1_epi16(0xff); do { const __m256i v_rl_b = yy_loadu_256(mask); const __m256i v_al_b = _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); const __m256i v_m0_w = 
_mm256_and_si256(v_al_b, v_zmask_b); const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static inline void blend_a64_mask_sx_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 32) { const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); const __m256i v_al_b = _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); const __m256i v_ah_b = _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); const __m256i v_m0_b = _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i v_res_b = blend_32_u8_avx2( src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); yy_storeu_256(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static inline void blend_a64_mask_sx_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); switch (w) { case 4: do { const __m128i v_r_b = xx_loadl_64(mask); const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); break; case 8: do { const __m128i v_r_b = xx_loadu_128(mask); const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); break; case 16: blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h); break; default: blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); break; } } static inline void blend_a64_mask_sy_w16_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t 
*src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h) { const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static inline void blend_a64_mask_sy_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 32) { const __m256i v_ra_b = yy_loadu_256(mask + c); const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i v_res_b = blend_32_u8_avx2( src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); yy_storeu_256(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static inline void blend_a64_mask_sy_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); switch (w) { case 4: do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); break; case 8: do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); break; case 16: blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h); break; default: blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } static inline void blend_a64_mask_w32n_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 32) { const __m256i v_m0_b = yy_loadu_256(mask + c); const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); const __m256i v_res_b = blend_32_u8_avx2( src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); yy_storeu_256(dst + c, v_res_b); } 
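    // Full-resolution mask: every pointer advances by a single row stride
    // below. The vertically sub-sampled (sy / sx_sy) variants consume two
    // mask rows per output row and therefore step the mask by 2 * mask_stride.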
dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static inline void blend_a64_mask_avx2( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); switch (w) { case 4: do { const __m128i v_m0_b = xx_loadl_32(mask); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); break; case 8: do { const __m128i v_m0_b = xx_loadl_64(mask); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); break; case 16: do { const __m128i v_m0_b = xx_loadu_128(mask); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); break; default: blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh); } else { if (subw & subh) { blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else if (subw) { blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else if (subh) { blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else { blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } } #if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // aom_highbd_blend_a64_d16_mask_avx2() ////////////////////////////////////////////////////////////////////////////// static inline void highbd_blend_a64_d16_mask_w4_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { // Load 4x u16 pixels from each of 4 rows from each source const __m256i s0 = yy_loadu_4x64(src0 + 3 * src0_stride, src0 + 2 * src0_stride, src0 + 1 * src0_stride, src0 + 0 * src0_stride); const __m256i s1 = yy_loadu_4x64(src1 + 3 * src1_stride, src1 + 2 * src1_stride, src1 + 1 * src1_stride, src1 + 0 * src1_stride); // Generate the inverse mask const __m256i mask1 = 
_mm256_sub_epi16(*mask_max, *mask0); // Multiply each mask by the respective source const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within // lanes Later, packs does the same again which cancels this out with no need // for a permute. The intermediate values being reordered makes no difference const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); const __m256i suml = _mm256_add_epi32(mul0l, mul1l); const __m256i roundh = _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); const __m256i roundl = _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); const __m256i pack = _mm256_packs_epi32(roundl, roundh); const __m256i clip = _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way: const __m128i cliph = _mm256_extracti128_si256(clip, 1); xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); xx_storel_64(dst + 2 * dst_stride, cliph); const __m128i clipl = _mm256_castsi256_si128(clip); xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); xx_storel_64(dst + 0 * dst_stride, clipl); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { do { // Load 8x u8 pixels from each of 4 rows of the mask, pad each to u16 const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride), *(int32_t *)(mask + 2 * mask_stride), *(int32_t *)(mask + 1 * mask_stride), *(int32_t *)(mask + 0 * mask_stride)); const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 4; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { const __m256i one_b = _mm256_set1_epi8(1); const __m256i two_w = _mm256_set1_epi16(2); do { // Load 8 pixels from each of 8 rows of mask, // (saturating) add together rows then use madd to add adjacent pixels // Finally, divide each value by 4 (with rounding) const __m256i m0246 = _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride), *(int64_t *)(mask + 4 * mask_stride), *(int64_t *)(mask + 2 * mask_stride), *(int64_t *)(mask + 0 * mask_stride)); const __m256i m1357 = _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride), *(int64_t *)(mask + 5 * mask_stride), *(int64_t *)(mask + 3 
* mask_stride), *(int64_t *)(mask + 1 * mask_stride)); const __m256i addrows = _mm256_adds_epu8(m0246, m1357); const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); const __m256i mask0 = _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 8; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_w8_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, const __m256i *mask0b, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { // Load 8x u16 pixels from each of 4 rows from each source const __m256i s0a = yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); const __m256i s0b = yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); const __m256i s1a = yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); const __m256i s1b = yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); // Generate inverse masks const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); // Multiply sources by respective masks const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within // lanes Later, packs does the same again which cancels this out with no need // for a permute. 
The intermediate values being reordered makes no difference const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); // Divide down each result, with rounding const __m256i roundah = _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); const __m256i roundal = _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); const __m256i roundbh = _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); const __m256i roundbl = _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); // Pack each i32 down to an i16 with saturation, then clip to valid range const __m256i packa = _mm256_packs_epi32(roundal, roundah); const __m256i clipa = _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); const __m256i clipb = _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); // Store 8x u16 pixels to each of 4 rows in the destination yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { do { // Load 8x u8 pixels from each of 4 rows in the mask const __m128i mask0a8 = _mm_set_epi64x(*(int64_t *)mask, *(uint64_t *)(mask + mask_stride)); const __m128i mask0b8 = _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride), *(int64_t *)(mask + 3 * mask_stride)); const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); highbd_blend_a64_d16_mask_w8_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 4; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { const __m256i one_b = _mm256_set1_epi8(1); const __m256i two_w = _mm256_set1_epi16(2); do { // Load 16x u8 pixels from each of 8 rows in the mask, // 
(saturating) add together rows then use madd to add adjacent pixels // Finally, divide each value by 4 (with rounding) const __m256i m02 = yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); const __m256i m13 = yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); const __m256i m0123 = _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); const __m256i mask_0a = _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); const __m256i m46 = yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); const __m256i m57 = yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); const __m256i m4567 = _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); const __m256i mask_0b = _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); highbd_blend_a64_d16_mask_w8_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 8; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, const __m256i *mask0b, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { // Load 16x pixels from each of 2 rows from each source const __m256i s0a = yy_loadu_256(src0); const __m256i s0b = yy_loadu_256(src0 + src0_stride); const __m256i s1a = yy_loadu_256(src1); const __m256i s1b = yy_loadu_256(src1 + src1_stride); // Calculate inverse masks const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); // Multiply each source by appropriate mask const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within // lanes Later, packs does the same again which cancels this out with no need // for a permute. 
The intermediate values being reordered makes no difference const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); const __m256i resah = _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); const __m256i resal = _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); const __m256i resbh = _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); const __m256i resbl = _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); // Signed saturating pack from i32 to i16: const __m256i packa = _mm256_packs_epi32(resal, resah); const __m256i packb = _mm256_packs_epi32(resbl, resbh); // Clip the values to the valid range const __m256i clipa = _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); const __m256i clipb = _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); // Store 16 pixels yy_storeu_256(dst, clipa); yy_storeu_256(dst + dst_stride, clipb); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, int w, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { for (int i = 0; i < h; i += 2) { for (int j = 0; j < w; j += 16) { // Load 16x u8 alpha-mask values from each of two rows and pad to u16 const __m128i masks_a8 = xx_loadu_128(mask + j); const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); highbd_blend_a64_d16_mask_w16_avx2( dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); } dst += dst_stride * 2; src0 += src0_stride * 2; src1 += src1_stride * 2; mask += mask_stride * 2; } } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, int mask_stride, int h, int w, const __m256i *round_offset, int shift, const __m256i *clip_low, const __m256i *clip_high, const __m256i *mask_max) { const __m256i one_b = _mm256_set1_epi8(1); const __m256i two_w = _mm256_set1_epi16(2); for (int i = 0; i < h; i += 2) { for (int j = 0; j < w; j += 16) { // Load 32x u8 alpha-mask values from each of four rows // (saturating) add pairs of rows, then use madd to add adjacent values // Finally, divide down each result 
with rounding const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); const __m256i m01_8 = _mm256_adds_epu8(m0, m1); const __m256i m23_8 = _mm256_adds_epu8(m2, m3); const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); highbd_blend_a64_d16_mask_w16_avx2( dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); } dst += dst_stride * 2; src0 += src0_stride * 2; src1 += src1_stride * 2; mask += mask_stride * 4; } } void aom_highbd_blend_a64_d16_mask_avx2( uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int32_t round_offset = ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - (1 << (round_bits - 1))) << AOM_BLEND_A64_ROUND_BITS; const __m256i v_round_offset = _mm256_set1_epi32(round_offset); const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; const __m256i clip_low = _mm256_setzero_si256(); const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { switch (w) { case 4: highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; case 8: highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; default: // >= 16 highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; } } else if (subw == 1 && subh == 1) { switch (w) { case 4: highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; case 8: highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; default: // >= 16 highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; } } else { // Sub-sampling in only one axis doesn't seem to happen very much, so fall // back to the vanilla C implementation instead of having all the optimised // code for these. 
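    // Only the mixed cases (subw == 1, subh == 0) and (subw == 0, subh == 1)
    // reach this branch; the matched sub-sampling cases are handled by the
    // AVX2 kernels above.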
    aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, w, h, subw,
                                    subh, conv_params, bd);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
aom-3.12.1/aom_dsp/x86/blend_a64_mask_sse4.c000066400000000000000000001762531477627663500203340ustar00rootroot00000000000000
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "aom_dsp/x86/blend_mask_sse4.h"

#include "config/aom_dsp_rtcd.h"

//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int w, int h) {
  (void)w;
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    const __m128i v_m0_b = xx_loadl_64(mask);
    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_m0_b = xx_loadu_128(mask + c);
      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
      const __m128i v_res_b =
          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

static void
blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadl_64(mask); const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadu_128(mask); const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } ////////////////////////////////////////////////////////////////////////////// // Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; const 
__m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ra_b = xx_loadu_128(mask + c); const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } ////////////////////////////////////////////////////////////////////////////// // Horizontal and Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * 
mask_stride; } while (--h); } static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); const __m128i v_res_b = blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, 
int h); // Dimensions are: width_index X subx X suby static const blend_fn blend[3][2][2] = { { // w % 16 == 0 { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, { // w == 4 { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, { // w == 8 { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } }; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh); } else { blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } #if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// static inline void blend_a64_mask_bn_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_m0_b = xx_loadl_32(mask); const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); xx_storel_64(dst, v_res_w); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_b10_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static inline void blend_a64_mask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 8) { const __m128i v_m0_b = xx_loadl_64(mask + c); const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); xx_storeu_128(dst + c, v_res_w); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_b10_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { 
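  // Thin wrapper: blend_a64_mask_bn_w8n_sse4_1() runs the row/column loops,
  // and blend_8_b10 supplies the per-vector blend for 10-bit pixels.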
blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Horizontal sub-sampling ////////////////////////////////////////////////////////////////////////////// static inline void blend_a64_mask_bn_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_r_b = xx_loadl_64(mask); const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); xx_storel_64(dst, v_res_w); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_b10_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static inline void blend_a64_mask_bn_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 8) { const __m128i v_r_b = xx_loadu_128(mask + 2 * c); const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); xx_storeu_128(dst + c, v_res_w); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } while (--h); } static void blend_a64_mask_b10_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const 
uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// static inline void blend_a64_mask_bn_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); xx_storel_64(dst, v_res_w); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_b10_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static inline void blend_a64_mask_bn_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 8) { const __m128i v_ra_b = xx_loadl_64(mask + c); const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); xx_storeu_128(dst + c, v_res_w); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_b10_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b12); } 
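// Note for the sx_sy kernels that follow: when the mask is sub-sampled in
// both directions, four mask bytes feed one output pixel. They are combined
// as a rounded average, m = (m00 + m01 + m10 + m11 + 2) >> 2, before the
// usual A64 blend
// dst = ROUND_POWER_OF_TWO(m * src0 + (AOM_BLEND_A64_MAX_ALPHA - m) * src1,
//                          AOM_BLEND_A64_ROUND_BITS).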
////////////////////////////////////////////////////////////////////////////// // Horizontal and Vertical sub-sampling ////////////////////////////////////////////////////////////////////////////// static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); xx_storel_64(dst, v_res_w); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_b10_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; for (c = 0; c < w; c += 8) { const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); xx_storeu_128(dst + c, v_res_w); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 2 * mask_stride; } while (--h); } static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void 
blend_a64_mask_b12_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h); // Dimensions are: bd_index X width_index X subw X subh static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 { // w % 8 == 0 { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, { blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, { // w == 4 { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, { blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, { // bd == 12 { // w % 8 == 0 { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, { blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, { // w == 4 { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, { blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } }; assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, mask_stride, w, h, subw, subh, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void blend_a64_d16_mask_w16_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, const __m128i *v_maxval, int shift) { const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); const __m128i s0_0 = xx_loadu_128(src0); const __m128i s0_1 = xx_loadu_128(src0 + 8); const __m128i s1_0 = xx_loadu_128(src1); const __m128i s1_1 = xx_loadu_128(src1 + 8); __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), _mm_unpacklo_epi16(*m0, max_minus_m0)); __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), _mm_unpackhi_epi16(*m0, max_minus_m0)); __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), _mm_unpacklo_epi16(*m1, max_minus_m1)); __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), _mm_unpackhi_epi16(*m1, max_minus_m1)); res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); res0_hi 
= _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); const __m128i res = _mm_packus_epi16(res0, res1); _mm_storeu_si128((__m128i *)(dst), res); } static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m128i m = xx_loadu_128(mask + j); const __m128i m0 = _mm_cvtepu8_epi16(m); const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i one_b = _mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m128i m_i00 = xx_loadu_128(mask + 2 * j); const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i one_b = _mm_set1_epi8(1); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m128i m_i00 = xx_loadu_128(mask + 2 * j); const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void 
lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const __m128i m_i00 = xx_loadu_128(mask + j); const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); const __m128i m0 = _mm_cvtepu8_epi16(m_ac); const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, round_offset, &v_maxval, shift); } mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } void aom_lowbd_blend_a64_d16_mask_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { const int bd = 8; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int round_offset = ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - (1 << (round_bits - 1))) << AOM_BLEND_A64_ROUND_BITS; const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); const __m128i v_round_offset = _mm_set1_epi32(round_offset); if (subw == 0 && subh == 0) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift); break; } } else if (subw == 1 && subh == 1) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift); break; } } else if (subw == 1 && subh == 0) { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift); break; } } else { switch (w) { case 4: aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( dst, 
dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; case 8: aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift); break; default: lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift); break; } } } ////////////////////////////////////////////////////////////////////////////// // aom_highbd_blend_a64_d16_mask_sse4_1() ////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_blend_a64_d16_mask_w4_sse4_1( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, const __m128i *mask0b, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { // Load 4 pixels from each of 4 rows from each source const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride); const __m128i s0b = xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride); const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride); const __m128i s1b = xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride); // Generate the inverse masks const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); // Multiply each mask by the respective source const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); const __m128i sumal = _mm_add_epi32(mul0al, mul1al); const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); const __m128i roundah = _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); const __m128i roundbh = _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); const __m128i roundal = _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); const __m128i roundbl = _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); const __m128i packa = _mm_packs_epi32(roundal, roundah); const __m128i packb = _mm_packs_epi32(roundbl, roundbh); const __m128i clipa = _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); const __m128i clipb = _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); xx_storel_64(dst, _mm_srli_si128(clipa, 8)); xx_storel_64(dst + dst_stride, clipa); xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); xx_storel_64(dst + 3 * 
dst_stride, clipb); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { do { const __m128i mask0a8 = _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride)); const __m128i mask0b8 = _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride), *(int32_t *)(mask + 3 * mask_stride)); const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); highbd_blend_a64_d16_mask_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 4; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { const __m128i one_b = _mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); do { // Load 8 pixels from each of 8 rows of mask, // (saturating) add together rows then use madd to add adjacent pixels // Finally, divide each value by 4 (with rounding) const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask), *(int64_t *)(mask + 2 * mask_stride)); const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride), *(int64_t *)(mask + 3 * mask_stride)); const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride), *(int64_t *)(mask + 6 * mask_stride)); const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride), *(int64_t *)(mask + 7 * mask_stride)); const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); highbd_blend_a64_d16_mask_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); dst += dst_stride * 4; src0 += src0_stride * 4; src1 += src1_stride * 4; mask += mask_stride * 8; } while (h -= 4); } static inline void highbd_blend_a64_d16_mask_w8_sse4_1( uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, const __m128i *mask0b, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *max_mask) { // Load 8x pixels from each of 2 rows from each source const __m128i s0a = xx_loadu_128(src0); const __m128i s0b = xx_loadu_128(src0 + src0_stride); const __m128i s1a = xx_loadu_128(src1); const __m128i s1b = xx_loadu_128(src1 + src1_stride); // Generate inverse masks const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); // Multiply sources by respective masks const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); const __m128i mul0ah = 
_mm_unpackhi_epi16(mul0a_lows, mul0a_highs); const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); const __m128i sumal = _mm_add_epi32(mul0al, mul1al); const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); const __m128i roundah = _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); const __m128i roundal = _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); const __m128i roundbh = _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); const __m128i roundbl = _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); const __m128i packa = _mm_packs_epi32(roundal, roundah); const __m128i clipa = _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); const __m128i packb = _mm_packs_epi32(roundbl, roundbh); const __m128i clipb = _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); xx_storeu_128(dst, clipa); xx_storeu_128(dst + dst_stride, clipb); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *max_mask) { do { const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); highbd_blend_a64_d16_mask_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, max_mask); dst += dst_stride * 2; src0 += src0_stride * 2; src1 += src1_stride * 2; mask += mask_stride * 2; } while (h -= 2); } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *max_mask) { const __m128i one_b = _mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); do { const __m128i mask_thisrowa = xx_loadu_128(mask); const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); const 
__m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); highbd_blend_a64_d16_mask_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); dst += dst_stride * 2; src0 += src0_stride * 2; src1 += src1_stride * 2; mask += mask_stride * 4; } while (h -= 2); } static inline void highbd_blend_a64_d16_mask_w16_sse4_1( uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, const __m128i *round_offset, int shift, const __m128i *mask0l, const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { // Load 16x u16 pixels for this row from each src const __m128i s0l = xx_loadu_128(src0); const __m128i s0h = xx_loadu_128(src0 + 8); const __m128i s1l = xx_loadu_128(src1); const __m128i s1h = xx_loadu_128(src1 + 8); // Calculate inverse masks const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l); const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); const __m128i mullh = _mm_add_epi32(mul2h, mul3h); const __m128i mulll = _mm_add_epi32(mul2l, mul3l); const __m128i reshh = _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); const __m128i reshl = _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); const __m128i reslh = _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); const __m128i resll = _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); // Signed saturating pack from i32 to i16: const __m128i packh = _mm_packs_epi32(reshl, reshh); const __m128i packl = _mm_packs_epi32(resll, reslh); // Clip the values to the valid range const __m128i cliph = _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); const __m128i clipl = _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); // Store 16 pixels xx_storeu_128(dst, clipl); xx_storeu_128(dst + 8, cliph); } static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { for (int i = 0; i < h; i++) { for (int j = 0; j < w; j += 16) { // Load 16x u8 alpha-mask values and pad to u16 const __m128i masks_u8 = xx_loadu_128(mask + j); 
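      // Split the 16 mask bytes into two vectors of eight 16-bit values
      // (low lanes first, then the upper eight) so they can be multiplied
      // against the 16-bit CONV_BUF_TYPE sources.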
const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); highbd_blend_a64_d16_mask_w16_sse4_1( dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, clip_low, clip_high, mask_max); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride; } } static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, const __m128i *round_offset, int shift, const __m128i *clip_low, const __m128i *clip_high, const __m128i *mask_max) { const __m128i one_b = _mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j += 16) { const __m128i m_i00 = xx_loadu_128(mask + 2 * j); const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); highbd_blend_a64_d16_mask_w16_sse4_1( dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, clip_low, clip_high, mask_max); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += mask_stride * 2; } } void aom_highbd_blend_a64_d16_mask_sse4_1( uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int32_t round_offset = ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - (1 << (round_bits - 1))) << AOM_BLEND_A64_ROUND_BITS; const __m128i v_round_offset = _mm_set1_epi32(round_offset); const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; const __m128i clip_low = _mm_setzero_si128(); const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); assert(h >= 4); assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { switch (w) { case 4: highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; case 8: highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; default: // >=16 highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; } } else if (subw == 1 && subh == 1) { switch (w) { case 4: highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( dst, 
dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; case 8: highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; default: // >=16 highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, &mask_max); break; } } else { // Sub-sampling in only one axis doesn't seem to happen very much, so fall // back to the vanilla C implementation instead of having all the optimised // code for these. aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, subw, subh, conv_params, bd); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/blend_a64_vmask_sse4.c000066400000000000000000000251211477627663500205120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE4.1 #include #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" #include "config/aom_dsp_rtcd.h" ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; do { const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); xx_storel_32(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 1; } while (--h); } static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; do { const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); xx_storel_64(dst, v_res_b); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 1; } while (--h); } static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; const __m128i v_m0_w = 
_mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); xx_storeu_128(dst + c, v_res_b); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 1; } while (--h); } ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h); // Dimension: width_index static const blend_fn blend[9] = { blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 aom_blend_a64_vmask_c, // w == 1 aom_blend_a64_vmask_c, // w == 2 NULL, // INVALID blend_a64_vmask_w4_sse4_1, // w == 4 NULL, // INVALID NULL, // INVALID NULL, // INVALID blend_a64_vmask_w8_sse4_1, // w == 8 }; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h); } #if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// static inline void blend_a64_vmask_bn_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); xx_storel_64(dst, v_res_w); dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 1; } while (--h); } static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b10); } static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b12); } static inline void blend_a64_vmask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { int c; const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 8) { const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 
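      // The same per-row alpha pair (v_m0_w / v_m1_w) hoisted above is reused
      // for every 8-pixel group in the row; store the blended 16-bit results.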
xx_storeu_128(dst + c, v_res_w); } dst += dst_stride; src0 += src0_stride; src1 += src1_stride; mask += 1; } while (--h); } static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h, blend_8_b10); } static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// void aom_highbd_blend_a64_vmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd) { typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h); // Dimensions are: bd_index X width_index static const blend_fn blend[2][2] = { { // bd == 8 or 10 blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 blend_a64_vmask_b10_w4_sse4_1, // w == 4 }, { // bd == 12 blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 blend_a64_vmask_b12_w4_sse4_1, // w == 4 } }; assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); assert(h >= 1); assert(w >= 1); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, w, h, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h); } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/blend_mask_sse4.h000066400000000000000000000230151477627663500176570ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"

#include "config/aom_dsp_rtcd.h"

static inline void blend_a64_d16_mask_w4_sse41(
    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
    int shift) {
  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
  const __m128i s0 = xx_loadl_64(src0);
  const __m128i s1 = xx_loadl_64(src1);
  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
  const __m128i res_d = _mm_srai_epi32(res_c, shift);
  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
  const __m128i res = _mm_packus_epi16(res_e, res_e);
  xx_storel_32(dst, res);
}

static inline void blend_a64_d16_mask_w8_sse41(
    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
    int shift) {
  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
  const __m128i s0 = xx_loadu_128(src0);
  const __m128i s1 = xx_loadu_128(src1);
  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
                                  _mm_unpacklo_epi16(*m, max_minus_m));
  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
                                  _mm_unpackhi_epi16(*m, max_minus_m));
  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
  const __m128i res = _mm_packus_epi16(res_e, res_e);
  _mm_storel_epi64((__m128i *)(dst), res);
}

static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  for (int i = 0; i < h; ++i) {
    const __m128i m0 = xx_loadl_32(mask);
    const __m128i m = _mm_cvtepu8_epi16(m0);
    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
                                shift);
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  for (int i = 0; i < h; ++i) {
    const __m128i m0 = xx_loadl_64(mask);
    const __m128i m = _mm_cvtepu8_epi16(m0);
    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
                                shift);
    mask += mask_stride;
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
  }
}

static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h,
    const __m128i *round_offset, int shift) {
  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
  const __m128i one_b =
_mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadl_64(mask); const __m128i m_i1 = xx_loadl_64(mask + mask_stride); const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); const __m128i m = _mm_srli_epi16(m_acbd_2, 2); blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i one_b = _mm_set1_epi8(1); const __m128i two_w = _mm_set1_epi16(2); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadu_128(mask); const __m128i m_i1 = xx_loadu_128(mask + mask_stride); const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); const __m128i m = _mm_srli_epi16(m_acbd_2, 2); blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i one_b = _mm_set1_epi8(1); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadl_64(mask); const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); const __m128i m = _mm_avg_epu16(m_ac, zeros); blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i one_b = _mm_set1_epi8(1); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadu_128(mask); const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); const __m128i m = _mm_avg_epu16(m_ac, zeros); blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadl_64(mask); const 
__m128i m_i1 = xx_loadl_64(mask + mask_stride); const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, const __m128i *round_offset, int shift) { const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i zeros = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { const __m128i m_i0 = xx_loadl_64(mask); const __m128i m_i1 = xx_loadl_64(mask + mask_stride); const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, shift); mask += mask_stride << 1; dst += dst_stride; src0 += src0_stride; src1 += src1_stride; } } #endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ aom-3.12.1/aom_dsp/x86/blend_sse4.h000066400000000000000000000161731477627663500166530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ #define AOM_AOM_DSP_X86_BLEND_SSE4_H_ #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" static const uint8_t g_blend_a64_mask_shuffle[32] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, }; ////////////////////////////////////////////////////////////////////////////// // Common kernels ////////////////////////////////////////////////////////////////////////////// static inline __m128i blend_4(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static inline __m128i blend_8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static inline __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); return v_res; } static inline __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); return v_res; } static inline __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, const __m128i *v_m0_b, const __m128i *v_m1_b, const __m128i *rounding) { const __m128i v_s0_b = xx_loadu_128(src0); const __m128i v_s1_b = xx_loadu_128(src1); const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); return v_res; } typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); static inline __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadl_64(src0); const __m128i v_s1_w = 
xx_loadl_64(src1); const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static inline __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadu_128(src0); const __m128i v_s1_w = xx_loadu_128(src1); const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static inline __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadl_64(src0); const __m128i v_s1_w = xx_loadl_64(src1); // Interleave const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); // Multiply-Add const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); // Scale const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); // Round const __m128i v_res_w = xx_round_epu16(v_pssum_d); return v_res_w; } static inline __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w) { const __m128i v_s0_w = xx_loadu_128(src0); const __m128i v_s1_w = xx_loadu_128(src1); // Interleave const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); // Multiply-Add const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); // Scale const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); // Round const __m128i v_res_w = xx_round_epu16(v_pssum_d); return v_res_w; } #endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ aom-3.12.1/aom_dsp/x86/blk_sse_sum_avx2.c000066400000000000000000000164701477627663500200720ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" static inline void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, int *x_sum, int64_t *x2_sum) { __m256i sum_buffer, sse_buffer; __m128i out_buffer; // Accumulate the various elements of register into first element. 
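// The 32-bit sums are reduced by swapping the two 128-bit lanes with
// _mm256_permute2f128_si256 and adding, so both lanes hold the cross-lane
// total, then folding the four 32-bit partials into element 0 with two
// in-lane byte shifts (8 bytes, then 4 bytes). The 64-bit SSE accumulator
// only needs the lane swap plus a single 8-byte fold, since each lane
// carries two 64-bit partial sums.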
sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1); regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8)); regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4)); sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1); regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8)); out_buffer = _mm256_castsi256_si128(regx_sum); *x_sum += _mm_cvtsi128_si32(out_buffer); out_buffer = _mm256_castsi256_si128(regx2_sum); #if AOM_ARCH_X86_64 *x2_sum += _mm_cvtsi128_si64(out_buffer); #else { int64_t tmp; _mm_storel_epi64((__m128i *)&tmp, out_buffer); *x2_sum += tmp; } #endif } static inline void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { __m128i row1, row2, row3; __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; const int16_t *data_tmp = data; __m256i one = _mm256_set1_epi16(1); regx_sum = _mm256_setzero_si256(); regx2_sum = regx_sum; sum_buffer = _mm256_setzero_si256(); sse_buffer = sum_buffer; for (int j = 0; j < (bh >> 2); ++j) { // Load 4 rows at a time. row1 = _mm_loadl_epi64((__m128i const *)(data_tmp)); row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); row1 = _mm_unpacklo_epi64(row1, row2); row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride)); row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride)); row2 = _mm_unpacklo_epi64(row2, row3); load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1); row_sum_buffer = _mm256_madd_epi16(load_pixels, one); row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); data_tmp += 4 * stride; } // To prevent 32-bit variable overflow, unpack the elements to 64-bit. temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); } static inline void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { __m128i load_128bit, load_next_128bit; __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; const int16_t *data_tmp = data; __m256i one = _mm256_set1_epi16(1); regx_sum = _mm256_setzero_si256(); regx2_sum = regx_sum; sum_buffer = _mm256_setzero_si256(); sse_buffer = sum_buffer; for (int j = 0; j < (bh >> 1); ++j) { // Load 2 rows at a time. 
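// Row j goes in the low 128-bit lane and row j + 1 in the high lane, so a
// single _mm256_madd_epi16 against a vector of ones accumulates pairwise
// 32-bit sums, and a madd of the data with itself accumulates pairwise
// 32-bit sums of squares, covering 16 pixels per iteration.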
load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), load_next_128bit, 1); row_sum_buffer = _mm256_madd_epi16(load_pixels, one); row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); data_tmp += 2 * stride; } temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); } static inline void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum, int loop_count) { __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; const int16_t *data_tmp = data; __m256i one = _mm256_set1_epi16(1); regx_sum = _mm256_setzero_si256(); regx2_sum = regx_sum; sum_buffer = _mm256_setzero_si256(); sse_buffer = sum_buffer; for (int i = 0; i < loop_count; ++i) { data_tmp = data + 16 * i; for (int j = 0; j < bh; ++j) { load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); row_sum_buffer = _mm256_madd_epi16(load_pixels, one); row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); data_tmp += stride; } } temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); } void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { *x_sum = 0; *x2_sum = 0; if ((bh & 3) == 0) { switch (bw) { // For smaller block widths, compute multiple rows simultaneously. case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; case 16: case 32: sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); break; case 64: // 32-bit variables will overflow for 64 rows at a single time, so // compute 32 rows at a time. if (bh <= 32) { sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); } else { sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, bw >> 4); } break; default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); } } else { aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); } } aom-3.12.1/aom_dsp/x86/blk_sse_sum_sse2.c000066400000000000000000000117361477627663500200660ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" static inline void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum) { const int16_t *data_tmp = data; __m128i temp_buffer1, temp_buffer2; __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer; __m128i one = _mm_set1_epi16(1); __m128i regx_sum = _mm_setzero_si128(); __m128i regx2_sum = regx_sum; for (int j = 0; j < (bh >> 1); ++j) { // Load 2 rows (8 pixels) at a time. load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp)); load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi); sum_buffer = _mm_madd_epi16(load_pixels_low, one); sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low); regx_sum = _mm_add_epi32(sum_buffer, regx_sum); regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); data_tmp += 2 * stride; } regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); *x_sum = _mm_cvtsi128_si32(regx_sum); temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); #if AOM_ARCH_X86_64 *x2_sum += _mm_cvtsi128_si64(regx2_sum); #else { int64_t tmp; _mm_storel_epi64((__m128i *)&tmp, regx2_sum); *x2_sum += tmp; } #endif } static inline void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, int *x_sum, int64_t *x2_sum, int loop_cycles) { const int16_t *data_tmp; __m128i temp_buffer1, temp_buffer2; __m128i one = _mm_set1_epi16(1); __m128i regx_sum = _mm_setzero_si128(); __m128i regx2_sum = regx_sum; __m128i load_pixels, sum_buffer, sse_buffer; for (int i = 0; i < loop_cycles; ++i) { data_tmp = data + (8 * i); for (int j = 0; j < bh; ++j) { // Load 1 row (8-pixels) at a time. load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp)); sum_buffer = _mm_madd_epi16(load_pixels, one); sse_buffer = _mm_madd_epi16(load_pixels, load_pixels); regx_sum = _mm_add_epi32(sum_buffer, regx_sum); regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); data_tmp += stride; } } regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); *x_sum += _mm_cvtsi128_si32(regx_sum); temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); #if AOM_ARCH_X86_64 *x2_sum += _mm_cvtsi128_si64(regx2_sum); #else { int64_t tmp; _mm_storel_epi64((__m128i *)&tmp, regx2_sum); *x2_sum += tmp; } #endif } // This functions adds SSE2 Support for the functions 'get_blk_sse_sum_c' void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum) { *x_sum = 0; *x2_sum = 0; if ((bh & 3) == 0) { switch (bw) { case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break; case 8: case 16: sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); break; // For widths 32 and 64, the registers may overflow. So compute // partial widths at a time. 
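// Each 32-bit lane accumulates bh * (bw / 8) pairwise sums of squares, so
// taller wide blocks are processed in slices: 32-wide blocks with bh > 32
// are split into two 32-row halves, and 64-wide blocks with bh > 16 are
// handled 16 rows at a time.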
case 32: if (bh <= 32) { sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); break; } else { sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3); sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum, bw >> 3); break; } case 64: if (bh <= 16) { sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); break; } else { for (int i = 0; i < bh; i += 16) sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum, bw >> 3); break; } default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); } } else { aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); } } aom-3.12.1/aom_dsp/x86/common_avx2.h000066400000000000000000000154131477627663500170550ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ #define AOM_AOM_DSP_X86_COMMON_AVX2_H_ #include #include "config/aom_config.h" // Note: in and out could have the same value static inline void mm256_transpose_16x16(const __m256i *in, __m256i *out) { __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); __m256i tr1_7 = 
_mm256_unpackhi_epi32(tr0_5, tr0_7); __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); out[6] = 
_mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); } #endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ aom-3.12.1/aom_dsp/x86/convolve.h000066400000000000000000000340171477627663500164610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ #define AOM_AOM_DSP_X86_CONVOLVE_H_ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ void aom_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h) { \ (void)filter_x; \ (void)x_step_q4; \ (void)filter_y; \ (void)y_step_q4; \ assert((-128 <= filter[3]) && (filter[3] <= 127)); \ assert(step_q4 == 16); \ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ (filter[2] | filter[5])) { \ while (w >= 16) { \ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else { \ while (w >= 16) { \ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ dst_stride, h, filter); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ dst_stride, h, filter); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ dst_stride, h, filter); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } \ if (w) { \ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ x_step_q4, filter_y, y_step_q4, w, h); \ } \ } #if CONFIG_AV1_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr, ptrdiff_t out_pitch, unsigned int output_height, 
const int16_t *filter, int bd); #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ void aom_highbd_convolve8_##name##_##opt( \ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ if (step_q4 == 16 && filter[3] != 128) { \ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ (filter[2] | filter[5])) { \ while (w >= 16) { \ aom_highbd_filter_block1d16_##dir##4_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } else { \ while (w >= 16) { \ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ src, src_stride, dst, dst_stride, h, filter, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ src, src_stride, dst, dst_stride, h, filter, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ src, src_stride, dst, dst_stride, h, filter, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ } \ } \ if (w) { \ aom_highbd_convolve8_##name##_c( \ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ } \ } #endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AOM_DSP_X86_CONVOLVE_H_ aom-3.12.1/aom_dsp/x86/convolve_avx2.h000066400000000000000000001604141477627663500174220ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ #define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ #include #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" // filters for 16 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 }; DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 }; DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; #define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ for (i = 0; i < (im_h - 2); i += 2) { \ __m256i data = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ data = _mm256_inserti128_si256( \ data, \ _mm_loadu_si128( \ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ 1); \ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \ res = \ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ } \ __m256i data_1 = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); #define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ __m256i s[6]; \ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ \ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ \ for (i = 0; i < h; i += 2) { \ const int16_t *data = &im_block[i * im_stride]; \ const __m256i s4 = _mm256_loadu_si256((__m256i 
*)(data + 4 * im_stride)); \ const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ s[2] = _mm256_unpacklo_epi16(s4, s5); \ s[5] = _mm256_unpackhi_epi16(s4, s5); \ \ __m256i res_a = convolve_4tap(s, coeffs_v + 1); \ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \ \ res_a = \ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ res_b = \ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ const __m256i res_a_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ const __m256i res_b_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ \ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ if (w - j > 4) { \ _mm_storel_epi64(p_0, res_0); \ _mm_storel_epi64(p_1, res_1); \ } else if (w == 4) { \ xx_storel_32(p_0, res_0); \ xx_storel_32(p_1, res_1); \ } else { \ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ } \ \ s[0] = s[1]; \ s[1] = s[2]; \ s[3] = s[4]; \ s[4] = s[5]; \ } #define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ for (i = 0; i < (im_h - 2); i += 2) { \ __m256i data = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ data = _mm256_inserti128_si256( \ data, \ _mm_loadu_si128( \ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ 1); \ \ __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \ res = \ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ } \ \ __m256i data_1 = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ \ __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \ \ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); #define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ \ __m256i s[8]; \ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ \ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ \ for (i = 0; i < h; i += 2) { \ const int16_t *data = &im_block[i * im_stride]; \ \ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ \ s[2] = _mm256_unpacklo_epi16(s6, s7); \ s[5] = _mm256_unpackhi_epi16(s6, s7); \ \ __m256i res_a = convolve_6tap(s, coeffs_v); \ __m256i res_b = convolve_6tap(s + 3, coeffs_v); \ \ res_a = \ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ res_b = \ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ \ const __m256i res_a_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ const __m256i res_b_round = _mm256_sra_epi32( \ 
_mm256_add_epi32(res_b, round_const_v), round_shift_v); \ \ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ \ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ \ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ if (w - j > 4) { \ _mm_storel_epi64(p_0, res_0); \ _mm_storel_epi64(p_1, res_1); \ } else if (w == 4) { \ xx_storel_32(p_0, res_0); \ xx_storel_32(p_1, res_1); \ } else { \ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ } \ \ s[0] = s[1]; \ s[1] = s[2]; \ \ s[3] = s[4]; \ s[4] = s[5]; \ } #define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ for (i = 0; i < (im_h - 2); i += 2) { \ __m256i data = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ data = _mm256_inserti128_si256( \ data, \ _mm_loadu_si128( \ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ 1); \ \ __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ res = \ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ } \ \ __m256i data_1 = _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ \ __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ \ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); #define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ \ __m256i s[8]; \ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ \ s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ \ for (i = 0; i < h; i += 2) { \ const int16_t *data = &im_block[i * im_stride]; \ \ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ \ s[3] = _mm256_unpacklo_epi16(s6, s7); \ s[7] = _mm256_unpackhi_epi16(s6, s7); \ \ __m256i res_a = convolve(s, coeffs_v); \ __m256i res_b = convolve(s + 4, coeffs_v); \ \ res_a = \ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ res_b = \ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ \ const __m256i res_a_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ const __m256i res_b_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ \ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ \ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ \ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + 
j]; \ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ if (w - j > 4) { \ _mm_storel_epi64(p_0, res_0); \ _mm_storel_epi64(p_1, res_1); \ } else if (w == 4) { \ xx_storel_32(p_0, res_0); \ xx_storel_32(p_1, res_1); \ } else { \ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ } \ \ s[0] = s[1]; \ s[1] = s[2]; \ s[2] = s[3]; \ \ s[4] = s[5]; \ s[5] = s[6]; \ s[6] = s[7]; \ } #define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \ const __m256i v_zero = _mm256_setzero_si256(); \ __m256i s[12]; \ if (w <= 4) { \ for (i = 0; i < im_h; i += 2) { \ const __m256i data = _mm256_permute2x128_si256( \ _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ _mm256_castsi128_si256(_mm_loadu_si128( \ (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \ 0x20); \ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ \ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ \ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ \ const __m256i res_lo = convolve_12taps(s, coeffs_h); \ \ __m256i res_32b_lo = _mm256_sra_epi32( \ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \ const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \ if (w > 2) { \ _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \ res_1); \ } else { \ uint32_t horiz_2; \ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \ im_block[i * im_stride] = (uint16_t)horiz_2; \ im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \ im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \ im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ } \ } \ } else { \ for (i = 0; i < im_h; i++) { \ const __m256i data = _mm256_permute2x128_si256( \ _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ _mm256_castsi128_si256( \ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \ 0x20); \ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ \ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ \ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ \ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ \ const __m256i res_lo = convolve_12taps(s, coeffs_h); \ \ __m256i res_32b_lo = _mm256_sra_epi32( \ _mm256_add_epi32(res_lo, round_const_h12), 
round_shift_h12); \ \ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ _mm_store_si128((__m128i *)&im_block[i * im_stride], \ _mm256_extracti128_si256( \ _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \ } \ } #define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ \ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ \ s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ \ for (i = 0; i < h; i += 2) { \ const int16_t *data = &im_block[i * im_stride]; \ \ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ \ s[5] = _mm256_unpacklo_epi16(s6, s7); \ s[11] = _mm256_unpackhi_epi16(s6, s7); \ \ __m256i res_a = convolve_12taps(s, coeffs_v); \ __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ \ res_a = \ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ res_b = \ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ \ const __m256i res_a_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ const __m256i res_b_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ \ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ \ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ \ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ if (w - j > 4) { \ _mm_storel_epi64(p_0, res_0); \ _mm_storel_epi64(p_1, res_1); \ } else if (w == 4) { \ xx_storel_32(p_0, res_0); \ xx_storel_32(p_1, res_1); \ } else { \ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ } \ \ s[0] = s[1]; \ s[1] = s[2]; \ s[2] = s[3]; \ s[3] = s[4]; \ s[4] = s[5]; \ \ s[6] = s[7]; \ s[7] = s[8]; \ s[8] = s[9]; \ s[9] = s[10]; \ s[10] = s[11]; \ } #define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ do { \ for (i = 0; i < im_h; i += 2) { \ __m256i data = \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ if (i + 1 < im_h) \ data = _mm256_inserti128_si256( \ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ src_h += (src_stride << 1); \ __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ \ res = 
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ round_shift_h); \ \ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ } \ } while (0) #define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ do { \ __m256i s[8]; \ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ \ s[0] = _mm256_unpacklo_epi16(s0, s1); \ s[1] = _mm256_unpacklo_epi16(s2, s3); \ s[2] = _mm256_unpacklo_epi16(s4, s5); \ \ s[4] = _mm256_unpackhi_epi16(s0, s1); \ s[5] = _mm256_unpackhi_epi16(s2, s3); \ s[6] = _mm256_unpackhi_epi16(s4, s5); \ \ for (i = 0; i < h; i += 2) { \ const int16_t *data = &im_block[i * im_stride]; \ \ const __m256i s6 = \ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ const __m256i s7 = \ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ \ s[3] = _mm256_unpacklo_epi16(s6, s7); \ s[7] = _mm256_unpackhi_epi16(s6, s7); \ \ const __m256i res_a = convolve(s, coeffs_y); \ const __m256i res_a_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ \ if (w - j > 4) { \ const __m256i res_b = convolve(s + 4, coeffs_y); \ const __m256i res_b_round = _mm256_sra_epi32( \ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ \ if (do_average) { \ const __m256i data_ref_0 = \ load_line2_avx2(&dst[i * dst_stride + j], \ &dst[i * dst_stride + j + dst_stride]); \ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ &wt, use_dist_wtd_comp_avg); \ \ const __m256i round_result = convolve_rounding( \ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ \ const __m256i res_8 = \ _mm256_packus_epi16(round_result, round_result); \ const __m128i res_0 = _mm256_castsi256_si128(res_8); \ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ \ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ _mm_storel_epi64( \ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ } else { \ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ \ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ res_1); \ } \ } else { \ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ \ if (do_average) { \ const __m256i data_ref_0 = \ load_line2_avx2(&dst[i * dst_stride + j], \ &dst[i * dst_stride + j + dst_stride]); \ \ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ &wt, use_dist_wtd_comp_avg); \ \ const __m256i round_result = convolve_rounding( \ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ \ const __m256i res_8 = \ _mm256_packus_epi16(round_result, round_result); \ const __m128i res_0 = _mm256_castsi256_si128(res_8); \ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ \ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ 
_mm_cvtsi128_si32(res_1); \ \ } else { \ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ \ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ res_1); \ } \ } \ \ s[0] = s[1]; \ s[1] = s[2]; \ s[2] = s[3]; \ \ s[4] = s[5]; \ s[5] = s[6]; \ s[6] = s[7]; \ } \ } while (0) static inline void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); // right shift all filter co-efficients by 1 to reduce the bits required. // This extra right shift will be taken care of at the end while rounding // the result. // Since all filter co-efficients are even, this change will not affect the // end result assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), _mm_set1_epi16((short)0xffff))); const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); // coeffs 0 1 0 1 0 1 0 1 coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); // coeffs 2 3 2 3 2 3 2 3 coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); // coeffs 4 5 4 5 4 5 4 5 coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); // coeffs 6 7 6 7 6 7 6 7 coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); } static inline void prepare_coeffs_6t_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); // right shift all filter co-efficients by 1 to reduce the bits required. // This extra right shift will be taken care of at the end while rounding // the result. 
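// (Halving the taps also keeps every coefficient within the signed 8-bit
// operand range of _mm256_maddubs_epi16 used by the convolve_lowbd kernels,
// so the saturating 16-bit pairwise products stay in range.)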
// Since all filter co-efficients are even, this change will not affect the // end result assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), _mm_set1_epi16((int16_t)0xffff))); const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); // coeffs 1 2 1 2 1 2 1 2 coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); // coeffs 3 4 3 4 3 4 3 4 coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); // coeffs 5 6 5 6 5 6 5 6 coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); } static inline void prepare_coeffs_6t( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); // coeffs 1 2 1 2 1 2 1 2 coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 3 4 3 4 3 4 3 4 coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 5 6 5 6 5 6 5 6 coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); } static inline void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); // coeffs 0 1 0 1 0 1 0 1 coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 2 3 2 3 2 3 2 3 coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 4 5 4 5 4 5 4 5 coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); // coeffs 6 7 6 7 6 7 6 7 coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); } static inline void prepare_coeffs_12taps( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); // coeffs 0 1 0 1 0 1 0 1 coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 2 3 2 3 2 3 2 3 coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 4 5 4 5 4 5 4 5 coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); // coeffs 6 7 6 7 6 7 6 7 coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); // coeffs 8 9 10 11 0 0 0 0 coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); coeff = _mm256_broadcastq_epi64(coeff_8); coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 
10 11 } static inline __m256i convolve_lowbd(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), _mm256_add_epi16(res_23, res_67)); return res; } static inline __m256i convolve_lowbd_6tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); return res; } static inline __m256i convolve_lowbd_4tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 const __m256i res = _mm256_add_epi16(res_45, res_23); return res; } static inline __m256i convolve_6tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); return res; } static inline __m256i convolve_12taps(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]); const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]); const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), _mm256_add_epi32(res_2, res_3)); const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1); return res; } static inline __m256i convolve(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), _mm256_add_epi32(res_2, res_3)); return res; } static inline __m256i convolve_4tap(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); const __m256i res = _mm256_add_epi32(res_1, res_2); return res; } static inline __m256i convolve_lowbd_x(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[4]; s[0] = _mm256_shuffle_epi8(data, filt[0]); s[1] = _mm256_shuffle_epi8(data, filt[1]); s[2] = _mm256_shuffle_epi8(data, filt[2]); s[3] = _mm256_shuffle_epi8(data, filt[3]); return convolve_lowbd(s, coeffs); } static inline __m256i convolve_lowbd_x_6tap(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[4]; s[0] = _mm256_shuffle_epi8(data, filt[0]); s[1] = _mm256_shuffle_epi8(data, filt[1]); s[2] = 
_mm256_shuffle_epi8(data, filt[2]); return convolve_lowbd_6tap(s, coeffs); } static inline __m256i convolve_lowbd_x_4tap(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { __m256i s[2]; s[0] = _mm256_shuffle_epi8(data, filt[0]); s[1] = _mm256_shuffle_epi8(data, filt[1]); return convolve_lowbd_4tap(s, coeffs); } static inline void add_store_aligned_256(CONV_BUF_TYPE *const dst, const __m256i *const res, const int do_average) { __m256i d; if (do_average) { d = _mm256_load_si256((__m256i *)dst); d = _mm256_add_epi32(d, *res); d = _mm256_srai_epi32(d, 1); } else { d = *res; } _mm256_store_si256((__m256i *)dst, d); } static inline __m256i comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt, const int use_dist_wtd_comp_avg) { __m256i res; if (use_dist_wtd_comp_avg) { const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); res = _mm256_packs_epi32(res_lo, res_hi); } else { const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); res = _mm256_srai_epi16(wt_res, 1); } return res; } static inline __m256i convolve_rounding(const __m256i *const res_unsigned, const __m256i *const offset_const, const __m256i *const round_const, const int round_shift) { const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); const __m256i res_round = _mm256_srai_epi16( _mm256_add_epi16(res_signed, *round_const), round_shift); return res_round; } static inline __m256i highbd_comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt0, const __m256i *const wt1, const int use_dist_wtd_comp_avg) { __m256i res; if (use_dist_wtd_comp_avg) { const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); } else { const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); res = _mm256_srai_epi32(wt_res, 1); } return res; } static inline __m256i highbd_convolve_rounding( const __m256i *const res_unsigned, const __m256i *const offset_const, const __m256i *const round_const, const int round_shift) { const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); const __m256i res_round = _mm256_srai_epi32( _mm256_add_epi32(res_signed, *round_const), round_shift); return res_round; } #endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ aom-3.12.1/aom_dsp/x86/convolve_common_intrin.h000066400000000000000000000101451477627663500214100ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ #define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ // Note: // This header file should be put below any x86 intrinsics head file static inline void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, const int do_average) { __m128i d; if (do_average) { d = _mm_load_si128((__m128i *)dst); d = _mm_add_epi32(d, *res); d = _mm_srai_epi32(d, 1); } else { d = *res; } _mm_store_si128((__m128i *)dst, d); } static inline void prepare_coeffs_12tap(const InterpFilterParams *filter_params, int subpel_q4, __m128i *coeffs /* [6] */) { const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 0 1 0 1 0 1 0 1 coeffs[1] = _mm_shuffle_epi32(coeffs_y, 85); // coeffs 2 3 2 3 2 3 2 3 coeffs[2] = _mm_shuffle_epi32(coeffs_y, 170); // coeffs 4 5 4 5 4 5 4 5 coeffs[3] = _mm_shuffle_epi32(coeffs_y, 255); // coeffs 6 7 6 7 6 7 6 7 coeffs_y = _mm_loadl_epi64((__m128i *)(y_filter + 8)); coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 8 9 8 9 8 9 8 9 coeffs[5] = _mm_shuffle_epi32(coeffs_y, 85); // coeffs 10 11 10 11 10 11 10 11 } static inline __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) { const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); const __m128i d4 = _mm_madd_epi16(s[4], coeffs[4]); const __m128i d5 = _mm_madd_epi16(s[5], coeffs[5]); const __m128i d_0123 = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); const __m128i d = _mm_add_epi32(_mm_add_epi32(d4, d5), d_0123); return d; } static inline __m128i convolve_lo_x_12tap(const __m128i *s, const __m128i *coeffs, const __m128i zero) { __m128i ss[6]; ss[0] = _mm_unpacklo_epi8(s[0], zero); // 0 1 1 2 2 3 3 4 ss[1] = _mm_unpacklo_epi8(s[1], zero); // 2 3 3 4 4 5 5 6 ss[2] = _mm_unpacklo_epi8(s[2], zero); // 4 5 5 6 6 7 7 8 ss[3] = _mm_unpacklo_epi8(s[3], zero); // 6 7 7 8 8 9 9 10 ss[4] = _mm_unpackhi_epi8(s[2], zero); // 8 9 9 10 10 11 11 12 ss[5] = _mm_unpackhi_epi8(s[3], zero); // 10 11 11 12 12 13 13 14 return convolve_12tap(ss, coeffs); } static inline __m128i convolve_lo_y_12tap(const __m128i *s, const __m128i *coeffs) { __m128i ss[6]; const __m128i zero = _mm_setzero_si128(); ss[0] = _mm_unpacklo_epi8(s[0], zero); ss[1] = _mm_unpacklo_epi8(s[2], zero); ss[2] = _mm_unpacklo_epi8(s[4], zero); ss[3] = _mm_unpacklo_epi8(s[6], zero); ss[4] = _mm_unpacklo_epi8(s[8], zero); ss[5] = _mm_unpacklo_epi8(s[10], zero); return convolve_12tap(ss, coeffs); } static inline __m128i convolve_hi_y_12tap(const __m128i *s, const __m128i *coeffs) { __m128i ss[6]; const __m128i zero = _mm_setzero_si128(); ss[0] = _mm_unpackhi_epi8(s[0], zero); ss[1] = _mm_unpackhi_epi8(s[2], zero); ss[2] = _mm_unpackhi_epi8(s[4], zero); ss[3] = _mm_unpackhi_epi8(s[6], zero); ss[4] = _mm_unpackhi_epi8(s[8], zero); ss[5] = _mm_unpackhi_epi8(s[10], zero); return convolve_12tap(ss, coeffs); } #endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ aom-3.12.1/aom_dsp/x86/convolve_sse2.h000066400000000000000000000114401477627663500174100ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ #define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ #include "config/aom_scale_rtcd.h" // Note: // This header file should be put below any x86 intrinsics head file static inline void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff = _mm_loadu_si128((__m128i *)filter); // coeffs 0 1 0 1 0 1 0 1 coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); // coeffs 2 3 2 3 2 3 2 3 coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); // coeffs 4 5 4 5 4 5 4 5 coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); // coeffs 6 7 6 7 6 7 6 7 coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); } static inline __m128i convolve(const __m128i *const s, const __m128i *const coeffs) { const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); const __m128i res = _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); return res; } static inline __m128i convolve_lo_x(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); return convolve(ss, coeffs); } static inline __m128i convolve_lo_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); return convolve(ss, coeffs); } static inline __m128i convolve_hi_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); return convolve(ss, coeffs); } static inline __m128i comp_avg(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt, const int use_dist_wtd_avg) { __m128i res; if (use_dist_wtd_avg) { const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); res = _mm_packs_epi32(res_lo, res_hi); } else { const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); res = _mm_srai_epi16(wt_res, 1); } return res; } static inline __m128i convolve_rounding(const __m128i *const res_unsigned, const __m128i *const offset_const, const __m128i *const round_const, const int round_shift) { const __m128i res_signed = _mm_sub_epi16(*res_unsigned, 
*offset_const); const __m128i res_round = _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); return res_round; } static inline __m128i highbd_convolve_rounding_sse2( const __m128i *const res_unsigned, const __m128i *const offset_const, const __m128i *const round_const, const int round_shift) { const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); const __m128i res_round = _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); return res_round; } #endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ aom-3.12.1/aom_dsp/x86/convolve_sse4_1.h000066400000000000000000000041001477627663500176250ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ #define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ // Note: // This header file should be put below any x86 intrinsics head file static inline void mult_add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, const __m128i *const wt0, const __m128i *const wt1, const int do_average) { __m128i d; if (do_average) { d = _mm_load_si128((__m128i *)dst); d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); d = _mm_srai_epi32(d, DIST_PRECISION_BITS); } else { d = *res; } _mm_store_si128((__m128i *)dst, d); } static inline __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt0, const __m128i *const wt1, const int use_dist_wtd_avg) { __m128i res; if (use_dist_wtd_avg) { const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); } else { const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); res = _mm_srai_epi32(wt_res, 1); } return res; } #endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ aom-3.12.1/aom_dsp/x86/convolve_ssse3.h000066400000000000000000000041031477627663500175720ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ #define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ #include // SSSE3 static inline void shuffle_filter_ssse3(const int16_t *const filter, __m128i *const f) { const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); } static inline __m128i convolve8_8_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); __m128i sum1, sum2; // sum the results together, saturating only on the final step // adding x0 with x2 and x1 with x3 is the only order that prevents // outranges for all filters sum1 = _mm_add_epi16(x0, x2); sum2 = _mm_add_epi16(x1, x3); // add the rounding offset early to avoid another saturated add sum1 = _mm_add_epi16(sum1, k_64); sum1 = _mm_adds_epi16(sum1, sum2); // shift by 7 bit each 16 bit sum1 = _mm_srai_epi16(sum1, 7); return sum1; } #endif // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ aom-3.12.1/aom_dsp/x86/fft_avx2.c000066400000000000000000000064641477627663500163450ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" extern void aom_transpose_float_sse2(const float *A, float *B, int n); extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, int n); // Generate the 1d forward transforms for float using _mm256 GEN_FFT_8(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) GEN_FFT_16(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) GEN_FFT_32(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); } void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); } void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); } // Generate the 1d inverse transforms for float using _mm256 GEN_IFFT_8(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) GEN_IFFT_16(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) GEN_IFFT_32(static inline void, avx2, float, __m256, _mm256_load_ps, _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, _mm256_mul_ps) void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); } void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, aom_fft1d_16_avx2, aom_ifft1d_16_avx2, aom_transpose_float_sse2, 8); } void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, aom_fft1d_32_avx2, aom_ifft1d_32_avx2, aom_transpose_float_sse2, 8); } aom-3.12.1/aom_dsp/x86/fft_sse2.c000066400000000000000000000162041477627663500163320ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" static inline void transpose4x4(const float *A, float *B, const int lda, const int ldb) { __m128 row1 = _mm_load_ps(&A[0 * lda]); __m128 row2 = _mm_load_ps(&A[1 * lda]); __m128 row3 = _mm_load_ps(&A[2 * lda]); __m128 row4 = _mm_load_ps(&A[3 * lda]); _MM_TRANSPOSE4_PS(row1, row2, row3, row4); _mm_store_ps(&B[0 * ldb], row1); _mm_store_ps(&B[1 * ldb], row2); _mm_store_ps(&B[2 * ldb], row3); _mm_store_ps(&B[3 * ldb], row4); } // Referenced by fft_avx2.c. void aom_transpose_float_sse2(const float *A, float *B, int n); void aom_transpose_float_sse2(const float *A, float *B, int n) { for (int y = 0; y < n; y += 4) { for (int x = 0; x < n; x += 4) { transpose4x4(A + y * n + x, B + x * n + y, n, n); } } } // Referenced by fft_avx2.c. void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n); void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { const int n2 = n / 2; output[0] = packed[0]; output[1] = 0; output[2 * (n2 * n)] = packed[n2 * n]; output[2 * (n2 * n) + 1] = 0; output[2 * n2] = packed[n2]; output[2 * n2 + 1] = 0; output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; output[2 * (n2 * n + n2) + 1] = 0; for (int c = 1; c < n2; ++c) { output[2 * (0 * n + c)] = packed[c]; output[2 * (0 * n + c) + 1] = packed[c + n2]; output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; } for (int r = 1; r < n2; ++r) { output[2 * (r * n + 0)] = packed[r * n]; output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; output[2 * (r * n + n2) + 0] = packed[r * n + n2]; output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; for (int c = 1; c < AOMMIN(n2, 4); ++c) { output[2 * (r * n + c)] = packed[r * n + c] - packed[(r + n2) * n + c + n2]; output[2 * (r * n + c) + 1] = packed[(r + n2) * n + c] + packed[r * n + c + n2]; } for (int c = 4; c < n2; c += 4) { __m128 real1 = _mm_load_ps(packed + r * n + c); __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); real1 = _mm_sub_ps(real1, real2); imag1 = _mm_add_ps(imag1, imag2); _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); } int r2 = r + n2; int r3 = n - r2; output[2 * (r2 * n + 0)] = packed[r3 * n]; output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; for (int c = 1; c < AOMMIN(4, n2); ++c) { output[2 * (r2 * n + c)] = packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; output[2 * (r2 * n + c) + 1] = -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; } for (int c = 4; c < n2; c += 4) { __m128 real1 = _mm_load_ps(packed + r3 * n + c); __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); real1 = _mm_add_ps(real1, real2); imag1 = _mm_sub_ps(imag2, imag1); _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); _mm_store_ps(output + 2 * (r2 * n + c + 2), _mm_unpackhi_ps(real1, imag1)); } } } // Generate definitions for 1d transforms using float and __mm128 GEN_FFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps) GEN_FFT_8(static inline void, sse2, float, 
__m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) GEN_FFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) GEN_FFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); } void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); } void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); } void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); } // Generate definitions for 1d inverse transforms using float and mm128 GEN_IFFT_4(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps) GEN_IFFT_8(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) GEN_IFFT_16(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) GEN_IFFT_32(static inline void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); } void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); } void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, aom_fft1d_16_sse2, aom_ifft1d_16_sse2, aom_transpose_float_sse2, 4); } void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, aom_fft1d_32_sse2, aom_ifft1d_32_sse2, aom_transpose_float_sse2, 4); } aom-3.12.1/aom_dsp/x86/fwd_txfm_impl_sse2.h000066400000000000000000000567731477627663500204360ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include "config/aom_dsp_rtcd.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_ports/mem.h" // TODO(jingning) The high bit-depth functions need rework for performance. 
// After we properly fix the high bit-depth function implementations, this // file's dependency should be substantially simplified. #if DCT_HIGH_BIT_DEPTH #define ADD_EPI16 _mm_adds_epi16 #define SUB_EPI16 _mm_subs_epi16 #else #define ADD_EPI16 _mm_add_epi16 #define SUB_EPI16 _mm_sub_epi16 #endif #if defined(FDCT4x4_2D_HELPER) static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, __m128i *in1) { // Constants // These are the coefficients used for the multiplies. // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), // where cospi_N_64 = cos(N pi /64) const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // This second rounding constant saves doing some extra adds at the end const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); // Load inputs. *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); *in1 = _mm_unpacklo_epi64( *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); *in0 = _mm_unpacklo_epi64( *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); // in0 = [i0 i1 i2 i3 iC iD iE iF] // in1 = [i4 i5 i6 i7 i8 i9 iA iB] // multiply by 16 to give some extra precision *in0 = _mm_slli_epi16(*in0, 4); *in1 = _mm_slli_epi16(*in1, 4); // if (i == 0 && input[0]) input[0] += 1; // add 1 to the upper left pixel if it is non-zero, which helps reduce // the round-trip error { // The mask will only contain whether the first value is zero, all // other comparison will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); *in0 = _mm_add_epi16(*in0, mask); *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); } // There are 4 total stages, alternating between an add/subtract stage // followed by an multiply-and-add stage. 
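  // As a rough scalar sketch (assuming the same butterfly ordering as the
  // generic C path, aom_fdct4x4_c), one 1-D 4-point column computes:
  //   s0 = i0 + i3;  s1 = i1 + i2;  s2 = i1 - i2;  s3 = i0 - i3;
  //   o0 = (s0 + s1) * cospi_16_64;   o2 = (s0 - s1) * cospi_16_64;
  //   o1 = s2 * cospi_24_64 + s3 * cospi_8_64;
  //   o3 = s3 * cospi_24_64 - s2 * cospi_8_64;
  // with each product sum rounded as (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.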
{ // Stage 1: Add/subtract // in0 = [i0 i1 i2 i3 iC iD iE iF] // in1 = [i4 i5 i6 i7 i8 i9 iA iB] const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] // r1 = [iC i8 iD i9 iE iA iF iB] const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] // r3 = [iC i8 iD i9 iF iB iE iA] const __m128i t0 = _mm_add_epi16(r2, r3); const __m128i t1 = _mm_sub_epi16(r2, r3); // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] // t1 = [aC a8 aD a9 aF aB aE aA] // Stage 2: multiply by constants (which gets us into 32 bits). // The constants needed here are: // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); // Then add and right-shift to get back to 16-bit range const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // w0 = [b0 b1 b7 b6] // w1 = [b8 b9 bF bE] // w2 = [b4 b5 b3 b2] // w3 = [bC bD bB bA] const __m128i x0 = _mm_packs_epi32(w0, w1); const __m128i x1 = _mm_packs_epi32(w2, w3); // x0 = [b0 b1 b7 b6 b8 b9 bF bE] // x1 = [b4 b5 b3 b2 bC bD bB bA] *in0 = _mm_shuffle_epi32(x0, 0xD8); *in1 = _mm_shuffle_epi32(x1, 0x8D); // in0 = [b0 b1 b8 b9 b7 b6 bF bE] // in1 = [b3 b2 bB bA b4 b5 bC bD] } { // vertical DCTs finished. Now we do the horizontal DCTs. // Stage 3: Add/subtract const __m128i t0 = ADD_EPI16(*in0, *in1); const __m128i t1 = SUB_EPI16(*in0, *in1); // Stage 4: multiply by constants (which gets us into 32 bits). { // The constants needed here are: // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); // Then add and right-shift to get back to 16-bit range // but this combines the final right-shift as well to save operations // This unusual rounding operations is to maintain bit-accurate // compatibility with the c version of this function which has two // rounding steps in a row. 
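        // In scalar terms, the two rounding steps of the C version,
        //   y   = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
        //   out = (y + 1) >> 2
        // fold into the single step used here,
        //   out = (x + 3 * DCT_CONST_ROUNDING) >> (DCT_CONST_BITS + 2),
        // which gives identical results for arithmetic (floor) shifts.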
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); *in0 = _mm_packs_epi32(w0, w2); *in1 = _mm_packs_epi32(w1, w3); } } } #endif // defined(FDCT4x4_2D_HELPER) #if defined(FDCT4x4_2D) void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given // by Chen, Smith and Fralick ('77). The commands for moving the data // around have been minimized by hand. // For the purposes of the comments, the 16 inputs are referred to at i0 // through iF (in raster order), intermediate variables are a0, b0, c0 // through f, and correspond to the in-place computations mapped to input // locations. The outputs, o0 through oF are labeled according to the // output locations. __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); // Post-condition (v + 1) >> 2 is now incorporated into previous // add and right-shift commands. Only 2 store instructions needed // because we are using the fact that 1/3 are stored just after 0/2. storeu_output(&in0, output + 0 * 4); storeu_output(&in1, output + 2 * 4); } #endif // defined(FDCT4x4_2D) #if defined(FDCT4x4_2D_LP) void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } #endif // defined(FDCT4x4_2D_LP) #if CONFIG_INTERNAL_STATS void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
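  // Each pair constant is consumed by _mm_madd_epi16: with the inputs
  // interleaved as (x0, y0, x1, y1, ...) by the unpack steps below, every
  // 32-bit lane of _mm_madd_epi16(t, pair_set_epi16(a, b)) holds x * a + y * b,
  // i.e. one rotation term of the butterfly per lane.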
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); #if DCT_HIGH_BIT_DEPTH int overflow; #endif // Load input __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); in2 = _mm_slli_epi16(in2, 2); in3 = _mm_slli_epi16(in3, 2); in4 = _mm_slli_epi16(in4, 2); in5 = _mm_slli_epi16(in5, 2); in6 = _mm_slli_epi16(in6, 2); in7 = _mm_slli_epi16(in7, 2); // We do two passes, first the columns, then the rows. The results of the // first pass are transposed so that the same column code can be reused. The // results of the second pass are also transposed so that the rows (processed // as columns) are put back in row positions. for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. 
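    // (Together with the transposes, the two passes realize the separable
    // 2-D transform T * X * T', applying the same 1-D 8-point column code in
    // both passes, up to the intermediate rounding.)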
__m128i res0, res1, res2, res3, res4, res5, res6, res7; // Add/subtract const __m128i q0 = ADD_EPI16(in0, in7); const __m128i q1 = ADD_EPI16(in1, in6); const __m128i q2 = ADD_EPI16(in2, in5); const __m128i q3 = ADD_EPI16(in3, in4); const __m128i q4 = SUB_EPI16(in3, in4); const __m128i q5 = SUB_EPI16(in2, in5); const __m128i q6 = SUB_EPI16(in1, in6); const __m128i q7 = SUB_EPI16(in0, in7); #if DCT_HIGH_BIT_DEPTH if (pass == 1) { overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } } #endif // DCT_HIGH_BIT_DEPTH // Work on first four results { // Add/subtract const __m128i r0 = ADD_EPI16(q0, q3); const __m128i r1 = ADD_EPI16(q1, q2); const __m128i r2 = SUB_EPI16(q1, q2); const __m128i r3 = SUB_EPI16(q0, q3); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } #endif // DCT_HIGH_BIT_DEPTH // Interleave to do the multiply by constants which gets us into 32bits { const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res0 = _mm_packs_epi32(w0, w1); res4 = _mm_packs_epi32(w2, w3); res2 = _mm_packs_epi32(w4, w5); res6 = _mm_packs_epi32(w6, w7); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } #endif // DCT_HIGH_BIT_DEPTH } } // Work on next four results { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, 
k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x2(&r0, &r1); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } #endif // DCT_HIGH_BIT_DEPTH { // Add/subtract const __m128i x0 = ADD_EPI16(q4, r0); const __m128i x1 = SUB_EPI16(q4, r0); const __m128i x2 = SUB_EPI16(q7, r1); const __m128i x3 = ADD_EPI16(q7, r1); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } #endif // DCT_HIGH_BIT_DEPTH // Interleave to do the multiply by constants which gets us into 32bits { const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res1 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); res5 = _mm_packs_epi32(w4, w5); res3 = _mm_packs_epi32(w6, w7); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); if (overflow) { aom_highbd_fdct8x8_c(input, output, stride); return; } #endif // DCT_HIGH_BIT_DEPTH } } } // Transpose the 8x8. 
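    // The transpose is built from three rounds of interleaves: 16-bit unpacks
    // form 2x2 blocks, 32-bit unpacks form 4x4 blocks, and 64-bit unpacks
    // assemble the final 8x8 (see the lane maps in the comments below).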
{ // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // Post-condition output and store it { // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); const __m128i sign_in1 = _mm_srai_epi16(in1, 15); const __m128i sign_in2 = _mm_srai_epi16(in2, 15); const __m128i sign_in3 = _mm_srai_epi16(in3, 15); const __m128i sign_in4 = _mm_srai_epi16(in4, 15); const __m128i sign_in5 = _mm_srai_epi16(in5, 15); const __m128i sign_in6 = _mm_srai_epi16(in6, 15); const __m128i sign_in7 = _mm_srai_epi16(in7, 15); in0 = _mm_sub_epi16(in0, sign_in0); in1 = _mm_sub_epi16(in1, sign_in1); in2 = _mm_sub_epi16(in2, sign_in2); in3 = _mm_sub_epi16(in3, sign_in3); in4 = _mm_sub_epi16(in4, sign_in4); in5 = _mm_sub_epi16(in5, sign_in5); in6 = _mm_sub_epi16(in6, sign_in6); in7 = _mm_sub_epi16(in7, sign_in7); in0 = _mm_srai_epi16(in0, 1); in1 = _mm_srai_epi16(in1, 1); in2 = _mm_srai_epi16(in2, 1); in3 = _mm_srai_epi16(in3, 1); in4 = _mm_srai_epi16(in4, 1); in5 = _mm_srai_epi16(in5, 1); in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results store_output(&in0, (output + 0 * 8)); store_output(&in1, (output + 1 * 8)); store_output(&in2, (output + 2 * 8)); store_output(&in3, (output + 3 * 8)); store_output(&in4, (output + 4 * 8)); store_output(&in5, (output + 5 * 8)); store_output(&in6, (output + 6 * 8)); store_output(&in7, (output + 7 * 8)); } } #endif // CONFIG_INTERNAL_STATS #undef ADD_EPI16 #undef SUB_EPI16 
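// For reference, a minimal scalar sketch of the shift-only signed halving used
// in the post-condition above ("n / 2 = (n - (n >> 15)) >> 1"). The helper name
// is illustrative only and is not used elsewhere in the library.
static inline int16_t fdct_halve_toward_zero_sketch(int16_t n) {
  // For negative n, n >> 15 is -1, so the subtraction adds 1 before the
  // arithmetic shift; for non-negative n it is 0. The result matches C's
  // truncating n / 2 for all int16_t values,
  // e.g. fdct_halve_toward_zero_sketch(-7) == -3, the same as -7 / 2.
  return (int16_t)((n - (n >> 15)) >> 1);
}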
aom-3.12.1/aom_dsp/x86/fwd_txfm_sse2.c000066400000000000000000000023101477627663500173620ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" #define DCT_HIGH_BIT_DEPTH 0 #define FDCT4x4_2D_HELPER fdct4x4_helper #define FDCT4x4_2D aom_fdct4x4_sse2 #define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" #undef FDCT4x4_2D_HELPER #undef FDCT4x4_2D #undef FDCT4x4_2D_LP #undef FDCT8x8_2D #if CONFIG_AV1_HIGHBITDEPTH #undef DCT_HIGH_BIT_DEPTH #define DCT_HIGH_BIT_DEPTH 1 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT #undef FDCT8x8_2D #endif aom-3.12.1/aom_dsp/x86/fwd_txfm_sse2.h000066400000000000000000000154631477627663500174040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ #define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ #ifdef __cplusplus extern "C" { #endif static inline __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; buf0 = _mm_mul_epu32(a, b); a = _mm_srli_epi64(a, 32); b = _mm_srli_epi64(b, 32); buf1 = _mm_mul_epu32(a, b); return _mm_add_epi64(buf0, buf1); } static inline __m128i k_packs_epi64(__m128i a, __m128i b) { __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); return _mm_unpacklo_epi64(buf0, buf1); } static inline int check_epi16_overflow_x2(const __m128i *preg0, const __m128i *preg1) { const __m128i max_overflow = _mm_set1_epi16(0x7fff); const __m128i min_overflow = _mm_set1_epi16((short)0x8000); __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), _mm_cmpeq_epi16(*preg0, min_overflow)); __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), _mm_cmpeq_epi16(*preg1, min_overflow)); cmp0 = _mm_or_si128(cmp0, cmp1); return _mm_movemask_epi8(cmp0); } static inline int check_epi16_overflow_x4(const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3) { const __m128i max_overflow = _mm_set1_epi16(0x7fff); const __m128i min_overflow = _mm_set1_epi16((short)0x8000); __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), _mm_cmpeq_epi16(*preg0, min_overflow)); __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), _mm_cmpeq_epi16(*preg1, min_overflow)); __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), _mm_cmpeq_epi16(*preg2, min_overflow)); __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), _mm_cmpeq_epi16(*preg3, min_overflow)); cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); return _mm_movemask_epi8(cmp0); } static inline int check_epi16_overflow_x8( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); return res0 + res1; } static inline int check_epi16_overflow_x12( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); return res0 + res1; } static inline int check_epi16_overflow_x16( const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, const __m128i *preg15) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); if (!res0) { res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); } return res0 + res1; } static inline int check_epi16_overflow_x32( const __m128i *preg0, const __m128i 
*preg1, const __m128i *preg2, const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, const __m128i *preg30, const __m128i *preg31) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); if (!res0) { res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); if (!res1) { res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); if (!res0) { res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); if (!res1) { res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); if (!res0) { res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); if (!res1) res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); } } } } } return res0 + res1; } static inline void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); _mm_store_si128((__m128i *)(dst_ptr), out0); _mm_store_si128((__m128i *)(dst_ptr + 4), out1); } static inline void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); _mm_storeu_si128((__m128i *)(dst_ptr), out0); _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ aom-3.12.1/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm000066400000000000000000000171341477627663500211340ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_11585x2: times 8 dw 23170 pd_8192: times 4 dd 8192 %macro TRANSFORM_COEFFS 2 pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 %endmacro TRANSFORM_COEFFS 11585, 11585 TRANSFORM_COEFFS 15137, 6270 TRANSFORM_COEFFS 16069, 3196 TRANSFORM_COEFFS 9102, 13623 %macro STORE_OUTPUT 2 ; index, result ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); ; _mm_store_si128((__m128i *)(dst_ptr), out0); ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); pxor m11, m11 pcmpgtw m11, m%2 movdqa m12, m%2 punpcklwd m%2, m11 punpckhwd m12, m11 mova [outputq + 4*%1 + 0], m%2 mova [outputq + 4*%1 + 16], m12 %endmacro SECTION .text %if AOM_ARCH_X86_64 INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride mova m8, [GLOBAL(pd_8192)] mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] mova m0, [inputq] mova m1, [inputq + r3] lea inputq, [inputq + r4] mova m2, [inputq] mova m3, [inputq + r3] lea inputq, [inputq + r4] mova m4, [inputq] mova m5, [inputq + r3] lea inputq, [inputq + r4] mova m6, [inputq] mova m7, [inputq + r3] ; left shift by 2 to increase forward transformation precision psllw m0, 2 psllw m1, 2 psllw m2, 2 psllw m3, 2 psllw m4, 2 psllw m5, 2 psllw m6, 2 psllw m7, 2 ; column transform ; stage 1 paddw m10, m0, m7 psubw m0, m7 paddw m9, m1, m6 psubw m1, m6 paddw m7, m2, m5 psubw m2, m5 paddw m6, m3, m4 psubw m3, m4 ; stage 2 paddw m5, m9, m7 psubw m9, m7 paddw m4, m10, m6 psubw m10, m6 paddw m7, m1, m2 psubw m1, m2 ; stage 3 paddw m6, m4, m5 psubw m4, m5 pmulhrsw m1, m12 pmulhrsw m7, m12 ; sin(pi / 8), cos(pi / 8) punpcklwd m2, m10, m9 punpckhwd m10, m9 pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] pmaddwd m2, [GLOBAL(pw_6270_m15137)] pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] pmaddwd m10, [GLOBAL(pw_6270_m15137)] paddd m5, m8 paddd m2, m8 paddd m9, m8 paddd m10, m8 psrad m5, 14 psrad m2, 14 psrad m9, 14 psrad m10, 14 packssdw m5, m9 packssdw m2, m10 pmulhrsw m6, m12 pmulhrsw m4, m12 paddw m9, m3, m1 psubw m3, m1 paddw m10, m0, m7 psubw m0, m7 ; stage 4 ; sin(pi / 16), cos(pi / 16) punpcklwd m1, m10, m9 punpckhwd m10, m9 pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] pmaddwd m1, [GLOBAL(pw_3196_m16069)] pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] pmaddwd m10, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m1, m8 paddd m9, m8 paddd m10, m8 psrad m7, 14 psrad m1, 14 psrad m9, 14 psrad m10, 14 packssdw m7, m9 packssdw m1, m10 ; sin(3 * pi / 16), cos(3 * pi / 16) punpcklwd m11, m0, m3 punpckhwd m0, m3 pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] pmaddwd m11, [GLOBAL(pw_13623_m9102)] pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] pmaddwd m0, [GLOBAL(pw_13623_m9102)] paddd m9, m8 paddd m11, m8 paddd m3, m8 paddd m0, m8 psrad m9, 14 psrad m11, 14 psrad m3, 14 psrad m0, 14 packssdw m9, m3 packssdw m11, m0 ; transpose ; stage 1 punpcklwd m0, m6, m7 punpcklwd m3, m5, m11 punpckhwd m6, m7 punpckhwd m5, m11 punpcklwd m7, m4, m9 punpcklwd m10, m2, m1 punpckhwd m4, m9 punpckhwd m2, m1 ; stage 2 punpckldq m9, m0, m3 punpckldq m1, m6, m5 punpckhdq m0, m3 punpckhdq m6, m5 punpckldq m3, m7, m10 punpckldq m5, m4, m2 punpckhdq m7, m10 punpckhdq m4, m2 ; stage 3 punpcklqdq m10, m9, m3 punpckhqdq m9, m3 punpcklqdq m2, m0, m7 punpckhqdq m0, m7 punpcklqdq m3, m1, m5 punpckhqdq m1, m5 punpcklqdq m7, m6, m4 punpckhqdq m6, m4 ; row transform ; stage 1 paddw m5, m10, m6 psubw m10, m6 paddw m4, m9, m7 psubw 
m9, m7 paddw m6, m2, m1 psubw m2, m1 paddw m7, m0, m3 psubw m0, m3 ;stage 2 paddw m1, m5, m7 psubw m5, m7 paddw m3, m4, m6 psubw m4, m6 paddw m7, m9, m2 psubw m9, m2 ; stage 3 punpcklwd m6, m1, m3 punpckhwd m1, m3 pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] pmaddwd m6, [GLOBAL(pw_11585_m11585)] pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] pmaddwd m1, [GLOBAL(pw_11585_m11585)] paddd m2, m8 paddd m6, m8 paddd m3, m8 paddd m1, m8 psrad m2, 14 psrad m6, 14 psrad m3, 14 psrad m1, 14 packssdw m2, m3 packssdw m6, m1 pmulhrsw m7, m12 pmulhrsw m9, m12 punpcklwd m3, m5, m4 punpckhwd m5, m4 pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] pmaddwd m3, [GLOBAL(pw_6270_m15137)] pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] pmaddwd m5, [GLOBAL(pw_6270_m15137)] paddd m1, m8 paddd m3, m8 paddd m4, m8 paddd m5, m8 psrad m1, 14 psrad m3, 14 psrad m4, 14 psrad m5, 14 packssdw m1, m4 packssdw m3, m5 paddw m4, m0, m9 psubw m0, m9 paddw m5, m10, m7 psubw m10, m7 ; stage 4 punpcklwd m9, m5, m4 punpckhwd m5, m4 pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] pmaddwd m9, [GLOBAL(pw_3196_m16069)] pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] pmaddwd m5, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m9, m8 paddd m4, m8 paddd m5, m8 psrad m7, 14 psrad m9, 14 psrad m4, 14 psrad m5, 14 packssdw m7, m4 packssdw m9, m5 punpcklwd m4, m10, m0 punpckhwd m10, m0 pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] pmaddwd m4, [GLOBAL(pw_13623_m9102)] pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] pmaddwd m10, [GLOBAL(pw_13623_m9102)] paddd m5, m8 paddd m4, m8 paddd m0, m8 paddd m10, m8 psrad m5, 14 psrad m4, 14 psrad m0, 14 psrad m10, 14 packssdw m5, m0 packssdw m4, m10 ; transpose ; stage 1 punpcklwd m0, m2, m7 punpcklwd m10, m1, m4 punpckhwd m2, m7 punpckhwd m1, m4 punpcklwd m7, m6, m5 punpcklwd m4, m3, m9 punpckhwd m6, m5 punpckhwd m3, m9 ; stage 2 punpckldq m5, m0, m10 punpckldq m9, m2, m1 punpckhdq m0, m10 punpckhdq m2, m1 punpckldq m10, m7, m4 punpckldq m1, m6, m3 punpckhdq m7, m4 punpckhdq m6, m3 ; stage 3 punpcklqdq m4, m5, m10 punpckhqdq m5, m10 punpcklqdq m3, m0, m7 punpckhqdq m0, m7 punpcklqdq m10, m9, m1 punpckhqdq m9, m1 punpcklqdq m7, m2, m6 punpckhqdq m2, m6 psraw m1, m4, 15 psraw m6, m5, 15 psraw m8, m3, 15 psraw m11, m0, 15 psubw m4, m1 psubw m5, m6 psubw m3, m8 psubw m0, m11 psraw m4, 1 psraw m5, 1 psraw m3, 1 psraw m0, 1 psraw m1, m10, 15 psraw m6, m9, 15 psraw m8, m7, 15 psraw m11, m2, 15 psubw m10, m1 psubw m9, m6 psubw m7, m8 psubw m2, m11 psraw m10, 1 psraw m9, 1 psraw m7, 1 psraw m2, 1 STORE_OUTPUT 0, 4 STORE_OUTPUT 8, 5 STORE_OUTPUT 16, 3 STORE_OUTPUT 24, 0 STORE_OUTPUT 32, 10 STORE_OUTPUT 40, 9 STORE_OUTPUT 48, 7 STORE_OUTPUT 56, 2 RET %endif aom-3.12.1/aom_dsp/x86/highbd_adaptive_quantize_avx2.c000066400000000000000000000453431477627663500226070ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" static inline void highbd_load_b_values_avx2( const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, __m256i *shift) { *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); *dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); } static inline void highbd_update_mask1_avx2(__m256i *cmp_mask, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i temp_mask = _mm256_setzero_si256(); if (_mm256_movemask_epi8(*cmp_mask)) { __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); temp_mask = _mm256_and_si256(*cmp_mask, iscan); *is_found = 1; } *mask = _mm256_max_epi16(temp_mask, *mask); } static inline void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, __m256i *threshold, const int16_t *iscan_ptr, int *is_found, __m256i *mask) { __m256i coeff[2], cmp_mask0, cmp_mask1; coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); cmp_mask0 = _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); } static inline void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, __m256i *p, const int shift) { __m256i prod_lo = _mm256_mul_epi32(*x, *y); __m256i prod_hi = _mm256_srli_epi64(*x, 32); const __m256i mult_hi = _mm256_srli_epi64(*y, 32); prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); prod_lo = _mm256_srli_epi64(prod_lo, shift); prod_hi = _mm256_srli_epi64(prod_hi, shift); prod_hi = _mm256_slli_epi64(prod_hi, 32); *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); } static inline void highbd_calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, const __m256i *quant, const __m256i *shift, const int *log_scale) { __m256i tmp, qcoeff; qcoeff = _mm256_add_epi32(*coeff, *round); highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); qcoeff = _mm256_add_epi32(tmp, qcoeff); highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); } static inline __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { return _mm256_mullo_epi32(qcoeff, dequant); } static inline __m256i highbd_calculate_dqcoeff_log_scale_avx2( __m256i qcoeff, __m256i dequant, const int log_scale) { __m256i abs_coeff = _mm256_abs_epi32(qcoeff); highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); return _mm256_sign_epi32(abs_coeff, qcoeff); } static inline void highbd_store_coefficients_avx2(__m256i coeff0, __m256i coeff1, tran_low_t *coeff_ptr) { _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); } void aom_highbd_quantize_b_adaptive_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t 
*eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 16; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m256i zero = _mm256_setzero_si256(); __m256i zbin, round, quant, dequant, shift; __m256i coeff0, qcoeff0, coeff1, qcoeff1; __m256i cmp_mask, mask0 = zero, mask1 = zero; __m128i temp_mask0, temp_mask1; int prescan_add[2]; int thresh[2]; const int log_scale = 0; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; } __m256i threshold[2]; threshold[0] = _mm256_set1_epi32(thresh[0]); threshold[1] = _mm256_set1_epi32(thresh[1]); threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); // Do DC and first 15 AC. coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); qcoeff0 = _mm256_abs_epi32(coeff0); coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); qcoeff1 = _mm256_abs_epi32(coeff1); highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); zbin = _mm256_unpackhi_epi64(zbin, zbin); __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); round = _mm256_unpackhi_epi64(round, round); quant = _mm256_unpackhi_epi64(quant, quant); shift = _mm256_unpackhi_epi64(shift, shift); dequant = _mm256_unpackhi_epi64(dequant, dequant); } else { highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); round = _mm256_unpackhi_epi64(round, round); quant = _mm256_unpackhi_epi64(quant, quant); shift = _mm256_unpackhi_epi64(shift, shift); highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); // Reinsert signs qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); // Mask out zbin threshold coeffs qcoeff0 = _mm256_and_si256(qcoeff0, temp0); qcoeff1 = _mm256_and_si256(qcoeff1, temp1); highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); dequant = _mm256_unpackhi_epi64(dequant, dequant); coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); } // AC only loop. 
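// Each iteration below handles 16 coefficients: two 256-bit registers of
// eight 32-bit values. Groups whose absolute values all fall at or below the
// zbin threshold are written out as zeros and skipped; otherwise the values
// are quantized, their signs restored, and both qcoeff and dqcoeff stored.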
while (index < n_coeffs) { coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); qcoeff0 = _mm256_abs_epi32(coeff0); coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); qcoeff1 = _mm256_abs_epi32(coeff1); highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); index += 16; continue; } highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); qcoeff0 = _mm256_and_si256(qcoeff0, temp0); qcoeff1 = _mm256_and_si256(qcoeff1, temp1); highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); index += 16; } if (is_found0) { temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), _mm256_extracti128_si256(mask0, 1)); non_zero_count = calculate_non_zero_count(temp_mask0); } if (is_found1) { temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), _mm256_extracti128_si256(mask1, 1)); non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); } for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } void aom_highbd_quantize_b_32x32_adaptive_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 16; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const int log_scale = 1; const __m256i zero = _mm256_setzero_si256(); __m256i zbin, round, quant, dequant, shift; __m256i coeff0, qcoeff0, 
coeff1, qcoeff1; __m256i cmp_mask, mask0 = zero, mask1 = zero; __m128i temp_mask0, temp_mask1; const __m256i one = _mm256_set1_epi32(1); const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); int prescan_add[2]; int thresh[2]; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; } __m256i threshold[2]; threshold[0] = _mm256_set1_epi32(thresh[0]); threshold[1] = _mm256_set1_epi32(thresh[1]); threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); // Shift with rounding. zbin = _mm256_add_epi32(zbin, log_scale_vec); round = _mm256_add_epi32(round, log_scale_vec); zbin = _mm256_srli_epi32(zbin, log_scale); round = _mm256_srli_epi32(round, log_scale); zbin = _mm256_sub_epi32(zbin, one); // Do DC and first 15 AC. coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); qcoeff0 = _mm256_abs_epi32(coeff0); coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); qcoeff1 = _mm256_abs_epi32(coeff1); highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); round = _mm256_permute2x128_si256(round, round, 0x11); quant = _mm256_permute2x128_si256(quant, quant, 0x11); shift = _mm256_permute2x128_si256(shift, shift, 0x11); dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); } else { highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); round = _mm256_permute2x128_si256(round, round, 0x11); quant = _mm256_permute2x128_si256(quant, quant, 0x11); shift = _mm256_permute2x128_si256(shift, shift, 0x11); highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); // Reinsert signs qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); // Mask out zbin threshold coeffs qcoeff0 = _mm256_and_si256(qcoeff0, temp0); qcoeff1 = _mm256_and_si256(qcoeff1, temp1); highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); coeff0 = highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); coeff1 = highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); } // AC only loop. 
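// Same flow as the aom_highbd_quantize_b_adaptive_avx2 loop above, but with
// log_scale = 1: zbin and round were pre-shifted with rounding, and the
// dequantized values are produced by highbd_calculate_dqcoeff_log_scale_avx2
// (a widening multiply followed by a right shift of log_scale).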
while (index < n_coeffs) { coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); qcoeff0 = _mm256_abs_epi32(coeff0); coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); qcoeff1 = _mm256_abs_epi32(coeff1); highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); if (_mm256_movemask_epi8(cmp_mask) == 0) { _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); index += 16; continue; } highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); qcoeff0 = _mm256_and_si256(qcoeff0, temp0); qcoeff1 = _mm256_and_si256(qcoeff1, temp1); highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); coeff0 = highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); coeff1 = highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); index += 16; } if (is_found0) { temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), _mm256_extracti128_si256(mask0, 1)); non_zero_count = calculate_non_zero_count(temp_mask0); } if (is_found1) { temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), _mm256_extracti128_si256(mask1, 1)); non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); } for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } aom-3.12.1/aom_dsp/x86/highbd_adaptive_quantize_sse2.c000066400000000000000000000661411477627663500226020ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/quantize.h" #include "aom_dsp/x86/quantize_x86.h" static inline __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi64(a, sign); } static inline void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, __m128i *p, const int shift) { __m128i sign = _mm_srai_epi32(*y, 31); __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); __m128i abs_y = invert_sign_32_sse2(*y, sign); __m128i prod_lo = _mm_mul_epu32(*x, abs_y); __m128i prod_hi = _mm_srli_epi64(*x, 32); const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); prod_hi = _mm_mul_epu32(prod_hi, mult_hi); prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); prod_lo = _mm_srli_epi64(prod_lo, shift); const __m128i mask = _mm_set_epi32(0, -1, 0, -1); prod_lo = _mm_and_si128(prod_lo, mask); prod_hi = _mm_srli_epi64(prod_hi, shift); prod_hi = _mm_slli_epi64(prod_hi, 32); *p = _mm_or_si128(prod_lo, prod_hi); } static inline void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, const __m128i *quant, const __m128i *shift, const int *log_scale) { __m128i tmp, qcoeff; qcoeff = _mm_add_epi32(*coeff, *round); highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); qcoeff = _mm_add_epi32(tmp, qcoeff); highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); } static inline void highbd_update_mask1(__m128i *cmp_mask0, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i temp_mask = _mm_setzero_si128(); if (_mm_movemask_epi8(*cmp_mask0)) { __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); temp_mask = mask0; *is_found = 1; } *mask = _mm_max_epi16(temp_mask, *mask); } static inline void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, __m128i *threshold, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i coeff[2], cmp_mask0, cmp_mask1; coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); } static inline __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, const int log_scale) { __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); return invert_sign_32_sse2(abs_coeff, coeff_sign); } void aom_highbd_quantize_b_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 8; const int log_scale = 0; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi32(1); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, 
coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1, cmp_mask; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); __m128i zbin_sign = _mm_srai_epi16(zbin, 15); __m128i round_sign = _mm_srai_epi16(round, 15); __m128i quant_sign = _mm_srai_epi16(quant, 15); __m128i dequant_sign = _mm_srai_epi16(dequant, 15); __m128i shift_sign = _mm_srai_epi16(shift, 15); zbin = _mm_unpacklo_epi16(zbin, zbin_sign); round = _mm_unpacklo_epi16(round, round_sign); quant = _mm_unpacklo_epi16(quant, quant_sign); dequant = _mm_unpacklo_epi16(dequant, dequant_sign); shift = _mm_unpacklo_epi16(shift, shift_sign); zbin = _mm_sub_epi32(zbin, one); // Do DC and first 15 AC. coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); // Reinsert signs qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); } // AC only loop. 
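// The SSE2 loop mirrors the AVX2 path at half width: 8 coefficients per
// iteration, split across two 128-bit registers of four 32-bit values each.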
while (index < n_coeffs) { coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); index += 8; continue; } highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); index += 8; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } void aom_highbd_quantize_b_32x32_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 8; const int log_scale = 1; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi32(1); const __m128i 
log_scale_vec = _mm_set1_epi32(log_scale); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1, cmp_mask; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); __m128i zbin_sign = _mm_srai_epi16(zbin, 15); __m128i round_sign = _mm_srai_epi16(round, 15); __m128i quant_sign = _mm_srai_epi16(quant, 15); __m128i dequant_sign = _mm_srai_epi16(dequant, 15); __m128i shift_sign = _mm_srai_epi16(shift, 15); zbin = _mm_unpacklo_epi16(zbin, zbin_sign); round = _mm_unpacklo_epi16(round, round_sign); quant = _mm_unpacklo_epi16(quant, quant_sign); dequant = _mm_unpacklo_epi16(dequant, dequant_sign); shift = _mm_unpacklo_epi16(shift, shift_sign); // Shift with rounding. zbin = _mm_add_epi32(zbin, log_scale_vec); round = _mm_add_epi32(round, log_scale_vec); zbin = _mm_srli_epi32(zbin, log_scale); round = _mm_srli_epi32(round, log_scale); zbin = _mm_sub_epi32(zbin, one); // Do DC and first 15 AC. 
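// (With 128-bit registers this first block actually covers 8 coefficients:
// DC plus the first 7 AC values.)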
coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); // Reinsert signs qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); } // AC only loop. 
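// Dequantization in this path goes through highbd_calculate_dqcoeff, which
// widens to 64 bits, multiplies by the dequant value, and shifts right by
// log_scale before restoring the sign.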
while (index < n_coeffs) { coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); index += 8; continue; } highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); index += 8; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } void aom_highbd_quantize_b_64x64_adaptive_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int index = 8; const int log_scale = 2; int non_zero_count = 0; int non_zero_count_prescan_add_zero = 0; int is_found0 = 0, is_found1 = 0; int eob = -1; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi32(1); const __m128i 
log_scale_vec = _mm_set1_epi32(log_scale); __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1, cmp_mask; __m128i all_zero; __m128i mask0 = zero, mask1 = zero; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; int prescan_add[2]; int thresh[4]; const qm_val_t wt = (1 << AOM_QM_BITS); for (int i = 0; i < 2; ++i) { prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; } thresh[2] = thresh[3] = thresh[1]; __m128i threshold[2]; threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); #if SKIP_EOB_FACTOR_ADJUST int first = -1; #endif // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); __m128i zbin_sign = _mm_srai_epi16(zbin, 15); __m128i round_sign = _mm_srai_epi16(round, 15); __m128i quant_sign = _mm_srai_epi16(quant, 15); __m128i dequant_sign = _mm_srai_epi16(dequant, 15); __m128i shift_sign = _mm_srai_epi16(shift, 15); zbin = _mm_unpacklo_epi16(zbin, zbin_sign); round = _mm_unpacklo_epi16(round, round_sign); quant = _mm_unpacklo_epi16(quant, quant_sign); dequant = _mm_unpacklo_epi16(dequant, dequant_sign); shift = _mm_unpacklo_epi16(shift, shift_sign); // Shift with rounding. zbin = _mm_add_epi32(zbin, log_scale_vec); round = _mm_add_epi32(round, log_scale_vec); zbin = _mm_srli_epi32(zbin, log_scale); round = _mm_srli_epi32(round, log_scale); zbin = _mm_sub_epi32(zbin, one); // Do DC and first 15 AC. 
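// 64x64 variant: identical flow to the 32x32 path above, but log_scale = 2,
// so zbin and round are pre-shifted by two bits and the dequantized values
// are scaled down by one extra bit. As above, this first block covers DC
// plus the first 7 AC coefficients.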
coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); threshold[0] = threshold[1]; all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); // Reinsert signs qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); } // AC only loop. 
while (index < n_coeffs) { coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); coeff0_sign = _mm_srai_epi32(coeff0, 31); coeff1_sign = _mm_srai_epi32(coeff1, 31); qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, &mask0); cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); index += 8; continue; } highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); index += 8; } if (is_found0) non_zero_count = calculate_non_zero_count(mask0); if (is_found1) non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { const int rc = scan[i]; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } for (int i = non_zero_count - 1; i >= 0; i--) { const int rc = scan[i]; if (qcoeff_ptr[rc]) { eob = i; break; } } *eob_ptr = eob + 1; #if SKIP_EOB_FACTOR_ADJUST // TODO(Aniket): Experiment the following loop with intrinsic by combining // with the quantization loop above for (int i = 0; i < non_zero_count; i++) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; if (qcoeff) { first = i; break; } } if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { const int rc = scan[(*eob_ptr - 1)]; if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { const int coeff = coeff_ptr[rc] * wt; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; const int prescan_add_val = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; *eob_ptr = 0; } } } #endif } aom-3.12.1/aom_dsp/x86/highbd_convolve_avx2.c000066400000000000000000001352141477627663500207220ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" // ----------------------------------------------------------------------------- // Copy and average static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }; void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd); void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd); void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { if (filter_params_y->taps == 12) { av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); return; } int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; __m256i s[8], coeffs_y[4]; const int bits = FILTER_BITS; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; /* Vertical filter */ { __m256i src6; __m256i s01 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), 0x20); __m256i s12 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), 0x20); __m256i s23 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), 0x20); __m256i s34 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), 0x20); __m256i s45 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); __m256i s56 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), src6, 0x20); s[0] = _mm256_unpacklo_epi16(s01, s12); s[1] = _mm256_unpacklo_epi16(s23, s34); s[2] = _mm256_unpacklo_epi16(s45, s56); s[4] = _mm256_unpackhi_epi16(s01, s12); s[5] = _mm256_unpackhi_epi16(s23, s34); s[6] = _mm256_unpackhi_epi16(s45, s56); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; const __m256i s67 = _mm256_permute2x128_si256( src6, _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); const __m256i s78 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), src6, 0x20); s[3] = _mm256_unpacklo_epi16(s67, s78); s[7] = _mm256_unpackhi_epi16(s67, s78); const __m256i res_a = convolve(s, coeffs_y); __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); if (w - j > 4) { const __m256i res_b = convolve(s + 4, coeffs_y); __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); res_16bit = _mm256_max_epi16(res_16bit, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], _mm256_castsi256_si128(res_16bit)); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res_16bit, 1)); } else if (w == 4) { res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); res_a_round = _mm256_max_epi16(res_a_round, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], _mm256_castsi256_si128(res_a_round)); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res_a_round, 1)); } else { res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); res_a_round = _mm256_max_epi16(res_a_round, zero); xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res_a_round)); xx_storel_32(&dst[i * 
dst_stride + j + dst_stride], _mm256_extracti128_si256(res_a_round, 1)); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { if (filter_params_x->taps == 12) { av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); __m256i s[4], coeffs_x[4]; const __m256i round_const_x = _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ for (i = 0; i < h; i += 2) { const __m256i row0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); __m256i row1 = _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); // even pixels s[0] = _mm256_alignr_epi8(r1, r0, 0); s[1] = _mm256_alignr_epi8(r1, r0, 4); s[2] = _mm256_alignr_epi8(r1, r0, 8); s[3] = _mm256_alignr_epi8(r1, r0, 12); __m256i res_even = convolve(s, coeffs_x); res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm256_alignr_epi8(r1, r0, 2); s[1] = _mm256_alignr_epi8(r1, r0, 6); s[2] = _mm256_alignr_epi8(r1, r0, 10); s[3] = _mm256_alignr_epi8(r1, r0, 14); __m256i res_odd = convolve(s, coeffs_x); res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), round_shift_x); res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), round_shift_bits); res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), round_shift_bits); __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); res = _mm256_min_epi16(res, clip_pixel); res = _mm256_max_epi16(res, zero); if (w - j > 4) { _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], _mm256_castsi256_si128(res)); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res, 1)); } else if (w == 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], _mm256_castsi256_si128(res)); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res, 1)); } else { xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res)); xx_storel_32(&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res, 1)); } } } } #define CONV8_ROUNDING_BITS (7) // 
----------------------------------------------------------------------------- // Horizontal and vertical filtering static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }; static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 }; static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; // ----------------------------------------------------------------------------- // Horizontal Filtering static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 } // Note: // Shared by 8x2 and 16x1 block static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1, __m256i *x /*x[8]*/) { __m256i pp[8]; pack_pixels(s0, pp); pack_pixels(s1, &pp[4]); x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); x[4] = x[2]; x[5] = x[3]; x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); } static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) { __m256i pp[8]; __m256i s0; s0 = _mm256_loadu_si256((const __m256i *)src); pack_pixels(&s0, pp); x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); } static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, __m256i *x) { __m256i s0, s1; s0 = _mm256_loadu_si256((const __m256i *)src); s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); pack_16_pixels(&s0, &s1, x); } static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) { __m256i s0, s1; s0 = _mm256_loadu_si256((const __m256i *)src); s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); pack_16_pixels(&s0, &s1, x); } // Note: // Shared by horizontal and vertical filtering static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); const __m256i p0 = _mm256_set1_epi32(0x03020100); const __m256i p1 = _mm256_set1_epi32(0x07060504); const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); f[0] = _mm256_shuffle_epi8(hh, p0); f[1] = _mm256_shuffle_epi8(hh, p1); f[2] = _mm256_shuffle_epi8(hh, p2); f[3] = _mm256_shuffle_epi8(hh, p3); } static inline void pack_filters_4tap(const int16_t *filter, __m256i *f /*f[4]*/) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(h); // coeffs 2 3 2 3 2 3 2 3 f[0] = _mm256_shuffle_epi32(coeff, 0x55); 
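// The 4-tap kernels keep their nonzero taps in the middle of the 8-entry
// filter array, so only coefficient pairs 2/3 and 4/5 are packed here.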
// coeffs 4 5 4 5 4 5 4 5 f[1] = _mm256_shuffle_epi32(coeff, 0xaa); } static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, const __m256i *fil /*fil[4]*/, __m256i *y) { __m256i a, a0, a1; a0 = _mm256_madd_epi16(fil[0], sig[0]); a1 = _mm256_madd_epi16(fil[3], sig[3]); a = _mm256_add_epi32(a0, a1); a0 = _mm256_madd_epi16(fil[1], sig[1]); a1 = _mm256_madd_epi16(fil[2], sig[2]); { const __m256i min = _mm256_min_epi32(a0, a1); a = _mm256_add_epi32(a, min); } { const __m256i max = _mm256_max_epi32(a0, a1); a = _mm256_add_epi32(a, max); } { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); a = _mm256_add_epi32(a, rounding); *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); } } static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask, uint16_t *dst) { const __m128i a0 = _mm256_castsi256_si128(*y); const __m128i a1 = _mm256_extractf128_si256(*y, 1); __m128i res = _mm_packus_epi32(a0, a1); res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); _mm_storeu_si128((__m128i *)dst, res); } static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { __m256i a = _mm256_packus_epi32(*y0, *y1); a = _mm256_min_epi16(a, *mask); _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); } static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst) { __m256i a = _mm256_packus_epi32(*y0, *y1); a = _mm256_min_epi16(a, *mask); _mm256_storeu_si256((__m256i *)dst, a); } static void aom_highbd_filter_block1d8_h8_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[8], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff[4]; pack_filters(filter, ff); src_ptr -= 3; do { pack_8x2_pixels(src_ptr, src_pitch, signal); filter_8x1_pixels(signal, ff, &res0); filter_8x1_pixels(&signal[4], ff, &res1); store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); height -= 2; src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; } while (height > 1); if (height > 0) { pack_8x1_pixels(src_ptr, signal); filter_8x1_pixels(signal, ff, &res0); store_8x1_pixels(&res0, &max, dst_ptr); } } static void aom_highbd_filter_block1d16_h8_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[8], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff[4]; pack_filters(filter, ff); src_ptr -= 3; do { pack_16x1_pixels(src_ptr, signal); filter_8x1_pixels(signal, ff, &res0); filter_8x1_pixels(&signal[4], ff, &res1); store_16x1_pixels(&res0, &res1, &max, dst_ptr); height -= 1; src_ptr += src_pitch; dst_ptr += dst_pitch; } while (height > 0); } static void aom_highbd_filter_block1d4_h4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i ff[2], s[2]; uint32_t i; const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); pack_filters_4tap(filter, ff); src_ptr -= 3; for (i = 0; i <= (height - 2); i += 2) { __m256i row0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); __m256i row1 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); s[1] = _mm256_alignr_epi8(s[0], s[0], 4); s[0] = _mm256_shuffle_epi8(s[0], mask); s[1] = _mm256_shuffle_epi8(s[1], mask); __m256i res = convolve_4tap(s, ff); res = _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); res = _mm256_packs_epi32(res, res); res = _mm256_min_epi16(res, clip_pixel); res = _mm256_max_epi16(res, zero); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res)); _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], _mm256_extracti128_si256(res, 1)); } if (height % 2 != 0) { i = height - 1; const __m256i row0_0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); const __m256i row0_1 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); const __m256i r0 = _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); __m256i res = convolve_4tap(s, ff); res = _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); res = _mm256_packs_epi32(res, res); res = _mm256_min_epi16(res, clip_pixel); res = _mm256_max_epi16(res, zero); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res)); } } static void aom_highbd_filter_block1d8_h4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i ff[2], s[2]; uint32_t i = 0; const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }; __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); pack_filters_4tap(filter, ff); src_ptr -= 3; /* Horizontal filter */ for (i = 0; i <= (height - 2); i += 2) { const __m256i row0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); __m256i row1 = _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); const __m256i r0 = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); // even pixels s[0] = r0; s[1] = _mm256_alignr_epi8(r1, r0, 4); __m256i res_even = convolve_4tap(s, ff); res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), CONV8_ROUNDING_BITS); // odd pixels s[0] = _mm256_alignr_epi8(r1, r0, 2); s[1] = _mm256_alignr_epi8(r1, r0, 6); __m256i res_odd = convolve_4tap(s, ff); res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), CONV8_ROUNDING_BITS); __m256i res = _mm256_packs_epi32(res_even, res_odd); res = _mm256_shuffle_epi8(res, mask); res = _mm256_min_epi16(res, clip_pixel); res = _mm256_max_epi16(res, zero); _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res)); _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], _mm256_extracti128_si256(res, 1)); } if (height % 2 != 0) { i = height - 1; const __m256i row0_0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); const __m256i row0_1 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); const __m256i r0 = _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); __m256i res = convolve_4tap(s, ff); res = _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); res = _mm256_packs_epi32(res, res); res = _mm256_min_epi16(res, clip_pixel); res = _mm256_max_epi16(res, zero); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res)); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], _mm256_extracti128_si256(res, 1)); } } static void aom_highbd_filter_block1d16_h4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, dst_pitch, height, filter, bd); } // ----------------------------------------------------------------------------- // 2-tap horizontal filtering static inline void pack_2t_filter(const int16_t *filter, __m256i *f) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); const __m256i p = _mm256_set1_epi32(0x09080706); f[0] = _mm256_shuffle_epi8(hh, p); } // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() // the difference is s0/s1 specifies first and second rows or, // first 16 samples and 8-sample shifted 16 samples static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, __m256i *sig) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf2 = 
_mm256_loadu_si256((const __m256i *)signal_pattern_2); __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); r0 = _mm256_shuffle_epi8(r0, sf2); r1 = _mm256_shuffle_epi8(r1, sf2); sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); } static inline void pack_8x2_2t_pixels(const uint16_t *src, const ptrdiff_t pitch, __m256i *sig) { const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); pack_16_2t_pixels(&r0, &r1, sig); } static inline void pack_16x1_2t_pixels(const uint16_t *src, __m256i *sig /*sig[2]*/) { const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); pack_16_2t_pixels(&r0, &r1, sig); } static inline void pack_8x1_2t_pixels(const uint16_t *src, __m256i *sig /*sig[2]*/) { const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); __m256i r0 = _mm256_loadu_si256((const __m256i *)src); __m256i x0 = _mm256_shuffle_epi8(r0, sf2); r0 = _mm256_permutevar8x32_epi32(r0, idx); r0 = _mm256_shuffle_epi8(r0, sf2); sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); } // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i x0 = _mm256_madd_epi16(sig[0], *f); __m256i x1 = _mm256_madd_epi16(sig[1], *f); x0 = _mm256_add_epi32(x0, rounding); x1 = _mm256_add_epi32(x1, rounding); *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); } static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m256i x0 = _mm256_madd_epi16(sig[0], *f); x0 = _mm256_add_epi32(x0, rounding); *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); } static void aom_highbd_filter_block1d8_h2_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[2], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff; pack_2t_filter(filter, &ff); src_ptr -= 3; do { pack_8x2_2t_pixels(src_ptr, src_pitch, signal); filter_16_2t_pixels(signal, &ff, &res0, &res1); store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); height -= 2; src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; } while (height > 1); if (height > 0) { pack_8x1_2t_pixels(src_ptr, signal); filter_8x1_2t_pixels(signal, &ff, &res0); store_8x1_pixels(&res0, &max, dst_ptr); } } static void aom_highbd_filter_block1d16_h2_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[2], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff; pack_2t_filter(filter, &ff); src_ptr -= 3; do { pack_16x1_2t_pixels(src_ptr, signal); filter_16_2t_pixels(signal, &ff, &res0, &res1); store_16x1_pixels(&res0, &res1, &max, dst_ptr); height -= 1; src_ptr += src_pitch; dst_ptr += dst_pitch; } while (height > 0); } // ----------------------------------------------------------------------------- // 
Vertical Filtering static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); __m256i s1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); __m256i s2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); __m256i s3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); __m256i s4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); __m256i s5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); __m256i s6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); sig[0] = _mm256_unpacklo_epi16(s0, s1); sig[4] = _mm256_unpackhi_epi16(s0, s1); sig[1] = _mm256_unpacklo_epi16(s2, s3); sig[5] = _mm256_unpackhi_epi16(s2, s3); sig[2] = _mm256_unpacklo_epi16(s4, s5); sig[6] = _mm256_unpackhi_epi16(s4, s5); sig[8] = s6; } static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { // base + 7th row __m256i s0 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); // base + 8th row __m256i s1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); sig[3] = _mm256_unpacklo_epi16(s2, s3); sig[7] = _mm256_unpackhi_epi16(s2, s3); sig[8] = s1; } static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { filter_8x1_pixels(sig, f, y0); filter_8x1_pixels(&sig[4], f, y1); } static inline void update_pixels(__m256i *sig) { int i; for (i = 0; i < 3; ++i) { sig[i] = sig[i + 1]; sig[i + 4] = sig[i + 5]; } } static void aom_highbd_filter_block1d8_v8_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[9], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff[4]; pack_filters(filter, ff); pack_8x9_init(src_ptr, src_pitch, signal); do { pack_8x9_pixels(src_ptr, src_pitch, signal); filter_8x9_pixels(signal, ff, &res0, &res1); store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); update_pixels(signal); src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; height -= 2; } while (height > 0); } static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { __m256i u0, u1, u2, u3; // load 0-6 rows const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low u1 = 
_mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high sig[0] = _mm256_unpacklo_epi16(u0, u2); sig[4] = _mm256_unpackhi_epi16(u0, u2); sig[8] = _mm256_unpacklo_epi16(u1, u3); sig[12] = _mm256_unpackhi_epi16(u1, u3); u0 = _mm256_permute2x128_si256(s2, s3, 0x20); u1 = _mm256_permute2x128_si256(s2, s3, 0x31); u2 = _mm256_permute2x128_si256(s3, s4, 0x20); u3 = _mm256_permute2x128_si256(s3, s4, 0x31); sig[1] = _mm256_unpacklo_epi16(u0, u2); sig[5] = _mm256_unpackhi_epi16(u0, u2); sig[9] = _mm256_unpacklo_epi16(u1, u3); sig[13] = _mm256_unpackhi_epi16(u1, u3); u0 = _mm256_permute2x128_si256(s4, s5, 0x20); u1 = _mm256_permute2x128_si256(s4, s5, 0x31); u2 = _mm256_permute2x128_si256(s5, s6, 0x20); u3 = _mm256_permute2x128_si256(s5, s6, 0x31); sig[2] = _mm256_unpacklo_epi16(u0, u2); sig[6] = _mm256_unpackhi_epi16(u0, u2); sig[10] = _mm256_unpacklo_epi16(u1, u3); sig[14] = _mm256_unpackhi_epi16(u1, u3); sig[16] = s6; } static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { // base + 7th row const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); // base + 8th row const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); __m256i u0, u1, u2, u3; u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); u2 = _mm256_permute2x128_si256(s7, s8, 0x20); u3 = _mm256_permute2x128_si256(s7, s8, 0x31); sig[3] = _mm256_unpacklo_epi16(u0, u2); sig[7] = _mm256_unpackhi_epi16(u0, u2); sig[11] = _mm256_unpacklo_epi16(u1, u3); sig[15] = _mm256_unpackhi_epi16(u1, u3); sig[16] = s8; } static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { __m256i res[4]; int i; for (i = 0; i < 4; ++i) { filter_8x1_pixels(&sig[i << 2], f, &res[i]); } { const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); } } static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { __m256i p = _mm256_min_epi16(*y0, *mask); _mm256_storeu_si256((__m256i *)dst, p); p = _mm256_min_epi16(*y1, *mask); _mm256_storeu_si256((__m256i *)(dst + pitch), p); } static void update_16x9_pixels(__m256i *sig) { update_pixels(&sig[0]); update_pixels(&sig[8]); } static void aom_highbd_filter_block1d16_v8_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[17], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff[4]; pack_filters(filter, ff); pack_16x9_init(src_ptr, src_pitch, signal); do { pack_16x9_pixels(src_ptr, src_pitch, signal); filter_16x9_pixels(signal, ff, &res0, &res1); store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); update_16x9_pixels(signal); src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; height -= 2; } while (height > 0); } static void aom_highbd_filter_block1d4_v4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { const int bits = FILTER_BITS; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const __m256i clip_pixel = _mm256_set1_epi32(bd == 10 ? 
1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); uint32_t i; __m256i s[2], ff[2]; pack_filters_4tap(filter, ff); const uint16_t *data = src_ptr; /* Vertical filter */ { __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); s[0] = _mm256_unpacklo_epi16(s23, s34); for (i = 0; i < height; i += 2) { data = &src_ptr[i * src_pitch]; __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); s[1] = _mm256_unpacklo_epi16(s45, s56); const __m256i res_a = convolve_4tap(s, ff); __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); res_16bit = _mm256_max_epi32(res_16bit, zero); res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res_16bit)); _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], _mm256_extracti128_si256(res_16bit, 1)); s[0] = s[1]; s4 = s6; } } } static void aom_highbd_filter_block1d8_v4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { const int bits = FILTER_BITS; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); __m256i s[4], ff[2]; uint32_t i; pack_filters_4tap(filter, ff); const uint16_t *data = src_ptr; /* Vertical filter */ { __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); s[0] = _mm256_unpacklo_epi16(s23, s34); s[2] = _mm256_unpackhi_epi16(s23, s34); for (i = 0; i < height; i += 2) { data = &src_ptr[i * src_pitch]; __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); s[1] = _mm256_unpacklo_epi16(s45, s56); s[3] = _mm256_unpackhi_epi16(s45, s56); const __m256i res_a = convolve_4tap(s, ff); __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); const __m256i res_b = convolve_4tap(s + 2, ff); __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); res_16bit = _mm256_max_epi16(res_16bit, zero); _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], _mm256_castsi256_si128(res_16bit)); _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], _mm256_extracti128_si256(res_16bit, 1)); s[0] = s[1]; s[2] = s[3]; s4 = s6; } } } static void aom_highbd_filter_block1d16_v4_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, dst_pitch, height, filter, bd); } // ----------------------------------------------------------------------------- // 2-tap vertical filtering static void pack_16x2_init(const uint16_t *src, __m256i *sig) { sig[2] = _mm256_loadu_si256((const __m256i *)src); } static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { // load the next row const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); sig[0] = _mm256_unpacklo_epi16(sig[2], u); sig[1] = _mm256_unpackhi_epi16(sig[2], u); sig[2] = u; } static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0, __m256i *y1) { filter_16_2t_pixels(sig, f, y0, y1); } static void aom_highbd_filter_block1d16_v2_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[3], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff; pack_2t_filter(filter, &ff); pack_16x2_init(src_ptr, signal); do { pack_16x2_2t_pixels(src_ptr, src_pitch, signal); filter_16x2_2t_pixels(signal, &ff, &res0, &res1); store_16x1_pixels(&res0, &res1, &max, dst_ptr); src_ptr += src_pitch; dst_ptr += dst_pitch; height -= 1; } while (height > 0); } static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m128i p = _mm_set1_epi32(0x09080706); f[0] = _mm_shuffle_epi8(h, 
p); } static void pack_8x2_init(const uint16_t *src, __m128i *sig) { sig[2] = _mm_loadu_si128((const __m128i *)src); } static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, __m128i *sig) { // load the next row const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); sig[0] = _mm_unpacklo_epi16(sig[2], u); sig[1] = _mm_unpackhi_epi16(sig[2], u); sig[2] = u; } static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, __m128i *y0, __m128i *y1) { const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); __m128i x0 = _mm_madd_epi16(sig[0], *f); __m128i x1 = _mm_madd_epi16(sig[1], *f); x0 = _mm_add_epi32(x0, rounding); x1 = _mm_add_epi32(x1, rounding); *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); } static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, const __m128i *mask, uint16_t *dst) { __m128i res = _mm_packus_epi32(*y0, *y1); res = _mm_min_epi16(res, *mask); _mm_storeu_si128((__m128i *)dst, res); } static void aom_highbd_filter_block1d8_v2_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i signal[3], res0, res1; const __m128i max = _mm_set1_epi16((1 << bd) - 1); __m128i ff; pack_8x1_2t_filter(filter, &ff); pack_8x2_init(src_ptr, signal); do { pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); filter_8_2t_pixels(signal, &ff, &res0, &res1); store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); src_ptr += src_pitch; dst_ptr += dst_pitch; height -= 1; } while (height > 0); } void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const int16_t *, int); void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const int16_t *, int); void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const int16_t *, int); void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const int16_t *, int); #define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 #define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 #define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 #define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2) HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2) #undef HIGHBD_FUNC aom-3.12.1/aom_dsp/x86/highbd_convolve_sse2.c000066400000000000000000000374511477627663500207220ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <emmintrin.h> #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/convolve.h" // ----------------------------------------------------------------------------- static void aom_highbd_filter_block1d4_v4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg34_lo; __m128i srcReg45_lo, srcReg56_lo; __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; __m128i resReg23_45_lo, resReg34_56_lo; __m128i resReg23_45, resReg34_56; __m128i addFilterReg64, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; const __m128i max = _mm_set1_epi16((1 << bd) - 1); addFilterReg64 = _mm_set1_epi32(64); filtersReg = _mm_loadu_si128((const __m128i *)filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 // multiply the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = dst_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); for (i = height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); // shift by 7 bit each 32 bit resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); // shrink to 16 bit each 32 bits, the first lane contains the first // convolve result and the second lane contains the second convolve // result resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); resReg23_45 = _mm_min_epi16(resReg23_45, max); resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); resReg34_56 = _mm_min_epi16(resReg34_56, max); src_ptr += src_stride; _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); dst_ptr += dst_stride; // save part of the registers for next strides srcReg23_lo = srcReg45_lo; srcReg34_lo = srcReg56_lo; srcReg4 = srcReg6; } } static void aom_highbd_filter_block1d4_h4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter,
int bd) { __m128i filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; __m128i srcRegFilt32b1_1; __m128i srcReg32b1; unsigned int i; src_ptr -= 3; addFilterReg64 = _mm_set1_epi32(64); filtersReg = _mm_loadu_si128((const __m128i *)filter); const __m128i max = _mm_set1_epi16((1 << bd) - 1); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 for (i = height; i > 0; i -= 1) { srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); ss_23 = _mm_madd_epi16(ss_23, secondFilters); ss_45 = _mm_madd_epi16(ss_45, thirdFilters); srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); // shift by 7 bit each 32 bit srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); src_ptr += src_pitch; _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); dst_ptr += dst_pitch; } } static void aom_highbd_filter_block1d8_v4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; __m128i resReg23_45, resReg34_56; __m128i addFilterReg64, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; const __m128i max = _mm_set1_epi16((1 << bd) - 1); addFilterReg64 = _mm_set1_epi32(64); filtersReg = _mm_loadu_si128((const __m128i *)filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 // multiply the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = dst_pitch << 1; srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); for (i = height; i > 1; i -= 2) { srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); srcReg6 =
_mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); // multiply 2 adjacent elements with the filter and add the result resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); // shift by 7 bit each 32 bit resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); // shrink to 16 bit each 32 bits, the first lane contains the first // convolve result and the second lane contains the second convolve // result resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); resReg23_45 = _mm_min_epi16(resReg23_45, max); resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); resReg34_56 = _mm_min_epi16(resReg34_56, max); src_ptr += src_stride; _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); dst_ptr += dst_stride; // save part of the registers for next strides srcReg23_lo = srcReg45_lo; srcReg23_hi = srcReg45_hi; srcReg34_lo = srcReg56_lo; srcReg34_hi = srcReg56_hi; srcReg4 = srcReg6; } } static void aom_highbd_filter_block1d8_h4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i filtersReg; __m128i addFilterReg64; __m128i secondFilters, thirdFilters; __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; __m128i srcReg32b1, srcReg32b2; unsigned int i; src_ptr -= 3; addFilterReg64 = _mm_set1_epi32(64); filtersReg = _mm_loadu_si128((const __m128i *)filter); const __m128i max = _mm_set1_epi16((1 << bd) - 1); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 for (i = height; i > 0; i -= 1) { srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); __m128i d2 =
_mm_madd_epi16(ss_4, thirdFilters); srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); d1 = _mm_madd_epi16(ss_3, secondFilters); d2 = _mm_madd_epi16(ss_5, thirdFilters); srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); // shift by 7 bit each 32 bit res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); res_lo_1 = _mm_srai_epi32(res_lo_1, 7); res_hi_1 = _mm_srai_epi32(res_hi_1, 7); srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); src_ptr += src_pitch; _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); dst_ptr += dst_pitch; } } static void aom_highbd_filter_block1d16_v4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } static void aom_highbd_filter_block1d16_h4_sse2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, height, filter, bd); aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), dst_pitch, height, filter, bd); } // From aom_dsp/x86/aom_high_subpixel_8t_sse2.asm highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; // From aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, // const int16_t *filter_x, // int x_step_q4, // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); // void aom_highbd_convolve8_vert_sse2(const uint8_t *src, // ptrdiff_t src_stride, // uint8_t *dst, // ptrdiff_t dst_stride, // const int16_t *filter_x, // int x_step_q4, // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) aom-3.12.1/aom_dsp/x86/highbd_convolve_ssse3.c000066400000000000000000000416421477627663500211030ustar00rootroot00000000000000/* * Copyright 
(c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <tmmintrin.h> #include <assert.h> #include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_common_intrin.h" void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); if (filter_params_y->taps == 12) { __m128i s[24], coeffs_y[6]; prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; /* Vertical filter */ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride)); __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride)); s[0] = _mm_unpacklo_epi16(s0, s1); s[1] = _mm_unpacklo_epi16(s2, s3); s[2] = _mm_unpacklo_epi16(s4, s5); s[3] = _mm_unpacklo_epi16(s6, s7); s[4] = _mm_unpacklo_epi16(s8, s9); s[6] = _mm_unpackhi_epi16(s0, s1); s[7] = _mm_unpackhi_epi16(s2, s3); s[8] = _mm_unpackhi_epi16(s4, s5); s[9] = _mm_unpackhi_epi16(s6, s7); s[10] = _mm_unpackhi_epi16(s8, s9); s[12] = _mm_unpacklo_epi16(s1, s2); s[13] = _mm_unpacklo_epi16(s3, s4); s[14] = _mm_unpacklo_epi16(s5, s6); s[15] = _mm_unpacklo_epi16(s7, s8); s[16] = _mm_unpacklo_epi16(s9, s10); s[18] = _mm_unpackhi_epi16(s1, s2); s[19] = _mm_unpackhi_epi16(s3, s4); s[20] = _mm_unpackhi_epi16(s5, s6); s[21] = _mm_unpackhi_epi16(s7, s8); s[22] = _mm_unpackhi_epi16(s9, s10); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride)); __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride)); s[5] = _mm_unpacklo_epi16(s10, s11); s[11] = _mm_unpackhi_epi16(s10, s11); s[17] = _mm_unpacklo_epi16(s11, s12); s[23] = _mm_unpackhi_epi16(s11, s12); const __m128i res_a0 = convolve_12tap(s, coeffs_y); __m128i res_a_round0 = _mm_sra_epi32( _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y); __m128i res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); if (w
- j > 4) { const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y); __m128i res_b_round0 = _mm_sra_epi32( _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y); __m128i res_b_round1 = _mm_sra_epi32( _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); res_16bit0 = _mm_max_epi16(res_16bit0, zero); __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); res_16bit1 = _mm_max_epi16(res_16bit1, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_16bit1); } else if (w == 4) { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_a_round1); } else { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res_a_round0); *((int *)(&dst[i * dst_stride + j + dst_stride])) = _mm_cvtsi128_si32(res_a_round1); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[3] = s[4]; s[4] = s[5]; s[6] = s[7]; s[7] = s[8]; s[8] = s[9]; s[9] = s[10]; s[10] = s[11]; s[12] = s[13]; s[13] = s[14]; s[14] = s[15]; s[15] = s[16]; s[16] = s[17]; s[18] = s[19]; s[19] = s[20]; s[20] = s[21]; s[21] = s[22]; s[22] = s[23]; s10 = s12; } } } else { __m128i s[16], coeffs_y[4]; prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; /* Vertical filter */ { __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); s[0] = _mm_unpacklo_epi16(s0, s1); s[1] = _mm_unpacklo_epi16(s2, s3); s[2] = _mm_unpacklo_epi16(s4, s5); s[4] = _mm_unpackhi_epi16(s0, s1); s[5] = _mm_unpackhi_epi16(s2, s3); s[6] = _mm_unpackhi_epi16(s4, s5); s[0 + 8] = _mm_unpacklo_epi16(s1, s2); s[1 + 8] = _mm_unpacklo_epi16(s3, s4); s[2 + 8] = _mm_unpacklo_epi16(s5, s6); s[4 + 8] = _mm_unpackhi_epi16(s1, s2); s[5 + 8] = _mm_unpackhi_epi16(s3, s4); s[6 + 8] = _mm_unpackhi_epi16(s5, s6); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); s[3] = _mm_unpacklo_epi16(s6, s7); s[7] = _mm_unpackhi_epi16(s6, s7); s[3 + 8] = _mm_unpacklo_epi16(s7, s8); s[7 + 8] = _mm_unpackhi_epi16(s7, s8); const __m128i res_a0 = 
convolve(s, coeffs_y); __m128i res_a_round0 = _mm_sra_epi32( _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); const __m128i res_a1 = convolve(s + 8, coeffs_y); __m128i res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); if (w - j > 4) { const __m128i res_b0 = convolve(s + 4, coeffs_y); __m128i res_b_round0 = _mm_sra_epi32( _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); __m128i res_b_round1 = _mm_sra_epi32( _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); res_16bit0 = _mm_max_epi16(res_16bit0, zero); __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); res_16bit1 = _mm_max_epi16(res_16bit1, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_16bit1); } else if (w == 4) { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_a_round1); } else { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res_a_round0); *((int *)(&dst[i * dst_stride + j + dst_stride])) = _mm_cvtsi128_si32(res_a_round1); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; s[0 + 8] = s[1 + 8]; s[1 + 8] = s[2 + 8]; s[2 + 8] = s[3 + 8]; s[4 + 8] = s[5 + 8]; s[5 + 8] = s[6 + 8]; s[6 + 8] = s[7 + 8]; s6 = s8; } } } } } void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); const __m128i round_const_x = _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m128i zero = _mm_setzero_si128(); if (filter_params_x->taps == 12) { __m128i s[6], coeffs_x[6]; prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < h; i += 1) { const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i row01 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); const __m128i row02 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]); // even pixels s[0] = _mm_alignr_epi8(row01, row00, 0); s[1] = _mm_alignr_epi8(row01, row00, 4); s[2] = _mm_alignr_epi8(row01, row00, 8); s[3] = _mm_alignr_epi8(row01, row00, 12); s[4] = _mm_alignr_epi8(row02, row01, 0); s[5] = _mm_alignr_epi8(row02, row01, 4); __m128i res_even = convolve_12tap(s, coeffs_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), round_shift_bits); // odd pixels s[0] = _mm_alignr_epi8(row01, row00, 2); s[1] = _mm_alignr_epi8(row01, row00, 6); s[2] = _mm_alignr_epi8(row01, row00, 10); s[3] = _mm_alignr_epi8(row01, row00, 14); s[4] = _mm_alignr_epi8(row02, row01, 2); s[5] = _mm_alignr_epi8(row02, row01, 6); __m128i res_odd = convolve_12tap(s, coeffs_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), round_shift_bits); __m128i res_even1 = _mm_packs_epi32(res_even, res_even); __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); res = _mm_min_epi16(res, clip_pixel); res = _mm_max_epi16(res, zero); if (w - j > 4) { _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); } else if (w == 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); } else { *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); } } } } } else { __m128i s[4], coeffs_x[4]; prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < h; i += 1) { const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i row01 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); // even pixels s[0] = _mm_alignr_epi8(row01, row00, 0); s[1] = _mm_alignr_epi8(row01, row00, 4); s[2] = _mm_alignr_epi8(row01, row00, 8); s[3] = _mm_alignr_epi8(row01, row00, 12); __m128i res_even = convolve(s, coeffs_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm_alignr_epi8(row01, row00, 2); s[1] = _mm_alignr_epi8(row01, row00, 6); s[2] = _mm_alignr_epi8(row01, row00, 10); s[3] = _mm_alignr_epi8(row01, row00, 14); __m128i res_odd = convolve(s, coeffs_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), round_shift_bits); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), round_shift_bits); __m128i res_even1 = _mm_packs_epi32(res_even, res_even); __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); res = _mm_min_epi16(res, clip_pixel); res = _mm_max_epi16(res, zero); if (w - j > 4) { _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); } else if (w == 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); } else { *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); } } } } } } 
aom-3.12.1/aom_dsp/x86/highbd_intrapred_asm_sse2.asm000066400000000000000000000173101477627663500222450ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_4: times 8 dw 4 pw_8: times 8 dw 8 pw_16: times 4 dd 16 pw_32: times 4 dd 32 SECTION .text INIT_XMM sse2 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset GET_GOT goffsetq movq m0, [aboveq] movq m2, [leftq] paddw m0, m2 pshuflw m1, m0, 0xe paddw m0, m1 pshuflw m1, m0, 0x1 paddw m0, m1 paddw m0, [GLOBAL(pw_4)] psraw m0, 3 pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq*2], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq*2], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] mova m2, [leftq] DEFINE_ARGS dst, stride, stride3, one mov oned, 0x00010001 lea stride3q, [strideq*3] movd m3, oned pshufd m3, m3, 0x0 paddw m0, m2 pmaddwd m0, m3 packssdw m0, m1 pmaddwd m0, m3 packssdw m0, m1 pmaddwd m0, m3 paddw m0, [GLOBAL(pw_8)] psrlw m0, 4 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 mova [dstq ], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*4 ], m0 mova [dstq+stride3q*2], m0 lea dstq, [dstq+strideq*8] mova [dstq ], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*4 ], m0 mova [dstq+stride3q*2], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] mova m3, [aboveq+16] mova m2, [leftq] mova m4, [leftq+16] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 paddw m0, m2 paddw m0, m3 paddw m0, m4 movhlps m2, m0 paddw m0, m2 punpcklwd m0, m1 movhlps m2, m0 paddd m0, m2 punpckldq m0, m1 movhlps m2, m0 paddd m0, m2 paddd m0, [GLOBAL(pw_16)] psrad m0, 5 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 .loop: mova [dstq ], m0 mova [dstq +16], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2 +16], m0 mova [dstq+strideq*4 ], m0 mova [dstq+strideq*4 +16], m0 mova [dstq+stride3q*2 ], m0 mova [dstq+stride3q*2+16], m0 lea dstq, [dstq+strideq*8] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset GET_GOT goffsetq mova m0, [aboveq] mova m2, [aboveq+16] mova m3, [aboveq+32] mova m4, [aboveq+48] paddw m0, m2 paddw m3, m4 mova m2, [leftq] mova m4, [leftq+16] mova m5, [leftq+32] mova m6, [leftq+48] paddw m2, m4 paddw m5, m6 paddw m0, m3 paddw m2, m5 pxor m1, m1 paddw m0, m2 DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 movhlps m2, m0 paddw m0, m2 punpcklwd m0, m1 movhlps m2, m0 paddd m0, m2 punpckldq m0, m1 movhlps m2, m0 paddd m0, m2 paddd m0, [GLOBAL(pw_32)] psrad m0, 6 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 .loop: mova [dstq ], m0 mova [dstq +16 ], m0 mova [dstq +32 ], m0 mova [dstq +48 ], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16 ], m0 mova [dstq+strideq*2+32 ], m0 mova [dstq+strideq*2+48 ], m0 mova [dstq+strideq*4 
], m0 mova [dstq+strideq*4+16 ], m0 mova [dstq+strideq*4+32 ], m0 mova [dstq+strideq*4+48 ], m0 mova [dstq+stride3q*2 ], m0 mova [dstq+stride3q*2 +16], m0 mova [dstq+stride3q*2 +32], m0 mova [dstq+stride3q*2 +48], m0 lea dstq, [dstq+strideq*8] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above movq m0, [aboveq] movq [dstq ], m0 movq [dstq+strideq*2], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq*2], m0 RET INIT_XMM sse2 cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above mova m0, [aboveq] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] mova [dstq ], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*4 ], m0 mova [dstq+stride3q*2], m0 lea dstq, [dstq+strideq*8] mova [dstq ], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*4 ], m0 mova [dstq+stride3q*2], m0 RET INIT_XMM sse2 cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above mova m0, [aboveq] mova m1, [aboveq+16] DEFINE_ARGS dst, stride, stride3, nlines4 lea stride3q, [strideq*3] mov nlines4d, 4 .loop: mova [dstq ], m0 mova [dstq +16], m1 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2 +16], m1 mova [dstq+strideq*4 ], m0 mova [dstq+strideq*4 +16], m1 mova [dstq+stride3q*2 ], m0 mova [dstq+stride3q*2+16], m1 lea dstq, [dstq+strideq*8] dec nlines4d jnz .loop REP_RET INIT_XMM sse2 cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above mova m0, [aboveq] mova m1, [aboveq+16] mova m2, [aboveq+32] mova m3, [aboveq+48] DEFINE_ARGS dst, stride, stride3, nlines4 lea stride3q, [strideq*3] mov nlines4d, 8 .loop: mova [dstq ], m0 mova [dstq +16], m1 mova [dstq +32], m2 mova [dstq +48], m3 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2 +16], m1 mova [dstq+strideq*2 +32], m2 mova [dstq+strideq*2 +48], m3 mova [dstq+strideq*4 ], m0 mova [dstq+strideq*4 +16], m1 mova [dstq+strideq*4 +32], m2 mova [dstq+strideq*4 +48], m3 mova [dstq+stride3q*2 ], m0 mova [dstq+stride3q*2 +16], m1 mova [dstq+stride3q*2 +32], m2 mova [dstq+stride3q*2 +48], m3 lea dstq, [dstq+strideq*8] dec nlines4d jnz .loop REP_RET aom-3.12.1/aom_dsp/x86/highbd_intrapred_sse2.c000066400000000000000000001113311477627663500210450ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- // H_PRED void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); (void)above; (void)bd; _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); } void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); dst += stride << 2; left += 4; aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); } void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); (void)above; (void)bd; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); } void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); (void)above; (void)bd; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); dst += stride; _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); } void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); dst += stride << 3; left += 8; aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); } static inline void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpacklo_epi64(*row, 
*row); _mm_store_si128((__m128i *)*dst, val); _mm_store_si128((__m128i *)(*dst + 8), val); *dst += stride; } static inline void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpackhi_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); _mm_store_si128((__m128i *)(*dst + 8), val); *dst += stride; } static inline void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, const uint16_t *left) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); h_store_16_unpacklo(&dst, stride, &row0); h_store_16_unpacklo(&dst, stride, &row1); h_store_16_unpacklo(&dst, stride, &row2); h_store_16_unpacklo(&dst, stride, &row3); h_store_16_unpackhi(&dst, stride, &row4); h_store_16_unpackhi(&dst, stride, &row5); h_store_16_unpackhi(&dst, stride, &row6); h_store_16_unpackhi(&dst, stride, &row7); } void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)bd; h_predictor_16x8(dst, stride, left); } void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { int i; (void)above; (void)bd; for (i = 0; i < 2; i++, left += 8) { h_predictor_16x8(dst, stride, left); dst += stride << 3; } } void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { int i; (void)above; (void)bd; for (i = 0; i < 4; i++, left += 8) { h_predictor_16x8(dst, stride, left); dst += stride << 3; } } static inline void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpacklo_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); _mm_store_si128((__m128i *)(*dst + 8), val); _mm_store_si128((__m128i *)(*dst + 16), val); _mm_store_si128((__m128i *)(*dst + 24), val); *dst += stride; } static inline void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, const __m128i *row) { const __m128i val = _mm_unpackhi_epi64(*row, *row); _mm_store_si128((__m128i *)(*dst), val); _mm_store_si128((__m128i *)(*dst + 8), val); _mm_store_si128((__m128i *)(*dst + 16), val); _mm_store_si128((__m128i *)(*dst + 24), val); *dst += stride; } static inline void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, const uint16_t *left) { const __m128i left_u16 = _mm_load_si128((const __m128i *)left); const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); h_store_32_unpacklo(&dst, stride, &row0); h_store_32_unpacklo(&dst, stride, &row1); h_store_32_unpacklo(&dst, stride, &row2); h_store_32_unpacklo(&dst, stride, &row3); h_store_32_unpackhi(&dst, stride, &row4); 
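// The H predictors here replicate each left-edge sample across its output
// row: the shuffle immediates 0x00/0x55/0xaa/0xff broadcast word 0..3 of the
// loaded left[] vector within each 64-bit half, and unpacklo/unpackhi_epi64
// widen that to a full 128-bit row before it is stored across the block
// width. A rough scalar sketch of one 8-row helper (illustration only;
// block_width is 16 or 32 in these helpers):
//   for (int r = 0; r < 8; ++r)
//     for (int c = 0; c < block_width; ++c) dst[r * stride + c] = left[r];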
h_store_32_unpackhi(&dst, stride, &row5); h_store_32_unpackhi(&dst, stride, &row6); h_store_32_unpackhi(&dst, stride, &row7); } void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { int i; (void)above; (void)bd; for (i = 0; i < 2; i++, left += 8) { h_predictor_32x8(dst, stride, left); dst += stride << 3; } } void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { int i; (void)above; (void)bd; for (i = 0; i < 4; i++, left += 8) { h_predictor_32x8(dst, stride, left); dst += stride << 3; } } // ----------------------------------------------------------------------------- // DC_TOP, DC_LEFT, DC_128 // 4x4 static inline __m128i dc_sum_4(const uint16_t *ref) { const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); const __m128i a = _mm_add_epi16(_dcba, _xxdc); return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); } static inline void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const __m128i *dc) { const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); int i; for (i = 0; i < 4; ++i, dst += stride) { _mm_storel_epi64((__m128i *)dst, dc_dup); } } void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i two = _mm_cvtsi32_si128(2); const __m128i sum = dc_sum_4(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); (void)above; (void)bd; dc_store_4x4(dst, stride, &dc); } void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i two = _mm_cvtsi32_si128(2); const __m128i sum = dc_sum_4(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, &dc); } void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_4x4(dst, stride, &dc_dup); } // ----------------------------------------------------------------------------- // 4x8 static inline void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, const __m128i *dc) { const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); int i; for (i = 0; i < 8; ++i, dst += stride) { _mm_storel_epi64((__m128i *)dst, dc_dup); } } // Shared with DC 8xh static inline __m128i dc_sum_8(const uint16_t *ref) { const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); const __m128i a = _mm_add_epi16(_dcba, _xxdc); return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); } void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i sum = dc_sum_8(left); const __m128i four = _mm_cvtsi32_si128(4); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); (void)above; (void)bd; dc_store_4x8(dst, stride, &dc); } void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i two = _mm_cvtsi32_si128(2); const __m128i sum = dc_sum_4(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); (void)left; (void)bd; dc_store_4x8(dst, 
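// DC_LEFT averages only the left column: for this 4x8 block the eight left
// samples are summed and rounded, dc = (sum + 4) >> 3. The DC_TOP variants
// mirror this with the above row, e.g. (sum + 2) >> 2 for four samples.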
stride, &dc); } void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_4x8(dst, stride, &dc_dup); } // ----------------------------------------------------------------------------- // 8xh static inline void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); int i; for (i = 0; i < height; ++i, dst += stride) { _mm_store_si128((__m128i *)dst, dc_dup); } } // ----------------------------------------------------------------------------- // DC_TOP static inline void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, int height, const uint16_t *above) { const __m128i four = _mm_cvtsi32_si128(4); const __m128i sum = dc_sum_8(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); dc_store_8xh(dst, stride, height, &dc); } void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; dc_top_predictor_8xh(dst, stride, 4, above); } void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; dc_top_predictor_8xh(dst, stride, 8, above); } void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; dc_top_predictor_8xh(dst, stride, 16, above); } // ----------------------------------------------------------------------------- // DC_LEFT void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i two = _mm_cvtsi32_si128(2); const __m128i sum = dc_sum_4(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); (void)above; (void)bd; dc_store_8xh(dst, stride, 4, &dc); } void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i four = _mm_cvtsi32_si128(4); const __m128i sum = dc_sum_8(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); (void)above; (void)bd; dc_store_8xh(dst, stride, 8, &dc); } // Shared with DC 16xh static inline __m128i dc_sum_16(const uint16_t *ref) { const __m128i sum_lo = dc_sum_8(ref); const __m128i sum_hi = dc_sum_8(ref + 8); return _mm_add_epi16(sum_lo, sum_hi); } void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)above; (void)bd; dc_store_8xh(dst, stride, 16, &dc); } // ----------------------------------------------------------------------------- // DC_128 static inline void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, int height, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); dc_store_8xh(dst, stride, height, &dc_dup); } void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)left; dc_128_predictor_8xh(dst, stride, 4, bd); } void 
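// DC_128 ignores both borders and fills the block with the mid-grey value
// for the bit depth, 1 << (bd - 1): 128 for 8-bit, 512 for 10-bit and 2048
// for 12-bit input.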
aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)left; dc_128_predictor_8xh(dst, stride, 8, bd); } void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)above; (void)left; dc_128_predictor_8xh(dst, stride, 16, bd); } // ----------------------------------------------------------------------------- // 16xh static inline void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); int i; for (i = 0; i < height; ++i, dst += stride) { _mm_store_si128((__m128i *)dst, dc_dup); _mm_store_si128((__m128i *)(dst + 8), dc_dup); } } // ----------------------------------------------------------------------------- // DC_LEFT void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i four = _mm_cvtsi32_si128(4); const __m128i sum = dc_sum_8(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); (void)above; (void)bd; dc_store_16xh(dst, stride, 8, &dc); } void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)above; (void)bd; dc_store_16xh(dst, stride, 16, &dc); } // Shared with 32xh static inline __m128i dc_sum_32(const uint16_t *ref) { const __m128i zero = _mm_setzero_si128(); const __m128i sum_a = dc_sum_16(ref); const __m128i sum_b = dc_sum_16(ref + 16); // 12 bit bd will outrange, so expand to 32 bit before adding final total return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), _mm_unpacklo_epi16(sum_b, zero)); } void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i sixteen = _mm_cvtsi32_si128(16); const __m128i sum = dc_sum_32(left); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); (void)above; (void)bd; dc_store_16xh(dst, stride, 32, &dc); } // ----------------------------------------------------------------------------- // DC_TOP void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)left; (void)bd; dc_store_16xh(dst, stride, 8, &dc); } void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)left; (void)bd; dc_store_16xh(dst, stride, 16, &dc); } void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(above); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)left; (void)bd; dc_store_16xh(dst, stride, 32, &dc); } // ----------------------------------------------------------------------------- // DC_128 void 
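// Note on dc_sum_32() above: 32 samples at 12-bit can sum to
// 32 * 4095 = 131040, which no longer fits in a 16-bit lane (max 65535), so
// the two 16-sample partial sums are widened to 32-bit lanes before the
// final add. Sums of 16 or fewer samples (16 * 4095 = 65520) still fit in
// 16 bits.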
aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_16xh(dst, stride, 8, &dc_dup); } void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_16xh(dst, stride, 16, &dc_dup); } void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_16xh(dst, stride, 32, &dc_dup); } // ----------------------------------------------------------------------------- // 32xh static inline void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, const __m128i *dc) { const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); int i; for (i = 0; i < height; ++i, dst += stride) { _mm_store_si128((__m128i *)dst, dc_dup); _mm_store_si128((__m128i *)(dst + 8), dc_dup); _mm_store_si128((__m128i *)(dst + 16), dc_dup); _mm_store_si128((__m128i *)(dst + 24), dc_dup); } } void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i eight = _mm_cvtsi32_si128(8); const __m128i sum = dc_sum_16(left); const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); (void)above; (void)bd; dc_store_32xh(dst, stride, 16, &dc); } void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i sixteen = _mm_cvtsi32_si128(16); const __m128i sum = dc_sum_32(left); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); (void)above; (void)bd; dc_store_32xh(dst, stride, 32, &dc); } void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i sixteen = _mm_cvtsi32_si128(16); const __m128i sum = dc_sum_32(above); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); (void)left; (void)bd; dc_store_32xh(dst, stride, 16, &dc); } void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_32xh(dst, stride, 16, &dc_dup); } void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i sixteen = _mm_cvtsi32_si128(16); const __m128i sum = dc_sum_32(above); const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); (void)left; (void)bd; dc_store_32xh(dst, stride, 32, &dc); } void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); (void)above; (void)left; dc_store_32xh(dst, stride, 32, &dc_dup); } // ----------------------------------------------------------------------------- // V_PRED void 
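// The V predictors copy the row of above[] samples into every row of the
// block: the above row is loaded into registers once and then stored for
// each output row. In effect (scalar sketch): for every row r, copy
// above[0 .. block_width - 1] to dst + r * stride.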
aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); int i; for (i = 0; i < 2; ++i) { _mm_storel_epi64((__m128i *)dst, above_u16); _mm_storel_epi64((__m128i *)(dst + stride), above_u16); _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); dst += stride << 2; } } void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above_u16 = _mm_load_si128((const __m128i *)above); _mm_store_si128((__m128i *)dst, above_u16); _mm_store_si128((__m128i *)(dst + stride), above_u16); _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); } void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above_u16 = _mm_load_si128((const __m128i *)above); int i; for (i = 0; i < 4; ++i) { _mm_store_si128((__m128i *)dst, above_u16); _mm_store_si128((__m128i *)(dst + stride), above_u16); _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); dst += stride << 2; } } void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); int i; for (i = 0; i < 2; ++i) { _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; } } void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); int i; for (i = 0; i < 8; ++i) { _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); dst += stride; } } void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)left; (void)bd; const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); int i; for (i = 0; i < 4; ++i) { _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); _mm_store_si128((__m128i *)(dst + 16), 
above2_u16); _mm_store_si128((__m128i *)(dst + 24), above3_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); _mm_store_si128((__m128i *)(dst + 16), above2_u16); _mm_store_si128((__m128i *)(dst + 24), above3_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); _mm_store_si128((__m128i *)(dst + 16), above2_u16); _mm_store_si128((__m128i *)(dst + 24), above3_u16); dst += stride; _mm_store_si128((__m128i *)dst, above0_u16); _mm_store_si128((__m128i *)(dst + 8), above1_u16); _mm_store_si128((__m128i *)(dst + 16), above2_u16); _mm_store_si128((__m128i *)(dst + 24), above3_u16); dst += stride; } } // ----------------------------------------------------------------------------- // DC_PRED void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; const __m128i sum_above = dc_sum_4(above); const __m128i sum_left = dc_sum_8(left); const __m128i sum = _mm_add_epi16(sum_above, sum_left); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 >>= 16; sum32 += 6; sum32 /= 12; const __m128i row = _mm_set1_epi16((int16_t)sum32); int i; for (i = 0; i < 4; ++i) { _mm_storel_epi64((__m128i *)dst, row); dst += stride; _mm_storel_epi64((__m128i *)dst, row); dst += stride; } } void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; const __m128i sum_left = dc_sum_4(left); const __m128i sum_above = dc_sum_8(above); const __m128i sum = _mm_add_epi16(sum_above, sum_left); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 >>= 16; sum32 += 6; sum32 /= 12; const __m128i row = _mm_set1_epi16((int16_t)sum32); _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); } void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; __m128i sum_left = dc_sum_16(left); __m128i sum_above = dc_sum_8(above); const __m128i zero = _mm_setzero_si128(); sum_left = _mm_unpacklo_epi16(sum_left, zero); sum_above = _mm_unpacklo_epi16(sum_above, zero); const __m128i sum = _mm_add_epi32(sum_left, sum_above); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 += 12; sum32 /= 24; const __m128i row = _mm_set1_epi16((int16_t)sum32); int i; for (i = 0; i < 4; ++i) { _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); dst += stride; _mm_store_si128((__m128i *)dst, row); dst += stride; } } void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; __m128i sum_left = dc_sum_8(left); __m128i sum_above = dc_sum_16(above); const __m128i zero = _mm_setzero_si128(); sum_left = _mm_unpacklo_epi16(sum_left, zero); sum_above = _mm_unpacklo_epi16(sum_above, zero); const __m128i sum = _mm_add_epi32(sum_left, sum_above); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 += 12; sum32 /= 24; const __m128i row = _mm_set1_epi16((int16_t)sum32); int i; for (i = 0; i < 2; ++i) { _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; 
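// Rectangular DC: the prediction is the rounded mean of the W above and H
// left samples, dc = (sum + (W + H) / 2) / (W + H). That is the constant
// pair used in each of these functions, e.g. "+ 12, / 24" for 16x8 and 8x16,
// "+ 6, / 12" for 8x4 and 4x8, "+ 24, / 48" for 32x16 and 16x32.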
_mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; } } void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; __m128i sum_left = dc_sum_32(left); __m128i sum_above = dc_sum_16(above); const __m128i zero = _mm_setzero_si128(); sum_above = _mm_unpacklo_epi16(sum_above, zero); const __m128i sum = _mm_add_epi32(sum_left, sum_above); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 += 24; sum32 /= 48; const __m128i row = _mm_set1_epi16((int16_t)sum32); int i; for (i = 0; i < 8; ++i) { _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); dst += stride; } } void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { (void)bd; __m128i sum_left = dc_sum_16(left); __m128i sum_above = dc_sum_32(above); const __m128i zero = _mm_setzero_si128(); sum_left = _mm_unpacklo_epi16(sum_left, zero); const __m128i sum = _mm_add_epi32(sum_left, sum_above); uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); sum32 += 24; sum32 /= 48; const __m128i row = _mm_set1_epi16((int16_t)sum32); int i; for (i = 0; i < 4; ++i) { _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); _mm_store_si128((__m128i *)(dst + 16), row); _mm_store_si128((__m128i *)(dst + 24), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); _mm_store_si128((__m128i *)(dst + 16), row); _mm_store_si128((__m128i *)(dst + 24), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); _mm_store_si128((__m128i *)(dst + 16), row); _mm_store_si128((__m128i *)(dst + 24), row); dst += stride; _mm_store_si128((__m128i *)dst, row); _mm_store_si128((__m128i *)(dst + 8), row); _mm_store_si128((__m128i *)(dst + 16), row); _mm_store_si128((__m128i *)(dst + 24), row); dst += stride; } } aom-3.12.1/aom_dsp/x86/highbd_loopfilter_avx2.c000066400000000000000000000055601477627663500212460ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/common_avx2.h" #include "aom_dsp/x86/lpf_common_sse2.h" #include "aom/aom_integer.h" void aom_highbd_lpf_horizontal_14_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_14_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_horizontal_4_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_horizontal_8_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_4_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_8_dual_avx2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } aom-3.12.1/aom_dsp/x86/highbd_loopfilter_sse2.c000066400000000000000000001773251477627663500212530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include // SSE2 #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/lpf_common_sse2.h" static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, __m128i *pixel) { *pixel = _mm_min_epi16(*pixel, *max); *pixel = _mm_max_epi16(*pixel, *min); } static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); } static inline void get_limit(const uint8_t *bl, const uint8_t *l, const uint8_t *t, int bd, __m128i *blt, __m128i *lt, __m128i *thr, __m128i *t80_out) { const int shift = bd - 8; const __m128i zero = _mm_setzero_si128(); __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); *blt = _mm_slli_epi16(x, shift); x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); *lt = _mm_slli_epi16(x, shift); x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); *thr = _mm_slli_epi16(x, shift); *t80_out = _mm_set1_epi16(1 << (bd - 1)); } static inline void get_limit_dual( const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, __m128i *t80_out) { const int shift = bd - 8; const __m128i zero = _mm_setzero_si128(); __m128i x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); __m128i x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); x0 = _mm_unpacklo_epi64(x0, x1); *blt_out = _mm_slli_epi16(x0, shift); x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); x0 = _mm_unpacklo_epi64(x0, x1); *lt_out = _mm_slli_epi16(x0, shift); x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero); x0 = _mm_unpacklo_epi64(x0, x1); *thr_out = _mm_slli_epi16(x0, shift); *t80_out = _mm_set1_epi16(1 << (bd - 1)); } static inline void load_highbd_pixel(const uint16_t *s, int size, int pitch, __m128i *p, __m128i *q) { int i; for (i = 0; i < size; i++) { p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); } } static inline void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, const __m128i *l, const __m128i *bl, __m128i *mask) { __m128i abs_p0q0 = abs_diff16(p[0], q[0]); __m128i abs_p1q1 = abs_diff16(p[1], q[1]); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); int i; for (i = 1; i < 4; ++i) { max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); } max = _mm_subs_epu16(max, *l); *mask = _mm_cmpeq_epi16(max, zero); // return ~mask } static inline void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, __m128i *p1p0, __m128i *q1q0, __m128i *abs_p1p0, __m128i *l, __m128i *bl, __m128i *t, __m128i *hev, __m128i *mask) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; __m128i max, max01, h; *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); *q1q0 = 
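// The blimit/limit/thresh inputs are 8-bit-scale thresholds; get_limit()
// shifts them left by (bd - 8) so they compare directly against 10/12-bit
// samples, and t80 = 1 << (bd - 1) is the offset used to move pixels into a
// signed range for the filter arithmetic (subtracted before filtering and
// added back afterwards).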
_mm_unpackhi_epi64(pq[0], pq[1]); abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; // So taking maximums continues to work: max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); *abs_p1p0 = abs_diff16(pq[0], pq[1]); abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); // mask |= (abs(*p1 - *p0) > limit) * -1; // mask |= (abs(*q1 - *q0) > limit) * -1; h = _mm_subs_epu16(max01, *t); *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); // replicate for the further "merged variables" usage *hev = _mm_unpacklo_epi64(*hev, *hev); max = _mm_max_epi16(max, max01); int i; for (i = 2; i < x; ++i) { max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); } max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); max = _mm_subs_epu16(max, *l); *mask = _mm_cmpeq_epi16(max, zero); // ~mask } static inline void flat_mask_internal(const __m128i *th, const __m128i *pq, int start, int end, __m128i *flat) { int i; __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), abs_diff16(pq[start + 1], pq[0])); for (i = start + 2; i < end; ++i) { max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); } max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); __m128i ft; ft = _mm_subs_epu16(max, *th); const __m128i zero = _mm_setzero_si128(); *flat = _mm_cmpeq_epi16(ft, zero); } static inline void flat_mask_internal_dual(const __m128i *th, const __m128i *p, const __m128i *q, int start, int end, __m128i *flat) { int i; __m128i max = _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); for (i = start + 1; i < end; ++i) { max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); } __m128i ft; ft = _mm_subs_epu16(max, *th); const __m128i zero = _mm_setzero_si128(); *flat = _mm_cmpeq_epi16(ft, zero); } static inline void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, __m128i *flat2, int bd) { // check the distance 1,2,3 against 0 __m128i th = _mm_set1_epi16(1); th = _mm_slli_epi16(th, bd - 8); flat_mask_internal(&th, pq, 1, 4, flat); flat_mask_internal(&th, pq, 4, 7, flat2); } static inline void highbd_flat_mask4_dual_sse2(const __m128i *p, const __m128i *q, __m128i *flat, __m128i *flat2, int bd) { // check the distance 1,2,3 against 0 __m128i th = _mm_set1_epi16(1); th = _mm_slli_epi16(th, bd - 8); flat_mask_internal_dual(&th, p, q, 1, 4, flat); flat_mask_internal_dual(&th, p, q, 4, 7, flat2); } static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0, __m128i *t80, int bd) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i pmax = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); const __m128i pmin = _mm_subs_epi16(zero, *t80); const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); __m128i ps1ps0_work, qs1qs0_work, work; __m128i filt, filter2filter1, filter2filt, filter1filt; ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); pixel_clamp(&pmin, &pmax, &work); filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); filt = 
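// This is the narrow 4-tap filter: filter = clamp(ps1 - qs1) gated by hev,
// then filter = clamp(filter + 3 * (qs0 - ps0)) & mask. filter1 =
// clamp(filter + 4) >> 3 adjusts q0, filter2 = clamp(filter + 3) >> 3
// adjusts p0, and the outer taps p1/q1 move by (filter1 + 1) >> 1 only where
// hev is not set. highbd_filter4_dual_sse2() below spells out the same steps
// with separate p and q registers.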
_mm_subs_epi16(filt, work); filt = _mm_subs_epi16(filt, work); filt = _mm_subs_epi16(filt, work); // (aom_filter + 3 * (qs0 - ps0)) & mask pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, *mask); filt = _mm_unpacklo_epi64(filt, filt); filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ pixel_clamp(&pmin, &pmax, &filter2filter1); filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1); // filt >> 1 filt = _mm_adds_epi16(filt, one); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(*hev, filt); filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); pixel_clamp(&pmin, &pmax, &qs1qs0_work); pixel_clamp(&pmin, &pmax, &ps1ps0_work); *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); } static inline void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, __m128i *qs, const __m128i *mask, const __m128i *th, int bd, __m128i *t80) { __m128i ps0 = _mm_subs_epi16(p[0], *t80); __m128i ps1 = _mm_subs_epi16(p[1], *t80); __m128i qs0 = _mm_subs_epi16(q[0], *t80); __m128i qs1 = _mm_subs_epi16(q[1], *t80); const __m128i one = _mm_set1_epi16(1); const __m128i pmax = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); const __m128i zero = _mm_setzero_si128(); const __m128i pmin = _mm_subs_epi16(zero, *t80); __m128i filter = _mm_subs_epi16(ps1, qs1); pixel_clamp(&pmin, &pmax, &filter); // hev_filter __m128i hev; const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); h = _mm_subs_epu16(h, *th); const __m128i ffff = _mm_cmpeq_epi16(h, h); hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); filter = _mm_and_si128(filter, hev); const __m128i x = _mm_subs_epi16(qs0, ps0); filter = _mm_adds_epi16(filter, x); filter = _mm_adds_epi16(filter, x); filter = _mm_adds_epi16(filter, x); pixel_clamp(&pmin, &pmax, &filter); filter = _mm_and_si128(filter, *mask); const __m128i t3 = _mm_set1_epi16(3); const __m128i t4 = _mm_set1_epi16(4); __m128i filter1 = _mm_adds_epi16(filter, t4); __m128i filter2 = _mm_adds_epi16(filter, t3); pixel_clamp(&pmin, &pmax, &filter1); pixel_clamp(&pmin, &pmax, &filter2); filter1 = _mm_srai_epi16(filter1, 3); filter2 = _mm_srai_epi16(filter2, 3); qs0 = _mm_subs_epi16(qs0, filter1); pixel_clamp(&pmin, &pmax, &qs0); ps0 = _mm_adds_epi16(ps0, filter2); pixel_clamp(&pmin, &pmax, &ps0); qs[0] = _mm_adds_epi16(qs0, *t80); ps[0] = _mm_adds_epi16(ps0, *t80); filter = _mm_adds_epi16(filter1, one); filter = _mm_srai_epi16(filter, 1); filter = _mm_andnot_si128(hev, filter); qs1 = _mm_subs_epi16(qs1, filter); pixel_clamp(&pmin, &pmax, &qs1); ps1 = _mm_adds_epi16(ps1, filter); pixel_clamp(&pmin, &pmax, &ps1); qs[1] = _mm_adds_epi16(qs1, *t80); ps[1] = _mm_adds_epi16(ps1, *t80); } static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, const unsigned char *lt, const unsigned char *thr, int bd) { int i; const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i t80; get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); for (i = 0; i < 7; i++) { pq[i] = _mm_unpacklo_epi64(p[i], q[i]); } __m128i mask, hevhev; __m128i p1p0, q1q0, abs_p1p0; highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, 
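// hev ("high edge variance") is set where |p1 - p0| or |q1 - q0| exceeds
// thresh; on those lanes the p1/q1 difference feeds the filter directly and
// the extra smoothing of the outer taps is skipped. mask is the usual on/off
// decision built from blimit and limit.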
&limit, &blimit, &thresh, &hevhev, &mask); __m128i ps0ps1, qs0qs1; // filter4 highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); __m128i flat, flat2; highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); flat = _mm_and_si128(flat, mask); flat2 = _mm_and_si128(flat2, flat); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations // if flat ==0 then flat2 is zero as well and we don't need any calc below // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i flat_p[3], flat_q[3], flat_pq[3]; __m128i flat2_p[6], flat2_q[6]; __m128i flat2_pq[6]; __m128i sum_p6, sum_p3; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); sum_p6 = _mm_add_epi16(pq[6], pq[6]); sum_p3 = _mm_add_epi16(pq[3], pq[3]); sum_q = _mm_sub_epi16(sum_p_0, pq[5]); sum_p = _mm_sub_epi16(sum_p_0, q[5]); work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); work0_1 = _mm_add_epi16(sum_p6, _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); sum_lq = _mm_sub_epi16(sum_lp, pq[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); work0 = _mm_add_epi16(sum_p3, pq[1]); flat_p[1] = _mm_add_epi16(sum_lp, work0); flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, pq[1]); sum_p3 = _mm_add_epi16(sum_p3, pq[3]); work0 = _mm_add_epi16(sum_p3, pq[2]); flat_p[2] = _mm_add_epi16(sum_lp, work0); flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); int flat2_mask = (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); if (flat2_mask) { flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); flat2_q[0] = _mm_add_epi16( sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); flat2_p[1] = _mm_add_epi16(sum_p, work0_1); flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); sum_p = _mm_sub_epi16(sum_p, q[4]); sum_q = _mm_sub_epi16(sum_q, pq[4]); sum_p6 = _mm_add_epi16(sum_p6, pq[6]); work0 = _mm_add_epi16(sum_p6, _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); flat2_p[2] = _mm_add_epi16(sum_p, work0); flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); sum_p6 = _mm_add_epi16(sum_p6, pq[6]); sum_p = _mm_sub_epi16(sum_p, q[3]); sum_q = _mm_sub_epi16(sum_q, pq[3]); work0 = _mm_add_epi16(sum_p6, _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); flat2_p[3] = _mm_add_epi16(sum_p, work0); flat2_q[3] = _mm_add_epi16(sum_q, 
_mm_srli_si128(work0, 8)); flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); sum_p6 = _mm_add_epi16(sum_p6, pq[6]); sum_p = _mm_sub_epi16(sum_p, q[2]); sum_q = _mm_sub_epi16(sum_q, pq[2]); work0 = _mm_add_epi16(sum_p6, _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); flat2_p[4] = _mm_add_epi16(sum_p, work0); flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); sum_p6 = _mm_add_epi16(sum_p6, pq[6]); sum_p = _mm_sub_epi16(sum_p, q[1]); sum_q = _mm_sub_epi16(sum_q, pq[1]); work0 = _mm_add_epi16(sum_p6, _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); flat2_p[5] = _mm_add_epi16(sum_p, work0); flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); } // flat2 // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // highbd_filter8 pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); for (i = 0; i < 3; i++) { pq[i] = _mm_andnot_si128(flat, pq[i]); flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); pq[i] = _mm_or_si128(pq[i], flat_pq[i]); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if (flat2_mask) { for (i = 0; i < 6; i++) { pq[i] = _mm_andnot_si128(flat2, pq[i]); flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values } } } else { pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); } } void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { __m128i p[7], q[7], pq[7]; int i; for (i = 0; i < 7; i++) { p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); } highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); for (i = 0; i < 6; i++) { _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); } } static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, const uint8_t *thr1, int bd) { __m128i blimit, limit, thresh, t80; const __m128i zero = _mm_setzero_si128(); get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, &t80); __m128i mask; highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); __m128i flat, flat2; highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); flat = _mm_and_si128(flat, mask); flat2 = _mm_and_si128(flat2, flat); __m128i ps[2], qs[2]; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); // flat and wide flat calculations // if flat ==0 then flat2 is zero as well and we don't need any calc below // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i flat_p[3], flat_q[3]; __m128i flat2_p[6], flat2_q[6]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); 
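// The flat path keeps running window sums: flat_p/flat_q feed the filter8
// outputs (tap weights summing to 8, hence >> 3) and flat2_p/flat2_q feed
// the wide-filter outputs (weights summing to 16, hence >> 4). Each new
// output tap is produced incrementally by adding the sample that enters the
// window and subtracting the one that leaves, rather than recomputing the
// whole sum.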
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); sum_q = _mm_sub_epi16(sum_p_0, p[5]); __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); sum_p3 = _mm_add_epi16(sum_p3, p[3]); sum_q3 = _mm_add_epi16(sum_q3, q[3]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); int flat2_mask = (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); if (flat2_mask) { flat2_p[0] = _mm_srli_epi16( _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), _mm_add_epi16(p[1], q[0]))), 4); flat2_q[0] = _mm_srli_epi16( _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), _mm_add_epi16(p[0], q[1]))), 4); flat2_p[1] = _mm_srli_epi16( _mm_add_epi16( sum_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), 4); flat2_q[1] = _mm_srli_epi16( _mm_add_epi16( sum_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), 4); sum_p6 = _mm_add_epi16(sum_p6, p[6]); sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p = _mm_sub_epi16(sum_p, q[4]); sum_q = _mm_sub_epi16(sum_q, p[4]); flat2_p[2] = _mm_srli_epi16( _mm_add_epi16( sum_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), 4); flat2_q[2] = _mm_srli_epi16( _mm_add_epi16( sum_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), 4); sum_p6 = _mm_add_epi16(sum_p6, p[6]); sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p = _mm_sub_epi16(sum_p, q[3]); sum_q = _mm_sub_epi16(sum_q, p[3]); flat2_p[3] = _mm_srli_epi16( _mm_add_epi16( sum_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), 4); flat2_q[3] = _mm_srli_epi16( _mm_add_epi16( sum_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), 4); sum_p6 = _mm_add_epi16(sum_p6, p[6]); sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p = _mm_sub_epi16(sum_p, q[2]); sum_q = _mm_sub_epi16(sum_q, p[2]); flat2_p[4] = _mm_srli_epi16( _mm_add_epi16( sum_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), 4); flat2_q[4] = _mm_srli_epi16( _mm_add_epi16( sum_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), 4); sum_p6 = _mm_add_epi16(sum_p6, p[6]); sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p = _mm_sub_epi16(sum_p, q[1]); sum_q = _mm_sub_epi16(sum_q, p[1]); flat2_p[5] = _mm_srli_epi16( _mm_add_epi16( sum_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), 4); flat2_q[5] = _mm_srli_epi16( _mm_add_epi16( sum_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), 4); } // highbd_filter8 int i; for (i = 0; i < 2; i++) { ps[i] = _mm_andnot_si128(flat, ps[i]); flat_p[i] = _mm_and_si128(flat, flat_p[i]); p[i] = _mm_or_si128(ps[i], flat_p[i]); qs[i] = _mm_andnot_si128(flat, qs[i]); flat_q[i] = _mm_and_si128(flat, flat_q[i]); q[i] = _mm_or_si128(qs[i], flat_q[i]); } 
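// Output selection is a per-lane blend, out = (flat & filter8 result) |
// (~flat & filter4 result); in effect p[i] = flat ? flat_p[i] : ps[i]. The
// flat2 mask then overrides with the wide-filter values in the same way
// further below.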
p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) flat_p[2] = _mm_and_si128(flat, flat_p[2]); // when (flat && mask) p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values q[2] = _mm_andnot_si128(flat, q[2]); flat_q[2] = _mm_and_si128(flat, flat_q[2]); q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values for (i = 0; i < 2; i++) { ps[i] = _mm_andnot_si128(flat, ps[i]); flat_p[i] = _mm_and_si128(flat, flat_p[i]); p[i] = _mm_or_si128(ps[i], flat_p[i]); qs[i] = _mm_andnot_si128(flat, qs[i]); flat_q[i] = _mm_and_si128(flat, flat_q[i]); q[i] = _mm_or_si128(qs[i], flat_q[i]); } // highbd_filter16 if (flat2_mask) { for (i = 0; i < 6; i++) { // p[i] remains unchanged if !(flat2 && flat && mask) p[i] = _mm_andnot_si128(flat2, p[i]); flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); // get values for when (flat2 && flat && mask) p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values q[i] = _mm_andnot_si128(flat2, q[i]); flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); q[i] = _mm_or_si128(q[i], flat2_q[i]); } } } else { p[0] = ps[0]; q[0] = qs[0]; p[1] = ps[1]; q[1] = qs[1]; } } void aom_highbd_lpf_horizontal_14_dual_sse2( uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i p[7], q[7]; int i; load_highbd_pixel(s, 7, pitch, p, q); highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd); for (i = 0; i < 6; i++) { _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]); _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; __m128i mask, hev, flat; __m128i pq[3]; __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; __m128i flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); pq[2] = _mm_unpacklo_epi64(*p2, *q2); const __m128i zero = _mm_setzero_si128(); const __m128i four = _mm_set1_epi16(4); __m128i t80; const __m128i one = _mm_set1_epi16(0x1); get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); // lp filter highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); // flat_mask flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); // 5 tap filter // need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_c; __m128i pq0x2_pq1, pq1_pq2; // op1 pq0x2_pq1 = _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); workp_b = _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 workp_a = 
_mm_add_epi16(workp_a, workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 workp_b = _mm_unpacklo_epi64(workp_a, workp_b); flat_p1p0 = _mm_srli_epi16(workp_b, 3); // oq0 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 workp_b = _mm_srli_si128(pq1_pq2, 8); workp_a = _mm_add_epi16( workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); workp_b = _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 workp_a = _mm_unpacklo_epi64(workp_a, workp_b); flat_q0q1 = _mm_srli_epi16(workp_a, 3); qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0_out = _mm_or_si128(qs1qs0, q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0_out = _mm_or_si128(ps1ps0, p1p0); } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { const __m128i zero = _mm_setzero_si128(); __m128i blimit0, limit0, thresh0; __m128i t80; __m128i mask, flat, work; __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; __m128i op1, op0, oq0, oq1; const __m128i four = _mm_set1_epi16(4); const __m128i one = _mm_set1_epi16(0x1); const __m128i ffff = _mm_cmpeq_epi16(one, one); get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, &blimit0, &limit0, &thresh0, &t80); abs_p2p1 = abs_diff16(*p2, *p1); abs_p1p0 = abs_diff16(*p1, *p0); abs_q1q0 = abs_diff16(*q1, *q0); abs_q2q1 = abs_diff16(*q2, *q1); abs_p0q0 = abs_diff16(*p0, *q0); abs_p1q1 = abs_diff16(*p1, *q1); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); mask = _mm_max_epi16(abs_q2q1, mask); work = _mm_max_epi16(abs_p1p0, abs_q1q0); mask = _mm_max_epi16(work, mask); mask = _mm_max_epi16(mask, abs_p2p1); mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); // lp filter __m128i ps[2], qs[2], p[2], q[2]; { p[0] = *p0; p[1] = *p1; q[0] = *q0; q[1] = *q1; // filter_mask and hev_mask highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); } // flat_mask flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); flat = _mm_max_epi16(flat, work); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask // 5 tap filter // need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_shft0, workp_shft1; // op1 workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); workp_shft0 = _mm_add_epi16( workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 op1 = 
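// Each output of this narrow filter is a small weighted average shifted
// right by 3; op1, for instance, is
//   (p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3,
// and op0/oq0/oq1 below reuse the same accumulator, swapping individual taps
// in and out instead of re-summing from scratch.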
_mm_srli_epi16(workp_shft0, 3); // op0 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 workp_a = _mm_add_epi16(workp_a, workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 op0 = _mm_srli_epi16(workp_a, 3); // oq0 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 workp_b = _mm_add_epi16(*q1, *q2); workp_shft0 = _mm_add_epi16( workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 oq0 = _mm_srli_epi16(workp_shft0, 3); // oq1 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); workp_shft1 = _mm_add_epi16( workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 oq1 = _mm_srli_epi16(workp_shft1, 3); qs[0] = _mm_andnot_si128(flat, qs[0]); oq0 = _mm_and_si128(flat, oq0); *q0 = _mm_or_si128(qs[0], oq0); qs[1] = _mm_andnot_si128(flat, qs[1]); oq1 = _mm_and_si128(flat, oq1); *q1 = _mm_or_si128(qs[1], oq1); ps[0] = _mm_andnot_si128(flat, ps[0]); op0 = _mm_and_si128(flat, op0); *p0 = _mm_or_si128(ps[0], op0); ps[1] = _mm_andnot_si128(flat, ps[1]); op1 = _mm_and_si128(flat, op1); *p1 = _mm_or_si128(ps[1], op1); } else { *q0 = qs[0]; *q1 = qs[1]; *p0 = ps[0]; *p1 = ps[1]; } } void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, _blimit, _limit, _thresh, bd); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); } void aom_highbd_lpf_horizontal_6_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i p2, p1, p0, q0, q1, q2; p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); } static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh, int bd) { const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i mask, hev, flat; __m128i pq[4]; __m128i p1p0, q1q0, ps1ps0, qs1qs0; __m128i work_a, opq2, flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); pq[2] = _mm_unpacklo_epi64(*p2, *q2); pq[3] = _mm_unpacklo_epi64(*p3, 
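// Packing each p[i]/q[i] pair into one register (p in the low 64 bits, q in
// the high 64 bits) lets this single-edge path filter both sides of the
// boundary with one 8-lane operation; this is the "merged variables" layout
// that the flat-mask code below relies on.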
*q3); __m128i abs_p1p0; const __m128i four = _mm_set1_epi16(4); __m128i t80; const __m128i one = _mm_set1_epi16(0x1); get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); // lp filter highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); // flat_mask4 flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); workp_c = _mm_add_epi16(workp_a, workp_c); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); workp_shft0 = _mm_add_epi16(workp_a, workp_b); // o*p0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); workp_shft1 = _mm_add_epi16(workp_a, workp_b); flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); // oq0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); workp_shft0 = _mm_add_epi16(workp_a, workp_b); // oq1 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); workp_shft1 = _mm_add_epi16(workp_a, workp_b); flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); workp_a = _mm_add_epi16(workp_a, workp_b); opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0_out = _mm_or_si128(qs1qs0, q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0_out = _mm_or_si128(ps1ps0, p1p0); work_a = _mm_andnot_si128(flat, pq[2]); *p2 = _mm_and_si128(flat, opq2); *p2 = _mm_or_si128(work_a, *p2); *q2 = _mm_srli_si128(*p2, 8); } } static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { __m128i blimit0, limit0, thresh0; __m128i t80; __m128i mask, flat; __m128i work_a, op2, oq2, op1, op0, oq0, oq1; __m128i abs_p1q1, abs_p0q0, work0, work1, work2; const __m128i zero = _mm_setzero_si128(); const __m128i four = _mm_set1_epi16(4); const __m128i one = _mm_set1_epi16(0x1); const __m128i ffff = _mm_cmpeq_epi16(one, one); get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, &blimit0, &limit0, &thresh0, &t80); abs_p0q0 = abs_diff16(*p0, *q0); abs_p1q1 = abs_diff16(*p1, *q1); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = 
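// Vector form of the scalar filter-mask test: the edge is left unfiltered
// when abs(p0 - q0) * 2 + abs(p1 - q1) / 2 exceeds blimit, and the max chain
// that follows additionally requires every neighbouring-pixel difference to
// stay within limit.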
_mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); work1 = _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat work0 = _mm_max_epi16(work0, work1); work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); work2 = _mm_max_epi16(work2, work0); mask = _mm_max_epi16(work2, mask); mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); // lp filter __m128i ps[2], qs[2], p[2], q[2]; { p[0] = *p0; p[1] = *p1; q[0] = *q0; q[1] = *q1; // filter_mask and hev_mask highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); } flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); flat = _mm_max_epi16(work1, flat); work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); flat = _mm_max_epi16(work0, flat); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask // filter8 need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // o*p0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // oq0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // oq1 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); qs[0] = _mm_andnot_si128(flat, qs[0]); oq0 = _mm_and_si128(flat, oq0); *q0 = _mm_or_si128(qs[0], oq0); qs[1] = _mm_andnot_si128(flat, qs[1]); oq1 = _mm_and_si128(flat, oq1); *q1 = _mm_or_si128(qs[1], oq1); ps[0] = _mm_andnot_si128(flat, ps[0]); op0 = _mm_and_si128(flat, op0); *p0 = _mm_or_si128(ps[0], op0); ps[1] = _mm_andnot_si128(flat, ps[1]); op1 = _mm_and_si128(flat, op1); *p1 = _mm_or_si128(ps[1], op1); work_a = _mm_andnot_si128(flat, *q2); *q2 = _mm_and_si128(flat, oq2); *q2 = _mm_or_si128(work_a, *q2); work_a = _mm_andnot_si128(flat, *p2); *p2 = _mm_and_si128(flat, op2); *p2 = _mm_or_si128(work_a, *p2); } else { *q0 = qs[0]; *q1 = qs[1]; *p0 = ps[0]; *p1 = ps[1]; } } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i p2, p1, p0, q0, q1, q2, p3, q3; __m128i q1q0, p1p0; p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); p2 = 
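// The non-dual horizontal filters process 4 pixels per row, so each row is
// fetched with a 64-bit load (four 16-bit samples); the _dual variants
// further down load full 128-bit rows of 8 samples instead.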
_mm_loadl_epi64((__m128i *)(s - 3 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, _blimit, _limit, _thresh, bd); _mm_storel_epi64((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_highbd_lpf_horizontal_8_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i p2, p1, p0, q0, q1, q2, p3, q3; p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, _blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; __m128i mask, hev; __m128i p1p0, q1q0; __m128i pq[2]; __m128i abs_p1p0; __m128i t80; get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); } static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i blimit0, limit0, thresh0; __m128i mask, flat; __m128i p[2], q[2]; const __m128i zero = _mm_setzero_si128(); __m128i abs_p0q0 = abs_diff16(*q0, *p0); __m128i abs_p1q1 = abs_diff16(*q1, *p1); __m128i abs_p1p0 = abs_diff16(*p1, *p0); __m128i abs_q1q0 = abs_diff16(*q1, *q0); const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); const __m128i one = _mm_set1_epi16(1); __m128i t80; get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, &blimit0, &limit0, &thresh0, &t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - 
*q1) / 2 > blimit) * -1; // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); mask = _mm_max_epi16(flat, mask); mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); p[0] = *p0; p[1] = *p1; q[0] = *q0; q[1] = *q1; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); } void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i p1p0, q1q0; __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, _thresh, bd); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); } void aom_highbd_lpf_horizontal_4_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); __m128i ps[2], qs[2]; highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd); _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); } void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { __m128i x0, x1, x2, x3, d0, d1, d2, d3; __m128i p1p0, q1q0; __m128i p1, q1; x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, thresh, bd); p1 = _mm_srli_si128(p1p0, 8); q1 = _mm_srli_si128(q1q0, 8); // transpose from 8x4 to 4x8 highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); } void aom_highbd_lpf_vertical_4_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i ps[2], qs[2]; x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, 
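// Vertical edges are handled by transposing the block of 16-bit pixels so
// the edge becomes horizontal, running the same row-oriented filter, and
// transposing the result back before the stores.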
&d0, &d1, &d2, &d3); highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); } void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i x3, x2, x1, x0, p0, q0; __m128i p1p0, q1q0; x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, limit, thresh, bd); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); } void aom_highbd_lpf_vertical_6_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i p0, q0, p1, q1, p2, q2; x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, &p0, &q0, &q1, &q2, &d6, &d7); highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd); highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); } void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i p2, p1, p0, p3, q0; __m128i q1q0, p1p0; p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, 
&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); // Loop filtering highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, blimit, limit, thresh, bd); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, &d2, &d3); _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); } void aom_highbd_lpf_vertical_8_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d0, d1, d2, d3, d4, d5, d6, d7; x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7); _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); } void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { __m128i q[7], p[7], pq[7]; __m128i p6, p5, p4, p3; __m128i p6_2, p5_2, p4_2, p3_2; __m128i d0, d1, d2, d3; __m128i d0_2, d1_2, d2_2, d3_2, d7_2; p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7_2); highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], &pq[1], &pq[0], &d0, &d1, &d2, &d3); q[0] = _mm_srli_si128(pq[0], 8); q[1] = _mm_srli_si128(pq[1], 8); q[2] = _mm_srli_si128(pq[2], 8); q[3] = _mm_srli_si128(pq[3], 8); q[4] = _mm_srli_si128(pq[4], 8); q[5] = _mm_srli_si128(pq[5], 8); highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); 
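// The 14-tap vertical filter touches pixels on both sides of the column
// edge, loaded as two 8-wide halves (s - 8 and s), so the filtered halves
// are transposed back separately and the stores interleave the left and
// right destinations row by row.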
_mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); } void aom_highbd_lpf_vertical_14_dual_sse2( uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { __m128i q[7], p[7]; __m128i p6, p5, p4, p3, p2, p1, p0, q0; __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; __m128i d0, d7; __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7); highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, &d6_out, &d7_out); _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, &d6_out, &d7_out); _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); } aom-3.12.1/aom_dsp/x86/highbd_quantize_intrin_avx2.c000066400000000000000000000254251477627663500223140ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); const __m128i dc = _mm_unpacklo_epi16(*p, sign); const __m128i ac = _mm_unpackhi_epi16(*p, sign); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); } static inline void update_qp(__m256i *qp) { int i; for (i = 0; i < 5; ++i) { qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); } } static inline void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, const int16_t *quant_shift_ptr, __m256i *qp, int log_scale) { const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); init_one_qp(&dequant, &qp[3]); init_one_qp(&quant_shift, &qp[4]); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); qp[0] = _mm256_add_epi32(qp[0], rnd); qp[0] = _mm256_srai_epi32(qp[0], log_scale); qp[1] = _mm256_add_epi32(qp[1], rnd); qp[1] = _mm256_srai_epi32(qp[1], log_scale); } // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when // calculating the zbin mask. qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); } // Note: // *x is vector multiplied by *y which is 16 int32_t parallel multiplication // and right shift 16. The output, 16 int32_t is save in *p. static inline __m256i mm256_mul_shift_epi32(const __m256i *x, const __m256i *y) { __m256i prod_lo = _mm256_mul_epi32(*x, *y); __m256i prod_hi = _mm256_srli_epi64(*x, 32); const __m256i mult_hi = _mm256_srli_epi64(*y, 32); prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); prod_lo = _mm256_srli_epi64(prod_lo, 16); const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); prod_lo = _mm256_and_si256(prod_lo, mask); prod_hi = _mm256_srli_epi64(prod_hi, 16); prod_hi = _mm256_slli_epi64(prod_hi, 32); return _mm256_or_si256(prod_lo, prod_hi); } static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); const __m256i packed_nz_mask_perm = _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } // Get the max eob from the lower 128 bits. 
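// After all groups are processed, get_max_lane_eob() has left in each 16-bit
// lane the value iscan[k] + 1 for the last nonzero coefficient seen in that
// lane (0 if none), so a horizontal max over the lanes yields the block's
// end-of-block count directly.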
static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { __m256i eob_s; eob_s = _mm256_shuffle_epi32(eob, 0xe); eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 0xe); eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); return (uint16_t)_mm256_extract_epi16(eob, 0); } static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, const __m256i *y, int log_scale) { __m256i prod_lo = _mm256_mul_epi32(*x, *y); __m256i prod_hi = _mm256_srli_epi64(*x, 32); const __m256i mult_hi = _mm256_srli_epi64(*y, 32); prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); prod_lo = _mm256_and_si256(prod_lo, mask); prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); prod_hi = _mm256_slli_epi64(prod_hi, 32); return _mm256_or_si256(prod_lo, prod_hi); } static AOM_FORCE_INLINE void quantize_logscale( const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) { const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi32(coeff); const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { const __m256i zero = _mm256_setzero_si256(); _mm256_storeu_si256((__m256i *)qcoeff, zero); _mm256_storeu_si256((__m256i *)dqcoeff, zero); return; } const __m256i tmp_rnd = _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> // (16 - log_scale + AOM_QM_BITS)); const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale); const __m256i abs_dq = _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale); const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); const __m256i q = _mm256_sign_epi32(abs_q, coeff); const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); } static AOM_FORCE_INLINE void quantize(const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi32(coeff); const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { const __m256i zero = _mm256_setzero_si256(); _mm256_storeu_si256((__m256i *)qcoeff, zero); _mm256_storeu_si256((__m256i *)dqcoeff, zero); return; } const __m256i tmp_rnd = _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); const __m256i q = _mm256_sign_epi32(abs_q, coeff); const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, 
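// quantize() follows the scalar high-bitdepth quantize_b flow: lanes whose
// magnitude is below the zero-bin are forced to zero (the early-out above
// covers the all-zero case), the rest get the rounding term added, are
// scaled by quant and then quant_shift via 32-bit multiply/shift-by-16
// steps, and dqcoeff is simply the quantized level times dequant.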
dq); *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); } void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; update_qp(qp); while (n_coeffs > 0) { quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; } *eob_ptr = get_max_eob(eob); } void aom_highbd_quantize_b_32x32_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; const unsigned int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; update_qp(qp); while (n_coeffs > 0) { quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; } *eob_ptr = get_max_eob(eob); } void aom_highbd_quantize_b_64x64_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2); quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; update_qp(qp); while (n_coeffs > 0) { quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; } *eob_ptr = get_max_eob(eob); } aom-3.12.1/aom_dsp/x86/highbd_quantize_intrin_sse2.c000066400000000000000000000177531477627663500223150ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "config/aom_dsp_rtcd.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, j, non_zero_regs = (int)count / 4, eob_i = -1; __m128i zbins[2]; __m128i nzbins[2]; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); nzbins[0] = _mm_setzero_si128(); nzbins[1] = _mm_setzero_si128(); nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); // Pre-scan pass for (i = ((int)count / 4) - 1; i >= 0; i--) { __m128i coeffs, cmp1, cmp2; int test; coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); cmp1 = _mm_and_si128(cmp1, cmp2); test = _mm_movemask_epi8(cmp1); if (test == 0xffff) non_zero_regs--; else break; } // Quantization pass: for (i = 0; i < non_zero_regs; i++) { __m128i coeffs, coeffs_sign, tmp1, tmp2; int test; int abs_coeff[4]; int coeff_sign[4]; coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); coeffs_sign = _mm_srai_epi32(coeffs, 31); coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); tmp1 = _mm_or_si128(tmp1, tmp2); test = _mm_movemask_epi8(tmp1); _mm_storeu_si128((__m128i *)abs_coeff, coeffs); _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); for (j = 0; j < 4; j++) { if (test & (1 << (4 * j))) { int k = 4 * i + j; const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); qcoeff_ptr[k] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
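// This SSE2 version is two-pass: the vectorized pre-scan walks coefficient
// groups from the end and drops trailing groups that lie entirely inside the
// zero-bin, then this scalar loop quantizes only the surviving groups,
// tracking eob as the largest iscan index with a nonzero quantized level.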
iscan[k] : eob_i; } } } *eob_ptr = eob_i + 1; } void aom_highbd_quantize_b_32x32_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = -1; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); nzbins[0] = _mm_setzero_si128(); nzbins[1] = _mm_setzero_si128(); nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); // Pre-scan pass for (i = 0; i < n_coeffs / 4; i++) { __m128i coeffs, cmp1, cmp2; int test; coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); cmp1 = _mm_and_si128(cmp1, cmp2); test = _mm_movemask_epi8(cmp1); if (!(test & 0xf)) idx_arr[idx++] = i * 4; if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. for (i = 0; i < idx; i++) { const int rc = idx_arr[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
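// The 32x32 and 64x64 variants fold the transform's log_scale into the
// constants: zbin and round are pre-rounded with ROUND_POWER_OF_TWO, the
// quant_shift product is shifted by 15 (14 for 64x64) instead of 16, and
// the dequantized value is divided back by 2 (4 for 64x64).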
iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } void aom_highbd_quantize_b_64x64_sse2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = -1; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); (void)scan; zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); nzbins[0] = _mm_setzero_si128(); nzbins[1] = _mm_setzero_si128(); nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); // Pre-scan pass for (i = 0; i < n_coeffs / 4; i++) { __m128i coeffs, cmp1, cmp2; int test; coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); cmp1 = _mm_and_si128(cmp1, cmp2); test = _mm_movemask_epi8(cmp1); if (!(test & 0xf)) idx_arr[idx++] = i * 4; if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. for (i = 0; i < idx; i++) { const int rc = idx_arr[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } aom-3.12.1/aom_dsp/x86/highbd_sad4d_sse2.asm000066400000000000000000000243201477627663500204130ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
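; The x4d kernels in this file compute, for a single source block, the SADs
; against four candidate reference blocks at once and write the four 32-bit
; sums to the res array, so the motion search can evaluate several candidates
; per call.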
; ; %include "third_party/x86inc/x86inc.asm" SECTION .text ; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end %macro HIGH_PROCESS_4x2x4 5-6 0 movh m0, [srcq +%2*2] %if %1 == 1 movu m4, [ref1q+%3*2] movu m5, [ref2q+%3*2] movu m6, [ref3q+%3*2] movu m7, [ref4q+%3*2] movhps m0, [srcq +%4*2] movhps m4, [ref1q+%5*2] movhps m5, [ref2q+%5*2] movhps m6, [ref3q+%5*2] movhps m7, [ref4q+%5*2] mova m3, m0 mova m2, m0 psubusw m3, m4 psubusw m2, m5 psubusw m4, m0 psubusw m5, m0 por m4, m3 por m5, m2 pmaddwd m4, m1 pmaddwd m5, m1 mova m3, m0 mova m2, m0 psubusw m3, m6 psubusw m2, m7 psubusw m6, m0 psubusw m7, m0 por m6, m3 por m7, m2 pmaddwd m6, m1 pmaddwd m7, m1 %else movu m2, [ref1q+%3*2] movhps m0, [srcq +%4*2] movhps m2, [ref1q+%5*2] mova m3, m0 psubusw m3, m2 psubusw m2, m0 por m2, m3 pmaddwd m2, m1 paddd m4, m2 movu m2, [ref2q+%3*2] mova m3, m0 movhps m2, [ref2q+%5*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 pmaddwd m2, m1 paddd m5, m2 movu m2, [ref3q+%3*2] mova m3, m0 movhps m2, [ref3q+%5*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 pmaddwd m2, m1 paddd m6, m2 movu m2, [ref4q+%3*2] mova m3, m0 movhps m2, [ref4q+%5*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 pmaddwd m2, m1 paddd m7, m2 %endif %if %6 == 1 lea srcq, [srcq +src_strideq*4] lea ref1q, [ref1q+ref_strideq*4] lea ref2q, [ref2q+ref_strideq*4] lea ref3q, [ref3q+ref_strideq*4] lea ref4q, [ref4q+ref_strideq*4] %endif %endmacro ; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end %macro HIGH_PROCESS_8x2x4 5-6 0 ; 1st 8 px mova m0, [srcq +%2*2] %if %1 == 1 movu m4, [ref1q+%3*2] movu m5, [ref2q+%3*2] movu m6, [ref3q+%3*2] movu m7, [ref4q+%3*2] mova m3, m0 mova m2, m0 psubusw m3, m4 psubusw m2, m5 psubusw m4, m0 psubusw m5, m0 por m4, m3 por m5, m2 pmaddwd m4, m1 pmaddwd m5, m1 mova m3, m0 mova m2, m0 psubusw m3, m6 psubusw m2, m7 psubusw m6, m0 psubusw m7, m0 por m6, m3 por m7, m2 pmaddwd m6, m1 pmaddwd m7, m1 %else mova m3, m0 movu m2, [ref1q+%3*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m4, m2 movu m2, [ref2q+%3*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m5, m2 movu m2, [ref3q+%3*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m6, m2 movu m2, [ref4q+%3*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 pmaddwd m2, m1 paddd m7, m2 %endif ; 2nd 8 px mova m0, [srcq +(%4)*2] mova m3, m0 movu m2, [ref1q+(%5)*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m4, m2 movu m2, [ref2q+(%5)*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m5, m2 movu m2, [ref3q+(%5)*2] psubusw m3, m2 psubusw m2, m0 por m2, m3 mova m3, m0 pmaddwd m2, m1 paddd m6, m2 movu m2, [ref4q+(%5)*2] psubusw m3, m2 psubusw m2, m0 %if %6 == 1 lea srcq, [srcq +src_strideq*4] lea ref1q, [ref1q+ref_strideq*4] lea ref2q, [ref2q+ref_strideq*4] lea ref3q, [ref3q+ref_strideq*4] lea ref4q, [ref4q+ref_strideq*4] %endif por m2, m3 pmaddwd m2, m1 paddd m7, m2 %endmacro ; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end %macro HIGH_PROCESS_16x2x4 5-6 0 HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 %endmacro ; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end %macro HIGH_PROCESS_32x2x4 5-6 0 HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 %endmacro ; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end %macro HIGH_PROCESS_64x2x4 5-6 0 
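; Wider widths are built by composing the narrower macros (64 = 2 x 32,
; 32 = 2 x 16, 16 = 2 x 8).  The innermost 8x2 kernel forms |src - ref| for
; unsigned words with the psubusw/por pair and accumulates adjacent lanes
; into dwords via pmaddwd against the 0x0001 words preloaded into m1.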
HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 %endmacro ; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; Macro Arguments: ; 1: Width ; 2: Height ; 3: If 0, then normal sad, if 2, then skip every other row %macro HIGH_SADNXN4D 2-3 0 %if %3 == 0 ; normal sad %if AOM_ARCH_X86_64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 %else cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif ; AOM_ARCH_X86_64 %else ; %3 == 2, downsample %if AOM_ARCH_X86_64 cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 %else cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif ; AOM_ARCH_X86_64 %endif ; sad/avg/skip ; set m1 push srcq mov srcd, 0x00010001 movd m1, srcd pshufd m1, m1, 0x0 pop srcq %if %3 == 2 ; skip rows lea src_strided, [2*src_strided] lea ref_strided, [2*ref_strided] %endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] mov ref3q, [ref1q+gprsize*2] mov ref4q, [ref1q+gprsize*3] mov ref1q, [ref1q+gprsize*0] ; convert byte pointers to short pointers shl srcq, 1 shl ref2q, 1 shl ref3q, 1 shl ref4q, 1 shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 %if %3 == 2 ; Downsampling by two %define num_rep (%2-8)/4 %else %define num_rep (%2-4)/2 %endif %rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep %undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM movhlps m0, m4 movhlps m1, m5 movhlps m2, m6 movhlps m3, m7 paddd m4, m0 paddd m5, m1 paddd m6, m2 paddd m7, m3 punpckldq m4, m5 punpckldq m6, m7 movhlps m0, m4 movhlps m1, m6 paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 %if %3 == 2 ; skip rows pslld m4, 1 %endif movifnidn r4, r4mp movu [r4], m4 RET %endmacro INIT_XMM sse2 HIGH_SADNXN4D 64, 64 HIGH_SADNXN4D 64, 32 HIGH_SADNXN4D 32, 64 HIGH_SADNXN4D 32, 32 HIGH_SADNXN4D 32, 16 HIGH_SADNXN4D 16, 32 HIGH_SADNXN4D 16, 16 HIGH_SADNXN4D 16, 8 HIGH_SADNXN4D 8, 16 HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 HIGH_SADNXN4D 16, 64 HIGH_SADNXN4D 64, 16 HIGH_SADNXN4D 64, 64, 2 HIGH_SADNXN4D 64, 32, 2 HIGH_SADNXN4D 32, 64, 2 HIGH_SADNXN4D 32, 32, 2 HIGH_SADNXN4D 32, 16, 2 HIGH_SADNXN4D 16, 32, 2 HIGH_SADNXN4D 16, 16, 2 HIGH_SADNXN4D 8, 16, 2 HIGH_SADNXN4D 4, 16, 2 HIGH_SADNXN4D 8, 32, 2 HIGH_SADNXN4D 16, 64, 2 HIGH_SADNXN4D 64, 16, 2 aom-3.12.1/aom_dsp/x86/highbd_sad_avx2.c000066400000000000000000000650411477627663500176360ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" // SAD static inline unsigned int get_sad_from_mm256_epi32(const __m256i *v) { // input 8 32-bit summation __m128i lo128, hi128; __m256i u = _mm256_srli_si256(*v, 8); u = _mm256_add_epi32(u, *v); // 4 32-bit summation hi128 = _mm256_extracti128_si256(u, 1); lo128 = _mm256_castsi256_si128(u); lo128 = _mm_add_epi32(hi128, lo128); // 2 32-bit summation hi128 = _mm_srli_si128(lo128, 4); lo128 = _mm_add_epi32(lo128, hi128); return (unsigned int)_mm_cvtsi128_si32(lo128); } static inline void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, __m256i *sad_acc) { const __m256i zero = _mm256_setzero_si256(); int i; for (i = 0; i < 4; i++) { s[i] = _mm256_sub_epi16(s[i], r[i]); s[i] = _mm256_abs_epi16(s[i]); } s[0] = _mm256_add_epi16(s[0], s[1]); s[0] = _mm256_add_epi16(s[0], s[2]); s[0] = _mm256_add_epi16(s[0], s[3]); r[0] = _mm256_unpacklo_epi16(s[0], zero); r[1] = _mm256_unpackhi_epi16(s[0], zero); r[0] = _mm256_add_epi32(r[0], r[1]); *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. static inline void sad16x4(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[4], r[4]; s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); if (sec_ptr) { r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); r[1] = _mm256_avg_epu16( r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); r[2] = _mm256_avg_epu16( r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); r[3] = _mm256_avg_epu16( r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); } highbd_sad16x4_core_avx2(s, r, sad_acc); } static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); int i; __m256i sad = _mm256_setzero_si256(); for (i = 0; i < N; i += 4) { sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad); src_ptr += src_stride << 2; ref_ptr += ref_stride << 2; } return (unsigned int)get_sad_from_mm256_epi32(&sad); } static void sad32x4(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[4], r[4]; int row_sections = 0; while (row_sections < 2) { s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); if 
(sec_ptr) { r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); r[1] = _mm256_avg_epu16( r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); r[2] = _mm256_avg_epu16( r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); r[3] = _mm256_avg_epu16( r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); sec_ptr += 32 << 1; } highbd_sad16x4_core_avx2(s, r, sad_acc); row_sections += 1; src_ptr += src_stride << 1; ref_ptr += ref_stride << 1; } } static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 2; int i; for (i = 0; i < N; i += 4) { sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; } return get_sad_from_mm256_epi32(&sad); } static void sad64x2(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[4], r[4]; int i; for (i = 0; i < 2; i++) { s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); if (sec_ptr) { r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); r[1] = _mm256_avg_epu16( r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); r[2] = _mm256_avg_epu16( r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); r[3] = _mm256_avg_epu16( r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); sec_ptr += 64; } highbd_sad16x4_core_avx2(s, r, sad_acc); src_ptr += src_stride; ref_ptr += ref_stride; } } static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 1; int i; for (i = 0; i < N; i += 2) { sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; } return get_sad_from_mm256_epi32(&sad); } static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[4], r[4]; int i; for (i = 0; i < 2; i++) { s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); if (sec_ptr) { r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); r[1] = _mm256_avg_epu16( r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); r[2] = _mm256_avg_epu16( r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); r[3] = _mm256_avg_epu16( r[3], _mm256_loadu_si256((const 
__m256i *)(sec_ptr + 48))); sec_ptr += 64; } highbd_sad16x4_core_avx2(s, r, sad_acc); src_ptr += 64; ref_ptr += 64; } } static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( int N, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); int row = 0; while (row < N) { sad128x1(srcp, refp, NULL, &sad); srcp += src_stride; refp += ref_stride; row++; } return get_sad_from_mm256_epi32(&sad); } #define HIGHBD_SADMXN_AVX2(m, n) \ unsigned int aom_highbd_sad##m##x##n##_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ } #define HIGHBD_SAD_SKIP_MXN_AVX2(m, n) \ unsigned int aom_highbd_sad_skip_##m##x##n##_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, \ int ref_stride) { \ return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \ 2 * ref_stride); \ } HIGHBD_SADMXN_AVX2(16, 8) HIGHBD_SADMXN_AVX2(16, 16) HIGHBD_SADMXN_AVX2(16, 32) HIGHBD_SADMXN_AVX2(32, 16) HIGHBD_SADMXN_AVX2(32, 32) HIGHBD_SADMXN_AVX2(32, 64) HIGHBD_SADMXN_AVX2(64, 32) HIGHBD_SADMXN_AVX2(64, 64) HIGHBD_SADMXN_AVX2(64, 128) HIGHBD_SADMXN_AVX2(128, 64) HIGHBD_SADMXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_SADMXN_AVX2(16, 4) HIGHBD_SADMXN_AVX2(16, 64) HIGHBD_SADMXN_AVX2(32, 8) HIGHBD_SADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY HIGHBD_SAD_SKIP_MXN_AVX2(16, 16) HIGHBD_SAD_SKIP_MXN_AVX2(16, 32) HIGHBD_SAD_SKIP_MXN_AVX2(32, 16) HIGHBD_SAD_SKIP_MXN_AVX2(32, 32) HIGHBD_SAD_SKIP_MXN_AVX2(32, 64) HIGHBD_SAD_SKIP_MXN_AVX2(64, 32) HIGHBD_SAD_SKIP_MXN_AVX2(64, 64) HIGHBD_SAD_SKIP_MXN_AVX2(64, 128) HIGHBD_SAD_SKIP_MXN_AVX2(128, 64) HIGHBD_SAD_SKIP_MXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_SAD_SKIP_MXN_AVX2(16, 64) HIGHBD_SAD_SKIP_MXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); // Next 4 rows srcp += src_stride << 2; refp += ref_stride << 2; secp += 64; sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 3; uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 16 << left_shift; sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 4; uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 16 << left_shift; sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } #if !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, const uint8_t 
*ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 5; uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 16 << left_shift; sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); const int left_shift = 2; int row_section = 0; while (row_section < 2) { sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; secp += 32 << left_shift; row_section += 1; } return get_sad_from_mm256_epi32(&sad); } #endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); const int left_shift = 2; int row_section = 0; while (row_section < 4) { sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; secp += 32 << left_shift; row_section += 1; } return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 4; uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 32 << left_shift; sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 5; uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 32 << left_shift; sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } #if !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); const int left_shift = 1; int row_section = 0; while (row_section < 8) { sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; secp += 64 << left_shift; row_section += 1; } return get_sad_from_mm256_epi32(&sad); } #endif // !CONFIG_REALTIME_ONLY unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); const int left_shift = 1; int row_section = 0; 
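// 16 iterations of sad64x2() cover all 32 rows; each iteration advances src/ref by 2 rows (left_shift = 1) and the 64-wide second predictor by 2 * 64 = 128 samples.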
while (row_section < 16) { sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; secp += 64 << left_shift; row_section += 1; } return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 5; uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 64 << left_shift; sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { const int left_shift = 6; uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 64 << left_shift; sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); int row = 0; while (row < 64) { sad128x1(srcp, refp, secp, &sad); srcp += src_stride; refp += ref_stride; secp += 16 << 3; row += 1; } return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { unsigned int sum; const int left_shift = 6; sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, second_pred); src += src_stride << left_shift; ref += ref_stride << left_shift; second_pred += 128 << left_shift; sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, second_pred); return sum; } // SAD 4D // Combine 4 __m256i input vectors v to uint32_t result[4] static inline void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; const __m256i mask = _mm256_set1_epi64x(~0u); __m128i sad; // 8 32-bit summation u0 = _mm256_srli_si256(v[0], 4); u1 = _mm256_srli_si256(v[1], 4); u2 = _mm256_srli_si256(v[2], 4); u3 = _mm256_srli_si256(v[3], 4); u0 = _mm256_add_epi32(u0, v[0]); u1 = _mm256_add_epi32(u1, v[1]); u2 = _mm256_add_epi32(u2, v[2]); u3 = _mm256_add_epi32(u3, v[3]); u0 = _mm256_and_si256(u0, mask); u1 = _mm256_and_si256(u1, mask); u2 = _mm256_and_si256(u2, mask); u3 = _mm256_and_si256(u3, mask); // 4 32-bit summation, evenly positioned u1 = _mm256_slli_si256(u1, 4); u3 = _mm256_slli_si256(u3, 4); u0 = _mm256_or_si256(u0, u1); u2 = _mm256_or_si256(u2, u3); // 8 32-bit summation, interleaved u1 = _mm256_unpacklo_epi64(u0, u2); u3 = _mm256_unpackhi_epi64(u0, u2); u0 = _mm256_add_epi32(u1, u3); sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), _mm256_castsi256_si128(u0)); _mm_storeu_si128((__m128i *)res, sad); } static void convert_pointers(const uint8_t *const ref8[], const uint16_t *ref[]) { ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); } static void init_sad(__m256i *s) { s[0] = _mm256_setzero_si256(); s[1] = _mm256_setzero_si256(); s[2] = 
_mm256_setzero_si256(); s[3] = _mm256_setzero_si256(); } static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2( int M, int N, int D, const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_rows = (M < 128) + (M < 64); const int row_units = 1 << shift_for_rows; int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < D; ++i) { srcp = keep; for (r = 0; r < N; r += row_units) { if (M == 128) { sad128x1(srcp, refp[i], NULL, &sad_vec[i]); } else if (M == 64) { sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); } else if (M == 32) { sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); } else if (M == 16) { sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); } else { assert(0); } srcp += src_stride << shift_for_rows; refp[i] += ref_stride << shift_for_rows; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } #define HIGHBD_SAD_MXNX4D_AVX2(m, n) \ void aom_highbd_sad##m##x##n##x4d_avx2( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \ sad_array); \ } #define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \ void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \ 2 * ref_stride, sad_array); \ sad_array[0] <<= 1; \ sad_array[1] <<= 1; \ sad_array[2] <<= 1; \ sad_array[3] <<= 1; \ } #define HIGHBD_SAD_MXNX3D_AVX2(m, n) \ void aom_highbd_sad##m##x##n##x3d_avx2( \ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \ sad_array); \ } HIGHBD_SAD_MXNX4D_AVX2(16, 8) HIGHBD_SAD_MXNX4D_AVX2(16, 16) HIGHBD_SAD_MXNX4D_AVX2(16, 32) HIGHBD_SAD_MXNX4D_AVX2(32, 16) HIGHBD_SAD_MXNX4D_AVX2(32, 32) HIGHBD_SAD_MXNX4D_AVX2(32, 64) HIGHBD_SAD_MXNX4D_AVX2(64, 32) HIGHBD_SAD_MXNX4D_AVX2(64, 64) HIGHBD_SAD_MXNX4D_AVX2(64, 128) HIGHBD_SAD_MXNX4D_AVX2(128, 64) HIGHBD_SAD_MXNX4D_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_SAD_MXNX4D_AVX2(16, 4) HIGHBD_SAD_MXNX4D_AVX2(16, 64) HIGHBD_SAD_MXNX4D_AVX2(32, 8) HIGHBD_SAD_MXNX4D_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16) HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32) HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128) HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64) HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY HIGHBD_SAD_MXNX3D_AVX2(16, 8) HIGHBD_SAD_MXNX3D_AVX2(16, 16) HIGHBD_SAD_MXNX3D_AVX2(16, 32) HIGHBD_SAD_MXNX3D_AVX2(32, 16) HIGHBD_SAD_MXNX3D_AVX2(32, 32) HIGHBD_SAD_MXNX3D_AVX2(32, 64) HIGHBD_SAD_MXNX3D_AVX2(64, 32) HIGHBD_SAD_MXNX3D_AVX2(64, 64) HIGHBD_SAD_MXNX3D_AVX2(64, 128) HIGHBD_SAD_MXNX3D_AVX2(128, 64) HIGHBD_SAD_MXNX3D_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_SAD_MXNX3D_AVX2(16, 4) HIGHBD_SAD_MXNX3D_AVX2(16, 64) HIGHBD_SAD_MXNX3D_AVX2(32, 8) 
HIGHBD_SAD_MXNX3D_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/x86/highbd_sad_sse2.asm000066400000000000000000000411231477627663500201630ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION .text ; Macro Arguments ; Arg 1: Width ; Arg 2: Height ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows ; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7 %macro HIGH_SAD_FN 4-5 7 %if %4 == 0 %if %3 == 5 cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if AOM_ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %else ; %4 == 2, skip rows %if %3 == 5 cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %endif ; sad/avg/skip %if %4 == 2 ; double the stride if we are skipping rows lea src_strided, [src_strided*2] lea ref_strided, [ref_strided*2] %endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 ; convert src, ref & second_pred to short ptrs (from byte ptrs) shl srcq, 1 shl refq, 1 %if %4 == 1 shl second_predq, 1 %endif %endmacro ; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/2 %else mov n_rowsd, %1 %endif pxor m0, m0 pxor m6, m6 .loop: ; first half of each row movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgw m1, [second_predq+mmsize*0] pavgw m2, [second_predq+mmsize*1] pavgw m3, [second_predq+mmsize*2] pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif mova m5, [srcq] psubusw m5, m1 psubusw m1, [srcq] por m1, m5 mova m5, [srcq+16] psubusw m5, m2 psubusw m2, [srcq+16] por m2, m5 mova m5, [srcq+32] psubusw m5, m3 psubusw m3, [srcq+32] por m3, m5 mova m5, [srcq+48] psubusw m5, m4 psubusw m4, [srcq+48] por m4, m5 paddw m1, m2 paddw m3, m4 movhlps m2, m1 movhlps m4, m3 paddw m1, m2 paddw m3, m4 punpcklwd m1, m6 punpcklwd m3, m6 paddd m0, m1 paddd m0, m3 ; second half of each row movu m1, [refq+64] movu m2, [refq+80] movu m3, [refq+96] movu m4, [refq+112] %if %2 == 1 pavgw m1, 
[second_predq+mmsize*0] pavgw m2, [second_predq+mmsize*1] pavgw m3, [second_predq+mmsize*2] pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif mova m5, [srcq+64] psubusw m5, m1 psubusw m1, [srcq+64] por m1, m5 mova m5, [srcq+80] psubusw m5, m2 psubusw m2, [srcq+80] por m2, m5 mova m5, [srcq+96] psubusw m5, m3 psubusw m3, [srcq+96] por m3, m5 mova m5, [srcq+112] psubusw m5, m4 psubusw m4, [srcq+112] por m4, m5 paddw m1, m2 paddw m3, m4 movhlps m2, m1 movhlps m4, m3 paddw m1, m2 paddw m3, m4 punpcklwd m1, m6 punpcklwd m3, m6 lea refq, [refq+ref_strideq*2] paddd m0, m1 lea srcq, [srcq+src_strideq*2] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 %if CONFIG_REALTIME_ONLY==0 HIGH_SAD64XN 16 ; highbd_sad64x16_sse2 HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2 HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2 %endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/2 %else mov n_rowsd, %1 %endif pxor m0, m0 pxor m6, m6 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgw m1, [second_predq+mmsize*0] pavgw m2, [second_predq+mmsize*1] pavgw m3, [second_predq+mmsize*2] pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif mova m5, [srcq] psubusw m5, m1 psubusw m1, [srcq] por m1, m5 mova m5, [srcq+16] psubusw m5, m2 psubusw m2, [srcq+16] por m2, m5 mova m5, [srcq+32] psubusw m5, m3 psubusw m3, [srcq+32] por m3, m5 mova m5, [srcq+48] psubusw m5, m4 psubusw m4, [srcq+48] por m4, m5 paddw m1, m2 paddw m3, m4 movhlps m2, m1 movhlps m4, m3 paddw m1, m2 paddw m3, m4 punpcklwd m1, m6 punpcklwd m3, m6 lea refq, [refq+ref_strideq*2] paddd m0, m1 lea srcq, [srcq+src_strideq*2] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 %if CONFIG_REALTIME_ONLY==0 HIGH_SAD32XN 8 ; highbd_sad32x8_sse2 HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2 %endif ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/4 %else mov n_rowsd, %1/2 %endif pxor m0, m0 pxor m6, m6 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_strideq*2+16] %if %2 == 1 pavgw m1, [second_predq+mmsize*0] pavgw m2, [second_predq+16] pavgw m3, 
[second_predq+mmsize*2] pavgw m4, [second_predq+mmsize*2+16] lea second_predq, [second_predq+mmsize*4] %endif mova m5, [srcq] psubusw m5, m1 psubusw m1, [srcq] por m1, m5 mova m5, [srcq+16] psubusw m5, m2 psubusw m2, [srcq+16] por m2, m5 mova m5, [srcq+src_strideq*2] psubusw m5, m3 psubusw m3, [srcq+src_strideq*2] por m3, m5 mova m5, [srcq+src_strideq*2+16] psubusw m5, m4 psubusw m4, [srcq+src_strideq*2+16] por m4, m5 paddw m1, m2 paddw m3, m4 movhlps m2, m1 movhlps m4, m3 paddw m1, m2 paddw m3, m4 punpcklwd m1, m6 punpcklwd m3, m6 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 %if CONFIG_REALTIME_ONLY==0 HIGH_SAD16XN 64 ; highbd_sad16x64_sse2 HIGH_SAD16XN 4 ; highbd_sad16x4_sse2 HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2 HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2 %endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2, 8 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/8 %else mov n_rowsd, %1/4 %endif pxor m0, m0 pxor m6, m6 .loop: movu m1, [refq] movu m2, [refq+ref_strideq*2] movu m3, [refq+ref_strideq*4] movu m4, [refq+ref_stride3q*2] %if %2 == 1 pavgw m1, [second_predq+mmsize*0] pavgw m2, [second_predq+mmsize*1] pavgw m3, [second_predq+mmsize*2] pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif mova m7, m1 movu m5, [srcq] psubusw m1, m5 psubusw m5, m7 por m1, m5 mova m7, m2 movu m5, [srcq+src_strideq*2] psubusw m2, m5 psubusw m5, m7 por m2, m5 mova m7, m3 movu m5, [srcq+src_strideq*4] psubusw m3, m5 psubusw m5, m7 por m3, m5 mova m7, m4 movu m5, [srcq+src_stride3q*2] psubusw m4, m5 psubusw m5, m7 por m4, m5 paddw m1, m2 paddw m3, m4 movhlps m2, m1 movhlps m4, m3 paddw m1, m2 paddw m3, m4 punpcklwd m1, m6 punpcklwd m3, m6 lea refq, [refq+ref_strideq*8] paddd m0, m1 lea srcq, [srcq+src_strideq*8] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 %if CONFIG_REALTIME_ONLY==0 HIGH_SAD8XN 32 ; highbd_sad8x32_sse2 HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2 HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2 %endif ; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD4XN 1-2 0 HIGH_SAD_FN 4, %1, 7, %2 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/8 %else mov n_rowsd, %1/4 %endif pxor m0, m0 pxor m6, m6 .loop: movq m1, [refq] movq m2, [refq+ref_strideq*2] movq m3, [refq+ref_strideq*4] movq m4, [refq+ref_stride3q*2] punpcklwd m1, m3 
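; m1 now interleaves rows 0 and 2; m2 below interleaves rows 1 and 3, so each xmm register carries two 4-wide rows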
punpcklwd m2, m4 %if %2 == 1 movq m3, [second_predq+8*0] movq m5, [second_predq+8*2] punpcklwd m3, m5 movq m4, [second_predq+8*1] movq m5, [second_predq+8*3] punpcklwd m4, m5 lea second_predq, [second_predq+8*4] pavgw m1, m3 pavgw m2, m4 %endif movq m5, [srcq] movq m3, [srcq+src_strideq*4] punpcklwd m5, m3 movdqa m3, m1 psubusw m1, m5 psubusw m5, m3 por m1, m5 movq m5, [srcq+src_strideq*2] movq m4, [srcq+src_stride3q*2] punpcklwd m5, m4 movdqa m4, m2 psubusw m2, m5 psubusw m5, m4 por m2, m5 paddw m1, m2 movdqa m2, m1 punpcklwd m1, m6 punpckhwd m2, m6 lea refq, [refq+ref_strideq*8] paddd m0, m1 lea srcq, [srcq+src_strideq*8] paddd m0, m2 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 %if CONFIG_REALTIME_ONLY==0 HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2 %endif aom-3.12.1/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm000066400000000000000000000752221477627663500236060ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_sse2: times 8 dw 16 times 8 dw 0 times 8 dw 14 times 8 dw 2 times 8 dw 12 times 8 dw 4 times 8 dw 10 times 8 dw 6 times 16 dw 8 times 8 dw 6 times 8 dw 10 times 8 dw 4 times 8 dw 12 times 8 dw 2 times 8 dw 14 SECTION .text ; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, ; const uint8_t *dst, ptrdiff_t dst_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse psubw %3, %4 psubw %1, %2 mova %4, %3 ; make copies to manipulate to calc sum mova %2, %1 ; use originals for calc sse pmaddwd %3, %3 paddw %4, %2 pmaddwd %1, %1 movhlps %2, %4 paddd %6, %3 paddw %4, %2 pxor %2, %2 pcmpgtw %2, %4 ; mask for 0 > %4 (sum) punpcklwd %4, %2 ; sign-extend word to dword paddd %6, %1 paddd %5, %4 %endmacro %macro STORE_AND_RET 0 %if mmsize == 16 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. ; We have to sign-extend it before adding the words within the register ; and outputing to a dword. 
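; horizontally reduce the four dword lanes of m7 (sse) and m6 (sum) to scalars before storing/returning them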
movhlps m3, m7 movhlps m4, m6 paddd m7, m3 paddd m6, m4 pshufd m3, m7, 0x1 pshufd m4, m6, 0x1 paddd m7, m3 paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse movd eax, m6 ; store sum as return value %endif RET %endmacro %macro INC_SRC_BY_SRC_STRIDE 0 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 add srcq, src_stridemp add srcq, src_stridemp %else lea srcq, [srcq + src_strideq*2] %endif %endmacro %macro SUBPEL_VARIANCE 1-2 0 ; W %define bilin_filter_m bilin_filter_m_sse2 %define filter_idx_shift 5 %if AOM_ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ sec, sec_stride, height, sse %define sec_str sec_strideq %else cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, height, sse %define block_height heightd %endif ; reuse argument stack space %define g_bilin_filterm x_offsetm %define g_pw_8m y_offsetm ; Store bilin_filter and pw_8 location in stack %if GET_GOT_DEFINED == 1 GET_GOT eax add esp, 4 ; restore esp %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx lea ecx, [GLOBAL(pw_8)] mov g_pw_8m, ecx LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, height, sse %define block_height heightd %endif %define bilin_filter bilin_filter_m %endif %endif ASSERT %1 <= 16 ; m6 overflows if w > 16 pxor m6, m6 ; sum pxor m7, m7 ; sse %if %1 < 16 sar block_height, 1 %endif %if %2 == 1 ; avg shl sec_str, 1 %endif ; FIXME(rbultje) replace by jumptable? 
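; dispatch on x_offset, then y_offset: 0 = no filtering, 8 = half-pel average (pavgw), anything else = bilinear filter taps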
test x_offsetd, x_offsetd jnz .x_nonzero ; x_offset == 0 test y_offsetd, y_offsetd jnz .x_zero_y_nonzero ; x_offset == 0 && y_offset == 0 .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] movu m2, [srcq + 16] mova m1, [dstq] mova m3, [dstq + 16] %if %2 == 1 ; avg pavgw m0, [secq] pavgw m2, [secq+16] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq + src_strideq*2] mova m1, [dstq] mova m3, [dstq + dst_strideq*2] %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m2, [secq] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_zero_y_zero_loop STORE_AND_RET .x_zero_y_nonzero: cmp y_offsetd, 8 jne .x_zero_y_nonhalf ; x_offset == 0 && y_offset == 0.5 .x_zero_y_half_loop: %if %1 == 16 movu m0, [srcq] movu m1, [srcq+16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] mova m2, [dstq] mova m3, [dstq+16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m2, [dstq] mova m3, [dstq+dst_strideq*2] pavgw m0, m1 pavgw m1, m5 %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m1, [secq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_zero_y_half_loop STORE_AND_RET .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0, reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif .x_zero_y_other_loop: %if %1 == 16 movu m0, [srcq] movu m1, [srcq + 16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] mova m2, [dstq] mova m3, [dstq+16] ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be ; slightly faster because of pmullw latency. It would also cut our rodata ; tables in half for this function, and save 1-2 registers on x86-64. 
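; vertical bilinear filter: out = (filter_y_a * row_n + filter_y_b * row_n+1 + 8) >> 4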
pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd pmullw m0, filter_y_a pmullw m4, filter_y_b paddw m0, filter_rnd paddw m1, m5 paddw m0, m4 psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m4, m1 mova m2, [dstq] mova m3, [dstq+dst_strideq*2] pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd pmullw m0, filter_y_a pmullw m4, filter_y_b paddw m0, filter_rnd paddw m1, m5 paddw m0, m4 psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m1, [secq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_zero_y_other_loop %undef filter_y_a %undef filter_y_b %undef filter_rnd STORE_AND_RET .x_nonzero: cmp x_offsetd, 8 jne .x_nonhalf ; x_offset == 0.5 test y_offsetd, y_offsetd jnz .x_half_y_nonzero ; x_offset == 0.5 && y_offset == 0 .x_half_y_zero_loop: %if %1 == 16 movu m0, [srcq] movu m1, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] mova m2, [dstq] mova m3, [dstq + 16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] mova m2, [dstq] mova m3, [dstq + dst_strideq*2] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m1, [secq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_half_y_zero_loop STORE_AND_RET .x_half_y_nonzero: cmp y_offsetd, 8 jne .x_half_y_nonhalf ; x_offset == 0.5 && y_offset == 0.5 %if %1 == 16 movu m0, [srcq] movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] lea srcq, [srcq + src_strideq*2] pavgw m0, m2 pavgw m1, m3 .x_half_y_half_loop: movu m2, [srcq] movu m3, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] pavgw m2, m4 pavgw m3, m5 pavgw m0, m2 pavgw m1, m3 mova m4, [dstq] mova m5, [dstq + 16] %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq+2] lea srcq, [srcq + src_strideq*2] pavgw m0, m2 .x_half_y_half_loop: movu m2, [srcq] movu m3, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] pavgw m2, m4 pavgw m3, m5 pavgw m0, m2 pavgw m2, m3 mova m4, [dstq] mova m5, [dstq + dst_strideq*2] %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m2, [secq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_half_y_half_loop STORE_AND_RET .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova 
m9, [bilin_filter+y_offsetq+16] mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86_32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0.5. We can reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif %if %1 == 16 movu m0, [srcq] movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] lea srcq, [srcq + src_strideq*2] pavgw m0, m2 pavgw m1, m3 .x_half_y_other_loop: movu m2, [srcq] movu m3, [srcq+16] movu m4, [srcq+2] movu m5, [srcq+18] pavgw m2, m4 pavgw m3, m5 mova m4, m2 mova m5, m3 pmullw m1, filter_y_a pmullw m3, filter_y_b paddw m1, filter_rnd paddw m1, m3 pmullw m0, filter_y_a pmullw m2, filter_y_b paddw m0, filter_rnd psrlw m1, 4 paddw m0, m2 mova m2, [dstq] psrlw m0, 4 mova m3, [dstq+16] %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 lea srcq, [srcq + src_strideq*2] lea dstq, [dstq + dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq+2] lea srcq, [srcq + src_strideq*2] pavgw m0, m2 .x_half_y_other_loop: movu m2, [srcq] movu m3, [srcq+src_strideq*2] movu m4, [srcq+2] movu m5, [srcq+src_strideq*2+2] pavgw m2, m4 pavgw m3, m5 mova m4, m2 mova m5, m3 pmullw m4, filter_y_a pmullw m3, filter_y_b paddw m4, filter_rnd paddw m4, m3 pmullw m0, filter_y_a pmullw m2, filter_y_b paddw m0, filter_rnd psrlw m4, 4 paddw m0, m2 mova m2, [dstq] psrlw m0, 4 mova m3, [dstq+dst_strideq*2] %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m4, [secq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 lea srcq, [srcq + src_strideq*4] lea dstq, [dstq + dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_half_y_other_loop %undef filter_y_a %undef filter_y_b %undef filter_rnd STORE_AND_RET .x_nonhalf: test y_offsetd, y_offsetd jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0. We can reuse y_offset reg. 
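; x_offsetq is turned into the filter-table pointer; tempq (aliasing the unused y_offsetq) holds the address of the pw_8 rounding constant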
%define tempq y_offsetq add x_offsetq, g_bilin_filterm %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif .x_other_y_zero_loop: %if %1 == 16 movu m0, [srcq] movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] mova m4, [dstq] mova m5, [dstq+16] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd paddw m1, m3 paddw m0, m2 psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m2, [srcq+2] movu m3, [srcq+src_strideq*2+2] mova m4, [dstq] mova m5, [dstq+dst_strideq*2] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd paddw m1, m3 paddw m0, m2 psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m1, [secq] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_other_y_zero_loop %undef filter_x_a %undef filter_x_b %undef filter_rnd STORE_AND_RET .x_nonhalf_y_nonzero: cmp y_offsetd, 8 jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0.5. We can reuse y_offset reg. 
%define tempq y_offsetq add x_offsetq, g_bilin_filterm %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif %if %1 == 16 movu m0, [srcq] movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 lea srcq, [srcq+src_strideq*2] .x_other_y_half_loop: movu m2, [srcq] movu m3, [srcq+16] movu m4, [srcq+2] movu m5, [srcq+18] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd pmullw m3, filter_x_a pmullw m5, filter_x_b paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 mova m4, [dstq] mova m5, [dstq+16] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m1, m3 %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq+2] pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd paddw m0, m2 psrlw m0, 4 lea srcq, [srcq+src_strideq*2] .x_other_y_half_loop: movu m2, [srcq] movu m3, [srcq+src_strideq*2] movu m4, [srcq+2] movu m5, [srcq+src_strideq*2+2] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd pmullw m3, filter_x_a pmullw m5, filter_x_b paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 mova m4, [dstq] mova m5, [dstq+dst_strideq*2] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m2, m3 %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m2, [secq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_other_y_half_loop %undef filter_x_a %undef filter_x_b %undef filter_rnd STORE_AND_RET .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; In this case, there is NO unused register. Used src_stride register. Later, ; src_stride has to be loaded from stack when it is needed. 
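; src_strideq is temporarily repurposed as the filter-table base; the real stride is re-read from the stack by INC_SRC_BY_SRC_STRIDE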
%define tempq src_strideq mov tempq, g_bilin_filterm add x_offsetq, tempq add y_offsetq, tempq %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter add y_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter ; x_offset == bilin interpolation && y_offset == bilin interpolation %if %1 == 16 movu m0, [srcq] movu m2, [srcq+2] movu m1, [srcq+16] movu m3, [srcq+18] pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 INC_SRC_BY_SRC_STRIDE .x_other_y_other_loop: movu m2, [srcq] movu m4, [srcq+2] movu m3, [srcq+16] movu m5, [srcq+18] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd pmullw m3, filter_x_a pmullw m5, filter_x_b paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 psrlw m2, 4 psrlw m3, 4 mova m4, m2 mova m5, m3 pmullw m0, filter_y_a pmullw m2, filter_y_b paddw m0, filter_rnd pmullw m1, filter_y_a pmullw m3, filter_y_b paddw m0, m2 paddw m1, filter_rnd mova m2, [dstq] paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova m3, [dstq+16] %if %2 == 1 ; avg pavgw m0, [secq] pavgw m1, [secq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 INC_SRC_BY_SRC_STRIDE lea dstq, [dstq + dst_strideq * 2] %if %2 == 1 ; avg add secq, sec_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq+2] pmullw m0, filter_x_a pmullw m2, filter_x_b paddw m0, filter_rnd paddw m0, m2 psrlw m0, 4 INC_SRC_BY_SRC_STRIDE .x_other_y_other_loop: movu m2, [srcq] movu m4, [srcq+2] INC_SRC_BY_SRC_STRIDE movu m3, [srcq] movu m5, [srcq+2] pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m2, filter_rnd pmullw m3, filter_x_a pmullw m5, filter_x_b paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 psrlw m2, 4 psrlw m3, 4 mova m4, m2 mova m5, m3 pmullw m0, filter_y_a pmullw m2, filter_y_b paddw m0, filter_rnd pmullw m4, filter_y_a pmullw m3, filter_y_b paddw m0, m2 paddw m4, filter_rnd mova m2, [dstq] paddw m4, m3 psrlw m0, 4 psrlw m4, 4 mova m3, [dstq+dst_strideq*2] %if %2 == 1 ; avg pavgw m0, [secq] add secq, sec_str pavgw m4, [secq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 INC_SRC_BY_SRC_STRIDE lea dstq, [dstq + dst_strideq * 4] %if %2 == 1 ; avg add secq, sec_str %endif %endif dec block_height jg .x_other_y_other_loop %undef filter_x_a %undef filter_x_b %undef filter_y_a %undef filter_y_b %undef filter_rnd STORE_AND_RET %endmacro INIT_XMM sse2 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 INIT_XMM sse2 SUBPEL_VARIANCE 8, 1 SUBPEL_VARIANCE 16, 1 aom-3.12.1/aom_dsp/x86/highbd_subtract_sse2.c000066400000000000000000000266361477627663500207210ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, const uint16_t *pred, ptrdiff_t pred_stride); static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, const uint16_t *pred, ptrdiff_t pred_stride) { __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); _mm_storel_epi64((__m128i *)store_diff, x0); store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); } static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, const uint16_t *pred, ptrdiff_t pred_stride) { __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride)); u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride)); u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride)); u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride)); v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride)); v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride)); v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride)); v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); x4 = _mm_sub_epi16(u4, v4); x5 = _mm_sub_epi16(u5, v5); x6 = _mm_sub_epi16(u6, v6); x7 = _mm_sub_epi16(u7, v7); _mm_storel_epi64((__m128i *)store_diff, x0); store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); store_diff = (int64_t *)(diff + 4 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x4); store_diff = (int64_t *)(diff + 5 * diff_stride); 
_mm_storel_epi64((__m128i *)store_diff, x5); store_diff = (int64_t *)(diff + 6 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x6); store_diff = (int64_t *)(diff + 7 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x7); } static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, const uint16_t *pred, ptrdiff_t pred_stride) { __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); } static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, const uint16_t *pred, ptrdiff_t pred_stride) { __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); x4 = _mm_sub_epi16(u4, v4); x5 = _mm_sub_epi16(u5, v5); x6 = _mm_sub_epi16(u6, v6); x7 = _mm_sub_epi16(u7, v7); _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } #define STACK_V(h, fun) \ do { \ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ pred + pred_stride * h, pred_stride); \ } while (0) #define 
STACK_H(w, fun) \ do { \ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ } while (0) #define SUBTRACT_FUN(size) \ static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ const uint16_t *src, ptrdiff_t src_stride, \ const uint16_t *pred, ptrdiff_t pred_stride) SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { if (rows == 4) { if (cols == 4) return subtract_4x4; if (cols == 8) return subtract_8x4; if (cols == 16) return subtract_16x4; } if (rows == 8) { if (cols == 4) return subtract_4x8; if (cols == 8) return subtract_8x8; if (cols == 16) return subtract_16x8; if (cols == 32) return subtract_32x8; } if (rows == 16) { if (cols == 4) return subtract_4x16; if (cols == 8) return subtract_8x16; if (cols == 16) return subtract_16x16; if (cols == 32) return subtract_32x16; if (cols == 64) return subtract_64x16; } if (rows == 32) { if (cols == 8) return subtract_8x32; if (cols == 16) return subtract_16x32; if (cols == 32) return subtract_32x32; if (cols == 64) return subtract_64x32; } if (rows == 64) { if (cols == 16) return subtract_16x64; if (cols == 32) return subtract_32x64; if (cols == 64) return subtract_64x64; if (cols == 128) return subtract_128x64; } if (rows == 128) { if (cols == 64) return subtract_64x128; if (cols == 128) return subtract_128x128; } assert(0); return NULL; } void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); SubtractWxHFuncType func; func = getSubtractFunc(rows, cols); func(diff, diff_stride, src, src_stride, pred, pred_stride); } aom-3.12.1/aom_dsp/x86/highbd_variance_avx2.c000066400000000000000000001112561477627663500206570ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // AVX2 #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/synonyms.h" typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); static uint32_t aom_highbd_var_filter_block2d_bil_avx2( const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8, int dst_stride, uint32_t *sse) { const __m256i filter1 = _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) | bilinear_filters_2t[xoffset][0]); const __m256i filter2 = _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) | bilinear_filters_2t[yoffset][0]); const __m256i one = _mm256_set1_epi16(1); const int bitshift = 0x40; (void)pixel_step; unsigned int i, j, prev = 0, curr = 2; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8); uint16_t *src_ptr_ref = src_ptr; uint16_t *dst_ptr_ref = dst_ptr; int64_t sum_long = 0; uint64_t sse_long = 0; unsigned int rshift = 0, inc = 1; __m256i rbias = _mm256_set1_epi32(bitshift); __m256i opointer[8]; unsigned int range; if (xoffset == 0) { if (yoffset == 0) { // xoffset==0 && yoffset==0 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr); src_ptr += src_pixels_per_line; __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else if (yoffset == 4) { // xoffset==0 && yoffset==4 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr); src_ptr += src_pixels_per_line; curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 
1 : 0; opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr); src_ptr += src_pixels_per_line; __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else { // xoffset==0 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr); src_ptr += src_pixels_per_line; curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 1 : 0; opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr); src_ptr += src_pixels_per_line; __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); __m256i V_S_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); __m256i V_S_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } } else if (xoffset == 4) { if (yoffset == 0) { // xoffset==4 && yoffset==0 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); curr = 0; } __m256i sum1 = 
_mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 1 : 0; __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); __m256i V_S_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); __m256i V_S_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else if (yoffset == 4) { // xoffset==4 && yoffset==4 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 
1 : 0; __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else { // xoffset==4 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 
1 : 0; __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); __m256i V_S_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); __m256i V_S_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); __m256i V_V_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); __m256i V_V_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; 
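// xoffset in {1,2,3,5,6,7}, yoffset == 4: filter each 16-wide strip horizontally with the
// 2-tap bilinear kernel, then average consecutive filtered rows for the vertical half-pel offset.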
for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2); __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2); __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1); __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1); __m256i V_H_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7); __m256i V_H_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7); opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2); curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < 16 / inc; ++i) { prev = curr; curr = (curr == 0) ? 1 : 0; __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); __m256i V_V_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); __m256i V_V_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7 range = output_width / 16; if (output_height == 8) inc = 2; if (output_height == 4) inc = 4; unsigned int nloop = 16 / inc; for (j = 0; j < range * output_height * inc / 16; j++) { if (j % (output_height * inc / 16) == 0) { src_ptr = src_ptr_ref; src_ptr_ref += 16; dst_ptr = dst_ptr_ref; dst_ptr_ref += 16; __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2); __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2); __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1); __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1); __m256i V_H_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7); __m256i V_H_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7); opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2); curr = 0; } __m256i sum1 = _mm256_setzero_si256(); __m256i sse1 = _mm256_setzero_si256(); for (i = 0; i < nloop; ++i) { prev = curr; curr = !curr; __m256i V_V_D1 = 
_mm256_loadu_si256((const __m256i *)src_ptr); __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); src_ptr += src_pixels_per_line; __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); __m256i V_V_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); __m256i V_V_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); __m256i V_S_S1 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); __m256i V_S_S2 = _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); dst_ptr += dst_stride; __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); sum1 = _mm256_add_epi16(sum1, V_R_SUB); sse1 = _mm256_add_epi32(sse1, V_R_MAD); } __m256i v_sum0 = _mm256_madd_epi16(sum1, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); sum_long += _mm_extract_epi32(v_d, 0); sse_long += _mm_extract_epi32(v_d, 1); } rshift = get_msb(output_height) + get_msb(output_width); } *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift); return (var > 0) ? 
var : 0; } static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); for (int i = 0; i < 8; i += 2) { const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); src += src_stride * 2; ref += ref_stride * 2; } __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); *sum = _mm_extract_epi32(v_d, 0); *sse = _mm_extract_epi32(v_d, 1); } static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); for (int i = 0; i < 16; ++i) { const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); src += src_stride; ref += ref_stride; } __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); *sum = _mm_extract_epi32(v_d, 0); *sse = _mm_extract_epi32(v_d, 1); } static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; int32_t sum_long = 0; for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } } *sum = ROUND_POWER_OF_TWO(sum_long, 2); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } #define VAR_FN(w, h, block_size, shift) \ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int 
ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ highbd_calc##block_size##x##block_size##var_avx2, \ block_size); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ return (var >= 0) ? (uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) VAR_FN(128, 64, 16, 13) VAR_FN(64, 128, 16, 13) VAR_FN(64, 64, 16, 12) VAR_FN(64, 32, 16, 11) VAR_FN(32, 64, 16, 11) VAR_FN(32, 32, 16, 10) VAR_FN(32, 16, 16, 9) VAR_FN(16, 32, 16, 9) VAR_FN(16, 16, 16, 8) VAR_FN(16, 8, 8, 7) VAR_FN(8, 16, 8, 7) VAR_FN(8, 8, 8, 6) #if !CONFIG_REALTIME_ONLY VAR_FN(16, 64, 16, 10) VAR_FN(32, 8, 8, 8) VAR_FN(64, 16, 16, 10) VAR_FN(8, 32, 8, 8) #endif // !CONFIG_REALTIME_ONLY #undef VAR_FN unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, highbd_calc16x16var_avx2, 16); return *sse; } #define SSE2_HEIGHT(H) \ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr); SSE2_HEIGHT(8) SSE2_HEIGHT(16) #undef SSE2_HEIGHT #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *dst, int dst_stride, uint32_t *sse) { \ if (W == 8 && H == 16) \ return aom_highbd_10_sub_pixel_variance8x16_sse2( \ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ else if (W == 8 && H == 8) \ return aom_highbd_10_sub_pixel_variance8x8_sse2( \ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ else \ return aom_highbd_var_filter_block2d_bil_avx2( \ src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \ } HIGHBD_SUBPIX_VAR(128, 128) HIGHBD_SUBPIX_VAR(128, 64) HIGHBD_SUBPIX_VAR(64, 128) HIGHBD_SUBPIX_VAR(64, 64) HIGHBD_SUBPIX_VAR(64, 32) HIGHBD_SUBPIX_VAR(32, 64) HIGHBD_SUBPIX_VAR(32, 32) HIGHBD_SUBPIX_VAR(32, 16) HIGHBD_SUBPIX_VAR(16, 32) HIGHBD_SUBPIX_VAR(16, 16) HIGHBD_SUBPIX_VAR(16, 8) HIGHBD_SUBPIX_VAR(8, 16) HIGHBD_SUBPIX_VAR(8, 8) #undef HIGHBD_SUBPIX_VAR static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64; __m256i sub_result; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); for (int i = 0; i < h; i += 4) { reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride])); reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride])); dst0_8x16 = _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16)); dst1_8x16 = _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16)); dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20); reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); reg1_4x16 = 
_mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride])); reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride])); src0_8x16 = _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16)); src1_8x16 = _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16)); src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros); dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros); src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16); dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16); res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros); res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros); res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros); res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros); square_result = _mm256_add_epi64( square_result, _mm256_add_epi64( _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64), res3_4x64)); } const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(square_result), _mm256_extracti128_si256(square_result, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); xx_storel_64(&sum, sum_1x64); return sum; } static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64; __m256i sub_result; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); for (int i = 0; i < h; i += 2) { dst0_8x16 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride])); dst1_8x16 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride])); dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20); src0_8x16 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride])); src1_8x16 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride])); src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros); dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros); src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16); dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16); res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros); res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros); res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros); res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros); square_result = _mm256_add_epi64( square_result, _mm256_add_epi64( _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64), res3_4x64)); } const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(square_result), _mm256_extracti128_si256(square_result, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); xx_storel_64(&sum, sum_1x64); return sum; } uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); 
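// Unreachable for the block sizes allowed by the assertion above; the return value
// only silences the missing-return warning.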
return -1; } } aom-3.12.1/aom_dsp/x86/highbd_variance_impl_sse2.asm000066400000000000000000000235041477627663500222300ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "aom_ports/x86_abi_support.asm" SECTION .text ;unsigned int aom_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, ; unsigned char * ref_ptr, ; int recon_stride, ; unsigned int * SSE, ; int * Sum ;) globalsym(aom_highbd_calc16x16var_sse2) sym(aom_highbd_calc16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rbx push rsi push rdi ; end prolog mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] movsxd rax, DWORD PTR arg(1) ;[source_stride] movsxd rdx, DWORD PTR arg(3) ;[recon_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes ; Prefetch data prefetcht0 [rsi] prefetcht0 [rsi+16] prefetcht0 [rsi+rax] prefetcht0 [rsi+rax+16] lea rbx, [rsi+rax*2] prefetcht0 [rbx] prefetcht0 [rbx+16] prefetcht0 [rbx+rax] prefetcht0 [rbx+rax+16] prefetcht0 [rdi] prefetcht0 [rdi+16] prefetcht0 [rdi+rdx] prefetcht0 [rdi+rdx+16] lea rbx, [rdi+rdx*2] prefetcht0 [rbx] prefetcht0 [rbx+16] prefetcht0 [rbx+rdx] prefetcht0 [rbx+rdx+16] pxor xmm0, xmm0 ; clear xmm0 for unpack pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs pxor xmm6, xmm6 ; clear xmm6 for accumulating sse mov rcx, 16 .var16loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rdi] lea rbx, [rsi+rax*2] prefetcht0 [rbx] prefetcht0 [rbx+16] prefetcht0 [rbx+rax] prefetcht0 [rbx+rax+16] lea rbx, [rdi+rdx*2] prefetcht0 [rbx] prefetcht0 [rbx+16] prefetcht0 [rbx+rdx] prefetcht0 [rbx+rdx+16] pxor xmm5, xmm5 psubw xmm1, xmm2 movdqu xmm3, XMMWORD PTR [rsi+16] paddw xmm5, xmm1 pmaddwd xmm1, xmm1 movdqu xmm2, XMMWORD PTR [rdi+16] paddd xmm6, xmm1 psubw xmm3, xmm2 movdqu xmm1, XMMWORD PTR [rsi+rax] paddw xmm5, xmm3 pmaddwd xmm3, xmm3 movdqu xmm2, XMMWORD PTR [rdi+rdx] paddd xmm6, xmm3 psubw xmm1, xmm2 movdqu xmm3, XMMWORD PTR [rsi+rax+16] paddw xmm5, xmm1 pmaddwd xmm1, xmm1 movdqu xmm2, XMMWORD PTR [rdi+rdx+16] paddd xmm6, xmm1 psubw xmm3, xmm2 paddw xmm5, xmm3 pmaddwd xmm3, xmm3 paddd xmm6, xmm3 movdqa xmm1, xmm5 movdqa xmm2, xmm5 pcmpgtw xmm1, xmm0 pcmpeqw xmm2, xmm0 por xmm1, xmm2 pcmpeqw xmm1, xmm0 movdqa xmm2, xmm5 punpcklwd xmm5, xmm1 punpckhwd xmm2, xmm1 paddd xmm7, xmm5 paddd xmm7, xmm2 lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] sub rcx, 2 jnz .var16loop movdqa xmm4, xmm6 punpckldq xmm6, xmm0 punpckhdq xmm4, xmm0 movdqa xmm5, xmm7 paddd xmm6, xmm4 punpckldq xmm7, xmm0 punpckhdq xmm5, xmm0 paddd xmm7, xmm5 movdqa xmm4, xmm6 movdqa xmm5, xmm7 psrldq xmm4, 8 psrldq xmm5, 8 paddd xmm6, xmm4 paddd xmm7, xmm5 mov rdi, arg(4) ; [SSE] mov rax, arg(5) ; [Sum] movd DWORD PTR [rdi], xmm6 movd DWORD PTR [rax], xmm7 ; begin epilog pop rdi pop rsi pop rbx RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;unsigned int aom_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, ; unsigned char * ref_ptr, ; int recon_stride, ; unsigned int * SSE, ; int * Sum ;) globalsym(aom_highbd_calc8x8var_sse2) 
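; Accumulates the signed pixel-difference sum in xmm7 and the sum of squared differences
; in xmm6; the loop below processes four rows of 16-bit pixels per iteration.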
sym(aom_highbd_calc8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 push rbx push rsi push rdi ; end prolog mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] movsxd rax, DWORD PTR arg(1) ;[source_stride] movsxd rdx, DWORD PTR arg(3) ;[recon_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes ; Prefetch data prefetcht0 [rsi] prefetcht0 [rsi+rax] lea rbx, [rsi+rax*2] prefetcht0 [rbx] prefetcht0 [rbx+rax] prefetcht0 [rdi] prefetcht0 [rdi+rdx] lea rbx, [rdi+rdx*2] prefetcht0 [rbx] prefetcht0 [rbx+rdx] pxor xmm0, xmm0 ; clear xmm0 for unpack pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs pxor xmm6, xmm6 ; clear xmm6 for accumulating sse mov rcx, 8 .var8loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rdi] lea rbx, [rsi+rax*4] prefetcht0 [rbx] prefetcht0 [rbx+rax] lea rbx, [rbx+rax*2] prefetcht0 [rbx] prefetcht0 [rbx+rax] lea rbx, [rdi+rdx*4] prefetcht0 [rbx] prefetcht0 [rbx+rdx] lea rbx, [rbx+rdx*2] prefetcht0 [rbx] prefetcht0 [rbx+rdx] pxor xmm5, xmm5 psubw xmm1, xmm2 movdqu xmm3, XMMWORD PTR [rsi+rax] paddw xmm5, xmm1 pmaddwd xmm1, xmm1 movdqu xmm2, XMMWORD PTR [rdi+rdx] paddd xmm6, xmm1 lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] psubw xmm3, xmm2 movdqu xmm1, XMMWORD PTR [rsi] paddw xmm5, xmm3 pmaddwd xmm3, xmm3 movdqu xmm2, XMMWORD PTR [rdi] paddd xmm6, xmm3 psubw xmm1, xmm2 movdqu xmm3, XMMWORD PTR [rsi+rax] paddw xmm5, xmm1 pmaddwd xmm1, xmm1 movdqu xmm2, XMMWORD PTR [rdi+rdx] paddd xmm6, xmm1 psubw xmm3, xmm2 paddw xmm5, xmm3 pmaddwd xmm3, xmm3 paddd xmm6, xmm3 movdqa xmm1, xmm5 movdqa xmm2, xmm5 pcmpgtw xmm1, xmm0 pcmpeqw xmm2, xmm0 por xmm1, xmm2 pcmpeqw xmm1, xmm0 movdqa xmm2, xmm5 punpcklwd xmm5, xmm1 punpckhwd xmm2, xmm1 paddd xmm7, xmm5 paddd xmm7, xmm2 lea rsi, [rsi + 2*rax] lea rdi, [rdi + 2*rdx] sub rcx, 4 jnz .var8loop movdqa xmm4, xmm6 punpckldq xmm6, xmm0 punpckhdq xmm4, xmm0 movdqa xmm5, xmm7 paddd xmm6, xmm4 punpckldq xmm7, xmm0 punpckhdq xmm5, xmm0 paddd xmm7, xmm5 movdqa xmm4, xmm6 movdqa xmm5, xmm7 psrldq xmm4, 8 psrldq xmm5, 8 paddd xmm6, xmm4 paddd xmm7, xmm5 mov rdi, arg(4) ; [SSE] mov rax, arg(5) ; [Sum] movd DWORD PTR [rdi], xmm6 movd DWORD PTR [rax], xmm7 ; begin epilog pop rdi pop rsi pop rbx RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/aom_dsp/x86/highbd_variance_sse2.c000066400000000000000000001137071477627663500206560ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // SSE2 #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "av1/common/filter.h" #include "av1/common/reconinter.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; *sse = 0; *sum = 0; for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; int32_t sum_long = 0; for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } } *sum = ROUND_POWER_OF_TWO(sum_long, 2); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int w, int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; int32_t sum_long = 0; for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } } *sum = ROUND_POWER_OF_TWO(sum_long, 4); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); } #define VAR_FN(w, h, block_size, shift) \ uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, uint32_t *sse) { \ int sum; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } \ \ uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ highbd_10_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } \ \ uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, uint32_t *sse) { \ int sum; \ int64_t var; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ highbd_12_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ return (var >= 0) ? (uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) VAR_FN(128, 64, 16, 13) VAR_FN(64, 128, 16, 13) VAR_FN(64, 64, 16, 12) VAR_FN(64, 32, 16, 11) VAR_FN(32, 64, 16, 11) VAR_FN(32, 32, 16, 10) VAR_FN(32, 16, 16, 9) VAR_FN(16, 32, 16, 9) VAR_FN(16, 16, 16, 8) VAR_FN(16, 8, 8, 7) VAR_FN(8, 16, 8, 7) VAR_FN(8, 8, 8, 6) #if !CONFIG_REALTIME_ONLY VAR_FN(8, 32, 8, 8) VAR_FN(32, 8, 8, 8) VAR_FN(16, 64, 16, 10) VAR_FN(64, 16, 16, 10) #endif // !CONFIG_REALTIME_ONLY #undef VAR_FN unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, aom_highbd_calc16x16var_sse2, 16); return *sse; } unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, aom_highbd_calc16x16var_sse2, 16); return *sse; } unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, aom_highbd_calc16x16var_sse2, 16); return *sse; } unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, aom_highbd_calc8x8var_sse2, 8); return *sse; } unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, aom_highbd_calc8x8var_sse2, 8); return *sse; } unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, aom_highbd_calc8x8var_sse2, 8); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. 
// These definitions are for functions defined in // highbd_subpel_variance_impl_sse2.asm #define DECL(w, opt) \ int aom_highbd_sub_pixel_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ const uint16_t *dst, ptrdiff_t dst_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ DECL(8, opt) \ DECL(16, opt) DECLS(sse2) #undef DECLS #undef DECL #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ int se = 0; \ unsigned int sse = 0; \ unsigned int sse2; \ int row_rep = (w > 64) ? 2 : 1; \ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ src += wd_64 * 64; \ dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ } \ } \ *sse_ptr = sse; \ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ int se = 0; \ int row_rep = (w > 64) ? 2 : 1; \ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ src += wd_64 * 64; \ dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ NULL); \ se += se2; \ long_sse += sse; \ if (w > wf) { \ uint32_t sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ dst_stride, h, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 2); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } \ \ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ int start_row; \ uint32_t sse; \ int se = 0; \ int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ int row_rep = (w > 64) ? 2 : 1; \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ uint16_t *src_tmp = src + (start_row * src_stride); \ uint16_t *dst_tmp = dst + (start_row * dst_stride); \ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ src_tmp += wd_64 * 64; \ dst_tmp += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? (uint32_t)var : 0; \ } #if CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int64_t)) \ FN(8, 16, 8, 3, 4, opt, (int64_t)) \ FN(8, 8, 8, 3, 3, opt, (int64_t)) \ FN(8, 4, 8, 3, 2, opt, (int64_t)) #else // !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int64_t)) \ FN(8, 16, 8, 3, 4, opt, (int64_t)) \ FN(8, 8, 8, 3, 3, opt, (int64_t)) \ FN(8, 4, 8, 3, 2, opt, (int64_t)) \ FN(16, 4, 16, 4, 2, opt, (int64_t)) \ FN(8, 32, 8, 3, 5, opt, (int64_t)) \ FN(32, 8, 16, 5, 3, opt, (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t)) #endif // CONFIG_REALTIME_ONLY FNS(sse2) #undef FNS #undef FN // The 2 unused parameters are place holders for PIC enabled build. 
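// DECL/DECLS declare the width-specific assembly kernels used below; the FN macro composes
// them into the full-size 8-, 10- and 12-bit sub-pixel average variance functions.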
#define DECL(w, opt) \ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ void *unused); #define DECLS(opt) \ DECL(16, opt) \ DECL(8, opt) DECLS(sse2) #undef DECL #undef DECLS #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ } \ *sse_ptr = sse; \ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ } \ se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } \ \ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ int64_t var; \ uint32_t sse; \ int se = 0; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + wf + (start_row * src_stride), src_stride, x_offset, \ y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } #if CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int64_t)) \ FN(8, 16, 8, 3, 4, opt, (int64_t)) \ FN(8, 8, 8, 3, 3, opt, (int64_t)) \ FN(8, 4, 8, 3, 2, opt, (int64_t)) #else // !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int64_t)) \ FN(8, 16, 8, 3, 4, opt, (int64_t)) \ FN(8, 8, 8, 3, 3, opt, (int64_t)) \ FN(8, 4, 8, 3, 2, opt, (int64_t)) \ FN(16, 4, 16, 4, 2, opt, (int64_t)) \ FN(8, 32, 8, 3, 5, opt, (int64_t)) \ FN(32, 8, 16, 5, 3, opt, (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t)) #endif // CONFIG_REALTIME_ONLY FNS(sse2) #undef FNS #undef FN static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16; __m128i src_8x16; __m128i dst_8x16; __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64; __m128i sub_result_8x16; const __m128i zeros = _mm_setzero_si128(); __m128i square_result = _mm_setzero_si128(); for (int i = 0; i < h; i += 2) { reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16); reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16); sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16); res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros); res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros); res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32); res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32); res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros); res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros); res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros); res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros); square_result = _mm_add_epi64( square_result, _mm_add_epi64( _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64), res3_4x64)); } const __m128i sum_1x64 = _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); xx_storel_64(&sum, sum_1x64); return sum; } static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i src_8x16; __m128i dst_8x16; __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64; __m128i sub_result_8x16; const __m128i zeros = _mm_setzero_si128(); __m128i square_result = _mm_setzero_si128(); for (int i = 0; i < h; i++) { dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]); src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]); sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16); res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros); res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros); res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32); res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32); res0_4x64 = _mm_unpacklo_epi32(res0_4x32, 
zeros); res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros); res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros); res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros); square_result = _mm_add_epi64( square_result, _mm_add_epi64( _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64), res3_4x64)); } const __m128i sum_1x64 = _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); xx_storel_64(&sum, sum_1x64); return sum; } uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } aom-3.12.1/aom_dsp/x86/highbd_variance_sse4.c000066400000000000000000000200561477627663500206520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include /* SSE4.1 */ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" static inline void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, uint64_t *sse, int64_t *sum) { __m128i u0, u1, u2, u3; __m128i s0, s1, s2, s3; __m128i t0, t1, x0, y0; __m128i a0, a1, a2, a3; __m128i b0, b1, b2, b3; __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); u0 = _mm_unpacklo_epi16(a0, a1); u1 = _mm_unpacklo_epi16(a2, a3); u2 = _mm_unpacklo_epi16(b0, b1); u3 = _mm_unpacklo_epi16(b2, b3); s0 = _mm_sub_epi16(u0, u2); s1 = _mm_sub_epi16(u1, u3); t0 = _mm_madd_epi16(s0, k_one_epi16); t1 = _mm_madd_epi16(s1, k_one_epi16); s2 = _mm_hadd_epi32(t0, t1); s3 = _mm_hadd_epi32(s2, s2); y0 = _mm_hadd_epi32(s3, s3); t0 = _mm_madd_epi16(s0, s0); t1 = _mm_madd_epi16(s1, s1); s2 = _mm_hadd_epi32(t0, t1); s3 = _mm_hadd_epi32(s2, s2); x0 = _mm_hadd_epi32(s3, s3); *sse = (uint64_t)_mm_extract_epi32(x0, 0); *sum = (int64_t)_mm_extract_epi32(y0, 0); } uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum, diff; uint64_t local_sse; variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); *sse = (uint32_t)local_sse; diff = (int64_t)*sse - ((sum * sum) >> 4); return (diff >= 0) ? 
(uint32_t)diff : 0; } uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum, diff; uint64_t local_sse; variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); sum = ROUND_POWER_OF_TWO(sum, 2); diff = (int64_t)*sse - ((sum * sum) >> 4); return (diff >= 0) ? (uint32_t)diff : 0; } uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum, diff; uint64_t local_sse; variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); sum = ROUND_POWER_OF_TWO(sum, 4); diff = (int64_t)*sse - ((sum * sum) >> 4); return diff >= 0 ? (uint32_t)diff : 0; } // Sub-pixel uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, sse); } uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, sse); } uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, sse); } // Sub-pixel average uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); } uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); } uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); aom_highbd_var_filter_block2d_bil_first_pass( src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); } aom-3.12.1/aom_dsp/x86/intrapred_asm_sse2.asm000066400000000000000000000425071477627663500207460ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pb_1: times 16 db 1 pw_4: times 8 dw 4 pw_8: times 8 dw 8 pw_16: times 8 dw 16 pw_32: times 8 dw 32 dc_128: times 16 db 128 pw2_4: times 8 dw 2 pw2_8: times 8 dw 4 pw2_16: times 8 dw 8 pw2_32: times 8 dw 16 SECTION .text INIT_XMM sse2 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq movd m2, [leftq] movd m0, [aboveq] pxor m1, m1 punpckldq m0, m2 psadbw m0, m1 paddw m0, [GLOBAL(pw_4)] psraw m0, 3 pshuflw m0, m0, 0x0 packuswb m0, m0 movd [dstq ], m0 movd [dstq+strideq], m0 lea dstq, [dstq+strideq*2] movd [dstq ], m0 movd [dstq+strideq], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset movifnidn leftq, leftmp GET_GOT goffsetq pxor m1, m1 movd m0, [leftq] psadbw m0, m1 paddw m0, [GLOBAL(pw2_4)] psraw m0, 2 pshuflw m0, m0, 0x0 packuswb m0, m0 movd [dstq ], m0 movd [dstq+strideq], m0 lea dstq, [dstq+strideq*2] movd [dstq ], m0 movd [dstq+strideq], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 movd m0, [aboveq] psadbw m0, m1 paddw m0, [GLOBAL(pw2_4)] psraw m0, 2 pshuflw m0, m0, 0x0 packuswb m0, m0 movd [dstq ], m0 movd [dstq+strideq], m0 lea dstq, [dstq+strideq*2] movd [dstq ], m0 movd [dstq+strideq], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 movq m0, [aboveq] movq m2, [leftq] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] psadbw m0, m1 psadbw m2, m1 paddw m0, m2 paddw m0, [GLOBAL(pw_8)] psraw m0, 4 punpcklbw m0, m0 pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 
RESTORE_GOT RET INIT_XMM sse2 cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 movq m0, [aboveq] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] psadbw m0, m1 paddw m0, [GLOBAL(pw2_8)] psraw m0, 3 punpcklbw m0, m0 pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset movifnidn leftq, leftmp GET_GOT goffsetq pxor m1, m1 movq m0, [leftq] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] psadbw m0, m1 paddw m0, [GLOBAL(pw2_8)] psraw m0, 3 punpcklbw m0, m0 pshuflw m0, m0, 0x0 movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset GET_GOT goffsetq DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] movd m0, [GLOBAL(dc_128)] movd [dstq ], m0 movd [dstq+strideq ], m0 movd [dstq+strideq*2], m0 movd [dstq+stride3q ], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset GET_GOT goffsetq DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] movq m0, [GLOBAL(dc_128)] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RESTORE_GOT RET INIT_XMM sse2 cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] mova m2, [leftq] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 psadbw m0, m1 psadbw m2, m1 paddw m0, m2 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw_16)] psraw m0, 5 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq+strideq ], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 psadbw m0, m1 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_16)] psraw m0, 4 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq+strideq ], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [leftq] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 4 psadbw m0, m1 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_16)] psraw m0, 4 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq+strideq ], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, 
[strideq*3] mov lines4d, 4 mova m0, [GLOBAL(dc_128)] .loop: mova [dstq ], m0 mova [dstq+strideq ], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT RET INIT_XMM sse2 cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] mova m2, [aboveq+16] mova m3, [leftq] mova m4, [leftq+16] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 psadbw m0, m1 psadbw m2, m1 psadbw m3, m1 psadbw m4, m1 paddw m0, m2 paddw m0, m3 paddw m0, m4 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw_32)] psraw m0, 6 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq +16], m0 mova [dstq+strideq ], m0 mova [dstq+strideq +16], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16], m0 mova [dstq+stride3q ], m0 mova [dstq+stride3q +16], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [aboveq] mova m2, [aboveq+16] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 psadbw m0, m1 psadbw m2, m1 paddw m0, m2 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_32)] psraw m0, 5 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq +16], m0 mova [dstq+strideq ], m0 mova [dstq+strideq +16], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16], m0 mova [dstq+stride3q ], m0 mova [dstq+stride3q +16], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset GET_GOT goffsetq pxor m1, m1 mova m0, [leftq] mova m2, [leftq+16] DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 psadbw m0, m1 psadbw m2, m1 paddw m0, m2 movhlps m2, m0 paddw m0, m2 paddw m0, [GLOBAL(pw2_32)] psraw m0, 5 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 packuswb m0, m0 .loop: mova [dstq ], m0 mova [dstq +16], m0 mova [dstq+strideq ], m0 mova [dstq+strideq +16], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16], m0 mova [dstq+stride3q ], m0 mova [dstq+stride3q +16], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT REP_RET INIT_XMM sse2 cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq DEFINE_ARGS dst, stride, stride3, lines4 lea stride3q, [strideq*3] mov lines4d, 8 mova m0, [GLOBAL(dc_128)] .loop: mova [dstq ], m0 mova [dstq +16], m0 mova [dstq+strideq ], m0 mova [dstq+strideq +16], m0 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16], m0 mova [dstq+stride3q ], m0 mova [dstq+stride3q +16], m0 lea dstq, [dstq+strideq*4] dec lines4d jnz .loop RESTORE_GOT RET INIT_XMM sse2 cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above movd m0, [aboveq] movd [dstq ], m0 movd [dstq+strideq], m0 lea dstq, [dstq+strideq*2] movd [dstq ], m0 movd [dstq+strideq], m0 RET INIT_XMM sse2 cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above movq m0, [aboveq] DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] movq [dstq ], m0 movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 RET INIT_XMM sse2 cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above mova m0, [aboveq] DEFINE_ARGS dst, stride, stride3, nlines4 lea stride3q, [strideq*3] mov nlines4d, 
4 .loop: mova [dstq ], m0 mova [dstq+strideq ], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] dec nlines4d jnz .loop REP_RET INIT_XMM sse2 cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above mova m0, [aboveq] mova m1, [aboveq+16] DEFINE_ARGS dst, stride, stride3, nlines4 lea stride3q, [strideq*3] mov nlines4d, 8 .loop: mova [dstq ], m0 mova [dstq +16], m1 mova [dstq+strideq ], m0 mova [dstq+strideq +16], m1 mova [dstq+strideq*2 ], m0 mova [dstq+strideq*2+16], m1 mova [dstq+stride3q ], m0 mova [dstq+stride3q +16], m1 lea dstq, [dstq+strideq*4] dec nlines4d jnz .loop REP_RET INIT_XMM sse2 cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left movifnidn leftq, leftmp movd m0, [leftq] punpcklbw m0, m0 punpcklbw m0, m0 pshufd m1, m0, 0x1 movd [dstq ], m0 movd [dstq+strideq], m1 pshufd m2, m0, 0x2 lea dstq, [dstq+strideq*2] pshufd m3, m0, 0x3 movd [dstq ], m2 movd [dstq+strideq], m3 RET INIT_XMM sse2 cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left movifnidn leftq, leftmp mov lineq, -2 DEFINE_ARGS dst, stride, line, left, stride3 lea stride3q, [strideq*3] movq m0, [leftq ] punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 .loop: pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 movq [dstq ], m1 movq [dstq+strideq], m2 pshuflw m1, m0, 0xaa pshuflw m2, m0, 0xff movq [dstq+strideq*2], m1 movq [dstq+stride3q ], m2 pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 inc lineq lea dstq, [dstq+strideq*4] jnz .loop REP_RET INIT_XMM sse2 cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left movifnidn leftq, leftmp mov lineq, -4 DEFINE_ARGS dst, stride, line, left, stride3 lea stride3q, [strideq*3] .loop: movd m0, [leftq] punpcklbw m0, m0 punpcklbw m0, m0 ; l1 to l4 each repeated 4 times pshufd m1, m0, 0x0 ; l1 repeated 16 times pshufd m2, m0, 0x55 ; l2 repeated 16 times mova [dstq ], m1 mova [dstq+strideq ], m2 pshufd m1, m0, 0xaa pshufd m2, m0, 0xff mova [dstq+strideq*2], m1 mova [dstq+stride3q ], m2 inc lineq lea leftq, [leftq+4 ] lea dstq, [dstq+strideq*4] jnz .loop REP_RET INIT_XMM sse2 cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left movifnidn leftq, leftmp mov lineq, -8 DEFINE_ARGS dst, stride, line, left, stride3 lea stride3q, [strideq*3] .loop: movd m0, [leftq] punpcklbw m0, m0 punpcklbw m0, m0 ; l1 to l4 each repeated 4 times pshufd m1, m0, 0x0 ; l1 repeated 16 times pshufd m2, m0, 0x55 ; l2 repeated 16 times mova [dstq ], m1 mova [dstq+16 ], m1 mova [dstq+strideq ], m2 mova [dstq+strideq+16 ], m2 pshufd m1, m0, 0xaa pshufd m2, m0, 0xff mova [dstq+strideq*2 ], m1 mova [dstq+strideq*2+16], m1 mova [dstq+stride3q ], m2 mova [dstq+stride3q+16 ], m2 inc lineq lea leftq, [leftq+4 ] lea dstq, [dstq+strideq*4] jnz .loop REP_RET aom-3.12.1/aom_dsp/x86/intrapred_avx2.c000066400000000000000000005473101477627663500175560ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h" static inline __m256i dc_sum_64(const uint8_t *ref) { const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); const __m256i zero = _mm256_setzero_si256(); __m256i y0 = _mm256_sad_epu8(x0, zero); __m256i y1 = _mm256_sad_epu8(x1, zero); y0 = _mm256_add_epi64(y0, y1); __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); y0 = _mm256_add_epi64(u0, y0); u0 = _mm256_unpackhi_epi64(y0, y0); return _mm256_add_epi16(y0, u0); } static inline __m256i dc_sum_32(const uint8_t *ref) { const __m256i x = _mm256_loadu_si256((const __m256i *)ref); const __m256i zero = _mm256_setzero_si256(); __m256i y = _mm256_sad_epu8(x, zero); __m256i u = _mm256_permute2x128_si256(y, y, 1); y = _mm256_add_epi64(u, y); u = _mm256_unpackhi_epi64(y, y); return _mm256_add_epi16(y, u); } static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); dst += stride; } } static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r0); _mm256_storeu_si256((__m256i *)(dst + 32), *r1); dst += stride; } } static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); _mm256_storeu_si256((__m256i *)(dst + 32), *r); dst += stride; } } #if CONFIG_AV1_HIGHBITDEPTH static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, }; static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } }; static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } }; static DECLARE_ALIGNED(32, uint16_t, 
HighbdBaseMask[17][16]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } }; #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; r0 = _mm_unpacklo_epi16(x[0], x[1]); r1 = _mm_unpacklo_epi16(x[2], x[3]); r2 = _mm_unpacklo_epi16(x[4], x[5]); r3 = _mm_unpacklo_epi16(x[6], x[7]); r4 = _mm_unpacklo_epi16(x[8], x[9]); r5 = _mm_unpacklo_epi16(x[10], x[11]); r6 = _mm_unpacklo_epi16(x[12], x[13]); r7 = _mm_unpacklo_epi16(x[14], x[15]); r8 = _mm_unpacklo_epi32(r0, r1); r9 = _mm_unpackhi_epi32(r0, r1); r10 = _mm_unpacklo_epi32(r2, r3); r11 = _mm_unpackhi_epi32(r2, r3); r12 = _mm_unpacklo_epi32(r4, r5); r13 = _mm_unpackhi_epi32(r4, r5); r14 = _mm_unpacklo_epi32(r6, r7); r15 = _mm_unpackhi_epi32(r6, r7); r0 = _mm_unpacklo_epi64(r8, r9); r1 = _mm_unpackhi_epi64(r8, r9); r2 = _mm_unpacklo_epi64(r10, r11); r3 = _mm_unpackhi_epi64(r10, r11); r4 = _mm_unpacklo_epi64(r12, r13); r5 = _mm_unpackhi_epi64(r12, r13); r6 = _mm_unpacklo_epi64(r14, r15); r7 = _mm_unpackhi_epi64(r14, r15); d[0] = _mm_unpacklo_epi64(r0, r2); d[1] = _mm_unpacklo_epi64(r4, r6); d[2] = _mm_unpacklo_epi64(r1, r3); d[3] = _mm_unpacklo_epi64(r5, r7); d[4] = _mm_unpackhi_epi64(r0, r2); d[5] = _mm_unpackhi_epi64(r4, r6); d[6] = _mm_unpackhi_epi64(r1, r3); d[7] = _mm_unpackhi_epi64(r5, r7); } static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 
61 71 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 } static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, ww0, ww1; __m256i dd[16]; w0 = _mm256_unpacklo_epi16(x[0], x[1]); w1 = _mm256_unpacklo_epi16(x[2], x[3]); w2 = _mm256_unpacklo_epi16(x[4], x[5]); w3 = _mm256_unpacklo_epi16(x[6], x[7]); ww0 = _mm256_unpacklo_epi32(w0, w1); // ww1 = _mm256_unpacklo_epi32(w2, w3); // dd[0] = _mm256_unpacklo_epi64(ww0, ww1); dd[1] = _mm256_unpackhi_epi64(ww0, ww1); ww0 = _mm256_unpackhi_epi32(w0, w1); // ww1 = _mm256_unpackhi_epi32(w2, w3); // dd[2] = _mm256_unpacklo_epi64(ww0, ww1); dd[3] = _mm256_unpackhi_epi64(ww0, ww1); w0 = _mm256_unpackhi_epi16(x[0], x[1]); w1 = _mm256_unpackhi_epi16(x[2], x[3]); w2 = _mm256_unpackhi_epi16(x[4], x[5]); w3 = _mm256_unpackhi_epi16(x[6], x[7]); ww0 = _mm256_unpacklo_epi32(w0, w1); // ww1 = _mm256_unpacklo_epi32(w2, w3); // dd[4] = _mm256_unpacklo_epi64(ww0, ww1); dd[5] = _mm256_unpackhi_epi64(ww0, ww1); ww0 = _mm256_unpackhi_epi32(w0, w1); // ww1 = _mm256_unpackhi_epi32(w2, w3); // dd[6] = _mm256_unpacklo_epi64(ww0, ww1); dd[7] = _mm256_unpackhi_epi64(ww0, ww1); w0 = _mm256_unpacklo_epi16(x[8], x[9]); w1 = _mm256_unpacklo_epi16(x[10], x[11]); w2 = _mm256_unpacklo_epi16(x[12], x[13]); w3 = _mm256_unpacklo_epi16(x[14], x[15]); ww0 = _mm256_unpacklo_epi32(w0, w1); ww1 = _mm256_unpacklo_epi32(w2, w3); dd[8] = _mm256_unpacklo_epi64(ww0, ww1); dd[9] = _mm256_unpackhi_epi64(ww0, ww1); ww0 = _mm256_unpackhi_epi32(w0, w1); ww1 = _mm256_unpackhi_epi32(w2, w3); dd[10] = _mm256_unpacklo_epi64(ww0, ww1); dd[11] 
= _mm256_unpackhi_epi64(ww0, ww1); w0 = _mm256_unpackhi_epi16(x[8], x[9]); w1 = _mm256_unpackhi_epi16(x[10], x[11]); w2 = _mm256_unpackhi_epi16(x[12], x[13]); w3 = _mm256_unpackhi_epi16(x[14], x[15]); ww0 = _mm256_unpacklo_epi32(w0, w1); ww1 = _mm256_unpacklo_epi32(w2, w3); dd[12] = _mm256_unpacklo_epi64(ww0, ww1); dd[13] = _mm256_unpackhi_epi64(ww0, ww1); ww0 = _mm256_unpackhi_epi32(w0, w1); ww1 = _mm256_unpackhi_epi32(w2, w3); dd[14] = _mm256_unpacklo_epi64(ww0, ww1); dd[15] = _mm256_unpackhi_epi64(ww0, ww1); for (int i = 0; i < 8; i++) { d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); d[i + 8] = _mm256_insertf128_si256(dd[i + 8], _mm256_extracti128_si256(dd[i], 1), 0); } } #endif // CONFIG_AV1_HIGHBITDEPTH void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); __m256i sum_left = dc_sum_32(left); sum_left = _mm256_add_epi16(sum_left, sum_above); const __m256i thirtytwo = _mm256_set1_epi16(32); sum_left = _mm256_add_epi16(sum_left, thirtytwo); sum_left = _mm256_srai_epi16(sum_left, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum_left, zero); row_store_32xh(&row, 32, dst, stride); } void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_32(above); (void)left; const __m256i sixteen = _mm256_set1_epi16(16); sum = _mm256_add_epi16(sum, sixteen); sum = _mm256_srai_epi16(sum, 5); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_32xh(&row, 32, dst, stride); } void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_32(left); (void)above; const __m256i sixteen = _mm256_set1_epi16(16); sum = _mm256_add_epi16(sum, sixteen); sum = _mm256_srai_epi16(sum, 5); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_32xh(&row, 32, dst, stride); } void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_32xh(&row, 32, dst, stride); } void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row = _mm256_loadu_si256((const __m256i *)above); (void)left; row_store_32xh(&row, 32, dst, stride); } // There are 32 rows togeter. This function does line: // 0,1,2,3, and 16,17,18,19. The next call would do // 4,5,6,7, and 20,21,22,23. So 4 times of calling // would finish 32 rows. 
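// In scalar terms the H predictor simply broadcasts left[r] across row r of
// the block; the AVX2 helper below handles two groups of four rows per call
// because the shuffled register keeps the source pixels for rows 0..15 in its
// low 128-bit lane and those for rows 16..31 in its high lane. A minimal
// scalar sketch of the same predictor (editor's illustration, not the libaom
// implementation; assumes <string.h> for memset):
//
//   static void h_predictor_32x32_scalar(uint8_t *dst, ptrdiff_t stride,
//                                        const uint8_t *left) {
//     for (int r = 0; r < 32; ++r) {
//       // Fill row r of the 32x32 block with the r-th left-column pixel.
//       memset(dst + r * stride, left[r], 32);
//     }
//   }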
static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst, ptrdiff_t stride) { __m256i t[4]; __m256i m = _mm256_setzero_si256(); const __m256i inc = _mm256_set1_epi8(4); int i; for (i = 0; i < 4; i++) { t[i] = _mm256_shuffle_epi8(*row, m); __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); _mm256_storeu_si256((__m256i *)dst, r0); _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); dst += stride; m = _mm256_add_epi8(m, inc); } } void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); __m256i u = _mm256_unpacklo_epi8(left_col, left_col); __m256i v = _mm256_unpacklo_epi8(u, u); h_predictor_32x8line(&v, dst, stride); dst += stride << 2; v = _mm256_unpackhi_epi8(u, u); h_predictor_32x8line(&v, dst, stride); dst += stride << 2; u = _mm256_unpackhi_epi8(left_col, left_col); v = _mm256_unpacklo_epi8(u, u); h_predictor_32x8line(&v, dst, stride); dst += stride << 2; v = _mm256_unpackhi_epi8(u, u); h_predictor_32x8line(&v, dst, stride); } // ----------------------------------------------------------------------------- // Rectangle void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i top_sum = dc_sum_32_sse2(above); __m128i left_sum = dc_sum_16_sse2(left); left_sum = _mm_add_epi16(top_sum, left_sum); uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum); sum += 24; sum /= 48; const __m256i row = _mm256_set1_epi8((int8_t)sum); row_store_32xh(&row, 16, dst, stride); } void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((int8_t)sum); row_store_32xh(&row, 64, dst, stride); } void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 64; sum /= 128; const __m256i row = _mm256_set1_epi8((int8_t)sum); row_store_64xh(&row, 64, dst, stride); } void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_32(left); sum_left = _mm256_add_epi16(sum_left, sum_above); uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((int8_t)sum); row_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_64(above); __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); sum_left = _mm256_add_epi16(sum_left, sum_above); uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 40; sum /= 80; const __m256i row = _mm256_set1_epi8((int8_t)sum); row_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void 
aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_32(above); (void)left; const __m256i sixteen = _mm256_set1_epi16(16); sum = _mm256_add_epi16(sum, sixteen); sum = _mm256_srai_epi16(sum, 5); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_32xh(&row, 16, dst, stride); } void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_32(above); (void)left; const __m256i sixteen = _mm256_set1_epi16(16); sum = _mm256_add_epi16(sum, sixteen); sum = _mm256_srai_epi16(sum, 5); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_32xh(&row, 64, dst, stride); } void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_64(above); (void)left; const __m256i thirtytwo = _mm256_set1_epi16(32); sum = _mm256_add_epi16(sum, thirtytwo); sum = _mm256_srai_epi16(sum, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_64xh(&row, 64, dst, stride); } void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_64(above); (void)left; const __m256i thirtytwo = _mm256_set1_epi16(32); sum = _mm256_add_epi16(sum, thirtytwo); sum = _mm256_srai_epi16(sum, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_64(above); (void)left; const __m256i thirtytwo = _mm256_set1_epi16(32); sum = _mm256_add_epi16(sum, thirtytwo); sum = _mm256_srai_epi16(sum, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum = dc_sum_16_sse2(left); (void)above; const __m128i eight = _mm_set1_epi16(8); sum = _mm_add_epi16(sum, eight); sum = _mm_srai_epi16(sum, 4); const __m128i zero = _mm_setzero_si128(); const __m128i r = _mm_shuffle_epi8(sum, zero); const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); row_store_32xh(&row, 16, dst, stride); } void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_64(left); (void)above; const __m256i thirtytwo = _mm256_set1_epi16(32); sum = _mm256_add_epi16(sum, thirtytwo); sum = _mm256_srai_epi16(sum, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_32xh(&row, 64, dst, stride); } void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i sum = dc_sum_64(left); (void)above; const __m256i thirtytwo = _mm256_set1_epi16(32); sum = _mm256_add_epi16(sum, thirtytwo); sum = _mm256_srai_epi16(sum, 6); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_64xh(&row, 64, dst, stride); } void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const 
uint8_t *left) { __m256i sum = dc_sum_32(left); (void)above; const __m256i sixteen = _mm256_set1_epi16(16); sum = _mm256_add_epi16(sum, sixteen); sum = _mm256_srai_epi16(sum, 5); const __m256i zero = _mm256_setzero_si256(); __m256i row = _mm256_shuffle_epi8(sum, zero); row_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum = dc_sum_16_sse2(left); (void)above; const __m128i eight = _mm_set1_epi16(8); sum = _mm_add_epi16(sum, eight); sum = _mm_srai_epi16(sum, 4); const __m128i zero = _mm_setzero_si128(); const __m128i r = _mm_shuffle_epi8(sum, zero); const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); row_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_32xh(&row, 16, dst, stride); } void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_32xh(&row, 64, dst, stride); } void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_64xh(&row, 64, dst, stride); } void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m256i row = _mm256_set1_epi8((int8_t)0x80); row_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row = _mm256_loadu_si256((const __m256i *)above); (void)left; row_store_32xh(&row, 16, dst, stride); } void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row = _mm256_loadu_si256((const __m256i *)above); (void)left; row_store_32xh(&row, 64, dst, stride); } void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); (void)left; row_store_32x2xh(&row0, &row1, 64, dst, stride); } void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); (void)left; row_store_32x2xh(&row0, &row1, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); (void)left; row_store_32x2xh(&row0, 
&row1, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // PAETH_PRED // Return 16 16-bit pixels in one row (__m256i) static inline __m256i paeth_pred(const __m256i *left, const __m256i *top, const __m256i *topleft) { const __m256i base = _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); pl = _mm256_andnot_si256(mask1, *left); ptl = _mm256_and_si256(mask2, *topleft); pt = _mm256_andnot_si256(mask2, *top); pt = _mm256_or_si256(pt, ptl); pt = _mm256_and_si256(mask1, pt); return _mm256_or_si256(pt, pl); } // Return 16 8-bit pixels in one row (__m128i) static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, const __m256i *topleft) { const __m256i p0 = paeth_pred(left, top, topleft); const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); const __m256i p = _mm256_packus_epi16(p0, p1); return _mm256_castsi256_si128(p); } static inline __m256i get_top_vector(const uint8_t *above) { const __m128i x = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t0 = _mm_unpacklo_epi8(x, zero); const __m128i t1 = _mm_unpackhi_epi8(x, zero); return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); } void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i x = _mm_loadl_epi64((const __m128i *)left); const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); int i; for (i = 0; i < 8; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm256_add_epi16(rep, one); } } static inline __m256i get_left_vector(const uint8_t *left) { const __m128i x = _mm_load_si128((const __m128i *)left); return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); } void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); int i; for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm256_add_epi16(rep, one); } } void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); int i; for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); _mm_store_si128((__m128i 
*)dst, row); dst += stride; rep = _mm256_add_epi16(rep, one); } l = get_left_vector(left + 16); rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm256_add_epi16(rep, one); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); for (int j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); __m256i rep = _mm256_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm256_add_epi16(rep, one); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Return 32 8-bit pixels in one row (__m256i) static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, const __m256i *top1, const __m256i *topleft) { __m256i p0 = paeth_pred(left, top0, topleft); __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); const __m256i x0 = _mm256_packus_epi16(p0, p1); p0 = paeth_pred(left, top1, topleft); p1 = _mm256_permute4x64_epi64(p0, 0xe); const __m256i x1 = _mm256_packus_epi16(p0, p1); return _mm256_permute2x128_si256(x0, x1, 0x20); } void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i l = get_left_vector(left); const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); _mm256_storeu_si256((__m256i *)dst, r); dst += stride; rep = _mm256_add_epi16(rep, one); } } void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i l = get_left_vector(left); const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); dst += stride; rep = _mm256_add_epi16(rep, one); } l = get_left_vector(left + 16); rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); dst += stride; rep = _mm256_add_epi16(rep, one); } } void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); const __m256i 
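// Note on the rep/_mm256_shuffle_epi8 idiom used by the Paeth row loops in
// this file: rep starts at 0x8000 and is incremented by one per row. Read as
// per-16-bit-lane byte controls, each lane is (row, 0x80), so the shuffle
// copies byte `row` of the left vector into the low byte of every lane and
// zeroes the high byte (a 0x80 control forces 0). The result is left[row]
// zero-extended to 16 bits in all lanes, i.e. a whole row of the left
// neighbour obtained without a separate broadcast per row.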
one = _mm256_set1_epi16(1); int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); dst += stride; rep = _mm256_add_epi16(rep, one); } } } void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i t2 = get_top_vector(above + 32); const __m256i t3 = get_top_vector(above + 48); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); const __m256i one = _mm256_set1_epi16(1); int i, j; for (j = 0; j < 2; ++j) { const __m256i l = get_left_vector(left + j * 16); __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += stride; rep = _mm256_add_epi16(rep, one); } } } void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i t2 = get_top_vector(above + 32); const __m256i t3 = get_top_vector(above + 48); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); const __m256i one = _mm256_set1_epi16(1); int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += stride; rep = _mm256_add_epi16(rep, one); } } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i t2 = get_top_vector(above + 32); const __m256i t3 = get_top_vector(above + 48); const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); const __m256i one = _mm256_set1_epi16(1); int i; const __m256i l = get_left_vector(left); __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += 
stride; rep = _mm256_add_epi16(rep, one); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #if CONFIG_AV1_HIGHBITDEPTH static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((N + 4) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16; __m256i diff, c3f; __m128i a_mbase_x, max_base_x128, base_inc128, mask128; __m128i a0_128, a1_128; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm_set1_epi16(above[max_base_x]); max_base_x128 = _mm_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } a0_128 = _mm_loadu_si128((__m128i *)(above + base)); a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); if (upsample_above) { a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); a1_128 = _mm_srli_si128(a0_128, 8); base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, base + 10, base + 12, base + 14); shift = _mm256_srli_epi16( _mm256_and_si256( _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), _mm256_set1_epi16(0x3f)), 1); } else { base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, base + 5, base + 6, base + 7); shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); } a0 = _mm256_castsi128_si256(a0_128); a1 = _mm256_castsi128_si256(a1_128); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); res1 = _mm256_castsi256_si128(res); mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); x += dx; } } static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((N + 4) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16; __m256i diff; __m128i a_mbase_x, max_base_x128, base_inc128, mask128; a16 = _mm256_set1_epi32(16); a_mbase_x = _mm_set1_epi16(above[max_base_x]); max_base_x128 = _mm_set1_epi32(max_base_x); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); if (upsample_above) { a0 = _mm256_permutevar8x32_epi32( a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); shift = _mm256_srli_epi32( _mm256_and_si256( 
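// Scalar reference for the interpolation these z1 kernels vectorize (a
// sketch of the per-pixel math from the comment block above, ignoring
// upsampling): for column c of row r, with x = (r + 1) * dx,
//   base  = x >> frac_bits;    // integer step along `above`
//   shift = (x & 0x3f) >> 1;   // 5-bit fractional weight, 0..31
//   pred  = (above[base + c] * 32 + 16 +
//            (above[base + c + 1] - above[base + c]) * shift) >> 5;
// e.g. above[base] = 100, above[base + 1] = 140, shift = 8 gives
// (3200 + 16 + 40 * 8) >> 5 = 110, i.e. a 3/4 : 1/4 blend of the two
// neighbouring samples.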
_mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), _mm256_set1_epi32(0x3f)), 1); } else { base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); } diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); res1 = _mm256_castsi256_si128(res); res1 = _mm_packus_epi32(res1, res1); mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); x += dx; } } static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx, int bd) { __m128i dstvec[16]; if (bd < 12) { highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx); } else { highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx); } for (int i = 0; i < N; i++) { _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); } } static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((8 + N) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a0_1, a1_1, a32, a16; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi32(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi32(max_base_x); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res, res1, shift; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values } return; } a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); if (upsample_above) { a0 = _mm256_permutevar8x32_epi32( a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); a0_1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); a0_1 = _mm256_permutevar8x32_epi32( a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); base_inc256 = _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, base + 10, base + 12, base + 14); shift = _mm256_srli_epi32( _mm256_and_si256( _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), _mm256_set1_epi32(0x3f)), 1); } else { base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, base + 4, base + 5, base + 6, base + 7); shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); } diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); res1 = _mm256_packus_epi32( res, 
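// Why each block size has a 16-bit and a *_32bit_* variant (an inference
// from the bd < 12 dispatch in the *_z1_*xN_avx2 wrappers, not a statement
// from the original authors): the 16-bit kernels keep
// above[x] * 32 + 16 + diff * shift inside unsigned 16-bit range, which
// holds for 8- and 10-bit input (at most 1023 * 32 + 16 + 1023 * 31 < 65536)
// but can overflow for 12-bit samples (4095 * 32 alone exceeds 65535), so
// 12-bit blocks take the epi32 path instead.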
_mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); mask256 = _mm256_packs_epi32( mask256, _mm256_castsi128_si256( _mm256_extracti128_si256(mask256, 1))); // goto 16 bit res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); dst[r] = _mm256_castsi256_si128(res1); x += dx; } } static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((8 + N) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16, c3f; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; __m128i a0_x128, a1_x128; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res, res1, shift; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values } return; } a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); if (upsample_above) { __m128i mask, atmp0, atmp1, atmp2, atmp3; a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); atmp2 = _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); atmp3 = _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), _mm_set1_epi8(15)); a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, base + 10, base + 12, base + 14, 0, 0, 0, 0, 0, 0, 0, 0); shift = _mm256_srli_epi16( _mm256_and_si256( _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), 1); } else { a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, base + 5, base + 6, base + 7, 0, 0, 0, 0, 0, 0, 0, 0); shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); } a0 = _mm256_castsi128_si256(a0_x128); a1 = _mm256_castsi128_si256(a1_x128); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); dst[r] = _mm256_castsi256_si128(res1); x += dx; } } static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx, int bd) { __m128i dstvec[32]; if (bd < 12) { highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx); } else { highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx); } for (int i = 0; i < N; i++) { _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); } } static AOM_FORCE_INLINE void 
highbd_dr_prediction_32bit_z1_16xN_internal_avx2( int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((16 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a0_1, a1, a1_1, a32, a16; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi32(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res[2], res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 16 values } return; } __m256i shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[0] = _mm256_add_epi32(a32, b); res[0] = _mm256_srli_epi32(res[0], 5); res[0] = _mm256_packus_epi32( res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); int mdif = max_base_x - base; if (mdif > 8) { a0_1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); a1_1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[1] = _mm256_add_epi32(a32, b); res[1] = _mm256_srli_epi32(res[1], 5); res[1] = _mm256_packus_epi32( res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); } else { res[1] = a_mbase_x; } res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1); // 16 16bit values base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, base + 5, base + 6, base + 7, base + 8, base + 9, base + 10, base + 11, base + 12, base + 13, base + 14, base + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); x += dx; } } static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((16 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16, c3f; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 16 values } return; } __m256i shift = 
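// Edge handling shared by these z1 kernels: base_inc256 holds the per-lane
// sample index (base, base + 1, ...) and is compared against max_base_x256;
// _mm256_blendv_epi8 then substitutes above[max_base_x] (a_mbase_x) for any
// lane that would read past the prepared edge, and once base itself reaches
// max_base_x the remaining rows are filled with that pixel and the loop
// returns early. A scalar sketch of the same clamp (interp() is a
// hypothetical helper standing in for the 5-bit blend shown earlier):
//   pred[c] = (base + c < max_base_x) ? interp(base + c) : above[max_base_x];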
_mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); a0 = _mm256_loadu_si256((__m256i *)(above + base)); a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); // 16 16bit values base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, base + 5, base + 6, base + 7, base + 8, base + 9, base + 10, base + 11, base + 12, base + 13, base + 14, base + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); x += dx; } } static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx, int bd) { __m256i dstvec[64]; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx); } for (int i = 0; i < N; i++) { _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); } } static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi32(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res[2], res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 32 values dstvec[i + N] = a_mbase_x; } return; } __m256i shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); for (int j = 0; j < 32; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { res1 = a_mbase_x; } else { a0 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)(above + base + j))); a1 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)(above + base + 1 + j))); diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[0] = _mm256_add_epi32(a32, b); res[0] = _mm256_srli_epi32(res[0], 5); res[0] = _mm256_packus_epi32( res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); if (mdif > 8) { a0_1 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)(above + base + 8 + j))); a1_1 = _mm256_cvtepu16_epi32( _mm_loadu_si128((__m128i *)(above + base + 9 + j))); diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[1] = _mm256_add_epi32(a32, b); res[1] = _mm256_srli_epi32(res[1], 5); res[1] = _mm256_packus_epi32( res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); } else { res[1] = a_mbase_x; } res1 = 
_mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1); // 16 16bit values base_inc256 = _mm256_setr_epi16( base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, base + j + 5, base + j + 6, base + j + 7, base + j + 8, base + j + 9, base + j + 10, base + j + 11, base + j + 12, base + j + 13, base + j + 14, base + j + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); } if (!j) { dstvec[r] = res1; } else { dstvec[r + N] = res1; } } x += dx; } } static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16, c3f; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 32 values dstvec[i + N] = a_mbase_x; } return; } __m256i shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); for (int j = 0; j < 32; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { res = a_mbase_x; } else { a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); base_inc256 = _mm256_setr_epi16( base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, base + j + 5, base + j + 6, base + j + 7, base + j + 8, base + j + 9, base + j + 10, base + j + 11, base + j + 12, base + j + 13, base + j + 14, base + j + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); res = _mm256_blendv_epi8(a_mbase_x, res, mask256); } if (!j) { dstvec[r] = res; } else { dstvec[r + N] = res; } } x += dx; } } static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx, int bd) { __m256i dstvec[128]; if (bd < 12) { highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); } else { highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); } for (int i = 0; i < N; i++) { _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); } } static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 
5 __m256i a0, a0_1, a1, a1_1, a32, a16; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi32(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); int x = dx; for (int r = 0; r < N; r++, dst += stride) { __m256i b, res[2], res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); dst += stride; } return; } __m256i shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); __m128i a0_128, a0_1_128, a1_128, a1_1_128; for (int j = 0; j < 64; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); } else { a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); a0 = _mm256_cvtepu16_epi32(a0_128); a1 = _mm256_cvtepu16_epi32(a1_128); diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[0] = _mm256_add_epi32(a32, b); res[0] = _mm256_srli_epi32(res[0], 5); res[0] = _mm256_packus_epi32( res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); if (mdif > 8) { a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); a0_1 = _mm256_cvtepu16_epi32(a0_1_128); a1_1 = _mm256_cvtepu16_epi32(a1_1_128); diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res[1] = _mm256_add_epi32(a32, b); res[1] = _mm256_srli_epi32(res[1], 5); res[1] = _mm256_packus_epi32( res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); } else { res[1] = a_mbase_x; } res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1); // 16 16bit values base_inc256 = _mm256_setr_epi16( base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, base + j + 5, base + j + 6, base + j + 7, base + j + 8, base + j + 9, base + j + 10, base + j + 11, base + j + 12, base + j + 13, base + j + 14, base + j + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); _mm256_storeu_si256((__m256i *)(dst + j), res1); } } x += dx; } } static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16, c3f; __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi16(above[max_base_x]); max_base_x256 = _mm256_set1_epi16(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++, dst += stride) { __m256i b, res; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { 
_mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); dst += stride; } return; } __m256i shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); for (int j = 0; j < 64; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); } else { a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); base_inc256 = _mm256_setr_epi16( base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, base + j + 5, base + j + 6, base + j + 7, base + j + 8, base + j + 9, base + j + 10, base + j + 11, base + j + 12, base + j + 13, base + j + 14, base + j + 15); mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); res = _mm256_blendv_epi8(a_mbase_x, res, mask256); _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values } } x += dx; } } // Directional prediction, zone 1: 0 < angle < 90 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd) { (void)left; (void)dy; switch (bw) { case 4: highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx, bd); break; case 8: highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx, bd); break; case 16: highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx, bd); break; case 32: highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx, bd); break; case 64: if (bd < 12) { highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); } else { highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); } break; default: break; } return; } static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc, uint16_t *dst, ptrdiff_t pitchDst) { __m256i r[16]; __m256i d[16]; for (int j = 0; j < 16; j++) { r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc)); } highbd_transpose16x16_avx2(r, d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]); } } static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, uint16_t *dst, ptrdiff_t pitchDst, int width, int height) { for (int j = 0; j < height; j += 16) for (int i = 0; i < width; i += 16) highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i, pitchDst); } static void highbd_dr_prediction_32bit_z2_Nx4_avx2( int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0_x, a1_x, a32, a16; __m256i diff; __m128i c3f, min_base_y128; a16 = 
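// Zone 2 (90 < angle < 180) overview for the kernels below: each output row
// takes its leftmost pixels from the left column and the rest from the above
// row. base_x = (-(r + 1) * dx) >> frac_bits_x locates the first usable
// `above` sample; base_shift and base_min_diff count how many leading lanes
// fall before min_base_x and must come from `left` instead. Both candidates
// are computed with the same 5-bit blend (resx from above, resy from left)
// and merged with a blendv against HighbdBaseMask[base_min_diff], which
// selects the left-derived result for the first base_min_diff pixels of the
// row.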
_mm256_set1_epi32(16); c3f = _mm_set1_epi32(0x3f); min_base_y128 = _mm_set1_epi32(min_base_y); for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i resx, resy, resxy; __m128i a0_x128, a1_x128; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 4) { base_min_diff = 4; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 3) { a0_x = _mm256_setzero_si256(); a1_x = _mm256_setzero_si256(); shift = _mm256_setzero_si256(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); if (upsample_above) { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx4[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 8); shift = _mm256_castsi128_si256(_mm_srli_epi32( _mm_and_si128( _mm_slli_epi32( _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx), upsample_above), c3f), 1)); } else { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 2); shift = _mm256_castsi128_si256(_mm_srli_epi32( _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx), c3f), 1)); } a0_x = _mm256_cvtepu16_epi32(a0_x128); a1_x = _mm256_cvtepu16_epi32(a1_x128); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; DECLARE_ALIGNED(32, int, base_y_c[4]); r6 = _mm_set1_epi32(r << 6); dy128 = _mm_set1_epi32(dy); c1234 = _mm_setr_epi32(1, 2, 3, 4); y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]]); a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], left[base_y_c[3] + 1]); if (upsample_left) { shifty = _mm_srli_epi32( _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); } else { shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); } a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); shift = _mm256_inserti128_si256(shift, shifty, 1); } diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resx = _mm256_castsi256_si128(res); resx = _mm_packus_epi32(resx, resx); resy = _mm256_extracti128_si256(res, 1); resy = _mm_packus_epi32(resy, resy); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); _mm_storel_epi64((__m128i *)(dst), resxy); dst += stride; } } static void highbd_dr_prediction_z2_Nx4_avx2( int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated 
as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0_x, a1_x, a32, a16; __m256i diff; __m128i c3f, min_base_y128; a16 = _mm256_set1_epi16(16); c3f = _mm_set1_epi16(0x3f); min_base_y128 = _mm_set1_epi16(min_base_y); for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i resx, resy, resxy; __m128i a0_x128, a1_x128; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 4) { base_min_diff = 4; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 3) { a0_x = _mm256_setzero_si256(); a1_x = _mm256_setzero_si256(); shift = _mm256_setzero_si256(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); if (upsample_above) { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx4[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 8); shift = _mm256_castsi128_si256(_mm_srli_epi16( _mm_and_si128( _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, 0, 0, 0, 0), upsample_above), c3f), 1)); } else { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 2); shift = _mm256_castsi128_si256(_mm_srli_epi16( _mm_and_si128( _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, 0, 0, 0, 0), c3f), 1)); } a0_x = _mm256_castsi128_si256(a0_x128); a1_x = _mm256_castsi128_si256(a1_x128); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; DECLARE_ALIGNED(32, int16_t, base_y_c[8]); r6 = _mm_set1_epi16(r << 6); dy128 = _mm_set1_epi16(dy); c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, 0, 0); if (upsample_left) { shifty = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); } else { shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); } a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); shift = _mm256_inserti128_si256(shift, shifty, 1); } diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); resx = _mm256_castsi256_si128(res); resy = _mm256_extracti128_si256(res, 1); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); _mm_storel_epi64((__m128i *)(dst), resxy); dst += stride; } } static void highbd_dr_prediction_32bit_z2_Nx8_avx2( int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; // 
pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; __m256i diff; __m128i a0_x128, a1_x128; a16 = _mm256_set1_epi32(16); c3f = _mm256_set1_epi32(0x3f); min_base_y256 = _mm256_set1_epi32(min_base_y); for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i resx, resy, resxy; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 8) { base_min_diff = 8; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 7) { resx = _mm_setzero_si128(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); if (upsample_above) { __m128i mask, atmp0, atmp1, atmp2, atmp3; a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[base_shift]); atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[base_shift]); atmp2 = _mm_shuffle_epi8( a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); atmp3 = _mm_shuffle_epi8( a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], _mm_set1_epi8(15)); a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), _mm_set1_epi8(15)); a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); shift = _mm256_srli_epi32( _mm256_and_si256( _mm256_slli_epi32( _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, (4 << 6) - y * dx, (5 << 6) - y * dx, (6 << 6) - y * dx, (7 << 6) - y * dx), upsample_above), c3f), 1); } else { a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); shift = _mm256_srli_epi32( _mm256_and_si256( _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, (4 << 6) - y * dx, (5 << 6) - y * dx, (6 << 6) - y * dx, (7 << 6) - y * dx), c3f), 1); } a0_x = _mm256_cvtepu16_epi32(a0_x128); a1_x = _mm256_cvtepu16_epi32(a1_x128); diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resx = _mm256_castsi256_si128(_mm256_packus_epi32( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); } // y calc if (base_x < min_base_x) { DECLARE_ALIGNED(32, int, base_y_c[8]); __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; r6 = _mm256_set1_epi32(r << 6); dy256 = _mm256_set1_epi32(dy); c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], 
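// The left-column fetches in these z2 kernels go through a small aligned
// buffer (base_y_c): the per-lane indices are computed with SIMD, stored,
// and the samples are then reassembled with _mm_setr_epi16 /
// _mm256_cvtepu16_epi32 from scalar loads. This appears to be the usual
// workaround for data-dependent indexing of 16-bit samples, since AVX2 only
// provides 32/64-bit element gathers; clamping the indices against
// min_base_y beforehand keeps every left[] access in range (those lanes are
// masked out by the blend anyway).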
left[base_y_c[6]], left[base_y_c[7]])); a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], left[base_y_c[6] + 1], left[base_y_c[7] + 1])); if (upsample_left) { shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), 1); } else { shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); } diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resy = _mm256_castsi256_si128(_mm256_packus_epi32( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); } else { resy = resx; } resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); _mm_storeu_si128((__m128i *)(dst), resxy); dst += stride; } } static void highbd_dr_prediction_z2_Nx8_avx2( int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i c3f, min_base_y128; __m256i a0_x, a1_x, diff, a32, a16; __m128i a0_x128, a1_x128; a16 = _mm256_set1_epi16(16); c3f = _mm_set1_epi16(0x3f); min_base_y128 = _mm_set1_epi16(min_base_y); for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i resx, resy, resxy; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 8) { base_min_diff = 8; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 7) { a0_x = _mm256_setzero_si256(); a1_x = _mm256_setzero_si256(); shift = _mm256_setzero_si256(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); if (upsample_above) { __m128i mask, atmp0, atmp1, atmp2, atmp3; a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[base_shift]); atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[base_shift]); atmp2 = _mm_shuffle_epi8( a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); atmp3 = _mm_shuffle_epi8( a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], _mm_set1_epi8(15)); a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), _mm_set1_epi8(15)); a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); shift = _mm256_castsi128_si256(_mm_srli_epi16( _mm_and_si128( _mm_slli_epi16( _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, (4 << 6) - y * dx, (5 << 6) - y * dx, (6 << 6) - y * dx, (7 << 6) - y * dx), upsample_above), c3f), 1)); } else { a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = 
_mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); shift = _mm256_castsi128_si256(_mm_srli_epi16( _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, (3 << 6) - y * dx, (4 << 6) - y * dx, (5 << 6) - y * dx, (6 << 6) - y * dx, (7 << 6) - y * dx), c3f), 1)); } a0_x = _mm256_castsi128_si256(a0_x128); a1_x = _mm256_castsi128_si256(a1_x128); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { DECLARE_ALIGNED(32, int16_t, base_y_c[8]); __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; r6 = _mm_set1_epi16(r << 6); dy128 = _mm_set1_epi16(dy); c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], left[base_y_c[6] + 1], left[base_y_c[7] + 1]); if (upsample_left) { shifty = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); } else { shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); } a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); shift = _mm256_inserti128_si256(shift, shifty, 1); } diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); resx = _mm256_castsi256_si128(res); resy = _mm256_extracti128_si256(res, 1); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); _mm_storeu_si128((__m128i *)(dst), resxy); dst += stride; } } static void highbd_dr_prediction_32bit_z2_HxW_avx2( int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; const int min_base_y = -1; (void)upsample_above; (void)upsample_left; const int frac_bits_x = 6; const int frac_bits_y = 6; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; DECLARE_ALIGNED(32, int, base_y_c[16]); a16 = _mm256_set1_epi32(16); c1 = _mm256_srli_epi32(a16, 4); c8 = _mm256_srli_epi32(a16, 1); min_base_y256 = _mm256_set1_epi32(min_base_y); c3f = _mm256_set1_epi32(0x3f); dy256 = _mm256_set1_epi32(dy); c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); c1234 = _mm256_add_epi32(c0123, c1); for (int r = 0; r < H; r++) { __m256i b, res, shift, ydx; __m256i resx[2], resy[2]; __m256i resxy, j256, r6; for (int j = 0; j < W; j += 16) { j256 = _mm256_set1_epi32(j); int y = r + 1; ydx = _mm256_set1_epi32(y * dx); int base_x = ((j << 6) - y * dx) >> frac_bits_x; int base_shift = 0; if ((base_x) < 
(min_base_x - 1)) { base_shift = (min_base_x - base_x - 1); } int base_min_diff = (min_base_x - base_x); if (base_min_diff > 16) { base_min_diff = 16; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 7) { resx[0] = _mm256_setzero_si256(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a0_x = _mm256_cvtepu16_epi32(a0_x128); a1_x = _mm256_cvtepu16_epi32(a1_x128); r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resx[0] = _mm256_packus_epi32( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); } int base_shift8 = 0; if ((base_x + 8) < (min_base_x - 1)) { base_shift8 = (min_base_x - (base_x + 8) - 1); } if (base_shift8 > 7) { resx[1] = _mm256_setzero_si256(); } else { a0_1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); a1_1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, *(__m128i *)HighbdLoadMaskx[base_shift8]); a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, *(__m128i *)HighbdLoadMaskx[base_shift8]); a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); r6 = _mm256_slli_epi32( _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); shift = _mm256_srli_epi32( _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); resx[1] = _mm256_add_epi32(a32, b); resx[1] = _mm256_srli_epi32(resx[1], 5); resx[1] = _mm256_packus_epi32( resx[1], _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); } resx[0] = _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), 1); // 16 16bit values // y calc resy[0] = _mm256_setzero_si256(); if ((base_x < min_base_x)) { __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256; r6 = _mm256_set1_epi32(r << 6); c256 = _mm256_add_epi32(j256, c1234); y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); c256 = _mm256_add_epi32(c256, c8); y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]])); a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], 
left[base_y_c[6] + 1], left[base_y_c[7] + 1])); shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resy[0] = _mm256_packus_epi32( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]])); a1_y = _mm256_cvtepu16_epi32( _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], left[base_y_c[10] + 1], left[base_y_c[11] + 1], left[base_y_c[12] + 1], left[base_y_c[13] + 1], left[base_y_c[14] + 1], left[base_y_c[15] + 1])); shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi32(diff, shift); res = _mm256_add_epi32(a32, b); res = _mm256_srli_epi32(res, 5); resy[1] = _mm256_packus_epi32( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); resy[0] = _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), 1); // 16 16bit values } resxy = _mm256_blendv_epi8(resx[0], resy[0], *(__m256i *)HighbdBaseMask[base_min_diff]); _mm256_storeu_si256((__m256i *)(dst + j), resxy); } // for j dst += stride; } } static void highbd_dr_prediction_z2_HxW_avx2( int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; const int min_base_y = -1; (void)upsample_above; (void)upsample_left; const int frac_bits_x = 6; const int frac_bits_y = 6; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0_x, a1_x, a32, a16, c3f, c1; __m256i diff, min_base_y256, dy256, c1234, c0123; DECLARE_ALIGNED(32, int16_t, base_y_c[16]); a16 = _mm256_set1_epi16(16); c1 = _mm256_srli_epi16(a16, 4); min_base_y256 = _mm256_set1_epi16(min_base_y); c3f = _mm256_set1_epi16(0x3f); dy256 = _mm256_set1_epi16(dy); c0123 = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); c1234 = _mm256_add_epi16(c0123, c1); for (int r = 0; r < H; r++) { __m256i b, res, shift; __m256i resx, resy, ydx; __m256i resxy, j256, r6; __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; int y = r + 1; ydx = _mm256_set1_epi16((short)(y * dx)); for (int j = 0; j < W; j += 16) { j256 = _mm256_set1_epi16(j); int base_x = ((j << 6) - y * dx) >> frac_bits_x; int base_shift = 0; if ((base_x) < (min_base_x - 1)) { base_shift = (min_base_x - (base_x)-1); } int base_min_diff = (min_base_x - base_x); if (base_min_diff > 16) { base_min_diff = 16; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift < 8) { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); a0_x = 
_mm256_castsi128_si256(a0_x128); a1_x = _mm256_castsi128_si256(a1_x128); } else { a0_x = _mm256_setzero_si256(); a1_x = _mm256_setzero_si256(); } int base_shift1 = 0; if (base_shift > 8) { base_shift1 = base_shift - 8; } if (base_shift1 < 8) { a0_1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); a1_1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, *(__m128i *)HighbdLoadMaskx[base_shift1]); a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, *(__m128i *)HighbdLoadMaskx[base_shift1]); a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); } r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); shift = _mm256_srli_epi16( _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); resx = _mm256_srli_epi16(res, 5); // 16 16-bit values // y calc resy = _mm256_setzero_si256(); __m256i a0_y, a1_y, shifty; if ((base_x < min_base_x)) { __m256i c256, y_c256, base_y_c256, mask256, mul16; r6 = _mm256_set1_epi16(r << 6); c256 = _mm256_add_epi16(j256, c1234); mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), _mm256_srli_epi16(min_base_y256, 1)); y_c256 = _mm256_sub_epi16(r6, mul16); base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); a0_y = _mm256_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); base_y_c256 = _mm256_add_epi16(base_y_c256, c1); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); a1_y = _mm256_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shifty); res = _mm256_add_epi16(a32, b); resy = _mm256_srli_epi16(res, 5); } resxy = _mm256_blendv_epi8(resx, resy, *(__m256i *)HighbdBaseMask[base_min_diff]); _mm256_storeu_si256((__m256i *)(dst + j), resxy); } // for j dst += stride; } } // Directional prediction, zone 2: 90 < angle < 180 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); switch (bw) { case 4: if (bd < 12) { highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } else { highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } break; case 8: if (bd < 12) { highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, 
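/*
 * Note on the bd < 12 split used by this dispatcher (and by the z1/z3 helpers
 * elsewhere in this file): the 16-bit kernels evaluate
 *     res = a0 * (32 - shift) + a1 * shift + 16,   0 <= shift <= 31,
 * entirely in 16-bit lanes. The result is bounded by 32 * max(a0, a1) + 16,
 * i.e. at most 32 * 1023 + 16 = 32752 for 10-bit samples, which fits in a
 * uint16 lane. For 12-bit samples the bound is 32 * 4095 + 16 = 131056, which
 * does not, so bd == 12 is routed to the *_32bit_* variants that widen each
 * lane to 32 bits.
 */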
above, left, upsample_above, upsample_left, dx, dy); } else { highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } break; default: if (bd < 12) { highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } else { highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } break; } } // Directional prediction, zone 3 functions static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[4], d[4]; if (bd < 12) { highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy); } highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3]); _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); return; } static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[8], d[8]; if (bd < 12) { highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy); } highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[4], d[8]; if (bd < 12) { highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy); } highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); } } static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[8], d[4]; if (bd < 12) { highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy); } highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3]); _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); } static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[8], d[8]; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy); } highbd_transpose8x16_16x8_avx2(dstvec, d); for (int i = 0; i < 8; i++) { 
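/*
 * Zone-3 structure note: every z3 kernel in this file computes a zone-1 style
 * interpolation along the `left` edge into registers and then transposes the
 * result into the destination, so base and shift depend only on the output
 * column. Equivalent scalar form (illustrative sketch for the
 * upsample_left == 0 case, essentially the generic C fallback, not the SIMD
 * path):
 *
 *   const int max_base_y = bw + bh - 1;
 *   for (int c = 0; c < bw; ++c) {
 *     const int y = (c + 1) * dy;
 *     const int base0 = y >> 6;           // integer part of the step
 *     const int shift = (y & 0x3f) >> 1;  // 5-bit fraction
 *     for (int r = 0; r < bh; ++r) {
 *       const int base = base0 + r;
 *       dst[r * stride + c] =
 *           (base < max_base_y)
 *               ? (uint16_t)((left[base] * (32 - shift) +
 *                             left[base + 1] * shift + 16) >> 5)
 *               : left[max_base_y];
 *     }
 *   }
 */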
_mm_storeu_si128((__m128i *)(dst + i * stride), _mm256_castsi256_si128(d[i])); } for (int i = 8; i < 16; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), _mm256_extracti128_si256(d[i - 8], 1)); } } static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[16], d[16]; if (bd < 12) { highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy); } for (int i = 0; i < 16; i += 8) { highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], &d[5 + i], &d[6 + i], &d[7 + i]); } for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[4], d[4], d1; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy); } highbd_transpose4x16_avx2(dstvec, d); for (int i = 0; i < 4; i++) { _mm_storel_epi64((__m128i *)(dst + i * stride), _mm256_castsi256_si128(d[i])); d1 = _mm256_bsrli_epi128(d[i], 8); _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), _mm256_castsi256_si128(d1)); _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), _mm256_extracti128_si256(d[i], 1)); _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), _mm256_extracti128_si256(d1, 1)); } } static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[16], d[8]; if (bd < 12) { highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy); } highbd_transpose16x4_8x8_sse2(dstvec, d); _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]); _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); } static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[16], d[16]; if (bd < 12) { highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); } for (int i = 0; i < 16; i += 8) { highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); } for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), _mm256_castsi256_si128(d[i])); } for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), _mm256_extracti128_si256(d[i], 1)); } for (int i = 8; i < 16; i++) { _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), _mm256_castsi256_si128(d[i])); } for (int i = 8; i < 16; i++) { _mm_storeu_si128((__m128i 
*)(dst + (i + 16) * stride), _mm256_extracti128_si256(d[i], 1)); } } static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m128i dstvec[32], d[32]; if (bd < 12) { highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy); } for (int i = 0; i < 32; i += 8) { highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], &d[5 + i], &d[6 + i], &d[7 + i]); } for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[16], d[16]; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy); } highbd_transpose16x16_avx2(dstvec, d); for (int i = 0; i < 16; i++) { _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); } } static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[64], d[16]; if (bd < 12) { highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); } highbd_transpose16x16_avx2(dstvec, d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); } highbd_transpose16x16_avx2(dstvec + 16, d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); } highbd_transpose16x16_avx2(dstvec + 32, d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); } highbd_transpose16x16_avx2(dstvec + 48, d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); } } static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); if (bd < 12) { highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); } highbd_transpose(dstT, 64, dst, stride, 64, 64); } static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[32], d[32]; if (bd < 12) { highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); } for (int i = 0; i < 32; i += 8) { highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); } // store for (int j = 0; j < 32; j += 16) { for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), _mm256_castsi256_si128(d[(i + j)])); } for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + (i + 
j) * stride + 8), _mm256_castsi256_si128(d[(i + j) + 8])); } for (int i = 8; i < 16; i++) { _mm256_storeu_si256( (__m256i *)(dst + (i + j) * stride), _mm256_inserti128_si256( d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); } } } static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[32], d[16]; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy); } for (int i = 0; i < 32; i += 16) { highbd_transpose16x16_avx2((dstvec + i), d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); } } } static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { uint16_t dstT[64 * 32]; if (bd < 12) { highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); } highbd_transpose(dstT, 64, dst, stride, 32, 64); } static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); highbd_transpose(dstT, 32, dst, stride, 64, 32); return; } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); if (bd < 12) { highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); } highbd_transpose(dstT, 64, dst, stride, 16, 64); } static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy, int bd) { __m256i dstvec[64], d[16]; if (bd < 12) { highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy); } else { highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy); } for (int i = 0; i < 64; i += 16) { highbd_transpose16x16_avx2((dstvec + i), d); for (int j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd) { (void)above; (void)dx; assert(dx == 1); assert(dy > 0); if (bw == bh) { switch (bw) { case 4: highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, bd); break; case 8: highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, bd); break; case 16: highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, bd); break; case 32: highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, bd); break; case 64: highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, bd); break; } } else { if (bw < bh) { if (bw + bw == bh) { switch (bw) { case 4: highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy, bd); break; case 8: highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy, bd); break; case 16: 
highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy, bd); break; case 32: highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy, bd); break; } } else { switch (bw) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy, bd); break; case 8: highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy, bd); break; case 16: highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy, bd); break; #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } } else { if (bh + bh == bw) { switch (bh) { case 4: highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy, bd); break; case 8: highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy, bd); break; case 16: highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy, bd); break; case 32: highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy, bd); break; } } else { switch (bh) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy, bd); break; case 8: highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy, bd); break; case 16: highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy, bd); break; #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } } } return; } #endif // CONFIG_AV1_HIGHBITDEPTH // Low bit depth functions static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, }; /* clang-format on */ static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((W + H) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16; __m256i diff, c3f; __m128i a_mbase_x; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]); c3f = _mm256_set1_epi16(0x3f); int x = dx; for 
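/*
 * Scalar reference for the zone-1 kernel above (illustrative sketch,
 * upsample_above == 0, essentially the generic C fallback; H in the helper is
 * the block width, W its row count): row r samples the edge at the fractional
 * position (r + 1) * dx, and BaseMask[base_max_diff] is the vector form of
 * the per-pixel clamp to above[max_base_x].
 *
 *   const int max_base_x = bw + bh - 1;
 *   for (int r = 0; r < bh; ++r) {
 *     const int x = (r + 1) * dx;
 *     const int base = x >> 6;
 *     const int shift = (x & 0x3f) >> 1;
 *     for (int c = 0; c < bw; ++c) {
 *       dst[r * stride + c] =
 *           (base + c < max_base_x)
 *               ? (uint8_t)((above[base + c] * (32 - shift) +
 *                            above[base + c + 1] * shift + 16) >> 5)
 *               : above[max_base_x];
 *     }
 *   }
 */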
(int r = 0; r < W; r++) { __m256i b, res, shift; __m128i res1, a0_128, a1_128; int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { for (int i = r; i < W; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } if (base_max_diff > H) base_max_diff = H; a0_128 = _mm_loadu_si128((__m128i *)(above + base)); a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); if (upsample_above) { a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); a1_128 = _mm_srli_si128(a0_128, 8); shift = _mm256_srli_epi16( _mm256_and_si256( _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), 1); } else { shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); } a0 = _mm256_cvtepu8_epi16(a0_128); a1 = _mm256_cvtepu8_epi16(a1_128); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); res = _mm256_packus_epi16( res, _mm256_castsi128_si256( _mm256_extracti128_si256(res, 1))); // goto 8 bit res1 = _mm256_castsi256_si128(res); // 16 8bit values dst[r] = _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); x += dx; } } static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[16]; dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); } } static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[32]; dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); } } static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[64]; dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); } } static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16; __m256i a_mbase_x, diff, c3f; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m256i b, res, res16[2]; __m128i a0_128, a1_128; int base = x >> frac_bits; int base_max_diff = (max_base_x - base); if (base_max_diff <= 0) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 32 values } return; } if (base_max_diff > 32) base_max_diff = 32; __m256i shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); for (int j = 0, jj = 0; j < 32; j += 16, jj++) { int mdiff = base_max_diff - j; if (mdiff <= 0) { res16[jj] = a_mbase_x; } else { a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); a1_128 = 
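/*
 * Upsampling note for the HxW zone-1 helper above (scalar sketch of the
 * equivalent computation, with x = (r + 1) * dx as in the sketch before it):
 * when upsample_above == 1 the filtered edge has been doubled in length, the
 * integer position is taken with frac_bits = 6 - upsample_above = 5, and
 * successive output columns advance by two samples in the upsampled edge:
 *
 *   base  = x >> (6 - upsample_above);
 *   shift = ((x << upsample_above) & 0x3f) >> 1;
 *   dst[c] = (uint8_t)((above[base + 2 * c] * (32 - shift) +
 *                       above[base + 2 * c + 1] * shift + 16) >> 5);
 *
 * The EvenOddMaskx shuffle is the vector form of that even/odd split: after
 * the shuffle a0 holds the even offsets and a1 the odd ones.
 */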
_mm_loadu_si128((__m128i *)(above + base + j + 1)); a0 = _mm256_cvtepu8_epi16(a0_128); a1 = _mm256_cvtepu8_epi16(a1_128); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); res16[jj] = _mm256_packus_epi16( res, _mm256_castsi128_si256( _mm256_extracti128_si256(res, 1))); // 16 8bit values } } res16[1] = _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), 1); // 32 8bit values dstvec[r] = _mm256_blendv_epi8( a_mbase_x, res16[1], *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values x += dx; } } static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m256i dstvec[64]; dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); } } static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i a0, a1, a32, a16; __m256i a_mbase_x, diff, c3f; __m128i max_base_x128, base_inc128, mask128; a16 = _mm256_set1_epi16(16); a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); max_base_x128 = _mm_set1_epi8(max_base_x); c3f = _mm256_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++, dst += stride) { __m256i b, res; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); dst += stride; } return; } __m256i shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); __m128i a0_128, a1_128, res128; for (int j = 0; j < 64; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { _mm_storeu_si128((__m128i *)(dst + j), _mm256_castsi256_si128(a_mbase_x)); } else { a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); a0 = _mm256_cvtepu8_epi16(a0_128); a1 = _mm256_cvtepu8_epi16(a1_128); diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); res = _mm256_packus_epi16( res, _mm256_castsi128_si256( _mm256_extracti128_si256(res, 1))); // 16 8bit values base_inc128 = _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1), (int8_t)(base + j + 2), (int8_t)(base + j + 3), (int8_t)(base + j + 4), (int8_t)(base + j + 5), (int8_t)(base + j + 6), (int8_t)(base + j + 7), (int8_t)(base + j + 8), (int8_t)(base + j + 9), (int8_t)(base + j + 10), (int8_t)(base + j + 11), (int8_t)(base + j + 12), (int8_t)(base + j + 13), (int8_t)(base + j + 14), (int8_t)(base + j + 15)); mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), _mm_setzero_si128()); res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), _mm256_castsi256_si128(res), mask128); _mm_storeu_si128((__m128i 
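/*
 * The 64-wide zone-1 path above builds its clamp mask per lane instead of
 * using the BaseMask[] table (the valid-pixel count for a 64-wide row can
 * exceed the table's 32-byte, 33-entry masks). Scalar sketch of the mask
 * construction, where pred_lane[] stands for the 16 interpolated bytes
 * computed just above:
 *
 *   // lane i of the 16-byte group that starts at output column j
 *   const int idx = base + j + i;
 *   const int use_pred = (max_base_x > idx);  // cmpgt(subs_epu8(max, idx), 0)
 *   dst[j + i] = use_pred ? pred_lane[i] : above[max_base_x];
 *
 * The unsigned saturating subtract keeps the byte compare valid even when idx
 * exceeds 127 and would wrap as a signed 8-bit value.
 */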
*)(dst + j), res128); } } x += dx; } } // Directional prediction, zone 1: 0 < angle < 90 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy) { (void)left; (void)dy; switch (bw) { case 4: dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); break; case 8: dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); break; case 16: dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); break; case 32: dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); break; case 64: dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); break; default: break; } return; } static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i a0_x, a1_x, a32, a16, diff; __m128i c3f, min_base_y128, c1234, dy128; a16 = _mm_set1_epi16(16); c3f = _mm_set1_epi16(0x3f); min_base_y128 = _mm_set1_epi16(min_base_y); c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); dy128 = _mm_set1_epi16(dy); for (int r = 0; r < N; r++) { __m128i b, res, shift, r6, ydx; __m128i resx, resy, resxy; __m128i a0_x128, a1_x128; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 4) { base_min_diff = 4; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 3) { a0_x = _mm_setzero_si128(); a1_x = _mm_setzero_si128(); shift = _mm_setzero_si128(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); ydx = _mm_set1_epi16(y * dx); r6 = _mm_slli_epi16(c1234, 6); if (upsample_above) { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 8); shift = _mm_srli_epi16( _mm_and_si128( _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), 1); } else { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 1); shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); } a0_x = _mm_cvtepu8_epi16(a0_x128); a1_x = _mm_cvtepu8_epi16(a1_x128); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { DECLARE_ALIGNED(32, int16_t, base_y_c[8]); __m128i y_c128, base_y_c128, mask128, c1234_; c1234_ = _mm_srli_si128(c1234, 2); r6 = _mm_set1_epi16(r << 6); y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a1_y = _mm_setr_epi16(left[base_y_c[0]], 
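/*
 * Scalar reference for the zone-2 kernels (illustrative sketch for
 * upsample_above == upsample_left == 0, essentially the generic C fallback):
 * each output pixel comes from `above` while its projected position is still
 * at or right of the top-left corner, and from `left` otherwise; above[-1]
 * addresses the top-left corner sample.
 *
 *   for (int r = 0; r < bh; ++r) {
 *     for (int c = 0; c < bw; ++c) {
 *       int v;
 *       const int x = (c << 6) - (r + 1) * dx;   // position along `above`
 *       const int base_x = x >> 6;
 *       if (base_x >= -1) {                      // pixel comes from `above`
 *         const int s = (x & 0x3f) >> 1;
 *         v = (above[base_x] * (32 - s) + above[base_x + 1] * s + 16) >> 5;
 *       } else {                                 // pixel comes from `left`
 *         const int y = (r << 6) - (c + 1) * dy;
 *         const int base_y = y >> 6;
 *         const int s = (y & 0x3f) >> 1;
 *         v = (left[base_y] * (32 - s) + left[base_y + 1] * s + 16) >> 5;
 *       }
 *       dst[r * stride + c] = (uint8_t)v;
 *     }
 *   }
 *
 * The SIMD kernels compute both candidates for a whole group of lanes and
 * blend them with BaseMask[base_min_diff].
 */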
left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); if (upsample_left) { shifty = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); } else { shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); } a0_x = _mm_unpacklo_epi64(a0_x, a0_y); a1_x = _mm_unpacklo_epi64(a1_x, a1_y); shift = _mm_unpacklo_epi64(shift, shifty); } diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); resx = _mm_packus_epi16(res, res); resy = _mm_srli_si128(resx, 4); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); *(int *)(dst) = _mm_cvtsi128_si32(resxy); dst += stride; } } static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m256i diff, a32, a16; __m256i a0_x, a1_x; __m128i a0_x128, a1_x128, min_base_y128, c3f; __m128i c1234, dy128; a16 = _mm256_set1_epi16(16); c3f = _mm_set1_epi16(0x3f); min_base_y128 = _mm_set1_epi16(min_base_y); dy128 = _mm_set1_epi16(dy); c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); for (int r = 0; r < N; r++) { __m256i b, res, shift; __m128i resx, resy, resxy, r6, ydx; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 8) { base_min_diff = 8; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 7) { a0_x = _mm256_setzero_si256(); a1_x = _mm256_setzero_si256(); shift = _mm256_setzero_si256(); } else { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); ydx = _mm_set1_epi16(y * dx); r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); if (upsample_above) { a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); a1_x128 = _mm_srli_si128(a0_x128, 8); shift = _mm256_castsi128_si256(_mm_srli_epi16( _mm_and_si128( _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), 1)); } else { a1_x128 = _mm_srli_si128(a0_x128, 1); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); shift = _mm256_castsi128_si256( _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); } a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { DECLARE_ALIGNED(32, int16_t, base_y_c[16]); __m128i y_c128, base_y_c128, mask128; r6 = _mm_set1_epi16(r << 6); y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], 
left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); base_y_c128 = _mm_add_epi16( base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); _mm_store_si128((__m128i *)base_y_c, base_y_c128); a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); if (upsample_left) { shifty = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); } else { shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); } a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); shift = _mm256_inserti128_si256(shift, shifty, 1); } diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); resx = _mm_packus_epi16(_mm256_castsi256_si128(res), _mm256_castsi256_si128(res)); resy = _mm256_extracti128_si256(res, 1); resy = _mm_packus_epi16(resy, resy); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); _mm_storel_epi64((__m128i *)(dst), resxy); dst += stride; } } static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; const int min_base_y = -1; (void)upsample_above; (void)upsample_left; const int frac_bits_x = 6; const int frac_bits_y = 6; __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; __m256i diff, min_base_y256, c3f, shifty, dy256, c1; __m128i a0_x128, a1_x128; DECLARE_ALIGNED(32, int16_t, base_y_c[16]); a16 = _mm256_set1_epi16(16); c1 = _mm256_srli_epi16(a16, 4); min_base_y256 = _mm256_set1_epi16(min_base_y); c3f = _mm256_set1_epi16(0x3f); dy256 = _mm256_set1_epi16(dy); c0123 = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); c1234 = _mm256_add_epi16(c0123, c1); for (int r = 0; r < H; r++) { __m256i b, res, shift, j256, r6, ydx; __m128i resx, resy; __m128i resxy; int y = r + 1; ydx = _mm256_set1_epi16((int16_t)(y * dx)); int base_x = (-y * dx) >> frac_bits_x; for (int j = 0; j < W; j += 16) { j256 = _mm256_set1_epi16(j); int base_shift = 0; if ((base_x + j) < (min_base_x - 1)) { base_shift = (min_base_x - (base_x + j) - 1); } int base_min_diff = (min_base_x - base_x - j); if (base_min_diff > 16) { base_min_diff = 16; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift < 16) { a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); a0_x = _mm256_cvtepu8_epi16(a0_x128); a1_x = _mm256_cvtepu8_epi16(a1_x128); r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); shift = _mm256_srli_epi16( _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shift); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); // 16 16-bit values resx = 
_mm256_castsi256_si128(_mm256_packus_epi16( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); } else { resx = _mm_setzero_si128(); } // y calc if (base_x < min_base_x) { __m256i c256, y_c256, base_y_c256, mask256, mul16; r6 = _mm256_set1_epi16(r << 6); c256 = _mm256_add_epi16(j256, c1234); mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), _mm256_srli_epi16(min_base_y256, 1)); y_c256 = _mm256_sub_epi16(r6, mul16); base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); int16_t min_y = (int16_t)_mm_extract_epi16( _mm256_extracti128_si256(base_y_c256, 1), 7); int16_t max_y = (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); int16_t offset_diff = max_y - min_y; if (offset_diff < 16) { __m256i min_y256 = _mm256_set1_epi16(min_y); __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); __m128i base_y_offset128 = _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), _mm256_extracti128_si256(base_y_offset, 1)); __m128i a0_y128 = _mm_maskload_epi32( (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); __m128i a1_y128 = _mm_maskload_epi32((int *)(left + min_y + 1), *(__m128i *)LoadMaskz2[offset_diff / 4]); a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); a0_y = _mm256_cvtepu8_epi16(a0_y128); a1_y = _mm256_cvtepu8_epi16(a1_y128); } else { base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); a0_y = _mm256_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); base_y_c256 = _mm256_add_epi16(base_y_c256, c1); _mm256_store_si256((__m256i *)base_y_c, base_y_c256); a1_y = _mm256_setr_epi16( left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); } shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm256_mullo_epi16(diff, shifty); res = _mm256_add_epi16(a32, b); res = _mm256_srli_epi16(res, 5); // 16 16-bit values resy = _mm256_castsi256_si128(_mm256_packus_epi16( res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); } else { resy = _mm_setzero_si128(); } resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); _mm_storeu_si128((__m128i *)(dst + j), resxy); } // for j dst += stride; } } // Directional prediction, zone 2: 90 < angle < 180 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { assert(dx > 0); assert(dy > 0); switch (bw) { case 4: dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; case 8: dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; default: 
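/*
 * Gather note for the zone-2 HxW kernel above: the 16 left-edge indices
 * needed by one group of lanes (base_y_c[] in the code, idx[] below) decrease
 * monotonically, so when their spread max_y - min_y is below 16 the kernel
 * fetches one contiguous window with a masked load and permutes it with
 * pshufb instead of doing 16 scalar lookups. Scalar sketch of the idea
 * (illustrative only; the real kernel uses _mm_maskload_epi32 so it never
 * reads past the needed range):
 *
 *   uint8_t window[16];
 *   memcpy(window, left + min_y, 16);          // one 16-byte load
 *   for (int lane = 0; lane < 16; ++lane)
 *     a0[lane] = window[idx[lane] - min_y];    // one _mm_shuffle_epi8
 *
 * A second window starting at min_y + 1, permuted with the same offsets,
 * provides a1.
 */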
dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; } return; } // z3 functions static inline void transpose16x32_avx2(__m256i *x, __m256i *d) { __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m256i w10, w11, w12, w13, w14, w15; w0 = _mm256_unpacklo_epi8(x[0], x[1]); w1 = _mm256_unpacklo_epi8(x[2], x[3]); w2 = _mm256_unpacklo_epi8(x[4], x[5]); w3 = _mm256_unpacklo_epi8(x[6], x[7]); w8 = _mm256_unpacklo_epi8(x[8], x[9]); w9 = _mm256_unpacklo_epi8(x[10], x[11]); w10 = _mm256_unpacklo_epi8(x[12], x[13]); w11 = _mm256_unpacklo_epi8(x[14], x[15]); w4 = _mm256_unpacklo_epi16(w0, w1); w5 = _mm256_unpacklo_epi16(w2, w3); w12 = _mm256_unpacklo_epi16(w8, w9); w13 = _mm256_unpacklo_epi16(w10, w11); w6 = _mm256_unpacklo_epi32(w4, w5); w7 = _mm256_unpackhi_epi32(w4, w5); w14 = _mm256_unpacklo_epi32(w12, w13); w15 = _mm256_unpackhi_epi32(w12, w13); // Store first 4-line result d[0] = _mm256_unpacklo_epi64(w6, w14); d[1] = _mm256_unpackhi_epi64(w6, w14); d[2] = _mm256_unpacklo_epi64(w7, w15); d[3] = _mm256_unpackhi_epi64(w7, w15); w4 = _mm256_unpackhi_epi16(w0, w1); w5 = _mm256_unpackhi_epi16(w2, w3); w12 = _mm256_unpackhi_epi16(w8, w9); w13 = _mm256_unpackhi_epi16(w10, w11); w6 = _mm256_unpacklo_epi32(w4, w5); w7 = _mm256_unpackhi_epi32(w4, w5); w14 = _mm256_unpacklo_epi32(w12, w13); w15 = _mm256_unpackhi_epi32(w12, w13); // Store second 4-line result d[4] = _mm256_unpacklo_epi64(w6, w14); d[5] = _mm256_unpackhi_epi64(w6, w14); d[6] = _mm256_unpacklo_epi64(w7, w15); d[7] = _mm256_unpackhi_epi64(w7, w15); // upper half w0 = _mm256_unpackhi_epi8(x[0], x[1]); w1 = _mm256_unpackhi_epi8(x[2], x[3]); w2 = _mm256_unpackhi_epi8(x[4], x[5]); w3 = _mm256_unpackhi_epi8(x[6], x[7]); w8 = _mm256_unpackhi_epi8(x[8], x[9]); w9 = _mm256_unpackhi_epi8(x[10], x[11]); w10 = _mm256_unpackhi_epi8(x[12], x[13]); w11 = _mm256_unpackhi_epi8(x[14], x[15]); w4 = _mm256_unpacklo_epi16(w0, w1); w5 = _mm256_unpacklo_epi16(w2, w3); w12 = _mm256_unpacklo_epi16(w8, w9); w13 = _mm256_unpacklo_epi16(w10, w11); w6 = _mm256_unpacklo_epi32(w4, w5); w7 = _mm256_unpackhi_epi32(w4, w5); w14 = _mm256_unpacklo_epi32(w12, w13); w15 = _mm256_unpackhi_epi32(w12, w13); // Store first 4-line result d[8] = _mm256_unpacklo_epi64(w6, w14); d[9] = _mm256_unpackhi_epi64(w6, w14); d[10] = _mm256_unpacklo_epi64(w7, w15); d[11] = _mm256_unpackhi_epi64(w7, w15); w4 = _mm256_unpackhi_epi16(w0, w1); w5 = _mm256_unpackhi_epi16(w2, w3); w12 = _mm256_unpackhi_epi16(w8, w9); w13 = _mm256_unpackhi_epi16(w10, w11); w6 = _mm256_unpacklo_epi32(w4, w5); w7 = _mm256_unpackhi_epi32(w4, w5); w14 = _mm256_unpacklo_epi32(w12, w13); w15 = _mm256_unpackhi_epi32(w12, w13); // Store second 4-line result d[12] = _mm256_unpacklo_epi64(w6, w14); d[13] = _mm256_unpackhi_epi64(w6, w14); d[14] = _mm256_unpacklo_epi64(w7, w15); d[15] = _mm256_unpackhi_epi64(w7, w15); } static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[4]; dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3]); *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); return; } static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { 
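/*
 * The z3 stores rely on unpack-based transposes (transpose16x32_avx2 above,
 * the 8x8/16x16 SSE2 helpers used by the kernels below). The idea is the
 * textbook merge transpose: each unpack stage doubles the run of elements
 * that belong to the same output row. A minimal 4x4 example with 32-bit lanes
 * (illustrative, not code from this file), given rows r0..r3:
 *
 *   __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // r00 r10 r01 r11
 *   __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // r20 r30 r21 r31
 *   __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // r02 r12 r03 r13
 *   __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // r22 r32 r23 r33
 *   __m128i c0 = _mm_unpacklo_epi64(t0, t1);  // r00 r10 r20 r30  (column 0)
 *   __m128i c1 = _mm_unpackhi_epi64(t0, t1);  // r01 r11 r21 r31
 *   __m128i c2 = _mm_unpacklo_epi64(t2, t3);  // r02 r12 r22 r32
 *   __m128i c3 = _mm_unpackhi_epi64(t2, t3);  // r03 r13 r23 r33
 *
 * The byte-wise versions used here need more stages (8-, 16-, 32- and 64-bit
 * interleaves) but follow the same pattern.
 */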
__m128i dstvec[8], d[8]; dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3]); _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); } static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[8]; dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); } } static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[8], d[4]; dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3]); _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); } static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[8], d[8]; dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); for (int i = 0; i < 8; i++) { _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), _mm_srli_si128(d[i], 8)); } } static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16]; dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[16]; dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); transpose4x16_sse2(dstvec, d); for (int i = 0; i < 16; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); } } static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[8]; dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy); 
for (int i = 4; i < 8; i++) { d[i] = _mm_setzero_si128(); } transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 4; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m256i dstvec[16], d[16]; dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); for (int i = 8; i < 16; i++) { dstvec[i] = _mm256_setzero_si256(); } transpose16x32_avx2(dstvec, d); for (int i = 0; i < 16; i++) { _mm_storel_epi64((__m128i *)(dst + i * stride), _mm256_castsi256_si128(d[i])); } for (int i = 0; i < 16; i++) { _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), _mm256_extracti128_si256(d[i], 1)); } } static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[32], d[16]; dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); transpose16x8_8x16_sse2( &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], &d[6 + 8], &d[7 + 8]); for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16]; dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); transpose16x16_sse2(dstvec, d); for (int i = 0; i < 16; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m256i dstvec[32], d[32]; dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); transpose16x32_avx2(dstvec, d); transpose16x32_avx2(dstvec + 16, d + 16); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride), _mm256_castsi256_si128(d[j])); _mm_storeu_si128((__m128i *)(dst + j * stride + 16), _mm256_castsi256_si128(d[j + 16])); } for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), _mm256_extracti128_si256(d[j], 1)); _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), _mm256_extracti128_si256(d[j + 16], 1)); } } static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 64, 64); } static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, 
int dy) { __m256i dstvec[16], d[16]; dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); transpose16x32_avx2(dstvec, d); // store for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride), _mm256_castsi256_si128(d[j])); _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), _mm256_extracti128_si256(d[j], 1)); } } static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[32], d[16]; dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); for (int i = 0; i < 32; i += 16) { transpose16x16_sse2((dstvec + i), d); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); } } } static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[64 * 32]; dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 32, 64); } static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[32 * 64]; dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); transpose(dstT, 32, dst, stride, 64, 32); return; } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[64 * 16]; dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 16, 64); } static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[64], d[16]; dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); for (int i = 0; i < 64; i += 16) { transpose16x16_sse2((dstvec + i), d); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { (void)above; (void)dx; assert(dx == 1); assert(dy > 0); if (bw == bh) { switch (bw) { case 4: dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); break; case 64: dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); break; } } else { if (bw < bh) { if (bw + bw == bh) { switch (bw) { case 4: dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); break; } } else { switch (bw) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); break; #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } } else { if (bh + bh == bw) { switch (bh) { case 4: dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, 
dy); break; case 8: dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); break; } } else { switch (bh) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); break; #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } } } } aom-3.12.1/aom_dsp/x86/intrapred_sse2.c000066400000000000000000001553131477627663500175500ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/x86/intrapred_x86.h" #include "config/aom_dsp_rtcd.h" static inline void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; i += 2) { *(uint32_t *)dst = dc; dst += stride; *(uint32_t *)dst = dc; dst += stride; } } static inline void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { _mm_storel_epi64((__m128i *)dst, *row); dst += stride; } } static inline void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, *row); dst += stride; } } static inline void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, *row); _mm_store_si128((__m128i *)(dst + 16), *row); dst += stride; } } static inline void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, ptrdiff_t stride) { for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, *row); _mm_store_si128((__m128i *)(dst + 16), *row); _mm_store_si128((__m128i *)(dst + 32), *row); _mm_store_si128((__m128i *)(dst + 48), *row); dst += stride; } } static inline __m128i dc_sum_4(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); x = _mm_unpacklo_epi8(x, zero); return _mm_sad_epu8(x, zero); } static inline __m128i dc_sum_8(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); return _mm_sad_epu8(x, zero); } static inline __m128i dc_sum_64(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); const __m128i zero = _mm_setzero_si128(); x0 = _mm_sad_epu8(x0, zero); x1 = _mm_sad_epu8(x1, zero); x2 = _mm_sad_epu8(x2, zero); x3 = _mm_sad_epu8(x3, zero); x0 = _mm_add_epi16(x0, x1); x2 = _mm_add_epi16(x2, x3); x0 = _mm_add_epi16(x0, x2); const __m128i high = _mm_unpackhi_epi64(x0, x0); return _mm_add_epi16(x0, high); } #define DC_MULTIPLIER_1X2 
0x5556 #define DC_MULTIPLIER_1X4 0x3334 #define DC_SHIFT2 16 static inline int divide_using_multiply_shift(int num, int shift1, int multiplier) { const int interm = num >> shift1; return interm * multiplier >> DC_SHIFT2; } // ----------------------------------------------------------------------------- // DC_PRED void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); __m128i sum_above = dc_sum_4(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 6; sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); dc_store_4xh(pred, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_4(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 10; sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); dc_store_4xh(pred, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_4(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 6; sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_8xh(&row, 4, dst, stride); } void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 12; sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_8xh(&row, 16, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_32_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 20; sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_8xh(&row, 32, dst, stride); } void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_4(left); __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 10; sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_16xh(&row, 4, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); __m128i sum_above = dc_sum_16_sse2(above); sum_above = 
_mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 12; sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_16xh(&row, 8, dst, stride); } void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_32_sse2(left); __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 24; sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_16xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_64(left); __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 40; sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_16xh(&row, 64, dst, stride); } void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_8(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 20; sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_32xh(&row, 8, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 24; sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_32xh(&row, 16, dst, stride); } void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_64(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 48; sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_32xh(&row, 64, dst, stride); } void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); const __m128i sum_left = dc_sum_64(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 64; sum /= 128; const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_64xh(&row, 64, dst, stride); } void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); const __m128i sum_left = dc_sum_32_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 48; sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || 
CONFIG_AV1_DECODER void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); sum += 40; sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((int8_t)sum); dc_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // DC_TOP void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_4(above); const __m128i two = _mm_set1_epi16(2); sum_above = _mm_add_epi16(sum_above, two); sum_above = _mm_srai_epi16(sum_above, 2); sum_above = _mm_shufflelo_epi16(sum_above, 0); sum_above = _mm_packus_epi16(sum_above, sum_above); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); dc_store_4xh(pred, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_4(above); const __m128i two = _mm_set1_epi16(2); sum_above = _mm_add_epi16(sum_above, two); sum_above = _mm_srai_epi16(sum_above, 2); sum_above = _mm_shufflelo_epi16(sum_above, 0); sum_above = _mm_packus_epi16(sum_above, sum_above); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); dc_store_4xh(pred, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_8(above); const __m128i four = _mm_set1_epi16(4); sum_above = _mm_add_epi16(sum_above, four); sum_above = _mm_srai_epi16(sum_above, 3); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); const __m128i row = _mm_shufflelo_epi16(sum_above, 0); dc_store_8xh(&row, 4, dst, stride); } void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_8(above); const __m128i four = _mm_set1_epi16(4); sum_above = _mm_add_epi16(sum_above, four); sum_above = _mm_srai_epi16(sum_above, 3); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); const __m128i row = _mm_shufflelo_epi16(sum_above, 0); dc_store_8xh(&row, 16, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_8(above); const __m128i four = _mm_set1_epi16(4); sum_above = _mm_add_epi16(sum_above, four); sum_above = _mm_srai_epi16(sum_above, 3); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); const __m128i row = _mm_shufflelo_epi16(sum_above, 0); dc_store_8xh(&row, 32, dst, stride); } void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16(8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_16xh(&row, 4, dst, stride); } #endif // 
!CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16(8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_16xh(&row, 8, dst, stride); } void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16(8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_16xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16(8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_16xh(&row, 64, dst, stride); } void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16(16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_32xh(&row, 8, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16(16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_32xh(&row, 16, dst, stride); } void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16(16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_32xh(&row, 64, dst, stride); } void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_64(above); const __m128i thirtytwo = _mm_set1_epi16(32); sum_above = _mm_add_epi16(sum_above, thirtytwo); sum_above = _mm_srai_epi16(sum_above, 6); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); 
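// The sequence above computes the rounded DC value and broadcasts it: (sum of the 64 above
// pixels + 32) >> 6, then the unpacklo_epi8/shufflelo_epi16 pair spreads that byte across the
// low 64 bits before the unpacklo_epi64 below fills the whole register.
// Rough scalar sketch of the same computation (illustrative only, not part of the original file):
//   int sum = 0;
//   for (int i = 0; i < 64; ++i) sum += above[i];
//   const uint8_t dc = (uint8_t)((sum + 32) >> 6);  // every output row is then filled with dc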
const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_64xh(&row, 64, dst, stride); } void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_64(above); const __m128i thirtytwo = _mm_set1_epi16(32); sum_above = _mm_add_epi16(sum_above, thirtytwo); sum_above = _mm_srai_epi16(sum_above, 6); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; __m128i sum_above = dc_sum_64(above); const __m128i thirtytwo = _mm_set1_epi16(32); sum_above = _mm_add_epi16(sum_above, thirtytwo); sum_above = _mm_srai_epi16(sum_above, 6); sum_above = _mm_unpacklo_epi8(sum_above, sum_above); sum_above = _mm_shufflelo_epi16(sum_above, 0); const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); dc_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // DC_LEFT void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_8(left); const __m128i four = _mm_set1_epi16(4); sum_left = _mm_add_epi16(sum_left, four); sum_left = _mm_srai_epi16(sum_left, 3); sum_left = _mm_shufflelo_epi16(sum_left, 0); sum_left = _mm_packus_epi16(sum_left, sum_left); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); dc_store_4xh(pred, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16(8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); sum_left = _mm_shufflelo_epi16(sum_left, 0); sum_left = _mm_packus_epi16(sum_left, sum_left); const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); dc_store_4xh(pred, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_4(left); const __m128i two = _mm_set1_epi16(2); sum_left = _mm_add_epi16(sum_left, two); sum_left = _mm_srai_epi16(sum_left, 2); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); const __m128i row = _mm_shufflelo_epi16(sum_left, 0); dc_store_8xh(&row, 4, dst, stride); } void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16(8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); const __m128i row = _mm_shufflelo_epi16(sum_left, 0); dc_store_8xh(&row, 16, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16(16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = 
_mm_srai_epi16(sum_left, 5); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); const __m128i row = _mm_shufflelo_epi16(sum_left, 0); dc_store_8xh(&row, 32, dst, stride); } void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_4(left); const __m128i two = _mm_set1_epi16(2); sum_left = _mm_add_epi16(sum_left, two); sum_left = _mm_srai_epi16(sum_left, 2); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_16xh(&row, 4, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_8(left); const __m128i four = _mm_set1_epi16(4); sum_left = _mm_add_epi16(sum_left, four); sum_left = _mm_srai_epi16(sum_left, 3); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_16xh(&row, 8, dst, stride); } void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16(16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_16xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_64(left); const __m128i thirtytwo = _mm_set1_epi16(32); sum_left = _mm_add_epi16(sum_left, thirtytwo); sum_left = _mm_srai_epi16(sum_left, 6); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_16xh(&row, 64, dst, stride); } void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_8(left); const __m128i four = _mm_set1_epi16(4); sum_left = _mm_add_epi16(sum_left, four); sum_left = _mm_srai_epi16(sum_left, 3); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_32xh(&row, 8, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16(8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_32xh(&row, 16, dst, stride); } void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_64(left); const __m128i thirtytwo = _mm_set1_epi16(32); sum_left = _mm_add_epi16(sum_left, thirtytwo); sum_left = _mm_srai_epi16(sum_left, 6); sum_left = 
_mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_32xh(&row, 64, dst, stride); } void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_64(left); const __m128i thirtytwo = _mm_set1_epi16(32); sum_left = _mm_add_epi16(sum_left, thirtytwo); sum_left = _mm_srai_epi16(sum_left, 6); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_64xh(&row, 64, dst, stride); } void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16(16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16(8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); sum_left = _mm_unpacklo_epi8(sum_left, sum_left); sum_left = _mm_shufflelo_epi16(sum_left, 0); const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); dc_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // DC_128 void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const uint32_t pred = 0x80808080; dc_store_4xh(pred, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const uint32_t pred = 0x80808080; dc_store_4xh(pred, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_8xh(&row, 4, dst, stride); } void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_8xh(&row, 16, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_8xh(&row, 32, dst, stride); } void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_16xh(&row, 4, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = 
_mm_set1_epi8((int8_t)128); dc_store_16xh(&row, 8, dst, stride); } void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_16xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_16xh(&row, 64, dst, stride); } void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_32xh(&row, 8, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_32xh(&row, 16, dst, stride); } void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_32xh(&row, 64, dst, stride); } void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_64xh(&row, 64, dst, stride); } void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_64xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; (void)left; const __m128i row = _mm_set1_epi8((int8_t)128); dc_store_64xh(&row, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // V_PRED void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint32_t pred = *(uint32_t *)above; (void)left; dc_store_4xh(pred, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint32_t pred = *(uint32_t *)above; (void)left; dc_store_4xh(pred, 16, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_loadl_epi64((__m128i const *)above); (void)left; dc_store_8xh(&row, 4, dst, stride); } void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_loadl_epi64((__m128i const *)above); (void)left; dc_store_8xh(&row, 16, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_loadl_epi64((__m128i const *)above); (void)left; dc_store_8xh(&row, 32, dst, stride); } void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); 
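// V_PRED for 16-wide blocks: the 16 pixels immediately above the block are copied unchanged
// into every row, so the dc_store_16xh helper doubles as the row-replication loop.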
(void)left; dc_store_16xh(&row, 4, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); (void)left; dc_store_16xh(&row, 8, dst, stride); } void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); (void)left; dc_store_16xh(&row, 32, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); (void)left; dc_store_16xh(&row, 64, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, row0); _mm_store_si128((__m128i *)(dst + 16), row1); dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_32xh(dst, stride, above, 8); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_32xh(dst, stride, above, 16); } void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_32xh(dst, stride, above, 64); } static inline void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, row0); _mm_store_si128((__m128i *)(dst + 16), row1); _mm_store_si128((__m128i *)(dst + 32), row2); _mm_store_si128((__m128i *)(dst + 48), row3); dst += stride; } } void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_64xh(dst, stride, above, 64); } void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_64xh(dst, stride, above, 32); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; v_predictor_64xh(dst, stride, above, 16); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // H_PRED void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i left_col = _mm_loadl_epi64((__m128i const *)left); left_col = _mm_unpacklo_epi8(left_col, left_col); __m128i row0 = _mm_shufflelo_epi16(left_col, 0); __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col, 
0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col = _mm_unpackhi_epi64(left_col, left_col); row0 = _mm_shufflelo_epi16(left_col, 0); row1 = _mm_shufflelo_epi16(left_col, 0x55); row2 = _mm_shufflelo_epi16(left_col, 0xaa); row3 = _mm_shufflelo_epi16(left_col, 0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_load_si128((__m128i const *)left); __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); row0 = _mm_shufflelo_epi16(left_col_low, 0); row1 = _mm_shufflelo_epi16(left_col_low, 0x55); row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); row3 = _mm_shufflelo_epi16(left_col_low, 0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); dst += stride; row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); *(int *)dst = _mm_cvtsi128_si32(row0); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row1); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row2); dst += stride; *(int *)dst = _mm_cvtsi128_si32(row3); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; __m128i left_col = _mm_loadl_epi64((__m128i const *)left); left_col = _mm_unpacklo_epi8(left_col, left_col); __m128i row0 = _mm_shufflelo_epi16(left_col, 0); __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); } static inline void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, const 
uint8_t *above, const uint8_t *left, int count) { (void)above; for (int i = 0; i < count; ++i) { const __m128i left_col = _mm_load_si128((__m128i const *)left); __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); dst += stride; left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); row0 = _mm_shufflelo_epi16(left_col_low, 0); row1 = _mm_shufflelo_epi16(left_col_low, 0x55); row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); row3 = _mm_shufflelo_epi16(left_col_low, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); dst += stride; row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); dst += stride; left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); dst += stride; _mm_storel_epi64((__m128i *)dst, row2); dst += stride; _mm_storel_epi64((__m128i *)dst, row3); dst += stride; left += 16; } } void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { h_predictor_8x16xc(dst, stride, above, left, 1); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { h_predictor_8x16xc(dst, stride, above, left, 2); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < h; ++i) { _mm_store_si128((__m128i *)dst, row[i]); dst += stride; } } static inline void repeat_low_4pixels(const __m128i *x, __m128i *row) { const __m128i u0 = _mm_shufflelo_epi16(*x, 0); const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); row[0] = _mm_unpacklo_epi64(u0, u0); row[1] = _mm_unpacklo_epi64(u1, u1); row[2] = _mm_unpacklo_epi64(u2, u2); row[3] = _mm_unpacklo_epi64(u3, u3); } static inline void repeat_high_4pixels(const __m128i *x, __m128i *row) { const __m128i u0 = _mm_shufflehi_epi16(*x, 0); const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); row[0] = _mm_unpackhi_epi64(u0, u0); row[1] = _mm_unpackhi_epi64(u1, u1); row[2] = _mm_unpackhi_epi64(u2, u2); row[3] = _mm_unpackhi_epi64(u3, u3); 
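// At this point row[0]..row[3] hold pixels 4..7 of the byte-interleaved left column, each
// broadcast across all 16 lanes, mirroring repeat_low_4pixels which covers pixels 0..3.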
} // Process 16x8, first 4 rows // Use first 8 bytes of left register: xxxxxxxx33221100 static inline void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_low_4pixels(left, row); h_pred_store_16xh(row, 4, dst, stride); } // Process 16x8, second 4 rows // Use second 8 bytes of left register: 77665544xxxxxxxx static inline void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_high_4pixels(left, row); h_pred_store_16xh(row, 4, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); h_prediction_16x8_1(&left_col_8p, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); h_prediction_16x8_1(&left_col_8p, dst, stride); dst += stride << 2; h_prediction_16x8_2(&left_col_8p, dst, stride); } static inline void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int count) { int i = 0; do { const __m128i left_col = _mm_load_si128((const __m128i *)left); const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); h_prediction_16x8_1(&left_col_8p_lo, dst, stride); dst += stride << 2; h_prediction_16x8_2(&left_col_8p_lo, dst, stride); dst += stride << 2; const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); h_prediction_16x8_1(&left_col_8p_hi, dst, stride); dst += stride << 2; h_prediction_16x8_2(&left_col_8p_hi, dst, stride); dst += stride << 2; left += 16; i++; } while (i < count); } void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; h_predictor_16xh(dst, stride, left, 2); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; h_predictor_16xh(dst, stride, left, 4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; for (i = 0; i < h; ++i) { _mm_store_si128((__m128i *)dst, row[i]); _mm_store_si128((__m128i *)(dst + 16), row[i]); dst += stride; } } // Process 32x8, first 4 rows // Use first 8 bytes of left register: xxxxxxxx33221100 static inline void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_low_4pixels(left, row); h_pred_store_32xh(row, 4, dst, stride); } // Process 32x8, second 4 rows // Use second 8 bytes of left register: 77665544xxxxxxxx static inline void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, ptrdiff_t stride) { __m128i row[4]; repeat_high_4pixels(left, row); h_pred_store_32xh(row, 4, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i left_col, left_col_8p; (void)above; left_col = _mm_load_si128((const __m128i *)left); left_col_8p = _mm_unpacklo_epi8(left_col, left_col); h_prediction_32x8_1(&left_col_8p, dst, stride); dst += stride << 2; 
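// Rows 4..7 are produced by the second helper call below, which reuses the upper four pixels
// already held in left_col_8p.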
h_prediction_32x8_2(&left_col_8p, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i left_col, left_col_8p; (void)above; left_col = _mm_load_si128((const __m128i *)left); left_col_8p = _mm_unpacklo_epi8(left_col, left_col); h_prediction_32x8_1(&left_col_8p, dst, stride); dst += stride << 2; h_prediction_32x8_2(&left_col_8p, dst, stride); dst += stride << 2; left_col_8p = _mm_unpackhi_epi8(left_col, left_col); h_prediction_32x8_1(&left_col_8p, dst, stride); dst += stride << 2; h_prediction_32x8_2(&left_col_8p, dst, stride); } static inline void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int height) { int i = height >> 2; do { __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); left4 = _mm_unpacklo_epi8(left4, left4); left4 = _mm_unpacklo_epi8(left4, left4); const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r0); _mm_store_si128((__m128i *)(dst + stride), r1); _mm_store_si128((__m128i *)(dst + stride + 16), r1); const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); _mm_store_si128((__m128i *)(dst + stride * 2), r2); _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); _mm_store_si128((__m128i *)(dst + stride * 3), r3); _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); left += 4; dst += stride * 4; } while (--i); } void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; h_predictor_32xh(dst, stride, left, 64); } static inline void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int height) { int i = height >> 2; do { __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); left4 = _mm_unpacklo_epi8(left4, left4); left4 = _mm_unpacklo_epi8(left4, left4); const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r0); _mm_store_si128((__m128i *)(dst + 32), r0); _mm_store_si128((__m128i *)(dst + 48), r0); _mm_store_si128((__m128i *)(dst + stride), r1); _mm_store_si128((__m128i *)(dst + stride + 16), r1); _mm_store_si128((__m128i *)(dst + stride + 32), r1); _mm_store_si128((__m128i *)(dst + stride + 48), r1); const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); _mm_store_si128((__m128i *)(dst + stride * 2), r2); _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); _mm_store_si128((__m128i *)(dst + stride * 3), r3); _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); left += 4; dst += stride * 4; } while (--i); } void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; h_predictor_64xh(dst, stride, left, 64); } void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; h_predictor_64xh(dst, stride, left, 32); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const 
uint8_t *left) { (void)above; h_predictor_64xh(dst, stride, left, 16); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER aom-3.12.1/aom_dsp/x86/intrapred_sse4.c000066400000000000000000001452721477627663500175550ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include /* SSE4.1 */ #include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h" // Low bit depth functions static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }, { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, }, }; /* clang-format on */ static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1( int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, int dx) { const int frac_bits = 6 - upsample_above; const int max_base_x = ((W + H) - 1) << upsample_above; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i a0, a1, a32, a16; __m128i diff, c3f; __m128i a_mbase_x; a16 = _mm_set1_epi16(16); a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); c3f = _mm_set1_epi16(0x3f); int x = dx; for (int r = 0; r < W; r++) { __m128i b, res, res1, shift; __m128i a0_above, a1_above; int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { for (int i = r; i < W; ++i) { dst[i] = a_mbase_x; // save 4 values } return; } if (base_max_diff > H) base_max_diff = H; a0_above = _mm_loadu_si128((__m128i *)(above + 
base)); a1_above = _mm_loadu_si128((__m128i *)(above + base + 1)); if (upsample_above) { a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]); a1_above = _mm_srli_si128(a0_above, 8); shift = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f), 1); } else { shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); } // lower half a0 = _mm_cvtepu8_epi16(a0_above); a1 = _mm_cvtepu8_epi16(a1_above); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); // uppar half a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); res = _mm_packus_epi16(res, res1); dst[r] = _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]); x += dx; } } static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[16]; dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); } } static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[32]; dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); } } static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[64]; dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); } } static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1( int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i a0, a1, a32, a16; __m128i a_mbase_x, diff, c3f; a16 = _mm_set1_epi16(16); a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); c3f = _mm_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++) { __m128i b, res, res1, res16[2]; __m128i a0_above, a1_above; int base = x >> frac_bits; int base_max_diff = (max_base_x - base); if (base_max_diff <= 0) { for (int i = r; i < N; ++i) { dstvec[i] = a_mbase_x; // save 32 values dstvec_h[i] = a_mbase_x; } return; } if (base_max_diff > 32) base_max_diff = 32; __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); for (int j = 0, jj = 0; j < 32; j += 16, jj++) { int mdiff = base_max_diff - j; if (mdiff <= 0) { res16[jj] = a_mbase_x; } else { a0_above = _mm_loadu_si128((__m128i *)(above + base + j)); a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1)); // lower half a0 = _mm_cvtepu8_epi16(a0_above); a1 = 
_mm_cvtepu8_epi16(a1_above); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); // uppar half a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values } } dstvec[r] = _mm_blendv_epi8(a_mbase_x, res16[0], *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values dstvec_h[r] = _mm_blendv_epi8(a_mbase_x, res16[1], *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values x += dx; } } static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { __m128i dstvec[64], dstvec_h[64]; dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above, upsample_above, dx); for (int i = 0; i < N; i++) { _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]); } } static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, int upsample_above, int dx) { // here upsample_above is 0 by design of av1_use_intra_edge_upsample (void)upsample_above; const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i a0, a1, a32, a16; __m128i a_mbase_x, diff, c3f; __m128i max_base, base_inc, mask; a16 = _mm_set1_epi16(16); a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); max_base = _mm_set1_epi8(max_base_x); c3f = _mm_set1_epi16(0x3f); int x = dx; for (int r = 0; r < N; r++, dst += stride) { __m128i b, res, res1; int base = x >> frac_bits; if (base >= max_base_x) { for (int i = r; i < N; ++i) { _mm_storeu_si128((__m128i *)dst, a_mbase_x); // save 32 values _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x); _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x); _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x); dst += stride; } return; } __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 element __m128i a0_above, a1_above, res_val; for (int j = 0; j < 64; j += 16) { int mdif = max_base_x - (base + j); if (mdif <= 0) { _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x); } else { a0_above = _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 element a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); // lower half a0 = _mm_cvtepu8_epi16(a0_above); a1 = _mm_cvtepu8_epi16(a1_above); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); // uppar half a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); res = _mm_packus_epi16(res, 
res1); // 16 8bit values base_inc = _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1), (int8_t)(base + j + 2), (int8_t)(base + j + 3), (int8_t)(base + j + 4), (int8_t)(base + j + 5), (int8_t)(base + j + 6), (int8_t)(base + j + 7), (int8_t)(base + j + 8), (int8_t)(base + j + 9), (int8_t)(base + j + 10), (int8_t)(base + j + 11), (int8_t)(base + j + 12), (int8_t)(base + j + 13), (int8_t)(base + j + 14), (int8_t)(base + j + 15)); mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc), _mm_setzero_si128()); res_val = _mm_blendv_epi8(a_mbase_x, res, mask); _mm_storeu_si128((__m128i *)(dst + j), res_val); } } x += dx; } } // Directional prediction, zone 1: 0 < angle < 90 void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy) { (void)left; (void)dy; switch (bw) { case 4: dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx); break; case 8: dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx); break; case 16: dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx); break; case 32: dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx); break; case 64: dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx); break; default: assert(0 && "Invalid block size"); } return; } static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; assert(dx > 0); // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i a0_x, a1_x, a32, diff; const __m128i c3f = _mm_set1_epi16(0x3f); const __m128i min_y_base = _mm_set1_epi16(min_base_y); const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); const __m128i dy_reg = _mm_set1_epi16(dy); const __m128i a16 = _mm_set1_epi16(16); for (int r = 0; r < N; r++) { __m128i b, res, shift, r6, ydx; __m128i resx, resy, resxy; __m128i a0_above, a1_above; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 4) { base_min_diff = 4; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 3) { a0_x = _mm_setzero_si128(); a1_x = _mm_setzero_si128(); shift = _mm_setzero_si128(); } else { a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); ydx = _mm_set1_epi16(y * dx); r6 = _mm_slli_epi16(c1234, 6); if (upsample_above) { a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); a1_above = _mm_srli_si128(a0_above, 8); shift = _mm_srli_epi16( _mm_and_si128( _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), 1); } else { a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); a1_above = _mm_srli_si128(a0_above, 1); shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); } a0_x = _mm_cvtepu8_epi16(a0_above); a1_x = _mm_cvtepu8_epi16(a1_above); } // y calc __m128i a0_y, a1_y, shifty; if (base_x < min_base_x) { DECLARE_ALIGNED(32, int16_t, 
base_y_c[8]); __m128i y_c, base_y_c_reg, mask, c1234_; c1234_ = _mm_srli_si128(c1234, 2); r6 = _mm_set1_epi16(r << 6); y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg)); base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); if (upsample_left) { shifty = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); } else { shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); } a0_x = _mm_unpacklo_epi64(a0_x, a0_y); a1_x = _mm_unpacklo_epi64(a1_x, a1_y); shift = _mm_unpacklo_epi64(shift, shifty); } diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); resx = _mm_packus_epi16(res, res); resy = _mm_srli_si128(resx, 4); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); *(int *)(dst) = _mm_cvtsi128_si32(resxy); dst += stride; } } static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; // pre-filter above pixels // store in temp buffers: // above[x] * 32 + 16 // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 __m128i diff, a32; __m128i a0_x, a1_x, a0_y, a1_y; __m128i a0_above, a1_above; const __m128i a16 = _mm_set1_epi16(16); const __m128i c3f = _mm_set1_epi16(0x3f); const __m128i min_y_base = _mm_set1_epi16(min_base_y); const __m128i dy_reg = _mm_set1_epi16(dy); const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); for (int r = 0; r < N; r++) { __m128i b, res, res1, shift; __m128i resx, resy, resxy, r6, ydx; int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; int base_shift = 0; if (base_x < (min_base_x - 1)) { base_shift = (min_base_x - base_x - 1) >> upsample_above; } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; if (base_min_diff > 8) { base_min_diff = 8; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift > 7) { resx = _mm_setzero_si128(); } else { a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); ydx = _mm_set1_epi16(y * dx); r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); if (upsample_above) { a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); a1_above = _mm_srli_si128(a0_above, 8); shift = _mm_srli_epi16( _mm_and_si128( _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), 1); } else { a1_above = _mm_srli_si128(a0_above, 1); a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); a1_above = _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); } a0_x = _mm_cvtepu8_epi16(a0_above); a1_x = 
_mm_cvtepu8_epi16(a1_above); diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); resx = _mm_packus_epi16(res, res); } // y calc if (base_x < min_base_x) { DECLARE_ALIGNED(32, int16_t, base_y_c[16]); __m128i y_c, base_y_c_reg, mask; r6 = _mm_set1_epi16(r << 6); y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg)); base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); if (upsample_left) { shift = _mm_srli_epi16( _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); } else { shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); } diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); resy = _mm_packus_epi16(res1, res1); resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); _mm_storel_epi64((__m128i *)dst, resxy); } else { _mm_storel_epi64((__m128i *)dst, resx); } dst += stride; } } static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; const int min_base_y = -1; (void)upsample_above; (void)upsample_left; const int frac_bits_x = 6; const int frac_bits_y = 6; __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32; __m128i diff, shifty, shifty_h; __m128i a0_above, a1_above; DECLARE_ALIGNED(32, int16_t, base_y_c[16]); const __m128i a16 = _mm_set1_epi16(16); const __m128i c1 = _mm_srli_epi16(a16, 4); const __m128i min_y_base = _mm_set1_epi16(min_base_y); const __m128i c3f = _mm_set1_epi16(0x3f); const __m128i dy256 = _mm_set1_epi16(dy); const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); const __m128i c1234 = _mm_add_epi16(c0123, c1); const __m128i c1234_h = _mm_add_epi16(c0123_h, c1); for (int r = 0; r < H; r++) { __m128i b, res, res1, shift, reg_j, r6, ydx; __m128i resx, resy; __m128i resxy; int y = r + 1; ydx = _mm_set1_epi16((int16_t)(y * dx)); int base_x = (-y * dx) >> frac_bits_x; for (int j = 0; j < W; j += 16) { reg_j = _mm_set1_epi16(j); int base_shift = 0; if ((base_x + j) < (min_base_x - 1)) { base_shift = (min_base_x - (base_x + j) - 1); } int base_min_diff = (min_base_x - base_x - j); if (base_min_diff > 16) { base_min_diff = 16; } else { if (base_min_diff < 0) base_min_diff = 0; } if (base_shift < 16) { a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); a1_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); a0_above = 
_mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); a1_above = _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); a0_x = _mm_cvtepu8_epi16(a0_above); a1_x = _mm_cvtepu8_epi16(a1_above); r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6); shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); // 16 16-bit values a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6); shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shift); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values resx = _mm_packus_epi16(res, res1); } else { resx = _mm_setzero_si128(); } // y calc if (base_x < min_base_x) { __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h; __m128i mask, mask_h, mul16, mul16_h; r6 = _mm_set1_epi16(r << 6); c_reg = _mm_add_epi16(reg_j, c1234); c_reg_h = _mm_add_epi16(reg_j, c1234_h); mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256), _mm_srli_epi16(min_y_base, 1)); mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256), _mm_srli_epi16(min_y_base, 1)); y_reg = _mm_sub_epi16(r6, mul16); y_reg_h = _mm_sub_epi16(r6, mul16_h); base_y = _mm_srai_epi16(y_reg, frac_bits_y); base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y); mask = _mm_cmpgt_epi16(min_y_base, base_y); mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h); base_y = _mm_blendv_epi8(base_y, min_y_base, mask); base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h); int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7); int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0); int16_t offset_diff = max_y - min_y; if (offset_diff < 16) { __m128i min_y_reg = _mm_set1_epi16(min_y); __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg); __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg); __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h); __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y)); __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1)); __m128i LoadMask = _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4])); a0_mask = _mm_and_si128(a0_mask, LoadMask); a1_mask = _mm_and_si128(a1_mask, LoadMask); a0_mask = _mm_shuffle_epi8(a0_mask, y_offset); a1_mask = _mm_shuffle_epi8(a1_mask, y_offset); a0_y = _mm_cvtepu8_epi16(a0_mask); a1_y = _mm_cvtepu8_epi16(a1_mask); a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8)); a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8)); } else { base_y = _mm_andnot_si128(mask, base_y); base_y_h = _mm_andnot_si128(mask_h, base_y_h); _mm_store_si128((__m128i *)base_y_c, base_y); _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); base_y = _mm_add_epi16(base_y, c1); base_y_h = _mm_add_epi16(base_y_h, c1); 
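// a0_y/a0_y_h now hold left[base_y] for the 16 columns handled in this j
// iteration; base_y has just been advanced by one (c1 == 1), so the gathers
// below fill a1_y/a1_y_h with left[base_y + 1].  The two taps are later
// combined as (a0 * 32 + 16 + (a1 - a0) * shift) >> 5.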
_mm_store_si128((__m128i *)base_y_c, base_y); _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], left[base_y_c[6]], left[base_y_c[7]]); a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], left[base_y_c[15]]); } shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1); shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1); diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shifty); res = _mm_add_epi16(a32, b); res = _mm_srli_epi16(res, 5); // 16 16-bit values diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x] a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 b = _mm_mullo_epi16(diff, shifty_h); res1 = _mm_add_epi16(a32, b); res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values resy = _mm_packus_epi16(res, res1); } else { resy = _mm_setzero_si128(); } resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); _mm_storeu_si128((__m128i *)(dst + j), resxy); } // for j dst += stride; } } // Directional prediction, zone 2: 90 < angle < 180 void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { assert(dx > 0); assert(dy > 0); switch (bw) { case 4: dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; case 8: dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above, upsample_left, dx, dy); break; default: dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left, upsample_above, upsample_left, dx, dy); } return; } // z3 functions static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[4]; dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy); transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3]); *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); return; } static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[8], d[8]; dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy); transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3]); _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); } static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[8]; 
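// Like the other zone 3 (180..270 degree) predictors in this file, the
// prediction is generated by running the zone 1 kernel along the *left* edge
// and then transposing the result into the destination block.
//
// For reference, a scalar sketch of the per-sample recurrence that
// dr_prediction_z1_HxW_internal_sse4_1() vectorizes (illustrative only,
// assuming no edge upsampling; the identifier names below are ours, not
// libaom's):
/*
  static void dr_z1_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int rows,
                                  int cols, const uint8_t *edge,
                                  int max_base_x, int dx) {
    int x = dx;
    for (int r = 0; r < rows; ++r, x += dx) {
      const int base = x >> 6;            // integer position along the edge
      const int shift = (x & 0x3f) >> 1;  // 5-bit interpolation fraction
      for (int c = 0; c < cols; ++c) {
        const int b = base + c;
        dst[r * stride + c] =
            (b < max_base_x)
                ? (uint8_t)((edge[b] * 32 + 16 +
                             (edge[b + 1] - edge[b]) * shift) >> 5)
                : edge[max_base_x];
      }
    }
  }
*/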
dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy); transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); } } static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[8], d[4]; dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy); transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], &d[3]); _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); } static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[8], d[8]; dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy); transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); for (int i = 0; i < 8; i++) { _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), _mm_srli_si128(d[i], 8)); } } static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16]; dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy); transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[4], d[16]; dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy); transpose4x16_sse2(dstvec, d); for (int i = 0; i < 16; i++) { *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); } } static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[8]; dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy); for (int i = 4; i < 8; i++) { d[i] = _mm_setzero_si128(); } transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); for (int i = 0; i < 4; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left, upsample_left, dy); for (int i = 8; i < 16; i++) { dstvec[i] = _mm_setzero_si128(); dstvec_h[i] = _mm_setzero_si128(); } transpose16x16_sse2(dstvec, d); transpose16x16_sse2(dstvec_h, d_h); for (int i = 0; i < 16; i++) { 
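// Only the low 8 bytes of each transposed row are stored because the block
// is 8 pixels wide; the upper lanes of d[]/d_h[] come from the dstvec
// entries that were zeroed above.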
_mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); } for (int i = 0; i < 16; i++) { _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]); } } static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[32], d[16]; dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy); transpose16x8_8x16_sse2( &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); transpose16x8_8x16_sse2( &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], &d[6 + 8], &d[7 + 8]); for (int i = 0; i < 8; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16]; dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy); transpose16x16_sse2(dstvec, d); for (int i = 0; i < 16; i++) { _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); } } static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[32], d[32], dstvec_h[32], d_h[32]; dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left, upsample_left, dy); transpose16x16_sse2(dstvec, d); transpose16x16_sse2(dstvec_h, d_h); transpose16x16_sse2(dstvec + 16, d + 16); transpose16x16_sse2(dstvec_h + 16, d_h + 16); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]); } for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]); } } static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[64 * 64]; dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 64, 64); } static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left, upsample_left, dy); transpose16x16_sse2(dstvec, d); transpose16x16_sse2(dstvec_h, d_h); // store for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); } } static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[32], d[16]; dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy); for (int i = 0; i < 32; i += 16) { transpose16x16_sse2((dstvec + i), d); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); } } } static void 
dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[64 * 32]; dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 32, 64); } static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[32 * 64]; dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy); transpose(dstT, 32, dst, stride, 64, 32); return; } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8_t dstT[64 * 16]; dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy); transpose(dstT, 64, dst, stride, 16, 64); } static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { __m128i dstvec[64], d[16]; dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy); for (int i = 0; i < 64; i += 16) { transpose16x16_sse2(dstvec + i, d); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { (void)above; (void)dx; assert(dx == 1); assert(dy > 0); if (bw == bh) { switch (bw) { case 4: dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy); break; case 64: dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy); break; default: assert(0 && "Invalid block size"); } } else { if (bw < bh) { if (bw + bw == bh) { switch (bw) { case 4: dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy); break; default: assert(0 && "Invalid block size"); } } else { switch (bw) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy); break; default: assert(0 && "Invalid block size"); #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER } } } else { if (bh + bh == bw) { switch (bh) { case 4: dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy); break; case 16: dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy); break; case 32: dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy); break; default: assert(0 && "Invalid block size"); } } else { switch (bh) { #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER case 4: dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy); break; case 8: dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy); break; case 16: 
          dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
          break;
        default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    }
  }
}
aom-3.12.1/aom_dsp/x86/intrapred_ssse3.c000066400000000000000000004272121477627663500177340ustar00rootroot00000000000000
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static inline __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);
  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i
rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); dst += stride; rep = _mm_add_epi16(rep, one); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); const __m128i t = _mm_loadl_epi64((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; for (i = 0; i < 4; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); const __m128i t = _mm_loadl_epi64((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; for (i = 0; i < 8; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_load_si128((const __m128i *)left); const __m128i t = _mm_loadl_epi64((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; for (i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); dst += stride; rep = _mm_add_epi16(rep, one); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i t = _mm_loadl_epi64((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); for (int j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); dst += stride; rep = _mm_add_epi16(rep, one); } } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Return 16 8-bit pixels in one row static inline __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, const __m128i *top1, const __m128i *topleft) { const __m128i p0 = paeth_8x1_pred(left, top0, topleft); const 
__m128i p1 = paeth_8x1_pred(left, top1, topleft); return _mm_packus_epi16(p0, p1); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]); const __m128i t = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 4; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); const __m128i t = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; for (i = 0; i < 8; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_load_si128((const __m128i *)left); const __m128i t = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; for (i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_load_si128((const __m128i *)left); const __m128i t = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l16; int i; for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } l = _mm_load_si128((const __m128i *)(left + 16)); rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t 
*above, const uint8_t *left) { const __m128i t = _mm_load_si128((const __m128i *)above); const __m128i zero = _mm_setzero_si128(); const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); for (int j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); _mm_store_si128((__m128i *)dst, row); dst += stride; rep = _mm_add_epi16(rep, one); } } } void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); const __m128i l = _mm_loadl_epi64((const __m128i *)left); __m128i l16; for (int i = 0; i < 8; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); _mm_store_si128((__m128i *)dst, r32l); _mm_store_si128((__m128i *)(dst + 16), r32h); dst += stride; rep = _mm_add_epi16(rep, one); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; int i; for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); _mm_store_si128((__m128i *)dst, r32l); _mm_store_si128((__m128i *)(dst + 16), r32h); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; int i; for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); const 
__m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); _mm_store_si128((__m128i *)dst, r32l); _mm_store_si128((__m128i *)(dst + 16), r32h); dst += stride; rep = _mm_add_epi16(rep, one); } rep = _mm_set1_epi16((short)0x8000); l = _mm_load_si128((const __m128i *)(left + 16)); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); _mm_store_si128((__m128i *)dst, r32l); _mm_store_si128((__m128i *)(dst + 16), r32h); dst += stride; rep = _mm_add_epi16(rep, one); } } void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); __m128i l16; int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); _mm_store_si128((__m128i *)dst, r32l); _mm_store_si128((__m128i *)(dst + 16), r32h); dst += stride; rep = _mm_add_epi16(rep, one); } } } void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i cl = _mm_unpacklo_epi8(c, zero); const __m128i ch = _mm_unpackhi_epi8(c, zero); const __m128i dl = _mm_unpacklo_epi8(d, zero); const __m128i dh = _mm_unpackhi_epi8(d, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); __m128i l16; int i, j; for (j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += stride; rep = _mm_add_epi16(rep, one); } } } void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); const __m128i d = _mm_load_si128((const __m128i *)(above + 
48)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i cl = _mm_unpacklo_epi8(c, zero); const __m128i ch = _mm_unpackhi_epi8(c, zero); const __m128i dl = _mm_unpacklo_epi8(d, zero); const __m128i dh = _mm_unpackhi_epi8(d, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); __m128i l16; int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += stride; rep = _mm_add_epi16(rep, one); } } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i a = _mm_load_si128((const __m128i *)above); const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); const __m128i zero = _mm_setzero_si128(); const __m128i al = _mm_unpacklo_epi8(a, zero); const __m128i ah = _mm_unpackhi_epi8(a, zero); const __m128i bl = _mm_unpacklo_epi8(b, zero); const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i cl = _mm_unpacklo_epi8(c, zero); const __m128i ch = _mm_unpackhi_epi8(c, zero); const __m128i dl = _mm_unpacklo_epi8(d, zero); const __m128i dh = _mm_unpackhi_epi8(d, zero); const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); const __m128i one = _mm_set1_epi16(1); __m128i l16; int i; const __m128i l = _mm_load_si128((const __m128i *)left); __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); _mm_store_si128((__m128i *)dst, r0); _mm_store_si128((__m128i *)(dst + 16), r1); _mm_store_si128((__m128i *)(dst + 32), r2); _mm_store_si128((__m128i *)(dst + 48), r3); dst += stride; rep = _mm_add_epi16(rep, one); } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // ----------------------------------------------------------------------------- // SMOOTH_PRED // pixels[0]: above and below_pred interleave vector // pixels[1]: left vector // pixels[2]: right_pred vector static inline void load_pixel_w4(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]); if (height == 4) pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]); else if (height == 8) pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); else pixels[1] = _mm_loadu_si128(((const __m128i *)left)); pixels[2] = _mm_set1_epi16((int16_t)above[3]); const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); const __m128i zero = _mm_setzero_si128(); d = 
_mm_unpacklo_epi8(d, zero); pixels[0] = _mm_unpacklo_epi16(d, bp); } // weight_h[0]: weight_h vector // weight_h[1]: scale - weight_h vector // weight_h[2]: same as [0], second half for height = 16 only // weight_h[3]: same as [1], second half for height = 16 only // weight_w[0]: weights_w and scale - weights_w interleave vector static inline void load_weight_w4(int height, __m128i *weight_h, __m128i *weight_w) { const __m128i zero = _mm_setzero_si128(); const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE)); const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]); weight_h[0] = _mm_unpacklo_epi8(t, zero); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); if (height == 8) { const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]); weight_h[0] = _mm_unpacklo_epi8(weight, zero); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); } else if (height == 16) { const __m128i weight = _mm_loadu_si128((const __m128i *)&smooth_weights[12]); weight_h[0] = _mm_unpacklo_epi8(weight, zero); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); weight_h[2] = _mm_unpackhi_epi8(weight, zero); weight_h[3] = _mm_sub_epi16(d, weight_h[2]); } } static inline void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, const __m128i *ww, int h, uint8_t *dst, ptrdiff_t stride, int second_half) { const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); const __m128i one = _mm_set1_epi16(1); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set1_epi32(0xc080400); __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); for (int i = 0; i < h; ++i) { const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); __m128i s = _mm_madd_epi16(pixel[0], wh_sc); __m128i b = _mm_shuffle_epi8(pixel[1], rep); b = _mm_unpacklo_epi16(b, pixel[2]); __m128i sum = _mm_madd_epi16(b, ww[0]); sum = _mm_add_epi32(s, sum); sum = _mm_add_epi32(sum, round); sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE); sum = _mm_shuffle_epi8(sum, gat); *(int *)dst = _mm_cvtsi128_si32(sum); dst += stride; rep = _mm_add_epi16(rep, one); d = _mm_add_epi16(d, inc); } } void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[3]; load_pixel_w4(above, left, 4, pixels); __m128i wh[4], ww[2]; load_weight_w4(4, wh, ww); smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); } void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[3]; load_pixel_w4(above, left, 8, pixels); __m128i wh[4], ww[2]; load_weight_w4(8, wh, ww); smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[3]; load_pixel_w4(above, left, 16, pixels); __m128i wh[4], ww[2]; load_weight_w4(16, wh, ww); smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); dst += stride << 3; smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // pixels[0]: above and below_pred interleave vector, first half // pixels[1]: above and below_pred interleave vector, second half // pixels[2]: left vector // pixels[3]: right_pred vector // pixels[4]: above and 
below_pred interleave vector, first half // pixels[5]: above and below_pred interleave vector, second half // pixels[6]: left vector + 16 // pixels[7]: right_pred vector static inline void load_pixel_w8(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { const __m128i zero = _mm_setzero_si128(); const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); __m128i d = _mm_loadl_epi64((const __m128i *)above); d = _mm_unpacklo_epi8(d, zero); pixels[0] = _mm_unpacklo_epi16(d, bp); pixels[1] = _mm_unpackhi_epi16(d, bp); pixels[3] = _mm_set1_epi16((int16_t)above[7]); if (height == 4) { pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]); } else if (height == 8) { pixels[2] = _mm_loadl_epi64((const __m128i *)left); } else if (height == 16) { pixels[2] = _mm_load_si128((const __m128i *)left); } else { pixels[2] = _mm_load_si128((const __m128i *)left); pixels[4] = pixels[0]; pixels[5] = pixels[1]; pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); pixels[7] = pixels[3]; } } // weight_h[0]: weight_h vector // weight_h[1]: scale - weight_h vector // weight_h[2]: same as [0], offset 8 // weight_h[3]: same as [1], offset 8 // weight_h[4]: same as [0], offset 16 // weight_h[5]: same as [1], offset 16 // weight_h[6]: same as [0], offset 24 // weight_h[7]: same as [1], offset 24 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half // weight_w[1]: weights_w and scale - weights_w interleave vector, second half static inline void load_weight_w8(int height, __m128i *weight_h, __m128i *weight_w) { const __m128i zero = _mm_setzero_si128(); const int we_offset = height < 8 ? 0 : 4; __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]); weight_h[0] = _mm_unpacklo_epi8(we, zero); const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE)); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); if (height == 4) { we = _mm_srli_si128(we, 4); __m128i tmp1 = _mm_unpacklo_epi8(we, zero); __m128i tmp2 = _mm_sub_epi16(d, tmp1); weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); } else { weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); } if (height == 16) { we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]); weight_h[0] = _mm_unpacklo_epi8(we, zero); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); weight_h[2] = _mm_unpackhi_epi8(we, zero); weight_h[3] = _mm_sub_epi16(d, weight_h[2]); } else if (height == 32) { const __m128i weight_lo = _mm_loadu_si128((const __m128i *)&smooth_weights[28]); weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); weight_h[3] = _mm_sub_epi16(d, weight_h[2]); const __m128i weight_hi = _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]); weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); weight_h[5] = _mm_sub_epi16(d, weight_h[4]); weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); weight_h[7] = _mm_sub_epi16(d, weight_h[6]); } } static inline void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, const __m128i *ww, int h, uint8_t *dst, ptrdiff_t stride, int second_half) { const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); const __m128i one = _mm_set1_epi16(1); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); int i; for (i = 0; i < h; ++i) { const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); __m128i b = _mm_shuffle_epi8(pixels[2], rep); b = _mm_unpacklo_epi16(b, pixels[3]); __m128i sum0 = _mm_madd_epi16(b, ww[0]); __m128i sum1 = _mm_madd_epi16(b, ww[1]); s0 = _mm_add_epi32(s0, sum0); s0 = _mm_add_epi32(s0, round); s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE); s1 = _mm_add_epi32(s1, sum1); s1 = _mm_add_epi32(s1, round); s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE); sum0 = _mm_packus_epi16(s0, s1); sum0 = _mm_shuffle_epi8(sum0, gat); _mm_storel_epi64((__m128i *)dst, sum0); dst += stride; rep = _mm_add_epi16(rep, one); d = _mm_add_epi16(d, inc); } } void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[4]; load_pixel_w8(above, left, 4, pixels); __m128i wh[4], ww[2]; load_weight_w8(4, wh, ww); smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); } void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[4]; load_pixel_w8(above, left, 8, pixels); __m128i wh[4], ww[2]; load_weight_w8(8, wh, ww); smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); } void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[4]; load_pixel_w8(above, left, 16, pixels); __m128i wh[4], ww[2]; load_weight_w8(16, wh, ww); smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); dst += stride << 3; smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i pixels[8]; load_pixel_w8(above, left, 32, pixels); __m128i wh[8], ww[2]; load_weight_w8(32, wh, ww); smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); dst += stride << 3; smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); dst += stride << 3; smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); dst += stride << 3; smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // TODO(slavarnway): Visual Studio only supports restrict when /std:c11 // (available in 2019+) or greater is specified; __restrict can be used in that // case. This should be moved to rtcd and used consistently between the // function declarations and definitions to avoid warnings in Visual Studio // when defining LIBAOM_RESTRICT to restrict or __restrict. #if defined(_MSC_VER) #define LIBAOM_RESTRICT #else #define LIBAOM_RESTRICT restrict #endif static AOM_FORCE_INLINE __m128i Load4(const void *src) { // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a // movss instruction. // // Until compiler support of _mm_loadu_si32 is widespread, use of // _mm_loadu_si32 is banned. 
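// Using memcpy here expresses the unaligned 4-byte load without the undefined
// behaviour of dereferencing a misaligned or type-punned pointer.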
int val; memcpy(&val, src, sizeof(val)); return _mm_cvtsi32_si128(val); } static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) { return _mm_loadl_epi64((const __m128i *)(a)); } static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) { return _mm_loadu_si128((const __m128i *)(a)); } static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) { const int val = _mm_cvtsi128_si32(x); memcpy(dst, &val, sizeof(val)); } static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) { _mm_storel_epi64((__m128i *)(a), v); } static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) { _mm_storeu_si128((__m128i *)(a), v); } static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) { return _mm_unpacklo_epi8((x), _mm_setzero_si128()); } static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) { const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128()); return _mm_unpacklo_epi16(tmp, _mm_setzero_si128()); } static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) { return _mm_unpacklo_epi16((x), _mm_setzero_si128()); } static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column, int width, int height) { const uint8_t *const sm_weights_h = smooth_weights + height - 4; const uint8_t *const sm_weights_w = smooth_weights + width - 4; const __m128i zero = _mm_setzero_si128(); const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]); const __m128i top_right = _mm_set1_epi16(top_row[width - 1]); const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE); for (int y = 0; y < height; ++y) { const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]); const __m128i left_y = _mm_cvtsi32_si128(left_column[y]); const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); __m128i scaled_bottom_left = _mm_mullo_epi16(scale_m_weights_y, bottom_left); const __m128i weight_left_y = _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round); scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0); for (int x = 0; x < width; x += 8) { const __m128i top_x = LoadLo8(top_row + x); const __m128i weights_x = LoadLo8(sm_weights_w + x); const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x); const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x); const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero); // Here opposite weights and pixels are multiplied, where the order of // interleaving is indicated in the names. __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y); __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y); // |scaled_bottom_left| is always scaled by the same weight each row, so // we only derive |scaled_top_right| values here. const __m128i inverted_weights_x = _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x)); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights_x, top_right); const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right); const __m128i scaled_top_right_hi = _mm_unpackhi_epi16(scaled_top_right, zero); pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left); pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left); pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo); pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi); // The round value for RightShiftWithRounding was added with // |scaled_bottom_left|. 
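// Both the vertical (weights_y) and horizontal (weights_x) contributions are
// scaled by 256 (1 << SMOOTH_WEIGHT_LOG2_SCALE), so the combined sum is
// reduced with a 9-bit shift (divide by 512) below.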
pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE)); pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE)); const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); StoreLo8(dst + x, _mm_packus_epi16(pred, pred)); } dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 16, 4); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 16, 8); } void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 16, 16); } void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 16, 32); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 16, 64); } void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 32, 8); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 32, 16); } void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 32, 32); } void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 32, 64); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 64, 16); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 64, 32); } void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { smooth_predictor_wxh(dst, stride, above, left, 64, 64); } // ----------------------------------------------------------------------------- // Smooth horizontal/vertical helper functions. // For Horizontal, pixels1 and pixels2 are the same repeated value. For // Vertical, weights1 and weights2 are the same, and scaled_corner1 and // scaled_corner2 are the same. static AOM_FORCE_INLINE void write_smooth_directional_sum16( uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2, const __m128i weights1, const __m128i weights2, const __m128i scaled_corner1, const __m128i scaled_corner2, const __m128i round) { const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1); const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2); const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1); const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2); // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
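// After the shift every 16-bit lane is in [0, 255], so the _mm_packus_epi16
// below never saturates.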
const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8); const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8); StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2)); } static AOM_FORCE_INLINE __m128i smooth_directional_sum8( const __m128i pixels, const __m128i weights, const __m128i scaled_corner) { const __m128i weighted_px = _mm_mullo_epi16(pixels, weights); return _mm_add_epi16(scaled_corner, weighted_px); } static AOM_FORCE_INLINE void write_smooth_directional_sum8( uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights, const __m128i *scaled_corner, const __m128i *round) { const __m128i pred_sum = smooth_directional_sum8(*pixels, *weights, *scaled_corner); // Equivalent to RightShiftWithRounding(pred[x][y], 8). const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8); StoreLo8(dst, _mm_packus_epi16(pred, pred)); } // ----------------------------------------------------------------------------- // SMOOTH_V_PRED static AOM_FORCE_INLINE void load_smooth_vertical_pixels4( const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left, const int height, __m128i *pixels) { __m128i top = Load4(above); const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); top = cvtepu8_epi16(top); pixels[0] = _mm_unpacklo_epi16(top, bottom_left); } // |weight_array| alternates weight vectors from the table with their inverted // (256-w) counterparts. This is precomputed by the compiler when the weights // table is visible to this module. Removing this visibility can cut speed by up // to half in both 4xH and 8xH transforms. static AOM_FORCE_INLINE void load_smooth_vertical_weights4( const uint8_t *LIBAOM_RESTRICT weight_array, const int height, __m128i *weights) { const __m128i inverter = _mm_set1_epi16(256); if (height == 4) { const __m128i weight = Load4(weight_array); weights[0] = cvtepu8_epi16(weight); weights[1] = _mm_sub_epi16(inverter, weights[0]); } else if (height == 8) { const __m128i weight = LoadLo8(weight_array + 4); weights[0] = cvtepu8_epi16(weight); weights[1] = _mm_sub_epi16(inverter, weights[0]); } else { const __m128i weight = LoadUnaligned16(weight_array + 12); const __m128i zero = _mm_setzero_si128(); weights[0] = cvtepu8_epi16(weight); weights[1] = _mm_sub_epi16(inverter, weights[0]); weights[2] = _mm_unpackhi_epi8(weight, zero); weights[3] = _mm_sub_epi16(inverter, weights[2]); } } static AOM_FORCE_INLINE void write_smooth_vertical4xh( const __m128i *pixel, const __m128i *weight, const int height, uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) { const __m128i pred_round = _mm_set1_epi32(128); const __m128i mask_increment = _mm_set1_epi16(0x0202); const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400); __m128i y_select = _mm_set1_epi16(0x0100); for (int y = 0; y < height; ++y) { const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select); const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select); const __m128i alternate_weights = _mm_unpacklo_epi16(weight_y, inverted_weight_y); // Here the pixel vector is top_row[0], corner, top_row[1], corner, ... 
// The madd instruction yields four results of the form: // (top_row[x] * weight[y] + corner * inverted_weight[y]) __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights); sum = _mm_add_epi32(sum, pred_round); sum = _mm_srai_epi32(sum, 8); sum = _mm_shuffle_epi8(sum, cvtepu8_epi32); Store4(dst, sum); dst += stride; y_select = _mm_add_epi16(y_select, mask_increment); } } void aom_smooth_v_predictor_4x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { __m128i pixels; load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels); __m128i weights[2]; load_smooth_vertical_weights4(smooth_weights, 4, weights); write_smooth_vertical4xh(&pixels, weights, 4, dst, stride); } void aom_smooth_v_predictor_4x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { __m128i pixels; load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels); __m128i weights[2]; load_smooth_vertical_weights4(smooth_weights, 8, weights); write_smooth_vertical4xh(&pixels, weights, 8, dst, stride); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_4x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { __m128i pixels; load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels); __m128i weights[4]; load_smooth_vertical_weights4(smooth_weights, 16, weights); write_smooth_vertical4xh(&pixels, weights, 8, dst, stride); dst += stride << 3; write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_8x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[3]); const __m128i weights = cvtepu8_epi16(Load4(smooth_weights)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i y_select = _mm_set1_epi32(0x01000100); const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); __m128i weights_y = _mm_shuffle_epi8(weights, y_select); __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; y_select = _mm_set1_epi32(0x03020302); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; y_select = _mm_set1_epi32(0x05040504); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; y_select = _mm_set1_epi32(0x07060706); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); } void aom_smooth_v_predictor_8x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT 
top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[7]); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } } void aom_smooth_v_predictor_8x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[15]); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_8x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[31]); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = 
_mm_sub_epi16(scale, weights4); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i scaled_bottom_left3 = _mm_mullo_epi16(inverted_weights3, bottom_left); const __m128i scaled_bottom_left4 = _mm_mullo_epi16(inverted_weights4, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left3, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left4, y_select); write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, &round); dst += stride; } } void aom_smooth_v_predictor_16x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[3]); const __m128i weights = cvtepu8_epi16(Load4(smooth_weights)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left); const __m128i round = _mm_set1_epi16(128); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = cvtepu8_epi16(top); const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8)); __m128i y_select = _mm_set1_epi32(0x01000100); __m128i weights_y = _mm_shuffle_epi8(weights, y_select); __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; y_select = _mm_set1_epi32(0x03020302); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; y_select = _mm_set1_epi32(0x05040504); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, 
weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; y_select = _mm_set1_epi32(0x07060706); weights_y = _mm_shuffle_epi8(weights, y_select); scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_16x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[7]); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left); const __m128i round = _mm_set1_epi16(128); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = cvtepu8_epi16(top); const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } void aom_smooth_v_predictor_16x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[15]); const __m128i zero = _mm_setzero_si128(); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i weights_lo = cvtepu8_epi16(weights); const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); const __m128i scaled_bottom_left_lo = _mm_mullo_epi16(inverted_weights_lo, bottom_left); const __m128i scaled_bottom_left_hi = _mm_mullo_epi16(inverted_weights_hi, bottom_left); const __m128i round = _mm_set1_epi16(128); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = cvtepu8_epi16(top); const __m128i top_hi = _mm_unpackhi_epi8(top, zero); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } void aom_smooth_v_predictor_16x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT 
top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[31]); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i zero = _mm_setzero_si128(); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i scaled_bottom_left3 = _mm_mullo_epi16(inverted_weights3, bottom_left); const __m128i scaled_bottom_left4 = _mm_mullo_epi16(inverted_weights4, bottom_left); const __m128i round = _mm_set1_epi16(128); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = cvtepu8_epi16(top); const __m128i top_hi = _mm_unpackhi_epi8(top, zero); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left3, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left4, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_16x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[63]); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i round = _mm_set1_epi16(128); const __m128i zero = _mm_setzero_si128(); const __m128i top = LoadUnaligned16(top_row); const __m128i top_lo = 
cvtepu8_epi16(top); const __m128i top_hi = _mm_unpackhi_epi8(top, zero); const uint8_t *weights_base_ptr = smooth_weights + 60; for (int left_offset = 0; left_offset < 64; left_offset += 16) { const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); const __m128i weights_lo = cvtepu8_epi16(weights); const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); const __m128i scaled_bottom_left_lo = _mm_mullo_epi16(inverted_weights_lo, bottom_left); const __m128i scaled_bottom_left_hi = _mm_mullo_epi16(inverted_weights_hi, bottom_left); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } } void aom_smooth_v_predictor_32x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[7]); const __m128i top_lo = LoadUnaligned16(top_row); const __m128i top_hi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lo); const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); const __m128i top3 = cvtepu8_epi16(top_hi); const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); __m128i scale = _mm_set1_epi16(256); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_bottom_left = _mm_mullo_epi16(inverted_weights, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_32x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[15]); const __m128i top_lo = LoadUnaligned16(top_row); const __m128i top_hi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lo); const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); const __m128i top3 = cvtepu8_epi16(top_hi); 
const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } void aom_smooth_v_predictor_32x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[31]); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i zero = _mm_setzero_si128(); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i top_lo = LoadUnaligned16(top_row); const __m128i top_hi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lo); const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); const __m128i top3 = cvtepu8_epi16(top_hi); const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i scaled_bottom_left3 = _mm_mullo_epi16(inverted_weights3, bottom_left); const __m128i scaled_bottom_left4 = _mm_mullo_epi16(inverted_weights4, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const 
__m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left3, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left4, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } void aom_smooth_v_predictor_32x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[63]); const __m128i top_lo = LoadUnaligned16(top_row); const __m128i top_hi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lo); const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); const __m128i top3 = cvtepu8_epi16(top_hi); const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); const uint8_t *weights_base_ptr = smooth_weights + 60; for (int left_offset = 0; left_offset < 64; left_offset += 16) { const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); const __m128i weights_lo = cvtepu8_epi16(weights); const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); const __m128i scaled_bottom_left_lo = _mm_mullo_epi16(inverted_weights_lo, bottom_left); const __m128i scaled_bottom_left_hi = _mm_mullo_epi16(inverted_weights_hi, bottom_left); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_lo, 
y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_64x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i bottom_left = _mm_set1_epi16(left_column[15]); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i zero = _mm_setzero_si128(); const __m128i top_lolo = LoadUnaligned16(top_row); const __m128i top_lohi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lolo); const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); const __m128i top3 = cvtepu8_epi16(top_lohi); const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i top_hilo = LoadUnaligned16(top_row + 32); const __m128i top_hihi = LoadUnaligned16(top_row + 48); const __m128i top5 = cvtepu8_epi16(top_hilo); const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); const __m128i top7 = cvtepu8_epi16(top_hihi); const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); 
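// Each 64-pixel row is written as four 16-pixel stores at dst, dst + 16, dst + 32 and dst + 48.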
write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_v_predictor_64x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[31]); const __m128i top_lolo = LoadUnaligned16(top_row); const __m128i top_lohi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lolo); const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); const __m128i top3 = cvtepu8_epi16(top_lohi); const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); const __m128i top_hilo = LoadUnaligned16(top_row + 32); const __m128i top_hihi = LoadUnaligned16(top_row + 48); const __m128i top5 = cvtepu8_epi16(top_hilo); const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); const __m128i top7 = cvtepu8_epi16(top_hihi); const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_bottom_left1 = _mm_mullo_epi16(inverted_weights1, bottom_left); const __m128i scaled_bottom_left2 = _mm_mullo_epi16(inverted_weights2, bottom_left); const __m128i scaled_bottom_left3 = _mm_mullo_epi16(inverted_weights3, bottom_left); const __m128i scaled_bottom_left4 = _mm_mullo_epi16(inverted_weights4, bottom_left); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left1, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = 
_mm_shuffle_epi8(weights2, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left2, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left3, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left4, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } void aom_smooth_v_predictor_64x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i zero = _mm_setzero_si128(); const __m128i bottom_left = _mm_set1_epi16(left_column[63]); const __m128i top_lolo = LoadUnaligned16(top_row); const __m128i top_lohi = LoadUnaligned16(top_row + 16); const __m128i top1 = cvtepu8_epi16(top_lolo); const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); const __m128i top3 = cvtepu8_epi16(top_lohi); const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); const __m128i top_hilo = LoadUnaligned16(top_row + 32); const __m128i top_hihi = LoadUnaligned16(top_row + 48); const __m128i top5 = cvtepu8_epi16(top_hilo); const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); const __m128i top7 = cvtepu8_epi16(top_hihi); const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i round = _mm_set1_epi16(128); const uint8_t *weights_base_ptr = smooth_weights + 60; for (int left_offset = 0; left_offset < 64; left_offset += 16) { const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); const __m128i weights_lo = cvtepu8_epi16(weights); const __m128i weights_hi = _mm_unpackhi_epi8(weights, 
zero); const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); const __m128i scaled_bottom_left_lo = _mm_mullo_epi16(inverted_weights_lo, bottom_left); const __m128i scaled_bottom_left_hi = _mm_mullo_epi16(inverted_weights_hi, bottom_left); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); const __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, scaled_bottom_left_y, scaled_bottom_left_y, round); dst += stride; } } } // ----------------------------------------------------------------------------- // SMOOTH_H_PRED static AOM_FORCE_INLINE void write_smooth_horizontal_sum4( uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights, const __m128i *scaled_top_right, const __m128i *round) { const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights); const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y); // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
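// After the shift the low byte of each 32-bit lane holds the pixel value; the
// 0x0C080400 shuffle mask gathers those four bytes into the low dword for Store4.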
const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8); const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8)); } void aom_smooth_h_predictor_4x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi32(top_row[3]); const __m128i left = cvtepu8_epi32(Load4(left_column)); const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi32(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); } void aom_smooth_h_predictor_4x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi32(top_row[3]); const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi32(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi32(Load4(left_column)); __m128i left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left = cvtepu8_epi32(Load4(left_column + 4)); left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_4x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi32(top_row[3]); const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = 
_mm_sub_epi32(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi32(Load4(left_column)); __m128i left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left = cvtepu8_epi32(Load4(left_column + 4)); left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left = cvtepu8_epi32(Load4(left_column + 8)); left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left = cvtepu8_epi32(Load4(left_column + 12)); left_y = _mm_shuffle_epi32(left, 0); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0x55); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xaa); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; left_y = _mm_shuffle_epi32(left, 0xff); write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, &round); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, // |pixels| is a segment of the top row or the whole top row, and |weights| is // repeated. 
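/* Illustrative scalar sketch (not part of the library API) of the SMOOTH_H
 * computation performed by the kernels below, assuming the scale is
 * 1 << SMOOTH_WEIGHT_LOG2_SCALE (256) and |weight| is the horizontal smooth
 * weight for column x:
 *
 *   pred[y][x] = (weight * left[y] + (256 - weight) * top_right + 128) >> 8
 *
 * The vector code precomputes (256 - weight) * top_right once per block as
 * |scaled_top_right| and broadcasts left[y] per row with _mm_shuffle_epi8.
 * The helper name below is hypothetical and exists only for illustration.
 */
static inline uint8_t smooth_h_pred_scalar_sketch(uint8_t left_pixel,
                                                  uint8_t top_right,
                                                  uint8_t weight) {
  const uint32_t scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE;  // assumed 256
  const uint32_t sum =
      (uint32_t)weight * left_pixel + (scale - weight) * top_right;
  // RightShiftWithRounding(sum, SMOOTH_WEIGHT_LOG2_SCALE)
  return (uint8_t)((sum + (scale >> 1)) >> SMOOTH_WEIGHT_LOG2_SCALE);
}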
void aom_smooth_h_predictor_8x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[7]); const __m128i left = cvtepu8_epi16(Load4(left_column)); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i y_select = _mm_set1_epi32(0x01000100); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; y_select = _mm_set1_epi32(0x03020302); left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; y_select = _mm_set1_epi32(0x05040504); left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; y_select = _mm_set1_epi32(0x07060706); left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); } void aom_smooth_h_predictor_8x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[7]); const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } } void aom_smooth_h_predictor_8x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[7]); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi16(LoadLo8(left_column)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } } #if !CONFIG_REALTIME_ONLY || 
CONFIG_AV1_DECODER void aom_smooth_h_predictor_8x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[7]); const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i inverted_weights = _mm_sub_epi16(scale, weights); const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi16(LoadLo8(left_column)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 16)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 24)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, &round); dst += stride; } } void aom_smooth_h_predictor_16x4_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[15]); const __m128i left = cvtepu8_epi16(Load4(left_column)); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i y_mask = _mm_set1_epi32(0x01000100); __m128i left_y = _mm_shuffle_epi8(left, y_mask); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; y_mask = _mm_set1_epi32(0x03020302); left_y = _mm_shuffle_epi8(left, y_mask); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; y_mask = _mm_set1_epi32(0x05040504); left_y = _mm_shuffle_epi8(left, y_mask); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; y_mask = _mm_set1_epi32(0x07060706); left_y = _mm_shuffle_epi8(left, y_mask); 
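// The 0x07060706 shuffle mask repeats byte pair {6, 7}, i.e. it broadcasts
// the fourth 16-bit left pixel (left[3]) into every lane, so this final row
// uses left[3] for all 16 output pixels.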
write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_16x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[15]); const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } } void aom_smooth_h_predictor_16x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[15]); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi16(LoadLo8(left_column)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } } void aom_smooth_h_predictor_16x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[15]); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = 
_mm_sub_epi16(scale, weights2); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi16(LoadLo8(left_column)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 16)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 24)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_16x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[15]); const __m128i weights = LoadUnaligned16(smooth_weights + 12); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int left_offset = 0; left_offset < 64; left_offset += 8) { const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); dst += stride; } } } void aom_smooth_h_predictor_32x8_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[31]); const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i scale = _mm_set1_epi16(1 << 
SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_32x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[31]); const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left1, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } const __m128i left2 = cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left2, y_select); write_smooth_directional_sum16(dst, left_y, left_y, 
weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } } void aom_smooth_h_predictor_32x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[31]); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); __m128i left = cvtepu8_epi16(LoadLo8(left_column)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 16)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } left = cvtepu8_epi16(LoadLo8(left_column + 24)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } } void aom_smooth_h_predictor_32x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const 
uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[31]); const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_hi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int left_offset = 0; left_offset < 64; left_offset += 8) { const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); dst += stride; } } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_64x16_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[63]); const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lolo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_lohi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); const __m128i weights5 = cvtepu8_epi16(weights_hilo); const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); const __m128i weights7 = cvtepu8_epi16(weights_hihi); const __m128i weights8 = 
cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right); const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right); const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right); const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left1, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { __m128i y_select = _mm_set1_epi32(y_mask); __m128i left_y = _mm_shuffle_epi8(left2, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void aom_smooth_h_predictor_64x32_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[63]); const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lolo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_lohi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i weights_hilo = 
LoadUnaligned16(smooth_weights + 92); const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); const __m128i weights5 = cvtepu8_epi16(weights_hilo); const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); const __m128i weights7 = cvtepu8_epi16(weights_hihi); const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right); const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right); const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right); const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left1, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left2, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left3, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left4, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, 
scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += stride; } } void aom_smooth_h_predictor_64x64_ssse3( uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, const uint8_t *LIBAOM_RESTRICT top_row, const uint8_t *LIBAOM_RESTRICT left_column) { const __m128i top_right = _mm_set1_epi16(top_row[63]); const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); const __m128i weights1 = cvtepu8_epi16(weights_lolo); const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); const __m128i weights3 = cvtepu8_epi16(weights_lohi); const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); const __m128i scaled_top_right1 = _mm_mullo_epi16(inverted_weights1, top_right); const __m128i scaled_top_right2 = _mm_mullo_epi16(inverted_weights2, top_right); const __m128i scaled_top_right3 = _mm_mullo_epi16(inverted_weights3, top_right); const __m128i scaled_top_right4 = _mm_mullo_epi16(inverted_weights4, top_right); const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); const __m128i weights5 = cvtepu8_epi16(weights_hilo); const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); const __m128i weights7 = cvtepu8_epi16(weights_hihi); const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); const __m128i scaled_top_right5 = _mm_mullo_epi16(inverted_weights5, top_right); const __m128i scaled_top_right6 = _mm_mullo_epi16(inverted_weights6, top_right); const __m128i scaled_top_right7 = _mm_mullo_epi16(inverted_weights7, top_right); const __m128i scaled_top_right8 = _mm_mullo_epi16(inverted_weights8, top_right); const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); for (int left_offset = 0; left_offset < 64; left_offset += 8) { const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { const __m128i y_select = _mm_set1_epi32(y_mask); const __m128i left_y = _mm_shuffle_epi8(left, y_select); write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, scaled_top_right1, scaled_top_right2, round); write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, scaled_top_right3, scaled_top_right4, round); write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, scaled_top_right5, scaled_top_right6, round); write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, scaled_top_right7, scaled_top_right8, round); dst += 
stride; } } } aom-3.12.1/aom_dsp/x86/intrapred_utils.h000066400000000000000000000160131477627663500200320ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ #define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ #include // SSE2 #include "aom/aom_integer.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } }; static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 }, { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 }, { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 }, { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 }, }; static inline void transpose4x16_sse2(__m128i *x, __m128i *d) { __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; w0 = _mm_unpacklo_epi8(x[0], x[1]); w1 = _mm_unpacklo_epi8(x[2], x[3]); w2 = _mm_unpackhi_epi8(x[0], x[1]); w3 = _mm_unpackhi_epi8(x[2], x[3]); ww0 = _mm_unpacklo_epi16(w0, w1); ww1 = _mm_unpacklo_epi16(w2, w3); ww2 = _mm_unpackhi_epi16(w0, w1); ww3 = _mm_unpackhi_epi16(w2, w3); w0 = _mm_unpacklo_epi32(ww0, ww1); w2 = _mm_unpacklo_epi32(ww2, ww3); w1 = _mm_unpackhi_epi32(ww0, ww1); w3 = _mm_unpackhi_epi32(ww2, ww3); d[0] = _mm_unpacklo_epi64(w0, w2); d[1] = _mm_unpackhi_epi64(w0, w2); d[2] = _mm_unpacklo_epi64(w1, w3); d[3] = _mm_unpackhi_epi64(w1, w3); d[4] = _mm_srli_si128(d[0], 8); d[5] = _mm_srli_si128(d[1], 8); d[6] = _mm_srli_si128(d[2], 8); d[7] = _mm_srli_si128(d[3], 8); d[8] = _mm_srli_si128(d[0], 4); d[9] = _mm_srli_si128(d[1], 4); d[10] = _mm_srli_si128(d[2], 4); d[11] = _mm_srli_si128(d[3], 4); d[12] = _mm_srli_si128(d[0], 12); d[13] = 
_mm_srli_si128(d[1], 12); d[14] = _mm_srli_si128(d[2], 12); d[15] = _mm_srli_si128(d[3], 12); } static inline void transpose16x16_sse2(__m128i *x, __m128i *d) { __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m128i w10, w11, w12, w13, w14, w15; w0 = _mm_unpacklo_epi8(x[0], x[1]); w1 = _mm_unpacklo_epi8(x[2], x[3]); w2 = _mm_unpacklo_epi8(x[4], x[5]); w3 = _mm_unpacklo_epi8(x[6], x[7]); w8 = _mm_unpacklo_epi8(x[8], x[9]); w9 = _mm_unpacklo_epi8(x[10], x[11]); w10 = _mm_unpacklo_epi8(x[12], x[13]); w11 = _mm_unpacklo_epi8(x[14], x[15]); w4 = _mm_unpacklo_epi16(w0, w1); w5 = _mm_unpacklo_epi16(w2, w3); w12 = _mm_unpacklo_epi16(w8, w9); w13 = _mm_unpacklo_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store first 4-line result d[0] = _mm_unpacklo_epi64(w6, w14); d[1] = _mm_unpackhi_epi64(w6, w14); d[2] = _mm_unpacklo_epi64(w7, w15); d[3] = _mm_unpackhi_epi64(w7, w15); w4 = _mm_unpackhi_epi16(w0, w1); w5 = _mm_unpackhi_epi16(w2, w3); w12 = _mm_unpackhi_epi16(w8, w9); w13 = _mm_unpackhi_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store second 4-line result d[4] = _mm_unpacklo_epi64(w6, w14); d[5] = _mm_unpackhi_epi64(w6, w14); d[6] = _mm_unpacklo_epi64(w7, w15); d[7] = _mm_unpackhi_epi64(w7, w15); // upper half w0 = _mm_unpackhi_epi8(x[0], x[1]); w1 = _mm_unpackhi_epi8(x[2], x[3]); w2 = _mm_unpackhi_epi8(x[4], x[5]); w3 = _mm_unpackhi_epi8(x[6], x[7]); w8 = _mm_unpackhi_epi8(x[8], x[9]); w9 = _mm_unpackhi_epi8(x[10], x[11]); w10 = _mm_unpackhi_epi8(x[12], x[13]); w11 = _mm_unpackhi_epi8(x[14], x[15]); w4 = _mm_unpacklo_epi16(w0, w1); w5 = _mm_unpacklo_epi16(w2, w3); w12 = _mm_unpacklo_epi16(w8, w9); w13 = _mm_unpacklo_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store first 4-line result d[8] = _mm_unpacklo_epi64(w6, w14); d[9] = _mm_unpackhi_epi64(w6, w14); d[10] = _mm_unpacklo_epi64(w7, w15); d[11] = _mm_unpackhi_epi64(w7, w15); w4 = _mm_unpackhi_epi16(w0, w1); w5 = _mm_unpackhi_epi16(w2, w3); w12 = _mm_unpackhi_epi16(w8, w9); w13 = _mm_unpackhi_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store second 4-line result d[12] = _mm_unpacklo_epi64(w6, w14); d[13] = _mm_unpackhi_epi64(w6, w14); d[14] = _mm_unpacklo_epi64(w7, w15); d[15] = _mm_unpackhi_epi64(w7, w15); } static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, ptrdiff_t pitchDst) { __m128i r[16]; __m128i d[16]; for (int j = 0; j < 16; j++) { r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); } transpose16x16_sse2(r, d); for (int j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); } } static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, ptrdiff_t pitchDst, int width, int height) { for (int j = 0; j < height; j += 16) for (int i = 0; i < width; i += 16) transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i, pitchDst); } #endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ aom-3.12.1/aom_dsp/x86/intrapred_x86.h000066400000000000000000000026101477627663500173150ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ #define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ #include <emmintrin.h> // SSE2 #include "aom/aom_integer.h" #include "config/aom_config.h" static inline __m128i dc_sum_16_sse2(const uint8_t *ref) { __m128i x = _mm_load_si128((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); x = _mm_sad_epu8(x, zero); const __m128i high = _mm_unpackhi_epi64(x, x); return _mm_add_epi16(x, high); } static inline __m128i dc_sum_32_sse2(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); const __m128i zero = _mm_setzero_si128(); x0 = _mm_sad_epu8(x0, zero); x1 = _mm_sad_epu8(x1, zero); x0 = _mm_add_epi16(x0, x1); const __m128i high = _mm_unpackhi_epi64(x0, x0); return _mm_add_epi16(x0, high); } #endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ aom-3.12.1/aom_dsp/x86/loopfilter_avx2.c000066400000000000000000001303121477627663500177330ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include /* AVX2 */ #include "config/aom_dsp_rtcd.h" DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; __m128i p2, p1, p0, q0, q1, q2; __m128i mask, flat; const __m128i thresh_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); const __m128i blimit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); const __m128i zero = _mm_setzero_si128(); const __m128i ff = _mm_cmpeq_epi8(zero, zero); p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); p2 = _mm256_castsi256_si128(p256_2); p1 = _mm256_castsi256_si128(p256_1); p0 = _mm256_castsi256_si128(p256_0); q0 = _mm256_castsi256_si128(q256_0); q1 = _mm256_castsi256_si128(q256_1); q2 = _mm256_castsi256_si128(q256_2); { __m128i work; const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // loop filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i one = _mm_set1_epi8(1); __m128i hev; hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); __m128i ps1 = _mm_xor_si128(p1, t80); __m128i ps0 = _mm_xor_si128(p0, t80); __m128i qs0 = _mm_xor_si128(q0, t80); __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat_p1, flat_p0, flat_q0, flat_q1; filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, ps0); filt = _mm_adds_epi8(filt, work_a); filt 
= _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); __m128i work; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m256i four = _mm256_set1_epi16(4); __m256i pixetFilter, add, res; const __m256i filter = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); p256_2 = _mm256_shuffle_epi8(p256_2, filter); p256_1 = _mm256_shuffle_epi8(p256_1, filter); p256_0 = _mm256_shuffle_epi8(p256_0, filter); q256_0 = _mm256_shuffle_epi8(q256_0, filter); q256_1 = _mm256_shuffle_epi8(q256_1, filter); q256_2 = _mm256_shuffle_epi8(q256_2, filter); pixetFilter = _mm256_slli_epi16( _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1); pixetFilter = _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0)); pixetFilter = _mm256_add_epi16(four, pixetFilter); res = _mm256_srli_epi16(pixetFilter, 3); flat_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); p1 = _mm_andnot_si128(flat, ps1); flat_p1 = _mm_and_si128(flat, flat_p1); p1 = _mm_or_si128(flat_p1, p1); add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2), _mm256_sub_epi16(q256_0, p256_2)); pixetFilter = _mm256_add_epi16(pixetFilter, add); res = _mm256_srli_epi16(pixetFilter, 3); flat_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); p0 = _mm_andnot_si128(flat, ps0); flat_p0 = _mm_and_si128(flat, flat_p0); p0 = _mm_or_si128(flat_p0, p0); add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2), _mm256_sub_epi16(q256_1, p256_1)); pixetFilter = _mm256_add_epi16(pixetFilter, add); res = _mm256_srli_epi16(pixetFilter, 3); flat_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); q0 = _mm_andnot_si128(flat, qs0); flat_q0 = _mm_and_si128(flat, flat_q0); q0 = _mm_or_si128(flat_q0, q0); add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1), _mm256_sub_epi16(q256_2, p256_0)); pixetFilter = _mm256_add_epi16(pixetFilter, add); res = _mm256_srli_epi16(pixetFilter, 3); flat_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); q1 = _mm_andnot_si128(flat, qs1); flat_q1 = _mm_and_si128(flat, flat_q1); q1 = _mm_or_si128(flat_q1, q1); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i 
*)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } else { _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); } } } void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, flat; const __m128i thresh_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); const __m128i blimit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); const __m128i zero = _mm_setzero_si128(); const __m128i ff = _mm_cmpeq_epi8(zero, zero); p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); p3 = _mm256_castsi256_si128(p256_3); p2 = _mm256_castsi256_si128(p256_2); p1 = _mm256_castsi256_si128(p256_1); p0 = _mm256_castsi256_si128(p256_0); q0 = _mm256_castsi256_si128(q256_0); q1 = _mm256_castsi256_si128(q256_1); q2 = _mm256_castsi256_si128(q256_2); q3 = _mm256_castsi256_si128(q256_3); { __m128i work; const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // loop filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = 
_mm_set1_epi8(0x7f); const __m128i one = _mm_set1_epi8(1); __m128i hev; hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); __m128i ps1 = _mm_xor_si128(p1, t80); __m128i ps0 = _mm_xor_si128(p0, t80); __m128i qs0 = _mm_xor_si128(q0, t80); __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); __m128i work; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m256i four = _mm256_set1_epi16(4); __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p, sum_q, res_p, res_q; const __m256i filter = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); p256_3 = _mm256_shuffle_epi8(p256_3, filter); p256_2 = _mm256_shuffle_epi8(p256_2, filter); p256_1 = _mm256_shuffle_epi8(p256_1, filter); p256_0 = _mm256_shuffle_epi8(p256_0, filter); q256_0 = _mm256_shuffle_epi8(q256_0, filter); q256_1 = _mm256_shuffle_epi8(q256_1, filter); q256_2 = _mm256_shuffle_epi8(q256_2, filter); q256_3 = _mm256_shuffle_epi8(q256_3, filter); p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); pixetFilter_p2p1p0 = _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0)); pixetFilter_q2q1q0 = pixetFilter_p2p1p0; pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3); flat_p0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); p0 = _mm_andnot_si128(flat, ps0); flat_p0 = _mm_and_si128(flat, flat_p0); p0 = _mm_or_si128(flat_p0, p0); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3); flat_q0 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); q0 = 
_mm_andnot_si128(flat, qs0); flat_q0 = _mm_and_si128(flat, flat_q0); q0 = _mm_or_si128(flat_q0, q0); sum_p = _mm256_sub_epi16(p256_3, q256_2); pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3); flat_p1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); p1 = _mm_andnot_si128(flat, ps1); flat_p1 = _mm_and_si128(flat, flat_p1); p1 = _mm_or_si128(flat_p1, p1); sum_q = _mm256_sub_epi16(q256_3, p256_2); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3); flat_q1 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); q1 = _mm_andnot_si128(flat, qs1); flat_q1 = _mm_and_si128(flat, flat_q1); q1 = _mm_or_si128(flat_q1, q1); sum_p = _mm256_sub_epi16(p256_3, q256_1); pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3); flat_p2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); p2 = _mm_andnot_si128(flat, p2); flat_p2 = _mm_and_si128(flat, flat_p2); p2 = _mm_or_si128(flat_p2, p2); sum_q = _mm256_sub_epi16(q256_3, p256_1); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3); flat_q2 = _mm256_castsi256_si128( _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); q2 = _mm_andnot_si128(flat, q2); flat_q2 = _mm_and_si128(flat, flat_q2); q2 = _mm_or_si128(flat_q2, q2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } else { _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); } } } static inline void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p, unsigned char *out, int out_p, int is_store_avx2) { const __m128i x0 = _mm_loadu_si128((__m128i *)in0); const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1)); const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2)); const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3)); const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4)); const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5)); const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6)); const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7)); const __m256i y0 = _mm256_insertf128_si256( _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)), 0x1); const __m256i y1 = _mm256_insertf128_si256( _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)), 0x1); const __m256i y2 = _mm256_insertf128_si256( _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)), 0x1); const __m256i y3 = _mm256_insertf128_si256( _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)), 0x1); const __m256i y4 = _mm256_insertf128_si256( _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)), 0x1); const __m256i y5 = _mm256_insertf128_si256( _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)), 0x1); const __m256i y6 = 
_mm256_insertf128_si256( _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)), 0x1); const __m256i y7 = _mm256_insertf128_si256( _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)), 0x1); const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1); const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1); const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3); const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3); const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5); const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5); const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7); const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7); const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02); const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02); const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03); const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03); const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06); const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06); const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07); const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07); const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14); const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14); const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15); const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15); const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16); const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16); const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17); const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17); const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8); const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8); const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8); const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8); const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8); const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8); const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8); const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8); if (is_store_avx2) { _mm256_storeu_si256((__m256i *)(out), row_s01); _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23); _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45); _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67); _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89); _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011); _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213); _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415); } else { _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01)); _mm_storeu_si128((__m128i *)(out + (2 * out_p)), _mm256_castsi256_si128(row_s23)); _mm_storeu_si128((__m128i *)(out + (4 * out_p)), _mm256_castsi256_si128(row_s45)); _mm_storeu_si128((__m128i *)(out + (6 * out_p)), _mm256_castsi256_si128(row_s67)); _mm_storeu_si128((__m128i *)(out + (8 * out_p)), _mm256_castsi256_si128(row_s89)); _mm_storeu_si128((__m128i *)(out + (10 * out_p)), _mm256_castsi256_si128(row_s1011)); _mm_storeu_si128((__m128i *)(out + (12 * out_p)), _mm256_castsi256_si128(row_s1213)); _mm_storeu_si128((__m128i *)(out + (14 * out_p)), _mm256_castsi256_si128(row_s1415)); _mm_storeu_si128((__m128i *)(out + (1 * out_p)), _mm256_extracti128_si256(row_s01, 1)); _mm_storeu_si128((__m128i *)(out + (3 * out_p)), _mm256_extracti128_si256(row_s23, 1)); _mm_storeu_si128((__m128i *)(out + (5 * out_p)), _mm256_extracti128_si256(row_s45, 1)); 
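// Descriptive note: in this non-AVX2 store path the upper 128-bit lane of each transposed row pair supplies the odd destination rows; the remaining odd rows are written out below.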
_mm_storeu_si128((__m128i *)(out + (7 * out_p)), _mm256_extracti128_si256(row_s67, 1)); _mm_storeu_si128((__m128i *)(out + (9 * out_p)), _mm256_extracti128_si256(row_s89, 1)); _mm_storeu_si128((__m128i *)(out + (11 * out_p)), _mm256_extracti128_si256(row_s1011, 1)); _mm_storeu_si128((__m128i *)(out + (13 * out_p)), _mm256_extracti128_si256(row_s1213, 1)); _mm_storeu_si128((__m128i *)(out + (15 * out_p)), _mm256_extracti128_si256(row_s1415, 1)); } } void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { __m128i mask, flat; const __m128i zero = _mm_setzero_si128(); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m256i p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); __m256i p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); __m256i p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); __m256i p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); __m256i q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); __m256i q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); __m256i q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); __m256i q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); __m128i p3 = _mm256_castsi256_si128(p256_3); __m128i p2 = _mm256_castsi256_si128(p256_2); __m128i p1 = _mm256_castsi256_si128(p256_1); __m128i p0 = _mm256_castsi256_si128(p256_0); __m128i q0 = _mm256_castsi256_si128(q256_0); __m128i q1 = _mm256_castsi256_si128(q256_1); __m128i q2 = _mm256_castsi256_si128(q256_2); __m128i q3 = _mm256_castsi256_si128(q256_3); { const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); const __m128i blimit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; __m128i work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // loop filter { const __m128i thresh_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); const __m128i one = _mm_set1_epi8(1); const __m128i t3 = _mm_set1_epi8(3); const __m128i t4 = _mm_add_epi8(one, t3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const 
__m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t7f = _mm_sub_epi8(t80, one); __m128i hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); __m128i ps1 = _mm_xor_si128(p1, t80); __m128i ps0 = _mm_xor_si128(p0, t80); __m128i qs0 = _mm_xor_si128(q0, t80); __m128i qs1 = _mm_xor_si128(q1, t80); __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); __m128i work_a = _mm_subs_epi8(qs0, ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); __m128i filter1 = _mm_adds_epi8(filt, t4); __m128i filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); filt = _mm_adds_epi8(filter1, one); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); // Derive flat __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0); __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0); __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0); const __m256i ps0qs0256 = _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1); const __m256i ps1qs1256 = _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1); const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256), _mm256_subs_epu8(p0q0256, p2q2256)); const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256), _mm256_subs_epu8(p0q0256, p3q3256)); const __m256i max0_256 = _mm256_max_epu8(work01, work02); const __m128i max1_256 = _mm_max_epu8(_mm256_castsi256_si128(max0_256), _mm256_extractf128_si256(max0_256, 1)); flat = _mm_max_epu8(max1_256, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m256i flat256 = _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1); const __m256i eight = _mm256_set1_epi16(8); const __m256i four = _mm256_set1_epi16(4); __m256i p256_4 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 5 * p))); __m256i q256_4 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 4 * p))); __m256i p256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); __m256i q256_5 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); __m256i p256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); __m256i q256_6 = _mm256_castpd_si256( _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); // Derive flat2 __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0); __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0); const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0); const __m256i work1 
= _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256), _mm256_subs_epu8(p0q0256, p4q4256)); const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256), _mm256_subs_epu8(p0q0256, p5q5256)); const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256), _mm256_subs_epu8(p0q0256, p6q6256)); __m256i flat2_256 = _mm256_max_epu8(work1, work2); flat2_256 = _mm256_max_epu8(flat2_256, work3); __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256), _mm256_extractf128_si256(flat2_256, 1)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask const __m256i filter = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); p256_3 = _mm256_shuffle_epi8(p256_3, filter); p256_2 = _mm256_shuffle_epi8(p256_2, filter); p256_1 = _mm256_shuffle_epi8(p256_1, filter); p256_0 = _mm256_shuffle_epi8(p256_0, filter); q256_0 = _mm256_shuffle_epi8(q256_0, filter); q256_1 = _mm256_shuffle_epi8(q256_1, filter); q256_2 = _mm256_shuffle_epi8(q256_2, filter); q256_3 = _mm256_shuffle_epi8(q256_3, filter); const __m256i p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); const __m256i q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); __m256i pixetFilter_p2p1p0 = _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0)); __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0; // Derive p0 and q0 pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3); __m256i res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3); __m256i res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3); __m256i flat_p0q0 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256); flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0); p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256); p0 = _mm256_castsi256_si128(p0q0256); q0 = _mm256_extractf128_si256(p0q0256, 1); // Derive p1 and q1 __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2); pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3); __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3); __m256i flat_p1q1 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256); flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1); p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256); p1 = _mm256_castsi256_si128(p1q1256); q1 = _mm256_extractf128_si256(p1q1256, 1); // Derive p2 and q2 sum_p = _mm256_sub_epi16(p256_3, q256_1); pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); res_p = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3); sum_q = _mm256_sub_epi16(q256_3, p256_1); pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); res_q = _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3); __m256i flat_p2q2 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p2q2256 = _mm256_andnot_si256(flat256, p2q2256); flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2); p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256); p2 = _mm256_castsi256_si128(p2q2256); q2 = _mm256_extractf128_si256(p2q2256, 1); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { flat2_256 = 
_mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1); p256_6 = _mm256_shuffle_epi8(p256_6, filter); p256_5 = _mm256_shuffle_epi8(p256_5, filter); p256_4 = _mm256_shuffle_epi8(p256_4, filter); q256_4 = _mm256_shuffle_epi8(q256_4, filter); q256_5 = _mm256_shuffle_epi8(q256_5, filter); q256_6 = _mm256_shuffle_epi8(q256_6, filter); __m256i pixelFilter_p = _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3)); __m256i pixelFilter_q = _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0); pixelFilter_p = _mm256_add_epi16( eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); pixelFilter_q = pixelFilter_p; // Derive p0 and q0 pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p0q0 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256); flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0); p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256); p0 = _mm256_castsi256_si128(p0q0256); q0 = _mm256_extractf128_si256(p0q0256, 1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); // Derive p1 and q1 sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5), _mm256_sub_epi16(p256_2, q256_0)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5), _mm256_sub_epi16(q256_2, p256_0)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p1q1 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256); flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1); p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256); p1 = _mm256_castsi256_si128(p1q1256); q1 = _mm256_extractf128_si256(p1q1256, 1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); // Derive p2 and q2 sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4), _mm256_sub_epi16(p256_3, p256_0)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4), _mm256_sub_epi16(q256_3, q256_0)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p2q2 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256); flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2); p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256); p2 = _mm256_castsi256_si128(p2q2256); q2 = _mm256_extractf128_si256(p2q2256, 1); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); // Derive p3 and q3 sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3), _mm256_sub_epi16(p256_4, p256_1)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3), _mm256_sub_epi16(q256_4, q256_1)); pixelFilter_q = 
_mm256_add_epi16(pixelFilter_q, sum_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p3q3 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256); flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3); p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256); p3 = _mm256_castsi256_si128(p3q3256); q3 = _mm256_extractf128_si256(p3q3256, 1); _mm_storeu_si128((__m128i *)(s - 4 * p), p3); _mm_storeu_si128((__m128i *)(s + 3 * p), q3); // Derive p4 and q4 sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2), _mm256_sub_epi16(p256_5, p256_2)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2), _mm256_sub_epi16(q256_5, q256_2)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p4q4 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256); flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4); p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256); _mm_storeu_si128((__m128i *)(s - 5 * p), _mm256_castsi256_si128(p4q4256)); _mm_storeu_si128((__m128i *)(s + 4 * p), _mm256_extractf128_si256(p4q4256, 1)); // Derive p5 and q5 sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1), _mm256_sub_epi16(p256_6, p256_3)); pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); res_p = _mm256_srli_epi16(pixelFilter_p, 4); sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1), _mm256_sub_epi16(q256_6, q256_3)); pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); res_q = _mm256_srli_epi16(pixelFilter_q, 4); __m256i flat2_p5q5 = _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256); flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5); p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256); _mm_storeu_si128((__m128i *)(s - 6 * p), _mm256_castsi256_si128(p5q5256)); _mm_storeu_si128((__m128i *)(s + 5 * p), _mm256_extractf128_si256(p5q5256, 1)); } else { _mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } } else { _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); } } } void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1); // Loop filtering aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0); // Transpose back trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0); } aom-3.12.1/aom_dsp/x86/loopfilter_sse2.c000066400000000000000000003536431477627663500177450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <emmintrin.h>  // SSE2 #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" #include "aom_dsp/x86/lpf_common_sse2.h" static inline __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } // this function treats its input as 2 parallel 8x4 matrices, transposes each of // them to 4x8 independently while flipping the second matrix horizontally. // Used for 14 taps pq pairs creation static inline void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *q0p0, __m128i *q1p1, __m128i *q2p2, __m128i *q3p3, __m128i *q4p4, __m128i *q5p5, __m128i *q6p6, __m128i *q7p7) { __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 w2 = _mm_unpackhi_epi8( *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 w3 = _mm_unpackhi_epi8( *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 ww0 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi16( w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 ww2 = _mm_unpacklo_epi16( w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 ww3 = _mm_unpackhi_epi16( w2, w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 *q7p7 = _mm_unpacklo_epi32( ww0, _mm_srli_si128( ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx *q6p6 = _mm_unpackhi_epi32( _mm_slli_si128(ww0, 4), ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx *q5p5 = _mm_unpackhi_epi32( ww0, _mm_slli_si128( ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx *q4p4 = _mm_unpacklo_epi32( _mm_srli_si128(ww0, 12), ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx *q3p3 = _mm_unpacklo_epi32( ww1, _mm_srli_si128( ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx *q2p2 = _mm_unpackhi_epi32( _mm_slli_si128(ww1, 4), ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx *q1p1 = _mm_unpackhi_epi32( ww1, _mm_slli_si128( ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx *q0p0 = _mm_unpacklo_epi32( _mm_srli_si128(ww1, 12), ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx } // this function treats its input as 2 parallel 8x4 matrices, transposes each of // them independently while flipping the second matrix horizontally. Used for 14 // taps filter pq pairs inverse static inline void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *pq0, __m128i *pq1, __m128i *pq2, __m128i *pq3) { __m128i w10, w11, w12, w13; __m128i w0, w1, w2, w3, w4, w5; __m128i d0, d1, d2, d3; w0 = _mm_unpacklo_epi8( *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 w2 = _mm_unpacklo_epi8( *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 w3 = _mm_unpacklo_epi8( *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 w4 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 w5 = _mm_unpacklo_epi16( w2, w3); // 40 50 60 70 41 51
61 71 42 52 62 72 43 53 63 73 d0 = _mm_unpacklo_epi32( w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 d2 = _mm_unpackhi_epi32( w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 w10 = _mm_unpacklo_epi8( *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 w11 = _mm_unpacklo_epi8( *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 w12 = _mm_unpacklo_epi8( *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 w13 = _mm_unpacklo_epi8( *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 w4 = _mm_unpackhi_epi16( w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 w5 = _mm_unpackhi_epi16( w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 d1 = _mm_unpacklo_epi32( w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 d3 = _mm_unpackhi_epi32( w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 *pq0 = _mm_unpacklo_epi64(d0, d1); // pq *pq1 = _mm_unpackhi_epi64(d0, d1); // pq *pq2 = _mm_unpacklo_epi64(d2, d3); // pq *pq3 = _mm_unpackhi_epi64(d2, d3); // pq } static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { __m128i filter, filter2filter1, work; __m128i ps1ps0_work, qs1qs0_work; __m128i hev1; const __m128i t3t4 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); const __m128i t80 = _mm_set1_epi8((char)0x80); const __m128i ff = _mm_cmpeq_epi8(t80, t80); ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ qs1qs0_work = _mm_xor_si128(*q1q0, t80); /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ filter = _mm_subs_epi8(filter, work); filter = _mm_subs_epi8(filter, work); filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ filter = _mm_and_si128(filter, *mask); /* & mask */ filter = _mm_unpacklo_epi32(filter, filter); /* filter1 = signed_char_clamp(filter + 4) >> 3; */ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit filter = _mm_srai_epi16(filter, 9); /* round */ filter = _mm_packs_epi16(filter, filter); filter = _mm_andnot_si128(*hev, filter); filter = _mm_unpacklo_epi32(filter, filter); filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); hev1 = _mm_srli_si128(filter2filter1, 8); /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ } static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); const __m128i t80 = _mm_set1_epi8((char)0x80); __m128i filter, filter2filter1, work; __m128i ps1ps0_work, 
qs1qs0_work; __m128i hev1; const __m128i ff = _mm_cmpeq_epi8(t80, t80); ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ qs1qs0_work = _mm_xor_si128(*q1q0, t80); /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ filter = _mm_subs_epi8(filter, work); filter = _mm_subs_epi8(filter, work); filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ filter = _mm_and_si128(filter, *mask); /* & mask */ filter = _mm_unpacklo_epi64(filter, filter); /* filter1 = signed_char_clamp(filter + 4) >> 3; */ /* filter2 = signed_char_clamp(filter + 3) >> 3; */ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ filter = _mm_srai_epi16(filter, 11); /* >> 3 */ filter2filter1 = _mm_packs_epi16(filter2filter1, filter); /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ filter = _mm_unpacklo_epi8(filter, filter); filter = _mm_srai_epi16(filter, 9); /* round */ filter = _mm_packs_epi16(filter, filter); filter = _mm_andnot_si128(*hev, filter); hev1 = _mm_unpackhi_epi64(filter2filter1, filter); filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ } static AOM_FORCE_INLINE void lpf_internal_4_sse2( __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { __m128i q1p1, q0p0, p1p0, q1q0; __m128i abs_p0q0, abs_p1q1; __m128i mask, flat, hev; const __m128i zero = _mm_setzero_si128(); q1p1 = _mm_unpacklo_epi32(*p1, *q1); q0p0 = _mm_unpacklo_epi32(*p0, *q0); p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); q1q0 = _mm_srli_si128(p1p0, 8); /* (abs(q1 - q0), abs(p1 - p0) */ flat = abs_diff(q1p1, q0p0); /* abs(p1 - q1), abs(p0 - q0) */ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); hev = _mm_unpacklo_epi8(flat, zero); hev = _mm_cmpgt_epi16(hev, *thresh); hev = _mm_packs_epi16(hev, hev); hev = _mm_unpacklo_epi32(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); mask = _mm_unpacklo_epi32(mask, flat); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); } static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { __m128i q1p1, q0p0, p1p0, q1q0; __m128i abs_p0q0, 
abs_p1q1; __m128i mask, hev; const __m128i zero = _mm_setzero_si128(); q1p1 = _mm_unpacklo_epi64(*p1, *q1); q0p0 = _mm_unpacklo_epi64(*p0, *q0); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); /* (abs(q1 - q0), abs(p1 - p0) */ __m128i flat = abs_diff(q1p1, q0p0); /* abs(p1 - q1), abs(p0 - q0) */ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); hev = _mm_unpacklo_epi8(flat, zero); hev = _mm_cmpgt_epi16(hev, *thresh); hev = _mm_packs_epi16(hev, hev); /* const int8_t mask = filter_mask2(*limit, *blimit, */ /* p1, p0, q0, q1); */ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); mask = _mm_unpacklo_epi64(mask, flat); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); } void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { const __m128i zero = _mm_setzero_si128(); __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); __m128i qs1qs0, ps1ps0; __m128i p1, p0, q0, q1; p1 = xx_loadl_32(s - 2 * p); p0 = xx_loadl_32(s - 1 * p); q0 = xx_loadl_32(s - 0 * p); q1 = xx_loadl_32(s + 1 * p); lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); xx_storel_32(s - 1 * p, ps1ps0); xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); xx_storel_32(s + 0 * p, qs1qs0); xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { __m128i p1p0, q1q0; __m128i p1, p0, q0, q1; const __m128i zero = _mm_setzero_si128(); __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); __m128i x0, x1, x2, x3; __m128i d0, d1, d2, d3; x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 p1 = _mm_srli_si128(p1p0, 4); q1 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); xx_storel_32(s + 0 * p - 2, d0); xx_storel_32(s + 1 * p - 2, d1); xx_storel_32(s + 2 * p - 2, d2); xx_storel_32(s + 3 * p - 2, d3); } static inline void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { xx_storel_32(s - (num + 1) * p, x); xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); } static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i 
zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i mask, hev, flat, flat2; __m128i qs0ps0, qs1ps1; __m128i p1p0, q1q0, qs1qs0, ps1ps0; __m128i abs_p1p0; p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); { __m128i abs_p1q1, abs_p0q0, abs_q1q0; __m128i fe, ff, work; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter - the same for 6, 8 and 14 versions filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); // loopfilter done __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; __m128i work; flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // if flat ==0 then flat2 is zero as well and we don't need any calc below // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; __m128i pixelFilter_p, pixelFilter_q; __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p6, sum_q6; __m128i sum_p3, sum_q3, res_p, res_q; p6_16 = _mm_unpacklo_epi8(*q6p6, zero); p5_16 = _mm_unpacklo_epi8(*q5p5, zero); p4_16 = _mm_unpacklo_epi8(*q4p4, zero); p3_16 = _mm_unpacklo_epi8(*q3p3, zero); p2_16 = _mm_unpacklo_epi8(*q2p2, zero); p1_16 = _mm_unpacklo_epi8(*q1p1, zero); p0_16 = _mm_unpacklo_epi8(*q0p0, zero); q0_16 = _mm_unpackhi_epi8(*q0p0, zero); q1_16 = _mm_unpackhi_epi8(*q1p1, zero); q2_16 = _mm_unpackhi_epi8(*q2p2, zero); q3_16 = _mm_unpackhi_epi8(*q3p3, zero); q4_16 = _mm_unpackhi_epi8(*q4p4, zero); q5_16 = _mm_unpackhi_epi8(*q5p5, zero); q6_16 = _mm_unpackhi_epi8(*q6p6, zero); pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, 
_mm_add_epi16(q2_16, q1_16)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm_add_epi16( four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), _mm_add_epi16(p1_16, q0_16))), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), _mm_add_epi16(p0_16, q1_16))), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); sum_p6 = _mm_add_epi16(p6_16, p6_16); sum_q6 = _mm_add_epi16(q6_16, q6_16); sum_p3 = _mm_add_epi16(p3_16, p3_16); sum_q3 = _mm_add_epi16(q3_16, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); res_p = _mm_srli_epi16( _mm_add_epi16( pixelFilter_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), 4); res_q = _mm_srli_epi16( _mm_add_epi16( pixelFilter_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); sum_p3 = _mm_add_epi16(sum_p3, p3_16); sum_q3 = _mm_add_epi16(sum_q3, q3_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); // work with flat2 flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); work = abs_diff(*q6p6, *q0p0); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat = _mm_unpacklo_epi64(flat, flat); *q2p2 = _mm_andnot_si128(flat, *q2p2); flat_q2p2 = _mm_and_si128(flat, flat_q2p2); *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); qs1ps1 = _mm_andnot_si128(flat, qs1ps1); flat_q1p1 = _mm_and_si128(flat, flat_q1p1); *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); qs0ps0 = _mm_andnot_si128(flat, qs0ps0); flat_q0p0 = _mm_and_si128(flat, flat_q0p0); *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); sum_p6 = _mm_add_epi16(sum_p6, p6_16); sum_q6 = _mm_add_epi16(sum_q6, q6_16); res_p = _mm_srli_epi16( _mm_add_epi16( pixelFilter_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), 4); res_q = _mm_srli_epi16( _mm_add_epi16( pixelFilter_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); 
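// Descriptive note: the remaining wide-filter outputs below keep pixelFilter_p / pixelFilter_q as running sums, updated incrementally (a subtract plus a few adds per output row) rather than recomputing the full sum for every tap.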
sum_p6 = _mm_add_epi16(sum_p6, p6_16); sum_q6 = _mm_add_epi16(sum_q6, q6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); res_p = _mm_srli_epi16( _mm_add_epi16( pixelFilter_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), 4); res_q = _mm_srli_epi16( _mm_add_epi16( pixelFilter_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p6 = _mm_add_epi16(sum_p6, p6_16); sum_q6 = _mm_add_epi16(sum_q6, q6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); res_p = _mm_srli_epi16( _mm_add_epi16( pixelFilter_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), 4); res_q = _mm_srli_epi16( _mm_add_epi16( pixelFilter_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p6 = _mm_add_epi16(sum_p6, p6_16); sum_q6 = _mm_add_epi16(sum_q6, q6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16( pixelFilter_p, _mm_add_epi16(sum_p6, _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), 4); res_q = _mm_srli_epi16( _mm_add_epi16( pixelFilter_q, _mm_add_epi16(sum_q6, _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat2 = _mm_unpacklo_epi64(flat2, flat2); *q5p5 = _mm_andnot_si128(flat2, *q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); *q4p4 = _mm_andnot_si128(flat2, *q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); *q3p3 = _mm_andnot_si128(flat2, *q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); *q2p2 = _mm_andnot_si128(flat2, *q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); *q1p1 = _mm_andnot_si128(flat2, *q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); *q0p0 = _mm_andnot_si128(flat2, *q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); } } else { *q0p0 = qs0ps0; *q1p1 = qs1ps1; } } static AOM_FORCE_INLINE void lpf_internal_14_sse2( __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i mask, hev, flat, flat2; __m128i flat2_pq[6], flat_pq[3]; __m128i qs0ps0, qs1ps1; __m128i p1p0, q1q0, qs1qs0, ps1ps0; __m128i abs_p1p0; p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); q1q0 = _mm_srli_si128(p1p0, 8); __m128i fe, ff, work; { __m128i abs_p1q1, abs_p0q0, abs_q1q0; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(fe, fe); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi32(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, 
abs_p1q1), *blimit); mask = _mm_unpacklo_epi32(mask, zero); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter - the same for 6, 8 and 14 versions filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); qs1ps1 = _mm_srli_si128(qs0ps0, 8); // loopfilter done flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); flat = _mm_unpacklo_epi32(flat, flat); flat = _mm_unpacklo_epi64(flat, flat); // if flat ==0 then flat2 is zero as well and we don't need any calc below // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; __m128i pq_16[7]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i sum_p6; __m128i sum_p3; pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); q0_16 = _mm_srli_si128(pq_16[0], 8); q1_16 = _mm_srli_si128(pq_16[1], 8); q2_16 = _mm_srli_si128(pq_16[2], 8); q3_16 = _mm_srli_si128(pq_16[3], 8); q4_16 = _mm_srli_si128(pq_16[4], 8); q5_16 = _mm_srli_si128(pq_16[5], 8); __m128i flat_p[3], flat_q[3]; __m128i flat2_p[6], flat2_q[6]; __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); sum_p = _mm_sub_epi16(sum_p_0, q5_16); work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); work0_1 = _mm_add_epi16( sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); sum_lp = _mm_sub_epi16(sum_lp, q2_16); work0 = _mm_add_epi16(sum_p3, pq_16[1]); flat_p[1] = _mm_add_epi16(sum_lp, work0); flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); sum_lp = _mm_sub_epi16(sum_lp, q1_16); sum_lq = 
_mm_sub_epi16(sum_lq, pq_16[1]); sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); work0 = _mm_add_epi16(sum_p3, pq_16[2]); flat_p[2] = _mm_add_epi16(sum_lp, work0); flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); work = abs_diff(*q6p6, *q0p0); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask flat2 = _mm_unpacklo_epi32(flat2, flat2); // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qs0ps0 = _mm_andnot_si128(flat, qs0ps0); flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); qs1ps1 = _mm_andnot_si128(flat, qs1ps1); flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); *q2p2 = _mm_andnot_si128(flat, *q2p2); flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); flat2_q[0] = _mm_add_epi16( sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); flat2_p[1] = _mm_add_epi16(sum_p, work0_1); flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); sum_p = _mm_sub_epi16(sum_p, q4_16); sum_q = _mm_sub_epi16(sum_q, pq_16[4]); sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); work0 = _mm_add_epi16( sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); flat2_p[2] = _mm_add_epi16(sum_p, work0); flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); sum_p = _mm_sub_epi16(sum_p, q3_16); sum_q = _mm_sub_epi16(sum_q, pq_16[3]); work0 = _mm_add_epi16( sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); flat2_p[3] = _mm_add_epi16(sum_p, work0); flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); sum_p = _mm_sub_epi16(sum_p, q2_16); sum_q = _mm_sub_epi16(sum_q, pq_16[2]); work0 = _mm_add_epi16( sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); flat2_p[4] = _mm_add_epi16(sum_p, work0); flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); sum_p = _mm_sub_epi16(sum_p, q1_16); sum_q = _mm_sub_epi16(sum_q, pq_16[1]); work0 = _mm_add_epi16( sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); flat2_p[5] = _mm_add_epi16(sum_p, work0); flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], 
flat2_q[5]), 4); flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ *q0p0 = _mm_andnot_si128(flat2, *q0p0); flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); *q1p1 = _mm_andnot_si128(flat2, *q1p1); flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); *q2p2 = _mm_andnot_si128(flat2, *q2p2); flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); *q3p3 = _mm_andnot_si128(flat2, *q3p3); flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); *q4p4 = _mm_andnot_si128(flat2, *q4p4); flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); *q5p5 = _mm_andnot_si128(flat2, *q5p5); flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); } } else { *q0p0 = qs0ps0; *q1p1 = qs1ps1; } } void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); store_buffer_horz_8(q0p0, p, 0, s); store_buffer_horz_8(q1p1, p, 1, s); store_buffer_horz_8(q2p2, p, 2, s); store_buffer_horz_8(q3p3, p, 3, s); store_buffer_horz_8(q4p4, p, 4, s); store_buffer_horz_8(q5p5, p, 5, s); } static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; __m128i ps1ps0, qs1qs0; q2p2 = _mm_unpacklo_epi64(*p2, *q2); q1p1 = _mm_unpacklo_epi64(*p1, *q1); q0p0 = _mm_unpacklo_epi64(*p0, *q0); *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { // filter_mask and hev_mask __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); abs_p0q0 = abs_diff(*p1p0, *q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); // considering sse doesn't have unsigned elements comparison the idea is // to find at least one case when X > limit, it means the corresponding // mask bit is set. 
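// (In scalar terms, the per-pixel decision being emulated here is roughly:
//    filter = !(abs(p1 - p0) > limit) && !(abs(q1 - q0) > limit) &&
//             !(abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
//    hev    = abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
//  This is an illustrative sketch only, not the exact scalar reference code.)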
// to achieve that we find global max value of all inputs of abs(x-y) or // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set // otherwise - not flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = abs_diff(q2p2, q1p1); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); // lp filter - the same for 6, 8 and 14 versions filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // flat_mask flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); } // 5 tap filter // need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); __m128i workp_a, workp_b, workp_shft0, workp_shft1; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); q0_16 = _mm_unpacklo_epi8(*q0, zero); q1_16 = _mm_unpacklo_epi8(*q1, zero); q2_16 = _mm_unpacklo_epi8(*q2, zero); // op1 workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p2_16); // p2 + p0 * 2 + p1 * 2 + 4 workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 workp_a = _mm_add_epi16(workp_a, workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 workp_shft1 = _mm_srli_epi16(workp_a, 3); flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); // oq0 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 workp_b = _mm_add_epi16(q1_16, q2_16); workp_a = _mm_add_epi16( workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(q2_16, q2_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); qs1qs0 = _mm_andnot_si128(flat, *q1q0); *q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0 = _mm_or_si128(qs1qs0, *q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0); *p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } } static AOM_FORCE_INLINE void lpf_internal_6_sse2( __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i q2p2, 
q1p1, q0p0, flat_p1p0, flat_q0q1; __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; __m128i ps1ps0, qs1qs0; q2p2 = _mm_unpacklo_epi32(*p2, *q2); q1p1 = _mm_unpacklo_epi32(*p1, *q1); q0p0 = _mm_unpacklo_epi32(*p0, *q0); *p1p0 = _mm_unpacklo_epi32(*p0, *p1); *q1q0 = _mm_unpacklo_epi32(*q0, *q1); const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { // filter_mask and hev_mask __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); abs_p0q0 = abs_diff(*p1p0, *q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); // considering sse doesn't have unsigned elements comparison the idea is // to find at least one case when X > limit, it means the corresponding // mask bit is set. // to achieve that we find global max value of all inputs of abs(x-y) or // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set // otherwise - not flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi32(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_unpacklo_epi32(mask, zero); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = abs_diff(q2p2, q1p1); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); // lp filter - the same for 6, 8 and 14 versions filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // flat_mask flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi32(flat, flat); flat = _mm_unpacklo_epi64(flat, flat); } // 5 tap filter // need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); __m128i workp_a, workp_b, workp_c; __m128i pq0x2_pq1, pq1_pq2; pq2_16 = _mm_unpacklo_epi8(q2p2, zero); pq1_16 = _mm_unpacklo_epi8(q1p1, zero); pq0_16 = _mm_unpacklo_epi8(q0p0, zero); q0_16 = _mm_srli_si128(pq0_16, 8); q2_16 = _mm_srli_si128(pq2_16, 8); // op1 pq0x2_pq1 = _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); workp_b = _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 workp_a = _mm_add_epi16(workp_a, workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 workp_b = _mm_unpacklo_epi64(workp_a, workp_b); workp_b = _mm_srli_epi16(workp_b, 3); flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); // oq0 workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 workp_b = _mm_srli_si128(pq1_pq2, 8); workp_a = _mm_add_epi16( workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + 
q1 * 2 + q2 + 4 // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(q2_16, q2_16); workp_b = _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 workp_a = _mm_unpacklo_epi64(workp_a, workp_b); workp_a = _mm_srli_epi16(workp_a, 3); flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); qs1qs0 = _mm_andnot_si128(flat, *q1q0); *q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0 = _mm_or_si128(qs1qs0, *q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0); *p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } } void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i p2, p1, p0, q0, q1, q2; __m128i p1p0, q1q0; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); p2 = xx_loadl_32(s - 3 * p); p1 = xx_loadl_32(s - 2 * p); p0 = xx_loadl_32(s - 1 * p); q0 = xx_loadl_32(s - 0 * p); q1 = xx_loadl_32(s + 1 * p); q2 = xx_loadl_32(s + 2 * p); lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); } void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1) { __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), _mm_load_si128((__m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), _mm_load_si128((__m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), _mm_load_si128((__m128i *)_thresh1)); __m128i p2, p1, p0, q0, q1, q2; __m128i p1p0, q1q0; p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); } static AOM_FORCE_INLINE void lpf_internal_8_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, flat_p1p0, flat_q0q1; __m128i q2p2, q1p1, q0p0; __m128i q1q0, p1p0, ps1ps0, qs1qs0; __m128i work_pq, opq2, pq2; q3p3 = _mm_unpacklo_epi32(*p3, *q3); q2p2 = _mm_unpacklo_epi32(*p2, *q2); q1p1 = _mm_unpacklo_epi32(*p1, *q1); q0p0 = _mm_unpacklo_epi32(*p0, *q0); p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 q1q0 = _mm_srli_si128(p1p0, 8); // filter_mask and hev_mask // considering sse doesn't have unsigned elements comparison the idea is to // find at least one case when X > limit, it means 
the corresponding mask // bit is set. // to achieve that we find global max value of all inputs of abs(x-y) or // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set // otherwise - not const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi32(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_unpacklo_epi32(mask, zero); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); // lp filter - the same for 6, 8 and 14 versions filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi32(flat, flat); flat = _mm_unpacklo_epi64(flat, flat); // filter8 need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); q0_16 = _mm_unpacklo_epi8(*q0, zero); q1_16 = _mm_unpacklo_epi8(*q1, zero); q2_16 = _mm_unpacklo_epi8(*q2, zero); p3_16 = _mm_unpacklo_epi8(*p3, zero); q3_16 = _mm_unpacklo_epi8(*q3, zero); // op2 workp_a = _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); workp_shft2 = _mm_add_epi16(workp_a, workp_b); // op1 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); workp_c = _mm_add_epi16(workp_a, workp_b); // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); workp_d = _mm_add_epi16(workp_a, workp_b); // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); workp_c = _mm_unpacklo_epi64(workp_d, workp_c); workp_c = _mm_srli_epi16(workp_c, 3); flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); // oq0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); workp_c = _mm_add_epi16(workp_a, workp_b); // oq1 workp_a = 
_mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); workp_d = _mm_add_epi16(workp_a, workp_b); // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); workp_c = _mm_unpacklo_epi64(workp_c, workp_d); workp_c = _mm_srli_epi16(workp_c, 3); flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); workp_shft1 = _mm_add_epi16(workp_a, workp_b); workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); workp_c = _mm_srli_epi16(workp_c, 3); opq2 = _mm_packus_epi16(workp_c, workp_c); work_pq = _mm_andnot_si128(flat, q2p2); pq2 = _mm_and_si128(flat, opq2); *p2 = _mm_or_si128(work_pq, pq2); *q2 = _mm_srli_si128(*p2, 4); qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0_out = _mm_or_si128(qs1qs0, q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0_out = _mm_or_si128(ps1ps0, p1p0); } } static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, flat_p1p0, flat_q0q1; __m128i q2p2, q1p1, q0p0; __m128i q1q0, p1p0, ps1ps0, qs1qs0; __m128i work_pq, opq2, pq2; q3p3 = _mm_unpacklo_epi64(*p3, *q3); q2p2 = _mm_unpacklo_epi64(*p2, *q2); q1p1 = _mm_unpacklo_epi64(*p1, *q1); q0p0 = _mm_unpacklo_epi64(*p0, *q0); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); { // filter_mask and hev_mask // considering sse doesn't have unsigned elements comparison the idea is to // find at least one case when X > limit, it means the corresponding mask // bit is set. 
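// (For reference, the "flat" decision computed further down in this function
//  corresponds roughly to the scalar test
//    flat = abs(p1 - p0) <= 1 && abs(q1 - q0) <= 1 &&
//           abs(p2 - p0) <= 1 && abs(q2 - q0) <= 1 &&
//           abs(p3 - p0) <= 1 && abs(q3 - q0) <= 1;
//  where a true value selects the 8-tap smoothing path. Illustrative sketch
//  only, not the exact scalar reference code.)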
// to achieve that we find global max value of all inputs of abs(x-y) or // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set // otherwise - not const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); // replicate for the further "merged variables" usage hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); // lp filter - the same for 6, 8 and 14 versions filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); } // filter8 need it only if flat !=0 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); q0_16 = _mm_unpacklo_epi8(*q0, zero); q1_16 = _mm_unpacklo_epi8(*q1, zero); q2_16 = _mm_unpacklo_epi8(*q2, zero); p3_16 = _mm_unpacklo_epi8(*p3, zero); q3_16 = _mm_unpacklo_epi8(*q3, zero); // op2 workp_a = _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op1 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); // oq0 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // oq1 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, 
p1_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); work_pq = _mm_andnot_si128(flat, q2p2); pq2 = _mm_and_si128(flat, opq2); *p2 = _mm_or_si128(work_pq, pq2); *q2 = _mm_srli_si128(*p2, 8); qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); q1q0 = _mm_and_si128(flat, flat_q0q1); *q1q0_out = _mm_or_si128(qs1qs0, q1q0); ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); p1p0 = _mm_and_si128(flat, flat_p1p0); *p1p0_out = _mm_or_si128(ps1ps0, p1p0); } } void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); p3 = xx_loadl_32(s - 4 * p); p2 = xx_loadl_32(s - 3 * p); p1 = xx_loadl_32(s - 2 * p); p0 = xx_loadl_32(s - 1 * p); q0 = xx_loadl_32(s - 0 * p); q1 = xx_loadl_32(s + 1 * p); q2 = xx_loadl_32(s + 2 * p); q3 = xx_loadl_32(s + 3 * p); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); xx_storel_32(s - 3 * p, p2); xx_storel_32(s + 2 * p, q2); } void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1) { __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const __m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), _mm_load_si128((const __m128i *)_thresh1)); q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), _mm_loadl_epi64((__m128i *)(s + 4 * p))); q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), _mm_loadl_epi64((__m128i *)(s + 3 * p))); q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), _mm_loadl_epi64((__m128i *)(s + 2 * p))); q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), _mm_loadl_epi64((__m128i *)(s + 1 * p))); q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), _mm_loadl_epi64((__m128i *)(s - 0 * p))); q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), _mm_loadl_epi64((__m128i *)(s + 5 * p))); q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), _mm_loadl_epi64((__m128i *)(s + 6 * p))); lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); _mm_storel_epi64((__m128i *)(s - 5 * p), 
q4p4); _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); } void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1) { __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), _mm_load_si128((__m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), _mm_load_si128((__m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), _mm_load_si128((__m128i *)_thresh1)); __m128i p2, p1, p0, q0, q1, q2, p3, q3; __m128i q1q0, p1p0; p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); _mm_storel_epi64((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1) { __m128i p1, p0, q0, q1; __m128i qs1qs0, ps1ps0; p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); const __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const __m128i *)_limit1)); __m128i l = _mm_unpacklo_epi64(blimit, limit); __m128i thresh0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); __m128i thresh1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); } void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1) { __m128i p0, q0, q1, p1; __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i qs1qs0, ps1ps0; const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); const __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const 
__m128i *)_limit1)); __m128i l = _mm_unpacklo_epi64(blimit, limit); __m128i thresh0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); __m128i thresh1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); x0 = _mm_loadl_epi64((__m128i *)((s - 2))); x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, &q1); lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); p1 = _mm_srli_si128(ps1ps0, 8); q1 = _mm_srli_si128(qs1qs0, 8); transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); xx_storel_32((s - 2 + 0 * p), d0); xx_storel_32((s - 2 + 1 * p), d1); xx_storel_32((s - 2 + 2 * p), d2); xx_storel_32((s - 2 + 3 * p), d3); xx_storel_32((s - 2 + 4 * p), d4); xx_storel_32((s - 2 + 5 * p), d5); xx_storel_32((s - 2 + 6 * p), d6); xx_storel_32((s - 2 + 7 * p), d7); } void aom_lpf_vertical_6_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i x2, x1, x0, x3; __m128i p0, q0; __m128i p1p0, q1q0; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 4); q0 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); xx_storel_32(s + 0 * p - 2, d0); xx_storel_32(s + 1 * p - 2, d1); xx_storel_32(s + 2 * p - 2, d2); xx_storel_32(s + 3 * p - 2, d3); } void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1) { __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), _mm_load_si128((__m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), _mm_load_si128((__m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), _mm_load_si128((__m128i *)_thresh1)); __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i p0, q0; __m128i p1p0, q1q0; __m128i d0d1, d2d3, d4d5, d6d7; x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, &d6d7); d1 = _mm_srli_si128(d0d1, 8); d3 
= _mm_srli_si128(d2d3, 8); d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); xx_storel_32((s - 2 + 0 * p), d0); xx_storel_32((s - 2 + 1 * p), d1); xx_storel_32((s - 2 + 2 * p), d2); xx_storel_32((s - 2 + 3 * p), d3); xx_storel_32((s - 2 + 4 * p), d4); xx_storel_32((s - 2 + 5 * p), d5); xx_storel_32((s - 2 + 6 * p), d6); xx_storel_32((s - 2 + 7 * p), d7); } void aom_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; __m128i p0, q0; __m128i x2, x1, x0, x3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); // Loop filtering lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 4); q0 = _mm_srli_si128(q1q0, 4); transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); } void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1) { __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), _mm_load_si128((__m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), _mm_load_si128((__m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), _mm_load_si128((__m128i *)_thresh1)); __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d1, d3, d5, d7; __m128i q1q0, p1p0; __m128i p1, q1; __m128i d0d1, d2d3, d4d5, d6d7; x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, &d6d7); d1 = _mm_srli_si128(d0d1, 8); d3 = _mm_srli_si128(d2d3, 8); d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, &p1p0, &blimit, &limit, &thresh); p1 = _mm_srli_si128(p1p0, 8); q1 = _mm_srli_si128(q1q0, 8); transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, &d2d3, &d4d5, &d6d7); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); _mm_storel_epi64((__m128i *)(s - 4 + 3 * 
p), _mm_srli_si128(d2d3, 8)); _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); } void aom_lpf_vertical_14_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; __m128i x6, x5, x4, x3; __m128i pq0, pq1, pq2, pq3; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, &q5p5, &q6p6, &q7p7); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &pq0, &pq1, &pq2, &pq3); _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); } void aom_lpf_vertical_14_dual_sse2( unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1) { __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; __m128i x7, x6, x5, x4, x3, x2, x1, x0; __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; __m128i q0, q1, q2, q3, q7; __m128i p0p1, p2p3, p4p5, p6p7; __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const __m128i *)_limit1)); __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), _mm_load_si128((const __m128i *)_thresh1)); x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8)); q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); q7 = _mm_srli_si128(d14d15, 8); lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); x0 = _mm_srli_si128(q0p0, 8); x1 = _mm_srli_si128(q1p1, 8); x2 = _mm_srli_si128(q2p2, 8); x3 = _mm_srli_si128(q3p3, 8); x4 = _mm_srli_si128(q4p4, 8); x5 = _mm_srli_si128(q5p5, 8); x6 = _mm_srli_si128(q6p6, 8); transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, 
&q2p2, &q1p1, &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); } static inline __m128i filter_add2_sub2(const __m128i *const total, const __m128i *const a1, const __m128i *const a2, const __m128i *const s1, const __m128i *const s2) { __m128i x = _mm_add_epi16(*a1, *total); x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); return x; } static inline __m128i filter8_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f8_lo, const __m128i *const f8_hi) { const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); const __m128i result = _mm_and_si128(*flat, f8); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } static inline __m128i filter16_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f_lo, const __m128i *const f_hi) { const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); const __m128i result = _mm_and_si128(*flat, f); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); __m128i mask, hev, flat, flat2; __m128i p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q6, q5; __m128i op2, op1, op0, oq0, oq1, oq2; __m128i max_abs_p1p0q1q0; p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); __m128i work; max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = 
_mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; { __m128i work; work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); flat = _mm_max_epu8(work, max_abs_p1p0q1q0); work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); flat = _mm_max_epu8(work, flat); work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); flat2 = _mm_max_epu8(work, flat2); work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter4 { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ff = _mm_cmpeq_epi8(t4, t4); __m128i filt; __m128i work_a; __m128i filter1, filter2; op1 = _mm_xor_si128(p1, t80); op0 = _mm_xor_si128(p0, t80); oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); work_a = _mm_subs_epi8(oq0, op0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter8 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); 
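// The quad kernel filters 16 pixels per row, so each byte vector is widened
// to 16-bit words in a low and a high half before the tap sums are formed.
// Working from the additions below, the first smoothed output is
//   op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
// and each subsequent output slides the window with filter_add2_sub2()
// (add the two incoming taps, subtract the two outgoing ones); see
// filter8_mask() above for the rounding shift and the flat-mask blend.
// Illustrative summary only.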
const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); __m128i f8_lo, f8_hi; f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), _mm_add_epi16(p3_lo, p2_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), _mm_add_epi16(p2_lo, p1_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), _mm_add_epi16(p3_hi, p2_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), _mm_add_epi16(p2_hi, p1_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // wide flat calculations if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { const __m128i eight = _mm_set1_epi16(8); const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); __m128i f_lo; __m128i f_hi; f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo); f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo); f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo); f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), _mm_add_epi16(p2_lo, p1_lo)); f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); f_lo = _mm_add_epi16(f_lo, eight); f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi); f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi); f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi); f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), _mm_add_epi16(p2_hi, p1_hi)); f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); f_hi = _mm_add_epi16(f_hi, eight); p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 6 * p), p5); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi); p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 5 * p), p4); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo); f_hi = 
filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi); p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 4 * p), p3); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi); op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 3 * p), op2); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi); op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 2 * p), op1); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi); op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi); oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi); oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi); oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi); q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s + 3 * p), q3); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi); q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s + 4 * p), q4); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi); q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); _mm_storeu_si128((__m128i *)(s + 5 * p), q5); } else { _mm_storeu_si128((__m128i *)(s - 3 * p), op2); _mm_storeu_si128((__m128i *)(s - 2 * p), op1); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); } } else { _mm_storeu_si128((__m128i *)(s - 2 * p), op1); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); } } } void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i op2, op1, op0, oq0, oq1, oq2; __m128i max_abs_p1p0q1q0; p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = 
_mm_loadu_si128((__m128i *)(s + 3 * p)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); __m128i work; max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; { __m128i work; work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); flat = _mm_max_epu8(work, max_abs_p1p0q1q0); work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter4 { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ff = _mm_cmpeq_epi8(t4, t4); __m128i filt; __m128i work_a; __m128i filter1, filter2; op1 = _mm_xor_si128(p1, t80); op0 = _mm_xor_si128(p0, t80); oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); work_a = _mm_subs_epi8(oq0, op0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter8 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); const __m128i p1_lo = _mm_unpacklo_epi8(p1, 
zero); const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); __m128i f8_lo, f8_hi; f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), _mm_add_epi16(p3_lo, p2_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), _mm_add_epi16(p2_lo, p1_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), _mm_add_epi16(p3_hi, p2_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), _mm_add_epi16(p2_hi, p1_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 3 * p), op2); f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 2 * p), op1); f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); } else { _mm_storeu_si128((__m128i *)(s - 2 * p), op1); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); } } } void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); __m128i mask, hev, flat; __m128i p2, p1, p0, q0, q1, q2; __m128i op1, op0, oq0, oq1; __m128i max_abs_p1p0q1q0; p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = 
_mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); __m128i work; max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1)); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; { __m128i work; work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); flat = _mm_max_epu8(work, max_abs_p1p0q1q0); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter4 { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ff = _mm_cmpeq_epi8(t4, t4); __m128i filt; __m128i work_a; __m128i filter1, filter2; op1 = _mm_xor_si128(p1, t80); op0 = _mm_xor_si128(p0, t80); oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); work_a = _mm_subs_epi8(oq0, op0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter6 if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); const 
__m128i q1_hi = _mm_unpackhi_epi8(q1, zero); const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); __m128i f8_lo, f8_hi; f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four), _mm_add_epi16(p2_lo, p2_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo), _mm_add_epi16(p1_lo, p0_lo)); f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four), _mm_add_epi16(p2_hi, p2_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi), _mm_add_epi16(p1_hi, p0_hi)); f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 2 * p), op1); f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo); f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi); op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo); f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi); oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo); f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi); oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); } else { _mm_storeu_si128((__m128i *)(s - 2 * p), op1); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); } } } void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p, const unsigned char *_blimit0, const unsigned char *_limit0, const unsigned char *_thresh0) { const __m128i zero = _mm_setzero_si128(); const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); __m128i mask, hev; __m128i p1, p0, q0, q1; __m128i op1, op0, oq0, oq1; __m128i max_abs_p1p0q1q0; p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // filter4 { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ff = _mm_cmpeq_epi8(t4, t4); __m128i filt; __m128i work_a; __m128i filter1, filter2; op1 = 
_mm_xor_si128(p1, t80); op0 = _mm_xor_si128(p0, t80); oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); work_a = _mm_subs_epi8(oq0, op0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); work_a = _mm_and_si128(work_a, t80); filt = _mm_and_si128(filt, t7f); filt = _mm_or_si128(filt, work_a); filt = _mm_andnot_si128(hev, filt); op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); _mm_storeu_si128((__m128i *)(s - 2 * p), op1); _mm_storeu_si128((__m128i *)(s - 1 * p), op0); _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); } } void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0); // Transpose back transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); } void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); // Transpose 16x8 transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0); // Transpose back transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); } void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); // Transpose 16x8:: (wxh) 8x16 to 16x8 transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0); // Transpose back:: (wxh) 16x8 to 8x16 transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); } void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); // Transpose 16x8 transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0); // Transpose back transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); } 
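// Note: each vertical quad filter above transposes the 8x16 (or 16x16) pixel
// block into the t_dst scratch buffer, applies the matching horizontal quad
// filter to the rows, and transposes the result back into the frame buffer.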
aom-3.12.1/aom_dsp/x86/lpf_common_sse2.h000066400000000000000000000726271477627663500177240ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ #include // SSE2 #include "config/aom_config.h" #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5) { __m128i w0, w1, w2, w3, w4, w5, ww0; // 00 01 02 03 04 05 xx xx // 10 11 12 13 14 15 xx xx // 20 21 22 23 24 25 xx xx // 30 31 32 33 34 35 xx xx // 40 41 42 43 44 45 xx xx // 50 51 52 53 54 55 xx xx w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 *d1 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 *d2 = _mm_unpacklo_epi64(ww0, _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 *d5 = _mm_unpackhi_epi64(ww0, _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx } static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { __m128i zero = _mm_setzero_si128(); __m128i w0, w1, ww0, ww1; w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx } static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { __m128i w0, w1, ww2, ww3; __m128i zero = _mm_setzero_si128(); w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx 
xx xx xx *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx } // here in and out pointers (x and d) should be different! we don't store their // values inside static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { // input // x0 00 01 02 03 04 05 06 07 // x1 10 11 12 13 14 15 16 17 // x2 20 21 22 23 24 25 26 27 // x3 30 31 32 33 34 35 36 37 // output // 00 10 20 30 xx xx xx xx // 01 11 21 31 xx xx xx xx // 02 12 22 32 xx xx xx xx // 03 13 23 33 xx xx xx xx // 04 14 24 34 xx xx xx xx // 05 15 25 35 xx xx xx xx // 06 16 26 36 xx xx xx xx // 07 17 27 37 xx xx xx xx highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); } static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { __m128i w0, w1, w2, w3, ww0, ww1; // x0 00 01 02 03 04 05 06 07 // x1 10 11 12 13 14 15 16 17 // x2 20 21 22 23 24 25 26 27 // x3 30 31 32 33 34 35 36 37 // x4 40 41 42 43 44 45 46 47 // x5 50 51 52 53 54 55 56 57 // x6 60 61 62 63 64 65 66 67 // x7 70 71 72 73 74 75 76 77 w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 } static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { __m128i w0, w1, w2, w3, ww0, ww1; // x0 00 01 02 03 04 05 06 07 // x1 10 11 12 13 14 15 16 17 // x2 20 21 22 23 24 25 26 27 // x3 30 31 32 33 34 35 36 37 // x4 40 41 42 43 44 45 46 47 // x5 50 51 52 53 54 55 56 57 // x6 60 61 62 63 64 65 66 67 // x7 70 71 72 73 74 75 76 77 w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 } // here in and out pointers (x and d) should be different! 
we don't store their // values inside static inline void highbd_transpose8x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); } // here in and out pointers (x and d arrays) should be different! we don't store // their values inside static inline void highbd_transpose8x16_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, d5, d6, d7); highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, d4 + 1, d5 + 1, d6 + 1, d7 + 1); } // Low bit depth functions static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { // input // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx // output // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx __m128i w0, w1; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 *d0 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 *d1 = _mm_srli_si128(*d0, 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx *d2 = _mm_srli_si128(*d0, 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx *d3 = _mm_srli_si128(*d0, 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx } static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { // input // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx // output // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx __m128i w0, w1, ww0, ww1; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 ww0 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi16( w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx *d1 = _mm_srli_si128(ww0, 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx *d2 
= _mm_srli_si128(ww0, 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx *d3 = _mm_srli_si128(ww0, 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx *d5 = _mm_srli_si128(ww1, 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx *d6 = _mm_srli_si128(ww1, 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx *d7 = _mm_srli_si128(ww1, 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx } static inline void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3) { // input // x0 00 01 02 03 04 05 06 07 // x1 10 11 12 13 14 15 16 17 // x2 20 21 22 23 24 25 26 27 // x3 30 31 32 33 34 35 36 37 // x4 40 41 42 43 44 45 46 47 // x5 50 51 52 53 54 55 56 57 // x6 60 61 62 63 64 65 66 67 // x7 70 71 72 73 74 75 76 77 // output // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx __m128i w0, w1, w2, w3, w4, w5; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 w2 = _mm_unpacklo_epi8( *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 w3 = _mm_unpacklo_epi8( *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 w4 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 w5 = _mm_unpacklo_epi16( w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 *d0 = _mm_unpacklo_epi32( w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 *d1 = _mm_srli_si128(*d0, 8); *d2 = _mm_unpackhi_epi32( w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 *d3 = _mm_srli_si128(*d2, 8); } static inline void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, __m128i *d4d5, __m128i *d6d7) { __m128i w0, w1, w2, w3, w4, w5, w6, w7; // x0 00 01 02 03 04 05 06 07 // x1 10 11 12 13 14 15 16 17 w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 // x2 20 21 22 23 24 25 26 27 // x3 30 31 32 33 34 35 36 37 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 // x4 40 41 42 43 44 45 46 47 // x5 50 51 52 53 54 55 56 57 w2 = _mm_unpacklo_epi8( *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 // x6 60 61 62 63 64 65 66 67 // x7 70 71 72 73 74 75 76 77 w3 = _mm_unpacklo_epi8( *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 w4 = _mm_unpacklo_epi16( w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 w5 = _mm_unpacklo_epi16( w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 *d0d1 = _mm_unpacklo_epi32( w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 *d2d3 = _mm_unpackhi_epi32( w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 w6 = _mm_unpackhi_epi16( w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 w7 = _mm_unpackhi_epi16( w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 *d4d5 = _mm_unpacklo_epi32( w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 *d6d7 = _mm_unpackhi_epi32( w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 } static inline void transpose16x8_8x16_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, 
__m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m128i w10, w11, w12, w13, w14, w15; w0 = _mm_unpacklo_epi8(*x0, *x1); w1 = _mm_unpacklo_epi8(*x2, *x3); w2 = _mm_unpacklo_epi8(*x4, *x5); w3 = _mm_unpacklo_epi8(*x6, *x7); w8 = _mm_unpacklo_epi8(*x8, *x9); w9 = _mm_unpacklo_epi8(*x10, *x11); w10 = _mm_unpacklo_epi8(*x12, *x13); w11 = _mm_unpacklo_epi8(*x14, *x15); w4 = _mm_unpacklo_epi16(w0, w1); w5 = _mm_unpacklo_epi16(w2, w3); w12 = _mm_unpacklo_epi16(w8, w9); w13 = _mm_unpacklo_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store first 4-line result *d0 = _mm_unpacklo_epi64(w6, w14); *d1 = _mm_unpackhi_epi64(w6, w14); *d2 = _mm_unpacklo_epi64(w7, w15); *d3 = _mm_unpackhi_epi64(w7, w15); w4 = _mm_unpackhi_epi16(w0, w1); w5 = _mm_unpackhi_epi16(w2, w3); w12 = _mm_unpackhi_epi16(w8, w9); w13 = _mm_unpackhi_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store second 4-line result *d4 = _mm_unpacklo_epi64(w6, w14); *d5 = _mm_unpackhi_epi64(w6, w14); *d6 = _mm_unpacklo_epi64(w7, w15); *d7 = _mm_unpackhi_epi64(w7, w15); } static inline void transpose8x16_16x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, __m128i *d12d13, __m128i *d14d15) { __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; __m128i w10, w11, w12, w13, w14, w15; w0 = _mm_unpacklo_epi8(*x0, *x1); w1 = _mm_unpacklo_epi8(*x2, *x3); w2 = _mm_unpacklo_epi8(*x4, *x5); w3 = _mm_unpacklo_epi8(*x6, *x7); w8 = _mm_unpackhi_epi8(*x0, *x1); w9 = _mm_unpackhi_epi8(*x2, *x3); w10 = _mm_unpackhi_epi8(*x4, *x5); w11 = _mm_unpackhi_epi8(*x6, *x7); w4 = _mm_unpacklo_epi16(w0, w1); w5 = _mm_unpacklo_epi16(w2, w3); w12 = _mm_unpacklo_epi16(w8, w9); w13 = _mm_unpacklo_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store first 4-line result *d0d1 = _mm_unpacklo_epi64(w6, w14); *d2d3 = _mm_unpackhi_epi64(w6, w14); *d4d5 = _mm_unpacklo_epi64(w7, w15); *d6d7 = _mm_unpackhi_epi64(w7, w15); w4 = _mm_unpackhi_epi16(w0, w1); w5 = _mm_unpackhi_epi16(w2, w3); w12 = _mm_unpackhi_epi16(w8, w9); w13 = _mm_unpackhi_epi16(w10, w11); w6 = _mm_unpacklo_epi32(w4, w5); w7 = _mm_unpackhi_epi32(w4, w5); w14 = _mm_unpacklo_epi32(w12, w13); w15 = _mm_unpackhi_epi32(w12, w13); // Store second 4-line result *d8d9 = _mm_unpacklo_epi64(w6, w14); *d10d11 = _mm_unpackhi_epi64(w6, w14); *d12d13 = _mm_unpacklo_epi64(w7, w15); *d14d15 = _mm_unpackhi_epi64(w7, w15); } static inline void transpose_16x8(unsigned char *in0, unsigned char *in1, int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; x0 = _mm_loadl_epi64((__m128i *)in0); x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); x0 = _mm_unpacklo_epi8(x0, x1); x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); x1 = _mm_unpacklo_epi8(x2, x3); x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * 
in_p)); x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); x2 = _mm_unpacklo_epi8(x4, x5); x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); x3 = _mm_unpacklo_epi8(x6, x7); x4 = _mm_unpacklo_epi16(x0, x1); x8 = _mm_loadl_epi64((__m128i *)in1); x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); x8 = _mm_unpacklo_epi8(x8, x9); x5 = _mm_unpacklo_epi16(x2, x3); x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); x9 = _mm_unpacklo_epi8(x10, x11); x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); x10 = _mm_unpacklo_epi8(x12, x13); x12 = _mm_unpacklo_epi16(x8, x9); x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); x11 = _mm_unpacklo_epi8(x14, x15); x13 = _mm_unpacklo_epi16(x10, x11); x6 = _mm_unpacklo_epi32(x4, x5); x7 = _mm_unpackhi_epi32(x4, x5); x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); // Store first 4-line result _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); x4 = _mm_unpackhi_epi16(x0, x1); x5 = _mm_unpackhi_epi16(x2, x3); x12 = _mm_unpackhi_epi16(x8, x9); x13 = _mm_unpackhi_epi16(x10, x11); x6 = _mm_unpacklo_epi32(x4, x5); x7 = _mm_unpackhi_epi32(x4, x5); x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); // Store second 4-line result _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); } static inline void transpose_16x8_to_8x16(unsigned char *src, int in_p, unsigned char *dst, int out_p) { // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0 // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1 // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2 // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3 // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4 // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5 // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6 // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7 const __m128i x0 = _mm_loadu_si128((__m128i *)(src)); const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p))); const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p))); const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p))); const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p))); const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p))); const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p))); const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p))); // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1 // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1 // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3 // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3 // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5 // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5 // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7 // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7 const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1); const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1); const __m128i x_s12 = _mm_unpacklo_epi8(x2, 
x3); const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3); const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5); const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5); const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7); const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7); // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3 // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3 // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3 // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3 // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7 // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7 // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7 // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7 const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12); const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12); const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13); const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13); const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16); const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16); const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17); const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17); // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7 // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7 // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7 // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7 // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7 // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7 // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7 // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7 const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24); const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24); const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25); const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25); const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26); const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26); const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27); const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27); mm_storelu(dst, x_s30); mm_storehu(dst + (1 * out_p), x_s30); mm_storelu(dst + (2 * out_p), x_s31); mm_storehu(dst + (3 * out_p), x_s31); mm_storelu(dst + (4 * out_p), x_s32); mm_storehu(dst + (5 * out_p), x_s32); mm_storelu(dst + (6 * out_p), x_s33); mm_storehu(dst + (7 * out_p), x_s33); mm_storelu(dst + (8 * out_p), x_s34); mm_storehu(dst + (9 * out_p), x_s34); mm_storelu(dst + (10 * out_p), x_s35); mm_storehu(dst + (11 * out_p), x_s35); mm_storelu(dst + (12 * out_p), x_s36); mm_storehu(dst + (13 * out_p), x_s36); mm_storelu(dst + (14 * out_p), x_s37); mm_storehu(dst + (15 * out_p), x_s37); } static inline void transpose_8xn(unsigned char *src[], int in_p, unsigned char *dst[], int out_p, int num_8x8_to_transpose) { int idx8x8 = 0; __m128i x0, x1, x2, x3, x4, x5, x6, x7; do { unsigned char *in = src[idx8x8]; unsigned char *out = dst[idx8x8]; x0 = _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 x1 = _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 x0 = _mm_unpacklo_epi8(x0, x1); x2 = _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 x3 = _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 x1 = _mm_unpacklo_epi8(x2, x3); x4 = _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 x5 = _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 x2 = 
_mm_unpacklo_epi8(x4, x5); x6 = _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 x7 = _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 x3 = _mm_unpacklo_epi8(x6, x7); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 x4 = _mm_unpacklo_epi16(x0, x1); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 x4 = _mm_unpackhi_epi16(x0, x1); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ aom-3.12.1/aom_dsp/x86/masked_sad_intrin_avx2.c000066400000000000000000000367301477627663500212430ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/masked_sad_intrin_ssse3.h" static inline unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { int x, y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_scale = _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 32) { const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); const __m256i m_inv = _mm256_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. 
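// The interleaved maddubs below computes a * m + b * (64 - m) for each pixel
// pair, and the mulhrs by 1 << (15 - AOM_BLEND_A64_ROUND_BITS) performs the
// rounding shift (x + 32) >> 6, i.e. the AOM_BLEND_A64 blend.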
const __m256i data_l = _mm256_unpacklo_epi8(a, b); const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); const __m256i data_r = _mm256_unpackhi_epi8(a, b); const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. res = _mm256_shuffle_epi32(res, 0xd8); res = _mm256_permute4x64_epi64(res, 0xd8); res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); return sad; } static inline unsigned int masked_sad16xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_scale = _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr); const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr); const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr); const __m256i m = yy_loadu2_128(m_ptr + m_stride, m_ptr); const __m256i m_inv = _mm256_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. const __m256i data_l = _mm256_unpacklo_epi8(a, b); const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); const __m256i data_r = _mm256_unpackhi_epi8(a, b); const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); src_ptr += src_stride << 1; a_ptr += a_stride << 1; b_ptr += b_stride << 1; m_ptr += m_stride << 1; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. 
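// The shuffle/permute below gathers the per-quadword SAD sums from both
// 128-bit halves into the low 128 bits, so two horizontal adds leave the
// total in lane 0.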
res = _mm256_shuffle_epi32(res, 0xd8); res = _mm256_permute4x64_epi64(res, 0xd8); res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); return sad; } static inline unsigned int aom_masked_sad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, int m, int n) { unsigned int sad; if (!invert_mask) { switch (m) { case 4: sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, n); break; case 8: sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, n); break; case 16: sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, n); break; default: sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, m, n); break; } } else { switch (m) { case 4: sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, n); break; case 8: sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, n); break; case 16: sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, n); break; default: sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, m, n); break; } } return sad; } #define MASKSADMXN_AVX2(m, n) \ unsigned int aom_masked_sad##m##x##n##_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ msk, msk_stride, invert_mask, m, n); \ } MASKSADMXN_AVX2(4, 4) MASKSADMXN_AVX2(4, 8) MASKSADMXN_AVX2(8, 4) MASKSADMXN_AVX2(8, 8) MASKSADMXN_AVX2(8, 16) MASKSADMXN_AVX2(16, 8) MASKSADMXN_AVX2(16, 16) MASKSADMXN_AVX2(16, 32) MASKSADMXN_AVX2(32, 16) MASKSADMXN_AVX2(32, 32) MASKSADMXN_AVX2(32, 64) MASKSADMXN_AVX2(64, 32) MASKSADMXN_AVX2(64, 64) MASKSADMXN_AVX2(64, 128) MASKSADMXN_AVX2(128, 64) MASKSADMXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY MASKSADMXN_AVX2(4, 16) MASKSADMXN_AVX2(16, 4) MASKSADMXN_AVX2(8, 32) MASKSADMXN_AVX2(32, 8) MASKSADMXN_AVX2(16, 64) MASKSADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH static inline unsigned int highbd_masked_sad8xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_const = _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m256i one = _mm256_set1_epi16(1); for (y = 0; y < height; y += 2) { const __m256i src = yy_loadu2_128(src_ptr + src_stride, src_ptr); const __m256i a = yy_loadu2_128(a_ptr + a_stride, a_ptr); const __m256i b = yy_loadu2_128(b_ptr + b_stride, b_ptr); // Zero-extend mask to 16 bits const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)(m_ptr)), _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); const __m256i m_inv = _mm256_sub_epi16(mask_max, m); const __m256i data_l = _mm256_unpacklo_epi16(a, b); const __m256i mask_l = 
_mm256_unpacklo_epi16(m, m_inv); __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m256i data_r = _mm256_unpackhi_epi16(a, b); const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, // so it is safe to do signed saturation here. const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); // There is no 16-bit SAD instruction, so we have to synthesize // an 8-element SAD. We do this by storing 4 32-bit partial SADs, // and accumulating them at the end const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); src_ptr += src_stride << 1; a_ptr += a_stride << 1; b_ptr += b_stride << 1; m_ptr += m_stride << 1; } // At this point, we have four 32-bit partial SADs stored in 'res'. res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); return sad; } static inline unsigned int highbd_masked_sad16xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int x, y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_const = _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m256i one = _mm256_set1_epi16(1); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); // Zero-extend mask to 16 bits const __m256i m = _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); const __m256i m_inv = _mm256_sub_epi16(mask_max, m); const __m256i data_l = _mm256_unpacklo_epi16(a, b); const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m256i data_r = _mm256_unpackhi_epi16(a, b); const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, // so it is safe to do signed saturation here. const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); // There is no 16-bit SAD instruction, so we have to synthesize // an 8-element SAD. We do this by storing 4 32-bit partial SADs, // and accumulating them at the end const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have four 32-bit partial SADs stored in 'res'. 
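// Two in-lane horizontal adds collapse each 128-bit half to its lowest
// element, so the final SAD is the sum of 32-bit lanes 0 and 4.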
res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); return sad; } static inline unsigned int aom_highbd_masked_sad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, int m, int n) { unsigned int sad; if (!invert_mask) { switch (m) { case 4: sad = aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, n); break; case 8: sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, n); break; default: sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, m, msk, msk_stride, m, n); break; } } else { switch (m) { case 4: sad = aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, n); break; case 8: sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, n); break; default: sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, ref_stride, msk, msk_stride, m, n); break; } } return sad; } #define HIGHBD_MASKSADMXN_AVX2(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ second_pred8, msk, msk_stride, \ invert_mask, m, n); \ } HIGHBD_MASKSADMXN_AVX2(4, 4) HIGHBD_MASKSADMXN_AVX2(4, 8) HIGHBD_MASKSADMXN_AVX2(8, 4) HIGHBD_MASKSADMXN_AVX2(8, 8) HIGHBD_MASKSADMXN_AVX2(8, 16) HIGHBD_MASKSADMXN_AVX2(16, 8) HIGHBD_MASKSADMXN_AVX2(16, 16) HIGHBD_MASKSADMXN_AVX2(16, 32) HIGHBD_MASKSADMXN_AVX2(32, 16) HIGHBD_MASKSADMXN_AVX2(32, 32) HIGHBD_MASKSADMXN_AVX2(32, 64) HIGHBD_MASKSADMXN_AVX2(64, 32) HIGHBD_MASKSADMXN_AVX2(64, 64) HIGHBD_MASKSADMXN_AVX2(64, 128) HIGHBD_MASKSADMXN_AVX2(128, 64) HIGHBD_MASKSADMXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY HIGHBD_MASKSADMXN_AVX2(4, 16) HIGHBD_MASKSADMXN_AVX2(16, 4) HIGHBD_MASKSADMXN_AVX2(8, 32) HIGHBD_MASKSADMXN_AVX2(32, 8) HIGHBD_MASKSADMXN_AVX2(16, 64) HIGHBD_MASKSADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/masked_sad_intrin_ssse3.c000066400000000000000000000442641477627663500214240ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/masked_sad_intrin_ssse3.h" // For width a multiple of 16 static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); #define MASKSADMXN_SSSE3(m, n) \ unsigned int aom_masked_sad##m##x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \ m, msk, msk_stride, m, n); \ else \ return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \ ref_stride, msk, msk_stride, m, n); \ } #define MASKSAD8XN_SSSE3(n) \ unsigned int aom_masked_sad8x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ second_pred, 8, msk, msk_stride, n); \ else \ return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ ref_stride, msk, msk_stride, n); \ } #define MASKSAD4XN_SSSE3(n) \ unsigned int aom_masked_sad4x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ second_pred, 4, msk, msk_stride, n); \ else \ return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ ref_stride, msk, msk_stride, n); \ } MASKSADMXN_SSSE3(128, 128) MASKSADMXN_SSSE3(128, 64) MASKSADMXN_SSSE3(64, 128) MASKSADMXN_SSSE3(64, 64) MASKSADMXN_SSSE3(64, 32) MASKSADMXN_SSSE3(32, 64) MASKSADMXN_SSSE3(32, 32) MASKSADMXN_SSSE3(32, 16) MASKSADMXN_SSSE3(16, 32) MASKSADMXN_SSSE3(16, 16) MASKSADMXN_SSSE3(16, 8) MASKSAD8XN_SSSE3(16) MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) #if !CONFIG_REALTIME_ONLY MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { int x, y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); const __m128i m_inv = _mm_sub_epi8(mask_max, m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. 
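// maddubs on the interleaved (pixel, mask) bytes computes a * m + b * (64 - m);
// xx_roundn_epu16 then applies the rounding shift by AOM_BLEND_A64_ROUND_BITS
// to form the blended prediction.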
const __m128i data_l = _mm_unpacklo_epi8(a, b); const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi8(a, b); const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_l, pred_r); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8))); return sad; } unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); const __m128i m_inv = _mm_sub_epi8(mask_max, m); const __m128i data_l = _mm_unpacklo_epi8(a0, b0); const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpacklo_epi8(a1, b1); const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_l, pred_r); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8))); return sad; } unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { // Load two rows at a time, this seems to be a bit faster // than four rows at a time in this case. 
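    // Both 4-pixel rows land in the low 8 bytes of 'src' and of the blended
    // 'pred' built below (their upper 8 bytes stay zero), so a single
    // _mm_sad_epu8() accumulates the SAD for both rows into lane 0 of 'res'.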
const __m128i src = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr), _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride])); const __m128i a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr), _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride])); const __m128i b = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr), _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride])); const __m128i m = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr), _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride])); const __m128i m_inv = _mm_sub_epi8(mask_max, m); const __m128i data = _mm_unpacklo_epi8(a, b); const __m128i mask = _mm_unpacklo_epi8(m, m_inv); __m128i pred_16bit = _mm_maddubs_epi16(data, mask); pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } // At this point, the SAD is stored in lane 0 of 'res' return (unsigned int)_mm_cvtsi128_si32(res); } #if CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 static inline unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); #define HIGHBD_MASKSADMXN_SSSE3(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ second_pred8, m, msk, msk_stride, m, n); \ else \ return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ ref_stride, msk, msk_stride, m, n); \ } #define HIGHBD_MASKSAD4XN_SSSE3(n) \ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ ref_stride, second_pred8, 4, msk, \ msk_stride, n); \ else \ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ ref8, ref_stride, msk, msk_stride, \ n); \ } HIGHBD_MASKSADMXN_SSSE3(128, 128) HIGHBD_MASKSADMXN_SSSE3(128, 64) HIGHBD_MASKSADMXN_SSSE3(64, 128) HIGHBD_MASKSADMXN_SSSE3(64, 64) HIGHBD_MASKSADMXN_SSSE3(64, 32) HIGHBD_MASKSADMXN_SSSE3(32, 64) HIGHBD_MASKSADMXN_SSSE3(32, 32) HIGHBD_MASKSADMXN_SSSE3(32, 16) HIGHBD_MASKSADMXN_SSSE3(16, 32) HIGHBD_MASKSADMXN_SSSE3(16, 16) HIGHBD_MASKSADMXN_SSSE3(16, 8) HIGHBD_MASKSADMXN_SSSE3(8, 16) HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) #if !CONFIG_REALTIME_ONLY HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) HIGHBD_MASKSADMXN_SSSE3(16, 64) HIGHBD_MASKSADMXN_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int x, y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = 
_mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i one = _mm_set1_epi16(1); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); // Zero-extend mask to 16 bits const __m128i m = _mm_unpacklo_epi8( _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, // so it is safe to do signed saturation here. const __m128i pred = _mm_packs_epi32(pred_l, pred_r); // There is no 16-bit SAD instruction, so we have to synthesize // an 8-element SAD. We do this by storing 4 32-bit partial SADs, // and accumulating them at the end const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // At this point, we have four 32-bit partial SADs stored in 'res'. res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); return sad; } unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i one = _mm_set1_epi16(1); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); const __m128i b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); // Zero-extend mask to 16 bits const __m128i m = _mm_unpacklo_epi8( _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr), _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])), _mm_setzero_si128()); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i 
pred = _mm_packs_epi32(pred_l, pred_r); const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); return sad; } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/masked_sad_intrin_ssse3.h000066400000000000000000000033331477627663500214210ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ #define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height); unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height); unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height); #endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ aom-3.12.1/aom_dsp/x86/masked_variance_intrin_ssse3.c000066400000000000000000001370221477627663500224400ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/masked_variance_intrin_ssse3.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" // For width a multiple of 16 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int w, int h); static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int h); static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int h); // For width a multiple of 16 static void masked_variance(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height, unsigned int *sse, int *sum_); static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, const uint8_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, unsigned int *sse, int *sum_); static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, const uint8_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, unsigned int *sse, int *sum_); #define MASK_SUBPIX_VAR_SSSE3(W, H) \ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ int sum; \ uint8_t temp[(H + 1) * W]; \ \ bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ \ if (!invert_mask) \ masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ msk_stride, W, H, sse, &sum); \ else \ masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ msk_stride, W, H, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } #define MASK_SUBPIX_VAR8XH_SSSE3(H) \ unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ int sum; \ uint8_t temp[(H + 1) * 8]; \ \ bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ \ if (!invert_mask) \ masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ H, sse, &sum); \ else \ masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ H, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ } #define MASK_SUBPIX_VAR4XH_SSSE3(H) \ unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ int sum; \ uint8_t temp[(H + 1) * 4]; \ \ bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ \ if (!invert_mask) \ masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ H, sse, &sum); \ else \ masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ H, sse, &sum); \ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ } MASK_SUBPIX_VAR_SSSE3(128, 128) MASK_SUBPIX_VAR_SSSE3(128, 64) MASK_SUBPIX_VAR_SSSE3(64, 128) MASK_SUBPIX_VAR_SSSE3(64, 64) MASK_SUBPIX_VAR_SSSE3(64, 32) 
MASK_SUBPIX_VAR_SSSE3(32, 64) MASK_SUBPIX_VAR_SSSE3(32, 32) MASK_SUBPIX_VAR_SSSE3(32, 16) MASK_SUBPIX_VAR_SSSE3(16, 32) MASK_SUBPIX_VAR_SSSE3(16, 16) MASK_SUBPIX_VAR_SSSE3(16, 8) MASK_SUBPIX_VAR8XH_SSSE3(16) MASK_SUBPIX_VAR8XH_SSSE3(8) MASK_SUBPIX_VAR8XH_SSSE3(4) MASK_SUBPIX_VAR4XH_SSSE3(8) MASK_SUBPIX_VAR4XH_SSSE3(4) #if !CONFIG_REALTIME_ONLY MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) MASK_SUBPIX_VAR_SSSE3(64, 16) MASK_SUBPIX_VAR_SSSE3(16, 64) #endif // !CONFIG_REALTIME_ONLY static inline __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { __m128i v0 = _mm_unpacklo_epi8(a, b); v0 = _mm_maddubs_epi16(v0, filter); v0 = xx_roundn_epu16(v0, FILTER_BITS); __m128i v1 = _mm_unpackhi_epi8(a, b); v1 = _mm_maddubs_epi16(v1, filter); v1 = xx_roundn_epu16(v1, FILTER_BITS); return _mm_packus_epi16(v0, v1); } static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int w, int h) { int i, j; // Horizontal filter if (xoffset == 0) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 16) { __m128i x = _mm_loadu_si128((__m128i *)&src[j]); _mm_storeu_si128((__m128i *)&b[j], x); } src += src_stride; b += w; } } else if (xoffset == 4) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 16) { __m128i x = _mm_loadu_si128((__m128i *)&src[j]); __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); __m128i z = _mm_alignr_epi8(y, x, 1); _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); } src += src_stride; b += w; } } else { uint8_t *b = dst; const uint8_t *hfilter = bilinear_filters_2t[xoffset]; const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 16) { const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); const __m128i z = _mm_alignr_epi8(y, x, 1); const __m128i res = filter_block(x, z, hfilter_vec); _mm_storeu_si128((__m128i *)&b[j], res); } src += src_stride; b += w; } } // Vertical filter if (yoffset == 0) { // The data is already in 'dst', so no need to filter } else if (yoffset == 4) { for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); } dst += w; } } else { const uint8_t *vfilter = bilinear_filters_2t[yoffset]; const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); const __m128i res = filter_block(x, y, vfilter_vec); _mm_storeu_si128((__m128i *)&dst[j], res); } dst += w; } } } static inline __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, const __m128i *a1, const __m128i *b1, const __m128i *filter) { __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); v0 = _mm_maddubs_epi16(v0, *filter); v0 = xx_roundn_epu16(v0, FILTER_BITS); __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); v1 = _mm_maddubs_epi16(v1, *filter); v1 = xx_roundn_epu16(v1, FILTER_BITS); return _mm_packus_epi16(v0, v1); } static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int h) { int i; // Horizontal filter if (xoffset == 0) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = _mm_loadl_epi64((__m128i 
*)src); _mm_storel_epi64((__m128i *)b, x); src += src_stride; b += 8; } } else if (xoffset == 4) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = _mm_loadu_si128((__m128i *)src); __m128i z = _mm_srli_si128(x, 1); _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); src += src_stride; b += 8; } } else { uint8_t *b = dst; const uint8_t *hfilter = bilinear_filters_2t[xoffset]; const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); for (i = 0; i < h; i += 2) { const __m128i x0 = _mm_loadu_si128((__m128i *)src); const __m128i z0 = _mm_srli_si128(x0, 1); const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 1); const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; b += 16; } // Handle i = h separately const __m128i x0 = _mm_loadu_si128((__m128i *)src); const __m128i z0 = _mm_srli_si128(x0, 1); __m128i v0 = _mm_unpacklo_epi8(x0, z0); v0 = _mm_maddubs_epi16(v0, hfilter_vec); v0 = xx_roundn_epu16(v0, FILTER_BITS); _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); } // Vertical filter if (yoffset == 0) { // The data is already in 'dst', so no need to filter } else if (yoffset == 4) { for (i = 0; i < h; ++i) { __m128i x = _mm_loadl_epi64((__m128i *)dst); __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); dst += 8; } } else { const uint8_t *vfilter = bilinear_filters_2t[yoffset]; const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); for (i = 0; i < h; i += 2) { const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; } } } static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, int yoffset, uint8_t *dst, int h) { int i; // Horizontal filter if (xoffset == 0) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = xx_loadl_32((__m128i *)src); xx_storel_32(b, x); src += src_stride; b += 4; } } else if (xoffset == 4) { uint8_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = _mm_loadl_epi64((__m128i *)src); __m128i z = _mm_srli_si128(x, 1); xx_storel_32(b, _mm_avg_epu8(x, z)); src += src_stride; b += 4; } } else { uint8_t *b = dst; const uint8_t *hfilter = bilinear_filters_2t[xoffset]; const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); for (i = 0; i < h; i += 4) { const __m128i x0 = _mm_loadl_epi64((__m128i *)src); const __m128i z0 = _mm_srli_si128(x0, 1); const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 1); const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); const __m128i z2 = _mm_srli_si128(x2, 1); const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); const __m128i z3 = _mm_srli_si128(x3, 1); const __m128i a0 = _mm_unpacklo_epi32(x0, x1); const __m128i b0 = _mm_unpacklo_epi32(z0, z1); const __m128i a1 = _mm_unpacklo_epi32(x2, x3); const __m128i b1 = _mm_unpacklo_epi32(z2, z3); const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 4; b += 16; } // Handle i = h separately const __m128i x = _mm_loadl_epi64((__m128i *)src); const __m128i z = _mm_srli_si128(x, 1); __m128i v0 = _mm_unpacklo_epi8(x, z); v0 = _mm_maddubs_epi16(v0, 
hfilter_vec); v0 = xx_roundn_epu16(v0, FILTER_BITS); xx_storel_32(b, _mm_packus_epi16(v0, v0)); } // Vertical filter if (yoffset == 0) { // The data is already in 'dst', so no need to filter } else if (yoffset == 4) { for (i = 0; i < h; ++i) { __m128i x = xx_loadl_32((__m128i *)dst); __m128i y = xx_loadl_32((__m128i *)&dst[4]); xx_storel_32(dst, _mm_avg_epu8(x, y)); dst += 4; } } else { const uint8_t *vfilter = bilinear_filters_2t[yoffset]; const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); for (i = 0; i < h; i += 4) { const __m128i a = xx_loadl_32((__m128i *)dst); const __m128i b = xx_loadl_32((__m128i *)&dst[4]); const __m128i c = xx_loadl_32((__m128i *)&dst[8]); const __m128i d = xx_loadl_32((__m128i *)&dst[12]); const __m128i e = xx_loadl_32((__m128i *)&dst[16]); const __m128i a0 = _mm_unpacklo_epi32(a, b); const __m128i b0 = _mm_unpacklo_epi32(b, c); const __m128i a1 = _mm_unpacklo_epi32(c, d); const __m128i b1 = _mm_unpacklo_epi32(d, e); const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; } } } static inline void accumulate_block(const __m128i *src, const __m128i *a, const __m128i *b, const __m128i *m, __m128i *sum, __m128i *sum_sq) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i m_inv = _mm_sub_epi8(mask_max, *m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. const __m128i data_l = _mm_unpacklo_epi8(*a, *b); const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi8(*a, *b); const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i src_l = _mm_unpacklo_epi8(*src, zero); const __m128i src_r = _mm_unpackhi_epi8(*src, zero); const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); // Update partial sums and partial sums of squares *sum = _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); *sum_sq = _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), _mm_madd_epi16(diff_r, diff_r))); } static void masked_variance(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height, unsigned int *sse, int *sum_) { int x, y; __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 16) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // Reduce down to a single sum and sum of squares sum = _mm_hadd_epi32(sum, sum_sq); sum = _mm_hadd_epi32(sum, sum); *sum_ = _mm_cvtsi128_si32(sum); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } static void masked_variance8xh(const uint8_t *src_ptr, 
int src_stride, const uint8_t *a_ptr, const uint8_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, unsigned int *sse, int *sum_) { int y; __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); for (y = 0; y < height; y += 2) { __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 2; a_ptr += 16; b_ptr += 16; m_ptr += m_stride * 2; } // Reduce down to a single sum and sum of squares sum = _mm_hadd_epi32(sum, sum_sq); sum = _mm_hadd_epi32(sum, sum); *sum_ = _mm_cvtsi128_si32(sum); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, const uint8_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, unsigned int *sse, int *sum_) { int y; __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); for (y = 0; y < height; y += 4) { // Load four rows at a time __m128i src = _mm_setr_epi32(*(int *)src_ptr, *(int *)&src_ptr[src_stride], *(int *)&src_ptr[src_stride * 2], *(int *)&src_ptr[src_stride * 3]); const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); const __m128i m = _mm_setr_epi32(*(int *)m_ptr, *(int *)&m_ptr[m_stride], *(int *)&m_ptr[m_stride * 2], *(int *)&m_ptr[m_stride * 3]); accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 4; a_ptr += 16; b_ptr += 16; m_ptr += m_stride * 4; } // Reduce down to a single sum and sum of squares sum = _mm_hadd_epi32(sum, sum_sq); sum = _mm_hadd_epi32(sum, sum); *sum_ = _mm_cvtsi128_si32(sum); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } #if CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, int w, int h); static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, int h); // For width a multiple of 8 static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, const uint16_t *a_ptr, int a_stride, const uint16_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height, uint64_t *sse, int *sum_); static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint16_t *a_ptr, const uint16_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, int *sse, int *sum_); #define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ uint64_t sse64; \ int sum; \ uint16_t temp[(H + 1) * W]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ \ if (!invert_mask) \ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ msk_stride, W, H, 
&sse64, &sum); \ else \ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ msk_stride, W, H, &sse64, &sum); \ *sse = (uint32_t)sse64; \ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } \ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ uint64_t sse64; \ int sum; \ int64_t var; \ uint16_t temp[(H + 1) * W]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ \ if (!invert_mask) \ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ msk_stride, W, H, &sse64, &sum); \ else \ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ msk_stride, W, H, &sse64, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ sum = ROUND_POWER_OF_TWO(sum, 2); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ uint64_t sse64; \ int sum; \ int64_t var; \ uint16_t temp[(H + 1) * W]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ \ if (!invert_mask) \ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ msk_stride, W, H, &sse64, &sum); \ else \ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ msk_stride, W, H, &sse64, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ sum = ROUND_POWER_OF_TWO(sum, 4); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } #define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ int sse_; \ int sum; \ uint16_t temp[(H + 1) * 4]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ \ if (!invert_mask) \ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ msk_stride, H, &sse_, &sum); \ else \ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ msk_stride, H, &sse_, &sum); \ *sse = (uint32_t)sse_; \ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ } \ unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ int sse_; \ int sum; \ int64_t var; \ uint16_t temp[(H + 1) * 4]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ \ if (!invert_mask) \ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ msk_stride, H, &sse_, &sum); \ else \ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ msk_stride, H, &sse_, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ sum = ROUND_POWER_OF_TWO(sum, 2); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ int sse_; \ int sum; \ int64_t var; \ uint16_t temp[(H + 1) * 4]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ \ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ \ if (!invert_mask) \ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ msk_stride, H, &sse_, &sum); \ else \ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ msk_stride, H, &sse_, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ sum = ROUND_POWER_OF_TWO(sum, 4); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) #if !CONFIG_REALTIME_ONLY HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) #endif // !CONFIG_REALTIME_ONLY static inline __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { __m128i v0 = _mm_unpacklo_epi16(a, b); v0 = _mm_madd_epi16(v0, filter); v0 = xx_roundn_epu32(v0, FILTER_BITS); __m128i v1 = _mm_unpackhi_epi16(a, b); v1 = _mm_madd_epi16(v1, filter); v1 = xx_roundn_epu32(v1, FILTER_BITS); return _mm_packs_epi32(v0, v1); } static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, int w, int h) { int i, j; // Horizontal filter if (xoffset == 0) { uint16_t *b = dst; for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 8) { __m128i x = _mm_loadu_si128((__m128i *)&src[j]); _mm_storeu_si128((__m128i *)&b[j], x); } src += src_stride; b += w; } } else if (xoffset == 4) { uint16_t *b = dst; for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 8) { __m128i x = _mm_loadu_si128((__m128i *)&src[j]); __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); __m128i z = _mm_alignr_epi8(y, x, 2); _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); } src += src_stride; b += w; } } else { uint16_t *b = dst; const uint8_t *hfilter = bilinear_filters_2t[xoffset]; const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); for (i = 0; i < h + 1; ++i) { for (j = 0; j < w; j += 8) { const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); const __m128i z = _mm_alignr_epi8(y, x, 2); const __m128i res = highbd_filter_block(x, z, hfilter_vec); _mm_storeu_si128((__m128i *)&b[j], res); } src += src_stride; b += w; } } // Vertical filter if (yoffset == 0) { // The data is already in 'dst', so no need to filter } else if (yoffset == 4) { for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); } dst += w; } } else { const uint8_t *vfilter = bilinear_filters_2t[yoffset]; const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); const __m128i res = highbd_filter_block(x, y, vfilter_vec); _mm_storeu_si128((__m128i *)&dst[j], res); } dst += w; } } } static inline __m128i highbd_filter_block_2rows(const __m128i *a0, const __m128i *b0, const __m128i *a1, const __m128i *b1, const __m128i *filter) { __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); v0 = _mm_madd_epi16(v0, *filter); v0 = xx_roundn_epu32(v0, FILTER_BITS); __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); v1 
= _mm_madd_epi16(v1, *filter); v1 = xx_roundn_epu32(v1, FILTER_BITS); return _mm_packs_epi32(v0, v1); } static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, int h) { int i; // Horizontal filter if (xoffset == 0) { uint16_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = _mm_loadl_epi64((__m128i *)src); _mm_storel_epi64((__m128i *)b, x); src += src_stride; b += 4; } } else if (xoffset == 4) { uint16_t *b = dst; for (i = 0; i < h + 1; ++i) { __m128i x = _mm_loadu_si128((__m128i *)src); __m128i z = _mm_srli_si128(x, 2); _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); src += src_stride; b += 4; } } else { uint16_t *b = dst; const uint8_t *hfilter = bilinear_filters_2t[xoffset]; const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); for (i = 0; i < h; i += 2) { const __m128i x0 = _mm_loadu_si128((__m128i *)src); const __m128i z0 = _mm_srli_si128(x0, 2); const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 2); const __m128i res = highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; b += 8; } // Process i = h separately __m128i x = _mm_loadu_si128((__m128i *)src); __m128i z = _mm_srli_si128(x, 2); __m128i v0 = _mm_unpacklo_epi16(x, z); v0 = _mm_madd_epi16(v0, hfilter_vec); v0 = xx_roundn_epu32(v0, FILTER_BITS); _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); } // Vertical filter if (yoffset == 0) { // The data is already in 'dst', so no need to filter } else if (yoffset == 4) { for (i = 0; i < h; ++i) { __m128i x = _mm_loadl_epi64((__m128i *)dst); __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); dst += 4; } } else { const uint8_t *vfilter = bilinear_filters_2t[yoffset]; const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); for (i = 0; i < h; i += 2) { const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); const __m128i res = highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 8; } } } static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, const uint16_t *a_ptr, int a_stride, const uint16_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height, uint64_t *sse, int *sum_) { int x, y; // Note on bit widths: // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, // so this can be kept as four 32-bit values. // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, // so this must be stored as two 64-bit values. __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i zero = _mm_setzero_si128(); for (y = 0; y < height; y++) { for (x = 0; x < width; x += 8) { const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); const __m128i m_inv = _mm_sub_epi16(mask_max, m); // Calculate 8 predicted pixels. 
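      // Scalar reference for the blend below, with mask_max == 64 and the
      // rounding constant of 32 set up above (AOM_BLEND_A64_ROUND_BITS == 6):
      //   pred[x] = (m[x] * a[x] + (64 - m[x]) * b[x] + 32) >> 6
      // The products are formed as 32-bit values via _mm_madd_epi16(), since
      // high-bitdepth pixels do not fit the 8-bit _mm_maddubs_epi16() path.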
const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i src_l = _mm_unpacklo_epi16(src, zero); const __m128i src_r = _mm_unpackhi_epi16(src, zero); __m128i diff_l = _mm_sub_epi32(pred_l, src_l); __m128i diff_r = _mm_sub_epi32(pred_r, src_r); // Update partial sums and partial sums of squares sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. // So we can re-pack into 16-bit fields and use _mm_madd_epi16 // to calculate the squares and partially sum them. const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); const __m128i prod = _mm_madd_epi16(tmp, tmp); // Then we want to sign-extend to 64 bits and accumulate const __m128i sign = _mm_srai_epi32(prod, 31); const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); } src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } // Reduce down to a single sum and sum of squares sum = _mm_hadd_epi32(sum, zero); sum = _mm_hadd_epi32(sum, zero); *sum_ = _mm_cvtsi128_si32(sum); sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); _mm_storel_epi64((__m128i *)sse, sum_sq); } static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint16_t *a_ptr, const uint16_t *b_ptr, const uint8_t *m_ptr, int m_stride, int height, int *sse, int *sum_) { int y; // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. // So we can safely pack sum_sq into 32-bit fields, which is slightly more // convenient. 
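  // Concretely: (2^12 - 1)^2 is just under 2^24 and there are at most
  // 4 * 16 = 2^6 samples, so sum_sq stays below 2^30 and fits in a signed
  // 32-bit lane. The width >= 8 path above covers blocks up to 128x128 and
  // therefore has to widen sum_sq to 64-bit lanes instead.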
__m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i zero = _mm_setzero_si128(); for (y = 0; y < height; y += 2) { __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); const __m128i m = _mm_unpacklo_epi8( _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr), _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])), zero); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i src_l = _mm_unpacklo_epi16(src, zero); const __m128i src_r = _mm_unpackhi_epi16(src, zero); __m128i diff_l = _mm_sub_epi32(pred_l, src_l); __m128i diff_r = _mm_sub_epi32(pred_r, src_r); // Update partial sums and partial sums of squares sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); const __m128i prod = _mm_madd_epi16(tmp, tmp); sum_sq = _mm_add_epi32(sum_sq, prod); src_ptr += src_stride * 2; a_ptr += 8; b_ptr += 8; m_ptr += m_stride * 2; } // Reduce down to a single sum and sum of squares sum = _mm_hadd_epi32(sum, sum_sq); sum = _mm_hadd_epi32(sum, zero); *sum_ = _mm_cvtsi128_si32(sum); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } #endif // CONFIG_AV1_HIGHBITDEPTH void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { const uint8_t *src0 = invert_mask ? pred : ref; const uint8_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? ref_stride : width; assert(height % 2 == 0); int i = 0; if (width == 8) { comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, mask, mask_stride); } else if (width == 16) { do { comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, mask + mask_stride, comp_pred + width); comp_pred += (width << 1); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); i += 2; } while (i < height); } else { do { for (int x = 0; x < width; x += 32) { comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred); comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16, comp_pred + 16); comp_pred += 32; } src0 += (stride0); src1 += (stride1); mask += (mask_stride); i += 1; } while (i < height); } } aom-3.12.1/aom_dsp/x86/masked_variance_intrin_ssse3.h000066400000000000000000000074121477627663500224440ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ #define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/blend.h" static inline void comp_mask_pred_16_ssse3(const uint8_t *src0, const uint8_t *src1, const uint8_t *mask, uint8_t *dst) { const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i round_offset = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); const __m128i aA = _mm_load_si128((const __m128i *)(mask)); const __m128i maA = _mm_sub_epi8(alpha_max, aA); const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); } static inline void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, const uint8_t *src0, int stride0, const uint8_t *src1, int stride1, const uint8_t *mask, int mask_stride) { int i = 0; const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i round_offset = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { // odd line A const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1)); const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask)); // even line B const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); const __m128i a = _mm_castps_si128(_mm_loadh_pi( _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); const __m128i ma = _mm_sub_epi8(alpha_max, a); const __m128i aaA = _mm_unpacklo_epi8(a, ma); const __m128i aaB = _mm_unpackhi_epi8(a, ma); const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); const __m128i round = _mm_packus_epi16(roundA, roundB); // comp_pred's stride == width == 8 _mm_store_si128((__m128i *)(comp_pred), round); comp_pred += (8 << 1); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); i += 2; } while (i < height); } #endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ aom-3.12.1/aom_dsp/x86/mem_sse2.h000066400000000000000000000147331477627663500163430ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ #define AOM_AOM_DSP_X86_MEM_SSE2_H_ #include // SSE2 #include #include "config/aom_config.h" #include "aom/aom_integer.h" static inline int16_t loadu_int16(const void *src) { int16_t v; memcpy(&v, src, sizeof(v)); return v; } static inline int32_t loadu_int32(const void *src) { int32_t v; memcpy(&v, src, sizeof(v)); return v; } static inline int64_t loadu_int64(const void *src) { int64_t v; memcpy(&v, src, sizeof(v)); return v; } static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); } static inline __m128i loadh_epi64(const void *const src, const __m128i s) { return _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); } static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, const int byte_stride) { return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride), loadu_int32((int8_t *)src + 1 * byte_stride), loadu_int32((int8_t *)src + 2 * byte_stride), loadu_int32((int8_t *)src + 3 * byte_stride)); } static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, const int byte_stride) { __m128i dst; dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); return dst; } static inline void store_8bit_8x4_from_16x2(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); } static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); } static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, const ptrdiff_t stride) { __m128i ss[4]; ss[0] = s; ss[1] = _mm_srli_si128(s, 4); ss[2] = _mm_srli_si128(s, 8); ss[3] = _mm_srli_si128(s, 12); store_8bit_4x4(ss, d, stride); } static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); } static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { load_8bit_4x4(s + 0 * stride, stride, &d[0]); load_8bit_4x4(s + 4 * stride, stride, &d[4]); } static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); } static inline void loadu_8bit_16x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); 
d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); } static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { load_8bit_8x4(s + 0 * stride, stride, &d[0]); load_8bit_8x4(s + 4 * stride, stride, &d[4]); } static inline void load_8bit_16x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); } static inline void loadu_8bit_16x8(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); } static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); } static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, const ptrdiff_t stride) { _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); } #endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ aom-3.12.1/aom_dsp/x86/obmc_intrinsic_sse4.h000066400000000000000000000042271477627663500205660ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ #define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ #include #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" #include "aom_dsp/x86/synonyms.h" static inline void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int h) { const int pre_step = pre_stride - 4; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_d = _mm_setzero_si128(); assert(IS_POWER_OF_TWO(h)); do { const __m128i v_p_b = xx_loadl_32(pre + n); const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. 
We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); n += 4; if (n % 4 == 0) pre += pre_step; } while (n < 4 * h); *sum = xx_hsum_epi32_si32(v_sum_d); *sse = xx_hsum_epi32_si32(v_sse_d); } #endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ aom-3.12.1/aom_dsp/x86/obmc_intrinsic_ssse3.h000066400000000000000000000034661477627663500207540ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ #define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ #include #include "config/aom_config.h" static inline int32_t xx_hsum_epi32_si32(__m128i v_d) { v_d = _mm_hadd_epi32(v_d, v_d); v_d = _mm_hadd_epi32(v_d, v_d); return _mm_cvtsi128_si32(v_d); } static inline int64_t xx_hsum_epi64_si64(__m128i v_q) { v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); #if AOM_ARCH_X86_64 return _mm_cvtsi128_si64(v_q); #else { int64_t tmp; _mm_storel_epi64((__m128i *)&tmp, v_q); return tmp; } #endif } static inline int64_t xx_hsum_epi32_si64(__m128i v_d) { const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); } // This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) static inline __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); const __m128i v_tmp_d = _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); return _mm_srai_epi32(v_tmp_d, bits); } #endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ aom-3.12.1/aom_dsp/x86/obmc_sad_avx2.c000066400000000000000000000236271477627663500173350ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// static inline unsigned int obmc_sad_w4_avx2(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int height) { int n = 0; __m256i v_sad_d = _mm256_setzero_si256(); const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); do { const __m128i v_p_b_0 = xx_loadl_32(pre); const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); // Rounded absolute difference const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); n += 8; pre += pre_stride << 1; } while (n < 8 * (height >> 1)); __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); return xx_hsum_epi32_si32(v_sad_d_0); } static inline unsigned int obmc_sad_w8n_avx2( const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; __m256i v_sad_d = _mm256_setzero_si256(); const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p0_b = xx_loadl_64(pre + n); const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. 
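// Added note (hedged): this works because each 32-bit lane holds a
// non-negative value below 2^15, so its upper 16 bits are zero and the
// pairwise 16-bit multiply-add of pmaddwd reduces to
// low16(pre) * low16(mask) + 0 * 0, i.e. the exact 32-bit product per lane.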
const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); // Rounded absolute difference const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); n += 8; if ((n & (width - 1)) == 0) pre += pre_step; } while (n < width * height); __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); return xx_hsum_epi32_si32(v_sad_d_0); } #define OBMCSADWXH(w, h) \ unsigned int aom_obmc_sad##w##x##h##_avx2( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *msk) { \ if (w == 4) { \ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ } else { \ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ } \ } OBMCSADWXH(128, 128) OBMCSADWXH(128, 64) OBMCSADWXH(64, 128) OBMCSADWXH(64, 64) OBMCSADWXH(64, 32) OBMCSADWXH(32, 64) OBMCSADWXH(32, 32) OBMCSADWXH(32, 16) OBMCSADWXH(16, 32) OBMCSADWXH(16, 16) OBMCSADWXH(16, 8) OBMCSADWXH(8, 16) OBMCSADWXH(8, 8) OBMCSADWXH(8, 4) OBMCSADWXH(4, 8) OBMCSADWXH(4, 4) OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) OBMCSADWXH(16, 64) OBMCSADWXH(64, 16) //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH static inline unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); int n = 0; __m256i v_sad_d = _mm256_setzero_si256(); const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); do { const __m128i v_p_w_0 = xx_loadl_64(pre); const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. 
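// Rough scalar sketch of what this inner loop accumulates (mirroring the
// generic C reference, e.g. aom_dsp/obmc_sad.c):
//   sad += ROUND_POWER_OF_TWO(abs(wsrc[i] - pre[i] * mask[i]), 12);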
const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); // Rounded absolute difference const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); n += 8; pre += pre_stride << 1; } while (n < 8 * (height >> 1)); __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); return xx_hsum_epi32_si32(v_sad_d_0); } static inline unsigned int hbd_obmc_sad_w8n_avx2( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; __m256i v_sad_d = _mm256_setzero_si256(); const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); // Rounded absolute difference const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); n += 8; if (n % width == 0) pre += pre_step; } while (n < width * height); __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); return xx_hsum_epi32_si32(v_sad_d_0); } #define HBD_OBMCSADWXH(w, h) \ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask) { \ if (w == 4) { \ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ } else { \ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ } \ } HBD_OBMCSADWXH(128, 128) HBD_OBMCSADWXH(128, 64) HBD_OBMCSADWXH(64, 128) HBD_OBMCSADWXH(64, 64) HBD_OBMCSADWXH(64, 32) HBD_OBMCSADWXH(32, 64) HBD_OBMCSADWXH(32, 32) HBD_OBMCSADWXH(32, 16) HBD_OBMCSADWXH(16, 32) HBD_OBMCSADWXH(16, 16) HBD_OBMCSADWXH(16, 8) HBD_OBMCSADWXH(8, 16) HBD_OBMCSADWXH(8, 8) HBD_OBMCSADWXH(8, 4) HBD_OBMCSADWXH(4, 8) HBD_OBMCSADWXH(4, 4) HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/obmc_sad_sse4.c000066400000000000000000000225011477627663500173210ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/obmc_intrinsic_ssse3.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int height) { const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); do { const __m128i v_p_b = xx_loadl_32(pre + n); const __m128i v_m_d = xx_load_128(mask + n); const __m128i v_w_d = xx_load_128(wsrc + n); const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); // Rounded absolute difference const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); n += 4; if (n % 4 == 0) pre += pre_step; } while (n < 4 * height); return xx_hsum_epi32_si32(v_sad_d); } static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p1_b = xx_loadl_32(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_b = xx_loadl_32(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. 
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); // Rounded absolute difference const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); n += 8; if (n % width == 0) pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); } #define OBMCSADWXH(w, h) \ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *msk) { \ if (w == 4) { \ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ } else { \ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ } \ } OBMCSADWXH(128, 128) OBMCSADWXH(128, 64) OBMCSADWXH(64, 128) OBMCSADWXH(64, 64) OBMCSADWXH(64, 32) OBMCSADWXH(32, 64) OBMCSADWXH(32, 32) OBMCSADWXH(32, 16) OBMCSADWXH(16, 32) OBMCSADWXH(16, 16) OBMCSADWXH(16, 8) OBMCSADWXH(8, 16) OBMCSADWXH(8, 8) OBMCSADWXH(8, 4) OBMCSADWXH(4, 8) OBMCSADWXH(4, 4) OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) OBMCSADWXH(16, 64) OBMCSADWXH(64, 16) //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); do { const __m128i v_p_w = xx_loadl_64(pre + n); const __m128i v_m_d = xx_load_128(mask + n); const __m128i v_w_d = xx_load_128(wsrc + n); const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); // Rounded absolute difference const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); n += 4; if (n % 4 == 0) pre += pre_step; } while (n < 4 * height); return xx_hsum_epi32_si32(v_sad_d); } static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p1_w = xx_loadl_64(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_w = xx_loadl_64(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. 
We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); // Rounded absolute difference const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); n += 8; if (n % width == 0) pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); } #define HBD_OBMCSADWXH(w, h) \ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask) { \ if (w == 4) { \ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ } else { \ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ } \ } HBD_OBMCSADWXH(128, 128) HBD_OBMCSADWXH(128, 64) HBD_OBMCSADWXH(64, 128) HBD_OBMCSADWXH(64, 64) HBD_OBMCSADWXH(64, 32) HBD_OBMCSADWXH(32, 64) HBD_OBMCSADWXH(32, 32) HBD_OBMCSADWXH(32, 16) HBD_OBMCSADWXH(16, 32) HBD_OBMCSADWXH(16, 16) HBD_OBMCSADWXH(16, 8) HBD_OBMCSADWXH(8, 16) HBD_OBMCSADWXH(8, 8) HBD_OBMCSADWXH(8, 4) HBD_OBMCSADWXH(4, 8) HBD_OBMCSADWXH(4, 4) HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/obmc_variance_avx2.c000066400000000000000000000164561477627663500203600ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/obmc_intrinsic_sse4.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// static inline void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { int n = 0, width, height = h; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_d = _mm_setzero_si128(); const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); __m128i v_d; const uint8_t *pre_temp; assert(w >= 8); assert(IS_POWER_OF_TWO(w)); assert(IS_POWER_OF_TWO(h)); do { width = w; pre_temp = pre; do { const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); const __m256i v_tmp_d = _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); pre_temp += 8; n += 8; width -= 8; } while (width > 0); pre += pre_stride; height -= 1; } while (height > 0); v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); v_d = _mm_hadd_epi32(v_d, v_d); *sum = _mm_cvtsi128_si32(v_d); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); } static inline void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { int n = 0, width, height = h; __m256i v_d; __m128i res0; const uint8_t *pre_temp; const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); assert(w >= 16); assert(IS_POWER_OF_TWO(w)); assert(IS_POWER_OF_TWO(h)); do { width = w; pre_temp = pre; do { const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); const __m256i v_m1_d = _mm256_loadu_si256((__m256i const *)(mask + n + 8)); const __m256i v_w1_d = _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); const __m256i v_diff0_d = 
_mm256_sub_epi32(v_w0_d, v_pm0_d); const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); const __m256i v_tmp0_d = _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); const __m256i v_tmp1_d = _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); pre_temp += 16; n += 16; width -= 16; } while (width > 0); pre += pre_stride; height -= 1; } while (height > 0); v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); v_d = _mm256_hadd_epi32(v_d, v_d); res0 = _mm256_castsi256_si128(v_d); res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); *sum = _mm_cvtsi128_si32(res0); *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); } #define OBMCVARWXH(W, H) \ unsigned int aom_obmc_variance##W##x##H##_avx2( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ if (W == 4) { \ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ } else if (W == 8) { \ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ } else { \ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ } \ \ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } OBMCVARWXH(128, 128) OBMCVARWXH(128, 64) OBMCVARWXH(64, 128) OBMCVARWXH(64, 64) OBMCVARWXH(64, 32) OBMCVARWXH(32, 64) OBMCVARWXH(32, 32) OBMCVARWXH(32, 16) OBMCVARWXH(16, 32) OBMCVARWXH(16, 16) OBMCVARWXH(16, 8) OBMCVARWXH(8, 16) OBMCVARWXH(8, 8) OBMCVARWXH(8, 4) OBMCVARWXH(4, 8) OBMCVARWXH(4, 4) OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) aom-3.12.1/aom_dsp/x86/obmc_variance_sse4.c000066400000000000000000000342561477627663500203540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/variance_impl_ssse3.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit //////////////////////////////////////////////////////////////////////////////// static inline void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, const int w, const int h) { const int pre_step = pre_stride - w; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_d = _mm_setzero_si128(); assert(w >= 8); assert(IS_POWER_OF_TWO(w)); assert(IS_POWER_OF_TWO(h)); do { const __m128i v_p1_b = xx_loadl_32(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_b = xx_loadl_32(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); n += 8; if (n % w == 0) pre += pre_step; } while (n < w * h); *sum = xx_hsum_epi32_si32(v_sum_d); *sse = xx_hsum_epi32_si32(v_sse_d); } #define OBMCVARWXH(W, H) \ unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ if (W == 4) { \ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ } else { \ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ } \ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } OBMCVARWXH(128, 128) OBMCVARWXH(128, 64) OBMCVARWXH(64, 128) OBMCVARWXH(64, 64) OBMCVARWXH(64, 32) OBMCVARWXH(32, 64) OBMCVARWXH(32, 32) OBMCVARWXH(32, 16) OBMCVARWXH(16, 32) OBMCVARWXH(16, 16) OBMCVARWXH(16, 8) OBMCVARWXH(8, 16) OBMCVARWXH(8, 8) OBMCVARWXH(8, 4) OBMCVARWXH(4, 8) OBMCVARWXH(4, 4) OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) #include "config/aom_dsp_rtcd.h" #define OBMC_SUBPIX_VAR(W, H) \ uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ \ aom_var_filter_block2d_bil_first_pass_ssse3( \ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ aom_var_filter_block2d_bil_second_pass_ssse3( \ 
fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ } OBMC_SUBPIX_VAR(128, 128) OBMC_SUBPIX_VAR(128, 64) OBMC_SUBPIX_VAR(64, 128) OBMC_SUBPIX_VAR(64, 64) OBMC_SUBPIX_VAR(64, 32) OBMC_SUBPIX_VAR(32, 64) OBMC_SUBPIX_VAR(32, 32) OBMC_SUBPIX_VAR(32, 16) OBMC_SUBPIX_VAR(16, 32) OBMC_SUBPIX_VAR(16, 16) OBMC_SUBPIX_VAR(16, 8) OBMC_SUBPIX_VAR(8, 16) OBMC_SUBPIX_VAR(8, 8) OBMC_SUBPIX_VAR(8, 4) OBMC_SUBPIX_VAR(4, 8) OBMC_SUBPIX_VAR(4, 4) OBMC_SUBPIX_VAR(4, 16) OBMC_SUBPIX_VAR(16, 4) OBMC_SUBPIX_VAR(8, 32) OBMC_SUBPIX_VAR(32, 8) OBMC_SUBPIX_VAR(16, 64) OBMC_SUBPIX_VAR(64, 16) //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// #if CONFIG_AV1_HIGHBITDEPTH static inline void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_d = _mm_setzero_si128(); assert(IS_POWER_OF_TWO(h)); do { const __m128i v_p_w = xx_loadl_64(pre + n); const __m128i v_m_d = xx_load_128(mask + n); const __m128i v_w_d = xx_load_128(wsrc + n); const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); n += 4; if (n % 4 == 0) pre += pre_step; } while (n < 4 * h); *sum = xx_hsum_epi32_si32(v_sum_d); *sse = xx_hsum_epi32_si32(v_sse_d); } static inline void hbd_obmc_variance_w8n( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - w; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_d = _mm_setzero_si128(); assert(w >= 8); assert(IS_POWER_OF_TWO(w)); assert(IS_POWER_OF_TWO(h)); do { const __m128i v_p1_w = xx_loadl_64(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_w = xx_loadl_64(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. 
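// Rough scalar sketch of this inner loop (mirroring the generic C reference,
// e.g. aom_dsp/obmc_variance.c):
//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[i] - pre[i] * mask[i], 12);
//   sum += diff;
//   sse += diff * diff;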
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); n += 8; if (n % w == 0) pre += pre_step; } while (n < w * h); *sum += xx_hsum_epi32_si64(v_sum_d); *sse += xx_hsum_epi32_si64(v_sse_d); } static inline void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); } else { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); } *sum = (int)sum64; *sse = (unsigned int)sse64; } static inline void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); } else if (w < 128 || h < 128) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); } else { assert(w == 128 && h == 128); do { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, 64); pre8 += 64 * pre_stride; wsrc += 64 * w; mask += 64 * w; h -= 64; } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } static inline void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; int max_pel_allowed_per_ovf = 512; if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); } else if (w * h <= max_pel_allowed_per_ovf) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); } else { int h_per_ovf = max_pel_allowed_per_ovf / w; assert(max_pel_allowed_per_ovf % w == 0); do { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h_per_ovf); pre8 += h_per_ovf * pre_stride; wsrc += h_per_ovf * w; mask += h_per_ovf * w; h -= h_per_ovf; } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } #define HBD_OBMCVARWXH(W, H) \ unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } \ \ unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t var; \ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } \ \ unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ const int32_t *mask, unsigned int *sse) { \ int sum; \ int64_t var; \ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ return (var >= 0) ? (uint32_t)var : 0; \ } HBD_OBMCVARWXH(128, 128) HBD_OBMCVARWXH(128, 64) HBD_OBMCVARWXH(64, 128) HBD_OBMCVARWXH(64, 64) HBD_OBMCVARWXH(64, 32) HBD_OBMCVARWXH(32, 64) HBD_OBMCVARWXH(32, 32) HBD_OBMCVARWXH(32, 16) HBD_OBMCVARWXH(16, 32) HBD_OBMCVARWXH(16, 16) HBD_OBMCVARWXH(16, 8) HBD_OBMCVARWXH(8, 16) HBD_OBMCVARWXH(8, 8) HBD_OBMCVARWXH(8, 4) HBD_OBMCVARWXH(4, 8) HBD_OBMCVARWXH(4, 4) HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) HBD_OBMCVARWXH(16, 64) HBD_OBMCVARWXH(64, 16) #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/quantize_avx2.c000066400000000000000000000314341477627663500174210ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" static inline void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, __m256i *shift, int log_scale) { *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); *zbin = _mm256_add_epi16(*zbin, rnd); *zbin = _mm256_srai_epi16(*zbin, log_scale); } // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when // calculating the zbin mask. 
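// (Equivalently, the scalar test abs_coeff >= zbin becomes
// abs_coeff > zbin - 1, so a single _mm256_cmpgt_epi16() suffices.)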
(See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); *round = _mm256_add_epi16(*round, rnd); *round = _mm256_srai_epi16(*round, log_scale); } *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); } static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16( const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); if (_mm256_movemask_epi8(v_zbin_mask) == 0) { _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); return _mm256_setzero_si256(); } // tmp = v_zbin_mask ? 
(int64_t)abs_coeff + log_scaled_round : 0 const __m256i v_tmp_rnd = _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * // quant_shift_ptr[rc != 0]) >> // (16 - log_scale + AOM_QM_BITS)); const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); store_coefficients_avx2(v_qcoeff, qcoeff_ptr); store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); return v_nz_mask; } static inline __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } static inline int16_t accumulate_eob256(__m256i eob256) { const __m128i eob_lo = _mm256_castsi256_si128(eob256); const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); __m128i eob = _mm_max_epi16(eob_lo, eob_hi); __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, &v_quant_shift, 0); // Do DC and first 15 AC. 
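// Note: the quantizer tables carry a DC entry in lane 0 and AC entries in the
// remaining lanes, so this first group of 16 coefficients uses the mixed
// DC/AC constants loaded above; the _mm256_unpackhi_epi64() calls that follow
// broadcast the AC-only halves for the loop over the remaining coefficients.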
__m256i v_nz_mask = quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); v_round = _mm256_unpackhi_epi64(v_round, v_round); v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) { coeff_ptr += 16; qcoeff_ptr += 16; dqcoeff_ptr += 16; iscan += 16; v_nz_mask = quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); } *eob_ptr = accumulate_eob256(v_eobmax); } static AOM_FORCE_INLINE __m256i quantize_b_logscale_16( const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift, int log_scale) { const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); if (_mm256_movemask_epi8(v_zbin_mask) == 0) { _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); return _mm256_setzero_si256(); } // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 const __m256i v_tmp_rnd = _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * // quant_shift_ptr[rc != 0]) >> // (16 - log_scale + AOM_QM_BITS)); const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); const __m256i v_tmp32_hi = _mm256_slli_epi16( _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), log_scale); const __m256i v_tmp32_lo = _mm256_srli_epi16( _mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 16 - log_scale); const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); const __m256i v_dqcoeff_hi = _mm256_slli_epi16( _mm256_mulhi_epi16(v_tmp32, *v_dequant), 16 - log_scale); const __m256i v_dqcoeff_lo = _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32, *v_dequant), log_scale); const __m256i v_dqcoeff = _mm256_sign_epi16(_mm256_or_si256(v_dqcoeff_hi, v_dqcoeff_lo), v_coeff); const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); store_coefficients_avx2(v_qcoeff, qcoeff_ptr); store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); return v_nz_mask; } static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan, int log_scale) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, &v_quant_shift, log_scale); // Do DC and first 15 AC. 
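// Note: get_max_lane_eob() keeps a per-lane running maximum of iscan[rc] + 1
// for lanes whose quantized value is nonzero (subtracting the all-ones mask
// adds the 1), and accumulate_eob256() reduces that maximum to the scalar
// end-of-block index at the end.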
__m256i v_nz_mask = quantize_b_logscale_16( coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift, log_scale); v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); v_round = _mm256_unpackhi_epi64(v_round, v_round); v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) { coeff_ptr += 16; qcoeff_ptr += 16; dqcoeff_ptr += 16; iscan += 16; v_nz_mask = quantize_b_logscale_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift, log_scale); v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); } *eob_ptr = accumulate_eob256(v_eobmax); } void aom_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 1); } void aom_quantize_b_64x64_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 2); } aom-3.12.1/aom_dsp/x86/quantize_sse2.c000066400000000000000000000103111477627663500174040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { const __m128i zero = _mm_setzero_si128(); int index = 16; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; (void)scan_ptr; // Setup global values. load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant, quant_shift_ptr, &shift); // Do DC and first 15 AC. coeff0 = load_coefficients(coeff_ptr); coeff1 = load_coefficients(coeff_ptr + 8); // Poor man's abs(). 
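// SSE2 lacks pabsw, so the absolute value is built from the identity
// abs(x) = (x ^ s) - s with s = x >> 15 (arithmetic shift); the same
// invert_sign_sse2() helper re-applies the sign after quantization.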
coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); calculate_qcoeff(&qcoeff0, round, quant, shift); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff(&qcoeff1, round, quant, shift); // Reinsert signs qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr); store_coefficients(qcoeff1, qcoeff_ptr + 8); coeff0 = calculate_dqcoeff(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = calculate_dqcoeff(qcoeff1, dequant); store_coefficients(coeff0, dqcoeff_ptr); store_coefficients(coeff1, dqcoeff_ptr + 8); eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = load_coefficients(coeff_ptr + index); coeff1 = load_coefficients(coeff_ptr + index + 8); coeff0_sign = _mm_srai_epi16(coeff0, 15); coeff1_sign = _mm_srai_epi16(coeff1, 15); qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); calculate_qcoeff(&qcoeff0, round, quant, shift); calculate_qcoeff(&qcoeff1, round, quant, shift); qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr + index); store_coefficients(qcoeff1, qcoeff_ptr + index + 8); coeff0 = calculate_dqcoeff(qcoeff0, dequant); coeff1 = calculate_dqcoeff(qcoeff1, dequant); store_coefficients(coeff0, dqcoeff_ptr + index); store_coefficients(coeff1, dqcoeff_ptr + index + 8); eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; } *eob_ptr = accumulate_eob(eob); } aom-3.12.1/aom_dsp/x86/quantize_ssse3.c000066400000000000000000000166031477627663500176020ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" static inline void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i *shift) { __m128i tmp, qcoeff, tmp1; qcoeff = _mm_adds_epi16(*coeff, round); tmp = _mm_mulhi_epi16(qcoeff, quant); qcoeff = _mm_add_epi16(tmp, qcoeff); tmp = _mm_mullo_epi16(qcoeff, *shift); tmp = _mm_srli_epi16(tmp, 14); tmp1 = _mm_mulhi_epi16(qcoeff, *shift); tmp1 = _mm_slli_epi16(tmp1, 2); *coeff = _mm_or_si128(tmp, tmp1); } static inline void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, const __m128i dequant, const __m128i zero, tran_low_t *dqcoeff) { // Un-sign to bias rounding like C. const __m128i coeff = _mm_abs_epi16(qcoeff); const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); const __m128i low = _mm_mullo_epi16(coeff, dequant); const __m128i high = _mm_mulhi_epi16(coeff, dequant); __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); // "Divide" by 4. dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); } void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i two = _mm_set1_epi16(2); int index; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, qcoeff0, qcoeff1; __m128i cmp_mask0, cmp_mask1, all_zero; __m128i eob = zero, eob0; (void)scan; (void)n_coeffs; // Setup global values. zbin = _mm_load_si128((const __m128i *)zbin_ptr); round = _mm_load_si128((const __m128i *)round_ptr); quant = _mm_load_si128((const __m128i *)quant_ptr); dequant = _mm_load_si128((const __m128i *)dequant_ptr); shift = _mm_load_si128((const __m128i *)quant_shift_ptr); // Shift with rounding. zbin = _mm_add_epi16(zbin, two); round = _mm_add_epi16(round, two); zbin = _mm_srli_epi16(zbin, 2); round = _mm_srli_epi16(round, 2); zbin = _mm_sub_epi16(zbin, one); // Do DC and first 15 AC. 
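// Note: the 64x64 quantizer runs at log_scale = 2, so zbin and round were
// shifted right by 2 with rounding above, calculate_qcoeff_64x64() keeps two
// extra bits of precision in its shift step, and
// calculate_dqcoeff_and_store_64x64() divides the dequantized value by 4.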
coeff0 = load_coefficients(coeff_ptr); coeff1 = load_coefficients(coeff_ptr + 8); qcoeff0 = _mm_abs_epi16(coeff0); qcoeff1 = _mm_abs_epi16(coeff1); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); zbin = _mm_unpackhi_epi64(zbin, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); dequant = _mm_unpackhi_epi64(dequant, dequant); } else { calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); round = _mm_unpackhi_epi64(round, round); quant = _mm_unpackhi_epi64(quant, quant); shift = _mm_unpackhi_epi64(shift, shift); calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); // Reinsert signs. qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); // Mask out zbin threshold coeffs. qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr); store_coefficients(qcoeff1, qcoeff_ptr + 8); calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
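// Note (hedged): n_coeffs is ignored and the loop below is hard-coded to 1024
// coefficients, on the assumption that a 64x64 transform keeps only its
// low-frequency 32x32 quadrant; any group of 16 coefficients that falls
// entirely below zbin is stored as zeros and skipped via `continue`.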
for (index = 16; index < 1024; index += 16) { coeff0 = load_coefficients(coeff_ptr + index); coeff1 = load_coefficients(coeff_ptr + index + 8); qcoeff0 = _mm_abs_epi16(coeff0); qcoeff1 = _mm_abs_epi16(coeff1); cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); if (_mm_movemask_epi8(all_zero) == 0) { _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); continue; } calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); store_coefficients(qcoeff0, qcoeff_ptr + index); store_coefficients(qcoeff1, qcoeff_ptr + index + 8); calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr + index); calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } aom-3.12.1/aom_dsp/x86/quantize_ssse3_x86_64.asm000066400000000000000000000325001477627663500211500ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FN 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan ; actual quantize loop - setup pointers, rounders, etc. 
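; Notes on the prologue below (descriptive only):
;  * zbin/round/quant/dequant/shift are loaded once; for the b_32x32 variant
;    zbin and round are pre-rounded to (x + 1) >> 1 for the scaled transform,
;    and zbin is biased by -1 so pcmpgtw implements ">= zbin".
;  * coeff/qcoeff/dqcoeff/iscan are rebased to the end of their arrays and
;    ncoeff is negated, so a single negative counter walks all of them
;    forward and the loops can terminate on jz/jl without extra compares.
;  * m5 is kept as a dedicated zero register except where it is briefly
;    reused as scratch for the 32x32 rounding of the quant-shift product.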
movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant %ifidn %1, b_32x32 pcmpeqw m5, m5 psrlw m5, 15 paddw m0, m5 paddw m1, m5 psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [dequantq] ; m3 = dequant mov r2, shiftmp psubw m0, [GLOBAL(pw_1)] mova m4, [r2] ; m4 = shift mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin paddsw m6, m1 ; m6 += round punpckhqdq m1, m1 paddsw m11, m1 ; m11 += round pmulhw m8, m6, m2 ; m8 = m6*q>>16 punpckhqdq m2, m2 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 %ifidn %1, b_32x32 pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh %endif pmulhw m8, m4 ; m8 = m8*qsh>>16 %ifidn %1, b_32x32 psllw m8, 1 psrlw m5, 15 por m8, m5 %endif punpckhqdq m4, m4 %ifidn %1, b_32x32 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 %ifidn %1, b_32x32 psllw m13, 1 psrlw m5, 15 por m13, m5 pxor m5, m5 ; reset m5 to zero register %endif psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 pand m13, m12 ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+ 0], m11 mova [qcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register %ifidn %1, b_32x32 pabsw m8, m8 pabsw m13, m13 %endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q %ifidn %1, b_32x32 psrlw m8, 1 psrlw m13, 1 psignw m8, m9 psignw m13, m10 %endif ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+ 0], m11 mova [dqcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m12 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 add ncoeffq, mmsize jz .accumulate_eob .ac_only_loop: ; pack coeff from 32bit to 16bit array mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw 
m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 pmovmskb r6d, m7 pmovmskb r2d, m12 or r6, r2 jz .skip_iter %endif paddsw m6, m1 ; m6 += round paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 %ifidn %1, b_32x32 pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh %endif pmulhw m14, m4 ; m14 = m14*qsh>>16 %ifidn %1, b_32x32 psllw m14, 1 psrlw m5, 15 por m14, m5 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 %ifidn %1, b_32x32 psllw m13, 1 psrlw m5, 15 por m13, m5 pxor m5, m5 ; reset m5 to zero register %endif psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 mova m6, m14 pcmpgtw m5, m14 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+ 0], m11 mova [qcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register %ifidn %1, b_32x32 pabsw m14, m14 pabsw m13, m13 %endif pmullw m14, m3 ; dqc[i] = qc[i] * q pmullw m13, m3 ; dqc[i] = qc[i] * q %ifidn %1, b_32x32 psrlw m14, 1 psrlw m13, 1 psignw m14, m9 psignw m13, m10 %endif ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m14 mova m6, m14 pcmpgtw m5, m14 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+ 0], m11 mova [dqcoeffq+ncoeffq*4+16], m6 pxor m5, m5 mova m11, m13 mova m6, m13 pcmpgtw m5, m13 punpcklwd m11, m5 punpckhwd m6, m5 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m12 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: mova [qcoeffq+ncoeffq*4+ 0], m5 mova [qcoeffq+ncoeffq*4+16], m5 mova [qcoeffq+ncoeffq*4+32], m5 mova [qcoeffq+ncoeffq*4+48], m5 mova [dqcoeffq+ncoeffq*4+ 0], m5 mova [dqcoeffq+ncoeffq*4+16], m5 mova [dqcoeffq+ncoeffq*4+32], m5 mova [dqcoeffq+ncoeffq*4+48], m5 add ncoeffq, mmsize jl .ac_only_loop %endif .accumulate_eob: ; horizontally accumulate/max eobs and write into [eob] memory pointer mov r2, eobmp pshufd m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 mov [r2], r6 RET %endmacro INIT_XMM ssse3 QUANTIZE_FN b, 9 QUANTIZE_FN b_32x32, 9 aom-3.12.1/aom_dsp/x86/quantize_x86.h000066400000000000000000000204521477627663500171710ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "aom/aom_integer.h" static inline void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, const int16_t *quant_ptr, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, const int16_t *shift_ptr, __m128i *shift) { *zbin = _mm_load_si128((const __m128i *)zbin_ptr); *round = _mm_load_si128((const __m128i *)round_ptr); *quant = _mm_load_si128((const __m128i *)quant_ptr); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); *shift = _mm_load_si128((const __m128i *)shift_ptr); } // With ssse3 and later abs() and sign() are preferred. static inline __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi16(a, sign); } static inline __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); return _mm_sub_epi32(a, sign); } static inline void calculate_qcoeff(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i shift) { __m128i tmp, qcoeff; qcoeff = _mm_adds_epi16(*coeff, round); tmp = _mm_mulhi_epi16(qcoeff, quant); qcoeff = _mm_add_epi16(tmp, qcoeff); *coeff = _mm_mulhi_epi16(qcoeff, shift); } static inline void calculate_qcoeff_log_scale(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i *shift, const int *log_scale) { __m128i tmp, tmp1, qcoeff; qcoeff = _mm_adds_epi16(*coeff, round); tmp = _mm_mulhi_epi16(qcoeff, quant); qcoeff = _mm_add_epi16(tmp, qcoeff); tmp = _mm_mullo_epi16(qcoeff, *shift); tmp = _mm_srli_epi16(tmp, (16 - *log_scale)); tmp1 = _mm_mulhi_epi16(qcoeff, *shift); tmp1 = _mm_slli_epi16(tmp1, *log_scale); *coeff = _mm_or_si128(tmp, tmp1); } static inline __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } static inline void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, __m128i dequant, const __m128i zero, tran_low_t *dqcoeff, const int *log_scale) { // calculate abs __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15); __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign); const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero); const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero); const __m128i low = _mm_mullo_epi16(coeff, dequant); const __m128i high = _mm_mulhi_epi16(coeff, dequant); __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale); dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale); dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0); dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1); _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); } // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing // to zbin to add 1 to the index in 'scan'. 
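// Coefficients that quantized to zero are cleared with andnot so they never
// contribute; the caller keeps a running vector maximum of these (iscan + 1)
// values, which accumulate_eob() later reduces to the scalar eob.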
static inline __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, const __m128i zbin_mask0, const __m128i zbin_mask1, const int16_t *scan_ptr, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); __m128i eob0, eob1; // Add one to convert from indices to counts scan0 = _mm_sub_epi16(scan0, zbin_mask0); scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); } static inline int16_t accumulate_eob(__m128i eob) { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } static inline __m128i load_coefficients(const tran_low_t *coeff_ptr) { assert(sizeof(tran_low_t) == 4); const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); return _mm_packs_epi32(coeff1, coeff2); } static inline void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { assert(sizeof(tran_low_t) == 4); __m128i one = _mm_set1_epi16(1); __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); } static inline void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i all_zero; __m128i temp_mask = _mm_setzero_si128(); all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); if (_mm_movemask_epi8(all_zero)) { __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); temp_mask = _mm_max_epi16(mask0, mask1); *is_found = 1; } *mask = _mm_max_epi16(temp_mask, *mask); } static inline void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, __m128i *threshold, const int16_t *iscan_ptr, int *is_found, __m128i *mask) { __m128i zero = _mm_setzero_si128(); __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]); cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3); update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask); } static inline int 
calculate_non_zero_count(__m128i mask) { __m128i mask0, mask1; int non_zero_count = 0; mask0 = _mm_unpackhi_epi64(mask, mask); mask1 = _mm_max_epi16(mask0, mask); mask0 = _mm_shuffle_epi32(mask1, 1); mask0 = _mm_max_epi16(mask0, mask1); mask1 = _mm_srli_epi32(mask0, 16); mask0 = _mm_max_epi16(mask0, mask1); non_zero_count = _mm_extract_epi16(mask0, 0) + 1; return non_zero_count; } aom-3.12.1/aom_dsp/x86/sad4d_avx2.c000066400000000000000000000305201477627663500165530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // AVX2 #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms_avx2.h" static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4], const __m256i *sum_ref0, const __m256i *sum_ref1, const __m256i *sum_ref2, const __m256i *sum_ref3) { // In sum_ref-i the result is saved in the first 4 bytes and the other 4 // bytes are zeroed. // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 // 0, 0, 1, 1 __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1), _MM_SHUFFLE(2, 0, 2, 0))); // 2, 2, 3, 3 __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3), _MM_SHUFFLE(2, 0, 2, 0))); // sum adjacent 32 bit integers __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23); // add the low 128 bit to the high 128 bit __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123), _mm256_extractf128_si256(sum_ref0123, 1)); _mm_storeu_si128((__m128i *)(res), sum); } static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2( int M, int N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; int i, j; const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; ref3 = ref[3]; sum_ref0 = _mm256_setzero_si256(); sum_ref2 = _mm256_setzero_si256(); sum_ref1 = _mm256_setzero_si256(); sum_ref3 = _mm256_setzero_si256(); for (i = 0; i < N; i++) { for (j = 0; j < M; j += 32) { // load src and all refs src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j)); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); // sum every ref-i sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); } src += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; ref3 += ref_stride; } 
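// Each sum_ref accumulator now holds four 64-bit partial SADs produced by
// vpsadbw (one per 8-byte group of each 32-byte chunk). The helper below
// gathers the low 32 bits of those lanes from all four accumulators, adds
// them pairwise with hadd, folds the two 128-bit halves together and writes
// the four totals to res[0..3].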
aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3); } static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2( int M, int N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg; __m256i sum_ref0, sum_ref1, sum_ref2; int i, j; const uint8_t *ref0, *ref1, *ref2; const __m256i zero = _mm256_setzero_si256(); ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; sum_ref0 = _mm256_setzero_si256(); sum_ref2 = _mm256_setzero_si256(); sum_ref1 = _mm256_setzero_si256(); for (i = 0; i < N; i++) { for (j = 0; j < M; j += 32) { // load src and all refs src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); // sum every ref-i sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); } src += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; } aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero); } #define SADMXN_AVX2(m, n) \ void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ } \ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ } SADMXN_AVX2(32, 16) SADMXN_AVX2(32, 32) SADMXN_AVX2(32, 64) SADMXN_AVX2(64, 32) SADMXN_AVX2(64, 64) SADMXN_AVX2(64, 128) SADMXN_AVX2(128, 64) SADMXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY SADMXN_AVX2(32, 8) SADMXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY #define SAD_SKIP_MXN_AVX2(m, n) \ void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref, \ 2 * ref_stride, res); \ res[0] <<= 1; \ res[1] <<= 1; \ res[2] <<= 1; \ res[3] <<= 1; \ } SAD_SKIP_MXN_AVX2(32, 16) SAD_SKIP_MXN_AVX2(32, 32) SAD_SKIP_MXN_AVX2(32, 64) SAD_SKIP_MXN_AVX2(64, 32) SAD_SKIP_MXN_AVX2(64, 64) SAD_SKIP_MXN_AVX2(64, 128) SAD_SKIP_MXN_AVX2(128, 64) SAD_SKIP_MXN_AVX2(128, 128) #if !CONFIG_REALTIME_ONLY SAD_SKIP_MXN_AVX2(64, 16) #endif // !CONFIG_REALTIME_ONLY static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg; __m256i sum_ref0, sum_ref1, sum_ref2; const uint8_t *ref0, *ref1, *ref2; const __m256i zero = _mm256_setzero_si256(); assert(N % 2 == 0); ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; sum_ref0 = _mm256_setzero_si256(); sum_ref2 = _mm256_setzero_si256(); sum_ref1 = _mm256_setzero_si256(); for (int i = 0; i < N; i += 2) { // load src and all refs src_reg = yy_loadu2_128(src + src_stride, src); ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0); ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1); ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2); // sum of the absolute differences between every ref-i to src 
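// (vpsadbw leaves one per-8-byte-group sum, zero-extended, in each 64-bit
// lane; accumulating those lanes with 32-bit adds below stays well within
// range for the block heights handled here.)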
ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); // sum every ref-i sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); src += 2 * src_stride; ref0 += 2 * ref_stride; ref1 += 2 * ref_stride; ref2 += 2 * ref_stride; } aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero); } static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; const uint8_t *ref0, *ref1, *ref2, *ref3; assert(N % 2 == 0); ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; ref3 = ref[3]; sum_ref0 = _mm256_setzero_si256(); sum_ref2 = _mm256_setzero_si256(); sum_ref1 = _mm256_setzero_si256(); sum_ref3 = _mm256_setzero_si256(); for (int i = 0; i < N; i += 2) { // load src and all refs src_reg = yy_loadu2_128(src + src_stride, src); ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0); ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1); ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2); ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3); // sum of the absolute differences between every ref-i to src ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); // sum every ref-i sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); src += 2 * src_stride; ref0 += 2 * ref_stride; ref1 += 2 * ref_stride; ref2 += 2 * ref_stride; ref3 += 2 * ref_stride; } aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3); } #define SAD16XNX3_AVX2(n) \ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \ } #define SAD16XNX4_AVX2(n) \ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], int ref_stride, \ uint32_t res[4]) { \ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \ } SAD16XNX4_AVX2(32) SAD16XNX4_AVX2(16) SAD16XNX4_AVX2(8) SAD16XNX3_AVX2(32) SAD16XNX3_AVX2(16) SAD16XNX3_AVX2(8) #if !CONFIG_REALTIME_ONLY SAD16XNX3_AVX2(64) SAD16XNX3_AVX2(4) SAD16XNX4_AVX2(64) SAD16XNX4_AVX2(4) #endif // !CONFIG_REALTIME_ONLY #define SAD_SKIP_16XN_AVX2(n) \ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref[4], \ int ref_stride, uint32_t res[4]) { \ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \ res); \ res[0] <<= 1; \ res[1] <<= 1; \ res[2] <<= 1; \ res[3] <<= 1; \ } SAD_SKIP_16XN_AVX2(32) SAD_SKIP_16XN_AVX2(16) #if !CONFIG_REALTIME_ONLY SAD_SKIP_16XN_AVX2(64) #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/aom_dsp/x86/sad4d_sse2.asm000066400000000000000000000300331477627663500171040ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION .text ; 'spill_src_stride' affect a lot how the code works. ; ; When 'spill_src_stride' is false, the 'src_strideq' resides in ; register, [srcq + src_strideq + offset] is allowed, so we can simply ; use such form to access src memory and don't bother to update 'srcq' ; at each line. We only update 'srcq' each two-lines using a compact ; LEA instruction like [srcq+src_strideq*2]. ; ; When 'spill_src_stride' is true, the 'src_strideq' resides in memory. ; we cannot use above form to access memory, we have to update ; 'srcq' at each line break. As we process two parts (first,second) ; together in each macro function, the second part may also sit ; in the next line, which means we also need to possibly add ; one 'src_strideq' to 'srcq' before processing second part. %macro HANDLE_SECOND_OFFSET 0 %if spill_src_stride %define second_offset 0 add srcq, src_strideq %else %define second_offset (src_strideq) %endif %endmacro ; This is specically designed to handle when src_strideq is a ; memory position, under such case, we can not accomplish ; complex address calculation using LEA, and fall back to ; using simple ADD instruction at each line ending. %macro ADVANCE_END_OF_TWO_LINES 0 %if spill_src_stride add srcq, src_strideq %else lea srcq, [srcq+src_strideq*2] %endif ; note: ref_stride is never spilled when processing two lines lea ref1q, [ref1q+ref_strideq*2] lea ref2q, [ref2q+ref_strideq*2] lea ref3q, [ref3q+ref_strideq*2] lea ref4q, [ref4q+ref_strideq*2] %endmacro ; PROCESS_4x2x4 first %macro PROCESS_4x2x4 1 movd m0, [srcq] HANDLE_SECOND_OFFSET %if %1 == 1 movd m6, [ref1q] movd m4, [ref2q] movd m7, [ref3q] movd m5, [ref4q] movd m1, [srcq + second_offset] movd m2, [ref1q+ref_strideq] punpckldq m0, m1 punpckldq m6, m2 movd m1, [ref2q+ref_strideq] movd m2, [ref3q+ref_strideq] movd m3, [ref4q+ref_strideq] punpckldq m4, m1 punpckldq m7, m2 punpckldq m5, m3 movlhps m0, m0 movlhps m6, m4 movlhps m7, m5 psadbw m6, m0 psadbw m7, m0 %else movd m1, [ref1q] movd m5, [ref1q+ref_strideq] movd m2, [ref2q] movd m4, [ref2q+ref_strideq] punpckldq m1, m5 punpckldq m2, m4 movd m3, [ref3q] movd m5, [ref3q+ref_strideq] punpckldq m3, m5 movd m4, [ref4q] movd m5, [ref4q+ref_strideq] punpckldq m4, m5 movd m5, [srcq + second_offset] punpckldq m0, m5 movlhps m0, m0 movlhps m1, m2 movlhps m3, m4 psadbw m1, m0 psadbw m3, m0 paddd m6, m1 paddd m7, m3 %endif %endmacro ; PROCESS_8x2x4 first %macro PROCESS_8x2x4 1 movh m0, [srcq] HANDLE_SECOND_OFFSET %if %1 == 1 movh m4, [ref1q] movh m5, [ref2q] movh m6, [ref3q] movh m7, [ref4q] movhps m0, [srcq + second_offset] movhps m4, [ref1q+ref_strideq] movhps m5, [ref2q+ref_strideq] movhps m6, [ref3q+ref_strideq] movhps m7, [ref4q+ref_strideq] psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 psadbw m7, m0 %else movh m1, [ref1q] movh m2, [ref2q] movhps m0, [srcq + second_offset] movhps m1, [ref1q+ref_strideq] movhps m2, [ref2q+ref_strideq] psadbw m1, m0 psadbw m2, m0 paddd m4, m1 paddd m5, m2 movh m1, [ref3q] movhps m1, [ref3q+ref_strideq] movh m2, [ref4q] movhps m2, [ref4q+ref_strideq] psadbw m1, m0 psadbw m2, m0 paddd m6, m1 paddd m7, m2 %endif %endmacro ; PROCESS_FIRST_MMSIZE %macro PROCESS_FIRST_MMSIZE 0 mova m0, [srcq] 
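; The aligned source row in m0 is compared against one unaligned row from
; each of the four references; psadbw leaves per-8-byte-group sums in the
; 64-bit lanes of m4-m7, which then act as the running accumulators for the
; remaining rows of the block.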
movu m4, [ref1q] movu m5, [ref2q] movu m6, [ref3q] movu m7, [ref4q] psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 psadbw m7, m0 %endmacro ; PROCESS_16x1x4 offset %macro PROCESS_16x1x4 1 mova m0, [srcq + %1] movu m1, [ref1q + ref_offsetq + %1] movu m2, [ref2q + ref_offsetq + %1] psadbw m1, m0 psadbw m2, m0 paddd m4, m1 paddd m5, m2 movu m1, [ref3q + ref_offsetq + %1] movu m2, [ref4q + ref_offsetq + %1] psadbw m1, m0 psadbw m2, m0 paddd m6, m1 paddd m7, m2 %endmacro ; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; Macro Arguments: ; 1: Width ; 2: Height ; 3: If 0, then normal sad, else skip rows %macro SADNXN4D 2-3 0 %define spill_src_stride 0 %define spill_ref_stride 0 %define spill_cnt 0 ; Whether a shared offset should be used instead of adding strides to ; each reference array. With this option, only one line will be processed ; per loop iteration. %define use_ref_offset (%1 >= mmsize) ; Remove loops in the 4x4 and 8x4 case %define use_loop (use_ref_offset || %2 > 4) %if %3 == 1 ; skip rows %if AOM_ARCH_X86_64 %if use_ref_offset cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \ ref2, ref3, ref4, cnt, ref_offset %elif use_loop cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \ ref2, ref3, ref4, cnt %else cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \ ref2, ref3, ref4 %endif %else %if use_ref_offset cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \ ref4 %define spill_src_stride 1 %define spill_ref_stride 1 %elif use_loop cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \ ref3, ref4 %define spill_src_stride 1 %else cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \ ref3, ref4 %endif %endif %else ; normal sad %if AOM_ARCH_X86_64 %if use_ref_offset cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \ ref3, ref4, cnt, ref_offset %elif use_loop cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \ ref3, ref4, cnt %else cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \ ref3, ref4 %endif %else %if use_ref_offset cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4 %define spill_src_stride 1 %define spill_ref_stride 1 %elif use_loop cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4 %define spill_src_stride 1 %else cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \ ref4 %endif %endif %endif %if spill_src_stride %define src_strideq r1mp %define src_strided r1mp %endif %if spill_ref_stride %define ref_strideq r3mp %define ref_strided r3mp %endif %if spill_cnt SUB rsp, 4 %define cntd word [rsp] %endif %if %3 == 1 sal src_strided, 1 sal ref_strided, 1 %endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] mov ref3q, [ref1q+gprsize*2] mov ref4q, [ref1q+gprsize*3] mov ref1q, [ref1q+gprsize*0] ; Is the loop for this wxh in another function? 
; If so, we jump into that function for the loop and returning %define external_loop (use_ref_offset && %1 > mmsize && %1 != %2) %if use_ref_offset PROCESS_FIRST_MMSIZE %if %1 > mmsize mov ref_offsetq, 0 mov cntd, %2 >> %3 ; Jump part way into the loop for the square version of this width %if %3 == 1 jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop %else jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop %endif %else mov ref_offsetq, ref_strideq add srcq, src_strideq mov cntd, (%2 >> %3) - 1 %endif %if external_loop == 0 .loop: ; Unrolled horizontal loop %assign h_offset 0 %rep %1/mmsize PROCESS_16x1x4 h_offset %if h_offset == 0 ; The first row of the first column is done outside the loop and jumps here .midloop: %endif %assign h_offset h_offset+mmsize %endrep add srcq, src_strideq add ref_offsetq, ref_strideq sub cntd, 1 jnz .loop %endif %else PROCESS_%1x2x4 1 ADVANCE_END_OF_TWO_LINES %if use_loop mov cntd, (%2/2 >> %3) - 1 .loop: %endif PROCESS_%1x2x4 0 %if use_loop ADVANCE_END_OF_TWO_LINES sub cntd, 1 jnz .loop %endif %endif %if spill_cnt ; Undo stack allocation for cnt ADD rsp, 4 %endif %if external_loop == 0 %if %3 == 0 %define resultq r4 %define resultmp r4mp %endif ; Undo modifications on parameters on the stack %if %3 == 1 %if spill_src_stride shr src_strided, 1 %endif %if spill_ref_stride shr ref_strided, 1 %endif %endif %if %1 > 4 pslldq m5, 4 pslldq m7, 4 por m4, m5 por m6, m7 mova m5, m4 mova m7, m6 punpcklqdq m4, m6 punpckhqdq m5, m7 paddd m4, m5 %if %3 == 1 pslld m4, 1 %endif movifnidn resultq, resultmp movu [resultq], m4 RET %else pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 %if %3 == 1 pslld m6, 1 pslld m7, 1 %endif movifnidn resultq, resultmp movq [resultq+0], m6 movq [resultq+8], m7 RET %endif %endif ; external_loop == 0 %endmacro INIT_XMM sse2 SADNXN4D 128, 128 SADNXN4D 128, 64 SADNXN4D 64, 128 SADNXN4D 64, 64 SADNXN4D 64, 32 SADNXN4D 32, 64 SADNXN4D 32, 32 SADNXN4D 32, 16 SADNXN4D 16, 32 SADNXN4D 16, 16 SADNXN4D 16, 8 SADNXN4D 8, 16 SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 %if CONFIG_REALTIME_ONLY==0 SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 SADNXN4D 16, 64 SADNXN4D 64, 16 %endif SADNXN4D 128, 128, 1 SADNXN4D 128, 64, 1 SADNXN4D 64, 128, 1 SADNXN4D 64, 64, 1 SADNXN4D 64, 32, 1 SADNXN4D 32, 64, 1 SADNXN4D 32, 32, 1 SADNXN4D 32, 16, 1 SADNXN4D 16, 32, 1 SADNXN4D 16, 16, 1 SADNXN4D 8, 16, 1 %if CONFIG_REALTIME_ONLY==0 SADNXN4D 4, 16, 1 SADNXN4D 8, 32, 1 SADNXN4D 16, 64, 1 SADNXN4D 64, 16, 1 %endif aom-3.12.1/aom_dsp/x86/sad_avx2.c000066400000000000000000000252551477627663500163340ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" static inline unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { int i; __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; __m256i sum_sad = _mm256_setzero_si256(); __m256i sum_sad_h; __m128i sum_sad128; for (i = 0; i < h; i++) { ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); sad1_reg = _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); sad2_reg = _mm256_sad_epu8( ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); ref_ptr += ref_stride; src_ptr += src_stride; } sum_sad_h = _mm256_srli_si256(sum_sad, 8); sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); _mm256_zeroupper(); return res; } static inline unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int h) { int i; __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; __m256i sum_sad = _mm256_setzero_si256(); __m256i sum_sad_h; __m128i sum_sad128; int ref2_stride = ref_stride << 1; int src2_stride = src_stride << 1; int max = h >> 1; for (i = 0; i < max; i++) { ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); sad1_reg = _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); sad2_reg = _mm256_sad_epu8( ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); ref_ptr += ref2_stride; src_ptr += src2_stride; } sum_sad_h = _mm256_srli_si256(sum_sad, 8); sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); _mm256_zeroupper(); return res; } #define FSAD64_H(h) \ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ } #define FSADS64_H(h) \ unsigned int aom_sad_skip_64x##h##_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ h / 2); \ } #define FSAD32_H(h) \ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ } #define FSADS32_H(h) \ unsigned int aom_sad_skip_32x##h##_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ h / 2); \ } #define FSAD64 \ FSAD64_H(64) \ FSAD64_H(32) \ FSADS64_H(64) \ FSADS64_H(32) #define FSAD32 \ FSAD32_H(64) \ FSAD32_H(32) \ FSAD32_H(16) \ FSADS32_H(64) \ FSADS32_H(32) \ FSADS32_H(16) /* clang-format off */ FSAD64 FSAD32 /* clang-format on */ #undef FSAD64 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H #define FSADAVG64_H(h) \ unsigned int aom_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ __m128i sum_sad128; \ for (i = 0; i < h; i++) { \ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ ref1_reg = _mm256_avg_epu8( \ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ ref2_reg = _mm256_avg_epu8( \ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ sad1_reg = _mm256_sad_epu8( \ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ sad2_reg = _mm256_sad_epu8( \ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ sum_sad = \ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ ref_ptr += ref_stride; \ src_ptr += src_stride; \ second_pred += 64; \ } \ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ _mm256_zeroupper(); \ return res; \ } #define FSADAVG32_H(h) \ unsigned int aom_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ __m128i sum_sad128; \ int ref2_stride = ref_stride << 1; \ int src2_stride = src_stride << 1; \ int max = h >> 1; \ for (i = 0; i < max; i++) { \ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ ref1_reg = _mm256_avg_epu8( \ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ ref2_reg = _mm256_avg_epu8( \ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ sad1_reg = _mm256_sad_epu8( \ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ sad2_reg = _mm256_sad_epu8( \ ref2_reg, \ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ sum_sad = \ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ ref_ptr += ref2_stride; \ src_ptr += src2_stride; \ second_pred += 64; \ } \ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ _mm256_zeroupper(); \ return res; \ } #define FSADAVG64 \ FSADAVG64_H(64) \ FSADAVG64_H(32) #define FSADAVG32 \ FSADAVG32_H(64) \ FSADAVG32_H(32) \ FSADAVG32_H(16) /* clang-format off */ FSADAVG64 FSADAVG32 /* clang-format on */ #undef FSADAVG64 #undef FSADAVG32 #undef FSADAVG64_H #undef FSADAVG32_H aom-3.12.1/aom_dsp/x86/sad_impl_avx2.c000066400000000000000000000165201477627663500173500ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { __m256i s1, s2, r1, r2; __m256i sum = _mm256_setzero_si256(); __m128i sum_i128; int i; for (i = 0; i < 16; ++i) { r1 = _mm256_loadu_si256((__m256i const *)ref_ptr); r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr)); s2 = _mm256_sad_epu8( r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2)); ref_ptr += ref_stride << 1; src_ptr += src_stride << 1; } sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8)); sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1), _mm256_castsi256_si128(sum)); return (unsigned int)_mm_cvtsi128_si32(sum_i128); } static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { unsigned int half_width = 32; uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); src_ptr += half_width; ref_ptr += half_width; sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); return sum; } static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); src_ptr += src_stride << 5; ref_ptr += ref_stride << 5; sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); return sum; } unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { unsigned int half_width = 64; uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); src_ptr += half_width; ref_ptr += half_width; sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); return sum; } unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); src_ptr += src_stride << 6; ref_ptr += ref_stride << 6; sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); return sum; } unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); src_ptr += src_stride << 6; ref_ptr += ref_stride << 6; sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); return sum; } unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint32_t half_width = 64; uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2); src_ptr += half_width; ref_ptr += half_width; sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2); return 2 * sum; } unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint32_t sum = sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride); return 2 * sum; } unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint32_t sum = aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride); return 2 * sum; } static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, const int h, const uint8_t *second_pred, const int second_pred_stride) { int i; __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; __m256i sum_sad = _mm256_setzero_si256(); __m256i sum_sad_h; __m128i sum_sad128; for (i = 0; i < h; i++) { ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); ref1_reg = _mm256_avg_epu8( ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); ref2_reg = _mm256_avg_epu8( ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); sad1_reg = _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); sad2_reg = _mm256_sad_epu8( ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); ref_ptr += ref_stride; src_ptr += src_stride; second_pred += second_pred_stride; } sum_sad_h = _mm256_srli_si256(sum_sad, 8); sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); return (unsigned int)_mm_cvtsi128_si32(sum_sad128); } unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, second_pred, 64); src_ptr += src_stride << 6; ref_ptr += ref_stride << 6; second_pred += 64 << 6; sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, second_pred, 64); return sum; } unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { unsigned int half_width = 64; uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, second_pred, 128); src_ptr += half_width; ref_ptr += half_width; second_pred += half_width; sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, second_pred, 128); return sum; } unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, second_pred); src_ptr += src_stride << 6; ref_ptr += ref_stride << 6; second_pred += 128 << 6; sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, second_pred); return sum; } aom-3.12.1/aom_dsp/x86/sad_sse2.asm000066400000000000000000000315721477627663500166650ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
; ; %include "third_party/x86inc/x86inc.asm" SECTION .text ; Macro Arguments ; Arg 1: Width ; Arg 2: Height ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 %if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %elif %4 == 2 ; skip %if %3 == 5 cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if AOM_ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %endif ; sad/avg/skip %if %4 == 2; skip rows so double the stride lea src_strided, [src_strided*2] lea ref_strided, [ref_strided*2] %endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 %endmacro ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 SAD_FN 128, %1, 5, %2 %if %2 == 2 mov n_rowsd, %1/2 %else mov n_rowsd, %1 %endif pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [refq+64] movu m2, [refq+80] movu m3, [refq+96] movu m4, [refq+112] %if %2 == 1 pavgb m1, [second_predq+mmsize*4] pavgb m2, [second_predq+mmsize*5] pavgb m3, [second_predq+mmsize*6] pavgb m4, [second_predq+mmsize*7] lea second_predq, [second_predq+mmsize*8] %endif psadbw m1, [srcq+64] psadbw m2, [srcq+80] psadbw m3, [srcq+96] psadbw m4, [srcq+112] add refq, ref_strideq add srcq, src_strideq paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 sub n_rowsd, 1 jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD128XN 128 ; sad128x128_sse2 SAD128XN 128, 1 ; sad128x128_avg_sse2 SAD128XN 128, 2 ; sad_skip_128x128_sse2 SAD128XN 64 ; sad128x64_sse2 SAD128XN 64, 1 ; sad128x64_avg_sse2 SAD128XN 64, 2 ; sad_skip_128x64_sse2 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 %if %2 == 2 mov n_rowsd, %1/2 %else mov n_rowsd, %1 %endif pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 add refq, ref_strideq paddd m0, m1 add srcq, src_strideq paddd m0, m3 dec 
n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD64XN 128 ; sad64x128_sse2 SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 128, 1 ; sad64x128_avg_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 SAD64XN 128, 2 ; sad_skip_64x128_sse2 SAD64XN 64, 2 ; sad_skip_64x64_sse2 SAD64XN 32, 2 ; sad_skip_64x32_sse2 %if CONFIG_REALTIME_ONLY==0 SAD64XN 16 ; sad64x16_sse2 SAD64XN 16, 1 ; sad64x16_avg_sse2 SAD64XN 16, 2 ; sad_skip_64x16_sse2 %endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 %if %2 == 2 mov n_rowsd, %1/4 %else mov n_rowsd, %1/2 %endif pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+16] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+src_strideq] psadbw m4, [srcq+src_strideq+16] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*2] paddd m0, m1 lea srcq, [srcq+src_strideq*2] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD32XN 64 ; sad32x64_sse2 SAD32XN 32 ; sad32x32_sse2 SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 SAD32XN 64, 2 ; sad_skip_32x64_sse2 SAD32XN 32, 2 ; sad_skip_32x32_sse2 SAD32XN 16, 2 ; sad_skip_32x16_sse2 %if CONFIG_REALTIME_ONLY==0 SAD32XN 8 ; sad32x8_sse2 SAD32XN 8, 1 ; sad32x8_avg_sse2 %endif ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 %if %2 == 2 mov n_rowsd, %1/8 %else mov n_rowsd, %1/4 %endif pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] psadbw m4, [srcq+src_stride3q] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD16XN 32 ; sad16x32_sse2 SAD16XN 16 ; sad16x16_sse2 SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 SAD16XN 32, 2 ; sad_skip_16x32_sse2 SAD16XN 16, 2 ; sad_skip_16x16_sse2 %if CONFIG_REALTIME_ONLY==0 SAD16XN 64 ; sad16x64_sse2 SAD16XN 4 ; sad16x4_sse2 SAD16XN 64, 1 ; sad16x64_avg_sse2 SAD16XN 64, 2 ; sad_skip_16x64_sse2 %endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 %if %2 == 2 mov n_rowsd, %1/8 %else mov n_rowsd, %1/4 %endif pxor m0, m0 .loop: movh m1, [refq] movhps m1, [refq+ref_strideq] movh m2, [refq+ref_strideq*2] movhps m2, [refq+ref_stride3q] %if %2 == 1 pavgb m1, 
[second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] lea second_predq, [second_predq+mmsize*2] %endif movh m3, [srcq] movhps m3, [srcq+src_strideq] movh m4, [srcq+src_strideq*2] movhps m4, [srcq+src_stride3q] psadbw m1, m3 psadbw m2, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m2 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 16, 2 ; sad_skip_8x16_sse2 SAD8XN 8, 2 ; sad_skip_8x8_sse2 %if CONFIG_REALTIME_ONLY==0 SAD8XN 32 ; sad8x32_sse2 SAD8XN 32, 1 ; sad8x32_avg_sse2 SAD8XN 32, 2 ; sad_skip_8x32_sse2 %endif ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 %if %2 == 2 mov n_rowsd, %1/8 %else mov n_rowsd, %1/4 %endif pxor m0, m0 .loop: movd m1, [refq] movd m2, [refq+ref_strideq] movd m3, [refq+ref_strideq*2] movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 movlhps m1, m3 %if %2 == 1 pavgb m1, [second_predq+mmsize*0] lea second_predq, [second_predq+mmsize*1] %endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] movd m3, [srcq+src_stride3q] punpckldq m2, m5 punpckldq m4, m3 movlhps m2, m4 psadbw m1, m2 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 %if %2 == 2 ; we skipped rows, so now we need to double the sad pslld m0, 1 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 SAD4XN 8 ; sad4x8_sse2 SAD4XN 4 ; sad4x4_sse2 %if CONFIG_REALTIME_ONLY==0 SAD4XN 16 ; sad4x16_sse2 SAD4XN 16, 2 ; sad_skip_4x16_sse2 %endif aom-3.12.1/aom_dsp/x86/sse_avx2.c000066400000000000000000000334521477627663500163550ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" static inline void sse_w32_avx2(__m256i *sum, const uint8_t *a, const uint8_t *b) { const __m256i v_a0 = yy_loadu_256(a); const __m256i v_b0 = yy_loadu_256(b); const __m256i zero = _mm256_setzero_si256(); const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); } static inline int64_t summary_all_avx2(const __m256i *sum_all) { int64_t sum; __m256i zero = _mm256_setzero_si256(); const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); xx_storel_64(&sum, sum_1x64); return sum; } #if CONFIG_AV1_HIGHBITDEPTH static inline void summary_32_avx2(const __m256i *sum32, __m256i *sum) { const __m256i sum0_4x64 = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); const __m256i sum1_4x64 = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); *sum = _mm256_add_epi64(*sum, sum_4x64); } static inline int64_t summary_4x64_avx2(const __m256i sum_4x64) { int64_t sum; const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); xx_storel_64(&sum, sum_1x64); return sum; } #endif static inline void sse_w4x4_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m256i *sum) { const __m128i v_a0 = xx_loadl_32(a); const __m128i v_a1 = xx_loadl_32(a + a_stride); const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); const __m128i v_b0 = xx_loadl_32(b); const __m128i v_b1 = xx_loadl_32(b + b_stride); const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } static inline void sse_w8x2_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m256i *sum) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_a1 = xx_loadl_64(a + a_stride); const __m128i v_b0 = xx_loadl_64(b); const __m128i v_b1 = xx_loadl_64(b + b_stride); const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = 
_mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; __m256i sum = _mm256_setzero_si256(); __m256i zero = _mm256_setzero_si256(); switch (width) { case 4: do { sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); a += a_stride << 2; b += b_stride << 2; y += 4; } while (y < height); sse = summary_all_avx2(&sum); break; case 8: do { sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b += b_stride << 1; y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; case 16: do { const __m128i v_a0 = xx_loadu_128(a); const __m128i v_a1 = xx_loadu_128(a + a_stride); const __m128i v_b0 = xx_loadu_128(b); const __m128i v_b1 = xx_loadu_128(b + b_stride); const __m256i v_a = _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); const __m256i v_b = _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); const __m256i temp = _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), _mm256_madd_epi16(v_bsub, v_bsub)); sum = _mm256_add_epi32(sum, temp); a += a_stride << 1; b += b_stride << 1; y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; case 32: do { sse_w32_avx2(&sum, a, b); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_avx2(&sum); break; case 64: do { sse_w32_avx2(&sum, a, b); sse_w32_avx2(&sum, a + 32, b + 32); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_avx2(&sum); break; case 128: do { sse_w32_avx2(&sum, a, b); sse_w32_avx2(&sum, a + 32, b + 32); sse_w32_avx2(&sum, a + 64, b + 64); sse_w32_avx2(&sum, a + 96, b + 96); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_avx2(&sum); break; default: if ((width & 0x07) == 0) { do { int i = 0; do { sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); i += 8; } while (i < width); a += a_stride << 1; b += b_stride << 1; y += 2; } while (y < height); } else { do { int i = 0; do { sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); const uint8_t *a2 = a + i + (a_stride << 1); const uint8_t *b2 = b + i + (b_stride << 1); sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); i += 8; } while (i + 4 < width); sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); a += a_stride << 2; b += b_stride << 2; y += 4; } while (y < height); } sse = summary_all_avx2(&sum); break; } return sse; } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, const uint16_t *b) { const __m256i v_a_w = yy_loadu_256(a); const __m256i v_b_w = yy_loadu_256(b); const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } static inline void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_a1 = xx_loadl_64(a + a_stride); const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); const __m128i v_b0 = xx_loadl_64(b); const __m128i v_b1 = xx_loadl_64(b + b_stride); const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); const __m128i v_b3 
= xx_loadl_64(b + b_stride * 3); const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), _mm_unpacklo_epi64(v_a2, v_a3)); const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), _mm_unpacklo_epi64(v_b2, v_b3)); const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } static inline void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); __m256i sum = _mm256_setzero_si256(); switch (width) { case 4: do { highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 2; b += b_stride << 2; y += 4; } while (y < height); sse = summary_all_avx2(&sum); break; case 8: do { highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; case 16: do { highbd_sse_w16_avx2(&sum, a, b); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_avx2(&sum); break; case 32: do { int l = 0; __m256i sum32 = _mm256_setzero_si256(); do { highbd_sse_w16_avx2(&sum32, a, b); highbd_sse_w16_avx2(&sum32, a + 16, b + 16); a += a_stride; b += b_stride; l += 1; } while (l < 64 && l < (height - y)); summary_32_avx2(&sum32, &sum); y += 64; } while (y < height); sse = summary_4x64_avx2(sum); break; case 64: do { int l = 0; __m256i sum32 = _mm256_setzero_si256(); do { highbd_sse_w16_avx2(&sum32, a, b); highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); a += a_stride; b += b_stride; l += 1; } while (l < 32 && l < (height - y)); summary_32_avx2(&sum32, &sum); y += 32; } while (y < height); sse = summary_4x64_avx2(sum); break; case 128: do { int l = 0; __m256i sum32 = _mm256_setzero_si256(); do { highbd_sse_w16_avx2(&sum32, a, b); highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); a += a_stride; b += b_stride; l += 1; } while (l < 16 && l < (height - y)); summary_32_avx2(&sum32, &sum); y += 16; } while (y < height); sse = summary_4x64_avx2(sum); break; default: if (width & 0x7) { do { int i = 0; __m256i sum32 = _mm256_setzero_si256(); do { highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); const uint16_t *a2 = a + i + (a_stride << 1); const uint16_t *b2 = b + i + (b_stride << 1); highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); i += 8; } while (i + 4 < width); highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); summary_32_avx2(&sum32, &sum); a += a_stride << 2; b += b_stride << 2; y += 4; } while (y < height); } else { do { int l = 0; __m256i sum32 = _mm256_setzero_si256(); do { int i = 0; do { highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); i += 8; 
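        /* Each highbd_sse_w8x2_avx2() call above adds at most 2 * 4095^2
         * (roughly 2^25 for 12-bit input) to every 32-bit lane of sum32.
         * The surrounding loops flush sum32 into the 64-bit accumulator via
         * summary_32_avx2() after a bounded number of rows, which is
         * presumably what keeps these 32-bit partial sums from overflowing
         * even for the widest blocks. */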
} while (i < width); a += a_stride << 1; b += b_stride << 1; l += 2; } while (l < 8 && l < (height - y)); summary_32_avx2(&sum32, &sum); y += 8; } while (y < height); } sse = summary_4x64_avx2(sum); break; } return sse; } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/sse_sse4.c000066400000000000000000000276321477627663500163560ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" static inline int64_t summary_all_sse4(const __m128i *sum_all) { int64_t sum; const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); xx_storel_64(&sum, sum_1x64); return sum; } #if CONFIG_AV1_HIGHBITDEPTH static inline void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); *sum64 = _mm_add_epi64(sum0, *sum64); *sum64 = _mm_add_epi64(sum1, *sum64); } #endif static inline void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, const uint8_t *b) { const __m128i v_a0 = xx_loadu_128(a); const __m128i v_b0 = xx_loadu_128(b); const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); } static inline void sse4x2_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m128i *sum) { const __m128i v_a0 = xx_loadl_32(a); const __m128i v_a1 = xx_loadl_32(a + a_stride); const __m128i v_b0 = xx_loadl_32(b); const __m128i v_b1 = xx_loadl_32(b + b_stride); const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } static inline void sse8_sse4_1(const uint8_t *a, const uint8_t *b, __m128i *sum) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_b0 = xx_loadl_64(b); const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y = 0; int64_t sse = 0; __m128i sum = _mm_setzero_si128(); switch (width) { case 4: do { sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b 
+= b_stride << 1; y += 2; } while (y < height); sse = summary_all_sse4(&sum); break; case 8: do { sse8_sse4_1(a, b, &sum); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; case 16: do { sse_w16_sse4_1(&sum, a, b); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; case 32: do { sse_w16_sse4_1(&sum, a, b); sse_w16_sse4_1(&sum, a + 16, b + 16); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; case 64: do { sse_w16_sse4_1(&sum, a, b); sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; case 128: do { sse_w16_sse4_1(&sum, a, b); sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; default: if (width & 0x07) { do { int i = 0; do { sse8_sse4_1(a + i, b + i, &sum); sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); i += 8; } while (i + 4 < width); sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); a += (a_stride << 1); b += (b_stride << 1); y += 2; } while (y < height); } else { do { int i = 0; do { sse8_sse4_1(a + i, b + i, &sum); i += 8; } while (i < width); a += a_stride; b += b_stride; y += 1; } while (y < height); } sse = summary_all_sse4(&sum); break; } return sse; } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, int a_stride, const uint16_t *b, int b_stride) { const __m128i v_a0 = xx_loadl_64(a); const __m128i v_a1 = xx_loadl_64(a + a_stride); const __m128i v_b0 = xx_loadl_64(b); const __m128i v_b1 = xx_loadl_64(b + b_stride); const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } static inline void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, const uint16_t *b) { const __m128i v_a_w = xx_loadu_128(a); const __m128i v_b_w = xx_loadu_128(b); const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); } int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); __m128i sum = _mm_setzero_si128(); switch (width) { case 4: do { highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; } while (y < height); sse = summary_all_sse4(&sum); break; case 8: do { highbd_sse_w8_sse4_1(&sum, a, b); a += a_stride; b += b_stride; y += 1; } while (y < height); sse = summary_all_sse4(&sum); break; case 16: do { int l = 0; __m128i sum32 = _mm_setzero_si128(); do { highbd_sse_w8_sse4_1(&sum32, a, b); highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); a += a_stride; b += b_stride; l += 1; } while (l < 64 && l < (height - y)); summary_32_sse4(&sum32, &sum); y += 64; } while (y < height); xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 
8))); break; case 32: do { int l = 0; __m128i sum32 = _mm_setzero_si128(); do { highbd_sse_w8_sse4_1(&sum32, a, b); highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); a += a_stride; b += b_stride; l += 1; } while (l < 32 && l < (height - y)); summary_32_sse4(&sum32, &sum); y += 32; } while (y < height); xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 64: do { int l = 0; __m128i sum32 = _mm_setzero_si128(); do { highbd_sse_w8_sse4_1(&sum32, a, b); highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); a += a_stride; b += b_stride; l += 1; } while (l < 16 && l < (height - y)); summary_32_sse4(&sum32, &sum); y += 16; } while (y < height); xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 128: do { int l = 0; __m128i sum32 = _mm_setzero_si128(); do { highbd_sse_w8_sse4_1(&sum32, a, b); highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); a += a_stride; b += b_stride; l += 1; } while (l < 8 && l < (height - y)); summary_32_sse4(&sum32, &sum); y += 8; } while (y < height); xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; default: if (width & 0x7) { do { __m128i sum32 = _mm_setzero_si128(); int i = 0; do { highbd_sse_w8_sse4_1(&sum32, a + i, b + i); highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); i += 8; } while (i + 4 < width); highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); a += (a_stride << 1); b += (b_stride << 1); y += 2; summary_32_sse4(&sum32, &sum); } while (y < height); } else { do { int l = 0; __m128i sum32 = _mm_setzero_si128(); do { int i = 0; do { highbd_sse_w8_sse4_1(&sum32, a + i, b + i); i += 8; } while (i < width); a += a_stride; b += b_stride; l += 1; } while (l < 8 && l < (height - y)); summary_32_sse4(&sum32, &sum); y += 8; } while (y < height); } xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; } return sse; } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/ssim_sse2_x86_64.asm000066400000000000000000000074751477627663500201140ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "aom_ports/x86_abi_support.asm" ; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr %macro TABULATE_SSIM 0 paddusw xmm15, xmm3 ; sum_s paddusw xmm14, xmm4 ; sum_r movdqa xmm1, xmm3 pmaddwd xmm1, xmm1 paddd xmm13, xmm1 ; sum_sq_s movdqa xmm2, xmm4 pmaddwd xmm2, xmm2 paddd xmm12, xmm2 ; sum_sq_r pmaddwd xmm3, xmm4 paddd xmm11, xmm3 ; sum_sxr %endmacro ; Sum across the register %1 starting with q words %macro SUM_ACROSS_Q 1 movdqa xmm2,%1 punpckldq %1,xmm0 punpckhdq xmm2,xmm0 paddq %1,xmm2 movdqa xmm2,%1 punpcklqdq %1,xmm0 punpckhqdq xmm2,xmm0 paddq %1,xmm2 %endmacro ; Sum across the register %1 starting with q words %macro SUM_ACROSS_W 1 movdqa xmm1, %1 punpcklwd %1,xmm0 punpckhwd xmm1,xmm0 paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro SECTION .text ;void aom_ssim_parms_8x8_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, ; int rp ; uint32_t *sum_s, ; uint32_t *sum_r, ; uint32_t *sum_sq_s, ; uint32_t *sum_sq_r, ; uint32_t *sum_sxr); ; ; TODO: Use parm passing through structure, probably don't need the pxors ; ( calling app will initialize to 0 ) could easily fit everything in sse2 ; without too much hastle, and can probably do better estimates with psadw ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. globalsym(aom_ssim_parms_8x8_sse2) sym(aom_ssim_parms_8x8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 SAVE_XMM 15 push rsi push rdi ; end prolog mov rsi, arg(0) ;s mov rcx, arg(1) ;sp mov rdi, arg(2) ;r mov rax, arg(3) ;rp pxor xmm0, xmm0 pxor xmm15,xmm15 ;sum_s pxor xmm14,xmm14 ;sum_r pxor xmm13,xmm13 ;sum_sq_s pxor xmm12,xmm12 ;sum_sq_r pxor xmm11,xmm11 ;sum_sxr mov rdx, 8 ;row counter .NextRow: ;grab source and reference pixels movq xmm3, [rsi] movq xmm4, [rdi] punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r TABULATE_SSIM add rsi, rcx ; next s row add rdi, rax ; next r row dec rdx ; counter jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 SUM_ACROSS_Q xmm13 SUM_ACROSS_Q xmm12 SUM_ACROSS_Q xmm11 mov rdi,arg(4) movd [rdi], xmm15; mov rdi,arg(5) movd [rdi], xmm14; mov rdi,arg(6) movd [rdi], xmm13; mov rdi,arg(7) movd [rdi], xmm12; mov rdi,arg(8) movd [rdi], xmm11; ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/aom_dsp/x86/subpel_variance_ssse3.asm000066400000000000000000001230531477627663500214400ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
; ; %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_8: times 8 dw 8 bilin_filter_m_ssse3: times 8 db 16, 0 times 8 db 14, 2 times 8 db 12, 4 times 8 db 10, 6 times 16 db 8 times 8 db 6, 10 times 8 db 4, 12 times 8 db 2, 14 SECTION .text ; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, ; const uint8_t *dst, ptrdiff_t dst_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. %macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse psubw %3, %4 psubw %1, %2 paddw %5, %3 pmaddwd %3, %3 paddw %5, %1 pmaddwd %1, %1 paddd %6, %3 paddd %6, %1 %endmacro %macro STORE_AND_RET 1 %if %1 > 4 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. ; We have to sign-extend it before adding the words within the register ; and outputing to a dword. pcmpgtw m5, m6 ; mask for 0 > x movhlps m3, m7 punpcklwd m4, m6, m5 punpckhwd m6, m5 ; sign-extend m6 word->dword paddd m7, m3 paddd m6, m4 pshufd m3, m7, 0x1 movhlps m4, m6 paddd m7, m3 paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse pshufd m4, m6, 0x1 movd [r1], m7 ; store sse paddd m6, m4 movd raxd, m6 ; store sum as return value %else ; 4xh pshuflw m4, m6, 0xe pshuflw m3, m7, 0xe paddw m6, m4 paddd m7, m3 pcmpgtw m5, m6 ; mask for 0 > x mov r1, ssem ; r1 = unsigned int *sse punpcklwd m6, m5 ; sign-extend m6 word->dword movd [r1], m7 ; store sse pshuflw m4, m6, 0xe paddd m6, m4 movd raxd, m6 ; store sum as return value %endif RET %endmacro %macro INC_SRC_BY_SRC_STRIDE 0 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 add srcq, src_stridemp %else add srcq, src_strideq %endif %endmacro %macro SUBPEL_VARIANCE 1-2 0 ; W %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 %define filter_idx_shift 4 %endif ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses ; 11, not 13, if the registers are ordered correctly. 
May make a minor speed ; difference on Win64 %if AOM_ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ sec, sec_stride, height, sse %define sec_str sec_strideq %else cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ height, sse %endif %define block_height heightd %define bilin_filter sseq %else %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ height, sse %define block_height heightd %endif ; reuse argument stack space %define g_bilin_filterm x_offsetm %define g_pw_8m y_offsetm ;Store bilin_filter and pw_8 location in stack %if GET_GOT_DEFINED == 1 GET_GOT eax add esp, 4 ; restore esp %endif lea ecx, [GLOBAL(bilin_filter_m)] mov g_bilin_filterm, ecx lea ecx, [GLOBAL(pw_8)] mov g_pw_8m, ecx LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, sec, sec_stride, \ height, sse %define block_height dword heightm %define sec_str sec_stridemp %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ height, sse %define block_height heightd %endif %define bilin_filter bilin_filter_m %endif %endif %if %1 == 4 %define movx movd %else %define movx movh %endif ASSERT %1 <= 16 ; m6 overflows if w > 16 pxor m6, m6 ; sum pxor m7, m7 ; sse ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we ; could perhaps use it for something more productive then pxor m5, m5 ; dedicated zero register %if %1 < 16 sar block_height, 1 %if %2 == 1 ; avg shl sec_str, 1 %endif %endif ; FIXME(rbultje) replace by jumptable? 
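; The dispatch below specializes on the subpel position instead of using one
; generic path: x_offset and y_offset are 1/8-pel offsets in [0, 7], and each
; is treated as "zero" (no filtering), "half" (offset 4, a simple pavgb
; average) or "other" (full bilinear filter), giving the nine .x_*_y_* loops
; that follow.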
test x_offsetd, x_offsetd jnz .x_nonzero ; x_offset == 0 test y_offsetd, y_offsetd jnz .x_zero_y_nonzero ; x_offset == 0 && y_offset == 0 .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] mova m1, [dstq] %if %2 == 1 ; avg pavgb m0, [secq] punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 %if %2 == 0 ; !avg punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] %if %2 == 1 ; avg %if %1 > 4 movhps m0, [srcq+src_strideq] %else ; 4xh movx m1, [srcq+src_strideq] punpckldq m0, m1 %endif %else ; !avg movx m2, [srcq+src_strideq] %endif movx m1, [dstq] movx m3, [dstq+dst_strideq] %if %2 == 1 ; avg %if %1 > 4 pavgb m0, [secq] %else movh m2, [secq] pavgb m0, m2 %endif punpcklbw m3, m5 punpcklbw m1, m5 %if %1 > 4 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh punpcklbw m0, m5 movhlps m2, m0 %endif %else ; !avg punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_zero_y_zero_loop STORE_AND_RET %1 .x_zero_y_nonzero: cmp y_offsetd, 4 jne .x_zero_y_nonhalf ; x_offset == 0 && y_offset == 0.5 .x_zero_y_half_loop: %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] mova m1, [dstq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg pavgb m0, [secq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] %if %2 == 1 ; avg %if %1 > 4 movhps m2, [srcq+src_strideq*2] %else ; 4xh movx m1, [srcq+src_strideq*2] punpckldq m2, m1 %endif movx m1, [dstq] %if %1 > 4 movlhps m0, m2 %else ; 4xh punpckldq m0, m2 %endif movx m3, [dstq+dst_strideq] pavgb m0, m2 punpcklbw m1, m5 %if %1 > 4 pavgb m0, [secq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh movh m4, [secq] pavgb m0, m4 punpcklbw m3, m5 punpcklbw m0, m5 movhlps m2, m0 %endif %else ; !avg movx m4, [srcq+src_strideq*2] movx m1, [dstq] pavgb m0, m2 movx m3, [dstq+dst_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_zero_y_half_loop STORE_AND_RET %1 .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0, reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif .x_zero_y_other_loop: %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] mova m1, [dstq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 pmaddubsw m2, filter_y_a pmaddubsw m0, filter_y_a 
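; The two source rows above were interleaved byte-wise, so each pmaddubsw
; with filter_y_a computes coef0*row0 + coef1*row1 per pixel, where
; (coef0, coef1) = (16 - 2*y_offset, 2*y_offset) comes from
; bilin_filter_m_ssse3. With the pw_8 rounding and psraw by 4 that follow,
; one output pixel is roughly:
;   out = (coef0 * row0[x] + coef1 * row1[x] + 8) >> 4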
paddw m2, filter_rnd paddw m0, filter_rnd %else punpckhbw m2, m0, m5 punpckhbw m3, m4, m5 punpcklbw m0, m5 punpcklbw m4, m5 ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be ; slightly faster because of pmullw latency. It would also cut our rodata ; tables in half for this function, and save 1-2 registers on x86-64. pmullw m2, filter_y_a pmullw m3, filter_y_b paddw m2, filter_rnd pmullw m0, filter_y_a pmullw m4, filter_y_b paddw m0, filter_rnd paddw m2, m3 paddw m0, m4 %endif psraw m2, 4 psraw m0, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif punpckhbw m3, m1, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq*2] movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) movx m1, [dstq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a pmaddubsw m2, filter_y_a punpcklbw m3, m5 paddw m2, filter_rnd paddw m0, filter_rnd %else punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m4, m5 pmullw m0, filter_y_a pmullw m1, m2, filter_y_b punpcklbw m3, m5 paddw m0, filter_rnd pmullw m2, filter_y_a pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd movx m1, [dstq] paddw m2, m4 %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline %if %1 == 4 movlhps m0, m2 %endif packuswb m0, m2 %if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh movh m2, [secq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 %endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_zero_y_other_loop %undef filter_y_a %undef filter_y_b %undef filter_rnd STORE_AND_RET %1 .x_nonzero: cmp x_offsetd, 4 jne .x_nonhalf ; x_offset == 0.5 test y_offsetd, y_offsetd jnz .x_half_y_nonzero ; x_offset == 0.5 && y_offset == 0 .x_half_y_zero_loop: %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] mova m1, [dstq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg pavgb m0, [secq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m4, [srcq+1] %if %2 == 1 ; avg %if %1 > 4 movhps m0, [srcq+src_strideq] movhps m4, [srcq+src_strideq+1] %else ; 4xh movx m1, [srcq+src_strideq] punpckldq m0, m1 movx m2, [srcq+src_strideq+1] punpckldq m4, m2 %endif movx m1, [dstq] movx m3, [dstq+dst_strideq] pavgb m0, m4 punpcklbw m3, m5 %if %1 > 4 pavgb m0, [secq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh movh m2, [secq] pavgb m0, m2 punpcklbw m1, m5 punpcklbw m0, m5 movhlps m2, m0 %endif %else ; !avg movx m2, [srcq+src_strideq] movx m1, [dstq] pavgb m0, m4 movx m4, [srcq+src_strideq+1] movx m3, [dstq+dst_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_half_y_zero_loop STORE_AND_RET %1 .x_half_y_nonzero: cmp y_offsetd, 4 jne .x_half_y_nonhalf ; x_offset == 0.5 && y_offset == 0.5 %if %1 == 16 movu m0, [srcq] movu m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 
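; For the half/half case both bilinear taps equal 8, so the filter is
; implemented purely with pavgb, i.e. (a + b + 1) >> 1, applied once
; horizontally (src[x], src[x+1]) and once vertically between consecutive
; averaged rows; m0 carries the previous row's horizontal average into the
; loop below.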
.x_half_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] mova m1, [dstq] pavgb m4, m3 punpckhbw m3, m1, m5 pavgb m0, m4 %if %2 == 1 ; avg punpcklbw m1, m5 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else punpckhbw m2, m0, m5 punpcklbw m0, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 .x_half_y_half_loop: movx m2, [srcq] movx m3, [srcq+1] %if %2 == 1 ; avg %if %1 > 4 movhps m2, [srcq+src_strideq] movhps m3, [srcq+src_strideq+1] %else movx m1, [srcq+src_strideq] punpckldq m2, m1 movx m1, [srcq+src_strideq+1] punpckldq m3, m1 %endif pavgb m2, m3 %if %1 > 4 movlhps m0, m2 movhlps m4, m2 %else ; 4xh punpckldq m0, m2 pshuflw m4, m2, 0xe %endif movx m1, [dstq] pavgb m0, m2 movx m3, [dstq+dst_strideq] %if %1 > 4 pavgb m0, [secq] %else movh m2, [secq] pavgb m0, m2 %endif punpcklbw m3, m5 punpcklbw m1, m5 %if %1 > 4 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else punpcklbw m0, m5 movhlps m2, m0 %endif %else ; !avg movx m4, [srcq+src_strideq] movx m1, [srcq+src_strideq+1] pavgb m2, m3 pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 movx m1, [dstq] movx m3, [dstq+dst_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 punpcklbw m1, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_half_y_half_loop STORE_AND_RET %1 .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ;x86_32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0.5. 
We can reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif %if %1 == 16 movu m0, [srcq] movu m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 .x_half_y_other_loop: movu m4, [srcq] movu m2, [srcq+1] mova m1, [dstq] pavgb m4, m2 %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 pmaddubsw m2, filter_y_a pmaddubsw m0, filter_y_a paddw m2, filter_rnd paddw m0, filter_rnd psraw m2, 4 %else punpckhbw m2, m0, m5 punpckhbw m3, m4, m5 pmullw m2, filter_y_a pmullw m3, filter_y_b paddw m2, filter_rnd punpcklbw m0, m5 paddw m2, m3 punpcklbw m3, m4, m5 pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 %endif punpckhbw m3, m1, m5 psraw m0, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 %if notcpuflag(ssse3) punpcklbw m0, m5 %endif .x_half_y_other_loop: movx m2, [srcq] movx m1, [srcq+1] movx m4, [srcq+src_strideq] movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) movx m1, [dstq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a pmaddubsw m2, filter_y_a punpcklbw m3, m5 paddw m0, filter_rnd paddw m2, filter_rnd %else punpcklbw m2, m5 punpcklbw m4, m5 pmullw m0, filter_y_a pmullw m1, m2, filter_y_b punpcklbw m3, m5 paddw m0, filter_rnd pmullw m2, filter_y_a paddw m0, m1 pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 movx m1, [dstq] %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline %if %1 == 4 movlhps m0, m2 %endif packuswb m0, m2 %if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else movh m2, [secq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 %endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_half_y_other_loop %undef filter_y_a %undef filter_y_b %undef filter_rnd STORE_AND_RET %1 .x_nonhalf: test y_offsetd, y_offsetd jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ;y_offset == 0. We can reuse y_offset reg. 
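; The earlier shl x_offsetd, filter_idx_shift turned the 1/8-pel offset into a
; byte offset into bilin_filter_m: each 16-byte row holds the coefficient pair
; (16 - 2*frac, 2*frac) repeated eight times, ready for pmaddubsw in the
; horizontal-only loop below.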
%define tempq y_offsetq add x_offsetq, g_bilin_filterm %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif .x_other_y_zero_loop: %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] mova m1, [dstq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 pmaddubsw m2, filter_x_a pmaddubsw m0, filter_x_a paddw m2, filter_rnd paddw m0, filter_rnd %else punpckhbw m2, m0, m5 punpckhbw m3, m4, m5 punpcklbw m0, m5 punpcklbw m4, m5 pmullw m2, filter_x_a pmullw m3, filter_x_b paddw m2, filter_rnd pmullw m0, filter_x_a pmullw m4, filter_x_b paddw m0, filter_rnd paddw m2, m3 paddw m0, m4 %endif psraw m2, 4 psraw m0, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif punpckhbw m3, m1, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq+1] movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 movx m1, [dstq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a punpcklbw m3, m5 paddw m0, filter_rnd paddw m2, filter_rnd %else punpcklbw m0, m5 punpcklbw m1, m5 punpcklbw m2, m5 punpcklbw m4, m5 pmullw m0, filter_x_a pmullw m1, filter_x_b punpcklbw m3, m5 paddw m0, filter_rnd pmullw m2, filter_x_a pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd movx m1, [dstq] paddw m2, m4 %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline %if %1 == 4 movlhps m0, m2 %endif packuswb m0, m2 %if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else movh m2, [secq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 %endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_other_y_zero_loop %undef filter_x_a %undef filter_x_b %undef filter_rnd STORE_AND_RET %1 .x_nonhalf_y_nonzero: cmp y_offsetd, 4 jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0.5. We can reuse y_offset reg. 
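; In the .x_other_y_half loops below each source row is first filtered
; horizontally (pmaddubsw with filter_x_a, rounded and shifted by 4) and the
; vertical half-pel step then averages the current and previous filtered rows
; (pavgb or pavgw), with the previous filtered row carried in m0 between
; iterations.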
%define tempq y_offsetq add x_offsetq, g_bilin_filterm %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif %if %1 == 16 movu m0, [srcq] movu m1, [srcq+1] %if cpuflag(ssse3) punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m2, filter_x_a pmaddubsw m0, filter_x_a paddw m2, filter_rnd paddw m0, filter_rnd %else punpckhbw m2, m0, m5 punpckhbw m3, m1, m5 punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, filter_x_a pmullw m1, filter_x_b paddw m0, filter_rnd pmullw m2, filter_x_a pmullw m3, filter_x_b paddw m2, filter_rnd paddw m0, m1 paddw m2, m3 %endif psraw m0, 4 psraw m2, 4 add srcq, src_strideq packuswb m0, m2 .x_other_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] %if cpuflag(ssse3) mova m1, [dstq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 psraw m4, 4 packuswb m4, m2 pavgb m0, m4 punpckhbw m3, m1, m5 punpcklbw m1, m5 %else punpckhbw m2, m4, m5 punpckhbw m1, m3, m5 punpcklbw m4, m5 punpcklbw m3, m5 pmullw m4, filter_x_a pmullw m3, filter_x_b paddw m4, filter_rnd pmullw m2, filter_x_a pmullw m1, filter_x_b paddw m2, filter_rnd paddw m4, m3 paddw m2, m1 mova m1, [dstq] psraw m4, 4 psraw m2, 4 punpckhbw m3, m1, m5 ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we ; have a 1-register shortage to be able to store the backup of the bilin ; filtered second line as words as cache for the next line. Packing into ; a byte costs 1 pack and 2 unpacks, but saves a register. packuswb m4, m2 punpcklbw m1, m5 pavgb m0, m4 %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline pavgb m0, [secq] %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] %if cpuflag(ssse3) punpcklbw m0, m1 pmaddubsw m0, filter_x_a paddw m0, filter_rnd %else punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, filter_x_a pmullw m1, filter_x_b paddw m0, filter_rnd paddw m0, m1 %endif add srcq, src_strideq psraw m0, 4 .x_other_y_half_loop: movx m2, [srcq] movx m1, [srcq+1] movx m4, [srcq+src_strideq] movx m3, [srcq+src_strideq+1] %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a movx m1, [dstq] movx m3, [dstq+dst_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else punpcklbw m2, m5 punpcklbw m1, m5 punpcklbw m4, m5 punpcklbw m3, m5 pmullw m2, filter_x_a pmullw m1, filter_x_b paddw m2, filter_rnd pmullw m4, filter_x_a pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 movx m1, [dstq] paddw m4, m3 movx m3, [dstq+dst_strideq] %endif psraw m2, 4 psraw m4, 4 pavgw m0, m2 pavgw m2, m4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - also consider going to bytes here %if %1 == 4 movlhps m0, m2 %endif packuswb m0, m2 %if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else movh m2, [secq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 %endif %endif punpcklbw m3, m5 punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 lea srcq, [srcq+src_strideq*2] lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_other_y_half_loop %undef filter_x_a %undef filter_x_b %undef filter_rnd STORE_AND_RET %1 .x_nonhalf_y_nonhalf: %if AOM_ARCH_X86_64 lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl 
x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift %if AOM_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif mova m10, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 %else ; x86-32 %if AOM_ARCH_X86=1 && CONFIG_PIC=1 ; In this case, there is NO unused register. Used src_stride register. Later, ; src_stride has to be loaded from stack when it is needed. %define tempq src_strideq mov tempq, g_bilin_filterm add x_offsetq, tempq add y_offsetq, tempq %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] mov tempq, g_pw_8m %define filter_rnd [tempq] %else add x_offsetq, bilin_filter add y_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [GLOBAL(pw_8)] %endif %endif ; x_offset == bilin interpolation && y_offset == bilin interpolation %if %1 == 16 movu m0, [srcq] movu m1, [srcq+1] %if cpuflag(ssse3) punpckhbw m2, m0, m1 punpcklbw m0, m1 pmaddubsw m2, filter_x_a pmaddubsw m0, filter_x_a paddw m2, filter_rnd paddw m0, filter_rnd %else punpckhbw m2, m0, m5 punpckhbw m3, m1, m5 punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, filter_x_a pmullw m1, filter_x_b paddw m0, filter_rnd pmullw m2, filter_x_a pmullw m3, filter_x_b paddw m2, filter_rnd paddw m0, m1 paddw m2, m3 %endif psraw m0, 4 psraw m2, 4 INC_SRC_BY_SRC_STRIDE packuswb m0, m2 .x_other_y_other_loop: %if cpuflag(ssse3) movu m4, [srcq] movu m3, [srcq+1] mova m1, [dstq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a punpckhbw m3, m1, m5 paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 psraw m4, 4 packuswb m4, m2 punpckhbw m2, m0, m4 punpcklbw m0, m4 pmaddubsw m2, filter_y_a pmaddubsw m0, filter_y_a punpcklbw m1, m5 paddw m2, filter_rnd paddw m0, filter_rnd psraw m2, 4 psraw m0, 4 %else movu m3, [srcq] movu m4, [srcq+1] punpckhbw m1, m3, m5 punpckhbw m2, m4, m5 punpcklbw m3, m5 punpcklbw m4, m5 pmullw m3, filter_x_a pmullw m4, filter_x_b paddw m3, filter_rnd pmullw m1, filter_x_a pmullw m2, filter_x_b paddw m1, filter_rnd paddw m3, m4 paddw m1, m2 psraw m3, 4 psraw m1, 4 packuswb m4, m3, m1 punpckhbw m2, m0, m5 punpcklbw m0, m5 pmullw m2, filter_y_a pmullw m1, filter_y_b paddw m2, filter_rnd pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m2, m1 mova m1, [dstq] paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 punpckhbw m3, m1, m5 psraw m0, 4 punpcklbw m1, m5 %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 INC_SRC_BY_SRC_STRIDE add dstq, dst_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] %if cpuflag(ssse3) punpcklbw m0, m1 pmaddubsw m0, filter_x_a paddw m0, filter_rnd %else punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, filter_x_a pmullw m1, filter_x_b paddw m0, filter_rnd paddw m0, m1 %endif psraw m0, 4 %if cpuflag(ssse3) packuswb m0, m0 %endif INC_SRC_BY_SRC_STRIDE .x_other_y_other_loop: movx m2, [srcq] movx m1, [srcq+1] INC_SRC_BY_SRC_STRIDE movx m4, [srcq] movx m3, [srcq+1] %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw 
m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a movx m3, [dstq+dst_strideq] movx m1, [dstq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 psraw m4, 4 packuswb m2, m2 packuswb m4, m4 punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a pmaddubsw m2, filter_y_a punpcklbw m3, m5 paddw m0, filter_rnd paddw m2, filter_rnd psraw m0, 4 psraw m2, 4 punpcklbw m1, m5 %else punpcklbw m2, m5 punpcklbw m1, m5 punpcklbw m4, m5 punpcklbw m3, m5 pmullw m2, filter_x_a pmullw m1, filter_x_b paddw m2, filter_rnd pmullw m4, filter_x_a pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 paddw m4, m3 psraw m2, 4 psraw m4, 4 pmullw m0, filter_y_a pmullw m3, m2, filter_y_b paddw m0, filter_rnd pmullw m2, filter_y_a pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 movx m3, [dstq+dst_strideq] paddw m2, m1 movx m1, [dstq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 punpcklbw m1, m5 %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline %if %1 == 4 movlhps m0, m2 %endif packuswb m0, m2 %if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else movh m2, [secq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 %endif %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 INC_SRC_BY_SRC_STRIDE lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg add secq, sec_str %endif dec block_height jg .x_other_y_other_loop %undef filter_x_a %undef filter_x_b %undef filter_y_a %undef filter_y_b %undef filter_rnd %undef movx STORE_AND_RET %1 %endmacro ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical ; between the ssse3 and non-ssse3 version. It may make sense to merge their ; code in the sense that the ssse3 version would jump to the appropriate ; location in the sse/2 version, rather than duplicating that code in the ; binary. INIT_XMM ssse3 SUBPEL_VARIANCE 4 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 INIT_XMM ssse3 SUBPEL_VARIANCE 4, 1 SUBPEL_VARIANCE 8, 1 SUBPEL_VARIANCE 16, 1 aom-3.12.1/aom_dsp/x86/subtract_avx2.c000066400000000000000000000106251477627663500174070ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" static inline void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, const uint8_t *pred_ptr) { __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); __m256i set_one_minusone = _mm256_set1_epi32((int)0xff01ff01); __m256i diff0 = _mm256_unpacklo_epi8(s, p); __m256i diff1 = _mm256_unpackhi_epi8(s, p); diff0 = _mm256_maddubs_epi16(diff0, set_one_minusone); diff1 = _mm256_maddubs_epi16(diff1, set_one_minusone); _mm256_store_si256((__m256i *)(diff_ptr), _mm256_permute2x128_si256(diff0, diff1, 0x20)); _mm256_store_si256((__m256i *)(diff_ptr + 16), _mm256_permute2x128_si256(diff0, diff1, 0x31)); } static inline void subtract_block_16xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); __m256i s_0 = _mm256_cvtepu8_epi16(s); __m256i p_0 = _mm256_cvtepu8_epi16(p); const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); _mm256_store_si256((__m256i *)(diff_ptr), d_0); src_ptr += src_stride; pred_ptr += pred_stride; diff_ptr += diff_stride; } } static inline void subtract_block_32xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { subtract32_avx2(diff_ptr, src_ptr, pred_ptr); src_ptr += src_stride; pred_ptr += pred_stride; diff_ptr += diff_stride; } } static inline void subtract_block_64xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { subtract32_avx2(diff_ptr, src_ptr, pred_ptr); subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); src_ptr += src_stride; pred_ptr += pred_stride; diff_ptr += diff_stride; } } static inline void subtract_block_128xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { subtract32_avx2(diff_ptr, src_ptr, pred_ptr); subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); src_ptr += src_stride; pred_ptr += pred_stride; diff_ptr += diff_stride; } } void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { switch (cols) { case 16: subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, pred_ptr, pred_stride); break; case 32: subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, pred_ptr, pred_stride); break; case 64: subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, pred_ptr, pred_stride); break; case 128: subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, pred_ptr, pred_stride); break; default: aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, pred_ptr, pred_stride); break; } } aom-3.12.1/aom_dsp/x86/subtract_sse2.asm000066400000000000000000000105661477627663500177450ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "third_party/x86inc/x86inc.asm" SECTION .text ; void aom_subtract_block(int rows, int cols, ; int16_t *diff, ptrdiff_t diff_stride, ; const uint8_t *src, ptrdiff_t src_stride, ; const uint8_t *pred, ptrdiff_t pred_stride) INIT_XMM sse2 cglobal subtract_block, 7, 7, 8, \ rows, cols, diff, diff_stride, src, src_stride, \ pred, pred_stride %define pred_str colsq pxor m7, m7 ; dedicated zero register cmp colsd, 4 je .case_4 cmp colsd, 8 je .case_8 cmp colsd, 16 je .case_16 cmp colsd, 32 je .case_32 cmp colsd, 64 je .case_64 %macro loop16 6 mova m0, [srcq+%1] mova m4, [srcq+%2] movu m1, [predq+%3] movu m5, [predq+%4] punpckhbw m2, m0, m7 punpckhbw m3, m1, m7 punpcklbw m0, m7 punpcklbw m1, m7 psubw m2, m3 psubw m0, m1 punpckhbw m1, m4, m7 punpckhbw m3, m5, m7 punpcklbw m4, m7 punpcklbw m5, m7 psubw m1, m3 psubw m4, m5 mova [diffq+mmsize*0+%5], m0 mova [diffq+mmsize*1+%5], m2 mova [diffq+mmsize*0+%6], m4 mova [diffq+mmsize*1+%6], m1 %endmacro mov pred_str, pred_stridemp .loop_128: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize lea diffq, [diffq+diff_strideq*2] add predq, pred_str add srcq, src_strideq sub rowsd, 1 jnz .loop_128 RET .case_64: mov pred_str, pred_stridemp .loop_64: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize lea diffq, [diffq+diff_strideq*2] add predq, pred_str add srcq, src_strideq dec rowsd jg .loop_64 RET .case_32: mov pred_str, pred_stridemp .loop_32: loop16 0, mmsize, 0, mmsize, 0, 2*mmsize lea diffq, [diffq+diff_strideq*2] add predq, pred_str add srcq, src_strideq dec rowsd jg .loop_32 RET .case_16: mov pred_str, pred_stridemp .loop_16: loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 lea diffq, [diffq+diff_strideq*4] lea predq, [predq+pred_str*2] lea srcq, [srcq+src_strideq*2] sub rowsd, 2 jg .loop_16 RET %macro loop_h 0 movh m0, [srcq] movh m2, [srcq+src_strideq] movh m1, [predq] movh m3, [predq+pred_str] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 psubw m0, m1 psubw m2, m3 mova [diffq], m0 mova [diffq+diff_strideq*2], m2 %endmacro .case_8: mov pred_str, pred_stridemp .loop_8: loop_h lea diffq, [diffq+diff_strideq*4] lea srcq, [srcq+src_strideq*2] lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_8 RET INIT_MMX .case_4: mov pred_str, pred_stridemp .loop_4: loop_h lea diffq, [diffq+diff_strideq*4] lea srcq, [srcq+src_strideq*2] lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 emms RET aom-3.12.1/aom_dsp/x86/sum_squares_avx2.c000066400000000000000000000306501477627663500201270ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, int width, int height) { uint64_t result; __m256i v_acc_q = _mm256_setzero_si256(); const __m256i v_zext_mask_q = _mm256_set1_epi64x(~0u); for (int col = 0; col < height; col += 4) { __m256i v_acc_d = _mm256_setzero_si256(); for (int row = 0; row < width; row += 16) { const int16_t *tempsrc = src + row; const __m256i v_val_0_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); const __m256i v_val_1_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); const __m256i v_val_2_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); const __m256i v_val_3_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d); } v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); src += 4 * stride; } __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); result_64_2_int = _mm_add_epi64( result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); xx_storel_64(&result, result_64_2_int); return result; } uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, int height) { if (LIKELY(width == 4 && height == 4)) { return aom_sum_squares_2d_i16_4x4_sse2(src, stride); } else if (LIKELY(width == 4 && (height & 3) == 0)) { return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); } else if (LIKELY(width == 8 && (height & 3) == 0)) { return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); } else { return aom_sum_squares_2d_i16_c(src, stride, width, height); } } static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride, int width, int height, int *sum) { uint64_t result; const __m256i zero_reg = _mm256_setzero_si256(); const __m256i one_reg = _mm256_set1_epi16(1); __m256i v_sse_total = zero_reg; __m256i v_sum_total = zero_reg; for (int col = 0; col < height; col += 4) { __m256i v_sse_row = zero_reg; for (int row = 0; row < width; row += 16) { const int16_t *tempsrc = src + row; const __m256i v_val_0_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); const __m256i v_val_1_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); 
const __m256i v_val_2_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); const __m256i v_val_3_w = _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w); const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w); __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23); v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg); v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123); const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d); v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d); } const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg); const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg); v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi); v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row); src += 4 * stride; } const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total); const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1); __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low); sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8)); sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4)); *sum += _mm_cvtsi128_si32(sum_128bit); __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total); __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1); __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi); sse_128bit = _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit)); xx_storel_64(&result, sse_128bit); return result; } uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width, int height, int *sum) { if (LIKELY(width == 4 && height == 4)) { return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum); } else if (LIKELY(width == 4 && (height & 3) == 0)) { return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum); } else if (LIKELY(width == 8 && (height & 3) == 0)) { return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum); } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum); } else { return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum); } } // Accumulate sum of 16-bit elements in the vector static inline int32_t mm256_accumulate_epi16(__m256i vec_a) { __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); __m128i vtmp2 = _mm256_castsi256_si128(vec_a); vtmp1 = _mm_add_epi16(vtmp1, vtmp2); vtmp2 = _mm_srli_si128(vtmp1, 8); vtmp1 = _mm_add_epi16(vtmp1, vtmp2); vtmp2 = _mm_srli_si128(vtmp1, 4); vtmp1 = _mm_add_epi16(vtmp1, vtmp2); vtmp2 = _mm_srli_si128(vtmp1, 2); vtmp1 = _mm_add_epi16(vtmp1, vtmp2); return _mm_extract_epi16(vtmp1, 0); } // Accumulate sum of 32-bit elements in the vector static inline int32_t mm256_accumulate_epi32(__m256i vec_a) { __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); __m128i vtmp2 = _mm256_castsi256_si128(vec_a); vtmp1 = _mm_add_epi32(vtmp1, vtmp2); vtmp2 = _mm_srli_si128(vtmp1, 8); vtmp1 = _mm_add_epi32(vtmp1, vtmp2); vtmp2 = _mm_srli_si128(vtmp1, 4); vtmp1 = _mm_add_epi32(vtmp1, vtmp2); 
return _mm_cvtsi128_si32(vtmp1); } uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, int height) { uint8_t *srcp; uint64_t s = 0, ss = 0; __m256i vzero = _mm256_setzero_si256(); __m256i v_acc_sum = vzero; __m256i v_acc_sqs = vzero; int i, j; // Process 32 elements in a row for (i = 0; i < width - 31; i += 32) { srcp = src + i; // Process 8 columns at a time for (j = 0; j < height - 7; j += 8) { __m256i vsrc[8]; for (int k = 0; k < 8; k++) { vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); srcp += src_stride; } for (int k = 0; k < 8; k++) { __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); } // Update total sum and clear the vectors s += mm256_accumulate_epi16(v_acc_sum); ss += mm256_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process remaining rows (height not a multiple of 8) for (; j < height; j++) { __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); srcp += src_stride; } // Update total sum and clear the vectors s += mm256_accumulate_epi16(v_acc_sum); ss += mm256_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process the remaining area using C srcp = src; for (int k = 0; k < height; k++) { for (int m = i; m < width; m++) { uint8_t val = srcp[m]; s += val; ss += val * val; } srcp += src_stride; } return (ss - s * s / (width * height)); } #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, int height) { uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; uint64_t s = 0, ss = 0; __m256i vzero = _mm256_setzero_si256(); __m256i v_acc_sum = vzero; __m256i v_acc_sqs = vzero; int i, j; // Process 16 elements in a row for (i = 0; i < width - 15; i += 16) { srcp = srcp1 + i; // Process 8 columns at a time for (j = 0; j < height - 8; j += 8) { __m256i vsrc[8]; for (int k = 0; k < 8; k++) { vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); srcp += src_stride; } for (int k = 0; k < 8; k++) { __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); } // Update total sum and clear the vectors s += mm256_accumulate_epi32(v_acc_sum); ss += mm256_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process remaining rows (height not a multiple of 8) for (; j < height; j++) { __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); v_acc_sqs = 
_mm256_add_epi32(v_acc_sqs, vsqs0); srcp += src_stride; } // Update total sum and clear the vectors s += mm256_accumulate_epi32(v_acc_sum); ss += mm256_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process the remaining area using C srcp = srcp1; for (int k = 0; k < height; k++) { for (int m = i; m < width; m++) { uint16_t val = srcp[m]; s += val; ss += val * val; } srcp += src_stride; } return (ss - s * s / (width * height)); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/sum_squares_sse2.c000066400000000000000000000417761477627663500201360ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_dsp_rtcd.h" static inline __m128i xx_loadh_64(__m128i a, const void *b) { const __m128d ad = _mm_castsi128_pd(a); return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); } static inline uint64_t xx_cvtsi128_si64(__m128i a) { #if AOM_ARCH_X86_64 return (uint64_t)_mm_cvtsi128_si64(a); #else { uint64_t tmp; _mm_storel_epi64((__m128i *)&tmp, a); return tmp; } #endif } static inline __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); return _mm_add_epi32(v_sq_01_d, v_sq_23_d); } uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8)); return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) { const __m128i one_reg = _mm_set1_epi16(1); const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w); v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg); v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8)); v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4)); *sum = _mm_cvtsi128_si32(v_sum_0123_d); const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d); v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8)); v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4)); return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d); } uint64_t 
aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, int height) { int r = 0; __m128i v_acc_q = _mm_setzero_si128(); do { const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride); v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d); src += stride << 2; r += 4; } while (r < height); const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), _mm_and_si128(v_acc_q, v_zext_mask_q)); v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); return xx_cvtsi128_si64(v_acc_64); } uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height, int *sum) { int r = 0; uint64_t sse = 0; do { int curr_sum = 0; sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum); *sum += curr_sum; src += stride << 2; r += 4; } while (r < height); return sse; } #ifdef __GNUC__ // This prevents GCC/Clang from inlining this function into // aom_sum_squares_2d_i16_sse2, which in turn saves some stack // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { int r = 0; const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc_q = _mm_setzero_si128(); do { __m128i v_acc_d = _mm_setzero_si128(); int c = 0; do { const int16_t *b = src + c; const __m128i v_val_0_w = xx_load_128(b + 0 * stride); const __m128i v_val_1_w = xx_load_128(b + 1 * stride); const __m128i v_val_2_w = xx_load_128(b + 2 * stride); const __m128i v_val_3_w = xx_load_128(b + 3 * stride); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); c += 8; } while (c < width); v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); src += 4 * stride; r += 4; } while (r < height); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); return xx_cvtsi128_si64(v_acc_q); } #ifdef __GNUC__ // This prevents GCC/Clang from inlining this function into // aom_sum_sse_2d_i16_nxn_sse2, which in turn saves some stack // maintenance instructions in the common case of 4x4. 
__attribute__((noinline)) #endif uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height, int *sum) { int r = 0; uint64_t result; const __m128i zero_reg = _mm_setzero_si128(); const __m128i one_reg = _mm_set1_epi16(1); __m128i v_sse_total = zero_reg; __m128i v_sum_total = zero_reg; do { int c = 0; __m128i v_sse_row = zero_reg; do { const int16_t *b = src + c; __m128i v_val_0_w = xx_load_128(b + 0 * stride); __m128i v_val_1_w = xx_load_128(b + 1 * stride); __m128i v_val_2_w = xx_load_128(b + 2 * stride); __m128i v_val_3_w = xx_load_128(b + 3 * stride); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d); v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d); const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w); const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w); __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23); v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg); v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d); c += 8; } while (c < width); const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg); const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg); v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi); v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row); src += 4 * stride; r += 4; } while (r < height); v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 8)); v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4)); *sum += _mm_cvtsi128_si32(v_sum_total); v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8)); xx_storel_64(&result, v_sse_total); return result; } uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, int height) { // 4 elements per row only requires half an XMM register, so this // must be a special case, but also note that over 75% of all calls // are with size == 4, so it is also the common case. 
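  // As an illustrative sketch of that common case (the buffer name `resid`
  // is hypothetical, not part of the library):
  //   int16_t resid[4 * 4];
  //   /* ... fill resid with transform-domain residuals ... */
  //   uint64_t ssq = aom_sum_squares_2d_i16_sse2(resid, /*stride=*/4,
  //                                              /*width=*/4, /*height=*/4);
  // takes the first branch below and never reaches the generic nxn path.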
if (LIKELY(width == 4 && height == 4)) { return aom_sum_squares_2d_i16_4x4_sse2(src, stride); } else if (LIKELY(width == 4 && (height & 3) == 0)) { return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); } else { return aom_sum_squares_2d_i16_c(src, stride, width, height); } } uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width, int height, int *sum) { if (LIKELY(width == 4 && height == 4)) { return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum); } else if (LIKELY(width == 4 && (height & 3) == 0)) { return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum); } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum); } else { return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum); } } ////////////////////////////////////////////////////////////////////////////// // 1D version ////////////////////////////////////////////////////////////////////////////// static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { const __m128i v_zext_mask_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128(); const int16_t *const end = src + n; assert(n % 64 == 0); while (src < end) { const __m128i v_val_0_w = xx_load_128(src); const __m128i v_val_1_w = xx_load_128(src + 8); const __m128i v_val_2_w = xx_load_128(src + 16); const __m128i v_val_3_w = xx_load_128(src + 24); const __m128i v_val_4_w = xx_load_128(src + 32); const __m128i v_val_5_w = xx_load_128(src + 40); const __m128i v_val_6_w = xx_load_128(src + 48); const __m128i v_val_7_w = xx_load_128(src + 56); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); src += 64; } v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); return xx_cvtsi128_si64(v_acc0_q); } uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { if (n % 64 == 0) { return aom_sum_squares_i16_64n_sse2(src, n); } else if (n > 64) { const uint32_t k = n & ~63u; return aom_sum_squares_i16_64n_sse2(src, k) + aom_sum_squares_i16_c(src + k, n - k); } else { return aom_sum_squares_i16_c(src, n); } } // Accumulate sum of 16-bit elements in the vector static inline int32_t mm_accumulate_epi16(__m128i vec_a) { __m128i vtmp = _mm_srli_si128(vec_a, 8); vec_a 
= _mm_add_epi16(vec_a, vtmp); vtmp = _mm_srli_si128(vec_a, 4); vec_a = _mm_add_epi16(vec_a, vtmp); vtmp = _mm_srli_si128(vec_a, 2); vec_a = _mm_add_epi16(vec_a, vtmp); return _mm_extract_epi16(vec_a, 0); } // Accumulate sum of 32-bit elements in the vector static inline int32_t mm_accumulate_epi32(__m128i vec_a) { __m128i vtmp = _mm_srli_si128(vec_a, 8); vec_a = _mm_add_epi32(vec_a, vtmp); vtmp = _mm_srli_si128(vec_a, 4); vec_a = _mm_add_epi32(vec_a, vtmp); return _mm_cvtsi128_si32(vec_a); } uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width, int height) { uint8_t *srcp; uint64_t s = 0, ss = 0; __m128i vzero = _mm_setzero_si128(); __m128i v_acc_sum = vzero; __m128i v_acc_sqs = vzero; int i, j; // Process 16 elements in a row for (i = 0; i < width - 15; i += 16) { srcp = src + i; // Process 8 columns at a time for (j = 0; j < height - 7; j += 8) { __m128i vsrc[8]; for (int k = 0; k < 8; k++) { vsrc[k] = _mm_loadu_si128((__m128i *)srcp); srcp += src_stride; } for (int k = 0; k < 8; k++) { __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); } // Update total sum and clear the vectors s += mm_accumulate_epi16(v_acc_sum); ss += mm_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process remaining rows (height not a multiple of 8) for (; j < height; j++) { __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); srcp += src_stride; } // Update total sum and clear the vectors s += mm_accumulate_epi16(v_acc_sum); ss += mm_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process the remaining area using C srcp = src; for (int k = 0; k < height; k++) { for (int m = i; m < width; m++) { uint8_t val = srcp[m]; s += val; ss += val * val; } srcp += src_stride; } return (ss - s * s / (width * height)); } #if CONFIG_AV1_HIGHBITDEPTH uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, int height) { uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; uint64_t s = 0, ss = 0; __m128i vzero = _mm_setzero_si128(); __m128i v_acc_sum = vzero; __m128i v_acc_sqs = vzero; int i, j; // Process 8 elements in a row for (i = 0; i < width - 8; i += 8) { srcp = srcp1 + i; // Process 8 columns at a time for (j = 0; j < height - 8; j += 8) { __m128i vsrc[8]; for (int k = 0; k < 8; k++) { vsrc[k] = _mm_loadu_si128((__m128i *)srcp); srcp += src_stride; } for (int k = 0; k < 8; k++) { __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); } // Update total sum and clear the vectors s += mm_accumulate_epi32(v_acc_sum); ss += mm_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process remaining rows (height not a multiple of 8) 
for (; j < height; j++) { __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); srcp += src_stride; } // Update total sum and clear the vectors s += mm_accumulate_epi32(v_acc_sum); ss += mm_accumulate_epi32(v_acc_sqs); v_acc_sum = vzero; v_acc_sqs = vzero; } // Process the remaining area using C srcp = srcp1; for (int k = 0; k < height; k++) { for (int m = i; m < width; m++) { uint16_t val = srcp[m]; s += val; ss += val * val; } srcp += src_stride; } return (ss - s * s / (width * height)); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/aom_dsp/x86/sum_squares_sse2.h000066400000000000000000000025021477627663500201230ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ #define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height); uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, int height); uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum); uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height, int *sum); uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height, int *sum); #endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ aom-3.12.1/aom_dsp/x86/synonyms.h000066400000000000000000000101001477627663500165100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ #define AOM_AOM_DSP_X86_SYNONYMS_H_ #include #include #include "config/aom_config.h" #include "aom/aom_integer.h" /** * Various reusable shorthands for x86 SIMD intrinsics. * * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. */ // Loads and stores to do away with the tedium of casting the address // to the right type. 
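// For instance (an illustrative sketch only; `buf` is a hypothetical
// uint8_t working buffer of at least 32 bytes):
//   const __m128i v = xx_loadu_128(buf);   // instead of
//                                          // _mm_loadu_si128((const __m128i *)buf)
//   xx_storeu_128(buf + 16, v);
// which saves the explicit (__m128i *) casts at every call site.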
static inline __m128i xx_loadl_32(const void *a) { int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } static inline __m128i xx_loadl_64(const void *a) { return _mm_loadl_epi64((const __m128i *)a); } static inline __m128i xx_load_128(const void *a) { return _mm_load_si128((const __m128i *)a); } static inline __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } // Load 64 bits from each of hi and low, and pack into an SSE register // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate // the strict aliasing rule, this takes a different approach static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) { return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo), _mm_loadl_epi64((const __m128i *)hi)); } static inline void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } static inline void xx_storel_64(void *const a, const __m128i v) { _mm_storel_epi64((__m128i *)a, v); } static inline void xx_store_128(void *const a, const __m128i v) { _mm_store_si128((__m128i *)a, v); } static inline void xx_storeu_128(void *const a, const __m128i v) { _mm_storeu_si128((__m128i *)a, v); } // Fill an SSE register using an interleaved pair of values, ie. set the // 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering // as when a register is stored to / loaded from memory. // // This is useful for rearranging filter kernels for use with the _mm_madd_epi16 // instruction static inline __m128i xx_set2_epi16(int16_t a, int16_t b) { return _mm_setr_epi16(a, b, a, b, a, b, a, b); } static inline __m128i xx_round_epu16(__m128i v_val_w) { return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); } static inline __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); } static inline __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); return _mm_srli_epi32(v_tmp_d, bits); } static inline __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); return _mm_srai_epi16(v_tmp_d, bits); } // This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) static inline __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); return _mm_srai_epi32(v_tmp_d, bits); } static inline __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); const __m128i v_tmp_d = _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); return _mm_srai_epi16(v_tmp_d, bits); } #endif // AOM_AOM_DSP_X86_SYNONYMS_H_ aom-3.12.1/aom_dsp/x86/synonyms_avx2.h000066400000000000000000000071751477627663500174720ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ #define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ #include #include "config/aom_config.h" #include "aom/aom_integer.h" /** * Various reusable shorthands for x86 SIMD intrinsics. * * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. */ // Loads and stores to do away with the tedium of casting the address // to the right type. static inline __m256i yy_load_256(const void *a) { return _mm256_load_si256((const __m256i *)a); } static inline __m256i yy_loadu_256(const void *a) { return _mm256_loadu_si256((const __m256i *)a); } static inline void yy_store_256(void *const a, const __m256i v) { _mm256_store_si256((__m256i *)a, v); } static inline void yy_storeu_256(void *const a, const __m256i v) { _mm256_storeu_si256((__m256i *)a, v); } // Fill an AVX register using an interleaved pair of values, ie. set the // 16 channels to {a, b} repeated 8 times, using the same channel ordering // as when a register is stored to / loaded from memory. // // This is useful for rearranging filter kernels for use with the _mm_madd_epi16 // instruction static inline __m256i yy_set2_epi16(int16_t a, int16_t b) { return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); } // Some compilers don't have _mm256_set_m128i defined in immintrin.h. We // therefore define an equivalent function using a different intrinsic. // ([ hi ], [ lo ]) -> [ hi ][ lo ] static inline __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } // This behaves similarly to _mm256_set_epi64x(), but avoids undefined // sanitizer warnings when loading values from unaligned buffers using // `*(int64_t *)val`. static inline __m256i yy_loadu_4x64(const void *e3, const void *e2, const void *e1, const void *e0) { __m128d v0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e0)); __m128d v01 = _mm_loadh_pd(v0, (const double *)e1); __m128d v2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i *)e2)); __m128d v23 = _mm_loadh_pd(v2, (const double *)e3); // Note this can be replaced with // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains // _mm256_set_m128d() with all supported compilers. This version is used to // match the behavior with yy_set_m128i(). return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01)); } static inline __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); } static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); } #endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ aom-3.12.1/aom_dsp/x86/transpose_sse2.h000066400000000000000000000356231477627663500176040ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ #define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ #include // SSE2 #include "config/aom_config.h" static inline __m128i transpose_8bit_4x4(const __m128i *const in) { // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 // in[3]: 30 31 32 33 // to: // a0: 00 10 01 11 02 12 03 13 // a1: 20 30 21 31 22 32 23 33 const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); // Unpack 16 bit elements resulting in: // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 return _mm_unpacklo_epi16(a0, a1); } static inline void transpose_8bit_8x8(const __m128i *const in, __m128i *const out) { // Unpack 8 bit elements. Goes from: // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 // in[4]: 40 41 42 43 44 45 46 47 // in[5]: 50 51 52 53 54 55 56 57 // in[6]: 60 61 62 63 64 65 66 67 // in[7]: 70 71 72 73 74 75 76 77 // to: // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); // Unpack 16 bit elements resulting in: // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 const __m128i b0 = _mm_unpacklo_epi16(a0, a1); const __m128i b1 = _mm_unpackhi_epi16(a0, a1); const __m128i b2 = _mm_unpacklo_epi16(a2, a3); const __m128i b3 = _mm_unpackhi_epi16(a2, a3); // Unpack 32 bit elements resulting in: // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 const __m128i c0 = _mm_unpacklo_epi32(b0, b2); const __m128i c1 = _mm_unpackhi_epi32(b0, b2); const __m128i c2 = _mm_unpacklo_epi32(b1, b3); const __m128i c3 = _mm_unpackhi_epi32(b1, b3); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 40 50 60 70 // out[1]: 01 11 21 31 41 51 61 71 // out[2]: 02 12 22 32 42 52 62 72 // out[3]: 03 13 23 33 43 53 63 73 // out[4]: 04 14 24 34 44 54 64 74 // out[5]: 05 15 25 35 45 55 65 75 // out[6]: 06 16 26 36 46 56 66 76 // out[7]: 07 17 27 37 47 57 67 77 out[0] = _mm_unpacklo_epi64(c0, c0); out[1] = _mm_unpackhi_epi64(c0, c0); out[2] = _mm_unpacklo_epi64(c1, c1); out[3] = _mm_unpackhi_epi64(c1, c1); out[4] = _mm_unpacklo_epi64(c2, c2); out[5] = _mm_unpackhi_epi64(c2, c2); out[6] = _mm_unpacklo_epi64(c3, c3); out[7] = _mm_unpackhi_epi64(c3, c3); } static inline void transpose_16bit_4x4(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. 
Goes from: // in[0]: 00 01 02 03 XX XX XX XX // in[1]: 10 11 12 13 XX XX XX XX // in[2]: 20 21 22 23 XX XX XX XX // in[3]: 30 31 32 33 XX XX XX XX // to: // a0: 00 10 01 11 02 12 03 13 // a1: 20 30 21 31 22 32 23 33 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); // Unpack 32 bit elements resulting in: // out[0]: 00 10 20 30 01 11 21 31 // out[1]: 01 11 21 31 __ __ __ __ // out[2]: 02 12 22 32 03 13 23 33 // out[3]: 03 13 23 33 __ __ __ __ // // Note: The high 64 bits of the output registers are shown for informational // purposes only. Callers should only use the low 64 bits of the output // registers. "__" indicates zeros. out[0] = _mm_unpacklo_epi32(a0, a1); out[1] = _mm_srli_si128(out[0], 8); out[2] = _mm_unpackhi_epi32(a0, a1); out[3] = _mm_srli_si128(out[2], 8); } static inline void transpose_16bit_4x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 02 03 XX XX XX XX // in[1]: 10 11 12 13 XX XX XX XX // in[2]: 20 21 22 23 XX XX XX XX // in[3]: 30 31 32 33 XX XX XX XX // in[4]: 40 41 42 43 XX XX XX XX // in[5]: 50 51 52 53 XX XX XX XX // in[6]: 60 61 62 63 XX XX XX XX // in[7]: 70 71 72 73 XX XX XX XX // to: // a0: 00 10 01 11 02 12 03 13 // a1: 20 30 21 31 22 32 23 33 // a2: 40 50 41 51 42 52 43 53 // a3: 60 70 61 71 62 72 63 73 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); // Unpack 32 bit elements resulting in: // b0: 00 10 20 30 01 11 21 31 // b1: 40 50 60 70 41 51 61 71 // b2: 02 12 22 32 03 13 23 33 // b3: 42 52 62 72 43 53 63 73 const __m128i b0 = _mm_unpacklo_epi32(a0, a1); const __m128i b1 = _mm_unpacklo_epi32(a2, a3); const __m128i b2 = _mm_unpackhi_epi32(a0, a1); const __m128i b3 = _mm_unpackhi_epi32(a2, a3); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 40 50 60 70 // out[1]: 01 11 21 31 41 51 61 71 // out[2]: 02 12 22 32 42 52 62 72 // out[3]: 03 13 23 33 43 53 63 73 out[0] = _mm_unpacklo_epi64(b0, b1); out[1] = _mm_unpackhi_epi64(b0, b1); out[2] = _mm_unpacklo_epi64(b2, b3); out[3] = _mm_unpackhi_epi64(b2, b3); } static inline void transpose_16bit_8x4(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. 
Goes from: // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 // to: // a0: 00 10 01 11 02 12 03 13 // a1: 20 30 21 31 22 32 23 33 // a4: 04 14 05 15 06 16 07 17 // a5: 24 34 25 35 26 36 27 37 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); // Unpack 32 bit elements resulting in: // b0: 00 10 20 30 01 11 21 31 // b2: 04 14 24 34 05 15 25 35 // b4: 02 12 22 32 03 13 23 33 // b6: 06 16 26 36 07 17 27 37 const __m128i b0 = _mm_unpacklo_epi32(a0, a1); const __m128i b2 = _mm_unpacklo_epi32(a4, a5); const __m128i b4 = _mm_unpackhi_epi32(a0, a1); const __m128i b6 = _mm_unpackhi_epi32(a4, a5); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 XX XX XX XX // out[1]: 01 11 21 31 XX XX XX XX // out[2]: 02 12 22 32 XX XX XX XX // out[3]: 03 13 23 33 XX XX XX XX // out[4]: 04 14 24 34 XX XX XX XX // out[5]: 05 15 25 35 XX XX XX XX // out[6]: 06 16 26 36 XX XX XX XX // out[7]: 07 17 27 37 XX XX XX XX const __m128i zeros = _mm_setzero_si128(); out[0] = _mm_unpacklo_epi64(b0, zeros); out[1] = _mm_unpackhi_epi64(b0, zeros); out[2] = _mm_unpacklo_epi64(b4, zeros); out[3] = _mm_unpackhi_epi64(b4, zeros); out[4] = _mm_unpacklo_epi64(b2, zeros); out[5] = _mm_unpackhi_epi64(b2, zeros); out[6] = _mm_unpacklo_epi64(b6, zeros); out[7] = _mm_unpackhi_epi64(b6, zeros); } static inline void transpose_16bit_8x8(const __m128i *const in, __m128i *const out) { // Unpack 16 bit elements. Goes from: // in[0]: 00 01 02 03 04 05 06 07 // in[1]: 10 11 12 13 14 15 16 17 // in[2]: 20 21 22 23 24 25 26 27 // in[3]: 30 31 32 33 34 35 36 37 // in[4]: 40 41 42 43 44 45 46 47 // in[5]: 50 51 52 53 54 55 56 57 // in[6]: 60 61 62 63 64 65 66 67 // in[7]: 70 71 72 73 74 75 76 77 // to: // a0: 00 10 01 11 02 12 03 13 // a1: 20 30 21 31 22 32 23 33 // a2: 40 50 41 51 42 52 43 53 // a3: 60 70 61 71 62 72 63 73 // a4: 04 14 05 15 06 16 07 17 // a5: 24 34 25 35 26 36 27 37 // a6: 44 54 45 55 46 56 47 57 // a7: 64 74 65 75 66 76 67 77 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); // Unpack 32 bit elements resulting in: // b0: 00 10 20 30 01 11 21 31 // b1: 40 50 60 70 41 51 61 71 // b2: 04 14 24 34 05 15 25 35 // b3: 44 54 64 74 45 55 65 75 // b4: 02 12 22 32 03 13 23 33 // b5: 42 52 62 72 43 53 63 73 // b6: 06 16 26 36 07 17 27 37 // b7: 46 56 66 76 47 57 67 77 const __m128i b0 = _mm_unpacklo_epi32(a0, a1); const __m128i b1 = _mm_unpacklo_epi32(a2, a3); const __m128i b2 = _mm_unpacklo_epi32(a4, a5); const __m128i b3 = _mm_unpacklo_epi32(a6, a7); const __m128i b4 = _mm_unpackhi_epi32(a0, a1); const __m128i b5 = _mm_unpackhi_epi32(a2, a3); const __m128i b6 = _mm_unpackhi_epi32(a4, a5); const __m128i b7 = _mm_unpackhi_epi32(a6, a7); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 40 50 60 70 // out[1]: 01 11 21 31 41 51 61 71 // out[2]: 02 12 22 32 42 52 62 72 // out[3]: 03 13 23 33 43 53 63 73 // out[4]: 04 14 24 34 44 54 64 74 // out[5]: 05 15 25 35 45 55 65 75 // out[6]: 06 16 26 36 46 56 66 76 // 
out[7]: 07 17 27 37 47 57 67 77 out[0] = _mm_unpacklo_epi64(b0, b1); out[1] = _mm_unpackhi_epi64(b0, b1); out[2] = _mm_unpacklo_epi64(b4, b5); out[3] = _mm_unpackhi_epi64(b4, b5); out[4] = _mm_unpacklo_epi64(b2, b3); out[5] = _mm_unpackhi_epi64(b2, b3); out[6] = _mm_unpacklo_epi64(b6, b7); out[7] = _mm_unpackhi_epi64(b6, b7); } // Transpose in-place static inline void transpose_16bit_16x16(__m128i *const left, __m128i *const right) { __m128i tbuf[8]; transpose_16bit_8x8(left, left); transpose_16bit_8x8(right, tbuf); transpose_16bit_8x8(left + 8, right); transpose_16bit_8x8(right + 8, right + 8); left[8] = tbuf[0]; left[9] = tbuf[1]; left[10] = tbuf[2]; left[11] = tbuf[3]; left[12] = tbuf[4]; left[13] = tbuf[5]; left[14] = tbuf[6]; left[15] = tbuf[7]; } static inline void transpose_32bit_4x4(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 // in[3]: 30 31 32 33 // to: // a0: 00 10 01 11 // a1: 20 30 21 31 // a2: 02 12 03 13 // a3: 22 32 23 33 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 // out[1]: 01 11 21 31 // out[2]: 02 12 22 32 // out[3]: 03 13 23 33 out[0] = _mm_unpacklo_epi64(a0, a1); out[1] = _mm_unpackhi_epi64(a0, a1); out[2] = _mm_unpacklo_epi64(a2, a3); out[3] = _mm_unpackhi_epi64(a2, a3); } static inline void transpose_32bit_4x4x2(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 // in[3]: 30 31 32 33 // in[4]: 04 05 06 07 // in[5]: 14 15 16 17 // in[6]: 24 25 26 27 // in[7]: 34 35 36 37 // to: // a0: 00 10 01 11 // a1: 20 30 21 31 // a2: 02 12 03 13 // a3: 22 32 23 33 // a4: 04 14 05 15 // a5: 24 34 25 35 // a6: 06 16 07 17 // a7: 26 36 27 37 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 // out[1]: 01 11 21 31 // out[2]: 02 12 22 32 // out[3]: 03 13 23 33 // out[4]: 04 14 24 34 // out[5]: 05 15 25 35 // out[6]: 06 16 26 36 // out[7]: 07 17 27 37 out[0] = _mm_unpacklo_epi64(a0, a1); out[1] = _mm_unpackhi_epi64(a0, a1); out[2] = _mm_unpacklo_epi64(a2, a3); out[3] = _mm_unpackhi_epi64(a2, a3); out[4] = _mm_unpacklo_epi64(a4, a5); out[5] = _mm_unpackhi_epi64(a4, a5); out[6] = _mm_unpacklo_epi64(a6, a7); out[7] = _mm_unpackhi_epi64(a6, a7); } static inline void transpose_32bit_8x4(const __m128i *const in, __m128i *const out) { // Unpack 32 bit elements. 
Goes from: // in[0]: 00 01 02 03 // in[1]: 04 05 06 07 // in[2]: 10 11 12 13 // in[3]: 14 15 16 17 // in[4]: 20 21 22 23 // in[5]: 24 25 26 27 // in[6]: 30 31 32 33 // in[7]: 34 35 36 37 // to: // a0: 00 10 01 11 // a1: 20 30 21 31 // a2: 02 12 03 13 // a3: 22 32 23 33 // a4: 04 14 05 15 // a5: 24 34 25 35 // a6: 06 16 07 17 // a7: 26 36 27 37 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); // Unpack 64 bit elements resulting in: // out[0]: 00 10 20 30 // out[1]: 01 11 21 31 // out[2]: 02 12 22 32 // out[3]: 03 13 23 33 // out[4]: 04 14 24 34 // out[5]: 05 15 25 35 // out[6]: 06 16 26 36 // out[7]: 07 17 27 37 out[0] = _mm_unpacklo_epi64(a0, a1); out[1] = _mm_unpackhi_epi64(a0, a1); out[2] = _mm_unpacklo_epi64(a2, a3); out[3] = _mm_unpackhi_epi64(a2, a3); out[4] = _mm_unpacklo_epi64(a4, a5); out[5] = _mm_unpackhi_epi64(a4, a5); out[6] = _mm_unpacklo_epi64(a6, a7); out[7] = _mm_unpackhi_epi64(a6, a7); } #endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ aom-3.12.1/aom_dsp/x86/txfm_common_avx2.h000066400000000000000000000312561477627663500201160ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ #define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ #include #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" #ifdef __cplusplus extern "C" { #endif static inline __m256i pair_set_w16_epi16(int16_t a, int16_t b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } static inline void btf_16_w16_avx2(const __m256i w0, const __m256i w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); __m256i u0 = _mm256_madd_epi16(t0, w0); __m256i u1 = _mm256_madd_epi16(t1, w0); __m256i v0 = _mm256_madd_epi16(t0, w1); __m256i v1 = _mm256_madd_epi16(t1, w1); __m256i a0 = _mm256_add_epi32(u0, _r); __m256i a1 = _mm256_add_epi32(u1, _r); __m256i b0 = _mm256_add_epi32(v0, _r); __m256i b1 = _mm256_add_epi32(v1, _r); __m256i c0 = _mm256_srai_epi32(a0, cos_bit); __m256i c1 = _mm256_srai_epi32(a1, cos_bit); __m256i d0 = _mm256_srai_epi32(b0, cos_bit); __m256i d1 = _mm256_srai_epi32(b1, cos_bit); *in0 = _mm256_packs_epi32(c0, c1); *in1 = _mm256_packs_epi32(d0, d1); } static inline void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { const __m256i _in0 = *in0; const __m256i _in1 = *in1; *in0 = _mm256_adds_epi16(_in0, _in1); *in1 = _mm256_subs_epi16(_in0, _in1); } static inline void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { const __m256i _in0 = *in0; const __m256i _in1 = *in1; *in0 = _mm256_add_epi32(_in0, _in1); *in1 = _mm256_sub_epi32(_in0, _in1); } static inline void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, __m256i in0, __m256i in1) { const __m256i _in0 = in0; const __m256i _in1 = in1; *out0 = _mm256_adds_epi16(_in0, _in1); *out1 = _mm256_subs_epi16(_in0, _in1); } static inline void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, __m256i in0, __m256i in1) { const __m256i _in0 = in0; const __m256i _in1 = in1; *out0 = _mm256_add_epi32(_in0, _in1); *out1 = _mm256_sub_epi32(_in0, _in1); } static inline __m256i load_16bit_to_16bit_avx2(const int16_t *a) { return _mm256_load_si256((const __m256i *)a); } static inline void load_buffer_16bit_to_16bit_avx2(const int16_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_16bit_to_16bit_avx2(in + i * stride); } } static inline void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); } } static inline __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); return _mm256_permute4x64_epi64(b, 0xD8); } static inline void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); } } static inline void transpose2_8x8_avx2(const __m256i *const in, __m256i *const out) { __m256i t[16], u[16]; // (1st, 2nd) ==> (lo, hi) // (0, 1) ==> (0, 1) // (2, 3) ==> (2, 3) // (4, 5) ==> (4, 5) // (6, 7) ==> (6, 7) for (int i = 0; i < 4; i++) { t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); } // (1st, 2nd) ==> (lo, hi) // (0, 2) ==> (0, 2) // (1, 3) ==> (1, 3) // (4, 6) ==> (4, 6) // (5, 7) ==> (5, 7) for (int i = 0; i < 2; i++) 
{ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); } // (1st, 2nd) ==> (lo, hi) // (0, 4) ==> (0, 1) // (1, 5) ==> (4, 5) // (2, 6) ==> (2, 3) // (3, 7) ==> (6, 7) for (int i = 0; i < 2; i++) { out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); } } static inline void transpose_16bit_16x16_avx2(const __m256i *const in, __m256i *const out) { __m256i t[16]; #define LOADL(idx) \ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ t[idx] = _mm256_inserti128_si256( \ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); #define LOADR(idx) \ t[8 + idx] = \ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ t[8 + idx] = _mm256_inserti128_si256( \ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); // load left 8x16 LOADL(0) LOADL(1) LOADL(2) LOADL(3) LOADL(4) LOADL(5) LOADL(6) LOADL(7) // load right 8x16 LOADR(0) LOADR(1) LOADR(2) LOADR(3) LOADR(4) LOADR(5) LOADR(6) LOADR(7) // get the top 16x8 result transpose2_8x8_avx2(t, out); // get the bottom 16x8 result transpose2_8x8_avx2(&t[8], &out[8]); } static inline void transpose_16bit_16x8_avx2(const __m256i *const in, __m256i *const out) { const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); out[0] = _mm256_unpacklo_epi64(b0, b1); out[1] = _mm256_unpackhi_epi64(b0, b1); out[2] = _mm256_unpacklo_epi64(b4, b5); out[3] = _mm256_unpackhi_epi64(b4, b5); out[4] = _mm256_unpacklo_epi64(b2, b3); out[5] = _mm256_unpackhi_epi64(b2, b3); out[6] = _mm256_unpacklo_epi64(b6, b7); out[7] = _mm256_unpackhi_epi64(b6, b7); } static inline void flip_buf_avx2(__m256i *in, __m256i *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } static inline void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { if (bit < 0) { bit = -bit; __m256i round = _mm256_set1_epi16(1 << (bit - 1)); for (int i = 0; i < size; ++i) { in[i] = _mm256_adds_epi16(in[i], round); in[i] = _mm256_srai_epi16(in[i], bit); } } else if (bit > 0) { for (int i = 0; i < size; ++i) { in[i] = _mm256_slli_epi16(in[i], bit); } } } static inline __m256i round_shift_32_avx2(__m256i vec, int bit) { __m256i tmp, round; round = _mm256_set1_epi32(1 << (bit - 1)); tmp = _mm256_add_epi32(vec, round); return _mm256_srai_epi32(tmp, bit); } static inline void round_shift_array_32_avx2(__m256i *input, __m256i *output, const int size, const int bit) { if (bit > 0) { int i; for (i = 
0; i < size; i++) { output[i] = round_shift_32_avx2(input[i], bit); } } else { int i; for (i = 0; i < size; i++) { output[i] = _mm256_slli_epi32(input[i], -bit); } } } static inline void round_shift_rect_array_32_avx2(__m256i *input, __m256i *output, const int size, const int bit, const int val) { const __m256i sqrt2 = _mm256_set1_epi32(val); if (bit > 0) { int i; for (i = 0; i < size; i++) { const __m256i r0 = round_shift_32_avx2(input[i], bit); const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); output[i] = round_shift_32_avx2(r1, NewSqrt2Bits); } } else { int i; for (i = 0; i < size; i++) { const __m256i r0 = _mm256_slli_epi32(input[i], -bit); const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); output[i] = round_shift_32_avx2(r1, NewSqrt2Bits); } } } static inline __m256i scale_round_avx2(const __m256i a, const int scale) { const __m256i scale_rounding = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); const __m256i b = _mm256_madd_epi16(a, scale_rounding); return _mm256_srai_epi32(b, NewSqrt2Bits); } static inline void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, int32_t *const b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_lo = _mm256_unpacklo_epi16(a, one); const __m256i a_hi = _mm256_unpackhi_epi16(a, one); const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); _mm256_store_si256((__m256i *)(b + 64), temp); } static inline void store_rect_buffer_16bit_to_32bit_w8_avx2( const __m256i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); } } static inline void pack_reg(const __m128i *in1, const __m128i *in2, __m256i *out) { out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); } static inline void extract_reg(const __m256i *in, __m128i *out1) { out1[0] = _mm256_castsi256_si128(in[0]); out1[1] = _mm256_castsi256_si128(in[1]); out1[2] = _mm256_castsi256_si128(in[2]); out1[3] = _mm256_castsi256_si128(in[3]); out1[4] = _mm256_castsi256_si128(in[4]); out1[5] = _mm256_castsi256_si128(in[5]); out1[6] = _mm256_castsi256_si128(in[6]); out1[7] = _mm256_castsi256_si128(in[7]); out1[8] = _mm256_extracti128_si256(in[0], 0x01); out1[9] = _mm256_extracti128_si256(in[1], 0x01); out1[10] = _mm256_extracti128_si256(in[2], 0x01); out1[11] = _mm256_extracti128_si256(in[3], 0x01); out1[12] = _mm256_extracti128_si256(in[4], 0x01); out1[13] = _mm256_extracti128_si256(in[5], 0x01); out1[14] = _mm256_extracti128_si256(in[6], 0x01); out1[15] = _mm256_extracti128_si256(in[7], 0x01); } #ifdef __cplusplus } #endif #endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ 
aom-3.12.1/aom_dsp/x86/txfm_common_sse2.h000066400000000000000000000024621477627663500201070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ #define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ #include #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" #define pair_set_epi16(a, b) \ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))) // Reverse the 8 16 bit words in __m128i static inline __m128i mm_reverse_epi16(const __m128i x) { const __m128i a = _mm_shufflelo_epi16(x, 0x1b); const __m128i b = _mm_shufflehi_epi16(a, 0x1b); return _mm_shuffle_epi32(b, 0x4e); } #define octa_set_epi16(a, b, c, d, e, f, g, h) \ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) #endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ aom-3.12.1/aom_dsp/x86/variance_avx2.c000066400000000000000000001215601477627663500173510ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/masked_variance_intrin_ssse3.h" #include "aom_dsp/x86/synonyms.h" static inline __m128i mm256_add_hi_lo_epi16(const __m256i val) { return _mm_add_epi16(_mm256_castsi256_si128(val), _mm256_extractf128_si256(val, 1)); } static inline __m128i mm256_add_hi_lo_epi32(const __m256i val) { return _mm_add_epi32(_mm256_castsi256_si128(val), _mm256_extractf128_si256(val, 1)); } static inline void variance_kernel_avx2(const __m256i src, const __m256i ref, __m256i *const sse, __m256i *const sum) { const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) // unpack into pairs of source and reference values const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); // subtract adjacent elements using src*1 + ref*-1 const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); // add to the running totals *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); } static inline int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); // unpack sse and sum registers and add const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); // perform the final summation and extract the results const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); *((int *)sse) = _mm_cvtsi128_si32(res); return _mm_extract_epi32(res, 1); } // handle pixels (<= 512) static inline int variance_final_512_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); } // handle 1024 pixels (32x32, 16x64, 64x16) static inline int variance_final_1024_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); const __m128i vsum_64 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); } static inline __m256i sum_to_32bit_avx2(const __m256i sum) { const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); const __m256i sum_hi = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); return _mm256_add_epi32(sum_lo, sum_hi); } // handle 2048 pixels (32x64, 64x32) static inline int variance_final_2048_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { vsum = sum_to_32bit_avx2(vsum); const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); } static inline void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); 
const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); variance_kernel_avx2(s, r, sse, sum); } static inline void variance32_kernel_avx2(const uint8_t *const src, const uint8_t *const ref, __m256i *const sse, __m256i *const sum) { const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); variance_kernel_avx2(s, r, sse, sum); } static inline void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { *vsum = _mm256_setzero_si256(); for (int i = 0; i < h; i += 2) { variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); src += 2 * src_stride; ref += 2 * ref_stride; } } static inline void variance32_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { *vsum = _mm256_setzero_si256(); for (int i = 0; i < h; i++) { variance32_kernel_avx2(src, ref, vsse, vsum); src += src_stride; ref += ref_stride; } } static inline void variance64_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { *vsum = _mm256_setzero_si256(); for (int i = 0; i < h; i++) { variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); src += src_stride; ref += ref_stride; } } static inline void variance128_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, __m256i *const vsum) { *vsum = _mm256_setzero_si256(); for (int i = 0; i < h; i++) { variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); src += src_stride; ref += ref_stride; } } #define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ unsigned int aom_variance##bw##x##bh##_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ __m256i vsse = _mm256_setzero_si256(); \ __m256i vsum; \ variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ } AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512) AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512) AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512) AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512) AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024) AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048) AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048) #if !CONFIG_REALTIME_ONLY AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024) AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512) AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024) AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512) #endif #define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ unsigned int aom_variance##bw##x##bh##_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ __m256i vsse = _mm256_setzero_si256(); \ __m256i vsum = _mm256_setzero_si256(); \ for (int i = 0; i < (bh / uh); i++) { \ __m256i vsum16; \ 
variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ &vsum16); \ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ src += uh * src_stride; \ ref += uh * ref_stride; \ } \ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ } AOM_VAR_LOOP_AVX2(64, 64, 12, 32) // 64x32 * ( 64/32) AOM_VAR_LOOP_AVX2(64, 128, 13, 32) // 64x32 * (128/32) AOM_VAR_LOOP_AVX2(128, 64, 13, 16) // 128x16 * ( 64/16) AOM_VAR_LOOP_AVX2(128, 128, 14, 16) // 128x16 * (128/16) unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); return *sse; } static inline __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } #if CONFIG_AV1_HIGHBITDEPTH static inline __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { const __m256i d = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); const __m256i ma = _mm256_sub_epi8(alpha_max, a); const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); _mm256_storeu_si256((__m256i *)(comp_pred), roundA); } void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int row = 0; if (width == 8) { do { const __m256i pred_0123 = _mm256_loadu_si256((const __m256i *)(pred)); const __m128i ref_0 = _mm_loadl_epi64((const __m128i *)(ref)); const __m128i ref_1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); const __m128i ref_2 = _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); const __m128i ref_3 = _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride)); const __m128i ref_01 = _mm_unpacklo_epi64(ref_0, ref_1); const __m128i ref_23 = _mm_unpacklo_epi64(ref_2, ref_3); const __m256i ref_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(ref_01), ref_23, 1); const __m256i average = _mm256_avg_epu8(pred_0123, ref_0123); _mm256_storeu_si256((__m256i *)(comp_pred), average); row += 4; pred += 32; comp_pred += 32; ref += 4 * ref_stride; } while (row < height); } else if (width == 16) { do { const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred)); const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32)); const __m256i tmp0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref))); const __m256i ref_0 = _mm256_inserti128_si256( tmp0, 
_mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); const __m256i tmp1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); const __m256i ref_1 = _mm256_inserti128_si256( tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); _mm256_storeu_si256((__m256i *)(comp_pred), average_0); _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1); row += 4; pred += 64; comp_pred += 64; ref += 4 * ref_stride; } while (row < height); } else if (width == 32) { do { const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred)); const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32)); const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref)); const __m256i ref_1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); _mm256_storeu_si256((__m256i *)(comp_pred), average_0); _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1); row += 2; pred += 64; comp_pred += 64; ref += 2 * ref_stride; } while (row < height); } else if (width % 64 == 0) { do { for (int x = 0; x < width; x += 64) { const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred + x)); const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + x + 32)); const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); const __m256i ref_1 = _mm256_loadu_si256((const __m256i *)(ref + x + 32)); const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); _mm256_storeu_si256((__m256i *)(comp_pred + x), average_0); _mm256_storeu_si256((__m256i *)(comp_pred + x + 32), average_1); } row++; pred += width; comp_pred += width; ref += ref_stride; } while (row < height); } else { aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride); } } void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; const uint8_t *src0 = invert_mask ? pred : ref; const uint8_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? 
ref_stride : width; if (width == 8) { comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, mask, mask_stride); } else if (width == 16) { do { const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); const __m256i aA = mm256_loadu2(mask + mask_stride, mask); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); const __m256i aB = mm256_loadu2(mask + mask_stride, mask); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); // comp_pred's stride == width == 16 comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); comp_pred += (16 << 2); i += 4; } while (i < height); } else { do { for (int x = 0; x < width; x += 32) { const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x)); const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x)); const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x)); comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); comp_pred += 32; } src0 += stride0; src1 += stride1; mask += mask_stride; i++; } while (i < height); } } #if CONFIG_AV1_HIGHBITDEPTH static inline __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a) { const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_const = _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); const __m256i pred_l = _mm256_srai_epi32( _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); const __m256i pred_h = _mm256_srai_epi32( _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); return comp; } void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? 
ref_stride : width; const __m256i zero = _mm256_setzero_si256(); if (width == 8) { do { const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); __m256i m = _mm256_castsi128_si256(m_l); m = _mm256_insertf128_si256(m, m_h, 1); const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); _mm_storeu_si128((__m128i *)(comp_pred + width), _mm256_extractf128_si256(comp, 1)); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += (mask_stride << 1); comp_pred += (width << 1); i += 2; } while (i < height); } else if (width == 16) { do { const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); const __m256i m_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } else { do { for (int x = 0; x < width; x += 32) { const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x)); const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16)); const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x)); const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16)); const __m256i m01_16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x))); const __m256i m23_16 = _mm256_cvtepu8_epi16( _mm_loadu_si128((const __m128i *)(mask + x + 16))); const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); _mm256_storeu_si256((__m256i *)comp_pred, comp); _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); comp_pred += 32; } src0 += stride0; src1 += stride1; mask += mask_stride; i += 1; } while (i < height); } } #endif // CONFIG_AV1_HIGHBITDEPTH static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16; __m256i res0_4x64, res1_4x64; __m256i sub_result; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); for (int i = 0; i < h; i += 4) { dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride])); dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride])); dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride])); dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride])); dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8), _mm_unpacklo_epi32(dst2_4x8, dst3_4x8)); dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8); src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride])); src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride])); src0_8x16 = _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16)); src1_8x16 = 
_mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16)); src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); // r15 r14 r13------------r1 r0 - 16 bit sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit src_16x16 = _mm256_madd_epi16(sub_result, sub_result); // accumulation of result square_result = _mm256_add_epi32(square_result, src_16x16); } // s5 s4 s1 s0 - 64bit res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); // s7 s6 s3 s2 - 64bit res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); // r3 r2 r1 r0 - 64bit res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); // r1+r3 r2+r0 - 64bit const __m128i sum_1x64 = _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), _mm256_extracti128_si256(res0_4x64, 1)); xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); return sum; } // Compute mse of four consecutive 4x4 blocks. // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int src_blk_stride, int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; __m256i res0_4x64, res1_4x64; __m256i sub_result_0, sub_result_1, sub_result_2, sub_result_3; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = zeros; uint16_t *src_temp = src; for (int i = 0; i < h; i += 4) { dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride])); dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride])); dst2_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 2) * dstride])); dst3_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 3) * dstride])); // row0 of 1st,2nd, 3rd and 4th 4x4 blocks- d00 d10 d20 d30 dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8); // row1 of 1st,2nd, 3rd and 4th 4x4 blocks - d01 d11 d21 d31 dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8); // row2 of 1st,2nd, 3rd and 4th 4x4 blocks - d02 d12 d22 d32 dst2_16x16 = _mm256_cvtepu8_epi16(dst2_16x8); // row3 of 1st,2nd, 3rd and 4th 4x4 blocks - d03 d13 d23 d33 dst3_16x16 = _mm256_cvtepu8_epi16(dst3_16x8); // All rows of 1st 4x4 block - r00 r01 r02 r03 __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0])); // All rows of 2nd 4x4 block - r10 r11 r12 r13 __m256i src1_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride])); // All rows of 3rd 4x4 block - r20 r21 r22 r23 __m256i src2_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[2 * src_blk_stride])); // All rows of 4th 4x4 block - r30 r31 r32 r33 __m256i src3_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[3 * src_blk_stride])); // r00 r10 r02 r12 __m256i tmp0_16x16 = _mm256_unpacklo_epi64(src0_16x16, src1_16x16); // r01 r11 r03 r13 __m256i tmp1_16x16 = _mm256_unpackhi_epi64(src0_16x16, src1_16x16); // r20 r30 r22 r32 __m256i tmp2_16x16 = _mm256_unpacklo_epi64(src2_16x16, src3_16x16); // r21 r31 r23 r33 __m256i tmp3_16x16 = _mm256_unpackhi_epi64(src2_16x16, src3_16x16); // r00 r10 r20 r30 src0_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x20); // r01 r11 r21 r31 src1_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x20); // r02 r12 r22 r32 src2_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x31); // r03 r13 r23 r33 src3_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 
0x31); // r15 r14 r13------------r1 r0 - 16 bit sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(src0_16x16, dst0_16x16)); sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(src1_16x16, dst1_16x16)); sub_result_2 = _mm256_abs_epi16(_mm256_sub_epi16(src2_16x16, dst2_16x16)); sub_result_3 = _mm256_abs_epi16(_mm256_sub_epi16(src3_16x16, dst3_16x16)); // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0); src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1); src2_16x16 = _mm256_madd_epi16(sub_result_2, sub_result_2); src3_16x16 = _mm256_madd_epi16(sub_result_3, sub_result_3); // accumulation of result src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16); src2_16x16 = _mm256_add_epi32(src2_16x16, src3_16x16); const __m256i square_result_0 = _mm256_add_epi32(src0_16x16, src2_16x16); square_result = _mm256_add_epi32(square_result, square_result_0); src_temp += 16; } // s5 s4 s1 s0 - 64bit res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); // s7 s6 s3 s2 - 64bit res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); // r3 r2 r1 r0 - 64bit res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); // r1+r3 r2+r0 - 64bit const __m128i sum_1x64 = _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), _mm256_extracti128_si256(res0_4x64, 1)); xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); return sum; } static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst3_16x8; __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; __m256i res0_4x64, res1_4x64; __m256i sub_result; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); for (int i = 0; i < h; i += 2) { dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8); dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8); src0_8x16 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride])); src1_8x16 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride])); src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); // r15 r14 r13 - - - r1 r0 - 16 bit sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit src_16x16 = _mm256_madd_epi16(sub_result, sub_result); // accumulation of result square_result = _mm256_add_epi32(square_result, src_16x16); } // s5 s4 s1 s0 - 64bit res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); // s7 s6 s3 s2 - 64bit res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); // r3 r2 r1 r0 - 64bit res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); // r1+r3 r2+r0 - 64bit const __m128i sum_1x64 = _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), _mm256_extracti128_si256(res0_4x64, 1)); xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); return sum; } // Compute mse of two consecutive 8x8 blocks. // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. 
static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int src_blk_stride, int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8; __m256i dst0_16x16, dst1_16x16; __m256i res0_4x64, res1_4x64; __m256i sub_result_0, sub_result_1; const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); __m256i square_result = zeros; uint16_t *src_temp = src; for (int i = 0; i < h; i += 2) { dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride])); dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride])); // row0 of 1st and 2nd 8x8 block - d00 d10 dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8); // row1 of 1st and 2nd 8x8 block - d01 d11 dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8); // 2 rows of 1st 8x8 block - r00 r01 __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0])); // 2 rows of 2nd 8x8 block - r10 r11 __m256i src1_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride])); // r00 r10 - 128bit __m256i tmp0_16x16 = _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20); // r01 r11 - 128bit __m256i tmp1_16x16 = _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31); // r15 r14 r13------------r1 r0 - 16 bit sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16)); sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16)); // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0); src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1); // accumulation of result src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16); square_result = _mm256_add_epi32(square_result, src0_16x16); src_temp += 16; } // s5 s4 s1 s0 - 64bit res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); // s7 s6 s3 s2 - 64bit res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); // r3 r2 r1 r0 - 64bit res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); // r1+r3 r2+r0 - 64bit const __m128i sum_1x64 = _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), _mm256_extracti128_si256(res0_4x64, 1)); xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); return sum; } uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } // Computes mse of two 8x8 or four 4x4 consecutive blocks. Luma plane uses 8x8 // block and Chroma uses 4x4 block. In src buffer, each block in a filter block // is stored sequentially. Hence src_blk_stride is same as block width. Whereas // dst buffer is a frame buffer, thus dstride is a frame level stride. 
uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); default: assert(0 && "unsupported width"); return -1; } } static inline void calc_sum_sse_wd32_avx2(const uint8_t *src, const uint8_t *ref, __m256i set_one_minusone, __m256i sse_8x16[2], __m256i sum_8x16[2]) { const __m256i s00_256 = _mm256_loadu_si256((__m256i const *)(src)); const __m256i r00_256 = _mm256_loadu_si256((__m256i const *)(ref)); const __m256i u_low_256 = _mm256_unpacklo_epi8(s00_256, r00_256); const __m256i u_high_256 = _mm256_unpackhi_epi8(s00_256, r00_256); const __m256i diff0 = _mm256_maddubs_epi16(u_low_256, set_one_minusone); const __m256i diff1 = _mm256_maddubs_epi16(u_high_256, set_one_minusone); sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0)); sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1)); sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff0); sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1); } static inline __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16, unsigned int *tot_sse, int *tot_sum) { // s00 s01 s10 s11 s20 s21 s30 s31 const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]); // d00 d01 d02 d03 | d10 d11 d12 d13 | d20 d21 d22 d23 | d30 d31 d32 d33 const __m256i sum_result_r0 = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]); // d00 d01 d10 d11 | d00 d02 d10 d11 | d20 d21 d30 d31 | d20 d21 d30 d31 const __m256i sum_result_1 = _mm256_hadd_epi16(sum_result_r0, sum_result_r0); // d00 d01 d10 d11 d20 d21 d30 d31 | X const __m256i sum_result_3 = _mm256_permute4x64_epi64(sum_result_1, 0x08); // d00 d01 d10 d11 d20 d21 d30 d31 const __m256i sum_results = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_result_3)); // Add sum & sse registers appropriately to get total sum & sse separately. // s0 s1 d0 d1 s2 s3 d2 d3 const __m256i sum_sse_add = _mm256_hadd_epi32(sse_results, sum_results); // s0 s1 s2 s3 d0 d1 d2 d3 const __m256i sum_sse_order_add = _mm256_permute4x64_epi64(sum_sse_add, 0xd8); // s0+s1 s2+s3 s0+s1 s2+s3 d0+d1 d2+d3 d0+d1 d2+d3 const __m256i sum_sse_order_add_1 = _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add); // s0 x x x | d0 x x x const __m256i sum_sse_order_add_final = _mm256_hadd_epi32(sum_sse_order_add_1, sum_sse_order_add_1); // s0 const uint32_t first_value = (uint32_t)_mm256_extract_epi32(sum_sse_order_add_final, 0); *tot_sse += first_value; // d0 const int second_value = _mm256_extract_epi32(sum_sse_order_add_final, 4); *tot_sum += second_value; return sum_sse_order_add; } static inline void get_var_sse_sum_8x8_quad_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { assert(h <= 128); // May overflow for larger height. __m256i sse_8x16[2], sum_8x16[2]; sum_8x16[0] = _mm256_setzero_si256(); sse_8x16[0] = _mm256_setzero_si256(); sum_8x16[1] = sum_8x16[0]; sse_8x16[1] = sse_8x16[0]; const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01); for (int i = 0; i < h; i++) { // Process 8x32 block of one row. 
calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16); src += src_stride; ref += ref_stride; } const __m256i sum_sse_order_add = calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum); // s0 s1 s2 s3 _mm_storeu_si128((__m128i *)sse8x8, _mm256_castsi256_si128(sum_sse_order_add)); // d0 d1 d2 d3 const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1); _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8); // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3 const __m128i mull_results = _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6); // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3 const __m128i variance_8x8 = _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results); // v0 v1 v2 v3 _mm_storeu_si128((__m128i *)var8x8, variance_8x8); } static inline void get_var_sse_sum_16x16_dual_avx2( const uint8_t *src, int src_stride, const uint8_t *ref, const int ref_stride, const int h, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { assert(h <= 128); // May overflow for larger height. __m256i sse_16x16[2], sum_16x16[2]; sum_16x16[0] = _mm256_setzero_si256(); sse_16x16[0] = _mm256_setzero_si256(); sum_16x16[1] = sum_16x16[0]; sse_16x16[1] = sse_16x16[0]; const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01); for (int i = 0; i < h; i++) { // Process 16x32 block of one row. calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16); src += src_stride; ref += ref_stride; } const __m256i sum_sse_order_add = calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum); const __m256i sum_sse_order_add_1 = _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add); // s0+s1 s2+s3 x x _mm_storel_epi64((__m128i *)sse16x16, _mm256_castsi256_si128(sum_sse_order_add_1)); // d0+d1 d2+d3 x x const __m128i sum_temp16x16 = _mm256_extractf128_si256(sum_sse_order_add_1, 1); // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3 const __m128i mull_results = _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8); // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3 const __m128i variance_16x16 = _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results); // v0 v1 v2 v3 _mm_storel_epi64((__m128i *)var16x16, variance_16x16); } void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 8, sse8x8, sum8x8, tot_sse, tot_sum, var8x8); } void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 16, sse16x16, tot_sse, tot_sum, var16x16); } aom-3.12.1/aom_dsp/x86/variance_impl_avx2.c000066400000000000000000002035271477627663500203760ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // AVX2 #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" /* clang-format off */ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, }; /* clang-format on */ #define FILTER_SRC(filter) \ /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ \ /* add 8 to source */ \ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ \ /* divide source by 16 */ \ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); #define MERGE_WITH_SRC(src_reg, reg) \ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); #define LOAD_SRC_DST \ /* load source and destination */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); #define AVG_NEXT_SRC(src_reg, size_stride) \ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ /* average between current and next stride source */ \ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); #define MERGE_NEXT_SRC(src_reg, size_stride) \ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ MERGE_WITH_SRC(src_reg, src_next_reg) #define CALC_SUM_SSE_INSIDE_LOOP \ /* expand each byte to 2 bytes */ \ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ /* source - dest */ \ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ /* caculate sum */ \ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ /* calculate sse */ \ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); // final calculation to sum and sse #define CALC_SUM_AND_SSE \ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ \ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ \ sse_reg = _mm256_add_epi32(sse_reg, 
sse_reg_hi); \ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); // Functions related to sub pixel variance width 16 #define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ /* load source and destination of 2 rows and insert*/ \ src_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ dst_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); #define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ src_next_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ /* average between current and next stride source */ \ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); #define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ src_next_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ MERGE_WITH_SRC(src_reg, src_next_reg) #define LOAD_SRC_NEXT_BYTE_INSERT \ /* load source and another source from next row */ \ src_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ /* load source and next row source from 1 byte onwards */ \ src_next_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); #define LOAD_DST_INSERT \ dst_reg = _mm256_inserti128_si256( \ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); #define LOAD_SRC_MERGE_128BIT(filter) \ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ __m128i filter_128bit = _mm256_castsi256_si128(filter); \ __m128i pw8_128bit = _mm256_castsi256_si128(pw8); #define FILTER_SRC_128BIT(filter) \ /* filter the source */ \ src_lo = _mm_maddubs_epi16(src_lo, filter); \ src_hi = _mm_maddubs_epi16(src_hi, filter); \ \ /* add 8 to source */ \ src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ \ /* divide source by 16 */ \ src_lo = _mm_srai_epi16(src_lo, 4); \ src_hi = _mm_srai_epi16(src_hi, 4); // TODO(chiyotsai@google.com): These variance functions are macro-fied so we // don't have to manually optimize the individual for-loops. We could save some // binary size by optimizing the loops more carefully without duplicating the // codes with a macro. 
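// Scalar sketch (illustration only, not used by the kernels below) of the
// bilinear interpolation that FILTER_SRC performs with the table above: for a
// 3-bit sub-pel offset 'off' in [0, 7] the coefficient pair is
// (16 - 2 * off, 2 * off), so off == 0 reduces to a copy and off == 4 to a
// plain average, which is why those two cases get dedicated branches in the
// macros below. The helper name is hypothetical.
static inline uint8_t bilinear_filter_scalar_sketch(uint8_t a, uint8_t b,
                                                    int off) {
  return (uint8_t)((a * (16 - 2 * off) + b * (2 * off) + 8) >> 4);
}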
#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \ static inline int aom_sub_pixel_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse) { \ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ __m256i zero_reg; \ int i, sum; \ sum_reg = _mm256_setzero_si256(); \ sse_reg = _mm256_setzero_si256(); \ zero_reg = _mm256_setzero_si256(); \ \ /* x_offset = 0 and y_offset = 0 */ \ if (x_offset == 0) { \ if (y_offset == 0) { \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = 0 and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i src_next_reg; \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, src_stride) \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = 0 and y_offset = bilin interpolation */ \ } else { \ __m256i filter, pw8, src_next_reg; \ \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, src_stride) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ } \ /* x_offset = 4 and y_offset = 0 */ \ } else if (x_offset == 4) { \ if (y_offset == 0) { \ __m256i src_next_reg; \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = 4 and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i src_next_reg, src_avg; \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ AVG_NEXT_SRC(src_reg, 1) \ for (i = 0; i < height; i++) { \ src_avg = src_reg; \ src += src_stride; \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ /* average between previous average to current average */ \ src_avg = _mm256_avg_epu8(src_avg, src_reg); \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_avg, zero_reg) \ /* save current source average */ \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ /* x_offset = 4 and y_offset = bilin interpolation */ \ } else { \ __m256i filter, pw8, src_next_reg, src_avg; \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ AVG_NEXT_SRC(src_reg, 1) \ for (i = 0; i < height; i++) { \ /* save current source average */ \ src_avg = src_reg; \ src += src_stride; \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ MERGE_WITH_SRC(src_avg, src_reg) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ } \ /* x_offset = bilin interpolation and y_offset = 0 */ \ } else { \ if (y_offset == 0) { \ __m256i filter, pw8, src_next_reg; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ 
MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = bilin interpolation and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i filter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ for (i = 0; i < height; i++) { \ src += src_stride; \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ /* average between previous pack to the current */ \ src_pack = _mm256_avg_epu8(src_pack, src_reg); \ MERGE_WITH_SRC(src_pack, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src_pack = src_reg; \ dst += dst_stride; \ } \ /* x_offset = bilin interpolation and y_offset = bilin interpolation \ */ \ } else { \ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ xfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ y_offset <<= 5; \ yfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ MERGE_NEXT_SRC(src_reg, 1) \ \ FILTER_SRC(xfilter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ for (i = 0; i < height; i++) { \ src += src_stride; \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(xfilter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ /* merge previous pack to current pack source */ \ MERGE_WITH_SRC(src_pack, src_reg) \ /* filter the source */ \ FILTER_SRC(yfilter) \ src_pack = src_reg; \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ } \ } \ CALC_SUM_AND_SSE \ _mm256_zeroupper(); \ return sum; \ } \ unsigned int aom_sub_pixel_variance32x##height##_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse) { \ const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ } MAKE_SUB_PIXEL_VAR_32XH(64, 6) MAKE_SUB_PIXEL_VAR_32XH(32, 5) MAKE_SUB_PIXEL_VAR_32XH(16, 4) #define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ unsigned int sse = 0; \ int se = 0; \ for (int i = 0; i < (w / wf); ++i) { \ const uint8_t *src_ptr = src; \ const uint8_t *dst_ptr = dst; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ &sse2); \ dst_ptr += hf * dst_stride; \ src_ptr += hf * src_stride; \ se += se2; \ sse += sse2; \ } \ src += wf; \ dst += wf; \ } \ *sse_ptr = sse; \ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ } // Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height. 
AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7) AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6) AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7) AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6) AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5) #define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \ unsigned int aom_sub_pixel_variance16x##height##_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse) { \ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ __m256i zero_reg; \ int i, sum; \ sum_reg = _mm256_setzero_si256(); \ sse_reg = _mm256_setzero_si256(); \ zero_reg = _mm256_setzero_si256(); \ \ /* x_offset = 0 and y_offset = 0 */ \ if (x_offset == 0) { \ if (y_offset == 0) { \ for (i = 0; i < height; i += 2) { \ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += (src_stride << 1); \ dst += (dst_stride << 1); \ } \ /* x_offset = 0 and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i src_next_reg; \ for (i = 0; i < height; i += 2) { \ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ AVG_NEXT_SRC_INSERT(src_reg, src_stride) \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += (src_stride << 1); \ dst += (dst_stride << 1); \ } \ /* x_offset = 0 and y_offset = bilin interpolation */ \ } else { \ __m256i filter, pw8, src_next_reg; \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i += 2) { \ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ src += (src_stride << 1); \ dst += (dst_stride << 1); \ } \ } \ /* x_offset = 4 and y_offset = 0 */ \ } else if (x_offset == 4) { \ if (y_offset == 0) { \ __m256i src_next_reg; \ for (i = 0; i < height; i += 2) { \ LOAD_SRC_NEXT_BYTE_INSERT \ LOAD_DST_INSERT \ /* average between current and next stride source */ \ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += (src_stride << 1); \ dst += (dst_stride << 1); \ } \ /* x_offset = 4 and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i src_next_reg, src_avg, src_temp; \ /* load and insert source and next row source */ \ LOAD_SRC_NEXT_BYTE_INSERT \ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \ src += src_stride << 1; \ for (i = 0; i < height - 2; i += 2) { \ LOAD_SRC_NEXT_BYTE_INSERT \ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \ src_temp = _mm256_avg_epu8(src_avg, src_temp); \ LOAD_DST_INSERT \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_temp, zero_reg) \ /* save current source average */ \ src_avg = src_next_reg; \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride << 1; \ src += src_stride << 1; \ } \ /* last 2 rows processing happens here */ \ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \ src_next_reg = _mm256_permute2x128_si256( \ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \ LOAD_DST_INSERT \ src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \ MERGE_WITH_SRC(src_avg, 
zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ } else { \ /* x_offset = 4 and y_offset = bilin interpolation */ \ __m256i filter, pw8, src_next_reg, src_avg, src_temp; \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load and insert source and next row source */ \ LOAD_SRC_NEXT_BYTE_INSERT \ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \ src += src_stride << 1; \ for (i = 0; i < height - 2; i += 2) { \ LOAD_SRC_NEXT_BYTE_INSERT \ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \ LOAD_DST_INSERT \ MERGE_WITH_SRC(src_avg, src_temp) \ /* save current source average */ \ src_avg = src_next_reg; \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride << 1; \ src += src_stride << 1; \ } \ /* last 2 rows processing happens here */ \ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \ src_next_reg = _mm256_permute2x128_si256( \ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \ LOAD_DST_INSERT \ MERGE_WITH_SRC(src_avg, src_next_reg) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ } \ /* x_offset = bilin interpolation and y_offset = 0 */ \ } else { \ if (y_offset == 0) { \ __m256i filter, pw8, src_next_reg; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i += 2) { \ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ MERGE_NEXT_SRC_INSERT(src_reg, 1) \ FILTER_SRC(filter) \ CALC_SUM_SSE_INSIDE_LOOP \ src += (src_stride << 1); \ dst += (dst_stride << 1); \ } \ /* x_offset = bilin interpolation and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i filter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load and insert source and next row source */ \ LOAD_SRC_NEXT_BYTE_INSERT \ MERGE_WITH_SRC(src_reg, src_next_reg) \ FILTER_SRC(filter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ src += src_stride << 1; \ for (i = 0; i < height - 2; i += 2) { \ LOAD_SRC_NEXT_BYTE_INSERT \ LOAD_DST_INSERT \ MERGE_WITH_SRC(src_reg, src_next_reg) \ FILTER_SRC(filter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \ /* average between previous pack to the current */ \ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \ MERGE_WITH_SRC(src_pack, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src_pack = src_reg; \ src += src_stride << 1; \ dst += dst_stride << 1; \ } \ /* last 2 rows processing happens here */ \ LOAD_SRC_MERGE_128BIT(filter) \ LOAD_DST_INSERT \ FILTER_SRC_128BIT(filter_128bit) \ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \ src_next_reg = _mm256_permute2x128_si256( \ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \ /* average between previous pack to the current */ \ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \ MERGE_WITH_SRC(src_pack, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ } else { \ /* x_offset = bilin interpolation and y_offset = bilin interpolation \ */ \ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ xfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ 
y_offset <<= 5; \ yfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load and insert source and next row source */ \ LOAD_SRC_NEXT_BYTE_INSERT \ MERGE_WITH_SRC(src_reg, src_next_reg) \ FILTER_SRC(xfilter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ src += src_stride << 1; \ for (i = 0; i < height - 2; i += 2) { \ LOAD_SRC_NEXT_BYTE_INSERT \ LOAD_DST_INSERT \ MERGE_WITH_SRC(src_reg, src_next_reg) \ FILTER_SRC(xfilter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \ /* average between previous pack to the current */ \ MERGE_WITH_SRC(src_pack, src_next_reg) \ /* filter the source */ \ FILTER_SRC(yfilter) \ src_pack = src_reg; \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride << 1; \ dst += dst_stride << 1; \ } \ /* last 2 rows processing happens here */ \ LOAD_SRC_MERGE_128BIT(xfilter) \ LOAD_DST_INSERT \ FILTER_SRC_128BIT(filter_128bit) \ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \ src_next_reg = _mm256_permute2x128_si256( \ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \ MERGE_WITH_SRC(src_pack, src_next_reg) \ FILTER_SRC(yfilter) \ CALC_SUM_SSE_INSIDE_LOOP \ } \ } \ CALC_SUM_AND_SSE \ _mm256_zeroupper(); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \ } MAKE_SUB_PIXEL_VAR_16XH(32, 5) MAKE_SUB_PIXEL_VAR_16XH(16, 4) MAKE_SUB_PIXEL_VAR_16XH(8, 3) #if !CONFIG_REALTIME_ONLY MAKE_SUB_PIXEL_VAR_16XH(64, 6) MAKE_SUB_PIXEL_VAR_16XH(4, 2) #endif #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ static int sub_pixel_avg_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \ unsigned int *sse) { \ __m256i sec_reg; \ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ __m256i zero_reg; \ int i, sum; \ sum_reg = _mm256_setzero_si256(); \ sse_reg = _mm256_setzero_si256(); \ zero_reg = _mm256_setzero_si256(); \ \ /* x_offset = 0 and y_offset = 0 */ \ if (x_offset == 0) { \ if (y_offset == 0) { \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ sec += sec_stride; \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ } else if (y_offset == 4) { \ __m256i src_next_reg; \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, src_stride) \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ sec += sec_stride; \ /* expend each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = 0 and y_offset = bilin interpolation */ \ } else { \ __m256i filter, pw8, src_next_reg; \ \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, src_stride) \ FILTER_SRC(filter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ sec += sec_stride; 
\ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ } \ /* x_offset = 4 and y_offset = 0 */ \ } else if (x_offset == 4) { \ if (y_offset == 0) { \ __m256i src_next_reg; \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ sec += sec_stride; \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_reg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = 4 and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i src_next_reg, src_avg; \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ AVG_NEXT_SRC(src_reg, 1) \ for (i = 0; i < height; i++) { \ /* save current source average */ \ src_avg = src_reg; \ src += src_stride; \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ /* average between previous average to current average */ \ src_avg = _mm256_avg_epu8(src_avg, src_reg); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \ sec += sec_stride; \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_avg, zero_reg) \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ /* x_offset = 4 and y_offset = bilin interpolation */ \ } else { \ __m256i filter, pw8, src_next_reg, src_avg; \ y_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ AVG_NEXT_SRC(src_reg, 1) \ for (i = 0; i < height; i++) { \ /* save current source average */ \ src_avg = src_reg; \ src += src_stride; \ LOAD_SRC_DST \ AVG_NEXT_SRC(src_reg, 1) \ MERGE_WITH_SRC(src_avg, src_reg) \ FILTER_SRC(filter) \ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \ /* expand each byte to 2 bytes */ \ MERGE_WITH_SRC(src_avg, zero_reg) \ sec += sec_stride; \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ } \ /* x_offset = bilin interpolation and y_offset = 0 */ \ } else { \ if (y_offset == 0) { \ __m256i filter, pw8, src_next_reg; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ for (i = 0; i < height; i++) { \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ MERGE_WITH_SRC(src_reg, zero_reg) \ sec += sec_stride; \ CALC_SUM_SSE_INSIDE_LOOP \ src += src_stride; \ dst += dst_stride; \ } \ /* x_offset = bilin interpolation and y_offset = 4 */ \ } else if (y_offset == 4) { \ __m256i filter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ filter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ pw8 = _mm256_set1_epi16(8); \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ for (i = 0; i < height; i++) { \ src += src_stride; \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(filter) \ 
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ /* average between previous pack to the current */ \ src_pack = _mm256_avg_epu8(src_pack, src_reg); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \ sec += sec_stride; \ MERGE_WITH_SRC(src_pack, zero_reg) \ src_pack = src_reg; \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ /* x_offset = bilin interpolation and y_offset = bilin interpolation \ */ \ } else { \ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ x_offset <<= 5; \ xfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ y_offset <<= 5; \ yfilter = _mm256_load_si256( \ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ pw8 = _mm256_set1_epi16(8); \ /* load source and another source starting from the next */ \ /* following byte */ \ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ MERGE_NEXT_SRC(src_reg, 1) \ \ FILTER_SRC(xfilter) \ /* convert each 16 bit to 8 bit to each low and high lane source */ \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ for (i = 0; i < height; i++) { \ src += src_stride; \ LOAD_SRC_DST \ MERGE_NEXT_SRC(src_reg, 1) \ FILTER_SRC(xfilter) \ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ /* merge previous pack to current pack source */ \ MERGE_WITH_SRC(src_pack, src_reg) \ /* filter the source */ \ FILTER_SRC(yfilter) \ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \ MERGE_WITH_SRC(src_pack, zero_reg) \ src_pack = src_reg; \ sec += sec_stride; \ CALC_SUM_SSE_INSIDE_LOOP \ dst += dst_stride; \ } \ } \ } \ CALC_SUM_AND_SSE \ _mm256_zeroupper(); \ return sum; \ } \ unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse, \ const uint8_t *sec_ptr) { \ const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ } MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6) MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5) MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4) #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ const uint8_t *sec) { \ unsigned int sse = 0; \ int se = 0; \ for (int i = 0; i < (w / wf); ++i) { \ const uint8_t *src_ptr = src; \ const uint8_t *dst_ptr = dst; \ const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, &sse2); \ dst_ptr += hf * dst_stride; \ src_ptr += hf * src_stride; \ sec_ptr += hf * w; \ se += se2; \ sse += sse2; \ } \ src += wf; \ dst += wf; \ sec += wf; \ } \ *sse_ptr = sse; \ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ } // Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height. 
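//
// Editor's note (not part of the original source): the *_avg_* kernels above
// differ from the plain variance kernels only in that the (optionally
// sub-pixel filtered) prediction is first blended with a second predictor
// `sec` using _mm256_avg_epu8, i.e. a per-byte rounding average
// (a + b + 1) >> 1, before the difference against `dst` is measured. The
// second predictor is a contiguous w x h buffer (stride w), which is why the
// wrapper advances sec_ptr by hf * w per tile row and by wf per tile column.
// A scalar model of that comp-avg step, for reference only:
#if 0 /* illustrative reference, editor-added; scalar_comp_avg is hypothetical */
static void scalar_comp_avg(uint8_t *comp, const uint8_t *pred,
                            const uint8_t *sec, int n) {
  for (int i = 0; i < n; ++i) {
    // Same rounding behaviour as _mm256_avg_epu8 on each byte.
    comp[i] = (uint8_t)((pred[i] + sec[i] + 1) >> 1);
  }
}
#endif
// The AOM_SUB_PIXEL_AVG_VAR_AVX2 instantiations below tile the 128- and
// 64-wide sizes from this 32-wide helper.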
AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7) AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6) AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7) AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6) AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5) aom-3.12.1/aom_dsp/x86/variance_impl_ssse3.c000066400000000000000000000110741477627663500205500ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/variance_impl_ssse3.h" void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow // in computation using _mm_maddubs_epi16. // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; const __m128i r = _mm_set1_epi16(round); const int8_t f0 = (int8_t)(filter[0] >> 1); const int8_t f1 = (int8_t)(filter[1] >> 1); const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, f0, f1); unsigned int i, j; (void)pixel_step; if (output_width >= 8) { for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 8) { // load source __m128i source_low = xx_loadl_64(a); __m128i source_hi = xx_loadl_64(a + 1); // unpack to: // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } __m128i source = _mm_unpacklo_epi8(source_low, source_hi); // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] __m128i res = _mm_maddubs_epi16(source, filters); // round res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); xx_storeu_128(b, res); a += 8; b += 8; } a += src_pixels_per_line - output_width; } } else { const __m128i shuffle_mask = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); for (i = 0; i < output_height; ++i) { // load source, only first 5 values are meaningful: // { a[0], a[1], a[2], a[3], a[4], xxxx } __m128i source = xx_loadl_64(a); // shuffle, up to the first 8 are useful // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); __m128i res = _mm_maddubs_epi16(source_shuffle, filters); res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); xx_storel_64(b, res); a += src_pixels_per_line; b += output_width; } } } void aom_var_filter_block2d_bil_second_pass_ssse3( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { const int16_t round = (1 << FILTER_BITS) >> 1; const __m128i r = _mm_set1_epi32(round); const __m128i filters = _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], filter[1], filter[0], filter[1]); const __m128i shuffle_mask = 
_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); const __m128i mask = _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 4) { // load source as: // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } __m128i source1 = xx_loadl_64(a); __m128i source2 = xx_loadl_64(a + pixel_step); __m128i source = _mm_unpacklo_epi64(source1, source2); // shuffle source to: // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); // b[i] = a[i] * filter[0] + a[w + i] * filter[1] __m128i res = _mm_madd_epi16(source_shuffle, filters); // round res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); // shuffle to get each lower 8 bit of every 32 bit res = _mm_shuffle_epi8(res, mask); xx_storel_32(b, res); a += 4; b += 4; } a += src_pixels_per_line - output_width; } } aom-3.12.1/aom_dsp/x86/variance_impl_ssse3.h000066400000000000000000000022121477627663500205470ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ #define AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ #include void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); void aom_var_filter_block2d_bil_second_pass_ssse3( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); #endif // AOM_AOM_DSP_X86_VARIANCE_IMPL_SSSE3_H_ aom-3.12.1/aom_dsp/x86/variance_sse2.c000066400000000000000000000577711477627663500173610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // SSE2 #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #if !CONFIG_REALTIME_ONLY unsigned int aom_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { const __m128i v = xx_loadu_128(src); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); src += 8; } vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); return (unsigned int)_mm_cvtsi128_si32(vsum); } #endif // !CONFIG_REALTIME_ONLY static inline __m128i load4x2_sse2(const uint8_t *const p, const int stride) { const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); } static inline __m128i load8_8to16_sse2(const uint8_t *const p) { const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); } static inline void load16_8to16_sse2(const uint8_t *const p, __m128i *out) { const __m128i p0 = _mm_loadu_si128((const __m128i *)p); out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); // lower 8 values out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); // upper 8 values } // Accumulate 4 32bit numbers in val to 1 32bit number static inline unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); return (unsigned int)_mm_cvtsi128_si32(val); } // Accumulate 8 16bit in sum to 4 32bit number static inline __m128i sum_to_32bit_sse2(const __m128i sum) { const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); return _mm_add_epi32(sum_lo, sum_hi); } static inline void variance_kernel_sse2(const __m128i src, const __m128i ref, __m128i *const sse, __m128i *const sum) { const __m128i diff = _mm_sub_epi16(src, ref); *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); *sum = _mm_add_epi16(*sum, diff); } // Can handle 128 pixels' diff sum (such as 8x16 or 16x8) // Slightly faster than variance_final_256_pel_sse2() // diff sum of 128 pixels can still fit in 16bit integer static inline void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); } // Can handle 256 pixels' diff sum (such as 16x16) static inline void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); *sum += (int16_t)_mm_extract_epi16(vsum, 1); } // Can handle 512 pixels' diff sum (such as 16x32 or 32x16) static inline void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_unpacklo_epi16(vsum, vsum); vsum = _mm_srai_epi32(vsum, 16); *sum = (int)add32x4_sse2(vsum); } // Can handle 1024 pixels' diff sum (such as 32x32) static inline void 
variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, unsigned int *const sse, int *const sum) { *sse = add32x4_sse2(vsse); vsum = sum_to_32bit_sse2(vsum); *sum = (int)add32x4_sse2(vsum); } static inline void variance4_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 256); // May overflow for larger height. *sum = _mm_setzero_si128(); for (int i = 0; i < h; i += 2) { const __m128i s = load4x2_sse2(src, src_stride); const __m128i r = load4x2_sse2(ref, ref_stride); variance_kernel_sse2(s, r, sse, sum); src += 2 * src_stride; ref += 2 * ref_stride; } } static inline void variance8_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 128); // May overflow for larger height. *sum = _mm_setzero_si128(); *sse = _mm_setzero_si128(); for (int i = 0; i < h; i++) { const __m128i s = load8_8to16_sse2(src); const __m128i r = load8_8to16_sse2(ref); variance_kernel_sse2(s, r, sse, sum); src += src_stride; ref += ref_stride; } } static inline void variance16_kernel_sse2(const uint8_t *const src, const uint8_t *const ref, __m128i *const sse, __m128i *const sum) { const __m128i zero = _mm_setzero_si128(); const __m128i s = _mm_loadu_si128((const __m128i *)src); const __m128i r = _mm_loadu_si128((const __m128i *)ref); const __m128i src0 = _mm_unpacklo_epi8(s, zero); const __m128i ref0 = _mm_unpacklo_epi8(r, zero); const __m128i src1 = _mm_unpackhi_epi8(s, zero); const __m128i ref1 = _mm_unpackhi_epi8(r, zero); variance_kernel_sse2(src0, ref0, sse, sum); variance_kernel_sse2(src1, ref1, sse, sum); } static inline void variance16_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 64); // May overflow for larger height. *sum = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { variance16_kernel_sse2(src, ref, sse, sum); src += src_stride; ref += ref_stride; } } static inline void variance32_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 32); // May overflow for larger height. // Don't initialize sse here since it's an accumulation. *sum = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { variance16_kernel_sse2(src + 0, ref + 0, sse, sum); variance16_kernel_sse2(src + 16, ref + 16, sse, sum); src += src_stride; ref += ref_stride; } } static inline void variance64_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 16); // May overflow for larger height. *sum = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { variance16_kernel_sse2(src + 0, ref + 0, sse, sum); variance16_kernel_sse2(src + 16, ref + 16, sse, sum); variance16_kernel_sse2(src + 32, ref + 32, sse, sum); variance16_kernel_sse2(src + 48, ref + 48, sse, sum); src += src_stride; ref += ref_stride; } } static inline void variance128_sse2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { assert(h <= 8); // May overflow for larger height. 
*sum = _mm_setzero_si128(); for (int i = 0; i < h; ++i) { for (int j = 0; j < 4; ++j) { const int offset0 = j << 5; const int offset1 = offset0 + 16; variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); } src += src_stride; ref += ref_stride; } } void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { // Loop over 4 8x8 blocks. Process one 8x32 block. for (int k = 0; k < 4; k++) { const uint8_t *src = src_ptr; const uint8_t *ref = ref_ptr; __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); for (int i = 0; i < 8; i++) { const __m128i s = load8_8to16_sse2(src + (k * 8)); const __m128i r = load8_8to16_sse2(ref + (k * 8)); const __m128i diff = _mm_sub_epi16(s, r); vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff, diff)); vsum = _mm_add_epi16(vsum, diff); src += src_stride; ref += ref_stride; } variance_final_128_pel_sse2(vsse, vsum, &sse8x8[k], &sum8x8[k]); } // Calculate variance at 8x8 level and total sse, sum of 8x32 block. *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; for (int i = 0; i < 4; i++) var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); } void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { int sum16x16[2] = { 0 }; // Loop over 2 16x16 blocks. Process one 16x32 block. for (int k = 0; k < 2; k++) { const uint8_t *src = src_ptr; const uint8_t *ref = ref_ptr; __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); for (int i = 0; i < 16; i++) { __m128i s[2]; __m128i r[2]; load16_8to16_sse2(src + (k * 16), s); load16_8to16_sse2(ref + (k * 16), r); const __m128i diff0 = _mm_sub_epi16(s[0], r[0]); const __m128i diff1 = _mm_sub_epi16(s[1], r[1]); vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); vsum = _mm_add_epi16(vsum, _mm_add_epi16(diff0, diff1)); src += src_stride; ref += ref_stride; } variance_final_256_pel_sse2(vsse, vsum, &sse16x16[k], &sum16x16[k]); } // Calculate variance at 16x16 level and total sse, sum of 16x32 block. 
*tot_sse += sse16x16[0] + sse16x16[1]; *tot_sum += sum16x16[0] + sum16x16[1]; for (int i = 0; i < 2; i++) var16x16[i] = sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); } #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ unsigned int aom_variance##bw##x##bh##_sse2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ __m128i vsse = _mm_setzero_si128(); \ __m128i vsum; \ int sum = 0; \ variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ assert(sum <= 255 * bw * bh); \ assert(sum >= -255 * bw * bh); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ } AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128) AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128) AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128) AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128) AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128) AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128) AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256) AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512) AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512) AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024) #if !CONFIG_REALTIME_ONLY AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128) AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128) AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256) AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256) AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024) #endif #define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ unsigned int aom_variance##bw##x##bh##_sse2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ unsigned int *sse) { \ __m128i vsse = _mm_setzero_si128(); \ __m128i vsum = _mm_setzero_si128(); \ for (int i = 0; i < (bh / uh); ++i) { \ __m128i vsum16; \ variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ &vsum16); \ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ src += (src_stride * uh); \ ref += (ref_stride * uh); \ } \ *sse = add32x4_sse2(vsse); \ int sum = (int)add32x4_sse2(vsum); \ assert(sum <= 255 * bw * bh); \ assert(sum >= -255 * bw * bh); \ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ } AOM_VAR_LOOP_SSE2(32, 64, 11, 32) // 32x32 * ( 64/32 ) AOM_VAR_LOOP_SSE2(64, 32, 11, 16) // 64x16 * ( 32/16 ) AOM_VAR_LOOP_SSE2(64, 64, 12, 16) // 64x16 * ( 64/16 ) AOM_VAR_LOOP_SSE2(64, 128, 13, 16) // 64x16 * ( 128/16 ) AOM_VAR_LOOP_SSE2(128, 64, 13, 8) // 128x8 * ( 64/8 ) AOM_VAR_LOOP_SSE2(128, 128, 14, 8) // 128x8 * ( 128/8 ) #if !CONFIG_REALTIME_ONLY AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024) #endif unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); return *sse; } unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); return *sse; } unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); return *sse; } unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); return *sse; } #if CONFIG_AV1_HIGHBITDEPTH static inline __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, const __m128i s1, const __m128i a) { const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i a_inv = 
_mm_sub_epi16(alpha_max, a); const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i comp = _mm_packs_epi32(pred_l, pred_h); return comp; } void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; const int stride1 = invert_mask ? ref_stride : width; const __m128i zero = _mm_setzero_si128(); if (width == 8) { do { const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); _mm_storeu_si128((__m128i *)comp_pred, comp); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } else if (width == 16) { do { const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); _mm_storeu_si128((__m128i *)comp_pred, comp); _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); src0 += stride0; src1 += stride1; mask += mask_stride; comp_pred += width; i += 1; } while (i < height); } else { do { for (int x = 0; x < width; x += 32) { for (int j = 0; j < 2; j++) { const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + x + j * 16)); const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16)); const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + x + j * 16)); const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16)); const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + x + j * 16)); const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); } comp_pred += 32; } src0 += stride0; src1 += stride1; mask += mask_stride; i += 1; } while (i < height); } } #endif // CONFIG_AV1_HIGHBITDEPTH static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t 
sum = 0; __m128i dst0_8x8, dst1_8x8, dst_16x8; __m128i src0_16x4, src1_16x4, src_16x8; __m128i res0_32x4, res0_64x2, res1_64x2; __m128i sub_result_16x8; const __m128i zeros = _mm_setzero_si128(); __m128i square_result = _mm_setzero_si128(); for (int i = 0; i < h; i += 2) { dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride])); dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride])); dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros); src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4); sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8); res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8); res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros); res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros); square_result = _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2)); } const __m128i sum_64x1 = _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); xx_storel_64(&sum, sum_64x1); return sum; } static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i dst_8x8, dst_16x8; __m128i src_16x8; __m128i res0_32x4, res0_64x2, res1_64x2; __m128i sub_result_16x8; const __m128i zeros = _mm_setzero_si128(); __m128i square_result = _mm_setzero_si128(); for (int i = 0; i < h; i++) { dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros); src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]); sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8); res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8); res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros); res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros); square_result = _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2)); } const __m128i sum_64x1 = _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); xx_storel_64(&sum, sum_64x1); return sum; } uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, int w, int h) { assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); const int num_blks = 16 / w; uint64_t sum = 0; for (int i = 0; i < num_blks; i++) { sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h); dst += w; src += (w * h); } return sum; } aom-3.12.1/aom_dsp/x86/variance_ssse3.c000066400000000000000000000250531477627663500175310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm #define DECL(w, opt) \ int aom_sub_pixel_variance##w##xh_##opt( \ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ void *unused0, void *unused) #define DECLS(opt) \ DECL(4, opt); \ DECL(8, opt); \ DECL(16, opt) DECLS(ssse3); #undef DECLS #undef DECL #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ /*Avoid overflow in helper by capping height.*/ \ const int hf = AOMMIN(h, 64); \ unsigned int sse = 0; \ int se = 0; \ for (int i = 0; i < (w / wf); ++i) { \ const uint8_t *src_ptr = src; \ const uint8_t *dst_ptr = dst; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ &sse2, NULL, NULL); \ dst_ptr += hf * dst_stride; \ src_ptr += hf * src_stride; \ se += se2; \ sse += sse2; \ } \ src += wf; \ dst += wf; \ } \ *sse_ptr = sse; \ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #if !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) #else #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) #endif FNS(ssse3) #undef FNS #undef FN // The 2 unused parameters are place holders for PIC enabled build. 
#define DECL(w, opt) \ int aom_sub_pixel_avg_variance##w##xh_##opt( \ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ void *unused) #define DECLS(opt) \ DECL(4, opt); \ DECL(8, opt); \ DECL(16, opt) DECLS(ssse3); #undef DECL #undef DECLS #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ const uint8_t *sec) { \ /*Avoid overflow in helper by capping height.*/ \ const int hf = AOMMIN(h, 64); \ unsigned int sse = 0; \ int se = 0; \ for (int i = 0; i < (w / wf); ++i) { \ const uint8_t *src_ptr = src; \ const uint8_t *dst_ptr = dst; \ const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, hf, &sse2, NULL, NULL); \ dst_ptr += hf * dst_stride; \ src_ptr += hf * src_stride; \ sec_ptr += hf * w; \ se += se2; \ sse += sse2; \ } \ src += wf; \ dst += wf; \ sec += wf; \ } \ *sse_ptr = sse; \ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #if !CONFIG_REALTIME_ONLY #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) #else #define FNS(opt) \ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) #endif FNS(ssse3) #undef FNS #undef FN 
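//
// Editor's note (not part of the original source): the cast_prod / cast
// arguments to FN() pick the narrowest integer type in which se * se is known
// not to overflow, given |se| <= 255 * w * h for a w x h block. For example,
// in the plain variance table above:
//   16x8 : |se| <= 255 * 128 = 32640, se^2 <= 1065369600, fits int32_t;
//   16x16: |se| <= 255 * 256 = 65280, se^2 <= 4261478400, exceeds INT32_MAX
//          but fits uint32_t, so the product is formed as (int64_t) and then
//          truncated to uint32_t before the shift;
//   32x32 and larger blocks use int64_t throughout.
// A self-contained sanity check of those bounds (editor-added sketch, assumes
// only the standard headers named below):
#if 0 /* illustrative only */
#include <assert.h>
#include <stdint.h>
static void check_se_square_bounds(void) {
  const int64_t se_max_16x8 = 255LL * 16 * 8;    // 32640
  const int64_t se_max_16x16 = 255LL * 16 * 16;  // 65280
  assert(se_max_16x8 * se_max_16x8 == 1065369600LL);   // <= INT32_MAX
  assert(se_max_16x16 * se_max_16x16 == 4261478400LL); // <= UINT32_MAX
}
#endif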
aom-3.12.1/aom_mem/000077500000000000000000000000001477627663500140135ustar00rootroot00000000000000aom-3.12.1/aom_mem/aom_mem.c000066400000000000000000000053231477627663500155740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_mem.h" #include #include #include #include "include/aom_mem_intrnl.h" #include "aom/aom_integer.h" static size_t GetAllocationPaddingSize(size_t align) { assert(align > 0); assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE); return align - 1 + ADDRESS_STORAGE_SIZE; } // Returns 0 in case of overflow of nmemb * size. static int check_size_argument_overflow(size_t nmemb, size_t size, size_t align) { if (nmemb == 0) return 1; const size_t alloc_padding = GetAllocationPaddingSize(align); #if defined(AOM_MAX_ALLOCABLE_MEMORY) assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding); assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX); if (size > (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb) return 0; #else if (size > (SIZE_MAX - alloc_padding) / nmemb) return 0; #endif return 1; } static size_t *GetMallocAddressLocation(void *const mem) { return ((size_t *)mem) - 1; } static void SetActualMallocAddress(void *const mem, const void *const malloc_addr) { size_t *const malloc_addr_location = GetMallocAddressLocation(mem); *malloc_addr_location = (size_t)malloc_addr; } static void *GetActualMallocAddress(void *const mem) { const size_t *const malloc_addr_location = GetMallocAddressLocation(mem); return (void *)(*malloc_addr_location); } void *aom_memalign(size_t align, size_t size) { void *x = NULL; if (!check_size_argument_overflow(1, size, align)) return NULL; const size_t aligned_size = size + GetAllocationPaddingSize(align); void *const addr = malloc(aligned_size); if (addr) { x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align); SetActualMallocAddress(x, addr); } return x; } void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); } void *aom_calloc(size_t num, size_t size) { if (!check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT)) return NULL; const size_t total_size = num * size; void *const x = aom_malloc(total_size); if (x) memset(x, 0, total_size); return x; } void aom_free(void *memblk) { if (memblk) { void *addr = GetActualMallocAddress(memblk); free(addr); } } aom-3.12.1/aom_mem/aom_mem.cmake000066400000000000000000000023301477627663500164250ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# if(AOM_AOM_MEM_AOM_MEM_CMAKE_) return() endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_ set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1) list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c" "${AOM_ROOT}/aom_mem/aom_mem.h" "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h") # Creates the aom_mem build target and makes libaom depend on it. The libaom # target must exist before this function is called. function(setup_aom_mem_targets) add_library(aom_mem OBJECT ${AOM_MEM_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endfunction() aom-3.12.1/aom_mem/aom_mem.h000066400000000000000000000051301477627663500155750ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_MEM_AOM_MEM_H_ #define AOM_AOM_MEM_AOM_MEM_H_ #include "aom/aom_integer.h" #include "config/aom_config.h" #if defined(__uClinux__) #include #endif #if defined(__cplusplus) extern "C" { #endif #ifndef AOM_MAX_ALLOCABLE_MEMORY #if SIZE_MAX > (1ULL << 32) #define AOM_MAX_ALLOCABLE_MEMORY 8589934592 // 8 GB #else // For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. #define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) #endif #endif void *aom_memalign(size_t align, size_t size); void *aom_malloc(size_t size); void *aom_calloc(size_t num, size_t size); void aom_free(void *memblk); static inline void *aom_memset16(void *dest, int val, size_t length) { size_t i; uint16_t *dest16 = (uint16_t *)dest; for (i = 0; i < length; i++) *dest16++ = val; return dest; } /*returns an addr aligned to the byte boundary specified by align*/ #define aom_align_addr(addr, align) \ (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1)) #include #ifdef AOM_MEM_PLTFRM #include AOM_MEM_PLTFRM #endif #if CONFIG_DEBUG #define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \ do { \ lval = (expr); \ if (!lval) \ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ } while (0) #else #define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \ do { \ lval = (expr); \ if (!lval) \ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) #endif #if defined(__cplusplus) } #endif #endif // AOM_AOM_MEM_AOM_MEM_H_ aom-3.12.1/aom_mem/include/000077500000000000000000000000001477627663500154365ustar00rootroot00000000000000aom-3.12.1/aom_mem/include/aom_mem_intrnl.h000066400000000000000000000017431477627663500206140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ #define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ #include "config/aom_config.h" #define ADDRESS_STORAGE_SIZE sizeof(size_t) #ifndef DEFAULT_ALIGNMENT #if defined(VXWORKS) /*default addr alignment to use in calls to aom_* functions other than aom_memalign*/ #define DEFAULT_ALIGNMENT 32 #else #define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */ #endif #endif #endif // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ aom-3.12.1/aom_ports/000077500000000000000000000000001477627663500144045ustar00rootroot00000000000000aom-3.12.1/aom_ports/aarch32_cpudetect.c000066400000000000000000000050221477627663500200320ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Feature detection code for Armv7-A / AArch32. #include "arm_cpudetect.h" #if !CONFIG_RUNTIME_CPU_DETECT static int arm_get_cpu_caps(void) { // This function should actually be a no-op. There is no way to adjust any of // these because the RTCD tables do not exist: the functions are called // statically. int flags = 0; #if HAVE_NEON flags |= HAS_NEON; #endif // HAVE_NEON return flags; } #elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT static int arm_get_cpu_caps(void) { int flags = 0; #if HAVE_NEON // MSVC has no inline __asm support for Arm, but it does let you __emit // instructions via their assembled hex code. // All of these instructions should be essentially nops. __try { // VORR q0,q0,q0 __emit(0xF2200150); flags |= HAS_NEON; } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { // Ignore exception. } #endif // HAVE_NEON return flags; } #elif defined(AOM_USE_ANDROID_CPU_FEATURES) static int arm_get_cpu_caps(void) { int flags = 0; #if HAVE_NEON uint64_t features = android_getCpuFeatures(); if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; #endif // HAVE_NEON return flags; } #elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) #include // Define hwcap values ourselves: building with an old auxv header where these // hwcap values are not defined should not prevent features from being enabled. #define AOM_AARCH32_HWCAP_NEON (1 << 12) static int arm_get_cpu_caps(void) { int flags = 0; unsigned long hwcap = getauxval(AT_HWCAP); #if HAVE_NEON if (hwcap & AOM_AARCH32_HWCAP_NEON) flags |= HAS_NEON; #endif // HAVE_NEON return flags; } #else // end __linux__ #error \ "Runtime CPU detection selected, but no CPU detection method " \ "available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0." #endif int aom_arm_cpu_caps(void) { int flags = 0; if (arm_cpu_env_flags(&flags)) { return flags; } return arm_get_cpu_caps() & arm_cpu_env_mask(); } aom-3.12.1/aom_ports/aarch64_cpudetect.c000066400000000000000000000166631477627663500200540ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "arm_cpudetect.h" #include "aom_ports/arm.h" #if defined(__APPLE__) #include #endif #if !CONFIG_RUNTIME_CPU_DETECT static int arm_get_cpu_caps(void) { // This function should actually be a no-op. There is no way to adjust any of // these because the RTCD tables do not exist: the functions are called // statically. int flags = 0; #if HAVE_NEON flags |= HAS_NEON; #endif // HAVE_NEON return flags; } #elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT // sysctlbyname() parameter documentation for instruction set characteristics: // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics static inline bool have_feature(const char *feature) { int64_t feature_present = 0; size_t size = sizeof(feature_present); if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { return false; } return feature_present; } static int arm_get_cpu_caps(void) { int flags = 0; #if HAVE_NEON flags |= HAS_NEON; #endif // HAVE_NEON #if HAVE_ARM_CRC32 if (have_feature("hw.optional.armv8_crc32")) flags |= HAS_ARM_CRC32; #endif // HAVE_ARM_CRC32 #if HAVE_NEON_DOTPROD if (have_feature("hw.optional.arm.FEAT_DotProd")) flags |= HAS_NEON_DOTPROD; #endif // HAVE_NEON_DOTPROD #if HAVE_NEON_I8MM if (have_feature("hw.optional.arm.FEAT_I8MM")) flags |= HAS_NEON_I8MM; #endif // HAVE_NEON_I8MM return flags; } #elif defined(_WIN32) // end __APPLE__ static int arm_get_cpu_caps(void) { int flags = 0; // IsProcessorFeaturePresent() parameter documentation: // https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters #if HAVE_NEON flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. #endif // HAVE_NEON #if HAVE_ARM_CRC32 if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) { flags |= HAS_ARM_CRC32; } #endif // HAVE_ARM_CRC32 #if HAVE_NEON_DOTPROD // Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK // 20348, supported by Windows 11 and Windows Server 2022. #if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { flags |= HAS_NEON_DOTPROD; } #endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) #endif // HAVE_NEON_DOTPROD #if HAVE_NEON_I8MM // Support for PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE was added in Windows SDK // 26100. #if defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE) // There's no PF_* flag that indicates whether plain I8MM is available // or not. But if SVE_I8MM is available, that also implies that // regular I8MM is available. if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)) { flags |= HAS_NEON_I8MM; } #endif // defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE) #endif // HAVE_NEON_I8MM #if HAVE_SVE // Support for PF_ARM_SVE_INSTRUCTIONS_AVAILABLE was added in Windows SDK 26100. 
#if defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
  if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
    flags |= HAS_SVE;
  }
#endif  // defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)
#endif  // HAVE_SVE
#if HAVE_SVE2
  // Support for PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE was added in Windows SDK
  // 26100.
#if defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
  if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) {
    flags |= HAS_SVE2;
  }
#endif  // defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)
#endif  // HAVE_SVE2
  return flags;
}

#elif defined(AOM_USE_ANDROID_CPU_FEATURES)

static int arm_get_cpu_caps(void) {
  int flags = 0;
#if HAVE_NEON
  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
#endif  // HAVE_NEON
  return flags;
}

#elif defined(__linux__)  // end defined(AOM_USE_ANDROID_CPU_FEATURES)

#include <sys/auxv.h>

// Define hwcap values ourselves: building with an old auxv header where these
// hwcap values are not defined should not prevent features from being enabled.
#define AOM_AARCH64_HWCAP_CRC32 (1 << 7)
#define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20)
#define AOM_AARCH64_HWCAP_SVE (1 << 22)
#define AOM_AARCH64_HWCAP2_SVE2 (1 << 1)
#define AOM_AARCH64_HWCAP2_I8MM (1 << 13)

static int arm_get_cpu_caps(void) {
  int flags = 0;
#if HAVE_ARM_CRC32 || HAVE_NEON_DOTPROD || HAVE_SVE
  unsigned long hwcap = getauxval(AT_HWCAP);
#endif
#if HAVE_NEON_I8MM || HAVE_SVE2
  unsigned long hwcap2 = getauxval(AT_HWCAP2);
#endif
#if HAVE_NEON
  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
#endif  // HAVE_NEON
#if HAVE_ARM_CRC32
  if (hwcap & AOM_AARCH64_HWCAP_CRC32) flags |= HAS_ARM_CRC32;
#endif  // HAVE_ARM_CRC32
#if HAVE_NEON_DOTPROD
  if (hwcap & AOM_AARCH64_HWCAP_ASIMDDP) flags |= HAS_NEON_DOTPROD;
#endif  // HAVE_NEON_DOTPROD
#if HAVE_NEON_I8MM
  if (hwcap2 & AOM_AARCH64_HWCAP2_I8MM) flags |= HAS_NEON_I8MM;
#endif  // HAVE_NEON_I8MM
#if HAVE_SVE
  if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE;
#endif  // HAVE_SVE
#if HAVE_SVE2
  if (hwcap2 & AOM_AARCH64_HWCAP2_SVE2) flags |= HAS_SVE2;
#endif  // HAVE_SVE2
  return flags;
}

#elif defined(__Fuchsia__)  // end __linux__

#include <zircon/features.h>
#include <zircon/syscalls.h>

// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282.
#ifndef ZX_ARM64_FEATURE_ISA_I8MM
#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19))
#endif
// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083.
#ifndef ZX_ARM64_FEATURE_ISA_SVE
#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20))
#endif

static int arm_get_cpu_caps(void) {
  int flags = 0;
#if HAVE_NEON
  flags |= HAS_NEON;  // Neon is mandatory in Armv8.0-A.
#endif  // HAVE_NEON
  uint32_t features;
  zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features);
  if (status != ZX_OK) return flags;
#if HAVE_ARM_CRC32
  if (features & ZX_ARM64_FEATURE_ISA_CRC32) flags |= HAS_ARM_CRC32;
#endif  // HAVE_ARM_CRC32
#if HAVE_NEON_DOTPROD
  if (features & ZX_ARM64_FEATURE_ISA_DP) flags |= HAS_NEON_DOTPROD;
#endif  // HAVE_NEON_DOTPROD
#if HAVE_NEON_I8MM
  if (features & ZX_ARM64_FEATURE_ISA_I8MM) flags |= HAS_NEON_I8MM;
#endif  // HAVE_NEON_I8MM
#if HAVE_SVE
  if (features & ZX_ARM64_FEATURE_ISA_SVE) flags |= HAS_SVE;
#endif  // HAVE_SVE
  return flags;
}

#else  // end __Fuchsia__
#error \
    "Runtime CPU detection selected, but no CPU detection method " \
    "available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0."
#endif

int aom_arm_cpu_caps(void) {
  int flags = 0;
  if (!arm_cpu_env_flags(&flags)) {
    flags = arm_get_cpu_caps() & arm_cpu_env_mask();
  }

  // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available.
if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_NEON_I8MM; // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available. if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE; if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE; // Restrict flags: SVE2 assumes that FEAT_SVE is available. if (!(flags & HAS_SVE)) flags &= ~HAS_SVE2; return flags; } aom-3.12.1/aom_ports/aom_once.h000066400000000000000000000043001477627663500163320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_AOM_ONCE_H_ #define AOM_AOM_PORTS_AOM_ONCE_H_ #include "config/aom_config.h" /* Implement a function wrapper to guarantee initialization * thread-safety for library singletons. * * NOTE: This function uses static locks, and can only be * used with one common argument per compilation unit. So * * file1.c: * aom_once(foo); * ... * aom_once(foo); * * file2.c: * aom_once(bar); * * will ensure foo() and bar() are each called only once, but in * * file1.c: * aom_once(foo); * aom_once(bar): * * bar() will never be called because the lock is used up * by the call to foo(). */ #if CONFIG_MULTITHREAD && defined(_WIN32) #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #include /* Declare a per-compilation-unit state variable to track the progress * of calling func() only once. This must be at global scope because * local initializers are not thread-safe in MSVC prior to Visual * Studio 2015. */ static INIT_ONCE aom_init_once = INIT_ONCE_STATIC_INIT; static void aom_once(void (*func)(void)) { BOOL pending; InitOnceBeginInitialize(&aom_init_once, 0, &pending, NULL); if (!pending) { // Initialization has already completed. return; } func(); InitOnceComplete(&aom_init_once, 0, NULL); } #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include static void aom_once(void (*func)(void)) { static pthread_once_t lock = PTHREAD_ONCE_INIT; pthread_once(&lock, func); } #else /* Default version that performs no synchronization. */ static void aom_once(void (*func)(void)) { static volatile int done; if (!done) { func(); done = 1; } } #endif #endif // AOM_AOM_PORTS_AOM_ONCE_H_ aom-3.12.1/aom_ports/aom_ports.cmake000066400000000000000000000077151477627663500174230ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_) return() endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_ set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1) list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h" "${AOM_ROOT}/aom_ports/aom_timer.h" "${AOM_ROOT}/aom_ports/bitops.h" "${AOM_ROOT}/aom_ports/emmintrin_compat.h" "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h" "${AOM_ROOT}/aom_ports/mem_ops_aligned.h" "${AOM_ROOT}/aom_ports/sanitizer.h") list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm") list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm") list(APPEND AOM_PORTS_SOURCES_AARCH32 "${AOM_ROOT}/aom_ports/aarch32_cpudetect.c") list(APPEND AOM_PORTS_SOURCES_AARCH64 "${AOM_ROOT}/aom_ports/aarch64_cpudetect.c") if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK) include_directories(${ANDROID_NDK}/sources/android/cpufeatures) list(APPEND AOM_PORTS_SOURCES_ARM "${ANDROID_NDK}/sources/android/cpufeatures/cpu-features.c") endif() list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h" "${AOM_ROOT}/aom_ports/ppc_cpudetect.c") list(APPEND AOM_PORTS_SOURCES_RISCV "${AOM_ROOT}/aom_ports/riscv.h" "${AOM_ROOT}/aom_ports/riscv_cpudetect.c") # For arm and x86 targets: # # * Creates the aom_ports build target, adds the includes in aom_ports to the # target, and makes libaom depend on it. # # Otherwise: # # * Adds the includes in aom_ports to the libaom target. # # For all target platforms: # # * The libaom target must exist before this function is called. function(setup_aom_ports_targets) if(XCODE AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") add_asm_library("aom_ports" "AOM_PORTS_ASM_X86") # Xcode is the only one set(aom_ports_is_embedded 1) set(aom_ports_has_symbols 1) elseif(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") add_asm_library("aom_ports" "AOM_PORTS_ASM_X86") set(aom_ports_has_symbols 1) elseif("${AOM_TARGET_CPU}" STREQUAL "arm64") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH64}) set(aom_ports_has_symbols 1) elseif("${AOM_TARGET_CPU}" MATCHES "arm") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH32}) set(aom_ports_has_symbols 1) elseif("${AOM_TARGET_CPU}" MATCHES "ppc") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC}) set(aom_ports_has_symbols 1) elseif("${AOM_TARGET_CPU}" MATCHES "riscv") add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_RISCV}) set(aom_ports_has_symbols 1) endif() if("${AOM_TARGET_CPU}" MATCHES "arm|ppc|riscv") target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endif() # Note AOM_PORTS_INCLUDES_X86 are not added to the aom_ports, aom or # aom_static targets to avoid compilation issues in projects that enable ASM # language support in project(). These sources were never included in # libaom_srcs.*; if it becomes necessary for a particular generator another # method should be used. if(aom_ports_has_symbols) if(NOT aom_ports_is_embedded) target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES}) endif() set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) else() target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES}) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES}) endif() endif() endfunction() aom-3.12.1/aom_ports/aom_timer.h000066400000000000000000000054251477627663500165370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_AOM_TIMER_H_ #define AOM_AOM_PORTS_AOM_TIMER_H_ #include "config/aom_config.h" #if CONFIG_OS_SUPPORT #include #include #if defined(_WIN32) /* * Win32 specific includes */ #undef NOMINMAX #define NOMINMAX #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #include #else /* * POSIX specific includes */ #include /* timersub is not provided by msys at this time. */ #ifndef timersub #define timersub(a, b, result) \ do { \ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ if ((result)->tv_usec < 0) { \ --(result)->tv_sec; \ (result)->tv_usec += 1000000; \ } \ } while (0) #endif #endif struct aom_usec_timer { #if defined(_WIN32) LARGE_INTEGER begin, end; #else struct timeval begin, end; #endif }; static inline void aom_usec_timer_start(struct aom_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); #else gettimeofday(&t->begin, NULL); #endif } static inline void aom_usec_timer_mark(struct aom_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); #else gettimeofday(&t->end, NULL); #endif } static inline int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) { #if defined(_WIN32) LARGE_INTEGER freq, diff; diff.QuadPart = t->end.QuadPart - t->begin.QuadPart; QueryPerformanceFrequency(&freq); return diff.QuadPart * 1000000 / freq.QuadPart; #else struct timeval diff; timersub(&t->end, &t->begin, &diff); return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec; #endif } #else /* CONFIG_OS_SUPPORT = 0*/ /* Empty timer functions if CONFIG_OS_SUPPORT = 0 */ #ifndef timersub #define timersub(a, b, result) #endif struct aom_usec_timer { void *dummy; }; static inline void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; } static inline void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; } static inline int aom_usec_timer_elapsed(struct aom_usec_timer *t) { (void)t; return 0; } #endif /* CONFIG_OS_SUPPORT */ #endif // AOM_AOM_PORTS_AOM_TIMER_H_ aom-3.12.1/aom_ports/arm.h000066400000000000000000000027761477627663500153500ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_ARM_H_ #define AOM_AOM_PORTS_ARM_H_ #include #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif // Armv7-A optional Neon instructions, mandatory from Armv8.0-A. #define HAS_NEON (1 << 0) // Armv8.0-A optional CRC32 instructions, mandatory from Armv8.1-A. #define HAS_ARM_CRC32 (1 << 1) // Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. #define HAS_NEON_DOTPROD (1 << 2) // Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. #define HAS_NEON_I8MM (1 << 3) // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. 
#define HAS_SVE (1 << 4)
// Armv9.0-A SVE2 instructions.
#define HAS_SVE2 (1 << 5)

int aom_arm_cpu_caps(void);

// Earlier gcc compilers have issues with some neon intrinsics
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
    __GNUC_MINOR__ <= 6
#define AOM_INCOMPATIBLE_GCC
#endif

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AOM_PORTS_ARM_H_
aom-3.12.1/aom_ports/arm_cpudetect.h000066400000000000000000000031261477627663500173760ustar00rootroot00000000000000/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_ports/arm.h"
#include "config/aom_config.h"

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#if defined(_WIN32)
#undef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#undef WIN32_EXTRA_LEAN
#define WIN32_EXTRA_LEAN
#include <windows.h>
#endif

#ifdef WINAPI_FAMILY
#include <winapifamily.h>
#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
#define getenv(x) NULL
#endif
#endif

#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
#define AOM_USE_ANDROID_CPU_FEATURES 1
// Use getauxval() when targeting (64-bit) Android with API level >= 18.
// getauxval() is supported since Android API level 18 (Android 4.3.)
// First Android version with 64-bit support was Android 5.x (API level 21).
#include <cpu-features.h>
#endif

static bool arm_cpu_env_flags(int *flags) {
  const char *env = getenv("AOM_SIMD_CAPS");
  if (env && *env) {
    *flags = (int)strtol(env, NULL, 0);
    return true;
  }
  return false;
}

static int arm_cpu_env_mask(void) {
  const char *env = getenv("AOM_SIMD_CAPS_MASK");
  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
}
aom-3.12.1/aom_ports/bitops.h000066400000000000000000000063201477627663500160560ustar00rootroot00000000000000/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_PORTS_BITOPS_H_
#define AOM_AOM_PORTS_BITOPS_H_

#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"

#ifdef _MSC_VER
#if defined(_M_X64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM)
#include <intrin.h>
#define USE_MSC_INTRINSICS
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

// get_msb:
// Returns (int)floor(log2(n)). n must be > 0.
// These versions of get_msb() are only valid when n != 0 because all
// of the optimized versions are undefined when n == 0:
// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
// MSVC: https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics

// use GNU builtins where available.
#if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) static inline int get_msb(unsigned int n) { assert(n != 0); return 31 ^ __builtin_clz(n); } #elif defined(USE_MSC_INTRINSICS) #pragma intrinsic(_BitScanReverse) static inline int get_msb(unsigned int n) { unsigned long first_set_bit; assert(n != 0); _BitScanReverse(&first_set_bit, n); return first_set_bit; } #else static inline int get_msb(unsigned int n) { int log = 0; unsigned int value = n; assert(n != 0); for (int shift = 16; shift != 0; shift >>= 1) { const unsigned int x = value >> shift; if (x != 0) { value = x; log += shift; } } return log; } #endif #if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) static inline int aom_clzll(uint64_t n) { return __builtin_clzll(n); } #elif defined(USE_MSC_INTRINSICS) #if defined(_M_X64) || defined(_M_ARM64) #pragma intrinsic(_BitScanReverse64) #endif static inline int aom_clzll(uint64_t n) { assert(n != 0); unsigned long first_set_bit; // NOLINT(runtime/int) #if defined(_M_X64) || defined(_M_ARM64) const unsigned char bit_set = _BitScanReverse64(&first_set_bit, (unsigned __int64)n); #else // !(defined(_M_X64) || defined(_M_ARM64)) const unsigned long n_hi = (unsigned long)(n >> 32); // NOLINT(runtime/int) if (n_hi != 0) { const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); assert(bit_set != 0); (void)bit_set; return 31 ^ (int)first_set_bit; } const unsigned char bit_set = _BitScanReverse(&first_set_bit, (unsigned long)n); // NOLINT(runtime/int) #endif assert(bit_set != 0); (void)bit_set; return 63 ^ (int)first_set_bit; } #undef USE_MSC_INTRINSICS #else static inline int aom_clzll(uint64_t n) { assert(n != 0); int res = 0; uint64_t high_bit = 1ULL << 63; while (!(n & high_bit)) { res++; n <<= 1; } return res; } #endif #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_PORTS_BITOPS_H_ aom-3.12.1/aom_ports/emmintrin_compat.h000066400000000000000000000034111477627663500201210ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ #define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ #if defined(__GNUC__) && __GNUC__ < 4 /* From emmintrin.h (gcc 4.5.3) */ /* Casts between various SP, DP, INT vector types. Note that these do no conversion of values, they just change the type. 
*/ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castpd_ps(__m128d __A) { return (__m128)__A; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castpd_si128(__m128d __A) { return (__m128i)__A; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castps_pd(__m128 __A) { return (__m128d)__A; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castps_si128(__m128 __A) { return (__m128i)__A; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castsi128_ps(__m128i __A) { return (__m128)__A; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_castsi128_pd(__m128i __A) { return (__m128d)__A; } #endif #endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ aom-3.12.1/aom_ports/float.asm000066400000000000000000000015671477627663500162240ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; %include "aom_ports/x86_abi_support.asm" section .text %if LIBAOM_YASM_WIN64 globalsym(aom_winx64_fldcw) sym(aom_winx64_fldcw): sub rsp, 8 mov [rsp], rcx ; win x64 specific fldcw [rsp] add rsp, 8 ret globalsym(aom_winx64_fstcw) sym(aom_winx64_fstcw): sub rsp, 8 fstcw [rsp] mov rax, [rsp] add rsp, 8 ret %endif aom-3.12.1/aom_ports/mem.h000066400000000000000000000072051477627663500153370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_MEM_H_ #define AOM_AOM_PORTS_MEM_H_ #include "aom/aom_integer.h" #include "config/aom_config.h" #if defined(__GNUC__) || defined(__SUNPRO_C) #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) #elif defined(_MSC_VER) #define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val #else #warning No alignment directives known for this compiler. #define DECLARE_ALIGNED(n, typ, val) typ val #endif #if defined(__has_builtin) #define AOM_HAS_BUILTIN(x) __has_builtin(x) #else #define AOM_HAS_BUILTIN(x) 0 #endif #if !AOM_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif /* Shift down with rounding for use when n >= 0. Usually value >= 0, but the * macro can be used with a negative value if the direction of rounding is * acceptable. */ #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n)) /* Shift down with rounding for signed integers, for use when n >= 0 */ #define ROUND_POWER_OF_TWO_SIGNED(value, n) \ (((value) < 0) ? 
-ROUND_POWER_OF_TWO(-(value), (n)) \ : ROUND_POWER_OF_TWO((value), (n))) /* Shift down with rounding for use when n >= 0 (64-bit value). Usually * value >= 0, but the macro can be used with a negative value if the direction * of rounding is acceptable. */ #define ROUND_POWER_OF_TWO_64(value, n) \ (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n)) /* Shift down with rounding for signed integers, for use when n >= 0 (64-bit * value) */ #define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \ (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \ : ROUND_POWER_OF_TWO_64((value), (n))) /* Shift down with ceil() for use when n >= 0 and value >= 0.*/ #define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n)) /* shift right or left depending on sign of n */ #define RIGHT_SIGNED_SHIFT(value, n) \ ((n) < 0 ? ((value) << (-(n))) : ((value) >> (n))) #define ALIGN_POWER_OF_TWO(value, n) \ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) #define ALIGN_POWER_OF_TWO_UNSIGNED(value, n) \ (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1)) #define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) /*!\brief force enum to be unsigned 1 byte*/ #define UENUM1BYTE(enumvar) \ ; \ typedef uint8_t enumvar /*!\brief force enum to be signed 1 byte*/ #define SENUM1BYTE(enumvar) \ ; \ typedef int8_t enumvar /*!\brief force enum to be unsigned 2 byte*/ #define UENUM2BYTE(enumvar) \ ; \ typedef uint16_t enumvar /*!\brief force enum to be signed 2 byte*/ #define SENUM2BYTE(enumvar) \ ; \ typedef int16_t enumvar /*!\brief force enum to be unsigned 4 byte*/ #define UENUM4BYTE(enumvar) \ ; \ typedef uint32_t enumvar /*!\brief force enum to be unsigned 4 byte*/ #define SENUM4BYTE(enumvar) \ ; \ typedef int32_t enumvar #endif // AOM_AOM_PORTS_MEM_H_ aom-3.12.1/aom_ports/mem_ops.h000066400000000000000000000153401477627663500162170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_MEM_OPS_H_ #define AOM_AOM_PORTS_MEM_OPS_H_ /* \file * \brief Provides portable memory access primitives * * This function provides portable primitives for getting and setting of * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations * can be performed on unaligned data regardless of hardware support for * unaligned accesses. * * The type used to pass the integral values may be changed by defining * MEM_VALUE_T with the appropriate type. The type given must be an integral * numeric type. * * The actual functions instantiated have the MEM_VALUE_T type name pasted * on to the symbol name. This allows the developer to instantiate these * operations for multiple types within the same translation unit. This is * of somewhat questionable utility, but the capability exists nonetheless. * Users not making use of this functionality should call the functions * without the type name appended, and the preprocessor will take care of * it. 
* * NOTE: This code is not supported on platforms where char > 1 octet ATM. */ #ifndef MAU_T /* Minimum Access Unit for this target */ #define MAU_T unsigned char #endif #ifndef MEM_VALUE_T #define MEM_VALUE_T int #endif #undef MEM_VALUE_T_SZ_BITS #define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) #undef mem_ops_wrap_symbol #define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) #undef mem_ops_wrap_symbol2 #define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ) #undef mem_ops_wrap_symbol3 #define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ /* * Include aligned access routines */ #define INCLUDED_BY_MEM_OPS_H #include "mem_ops_aligned.h" #undef INCLUDED_BY_MEM_OPS_H #undef mem_get_be16 #define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = mem[0] << 8; val |= mem[1]; return val; } #undef mem_get_be24 #define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = mem[0] << 16; val |= mem[1] << 8; val |= mem[2]; return val; } #undef mem_get_be32 #define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = ((unsigned MEM_VALUE_T)mem[0]) << 24; val |= mem[1] << 16; val |= mem[2] << 8; val |= mem[3]; return val; } #undef mem_get_le16 #define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = mem[1] << 8; val |= mem[0]; return val; } #undef mem_get_le24 #define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = mem[2] << 16; val |= mem[1] << 8; val |= mem[0]; return val; } #undef mem_get_le32 #define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; val = ((unsigned MEM_VALUE_T)mem[3]) << 24; val |= mem[2] << 16; val |= mem[1] << 8; val |= mem[0]; return val; } #define mem_get_s_generic(end, sz) \ static inline signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ const MAU_T *mem = (const MAU_T *)vmem; \ signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \ } /* clang-format off */ #undef mem_get_sbe16 #define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) mem_get_s_generic(be, 16) #undef mem_get_sbe24 #define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) mem_get_s_generic(be, 24) #undef mem_get_sbe32 #define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) mem_get_s_generic(be, 32) #undef mem_get_sle16 #define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) mem_get_s_generic(le, 16) #undef mem_get_sle24 #define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) mem_get_s_generic(le, 24) #undef mem_get_sle32 #define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) mem_get_s_generic(le, 32) #undef mem_put_be16 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) static inline void mem_put_be16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 8) & 0xff); mem[1] = (MAU_T)((val >> 0) & 0xff); } 
#undef mem_put_be24 #define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) static inline void mem_put_be24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 16) & 0xff); mem[1] = (MAU_T)((val >> 8) & 0xff); mem[2] = (MAU_T)((val >> 0) & 0xff); } #undef mem_put_be32 #define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) static inline void mem_put_be32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 24) & 0xff); mem[1] = (MAU_T)((val >> 16) & 0xff); mem[2] = (MAU_T)((val >> 8) & 0xff); mem[3] = (MAU_T)((val >> 0) & 0xff); } #undef mem_put_le16 #define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) static inline void mem_put_le16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); mem[1] = (MAU_T)((val >> 8) & 0xff); } #undef mem_put_le24 #define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) static inline void mem_put_le24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); mem[1] = (MAU_T)((val >> 8) & 0xff); mem[2] = (MAU_T)((val >> 16) & 0xff); } #undef mem_put_le32 #define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) static inline void mem_put_le32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (MAU_T)((val >> 0) & 0xff); mem[1] = (MAU_T)((val >> 8) & 0xff); mem[2] = (MAU_T)((val >> 16) & 0xff); mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ #endif // AOM_AOM_PORTS_MEM_OPS_H_ aom-3.12.1/aom_ports/mem_ops_aligned.h000066400000000000000000000160471477627663500177070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ #define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ #include "aom/aom_integer.h" /* \file * \brief Provides portable memory access primitives for operating on aligned * data * * This file is split from mem_ops.h for easier maintenance. See mem_ops.h * for a more detailed description of these primitives. */ #ifndef INCLUDED_BY_MEM_OPS_H #error Include mem_ops.h, not mem_ops_aligned.h directly. #endif /* Architectures that provide instructions for doing this byte swapping * could redefine these macros. 
*/ #define swap_endian_16(val, raw) \ do { \ val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \ } while (0) #define swap_endian_32(val, raw) \ do { \ val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \ ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \ } while (0) #define swap_endian_16_se(val, raw) \ do { \ swap_endian_16(val, raw); \ val = ((val << 16) >> 16); \ } while (0) #define swap_endian_32_se(val, raw) swap_endian_32(val, raw) #define mem_get_ne_aligned_generic(end, sz) \ static inline unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ const void *vmem) { \ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ return *mem; \ } #define mem_get_sne_aligned_generic(end, sz) \ static inline signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ const void *vmem) { \ const int##sz##_t *mem = (const int##sz##_t *)vmem; \ return *mem; \ } #define mem_get_se_aligned_generic(end, sz) \ static inline unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ const void *vmem) { \ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ unsigned MEM_VALUE_T val, raw = *mem; \ swap_endian_##sz(val, raw); \ return val; \ } #define mem_get_sse_aligned_generic(end, sz) \ static inline signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ const void *vmem) { \ const int##sz##_t *mem = (const int##sz##_t *)vmem; \ unsigned MEM_VALUE_T val, raw = *mem; \ swap_endian_##sz##_se(val, raw); \ return val; \ } #define mem_put_ne_aligned_generic(end, sz) \ static inline void mem_put_##end##sz##_aligned(void *vmem, \ MEM_VALUE_T val) { \ uint##sz##_t *mem = (uint##sz##_t *)vmem; \ *mem = (uint##sz##_t)val; \ } #define mem_put_se_aligned_generic(end, sz) \ static inline void mem_put_##end##sz##_aligned(void *vmem, \ MEM_VALUE_T val) { \ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ swap_endian_##sz(raw, val); \ *mem = (uint##sz##_t)raw; \ } #include "config/aom_config.h" #if CONFIG_BIG_ENDIAN #define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz) #define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz) #define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz) #define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz) #define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz) #define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz) #else #define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz) #define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz) #define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz) #define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz) #define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz) #define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz) #endif /* clang-format off */ #undef mem_get_be16_aligned #define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) mem_get_be_aligned_generic(16) #undef mem_get_be32_aligned #define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) mem_get_be_aligned_generic(32) #undef mem_get_le16_aligned #define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) mem_get_le_aligned_generic(16) #undef mem_get_le32_aligned #define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) mem_get_le_aligned_generic(32) #undef mem_get_sbe16_aligned #define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned) mem_get_sbe_aligned_generic(16) #undef mem_get_sbe32_aligned 
#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned) mem_get_sbe_aligned_generic(32) #undef mem_get_sle16_aligned #define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned) mem_get_sle_aligned_generic(16) #undef mem_get_sle32_aligned #define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned) mem_get_sle_aligned_generic(32) #undef mem_put_be16_aligned #define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned) mem_put_be_aligned_generic(16) #undef mem_put_be32_aligned #define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned) mem_put_be_aligned_generic(32) #undef mem_put_le16_aligned #define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned) mem_put_le_aligned_generic(16) #undef mem_put_le32_aligned #define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned) mem_put_le_aligned_generic(32) #undef mem_get_ne_aligned_generic #undef mem_get_se_aligned_generic #undef mem_get_sne_aligned_generic #undef mem_get_sse_aligned_generic #undef mem_put_ne_aligned_generic #undef mem_put_se_aligned_generic #undef swap_endian_16 #undef swap_endian_32 #undef swap_endian_16_se #undef swap_endian_32_se /* clang-format on */ #endif // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ aom-3.12.1/aom_ports/ppc.h000066400000000000000000000014461477627663500153440ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_PPC_H_ #define AOM_AOM_PORTS_PPC_H_ #include #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif #define HAS_VSX 0x01 int ppc_simd_caps(void); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_PORTS_PPC_H_ aom-3.12.1/aom_ports/ppc_cpudetect.c000066400000000000000000000037061477627663500174000ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include "config/aom_config.h" #include "aom_ports/ppc.h" #if CONFIG_RUNTIME_CPU_DETECT static int cpu_env_flags(int *flags) { char *env; env = getenv("AOM_SIMD_CAPS"); if (env && *env) { *flags = (int)strtol(env, NULL, 0); return 0; } *flags = 0; return -1; } static int cpu_env_mask(void) { char *env; env = getenv("AOM_SIMD_CAPS_MASK"); return env && *env ? (int)strtol(env, NULL, 0) : ~0; } int ppc_simd_caps(void) { int flags; int mask; int fd; ssize_t count; unsigned int i; uint64_t buf[64]; // If AOM_SIMD_CAPS_MASK is set then allow only those capabilities. 
if (!cpu_env_flags(&flags)) { return flags; } mask = cpu_env_mask(); fd = open("/proc/self/auxv", O_RDONLY); if (fd < 0) { return 0; } while ((count = read(fd, buf, sizeof(buf))) > 0) { for (i = 0; i < (count / sizeof(*buf)); i += 2) { if (buf[i] == AT_HWCAP) { #if HAVE_VSX if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { flags |= HAS_VSX; } #endif // HAVE_VSX goto out_close; } else if (buf[i] == AT_NULL) { goto out_close; } } } out_close: close(fd); return flags & mask; } #else // If there is no RTCD the function pointers are not used and can not be // changed. int ppc_simd_caps(void) { return 0; } #endif // CONFIG_RUNTIME_CPU_DETECT aom-3.12.1/aom_ports/riscv.h000066400000000000000000000014561477627663500157110ustar00rootroot00000000000000/* * Copyright (c) 2025, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_RISCV_H_ #define AOM_AOM_PORTS_RISCV_H_ #include #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif #define HAS_RVV 0x01 int riscv_simd_caps(void); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_PORTS_RISCV_H_ aom-3.12.1/aom_ports/riscv_cpudetect.c000066400000000000000000000020741477627663500177410ustar00rootroot00000000000000/* * Copyright (c) 2025, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "aom_ports/riscv.h" #if CONFIG_RUNTIME_CPU_DETECT #include #define HWCAP_RVV (1 << ('v' - 'a')) int riscv_simd_caps(void) { int flags = 0; #if HAVE_RVV unsigned long hwcap = getauxval(AT_HWCAP); if (hwcap & HWCAP_RVV) flags |= HAS_RVV; #endif return flags; } #else // If there is no RTCD the function pointers are not used and can not be // changed. int riscv_simd_caps(void) { return 0; } #endif // CONFIG_RUNTIME_CPU_DETECT aom-3.12.1/aom_ports/sanitizer.h000066400000000000000000000025461477627663500165740ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_SANITIZER_H_ #define AOM_AOM_PORTS_SANITIZER_H_ // AddressSanitizer support. // Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used. // Clang. 
#if defined(__has_feature) #if __has_feature(address_sanitizer) #define AOM_ADDRESS_SANITIZER 1 #endif #endif // defined(__has_feature) // GCC. #if defined(__SANITIZE_ADDRESS__) #define AOM_ADDRESS_SANITIZER 1 #endif // defined(__SANITIZE_ADDRESS__) // Define the macros for AddressSanitizer manual memory poisoning. See // https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning. #if defined(AOM_ADDRESS_SANITIZER) #include #else #define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) #define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) #endif #endif // AOM_AOM_PORTS_SANITIZER_H_ aom-3.12.1/aom_ports/x86.h000066400000000000000000000277531477627663500152200ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_PORTS_X86_H_ #define AOM_AOM_PORTS_X86_H_ #include #if defined(_MSC_VER) #include /* For __cpuidex, __rdtsc */ #endif #include "aom/aom_integer.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif typedef enum { AOM_CPU_UNKNOWN = -1, AOM_CPU_AMD, AOM_CPU_AMD_OLD, AOM_CPU_CENTAUR, AOM_CPU_CYRIX, AOM_CPU_INTEL, AOM_CPU_NEXGEN, AOM_CPU_NSC, AOM_CPU_RISE, AOM_CPU_SIS, AOM_CPU_TRANSMETA, AOM_CPU_TRANSMETA_OLD, AOM_CPU_UMC, AOM_CPU_VIA, AOM_CPU_LAST } aom_cpu_t; #if defined(__GNUC__) || defined(__ANDROID__) #if AOM_ARCH_X86_64 #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__("cpuid \n\t" \ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__( \ "mov %%ebx, %%edi \n\t" \ "cpuid \n\t" \ "xchg %%edi, %%ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ : "a"(func), "c"(func2)) #endif #elif defined(__SUNPRO_C) || \ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ #if AOM_ARCH_X86_64 #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ "xchg %rsi, %rbx \n\t" \ "cpuid \n\t" \ "movl %ebx, %edi \n\t" \ "xchg %rsi, %rbx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ "pushl %ebx \n\t" \ "cpuid \n\t" \ "movl %ebx, %edi \n\t" \ "popl %ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ : "a"(func), "c"(func2)) #endif #else /* end __SUNPRO__ */ #if AOM_ARCH_X86_64 #if defined(_MSC_VER) && _MSC_VER > 1500 #define cpuid(func, func2, a, b, c, d) \ do { \ int regs[4]; \ __cpuidex(regs, func, func2); \ a = regs[0]; \ b = regs[1]; \ c = regs[2]; \ d = regs[3]; \ } while (0) #else #define cpuid(func, func2, a, b, c, d) \ do { \ int regs[4]; \ __cpuid(regs, func); \ a = regs[0]; \ b = regs[1]; \ c = regs[2]; \ d = regs[3]; \ } while (0) #endif #else /* clang-format off */ #define cpuid(func, func2, a, b, c, d) \ __asm mov eax, func \ __asm mov ecx, func2 \ __asm cpuid \ __asm mov a, eax \ __asm mov b, ebx \ __asm mov c, ecx \ __asm mov d, edx #endif /* clang-format on */ #endif /* end others */ // NaCl has no support for xgetbv or the raw opcode. 
#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
static inline uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
                   : "=a"(eax), "=d"(edx)
                   : "c"(ecx));
  return ((uint64_t)edx << 32) | eax;
}
#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
#include <immintrin.h>
#define xgetbv() _xgetbv(0)
#elif defined(_MSC_VER) && defined(_M_IX86)
static inline uint64_t xgetbv(void) {
  uint32_t eax_, edx_;
  __asm {
    xor ecx, ecx  // ecx = 0
    // Use the raw opcode for xgetbv for compatibility with older toolchains.
    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
    mov eax_, eax
    mov edx_, edx
  }
  return ((uint64_t)edx_ << 32) | eax_;
}
#else
#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
#endif

#if defined(_MSC_VER) && _MSC_VER >= 1700
#undef NOMINMAX
#define NOMINMAX
#undef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
#define getenv(x) NULL
#endif
#endif

#define HAS_MMX 0x01
#define HAS_SSE 0x02
#define HAS_SSE2 0x04
#define HAS_SSE3 0x08
#define HAS_SSSE3 0x10
#define HAS_SSE4_1 0x20
#define HAS_AVX 0x40
#define HAS_AVX2 0x80
#define HAS_SSE4_2 0x100

#ifndef BIT
#define BIT(n) (1u << (n))
#endif

static inline int x86_simd_caps(void) {
  unsigned int flags = 0;
  unsigned int mask = ~0u;
  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
  char *env;

  /* See if the CPU capabilities are being overridden by the environment */
  env = getenv("AOM_SIMD_CAPS");
  if (env && *env) return (int)strtol(env, NULL, 0);

  env = getenv("AOM_SIMD_CAPS_MASK");
  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);

  /* Ensure that the CPUID instruction supports extended features */
  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
  if (max_cpuid_val < 1) return 0;

  /* Get the standard feature flags */
  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);

  if (reg_edx & BIT(23)) flags |= HAS_MMX;
  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
  if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;

  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
    if ((xgetbv() & 0x6) == 0x6) {
      flags |= HAS_AVX;
      if (max_cpuid_val >= 7) {
        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
      }
    }
  }
  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
  return flags & mask;
}

// Fine-Grain Measurement Functions
//
// If you are timing a small region of code, access the timestamp counter
// (TSC) via:
//
// unsigned int start = x86_tsc_start();
// ...
// unsigned int end = x86_tsc_end();
// unsigned int diff = end - start;
//
// The start/end functions introduce a few more instructions than using
// x86_readtsc directly, but prevent the CPU's out-of-order execution from
// affecting the measurement (by having earlier/later instructions be evaluated
// in the time interval). See the white paper, "How to Benchmark Code
// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by
// Gabriele Paoloni for more information.
// // If you are timing a large function (CPU time > a couple of seconds), use // x86_readtsc64 to read the timestamp counter in a 64-bit integer. The // out-of-order leakage that can occur is minimal compared to total runtime. static inline unsigned int x86_readtsc(void) { #if defined(__GNUC__) unsigned int tsc; __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); return tsc; #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) unsigned int tsc; asm volatile("rdtsc\n\t" : "=a"(tsc) :); return tsc; #else #if AOM_ARCH_X86_64 return (unsigned int)__rdtsc(); #else __asm rdtsc; #endif #endif } // 64-bit CPU cycle counter static inline uint64_t x86_readtsc64(void) { #if defined(__GNUC__) uint32_t hi, lo; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) uint_t hi, lo; asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; #else #if AOM_ARCH_X86_64 return (uint64_t)__rdtsc(); #else __asm rdtsc; #endif #endif } // 32-bit CPU cycle counter with a partial fence against out-of-order execution. static inline unsigned int x86_readtscp(void) { #if defined(__GNUC__) unsigned int tscp; __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); return tscp; #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) unsigned int tscp; asm volatile("rdtscp\n\t" : "=a"(tscp) :); return tscp; #elif defined(_MSC_VER) unsigned int ui; return (unsigned int)__rdtscp(&ui); #else #if AOM_ARCH_X86_64 return (unsigned int)__rdtscp(); #else __asm rdtscp; #endif #endif } static inline unsigned int x86_tsc_start(void) { unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. (void)reg_eax; (void)reg_ebx; (void)reg_ecx; (void)reg_edx; return x86_readtsc(); } static inline unsigned int x86_tsc_end(void) { uint32_t v = x86_readtscp(); unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; // This call should not be removed. See function notes above. cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); // Avoid compiler warnings on unused-but-set variables. 
(void)reg_eax; (void)reg_ebx; (void)reg_ecx; (void)reg_edx; return v; } #if defined(__GNUC__) #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) #define x86_pause_hint() asm volatile("pause \n\t") #else #if AOM_ARCH_X86_64 #define x86_pause_hint() _mm_pause(); #else #define x86_pause_hint() __asm pause #endif #endif #if defined(__GNUC__) static void x87_set_control_word(unsigned short mode) { __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); } static unsigned short x87_get_control_word(void) { unsigned short mode; __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :); return mode; } #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) static void x87_set_control_word(unsigned short mode) { asm volatile("fldcw %0" : : "m"(*&mode)); } static unsigned short x87_get_control_word(void) { unsigned short mode; asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); return mode; } #elif AOM_ARCH_X86_64 /* No fldcw intrinsics on Windows x64, punt to external asm */ extern void aom_winx64_fldcw(unsigned short mode); extern unsigned short aom_winx64_fstcw(void); #define x87_set_control_word aom_winx64_fldcw #define x87_get_control_word aom_winx64_fstcw #else static void x87_set_control_word(unsigned short mode) { __asm { fldcw mode } } static unsigned short x87_get_control_word(void) { unsigned short mode; __asm { fstcw mode } return mode; } #endif static inline unsigned int x87_set_double_precision(void) { unsigned int mode = x87_get_control_word(); // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf // 8.1.5.2 Precision Control Field // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") // determine the number of bits used in floating point calculations. To match // later SSE instructions restrict x87 operations to Double Precision (0x200). // Precision PC Field // Single Precision (24-Bits) 00B // Reserved 01B // Double Precision (53-Bits) 10B // Extended Precision (64-Bits) 11B x87_set_control_word((mode & ~0x300u) | 0x200u); return mode; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_PORTS_X86_H_ aom-3.12.1/aom_ports/x86_abi_support.asm000066400000000000000000000226771477627663500201600ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "config/aom_config.asm" ; 32/64 bit compatibility macros ; ; In general, we make the source use 64 bit syntax, then twiddle with it using ; the preprocessor to get the 32 bit syntax on 32 bit platforms. 
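; Illustrative sketch (editor's addition, assumed rather than quoted from this
; file): with the 32-bit remappings defined below, a source line written in
; 64-bit syntax such as
;     mov rax, arg(0)
; is seen by the assembler on a 32-bit target as
;     mov eax, [ebp+8+4*0]
; because rax is %define'd to eax and arg() expands to a stack reference
; relative to ebp (see the arg() definition later in this file).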
; %ifidn __OUTPUT_FORMAT__,elf32 %define ABI_IS_32BIT 1 %elifidn __OUTPUT_FORMAT__,macho32 %define ABI_IS_32BIT 1 %elifidn __OUTPUT_FORMAT__,win32 %define ABI_IS_32BIT 1 %elifidn __OUTPUT_FORMAT__,aout %define ABI_IS_32BIT 1 %else %define ABI_IS_32BIT 0 %endif %if ABI_IS_32BIT %define rax eax %define rbx ebx %define rcx ecx %define rdx edx %define rsi esi %define rdi edi %define rsp esp %define rbp ebp %define movsxd mov %macro movq 2 %ifidn %1,eax movd %1,%2 %elifidn %2,eax movd %1,%2 %elifidn %1,ebx movd %1,%2 %elifidn %2,ebx movd %1,%2 %elifidn %1,ecx movd %1,%2 %elifidn %2,ecx movd %1,%2 %elifidn %1,edx movd %1,%2 %elifidn %2,edx movd %1,%2 %elifidn %1,esi movd %1,%2 %elifidn %2,esi movd %1,%2 %elifidn %1,edi movd %1,%2 %elifidn %2,edi movd %1,%2 %elifidn %1,esp movd %1,%2 %elifidn %2,esp movd %1,%2 %elifidn %1,ebp movd %1,%2 %elifidn %2,ebp movd %1,%2 %else movq %1,%2 %endif %endmacro %endif ; LIBAOM_YASM_WIN64 ; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64 ; or win64 is defined on the Yasm command line. %ifidn __OUTPUT_FORMAT__,win64 %define LIBAOM_YASM_WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define LIBAOM_YASM_WIN64 1 %else %define LIBAOM_YASM_WIN64 0 %endif ; Declare groups of platforms %ifidn __OUTPUT_FORMAT__,elf32 %define LIBAOM_ELF 1 %elifidn __OUTPUT_FORMAT__,elfx32 %define LIBAOM_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define LIBAOM_ELF 1 %else %define LIBAOM_ELF 0 %endif %ifidn __OUTPUT_FORMAT__,macho32 %define LIBAOM_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 %define LIBAOM_MACHO 1 %else %define LIBAOM_MACHO 0 %endif ; sym() ; Return the proper symbol name for the target ABI. ; ; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols ; with C linkage be prefixed with an underscore. ; %if LIBAOM_ELF || LIBAOM_YASM_WIN64 %define sym(x) x %else ; Mach-O / COFF %define sym(x) _ %+ x %endif ; globalsym() ; Return a global declaration with the proper decoration for the target ABI. ; ; When CHROMIUM is defined, include attributes to hide the symbol from the ; global namespace. ; ; Chromium doesn't like exported global symbols due to symbol clashing with ; plugins among other things. ; ; Requires Chromium's patched copy of yasm: ; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 ; http://www.tortall.net/projects/yasm/ticket/236 ; or nasm > 2.14. ; %ifdef CHROMIUM %ifdef __NASM_VER__ %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 ; nasm < 2.14 does not support :private_extern directive %fatal Must use nasm 2.14 or newer %endif %endif %if LIBAOM_ELF %define globalsym(x) global sym(x) %+ :function hidden %elif LIBAOM_MACHO %define globalsym(x) global sym(x) %+ :private_extern %else ; COFF / PE32+ %define globalsym(x) global sym(x) %endif %else %define globalsym(x) global sym(x) %endif ; arg() ; Return the address specification of the given argument ; %if ABI_IS_32BIT %define arg(x) [ebp+8+4*x] %else ; 64 bit ABI passes arguments in registers. This is a workaround to get up ; and running quickly. Relies on SHADOW_ARGS_TO_STACK %if LIBAOM_YASM_WIN64 %define arg(x) [rbp+16+8*x] %else %define arg(x) [rbp-8-8*x] %endif %endif ; REG_SZ_BYTES, REG_SZ_BITS ; Size of a register %if ABI_IS_32BIT %define REG_SZ_BYTES 4 %define REG_SZ_BITS 32 %else %define REG_SZ_BYTES 8 %define REG_SZ_BITS 64 %endif ; ALIGN_STACK ; This macro aligns the stack to the given alignment (in bytes). 
The stack ; is left such that the previous value of the stack pointer is the first ; argument on the stack (ie, the inverse of this macro is 'pop rsp.') ; This macro uses one temporary register, which is not preserved, and thus ; must be specified as an argument. %macro ALIGN_STACK 2 mov %2, rsp and rsp, -%1 lea rsp, [rsp - (%1 - REG_SZ_BYTES)] push %2 %endmacro ; ; The Microsoft assembler tries to impose a certain amount of type safety in ; its register usage. YASM doesn't recognize these directives, so we just ; %define them away to maintain as much compatibility as possible with the ; original inline assembler we're porting from. ; %idefine PTR %idefine XMMWORD %idefine MMWORD ; PIC macros ; %if ABI_IS_32BIT %if CONFIG_PIC=1 %ifidn __OUTPUT_FORMAT__,elf32 %define WRT_PLT wrt ..plt %macro GET_GOT 1 extern _GLOBAL_OFFSET_TABLE_ push %1 call %%get_got %%sub_offset: jmp %%exitGG %%get_got: mov %1, [esp] add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc ret %%exitGG: %undef GLOBAL %define GLOBAL(x) x + %1 wrt ..gotoff %undef RESTORE_GOT %define RESTORE_GOT pop %1 %endmacro %elifidn __OUTPUT_FORMAT__,macho32 %macro GET_GOT 1 push %1 call %%get_got %%get_got: pop %1 %undef GLOBAL %define GLOBAL(x) x + %1 - %%get_got %undef RESTORE_GOT %define RESTORE_GOT pop %1 %endmacro %endif %endif %ifdef CHROMIUM %ifidn __OUTPUT_FORMAT__,macho32 %define HIDDEN_DATA(x) x:private_extern %else %define HIDDEN_DATA(x) x %endif %else %define HIDDEN_DATA(x) x %endif %else %macro GET_GOT 1 %endmacro %define GLOBAL(x) rel x %ifidn __OUTPUT_FORMAT__,elf64 %define WRT_PLT wrt ..plt %define HIDDEN_DATA(x) x:data hidden %elifidn __OUTPUT_FORMAT__,elfx32 %define WRT_PLT wrt ..plt %define HIDDEN_DATA(x) x:data hidden %elifidn __OUTPUT_FORMAT__,macho64 %ifdef CHROMIUM %define HIDDEN_DATA(x) x:private_extern %else %define HIDDEN_DATA(x) x %endif %else %define HIDDEN_DATA(x) x %endif %endif %ifnmacro GET_GOT %macro GET_GOT 1 %endmacro %define GLOBAL(x) x %endif %ifndef RESTORE_GOT %define RESTORE_GOT %endif %ifndef WRT_PLT %define WRT_PLT %endif %if ABI_IS_32BIT %macro SHADOW_ARGS_TO_STACK 1 %endm %define UNSHADOW_ARGS %else %if LIBAOM_YASM_WIN64 %macro SHADOW_ARGS_TO_STACK 1 ; argc %if %1 > 0 mov arg(0),rcx %endif %if %1 > 1 mov arg(1),rdx %endif %if %1 > 2 mov arg(2),r8 %endif %if %1 > 3 mov arg(3),r9 %endif %endm %else %macro SHADOW_ARGS_TO_STACK 1 ; argc %if %1 > 0 push rdi %endif %if %1 > 1 push rsi %endif %if %1 > 2 push rdx %endif %if %1 > 3 push rcx %endif %if %1 > 4 push r8 %endif %if %1 > 5 push r9 %endif %if %1 > 6 %assign i %1-6 %assign off 16 %rep i mov rax,[rbp+off] push rax %assign off off+8 %endrep %endif %endm %endif %define UNSHADOW_ARGS mov rsp, rbp %endif ; Win64 ABI requires that XMM6:XMM15 are callee saved ; SAVE_XMM n, [u] ; store registers 6-n on the stack ; if u is specified, use unaligned movs. ; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return ; value. Typically we follow this up with 'push rbp' - re-aligning the stack - ; but in some cases this is not done and unaligned movs must be used. 
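; Illustrative usage sketch (editor's addition; the surrounding code is
; assumed, not quoted from this file): a Win64 function that clobbers xmm6 and
; xmm7 would typically bracket its body as
;     SAVE_XMM 7
;     ...            ; code using xmm6/xmm7
;     RESTORE_XMM
;     ret
; passing the optional second argument, as in SAVE_XMM 7, u, to request
; unaligned stores when the stack is not known to be 16-byte aligned.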
%if LIBAOM_YASM_WIN64 %macro SAVE_XMM 1-2 a %if %1 < 6 %error Only xmm registers 6-15 must be preserved %else %assign last_xmm %1 %define movxmm movdq %+ %2 %assign xmm_stack_space ((last_xmm - 5) * 16) sub rsp, xmm_stack_space %assign i 6 %rep (last_xmm - 5) movxmm [rsp + ((i - 6) * 16)], xmm %+ i %assign i i+1 %endrep %endif %endmacro %macro RESTORE_XMM 0 %ifndef last_xmm %error RESTORE_XMM must be paired with SAVE_XMM n %else %assign i last_xmm %rep (last_xmm - 5) movxmm xmm %+ i, [rsp +((i - 6) * 16)] %assign i i-1 %endrep add rsp, xmm_stack_space ; there are a couple functions which return from multiple places. ; otherwise, we could uncomment these: ; %undef last_xmm ; %undef xmm_stack_space ; %undef movxmm %endif %endmacro %else %macro SAVE_XMM 1-2 %endmacro %macro RESTORE_XMM 0 %endmacro %endif ; Name of the rodata section ; ; .rodata seems to be an elf-ism, as it doesn't work on OSX. ; %ifidn __OUTPUT_FORMAT__,macho64 %define SECTION_RODATA section .text %elifidn __OUTPUT_FORMAT__,macho32 %macro SECTION_RODATA 0 section .text %endmacro %elifidn __OUTPUT_FORMAT__,aout %define SECTION_RODATA section .data %else %define SECTION_RODATA section .rodata %endif ; Tell GNU ld that we don't require an executable stack. %ifidn __OUTPUT_FORMAT__,elf32 section .note.GNU-stack noalloc noexec nowrite progbits section .text %elifidn __OUTPUT_FORMAT__,elf64 section .note.GNU-stack noalloc noexec nowrite progbits section .text %elifidn __OUTPUT_FORMAT__,elfx32 section .note.GNU-stack noalloc noexec nowrite progbits section .text %endif aom-3.12.1/aom_scale/000077500000000000000000000000001477627663500143245ustar00rootroot00000000000000aom-3.12.1/aom_scale/aom_scale.cmake000066400000000000000000000026261477627663500172570ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_) return() endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_ set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1) list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/generic/yv12config.c" "${AOM_ROOT}/aom_scale/generic/yv12extend.c" "${AOM_ROOT}/aom_scale/yv12config.h") # Creates the aom_scale build target and makes libaom depend on it. The libaom # target must exist before this function is called. function(setup_aom_scale_targets) add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES}) target_sources(aom PRIVATE $) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE) endfunction() aom-3.12.1/aom_scale/aom_scale_rtcd.c000066400000000000000000000013031477627663500174240ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #define RTCD_C #include "config/aom_scale_rtcd.h" #include "aom_ports/aom_once.h" void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); } aom-3.12.1/aom_scale/aom_scale_rtcd.pl000066400000000000000000000053661477627663500176320ustar00rootroot00000000000000## ## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## sub aom_scale_forward_decls() { print < struct yv12_buffer_config; EOF } forward_decls qw/aom_scale_forward_decls/; add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes"; add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes"; add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop"; add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2"; add_proto qw/void aom_yv12_partial_coloc_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend"; add_proto qw/void aom_yv12_partial_copy_u/, "const struct yv12_buffer_config *src_bc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_bc, int hstart2, int vstart2"; add_proto qw/void aom_yv12_partial_coloc_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend"; add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_bc, int hstart2, int vstart2"; add_proto qw/void aom_yv12_partial_coloc_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend"; add_proto qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end"; add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, int num_planes"; 1; aom-3.12.1/aom_scale/generic/000077500000000000000000000000001477627663500157405ustar00rootroot00000000000000aom-3.12.1/aom_scale/generic/yv12config.c000066400000000000000000000263751477627663500201100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "aom/aom_image.h" #include "aom/internal/aom_image_internal.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" #include "av1/common/enums.h" /**************************************************************************** * Exports ****************************************************************************/ /**************************************************************************** * ****************************************************************************/ // TODO(jkoleszar): Maybe replace this with struct aom_image int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { if (ybf->buffer_alloc_sz > 0) { aom_free(ybf->buffer_alloc); } #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY if (ybf->y_pyramid) { aom_free_pyramid(ybf->y_pyramid); } if (ybf->corners) { av1_free_corner_list(ybf->corners); } #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY aom_remove_metadata_from_frame_buffer(ybf); /* buffer_alloc isn't accessed by most functions. Rather y_buffer, u_buffer and v_buffer point to buffer_alloc and are used. Clear out all of this so that a freed pointer isn't inadvertently used */ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG)); return 0; } return AOM_CODEC_MEM_ERROR; } static int realloc_frame_buffer_aligned( YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, const int y_stride, const uint64_t yplane_size, const uint64_t uvplane_size, const int aligned_width, const int aligned_height, const int uv_width, const int uv_height, const int uv_stride, const int uv_border_w, const int uv_border_h, bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const uint64_t frame_size = (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); uint8_t *buf = NULL; #if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER // We should only need an 8-bit version of the source frame if we are // encoding in non-realtime mode (void)alloc_pyramid; assert(!alloc_pyramid); #endif // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER #if defined AOM_MAX_ALLOCABLE_MEMORY // The size of ybf->buffer_alloc. uint64_t alloc_size = frame_size; #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY // The size of ybf->y_pyramid if (alloc_pyramid) { alloc_size += aom_get_pyramid_alloc_size(width, height, use_highbitdepth); alloc_size += av1_get_corner_list_size(); } #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY // The decoder may allocate REF_FRAMES frame buffers in the frame buffer // pool. Bound the total amount of allocated memory as if these REF_FRAMES // frame buffers were allocated in a single allocation. 
if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return AOM_CODEC_MEM_ERROR; #endif if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; assert(fb != NULL); if (external_frame_size != (size_t)external_frame_size) return AOM_CODEC_MEM_ERROR; // Allocation to hold larger frame, or first allocation. if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) return AOM_CODEC_MEM_ERROR; if (fb->data == NULL || fb->size < external_frame_size) return AOM_CODEC_MEM_ERROR; ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32); #if defined(__has_feature) #if __has_feature(memory_sanitizer) // This memset is needed for fixing the issue of using uninitialized // value in msan test. It will cause a perf loss, so only do this for // msan test. memset(ybf->buffer_alloc, 0, (size_t)frame_size); #endif #endif } else if (frame_size > ybf->buffer_alloc_sz) { // Allocation to hold larger frame, or first allocation. aom_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; ybf->buffer_alloc_sz = 0; if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR; ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR; ybf->buffer_alloc_sz = (size_t)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be // removed if border is totally removed. memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); } ybf->y_crop_width = width; ybf->y_crop_height = height; ybf->y_width = aligned_width; ybf->y_height = aligned_height; ybf->y_stride = y_stride; ybf->uv_crop_width = (width + ss_x) >> ss_x; ybf->uv_crop_height = (height + ss_y) >> ss_y; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; ybf->border = border; ybf->frame_size = (size_t)frame_size; ybf->subsampling_x = ss_x; ybf->subsampling_y = ss_y; buf = ybf->buffer_alloc; if (use_highbitdepth) { // Store uint16 addresses when using 16bit framebuffers buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); ybf->flags = YV12_FLAG_HIGHBITDEPTH; } else { ybf->flags = 0; } ybf->y_buffer = (uint8_t *)aom_align_addr( buf + (border * y_stride) + border, aom_byte_align); if (!alloc_y_plane_only) { ybf->u_buffer = (uint8_t *)aom_align_addr( buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, aom_byte_align); ybf->v_buffer = (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) + uv_border_w, aom_byte_align); } else { ybf->u_buffer = NULL; ybf->v_buffer = NULL; } ybf->use_external_reference_buffers = 0; #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY if (ybf->y_pyramid) { aom_free_pyramid(ybf->y_pyramid); ybf->y_pyramid = NULL; } if (ybf->corners) { av1_free_corner_list(ybf->corners); ybf->corners = NULL; } if (alloc_pyramid) { ybf->y_pyramid = aom_alloc_pyramid(width, height, use_highbitdepth); if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR; ybf->corners = av1_alloc_corner_list(); if (!ybf->corners) return AOM_CODEC_MEM_ERROR; } #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY ybf->corrupted = 0; /* assume not corrupted by errors */ return 0; } return AOM_CODEC_MEM_ERROR; } static int calc_stride_and_planesize( const int ss_x, const int ss_y, const int aligned_width, const int aligned_height, const int border, const int byte_alignment, int alloc_y_plane_only, int *y_stride, int *uv_stride, uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) { /* Only support 
allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of * the start of the chroma rows without introducing an arbitrary gap * between planes, which would break the semantics of things like * aom_img_set_rect(). */ if (border & 0x1f) return AOM_CODEC_MEM_ERROR; *y_stride = aom_calc_y_stride(aligned_width, border); *yplane_size = (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment; if (!alloc_y_plane_only) { *uv_stride = *y_stride >> ss_x; *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) + byte_alignment; } else { *uv_stride = 0; *uvplane_size = 0; } return 0; } int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { int y_stride = 0; int uv_stride = 0; uint64_t yplane_size = 0; uint64_t uvplane_size = 0; const int aligned_width = (width + 7) & ~7; const int aligned_height = (height + 7) & ~7; const int uv_width = aligned_width >> ss_x; const int uv_height = aligned_height >> ss_y; const int uv_border_w = border >> ss_x; const int uv_border_h = border >> ss_y; int error = calc_stride_and_planesize( ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment, alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height); if (error) return error; return realloc_frame_buffer_aligned( ybf, width, height, ss_x, ss_y, use_highbitdepth, border, byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, aligned_width, aligned_height, uv_width, uv_height, uv_stride, uv_border_w, uv_border_h, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; } int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { aom_free_frame_buffer(ybf); return aom_realloc_frame_buffer( ybf, width, height, ss_x, ss_y, use_highbitdepth, border, byte_alignment, NULL, NULL, NULL, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; } void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf && ybf->metadata) { aom_img_metadata_array_free(ybf->metadata); ybf->metadata = NULL; } } int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, const aom_metadata_array_t *arr) { if (!ybf || !arr || !arr->metadata_array) return -1; if (ybf->metadata == arr) return 0; aom_remove_metadata_from_frame_buffer(ybf); ybf->metadata = aom_img_metadata_array_alloc(arr->sz); if (!ybf->metadata) return -1; for (size_t i = 0; i < ybf->metadata->sz; i++) { ybf->metadata->metadata_array[i] = aom_img_metadata_alloc( arr->metadata_array[i]->type, arr->metadata_array[i]->payload, arr->metadata_array[i]->sz, arr->metadata_array[i]->insert_flag); if (ybf->metadata->metadata_array[i] == NULL) { aom_img_metadata_array_free(ybf->metadata); ybf->metadata = NULL; return -1; } } ybf->metadata->sz = arr->sz; return 0; } aom-3.12.1/aom_scale/generic/yv12extend.c000066400000000000000000000431161477627663500201220ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" static void extend_plane(uint8_t *const src, int src_stride, int width, int height, int extend_top, int extend_left, int extend_bottom, int extend_right, int v_start, int v_end) { assert(src != NULL); int i; const int linesize = extend_left + extend_right + width; assert(linesize <= src_stride); /* copy the left and right most columns out */ uint8_t *src_ptr1 = src + v_start * src_stride; uint8_t *src_ptr2 = src + v_start * src_stride + width - 1; uint8_t *dst_ptr1 = src + v_start * src_stride - extend_left; uint8_t *dst_ptr2 = src_ptr2 + 1; for (i = v_start; i < v_end; ++i) { memset(dst_ptr1, src_ptr1[0], extend_left); memset(dst_ptr2, src_ptr2[0], extend_right); src_ptr1 += src_stride; src_ptr2 += src_stride; dst_ptr1 += src_stride; dst_ptr2 += src_stride; } /* Now copy the top and bottom lines into each line of the respective * borders */ src_ptr1 = src - extend_left; dst_ptr1 = src_ptr1 + src_stride * -extend_top; for (i = 0; i < extend_top; ++i) { memcpy(dst_ptr1, src_ptr1, linesize); dst_ptr1 += src_stride; } src_ptr2 = src_ptr1 + src_stride * (height - 1); dst_ptr2 = src_ptr2; for (i = 0; i < extend_bottom; ++i) { dst_ptr2 += src_stride; memcpy(dst_ptr2, src_ptr2, linesize); } } #if CONFIG_AV1_HIGHBITDEPTH static void extend_plane_high(uint8_t *const src8, int src_stride, int width, int height, int extend_top, int extend_left, int extend_bottom, int extend_right, int v_start, int v_end) { int i; const int linesize = extend_left + extend_right + width; assert(linesize <= src_stride); uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* copy the left and right most columns out */ uint16_t *src_ptr1 = src + v_start * src_stride; uint16_t *src_ptr2 = src + v_start * src_stride + width - 1; uint16_t *dst_ptr1 = src + v_start * src_stride - extend_left; uint16_t *dst_ptr2 = src_ptr2 + 1; for (i = v_start; i < v_end; ++i) { aom_memset16(dst_ptr1, src_ptr1[0], extend_left); aom_memset16(dst_ptr2, src_ptr2[0], extend_right); src_ptr1 += src_stride; src_ptr2 += src_stride; dst_ptr1 += src_stride; dst_ptr2 += src_stride; } /* Now copy the top and bottom lines into each line of the respective * borders */ src_ptr1 = src - extend_left; dst_ptr1 = src_ptr1 + src_stride * -extend_top; for (i = 0; i < extend_top; ++i) { memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); dst_ptr1 += src_stride; } src_ptr2 = src_ptr1 + src_stride * (height - 1); dst_ptr2 = src_ptr2; for (i = 0; i < extend_bottom; ++i) { dst_ptr2 += src_stride; memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); } } #endif // CONFIG_AV1_HIGHBITDEPTH void aom_extend_frame_borders_plane_row_c(const YV12_BUFFER_CONFIG *ybf, int plane, int v_start, int v_end) { const int ext_size = ybf->border; const int ss_x = ybf->subsampling_x; const int ss_y = ybf->subsampling_y; assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); const int is_uv = plane > 0; const int top = ext_size >> (is_uv ? 
ss_y : 0); const int left = ext_size >> (is_uv ? ss_x : 0); const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; const int extend_top_border = (v_start == 0); const int extend_bottom_border = (v_end == ybf->crop_heights[is_uv]); #if CONFIG_AV1_HIGHBITDEPTH if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], extend_top_border ? top : 0, left, extend_bottom_border ? bottom : 0, right, v_start, v_end); return; } #endif extend_plane(ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], extend_top_border ? top : 0, left, extend_bottom_border ? bottom : 0, right, v_start, v_end); } void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { assert(ybf->border % 2 == 0); assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); #if CONFIG_AV1_HIGHBITDEPTH if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { for (int plane = 0; plane < num_planes; ++plane) { const int is_uv = plane > 0; const int plane_border = ybf->border >> is_uv; extend_plane_high( ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], plane_border, plane_border, plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0, ybf->crop_heights[is_uv]); } return; } #endif for (int plane = 0; plane < num_planes; ++plane) { const int is_uv = plane > 0; const int plane_border = ybf->border >> is_uv; extend_plane(ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], plane_border, plane_border, plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0, ybf->crop_heights[is_uv]); } } static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size, const int num_planes) { const int ss_x = ybf->subsampling_x; const int ss_y = ybf->subsampling_y; assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); #if CONFIG_AV1_HIGHBITDEPTH if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { for (int plane = 0; plane < num_planes; ++plane) { const int is_uv = plane > 0; const int top = ext_size >> (is_uv ? ss_y : 0); const int left = ext_size >> (is_uv ? ss_x : 0); const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left, bottom, right, 0, ybf->crop_heights[is_uv]); } return; } #endif for (int plane = 0; plane < num_planes; ++plane) { const int is_uv = plane > 0; const int top = ext_size >> (is_uv ? ss_y : 0); const int left = ext_size >> (is_uv ? 
ss_x : 0); const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; extend_plane(ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left, bottom, right, 0, ybf->crop_heights[is_uv]); } } void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { extend_frame(ybf, ybf->border, num_planes); } #if CONFIG_AV1_HIGHBITDEPTH static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *src = CONVERT_TO_SHORTPTR(src8); memcpy(dst, src, num * sizeof(uint16_t)); } #endif // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, const int num_planes) { assert(src_bc->y_width == dst_bc->y_width); assert(src_bc->y_height == dst_bc->y_height); #if CONFIG_AV1_HIGHBITDEPTH assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) == (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH)); if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { for (int plane = 0; plane < num_planes; ++plane) { const uint8_t *plane_src = src_bc->buffers[plane]; uint8_t *plane_dst = dst_bc->buffers[plane]; const int is_uv = plane > 0; for (int row = 0; row < src_bc->heights[is_uv]; ++row) { memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]); plane_src += src_bc->strides[is_uv]; plane_dst += dst_bc->strides[is_uv]; } } aom_yv12_extend_frame_borders_c(dst_bc, num_planes); return; } #endif for (int plane = 0; plane < num_planes; ++plane) { const uint8_t *plane_src = src_bc->buffers[plane]; uint8_t *plane_dst = dst_bc->buffers[plane]; const int is_uv = plane > 0; for (int row = 0; row < src_bc->heights[is_uv]; ++row) { memcpy(plane_dst, plane_src, src_bc->widths[is_uv]); plane_src += src_bc->strides[is_uv]; plane_dst += dst_bc->strides[is_uv]; } } aom_yv12_extend_frame_borders_c(dst_bc, num_planes); } void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int use_crop) { int row; int width = use_crop ? src_ybc->y_crop_width : src_ybc->y_width; int height = use_crop ? src_ybc->y_crop_height : src_ybc->y_height; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (row = 0; row < height; ++row) { memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_ybc->y_stride; dst16 += dst_ybc->y_stride; } return; } #endif for (row = 0; row < height; ++row) { memcpy(dst, src, width); src += src_ybc->y_stride; dst += dst_ybc->y_stride; } } void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; int height = use_crop ? 
src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->u_buffer; uint8_t *dst = dst_bc->u_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (row = 0; row < height; ++row) { memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif for (row = 0; row < height; ++row) { memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; int height = use_crop ? src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->v_buffer; uint8_t *dst = dst_bc->v_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (row = 0; row < height; ++row) { memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif for (row = 0; row < height; ++row) { memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, int hstart1, int hend1, int vstart1, int vend1, YV12_BUFFER_CONFIG *dst_ybc, int hstart2, int vstart2) { int row; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); src16 += src_ybc->y_stride; dst16 += dst_ybc->y_stride; } return; } #endif src = (src + vstart1 * src_ybc->y_stride + hstart1); dst = (dst + vstart2 * dst_ybc->y_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst, src, (hend1 - hstart1)); src += src_ybc->y_stride; dst += dst_ybc->y_stride; } } void aom_yv12_partial_coloc_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend) { aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc, hstart, vstart); } void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, int hend1, int vstart1, int vend1, YV12_BUFFER_CONFIG *dst_bc, int hstart2, int vstart2) { int row; const uint8_t *src = src_bc->u_buffer; uint8_t *dst = dst_bc->u_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif src = (src + vstart1 * src_bc->uv_stride + hstart1); dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst, src, (hend1 - hstart1)); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_partial_coloc_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend, int vstart, int vend) { 
aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, vstart); } void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, int hend1, int vstart1, int vend1, YV12_BUFFER_CONFIG *dst_bc, int hstart2, int vstart2) { int row; const uint8_t *src = src_bc->v_buffer; uint8_t *dst = dst_bc->v_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif src = (src + vstart1 * src_bc->uv_stride + hstart1); dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); for (row = vstart1; row < vend1; ++row) { memcpy(dst, src, (hend1 - hstart1)); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend, int vstart, int vend) { aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, vstart); } int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes) { if (ybf) { if (new_border == ybf->border) return 0; YV12_BUFFER_CONFIG new_buf; memset(&new_buf, 0, sizeof(new_buf)); const int error = aom_alloc_frame_buffer( &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, byte_alignment, alloc_pyramid, 0); if (error) return error; // Copy image buffer aom_yv12_copy_frame(ybf, &new_buf, num_planes); // Extend up to new border aom_extend_frame_borders(&new_buf, num_planes); // Now free the old buffer and replace with the new aom_free_frame_buffer(ybf); memcpy(ybf, &new_buf, sizeof(new_buf)); return 0; } return -2; } aom-3.12.1/aom_scale/yv12config.h000066400000000000000000000157751477627663500165030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_SCALE_YV12CONFIG_H_ #define AOM_AOM_SCALE_YV12CONFIG_H_ #ifdef __cplusplus extern "C" { #endif #include #include "config/aom_config.h" #include "aom/aom_codec.h" #include "aom/aom_frame_buffer.h" #include "aom/aom_integer.h" #include "aom/internal/aom_image_internal.h" /*!\cond */ #define AOMINNERBORDERINPIXELS 160 #define AOM_INTERP_EXTEND 4 #define AOM_BORDER_IN_PIXELS 288 #define AOM_ENC_NO_SCALE_BORDER 160 #define AOM_ENC_ALLINTRA_BORDER 64 #define AOM_DEC_BORDER_IN_PIXELS 64 #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY struct image_pyramid; struct corner_list; #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY /*!\endcond */ /*! * \brief YV12 frame buffer data structure */ typedef struct yv12_buffer_config { /*!\cond */ union { struct { // The aligned frame width of luma. 
// It is aligned to a multiple of 8: // y_width = (y_crop_width + 7) & ~7 int y_width; // The aligned frame width of chroma. // uv_width = y_width >> subsampling_x int uv_width; }; int widths[2]; }; union { struct { // The aligned frame height of luma. // It is aligned to a multiple of 8: // y_height = (y_crop_height + 7) & ~7 int y_height; // The aligned frame height of chroma. // uv_height = y_height >> subsampling_y int uv_height; }; int heights[2]; }; // The frame size en/decoded by AV1 union { struct { int y_crop_width; int uv_crop_width; }; int crop_widths[2]; }; union { struct { int y_crop_height; int uv_crop_height; }; int crop_heights[2]; }; union { struct { int y_stride; int uv_stride; }; int strides[2]; }; union { struct { uint8_t *y_buffer; uint8_t *u_buffer; uint8_t *v_buffer; }; uint8_t *buffers[3]; }; // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally // allocated memory or external buffers. int use_external_reference_buffers; // This is needed to store y_buffer, u_buffer, and v_buffer when set reference // uses an external refernece, and restore those buffer pointers after the // external reference frame is no longer used. uint8_t *store_buf_adr[3]; // Global motion search data #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY // 8-bit downsampling pyramid for the Y plane struct image_pyramid *y_pyramid; struct corner_list *corners; #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY uint8_t *buffer_alloc; size_t buffer_alloc_sz; int border; size_t frame_size; int subsampling_x; int subsampling_y; unsigned int bit_depth; aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; uint8_t monochrome; aom_chroma_sample_position_t chroma_sample_position; aom_color_range_t color_range; int render_width; int render_height; int corrupted; int flags; aom_metadata_array_t *metadata; /*!\endcond */ } YV12_BUFFER_CONFIG; /*!\cond */ #define YV12_FLAG_HIGHBITDEPTH 8 // Allocate a frame buffer // // If ybf currently contains an image, all associated memory will be freed and // then reallocated. In contrast, aom_realloc_frame_buffer() will reuse any // existing allocations where possible. So, if ybf is likely to already be // set up, please consider aom_realloc_frame_buffer() instead. // // See aom_realloc_frame_buffer() for the meanings of the arguments, and // available return values. int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only); // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must // be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not // NULL, then libaom is using the frame buffer callbacks to handle memory. // If cb is not NULL, libaom will call cb with minimum size in bytes needed // to decode the current frame. If cb is NULL, libaom will allocate memory // internally to decode the current frame. // // If alloc_pyramid is true, then an image pyramid will be allocated // for use in global motion estimation. This is only needed if this frame // buffer will be used to store a source frame or a reference frame in // the encoder. Any other framebuffers (eg, intermediates for filtering, // or any buffer in the decoder) can set alloc_pyramid = false. // // Returns 0 on success. Returns < 0 on failure. 
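//
// A minimal usage sketch (editor's addition; the dimensions, flags and error
// handling below are illustrative assumptions, not code from this library):
//
//   YV12_BUFFER_CONFIG buf;
//   memset(&buf, 0, sizeof(buf));
//   if (aom_realloc_frame_buffer(&buf, 640, 360, /*ss_x=*/1, /*ss_y=*/1,
//                                /*use_highbitdepth=*/0, AOM_BORDER_IN_PIXELS,
//                                /*byte_alignment=*/0, /*fb=*/NULL,
//                                /*cb=*/NULL, /*cb_priv=*/NULL,
//                                /*alloc_pyramid=*/false,
//                                /*alloc_y_plane_only=*/0) < 0) {
//     // handle allocation failure
//   }
//   // ... use buf.y_buffer / buf.u_buffer / buf.v_buffer ...
//   aom_free_frame_buffer(&buf);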
int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, bool alloc_pyramid, int alloc_y_plane_only); int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); /*!\endcond */ /*!\brief Removes metadata from YUV_BUFFER_CONFIG struct. * * Frees metadata in frame buffer. * Frame buffer metadata pointer will be set to NULL. * * \param[in] ybf Frame buffer struct pointer */ void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf); /*!\brief Copy metadata to YUV_BUFFER_CONFIG struct. * * Copies metadata to frame buffer. * Frame buffer will clear any previous metadata and will reallocate the * metadata array to the new metadata size. Then, it will copy the new metadata * array into it. * If arr metadata pointer points to the same address as current metadata in the * frame buffer, function will do nothing and return 0. * Returns 0 on success or -1 on failure. * * \param[in] ybf Frame buffer struct pointer * \param[in] arr Metadata array struct pointer */ int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, const aom_metadata_array_t *arr); /*!\brief Calculate the stride required for the image. * * Calculates the stride value for an image from aligned width and border. * Returns the y stride value. * * \param[in] aligned_width Aligned width of the image * \param[in] border Border in pixels */ static inline int aom_calc_y_stride(int aligned_width, int border) { return ((aligned_width + 2 * border) + 31) & ~31; } #ifdef __cplusplus } #endif #endif // AOM_AOM_SCALE_YV12CONFIG_H_ aom-3.12.1/aom_util/000077500000000000000000000000001477627663500142125ustar00rootroot00000000000000aom-3.12.1/aom_util/aom_pthread.h000066400000000000000000000123561477627663500166550ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // // pthread.h wrapper #ifndef AOM_AOM_UTIL_AOM_PTHREAD_H_ #define AOM_AOM_UTIL_AOM_PTHREAD_H_ #include "config/aom_config.h" #if CONFIG_MULTITHREAD #ifdef __cplusplus extern "C" { #endif #if defined(_WIN32) && !HAVE_PTHREAD_H // Prevent leaking max/min macros. #undef NOMINMAX #define NOMINMAX #undef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT typedef HANDLE pthread_t; typedef int pthread_attr_t; typedef CRITICAL_SECTION pthread_mutex_t; #if _WIN32_WINNT < 0x0600 #error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. 
#endif typedef CONDITION_VARIABLE pthread_cond_t; #ifndef WINAPI_FAMILY_PARTITION #define WINAPI_PARTITION_DESKTOP 1 #define WINAPI_FAMILY_PARTITION(x) x #endif #if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) #define USE_CREATE_THREAD #endif //------------------------------------------------------------------------------ // simplistic pthread emulation layer // _beginthreadex requires __stdcall #if defined(__GNUC__) && \ (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) #define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall #else #define THREADFN unsigned int __stdcall #endif #define THREAD_EXIT_SUCCESS 0 static inline int pthread_attr_init(pthread_attr_t *attr) { (void)attr; return 0; } static inline int pthread_attr_destroy(pthread_attr_t *attr) { (void)attr; return 0; } static inline int pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize) { (void)attr; (void)stacksize; return EINVAL; } static inline int pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize) { (void)attr; (void)stacksize; return EINVAL; } static inline int pthread_create(pthread_t *const thread, const pthread_attr_t *attr, unsigned int(__stdcall *start)(void *), void *arg) { (void)attr; #ifdef USE_CREATE_THREAD *thread = CreateThread(NULL, /* lpThreadAttributes */ 0, /* dwStackSize */ start, arg, 0, /* dwStackSize */ NULL); /* lpThreadId */ #else *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ 0, /* unsigned stack_size */ start, arg, 0, /* unsigned initflag */ NULL); /* unsigned *thrdaddr */ #endif if (*thread == NULL) return 1; SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); return 0; } static inline int pthread_join(pthread_t thread, void **value_ptr) { (void)value_ptr; return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != WAIT_OBJECT_0 || CloseHandle(thread) == 0); } // Mutex static inline int pthread_mutex_init(pthread_mutex_t *const mutex, void *mutexattr) { (void)mutexattr; InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); return 0; } static inline int pthread_mutex_trylock(pthread_mutex_t *const mutex) { return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; } static inline int pthread_mutex_lock(pthread_mutex_t *const mutex) { EnterCriticalSection(mutex); return 0; } static inline int pthread_mutex_unlock(pthread_mutex_t *const mutex) { LeaveCriticalSection(mutex); return 0; } static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) { DeleteCriticalSection(mutex); return 0; } // Condition static inline int pthread_cond_destroy(pthread_cond_t *const condition) { (void)condition; return 0; } static inline int pthread_cond_init(pthread_cond_t *const condition, void *cond_attr) { (void)cond_attr; InitializeConditionVariable(condition); return 0; } static inline int pthread_cond_signal(pthread_cond_t *const condition) { WakeConditionVariable(condition); return 0; } static inline int pthread_cond_broadcast(pthread_cond_t *const condition) { WakeAllConditionVariable(condition); return 0; } static inline int pthread_cond_wait(pthread_cond_t *const condition, pthread_mutex_t *const mutex) { int ok; ok = SleepConditionVariableCS(condition, mutex, INFINITE); return !ok; } #else // _WIN32 #include // NOLINT #define THREADFN void * #define THREAD_EXIT_SUCCESS NULL #endif #ifdef __cplusplus } // extern "C" #endif #endif // CONFIG_MULTITHREAD #endif // AOM_AOM_UTIL_AOM_PTHREAD_H_ aom-3.12.1/aom_util/aom_thread.c000066400000000000000000000204541477627663500164660ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // // Multi-threaded worker // // Original source: // https://chromium.googlesource.com/webm/libwebp // Enable GNU extensions in glibc so that we can call pthread_setname_np(). // This must be before any #include statements. #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include // for memset() #include "config/aom_config.h" #include "aom_mem/aom_mem.h" #include "aom_ports/sanitizer.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if CONFIG_MULTITHREAD struct AVxWorkerImpl { pthread_mutex_t mutex_; pthread_cond_t condition_; pthread_t thread_; }; //------------------------------------------------------------------------------ static void execute(AVxWorker *const worker); // Forward declaration. static THREADFN thread_loop(void *ptr) { AVxWorker *const worker = (AVxWorker *)ptr; #ifdef __APPLE__ if (worker->thread_name != NULL) { // Apple's version of pthread_setname_np takes one argument and operates on // the current thread only. The maximum size of the thread_name buffer was // noted in the Chromium source code and was confirmed by experiments. If // thread_name is too long, pthread_setname_np returns -1 with errno // ENAMETOOLONG (63). char thread_name[64]; strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); thread_name[sizeof(thread_name) - 1] = '\0'; pthread_setname_np(thread_name); } #elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__) if (worker->thread_name != NULL) { // Linux and Android require names (with nul) fit in 16 chars, otherwise // pthread_setname_np() returns ERANGE (34). 
char thread_name[16]; strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); thread_name[sizeof(thread_name) - 1] = '\0'; pthread_setname_np(pthread_self(), thread_name); } #endif pthread_mutex_lock(&worker->impl_->mutex_); for (;;) { while (worker->status_ == AVX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } if (worker->status_ == AVX_WORKER_STATUS_WORKING) { // When worker->status_ is AVX_WORKER_STATUS_WORKING, the main thread // doesn't change worker->status_ and will wait until the worker changes // worker->status_ to AVX_WORKER_STATUS_OK. See change_state(). So the // worker can safely call execute() without holding worker->impl_->mutex_. // When the worker reacquires worker->impl_->mutex_, worker->status_ must // still be AVX_WORKER_STATUS_WORKING. pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); pthread_mutex_lock(&worker->impl_->mutex_); assert(worker->status_ == AVX_WORKER_STATUS_WORKING); worker->status_ = AVX_WORKER_STATUS_OK; // signal to the main thread that we're done (for sync()) pthread_cond_signal(&worker->impl_->condition_); } else { assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); // finish the worker break; } } pthread_mutex_unlock(&worker->impl_->mutex_); return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) { // No-op when attempting to change state on a thread that didn't come up. // Checking status_ without acquiring the lock first would result in a data // race. if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); if (worker->status_ >= AVX_WORKER_STATUS_OK) { // wait for the worker to finish while (worker->status_ != AVX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed if (new_status != AVX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } } pthread_mutex_unlock(&worker->impl_->mutex_); } #endif // CONFIG_MULTITHREAD //------------------------------------------------------------------------------ static void init(AVxWorker *const worker) { memset(worker, 0, sizeof(*worker)); worker->status_ = AVX_WORKER_STATUS_NOT_OK; } static int sync(AVxWorker *const worker) { #if CONFIG_MULTITHREAD change_state(worker, AVX_WORKER_STATUS_OK); #endif assert(worker->status_ <= AVX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(AVxWorker *const worker) { int ok = 1; worker->had_error = 0; if (worker->status_ < AVX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { return 0; } if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { goto Error; } if (pthread_cond_init(&worker->impl_->condition_, NULL)) { pthread_mutex_destroy(&worker->impl_->mutex_); goto Error; } pthread_attr_t attr; if (pthread_attr_init(&attr)) goto Error2; // Debug ASan builds require at least ~1MiB of stack; prevents // failures on macOS arm64 where the default is 512KiB. 
// See: https://crbug.com/aomedia/3379 #if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \ !defined(NDEBUG) const size_t kMinStackSize = 1024 * 1024; #else const size_t kMinStackSize = 256 * 1024; #endif size_t stacksize; if (!pthread_attr_getstacksize(&attr, &stacksize)) { if (stacksize < kMinStackSize && pthread_attr_setstacksize(&attr, kMinStackSize)) { pthread_attr_destroy(&attr); goto Error2; } } pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); if (ok) worker->status_ = AVX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); pthread_attr_destroy(&attr); if (!ok) { Error2: pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); Error: aom_free(worker->impl_); worker->impl_ = NULL; return 0; } #else worker->status_ = AVX_WORKER_STATUS_OK; #endif } else if (worker->status_ > AVX_WORKER_STATUS_OK) { ok = sync(worker); } assert(!ok || (worker->status_ == AVX_WORKER_STATUS_OK)); return ok; } static void execute(AVxWorker *const worker) { if (worker->hook != NULL) { worker->had_error |= !worker->hook(worker->data1, worker->data2); } } static void launch(AVxWorker *const worker) { #if CONFIG_MULTITHREAD change_state(worker, AVX_WORKER_STATUS_WORKING); #else execute(worker); #endif } static void end(AVxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { change_state(worker, AVX_WORKER_STATUS_NOT_OK); pthread_join(worker->impl_->thread_, NULL); pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); aom_free(worker->impl_); worker->impl_ = NULL; } #else worker->status_ = AVX_WORKER_STATUS_NOT_OK; assert(worker->impl_ == NULL); #endif assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); } //------------------------------------------------------------------------------ static AVxWorkerInterface g_worker_interface = { init, reset, sync, launch, execute, end }; int aom_set_worker_interface(const AVxWorkerInterface *const winterface) { if (winterface == NULL || winterface->init == NULL || winterface->reset == NULL || winterface->sync == NULL || winterface->launch == NULL || winterface->execute == NULL || winterface->end == NULL) { return 0; } g_worker_interface = *winterface; return 1; } const AVxWorkerInterface *aom_get_worker_interface(void) { return &g_worker_interface; } //------------------------------------------------------------------------------ aom-3.12.1/aom_util/aom_thread.h000066400000000000000000000076571477627663500165050ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ // // Multi-threaded worker // // Original source: // https://chromium.googlesource.com/webm/libwebp #ifndef AOM_AOM_UTIL_AOM_THREAD_H_ #define AOM_AOM_UTIL_AOM_THREAD_H_ #ifdef __cplusplus extern "C" { #endif // State of the worker thread object typedef enum { AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable AVX_WORKER_STATUS_OK, // ready to work AVX_WORKER_STATUS_WORKING // busy finishing the current task } AVxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as // arguments (data1 and data2). Should return true on success and return false // in case of error. typedef int (*AVxWorkerHook)(void *, void *); // Platform-dependent implementation details for the worker. typedef struct AVxWorkerImpl AVxWorkerImpl; // Synchronization object used to launch job in the worker thread typedef struct { AVxWorkerImpl *impl_; AVxWorkerStatus status_; // Thread name for the debugger. If not NULL, must point to a string that // outlives the worker thread. For portability, use a name <= 15 characters // long (not including the terminating NUL character). const char *thread_name; AVxWorkerHook hook; // hook to call void *data1; // first argument passed to 'hook' void *data2; // second argument passed to 'hook' int had_error; // true if a call to 'hook' returned false } AVxWorker; // The interface for all thread-worker related functions. All these functions // must be implemented. typedef struct { // Must be called first, before any other method. void (*init)(AVxWorker *const worker); // Must be called to initialize the object and spawn the thread. Re-entrant. // Will potentially launch the thread. Returns false in case of error. int (*reset)(AVxWorker *const worker); // Makes sure the previous work is finished. Returns true if worker->had_error // was not set and no error condition was triggered by the working thread. int (*sync)(AVxWorker *const worker); // Triggers the thread to call hook() with data1 and data2 arguments. These // hook/data1/data2 values can be changed at any time before calling this // function, but not be changed afterward until the next call to Sync(). void (*launch)(AVxWorker *const worker); // This function is similar to launch() except that it calls the // hook directly instead of using a thread. Convenient to bypass the thread // mechanism while still using the AVxWorker structs. sync() must // still be called afterward (for error reporting). void (*execute)(AVxWorker *const worker); // Kill the thread and terminate the object. To use the object again, one // must call reset() again. void (*end)(AVxWorker *const worker); } AVxWorkerInterface; // Install a new set of threading functions, overriding the defaults. This // should be done before any workers are started, i.e., before any encoding or // decoding takes place. The contents of the interface struct are copied, it // is safe to free the corresponding memory after this call. This function is // not thread-safe. Return false in case of invalid pointer or methods. int aom_set_worker_interface(const AVxWorkerInterface *const winterface); // Retrieve the currently set thread worker interface. const AVxWorkerInterface *aom_get_worker_interface(void); //------------------------------------------------------------------------------ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_UTIL_AOM_THREAD_H_ aom-3.12.1/aom_util/aom_util.cmake000066400000000000000000000026561477627663500170360ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
# # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_) return() endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_ set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1) list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_pthread.h" "${AOM_ROOT}/aom_util/aom_thread.c" "${AOM_ROOT}/aom_util/aom_thread.h" "${AOM_ROOT}/aom_util/endian_inl.h") if(CONFIG_BITSTREAM_DEBUG) list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/debug_util.c" "${AOM_ROOT}/aom_util/debug_util.h") endif() # Creates the aom_util build target and makes libaom depend on it. The libaom # target must exist before this function is called. function(setup_aom_util_targets) add_library(aom_util OBJECT ${AOM_UTIL_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endfunction() aom-3.12.1/aom_util/debug_util.c000066400000000000000000000235031477627663500165040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include "aom_util/debug_util.h" static int frame_idx_w = 0; static int frame_idx_r = 0; void aom_bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; } int aom_bitstream_queue_get_frame_write(void) { return frame_idx_w; } void aom_bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; } int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; } #if CONFIG_BITSTREAM_DEBUG #define QUEUE_MAX_SIZE 4000000 static int result_queue[QUEUE_MAX_SIZE]; static int nsymbs_queue[QUEUE_MAX_SIZE]; static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16]; static int queue_r = 0; static int queue_w = 0; static int queue_prev_w = -1; static int skip_r = 0; static int skip_w = 0; void bitstream_queue_set_skip_write(int skip) { skip_w = skip; } void bitstream_queue_set_skip_read(int skip) { skip_r = skip; } void bitstream_queue_record_write(void) { queue_prev_w = queue_w; } void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; } int bitstream_queue_get_write(void) { return queue_w; } int bitstream_queue_get_read(void) { return queue_r; } void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) { if (!skip_r) { if (queue_w == queue_r) { printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r); assert(0); } *result = result_queue[queue_r]; *nsymbs = nsymbs_queue[queue_r]; memcpy(cdf, cdf_queue[queue_r], *nsymbs * sizeof(*cdf)); queue_r = (queue_r + 1) % QUEUE_MAX_SIZE; } } void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) { // If you observe a CDF error: // - Set 'debug_cdf_mismatch' to true // - Set target_frame_idx_r and target_queue_r to where CDF error was reported // - Set a breakpoint in debugger at the 'fprintf' below. const bool debug_cdf_mismatch = false; if (debug_cdf_mismatch) { int target_frame_idx_r = 1; int target_queue_r = 18005; if (frame_idx_w == target_frame_idx_r && queue_w == target_queue_r) { fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", frame_idx_w, queue_w); } } if (!skip_w) { result_queue[queue_w] = result; nsymbs_queue[queue_w] = nsymbs; memcpy(cdf_queue[queue_w], cdf, nsymbs * sizeof(*cdf)); queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; if (queue_w == queue_r) { printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); assert(0); } } } #endif // CONFIG_BITSTREAM_DEBUG #if CONFIG_MISMATCH_DEBUG static int frame_buf_idx_r = 0; static int frame_buf_idx_w = 0; static int max_frame_buf_num = 5; #define MAX_FRAME_STRIDE 1280 #define MAX_FRAME_HEIGHT 720 static uint16_t frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only static uint16_t frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm static int frame_stride = MAX_FRAME_STRIDE; static int frame_height = MAX_FRAME_HEIGHT; static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; void mismatch_move_frame_idx_w(void) { frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf overflow\n"); assert(0); } } void mismatch_reset_frame(int num_planes) { for (int plane = 0; plane < num_planes; ++plane) { memset(frame_pre[frame_buf_idx_w][plane], 0, sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); memset(frame_tx[frame_buf_idx_w][plane], 0, sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); } } void mismatch_move_frame_idx_r(void) { if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf underflow\n"); assert(0); } frame_buf_idx_r = 
(frame_buf_idx_r + 1) % max_frame_buf_num; } void mismatch_record_block_pre(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd) { if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { printf("frame_buf undersized\n"); assert(0); } const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; for (int r = 0; r < blk_h; ++r) { for (int c = 0; c < blk_w; ++c) { frame_pre[frame_buf_idx_w][plane] [(r + pixel_r) * frame_stride + c + pixel_c] = src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; } } #if 0 int ref_frame_idx = 3; int ref_frame_offset = 4; int ref_plane = 1; int ref_pixel_c = 162; int ref_pixel_r = 16; if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { printf( "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " "%d blk_h %d\n", frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); } #endif } void mismatch_record_block_tx(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd) { if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { printf("frame_buf undersized\n"); assert(0); } const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; for (int r = 0; r < blk_h; ++r) { for (int c = 0; c < blk_w; ++c) { frame_tx[frame_buf_idx_w][plane] [(r + pixel_r) * frame_stride + c + pixel_c] = src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; } } #if 0 int ref_frame_idx = 3; int ref_frame_offset = 4; int ref_plane = 1; int ref_pixel_c = 162; int ref_pixel_r = 16; if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { printf( "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " "%d blk_h %d\n", frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); } #endif } void mismatch_check_block_pre(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd) { if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { printf("frame_buf undersized\n"); assert(0); } const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; int mismatch = 0; for (int r = 0; r < blk_h; ++r) { for (int c = 0; c < blk_w; ++c) { if (frame_pre[frame_buf_idx_r][plane] [(r + pixel_r) * frame_stride + c + pixel_c] != (uint16_t)(src16 ? src16[r * src_stride + c] : src[r * src_stride + c])) { mismatch = 1; } } } if (mismatch) { printf( "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d " "pixel_c %d pixel_r " "%d blk_w %d blk_h %d\n", frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); printf("enc\n"); for (int rr = 0; rr < blk_h; ++rr) { for (int cc = 0; cc < blk_w; ++cc) { printf("%d ", frame_pre[frame_buf_idx_r][plane] [(rr + pixel_r) * frame_stride + cc + pixel_c]); } printf("\n"); } printf("dec\n"); for (int rr = 0; rr < blk_h; ++rr) { for (int cc = 0; cc < blk_w; ++cc) { printf("%d ", src16 ? 
src16[rr * src_stride + cc] : src[rr * src_stride + cc]); } printf("\n"); } assert(0); } } void mismatch_check_block_tx(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd) { if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { printf("frame_buf undersized\n"); assert(0); } const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; int mismatch = 0; for (int r = 0; r < blk_h; ++r) { for (int c = 0; c < blk_w; ++c) { if (frame_tx[frame_buf_idx_r][plane] [(r + pixel_r) * frame_stride + c + pixel_c] != (uint16_t)(src16 ? src16[r * src_stride + c] : src[r * src_stride + c])) { mismatch = 1; } } } if (mismatch) { printf( "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c " "%d pixel_r " "%d blk_w %d blk_h %d\n", frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); printf("enc\n"); for (int rr = 0; rr < blk_h; ++rr) { for (int cc = 0; cc < blk_w; ++cc) { printf("%d ", frame_tx[frame_buf_idx_r][plane] [(rr + pixel_r) * frame_stride + cc + pixel_c]); } printf("\n"); } printf("dec\n"); for (int rr = 0; rr < blk_h; ++rr) { for (int cc = 0; cc < blk_w; ++cc) { printf("%d ", src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); } printf("\n"); } assert(0); } } #endif // CONFIG_MISMATCH_DEBUG aom-3.12.1/aom_util/debug_util.h000066400000000000000000000057511477627663500165160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_ #define AOM_AOM_UTIL_DEBUG_UTIL_H_ #include "config/aom_config.h" #include "aom_dsp/prob.h" #ifdef __cplusplus extern "C" { #endif void aom_bitstream_queue_set_frame_write(int frame_idx); int aom_bitstream_queue_get_frame_write(void); void aom_bitstream_queue_set_frame_read(int frame_idx); int aom_bitstream_queue_get_frame_read(void); #if CONFIG_BITSTREAM_DEBUG /* This is a debug tool used to detect bitstream error. On encoder side, it * pushes each bit and probability into a queue before the bit is written into * the Arithmetic coder. On decoder side, whenever a bit is read out from the * Arithmetic coder, it pops out the reference bit and probability from the * queue as well. If the two results do not match, this debug tool will report * an error. This tool can be used to pin down the bitstream error precisely. * By combining gdb's backtrace method, we can detect which module causes the * bitstream error. 
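 *
 * A rough usage sketch (illustrative only; the symbol/CDF variable names here
 * are hypothetical, and the real call sites live in the entropy-coding code):
 *
 *   // Encoder side, just before a symbol is entropy coded:
 *   bitstream_queue_push(symbol, cdf, nsymbs);
 *
 *   // Decoder side, after the corresponding symbol has been read back:
 *   int ref_symbol, ref_nsymbs;
 *   aom_cdf_prob ref_cdf[16];
 *   bitstream_queue_pop(&ref_symbol, ref_cdf, &ref_nsymbs);
 *   // ref_symbol/ref_cdf are then compared against the decoded symbol and
 *   // its CDF; any mismatch is reported as a bitstream error.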
*/ int bitstream_queue_get_write(void); int bitstream_queue_get_read(void); void bitstream_queue_record_write(void); void bitstream_queue_reset_write(void); void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs); void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs); void bitstream_queue_set_skip_write(int skip); void bitstream_queue_set_skip_read(int skip); #endif // CONFIG_BITSTREAM_DEBUG #if CONFIG_MISMATCH_DEBUG void mismatch_move_frame_idx_w(); void mismatch_move_frame_idx_r(); void mismatch_reset_frame(int num_planes); void mismatch_record_block_pre(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd); void mismatch_record_block_tx(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd); void mismatch_check_block_pre(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd); void mismatch_check_block_tx(const uint8_t *src, int src_stride, int frame_offset, int plane, int pixel_c, int pixel_r, int blk_w, int blk_h, int highbd); #endif // CONFIG_MISMATCH_DEBUG #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AOM_UTIL_DEBUG_UTIL_H_ aom-3.12.1/aom_util/endian_inl.h000066400000000000000000000063741477627663500164750ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // // Endian related functions. #ifndef AOM_AOM_UTIL_ENDIAN_INL_H_ #define AOM_AOM_UTIL_ENDIAN_INL_H_ #include #include "config/aom_config.h" #include "aom/aom_integer.h" #if defined(__GNUC__) #define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) #define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) #else #define LOCAL_GCC_VERSION 0 #define LOCAL_GCC_PREREQ(maj, min) 0 #endif // handle clang compatibility #ifndef __has_builtin #define __has_builtin(x) 0 #endif // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) #if !defined(WORDS_BIGENDIAN) && \ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) #define WORDS_BIGENDIAN #endif #if defined(WORDS_BIGENDIAN) #define HToLE32 BSwap32 #define HToLE16 BSwap16 #define HToBE64(x) (x) #define HToBE32(x) (x) #else #define HToLE32(x) (x) #define HToLE16(x) (x) #define HToBE64(X) BSwap64(X) #define HToBE32(X) BSwap32(X) #endif #if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) #define HAVE_BUILTIN_BSWAP16 #endif #if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) #define HAVE_BUILTIN_BSWAP32 #endif #if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) #define HAVE_BUILTIN_BSWAP64 #endif static inline uint16_t BSwap16(uint16_t x) { #if defined(HAVE_BUILTIN_BSWAP16) return __builtin_bswap16(x); #elif defined(_MSC_VER) return _byteswap_ushort(x); #else // gcc will recognize a 'rorw $8, ...' 
here: return (x >> 8) | ((x & 0xff) << 8); #endif // HAVE_BUILTIN_BSWAP16 } static inline uint32_t BSwap32(uint32_t x) { #if defined(HAVE_BUILTIN_BSWAP32) return __builtin_bswap32(x); #elif defined(__i386__) || defined(__x86_64__) uint32_t swapped_bytes; __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x)); return swapped_bytes; #elif defined(_MSC_VER) return (uint32_t)_byteswap_ulong(x); #else return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24); #endif // HAVE_BUILTIN_BSWAP32 } static inline uint64_t BSwap64(uint64_t x) { #if defined(HAVE_BUILTIN_BSWAP64) return __builtin_bswap64(x); #elif defined(__x86_64__) uint64_t swapped_bytes; __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x)); return swapped_bytes; #elif defined(_MSC_VER) return (uint64_t)_byteswap_uint64(x); #else // generic code for swapping 64-bit values (suggested by bdb@) x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32); x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16); x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8); return x; #endif // HAVE_BUILTIN_BSWAP64 } #endif // AOM_AOM_UTIL_ENDIAN_INL_H_ aom-3.12.1/aomedia_logo_200.png000066400000000000000000000156141477627663500161160ustar00rootroot00000000000000[binary PNG image data omitted]
aom-3.12.1/apps/000077500000000000000000000000001477627663500133445ustar00rootroot00000000000000aom-3.12.1/apps/aomdec.c000066400000000000000000001072341477627663500147470ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include #include #include #include #include #include "config/aom_config.h" #if CONFIG_OS_SUPPORT #if HAVE_UNISTD_H #include // NOLINT #elif !defined(STDOUT_FILENO) #define STDOUT_FILENO 1 #endif #endif #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem_ops.h" #include "common/args.h" #include "common/ivfdec.h" #include "common/md5_utils.h" #include "common/obudec.h" #include "common/tools_common.h" #if CONFIG_WEBM_IO #include "common/webmdec.h" #endif #include "common/rawenc.h" #include "common/y4menc.h" #if CONFIG_LIBYUV #include "third_party/libyuv/include/libyuv/scale.h" #endif static const char *exec_name; struct AvxDecInputContext { struct AvxInputContext *aom_input_ctx; struct ObuDecInputContext *obu_ctx; struct WebmInputContext *webm_ctx; }; static const arg_def_t help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames"); static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0, "Output raw I420 frames"); static const arg_def_t flipuvarg = ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output"); static const arg_def_t rawvideo = ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames"); static const arg_def_t noblitarg = ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames"); static const arg_def_t progressarg = ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes"); static const arg_def_t limitarg = ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); static const arg_def_t skiparg = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); static const arg_def_t summaryarg = ARG_DEF(NULL, "summary", 0, "Show timing summary"); static const arg_def_t outputfile = ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1, "Max threads to use"); static const arg_def_t rowmtarg = ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading, default: 0"); static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0, "Show version string"); static const arg_def_t scalearg = ARG_DEF("S", "scale", 0, "Scale output frames uniformly"); static const arg_def_t continuearg = ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error"); static const arg_def_t fb_arg = ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use"); static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); static const arg_def_t outbitdeptharg = ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); static const arg_def_t isannexb = ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format"); static const arg_def_t oppointarg = ARG_DEF( NULL, "oppoint", 1, "Select an operating point of a scalable bitstream"); static const arg_def_t outallarg = ARG_DEF( NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream"); static const arg_def_t skipfilmgrain = ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application"); static const arg_def_t *all_args[] = { &help, &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg, &progressarg, &limitarg, &skiparg, &summaryarg, 
&outputfile, &threadsarg, &rowmtarg, &verbosearg, &scalearg, &fb_arg, &md5arg, &framestatsarg, &continuearg, &outbitdeptharg, &isannexb, &oppointarg, &outallarg, &skipfilmgrain, NULL }; #if CONFIG_LIBYUV // Returns 0 on success and returns -1 on failure. static inline int libyuv_scale(const aom_image_t *src, aom_image_t *dst, FilterModeEnum mode) { if (src->fmt != dst->fmt) { fprintf(stderr, "%s failed to scale output frame because format changed from %s to " "%s\n", exec_name, image_format_to_string(dst->fmt), image_format_to_string(src->fmt)); return -1; } if (src->fmt == AOM_IMG_FMT_I42016) { return I420Scale_16( (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2, (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2, (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2, src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode); } if (src->fmt == AOM_IMG_FMT_I420) { return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], src->d_w, src->d_h, dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], dst->d_w, dst->d_h, mode); } fprintf(stderr, "%s cannot scale output frame of format %s\n", exec_name, image_format_to_string(src->fmt)); return -1; } #endif static void show_help(FILE *fout, int shorthelp) { fprintf(fout, "Usage: %s filename\n\n", exec_name); if (shorthelp) { fprintf(fout, "Use --help to see the full list of options.\n"); return; } fprintf(fout, "Options:\n"); arg_show_usage(fout, all_args); fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%% - Frame number, zero padded to places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. 
If the -o option is " "not specified, the output will be\n directed to stdout.\n"); fprintf(fout, "\nIncluded decoders:\n\n"); for (int i = 0; i < get_aom_decoder_count(); ++i) { aom_codec_iface_t *decoder = get_aom_decoder_by_index(i); fprintf(fout, " %-6s - %s\n", get_short_name_by_aom_decoder(decoder), aom_codec_iface_name(decoder)); } } void usage_exit(void) { show_help(stderr, 1); exit(EXIT_FAILURE); } static int raw_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size) { unsigned char raw_hdr[RAW_FRAME_HDR_SZ]; size_t frame_size = 0; if (read_from_input(input_ctx, RAW_FRAME_HDR_SZ, raw_hdr) != RAW_FRAME_HDR_SZ) { if (!input_eof(input_ctx)) aom_tools_warn("Failed to read RAW frame size\n"); } else { const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; const size_t kFrameTooSmallThreshold = 256 * 1024; frame_size = mem_get_le32(raw_hdr); if (frame_size > kCorruptFrameThreshold) { aom_tools_warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); frame_size = 0; } if (frame_size < kFrameTooSmallThreshold) { aom_tools_warn( "Warning: Read invalid frame size (%u) - not a raw file?\n", (unsigned int)frame_size); } if (frame_size > *buffer_size) { uint8_t *new_buf = realloc(*buffer, 2 * frame_size); if (new_buf) { *buffer = new_buf; *buffer_size = 2 * frame_size; } else { aom_tools_warn("Failed to allocate compressed data buffer\n"); frame_size = 0; } } } if (!input_eof(input_ctx)) { if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) { aom_tools_warn("Failed to read full frame\n"); return 1; } *bytes_read = frame_size; } return 0; } static int read_frame(struct AvxDecInputContext *input, uint8_t **buf, size_t *bytes_in_buffer, size_t *buffer_size) { switch (input->aom_input_ctx->file_type) { #if CONFIG_WEBM_IO case FILE_TYPE_WEBM: return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, buffer_size); #endif case FILE_TYPE_RAW: return raw_read_frame(input->aom_input_ctx, buf, bytes_in_buffer, buffer_size); case FILE_TYPE_IVF: return ivf_read_frame(input->aom_input_ctx, buf, bytes_in_buffer, buffer_size, NULL); case FILE_TYPE_OBU: return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer, buffer_size); default: return 1; } } static int file_is_raw(struct AvxInputContext *input) { uint8_t buf[32]; int is_raw = 0; aom_codec_stream_info_t si; memset(&si, 0, sizeof(si)); if (buffer_input(input, 32, buf, /*buffered=*/true) == 32) { int i; if (mem_get_le32(buf) < 256 * 1024 * 1024) { for (i = 0; i < get_aom_decoder_count(); ++i) { aom_codec_iface_t *decoder = get_aom_decoder_by_index(i); if (!aom_codec_peek_stream_info(decoder, buf + 4, 32 - 4, &si)) { is_raw = 1; input->fourcc = get_fourcc_by_aom_decoder(decoder); input->width = si.w; input->height = si.h; input->framerate.numerator = 30; input->framerate.denominator = 1; break; } } } } rewind_detect(input); return is_raw; } static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { fprintf(stderr, "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", frame_in, frame_out, dx_time, (double)frame_out * 1000000.0 / (double)dx_time); } struct ExternalFrameBuffer { uint8_t *data; size_t size; int in_use; }; struct ExternalFrameBufferList { int num_external_frame_buffers; struct ExternalFrameBuffer *ext_fb; }; // Callback used by libaom to request an external frame buffer. |cb_priv| // Application private data passed into the set function. |min_size| is the // minimum size in bytes needed to decode the next frame. 
|fb| pointer to the // frame buffer. static int get_av1_frame_buffer(void *cb_priv, size_t min_size, aom_codec_frame_buffer_t *fb) { int i; struct ExternalFrameBufferList *const ext_fb_list = (struct ExternalFrameBufferList *)cb_priv; if (ext_fb_list == NULL) return -1; // Find a free frame buffer. for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { if (!ext_fb_list->ext_fb[i].in_use) break; } if (i == ext_fb_list->num_external_frame_buffers) return -1; if (ext_fb_list->ext_fb[i].size < min_size) { free(ext_fb_list->ext_fb[i].data); ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); if (!ext_fb_list->ext_fb[i].data) return -1; ext_fb_list->ext_fb[i].size = min_size; } fb->data = ext_fb_list->ext_fb[i].data; fb->size = ext_fb_list->ext_fb[i].size; ext_fb_list->ext_fb[i].in_use = 1; // Set the frame buffer's private data to point at the external frame buffer. fb->priv = &ext_fb_list->ext_fb[i]; return 0; } // Callback used by libaom when there are no references to the frame buffer. // |cb_priv| user private data passed into the set function. |fb| pointer // to the frame buffer. static int release_av1_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { struct ExternalFrameBuffer *const ext_fb = (struct ExternalFrameBuffer *)fb->priv; (void)cb_priv; ext_fb->in_use = 0; return 0; } static void generate_filename(const char *pattern, char *out, size_t q_len, unsigned int d_w, unsigned int d_h, unsigned int frame_in) { const char *p = pattern; char *q = out; do { char *next_pat = strchr(p, '%'); if (p == next_pat) { size_t pat_len; /* parse the pattern */ q[q_len - 1] = '\0'; switch (p[1]) { case 'w': snprintf(q, q_len - 1, "%d", d_w); break; case 'h': snprintf(q, q_len - 1, "%d", d_h); break; case '1': snprintf(q, q_len - 1, "%d", frame_in); break; case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); if (pat_len >= q_len - 1) die("Output filename too long.\n"); q += pat_len; p += 2; q_len -= pat_len; } else { size_t copy_len; /* copy the next segment */ if (!next_pat) copy_len = strlen(p); else copy_len = next_pat - p; if (copy_len >= q_len - 1) die("Output filename too long.\n"); memcpy(q, p, copy_len); q[copy_len] = '\0'; q += copy_len; p += copy_len; q_len -= copy_len; } } while (*p); } static int is_single_file(const char *outfile_pattern) { const char *p = outfile_pattern; do { p = strchr(p, '%'); if (p && p[1] >= '1' && p[1] <= '9') return 0; // pattern contains sequence number, so it's not unique if (p) p++; } while (p); return 1; } static void print_md5(unsigned char digest[16], const char *filename) { int i; for (i = 0; i < 16; ++i) printf("%02x", digest[i]); printf(" %s\n", filename); } static FILE *open_outfile(const char *name) { if (strcmp("-", name) == 0) { set_binary_mode(stdout); return stdout; } else { FILE *file = fopen(name, "wb"); if (!file) fatal("Failed to open output file '%s'", name); return file; } } static int main_loop(int argc, const char **argv_) { aom_codec_ctx_t decoder; char *fn = NULL; int i; int ret = EXIT_FAILURE; uint8_t *buf = NULL; 
size_t bytes_in_buffer = 0, buffer_size = 0; FILE *infile; int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; int do_md5 = 0, progress = 0; int stop_after = 0, summary = 0, quiet = 1; int arg_skip = 0; int keep_going = 0; uint64_t dx_time = 0; struct arg arg; char **argv, **argi, **argj; int single_file; int use_y4m = 1; int opt_yv12 = 0; int opt_i420 = 0; int opt_raw = 0; aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; unsigned int fixed_output_bit_depth = 0; unsigned int is_annexb = 0; int frames_corrupted = 0; int dec_flags = 0; int do_scale = 0; int operating_point = 0; int output_all_layers = 0; int skip_film_grain = 0; int enable_row_mt = 0; aom_image_t *scaled_img = NULL; aom_image_t *img_shifted = NULL; int frame_avail, got_data, flush_decoder = 0; int num_external_frame_buffers = 0; struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; const char *outfile_pattern = NULL; char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; FILE *framestats_file = NULL; MD5Context md5_ctx; unsigned char md5_digest[16]; struct AvxDecInputContext input = { NULL, NULL, NULL }; struct AvxInputContext aom_input_ctx; memset(&aom_input_ctx, 0, sizeof(aom_input_ctx)); #if CONFIG_WEBM_IO struct WebmInputContext webm_ctx; memset(&webm_ctx, 0, sizeof(webm_ctx)); input.webm_ctx = &webm_ctx; #endif struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 }; int is_ivf = 0; obu_ctx.avx_ctx = &aom_input_ctx; input.obu_ctx = &obu_ctx; input.aom_input_ctx = &aom_input_ctx; /* Parse command line */ exec_name = argv_[0]; argv = argv_dup(argc - 1, argv_ + 1); if (!argv) { fprintf(stderr, "Error allocating argument list\n"); return EXIT_FAILURE; } aom_codec_iface_t *interface = NULL; for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; if (arg_match(&arg, &help, argi)) { show_help(stdout, 0); exit(EXIT_SUCCESS); } else if (arg_match(&arg, &codecarg, argi)) { interface = get_aom_decoder_by_short_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); } else if (arg_match(&arg, &looparg, argi)) { // no-op } else if (arg_match(&arg, &outputfile, argi)) { outfile_pattern = arg.val; } else if (arg_match(&arg, &use_yv12, argi)) { use_y4m = 0; flipuv = 1; opt_yv12 = 1; opt_i420 = 0; opt_raw = 0; } else if (arg_match(&arg, &use_i420, argi)) { use_y4m = 0; flipuv = 0; opt_yv12 = 0; opt_i420 = 1; opt_raw = 0; } else if (arg_match(&arg, &rawvideo, argi)) { use_y4m = 0; opt_yv12 = 0; opt_i420 = 0; opt_raw = 1; } else if (arg_match(&arg, &flipuvarg, argi)) { flipuv = 1; } else if (arg_match(&arg, &noblitarg, argi)) { noblit = 1; } else if (arg_match(&arg, &progressarg, argi)) { progress = 1; } else if (arg_match(&arg, &limitarg, argi)) { stop_after = arg_parse_uint(&arg); } else if (arg_match(&arg, &skiparg, argi)) { arg_skip = arg_parse_uint(&arg); } else if (arg_match(&arg, &md5arg, argi)) { do_md5 = 1; } else if (arg_match(&arg, &framestatsarg, argi)) { framestats_file = fopen(arg.val, "w"); if (!framestats_file) { die("Error: Could not open --framestats file (%s) for writing.\n", arg.val); } } else if (arg_match(&arg, &summaryarg, argi)) { summary = 1; } else if (arg_match(&arg, &threadsarg, argi)) { cfg.threads = arg_parse_uint(&arg); #if !CONFIG_MULTITHREAD if (cfg.threads > 1) { die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = " "0.\n", cfg.threads); } #endif } else if (arg_match(&arg, &rowmtarg, argi)) { enable_row_mt = arg_parse_uint(&arg); } else if (arg_match(&arg, 
&verbosearg, argi)) { quiet = 0; } else if (arg_match(&arg, &scalearg, argi)) { do_scale = 1; } else if (arg_match(&arg, &fb_arg, argi)) { num_external_frame_buffers = arg_parse_uint(&arg); } else if (arg_match(&arg, &continuearg, argi)) { keep_going = 1; } else if (arg_match(&arg, &outbitdeptharg, argi)) { fixed_output_bit_depth = arg_parse_uint(&arg); } else if (arg_match(&arg, &isannexb, argi)) { is_annexb = 1; input.obu_ctx->is_annexb = 1; } else if (arg_match(&arg, &oppointarg, argi)) { operating_point = arg_parse_int(&arg); } else if (arg_match(&arg, &outallarg, argi)) { output_all_layers = 1; } else if (arg_match(&arg, &skipfilmgrain, argi)) { skip_film_grain = 1; } else { argj++; } } /* Check for unrecognized options */ for (argi = argv; *argi; argi++) if (argi[0][0] == '-' && strlen(argi[0]) > 1) die("Error: Unrecognized option %s\n", *argi); /* Handle non-option arguments */ fn = argv[0]; if (!fn) { free(argv); fprintf(stderr, "No input file specified!\n"); usage_exit(); } const bool using_file = strcmp(fn, "-") != 0; /* Open file */ infile = using_file ? fopen(fn, "rb") : set_binary_mode(stdin); if (!infile) { fatal("Failed to open input file '%s'", using_file ? fn : "stdin"); } #if CONFIG_OS_SUPPORT /* Make sure we don't dump to the terminal, unless forced to with -o - */ if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) { fprintf(stderr, "Not dumping raw video to your terminal. Use '-o -' to " "override.\n"); free(argv); return EXIT_FAILURE; } #endif input.aom_input_ctx->filename = fn; input.aom_input_ctx->file = infile; // TODO(https://crbug.com/aomedia/1706): webm type does not support reading // from stdin yet, and file_is_webm is not using the detect buffer when // determining the type. Therefore it should only be checked when using a file // and needs to be checked prior to other types. if (false) { #if CONFIG_WEBM_IO } else if (using_file && file_is_webm(input.webm_ctx, input.aom_input_ctx)) { input.aom_input_ctx->file_type = FILE_TYPE_WEBM; #endif } else if (file_is_ivf(input.aom_input_ctx)) { input.aom_input_ctx->file_type = FILE_TYPE_IVF; is_ivf = 1; } else if (file_is_obu(&obu_ctx)) { input.aom_input_ctx->file_type = FILE_TYPE_OBU; } else if (file_is_raw(input.aom_input_ctx)) { input.aom_input_ctx->file_type = FILE_TYPE_RAW; } else { fprintf(stderr, "Unrecognized input file type.\n"); #if CONFIG_WEBM_IO if (!using_file) { fprintf(stderr, "aomdec does not support piped WebM input.\n"); } #else fprintf(stderr, "aomdec was built without WebM container support.\n"); #endif free(argv); return EXIT_FAILURE; } outfile_pattern = outfile_pattern ? 
outfile_pattern : "-"; single_file = is_single_file(outfile_pattern); if (!noblit && single_file) { generate_filename(outfile_pattern, outfile_name, PATH_MAX, aom_input_ctx.width, aom_input_ctx.height, 0); if (do_md5) MD5Init(&md5_ctx); else outfile = open_outfile(outfile_name); } if (use_y4m && !noblit) { if (!single_file) { fprintf(stderr, "YUV4MPEG2 not supported with output patterns," " try --i420 or --yv12 or --rawvideo.\n"); return EXIT_FAILURE; } #if CONFIG_WEBM_IO if (aom_input_ctx.file_type == FILE_TYPE_WEBM) { if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) { fprintf(stderr, "Failed to guess framerate -- error parsing " "webm file?\n"); return EXIT_FAILURE; } } #endif } aom_codec_iface_t *fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc); if (is_ivf && !fourcc_interface) fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc); if (interface && fourcc_interface && interface != fourcc_interface) aom_tools_warn("Header indicates codec: %s\n", aom_codec_iface_name(fourcc_interface)); else interface = fourcc_interface; if (!interface) interface = get_aom_decoder_by_index(0); dec_flags = 0; if (aom_codec_dec_init(&decoder, interface, &cfg, dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", aom_codec_error(&decoder)); goto fail2; } if (!quiet) fprintf(stderr, "%s\n", decoder.name); if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) { fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder)); goto fail; } if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT, operating_point)) { fprintf(stderr, "Failed to set operating_point: %s\n", aom_codec_error(&decoder)); goto fail; } if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS, output_all_layers)) { fprintf(stderr, "Failed to set output_all_layers: %s\n", aom_codec_error(&decoder)); goto fail; } if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN, skip_film_grain)) { fprintf(stderr, "Failed to set skip_film_grain: %s\n", aom_codec_error(&decoder)); goto fail; } if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_ROW_MT, enable_row_mt)) { fprintf(stderr, "Failed to set row multithreading mode: %s\n", aom_codec_error(&decoder)); goto fail; } if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } if (num_external_frame_buffers > 0) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); if (!ext_fb_list.ext_fb) { fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); goto fail; } if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer, release_av1_frame_buffer, &ext_fb_list)) { fprintf(stderr, "Failed to configure external frame buffers: %s\n", aom_codec_error(&decoder)); goto fail; } } frame_avail = 1; got_data = 0; if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n"); /* Decode file */ while (frame_avail || got_data) { aom_codec_iter_t iter = NULL; aom_image_t *img; struct aom_usec_timer timer; int corrupted = 0; frame_avail = 0; if (!stop_after || frame_in < stop_after) { if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { frame_avail = 1; frame_in++; aom_usec_timer_start(&timer); if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) { const char *detail = aom_codec_error_detail(&decoder); 
aom_tools_warn("Failed to decode frame %d: %s", frame_in, aom_codec_error(&decoder)); if (detail) aom_tools_warn("Additional information: %s", detail); if (!keep_going) goto fail; } if (framestats_file) { int qp; if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER, &qp)) { aom_tools_warn("Failed AOMD_GET_LAST_QUANTIZER: %s", aom_codec_error(&decoder)); if (!keep_going) goto fail; } fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp); } aom_usec_timer_mark(&timer); dx_time += aom_usec_timer_elapsed(&timer); } else { flush_decoder = 1; } } else { flush_decoder = 1; } aom_usec_timer_start(&timer); if (flush_decoder) { // Flush the decoder. if (aom_codec_decode(&decoder, NULL, 0, NULL)) { aom_tools_warn("Failed to flush decoder: %s", aom_codec_error(&decoder)); } } aom_usec_timer_mark(&timer); dx_time += aom_usec_timer_elapsed(&timer); got_data = 0; // TODO(aomedia:3519): Change the prototype of aom_codec_get_frame_fn_t to // facilitate error handling. while ((img = aom_codec_get_frame(&decoder, &iter))) { ++frame_out; got_data = 1; if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted)) { aom_tools_warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder)); if (!keep_going) goto fail; } frames_corrupted += corrupted; if (progress) show_progress(frame_in, frame_out, dx_time); if (!noblit) { const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V }; const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U }; const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; if (do_scale) { if (frame_out == 1) { // If the output frames are to be scaled to a fixed display size // then use the width and height specified in the container. If // either of these is set to 0, use the display size set in the // first frame header. If that is unavailable, use the raw decoded // size of the first decoded frame. int render_width = aom_input_ctx.width; int render_height = aom_input_ctx.height; if (!render_width || !render_height) { int render_size[2]; if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE, render_size)) { // As last resort use size of first frame as display size. render_width = img->d_w; render_height = img->d_h; } else { render_width = render_size[0]; render_height = render_size[1]; } } scaled_img = aom_img_alloc(NULL, img->fmt, render_width, render_height, 16); if (!scaled_img) { fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", render_width, render_height); goto fail; } scaled_img->bit_depth = img->bit_depth; scaled_img->monochrome = img->monochrome; scaled_img->csp = img->csp; } if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { #if CONFIG_LIBYUV if (libyuv_scale(img, scaled_img, kFilterBox) != 0) goto fail; img = scaled_img; #else fprintf( stderr, "Failed to scale output frame: %s.\n" "libyuv is required for scaling but is currently disabled.\n" "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n", aom_codec_error(&decoder)); goto fail; #endif } } // Default to codec bit depth if output bit depth not set unsigned int output_bit_depth; if (!fixed_output_bit_depth && single_file) { output_bit_depth = img->bit_depth; } else { output_bit_depth = fixed_output_bit_depth; } // Shift up or down if necessary if (output_bit_depth != 0) { if (!aom_shift_img(output_bit_depth, &img, &img_shifted)) { fprintf(stderr, "Error allocating image\n"); goto fail; } } aom_input_ctx.width = img->d_w; aom_input_ctx.height = img->d_h; int num_planes = (opt_raw && img->monochrome) ? 
1 : 3; if (single_file) { if (use_y4m) { char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( y4m_buf, sizeof(y4m_buf), aom_input_ctx.width, aom_input_ctx.height, &aom_input_ctx.framerate, img->monochrome, img->csp, img->fmt, img->bit_depth, img->range); if (img->csp == AOM_CSP_COLOCATED) { fprintf(stderr, "Warning: Y4M lacks a colorspace for colocated " "chroma. Using a placeholder.\n"); } if (do_md5) { MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { fputs(y4m_buf, outfile); } } // Y4M frame header len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); y4m_update_image_md5(img, planes, &md5_ctx); } else { fputs(y4m_buf, outfile); y4m_write_image_file(img, planes, outfile); } } else { if (frame_out == 1) { // Check if --yv12 or --i420 options are consistent with the // bit-stream decoded if (opt_i420) { if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_I42016) { fprintf(stderr, "Cannot produce i420 output for bit-stream.\n"); goto fail; } } if (opt_yv12) { if ((img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) || img->bit_depth != 8) { fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n"); goto fail; } } } if (do_md5) { raw_update_image_md5(img, planes, num_planes, &md5_ctx); } else { raw_write_image_file(img, planes, num_planes, outfile); } } } else { generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, img->d_h, frame_in); if (do_md5) { MD5Init(&md5_ctx); if (use_y4m) { y4m_update_image_md5(img, planes, &md5_ctx); } else { raw_update_image_md5(img, planes, num_planes, &md5_ctx); } MD5Final(md5_digest, &md5_ctx); print_md5(md5_digest, outfile_name); } else { outfile = open_outfile(outfile_name); if (use_y4m) { y4m_write_image_file(img, planes, outfile); } else { raw_write_image_file(img, planes, num_planes, outfile); } fclose(outfile); } } } } } if (summary || progress) { show_progress(frame_in, frame_out, dx_time); fprintf(stderr, "\n"); } if (frames_corrupted) { fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); } else { ret = EXIT_SUCCESS; } fail: if (aom_codec_destroy(&decoder)) { fprintf(stderr, "Failed to destroy decoder: %s\n", aom_codec_error(&decoder)); } fail2: if (!noblit && single_file) { if (do_md5) { MD5Final(md5_digest, &md5_ctx); print_md5(md5_digest, outfile_name); } else { fclose(outfile); } } #if CONFIG_WEBM_IO if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM) webm_free(input.webm_ctx); #endif if (input.aom_input_ctx->file_type == FILE_TYPE_OBU) obudec_free(input.obu_ctx); if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); if (scaled_img) aom_img_free(scaled_img); if (img_shifted) aom_img_free(img_shifted); for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { free(ext_fb_list.ext_fb[i].data); } free(ext_fb_list.ext_fb); fclose(infile); if (framestats_file) fclose(framestats_file); free(argv); return ret; } int main(int argc, const char **argv_) { unsigned int loops = 1, i; char **argv, **argi, **argj; struct arg arg; int error = 0; argv = argv_dup(argc - 1, argv_ + 1); if (!argv) { fprintf(stderr, "Error allocating argument list\n"); return EXIT_FAILURE; } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; if (arg_match(&arg, &looparg, argi)) { loops = arg_parse_uint(&arg); break; } } free(argv); for (i = 0; !error && i < loops; i++) error = 
main_loop(argc, argv_); return error; } aom-3.12.1/apps/aomenc.c000066400000000000000000003130741477627663500147620ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "apps/aomenc.h" #include "config/aom_config.h" #include #include #include #include #include #include #include #if CONFIG_AV1_DECODER #include "aom/aom_decoder.h" #include "aom/aomdx.h" #endif #include "aom/aom_encoder.h" #include "aom/aom_integer.h" #include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem_ops.h" #include "common/args.h" #include "common/ivfenc.h" #include "common/tools_common.h" #include "common/warnings.h" #if CONFIG_WEBM_IO #include "common/webmenc.h" #endif #include "common/y4minput.h" #include "examples/encoder_util.h" #include "stats/aomstats.h" #include "stats/rate_hist.h" #if CONFIG_LIBYUV #include "third_party/libyuv/include/libyuv/scale.h" #endif /* Swallow warnings about unused results of fread/fwrite */ static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { return fread(ptr, size, nmemb, stream); } #define fread wrap_fread static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { return fwrite(ptr, size, nmemb, stream); } #define fwrite wrap_fwrite static const char *exec_name; static AOM_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( aom_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { if (ctx->err) { const char *detail = aom_codec_error_detail(ctx); vfprintf(stderr, s, ap); fprintf(stderr, ": %s\n", aom_codec_error(ctx)); if (detail) fprintf(stderr, " %s\n", detail); if (fatal) { aom_codec_destroy(ctx); exit(EXIT_FAILURE); } } } static AOM_TOOLS_FORMAT_PRINTF(2, 3) void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) { va_list ap; va_start(ap, s); warn_or_exit_on_errorv(ctx, 1, s, ap); va_end(ap); } static AOM_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( aom_codec_ctx_t *ctx, int fatal, const char *s, ...) 
{ va_list ap; va_start(ap, s); warn_or_exit_on_errorv(ctx, fatal, s, ap); va_end(ap); } static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { FILE *f = input_ctx->file; y4m_input *y4m = &input_ctx->y4m; int shortread = 0; if (input_ctx->file_type == FILE_TYPE_Y4M) { if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; } else { shortread = read_yuv_frame(input_ctx, img); } return !shortread; } static int file_is_y4m(const char detect[4]) { if (memcmp(detect, "YUV4", 4) == 0) { return 1; } return 0; } static int fourcc_is_ivf(const char detect[4]) { if (memcmp(detect, "DKIF", 4) == 0) { return 1; } return 0; } static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, AOME_SET_ENABLEAUTOALTREF, AOME_SET_SHARPNESS, AOME_SET_STATIC_THRESHOLD, AV1E_SET_ROW_MT, AV1E_SET_FP_MT, AV1E_SET_TILE_COLUMNS, AV1E_SET_TILE_ROWS, AV1E_SET_ENABLE_TPL_MODEL, AV1E_SET_ENABLE_KEYFRAME_FILTERING, AOME_SET_ARNR_MAXFRAMES, AOME_SET_ARNR_STRENGTH, AOME_SET_TUNING, AOME_SET_CQ_LEVEL, AOME_SET_MAX_INTRA_BITRATE_PCT, AV1E_SET_MAX_INTER_BITRATE_PCT, AV1E_SET_GF_CBR_BOOST_PCT, AV1E_SET_LOSSLESS, AV1E_SET_ENABLE_CDEF, AV1E_SET_ENABLE_RESTORATION, AV1E_SET_ENABLE_RECT_PARTITIONS, AV1E_SET_ENABLE_AB_PARTITIONS, AV1E_SET_ENABLE_1TO4_PARTITIONS, AV1E_SET_MIN_PARTITION_SIZE, AV1E_SET_MAX_PARTITION_SIZE, AV1E_SET_ENABLE_DUAL_FILTER, AV1E_SET_ENABLE_CHROMA_DELTAQ, AV1E_SET_ENABLE_INTRA_EDGE_FILTER, AV1E_SET_ENABLE_ORDER_HINT, AV1E_SET_ENABLE_TX64, AV1E_SET_ENABLE_FLIP_IDTX, AV1E_SET_ENABLE_RECT_TX, AV1E_SET_ENABLE_DIST_WTD_COMP, AV1E_SET_ENABLE_MASKED_COMP, AV1E_SET_ENABLE_ONESIDED_COMP, AV1E_SET_ENABLE_INTERINTRA_COMP, AV1E_SET_ENABLE_SMOOTH_INTERINTRA, AV1E_SET_ENABLE_DIFF_WTD_COMP, AV1E_SET_ENABLE_INTERINTER_WEDGE, AV1E_SET_ENABLE_INTERINTRA_WEDGE, AV1E_SET_ENABLE_GLOBAL_MOTION, AV1E_SET_ENABLE_WARPED_MOTION, AV1E_SET_ENABLE_FILTER_INTRA, AV1E_SET_ENABLE_SMOOTH_INTRA, AV1E_SET_ENABLE_PAETH_INTRA, AV1E_SET_ENABLE_CFL_INTRA, AV1E_SET_ENABLE_DIAGONAL_INTRA, AV1E_SET_FORCE_VIDEO_MODE, AV1E_SET_ENABLE_OBMC, AV1E_SET_ENABLE_OVERLAY, AV1E_SET_ENABLE_PALETTE, AV1E_SET_ENABLE_INTRABC, AV1E_SET_ENABLE_ANGLE_DELTA, AV1E_SET_DISABLE_TRELLIS_QUANT, AV1E_SET_ENABLE_QM, AV1E_SET_QM_MIN, AV1E_SET_QM_MAX, AV1E_SET_REDUCED_TX_TYPE_SET, AV1E_SET_INTRA_DCT_ONLY, AV1E_SET_INTER_DCT_ONLY, AV1E_SET_INTRA_DEFAULT_TX_ONLY, AV1E_SET_QUANT_B_ADAPT, AV1E_SET_COEFF_COST_UPD_FREQ, AV1E_SET_MODE_COST_UPD_FREQ, AV1E_SET_MV_COST_UPD_FREQ, AV1E_SET_FRAME_PARALLEL_DECODING, AV1E_SET_ERROR_RESILIENT_MODE, AV1E_SET_AQ_MODE, AV1E_SET_DELTAQ_MODE, AV1E_SET_DELTAQ_STRENGTH, AV1E_SET_DELTALF_MODE, AV1E_SET_FRAME_PERIODIC_BOOST, AV1E_SET_NOISE_SENSITIVITY, AV1E_SET_TUNE_CONTENT, AV1E_SET_CDF_UPDATE_MODE, AV1E_SET_COLOR_PRIMARIES, AV1E_SET_TRANSFER_CHARACTERISTICS, AV1E_SET_MATRIX_COEFFICIENTS, AV1E_SET_CHROMA_SAMPLE_POSITION, AV1E_SET_MIN_GF_INTERVAL, AV1E_SET_MAX_GF_INTERVAL, AV1E_SET_GF_MIN_PYRAMID_HEIGHT, AV1E_SET_GF_MAX_PYRAMID_HEIGHT, AV1E_SET_SUPERBLOCK_SIZE, AV1E_SET_NUM_TG, AV1E_SET_MTU, AV1E_SET_TIMING_INFO_TYPE, AV1E_SET_FILM_GRAIN_TEST_VECTOR, AV1E_SET_FILM_GRAIN_TABLE, #if CONFIG_DENOISE AV1E_SET_DENOISE_NOISE_LEVEL, AV1E_SET_DENOISE_BLOCK_SIZE, AV1E_SET_ENABLE_DNL_DENOISING, #endif // CONFIG_DENOISE AV1E_SET_MAX_REFERENCE_FRAMES, AV1E_SET_REDUCED_REFERENCE_SET, AV1E_SET_ENABLE_REF_FRAME_MVS, AV1E_SET_TARGET_SEQ_LEVEL_IDX, AV1E_SET_TIER_MASK, AV1E_SET_MIN_CR, AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, AV1E_SET_CHROMA_SUBSAMPLING_X, AV1E_SET_CHROMA_SUBSAMPLING_Y, #if CONFIG_TUNE_VMAF AV1E_SET_VMAF_MODEL_PATH, #endif 
AV1E_SET_DV_COST_UPD_FREQ, AV1E_SET_PARTITION_INFO_PATH, AV1E_SET_ENABLE_DIRECTIONAL_INTRA, AV1E_SET_ENABLE_TX_SIZE_SEARCH, AV1E_SET_LOOPFILTER_CONTROL, AV1E_SET_AUTO_INTRA_TOOLS_OFF, AV1E_ENABLE_RATE_GUIDE_DELTAQ, AV1E_SET_RATE_DISTRIBUTION_INFO, 0 }; static const arg_def_t *const main_args[] = { &g_av1_codec_arg_defs.help, &g_av1_codec_arg_defs.use_cfg, &g_av1_codec_arg_defs.debugmode, &g_av1_codec_arg_defs.outputfile, &g_av1_codec_arg_defs.codecarg, &g_av1_codec_arg_defs.passes, &g_av1_codec_arg_defs.pass_arg, &g_av1_codec_arg_defs.fpf_name, &g_av1_codec_arg_defs.limit, &g_av1_codec_arg_defs.skip, &g_av1_codec_arg_defs.good_dl, &g_av1_codec_arg_defs.rt_dl, &g_av1_codec_arg_defs.ai_dl, &g_av1_codec_arg_defs.quietarg, &g_av1_codec_arg_defs.verbosearg, &g_av1_codec_arg_defs.psnrarg, &g_av1_codec_arg_defs.use_webm, &g_av1_codec_arg_defs.use_ivf, &g_av1_codec_arg_defs.use_obu, &g_av1_codec_arg_defs.q_hist_n, &g_av1_codec_arg_defs.rate_hist_n, &g_av1_codec_arg_defs.disable_warnings, &g_av1_codec_arg_defs.disable_warning_prompt, &g_av1_codec_arg_defs.recontest, NULL }; static const arg_def_t *const global_args[] = { &g_av1_codec_arg_defs.use_nv12, &g_av1_codec_arg_defs.use_yv12, &g_av1_codec_arg_defs.use_i420, &g_av1_codec_arg_defs.use_i422, &g_av1_codec_arg_defs.use_i444, &g_av1_codec_arg_defs.usage, &g_av1_codec_arg_defs.threads, &g_av1_codec_arg_defs.profile, &g_av1_codec_arg_defs.width, &g_av1_codec_arg_defs.height, &g_av1_codec_arg_defs.forced_max_frame_width, &g_av1_codec_arg_defs.forced_max_frame_height, #if CONFIG_WEBM_IO &g_av1_codec_arg_defs.stereo_mode, #endif &g_av1_codec_arg_defs.timebase, &g_av1_codec_arg_defs.framerate, &g_av1_codec_arg_defs.global_error_resilient, &g_av1_codec_arg_defs.bitdeptharg, &g_av1_codec_arg_defs.inbitdeptharg, &g_av1_codec_arg_defs.lag_in_frames, &g_av1_codec_arg_defs.large_scale_tile, &g_av1_codec_arg_defs.monochrome, &g_av1_codec_arg_defs.full_still_picture_hdr, &g_av1_codec_arg_defs.use_16bit_internal, &g_av1_codec_arg_defs.save_as_annexb, NULL }; static const arg_def_t *const rc_args[] = { &g_av1_codec_arg_defs.dropframe_thresh, &g_av1_codec_arg_defs.resize_mode, &g_av1_codec_arg_defs.resize_denominator, &g_av1_codec_arg_defs.resize_kf_denominator, &g_av1_codec_arg_defs.superres_mode, &g_av1_codec_arg_defs.superres_denominator, &g_av1_codec_arg_defs.superres_kf_denominator, &g_av1_codec_arg_defs.superres_qthresh, &g_av1_codec_arg_defs.superres_kf_qthresh, &g_av1_codec_arg_defs.end_usage, &g_av1_codec_arg_defs.target_bitrate, &g_av1_codec_arg_defs.min_quantizer, &g_av1_codec_arg_defs.max_quantizer, &g_av1_codec_arg_defs.undershoot_pct, &g_av1_codec_arg_defs.overshoot_pct, &g_av1_codec_arg_defs.buf_sz, &g_av1_codec_arg_defs.buf_initial_sz, &g_av1_codec_arg_defs.buf_optimal_sz, &g_av1_codec_arg_defs.bias_pct, &g_av1_codec_arg_defs.minsection_pct, &g_av1_codec_arg_defs.maxsection_pct, NULL }; static const arg_def_t *const kf_args[] = { &g_av1_codec_arg_defs.fwd_kf_enabled, &g_av1_codec_arg_defs.kf_min_dist, &g_av1_codec_arg_defs.kf_max_dist, &g_av1_codec_arg_defs.kf_disabled, &g_av1_codec_arg_defs.sframe_dist, &g_av1_codec_arg_defs.sframe_mode, NULL }; // TODO(bohanli): Currently all options are supported by the key & value API. // Consider removing the control ID usages? 
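/* Note: av1_ctrl_args (below) is kept index-aligned with av1_arg_ctrl_map
 * (above): the option matched at av1_ctrl_args[i] is applied through the
 * control ID av1_arg_ctrl_map[i] in parse_stream_params(), which calls
 * set_config_arg_ctrls(config, ctrl_args_map[i], &arg). The two arrays must
 * have the same number of entries, which is checked with a _Static_assert
 * further down, so additions to either list must be made in lockstep. */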
static const arg_def_t *const av1_ctrl_args[] = { &g_av1_codec_arg_defs.cpu_used_av1, &g_av1_codec_arg_defs.auto_altref, &g_av1_codec_arg_defs.sharpness, &g_av1_codec_arg_defs.static_thresh, &g_av1_codec_arg_defs.rowmtarg, &g_av1_codec_arg_defs.fpmtarg, &g_av1_codec_arg_defs.tile_cols, &g_av1_codec_arg_defs.tile_rows, &g_av1_codec_arg_defs.enable_tpl_model, &g_av1_codec_arg_defs.enable_keyframe_filtering, &g_av1_codec_arg_defs.arnr_maxframes, &g_av1_codec_arg_defs.arnr_strength, &g_av1_codec_arg_defs.tune_metric, &g_av1_codec_arg_defs.cq_level, &g_av1_codec_arg_defs.max_intra_rate_pct, &g_av1_codec_arg_defs.max_inter_rate_pct, &g_av1_codec_arg_defs.gf_cbr_boost_pct, &g_av1_codec_arg_defs.lossless, &g_av1_codec_arg_defs.enable_cdef, &g_av1_codec_arg_defs.enable_restoration, &g_av1_codec_arg_defs.enable_rect_partitions, &g_av1_codec_arg_defs.enable_ab_partitions, &g_av1_codec_arg_defs.enable_1to4_partitions, &g_av1_codec_arg_defs.min_partition_size, &g_av1_codec_arg_defs.max_partition_size, &g_av1_codec_arg_defs.enable_dual_filter, &g_av1_codec_arg_defs.enable_chroma_deltaq, &g_av1_codec_arg_defs.enable_intra_edge_filter, &g_av1_codec_arg_defs.enable_order_hint, &g_av1_codec_arg_defs.enable_tx64, &g_av1_codec_arg_defs.enable_flip_idtx, &g_av1_codec_arg_defs.enable_rect_tx, &g_av1_codec_arg_defs.enable_dist_wtd_comp, &g_av1_codec_arg_defs.enable_masked_comp, &g_av1_codec_arg_defs.enable_onesided_comp, &g_av1_codec_arg_defs.enable_interintra_comp, &g_av1_codec_arg_defs.enable_smooth_interintra, &g_av1_codec_arg_defs.enable_diff_wtd_comp, &g_av1_codec_arg_defs.enable_interinter_wedge, &g_av1_codec_arg_defs.enable_interintra_wedge, &g_av1_codec_arg_defs.enable_global_motion, &g_av1_codec_arg_defs.enable_warped_motion, &g_av1_codec_arg_defs.enable_filter_intra, &g_av1_codec_arg_defs.enable_smooth_intra, &g_av1_codec_arg_defs.enable_paeth_intra, &g_av1_codec_arg_defs.enable_cfl_intra, &g_av1_codec_arg_defs.enable_diagonal_intra, &g_av1_codec_arg_defs.force_video_mode, &g_av1_codec_arg_defs.enable_obmc, &g_av1_codec_arg_defs.enable_overlay, &g_av1_codec_arg_defs.enable_palette, &g_av1_codec_arg_defs.enable_intrabc, &g_av1_codec_arg_defs.enable_angle_delta, &g_av1_codec_arg_defs.disable_trellis_quant, &g_av1_codec_arg_defs.enable_qm, &g_av1_codec_arg_defs.qm_min, &g_av1_codec_arg_defs.qm_max, &g_av1_codec_arg_defs.reduced_tx_type_set, &g_av1_codec_arg_defs.use_intra_dct_only, &g_av1_codec_arg_defs.use_inter_dct_only, &g_av1_codec_arg_defs.use_intra_default_tx_only, &g_av1_codec_arg_defs.quant_b_adapt, &g_av1_codec_arg_defs.coeff_cost_upd_freq, &g_av1_codec_arg_defs.mode_cost_upd_freq, &g_av1_codec_arg_defs.mv_cost_upd_freq, &g_av1_codec_arg_defs.frame_parallel_decoding, &g_av1_codec_arg_defs.error_resilient_mode, &g_av1_codec_arg_defs.aq_mode, &g_av1_codec_arg_defs.deltaq_mode, &g_av1_codec_arg_defs.deltaq_strength, &g_av1_codec_arg_defs.deltalf_mode, &g_av1_codec_arg_defs.frame_periodic_boost, &g_av1_codec_arg_defs.noise_sens, &g_av1_codec_arg_defs.tune_content, &g_av1_codec_arg_defs.cdf_update_mode, &g_av1_codec_arg_defs.input_color_primaries, &g_av1_codec_arg_defs.input_transfer_characteristics, &g_av1_codec_arg_defs.input_matrix_coefficients, &g_av1_codec_arg_defs.input_chroma_sample_position, &g_av1_codec_arg_defs.min_gf_interval, &g_av1_codec_arg_defs.max_gf_interval, &g_av1_codec_arg_defs.gf_min_pyr_height, &g_av1_codec_arg_defs.gf_max_pyr_height, &g_av1_codec_arg_defs.superblock_size, &g_av1_codec_arg_defs.num_tg, &g_av1_codec_arg_defs.mtu_size, &g_av1_codec_arg_defs.timing_info, 
&g_av1_codec_arg_defs.film_grain_test, &g_av1_codec_arg_defs.film_grain_table, #if CONFIG_DENOISE &g_av1_codec_arg_defs.denoise_noise_level, &g_av1_codec_arg_defs.denoise_block_size, &g_av1_codec_arg_defs.enable_dnl_denoising, #endif // CONFIG_DENOISE &g_av1_codec_arg_defs.max_reference_frames, &g_av1_codec_arg_defs.reduced_reference_set, &g_av1_codec_arg_defs.enable_ref_frame_mvs, &g_av1_codec_arg_defs.target_seq_level_idx, &g_av1_codec_arg_defs.set_tier_mask, &g_av1_codec_arg_defs.set_min_cr, &g_av1_codec_arg_defs.vbr_corpus_complexity_lap, &g_av1_codec_arg_defs.input_chroma_subsampling_x, &g_av1_codec_arg_defs.input_chroma_subsampling_y, #if CONFIG_TUNE_VMAF &g_av1_codec_arg_defs.vmaf_model_path, #endif &g_av1_codec_arg_defs.dv_cost_upd_freq, &g_av1_codec_arg_defs.partition_info_path, &g_av1_codec_arg_defs.enable_directional_intra, &g_av1_codec_arg_defs.enable_tx_size_search, &g_av1_codec_arg_defs.loopfilter_control, &g_av1_codec_arg_defs.auto_intra_tools_off, &g_av1_codec_arg_defs.enable_rate_guide_deltaq, &g_av1_codec_arg_defs.rate_distribution_info, NULL, }; static const arg_def_t *const av1_key_val_args[] = { &g_av1_codec_arg_defs.passes, &g_av1_codec_arg_defs.two_pass_output, &g_av1_codec_arg_defs.second_pass_log, &g_av1_codec_arg_defs.fwd_kf_dist, &g_av1_codec_arg_defs.strict_level_conformance, &g_av1_codec_arg_defs.sb_qp_sweep, &g_av1_codec_arg_defs.dist_metric, &g_av1_codec_arg_defs.kf_max_pyr_height, &g_av1_codec_arg_defs.auto_tiles, NULL, }; static const arg_def_t *const no_args[] = { NULL }; static void show_help(FILE *fout, int shorthelp) { fprintf(fout, "Usage: %s -o dst_filename src_filename\n", exec_name); if (shorthelp) { fprintf(fout, "Use --help to see the full list of options.\n"); return; } fprintf(fout, "\nOptions:\n"); arg_show_usage(fout, main_args); fprintf(fout, "\nEncoder Global Options:\n"); arg_show_usage(fout, global_args); fprintf(fout, "\nRate Control Options:\n"); arg_show_usage(fout, rc_args); fprintf(fout, "\nKeyframe Placement Options:\n"); arg_show_usage(fout, kf_args); #if CONFIG_AV1_ENCODER fprintf(fout, "\nAV1 Specific Options:\n"); arg_show_usage(fout, av1_ctrl_args); arg_show_usage(fout, av1_key_val_args); #endif fprintf(fout, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); fprintf(fout, "\nIncluded encoders:\n\n"); const int num_encoder = get_aom_encoder_count(); for (int i = 0; i < num_encoder; ++i) { aom_codec_iface_t *encoder = get_aom_encoder_by_index(i); const char *defstr = (i == (num_encoder - 1)) ? 
"(default)" : ""; fprintf(fout, " %-6s - %s %s\n", get_short_name_by_aom_encoder(encoder), aom_codec_iface_name(encoder), defstr); } fprintf(fout, "\n "); fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); } void usage_exit(void) { show_help(stderr, 1); exit(EXIT_FAILURE); } #if CONFIG_AV1_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map) #define ARG_KEY_VAL_CNT_MAX NELEMENTS(av1_key_val_args) #endif #if !CONFIG_WEBM_IO typedef int stereo_format_t; struct WebmOutputContext { int debug; }; #endif /* Per-stream configuration */ struct stream_config { struct aom_codec_enc_cfg cfg; const char *out_fn; const char *stats_fn; stereo_format_t stereo_fmt; int arg_ctrls[ARG_CTRL_CNT_MAX][2]; int arg_ctrl_cnt; const char *arg_key_vals[ARG_KEY_VAL_CNT_MAX][2]; int arg_key_val_cnt; int write_webm; const char *film_grain_filename; int write_ivf; // whether to use 16bit internal buffers int use_16bit_internal; #if CONFIG_TUNE_VMAF const char *vmaf_model_path; #endif const char *partition_info_path; unsigned int enable_rate_guide_deltaq; const char *rate_distribution_info; aom_color_range_t color_range; const char *two_pass_input; const char *two_pass_output; int two_pass_width; int two_pass_height; }; struct stream_state { int index; struct stream_state *next; struct stream_config config; FILE *file; struct rate_hist *rate_hist; struct WebmOutputContext webm_ctx; uint64_t psnr_sse_total[2]; uint64_t psnr_samples_total[2]; double psnr_totals[2][4]; int psnr_count[2]; int counts[64]; aom_codec_ctx_t encoder; unsigned int frames_out; uint64_t cx_time; size_t nbytes; stats_io_t stats; struct aom_image *img; aom_codec_ctx_t decoder; int mismatch_seen; unsigned int chroma_subsampling_x; unsigned int chroma_subsampling_y; const char *orig_out_fn; unsigned int orig_width; unsigned int orig_height; int orig_write_webm; int orig_write_ivf; char tmp_out_fn[1000]; }; static void validate_positive_rational(const char *msg, struct aom_rational *rat) { if (rat->den < 0) { rat->num *= -1; rat->den *= -1; } if (rat->num < 0) die("Error: %s must be positive\n", msg); if (!rat->den) die("Error: %s has zero denominator\n", msg); } static void init_config(cfg_options_t *config) { memset(config, 0, sizeof(cfg_options_t)); config->super_block_size = 0; // Dynamic config->max_partition_size = 128; config->min_partition_size = 4; config->disable_trellis_quant = 3; } /* Parses global config arguments into the AvxEncoderConfig. Note that * argv is modified and overwrites all parsed arguments. 
*/ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { char **argi, **argj; struct arg arg; const int num_encoder = get_aom_encoder_count(); char **argv_local = (char **)*argv; if (num_encoder < 1) die("Error: no valid encoder available\n"); /* Initialize default parameters */ memset(global, 0, sizeof(*global)); global->codec = get_aom_encoder_by_index(num_encoder - 1); global->passes = 0; global->color_type = I420; global->csp = AOM_CSP_UNKNOWN; global->show_psnr = 0; int cfg_included = 0; init_config(&global->encoder_config); for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &g_av1_codec_arg_defs.use_cfg, argi)) { if (!cfg_included) { parse_cfg(arg.val, &global->encoder_config); cfg_included = 1; } } else if (arg_match(&arg, &g_av1_codec_arg_defs.help, argi)) { show_help(stdout, 0); exit(EXIT_SUCCESS); } else if (arg_match(&arg, &g_av1_codec_arg_defs.codecarg, argi)) { global->codec = get_aom_encoder_by_short_name(arg.val); if (!global->codec) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); } else if (arg_match(&arg, &g_av1_codec_arg_defs.passes, argi)) { global->passes = arg_parse_uint(&arg); if (global->passes < 1 || global->passes > 3) die("Error: Invalid number of passes (%d)\n", global->passes); } else if (arg_match(&arg, &g_av1_codec_arg_defs.pass_arg, argi)) { global->pass = arg_parse_uint(&arg); if (global->pass < 1 || global->pass > 3) die("Error: Invalid pass selected (%d)\n", global->pass); } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_sample_position, argi)) { global->csp = arg_parse_enum(&arg); /* Flag is used by later code as well, preserve it. */ argj++; } else if (arg_match(&arg, &g_av1_codec_arg_defs.usage, argi)) { global->usage = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.good_dl, argi)) { global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage } else if (arg_match(&arg, &g_av1_codec_arg_defs.rt_dl, argi)) { global->usage = AOM_USAGE_REALTIME; // Real-time usage } else if (arg_match(&arg, &g_av1_codec_arg_defs.ai_dl, argi)) { global->usage = AOM_USAGE_ALL_INTRA; // All intra usage } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_nv12, argi)) { global->color_type = NV12; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_yv12, argi)) { global->color_type = YV12; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i420, argi)) { global->color_type = I420; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i422, argi)) { global->color_type = I422; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i444, argi)) { global->color_type = I444; } else if (arg_match(&arg, &g_av1_codec_arg_defs.quietarg, argi)) { global->quiet = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.verbosearg, argi)) { global->verbose = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.limit, argi)) { global->limit = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.skip, argi)) { global->skip_frames = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.psnrarg, argi)) { if (arg.val) global->show_psnr = arg_parse_int(&arg); else global->show_psnr = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.recontest, argi)) { global->test_decode = arg_parse_enum_or_int(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.framerate, argi)) { global->framerate = arg_parse_rational(&arg); validate_positive_rational(arg.name, &global->framerate); global->have_framerate = 1; } else if (arg_match(&arg, 
&g_av1_codec_arg_defs.debugmode, argi)) { global->debug = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.q_hist_n, argi)) { global->show_q_hist_buckets = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_hist_n, argi)) { global->show_rate_hist_buckets = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warnings, argi)) { global->disable_warnings = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warning_prompt, argi)) { global->disable_warning_prompt = 1; } else { argj++; } } if (global->pass) { /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ if (global->pass > global->passes) { aom_tools_warn("Assuming --pass=%d implies --passes=%d\n", global->pass, global->pass); global->passes = global->pass; } } /* Validate global config */ if (global->passes == 0) { #if CONFIG_AV1_ENCODER // Make default AV1 passes = 2 until there is a better quality 1-pass // encoder if (global->codec != NULL) global->passes = (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0 && global->usage != AOM_USAGE_REALTIME) ? 2 : 1; #else global->passes = 1; #endif } if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { aom_tools_warn("Enforcing one-pass encoding in realtime mode\n"); if (global->pass > 1) die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass); global->passes = 1; } if (global->usage == AOM_USAGE_ALL_INTRA && global->passes > 1) { aom_tools_warn("Enforcing one-pass encoding in all intra mode\n"); global->passes = 1; } } static void open_input_file(struct AvxInputContext *input, aom_chroma_sample_position_t csp) { /* Parse certain options from the input file, if possible */ input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") : set_binary_mode(stdin); if (!input->file) fatal("Failed to open input file"); if (!fseeko(input->file, 0, SEEK_END)) { /* Input file is seekable. Figure out how long it is, so we can get * progress info. */ input->length = ftello(input->file); rewind(input->file); } /* Default to 1:1 pixel aspect ratio. */ input->pixel_aspect_ratio.numerator = 1; input->pixel_aspect_ratio.denominator = 1; /* For RAW input sources, these bytes will applied on the first frame * in read_frame(). 
*/ input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); input->detect.position = 0; if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, input->only_i420) >= 0) { input->file_type = FILE_TYPE_Y4M; input->width = input->y4m.pic_w; input->height = input->y4m.pic_h; input->pixel_aspect_ratio.numerator = input->y4m.par_n; input->pixel_aspect_ratio.denominator = input->y4m.par_d; input->framerate.numerator = input->y4m.fps_n; input->framerate.denominator = input->y4m.fps_d; input->fmt = input->y4m.aom_fmt; input->bit_depth = input->y4m.bit_depth; input->color_range = input->y4m.color_range; } else fatal("Unsupported Y4M stream."); } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { fatal("IVF is not supported as input."); } else { input->file_type = FILE_TYPE_RAW; } } static void close_input_file(struct AvxInputContext *input) { fclose(input->file); if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); } static struct stream_state *new_stream(struct AvxEncoderConfig *global, struct stream_state *prev) { struct stream_state *stream; stream = calloc(1, sizeof(*stream)); if (stream == NULL) { fatal("Failed to allocate new stream."); } if (prev) { memcpy(stream, prev, sizeof(*stream)); stream->index++; prev->next = stream; } else { aom_codec_err_t res; /* Populate encoder configuration */ res = aom_codec_enc_config_default(global->codec, &stream->config.cfg, global->usage); if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res)); /* Change the default timebase to a high enough value so that the * encoder will always create strictly increasing timestamps. */ stream->config.cfg.g_timebase.den = 1000; /* Never use the library's default resolution, require it be parsed * from the file or set on the command line. */ stream->config.cfg.g_w = 0; stream->config.cfg.g_h = 0; /* Initialize remaining stream parameters */ stream->config.write_webm = 1; stream->config.write_ivf = 0; #if CONFIG_WEBM_IO stream->config.stereo_fmt = STEREO_FORMAT_MONO; stream->webm_ctx.last_pts_ns = -1; stream->webm_ctx.writer = NULL; stream->webm_ctx.segment = NULL; #endif /* Allows removal of the application version from the EBML tags */ stream->webm_ctx.debug = global->debug; memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config, sizeof(stream->config.cfg.encoder_cfg)); } /* Output files must be specified for each stream */ stream->config.out_fn = NULL; stream->config.two_pass_input = NULL; stream->config.two_pass_output = NULL; stream->config.two_pass_width = 0; stream->config.two_pass_height = 0; stream->next = NULL; return stream; } static void set_config_arg_ctrls(struct stream_config *config, int key, const struct arg *arg) { int j; if (key == AV1E_SET_FILM_GRAIN_TABLE) { config->film_grain_filename = arg->val; return; } // For target level, the settings should accumulate rather than overwrite, // so we simply append it. if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) { j = config->arg_ctrl_cnt; assert(j < ARG_CTRL_CNT_MAX); config->arg_ctrls[j][0] = key; config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); ++config->arg_ctrl_cnt; return; } /* Point either to the next free element or the first instance of this * control. 
*/ for (j = 0; j < config->arg_ctrl_cnt; j++) if (config->arg_ctrls[j][0] == key) break; /* Update/insert */ assert(j < ARG_CTRL_CNT_MAX); config->arg_ctrls[j][0] = key; config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) { aom_tools_warn( "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n"); config->arg_ctrls[j][1] = 1; } if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; } static void set_config_arg_key_vals(struct stream_config *config, const char *name, const struct arg *arg) { int j; const char *val = arg->val; // For target level, the settings should accumulate rather than overwrite, // so we simply append it. if (strcmp(name, "target-seq-level-idx") == 0) { j = config->arg_key_val_cnt; assert(j < ARG_KEY_VAL_CNT_MAX); config->arg_key_vals[j][0] = name; config->arg_key_vals[j][1] = val; ++config->arg_key_val_cnt; return; } /* Point either to the next free element or the first instance of this * option. */ for (j = 0; j < config->arg_key_val_cnt; j++) if (strcmp(name, config->arg_key_vals[j][0]) == 0) break; /* Update/insert */ assert(j < ARG_KEY_VAL_CNT_MAX); config->arg_key_vals[j][0] = name; config->arg_key_vals[j][1] = val; if (strcmp(name, g_av1_codec_arg_defs.auto_altref.long_name) == 0) { int auto_altref = arg_parse_int(arg); if (auto_altref > 1) { aom_tools_warn( "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n"); config->arg_key_vals[j][1] = "1"; } } if (j == config->arg_key_val_cnt) config->arg_key_val_cnt++; } static int parse_stream_params(struct AvxEncoderConfig *global, struct stream_state *stream, char **argv) { char **argi, **argj; struct arg arg; const arg_def_t *const *ctrl_args = no_args; const arg_def_t *const *key_val_args = no_args; const int *ctrl_args_map = NULL; struct stream_config *config = &stream->config; int eos_mark_found = 0; int webm_forced = 0; // Handle codec specific options if (0) { #if CONFIG_AV1_ENCODER } else if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) { // TODO(jingning): Reuse AV1 specific encoder configuration parameters. // Consider to expand this set for AV1 encoder control. #if __STDC_VERSION__ >= 201112L _Static_assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map), "The av1_ctrl_args and av1_arg_ctrl_map arrays must be of " "the same size."); #else assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map)); #endif ctrl_args = av1_ctrl_args; ctrl_args_map = av1_arg_ctrl_map; key_val_args = av1_key_val_args; #endif } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; /* Once we've found an end-of-stream marker (--) we want to continue * shifting arguments but not consuming them. 
*/ if (eos_mark_found) { argj++; continue; } else if (!strcmp(*argj, "--")) { eos_mark_found = 1; continue; } if (arg_match(&arg, &g_av1_codec_arg_defs.outputfile, argi)) { config->out_fn = arg.val; if (!webm_forced) { const size_t out_fn_len = strlen(config->out_fn); if (out_fn_len >= 4 && !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) { config->write_webm = 0; config->write_ivf = 1; } else if (out_fn_len >= 4 && !strcmp(config->out_fn + out_fn_len - 4, ".obu")) { config->write_webm = 0; config->write_ivf = 0; } } } else if (arg_match(&arg, &g_av1_codec_arg_defs.fpf_name, argi)) { config->stats_fn = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_webm, argi)) { #if CONFIG_WEBM_IO config->write_webm = 1; webm_forced = 1; #else die("Error: --webm specified but webm is disabled."); #endif } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_ivf, argi)) { config->write_webm = 0; config->write_ivf = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_obu, argi)) { config->write_webm = 0; config->write_ivf = 0; } else if (arg_match(&arg, &g_av1_codec_arg_defs.threads, argi)) { config->cfg.g_threads = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.profile, argi)) { config->cfg.g_profile = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.width, argi)) { config->cfg.g_w = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.height, argi)) { config->cfg.g_h = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_width, argi)) { config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_height, argi)) { config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.bitdeptharg, argi)) { config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.inbitdeptharg, argi)) { config->cfg.g_input_bit_depth = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_x, argi)) { stream->chroma_subsampling_x = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_y, argi)) { stream->chroma_subsampling_y = arg_parse_uint(&arg); #if CONFIG_WEBM_IO } else if (arg_match(&arg, &g_av1_codec_arg_defs.stereo_mode, argi)) { config->stereo_fmt = arg_parse_enum_or_int(&arg); #endif } else if (arg_match(&arg, &g_av1_codec_arg_defs.timebase, argi)) { config->cfg.g_timebase = arg_parse_rational(&arg); validate_positive_rational(arg.name, &config->cfg.g_timebase); } else if (arg_match(&arg, &g_av1_codec_arg_defs.global_error_resilient, argi)) { config->cfg.g_error_resilient = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.lag_in_frames, argi)) { config->cfg.g_lag_in_frames = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.large_scale_tile, argi)) { config->cfg.large_scale_tile = arg_parse_uint(&arg); if (config->cfg.large_scale_tile) { global->codec = get_aom_encoder_by_short_name("av1"); } } else if (arg_match(&arg, &g_av1_codec_arg_defs.monochrome, argi)) { config->cfg.monochrome = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.full_still_picture_hdr, argi)) { config->cfg.full_still_picture_hdr = 1; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_16bit_internal, argi)) { config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH; if (!config->use_16bit_internal) { aom_tools_warn("%s option ignored with 
CONFIG_AV1_HIGHBITDEPTH=0.\n", arg.name); } } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) { config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_mode, argi)) { config->cfg.rc_resize_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_denominator, argi)) { config->cfg.rc_resize_denominator = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_kf_denominator, argi)) { config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_mode, argi)) { config->cfg.rc_superres_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_denominator, argi)) { config->cfg.rc_superres_denominator = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_denominator, argi)) { config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_qthresh, argi)) { config->cfg.rc_superres_qthresh = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_qthresh, argi)) { config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.end_usage, argi)) { config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.target_bitrate, argi)) { config->cfg.rc_target_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.min_quantizer, argi)) { config->cfg.rc_min_quantizer = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.max_quantizer, argi)) { config->cfg.rc_max_quantizer = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.undershoot_pct, argi)) { config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.overshoot_pct, argi)) { config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_sz, argi)) { config->cfg.rc_buf_sz = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_initial_sz, argi)) { config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_optimal_sz, argi)) { config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.bias_pct, argi)) { config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); if (global->passes < 2) aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &g_av1_codec_arg_defs.minsection_pct, argi)) { config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); if (global->passes < 2) aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &g_av1_codec_arg_defs.maxsection_pct, argi)) { config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); if (global->passes < 2) aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &g_av1_codec_arg_defs.fwd_kf_enabled, argi)) { config->cfg.fwd_kf_enabled = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_min_dist, argi)) { config->cfg.kf_min_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_max_dist, argi)) { config->cfg.kf_max_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_disabled, argi)) { config->cfg.kf_mode = AOM_KF_DISABLED; } else if 
(arg_match(&arg, &g_av1_codec_arg_defs.sframe_dist, argi)) { config->cfg.sframe_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_mode, argi)) { config->cfg.sframe_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.save_as_annexb, argi)) { config->cfg.save_as_annexb = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_width, argi)) { config->cfg.tile_width_count = arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS); } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_height, argi)) { config->cfg.tile_height_count = arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS); #if CONFIG_TUNE_VMAF } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) { config->vmaf_model_path = arg.val; #endif } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path, argi)) { config->partition_info_path = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.enable_rate_guide_deltaq, argi)) { config->enable_rate_guide_deltaq = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_distribution_info, argi)) { config->rate_distribution_info = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets, argi)) { config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.fixed_qp_offsets, argi)) { config->cfg.use_fixed_qp_offsets = 1; } else if (global->usage == AOM_USAGE_REALTIME && arg_match(&arg, &g_av1_codec_arg_defs.enable_restoration, argi)) { if (arg_parse_uint(&arg) == 1) { aom_tools_warn("non-zero %s option ignored in realtime mode.\n", arg.name); } } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_input, argi)) { config->two_pass_input = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_output, argi)) { config->two_pass_output = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_width, argi)) { config->two_pass_width = arg_parse_int(&arg); } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_height, argi)) { config->two_pass_height = arg_parse_int(&arg); } else { int i, match = 0; // check if the control ID API supports this arg if (ctrl_args_map) { for (i = 0; ctrl_args[i]; i++) { if (arg_match(&arg, ctrl_args[i], argi)) { match = 1; set_config_arg_ctrls(config, ctrl_args_map[i], &arg); break; } } } if (!match) { // check if the key & value API supports this arg for (i = 0; key_val_args[i]; i++) { if (arg_match(&arg, key_val_args[i], argi)) { match = 1; set_config_arg_key_vals(config, key_val_args[i]->long_name, &arg); break; } } } if (!match) argj++; } } config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8; if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) { aom_tools_warn("non-zero lag-in-frames option ignored in realtime mode.\n"); config->cfg.g_lag_in_frames = 0; } if (global->usage == AOM_USAGE_ALL_INTRA) { if (config->cfg.g_lag_in_frames != 0) { aom_tools_warn( "non-zero lag-in-frames option ignored in all intra mode.\n"); config->cfg.g_lag_in_frames = 0; } if (config->cfg.kf_max_dist != 0) { aom_tools_warn( "non-zero max key frame distance option ignored in all intra " "mode.\n"); config->cfg.kf_max_dist = 0; } } // set the passes field using key & val API if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) { die("Not enough buffer for the key & value API."); } config->arg_key_vals[config->arg_key_val_cnt][0] = "passes"; switch (global->passes) { case 0: 
config->arg_key_vals[config->arg_key_val_cnt][1] = "0"; break; case 1: config->arg_key_vals[config->arg_key_val_cnt][1] = "1"; break; case 2: config->arg_key_vals[config->arg_key_val_cnt][1] = "2"; break; case 3: config->arg_key_vals[config->arg_key_val_cnt][1] = "3"; break; default: die("Invalid value of --passes."); } config->arg_key_val_cnt++; // set the two_pass_output field if (!config->two_pass_output && global->passes == 3) { // If not specified, set the name of two_pass_output file here. snprintf(stream->tmp_out_fn, sizeof(stream->tmp_out_fn), "%.980s_pass2_%d.ivf", stream->config.out_fn, stream->index); stream->config.two_pass_output = stream->tmp_out_fn; } if (config->two_pass_output) { config->arg_key_vals[config->arg_key_val_cnt][0] = "two-pass-output"; config->arg_key_vals[config->arg_key_val_cnt][1] = config->two_pass_output; config->arg_key_val_cnt++; } return eos_mark_found; } #define FOREACH_STREAM(iterator, list) \ for (struct stream_state *iterator = list; iterator; \ iterator = iterator->next) static void validate_stream_config(const struct stream_state *stream, const struct AvxEncoderConfig *global) { const struct stream_state *streami; (void)global; if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) fatal( "Stream %d: Specify stream dimensions with --width (-w) " " and --height (-h)", stream->index); /* Even if bit depth is set on the command line flag to be lower, * it is upgraded to at least match the input bit depth. */ assert(stream->config.cfg.g_input_bit_depth <= (unsigned int)stream->config.cfg.g_bit_depth); for (streami = stream; streami; streami = streami->next) { /* All streams require output files */ if (!streami->config.out_fn) fatal("Stream %d: Output file is required (specify with -o)", streami->index); /* Check for two streams outputting to the same file */ if (streami != stream) { const char *a = stream->config.out_fn; const char *b = streami->config.out_fn; if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul")) fatal("Stream %d: duplicate output file (from stream %d)", streami->index, stream->index); } /* Check for two streams sharing a stats file. 
*/ if (streami != stream) { const char *a = stream->config.stats_fn; const char *b = streami->config.stats_fn; if (a && b && !strcmp(a, b)) fatal("Stream %d: duplicate stats file (from stream %d)", streami->index, stream->index); } } } static void set_stream_dimensions(struct stream_state *stream, unsigned int w, unsigned int h) { if (!stream->config.cfg.g_w) { if (!stream->config.cfg.g_h) stream->config.cfg.g_w = w; else stream->config.cfg.g_w = w * stream->config.cfg.g_h / h; } if (!stream->config.cfg.g_h) { stream->config.cfg.g_h = h * stream->config.cfg.g_w / w; } } static const char *file_type_to_string(enum VideoFileType t) { switch (t) { case FILE_TYPE_RAW: return "RAW"; case FILE_TYPE_Y4M: return "Y4M"; default: return "Other"; } } static void show_stream_config(struct stream_state *stream, struct AvxEncoderConfig *global, struct AvxInputContext *input) { #define SHOW(field) \ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) if (stream->index == 0) { fprintf(stderr, "Codec: %s\n", aom_codec_iface_name(global->codec)); fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", input->filename, file_type_to_string(input->file_type), image_format_to_string(input->fmt)); } if (stream->next || stream->index) fprintf(stderr, "\nStream Index: %d\n", stream->index); fprintf(stderr, "Destination file: %s\n", stream->config.out_fn); fprintf(stderr, "Coding path: %s\n", stream->config.use_16bit_internal ? "HBD" : "LBD"); fprintf(stderr, "Encoder parameters:\n"); SHOW(g_usage); SHOW(g_threads); SHOW(g_profile); SHOW(g_w); SHOW(g_h); SHOW(g_bit_depth); SHOW(g_input_bit_depth); SHOW(g_timebase.num); SHOW(g_timebase.den); SHOW(g_error_resilient); SHOW(g_pass); SHOW(g_lag_in_frames); SHOW(large_scale_tile); SHOW(rc_dropframe_thresh); SHOW(rc_resize_mode); SHOW(rc_resize_denominator); SHOW(rc_resize_kf_denominator); SHOW(rc_superres_mode); SHOW(rc_superres_denominator); SHOW(rc_superres_kf_denominator); SHOW(rc_superres_qthresh); SHOW(rc_superres_kf_qthresh); SHOW(rc_end_usage); SHOW(rc_target_bitrate); SHOW(rc_min_quantizer); SHOW(rc_max_quantizer); SHOW(rc_undershoot_pct); SHOW(rc_overshoot_pct); SHOW(rc_buf_sz); SHOW(rc_buf_initial_sz); SHOW(rc_buf_optimal_sz); SHOW(rc_2pass_vbr_bias_pct); SHOW(rc_2pass_vbr_minsection_pct); SHOW(rc_2pass_vbr_maxsection_pct); SHOW(fwd_kf_enabled); SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); #define SHOW_PARAMS(field) \ fprintf(stderr, " %-28s = %d\n", #field, \ stream->config.cfg.encoder_cfg.field) if (global->encoder_config.init_by_cfg_file) { SHOW_PARAMS(super_block_size); SHOW_PARAMS(max_partition_size); SHOW_PARAMS(min_partition_size); SHOW_PARAMS(disable_ab_partition_type); SHOW_PARAMS(disable_rect_partition_type); SHOW_PARAMS(disable_1to4_partition_type); SHOW_PARAMS(disable_flip_idtx); SHOW_PARAMS(disable_cdef); SHOW_PARAMS(disable_lr); SHOW_PARAMS(disable_obmc); SHOW_PARAMS(disable_warp_motion); SHOW_PARAMS(disable_global_motion); SHOW_PARAMS(disable_dist_wtd_comp); SHOW_PARAMS(disable_diff_wtd_comp); SHOW_PARAMS(disable_inter_intra_comp); SHOW_PARAMS(disable_masked_comp); SHOW_PARAMS(disable_one_sided_comp); SHOW_PARAMS(disable_palette); SHOW_PARAMS(disable_intrabc); SHOW_PARAMS(disable_cfl); SHOW_PARAMS(disable_smooth_intra); SHOW_PARAMS(disable_filter_intra); SHOW_PARAMS(disable_dual_filter); SHOW_PARAMS(disable_intra_angle_delta); SHOW_PARAMS(disable_intra_edge_filter); SHOW_PARAMS(disable_tx_64x64); SHOW_PARAMS(disable_smooth_inter_intra); SHOW_PARAMS(disable_inter_inter_wedge); SHOW_PARAMS(disable_inter_intra_wedge); 
SHOW_PARAMS(disable_paeth_intra); SHOW_PARAMS(disable_trellis_quant); SHOW_PARAMS(disable_ref_frame_mv); SHOW_PARAMS(reduced_reference_set); SHOW_PARAMS(reduced_tx_type_set); } } static void open_output_file(struct stream_state *stream, struct AvxEncoderConfig *global, const struct AvxRational *pixel_aspect_ratio, const char *encoder_settings) { const char *fn = stream->config.out_fn; const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; if (cfg->g_pass == AOM_RC_FIRST_PASS) return; stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout); if (!stream->file) fatal("Failed to open output file"); if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) fatal("WebM output to pipes not supported."); #if CONFIG_WEBM_IO if (stream->config.write_webm) { stream->webm_ctx.stream = stream->file; if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg, stream->config.stereo_fmt, get_fourcc_by_aom_encoder(global->codec), pixel_aspect_ratio, encoder_settings) != 0) { fatal("WebM writer initialization failed."); } } #else (void)pixel_aspect_ratio; (void)encoder_settings; #endif if (!stream->config.write_webm && stream->config.write_ivf) { ivf_write_file_header(stream->file, cfg, get_fourcc_by_aom_encoder(global->codec), 0); } } static void close_output_file(struct stream_state *stream, unsigned int fourcc) { const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; if (cfg->g_pass == AOM_RC_FIRST_PASS) return; #if CONFIG_WEBM_IO if (stream->config.write_webm) { if (write_webm_file_footer(&stream->webm_ctx) != 0) { fatal("WebM writer finalization failed."); } } #endif if (!stream->config.write_webm && stream->config.write_ivf) { if (!fseek(stream->file, 0, SEEK_SET)) ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, stream->frames_out); } fclose(stream->file); } static void setup_pass(struct stream_state *stream, struct AvxEncoderConfig *global, int pass) { if (stream->config.stats_fn) { if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) fatal("Failed to open statistics store"); } else { if (!stats_open_mem(&stream->stats, pass)) fatal("Failed to open statistics store"); } if (global->passes == 1) { stream->config.cfg.g_pass = AOM_RC_ONE_PASS; } else { switch (pass) { case 0: stream->config.cfg.g_pass = AOM_RC_FIRST_PASS; break; case 1: stream->config.cfg.g_pass = AOM_RC_SECOND_PASS; break; case 2: stream->config.cfg.g_pass = AOM_RC_THIRD_PASS; break; default: fatal("Failed to set pass"); } } if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); } stream->cx_time = 0; stream->nbytes = 0; stream->frames_out = 0; } static void initialize_encoder(struct stream_state *stream, struct AvxEncoderConfig *global) { int i; int flags = 0; flags |= (global->show_psnr >= 1) ? AOM_CODEC_USE_PSNR : 0; flags |= stream->config.use_16bit_internal ? 
AOM_CODEC_USE_HIGHBITDEPTH : 0; /* Construct Encoder Context */ aom_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg, flags); ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder"); for (i = 0; i < stream->config.arg_ctrl_cnt; i++) { int ctrl = stream->config.arg_ctrls[i][0]; int value = stream->config.arg_ctrls[i][1]; if (aom_codec_control(&stream->encoder, ctrl, value)) fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); ctx_exit_on_error(&stream->encoder, "Failed to control codec"); } for (i = 0; i < stream->config.arg_key_val_cnt; i++) { const char *name = stream->config.arg_key_vals[i][0]; const char *val = stream->config.arg_key_vals[i][1]; if (aom_codec_set_option(&stream->encoder, name, val)) fprintf(stderr, "Error: Tried to set option %s = %s\n", name, val); ctx_exit_on_error(&stream->encoder, "Failed to set codec option"); } #if CONFIG_TUNE_VMAF if (stream->config.vmaf_model_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, stream->config.vmaf_model_path); ctx_exit_on_error(&stream->encoder, "Failed to set vmaf model path"); } #endif if (stream->config.partition_info_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_PARTITION_INFO_PATH, stream->config.partition_info_path); ctx_exit_on_error(&stream->encoder, "Failed to set partition info path"); } if (stream->config.enable_rate_guide_deltaq) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_ENABLE_RATE_GUIDE_DELTAQ, stream->config.enable_rate_guide_deltaq); ctx_exit_on_error(&stream->encoder, "Failed to enable rate guide deltaq"); } if (stream->config.rate_distribution_info) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_RATE_DISTRIBUTION_INFO, stream->config.rate_distribution_info); ctx_exit_on_error(&stream->encoder, "Failed to set rate distribution info"); } if (stream->config.film_grain_filename) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, stream->config.film_grain_filename); ctx_exit_on_error(&stream->encoder, "Failed to set film grain table"); } AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE, stream->config.color_range); ctx_exit_on_error(&stream->encoder, "Failed to set color range"); #if CONFIG_AV1_DECODER if (global->test_decode != TEST_DECODE_OFF) { aom_codec_iface_t *decoder = get_aom_decoder_by_short_name( get_short_name_by_aom_encoder(global->codec)); aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal }; aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0); if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE, stream->config.cfg.large_scale_tile); ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB, stream->config.cfg.save_as_annexb); ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW, -1); ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL, -1); ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col"); } } #endif } // Convert the input image 'img' to a monochrome image. The Y plane of the // output image is a shallow copy of the Y plane of the input image, therefore // the input image must remain valid for the lifetime of the output image. 
The U // and V planes of the output image are set to null pointers. The output image // format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400. static void convert_image_to_monochrome(const struct aom_image *img, struct aom_image *monochrome_img) { *monochrome_img = *img; monochrome_img->fmt = AOM_IMG_FMT_I420; if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH; } monochrome_img->monochrome = 1; monochrome_img->csp = AOM_CSP_UNKNOWN; monochrome_img->x_chroma_shift = 1; monochrome_img->y_chroma_shift = 1; monochrome_img->planes[AOM_PLANE_U] = NULL; monochrome_img->planes[AOM_PLANE_V] = NULL; monochrome_img->stride[AOM_PLANE_U] = 0; monochrome_img->stride[AOM_PLANE_V] = 0; monochrome_img->sz = 0; monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; monochrome_img->img_data = NULL; monochrome_img->img_data_owner = 0; monochrome_img->self_allocd = 0; } static void encode_frame(struct stream_state *stream, struct AvxEncoderConfig *global, struct aom_image *img, unsigned int frames_in) { aom_codec_pts_t frame_start, next_frame_start; struct aom_codec_enc_cfg *cfg = &stream->config.cfg; struct aom_usec_timer timer; frame_start = (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / cfg->g_timebase.num / global->framerate.num; next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / cfg->g_timebase.num / global->framerate.num; /* Scale if necessary */ if (img) { if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { if (img->fmt != AOM_IMG_FMT_I42016) { fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); exit(EXIT_FAILURE); } #if CONFIG_LIBYUV if (!stream->img) { stream->img = aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } I420Scale_16( (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2, (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2, (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2, img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y] / 2, (uint16_t *)stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U] / 2, (uint16_t *)stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; #else stream->encoder.err = 1; ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame.\n" "libyuv is required for scaling but is currently " "disabled.\n" "Be sure to specify -DCONFIG_LIBYUV=1 when running " "cmake.\n", stream->index); #endif } } if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) { fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name); exit(EXIT_FAILURE); } #if CONFIG_LIBYUV if (!stream->img) stream->img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); I420Scale( img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y], img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U], img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h, stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y], stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U], stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V], stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; #else stream->encoder.err = 1; ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode 
frame.\n" "Scaling disabled in this configuration. \n" "To enable, configure with --enable-libyuv\n", stream->index); #endif } struct aom_image monochrome_img; if (img && cfg->monochrome) { convert_image_to_monochrome(img, &monochrome_img); img = &monochrome_img; } aom_usec_timer_start(&timer); aom_codec_encode(&stream->encoder, img, frame_start, (uint32_t)(next_frame_start - frame_start), 0); aom_usec_timer_mark(&timer); stream->cx_time += aom_usec_timer_elapsed(&timer); ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame", stream->index); } static void update_quantizer_histogram(struct stream_state *stream) { if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) { int q; AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64, &q); ctx_exit_on_error(&stream->encoder, "Failed to read quantizer"); stream->counts[q]++; } } static void get_cx_data(struct stream_state *stream, struct AvxEncoderConfig *global, int *got_data) { const aom_codec_cx_pkt_t *pkt; const struct aom_codec_enc_cfg *cfg = &stream->config.cfg; aom_codec_iter_t iter = NULL; *got_data = 0; while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) { static size_t fsize = 0; static FileOffset ivf_header_pos = 0; switch (pkt->kind) { case AOM_CODEC_CX_FRAME_PKT: ++stream->frames_out; if (!global->quiet) fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz); update_rate_histogram(stream->rate_hist, cfg, pkt); #if CONFIG_WEBM_IO if (stream->config.write_webm) { if (write_webm_block(&stream->webm_ctx, cfg, pkt) != 0) { fatal("WebM writer failed."); } } #endif if (!stream->config.write_webm) { if (stream->config.write_ivf) { if (pkt->data.frame.partition_id <= 0) { ivf_header_pos = ftello(stream->file); fsize = pkt->data.frame.sz; ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize); } else { fsize += pkt->data.frame.sz; const FileOffset currpos = ftello(stream->file); fseeko(stream->file, ivf_header_pos, SEEK_SET); ivf_write_frame_size(stream->file, fsize); fseeko(stream->file, currpos, SEEK_SET); } } (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, stream->file); } stream->nbytes += pkt->data.raw.sz; *got_data = 1; #if CONFIG_AV1_DECODER if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) { aom_codec_decode(&stream->decoder, pkt->data.frame.buf, pkt->data.frame.sz, NULL); if (stream->decoder.err) { warn_or_exit_on_error(&stream->decoder, global->test_decode == TEST_DECODE_FATAL, "Failed to decode frame %d in stream %d", stream->frames_out + 1, stream->index); stream->mismatch_seen = stream->frames_out + 1; } } #endif break; case AOM_CODEC_STATS_PKT: stream->frames_out++; stats_write(&stream->stats, pkt->data.twopass_stats.buf, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; case AOM_CODEC_PSNR_PKT: if (global->show_psnr >= 1) { int i; stream->psnr_sse_total[0] += pkt->data.psnr.sse[0]; stream->psnr_samples_total[0] += pkt->data.psnr.samples[0]; for (i = 0; i < 4; i++) { if (!global->quiet) fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); stream->psnr_totals[0][i] += pkt->data.psnr.psnr[i]; } stream->psnr_count[0]++; #if CONFIG_AV1_HIGHBITDEPTH if (stream->config.cfg.g_input_bit_depth < (unsigned int)stream->config.cfg.g_bit_depth) { stream->psnr_sse_total[1] += pkt->data.psnr.sse_hbd[0]; stream->psnr_samples_total[1] += pkt->data.psnr.samples_hbd[0]; for (i = 0; i < 4; i++) { if (!global->quiet) fprintf(stderr, "%.3f ", pkt->data.psnr.psnr_hbd[i]); stream->psnr_totals[1][i] += pkt->data.psnr.psnr_hbd[i]; } 
stream->psnr_count[1]++; } #endif } break; default: break; } } } static void show_psnr(struct stream_state *stream, double peak, int64_t bps) { int i; double ovpsnr; if (!stream->psnr_count[0]) return; fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[0], peak, (double)stream->psnr_sse_total[0]); fprintf(stderr, " %.3f", ovpsnr); for (i = 0; i < 4; i++) { fprintf(stderr, " %.3f", stream->psnr_totals[0][i] / stream->psnr_count[0]); } if (bps > 0) { fprintf(stderr, " %7" PRId64 " bps", bps); } fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); fprintf(stderr, "\n"); } #if CONFIG_AV1_HIGHBITDEPTH static void show_psnr_hbd(struct stream_state *stream, double peak, int64_t bps) { int i; double ovpsnr; // Compute PSNR based on stream bit depth if (!stream->psnr_count[1]) return; fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[1], peak, (double)stream->psnr_sse_total[1]); fprintf(stderr, " %.3f", ovpsnr); for (i = 0; i < 4; i++) { fprintf(stderr, " %.3f", stream->psnr_totals[1][i] / stream->psnr_count[1]); } if (bps > 0) { fprintf(stderr, " %7" PRId64 " bps", bps); } fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); fprintf(stderr, "\n"); } #endif static float usec_to_fps(uint64_t usec, unsigned int frames) { return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); } static void test_decode(struct stream_state *stream, enum TestDecodeFatality fatal) { aom_image_t enc_img, dec_img; if (stream->mismatch_seen) return; /* Get the internal reference frame */ AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img); AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img); if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t enc_hbd_img; aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, enc_img.d_w, enc_img.d_h, 16); aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); enc_img = enc_hbd_img; } if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t dec_hbd_img; aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, dec_img.d_w, dec_img.d_h, 16); aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); dec_img = dec_hbd_img; } } ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); if (!aom_compare_img(&enc_img, &dec_img)) { int y[4], u[4], v[4]; if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); } else { aom_find_mismatch(&enc_img, &dec_img, y, u, v); } stream->decoder.err = 1; warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, "Stream %d: Encode/decode mismatch on frame %d at" " Y[%d, %d] {%d/%d}," " U[%d, %d] {%d/%d}," " V[%d, %d] {%d/%d}", stream->index, stream->frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); stream->mismatch_seen = stream->frames_out; } aom_img_free(&enc_img); aom_img_free(&dec_img); } static void print_time(const char *label, int64_t etl) { int64_t hours; int64_t mins; int64_t secs; if (etl >= 0) { hours = etl / 3600; etl -= hours * 3600; mins = etl / 60; etl -= mins * 60; secs = etl; fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label, hours, mins, secs); } else { fprintf(stderr, "[%3s unknown] ", label); } } 
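// Reset the per-stream statistics accumulated during an encoding pass
// (PSNR running totals/counts and the quantizer histogram) so that the
// next pass starts from a clean state.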
static void clear_stream_count_state(struct stream_state *stream) { // PSNR counters for (int k = 0; k < 2; k++) { stream->psnr_sse_total[k] = 0; stream->psnr_samples_total[k] = 0; for (int i = 0; i < 4; i++) { stream->psnr_totals[k][i] = 0; } stream->psnr_count[k] = 0; } // q hist memset(stream->counts, 0, sizeof(stream->counts)); } // aomenc will downscale the second pass if: // 1. the specific pass is not given by commandline (aomenc will perform all // passes) // 2. there are more than 2 passes in total // 3. current pass is the second pass (the parameter pass starts with 0 so // pass == 1) static int pass_need_downscale(int global_pass, int global_passes, int pass) { return !global_pass && global_passes > 2 && pass == 1; } int main(int argc, const char **argv_) { int pass; aom_image_t raw; aom_image_t raw_shift; int allocated_raw_shift = 0; int do_16bit_internal = 0; int input_shift = 0; int frame_avail, got_data; struct AvxInputContext input; struct AvxEncoderConfig global; struct stream_state *streams = NULL; char **argv, **argi; uint64_t cx_time = 0; int stream_cnt = 0; int res = 0; int profile_updated = 0; memset(&input, 0, sizeof(input)); memset(&raw, 0, sizeof(raw)); exec_name = argv_[0]; /* Setup default input stream settings */ input.framerate.numerator = 30; input.framerate.denominator = 1; input.only_i420 = 1; input.bit_depth = 0; /* First parse the global configuration values, because we want to apply * other parameters on top of the default configuration provided by the * codec. */ argv = argv_dup(argc - 1, argv_ + 1); if (!argv) { fprintf(stderr, "Error allocating argument list\n"); return EXIT_FAILURE; } parse_global_config(&global, &argv); if (argc < 2) usage_exit(); switch (global.color_type) { case I420: input.fmt = AOM_IMG_FMT_I420; break; case I422: input.fmt = AOM_IMG_FMT_I422; break; case I444: input.fmt = AOM_IMG_FMT_I444; break; case YV12: input.fmt = AOM_IMG_FMT_YV12; break; case NV12: input.fmt = AOM_IMG_FMT_NV12; break; } { /* Now parse each stream's parameters. Using a local scope here * due to the use of 'stream' as loop variable in FOREACH_STREAM * loops */ struct stream_state *stream = NULL; do { stream = new_stream(&global, stream); stream_cnt++; if (!streams) streams = stream; } while (parse_stream_params(&global, stream, argv)); } /* Check for unrecognized options */ for (argi = argv; *argi; argi++) if (argi[0][0] == '-' && argi[0][1]) die("Error: Unrecognized option %s\n", *argi); FOREACH_STREAM(stream, streams) { check_encoder_config(global.disable_warning_prompt, &global, &stream->config.cfg); // If large_scale_tile = 1, only support to output to ivf format. if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf) die("only support ivf output format while large-scale-tile=1\n"); } /* Handle non-option arguments */ input.filename = argv[0]; const char *orig_input_filename = input.filename; FOREACH_STREAM(stream, streams) { stream->orig_out_fn = stream->config.out_fn; stream->orig_width = stream->config.cfg.g_w; stream->orig_height = stream->config.cfg.g_h; stream->orig_write_ivf = stream->config.write_ivf; stream->orig_write_webm = stream->config.write_webm; } if (!input.filename) { fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) input.only_i420 = 0; for (pass = global.pass ? 
global.pass - 1 : 0; pass < global.passes; pass++) { if (pass > 1) { FOREACH_STREAM(stream, streams) { clear_stream_count_state(stream); } } int frames_in = 0, seen_frames = 0; int64_t estimated_time_left = -1; int64_t average_rate = -1; int64_t lagged_count = 0; const int need_downscale = pass_need_downscale(global.pass, global.passes, pass); // Set the output to the specified two-pass output file, and // restore the width and height to the original values. FOREACH_STREAM(stream, streams) { if (need_downscale) { stream->config.out_fn = stream->config.two_pass_output; // Libaom currently only supports the ivf format for the third pass. stream->config.write_ivf = 1; stream->config.write_webm = 0; } else { stream->config.out_fn = stream->orig_out_fn; stream->config.write_ivf = stream->orig_write_ivf; stream->config.write_webm = stream->orig_write_webm; } stream->config.cfg.g_w = stream->orig_width; stream->config.cfg.g_h = stream->orig_height; } // For second pass in three-pass encoding, set the input to // the given two-pass-input file if available. If the scaled input is not // given, we will attempt to re-scale the original input. input.filename = orig_input_filename; const char *two_pass_input = NULL; if (need_downscale) { FOREACH_STREAM(stream, streams) { if (stream->config.two_pass_input) { two_pass_input = stream->config.two_pass_input; input.filename = two_pass_input; break; } } } open_input_file(&input, global.csp); /* If the input file doesn't specify its w/h (raw files), try to get * the data from the first stream's configuration. */ if (!input.width || !input.height) { if (two_pass_input) { FOREACH_STREAM(stream, streams) { if (stream->config.two_pass_width && stream->config.two_pass_height) { input.width = stream->config.two_pass_width; input.height = stream->config.two_pass_height; break; } } } else { FOREACH_STREAM(stream, streams) { if (stream->config.cfg.g_w && stream->config.cfg.g_h) { input.width = stream->config.cfg.g_w; input.height = stream->config.cfg.g_h; break; } } } } /* Update stream configurations from the input file's parameters */ if (!input.width || !input.height) { if (two_pass_input) { fatal( "Specify downscaled stream dimensions with --two-pass-width " " and --two-pass-height"); } else { fatal( "Specify stream dimensions with --width (-w) " " and --height (-h)"); } } if (need_downscale) { FOREACH_STREAM(stream, streams) { if (stream->config.two_pass_width && stream->config.two_pass_height) { stream->config.cfg.g_w = stream->config.two_pass_width; stream->config.cfg.g_h = stream->config.two_pass_height; } else if (two_pass_input) { stream->config.cfg.g_w = input.width; stream->config.cfg.g_h = input.height; } else if (stream->orig_width && stream->orig_height) { #if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL stream->config.cfg.g_w = stream->orig_width; stream->config.cfg.g_h = stream->orig_height; #else // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL stream->config.cfg.g_w = (stream->orig_width + 1) / 2; stream->config.cfg.g_h = (stream->orig_height + 1) / 2; #endif // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL } else { #if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL stream->config.cfg.g_w = input.width; stream->config.cfg.g_h = input.height; #else // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL stream->config.cfg.g_w = (input.width + 1) / 2; stream->config.cfg.g_h = (input.height + 1) / 2; #endif // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL } } } /* If input file does not specify bit-depth but 
input-bit-depth parameter * exists, assume that to be the input bit-depth. However, if the * input-bit-depth parameter does not exist, assume the input bit-depth * to be the same as the codec bit-depth. */ if (!input.bit_depth) { FOREACH_STREAM(stream, streams) { if (stream->config.cfg.g_input_bit_depth) input.bit_depth = stream->config.cfg.g_input_bit_depth; else input.bit_depth = stream->config.cfg.g_input_bit_depth = (int)stream->config.cfg.g_bit_depth; } if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH; } else { FOREACH_STREAM(stream, streams) { stream->config.cfg.g_input_bit_depth = input.bit_depth; } } FOREACH_STREAM(stream, streams) { if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016 && input.fmt != AOM_IMG_FMT_NV12) { /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile was selected. */ switch (stream->config.cfg.g_profile) { case 0: if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 || input.fmt == AOM_IMG_FMT_I44416)) { if (!stream->config.cfg.monochrome) { stream->config.cfg.g_profile = 1; profile_updated = 1; } } else if (input.bit_depth == 12 || ((input.fmt == AOM_IMG_FMT_I422 || input.fmt == AOM_IMG_FMT_I42216) && !stream->config.cfg.monochrome)) { stream->config.cfg.g_profile = 2; profile_updated = 1; } break; case 1: if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 || input.fmt == AOM_IMG_FMT_I42216) { stream->config.cfg.g_profile = 2; profile_updated = 1; } else if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I420 || input.fmt == AOM_IMG_FMT_I42016)) { stream->config.cfg.g_profile = 0; profile_updated = 1; } break; case 2: if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 || input.fmt == AOM_IMG_FMT_I44416)) { stream->config.cfg.g_profile = 1; profile_updated = 1; } else if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I420 || input.fmt == AOM_IMG_FMT_I42016)) { stream->config.cfg.g_profile = 0; profile_updated = 1; } else if (input.bit_depth == 12 && input.file_type == FILE_TYPE_Y4M) { // Note that here the input file values for chroma subsampling // are used instead of those from the command line. AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, input.y4m.dst_c_dec_h >> 1); ctx_exit_on_error(&stream->encoder, "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, input.y4m.dst_c_dec_v >> 1); ctx_exit_on_error(&stream->encoder, "Failed to set chroma subsampling y"); } else if (input.bit_depth == 12 && input.file_type == FILE_TYPE_RAW) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, stream->chroma_subsampling_x); ctx_exit_on_error(&stream->encoder, "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, stream->chroma_subsampling_y); ctx_exit_on_error(&stream->encoder, "Failed to set chroma subsampling y"); } break; default: break; } } /* Automatically set the codec bit depth to match the input bit depth. * Upgrade the profile if required.
*/ if (stream->config.cfg.g_input_bit_depth > (unsigned int)stream->config.cfg.g_bit_depth) { stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth; if (!global.quiet) { fprintf(stderr, "Warning: automatically updating bit depth to %d to " "match input format.\n", stream->config.cfg.g_input_bit_depth); } } #if !CONFIG_AV1_HIGHBITDEPTH if (stream->config.cfg.g_bit_depth > 8) { fatal("Unsupported bit-depth with CONFIG_AV1_HIGHBITDEPTH=0\n"); } #endif // CONFIG_AV1_HIGHBITDEPTH if (stream->config.cfg.g_bit_depth > 10) { switch (stream->config.cfg.g_profile) { case 0: case 1: stream->config.cfg.g_profile = 2; profile_updated = 1; break; default: break; } } if (stream->config.cfg.g_bit_depth > 8) { stream->config.use_16bit_internal = 1; } if (profile_updated && !global.quiet) { fprintf(stderr, "Warning: automatically updating to profile %d to " "match input format.\n", stream->config.cfg.g_profile); } if (global.show_psnr == 2 && stream->config.cfg.g_input_bit_depth == (unsigned int)stream->config.cfg.g_bit_depth) { fprintf(stderr, "Warning: --psnr==2 and --psnr==1 will provide same " "results when input bit-depth == stream bit-depth, " "falling back to default psnr value\n"); global.show_psnr = 1; } if (global.show_psnr < 0 || global.show_psnr > 2) { fprintf(stderr, "Warning: --psnr can take only 0,1,2 as values," "falling back to default psnr value\n"); global.show_psnr = 1; } /* Set limit */ stream->config.cfg.g_limit = global.limit; } FOREACH_STREAM(stream, streams) { set_stream_dimensions(stream, input.width, input.height); stream->config.color_range = input.color_range; } FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); } /* Ensure that --passes and --pass are consistent. If --pass is set and * --passes >= 2, ensure --fpf was set. */ if (global.pass > 0 && global.pass <= 3 && global.passes >= 2) { FOREACH_STREAM(stream, streams) { if (!stream->config.stats_fn) die("Stream %d: Must specify --fpf when --pass=%d" " and --passes=%d\n", stream->index, global.pass, global.passes); } } #if !CONFIG_WEBM_IO FOREACH_STREAM(stream, streams) { if (stream->config.write_webm) { stream->config.write_webm = 0; stream->config.write_ivf = 0; aom_tools_warn("aomenc compiled w/o WebM support. Writing OBU stream."); } } #endif /* Use the frame rate from the file only if none was specified * on the command-line. */ if (!global.have_framerate) { global.framerate.num = input.framerate.numerator; global.framerate.den = input.framerate.denominator; } FOREACH_STREAM(stream, streams) { stream->config.cfg.g_timebase.den = global.framerate.num; stream->config.cfg.g_timebase.num = global.framerate.den; } /* Show configuration */ if (global.verbose && pass == 0) { FOREACH_STREAM(stream, streams) { show_stream_config(stream, &global, &input); } } if (pass == (global.pass ? global.pass - 1 : 0)) { // The Y4M reader does its own allocation. if (input.file_type != FILE_TYPE_Y4M) { aom_img_alloc(&raw, input.fmt, input.width, input.height, 32); } FOREACH_STREAM(stream, streams) { stream->rate_hist = init_rate_histogram(&stream->config.cfg, &global.framerate); } } FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); } FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); } FOREACH_STREAM(stream, streams) { char *encoder_settings = NULL; #if CONFIG_WEBM_IO // Test frameworks may compare outputs from different versions, but only // wish to check for bitstream changes. 
The encoder-settings tag, however, // can vary if the version is updated, even if no encoder algorithm // changes were made. To work around this issue, do not output // the encoder-settings tag when --debug is enabled (which is the flag // that test frameworks should use, when they want deterministic output // from the container format). if (stream->config.write_webm && !stream->webm_ctx.debug) { encoder_settings = extract_encoder_settings( aom_codec_version_str(), argv_, argc, input.filename); if (encoder_settings == NULL) { fprintf( stderr, "Warning: unable to extract encoder settings. Continuing...\n"); } } #endif open_output_file(stream, &global, &input.pixel_aspect_ratio, encoder_settings); free(encoder_settings); } if (strcmp(get_short_name_by_aom_encoder(global.codec), "av1") == 0) { // Check to see if at least one stream uses 16 bit internal. // Currently assume that the bit_depths for all streams using // highbitdepth are the same. FOREACH_STREAM(stream, streams) { if (stream->config.use_16bit_internal) { do_16bit_internal = 1; } input_shift = (int)stream->config.cfg.g_bit_depth - stream->config.cfg.g_input_bit_depth; } } frame_avail = 1; got_data = 0; while (frame_avail || got_data) { struct aom_usec_timer timer; if (!global.limit || frames_in < global.limit) { frame_avail = read_frame(&input, &raw); if (frame_avail) frames_in++; seen_frames = frames_in > global.skip_frames ? frames_in - global.skip_frames : 0; if (!global.quiet) { float fps = usec_to_fps(cx_time, seen_frames); fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); if (stream_cnt == 1) fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, streams->frames_out, (int64_t)streams->nbytes); else fprintf(stderr, "frame %4d ", frames_in); fprintf(stderr, "%7" PRId64 " %s %.2f %s ", cx_time > 9999999 ? cx_time / 1000 : cx_time, cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, fps >= 1.0 ? "fps" : "fpm"); print_time("ETA", estimated_time_left); // mingw-w64 gcc does not match msvc for stderr buffering behavior // and uses line buffering, thus the progress output is not // real-time. The fflush() is here to make sure the progress output // is sent out while the clip is being processed. fflush(stderr); } } else { frame_avail = 0; } if (frames_in > global.skip_frames) { aom_image_t *frame_to_encode; if (input_shift || (do_16bit_internal && input.bit_depth == 8)) { assert(do_16bit_internal); // Input bit depth and stream bit depth do not match, so up // shift frame to stream bit depth if (!allocated_raw_shift) { aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH, input.width, input.height, 32); allocated_raw_shift = 1; } aom_img_upshift(&raw_shift, &raw, input_shift); frame_to_encode = &raw_shift; } else { frame_to_encode = &raw; } aom_usec_timer_start(&timer); if (do_16bit_internal) { assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH); FOREACH_STREAM(stream, streams) { if (stream->config.use_16bit_internal) encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL, frames_in); else assert(0); } } else { assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0); FOREACH_STREAM(stream, streams) { encode_frame(stream, &global, frame_avail ? 
frame_to_encode : NULL, frames_in); } } aom_usec_timer_mark(&timer); cx_time += aom_usec_timer_elapsed(&timer); FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); } got_data = 0; FOREACH_STREAM(stream, streams) { get_cx_data(stream, &global, &got_data); } if (!got_data && input.length && streams != NULL && !streams->frames_out) { lagged_count = global.limit ? seen_frames : ftello(input.file); } else if (input.length) { int64_t remaining; int64_t rate; if (global.limit) { const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; remaining = 1000 * (global.limit - global.skip_frames - seen_frames + lagged_count); } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; const int64_t input_limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; remaining = input_limit - input_pos + lagged_count; } average_rate = (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; estimated_time_left = average_rate ? remaining / average_rate : -1; } if (got_data && global.test_decode != TEST_DECODE_OFF) { FOREACH_STREAM(stream, streams) { test_decode(stream, global.test_decode); } } } fflush(stdout); if (!global.quiet) fprintf(stderr, "\033[K"); } if (stream_cnt > 1) fprintf(stderr, "\n"); if (!global.quiet) { FOREACH_STREAM(stream, streams) { const int64_t bpf = seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0; const int64_t bps = bpf * global.framerate.num / global.framerate.den; fprintf(stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 "b/s" " %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, bpf, bps, stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time, stream->cx_time > 9999999 ? "ms" : "us", usec_to_fps(stream->cx_time, seen_frames)); // This instance of cr does not need fflush as it is followed by a // newline in the same string. } } if (global.show_psnr >= 1) { if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) { FOREACH_STREAM(stream, streams) { int64_t bps = 0; if (global.show_psnr == 1) { if (stream->psnr_count[0] && seen_frames && global.framerate.den) { bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / global.framerate.den / seen_frames; } show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1, bps); } if (global.show_psnr == 2) { #if CONFIG_AV1_HIGHBITDEPTH if (stream->config.cfg.g_input_bit_depth < (unsigned int)stream->config.cfg.g_bit_depth) show_psnr_hbd(stream, (1 << stream->config.cfg.g_bit_depth) - 1, bps); #endif } } } else { FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); } } } if (pass == global.passes - 1) { FOREACH_STREAM(stream, streams) { int num_operating_points; int levels[32]; int target_levels[32]; aom_codec_control(&stream->encoder, AV1E_GET_NUM_OPERATING_POINTS, &num_operating_points); aom_codec_control(&stream->encoder, AV1E_GET_SEQ_LEVEL_IDX, levels); aom_codec_control(&stream->encoder, AV1E_GET_TARGET_SEQ_LEVEL_IDX, target_levels); for (int i = 0; i < num_operating_points; i++) { if (levels[i] > target_levels[i]) { if (levels[i] == 31) { aom_tools_warn( "Failed to encode to target level %d.%d for operating point " "%d. The output level is SEQ_LEVEL_MAX", 2 + (target_levels[i] >> 2), target_levels[i] & 3, i); } else { aom_tools_warn( "Failed to encode to target level %d.%d for operating point " "%d. 
The output level is %d.%d", 2 + (target_levels[i] >> 2), target_levels[i] & 3, i, 2 + (levels[i] >> 2), levels[i] & 3); } } } } } FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); } if (global.test_decode != TEST_DECODE_OFF) { FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); } } close_input_file(&input); if (global.test_decode == TEST_DECODE_FATAL) { FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; } } FOREACH_STREAM(stream, streams) { close_output_file(stream, get_fourcc_by_aom_encoder(global.codec)); } FOREACH_STREAM(stream, streams) { stats_close(&stream->stats, global.passes - 1); } if (global.pass) break; } if (global.show_q_hist_buckets) { FOREACH_STREAM(stream, streams) { show_q_histogram(stream->counts, global.show_q_hist_buckets); } } if (global.show_rate_hist_buckets) { FOREACH_STREAM(stream, streams) { show_rate_histogram(stream->rate_hist, &stream->config.cfg, global.show_rate_hist_buckets); } } FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); } #if CONFIG_INTERNAL_STATS /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now, * to match some existing utilities. */ if (!(global.pass == 1 && global.passes == 2)) { FOREACH_STREAM(stream, streams) { FILE *f = fopen("opsnr.stt", "a"); if (stream->mismatch_seen) { fprintf(f, "First mismatch occurred in frame %d\n", stream->mismatch_seen); } else { fprintf(f, "No mismatch detected in recon buffers\n"); } fclose(f); } } #endif if (allocated_raw_shift) aom_img_free(&raw_shift); aom_img_free(&raw); free(argv); free(streams); return res ? EXIT_FAILURE : EXIT_SUCCESS; } aom-3.12.1/apps/aomenc.h000066400000000000000000000030751477627663500147640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_APPS_AOMENC_H_ #define AOM_APPS_AOMENC_H_ #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #include "av1/arg_defs.h" #ifdef __cplusplus extern "C" { #endif typedef enum { I420, // 4:2:0 8+ bit-depth I422, // 4:2:2 8+ bit-depth I444, // 4:4:4 8+ bit-depth YV12, // 4:2:0 with uv flipped, only 8-bit depth NV12, // 4:2:0 with uv interleaved, only 8-bit depth } ColorInputType; /* Configuration elements common to all streams. */ struct AvxEncoderConfig { aom_codec_iface_t *codec; int passes; int pass; unsigned int usage; ColorInputType color_type; int quiet; int verbose; int limit; int skip_frames; int show_psnr; enum TestDecodeFatality test_decode; int have_framerate; struct aom_rational framerate; int debug; int show_q_hist_buckets; int show_rate_hist_buckets; int disable_warnings; int disable_warning_prompt; int experimental_bitstream; aom_chroma_sample_position_t csp; cfg_options_t encoder_config; }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_APPS_AOMENC_H_ aom-3.12.1/av1/000077500000000000000000000000001477627663500130705ustar00rootroot00000000000000aom-3.12.1/av1/arg_defs.c000066400000000000000000001042441477627663500150130ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/arg_defs.h" static const struct arg_enum_list test_decode_enum[] = { { "off", TEST_DECODE_OFF }, { "fatal", TEST_DECODE_FATAL }, { "warn", TEST_DECODE_WARN }, { NULL, 0 } }; static const struct arg_enum_list bitdepth_enum[] = { { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 } }; #if CONFIG_WEBM_IO static const struct arg_enum_list stereo_mode_enum[] = { { "mono", STEREO_FORMAT_MONO }, { "left-right", STEREO_FORMAT_LEFT_RIGHT }, { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, { "right-left", STEREO_FORMAT_RIGHT_LEFT }, { NULL, 0 } }; #endif static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR }, { "cbr", AOM_CBR }, { "cq", AOM_CQ }, { "q", AOM_Q }, { NULL, 0 } }; static const struct arg_enum_list tuning_enum[] = { { "psnr", AOM_TUNE_PSNR }, { "ssim", AOM_TUNE_SSIM }, { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING }, { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING }, { "vmaf", AOM_TUNE_VMAF_MAX_GAIN }, { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN }, { "butteraugli", AOM_TUNE_BUTTERAUGLI }, { "vmaf_saliency_map", AOM_TUNE_VMAF_SALIENCY_MAP }, { "iq", AOM_TUNE_IQ }, { NULL, 0 } }; static const struct arg_enum_list dist_metric_enum[] = { { "psnr", AOM_DIST_METRIC_PSNR }, { "qm-psnr", AOM_DIST_METRIC_QM_PSNR }, { NULL, 0 } }; #if CONFIG_AV1_ENCODER static const struct arg_enum_list timing_info_enum[] = { { "unspecified", AOM_TIMING_UNSPECIFIED }, { "constant", AOM_TIMING_EQUAL }, { "model", AOM_TIMING_DEC_MODEL }, { NULL, 0 } }; static const struct arg_enum_list superblock_size_enum[] = { { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC }, { "64", AOM_SUPERBLOCK_SIZE_64X64 }, { "128", AOM_SUPERBLOCK_SIZE_128X128 }, { NULL, 0 } }; static const struct arg_enum_list matrix_coefficients_enum[] = { { "identity", AOM_CICP_MC_IDENTITY }, { "bt709", AOM_CICP_MC_BT_709 }, { "unspecified", AOM_CICP_MC_UNSPECIFIED }, { "fcc73", AOM_CICP_MC_FCC }, { "bt470bg", AOM_CICP_MC_BT_470_B_G }, { "bt601", AOM_CICP_MC_BT_601 }, { "smpte240", AOM_CICP_CP_SMPTE_240 }, { "ycgco", AOM_CICP_MC_SMPTE_YCGCO }, { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL }, { "bt2020cl", AOM_CICP_MC_BT_2020_CL }, { "smpte2085", AOM_CICP_MC_SMPTE_2085 }, { "chromncl", AOM_CICP_MC_CHROMAT_NCL }, { "chromcl", AOM_CICP_MC_CHROMAT_CL }, { "ictcp", AOM_CICP_MC_ICTCP }, { NULL, 0 } }; static const struct arg_enum_list chroma_sample_position_enum[] = { { "unknown", AOM_CSP_UNKNOWN }, { "vertical", AOM_CSP_VERTICAL }, { "colocated", AOM_CSP_COLOCATED }, { NULL, 0 } }; static const struct arg_enum_list tune_content_enum[] = { { "default", AOM_CONTENT_DEFAULT }, { "screen", AOM_CONTENT_SCREEN }, { "film", AOM_CONTENT_FILM }, { NULL, 0 } }; static const struct arg_enum_list transfer_characteristics_enum[] = { { "unspecified", AOM_CICP_CP_UNSPECIFIED }, { "bt709", AOM_CICP_TC_BT_709 }, { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G }, { "bt601", AOM_CICP_TC_BT_601 }, { "smpte240", AOM_CICP_TC_SMPTE_240 }, { "lin", AOM_CICP_TC_LINEAR }, { "log100", AOM_CICP_TC_LOG_100 }, 
{ "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 }, { "iec61966", AOM_CICP_TC_IEC_61966 }, { "bt1361", AOM_CICP_TC_BT_1361 }, { "srgb", AOM_CICP_TC_SRGB }, { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT }, { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT }, { "smpte2084", AOM_CICP_TC_SMPTE_2084 }, { "hlg", AOM_CICP_TC_HLG }, { "smpte428", AOM_CICP_TC_SMPTE_428 }, { NULL, 0 } }; static const struct arg_enum_list color_primaries_enum[] = { { "bt709", AOM_CICP_CP_BT_709 }, { "unspecified", AOM_CICP_CP_UNSPECIFIED }, { "bt601", AOM_CICP_CP_BT_601 }, { "bt470m", AOM_CICP_CP_BT_470_M }, { "bt470bg", AOM_CICP_CP_BT_470_B_G }, { "smpte240", AOM_CICP_CP_SMPTE_240 }, { "film", AOM_CICP_CP_GENERIC_FILM }, { "bt2020", AOM_CICP_CP_BT_2020 }, { "xyz", AOM_CICP_CP_XYZ }, { "smpte431", AOM_CICP_CP_SMPTE_431 }, { "smpte432", AOM_CICP_CP_SMPTE_432 }, { "ebu3213", AOM_CICP_CP_EBU_3213 }, { NULL, 0 } }; #endif // CONFIG_AV1_ENCODER const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"), .debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"), .outputfile = ARG_DEF("o", "output", 1, "Output filename"), .use_nv12 = ARG_DEF(NULL, "nv12", 0, "Input file is NV12"), .use_yv12 = ARG_DEF(NULL, "yv12", 0, "Input file is YV12"), .use_i420 = ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"), .use_i422 = ARG_DEF(NULL, "i422", 0, "Input file is I422"), .use_i444 = ARG_DEF(NULL, "i444", 0, "Input file is I444"), .codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"), .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2/3)"), .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2/3)"), .fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"), .limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"), .skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"), .good_dl = ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"), .rt_dl = ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"), .ai_dl = ARG_DEF(NULL, "allintra", 0, "Use all intra mode"), .quietarg = ARG_DEF("q", "quiet", 0, "Do not print encode progress"), .verbosearg = ARG_DEF("v", "verbose", 0, "Show encoder parameters"), .psnrarg = ARG_DEF( NULL, "psnr", -1, "Show PSNR in status line " "(0: Disable PSNR status line display, 1: PSNR calculated using input " "bit-depth (default), 2: PSNR calculated using stream bit-depth); " "takes default option when arguments are not specified"), .use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"), .recontest = ARG_DEF_ENUM(NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum), .framerate = ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"), .use_webm = ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"), .use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"), .use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"), .q_hist_n = ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"), .rate_hist_n = ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"), .disable_warnings = ARG_DEF(NULL, "disable-warnings", 0, "Disable warnings about potentially incorrect encode settings"), .disable_warning_prompt = ARG_DEF("y", "disable-warning-prompt", 0, "Display warnings, but do not prompt user to continue"), .bitdeptharg = ARG_DEF_ENUM("b", "bit-depth", 1, "Bit depth for codec", bitdepth_enum), .inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"), .input_chroma_subsampling_x = ARG_DEF(NULL, "input-chroma-subsampling-x", 1, 
"Chroma subsampling x value"), .input_chroma_subsampling_y = ARG_DEF(NULL, "input-chroma-subsampling-y", 1, "Chroma subsampling y value"), .usage = ARG_DEF("u", "usage", 1, "Usage profile number to use (0: good, 1: rt, 2: allintra)"), .threads = ARG_DEF("t", "threads", 1, "Max number of threads to use"), .profile = ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"), .width = ARG_DEF("w", "width", 1, "Frame width"), .height = ARG_DEF("h", "height", 1, "Frame height"), .forced_max_frame_width = ARG_DEF(NULL, "forced_max_frame_width", 1, "Maximum frame width value to force"), .forced_max_frame_height = ARG_DEF(NULL, "forced_max_frame_height", 1, "Maximum frame height value to force"), #if CONFIG_WEBM_IO .stereo_mode = ARG_DEF_ENUM(NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum), #endif .timebase = ARG_DEF(NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"), .global_error_resilient = ARG_DEF(NULL, "global-error-resilient", 1, "Enable global error resiliency features"), .lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"), .large_scale_tile = ARG_DEF( NULL, "large-scale-tile", 1, "Large scale tile coding (0: off (default), 1: on (ivf output only))"), .monochrome = ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"), .full_still_picture_hdr = ARG_DEF(NULL, "full-still-picture-hdr", 0, "Use full header for still picture"), .use_16bit_internal = ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline"), .dropframe_thresh = ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"), .resize_mode = ARG_DEF( NULL, "resize-mode", 1, "Frame resize mode (0: off (default), 1: fixed, 2: random, 3: dynamic)"), .resize_denominator = ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"), .resize_kf_denominator = ARG_DEF(NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator"), .superres_mode = ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode (0: disabled (default), 1: fixed, " "2: random, 3: qthresh, 4: auto)"), .superres_denominator = ARG_DEF(NULL, "superres-denominator", 1, "Frame super-resolution denominator"), .superres_kf_denominator = ARG_DEF(NULL, "superres-kf-denominator", 1, "Frame super-resolution keyframe denominator"), .superres_qthresh = ARG_DEF(NULL, "superres-qthresh", 1, "Frame super-resolution qindex threshold"), .superres_kf_qthresh = ARG_DEF(NULL, "superres-kf-qthresh", 1, "Frame super-resolution keyframe qindex threshold"), .end_usage = ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum), .target_bitrate = ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"), .min_quantizer = ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"), .max_quantizer = ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"), .undershoot_pct = ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"), .overshoot_pct = ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"), .buf_sz = ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"), .buf_initial_sz = ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"), .buf_optimal_sz = ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"), .bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"), .minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"), .maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"), .fwd_kf_enabled = ARG_DEF(NULL, 
"enable-fwd-kf", 1, "Enable forward reference keyframes"), .kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"), .kf_max_dist = ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"), .kf_disabled = ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"), .sframe_dist = ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)"), .sframe_mode = ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"), .save_as_annexb = ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"), .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"), .sharpness = ARG_DEF(NULL, "sharpness", 1, "Bias towards block sharpness in rate-distortion optimization of " "transform coefficients and (in all intra mode only) reduce " "block edge filtering for better sharpness (0..7), default is 0"), .static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"), .auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"), .arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"), .arnr_strength = ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"), .tune_metric = ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", tuning_enum), .dist_metric = ARG_DEF_ENUM( NULL, "dist-metric", 1, "Distortion metric to use for in-block optimization", dist_metric_enum), .cq_level = ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"), .max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"), #if CONFIG_AV1_ENCODER .cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1, "Speed setting (0..6 in good mode, 5..11 in realtime " "mode, 0..9 in all intra mode)"), .rowmtarg = ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading (0: off, 1: on (default))"), .fpmtarg = ARG_DEF( NULL, "fp-mt", 1, "Enable frame parallel multi-threading (0: off (default), 1: on)"), .tile_cols = ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"), .tile_rows = ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"), .auto_tiles = ARG_DEF(NULL, "auto-tiles", 1, "Enable auto tiles (0: false (default), 1: true)"), .enable_tpl_model = ARG_DEF(NULL, "enable-tpl-model", 1, "RDO based on frame temporal dependency " "(0: off, 1: backward source based); " "required for deltaq mode"), .enable_keyframe_filtering = ARG_DEF( NULL, "enable-keyframe-filtering", 1, "Apply temporal filtering on key frame " "(0: no filter, 1: filter without overlay (default), " "2: filter with overlay - experimental, may break random access in " "players)"), .tile_width = ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"), .tile_height = ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)"), .lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"), .enable_cdef = ARG_DEF( NULL, "enable-cdef", 1, "Enable the constrained directional enhancement filter (0: false, " "1: true (default), 2: disable for non-reference frames, 3: enable " "adaptively based on frame qindex)"), .enable_restoration = ARG_DEF(NULL, "enable-restoration", 1, "Enable the loop restoration filter (0: false " "(default in realtime mode), " "1: true (default in non-realtime mode))"), .enable_rect_partitions = ARG_DEF(NULL, "enable-rect-partitions", 1, "Enable rectangular partitions " "(0: false, 1: true (default))"), .enable_ab_partitions = ARG_DEF(NULL, "enable-ab-partitions", 1, "Enable ab partitions (0: false, 1: true (default))"), 
.enable_1to4_partitions = ARG_DEF(NULL, "enable-1to4-partitions", 1, "Enable 1:4 and 4:1 partitions " "(0: false, 1: true (default))"), .min_partition_size = ARG_DEF(NULL, "min-partition-size", 1, "Set min partition size " "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128); " "with 4k+ resolutions or higher speed settings, min " "partition size will have a minimum of 8"), .max_partition_size = ARG_DEF(NULL, "max-partition-size", 1, "Set max partition size " "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"), .enable_dual_filter = ARG_DEF(NULL, "enable-dual-filter", 1, "Enable dual filter " "(0: false, 1: true (default))"), .enable_chroma_deltaq = ARG_DEF(NULL, "enable-chroma-deltaq", 1, "Enable chroma delta quant " "(0: false (default), 1: true)"), .enable_intra_edge_filter = ARG_DEF(NULL, "enable-intra-edge-filter", 1, "Enable intra edge filtering " "(0: false, 1: true (default))"), .enable_order_hint = ARG_DEF(NULL, "enable-order-hint", 1, "Enable order hint " "(0: false, 1: true (default))"), .enable_tx64 = ARG_DEF(NULL, "enable-tx64", 1, "Enable 64-pt transform (0: false, 1: true (default))"), .enable_flip_idtx = ARG_DEF(NULL, "enable-flip-idtx", 1, "Enable extended transform type (0: false, 1: true (default)) " "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, " "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, " "H_ADST, V_FLIPADST, H_FLIPADST"), .enable_rect_tx = ARG_DEF(NULL, "enable-rect-tx", 1, "Enable rectangular transform (0: false, 1: true (default))"), .enable_dist_wtd_comp = ARG_DEF(NULL, "enable-dist-wtd-comp", 1, "Enable distance-weighted compound " "(0: false, 1: true (default))"), .enable_masked_comp = ARG_DEF(NULL, "enable-masked-comp", 1, "Enable masked (wedge/diff-wtd) compound " "(0: false, 1: true (default))"), .enable_onesided_comp = ARG_DEF(NULL, "enable-onesided-comp", 1, "Enable one sided compound " "(0: false, 1: true (default))"), .enable_interintra_comp = ARG_DEF(NULL, "enable-interintra-comp", 1, "Enable interintra compound " "(0: false, 1: true (default))"), .enable_smooth_interintra = ARG_DEF(NULL, "enable-smooth-interintra", 1, "Enable smooth interintra mode " "(0: false, 1: true (default))"), .enable_diff_wtd_comp = ARG_DEF(NULL, "enable-diff-wtd-comp", 1, "Enable difference-weighted compound " "(0: false, 1: true (default))"), .enable_interinter_wedge = ARG_DEF(NULL, "enable-interinter-wedge", 1, "Enable interinter wedge compound " "(0: false, 1: true (default))"), .enable_interintra_wedge = ARG_DEF(NULL, "enable-interintra-wedge", 1, "Enable interintra wedge compound " "(0: false, 1: true (default))"), .enable_global_motion = ARG_DEF(NULL, "enable-global-motion", 1, "Enable global motion " "(0: false, 1: true (default))"), .enable_warped_motion = ARG_DEF(NULL, "enable-warped-motion", 1, "Enable local warped motion " "(0: false, 1: true (default))"), .enable_filter_intra = ARG_DEF(NULL, "enable-filter-intra", 1, "Enable filter intra prediction mode " "(0: false, 1: true (default))"), .enable_smooth_intra = ARG_DEF(NULL, "enable-smooth-intra", 1, "Enable smooth intra prediction modes " "(0: false, 1: true (default))"), .enable_paeth_intra = ARG_DEF( NULL, "enable-paeth-intra", 1, "Enable Paeth intra prediction mode (0: false, 1: true (default))"), .enable_cfl_intra = ARG_DEF(NULL, "enable-cfl-intra", 1, "Enable chroma from luma intra prediction mode " "(0: false, 1: true (default))"), .enable_directional_intra = ARG_DEF(NULL, "enable-directional-intra", 1, "Enable directional intra prediction modes " "(0: false, 1: true 
(default))"), .enable_diagonal_intra = ARG_DEF(NULL, "enable-diagonal-intra", 1, "Enable diagonal (D45 to D203) intra prediction modes, which are " "a subset of directional modes; has no effect if " "enable-directional-intra is 0 (0: false, 1: true (default))"), .force_video_mode = ARG_DEF( NULL, "force-video-mode", 1, "Force video mode even for a single frame (0: false (default), 1: true)"), .enable_obmc = ARG_DEF(NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))"), .enable_overlay = ARG_DEF(NULL, "enable-overlay", 1, "Enable coding overlay frames (0: false, 1: true (default))"), .enable_palette = ARG_DEF(NULL, "enable-palette", 1, "Enable palette prediction mode (0: false, 1: true (default))"), .enable_intrabc = ARG_DEF(NULL, "enable-intrabc", 1, "Enable intra block copy prediction mode " "(0: false, 1: true (default))"), .enable_angle_delta = ARG_DEF(NULL, "enable-angle-delta", 1, "Enable intra angle delta (0: false, 1: true (default))"), .disable_trellis_quant = ARG_DEF( NULL, "disable-trellis-quant", 1, "Disable trellis optimization of quantized coefficients (0: false " "1: true 2: true for rd search 3: true for estimate yrd search " "(default))"), .enable_qm = ARG_DEF(NULL, "enable-qm", 1, "Enable quantisation matrices (0: false (default), 1: true)"), .qm_min = ARG_DEF( NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 5 (4 for all intra mode)"), .qm_max = ARG_DEF(NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 9 (10 for " "all intra mode)"), .reduced_tx_type_set = ARG_DEF(NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types"), .use_intra_dct_only = ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"), .use_inter_dct_only = ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"), .use_intra_default_tx_only = ARG_DEF(NULL, "use-intra-default-tx-only", 1, "Use Default-transform only for INTRA modes"), .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"), .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1, "Update freq for coeff costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1, "Update freq for mode costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1, "Update freq for mv costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1, "Update freq for dv costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .num_tg = ARG_DEF(NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1"), .mtu_size = ARG_DEF(NULL, "mtu-size", 1, "MTU size for a tile group, default is 0 (no MTU targeting), " "overrides maximum number of tile groups"), .timing_info = ARG_DEF_ENUM( NULL, "timing-info", 1, "Signal timing info in the bitstream (model only works for no " "hidden frames, no super-res yet):", timing_info_enum), #if CONFIG_TUNE_VMAF .vmaf_model_path = ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"), #endif .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1, "Partition information read and write path"), .enable_rate_guide_deltaq = ARG_DEF(NULL, "enable-rate-guide-deltaq", 1, "Enable rate guide deltaq (1), by default off (0). " "It requires --deltaq-mode=3. " "If turned on, it requires an input file specified " "by --rate-distribution-info."), .rate_distribution_info = ARG_DEF(NULL, "rate-distribution-info", 1, "Rate distribution information input." 
"It requires --enable-rate-guide-deltaq=1."), .film_grain_test = ARG_DEF( NULL, "film-grain-test", 1, "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " "... 16: test-16)"), .film_grain_table = ARG_DEF(NULL, "film-grain-table", 1, "Path to file containing film grain parameters"), #if CONFIG_DENOISE .denoise_noise_level = ARG_DEF(NULL, "denoise-noise-level", 1, "Amount of noise (from 0 = don't denoise, to 50)"), .denoise_block_size = ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)"), .enable_dnl_denoising = ARG_DEF(NULL, "enable-dnl-denoising", 1, "Apply denoising to the frame " "being encoded when denoise-noise-level is " "enabled (0: false, 1: true (default))"), #endif .enable_ref_frame_mvs = ARG_DEF(NULL, "enable-ref-frame-mvs", 1, "Enable temporal mv prediction (default is 1)"), .frame_parallel_decoding = ARG_DEF(NULL, "frame-parallel", 1, "Enable frame parallel decodability features " "(0: false (default), 1: true)"), .error_resilient_mode = ARG_DEF(NULL, "error-resilient", 1, "Enable error resilient features " "(0: false (default), 1: true)"), .aq_mode = ARG_DEF(NULL, "aq-mode", 1, "Adaptive quantization mode (0: off (default), 1: " "variance 2: complexity, " "3: cyclic refresh)"), .deltaq_mode = ARG_DEF(NULL, "deltaq-mode", 1, "Delta qindex mode (0: off, 1: deltaq objective (default), " "2: deltaq placeholder, 3: key frame visual quality, 4: user " "rating based visual quality optimization, 5: HDR video, 6: " "Variance Boost all intra); requires --enable-tpl-model=1"), .deltaq_strength = ARG_DEF(NULL, "deltaq-strength", 1, "Deltaq strength for" " --deltaq-mode=4 (%)"), .deltalf_mode = ARG_DEF(NULL, "delta-lf-mode", 1, "Enable delta-lf-mode (0: off (default), 1: on)"), .frame_periodic_boost = ARG_DEF(NULL, "frame-boost", 1, "Enable frame periodic boost (0: off (default), 1: on)"), .gf_cbr_boost_pct = ARG_DEF(NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"), .max_inter_rate_pct = ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"), .min_gf_interval = ARG_DEF( NULL, "min-gf-interval", 1, "Min gf/arf frame interval (default 0, indicating in-built behavior)"), .max_gf_interval = ARG_DEF( NULL, "max-gf-interval", 1, "Max gf/arf frame interval (default 0, indicating in-built behavior)"), .gf_min_pyr_height = ARG_DEF(NULL, "gf-min-pyr-height", 1, "Min height for GF group pyramid structure (0 (default) to 5)"), .gf_max_pyr_height = ARG_DEF( NULL, "gf-max-pyr-height", 1, "Maximum height for GF group pyramid structure (0 to 5 (default))"), .max_reference_frames = ARG_DEF(NULL, "max-reference-frames", 1, "Maximum number of reference frames allowed " "per frame (3 to 7 (default))"), .reduced_reference_set = ARG_DEF(NULL, "reduced-reference-set", 1, "Use reduced set of single and compound references (0: off " "(default), 1: on)"), .target_seq_level_idx = ARG_DEF(NULL, "target-seq-level-idx", 1, "Target sequence level index. " "Possible values are in the form of \"ABxy\". " "AB: Operating point (OP) index, " "xy: Target level index for the OP. " "E.g. \"0\" means target level index 0 (2.0) for the 0th OP, " "\"1019\" means target level index 19 (6.3) for the 10th OP."), .set_min_cr = ARG_DEF( NULL, "min-cr", 1, "Set minimum compression ratio. Take integer values. Default is 0. 
" "If non-zero, encoder will try to keep the compression ratio of " "each frame to be higher than the given value divided by 100."), .input_color_primaries = ARG_DEF_ENUM( NULL, "color-primaries", 1, "Color primaries (CICP) of input content:", color_primaries_enum), .input_transfer_characteristics = ARG_DEF_ENUM(NULL, "transfer-characteristics", 1, "Transfer characteristics (CICP) of input content:", transfer_characteristics_enum), .input_matrix_coefficients = ARG_DEF_ENUM( NULL, "matrix-coefficients", 1, "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum), .input_chroma_sample_position = ARG_DEF_ENUM(NULL, "chroma-sample-position", 1, "The chroma sample position when chroma 4:2:0 is signaled:", chroma_sample_position_enum), .tune_content = ARG_DEF_ENUM(NULL, "tune-content", 1, "Tune content type", tune_content_enum), .cdf_update_mode = ARG_DEF(NULL, "cdf-update-mode", 1, "CDF update mode for entropy coding " "(0: no CDF update, 1: update CDF on all frames (default), " "2: selectively update CDF on some frames)"), .superblock_size = ARG_DEF_ENUM(NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum), .set_tier_mask = ARG_DEF(NULL, "set-tier-mask", 1, "Set bit mask to specify which tier each of the 32 possible " "operating points conforms to. " "Bit value 0 (default): Main Tier, 1: High Tier."), .use_fixed_qp_offsets = ARG_DEF(NULL, "use-fixed-qp-offsets", 1, "Enable fixed QP offsets for frames at different levels of the " "pyramid. Selected automatically from --cq-level if " "--fixed-qp-offsets is not provided. If this option is not " "specified (default), offsets are adaptively chosen by the " "encoder."), .fixed_qp_offsets = ARG_DEF( NULL, "fixed-qp-offsets", 1, "Set fixed QP offsets for frames at different levels of the " "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, " "and 3 levels of internal alt-refs. If this option is not " "specified (default), offsets are adaptively chosen by the " "encoder."), .vbr_corpus_complexity_lap = ARG_DEF( NULL, "vbr-corpus-complexity-lap", 1, "Set average corpus complexity per mb for single pass VBR using lap. " "(0..10000), default is 0"), .fwd_kf_dist = ARG_DEF(NULL, "fwd-kf-dist", -1, "Set distance between forward keyframes. A value of " "-1 (default) means no repetitive forward keyframes."), .enable_tx_size_search = ARG_DEF( NULL, "enable-tx-size-search", 1, "Enable transform size search to find the best size for each block. " "If false, transforms always have the largest possible size " "(0: false, 1: true (default)). 
Ignored in non rd pick mode in " "real-time coding."), .loopfilter_control = ARG_DEF( NULL, "loopfilter-control", 1, "Control loop filtering " "(0: Loopfilter disabled for all frames, 1: Enable loopfilter for all " "frames (default), 2: Disable loopfilter for non-reference frames, 3: " "Disable loopfilter for frames with low motion)"), .auto_intra_tools_off = ARG_DEF(NULL, "auto-intra-tools-off", 1, "Automatically turn off several intra coding tools for all intra " "mode; only in effect if --deltaq-mode=3"), .two_pass_input = ARG_DEF(NULL, "two-pass-input", 1, "The input file for the second pass for three-pass encoding"), .two_pass_output = ARG_DEF( NULL, "two-pass-output", 1, "The output file for the first two passes for three-pass encoding"), .two_pass_width = ARG_DEF(NULL, "two-pass-width", 1, "The width of two-pass-input"), .two_pass_height = ARG_DEF(NULL, "two-pass-height", 1, "The height of two-pass-input"), .second_pass_log = ARG_DEF("spf", "second-pass-log", 1, "Log file from second pass"), .strict_level_conformance = ARG_DEF(NULL, "strict-level-conformance", 1, "When set to 1, exit the encoder when it fails to encode " "to a given target level"), .kf_max_pyr_height = ARG_DEF( NULL, "kf-max-pyr-height", 1, "Maximum height of pyramid structure used for the GOP starting with a " "key frame (-1 to 5). When set to -1 (default), it does not have any " "effect. The actual maximum pyramid height will be the minimum of this " "value and the value of gf_max_pyr_height."), .sb_qp_sweep = ARG_DEF(NULL, "sb-qp-sweep", 1, "When set to 1, enable the superblock level qp sweep for a " "given lambda to minimize the rdcost."), #endif // CONFIG_AV1_ENCODER }; aom-3.12.1/av1/arg_defs.h000066400000000000000000000153751477627663500150260ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ARG_DEFS_H_ #define AOM_AV1_ARG_DEFS_H_ #ifdef __cplusplus extern "C" { #endif #include "config/aom_config.h" #include "common/args_helper.h" #if CONFIG_WEBM_IO #include "common/webmenc.h" #endif #include "aom/aomcx.h" enum TestDecodeFatality { TEST_DECODE_OFF, TEST_DECODE_FATAL, TEST_DECODE_WARN, }; typedef struct av1_codec_arg_definitions { arg_def_t help; arg_def_t debugmode; arg_def_t outputfile; arg_def_t use_nv12; arg_def_t use_yv12; arg_def_t use_i420; arg_def_t use_i422; arg_def_t use_i444; arg_def_t codecarg; arg_def_t passes; arg_def_t pass_arg; arg_def_t fpf_name; arg_def_t limit; arg_def_t skip; arg_def_t good_dl; arg_def_t rt_dl; arg_def_t ai_dl; arg_def_t quietarg; arg_def_t verbosearg; arg_def_t psnrarg; arg_def_t use_cfg; arg_def_t recontest; arg_def_t framerate; arg_def_t use_webm; arg_def_t use_ivf; arg_def_t use_obu; arg_def_t q_hist_n; arg_def_t rate_hist_n; arg_def_t disable_warnings; arg_def_t disable_warning_prompt; arg_def_t bitdeptharg; arg_def_t inbitdeptharg; arg_def_t input_chroma_subsampling_x; arg_def_t input_chroma_subsampling_y; arg_def_t usage; arg_def_t threads; arg_def_t profile; arg_def_t width; arg_def_t height; arg_def_t forced_max_frame_width; arg_def_t forced_max_frame_height; #if CONFIG_WEBM_IO arg_def_t stereo_mode; #endif arg_def_t timebase; arg_def_t global_error_resilient; arg_def_t lag_in_frames; arg_def_t large_scale_tile; arg_def_t monochrome; arg_def_t full_still_picture_hdr; arg_def_t use_16bit_internal; arg_def_t dropframe_thresh; arg_def_t resize_mode; arg_def_t resize_denominator; arg_def_t resize_kf_denominator; arg_def_t superres_mode; arg_def_t superres_denominator; arg_def_t superres_kf_denominator; arg_def_t superres_qthresh; arg_def_t superres_kf_qthresh; arg_def_t end_usage; arg_def_t target_bitrate; arg_def_t min_quantizer; arg_def_t max_quantizer; arg_def_t undershoot_pct; arg_def_t overshoot_pct; arg_def_t buf_sz; arg_def_t buf_initial_sz; arg_def_t buf_optimal_sz; arg_def_t bias_pct; arg_def_t minsection_pct; arg_def_t maxsection_pct; arg_def_t fwd_kf_enabled; arg_def_t kf_min_dist; arg_def_t kf_max_dist; arg_def_t kf_disabled; arg_def_t sframe_dist; arg_def_t sframe_mode; arg_def_t save_as_annexb; arg_def_t noise_sens; arg_def_t sharpness; arg_def_t static_thresh; arg_def_t auto_altref; arg_def_t arnr_maxframes; arg_def_t arnr_strength; arg_def_t tune_metric; arg_def_t dist_metric; arg_def_t cq_level; arg_def_t max_intra_rate_pct; #if CONFIG_AV1_ENCODER arg_def_t cpu_used_av1; arg_def_t rowmtarg; arg_def_t fpmtarg; arg_def_t tile_cols; arg_def_t tile_rows; arg_def_t auto_tiles; arg_def_t enable_tpl_model; arg_def_t enable_keyframe_filtering; arg_def_t tile_width; arg_def_t tile_height; arg_def_t lossless; arg_def_t enable_cdef; arg_def_t enable_restoration; arg_def_t enable_rect_partitions; arg_def_t enable_ab_partitions; arg_def_t enable_1to4_partitions; arg_def_t min_partition_size; arg_def_t max_partition_size; arg_def_t enable_dual_filter; arg_def_t enable_chroma_deltaq; arg_def_t enable_intra_edge_filter; arg_def_t enable_order_hint; arg_def_t enable_tx64; arg_def_t enable_flip_idtx; arg_def_t enable_rect_tx; arg_def_t enable_dist_wtd_comp; arg_def_t enable_masked_comp; arg_def_t enable_onesided_comp; arg_def_t enable_interintra_comp; arg_def_t enable_smooth_interintra; arg_def_t enable_diff_wtd_comp; arg_def_t enable_interinter_wedge; arg_def_t enable_interintra_wedge; arg_def_t enable_global_motion; arg_def_t enable_warped_motion; arg_def_t enable_filter_intra; arg_def_t enable_smooth_intra; 
arg_def_t enable_paeth_intra; arg_def_t enable_cfl_intra; arg_def_t enable_directional_intra; arg_def_t enable_diagonal_intra; arg_def_t force_video_mode; arg_def_t enable_obmc; arg_def_t enable_overlay; arg_def_t enable_palette; arg_def_t enable_intrabc; arg_def_t enable_angle_delta; arg_def_t disable_trellis_quant; arg_def_t enable_qm; arg_def_t qm_min; arg_def_t qm_max; arg_def_t reduced_tx_type_set; arg_def_t use_intra_dct_only; arg_def_t use_inter_dct_only; arg_def_t use_intra_default_tx_only; arg_def_t quant_b_adapt; arg_def_t coeff_cost_upd_freq; arg_def_t mode_cost_upd_freq; arg_def_t mv_cost_upd_freq; arg_def_t dv_cost_upd_freq; arg_def_t num_tg; arg_def_t mtu_size; arg_def_t timing_info; #if CONFIG_TUNE_VMAF arg_def_t vmaf_model_path; #endif arg_def_t partition_info_path; arg_def_t enable_rate_guide_deltaq; arg_def_t rate_distribution_info; arg_def_t film_grain_test; arg_def_t film_grain_table; #if CONFIG_DENOISE arg_def_t denoise_noise_level; arg_def_t denoise_block_size; arg_def_t enable_dnl_denoising; #endif arg_def_t enable_ref_frame_mvs; arg_def_t frame_parallel_decoding; arg_def_t error_resilient_mode; arg_def_t aq_mode; arg_def_t deltaq_mode; arg_def_t deltaq_strength; arg_def_t deltalf_mode; arg_def_t frame_periodic_boost; arg_def_t gf_cbr_boost_pct; arg_def_t max_inter_rate_pct; arg_def_t min_gf_interval; arg_def_t max_gf_interval; arg_def_t gf_min_pyr_height; arg_def_t gf_max_pyr_height; arg_def_t max_reference_frames; arg_def_t reduced_reference_set; arg_def_t target_seq_level_idx; arg_def_t set_min_cr; arg_def_t input_color_primaries; arg_def_t input_transfer_characteristics; arg_def_t input_matrix_coefficients; arg_def_t input_chroma_sample_position; arg_def_t tune_content; arg_def_t cdf_update_mode; arg_def_t superblock_size; arg_def_t set_tier_mask; arg_def_t use_fixed_qp_offsets; arg_def_t fixed_qp_offsets; arg_def_t vbr_corpus_complexity_lap; arg_def_t fwd_kf_dist; arg_def_t enable_tx_size_search; arg_def_t loopfilter_control; arg_def_t two_pass_input; arg_def_t two_pass_output; arg_def_t two_pass_width; arg_def_t two_pass_height; arg_def_t second_pass_log; arg_def_t auto_intra_tools_off; arg_def_t strict_level_conformance; arg_def_t kf_max_pyr_height; arg_def_t sb_qp_sweep; #endif // CONFIG_AV1_ENCODER } av1_codec_arg_definitions_t; extern const av1_codec_arg_definitions_t g_av1_codec_arg_defs; #ifdef __cplusplus } #endif #endif // AOM_AV1_ARG_DEFS_H_ aom-3.12.1/av1/av1.cmake000066400000000000000000001127021477627663500145640ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# if(AOM_AV1_AV1_CMAKE_) return() endif() # AOM_AV1_AV1_CMAKE_ set(AOM_AV1_AV1_CMAKE_ 1) list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/common/args_helper.h" "${AOM_ROOT}/common/args_helper.c" "${AOM_ROOT}/av1/arg_defs.h" "${AOM_ROOT}/av1/arg_defs.c" "${AOM_ROOT}/av1/av1_iface_common.h" "${AOM_ROOT}/av1/common/alloccommon.c" "${AOM_ROOT}/av1/common/alloccommon.h" "${AOM_ROOT}/av1/common/av1_common_int.h" "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" "${AOM_ROOT}/av1/common/av1_loopfilter.c" "${AOM_ROOT}/av1/common/av1_loopfilter.h" "${AOM_ROOT}/av1/common/av1_txfm.c" "${AOM_ROOT}/av1/common/av1_txfm.h" "${AOM_ROOT}/av1/common/blockd.c" "${AOM_ROOT}/av1/common/blockd.h" "${AOM_ROOT}/av1/common/cdef.c" "${AOM_ROOT}/av1/common/cdef.h" "${AOM_ROOT}/av1/common/cdef_block.c" "${AOM_ROOT}/av1/common/cdef_block.h" "${AOM_ROOT}/av1/common/cfl.c" "${AOM_ROOT}/av1/common/cfl.h" "${AOM_ROOT}/av1/common/common.h" "${AOM_ROOT}/av1/common/common_data.c" "${AOM_ROOT}/av1/common/common_data.h" "${AOM_ROOT}/av1/common/convolve.c" "${AOM_ROOT}/av1/common/convolve.h" "${AOM_ROOT}/av1/common/debugmodes.c" "${AOM_ROOT}/av1/common/entropy.c" "${AOM_ROOT}/av1/common/entropy.h" "${AOM_ROOT}/av1/common/entropymode.c" "${AOM_ROOT}/av1/common/entropymode.h" "${AOM_ROOT}/av1/common/entropymv.c" "${AOM_ROOT}/av1/common/entropymv.h" "${AOM_ROOT}/av1/common/enums.h" "${AOM_ROOT}/av1/common/filter.h" "${AOM_ROOT}/av1/common/frame_buffers.c" "${AOM_ROOT}/av1/common/frame_buffers.h" "${AOM_ROOT}/av1/common/idct.c" "${AOM_ROOT}/av1/common/idct.h" "${AOM_ROOT}/av1/common/mv.h" "${AOM_ROOT}/av1/common/mvref_common.c" "${AOM_ROOT}/av1/common/mvref_common.h" "${AOM_ROOT}/av1/common/obu_util.c" "${AOM_ROOT}/av1/common/obu_util.h" "${AOM_ROOT}/av1/common/pred_common.c" "${AOM_ROOT}/av1/common/pred_common.h" "${AOM_ROOT}/av1/common/quant_common.c" "${AOM_ROOT}/av1/common/quant_common.h" "${AOM_ROOT}/av1/common/reconinter.c" "${AOM_ROOT}/av1/common/reconinter.h" "${AOM_ROOT}/av1/common/reconinter_template.inc" "${AOM_ROOT}/av1/common/reconintra.c" "${AOM_ROOT}/av1/common/reconintra.h" "${AOM_ROOT}/av1/common/resize.c" "${AOM_ROOT}/av1/common/resize.h" "${AOM_ROOT}/av1/common/restoration.c" "${AOM_ROOT}/av1/common/restoration.h" "${AOM_ROOT}/av1/common/scale.c" "${AOM_ROOT}/av1/common/scale.h" "${AOM_ROOT}/av1/common/scan.c" "${AOM_ROOT}/av1/common/scan.h" "${AOM_ROOT}/av1/common/seg_common.c" "${AOM_ROOT}/av1/common/seg_common.h" "${AOM_ROOT}/av1/common/thread_common.c" "${AOM_ROOT}/av1/common/thread_common.h" "${AOM_ROOT}/av1/common/tile_common.c" "${AOM_ROOT}/av1/common/tile_common.h" "${AOM_ROOT}/av1/common/timing.c" "${AOM_ROOT}/av1/common/timing.h" "${AOM_ROOT}/av1/common/token_cdfs.h" "${AOM_ROOT}/av1/common/txb_common.c" "${AOM_ROOT}/av1/common/txb_common.h" "${AOM_ROOT}/av1/common/warped_motion.c" "${AOM_ROOT}/av1/common/warped_motion.h") list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/av1_dx_iface.c" "${AOM_ROOT}/av1/decoder/decodeframe.c" "${AOM_ROOT}/av1/decoder/decodeframe.h" "${AOM_ROOT}/av1/decoder/decodemv.c" "${AOM_ROOT}/av1/decoder/decodemv.h" "${AOM_ROOT}/av1/decoder/decoder.c" "${AOM_ROOT}/av1/decoder/decoder.h" "${AOM_ROOT}/av1/decoder/decodetxb.c" "${AOM_ROOT}/av1/decoder/decodetxb.h" "${AOM_ROOT}/av1/decoder/detokenize.c" "${AOM_ROOT}/av1/decoder/detokenize.h" "${AOM_ROOT}/av1/decoder/dthread.h" "${AOM_ROOT}/av1/decoder/grain_synthesis.c" "${AOM_ROOT}/av1/decoder/grain_synthesis.h" 
"${AOM_ROOT}/av1/decoder/obu.h" "${AOM_ROOT}/av1/decoder/obu.c") list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/av1_cx_iface.c" "${AOM_ROOT}/av1/av1_cx_iface.h" "${AOM_ROOT}/av1/encoder/aq_complexity.c" "${AOM_ROOT}/av1/encoder/aq_complexity.h" "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" "${AOM_ROOT}/av1/encoder/aq_variance.c" "${AOM_ROOT}/av1/encoder/aq_variance.h" "${AOM_ROOT}/av1/encoder/allintra_vis.c" "${AOM_ROOT}/av1/encoder/allintra_vis.h" "${AOM_ROOT}/av1/encoder/enc_enums.h" "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c" "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h" "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h" "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c" "${AOM_ROOT}/av1/encoder/av1_quantize.c" "${AOM_ROOT}/av1/encoder/av1_quantize.h" "${AOM_ROOT}/av1/encoder/bitstream.c" "${AOM_ROOT}/av1/encoder/bitstream.h" "${AOM_ROOT}/av1/encoder/block.h" "${AOM_ROOT}/av1/encoder/cnn.c" "${AOM_ROOT}/av1/encoder/cnn.h" "${AOM_ROOT}/av1/encoder/compound_type.c" "${AOM_ROOT}/av1/encoder/compound_type.h" "${AOM_ROOT}/av1/encoder/context_tree.c" "${AOM_ROOT}/av1/encoder/context_tree.h" "${AOM_ROOT}/av1/encoder/cost.c" "${AOM_ROOT}/av1/encoder/cost.h" "${AOM_ROOT}/av1/encoder/encodeframe.c" "${AOM_ROOT}/av1/encoder/encodeframe.h" "${AOM_ROOT}/av1/encoder/encodeframe_utils.c" "${AOM_ROOT}/av1/encoder/encodeframe_utils.h" "${AOM_ROOT}/av1/encoder/encodemb.c" "${AOM_ROOT}/av1/encoder/encodemb.h" "${AOM_ROOT}/av1/encoder/encodemv.c" "${AOM_ROOT}/av1/encoder/encodemv.h" "${AOM_ROOT}/av1/encoder/encode_strategy.c" "${AOM_ROOT}/av1/encoder/encode_strategy.h" "${AOM_ROOT}/av1/encoder/encoder.c" "${AOM_ROOT}/av1/encoder/encoder.h" "${AOM_ROOT}/av1/encoder/encoder_alloc.h" "${AOM_ROOT}/av1/encoder/encoder_utils.c" "${AOM_ROOT}/av1/encoder/encoder_utils.h" "${AOM_ROOT}/av1/encoder/encodetxb.c" "${AOM_ROOT}/av1/encoder/encodetxb.h" "${AOM_ROOT}/av1/encoder/ethread.c" "${AOM_ROOT}/av1/encoder/ethread.h" "${AOM_ROOT}/av1/encoder/extend.c" "${AOM_ROOT}/av1/encoder/extend.h" "${AOM_ROOT}/av1/encoder/external_partition.c" "${AOM_ROOT}/av1/encoder/external_partition.h" "${AOM_ROOT}/av1/encoder/firstpass.c" "${AOM_ROOT}/av1/encoder/firstpass.h" "${AOM_ROOT}/av1/encoder/global_motion.c" "${AOM_ROOT}/av1/encoder/global_motion.h" "${AOM_ROOT}/av1/encoder/global_motion_facade.c" "${AOM_ROOT}/av1/encoder/global_motion_facade.h" "${AOM_ROOT}/av1/encoder/gop_structure.c" "${AOM_ROOT}/av1/encoder/gop_structure.h" "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" "${AOM_ROOT}/av1/encoder/hash.c" "${AOM_ROOT}/av1/encoder/hash.h" "${AOM_ROOT}/av1/encoder/hash_motion.c" "${AOM_ROOT}/av1/encoder/hash_motion.h" "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" "${AOM_ROOT}/av1/encoder/interp_search.c" "${AOM_ROOT}/av1/encoder/interp_search.h" "${AOM_ROOT}/av1/encoder/level.c" "${AOM_ROOT}/av1/encoder/level.h" "${AOM_ROOT}/av1/encoder/lookahead.c" "${AOM_ROOT}/av1/encoder/lookahead.h" "${AOM_ROOT}/av1/encoder/mcomp.c" "${AOM_ROOT}/av1/encoder/mcomp.h" "${AOM_ROOT}/av1/encoder/mcomp_structs.h" "${AOM_ROOT}/av1/encoder/ml.c" "${AOM_ROOT}/av1/encoder/ml.h" "${AOM_ROOT}/av1/encoder/model_rd.h" "${AOM_ROOT}/av1/encoder/motion_search_facade.c" "${AOM_ROOT}/av1/encoder/motion_search_facade.h" "${AOM_ROOT}/av1/encoder/mv_prec.c" "${AOM_ROOT}/av1/encoder/mv_prec.h" "${AOM_ROOT}/av1/encoder/palette.c" "${AOM_ROOT}/av1/encoder/palette.h" "${AOM_ROOT}/av1/encoder/partition_search.h" "${AOM_ROOT}/av1/encoder/partition_search.c" 
"${AOM_ROOT}/av1/encoder/partition_strategy.h" "${AOM_ROOT}/av1/encoder/partition_strategy.c" "${AOM_ROOT}/av1/encoder/pass2_strategy.h" "${AOM_ROOT}/av1/encoder/pass2_strategy.c" "${AOM_ROOT}/av1/encoder/pickcdef.c" "${AOM_ROOT}/av1/encoder/pickcdef.h" "${AOM_ROOT}/av1/encoder/picklpf.c" "${AOM_ROOT}/av1/encoder/picklpf.h" "${AOM_ROOT}/av1/encoder/pickrst.c" "${AOM_ROOT}/av1/encoder/pickrst.h" "${AOM_ROOT}/av1/encoder/ratectrl.c" "${AOM_ROOT}/av1/encoder/ratectrl.h" "${AOM_ROOT}/av1/encoder/rc_utils.h" "${AOM_ROOT}/av1/encoder/rd.c" "${AOM_ROOT}/av1/encoder/rd.h" "${AOM_ROOT}/av1/encoder/rdopt.c" "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c" "${AOM_ROOT}/av1/encoder/nonrd_opt.c" "${AOM_ROOT}/av1/encoder/nonrd_opt.h" "${AOM_ROOT}/av1/encoder/rdopt.h" "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h" "${AOM_ROOT}/av1/encoder/rdopt_utils.h" "${AOM_ROOT}/av1/encoder/reconinter_enc.c" "${AOM_ROOT}/av1/encoder/reconinter_enc.h" "${AOM_ROOT}/av1/encoder/segmentation.c" "${AOM_ROOT}/av1/encoder/segmentation.h" "${AOM_ROOT}/av1/encoder/sorting_network.h" "${AOM_ROOT}/av1/encoder/speed_features.c" "${AOM_ROOT}/av1/encoder/speed_features.h" "${AOM_ROOT}/av1/encoder/superres_scale.c" "${AOM_ROOT}/av1/encoder/superres_scale.h" "${AOM_ROOT}/av1/encoder/svc_layercontext.c" "${AOM_ROOT}/av1/encoder/svc_layercontext.h" "${AOM_ROOT}/av1/encoder/temporal_filter.c" "${AOM_ROOT}/av1/encoder/temporal_filter.h" "${AOM_ROOT}/av1/encoder/tokenize.c" "${AOM_ROOT}/av1/encoder/tokenize.h" "${AOM_ROOT}/av1/encoder/tpl_model.c" "${AOM_ROOT}/av1/encoder/tpl_model.h" "${AOM_ROOT}/av1/encoder/tx_search.c" "${AOM_ROOT}/av1/encoder/tx_search.h" "${AOM_ROOT}/av1/encoder/txb_rdopt.c" "${AOM_ROOT}/av1/encoder/txb_rdopt.h" "${AOM_ROOT}/av1/encoder/txb_rdopt_utils.h" "${AOM_ROOT}/av1/encoder/intra_mode_search.c" "${AOM_ROOT}/av1/encoder/intra_mode_search.h" "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h" "${AOM_ROOT}/av1/encoder/wedge_utils.c" "${AOM_ROOT}/av1/encoder/var_based_part.c" "${AOM_ROOT}/av1/encoder/var_based_part.h" "${AOM_ROOT}/av1/encoder/av1_noise_estimate.c" "${AOM_ROOT}/av1/encoder/av1_noise_estimate.h" "${AOM_ROOT}/third_party/fastfeat/fast.c" "${AOM_ROOT}/third_party/fastfeat/fast.h" "${AOM_ROOT}/third_party/fastfeat/fast_9.c" "${AOM_ROOT}/third_party/fastfeat/nonmax.c" "${AOM_ROOT}/third_party/vector/vector.c" "${AOM_ROOT}/third_party/vector/vector.h" "${AOM_ROOT}/av1/encoder/dwt.c" "${AOM_ROOT}/av1/encoder/dwt.h") if(CONFIG_REALTIME_ONLY) list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/grain_test_vectors.h") endif() list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" "${AOM_ROOT}/av1/common/x86/resize_sse2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c") list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" "${AOM_ROOT}/av1/common/x86/resize_ssse3.c") # Fallbacks to support Valgrind on 32-bit x86 list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3_X86 "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c") list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c" 
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h" "${AOM_ROOT}/av1/common/x86/cdef_block_sse4.c" "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c" "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c" "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c" "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c" "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h" "${AOM_ROOT}/av1/common/x86/cdef_block_avx2.c" "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c" "${AOM_ROOT}/av1/common/x86/convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" "${AOM_ROOT}/av1/common/x86/resize_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h" "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c" "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c" "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c" "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") # The functions defined in these files are removed from rtcd when # CONFIG_EXCLUDE_SIMD_MISMATCH=1. if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c" "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h") endif() list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c" "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c" "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c" "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c" "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c") # The functions defined in these files are removed from rtcd when # CONFIG_EXCLUDE_SIMD_MISMATCH=1. 
if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c" "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c") endif() list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/av1_error_neon.c" "${AOM_ROOT}/av1/encoder/arm/av1_fwd_txfm2d_neon.c" "${AOM_ROOT}/av1/encoder/arm/av1_k_means_neon.c" "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" "${AOM_ROOT}/av1/encoder/arm/encodetxb_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_fwd_txfm_neon.c" "${AOM_ROOT}/av1/encoder/arm/hybrid_fwd_txfm_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" "${AOM_ROOT}/av1/encoder/arm/quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/rdopt_neon.c" "${AOM_ROOT}/av1/encoder/arm/reconinter_enc_neon.c" "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c" "${AOM_ROOT}/av1/encoder/arm/wedge_utils_neon.c") # The functions defined in this file are removed from rtcd when # CONFIG_EXCLUDE_SIMD_MISMATCH=1. if(NOT CONFIG_EXCLUDE_SIMD_MISMATCH) list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/ml_neon.c") endif() list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/av1_error_sve.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c" "${AOM_ROOT}/av1/encoder/arm/wedge_utils_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 "${AOM_ROOT}/av1/encoder/arm/hash_arm_crc32.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/av1_convolve_horiz_rs_neon.c" "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c" "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" "${AOM_ROOT}/av1/common/arm/cdef_block_neon.c" "${AOM_ROOT}/av1/common/arm/cfl_neon.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon.c" "${AOM_ROOT}/av1/common/arm/convolve_neon.c" "${AOM_ROOT}/av1/common/arm/convolve_neon.h" "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c" "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" "${AOM_ROOT}/av1/common/arm/reconintra_neon.c" "${AOM_ROOT}/av1/common/arm/resize_neon.c" "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c" "${AOM_ROOT}/av1/common/arm/resize_neon_dotprod.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/av1/common/arm/av1_convolve_scale_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/resize_neon_i8mm.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 "${AOM_ROOT}/av1/common/arm/convolve_sve2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") list(APPEND AOM_AV1_COMMON_INTRIN_RVV "${AOM_ROOT}/av1/common/riscv/cdef_block_rvv.c") if(CONFIG_THREE_PASS) list(APPEND 
AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/thirdpass.c" "${AOM_ROOT}/av1/encoder/thirdpass.h") endif() if(CONFIG_TUNE_VMAF) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c" "${AOM_ROOT}/av1/encoder/tune_vmaf.h") endif() if(CONFIG_TUNE_BUTTERAUGLI) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_butteraugli.c" "${AOM_ROOT}/av1/encoder/tune_butteraugli.h") endif() if(CONFIG_SALIENCY_MAP) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/saliency_map.c" "${AOM_ROOT}/av1/encoder/saliency_map.h") endif() if(CONFIG_OPTICAL_FLOW_API) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/sparse_linear_solver.c" "${AOM_ROOT}/av1/encoder/sparse_linear_solver.h" "${AOM_ROOT}/av1/encoder/optical_flow.c" "${AOM_ROOT}/av1/encoder/optical_flow.h") endif() if(CONFIG_AV1_TEMPORAL_DENOISING) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.c" "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.h") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/av1_temporal_denoiser_neon.c") endif() if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c" "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c" "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c") list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c" "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c") list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c") list(APPEND AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_convolve_horiz_rs_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c" "${AOM_ROOT}/av1/common/arm/highbd_convolve_sve2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c") list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/av1_highbd_quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_rdopt_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_sve.c") endif() if(CONFIG_ACCOUNTING) list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c" "${AOM_ROOT}/av1/decoder/accounting.h") endif() 
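Editorial aside: the conditional lists above only compile the high-bitdepth SIMD kernels and the decoder accounting extras when the corresponding CMake options (CONFIG_AV1_HIGHBITDEPTH, CONFIG_ACCOUNTING, ...) are enabled; the same switches are emitted into the generated config/aom_config.h as 0/1 macros for the C sources. A minimal, hypothetical helper (not part of libaom) showing how code built inside the libaom tree can key off the generated macro:

    #include "config/aom_config.h"
    #include "aom/aom_codec.h"

    /* Pick an encoder bit depth the current build can actually produce. */
    static aom_bit_depth_t choose_bit_depth(void) {
    #if CONFIG_AV1_HIGHBITDEPTH
      return AOM_BITS_10; /* high-bitdepth paths were compiled in */
    #else
      return AOM_BITS_8;  /* -DCONFIG_AV1_HIGHBITDEPTH=0 build: 8-bit only */
    #endif
    }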
if(CONFIG_INSPECTION) list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c" "${AOM_ROOT}/av1/decoder/inspection.h") endif() if(CONFIG_INTERNAL_STATS) list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c" "${AOM_ROOT}/av1/encoder/blockiness.h") endif() if(CONFIG_REALTIME_ONLY) if(NOT CONFIG_AV1_DECODER) list(REMOVE_ITEM AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/cfl.c" "${AOM_ROOT}/av1/common/cfl.h" "${AOM_ROOT}/av1/common/restoration.c" "${AOM_ROOT}/av1/common/restoration.h" "${AOM_ROOT}/av1/common/warped_motion.c" "${AOM_ROOT}/av1/common/warped_motion.h") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c" "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") list( REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_NEON "${AOM_ROOT}/av1/common/arm/cfl_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c" "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/warp_plane_neon.h" "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_NEON_I8MM "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SVE "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") endif() list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c" "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c" "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c" "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c" "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON "${AOM_ROOT}/av1/encoder/arm/cnn_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/highbd_temporal_filter_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.c" "${AOM_ROOT}/av1/encoder/arm/pickrst_neon.h" "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/encoder/arm/temporal_filter_neon_dotprod.c") list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SVE "${AOM_ROOT}/av1/encoder/arm/pickrst_sve.c") list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/cnn.c" "${AOM_ROOT}/av1/encoder/cnn.h" "${AOM_ROOT}/av1/encoder/firstpass.c" "${AOM_ROOT}/av1/encoder/firstpass.h" "${AOM_ROOT}/av1/encoder/global_motion.c" "${AOM_ROOT}/av1/encoder/global_motion.h" "${AOM_ROOT}/av1/encoder/global_motion_facade.c" "${AOM_ROOT}/av1/encoder/global_motion_facade.h" 
"${AOM_ROOT}/av1/encoder/gop_structure.c" "${AOM_ROOT}/av1/encoder/gop_structure.h" "${AOM_ROOT}/av1/encoder/misc_model_weights.h" "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h" "${AOM_ROOT}/av1/encoder/partition_model_weights.h" "${AOM_ROOT}/av1/encoder/pass2_strategy.c" "${AOM_ROOT}/av1/encoder/picklpf.h" "${AOM_ROOT}/av1/encoder/pickrst.c" "${AOM_ROOT}/av1/encoder/temporal_filter.c" "${AOM_ROOT}/av1/encoder/temporal_filter.h" "${AOM_ROOT}/av1/encoder/tpl_model.c" "${AOM_ROOT}/av1/encoder/tpl_model.h") endif() # Setup AV1 common/decoder/encoder targets. The libaom target must exist before # this function is called. function(setup_av1_targets) add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_av1_common) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() if(CONFIG_AV1_DECODER) add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endif() if(CONFIG_AV1_ENCODER) add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endif() if(HAVE_SSE2) require_compiler_flag_nomsvc("-msse2" NO) add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSE2") if(CONFIG_AV1_DECODER) if(AOM_AV1_DECODER_ASM_SSE2) add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2") endif() if(AOM_AV1_DECODER_INTRIN_SSE2) add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder" "AOM_AV1_DECODER_INTRIN_SSE2") endif() endif() if(CONFIG_AV1_ENCODER) add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2") add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE2") endif() endif() if(HAVE_SSE3) require_compiler_flag_nomsvc("-msse3" NO) if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE3") endif() endif() if(HAVE_SSSE3) require_compiler_flag_nomsvc("-mssse3" NO) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSSE3") if(AOM_ARCH_X86) add_intrinsics_object_library("-mssse3" "ssse3_x86" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSSE3_X86") endif() if(CONFIG_AV1_DECODER) if(AOM_AV1_DECODER_INTRIN_SSSE3) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder" "AOM_AV1_DECODER_INTRIN_SSSE3") endif() endif() endif() if(HAVE_SSE4_1) require_compiler_flag_nomsvc("-msse4.1" NO) add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSE4_1") if(CONFIG_AV1_ENCODER) if("${AOM_TARGET_CPU}" STREQUAL "x86_64") add_asm_library("aom_av1_encoder_ssse3" "AOM_AV1_ENCODER_ASM_SSSE3_X86_64") endif() if(AOM_AV1_ENCODER_INTRIN_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE4_1") endif() endif() endif() if(HAVE_SSE4_2) require_compiler_flag_nomsvc("-msse4.2" NO) if(CONFIG_AV1_ENCODER) if(AOM_AV1_ENCODER_INTRIN_SSE4_2) add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SSE4_2") endif() endif() endif() if(HAVE_AVX2) require_compiler_flag_nomsvc("-mavx2" NO) add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_AVX2") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" 
"aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_AVX2") endif() endif() if(HAVE_NEON) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_NEON") endif() endif() if(HAVE_ARM_CRC32) if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "arm_crc32" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_ARM_CRC32") endif() endif() if(HAVE_NEON_DOTPROD) add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON_DOTPROD") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD") endif() endif() if(HAVE_NEON_I8MM) add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON_I8MM") endif() if(HAVE_SVE) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SVE") if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_SVE") endif() endif() if(HAVE_SVE2) add_intrinsics_object_library("${AOM_SVE2_FLAG}" "sve2" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SVE2") endif() if(HAVE_VSX) if(AOM_AV1_COMMON_INTRIN_VSX) add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_VSX") endif() endif() if(HAVE_RVV) if(AOM_AV1_COMMON_INTRIN_RVV) add_intrinsics_object_library("-march=rv64gcv" "rvv" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_RVV") endif() endif() # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction() aom-3.12.1/av1/av1_cx_iface.c000066400000000000000000006435231477627663500155610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include #include #include "config/aom_config.h" #include "config/aom_version.h" #include "aom/aomcx.h" #include "aom/aom_encoder.h" #include "aom/aom_external_partition.h" #include "aom/aom_image.h" #include "aom/internal/aom_codec_internal.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_mem/aom_mem.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "av1/av1_cx_iface.h" #include "av1/av1_iface_common.h" #include "av1/common/av1_common_int.h" #include "av1/common/enums.h" #include "av1/common/scale.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/enc_enums.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/ethread.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/lookahead.h" #include "av1/encoder/rc_utils.h" #include "av1/arg_defs.h" #include "common/args_helper.h" struct av1_extracfg { int cpu_used; unsigned int enable_auto_alt_ref; unsigned int enable_auto_bwd_ref; unsigned int noise_sensitivity; unsigned int sharpness; unsigned int static_thresh; unsigned int row_mt; unsigned int fp_mt; unsigned int tile_columns; // log2 number of tile columns unsigned int tile_rows; // log2 number of tile rows unsigned int auto_tiles; unsigned int enable_tpl_model; unsigned int enable_keyframe_filtering; unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; unsigned int max_gf_interval; unsigned int gf_min_pyr_height; unsigned int gf_max_pyr_height; aom_tune_metric tuning; const char *vmaf_model_path; const char *partition_info_path; unsigned int enable_rate_guide_deltaq; const char *rate_distribution_info; aom_dist_metric dist_metric; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; unsigned int rc_max_inter_bitrate_pct; unsigned int gf_cbr_boost_pct; unsigned int lossless; unsigned int enable_cdef; unsigned int enable_restoration; unsigned int force_video_mode; unsigned int enable_obmc; unsigned int disable_trellis_quant; unsigned int enable_qm; unsigned int qm_y; unsigned int qm_u; unsigned int qm_v; unsigned int qm_min; unsigned int qm_max; unsigned int num_tg; unsigned int mtu_size; aom_timing_info_type_t timing_info_type; unsigned int frame_parallel_decoding_mode; int enable_dual_filter; unsigned int enable_chroma_deltaq; AQ_MODE aq_mode; DELTAQ_MODE deltaq_mode; int deltaq_strength; int deltalf_mode; unsigned int frame_periodic_boost; aom_tune_content content; aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; aom_chroma_sample_position_t chroma_sample_position; int color_range; int render_width; int render_height; aom_superblock_size_t superblock_size; unsigned int single_tile_decoding; int error_resilient_mode; int s_frame_mode; int film_grain_test_vector; const char *film_grain_table_filename; unsigned int motion_vector_unit_test; #if CONFIG_FPMT_TEST unsigned int fpmt_unit_test; #endif unsigned int cdf_update_mode; int enable_rect_partitions; // enable rectangular partitions for sequence int enable_ab_partitions; // enable AB partitions for sequence int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence int min_partition_size; // min partition size [4,8,16,32,64,128] int max_partition_size; // max partition size [4,8,16,32,64,128] int enable_intra_edge_filter; // enable 
intra-edge filter for sequence int enable_order_hint; // enable order hint for sequence int enable_tx64; // enable 64-pt transform usage for sequence int enable_flip_idtx; // enable flip and identity transform types int enable_rect_tx; // enable rectangular transform usage for sequence int enable_dist_wtd_comp; // enable dist wtd compound for sequence int max_reference_frames; // maximum number of references per frame int enable_reduced_reference_set; // enable reduced set of references int enable_ref_frame_mvs; // sequence level int allow_ref_frame_mvs; // frame level int enable_masked_comp; // enable masked compound for sequence int enable_onesided_comp; // enable one sided compound for sequence int enable_interintra_comp; // enable interintra compound for sequence int enable_smooth_interintra; // enable smooth interintra mode usage int enable_diff_wtd_comp; // enable diff-wtd compound usage int enable_interinter_wedge; // enable interinter-wedge compound usage int enable_interintra_wedge; // enable interintra-wedge compound usage int enable_global_motion; // enable global motion usage for sequence int enable_warped_motion; // sequence level int allow_warped_motion; // frame level int enable_filter_intra; // enable filter intra for sequence int enable_smooth_intra; // enable smooth intra modes for sequence int enable_paeth_intra; // enable Paeth intra mode for sequence int enable_cfl_intra; // enable CFL uv intra mode for sequence int enable_directional_intra; // enable directional modes for sequence int enable_diagonal_intra; // enable D45 to D203 intra modes for sequence int enable_superres; int enable_overlay; // enable overlay for filtered arf frames int enable_palette; int enable_intrabc; int enable_angle_delta; #if CONFIG_DENOISE float noise_level; int noise_block_size; int enable_dnl_denoising; #endif unsigned int chroma_subsampling_x; unsigned int chroma_subsampling_y; int reduced_tx_type_set; int use_intra_dct_only; int use_inter_dct_only; int use_intra_default_tx_only; int enable_tx_size_search; int quant_b_adapt; unsigned int vbr_corpus_complexity_lap; AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; // Bit mask to specify which tier each of the 32 possible operating points // conforms to. unsigned int tier_mask; // min_cr / 100 is the target minimum compression ratio for each frame. unsigned int min_cr; COST_UPDATE_TYPE coeff_cost_upd_freq; COST_UPDATE_TYPE mode_cost_upd_freq; COST_UPDATE_TYPE mv_cost_upd_freq; COST_UPDATE_TYPE dv_cost_upd_freq; unsigned int ext_tile_debug; unsigned int sb_multipass_unit_test; // Total number of passes. If this number is -1, then we assume passes = 1 or // 2 (passes = 1 if pass == AOM_RC_ONE_PASS and passes = 2 otherwise). int passes; int fwd_kf_dist; LOOPFILTER_CONTROL loopfilter_control; // Indicates if the application of post-processing filters should be skipped // on the reconstructed frame. unsigned int skip_postproc_filtering; // the name of the second pass output file when passes > 2 const char *two_pass_output; const char *second_pass_log; // Automatically determine whether to disable several intra tools // when "--deltaq-mode=3" is true. // Defaults to 0. // When set to 1, the encoder will analyze the reconstruction quality // as compared to the source image in the preprocessing pass. // If the reconstruction quality is considered high enough, we disable // the following intra coding tools, for better encoding speed: // "--enable_smooth_intra", // "--enable_paeth_intra", // "--enable_cfl_intra", // "--enable_diagonal_intra".
int auto_intra_tools_off; int strict_level_conformance; int kf_max_pyr_height; int sb_qp_sweep; }; #if !CONFIG_REALTIME_ONLY static const struct av1_extracfg default_extra_cfg = { 0, // cpu_used 1, // enable_auto_alt_ref 0, // enable_auto_bwd_ref 0, // noise_sensitivity 0, // sharpness 0, // static_thresh 1, // row_mt 0, // fp_mt 0, // tile_columns 0, // tile_rows 0, // auto_tiles 1, // enable_tpl_model 1, // enable_keyframe_filtering 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision 0, // max_gf_interval; 0 -> default decision 0, // gf_min_pyr_height 5, // gf_max_pyr_height AOM_TUNE_PSNR, // tuning "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path ".", // partition info path 0, // enable rate guide deltaq "./rate_map.txt", // rate distribution input AOM_DIST_METRIC_PSNR, // dist_metric 10, // cq_level 0, // rc_max_intra_bitrate_pct 0, // rc_max_inter_bitrate_pct 0, // gf_cbr_boost_pct 0, // lossless 1, // enable_cdef 1, // enable_restoration 0, // force_video_mode 1, // enable_obmc 3, // disable_trellis_quant 0, // enable_qm DEFAULT_QM_Y, // qm_y DEFAULT_QM_U, // qm_u DEFAULT_QM_V, // qm_v DEFAULT_QM_FIRST, // qm_min DEFAULT_QM_LAST, // qm_max 1, // max number of tile groups 0, // mtu_size AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream 0, // frame_parallel_decoding_mode 1, // enable dual filter 0, // enable delta quant in chroma planes NO_AQ, // aq_mode DELTA_Q_OBJECTIVE, // deltaq_mode 100, // deltaq_strength 0, // delta lf mode 0, // frame_periodic_boost AOM_CONTENT_DEFAULT, // content AOM_CICP_CP_UNSPECIFIED, // CICP color primaries AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients AOM_CSP_UNKNOWN, // chroma sample position 0, // color range 0, // render width 0, // render height AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size 1, // this depends on large_scale_tile. 0, // error_resilient_mode off by default. 0, // s_frame_mode off by default. 
0, // film_grain_test_vector NULL, // film_grain_table_filename 0, // motion_vector_unit_test #if CONFIG_FPMT_TEST 0, // fpmt_unit_test #endif 1, // CDF update mode 1, // enable rectangular partitions 1, // enable ab shape partitions 1, // enable 1:4 and 4:1 partitions 4, // min_partition_size 128, // max_partition_size 1, // enable intra edge filter 1, // frame order hint 1, // enable 64-pt transform usage 1, // enable flip and identity transform 1, // enable rectangular transform usage 1, // dist-wtd compound 7, // max_reference_frames 0, // enable_reduced_reference_set 1, // enable_ref_frame_mvs sequence level 1, // allow ref_frame_mvs frame level 1, // enable masked compound at sequence level 1, // enable one sided compound at sequence level 1, // enable interintra compound at sequence level 1, // enable smooth interintra mode 1, // enable difference-weighted compound 1, // enable interinter wedge compound 1, // enable interintra wedge compound 1, // enable_global_motion usage 1, // enable_warped_motion at sequence level 1, // allow_warped_motion at frame level 1, // enable filter intra at sequence level 1, // enable smooth intra modes usage for sequence 1, // enable Paeth intra mode usage for sequence 1, // enable CFL uv intra mode usage for sequence 1, // enable directional intra mode usage for sequence 1, // enable D45 to D203 intra mode usage for sequence 1, // superres 1, // enable overlay 1, // enable palette 1, // enable intrabc 1, // enable angle delta #if CONFIG_DENOISE 0, // noise_level 32, // noise_block_size 1, // enable_dnl_denoising #endif 0, // chroma_subsampling_x 0, // chroma_subsampling_y 0, // reduced_tx_type_set 0, // use_intra_dct_only 0, // use_inter_dct_only 0, // use_intra_default_tx_only 1, // enable_tx_size_search 0, // quant_b_adapt 0, // vbr_corpus_complexity_lap { SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, }, // target_seq_level_idx 0, // tier_mask 0, // min_cr COST_UPD_SB, // coeff_cost_upd_freq COST_UPD_SB, // mode_cost_upd_freq COST_UPD_SB, // mv_cost_upd_freq COST_UPD_SB, // dv_cost_upd_freq 0, // ext_tile_debug 0, // sb_multipass_unit_test -1, // passes -1, // fwd_kf_dist LOOPFILTER_ALL, // loopfilter_control 0, // skip_postproc_filtering NULL, // two_pass_output NULL, // second_pass_log 0, // auto_intra_tools_off 0, // strict_level_conformance -1, // kf_max_pyr_height 0, // sb_qp_sweep }; #else // Some settings are changed for realtime only build. 
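Editorial aside: the CONFIG_REALTIME_ONLY table that follows only changes the compiled-in defaults (for example cpu_used 10 instead of 0, cyclic-refresh AQ, TPL disabled, cost updates off, rc_max_intra_bitrate_pct 300); applications select the real-time encoding path by requesting AOM_USAGE_REALTIME when building the configuration, independent of which of the two tables was compiled in. A minimal sketch (not part of libaom; frame size and bitrate are placeholders, most error handling trimmed):

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    static aom_codec_err_t open_realtime_encoder(aom_codec_ctx_t *codec) {
      aom_codec_iface_t *iface = aom_codec_av1_cx();
      aom_codec_enc_cfg_t cfg;
      aom_codec_err_t res =
          aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME);
      if (res != AOM_CODEC_OK) return res;
      cfg.g_w = 640;               /* placeholder frame size */
      cfg.g_h = 360;
      cfg.rc_target_bitrate = 500; /* kbit/s */
      res = aom_codec_enc_init(codec, iface, &cfg, 0);
      if (res != AOM_CODEC_OK) return res;
      /* Speed preset; the realtime-only build defaults to 10 (see the table
       * below), regular builds default to 0. */
      return aom_codec_control(codec, AOME_SET_CPUUSED, 10);
    }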
static const struct av1_extracfg default_extra_cfg = { 10, // cpu_used 1, // enable_auto_alt_ref 0, // enable_auto_bwd_ref 0, // noise_sensitivity 0, // sharpness 0, // static_thresh 1, // row_mt 0, // fp_mt 0, // tile_columns 0, // tile_rows 0, // auto_tiles 0, // enable_tpl_model 0, // enable_keyframe_filtering 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision 0, // max_gf_interval; 0 -> default decision 0, // gf_min_pyr_height 5, // gf_max_pyr_height AOM_TUNE_PSNR, // tuning "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path ".", // partition info path 0, // enable rate guide deltaq "./rate_map.txt", // rate distribution input AOM_DIST_METRIC_PSNR, // dist_metric 10, // cq_level 300, // rc_max_intra_bitrate_pct 0, // rc_max_inter_bitrate_pct 0, // gf_cbr_boost_pct 0, // lossless 1, // enable_cdef 0, // enable_restoration 0, // force_video_mode 0, // enable_obmc 3, // disable_trellis_quant 0, // enable_qm DEFAULT_QM_Y, // qm_y DEFAULT_QM_U, // qm_u DEFAULT_QM_V, // qm_v DEFAULT_QM_FIRST, // qm_min DEFAULT_QM_LAST, // qm_max 1, // max number of tile groups 0, // mtu_size AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream 0, // frame_parallel_decoding_mode 0, // enable dual filter 0, // enable delta quant in chroma planes CYCLIC_REFRESH_AQ, // aq_mode NO_DELTA_Q, // deltaq_mode 100, // deltaq_strength 0, // delta lf mode 0, // frame_periodic_boost AOM_CONTENT_DEFAULT, // content AOM_CICP_CP_UNSPECIFIED, // CICP color primaries AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients AOM_CSP_UNKNOWN, // chroma sample position 0, // color range 0, // render width 0, // render height AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size 1, // this depends on large_scale_tile. 0, // error_resilient_mode off by default. 0, // s_frame_mode off by default. 
0, // film_grain_test_vector NULL, // film_grain_table_filename 0, // motion_vector_unit_test #if CONFIG_FPMT_TEST 0, // fpmt_unit_test #endif 1, // CDF update mode 0, // enable rectangular partitions 0, // enable ab shape partitions 0, // enable 1:4 and 4:1 partitions 4, // min_partition_size 128, // max_partition_size 0, // enable intra edge filter 0, // frame order hint 0, // enable 64-pt transform usage 1, // enable flip and identity transform 1, // enable rectangular transform usage 0, // dist-wtd compound 3, // max_reference_frames 0, // enable_reduced_reference_set 0, // enable_ref_frame_mvs sequence level 0, // allow ref_frame_mvs frame level 0, // enable masked compound at sequence level 0, // enable one sided compound at sequence level 0, // enable interintra compound at sequence level 0, // enable smooth interintra mode 0, // enable difference-weighted compound 0, // enable interinter wedge compound 0, // enable interintra wedge compound 0, // enable_global_motion usage 0, // enable_warped_motion at sequence level 0, // allow_warped_motion at frame level 0, // enable filter intra at sequence level 0, // enable smooth intra modes usage for sequence 0, // enable Paeth intra mode usage for sequence 0, // enable CFL uv intra mode usage for sequence 1, // enable directional intra mode usage for sequence 1, // enable D45 to D203 intra mode usage for sequence 0, // superres 0, // enable overlay 1, // enable palette 0, // enable intrabc 0, // enable angle delta #if CONFIG_DENOISE 0, // noise_level 32, // noise_block_size 1, // enable_dnl_denoising #endif 0, // chroma_subsampling_x 0, // chroma_subsampling_y 0, // reduced_tx_type_set 0, // use_intra_dct_only 0, // use_inter_dct_only 1, // use_intra_default_tx_only 1, // enable_tx_size_search 0, // quant_b_adapt 0, // vbr_corpus_complexity_lap { SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, }, // target_seq_level_idx 0, // tier_mask 0, // min_cr COST_UPD_OFF, // coeff_cost_upd_freq COST_UPD_OFF, // mode_cost_upd_freq COST_UPD_OFF, // mv_cost_upd_freq COST_UPD_OFF, // dv_cost_upd_freq 0, // ext_tile_debug 0, // sb_multipass_unit_test -1, // passes -1, // fwd_kf_dist LOOPFILTER_ALL, // loopfilter_control 0, // skip_postproc_filtering NULL, // two_pass_output NULL, // second_pass_log 0, // auto_intra_tools_off 0, // strict_level_conformance -1, // kf_max_pyr_height 0, // sb_qp_sweep }; #endif struct aom_codec_alg_priv { aom_codec_priv_t base; aom_codec_enc_cfg_t cfg; struct av1_extracfg extra_cfg; aom_rational64_t timestamp_ratio; aom_codec_pts_t pts_offset; unsigned char pts_offset_initialized; AV1EncoderConfig oxcf; AV1_PRIMARY *ppi; unsigned char *cx_data; size_t cx_data_sz; size_t pending_cx_data_sz; aom_image_t preview_img; aom_enc_frame_flags_t next_frame_flags; aom_codec_pkt_list_decl(256) pkt_list; unsigned int fixed_kf_cntr; // BufferPool that holds all reference frames. 
BufferPool *buffer_pool; // lookahead instance variables BufferPool *buffer_pool_lap; FIRSTPASS_STATS *frame_stats_buffer; // Number of stats buffers required for look ahead int num_lap_buffers; STATS_BUFFER_CTX stats_buf_context; bool monochrome_on_init; }; static inline int gcd(int64_t a, int b) { int remainder; while (b > 0) { remainder = (int)(a % b); a = b; b = remainder; } return (int)a; } static void reduce_ratio(aom_rational64_t *ratio) { const int denom = gcd(ratio->num, ratio->den); ratio->num /= denom; ratio->den /= denom; } // Called by encoder_encode() only. Must not be called by encoder_init() // because the `error` parameter will be destroyed by aom_codec_enc_init_ver() // after encoder_init() returns an error. See the "IMPORTANT" comment in // aom_codec_enc_init_ver(). static aom_codec_err_t update_error_state( aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { const aom_codec_err_t res = error->error_code; if (res != AOM_CODEC_OK) ctx->base.err_detail = error->has_detail ? error->detail : NULL; return res; } // This function deep copies a string src to *dst. For the default string we will // use a string literal, and otherwise we will allocate memory for the string. static aom_codec_err_t allocate_and_set_string(const char *src, const char *default_src, const char **dst, char *err_detail) { if (!src) { snprintf(err_detail, ARG_ERR_MSG_MAX_LEN, "Null pointer given to a string parameter."); return AOM_CODEC_INVALID_PARAM; } if (*dst && strcmp(src, *dst) == 0) return AOM_CODEC_OK; // If the input is exactly the same as default, we will use the string // literal, so do not free here. if (*dst != default_src) { aom_free((void *)*dst); } if (default_src && strcmp(src, default_src) == 0) { // default_src should be a string literal *dst = default_src; } else { size_t len = strlen(src) + 1; char *tmp = aom_malloc(len * sizeof(*tmp)); if (!tmp) { snprintf(err_detail, ARG_ERR_MSG_MAX_LEN, "Failed to allocate memory for copying parameters."); return AOM_CODEC_MEM_ERROR; } memcpy(tmp, src, len); *dst = tmp; } return 0; } #undef ERROR #define ERROR(str) \ do { \ ctx->base.err_detail = str; \ return AOM_CODEC_INVALID_PARAM; \ } while (0) #define RANGE_CHECK(p, memb, lo, hi) \ do { \ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \ ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ do { \ if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ } while (0) #define RANGE_CHECK_BOOL(p, memb) \ do { \ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ } while (0) static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg, const struct av1_extracfg *extra_cfg) { RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available RANGE_CHECK_HI(cfg, g_forced_max_frame_width, 65536); // 16 bits available RANGE_CHECK_HI(cfg, g_forced_max_frame_height, 65536); // 16 bits available if (cfg->g_forced_max_frame_width) { RANGE_CHECK_HI(cfg, g_w, cfg->g_forced_max_frame_width); } if (cfg->g_forced_max_frame_height) { RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height); } // To avoid integer overflows when multiplying width by height (or values // derived from width and height) using the int type, impose a maximum frame // area (width * height) of 2^30. const unsigned int max_frame_width = cfg->g_forced_max_frame_width ?
cfg->g_forced_max_frame_width : cfg->g_w; const unsigned int max_frame_height = cfg->g_forced_max_frame_height ? cfg->g_forced_max_frame_height : cfg->g_h; const int64_t max_frame_area = (int64_t)max_frame_width * max_frame_height; if (max_frame_area > (1 << 30)) { ERROR("max_frame_area out of range [..2^30]"); } RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); RANGE_CHECK_HI(cfg, rc_target_bitrate, 2000000); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); if (cfg->g_usage != ALLINTRA && extra_cfg->deltaq_mode == DELTA_Q_VARIANCE_BOOST) { ERROR("Variance Boost (deltaq_mode = 6) can only be set in all intra mode"); } RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); #if CONFIG_REALTIME_ONLY RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME); #else RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA); #endif RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); if (cfg->g_usage == AOM_USAGE_ALL_INTRA) { RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); RANGE_CHECK_HI(cfg, kf_max_dist, 0); } RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1); RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1); if (extra_cfg->max_gf_interval > 0) { RANGE_CHECK(extra_cfg, max_gf_interval, AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1)); } RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5); RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5); if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) { ERROR( "gf_min_pyr_height must be less than or equal to " "gf_max_pyramid_height"); } RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1); RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); RANGE_CHECK_HI(cfg, rc_superres_mode, AOM_SUPERRES_AUTO); RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR, SCALE_NUMERATOR << 1); RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63); RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63); RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2); RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2); #if CONFIG_FPMT_TEST RANGE_CHECK_HI(extra_cfg, fpmt_unit_test, 1); #endif RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1); RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1); RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1); RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); RANGE_CHECK(extra_cfg, cpu_used, 0, (cfg->g_usage == AOM_USAGE_REALTIME) ? 
11 : 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, AOM_SUPERBLOCK_SIZE_DYNAMIC); RANGE_CHECK_HI(cfg, large_scale_tile, 1); RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); RANGE_CHECK_HI(extra_cfg, enable_rate_guide_deltaq, 1); RANGE_CHECK_HI(extra_cfg, row_mt, 1); RANGE_CHECK_HI(extra_cfg, fp_mt, 1); RANGE_CHECK_HI(extra_cfg, tile_columns, 6); RANGE_CHECK_HI(extra_cfg, tile_rows, 6); RANGE_CHECK_HI(extra_cfg, auto_tiles, 1); RANGE_CHECK_HI(cfg, monochrome, 1); if (cfg->large_scale_tile && extra_cfg->aq_mode) ERROR( "Adaptive quantization are not supported in large scale tile " "coding."); RANGE_CHECK_HI(extra_cfg, sharpness, 7); RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15); RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); RANGE_CHECK_HI(extra_cfg, cq_level, 63); RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12); RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1); if (cfg->g_pass >= AOM_RC_SECOND_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); const FIRSTPASS_STATS *stats; if (cfg->rc_twopass_stats_in.buf == NULL) ERROR("rc_twopass_stats_in.buf not set."); if (cfg->rc_twopass_stats_in.sz % packet_sz) ERROR("rc_twopass_stats_in.sz indicates truncated packet."); if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) ERROR("rc_twopass_stats_in requires at least two packets."); stats = (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; if ((int)(stats->count + 0.5) != n_packets - 1) ERROR("rc_twopass_stats_in missing EOS stats packet"); } if (extra_cfg->passes != -1 && cfg->g_pass == AOM_RC_ONE_PASS && extra_cfg->passes != 1) { ERROR("One pass encoding but passes != 1."); } if (extra_cfg->passes != -1 && (int)cfg->g_pass > extra_cfg->passes) { ERROR("Current pass is larger than total number of passes."); } if (cfg->g_profile == (unsigned int)PROFILE_1 && cfg->monochrome) { ERROR("Monochrome is not supported in profile 1"); } if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_bit_depth > AOM_BITS_10) { ERROR("Codec bit-depth 12 not supported in profile < 2"); } if (cfg->g_profile <= (unsigned int)PROFILE_1 && cfg->g_input_bit_depth > 10) { ERROR("Source bit-depth 12 not supported in profile < 2"); } if (cfg->rc_end_usage == AOM_Q) { RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1); } else { if (cfg->use_fixed_qp_offsets > 0) { ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q"); } } RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709, AOM_CICP_CP_EBU_3213); // Need to check range more precisely to // check for reserved values? 
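  // For illustration, the CICP metadata validated here is normally supplied by
  // the application through codec controls; a BT.709, studio-range setup might
  // look like the following (hypothetical application code):
  //   aom_codec_control(&codec, AV1E_SET_COLOR_PRIMARIES, AOM_CICP_CP_BT_709);
  //   aom_codec_control(&codec, AV1E_SET_TRANSFER_CHARACTERISTICS,
  //                     AOM_CICP_TC_BT_709);
  //   aom_codec_control(&codec, AV1E_SET_MATRIX_COEFFICIENTS,
  //                     AOM_CICP_MC_BT_709);
  //   aom_codec_control(&codec, AV1E_SET_COLOR_RANGE, 0);  // 0 = studio range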
RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709, AOM_CICP_TC_HLG); RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY, AOM_CICP_MC_ICTCP); RANGE_CHECK(extra_cfg, color_range, 0, 1); /* Average corpus complexity is supported only in the case of single pass * VBR*/ if (cfg->g_pass == AOM_RC_ONE_PASS && cfg->rc_end_usage == AOM_VBR) RANGE_CHECK_HI(extra_cfg, vbr_corpus_complexity_lap, MAX_VBR_CORPUS_COMPLEXITY); else if (extra_cfg->vbr_corpus_complexity_lap != 0) ERROR( "VBR corpus complexity is supported only in the case of single pass " "VBR mode."); #if !CONFIG_TUNE_BUTTERAUGLI if (extra_cfg->tuning == AOM_TUNE_BUTTERAUGLI) { ERROR( "This error may be related to the wrong configuration options: try to " "set -DCONFIG_TUNE_BUTTERAUGLI=1 at the time CMake is run."); } #endif #if !CONFIG_TUNE_VMAF if (extra_cfg->tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && extra_cfg->tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { ERROR( "This error may be related to the wrong configuration options: try to " "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run."); } #endif RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_IQ); RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR, AOM_DIST_METRIC_QM_PSNR); RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, AOM_TIMING_DEC_MODEL); RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16); if (extra_cfg->lossless) { if (extra_cfg->aq_mode != 0) ERROR("Only --aq_mode=0 can be used with --lossless=1."); if (extra_cfg->enable_chroma_deltaq) ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1."); } RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7); RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1); RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3); RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size); for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { const int level_idx = extra_cfg->target_seq_level_idx[i]; if (!is_valid_seq_level_idx(level_idx) && level_idx != SEQ_LEVEL_KEEP_STATS) { ERROR("Target sequence level index is invalid"); } } RANGE_CHECK(extra_cfg, deltaq_strength, 0, 1000); RANGE_CHECK_HI(extra_cfg, loopfilter_control, 3); RANGE_CHECK_BOOL(extra_cfg, skip_postproc_filtering); RANGE_CHECK_HI(extra_cfg, enable_cdef, 3); RANGE_CHECK_BOOL(extra_cfg, auto_intra_tools_off); RANGE_CHECK_BOOL(extra_cfg, strict_level_conformance); RANGE_CHECK_BOOL(extra_cfg, sb_qp_sweep); RANGE_CHECK(extra_cfg, kf_max_pyr_height, -1, 5); if (extra_cfg->kf_max_pyr_height != -1 && extra_cfg->kf_max_pyr_height < (int)extra_cfg->gf_min_pyr_height) { ERROR( "The value of kf-max-pyr-height should not be smaller than " "gf-min-pyr-height"); } return AOM_CODEC_OK; } static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, const aom_image_t *img) { switch (img->fmt) { case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: case AOM_IMG_FMT_I420: case AOM_IMG_FMT_YV1216: case AOM_IMG_FMT_I42016: break; case AOM_IMG_FMT_I444: case AOM_IMG_FMT_I44416: if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 && !ctx->cfg.monochrome) { ERROR("Invalid image format. 
I444 images not supported in profile."); } break; case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I42216: if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) { ERROR("Invalid image format. I422 images not supported in profile."); } break; default: ERROR( "Invalid image format. Only YV12, NV12, I420, I422, I444 images are " "supported."); break; } if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) ERROR("Image size must match encoder init configuration size"); #if CONFIG_TUNE_BUTTERAUGLI if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { if (img->bit_depth > 8) { ERROR("Only 8 bit depth images supported in tune=butteraugli mode."); } if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 && img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) { ERROR( "Only BT.709 and BT.601 matrix coefficients supported in " "tune=butteraugli mode. Identity matrix is treated as BT.601."); } } #endif return AOM_CODEC_OK; } static int get_image_bps(const aom_image_t *img) { switch (img->fmt) { case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: case AOM_IMG_FMT_I420: return 12; case AOM_IMG_FMT_I422: return 16; case AOM_IMG_FMT_I444: return 24; case AOM_IMG_FMT_YV1216: case AOM_IMG_FMT_I42016: return 24; case AOM_IMG_FMT_I42216: return 32; case AOM_IMG_FMT_I44416: return 48; default: assert(0 && "Invalid image format"); break; } return 0; } // Set appropriate options to disable frame super-resolution. static void disable_superres(SuperResCfg *const superres_cfg) { superres_cfg->superres_mode = AOM_SUPERRES_NONE; superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; superres_cfg->superres_qthresh = 255; superres_cfg->superres_kf_qthresh = 255; } static void set_auto_tiles(TileConfig *const tile_cfg, unsigned int width, unsigned int height, unsigned int threads) { int tile_cols_log2 = 0; int tile_rows_log2 = 0; if (threads < 2) return; // Avoid small tiles because they are particularly bad for coding. // Use no more tiles than the number of threads. Aim for one tile per // thread. Using more than one thread inside one tile could be less // efficient. Using more tiles than the number of threads would result // in a compression penalty without much benefit. const uint32_t kMinTileArea = 128 * 128; const uint32_t kMaxTiles = 32; uint32_t frame_area = width * height; uint32_t tiles = (frame_area + kMinTileArea - 1) / kMinTileArea; if (tiles > kMaxTiles) { tiles = kMaxTiles; } if (tiles > threads) { tiles = threads; } int tiles_log2 = (int)log2(tiles); // If the frame width is equal or greater than the height, use more tile // columns than tile rows. if (width >= height) { tile_cols_log2 = (tiles_log2 + 1) / 2; tile_rows_log2 = tiles_log2 - tile_cols_log2; } else { tile_rows_log2 = (tiles_log2 + 1) / 2; tile_cols_log2 = tiles_log2 - tile_rows_log2; } tile_cfg->tile_columns = tile_cols_log2; tile_cfg->tile_rows = tile_rows_log2; } static void update_default_encoder_config(const cfg_options_t *cfg, struct av1_extracfg *extra_cfg) { extra_cfg->enable_cdef = (cfg->disable_cdef == 0) ? 1 : 0; extra_cfg->enable_restoration = (cfg->disable_lr == 0); extra_cfg->superblock_size = (cfg->super_block_size == 64) ? AOM_SUPERBLOCK_SIZE_64X64 : (cfg->super_block_size == 128) ? 
AOM_SUPERBLOCK_SIZE_128X128 : AOM_SUPERBLOCK_SIZE_DYNAMIC; extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0); extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0); extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0); extra_cfg->enable_dual_filter = (cfg->disable_dual_filter == 0); extra_cfg->enable_angle_delta = (cfg->disable_intra_angle_delta == 0); extra_cfg->enable_rect_partitions = (cfg->disable_rect_partition_type == 0); extra_cfg->enable_ab_partitions = (cfg->disable_ab_partition_type == 0); extra_cfg->enable_1to4_partitions = (cfg->disable_1to4_partition_type == 0); extra_cfg->max_partition_size = cfg->max_partition_size; extra_cfg->min_partition_size = cfg->min_partition_size; extra_cfg->enable_intra_edge_filter = (cfg->disable_intra_edge_filter == 0); extra_cfg->enable_tx64 = (cfg->disable_tx_64x64 == 0); extra_cfg->enable_flip_idtx = (cfg->disable_flip_idtx == 0); extra_cfg->enable_masked_comp = (cfg->disable_masked_comp == 0); extra_cfg->enable_interintra_comp = (cfg->disable_inter_intra_comp == 0); extra_cfg->enable_smooth_interintra = (cfg->disable_smooth_inter_intra == 0); extra_cfg->enable_interinter_wedge = (cfg->disable_inter_inter_wedge == 0); extra_cfg->enable_interintra_wedge = (cfg->disable_inter_intra_wedge == 0); extra_cfg->enable_global_motion = (cfg->disable_global_motion == 0); extra_cfg->enable_filter_intra = (cfg->disable_filter_intra == 0); extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); extra_cfg->enable_obmc = (cfg->disable_obmc == 0); extra_cfg->enable_palette = (cfg->disable_palette == 0); extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant; extra_cfg->allow_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); extra_cfg->enable_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); extra_cfg->enable_onesided_comp = (cfg->disable_one_sided_comp == 0); extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set; extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set; } static void set_encoder_config(AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg, struct av1_extracfg *extra_cfg) { if (cfg->encoder_cfg.init_by_cfg_file) { update_default_encoder_config(&cfg->encoder_cfg, extra_cfg); } TuneCfg *const tune_cfg = &oxcf->tune_cfg; FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; TileConfig *const tile_cfg = &oxcf->tile_cfg; ResizeCfg *const resize_cfg = &oxcf->resize_cfg; GFConfig *const gf_cfg = &oxcf->gf_cfg; PartitionCfg *const part_cfg = &oxcf->part_cfg; IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg; TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg; CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg; SuperResCfg *const superres_cfg = &oxcf->superres_cfg; KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; RateControlCfg *const rc_cfg = &oxcf->rc_cfg; QuantizationCfg *const q_cfg = &oxcf->q_cfg; ColorCfg *const color_cfg = &oxcf->color_cfg; InputCfg *const input_cfg = &oxcf->input_cfg; AlgoCfg *const algo_cfg = &oxcf->algo_cfg; ToolCfg *const tool_cfg = &oxcf->tool_cfg; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; switch (cfg->g_usage) { case AOM_USAGE_REALTIME: oxcf->mode = REALTIME; break; case AOM_USAGE_ALL_INTRA: oxcf->mode = ALLINTRA; break; default: oxcf->mode = GOOD; break; } // Set 
frame-dimension related configuration. frm_dim_cfg->width = cfg->g_w; frm_dim_cfg->height = cfg->g_h; frm_dim_cfg->forced_max_frame_width = cfg->g_forced_max_frame_width; frm_dim_cfg->forced_max_frame_height = cfg->g_forced_max_frame_height; frm_dim_cfg->render_width = extra_cfg->render_width; frm_dim_cfg->render_height = extra_cfg->render_height; // Set input video related configuration. input_cfg->input_bit_depth = cfg->g_input_bit_depth; // guess a frame rate if out of whack, use 30 input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; if (cfg->g_pass >= AOM_RC_SECOND_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); input_cfg->limit = n_packets - 1; } else { input_cfg->limit = cfg->g_limit; } input_cfg->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; input_cfg->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; if (input_cfg->init_framerate > 180) { input_cfg->init_framerate = 30; dec_model_cfg->timing_info_present = 0; } // Set Decoder model configuration. if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL || extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { dec_model_cfg->timing_info_present = 1; dec_model_cfg->timing_info.num_units_in_display_tick = cfg->g_timebase.num; dec_model_cfg->timing_info.time_scale = cfg->g_timebase.den; dec_model_cfg->timing_info.num_ticks_per_picture = 1; } else { dec_model_cfg->timing_info_present = 0; } if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) { dec_model_cfg->timing_info.equal_picture_interval = 1; dec_model_cfg->decoder_model_info_present_flag = 0; dec_model_cfg->display_model_info_present_flag = 1; } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { dec_model_cfg->num_units_in_decoding_tick = cfg->g_timebase.num; dec_model_cfg->timing_info.equal_picture_interval = 0; dec_model_cfg->decoder_model_info_present_flag = 1; dec_model_cfg->display_model_info_present_flag = 1; } oxcf->pass = cfg->g_pass; // For backward compatibility, assume that if extra_cfg->passes==-1, then // passes = 1 or 2. if (extra_cfg->passes == -1) { if (cfg->g_pass == AOM_RC_ONE_PASS) { oxcf->passes = 1; } else { oxcf->passes = 2; } } else { oxcf->passes = extra_cfg->passes; } // Set Rate Control configuration. rc_cfg->max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; rc_cfg->max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; rc_cfg->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; rc_cfg->mode = cfg->rc_end_usage; rc_cfg->min_cr = extra_cfg->min_cr; rc_cfg->best_allowed_q = extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer); rc_cfg->worst_allowed_q = extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer); rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct; rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct; rc_cfg->maximum_buffer_size_ms = cfg->rc_buf_sz; rc_cfg->starting_buffer_level_ms = cfg->rc_buf_initial_sz; rc_cfg->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; // Convert target bandwidth from Kbit/s to Bit/s rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate; rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh; rc_cfg->vbr_corpus_complexity_lap = extra_cfg->vbr_corpus_complexity_lap; rc_cfg->vbrbias = cfg->rc_2pass_vbr_bias_pct; rc_cfg->vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; rc_cfg->vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; // Set Toolset related configuration. 
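  // For illustration, most of the tool flags copied below mirror individual
  // codec controls, e.g. (hypothetical application code):
  //   aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
  //   aom_codec_control(&codec, AV1E_SET_ENABLE_RESTORATION, 0);
  // Note that loop restoration is forced off for AOM_USAGE_REALTIME below,
  // regardless of the control value.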
tool_cfg->bit_depth = cfg->g_bit_depth; tool_cfg->cdef_control = (CDEF_CONTROL)extra_cfg->enable_cdef; tool_cfg->enable_restoration = (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration; tool_cfg->force_video_mode = extra_cfg->force_video_mode; tool_cfg->enable_palette = extra_cfg->enable_palette; // FIXME(debargha): Should this be: // tool_cfg->enable_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & // extra_cfg->enable_order_hint ? // Disallow using temporal MVs while large_scale_tile = 1. tool_cfg->enable_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile; tool_cfg->superblock_size = extra_cfg->superblock_size; tool_cfg->enable_monochrome = cfg->monochrome; tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr != 0; tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter; tool_cfg->enable_order_hint = extra_cfg->enable_order_hint; tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp; tool_cfg->ref_frame_mvs_present = extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; // Explicitly disable global motion in a few cases: // * For realtime mode, we never search global motion, and disabling // it here prevents later code from allocating buffers we don't need // * For large scale tile mode, some of the intended use cases expect // all frame headers to be identical. This breaks if global motion is // used, since global motion data is stored in the frame header. // eg, see test/lightfield_test.sh, which checks that all frame headers // are the same. tool_cfg->enable_global_motion = extra_cfg->enable_global_motion && cfg->g_usage != AOM_USAGE_REALTIME && !cfg->large_scale_tile; tool_cfg->error_resilient_mode = cfg->g_error_resilient | extra_cfg->error_resilient_mode; tool_cfg->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; // Set Quantization related configuration. q_cfg->using_qm = extra_cfg->enable_qm; q_cfg->qm_minlevel = extra_cfg->qm_min; q_cfg->qm_maxlevel = extra_cfg->qm_max; q_cfg->quant_b_adapt = extra_cfg->quant_b_adapt; q_cfg->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq; q_cfg->aq_mode = extra_cfg->aq_mode; q_cfg->deltaq_mode = extra_cfg->deltaq_mode; q_cfg->deltaq_strength = extra_cfg->deltaq_strength; q_cfg->use_fixed_qp_offsets = cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q); q_cfg->enable_hdr_deltaq = (q_cfg->deltaq_mode == DELTA_Q_HDR) && (cfg->g_bit_depth == AOM_BITS_10) && (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020); tool_cfg->enable_deltalf_mode = (q_cfg->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode; // Set cost update frequency configuration. oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; // Avoid MV cost update for allintra encoding mode. oxcf->cost_upd_freq.mv = (cfg->kf_max_dist != 0) ? (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq : COST_UPD_OFF; oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq; // Set frame resize mode configuration. resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; resize_cfg->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator; resize_cfg->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator; if (resize_cfg->resize_mode == RESIZE_FIXED && resize_cfg->resize_scale_denominator == SCALE_NUMERATOR && resize_cfg->resize_kf_scale_denominator == SCALE_NUMERATOR) resize_cfg->resize_mode = RESIZE_NONE; // Set encoder algorithm related configuration. 
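  // Illustrative example: the temporal filter (ARNR) settings copied below are
  // bounded by validate_config() (arnr_max_frames <= 15, arnr_strength <= 6)
  // and are normally driven by the matching controls, e.g. with hypothetical
  // values:
  //   aom_codec_control(&codec, AOME_SET_ARNR_MAXFRAMES, 7);
  //   aom_codec_control(&codec, AOME_SET_ARNR_STRENGTH, 5);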
algo_cfg->enable_overlay = extra_cfg->enable_overlay; algo_cfg->disable_trellis_quant = extra_cfg->disable_trellis_quant; algo_cfg->sharpness = extra_cfg->sharpness; algo_cfg->arnr_max_frames = extra_cfg->arnr_max_frames; algo_cfg->arnr_strength = extra_cfg->arnr_strength; algo_cfg->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode; // TODO(any): Fix and Enable TPL for resize-mode > 0 algo_cfg->enable_tpl_model = resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model; algo_cfg->loopfilter_control = extra_cfg->loopfilter_control; algo_cfg->skip_postproc_filtering = extra_cfg->skip_postproc_filtering; // Set two-pass stats configuration. oxcf->twopass_stats_in = cfg->rc_twopass_stats_in; if (extra_cfg->two_pass_output) oxcf->two_pass_output = extra_cfg->two_pass_output; oxcf->second_pass_log = extra_cfg->second_pass_log; // Set Key frame configuration. kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled; kf_cfg->auto_key = cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; kf_cfg->key_freq_min = cfg->kf_min_dist; kf_cfg->key_freq_max = cfg->kf_max_dist; kf_cfg->sframe_dist = cfg->sframe_dist; kf_cfg->sframe_mode = cfg->sframe_mode; kf_cfg->enable_sframe = extra_cfg->s_frame_mode; kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering; kf_cfg->fwd_kf_dist = extra_cfg->fwd_kf_dist; // Disable key frame filtering in all intra mode. if (cfg->kf_max_dist == 0) { kf_cfg->enable_keyframe_filtering = 0; } kf_cfg->enable_intrabc = extra_cfg->enable_intrabc; oxcf->speed = extra_cfg->cpu_used; // TODO(yunqingwang, any) In REALTIME mode, 1080p performance at speed 5 & 6 // is quite bad. Force to use speed 7 for now. Will investigate it when we // work on rd path optimization later. if (oxcf->mode == REALTIME && AOMMIN(cfg->g_w, cfg->g_h) >= 1080 && oxcf->speed < 7) oxcf->speed = 7; // Set Color related configuration. color_cfg->color_primaries = extra_cfg->color_primaries; color_cfg->transfer_characteristics = extra_cfg->transfer_characteristics; color_cfg->matrix_coefficients = extra_cfg->matrix_coefficients; color_cfg->color_range = extra_cfg->color_range; color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position; // Set Group of frames configuration. // Force lag_in_frames to 0 for REALTIME mode gf_cfg->lag_in_frames = (oxcf->mode == REALTIME) ? 0 : clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS); gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref; gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref; gf_cfg->min_gf_interval = extra_cfg->min_gf_interval; gf_cfg->max_gf_interval = extra_cfg->max_gf_interval; gf_cfg->gf_min_pyr_height = extra_cfg->gf_min_pyr_height; gf_cfg->gf_max_pyr_height = extra_cfg->gf_max_pyr_height; // Set tune related configuration. tune_cfg->tuning = extra_cfg->tuning; tune_cfg->vmaf_model_path = extra_cfg->vmaf_model_path; tune_cfg->content = extra_cfg->content; if (cfg->large_scale_tile) { tune_cfg->film_grain_test_vector = 0; tune_cfg->film_grain_table_filename = NULL; } else { tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector; tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename; } tune_cfg->dist_metric = extra_cfg->dist_metric; #if CONFIG_DENOISE oxcf->noise_level = extra_cfg->noise_level; oxcf->noise_block_size = extra_cfg->noise_block_size; oxcf->enable_dnl_denoising = extra_cfg->enable_dnl_denoising; #endif #if CONFIG_AV1_TEMPORAL_DENOISING // Temporal denoiser is for nonrd pickmode so disable it for speed < 7. 
// Also disable it for speed 7 for now since it needs to be modified for // the check_partition_merge_mode feature. if (cfg->g_bit_depth == AOM_BITS_8 && oxcf->speed > 7) { oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; } else { oxcf->noise_sensitivity = 0; } #endif // Set Tile related configuration. tile_cfg->num_tile_groups = extra_cfg->num_tg; // In large-scale tile encoding mode, num_tile_groups is always 1. if (cfg->large_scale_tile) tile_cfg->num_tile_groups = 1; tile_cfg->mtu = extra_cfg->mtu_size; tile_cfg->enable_large_scale_tile = cfg->large_scale_tile; tile_cfg->enable_single_tile_decoding = (tile_cfg->enable_large_scale_tile) ? extra_cfg->single_tile_decoding : 0; if (extra_cfg->auto_tiles) { set_auto_tiles(tile_cfg, cfg->g_w, cfg->g_h, cfg->g_threads); extra_cfg->tile_columns = tile_cfg->tile_columns; extra_cfg->tile_rows = tile_cfg->tile_rows; } else { tile_cfg->tile_columns = extra_cfg->tile_columns; tile_cfg->tile_rows = extra_cfg->tile_rows; } tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); for (int i = 0; i < tile_cfg->tile_width_count; i++) { tile_cfg->tile_widths[i] = cfg->tile_widths[i]; } for (int i = 0; i < tile_cfg->tile_height_count; i++) { tile_cfg->tile_heights[i] = cfg->tile_heights[i]; } tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug; if (tile_cfg->enable_large_scale_tile) { // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or // AOM_SUPERBLOCK_SIZE_128X128 while tile_cfg->enable_large_scale_tile = 1. // If superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile). if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 && extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128) tool_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; } // Set reference frame related configuration. oxcf->ref_frm_cfg.max_reference_frames = extra_cfg->max_reference_frames; oxcf->ref_frm_cfg.enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set; oxcf->ref_frm_cfg.enable_onesided_comp = extra_cfg->enable_onesided_comp; oxcf->row_mt = extra_cfg->row_mt; oxcf->fp_mt = extra_cfg->fp_mt; // Set motion mode related configuration. oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc; oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion; #if !CONFIG_REALTIME_ONLY if (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7 && oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { // TODO(marpan): warped motion is causing a crash for RT mode with screen // in nonrd (speed >= 7), for non-realtime build. // Re-enable/allow when the issue is fixed. oxcf->motion_mode_cfg.enable_warped_motion = 0; oxcf->motion_mode_cfg.allow_warped_motion = 0; } else { oxcf->motion_mode_cfg.allow_warped_motion = (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); } #else oxcf->motion_mode_cfg.allow_warped_motion = (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7) ? false : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); #endif // Set partition related configuration. part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions; part_cfg->enable_ab_partitions = extra_cfg->enable_ab_partitions; part_cfg->enable_1to4_partitions = extra_cfg->enable_1to4_partitions; part_cfg->min_partition_size = extra_cfg->min_partition_size; part_cfg->max_partition_size = extra_cfg->max_partition_size; // Set intra mode configuration. 
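  // Illustrative example: each intra tool flag below has a matching control
  // (AV1E_SET_ENABLE_FILTER_INTRA, AV1E_SET_ENABLE_SMOOTH_INTRA,
  // AV1E_SET_ENABLE_PAETH_INTRA, AV1E_SET_ENABLE_CFL_INTRA, ...); for example,
  // Paeth prediction could be disabled with:
  //   aom_codec_control(&codec, AV1E_SET_ENABLE_PAETH_INTRA, 0);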
intra_mode_cfg->enable_angle_delta = extra_cfg->enable_angle_delta; intra_mode_cfg->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter; intra_mode_cfg->enable_filter_intra = extra_cfg->enable_filter_intra; intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra; intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra; intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra; intra_mode_cfg->enable_directional_intra = extra_cfg->enable_directional_intra; intra_mode_cfg->enable_diagonal_intra = extra_cfg->enable_diagonal_intra; intra_mode_cfg->auto_intra_tools_off = extra_cfg->auto_intra_tools_off; // Set transform size/type configuration. txfm_cfg->enable_tx64 = extra_cfg->enable_tx64; txfm_cfg->enable_flip_idtx = extra_cfg->enable_flip_idtx; txfm_cfg->enable_rect_tx = extra_cfg->enable_rect_tx; txfm_cfg->reduced_tx_type_set = extra_cfg->reduced_tx_type_set; txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only; txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only; txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only; txfm_cfg->enable_tx_size_search = extra_cfg->enable_tx_size_search; // Set compound type configuration. comp_type_cfg->enable_dist_wtd_comp = extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint; comp_type_cfg->enable_masked_comp = extra_cfg->enable_masked_comp; comp_type_cfg->enable_diff_wtd_comp = extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp; comp_type_cfg->enable_interinter_wedge = extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge; comp_type_cfg->enable_smooth_interintra = extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra; comp_type_cfg->enable_interintra_wedge = extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge; // Set Super-resolution mode configuration. if (extra_cfg->lossless || cfg->large_scale_tile) { disable_superres(superres_cfg); } else { superres_cfg->superres_mode = cfg->rc_superres_mode; superres_cfg->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator; superres_cfg->superres_kf_scale_denominator = (uint8_t)cfg->rc_superres_kf_denominator; superres_cfg->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh); superres_cfg->superres_kf_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); if (superres_cfg->superres_mode == AOM_SUPERRES_FIXED && superres_cfg->superres_scale_denominator == SCALE_NUMERATOR && superres_cfg->superres_kf_scale_denominator == SCALE_NUMERATOR) { disable_superres(superres_cfg); } if (superres_cfg->superres_mode == AOM_SUPERRES_QTHRESH && superres_cfg->superres_qthresh == 255 && superres_cfg->superres_kf_qthresh == 255) { disable_superres(superres_cfg); } } superres_cfg->enable_superres = (superres_cfg->superres_mode != AOM_SUPERRES_NONE) && extra_cfg->enable_superres; if (!superres_cfg->enable_superres) { disable_superres(superres_cfg); } if (input_cfg->limit == 1) { // still picture mode, display model and timing is meaningless dec_model_cfg->display_model_info_present_flag = 0; dec_model_cfg->timing_info_present = 0; } oxcf->save_as_annexb = cfg->save_as_annexb; // Set unit test related configuration. 
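  // Note (illustrative): alongside the test-only fields below, this block also
  // copies the per-operating-point level targets validated earlier by
  // is_valid_seq_level_idx(); an application would typically set them through
  // the AV1E_SET_TARGET_SEQ_LEVEL_IDX control, e.g. (hypothetical value):
  //   aom_codec_control(&codec, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 8);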
oxcf->unit_test_cfg.motion_vector_unit_test = extra_cfg->motion_vector_unit_test; oxcf->unit_test_cfg.sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test; oxcf->border_in_pixels = av1_get_enc_border_size(av1_is_resize_needed(oxcf), (oxcf->kf_cfg.key_freq_max == 0), BLOCK_128X128); memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx, sizeof(oxcf->target_seq_level_idx)); oxcf->tier_mask = extra_cfg->tier_mask; oxcf->partition_info_path = extra_cfg->partition_info_path; oxcf->enable_rate_guide_deltaq = extra_cfg->enable_rate_guide_deltaq; oxcf->rate_distribution_info = extra_cfg->rate_distribution_info; oxcf->strict_level_conformance = extra_cfg->strict_level_conformance; oxcf->kf_max_pyr_height = extra_cfg->kf_max_pyr_height; oxcf->sb_qp_sweep = extra_cfg->sb_qp_sweep; } AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg) { AV1EncoderConfig oxcf; struct av1_extracfg extra_cfg = default_extra_cfg; set_encoder_config(&oxcf, cfg, &extra_cfg); return oxcf; } static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg) { aom_codec_err_t res; int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); // Note: function encoder_set_config() is allowed to be called multiple // times. However, when the original frame width or height is less than two // times of the new frame width or height, a forced key frame should be // used. To make sure the correct detection of a forced key frame, we need // to update the frame width and height only when the actual encoding is // performed. cpi->last_coded_width and cpi->last_coded_height are used to // track the actual coded frame size. if (ctx->ppi->cpi->last_coded_width && ctx->ppi->cpi->last_coded_height && (!valid_ref_frame_size(ctx->ppi->cpi->last_coded_width, ctx->ppi->cpi->last_coded_height, cfg->g_w, cfg->g_h) || ((int)cfg->g_w > ctx->ppi->cpi->last_coded_width) || ((int)cfg->g_h > ctx->ppi->cpi->last_coded_height))) { force_key = 1; } } if (ctx->monochrome_on_init && cfg->monochrome == 0) { // TODO(aomedia:3465): Allow this case to work without requiring re-init // of encoder. ERROR("Cannot change to monochrome = 0 after init with monochrome"); } // Prevent increasing lag_in_frames. This check is stricter than it needs // to be -- the limit is not increasing past the first lag_in_frames // value, but we don't track the initial config, only the last successful // config. 
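  // In practice this means an application may lower g_lag_in_frames on a later
  // aom_codec_enc_config_set() call but may not raise it again, e.g.
  // (hypothetical sequence, initialized with g_lag_in_frames = 35 and with
  // lookahead processing not in use, as checked next):
  //   cfg.g_lag_in_frames = 19;  // accepted
  //   cfg.g_lag_in_frames = 35;  // rejected: larger than last accepted value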
if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) ERROR("Cannot increase lag_in_frames"); // Prevent changing lag_in_frames if Lookahead Processing is enabled if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames && ctx->num_lap_buffers > 0) ERROR("Cannot change lag_in_frames if LAP is enabled"); res = validate_config(ctx, cfg, &ctx->extra_cfg); if (res == AOM_CODEC_OK) { ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); // On profile change, request a key frame force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile; bool is_sb_size_changed = false; av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, is_sb_size_changed); } if (ctx->ppi->cpi_lap != NULL) { av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); } } if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF; return res; } static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { return av1_get_global_headers(ctx->ppi); } static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = av1_get_quantizer(ctx->ppi->cpi); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->ppi->cpi)); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_loopfilter_level(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = ctx->ppi->cpi->common.lf.filter_level[0]; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = ctx->ppi->p_rc.baseline_gf_interval; return AOM_CODEC_OK; } static aom_codec_err_t update_encoder_cfg(aom_codec_alg_priv_t *ctx) { set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); bool is_sb_size_changed = false; av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i]; struct aom_internal_error_info *const error = cpi->common.error; if (setjmp(error->jmp)) { error->setjmp = 0; return error->error_code; } error->setjmp = 1; av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); error->setjmp = 0; } if (ctx->ppi->cpi_lap != NULL) { AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; struct aom_internal_error_info *const error = cpi_lap->common.error; if (setjmp(error->jmp)) { error->setjmp = 0; return error->error_code; } error->setjmp = 1; av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); error->setjmp = 0; } return AOM_CODEC_OK; } static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, const struct av1_extracfg *extra_cfg) { const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); if (res == AOM_CODEC_OK) { ctx->extra_cfg = *extra_cfg; return update_encoder_cfg(ctx); } return res; } static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.cpu_used = CAST(AOME_SET_CPUUSED, args); return update_extra_cfg(ctx, 
&extra_cfg); } static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int row_mt = CAST(AV1E_SET_ROW_MT, args); if (row_mt == ctx->extra_cfg.row_mt) return AOM_CODEC_OK; struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.row_mt = row_mt; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, va_list args) { // If the control AUTO_TILES is used (set to 1) then don't override // the tile_columns set via the AUTO_TILES control. if (ctx->extra_cfg.auto_tiles) { ERROR("AUTO_TILES is set so AV1E_SET_TILE_COLUMNS should not be called."); } struct av1_extracfg extra_cfg = ctx->extra_cfg; unsigned int tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); if (tile_columns == extra_cfg.tile_columns) return AOM_CODEC_OK; extra_cfg.tile_columns = tile_columns; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, va_list args) { // If the control AUTO_TILES is used (set to 1) then don't override // the tile_rows set via the AUTO_TILES control. 
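  // Illustrative note: with auto_tiles enabled (the AUTO_TILES control
  // mentioned above), set_auto_tiles() derives the tiling from frame size and
  // thread count. For example, a 1920x1080 frame with g_threads = 8 gives
  // tiles = min(32, min(8, ceil(1920 * 1080 / (128 * 128)))) = 8, i.e.
  // tiles_log2 = 3, or 4 tile columns x 2 tile rows.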
if (ctx->extra_cfg.auto_tiles) { ERROR("AUTO_TILES is set so AV1E_SET_TILE_ROWS should not be called."); } struct av1_extracfg extra_cfg = ctx->extra_cfg; unsigned int tile_rows = CAST(AV1E_SET_TILE_ROWS, args); if (tile_rows == extra_cfg.tile_rows) return AOM_CODEC_OK; extra_cfg.tile_rows = tile_rows; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); #if CONFIG_REALTIME_ONLY if (tpl_model_arg) { ERROR("TPL model can't be turned on in realtime only build."); } #endif extra_cfg.enable_tpl_model = tpl_model_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_keyframe_filtering( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_keyframe_filtering = CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t handle_tuning(aom_codec_alg_priv_t *ctx, struct av1_extracfg *extra_cfg) { if (extra_cfg->tuning == AOM_TUNE_IQ) { if (ctx->cfg.g_usage != AOM_USAGE_ALL_INTRA) return AOM_CODEC_INCAPABLE; // Enable QMs as they've been found to be beneficial for images, when used // with alternative QM formulas: // - aom_get_qmlevel_allintra() // - aom_get_qmlevel_luma_iq() // - aom_get_qmlevel_444_chroma_iq() extra_cfg->enable_qm = 1; extra_cfg->qm_min = QM_FIRST_IQ; extra_cfg->qm_max = QM_LAST_IQ; // We can turn on loop filter sharpness, as frames do not have to serve as // references to others. extra_cfg->sharpness = 7; // Using the QM-PSNR metric was found to be beneficial for images (over the // default PSNR metric), as it correlates better with subjective image // quality consistency and better SSIMULACRA 2 scores. extra_cfg->dist_metric = AOM_DIST_METRIC_QM_PSNR; // CDEF_ALL has been found to blur images at medium and high quality // qindexes, so let's use a version that adapts CDEF strength on frame // qindexes. CDEF_ADAPTIVE strengths look like this for varying qindexes: // - CDEF off: 0 - 32 // - Reduced strength: 33 - 220 // - Full strength: 221 - 255 extra_cfg->enable_cdef = CDEF_ADAPTIVE; // Enable chroma deltaq so the encoder can factor in chroma subsampling and // adjust chroma quality when necessary. extra_cfg->enable_chroma_deltaq = 1; // Enable "Variance Boost" deltaq mode, optimized for images. 
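    // Illustrative usage (hypothetical call sequence): an application opts
    // into this tuning with
    //   aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
    //                                AOM_USAGE_ALL_INTRA);
    //   aom_codec_control(&codec, AOME_SET_TUNING, AOM_TUNE_IQ);
    // after init; with any other g_usage the control is rejected with
    // AOM_CODEC_INCAPABLE, as checked at the top of this function.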
extra_cfg->deltaq_mode = DELTA_Q_VARIANCE_BOOST; } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.tuning = CAST(AOME_SET_TUNING, args); aom_codec_err_t err = handle_tuning(ctx, &extra_cfg); if (err != AOM_CODEC_OK) return err; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.rc_max_intra_bitrate_pct = CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.rc_max_inter_bitrate_pct = CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args); #if CONFIG_REALTIME_ONLY if (restoration_arg) { ERROR("Restoration can't be turned on in realtime only build."); } #endif extra_cfg.enable_restoration = restoration_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args); #if CONFIG_REALTIME_ONLY if (obmc_arg) { ERROR("OBMC can't be enabled in realtime only build."); } #endif extra_cfg.enable_obmc = obmc_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); #if !CONFIG_QUANT_MATRIX if (extra_cfg.enable_qm) { ERROR("QM can't be enabled with CONFIG_QUANT_MATRIX=0."); } #endif return update_extra_cfg(ctx, 
&extra_cfg); } static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_chroma_deltaq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_rect_partitions( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_rect_partitions = CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_1to4_partitions( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_1to4_partitions = CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; 
extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_intra_edge_filter( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_intra_edge_filter = CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_rect_tx(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_rect_tx = CAST(AV1E_SET_ENABLE_RECT_TX, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_reduced_reference_set( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_reduced_reference_set = CAST(AV1E_SET_REDUCED_REFERENCE_SET, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_interintra_comp( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_interintra_comp = CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t 
ctrl_set_enable_smooth_interintra( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_smooth_interintra = CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_interinter_wedge( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_interinter_wedge = CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_interintra_wedge( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_interintra_wedge = CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); #if CONFIG_REALTIME_ONLY if (global_motion_arg) { ERROR("Global motion can't be enabled in realtime only build."); } #endif extra_cfg.enable_global_motion = global_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); #if CONFIG_REALTIME_ONLY if (warped_motion_arg) { ERROR("Warped motion can't be enabled in realtime only build."); } #endif extra_cfg.enable_warped_motion = warped_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_directional_intra( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_directional_intra = CAST(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_diagonal_intra(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_diagonal_intra = CAST(AV1E_SET_ENABLE_DIAGONAL_INTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t 
ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args); #if CONFIG_REALTIME_ONLY if (extra_cfg.enable_cfl_intra) { ERROR("cfl can't be turned on in realtime only build."); } #endif return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_overlay(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.frame_parallel_decoding_mode = CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args); // Skip AQ mode if using fixed QP for current frame. 
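  // Illustrative note: aq_mode is validated elsewhere to stay below
  // AQ_MODE_COUNT, must be 0 for lossless coding, and is rejected together
  // with large_scale_tile. A typical call (hypothetical value selecting
  // cyclic refresh) is:
  //   aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);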
if (ctx->ppi->cpi->rc.use_external_qp_one_pass) extra_cfg.aq_mode = 0; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.use_intra_default_tx_only = CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_tx_size_search(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_tx_size_search = CAST(AV1E_SET_ENABLE_TX_SIZE_SEARCH, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx, va_list args) { #if CONFIG_REALTIME_ONLY (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args); return update_extra_cfg(ctx, &extra_cfg); #endif } static aom_codec_err_t ctrl_set_vbr_corpus_complexity_lap( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.vbr_corpus_complexity_lap = CAST(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.dv_cost_upd_freq = CAST(AV1E_SET_DV_COST_UPD_FREQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const char *str = CAST(AV1E_SET_VMAF_MODEL_PATH, args); const aom_codec_err_t ret = allocate_and_set_string( str, default_extra_cfg.vmaf_model_path, &extra_cfg.vmaf_model_path, ctx->ppi->error.detail); if (ret != AOM_CODEC_OK) return ret; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx, 
va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const char *str = CAST(AV1E_SET_PARTITION_INFO_PATH, args); const aom_codec_err_t ret = allocate_and_set_string( str, default_extra_cfg.partition_info_path, &extra_cfg.partition_info_path, ctx->ppi->error.detail); if (ret != AOM_CODEC_OK) return ret; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_enable_rate_guide_deltaq(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_rate_guide_deltaq = CAST(AV1E_ENABLE_RATE_GUIDE_DELTAQ, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_rate_distribution_info( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const char *str = CAST(AV1E_SET_RATE_DISTRIBUTION_INFO, args); const aom_codec_err_t ret = allocate_and_set_string( str, default_extra_cfg.rate_distribution_info, &extra_cfg.rate_distribution_info, ctx->ppi->error.detail); if (ret != AOM_CODEC_OK) return ret; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_film_grain_test_vector( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.film_grain_test_vector = CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const char *str = CAST(AV1E_SET_FILM_GRAIN_TABLE, args); if (str == NULL) { // this parameter allows NULL as its value extra_cfg.film_grain_table_filename = str; } else { #if CONFIG_REALTIME_ONLY ERROR("film_grain removed from realtime only build."); #endif const aom_codec_err_t ret = allocate_and_set_string( str, default_extra_cfg.film_grain_table_filename, &extra_cfg.film_grain_table_filename, ctx->ppi->error.detail); if (ret != AOM_CODEC_OK) return ret; } return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_DENOISE (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.noise_level = ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f; return update_extra_cfg(ctx, &extra_cfg); #endif } static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_DENOISE (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args); return update_extra_cfg(ctx, &extra_cfg); #endif } static aom_codec_err_t ctrl_set_enable_dnl_denoising(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_DENOISE (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_dnl_denoising = CAST(AV1E_SET_ENABLE_DNL_DENOISING, args); return update_extra_cfg(ctx, &extra_cfg); #endif } static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args); #if CONFIG_REALTIME_ONLY if (deltaq_arg > NO_DELTA_Q) { ERROR("Delta Q mode can't be enabled in realtime only build."); } #endif extra_cfg.deltaq_mode = deltaq_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_deltaq_strength(aom_codec_alg_priv_t *ctx, va_list args) { struct 
av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.deltaq_strength = CAST(AV1E_SET_DELTAQ_STRENGTH, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_deltalf_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_gf_min_pyr_height(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_enable_motion_vector_unit_test( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.motion_vector_unit_test = CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_enable_fpmt_unit_test(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_FPMT_TEST (void)args; (void)ctx; return AOM_CODEC_INCAPABLE; #else struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.fpmt_unit_test = CAST(AV1E_SET_FP_MT_UNIT_TEST, args); ctx->ppi->fpmt_unit_test_cfg = (extra_cfg.fpmt_unit_test == 1) ? 
PARALLEL_ENCODE : PARALLEL_SIMULATION_ENCODE; return update_extra_cfg(ctx, &extra_cfg); #endif } static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args); const int level = val % 100; const int operating_point_idx = val / 100; if (operating_point_idx < 0 || operating_point_idx >= MAX_NUM_OPERATING_POINTS) { char *const err_string = ctx->ppi->error.detail; snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Invalid operating point index: %d", operating_point_idx); ctx->base.err_detail = err_string; return AOM_CODEC_INVALID_PARAM; } extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_min_cr(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_enable_sb_multipass_unit_test( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.sb_multipass_unit_test = CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_enable_sb_qp_sweep(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.sb_qp_sweep = CAST(AV1E_ENABLE_SB_QP_SWEEP, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx, va_list args) { AV1_COMP *const cpi = ctx->ppi->cpi; aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args); aom_ext_part_config_t config; // TODO(chengchen): verify the sb_size has been set at this point. config.superblock_size = cpi->common.seq_params->sb_size; const aom_codec_err_t status = av1_ext_part_create(funcs, config, &cpi->ext_part_controller); return status; } static aom_codec_err_t ctrl_set_loopfilter_control(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.loopfilter_control = CAST(AV1E_SET_LOOPFILTER_CONTROL, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_skip_postproc_filtering( aom_codec_alg_priv_t *ctx, va_list args) { // Skipping the application of post-processing filters is allowed only // for ALLINTRA mode. 
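// Illustrative caller-side sketch (assumes an encoder created with the
// public AOM_USAGE_ALL_INTRA usage; the other names are public API as well):
//   aom_codec_enc_cfg_t cfg;
//   aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_ALL_INTRA);
//   ...
//   aom_codec_control(&codec, AV1E_SET_SKIP_POSTPROC_FILTERING, 1);
// For any other g_usage the control returns AOM_CODEC_INCAPABLE, as enforced
// by the check below.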
if (ctx->cfg.g_usage != AOM_USAGE_ALL_INTRA) return AOM_CODEC_INCAPABLE; struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.skip_postproc_filtering = CAST(AV1E_SET_SKIP_POSTPROC_FILTERING, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_rtc_external_rc(aom_codec_alg_priv_t *ctx, va_list args) { ctx->ppi->cpi->rc.rtc_external_ratectrl = CAST(AV1E_SET_RTC_EXTERNAL_RC, args); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_quantizer_one_pass(aom_codec_alg_priv_t *ctx, va_list args) { const int qp = CAST(AV1E_SET_QUANTIZER_ONE_PASS, args); if (qp < 0 || qp > 63) return AOM_CODEC_INVALID_PARAM; aom_codec_enc_cfg_t *cfg = &ctx->cfg; struct av1_extracfg extra_cfg = ctx->extra_cfg; cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; extra_cfg.aq_mode = 0; ctx->ppi->cpi->rc.use_external_qp_one_pass = 1; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_bitrate_one_pass_cbr(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; AV1EncoderConfig *oxcf = &cpi->oxcf; if (!is_one_pass_rt_params(cpi) || oxcf->rc_cfg.mode != AOM_CBR || cpi->ppi->use_svc || ppi->num_fp_contexts != 1 || ppi->cpi_lap != NULL) { return AOM_CODEC_INVALID_PARAM; } const int new_bitrate = CAST(AV1E_SET_BITRATE_ONE_PASS_CBR, args); ctx->cfg.rc_target_bitrate = new_bitrate; oxcf->rc_cfg.target_bandwidth = new_bitrate * 1000; set_primary_rc_buffer_sizes(oxcf, ppi); av1_new_framerate(cpi, cpi->framerate); check_reset_rc_flag(cpi); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr( aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; const int max_consec_drop = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, args); if (max_consec_drop < 0) return AOM_CODEC_INVALID_PARAM; cpi->rc.max_consec_drop = max_consec_drop; cpi->rc.drop_count_consec = 0; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_max_consec_frame_drop_ms_cbr( aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; const int max_consec_drop_ms = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, args); if (max_consec_drop_ms < 0) return AOM_CODEC_INVALID_PARAM; // max_consec_drop_ms will be converted to frame units inside encoder // based on framerate (which can change dynamically). 
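// Worked example (the exact rounding is handled inside the encoder, this is
// only an approximation): at 30 fps a budget of 100 ms corresponds to roughly
// 100 * 30 / 1000 = 3 consecutive droppable frames, and the figure is
// recomputed whenever the frame rate changes.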
ctx->oxcf.rc_cfg.max_consec_drop_ms = max_consec_drop_ms; cpi->rc.drop_count_consec = 0; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; cpi->svc.framedrop_mode = CAST(AV1E_SET_SVC_FRAME_DROP_MODE, args); if (cpi->svc.framedrop_mode != AOM_LAYER_DROP && cpi->svc.framedrop_mode != AOM_FULL_SUPERFRAME_DROP) return AOM_CODEC_INVALID_PARAM; else return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_auto_tiles(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int auto_tiles = CAST(AV1E_SET_AUTO_TILES, args); if (auto_tiles == ctx->extra_cfg.auto_tiles) return AOM_CODEC_OK; struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.auto_tiles = auto_tiles; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_postencode_drop_rtc(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; int enable_postencode_drop = CAST(AV1E_SET_POSTENCODE_DROP_RTC, args); if (enable_postencode_drop > 1 || enable_postencode_drop < 0) return AOM_CODEC_INVALID_PARAM; cpi->rc.postencode_drop = enable_postencode_drop; return AOM_CODEC_OK; } #if !CONFIG_REALTIME_ONLY static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context, int num_lap_buffers) { aom_codec_err_t res = AOM_CODEC_OK; int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); *frame_stats_buffer = (FIRSTPASS_STATS *)aom_calloc(size, sizeof(FIRSTPASS_STATS)); if (*frame_stats_buffer == NULL) return AOM_CODEC_MEM_ERROR; stats_buf_context->stats_in_start = *frame_stats_buffer; stats_buf_context->stats_in_end = stats_buf_context->stats_in_start; stats_buf_context->stats_in_buf_end = stats_buf_context->stats_in_start + size; stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS)); if (stats_buf_context->total_left_stats == NULL) return AOM_CODEC_MEM_ERROR; av1_twopass_zero_stats(stats_buf_context->total_left_stats); stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS)); if (stats_buf_context->total_stats == NULL) return AOM_CODEC_MEM_ERROR; av1_twopass_zero_stats(stats_buf_context->total_stats); return res; } #endif aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, const AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames) { aom_codec_err_t res = AOM_CODEC_OK; BufferPool *buffer_pool = *p_buffer_pool; if (buffer_pool == NULL) { buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); if (buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; buffer_pool->num_frame_bufs = (oxcf->mode == ALLINTRA) ? 
FRAME_BUFFERS_ALLINTRA : FRAME_BUFFERS; buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc( buffer_pool->num_frame_bufs, sizeof(*buffer_pool->frame_bufs)); if (buffer_pool->frame_bufs == NULL) { buffer_pool->num_frame_bufs = 0; aom_free(buffer_pool); return AOM_CODEC_MEM_ERROR; } #if CONFIG_MULTITHREAD if (pthread_mutex_init(&buffer_pool->pool_mutex, NULL)) { aom_free(buffer_pool->frame_bufs); buffer_pool->frame_bufs = NULL; buffer_pool->num_frame_bufs = 0; aom_free(buffer_pool); return AOM_CODEC_MEM_ERROR; } #endif *p_buffer_pool = buffer_pool; } *p_cpi = av1_create_compressor(ppi, oxcf, buffer_pool, stage, lap_lag_in_frames); if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR; return res; } static aom_codec_err_t ctrl_set_fp_mt(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.fp_mt = CAST(AV1E_SET_FP_MT, args); const aom_codec_err_t result = update_extra_cfg(ctx, &extra_cfg); int num_fp_contexts = 1; if (ctx->ppi->num_fp_contexts == 1) { num_fp_contexts = av1_compute_num_fp_contexts(ctx->ppi, &ctx->ppi->parallel_cpi[0]->oxcf); if (num_fp_contexts > 1) { int i; for (i = 1; i < num_fp_contexts; i++) { int res = av1_create_context_and_bufferpool( ctx->ppi, &ctx->ppi->parallel_cpi[i], &ctx->buffer_pool, &ctx->oxcf, ENCODE_STAGE, -1); if (res != AOM_CODEC_OK) { return res; } #if !CONFIG_REALTIME_ONLY ctx->ppi->parallel_cpi[i]->twopass_frame.stats_in = ctx->ppi->twopass.stats_buf_ctx->stats_in_start; #endif } } } ctx->ppi->num_fp_contexts = num_fp_contexts; return result; } static aom_codec_err_t ctrl_set_auto_intra_tools_off(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.auto_intra_tools_off = CAST(AV1E_SET_AUTO_INTRA_TOOLS_OFF, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) { aom_codec_err_t res = AOM_CODEC_OK; if (ctx->priv == NULL) { aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv)); if (priv == NULL) return AOM_CODEC_MEM_ERROR; ctx->priv = (aom_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; // Update the reference to the config structure to an internal copy. assert(ctx->config.enc); priv->cfg = *ctx->config.enc; ctx->config.enc = &priv->cfg; priv->extra_cfg = default_extra_cfg; // Special handling: // By default, if omitted: --enable-cdef=1, --qm-min=5, and --qm-max=9 // Here we set its default values to 0, 4, and 10 respectively when // --allintra is turned on. // However, if users set --enable-cdef, --qm-min, or --qm-max, either from // the command line or aom_codec_control(), the encoder still respects it. if (priv->cfg.g_usage == AOM_USAGE_ALL_INTRA) { // CDEF has been found to blur images, so it's disabled in all-intra mode priv->extra_cfg.enable_cdef = 0; // These QM min/max values have been found to be beneficial for images, // when used with an alternative QM formula (see // aom_get_qmlevel_allintra()). // These values could also be beneficial for other usage modes, but // further testing is required. 
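// Illustrative caller-side sketch (control IDs are public API): the
// all-intra defaults chosen here can still be overridden explicitly, e.g.
//   aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
//   aom_codec_control(&codec, AV1E_SET_QM_MIN, 5);
//   aom_codec_control(&codec, AV1E_SET_QM_MAX, 9);
// which restores the non-all-intra defaults mentioned above.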
priv->extra_cfg.qm_min = DEFAULT_QM_FIRST_ALLINTRA; priv->extra_cfg.qm_max = DEFAULT_QM_LAST_ALLINTRA; } av1_initialize_enc(priv->cfg.g_usage, priv->cfg.rc_end_usage); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); if (res == AOM_CODEC_OK) { int *num_lap_buffers = &priv->num_lap_buffers; int lap_lag_in_frames = 0; *num_lap_buffers = 0; priv->timestamp_ratio.den = priv->cfg.g_timebase.den; priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC; reduce_ratio(&priv->timestamp_ratio); set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); if (priv->oxcf.rc_cfg.mode != AOM_CBR && priv->oxcf.pass == AOM_RC_ONE_PASS && priv->oxcf.mode == GOOD) { // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR *num_lap_buffers = AOMMIN((int)priv->cfg.g_lag_in_frames, AOMMIN(MAX_LAP_BUFFERS, priv->oxcf.kf_cfg.key_freq_max + SCENE_CUT_KEY_TEST_INTERVAL)); if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >= LAP_LAG_IN_FRAMES) { lap_lag_in_frames = LAP_LAG_IN_FRAMES; } } priv->oxcf.use_highbitdepth = (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0; priv->monochrome_on_init = priv->cfg.monochrome; priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head, *num_lap_buffers, &priv->oxcf); if (!priv->ppi) return AOM_CODEC_MEM_ERROR; #if !CONFIG_REALTIME_ONLY res = create_stats_buffer(&priv->frame_stats_buffer, &priv->stats_buf_context, *num_lap_buffers); if (res != AOM_CODEC_OK) return res; assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS); for (int i = 0; i < size; i++) priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i]; priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context; #endif assert(priv->ppi->num_fp_contexts >= 1); res = av1_create_context_and_bufferpool( priv->ppi, &priv->ppi->parallel_cpi[0], &priv->buffer_pool, &priv->oxcf, ENCODE_STAGE, -1); if (res != AOM_CODEC_OK) { priv->base.err_detail = "av1_create_context_and_bufferpool() failed"; return res; } #if !CONFIG_REALTIME_ONLY priv->ppi->parallel_cpi[0]->twopass_frame.stats_in = priv->ppi->twopass.stats_buf_ctx->stats_in_start; #endif priv->ppi->cpi = priv->ppi->parallel_cpi[0]; // Create another compressor if look ahead is enabled if (res == AOM_CODEC_OK && *num_lap_buffers) { res = av1_create_context_and_bufferpool( priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS)); } } } return res; } void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, BufferPool **p_buffer_pool) { av1_remove_compressor(cpi); if (*p_buffer_pool) { av1_free_ref_frame_buffers(*p_buffer_pool); #if CONFIG_MULTITHREAD pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex); #endif aom_free(*p_buffer_pool); *p_buffer_pool = NULL; } } static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, FIRSTPASS_STATS *frame_stats_buffer) { aom_free(stats_buf_context->total_left_stats); aom_free(stats_buf_context->total_stats); aom_free(frame_stats_buffer); } static void check_and_free_string(const char *default_str, const char **ptr) { if (*ptr == default_str) { // Default should be a literal. Do not free. 
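// Ownership sketch (restating the invariant relied on here): every
// configurable string in av1_extracfg is either the compile-time default
// literal or a heap copy created by allocate_and_set_string() via
// aom_malloc(). Only the heap copies may reach the aom_free() below; the
// pointer-equality test above filters out the literals.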
return; } aom_free((void *)*ptr); *ptr = NULL; } static void destroy_extra_config(struct av1_extracfg *extra_cfg) { #if CONFIG_TUNE_VMAF check_and_free_string(default_extra_cfg.vmaf_model_path, &extra_cfg->vmaf_model_path); #endif check_and_free_string(default_extra_cfg.two_pass_output, &extra_cfg->two_pass_output); check_and_free_string(default_extra_cfg.two_pass_output, &extra_cfg->second_pass_log); check_and_free_string(default_extra_cfg.partition_info_path, &extra_cfg->partition_info_path); check_and_free_string(default_extra_cfg.rate_distribution_info, &extra_cfg->rate_distribution_info); check_and_free_string(default_extra_cfg.film_grain_table_filename, &extra_cfg->film_grain_table_filename); } static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) { free(ctx->cx_data); destroy_extra_config(&ctx->extra_cfg); if (ctx->ppi) { AV1_PRIMARY *ppi = ctx->ppi; for (int i = 0; i < MAX_PARALLEL_FRAMES - 1; i++) { if (ppi->parallel_frames_data[i].cx_data) { free(ppi->parallel_frames_data[i].cx_data); } } #if CONFIG_ENTROPY_STATS print_entropy_stats(ppi); #endif #if CONFIG_INTERNAL_STATS print_internal_stats(ppi); #endif for (int i = 0; i < MAX_PARALLEL_FRAMES; i++) { av1_destroy_context_and_bufferpool(ppi->parallel_cpi[i], &ctx->buffer_pool); } ppi->cpi = NULL; if (ppi->cpi_lap) { av1_destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap); } av1_remove_primary_compressor(ppi); } destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); aom_free(ctx); return AOM_CODEC_OK; } static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, unsigned int lib_flags) { aom_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY; if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY; if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH; if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT) flags |= AOM_FRAME_IS_ERROR_RESILIENT; if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE; return flags; } static inline int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) { if (cpi->oxcf.mode != REALTIME || av1_is_resize_needed(&cpi->oxcf)) return cpi->oxcf.border_in_pixels; const int sb_size_in_pixels_log2 = mi_size_wide_log2[sb_size] + MI_SIZE_LOG2; const int sb_aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.width, sb_size_in_pixels_log2); const int sb_aligned_height = ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.height, sb_size_in_pixels_log2); // Align the border pixels to a multiple of 32. const int border_pixels_width = ALIGN_POWER_OF_TWO(sb_aligned_width - cpi->oxcf.frm_dim_cfg.width, 5); const int border_pixels_height = ALIGN_POWER_OF_TWO(sb_aligned_height - cpi->oxcf.frm_dim_cfg.height, 5); const int border_in_pixels = AOMMAX(AOMMAX(border_pixels_width, border_pixels_height), 32); return border_in_pixels; } // TODO(Mufaddal): Check feasibility of abstracting functions related to LAP // into a separate function. 
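// Worked example for get_src_border_in_pixels() above (realtime path, no
// resize pending, 1920x1080 with 64x64 superblocks): the SB-aligned height
// is 1088, so the height slack of 8 pixels rounds up to 32, while the width
// is already SB-aligned and contributes 0. The result is
// AOMMAX(AOMMAX(0, 32), 32) = 32 border pixels, rather than the generic
// oxcf.border_in_pixels used on the non-realtime path.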
static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, aom_enc_frame_flags_t enc_flags) { const size_t kMinCompressedSize = 8192; volatile aom_codec_err_t res = AOM_CODEC_OK; AV1_PRIMARY *const ppi = ctx->ppi; volatile aom_codec_pts_t ptsvol = pts; AV1_COMP_DATA cpi_data = { 0 }; cpi_data.timestamp_ratio = &ctx->timestamp_ratio; cpi_data.flush = !img; // LAP context AV1_COMP *cpi_lap = ppi->cpi_lap; if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM; ppi->cpi->last_coded_width = ppi->cpi->oxcf.frm_dim_cfg.width; ppi->cpi->last_coded_height = ppi->cpi->oxcf.frm_dim_cfg.height; if (ppi->lap_enabled && cpi_lap == NULL && ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS) return AOM_CODEC_INVALID_PARAM; if (img != NULL) { res = validate_img(ctx, img); if (res == AOM_CODEC_OK) { const uint64_t uncompressed_frame_sz64 = (uint64_t)ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8; #if UINT64_MAX > SIZE_MAX if (uncompressed_frame_sz64 > SIZE_MAX) return AOM_CODEC_MEM_ERROR; #endif const size_t uncompressed_frame_sz = (size_t)uncompressed_frame_sz64; // Due to the presence of no-show frames, the ctx->cx_data buffer holds // compressed data corresponding to multiple frames. As no-show frames are // not possible for all intra frame encoding with no forward key frames, // the buffer is allocated with a smaller size in this case. // // For pseudo random input, the compressed frame size is seen to exceed // the uncompressed frame size, but is less than 2 times the uncompressed // frame size. Hence the size of the buffer is chosen as 2 times the // uncompressed frame size. int multiplier = 8; if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 && !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled) multiplier = 2; if (uncompressed_frame_sz > SIZE_MAX / multiplier) return AOM_CODEC_MEM_ERROR; size_t data_sz = uncompressed_frame_sz * multiplier; if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; free(ctx->cx_data); ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); if (ctx->cx_data == NULL) { ctx->cx_data_sz = 0; return AOM_CODEC_MEM_ERROR; } } for (int i = 0; i < ppi->num_fp_contexts - 1; i++) { if (ppi->parallel_frames_data[i].cx_data == NULL || ppi->parallel_frames_data[i].cx_data_sz < data_sz) { ppi->parallel_frames_data[i].cx_data_sz = data_sz; free(ppi->parallel_frames_data[i].cx_data); ppi->parallel_frames_data[i].frame_size = 0; ppi->parallel_frames_data[i].cx_data = (unsigned char *)malloc(ppi->parallel_frames_data[i].cx_data_sz); if (ppi->parallel_frames_data[i].cx_data == NULL) { ppi->parallel_frames_data[i].cx_data_sz = 0; return AOM_CODEC_MEM_ERROR; } } } } } aom_codec_pkt_list_init(&ctx->pkt_list); volatile aom_enc_frame_flags_t flags = enc_flags; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(ppi->error.jmp)) { ppi->error.setjmp = 0; res = update_error_state(ctx, &ppi->error); return res; } ppi->error.setjmp = 1; if (ppi->use_svc && ppi->cpi->svc.use_flexible_mode == 0 && flags == 0) av1_set_svc_fixed_mode(ppi->cpi); // Note(yunqing): While applying encoding flags, always start from enabling // all, and then modifying according to the flags. Previous frame's flags are // overwritten. 
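// Illustrative caller-side consequence (public API names): because the flag
// state is rebuilt on every call, a per-frame flag only affects the frame it
// is passed with and must be repeated if wanted again, e.g.
//   aom_codec_encode(&codec, img,  pts,  duration, AOM_EFLAG_FORCE_KF);
//   aom_codec_encode(&codec, img2, pts2, duration, 0);  // defaults again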
av1_apply_encoding_flags(ppi->cpi, flags); if (cpi_lap != NULL) { av1_apply_encoding_flags(cpi_lap, flags); } #if CONFIG_TUNE_VMAF if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { aom_init_vmaf_model(&ppi->cpi->vmaf_info.vmaf_model, ppi->cpi->oxcf.tune_cfg.vmaf_model_path); } #endif // Handle fixed keyframe intervals if (is_stat_generation_stage(ppi->cpi) || is_one_pass_rt_params(ppi->cpi)) { if (ctx->cfg.kf_mode == AOM_KF_AUTO && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { if (ppi->cpi->common.spatial_layer_id == 0 && ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) { flags |= AOM_EFLAG_FORCE_KF; ctx->fixed_kf_cntr = 1; } } } if (res == AOM_CODEC_OK) { AV1_COMP *cpi = ppi->cpi; // Set up internal flags if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1; if (img != NULL) { if (!ctx->pts_offset_initialized) { ctx->pts_offset = ptsvol; ctx->pts_offset_initialized = 1; } if (ptsvol < ctx->pts_offset) { aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, "pts is smaller than initial pts"); } ptsvol -= ctx->pts_offset; if (ptsvol > INT64_MAX / cpi_data.timestamp_ratio->num) { aom_internal_error( &ppi->error, AOM_CODEC_INVALID_PARAM, "conversion of relative pts to ticks would overflow"); } int64_t src_time_stamp = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol); #if ULONG_MAX > INT64_MAX if (duration > INT64_MAX) { aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, "duration is too big"); } #endif if (ptsvol > INT64_MAX - (int64_t)duration) { aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, "relative pts + duration is too big"); } aom_codec_pts_t pts_end = ptsvol + (int64_t)duration; if (pts_end > INT64_MAX / cpi_data.timestamp_ratio->num) { aom_internal_error( &ppi->error, AOM_CODEC_INVALID_PARAM, "conversion of relative pts + duration to ticks would overflow"); } int64_t src_end_time_stamp = timebase_units_to_ticks(cpi_data.timestamp_ratio, pts_end); YV12_BUFFER_CONFIG sd; res = image2yuvconfig(img, &sd); // When generating a monochrome stream, make |sd| a monochrome image. if (ctx->cfg.monochrome) { sd.u_buffer = sd.v_buffer = NULL; sd.uv_stride = 0; sd.monochrome = 1; } int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0; int subsampling_x = sd.subsampling_x; int subsampling_y = sd.subsampling_y; if (!ppi->lookahead) { int lag_in_frames = cpi_lap != NULL ? 
cpi_lap->oxcf.gf_cfg.lag_in_frames : cpi->oxcf.gf_cfg.lag_in_frames; AV1EncoderConfig *oxcf = &cpi->oxcf; const BLOCK_SIZE sb_size = av1_select_sb_size( oxcf, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, ppi->number_spatial_layers); oxcf->border_in_pixels = av1_get_enc_border_size(av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0, sb_size); for (int i = 0; i < ppi->num_fp_contexts; i++) { ppi->parallel_cpi[i]->oxcf.border_in_pixels = oxcf->border_in_pixels; } const int src_border_in_pixels = get_src_border_in_pixels(cpi, sb_size); ppi->lookahead = av1_lookahead_init( cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames, src_border_in_pixels, cpi->common.features.byte_alignment, ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0), cpi->alloc_pyramid); } if (!ppi->lookahead) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); for (int i = 0; i < ppi->num_fp_contexts; i++) { aom_codec_err_t err = av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, subsampling_x, subsampling_y); if (err != AOM_CODEC_OK) { aom_internal_error(&ppi->error, err, "av1_check_initial_width() failed"); } } if (cpi_lap != NULL) { aom_codec_err_t err = av1_check_initial_width( cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); if (err != AOM_CODEC_OK) { aom_internal_error(&ppi->error, err, "av1_check_initial_width() failed"); } } // Store the original flags in to the frame buffer. Will extract the // key frame flag when we actually encode this frame. if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, src_time_stamp, src_end_time_stamp)) { res = update_error_state(ctx, cpi->common.error); } ctx->next_frame_flags = 0; } cpi_data.cx_data = ctx->cx_data; cpi_data.cx_data_sz = ctx->cx_data_sz; /* Any pending invisible frames? */ if (ctx->pending_cx_data_sz) { cpi_data.cx_data += ctx->pending_cx_data_sz; cpi_data.cx_data_sz -= ctx->pending_cx_data_sz; /* TODO: this is a minimal check, the underlying codec doesn't respect * the buffer size anyway. */ if (cpi_data.cx_data_sz < ctx->cx_data_sz / 2) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "Compressed data buffer too small"); } } int is_frame_visible = 0; int has_no_show_keyframe = 0; int num_workers = 0; if (cpi->oxcf.pass == AOM_RC_FIRST_PASS) { #if !CONFIG_REALTIME_ONLY num_workers = ppi->p_mt_info.num_mod_workers[MOD_FP] = av1_fp_compute_num_enc_workers(cpi); #endif } else { av1_compute_num_workers_for_mt(cpi); num_workers = av1_get_max_num_workers(cpi); } if (num_workers > 1 && ppi->p_mt_info.num_workers < num_workers) { // Obtain the maximum no. of frames that can be supported in a parallel // encode set. if (is_stat_consumption_stage(cpi)) { ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf); } if (ppi->p_mt_info.num_workers > 0) { av1_terminate_workers(ppi); free_thread_data(ppi); aom_free(ppi->p_mt_info.tile_thr_data); ppi->p_mt_info.tile_thr_data = NULL; aom_free(ppi->p_mt_info.workers); ppi->p_mt_info.workers = NULL; ppi->p_mt_info.num_workers = 0; for (int j = 0; j < ppi->num_fp_contexts; j++) { aom_free(ppi->parallel_cpi[j]->td.tctx); ppi->parallel_cpi[j]->td.tctx = NULL; } } av1_create_workers(ppi, num_workers); av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); } // Re-allocate thread data if workers for encoder multi-threading stage // exceeds prev_num_enc_workers. 
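// A possible trigger for this path (illustrative assumption, not the only
// one): the first frames were encoded with a small worker pool and a later
// configuration change (e.g. a larger cfg.g_threads applied through
// aom_codec_enc_config_set()) raises the number of encode-stage workers, so
// the per-worker tile/thread contexts are dropped and rebuilt at the new
// size.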
const int num_enc_workers = av1_get_num_mod_workers_for_alloc(&ppi->p_mt_info, MOD_ENC); if (ppi->p_mt_info.prev_num_enc_workers < num_enc_workers && num_enc_workers <= ppi->p_mt_info.num_workers) { free_thread_data(ppi); for (int j = 0; j < ppi->num_fp_contexts; j++) { aom_free(ppi->parallel_cpi[j]->td.tctx); ppi->parallel_cpi[j]->td.tctx = NULL; } av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); } for (int i = 0; i < ppi->num_fp_contexts; i++) { av1_init_frame_mt(ppi, ppi->parallel_cpi[i]); } if (cpi_lap != NULL) { av1_init_frame_mt(ppi, cpi_lap); } #if CONFIG_MULTITHREAD if (ppi->p_mt_info.num_workers > 1) { for (int i = 0; i < ppi->num_fp_contexts; i++) { av1_init_mt_sync(ppi->parallel_cpi[i], ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); } if (cpi_lap != NULL) { av1_init_mt_sync(cpi_lap, 1); } } #endif // CONFIG_MULTITHREAD // Call for LAP stage if (cpi_lap != NULL) { AV1_COMP_DATA cpi_lap_data = { 0 }; cpi_lap_data.flush = !img; cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio; const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data); if (status > AOM_CODEC_OK) { aom_internal_error_copy(&ppi->error, cpi_lap->common.error); } av1_post_encode_updates(cpi_lap, &cpi_lap_data); } // Recalculate the maximum number of frames that can be encoded in // parallel at the beginning of sub gop. if (is_stat_consumption_stage(cpi) && ppi->gf_group.size > 0 && cpi->gf_frame_index == ppi->gf_group.size) { ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf); } // Get the next visible frame. Invisible frames get packed with the next // visible frame. while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) { int simulate_parallel_frame = 0; int status = -1; cpi->do_frame_data_update = true; cpi->ref_idx_to_skip = INVALID_IDX; cpi->ref_refresh_index = INVALID_IDX; cpi->refresh_idx_available = false; #if CONFIG_FPMT_TEST simulate_parallel_frame = cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0; if (simulate_parallel_frame) { if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) { if (cpi->gf_frame_index < ppi->gf_group.size) { calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index, &cpi->do_frame_data_update); } } status = av1_get_compressed_data(cpi, &cpi_data); } #endif // CONFIG_FPMT_TEST if (!simulate_parallel_frame) { if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { status = av1_get_compressed_data(cpi, &cpi_data); } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1) { // In case of an error, longjmp() would be invoked and hence "status" // is set to AOM_CODEC_OK here. av1_compress_parallel_frames(ppi, &cpi_data); status = AOM_CODEC_OK; } else { // No possibility of failures from this function and hence "status" is // set to AOM_CODEC_OK here. 
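// Summary of this dispatch: frame_parallel_level 0 encodes the frame
// directly, level 1 launches a batch of frame-parallel encodes, and the
// remaining frames of that batch merely fetch their already-produced
// compressed data through av1_get_parallel_frame_enc_data() below.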
cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data); status = AOM_CODEC_OK; } } if (status == -1) break; if (status != AOM_CODEC_OK) { aom_internal_error_copy(&ppi->error, cpi->common.error); } if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) { av1_init_sc_decisions(ppi); } ppi->seq_params_locked = 1; av1_post_encode_updates(cpi, &cpi_data); #if CONFIG_ENTROPY_STATS if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame) av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts); #endif #if CONFIG_INTERNAL_STATS if (ppi->cpi->oxcf.pass != 1) { ppi->total_time_compress_data += cpi->time_compress_data; ppi->total_recode_hits += cpi->frame_recode_hits; ppi->total_bytes += (uint64_t)cpi->bytes; for (int i = 0; i < MAX_MODES; i++) { ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; } } #endif // CONFIG_INTERNAL_STATS if (!cpi_data.frame_size) continue; assert(cpi_data.cx_data != NULL && cpi_data.cx_data_sz != 0); if (cpi_data.frame_size > cpi_data.cx_data_sz) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "cpi_data.cx_data buffer overflow"); } const int write_temporal_delimiter = !cpi->common.spatial_layer_id && !ctx->pending_cx_data_sz; if (write_temporal_delimiter) { uint32_t obu_header_size = 1; const uint32_t obu_payload_size = 0; const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_offset = obu_header_size + length_field_size; assert(ctx->cx_data_sz == cpi_data.cx_data_sz); if (move_offset > ctx->cx_data_sz - cpi_data.frame_size) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "ctx->cx_data buffer full"); } memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size); obu_header_size = av1_write_obu_header( &ppi->level_params, &cpi->frame_header_count, OBU_TEMPORAL_DELIMITER, ppi->seq_params.has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/false, 0, ctx->cx_data); if (obu_header_size != 1) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } // OBUs are preceded/succeeded by an unsigned leb128 coded integer. if (av1_write_uleb_obu_size(obu_payload_size, ctx->cx_data + obu_header_size, length_field_size) != AOM_CODEC_OK) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } cpi_data.frame_size += move_offset; } if (ctx->oxcf.save_as_annexb) { if (av1_convert_sect5obus_to_annexb( cpi_data.cx_data, cpi_data.cx_data_sz, &cpi_data.frame_size) != AOM_CODEC_OK) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } // B_PRIME (add frame size) const size_t length_field_size = aom_uleb_size_in_bytes(cpi_data.frame_size); if (length_field_size > cpi_data.cx_data_sz - cpi_data.frame_size) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "cpi_data.cx_data buffer full"); } memmove(cpi_data.cx_data + length_field_size, cpi_data.cx_data, cpi_data.frame_size); if (av1_write_uleb_obu_size(cpi_data.frame_size, cpi_data.cx_data, length_field_size) != AOM_CODEC_OK) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } cpi_data.frame_size += length_field_size; } ctx->pending_cx_data_sz += cpi_data.frame_size; cpi_data.cx_data += cpi_data.frame_size; cpi_data.cx_data_sz -= cpi_data.frame_size; is_frame_visible = cpi->common.show_frame; has_no_show_keyframe |= (!is_frame_visible && cpi->common.current_frame.frame_type == KEY_FRAME); } if (is_frame_visible) { // Add the frame packet to the list of returned packets. 
aom_codec_cx_pkt_t pkt; // decrement frames_left counter ppi->frames_left = AOMMAX(0, ppi->frames_left - 1); if (ctx->oxcf.save_as_annexb) { // B_PRIME (add TU size) size_t tu_size = ctx->pending_cx_data_sz; const size_t length_field_size = aom_uleb_size_in_bytes(tu_size); if (tu_size > ctx->cx_data_sz) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "ctx->cx_data buffer overflow"); } if (length_field_size > ctx->cx_data_sz - tu_size) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "ctx->cx_data buffer full"); } memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size); if (av1_write_uleb_obu_size(tu_size, ctx->cx_data, length_field_size) != AOM_CODEC_OK) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } ctx->pending_cx_data_sz += length_field_size; } pkt.kind = AOM_CODEC_CX_FRAME_PKT; pkt.data.frame.buf = ctx->cx_data; pkt.data.frame.sz = ctx->pending_cx_data_sz; pkt.data.frame.partition_id = -1; pkt.data.frame.vis_frame_size = cpi_data.frame_size; pkt.data.frame.pts = ticks_to_timebase_units(cpi_data.timestamp_ratio, cpi_data.ts_frame_start) + ctx->pts_offset; pkt.data.frame.flags = get_frame_pkt_flags(cpi, cpi_data.lib_flags); if (has_no_show_keyframe) { // If one of the invisible frames in the packet is a keyframe, set // the delayed random access point flag. pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT; } const int64_t duration64 = ticks_to_timebase_units( cpi_data.timestamp_ratio, cpi_data.ts_frame_end - cpi_data.ts_frame_start); if (duration64 > UINT32_MAX) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } pkt.data.frame.duration = (uint32_t)duration64; aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); ctx->pending_cx_data_sz = 0; } } ppi->error.setjmp = 0; return res; } static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter) { return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter); } static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, va_list args) { av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); av1_set_reference_enc(ctx->ppi->cpi, frame->idx, &sd); return AOM_CODEC_OK; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, va_list args) { if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering) return AOM_CODEC_INCAPABLE; av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); av1_copy_reference_enc(ctx->ppi->cpi, frame->idx, &sd); return AOM_CODEC_OK; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, va_list args) { if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering) return AOM_CODEC_INCAPABLE; av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); if (frame != NULL) { YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->ppi->cpi->common, frame->idx); if (fb == NULL) return AOM_CODEC_ERROR; yuvconfig2image(&frame->img, fb, NULL); return AOM_CODEC_OK; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, va_list args) { aom_image_t *const new_img = va_arg(args, aom_image_t *); if (new_img != NULL) { YV12_BUFFER_CONFIG new_frame; if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) { yuvconfig2image(new_img, &new_frame, NULL); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } 
} else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, va_list args) { aom_image_t *const new_img = va_arg(args, aom_image_t *); if (new_img != NULL) { YV12_BUFFER_CONFIG new_frame; if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) { YV12_BUFFER_CONFIG sd; image2yuvconfig(new_img, &sd); return av1_copy_new_frame_enc(&ctx->ppi->cpi->common, &new_frame, &sd); } else { return AOM_CODEC_ERROR; } } else { return AOM_CODEC_INVALID_PARAM; } } static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; if (av1_get_preview_raw_frame(ctx->ppi->cpi, &sd) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; } else { return NULL; } } static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx, va_list args) { const int reference_flag = va_arg(args, int); av1_use_as_reference(&ctx->ppi->cpi->ext_flags.ref_frame_flags, reference_flag); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx, va_list args) { (void)ctx; (void)args; // TODO(yaowu): Need to re-implement and test for AV1. return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx, va_list args) { aom_active_map_t *const map = va_arg(args, aom_active_map_t *); if (map) { if (!av1_set_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows, (int)map->cols)) return AOM_CODEC_OK; else return AOM_CODEC_INVALID_PARAM; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx, va_list args) { aom_active_map_t *const map = va_arg(args, aom_active_map_t *); if (map) { if (!av1_get_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows, (int)map->cols)) return AOM_CODEC_OK; else return AOM_CODEC_INVALID_PARAM; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx, va_list args) { aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *); if (mode) { AV1EncoderConfig *const oxcf = ctx->ppi->seq_params_locked ? &ctx->ppi->cpi->oxcf : &ctx->oxcf; const int res = av1_set_internal_size(oxcf, &ctx->ppi->cpi->resize_pending_params, mode->h_scaling_mode, mode->v_scaling_mode); if (res == 0) { // update_encoder_cfg() is somewhat costly and this control may be called // multiple times, so update_encoder_cfg() is only called to ensure frame // and superblock sizes are updated before they're fixed by the first // encode call. 
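// Illustrative caller-side sketch (assumes the AOME_SET_SCALEMODE control and
// the aom_scaling_mode_t / AOME_ONETWO definitions from the public API):
//   aom_scaling_mode_t mode = { AOME_ONETWO, AOME_ONETWO };  // encode at 1/2
//   aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
// Before the first frame this also refreshes the derived frame and superblock
// configuration; once the sequence parameters are locked, only the FPMT
// configuration is re-checked, as done below.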
if (ctx->ppi->seq_params_locked) { av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); return AOM_CODEC_OK; } return update_encoder_cfg(ctx); } return AOM_CODEC_INVALID_PARAM; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx, va_list args) { const int spatial_layer_id = va_arg(args, int); if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; ctx->ppi->cpi->common.spatial_layer_id = spatial_layer_id; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, va_list args) { const int number_spatial_layers = va_arg(args, int); if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; ctx->ppi->number_spatial_layers = number_spatial_layers; // update_encoder_cfg() is somewhat costly and this control may be called // multiple times, so update_encoder_cfg() is only called to ensure frame and // superblock sizes are updated before they're fixed by the first encode // call. if (!ctx->ppi->seq_params_locked) { return update_encoder_cfg(ctx); } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx, va_list args) { aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *); ctx->ppi->cpi->common.spatial_layer_id = data->spatial_layer_id; ctx->ppi->cpi->common.temporal_layer_id = data->temporal_layer_id; ctx->ppi->cpi->svc.spatial_layer_id = data->spatial_layer_id; ctx->ppi->cpi->svc.temporal_layer_id = data->temporal_layer_id; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, va_list args) { AV1_PRIMARY *const ppi = ctx->ppi; AV1_COMP *const cpi = ppi->cpi; aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); int64_t target_bandwidth = 0; ppi->number_spatial_layers = params->number_spatial_layers; ppi->number_temporal_layers = params->number_temporal_layers; cpi->svc.number_spatial_layers = params->number_spatial_layers; cpi->svc.number_temporal_layers = params->number_temporal_layers; if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) { unsigned int sl, tl; ctx->ppi->use_svc = 1; const int num_layers = ppi->number_spatial_layers * ppi->number_temporal_layers; for (int layer = 0; layer < num_layers; ++layer) { if (params->max_quantizers[layer] > 63 || params->min_quantizers[layer] < 0 || params->min_quantizers[layer] > params->max_quantizers[layer]) { return AOM_CODEC_INVALID_PARAM; } } if (!av1_alloc_layer_context(cpi, num_layers)) return AOM_CODEC_MEM_ERROR; for (sl = 0; sl < ppi->number_spatial_layers; ++sl) { for (tl = 0; tl < ppi->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; lc->scaling_factor_num = AOMMAX(1, params->scaling_factor_num[sl]); lc->scaling_factor_den = AOMMAX(1, params->scaling_factor_den[sl]); const int layer_target_bitrate = params->layer_target_bitrate[layer]; if (layer_target_bitrate > INT_MAX / 1000) { lc->layer_target_bitrate = INT_MAX; } else { lc->layer_target_bitrate = 1000 * layer_target_bitrate; } lc->framerate_factor = params->framerate_factor[tl]; if (tl == ppi->number_temporal_layers - 1) target_bandwidth += lc->layer_target_bitrate; } } if (ppi->seq_params_locked) { AV1EncoderConfig *const oxcf = &cpi->oxcf; // Keep ctx->oxcf in sync in case further codec controls are made prior // to 
encoding. ctx->oxcf.rc_cfg.target_bandwidth = oxcf->rc_cfg.target_bandwidth = target_bandwidth; set_primary_rc_buffer_sizes(oxcf, ppi); av1_update_layer_context_change_config(cpi, target_bandwidth); check_reset_rc_flag(cpi); } else { // Note av1_init_layer_context() relies on cpi->oxcf. The order of that // call and the ones in the other half of this block (which // update_encoder_cfg() transitively makes) is important. So we keep // ctx->oxcf and cpi->oxcf in sync here as update_encoder_cfg() will // overwrite cpi->oxcf with ctx->oxcf. ctx->oxcf.rc_cfg.target_bandwidth = cpi->oxcf.rc_cfg.target_bandwidth = target_bandwidth; SequenceHeader *const seq_params = &ppi->seq_params; seq_params->operating_points_cnt_minus_1 = ppi->number_spatial_layers * ppi->number_temporal_layers - 1; av1_init_layer_context(cpi); // update_encoder_cfg() is somewhat costly and this control may be called // multiple times, so update_encoder_cfg() is only called to ensure frame // and superblock sizes are updated before they're fixed by the first // encode call. return update_encoder_cfg(ctx); } } else if (!ppi->seq_params_locked) { // Ensure frame and superblock sizes are updated. return update_encoder_cfg(ctx); } av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx, va_list args) { AV1_COMP *const cpi = ctx->ppi->cpi; aom_svc_ref_frame_config_t *const data = va_arg(args, aom_svc_ref_frame_config_t *); cpi->ppi->rtc_ref.set_ref_frame_config = 1; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) { if (data->reference[i] != 0 && data->reference[i] != 1) return AOM_CODEC_INVALID_PARAM; if (data->ref_idx[i] > 7 || data->ref_idx[i] < 0) return AOM_CODEC_INVALID_PARAM; cpi->ppi->rtc_ref.reference[i] = data->reference[i]; cpi->ppi->rtc_ref.ref_idx[i] = data->ref_idx[i]; } for (unsigned int i = 0; i < REF_FRAMES; ++i) { if (data->refresh[i] != 0 && data->refresh[i] != 1) return AOM_CODEC_INVALID_PARAM; cpi->ppi->rtc_ref.refresh[i] = data->refresh[i]; } cpi->svc.use_flexible_mode = 1; cpi->svc.ksvc_fixed_mode = 0; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred( aom_codec_alg_priv_t *ctx, va_list args) { AV1_COMP *const cpi = ctx->ppi->cpi; aom_svc_ref_frame_comp_pred_t *const data = va_arg(args, aom_svc_ref_frame_comp_pred_t *); cpi->ppi->rtc_ref.ref_frame_comp[0] = data->use_comp_pred[0]; cpi->ppi->rtc_ref.ref_frame_comp[1] = data->use_comp_pred[1]; cpi->ppi->rtc_ref.ref_frame_comp[2] = data->use_comp_pred[2]; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_transfer_characteristics( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.transfer_characteristics = CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args); 
return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_chroma_sample_position( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.chroma_sample_position = CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; int *const render_size = va_arg(args, int *); extra_cfg.render_width = render_size[0]; extra_cfg.render_height = render_size[1]; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args); return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, const char *name, const char *value) { if (ctx == NULL || name == NULL || value == NULL) return AOM_CODEC_INVALID_PARAM; struct av1_extracfg extra_cfg = ctx->extra_cfg; // Used to mock the argv with just one string "--{name}={value}" char *argv[2] = { NULL, "" }; size_t len = strlen(name) + strlen(value) + 4; char *const err_string = ctx->ppi->error.detail; #if __STDC_VERSION__ >= 201112L // We use the keyword _Static_assert because clang-cl does not allow the // convenience macro static_assert to be used in function scope. See // https://bugs.llvm.org/show_bug.cgi?id=48904. 
_Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN, "The size of the err_msg buffer for arg_match_helper must be " "at least ARG_ERR_MSG_MAX_LEN"); #else assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN); #endif argv[0] = aom_malloc(len * sizeof(argv[1][0])); if (!argv[0]) return AOM_CODEC_MEM_ERROR; snprintf(argv[0], len, "--%s=%s", name, value); struct arg arg; aom_codec_err_t err = AOM_CODEC_OK; int match = 1; if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_keyframe_filtering, argv, err_string)) { extra_cfg.enable_keyframe_filtering = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_gf_interval, argv, err_string)) { extra_cfg.min_gf_interval = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_gf_interval, argv, err_string)) { extra_cfg.max_gf_interval = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_min_pyr_height, argv, err_string)) { extra_cfg.gf_min_pyr_height = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_max_pyr_height, argv, err_string)) { extra_cfg.gf_max_pyr_height = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cpu_used_av1, argv, err_string)) { extra_cfg.cpu_used = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_altref, argv, err_string)) { extra_cfg.enable_auto_alt_ref = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.noise_sens, argv, err_string)) { extra_cfg.noise_sensitivity = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sharpness, argv, err_string)) { extra_cfg.sharpness = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.static_thresh, argv, err_string)) { extra_cfg.static_thresh = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.rowmtarg, argv, err_string)) { extra_cfg.row_mt = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fpmtarg, argv, err_string)) { extra_cfg.fp_mt = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_cols, argv, err_string)) { extra_cfg.tile_columns = arg_parse_uint_helper(&arg, err_string); if (extra_cfg.auto_tiles) { snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot set tile-cols because auto-tiles is already set."); err = AOM_CODEC_INVALID_PARAM; } } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_rows, argv, err_string)) { extra_cfg.tile_rows = arg_parse_uint_helper(&arg, err_string); if (extra_cfg.auto_tiles) { snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot set tile-rows because auto-tiles is already set."); err = AOM_CODEC_INVALID_PARAM; } } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_tiles, argv, err_string)) { extra_cfg.auto_tiles = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tpl_model, argv, err_string)) { extra_cfg.enable_tpl_model = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_maxframes, argv, err_string)) { extra_cfg.arnr_max_frames = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_strength, argv, 
err_string)) { extra_cfg.arnr_strength = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_metric, argv, err_string)) { extra_cfg.tuning = arg_parse_enum_helper(&arg, err_string); err = handle_tuning(ctx, &extra_cfg); } #if CONFIG_TUNE_VMAF else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argv, err_string)) { err = allocate_and_set_string(value, default_extra_cfg.vmaf_model_path, &extra_cfg.vmaf_model_path, err_string); } #endif else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path, argv, err_string)) { err = allocate_and_set_string(value, default_extra_cfg.partition_info_path, &extra_cfg.partition_info_path, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rate_guide_deltaq, argv, err_string)) { extra_cfg.enable_rate_guide_deltaq = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.rate_distribution_info, argv, err_string)) { err = allocate_and_set_string(value, default_extra_cfg.rate_distribution_info, &extra_cfg.rate_distribution_info, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dist_metric, argv, err_string)) { extra_cfg.dist_metric = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, err_string)) { extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct, argv, err_string)) { extra_cfg.rc_max_intra_bitrate_pct = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_inter_rate_pct, argv, err_string)) { extra_cfg.rc_max_inter_bitrate_pct = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_cbr_boost_pct, argv, err_string)) { extra_cfg.gf_cbr_boost_pct = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.lossless, argv, err_string)) { extra_cfg.lossless = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cdef, argv, err_string)) { extra_cfg.enable_cdef = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_restoration, argv, err_string)) { extra_cfg.enable_restoration = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.force_video_mode, argv, err_string)) { extra_cfg.force_video_mode = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_obmc, argv, err_string)) { extra_cfg.enable_obmc = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.disable_trellis_quant, argv, err_string)) { extra_cfg.disable_trellis_quant = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_qm, argv, err_string)) { extra_cfg.enable_qm = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_max, argv, err_string)) { extra_cfg.qm_max = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_min, argv, err_string)) { extra_cfg.qm_min = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.num_tg, argv, err_string)) { extra_cfg.num_tg = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, 
&g_av1_codec_arg_defs.mtu_size, argv, err_string)) { extra_cfg.mtu_size = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.timing_info, argv, err_string)) { extra_cfg.timing_info_type = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.frame_parallel_decoding, argv, err_string)) { extra_cfg.frame_parallel_decoding_mode = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dual_filter, argv, err_string)) { extra_cfg.enable_dual_filter = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_chroma_deltaq, argv, err_string)) { extra_cfg.enable_chroma_deltaq = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.aq_mode, argv, err_string)) { extra_cfg.aq_mode = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_mode, argv, err_string)) { extra_cfg.deltaq_mode = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_strength, argv, err_string)) { extra_cfg.deltaq_strength = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltalf_mode, argv, err_string)) { extra_cfg.deltalf_mode = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.frame_periodic_boost, argv, err_string)) { extra_cfg.frame_periodic_boost = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_content, argv, err_string)) { extra_cfg.content = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_color_primaries, argv, err_string)) { extra_cfg.color_primaries = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper( &arg, &g_av1_codec_arg_defs.input_transfer_characteristics, argv, err_string)) { extra_cfg.transfer_characteristics = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_matrix_coefficients, argv, err_string)) { extra_cfg.matrix_coefficients = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper( &arg, &g_av1_codec_arg_defs.input_chroma_sample_position, argv, err_string)) { extra_cfg.chroma_sample_position = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.superblock_size, argv, err_string)) { extra_cfg.superblock_size = arg_parse_enum_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.error_resilient_mode, argv, err_string)) { extra_cfg.error_resilient_mode = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sframe_mode, argv, err_string)) { extra_cfg.s_frame_mode = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_test, argv, err_string)) { extra_cfg.film_grain_test_vector = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_table, argv, err_string)) { if (value == NULL) { // this parameter allows NULL as its value extra_cfg.film_grain_table_filename = value; } else { err = allocate_and_set_string( value, default_extra_cfg.film_grain_table_filename, &extra_cfg.film_grain_table_filename, err_string); } } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cdf_update_mode, argv, 
err_string)) { extra_cfg.cdf_update_mode = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rect_partitions, argv, err_string)) { extra_cfg.enable_rect_partitions = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ab_partitions, argv, err_string)) { extra_cfg.enable_ab_partitions = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_1to4_partitions, argv, err_string)) { extra_cfg.enable_1to4_partitions = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_partition_size, argv, err_string)) { extra_cfg.min_partition_size = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_partition_size, argv, err_string)) { extra_cfg.max_partition_size = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_intra_edge_filter, argv, err_string)) { extra_cfg.enable_intra_edge_filter = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_order_hint, argv, err_string)) { extra_cfg.enable_order_hint = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tx64, argv, err_string)) { extra_cfg.enable_tx64 = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_flip_idtx, argv, err_string)) { extra_cfg.enable_flip_idtx = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rect_tx, argv, err_string)) { extra_cfg.enable_rect_tx = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dist_wtd_comp, argv, err_string)) { extra_cfg.enable_dist_wtd_comp = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_reference_frames, argv, err_string)) { extra_cfg.max_reference_frames = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_reference_set, argv, err_string)) { extra_cfg.enable_reduced_reference_set = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ref_frame_mvs, argv, err_string)) { extra_cfg.enable_ref_frame_mvs = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_masked_comp, argv, err_string)) { extra_cfg.enable_masked_comp = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_onesided_comp, argv, err_string)) { extra_cfg.enable_onesided_comp = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_interintra_comp, argv, err_string)) { extra_cfg.enable_interintra_comp = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_smooth_interintra, argv, err_string)) { extra_cfg.enable_smooth_interintra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diff_wtd_comp, argv, err_string)) { extra_cfg.enable_diff_wtd_comp = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_interinter_wedge, argv, err_string)) { extra_cfg.enable_interinter_wedge = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, 
&g_av1_codec_arg_defs.enable_interintra_wedge, argv, err_string)) { extra_cfg.enable_interintra_wedge = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_global_motion, argv, err_string)) { extra_cfg.enable_global_motion = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_warped_motion, argv, err_string)) { extra_cfg.enable_warped_motion = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_filter_intra, argv, err_string)) { extra_cfg.enable_filter_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_smooth_intra, argv, err_string)) { extra_cfg.enable_smooth_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_paeth_intra, argv, err_string)) { extra_cfg.enable_paeth_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cfl_intra, argv, err_string)) { extra_cfg.enable_cfl_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_directional_intra, argv, err_string)) { extra_cfg.enable_directional_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diagonal_intra, argv, err_string)) { extra_cfg.enable_diagonal_intra = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_overlay, argv, err_string)) { extra_cfg.enable_overlay = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_palette, argv, err_string)) { extra_cfg.enable_palette = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_intrabc, argv, err_string)) { extra_cfg.enable_intrabc = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_angle_delta, argv, err_string)) { extra_cfg.enable_angle_delta = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_tx_type_set, argv, err_string)) { extra_cfg.reduced_tx_type_set = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_intra_dct_only, argv, err_string)) { extra_cfg.use_intra_dct_only = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_inter_dct_only, argv, err_string)) { extra_cfg.use_inter_dct_only = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_intra_default_tx_only, argv, err_string)) { extra_cfg.use_intra_default_tx_only = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.quant_b_adapt, argv, err_string)) { extra_cfg.quant_b_adapt = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vbr_corpus_complexity_lap, argv, err_string)) { extra_cfg.vbr_corpus_complexity_lap = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_tier_mask, argv, err_string)) { extra_cfg.tier_mask = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_min_cr, argv, err_string)) { extra_cfg.min_cr = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, 
&g_av1_codec_arg_defs.coeff_cost_upd_freq, argv, err_string)) { extra_cfg.coeff_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mode_cost_upd_freq, argv, err_string)) { extra_cfg.mode_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq, argv, err_string)) { extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq, argv, err_string)) { extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } #if CONFIG_DENOISE else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level, argv, err_string)) { extra_cfg.noise_level = (float)arg_parse_int_helper(&arg, err_string) / 10.0f; } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_block_size, argv, err_string)) { extra_cfg.noise_block_size = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dnl_denoising, argv, err_string)) { extra_cfg.enable_dnl_denoising = arg_parse_uint_helper(&arg, err_string); } #endif else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.target_seq_level_idx, argv, err_string)) { const int val = arg_parse_int_helper(&arg, err_string); const int level = val % 100; const int operating_point_idx = val / 100; if (operating_point_idx < 0 || operating_point_idx >= MAX_NUM_OPERATING_POINTS) { snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Invalid operating point index: %d", operating_point_idx); err = AOM_CODEC_INVALID_PARAM; } else { extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level; } } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_x, argv, err_string)) { extra_cfg.chroma_subsampling_x = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_y, argv, err_string)) { extra_cfg.chroma_subsampling_y = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.passes, argv, err_string)) { extra_cfg.passes = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fwd_kf_dist, argv, err_string)) { extra_cfg.fwd_kf_dist = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.two_pass_output, argv, err_string)) { err = allocate_and_set_string(value, default_extra_cfg.two_pass_output, &extra_cfg.two_pass_output, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.second_pass_log, argv, err_string)) { err = allocate_and_set_string(value, default_extra_cfg.second_pass_log, &extra_cfg.second_pass_log, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.loopfilter_control, argv, err_string)) { extra_cfg.loopfilter_control = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_intra_tools_off, argv, err_string)) { extra_cfg.auto_intra_tools_off = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.strict_level_conformance, argv, err_string)) { extra_cfg.strict_level_conformance = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sb_qp_sweep, argv, err_string)) { extra_cfg.sb_qp_sweep = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.kf_max_pyr_height, argv, err_string)) { 
extra_cfg.kf_max_pyr_height = arg_parse_int_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_width, argv, err_string)) { ctx->cfg.tile_width_count = arg_parse_list_helper( &arg, ctx->cfg.tile_widths, MAX_TILE_WIDTHS, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_height, argv, err_string)) { ctx->cfg.tile_height_count = arg_parse_list_helper( &arg, ctx->cfg.tile_heights, MAX_TILE_HEIGHTS, err_string); } else { match = 0; snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s", name); } aom_free(argv[0]); if (err != AOM_CODEC_OK) { ctx->base.err_detail = err_string; return err; } if (strlen(err_string) != 0) { ctx->base.err_detail = err_string; return AOM_CODEC_INVALID_PARAM; } ctx->base.err_detail = NULL; if (!match) { return AOM_CODEC_INVALID_PARAM; } return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params, arg); } static aom_codec_err_t ctrl_get_target_seq_level_idx(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; return av1_get_target_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params, arg); } static aom_codec_err_t ctrl_get_num_operating_points(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = ctx->ppi->seq_params.operating_points_cnt_minus_1 + 1; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_luma_cdef_strength(aom_codec_alg_priv_t *ctx, va_list args) { int *arg = va_arg(args, int *); AV1_COMMON const *cm = &ctx->ppi->cpi->common; if (arg == NULL) return AOM_CODEC_INVALID_PARAM; memcpy(arg, cm->cdef_info.cdef_strengths, CDEF_MAX_STRENGTHS * sizeof(*arg)); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_high_motion_content_screen_rtc( aom_codec_alg_priv_t *ctx, va_list args) { int *arg = va_arg(args, int *); AV1_COMP *const cpi = ctx->ppi->cpi; if (arg == NULL) return AOM_CODEC_INVALID_PARAM; *arg = cpi->rc.high_motion_content_screen_rtc; return AOM_CODEC_OK; } static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, { AOME_USE_REFERENCE, ctrl_use_reference }, // Setters { AV1_SET_REFERENCE, ctrl_set_reference }, { AOME_SET_ROI_MAP, ctrl_set_roi_map }, { AOME_SET_ACTIVEMAP, ctrl_set_active_map }, { AOME_SET_SCALEMODE, ctrl_set_scale_mode }, { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id }, { AOME_SET_CPUUSED, ctrl_set_cpuused }, { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref }, { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref }, { AOME_SET_SHARPNESS, ctrl_set_sharpness }, { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { AV1E_SET_ROW_MT, ctrl_set_row_mt }, { AV1E_SET_FP_MT, ctrl_set_fp_mt }, { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows }, { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model }, { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering }, { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { AOME_SET_TUNING, ctrl_set_tuning }, { AOME_SET_CQ_LEVEL, ctrl_set_cq_level }, { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct }, { AOME_SET_NUMBER_SPATIAL_LAYERS, 
ctrl_set_number_spatial_layers }, { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct }, { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, { AV1E_SET_LOSSLESS, ctrl_set_lossless }, { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef }, { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration }, { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode }, { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc }, { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant }, { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm }, { AV1E_SET_QM_Y, ctrl_set_qm_y }, { AV1E_SET_QM_U, ctrl_set_qm_u }, { AV1E_SET_QM_V, ctrl_set_qm_v }, { AV1E_SET_QM_MIN, ctrl_set_qm_min }, { AV1E_SET_QM_MAX, ctrl_set_qm_max }, { AV1E_SET_NUM_TG, ctrl_set_num_tg }, { AV1E_SET_MTU, ctrl_set_mtu }, { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type }, { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode }, { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode }, { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode }, { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions }, { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions }, { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions }, { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size }, { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size }, { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter }, { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq }, { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter }, { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint }, { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 }, { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx }, { AV1E_SET_ENABLE_RECT_TX, ctrl_set_enable_rect_tx }, { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp }, { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames }, { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set }, { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs }, { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs }, { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp }, { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp }, { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp }, { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra }, { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp }, { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge }, { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge }, { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion }, { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion }, { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion }, { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra }, { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra }, { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra }, { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra }, { AV1E_SET_ENABLE_DIRECTIONAL_INTRA, ctrl_set_enable_directional_intra }, { AV1E_SET_ENABLE_DIAGONAL_INTRA, ctrl_set_enable_diagonal_intra }, { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres }, { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay }, { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette }, { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc }, { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta }, { AV1E_SET_AQ_MODE, ctrl_set_aq_mode }, { AV1E_SET_REDUCED_TX_TYPE_SET, 
ctrl_set_reduced_tx_type_set }, { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only }, { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only }, { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only }, { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt }, { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq }, { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq }, { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq }, { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode }, { AV1E_SET_DELTAQ_STRENGTH, ctrl_set_deltaq_strength }, { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode }, { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost }, { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content }, { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode }, { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries }, { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics }, { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients }, { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position }, { AV1E_SET_COLOR_RANGE, ctrl_set_color_range }, { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity }, { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval }, { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval }, { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height }, { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height }, { AV1E_SET_RENDER_SIZE, ctrl_set_render_size }, { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path }, { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path }, { AV1E_ENABLE_RATE_GUIDE_DELTAQ, ctrl_enable_rate_guide_deltaq }, { AV1E_SET_RATE_DISTRIBUTION_INFO, ctrl_set_rate_distribution_info }, { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level }, { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size }, { AV1E_SET_ENABLE_DNL_DENOISING, ctrl_set_enable_dnl_denoising }, { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, { AV1E_SET_FP_MT_UNIT_TEST, ctrl_enable_fpmt_unit_test }, { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug }, { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx }, { AV1E_SET_TIER_MASK, ctrl_set_tier_mask }, { AV1E_SET_MIN_CR, ctrl_set_min_cr }, { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id }, { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params }, { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred }, { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap }, { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test }, { AV1E_ENABLE_SB_QP_SWEEP, ctrl_enable_sb_qp_sweep }, { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq }, { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition }, { AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search }, { AV1E_SET_LOOPFILTER_CONTROL, ctrl_set_loopfilter_control }, { AV1E_SET_SKIP_POSTPROC_FILTERING, ctrl_set_skip_postproc_filtering }, { AV1E_SET_AUTO_INTRA_TOOLS_OFF, ctrl_set_auto_intra_tools_off }, { AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc }, { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr }, { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr 
}, { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, { AV1E_SET_AUTO_TILES, ctrl_set_auto_tiles }, { AV1E_SET_POSTENCODE_DROP_RTC, ctrl_set_postencode_drop_rtc }, { AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, ctrl_set_max_consec_frame_drop_ms_cbr }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, { AOME_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, { AV1_GET_REFERENCE, ctrl_get_reference }, { AV1E_GET_ACTIVEMAP, ctrl_get_active_map }, { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x }, { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y }, { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx }, { AV1E_GET_BASELINE_GF_INTERVAL, ctrl_get_baseline_gf_interval }, { AV1E_GET_TARGET_SEQ_LEVEL_IDX, ctrl_get_target_seq_level_idx }, { AV1E_GET_NUM_OPERATING_POINTS, ctrl_get_num_operating_points }, { AV1E_GET_LUMA_CDEF_STRENGTH, ctrl_get_luma_cdef_strength }, { AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, ctrl_get_high_motion_content_screen_rtc }, CTRL_MAP_END, }; static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { #if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_GOOD_QUALITY, // g_usage - non-realtime usage 0, // g_threads 0, // g_profile 320, // g_w 240, // g_h 0, // g_limit 0, // g_forced_max_frame_width 0, // g_forced_max_frame_height AOM_BITS_8, // g_bit_depth 8, // g_input_bit_depth { 1, 30 }, // g_timebase 0, // g_error_resilient AOM_RC_ONE_PASS, // g_pass 35, // g_lag_in_frames 0, // rc_dropframe_thresh RESIZE_NONE, // rc_resize_mode SCALE_NUMERATOR, // rc_resize_denominator SCALE_NUMERATOR, // rc_resize_kf_denominator AOM_SUPERRES_NONE, // rc_superres_mode SCALE_NUMERATOR, // rc_superres_denominator SCALE_NUMERATOR, // rc_superres_kf_denominator 63, // rc_superres_qthresh 32, // rc_superres_kf_qthresh AOM_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in { NULL, 0 }, // rc_firstpass_mb_stats_in 256, // rc_target_bitrate 0, // rc_min_quantizer 63, // rc_max_quantizer 25, // rc_undershoot_pct 25, // rc_overshoot_pct 6000, // rc_buf_sz 4000, // rc_buf_initial_sz 5000, // rc_buf_optimal_sz 50, // rc_2pass_vbr_bias_pct 0, // rc_2pass_vbr_minsection_pct 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled AOM_KF_AUTO, // kf_mode 0, // kf_min_dist 9999, // kf_max_dist 0, // sframe_dist 1, // sframe_mode 0, // large_scale_tile 0, // monochrome 0, // full_still_picture_hdr 0, // save_as_annexb 0, // tile_width_count 0, // tile_height_count { 0 }, // tile_widths { 0 }, // tile_heights 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #endif // !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_REALTIME, // g_usage - real-time usage 0, // g_threads 0, // g_profile 320, // g_w 240, // g_h 0, // g_limit 0, // g_forced_max_frame_width 0, // g_forced_max_frame_height AOM_BITS_8, // g_bit_depth 8, // g_input_bit_depth { 1, 30 }, // g_timebase 0, // g_error_resilient AOM_RC_ONE_PASS, // g_pass 0, // g_lag_in_frames 0, // rc_dropframe_thresh RESIZE_NONE, // rc_resize_mode SCALE_NUMERATOR, // rc_resize_denominator SCALE_NUMERATOR, // rc_resize_kf_denominator AOM_SUPERRES_NONE, // rc_superres_mode SCALE_NUMERATOR, // rc_superres_denominator SCALE_NUMERATOR, // rc_superres_kf_denominator 63, // rc_superres_qthresh 32, // 
rc_superres_kf_qthresh AOM_CBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in { NULL, 0 }, // rc_firstpass_mb_stats_in 256, // rc_target_bitrate 0, // rc_min_quantizer 63, // rc_max_quantizer 50, // rc_undershoot_pct 50, // rc_overshoot_pct 1000, // rc_buf_sz 600, // rc_buf_initial_sz 600, // rc_buf_optimal_sz 50, // rc_2pass_vbr_bias_pct 0, // rc_2pass_vbr_minsection_pct 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled AOM_KF_AUTO, // kf_mode 0, // kf_min_dist 9999, // kf_max_dist 0, // sframe_dist 1, // sframe_mode 0, // large_scale_tile 0, // monochrome 0, // full_still_picture_hdr 0, // save_as_annexb 0, // tile_width_count 0, // tile_height_count { 0 }, // tile_widths { 0 }, // tile_heights 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_ALL_INTRA, // g_usage - all intra usage 0, // g_threads 0, // g_profile 320, // g_w 240, // g_h 0, // g_limit 0, // g_forced_max_frame_width 0, // g_forced_max_frame_height AOM_BITS_8, // g_bit_depth 8, // g_input_bit_depth { 1, 30 }, // g_timebase 0, // g_error_resilient AOM_RC_ONE_PASS, // g_pass 0, // g_lag_in_frames 0, // rc_dropframe_thresh RESIZE_NONE, // rc_resize_mode SCALE_NUMERATOR, // rc_resize_denominator SCALE_NUMERATOR, // rc_resize_kf_denominator AOM_SUPERRES_NONE, // rc_superres_mode SCALE_NUMERATOR, // rc_superres_denominator SCALE_NUMERATOR, // rc_superres_kf_denominator 63, // rc_superres_qthresh 32, // rc_superres_kf_qthresh AOM_Q, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in { NULL, 0 }, // rc_firstpass_mb_stats_in 256, // rc_target_bitrate 0, // rc_min_quantizer 63, // rc_max_quantizer 25, // rc_undershoot_pct 25, // rc_overshoot_pct 6000, // rc_buf_sz 4000, // rc_buf_initial_sz 5000, // rc_buf_optimal_sz 50, // rc_2pass_vbr_bias_pct 0, // rc_2pass_vbr_minsection_pct 2000, // rc_2pass_vbr_maxsection_pct // keyframing settings (kf) 0, // fwd_kf_enabled AOM_KF_DISABLED, // kf_mode 0, // kf_min_dist 0, // kf_max_dist 0, // sframe_dist 1, // sframe_mode 0, // large_scale_tile 0, // monochrome 0, // full_still_picture_hdr 0, // save_as_annexb 0, // tile_width_count 0, // tile_height_count { 0 }, // tile_widths { 0 }, // tile_heights 0, // use_fixed_qp_offsets { -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // encoder_cfg }, #endif // !CONFIG_REALTIME_ONLY }; // This data structure and function are exported in aom/aomcx.h #ifndef VERSION_STRING #define VERSION_STRING #endif aom_codec_iface_t aom_codec_av1_cx_algo = { "AOMedia Project AV1 Encoder" VERSION_STRING, AOM_CODEC_INTERNAL_ABI_VERSION, (CONFIG_AV1_HIGHBITDEPTH ? 
AOM_CODEC_CAP_HIGHBITDEPTH : 0) | AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR, // aom_codec_caps_t encoder_init, // aom_codec_init_fn_t encoder_destroy, // aom_codec_destroy_fn_t encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t { // NOLINT NULL, // aom_codec_peek_si_fn_t NULL, // aom_codec_get_si_fn_t NULL, // aom_codec_decode_fn_t NULL, // aom_codec_get_frame_fn_t NULL // aom_codec_set_fb_fn_t }, { // NOLINT NELEMENTS(encoder_usage_cfg), // cfg_count encoder_usage_cfg, // aom_codec_enc_cfg_t encoder_encode, // aom_codec_encode_fn_t encoder_get_cxdata, // aom_codec_get_cx_data_fn_t encoder_set_config, // aom_codec_enc_config_set_fn_t encoder_get_global_headers, // aom_codec_get_global_headers_fn_t encoder_get_preview // aom_codec_get_preview_frame_fn_t }, encoder_set_option // aom_codec_set_option_fn_t }; aom_codec_iface_t *aom_codec_av1_cx(void) { return &aom_codec_av1_cx_algo; } aom-3.12.1/av1/av1_cx_iface.h000066400000000000000000000026131477627663500155530ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_AV1_CX_IFACE_H_ #define AOM_AV1_AV1_CX_IFACE_H_ #include "av1/encoder/encoder.h" #include "aom/aom_encoder.h" #ifdef __cplusplus extern "C" { #endif AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg); aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, const AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames); void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, BufferPool **p_buffer_pool); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_AV1_CX_IFACE_H_ aom-3.12.1/av1/av1_dx_iface.c000066400000000000000000001747151477627663500155640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_version.h" #include "aom/internal/aom_codec_internal.h" #include "aom/internal/aom_image_internal.h" #include "aom/aomdx.h" #include "aom/aom_decoder.h" #include "aom/aom_image.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/frame_buffers.h" #include "av1/common/enums.h" #include "av1/common/obu_util.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/dthread.h" #include "av1/decoder/grain_synthesis.h" #include "av1/decoder/obu.h" #include "av1/av1_iface_common.h" struct aom_codec_alg_priv { aom_codec_priv_t base; aom_codec_dec_cfg_t cfg; aom_codec_stream_info_t si; aom_image_t img; int img_avail; int flushed; int invert_tile_order; RefCntBuffer *last_show_frame; // Last output frame buffer int byte_alignment; int skip_loop_filter; int skip_film_grain; int decode_tile_row; int decode_tile_col; unsigned int tile_mode; unsigned int ext_tile_debug; unsigned int row_mt; EXTERNAL_REFERENCES ext_refs; unsigned int is_annexb; int operating_point; int output_all_layers; AVxWorker *frame_worker; aom_image_t image_with_grain; aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS]; size_t num_grain_image_frame_buffers; int need_resync; // wait for key/intra-only frame // BufferPool that holds all reference frames. Shared by all the FrameWorkers. BufferPool *buffer_pool; // External frame buffer info to save for AV1 common. void *ext_priv; // Private data associated with the external frame buffers. aom_get_frame_buffer_cb_fn_t get_ext_fb_cb; aom_release_frame_buffer_cb_fn_t release_ext_fb_cb; #if CONFIG_INSPECTION aom_inspect_cb inspect_cb; void *inspect_ctx; #endif }; static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) { // This function only allocates space for the aom_codec_alg_priv_t // structure. More memory may be required at the time the stream // information becomes known. if (!ctx->priv) { aom_codec_alg_priv_t *const priv = (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv)); if (priv == NULL) return AOM_CODEC_MEM_ERROR; ctx->priv = (aom_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; priv->flushed = 0; // TODO(tdaede): this should not be exposed to the API priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; } priv->num_grain_image_frame_buffers = 0; // Turn row_mt on by default. priv->row_mt = 1; // Turn on normal tile coding mode by default. // 0 is for normal tile coding mode, and 1 is for large scale tile coding // mode(refer to lightfield example). 
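  // Illustrative aside (not part of the upstream file): the fields assigned
  // in this block are the decoder-side defaults an application starts from
  // after something like
  //   aom_codec_ctx_t codec;
  //   aom_codec_dec_cfg_t cfg = { 0 };
  //   cfg.threads = 4;
  //   aom_codec_dec_init(&codec, aom_codec_av1_dx(), &cfg, 0);
  // Row multithreading defaults to on here and can typically be toggled later
  // with the AV1D_SET_ROW_MT control; large-scale tile mode stays off unless
  // the caller requests it.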
priv->tile_mode = 0; priv->decode_tile_row = -1; priv->decode_tile_col = -1; } return AOM_CODEC_OK; } static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { if (ctx->frame_worker != NULL) { AVxWorker *const worker = ctx->frame_worker; aom_get_worker_interface()->end(worker); FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) { AV1Decoder *const pbi = frame_worker_data->pbi; aom_free(pbi->common.tpl_mvs); pbi->common.tpl_mvs = NULL; av1_remove_common(&pbi->common); av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync); av1_free_cdef_sync(&pbi->cdef_sync); av1_free_restoration_buffers(&pbi->common); av1_decoder_remove(pbi); } aom_free(frame_worker_data); } if (ctx->buffer_pool) { for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) { ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv, &ctx->grain_image_frame_buffers[i]); } av1_free_ref_frame_buffers(ctx->buffer_pool); av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); #if CONFIG_MULTITHREAD pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); #endif } aom_free(ctx->frame_worker); aom_free(ctx->buffer_pool); assert(!ctx->img.self_allocd); aom_img_free(&ctx->img); aom_free(ctx); return AOM_CODEC_OK; } static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) { const uint32_t num_units_in_display_tick = aom_rb_read_unsigned_literal(rb, 32); const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32); if (num_units_in_display_tick == 0 || time_scale == 0) return AOM_CODEC_UNSUP_BITSTREAM; const uint8_t equal_picture_interval = aom_rb_read_bit(rb); if (equal_picture_interval) { const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); if (num_ticks_per_picture_minus_1 == UINT32_MAX) { // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1. return AOM_CODEC_UNSUP_BITSTREAM; } } return AOM_CODEC_OK; } static aom_codec_err_t parse_decoder_model_info( struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) { *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5); const uint32_t num_units_in_decoding_tick = aom_rb_read_unsigned_literal(rb, 32); const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5); const uint8_t frame_presentation_time_length_minus_1 = aom_rb_read_literal(rb, 5); (void)num_units_in_decoding_tick; (void)buffer_removal_time_length_minus_1; (void)frame_presentation_time_length_minus_1; return AOM_CODEC_OK; } static aom_codec_err_t parse_op_parameters_info( struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) { const int n = buffer_delay_length_minus_1 + 1; const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb); (void)decoder_buffer_delay; (void)encoder_buffer_delay; (void)low_delay_mode_flag; return AOM_CODEC_OK; } // Parses the operating points (including operating_point_idc, seq_level_idx, // and seq_tier) and then sets si->number_spatial_layers and // si->number_temporal_layers based on operating_point_idc[0]. 
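// Rough illustration (an assumption drawn from the AV1 sequence header
// syntax, not upstream code): operating_point_idc is a 12-bit mask in which
// bits 0..7 flag the temporal layers and bits 8..11 flag the spatial layers
// decodable at that operating point, so the layer counts fall out of a
// bit-count along the lines of
//   int tl = 0, sl = 0;
//   for (int b = 0; b < 8; ++b) tl += (idc >> b) & 1;
//   for (int b = 8; b < 12; ++b) sl += (idc >> b) & 1;
// with idc == 0 (no scalability information) treated as one spatial and one
// temporal layer by aom_get_num_layers_from_operating_point_idc().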
static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb, int is_reduced_header, aom_codec_stream_info_t *si) { int operating_point_idc0 = 0; if (is_reduced_header) { aom_rb_read_literal(rb, LEVEL_BITS); // level } else { uint8_t decoder_model_info_present_flag = 0; int buffer_delay_length_minus_1 = 0; aom_codec_err_t status; const uint8_t timing_info_present_flag = aom_rb_read_bit(rb); if (timing_info_present_flag) { if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status; decoder_model_info_present_flag = aom_rb_read_bit(rb); if (decoder_model_info_present_flag) { if ((status = parse_decoder_model_info( rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK) return status; } } const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb); const uint8_t operating_points_cnt_minus_1 = aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) { int operating_point_idc; operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); if (i == 0) operating_point_idc0 = operating_point_idc; int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier if (decoder_model_info_present_flag) { const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb); if (decoder_model_present_for_this_op) { if ((status = parse_op_parameters_info( rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK) return status; } } if (initial_display_delay_present_flag) { const uint8_t initial_display_delay_present_for_this_op = aom_rb_read_bit(rb); if (initial_display_delay_present_for_this_op) aom_rb_read_literal(rb, 4); // initial_display_delay_minus_1 } } } if (aom_get_num_layers_from_operating_point_idc( operating_point_idc0, &si->number_spatial_layers, &si->number_temporal_layers) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } return AOM_CODEC_OK; } static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si, int *is_intra_only) { int intra_only_flag = 0; int got_sequence_header = 0; int found_keyframe = 0; if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM; si->w = 0; si->h = 0; si->is_kf = 0; // is_kf indicates whether the current packet contains a RAP ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); size_t payload_size = 0; size_t bytes_read = 0; uint8_t reduced_still_picture_hdr = 0; aom_codec_err_t status = aom_read_obu_header_and_size( data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) return status; // If the first OBU is a temporal delimiter, skip over it and look at the next // OBU in the bitstream if (obu_header.type == OBU_TEMPORAL_DELIMITER) { // Skip any associated payload (there shouldn't be one, but just in case) if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME; data += bytes_read + payload_size; data_sz -= bytes_read + payload_size; status = aom_read_obu_header_and_size( data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) return status; } while (1) { data += bytes_read; data_sz -= bytes_read; if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME; // Check that the selected OBU is a sequence header if (obu_header.type == OBU_SEQUENCE_HEADER) { // Sanity check on sequence header size if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME; // Read a few values from the sequence header payload struct aom_read_bit_buffer rb = { data, data + data_sz, 
0, NULL, NULL }; av1_read_profile(&rb); // profile const uint8_t still_picture = aom_rb_read_bit(&rb); reduced_still_picture_hdr = aom_rb_read_bit(&rb); if (!still_picture && reduced_still_picture_hdr) { return AOM_CODEC_UNSUP_BITSTREAM; } status = parse_operating_points(&rb, reduced_still_picture_hdr, si); if (status != AOM_CODEC_OK) return status; int num_bits_width = aom_rb_read_literal(&rb, 4) + 1; int num_bits_height = aom_rb_read_literal(&rb, 4) + 1; int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1; int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1; si->w = max_frame_width; si->h = max_frame_height; got_sequence_header = 1; } else if (obu_header.type == OBU_FRAME_HEADER || obu_header.type == OBU_FRAME) { if (got_sequence_header && reduced_still_picture_hdr) { found_keyframe = 1; break; } else { // make sure we have enough bits to get the frame type out if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME; struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; const int show_existing_frame = aom_rb_read_bit(&rb); if (!show_existing_frame) { const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2); if (frame_type == KEY_FRAME) { found_keyframe = 1; break; // Stop here as no further OBUs will change the outcome. } else if (frame_type == INTRA_ONLY_FRAME) { intra_only_flag = 1; } } } } // skip past any unread OBU header data data += payload_size; data_sz -= payload_size; if (data_sz == 0) break; // exit if we're out of OBUs status = aom_read_obu_header_and_size( data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) return status; } if (got_sequence_header && found_keyframe) si->is_kf = 1; if (is_intra_only != NULL) *is_intra_only = intra_only_flag; return AOM_CODEC_OK; } static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz, aom_codec_stream_info_t *si) { return decoder_peek_si_internal(data, data_sz, si, NULL); } static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx, aom_codec_stream_info_t *si) { memcpy(si, &ctx->si, sizeof(*si)); return AOM_CODEC_OK; } static void set_error_detail(aom_codec_alg_priv_t *ctx, const char *const error) { ctx->base.err_detail = error; } static aom_codec_err_t update_error_state( aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { if (error->error_code) set_error_detail(ctx, error->has_detail ? 
error->detail : NULL); return error->error_code; } static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; AV1Decoder *const pbi = frame_worker_data->pbi; AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; cm->cur_frame = NULL; cm->features.byte_alignment = ctx->byte_alignment; pbi->skip_loop_filter = ctx->skip_loop_filter; pbi->skip_film_grain = ctx->skip_film_grain; if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { pool->get_fb_cb = ctx->get_ext_fb_cb; pool->release_fb_cb = ctx->release_ext_fb_cb; pool->cb_priv = ctx->ext_priv; } else { pool->get_fb_cb = av1_get_frame_buffer; pool->release_fb_cb = av1_release_frame_buffer; if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers)) aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to initialize internal frame buffers"); pool->cb_priv = &pool->int_frame_buffers; } } static int frame_worker_hook(void *arg1, void *arg2) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; const uint8_t *data = frame_worker_data->data; (void)arg2; int result = av1_receive_compressed_data(frame_worker_data->pbi, frame_worker_data->data_size, &data); frame_worker_data->data_end = data; if (result != 0) { // Check decode result in serial decode. frame_worker_data->pbi->need_resync = 1; } return !result; } static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); ctx->last_show_frame = NULL; ctx->need_resync = 1; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; ctx->buffer_pool->num_frame_bufs = FRAME_BUFFERS; ctx->buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc( ctx->buffer_pool->num_frame_bufs, sizeof(*ctx->buffer_pool->frame_bufs)); if (ctx->buffer_pool->frame_bufs == NULL) { ctx->buffer_pool->num_frame_bufs = 0; aom_free(ctx->buffer_pool); ctx->buffer_pool = NULL; return AOM_CODEC_MEM_ERROR; } #if CONFIG_MULTITHREAD if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { aom_free(ctx->buffer_pool->frame_bufs); ctx->buffer_pool->frame_bufs = NULL; ctx->buffer_pool->num_frame_bufs = 0; aom_free(ctx->buffer_pool); ctx->buffer_pool = NULL; set_error_detail(ctx, "Failed to allocate buffer pool mutex"); return AOM_CODEC_MEM_ERROR; } #endif ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker)); if (ctx->frame_worker == NULL) { set_error_detail(ctx, "Failed to allocate frame_worker"); return AOM_CODEC_MEM_ERROR; } AVxWorker *const worker = ctx->frame_worker; winterface->init(worker); worker->thread_name = "aom frameworker"; worker->data1 = aom_memalign(32, sizeof(FrameWorkerData)); if (worker->data1 == NULL) { winterface->end(worker); aom_free(worker); ctx->frame_worker = NULL; set_error_detail(ctx, "Failed to allocate frame_worker_data"); return AOM_CODEC_MEM_ERROR; } FrameWorkerData *frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool); if (frame_worker_data->pbi == NULL) { winterface->end(worker); aom_free(frame_worker_data); aom_free(worker); ctx->frame_worker = NULL; set_error_detail(ctx, "Failed to allocate frame_worker_data->pbi"); return AOM_CODEC_MEM_ERROR; } frame_worker_data->frame_context_ready = 0; frame_worker_data->received_frame = 0; frame_worker_data->pbi->allow_lowbitdepth = 
ctx->cfg.allow_lowbitdepth; // If decoding in serial mode, FrameWorker thread could create tile worker // thread or loopfilter thread. frame_worker_data->pbi->max_threads = ctx->cfg.threads; frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; frame_worker_data->pbi->is_annexb = ctx->is_annexb; frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; frame_worker_data->pbi->operating_point = ctx->operating_point; frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; frame_worker_data->pbi->row_mt = ctx->row_mt; frame_worker_data->pbi->is_fwd_kf_present = 0; frame_worker_data->pbi->is_arf_frame_present = 0; worker->hook = frame_worker_hook; init_buffer_callbacks(ctx); return AOM_CODEC_OK; } static inline void check_resync(aom_codec_alg_priv_t *const ctx, const AV1Decoder *const pbi) { // Clear resync flag if worker got a key frame or intra only frame. if (ctx->need_resync == 1 && pbi->need_resync == 0 && frame_is_intra_only(&pbi->common)) ctx->need_resync = 0; } static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, const uint8_t **data, size_t data_sz, void *user_priv) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. if (!ctx->si.h) { int is_intra_only = 0; ctx->si.is_annexb = ctx->is_annexb; const aom_codec_err_t res = decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only); if (res != AOM_CODEC_OK) return res; if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR; } AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->data = *data; frame_worker_data->data_size = data_sz; frame_worker_data->user_priv = user_priv; frame_worker_data->received_frame = 1; frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; frame_worker_data->pbi->row_mt = ctx->row_mt; frame_worker_data->pbi->ext_refs = ctx->ext_refs; frame_worker_data->pbi->is_annexb = ctx->is_annexb; worker->had_error = 0; winterface->execute(worker); // Update data pointer after decode. *data = frame_worker_data->data_end; if (worker->had_error) return update_error_state(ctx, &frame_worker_data->pbi->error); check_resync(ctx, frame_worker_data->pbi); return AOM_CODEC_OK; } static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) { // Release any pending output frames from the previous decoder_decode or // decoder_inspect call. We need to do this even if the decoder is being // flushed or the input arguments are invalid. 
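  // Illustrative aside (not part of the upstream file): frames handed out by
  // aom_codec_get_frame() stay referenced against the buffer pool until the
  // next decode call, so a typical consumer loop such as
  //   aom_codec_decode(&codec, frame_data, frame_size, NULL);
  //   aom_codec_iter_t iter = NULL;
  //   aom_image_t *img;
  //   while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
  //     /* consume img before the next aom_codec_decode() call */
  //   }
  // can rely on img staying valid until that next call releases it here.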
if (ctx->frame_worker) { BufferPool *const pool = ctx->buffer_pool; lock_buffer_pool(pool); AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; struct AV1Decoder *pbi = frame_worker_data->pbi; for (size_t j = 0; j < pbi->num_output_frames; j++) { decrease_ref_count(pbi->output_frames[j], pool); } pbi->num_output_frames = 0; unlock_buffer_pool(pool); for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) { pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]); ctx->grain_image_frame_buffers[j].data = NULL; ctx->grain_image_frame_buffers[j].size = 0; ctx->grain_image_frame_buffers[j].priv = NULL; } ctx->num_grain_image_frame_buffers = 0; } } // This function enables the inspector to inspect non visible frames. static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv) { aom_codec_err_t res = AOM_CODEC_OK; release_pending_output_frames(ctx); /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ if (data == NULL && data_sz == 0) { ctx->flushed = 1; return AOM_CODEC_OK; } if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; // Reset flushed when receiving a valid frame. ctx->flushed = 0; const uint8_t *data_start = data; const uint8_t *data_end = data + data_sz; uint64_t frame_size; if (ctx->is_annexb) { // read the size of this temporal unit size_t length_of_size; uint64_t temporal_unit_size; if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, &length_of_size) != 0) { return AOM_CODEC_CORRUPT_FRAME; } data_start += length_of_size; if (temporal_unit_size > (size_t)(data_end - data_start)) return AOM_CODEC_CORRUPT_FRAME; data_end = data_start + temporal_unit_size; // read the size of this frame unit if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), &frame_size, &length_of_size) != 0) { return AOM_CODEC_CORRUPT_FRAME; } data_start += length_of_size; if (frame_size > (size_t)(data_end - data_start)) return AOM_CODEC_CORRUPT_FRAME; } else { frame_size = (uint64_t)(data_end - data_start); } if (ctx->frame_worker == NULL) { res = init_decoder(ctx); if (res != AOM_CODEC_OK) return res; } FrameWorkerData *const frame_worker_data = (FrameWorkerData *)ctx->frame_worker->data1; AV1Decoder *const pbi = frame_worker_data->pbi; AV1_COMMON *const cm = &pbi->common; #if CONFIG_INSPECTION frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; #endif res = av1_receive_compressed_data(frame_worker_data->pbi, (size_t)frame_size, &data_start); check_resync(ctx, frame_worker_data->pbi); if (ctx->frame_worker->had_error) return update_error_state(ctx, &frame_worker_data->pbi->error); // Allow extra zero bytes after the frame end while (data_start < data_end) { const uint8_t marker = data_start[0]; if (marker) break; ++data_start; } Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv; data2->idx = -1; if (cm->cur_frame) { for (int i = 0; i < REF_FRAMES; ++i) if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i; } data2->buf = data_start; data2->show_existing = cm->show_existing_frame; return res; } static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, const uint8_t *data, size_t data_sz, void *user_priv) { aom_codec_err_t res = AOM_CODEC_OK; #if CONFIG_INSPECTION if (user_priv != 0) { return decoder_inspect(ctx, data, data_sz, user_priv); } #endif release_pending_output_frames(ctx); /* Sanity checks */ /* NULL data ptr allowed if data_sz is 
0 too */ if (data == NULL && data_sz == 0) { ctx->flushed = 1; return AOM_CODEC_OK; } if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; // Reset flushed when receiving a valid frame. ctx->flushed = 0; // Initialize the decoder worker on the first frame. if (ctx->frame_worker == NULL) { res = init_decoder(ctx); if (res != AOM_CODEC_OK) return res; } const uint8_t *data_start = data; const uint8_t *data_end = data + data_sz; if (ctx->is_annexb) { // read the size of this temporal unit size_t length_of_size; uint64_t temporal_unit_size; if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, &length_of_size) != 0) { return AOM_CODEC_CORRUPT_FRAME; } data_start += length_of_size; if (temporal_unit_size > (size_t)(data_end - data_start)) return AOM_CODEC_CORRUPT_FRAME; data_end = data_start + temporal_unit_size; } // Decode in serial mode. while (data_start < data_end) { uint64_t frame_size; if (ctx->is_annexb) { // read the size of this frame unit size_t length_of_size; if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), &frame_size, &length_of_size) != 0) { return AOM_CODEC_CORRUPT_FRAME; } data_start += length_of_size; if (frame_size > (size_t)(data_end - data_start)) return AOM_CODEC_CORRUPT_FRAME; } else { frame_size = (uint64_t)(data_end - data_start); } res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); if (res != AOM_CODEC_OK) return res; // Allow extra zero bytes after the frame end while (data_start < data_end) { const uint8_t marker = data_start[0]; if (marker) break; ++data_start; } } return res; } typedef struct { BufferPool *pool; aom_codec_frame_buffer_t *fb; } AllocCbParam; static void *AllocWithGetFrameBufferCb(void *priv, size_t size) { AllocCbParam *param = (AllocCbParam *)priv; if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0) return NULL; if (param->fb->data == NULL || param->fb->size < size) return NULL; return param->fb->data; } // If grain_params->apply_grain is false, returns img. Otherwise, adds film // grain to img, saves the result in grain_img, and returns grain_img. static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx, aom_image_t *img, aom_image_t *grain_img, aom_film_grain_t *grain_params) { if (!grain_params->apply_grain) return img; const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1); const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1); BufferPool *const pool = ctx->buffer_pool; aom_codec_frame_buffer_t *fb = &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers]; AllocCbParam param; param.pool = pool; param.fb = fb; if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16, AllocWithGetFrameBufferCb, ¶m)) { return NULL; } grain_img->user_priv = img->user_priv; grain_img->fb_priv = fb->priv; if (av1_add_film_grain(grain_params, img, grain_img)) { pool->release_fb_cb(pool->cb_priv, fb); return NULL; } ctx->num_grain_image_frame_buffers++; return grain_img; } // Copies and clears the metadata from AV1Decoder. 
static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) { if (pbi->metadata && img) { assert(!img->metadata); img->metadata = pbi->metadata; pbi->metadata = NULL; } } static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter) { aom_image_t *img = NULL; if (!iter) { return NULL; } // To avoid having to allocate any extra storage, treat 'iter' as // simply a pointer to an integer index uintptr_t *index = (uintptr_t *)iter; if (ctx->frame_worker == NULL) { return NULL; } const AVxWorkerInterface *const winterface = aom_get_worker_interface(); AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; AV1Decoder *const pbi = frame_worker_data->pbi; pbi->error.error_code = AOM_CODEC_OK; pbi->error.has_detail = 0; AV1_COMMON *const cm = &pbi->common; CommonTileParams *const tiles = &cm->tiles; // Wait for the frame from worker thread. if (!winterface->sync(worker)) { // Decoding failed. Release the worker thread. frame_worker_data->received_frame = 0; ctx->need_resync = 1; // TODO(aomedia:3519): Set an error code. Check if a different error code // should be used if ctx->flushed != 1. return NULL; } // Check if worker has received any frames. if (frame_worker_data->received_frame == 1) { frame_worker_data->received_frame = 0; check_resync(ctx, frame_worker_data->pbi); } YV12_BUFFER_CONFIG *sd; aom_film_grain_t *grain_params; if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) != 0) { return NULL; } RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; ctx->last_show_frame = output_frame_buf; if (ctx->need_resync) return NULL; aom_img_remove_metadata(&ctx->img); yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); move_decoder_metadata_to_img(pbi, &ctx->img); if (!pbi->ext_tile_debug && tiles->large_scale) { *index += 1; // Advance the iterator to point to the next image aom_img_remove_metadata(&ctx->img); yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); move_decoder_metadata_to_img(pbi, &ctx->img); img = &ctx->img; return img; } const int num_planes = av1_num_planes(cm); if (pbi->ext_tile_debug && tiles->single_tile_decoding && pbi->dec_tile_row >= 0) { int tile_width, tile_height; if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { return NULL; } const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); const int mi_row = tile_row * tile_height; const int ssy = ctx->img.y_chroma_shift; int plane; ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; if (num_planes > 1) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { ctx->img.planes[plane] += mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; } } ctx->img.d_h = AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; } if (pbi->ext_tile_debug && tiles->single_tile_decoding && pbi->dec_tile_col >= 0) { int tile_width, tile_height; if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { return NULL; } const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); const int mi_col = tile_col * tile_width; const int ssx = ctx->img.x_chroma_shift; const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
1 : 0; int plane; ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); if (num_planes > 1) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); } } ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; } ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; img = &ctx->img; img->temporal_id = output_frame_buf->temporal_id; img->spatial_id = output_frame_buf->spatial_id; if (pbi->skip_film_grain) grain_params->apply_grain = 0; aom_image_t *res = add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); if (!res) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; pbi->error.has_detail = 1; snprintf(pbi->error.detail, sizeof(pbi->error.detail), "Grain synthesis failed\n"); return res; } *index += 1; // Advance the iterator to point to the next image return res; } static aom_codec_err_t decoder_set_fb_fn( aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return AOM_CODEC_INVALID_PARAM; } if (ctx->frame_worker != NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. return AOM_CODEC_ERROR; } ctx->get_ext_fb_cb = cb_get; ctx->release_ext_fb_cb = cb_release; ctx->ext_priv = cb_priv; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, va_list args) { av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *); if (data) { av1_ref_frame_t *const frame = data; YV12_BUFFER_CONFIG sd; AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx, frame->use_external_ref, &sd); } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, va_list args) { const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); if (frame) { YV12_BUFFER_CONFIG sd; AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd); } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, va_list args) { av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *); if (data) { YV12_BUFFER_CONFIG *fb; AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); if (fb == NULL) return AOM_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return AOM_CODEC_OK; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, va_list args) { aom_image_t *new_img = va_arg(args, aom_image_t *); if (new_img) { YV12_BUFFER_CONFIG new_frame; AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { yuvconfig2image(new_img, &new_frame, NULL); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, va_list args) { aom_image_t *img = va_arg(args, 
aom_image_t *); if (img) { YV12_BUFFER_CONFIG new_frame; AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { YV12_BUFFER_CONFIG sd; image2yuvconfig(img, &sd); return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame, &sd); } else { return AOM_CODEC_ERROR; } } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); if (update_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; *update_info = frame_worker_data->pbi->common.current_frame.refresh_frame_flags; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; *arg = ((FrameWorkerData *)ctx->frame_worker->data1) ->pbi->common.quant_params.base_qindex; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_fwd_kf_value(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; *arg = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_fwd_kf_present; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_altref_present(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; *arg = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_arf_frame_present; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_frame_flags(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; AV1Decoder *pbi = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi; *arg = 0; switch (pbi->common.current_frame.frame_type) { case KEY_FRAME: *arg |= AOM_FRAME_IS_KEY; *arg |= AOM_FRAME_IS_INTRAONLY; if (!pbi->common.show_frame) { *arg |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT; } break; case INTRA_ONLY_FRAME: *arg |= AOM_FRAME_IS_INTRAONLY; break; case S_FRAME: *arg |= AOM_FRAME_IS_SWITCH; break; } if (pbi->common.features.error_resilient_mode) { *arg |= AOM_FRAME_IS_ERROR_RESILIENT; } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_tile_info(aom_codec_alg_priv_t *ctx, va_list args) { aom_tile_info *const tile_info = va_arg(args, aom_tile_info *); if (tile_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; const CommonTileParams *tiles = &pbi->common.tiles; int tile_rows = tiles->rows; int tile_cols = tiles->cols; if (tiles->uniform_spacing) { tile_info->tile_rows = 1 << tiles->log2_rows; tile_info->tile_columns = 1 << tiles->log2_cols; } else { tile_info->tile_rows = tile_rows; tile_info->tile_columns = tile_cols; } for (int tile_col = 1; tile_col <= tile_cols; tile_col++) { tile_info->tile_widths[tile_col - 1] = tiles->col_start_sb[tile_col] - tiles->col_start_sb[tile_col - 1]; } for (int 
tile_row = 1; tile_row <= tile_rows; tile_row++) { tile_info->tile_heights[tile_row - 1] = tiles->row_start_sb[tile_row] - tiles->row_start_sb[tile_row - 1]; } tile_info->num_tile_groups = pbi->num_tile_groups; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_screen_content_tools_info( aom_codec_alg_priv_t *ctx, va_list args) { aom_screen_content_tools_info *const sc_info = va_arg(args, aom_screen_content_tools_info *); if (sc_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; sc_info->allow_screen_content_tools = pbi->common.features.allow_screen_content_tools; sc_info->allow_intrabc = pbi->common.features.allow_intrabc; sc_info->force_integer_mv = (int)pbi->common.features.cur_frame_force_integer_mv; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_still_picture(aom_codec_alg_priv_t *ctx, va_list args) { aom_still_picture_info *const still_picture_info = va_arg(args, aom_still_picture_info *); if (still_picture_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture; still_picture_info->is_reduced_still_picture_hdr = (int)(pbi->seq_params.reduced_still_picture_hdr); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx, va_list args) { aom_superblock_size_t *const sb_size = va_arg(args, aom_superblock_size_t *); if (sb_size) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; if (pbi->seq_params.sb_size == BLOCK_128X128) { *sb_size = AOM_SUPERBLOCK_SIZE_128X128; } else { *sb_size = AOM_SUPERBLOCK_SIZE_64X64; } return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_show_existing_frame_flag( aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; *arg = ((FrameWorkerData *)ctx->frame_worker->data1) ->pbi->common.show_existing_frame; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_s_frame_info(aom_codec_alg_priv_t *ctx, va_list args) { aom_s_frame_info *const s_frame_info = va_arg(args, aom_s_frame_info *); if (s_frame_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; s_frame_info->is_s_frame = pbi->sframe_info.is_s_frame; s_frame_info->is_s_frame_at_altref = pbi->sframe_info.is_s_frame_at_altref; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, va_list args) { int *corrupted = va_arg(args, int *); if (corrupted) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; 
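/* Editor's note (illustrative sketch, not part of the original file): the
 * ctrl_get_* handlers in this block are reached through the generic control
 * interface rather than being called directly. For example, a caller would
 * query the corrupted flag and the last base q-index roughly like this,
 * assuming the control IDs declared in aom/aomdx.h:
 *
 *   int corrupted = 0, last_q = 0;
 *   aom_codec_control(&codec, AOMD_GET_FRAME_CORRUPTED, &corrupted);
 *   aom_codec_control(&codec, AOMD_GET_LAST_QUANTIZER, &last_q);
 *
 * Each control argument is forwarded here as a va_list, which is why every
 * handler begins by pulling its output pointer out with va_arg().
 */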
AV1Decoder *const pbi = frame_worker_data->pbi; if (pbi->seen_frame_header && pbi->num_output_frames == 0) return AOM_CODEC_ERROR; if (ctx->last_show_frame != NULL) *corrupted = ctx->last_show_frame->buf.corrupted; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); if (frame_size) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx, va_list args) { aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *); if (frame_header_info) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size; frame_header_info->coded_tile_data = pbi->obu_size_hdr.data; frame_header_info->extra_size = pbi->frame_header_size; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx, va_list args) { aom_tile_data *const tile_data = va_arg(args, aom_tile_data *); if (tile_data) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; tile_data->coded_tile_data_size = pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size; tile_data->coded_tile_data = pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx, va_list args) { av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *); if (data) { av1_ext_ref_frame_t *const ext_frames = data; ctx->ext_refs.num = ext_frames->num; for (int i = 0; i < ctx->ext_refs.num; i++) { image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]); } return AOM_CODEC_OK; } else { return AOM_CODEC_INVALID_PARAM; } } static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); if (render_size) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); AVxWorker *const worker = ctx->frame_worker; if (bit_depth) { if (worker) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; *bit_depth = cm->seq_params->bit_depth; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; 
} static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y, int use_highbitdepth) { aom_img_fmt_t fmt = 0; if (subsampling_x == 0 && subsampling_y == 0) fmt = AOM_IMG_FMT_I444; else if (subsampling_x == 1 && subsampling_y == 0) fmt = AOM_IMG_FMT_I422; else if (subsampling_x == 1 && subsampling_y == 1) fmt = AOM_IMG_FMT_I420; if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH; return fmt; } static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, va_list args) { aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *); AVxWorker *const worker = ctx->frame_worker; if (img_fmt) { if (worker) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; *img_fmt = get_img_format(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int *const tile_size = va_arg(args, unsigned int *); AVxWorker *const worker = ctx->frame_worker; if (tile_size) { if (worker) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; int tile_width, tile_height; if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { return AOM_CODEC_CORRUPT_FRAME; } *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int *const tile_count = va_arg(args, unsigned int *); if (tile_count) { AVxWorker *const worker = ctx->frame_worker; if (worker) { FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; *tile_count = frame_worker_data->pbi->tile_count_minus_1 + 1; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; } static aom_codec_err_t ctrl_get_base_q_idx(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)ctx->frame_worker->data1; *arg = frame_worker_data->pbi->common.quant_params.base_qindex; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_show_frame_flag(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)ctx->frame_worker->data1; *arg = frame_worker_data->pbi->common.show_frame; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_order_hint(aom_codec_alg_priv_t *ctx, va_list args) { unsigned int *const arg = va_arg(args, unsigned int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)ctx->frame_worker->data1; *arg = frame_worker_data->pbi->common.current_frame.order_hint; return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_mi_info(aom_codec_alg_priv_t *ctx, va_list args) { int mi_row = va_arg(args, int); int mi_col = va_arg(args, int); MB_MODE_INFO *mi = va_arg(args, MB_MODE_INFO *); if (mi == NULL) return 
AOM_CODEC_INVALID_PARAM; if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)ctx->frame_worker->data1; if (frame_worker_data == NULL) return AOM_CODEC_ERROR; AV1_COMMON *cm = &frame_worker_data->pbi->common; const int mi_rows = cm->mi_params.mi_rows; const int mi_cols = cm->mi_params.mi_cols; const int mi_stride = cm->mi_params.mi_stride; const int offset = mi_row * mi_stride + mi_col; if (mi_row < 0 || mi_row >= mi_rows || mi_col < 0 || mi_col >= mi_cols) { return AOM_CODEC_INVALID_PARAM; } memcpy(mi, cm->mi_params.mi_grid_base[offset], sizeof(*mi)); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, va_list args) { ctx->invert_tile_order = va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx, va_list args) { const int legacy_byte_alignment = 0; const int min_byte_alignment = 32; const int max_byte_alignment = 1024; const int byte_alignment = va_arg(args, int); if (byte_alignment != legacy_byte_alignment && (byte_alignment < min_byte_alignment || byte_alignment > max_byte_alignment || (byte_alignment & (byte_alignment - 1)) != 0)) return AOM_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi->common.features.byte_alignment = byte_alignment; } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi->skip_loop_filter = ctx->skip_loop_filter; } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx, va_list args) { ctx->skip_film_grain = va_arg(args, int); if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; frame_worker_data->pbi->skip_film_grain = ctx->skip_film_grain; } return AOM_CODEC_OK; } static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_ACCOUNTING (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else Accounting **acct = va_arg(args, Accounting **); if (acct) { if (ctx->frame_worker) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; AV1Decoder *pbi = frame_worker_data->pbi; *acct = &pbi->accounting; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } return AOM_CODEC_INVALID_PARAM; #endif } static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx, va_list args) { ctx->decode_tile_row = va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx, va_list args) { ctx->decode_tile_col = va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx, va_list args) { ctx->tile_mode = va_arg(args, unsigned int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx, va_list args) { ctx->is_annexb = va_arg(args, unsigned int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx, va_list args) { ctx->operating_point = 
va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx, va_list args) { ctx->output_all_layers = va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, va_list args) { #if !CONFIG_INSPECTION (void)ctx; (void)args; return AOM_CODEC_INCAPABLE; #else aom_inspect_init *init = va_arg(args, aom_inspect_init *); ctx->inspect_cb = init->inspect_cb; ctx->inspect_ctx = init->inspect_ctx; return AOM_CODEC_OK; #endif } static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, va_list args) { ctx->ext_tile_debug = va_arg(args, int); return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, va_list args) { ctx->row_mt = va_arg(args, unsigned int); return AOM_CODEC_OK; } static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, // Setters { AV1_SET_REFERENCE, ctrl_set_reference }, { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row }, { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col }, { AV1_SET_TILE_MODE, ctrl_set_tile_mode }, { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb }, { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point }, { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, { AV1D_SET_ROW_MT, ctrl_set_row_mt }, { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, // Getters { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer }, { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth }, { AV1D_GET_IMG_FORMAT, ctrl_get_img_format }, { AV1D_GET_TILE_SIZE, ctrl_get_tile_size }, { AV1D_GET_TILE_COUNT, ctrl_get_tile_count }, { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size }, { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, { AV1_GET_ACCOUNTING, ctrl_get_accounting }, { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, { AV1_GET_REFERENCE, ctrl_get_reference }, { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info }, { AV1D_GET_TILE_DATA, ctrl_get_tile_data }, { AOMD_GET_FWD_KF_PRESENT, ctrl_get_fwd_kf_value }, { AOMD_GET_ALTREF_PRESENT, ctrl_get_altref_present }, { AOMD_GET_FRAME_FLAGS, ctrl_get_frame_flags }, { AOMD_GET_TILE_INFO, ctrl_get_tile_info }, { AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, ctrl_get_screen_content_tools_info }, { AOMD_GET_STILL_PICTURE, ctrl_get_still_picture }, { AOMD_GET_SB_SIZE, ctrl_get_sb_size }, { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag }, { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info }, { AOMD_GET_SHOW_FRAME_FLAG, ctrl_get_show_frame_flag }, { AOMD_GET_BASE_Q_IDX, ctrl_get_base_q_idx }, { AOMD_GET_ORDER_HINT, ctrl_get_order_hint }, { AV1D_GET_MI_INFO, ctrl_get_mi_info }, CTRL_MAP_END, }; // This data structure and function are exported in aom/aomdx.h #ifndef VERSION_STRING #define VERSION_STRING #endif aom_codec_iface_t aom_codec_av1_dx_algo = { "AOMedia Project AV1 Decoder" VERSION_STRING, AOM_CODEC_INTERNAL_ABI_VERSION, AOM_CODEC_CAP_DECODER | AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t decoder_init, // aom_codec_init_fn_t decoder_destroy, 
// aom_codec_destroy_fn_t decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t { // NOLINT decoder_peek_si, // aom_codec_peek_si_fn_t decoder_get_si, // aom_codec_get_si_fn_t decoder_decode, // aom_codec_decode_fn_t decoder_get_frame, // aom_codec_get_frame_fn_t decoder_set_fb_fn, // aom_codec_set_fb_fn_t }, { // NOLINT 0, NULL, // aom_codec_enc_cfg_t NULL, // aom_codec_encode_fn_t NULL, // aom_codec_get_cx_data_fn_t NULL, // aom_codec_enc_config_set_fn_t NULL, // aom_codec_get_global_headers_fn_t NULL // aom_codec_get_preview_frame_fn_t }, NULL // aom_codec_set_option_fn_t }; // Decoder interface for inspecting frame data. It uses decoder_inspect instead // of decoder_decode so it only decodes one frame at a time, whether the frame // is shown or not. aom_codec_iface_t aom_codec_av1_inspect_algo = { "AOMedia Project AV1 Decoder Inspector" VERSION_STRING, AOM_CODEC_INTERNAL_ABI_VERSION, AOM_CODEC_CAP_DECODER | AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t decoder_init, // aom_codec_init_fn_t decoder_destroy, // aom_codec_destroy_fn_t decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t { // NOLINT decoder_peek_si, // aom_codec_peek_si_fn_t decoder_get_si, // aom_codec_get_si_fn_t decoder_inspect, // aom_codec_decode_fn_t decoder_get_frame, // aom_codec_get_frame_fn_t decoder_set_fb_fn, // aom_codec_set_fb_fn_t }, { // NOLINT 0, NULL, // aom_codec_enc_cfg_t NULL, // aom_codec_encode_fn_t NULL, // aom_codec_get_cx_data_fn_t NULL, // aom_codec_enc_config_set_fn_t NULL, // aom_codec_get_global_headers_fn_t NULL // aom_codec_get_preview_frame_fn_t }, NULL // aom_codec_set_option_fn_t }; aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; } aom-3.12.1/av1/av1_iface_common.h000066400000000000000000000127471477627663500164420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_AV1_IFACE_COMMON_H_ #define AOM_AV1_AV1_IFACE_COMMON_H_ #include #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" extern aom_codec_iface_t aom_codec_av1_inspect_algo; static inline void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /* aom_img_wrap() doesn't allow specifying independent strides for * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just * initialize all the fields. 
*/ int bps; if (!yv12->subsampling_y) { if (!yv12->subsampling_x) { img->fmt = AOM_IMG_FMT_I444; bps = 24; } else { img->fmt = AOM_IMG_FMT_I422; bps = 16; } } else { img->fmt = AOM_IMG_FMT_I420; bps = 12; } img->cp = yv12->color_primaries; img->tc = yv12->transfer_characteristics; img->mc = yv12->matrix_coefficients; img->monochrome = yv12->monochrome; img->csp = yv12->chroma_sample_position; img->range = yv12->color_range; img->bit_depth = 8; img->w = yv12->y_width; img->h = yv12->y_height; img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; img->r_w = yv12->render_width; img->r_h = yv12->render_height; img->x_chroma_shift = yv12->subsampling_x; img->y_chroma_shift = yv12->subsampling_y; img->planes[AOM_PLANE_Y] = yv12->y_buffer; img->planes[AOM_PLANE_U] = yv12->u_buffer; img->planes[AOM_PLANE_V] = yv12->v_buffer; img->stride[AOM_PLANE_Y] = yv12->y_stride; img->stride[AOM_PLANE_U] = yv12->uv_stride; img->stride[AOM_PLANE_V] = yv12->uv_stride; if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { bps *= 2; // aom_image_t uses byte strides and a pointer to the first byte // of the image. img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH); img->bit_depth = yv12->bit_depth; img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer); img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer); img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer); img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride; img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride; img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride; } img->bps = bps; img->user_priv = user_priv; img->img_data = yv12->buffer_alloc; img->img_data_owner = 0; img->self_allocd = 0; img->sz = yv12->frame_size; assert(!yv12->metadata); img->metadata = NULL; } static inline aom_codec_err_t image2yuvconfig(const aom_image_t *img, YV12_BUFFER_CONFIG *yv12) { yv12->y_buffer = img->planes[AOM_PLANE_Y]; yv12->u_buffer = img->planes[AOM_PLANE_U]; yv12->v_buffer = img->planes[AOM_PLANE_V]; yv12->y_crop_width = img->d_w; yv12->y_crop_height = img->d_h; yv12->render_width = img->r_w; yv12->render_height = img->r_h; yv12->y_width = img->w; yv12->y_height = img->h; yv12->uv_width = (yv12->y_width + img->x_chroma_shift) >> img->x_chroma_shift; yv12->uv_height = (yv12->y_height + img->y_chroma_shift) >> img->y_chroma_shift; yv12->uv_crop_width = (yv12->y_crop_width + img->x_chroma_shift) >> img->x_chroma_shift; yv12->uv_crop_height = (yv12->y_crop_height + img->y_chroma_shift) >> img->y_chroma_shift; yv12->y_stride = img->stride[AOM_PLANE_Y]; yv12->uv_stride = img->stride[AOM_PLANE_U]; yv12->color_primaries = img->cp; yv12->transfer_characteristics = img->tc; yv12->matrix_coefficients = img->mc; yv12->monochrome = img->monochrome; yv12->chroma_sample_position = img->csp; yv12->color_range = img->range; if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { // In aom_image_t // planes point to uint8 address of start of data // stride counts uint8s to reach next row // In YV12_BUFFER_CONFIG // y_buffer, u_buffer, v_buffer point to uint16 address of data // stride and border counts in uint16s // This means that all the address calculations in the main body of code // should work correctly. // However, before we do any pixel operations we need to cast the address // to a uint16 ponter and double its value. 
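/* Editor's note (illustrative sketch, not part of the original file): the
 * pointer convention described above is implemented by two helper macros
 * used throughout the tree (their exact definitions live elsewhere, in
 * aom_dsp; shown here only for illustration):
 *
 *   CONVERT_TO_SHORTPTR(p)  ~  (uint16_t *)((uintptr_t)(p) << 1)
 *   CONVERT_TO_BYTEPTR(p)   ~  (uint8_t *)((uintptr_t)(p) >> 1)
 *
 * So a high-bit-depth YV12_BUFFER_CONFIG plane stores a "halved" address in
 * its uint8_t pointer; doubling that value (and the stride) recovers the
 * real uint16_t sample address, which is what yuvconfig2image() above and
 * the conversion just below rely on.
 */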
yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); yv12->y_stride >>= 1; yv12->uv_stride >>= 1; yv12->flags = YV12_FLAG_HIGHBITDEPTH; } else { yv12->flags = 0; } // Note(yunqing): if img is allocated the same as the frame buffer, y_stride // is 32-byte aligned. Also, handle the cases while allocating img without a // border or stride_align is less than 32. int border = (yv12->y_stride - (int)((img->w + 31) & ~31u)) / 2; yv12->border = (border < 0) ? 0 : border; yv12->subsampling_x = img->x_chroma_shift; yv12->subsampling_y = img->y_chroma_shift; yv12->metadata = img->metadata; return AOM_CODEC_OK; } #endif // AOM_AV1_AV1_IFACE_COMMON_H_ aom-3.12.1/av1/common/000077500000000000000000000000001477627663500143605ustar00rootroot00000000000000aom-3.12.1/av1/common/alloccommon.c000066400000000000000000000457351477627663500170450ustar00rootroot00000000000000/* * * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "aom_mem/aom_mem.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/cdef_block.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/enums.h" #include "av1/common/restoration.h" #include "av1/common/thread_common.h" int av1_get_MBs(int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); const int mi_cols = aligned_width >> MI_SIZE_LOG2; const int mi_rows = aligned_height >> MI_SIZE_LOG2; const int mb_cols = ROUND_POWER_OF_TWO(mi_cols, 2); const int mb_rows = ROUND_POWER_OF_TWO(mi_rows, 2); return mb_rows * mb_cols; } void av1_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < pool->num_frame_bufs; ++i) { if (pool->frame_bufs[i].ref_count > 0 && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].raw_frame_buffer.data = NULL; pool->frame_bufs[i].raw_frame_buffer.size = 0; pool->frame_bufs[i].raw_frame_buffer.priv = NULL; pool->frame_bufs[i].ref_count = 0; } aom_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; aom_free(pool->frame_bufs[i].seg_map); pool->frame_bufs[i].seg_map = NULL; aom_free_frame_buffer(&pool->frame_bufs[i].buf); } aom_free(pool->frame_bufs); pool->frame_bufs = NULL; pool->num_frame_bufs = 0; } static inline void free_cdef_linebuf_conditional( AV1_COMMON *const cm, const size_t *new_linebuf_size) { CdefInfo *cdef_info = &cm->cdef_info; for (int plane = 0; plane < MAX_MB_PLANE; plane++) { if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) { aom_free(cdef_info->linebuf[plane]); cdef_info->linebuf[plane] = NULL; } } } static inline void free_cdef_bufs_conditional(AV1_COMMON *const cm, uint16_t **colbuf, uint16_t **srcbuf, const size_t *new_colbuf_size, const 
size_t new_srcbuf_size) { CdefInfo *cdef_info = &cm->cdef_info; if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) { aom_free(*srcbuf); *srcbuf = NULL; } for (int plane = 0; plane < MAX_MB_PLANE; plane++) { if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) { aom_free(colbuf[plane]); colbuf[plane] = NULL; } } } static inline void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { aom_free(*srcbuf); *srcbuf = NULL; for (int plane = 0; plane < MAX_MB_PLANE; plane++) { aom_free(colbuf[plane]); colbuf[plane] = NULL; } } static inline void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, const int num_mi_rows) { if (*cdef_row_mt == NULL) return; #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) { pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); aom_free((*cdef_row_mt)[row_idx].row_mutex_); } if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) { pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); aom_free((*cdef_row_mt)[row_idx].row_cond_); } } #else (void)num_mi_rows; #endif // CONFIG_MULTITHREAD aom_free(*cdef_row_mt); *cdef_row_mt = NULL; } void av1_free_cdef_buffers(AV1_COMMON *const cm, AV1CdefWorkerData **cdef_worker, AV1CdefSync *cdef_sync) { CdefInfo *cdef_info = &cm->cdef_info; const int num_mi_rows = cdef_info->allocated_mi_rows; for (int plane = 0; plane < MAX_MB_PLANE; plane++) { aom_free(cdef_info->linebuf[plane]); cdef_info->linebuf[plane] = NULL; } // De-allocation of column buffer & source buffer (worker_0). free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf); free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows); if (cdef_info->allocated_num_workers < 2) return; if (*cdef_worker != NULL) { for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) { // De-allocation of column buffer & source buffer for remaining workers. 
free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); } aom_free(*cdef_worker); *cdef_worker = NULL; } } static inline void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, const int num_planes) { CdefInfo *cdef_info = &cm->cdef_info; for (int plane = 0; plane < num_planes; plane++) { if (linebuf[plane] == NULL) CHECK_MEM_ERROR(cm, linebuf[plane], aom_malloc(cdef_info->allocated_linebuf_size[plane])); } } static inline void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, uint16_t **srcbuf, const int num_planes) { CdefInfo *cdef_info = &cm->cdef_info; if (*srcbuf == NULL) CHECK_MEM_ERROR(cm, *srcbuf, aom_memalign(16, cdef_info->allocated_srcbuf_size)); for (int plane = 0; plane < num_planes; plane++) { if (colbuf[plane] == NULL) CHECK_MEM_ERROR(cm, colbuf[plane], aom_malloc(cdef_info->allocated_colbuf_size[plane])); } } static inline void alloc_cdef_row_sync(AV1_COMMON *const cm, AV1CdefRowSync **cdef_row_mt, const int num_mi_rows) { if (*cdef_row_mt != NULL) return; CHECK_MEM_ERROR(cm, *cdef_row_mt, aom_calloc(num_mi_rows, sizeof(**cdef_row_mt))); #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_))); pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL); CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); } #endif // CONFIG_MULTITHREAD } void av1_alloc_cdef_buffers(AV1_COMMON *const cm, AV1CdefWorkerData **cdef_worker, AV1CdefSync *cdef_sync, int num_workers, int init_worker) { const int num_planes = av1_num_planes(cm); size_t new_linebuf_size[MAX_MB_PLANE] = { 0 }; size_t new_colbuf_size[MAX_MB_PLANE] = { 0 }; size_t new_srcbuf_size = 0; CdefInfo *const cdef_info = &cm->cdef_info; // Check for configuration change const int num_mi_rows = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int is_num_workers_changed = cdef_info->allocated_num_workers != num_workers; const int is_cdef_enabled = cm->seq_params->enable_cdef && !cm->tiles.single_tile_decoding; // num-bufs=3 represents ping-pong buffers for top linebuf, // followed by bottom linebuf. // ping-pong is to avoid top linebuf over-write by consecutive row. int num_bufs = 3; if (num_workers > 1) num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; if (is_cdef_enabled) { // Calculate src buffer size new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE; for (int plane = 0; plane < num_planes; plane++) { const int shift = plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x; // Calculate top and bottom line buffer size const int luma_stride = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs * (CDEF_VBORDER << 1) * (luma_stride >> shift); // Calculate column buffer size const int block_height = (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER; new_colbuf_size[plane] = sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER; } } // Free src, line and column buffers for worker 0 in case of reallocation free_cdef_linebuf_conditional(cm, new_linebuf_size); free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf, new_colbuf_size, new_srcbuf_size); // The flag init_worker indicates if cdef_worker has to be allocated for the // frame. This is passed as 1 always from decoder. 
At encoder side, it is 0 // when called for parallel frames during FPMT (where cdef_worker is shared // across parallel frames) and 1 otherwise. if (*cdef_worker != NULL && init_worker) { if (is_num_workers_changed) { // Free src and column buffers for remaining workers in case of change in // num_workers for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); aom_free(*cdef_worker); *cdef_worker = NULL; } else if (num_workers > 1) { // Free src and column buffers for remaining workers in case of // reallocation for (int idx = num_workers - 1; idx >= 1; idx--) free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf, new_colbuf_size, new_srcbuf_size); } } if (cdef_info->allocated_mi_rows != num_mi_rows) free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); // Store allocated sizes for reallocation cdef_info->allocated_srcbuf_size = new_srcbuf_size; av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size); av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size); // Store configuration to check change in configuration cdef_info->allocated_mi_rows = num_mi_rows; cdef_info->allocated_num_workers = num_workers; if (!is_cdef_enabled) return; // Memory allocation of column buffer & source buffer (worker_0). alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes); alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes); if (num_workers < 2) return; if (init_worker) { if (*cdef_worker == NULL) CHECK_MEM_ERROR(cm, *cdef_worker, aom_calloc(num_workers, sizeof(**cdef_worker))); // Memory allocation of column buffer & source buffer for remaining workers. for (int idx = num_workers - 1; idx >= 1; idx--) alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf, num_planes); } alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Allocate buffers which are independent of restoration_unit_size void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) { const int num_planes = av1_num_planes(cm); if (cm->rst_tmpbuf == NULL && is_sgr_enabled) { CHECK_MEM_ERROR(cm, cm->rst_tmpbuf, (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); } if (cm->rlbs == NULL) { CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers))); } // For striped loop restoration, we divide each plane into "stripes", // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET // luma pixels to match the output from CDEF. We will need to store 2 * // RESTORATION_CTX_VERT lines of data for each stripe. 
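/* Editor's note (illustrative sketch, not part of the original file): a
 * quick worked example of the stripe count computed just below, assuming
 * RESTORATION_UNIT_OFFSET is 8 luma rows (its value in restoration.h at the
 * time of writing). For a 1280x720 frame, mi_rows = 720 / MI_SIZE = 180, so
 *
 *   ext_h       = 8 + (180 << MI_SIZE_LOG2) = 8 + 720 = 728
 *   num_stripes = (728 + 63) / 64           = 12
 *
 * i.e. twelve line-buffer stripes are allocated for the luma plane, with the
 * ceiling division covering the final partial stripe.
 */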
int mi_h = cm->mi_params.mi_rows; const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); const int num_stripes = (ext_h + 63) / 64; // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; const int ss_x = is_uv && cm->seq_params->subsampling_x; const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT << use_highbd; RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; if (buf_size != boundaries->stripe_boundary_size || boundaries->stripe_boundary_above == NULL || boundaries->stripe_boundary_below == NULL) { aom_free(boundaries->stripe_boundary_above); aom_free(boundaries->stripe_boundary_below); CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, (uint8_t *)aom_memalign(32, buf_size)); CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, (uint8_t *)aom_memalign(32, buf_size)); boundaries->stripe_boundary_size = buf_size; } boundaries->stripe_boundary_stride = stride; } } void av1_free_restoration_buffers(AV1_COMMON *cm) { int p; for (p = 0; p < MAX_MB_PLANE; ++p) av1_free_restoration_struct(&cm->rst_info[p]); aom_free(cm->rst_tmpbuf); cm->rst_tmpbuf = NULL; aom_free(cm->rlbs); cm->rlbs = NULL; for (p = 0; p < MAX_MB_PLANE; ++p) { RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; aom_free(boundaries->stripe_boundary_above); aom_free(boundaries->stripe_boundary_below); boundaries->stripe_boundary_above = NULL; boundaries->stripe_boundary_below = NULL; } aom_free_frame_buffer(&cm->rst_frame); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_free_above_context_buffers(CommonContexts *above_contexts) { int i; const int num_planes = above_contexts->num_planes; for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { for (i = 0; i < num_planes; i++) { if (above_contexts->entropy[i] == NULL) break; aom_free(above_contexts->entropy[i][tile_row]); above_contexts->entropy[i][tile_row] = NULL; } if (above_contexts->partition != NULL) { aom_free(above_contexts->partition[tile_row]); above_contexts->partition[tile_row] = NULL; } if (above_contexts->txfm != NULL) { aom_free(above_contexts->txfm[tile_row]); above_contexts->txfm[tile_row] = NULL; } } for (i = 0; i < num_planes; i++) { aom_free(above_contexts->entropy[i]); above_contexts->entropy[i] = NULL; } aom_free(above_contexts->partition); above_contexts->partition = NULL; aom_free(above_contexts->txfm); above_contexts->txfm = NULL; above_contexts->num_tile_rows = 0; above_contexts->num_mi_cols = 0; above_contexts->num_planes = 0; } void av1_free_context_buffers(AV1_COMMON *cm) { if (cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params); av1_free_above_context_buffers(&cm->above_contexts); } int av1_alloc_above_context_buffers(CommonContexts *above_contexts, int num_tile_rows, int num_mi_cols, int num_planes) { const int aligned_mi_cols = ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); // Allocate above context buffers above_contexts->num_tile_rows = num_tile_rows; above_contexts->num_mi_cols = aligned_mi_cols; above_contexts->num_planes = num_planes; for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( num_tile_rows, 
sizeof(above_contexts->entropy[0])); if (!above_contexts->entropy[plane_idx]) return 1; } above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( num_tile_rows, sizeof(above_contexts->partition)); if (!above_contexts->partition) return 1; above_contexts->txfm = (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); if (!above_contexts->txfm) return 1; for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { above_contexts->entropy[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc( aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); if (!above_contexts->entropy[plane_idx][tile_row]) return 1; } above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); if (!above_contexts->partition[tile_row]) return 1; above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); if (!above_contexts->txfm[tile_row]) return 1; } return 0; } // Allocate the dynamically allocated arrays in 'mi_params' assuming // 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of // the struct members. static int alloc_mi(CommonModeInfoParams *mi_params) { const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; const int alloc_mi_size = mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); if (mi_params->mi_alloc_size < alloc_mi_size || mi_params->mi_grid_size < mi_grid_size) { mi_params->free_mi(mi_params); mi_params->mi_alloc = aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); if (!mi_params->mi_alloc) return 1; mi_params->mi_alloc_size = alloc_mi_size; mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( mi_grid_size, sizeof(*mi_params->mi_grid_base)); if (!mi_params->mi_grid_base) return 1; mi_params->tx_type_map = aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); if (!mi_params->tx_type_map) return 1; mi_params->mi_grid_size = mi_grid_size; } return 0; } int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height, BLOCK_SIZE min_partition_size) { CommonModeInfoParams *const mi_params = &cm->mi_params; mi_params->set_mb_mi(mi_params, width, height, min_partition_size); if (alloc_mi(mi_params)) goto fail; return 0; fail: // clear the mi_* values to force a realloc on resync mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4); av1_free_context_buffers(cm); return 1; } void av1_remove_common(AV1_COMMON *cm) { av1_free_context_buffers(cm); aom_free(cm->fc); cm->fc = NULL; aom_free(cm->default_frame_context); cm->default_frame_context = NULL; } void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { mi_params->setup_mi(mi_params); } aom-3.12.1/av1/common/alloccommon.h000066400000000000000000000046541477627663500170450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ #define AOM_AV1_COMMON_ALLOCCOMMON_H_ #define INVALID_IDX -1 // Invalid buffer index. #include #include "config/aom_config.h" #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif struct AV1Common; struct BufferPool; struct CommonContexts; struct CommonModeInfoParams; struct AV1CdefWorker; struct AV1CdefSyncData; void av1_remove_common(struct AV1Common *cm); int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, int num_tile_rows, int num_mi_cols, int num_planes); void av1_free_above_context_buffers(struct CommonContexts *above_contexts); int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height, BLOCK_SIZE min_partition_size); void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); void av1_alloc_cdef_buffers(struct AV1Common *const cm, struct AV1CdefWorker **cdef_worker, struct AV1CdefSyncData *cdef_sync, int num_workers, int init_worker); void av1_free_cdef_buffers(struct AV1Common *const cm, struct AV1CdefWorker **cdef_worker, struct AV1CdefSyncData *cdef_sync); #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_alloc_restoration_buffers(struct AV1Common *cm, bool is_sgr_enabled); void av1_free_restoration_buffers(struct AV1Common *cm); #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); void av1_free_state_buffers(struct AV1Common *cm); int av1_get_MBs(int width, int height); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ aom-3.12.1/av1/common/arm/000077500000000000000000000000001477627663500151375ustar00rootroot00000000000000aom-3.12.1/av1/common/arm/av1_convolve_horiz_rs_neon.c000066400000000000000000000152211477627663500226440ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/resize.h" static inline uint8x8_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS); } static inline uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); } void av1_convolve_horiz_rs_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, int x0_qn, int x_step_qn) { if ((w == 4 && h % 4 != 0) || (w % 8 == 0 && h % 8 != 0) || w % 8 != 0) { av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, w, h, x_filter, x0_qn, x_step_qn); return; } DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; if (w == 4) { do { int x_qn = x0_qn; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { const uint8_t *const s = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = UPSCALE_NORMATIVE_TAPS * ((x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(x_filter + filter_offset); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); uint8x8_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); store_u8_4x1(&temp[r * 4], d0); x_qn += x_step_qn; } // Transpose the 4x4 result tile and store. 
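// Each loop iteration above produced one output column of the 4x4 tile, so
// temp is effectively column-major at this point; the transpose below restores
// row-major order before the strided stores to dst.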
uint8x8_t d01 = vld1_u8(temp + 0); uint8x8_t d23 = vld1_u8(temp + 8); transpose_elems_inplace_u8_4x4(&d01, &d23); store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { do { int x_qn = x0_qn; uint8_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { const uint8_t *const s = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = UPSCALE_NORMATIVE_TAPS * ((x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(x_filter + filter_offset); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); vst1_u8(&temp[r * 8], d0); x_qn += x_step_qn; } // Transpose the 8x8 result tile and store. uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } aom-3.12.1/av1/common/arm/av1_convolve_scale_neon.c000066400000000000000000000734521477627663500221060ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/arm/convolve_scale_neon.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" static inline int16x4_t convolve8_4_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const int32x4_t horiz_const) { int16x4_t filter_lo = vget_low_s16(filter); int16x4_t filter_hi = vget_high_s16(filter); int32x4_t sum = horiz_const; sum = vmlal_lane_s16(sum, s0, filter_lo, 0); sum = vmlal_lane_s16(sum, s1, filter_lo, 1); sum = vmlal_lane_s16(sum, s2, filter_lo, 2); sum = vmlal_lane_s16(sum, s3, filter_lo, 3); sum = vmlal_lane_s16(sum, s4, filter_hi, 0); sum = vmlal_lane_s16(sum, s5, filter_hi, 1); sum = vmlal_lane_s16(sum, s6, filter_hi, 2); sum = vmlal_lane_s16(sum, s7, filter_hi, 3); return vshrn_n_s32(sum, ROUND0_BITS); } static inline int16x8_t convolve8_8_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int16x8_t horiz_const) { int16x4_t filter_lo = vget_low_s16(filter); int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = horiz_const; sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); return vshrq_n_s16(sum, ROUND0_BITS - 1); } static inline void convolve_horiz_scale_8tap_neon(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, const int subpel_x_qn, const int x_step_qn) { DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); const int bd = 8; if (w == 4) { // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. const int32x4_t horiz_offset = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); do { int x_qn = subpel_x_qn; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(x_filter + filter_offset); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t d0 = convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); vst1_s16(&temp[r * 4], d0); x_qn += x_step_qn; } // Transpose the 4x4 result tile and store. 
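// The int16_t values stored below are intermediate results at roughly
// bd + FILTER_BITS - ROUND0_BITS bits of precision, kept non-negative by the
// 1 << (bd + FILTER_BITS - 1) offset, and are consumed by the vertical pass.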
int16x4_t d0, d1, d2, d3; load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. // The additional -1 is needed because we are halving the filter values. const int16x8_t horiz_offset = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); do { int x_qn = subpel_x_qn; int16_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); int16x8_t filter = vld1q_s16(x_filter + filter_offset); // Filter values are all even so halve them to allow convolution // kernel computations to stay in 16-bit element types. filter = vshrq_n_s16(filter, 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); vst1q_s16(&temp[r * 8], d0); x_qn += x_step_qn; } // Transpose the 8x8 result tile and store. int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } static inline int16x4_t convolve6_4_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t filter, const int32x4_t horiz_const) { int16x4_t filter_lo = vget_low_s16(filter); int16x4_t filter_hi = vget_high_s16(filter); int32x4_t sum = horiz_const; // Filter values at indices 0 and 7 are 0. sum = vmlal_lane_s16(sum, s0, filter_lo, 1); sum = vmlal_lane_s16(sum, s1, filter_lo, 2); sum = vmlal_lane_s16(sum, s2, filter_lo, 3); sum = vmlal_lane_s16(sum, s3, filter_hi, 0); sum = vmlal_lane_s16(sum, s4, filter_hi, 1); sum = vmlal_lane_s16(sum, s5, filter_hi, 2); return vshrn_n_s32(sum, ROUND0_BITS); } static inline int16x8_t convolve6_8_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t filter, const int16x8_t horiz_const) { int16x4_t filter_lo = vget_low_s16(filter); int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = horiz_const; // Filter values at indices 0 and 7 are 0. sum = vmlaq_lane_s16(sum, s0, filter_lo, 1); sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); // We halved the filter values so -1 from right shift. 
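// Halving every (even) filter tap halves the accumulated sum, so shifting by
// ROUND0_BITS - 1 instead of ROUND0_BITS gives the same result as the
// full-precision computation.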
return vshrq_n_s16(sum, ROUND0_BITS - 1); } static inline void convolve_horiz_scale_6tap_neon(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, const int subpel_x_qn, const int x_step_qn) { DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); const int bd = 8; if (w == 4) { // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. const int32x4_t horiz_offset = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); do { int x_qn = subpel_x_qn; // Process a 4x4 tile. for (int r = 0; r < 4; ++r) { const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(x_filter + filter_offset); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s3 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t d0 = convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); vst1_s16(&temp[r * 4], d0); x_qn += x_step_qn; } // Transpose the 4x4 result tile and store. int16x4_t d0, d1, d2, d3; load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { // The shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts. // The additional -1 is needed because we are halving the filter values. const int16x8_t horiz_offset = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); do { int x_qn = subpel_x_qn; int16_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; ++r) { const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); int16x8_t filter = vld1q_s16(x_filter + filter_offset); // Filter values are all even so halve them to allow convolution // kernel computations to stay in 16-bit element types. filter = vshrq_n_s16(filter, 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t d0 = convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); vst1q_s16(&temp[r * 8], d0); x_qn += x_step_qn; } // Transpose the 8x8 result tile and store. 
int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } static inline void convolve_horiz_scale_2_8tap_neon( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; if (w == 4) { // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32x4_t horiz_offset = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); const int16x8_t filter = vld1q_s16(x_filter); do { uint8x16_t t0, t1, t2, t3; load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); int16x4_t s0 = vget_low_s16(tt0); int16x4_t s1 = vget_low_s16(tt1); int16x4_t s2 = vget_low_s16(tt2); int16x4_t s3 = vget_low_s16(tt3); int16x4_t s4 = vget_high_s16(tt0); int16x4_t s5 = vget_high_s16(tt1); int16x4_t s6 = vget_high_s16(tt2); int16x4_t s7 = vget_high_s16(tt3); int16x4_t s8 = vget_low_s16(tt4); int16x4_t s9 = vget_low_s16(tt5); int16x4_t s10 = vget_low_s16(tt6); int16x4_t s11 = vget_low_s16(tt7); int16x4_t s12 = vget_high_s16(tt4); int16x4_t s13 = vget_high_s16(tt5); int16x4_t d0 = convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); int16x4_t d1 = convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); int16x4_t d2 = convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset); int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, horiz_offset); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // The additional -1 is needed because we are halving the filter values. const int16x8_t horiz_offset = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); // Filter values are all even so halve them to allow convolution // kernel computations to stay in 16-bit element types. 
const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); do { const uint8_t *s = src; int16_t *d = dst; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s += 8; int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); do { uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15)); int16x8_t d0 = convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset); int16x8_t d1 = convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset); int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset); int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter, horiz_offset); transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0), vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s7 = s15; s += 8; d += 4; width -= 4; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } static inline void convolve_horiz_scale_2_6tap_neon( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; if (w == 4) { // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. 
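// In other words, pre-adding the bias makes the truncating narrowing shift in
// the kernel equivalent to a rounding shift:
//   (sum + (1 << (ROUND0_BITS - 1))) >> ROUND0_BITS.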
const int32x4_t horiz_offset = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); const int16x8_t filter = vld1q_s16(x_filter); do { uint8x16_t t0, t1, t2, t3; load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3); int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); int16x4_t s0 = vget_low_s16(tt0); int16x4_t s1 = vget_low_s16(tt1); int16x4_t s2 = vget_low_s16(tt2); int16x4_t s3 = vget_high_s16(tt3); int16x4_t s4 = vget_high_s16(tt0); int16x4_t s5 = vget_high_s16(tt1); int16x4_t s6 = vget_high_s16(tt2); int16x4_t s7 = vget_low_s16(tt4); int16x4_t s8 = vget_low_s16(tt5); int16x4_t s9 = vget_low_s16(tt6); int16x4_t s10 = vget_low_s16(tt7); int16x4_t s11 = vget_high_s16(tt4); int16x4_t d0 = convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); int16x4_t d1 = convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); int16x4_t d2 = convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); int16x4_t d3 = convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // The additional -1 is needed because we are halving the filter values. const int16x8_t horiz_offset = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); // Filter values are all even so halve them to allow convolution // kernel computations to stay in 16-bit element types. 
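// Staying in 16-bit lanes lets the kernels accumulate with vmlaq_lane_s16,
// processing 8 samples per multiply-accumulate instead of 4 with widening
// 32-bit multiplies.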
const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1); do { const uint8_t *s = src; int16_t *d = dst; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s += 8; int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7)); do { uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); int16x8_t d0 = convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset); int16x8_t d1 = convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset); int16x8_t d2 = convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset); int16x8_t d3 = convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset); transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3); store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1), vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0), vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3)); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 4; width -= 4; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { if (w < 4 || h < 4) { av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, conv_params); return; } // For the interpolation 8-tap filters are used. assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); DECLARE_ALIGNED(32, int16_t, im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = MAX_SB_SIZE; CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 // lines post both horizontally and vertically. 
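// Backing src up by these offsets centres each filter's support on the
// position being interpolated; im_h above already includes the extra
// filter_params_y->taps rows that the vertical pass reads.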
const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; // Horizontal filter if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { if (filter_params_x->interp_filter == MULTITAP_SHARP) { convolve_horiz_scale_8tap_neon( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); } else { convolve_horiz_scale_6tap_neon( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); } } else { assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); // The filter index is calculated using the // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS // equation, where the values of x are from 0 to w. If x_step_qn is a // multiple of SCALE_SUBPEL_MASK we can leave it out of the equation. const ptrdiff_t filter_offset = SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; // The source index is calculated using the (subpel_x_qn + x * x_step_qn) // >> SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If // subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << // SCALE_SUBPEL_BITS) == 0, the source index can be determined using the // value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)). if (filter_params_x->interp_filter == MULTITAP_SHARP) { convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter); } else { convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter); } } // Vertical filter if (filter_params_y->interp_filter == MULTITAP_SHARP) { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_8tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } } aom-3.12.1/av1/common/arm/av1_convolve_scale_neon_dotprod.c000066400000000000000000000414071477627663500236340ustar00rootroot00000000000000/* * Copyright (c) 2024, 
Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_scale_neon.h" #include "av1/common/convolve.h" #include "av1/common/enums.h" #include "av1/common/filter.h" // clang-format off DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }; // clang-format on static inline int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const int8x8_t filter, const int32x4_t horiz_const) { const int8x16_t filters = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); int32x4_t sum = vpaddq_s32(sum01, sum23); // We halved the filter values so -1 from right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, const uint8x8_t s5, const uint8x8_t s6, const uint8x8_t s7, const int8x8_t filter, const int32x4_t horiz_const) { const int8x16_t filters = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); uint8x16_t s45 = vcombine_u8(s4, s5); uint8x16_t s67 = vcombine_u8(s6, s7); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s01_128 = vreinterpretq_s8_u8(vsubq_u8(s01, vdupq_n_u8(128))); int8x16_t s23_128 = vreinterpretq_s8_u8(vsubq_u8(s23, vdupq_n_u8(128))); int8x16_t s45_128 = vreinterpretq_s8_u8(vsubq_u8(s45, vdupq_n_u8(128))); int8x16_t s67_128 = vreinterpretq_s8_u8(vsubq_u8(s67, vdupq_n_u8(128))); int32x4_t sum01 = vdotq_s32(horiz_const, s01_128, filters); int32x4_t sum23 = vdotq_s32(horiz_const, s23_128, filters); int32x4_t sum45 = vdotq_s32(horiz_const, s45_128, filters); int32x4_t sum67 = vdotq_s32(horiz_const, s67_128, filters); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int32x4_t sum4567 = vpaddq_s32(sum45, sum67); // We halved the filter values so -1 from right shift. 
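// Note that each lane of sum0123 / sum4567 has picked up horiz_const twice
// (once per partial dot product before the pairwise add), i.e. half of the
// full offset, which matches the halved filter values.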
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_horiz_scale_neon_dotprod( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, const int subpel_x_qn, const int x_step_qn) { DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); const int bd = 8; // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_offset = (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 // from every source value. const int32_t dotprod_offset = 128 << FILTER_BITS; // Divide the total by 4: we halved the filter values and will use a pairwise // add in the convolution kernel. const int32x4_t horiz_offset_vec = vdupq_n_s32((horiz_offset + dotprod_offset) >> 2); if (w == 4) { do { int x_qn = subpel_x_qn; // Process a 4x4 tile. for (int r = 0; r < 4; r++) { const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); // Filter values are all even so halve them to fit in int8_t. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset_vec); vst1_s16(&temp[r * 4], d0); x_qn += x_step_qn; } // Transpose the 4x4 result tile and store. int16x4_t d0, d1, d2, d3; load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { do { int x_qn = subpel_x_qn; int16_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; r++) { const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); // Filter values are all even so halve them to fit in int8_t. int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, horiz_offset_vec); vst1q_s16(&temp[r * 8], d0); x_qn += x_step_qn; } // Transpose the 8x8 result tile and store. int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } static inline int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; int32x4_t sum = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); // We halved the filter values so -1 from right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples0_128 = vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))); int8x16_t samples1_128 = vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } int8x16_t perm_samples[4] = { vqtbl1q_s8(samples0_128, permute_tbl.val[0]), vqtbl1q_s8(samples0_128, permute_tbl.val[1]), vqtbl1q_s8(samples1_128, permute_tbl.val[0]), vqtbl1q_s8(samples1_128, permute_tbl.val[1]) }; // First 4 output values. int32x4_t sum0123 = vdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); // Second 4 output values. int32x4_t sum4567 = vdotq_lane_s32(horiz_const, perm_samples[2], filters, 0); sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filters, 1); // We halved the filter values so -1 from right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_horiz_scale_2_neon_dotprod( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_offset = (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)); // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 // from every source value. const int32_t dotprod_offset = 128 << FILTER_BITS; // Divide the total by 2 because we halved the filter values. const int32x4_t horiz_offset_vec = vdupq_n_s32((horiz_offset + dotprod_offset) >> 1); const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); // Filter values are all even so halve them to fit in int8_t. 
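// The dotprod_offset folded into horiz_offset_vec above cancels the bias from
// the [-128, 127] range shift: the taps sum to 1 << FILTER_BITS, so
// subtracting 128 from every sample lowers each dot product by exactly
// 128 << FILTER_BITS (halved here, like everything else).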
const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1); if (w == 4) { do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve8_4_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); int16x4_t d1 = convolve8_4_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); int16x4_t d2 = convolve8_4_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); int16x4_t d3 = convolve8_4_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); store_s16_4x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 4; width -= 4; } while (width != 0); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x8_t d0 = convolve8_8_h_scale_2(s0, filter, horiz_offset_vec, permute_tbl); int16x8_t d1 = convolve8_8_h_scale_2(s1, filter, horiz_offset_vec, permute_tbl); int16x8_t d2 = convolve8_8_h_scale_2(s2, filter, horiz_offset_vec, permute_tbl); int16x8_t d3 = convolve8_8_h_scale_2(s3, filter, horiz_offset_vec, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 16; d += 8; width -= 8; } while (width != 0); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } } void av1_convolve_2d_scale_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { if (w < 4 || h < 4) { av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, conv_params); return; } // For the interpolation 8-tap filters are used. assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); DECLARE_ALIGNED(32, int16_t, im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = MAX_SB_SIZE; CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 // lines post both horizontally and vertically. const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; // Horizontal filter if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { convolve_horiz_scale_neon_dotprod( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); } else { assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); // The filter index is calculated using the // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS // equation, where the values of x are from 0 to w. If x_step_qn is a // multiple of SCALE_SUBPEL_MASK we can leave it out of the equation. const ptrdiff_t filter_offset = SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; // The source index is calculated using the (subpel_x_qn + x * x_step_qn) >> // SCALE_SUBPEL_BITS, where the values of x are from 0 to w. 
If subpel_x_qn // < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, // the source index can be determined using the value x * (x_step_qn / // (1 << SCALE_SUBPEL_BITS)). convolve_horiz_scale_2_neon_dotprod(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter); } // Vertical filter if (filter_params_y->interp_filter == MULTITAP_SHARP) { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_8tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } } aom-3.12.1/av1/common/arm/av1_convolve_scale_neon_i8mm.c000066400000000000000000000373161477627663500230370ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_scale_neon.h" #include "av1/common/convolve.h" #include "av1/common/enums.h" #include "av1/common/filter.h" // clang-format off DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }; // clang-format on static inline int16x4_t convolve8_4_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const int8x8_t filter, const int32x4_t horiz_const) { const int8x16_t filters = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); int32x4_t sum = vpaddq_s32(sum01, sum23); // We halved the filter values so -1 from right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_h(const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, const uint8x8_t s3, const uint8x8_t s4, const uint8x8_t s5, const uint8x8_t s6, const uint8x8_t s7, const int8x8_t filter, const int32x4_t horiz_const) { const int8x16_t filters = vcombine_s8(filter, filter); uint8x16_t s01 = vcombine_u8(s0, s1); uint8x16_t s23 = vcombine_u8(s2, s3); uint8x16_t s45 = vcombine_u8(s4, s5); uint8x16_t s67 = vcombine_u8(s6, s7); int32x4_t sum01 = vusdotq_s32(horiz_const, s01, filters); int32x4_t sum23 = vusdotq_s32(horiz_const, s23, filters); int32x4_t sum45 = vusdotq_s32(horiz_const, s45, filters); int32x4_t sum67 = vusdotq_s32(horiz_const, s67, filters); int32x4_t sum0123 = vpaddq_s32(sum01, sum23); int32x4_t sum4567 = vpaddq_s32(sum45, sum67); // We halved the filter values so -1 from right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_horiz_scale_neon_i8mm(const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter, const int subpel_x_qn, const int x_step_qn) { DECLARE_ALIGNED(16, int16_t, temp[8 * 8]); const int bd = 8; // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Divide the total by 4: we halved the filter values and will use a pairwise // add in the convolution kernel. const int32x4_t horiz_offset = vdupq_n_s32( ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) >> 2); if (w == 4) { do { int x_qn = subpel_x_qn; // Process a 4x4 tile. for (int r = 0; r < 4; r++) { const uint8_t *const s = &src[x_qn >> SCALE_SUBPEL_BITS]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); // Filter values are all even so halve them to fit in int8_t. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); int16x4_t d0 = convolve8_4_h(t0, t1, t2, t3, filter, horiz_offset); vst1_s16(&temp[r * 4], d0); x_qn += x_step_qn; } // Transpose the 4x4 result tile and store. 
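// Unlike the transpose-based Armv8.0 Neon path, the dot-product kernel above
// consumes four whole source rows per call, so only this result tile needs
// transposing.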
int16x4_t d0, d1, d2, d3; load_s16_4x4(temp, 4, &d0, &d1, &d2, &d3); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { do { int x_qn = subpel_x_qn; int16_t *d = dst; int width = w; do { // Process an 8x8 tile. for (int r = 0; r < 8; r++) { const uint8_t *const s = &src[(x_qn >> SCALE_SUBPEL_BITS)]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); // Filter values are all even so halve them to fit in int8_t. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter + filter_offset), 1); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t d0 = convolve8_8_h(t0, t1, t2, t3, t4, t5, t6, t7, filter, horiz_offset); vst1q_s16(&temp[r * 8], d0); x_qn += x_step_qn; } // Transpose the 8x8 result tile and store. int16x8_t d0, d1, d2, d3, d4, d5, d6, d7; load_s16_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; width -= 8; } while (width != 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } } static inline int16x4_t convolve8_4_h_scale_2(uint8x16_t samples, const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, perm_samples[1], filters, 1); // We halved the filter values so -1 from right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_h_scale_2(uint8x16_t samples[2], const int8x8_t filters, const int32x4_t horiz_const, const uint8x16x2_t permute_tbl) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 } // { 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 } uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), vqtbl1q_u8(samples[0], permute_tbl.val[1]), vqtbl1q_u8(samples[1], permute_tbl.val[0]), vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; // First 4 output values. int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); // Second 4 output values. int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[2], filters, 0); sum4567 = vusdotq_lane_s32(sum4567, perm_samples[3], filters, 1); // We halved the filter values so -1 from right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_horiz_scale_2_neon_i8mm( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter) { const int bd = 8; // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // The additional -1 is needed because we are halving the filter values. 
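// Unlike the sdot (dotprod) version, there is no 128 range-shift term here:
// vusdot multiplies the unsigned samples with the signed taps directly.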
const int32x4_t horiz_offset = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2))); const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); // Filter values are all even so halve them to fit in int8_t. const int8x8_t filter = vshrn_n_s16(vld1q_s16(x_filter), 1); if (w == 4) { do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve8_4_h_scale_2(s0, filter, horiz_offset, permute_tbl); int16x4_t d1 = convolve8_4_h_scale_2(s1, filter, horiz_offset, permute_tbl); int16x4_t d2 = convolve8_4_h_scale_2(s2, filter, horiz_offset, permute_tbl); int16x4_t d3 = convolve8_4_h_scale_2(s3, filter, horiz_offset, permute_tbl); store_s16_4x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 4; width -= 4; } while (width != 0); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } else { do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x8_t d0 = convolve8_8_h_scale_2(s0, filter, horiz_offset, permute_tbl); int16x8_t d1 = convolve8_8_h_scale_2(s1, filter, horiz_offset, permute_tbl); int16x8_t d2 = convolve8_8_h_scale_2(s2, filter, horiz_offset, permute_tbl); int16x8_t d3 = convolve8_8_h_scale_2(s3, filter, horiz_offset, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 16; d += 8; width -= 8; } while (width != 0); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h > 0); } } void av1_convolve_2d_scale_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { if (w < 4 || h < 4) { av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, conv_params); return; } // For the interpolation 8-tap filters are used. assert(filter_params_y->taps <= 8 && filter_params_x->taps <= 8); DECLARE_ALIGNED(32, int16_t, im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = MAX_SB_SIZE; CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 // lines post both horizontally and vertically. const ptrdiff_t horiz_offset = filter_params_x->taps / 2 - 1; const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride; // Horizontal filter if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) { convolve_horiz_scale_neon_i8mm( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn); } else { assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS)); // The filter index is calculated using the // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS // equation, where the values of x are from 0 to w. If x_step_qn is a // multiple of SCALE_SUBPEL_MASK we can leave it out of the equation. 
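// With x_step_qn equal to 2 << SCALE_SUBPEL_BITS, the sub-pixel phase is the
// same for every output column, so a single filter is selected once here and
// the source simply advances by two whole pixels per output.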
const ptrdiff_t filter_offset = SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset; // The source index is calculated using the (subpel_x_qn + x * x_step_qn) >> // SCALE_SUBPEL_BITS, where the values of x are from 0 to w. If subpel_x_qn // < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, // the source index can be determined using the value x * (x_step_qn / // (1 << SCALE_SUBPEL_BITS)). convolve_horiz_scale_2_neon_i8mm(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter); } // Vertical filter if (filter_params_y->interp_filter == MULTITAP_SHARP) { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_8tap_neon( im_block, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_8tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { if (UNLIKELY(conv_params->is_compound)) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { compound_dist_wtd_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, conv_params, subpel_y_qn, y_step_qn); } else { compound_avg_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { compound_convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst16, dst16_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } else { convolve_vert_scale_6tap_neon( im_block + im_stride, im_stride, dst, dst_stride, w, h, filter_params_y->filter_ptr, subpel_y_qn, y_step_qn); } } } aom-3.12.1/av1/common/arm/av1_inv_txfm_neon.c000066400000000000000000004562501477627663500207370ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/av1_txfm.h" #include "av1/common/enums.h" #include "av1/common/idct.h" #include "av1/common/arm/av1_inv_txfm_neon.h" // 1D itx types typedef enum ATTRIBUTE_PACKED { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, } ITX_TYPE_1D; static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, }; static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, }; // 1D functions static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { { av1_idct4, av1_iadst4, av1_iidentity4_c }, { av1_idct8, av1_iadst8, av1_iidentity8_c }, { av1_idct16, av1_iadst16, av1_iidentity16_c }, { av1_idct32, NULL, NULL }, { av1_idct64, NULL, NULL }, }; static inline void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, uint8_t *output, int stride, int flipud, const int height) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; int16x8_t temp_output; for (int i = 0; i < height; ++i, j += step) { temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output))); temp_output = vaddq_s16(temp_output, in[j]); vst1_u8(output, vqmovun_s16(temp_output)); output += stride; } } static inline uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, int16x8_t res0, int16x8_t res1) { int16x8_t temp_output[2]; uint8x16_t temp_output_8q; temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))); temp_output[0] = vaddq_s16(temp_output[0], res0); temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))); temp_output[1] = vaddq_s16(temp_output[1], res1); temp_output_8q = vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1])); return temp_output_8q; } static inline void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, uint8_t *output, int stride, int flipud, int height) { uint8x16_t temp_output_8q; int j = flipud ? (height - 1) : 0; const int step = flipud ? 
-1 : 1; for (int i = 0; i < height; ++i, j += step) { temp_output_8q = vld1q_u8(output + i * stride); temp_output_8q = lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); vst1q_u8((output + i * stride), temp_output_8q); } } static inline void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, int value) { for (int i = 0; i < size; i++) { a[i] = vdupq_n_s16((int16_t)value); } } static inline void btf_16_lane_0_1_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; int16x4_t v0[2], v1[2]; s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); *t0 = vcombine_s16(v0[0], v0[1]); *t1 = vcombine_s16(v1[0], v1[1]); } static inline void btf_16_lane_1_0_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; int16x4_t v0[2], v1[2]; s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); *t0 = vcombine_s16(v0[0], v0[1]); *t1 = vcombine_s16(v1[0], v1[1]); } static inline void btf_16_lane_2_3_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; int16x4_t v0[2], v1[2]; s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); *t0 = vcombine_s16(v0[0], v0[1]); *t1 = vcombine_s16(v1[0], v1[1]); } static inline void btf_16_neon(const int16x8_t in0, int16_t coef1, int16_t coef2, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0_l, s0_h, s1_l, s1_h; int16x4_t v0[2], v1[2]; s0_l = vmull_n_s16(vget_low_s16(in0), coef1); s0_h = vmull_n_s16(vget_high_s16(in0), coef1); s1_l = vmull_n_s16(vget_low_s16(in0), coef2); s1_h = vmull_n_s16(vget_high_s16(in0), coef2); v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT); v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); *t0 = vcombine_s16(v0[0], v0[1]); *t1 = vcombine_s16(v1[0], v1[1]); } static inline void 
btf_16_lane_3_2_neon(const int16x8_t in0, const int16x8_t in1, const int16x4_t c, int16x8_t *t0, int16x8_t *t1) { int32x4_t s0[2], s1[2]; int16x4_t v0[2], v1[2]; s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); *t0 = vcombine_s16(v0[0], v0[1]); *t1 = vcombine_s16(v1[0], v1[1]); } static inline void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { int32x4_t t0[2], t1[2]; int16x4_t v0[2], v1[2]; // Don't add/sub before multiply, which will overflow in iadst8. const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); t0[0] = vaddq_s32(x0_lo, x1_lo); t0[1] = vaddq_s32(x0_hi, x1_hi); t1[0] = vsubq_s32(x0_lo, x1_lo); t1[1] = vsubq_s32(x0_hi, x1_hi); v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); x[0] = vcombine_s16(v0[0], v0[1]); x[1] = vcombine_s16(v1[0], v1[1]); } static inline int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, const int16_t c2, const int16_t c3) { int16x4_t val = vdup_n_s16(c0); val = vset_lane_s16(c1, val, 1); val = vset_lane_s16(c2, val, 2); val = vset_lane_s16(c3, val, 3); return val; } static inline void iadst8_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[20], (int16_t)cospi[44]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; // Stage 1 x[0] = in[7]; x[1] = in[0]; x[2] = in[5]; x[3] = in[2]; x[4] = in[3]; x[5] = in[4]; x[6] = in[1]; x[7] = in[6]; // Stage 2 btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); // Stage 3 x[0] = vqaddq_s16(s0, s4); x[1] = vqaddq_s16(s1, s5); x[2] = vqaddq_s16(s2, s6); x[3] = vqaddq_s16(s3, s7); x[4] = vqsubq_s16(s0, s4); x[5] = vqsubq_s16(s1, s5); x[6] = vqsubq_s16(s2, s6); x[7] = vqsubq_s16(s3, s7); // Stage 4 s0 = x[0]; s1 = x[1]; s2 = x[2]; s3 = x[3]; btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); // Stage 5 x[0] = vqaddq_s16(s0, s2); x[1] = vqaddq_s16(s1, s3); x[2] = vqsubq_s16(s0, s2); x[3] = vqsubq_s16(s1, s3); x[4] = vqaddq_s16(s4, s6); x[5] = vqaddq_s16(s5, s7); x[6] = vqsubq_s16(s4, s6); x[7] = vqsubq_s16(s5, s7); // stage 6 btf_16_half_neon(x + 2, c2); btf_16_half_neon(x + 6, c2); // Stage 7 out[0] = x[0]; out[1] = vqnegq_s16(x[4]); out[2] = x[6]; out[3] = vqnegq_s16(x[2]); out[4] = 
x[3]; out[5] = vqnegq_s16(x[7]); out[6] = x[5]; out[7] = vqnegq_s16(x[1]); } static inline void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[8]; int16x8_t s0, s1, s4, s5; // Stage 1 x[1] = in[0]; // Stage 2 btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); // Stage 3 x[0] = s0; x[1] = s1; x[4] = s0; x[5] = s1; // Stage 4 s0 = x[0]; s1 = x[1]; btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); // Stage 5 x[0] = s0; x[1] = s1; x[2] = s0; x[3] = s1; x[4] = s4; x[5] = s5; x[6] = s4; x[7] = s5; // stage 6 btf_16_half_neon(x + 2, c2); btf_16_half_neon(x + 6, c2); // Stage 7 out[0] = x[0]; out[1] = vqnegq_s16(x[4]); out[2] = x[6]; out[3] = vqnegq_s16(x[2]); out[4] = x[3]; out[5] = vqnegq_s16(x[7]); out[6] = x[5]; out[7] = vqnegq_s16(x[1]); } static inline void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[8], step2[8]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); // stage 2 btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); // stage 3 btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); step2[7] = vqaddq_s16(step1[7], step1[6]); // stage 4 step1[0] = vqaddq_s16(step2[0], step2[3]); step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); // stage 5 out[0] = vqaddq_s16(step1[0], step2[7]); out[1] = vqaddq_s16(step1[1], step1[6]); out[2] = vqaddq_s16(step1[2], step1[5]); out[3] = vqaddq_s16(step1[3], step2[4]); out[4] = vqsubq_s16(step1[3], step2[4]); out[5] = vqsubq_s16(step1[2], step1[5]); out[6] = vqsubq_s16(step1[1], step1[6]); out[7] = vqsubq_s16(step1[0], step2[7]); } static inline void idct8_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; int32x4_t t32[2]; // stage 1 // stage 2 // stage 3 t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); // stage 4 // stage 5 out[0] = step1; out[1] = step1; out[2] = step1; out[3] = step1; out[4] = step1; out[5] = step1; out[6] = step1; out[7] = step1; } static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { assert(!(size % 4)); if (!bit) return; const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); for (int i = 0; i < size; i++) { arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); } } static inline void flip_buf_ud_neon(int16x8_t *input, int size) { int16x8_t temp[8]; for (int i = 0; i < size; ++i) { temp[i] = input[size - 1 - i]; } for (int i = 0; i < size; ++i) { input[i] = temp[i]; } } static inline void load_buffer_32bit_to_16bit_neon(const int32_t *input, int stride, int16x8_t *const a, int out_size) { for (int i = 0; i < out_size; ++i) { 
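    /* Narrow (truncate) two groups of four 32-bit coefficients to 16 bits
     * and pack them into a single int16x8_t per row. */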
a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), vmovn_s32(vld1q_s32(input + 4))); input += stride; } } static const int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793 }; static inline void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, int txw_idx, int8_t size, int bit) { const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); int16x4_t low_i16, high_i16; int32x4_t low_i32, high_i32; for (int i = 0; i < size; i++) { int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); low_i16 = vqmovn_s32(low_i32); high_i16 = vqmovn_s32(high_i32); output[i] = vcombine_s16(low_i16, high_i16); } } static inline void round_shift_for_rect(int16x8_t *input, int16x8_t *output, int size) { int32x4_t out_low, out_high; int16x4_t low, high; for (int z = 0; z < size; ++z) { out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); output[z] = vcombine_s16(low, high); } } static inline void idct16_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; int32x4_t t32[2]; // stage 4 t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); // stage 6 // stage 7 out[0] = step1; out[1] = step1; out[2] = step1; out[3] = step1; out[4] = step1; out[5] = step1; out[6] = step1; out[7] = step1; out[8] = step1; out[9] = step1; out[10] = step1; out[11] = step1; out[12] = step1; out[13] = step1; out[14] = step1; out[15] = step1; } static inline void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[36], (int16_t)cospi[28]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c4 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); step2[0] = in[0]; step2[1] = in[8]; step2[2] = in[4]; step2[3] = in[12]; step2[4] = in[2]; step2[5] = in[10]; step2[6] = in[6]; step2[7] = in[14]; // stage 3 btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); step1[0] = step2[0]; step1[1] = step2[1]; step1[2] = step2[2]; step1[3] = step2[3]; step1[8] = vqaddq_s16(step2[8], step2[9]); step1[9] = vqsubq_s16(step2[8], step2[9]); step1[10] = vqsubq_s16(step2[11], 
step2[10]); step1[11] = vqaddq_s16(step2[11], step2[10]); step1[12] = vqaddq_s16(step2[12], step2[13]); step1[13] = vqsubq_s16(step2[12], step2[13]); step1[14] = vqsubq_s16(step2[15], step2[14]); step1[15] = vqaddq_s16(step2[15], step2[14]); // stage 4 btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); step2[7] = vqaddq_s16(step1[7], step1[6]); step2[8] = step1[8]; step2[11] = step1[11]; step2[12] = step1[12]; step2[15] = step1[15]; // stage 5 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); step1[0] = vqaddq_s16(step2[0], step2[3]); step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); step1[4] = step2[4]; step1[7] = step2[7]; step1[8] = vqaddq_s16(step2[8], step2[11]); step1[9] = vqaddq_s16(step2[9], step2[10]); step1[10] = vqsubq_s16(step2[9], step2[10]); step1[11] = vqsubq_s16(step2[8], step2[11]); step1[12] = vqsubq_s16(step2[15], step2[12]); step1[13] = vqsubq_s16(step2[14], step2[13]); step1[14] = vqaddq_s16(step2[14], step2[13]); step1[15] = vqaddq_s16(step2[15], step2[12]); // stage 6 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); step2[0] = vqaddq_s16(step1[0], step1[7]); step2[1] = vqaddq_s16(step1[1], step1[6]); step2[2] = vqaddq_s16(step1[2], step1[5]); step2[3] = vqaddq_s16(step1[3], step1[4]); step2[4] = vqsubq_s16(step1[3], step1[4]); step2[5] = vqsubq_s16(step1[2], step1[5]); step2[6] = vqsubq_s16(step1[1], step1[6]); step2[7] = vqsubq_s16(step1[0], step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 out[0] = vqaddq_s16(step2[0], step2[15]); out[1] = vqaddq_s16(step2[1], step2[14]); out[2] = vqaddq_s16(step2[2], step2[13]); out[3] = vqaddq_s16(step2[3], step2[12]); out[4] = vqaddq_s16(step2[4], step2[11]); out[5] = vqaddq_s16(step2[5], step2[10]); out[6] = vqaddq_s16(step2[6], step2[9]); out[7] = vqaddq_s16(step2[7], step2[8]); out[8] = vqsubq_s16(step2[7], step2[8]); out[9] = vqsubq_s16(step2[6], step2[9]); out[10] = vqsubq_s16(step2[5], step2[10]); out[11] = vqsubq_s16(step2[4], step2[11]); out[12] = vqsubq_s16(step2[3], step2[12]); out[13] = vqsubq_s16(step2[2], step2[13]); out[14] = vqsubq_s16(step2[1], step2[14]); out[15] = vqsubq_s16(step2[0], step2[15]); } static inline void idct16_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[16], step2[16]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c1 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 step2[0] = in[0]; step2[2] = in[4]; step2[4] = in[2]; step2[6] = in[6]; btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); // stage 3 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], 
&step1[7]); btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); step1[0] = step2[0]; step1[2] = step2[2]; step1[8] = vqaddq_s16(step2[8], step2[9]); step1[9] = vqsubq_s16(step2[8], step2[9]); step1[10] = vqsubq_s16(step2[11], step2[10]); step1[11] = vqaddq_s16(step2[11], step2[10]); step1[12] = vqaddq_s16(step2[12], step2[13]); step1[13] = vqsubq_s16(step2[12], step2[13]); step1[14] = vqsubq_s16(step2[15], step2[14]); step1[15] = vqaddq_s16(step2[15], step2[14]); // stage 4 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); step2[7] = vqaddq_s16(step1[7], step1[6]); step2[8] = step1[8]; step2[11] = step1[11]; step2[12] = step1[12]; step2[15] = step1[15]; // stage 5 btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); step1[0] = vqaddq_s16(step2[0], step2[3]); step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); step1[4] = step2[4]; step1[7] = step2[7]; step1[8] = vqaddq_s16(step2[8], step2[11]); step1[9] = vqaddq_s16(step2[9], step2[10]); step1[10] = vqsubq_s16(step2[9], step2[10]); step1[11] = vqsubq_s16(step2[8], step2[11]); step1[12] = vqsubq_s16(step2[15], step2[12]); step1[13] = vqsubq_s16(step2[14], step2[13]); step1[14] = vqaddq_s16(step2[14], step2[13]); step1[15] = vqaddq_s16(step2[15], step2[12]); // stage 6 btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); step2[0] = vqaddq_s16(step1[0], step1[7]); step2[1] = vqaddq_s16(step1[1], step1[6]); step2[2] = vqaddq_s16(step1[2], step1[5]); step2[3] = vqaddq_s16(step1[3], step1[4]); step2[4] = vqsubq_s16(step1[3], step1[4]); step2[5] = vqsubq_s16(step1[2], step1[5]); step2[6] = vqsubq_s16(step1[1], step1[6]); step2[7] = vqsubq_s16(step1[0], step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 out[0] = vqaddq_s16(step2[0], step2[15]); out[1] = vqaddq_s16(step2[1], step2[14]); out[2] = vqaddq_s16(step2[2], step2[13]); out[3] = vqaddq_s16(step2[3], step2[12]); out[4] = vqaddq_s16(step2[4], step2[11]); out[5] = vqaddq_s16(step2[5], step2[10]); out[6] = vqaddq_s16(step2[6], step2[9]); out[7] = vqaddq_s16(step2[7], step2[8]); out[8] = vqsubq_s16(step2[7], step2[8]); out[9] = vqsubq_s16(step2[6], step2[9]); out[10] = vqsubq_s16(step2[5], step2[10]); out[11] = vqsubq_s16(step2[4], step2[11]); out[12] = vqsubq_s16(step2[3], step2[12]); out[13] = vqsubq_s16(step2[2], step2[13]); out[14] = vqsubq_s16(step2[1], step2[14]); out[15] = vqsubq_s16(step2[0], step2[15]); } static inline void iadst16_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], (int16_t)cospi[10], (int16_t)cospi[54]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], (int16_t)cospi[26], (int16_t)cospi[38]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], (int16_t)cospi[42], (int16_t)cospi[22]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], (int16_t)cospi[58], 
(int16_t)cospi[6]); const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; // Stage 1 x[0] = in[15]; x[1] = in[0]; x[2] = in[13]; x[3] = in[2]; x[4] = in[11]; x[5] = in[4]; x[6] = in[9]; x[7] = in[6]; x[8] = in[7]; x[9] = in[8]; x[10] = in[5]; x[11] = in[10]; x[12] = in[3]; x[13] = in[12]; x[14] = in[1]; x[15] = in[14]; // Stage 2 btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); // Stage 3 x[0] = vqaddq_s16(s0, s8); x[1] = vqaddq_s16(s1, s9); x[2] = vqaddq_s16(s2, s10); x[3] = vqaddq_s16(s3, s11); x[4] = vqaddq_s16(s4, s12); x[5] = vqaddq_s16(s5, s13); x[6] = vqaddq_s16(s6, s14); x[7] = vqaddq_s16(s7, s15); x[8] = vqsubq_s16(s0, s8); x[9] = vqsubq_s16(s1, s9); x[10] = vqsubq_s16(s2, s10); x[11] = vqsubq_s16(s3, s11); x[12] = vqsubq_s16(s4, s12); x[13] = vqsubq_s16(s5, s13); x[14] = vqsubq_s16(s6, s14); x[15] = vqsubq_s16(s7, s15); // Stage 4 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; t[4] = x[4]; t[5] = x[5]; t[6] = x[6]; t[7] = x[7]; btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); // Stage 5 x[0] = vqaddq_s16(t[0], t[4]); x[1] = vqaddq_s16(t[1], t[5]); x[2] = vqaddq_s16(t[2], t[6]); x[3] = vqaddq_s16(t[3], t[7]); x[4] = vqsubq_s16(t[0], t[4]); x[5] = vqsubq_s16(t[1], t[5]); x[6] = vqsubq_s16(t[2], t[6]); x[7] = vqsubq_s16(t[3], t[7]); x[8] = vqaddq_s16(s8, s12); x[9] = vqaddq_s16(s9, s13); x[10] = vqaddq_s16(s10, s14); x[11] = vqaddq_s16(s11, s15); x[12] = vqsubq_s16(s8, s12); x[13] = vqsubq_s16(s9, s13); x[14] = vqsubq_s16(s10, s14); x[15] = vqsubq_s16(s11, s15); // stage 6 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); x[1] = vqaddq_s16(t[1], t[3]); x[2] = vqsubq_s16(t[0], t[2]); x[3] = vqsubq_s16(t[1], t[3]); x[4] = vqaddq_s16(s4, s6); x[5] = vqaddq_s16(s5, s7); x[6] = vqsubq_s16(s4, s6); x[7] = vqsubq_s16(s5, s7); x[8] = vqaddq_s16(t[8], t[10]); x[9] = vqaddq_s16(t[9], t[11]); x[10] = vqsubq_s16(t[8], t[10]); x[11] = vqsubq_s16(t[9], t[11]); x[12] = vqaddq_s16(s12, s14); x[13] = vqaddq_s16(s13, s15); x[14] = vqsubq_s16(s12, s14); x[15] = vqsubq_s16(s13, s15); // Stage 8 btf_16_half_neon(x + 2, c5); btf_16_half_neon(x + 6, c5); btf_16_half_neon(x + 10, c5); btf_16_half_neon(x + 14, c5); // Stage 9 out[0] = x[0]; out[1] = vqnegq_s16(x[8]); out[2] = x[12]; out[3] = vqnegq_s16(x[4]); out[4] = x[6]; out[5] = vqnegq_s16(x[14]); out[6] = x[10]; out[7] = vqnegq_s16(x[2]); out[8] = x[3]; out[9] = vqnegq_s16(x[11]); out[10] = x[15]; out[11] = vqnegq_s16(x[7]); out[12] = x[5]; out[13] = 
vqnegq_s16(x[13]); out[14] = x[9]; out[15] = vqnegq_s16(x[1]); } static inline void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[10]; int16x8_t s0, s1, s4, s5; int16x8_t s8, s9, s12, s13; // Stage 1 x[1] = in[0]; // Stage 2 btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); // Stage 3 x[0] = s0; x[1] = s1; x[8] = s0; x[9] = s1; // Stage 4 t[0] = x[0]; t[1] = x[1]; btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); // Stage 5 x[0] = t[0]; x[1] = t[1]; x[4] = t[0]; x[5] = t[1]; x[8] = s8; x[9] = s9; x[12] = s8; x[13] = s9; // stage 6 t[0] = x[0]; t[1] = x[1]; btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); t[8] = x[8]; t[9] = x[9]; btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); // Stage 7 x[0] = t[0]; x[1] = t[1]; x[2] = t[0]; x[3] = t[1]; x[4] = s4; x[5] = s5; x[6] = s4; x[7] = s5; x[8] = t[8]; x[9] = t[9]; x[10] = t[8]; x[11] = t[9]; x[12] = s12; x[13] = s13; x[14] = s12; x[15] = s13; // Stage 8 btf_16_half_neon(x + 2, c1); btf_16_half_neon(x + 6, c1); btf_16_half_neon(x + 10, c1); btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; out[1] = vqnegq_s16(x[8]); out[2] = x[12]; out[3] = vqnegq_s16(x[4]); out[4] = x[6]; out[5] = vqnegq_s16(x[14]); out[6] = x[10]; out[7] = vqnegq_s16(x[2]); out[8] = x[3]; out[9] = vqnegq_s16(x[11]); out[10] = x[15]; out[11] = vqnegq_s16(x[7]); out[12] = x[5]; out[13] = vqnegq_s16(x[13]); out[14] = x[9]; out[15] = vqnegq_s16(x[1]); } static inline void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); int16x8_t x[16]; int16x8_t t[14]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; // Stage 1 x[1] = in[0]; x[3] = in[2]; x[5] = in[4]; x[7] = in[6]; x[8] = in[7]; x[10] = in[5]; x[12] = in[3]; x[14] = in[1]; // Stage 2 btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); // Stage 3 x[0] = vqaddq_s16(s0, s8); x[1] = vqaddq_s16(s1, s9); x[2] = vqaddq_s16(s2, s10); x[3] = vqaddq_s16(s3, s11); x[4] = vqaddq_s16(s4, s12); x[5] = vqaddq_s16(s5, s13); x[6] = vqaddq_s16(s6, s14); x[7] = vqaddq_s16(s7, s15); x[8] = vqsubq_s16(s0, s8); x[9] = vqsubq_s16(s1, s9); x[10] = vqsubq_s16(s2, s10); x[11] = vqsubq_s16(s3, s11); x[12] = vqsubq_s16(s4, s12); x[13] = vqsubq_s16(s5, s13); x[14] = vqsubq_s16(s6, s14); x[15] = vqsubq_s16(s7, s15); // Stage 4 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; t[4] = x[4]; t[5] = x[5]; t[6] = x[6]; t[7] = x[7]; btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); // 
Stage 5 x[0] = vqaddq_s16(t[0], t[4]); x[1] = vqaddq_s16(t[1], t[5]); x[2] = vqaddq_s16(t[2], t[6]); x[3] = vqaddq_s16(t[3], t[7]); x[4] = vqsubq_s16(t[0], t[4]); x[5] = vqsubq_s16(t[1], t[5]); x[6] = vqsubq_s16(t[2], t[6]); x[7] = vqsubq_s16(t[3], t[7]); x[8] = vqaddq_s16(s8, s12); x[9] = vqaddq_s16(s9, s13); x[10] = vqaddq_s16(s10, s14); x[11] = vqaddq_s16(s11, s15); x[12] = vqsubq_s16(s8, s12); x[13] = vqsubq_s16(s9, s13); x[14] = vqsubq_s16(s10, s14); x[15] = vqsubq_s16(s11, s15); // stage 6 t[0] = x[0]; t[1] = x[1]; t[2] = x[2]; t[3] = x[3]; btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); t[8] = x[8]; t[9] = x[9]; t[10] = x[10]; t[11] = x[11]; btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); // Stage 7 x[0] = vqaddq_s16(t[0], t[2]); x[1] = vqaddq_s16(t[1], t[3]); x[2] = vqsubq_s16(t[0], t[2]); x[3] = vqsubq_s16(t[1], t[3]); x[4] = vqaddq_s16(s4, s6); x[5] = vqaddq_s16(s5, s7); x[6] = vqsubq_s16(s4, s6); x[7] = vqsubq_s16(s5, s7); x[8] = vqaddq_s16(t[8], t[10]); x[9] = vqaddq_s16(t[9], t[11]); x[10] = vqsubq_s16(t[8], t[10]); x[11] = vqsubq_s16(t[9], t[11]); x[12] = vqaddq_s16(s12, s14); x[13] = vqaddq_s16(s13, s15); x[14] = vqsubq_s16(s12, s14); x[15] = vqsubq_s16(s13, s15); // Stage 8 btf_16_half_neon(x + 2, c1); btf_16_half_neon(x + 6, c1); btf_16_half_neon(x + 10, c1); btf_16_half_neon(x + 14, c1); // Stage 9 out[0] = x[0]; out[1] = vqnegq_s16(x[8]); out[2] = x[12]; out[3] = vqnegq_s16(x[4]); out[4] = x[6]; out[5] = vqnegq_s16(x[14]); out[6] = x[10]; out[7] = vqnegq_s16(x[2]); out[8] = x[3]; out[9] = vqnegq_s16(x[11]); out[10] = x[15]; out[11] = vqnegq_s16(x[7]); out[12] = x[5]; out[13] = vqnegq_s16(x[13]); out[14] = x[9]; out[15] = vqnegq_s16(x[1]); } static inline void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], (int16_t)cospi[34], (int16_t)cospi[30]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], (int16_t)cospi[50], (int16_t)cospi[14]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], (int16_t)cospi[42], (int16_t)cospi[22]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], (int16_t)cospi[58], (int16_t)cospi[6]); const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[36], (int16_t)cospi[28]); const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c8 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c9 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 2 btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); 
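/*
 * Illustrative sketch, not part of the library: a scalar model of what one
 * btf_16_lane_*_neon call computes per lane. With cosine weights c0 and c1
 * it is a single rotation butterfly:
 *   t0 = round_shift(in0 * c0 + in1 * c1, INV_COS_BIT)
 *   t1 = round_shift(in0 * c1 - in1 * c0, INV_COS_BIT)
 * The _1_0 / _2_3 / _3_2 variants only select different lanes of the weight
 * vector or swap which weight multiplies which input. The helper name is
 * invented for the sketch; INV_COS_BIT is assumed to be 12 as in the codec
 * headers.
 */
static void sketch_butterfly_rotation(int16_t in0, int16_t in1, int16_t c0,
                                      int16_t c1, int16_t *t0, int16_t *t1) {
  const int32_t r = 1 << (12 - 1); /* rounding offset for INV_COS_BIT == 12 */
  *t0 = (int16_t)(((int32_t)in0 * c0 + (int32_t)in1 * c1 + r) >> 12);
  *t1 = (int16_t)(((int32_t)in0 * c1 - (int32_t)in1 * c0 + r) >> 12);
}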
btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); step2[0] = in[0]; step2[1] = in[16]; step2[2] = in[8]; step2[3] = in[24]; step2[4] = in[4]; step2[5] = in[20]; step2[6] = in[12]; step2[7] = in[28]; step2[8] = in[2]; step2[9] = in[18]; step2[10] = in[10]; step2[11] = in[26]; step2[12] = in[6]; step2[13] = in[22]; step2[14] = in[14]; step2[15] = in[30]; // stage 3 btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); step1[0] = step2[0]; step1[1] = step2[1]; step1[2] = step2[2]; step1[3] = step2[3]; step1[4] = step2[4]; step1[5] = step2[5]; step1[6] = step2[6]; step1[7] = step2[7]; step1[16] = vqaddq_s16(step2[16], step2[17]); step1[17] = vqsubq_s16(step2[16], step2[17]); step1[18] = vqsubq_s16(step2[19], step2[18]); step1[19] = vqaddq_s16(step2[19], step2[18]); step1[20] = vqaddq_s16(step2[20], step2[21]); step1[21] = vqsubq_s16(step2[20], step2[21]); step1[22] = vqsubq_s16(step2[23], step2[22]); step1[23] = vqaddq_s16(step2[23], step2[22]); step1[24] = vqaddq_s16(step2[24], step2[25]); step1[25] = vqsubq_s16(step2[24], step2[25]); step1[26] = vqsubq_s16(step2[27], step2[26]); step1[27] = vqaddq_s16(step2[27], step2[26]); step1[28] = vqaddq_s16(step2[28], step2[29]); step1[29] = vqsubq_s16(step2[28], step2[29]); step1[30] = vqsubq_s16(step2[31], step2[30]); step1[31] = vqaddq_s16(step2[31], step2[30]); // stage 4 btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); step2[0] = step1[0]; step2[1] = step1[1]; step2[2] = step1[2]; step2[3] = step1[3]; step2[8] = vqaddq_s16(step1[8], step1[9]); step2[9] = vqsubq_s16(step1[8], step1[9]); step2[10] = vqsubq_s16(step1[11], step1[10]); step2[11] = vqaddq_s16(step1[11], step1[10]); step2[12] = vqaddq_s16(step1[12], step1[13]); step2[13] = vqsubq_s16(step1[12], step1[13]); step2[14] = vqsubq_s16(step1[15], step1[14]); step2[15] = vqaddq_s16(step1[15], step1[14]); step2[16] = step1[16]; step2[19] = step1[19]; step2[20] = step1[20]; step2[23] = step1[23]; step2[24] = step1[24]; step2[27] = step1[27]; step2[28] = step1[28]; step2[31] = step1[31]; // stage 5 btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); step1[6] = vqsubq_s16(step2[7], step2[6]); step1[7] = vqaddq_s16(step2[7], step2[6]); step1[8] = step2[8]; step1[11] = step2[11]; step1[12] = step2[12]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[19]); step1[17] = vqaddq_s16(step2[17], step2[18]); step1[18] = vqsubq_s16(step2[17], step2[18]); step1[19] = vqsubq_s16(step2[16], step2[19]); step1[20] = vqsubq_s16(step2[23], step2[20]); step1[21] = vqsubq_s16(step2[22], step2[21]); step1[22] = 
vqaddq_s16(step2[22], step2[21]); step1[23] = vqaddq_s16(step2[23], step2[20]); step1[24] = vqaddq_s16(step2[24], step2[27]); step1[25] = vqaddq_s16(step2[25], step2[26]); step1[26] = vqsubq_s16(step2[25], step2[26]); step1[27] = vqsubq_s16(step2[24], step2[27]); step1[28] = vqsubq_s16(step2[31], step2[28]); step1[29] = vqsubq_s16(step2[30], step2[29]); step1[30] = vqaddq_s16(step2[30], step2[29]); step1[31] = vqaddq_s16(step2[31], step2[28]); // stage 6 btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[1], step1[2]); step2[2] = vqsubq_s16(step1[1], step1[2]); step2[3] = vqsubq_s16(step1[0], step1[3]); step2[4] = step1[4]; step2[7] = step1[7]; step2[8] = vqaddq_s16(step1[8], step1[11]); step2[9] = vqaddq_s16(step1[9], step1[10]); step2[10] = vqsubq_s16(step1[9], step1[10]); step2[11] = vqsubq_s16(step1[8], step1[11]); step2[12] = vqsubq_s16(step1[15], step1[12]); step2[13] = vqsubq_s16(step1[14], step1[13]); step2[14] = vqaddq_s16(step1[14], step1[13]); step2[15] = vqaddq_s16(step1[15], step1[12]); step2[16] = step1[16]; step2[17] = step1[17]; step2[22] = step1[22]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[25]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 7 btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); step1[0] = vqaddq_s16(step2[0], step2[7]); step1[1] = vqaddq_s16(step2[1], step2[6]); step1[2] = vqaddq_s16(step2[2], step2[5]); step1[3] = vqaddq_s16(step2[3], step2[4]); step1[4] = vqsubq_s16(step2[3], step2[4]); step1[5] = vqsubq_s16(step2[2], step2[5]); step1[6] = vqsubq_s16(step2[1], step2[6]); step1[7] = vqsubq_s16(step2[0], step2[7]); step1[8] = step2[8]; step1[9] = step2[9]; step1[14] = step2[14]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[23]); step1[17] = vqaddq_s16(step2[17], step2[22]); step1[18] = vqaddq_s16(step2[18], step2[21]); step1[19] = vqaddq_s16(step2[19], step2[20]); step1[20] = vqsubq_s16(step2[19], step2[20]); step1[21] = vqsubq_s16(step2[18], step2[21]); step1[22] = vqsubq_s16(step2[17], step2[22]); step1[23] = vqsubq_s16(step2[16], step2[23]); step1[24] = vqsubq_s16(step2[31], step2[24]); step1[25] = vqsubq_s16(step2[30], step2[25]); step1[26] = vqsubq_s16(step2[29], step2[26]); step1[27] = vqsubq_s16(step2[28], step2[27]); step1[28] = vqaddq_s16(step2[27], step2[28]); step1[29] = vqaddq_s16(step2[26], step2[29]); step1[30] = vqaddq_s16(step2[25], step2[30]); step1[31] = vqaddq_s16(step2[24], step2[31]); // stage 8 btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); step2[0] = vqaddq_s16(step1[0], step1[15]); step2[1] = vqaddq_s16(step1[1], step1[14]); step2[2] = vqaddq_s16(step1[2], step1[13]); step2[3] = vqaddq_s16(step1[3], step1[12]); step2[4] = vqaddq_s16(step1[4], step1[11]); step2[5] = vqaddq_s16(step1[5], step1[10]); step2[6] = vqaddq_s16(step1[6], step1[9]); step2[7] = vqaddq_s16(step1[7], step1[8]); 
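/*
 * Illustrative sketch, not part of the library: the mirrored add/sub
 * pattern used by the closing stages of this transform (stage 8 pairs
 * step1[i] with step1[15 - i] over the first sixteen elements; stage 9
 * below pairs step2[i] with step2[31 - i]). The NEON code uses saturating
 * vqaddq_s16 / vqsubq_s16, so the scalar model clamps to the int16_t range
 * as well. Helper names are invented for the sketch.
 */
static int16_t sketch_saturate16(int v) {
  if (v > 32767) return 32767;
  if (v < -32768) return -32768;
  return (int16_t)v;
}

static void sketch_mirror_butterfly(const int16_t *in, int16_t *out, int n) {
  for (int i = 0; i < n / 2; ++i) {
    out[i] = sketch_saturate16(in[i] + in[n - 1 - i]);
    out[n - 1 - i] = sketch_saturate16(in[i] - in[n - 1 - i]);
  }
}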
step2[8] = vqsubq_s16(step1[7], step1[8]); step2[9] = vqsubq_s16(step1[6], step1[9]); step2[10] = vqsubq_s16(step1[5], step1[10]); step2[11] = vqsubq_s16(step1[4], step1[11]); step2[12] = vqsubq_s16(step1[3], step1[12]); step2[13] = vqsubq_s16(step1[2], step1[13]); step2[14] = vqsubq_s16(step1[1], step1[14]); step2[15] = vqsubq_s16(step1[0], step1[15]); step2[16] = step1[16]; step2[17] = step1[17]; step2[18] = step1[18]; step2[19] = step1[19]; step2[28] = step1[28]; step2[29] = step1[29]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 9 out[0] = vqaddq_s16(step2[0], step2[31]); out[1] = vqaddq_s16(step2[1], step2[30]); out[2] = vqaddq_s16(step2[2], step2[29]); out[3] = vqaddq_s16(step2[3], step2[28]); out[4] = vqaddq_s16(step2[4], step2[27]); out[5] = vqaddq_s16(step2[5], step2[26]); out[6] = vqaddq_s16(step2[6], step2[25]); out[7] = vqaddq_s16(step2[7], step2[24]); out[8] = vqaddq_s16(step2[8], step2[23]); out[9] = vqaddq_s16(step2[9], step2[22]); out[10] = vqaddq_s16(step2[10], step2[21]); out[11] = vqaddq_s16(step2[11], step2[20]); out[12] = vqaddq_s16(step2[12], step2[19]); out[13] = vqaddq_s16(step2[13], step2[18]); out[14] = vqaddq_s16(step2[14], step2[17]); out[15] = vqaddq_s16(step2[15], step2[16]); out[16] = vqsubq_s16(step2[15], step2[16]); out[17] = vqsubq_s16(step2[14], step2[17]); out[18] = vqsubq_s16(step2[13], step2[18]); out[19] = vqsubq_s16(step2[12], step2[19]); out[20] = vqsubq_s16(step2[11], step2[20]); out[21] = vqsubq_s16(step2[10], step2[21]); out[22] = vqsubq_s16(step2[9], step2[22]); out[23] = vqsubq_s16(step2[8], step2[23]); out[24] = vqsubq_s16(step2[7], step2[24]); out[25] = vqsubq_s16(step2[6], step2[25]); out[26] = vqsubq_s16(step2[5], step2[26]); out[27] = vqsubq_s16(step2[4], step2[27]); out[28] = vqsubq_s16(step2[3], step2[28]); out[29] = vqsubq_s16(step2[2], step2[29]); out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } static inline void idct32_low1_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; int32x4_t t32[2]; // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); // stage 6 // stage 7 // stage 8 // stage 9 out[0] = step1; out[1] = step1; out[2] = step1; out[3] = step1; out[4] = step1; out[5] = step1; out[6] = step1; out[7] = step1; out[8] = step1; out[9] = step1; out[10] = step1; out[11] = step1; out[12] = step1; out[13] = step1; out[14] = step1; out[15] = step1; out[16] = step1; out[17] = step1; out[18] = step1; out[19] = step1; out[20] = step1; out[21] = step1; out[22] = step1; out[23] = step1; out[24] = step1; out[25] = step1; out[26] = step1; out[27] = step1; out[28] = step1; out[29] = step1; out[30] = step1; out[31] = step1; } static inline void idct32_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t t32[16]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], cospi[48]); const int16x4_t c2 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c3 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 
(int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 step2[0] = in[0]; step2[4] = in[4]; step2[8] = in[2]; step2[12] = in[6]; btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); // stage 3 step1[0] = step2[0]; step1[4] = step2[4]; btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); step1[16] = step2[16]; step1[17] = step2[16]; step1[18] = step2[19]; step1[19] = step2[19]; step1[20] = step2[20]; step1[21] = step2[20]; step1[22] = step2[23]; step1[23] = step2[23]; step1[24] = step2[24]; step1[25] = step2[24]; step1[26] = step2[27]; step1[27] = step2[27]; step1[28] = step2[28]; step1[29] = step2[28]; step1[30] = step2[31]; step1[31] = step2[31]; // stage 4 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[8] = step1[8]; step2[9] = step1[8]; step2[10] = step1[11]; step2[11] = step1[11]; step2[12] = step1[12]; step2[13] = step1[12]; step2[14] = step1[15]; step2[15] = step1[15]; step2[16] = step1[16]; step2[19] = step1[19]; step2[20] = step1[20]; step2[23] = step1[23]; step2[24] = step1[24]; step2[27] = step1[27]; step2[28] = step1[28]; step2[31] = step1[31]; // stage 5 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); step1[4] = step2[4]; step1[5] = step2[4]; step1[6] = step2[7]; step1[7] = step2[7]; step1[8] = step2[8]; step1[11] = step2[11]; step1[12] = step2[12]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[19]); step1[17] = vqaddq_s16(step2[17], step2[18]); step1[18] = vqsubq_s16(step2[17], step2[18]); step1[19] = vqsubq_s16(step2[16], step2[19]); step1[20] = vqsubq_s16(step2[23], step2[20]); step1[21] = vqsubq_s16(step2[22], step2[21]); step1[22] = vqaddq_s16(step2[22], step2[21]); step1[23] = vqaddq_s16(step2[23], step2[20]); step1[24] = vqaddq_s16(step2[24], step2[27]); step1[25] = vqaddq_s16(step2[25], step2[26]); step1[26] = vqsubq_s16(step2[25], step2[26]); step1[27] = vqsubq_s16(step2[24], step2[27]); step1[28] = vqsubq_s16(step2[31], step2[28]); step1[29] = vqsubq_s16(step2[30], step2[29]); step1[30] = vqaddq_s16(step2[30], step2[29]); step1[31] = vqaddq_s16(step2[31], step2[28]); // stage 6 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = step1[0]; step2[1] = step1[0]; step2[2] = step1[0]; step2[3] = step1[0]; step2[4] = step1[4]; step2[7] = step1[7]; step2[8] = vqaddq_s16(step1[8], step1[11]); step2[9] = 
vqaddq_s16(step1[9], step1[10]); step2[10] = vqsubq_s16(step1[9], step1[10]); step2[11] = vqsubq_s16(step1[8], step1[11]); step2[12] = vqsubq_s16(step1[15], step1[12]); step2[13] = vqsubq_s16(step1[14], step1[13]); step2[14] = vqaddq_s16(step1[14], step1[13]); step2[15] = vqaddq_s16(step1[15], step1[12]); step2[16] = step1[16]; step2[17] = step1[17]; step2[22] = step1[22]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[25]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 7 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); step1[0] = vqaddq_s16(step2[0], step2[7]); step1[1] = vqaddq_s16(step2[1], step2[6]); step1[2] = vqaddq_s16(step2[2], step2[5]); step1[3] = vqaddq_s16(step2[3], step2[4]); step1[4] = vqsubq_s16(step2[3], step2[4]); step1[5] = vqsubq_s16(step2[2], step2[5]); step1[6] = vqsubq_s16(step2[1], step2[6]); step1[7] = vqsubq_s16(step2[0], step2[7]); step1[8] = step2[8]; step1[9] = step2[9]; step1[14] = step2[14]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[23]); step1[17] = vqaddq_s16(step2[17], step2[22]); step1[18] = vqaddq_s16(step2[18], step2[21]); step1[19] = vqaddq_s16(step2[19], step2[20]); step1[20] = vqsubq_s16(step2[19], step2[20]); step1[21] = vqsubq_s16(step2[18], step2[21]); step1[22] = vqsubq_s16(step2[17], step2[22]); step1[23] = vqsubq_s16(step2[16], step2[23]); step1[24] = vqsubq_s16(step2[31], step2[24]); step1[25] = vqsubq_s16(step2[30], step2[25]); step1[26] = vqsubq_s16(step2[29], step2[26]); step1[27] = vqsubq_s16(step2[28], step2[27]); step1[28] = vqaddq_s16(step2[27], step2[28]); step1[29] = vqaddq_s16(step2[26], step2[29]); step1[30] = vqaddq_s16(step2[25], step2[30]); step1[31] = vqaddq_s16(step2[24], step2[31]); // stage 8 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); step2[0] = vqaddq_s16(step1[0], step1[15]); step2[1] = vqaddq_s16(step1[1], step1[14]); step2[2] = vqaddq_s16(step1[2], step1[13]); step2[3] = vqaddq_s16(step1[3], step1[12]); step2[4] = vqaddq_s16(step1[4], step1[11]); step2[5] = vqaddq_s16(step1[5], step1[10]); step2[6] = vqaddq_s16(step1[6], step1[9]); step2[7] = vqaddq_s16(step1[7], step1[8]); step2[8] = vqsubq_s16(step1[7], step1[8]); step2[9] = vqsubq_s16(step1[6], step1[9]); step2[10] = vqsubq_s16(step1[5], step1[10]); step2[11] = vqsubq_s16(step1[4], step1[11]); step2[12] = vqsubq_s16(step1[3], step1[12]); step2[13] = vqsubq_s16(step1[2], step1[13]); step2[14] = vqsubq_s16(step1[1], step1[14]); step2[15] = vqsubq_s16(step1[0], step1[15]); step2[16] = step1[16]; step2[17] = step1[17]; step2[18] = step1[18]; step2[19] = step1[19]; step2[28] = step1[28]; step2[29] = step1[29]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 9 out[0] = vqaddq_s16(step2[0], step2[31]); out[1] = vqaddq_s16(step2[1], step2[30]); out[2] = vqaddq_s16(step2[2], step2[29]); out[3] = vqaddq_s16(step2[3], step2[28]); out[4] = vqaddq_s16(step2[4], step2[27]); out[5] = vqaddq_s16(step2[5], step2[26]); out[6] = vqaddq_s16(step2[6], step2[25]); out[7] = vqaddq_s16(step2[7], step2[24]); out[8] = vqaddq_s16(step2[8], step2[23]); out[9] = vqaddq_s16(step2[9], step2[22]); out[10] = vqaddq_s16(step2[10], step2[21]); out[11] = vqaddq_s16(step2[11], step2[20]); out[12] = vqaddq_s16(step2[12], 
step2[19]); out[13] = vqaddq_s16(step2[13], step2[18]); out[14] = vqaddq_s16(step2[14], step2[17]); out[15] = vqaddq_s16(step2[15], step2[16]); out[16] = vqsubq_s16(step2[15], step2[16]); out[17] = vqsubq_s16(step2[14], step2[17]); out[18] = vqsubq_s16(step2[13], step2[18]); out[19] = vqsubq_s16(step2[12], step2[19]); out[20] = vqsubq_s16(step2[11], step2[20]); out[21] = vqsubq_s16(step2[10], step2[21]); out[22] = vqsubq_s16(step2[9], step2[22]); out[23] = vqsubq_s16(step2[8], step2[23]); out[24] = vqsubq_s16(step2[7], step2[24]); out[25] = vqsubq_s16(step2[6], step2[25]); out[26] = vqsubq_s16(step2[5], step2[26]); out[27] = vqsubq_s16(step2[4], step2[27]); out[28] = vqsubq_s16(step2[3], step2[28]); out[29] = vqsubq_s16(step2[2], step2[29]); out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } static inline void idct32_low16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1[32], step2[32]; int32x4_t t32[16]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c2 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c3 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); step2[0] = in[0]; step2[2] = in[8]; step2[4] = in[4]; step2[6] = in[12]; step2[8] = in[2]; step2[10] = in[10]; step2[12] = in[6]; step2[14] = in[14]; // stage 3 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); step1[0] = step2[0]; step1[2] = step2[2]; step1[4] = step2[4]; step1[6] = step2[6]; step1[16] = vqaddq_s16(step2[16], step2[17]); step1[17] = vqsubq_s16(step2[16], step2[17]); step1[18] = vqsubq_s16(step2[19], step2[18]); step1[19] = vqaddq_s16(step2[19], step2[18]); step1[20] = vqaddq_s16(step2[20], step2[21]); step1[21] = vqsubq_s16(step2[20], step2[21]); step1[22] = vqsubq_s16(step2[23], step2[22]); step1[23] = vqaddq_s16(step2[23], step2[22]); step1[24] = vqaddq_s16(step2[24], step2[25]); step1[25] = vqsubq_s16(step2[24], step2[25]); step1[26] = vqsubq_s16(step2[27], step2[26]); step1[27] = vqaddq_s16(step2[27], step2[26]); step1[28] = vqaddq_s16(step2[28], step2[29]); step1[29] = vqsubq_s16(step2[28], step2[29]); step1[30] = vqsubq_s16(step2[31], step2[30]); step1[31] = vqaddq_s16(step2[31], step2[30]); // stage 4 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); 
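/*
 * Illustrative sketch, not part of the library: why the low-coefficient
 * paths (e.g. idct16_low8_neon, iadst16_low8_neon, idct32_low8_neon and
 * idct32_low16_neon here) can use the single-input btf_16_neon helper. When
 * the second rotation input is known to be zero because the corresponding
 * high-frequency coefficient is absent, the full butterfly
 *   t0 = round_shift(in0 * c0 + in1 * c1), t1 = round_shift(in0 * c1 - in1 * c0)
 * degenerates to two independent scalings of in0, which is what
 * btf_16_neon computes per lane. The helper name is invented for the
 * sketch; INV_COS_BIT is assumed to be 12 as in the codec headers.
 */
static void sketch_half_butterfly(int16_t in0, int16_t c0, int16_t c1,
                                  int16_t *t0, int16_t *t1) {
  const int32_t r = 1 << (12 - 1); /* rounding offset for INV_COS_BIT == 12 */
  *t0 = (int16_t)(((int32_t)in0 * c0 + r) >> 12);
  *t1 = (int16_t)(((int32_t)in0 * c1 + r) >> 12);
}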
btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); step2[0] = step1[0]; step2[2] = step1[2]; step2[8] = vqaddq_s16(step1[8], step1[9]); step2[9] = vqsubq_s16(step1[8], step1[9]); step2[10] = vqsubq_s16(step1[11], step1[10]); step2[11] = vqaddq_s16(step1[11], step1[10]); step2[12] = vqaddq_s16(step1[12], step1[13]); step2[13] = vqsubq_s16(step1[12], step1[13]); step2[14] = vqsubq_s16(step1[15], step1[14]); step2[15] = vqaddq_s16(step1[15], step1[14]); step2[16] = step1[16]; step2[19] = step1[19]; step2[20] = step1[20]; step2[23] = step1[23]; step2[24] = step1[24]; step2[27] = step1[27]; step2[28] = step1[28]; step2[31] = step1[31]; // stage 5 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); step1[4] = vqaddq_s16(step2[4], step2[5]); step1[5] = vqsubq_s16(step2[4], step2[5]); step1[6] = vqsubq_s16(step2[7], step2[6]); step1[7] = vqaddq_s16(step2[7], step2[6]); step1[8] = step2[8]; step1[11] = step2[11]; step1[12] = step2[12]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[19]); step1[17] = vqaddq_s16(step2[17], step2[18]); step1[18] = vqsubq_s16(step2[17], step2[18]); step1[19] = vqsubq_s16(step2[16], step2[19]); step1[20] = vqsubq_s16(step2[23], step2[20]); step1[21] = vqsubq_s16(step2[22], step2[21]); step1[22] = vqaddq_s16(step2[22], step2[21]); step1[23] = vqaddq_s16(step2[23], step2[20]); step1[24] = vqaddq_s16(step2[24], step2[27]); step1[25] = vqaddq_s16(step2[25], step2[26]); step1[26] = vqsubq_s16(step2[25], step2[26]); step1[27] = vqsubq_s16(step2[24], step2[27]); step1[28] = vqsubq_s16(step2[31], step2[28]); step1[29] = vqsubq_s16(step2[30], step2[29]); step1[30] = vqaddq_s16(step2[30], step2[29]); step1[31] = vqaddq_s16(step2[31], step2[28]); // stage 6 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); step2[0] = vqaddq_s16(step1[0], step1[3]); step2[1] = vqaddq_s16(step1[0], step1[2]); step2[2] = vqsubq_s16(step1[0], step1[2]); step2[3] = vqsubq_s16(step1[0], step1[3]); step2[4] = step1[4]; step2[7] = step1[7]; step2[8] = vqaddq_s16(step1[8], step1[11]); step2[9] = vqaddq_s16(step1[9], step1[10]); step2[10] = vqsubq_s16(step1[9], step1[10]); step2[11] = vqsubq_s16(step1[8], step1[11]); step2[12] = vqsubq_s16(step1[15], step1[12]); step2[13] = vqsubq_s16(step1[14], step1[13]); step2[14] = vqaddq_s16(step1[14], step1[13]); step2[15] = vqaddq_s16(step1[15], step1[12]); step2[16] = step1[16]; step2[17] = step1[17]; step2[22] = step1[22]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[25]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 7 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); step1[0] = vqaddq_s16(step2[0], 
step2[7]); step1[1] = vqaddq_s16(step2[1], step2[6]); step1[2] = vqaddq_s16(step2[2], step2[5]); step1[3] = vqaddq_s16(step2[3], step2[4]); step1[4] = vqsubq_s16(step2[3], step2[4]); step1[5] = vqsubq_s16(step2[2], step2[5]); step1[6] = vqsubq_s16(step2[1], step2[6]); step1[7] = vqsubq_s16(step2[0], step2[7]); step1[8] = step2[8]; step1[9] = step2[9]; step1[14] = step2[14]; step1[15] = step2[15]; step1[16] = vqaddq_s16(step2[16], step2[23]); step1[17] = vqaddq_s16(step2[17], step2[22]); step1[18] = vqaddq_s16(step2[18], step2[21]); step1[19] = vqaddq_s16(step2[19], step2[20]); step1[20] = vqsubq_s16(step2[19], step2[20]); step1[21] = vqsubq_s16(step2[18], step2[21]); step1[22] = vqsubq_s16(step2[17], step2[22]); step1[23] = vqsubq_s16(step2[16], step2[23]); step1[24] = vqsubq_s16(step2[31], step2[24]); step1[25] = vqsubq_s16(step2[30], step2[25]); step1[26] = vqsubq_s16(step2[29], step2[26]); step1[27] = vqsubq_s16(step2[28], step2[27]); step1[28] = vqaddq_s16(step2[27], step2[28]); step1[29] = vqaddq_s16(step2[26], step2[29]); step1[30] = vqaddq_s16(step2[25], step2[30]); step1[31] = vqaddq_s16(step2[24], step2[31]); // stage 8 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); step2[0] = vqaddq_s16(step1[0], step1[15]); step2[1] = vqaddq_s16(step1[1], step1[14]); step2[2] = vqaddq_s16(step1[2], step1[13]); step2[3] = vqaddq_s16(step1[3], step1[12]); step2[4] = vqaddq_s16(step1[4], step1[11]); step2[5] = vqaddq_s16(step1[5], step1[10]); step2[6] = vqaddq_s16(step1[6], step1[9]); step2[7] = vqaddq_s16(step1[7], step1[8]); step2[8] = vqsubq_s16(step1[7], step1[8]); step2[9] = vqsubq_s16(step1[6], step1[9]); step2[10] = vqsubq_s16(step1[5], step1[10]); step2[11] = vqsubq_s16(step1[4], step1[11]); step2[12] = vqsubq_s16(step1[3], step1[12]); step2[13] = vqsubq_s16(step1[2], step1[13]); step2[14] = vqsubq_s16(step1[1], step1[14]); step2[15] = vqsubq_s16(step1[0], step1[15]); step2[16] = step1[16]; step2[17] = step1[17]; step2[18] = step1[18]; step2[19] = step1[19]; step2[28] = step1[28]; step2[29] = step1[29]; step2[30] = step1[30]; step2[31] = step1[31]; // stage 9 out[0] = vqaddq_s16(step2[0], step2[31]); out[1] = vqaddq_s16(step2[1], step2[30]); out[2] = vqaddq_s16(step2[2], step2[29]); out[3] = vqaddq_s16(step2[3], step2[28]); out[4] = vqaddq_s16(step2[4], step2[27]); out[5] = vqaddq_s16(step2[5], step2[26]); out[6] = vqaddq_s16(step2[6], step2[25]); out[7] = vqaddq_s16(step2[7], step2[24]); out[8] = vqaddq_s16(step2[8], step2[23]); out[9] = vqaddq_s16(step2[9], step2[22]); out[10] = vqaddq_s16(step2[10], step2[21]); out[11] = vqaddq_s16(step2[11], step2[20]); out[12] = vqaddq_s16(step2[12], step2[19]); out[13] = vqaddq_s16(step2[13], step2[18]); out[14] = vqaddq_s16(step2[14], step2[17]); out[15] = vqaddq_s16(step2[15], step2[16]); out[16] = vqsubq_s16(step2[15], step2[16]); out[17] = vqsubq_s16(step2[14], step2[17]); out[18] = vqsubq_s16(step2[13], step2[18]); out[19] = vqsubq_s16(step2[12], step2[19]); out[20] = vqsubq_s16(step2[11], step2[20]); out[21] = vqsubq_s16(step2[10], step2[21]); out[22] = vqsubq_s16(step2[9], step2[22]); out[23] = vqsubq_s16(step2[8], step2[23]); out[24] = vqsubq_s16(step2[7], step2[24]); out[25] = vqsubq_s16(step2[6], step2[25]); out[26] = vqsubq_s16(step2[5], step2[26]); out[27] = vqsubq_s16(step2[4], step2[27]); out[28] = 
vqsubq_s16(step2[3], step2[28]); out[29] = vqsubq_s16(step2[2], step2[29]); out[30] = vqsubq_s16(step2[1], step2[30]); out[31] = vqsubq_s16(step2[0], step2[31]); } static inline void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); step1[0] = vqaddq_s16(step2[0], step2[15]); step1[1] = vqaddq_s16(step2[1], step2[14]); step1[2] = vqaddq_s16(step2[2], step2[13]); step1[3] = vqaddq_s16(step2[3], step2[12]); step1[4] = vqaddq_s16(step2[4], step2[11]); step1[5] = vqaddq_s16(step2[5], step2[10]); step1[6] = vqaddq_s16(step2[6], step2[9]); step1[7] = vqaddq_s16(step2[7], step2[8]); step1[8] = vqsubq_s16(step2[7], step2[8]); step1[9] = vqsubq_s16(step2[6], step2[9]); step1[10] = vqsubq_s16(step2[5], step2[10]); step1[11] = vqsubq_s16(step2[4], step2[11]); step1[12] = vqsubq_s16(step2[3], step2[12]); step1[13] = vqsubq_s16(step2[2], step2[13]); step1[14] = vqsubq_s16(step2[1], step2[14]); step1[15] = vqsubq_s16(step2[0], step2[15]); step1[16] = step2[16]; step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[47]); step1[33] = vqaddq_s16(step2[33], step2[46]); step1[34] = vqaddq_s16(step2[34], step2[45]); step1[35] = vqaddq_s16(step2[35], step2[44]); step1[36] = vqaddq_s16(step2[36], step2[43]); step1[37] = vqaddq_s16(step2[37], step2[42]); step1[38] = vqaddq_s16(step2[38], step2[41]); step1[39] = vqaddq_s16(step2[39], step2[40]); step1[40] = vqsubq_s16(step2[39], step2[40]); step1[41] = vqsubq_s16(step2[38], step2[41]); step1[42] = vqsubq_s16(step2[37], step2[42]); step1[43] = vqsubq_s16(step2[36], step2[43]); step1[44] = vqsubq_s16(step2[35], step2[44]); step1[45] = vqsubq_s16(step2[34], step2[45]); step1[46] = vqsubq_s16(step2[33], step2[46]); step1[47] = vqsubq_s16(step2[32], step2[47]); step1[48] = vqsubq_s16(step2[63], step2[48]); step1[49] = vqsubq_s16(step2[62], step2[49]); step1[50] = vqsubq_s16(step2[61], step2[50]); step1[51] = vqsubq_s16(step2[60], step2[51]); step1[52] = vqsubq_s16(step2[59], step2[52]); step1[53] = vqsubq_s16(step2[58], step2[53]); step1[54] = vqsubq_s16(step2[57], step2[54]); step1[55] = vqsubq_s16(step2[56], step2[55]); step1[56] = vqaddq_s16(step2[56], step2[55]); step1[57] = vqaddq_s16(step2[57], step2[54]); step1[58] = vqaddq_s16(step2[58], step2[53]); step1[59] = vqaddq_s16(step2[59], step2[52]); step1[60] = vqaddq_s16(step2[60], step2[51]); step1[61] = vqaddq_s16(step2[61], step2[50]); step1[62] = vqaddq_s16(step2[62], step2[49]); step1[63] = vqaddq_s16(step2[63], step2[48]); } static inline void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); 
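// stage 10 (cont.): the remaining cospi[32] rotations pair rows (52,43), (51,44), (50,45), (49,46) and (48,47); lanes 0 and 1 of c3 both hold cospi[32].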
btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); step2[0] = vqaddq_s16(step1[0], step1[31]); step2[1] = vqaddq_s16(step1[1], step1[30]); step2[2] = vqaddq_s16(step1[2], step1[29]); step2[3] = vqaddq_s16(step1[3], step1[28]); step2[4] = vqaddq_s16(step1[4], step1[27]); step2[5] = vqaddq_s16(step1[5], step1[26]); step2[6] = vqaddq_s16(step1[6], step1[25]); step2[7] = vqaddq_s16(step1[7], step1[24]); step2[8] = vqaddq_s16(step1[8], step1[23]); step2[9] = vqaddq_s16(step1[9], step1[22]); step2[10] = vqaddq_s16(step1[10], step1[21]); step2[11] = vqaddq_s16(step1[11], step1[20]); step2[12] = vqaddq_s16(step1[12], step1[19]); step2[13] = vqaddq_s16(step1[13], step1[18]); step2[14] = vqaddq_s16(step1[14], step1[17]); step2[15] = vqaddq_s16(step1[15], step1[16]); step2[16] = vqsubq_s16(step1[15], step1[16]); step2[17] = vqsubq_s16(step1[14], step1[17]); step2[18] = vqsubq_s16(step1[13], step1[18]); step2[19] = vqsubq_s16(step1[12], step1[19]); step2[20] = vqsubq_s16(step1[11], step1[20]); step2[21] = vqsubq_s16(step1[10], step1[21]); step2[22] = vqsubq_s16(step1[9], step1[22]); step2[23] = vqsubq_s16(step1[8], step1[23]); step2[24] = vqsubq_s16(step1[7], step1[24]); step2[25] = vqsubq_s16(step1[6], step1[25]); step2[26] = vqsubq_s16(step1[5], step1[26]); step2[27] = vqsubq_s16(step1[4], step1[27]); step2[28] = vqsubq_s16(step1[3], step1[28]); step2[29] = vqsubq_s16(step1[2], step1[29]); step2[30] = vqsubq_s16(step1[1], step1[30]); step2[31] = vqsubq_s16(step1[0], step1[31]); step2[32] = step1[32]; step2[33] = step1[33]; step2[34] = step1[34]; step2[35] = step1[35]; step2[36] = step1[36]; step2[37] = step1[37]; step2[38] = step1[38]; step2[39] = step1[39]; step2[56] = step1[56]; step2[57] = step1[57]; step2[58] = step1[58]; step2[59] = step1[59]; step2[60] = step1[60]; step2[61] = step1[61]; step2[62] = step1[62]; step2[63] = step1[63]; } static inline void idct64_low32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[36], (int16_t)cospi[28]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c4 = set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); const int16x4_t c5 = set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); const int16x4_t c6 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c7 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 step2[0] = in[0]; step2[2] = in[16]; step2[4] = in[8]; step2[6] = in[24]; step2[8] = in[4]; step2[10] = in[20]; step2[12] = in[12]; step2[14] = in[28]; step2[16] = in[2]; step2[18] = in[18]; step2[20] = in[10]; 
step2[22] = in[26]; step2[24] = in[6]; step2[26] = in[22]; step2[28] = in[14]; step2[30] = in[30]; btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); // stage 3 step1[0] = step2[0]; step1[2] = step2[2]; step1[4] = step2[4]; step1[6] = step2[6]; step1[8] = step2[8]; step1[10] = step2[10]; step1[12] = step2[12]; step1[14] = step2[14]; btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); step1[32] = vqaddq_s16(step2[32], step2[33]); step1[33] = vqsubq_s16(step2[32], step2[33]); step1[34] = vqsubq_s16(step2[35], step2[34]); step1[35] = vqaddq_s16(step2[35], step2[34]); step1[36] = vqaddq_s16(step2[36], step2[37]); step1[37] = vqsubq_s16(step2[36], step2[37]); step1[38] = vqsubq_s16(step2[39], step2[38]); step1[39] = vqaddq_s16(step2[39], step2[38]); step1[40] = vqaddq_s16(step2[40], step2[41]); step1[41] = vqsubq_s16(step2[40], step2[41]); step1[42] = vqsubq_s16(step2[43], step2[42]); step1[43] = vqaddq_s16(step2[43], step2[42]); step1[44] = vqaddq_s16(step2[44], step2[45]); step1[45] = vqsubq_s16(step2[44], step2[45]); step1[46] = vqsubq_s16(step2[47], step2[46]); step1[47] = vqaddq_s16(step2[47], step2[46]); step1[48] = vqaddq_s16(step2[48], step2[49]); step1[49] = vqsubq_s16(step2[48], step2[49]); step1[50] = vqsubq_s16(step2[51], step2[50]); step1[51] = vqaddq_s16(step2[51], step2[50]); step1[52] = vqaddq_s16(step2[52], step2[53]); step1[53] = vqsubq_s16(step2[52], step2[53]); step1[54] = vqsubq_s16(step2[55], step2[54]); step1[55] = vqaddq_s16(step2[55], step2[54]); step1[56] = vqaddq_s16(step2[56], step2[57]); step1[57] = vqsubq_s16(step2[56], step2[57]); step1[58] = vqsubq_s16(step2[59], step2[58]); step1[59] = vqaddq_s16(step2[59], step2[58]); step1[60] = vqaddq_s16(step2[60], step2[61]); step1[61] = vqsubq_s16(step2[60], step2[61]); step1[62] = vqsubq_s16(step2[63], step2[62]); step1[63] = vqaddq_s16(step2[63], step2[62]); // stage 4 step2[0] = step1[0]; step2[2] = step1[2]; step2[4] = step1[4]; step2[6] = step1[6]; btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); 
btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); step2[16] = vqaddq_s16(step1[16], step1[17]); step2[17] = vqsubq_s16(step1[16], step1[17]); step2[18] = vqsubq_s16(step1[19], step1[18]); step2[19] = vqaddq_s16(step1[19], step1[18]); step2[20] = vqaddq_s16(step1[20], step1[21]); step2[21] = vqsubq_s16(step1[20], step1[21]); step2[22] = vqsubq_s16(step1[23], step1[22]); step2[23] = vqaddq_s16(step1[23], step1[22]); step2[24] = vqaddq_s16(step1[24], step1[25]); step2[25] = vqsubq_s16(step1[24], step1[25]); step2[26] = vqsubq_s16(step1[27], step1[26]); step2[27] = vqaddq_s16(step1[27], step1[26]); step2[28] = vqaddq_s16(step1[28], step1[29]); step2[29] = vqsubq_s16(step1[28], step1[29]); step2[30] = vqsubq_s16(step1[31], step1[30]); step2[31] = vqaddq_s16(step1[31], step1[30]); step2[32] = step1[32]; step2[35] = step1[35]; step2[36] = step1[36]; step2[39] = step1[39]; step2[40] = step1[40]; step2[43] = step1[43]; step2[44] = step1[44]; step2[47] = step1[47]; step2[48] = step1[48]; step2[51] = step1[51]; step2[52] = step1[52]; step2[55] = step1[55]; step2[56] = step1[56]; step2[59] = step1[59]; step2[60] = step1[60]; step2[63] = step1[63]; // stage 5 step1[0] = step2[0]; step1[2] = step2[2]; btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); step1[8] = vqaddq_s16(step2[8], step2[9]); step1[9] = vqsubq_s16(step2[8], step2[9]); step1[10] = vqsubq_s16(step2[11], step2[10]); step1[11] = vqaddq_s16(step2[11], step2[10]); step1[12] = vqaddq_s16(step2[12], step2[13]); step1[13] = vqsubq_s16(step2[12], step2[13]); step1[14] = vqsubq_s16(step2[15], step2[14]); step1[15] = vqaddq_s16(step2[15], step2[14]); step1[16] = step2[16]; step1[19] = step2[19]; step1[20] = step2[20]; step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; step1[28] = step2[28]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[35]); step1[33] = vqaddq_s16(step2[33], step2[34]); step1[34] = vqsubq_s16(step2[33], step2[34]); step1[35] = vqsubq_s16(step2[32], step2[35]); step1[36] = vqsubq_s16(step2[39], step2[36]); step1[37] = vqsubq_s16(step2[38], step2[37]); step1[38] = vqaddq_s16(step2[38], step2[37]); step1[39] = vqaddq_s16(step2[39], step2[36]); step1[40] = vqaddq_s16(step2[40], step2[43]); step1[41] = vqaddq_s16(step2[41], step2[42]); step1[42] = vqsubq_s16(step2[41], step2[42]); step1[43] = vqsubq_s16(step2[40], step2[43]); step1[44] = vqsubq_s16(step2[47], step2[44]); step1[45] = vqsubq_s16(step2[46], 
step2[45]); step1[46] = vqaddq_s16(step2[46], step2[45]); step1[47] = vqaddq_s16(step2[47], step2[44]); step1[48] = vqaddq_s16(step2[48], step2[51]); step1[49] = vqaddq_s16(step2[49], step2[50]); step1[50] = vqsubq_s16(step2[49], step2[50]); step1[51] = vqsubq_s16(step2[48], step2[51]); step1[52] = vqsubq_s16(step2[55], step2[52]); step1[53] = vqsubq_s16(step2[54], step2[53]); step1[54] = vqaddq_s16(step2[54], step2[53]); step1[55] = vqaddq_s16(step2[55], step2[52]); step1[56] = vqaddq_s16(step2[56], step2[59]); step1[57] = vqaddq_s16(step2[57], step2[58]); step1[58] = vqsubq_s16(step2[57], step2[58]); step1[59] = vqsubq_s16(step2[56], step2[59]); step1[60] = vqsubq_s16(step2[63], step2[60]); step1[61] = vqsubq_s16(step2[62], step2[61]); step1[62] = vqaddq_s16(step2[62], step2[61]); step1[63] = vqaddq_s16(step2[63], step2[60]); // stage 6 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); step2[4] = vqaddq_s16(step1[4], step1[5]); step2[5] = vqsubq_s16(step1[4], step1[5]); step2[6] = vqsubq_s16(step1[7], step1[6]); step2[7] = vqaddq_s16(step1[7], step1[6]); step2[8] = step1[8]; step2[11] = step1[11]; step2[12] = step1[12]; step2[15] = step1[15]; step2[16] = vqaddq_s16(step1[16], step1[19]); step2[17] = vqaddq_s16(step1[17], step1[18]); step2[18] = vqsubq_s16(step1[17], step1[18]); step2[19] = vqsubq_s16(step1[16], step1[19]); step2[20] = vqsubq_s16(step1[23], step1[20]); step2[21] = vqsubq_s16(step1[22], step1[21]); step2[22] = vqaddq_s16(step1[22], step1[21]); step2[23] = vqaddq_s16(step1[23], step1[20]); step2[24] = vqaddq_s16(step1[24], step1[27]); step2[25] = vqaddq_s16(step1[25], step1[26]); step2[26] = vqsubq_s16(step1[25], step1[26]); step2[27] = vqsubq_s16(step1[24], step1[27]); step2[28] = vqsubq_s16(step1[31], step1[28]); step2[29] = vqsubq_s16(step1[30], step1[29]); step2[30] = vqaddq_s16(step1[30], step1[29]); step2[31] = vqaddq_s16(step1[31], step1[28]); step2[32] = step1[32]; step2[33] = step1[33]; step2[38] = step1[38]; step2[39] = step1[39]; step2[40] = step1[40]; step2[41] = step1[41]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[54] = step1[54]; step2[55] = step1[55]; step2[56] = step1[56]; step2[57] = step1[57]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 7 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); step1[0] = vqaddq_s16(step2[0], step2[3]); step1[1] = vqaddq_s16(step2[1], step2[2]); step1[2] = vqsubq_s16(step2[1], 
step2[2]); step1[3] = vqsubq_s16(step2[0], step2[3]); step1[4] = step2[4]; step1[7] = step2[7]; step1[8] = vqaddq_s16(step2[8], step2[11]); step1[9] = vqaddq_s16(step2[9], step2[10]); step1[10] = vqsubq_s16(step2[9], step2[10]); step1[11] = vqsubq_s16(step2[8], step2[11]); step1[12] = vqsubq_s16(step2[15], step2[12]); step1[13] = vqsubq_s16(step2[14], step2[13]); step1[14] = vqaddq_s16(step2[14], step2[13]); step1[15] = vqaddq_s16(step2[15], step2[12]); step1[16] = step2[16]; step1[17] = step2[17]; step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; step1[25] = step2[25]; step1[30] = step2[30]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[39]); step1[33] = vqaddq_s16(step2[33], step2[38]); step1[34] = vqaddq_s16(step2[34], step2[37]); step1[35] = vqaddq_s16(step2[35], step2[36]); step1[36] = vqsubq_s16(step2[35], step2[36]); step1[37] = vqsubq_s16(step2[34], step2[37]); step1[38] = vqsubq_s16(step2[33], step2[38]); step1[39] = vqsubq_s16(step2[32], step2[39]); step1[40] = vqsubq_s16(step2[47], step2[40]); step1[41] = vqsubq_s16(step2[46], step2[41]); step1[42] = vqsubq_s16(step2[45], step2[42]); step1[43] = vqsubq_s16(step2[44], step2[43]); step1[44] = vqaddq_s16(step2[43], step2[44]); step1[45] = vqaddq_s16(step2[42], step2[45]); step1[46] = vqaddq_s16(step2[41], step2[46]); step1[47] = vqaddq_s16(step2[40], step2[47]); step1[48] = vqaddq_s16(step2[48], step2[55]); step1[49] = vqaddq_s16(step2[49], step2[54]); step1[50] = vqaddq_s16(step2[50], step2[53]); step1[51] = vqaddq_s16(step2[51], step2[52]); step1[52] = vqsubq_s16(step2[51], step2[52]); step1[53] = vqsubq_s16(step2[50], step2[53]); step1[54] = vqsubq_s16(step2[49], step2[54]); step1[55] = vqsubq_s16(step2[48], step2[55]); step1[56] = vqsubq_s16(step2[63], step2[56]); step1[57] = vqsubq_s16(step2[62], step2[57]); step1[58] = vqsubq_s16(step2[61], step2[58]); step1[59] = vqsubq_s16(step2[60], step2[59]); step1[60] = vqaddq_s16(step2[59], step2[60]); step1[61] = vqaddq_s16(step2[58], step2[61]); step1[62] = vqaddq_s16(step2[57], step2[62]); step1[63] = vqaddq_s16(step2[56], step2[63]); // stage 8 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); step2[0] = vqaddq_s16(step1[0], step1[7]); step2[1] = vqaddq_s16(step1[1], step1[6]); step2[2] = vqaddq_s16(step1[2], step1[5]); step2[3] = vqaddq_s16(step1[3], step1[4]); step2[4] = vqsubq_s16(step1[3], step1[4]); step2[5] = vqsubq_s16(step1[2], step1[5]); step2[6] = vqsubq_s16(step1[1], step1[6]); step2[7] = vqsubq_s16(step1[0], step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; step2[14] = step1[14]; step2[15] = step1[15]; step2[16] = vqaddq_s16(step1[16], step1[23]); step2[17] = vqaddq_s16(step1[17], step1[22]); step2[18] = vqaddq_s16(step1[18], step1[21]); step2[19] = vqaddq_s16(step1[19], step1[20]); step2[20] = vqsubq_s16(step1[19], step1[20]); step2[21] = vqsubq_s16(step1[18], step1[21]); 
step2[22] = vqsubq_s16(step1[17], step1[22]); step2[23] = vqsubq_s16(step1[16], step1[23]); step2[24] = vqsubq_s16(step1[31], step1[24]); step2[25] = vqsubq_s16(step1[30], step1[25]); step2[26] = vqsubq_s16(step1[29], step1[26]); step2[27] = vqsubq_s16(step1[28], step1[27]); step2[28] = vqaddq_s16(step1[28], step1[27]); step2[29] = vqaddq_s16(step1[29], step1[26]); step2[30] = vqaddq_s16(step1[30], step1[25]); step2[31] = vqaddq_s16(step1[31], step1[24]); step2[32] = step1[32]; step2[33] = step1[33]; step2[34] = step1[34]; step2[35] = step1[35]; step2[44] = step1[44]; step2[45] = step1[45]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[50] = step1[50]; step2[51] = step1[51]; step2[60] = step1[60]; step2[61] = step1[61]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 9 idct64_stage9_neon(step2, step1, cos_bit); // stage 10 idct64_stage10_neon(step1, step2, cos_bit); // stage 11 out[0] = vqaddq_s16(step2[0], step2[63]); out[1] = vqaddq_s16(step2[1], step2[62]); out[2] = vqaddq_s16(step2[2], step2[61]); out[3] = vqaddq_s16(step2[3], step2[60]); out[4] = vqaddq_s16(step2[4], step2[59]); out[5] = vqaddq_s16(step2[5], step2[58]); out[6] = vqaddq_s16(step2[6], step2[57]); out[7] = vqaddq_s16(step2[7], step2[56]); out[8] = vqaddq_s16(step2[8], step2[55]); out[9] = vqaddq_s16(step2[9], step2[54]); out[10] = vqaddq_s16(step2[10], step2[53]); out[11] = vqaddq_s16(step2[11], step2[52]); out[12] = vqaddq_s16(step2[12], step2[51]); out[13] = vqaddq_s16(step2[13], step2[50]); out[14] = vqaddq_s16(step2[14], step2[49]); out[15] = vqaddq_s16(step2[15], step2[48]); out[16] = vqaddq_s16(step2[16], step2[47]); out[17] = vqaddq_s16(step2[17], step2[46]); out[18] = vqaddq_s16(step2[18], step2[45]); out[19] = vqaddq_s16(step2[19], step2[44]); out[20] = vqaddq_s16(step2[20], step2[43]); out[21] = vqaddq_s16(step2[21], step2[42]); out[22] = vqaddq_s16(step2[22], step2[41]); out[23] = vqaddq_s16(step2[23], step2[40]); out[24] = vqaddq_s16(step2[24], step2[39]); out[25] = vqaddq_s16(step2[25], step2[38]); out[26] = vqaddq_s16(step2[26], step2[37]); out[27] = vqaddq_s16(step2[27], step2[36]); out[28] = vqaddq_s16(step2[28], step2[35]); out[29] = vqaddq_s16(step2[29], step2[34]); out[30] = vqaddq_s16(step2[30], step2[33]); out[31] = vqaddq_s16(step2[31], step2[32]); out[32] = vqsubq_s16(step2[31], step2[32]); out[33] = vqsubq_s16(step2[30], step2[33]); out[34] = vqsubq_s16(step2[29], step2[34]); out[35] = vqsubq_s16(step2[28], step2[35]); out[36] = vqsubq_s16(step2[27], step2[36]); out[37] = vqsubq_s16(step2[26], step2[37]); out[38] = vqsubq_s16(step2[25], step2[38]); out[39] = vqsubq_s16(step2[24], step2[39]); out[40] = vqsubq_s16(step2[23], step2[40]); out[41] = vqsubq_s16(step2[22], step2[41]); out[42] = vqsubq_s16(step2[21], step2[42]); out[43] = vqsubq_s16(step2[20], step2[43]); out[44] = vqsubq_s16(step2[19], step2[44]); out[45] = vqsubq_s16(step2[18], step2[45]); out[46] = vqsubq_s16(step2[17], step2[46]); out[47] = vqsubq_s16(step2[16], step2[47]); out[48] = vqsubq_s16(step2[15], step2[48]); out[49] = vqsubq_s16(step2[14], step2[49]); out[50] = vqsubq_s16(step2[13], step2[50]); out[51] = vqsubq_s16(step2[12], step2[51]); out[52] = vqsubq_s16(step2[11], step2[52]); out[53] = vqsubq_s16(step2[10], step2[53]); out[54] = vqsubq_s16(step2[9], step2[54]); out[55] = vqsubq_s16(step2[8], step2[55]); out[56] = vqsubq_s16(step2[7], step2[56]); out[57] = vqsubq_s16(step2[6], step2[57]); out[58] = vqsubq_s16(step2[5], step2[58]); out[59] = vqsubq_s16(step2[4], 
step2[59]); out[60] = vqsubq_s16(step2[3], step2[60]); out[61] = vqsubq_s16(step2[2], step2[61]); out[62] = vqsubq_s16(step2[1], step2[62]); out[63] = vqsubq_s16(step2[0], step2[63]); } static inline void idct64_low1_neon(int16x8_t *input, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step1; int32x4_t t32[2]; // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), vrshrn_n_s32(t32[1], INV_COS_BIT)); // stage 7 // stage 8 // stage 9 // stage 10 // stage 11 out[0] = step1; out[1] = step1; out[2] = step1; out[3] = step1; out[4] = step1; out[5] = step1; out[6] = step1; out[7] = step1; out[8] = step1; out[9] = step1; out[10] = step1; out[11] = step1; out[12] = step1; out[13] = step1; out[14] = step1; out[15] = step1; out[16] = step1; out[17] = step1; out[18] = step1; out[19] = step1; out[20] = step1; out[21] = step1; out[22] = step1; out[23] = step1; out[24] = step1; out[25] = step1; out[26] = step1; out[27] = step1; out[28] = step1; out[29] = step1; out[30] = step1; out[31] = step1; out[32] = step1; out[33] = step1; out[34] = step1; out[35] = step1; out[36] = step1; out[37] = step1; out[38] = step1; out[39] = step1; out[40] = step1; out[41] = step1; out[42] = step1; out[43] = step1; out[44] = step1; out[45] = step1; out[46] = step1; out[47] = step1; out[48] = step1; out[49] = step1; out[50] = step1; out[51] = step1; out[52] = step1; out[53] = step1; out[54] = step1; out[55] = step1; out[56] = step1; out[57] = step1; out[58] = step1; out[59] = step1; out[60] = step1; out[61] = step1; out[62] = step1; out[63] = step1; } static inline void idct64_low8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[36], (int16_t)cospi[28]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c4 = set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); const int16x4_t c5 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c6 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 step2[0] = in[0]; step2[8] = in[4]; step2[16] = in[2]; step2[24] = in[6]; btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); // stage 3 step1[0] = step2[0]; step1[8] = step2[8]; btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); step1[32] = step2[32]; step1[33] = step2[32]; step1[38] = step2[39]; step1[39] = step2[39]; step1[40] = step2[40]; step1[41] = step2[40]; step1[46] = step2[47]; step1[47] = step2[47]; step1[48] = step2[48]; step1[49] = step2[48]; 
step1[54] = step2[55]; step1[55] = step2[55]; step1[56] = step2[56]; step1[57] = step2[56]; step1[62] = step2[63]; step1[63] = step2[63]; // stage 4 step2[0] = step1[0]; btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); step2[16] = step1[16]; step2[17] = step1[16]; step2[22] = step1[23]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[24]; step2[30] = step1[31]; step2[31] = step1[31]; step2[32] = step1[32]; step2[39] = step1[39]; step2[40] = step1[40]; step2[47] = step1[47]; step2[48] = step1[48]; step2[55] = step1[55]; step2[56] = step1[56]; step2[63] = step1[63]; // stage 5 step1[0] = step2[0]; btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); step1[8] = step2[8]; step1[9] = step2[8]; step1[14] = step2[15]; step1[15] = step2[15]; step1[16] = step2[16]; step1[23] = step2[23]; step1[24] = step2[24]; step1[31] = step2[31]; step1[32] = step2[32]; step1[33] = step2[33]; step1[34] = step2[33]; step1[35] = step2[32]; step1[36] = step2[39]; step1[37] = step2[38]; step1[38] = step2[38]; step1[39] = step2[39]; step1[40] = step2[40]; step1[41] = step2[41]; step1[42] = step2[41]; step1[43] = step2[40]; step1[44] = step2[47]; step1[45] = step2[46]; step1[46] = step2[46]; step1[47] = step2[47]; step1[48] = step2[48]; step1[49] = step2[49]; step1[50] = step2[49]; step1[51] = step2[48]; step1[52] = step2[55]; step1[53] = step2[54]; step1[54] = step2[54]; step1[55] = step2[55]; step1[56] = step2[56]; step1[57] = step2[57]; step1[58] = step2[57]; step1[59] = step2[56]; step1[60] = step2[63]; step1[61] = step2[62]; step1[62] = step2[62]; step1[63] = step2[63]; // stage 6 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); step2[8] = step1[8]; step2[15] = step1[15]; step2[16] = step1[16]; step2[17] = step1[17]; step2[18] = step1[17]; step2[19] = step1[16]; step2[20] = step1[23]; step2[21] = step1[22]; step2[22] = step1[22]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[25]; step2[26] = step1[25]; step2[27] = step1[24]; step2[28] = step1[31]; step2[29] = step1[30]; step2[30] = step1[30]; step2[31] = step1[31]; step2[32] = step1[32]; step2[33] = step1[33]; step2[38] = step1[38]; step2[39] = step1[39]; step2[40] = step1[40]; step2[41] = step1[41]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[54] = step1[54]; step2[55] = step1[55]; step2[56] = step1[56]; step2[57] = step1[57]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 7 btf_16_lane_2_3_neon(step2[29], step2[18], c3, 
&step1[29], &step1[18]); btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); step1[0] = step2[0]; step1[1] = step2[1]; step1[2] = step2[1]; step1[3] = step2[0]; step1[8] = step2[8]; step1[9] = step2[9]; step1[10] = step2[9]; step1[11] = step2[8]; step1[12] = step2[15]; step1[13] = step2[14]; step1[14] = step2[14]; step1[15] = step2[15]; step1[16] = step2[16]; step1[17] = step2[17]; step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; step1[25] = step2[25]; step1[30] = step2[30]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[39]); step1[33] = vqaddq_s16(step2[33], step2[38]); step1[34] = vqaddq_s16(step2[34], step2[37]); step1[35] = vqaddq_s16(step2[35], step2[36]); step1[36] = vqsubq_s16(step2[35], step2[36]); step1[37] = vqsubq_s16(step2[34], step2[37]); step1[38] = vqsubq_s16(step2[33], step2[38]); step1[39] = vqsubq_s16(step2[32], step2[39]); step1[40] = vqsubq_s16(step2[47], step2[40]); step1[41] = vqsubq_s16(step2[46], step2[41]); step1[42] = vqsubq_s16(step2[45], step2[42]); step1[43] = vqsubq_s16(step2[44], step2[43]); step1[44] = vqaddq_s16(step2[43], step2[44]); step1[45] = vqaddq_s16(step2[42], step2[45]); step1[46] = vqaddq_s16(step2[41], step2[46]); step1[47] = vqaddq_s16(step2[40], step2[47]); step1[48] = vqaddq_s16(step2[48], step2[55]); step1[49] = vqaddq_s16(step2[49], step2[54]); step1[50] = vqaddq_s16(step2[50], step2[53]); step1[51] = vqaddq_s16(step2[51], step2[52]); step1[52] = vqsubq_s16(step2[51], step2[52]); step1[53] = vqsubq_s16(step2[50], step2[53]); step1[54] = vqsubq_s16(step2[49], step2[54]); step1[55] = vqsubq_s16(step2[48], step2[55]); step1[56] = vqsubq_s16(step2[63], step2[56]); step1[57] = vqsubq_s16(step2[62], step2[57]); step1[58] = vqsubq_s16(step2[61], step2[58]); step1[59] = vqsubq_s16(step2[60], step2[59]); step1[60] = vqaddq_s16(step2[59], step2[60]); step1[61] = vqaddq_s16(step2[58], step2[61]); step1[62] = vqaddq_s16(step2[57], step2[62]); step1[63] = vqaddq_s16(step2[56], step2[63]); // stage 8 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); step2[0] = step1[0]; step2[1] = step1[1]; step2[2] = step1[2]; step2[3] = step1[3]; step2[4] = step1[3]; step2[5] = step1[2]; step2[6] = step1[1]; step2[7] = step1[0]; step2[8] = step1[8]; step2[9] = step1[9]; step2[14] = step1[14]; step2[15] = step1[15]; step2[16] = vqaddq_s16(step1[16], step1[23]); step2[17] = vqaddq_s16(step1[17], step1[22]); step2[18] = vqaddq_s16(step1[18], step1[21]); step2[19] = vqaddq_s16(step1[19], step1[20]); step2[20] = vqsubq_s16(step1[19], step1[20]); step2[21] = vqsubq_s16(step1[18], step1[21]); step2[22] = vqsubq_s16(step1[17], step1[22]); step2[23] = vqsubq_s16(step1[16], step1[23]); step2[24] = vqsubq_s16(step1[31], 
step1[24]); step2[25] = vqsubq_s16(step1[30], step1[25]); step2[26] = vqsubq_s16(step1[29], step1[26]); step2[27] = vqsubq_s16(step1[28], step1[27]); step2[28] = vqaddq_s16(step1[28], step1[27]); step2[29] = vqaddq_s16(step1[29], step1[26]); step2[30] = vqaddq_s16(step1[30], step1[25]); step2[31] = vqaddq_s16(step1[31], step1[24]); step2[32] = step1[32]; step2[33] = step1[33]; step2[34] = step1[34]; step2[35] = step1[35]; step2[44] = step1[44]; step2[45] = step1[45]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[50] = step1[50]; step2[51] = step1[51]; step2[60] = step1[60]; step2[61] = step1[61]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 9 idct64_stage9_neon(step2, step1, cos_bit); // stage 10 idct64_stage10_neon(step1, step2, cos_bit); // stage 11 out[0] = vqaddq_s16(step2[0], step2[63]); out[1] = vqaddq_s16(step2[1], step2[62]); out[2] = vqaddq_s16(step2[2], step2[61]); out[3] = vqaddq_s16(step2[3], step2[60]); out[4] = vqaddq_s16(step2[4], step2[59]); out[5] = vqaddq_s16(step2[5], step2[58]); out[6] = vqaddq_s16(step2[6], step2[57]); out[7] = vqaddq_s16(step2[7], step2[56]); out[8] = vqaddq_s16(step2[8], step2[55]); out[9] = vqaddq_s16(step2[9], step2[54]); out[10] = vqaddq_s16(step2[10], step2[53]); out[11] = vqaddq_s16(step2[11], step2[52]); out[12] = vqaddq_s16(step2[12], step2[51]); out[13] = vqaddq_s16(step2[13], step2[50]); out[14] = vqaddq_s16(step2[14], step2[49]); out[15] = vqaddq_s16(step2[15], step2[48]); out[16] = vqaddq_s16(step2[16], step2[47]); out[17] = vqaddq_s16(step2[17], step2[46]); out[18] = vqaddq_s16(step2[18], step2[45]); out[19] = vqaddq_s16(step2[19], step2[44]); out[20] = vqaddq_s16(step2[20], step2[43]); out[21] = vqaddq_s16(step2[21], step2[42]); out[22] = vqaddq_s16(step2[22], step2[41]); out[23] = vqaddq_s16(step2[23], step2[40]); out[24] = vqaddq_s16(step2[24], step2[39]); out[25] = vqaddq_s16(step2[25], step2[38]); out[26] = vqaddq_s16(step2[26], step2[37]); out[27] = vqaddq_s16(step2[27], step2[36]); out[28] = vqaddq_s16(step2[28], step2[35]); out[29] = vqaddq_s16(step2[29], step2[34]); out[30] = vqaddq_s16(step2[30], step2[33]); out[31] = vqaddq_s16(step2[31], step2[32]); out[32] = vqsubq_s16(step2[31], step2[32]); out[33] = vqsubq_s16(step2[30], step2[33]); out[34] = vqsubq_s16(step2[29], step2[34]); out[35] = vqsubq_s16(step2[28], step2[35]); out[36] = vqsubq_s16(step2[27], step2[36]); out[37] = vqsubq_s16(step2[26], step2[37]); out[38] = vqsubq_s16(step2[25], step2[38]); out[39] = vqsubq_s16(step2[24], step2[39]); out[40] = vqsubq_s16(step2[23], step2[40]); out[41] = vqsubq_s16(step2[22], step2[41]); out[42] = vqsubq_s16(step2[21], step2[42]); out[43] = vqsubq_s16(step2[20], step2[43]); out[44] = vqsubq_s16(step2[19], step2[44]); out[45] = vqsubq_s16(step2[18], step2[45]); out[46] = vqsubq_s16(step2[17], step2[46]); out[47] = vqsubq_s16(step2[16], step2[47]); out[48] = vqsubq_s16(step2[15], step2[48]); out[49] = vqsubq_s16(step2[14], step2[49]); out[50] = vqsubq_s16(step2[13], step2[50]); out[51] = vqsubq_s16(step2[12], step2[51]); out[52] = vqsubq_s16(step2[11], step2[52]); out[53] = vqsubq_s16(step2[10], step2[53]); out[54] = vqsubq_s16(step2[9], step2[54]); out[55] = vqsubq_s16(step2[8], step2[55]); out[56] = vqsubq_s16(step2[7], step2[56]); out[57] = vqsubq_s16(step2[6], step2[57]); out[58] = vqsubq_s16(step2[5], step2[58]); out[59] = vqsubq_s16(step2[4], step2[59]); out[60] = vqsubq_s16(step2[3], step2[60]); out[61] = vqsubq_s16(step2[2], step2[61]); out[62] = vqsubq_s16(step2[1], 
step2[62]); out[63] = vqsubq_s16(step2[0], step2[63]); } static inline void idct64_low16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); int16x8_t step2[64], step1[64]; const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], (int16_t)cospi[36], (int16_t)cospi[28]); const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], (int16_t)cospi[52], (int16_t)cospi[12]); const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], (int16_t)cospi[40], (int16_t)cospi[24]); const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], (int16_t)cospi[16], (int16_t)cospi[48]); const int16x4_t c4 = set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); const int16x4_t c5 = set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); const int16x4_t c6 = set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); const int16x4_t c7 = set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); // stage 1 // stage 2 step2[0] = in[0]; step2[4] = in[8]; step2[8] = in[4]; step2[12] = in[12]; step2[16] = in[2]; step2[20] = in[10]; step2[24] = in[6]; step2[28] = in[14]; btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); // stage 3 step1[0] = step2[0]; step1[4] = step2[4]; step1[8] = step2[8]; step1[12] = step2[12]; btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); step1[32] = step2[32]; step1[33] = step2[32]; step1[34] = step2[35]; step1[35] = step2[35]; step1[36] = step2[36]; step1[37] = step2[36]; step1[38] = step2[39]; step1[39] = step2[39]; step1[40] = step2[40]; step1[41] = step2[40]; step1[42] = step2[43]; step1[43] = step2[43]; step1[44] = step2[44]; step1[45] = step2[44]; step1[46] = step2[47]; step1[47] = step2[47]; step1[48] = step2[48]; step1[49] = step2[48]; step1[50] = step2[51]; step1[51] = step2[51]; step1[52] = step2[52]; step1[53] = step2[52]; step1[54] = step2[55]; step1[55] = step2[55]; step1[56] = step2[56]; step1[57] = step2[56]; step1[58] = step2[59]; step1[59] = step2[59]; step1[60] = step2[60]; step1[61] = step2[60]; step1[62] = step2[63]; step1[63] = step2[63]; // stage 4 step2[0] = step1[0]; step2[4] = step1[4]; btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], 
&step2[41]); btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); step2[16] = step1[16]; step2[17] = step1[16]; step2[18] = step1[19]; step2[19] = step1[19]; step2[20] = step1[20]; step2[21] = step1[20]; step2[22] = step1[23]; step2[23] = step1[23]; step2[24] = step1[24]; step2[25] = step1[24]; step2[26] = step1[27]; step2[27] = step1[27]; step2[28] = step1[28]; step2[29] = step1[28]; step2[30] = step1[31]; step2[31] = step1[31]; step2[32] = step1[32]; step2[35] = step1[35]; step2[36] = step1[36]; step2[39] = step1[39]; step2[40] = step1[40]; step2[43] = step1[43]; step2[44] = step1[44]; step2[47] = step1[47]; step2[48] = step1[48]; step2[51] = step1[51]; step2[52] = step1[52]; step2[55] = step1[55]; step2[56] = step1[56]; step2[59] = step1[59]; step2[60] = step1[60]; step2[63] = step1[63]; // stage 5 step1[0] = step2[0]; btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); step1[8] = step2[8]; step1[9] = step2[8]; step1[10] = step2[11]; step1[11] = step2[11]; step1[12] = step2[12]; step1[13] = step2[12]; step1[14] = step2[15]; step1[15] = step2[15]; step1[16] = step2[16]; step1[19] = step2[19]; step1[20] = step2[20]; step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; step1[28] = step2[28]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[35]); step1[33] = vqaddq_s16(step2[33], step2[34]); step1[34] = vqsubq_s16(step2[33], step2[34]); step1[35] = vqsubq_s16(step2[32], step2[35]); step1[36] = vqsubq_s16(step2[39], step2[36]); step1[37] = vqsubq_s16(step2[38], step2[37]); step1[38] = vqaddq_s16(step2[38], step2[37]); step1[39] = vqaddq_s16(step2[39], step2[36]); step1[40] = vqaddq_s16(step2[40], step2[43]); step1[41] = vqaddq_s16(step2[41], step2[42]); step1[42] = vqsubq_s16(step2[41], step2[42]); step1[43] = vqsubq_s16(step2[40], step2[43]); step1[44] = vqsubq_s16(step2[47], step2[44]); step1[45] = vqsubq_s16(step2[46], step2[45]); step1[46] = vqaddq_s16(step2[46], step2[45]); step1[47] = vqaddq_s16(step2[47], step2[44]); step1[48] = vqaddq_s16(step2[48], step2[51]); step1[49] = vqaddq_s16(step2[49], step2[50]); step1[50] = vqsubq_s16(step2[49], step2[50]); step1[51] = vqsubq_s16(step2[48], step2[51]); step1[52] = vqsubq_s16(step2[55], step2[52]); step1[53] = vqsubq_s16(step2[54], step2[53]); step1[54] = vqaddq_s16(step2[54], step2[53]); step1[55] = vqaddq_s16(step2[55], step2[52]); step1[56] = vqaddq_s16(step2[56], step2[59]); step1[57] = vqaddq_s16(step2[57], step2[58]); step1[58] = vqsubq_s16(step2[57], step2[58]); step1[59] = vqsubq_s16(step2[56], step2[59]); step1[60] = vqsubq_s16(step2[63], step2[60]); step1[61] = vqsubq_s16(step2[62], step2[61]); step1[62] = vqaddq_s16(step2[62], step2[61]); step1[63] = vqaddq_s16(step2[63], step2[60]); // stage 6 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); 
btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); step2[4] = step1[4]; step2[5] = step1[4]; step2[6] = step1[7]; step2[7] = step1[7]; step2[8] = step1[8]; step2[11] = step1[11]; step2[12] = step1[12]; step2[15] = step1[15]; step2[16] = vqaddq_s16(step1[16], step1[19]); step2[17] = vqaddq_s16(step1[17], step1[18]); step2[18] = vqsubq_s16(step1[17], step1[18]); step2[19] = vqsubq_s16(step1[16], step1[19]); step2[20] = vqsubq_s16(step1[23], step1[20]); step2[21] = vqsubq_s16(step1[22], step1[21]); step2[22] = vqaddq_s16(step1[22], step1[21]); step2[23] = vqaddq_s16(step1[23], step1[20]); step2[24] = vqaddq_s16(step1[24], step1[27]); step2[25] = vqaddq_s16(step1[25], step1[26]); step2[26] = vqsubq_s16(step1[25], step1[26]); step2[27] = vqsubq_s16(step1[24], step1[27]); step2[28] = vqsubq_s16(step1[31], step1[28]); step2[29] = vqsubq_s16(step1[30], step1[29]); step2[30] = vqaddq_s16(step1[30], step1[29]); step2[31] = vqaddq_s16(step1[31], step1[28]); step2[32] = step1[32]; step2[33] = step1[33]; step2[38] = step1[38]; step2[39] = step1[39]; step2[40] = step1[40]; step2[41] = step1[41]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[54] = step1[54]; step2[55] = step1[55]; step2[56] = step1[56]; step2[57] = step1[57]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 7 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); step1[0] = step2[0]; step1[1] = step2[1]; step1[2] = step2[1]; step1[3] = step2[0]; step1[4] = step2[4]; step1[7] = step2[7]; step1[8] = vqaddq_s16(step2[8], step2[11]); step1[9] = vqaddq_s16(step2[9], step2[10]); step1[10] = vqsubq_s16(step2[9], step2[10]); step1[11] = vqsubq_s16(step2[8], step2[11]); step1[12] = vqsubq_s16(step2[15], step2[12]); step1[13] = vqsubq_s16(step2[14], step2[13]); step1[14] = vqaddq_s16(step2[14], step2[13]); step1[15] = vqaddq_s16(step2[15], step2[12]); step1[16] = step2[16]; step1[17] = step2[17]; step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; step1[25] = step2[25]; step1[30] = step2[30]; step1[31] = step2[31]; step1[32] = vqaddq_s16(step2[32], step2[39]); step1[33] = vqaddq_s16(step2[33], step2[38]); step1[34] = vqaddq_s16(step2[34], step2[37]); step1[35] = vqaddq_s16(step2[35], step2[36]); step1[36] = vqsubq_s16(step2[35], step2[36]); step1[37] = vqsubq_s16(step2[34], step2[37]); step1[38] = vqsubq_s16(step2[33], step2[38]); step1[39] = vqsubq_s16(step2[32], step2[39]); step1[40] = vqsubq_s16(step2[47], step2[40]); step1[41] = vqsubq_s16(step2[46], step2[41]); step1[42] = vqsubq_s16(step2[45], step2[42]); step1[43] = vqsubq_s16(step2[44], step2[43]); step1[44] = vqaddq_s16(step2[43], step2[44]); step1[45] = vqaddq_s16(step2[42], step2[45]); step1[46] = vqaddq_s16(step2[41], step2[46]); step1[47] = vqaddq_s16(step2[40], step2[47]); step1[48] = vqaddq_s16(step2[48], step2[55]); step1[49] = 
vqaddq_s16(step2[49], step2[54]); step1[50] = vqaddq_s16(step2[50], step2[53]); step1[51] = vqaddq_s16(step2[51], step2[52]); step1[52] = vqsubq_s16(step2[51], step2[52]); step1[53] = vqsubq_s16(step2[50], step2[53]); step1[54] = vqsubq_s16(step2[49], step2[54]); step1[55] = vqsubq_s16(step2[48], step2[55]); step1[56] = vqsubq_s16(step2[63], step2[56]); step1[57] = vqsubq_s16(step2[62], step2[57]); step1[58] = vqsubq_s16(step2[61], step2[58]); step1[59] = vqsubq_s16(step2[60], step2[59]); step1[60] = vqaddq_s16(step2[59], step2[60]); step1[61] = vqaddq_s16(step2[58], step2[61]); step1[62] = vqaddq_s16(step2[57], step2[62]); step1[63] = vqaddq_s16(step2[56], step2[63]); // stage 8 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); step2[0] = vqaddq_s16(step1[0], step1[7]); step2[1] = vqaddq_s16(step1[1], step1[6]); step2[2] = vqaddq_s16(step1[2], step1[5]); step2[3] = vqaddq_s16(step1[3], step1[4]); step2[4] = vqsubq_s16(step1[3], step1[4]); step2[5] = vqsubq_s16(step1[2], step1[5]); step2[6] = vqsubq_s16(step1[1], step1[6]); step2[7] = vqsubq_s16(step1[0], step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; step2[14] = step1[14]; step2[15] = step1[15]; step2[16] = vqaddq_s16(step1[16], step1[23]); step2[17] = vqaddq_s16(step1[17], step1[22]); step2[18] = vqaddq_s16(step1[18], step1[21]); step2[19] = vqaddq_s16(step1[19], step1[20]); step2[20] = vqsubq_s16(step1[19], step1[20]); step2[21] = vqsubq_s16(step1[18], step1[21]); step2[22] = vqsubq_s16(step1[17], step1[22]); step2[23] = vqsubq_s16(step1[16], step1[23]); step2[24] = vqsubq_s16(step1[31], step1[24]); step2[25] = vqsubq_s16(step1[30], step1[25]); step2[26] = vqsubq_s16(step1[29], step1[26]); step2[27] = vqsubq_s16(step1[28], step1[27]); step2[28] = vqaddq_s16(step1[28], step1[27]); step2[29] = vqaddq_s16(step1[29], step1[26]); step2[30] = vqaddq_s16(step1[30], step1[25]); step2[31] = vqaddq_s16(step1[31], step1[24]); step2[32] = step1[32]; step2[33] = step1[33]; step2[34] = step1[34]; step2[35] = step1[35]; step2[44] = step1[44]; step2[45] = step1[45]; step2[46] = step1[46]; step2[47] = step1[47]; step2[48] = step1[48]; step2[49] = step1[49]; step2[50] = step1[50]; step2[51] = step1[51]; step2[60] = step1[60]; step2[61] = step1[61]; step2[62] = step1[62]; step2[63] = step1[63]; // stage 9 idct64_stage9_neon(step2, step1, cos_bit); // stage 10 idct64_stage10_neon(step1, step2, cos_bit); // stage 11 out[0] = vqaddq_s16(step2[0], step2[63]); out[1] = vqaddq_s16(step2[1], step2[62]); out[2] = vqaddq_s16(step2[2], step2[61]); out[3] = vqaddq_s16(step2[3], step2[60]); out[4] = vqaddq_s16(step2[4], step2[59]); out[5] = vqaddq_s16(step2[5], step2[58]); out[6] = vqaddq_s16(step2[6], step2[57]); out[7] = vqaddq_s16(step2[7], step2[56]); out[8] = vqaddq_s16(step2[8], step2[55]); out[9] = vqaddq_s16(step2[9], step2[54]); out[10] = vqaddq_s16(step2[10], step2[53]); out[11] = 
vqaddq_s16(step2[11], step2[52]); out[12] = vqaddq_s16(step2[12], step2[51]); out[13] = vqaddq_s16(step2[13], step2[50]); out[14] = vqaddq_s16(step2[14], step2[49]); out[15] = vqaddq_s16(step2[15], step2[48]); out[16] = vqaddq_s16(step2[16], step2[47]); out[17] = vqaddq_s16(step2[17], step2[46]); out[18] = vqaddq_s16(step2[18], step2[45]); out[19] = vqaddq_s16(step2[19], step2[44]); out[20] = vqaddq_s16(step2[20], step2[43]); out[21] = vqaddq_s16(step2[21], step2[42]); out[22] = vqaddq_s16(step2[22], step2[41]); out[23] = vqaddq_s16(step2[23], step2[40]); out[24] = vqaddq_s16(step2[24], step2[39]); out[25] = vqaddq_s16(step2[25], step2[38]); out[26] = vqaddq_s16(step2[26], step2[37]); out[27] = vqaddq_s16(step2[27], step2[36]); out[28] = vqaddq_s16(step2[28], step2[35]); out[29] = vqaddq_s16(step2[29], step2[34]); out[30] = vqaddq_s16(step2[30], step2[33]); out[31] = vqaddq_s16(step2[31], step2[32]); out[32] = vqsubq_s16(step2[31], step2[32]); out[33] = vqsubq_s16(step2[30], step2[33]); out[34] = vqsubq_s16(step2[29], step2[34]); out[35] = vqsubq_s16(step2[28], step2[35]); out[36] = vqsubq_s16(step2[27], step2[36]); out[37] = vqsubq_s16(step2[26], step2[37]); out[38] = vqsubq_s16(step2[25], step2[38]); out[39] = vqsubq_s16(step2[24], step2[39]); out[40] = vqsubq_s16(step2[23], step2[40]); out[41] = vqsubq_s16(step2[22], step2[41]); out[42] = vqsubq_s16(step2[21], step2[42]); out[43] = vqsubq_s16(step2[20], step2[43]); out[44] = vqsubq_s16(step2[19], step2[44]); out[45] = vqsubq_s16(step2[18], step2[45]); out[46] = vqsubq_s16(step2[17], step2[46]); out[47] = vqsubq_s16(step2[16], step2[47]); out[48] = vqsubq_s16(step2[15], step2[48]); out[49] = vqsubq_s16(step2[14], step2[49]); out[50] = vqsubq_s16(step2[13], step2[50]); out[51] = vqsubq_s16(step2[12], step2[51]); out[52] = vqsubq_s16(step2[11], step2[52]); out[53] = vqsubq_s16(step2[10], step2[53]); out[54] = vqsubq_s16(step2[9], step2[54]); out[55] = vqsubq_s16(step2[8], step2[55]); out[56] = vqsubq_s16(step2[7], step2[56]); out[57] = vqsubq_s16(step2[6], step2[57]); out[58] = vqsubq_s16(step2[5], step2[58]); out[59] = vqsubq_s16(step2[4], step2[59]); out[60] = vqsubq_s16(step2[3], step2[60]); out[61] = vqsubq_s16(step2[2], step2[61]); out[62] = vqsubq_s16(step2[1], step2[62]); out[63] = vqsubq_s16(step2[0], step2[63]); } // Functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner static const transform_neon lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct8_low1_neon, idct8_neon, NULL, NULL }, { iadst8_low1_neon, iadst8_neon, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, idct64_low32_neon }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; static inline void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { (void)tx_type; int16x8_t a[32 * 4]; int16x8_t b[32 * 4]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int 
txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), 0); lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), 0); const int buf_size_w_div8 = txfm_size_col >> 3; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int input_stride = txfm_size_row; int temp_b = 0; for (int i = 0; i < buf_size_nonzero_h_div8; i++) { int16x8_t *cur_a = &a[i * txfm_size_col]; load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, buf_size_nonzero_w); input += 8; if (abs(rect_type) == 1) { round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); } temp_b += 8; } for (int j = 0; j < buf_size_w_div8; ++j) { identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { lowbd_add_flip_buffer_16xn_neon( &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); } } else if (txfm_size_col == 8) { lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); } } static inline void lowbd_inv_txfm2d_add_v_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[16 * 2]; int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), 0); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int input_stride = txfm_size_row; const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; int temp_b = 0; const transform_neon row_txfm = lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { int16x8_t *cur_a = &a[i * txfm_size_col]; load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, buf_size_nonzero_w); input += 8; if (abs(rect_type) == 1) { round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); transpose_arrays_s16_8x8( &cur_a[j * 8], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); } temp_b += 8; } else { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); } temp_b += 8; } } for (int j = 0; j < buf_size_w_div8; ++j) { identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], txh_idx, txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { 
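// Each pass of this loop stores one 16-column group of the identity result:
// `b` holds the transposed data as 8-wide strips of txfm_size_row vectors,
// so two consecutive strips (offset i * txfm_size_row * 2) feed the 16xN
// add-and-saturate store below.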
lowbd_add_flip_buffer_16xn_neon( &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); } } else if (txfm_size_col == 8) { lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); } } static inline void lowbd_inv_txfm2d_add_h_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[16 * 2]; int16x8_t b[16 * 2]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), 0); const int buf_size_w_div8 = txfm_size_col >> 3; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int input_stride = txfm_size_row; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; int temp_b = 0; const transform_neon col_txfm = lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { int16x8_t *cur_a = &a[i * txfm_size_col]; load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, buf_size_nonzero_w); input += 8; if (abs(rect_type) == 1) { round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, -shift[0]); for (int j = 0; j < buf_size_w_div8; ++j) { transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); } temp_b += 8; } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], output + 16 * i, stride, ud_flip, txfm_size_row); } } else if (txfm_size_col == 8) { lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); } } static inline void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_4X4; DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; int r; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_neon col_txfm = lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < txfm_size_row; i++) { for (int c = 0; c < txfm_size_col; ++c) temp_in[c] = input[c * txfm_size_row]; row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); input++; buf_ptr += txfm_size_col; } for (int c = 0; c < txfm_size_col; ++c) { if (lr_flip == 0) 
{ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, 16); col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[r]); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[txfm_size_row - r - 1]); } } } } static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_4X8; DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); int32_t *temp_in = txfm_buf; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16, 16 }; int r; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_neon col_txfm = lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < txfm_size_row; i++) { for (int c = 0; c < txfm_size_col; c++) temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, NewSqrt2Bits); row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); input++; buf_ptr += txfm_size_col; } for (int c = 0; c < txfm_size_col; ++c) { if (lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, 16); col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[r]); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[txfm_size_row - r - 1]); } } } } static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_8X4; DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); int32_t *temp_in = txfm_buf; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16, 16 }; int r; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_neon col_txfm = lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; 
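// Row pass for the 8x4 rectangle: coefficients are stored column-major with
// stride txfm_size_row, and since the block has a 2:1 aspect ratio each value
// is pre-scaled by NewInvSqrt2 / 2^NewSqrt2Bits (~1/sqrt(2)) in the loop
// below so the row/column transform pair stays correctly normalised.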
get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < txfm_size_row; i++) { for (int c = 0; c < txfm_size_col; c++) temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, NewSqrt2Bits); row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); input++; buf_ptr += txfm_size_col; } for (int c = 0; c < txfm_size_col; ++c) { if (lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, 16); col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[r]); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[txfm_size_row - r - 1]); } } } } static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_4X16; DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); int32_t *temp_in = txfm_buf; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; int r; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_neon col_txfm = lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < txfm_size_row; i++) { for (int c = 0; c < txfm_size_col; c++) temp_in[c] = input[c * txfm_size_row]; row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); input++; buf_ptr += txfm_size_col; } for (int c = 0; c < txfm_size_col; ++c) { if (lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, 16); col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[r]); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[txfm_size_row - r - 1]); } } } } static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, int eob) { (void)eob; TX_SIZE tx_size = TX_16X4; DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); int32_t *temp_in = txfm_buf; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 
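// Scratch layout inside txfm_buf: temp_in holds one row/column of inputs
// (buf_offset = max(width, height) entries), temp_out holds one transformed
// column, and buf keeps the full row-pass result that the column pass below
// reads back.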
int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }; int r; const transform_1d_neon row_txfm = lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_neon col_txfm = lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < txfm_size_row; i++) { for (int c = 0; c < txfm_size_col; c++) temp_in[c] = input[c * txfm_size_row]; row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); input++; buf_ptr += txfm_size_col; } for (int c = 0; c < txfm_size_col; ++c) { if (lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, 16); col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[r]); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = clip_pixel(output[r * stride + c] + temp_out[txfm_size_row - r - 1]); } } } } static inline void lowbd_inv_txfm2d_add_no_identity_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int16x8_t a[64 * 8]; int16x8_t b[64 * 8]; int eobx, eoby, ud_flip, lr_flip; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int input_stride = AOMMIN(32, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; int temp_b = 0; const transform_neon row_txfm = lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_neon col_txfm = lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { int16x8_t *cur_a = &a[i * txfm_size_col]; load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, buf_size_nonzero_w); input += 8; if (abs(rect_type) == 1) { round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); } row_txfm(cur_a, cur_a, INV_COS_BIT); round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); if (lr_flip == 1) { for (int j = 0; j < buf_size_w_div8; ++j) { flip_buf_ud_neon(&cur_a[j * 8], 8); transpose_arrays_s16_8x8( &cur_a[j * 8], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); } temp_b += 8; } else { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); } temp_b += 8; } } for (int j = 0; j < buf_size_w_div8; ++j) { col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); round_shift_array_16_neon(&b[j * 
txfm_size_row], txfm_size_row, -shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], output + 16 * i, stride, ud_flip, txfm_size_row); } } else if (txfm_size_col == 8) { lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); } } static inline void lowbd_inv_txfm2d_add_universe_neon( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_type) { case IDTX: lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size, eob); break; case H_DCT: case H_ADST: case H_FLIPADST: lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, tx_size, eob); break; case V_DCT: case V_ADST: case V_FLIPADST: lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, tx_size, eob); break; default: lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, tx_size, eob); break; } } // This function is used by av1_inv_txfm2d_test.cc. void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob); void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_size) { case TX_4X4: lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); break; case TX_4X8: lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); break; case TX_8X4: lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); break; case TX_4X16: lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); break; case TX_16X4: lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); break; default: lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, tx_size, eob); break; } } void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { const TX_TYPE tx_type = txfm_param->tx_type; if (!txfm_param->lossless) { av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, txfm_param->tx_size, txfm_param->eob); } else { av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); } } aom-3.12.1/av1/common/arm/av1_inv_txfm_neon.h000066400000000000000000000132111477627663500207260ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ #define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "av1/common/enums.h" #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/av1_txfm.h" typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, const int8_t cos_bit, const int8_t *stage_ptr); typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, int8_t cos_bit); DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x16_default[16]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x32_default[32]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x32_default[32]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x16_default[16]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, }; DECLARE_ALIGNED(16, static const int16_t *, av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { NULL, av1_eob_to_eobxy_8x8_default, av1_eob_to_eobxy_16x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x16_default, av1_eob_to_eobxy_16x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x32_default, av1_eob_to_eobxy_32x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, }; static const int lowbd_txfm_all_1d_zeros_idx[32] = { 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, }; // Transform block width in log2 for eob (size of 64 map to 32) static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, }; static const int 
eob_fill[32] = { 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, }; static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; *eoby = 0; return; } const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; const int eob_row = (eob - 1) >> tx_w_log2; const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; *eobx = eobxy & 0xFF; *eoby = eobxy >> 8; } static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_row = tx_size_high[tx_size]; const int eoby_max = AOMMIN(32, txfm_size_row) - 1; *eobx = eob / (eoby_max + 1); *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; } static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_col = tx_size_wide[tx_size]; const int eobx_max = AOMMIN(32, txfm_size_col) - 1; *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; const int temp_eoby = eob / (eobx_max + 1); assert(temp_eoby < 32); *eoby = eob_fill[temp_eoby]; } #endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ aom-3.12.1/av1/common/arm/av1_txfm_neon.c000066400000000000000000000020071477627663500200460ustar00rootroot00000000000000/* * * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" void av1_round_shift_array_neon(int32_t *arr, int size, int bit) { assert(!(size % 4)); if (!bit) return; const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); for (int i = 0; i < size; i += 4) { int32x4_t tmp_q_s32 = vld1q_s32(arr); tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4); vst1q_s32(arr, tmp_q_s32); arr += 4; } } aom-3.12.1/av1/common/arm/blend_a64_hmask_neon.c000066400000000000000000000060151477627663500212450ustar00rootroot00000000000000/* * * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 2); assert(w >= 2); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (w > 8) { do { int i = 0; do { uint8x16_t m0 = vld1q_u8(mask + i); uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { const uint8x8_t m0 = vld1_u8(mask); do { uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); vst1_u8(dst, blend); src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 4) { const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask); do { uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); store_u8x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 2 && h >= 16) { const uint8x8_t m0 = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); do { uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); store_u8x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h); } } aom-3.12.1/av1/common/arm/blend_a64_vmask_neon.c000066400000000000000000000065451477627663500212730ustar00rootroot00000000000000/* * * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "aom_dsp/arm/blend_neon.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "config/aom_dsp_rtcd.h" void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); assert(h >= 2); assert(w >= 2); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (w > 8) { do { uint8x16_t m0 = vdupq_n_u8(mask[0]); int i = 0; do { uint8x16_t s0 = vld1q_u8(src0 + i); uint8x16_t s1 = vld1q_u8(src1 + i); uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); vst1q_u8(dst + i, blend); i += 16; } while (i < w); mask += 1; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 8) { do { uint8x8_t m0 = vdup_n_u8(mask[0]); uint8x8_t s0 = vld1_u8(src0); uint8x8_t s1 = vld1_u8(src1); uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); vst1_u8(dst, blend); mask += 1; src0 += src0_stride; src1 += src1_stride; dst += dst_stride; } while (--h != 0); } else if (w == 4) { do { const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]); const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]); const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1)); uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 2 && h >= 16) { do { uint16x4_t m0 = vdup_n_u16(0); m0 = vld1_lane_u16((uint16_t *)mask, m0, 0); uint8x8_t m = vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0]; uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride); uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride); uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); store_u8x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; src1 += 2 * src1_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, h); } } aom-3.12.1/av1/common/arm/cdef_block_neon.c000066400000000000000000001462241477627663500204060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "av1/common/cdef_block.h" void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { do { const uint8_t *src_ptr = src; uint16_t *dst_ptr = dst; int w = 0; while (width - w >= 16) { uint8x16_t row = vld1q_u8(src_ptr + w); uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } }; vst2q_u8((uint8_t *)(dst_ptr + w), row_u16); w += 16; } if (width - w >= 8) { uint8x8_t row = vld1_u8(src_ptr + w); vst1q_u16(dst_ptr + w, vmovl_u8(row)); w += 8; } if (width - w == 4) { for (int i = w; i < w + 4; i++) { dst_ptr[i] = src_ptr[i]; } } src += sstride; dst += dstride; } while (--height != 0); } #if CONFIG_AV1_HIGHBITDEPTH void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height) { do { const uint16_t *src_ptr = src; uint16_t *dst_ptr = dst; int w = 0; while (width - w >= 8) { uint16x8_t row = vld1q_u16(src_ptr + w); vst1q_u16(dst_ptr + w, row); w += 8; } if (width - w == 4) { uint16x4_t row = vld1_u16(src_ptr + w); vst1_u16(dst_ptr + w, row); } src += sstride; dst += dstride; } while (--height != 0); } #endif // CONFIG_AV1_HIGHBITDEPTH // partial A is a 16-bit vector of the form: // [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: // [0 y1 y2 y3 y4 y5 y6 y7]. // This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... // (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 // and const2. static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala, int16x8_t partialb, uint32x4_t const1, uint32x4_t const2) { // Reverse partial B. // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. uint8x16_t pattern = vreinterpretq_u8_u64( vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); #if AOM_ARCH_AARCH64 partialb = vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern)); #else int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)), vget_high_s8(vreinterpretq_s8_s16(partialb)) } }; int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif // Square and add the corresponding x and y values. int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); int32x4_t cost_hi = vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); cost_hi = vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); // Multiply by constant. uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); return cost; } // This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal // down-right, 6 is vertical). // // For each direction the lines are shifted so that we can perform a // basic sum on each vector element. 
For example, direction 5 is "south by // southeast", so we need to add the pixels along each line i below: // // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // For this to fit nicely in vectors, the lines need to be shifted like so: // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // In this configuration we can now perform SIMD additions to get the cost // along direction 5. Since this won't fit into a single 128-bit vector, we use // two of them to compute each half of the new configuration, and pad the empty // spaces with zeros. Similar shifting is done for other directions, except // direction 6 which is straightforward as it's the vertical direction. static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8], uint32_t cost[4]) { const int16x8_t zero = vdupq_n_s16(0); // Partial sums for lines 0 and 1. int16x8_t partial4a = vextq_s16(zero, lines[0], 1); partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); int16x8_t partial4b = vextq_s16(lines[0], zero, 1); partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); int16x8_t tmp = vaddq_s16(lines[0], lines[1]); int16x8_t partial5a = vextq_s16(zero, tmp, 3); int16x8_t partial5b = vextq_s16(tmp, zero, 3); int16x8_t partial7a = vextq_s16(zero, tmp, 6); int16x8_t partial7b = vextq_s16(tmp, zero, 6); int16x8_t partial6 = tmp; // Partial sums for lines 2 and 3. partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); tmp = vaddq_s16(lines[2], lines[3]); partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 4 and 5. partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); tmp = vaddq_s16(lines[4], lines[5]); partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 6 and 7. 
partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); partial4a = vaddq_s16(partial4a, lines[7]); partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); tmp = vaddq_s16(lines[6], lines[7]); partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); partial6 = vaddq_s16(partial6, tmp); uint32x4_t const0 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), vcreate_u64((uint64_t)210 << 32 | 280))); uint32x4_t const1 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), vcreate_u64((uint64_t)105 << 32 | 120))); uint32x4_t const2 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420))); uint32x4_t const3 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140), vcreate_u64((uint64_t)105 << 32 | 105))); // Compute costs in terms of partial sums. int32x4_t partial6_s32 = vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); partial6_s32 = vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); uint32x4_t costs[4]; costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); costs[0] = horizontal_add_4d_u32x4(costs); vst1q_u32(cost, costs[0]); return costs[0]; } static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala, int16x8_t partialb, int16x8_t partialc, uint32x4_t const0) { // Reverse partial c. // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }. uint8x16_t pattern = vreinterpretq_u8_u64( vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a), vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302))); #if AOM_ARCH_AARCH64 partialc = vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern)); #else int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)), vget_high_s8(vreinterpretq_s8_s16(partialc)) } }; int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif int32x4_t partiala_s32 = vpaddlq_s16(partiala); int32x4_t partialb_s32 = vpaddlq_s16(partialb); int32x4_t partialc_s32 = vpaddlq_s16(partialc); partiala_s32 = vmulq_s32(partiala_s32, partiala_s32); partialb_s32 = vmulq_s32(partialb_s32, partialb_s32); partialc_s32 = vmulq_s32(partialc_s32, partialc_s32); partiala_s32 = vaddq_s32(partiala_s32, partialc_s32); uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105); cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0); return cost; } // This function computes the cost along directions 0, 1, 2, 3. (0 means // 45-degree up-right, 2 is horizontal). // // For direction 1 and 3 ("east northeast" and "east southeast") the shifted // lines need three vectors instead of two. 
For direction 1 for example, we need // to compute the sums along the line i below: // 0 0 1 1 2 2 3 3 // 1 1 2 2 3 3 4 4 // 2 2 3 3 4 4 5 5 // 3 3 4 4 5 5 6 6 // 4 4 5 5 6 6 7 7 // 5 5 6 6 7 7 8 8 // 6 6 7 7 8 8 9 9 // 7 7 8 8 9 9 10 10 // // Which means we need the following configuration: // 0 0 1 1 2 2 3 3 // 1 1 2 2 3 3 4 4 // 2 2 3 3 4 4 5 5 // 3 3 4 4 5 5 6 6 // 4 4 5 5 6 6 7 7 // 5 5 6 6 7 7 8 8 // 6 6 7 7 8 8 9 9 // 7 7 8 8 9 9 10 10 // // Three vectors are needed to compute this, as well as some extra pairwise // additions. static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8], uint32_t cost[4]) { const int16x8_t zero = vdupq_n_s16(0); // Compute diagonal directions (1, 2, 3). // Partial sums for lines 0 and 1. int16x8_t partial0a = lines[0]; partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7)); int16x8_t partial0b = vextq_s16(lines[1], zero, 7); int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6)); int16x8_t partial1b = vextq_s16(lines[1], zero, 6); int16x8_t partial3a = vextq_s16(lines[0], zero, 2); partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4)); int16x8_t partial3b = vextq_s16(zero, lines[0], 2); partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4)); // Partial sums for lines 2 and 3. partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6)); partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5)); partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4)); partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2)); partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4)); partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2)); partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6)); partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6)); partial3b = vaddq_s16(partial3b, lines[3]); // Partial sums for lines 4 and 5. partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4)); partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3)); partial1b = vaddq_s16(partial1b, lines[4]); partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6)); int16x8_t partial1c = vextq_s16(lines[5], zero, 6); partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2)); partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4)); int16x8_t partial3c = vextq_s16(zero, lines[4], 2); partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4)); // Partial sums for lines 6 and 7. partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2)); partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2)); partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1)); partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4)); partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2)); partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4)); partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2)); partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6)); partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6)); partial3c = vaddq_s16(partial3c, lines[7]); // Special case for direction 2 as it's just a sum along each line. 
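// (Direction 2 is horizontal, so its cost is simply the squared sum of each
// of the 8 rows, weighted by 105 = 840 / 8 like the other direction costs.)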
int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] }; int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] }; int32x4_t partial2a = horizontal_add_4d_s16x8(lines03); int32x4_t partial2b = horizontal_add_4d_s16x8(lines47); uint32x4_t partial2a_u32 = vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a)); uint32x4_t partial2b_u32 = vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b)); uint32x4_t const0 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), vcreate_u64((uint64_t)210 << 32 | 280))); uint32x4_t const1 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), vcreate_u64((uint64_t)105 << 32 | 120))); uint32x4_t const2 = vreinterpretq_u32_u64( vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420), vcreate_u64((uint64_t)105 << 32 | 140))); uint32x4_t costs[4]; costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1); costs[1] = fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2); costs[2] = vaddq_u32(partial2a_u32, partial2b_u32); costs[2] = vmulq_n_u32(costs[2], 105); costs[3] = fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2); costs[0] = horizontal_add_4d_u32x4(costs); vst1q_u32(cost, costs[0]); return costs[0]; } int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { uint32_t cost[8]; uint32_t best_cost = 0; int best_dir = 0; int16x8_t lines[8]; for (int i = 0; i < 8; i++) { uint16x8_t s = vld1q_u16(&img[i * stride]); lines[i] = vreinterpretq_s16_u16( vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128))); } // Compute "mostly vertical" directions. uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4); // Compute "mostly horizontal" directions. uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost); // Find max cost as well as its index to get best_dir. // The max cost needs to be propagated in the whole vector to find its // position in the original cost vectors cost03 and cost47. uint32x4_t cost07 = vmaxq_u32(cost03, cost47); #if AOM_ARCH_AARCH64 best_cost = vmaxvq_u32(cost07); uint32x4_t max_cost = vdupq_n_u32(best_cost); uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)), vreinterpretq_u8_u32( vceqq_u32(max_cost, cost47)) } }; // idx = { 28, 24, 20, 16, 12, 8, 4, 0 }; uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); // Get the lowest 8 bit of each 32-bit elements and reverse them. uint8x8_t tbl = vqtbl2_u8(costs, idx); uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0); best_dir = aom_clzll(a) >> 3; #else uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07)); cost64 = vpmax_u32(cost64, cost64); uint32x4_t max_cost = vcombine_u32(cost64, cost64); best_cost = vget_lane_u32(cost64, 0); uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)), vmovn_u32(vceqq_u32(max_cost, cost47))); uint8x8_t idx = vand_u8(vmovn_u16(costs), vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); int sum = horizontal_add_u8x8(idx); best_dir = get_msb(sum ^ (sum - 1)); #endif // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. *var = best_cost - cost[(best_dir + 4) & 7]; // We'd normally divide by 840, but dividing by 1024 is close enough // for what we're going to do with this. 
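// (The direction weights 840, 420, 280, ..., 105 are 840 / n for a line of
// n pixels, so every cost carries an implicit factor of 840; because *var is
// only used for coarse threshold decisions, the cheaper shift by 10 is an
// adequate stand-in for the exact division.)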
*var >>= 10; return best_dir; } void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var_out_1st, int32_t *var_out_2nd, int coeff_shift, int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { // Process first 8x8. *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); // Process second 8x8. *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); } // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b, unsigned int threshold, int adjdamp) { uint16x8_t diff = vabdq_u16(a, b); const uint16x8_t a_gt_b = vcgtq_u16(a, b); const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold), vshlq_u16(diff, vdupq_n_s16(-adjdamp))); const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s)); return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); } static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4], const int *pri_taps, int pri_strength, int pri_damping, int16x8_t *sum) { // Near taps int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping); int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping); // sum += pri_taps[0] * (n0 + n1) n0 = vaddq_s16(n0, n1); *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]); // Far taps int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping); int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping); // sum += pri_taps[1] * (f0 + f1) f0 = vaddq_s16(f0, f1); *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); } static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8], const int *sec_taps, int sec_strength, int sec_damping, int16x8_t *sum) { // Near taps int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping); int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping); int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping); int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) s0 = vaddq_s16(s0, s1); s2 = vaddq_s16(s2, s3); s0 = vaddq_s16(s0, s2); *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]); // Far taps s0 = constrain16(tap[4], s, sec_strength, sec_damping); s1 = constrain16(tap[5], s, sec_strength, sec_damping); s2 = constrain16(tap[6], s, sec_strength, sec_damping); s3 = constrain16(tap[7], s, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) s0 = vaddq_s16(s0, s1); s2 = vaddq_s16(s2, s3); s0 = vaddq_s16(s0, s2); *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]); } void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { uint16x8_t max, min; const uint16x8_t cdef_large_value_mask = vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); max = min = s; uint16x8_t pri_src[4]; // Primary near taps 
pri_src[0] = vld1q_u16(in + po1); pri_src[1] = vld1q_u16(in - po1); // Primary far taps pri_src[2] = vld1q_u16(in + po2); pri_src[3] = vld1q_u16(in - po2); primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); // The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max. uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), vreinterpretq_u8_u16(pri_src[1])); uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), vreinterpretq_u8_u16(pri_src[3])); pri_max0 = vmaxq_u8(pri_max0, pri_max1); max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), cdef_large_value_mask)); uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); pri_min0 = vminq_u16(pri_min0, pri_min1); min = vminq_u16(min, pri_min0); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = vld1q_u16(in + s1o1); sec_src[1] = vld1q_u16(in - s1o1); sec_src[2] = vld1q_u16(in + s2o1); sec_src[3] = vld1q_u16(in - s2o1); // Secondary far taps sec_src[4] = vld1q_u16(in + s1o2); sec_src[5] = vld1q_u16(in - s1o2); sec_src[6] = vld1q_u16(in + s2o2); sec_src[7] = vld1q_u16(in - s2o2); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max. uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), vreinterpretq_u8_u16(sec_src[1])); uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), vreinterpretq_u8_u16(sec_src[3])); uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), vreinterpretq_u8_u16(sec_src[5])); uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), vreinterpretq_u8_u16(sec_src[7])); sec_max0 = vmaxq_u8(sec_max0, sec_max1); sec_max2 = vmaxq_u8(sec_max2, sec_max3); sec_max0 = vmaxq_u8(sec_max0, sec_max2); max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), cdef_large_value_mask)); uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); sec_min0 = vminq_u16(sec_min0, sec_min1); sec_min2 = vminq_u16(sec_min2, sec_min3); sec_min0 = vminq_u16(sec_min0, sec_min2); min = vminq_u16(min, sec_min0); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), vreinterpretq_s16_u16(max)); const uint8x8_t res_u8 = vqmovun_s16(res_s16); vst1_u8(dst8, res_u8); in += CDEF_BSTRIDE; dst8 += dstride; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); max = min = s; uint16x8_t pri_src[4]; // Primary near taps pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); // Primary far taps pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); 
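// primary_filter() adds pri_taps[0] * (c(p+po1) + c(p-po1)) +
// pri_taps[1] * (c(p+po2) + c(p-po2)) to sum, where c() is the damped,
// clamped difference against the centre sample from constrain16(). In this
// 4-wide path each vector packs two image rows (load_unaligned_u16_4x2).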
primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); // The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max. uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), vreinterpretq_u8_u16(pri_src[1])); uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), vreinterpretq_u8_u16(pri_src[3])); pri_max0 = vmaxq_u8(pri_max0, pri_max1); max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), cdef_large_value_mask)); uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); pri_min1 = vminq_u16(pri_min1, pri_min2); min = vminq_u16(min, pri_min1); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); // Secondary far taps sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // The source is 16 bits, however, we only really care about the lower // 8 bits. The upper 8 bits contain the "large" flag. After the final // primary max has been calculated, zero out the upper 8 bits. Use this // to find the "16 bit" max. uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), vreinterpretq_u8_u16(sec_src[1])); uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), vreinterpretq_u8_u16(sec_src[3])); uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), vreinterpretq_u8_u16(sec_src[5])); uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), vreinterpretq_u8_u16(sec_src[7])); sec_max0 = vmaxq_u8(sec_max0, sec_max1); sec_max2 = vmaxq_u8(sec_max2, sec_max3); sec_max0 = vmaxq_u8(sec_max0, sec_max2); max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), cdef_large_value_mask)); uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); sec_min0 = vminq_u16(sec_min0, sec_min1); sec_min2 = vminq_u16(sec_min2, sec_min3); sec_min0 = vminq_u16(sec_min0, sec_min2); min = vminq_u16(min, sec_min0); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), vreinterpretq_s16_u16(max)); const uint8x8_t res_u8 = vqmovun_s16(res_s16); store_u8x4_strided_x2(dst8, dstride, res_u8); in += 2 * CDEF_BSTRIDE; dst8 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)sec_strength; (void)sec_damping; const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; 
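// The strength-dependent damping adjustment below matches the scalar
// reference: get_msb() returns floor(log2(x)), so constrain() tapers to
// zero once |diff| reaches roughly (1 << pri_damping) regardless of the
// chosen strength, leaving genuine edges (large differences) untouched.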
if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); uint16x8_t tap[4]; // Primary near taps tap[0] = vld1q_u16(in + po1); tap[1] = vld1q_u16(in - po1); // Primary far taps tap[2] = vld1q_u16(in + po2); tap[3] = vld1q_u16(in - po2); primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); const uint8x8_t res_u8 = vqmovun_s16(res_s16); vst1_u8(dst8, res_u8); in += CDEF_BSTRIDE; dst8 += dstride; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); uint16x8_t pri_src[4]; // Primary near taps pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); // Primary far taps pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); const uint8x8_t res_u8 = vqmovun_s16(res_s16); store_u8x4_strided_x2(dst8, dstride, res_u8); in += 2 * CDEF_BSTRIDE; dst8 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)pri_damping; (void)coeff_shift; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *sec_taps = cdef_sec_taps; if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = vld1q_u16(in + s1o1); sec_src[1] = vld1q_u16(in - s1o1); sec_src[2] = vld1q_u16(in + s2o1); sec_src[3] = vld1q_u16(in - s2o1); // Secondary far taps sec_src[4] = vld1q_u16(in + s1o2); sec_src[5] = vld1q_u16(in - s1o2); sec_src[6] = vld1q_u16(in + s2o2); sec_src[7] = vld1q_u16(in - s2o2); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); const uint8x8_t res_u8 = vqmovun_s16(res_s16); vst1_u8(dst8, res_u8); in += CDEF_BSTRIDE; dst8 += dstride; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); sec_src[3] = 
load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); // Secondary far taps sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); const uint8x8_t res_u8 = vqmovun_s16(res_s16); store_u8x4_strided_x2(dst8, dstride, res_u8); in += 2 * CDEF_BSTRIDE; dst8 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; (void)pri_damping; (void)sec_damping; (void)coeff_shift; (void)block_width; if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { const uint16x8_t s = vld1q_u16(in); const uint8x8_t res = vqmovn_u16(s); vst1_u8(dst8, res); in += CDEF_BSTRIDE; dst8 += dstride; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; do { const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); const uint8x8_t res = vqmovn_u16(s); store_u8x4_strided_x2(dst8, dstride, res); in += 2 * CDEF_BSTRIDE; dst8 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { uint16x8_t max, min; const uint16x8_t cdef_large_value_mask = vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); max = min = s; uint16x8_t pri_src[4]; // Primary near taps pri_src[0] = vld1q_u16(in + po1); pri_src[1] = vld1q_u16(in - po1); // Primary far taps pri_src[2] = vld1q_u16(in + po2); pri_src[3] = vld1q_u16(in - po2); primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); pri_min0 = vminq_u16(pri_min0, pri_min1); min = vminq_u16(min, pri_min0); /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); pri_max0 = vmaxq_u16(pri_max0, pri_max1); max = vmaxq_u16(max, pri_max0); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = vld1q_u16(in + s1o1); sec_src[1] = vld1q_u16(in - s1o1); sec_src[2] = vld1q_u16(in + s2o1); sec_src[3] = vld1q_u16(in - s2o1); // Secondary far taps sec_src[4] = vld1q_u16(in + s1o2); sec_src[5] = vld1q_u16(in - s1o2); sec_src[6] = vld1q_u16(in + s2o2); sec_src[7] = vld1q_u16(in - s2o2); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); sec_min0 = vminq_u16(sec_min0, sec_min1); sec_min2 = vminq_u16(sec_min2, sec_min3); sec_min0 = vminq_u16(sec_min0, sec_min2); min = vminq_u16(min, sec_min0); /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); sec_max0 = vmaxq_u16(sec_max0, sec_max1); sec_max2 = vmaxq_u16(sec_max2, sec_max3); sec_max0 = vmaxq_u16(sec_max0, sec_max2); max = vmaxq_u16(max, sec_max0); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), vreinterpretq_s16_u16(max)); vst1q_u16(dst16, vreinterpretq_u16_s16(res)); in += CDEF_BSTRIDE; dst16 += dstride; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); max = min = s; uint16x8_t pri_src[4]; // Primary near taps pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); // Primary far taps pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); pri_min1 = vminq_u16(pri_min1, pri_min2); min = vminq_u16(min, pri_min1); /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); pri_max0 = vmaxq_u16(pri_max0, pri_max1); max = vmaxq_u16(max, pri_max0); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); // Secondary far taps sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); sec_min0 = vminq_u16(sec_min0, sec_min1); sec_min2 = vminq_u16(sec_min2, sec_min3); sec_min0 = vminq_u16(sec_min0, sec_min2); min = vminq_u16(min, sec_min0); /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); sec_max0 = vmaxq_u16(sec_max0, sec_max1); sec_max2 = vmaxq_u16(sec_max2, sec_max3); sec_max0 = vmaxq_u16(sec_max0, sec_max2); max = vmaxq_u16(max, sec_max0); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), vreinterpretq_s16_u16(max)); store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); in += 2 * CDEF_BSTRIDE; dst16 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)sec_strength; (void)sec_damping; const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); uint16x8_t tap[4]; // Primary near taps tap[0] = vld1q_u16(in + po1); tap[1] = vld1q_u16(in - po1); // Primary far taps tap[2] = 
vld1q_u16(in + po2); tap[3] = vld1q_u16(in - po2); primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); vst1q_u16(dst16, vreinterpretq_u16_s16(res)); in += CDEF_BSTRIDE; dst16 += dstride; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); uint16x8_t pri_src[4]; // Primary near taps pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); // Primary far taps pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); in += 2 * CDEF_BSTRIDE; dst16 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)pri_damping; (void)coeff_shift; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *sec_taps = cdef_sec_taps; if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = vld1q_u16(in); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = vld1q_u16(in + s1o1); sec_src[1] = vld1q_u16(in - s1o1); sec_src[2] = vld1q_u16(in + s2o1); sec_src[3] = vld1q_u16(in - s2o1); // Secondary far taps sec_src[4] = vld1q_u16(in + s1o2); sec_src[5] = vld1q_u16(in - s1o2); sec_src[6] = vld1q_u16(in + s2o2); sec_src[7] = vld1q_u16(in - s2o2); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); vst1q_u16(dst16, vreinterpretq_u16_s16(res)); in += CDEF_BSTRIDE; dst16 += dstride; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { int16x8_t sum = vdupq_n_s16(0); uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); uint16x8_t sec_src[8]; // Secondary near taps sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); // Secondary far taps sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); // res = s + ((sum - (sum < 0) + 8) >> 4) 
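// The expression above is realised in two steps: vcltq_s16 yields all-ones
// (-1) in lanes where sum < 0, so the following vaddq folds in the
// "- (sum < 0)" correction, and vrsraq_n_s16(s, sum, 4) then adds the
// rounded shift ((sum + 8) >> 4) onto the centre sample s.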
sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); in += 2 * CDEF_BSTRIDE; dst16 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; (void)pri_damping; (void)sec_damping; (void)coeff_shift; (void)block_width; if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { const uint16x8_t s = vld1q_u16(in); vst1q_u16(dst16, s); in += CDEF_BSTRIDE; dst16 += dstride; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; do { const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); store_u16x4_strided_x2(dst16, dstride, s); in += 2 * CDEF_BSTRIDE; dst16 += 2 * dstride; h -= 2; } while (h != 0); } } aom-3.12.1/av1/common/arm/cfl_neon.c000066400000000000000000000610471477627663500170760ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "av1/common/cfl.h" static inline void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, int16x8_t sub) { vst1q_s16(dst + offset, vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); } static inline uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); } // Load half of a vector and duplicated in other half static inline uint8x8_t vldh_dup_u8(const uint8_t *ptr) { return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); } // Store half of a vector. static inline void vsth_u16(uint16_t *ptr, uint16x4_t val) { vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0); } // Store half of a vector. 
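/* The cfl_luma_subsampling_* helpers further below all store luma in Q3
 * form, i.e. 8x the (average) luma co-located with each chroma sample:
 * 4:2:0 sums a 2x2 block (4 samples) and shifts left by 1, 4:2:2 sums a
 * horizontal pair and shifts left by 2, and 4:4:4 shifts a single sample
 * left by 3. This is a summary of the shifts used in the code below, not
 * additional behaviour. */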
static inline void vsth_u8(uint8_t *ptr, uint8x8_t val) { vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0); } static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; const int luma_stride = input_stride << 1; do { if (width == 4) { const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); } else if (width == 8) { const uint16x4_t top = vpaddl_u8(vld1_u8(input)); const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); } else if (width == 16) { const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); } else { const uint8x8x4_t top = vld4_u8(input); const uint8x8x4_t bot = vld4_u8(input + input_stride); // equivalent to a vpaddlq_u8 (because vld4q interleaves) const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]); // equivalent to a vpaddlq_u8 (because vld4q interleaves) const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); // equivalent to a vpaddlq_u8 (because vld4q interleaves) const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); // equivalent to a vpaddlq_u8 (because vld4q interleaves) const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); uint16x8x2_t sum; sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); vst2q_u16(pred_buf_q3, sum); } input += luma_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; do { if (width == 4) { const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); } else if (width == 8) { const uint16x4_t top = vpaddl_u8(vld1_u8(input)); vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); } else if (width == 16) { const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); } else { const uint8x8x4_t top = vld4_u8(input); uint16x8x2_t sum; // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves) sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2); sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); vst2q_u16(pred_buf_q3, sum); } input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; do { if (width == 4) { const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); vst1_u16(pred_buf_q3, vget_low_u16(top)); } else if (width == 8) { const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); vst1q_u16(pred_buf_q3, top); } else { const uint8x16_t top = vld1q_u8(input); vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); if (width == 32) { const uint8x16_t next_top = vld1q_u8(input + 16); vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3)); vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); } } input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } #if 
CONFIG_AV1_HIGHBITDEPTH #if !AOM_ARCH_AARCH64 static uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), vpadd_u16(vget_low_u16(b), vget_high_u16(b))); } #endif static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; const int luma_stride = input_stride << 1; do { if (width == 4) { const uint16x4_t top = vld1_u16(input); const uint16x4_t bot = vld1_u16(input + input_stride); const uint16x4_t sum = vadd_u16(top, bot); const uint16x4_t hsum = vpadd_u16(sum, sum); vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); } else if (width < 32) { const uint16x8_t top = vld1q_u16(input); const uint16x8_t bot = vld1q_u16(input + input_stride); const uint16x8_t sum = vaddq_u16(top, bot); if (width == 8) { const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); } else { const uint16x8_t top_1 = vld1q_u16(input + 8); const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); const uint16x8_t hsum = vpaddq_u16(sum, sum_1); vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); } } else { const uint16x8x4_t top = vld4q_u16(input); const uint16x8x4_t bot = vld4q_u16(input + input_stride); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); uint16x8x2_t sum; sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); vst2q_u16(pred_buf_q3, sum); } input += luma_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; do { if (width == 4) { const uint16x4_t top = vld1_u16(input); const uint16x4_t hsum = vpadd_u16(top, top); vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); } else if (width == 8) { const uint16x4x2_t top = vld2_u16(input); // equivalent to a vpadd_u16 (because vld2 interleaves) const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); } else if (width == 16) { const uint16x8x2_t top = vld2q_u16(input); // equivalent to a vpaddq_u16 (because vld2q interleaves) const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); } else { const uint16x8x4_t top = vld4q_u16(input); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); // equivalent to a vpaddq_u16 (because vld4q interleaves) const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), vshlq_n_u16(hsum_1, 2) } }; vst2q_u16(pred_buf_q3, result); } input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; 
do { if (width == 4) { const uint16x4_t top = vld1_u16(input); vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); } else if (width == 8) { const uint16x8_t top = vld1q_u16(input); vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); } else if (width == 16) { uint16x8x2_t top = vld2q_u16(input); top.val[0] = vshlq_n_u16(top.val[0], 3); top.val[1] = vshlq_n_u16(top.val[1], 3); vst2q_u16(pred_buf_q3, top); } else { uint16x8x4_t top = vld4q_u16(input); top.val[0] = vshlq_n_u16(top.val[0], 3); top.val[1] = vshlq_n_u16(top.val[1], 3); top.val[2] = vshlq_n_u16(top.val[2], 3); top.val[3] = vshlq_n_u16(top.val[3], 3); vst4q_u16(pred_buf_q3, top); } input += input_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } #endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(neon) static inline void subtract_average_neon(const uint16_t *src, int16_t *dst, int width, int height, int round_offset, const int num_pel_log2) { const uint16_t *const end = src + height * CFL_BUF_LINE; // Round offset is not needed, because NEON will handle the rounding. (void)round_offset; // To optimize the use of the CPU pipeline, we process 4 rows per iteration const int step = 4 * CFL_BUF_LINE; // At this stage, the prediction buffer contains scaled reconstructed luma // pixels, which are positive integer and only require 15 bits. By using // unsigned integer for the sum, we can do one addition operation inside 16 // bits (8 lanes) before having to convert to 32 bits (4 lanes). const uint16_t *sum_buf = src; uint32x4_t sum_32x4 = vdupq_n_u32(0); do { // For all widths, we load, add and combine the data so it fits in 4 lanes. if (width == 4) { const uint16x4_t a0 = vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); } else if (width == 8) { const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); const uint16x8_t a1 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); sum_32x4 = vpadalq_u16(sum_32x4, a0); sum_32x4 = vpadalq_u16(sum_32x4, a1); } else { const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); sum_32x4 = vpadalq_u16(sum_32x4, row0); sum_32x4 = vpadalq_u16(sum_32x4, row1); sum_32x4 = vpadalq_u16(sum_32x4, row2); sum_32x4 = vpadalq_u16(sum_32x4, row3); if (width == 32) { const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); const uint16x8_t row2_1 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); const uint16x8_t row3_1 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); sum_32x4 = vpadalq_u16(sum_32x4, row0_1); sum_32x4 = vpadalq_u16(sum_32x4, row1_1); sum_32x4 = vpadalq_u16(sum_32x4, row2_1); sum_32x4 = vpadalq_u16(sum_32x4, row3_1); } } sum_buf += step; } while (sum_buf < end); // Permute and add in such a way that each lane contains the block sum. 
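// After that reduction the scalar equivalent of the rest of this function
// is, as an illustrative sketch only:
//   avg = (sum + (1 << (num_pel_log2 - 1))) >> num_pel_log2;  // vqrshrn_n_u32
//   dst[i] = (int16_t)src[i] - avg;                           // per pixel
// Lane layout after the pairwise adds: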
// [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] #if AOM_ARCH_AARCH64 sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); #else uint32x4_t flip = vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); sum_32x4 = vaddq_u32(sum_32x4, flip); sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); #endif // Computing the average could be done using scalars, but getting off the NEON // engine introduces latency, so we use vqrshrn. int16x4_t avg_16x4; // Constant propagation makes for some ugly code. switch (num_pel_log2) { case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; case 10: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); break; default: assert(0); } if (width == 4) { do { vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); src += CFL_BUF_LINE; dst += CFL_BUF_LINE; } while (src < end); } else { const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); do { vldsubstq_s16(dst, src, 0, avg_16x8); vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); if (width > 8) { vldsubstq_s16(dst, src, 8, avg_16x8); vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); } if (width == 32) { vldsubstq_s16(dst, src, 16, avg_16x8); vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 24, avg_16x8); vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); } src += step; dst += step; } while (src < end); } } CFL_SUB_AVG_FN(neon) // Saturating negate 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative. // Notes: // * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in // practice, as scaled_luma is the multiplication of two absolute values. // * In the Intel equivalent, elements in a are zeroed out when the // corresponding elements in b are zero. Because vsign is used twice in a // row, with b in the first call becoming a in the second call, there's no // impact from not zeroing out. static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { const int16x4_t mask = vshr_n_s16(b, 15); return veor_s16(vadd_s16(a, mask), mask); } // Saturating negate 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative. // Notes: // * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in // practice, as scaled_luma is the multiplication of two absolute values. // * In the Intel equivalent, elements in a are zeroed out when the // corresponding elements in b are zero. Because vsignq is used twice in a // row, with b in the first call becoming a in the second call, there's no // impact from not zeroing out. 
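/* Arithmetic note for the predict_w* helpers below: abs_alpha_q12 is
 * |alpha_q3| << 9, so vqrdmulh(|ac_q3|, abs_alpha_q12)
 *   = (2 * |ac_q3| * (|alpha_q3| << 9) + (1 << 15)) >> 16
 *   = (|ac_q3| * |alpha_q3| + 32) >> 6,
 * i.e. the rounded Q6 product used by the scalar CfL prediction; vsign_s16 /
 * vsignq_s16 then restore the sign before the DC prediction is added. This
 * only restates the computation performed below. */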
static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { const int16x8_t mask = vshrq_n_s16(b, 15); return veorq_s16(vaddq_s16(a, mask), mask); } static inline int16x4_t predict_w4(const int16_t *pred_buf_q3, int16x4_t alpha_sign, int abs_alpha_q12, int16x4_t dc) { const int16x4_t ac_q3 = vld1_s16(pred_buf_q3); const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3); int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12); return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); } static inline int16x8_t predict_w8(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3); const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3); int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12); return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); } static inline int16x8x2_t predict_w16(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { const int16x8x2_t ac_q3 = vld1q_s16_x2(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t scaled_luma_0 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); const int16x8_t scaled_luma_1 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); int16x8x2_t result; result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); return result; } static inline int16x8x4_t predict_w32(const int16_t *pred_buf_q3, int16x8_t alpha_sign, int abs_alpha_q12, int16x8_t dc) { const int16x8x4_t ac_q3 = vld1q_s16_x4(pred_buf_q3); const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]); const int16x8_t scaled_luma_0 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); const int16x8_t scaled_luma_1 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); const int16x8_t scaled_luma_2 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12); const int16x8_t scaled_luma_3 = vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12); int16x8x4_t result; result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc); result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc); return result; } static inline void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; if (width == 4) { const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); const int16x4_t dc = vdup_n_s16(*dst); do { const int16x4_t pred = predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred))); dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } else { const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); const int16x8_t dc = vdupq_n_s16(*dst); do { if (width == 8) { vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc))); } else if (width == 16) { const int16x8x2_t pred = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), 
vqmovun_s16(pred.val[1]) } }; vst1_u8_x2(dst, predun); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); const uint8x8x4_t predun = { { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } }; vst1_u8_x4(dst, predun); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } } CFL_PREDICT_FN(neon, lbd) #if CONFIG_AV1_HIGHBITDEPTH static inline uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); } static inline uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); } static inline uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { uint16x8x2_t result; result.val[0] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); result.val[1] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); return result; } static inline uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { uint16x8x4_t result; result.val[0] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); result.val[1] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); result.val[2] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0))); result.val[3] = vreinterpretq_u16_s16( vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0))); return result; } static inline void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { const int max = (1 << bd) - 1; const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; if (width == 4) { const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); const int16x4_t dc = vdup_n_s16(*dst); const int16x4_t max_16x4 = vdup_n_s16(max); do { const int16x4_t scaled_luma = predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); vst1_u16(dst, clamp_s16(scaled_luma, max_16x4)); dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } else { const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); const int16x8_t dc = vdupq_n_s16(*dst); const int16x8_t max_16x8 = vdupq_n_s16(max); do { if (width == 8) { const int16x8_t pred = predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); vst1q_u16(dst, clampq_s16(pred, max_16x8)); } else if (width == 16) { const int16x8x2_t pred = predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); vst1q_u16_x2(dst, clamp2q_s16(pred, max_16x8)); } else { const int16x8x4_t pred = predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); vst1q_u16_x4(dst, clamp4q_s16(pred, max_16x8)); } dst += dst_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } } CFL_PREDICT_FN(neon, hbd) #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/arm/compound_convolve_neon.c000066400000000000000000003151551477627663500220730ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/arm/compound_convolve_neon.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" static inline int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t x_filter, const int16x4_t horiz_const) { int16x4_t sum = horiz_const; sum = vmla_lane_s16(sum, s0, x_filter, 0); sum = vmla_lane_s16(sum, s1, x_filter, 1); sum = vmla_lane_s16(sum, s2, x_filter, 2); sum = vmla_lane_s16(sum, s3, x_filter, 3); // We halved the convolution filter values so -1 from the right shift. return vshr_n_s16(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t x_filter, const int16x8_t horiz_const) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int16x8_t sum = horiz_const; sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0); sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); // We halved the convolution filter values so -1 from the right shift. return vshrq_n_s16(sum, ROUND0_BITS - 1); } static inline void dist_wtd_convolve_2d_horiz_neon( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; if (w == 4) { // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); src_ptr += 2; do { uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); __builtin_prefetch(dst_ptr); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } else { // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. 
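// All of the subpel filter coefficients used here are even (see the
// comment above), so halving them loses no precision; it keeps the
// horizontal accumulator comfortably within int16 range and is compensated
// for by shifting right by (ROUND0_BITS - 1) instead of ROUND0_BITS in
// convolve8_8_2d_h().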
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 do { const uint8_t *s; int16_t *d = dst_ptr; int width = w; __builtin_prefetch(src_ptr + 0 * src_stride); __builtin_prefetch(src_ptr + 1 * src_stride); __builtin_prefetch(src_ptr + 2 * src_stride); __builtin_prefetch(src_ptr + 3 * src_stride); __builtin_prefetch(src_ptr + 4 * src_stride); __builtin_prefetch(src_ptr + 5 * src_stride); __builtin_prefetch(src_ptr + 6 * src_stride); __builtin_prefetch(src_ptr + 7 * src_stride); uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s = src_ptr + 7; __builtin_prefetch(dst_ptr + 0 * dst_stride); __builtin_prefetch(dst_ptr + 1 * dst_stride); __builtin_prefetch(dst_ptr + 2 * dst_stride); __builtin_prefetch(dst_ptr + 3 * dst_stride); __builtin_prefetch(dst_ptr + 4 * dst_stride); __builtin_prefetch(dst_ptr + 5 * dst_stride); __builtin_prefetch(dst_ptr + 6 * dst_stride); __builtin_prefetch(dst_ptr + 7 * dst_stride); do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const); int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const); int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const); int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, horiz_const); int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, horiz_const); int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, horiz_const); int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, horiz_const); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 8 * src_stride; dst_ptr += 8 * dst_stride; height -= 8; } while (height > 8); #endif // AOM_ARCH_AARCH64 do { const uint8_t *s; int16_t *d = dst_ptr; int width = w; uint8x8_t t0 = vld1_u8(src_ptr); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 s = src_ptr + 8; __builtin_prefetch(dst_ptr); do { t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 
a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); vst1q_s16(d, d0); s0 = s8; s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, x_filter_ptr, im_h, w); if (clamped_y_taps == 6) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } else { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } } static inline void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = vget_low_u16(vmlal_u8(round_offset_vec, 
s0, shift_by_bits)); uint16x4_t d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); uint16x4_t d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); uint16x4_t d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_dist_wtd_avg_4x4( dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; height -= 4; } while (height != 0); } else { do { const uint8_t *s = src; CONV_BUF_TYPE *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, vreinterpretq_s16_u16(round_offset_vec), &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; height -= 4; } while (height != 0); } } static inline void dist_wtd_convolve_2d_copy_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); uint16x4_t d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); uint16x4_t d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); uint16x4_t d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; height -= 4; } while (height != 0); } else { do { const uint8_t *s = src; CONV_BUF_TYPE *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, 
shift_by_bits); uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, vreinterpretq_s16_u16(round_offset_vec), &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; height -= 4; } while (height != 0); } } static inline void dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, int w, int h, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); uint16x4_t d1 = vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); uint16x4_t d2 = vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); uint16x4_t d3 = vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { do { const uint8_t *s = src; CONV_BUF_TYPE *d = dst; int width = w; do { uint8x8_t s0, s1, s2, s3; load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, ConvolveParams *conv_params) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( src, src_stride, dst8, dst8_stride, w, h, conv_params); } else { dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w, h, conv_params); } } else { dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params); } } static inline uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t x_filter, const int16x4_t round_offset) { int16x4_t sum = vmul_lane_s16(s0, x_filter, 0); sum = vmla_lane_s16(sum, s1, x_filter, 1); sum = vmla_lane_s16(sum, s2, x_filter, 2); sum = vmla_lane_s16(sum, s3, x_filter, 3); // We halved the convolution filter values so -1 from the right shift. 
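// vrsra_n_s16 applies a rounding right shift by (ROUND0_BITS - 1) to 'sum' and accumulates the result onto 'round_offset' in a single instruction.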
int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpret_u16_s16(res); } static inline uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t x_filter, const int16x8_t round_offset) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0); sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); // We halved the convolution filter values so -1 from the right shift. int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpretq_u16_s16(res); } static inline void dist_wtd_convolve_x_dist_wtd_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; // Horizontal filter. const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; uint8_t *dst8_ptr = dst8; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); src_ptr += 2; do { uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); __builtin_prefetch(dst_ptr); __builtin_prefetch(dst8_ptr); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01; compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; dst8_ptr += dst8_stride; } while (--height != 0); } else { // Filter values are even, so halve to reduce intermediate precision reqs. 
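// Wider blocks use the full 8-tap kernel; the taps are likewise halved, and the reduced rounding shift inside convolve8_8_x compensates.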
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 while (height >= 8) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d + 4 * dst_stride); __builtin_prefetch(d + 5 * dst_stride); __builtin_prefetch(d + 6 * dst_stride); __builtin_prefetch(d + 7 * dst_stride); s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, round_offset_vec); transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, bck_offset, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += 8 * src_stride; dst_ptr += 8 * dst_stride; dst8_ptr += 8 * dst8_stride; height -= 8; } #endif // AOM_ARCH_AARCH64 while (height > 0) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 
a5 a6 a7 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); __builtin_prefetch(d); s += 8; do { t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); s0 = s8; s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; dst8_ptr += dst8_stride; height--; } } } static inline void dist_wtd_convolve_x_avg_neon( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // Horizontal filter. const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; uint8_t *dst8_ptr = dst8; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); src_ptr += 2; do { uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); __builtin_prefetch(dst_ptr); __builtin_prefetch(dst8_ptr); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; dst8_ptr += dst8_stride; } while (--height != 0); } else { // Filter values are even, so halve to reduce intermediate precision reqs. 
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 while (height >= 8) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d + 4 * dst_stride); __builtin_prefetch(d + 5 * dst_stride); __builtin_prefetch(d + 6 * dst_stride); __builtin_prefetch(d + 7 * dst_stride); s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, round_offset_vec); transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += 8 * src_stride; dst_ptr += 8 * dst_stride; dst8_ptr += 8 * dst8_stride; height -= 8; } #endif // AOM_ARCH_AARCH64 while (height > 0) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 int16x8_t s0 = 
vreinterpretq_s16_u16(vmovl_u8(t0)); __builtin_prefetch(d); s += 8; do { t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); s0 = s8; s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; dst8_ptr += dst8_stride; height--; } } } static inline void dist_wtd_convolve_x_neon( const uint8_t *src, int src_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // Horizontal filter. const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); src_ptr += 2; do { uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); __builtin_prefetch(dst_ptr); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(round_offset_vec)); vst1_u16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } else { // Filter values are even, so halve to reduce intermediate precision reqs. 
const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 while (height >= 8) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d + 4 * dst_stride); __builtin_prefetch(d + 5 * dst_stride); __builtin_prefetch(d + 6 * dst_stride); __builtin_prefetch(d + 7 * dst_stride); s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, round_offset_vec); transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 8 * src_stride; dst_ptr += 8 * dst_stride; height -= 8; } #endif // AOM_ARCH_AARCH64 while (height > 0) { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int width = w; uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); __builtin_prefetch(d); s = src_ptr + 8; do { t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 uint16x8_t d0 = 
convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, round_offset_vec); vst1q_u16(d, d0); s0 = s8; s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; height--; } } } void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } } else { dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } } static inline uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const int16x4_t round_offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); // Filter values at indices 0 and 7 are 0. int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); // We halved the convolution filter values so -1 from the right shift. int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpret_u16_s16(res); } static inline uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int16x8_t round_offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); // Filter values at indices 0 and 7 are 0. int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1); sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2); // We halved the convolution filter values so -1 from the right shift. 
int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpretq_u16_s16(res); } static inline void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; d_u8 += 4 * dst8_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0 = vld1_u16(d); uint8x8_t d01; compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; d_u8 += dst8_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; dst8_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s 
= src_ptr + (5 * src_stride); CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4; load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); do { #if AOM_ARCH_AARCH64 uint8x8_t t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); uint16x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); uint16x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d6 = convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, bck_offset, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); d_u8 += 4 * dst8_stride; s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; width -= 8; } while (width != 0); } } static inline void dist_wtd_convolve_y_6tap_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int 
dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; d_u8 += 4 * dst8_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t dd0 = vld1_u16(d); uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; d_u8 += dst8_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; dst8_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s = src_ptr + (5 * src_stride); CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4; load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); do { #if AOM_ARCH_AARCH64 uint8x8_t t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s9 = 
vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); uint16x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); uint16x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d6 = convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); d_u8 += 4 * dst8_stride; s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; width -= 8; } while (width != 0); } } static inline void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, int src_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int height = h; uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = 
load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, vget_low_s16(round_offset_vec)); vst1_u16(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s = src_ptr + (5 * src_stride); CONV_BUF_TYPE *d = dst_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4; load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); do { #if AOM_ARCH_AARCH64 uint8x8_t t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); uint16x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); uint16x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d4 = convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d5 = convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d6 = convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); uint16x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; vst1q_u16(d, d0); s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; width -= 8; } while (width != 0); } } static inline 
uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int16x4_t round_offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0); sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); // We halved the convolution filter values so -1 from the right shift. int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpret_u16_s16(res); } static inline uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int16x8_t round_offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0); sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); // We halved the convolution filter values so -1 from the right shift. int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); return vreinterpretq_u16_s16(res); } static inline void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); 
int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, vget_low_s16(round_offset_vec)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d_u8 + 0 * dst8_stride); __builtin_prefetch(d_u8 + 1 * dst8_stride); __builtin_prefetch(d_u8 + 2 * dst8_stride); __builtin_prefetch(d_u8 + 3 * dst8_stride); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; d_u8 += 4 * dst8_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); __builtin_prefetch(d); uint16x4_t dd0 = vld1_u16(d); uint8x8_t d01; compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; d_u8 += dst8_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; dst8_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); __builtin_prefetch(s + 4 * src_stride); __builtin_prefetch(s + 5 * src_stride); __builtin_prefetch(s + 6 * src_stride); __builtin_prefetch(s + 7 * src_stride); uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = 
vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); __builtin_prefetch(dst_ptr + 0 * dst_stride); __builtin_prefetch(dst_ptr + 1 * dst_stride); __builtin_prefetch(dst_ptr + 2 * dst_stride); __builtin_prefetch(dst_ptr + 3 * dst_stride); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter, round_offset_vec); __builtin_prefetch(d + 0 * dst8_stride); __builtin_prefetch(d + 1 * dst8_stride); __builtin_prefetch(d + 2 * dst8_stride); __builtin_prefetch(d + 3 * dst8_stride); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, bck_offset, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); d_u8 += 4 * dst8_stride; s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); __builtin_prefetch(dst_ptr); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; __builtin_prefetch(d); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; width -= 8; } while (width != 0); } } static inline void dist_wtd_convolve_y_8tap_avg_neon( const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, const int dst8_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - 
COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, vget_low_s16(round_offset_vec)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d_u8 + 0 * dst8_stride); __builtin_prefetch(d_u8 + 1 * dst8_stride); __builtin_prefetch(d_u8 + 2 * dst8_stride); __builtin_prefetch(d_u8 + 3 * dst8_stride); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01, d23; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; d_u8 += 4 * dst8_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); __builtin_prefetch(d); uint16x4_t dd0 = vld1_u16(d); 
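// compute_basic_avg_4x1 averages the new prediction d0 with the value dd0 already in the compound buffer, removes the compound rounding offset and narrows the result back to 8-bit output pixels.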
uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; d_u8 += dst8_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; dst8_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); __builtin_prefetch(s + 4 * src_stride); __builtin_prefetch(s + 5 * src_stride); __builtin_prefetch(s + 6 * src_stride); __builtin_prefetch(s + 7 * src_stride); uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); __builtin_prefetch(dst_ptr + 0 * dst_stride); __builtin_prefetch(dst_ptr + 1 * dst_stride); __builtin_prefetch(dst_ptr + 2 * dst_stride); __builtin_prefetch(dst_ptr + 3 * dst_stride); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter, round_offset_vec); __builtin_prefetch(d + 0 * dst8_stride); __builtin_prefetch(d + 1 * dst8_stride); __builtin_prefetch(d + 2 * dst8_stride); __builtin_prefetch(d + 3 * dst8_stride); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; uint16x8_t dd4, dd5, dd6, dd7; load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); d_u8 += 4 * dst8_stride; 
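        // Carry the last seven source rows forward so that the next loop
        // iteration only needs to load eight new rows.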
s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); __builtin_prefetch(dst_ptr); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; __builtin_prefetch(d); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; width -= 8; } while (width != 0); } } static inline void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr, int src_stride, int w, int h, const int16x8_t y_filter, ConvolveParams *conv_params) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; int width = w; if (w == 4 || h == 4) { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s + 0 * src_stride); t1 = load_unaligned_u8_4x1(s + 1 * src_stride); t2 = load_unaligned_u8_4x1(s + 2 * src_stride); t3 = load_unaligned_u8_4x1(s + 3 * src_stride); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, vget_low_s16(round_offset_vec)); uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, vget_low_s16(round_offset_vec)); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; 
s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 t0 = load_unaligned_u8_4x1(s); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, vget_low_s16(round_offset_vec)); vst1_u16(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 4; dst_ptr += 4; width -= 4; } while (width != 0); } else { do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int height = h; __builtin_prefetch(s + 0 * src_stride); __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); __builtin_prefetch(s + 4 * src_stride); __builtin_prefetch(s + 5 * src_stride); __builtin_prefetch(s + 6 * src_stride); __builtin_prefetch(s + 7 * src_stride); uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); __builtin_prefetch(dst_ptr + 0 * dst_stride); __builtin_prefetch(dst_ptr + 1 * dst_stride); __builtin_prefetch(dst_ptr + 2 * dst_stride); __builtin_prefetch(dst_ptr + 3 * dst_stride); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, y_filter, round_offset_vec); uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, y_filter, round_offset_vec); uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, y_filter, round_offset_vec); store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8 * src_stride; d += 8 * dst_stride; height -= 8; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); __builtin_prefetch(dst_ptr); uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; vst1q_u16(d, d0); s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; width -= 8; } while (width != 0); } } void 
av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); // Vertical filter. const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); // Filter values are even, so downshift by 1 to reduce intermediate // precision requirements. const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); const int vert_offset = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - (vert_offset * src_stride); if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter, conv_params); } else { dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter, conv_params); } } else { dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h, y_filter, conv_params); } } else { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8, dst8_stride, w, h, y_filter, conv_params); } else { dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8, dst8_stride, w, h, y_filter, conv_params); } } else { dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter, conv_params); } } } aom-3.12.1/av1/common/arm/compound_convolve_neon.h000066400000000000000000001200151477627663500220650ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ #define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ #include #include "av1/common/convolve.h" #include "av1/common/enums.h" #include "av1/common/filter.h" static inline void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t round_offset, uint8x8_t *d0_u8) { uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset); blend0 = vmlal_n_u16(blend0, d0, bck_offset); uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS); int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset); int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0)); *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); } static inline void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0, const int16x4_t round_offset, uint8x8_t *d0_u8) { uint16x4_t avg0 = vhadd_u16(dd0, d0); int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset); int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0)); *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS); } static inline void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x8_t round_offset, uint8x8_t *d0_u8) { uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset); blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset); uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset); blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset); uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS), vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS)); int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); } static inline void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0, const int16x8_t round_offset, uint8x8_t *d0_u8) { uint16x8_t avg0 = vhaddq_u16(dd0, d0); int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); } static inline void compute_dist_wtd_avg_4x4( uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) { uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset); blend0 = vmlal_n_u16(blend0, d0, bck_offset); uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset); blend1 = vmlal_n_u16(blend1, d1, bck_offset); uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset); blend2 = vmlal_n_u16(blend2, d2, bck_offset); uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset); blend3 = vmlal_n_u16(blend3, d3, bck_offset); uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS); uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS); uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS); uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS); int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1)); int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3)); dst_01 = vsubq_s16(dst_01, round_offset); dst_23 = vsubq_s16(dst_23, round_offset); *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS); *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); } static inline void compute_basic_avg_4x4(uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3, uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t 
*d23_u8) { uint16x4_t avg0 = vhadd_u16(dd0, d0); uint16x4_t avg1 = vhadd_u16(dd1, d1); uint16x4_t avg2 = vhadd_u16(dd2, d2); uint16x4_t avg3 = vhadd_u16(dd3, d3); int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1)); int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3)); dst_01 = vsubq_s16(dst_01, round_offset); dst_23 = vsubq_s16(dst_23, round_offset); *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS); *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); } static inline void compute_dist_wtd_avg_8x4( uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8, uint8x8_t *d2_u8, uint8x8_t *d3_u8) { uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset); blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset); uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset); blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset); uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset); blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset); uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset); blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset); uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset); blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset); uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset); blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset); uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset); blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset); uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset); blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset); uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS), vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS)); uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS), vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS)); uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS), vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS)); uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS), vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS)); int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); } static inline void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8, uint8x8_t *d2_u8, uint8x8_t *d3_u8) { uint16x8_t avg0 = vhaddq_u16(dd0, d0); uint16x8_t avg1 = vhaddq_u16(dd1, d1); uint16x8_t avg2 = vhaddq_u16(dd2, d2); uint16x8_t avg3 = vhaddq_u16(dd3, d3); int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); 
int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); } static inline uint16x4_t convolve6_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = offset_const; // Filter values at indices 0 and 7 are 0. sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline uint16x8_t convolve6_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = offset_const; // Filter values at indices 0 and 7 are 0. sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } static inline void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); src_ptr += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s5, s6, s7, s8; load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x4_t d2 = convolve6_4_2d_v(s2, s3, 
s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s5 = vld1_s16(src_ptr); uint16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01_u8; compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x8_t d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x8_t d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vld1q_s16(s); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; w -= 8; } while (w != 0); } } static inline void dist_wtd_convolve_2d_vert_6tap_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); src_ptr += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s5, s6, s7, s8; 
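      // Load four more rows of the horizontally filtered intermediate block
      // so that four output rows can be computed per loop iteration.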
load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s5 = vld1_s16(src_ptr); uint16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x8_t d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x8_t d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vld1q_s16(s); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; w -= 8; } while (w != 0); } } static inline void dist_wtd_convolve_2d_vert_6tap_neon( int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); src_ptr += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s5, s6, s7, s8; load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = 
convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s5 = vld1_s16(src_ptr); uint16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); vst1_u16(dst_ptr, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); uint16x8_t d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); uint16x8_t d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vld1q_s16(s); uint16x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); vst1q_u16(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t convolve8_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = offset_const; sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline uint16x8_t convolve8_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int32x4_t offset_const) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = offset_const; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s6), y_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } static inline void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src_ptr += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s7, s8, s9, s10; load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s7 = vld1_s16(src_ptr); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01_u8; compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, 
s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vld1q_s16(s); uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; w -= 8; } while (w != 0); } } static inline void dist_wtd_convolve_2d_vert_8tap_avg_neon( int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src_ptr += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s7, s8, s9, s10; load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s7 = vld1_s16(src_ptr); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t dd0 = vld1_u16(dst_ptr); uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // 
AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); d_u8 += 4 * dst8_stride; s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vld1q_s16(s); uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t dd0 = vld1q_u16(d); uint8x8_t d0_u8; compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); vst1_u8(d_u8, d0_u8); d_u8 += dst8_stride; s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; dst8_ptr += 8; w -= 8; } while (w != 0); } } static inline void dist_wtd_convolve_2d_vert_8tap_neon( int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); CONV_BUF_TYPE *dst_ptr = conv_params->dst; const int dst_stride = conv_params->dst_stride; if (w == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src_ptr += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s7, s8, s9, s10; load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s7 = vld1_s16(src_ptr); uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); vst1_u16(dst_ptr, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { int16_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int height = h; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = 
convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_const); uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_const); uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_const); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vld1q_s16(s); uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_const); vst1q_u16(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } #endif // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ aom-3.12.1/av1/common/arm/compound_convolve_neon_dotprod.c000066400000000000000000000636001477627663500236210ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/mem_neon.h" #include "av1/common/arm/compound_convolve_neon.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; static inline int16x4_t convolve4_4_2d_h(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, const uint8x16_t permute_tbl) { // Clamp sample range to [-128, 127] for 8-bit signed dot product. int8x16_t clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); // Accumulate dot product into 'correction' to account for range clamp. int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum[2]; // Clamp sample range to [-128, 127] for 8-bit signed dot product. clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); // Permute samples ready for dot product. 
*/ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); // Accumulate dot product into 'correction' to account for range clamp. // First 4 output values. sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); // Second 4 output values. sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } static inline void dist_wtd_convolve_2d_horiz_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; // Dot product constants and other shims. const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts // - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Halve the total because we will halve the filter values. const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8x16_t range_limit = vdupq_n_u8(128); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); src_ptr += 2; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { uint8x16_t s0 = vld1q_u8(src_ptr); int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } else { const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
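    // The halving is undone in convolve8_8_2d_h, which shifts right by
    // ROUND0_BITS - 1 instead of ROUND0_BITS.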
const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, permute_tbl); int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, permute_tbl); int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, permute_tbl); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } void av1_dist_wtd_convolve_2d_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride, x_filter_ptr, im_h, w); if (clamped_y_taps == 6) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } else { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } } static inline uint16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, const uint8x16_t permute_tbl) { // Clamp sample range to [-128, 127] for 8-bit signed dot product. int8x16_t clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); // Accumulate dot product into 'correction' to account for range clamp. int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); // We halved the convolution filter values so -1 from the right shift. return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); } static inline uint16x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t x_filter, const int32x4_t correction, const uint8x16_t range_limit, const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum[2]; // Clamp sample range to [-128, 127] for 8-bit signed dot product. clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); // Permute samples ready for dot product. */ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); // Accumulate dot product into 'correction' to account for range clamp. // First 4 output values. sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); // Second 4 output values. sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), vshrn_n_s32(sum[1], ROUND0_BITS - 1)); return vreinterpretq_u16_s16(res); } static inline void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; // Horizontal filter. const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); // Fold round_offset into the dot-product filter correction constant. The // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we will halve the filter values. 
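  // Breaking the correction term down: subtracting 128 from every sample
  // removes 128 * (sum of filter taps) = 128 << FILTER_BITS from each dot
  // product, so that amount is added back; round_offset is pre-scaled by
  // ROUND0_BITS so that it survives the ROUND0 shift; and the final
  // 1 << (ROUND0_BITS - 1) supplies the rounding bias for the non-rounding
  // shift. The total is halved to match the halved filter values.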
int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + (1 << (ROUND0_BITS - 1))) / 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; uint8_t *dst8_ptr = dst8; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); src_ptr += 2; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x4_t d1 = convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x4_t d2 = convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x4_t d3 = convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; dst8_ptr += 4 * dst8_stride; height -= 4; } while (height != 0); } else { const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x8_t d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x8_t d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x8_t d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; dst8_ptr += 4 * dst8_stride; height -= 4; } while (height != 0); } } static inline void dist_wtd_convolve_x_avg_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // Horizontal filter. 
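// av1_get_interp_filter_subpel_kernel returns the tap array for the
// fractional phase subpel_x_qn & SUBPEL_MASK; the horizontal phase is
// constant across the block, so the same kernel is reused for every row.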
const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); // Fold round_offset into the dot-product filter correction constant. The // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we will halve the filter values. int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + (1 << (ROUND0_BITS - 1))) / 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; uint8_t *dst8_ptr = dst8; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); src_ptr += 2; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x4_t d1 = convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x4_t d2 = convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x4_t d3 = convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; dst8_ptr += 4 * dst8_stride; height -= 4; } while (height != 0); } else { const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
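// (The taps are even and sum to 1 << FILTER_BITS (128), so halving them is
// exact, keeps every tap within int8 range for the dot-product instructions,
// and is paid back by shifting right by one bit less at the end.)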
const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; uint8_t *d_u8 = dst8_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x8_t d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x8_t d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x8_t d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; dst8_ptr += 4 * dst8_stride; height -= 4; } while (height != 0); } } static inline void dist_wtd_convolve_x_neon_dotprod( const uint8_t *src, int src_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); // Horizontal filter. const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Dot-product constants and other shims. const uint8x16_t range_limit = vdupq_n_u8(128); // Fold round_offset into the dot-product filter correction constant. The // additional shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we will halve the filter values. int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + (round_offset << ROUND0_BITS) + (1 << (ROUND0_BITS - 1))) / 2); const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - horiz_offset; CONV_BUF_TYPE *dst_ptr = conv_params->dst; int dst_stride = conv_params->dst_stride; int height = h; if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); // 4-tap filters are used for blocks having width <= 4. // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); src_ptr += 2; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x4_t d1 = convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x4_t d2 = convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x4_t d3 = convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); // Filter values are even, so halve to reduce intermediate precision reqs.
const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); do { const uint8_t *s = src_ptr; CONV_BUF_TYPE *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); uint16x8_t d1 = convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); uint16x8_t d2 = convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); uint16x8_t d3 = convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_dist_wtd_convolve_x_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } } else { dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } } aom-3.12.1/av1/common/arm/compound_convolve_neon_i8mm.c000066400000000000000000001043421477627663500230170ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include <assert.h> #include "aom_dsp/arm/mem_neon.h" #include "av1/common/arm/compound_convolve_neon.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { // clang-format off 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 // clang-format on }; static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples, const int8x16_t x_filter, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(horiz_const, permuted_samples, x_filter); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples, const int8x16_t x_filter, const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply.
// { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(horiz_const, permuted_samples[0], x_filter); int32x4_t sum4567 = vusmmlaq_s32(horiz_const, permuted_samples[1], x_filter); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void dist_wtd_convolve_2d_horiz_6tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve6_4_2d_h(s0, x_filter, permute_tbl, horiz_const); int16x4_t d1 = convolve6_4_2d_h(s1, x_filter, permute_tbl, horiz_const); int16x4_t d2 = convolve6_4_2d_h(s2, x_filter, permute_tbl, horiz_const); int16x4_t d3 = convolve6_4_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { uint8x16_t s0 = vld1q_u8(src_ptr); int16x4_t d0 = convolve6_4_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve6_8_2d_h(s0, x_filter, permute_tbl, horiz_const); int16x8_t d1 = convolve6_8_2d_h(s1, x_filter, permute_tbl, horiz_const); int16x8_t d2 = convolve6_8_2d_h(s2, x_filter, permute_tbl, horiz_const); int16x8_t d3 = convolve6_8_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve6_8_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } 
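// Note on the matrix-multiply kernels above: vusmmlaq_s32 multiplies a 2x8
// matrix of unsigned 8-bit samples by an 8x2 matrix of signed 8-bit filter
// taps, accumulating a 2x2 block of int32 results. kMatMulPermuteTbl offsets
// the two sample rows by two pixels and the two filter columns are staggered
// by one pixel, so the four int32 lanes come out as four consecutive 6-tap
// outputs - one instruction per four pixels. An 8-tap filter staggered this
// way would need nine coefficient rows, which is why the 8-tap kernels below
// use the dot-product (vusdot) form with two dot products per four outputs.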
static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t x_filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { uint8x16_t permuted_samples[3]; int32x4_t sum[2]; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); // First 4 output values. sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0); sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); // Second 4 output values. sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0); sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), vshrn_n_s32(sum[1], ROUND0_BITS - 1)); } static inline void dist_wtd_convolve_2d_horiz_8tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, const int16_t *x_filter_ptr, const int im_h, int w) { const int bd = 8; // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } void av1_dist_wtd_convolve_2d_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { assert(w % 4 == 0); assert(h % 4 == 0); DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_x_taps = x_filter_taps < 6 ? 
6 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (clamped_x_taps == 6) { dist_wtd_convolve_2d_horiz_6tap_neon_i8mm(src_ptr, src_stride, im_block, im_stride, x_filter_ptr, im_h, w); } else { dist_wtd_convolve_2d_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, im_stride, x_filter_ptr, im_h, w); } if (clamped_y_taps == 6) { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } else { if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } else { dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, w); } } else { dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, y_filter, h, w); } } } static inline uint16x4_t convolve6_4_x(uint8x16_t samples, const int8x16_t x_filter, const uint8x16_t permute_tbl, const int32x4_t round_offset) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(round_offset, permuted_samples, x_filter); // We halved the convolution filter values so -1 from the right shift. return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); } static inline uint16x8_t convolve6_8_x(uint8x16_t samples, const int8x16_t x_filter, const uint8x16x2_t permute_tbl, const int32x4_t round_offset) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(round_offset, permuted_samples[0], x_filter); int32x4_t sum4567 = vusmmlaq_s32(round_offset, permuted_samples[1], x_filter); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. 
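// vshrn_n_s32 is a truncating (non-rounding) narrowing shift; the rounding
// bias 1 << ((ROUND0_BITS - 1) - 1) has already been folded into
// 'round_offset' (round_offset_shim in the callers), which with the usual
// constants (FILTER_BITS == 7, ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7)
// works out as (6144 << 2) + 2 = 24578. Unlike the SDOT-based path, no
// 128 * 128 range-clamp term is needed because vusmmlaq_s32 consumes
// unsigned 8-bit samples directly.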
int16x8_t res = vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); return vreinterpretq_u16_s16(res); } static inline uint16x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t x_filter, const uint8x16x3_t permute_tbl, const int32x4_t round_offset) { uint8x16_t permuted_samples[3]; int32x4_t sum[2]; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); // First 4 output values. sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0); sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); // Second 4 output values. sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0); sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), vshrn_n_s32(sum[1], ROUND0_BITS - 1)); return vreinterpretq_u16_s16(res); } static inline void dist_wtd_convolve_x_dist_wtd_avg_6tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr, const uint16_t fwd_offset, const uint16_t bck_offset) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. 
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23_u8); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint16_t *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } } static inline void dist_wtd_convolve_x_dist_wtd_avg_8tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr, const uint16_t fwd_offset, const uint16_t bck_offset) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); do { const uint8_t *s = src; uint16_t *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } static inline void dist_wtd_convolve_x_avg_6tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. 
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x4_t dd0, dd1, dd2, dd3; load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d01_u8, d23_u8; compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01_u8); store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23_u8); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint16_t *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } } static inline void dist_wtd_convolve_x_avg_8tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, uint8_t *dst8, int dst8_stride, int w, int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); do { const uint8_t *s = src; uint16_t *d = dst; uint8_t *d_u8 = dst8; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); uint16x8_t dd0, dd1, dd2, dd3; load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); s += 8; d += 8; d_u8 += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; dst8 += 4 * dst8_stride; h -= 4; } while (h != 0); } static inline void dist_wtd_convolve_x_6tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. 
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve6_4_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x4_t d1 = convolve6_4_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x4_t d2 = convolve6_4_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x4_t d3 = convolve6_4_x(s3, x_filter, permute_tbl, round_offset_shim); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint16_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, round_offset_shim); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } } static inline void dist_wtd_convolve_x_8tap_neon_i8mm( const uint8_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { assert(w % 4 == 0); assert(h % 4 == 0); const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int32x4_t round_offset_shim = vdupq_n_s32( (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); do { const uint8_t *s = src; uint16_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); uint16x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); uint16x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); uint16x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } void av1_dist_wtd_convolve_x_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); src -= (SUBPEL_TAPS / 2 - 1); if (conv_params->do_average) { if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { if (filter_taps < 8) { dist_wtd_convolve_x_dist_wtd_avg_6tap_neon_i8mm( src + 1, src_stride, conv_params->dst, conv_params->dst_stride, dst8, dst8_stride, w, h, x_filter_ptr, conv_params->fwd_offset, conv_params->bck_offset); return; } dist_wtd_convolve_x_dist_wtd_avg_8tap_neon_i8mm( src, src_stride, conv_params->dst, conv_params->dst_stride, dst8, dst8_stride, w, h, x_filter_ptr, conv_params->fwd_offset, conv_params->bck_offset); } else { if (filter_taps < 8) { dist_wtd_convolve_x_avg_6tap_neon_i8mm( src + 1, src_stride, conv_params->dst, conv_params->dst_stride, dst8, dst8_stride, w, h, x_filter_ptr); return; } dist_wtd_convolve_x_avg_8tap_neon_i8mm(src, src_stride, conv_params->dst, conv_params->dst_stride, dst8, dst8_stride, w, h, x_filter_ptr); } } else { if (filter_taps < 8) { dist_wtd_convolve_x_6tap_neon_i8mm(src + 1, src_stride, conv_params->dst, conv_params->dst_stride, w, h, x_filter_ptr); return; } dist_wtd_convolve_x_8tap_neon_i8mm(src, src_stride, conv_params->dst, conv_params->dst_stride, w, h, x_filter_ptr); } } aom-3.12.1/av1/common/arm/convolve_neon.c000066400000000000000000002131261477627663500201620ustar00rootroot00000000000000/* * * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <arm_neon.h> #include <assert.h> #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/arm/convolve_neon.h" static inline int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t horiz_const) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum = horiz_const; sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3); sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0); sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1); sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2); sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3); return vqrshrn_n_s32(sum, FILTER_BITS); } static inline void convolve_x_sr_12tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right // shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS.
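// With the usual ROUND0_BITS == 3 and FILTER_BITS == 7 the shim below is
// 1 << 2 = 4, and the single vqrshrn_n_s32(sum, FILTER_BITS) in
// convolve12_4_x stands in for a rounding shift by 3 followed by one by 4.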
const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); #if AOM_ARCH_AARCH64 do { const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; int width = w; uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); s += 11; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t d0 = convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d1 = convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d2 = convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d3 = convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, x_filter_0_7, x_filter_8_11, horiz_const); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); store_u8x4_strided_x2(d, dst_stride, d01); store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4; d += 4; width -= 4; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); #else // !AOM_ARCH_AARCH64 do { const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; int width = w; do { uint8x16_t t0 = vld1q_u8(s); int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); int16x4_t s0 = vget_low_s16(tt0); int16x4_t s4 = vget_high_s16(tt0); int16x4_t s8 = vget_low_s16(tt8); int16x4_t s12 = vget_high_s16(tt8); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14 int16x4_t d0 = convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, x_filter_0_7, x_filter_8_11, horiz_const); uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, 
vdup_n_s16(0))); store_u8_4x1(d, dd0); s += 4; d += 4; width -= 4; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); #endif // AOM_ARCH_AARCH64 } static inline uint8x8_t convolve4_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, int16x8_t horiz_const) { int16x8_t sum = horiz_const; sum = vmlaq_lane_s16(sum, s0, filter, 0); sum = vmlaq_lane_s16(sum, s1, filter, 1); sum = vmlaq_lane_s16(sum, s2, filter, 2); sum = vmlaq_lane_s16(sum, s3, filter, 3); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_x_sr_4tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // All filter values are even, halve to reduce intermediate precision // requirements. const int16x4_t filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by // FILTER_BITS - ROUND0_BITS. // The outermost -1 is needed because we will halve the filter values. const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); if (w == 4) { do { uint8x8_t t01[4]; t01[0] = load_unaligned_u8(src_ptr + 0, src_stride); t01[1] = load_unaligned_u8(src_ptr + 1, src_stride); t01[2] = load_unaligned_u8(src_ptr + 2, src_stride); t01[3] = load_unaligned_u8(src_ptr + 3, src_stride); int16x8_t s01[4]; s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); uint8x8_t d01 = convolve4_8_x(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); src_ptr += 2 * src_stride; dst_ptr += 2 * dst_stride; h -= 2; } while (h != 0); } else { do { int width = w; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; do { uint8x8_t t0[4], t1[4]; load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); int16x8_t s0[4], s1[4]; s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); uint8x8_t d0 = convolve4_8_x(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); uint8x8_t d1 = convolve4_8_x(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); store_u8_8x2(d, dst_stride, d0, d1); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 2 * src_stride; dst_ptr += 2 * dst_stride; h -= 2; } while (h != 0); } } static inline uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int16x8_t horiz_const) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = horiz_const; sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 
2); sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); return; } const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; src -= horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); if (filter_taps > 8) { convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } if (filter_taps <= 4) { convolve_x_sr_4tap_neon(src + 2, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single // rounding right shift by FILTER_BITS - instead of a first rounding right // shift by ROUND0_BITS, followed by second rounding right shift by // FILTER_BITS - ROUND0_BITS. // The outermost -1 is needed because we will halve the filter values. const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); // Filter values are even so halve to reduce precision requirements. const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 while (h >= 8) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int width = w; const uint8_t *s = src + 7; uint8_t *d = dst; __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); __builtin_prefetch(d + 2 * dst_stride); __builtin_prefetch(d + 3 * dst_stride); __builtin_prefetch(d + 4 * dst_stride); __builtin_prefetch(d + 5 * dst_stride); __builtin_prefetch(d + 6 * dst_stride); __builtin_prefetch(d + 7 * dst_stride); do { uint8x8_t t8, t9, t10, t11, t12, t13, t14; load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const); uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, 
horiz_const); uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const); uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, horiz_const); uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, horiz_const); uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, horiz_const); uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, horiz_const); transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; width -= 8; } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; } #endif // AOM_ARCH_AARCH64 while (h-- != 0) { uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int width = w; const uint8_t *s = src + 8; uint8_t *d = dst; __builtin_prefetch(d); do { uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); vst1_u8(d, d0); s0 = s8; s += 8; d += 8; width -= 8; } while (width != 0); src += src_stride; dst += dst_stride; } } static inline uint8x8_t convolve4_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter) { int16x8_t sum = vmulq_lane_s16(s0, filter, 0); sum = vmlaq_lane_s16(sum, s1, filter, 1); sum = vmlaq_lane_s16(sum, s2, filter, 2); sum = vmlaq_lane_s16(sum, s3, filter, 3); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_y_sr_4tap_neon(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h, const int16_t *filter_y) { // All filter values are even, halve to reduce intermediate precision // requirements. 
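// In the w == 4 branch below, load_unaligned_u8 packs two 4-pixel rows (one
// src_stride apart) into a single 8-lane vector, so each call to
// convolve4_8_y produces two output rows at once.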
const int16x4_t filter = vshr_n_s16(vld1_s16(filter_y + 2), 1); if (w == 4) { uint8x8_t t01 = load_unaligned_u8(src + 0 * src_stride, src_stride); uint8x8_t t12 = load_unaligned_u8(src + 1 * src_stride, src_stride); int16x8_t s01 = vreinterpretq_s16_u16(vmovl_u8(t01)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); src += 2 * src_stride; do { uint8x8_t t23 = load_unaligned_u8(src + 0 * src_stride, src_stride); uint8x8_t t34 = load_unaligned_u8(src + 1 * src_stride, src_stride); uint8x8_t t45 = load_unaligned_u8(src + 2 * src_stride, src_stride); uint8x8_t t56 = load_unaligned_u8(src + 3 * src_stride, src_stride); int16x8_t s23 = vreinterpretq_s16_u16(vmovl_u8(t23)); int16x8_t s34 = vreinterpretq_s16_u16(vmovl_u8(t34)); int16x8_t s45 = vreinterpretq_s16_u16(vmovl_u8(t45)); int16x8_t s56 = vreinterpretq_s16_u16(vmovl_u8(t56)); uint8x8_t d01 = convolve4_8_y(s01, s12, s23, s34, filter); uint8x8_t d23 = convolve4_8_y(s23, s34, s45, s56, filter); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); s01 = s45; s12 = s56; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { uint8x8_t t0, t1, t2; load_u8_8x3(src, src_stride, &t0, &t1, &t2); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int height = h; const uint8_t *s = src + 3 * src_stride; uint8_t *d = dst; do { uint8x8_t t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t3)); uint8x8_t d0 = convolve4_8_y(s0, s1, s2, s3, filter); uint8x8_t d1 = convolve4_8_y(s1, s2, s3, s4, filter); uint8x8_t d2 = convolve4_8_y(s2, s3, s4, s5, filter); uint8x8_t d3 = convolve4_8_y(s3, s4, s5, s6, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter_0_7) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); // Filter values at indices 0 and 7 are 0. int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); return sum; } static inline uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filters) { const int16x4_t y_filter_lo = vget_low_s16(y_filters); const int16x4_t y_filter_hi = vget_high_s16(y_filters); // Filter values at indices 0 and 7 are 0. int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1); sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2); sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3); sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0); sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1); sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2); // We halved the convolution filter values so -1 from the right shift. 
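// vqrshrun_n_s16 rounds, shifts and saturates to the unsigned 8-bit range in
// a single instruction, so no separate clamp to [0, 255] is needed.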
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_y_sr_6tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t y_filter) { if (w <= 4) { uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); src_ptr += 5 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter); int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter); int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter); // We halved the convolution filter values so -1 from the right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8x4_strided_x2(dst_ptr, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr); int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); // We halved the convolution filter values so -1 from the right shift. 
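// Only the low four lanes of the combined vector carry a result on this
// single-row path; the upper half is zero padding and store_u8_4x1 writes
// just the first four bytes.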
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4; load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t5, t6, t7, t8; load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter); uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter); uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); vst1_u8(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); sum = vmla_lane_s16(sum, s1, filter_lo, 1); sum = vmla_lane_s16(sum, s2, filter_lo, 2); sum = vmla_lane_s16(sum, s3, filter_lo, 3); sum = vmla_lane_s16(sum, s4, filter_hi, 0); sum = vmla_lane_s16(sum, s5, filter_hi, 1); sum = vmla_lane_s16(sum, s6, filter_hi, 2); sum = vmla_lane_s16(sum, s7, filter_hi, 3); return sum; } static inline uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); // We halved the convolution filter values so -1 from the right shift. 
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t y_filter) { if (w <= 4) { uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride); uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride); int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); src_ptr += 7 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); // We halved the convolution filter values so -1 from the right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); store_u8x4_strided_x2(dst_ptr, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr); int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); // We halved the convolution filter values so -1 from the right shift. 
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { do { const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); vst1_u8(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int16x4_t sum; sum = vmul_lane_s16(s0, y_filter_0_3, 0); sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0); sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1); sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2); sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3); // Saturating addition is required for the largest filter taps to avoid // overflow (while staying in 16-bit elements.) 
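// Taps 5 and 6 are the two large centre coefficients of the 12-tap filter,
// so their products are accumulated last with saturating adds; any
// worst-case overshoot past INT16_MAX clamps instead of wrapping.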
sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1)); sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2)); return sum; } static inline uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int16x8_t sum; sum = vmulq_lane_s16(s0, y_filter_0_3, 0); sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0); sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1); sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2); sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3); // Saturating addition is required for the largest filter taps to avoid // overflow (while staying in 16-bit elements.) sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1)); sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve_y_sr_12tap_neon(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); if (w <= 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &t10); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); src_ptr += 11 * src_stride; do { uint8x8_t t11, t12, t13, t14; load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14); int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11))); int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12))); int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13))); int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14))); int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11); int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11); int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11); int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst_ptr, dst_stride, 
d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; int height = h; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &t10); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); s += 11 * src_stride; do { uint8x8_t t11, t12, t13, t14; load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11); uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11); uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11); uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { if (w == 2 || h == 2) { av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int vert_offset = clamped_y_taps / 2 - 1; src -= vert_offset * src_stride; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (y_filter_taps > 8) { convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); return; } // Filter values are even so halve to reduce precision requirements. 
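// Note that only the 4/6/8-tap paths below use halved coefficients; the
// 12-tap path above keeps the original values, shifts by the full
// FILTER_BITS and relies on saturating adds for its wider intermediate
// range.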
const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); if (y_filter_taps <= 4) { convolve_y_sr_4tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); } else if (y_filter_taps == 6) { convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); } else { convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); } } static inline int16x4_t convolve12_4_2d_h( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t horiz_const) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum = horiz_const; sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3); sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0); sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1); sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2); sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3); return vshrn_n_s32(sum, ROUND0_BITS); } static inline void convolve_2d_sr_horiz_12tap_neon( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { const int bd = 8; // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - // which are generally faster than rounding shifts on modern CPUs. 
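// The 1 << (bd + FILTER_BITS - 1) term biases the horizontal sums so the
// int16_t intermediate buffer stays non-negative; the vertical pass removes
// it again via its sub_const (see convolve_2d_sr_vert_*_neon).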
const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); #if AOM_ARCH_AARCH64 do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; uint8x8_t t0, t1, t2, t3; load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); s += 11; do { load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); int16x4_t d0 = convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d1 = convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d2 = convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, x_filter_0_7, x_filter_8_11, horiz_const); int16x4_t d3 = convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, x_filter_0_7, x_filter_8_11, horiz_const); transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); store_s16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4; d += 4; width -= 4; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 4); #endif // AOM_ARCH_AARCH64 do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t t0 = vld1q_u8(s); int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); int16x4_t s0 = vget_low_s16(tt0); int16x4_t s4 = vget_high_s16(tt0); int16x4_t s8 = vget_low_s16(tt1); int16x4_t s12 = vget_high_s16(tt1); int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14 int16x4_t d0 = convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, x_filter_0_7, x_filter_8_11, horiz_const); vst1_s16(d, d0); s += 4; d += 4; width -= 4; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } static inline int16x8_t 
convolve4_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const int16x8_t horiz_const) { int16x8_t sum = vmlaq_lane_s16(horiz_const, s0, filter, 0); sum = vmlaq_lane_s16(sum, s1, filter, 1); sum = vmlaq_lane_s16(sum, s2, filter, 2); sum = vmlaq_lane_s16(sum, s3, filter, 3); // We halved the filter values so -1 from right shift. return vshrq_n_s16(sum, ROUND0_BITS - 1); } static inline void convolve_2d_sr_horiz_4tap_neon( const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { const int bd = 8; // All filter values are even, halve to reduce intermediate precision // requirements. const int16x4_t filter = vshr_n_s16(vld1_s16(filter_x + 2), 1); // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); if (w == 4) { do { uint8x8_t t01[4]; t01[0] = load_unaligned_u8(src + 0, (int)src_stride); t01[1] = load_unaligned_u8(src + 1, (int)src_stride); t01[2] = load_unaligned_u8(src + 2, (int)src_stride); t01[3] = load_unaligned_u8(src + 3, (int)src_stride); int16x8_t s01[4]; s01[0] = vreinterpretq_s16_u16(vmovl_u8(t01[0])); s01[1] = vreinterpretq_s16_u16(vmovl_u8(t01[1])); s01[2] = vreinterpretq_s16_u16(vmovl_u8(t01[2])); s01[3] = vreinterpretq_s16_u16(vmovl_u8(t01[3])); int16x8_t d01 = convolve4_8_2d_h(s01[0], s01[1], s01[2], s01[3], filter, horiz_const); store_s16x4_strided_x2(dst, (int)dst_stride, d01); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 0); } else { do { int width = w; const uint8_t *s = src; int16_t *d = dst; do { uint8x8_t t0[4], t1[4]; load_u8_8x4(s + 0 * src_stride, 1, &t0[0], &t0[1], &t0[2], &t0[3]); load_u8_8x4(s + 1 * src_stride, 1, &t1[0], &t1[1], &t1[2], &t1[3]); int16x8_t s0[4]; s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); int16x8_t s1[4]; s1[0] = vreinterpretq_s16_u16(vmovl_u8(t1[0])); s1[1] = vreinterpretq_s16_u16(vmovl_u8(t1[1])); s1[2] = vreinterpretq_s16_u16(vmovl_u8(t1[2])); s1[3] = vreinterpretq_s16_u16(vmovl_u8(t1[3])); int16x8_t d0 = convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); int16x8_t d1 = convolve4_8_2d_h(s1[0], s1[1], s1[2], s1[3], filter, horiz_const); store_s16_8x2(d, dst_stride, d0, d1); s += 8; d += 8; width -= 8; } while (width != 0); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h > 2); do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x8_t t0[4]; load_u8_8x4(s, 1, &t0[0], &t0[1], &t0[2], &t0[3]); int16x8_t s0[4]; s0[0] = vreinterpretq_s16_u16(vmovl_u8(t0[0])); s0[1] = vreinterpretq_s16_u16(vmovl_u8(t0[1])); s0[2] = vreinterpretq_s16_u16(vmovl_u8(t0[2])); s0[3] = vreinterpretq_s16_u16(vmovl_u8(t0[3])); int16x8_t d0 = convolve4_8_2d_h(s0[0], s0[1], s0[2], s0[3], filter, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h != 0); } } static inline int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int16x8_t 
horiz_const) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t sum = horiz_const; sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); // We halved the convolution filter values so -1 from the right shift. return vshrq_n_s16(sum, ROUND0_BITS - 1); } static inline void convolve_2d_sr_horiz_8tap_neon( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int bd = 8; const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // (The extra -1 is needed because we halved the filter values.) const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); // Filter values are even, so halve to reduce intermediate precision reqs. const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); #if AOM_ARCH_AARCH64 while (height > 8) { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); s += 7; do { load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, horiz_const); int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, horiz_const); int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, horiz_const); int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, horiz_const); int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, horiz_const); int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, horiz_const); int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, x_filter, horiz_const); transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; s6 = s14; s += 8; d += 8; width -= 8; 
} while (width != 0); src_ptr += 8 * src_stride; dst_ptr += 8 * dst_stride; height -= 8; } #endif // AOM_ARCH_AARCH64 do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); do { uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, horiz_const); vst1q_s16(d, d0); s0 = s8; s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (filter_params_x->taps > 8) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_0_7, x_filter_8_11); convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); if (x_filter_taps <= 4) { convolve_2d_sr_horiz_4tap_neon(src_ptr + 2, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } else { convolve_2d_sr_horiz_8tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (clamped_y_taps <= 4) { convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr); } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } } } void 
av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(subpel_x_qn == 8); assert(filter_params_x->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)conv_params; if (w <= 4) { do { uint8x8_t s0_0 = vld1_u8(src); uint8x8_t s0_1 = vld1_u8(src + 1); uint8x8_t s1_0 = vld1_u8(src + src_stride); uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); if (w == 2) { store_u8_2x1(dst + 0 * dst_stride, d0); store_u8_2x1(dst + 1 * dst_stride, d1); } else { store_u8_4x1(dst + 0 * dst_stride, d0); store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 8) { do { uint8x8_t s0_0 = vld1_u8(src); uint8x8_t s0_1 = vld1_u8(src + 1); uint8x8_t s1_0 = vld1_u8(src + src_stride); uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); vst1_u8(dst, d0); vst1_u8(dst + dst_stride, d1); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { do { const uint8_t *src_ptr = src; uint8_t *dst_ptr = dst; int width = w; do { uint8x16_t s0 = vld1q_u8(src_ptr); uint8x16_t s1 = vld1q_u8(src_ptr + 1); uint8x16_t d0 = vrhaddq_u8(s0, s1); vst1q_u8(dst_ptr, d0); src_ptr += 16; dst_ptr += 16; width -= 16; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h != 0); } } void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { assert(subpel_y_qn == 8); assert(filter_params_y->taps == 2); (void)filter_params_y; (void)subpel_y_qn; if (w <= 4) { do { uint8x8_t s0 = load_unaligned_u8_4x1(src); uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride); uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride); uint8x8_t d0 = vrhadd_u8(s0, s1); uint8x8_t d1 = vrhadd_u8(s1, s2); if (w == 2) { store_u8_2x1(dst + 0 * dst_stride, d0); store_u8_2x1(dst + 1 * dst_stride, d1); } else { store_u8_4x1(dst + 0 * dst_stride, d0); store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else if (w == 8) { do { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + src_stride); uint8x8_t s2 = vld1_u8(src + 2 * src_stride); uint8x8_t d0 = vrhadd_u8(s0, s1); uint8x8_t d1 = vrhadd_u8(s1, s2); vst1_u8(dst, d0); vst1_u8(dst + dst_stride, d1); src += 2 * src_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { do { const uint8_t *src_ptr = src; uint8_t *dst_ptr = dst; int height = h; do { uint8x16_t s0 = vld1q_u8(src_ptr); uint8x16_t s1 = vld1q_u8(src_ptr + src_stride); uint8x16_t d0 = vrhaddq_u8(s0, s1); vst1q_u8(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); src += 16; dst += 16; w -= 16; } while (w != 0); } } void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { assert(subpel_x_qn == 8); assert(subpel_y_qn == 8); assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert((conv_params->round_0 + 
conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)filter_params_y; (void)subpel_y_qn; (void)conv_params; uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + 1; int im_stride = w; assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); uint16_t *im = im_block; // Horizontal filter. if (w <= 4) { do { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1)); // Safe to store the whole vector, the im buffer is big enough. vst1_u16(im, sum); src += src_stride; im += im_stride; } while (--im_h != 0); } else { do { const uint8_t *src_ptr = src; uint16_t *im_ptr = im; int width = w; do { uint8x8_t s0 = vld1_u8(src_ptr); uint8x8_t s1 = vld1_u8(src_ptr + 1); uint16x8_t sum = vaddl_u8(s0, s1); vst1q_u16(im_ptr, sum); src_ptr += 8; im_ptr += 8; width -= 8; } while (width != 0); src += src_stride; im += im_stride; } while (--im_h != 0); } im = im_block; // Vertical filter. if (w <= 4) { do { uint16x4_t s0 = vld1_u16(im); uint16x4_t s1 = vld1_u16(im + im_stride); uint16x4_t s2 = vld1_u16(im + 2 * im_stride); uint16x4_t sum0 = vadd_u16(s0, s1); uint16x4_t sum1 = vadd_u16(s1, s2); uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2); uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2); if (w == 2) { store_u8_2x1(dst + 0 * dst_stride, d0); store_u8_2x1(dst + 1 * dst_stride, d1); } else { store_u8_4x1(dst + 0 * dst_stride, d0); store_u8_4x1(dst + 1 * dst_stride, d1); } im += 2 * im_stride; dst += 2 * dst_stride; h -= 2; } while (h != 0); } else { do { uint16_t *im_ptr = im; uint8_t *dst_ptr = dst; int height = h; do { uint16x8_t s0 = vld1q_u16(im_ptr); uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); uint16x8_t sum = vaddq_u16(s0, s1); uint8x8_t d0 = vqrshrn_n_u16(sum, 2); vst1_u8(dst_ptr, d0); im_ptr += im_stride; dst_ptr += dst_stride; } while (--height != 0); im += 8; dst += 8; w -= 8; } while (w != 0); } } aom-3.12.1/av1/common/arm/convolve_neon.h000066400000000000000000000574431477627663500201770ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_

#include <arm_neon.h>

#include "config/aom_config.h"

#include "aom_dsp/arm/mem_neon.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"

static inline int32x4_t convolve12_4_2d_v(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x4_t s6, const int16x4_t s7, const int16x4_t s8,
    const int16x4_t s9, const int16x4_t s10, const int16x4_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0);
  sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2);
  sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3);
  sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0);
  sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1);
  sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2);
  sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3);

  return sum;
}

static inline uint8x8_t convolve12_8_2d_v(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t s6, const int16x8_t s7, const int16x8_t s8,
    const int16x8_t s9, const int16x8_t s10, const int16x8_t s11,
    const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11,
    const int16x8_t sub_const) {
  const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7);
  const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7);

  int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3);

  int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3);

  int16x8_t res =
      vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS),
                   vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS));
  res = vsubq_s16(res, sub_const);

  return vqmovun_s16(res);
}
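// Rounding bookkeeping for the 2D vertical kernels in this header: the
// second stage narrows with a rounding shift of 2 * FILTER_BITS - ROUND0_BITS
// (the horizontal stage already shifted by ROUND0_BITS) and then subtracts
// sub_const = 1 << (bd - 1). With bd = 8, FILTER_BITS = 7 and ROUND0_BITS = 3,
// the horizontal offset of 1 << 14 becomes 1 << 11 per intermediate, is
// scaled by the filter sum (1 << 7) to 1 << 18, and after the >> 11 shift
// leaves exactly 1 << 7 = 128, which is precisely what sub_const removes
// before saturating back to 8 bits.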
static inline void convolve_2d_sr_vert_12tap_neon( int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { const int bd = 8; const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); if (w <= 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); src_ptr += 11 * src_stride; do { int16x4_t s11, s12, s13, s14; load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14); int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11); int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11); int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11); int32x4_t d3 = convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11); int16x8_t dd01 = vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS)); int16x8_t dd23 = vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS)); dd01 = vsubq_s16(dd01, sub_const); dd23 = vsubq_s16(dd23, sub_const); uint8x8_t d01 = vqmovun_s16(dd01); uint8x8_t d23 = vqmovun_s16(dd23); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; int16_t *s = src_ptr; uint8_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); s += 11 * src_stride; do { int16x8_t s11, s12, s13, s14; load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); uint8x8_t d0 = convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11, sub_const); uint8x8_t d1 = convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11, sub_const); uint8x8_t d2 = convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11, sub_const); uint8x8_t d3 = convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11, sub_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); sum = vmlal_lane_s16(sum, s7, 
y_filter_hi, 3); return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } static inline uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int16x8_t sub_const) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); int16x8_t res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); res = vsubq_s16(res, sub_const); return vqmovun_s16(res); } static inline void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16x8_t y_filter) { const int bd = 8; const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); if (w <= 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src_ptr += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s7, s8, s9, s10; load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); int16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s7 = vld1_s16(src_ptr); int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { // Width is a multiple of 8 and height is a multiple of 4. 
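// Wider blocks are processed in 8-wide columns. On AArch64 each inner
// iteration loads four new rows and emits four output rows; the non-AArch64
// path below emits one row at a time, presumably to limit register pressure
// on the smaller AArch32 NEON register file.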
do { int height = h; int16_t *s = src_ptr; uint8_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, sub_const); uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, sub_const); uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, sub_const); uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, sub_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s7 = vld1q_s16(s); uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, sub_const); vst1_u8(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s5 = s6; s6 = s7; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1); sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2); sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3); sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0); sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1); sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2); return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } static inline uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int16x8_t sub_const) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2); int16x8_t res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); res = vsubq_s16(res, sub_const); return vqmovun_s16(res); } static inline void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16x8_t y_filter) { const int bd = 8; const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); if (w <= 4) { int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); src_ptr += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x4_t s5, s6, s7, s8; load_s16_4x4(src_ptr, 
src_stride, &s5, &s6, &s7, &s8); int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter); int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter); int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; #else // !AOM_ARCH_AARCH64 int16x4_t s5 = vld1_s16(src_ptr); int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; src_ptr += src_stride; dst_ptr += dst_stride; h--; #endif // AOM_ARCH_AARCH64 } while (h != 0); } else { // Width is a multiple of 8 and height is a multiple of 4. do { int height = h; int16_t *s = src_ptr; uint8_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { #if AOM_ARCH_AARCH64 int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint8x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); uint8x8_t d1 = convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const); uint8x8_t d2 = convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const); uint8x8_t d3 = convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; #else // !AOM_ARCH_AARCH64 int16x8_t s5 = vld1q_s16(s); uint8x8_t d0 = convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); vst1_u8(d, d0); s0 = s1; s1 = s2; s2 = s3; s3 = s4; s4 = s5; s += src_stride; d += dst_stride; height--; #endif // AOM_ARCH_AARCH64 } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve4_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t y_filter) { int32x4_t sum = vmull_lane_s16(s0, y_filter, 0); sum = vmlal_lane_s16(sum, s1, y_filter, 1); sum = vmlal_lane_s16(sum, s2, y_filter, 2); sum = vmlal_lane_s16(sum, s3, y_filter, 3); return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); } static inline uint8x8_t convolve4_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t y_filter, const int16x8_t sub_const) { int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter, 3); int16x8_t res = vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); res = vsubq_s16(res, sub_const); return vqmovun_s16(res); } static inline void convolve_2d_sr_vert_4tap_neon(int16_t *src_ptr, int src_stride, uint8_t 
*dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter) { const int bd = 8; const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); const int16x4_t filter = vld1_s16(y_filter + 2); if (w == 4) { int16x4_t s0, s1, s2; load_s16_4x3(src_ptr, src_stride, &s0, &s1, &s2); src_ptr += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(src_ptr, src_stride, &s3, &s4, &s5, &s6); int16x4_t d0 = convolve4_4_2d_v(s0, s1, s2, s3, filter); int16x4_t d1 = convolve4_4_2d_v(s1, s2, s3, s4, filter); int16x4_t d2 = convolve4_4_2d_v(s2, s3, s4, s5, filter); int16x4_t d3 = convolve4_4_2d_v(s3, s4, s5, s6, filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; s2 = s6; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { // Width is a multiple of 8 and height is a multiple of 4. do { int height = h; int16_t *s = src_ptr; uint8_t *d = dst_ptr; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); uint8x8_t d0 = convolve4_8_2d_v(s0, s1, s2, s3, filter, sub_const); uint8x8_t d1 = convolve4_8_2d_v(s1, s2, s3, s4, filter, sub_const); uint8x8_t d2 = convolve4_8_2d_v(s2, s3, s4, s5, filter, sub_const); uint8x8_t d3 = convolve4_8_2d_v(s3, s4, s5, s6, filter, sub_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } #endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ aom-3.12.1/av1/common/arm/convolve_neon_dotprod.c000066400000000000000000001741341477627663500217220ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_neon.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; static inline int16x4_t convolve12_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]), vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; // Dot product constants: // Accumulate into 128 << FILTER_BITS to account for range transform. // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1); sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2); return vqrshrn_n_s32(sum, FILTER_BITS); } static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128[2] = { vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) }; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), vqtbl1q_s8(samples_128[1], permute_tbl.val[2]) }; // Dot product constants: // Accumulate into 128 << FILTER_BITS to account for range transform. // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))); int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0); sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); // Narrow and re-pack. int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } static inline void convolve_x_sr_12tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here. 
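  // (A coefficient of 128 cannot be represented in int8_t after the
  // vmovn_s16 narrowing below, which is why the identity kernel is excluded.)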
assert(x_filter_ptr[5] != 128); const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); const int8x16_t filter = vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl); int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl); int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl); int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl); uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h != 0); } else { do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl); uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl); uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl); uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl); store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } } static inline int16x4_t convolve4_4_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); // Dot product constants: // Accumulate into 128 << FILTER_BITS to account for range transform. // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } static inline uint8x8_t convolve4_8_x(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; // Dot product constants: // Accumulate into 128 << FILTER_BITS to account for range transform. 
// Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_x_sr_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) { const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); if (width == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl); int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl); int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl); int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl); // We halved the filter values so -1 from right shift. uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int w = width; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl); uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl); uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl); uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. */ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]), vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; // Dot product constants: // Accumulate into 128 << FILTER_BITS to account for range transform. // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding // right shift by FILTER_BITS - instead of a first rounding right shift by // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - // ROUND0_BITS. 
Halve the total because we halved the filter values. int32x4_t acc = vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2); int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0); sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1); // Narrow and re-pack. int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); return; } const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; src -= horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); if (filter_taps > 8) { convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } if (filter_taps <= 4) { convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); do { int width = w; const uint8_t *s = src; uint8_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl); uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl); uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl); uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, XX, XX, XX, XX // a1: 10, 11, 12, 13, XX, XX, XX, XX // a2: 20, 21, 22, 23, XX, XX, XX, XX // a3: 30, 31, 32, 33, XX, XX, XX, XX // // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; int16x8_t a0123 = vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0]; *b = vreinterpretq_s8_s16(a0123); } static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, int8x8_t a3, int8x16_t *b0, int8x16_t *b1) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, 04, 05, 06, 07 // a1: 10, 11, 12, 13, 14, 15, 16, 17 // a2: 20, 21, 22, 23, 24, 25, 26, 27 // a3: 30, 31, 32, 33, 34, 35, 36, 37 // // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 
36, 07, 17, 27, 37 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); int8x16_t a01 = vzipq_s8(a0q, a1q).val[0]; int8x16_t a23 = vzipq_s8(a2q, a3q).val[0]; int16x8x2_t a0123 = vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)); *b0 = vreinterpretq_s8_s16(a0123.val[0]); *b1 = vreinterpretq_s8_s16(a0123.val[1]); } static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1, const int8x16_t s2, const int8x8_t filters_0_7, const int8x8_t filters_4_11) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0); sum = vdotq_lane_s32(sum, s1, filters_0_7, 1); sum = vdotq_lane_s32(sum, s2, filters_4_11, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve12_8_y( const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, const int8x8_t filters_0_7, const int8x8_t filters_4_11) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0); sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0); sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve_y_sr_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { // The no-op filter should never be used here. assert(y_filter_ptr[5] != 128); const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA); src_ptr += 11 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. 
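  // The filter taps sum to 1 << FILTER_BITS, so subtracting 128 from every
  // sample lowers each convolution sum by exactly 128 << FILTER_BITS; the
  // accumulators in convolve12_4_y start from that constant to compensate.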
int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); transpose_concat_4x4(s4, s5, s6, s7, &s4567); transpose_concat_4x4(s5, s6, s7, s8, &s5678); transpose_concat_4x4(s6, s7, s8, s9, &s6789); transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { uint8x8_t tB, tC, tD, tE; load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE); int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); int8x16_t s89AB, s9ABC, sABCD, sBCDE; transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT = { { s789A, sBCDE } }; s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); int16x4_t d1 = convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); int16x4_t d2 = convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); int16x4_t d3 = convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s789A; s4567 = s89AB; s5678 = s9ABC; s6789 = sABCD; s789A = sBCDE; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9, &tA); s += 11 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. 
int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); // This operation combines a conventional transpose and the sample // permute (see horizontal case) required before computing the dot // product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s789A_lo, s789A_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t tB, tC, tD, tE; load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, sBCDE_lo, sBCDE_hi; transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); uint8x8_t d0 = convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, s89AB_hi, filter_0_7, filter_4_11); uint8x8_t d1 = convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, s9ABC_hi, filter_0_7, filter_4_11); uint8x8_t d2 = convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, sABCD_hi, filter_0_7, filter_4_11); uint8x8_t d3 = convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, sBCDE_hi, filter_0_7, filter_4_11); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
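        // Only four of the eight transposed blocks need recomputing each
        // iteration: three come from the merge-table lookups above and one
        // (sBCDE) from a fresh transpose, so just four new source rows are
        // loaded per four output rows.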
s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s789A_lo; s3456_hi = s789A_hi; s4567_lo = s89AB_lo; s4567_hi = s89AB_hi; s5678_lo = s9ABC_lo; s5678_hi = s9ABC_hi; s6789_lo = sABCD_lo; s6789_hi = sABCD_hi; s789A_lo = sBCDE_lo; s789A_hi = sBCDE_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0); sum = vdotq_lane_s32(sum, s1, filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, const int8x16_t s1_hi, const int8x8_t filters) { // The sample range transform and permutation are performed by the caller. // Accumulate into 128 << FILTER_BITS to account for range transform. const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0); sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1); int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0); sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1); // Narrow and re-pack. int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve_y_sr_8tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); if (w == 4) { uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src_ptr += 7 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); int8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &t10); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. 
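      // The table lookups below splice columns of the previous transposed
      // block (s3456) with the freshly transposed block (s78910), producing
      // s4567, s5678 and s6789 without redoing the full transpose.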
int8x16x2_t samples_LUT = { { s3456, s78910 } }; s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s78910; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); // This operation combines a conventional transpose and the sample // permute (see horizontal case) required before computing the dot // product. int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t t7, t8, t9, t10; load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. 
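        // Same splice as the 4-wide path, applied separately to the lo and hi
        // halves of each 8-wide transposed block.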
int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); int8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); uint8x8_t d0 = convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); uint8x8_t d1 = convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); uint8x8_t d2 = convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); uint8x8_t d3 = convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s78910_lo; s3456_hi = s78910_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { if (w == 2 || h == 2) { av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); if (y_filter_taps <= 6) { av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } const int vert_offset = y_filter_taps / 2 - 1; src -= vert_offset * src_stride; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (y_filter_taps > 8) { convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); return; } convolve_y_sr_8tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); } static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples, const int8x16_t filters, const int32x4_t horiz_const, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]), vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; // Accumulate dot product into 'correction' to account for range transform. int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0); sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1); sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2); // Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS); } static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filters, const int32x4_t correction, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. 
int8x16_t samples_128[2] = { vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) }; // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]), vqtbl1q_s8(samples_128[0], permute_tbl.val[1]), vqtbl1q_s8(samples_128[0], permute_tbl.val[2]), vqtbl1q_s8(samples_128[1], permute_tbl.val[2]) }; // Accumulate dot product into 'correction' to account for range transform. int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0); sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1); sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2); int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0); sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1); sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2); // Narrow and re-pack. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), vshrn_n_s32(sum4567, ROUND0_BITS)); } static inline void convolve_2d_sr_horiz_12tap_neon_dotprod( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11) { // The no-op filter should never be used here. assert(vgetq_lane_s16(x_filter_0_7, 5) != 128); const int bd = 8; // Narrow filter values to 8-bit. const int16x8x2_t x_filter_s16 = { { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } }; const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), vmovn_s16(x_filter_s16.val[1])); // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Dot product constants. 
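  // 'correction' folds the range-transform compensation (the taps sum to
  // 1 << FILTER_BITS, so subtracting 128 from each sample lowers every sum by
  // 128 << FILTER_BITS) together with horiz_const defined above.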
const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); if (w <= 4) { do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl); int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl); int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 4); do { uint8x16_t s0 = vld1q_u8(src_ptr); int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } else { do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl); int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl); int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0[2]; s0[0] = vld1q_u8(s); s0[1] = vld1q_u8(s + 4); int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } } static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl, const int32x4_t correction) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); // Accumulate into 'correction' to account for range transform. int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl, const int32x4_t correction) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; // Accumulate into 'correction' to account for range transform. int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); // Narrow and re-pack. // We halved the filter values so -1 from right shift. 
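  // Shifting right by ROUND0_BITS - 1 rather than ROUND0_BITS doubles the
  // half-scale sums back to full scale.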
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_2d_sr_horiz_4tap_neon_dotprod( const uint8_t *src, ptrdiff_t src_stride, int16_t *dst, ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) { const int bd = 8; const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction); int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction); int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 4); do { uint8x16_t s0 = vld1q_u8(src); int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction); vst1_s16(dst, d0); src += src_stride; dst += dst_stride; } while (--h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction); int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction); int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h > 4); do { const uint8_t *s = src; int16_t *d = dst; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h != 0); } } static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16x3_t permute_tbl) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t samples_128 = vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), vqtbl1q_s8(samples_128, permute_tbl.val[1]), vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; // Accumulate dot product into 'correction' to account for range transform. 
int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0); sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1); int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0); sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_2d_sr_horiz_8tap_neon_dotprod( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); const int bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Halve the total because we halved the filter values. const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl); int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl); int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } static inline void convolve_2d_sr_6tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); const int bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. 
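  // With halved taps every intermediate sum sits at half scale, so both the
  // range-transform compensation and horiz_const are halved here; the reduced
  // right shift in the horizontal kernels restores full scale.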
const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int height = h; uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); s += 5 * src_stride; int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl); int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl); int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl); int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl); int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl); do { uint8x16_t h_s5, h_s6, h_s7, h_s8; load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); int16x8_t v_s5 = convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl); int16x8_t v_s6 = convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl); int16x8_t v_s7 = convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl); int16x8_t v_s8 = convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl); uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, y_filter, vert_const); uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, y_filter, vert_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; v_s3 = v_s7; v_s4 = v_s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } static inline void convolve_2d_sr_4tap_neon_dotprod( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int bd = 8; const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t x_filter = vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1); // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. const int32_t horiz_const = ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); // Accumulate into 128 << FILTER_BITS to account for range transform. // Halve the total because we halved the filter values. 
const int32x4_t correction = vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl); uint8x16_t h_s0, h_s1, h_s2; load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction); int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction); int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction); src += 3 * src_stride; do { uint8x16_t h_s3, h_s4, h_s5, h_s6; load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x4_t v_s3 = convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction); int16x4_t v_s4 = convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction); int16x4_t v_s5 = convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction); int16x4_t v_s6 = convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction); int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { int height = h; const uint8_t *s = src; uint8_t *d = dst; uint8x16_t h_s0, h_s1, h_s2; load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); int16x8_t v_s0 = convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction); int16x8_t v_s1 = convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction); int16x8_t v_s2 = convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction); s += 3 * src_stride; do { uint8x16_t h_s3, h_s4, h_s5, h_s6; load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x8_t v_s3 = convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction); int16x8_t v_s4 = convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction); int16x8_t v_s5 = convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction); int16x8_t v_s6 = convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction); uint8x8_t d0 = convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); uint8x8_t d1 = convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); uint8x8_t d2 = convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); uint8x8_t d3 = convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int 
clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (filter_params_x->taps > 8) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_0_7, x_filter_8_11); convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { if (x_filter_taps >= 6 && y_filter_taps == 6) { convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w, h, x_filter_ptr, y_filter_ptr); return; } if (x_filter_taps <= 4 && y_filter_taps <= 4) { convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride, w, h, x_filter_ptr, y_filter_ptr); return; } DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); if (x_filter_taps <= 4) { convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 2, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } else { convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (clamped_y_taps <= 4) { convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr); } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } } } aom-3.12.1/av1/common/arm/convolve_neon_i8mm.c000066400000000000000000001510651477627663500211170ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/arm/convolve_neon.h" #include "av1/common/arm/convolve_neon_i8mm.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, // Shift left and insert three new columns in transposed 4x4 block. 
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; static inline int16x4_t convolve12_4_x(uint8x16_t samples[2], const int8x16_t filter[2], const uint8x16_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl), vqtbl1q_u8(samples[1], permute_tbl) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]); return vqrshrn_n_s32(sum, FILTER_BITS); } static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2], const int8x16_t filter[2], const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), vqtbl1q_u8(samples[0], permute_tbl.val[1]), vqtbl1q_u8(samples[1], permute_tbl.val[0]), vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]); sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); // Narrow and re-pack. int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), vqrshrn_n_s32(sum4567, FILTER_BITS)); return vqmovun_s16(sum_s16); } static inline void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here. assert(x_filter_ptr[5] != 128); // Split 12-tap filter into two 6-tap filters, masking the top two elements. // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask); const int8x8_t filter_1 = vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2); // Stagger each 6-tap filter to enable use of matrix multiply instructions. // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t filter[2] = { vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) }; // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the // convolution kernels: Adding this shim enables us to use a single rounding // right shift by FILTER_BITS instead of two rounding right shifts: first by // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS. 
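  // As a rough worked example (assuming the usual 8-bit configuration, where
  // ROUND0_BITS = 3 and FILTER_BITS = 7): the two-stage rounding
  //   tmp = (sum + (1 << 2)) >> 3;  dst = (tmp + (1 << 3)) >> 4;
  // collapses to the single rounding shift
  //   dst = (sum + (1 << 2) + (1 << 6)) >> 7;
  // which is what adding the 1 << (ROUND0_BITS - 1) shim up front and then
  // calling vqrshrn_n_s32(sum, FILTER_BITS) computes.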
const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); if (w <= 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(src, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(src + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); dst += 4 * dst_stride; src += 4 * src_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } } static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]), vqtbl1q_u8(samples, permute_tbl.val[2]) }; int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filter, 0); sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filter, 1); int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filter, 0); sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filter, 1); int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); } static inline void convolve_x_sr_8tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, const int32x4_t horiz_const) { // Filter values are even, so halve to reduce intermediate precision reqs. 
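  // The underlying arithmetic, sketched briefly: the sub-pel filter taps sum
  // to 1 << FILTER_BITS = 128 and are all even, so halving them halves every
  // dot-product sum, and the kernels compensate by shifting right by
  // FILTER_BITS - 1 instead of FILTER_BITS. The halved taps also fit
  // comfortably in the int8 filter operand of vusdot/vusmmla.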
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(filter_x), 1); const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int w = width; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } static inline int16x4_t convolve6_4_x(uint8x16_t samples, const int8x16_t filter, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); // Further narrowing and packing is performed by the caller. return vmovn_s32(sum); } static inline uint8x8_t convolve6_8_x(uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the convolution filter values so - 1 from the right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void convolve_x_sr_6tap_neon_i8mm( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x, const int32x4_t horiz_const) { // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(filter_x), 1); // Stagger the filter for use with the matrix multiply instructions. // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); if (width == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t t0 = convolve6_4_x(s0, x_filter, permute_tbl, horiz_const); int16x4_t t1 = convolve6_4_x(s1, x_filter, permute_tbl, horiz_const); int16x4_t t2 = convolve6_4_x(s2, x_filter, permute_tbl, horiz_const); int16x4_t t3 = convolve6_4_x(s3, x_filter, permute_tbl, horiz_const); // We halved the filter values so -1 from right shift. 
uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int w = width; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); uint8x8_t d0 = convolve6_8_x(s0, x_filter, permute_tbl, horiz_const); uint8x8_t d1 = convolve6_8_x(s1, x_filter, permute_tbl, horiz_const); uint8x8_t d2 = convolve6_8_x(s2, x_filter, permute_tbl, horiz_const); uint8x8_t d3 = convolve6_8_x(s3, x_filter, permute_tbl, horiz_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); return; } const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; src -= horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK); // A shim of 1 << (ROUND0_BITS - 1) enables us to simplify computation in the // convolution kernels: Adding this shim enables us to use a single rounding // right shift by FILTER_BITS instead of two rounding right shifts: first by // ROUND0_BITS, and then subsequently by FILTER_BITS - ROUND0_BITS. // Halve the total because we will halve the filter values. 
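  // With the usual ROUND0_BITS = 3 this works out to (1 << 2) / 2 = 2.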
const int32x4_t horiz_const = vdupq_n_s32((1 << ((ROUND0_BITS - 1)) / 2)); if (filter_taps <= 6) { convolve_x_sr_6tap_neon_i8mm(src + 1, src_stride, dst, dst_stride, w, h, x_filter_ptr, horiz_const); return; } if (filter_taps > 8) { convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, x_filter_ptr); return; } convolve_x_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, horiz_const); } static inline void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, XX, XX, XX, XX // a1: 10, 11, 12, 13, XX, XX, XX, XX // a2: 20, 21, 22, 23, XX, XX, XX, XX // a3: 30, 31, 32, 33, XX, XX, XX, XX // // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; uint16x8_t a0123 = vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)).val[0]; *b = vreinterpretq_u8_u16(a0123); } static inline void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x16_t *b0, uint8x16_t *b1) { // Transpose 8-bit elements and concatenate result rows as follows: // a0: 00, 01, 02, 03, 04, 05, 06, 07 // a1: 10, 11, 12, 13, 14, 15, 16, 17 // a2: 20, 21, 22, 23, 24, 25, 26, 27 // a3: 30, 31, 32, 33, 34, 35, 36, 37 // // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); uint8x16_t a01 = vzipq_u8(a0q, a1q).val[0]; uint8x16_t a23 = vzipq_u8(a2q, a3q).val[0]; uint16x8x2_t a0123 = vzipq_u16(vreinterpretq_u16_u8(a01), vreinterpretq_u16_u8(a23)); *b0 = vreinterpretq_u8_u16(a0123.val[0]); *b1 = vreinterpretq_u8_u16(a0123.val[1]); } static inline int16x4_t convolve12_4_y(const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const int8x8_t filters_0_7, const int8x8_t filters_4_11) { int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters_0_7, 0); sum = vusdotq_lane_s32(sum, s1, filters_0_7, 1); sum = vusdotq_lane_s32(sum, s2, filters_4_11, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve12_8_y( const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi, const int8x8_t filters_0_7, const int8x8_t filters_4_11) { int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0); sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0); sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); // Narrow and re-pack. 
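  // Note that, unlike the horizontal kernels above, the vertical filter taps
  // are only narrowed to int8 (vmovn_s16 in the callers below), not halved,
  // so the final rounding shift below is by the full FILTER_BITS.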
int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve_y_sr_12tap_neon_i8mm(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { // The no-op filter should never be used here. assert(y_filter_ptr[5] != 128); const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr)); const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sA); src_ptr += 11 * src_stride; // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); transpose_concat_4x4(s4, s5, s6, s7, &s4567); transpose_concat_4x4(s5, s6, s7, s8, &s5678); transpose_concat_4x4(s6, s7, s8, s9, &s6789); transpose_concat_4x4(s7, s8, s9, sA, &s789A); do { uint8x8_t sB, sC, sD, sE; load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE); uint8x16_t s89AB, s9ABC, sABCD, sBCDE; transpose_concat_4x4(sB, sC, sD, sE, &sBCDE); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s789A, sBCDE } }; s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11); int16x4_t d1 = convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11); int16x4_t d2 = convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11); int16x4_t d3 = convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s789A; s4567 = s89AB; s5678 = s9ABC; s6789 = sABCD; s789A = sBCDE; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sA); s += 11 * src_stride; // This operation combines a conventional transpose and the sample // permute (see horizontal case) required before computing the dot // product. 
uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s789A_lo, s789A_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); do { uint8x8_t sB, sC, sD, sE; load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, sBCDE_lo, sBCDE_hi; transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); uint8x8_t d0 = convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, s89AB_hi, filter_0_7, filter_4_11); uint8x8_t d1 = convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, s9ABC_hi, filter_0_7, filter_4_11); uint8x8_t d2 = convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, sABCD_hi, filter_0_7, filter_4_11); uint8x8_t d3 = convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, sBCDE_hi, filter_0_7, filter_4_11); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s789A_lo; s3456_hi = s789A_hi; s4567_lo = s89AB_lo; s4567_hi = s89AB_hi; s5678_lo = s9ABC_lo; s5678_hi = s9ABC_hi; s6789_lo = sABCD_lo; s6789_hi = sABCD_hi; s789A_lo = sBCDE_lo; s789A_hi = sBCDE_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline int16x4_t convolve8_4_y(const uint8x16_t s0, const uint8x16_t s1, const int8x8_t filters) { int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), s0, filters, 0); sum = vusdotq_lane_s32(sum, s1, filters, 1); // Further narrowing and packing is performed by the caller. return vqmovn_s32(sum); } static inline uint8x8_t convolve8_8_y(const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, const uint8x16_t s1_hi, const int8x8_t filters) { int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters, 0); sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters, 1); int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters, 0); sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters, 1); // Narrow and re-pack. 
int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void convolve_y_sr_8tap_neon_i8mm(const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr) { const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); if (w == 4) { uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src_ptr += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // (see horizontal case) required before computing the dot product. uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123); transpose_concat_4x4(s1, s2, s3, s4, &s1234); transpose_concat_4x4(s2, s3, s4, s5, &s2345); transpose_concat_4x4(s3, s4, s5, s6, &s3456); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910); // Merge new data into block from previous iteration. uint8x16x2_t samples_LUT = { { s3456, s78910 } }; s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); int16x4_t d0 = convolve8_4_y(s0123, s4567, filter); int16x4_t d1 = convolve8_4_y(s1234, s5678, filter); int16x4_t d2 = convolve8_4_y(s2345, s6789, filter); int16x4_t d3 = convolve8_4_y(s3456, s78910, filter); uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123 = s4567; s1234 = s5678; s2345 = s6789; s3456 = s78910; src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const uint8_t *s = src_ptr; uint8_t *d = dst_ptr; uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample // permute (see horizontal case) required before computing the dot // product. uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); do { uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); // Merge new data into block from previous iteration. 
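        // The two-register TBL treats { s3456, s78910 } as one contiguous
        // 32-byte window; kDotProdMergeBlockTbl selects bytes from it so that
        // s4567, s5678 and s6789 are rebuilt by shifting the previously
        // transposed block left by one, two and three columns and appending
        // columns from the freshly transposed rows, avoiding a full
        // re-transpose on every iteration.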
uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } }; s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } }; s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); uint8x8_t d0 = convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter); uint8x8_t d1 = convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter); uint8x8_t d2 = convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter); uint8x8_t d3 = convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123_lo = s4567_lo; s0123_hi = s4567_hi; s1234_lo = s5678_lo; s1234_hi = s5678_hi; s2345_lo = s6789_lo; s2345_hi = s6789_hi; s3456_lo = s78910_lo; s3456_hi = s78910_hi; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_convolve_y_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { if (w == 2 || h == 2) { av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); if (y_filter_taps <= 6) { av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } const int vert_offset = y_filter_taps / 2 - 1; src -= vert_offset * src_stride; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (y_filter_taps > 8) { convolve_y_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); return; } convolve_y_sr_8tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, y_filter_ptr); } static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples, const int8x8_t filters, const uint8x16x3_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } uint8x16_t perm_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]), vqtbl1q_u8(samples, permute_tbl.val[2]) }; int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); sum0123 = vusdotq_lane_s32(sum0123, perm_samples[1], filters, 1); int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); sum4567 = vusdotq_lane_s32(sum4567, perm_samples[2], filters, 1); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_2d_sr_horiz_8tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, int im_h, const int16_t *x_filter_ptr) { // Filter values are even, so halve to reduce intermediate precision reqs. 
const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); const int bd = 8; // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // The outermost -1 is needed because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); const uint8_t *src_ptr = src; int16_t *dst_ptr = im_block; int dst_stride = im_stride; int height = im_h; const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); int32x4_t sum = vusdotq_lane_s32(horiz_const, perm_samples, filters, 0); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; int32x4_t sum0123 = vusdotq_lane_s32(horiz_const, perm_samples[0], filters, 0); int32x4_t sum4567 = vusdotq_lane_s32(horiz_const, perm_samples[1], filters, 0); // Narrow and re-pack. // We halved the filter values so -1 from right shift. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_2d_sr_horiz_4tap_neon_i8mm( const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int width, int height, const int16_t *filter_x) { const int bd = 8; const int16x4_t x_filter = vld1_s16(filter_x + 2); // All 4-tap and bilinear filter values are even, so halve them to reduce // intermediate precision requirements. const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we halved the filter values. 
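  // For 8-bit input (bd = 8, FILTER_BITS = 7, ROUND0_BITS = 3) this works out
  // to ((1 << 14) + (1 << 2)) / 2 = 8194.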
const int32x4_t horiz_const = vdupq_n_s32( (((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2)); if (width == 4) { const uint8x16_t perm_tbl = vld1q_u8(kDotProdPermuteTbl); do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); int16x4_t d1 = convolve4_4_2d_h(s1, filter, perm_tbl, horiz_const); int16x4_t d2 = convolve4_4_2d_h(s2, filter, perm_tbl, horiz_const); int16x4_t d3 = convolve4_4_2d_h(s3, filter, perm_tbl, horiz_const); store_s16_4x4(dst, dst_stride, d0, d1, d2, d3); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); do { uint8x16_t s0 = vld1q_u8(src); int16x4_t d0 = convolve4_4_2d_h(s0, filter, perm_tbl, horiz_const); vst1_s16(dst, d0); src += src_stride; dst += dst_stride; } while (--height != 0); } else { const uint8x16x2_t perm_tbl = vld1q_u8_x2(kDotProdPermuteTbl); do { int w = width; const uint8_t *s = src; int16_t *d = dst; do { uint8x16_t s0, s1, s2, s3; load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); int16x8_t d1 = convolve4_8_2d_h(s1, filter, perm_tbl, horiz_const); int16x8_t d2 = convolve4_8_2d_h(s2, filter, perm_tbl, horiz_const); int16x8_t d3 = convolve4_8_2d_h(s3, filter, perm_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); do { const uint8_t *s = src; int16_t *d = dst; int w = width; do { uint8x16_t s0 = vld1q_u8(s); int16x8_t d0 = convolve4_8_2d_h(s0, filter, perm_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; w -= 8; } while (w != 0); src += src_stride; dst += dst_stride; } while (--height != 0); } } static inline int16x4_t convolve6_4_2d_h(uint8x16_t samples, const int8x16_t filter, const uint8x16_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } uint8x16_t perm_samples = vqtbl1q_u8(samples, permute_tbl); // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples, filter); // We halved the convolution filter values so -1 from the right shift. return vshrn_n_s32(sum, ROUND0_BITS - 1); } static inline int16x8_t convolve6_8_2d_h(uint8x16_t samples, const int8x16_t filter, const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), vqtbl1q_u8(samples, permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter); int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter); // Narrow and re-pack. // We halved the convolution filter values so -1 from the right shift. 
return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1), vshrn_n_s32(sum4567, ROUND0_BITS - 1)); } static inline void convolve_2d_sr_6tap_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); const int bd = 8; // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding // shifts in convolution kernels - which are generally faster than rounding // shifts on modern CPUs. The outermost -1 is needed because we halved the // filter values. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int height = h; uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4; load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4); s += 5 * src_stride; int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); do { uint8x16_t h_s5, h_s6, h_s7, h_s8; load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8); int16x8_t v_s5 = convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); int16x8_t v_s6 = convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); int16x8_t v_s7 = convolve6_8_2d_h(h_s7, x_filter, permute_tbl, horiz_const); int16x8_t v_s8 = convolve6_8_2d_h(h_s8, x_filter, permute_tbl, horiz_const); uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, y_filter, vert_const); uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, y_filter, vert_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; v_s3 = v_s7; v_s4 = v_s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } static inline void convolve_2d_sr_6tap_4tap_neon_i8mm( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) { const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); // Filter values are even, so halve to reduce intermediate precision reqs. const int8x8_t x_filter_s8 = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); // Stagger the filter for use with the matrix multiply instructions. 
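  // In effect each vusmmlaq_s32 in the convolve6_*_2d_h helpers multiplies a
  // 2x8 matrix of sample windows (two windows, two pixels apart) by this 8x2
  // filter matrix (the 6-tap filter at byte offsets 0 and 1), so a single
  // instruction yields four adjacent horizontal outputs.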
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t x_filter = vcombine_s8(vext_s8(x_filter_s8, x_filter_s8, 1), x_filter_s8); const int bd = 8; // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. // Halve the total because we halved the filter values. const int32x4_t horiz_const = vdupq_n_s32( ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))) / 2); const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1)); if (w == 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); uint8x16_t h_s0, h_s1, h_s2; load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); int16x4_t v_s0 = convolve6_4_2d_h(h_s0, x_filter, permute_tbl, horiz_const); int16x4_t v_s1 = convolve6_4_2d_h(h_s1, x_filter, permute_tbl, horiz_const); int16x4_t v_s2 = convolve6_4_2d_h(h_s2, x_filter, permute_tbl, horiz_const); src += 3 * src_stride; do { uint8x16_t h_s3, h_s4, h_s5, h_s6; load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x4_t v_s3 = convolve6_4_2d_h(h_s3, x_filter, permute_tbl, horiz_const); int16x4_t v_s4 = convolve6_4_2d_h(h_s4, x_filter, permute_tbl, horiz_const); int16x4_t v_s5 = convolve6_4_2d_h(h_s5, x_filter, permute_tbl, horiz_const); int16x4_t v_s6 = convolve6_4_2d_h(h_s6, x_filter, permute_tbl, horiz_const); int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter); int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter); int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter); int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter); uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const)); store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; src += 4 * src_stride; dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { int height = h; const uint8_t *s = src; uint8_t *d = dst; uint8x16_t h_s0, h_s1, h_s2; load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); int16x8_t v_s0 = convolve6_8_2d_h(h_s0, x_filter, permute_tbl, horiz_const); int16x8_t v_s1 = convolve6_8_2d_h(h_s1, x_filter, permute_tbl, horiz_const); int16x8_t v_s2 = convolve6_8_2d_h(h_s2, x_filter, permute_tbl, horiz_const); s += 3 * src_stride; do { uint8x16_t h_s3, h_s4, h_s5, h_s6; load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); int16x8_t v_s3 = convolve6_8_2d_h(h_s3, x_filter, permute_tbl, horiz_const); int16x8_t v_s4 = convolve6_8_2d_h(h_s4, x_filter, permute_tbl, horiz_const); int16x8_t v_s5 = convolve6_8_2d_h(h_s5, x_filter, permute_tbl, horiz_const); int16x8_t v_s6 = convolve6_8_2d_h(h_s6, x_filter, permute_tbl, horiz_const); uint8x8_t d0 = convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const); uint8x8_t d1 = convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const); uint8x8_t d2 = convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const); uint8x8_t d3 = convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); v_s0 = v_s4; v_s1 = v_s5; v_s2 = v_s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams 
*filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (filter_params_x->taps > 8) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_8_11); } else { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); if (x_filter_taps == 6 && y_filter_taps == 6) { convolve_2d_sr_6tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w, h, x_filter_ptr, y_filter_ptr); return; } // Used for both 6, 4 and 4, 4 horiz, vert filter tap combinations. if (x_filter_taps <= 6 && y_filter_taps <= 4) { convolve_2d_sr_6tap_4tap_neon_i8mm(src_ptr + 1, src_stride, dst, dst_stride, w, h, x_filter_ptr, y_filter_ptr); return; } if (x_filter_taps <= 4) { convolve_2d_sr_horiz_4tap_neon_i8mm(src_ptr + 2, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } else { convolve_2d_sr_horiz_8tap_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } const int16x8_t y_filter = vld1q_s16(y_filter_ptr); if (clamped_y_taps <= 4) { convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr); } else if (clamped_y_taps == 6) { convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } else { convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter); } } } aom-3.12.1/av1/common/arm/convolve_neon_i8mm.h000066400000000000000000000166321477627663500211240ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ #define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { // clang-format off 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 // clang-format on }; static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples[2], const int8x16_t filter[2], const uint8x16_t permute_tbl, int32x4_t horiz_const) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(samples[0], permute_tbl), vqtbl1q_u8(samples[1], permute_tbl) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); sum = vusmmlaq_s32(sum, perm_samples[1], filter[1]); // Narrow and re-pack. return vshrn_n_s32(sum, ROUND0_BITS); } static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], const int8x16_t filter[2], const uint8x16x2_t permute_tbl, const int32x4_t horiz_const) { /// Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], permute_tbl.val[0]), vqtbl1q_u8(samples[0], permute_tbl.val[1]), vqtbl1q_u8(samples[1], permute_tbl.val[0]), vqtbl1q_u8(samples[1], permute_tbl.val[1]) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(horiz_const, perm_samples[0], filter[0]); int32x4_t sum4567 = vusmmlaq_s32(horiz_const, perm_samples[1], filter[0]); sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); // Narrow and re-pack. return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS), vshrn_n_s32(sum4567, ROUND0_BITS)); } static inline void convolve_2d_sr_horiz_12tap_neon_i8mm( const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, const int dst_stride, int w, int h, const int16_t *x_filter_ptr) { // The no-op filter should never be used here. assert(x_filter_ptr[5] != 128); const int bd = 8; // Split 12-tap filter into two 6-tap filters, masking the top two elements. // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(x_filter_ptr)), mask); const int8x8_t filter_1 = vext_s8(vmovn_s16(vld1q_s16(x_filter_ptr + 4)), vdup_n_s8(0), 2); // Stagger each 6-tap filter to enable use of matrix multiply instructions. 
// { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } const int8x16_t filter[2] = { vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) }; // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts // in convolution kernels - which are generally faster than rounding shifts on // modern CPUs. const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); if (w <= 4) { const uint8x16_t permute_tbl = vld1q_u8(kMatMulPermuteTbl); do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(src_ptr, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(src_ptr + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x4_t d0 = convolve12_4_2d_h(s0, filter, permute_tbl, horiz_const); int16x4_t d1 = convolve12_4_2d_h(s1, filter, permute_tbl, horiz_const); int16x4_t d2 = convolve12_4_2d_h(s2, filter, permute_tbl, horiz_const); int16x4_t d3 = convolve12_4_2d_h(s3, filter, permute_tbl, horiz_const); store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 4); do { uint8x16_t s0[2]; s0[0] = vld1q_u8(src_ptr); s0[1] = vld1q_u8(src_ptr + 6); int16x4_t d0 = convolve12_4_2d_h(s0, filter, permute_tbl, horiz_const); vst1_s16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } else { const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2]; load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); int16x8_t d0 = convolve12_8_2d_h(s0, filter, permute_tbl, horiz_const); int16x8_t d1 = convolve12_8_2d_h(s1, filter, permute_tbl, horiz_const); int16x8_t d2 = convolve12_8_2d_h(s2, filter, permute_tbl, horiz_const); int16x8_t d3 = convolve12_8_2d_h(s3, filter, permute_tbl, horiz_const); store_s16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; h -= 4; } while (h > 4); do { const uint8_t *s = src_ptr; int16_t *d = dst_ptr; int width = w; do { uint8x16_t s0[2]; s0[0] = vld1q_u8(s); s0[1] = vld1q_u8(s + 6); int16x8_t d0 = convolve12_8_2d_h(s0, filter, permute_tbl, horiz_const); vst1q_s16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } } #endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_I8MM_H_ aom-3.12.1/av1/common/arm/convolve_scale_neon.h000066400000000000000000001043401477627663500213330ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ #define AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" static inline int16x4_t compound_convolve8_4_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = offset_const; sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline int16x8_t compound_convolve8_8_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = offset_const; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); return vcombine_s16(res0, res1); } static inline void compound_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts on // modern CPUs. 
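  // For 8-bit input offset_bits is 8 + 2 * 7 - 3 = 19, so (assuming the usual
  // COMPOUND_ROUND1_BITS of 7) the constant below is (1 << 19) + (1 << 6).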
const int32x4_t vert_offset = vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); vst1_u16(dst, vreinterpret_u16_s16(d0)); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint16_t *d = dst; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); vst1q_u16(d, vreinterpretq_u16_s16(d0)); s += 8; d += 8; width -= 8; } while (width != 0); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline void compound_avg_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts // on modern CPUs. const int32_t vert_offset_bits = (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); // For the averaging code path substract round offset and convolve round. 
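  // Folding that subtraction into vert_offset below means the per-pixel loops
  // only need the halving add (vhadd) and the final narrowing shift; no
  // separate round-offset subtraction is required per pixel.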
const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); int16x4_t avg = vhadd_s16(dd0, d0); int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); uint8x8_t d0_u8 = vqrshrun_n_s16( d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); store_u8_4x1(dst8, d0_u8); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint8_t *dst8_ptr = dst8; uint16_t *dst16_ptr = dst16; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); int16x8_t avg = vhaddq_s16(dd0, d0); uint8x8_t d0_u8 = vqrshrun_n_s16( avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); vst1_u8(dst8_ptr, d0_u8); s += 8; dst8_ptr += 8; dst16_ptr += 8; width -= 8; } while (width != 0); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline void compound_dist_wtd_convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; int y_qn = subpel_y_qn; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts on // modern CPUs. const int32x4_t vert_offset = vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); // For the weighted averaging code path we have to substract round offset and // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The // additional shift by DIST_PRECISION_BITS is needed in order to merge two // shift calculations into one. 
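  // fwd_offset and bck_offset are the distance weights; they sum to
  // 1 << DIST_PRECISION_BITS, so the weighted accumulation below is scaled up
  // by DIST_PRECISION_BITS and the single vshrn_n_s32 removes that scaling
  // together with the remaining convolution rounding.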
const int32x4_t dist_wtd_offset = vdupq_n_s32( (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x4_t d0 = compound_convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); int16x4_t d0_s16 = vshrn_n_s32( dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); store_u8_4x1(dst8, d0_u8); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint8_t *dst8_ptr = dst8; uint16_t *dst16_ptr = dst16; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t d0 = compound_convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); int32x4_t dst_wtd_avg0 = vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); int32x4_t dst_wtd_avg1 = vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); int16x4_t d0_s16_0 = vshrn_n_s32( dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); int16x4_t d0_s16_1 = vshrn_n_s32( dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); vst1_u8(dst8_ptr, d0_u8); s += 8; dst8_ptr += 8; dst16_ptr += 8; width -= 8; } while (width != 0); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline uint8x8_t convolve8_4_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = offset_const; sum = vmlal_lane_s16(sum, s0, filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); return 
vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); } static inline uint8x8_t convolve8_8_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = offset_const; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); return vqmovun_s16(vcombine_s16(res0, res1)); } static inline void convolve_vert_scale_8tap_neon( const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
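  // (With the usual AV1 constants this is round_1 == 11 and offset_bits == 19,
  // i.e. vert_offset is (1 << 10) - (1 << 18); the subtracted term cancels the
  // constant offset carried in the intermediate buffer from the horizontal
  // pass, which was added there to keep the intermediate values non-negative.)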
int32x4_t vert_offset = vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d = convolve8_4_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); store_u8_4x1(dst, d); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else if (w == 8) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d = convolve8_8_v(s0, s1, s2, s3, s4, s5, s6, s7, filter, vert_offset); vst1_u8(dst, d); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; uint8_t *d = dst; int width = w; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); do { int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; load_s16_8x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], &s5[0], &s6[0], &s7[0]); load_s16_8x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], &s5[1], &s6[1], &s7[1]); uint8x8_t d0 = convolve8_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], s6[0], s7[0], filter, vert_offset); uint8x8_t d1 = convolve8_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], s6[1], s7[1], filter, vert_offset); vst1q_u8(d, vcombine_u8(d0, d1)); s += 16; d += 16; width -= 16; } while (width != 0); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline int16x4_t compound_convolve6_4_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = offset_const; // Filter values at indices 0 and 7 are 0. sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); return vshrn_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline int16x8_t compound_convolve6_8_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = offset_const; // Filter values at indices 0 and 7 are 0. 
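  // Using only the six non-zero taps saves two multiply-accumulates per
  // accumulator compared with the generic 8-tap path.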
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); int16x4_t res0 = vshrn_n_s32(sum0, COMPOUND_ROUND1_BITS); int16x4_t res1 = vshrn_n_s32(sum1, COMPOUND_ROUND1_BITS); return vcombine_s16(res0, res1); } static inline void compound_convolve_vert_scale_6tap_neon( const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts on // modern CPUs. const int32x4_t vert_offset = vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5; load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x4_t d0 = compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); vst1_u16(dst, vreinterpret_u16_s16(d0)); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint16_t *d = dst; do { int16x8_t s0, s1, s2, s3, s4, s5; load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x8_t d0 = compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); vst1q_u16(d, vreinterpretq_u16_s16(d0)); s += 8; d += 8; width -= 8; } while (width != 0); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline void compound_avg_convolve_vert_scale_6tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts // on modern CPUs. const int32_t vert_offset_bits = (1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1)); // For the averaging code path substract round offset and convolve round. 
const int32_t avg_offset_bits = (1 << (offset_bits + 1)) + (1 << offset_bits); const int32x4_t vert_offset = vdupq_n_s32(vert_offset_bits - avg_offset_bits); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5; load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x4_t d0 = compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); int16x4_t avg = vhadd_s16(dd0, d0); int16x8_t d0_s16 = vcombine_s16(avg, vdup_n_s16(0)); uint8x8_t d0_u8 = vqrshrun_n_s16( d0_s16, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); store_u8_4x1(dst8, d0_u8); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint8_t *dst8_ptr = dst8; uint16_t *dst16_ptr = dst16; do { int16x8_t s0, s1, s2, s3, s4, s5; load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x8_t d0 = compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); int16x8_t avg = vhaddq_s16(dd0, d0); uint8x8_t d0_u8 = vqrshrun_n_s16( avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); vst1_u8(dst8_ptr, d0_u8); s += 8; dst8_ptr += 8; dst16_ptr += 8; width -= 8; } while (width != 0); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline void compound_dist_wtd_convolve_vert_scale_6tap_neon( const int16_t *src, int src_stride, uint8_t *dst8, int dst8_stride, uint16_t *dst16, int dst16_stride, int w, int h, const int16_t *y_filter, ConvolveParams *conv_params, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; int y_qn = subpel_y_qn; // A shim of 1 << (COMPOUND_ROUND1_BITS - 1) enables us to use // non-rounding shifts - which are generally faster than rounding shifts on // modern CPUs. const int32x4_t vert_offset = vdupq_n_s32((1 << offset_bits) + (1 << (COMPOUND_ROUND1_BITS - 1))); // For the weighted averaging code path we have to substract round offset and // convolve round. The shim of 1 << (2 * FILTER_BITS - ROUND0_BITS - // COMPOUND_ROUND1_BITS - 1) enables us to use non-rounding shifts. The // additional shift by DIST_PRECISION_BITS is needed in order to merge two // shift calculations into one. 
const int32x4_t dist_wtd_offset = vdupq_n_s32( (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1 + DIST_PRECISION_BITS))); const int16x4_t bck_offset = vdup_n_s16(conv_params->bck_offset); const int16x4_t fwd_offset = vdup_n_s16(conv_params->fwd_offset); if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5; load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x4_t d0 = compound_convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); int16x4_t dd0 = vreinterpret_s16_u16(vld1_u16(dst16)); int32x4_t dst_wtd_avg = vmlal_s16(dist_wtd_offset, bck_offset, d0); dst_wtd_avg = vmlal_s16(dst_wtd_avg, fwd_offset, dd0); int16x4_t d0_s16 = vshrn_n_s32( dst_wtd_avg, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16, vdup_n_s16(0))); store_u8_4x1(dst8, d0_u8); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int width = w; uint8_t *dst8_ptr = dst8; uint16_t *dst16_ptr = dst16; do { int16x8_t s0, s1, s2, s3, s4, s5; load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); int16x8_t d0 = compound_convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); int16x8_t dd0 = vreinterpretq_s16_u16(vld1q_u16(dst16_ptr)); int32x4_t dst_wtd_avg0 = vmlal_s16(dist_wtd_offset, bck_offset, vget_low_s16(d0)); int32x4_t dst_wtd_avg1 = vmlal_s16(dist_wtd_offset, bck_offset, vget_high_s16(d0)); dst_wtd_avg0 = vmlal_s16(dst_wtd_avg0, fwd_offset, vget_low_s16(dd0)); dst_wtd_avg1 = vmlal_s16(dst_wtd_avg1, fwd_offset, vget_high_s16(dd0)); int16x4_t d0_s16_0 = vshrn_n_s32( dst_wtd_avg0, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); int16x4_t d0_s16_1 = vshrn_n_s32( dst_wtd_avg1, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + DIST_PRECISION_BITS); uint8x8_t d0_u8 = vqmovun_s16(vcombine_s16(d0_s16_0, d0_s16_1)); vst1_u8(dst8_ptr, d0_u8); s += 8; dst8_ptr += 8; dst16_ptr += 8; width -= 8; } while (width != 0); dst16 += dst16_stride; dst8 += dst8_stride; y_qn += y_step_qn; } while (--h != 0); } } static inline uint8x8_t convolve6_4_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = offset_const; // Filter values at indices 0 and 7 are 0. 
sum = vmlal_lane_s16(sum, s0, filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); int16x4_t res = vshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); return vqmovun_s16(vcombine_s16(res, vdup_n_s16(0))); } static inline uint8x8_t convolve6_8_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t filter, const int32x4_t offset_const) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = offset_const; // Filter values at indices 0 and 7 are 0. sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); int32x4_t sum1 = offset_const; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); int16x4_t res0 = vshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS); int16x4_t res1 = vshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS); return vqmovun_s16(vcombine_s16(res0, res1)); } static inline void convolve_vert_scale_6tap_neon( const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *y_filter, int subpel_y_qn, int y_step_qn) { const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int round_1 = 2 * FILTER_BITS - ROUND0_BITS; // The shim of 1 << (round_1 - 1) enables us to use non-rounding shifts. 
int32x4_t vert_offset = vdupq_n_s32((1 << (round_1 - 1)) - (1 << (offset_bits - 1))); int y_qn = subpel_y_qn; if (w == 4) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x4_t s0, s1, s2, s3, s4, s5; load_s16_4x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); uint8x8_t d = convolve6_4_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); store_u8_4x1(dst, d); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else if (w == 8) { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); int16x8_t s0, s1, s2, s3, s4, s5; load_s16_8x6(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5); uint8x8_t d = convolve6_8_v(s0, s1, s2, s3, s4, s5, filter, vert_offset); vst1_u8(dst, d); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } else { do { const int16_t *s = &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; uint8_t *d = dst; int width = w; const ptrdiff_t filter_offset = SUBPEL_TAPS * ((y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS); const int16x8_t filter = vld1q_s16(y_filter + filter_offset); do { int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2]; load_s16_8x6(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], &s5[0]); load_s16_8x6(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], &s5[1]); uint8x8_t d0 = convolve6_8_v(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], filter, vert_offset); uint8x8_t d1 = convolve6_8_v(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], filter, vert_offset); vst1q_u8(d, vcombine_u8(d0, d1)); s += 16; d += 16; width -= 16; } while (width != 0); dst += dst_stride; y_qn += y_step_qn; } while (--h != 0); } } #endif // AOM_AV1_COMMON_ARM_CONVOLVE_SCALE_NEON_H_ aom-3.12.1/av1/common/arm/convolve_sve2.c000066400000000000000000000201231477627663500200730ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/aom_filter.h"
#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/arm/highbd_convolve_sve2.h"
#include "av1/common/arm/convolve_neon_i8mm.h"

static inline int32x4_t highbd_convolve12_4_2d_v(int16x8_t s0[2],
                                                 int16x8_t s1[2],
                                                 int16x8_t s2[2],
                                                 int16x8_t filter_0_7,
                                                 int16x8_t filter_4_11) {
  int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0);
  sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1);
  sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1);

  int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0);
  sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1);
  sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1);

  return vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
}

static inline void convolve_2d_sr_vert_12tap_sve2(
    const int16_t *src_ptr, int src_stride, uint8_t *dst_ptr,
    const int dst_stride, int w, int h, const int16x8_t y_filter_0_7,
    const int16x8_t y_filter_4_11) {
  // The no-op filter should never be used here.
  assert(vgetq_lane_s16(y_filter_0_7, 5) != 128);

  const int bd = 8;
  const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1));

  uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl);
  // Scale indices by size of the true vector length to avoid reading from an
  // 'undefined' portion of a vector on a system with SVE vectors > 128-bit.
  uint16x8_t correction0 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL));
  merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0);

  uint16x8_t correction1 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL));
  merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1);

  uint16x8_t correction2 =
      vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL));
  merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2);

  do {
    int16_t *s = (int16_t *)src_ptr;
    uint8_t *d = (uint8_t *)dst_ptr;
    int height = h;

    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
    load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
                  &s9, &sA);
    s += 11 * src_stride;

    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2],
        s6789[2], s789A[2];
    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    transpose_concat_4x4(s0, s1, s2, s3, s0123);
    transpose_concat_4x4(s1, s2, s3, s4, s1234);
    transpose_concat_4x4(s2, s3, s4, s5, s2345);
    transpose_concat_4x4(s3, s4, s5, s6, s3456);
    transpose_concat_4x4(s4, s5, s6, s7, s4567);
    transpose_concat_4x4(s5, s6, s7, s8, s5678);
    transpose_concat_4x4(s6, s7, s8, s9, s6789);
    transpose_concat_4x4(s7, s8, s9, sA, s789A);

    do {
      int16x4_t sB, sC, sD, sE;
      load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE);

      int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2];
      transpose_concat_4x4(sB, sC, sD, sE, sBCDE);

      // Merge new data into block from previous iteration.
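      // Each aom_tbl2x2_s16 lookup stitches the previously transposed rows
      // together with the four freshly loaded ones, so only four new source
      // rows have to be fetched and transposed per loop iteration.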
aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); int32x4_t d0 = highbd_convolve12_4_2d_v(s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11); int32x4_t d1 = highbd_convolve12_4_2d_v(s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11); int32x4_t d2 = highbd_convolve12_4_2d_v(s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11); int32x4_t d3 = highbd_convolve12_4_2d_v(s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11); int16x8_t dd01 = vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS)); int16x8_t dd23 = vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS), vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS)); dd01 = vsubq_s16(dd01, sub_const); dd23 = vsubq_s16(dd23, sub_const); uint8x8_t d01 = vqmovun_s16(dd01); uint8x8_t d23 = vqmovun_s16(dd23); store_u8x4_strided_x2(d + 0 * dst_stride, dst_stride, d01); store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s4567[0] = s89AB[0]; s4567[1] = s89AB[1]; s5678[0] = s9ABC[0]; s5678[1] = s9ABC[1]; s6789[0] = sABCD[0]; s6789[1] = sABCD[1]; s789A[0] = sBCDE[0]; s789A[1] = sBCDE[1]; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 4; dst_ptr += 4; w -= 4; } while (w != 0); } void av1_convolve_2d_sr_sve2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (w == 2 || h == 2) { av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); return; } if (filter_params_x->taps > 8) { const int im_h = h + filter_params_y->taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_x->taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h, y_filter_0_7, y_filter_4_11); } else { av1_convolve_2d_sr_neon_i8mm(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } aom-3.12.1/av1/common/arm/highbd_compound_convolve_neon.c000066400000000000000000002237071477627663500234010ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/highbd_compound_convolve_neon.h"
#include "av1/common/arm/highbd_convolve_neon.h"

static inline uint16x4_t highbd_12_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);

  return vqshrun_n_s32(sum, ROUND0_BITS + 2);
}

static inline uint16x4_t highbd_convolve6_4(
    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
  const int16x4_t filter_0_3 = vget_low_s16(filter);
  const int16x4_t filter_4_7 = vget_high_s16(filter);

  int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1);
  sum = vmlal_lane_s16(sum, s1, filter_0_3, 2);
  sum = vmlal_lane_s16(sum, s2, filter_0_3, 3);
  sum = vmlal_lane_s16(sum, s3, filter_4_7, 0);
  sum = vmlal_lane_s16(sum, s4, filter_4_7, 1);
  sum = vmlal_lane_s16(sum, s5, filter_4_7, 2);

  return vqshrun_n_s32(sum, ROUND0_BITS);
}

static inline uint16x8_t highbd_12_convolve6_8(
    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
    const int16x8_t filter, const int32x4_t offset) {
  // Values at indices 0 and 7 of y_filter are zero.
const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), vqshrun_n_s32(sum1, ROUND0_BITS + 2)); } static inline uint16x8_t highbd_convolve6_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t filter, const int32x4_t offset) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS)); } static inline void highbd_12_dist_wtd_convolve_x_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { const int32x4_t offset_vec = vdupq_n_s32(offset); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], x_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], x_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; 
height -= 4; } while (height != 0); } static inline void highbd_dist_wtd_convolve_x_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { const int32x4_t offset_vec = vdupq_n_s32(offset); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], x_filter, offset_vec); uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], x_filter, offset_vec); uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } static inline uint16x4_t highbd_12_convolve8_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const int32x4_t offset) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); return vqshrun_n_s32(sum, ROUND0_BITS + 2); } static inline uint16x4_t highbd_convolve8_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, const int32x4_t offset) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); return vqshrun_n_s32(sum, ROUND0_BITS); } static inline uint16x8_t highbd_12_convolve8_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int32x4_t offset) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), vqshrun_n_s32(sum1, ROUND0_BITS + 2)); } static inline uint16x8_t highbd_convolve8_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t filter, const int32x4_t offset) { const int16x4_t filter_0_3 = vget_low_s16(filter); const int16x4_t filter_4_7 = vget_high_s16(filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), vqshrun_n_s32(sum1, ROUND0_BITS)); } static inline uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4], const int16x4_t x_filter, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); sum = vmlal_lane_s16(sum, s[1], x_filter, 1); sum = vmlal_lane_s16(sum, s[2], x_filter, 2); sum = vmlal_lane_s16(sum, s[3], x_filter, 3); return vqshrun_n_s32(sum, 5); } static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], const int16x4_t x_filter, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); sum = vmlal_lane_s16(sum, s[1], x_filter, 1); sum = vmlal_lane_s16(sum, s[2], x_filter, 2); sum = vmlal_lane_s16(sum, s[3], x_filter, 3); return vqshrun_n_s32(sum, ROUND0_BITS); } static inline void highbd_12_dist_wtd_convolve_x_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { // 4-tap filters are used for blocks having width == 4. 
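    // The four non-zero taps of such a filter sit in the middle of the 8-tap
    // kernel array, hence the x_filter_ptr + 2 offset below.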
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 2); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } } static inline void highbd_dist_wtd_convolve_x_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { // 4-tap filters are used for blocks having width == 4. 
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 2); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, offset_vec); uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, offset_vec); uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_highbd_dist_wtd_convolve_x_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); int dst16_stride = conv_params->dst_stride; const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); const int offset_convolve = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); src -= horiz_offset; // horizontal filter if (bd == 12) { if (conv_params->do_average) { if (x_filter_taps <= 6 && w != 4) { highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, im_stride, w, h, x_filter_ptr, offset_convolve); } else { highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w, h, x_filter_ptr, offset_convolve); } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } } else { if (x_filter_taps <= 6 && w != 
4) { highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, offset_convolve); } else { highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, offset_convolve); } } } else { if (conv_params->do_average) { if (x_filter_taps <= 6 && w != 4) { highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, im_stride, w, h, x_filter_ptr, offset_convolve); } else { highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w, h, x_filter_ptr, offset_convolve); } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (x_filter_taps <= 6 && w != 4) { highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, offset_convolve); } else { highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, offset_convolve); } } } } static inline void highbd_12_dist_wtd_convolve_y_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d1 = highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x4_t d2 = highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d3 = highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline void highbd_dist_wtd_convolve_y_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, 
&s8); uint16x4_t d0 = highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_12_convolve4_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); sum = vmlal_lane_s16(sum, s1, filter, 1); sum = vmlal_lane_s16(sum, s2, filter, 2); sum = vmlal_lane_s16(sum, s3, filter, 3); return vqshrun_n_s32(sum, ROUND0_BITS + 2); } static inline uint16x8_t highbd_12_convolve4_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), vqshrun_n_s32(sum1, ROUND0_BITS + 2)); } static inline void highbd_12_dist_wtd_convolve_y_4tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x4_t d0 = highbd_12_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); uint16x4_t d1 = highbd_12_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); uint16x4_t d2 = highbd_12_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d3 = highbd_12_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s 
= (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x8_t d0 = highbd_12_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve4_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); sum = vmlal_lane_s16(sum, s1, filter, 1); sum = vmlal_lane_s16(sum, s2, filter, 2); sum = vmlal_lane_s16(sum, s3, filter, 3); return vqshrun_n_s32(sum, ROUND0_BITS); } static inline uint16x8_t highbd_convolve4_8( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), vqshrun_n_s32(sum1, ROUND0_BITS)); } static inline void highbd_dist_wtd_convolve_y_4tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; 
height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline void highbd_12_dist_wtd_convolve_y_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline void highbd_dist_wtd_convolve_y_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, 
src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_highbd_dist_wtd_convolve_y_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); int dst16_stride = conv_params->dst_stride; const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_y->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); src -= vert_offset * src_stride; if (bd == 12) { if (conv_params->do_average) { if (y_filter_taps <= 4) { highbd_12_dist_wtd_convolve_y_4tap_neon( src + 2 * src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } else { highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } } else { if (y_filter_taps <= 4) { highbd_12_dist_wtd_convolve_y_4tap_neon( src + 2 * src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); } else if (y_filter_taps == 6) { highbd_12_dist_wtd_convolve_y_6tap_neon( src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); } else { highbd_12_dist_wtd_convolve_y_8tap_neon( src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); } } } else { if (conv_params->do_average) { if (y_filter_taps <= 4) { highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } else { highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, im_stride, w, h, y_filter_ptr, round_offset_conv); } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (y_filter_taps <= 4) { highbd_dist_wtd_convolve_y_4tap_neon(src + 2 * src_stride, src_stride, dst16, 
dst16_stride, w, h, y_filter_ptr, round_offset_conv); } else if (y_filter_taps == 6) { highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); } else { highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv); } } } } static inline void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int round_bits, const int offset) { if (w <= 4) { const int16x4_t round_shift_s16 = vdup_n_s16(round_bits); const uint16x4_t offset_u16 = vdup_n_u16(offset); for (int y = 0; y < h; ++y) { const uint16x4_t s = vld1_u16(src_ptr + y * src_stride); uint16x4_t d = vshl_u16(s, round_shift_s16); d = vadd_u16(d, offset_u16); if (w == 2) { store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); } } } else { const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits); const uint16x8_t offset_u16 = vdupq_n_u16(offset); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; x += 8) { const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x); uint16x8_t d = vshlq_u16(s, round_shift_s16); d = vaddq_u16(d, offset_u16); vst1q_u16(dst_ptr + y * dst_stride + x, d); } } } } void av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); const int im_stride = MAX_SB_SIZE; CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); if (conv_params->do_average) { highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits, round_offset); } else { highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits, round_offset); } if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } } } static inline uint16x4_t highbd_convolve6_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const int32x4_t offset) { // Values at indices 0 and 7 of y_filter are zero. 
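// A 6-tap vertical kernel is stored in the usual 8-tap coefficient array with
// those two end taps equal to zero, so the multiply-accumulate chain below
// only references filter lanes 1..6 and skips the two zero-tap multiplies.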
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline uint16x8_t highbd_convolve6_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int32x4_t offset) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } static inline void highbd_dist_wtd_convolve_2d_vert_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s 
+= 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve8_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline uint16x8_t highbd_convolve8_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int32x4_t offset) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } static inline void highbd_dist_wtd_convolve_2d_vert_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w <= 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; 
s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { // The smallest block height is 4, and the horizontal convolution needs to // process an extra (filter_taps/2 - 1) lines for the vertical convolution. assert(h >= 5); const int32x4_t offset_vec = vdupq_n_s32(offset); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], x_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], x_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6]; load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } static inline void highbd_dist_wtd_convolve_2d_horiz_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { // The smallest block height is 4, and the horizontal convolution needs to // process an extra (filter_taps/2 - 1) lines for the vertical convolution. 
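// Here h is the intermediate block height im_h = output_height +
// clamped_y_taps - 1 computed by the caller; e.g. a 4-row block with a 6-tap
// vertical filter gives h = 9, so the 4-row main loop below runs twice and
// the single-row tail loop handles the remaining row.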
assert(h >= 5); const int32x4_t offset_vec = vdupq_n_s32(offset); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], x_filter, offset_vec); uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], x_filter, offset_vec); uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6]; load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], x_filter, offset_vec); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } static inline void highbd_12_dist_wtd_convolve_2d_horiz_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { // The smallest block height is 4, and the horizontal convolution needs to // process an extra (filter_taps/2 - 1) lines for the vertical convolution. assert(h >= 5); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { // 4-tap filters are used for blocks having width == 4. 
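// The four non-zero coefficients sit at indices 2..5 of the 8-tap filter
// array, so the kernel is loaded from x_filter_ptr + 2 and the source pointer
// is advanced by 1 below, keeping the first sample read aligned with the
// first non-zero tap of the reference 8-tap convolution.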
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 1); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 4); do { int16x4_t s0[4]; load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); vst1_u16(d, d0); s += src_stride; d += dst_stride; } while (--h != 0); } else { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); uint16x8_t d1 = highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, offset_vec); uint16x8_t d2 = highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, offset_vec); uint16x8_t d3 = highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); uint16x8_t d0 = highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } static inline void highbd_dist_wtd_convolve_2d_horiz_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, const int offset) { // The smallest block height is 4, and the horizontal convolution needs to // process an extra (filter_taps/2 - 1) lines for the vertical convolution. assert(h >= 5); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { // 4-tap filters are used for blocks having width == 4. 
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 1); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 4); do { int16x4_t s0[4]; load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); vst1_u16(d, d0); s += src_stride; d += dst_stride; } while (--h != 0); } else { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], x_filter, offset_vec); uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], s2[7], x_filter, offset_vec); uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], s3[7], x_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], x_filter, offset_vec); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } void av1_highbd_dist_wtd_convolve_2d_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); DECLARE_ALIGNED(16, uint16_t, im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_x_taps = x_filter_taps < 6 ? 
6 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a // faster non-rounding non-saturating left shift. const int round_offset_conv_x = (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset_conv_y = (1 << y_offset_bits); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); // horizontal filter if (bd == 12) { if (x_filter_taps <= 6 && w != 4) { highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, round_offset_conv_x); } else { highbd_12_dist_wtd_convolve_2d_horiz_neon( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, round_offset_conv_x); } } else { if (x_filter_taps <= 6 && w != 4) { highbd_dist_wtd_convolve_2d_horiz_6tap_neon( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, round_offset_conv_x); } else { highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, round_offset_conv_x); } } // vertical filter if (y_filter_taps <= 6) { if (conv_params->do_average) { highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2, im_stride, w, h, y_filter_ptr, round_offset_conv_y); } else { highbd_dist_wtd_convolve_2d_vert_6tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv_y); } } else { if (conv_params->do_average) { highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2, im_stride, w, h, y_filter_ptr, round_offset_conv_y); } else { highbd_dist_wtd_convolve_2d_vert_8tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv_y); } } // Do the compound averaging outside the loop, avoids branching within the // main loop if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bd); } } } } aom-3.12.1/av1/common/arm/highbd_compound_convolve_neon.h000066400000000000000000000241531477627663500234000ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS static inline void highbd_12_comp_avg_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params) { const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); if (w == 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint16x4_t avg = vhadd_u16(src, ref); int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); vst1_u16(dst_ptr, d0_u16); src_ptr += src_stride; ref_ptr += ref_stride; dst_ptr += dst_stride; } while (--h != 0); } else { do { int width = w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint16x8_t avg = vhaddq_u16(s, r); int32x4_t d0_lo = vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); int32x4_t d0_hi = vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); d0 = vminq_u16(d0, max); vst1q_u16(dst, d0); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; ref_ptr += ref_stride; dst_ptr += dst_stride; } while (--h != 0); } } static inline void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, const int bd) { const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint16x4_t offset_vec = vdup_n_u16((uint16_t)offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w == 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint16x4_t avg = vhadd_u16(src, ref); int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); vst1_u16(dst_ptr, d0_u16); src_ptr += src_stride; ref_ptr += ref_stride; dst_ptr += dst_stride; } while (--h != 0); } else { do { int width = w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint16x8_t avg = vhaddq_u16(s, r); int32x4_t d0_lo = vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); int32x4_t d0_hi = vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); d0 = vminq_u16(d0, max); vst1q_u16(dst, d0); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; ref_ptr += 
ref_stride; dst_ptr += dst_stride; } while (--h != 0); } } static inline void highbd_12_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params) { const int offset_bits = 12 + 2 * FILTER_BITS - ROUND0_BITS - 2; const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint32x4_t offset_vec = vdupq_n_u32(offset); const uint16x8_t max = vdupq_n_u16((1 << 12) - 1); uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); // Weighted averaging if (w == 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); vst1_u16(dst_ptr, d0_u16); src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } else { do { int width = w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); d01 = vminq_u16(d01, max); vst1q_u16(dst, d01); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } } static inline void highbd_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, const int bd) { const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const uint32x4_t offset_vec = vdupq_n_u32(offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); // Weighted averaging if (w == 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); vst1_u16(dst_ptr, d0_u16); src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } else { do { int width = 
w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), vqrshrun_n_s32(d1, ROUND_SHIFT)); d01 = vminq_u16(d01, max); vst1q_u16(dst, d01); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } } aom-3.12.1/av1/common/arm/highbd_compound_convolve_sve2.c000066400000000000000000001657001477627663500233170ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/aom_neon_sve2_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/filter.h" #include "av1/common/arm/highbd_compound_convolve_neon.h" #include "av1/common/arm/highbd_convolve_neon.h" #include "av1/common/arm/highbd_convolve_sve2.h" DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, }; static inline uint16x8_t highbd_12_convolve8_8_x(int16x8_t s0[8], int16x8_t filter, int64x2_t offset) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); sum[1] = aom_sdotq_s16(offset, s0[1], filter); sum[2] = aom_sdotq_s16(offset, s0[2], filter); sum[3] = aom_sdotq_s16(offset, s0[3], filter); sum[4] = aom_sdotq_s16(offset, s0[4], filter); sum[5] = aom_sdotq_s16(offset, s0[5], filter); sum[6] = aom_sdotq_s16(offset, s0[6], filter); sum[7] = aom_sdotq_s16(offset, s0[7], filter); sum[0] = vpaddq_s64(sum[0], sum[1]); sum[2] = vpaddq_s64(sum[2], sum[3]); sum[4] = vpaddq_s64(sum[4], sum[5]); sum[6] = vpaddq_s64(sum[6], sum[7]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); } static inline void highbd_12_dist_wtd_convolve_x_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr) { const int64x1_t offset_vec = vcreate_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); const int64x2_t offset_lo = vcombine_s64(offset_vec, 
vdup_n_s64(0)); const int16x8_t filter = vld1q_s16(x_filter_ptr); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset_lo); uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset_lo); uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset_lo); uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset_lo); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } static inline uint16x8_t highbd_convolve8_8_x(int16x8_t s0[8], int16x8_t filter, int64x2_t offset) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); sum[1] = aom_sdotq_s16(offset, s0[1], filter); sum[2] = aom_sdotq_s16(offset, s0[2], filter); sum[3] = aom_sdotq_s16(offset, s0[3], filter); sum[4] = aom_sdotq_s16(offset, s0[4], filter); sum[5] = aom_sdotq_s16(offset, s0[5], filter); sum[6] = aom_sdotq_s16(offset, s0[6], filter); sum[7] = aom_sdotq_s16(offset, s0[7], filter); sum[0] = vpaddq_s64(sum[0], sum[1]); sum[2] = vpaddq_s64(sum[2], sum[3]); sum[4] = vpaddq_s64(sum[4], sum[5]); sum[6] = vpaddq_s64(sum[6], sum[7]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), vqrshrun_n_s32(sum4567, ROUND0_BITS)); } static inline void highbd_dist_wtd_convolve_x_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, const int bd) { const int64x1_t offset_vec = vcreate_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); const int64x2_t offset_lo = vcombine_s64(offset_vec, vdup_n_s64(0)); const int16x8_t filter = vld1q_s16(x_filter_ptr); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset_lo); uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset_lo); uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset_lo); uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset_lo); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } // clang-format off DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { 0, 2, 4, 6, 1, 3, 5, 7, }; // clang-format on static inline uint16x4_t highbd_12_convolve4_4_x(int16x8_t s0, int16x8_t filter, int64x2_t offset, uint16x8x2_t permute_tbl) { int16x8_t permuted_samples0 = 
aom_tbl_s16(s0, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); } static inline uint16x8_t highbd_12_convolve4_8_x(int16x8_t s0[4], int16x8_t filter, int64x2_t offset, uint16x8_t tbl) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS + 2), vqrshrun_n_s32(sum2637, ROUND0_BITS + 2)); return aom_tbl_u16(res, tbl); } static inline void highbd_12_dist_wtd_convolve_x_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr) { const int64x2_t offset = vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); if (width == 4) { uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)(src); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } static inline uint16x4_t highbd_convolve4_4_x(int16x8_t s0, int16x8_t filter, int64x2_t offset, uint16x8x2_t permute_tbl) { int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); return vqrshrun_n_s32(sum0123, ROUND0_BITS); } static inline uint16x8_t highbd_convolve4_8_x(int16x8_t s0[4], int16x8_t 
filter, int64x2_t offset, uint16x8_t tbl) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, ROUND0_BITS), vqrshrun_n_s32(sum2637, ROUND0_BITS)); return aom_tbl_u16(res, tbl); } static inline void highbd_dist_wtd_convolve_x_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, const int bd) { const int64x2_t offset = vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); if (width == 4) { uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)(src); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_highbd_dist_wtd_convolve_x_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); if (x_filter_taps == 6) { av1_highbd_dist_wtd_convolve_x_neon(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } int dst16_stride = conv_params->dst_stride; const int im_stride = MAX_SB_SIZE; const int horiz_offset = filter_params_x->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); src -= horiz_offset; if (bd == 12) { if (conv_params->do_average) { if (x_filter_taps <= 4) { highbd_12_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, im_stride, w, h, x_filter_ptr); 
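// The src + 2 in the 4-tap calls partially undoes the src -= horiz_offset
// adjustment applied above: the non-zero taps of a 4-tap kernel occupy
// indices 2..5 of the 8-tap coefficient array, and the helpers load the
// filter from x_filter_ptr + 2 to match.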
} else { highbd_12_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, im_stride, w, h, x_filter_ptr); } if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } } else { if (x_filter_taps <= 4) { highbd_12_dist_wtd_convolve_x_4tap_sve2( src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); } else { highbd_12_dist_wtd_convolve_x_8tap_sve2( src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr); } } } else { if (conv_params->do_average) { if (x_filter_taps <= 4) { highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, im_stride, w, h, x_filter_ptr, bd); } else { highbd_dist_wtd_convolve_x_8tap_sve2(src, src_stride, im_block, im_stride, w, h, x_filter_ptr, bd); } if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (x_filter_taps <= 4) { highbd_dist_wtd_convolve_x_4tap_sve2( src + 2, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); } else { highbd_dist_wtd_convolve_x_8tap_sve2( src, src_stride, dst16, dst16_stride, w, h, x_filter_ptr, bd); } } } } static inline uint16x4_t highbd_12_convolve8_4_y(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); return vqrshrun_n_s32(sum0123, ROUND0_BITS + 2); } static inline uint16x8_t highbd_12_convolve8_8_y(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS + 2), vqrshrun_n_s32(sum4567, ROUND0_BITS + 2)); } static inline void highbd_12_dist_wtd_convolve_y_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr) { const int64x2_t offset = vdupq_n_s64((1 << (12 + FILTER_BITS)) + (1 << (12 + FILTER_BITS - 1))); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
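// The lanes of merge_block_tbl that select "new" columns hold offsets
// relative to the start of the second operand of the two-vector table lookup.
// That operand begins at lane svcnth() at run time (8 halfwords on a 128-bit
// implementation, 16 on a 256-bit one), so svcnth() is added to exactly those
// lanes: one lane per group of four for val[0], two for val[1] and three for
// val[2], matching the one/two/three new columns merged in the loops below.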
uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); if (width == 4) { int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_4x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x4_t d0 = highbd_12_convolve8_4_y(s0123, s4567, y_filter, offset); uint16x4_t d1 = highbd_12_convolve8_4_y(s1234, s5678, y_filter, offset); uint16x4_t d2 = highbd_12_convolve8_4_y(s2345, s6789, y_filter, offset); uint16x4_t d3 = highbd_12_convolve8_4_y(s3456, s789A, y_filter, offset); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. 
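// Each of s4567, s5678 and s6789 is assembled by a two-register table lookup
// over { s3456, s789A }: the indices take the overlapping rows from the
// previous block and the remaining rows from the freshly transposed one, so
// nothing has to be re-loaded or re-transposed.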
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x8_t d0 = highbd_12_convolve8_8_y(s0123, s4567, y_filter, offset); uint16x8_t d1 = highbd_12_convolve8_8_y(s1234, s5678, y_filter, offset); uint16x8_t d2 = highbd_12_convolve8_8_y(s2345, s6789, y_filter, offset); uint16x8_t d3 = highbd_12_convolve8_8_y(s3456, s789A, y_filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s3456[2] = s789A[2]; s3456[3] = s789A[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); return vqrshrun_n_s32(sum0123, ROUND0_BITS); } static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); return vcombine_u16(vqrshrun_n_s32(sum0123, ROUND0_BITS), vqrshrun_n_s32(sum4567, ROUND0_BITS)); } static inline void highbd_dist_wtd_convolve_y_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, const int bd) { const int64x2_t offset = vdupq_n_s64((1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1))); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); if (width == 4) { int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_4x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, offset); uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, offset); uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, offset); uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, offset); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. 
aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, offset); uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, offset); uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, offset); uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s3456[2] = s789A[2]; s3456[3] = s789A[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } void av1_highbd_dist_wtd_convolve_y_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); if (y_filter_taps != 8) { av1_highbd_dist_wtd_convolve_y_neon(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, conv_params, bd); return; } int dst16_stride = conv_params->dst_stride; const int im_stride = MAX_SB_SIZE; const int vert_offset = filter_params_y->taps / 2 - 1; assert(FILTER_BITS == COMPOUND_ROUND1_BITS); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); src -= vert_offset * src_stride; if (bd == 12) { if (conv_params->do_average) { highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, w, h, y_filter_ptr); if (conv_params->use_dist_wtd_comp_avg) { highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params); } } else { highbd_12_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr); } } else { if (conv_params->do_average) { highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, im_block, im_stride, w, h, y_filter_ptr, bd); if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } else { highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { highbd_dist_wtd_convolve_y_8tap_sve2(src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, bd); } } } static inline void highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr) { const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 2)); const int16x8_t filter = vld1q_s16(x_filter_ptr); // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at // a time and then process the last 3 rows separately. 
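// (The caller, av1_highbd_dist_wtd_convolve_2d_sve2 below, passes
// im_h = h + clamped_y_taps - 1, with h a multiple of 4 and the clamped tap
// count either 4 or 8, hence im_h % 4 == 3.)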
do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); uint16x8_t d3 = highbd_12_convolve8_8_x(s3, filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. const int16_t *s = (const int16_t *)src; do { int16x8_t s0[8], s1[8], s2[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); uint16x8_t d0 = highbd_12_convolve8_8_x(s0, filter, offset); uint16x8_t d1 = highbd_12_convolve8_8_x(s1, filter, offset); uint16x8_t d2 = highbd_12_convolve8_8_x(s2, filter, offset); store_u16_8x3(dst, dst_stride, d0, d1, d2); s += 8; dst += 8; width -= 8; } while (width != 0); } static inline void highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, const int bd) { const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 2)); const int16x8_t filter = vld1q_s16(x_filter_ptr); // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at // a time and then process the last 3 rows separately. do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); uint16x8_t d3 = highbd_convolve8_8_x(s3, filter, offset); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. 
const int16_t *s = (const int16_t *)src; do { int16x8_t s0[8], s1[8], s2[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); uint16x8_t d0 = highbd_convolve8_8_x(s0, filter, offset); uint16x8_t d1 = highbd_convolve8_8_x(s1, filter, offset); uint16x8_t d2 = highbd_convolve8_8_x(s2, filter, offset); store_u16_8x3(dst, dst_stride, d0, d1, d2); s += 8; dst += 8; width -= 8; } while (width != 0); } static inline void highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr) { const int64x2_t offset = vdupq_n_s64(1 << (12 + FILTER_BITS - 1)); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at // a time and then process the last 3 rows separately. if (width == 4) { uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)(src); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); uint16x4_t d3 = highbd_12_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); uint16x4_t d0 = highbd_12_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_12_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_12_convolve4_4_x(s2, filter, offset, permute_tbl); store_u16_4x3(dst, dst_stride, d0, d1, d2); } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); uint16x8_t d3 = highbd_12_convolve4_8_x(s3, filter, offset, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. 
const int16_t *s = (const int16_t *)(src); do { int16x8_t s0[4], s1[4], s2[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); uint16x8_t d0 = highbd_12_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_12_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_12_convolve4_8_x(s2, filter, offset, idx); store_u16_8x3(dst, dst_stride, d0, d1, d2); s += 8; dst += 8; width -= 8; } while (width != 0); } } static inline void highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, const int bd) { const int64x2_t offset = vdupq_n_s64(1 << (bd + FILTER_BITS - 1)); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); // We are only doing 8-tap and 4-tap vertical convolutions, therefore we know // that im_h % 4 = 3, so we can do the loop across the whole block 4 rows at // a time and then process the last 3 rows separately. if (width == 4) { uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)(src); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); uint16x4_t d3 = highbd_convolve4_4_x(s3, filter, offset, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); uint16x4_t d0 = highbd_convolve4_4_x(s0, filter, offset, permute_tbl); uint16x4_t d1 = highbd_convolve4_4_x(s1, filter, offset, permute_tbl); uint16x4_t d2 = highbd_convolve4_4_x(s2, filter, offset, permute_tbl); store_u16_4x3(dst, dst_stride, d0, d1, d2); } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); uint16x8_t d3 = highbd_convolve4_8_x(s3, filter, offset, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 4); // Process final 3 rows. 
const int16_t *s = (const int16_t *)(src); do { int16x8_t s0[4], s1[4], s2[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); uint16x8_t d0 = highbd_convolve4_8_x(s0, filter, offset, idx); uint16x8_t d1 = highbd_convolve4_8_x(s1, filter, offset, idx); uint16x8_t d2 = highbd_convolve4_8_x(s2, filter, offset, idx); store_u16_8x3(dst, dst_stride, d0, d1, d2); s += 8; dst += 8; width -= 8; } while (width != 0); } } static inline uint16x4_t highbd_convolve8_4_2d_v(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); return vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS); } static inline uint16x8_t highbd_convolve8_8_2d_v(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, int64x2_t offset) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); return vcombine_u16(vqrshrun_n_s32(sum0123, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum4567, COMPOUND_ROUND1_BITS)); } static inline void highbd_dist_wtd_convolve_2d_vert_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int64x2_t offset_s64 = vdupq_n_s64(offset); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); if (width == 4) { int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. 
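// After transpose_concat_4x4 each group of four consecutive 16-bit elements
// holds the four vertical taps for one output pixel, i.e. exactly the
// 4-element segment consumed per 64-bit lane by aom_svdot_lane_s16.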
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_4x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x4_t d0 = highbd_convolve8_4_2d_v(s0123, s4567, y_filter, offset_s64); uint16x4_t d1 = highbd_convolve8_4_2d_v(s1234, s5678, y_filter, offset_s64); uint16x4_t d2 = highbd_convolve8_4_2d_v(s2345, s6789, y_filter, offset_s64); uint16x4_t d3 = highbd_convolve8_4_2d_v(s3456, s789A, y_filter, offset_s64); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x8_t d0 = highbd_convolve8_8_2d_v(s0123, s4567, y_filter, offset_s64); uint16x8_t d1 = highbd_convolve8_8_2d_v(s1234, s5678, y_filter, offset_s64); uint16x8_t d2 = highbd_convolve8_8_2d_v(s2345, s6789, y_filter, offset_s64); uint16x8_t d3 = highbd_convolve8_8_2d_v(s3456, s789A, y_filter, offset_s64); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s3456[2] = s789A[2]; s3456[3] = s789A[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } static inline uint16x4_t highbd_convolve4_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s0, filter, 0); sum = vmlal_lane_s16(sum, s1, filter, 1); sum = vmlal_lane_s16(sum, s2, filter, 2); sum = vmlal_lane_s16(sum, s3, filter, 3); return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); } static inline uint16x8_t highbd_convolve4_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t filter, const int32x4_t offset) { int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter, 3); return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); } static inline void highbd_dist_wtd_convolve_2d_vert_4tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int offset) { const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2); const int32x4_t offset_vec = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x4_t d0 = highbd_convolve4_4_2d_v(s0, s1, s2, s3, y_filter, offset_vec); uint16x4_t d1 = highbd_convolve4_4_2d_v(s1, s2, s3, s4, y_filter, offset_vec); uint16x4_t d2 = highbd_convolve4_4_2d_v(s2, s3, s4, s5, y_filter, offset_vec); uint16x4_t d3 = highbd_convolve4_4_2d_v(s3, s4, s5, s6, y_filter, offset_vec); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); uint16x8_t d0 = highbd_convolve4_8_2d_v(s0, s1, s2, s3, y_filter, offset_vec); uint16x8_t d1 = highbd_convolve4_8_2d_v(s1, s2, s3, s4, y_filter, offset_vec); uint16x8_t d2 = highbd_convolve4_8_2d_v(s2, s3, s4, s5, y_filter, offset_vec); uint16x8_t d3 = highbd_convolve4_8_2d_v(s3, s4, s5, s6, y_filter, offset_vec); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_highbd_dist_wtd_convolve_2d_sve2( const uint16_t *src, int src_stride, uint16_t 
*dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); DECLARE_ALIGNED(16, uint16_t, im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; if (x_filter_taps == 6 || y_filter_taps == 6) { av1_highbd_dist_wtd_convolve_2d_neon( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset_conv_y = (1 << y_offset_bits); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (bd == 12) { if (x_filter_taps <= 4) { highbd_12_dist_wtd_convolve_2d_horiz_4tap_sve2( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } else { highbd_12_dist_wtd_convolve_2d_horiz_8tap_sve2( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr); } } else { if (x_filter_taps <= 4) { highbd_dist_wtd_convolve_2d_horiz_4tap_sve2( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd); } else { highbd_dist_wtd_convolve_2d_horiz_8tap_sve2( src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, bd); } } if (conv_params->do_average) { if (y_filter_taps <= 4) { highbd_dist_wtd_convolve_2d_vert_4tap_neon(im_block, im_stride, im_block2, im_stride, w, h, y_filter_ptr, round_offset_conv_y); } else { highbd_dist_wtd_convolve_2d_vert_8tap_sve2(im_block, im_stride, im_block2, im_stride, w, h, y_filter_ptr, round_offset_conv_y); } if (conv_params->use_dist_wtd_comp_avg) { if (bd == 12) { highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bd); } } else { if (bd == 12) { highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params); } else { highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bd); } } } else { if (y_filter_taps <= 4) { highbd_dist_wtd_convolve_2d_vert_4tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv_y); } else { highbd_dist_wtd_convolve_2d_vert_8tap_sve2( im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, round_offset_conv_y); } } } aom-3.12.1/av1/common/arm/highbd_convolve_horiz_rs_neon.c000066400000000000000000000256471477627663500234170ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/arm/highbd_convolve_neon.h" #define UPSCALE_NORMATIVE_TAPS 8 void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd) { const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1; static const int32_t kIdx[4] = { 0, 1, 2, 3 }; const int32x4_t idx = vld1q_s32(kIdx); const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK); const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS); const int32x4_t offset_s32 = vdupq_n_s32(0); const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const uint16_t *src_ptr = src - horiz_offset; uint16_t *dst_ptr = dst; if (w <= 4) { int height = h; uint16_t *d = dst_ptr; do { int x_qn = x0_qn; // Load 4 src vectors at a time, they might be the same, but we have to // calculate the indices anyway. Doing it in SIMD and then storing the // indices is faster than having to calculate the expression // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times // Ideally this should be a gather using the indices, but NEON does not // have that, so have to emulate const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn); // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) = // 2 const int32x4_t src_idx = vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1); // Similarly for the filter vector indices, we calculate the filter // indices for 4 columns. First we calculate the indices: // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS // Then we calculate the actual pointers, multiplying with // UPSCALE_UPSCALE_NORMATIVE_TAPS // again shift left by 1 const int32x4_t x_filter4_idx = vshlq_n_s32( vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1); // Even though pointers are unsigned 32/64-bit ints we do signed // addition The reason for this is that x_qn can be negative, leading to // negative offsets. Argon test // profile0_core/streams/test10573_11003.obu was failing because of // this. 
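// On AArch64 the 32-bit byte offsets are sign-extended to 64 bits and added
// to the 64-bit base pointer (vaddw_s32); the resulting addresses are spilled
// to small arrays so each row and each filter can then be loaded with an
// ordinary vld1q_s16. The 32-bit path does the same with 32-bit adds.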
#if AOM_ARCH_AARCH64 uint64x2_t tmp4[2]; tmp4[0] = vreinterpretq_u64_s64(vaddw_s32( vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx))); tmp4[1] = vreinterpretq_u64_s64(vaddw_s32( vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx))); int16_t *src4_ptr[4]; uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; vst1q_u64(tmp_ptr, tmp4[0]); vst1q_u64(tmp_ptr + 2, tmp4[1]); // filter vectors tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); const int16_t *x_filter4_ptr[4]; tmp_ptr = (uint64_t *)&x_filter4_ptr; vst1q_u64(tmp_ptr, tmp4[0]); vst1q_u64(tmp_ptr + 2, tmp4[1]); #else uint32x4_t tmp4; tmp4 = vreinterpretq_u32_s32( vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx)); int16_t *src4_ptr[4]; uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; vst1q_u32(tmp_ptr, tmp4); // filter vectors tmp4 = vreinterpretq_u32_s32( vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); const int16_t *x_filter4_ptr[4]; tmp_ptr = (uint32_t *)&x_filter4_ptr; vst1q_u32(tmp_ptr, tmp4); #endif // AOM_ARCH_AARCH64 // Load source int16x8_t s0 = vld1q_s16(src4_ptr[0]); int16x8_t s1 = vld1q_s16(src4_ptr[1]); int16x8_t s2 = vld1q_s16(src4_ptr[2]); int16x8_t s3 = vld1q_s16(src4_ptr[3]); // Actually load the filters const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); // Group low and high parts and transpose int16x4_t filters_lo[] = { vget_low_s16(x_filter0), vget_low_s16(x_filter1), vget_low_s16(x_filter2), vget_low_s16(x_filter3) }; int16x4_t filters_hi[] = { vget_high_s16(x_filter0), vget_high_s16(x_filter1), vget_high_s16(x_filter2), vget_high_s16(x_filter3) }; transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); // Run the 2D Scale convolution uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); d0 = vmin_u16(d0, max); if (w == 2) { store_u16_2x1(d, d0); } else { vst1_u16(d, d0); } src_ptr += src_stride; d += dst_stride; height--; } while (height > 0); } else { int height = h; do { int width = w; int x_qn = x0_qn; uint16_t *d = dst_ptr; const uint16_t *s = src_ptr; do { // Load 4 src vectors at a time, they might be the same, but we have to // calculate the indices anyway. Doing it in SIMD and then storing the // indices is faster than having to calculate the expression // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times // Ideally this should be a gather using the indices, but NEON does not // have that, so have to emulate const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn); // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) // = 2 const int32x4_t src_idx = vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1); // Similarly for the filter vector indices, we calculate the filter // indices for 4 columns. 
First we calculate the indices: // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS // Then we calculate the actual pointers, multiplying with // UPSCALE_UPSCALE_NORMATIVE_TAPS // again shift left by 1 const int32x4_t x_filter4_idx = vshlq_n_s32( vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1); // Even though pointers are unsigned 32/64-bit ints we do signed // addition The reason for this is that x_qn can be negative, leading to // negative offsets. Argon test // profile0_core/streams/test10573_11003.obu was failing because of // this. #if AOM_ARCH_AARCH64 uint64x2_t tmp4[2]; tmp4[0] = vreinterpretq_u64_s64( vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx))); tmp4[1] = vreinterpretq_u64_s64( vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx))); int16_t *src4_ptr[4]; uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; vst1q_u64(tmp_ptr, tmp4[0]); vst1q_u64(tmp_ptr + 2, tmp4[1]); // filter vectors tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); const int16_t *x_filter4_ptr[4]; tmp_ptr = (uint64_t *)&x_filter4_ptr; vst1q_u64(tmp_ptr, tmp4[0]); vst1q_u64(tmp_ptr + 2, tmp4[1]); #else uint32x4_t tmp4; tmp4 = vreinterpretq_u32_s32( vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx)); int16_t *src4_ptr[4]; uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; vst1q_u32(tmp_ptr, tmp4); // filter vectors tmp4 = vreinterpretq_u32_s32( vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); const int16_t *x_filter4_ptr[4]; tmp_ptr = (uint32_t *)&x_filter4_ptr; vst1q_u32(tmp_ptr, tmp4); #endif // AOM_ARCH_AARCH64 // Load source int16x8_t s0 = vld1q_s16(src4_ptr[0]); int16x8_t s1 = vld1q_s16(src4_ptr[1]); int16x8_t s2 = vld1q_s16(src4_ptr[2]); int16x8_t s3 = vld1q_s16(src4_ptr[3]); // Actually load the filters const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); // Group low and high parts and transpose int16x4_t filters_lo[] = { vget_low_s16(x_filter0), vget_low_s16(x_filter1), vget_low_s16(x_filter2), vget_low_s16(x_filter3) }; int16x4_t filters_hi[] = { vget_high_s16(x_filter0), vget_high_s16(x_filter1), vget_high_s16(x_filter2), vget_high_s16(x_filter3) }; transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); // Run the 2D Scale X convolution uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); d0 = vmin_u16(d0, max); vst1_u16(d, d0); x_qn += 4 * x_step_qn; d += 4; width -= 4; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; height--; } while (height > 0); } } aom-3.12.1/av1/common/arm/highbd_convolve_neon.c000066400000000000000000002372621477627663500214760ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" static inline uint16x4_t highbd_convolve6_4_y( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const uint16x4_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve6_8_y( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_y_sr_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, const int bd) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)(src_ptr + src_stride); uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); uint16x4_t d1 = highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); uint16x4_t d2 = highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); uint16x4_t d3 = highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const 
uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // Width is a multiple of 8 and height is a multiple of 4. do { int height = h; const int16_t *s = (const int16_t *)(src_ptr + src_stride); uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7, max); uint16x8_t d1 = highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7, max); uint16x8_t d2 = highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7, max); uint16x8_t d3 = highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve8_4_y( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_y( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_y_sr_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int bd) { const int16x8_t y_filter = 
vld1q_s16(y_filter_ptr); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x4_t d1 = highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x4_t d2 = highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x4_t d3 = highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, max); uint16x8_t d1 = highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, max); uint16x8_t d2 = highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, max); uint16x8_t d3 = highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve12_4_y( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); uint16x4_t res = vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve12_8_y( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); 
sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_y_sr_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, int bd) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); s += 11 * src_stride; do { int16x4_t s11, s12, s13, s14; load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); uint16x4_t d0 = highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11, max); uint16x4_t d1 = highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11, max); uint16x4_t d2 = highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11, max); uint16x4_t d3 = highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); s += 11 * src_stride; do { int16x8_t s11, s12, s13, s14; load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); uint16x8_t d0 = highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, 
y_filter_8_11, max); uint16x8_t d1 = highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11, max); uint16x8_t d2 = highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11, max); uint16x8_t d3 = highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int vert_offset = filter_params_y->taps / 2 - 1; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); src -= vert_offset * src_stride; if (y_filter_taps > 8) { highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); return; } if (y_filter_taps < 8) { highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); return; } highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); } static inline uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], const int16x8_t x_filter, const int32x4_t offset, const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int32x4_t sum0 = offset; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); int32x4_t sum1 = offset; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), vqrshrun_n_s32(sum1, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // This shim allows to do only one rounding shift instead of two. 
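  // Pre-loading the accumulator with 1 << (round_0 - 1) stands in for the
  // first rounding stage, so the single vqrshrun_n_s32(sum, FILTER_BITS) in
  // the kernels above covers both shifts.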
const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset, max); uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset, max); uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset, max); uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } static inline uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], const int16x4_t x_filter, const int32x4_t offset, const uint16x4_t max) { int32x4_t sum = offset; sum = vmlal_lane_s16(sum, s[0], x_filter, 0); sum = vmlal_lane_s16(sum, s[1], x_filter, 1); sum = vmlal_lane_s16(sum, s[2], x_filter, 2); sum = vmlal_lane_s16(sum, s[3], x_filter, 3); uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], const int16x8_t x_filter, const int32x4_t offset, const uint16x8_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int32x4_t sum0 = offset; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); int32x4_t sum1 = offset; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), vqrshrun_n_s32(sum1, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { // This shim allows to do only one rounding shift instead of two. const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); // 4-tap filters are used for blocks having width == 4. 
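    // The four non-zero taps sit at indices 2..5 of the 8-tap filter array,
    // hence the +2 offset applied to both the filter and the source pointers.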
const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 2); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset, max); uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset, max); uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset, max); uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset, max); uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset, max); uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset, max); uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } } static inline uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t offset, const uint16x4_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum = offset; sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0); sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t offset, const uint16x8_t max) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum0 = offset; sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); int32x4_t sum1 = offset; sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), vqrshrun_n_s32(sum1, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { // This shim allows to do only one rounding shift instead of two. const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x4_t s0[12], s1[12], s2[12], s3[12]; load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], &s1[11]); load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], &s2[11]); load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], &s3[11]); uint16x4_t d0 = highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d1 = highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d2 = highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset, max); uint16x4_t d3 = highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[12], s1[12], s2[12], s3[12]; load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], 
&s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], &s1[11]); load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], &s2[11]); load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], &s3[11]); uint16x8_t d0 = highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d1 = highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d2 = highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset, max); uint16x8_t d3 = highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int horiz_offset = filter_params_x->taps / 2 - 1; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); src -= horiz_offset; if (x_filter_taps > 8) { highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); return; } if (x_filter_taps <= 6 && w != 4) { highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); return; } highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); } static inline uint16x4_t highbd_convolve6_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x8_t y_filter, const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); sum = vshlq_s32(sum, round_shift); uint16x4_t res = vqmovun_s32(sum); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve6_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t y_filter, const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) { // Values at indices 0 and 7 of y_filter are zero. 
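  // Only filter lanes 1 through 6 contribute to the sums below.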
const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); return vminq_u16(res, max); } static inline void highbd_convolve_2d_sr_vert_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int round1_shift = conv_params->round_1; const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4; load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x4_t s5, s6, s7, s8; load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x4_t d0 = highbd_convolve6_4_2d_v( s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max); uint16x4_t d1 = highbd_convolve6_4_2d_v( s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max); uint16x4_t d2 = highbd_convolve6_4_2d_v( s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max); uint16x4_t d3 = highbd_convolve6_4_2d_v( s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); s += 5 * src_stride; do { int16x8_t s5, s6, s7, s8; load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve8_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t 
s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); sum = vshlq_s32(sum, round_shift); uint16x4_t res = vqmovun_s32(sum); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); return vminq_u16(res, max); } static inline void highbd_convolve_2d_sr_vert_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int offset) { const int16x8_t y_filter = vld1q_s16(y_filter_ptr); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int round1_shift = conv_params->round_1; const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max); uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max); uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, 
round1_shift_s32, offset_s32, max); uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round1_shift_s32, offset_s32, max); uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x4_t highbd_convolve12_4_2d_v( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, const int32x4_t round_shift, const int32x4_t offset, const uint16x4_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); sum = vshlq_s32(sum, round_shift); uint16x4_t res = vqmovun_s32(sum); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve12_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, const int32x4_t round_shift, const int32x4_t offset, const uint16x8_t max) { const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); sum0 = vshlq_s32(sum0, round_shift); sum1 = vshlq_s32(sum1, round_shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); return vminq_u16(res, max); } static inline void highbd_convolve_2d_sr_vert_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int bd, const int offset) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int round1_shift = conv_params->round_1; const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); if (w == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); s += 11 * src_stride; do { int16x4_t s11, s12, s13, s14; load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); uint16x4_t d0 = highbd_convolve12_4_2d_v( s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d1 = highbd_convolve12_4_2d_v( s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d2 = highbd_convolve12_4_2d_v( s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x4_t d3 = highbd_convolve12_4_2d_v( s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int height = h; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10); s += 11 * src_stride; do { int16x8_t s11, s12, s13, s14; load_s16_8x4(s, src_stride, &s11, 
&s12, &s13, &s14); uint16x8_t d0 = highbd_convolve12_8_2d_v( s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d1 = highbd_convolve12_8_2d_v( s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d2 = highbd_convolve12_8_2d_v( s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); uint16x8_t d3 = highbd_convolve12_8_2d_v( s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, y_filter_8_11, round1_shift_s32, offset_s32, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; s5 = s9; s6 = s10; s7 = s11; s8 = s12; s9 = s13; s10 = s14; s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height != 0); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w != 0); } } static inline uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], const int16x8_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { // Values at indices 0 and 7 of y_filter are zero. const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); sum0 = vqrshlq_s32(sum0, shift_s32); sum1 = vqrshlq_s32(sum1, shift_s32); return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } static inline void highbd_convolve_2d_sr_horiz_6tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { // The smallest block height processed by the SIMD functions is 4, and the // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines // for the vertical convolution. 
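  // The main loop below handles 4 rows per iteration; the tail loop then
  // finishes the remaining 1 to 4 rows one row at a time.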
assert(h >= 5); const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); const int32x4_t offset_s32 = vdupq_n_s32(offset); const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6], s1[6], s2[6], s3[6]; load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5]); load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5]); load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5]); uint16x8_t d0 = highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); uint16x8_t d1 = highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32); uint16x8_t d2 = highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32); uint16x8_t d3 = highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[6]; load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); uint16x8_t d0 = highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } static inline uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], const int16x4_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); sum = vmlal_lane_s16(sum, s[1], x_filter, 1); sum = vmlal_lane_s16(sum, s[2], x_filter, 2); sum = vmlal_lane_s16(sum, s[3], x_filter, 3); sum = vqrshlq_s32(sum, shift_s32); return vqmovun_s32(sum); } static inline uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], const int16x8_t x_filter, const int32x4_t shift_s32, const int32x4_t offset) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); sum0 = vqrshlq_s32(sum0, shift_s32); sum1 = vqrshlq_s32(sum1, shift_s32); return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } static inline void highbd_convolve_2d_sr_horiz_neon( const uint16_t *src_ptr, int src_stride, 
uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { // The smallest block height processed by the SIMD functions is 4, and the // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines // for the vertical convolution. assert(h >= 5); const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); const int32x4_t offset_s32 = vdupq_n_s32(offset); if (w == 4) { // 4-tap filters are used for blocks having width <= 4. const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16_t *s = (const int16_t *)(src_ptr + 1); uint16_t *d = dst_ptr; do { int16x4_t s0[4], s1[4], s2[4], s3[4]; load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x4_t d0 = highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); uint16x4_t d1 = highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32); uint16x4_t d2 = highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32); uint16x4_t d3 = highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 4); do { int16x4_t s0[4]; load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); uint16x4_t d0 = highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); vst1_u16(d, d0); s += src_stride; d += dst_stride; } while (--h != 0); } else { const int16x8_t x_filter = vld1q_s16(x_filter_ptr); int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); uint16x8_t d1 = highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32); uint16x8_t d2 = highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32); uint16x8_t d3 = highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); uint16x8_t d0 = highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } static inline uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t shift_s32, const int32x4_t offset) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0); sum = vmlal_lane_s16(sum, s[1], 
x_filter_0_3, 1); sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); sum = vqrshlq_s32(sum, shift_s32); return vqmovun_s32(sum); } static inline uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, const int32x4_t shift_s32, const int32x4_t offset) { const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); sum0 = vqrshlq_s32(sum0, shift_s32); sum1 = vqrshlq_s32(sum1, shift_s32); return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } static inline void highbd_convolve_2d_sr_horiz_12tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int offset) { // The smallest block height processed by the SIMD functions is 4, and the // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines // for the vertical convolution. 
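  // Note that h here is the intermediate block height, i.e. the output
  // height plus (vertical filter taps - 1) extra rows needed by the second
  // pass.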
assert(h >= 5); const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); const int32x4_t offset_s32 = vdupq_n_s32(offset); if (w == 4) { const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x4_t s0[12], s1[12], s2[12], s3[12]; load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], &s1[11]); load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], &s2[11]); load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], &s3[11]); uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h > 4); do { int16x4_t s0[12]; load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); vst1_u16(d, d0); s += src_stride; d += dst_stride; } while (--h != 0); } else { int height = h; do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[12], s1[12], s2[12], s3[12]; load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], &s1[11]); load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], &s2[11]); load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], &s3[11]); uint16x8_t d0 = highbd_convolve12_8_2d_h( s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x8_t d1 = highbd_convolve12_8_2d_h( s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x8_t d2 = highbd_convolve12_8_2d_h( s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); uint16x8_t d3 = highbd_convolve12_8_2d_h( s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; height -= 4; } while (height > 4); do { int width = w; const int16_t *s = (const int16_t *)src_ptr; uint16_t *d = dst_ptr; do { int16x8_t s0[12]; load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); uint16x8_t d0 = highbd_convolve12_8_2d_h( s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); } } void av1_highbd_convolve_2d_sr_neon(const 
uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; const int im_h = h + clamped_y_taps - 1; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; const int x_offset_initial = (1 << (bd + FILTER_BITS - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a // simple shift left instead of a rounding saturating shift left. const int y_offset = (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); if (x_filter_taps > 8) { highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset_initial); highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); return; } if (x_filter_taps <= 6 && w != 4) { highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset_initial); } else { highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset_initial); } if (y_filter_taps <= 6) { highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); } else { highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); } } // Filter used is [64, 64]. 
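// Since the kernel is [64, 64], (64 * a + 64 * b + 64) >> FILTER_BITS equals
// (a + b + 1) >> 1, so the filter reduces to a rounding halving add (vrhadd)
// of adjacent samples.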
void av1_highbd_convolve_x_sr_intrabc_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { assert(subpel_x_qn == 8); assert(filter_params_x->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)conv_params; (void)bd; if (w <= 4) { do { uint16x4_t s0 = vld1_u16(src); uint16x4_t s1 = vld1_u16(src + 1); uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } src += src_stride; dst += dst_stride; } while (--h != 0); } else { do { const uint16_t *src_ptr = src; uint16_t *dst_ptr = dst; int width = w; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + 1); uint16x8_t d0 = vrhaddq_u16(s0, s1); vst1q_u16(dst_ptr, d0); src_ptr += 8; dst_ptr += 8; width -= 8; } while (width != 0); src += src_stride; dst += dst_stride; } while (--h != 0); } } // Filter used is [64, 64]. void av1_highbd_convolve_y_sr_intrabc_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { assert(subpel_y_qn == 8); assert(filter_params_y->taps == 2); (void)filter_params_y; (void)subpel_y_qn; (void)bd; if (w <= 4) { do { uint16x4_t s0 = vld1_u16(src); uint16x4_t s1 = vld1_u16(src + src_stride); uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } src += src_stride; dst += dst_stride; } while (--h != 0); } else { do { const uint16_t *src_ptr = src; uint16_t *dst_ptr = dst; int height = h; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + src_stride); uint16x8_t d0 = vrhaddq_u16(s0, s1); vst1q_u16(dst_ptr, d0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--height != 0); src += 8; dst += 8; w -= 8; } while (w != 0); } } // Both horizontal and vertical passes use the same 2-tap filter: [64, 64]. void av1_highbd_convolve_2d_sr_intrabc_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { assert(subpel_x_qn == 8); assert(subpel_y_qn == 8); assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); (void)filter_params_x; (void)subpel_x_qn; (void)filter_params_y; (void)subpel_y_qn; (void)conv_params; (void)bd; DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); int im_h = h + 1; int im_stride = MAX_SB_SIZE; uint16x8_t vert_offset = vdupq_n_u16(1); uint16_t *im = im_block; // Horizontal filter. if (w <= 4) { do { uint16x4_t s0 = vld1_u16(src); uint16x4_t s1 = vld1_u16(src + 1); uint16x4_t d0 = vadd_u16(s0, s1); // Safe to store the whole vector, the im buffer is big enough. 
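      // (w may be as small as 2 here, but the intermediate buffer rows are
      // MAX_SB_SIZE wide, so writing the full 4-lane vector cannot overrun.)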
vst1_u16(im, d0); src += src_stride; im += im_stride; } while (--im_h != 0); } else { do { const uint16_t *src_ptr = src; uint16_t *im_ptr = im; int width = w; do { uint16x8_t s0 = vld1q_u16(src_ptr); uint16x8_t s1 = vld1q_u16(src_ptr + 1); uint16x8_t d0 = vaddq_u16(s0, s1); vst1q_u16(im_ptr, d0); src_ptr += 8; im_ptr += 8; width -= 8; } while (width != 0); src += src_stride; im += im_stride; } while (--im_h != 0); } im = im_block; // Vertical filter. if (w <= 4) { do { uint16x4_t s0 = vld1_u16(im); uint16x4_t s1 = vld1_u16(im + im_stride); uint16x4_t d0 = vhadd_u16(s0, s1); d0 = vhadd_u16(d0, vget_low_u16(vert_offset)); if (w == 2) { store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } im += im_stride; dst += dst_stride; } while (--h != 0); } else { do { uint16_t *im_ptr = im; uint16_t *dst_ptr = dst; int height = h; do { uint16x8_t s0 = vld1q_u16(im_ptr); uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); uint16x8_t d0 = vhaddq_u16(s0, s1); d0 = vhaddq_u16(d0, vert_offset); vst1q_u16(dst_ptr, d0); im_ptr += im_stride; dst_ptr += dst_stride; } while (--height != 0); im += 8; dst += 8; w -= 8; } while (w != 0); } } aom-3.12.1/av1/common/arm/highbd_convolve_neon.h000066400000000000000000000145001477627663500214670ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ #define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/convolve.h" static inline int32x4_t highbd_convolve8_4_s32( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t offset) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); return sum; } static inline uint16x4_t highbd_convolve8_4_sr_s32_s16( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t shift_s32, const int32x4_t offset) { int32x4_t sum = highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset); sum = vqrshlq_s32(sum, shift_s32); return vqmovun_s32(sum); } // Like above but also perform round shifting and subtract correction term static inline uint16x4_t highbd_convolve8_4_srsub_s32_s16( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, const int32x4_t 
round_shift, const int32x4_t offset, const int32x4_t correction) { int32x4_t sum = highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset); sum = vsubq_s32(vqrshlq_s32(sum, round_shift), correction); return vqmovun_s32(sum); } static inline void highbd_convolve8_8_s32( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) { const int16x4_t y_filter_lo = vget_low_s16(y_filter); const int16x4_t y_filter_hi = vget_high_s16(y_filter); *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); } // Like above but also perform round shifting and subtract correction term static inline uint16x8_t highbd_convolve8_8_srsub_s32_s16( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, const int32x4_t round_shift, const int32x4_t offset, const int32x4_t correction) { int32x4_t sum0; int32x4_t sum1; highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset, &sum0, &sum1); sum0 = vsubq_s32(vqrshlq_s32(sum0, round_shift), correction); sum1 = vsubq_s32(vqrshlq_s32(sum1, round_shift), correction); return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); } static inline int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t *filters_lo, const int16x4_t *filters_hi, const int32x4_t offset) { int16x4_t s_lo[] = { vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2), vget_low_s16(s3) }; int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2), vget_high_s16(s3) }; transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo); transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi); int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]); sum = vmlal_s16(sum, s_lo[1], filters_lo[1]); sum = vmlal_s16(sum, s_lo[2], filters_lo[2]); sum = vmlal_s16(sum, s_lo[3], filters_lo[3]); sum = vmlal_s16(sum, s_hi[0], filters_hi[0]); sum = vmlal_s16(sum, s_hi[1], filters_hi[1]); sum = vmlal_s16(sum, s_hi[2], filters_hi[2]); sum = vmlal_s16(sum, s_hi[3], filters_hi[3]); return sum; } static inline uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x4_t *filters_lo, const int16x4_t *filters_hi, const int32x4_t 
shift_s32, const int32x4_t offset) { int32x4_t sum = highbd_convolve8_2d_scale_horiz4x8_s32( s0, s1, s2, s3, filters_lo, filters_hi, offset); sum = vqrshlq_s32(sum, shift_s32); return vqmovun_s32(sum); } #endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ aom-3.12.1/av1/common/arm/highbd_convolve_scale_neon.c000066400000000000000000000510361477627663500226360ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/arm/highbd_convolve_neon.h" static inline void highbd_dist_wtd_comp_avg_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, const int round_bits, const int offset, const int bd) { CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const int32x4_t round_shift = vdupq_n_s32(-round_bits); const uint32x4_t offset_vec = vdupq_n_u32(offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); // Weighted averaging if (w <= 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); d0 = vqrshlq_s32(d0, round_shift); uint16x4_t d0_u16 = vqmovun_s32(d0); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } else { do { int width = w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); d0 = vqrshlq_s32(d0, round_shift); uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); d1 = vqrshlq_s32(d1, round_shift); uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); d01 = vminq_u16(d01, max); vst1q_u16(dst, d01); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; ref_ptr += ref_stride; } while (--h != 0); } } static inline void highbd_comp_avg_neon(const 
uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, ConvolveParams *conv_params, const int round_bits, const int offset, const int bd) { CONV_BUF_TYPE *ref_ptr = conv_params->dst; const int ref_stride = conv_params->dst_stride; const int32x4_t round_shift = vdupq_n_s32(-round_bits); const uint16x4_t offset_vec = vdup_n_u16(offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w <= 4) { do { const uint16x4_t src = vld1_u16(src_ptr); const uint16x4_t ref = vld1_u16(ref_ptr); uint16x4_t avg = vhadd_u16(src, ref); int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); d0 = vqrshlq_s32(d0, round_shift); uint16x4_t d0_u16 = vqmovun_s32(d0); d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } src_ptr += src_stride; ref_ptr += ref_stride; dst_ptr += dst_stride; } while (--h != 0); } else { do { int width = w; const uint16_t *src = src_ptr; const uint16_t *ref = ref_ptr; uint16_t *dst = dst_ptr; do { const uint16x8_t s = vld1q_u16(src); const uint16x8_t r = vld1q_u16(ref); uint16x8_t avg = vhaddq_u16(s, r); int32x4_t d0_lo = vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); int32x4_t d0_hi = vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); d0_lo = vqrshlq_s32(d0_lo, round_shift); d0_hi = vqrshlq_s32(d0_hi, round_shift); uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi)); d0 = vminq_u16(d0, max); vst1q_u16(dst, d0); src += 8; ref += 8; dst += 8; width -= 8; } while (width != 0); src_ptr += src_stride; ref_ptr += ref_stride; dst_ptr += dst_stride; } while (--h != 0); } } static inline void highbd_convolve_2d_x_scale_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int subpel_x_qn, const int x_step_qn, const InterpFilterParams *filter_params, ConvolveParams *conv_params, const int offset) { static const uint32_t kIdx[4] = { 0, 1, 2, 3 }; const uint32x4_t idx = vld1q_u32(kIdx); const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK); const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); const int32x4_t offset_s32 = vdupq_n_s32(offset); if (w <= 4) { int height = h; uint16_t *d = dst_ptr; do { int x_qn = subpel_x_qn; // Load 4 src vectors at a time, they might be the same, but we have to // calculate the indices anyway. 
Doing it in SIMD and then storing the // indices is faster than having to calculate the expression // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times // Ideally this should be a gather using the indices, but NEON does not // have that, so have to emulate const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) = // 2 const uint32x4_t src_idx_u32 = vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); #if AOM_ARCH_AARCH64 uint64x2_t src4[2]; src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), vget_low_u32(src_idx_u32)); src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), vget_high_u32(src_idx_u32)); int16_t *src4_ptr[4]; uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; vst1q_u64(tmp_ptr, src4[0]); vst1q_u64(tmp_ptr + 2, src4[1]); #else uint32x4_t src4; src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32); int16_t *src4_ptr[4]; uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; vst1q_u32(tmp_ptr, src4); #endif // AOM_ARCH_AARCH64 // Same for the filter vectors const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); int32_t x_filter4_idx[4]; vst1q_s32(x_filter4_idx, filter_idx_s32); const int16_t *x_filter4_ptr[4]; // Load source int16x8_t s0 = vld1q_s16(src4_ptr[0]); int16x8_t s1 = vld1q_s16(src4_ptr[1]); int16x8_t s2 = vld1q_s16(src4_ptr[2]); int16x8_t s3 = vld1q_s16(src4_ptr[3]); // We could easily do this using SIMD as well instead of calling the // inline function 4 times. x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]); x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]); x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]); x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]); // Actually load the filters const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); // Group low and high parts and transpose int16x4_t filters_lo[] = { vget_low_s16(x_filter0), vget_low_s16(x_filter1), vget_low_s16(x_filter2), vget_low_s16(x_filter3) }; int16x4_t filters_hi[] = { vget_high_s16(x_filter0), vget_high_s16(x_filter1), vget_high_s16(x_filter2), vget_high_s16(x_filter3) }; transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); // Run the 2D Scale convolution uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); if (w == 2) { store_u16_2x1(d, d0); } else { vst1_u16(d, d0); } src_ptr += src_stride; d += dst_stride; height--; } while (height > 0); } else { int height = h; do { int width = w; int x_qn = subpel_x_qn; uint16_t *d = dst_ptr; const uint16_t *s = src_ptr; do { // Load 4 src vectors at a time, they might be the same, but we have to // calculate the indices anyway. 
Doing it in SIMD and then storing the // indices is faster than having to calculate the expression // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times // Ideally this should be a gather using the indices, but NEON does not // have that, so have to emulate const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) // = 2 const uint32x4_t src_idx_u32 = vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); #if AOM_ARCH_AARCH64 uint64x2_t src4[2]; src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s), vget_low_u32(src_idx_u32)); src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s), vget_high_u32(src_idx_u32)); int16_t *src4_ptr[4]; uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; vst1q_u64(tmp_ptr, src4[0]); vst1q_u64(tmp_ptr + 2, src4[1]); #else uint32x4_t src4; src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32); int16_t *src4_ptr[4]; uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; vst1q_u32(tmp_ptr, src4); #endif // AOM_ARCH_AARCH64 // Same for the filter vectors const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); int32_t x_filter4_idx[4]; vst1q_s32(x_filter4_idx, filter_idx_s32); const int16_t *x_filter4_ptr[4]; // Load source int16x8_t s0 = vld1q_s16(src4_ptr[0]); int16x8_t s1 = vld1q_s16(src4_ptr[1]); int16x8_t s2 = vld1q_s16(src4_ptr[2]); int16x8_t s3 = vld1q_s16(src4_ptr[3]); // We could easily do this using SIMD as well instead of calling the // inline function 4 times. x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel( filter_params, x_filter4_idx[0]); x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel( filter_params, x_filter4_idx[1]); x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel( filter_params, x_filter4_idx[2]); x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel( filter_params, x_filter4_idx[3]); // Actually load the filters const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); // Group low and high parts and transpose int16x4_t filters_lo[] = { vget_low_s16(x_filter0), vget_low_s16(x_filter1), vget_low_s16(x_filter2), vget_low_s16(x_filter3) }; int16x4_t filters_hi[] = { vget_high_s16(x_filter0), vget_high_s16(x_filter1), vget_high_s16(x_filter2), vget_high_s16(x_filter3) }; transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); // Run the 2D Scale X convolution uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); vst1_u16(d, d0); x_qn += 4 * x_step_qn; d += 4; width -= 4; } while (width > 0); src_ptr += src_stride; dst_ptr += dst_stride; height--; } while (height > 0); } } static inline void highbd_convolve_2d_y_scale_8tap_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int subpel_y_qn, const int y_step_qn, const InterpFilterParams *filter_params, const int round1_bits, const int offset) { const int32x4_t offset_s32 = vdupq_n_s32(1 << offset); const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits); if (w <= 4) { int height = h; uint16_t *d = dst_ptr; int y_qn = subpel_y_qn; do { const int16_t *s = (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_4x8(s, 
src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16( s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, vdupq_n_s32(0)); if (w == 2) { store_u16_2x1(d, d0); } else { vst1_u16(d, d0); } y_qn += y_step_qn; d += dst_stride; height--; } while (height > 0); } else { int width = w; do { int height = h; int y_qn = subpel_y_qn; uint16_t *d = dst_ptr; do { const int16_t *s = (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); const int16x8_t y_filter = vld1q_s16(y_filter_ptr); uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16( s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32, vdupq_n_s32(0)); vst1q_u16(d, d0); y_qn += y_step_qn; d += dst_stride; height--; } while (height > 0); src_ptr += 8; dst_ptr += 8; width -= 8; } while (width > 0); } } static inline void highbd_convolve_correct_offset_neon( const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, int w, int h, const int round_bits, const int offset, const int bd) { const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits); const int16x4_t offset_s16 = vdup_n_s16(offset); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); if (w <= 4) { for (int y = 0; y < h; ++y) { const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride); const int32x4_t d0 = vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32); uint16x4_t d = vqmovun_s32(d0); d = vmin_u16(d, vget_low_u16(max)); if (w == 2) { store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); } } } else { for (int y = 0; y < h; ++y) { for (int x = 0; x < w; x += 8) { // Subtract round offset and convolve round const int16x8_t s = vld1q_s16((const int16_t *)src_ptr + y * src_stride + x); const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16), round_shift_s32); const int32x4_t d1 = vqrshlq_s32( vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32); uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); d01 = vminq_u16(d01, max); vst1q_u16(dst_ptr + y * dst_stride + x, d01); } } } } void av1_highbd_convolve_2d_scale_neon( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { uint16_t *im_block = (uint16_t *)aom_memalign( 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); if (!im_block) return; uint16_t *im_block2 = (uint16_t *)aom_memalign( 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); if (!im_block2) { aom_free(im_block); // free the first block and return. 
return; } int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; const int im_stride = MAX_SB_SIZE; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; assert(bits >= 0); const int vert_offset = filter_params_y->taps / 2 - 1; const int horiz_offset = filter_params_x->taps / 2 - 1; const int x_offset_bits = (1 << (bd + FILTER_BITS - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int y_offset_correction = ((1 << (y_offset_bits - conv_params->round_1)) + (1 << (y_offset_bits - conv_params->round_1 - 1))); CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; highbd_convolve_2d_x_scale_8tap_neon( src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn, filter_params_x, conv_params, x_offset_bits); if (conv_params->is_compound && !conv_params->do_average) { highbd_convolve_2d_y_scale_8tap_neon( im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn, filter_params_y, conv_params->round_1, y_offset_bits); } else { highbd_convolve_2d_y_scale_8tap_neon( im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn, filter_params_y, conv_params->round_1, y_offset_bits); } // Do the compound averaging outside the loop, avoids branching within the // main loop if (conv_params->is_compound) { if (conv_params->do_average) { if (conv_params->use_dist_wtd_comp_avg) { highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bits, y_offset_correction, bd); } else { highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, conv_params, bits, y_offset_correction, bd); } } } else { highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride, w, h, bits, y_offset_correction, bd); } aom_free(im_block); aom_free(im_block2); } aom-3.12.1/av1/common/arm/highbd_convolve_sve2.c000066400000000000000000002054451477627663500214140ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/aom_neon_sve2_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/arm/highbd_convolve_sve2.h" DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, }; static inline uint16x4_t convolve12_4_x( int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) { int16x8_t permuted_samples[6]; permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t filter_0_7, int16x8_t filter_4_11, int64x2_t offset, uint16x8x4_t permute_tbl, uint16x8_t max) { int16x8_t permuted_samples[8]; permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), 
vqrshrun_n_s32(sum4567, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd) { // This shim allows to do only one rounding shift instead of two. const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64( vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), vdup_n_u64(svcnth() * 0x0001000100010000ULL))); permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); const int16_t *s = (const int16_t *)src; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); uint16x8_t d3 = convolve12_8_x(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset, permute_tbl, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } static inline uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, uint16x8_t max) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); sum[1] = aom_sdotq_s16(offset, s0[1], filter); sum[2] = aom_sdotq_s16(offset, s0[2], filter); sum[3] = aom_sdotq_s16(offset, s0[3], filter); sum[4] = aom_sdotq_s16(offset, s0[4], filter); sum[5] = aom_sdotq_s16(offset, s0[5], filter); sum[6] = aom_sdotq_s16(offset, s0[6], filter); sum[7] = aom_sdotq_s16(offset, s0[7], filter); sum[0] = vpaddq_s64(sum[0], sum[1]); sum[2] = vpaddq_s64(sum[2], sum[3]); sum[4] = vpaddq_s64(sum[4], sum[5]); sum[6] = vpaddq_s64(sum[6], sum[7]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), 
vmovn_s64(sum[2])); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), vqrshrun_n_s32(sum4567, FILTER_BITS)); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd) { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); // This shim allows to do only one rounding shift instead of two. const int64_t offset = 1 << (conv_params->round_0 - 1); const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0)); const int16x8_t filter = vld1q_s16(y_filter_ptr); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max); uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max); uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max); uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } // clang-format off DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { 0, 2, 4, 6, 1, 3, 5, 7, }; // clang-format on static inline uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, int64x2_t offset, uint16x8x2_t permute_tbl, uint16x4_t max) { int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, int64x2_t offset, uint16x8_t tbl, uint16x8_t max) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS), vqrshrun_n_s32(sum2637, FILTER_BITS)); res = aom_tbl_u16(res, tbl); return vminq_u16(res, max); } static inline void highbd_convolve_x_sr_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, ConvolveParams *conv_params, int bd) { // This shim allows to do only one rounding shift instead of two. 
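  // Concretely: seeding the accumulator with 1 << (round_0 - 1) makes the
  // single rounding shift by FILTER_BITS in the convolve helpers equivalent
  // to the reference two-stage rounding, since
  //   (((x + 2^(r0-1)) >> r0) + 2^(r1-1)) >> r1 == (x + 2^(r0-1) + 2^(r0+r1-1)) >> (r0+r1)
  // with r0 = round_0 and r1 = FILTER_BITS - round_0.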
const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); const int16_t *s = (const int16_t *)(src); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max); uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max); uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max); uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[4], s1[4], s2[4], s3[4]; load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max); uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max); uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max); uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } } void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); if (x_filter_taps == 6) { av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } const int horiz_offset = filter_params_x->taps / 2 - 1; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); src -= horiz_offset; if (x_filter_taps == 12) { highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); return; } if (x_filter_taps == 8) { highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); return; } highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h, x_filter_ptr, conv_params, bd); } static inline uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, int16x8_t filter_4_11, uint16x4_t max) { int64x2_t sum[2]; sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1); sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1); sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1); sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1); int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), 
vmovn_s64(sum[1])); uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); return vmin_u16(res, max); } static inline void highbd_convolve_y_sr_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, int bd) { const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); const uint16x4_t max = vdup_n_u16((1 << bd) - 1); do { int16_t *s = (int16_t *)src; uint16_t *d = dst; int h = height; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sA); s += 11 * src_stride; int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], s6789[2], s789A[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); transpose_concat_4x4(s4, s5, s6, s7, s4567); transpose_concat_4x4(s5, s6, s7, s8, s5678); transpose_concat_4x4(s6, s7, s8, s9, s6789); transpose_concat_4x4(s7, s8, s9, sA, s789A); do { int16x4_t sB, sC, sD, sE; load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; transpose_concat_4x4(sB, sC, sD, sE, sBCDE); // Use the above transpose and reuse data from the previous loop to get // the rest. aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, max); uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, max); uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, max); uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
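      // Four outputs of the 12-tap filter need a 15-row window: the 11 rows
      // of history held in s0123..s789A plus the 4 rows loaded this
      // iteration. Only sBCDE needed a fresh transpose; the other new blocks
      // came from the merge-table lookups, so advancing the window is just
      // register moves.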
s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s4567[0] = s89AB[0]; s4567[1] = s89AB[1]; s5678[0] = s9ABC[0]; s5678[1] = s9ABC[1]; s6789[0] = sABCD[0]; s6789[1] = sABCD[1]; s789A[0] = sBCDE[0]; s789A[1] = sBCDE[1]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 4; dst += 4; width -= 4; } while (width != 0); } static inline uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), vqrshrun_n_s32(sum4567, FILTER_BITS)); return vminq_u16(res, max); } static void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, int bd) { assert(width >= 4 && height >= 4); const int16x8_t y_filter = vld1q_s16(filter_y); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. 
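    // Each resulting int16x8_t holds two output columns' four-row histories
    // back to back, so every 64-bit lane of the SVE dot product accumulates
    // a contiguous 4-sample run belonging to a single output pixel.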
int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_4x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max); uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max); uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max); uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max); uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max); uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max); uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. 
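        // For the 8-tap filter only seven rows of history are carried between
        // iterations, packed into the four transposed blocks rolled forward
        // below; the next pass loads just four new rows and rebuilds the
        // intermediate blocks via the merge table, as in the 4-wide path
        // above.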
s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s3456[2] = s789A[2]; s3456[3] = s789A[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } static inline uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], int16x8_t filter, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], int16x8_t filter, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0); int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), vqrshrun_n_s32(sum4567, FILTER_BITS)); return vminq_u16(res, max); } static void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, int bd) { assert(width >= 4 && height >= 4); const int16x8_t y_filter = vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max); uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max); uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max); uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Shuffle everything up four rows. s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); // This operation combines a conventional transpose and the sample // permute required before computing the dot product. 
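        // With only 4 taps, the full input window for four outputs is seven
        // rows (the three carried in s0-s2 plus the four just loaded), so the
        // transposed blocks are cheap to rebuild each iteration and no merge
        // table is needed.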
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max); uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max); uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max); uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Shuffle everything up four rows. s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); return; } const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); if (y_filter_taps == 6) { av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); return; } const int vert_offset = filter_params_y->taps / 2 - 1; const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); src -= vert_offset * src_stride; if (y_filter_taps > 8) { highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); return; } if (y_filter_taps == 4) { highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); return; } highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, y_filter_ptr, bd); } static inline uint16x4_t convolve12_4_2d_h( int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) { int16x8_t permuted_samples[6]; permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); sum0123 = vqrshlq_s32(sum0123, shift); return vqmovun_s32(sum0123); } static inline uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t filter_0_7, int16x8_t filter_4_11, int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) { int16x8_t permuted_samples[8]; permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); permuted_samples[5] = aom_tbl_s16(s1, 
permute_tbl.val[1]); permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); sum0123 = vqrshlq_s32(sum0123, shift); sum4567 = vqrshlq_s32(sum4567, shift); return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); } static inline void highbd_convolve_2d_sr_horiz_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int x_offset) { const int64x2_t offset = vdupq_n_s64(x_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
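  // kDotProdTbl stores second-operand indices modulo 8; the additions below
  // add svcnth() (the number of 16-bit lanes in one SVE vector) to exactly
  // those lanes of val[2] and val[3] that must select from the second tbl2
  // source, because SVE TBL concatenates its sources at the true vector
  // length rather than at 8 elements.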
uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64( vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), vdup_n_u64(svcnth() * 0x0001000100010000ULL))); permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); if (width == 4) { const int16_t *s = (const int16_t *)src; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); dst += 4 * dst_stride; s += 4 * src_stride; height -= 4; } while (height > 0); } else { do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); uint16x8_t d0 = convolve12_8_2d_h(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x8_t d1 = convolve12_8_2d_h(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x8_t d2 = convolve12_8_2d_h(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); uint16x8_t d3 = convolve12_8_2d_h(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset, shift, permute_tbl); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } static inline uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, int32x4_t shift) { int64x2_t sum[8]; sum[0] = aom_sdotq_s16(offset, s0[0], filter); sum[1] = aom_sdotq_s16(offset, s0[1], filter); sum[2] = aom_sdotq_s16(offset, s0[2], filter); sum[3] = aom_sdotq_s16(offset, s0[3], filter); sum[4] = aom_sdotq_s16(offset, s0[4], filter); sum[5] = aom_sdotq_s16(offset, s0[5], filter); sum[6] = aom_sdotq_s16(offset, s0[6], filter); sum[7] = aom_sdotq_s16(offset, s0[7], filter); sum[0] = vpaddq_s64(sum[0], sum[1]); sum[2] = vpaddq_s64(sum[2], sum[3]); sum[4] = vpaddq_s64(sum[4], sum[5]); sum[6] = vpaddq_s64(sum[6], sum[7]); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); sum0123 = vqrshlq_s32(sum0123, shift); sum4567 = vqrshlq_s32(sum4567, shift); return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); } static inline void highbd_convolve_2d_sr_horiz_8tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, const int x_offset) { const int64x2_t offset = vdupq_n_s64(x_offset); const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0)); const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); const int16x8_t filter = vld1q_s16(y_filter_ptr); do { const int16_t *s = (const int16_t *)src; uint16_t *d = dst; int w = width; do { int16x8_t 
s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift); uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift); uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift); uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } static inline uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, int64x2_t offset, int32x4_t shift, uint16x8x2_t permute_tbl) { int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); sum0123 = vqrshlq_s32(sum0123, shift); return vqmovun_s32(sum0123); } static inline uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, int64x2_t offset, int32x4_t shift, uint16x8_t tbl) { int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); sum0123 = vqrshlq_s32(sum0123, shift); sum4567 = vqrshlq_s32(sum4567, shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); return aom_tbl_u16(res, tbl); } static inline void highbd_convolve_2d_sr_horiz_4tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *x_filter_ptr, ConvolveParams *conv_params, const int x_offset) { const int64x2_t offset = vdupq_n_s64(x_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); if (width == 4) { const int16_t *s = (const int16_t *)(src); uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); do { int16x8_t s0, s1, s2, s3; load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } else { uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); do { const int16_t *s = (const int16_t *)(src); uint16_t *d = dst; int w = width; do { int16x8_t s0[8], s1[8], s2[8], s3[8]; load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], &s0[6], &s0[7]); load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], 
&s1[3], &s1[4], &s1[5], &s1[6], &s1[7]); load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], &s2[4], &s2[5], &s2[6], &s2[7]); load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], &s3[4], &s3[5], &s3[6], &s3[7]); uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx); uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx); uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx); uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; w -= 8; } while (w != 0); src += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height > 0); } } static inline uint16x4_t highbd_convolve12_4_2d_v( int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0); sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1); sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0); sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1); sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); sum0123 = vshlq_s32(sum0123, shift); uint16x4_t res = vqmovun_s32(sum0123); return vmin_u16(res, max); } static inline void highbd_convolve_2d_sr_vert_12tap_sve2( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int width, int height, const int16_t *y_filter_ptr, ConvolveParams *conv_params, int bd, const int y_offset) { const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); const uint16x4_t max = vdup_n_u16((1 << bd) - 1); do { int16_t *s = (int16_t *)src; uint16_t *d = (uint16_t *)dst; int h = height; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &sA); s += 11 * src_stride; int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], s6789[2], s789A[2]; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. 
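// For example, following the layout documented for transpose_concat_4x4 in highbd_convolve_sve2.h, rows (00 01 02 03) .. (30 31 32 33) are repacked as s0123[0] = 00 10 20 30 01 11 21 31 and s0123[1] = 02 12 22 32 03 13 23 33, so each output column's four vertical samples sit in one 64-bit lane ready for the widening dot product.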
transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); transpose_concat_4x4(s4, s5, s6, s7, s4567); transpose_concat_4x4(s5, s6, s7, s8, s5678); transpose_concat_4x4(s6, s7, s8, s9, s6789); transpose_concat_4x4(s7, s8, s9, sA, s789A); do { int16x4_t sB, sC, sD, sE; load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; transpose_concat_4x4(sB, sC, sD, sE, sBCDE); // Use the above transpose and reuse data from the previous loop to get // the rest. aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); uint16x4_t d0 = highbd_convolve12_4_2d_v( s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max); uint16x4_t d1 = highbd_convolve12_4_2d_v( s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max); uint16x4_t d2 = highbd_convolve12_4_2d_v( s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max); uint16x4_t d3 = highbd_convolve12_4_2d_v( s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max); store_u16_4x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s4567[0] = s89AB[0]; s4567[1] = s89AB[1]; s5678[0] = s9ABC[0]; s5678[1] = s9ABC[1]; s6789[0] = sABCD[0]; s6789[1] = sABCD[1]; s789A[0] = sBCDE[0]; s789A[1] = sBCDE[1]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 4; dst += 4; width -= 4; } while (width != 0); } static inline uint16x4_t highbd_convolve8_4_2d_v( int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); sum0123 = vshlq_s32(sum0123, shift); uint16x4_t res = vqmovun_s32(sum0123); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve8_8_2d_v( int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); sum0123 = vshlq_s32(sum0123, shift); sum4567 = vshlq_s32(sum4567, shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); return vminq_u16(res, max); } static void highbd_convolve_2d_sr_vert_8tap_sve2( const 
uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, ConvolveParams *conv_params, int bd, const int y_offset) { assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); const int16x8_t y_filter = vld1q_s16(filter_y); uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); // Scale indices by size of the true vector length to avoid reading from an // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. uint16x8_t correction0 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); uint16x8_t correction1 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); uint16x8_t correction2 = vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); int16_t *s = (int16_t *)src; int16x4_t s0, s1, s2, s3, s4, s5, s6; load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); do { int16x4_t s7, s8, s9, s10; load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_4x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x4_t d0 = highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max); uint16x4_t d1 = highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max); uint16x4_t d2 = highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max); uint16x4_t d3 = highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int h = height; int16_t *s = (int16_t *)src; uint16_t *d = dst; int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; // This operation combines a conventional transpose and the sample permute // required before computing the dot product. 
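// For the 8-wide path, transpose_concat_8x4 produces four such vectors per group of rows, s0123[0] = 00 10 20 30 01 11 21 31 through s0123[3] = 06 16 26 36 07 17 27 37 (see the layout notes in highbd_convolve_sve2.h), so aom_svdot_lane_s16 accumulates four filter taps per 64-bit lane; the merge_block_tbl lookups in the loop below then shift this history by one, two or three rows so only four new rows are loaded per iteration.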
int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); do { int16x8_t s7, s8, s9, s10; load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; // Transpose and shuffle the 4 lines that were loaded. transpose_concat_8x4(s7, s8, s9, s10, s789A); // Merge new data into block from previous iteration. aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); uint16x8_t d0 = highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max); uint16x8_t d1 = highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max); uint16x8_t d2 = highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max); uint16x8_t d3 = highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Prepare block for next iteration - re-using as much as possible. // Shuffle everything up four rows. s0123[0] = s4567[0]; s0123[1] = s4567[1]; s0123[2] = s4567[2]; s0123[3] = s4567[3]; s1234[0] = s5678[0]; s1234[1] = s5678[1]; s1234[2] = s5678[2]; s1234[3] = s5678[3]; s2345[0] = s6789[0]; s2345[1] = s6789[1]; s2345[2] = s6789[2]; s2345[3] = s6789[3]; s3456[0] = s789A[0]; s3456[1] = s789A[1]; s3456[2] = s789A[2]; s3456[3] = s789A[3]; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } static inline uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x4_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); sum0123 = vshlq_s32(sum0123, shift); uint16x4_t res = vqmovun_s32(sum0123); return vmin_u16(res, max); } static inline uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], int16x8_t filter, int32x4_t shift, int64x2_t offset, uint16x8_t max) { int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0); int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0); int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); sum0123 = vshlq_s32(sum0123, shift); sum4567 = vshlq_s32(sum4567, shift); uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); return vminq_u16(res, max); } static void highbd_convolve_2d_sr_vert_4tap_sve2( const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int width, int height, const int16_t *filter_y, ConvolveParams *conv_params, int bd, const int y_offset) { assert(width >= 4 && height >= 4); const int64x2_t offset = vdupq_n_s64(y_offset); const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); const int16x8_t y_filter = vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); if (width == 4) { const uint16x4_t max = vdup_n_u16((1 << bd) - 1); int16_t *s = (int16_t *)(src); int16x4_t s0, s1, s2; load_s16_4x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x4_t s3, s4, s5, s6; load_s16_4x4(s, src_stride, &s3, 
&s4, &s5, &s6); // This operation combines a conventional transpose and the sample permute // required before computing the dot product. int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; transpose_concat_4x4(s0, s1, s2, s3, s0123); transpose_concat_4x4(s1, s2, s3, s4, s1234); transpose_concat_4x4(s2, s3, s4, s5, s2345); transpose_concat_4x4(s3, s4, s5, s6, s3456); uint16x4_t d0 = highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max); uint16x4_t d1 = highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max); uint16x4_t d2 = highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max); uint16x4_t d3 = highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max); store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); // Shuffle everything up four rows. s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; dst += 4 * dst_stride; height -= 4; } while (height != 0); } else { const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); do { int h = height; int16_t *s = (int16_t *)(src); uint16_t *d = dst; int16x8_t s0, s1, s2; load_s16_8x3(s, src_stride, &s0, &s1, &s2); s += 3 * src_stride; do { int16x8_t s3, s4, s5, s6; load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); // This operation combines a conventional transpose and the sample // permute required before computing the dot product. int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; transpose_concat_8x4(s0, s1, s2, s3, s0123); transpose_concat_8x4(s1, s2, s3, s4, s1234); transpose_concat_8x4(s2, s3, s4, s5, s2345); transpose_concat_8x4(s3, s4, s5, s6, s3456); uint16x8_t d0 = highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max); uint16x8_t d1 = highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max); uint16x8_t d2 = highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max); uint16x8_t d3 = highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max); store_u16_8x4(d, dst_stride, d0, d1, d2, d3); // Shuffle everything up four rows. s0 = s4; s1 = s5; s2 = s6; s += 4 * src_stride; d += 4 * dst_stride; h -= 4; } while (h != 0); src += 8; dst += 8; width -= 8; } while (width != 0); } } void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { if (w == 2 || h == 2) { av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); if (x_filter_taps == 6 || y_filter_taps == 6) { av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps; const int im_stride = MAX_SB_SIZE; const int vert_offset = clamped_y_taps / 2 - 1; const int horiz_offset = clamped_x_taps / 2 - 1; const int x_offset = (1 << (bd + FILTER_BITS - 1)); const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a // simple shift left instead of a rounding saturating shift left. 
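// That is, (x + (1 << (round_1 - 1))) >> round_1 equals a rounding shift by round_1, so folding the rounding constant into y_offset lets the vertical kernels apply a plain vshlq_s32 by -round_1 instead of a rounding saturating shift.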
const int y_offset = (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int im_h = h + clamped_y_taps - 1; if (x_filter_taps > 8) { highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset); highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); return; } if (x_filter_taps <= 4) { highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset); } else { highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, conv_params, x_offset); } if (y_filter_taps <= 4) { highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); } else { highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride, w, h, y_filter_ptr, conv_params, bd, y_offset); } } aom-3.12.1/av1/common/arm/highbd_convolve_sve2.h000066400000000000000000000073141477627663500214140ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ #define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ #include #include "aom_dsp/arm/aom_neon_sve2_bridge.h" // clang-format off DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { // Shift left and insert new last column in transposed 4x4 block. 1, 2, 3, 0, 5, 6, 7, 4, // Shift left and insert two new columns in transposed 4x4 block. 2, 3, 0, 1, 6, 7, 4, 5, // Shift left and insert three new columns in transposed 4x4 block. 
3, 0, 1, 2, 7, 4, 5, 6, }; // clang-format on static inline void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3, int16x8_t res[2]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03 // s1: 10, 11, 12, 13 // s2: 20, 21, 22, 23 // s3: 30, 31, 32, 33 // // res[0]: 00 10 20 30 01 11 21 31 // res[1]: 02 12 22 32 03 13 23 33 int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); int32x4x2_t s0123 = vzipq_s32(s01, s23); res[0] = vreinterpretq_s16_s32(s0123.val[0]); res[1] = vreinterpretq_s16_s32(s0123.val[1]); } static inline void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, int16x8_t s2, int16x8_t s3, int16x8_t res[4]) { // Transpose 16-bit elements and concatenate result rows as follows: // s0: 00, 01, 02, 03, 04, 05, 06, 07 // s1: 10, 11, 12, 13, 14, 15, 16, 17 // s2: 20, 21, 22, 23, 24, 25, 26, 27 // s3: 30, 31, 32, 33, 34, 35, 36, 37 // // res[0]: 00 10 20 30 01 11 21 31 // res[1]: 02 12 22 32 03 13 23 33 // res[2]: 04 14 24 34 05 15 25 35 // res[3]: 06 16 26 36 07 17 27 37 int16x8x2_t tr01_16 = vzipq_s16(s0, s1); int16x8x2_t tr23_16 = vzipq_s16(s2, s3); int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), vreinterpretq_s32_s16(tr23_16.val[0])); int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), vreinterpretq_s32_s16(tr23_16.val[1])); res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); } static inline void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], uint16x8_t tbl, int16x8_t res[4]) { res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); res[2] = aom_tbl2_s16(t0[2], t1[2], tbl); res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); } static inline void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], uint16x8_t tbl, int16x8_t res[2]) { res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); } #endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_SVE2_H_ aom-3.12.1/av1/common/arm/highbd_inv_txfm_neon.c000066400000000000000000006615761477627663500215040ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include #include #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/idct.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" #if AOM_ARCH_AARCH64 #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ do { \ int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ y0 = vreinterpretq_s32_s64( \ vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ vreinterpretq_s64_s32(swap_high.val[0]))); \ y1 = vreinterpretq_s32_s64( \ vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ vreinterpretq_s64_s32(swap_high.val[1]))); \ y2 = vreinterpretq_s32_s64( \ vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ vreinterpretq_s64_s32(swap_high.val[0]))); \ y3 = vreinterpretq_s32_s64( \ vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ vreinterpretq_s64_s32(swap_high.val[1]))); \ } while (0) #else #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ do { \ int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \ swap_high.val[0], 2); \ y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \ swap_high.val[1], 2); \ y2 = vextq_s32(swap_low.val[0], \ vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \ y3 = vextq_s32(swap_low.val[1], \ vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \ } while (0) #endif // AOM_ARCH_AARCH64 static inline void transpose_4x4(const int32x4_t *in, int32x4_t *out) { TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]); } static inline void transpose_8x8(const int32x4_t *in, int32x4_t *out) { TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], out[15]); } static inline void round_shift_array_32_neon(int32x4_t *input, int32x4_t *output, const int size, const int bit) { const int32x4_t v_bit = vdupq_n_s32(-bit); for (int i = 0; i < size; i++) { output[i] = vrshlq_s32(input[i], v_bit); } } static inline void round_shift_rect_array_32_neon(int32x4_t *input, int32x4_t *output, const int size) { for (int i = 0; i < size; i++) { const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2); output[i] = vrshrq_n_s32(r0, NewSqrt2Bits); } } static inline int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w0, *n0); x = vmlaq_n_s32(x, *w1, *n1); x = vshlq_s32(x, *v_bit); return x; } static inline int32x4_t half_btf_neon_mode11_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w0, -*n0); x = vmlaq_n_s32(x, *w1, -*n1); x = vshlq_s32(x, *v_bit); return x; } static inline int32x4_t half_btf_neon_mode01_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w0, *n0); x = vmlsq_n_s32(x, *w1, *n1); x = vshlq_s32(x, *v_bit); return x; } static inline int32x4_t half_btf_neon_mode10_r( const int32_t *n0, const int32x4_t *w0, const int32_t *n1, const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w1, *n1); x = vmlsq_n_s32(x, *w0, 
*n0); x = vshlq_s32(x, *v_bit); return x; } static inline int32x4_t half_btf_0_neon_r(const int32_t *n0, const int32x4_t *w0, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w0, *n0); x = vshlq_s32(x, *v_bit); return x; } static inline int32x4_t half_btf_0_m_neon_r(const int32_t *n0, const int32x4_t *w0, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t x; x = vmlaq_n_s32(*rnding, *w0, -*n0); x = vshlq_s32(x, *v_bit); return x; } static inline void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit, const int num_cols); typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit, int32_t do_cols, int32_t bd, int32_t out_shift); static inline uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, const uint16x8_t *max) { int16x8_t clamped; clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max)); clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min)); return vreinterpretq_u16_s16(clamped); } static inline void round_shift_4x4(int32x4_t *in, int shift) { if (shift != 0) { const int32x4_t v_shift = vdupq_n_s32(-shift); in[0] = vrshlq_s32(in[0], v_shift); in[1] = vrshlq_s32(in[1], v_shift); in[2] = vrshlq_s32(in[2], v_shift); in[3] = vrshlq_s32(in[3], v_shift); } } static void round_shift_8x8(int32x4_t *in, int shift) { assert(shift != 0); const int32x4_t v_shift = vdupq_n_s32(-shift); in[0] = vrshlq_s32(in[0], v_shift); in[1] = vrshlq_s32(in[1], v_shift); in[2] = vrshlq_s32(in[2], v_shift); in[3] = vrshlq_s32(in[3], v_shift); in[4] = vrshlq_s32(in[4], v_shift); in[5] = vrshlq_s32(in[5], v_shift); in[6] = vrshlq_s32(in[6], v_shift); in[7] = vrshlq_s32(in[7], v_shift); in[8] = vrshlq_s32(in[8], v_shift); in[9] = vrshlq_s32(in[9], v_shift); in[10] = vrshlq_s32(in[10], v_shift); in[11] = vrshlq_s32(in[11], v_shift); in[12] = vrshlq_s32(in[12], v_shift); in[13] = vrshlq_s32(in[13], v_shift); in[14] = vrshlq_s32(in[14], v_shift); in[15] = vrshlq_s32(in[15], v_shift); } static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, int size) { int32x4_t a0, a1; for (int i = 0; i < size; i += 4) { a0 = vmaxq_s32(in[i], *clamp_lo); out[i] = vminq_s32(a0, *clamp_hi); a1 = vmaxq_s32(in[i + 1], *clamp_lo); out[i + 1] = vminq_s32(a1, *clamp_hi); a0 = vmaxq_s32(in[i + 2], *clamp_lo); out[i + 2] = vminq_s32(a0, *clamp_hi); a1 = vmaxq_s32(in[i + 3], *clamp_lo); out[i + 3] = vminq_s32(a1, *clamp_hi); } } static inline uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, int32x4_t res0, int32x4_t res1, const int bd) { const uint16x8_t v_zero = vdupq_n_u16(0); int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero); int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1); uint16x8x2_t x; x.val[0] = vreinterpretq_u16_s32( vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred)))); x.val[1] = vreinterpretq_u16_s32( vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred)))); x.val[0] = vreinterpretq_u16_s32( vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val)); x.val[0] = vreinterpretq_u16_s32( vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val)); x.val[1] = vreinterpretq_u16_s32( vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val)); x.val[1] = vreinterpretq_u16_s32( vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val)); uint16x8_t res = 
vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); return res; } static inline uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred, int32x4_t res0, const int bd) { uint16x4_t x0_ = vreinterpret_u16_s16( vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred)))); uint16x8_t x0 = vcombine_u16(x0_, x0_); const uint16x8_t vmin = vdupq_n_u16(0); const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); x0 = highbd_clamp_u16(&x0, &vmin, &vmax); return vget_low_u16(x0); } static inline void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; for (int i = 0; i < height; ++i, j += step) { uint16x4_t v = vld1_u16(output + i * stride); uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd); vst1_u16(output + i * stride, u); } } static inline void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; for (int i = 0; i < height; ++i, j += step) { uint16x8_t v = vld1q_u16(output + i * stride); uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd); vst1q_u16(output + i * stride, u); } } static inline void load_buffer_32bit_input(const int32_t *in, int stride, int32x4_t *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = vld1q_s32(in + i * stride); } } static inline void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) { in[0] = vld1q_s32(coeff + 0); in[1] = vld1q_s32(coeff + 4); in[2] = vld1q_s32(coeff + 8); in[3] = vld1q_s32(coeff + 12); } static void addsub_neon(const int32x4_t in0, const int32x4_t in1, int32x4_t *out0, int32x4_t *out1, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { int32x4_t a0 = vaddq_s32(in0, in1); int32x4_t a1 = vsubq_s32(in0, in1); a0 = vmaxq_s32(a0, *clamp_lo); a0 = vminq_s32(a0, *clamp_hi); a1 = vmaxq_s32(a1, *clamp_lo); a1 = vminq_s32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_shift) { int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift); int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift); in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo); in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi); in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo); in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi); *in0 = in0_w_offset; *in1 = in1_w_offset; } static inline void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], v_bit, rnding); bf1[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding); bf1[17] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29], v_bit, rnding); bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], v_bit, rnding); bf1[18] = temp2; temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], v_bit, rnding); bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit, rnding); bf1[21] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], v_bit, rnding); bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], v_bit, rnding); bf1[22] = temp2; } static inline void 
idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14], v_bit, rnding); bf1[14] = half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding); bf1[9] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13], v_bit, rnding); bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13], v_bit, rnding); bf1[10] = temp2; addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); } static inline void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding); bf1[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding); bf1[5] = temp1; addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], v_bit, rnding); bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit, rnding); bf1[18] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], v_bit, rnding); bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit, rnding); bf1[19] = temp2; temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], v_bit, rnding); bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], v_bit, rnding); bf1[20] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], v_bit, rnding); bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], v_bit, rnding); bf1[21] = temp2; } static inline void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit, rnding); bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit, rnding); bf1[10] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit, rnding); bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit, rnding); bf1[11] = temp2; addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); 
addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } static inline void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2; addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit, rnding); bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit, rnding); bf1[20] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit, rnding); bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit, rnding); bf1[21] = temp2; temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit, rnding); bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit, rnding); bf1[22] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit, rnding); bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit, rnding); bf1[23] = temp2; } static inline void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out, const int do_cols, const int bd, const int out_shift, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); for (int i = 0; i 
< 32; i += 8) { round_shift_4x4(out + i, out_shift); round_shift_4x4(out + i + 4, out_shift); } highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1, int32x4_t *out0, int32x4_t *out1, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_shift, int32x4_t *offset) { int32x4_t a0 = vaddq_s32(*offset, *in0); int32x4_t a1 = vsubq_s32(*offset, *in1); a0 = vshlq_s32(a0, *v_shift); a1 = vshlq_s32(a1, *v_shift); a0 = vmaxq_s32(a0, *clamp_lo); a0 = vminq_s32(a0, *clamp_hi); a1 = vmaxq_s32(a1, *clamp_lo); a1 = vminq_s32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); int32x4_t u0, u1, u2, u3; int32x4_t v0, v1, v2, v3, x, y; // Stage 0-1-2 u0 = in[0]; u1 = in[1]; u2 = in[2]; u3 = in[3]; const int32x4_t v_bit = vdupq_n_s32(-bit); x = vmlaq_n_s32(rnding, u0, cospi[32]); y = vmulq_n_s32(u2, cospi[32]); v0 = vaddq_s32(x, y); v0 = vshlq_s32(v0, v_bit); v1 = vsubq_s32(x, y); v1 = vshlq_s32(v1, v_bit); x = vmlaq_n_s32(rnding, u1, cospi[48]); v2 = vmlsq_n_s32(x, u3, cospi[16]); v2 = vshlq_s32(v2, v_bit); x = vmlaq_n_s32(rnding, u1, cospi[16]); v3 = vmlaq_n_s32(x, u3, cospi[48]); v3 = vshlq_s32(v3, v_bit); // Stage 3 addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); if (!do_cols) { log_range = AOMMAX(16, bd + 6); clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift); shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift); } } static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *sinpi = sinpi_arr(bit); const int32x4_t zero = vdupq_n_s32(0); int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1)); const int32x2_t mul = vdup_n_s32(1 << 4); int32x4_t t; int32x4_t s0, s1, s2, s3, s4, s5, s6, s7; int32x4_t x0, x1, x2, x3; int32x4_t u0, u1, u2, u3; x0 = in[0]; x1 = in[1]; x2 = in[2]; x3 = in[3]; s0 = vmulq_n_s32(x0, sinpi[1]); s1 = vmulq_n_s32(x0, sinpi[2]); s2 = vmulq_n_s32(x1, sinpi[3]); s3 = vmulq_n_s32(x2, sinpi[4]); s4 = vmulq_n_s32(x2, sinpi[1]); s5 = vmulq_n_s32(x3, sinpi[2]); s6 = vmulq_n_s32(x3, sinpi[4]); t = vsubq_s32(x0, x2); s7 = vaddq_s32(t, x3); t = vaddq_s32(s0, s3); s0 = vaddq_s32(t, s5); t = vsubq_s32(s1, s4); s1 = vsubq_s32(t, s6); s3 = s2; s2 = vmulq_n_s32(s7, sinpi[3]); u0 = vaddq_s32(s0, s3); u1 = vaddq_s32(s1, s3); u2 = s2; t = vaddq_s32(s0, s1); u3 = vsubq_s32(t, s3); // u0 int32x4x2_t u0x; u0x.val[0] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); u0x.val[0] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding)); u0 = vextq_s32(u0, zero, 1); u0x.val[1] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); u0x.val[1] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding)); u0x.val[0] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1)); u0x.val[1] = 
vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1)); u0x = vzipq_s32(u0x.val[0], u0x.val[1]); #if AOM_ARCH_AARCH64 u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]), vreinterpretq_s64_s32(u0x.val[1]))); #else u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1])); #endif // AOM_ARCH_AARCH64 // u1 int32x4x2_t u1x; u1x.val[0] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); u1x.val[0] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding)); u1 = vextq_s32(u1, zero, 1); u1x.val[1] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); u1x.val[1] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding)); u1x.val[0] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1)); u1x.val[1] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1)); u1x = vzipq_s32(u1x.val[0], u1x.val[1]); #if AOM_ARCH_AARCH64 u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]), vreinterpretq_s64_s32(u1x.val[1]))); #else u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1])); #endif // AOM_ARCH_AARCH64 // u2 int32x4x2_t u2x; u2x.val[0] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); u2x.val[0] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding)); u2 = vextq_s32(u2, zero, 1); u2x.val[1] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); u2x.val[1] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding)); u2x.val[0] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1)); u2x.val[1] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1)); u2x = vzipq_s32(u2x.val[0], u2x.val[1]); #if AOM_ARCH_AARCH64 u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]), vreinterpretq_s64_s32(u2x.val[1]))); #else u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1])); #endif // AOM_ARCH_AARCH64 // u3 int32x4x2_t u3x; u3x.val[0] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); u3x.val[0] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding)); u3 = vextq_s32(u3, zero, 1); u3x.val[1] = vreinterpretq_s32_s64( vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); u3x.val[1] = vreinterpretq_s32_s64( vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding)); u3x.val[0] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1)); u3x.val[1] = vreinterpretq_s32_s16(vextq_s16( vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1)); u3x = vzipq_s32(u3x.val[0], u3x.val[1]); #if AOM_ARCH_AARCH64 u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]), vreinterpretq_s64_s32(u3x.val[1]))); #else u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1])); #endif // AOM_ARCH_AARCH64 out[0] = u0; out[1] = u1; out[2] = u2; out[3] = u3; if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); round_shift_4x4(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); } } static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int 
stride, int fliplr, int flipud, int shift, int bd) { uint32x4_t u0, u1, u2, u3; uint16x4_t v0, v1, v2, v3; round_shift_4x4(in, shift); v0 = vld1_u16(output + 0 * stride); v1 = vld1_u16(output + 1 * stride); v2 = vld1_u16(output + 2 * stride); v3 = vld1_u16(output + 3 * stride); if (fliplr) { u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0])); in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1])); in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2])); in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3])); in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); } if (flipud) { u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0); u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1); u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2); u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3); } else { u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0); u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1); u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2); u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3); } uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1)); uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3)); const uint16x8_t vmin = vdupq_n_u16(0); const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); u4 = highbd_clamp_u16(&u4, &vmin, &vmax); u5 = highbd_clamp_u16(&u5, &vmin, &vmax); vst1_u16(output + 0 * stride, vget_low_u16(u4)); vst1_u16(output + 1 * stride, vget_high_u16(u4)); vst1_u16(output + 2 * stride, vget_low_u16(u5)); vst1_u16(output + 3 * stride, vget_high_u16(u5)); } static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; int32x4_t zero = vdupq_n_s32(0); int32x2_t fact = vdup_n_s32(NewSqrt2); int32x4x2_t a0; const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); for (int i = 0; i < 4; i++) { a0.val[0] = vreinterpretq_s32_s64( vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); a0.val[0] = vreinterpretq_s32_s64( vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); a0.val[1] = vextq_s32(in[i], zero, 1); a0.val[1] = vreinterpretq_s32_s64( vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); a0.val[1] = vreinterpretq_s32_s64( vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); a0 = vzipq_s32(a0.val[0], a0.val[1]); #if AOM_ARCH_AARCH64 out[i] = vreinterpretq_s32_s64(vzip1q_s64( vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); #else out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); #endif } if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); round_shift_4x4(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); } } void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { int32x4_t in[4]; const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; switch (tx_type) { case DCT_DCT: load_buffer_4x4(input, in); idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: load_buffer_4x4(input, in); idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; 
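// Each tx_type case follows the same sequence: 1D row pass (do_cols = 0), in-register transpose, 1D column pass (do_cols = 1), then write_buffer_4x4 with the fliplr/flipud flags implied by the transform type.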
case DCT_ADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: load_buffer_4x4(input, in); idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case IDTX: load_buffer_4x4(input, in); iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_DCT: load_buffer_4x4(input, in); iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_DCT: load_buffer_4x4(input, in); idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_ADST: load_buffer_4x4(input, in); iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_ADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_FLIPADST: load_buffer_4x4(input, in); iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case H_FLIPADST: load_buffer_4x4(input, in); iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); transpose_4x4(in, in); iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; default: assert(0); } } // 8x8 static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) { in[0] = vld1q_s32(coeff + 0); in[1] = vld1q_s32(coeff + 4); in[2] = vld1q_s32(coeff + 8); in[3] = vld1q_s32(coeff + 12); in[4] = vld1q_s32(coeff + 16); in[5] = vld1q_s32(coeff + 20); in[6] = vld1q_s32(coeff + 24); in[7] = vld1q_s32(coeff + 28); in[8] = vld1q_s32(coeff + 32); in[9] = 
vld1q_s32(coeff + 36); in[10] = vld1q_s32(coeff + 40); in[11] = vld1q_s32(coeff + 44); in[12] = vld1q_s32(coeff + 48); in[13] = vld1q_s32(coeff + 52); in[14] = vld1q_s32(coeff + 56); in[15] = vld1q_s32(coeff + 60); } static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; int32x4_t x, y; int col; const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); const int32x4_t v_bit = vdupq_n_s32(-bit); // Note: // Even column: 0, 2, ..., 14 // Odd column: 1, 3, ..., 15 // one even column plus one odd column constructs one row (8 coeffs) // total we have 8 rows (8x8). for (col = 0; col < 2; ++col) { // stage 0 // stage 1 // stage 2 u0 = in[0 * 2 + col]; u1 = in[4 * 2 + col]; u2 = in[2 * 2 + col]; u3 = in[6 * 2 + col]; x = vmulq_n_s32(in[1 * 2 + col], cospi[56]); u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]); u4 = vaddq_s32(u4, rnding); u4 = vshlq_s32(u4, v_bit); x = vmulq_n_s32(in[1 * 2 + col], cospi[8]); u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]); u7 = vaddq_s32(u7, rnding); u7 = vshlq_s32(u7, v_bit); x = vmulq_n_s32(in[5 * 2 + col], cospi[24]); u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]); u5 = vaddq_s32(u5, rnding); u5 = vshlq_s32(u5, v_bit); x = vmulq_n_s32(in[5 * 2 + col], cospi[40]); u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]); u6 = vaddq_s32(u6, rnding); u6 = vshlq_s32(u6, v_bit); // stage 3 x = vmulq_n_s32(u0, cospi[32]); y = vmulq_n_s32(u1, cospi[32]); v0 = vaddq_s32(x, y); v0 = vaddq_s32(v0, rnding); v0 = vshlq_s32(v0, v_bit); v1 = vsubq_s32(x, y); v1 = vaddq_s32(v1, rnding); v1 = vshlq_s32(v1, v_bit); x = vmulq_n_s32(u2, cospi[48]); v2 = vmlaq_n_s32(x, u3, -cospi[16]); v2 = vaddq_s32(v2, rnding); v2 = vshlq_s32(v2, v_bit); x = vmulq_n_s32(u2, cospi[16]); v3 = vmlaq_n_s32(x, u3, cospi[48]); v3 = vaddq_s32(v3, rnding); v3 = vshlq_s32(v3, v_bit); addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); // stage 4 addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); u4 = v4; u7 = v7; x = vmulq_n_s32(v5, cospi[32]); y = vmulq_n_s32(v6, cospi[32]); u6 = vaddq_s32(y, x); u6 = vaddq_s32(u6, rnding); u6 = vshlq_s32(u6, v_bit); u5 = vsubq_s32(y, x); u5 = vaddq_s32(u5, rnding); u5 = vshlq_s32(u5, v_bit); // stage 5 addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, &clamp_hi); addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, &clamp_hi); addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, &clamp_hi); addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, &clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int32x4_t kZero = vdupq_n_s32(0); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t u[8], v[8], x; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-1-2 // (1) u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]); u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); u[0] = vshlq_s32(u[0], v_bit); u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]); u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); u[1] = vshlq_s32(u[1], v_bit); // (2) u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]); u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]); u[2] = vshlq_s32(u[2], v_bit); u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]); u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]); u[3] = vshlq_s32(u[3], v_bit); // (3) u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]); u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]); u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]); u[5] = vshlq_s32(u[5], v_bit); // (4) u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]); u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]); u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]); u[7] = vshlq_s32(u[7], v_bit); // stage 3 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); u[5] = vshlq_s32(u[5], v_bit); u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]); u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]); u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]); u[7] = vshlq_s32(u[7], v_bit); // stage 5 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); x = vmulq_n_s32(v[3], cospi[32]); u[2] = vaddq_s32(v[0], x); u[2] = vshlq_s32(u[2], v_bit); u[3] = vsubq_s32(v[0], x); u[3] = vshlq_s32(u[3], v_bit); v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); x = vmulq_n_s32(v[7], cospi[32]); u[6] = vaddq_s32(v[0], x); u[6] = vshlq_s32(u[6], v_bit); u[7] = vsubq_s32(v[0], x); u[7] = vshlq_s32(u[7], v_bit); // stage 7 if (do_cols) { out[0] = u[0]; out[2] = vsubq_s32(kZero, u[4]); out[4] = u[6]; out[6] = vsubq_s32(kZero, u[2]); out[8] = u[3]; out[10] = vsubq_s32(kZero, u[7]); out[12] = u[5]; out[14] = vsubq_s32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[3], &u[7], out + 8, out + 10, 
&clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } // Odd 8 points: 1, 3, ..., 15 // stage 0 // stage 1 // stage 2 // (1) u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]); u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]); u[0] = vshlq_s32(u[0], v_bit); u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]); u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]); u[1] = vshlq_s32(u[1], v_bit); // (2) u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]); u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]); u[2] = vshlq_s32(u[2], v_bit); u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]); u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]); u[3] = vshlq_s32(u[3], v_bit); // (3) u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]); u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]); u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]); u[5] = vshlq_s32(u[5], v_bit); // (4) u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]); u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]); u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]); u[7] = vshlq_s32(u[7], v_bit); // stage 3 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); u[5] = vshlq_s32(u[5], v_bit); u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]); u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); u[7] = vshlq_s32(u[7], v_bit); // stage 5 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); x = vmulq_n_s32(v[3], cospi[32]); u[2] = vaddq_s32(v[0], x); u[2] = vshlq_s32(u[2], v_bit); u[3] = vsubq_s32(v[0], x); u[3] = vshlq_s32(u[3], v_bit); v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); x = vmulq_n_s32(v[7], cospi[32]); u[6] = vaddq_s32(v[0], x); u[6] = vshlq_s32(u[6], v_bit); u[7] = vsubq_s32(v[0], x); u[7] = vshlq_s32(u[7], v_bit); // stage 7 if (do_cols) { out[1] = u[0]; out[3] = vsubq_s32(kZero, u[4]); out[5] = u[6]; out[7] = vsubq_s32(kZero, u[2]); out[9] = u[3]; out[11] = vsubq_s32(kZero, u[7]); out[13] = u[5]; out[15] = vsubq_s32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); 
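/* As used here, each neg_shift_neon() call emits one pair of outputs: the
   first operand is rounded by `offset`, shifted right by `out_shift` and
   clamped, while the second operand is negated before the same round, shift
   and clamp, mirroring the sign pattern of the do_cols branch above. */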
neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; out[0] = vaddq_s32(in[0], in[0]); out[1] = vaddq_s32(in[1], in[1]); out[2] = vaddq_s32(in[2], in[2]); out[3] = vaddq_s32(in[3], in[3]); out[4] = vaddq_s32(in[4], in[4]); out[5] = vaddq_s32(in[5], in[5]); out[6] = vaddq_s32(in[6], in[6]); out[7] = vaddq_s32(in[7], in[7]); if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); round_shift_4x4(out, out_shift); round_shift_4x4(out + 4, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8); } } static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo, int32x4_t res_hi, int fliplr, int bd) { uint16x8x2_t x; if (fliplr) { res_lo = vrev64q_s32(res_lo); res_lo = vextq_s32(res_lo, res_lo, 2); res_hi = vrev64q_s32(res_hi); res_hi = vextq_s32(res_hi, res_hi, 2); x.val[0] = vreinterpretq_u16_s32( vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred)))); x.val[1] = vreinterpretq_u16_s32( vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred)))); } else { x.val[0] = vreinterpretq_u16_s32( vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred)))); x.val[1] = vreinterpretq_u16_s32( vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred)))); } uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); const uint16x8_t vmin = vdupq_n_u16(0); const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); return highbd_clamp_u16(&x2, &vmin, &vmax); } static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride, int fliplr, int flipud, int shift, int bd) { uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7; uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; round_shift_8x8(in, shift); v0 = vld1q_u16(output + 0 * stride); v1 = vld1q_u16(output + 1 * stride); v2 = vld1q_u16(output + 2 * stride); v3 = vld1q_u16(output + 3 * stride); v4 = vld1q_u16(output + 4 * stride); v5 = vld1q_u16(output + 5 * stride); v6 = vld1q_u16(output + 6 * stride); v7 = vld1q_u16(output + 7 * stride); if (flipud) { u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); } else { u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); } vst1q_u16(output + 0 * stride, u0); vst1q_u16(output + 1 * stride, u1); vst1q_u16(output + 2 * stride, u2); vst1q_u16(output + 3 * stride, u3); vst1q_u16(output + 4 * stride, u4); vst1q_u16(output + 5 * stride, u5); vst1q_u16(output + 6 * stride, u6); vst1q_u16(output + 7 * stride, u7); } void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { int32x4_t in[16], out[16]; const 
int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, in); idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: load_buffer_8x8(input, in); idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: load_buffer_8x8(input, in); idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_8x8(input, in); iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); } } static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t x; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-1-2-3 x = vmulq_n_s32(in[0], cospi[32]); x = vaddq_s32(x, rnding); x = vshlq_s32(x, v_bit); // stage 4-5 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); x = vaddq_s32(x, offset); x = vshlq_s32(x, vdupq_n_s32(-out_shift)); } x = vmaxq_s32(x, clamp_lo); x = vminq_s32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; } static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ?
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; int32x4_t x, y; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0 // stage 1 // stage 2 u0 = in[0]; u1 = in[4]; u2 = in[2]; u3 = in[6]; x = vmlaq_n_s32(rnding, in[1], cospi[56]); u4 = vmlaq_n_s32(x, in[7], -cospi[8]); u4 = vshlq_s32(u4, v_bit); x = vmlaq_n_s32(rnding, in[1], cospi[8]); u7 = vmlaq_n_s32(x, in[7], cospi[56]); u7 = vshlq_s32(u7, v_bit); x = vmlaq_n_s32(rnding, in[5], cospi[24]); u5 = vmlaq_n_s32(x, in[3], -cospi[40]); u5 = vshlq_s32(u5, v_bit); x = vmlaq_n_s32(rnding, in[5], cospi[40]); u6 = vmlaq_n_s32(x, in[3], cospi[24]); u6 = vshlq_s32(u6, v_bit); // stage 3 x = vmlaq_n_s32(rnding, u0, cospi[32]); y = vmulq_n_s32(u1, cospi[32]); v0 = vaddq_s32(x, y); v0 = vshlq_s32(v0, v_bit); v1 = vsubq_s32(x, y); v1 = vshlq_s32(v1, v_bit); x = vmlaq_n_s32(rnding, u2, cospi[48]); v2 = vmlaq_n_s32(x, u3, -cospi[16]); v2 = vshlq_s32(v2, v_bit); x = vmlaq_n_s32(rnding, u2, cospi[16]); v3 = vmlaq_n_s32(x, u3, cospi[48]); v3 = vshlq_s32(v3, v_bit); addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); // stage 4 addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); u4 = v4; u7 = v7; x = vmulq_n_s32(v5, cospi[32]); y = vmlaq_n_s32(rnding, v6, cospi[32]); u6 = vaddq_s32(y, x); u6 = vshlq_s32(u6, v_bit); u5 = vsubq_s32(y, x); u5 = vshlq_s32(u5, v_bit); // stage 5 addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_4x4(out, out_shift); round_shift_4x4(out + 4, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8); } } static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); int32x4_t u[8], x; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-2 u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]); u[0] = vshlq_s32(u[0], v_bit); u[1] = vmlsq_n_s32(rnding, in[0], cospi[4]); u[1] = vshlq_s32(u[1], v_bit); // stage 3-4 int32x4_t temp1, temp2; temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]); temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]); temp1 = vshlq_s32(temp1, v_bit); u[4] = temp1; temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]); u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]); u[5] = vshlq_s32(u[5], v_bit); // stage 5-6 temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]); x = vmulq_n_s32(u[1], cospi[32]); u[2] = vaddq_s32(temp1, x); u[2] = vshlq_s32(u[2], v_bit); u[3] = vsubq_s32(temp1, x); u[3] = vshlq_s32(u[3], v_bit); temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]); x = vmulq_n_s32(u[5], cospi[32]); u[6] = vaddq_s32(temp1, x); u[6] = vshlq_s32(u[6], v_bit); u[7] = vsubq_s32(temp1, x); u[7] = vshlq_s32(u[7], v_bit); // stage 7 if (do_cols) { out[0] = u[0]; out[1] = vnegq_s32(u[4]); out[2] = u[6]; out[3] = vnegq_s32(u[2]); out[4] = u[3]; out[5] =
vnegq_s32(u[7]); out[6] = u[5]; out[7] = vnegq_s32(u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); // const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t u[8], v[8], x; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-2 u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]); u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); u[0] = vshlq_s32(u[0], v_bit); u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]); u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); u[1] = vshlq_s32(u[1], v_bit); // (2) u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]); u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]); u[2] = vshlq_s32(u[2], v_bit); u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]); u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]); u[3] = vshlq_s32(u[3], v_bit); // (3) u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]); u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]); u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]); u[5] = vshlq_s32(u[5], v_bit); // (4) u[6] = vmulq_n_s32(in[1], cospi[52]); u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]); u[6] = vaddq_s32(u[6], rnding); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmulq_n_s32(in[1], cospi[12]); u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]); u[7] = vaddq_s32(u[7], rnding); u[7] = vshlq_s32(u[7], v_bit); // stage 3 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); u[5] = vshlq_s32(u[5], v_bit); u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]); u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); u[7] = vshlq_s32(u[7], v_bit); // stage 5 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); x = vmulq_n_s32(v[3], cospi[32]); u[2] = vaddq_s32(v[0], x); u[2] = 
vshlq_s32(u[2], v_bit); u[3] = vsubq_s32(v[0], x); u[3] = vshlq_s32(u[3], v_bit); v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); x = vmulq_n_s32(v[7], cospi[32]); u[6] = vaddq_s32(v[0], x); u[6] = vshlq_s32(u[6], v_bit); u[7] = vsubq_s32(v[0], x); u[7] = vshlq_s32(u[7], v_bit); // stage 7 if (do_cols) { out[0] = u[0]; out[1] = vnegq_s32(u[4]); out[2] = u[6]; out[3] = vnegq_s32(u[2]); out[4] = u[3]; out[5] = vnegq_s32(u[7]); out[6] = u[5]; out[7] = vnegq_s32(u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-4 in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]); in[0] = vshlq_s32(in[0], v_bit); // stage 5-7 if (!do_cols) { log_range = AOMMAX(16, bd + 6); clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); if (out_shift != 0) { int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); in[0] = vaddq_s32(in[0], offset); in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift)); } } in[0] = vmaxq_s32(in[0], clamp_lo); in[0] = vminq_s32(in[0], clamp_hi); out[0] = in[0]; out[1] = in[0]; out[2] = in[0]; out[3] = in[0]; out[4] = in[0]; out[5] = in[0]; out[6] = in[0]; out[7] = in[0]; out[8] = in[0]; out[9] = in[0]; out[10] = in[0]; out[11] = in[0]; out[12] = in[0]; out[13] = in[0]; out[14] = in[0]; out[15] = in[0]; } static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); int32x4_t u[16], x, y; // stage 0-1 u[0] = in[0]; u[2] = in[4]; u[4] = in[2]; u[6] = in[6]; u[8] = in[1]; u[10] = in[5]; u[12] = in[3]; u[14] = in[7]; // stage 2 u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); // stage 3 u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding); u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding); addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = vmlaq_n_s32(rnding, u[0], cospi[32]); u[0] = vshlq_s32(x, v_bit); u[1] = u[0]; u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, &rnding); u[14] = half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); u[9] = x; y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, &rnding); u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, &rnding); u[10] = y; // stage 5 addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); x = vmulq_n_s32(u[5], cospi[32]); y = vmlaq_n_s32(rnding, u[6], cospi[32]); u[5] = vsubq_s32(y, x); u[5] = vshlq_s32(u[5], v_bit); u[6] = vaddq_s32(y, x); u[6] = vshlq_s32(u[6], v_bit); addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); x = vmulq_n_s32(u[10], cospi[32]); y = vmlaq_n_s32(rnding, u[13], cospi[32]); u[10] = vsubq_s32(y, x); u[10] = vshlq_s32(u[10], v_bit); u[13] = vaddq_s32(x, y); u[13] = vshlq_s32(u[13], v_bit); x = vmulq_n_s32(u[11], cospi[32]); y = vmlaq_n_s32(rnding, u[12], cospi[32]); u[11] = vsubq_s32(y, x); u[11] = vshlq_s32(u[11], v_bit); u[12] = vaddq_s32(x, y); u[12] = vshlq_s32(u[12], v_bit); // stage 7 addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, 
&clamp_hi); addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); int32x4_t v[16], x, y, temp1, temp2; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0 // stage 1 // stage 2 v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); v[0] = vshlq_s32(v[0], v_bit); v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); v[1] = vshlq_s32(v[1], v_bit); // stage 3 v[8] = v[0]; v[9] = v[1]; // stage 4 temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]); temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]); temp1 = vshlq_s32(temp1, v_bit); temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]); temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]); temp2 = vshlq_s32(temp2, v_bit); v[8] = temp1; v[9] = temp2; // stage 5 v[4] = v[0]; v[5] = v[1]; v[12] = v[8]; v[13] = v[9]; // stage 6 temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]); temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]); temp1 = vshlq_s32(temp1, v_bit); temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]); temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]); temp2 = vshlq_s32(temp2, v_bit); v[4] = temp1; v[5] = temp2; temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]); temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]); temp1 = vshlq_s32(temp1, v_bit); temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]); temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]); temp2 = vshlq_s32(temp2, v_bit); v[12] = temp1; v[13] = temp2; // stage 7 v[2] = v[0]; v[3] = v[1]; v[6] = v[4]; v[7] = v[5]; v[10] = v[8]; v[11] = v[9]; v[14] = v[12]; v[15] = v[13]; // stage 8 y = vmlaq_n_s32(rnding, v[2], cospi[32]); x = vmulq_n_s32(v[3], cospi[32]); v[2] = vaddq_s32(y, x); v[2] = vshlq_s32(v[2], v_bit); v[3] = vsubq_s32(y, x); v[3] = vshlq_s32(v[3], v_bit); y = vmlaq_n_s32(rnding, v[6], cospi[32]); x = vmulq_n_s32(v[7], cospi[32]); v[6] = vaddq_s32(y, x); v[6] = vshlq_s32(v[6], v_bit); v[7] = vsubq_s32(y, x); v[7] = vshlq_s32(v[7], v_bit); y = vmlaq_n_s32(rnding, v[10], cospi[32]); x = vmulq_n_s32(v[11], cospi[32]); v[10] = vaddq_s32(y, x); v[10] = vshlq_s32(v[10], v_bit); v[11] = vsubq_s32(y, x); v[11] = vshlq_s32(v[11], v_bit); y = vmlaq_n_s32(rnding, v[14], cospi[32]); x = vmulq_n_s32(v[15], cospi[32]); v[14] = vaddq_s32(y, x); v[14] = vshlq_s32(v[14], v_bit); v[15] = vsubq_s32(y, x); v[15] = vshlq_s32(v[15], v_bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = vnegq_s32(v[8]); out[2] = v[12]; out[3] = vnegq_s32(v[4]); out[4] = v[6]; out[5] = vnegq_s32(v[14]); out[6] = v[10]; out[7] = vnegq_s32(v[2]); out[8] = v[3]; out[9] = vnegq_s32(v[11]); out[10] = v[15]; out[11] = vnegq_s32(v[7]); out[12] = v[5]; out[13] = vnegq_s32(v[13]); out[14] = v[9]; out[15] = vnegq_s32(v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const 
int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t zero = vdupq_n_s32(0); int32x4_t u[16], x, y; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-2 u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); u[0] = vshlq_s32(u[0], v_bit); u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); u[1] = vshlq_s32(u[1], v_bit); u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]); u[2] = vshlq_s32(u[2], v_bit); u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]); u[3] = vshlq_s32(u[3], v_bit); u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]); u[5] = vshlq_s32(u[5], v_bit); u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]); u[7] = vshlq_s32(u[7], v_bit); u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); u[8] = vshlq_s32(u[8], v_bit); u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); u[9] = vshlq_s32(u[9], v_bit); u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); u[10] = vshlq_s32(u[10], v_bit); u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]); u[11] = vshlq_s32(u[11], v_bit); u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); u[12] = vshlq_s32(u[12], v_bit); u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); u[13] = vshlq_s32(u[13], v_bit); u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); u[14] = vshlq_s32(u[14], v_bit); u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); u[15] = vshlq_s32(u[15], v_bit); // stage 3 addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 y = vmlaq_n_s32(rnding, u[8], cospi[56]); u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]); u[8] = vshlq_s32(u[8], v_bit); u[9] = vmlsq_n_s32(y, u[9], cospi[8]); u[9] = vshlq_s32(u[9], v_bit); y = vmlaq_n_s32(rnding, u[10], cospi[24]); u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]); u[10] = 
vshlq_s32(u[10], v_bit); u[11] = vmlsq_n_s32(y, u[11], cospi[40]); u[11] = vshlq_s32(u[11], v_bit); y = vmlaq_n_s32(rnding, u[12], cospi[8]); u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]); u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]); u[12] = vshlq_s32(u[12], v_bit); u[13] = vmlaq_n_s32(y, u[13], cospi[56]); u[13] = vshlq_s32(u[13], v_bit); y = vmlaq_n_s32(rnding, u[14], cospi[40]); u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]); u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]); u[14] = vshlq_s32(u[14], v_bit); u[15] = vmlaq_n_s32(y, u[15], cospi[24]); u[15] = vshlq_s32(u[15], v_bit); // stage 5 addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 y = vmlaq_n_s32(rnding, u[4], cospi[48]); u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]); u[4] = vshlq_s32(u[4], v_bit); u[5] = vmlsq_n_s32(y, u[5], cospi[16]); u[5] = vshlq_s32(u[5], v_bit); y = vmlaq_n_s32(rnding, u[6], cospi[16]); u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]); u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]); u[6] = vshlq_s32(u[6], v_bit); u[7] = vmlaq_n_s32(y, u[7], cospi[48]); u[7] = vshlq_s32(u[7], v_bit); y = vmlaq_n_s32(rnding, u[12], cospi[48]); u[12] = vmlaq_n_s32(rnding, u[12], cospi[16]); u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]); u[12] = vshlq_s32(u[12], v_bit); u[13] = vmlsq_n_s32(y, u[13], cospi[16]); u[13] = vshlq_s32(u[13], v_bit); y = vmlaq_n_s32(rnding, u[14], cospi[16]); u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]); u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]); u[14] = vshlq_s32(u[14], v_bit); u[15] = vmlaq_n_s32(y, u[15], cospi[48]); u[15] = vshlq_s32(u[15], v_bit); // stage 7 addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 y = vmlaq_n_s32(rnding, u[2], cospi[32]); x = vmulq_n_s32(u[3], cospi[32]); u[2] = vaddq_s32(y, x); u[2] = vshlq_s32(u[2], v_bit); u[3] = vsubq_s32(y, x); u[3] = vshlq_s32(u[3], v_bit); y = vmlaq_n_s32(rnding, u[6], cospi[32]); x = vmulq_n_s32(u[7], cospi[32]); u[6] = vaddq_s32(y, x); u[6] = vshlq_s32(u[6], v_bit); u[7] = vsubq_s32(y, x); u[7] = vshlq_s32(u[7], v_bit); y = vmlaq_n_s32(rnding, u[10], cospi[32]); x = vmulq_n_s32(u[11], cospi[32]); u[10] = vaddq_s32(y, x); u[10] = vshlq_s32(u[10], v_bit); u[11] = vsubq_s32(y, x); u[11] = vshlq_s32(u[11], v_bit); y = vmlaq_n_s32(rnding, u[14], cospi[32]); x = vmulq_n_s32(u[15], cospi[32]); u[14] = vaddq_s32(y, x); u[14] = vshlq_s32(u[14], v_bit); u[15] = vsubq_s32(y, x); u[15] = vshlq_s32(u[15], v_bit); // stage 9 if (do_cols) { out[0] = u[0]; out[1] = vsubq_s32(zero, u[8]); out[2] = u[12]; out[3] = vsubq_s32(zero, u[4]); out[4] = u[6]; out[5] = vsubq_s32(zero, u[14]); out[6] = u[10]; out[7] = vsubq_s32(zero, u[2]);
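/* Outputs 8-15 continue the ADST output permutation: even-numbered
   destinations take the value directly, odd-numbered destinations take the
   negated value. */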
out[8] = u[3]; out[9] = vsubq_s32(zero, u[11]); out[10] = u[15]; out[11] = vsubq_s32(zero, u[7]); out[12] = u[5]; out[13] = vsubq_s32(zero, u[13]); out[14] = u[9]; out[15] = vsubq_s32(zero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t u[16], v[16], x, y; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); { // stage 0-1 u[0] = in[0]; u[1] = in[8]; u[2] = in[4]; u[3] = in[12]; u[4] = in[2]; u[5] = in[10]; u[6] = in[6]; u[7] = in[14]; u[8] = in[1]; u[9] = in[9]; u[10] = in[5]; u[11] = in[13]; u[12] = in[3]; u[13] = in[11]; u[14] = in[7]; u[15] = in[15]; // stage 2 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit, &rnding); v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit, &rnding); v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13], &v_bit, &rnding); v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12], &v_bit, &rnding); v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit, &rnding); v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit, &rnding); v[14] = half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding); v[15] = half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit, &rnding); u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit, &rnding); u[6] = half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding); u[7] = half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding); addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = vmlaq_n_s32(rnding, u[0], cospi[32]); y = vmulq_n_s32(u[1], cospi[32]); v[0] = 
vaddq_s32(x, y); v[0] = vshlq_s32(v[0], v_bit); v[1] = vsubq_s32(x, y); v[1] = vshlq_s32(v[1], v_bit); v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit, &rnding); v[3] = half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding); addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, &rnding); v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, &rnding); v[11] = u[11]; v[12] = u[12]; v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, &rnding); v[14] = half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); v[15] = u[15]; // stage 5 addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; x = vmulq_n_s32(v[5], cospi[32]); y = vmlaq_n_s32(rnding, v[6], cospi[32]); u[5] = vsubq_s32(y, x); u[5] = vshlq_s32(u[5], v_bit); u[6] = vaddq_s32(y, x); u[6] = vshlq_s32(u[6], v_bit); u[7] = v[7]; addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = u[9]; x = vmulq_n_s32(u[10], cospi[32]); y = vmlaq_n_s32(rnding, u[13], cospi[32]); v[10] = vsubq_s32(y, x); v[10] = vshlq_s32(v[10], v_bit); v[13] = vaddq_s32(x, y); v[13] = vshlq_s32(v[13], v_bit); x = vmulq_n_s32(u[11], cospi[32]); y = vmlaq_n_s32(rnding, u[12], cospi[32]); v[11] = vsubq_s32(y, x); v[11] = vshlq_s32(v[11], v_bit); v[12] = vaddq_s32(x, y); v[12] = vshlq_s32(v[12], v_bit); v[14] = u[14]; v[15] = u[15]; // stage 7 addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t zero = vdupq_n_s32(0); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); int32x4_t u[16], v[16], x, y; // Calculate the column 0, 1, 2, 3 // stage 0 // stage 1 // stage 2 v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]); v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]); v[0] = vshlq_s32(v[0], v_bit); v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]); v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]); v[1] = vshlq_s32(v[1], v_bit); v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]); v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]); v[2] = vshlq_s32(v[2], v_bit); v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]); v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]); v[3] = vshlq_s32(v[3], v_bit); v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]); v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]); v[4] = vshlq_s32(v[4], v_bit); v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]); v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]); v[5] = vshlq_s32(v[5], v_bit); v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]); v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]); v[6] = vshlq_s32(v[6], v_bit); v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]); v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]); v[7] = vshlq_s32(v[7], v_bit); v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]); v[8] = vshlq_s32(v[8], v_bit); v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]); v[9] = vshlq_s32(v[9], v_bit); v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]); v[10] = vshlq_s32(v[10], v_bit); v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]); v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]); v[11] = vshlq_s32(v[11], v_bit); v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]); v[12] = vshlq_s32(v[12], v_bit); v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]); v[13] = vshlq_s32(v[13], v_bit); v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]); v[14] = vshlq_s32(v[14], v_bit); v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]); v[15] = vshlq_s32(v[15], v_bit); // stage 3 addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]); v[8] = vshlq_s32(v[8], v_bit); v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]); v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]); v[9] = vshlq_s32(v[9], v_bit); v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]); v[10] = vshlq_s32(v[10], v_bit); v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]); v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]); v[11] = vshlq_s32(v[11], v_bit); v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]); v[12] = vmlaq_n_s32(v[12], u[13], 
cospi[8]); v[12] = vshlq_s32(v[12], v_bit); v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]); v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]); v[13] = vshlq_s32(v[13], v_bit); v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]); v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]); v[14] = vshlq_s32(v[14], v_bit); v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]); v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]); v[15] = vshlq_s32(v[15], v_bit); // stage 5 addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]); v[4] = vshlq_s32(v[4], v_bit); v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]); v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]); v[5] = vshlq_s32(v[5], v_bit); v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]); v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]); v[6] = vshlq_s32(v[6], v_bit); v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]); v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]); v[7] = vshlq_s32(v[7], v_bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]); v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]); v[12] = vshlq_s32(v[12], v_bit); v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]); v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]); v[13] = vshlq_s32(v[13], v_bit); v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]); v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]); v[14] = vshlq_s32(v[14], v_bit); v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]); v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]); v[15] = vshlq_s32(v[15], v_bit); // stage 7 addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 v[0] = u[0]; v[1] = u[1]; y = vmlaq_n_s32(rnding, u[2], cospi[32]); x = vmulq_n_s32(u[3], cospi[32]); v[2] = vaddq_s32(y, x); v[2] = vshlq_s32(v[2], v_bit); v[3] = vsubq_s32(y, x); v[3] = vshlq_s32(v[3], v_bit); v[4] = u[4]; v[5] = u[5]; y = vmlaq_n_s32(rnding, u[6], cospi[32]); x = vmulq_n_s32(u[7], cospi[32]); v[6] = vaddq_s32(y, x); v[6] = vshlq_s32(v[6], v_bit); v[7] = vsubq_s32(y, x); v[7] = vshlq_s32(v[7], v_bit); v[8] = u[8]; v[9] = u[9]; y = vmlaq_n_s32(rnding, u[10], cospi[32]); x = vmulq_n_s32(u[11], cospi[32]); v[10] = vaddq_s32(y, x); v[10] = vshlq_s32(v[10], v_bit); v[11] = vsubq_s32(y, x); v[11] = vshlq_s32(v[11], v_bit); v[12] = u[12]; v[13] = u[13]; y = vmlaq_n_s32(rnding, u[14], cospi[32]); x = vmulq_n_s32(u[15], cospi[32]); v[14] = vaddq_s32(y, x); v[14] = vshlq_s32(v[14], v_bit); v[15] = vsubq_s32(y, x); v[15] = vshlq_s32(v[15], v_bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = vsubq_s32(zero, v[8]); out[2] = v[12]; out[3] = vsubq_s32(zero, v[4]); 
out[4] = v[6]; out[5] = vsubq_s32(zero, v[14]); out[6] = v[10]; out[7] = vsubq_s32(zero, v[2]); out[8] = v[3]; out[9] = vsubq_s32(zero, v[11]); out[10] = v[15]; out[11] = vsubq_s32(zero, v[7]); out[12] = v[5]; out[13] = vsubq_s32(zero, v[13]); out[14] = v[9]; out[15] = vsubq_s32(zero, v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); const int32x4_t v_shift = vdupq_n_s32(-out_shift); int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, &v_shift, &offset); } } static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; int32x2_t fact = vdup_n_s32(2 * NewSqrt2); int32x4x2_t a0; int32x4_t zero = vdupq_n_s32(0); const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); for (int i = 0; i < 16; i++) { a0.val[0] = vreinterpretq_s32_s64( vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); a0.val[0] = vreinterpretq_s32_s64( vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); a0.val[1] = vextq_s32(in[i], zero, 1); a0.val[1] = vreinterpretq_s32_s64( vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); a0.val[1] = vreinterpretq_s32_s64( vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); a0 = vzipq_s32(a0.val[0], a0.val[1]); #if AOM_ARCH_AARCH64 out[i] = vreinterpretq_s32_s64(vzip1q_s64( vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); #else out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); #endif } if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16); } } static inline void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int i; int32x4_t temp1, temp2, temp3, temp4; temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding); u[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding); u[10] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding); u[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding); u[11] = temp2; for (i = 16; i < 20; ++i) { addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); } temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit, rnding); temp2 = 
half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit, rnding); temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit, rnding); temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit, rnding); u[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding); u[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding); u[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding); u[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding); u[36] = temp1; u[37] = temp2; u[38] = temp3; u[39] = temp4; temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit, rnding); temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit, rnding); temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit, rnding); temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit, rnding); u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit, rnding); u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit, rnding); u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit, rnding); u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit, rnding); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; } static inline void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int i; int32x4_t temp1, temp2, temp3, temp4; for (i = 0; i < 8; ++i) { addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); } temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding); temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding); temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding); temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding); u[24] = half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding); u[25] = half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding); u[26] = half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding); u[27] = half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding); u[20] = temp1; u[21] = temp2; u[22] = temp3; u[23] = temp4; for (i = 32; i < 40; i++) { addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); } for (i = 48; i < 56; i++) { addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); } } static inline void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, const int32x4_t *v_bit, const int32x4_t *rnding) { int32x4_t temp1, temp2, temp3, temp4; for (int i = 0; i < 16; i++) { addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); } temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding); temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding); temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding); temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding); u[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding); u[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding); u[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding); u[55] 
= half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding); temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding); temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding); temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding); u[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding); u[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding); u[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding); u[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding); u[44] = temp1; u[45] = temp2; u[46] = temp3; u[47] = temp4; } static inline void idct64_stage11_neon(int32x4_t *u, int32x4_t *out, int do_cols, int bd, int out_shift, const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { for (int i = 0; i < 32; i++) { addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); for (int i = 0; i < 64; i += 4) { round_shift_4x4(out + i, out_shift); highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4); } } } static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); { int32x4_t x; // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding); // stage 8 // stage 9 // stage 10 // stage 11 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); if (out_shift != 0) { int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); x = vaddq_s32(x, offset); x = vshlq_s32(x, vdupq_n_s32(-out_shift)); } } x = vmaxq_s32(x, clamp_lo); x = vminq_s32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; out[8] = x; out[9] = x; out[10] = x; out[11] = x; out[12] = x; out[13] = x; out[14] = x; out[15] = x; out[16] = x; out[17] = x; out[18] = x; out[19] = x; out[20] = x; out[21] = x; out[22] = x; out[23] = x; out[24] = x; out[25] = x; out[26] = x; out[27] = x; out[28] = x; out[29] = x; out[30] = x; out[31] = x; out[32] = x; out[33] = x; out[34] = x; out[35] = x; out[36] = x; out[37] = x; out[38] = x; out[39] = x; out[40] = x; out[41] = x; out[42] = x; out[43] = x; out[44] = x; out[45] = x; out[46] = x; out[47] = x; out[48] = x; out[49] = x; out[50] = x; out[51] = x; out[52] = x; out[53] = x; out[54] = x; out[55] = x; out[56] = x; out[57] = x; out[58] = x; out[59] = x; out[60] = x; out[61] = x; out[62] = x; out[63] = x; } } static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); { int32x4_t u[64]; // stage 1 u[0] = in[0]; u[8] = in[4]; u[16] = in[2]; u[24] = in[6]; u[32] = in[1]; u[40] = in[5]; u[48] = in[3]; u[56] = in[7]; // stage 2 u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); // stage 3 u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); u[33] = u[32]; u[38] = u[39]; u[41] = u[40]; u[46] = u[47]; u[49] = u[48]; u[54] = u[55]; u[57] = u[56]; u[62] = u[63]; // stage 4 int32x4_t temp1, temp2; u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); u[17] = u[16]; u[22] = u[23]; u[25] = u[24]; u[30] = u[31]; temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit, &rnding); u[62] = half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); u[33] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], &v_bit, &rnding); u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], &v_bit, &rnding); u[57] = temp2; temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], &v_bit, &rnding); u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, &rnding); u[41] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], &v_bit, &rnding); u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], &v_bit, &rnding); u[46] = temp2; // stage 5 u[9] = u[8]; u[14] = u[15]; temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit, &rnding); u[30] = half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); u[17] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], &v_bit, &rnding); u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], &v_bit, &rnding); u[22] = temp2; u[35] = u[32]; u[34] = u[33]; u[36] = u[39]; u[37] = u[38]; u[43] = u[40]; u[42] = u[41]; u[44] = u[47]; u[45] = u[46]; u[51] = u[48]; u[50] = u[49]; u[52] = u[55]; u[53] = u[54]; u[59] = u[56]; u[58] = u[57]; u[60] = u[63]; u[61] = u[62]; // stage 6 temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); u[0] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, &rnding); u[14] = half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); u[9] = temp2; u[19] = u[16]; u[18] = u[17]; u[20] = u[23]; u[21] = u[22]; u[27] = u[24]; u[26] = u[25]; u[28] = u[31]; u[29] = u[30]; temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit, &rnding); u[61] = half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); 
u[34] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit, &rnding); u[60] = half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); u[35] = temp2; temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit, &rnding); u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], &v_bit, &rnding); u[36] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit, &rnding); u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], &v_bit, &rnding); u[37] = temp2; temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], &v_bit, &rnding); u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, &rnding); u[42] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], &v_bit, &rnding); u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, &rnding); u[43] = temp2; temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], &v_bit, &rnding); u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], &v_bit, &rnding); u[44] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], &v_bit, &rnding); u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], &v_bit, &rnding); u[45] = temp2; // stage 7 u[3] = u[0]; u[2] = u[1]; u[11] = u[8]; u[10] = u[9]; u[12] = u[15]; u[13] = u[14]; temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], &v_bit, &rnding); u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, &rnding); u[18] = temp1; temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], &v_bit, &rnding); u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, &rnding); u[19] = temp2; temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], &v_bit, &rnding); u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], &v_bit, &rnding); u[20] = temp1; temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], &v_bit, &rnding); u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], &v_bit, &rnding); u[21] = temp2; for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 u[7] = u[0]; u[6] = u[1]; u[5] = u[2]; u[4] = u[3]; u[9] = u[9]; idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 9 idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 10 idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 11 idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); { int32x4_t u[64]; int32x4_t tmp1, tmp2, tmp3, tmp4; // stage 1 u[0] = in[0]; u[32] = in[1]; u[36] = in[9]; u[40] = in[5]; u[44] = in[13]; u[48] = in[3]; u[52] = in[11]; u[56] = in[7]; u[60] = in[15]; u[16] = in[2]; u[20] = in[10]; u[24] = in[6]; u[28] = in[14]; u[4] = in[8]; u[8] = in[4]; u[12] = in[12]; // stage 2 u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); // stage 3 u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding); u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding); u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding); u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding); u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); u[33] = u[32]; u[34] = u[35]; u[37] = u[36]; u[38] = u[39]; u[41] = u[40]; u[42] = u[43]; u[45] = u[44]; u[46] = u[47]; u[49] = u[48]; u[50] = u[51]; u[53] = u[52]; u[54] = u[55]; u[57] = u[56]; u[58] = u[59]; u[61] = u[60]; u[62] = u[63]; // stage 4 u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); u[17] = u[16]; u[18] = u[19]; u[21] = u[20]; u[22] = u[23]; u[25] = u[24]; u[26] = u[27]; u[29] = u[28]; u[30] = u[31]; tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit, &rnding); tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit, &rnding); tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], &v_bit, &rnding); u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], &v_bit, &rnding); u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, &rnding); u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], &v_bit, &rnding); u[62] = half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); u[33] = tmp1; u[34] = tmp2; u[37] = tmp3; u[38] = tmp4; tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], 
&v_bit, &rnding); tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], &v_bit, &rnding); tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], &v_bit, &rnding); u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], &v_bit, &rnding); u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, &rnding); u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], &v_bit, &rnding); u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, &rnding); u[41] = tmp1; u[42] = tmp2; u[45] = tmp3; u[46] = tmp4; // stage 5 u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); u[9] = u[8]; u[10] = u[11]; u[13] = u[12]; u[14] = u[15]; tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit, &rnding); tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit, &rnding); tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], &v_bit, &rnding); u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], &v_bit, &rnding); u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit, &rnding); u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29], &v_bit, &rnding); u[30] = half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); u[17] = tmp1; u[18] = tmp2; u[21] = tmp3; u[22] = tmp4; for (i = 32; i < 64; i += 8) { addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); u[0] = tmp1; u[5] = u[4]; u[6] = u[7]; tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, &rnding); u[14] = half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); u[9] = tmp1; tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, &rnding); u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, &rnding); u[10] = tmp2; for (i = 16; i < 32; i += 8) { addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit, &rnding); tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit, &rnding); tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit, &rnding); u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], &v_bit, &rnding); u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], &v_bit, &rnding); u[60] = half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); u[61] = half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); u[34] = tmp1; u[35] = tmp2; 
u[36] = tmp3; u[37] = tmp4; tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], &v_bit, &rnding); tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], &v_bit, &rnding); tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], &v_bit, &rnding); u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], &v_bit, &rnding); u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], &v_bit, &rnding); u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, &rnding); u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, &rnding); u[42] = tmp1; u[43] = tmp2; u[44] = tmp3; u[45] = tmp4; // stage 7 u[3] = u[0]; u[2] = u[1]; tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding); u[6] = half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding); u[5] = tmp1; addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], &v_bit, &rnding); tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], &v_bit, &rnding); tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], &v_bit, &rnding); tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], &v_bit, &rnding); u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], &v_bit, &rnding); u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], &v_bit, &rnding); u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, &rnding); u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, &rnding); u[18] = tmp1; u[19] = tmp2; u[20] = tmp3; u[21] = tmp4; for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); } idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 9 idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 10 idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 11 idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); { int32x4_t u[64], v[64]; // stage 1 u[32] = in[1]; u[34] = in[17]; u[36] = in[9]; u[38] = in[25]; u[40] = in[5]; u[42] = in[21]; u[44] = in[13]; u[46] = in[29]; u[48] = in[3]; u[50] = in[19]; u[52] = in[11]; u[54] = in[27]; u[56] = in[7]; u[58] = in[23]; u[60] = in[15]; u[62] = in[31]; v[16] = in[2]; v[18] = in[18]; v[20] = in[10]; v[22] = in[26]; v[24] = in[6]; v[26] = in[22]; v[28] = in[14]; v[30] = in[30]; u[8] = in[4]; u[10] = in[20]; u[12] = in[12]; u[14] = in[28]; v[4] = in[8]; v[6] = in[24]; u[0] = in[0]; u[2] = in[16]; // stage 2 v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding); v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding); v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding); v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding); v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding); v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding); v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding); v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding); v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding); v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding); v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding); v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding); v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding); v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding); v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding); v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding); v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); // stage 3 u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding); u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding); u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding); u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding); u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding); u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding); u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding); u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding); u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding); u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding); u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding); u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding); u[28] = 
half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding); u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding); u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding); u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding); for (i = 32; i < 64; i += 4) { addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } // stage 4 v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); for (i = 16; i < 32; i += 4) { addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, &clamp_hi); addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit, &rnding); v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit, &rnding); v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58], &v_bit, &rnding); v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], &v_bit, &rnding); v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], &v_bit, &rnding); v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], &v_bit, &rnding); v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit, &rnding); v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], &v_bit, &rnding); v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], &v_bit, &rnding); v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, &rnding); v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], &v_bit, &rnding); v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, &rnding); v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], &v_bit, &rnding); v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, &rnding); v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], &v_bit, &rnding); v[62] = half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); // stage 5 u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding); u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding); u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding); u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding); for (i = 8; i < 16; i += 4) { addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } for (i = 16; i < 32; i += 4) { u[i + 0] = v[i + 0]; u[i + 3] = v[i + 3]; } u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30], &v_bit, &rnding); u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29], &v_bit, &rnding); u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26], &v_bit, &rnding); u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25], &v_bit, &rnding); u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], 
&v[25], &v_bit, &rnding); u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit, &rnding); u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29], &v_bit, &rnding); u[30] = half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding); for (i = 32; i < 64; i += 8) { addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); for (i = 8; i < 16; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, &rnding); v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, &rnding); v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, &rnding); v[14] = half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); for (i = 16; i < 32; i += 8) { addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, &clamp_hi); addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, &clamp_hi); addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, &clamp_hi); addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 8) { v[i + 0] = u[i + 0]; v[i + 1] = u[i + 1]; v[i + 6] = u[i + 6]; v[i + 7] = u[i + 7]; } v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit, &rnding); v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit, &rnding); v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit, &rnding); v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit, &rnding); v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], &v_bit, &rnding); v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], &v_bit, &rnding); v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], &v_bit, &rnding); v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], &v_bit, &rnding); v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], &v_bit, &rnding); v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], &v_bit, &rnding); v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, &rnding); v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, &rnding); v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], &v_bit, &rnding); v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], &v_bit, &rnding); v[60] = half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); v[61] = half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); // stage 7 addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; u[7] = v[7]; u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding); u[6] = half_btf_neon_r(&cospi[32], &v[5], 
&cospi[32], &v[6], &v_bit, &rnding); addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); for (i = 16; i < 32; i += 8) { u[i + 0] = v[i + 0]; u[i + 1] = v[i + 1]; u[i + 6] = v[i + 6]; u[i + 7] = v[i + 7]; } u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29], &v_bit, &rnding); u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28], &v_bit, &rnding); u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27], &v_bit, &rnding); u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26], &v_bit, &rnding); u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26], &v_bit, &rnding); u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27], &v_bit, &rnding); u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit, &rnding); u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit, &rnding); for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); } v[8] = u[8]; v[9] = u[9]; v[14] = u[14]; v[15] = u[15]; v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit, &rnding); v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit, &rnding); v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit, &rnding); v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit, &rnding); for (i = 16; i < 20; ++i) { addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, &clamp_hi); } for (i = 32; i < 36; ++i) { v[i] = u[i]; v[i + 12] = u[i + 12]; v[i + 16] = u[i + 16]; v[i + 28] = u[i + 28]; } v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], &v_bit, &rnding); v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], &v_bit, &rnding); v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], &v_bit, &rnding); v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], &v_bit, &rnding); v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], &v_bit, &rnding); v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], &v_bit, &rnding); v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], &v_bit, &rnding); v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], &v_bit, &rnding); v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], &v_bit, &rnding); v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], &v_bit, &rnding); v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], &v_bit, &rnding); v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], &v_bit, &rnding); v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit, &rnding); v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit, &rnding); v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit, &rnding); v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit, &rnding); // stage 9 for (i = 0; i < 8; ++i) { 
addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); } for (i = 16; i < 20; ++i) { u[i] = v[i]; u[i + 12] = v[i + 12]; } u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit, &rnding); u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit, &rnding); u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit, &rnding); u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit, &rnding); u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit, &rnding); u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit, &rnding); u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit, &rnding); u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit, &rnding); for (i = 32; i < 40; i++) { addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); } for (i = 48; i < 56; i++) { addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); } // stage 10 for (i = 0; i < 16; i++) { addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); } for (i = 32; i < 40; i++) v[i] = u[i]; v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit, &rnding); v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit, &rnding); v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit, &rnding); v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit, &rnding); v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit, &rnding); v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit, &rnding); v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit, &rnding); v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit, &rnding); v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit, &rnding); v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit, &rnding); v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit, &rnding); v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit, &rnding); v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit, &rnding); v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit, &rnding); v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit, &rnding); v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit, &rnding); for (i = 56; i < 64; i++) v[i] = u[i]; // stage 11 for (i = 0; i < 32; i++) { addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, &clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); for (i = 0; i < 64; i += 4) { round_shift_4x4(out + i, out_shift); highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4); } } } } static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t bf1; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-1 bf1 = in[0]; // stage 2-5 bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding); // stage 6-9 if (do_cols) { bf1 = vmaxq_s32(bf1, clamp_lo); bf1 = vminq_s32(bf1, clamp_hi); } else { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); if (out_shift != 0) { bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift)); } } bf1 = vmaxq_s32(bf1, clamp_lo); bf1 = vminq_s32(bf1, clamp_hi); for (int i = 0; i < 32; i++) out[i] = bf1; } static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t bf1[32]; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-1 bf1[0] = in[0]; bf1[4] = in[4]; bf1[8] = in[2]; bf1[12] = in[6]; bf1[16] = in[1]; bf1[20] = in[5]; bf1[24] = in[3]; bf1[28] = in[7]; // stage 2 bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); // stage 3 bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); bf1[17] = bf1[16]; bf1[18] = bf1[19]; bf1[21] = bf1[20]; bf1[22] = bf1[23]; bf1[25] = bf1[24]; bf1[26] = bf1[27]; bf1[29] = bf1[28]; bf1[30] = bf1[31]; // stage 4 : bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); bf1[9] = bf1[8]; bf1[10] = bf1[11]; bf1[13] = bf1[12]; bf1[14] = bf1[15]; idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); // stage 5 bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); bf1[1] = bf1[0]; bf1[5] = bf1[4]; bf1[6] = bf1[7]; idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 6 bf1[3] = bf1[0]; bf1[2] = bf1[1]; idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 7 idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 8 idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 9 idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t bf1[32]; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0-1 bf1[0] = in[0]; bf1[2] = in[8]; bf1[4] = in[4]; bf1[6] = in[12]; bf1[8] = in[2]; bf1[10] = in[10]; bf1[12] = in[6]; bf1[14] = in[14]; bf1[16] = in[1]; bf1[18] = in[9]; bf1[20] = in[5]; bf1[22] = in[13]; bf1[24] = in[3]; bf1[26] = in[11]; bf1[28] = in[7]; bf1[30] = in[15]; // stage 2 bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding); bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding); bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding); bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding); bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding); bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding); bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding); bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding); bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); // stage 3 bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding); bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding); bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding); bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding); bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding); bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding); addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); // stage 5 bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); bf1[1] = bf1[0]; bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, 
&rnding); bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding); addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 6 addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 7 idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 8 idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); // stage 9 idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); int32x4_t bf1[32], bf0[32]; const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); // stage 0 // stage 1 bf1[0] = in[0]; bf1[1] = in[16]; bf1[2] = in[8]; bf1[3] = in[24]; bf1[4] = in[4]; bf1[5] = in[20]; bf1[6] = in[12]; bf1[7] = in[28]; bf1[8] = in[2]; bf1[9] = in[18]; bf1[10] = in[10]; bf1[11] = in[26]; bf1[12] = in[6]; bf1[13] = in[22]; bf1[14] = in[14]; bf1[15] = in[30]; bf1[16] = in[1]; bf1[17] = in[17]; bf1[18] = in[9]; bf1[19] = in[25]; bf1[20] = in[5]; bf1[21] = in[21]; bf1[22] = in[13]; bf1[23] = in[29]; bf1[24] = in[3]; bf1[25] = in[19]; bf1[26] = in[11]; bf1[27] = in[27]; bf1[28] = in[7]; bf1[29] = in[23]; bf1[30] = in[15]; bf1[31] = in[31]; // stage 2 for (int i = 0; i < 16; i++) bf0[i] = bf1[i]; bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31], &v_bit, &rnding); bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30], &v_bit, &rnding); bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29], &v_bit, &rnding); bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28], &v_bit, &rnding); bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27], &v_bit, &rnding); bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26], &v_bit, &rnding); bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25], &v_bit, &rnding); bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24], &v_bit, &rnding); bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit, &rnding); bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit, &rnding); bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit, &rnding); bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit, &rnding); bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit, &rnding); bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit, &rnding); bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit, &rnding); bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit, &rnding); // stage 3 for (int i = 0; i < 8; i++) bf1[i] = bf0[i]; bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15], &v_bit, &rnding); bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14], &v_bit, &rnding); 
bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13], &v_bit, &rnding); bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12], &v_bit, &rnding); bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit, &rnding); bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit, &rnding); bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit, &rnding); bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit, &rnding); addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf0[0] = bf1[0]; bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7], &v_bit, &rnding); bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6], &v_bit, &rnding); bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit, &rnding); bf0[7] = half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding); addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], &v_bit, &rnding); bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29], &v_bit, &rnding); bf0[19] = bf1[19]; bf0[20] = bf1[20]; bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], &v_bit, &rnding); bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], &v_bit, &rnding); bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], &v_bit, &rnding); bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit, &rnding); bf0[27] = bf1[27]; bf0[28] = bf1[28]; bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], &v_bit, &rnding); bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit, &rnding); bf0[31] = bf1[31]; // stage 5 bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit, &rnding); bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit, &rnding); bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3], &v_bit, &rnding); bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit, &rnding); addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14], &v_bit, &rnding); bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13], &v_bit, &rnding); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], 
&bf0[13], &v_bit, &rnding); bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit, &rnding); bf1[15] = bf0[15]; addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); // stage 6 addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); bf0[4] = bf1[4]; bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit, &rnding); bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit, &rnding); bf0[7] = bf1[7]; addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], &v_bit, &rnding); bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], &v_bit, &rnding); bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], &v_bit, &rnding); bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], &v_bit, &rnding); bf0[22] = bf1[22]; bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = bf1[25]; bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], &v_bit, &rnding); bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], &v_bit, &rnding); bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit, &rnding); bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit, &rnding); bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 7 addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit, &rnding); bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit, &rnding); bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit, &rnding); bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit, &rnding); bf1[14] = bf0[14]; bf1[15] = bf0[15]; addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, 
&clamp_lo, &clamp_hi); // stage 8 addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = bf1[18]; bf0[19] = bf1[19]; bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit, &rnding); bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit, &rnding); bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit, &rnding); bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit, &rnding); bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit, &rnding); bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit, &rnding); bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit, &rnding); bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit, &rnding); bf0[28] = bf1[28]; bf0[29] = bf1[29]; bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 9 addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); round_shift_8x8(out + 16, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; for (int i = 0; i < 32; i += 16) { out[i] = vshlq_n_s32(in[i], 2); out[i + 1] = vshlq_n_s32(in[i + 1], 2); out[i + 2] = vshlq_n_s32(in[i + 2], 2); out[i + 3] = vshlq_n_s32(in[i + 3], 2); out[i + 4] = vshlq_n_s32(in[i + 4], 2); out[i + 5] = vshlq_n_s32(in[i + 5], 2); out[i + 6] = vshlq_n_s32(in[i + 6], 2); out[i + 7] = vshlq_n_s32(in[i + 7], 2); out[i + 8] = vshlq_n_s32(in[i + 8], 2); out[i + 9] = vshlq_n_s32(in[i + 9], 2); out[i + 10] = 
vshlq_n_s32(in[i + 10], 2); out[i + 11] = vshlq_n_s32(in[i + 11], 2); out[i + 12] = vshlq_n_s32(in[i + 12], 2); out[i + 13] = vshlq_n_s32(in[i + 13], 2); out[i + 14] = vshlq_n_s32(in[i + 14], 2); out[i + 15] = vshlq_n_s32(in[i + 15], 2); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); round_shift_8x8(out + 16, out_shift); highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } // 1D itx types typedef enum ATTRIBUTE_PACKED { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, } ITX_TYPE_1D; static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, }; static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, }; static const transform_1d_neon highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { idct4x4_neon, NULL, NULL, NULL }, { iadst4x4_neon, NULL, NULL, NULL }, { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL }, }, { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL }, { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL }, { iidentity8_neon, iidentity8_neon, NULL, NULL } }, { { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL }, { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL }, { iidentity16_neon, NULL, iidentity16_neon, NULL }, }, { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon, idct32x32_neon }, { NULL, NULL, NULL, NULL }, { iidentity32_neon, NULL, NULL, NULL } }, { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon, idct64x64_neon }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output, int stride, TX_TYPE tx_type, const int bd) { TX_SIZE tx_size = TX_4X8; int32x4_t buf1[32] = { vdupq_n_s32(0) }; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; const int input_stride = AOMMIN(32, txfm_size_row); assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform int32x4_t buf0[8]; load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); if (lr_flip) { TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], buf1[3]); TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], buf1[7]); } else { TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], buf1[3]); 
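// av1_inv_txfm2d_add_4x8_neon follows the usual 2-D decomposition:
// load -> rectangular rescale -> 1-D row transform -> 4x4 transposes ->
// 1-D column transform -> final round shift -> add to the prediction.
// Roughly, in scalar terms (a sketch of the flow, not the bit-exact math):
//   tmp  = transpose(row_txfm(rect_scale(input)))
//   res  = round_shift(col_txfm(tmp), shift[1])
//   dest = clip_pixel_highbd(dest + res, bd)
// The TRANSPOSE_4X4 just above re-tiles rows 0..3 of the row-transform
// output into column vectors; the call below does the same for rows 4..7.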
TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], buf1[7]); } // 2nd stage: column transform col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); // write to buffer highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, bd); } void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, const int bd) { TX_SIZE tx_size = TX_8X4; int32x4_t buf1[8]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform int32x4_t buf0[8]; const int32_t *input_row = input; load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *buf1_ptr; if (lr_flip) { flip_buf_neon(buf0, buf1, txfm_size_col); buf1_ptr = buf1; } else { buf1_ptr = buf0; } // 2nd stage: column transform for (int i = 0; i < 2; i++) { int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; transpose_4x4(buf1_cur, buf1_cur); col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); } round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); // write to buffer highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row, bd); } void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, const int bd) { TX_SIZE tx_size = TX_4X16; int32x4_t buf1[16]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_h_div8 = txfm_size_row >> 2; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; const int input_stride = AOMMIN(32, txfm_size_row); assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform int32x4_t buf0[16]; for (int i = 0; i < (txfm_size_row >> 2); i++) { const int32_t *input_row = input + i * 4; int32x4_t *buf0_cur = buf0 + i * 4; load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); } if (lr_flip) { for (int j = 0; j < buf_size_h_div8; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); } } else { for (int j = 0; j < buf_size_h_div8; ++j) { TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); } } // 2nd stage: column transform col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); // write to buffer 
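// The write-back helper is assumed (as defined earlier in this file) to add
// the inverse-transform residual to the existing prediction in `output` and
// clamp to the bit-depth range, i.e. roughly:
//   for (r = 0; r < txfm_size_row; ++r)
//     for (c = 0; c < 4; ++c)
//       output[r * stride + c] = clip_pixel_highbd(
//           output[r * stride + c] + resid[r][c], bd);
// with rows traversed in reverse order when ud_flip is set.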
highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, bd); } void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, const int bd) { TX_SIZE tx_size = TX_16X4; int32x4_t buf1[16]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 2; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform int32x4_t buf0[16]; const int32_t *input_row = input; load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *buf1_ptr; if (lr_flip) { flip_buf_neon(buf0, buf1, txfm_size_col); buf1_ptr = buf1; } else { buf1_ptr = buf0; } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; transpose_4x4(buf1_cur, buf1_cur); col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); } round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); // write to buffer for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } static const int lowbd_txfm_all_1d_zeros_idx[32] = { 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, }; // Transform block width in log2 for eob (size of 64 map to 32) static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x16_default[16]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x32_default[32]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x32_default[32]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x16_default[16]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 
0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, }; DECLARE_ALIGNED(16, static const int16_t *, av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { NULL, av1_eob_to_eobxy_8x8_default, av1_eob_to_eobxy_16x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x16_default, av1_eob_to_eobxy_16x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x32_default, av1_eob_to_eobxy_32x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, }; static inline void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; *eoby = 0; return; } const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; const int eob_row = (eob - 1) >> tx_w_log2; const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; *eobx = eobxy & 0xFF; *eoby = eobxy >> 8; } static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size) { if (tx_size == 2) { *eoby = 15, *eobx = 15; } else if (tx_size == 3) { *eoby = 31, *eobx = 31; } else if (tx_size == 4) { *eoby = 31, *eobx = 31; } else if (tx_size == 7) { *eoby = 15, *eobx = 7; } else if (tx_size == 8) { *eoby = 7, *eobx = 15; } else if (tx_size == 9) { *eoby = 31, *eobx = 15; } else if (tx_size == 10) { *eoby = 15, *eobx = 31; } else if (tx_size == 11) { *eoby = 31, *eobx = 31; } else if (tx_size == 12) { *eoby = 31, *eobx = 31; } else if (tx_size == 15) { *eoby = 31, *eobx = 7; } else if (tx_size == 16) { *eoby = 7, *eobx = 31; } else if (tx_size == 17) { *eoby = 31, *eobx = 15; } else if (tx_size == 18) { *eoby = 15, *eobx = 31; } else { *eoby = 0, *eobx = 0; } } static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size) { const int txfm_size_row = tx_size_high[tx_size]; *eoby = AOMMIN(32, txfm_size_row) - 1; *eobx = 0; } static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, TX_SIZE tx_size) { const int txfm_size_col = tx_size_wide[tx_size]; *eobx = AOMMIN(32, txfm_size_col) - 1; *eoby = 0; } static void inv_txfm2d_add_h_identity_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, const int bd) { int32x4_t buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w = AOMMIN(32, txfm_size_col); const int buf_size_w_div4 = buf_size_w >> 2; const int buf_size_h_div8 = (eoby + 8) >> 3; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; assert(row_txfm != 
NULL); const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; assert(col_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { int32x4_t buf0[16]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *_buf1 = buf1 + i * 4; for (int j = 0; j < buf_size_w_div4; ++j) { int32x4_t *buf0_cur = buf0 + j * 4; TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); _buf1[j * txfm_size_row + 0] = buf0_cur[0]; _buf1[j * txfm_size_row + 1] = buf0_cur[1]; _buf1[j * txfm_size_row + 2] = buf0_cur[2]; _buf1[j * txfm_size_row + 3] = buf0_cur[3]; } } for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } static void inv_txfm2d_add_v_identity_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, const int bd) { int32x4_t buf1[64]; int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; assert(row_txfm != NULL); const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < (row_max >> 2); ++i) { int32x4_t buf0[16]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *_buf1 = buf1 + i * 4; if (lr_flip) { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); } } else { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4( buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 
+ i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } } static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, const int bd) { int32x4_t buf1[64 * 4]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int buf_size_w = AOMMIN(32, txfm_size_col); const int buf_size_w_div4 = buf_size_w >> 2; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; assert(row_txfm != NULL); const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); for (int i = 0; i < (row_max >> 2); ++i) { int32x4_t buf0[32]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *_buf1 = buf1 + i * 4; for (int j = 0; j < buf_size_w_div4; ++j) { int32x4_t *buf0_cur = buf0 + j * 4; TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); _buf1[j * txfm_size_row + 0] = buf0_cur[0]; _buf1[j * txfm_size_row + 1] = buf0_cur[1]; _buf1[j * txfm_size_row + 2] = buf0_cur[2]; _buf1[j * txfm_size_row + 3] = buf0_cur[3]; } } for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, 0, txfm_size_row, bd); } } } static void inv_txfm2d_add_no_identity_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, const int bd) { int32x4_t buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div4 = txfm_size_col >> 2; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, 
&ud_flip, &lr_flip); // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { int32x4_t buf0[64]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *_buf1 = &buf1[i * 4]; if (lr_flip) { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); } } else { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4( buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } } static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { int32x4_t buf1[64 * 16]; int eobx, eoby; highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 2; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_col); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_neon row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_neon col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { int32x4_t buf0[64]; const int32_t *input_row = input + i * input_stride * 4; for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { int32x4_t *buf0_cur = &buf0[j * 4]; load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); } if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); int32x4_t *_buf1 = &buf1[i * 4]; if (lr_flip) { for (int j = 0; j < buf_size_w_div8; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], 
_buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); } } else { for (int j = 0; j < buf_size_w_div8; ++j) { TRANSPOSE_4X4( buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_neon(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } } static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; case V_DCT: case V_ADST: case V_FLIPADST: inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; case H_DCT: case H_ADST: case H_FLIPADST: inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; case IDTX: inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; default: assert(0); break; } } static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, const int bd) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; case V_DCT: case V_ADST: case V_FLIPADST: inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; case H_DCT: case H_ADST: case H_FLIPADST: inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; case IDTX: inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, bd); break; default: assert(0); break; } } static void highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { case IDTX: case H_DCT: case H_ADST: case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, txfm_param->tx_size, txfm_param->eob, bd); break; default: av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; } } static void highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; int lossless = txfm_param->lossless; const int32_t *src = cast_to_int32(input); const TX_TYPE 
tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16, bd); } void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8, bd); } void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X32, bd); } void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X16, bd); } void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X32, bd); } void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X64, bd); } void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X64, bd); } void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X32, bd); } void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_64X16, bd); } void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X64, bd); } static void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X16, bd); } void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8, bd); } void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest, int stride, TX_TYPE tx_type, const int bd) { inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32, bd); } void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const TX_SIZE tx_size = txfm_param->tx_size; TX_TYPE tx_type = txfm_param->tx_type; int bd = txfm_param->bd; switch (tx_size) { case TX_8X8: highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param); break; case TX_4X8: av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; case TX_8X4: av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; case TX_4X4: 
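// TX_4X4 (and TX_8X8 above) dispatch through small wrappers rather than
// calling av1_inv_txfm2d_add_*_neon directly: the 8x8 wrapper routes the
// identity-based tx_types through the eob-aware "universe" path, and the
// 4x4 wrapper falls back to the inverse Walsh-Hadamard transform
// (av1_highbd_iwht4x4_add) for lossless blocks.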
highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param); break; case TX_16X4: av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; case TX_4X16: av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); break; case TX_8X16: av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_16X8: av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_16X32: av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_32X16: av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_16X16: av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_32X32: av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_64X64: av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_32X64: av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_64X32: av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_16X64: av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_64X16: av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_32X8: av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; case TX_8X32: av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type, bd); break; } } aom-3.12.1/av1/common/arm/highbd_reconinter_neon.c000066400000000000000000000247101477627663500220030ustar00rootroot00000000000000/* * * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/blend.h" #include "aom_ports/mem.h" #include "config/av1_rtcd.h" static inline void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse, const uint16_t *src0, int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, const unsigned int bd) { assert(DIFF_FACTOR > 0); uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA); uint8x16_t mask_base = vdupq_n_u8(38); uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38); if (bd == 8) { if (w >= 16) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0_lo = vld1q_u16(src0_ptr); uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); uint16x8_t s1_lo = vld1q_u16(src1_ptr); uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2); uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2); uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); uint8x16_t m; if (inverse) { m = vqsubq_u8(mask_diff, diff); } else { m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); } vst1q_u8(mask_ptr, m); src0_ptr += 16; src1_ptr += 16; mask_ptr += 16; width -= 16; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else if (w == 8) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0 = vld1q_u16(src0_ptr); uint16x8_t s1 = vld1q_u16(src1_ptr); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } vst1_u8(mask_ptr, m); src0_ptr += 8; src1_ptr += 8; mask_ptr += 8; width -= 8; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else if (w == 4) { do { uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; mask += 2 * w; h -= 2; } while (h != 0); } } else if (bd == 10) { if (w >= 16) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0_lo = vld1q_u16(src0_ptr); uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); uint16x8_t s1_lo = vld1q_u16(src1_ptr); uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2); uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2); uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); uint8x16_t m; if (inverse) { m = vqsubq_u8(mask_diff, diff); } else { m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); } vst1q_u8(mask_ptr, m); src0_ptr += 16; src1_ptr += 16; mask_ptr += 16; width -= 16; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else 
if (w == 8) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0 = vld1q_u16(src0_ptr); uint16x8_t s1 = vld1q_u16(src1_ptr); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } vst1_u8(mask_ptr, m); src0_ptr += 8; src1_ptr += 8; mask_ptr += 8; width -= 8; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else if (w == 4) { do { uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; mask += 2 * w; h -= 2; } while (h != 0); } } else { assert(bd == 12); if (w >= 16) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0_lo = vld1q_u16(src0_ptr); uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); uint16x8_t s1_lo = vld1q_u16(src1_ptr); uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2); uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2); uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); uint8x16_t m; if (inverse) { m = vqsubq_u8(mask_diff, diff); } else { m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); } vst1q_u8(mask_ptr, m); src0_ptr += 16; src1_ptr += 16; mask_ptr += 16; width -= 16; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else if (w == 8) { do { uint8_t *mask_ptr = mask; const uint16_t *src0_ptr = src0; const uint16_t *src1_ptr = src1; int width = w; do { uint16x8_t s0 = vld1q_u16(src0_ptr); uint16x8_t s1 = vld1q_u16(src1_ptr); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } vst1_u8(mask_ptr, m); src0_ptr += 8; src1_ptr += 8; mask_ptr += 8; width -= 8; } while (width != 0); mask += w; src0 += src0_stride; src1 += src1_stride; } while (--h != 0); } else if (w == 4) { do { uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); uint16x8_t diff_u16 = vabdq_u16(s0, s1); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); } else { m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), vget_low_u8(max_alpha)); } store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; mask += 2 * w; h -= 2; } while (h != 0); } } } void av1_build_compound_diffwtd_mask_highbd_neon( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd) { assert(h % 4 == 0); assert(w % 4 == 
0); assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38); if (mask_type == DIFFWTD_38) { diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0), src0_stride, CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); } else { // mask_type == DIFFWTD_38_INV diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0), src0_stride, CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); } } aom-3.12.1/av1/common/arm/highbd_reconintra_neon.c000066400000000000000000000162411477627663500217770ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/sum_neon.h" #include "config/av1_rtcd.h" #define MAX_UPSAMPLE_SZ 16 void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) { if (!strength) return; assert(sz >= 0 && sz <= 129); DECLARE_ALIGNED(16, static const uint16_t, idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 }; const uint16x8_t index = vld1q_u16(idx); uint16_t edge[160]; // Max value of sz + enough padding for vector accesses. memcpy(edge + 1, p, sz * sizeof(*p)); // Populate extra space appropriately. edge[0] = edge[1]; edge[sz + 1] = edge[sz]; edge[sz + 2] = edge[sz]; // Don't overwrite first pixel. uint16_t *dst = p + 1; sz--; if (strength == 1) { // Filter: {4, 8, 4}. const uint16_t *src = edge + 1; while (sz >= 8) { uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); // Make use of the identity: // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 uint16x8_t t0 = vaddq_u16(s0, s2); uint16x8_t t1 = vaddq_u16(s1, s1); uint16x8_t sum = vaddq_u16(t0, t1); uint16x8_t res = vrshrq_n_u16(sum, 2); vst1q_u16(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); // Make use of the identity: // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 uint16x8_t t0 = vaddq_u16(s0, s2); uint16x8_t t1 = vaddq_u16(s1, s1); uint16x8_t sum = vaddq_u16(t0, t1); uint16x8_t res = vrshrq_n_u16(sum, 2); // Mask off out-of-bounds indices. uint16x8_t current_dst = vld1q_u16(dst); uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); res = vbslq_u16(mask, res, current_dst); vst1q_u16(dst, res); } } else if (strength == 2) { // Filter: {5, 6, 5}. const uint16_t *src = edge + 1; const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6), vdupq_n_u16(5) } }; while (sz >= 8) { uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t accum = vmulq_u16(s0, filter.val[0]); accum = vmlaq_u16(accum, s1, filter.val[1]); accum = vmlaq_u16(accum, s2, filter.val[2]); uint16x8_t res = vrshrq_n_u16(accum, 4); vst1q_u16(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. 
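// Tail handling: rather than falling back to scalar code, a full 8-lane
// vector is still loaded and filtered, and the result is then blended with
// the previously stored values so that lanes at or beyond `sz` keep their
// original contents. In scalar terms:
//   for (i = 0; i < 8; ++i)
//     dst[i] = (i < sz) ? filtered[i] : dst[i];
// which is what the vcgtq_u16(vdupq_n_u16(sz), index) mask and vbslq_u16
// below implement. (The full-width load/store of `dst` still happens; only
// the values beyond `sz` are preserved rather than overwritten.)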
uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t accum = vmulq_u16(s0, filter.val[0]); accum = vmlaq_u16(accum, s1, filter.val[1]); accum = vmlaq_u16(accum, s2, filter.val[2]); uint16x8_t res = vrshrq_n_u16(accum, 4); // Mask off out-of-bounds indices. uint16x8_t current_dst = vld1q_u16(dst); uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); res = vbslq_u16(mask, res, current_dst); vst1q_u16(dst, res); } } else { // Filter {2, 4, 4, 4, 2}. const uint16_t *src = edge; while (sz >= 8) { uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t s3 = vld1q_u16(src + 3); uint16x8_t s4 = vld1q_u16(src + 4); // Make use of the identity: // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 uint16x8_t t0 = vaddq_u16(s0, s4); uint16x8_t t1 = vaddq_u16(s1, s2); t1 = vaddq_u16(t1, s3); t1 = vaddq_u16(t1, t1); uint16x8_t sum = vaddq_u16(t0, t1); uint16x8_t res = vrshrq_n_u16(sum, 3); vst1q_u16(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t s3 = vld1q_u16(src + 3); uint16x8_t s4 = vld1q_u16(src + 4); // Make use of the identity: // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 uint16x8_t t0 = vaddq_u16(s0, s4); uint16x8_t t1 = vaddq_u16(s1, s2); t1 = vaddq_u16(t1, s3); t1 = vaddq_u16(t1, t1); uint16x8_t sum = vaddq_u16(t0, t1); uint16x8_t res = vrshrq_n_u16(sum, 3); // Mask off out-of-bounds indices. uint16x8_t current_dst = vld1q_u16(dst); uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); res = vbslq_u16(mask, res, current_dst); vst1q_u16(dst, res); } } } void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) { if (!sz) return; assert(sz <= MAX_UPSAMPLE_SZ); uint16_t edge[MAX_UPSAMPLE_SZ + 3]; const uint16_t *src = edge; // Copy p[-1..(sz-1)] and pad out both ends. edge[0] = p[-1]; edge[1] = p[-1]; memcpy(edge + 2, p, sz * 2); edge[sz + 2] = p[sz - 1]; p[-2] = p[-1]; uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1); uint16_t *dst = p - 1; if (bd == 12) { do { uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t s3 = vld1q_u16(src + 3); uint16x8_t t0 = vaddq_u16(s1, s2); uint16x8_t t1 = vaddq_u16(s0, s3); uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9); acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1))); uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9); acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1))); uint16x8x2_t res; res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4)); // Clamp pixel values at bitdepth maximum. res.val[0] = vminq_u16(res.val[0], pixel_val_max); res.val[1] = s2; vst2q_u16(dst, res); src += 8; dst += 16; sz -= 8; } while (sz > 0); } else { // Bit depth is 8 or 10. do { uint16x8_t s0 = vld1q_u16(src); uint16x8_t s1 = vld1q_u16(src + 1); uint16x8_t s2 = vld1q_u16(src + 2); uint16x8_t s3 = vld1q_u16(src + 3); uint16x8_t t0 = vaddq_u16(s0, s3); uint16x8_t t1 = vaddq_u16(s1, s2); t1 = vmulq_n_u16(t1, 9); t1 = vqsubq_u16(t1, t0); uint16x8x2_t res; res.val[0] = vrshrq_n_u16(t1, 4); // Clamp pixel values at bitdepth maximum. 
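// Edge upsampling doubles the number of edge samples: each new half-sample
// position is interpolated with the {-1, 9, 9, -1} kernel and interleaved
// with the original samples by vst2q_u16. In scalar terms, for neighbours
// a, b, c, d the pair written is roughly:
//   filtered = clip((9 * (b + c) - (a + d) + 8) >> 4, 0, (1 << bd) - 1);
//   out[2 * i] = filtered;  out[2 * i + 1] = c;
// Unlike the bd == 12 branch above, 9 * (b + c) cannot overflow 16 bits for
// 8- and 10-bit input (9 * 2 * 1023 = 18414), so the accumulation can stay
// in uint16x8_t here; vqsubq_u16 saturates negative results to zero before
// the rounding shift.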
res.val[0] = vminq_u16(res.val[0], pixel_val_max); res.val[1] = s2; vst2q_u16(dst, res); src += 8; dst += 16; sz -= 8; } while (sz > 0); } } aom-3.12.1/av1/common/arm/highbd_warp_plane_neon.c000066400000000000000000000305711477627663500217650ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/scale.h" #include "av1/common/warped_motion.h" #include "config/av1_rtcd.h" #include "highbd_warp_plane_neon.h" static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); int32x4_t m0123[] = { m0, m1, m2, m3 }; const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res = horizontal_add_4d_s32x4(m0123); res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); res = vrshlq_s32(res, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4)); m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4)); int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5)); m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5)); int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6)); m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6)); int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7)); m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7)); int32x4_t m0123[] = { m0, m1, m2, m3 }; int32x4_t m4567[] = { m4, m5, m6, m7 }; const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res0 = horizontal_add_4d_s32x4(m0123); int32x4_t res1 = horizontal_add_4d_s32x4(m4567); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx) { int16x8_t f = load_filters_1(sx); int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); int32x4_t m0123[] = { m0, m1, m2, m3 }; const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res = horizontal_add_4d_s32x4(m0123); res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); res = vrshlq_s32(res, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { int16x8_t f = load_filters_1(sx); int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4)); m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4)); int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5)); m5 = vmlal_s16(m5, vget_high_s16(f), vget_high_s16(rv5)); int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6)); m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6)); int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7)); m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7)); int32x4_t m0123[] = { m0, m1, m2, m3 }; int32x4_t m4567[] = { m4, m5, m6, m7 }; const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res0 = horizontal_add_4d_s32x4(m0123); int32x4_t res1 = horizontal_add_4d_s32x4(m4567); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); return m0123; } static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); int32x4_t m4567 = 
vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); return (int32x4x2_t){ { m0123, m4567 } }; } static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); int16x8_t f[4]; load_filters_4(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m0123[] = { m0, m1, m2, m3 }; return horizontal_add_4d_s32x4(m0123); } static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, int gamma) { int16x8_t s0 = tmp[0]; int16x8_t s1 = tmp[1]; int16x8_t s2 = tmp[2]; int16x8_t s3 = tmp[3]; int16x8_t s4 = tmp[4]; int16x8_t s5 = tmp[5]; int16x8_t s6 = tmp[6]; int16x8_t s7 = tmp[7]; transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t f[8]; load_filters_8(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); int32x4_t m0123[] = { m0, m1, m2, m3 }; int32x4_t m4567[] = { m4, m5, m6, m7 }; int32x4x2_t ret; ret.val[0] = horizontal_add_4d_s32x4(m0123); ret.val[1] = horizontal_add_4d_s32x4(m4567); return ret; } void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, 
subsampling_x, subsampling_y, bd, conv_params, alpha, beta, gamma, delta); } aom-3.12.1/av1/common/arm/highbd_warp_plane_neon.h000066400000000000000000000566131477627663500217770ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ #define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_ports/mem.h" #include "av1/common/scale.h" #include "av1/common/warped_motion.h" #include "config/av1_rtcd.h" static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx, int alpha); static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha); static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f1( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx); static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx); static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy); static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy); static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, int gamma); static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, int gamma); static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) { const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; return vld1q_s16(base + ofs0 * 8); } static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) { const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); out[3] = vld1q_s16(base + ofs3 * 8); } static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) { const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, 
WARPEDDIFF_PREC_BITS); const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); const int16_t *base = av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS]; out[0] = vld1q_s16(base + ofs0 * 8); out[1] = vld1q_s16(base + ofs1 * 8); out[2] = vld1q_s16(base + ofs2 * 8); out[3] = vld1q_s16(base + ofs3 * 8); out[4] = vld1q_s16(base + ofs4 * 8); out[5] = vld1q_s16(base + ofs5 * 8); out[6] = vld1q_s16(base + ofs6 * 8); out[7] = vld1q_s16(base + ofs7 * 8); } static AOM_FORCE_INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) { const int limit = (1 << bd) - 1; return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); } static AOM_FORCE_INLINE uint16x8x2_t clamp_horizontal( uint16x8x2_t src_1, int out_of_boundary_left, int out_of_boundary_right, const uint16_t *ref, int iy, int stride, int width, const uint16x8_t indx0, const uint16x8_t indx1) { if (out_of_boundary_left >= 0) { uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); } if (out_of_boundary_right >= 0) { uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); } return src_1; } static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, int width, int height, int stride, int p_width, int16_t alpha, int16_t beta, int iy4, int sx4, int ix4, int16x8_t tmp[], int bd) { const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; if (ix4 <= -7) { for (int k = 0; k < 15; ++k) { int iy = clamp(iy4 + k - 7, 0, height - 1); int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + ref[iy * stride] * (1 << (FILTER_BITS - round0)); tmp[k] = vdupq_n_s16(dup_val); } return; } else if (ix4 >= width + 6) { for (int k = 0; k < 15; ++k) { int iy = clamp(iy4 + k - 7, 0, height - 1); int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); tmp[k] = vdupq_n_s16(dup_val); } return; } static const uint16_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; const uint16x8_t indx0 = vld1q_u16(kIotaArr); const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8); const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; #define APPLY_HORIZONTAL_SHIFT_4X1(fn, ...) \ do { \ if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ const uint16_t *idx = ref + iy * stride + ix4 - 7; \ /* We don't use vld1q_u16_x2 here as LLVM generates an incorrect \ * alignment hint for this intrinsic that causes a SIGBUS on Armv7 \ * targets when alignment checks are enabled. 
\ * (See bug: b/349455146) */ \ uint16x8x2_t src_1 = { { vld1q_u16(idx), vld1q_u16(idx + 8) } }; \ src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ out_of_boundary_right, ref, iy, stride, \ width, indx0, indx1); \ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 0); \ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 1); \ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 2); \ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 3); \ tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ } \ } else { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ const uint16_t *src = ref + iy * stride + ix4; \ int16x8_t rv0 = vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ int16x8_t rv2 = vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ tmp[k] = (fn)(rv0, rv1, rv2, rv3, __VA_ARGS__); \ } \ } \ } while (0) #define APPLY_HORIZONTAL_SHIFT_8X1(fn, ...) \ do { \ if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ const uint16_t *idx = ref + iy * stride + ix4 - 7; \ /* We don't use vld1q_u16_x2 here as LLVM generates an incorrect \ * alignment hint for this intrinsic that causes a SIGBUS on Armv7 \ * targets when alignment checks are enabled. \ * (See bug: b/349455146) */ \ uint16x8x2_t src_1 = { { vld1q_u16(idx), vld1q_u16(idx + 8) } }; \ src_1 = clamp_horizontal(src_1, out_of_boundary_left, \ out_of_boundary_right, ref, iy, stride, \ width, indx0, indx1); \ int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 0); \ int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 1); \ int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 2); \ int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 3); \ int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 4); \ int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 5); \ int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 6); \ int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(src_1.val[0]), \ vreinterpretq_s16_u16(src_1.val[1]), 7); \ tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ } \ } else { \ for (int k = 0; k < 15; ++k) { \ const int iy = clamp(iy4 + k - 7, 0, height - 1); \ const uint16_t *src = ref + iy * stride + ix4; \ int16x8_t rv0 = vreinterpretq_s16_u16(vld1q_u16(src - 7)); \ int16x8_t rv1 = vreinterpretq_s16_u16(vld1q_u16(src - 6)); \ int16x8_t rv2 = vreinterpretq_s16_u16(vld1q_u16(src - 5)); \ int16x8_t rv3 = vreinterpretq_s16_u16(vld1q_u16(src - 4)); \ int16x8_t rv4 = vreinterpretq_s16_u16(vld1q_u16(src - 3)); \ int16x8_t rv5 = vreinterpretq_s16_u16(vld1q_u16(src - 2)); \ int16x8_t rv6 = vreinterpretq_s16_u16(vld1q_u16(src - 1)); \ int16x8_t rv7 = vreinterpretq_s16_u16(vld1q_u16(src - 0)); \ tmp[k] = (fn)(rv0, rv1, rv2, rv3, rv4, rv5, rv6, rv7, __VA_ARGS__); \ } \ } \ } while (0) if (p_width == 4) { if (beta == 0) { if (alpha == 0) { 
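        // alpha == 0 and beta == 0: every pixel in the block shares the same
        // horizontal filter phase (sx4), so the _f1 variant is used with a
        // single filter load instead of per-pixel filters.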
APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, sx4); } else { APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha); } } else { if (alpha == 0) { APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f1, bd, (sx4 + beta * (k - 3))); } else { APPLY_HORIZONTAL_SHIFT_4X1(highbd_horizontal_filter_4x1_f4, bd, (sx4 + beta * (k - 3)), alpha); } } } else { if (beta == 0) { if (alpha == 0) { APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, sx4); } else { APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha); } } else { if (alpha == 0) { APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f1, bd, (sx4 + beta * (k - 3))); } else { APPLY_HORIZONTAL_SHIFT_8X1(highbd_horizontal_filter_8x1_f8, bd, (sx4 + beta * (k - 3)), alpha); } } } #undef APPLY_HORIZONTAL_SHIFT_4X1 #undef APPLY_HORIZONTAL_SHIFT_8X1 } static AOM_FORCE_INLINE void highbd_vertical_filter_4x1_f4( uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy) : vertical_filter_4x1_f4(tmp, sy, gamma); const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); uint16_t *dst16 = &pred[i * p_stride + j]; if (!is_compound) { const int reduce_bits_vert = 2 * FILTER_BITS - round0; sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); const int res_sub_const = (1 << (bd - 1)) + (1 << bd); sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); vst1_u16(dst16, res0); return; } sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); uint16_t *p = &dst[i * dst_stride + j]; if (!do_average) { vst1_u16(p, vqmovun_s32(sum0)); return; } uint16x4_t p0 = vld1_u16(p); int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); if (use_dist_wtd_comp_avg) { p_vec0 = vmulq_n_s32(p_vec0, fwd); p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); } else { p_vec0 = vhaddq_s32(p_vec0, sum0); } const int offset_bits = bd + 2 * FILTER_BITS - round0; const int round1 = COMPOUND_ROUND1_BITS; const int res_sub_const = (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); const int round_bits = 2 * FILTER_BITS - round0 - round1; p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); vst1_u16(dst16, res0); } static AOM_FORCE_INLINE void highbd_vertical_filter_8x1_f8( uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy) : vertical_filter_8x1_f8(tmp, sy, gamma); int32x4_t sum0 = sums.val[0]; int32x4_t sum1 = sums.val[1]; const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); uint16_t *dst16 = &pred[i * p_stride + j]; if (!is_compound) { const int reduce_bits_vert = 2 * FILTER_BITS - round0; sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); const int res_sub_const = (1 << (bd - 1)) + (1 << bd); sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); vst1_u16(dst16, res0); vst1_u16(dst16 + 4, res1); return; } sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); uint16_t *p = &dst[i * dst_stride + j]; if (!do_average) { vst1_u16(p, vqmovun_s32(sum0)); vst1_u16(p + 4, vqmovun_s32(sum1)); return; } uint16x8_t p0 = vld1q_u16(p); int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); if (use_dist_wtd_comp_avg) { p_vec0 = vmulq_n_s32(p_vec0, fwd); p_vec1 = vmulq_n_s32(p_vec1, fwd); p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); } else { p_vec0 = vhaddq_s32(p_vec0, sum0); p_vec1 = vhaddq_s32(p_vec1, sum1); } const int offset_bits = bd + 2 * FILTER_BITS - round0; const int round1 = COMPOUND_ROUND1_BITS; const int res_sub_const = (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); const int round_bits = 2 * FILTER_BITS - round0 - round1; p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); vst1_u16(dst16, res0); vst1_u16(dst16 + 4, res1); } static AOM_FORCE_INLINE void warp_affine_vertical( uint16_t *pred, int p_width, int p_height, int p_stride, int bd, uint16_t *dst, int dst_stride, bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, const int16x8_t *tmp, int i, int sy4, int j) { int limit_height = p_height > 4 ? 
8 : 4; if (p_width > 4) { // p_width == 8 for (int k = 0; k < limit_height; ++k) { int sy = sy4 + delta * k; highbd_vertical_filter_8x1_f8( pred, p_stride, bd, dst, dst_stride, is_compound, do_average, use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); } } else { // p_width == 4 for (int k = 0; k < limit_height; ++k) { int sy = sy4 + delta * k; highbd_vertical_filter_4x1_f4( pred, p_stride, bd, dst, dst_stride, is_compound, do_average, use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); } } } static AOM_FORCE_INLINE void highbd_warp_affine_common( const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { uint16_t *const dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; const bool is_compound = conv_params->is_compound; const bool do_average = conv_params->do_average; const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int fwd = conv_params->fwd_offset; const int bwd = conv_params->bck_offset; assert(IMPLIES(is_compound, dst != NULL)); for (int i = 0; i < p_height; i += 8) { for (int j = 0; j < p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, // then convert back to the original coordinates (if necessary) const int32_t src_x = (j + 4 + p_col) << subsampling_x; const int32_t src_y = (i + 4 + p_row) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4); sy4 += gamma * (-4) + delta * (-4); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Each horizontal filter result is formed by the sum of up to eight // multiplications by filter values and then a shift. Although both the // inputs and filters are loaded as int16, the input data is at most bd // bits and the filters are at most 8 bits each. Additionally since we // know all possible filter values we know that the sum of absolute // filter values will fit in at most 9 bits. With this in mind we can // conclude that the sum of each filter application will fit in bd + 9 // bits. The shift following the summation is ROUND0_BITS (which is 3), // +2 for 12-bit, which gives us a final storage of: // bd == 8: ( 8 + 9) - 3 => 14 bits // bd == 10: (10 + 9) - 3 => 16 bits // bd == 12: (12 + 9) - 5 => 16 bits // So it is safe to use int16x8_t as the intermediate storage type here. 
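      // The vertical pass applies an 8-tap filter to produce up to 8 output
      // rows per block, so the horizontal pass below must fill
      // 8 + 8 - 1 = 15 intermediate rows of int16x8_t.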
int16x8_t tmp[15]; warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta, iy4, sx4, ix4, tmp, bd); warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst, dst_stride, is_compound, do_average, use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp, i, sy4, j); } } } #endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ aom-3.12.1/av1/common/arm/highbd_warp_plane_sve.c000066400000000000000000000250201477627663500216140ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "av1/common/scale.h" #include "av1/common/warped_motion.h" #include "config/av1_rtcd.h" #include "highbd_warp_plane_neon.h" static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f4(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); res = vrshlq_s32(res, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f8( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]); int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]); int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]); int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); int64x2_t m45 = vpaddq_s64(m4, m5); int64x2_t m67 = vpaddq_s64(m6, m7); const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_4x1_f1(int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int bd, int sx) { int16x8_t f = load_filters_1(sx); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); res = vrshlq_s32(res, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } static AOM_FORCE_INLINE int16x8_t highbd_horizontal_filter_8x1_f1( int16x8_t rv0, int16x8_t rv1, int16x8_t rv2, int16x8_t rv3, int16x8_t rv4, int16x8_t rv5, int16x8_t rv6, int16x8_t rv7, int bd, int sx) { int16x8_t f = load_filters_1(sx); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f); int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f); int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f); int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); int64x2_t m45 = vpaddq_s64(m4, m5); int64x2_t m67 = vpaddq_s64(m6, m7); const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; const int offset_bits_horiz = bd + FILTER_BITS - 1; int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); // No benefit to using SDOT here, the cost of rearrangement is too high. 
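  // With a single filter (f1) the data in tmp[] stays column-ordered, so
  // lane-indexed vmlal_lane_s16 accumulates directly; the dot-product path
  // (see vertical_filter_4x1_f4 below) first transposes so that each vector
  // holds all eight taps' worth of samples for one output pixel.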
int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); return m0123; } static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); // No benefit to using SDOT here, the cost of rearrangement is too high. int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); return (int32x4x2_t){ { m0123, m4567 } }; } static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); int16x8_t f[4]; load_filters_4(f, sy, gamma); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, int gamma) { int16x8_t s0 = tmp[0]; int16x8_t s1 = tmp[1]; int16x8_t s2 = tmp[2]; int16x8_t s3 = tmp[3]; int16x8_t s4 = tmp[4]; int16x8_t s5 = tmp[5]; int16x8_t s6 = tmp[6]; int16x8_t s7 = tmp[7]; transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t f[8]; load_filters_8(f, sy, gamma); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); 
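  // Each aom_sdotq_s16 above leaves two 64-bit partial sums per output pixel;
  // the pairwise additions merge them so every 64-bit lane holds one
  // fully-reduced sum before narrowing back to 32 bits.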
int64x2_t m45 = vpaddq_s64(m4, m5); int64x2_t m67 = vpaddq_s64(m6, m7); int32x4x2_t ret; ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); return ret; } void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params, alpha, beta, gamma, delta); } aom-3.12.1/av1/common/arm/highbd_wiener_convolve_neon.c000066400000000000000000000626771477627663500230550ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/mem_neon.h" #include "av1/common/convolve.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" #define HBD_WIENER_5TAP_HORIZ(name, shift) \ static inline uint16x8_t name##_wiener_convolve5_8_2d_h( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ /* Wiener filter is symmetric so add mirrored source elements. */ \ int16x8_t s04 = vaddq_s16(s0, s4); \ int16x8_t s13 = vaddq_s16(s1, s3); \ \ /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
*/ \ int32x4_t sum_lo = \ vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \ \ int32x4_t sum_hi = \ vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \ \ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ \ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ } \ \ static inline void name##_convolve_add_src_5tap_horiz( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ do { \ const int16_t *s = (int16_t *)src_ptr; \ uint16_t *d = dst_ptr; \ int width = w; \ \ do { \ int16x8_t s0, s1, s2, s3, s4; \ load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \ \ uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \ s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \ \ vst1q_u16(d, d0); \ \ s += 8; \ d += 8; \ width -= 8; \ } while (width != 0); \ src_ptr += src_stride; \ dst_ptr += dst_stride; \ } while (--h != 0); \ } HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS) HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) #undef HBD_WIENER_5TAP_HORIZ #define HBD_WIENER_7TAP_HORIZ(name, shift) \ static inline uint16x8_t name##_wiener_convolve7_8_2d_h( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \ const uint16x8_t im_max_val) { \ /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ int16x8_t s06 = vaddq_s16(s0, s6); \ int16x8_t s15 = vaddq_s16(s1, s5); \ int16x8_t s24 = vaddq_s16(s2, s4); \ \ int32x4_t sum_lo = \ vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \ \ int32x4_t sum_hi = \ vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \ \ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ \ return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ } \ \ static inline void name##_convolve_add_src_7tap_horiz( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ const int32x4_t round_vec, const uint16x8_t im_max_val) { \ do { \ const int16_t *s = (int16_t *)src_ptr; \ uint16_t *d = dst_ptr; \ int width = w; \ \ do { \ int16x8_t s0, s1, s2, s3, s4, s5, s6; \ load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ \ uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \ s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \ \ vst1q_u16(d, d0); \ \ s += 8; \ d += 8; \ width -= 8; \ } while (width != 0); \ src_ptr += src_stride; \ dst_ptr += dst_stride; \ } while (--h != 0); \ } HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS) HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) #undef HBD_WIENER_7TAP_HORIZ #define HBD_WIENER_5TAP_VERT(name, shift) \ static inline uint16x8_t name##_wiener_convolve5_8_2d_v( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ /* Wiener filter is symmetric so add mirrored source elements. */ \ int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \ int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \ \ /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
*/ \ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \ sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \ sum_lo = \ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \ \ int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \ int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \ \ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \ sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \ sum_hi = \ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \ \ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ \ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ } \ \ static inline void name##_convolve_add_src_5tap_vert( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ do { \ const int16_t *s = (int16_t *)src_ptr; \ uint16_t *d = dst_ptr; \ int height = h; \ \ while (height > 3) { \ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \ load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \ \ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ uint16x8_t d1 = name##_wiener_convolve5_8_2d_v( \ s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \ uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \ s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \ s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ \ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ \ s += 4 * src_stride; \ d += 4 * dst_stride; \ height -= 4; \ } \ \ while (height-- != 0) { \ int16x8_t s0, s1, s2, s3, s4; \ load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \ \ uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ \ vst1q_u16(d, d0); \ \ s += src_stride; \ d += dst_stride; \ } \ \ src_ptr += 8; \ dst_ptr += 8; \ w -= 8; \ } while (w != 0); \ } HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) #undef HBD_WIENER_5TAP_VERT #define HBD_WIENER_7TAP_VERT(name, shift) \ static inline uint16x8_t name##_wiener_convolve7_8_2d_v( \ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \ const uint16x8_t res_max_val) { \ const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \ int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \ int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \ \ int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \ sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \ sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \ sum_lo = \ vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \ \ int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \ int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \ int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \ \ int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \ sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \ sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \ sum_hi = \ vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \ \ uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ \ return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ } \ \ static inline void name##_convolve_add_src_7tap_vert( \ const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ const int32x4_t round_vec, const uint16x8_t res_max_val) { \ do { \ const int16_t *s = (int16_t *)src_ptr; \ uint16_t *d = dst_ptr; \ int height = h; \ \ while (height > 3) { \ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \ load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \ &s8, &s9); \ \ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \ s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ uint16x8_t d2 = name##_wiener_convolve7_8_2d_v( \ s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \ uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \ s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \ \ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ \ s += 4 * src_stride; \ d += 4 * dst_stride; \ height -= 4; \ } \ \ while (height-- != 0) { \ int16x8_t s0, s1, s2, s3, s4, s5, s6; \ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ \ uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ \ vst1q_u16(d, d0); \ \ s += src_stride; \ d += dst_stride; \ } \ \ src_ptr += 8; \ dst_ptr += 8; \ w -= 8; \ } while (w != 0); \ } HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) #undef HBD_WIENER_7TAP_VERT static inline int get_wiener_filter_taps(const int16_t *filter) { assert(filter[7] == 0); if (filter[0] == 0 && filter[6] == 0) { return WIENER_WIN_REDUCED; } return WIENER_WIN; } void av1_highbd_wiener_convolve_add_src_neon( const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4, const int16_t *y_filter, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd) { (void)x_step_q4; (void)y_step_q4; assert(w % 8 == 0); assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); assert(x_step_q4 == 16 && y_step_q4 == 16); assert(x_filter[7] == 0 && y_filter[7] == 0); DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); const int x_filter_taps = 
get_wiener_filter_taps(x_filter); const int y_filter_taps = get_wiener_filter_taps(y_filter); int16x4_t x_filter_s16 = vld1_s16(x_filter); int16x4_t y_filter_s16 = vld1_s16(y_filter); // Add 128 to tap 3. (Needed for rounding.) x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); const int im_stride = MAX_SB_SIZE; const int im_h = h + y_filter_taps - 1; const int horiz_offset = x_filter_taps / 2; const int vert_offset = (y_filter_taps / 2) * (int)src_stride; const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(conv_params->round_0, bd); const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1); const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1); const int32x4_t vert_round_vec = vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1))); uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); if (bd == 12) { if (x_filter_taps == WIENER_WIN_REDUCED) { highbd_12_convolve_add_src_5tap_horiz( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } else { highbd_12_convolve_add_src_7tap_horiz( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } if (y_filter_taps == WIENER_WIN_REDUCED) { highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec, res_max_val); } else { highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec, res_max_val); } } else { if (x_filter_taps == WIENER_WIN_REDUCED) { highbd_convolve_add_src_5tap_horiz( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } else { highbd_convolve_add_src_7tap_horiz( src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } if (y_filter_taps == WIENER_WIN_REDUCED) { highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec, res_max_val); } else { highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec, res_max_val); } } } aom-3.12.1/av1/common/arm/reconinter_neon.c000066400000000000000000000161571477627663500205040ustar00rootroot00000000000000/* * * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "config/av1_rtcd.h" static inline void diffwtd_mask_d16_neon(uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { const int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round)); if (w >= 16) { int i = 0; do { int j = 0; do { uint16x8_t s0_lo = vld1q_u16(src0 + j); uint16x8_t s1_lo = vld1q_u16(src1 + j); uint16x8_t s0_hi = vld1q_u16(src0 + j + 8); uint16x8_t s1_hi = vld1q_u16(src1 + j + 8); uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec); uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec); uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2); uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2); uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); uint8x16_t m; if (inverse) { m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 } else { m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); } vst1q_u8(mask, m); mask += 16; j += 16; } while (j < w); src0 += src0_stride; src1 += src1_stride; } while (++i < h); } else if (w == 8) { int i = 0; do { uint16x8_t s0 = vld1q_u16(src0); uint16x8_t s1 = vld1q_u16(src1); uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0 } else { m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64)); } vst1_u8(mask, m); mask += 8; src0 += src0_stride; src1 += src1_stride; } while (++i < h); } else if (w == 4) { int i = 0; do { uint16x8_t s0 = vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride)); uint16x8_t s1 = vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride)); uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec); uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); uint8x8_t m; if (inverse) { m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0 } else { m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64)); } vst1_u8(mask, m); mask += 8; src0 += 2 * src0_stride; src1 += 2 * src1_stride; i += 2; } while (i < h); } } void av1_build_compound_diffwtd_mask_d16_neon( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { assert(h >= 4); assert(w >= 4); assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38)); if (mask_type == DIFFWTD_38) { diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1, src1_stride, h, w, conv_params, bd); } else { // mask_type == DIFFWTD_38_INV diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1, src1_stride, h, w, conv_params, bd); } } static inline void diffwtd_mask_neon(uint8_t *mask, const bool inverse, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) { if (w >= 16) { int i = 0; do { int j = 0; do { uint8x16_t s0 = vld1q_u8(src0 + j); uint8x16_t s1 = vld1q_u8(src1 + j); uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); uint8x16_t m; if (inverse) { m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 } else { m = 
vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); } vst1q_u8(mask, m); mask += 16; j += 16; } while (j < w); src0 += src0_stride; src1 += src1_stride; } while (++i < h); } else if (w == 8) { int i = 0; do { uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride)); uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src0_stride)); uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); uint8x16_t m; if (inverse) { m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 } else { m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); } vst1q_u8(mask, m); mask += 16; src0 += 2 * src0_stride; src1 += 2 * src1_stride; i += 2; } while (i < h); } else if (w == 4) { int i = 0; do { uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride); uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride); uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); uint8x16_t m; if (inverse) { m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 } else { m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); } vst1q_u8(mask, m); mask += 16; src0 += 4 * src0_stride; src1 += 4 * src1_stride; i += 4; } while (i < h); } } void av1_build_compound_diffwtd_mask_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) { assert(h % 4 == 0); assert(w % 4 == 0); assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38); if (mask_type == DIFFWTD_38) { diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1, src1_stride, h, w); } else { // mask_type == DIFFWTD_38_INV diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1, src1_stride, h, w); } } aom-3.12.1/av1/common/arm/reconintra_neon.c000066400000000000000000000311631477627663500204720ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #define MAX_UPSAMPLE_SZ 16 // TODO(aomedia:349436249): enable for armv7 after SIGBUS is fixed. #if AOM_ARCH_AARCH64 // These kernels are a transposed version of those defined in reconintra.c, // with the absolute value of the negatives taken in the top row. 
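// Because the negated top-row coefficients are stored as absolute values,
// av1_filter_intra_predictor_neon below applies row 0 with vmlsl_u8
// (multiply-subtract) and the remaining rows with vmlal_u8, restoring the
// original signs of the filter taps.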
DECLARE_ALIGNED(16, const uint8_t, av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = { // clang-format off { { 6, 5, 3, 3, 4, 3, 3, 3 }, { 10, 2, 1, 1, 6, 2, 2, 1 }, { 0, 10, 1, 1, 0, 6, 2, 2 }, { 0, 0, 10, 2, 0, 0, 6, 2 }, { 0, 0, 0, 10, 0, 0, 0, 6 }, { 12, 9, 7, 5, 2, 2, 2, 3 }, { 0, 0, 0, 0, 12, 9, 7, 5 } }, { { 10, 6, 4, 2, 10, 6, 4, 2 }, { 16, 0, 0, 0, 16, 0, 0, 0 }, { 0, 16, 0, 0, 0, 16, 0, 0 }, { 0, 0, 16, 0, 0, 0, 16, 0 }, { 0, 0, 0, 16, 0, 0, 0, 16 }, { 10, 6, 4, 2, 0, 0, 0, 0 }, { 0, 0, 0, 0, 10, 6, 4, 2 } }, { { 8, 8, 8, 8, 4, 4, 4, 4 }, { 8, 0, 0, 0, 4, 0, 0, 0 }, { 0, 8, 0, 0, 0, 4, 0, 0 }, { 0, 0, 8, 0, 0, 0, 4, 0 }, { 0, 0, 0, 8, 0, 0, 0, 4 }, { 16, 16, 16, 16, 0, 0, 0, 0 }, { 0, 0, 0, 0, 16, 16, 16, 16 } }, { { 2, 1, 1, 0, 1, 1, 1, 1 }, { 8, 3, 2, 1, 4, 3, 2, 2 }, { 0, 8, 3, 2, 0, 4, 3, 2 }, { 0, 0, 8, 3, 0, 0, 4, 3 }, { 0, 0, 0, 8, 0, 0, 0, 4 }, { 10, 6, 4, 2, 3, 4, 4, 3 }, { 0, 0, 0, 0, 10, 6, 4, 3 } }, { { 12, 10, 9, 8, 10, 9, 8, 7 }, { 14, 0, 0, 0, 12, 1, 0, 0 }, { 0, 14, 0, 0, 0, 12, 0, 0 }, { 0, 0, 14, 0, 0, 0, 12, 1 }, { 0, 0, 0, 14, 0, 0, 0, 12 }, { 14, 12, 11, 10, 0, 0, 1, 1 }, { 0, 0, 0, 0, 14, 12, 11, 9 } } // clang-format on }; #define FILTER_INTRA_SCALE_BITS 4 void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode) { const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; assert(width <= 32 && height <= 32); const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]); const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]); const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]); const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]); const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]); const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]); const uint8x8_t f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]); uint8_t buffer[33][33]; // Populate the top row in the scratch buffer with data from above. memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t)); // Populate the first column in the scratch buffer with data from the left. int r = 0; do { buffer[r + 1][0] = left[r]; } while (++r < height); // Computing 4 cols per iteration (instead of 8) for 8x blocks is faster. if (width <= 8) { r = 1; do { int c = 1; uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]); uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]); uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]); do { uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1); uint8x8_t s1 = vdup_lane_u8(s1234, 0); uint8x8_t s2 = vdup_lane_u8(s1234, 1); uint8x8_t s3 = vdup_lane_u8(s1234, 2); uint8x8_t s4 = vdup_lane_u8(s1234, 3); uint16x8_t sum = vmull_u8(s1, f1); // First row of each filter has all negative values so subtract. sum = vmlsl_u8(sum, s0, f0); sum = vmlal_u8(sum, s2, f2); sum = vmlal_u8(sum, s3, f3); sum = vmlal_u8(sum, s4, f4); sum = vmlal_u8(sum, s5, f5); sum = vmlal_u8(sum, s6, f6); uint8x8_t res = vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS); // Store buffer[r + 0][c] and buffer[r + 1][c]. 
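        // res packs two rows of four pixels: lanes 0-3 are row r and lanes
        // 4-7 are row r + 1, so a single strided 4-byte store updates both
        // the scratch buffer (row stride 33) and the destination.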
store_u8x4_strided_x2(&buffer[r][c], 33, res); store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res); s0 = s4; s5 = vdup_lane_u8(res, 3); s6 = vdup_lane_u8(res, 7); c += 4; } while (c < width + 1); r += 2; } while (r < height + 1); } else { r = 1; do { int c = 1; uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]); uint8x8_t s5_lo = vld1_dup_u8(&buffer[r + 0][c - 1]); uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]); do { uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1); uint8x8_t s1_lo = vdup_lane_u8(s1234, 0); uint8x8_t s2_lo = vdup_lane_u8(s1234, 1); uint8x8_t s3_lo = vdup_lane_u8(s1234, 2); uint8x8_t s4_lo = vdup_lane_u8(s1234, 3); uint16x8_t sum_lo = vmull_u8(s1_lo, f1); // First row of each filter has all negative values so subtract. sum_lo = vmlsl_u8(sum_lo, s0_lo, f0); sum_lo = vmlal_u8(sum_lo, s2_lo, f2); sum_lo = vmlal_u8(sum_lo, s3_lo, f3); sum_lo = vmlal_u8(sum_lo, s4_lo, f4); sum_lo = vmlal_u8(sum_lo, s5_lo, f5); sum_lo = vmlal_u8(sum_lo, s6_lo, f6); uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo), FILTER_INTRA_SCALE_BITS); uint8x8_t s0_hi = s4_lo; uint8x8_t s1_hi = vdup_lane_u8(s1234, 4); uint8x8_t s2_hi = vdup_lane_u8(s1234, 5); uint8x8_t s3_hi = vdup_lane_u8(s1234, 6); uint8x8_t s4_hi = vdup_lane_u8(s1234, 7); uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3); uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7); uint16x8_t sum_hi = vmull_u8(s1_hi, f1); // First row of each filter has all negative values so subtract. sum_hi = vmlsl_u8(sum_hi, s0_hi, f0); sum_hi = vmlal_u8(sum_hi, s2_hi, f2); sum_hi = vmlal_u8(sum_hi, s3_hi, f3); sum_hi = vmlal_u8(sum_hi, s4_hi, f4); sum_hi = vmlal_u8(sum_hi, s5_hi, f5); sum_hi = vmlal_u8(sum_hi, s6_hi, f6); uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi), FILTER_INTRA_SCALE_BITS); uint32x2x2_t res = vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi)); vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0])); vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1])); vst1_u8(dst + (r - 1) * stride + c - 1, vreinterpret_u8_u32(res.val[0])); vst1_u8(dst + (r + 0) * stride + c - 1, vreinterpret_u8_u32(res.val[1])); s0_lo = s4_hi; s5_lo = vdup_lane_u8(res_hi, 3); s6_lo = vdup_lane_u8(res_hi, 7); c += 8; } while (c < width + 1); r += 2; } while (r < height + 1); } } #endif // AOM_ARCH_AARCH64 void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) { if (!strength) return; assert(sz >= 0 && sz <= 129); uint8_t edge[160]; // Max value of sz + enough padding for vector accesses. memcpy(edge + 1, p, sz * sizeof(*p)); // Populate extra space appropriately. edge[0] = edge[1]; edge[sz + 1] = edge[sz]; edge[sz + 2] = edge[sz]; // Don't overwrite first pixel. uint8_t *dst = p + 1; sz--; if (strength == 1) { // Filter: {4, 8, 4}. const uint8_t *src = edge + 1; while (sz >= 8) { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); // Make use of the identity: // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 uint16x8_t t0 = vaddl_u8(s0, s2); uint16x8_t t1 = vaddl_u8(s1, s1); uint16x8_t sum = vaddq_u16(t0, t1); uint8x8_t res = vrshrn_n_u16(sum, 2); vst1_u8(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint16x8_t t0 = vaddl_u8(s0, s2); uint16x8_t t1 = vaddl_u8(s1, s1); uint16x8_t sum = vaddq_u16(t0, t1); uint8x8_t res = vrshrn_n_u16(sum, 2); // Mask off out-of-bounds indices. 
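      // Blend the filtered result with the existing pixels: lanes whose index
      // is < sz take the new value, the rest keep the original dst contents,
      // so the final full-vector store never changes out-of-range pixels.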
uint8x8_t current_dst = vld1_u8(dst); uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); res = vbsl_u8(mask, res, current_dst); vst1_u8(dst, res); } } else if (strength == 2) { // Filter: {5, 6, 5}. const uint8_t *src = edge + 1; const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } }; while (sz >= 8) { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint16x8_t accum = vmull_u8(s0, filter.val[0]); accum = vmlal_u8(accum, s1, filter.val[1]); accum = vmlal_u8(accum, s2, filter.val[2]); uint8x8_t res = vrshrn_n_u16(accum, 4); vst1_u8(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint16x8_t accum = vmull_u8(s0, filter.val[0]); accum = vmlal_u8(accum, s1, filter.val[1]); accum = vmlal_u8(accum, s2, filter.val[2]); uint8x8_t res = vrshrn_n_u16(accum, 4); // Mask off out-of-bounds indices. uint8x8_t current_dst = vld1_u8(dst); uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); res = vbsl_u8(mask, res, current_dst); vst1_u8(dst, res); } } else { // Filter {2, 4, 4, 4, 2}. const uint8_t *src = edge; while (sz >= 8) { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint8x8_t s3 = vld1_u8(src + 3); uint8x8_t s4 = vld1_u8(src + 4); // Make use of the identity: // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 uint16x8_t t0 = vaddl_u8(s0, s4); uint16x8_t t1 = vaddl_u8(s1, s2); t1 = vaddw_u8(t1, s3); t1 = vaddq_u16(t1, t1); uint16x8_t sum = vaddq_u16(t0, t1); uint8x8_t res = vrshrn_n_u16(sum, 3); vst1_u8(dst, res); src += 8; dst += 8; sz -= 8; } if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint8x8_t s3 = vld1_u8(src + 3); uint8x8_t s4 = vld1_u8(src + 4); uint16x8_t t0 = vaddl_u8(s0, s4); uint16x8_t t1 = vaddl_u8(s1, s2); t1 = vaddw_u8(t1, s3); t1 = vaddq_u16(t1, t1); uint16x8_t sum = vaddq_u16(t0, t1); uint8x8_t res = vrshrn_n_u16(sum, 3); // Mask off out-of-bounds indices. uint8x8_t current_dst = vld1_u8(dst); uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); res = vbsl_u8(mask, res, current_dst); vst1_u8(dst, res); } } } void av1_upsample_intra_edge_neon(uint8_t *p, int sz) { if (!sz) return; assert(sz <= MAX_UPSAMPLE_SZ); uint8_t edge[MAX_UPSAMPLE_SZ + 3]; const uint8_t *src = edge; // Copy p[-1..(sz-1)] and pad out both ends. edge[0] = p[-1]; edge[1] = p[-1]; memcpy(edge + 2, p, sz); edge[sz + 2] = p[sz - 1]; p[-2] = p[-1]; uint8_t *dst = p - 1; do { uint8x8_t s0 = vld1_u8(src); uint8x8_t s1 = vld1_u8(src + 1); uint8x8_t s2 = vld1_u8(src + 2); uint8x8_t s3 = vld1_u8(src + 3); int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3)); int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2)); t1 = vmulq_n_s16(t1, 9); t1 = vsubq_s16(t1, t0); uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } }; vst2_u8(dst, res); src += 8; dst += 16; sz -= 8; } while (sz > 0); } aom-3.12.1/av1/common/arm/resize_neon.c000066400000000000000000000730021477627663500176250ustar00rootroot00000000000000/* * * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/arm/resize_neon.h" #include "av1/common/resize.h" #include "config/aom_scale_rtcd.h" #include "config/av1_rtcd.h" static inline void scale_plane_2_to_1_phase_0(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h) { assert(w > 0 && h > 0); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { const uint8x16x2_t s0 = vld2q_u8(s); vst1q_u8(d, s0.val[0]); s += 32; d += 16; width -= 16; } while (width > 0); src += 2 * src_stride; dst += dst_stride; } while (--h != 0); } static inline void scale_plane_4_to_1_phase_0(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h) { assert(w > 0 && h > 0); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { const uint8x16x4_t s0 = vld4q_u8(s); vst1q_u8(d, s0.val[0]); s += 64; d += 16; width -= 16; } while (width > 0); src += 4 * src_stride; dst += dst_stride; } while (--h != 0); } static inline uint8x16_t scale_plane_bilinear_kernel( const uint8x16_t s0_even, const uint8x16_t s0_odd, const uint8x16_t s1_even, const uint8x16_t s1_odd, const uint8x8_t filter0, const uint8x8_t filter1) { // A shim of 1 << (FILTER_BITS - 1) enables us to use non-rounding // shifts - which are generally faster than rounding shifts on modern CPUs. uint16x8_t offset = vdupq_n_u16(1 << (FILTER_BITS - 1)); // Horizontal filtering uint16x8_t h0_lo = vmlal_u8(offset, vget_low_u8(s0_even), filter0); uint16x8_t h0_hi = vmlal_u8(offset, vget_high_u8(s0_even), filter0); uint16x8_t h1_lo = vmlal_u8(offset, vget_low_u8(s1_even), filter0); uint16x8_t h1_hi = vmlal_u8(offset, vget_high_u8(s1_even), filter0); h0_lo = vmlal_u8(h0_lo, vget_low_u8(s0_odd), filter1); h0_hi = vmlal_u8(h0_hi, vget_high_u8(s0_odd), filter1); h1_lo = vmlal_u8(h1_lo, vget_low_u8(s1_odd), filter1); h1_hi = vmlal_u8(h1_hi, vget_high_u8(s1_odd), filter1); const uint8x8_t h0_lo_u8 = vshrn_n_u16(h0_lo, FILTER_BITS); const uint8x8_t h0_hi_u8 = vshrn_n_u16(h0_hi, FILTER_BITS); const uint8x8_t h1_lo_u8 = vshrn_n_u16(h1_lo, FILTER_BITS); const uint8x8_t h1_hi_u8 = vshrn_n_u16(h1_hi, FILTER_BITS); // Vertical filtering uint16x8_t v_lo = vmlal_u8(offset, h0_lo_u8, filter0); uint16x8_t v_hi = vmlal_u8(offset, h0_hi_u8, filter0); v_lo = vmlal_u8(v_lo, h1_lo_u8, filter1); v_hi = vmlal_u8(v_hi, h1_hi_u8, filter1); return vcombine_u8(vshrn_n_u16(v_lo, FILTER_BITS), vshrn_n_u16(v_hi, FILTER_BITS)); } static inline void scale_plane_2_to_1_bilinear( const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h, const int16_t f0, const int16_t f1) { assert(w > 0 && h > 0); const uint8x8_t filter0 = vdup_n_u8(f0); const uint8x8_t filter1 = vdup_n_u8(f1); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { const uint8x16x2_t s0 = vld2q_u8(s + 0 * src_stride); const uint8x16x2_t s1 = vld2q_u8(s + 1 * src_stride); uint8x16_t d0 = scale_plane_bilinear_kernel( s0.val[0], s0.val[1], s1.val[0], s1.val[1], filter0, filter1); vst1q_u8(d, d0); s += 32; d += 16; width -= 16; } while (width > 0); src += 2 * src_stride; dst += dst_stride; } while (--h != 0); } static inline void 
scale_plane_4_to_1_bilinear( const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, int w, int h, const int16_t f0, const int16_t f1) { assert(w > 0 && h > 0); const uint8x8_t filter0 = vdup_n_u8(f0); const uint8x8_t filter1 = vdup_n_u8(f1); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { const uint8x16x4_t s0 = vld4q_u8(s + 0 * src_stride); const uint8x16x4_t s1 = vld4q_u8(s + 1 * src_stride); uint8x16_t d0 = scale_plane_bilinear_kernel( s0.val[0], s0.val[1], s1.val[0], s1.val[1], filter0, filter1); vst1q_u8(d, d0); s += 64; d += 16; width -= 16; } while (width > 0); src += 4 * src_stride; dst += dst_stride; } while (--h != 0); } static inline void scale_2_to_1_horiz_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { do { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); const uint8_t *s = src + 6; uint8_t *d = dst; int width = w; do { uint8x8_t t8, t9, t10, t11, t12, t13; load_u8_8x8(s, src_stride, &t6, &t7, &t8, &t9, &t10, &t11, &t12, &t13); transpose_elems_inplace_u8_8x8(&t6, &t7, &t8, &t9, &t10, &t11, &t12, &t13); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); uint8x8_t d1 = scale_filter6_8(s2, s3, s4, s5, s6, s7, filters); uint8x8_t d2 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); uint8x8_t d3 = scale_filter6_8(s6, s7, s8, s9, s10, s11, filters); transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); store_u8x4_strided_x2(d + 0 * dst_stride, 4 * dst_stride, d0); store_u8x4_strided_x2(d + 1 * dst_stride, 4 * dst_stride, d1); store_u8x4_strided_x2(d + 2 * dst_stride, 4 * dst_stride, d2); store_u8x4_strided_x2(d + 3 * dst_stride, 4 * dst_stride, d3); s0 = s8; s1 = s9; s2 = s10; s3 = s11; s4 = s12; s5 = s13; d += 4; s += 8; width -= 4; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_2_to_1_6tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 2 * h + SUBPEL_TAPS - 3; const int im_stride = (w + 3) & ~3; // All filter values are even, halve them to stay in 16-bit elements when // applying filter. 
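// Halving is safe because the taps are even, and it buys int16 headroom: an
// 8-bit sample multiplied by the full-precision taps can push the int16
// accumulator in scale_filter6_8 out of range, whereas with the halved taps
// the partial sums stay within int16. The final rounding shift there is
// FILTER_BITS - 1 instead of FILTER_BITS to compensate.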
const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 2; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; scale_2_to_1_horiz_6tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); scale_2_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); } static inline void scale_4_to_1_horiz_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { do { uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); const uint8_t *s = src + 4; uint8_t *d = dst; int width = w; do { uint8x8_t t8, t9, t10, t11; load_u8_8x8(s, src_stride, &t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); transpose_elems_inplace_u8_8x8(&t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); uint8x8_t d1 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); uint8x8x2_t d01 = vtrn_u8(d0, d1); store_u8x2_strided_x4(d + 0 * dst_stride, 2 * dst_stride, d01.val[0]); store_u8x2_strided_x4(d + 1 * dst_stride, 2 * dst_stride, d01.val[1]); s0 = s8; s1 = s9; s2 = s10; s3 = s11; d += 2; s += 8; width -= 2; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_4_to_1_6tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 4 * h + SUBPEL_TAPS - 3; const int im_stride = (w + 1) & ~1; // All filter values are even, halve them to stay in 16-bit elements when // applying filter. const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 2; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; scale_4_to_1_horiz_6tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); scale_4_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); } static inline uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, const uint8x8_t *const coef) { const uint16x8_t h0 = vmull_u8(s[0], coef[0]); const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); return vrshrn_n_u16(h1, 7); } // Notes for 4 to 3 scaling: // // 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be // multiple of 6, and no less than w. // // 2. 8 rows are calculated in each vertical inner loop, so width_ver must be // multiple of 8, and no less than w. // // 3. 8 columns are calculated in each horizontal inner loop for further // vertical scaling, so height_hor must be multiple of 8, and no less than // 4 * h / 3. // // 4. 
6 columns are calculated in each vertical inner loop, so height_ver must // be multiple of 6, and no less than h. // // 5. The physical location of the last row of the 4 to 3 scaled frame is // decided by phase_scaler, and are always less than 1 pixel below the last row // of the original image. static inline void scale_plane_4_to_3_bilinear( const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int phase_scaler, uint8_t *const temp_buffer) { static const int step_q4 = 16 * 4 / 3; const int width_hor = (w + 5) - ((w + 5) % 6); const int stride_hor = width_hor + 2; // store 2 extra pixels const int width_ver = (w + 7) & ~7; // We only need 1 extra row below because there are only 2 bilinear // coefficients. const int height_hor = (4 * h / 3 + 1 + 7) & ~7; const int height_ver = (h + 5) - ((h + 5) % 6); int x, y = height_hor; uint8_t *t = temp_buffer; uint8x8_t s[9], d[8], c[6]; const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr; assert(w && h); c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]); c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]); c[2] = vdup_n_u8( (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]); c[3] = vdup_n_u8( (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]); c[4] = vdup_n_u8( (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]); c[5] = vdup_n_u8( (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]); d[6] = vdup_n_u8(0); d[7] = vdup_n_u8(0); // horizontal 6x8 do { load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); src += 1; transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); x = width_hor; do { load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); src += 8; transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = scale_filter_bilinear(&s[0], &c[0]); d[1] = scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); d[2] = scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); d[3] = scale_filter_bilinear(&s[4], &c[0]); d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], &c[2]); d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], &c[4]); // 00 01 02 03 04 05 xx xx // 10 11 12 13 14 15 xx xx // 20 21 22 23 24 25 xx xx // 30 31 32 33 34 35 xx xx // 40 41 42 43 44 45 xx xx // 50 51 52 53 54 55 xx xx // 60 61 62 63 64 65 xx xx // 70 71 72 73 74 75 xx xx transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); // store 2 extra pixels vst1_u8(t + 0 * stride_hor, d[0]); vst1_u8(t + 1 * stride_hor, d[1]); vst1_u8(t + 2 * stride_hor, d[2]); vst1_u8(t + 3 * stride_hor, d[3]); vst1_u8(t + 4 * stride_hor, d[4]); vst1_u8(t + 5 * stride_hor, d[5]); vst1_u8(t + 6 * stride_hor, d[6]); vst1_u8(t + 7 * stride_hor, d[7]); s[0] = s[8]; t += 6; x -= 6; } while (x); src += 8 * src_stride - 4 * width_hor / 3 - 1; t += 7 * stride_hor + 2; y -= 8; } while (y); // vertical 8x6 x = width_ver; t = temp_buffer; do { load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); t += stride_hor; y = height_ver; do { load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], 
&s[6], &s[7], &s[8]); t += 8 * stride_hor; d[0] = scale_filter_bilinear(&s[0], &c[0]); d[1] = scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); d[2] = scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); d[3] = scale_filter_bilinear(&s[4], &c[0]); d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], &c[2]); d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], &c[4]); vst1_u8(dst + 0 * dst_stride, d[0]); vst1_u8(dst + 1 * dst_stride, d[1]); vst1_u8(dst + 2 * dst_stride, d[2]); vst1_u8(dst + 3 * dst_stride, d[3]); vst1_u8(dst + 4 * dst_stride, d[4]); vst1_u8(dst + 5 * dst_stride, d[5]); s[0] = s[8]; dst += 6 * dst_stride; y -= 6; } while (y); t -= stride_hor * (4 * height_ver / 3 + 1); t += 8; dst -= height_ver * dst_stride; dst += 8; x -= 8; } while (x); } static inline uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); int16x8_t ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); int16x8_t sum = vmulq_lane_s16(ss0, filter_lo, 0); sum = vmlaq_lane_s16(sum, ss1, filter_lo, 1); sum = vmlaq_lane_s16(sum, ss2, filter_lo, 2); sum = vmlaq_lane_s16(sum, ss5, filter_hi, 1); sum = vmlaq_lane_s16(sum, ss6, filter_hi, 2); sum = vmlaq_lane_s16(sum, ss7, filter_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(ss3, filter_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(ss4, filter_hi, 0)); return vqrshrun_n_s16(sum, FILTER_BITS); } static inline void scale_plane_4_to_3_8tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const InterpKernel *const coef, const int phase_scaler, uint8_t *const temp_buffer) { static const int step_q4 = 16 * 4 / 3; const int width_hor = (w + 5) - ((w + 5) % 6); const int stride_hor = width_hor + 2; // store 2 extra pixels const int width_ver = (w + 7) & ~7; // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows // above and (SUBPEL_TAPS / 2) extra rows below. 
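// With SUBPEL_TAPS == 8 (aom_dsp/aom_filter.h) that is 3 extra rows above and
// 4 below, i.e. 7 in total; the "+ 7) & ~7" below then rounds the
// intermediate height up to a multiple of 8 to match the 8-row steps of the
// horizontal loop.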
const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; const int height_ver = (h + 5) - ((h + 5) % 6); const int16x8_t filters0 = vld1q_s16( (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); const int16x8_t filters1 = vld1q_s16( (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); const int16x8_t filters2 = vld1q_s16( (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); int x, y = height_hor; uint8_t *t = temp_buffer; uint8x8_t s[15], d[8]; assert(w > 0 && h > 0); src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; d[6] = vdup_n_u8(0); d[7] = vdup_n_u8(0); // horizontal 6x8 do { load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); x = width_hor; do { src += 8; load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], &s[14]); transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], &s[14]); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = scale_filter_8(&s[0], filters0); d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); d[3] = scale_filter_8(&s[4], filters0); d[4] = scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); d[5] = scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); // 00 01 02 03 04 05 xx xx // 10 11 12 13 14 15 xx xx // 20 21 22 23 24 25 xx xx // 30 31 32 33 34 35 xx xx // 40 41 42 43 44 45 xx xx // 50 51 52 53 54 55 xx xx // 60 61 62 63 64 65 xx xx // 70 71 72 73 74 75 xx xx transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); // store 2 extra pixels vst1_u8(t + 0 * stride_hor, d[0]); vst1_u8(t + 1 * stride_hor, d[1]); vst1_u8(t + 2 * stride_hor, d[2]); vst1_u8(t + 3 * stride_hor, d[3]); vst1_u8(t + 4 * stride_hor, d[4]); vst1_u8(t + 5 * stride_hor, d[5]); vst1_u8(t + 6 * stride_hor, d[6]); vst1_u8(t + 7 * stride_hor, d[7]); s[0] = s[8]; s[1] = s[9]; s[2] = s[10]; s[3] = s[11]; s[4] = s[12]; s[5] = s[13]; s[6] = s[14]; t += 6; x -= 6; } while (x); src += 8 * src_stride - 4 * width_hor / 3; t += 7 * stride_hor + 2; y -= 8; } while (y); // vertical 8x6 x = width_ver; t = temp_buffer; do { load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); t += 7 * stride_hor; y = height_ver; do { load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], &s[14]); t += 8 * stride_hor; d[0] = scale_filter_8(&s[0], filters0); d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); d[3] = scale_filter_8(&s[4], filters0); d[4] = scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); d[5] = scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); vst1_u8(dst + 0 * dst_stride, d[0]); vst1_u8(dst + 1 * dst_stride, d[1]); vst1_u8(dst + 2 * dst_stride, d[2]); vst1_u8(dst + 3 * dst_stride, d[3]); vst1_u8(dst + 4 * dst_stride, d[4]); vst1_u8(dst + 5 * dst_stride, d[5]); s[0] = s[8]; s[1] = s[9]; s[2] = s[10]; s[3] = s[11]; s[4] = s[12]; s[5] = s[13]; s[6] = s[14]; dst += 6 * dst_stride; y -= 6; } while (y); t -= stride_hor * (4 * height_ver / 3 + 7); t += 8; dst -= height_ver * dst_stride; dst += 8; x -= 8; } while 
(x); } // There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON. static inline bool has_normative_scaler_neon(const int src_width, const int src_height, const int dst_width, const int dst_height) { const bool has_normative_scaler = (2 * dst_width == src_width && 2 * dst_height == src_height) || (4 * dst_width == src_width && 4 * dst_height == src_height) || (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); return has_normative_scaler; } void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes) { assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || filter == EIGHTTAP_REGULAR); bool has_normative_scaler = has_normative_scaler_neon(src->y_crop_width, src->y_crop_height, dst->y_crop_width, dst->y_crop_height); if (num_planes > 1) { has_normative_scaler = has_normative_scaler && has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height, dst->uv_crop_width, dst->uv_crop_height); } if (!has_normative_scaler) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); return; } // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. int malloc_failed = 0; for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; const int src_w = src->crop_widths[is_uv]; const int src_h = src->crop_heights[is_uv]; const int dst_w = dst->crop_widths[is_uv]; const int dst_h = dst->crop_heights[is_uv]; const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; if (2 * dst_w == src_w && 2 * dst_h == src_h) { if (phase == 0) { scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h); } else if (filter == BILINEAR) { const int16_t c0 = av1_bilinear_filters[phase][3]; const int16_t c1 = av1_bilinear_filters[phase][4]; scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, c0, c1); } else { const int buffer_stride = (dst_y_w + 3) & ~3; const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_2_to_1_6tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { if (phase == 0) { scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h); } else if (filter == BILINEAR) { const int16_t c0 = av1_bilinear_filters[phase][3]; const int16_t c1 = av1_bilinear_filters[phase][4]; scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, c0, c1); } else { const int buffer_stride = (dst_y_w + 1) & ~1; const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_1_6tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, 
interp_kernel[phase], temp_buffer); free(temp_buffer); } } else { assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h); // 4 to 3 const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } if (filter == BILINEAR) { scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, phase, temp_buffer); } else { const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_3_8tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel, phase, temp_buffer); } free(temp_buffer); } } if (malloc_failed) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); } else { aom_extend_frame_borders(dst, num_planes); } } aom-3.12.1/av1/common/arm/resize_neon.h000066400000000000000000000117371477627663500176410ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ARM_RESIZE_NEON_H_ #define AOM_AV1_COMMON_ARM_RESIZE_NEON_H_ #include #include "aom_dsp/aom_filter.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" static inline uint8x8_t scale_filter6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, int16x8_t filter) { const int16x4_t filter_lo = vget_low_s16(filter); const int16x4_t filter_hi = vget_high_s16(filter); // Filter values at indices 0 and 7 are 0. int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 1); sum = vmlaq_lane_s16(sum, s1, filter_lo, 2); sum = vmlaq_lane_s16(sum, s2, filter_lo, 3); sum = vmlaq_lane_s16(sum, s3, filter_hi, 0); sum = vmlaq_lane_s16(sum, s4, filter_hi, 1); sum = vmlaq_lane_s16(sum, s5, filter_hi, 2); // We halved the convolution filter values so -1 from the right shift. 
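// Because every tap is even this is exact: with FILTER_BITS == 7
// (aom_dsp/aom_filter.h),
//   (sum(s * f) + 64) >> 7 == (sum(s * (f / 2)) + 32) >> 6,
// which is what vqrshrun_n_s16(sum, FILTER_BITS - 1) computes below.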
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scale_2_to_1_vert_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { do { uint8x8_t t0, t1, t2, t3; load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); const uint8_t *s = src + 4 * src_stride; uint8_t *d = dst; int height = h; do { uint8x8_t t4, t5, t6, t7, t8, t9, t10, t11; load_u8_8x8(s, src_stride, &t4, &t5, &t6, &t7, &t8, &t9, &t10, &t11); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); uint8x8_t d1 = scale_filter6_8(s2, s3, s4, s5, s6, s7, filters); uint8x8_t d2 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); uint8x8_t d3 = scale_filter6_8(s6, s7, s8, s9, s10, s11, filters); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s8; s1 = s9; s2 = s10; s3 = s11; d += 4 * dst_stride; s += 8 * src_stride; height -= 4; } while (height > 0); dst += 8; src += 8; w -= 8; } while (w > 0); } static inline void scale_4_to_1_vert_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { do { uint8x8_t t0 = vld1_u8(src + 0 * src_stride); uint8x8_t t1 = vld1_u8(src + 1 * src_stride); int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); const uint8_t *s = src + 2 * src_stride; uint8_t *d = dst; int height = h; do { uint8x8_t t2, t3, t4, t5, t6, t7, t8, t9; load_u8_8x8(s, src_stride, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); uint8x8_t d0 = scale_filter6_8(s0, s1, s2, s3, s4, s5, filters); uint8x8_t d1 = scale_filter6_8(s4, s5, s6, s7, s8, s9, filters); store_u8_8x2(d, dst_stride, d0, d1); s0 = s8; s1 = s9; s += 8 * src_stride; d += 2 * dst_stride; height -= 2; } while (height > 0); src += 8; dst += 8; w -= 8; } while (w > 0); } #endif // AOM_AV1_COMMON_ARM_RESIZE_NEON_H_ aom-3.12.1/av1/common/arm/resize_neon_dotprod.c000066400000000000000000000323161477627663500213630ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/arm/resize_neon.h" #include "av1/common/resize.h" #include "config/aom_scale_rtcd.h" #include "config/av1_rtcd.h" // clang-format off DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 }; DECLARE_ALIGNED(16, static const uint8_t, kScale4DotProdPermuteTbl[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11 }; // clang-format on static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0, const uint8x16_t s1, const uint8x16x2_t permute_tbl, const int8x8_t filter) { // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128))); int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128))); // Permute samples ready for dot product. int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl.val[0]), vqtbl1q_s8(s0_128, permute_tbl.val[1]), vqtbl1q_s8(s1_128, permute_tbl.val[0]), vqtbl1q_s8(s1_128, permute_tbl.val[1]) }; // Dot product constant: // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 // from every source value. The additional right shift by one is needed // because we halve the filter values. const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 1); // First 4 output values. int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0); sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1); // Second 4 output values. int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[2], filter, 0); sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filter, 1); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. 
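// The accumulators above were seeded with (128 << FILTER_BITS) >> 1 because
// sum(f * (s - 128)) == sum(f * s) - 128 * sum(f), and the halved taps sum to
// (1 << FILTER_BITS) / 2 == 64; adding 128 * 64 back restores the unsigned
// result before the narrowing shift below.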
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scale_2_to_1_horiz_8tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { const int8x8_t filter = vmovn_s16(filters); const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], &s5[0], &s6[0], &s7[0]); load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], &s5[1], &s6[1], &s7[1]); uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filter); uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filter); uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filter); uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filter); uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filter); uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filter); uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filter); uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filter); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; s += 16; width -= 8; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_2_to_1_8tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 2 * h + SUBPEL_TAPS - 3; const int im_stride = (w + 7) & ~7; // All filter values are even, halve them to fit in int8_t when applying // horizontal filter and stay in 16-bit elements when applying vertical // filter. const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; scale_2_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); // We can specialise the vertical filtering for 6-tap filters given that the // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded. scale_2_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride, filters); } static inline uint8x8_t scale_4_to_1_filter8_8( const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const uint8x16_t s3, const uint8x16_t permute_tbl, const int8x8_t filter) { int8x16_t filters = vcombine_s8(filter, filter); // Transform sample range to [-128, 127] for 8-bit signed dot product. int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128))); int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128))); int8x16_t s2_128 = vreinterpretq_s8_u8(vsubq_u8(s2, vdupq_n_u8(128))); int8x16_t s3_128 = vreinterpretq_s8_u8(vsubq_u8(s3, vdupq_n_u8(128))); int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl), vqtbl1q_s8(s1_128, permute_tbl), vqtbl1q_s8(s2_128, permute_tbl), vqtbl1q_s8(s3_128, permute_tbl) }; // Dot product constant: // The shim of 128 << FILTER_BITS is needed because we are subtracting 128 // from every source value. The additional right shift by one is needed // because we halved the filter values and will use a pairwise add. 
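// Each output below is formed from two dot products that are later combined
// with a pairwise add, so each accumulator carries only half of the
// 128 * sum(halved taps) == (128 << FILTER_BITS) >> 1 shim, hence the extra
// shift down to (128 << FILTER_BITS) >> 2.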
const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 2); int32x4_t sum0 = vdotq_s32(acc, perm_samples[0], filters); int32x4_t sum1 = vdotq_s32(acc, perm_samples[1], filters); int32x4_t sum2 = vdotq_s32(acc, perm_samples[2], filters); int32x4_t sum3 = vdotq_s32(acc, perm_samples[3], filters); int32x4_t sum01 = vpaddq_s32(sum0, sum1); int32x4_t sum23 = vpaddq_s32(sum2, sum3); int16x8_t sum = vcombine_s16(vmovn_s32(sum01), vmovn_s32(sum23)); // We halved the filter values so -1 from right shift. return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scale_4_to_1_horiz_8tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { const int8x8_t filter = vmovn_s16(filters); const uint8x16_t permute_tbl = vld1q_u8(kScale4DotProdPermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_16x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = scale_4_to_1_filter8_8(s0, s1, s2, s3, permute_tbl, filter); uint8x8_t d1 = scale_4_to_1_filter8_8(s4, s5, s6, s7, permute_tbl, filter); store_u8x2_strided_x4(d + 0 * dst_stride, dst_stride, d0); store_u8x2_strided_x4(d + 4 * dst_stride, dst_stride, d1); d += 2; s += 8; width -= 2; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_4_to_1_8tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 4 * h + SUBPEL_TAPS - 2; const int im_stride = (w + 1) & ~1; // All filter values are even, halve them to fit in int8_t when applying // horizontal filter and stay in 16-bit elements when applying vertical // filter. const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; scale_4_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); // We can specialise the vertical filtering for 6-tap filters given that the // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded. scale_4_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride, filters); } static inline bool has_normative_scaler_neon_dotprod(const int src_width, const int src_height, const int dst_width, const int dst_height) { return (2 * dst_width == src_width && 2 * dst_height == src_height) || (4 * dst_width == src_width && 4 * dst_height == src_height); } void av1_resize_and_extend_frame_neon_dotprod(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes) { assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || filter == EIGHTTAP_REGULAR); bool has_normative_scaler = has_normative_scaler_neon_dotprod(src->y_crop_width, src->y_crop_height, dst->y_crop_width, dst->y_crop_height); if (num_planes > 1) { has_normative_scaler = has_normative_scaler && has_normative_scaler_neon_dotprod( src->uv_crop_width, src->uv_crop_height, dst->uv_crop_width, dst->uv_crop_height); } if (!has_normative_scaler || filter == BILINEAR || phase == 0) { av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes); return; } // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. 
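// If allocating the temporary buffer fails for any plane, bail out of the
// loop and rescale the whole frame with the C fallback; otherwise extend the
// frame borders once every plane has been scaled.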
int malloc_failed = 0; for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; const int src_w = src->crop_widths[is_uv]; const int src_h = src->crop_heights[is_uv]; const int dst_w = dst->crop_widths[is_uv]; const int dst_h = dst->crop_heights[is_uv]; const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; if (2 * dst_w == src_w && 2 * dst_h == src_h) { const int buffer_stride = (dst_y_w + 7) & ~7; const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_2_to_1_8tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { const int buffer_stride = (dst_y_w + 1) & ~1; const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_1_8tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } } if (malloc_failed) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); } else { aom_extend_frame_borders(dst, num_planes); } } aom-3.12.1/av1/common/arm/resize_neon_i8mm.c000066400000000000000000000274161477627663500205670ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/arm/resize_neon.h" #include "av1/common/resize.h" #include "config/aom_scale_rtcd.h" #include "config/av1_rtcd.h" // clang-format off DECLARE_ALIGNED(16, static const uint8_t, kScalePermuteTbl[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11 }; // clang-format on static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t permute_tbl, const int8x16_t filters) { // Permute samples ready for matrix multiply. // { 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11 } uint8x16_t perm_samples[2] = { vqtbl1q_u8(s0, permute_tbl), vqtbl1q_u8(s1, permute_tbl) }; // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix // (filter), destructively accumulating into the destination register. int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filters); int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filters); int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567)); // We halved the filter values so -1 from right shift. 
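// Unlike the dotprod path no 128 shim is needed here: vusmmlaq_s32 multiplies
// the unsigned samples by the signed (halved) taps directly, so the
// accumulators start at zero and only the halving of the taps has to be
// undone by the reduced shift below.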
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scale_2_to_1_horiz_6tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filter) { const int8x8_t filter_s8 = vmovn_s16(filter); // Stagger the filter for use with the matrix multiply instructions. // { f1, f2, f3, f4, f5, f6, 0, 0, 0, 0, f1, f2, f3, f4, f5, f6 } const int8x16_t filters = vcombine_s8(vext_s8(filter_s8, filter_s8, 1), vext_s8(filter_s8, filter_s8, 7)); const uint8x16_t permute_tbl = vld1q_u8(kScalePermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0], &s5[0], &s6[0], &s7[0]); load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1], &s5[1], &s6[1], &s7[1]); uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filters); uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filters); uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filters); uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filters); uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filters); uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filters); uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filters); uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filters); store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); d += 8; s += 16; width -= 8; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_2_to_1_6tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 2 * h + SUBPEL_TAPS - 3; const int im_stride = (w + 7) & ~7; // All filter values are even, halve them to fit in int8_t when applying // horizontal filter and stay in 16-bit elements when applying vertical // filter. const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 2; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; scale_2_to_1_horiz_6tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); scale_2_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); } static inline uint8x8_t scale_4_to_1_filter8_8( const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const uint8x16_t s3, const uint8x16_t permute_tbl, const int8x8_t filter) { int8x16_t filters = vcombine_s8(filter, filter); uint8x16_t perm_samples[4] = { vqtbl1q_u8(s0, permute_tbl), vqtbl1q_u8(s1, permute_tbl), vqtbl1q_u8(s2, permute_tbl), vqtbl1q_u8(s3, permute_tbl) }; int32x4_t sum0 = vusdotq_s32(vdupq_n_s32(0), perm_samples[0], filters); int32x4_t sum1 = vusdotq_s32(vdupq_n_s32(0), perm_samples[1], filters); int32x4_t sum2 = vusdotq_s32(vdupq_n_s32(0), perm_samples[2], filters); int32x4_t sum3 = vusdotq_s32(vdupq_n_s32(0), perm_samples[3], filters); int32x4_t sum01 = vpaddq_s32(sum0, sum1); int32x4_t sum23 = vpaddq_s32(sum2, sum3); int16x8_t sum = vcombine_s16(vmovn_s32(sum01), vmovn_s32(sum23)); // We halved the filter values so -1 from right shift. 
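// Each vusdotq_s32 lane above holds a 4-tap partial sum of unsigned samples
// and signed halved taps; the pairwise adds merge adjacent lanes into full
// 8-tap sums, two output pixels per source row, and the reduced shift below
// undoes the halving of the taps.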
return vqrshrun_n_s16(sum, FILTER_BITS - 1); } static inline void scale_4_to_1_horiz_8tap(const uint8_t *src, const int src_stride, int w, int h, uint8_t *dst, const int dst_stride, const int16x8_t filters) { const int8x8_t filter = vmovn_s16(filters); const uint8x16_t permute_tbl = vld1q_u8(kScalePermuteTbl); do { const uint8_t *s = src; uint8_t *d = dst; int width = w; do { uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; load_u8_16x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = scale_4_to_1_filter8_8(s0, s1, s2, s3, permute_tbl, filter); uint8x8_t d1 = scale_4_to_1_filter8_8(s4, s5, s6, s7, permute_tbl, filter); store_u8x2_strided_x4(d + 0 * dst_stride, dst_stride, d0); store_u8x2_strided_x4(d + 4 * dst_stride, dst_stride, d1); d += 2; s += 8; width -= 2; } while (width > 0); dst += 8 * dst_stride; src += 8 * src_stride; h -= 8; } while (h > 0); } static inline void scale_plane_4_to_1_8tap(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const filter_ptr, uint8_t *const im_block) { assert(w > 0 && h > 0); const int im_h = 4 * h + SUBPEL_TAPS - 3; const int im_stride = (w + 1) & ~1; // All filter values are even, halve them to fit in int8_t when applying // horizontal filter and stay in 16-bit elements when applying vertical // filter. const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1); const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 2) * src_stride; scale_4_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h, im_block, im_stride, filters); // We can specialise the vertical filtering for 6-tap filters given that the // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded. scale_4_to_1_vert_6tap(im_block, im_stride, w, h, dst, dst_stride, filters); } static inline bool has_normative_scaler_neon_i8mm(const int src_width, const int src_height, const int dst_width, const int dst_height) { return (2 * dst_width == src_width && 2 * dst_height == src_height) || (4 * dst_width == src_width && 4 * dst_height == src_height); } void av1_resize_and_extend_frame_neon_i8mm(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes) { assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || filter == EIGHTTAP_REGULAR); bool has_normative_scaler = has_normative_scaler_neon_i8mm(src->y_crop_width, src->y_crop_height, dst->y_crop_width, dst->y_crop_height); if (num_planes > 1) { has_normative_scaler = has_normative_scaler && has_normative_scaler_neon_i8mm(src->uv_crop_width, src->uv_crop_height, dst->uv_crop_width, dst->uv_crop_height); } if (!has_normative_scaler || filter == BILINEAR || phase == 0) { av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes); return; } // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. 
int malloc_failed = 0; for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; const int src_w = src->crop_widths[is_uv]; const int src_h = src->crop_heights[is_uv]; const int dst_w = dst->crop_widths[is_uv]; const int dst_h = dst->crop_heights[is_uv]; const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; if (2 * dst_w == src_w && 2 * dst_h == src_h) { const int buffer_stride = (dst_y_w + 7) & ~7; const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_2_to_1_6tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { const int buffer_stride = (dst_y_w + 1) & ~1; const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_1_8tap(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } } if (malloc_failed) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); } else { aom_extend_frame_borders(dst, num_planes); } } aom-3.12.1/av1/common/arm/selfguided_neon.c000066400000000000000000001553631477627663500204520ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/txfm_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/common.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" // Constants used for right shift in final_filter calculation. 
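// NB_EVEN / NB_ODD are the normalisation shifts of the r == 2 "fast"
// self-guided path, which filters even and odd source rows with different
// cross-shaped kernels (final_filter_fast in av1/common/restoration.c is the
// matching C reference).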
#define NB_EVEN 5 #define NB_ODD 4 static inline void calc_ab_fast_internal_common( uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5, int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec, uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2, const int buf_stride) { uint32x4_t q0, q1, q2, q3; uint32x4_t p0, p1, p2, p3; uint16x4_t d0, d1, d2, d3; s0 = vmulq_u32(s0, const_n_val); s1 = vmulq_u32(s1, const_n_val); s2 = vmulq_u32(s2, const_n_val); s3 = vmulq_u32(s3, const_n_val); q0 = vmulq_u32(s4, s4); q1 = vmulq_u32(s5, s5); q2 = vmulq_u32(s6, s6); q3 = vmulq_u32(s7, s7); p0 = vcleq_u32(q0, s0); p1 = vcleq_u32(q1, s1); p2 = vcleq_u32(q2, s2); p3 = vcleq_u32(q3, s3); q0 = vsubq_u32(s0, q0); q1 = vsubq_u32(s1, q1); q2 = vsubq_u32(s2, q2); q3 = vsubq_u32(s3, q3); p0 = vandq_u32(p0, q0); p1 = vandq_u32(p1, q1); p2 = vandq_u32(p2, q2); p3 = vandq_u32(p3, q3); p0 = vmulq_u32(p0, s_vec); p1 = vmulq_u32(p1, s_vec); p2 = vmulq_u32(p2, s_vec); p3 = vmulq_u32(p3, s_vec); p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); p0 = vminq_u32(p0, const_val); p1 = vminq_u32(p1, const_val); p2 = vminq_u32(p2, const_val); p3 = vminq_u32(p3, const_val); { store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); for (int x = 0; x < 4; x++) { for (int y = 0; y < 4; y++) { dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); } p0 = vsubl_u16(sgrproj_sgr, d0); p1 = vsubl_u16(sgrproj_sgr, d1); p2 = vsubl_u16(sgrproj_sgr, d2); p3 = vsubl_u16(sgrproj_sgr, d3); s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec); s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec); s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec); s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec); s4 = vmulq_u32(s4, p0); s5 = vmulq_u32(s5, p1); s6 = vmulq_u32(s6, p2); s7 = vmulq_u32(s7, p3); p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0), vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), vreinterpretq_s32_u32(p3)); } static inline void calc_ab_internal_common( uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0, uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; s0 = vmulq_u32(s0, const_n_val); s1 = vmulq_u32(s1, const_n_val); s2 = vmulq_u32(s2, const_n_val); s3 = vmulq_u32(s3, const_n_val); s4 = vmulq_u32(s4, const_n_val); s5 = vmulq_u32(s5, const_n_val); s6 = vmulq_u32(s6, const_n_val); s7 = vmulq_u32(s7, const_n_val); d0 = vget_low_u16(s16_4); d1 = vget_low_u16(s16_5); d2 = vget_low_u16(s16_6); d3 = vget_low_u16(s16_7); d4 = vget_high_u16(s16_4); 
d5 = vget_high_u16(s16_5); d6 = vget_high_u16(s16_6); d7 = vget_high_u16(s16_7); q0 = vmull_u16(d0, d0); q1 = vmull_u16(d1, d1); q2 = vmull_u16(d2, d2); q3 = vmull_u16(d3, d3); q4 = vmull_u16(d4, d4); q5 = vmull_u16(d5, d5); q6 = vmull_u16(d6, d6); q7 = vmull_u16(d7, d7); p0 = vcleq_u32(q0, s0); p1 = vcleq_u32(q1, s1); p2 = vcleq_u32(q2, s2); p3 = vcleq_u32(q3, s3); p4 = vcleq_u32(q4, s4); p5 = vcleq_u32(q5, s5); p6 = vcleq_u32(q6, s6); p7 = vcleq_u32(q7, s7); q0 = vsubq_u32(s0, q0); q1 = vsubq_u32(s1, q1); q2 = vsubq_u32(s2, q2); q3 = vsubq_u32(s3, q3); q4 = vsubq_u32(s4, q4); q5 = vsubq_u32(s5, q5); q6 = vsubq_u32(s6, q6); q7 = vsubq_u32(s7, q7); p0 = vandq_u32(p0, q0); p1 = vandq_u32(p1, q1); p2 = vandq_u32(p2, q2); p3 = vandq_u32(p3, q3); p4 = vandq_u32(p4, q4); p5 = vandq_u32(p5, q5); p6 = vandq_u32(p6, q6); p7 = vandq_u32(p7, q7); p0 = vmulq_u32(p0, s_vec); p1 = vmulq_u32(p1, s_vec); p2 = vmulq_u32(p2, s_vec); p3 = vmulq_u32(p3, s_vec); p4 = vmulq_u32(p4, s_vec); p5 = vmulq_u32(p5, s_vec); p6 = vmulq_u32(p6, s_vec); p7 = vmulq_u32(p7, s_vec); p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); p0 = vminq_u32(p0, const_val); p1 = vminq_u32(p1, const_val); p2 = vminq_u32(p2, const_val); p3 = vminq_u32(p3, const_val); p4 = vminq_u32(p4, const_val); p5 = vminq_u32(p5, const_val); p6 = vminq_u32(p6, const_val); p7 = vminq_u32(p7, const_val); { store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); for (int x = 0; x < 4; x++) { for (int y = 0; y < 8; y++) { dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; } } load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); } s16_4 = vsubq_u16(sgrproj_sgr, s16_4); s16_5 = vsubq_u16(sgrproj_sgr, s16_5); s16_6 = vsubq_u16(sgrproj_sgr, s16_6); s16_7 = vsubq_u16(sgrproj_sgr, s16_7); s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS); p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), vreinterpretq_s32_u32(p3)); store_s32_4x4(dst2 
+ 4, buf_stride, vreinterpretq_s32_u32(p4), vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), vreinterpretq_s32_u32(p7)); } static inline void boxsum2_square_sum_calc( int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; int32x4_t r12, r34, r67, r89, r1011; int32x4_t r345, r6789, r789; d1 = vmull_s16(t1, t1); d2 = vmull_s16(t2, t2); d3 = vmull_s16(t3, t3); d4 = vmull_s16(t4, t4); d5 = vmull_s16(t5, t5); d6 = vmull_s16(t6, t6); d7 = vmull_s16(t7, t7); d8 = vmull_s16(t8, t8); d9 = vmull_s16(t9, t9); d10 = vmull_s16(t10, t10); d11 = vmull_s16(t11, t11); r12 = vaddq_s32(d1, d2); r34 = vaddq_s32(d3, d4); r67 = vaddq_s32(d6, d7); r89 = vaddq_s32(d8, d9); r1011 = vaddq_s32(d10, d11); r345 = vaddq_s32(r34, d5); r6789 = vaddq_s32(r67, r89); r789 = vsubq_s32(r6789, d6); *r0 = vaddq_s32(r12, r345); *r1 = vaddq_s32(r67, r345); *r2 = vaddq_s32(d5, r6789); *r3 = vaddq_s32(r789, r1011); } static inline void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, int32_t *dst32, int32_t *dst2, const int dst_stride, const int width, const int height) { assert(width > 2 * SGRPROJ_BORDER_HORZ); assert(height > 2 * SGRPROJ_BORDER_VERT); int16_t *dst1_16_ptr, *src_ptr; int32_t *dst2_ptr; int h, w, count = 0; const int dst_stride_2 = (dst_stride << 1); const int dst_stride_8 = (dst_stride << 3); dst1_16_ptr = dst16; dst2_ptr = dst2; src_ptr = src; w = width; { int16x8_t t1, t2, t3, t4, t5, t6, t7; int16x8_t t8, t9, t10, t11, t12; int16x8_t q12345, q56789, q34567, q7891011; int16x8_t q12, q34, q67, q89, q1011; int16x8_t q345, q6789, q789; int32x4_t r12345, r56789, r34567, r7891011; do { h = height; dst1_16_ptr = dst16 + (count << 3); dst2_ptr = dst2 + (count << 3); src_ptr = src + (count << 3); dst1_16_ptr += dst_stride_2; dst2_ptr += dst_stride_2; do { load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); src_ptr += 4 * src_stride; load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); src_ptr += 4 * src_stride; load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); q12 = vaddq_s16(t1, t2); q34 = vaddq_s16(t3, t4); q67 = vaddq_s16(t6, t7); q89 = vaddq_s16(t8, t9); q1011 = vaddq_s16(t10, t11); q345 = vaddq_s16(q34, t5); q6789 = vaddq_s16(q67, q89); q789 = vaddq_s16(q89, t7); q12345 = vaddq_s16(q12, q345); q34567 = vaddq_s16(q67, q345); q56789 = vaddq_s16(t5, q6789); q7891011 = vaddq_s16(q789, q1011); store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, q7891011); dst1_16_ptr += dst_stride_8; boxsum2_square_sum_calc( vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789, &r7891011); store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); boxsum2_square_sum_calc( vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, &r7891011); store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, r7891011); dst2_ptr += (dst_stride_8); h -= 8; } while (h > 0); w -= 8; count++; } while (w > 0); // memset needed for row pixels as 2nd stage of boxsum filter uses // first 2 rows of dst16, dst2 buffer which is not 
filled in first stage. for (int x = 0; x < 2; x++) { memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); } // memset needed for extra columns as 2nd stage of boxsum filter uses // last 2 columns of dst16, dst2 buffer which is not filled in first stage. for (int x = 2; x < height + 2; x++) { int dst_offset = x * dst_stride + width + 2; memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); } } { int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; int32x4_t q12345, q34567, q23456, q45678; int32x4_t q23, q45, q67; int32x4_t q2345, q4567; int32x4_t r12345, r34567, r23456, r45678; int32x4_t r23, r45, r67; int32x4_t r2345, r4567; int32_t *src2_ptr, *dst1_32_ptr; int16_t *src1_ptr; count = 0; h = height; do { dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); w = width; dst1_32_ptr += 2; dst2_ptr += 2; load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4); load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4); do { src1_ptr += 4; src2_ptr += 4; load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8); load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8); q23 = vaddl_s16(s2, s3); q45 = vaddl_s16(s4, s5); q67 = vaddl_s16(s6, s7); q2345 = vaddq_s32(q23, q45); q4567 = vaddq_s32(q45, q67); q12345 = vaddq_s32(vmovl_s16(s1), q2345); q23456 = vaddq_s32(q2345, vmovl_s16(s6)); q34567 = vaddq_s32(q4567, vmovl_s16(s3)); q45678 = vaddq_s32(q4567, vmovl_s16(s8)); transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678); store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, q45678); dst1_32_ptr += 4; s1 = s5; s2 = s6; s3 = s7; s4 = s8; r23 = vaddq_s32(d2, d3); r45 = vaddq_s32(d4, d5); r67 = vaddq_s32(d6, d7); r2345 = vaddq_s32(r23, r45); r4567 = vaddq_s32(r45, r67); r12345 = vaddq_s32(d1, r2345); r23456 = vaddq_s32(r2345, d6); r34567 = vaddq_s32(r4567, d3); r45678 = vaddq_s32(r4567, d8); transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678); store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); dst2_ptr += 4; d1 = d5; d2 = d6; d3 = d7; d4 = d8; w -= 4; } while (w > 0); h -= 8; count++; } while (h > 0); } } static inline void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, const int height, const int r, const int s, const int ht_inc) { int32_t *src1, *dst2, count = 0; uint16_t *dst_A16, *src2; const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; const uint32x4_t s_vec = vdupq_n_u32(s); int w, h = height; do { dst_A16 = A16 + (count << 2) * buf_stride; src1 = A + (count << 2) * buf_stride; src2 = B16 + (count << 2) * buf_stride; dst2 = B + (count << 2) * buf_stride; w = width; do { load_u32_4x4((uint32_t *)src1, buf_stride, &s0, 
&s1, &s2, &s3); load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); s16_4 = s16_0; s16_5 = s16_1; s16_6 = s16_2; s16_7 = s16_3; calc_ab_internal_common( s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); w -= 8; dst2 += 8; src1 += 8; src2 += 8; dst_A16 += 8; } while (w > 0); count++; h -= (ht_inc * 4); } while (h > 0); } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, uint16_t *B16, int32_t *B, const int buf_stride, const int width, const int height, const int bit_depth, const int r, const int s, const int ht_inc) { int32_t *src1, *dst2, count = 0; uint16_t *dst_A16, *src2; const uint32_t n = (2 * r + 1) * (2 * r + 1); const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; uint16x8_t s16_0, s16_1, s16_2, s16_3; uint16x8_t s16_4, s16_5, s16_6, s16_7; uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; const uint32x4_t s_vec = vdupq_n_u32(s); int w, h = height; do { src1 = A + (count << 2) * buf_stride; src2 = B16 + (count << 2) * buf_stride; dst2 = B + (count << 2) * buf_stride; dst_A16 = A16 + (count << 2) * buf_stride; w = width; do { load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); s16_7 = vrshlq_u16(s16_3, bd_min_2_vec); calc_ab_internal_common( s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); w -= 8; dst2 += 8; src1 += 8; src2 += 8; dst_A16 += 8; } while (w > 0); count++; h -= (ht_inc * 4); } while (h > 0); } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, const int r, const int s, const int ht_inc) { int32_t *src1, *src2, count = 0; uint16_t *dst_A16; const uint32_t n = (2 * r + 1) * (2 * r + 1); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; const uint32x4_t s_vec = vdupq_n_u32(s); int w, h = height; do { src1 = A + (count << 2) * buf_stride; src2 = B + 
(count << 2) * buf_stride; dst_A16 = A16 + (count << 2) * buf_stride; w = width; do { load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); s0 = vreinterpretq_u32_s32(sr0); s1 = vreinterpretq_u32_s32(sr1); s2 = vreinterpretq_u32_s32(sr2); s3 = vreinterpretq_u32_s32(sr3); s4 = vreinterpretq_u32_s32(sr4); s5 = vreinterpretq_u32_s32(sr5); s6 = vreinterpretq_u32_s32(sr6); s7 = vreinterpretq_u32_s32(sr7); calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, sr6, sr7, const_n_val, s_vec, const_val, one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, src2, buf_stride); w -= 4; src1 += 4; src2 += 4; dst_A16 += 4; } while (w > 0); count++; h -= (ht_inc * 4); } while (h > 0); } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, int32_t *B, const int buf_stride, const int width, const int height, const int bit_depth, const int r, const int s, const int ht_inc) { int32_t *src1, *src2, count = 0; uint16_t *dst_A16; const uint32_t n = (2 * r + 1) * (2 * r + 1); const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); const uint32x4_t const_n_val = vdupq_n_u32(n); const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); const uint32x4_t const_val = vdupq_n_u32(255); int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; const uint32x4_t s_vec = vdupq_n_u32(s); int w, h = height; do { src1 = A + (count << 2) * buf_stride; src2 = B + (count << 2) * buf_stride; dst_A16 = A16 + (count << 2) * buf_stride; w = width; do { load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, sr6, sr7, const_n_val, s_vec, const_val, one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, src2, buf_stride); w -= 4; src1 += 4; src2 += 4; dst_A16 += 4; } while (w > 0); count++; h -= (ht_inc * 4); } while (h > 0); } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, int32_t *dst2, const int dst_stride, const int width, const int height) { assert(width > 2 * SGRPROJ_BORDER_HORZ); assert(height > 2 * SGRPROJ_BORDER_VERT); int16_t *src_ptr; int32_t *dst2_ptr; uint16_t *dst1_ptr; int h, w, count = 0; w = width; { int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; int16x8_t q23, q34, q56, q234, q345, q456, q567; int32x4_t r23, r56, r345, r456, r567, r78, r678; int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; int32x4_t r2, r3, r5, r6, r7, r8; int16x8_t q678, q78; do { dst1_ptr = dst1 + (count << 3); dst2_ptr = dst2 + (count << 3); src_ptr = src + (count << 3); h = height; load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); src_ptr += 4 * src_stride; q23 = vaddq_s16(s2, s3); q234 = vaddq_s16(q23, s4); q34 = vaddq_s16(s3, s4); dst1_ptr += (dst_stride << 1); r2 = 
vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); r23 = vaddq_s32(r2, r3); r234_low = vaddq_s32(r23, r4_low); r34_low = vaddq_s32(r3, r4_low); r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); r23 = vaddq_s32(r2, r3); r234_high = vaddq_s32(r23, r4_high); r34_high = vaddq_s32(r3, r4_high); dst2_ptr += (dst_stride << 1); do { load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); src_ptr += 4 * src_stride; q345 = vaddq_s16(s5, q34); q56 = vaddq_s16(s5, s6); q456 = vaddq_s16(s4, q56); q567 = vaddq_s16(s7, q56); q78 = vaddq_s16(s7, s8); q678 = vaddq_s16(s6, q78); store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); dst1_ptr += (dst_stride << 2); s4 = s8; q34 = q78; q234 = q678; r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); r345 = vaddq_s32(r5, r34_low); r56 = vaddq_s32(r5, r6); r456 = vaddq_s32(r4_low, r56); r567 = vaddq_s32(r7, r56); r78 = vaddq_s32(r7, r8); r678 = vaddq_s32(r6, r78); store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); r4_low = r8; r34_low = r78; r234_low = r678; r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5)); r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); r345 = vaddq_s32(r5, r34_high); r56 = vaddq_s32(r5, r6); r456 = vaddq_s32(r4_high, r56); r567 = vaddq_s32(r7, r56); r78 = vaddq_s32(r7, r8); r678 = vaddq_s32(r6, r78); store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); dst2_ptr += (dst_stride << 2); r4_high = r8; r34_high = r78; r234_high = r678; h -= 4; } while (h > 0); w -= 8; count++; } while (w > 0); // memset needed for row pixels as 2nd stage of boxsum filter uses // first 2 rows of dst1, dst2 buffer which is not filled in first stage. for (int x = 0; x < 2; x++) { memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); } // memset needed for extra columns as 2nd stage of boxsum filter uses // last 2 columns of dst1, dst2 buffer which is not filled in first stage. 
for (int x = 2; x < height + 2; x++) { int dst_offset = x * dst_stride + width + 2; memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); } } { int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; int16x4_t q23, q34, q56, q234, q345, q456, q567; int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; int16x4_t q678, q78; int32_t *src2_ptr; uint16_t *src1_ptr; count = 0; h = height; w = width; do { dst1_ptr = dst1 + (count << 2) * dst_stride; dst2_ptr = dst2 + (count << 2) * dst_stride; src1_ptr = dst1 + (count << 2) * dst_stride; src2_ptr = dst2 + (count << 2) * dst_stride; w = width; load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4); load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4); src1_ptr += 4; src2_ptr += 4; q23 = vadd_s16(d2, d3); q234 = vadd_s16(q23, d4); q34 = vadd_s16(d3, d4); dst1_ptr += 2; r23 = vaddq_s32(r2, r3); r234 = vaddq_s32(r23, r4); r34 = vaddq_s32(r3, r4); dst2_ptr += 2; do { load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8); load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8); src1_ptr += 4; src2_ptr += 4; q345 = vadd_s16(d5, q34); q56 = vadd_s16(d5, d6); q456 = vadd_s16(d4, q56); q567 = vadd_s16(d7, q56); q78 = vadd_s16(d7, d8); q678 = vadd_s16(d6, q78); transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567); store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); dst1_ptr += 4; d4 = d8; q34 = q78; q234 = q678; r345 = vaddq_s32(r5, r34); r56 = vaddq_s32(r5, r6); r456 = vaddq_s32(r4, r56); r567 = vaddq_s32(r7, r56); r78 = vaddq_s32(r7, r8); r678 = vaddq_s32(r6, r78); transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567); store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); dst2_ptr += 4; r4 = r8; r34 = r78; r234 = r678; w -= 4; } while (w > 0); h -= 4; count++; } while (h > 0); } } static inline int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; int32x4_t fours, threes, res; xtl = vld1q_s32(buf - buf_stride - 1); xt = vld1q_s32(buf - buf_stride); xtr = vld1q_s32(buf - buf_stride + 1); xl = vld1q_s32(buf - 1); x = vld1q_s32(buf); xr = vld1q_s32(buf + 1); xbl = vld1q_s32(buf + buf_stride - 1); xb = vld1q_s32(buf + buf_stride); xbr = vld1q_s32(buf + buf_stride + 1); fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); return res; } static inline void cross_sum_inp_u16(uint16_t *buf, int buf_stride, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; uint16x8_t r0, r1; xtl = vld1q_u16(buf - buf_stride - 1); xt = vld1q_u16(buf - buf_stride); xtr = vld1q_u16(buf - buf_stride + 1); xl = vld1q_u16(buf - 1); x = vld1q_u16(buf); xr = vld1q_u16(buf + 1); xbl = vld1q_u16(buf + buf_stride - 1); xb = vld1q_u16(buf + buf_stride); xbr = vld1q_u16(buf + buf_stride + 1); xb = vaddq_u16(xb, x); xt = vaddq_u16(xt, xr); xl = vaddq_u16(xl, xb); xl = vaddq_u16(xl, xt); r0 = vshlq_n_u16(xl, 2); xbl = vaddq_u16(xbl, xbr); xtl = vaddq_u16(xtl, xtr); xtl = vaddq_u16(xtl, xbl); r1 = vshlq_n_u16(xtl, 2); r1 = vsubq_u16(r1, xtl); *a0 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_low_u16(r0)), 
vmovl_u16(vget_low_u16(r1)))); *a1 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); } static inline int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { int32x4_t xtr, xt, xtl, xbr, xb, xbl; int32x4_t fives, sixes, fives_plus_sixes; xtl = vld1q_s32(buf - buf_stride - 1); xt = vld1q_s32(buf - buf_stride); xtr = vld1q_s32(buf - buf_stride + 1); xbl = vld1q_s32(buf + buf_stride - 1); xb = vld1q_s32(buf + buf_stride); xbr = vld1q_s32(buf + buf_stride + 1); fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); sixes = vaddq_s32(xt, xb); fives_plus_sixes = vaddq_s32(fives, sixes); return vaddq_s32( vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } static inline void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; xtl = vld1q_u16(buf - buf_stride - 1); xt = vld1q_u16(buf - buf_stride); xtr = vld1q_u16(buf - buf_stride + 1); xbl = vld1q_u16(buf + buf_stride - 1); xb = vld1q_u16(buf + buf_stride); xbr = vld1q_u16(buf + buf_stride + 1); xbr = vaddq_u16(xbr, xbl); xtr = vaddq_u16(xtr, xtl); xbr = vaddq_u16(xbr, xtr); xtl = vshlq_n_u16(xbr, 2); xbr = vaddq_u16(xtl, xbr); xb = vaddq_u16(xb, xt); xb0 = vshlq_n_u16(xb, 1); xb = vshlq_n_u16(xb, 2); xb = vaddq_u16(xb, xb0); *a0 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); *a1 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); } static inline int32x4_t cross_sum_fast_odd_row(int32_t *buf) { int32x4_t xl, x, xr; int32x4_t fives, sixes, fives_plus_sixes; xl = vld1q_s32(buf - 1); x = vld1q_s32(buf); xr = vld1q_s32(buf + 1); fives = vaddq_s32(xl, xr); sixes = x; fives_plus_sixes = vaddq_s32(fives, sixes); return vaddq_s32( vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } static inline void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, int32x4_t *a1) { uint16x8_t xl, x, xr; uint16x8_t x0; xl = vld1q_u16(buf - 1); x = vld1q_u16(buf); xr = vld1q_u16(buf + 1); xl = vaddq_u16(xl, xr); x0 = vshlq_n_u16(xl, 2); xl = vaddq_u16(xl, x0); x0 = vshlq_n_u16(x, 1); x = vshlq_n_u16(x, 2); x = vaddq_u16(x, x0); *a0 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); *a1 = vreinterpretq_s32_u32( vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); } static void final_filter_fast_internal(uint16_t *A, int32_t *B, const int buf_stride, int16_t *src, const int src_stride, int32_t *dst, const int dst_stride, const int width, const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp; int16_t *src_ptr; int32x4_t a_res0, a_res1, b_res0, b_res1; int w, h, count = 0; assert(SGRPROJ_SGR_BITS == 8); assert(SGRPROJ_RST_BITS == 4); A_tmp = A; B_tmp = B; src_ptr = src; dst_ptr = dst; h = height; do { A_tmp = (A + count * buf_stride); B_tmp = (B + count * buf_stride); src_ptr = (src + count * src_stride); dst_ptr = (dst + count * dst_stride); w = width; if (!(count & 1)) { do { s0 = vld1q_s16(src_ptr); cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); a_res0 = vaddq_s32(a_res0, b_res0); a_res1 = vaddq_s32(a_res1, b_res1); a_res0 = vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS 
+ NB_EVEN - SGRPROJ_RST_BITS); a_res1 = vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); vst1q_s32(dst_ptr, a_res0); vst1q_s32(dst_ptr + 4, a_res1); A_tmp += 8; B_tmp += 8; src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 0); } else { do { s0 = vld1q_s16(src_ptr); cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1); a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); b_res0 = cross_sum_fast_odd_row(B_tmp); b_res1 = cross_sum_fast_odd_row(B_tmp + 4); a_res0 = vaddq_s32(a_res0, b_res0); a_res1 = vaddq_s32(a_res1, b_res1); a_res0 = vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); a_res1 = vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); vst1q_s32(dst_ptr, a_res0); vst1q_s32(dst_ptr + 4, a_res1); A_tmp += 8; B_tmp += 8; src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 0); } count++; h -= 1; } while (h > 0); } static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, int16_t *src, const int src_stride, int32_t *dst, const int dst_stride, const int width, const int height) { int16x8_t s0; int32_t *B_tmp, *dst_ptr; uint16_t *A_tmp; int16_t *src_ptr; int32x4_t a_res0, a_res1, b_res0, b_res1; int w, h, count = 0; assert(SGRPROJ_SGR_BITS == 8); assert(SGRPROJ_RST_BITS == 4); h = height; do { A_tmp = (A + count * buf_stride); B_tmp = (B + count * buf_stride); src_ptr = (src + count * src_stride); dst_ptr = (dst + count * dst_stride); w = width; do { s0 = vld1q_s16(src_ptr); cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); a_res0 = vaddq_s32(a_res0, b_res0); a_res1 = vaddq_s32(a_res1, b_res1); a_res0 = vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); a_res1 = vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); vst1q_s32(dst_ptr, a_res0); vst1q_s32(dst_ptr + 4, a_res1); A_tmp += 8; B_tmp += 8; src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 0); count++; h -= 1; } while (h > 0); } static inline int restoration_fast_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int buf_stride = ((width_ext + 3) & ~3) + 16; const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS; int32_t *buf = aom_memalign(8, buf_size); if (!buf) return -1; int32_t *square_sum_buf = buf; int32_t *sum_buf = square_sum_buf + RESTORATION_PROC_UNIT_PELS; uint16_t *tmp16_buf = (uint16_t *)(sum_buf + RESTORATION_PROC_UNIT_PELS); assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <= (char *)buf + buf_size && "Allocated buffer is too small. Resize the buffer."); assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && "Need SGRPROJ_BORDER_* >= r+1"); assert(radius_idx == 0); assert(r == 2); // input(dgd16) is 16bit. // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit // buffer(square_sum_buf). 
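// boxsum2() below computes the radius-2 (5x5) box sums of the pixels and of // their squares in two separable passes (5-tap column sums first, then a // transposed 5-tap row pass); results are written at a two-row stride, which // is what the fast path consumes since a and b are only evaluated on every // other row here.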
boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ), dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, width_ext, height_ext); square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. #if CONFIG_AV1_HIGHBITDEPTH if (bit_depth > 8) { calc_ab_fast_internal_hbd( (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, bit_depth, r, params->s[radius_idx], 2); } else { calc_ab_fast_internal_lbd( (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, params->s[radius_idx], 2); } #else (void)bit_depth; calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, params->s[radius_idx], 2); #endif final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); aom_free(buf); return 0; } static inline int restoration_internal(uint16_t *dgd16, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int buf_stride = ((width_ext + 3) & ~3) + 16; const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS; int32_t *buf = aom_memalign(8, buf_size); if (!buf) return -1; int32_t *square_sum_buf = buf; int32_t *B = square_sum_buf + RESTORATION_PROC_UNIT_PELS; uint16_t *A16 = (uint16_t *)(B + RESTORATION_PROC_UNIT_PELS); uint16_t *sum_buf = A16 + RESTORATION_PROC_UNIT_PELS; assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <= (char *)buf + buf_size && "Allocated buffer is too small. Resize the buffer."); assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && "Need SGRPROJ_BORDER_* >= r+1"); assert(radius_idx == 1); assert(r == 1); // input(dgd16) is 16bit. // sum of pixels output will be in 16bit(sum_buf). // sum of squares output is kept in 32bit buffer(square_sum_buf). boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ), dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, height_ext); square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; #if CONFIG_AV1_HIGHBITDEPTH // Calculation of a, b. a output is in 16bit tmp_buf which is in range of // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
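// Roughly, per pixel (see calc_ab_internal_common above): // p = min(max(n * sum_sq - sum * sum, 0) * s >> SGRPROJ_MTABLE_BITS, 255), // a = av1_x_by_xplus1[p], and // b = ((SGRPROJ_SGR - a) * sum * av1_one_by_x[n - 1]) >> SGRPROJ_RECIP_BITS, // with rounding on both shifts (for bit depths above 8 the sums are first // scaled down to an 8-bit range).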
if (bit_depth > 8) { calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, height + 2, bit_depth, r, params->s[radius_idx], 1); } else { calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, height + 2, r, params->s[radius_idx], 1); } #else (void)bit_depth; calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), (B - buf_stride - 1), buf_stride, width + 2, height + 2, r, params->s[radius_idx], 1); #endif final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, dst_stride, width, height); aom_free(buf); return 0; } static inline void src_convert_u8_to_u16(const uint8_t *src, const int src_stride, uint16_t *dst, const int dst_stride, const int width, const int height) { const uint8_t *src_ptr; uint16_t *dst_ptr; int h, w, count = 0; uint8x8_t t1, t2, t3, t4; uint16x8_t s1, s2, s3, s4; h = height; do { src_ptr = src + (count << 2) * src_stride; dst_ptr = dst + (count << 2) * dst_stride; w = width; if (w >= 7) { do { load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); s1 = vmovl_u8(t1); s2 = vmovl_u8(t2); s3 = vmovl_u8(t3); s4 = vmovl_u8(t4); store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 7); } for (int y = 0; y < w; y++) { dst_ptr[y] = src_ptr[y]; dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; } count++; h -= 4; } while (h > 3); src_ptr = src + (count << 2) * src_stride; dst_ptr = dst + (count << 2) * dst_stride; for (int x = 0; x < h; x++) { for (int y = 0; y < width; y++) { dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; } } // memset uninitialized rows of src buffer as they are needed for the // boxsum filter calculation. for (int x = height; x < height + 5; x++) memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } #if CONFIG_AV1_HIGHBITDEPTH static inline void src_convert_hbd_copy(const uint16_t *src, int src_stride, uint16_t *dst, const int dst_stride, int width, int height) { const uint16_t *src_ptr; uint16_t *dst_ptr; int h, w, count = 0; uint16x8_t s1, s2, s3, s4; h = height; do { src_ptr = src + (count << 2) * src_stride; dst_ptr = dst + (count << 2) * dst_stride; w = width; do { load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); src_ptr += 8; dst_ptr += 8; w -= 8; } while (w > 7); for (int y = 0; y < w; y++) { dst_ptr[y] = src_ptr[y]; dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; } count++; h -= 4; } while (h > 3); src_ptr = src + (count << 2) * src_stride; dst_ptr = dst + (count << 2) * dst_stride; for (int x = 0; x < h; x++) { memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), sizeof(uint16_t) * width); } // memset uninitialized rows of src buffer as they are needed for the // boxsum filter calculation. 
for (int x = height; x < height + 5; x++) memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); } #endif // CONFIG_AV1_HIGHBITDEPTH int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, int stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; assert(!(params->r[0] == 0 && params->r[1] == 0)); uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; uint16_t *dgd16 = dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } else { src_convert_u8_to_u16( dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } #else (void)highbd; src_convert_u8_to_u16( dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); #endif if (params->r[0] > 0) { int ret = restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, flt_stride, bit_depth, sgr_params_idx, 0); if (ret != 0) return ret; } if (params->r[1] > 0) { int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); if (ret != 0) return ret; } return 0; } int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; uint16_t *dgd16 = dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; const int dgd_stride = stride; const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; assert(!(params->r[0] == 0 && params->r[1] == 0)); #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); src_convert_hbd_copy( dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } else { src_convert_u8_to_u16( dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); } #else (void)highbd; src_convert_u8_to_u16( dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, dgd16_stride, width_ext, height_ext); #endif if (params->r[0] > 0) { int ret = restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, bit_depth, eps, 0); if (ret != 0) return ret; 
} if (params->r[1] > 0) { int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, bit_depth, eps, 1); if (ret != 0) return ret; } av1_decode_xq(xqd, xq, params); { int16_t *src_ptr; uint8_t *dst_ptr; #if CONFIG_AV1_HIGHBITDEPTH uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8); uint16_t *dst16_ptr; #endif int16x4_t d0, d4; int16x8_t r0, s0; uint16x8_t r4; int32x4_t u0, u4, v0, v4, f00, f10; uint8x8_t t0; int count = 0, w = width, h = height, rc = 0; const int32x4_t xq0_vec = vdupq_n_s32(xq[0]); const int32x4_t xq1_vec = vdupq_n_s32(xq[1]); const int16x8_t zero = vdupq_n_s16(0); const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1); src_ptr = (int16_t *)dgd16; do { w = width; count = 0; dst_ptr = dst8 + rc * dst_stride; #if CONFIG_AV1_HIGHBITDEPTH dst16_ptr = dst16 + rc * dst_stride; #endif do { s0 = vld1q_s16(src_ptr + count); u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS); u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS); v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS); v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS); if (params->r[0] > 0) { f00 = vld1q_s32(flt0 + count); f10 = vld1q_s32(flt0 + count + 4); f00 = vsubq_s32(f00, u0); f10 = vsubq_s32(f10, u4); v0 = vmlaq_s32(v0, xq0_vec, f00); v4 = vmlaq_s32(v4, xq0_vec, f10); } if (params->r[1] > 0) { f00 = vld1q_s32(flt1 + count); f10 = vld1q_s32(flt1 + count + 4); f00 = vsubq_s32(f00, u0); f10 = vsubq_s32(f10, u4); v0 = vmlaq_s32(v0, xq1_vec, f00); v4 = vmlaq_s32(v4, xq1_vec, f10); } d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); r0 = vcombine_s16(d0, d4); r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { r4 = vminq_u16(r4, max); vst1q_u16(dst16_ptr, r4); dst16_ptr += 8; } else { t0 = vqmovn_u16(r4); vst1_u8(dst_ptr, t0); dst_ptr += 8; } #else (void)max; t0 = vqmovn_u16(r4); vst1_u8(dst_ptr, t0); dst_ptr += 8; #endif w -= 8; count += 8; } while (w > 0); src_ptr += dgd16_stride; flt1 += width; flt0 += width; rc++; h--; } while (h > 0); } return 0; } aom-3.12.1/av1/common/arm/warp_plane_neon.c000066400000000000000000000307621477627663500204620ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "warp_plane_neon.h" static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps int16x8_t f[4]; load_filters_4(f, sx, alpha); int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); int16x8_t m0 = vmulq_s16(f[0], in16_lo); int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), vpaddlq_s16(m3) }; int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); tmp_res_low = vaddq_s32(tmp_res_low, add_const); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps int16x8_t f[8]; load_filters_8(f, sx, alpha); int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); int16x8_t m0 = vmulq_s16(f[0], in16_lo); int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4)); int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5)); int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6)); int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7)); int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), vpaddlq_s16(m3) }; int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), vpaddlq_s16(m7) }; int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); tmp_res_low = vaddq_s32(tmp_res_low, add_const); tmp_res_high = vaddq_s32(tmp_res_high, add_const); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); int16x8_t m0 = vmulq_s16(f_s16, in16_lo); int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), vpaddlq_s16(m3) }; int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); tmp_res_low = vaddq_s32(tmp_res_low, add_const); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } static 
AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); int16x8_t m0 = vmulq_s16(f_s16, in16_lo); int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4)); int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5)); int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6)); int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7)); int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), vpaddlq_s16(m3) }; int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), vpaddlq_s16(m7) }; int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); tmp_res_low = vaddq_s32(tmp_res_low, add_const); tmp_res_high = vaddq_s32(tmp_res_high, add_const); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); int16x4_t s3 = vget_low_s16(src[3]); int16x4_t s4 = vget_low_s16(src[4]); int16x4_t s5 = vget_low_s16(src[5]); int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); *res = m0123; } static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); int16x8_t f[4]; load_filters_4(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; *res = horizontal_add_4d_s32x4(m0123_pairs); } static AOM_FORCE_INLINE void 
vertical_filter_8x1_f1(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); *res_low = m0123; *res_high = m4567; } static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy, int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t f[8]; load_filters_8(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; *res_low = horizontal_add_4d_s32x4(m0123_pairs); *res_high = horizontal_add_4d_s32x4(m4567_pairs); } void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, 
subsampling_x, subsampling_y, conv_params, alpha, beta, gamma, delta); } aom-3.12.1/av1/common/arm/warp_plane_neon.h000066400000000000000000000403541477627663500204650ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ #define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ #include #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_ports/mem.h" #include "config/av1_rtcd.h" #include "av1/common/warped_motion.h" #include "av1/common/scale.h" static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, int alpha); static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, int alpha); static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx); static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx); static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16); static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16); static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy); static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma); static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy); static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy, int gamma); static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { out[0] = vld1q_s16( av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); out[1] = vld1q_s16( av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); out[2] = vld1q_s16( av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); out[3] = vld1q_s16( av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { out[0] = vld1q_s16( av1_warped_filter[(offset + 0 * stride) >> WARPEDDIFF_PREC_BITS]); out[1] = vld1q_s16( av1_warped_filter[(offset + 1 * stride) >> WARPEDDIFF_PREC_BITS]); out[2] = vld1q_s16( av1_warped_filter[(offset + 2 * stride) >> WARPEDDIFF_PREC_BITS]); out[3] = vld1q_s16( av1_warped_filter[(offset + 3 * stride) >> WARPEDDIFF_PREC_BITS]); out[4] = vld1q_s16( av1_warped_filter[(offset + 4 * stride) >> WARPEDDIFF_PREC_BITS]); out[5] = vld1q_s16( av1_warped_filter[(offset + 5 * stride) >> WARPEDDIFF_PREC_BITS]); out[6] = vld1q_s16( av1_warped_filter[(offset + 6 * stride) >> WARPEDDIFF_PREC_BITS]); out[7] = vld1q_s16( av1_warped_filter[(offset + 7 * stride) >> WARPEDDIFF_PREC_BITS]); } static AOM_FORCE_INLINE int clamp_iy(int iy, int height) { return clamp(iy, 0, height - 1); } static AOM_FORCE_INLINE void warp_affine_horizontal( const uint8_t *ref, int width, int height, int 
stride, int p_width, int p_height, int16_t alpha, int16_t beta, const int64_t x4, const int64_t y4, const int i, int16x8_t tmp[]) { const int bd = 8; const int reduce_bits_horiz = ROUND0_BITS; const int height_limit = AOMMIN(8, p_height - i) + 7; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); if (ix4 <= -7) { for (int k = 0; k < height_limit; ++k) { int iy = clamp_iy(iy4 + k - 7, height); int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); tmp[k] = vdupq_n_s16(dup_val); } return; } else if (ix4 >= width + 6) { for (int k = 0; k < height_limit; ++k) { int iy = clamp_iy(iy4 + k - 7, height); int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - reduce_bits_horiz)); tmp[k] = vdupq_n_s16(dup_val); } return; } static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; const uint8x16_t indx = vld1q_u8(kIotaArr); const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; #define APPLY_HORIZONTAL_SHIFT(fn, ...) \ do { \ if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ for (int k = 0; k < height_limit; ++k) { \ const int iy = clamp_iy(iy4 + k - 7, height); \ const uint8_t *src = ref + iy * stride + ix4 - 7; \ uint8x16_t src_1 = vld1q_u8(src); \ \ if (out_of_boundary_left >= 0) { \ int limit = out_of_boundary_left + 1; \ uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \ src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ } \ if (out_of_boundary_right >= 0) { \ int limit = 15 - (out_of_boundary_right + 1); \ uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \ uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \ src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ } \ tmp[k] = (fn)(src_1, __VA_ARGS__); \ } \ } else { \ for (int k = 0; k < height_limit; ++k) { \ const int iy = clamp_iy(iy4 + k - 7, height); \ const uint8_t *src = ref + iy * stride + ix4 - 7; \ uint8x16_t src_1 = vld1q_u8(src); \ tmp[k] = (fn)(src_1, __VA_ARGS__); \ } \ } \ } while (0) if (p_width == 4) { if (beta == 0) { if (alpha == 0) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); } } else { if (alpha == 0) { APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, (sx4 + beta * (k - 3))); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), alpha); } } } else { if (beta == 0) { if (alpha == 0) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx4 >> WARPEDDIFF_PREC_BITS]); APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1_beta0, f_s16); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); } } else { if (alpha == 0) { APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, (sx4 + beta * (k - 3))); } else { APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), alpha); } } } } static AOM_FORCE_INLINE void warp_affine_vertical( uint8_t *pred, int p_width, int 
p_height, int p_stride, int is_compound, uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j, int16x8_t tmp[], const int fwd, const int bwd) { const int bd = 8; const int reduce_bits_horiz = ROUND0_BITS; const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; int add_const_vert; if (is_compound) { add_const_vert = (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1)); } else { add_const_vert = (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1)); } const int sub_constant = (1 << (bd - 1)) + (1 << bd); const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; const int res_sub_const = (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS)) - (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); if (p_width > 4) { for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); const int16x8_t *v_src = tmp + (k + 4); int32x4_t res_lo, res_hi; if (gamma == 0) { vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy); } else { vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma); } res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert)); if (is_compound) { uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; int16x8_t res_s16 = vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS), vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS)); if (do_average) { int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p)); if (use_dist_wtd_comp_avg) { int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd); int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd); tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd); tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd); tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS), vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS)); } else { tmp16 = vhaddq_s16(tmp16, res_s16); } int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const)); uint8x8_t res8 = vqshrun_n_s16( res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); vst1_u8(&pred[(i + k + 4) * p_stride + j], res8); } else { vst1q_u16(p, vreinterpretq_u16_s16(res_s16)); } } else { int16x8_t res16 = vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS), vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS)); res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant)); uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; vst1_u8(p, vqmovun_s16(res16)); } } } else { // p_width == 4 for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); const int16x8_t *v_src = tmp + (k + 4); int32x4_t res_lo; if (gamma == 0) { vertical_filter_4x1_f1(v_src, &res_lo, sy); } else { vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma); } res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); if (is_compound) { uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS); if (do_average) { uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p)); if (use_dist_wtd_comp_avg) { int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd); tmp32_lo = 
vmlal_n_s16(tmp32_lo, res_lo_s16, bwd); tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); } else { tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16); } int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const)); uint8x8_t res8 = vqshrun_n_s16( vcombine_s16(res, vdup_n_s16(0)), 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0); } else { uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16); vst1_u16(p, res_u16_low); } } else { int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS); res16 = vsub_s16(res16, vdup_n_s16(sub_constant)); uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0))); vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0); } } } } static AOM_FORCE_INLINE void av1_warp_affine_common( const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const int is_compound = conv_params->is_compound; uint16_t *const dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; assert(IMPLIES(is_compound, dst != NULL)); assert(IMPLIES(do_average, is_compound)); for (int i = 0; i < p_height; i += 8) { for (int j = 0; j < p_width; j += 8) { const int32_t src_x = (p_col + j + 4) << subsampling_x; const int32_t src_y = (p_row + i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; int16x8_t tmp[15]; warp_affine_horizontal(ref, width, height, stride, p_width, p_height, alpha, beta, x4, y4, i, tmp); warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, dst_stride, do_average, use_dist_wtd_comp_avg, gamma, delta, y4, i, j, tmp, w0, w1); } } } #endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ aom-3.12.1/av1/common/arm/warp_plane_neon_i8mm.c000066400000000000000000000310101477627663500213770ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "warp_plane_neon.h" DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, int alpha) { // Only put the constant in every other lane to avoid double-counting when // performing the pairwise add later. 
const int32x4_t add_const = vreinterpretq_s32_u64(vdupq_n_u64(1 << (8 + FILTER_BITS - 1))); // Loading the 8 filter taps int16x8_t f[4]; load_filters_4(f, sx, alpha); int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); uint8x8_t in0 = vget_low_u8(in); uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); int32x4_t m01 = vusdotq_s32(add_const, vcombine_u8(in0, in1), f01_u8); int32x4_t m23 = vusdotq_s32(add_const, vcombine_u8(in2, in3), f23_u8); int32x4_t m0123 = vpaddq_s32(m01, m23); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, int alpha) { // Only put the constant in every other lane to avoid double-counting when // performing the pairwise add later. const int32x4_t add_const = vreinterpretq_s32_u64(vdupq_n_u64(1 << (8 + FILTER_BITS - 1))); // Loading the 8 filter taps int16x8_t f[8]; load_filters_8(f, sx, alpha); int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); uint8x8_t in0 = vget_low_u8(in); uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4)); uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); int32x4_t m01 = vusdotq_s32(add_const, vcombine_u8(in0, in1), f01_u8); int32x4_t m23 = vusdotq_s32(add_const, vcombine_u8(in2, in3), f23_u8); int32x4_t m45 = vusdotq_s32(add_const, vcombine_u8(in4, in5), f45_u8); int32x4_t m67 = vusdotq_s32(add_const, vcombine_u8(in6, in7), f67_u8); int32x4_t m0123 = vpaddq_s32(m01, m23); int32x4_t m4567 = vpaddq_s32(m45, m67); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vqrshrun_n_s32(m4567, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); int32x4_t m0123 = vusdotq_laneq_s32(add_const, in_0123, f_s8, 0); m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); int32x4_t m0123 = vusdotq_laneq_s32(add_const, in_0123, f_s8, 0); m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); int32x4_t m4567 = vusdotq_laneq_s32(add_const, in_4567, f_s8, 0); m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vqrshrun_n_s32(m4567, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); int16x4_t s3 = vget_low_s16(src[3]); int16x4_t s4 = vget_low_s16(src[4]); int16x4_t s5 = vget_low_s16(src[5]); int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); *res = m0123; } static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); int16x8_t f[4]; load_filters_4(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, 
vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; *res = horizontal_add_4d_s32x4(m0123_pairs); } static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); *res_low = m0123; *res_high = m4567; } static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy, int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t f[8]; load_filters_8(f, sy, gamma); int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; *res_low = horizontal_add_4d_s32x4(m0123_pairs); *res_high = horizontal_add_4d_s32x4(m4567_pairs); } 
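// Note: the entry point below only forwards its arguments to
// av1_warp_affine_common(), which is defined as a static force-inline
// function in warp_plane_neon.h. Since that header also forward-declares the
// horizontal/vertical filter kernels, each extension-specific translation
// unit (Neon, Neon I8MM, SVE) that includes it gets its own fully
// specialized copy of the outer warp loop built on the kernels defined
// above.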
void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params, alpha, beta, gamma, delta); } aom-3.12.1/av1/common/arm/warp_plane_sve.c000066400000000000000000000274551477627663500203250ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "warp_plane_neon.h" DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, int alpha) { // Only put the constant in every other lane to avoid double-counting when // performing the pairwise add later. const int32x4_t add_const = vreinterpretq_s32_u64(vdupq_n_u64(1 << (8 + FILTER_BITS - 1))); // Loading the 8 filter taps int16x8_t f[4]; load_filters_4(f, sx, alpha); int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); uint8x8_t in0 = vget_low_u8(in); uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); int32x4_t m01 = vusdotq_s32(add_const, vcombine_u8(in0, in1), f01_u8); int32x4_t m23 = vusdotq_s32(add_const, vcombine_u8(in2, in3), f23_u8); int32x4_t m0123 = vpaddq_s32(m01, m23); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, int alpha) { // Only put the constant in every other lane to avoid double-counting when // performing the pairwise add later. 
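// Splatting the constant as a 64-bit value and reinterpreting it as four
// 32-bit lanes leaves it in lanes 0 and 2 only. Each vusdotq_s32 below then
// accumulates one half of an 8-tap dot product per 32-bit lane, and the
// final vpaddq_s32 sums adjacent lane pairs, so every output pixel picks up
// the offset constant exactly once.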
const int32x4_t add_const = vreinterpretq_s32_u64(vdupq_n_u64(1 << (8 + FILTER_BITS - 1))); // Loading the 8 filter taps int16x8_t f[8]; load_filters_8(f, sx, alpha); int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); uint8x8_t in0 = vget_low_u8(in); uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4)); uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); int32x4_t m01 = vusdotq_s32(add_const, vcombine_u8(in0, in1), f01_u8); int32x4_t m23 = vusdotq_s32(add_const, vcombine_u8(in2, in3), f23_u8); int32x4_t m45 = vusdotq_s32(add_const, vcombine_u8(in4, in5), f45_u8); int32x4_t m67 = vusdotq_s32(add_const, vcombine_u8(in6, in7), f67_u8); int32x4_t m0123 = vpaddq_s32(m01, m23); int32x4_t m4567 = vpaddq_s32(m45, m67); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vqrshrun_n_s32(m4567, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); // Permute samples ready for dot product. // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); int32x4_t m0123 = vusdotq_laneq_s32(add_const, in_0123, f_s8, 0); m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vdup_n_u16(0)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_4x1_f1_beta0(in, f_s16); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1_beta0(const uint8x16_t in, int16x8_t f_s16) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); // Permute samples ready for dot product. 
// { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); int32x4_t m0123 = vusdotq_laneq_s32(add_const, in_0123, f_s8, 0); m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); int32x4_t m4567 = vusdotq_laneq_s32(add_const, in_4567, f_s8, 0); m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(m0123, ROUND0_BITS), vqrshrun_n_s32(m4567, ROUND0_BITS)); return vreinterpretq_s16_u16(res); } static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { int16x8_t f_s16 = vld1q_s16(av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); return horizontal_filter_8x1_f1_beta0(in, f_s16); } static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); int16x4_t s3 = vget_low_s16(src[3]); int16x4_t s4 = vget_low_s16(src[4]); int16x4_t s5 = vget_low_s16(src[5]); int16x4_t s6 = vget_low_s16(src[6]); int16x4_t s7 = vget_low_s16(src[7]); int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); *res = m0123; } static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); int16x8_t f[4]; load_filters_4(f, sy, gamma); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; int16x8_t f = vld1q_s16(av1_warped_filter[sy >> WARPEDDIFF_PREC_BITS]); int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); int32x4_t m4567 = 
vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); *res_low = m0123; *res_high = m4567; } static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, int32x4_t *res_low, int32x4_t *res_high, int sy, int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; int16x8_t s3 = src[3]; int16x8_t s4 = src[4]; int16x8_t s5 = src[5]; int16x8_t s6 = src[6]; int16x8_t s7 = src[7]; transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); int16x8_t f[8]; load_filters_8(f, sy, gamma); int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); int64x2_t m01 = vpaddq_s64(m0, m1); int64x2_t m23 = vpaddq_s64(m2, m3); int64x2_t m45 = vpaddq_s64(m4, m5); int64x2_t m67 = vpaddq_s64(m6, m7); *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); } void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params, alpha, beta, gamma, delta); } aom-3.12.1/av1/common/arm/wiener_convolve_neon.c000066400000000000000000000322651477627663500215360ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/txfm_common.h" #include "aom_ports/mem.h" #include "av1/common/common.h" #include "av1/common/restoration.h" static inline uint16x8_t wiener_convolve5_8_2d_h( const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { // Since the Wiener filter is symmetric about the middle tap (tap 2) add // mirrored source elements before multiplying filter coefficients. 
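// Illustration, per output pixel (f1..f3 are x_filter lanes 1..3; tap 0 of
// the zero-padded 7-tap kernel is zero; "round" is the value in round_vec):
//   sum = round + f1 * (t0 + t4) + f2 * (t1 + t3) + f3 * t2
//   out = min((sum + (1 << (WIENER_ROUND0_BITS - 1))) >> WIENER_ROUND0_BITS,
//             im_max_val)
// The saturating narrow also clamps negative sums to zero; folding the
// symmetric taps first cuts five multiplies per pixel down to three.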
int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4)); int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3)); int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); return vminq_u16(res, im_max_val); } static inline void convolve_add_src_horiz_5tap_neon( const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { do { const uint8_t *s = src_ptr; uint16_t *d = dst_ptr; int width = w; do { uint8x8_t s0, s1, s2, s3, s4; load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } static inline uint16x8_t wiener_convolve7_8_2d_h( const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5, const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { // Since the Wiener filter is symmetric about the middle tap (tap 3) add // mirrored source elements before multiplying by filter coefficients. 
int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); return vminq_u16(res, im_max_val); } static inline void convolve_add_src_horiz_7tap_neon( const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, const int32x4_t round_vec, const uint16x8_t im_max_val) { do { const uint8_t *s = src_ptr; uint16_t *d = dst_ptr; int width = w; do { uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); vst1q_u16(d, d0); s += 8; d += 8; width -= 8; } while (width != 0); src_ptr += src_stride; dst_ptr += dst_stride; } while (--h != 0); } static inline uint8x8_t wiener_convolve5_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, const int32x4_t round_vec) { // Since the Wiener filter is symmetric about the middle tap (tap 2) add // mirrored source elements before multiplying by filter coefficients. 
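// Note: the shift applied in this kernel is 2 * FILTER_BITS -
// WIENER_ROUND0_BITS, and round_vec is expected to fold in both the rounding
// term for that shift and a negative correction that cancels the offset
// added during the horizontal pass (see vert_round_vec in
// av1_wiener_convolve_add_src_neon below), so a plain narrowing shift
// followed by a saturating narrow to u8 is all that remains.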
int16x8_t s04 = vaddq_s16(s0, s4); int16x8_t s13 = vaddq_s16(s1, s3); int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3); int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3); int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); return vqmovun_s16(vcombine_s16(res_lo, res_hi)); } static inline void convolve_add_src_vert_5tap_neon( const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, const int32x4_t round_vec) { do { const int16_t *s = (int16_t *)src; uint8_t *d = dst; int height = h; while (height > 3) { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); uint8x8_t d0 = wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); uint8x8_t d1 = wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec); uint8x8_t d2 = wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec); uint8x8_t d3 = wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height-- != 0) { int16x8_t s0, s1, s2, s3, s4; load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); uint8x8_t d0 = wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); vst1_u8(d, d0); d += dst_stride; s += src_stride; } src += 8; dst += 8; w -= 8; } while (w != 0); } static inline uint8x8_t wiener_convolve7_8_2d_v( const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) { // Since the Wiener filter is symmetric about the middle tap (tap 3) add // mirrored source elements before multiplying by filter coefficients. 
int16x8_t s06 = vaddq_s16(s0, s6); int16x8_t s15 = vaddq_s16(s1, s5); int16x8_t s24 = vaddq_s16(s2, s4); int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2); sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3); int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2); sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3); int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); return vqmovun_s16(vcombine_s16(res_lo, res_hi)); } static inline void convolve_add_src_vert_7tap_neon( const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, const int32x4_t round_vec) { do { const int16_t *s = (int16_t *)src; uint8_t *d = dst; int height = h; while (height > 3) { int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9); uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec); uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec); uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec); uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 4 * src_stride; d += 4 * dst_stride; height -= 4; } while (height-- != 0) { int16x8_t s0, s1, s2, s3, s4, s5, s6; load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec); vst1_u8(d, d0); d += dst_stride; s += src_stride; } src += 8; dst += 8; w -= 8; } while (w != 0); } static inline int get_wiener_filter_taps(const int16_t *filter) { assert(filter[7] == 0); if (filter[0] == 0 && filter[6] == 0) { return WIENER_WIN_REDUCED; } return WIENER_WIN; } // Wiener filter 2D // Apply horizontal filter and store in a temporary buffer. When applying // vertical filter, overwrite the original pixel values. void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4, const int16_t *y_filter, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params) { (void)x_step_q4; (void)y_step_q4; (void)conv_params; assert(w % 8 == 0); assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); assert(x_step_q4 == 16 && y_step_q4 == 16); assert(x_filter[7] == 0 && y_filter[7] == 0); // For bd == 8, assert horizontal filtering output will not exceed 15-bit: assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15); DECLARE_ALIGNED(16, uint16_t, im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); const int x_filter_taps = get_wiener_filter_taps(x_filter); const int y_filter_taps = get_wiener_filter_taps(y_filter); int16x4_t x_filter_s16 = vld1_s16(x_filter); int16x4_t y_filter_s16 = vld1_s16(y_filter); // Add 128 to tap 3. (Needed for rounding.) 
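// vcreate_s16(128ULL << 48) places 128 (== 1 << FILTER_BITS) in lane 3 only,
// i.e. on the centre tap. As far as can be told from the reference "add src"
// convolution path, the signalled Wiener centre tap carries an implicit +128
// (the filter step), and adding it here folds the "add the source pixel
// back" step of the Wiener filter into the centre tap itself.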
x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); const int im_stride = MAX_SB_SIZE; const int im_h = h + y_filter_taps - 1; const int horiz_offset = x_filter_taps / 2; const int vert_offset = (y_filter_taps / 2) * (int)src_stride; const int bd = 8; const uint16x8_t im_max_val = vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1); const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); const int32x4_t vert_round_vec = vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) - (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1))); if (x_filter_taps == WIENER_WIN_REDUCED) { convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } else { convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, im_h, x_filter_s16, horiz_round_vec, im_max_val); } if (y_filter_taps == WIENER_WIN_REDUCED) { convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec); } else { convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h, y_filter_s16, vert_round_vec); } } aom-3.12.1/av1/common/av1_common_int.h000066400000000000000000002010461477627663500174450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ #define AOM_AV1_COMMON_AV1_COMMON_INT_H_ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/internal/aom_codec_internal.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/enums.h" #include "av1/common/frame_buffers.h" #include "av1/common/mv.h" #include "av1/common/quant_common.h" #include "av1/common/restoration.h" #include "av1/common/tile_common.h" #include "av1/common/timing.h" #include "aom_dsp/grain_params.h" #include "aom_dsp/grain_table.h" #include "aom_dsp/odintrin.h" #ifdef __cplusplus extern "C" { #endif #if defined(__clang__) && defined(__has_warning) #if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") #define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT #endif #elif defined(__GNUC__) && __GNUC__ >= 7 #define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT #endif #ifndef AOM_FALLTHROUGH_INTENDED #define AOM_FALLTHROUGH_INTENDED \ do { \ } while (0) #endif #define CDEF_MAX_STRENGTHS 16 /* Constant values while waiting for the sequence header */ #define FRAME_ID_LENGTH 15 #define DELTA_FRAME_ID_LENGTH 14 #define FRAME_CONTEXTS (FRAME_BUFFERS + 1) // Extra frame context which is always kept at default values #define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) #define PRIMARY_REF_BITS 3 #define PRIMARY_REF_NONE 7 #define NUM_PING_PONG_BUFFERS 2 #define MAX_NUM_TEMPORAL_LAYERS 8 #define MAX_NUM_SPATIAL_LAYERS 4 /* clang-format off */ // clang-format seems to think this is a pointer dereference and not a // multiplication. #define MAX_NUM_OPERATING_POINTS \ (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) /* clang-format on */ // TODO(jingning): Turning this on to set up transform coefficient // processing timer. #define TXCOEFF_TIMER 0 #define TXCOEFF_COST_TIMER 0 /*!\cond */ enum { SINGLE_REFERENCE = 0, COMPOUND_REFERENCE = 1, REFERENCE_MODE_SELECT = 2, REFERENCE_MODES = 3, } UENUM1BYTE(REFERENCE_MODE); enum { /** * Frame context updates are disabled */ REFRESH_FRAME_CONTEXT_DISABLED, /** * Update frame context to values resulting from backward probability * updates based on entropy/counts in the decoded frame */ REFRESH_FRAME_CONTEXT_BACKWARD, } UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); #define MFMV_STACK_SIZE 3 typedef struct { int_mv mfmv0; uint8_t ref_frame_offset; } TPL_MV_REF; typedef struct { int_mv mv; MV_REFERENCE_FRAME ref_frame; } MV_REF; typedef struct RefCntBuffer { // For a RefCntBuffer, the following are reference-holding variables: // - cm->ref_frame_map[] // - cm->cur_frame // - cm->scaled_ref_buf[] (encoder only) // - pbi->output_frame_index[] (decoder only) // With that definition, 'ref_count' is the number of reference-holding // variables that are currently referencing this buffer. // For example: // - suppose this buffer is at index 'k' in the buffer pool, and // - Total 'n' of the variables / array elements above have value 'k' (that // is, they are pointing to buffer at index 'k'). // Then, pool->frame_bufs[k].ref_count = n. 
int ref_count; unsigned int order_hint; unsigned int ref_order_hints[INTER_REFS_PER_FRAME]; // These variables are used only in encoder and compare the absolute // display order hint to compute the relative distance and overcome // the limitation of get_relative_dist() which returns incorrect // distance when a very old frame is used as a reference. unsigned int display_order_hint; unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; // Frame's level within the hierarchical structure. unsigned int pyramid_level; MV_REF *mvs; uint8_t *seg_map; struct segmentation seg; int mi_rows; int mi_cols; // Width and height give the size of the buffer (before any upscaling, unlike // the sizes that can be derived from the buf structure) int width; int height; WarpedMotionParams global_motion[REF_FRAMES]; int showable_frame; // frame can be used as show existing frame in future uint8_t film_grain_params_present; aom_film_grain_t film_grain_params; aom_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; int temporal_id; // Temporal layer ID of the frame int spatial_id; // Spatial layer ID of the frame FRAME_TYPE frame_type; // This is only used in the encoder but needs to be indexed per ref frame // so it's extremely convenient to keep it here. int interp_filter_selected[SWITCHABLE]; // Inter frame reference frame delta for loop filter int8_t ref_deltas[REF_FRAMES]; // 0 = ZERO_MV, MV int8_t mode_deltas[MAX_MODE_LF_DELTAS]; FRAME_CONTEXT frame_context; } RefCntBuffer; typedef struct BufferPool { // Protect BufferPool from being accessed by several FrameWorkers at // the same time during frame parallel decode. // TODO(hkuang): Try to use atomic variable instead of locking the whole pool. // TODO(wtc): Remove this. See // https://chromium-review.googlesource.com/c/webm/libvpx/+/560630. #if CONFIG_MULTITHREAD pthread_mutex_t pool_mutex; #endif // Private data associated with the frame buffer callbacks. void *cb_priv; aom_get_frame_buffer_cb_fn_t get_fb_cb; aom_release_frame_buffer_cb_fn_t release_fb_cb; RefCntBuffer *frame_bufs; uint8_t num_frame_bufs; // Frame buffers allocated internally by the codec. InternalFrameBufferList int_frame_buffers; } BufferPool; /*!\endcond */ /*!\brief Parameters related to CDEF */ typedef struct { //! CDEF column line buffer uint16_t *colbuf[MAX_MB_PLANE]; //! CDEF top & bottom line buffer uint16_t *linebuf[MAX_MB_PLANE]; //! CDEF intermediate buffer uint16_t *srcbuf; //! CDEF column line buffer sizes size_t allocated_colbuf_size[MAX_MB_PLANE]; //! CDEF top and bottom line buffer sizes size_t allocated_linebuf_size[MAX_MB_PLANE]; //! CDEF intermediate buffer size size_t allocated_srcbuf_size; //! CDEF damping factor int cdef_damping; //! Number of CDEF strength values int nb_cdef_strengths; //! CDEF strength values for luma int cdef_strengths[CDEF_MAX_STRENGTHS]; //! CDEF strength values for chroma int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; //! Number of CDEF strength values in bits int cdef_bits; //! Number of rows in the frame in 4 pixel int allocated_mi_rows; //! 
Number of CDEF workers int allocated_num_workers; } CdefInfo; /*!\cond */ typedef struct { int delta_q_present_flag; // Resolution of delta quant int delta_q_res; int delta_lf_present_flag; // Resolution of delta lf level int delta_lf_res; // This is a flag for number of deltas of loop filter level // 0: use 1 delta, for y_vertical, y_horizontal, u, and v // 1: use separate deltas for each filter level int delta_lf_multi; } DeltaQInfo; typedef struct { int enable_order_hint; // 0 - disable order hint, and related tools int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, // frame_sign_bias // if 0, enable_dist_wtd_comp and // enable_ref_frame_mvs must be set as 0. int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes // 1 - enable it int enable_ref_frame_mvs; // 0 - disable ref frame mvs // 1 - enable it } OrderHintInfo; // Sequence header structure. // Note: All syntax elements of sequence_header_obu that need to be // bit-identical across multiple sequence headers must be part of this struct, // so that consistency is checked by are_seq_headers_consistent() function. // One exception is the last member 'op_params' that is ignored by // are_seq_headers_consistent() function. typedef struct SequenceHeader { int num_bits_width; int num_bits_height; int max_frame_width; int max_frame_height; // Whether current and reference frame IDs are signaled in the bitstream. // Frame id numbers are additional information that do not affect the // decoding process, but provide decoders with a way of detecting missing // reference frames so that appropriate action can be taken. uint8_t frame_id_numbers_present_flag; int frame_id_length; int delta_frame_id_length; BLOCK_SIZE sb_size; // Size of the superblock used for this frame int mib_size; // Size of the superblock in units of MI blocks int mib_size_log2; // Log 2 of above. OrderHintInfo order_hint_info; uint8_t force_screen_content_tools; // 0 - force off // 1 - force on // 2 - adaptive uint8_t still_picture; // Video is a single frame still picture uint8_t reduced_still_picture_hdr; // Use reduced header for still picture uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel // 1 - force to integer // 2 - adaptive uint8_t enable_filter_intra; // enables/disables filterintra uint8_t enable_intra_edge_filter; // enables/disables edge upsampling uint8_t enable_interintra_compound; // enables/disables interintra_compound uint8_t enable_masked_compound; // enables/disables masked compound uint8_t enable_dual_filter; // 0 - disable dual interpolation filter // 1 - enable vert/horz filter selection uint8_t enable_warped_motion; // 0 - disable warp for the sequence // 1 - enable warp for the sequence uint8_t enable_superres; // 0 - Disable superres for the sequence // and no frame level superres flag // 1 - Enable superres for the sequence // enable per-frame superres flag uint8_t enable_cdef; // To turn on/off CDEF uint8_t enable_restoration; // To turn on/off loop restoration BITSTREAM_PROFILE profile; // Color config. aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. 
uint8_t monochrome; // Monochrome video aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; int color_range; int subsampling_x; // Chroma subsampling for x int subsampling_y; // Chroma subsampling for y aom_chroma_sample_position_t chroma_sample_position; uint8_t separate_uv_delta_q; uint8_t film_grain_params_present; // Operating point info. int operating_points_cnt_minus_1; int operating_point_idc[MAX_NUM_OPERATING_POINTS]; // True if operating_point_idc[op] is not equal to 0 for any value of op from // 0 to operating_points_cnt_minus_1. bool has_nonzero_operating_point_idc; int timing_info_present; aom_timing_info_t timing_info; uint8_t decoder_model_info_present_flag; aom_dec_model_info_t decoder_model_info; uint8_t display_model_info_present_flag; AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1. // IMPORTANT: the op_params member must be at the end of the struct so that // are_seq_headers_consistent() can be implemented with a memcmp() call. // TODO(urvang): We probably don't need the +1 here. aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; } SequenceHeader; typedef struct { int skip_mode_allowed; int skip_mode_flag; int ref_frame_idx_0; int ref_frame_idx_1; } SkipModeInfo; typedef struct { FRAME_TYPE frame_type; REFERENCE_MODE reference_mode; unsigned int order_hint; unsigned int display_order_hint; // Frame's level within the hierarchical structure. unsigned int pyramid_level; unsigned int frame_number; SkipModeInfo skip_mode_info; int refresh_frame_flags; // Which ref frames are overwritten by this frame int frame_refs_short_signaling; } CurrentFrame; /*!\endcond */ /*! * \brief Frame level features. */ typedef struct { /*! * If true, CDF update in the symbol encoding/decoding process is disabled. */ bool disable_cdf_update; /*! * If true, motion vectors are specified to eighth pel precision; and * if false, motion vectors are specified to quarter pel precision. */ bool allow_high_precision_mv; /*! * If true, force integer motion vectors; if false, use the default. */ bool cur_frame_force_integer_mv; /*! * If true, palette tool and/or intra block copy tools may be used. */ bool allow_screen_content_tools; bool allow_intrabc; /*!< If true, intra block copy tool may be used. */ bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */ /*! * If true, using previous frames' motion vectors for prediction is allowed. */ bool allow_ref_frame_mvs; /*! * If true, frame is fully lossless at coded resolution. * */ bool coded_lossless; /*! * If true, frame is fully lossless at upscaled resolution. */ bool all_lossless; /*! * If true, the frame is restricted to a reduced subset of the full set of * transform types. */ bool reduced_tx_set_used; /*! * If true, error resilient mode is enabled. * Note: Error resilient mode allows the syntax of a frame to be parsed * independently of previously decoded frames. */ bool error_resilient_mode; /*! * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION; * if true, all MOTION_MODES may be used. */ bool switchable_motion_mode; TX_MODE tx_mode; /*!< Transform mode at frame level. */ InterpFilter interp_filter; /*!< Interpolation filter at frame level. */ /*! * The reference frame that contains the CDF values and other state that * should be loaded at the start of the frame. */ int primary_ref_frame; /*! 
* Byte alignment of the planes in the reference buffers. */ int byte_alignment; /*! * Flag signaling how frame contexts should be updated at the end of * a frame decode. */ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; } FeatureFlags; /*! * \brief Params related to tiles. */ typedef struct CommonTileParams { int cols; /*!< number of tile columns that frame is divided into */ int rows; /*!< number of tile rows that frame is divided into */ int max_width_sb; /*!< maximum tile width in superblock units. */ int max_height_sb; /*!< maximum tile height in superblock units. */ /*! * Min width of non-rightmost tile in MI units. Only valid if cols > 1. */ int min_inner_width; /*! * If true, tiles are uniformly spaced with power-of-two number of rows and * columns. * If false, tiles have explicitly configured widths and heights. */ int uniform_spacing; /** * \name Members only valid when uniform_spacing == 1 */ /**@{*/ int log2_cols; /*!< log2 of 'cols'. */ int log2_rows; /*!< log2 of 'rows'. */ int width; /*!< tile width in MI units */ int height; /*!< tile height in MI units */ /**@}*/ /*! * Min num of tile columns possible based on 'max_width_sb' and frame width. */ int min_log2_cols; /*! * Min num of tile rows possible based on 'max_height_sb' and frame height. */ int min_log2_rows; /*! * Max num of tile columns possible based on frame width. */ int max_log2_cols; /*! * Max num of tile rows possible based on frame height. */ int max_log2_rows; /*! * log2 of min number of tiles (same as min_log2_cols + min_log2_rows). */ int min_log2; /*! * col_start_sb[i] is the start position of tile column i in superblock units. * valid for 0 <= i <= cols */ int col_start_sb[MAX_TILE_COLS + 1]; /*! * row_start_sb[i] is the start position of tile row i in superblock units. * valid for 0 <= i <= rows */ int row_start_sb[MAX_TILE_ROWS + 1]; /*! * If true, we are using large scale tile mode. */ unsigned int large_scale; /*! * Only relevant when large_scale == 1. * If true, the independent decoding of a single tile or a section of a frame * is allowed. */ unsigned int single_tile_decoding; } CommonTileParams; typedef struct CommonModeInfoParams CommonModeInfoParams; /*! * \brief Params related to MB_MODE_INFO arrays and related info. */ struct CommonModeInfoParams { /*! * Number of rows in the frame in 16 pixel units. * This is computed from frame height aligned to a multiple of 8. */ int mb_rows; /*! * Number of cols in the frame in 16 pixel units. * This is computed from frame width aligned to a multiple of 8. */ int mb_cols; /*! * Total MBs = mb_rows * mb_cols. */ int MBs; /*! * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units. * This is computed from frame height aligned to a multiple of 8. */ int mi_rows; /*! * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units. * This is computed from frame width aligned to a multiple of 8. */ int mi_cols; /*! * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block * in the frame. * Note: This array should be treated like a scratch memory, and should NOT be * accessed directly, in most cases. Please use 'mi_grid_base' array instead. */ MB_MODE_INFO *mi_alloc; /*! * Number of allocated elements in 'mi_alloc'. */ int mi_alloc_size; /*! * Stride for 'mi_alloc' array. */ int mi_alloc_stride; /*! * The minimum block size that each element in 'mi_alloc' can correspond to. * For decoder, this is always BLOCK_4X4. * For encoder, this is BLOCK_8X8 for resolution >= 4k case or REALTIME mode * case. Otherwise, this is BLOCK_4X4. 
*/ BLOCK_SIZE mi_alloc_bsize; /*! * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'. * It's possible that: * - Multiple pointers in the grid point to the same element in 'mi_alloc' * (for example, for all 4x4 blocks that belong to the same partition block). * - Some pointers can be NULL (for example, for blocks outside visible area). */ MB_MODE_INFO **mi_grid_base; /*! * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also). */ int mi_grid_size; /*! * Stride for 'mi_grid_base' (and 'tx_type_map' also). */ int mi_stride; /*! * An array of tx types for each 4x4 block in the frame. * Number of allocated elements is same as 'mi_grid_size', and stride is * same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of * 'mi_grid_base'. */ TX_TYPE *tx_type_map; /** * \name Function pointers to allow separate logic for encoder and decoder. */ /**@{*/ /*! * Free the memory allocated to arrays in 'mi_params'. * \param[in,out] mi_params object containing common mode info parameters */ void (*free_mi)(struct CommonModeInfoParams *mi_params); /*! * Initialize / reset appropriate arrays in 'mi_params'. * \param[in,out] mi_params object containing common mode info parameters */ void (*setup_mi)(struct CommonModeInfoParams *mi_params); /*! * Allocate required memory for arrays in 'mi_params'. * \param[in,out] mi_params object containing common mode info * parameters * \param width frame width * \param height frame height * \param min_partition_size minimum partition size allowed while * encoding */ void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width, int height, BLOCK_SIZE min_partition_size); /**@}*/ }; typedef struct CommonQuantParams CommonQuantParams; /*! * \brief Parameters related to quantization at the frame level. */ struct CommonQuantParams { /*! * Base qindex of the frame in the range 0 to 255. */ int base_qindex; /*! * Delta of qindex (from base_qindex) for Y plane DC coefficient. * Note: y_ac_delta_q is implicitly 0. */ int y_dc_delta_q; /*! * Delta of qindex (from base_qindex) for U plane DC coefficients. */ int u_dc_delta_q; /*! * Delta of qindex (from base_qindex) for U plane AC coefficients. */ int u_ac_delta_q; /*! * Delta of qindex (from base_qindex) for V plane DC coefficients. * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int v_dc_delta_q; /*! * Delta of qindex (from base_qindex) for V plane AC coefficients. * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int v_ac_delta_q; /* * Note: The qindex per superblock may have a delta from the qindex obtained * at frame level from parameters above, based on 'cm->delta_q_info'. */ /** * \name True dequantizers. * The dequantizers below are true dequantizers used only in the * dequantization process. They have the same coefficient * shift/scale as TX. */ /**@{*/ int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */ int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */ int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */ /**@}*/ /** * \name Global quantization matrix tables. */ /**@{*/ /*! * Global dequantization matrix table. */ const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; /*! * Global quantization matrix table. */ const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; /**@}*/ /** * \name Local dequantization matrix tables for each frame. */ /**@{*/ /*! * Local dequant matrix for Y plane. */ const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; /*!
* Local dequant matrix for U plane. */ const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; /*! * Local dequant matrix for V plane. */ const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; /**@}*/ /*! * Flag indicating whether quantization matrices are being used: * - If true, qm_level_y, qm_level_u and qm_level_v indicate the level * indices to be used to access appropriate global quant matrix tables. * - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'. */ bool using_qmatrix; /** * \name Valid only when using_qmatrix == true * Indicate the level indices to be used to access appropriate global quant * matrix tables. */ /**@{*/ int qmatrix_level_y; /*!< Level index for Y plane */ int qmatrix_level_u; /*!< Level index for U plane */ int qmatrix_level_v; /*!< Level index for V plane */ /**@}*/ }; typedef struct CommonContexts CommonContexts; /*! * \brief Contexts used for transmitting various symbols in the bitstream. */ struct CommonContexts { /*! * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type. * partition[i][j] is the context for ith tile row, jth mi_col. */ PARTITION_CONTEXT **partition; /*! * Context used to derive context for multiple symbols: * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit * to transmit skip_txfm flag. * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit * sign. * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col. */ ENTROPY_CONTEXT **entropy[MAX_MB_PLANE]; /*! * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to * transmit 'is_split' flag to indicate if this transform block should be * split into smaller sub-blocks. * txfm[i][j] is the context for ith tile row, jth mi_col. */ TXFM_CONTEXT **txfm; /*! * Dimensions that were used to allocate the arrays above. * If these dimensions change, the arrays may have to be re-allocated. */ int num_planes; /*!< Corresponds to av1_num_planes(cm) */ int num_tile_rows; /*!< Corresponds to cm->tiles.row */ int num_mi_cols; /*!< Corresponds to cm->mi_params.mi_cols */ }; /*! * \brief Top level common structure used by both encoder and decoder. */ typedef struct AV1Common { /*! * Information about the current frame that is being coded. */ CurrentFrame current_frame; /*! * Code and details about current error status. */ struct aom_internal_error_info *error; /*! * AV1 allows two types of frame scaling operations: * 1. Frame super-resolution: that allows coding a frame at lower resolution * and after decoding the frame, normatively scales and restores the frame -- * inside the coding loop. * 2. Frame resize: that allows coding frame at lower/higher resolution, and * then non-normatively upscale the frame at the time of rendering -- outside * the coding loop. * Hence, the need for 3 types of dimensions. */ /** * \name Coded frame dimensions. */ /**@{*/ int width; /*!< Coded frame width */ int height; /*!< Coded frame height */ /**@}*/ /** * \name Rendered frame dimensions. * Dimensions after applying both super-resolution and resize to the coded * frame. Different from coded dimensions if super-resolution and/or resize * are being used for this frame. */ /**@{*/ int render_width; /*!< Rendered frame width */ int render_height; /*!< Rendered frame height */ /**@}*/ /** * \name Super-resolved frame dimensions. * Frame dimensions after applying super-resolution to the coded frame (if * present), but before applying resize. 
* Larger than the coded dimensions if super-resolution is being used for * this frame. * Different from rendered dimensions if resize is being used for this frame. */ /**@{*/ int superres_upscaled_width; /*!< Super-resolved frame width */ int superres_upscaled_height; /*!< Super-resolved frame height */ /**@}*/ /*! * The denominator of the superres scale used by this frame. * Note: The numerator is fixed to be SCALE_NUMERATOR. */ uint8_t superres_scale_denominator; /*! * buffer_removal_times[op_num] specifies the frame removal time in units of * DecCT clock ticks counted from the removal time of the last random access * point for operating point op_num. * TODO(urvang): We probably don't need the +1 here. */ uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1]; /*! * Presentation time of the frame in clock ticks DispCT counted from the * removal time of the last random access point for the operating point that * is being decoded. */ uint32_t frame_presentation_time; /*! * Buffer where previous frame is stored. */ RefCntBuffer *prev_frame; /*! * Buffer into which the current frame will be stored and other related info. * TODO(hkuang): Combine this with cur_buf in macroblockd. */ RefCntBuffer *cur_frame; /*! * For encoder, we have a two-level mapping from reference frame type to the * corresponding buffer in the buffer pool: * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to * the reference counted buffer structure RefCntBuffer, taken from the buffer * pool cm->buffer_pool->frame_bufs. * * LAST_FRAME, ..., EXTREF_FRAME * | | * v v * remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] * | | * v v * ref_frame_map[], ..., ref_frame_map[] * * Note: INTRA_FRAME always refers to the current frame, so there's no need to * have a remapped index for the same. */ int remapped_ref_idx[REF_FRAMES]; /*! * Scale of the current frame with respect to itself. * This is currently used for intra block copy, which behaves like an inter * prediction mode, where the reference frame is the current frame itself. */ struct scale_factors sf_identity; /*! * Scale factors of the reference frame with respect to the current frame. * This is required for generating inter prediction and will be non-identity * for a reference frame, if it has different dimensions than the coded * dimensions of the current frame. */ struct scale_factors ref_scale_factors[REF_FRAMES]; /*! * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to * the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps * remapped reference index 'j' (that is, original reference type 'i') to * a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. */ RefCntBuffer *ref_frame_map[REF_FRAMES]; /*! * If true, this frame is actually shown after decoding. * If false, this frame is coded in the bitstream, but not shown. It is only * used as a reference for other frames coded later. */ int show_frame; /*! * If true, this frame can be used as a show-existing frame for other frames * coded later. * When 'show_frame' is true, this is always true for all non-keyframes. * When 'show_frame' is false, this value is transmitted in the bitstream. */ int showable_frame; /*! * If true, show an existing frame coded before, instead of actually coding a * frame. 
The existing frame comes from one of the existing reference buffers, * as signaled in the bitstream. */ int show_existing_frame; /*! * Whether some features are allowed or not. */ FeatureFlags features; /*! * Params related to MB_MODE_INFO arrays and related info. */ CommonModeInfoParams mi_params; #if CONFIG_ENTROPY_STATS /*! * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1). */ int coef_cdf_category; #endif // CONFIG_ENTROPY_STATS /*! * Quantization params. */ CommonQuantParams quant_params; /*! * Segmentation info for current frame. */ struct segmentation seg; /*! * Segmentation map for previous frame. */ uint8_t *last_frame_seg_map; /** * \name Deblocking filter parameters. */ /**@{*/ loop_filter_info_n lf_info; /*!< Loop filter info */ struct loopfilter lf; /*!< Loop filter parameters */ /**@}*/ /** * \name Loop Restoration filter parameters. */ /**@{*/ RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */ int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */ RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */ YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */ /**@}*/ /*! * CDEF (Constrained Directional Enhancement Filter) parameters. */ CdefInfo cdef_info; /*! * Parameters for film grain synthesis. */ aom_film_grain_t film_grain_params; /*! * Parameters for delta quantization and delta loop filter level. */ DeltaQInfo delta_q_info; /*! * Global motion parameters for each reference frame. */ WarpedMotionParams global_motion[REF_FRAMES]; /*! * Elements part of the sequence header, that are applicable for all the * frames in the video. */ SequenceHeader *seq_params; /*! * Current CDFs of all the symbols for the current frame. */ FRAME_CONTEXT *fc; /*! * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE * (e.g. for a keyframe). These default CDFs are defined by the bitstream and * copied from default CDF tables for each symbol. */ FRAME_CONTEXT *default_frame_context; /*! * Parameters related to tiling. */ CommonTileParams tiles; /*! * External BufferPool passed from outside. */ BufferPool *buffer_pool; /*! * Above context buffers and their sizes. * Note: above contexts are allocated in this struct, as their size is * dependent on frame width, while left contexts are declared and allocated in * MACROBLOCKD struct, as they have a fixed size. */ CommonContexts above_contexts; /** * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1 */ /**@{*/ int current_frame_id; /*!< frame ID for the current frame. */ int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */ /**@}*/ /*! * Motion vectors provided by motion field estimation. * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where: * mi_row = 2 * row, * mi_col = 2 * col, and * stride = cm->mi_params.mi_stride / 2 */ TPL_MV_REF *tpl_mvs; /*! * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function. */ int tpl_mvs_mem_size; /*! * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and * current frame is positive; and 0 otherwise. */ int ref_frame_sign_bias[REF_FRAMES]; /*! * ref_frame_side[k] is 1 if relative distance between reference 'k' and * current frame is positive, -1 if relative distance is 0; and 0 otherwise. * TODO(jingning): This can be combined with sign_bias later. */ int8_t ref_frame_side[REF_FRAMES]; /*! * Temporal layer ID of this frame * (in the range 0 ... (number_temporal_layers - 1)). 
*/ int temporal_layer_id; /*! * Spatial layer ID of this frame * (in the range 0 ... (number_spatial_layers - 1)). */ int spatial_layer_id; #if TXCOEFF_TIMER int64_t cum_txcoeff_timer; int64_t txcoeff_timer; int txb_count; #endif // TXCOEFF_TIMER #if TXCOEFF_COST_TIMER int64_t cum_txcoeff_cost_timer; int64_t txcoeff_cost_timer; int64_t txcoeff_cost_count; #endif // TXCOEFF_COST_TIMER } AV1_COMMON; /*!\cond */ // TODO(hkuang): Don't need to lock the whole pool after implementing atomic // frame reference count. static void lock_buffer_pool(BufferPool *const pool) { #if CONFIG_MULTITHREAD pthread_mutex_lock(&pool->pool_mutex); #else (void)pool; #endif } static void unlock_buffer_pool(BufferPool *const pool) { #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pool->pool_mutex); #else (void)pool; #endif } static inline YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] == NULL) return NULL; return &cm->ref_frame_map[index]->buf; } static inline int get_free_fb(AV1_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; lock_buffer_pool(cm->buffer_pool); const int num_frame_bufs = cm->buffer_pool->num_frame_bufs; for (i = 0; i < num_frame_bufs; ++i) if (frame_bufs[i].ref_count == 0) break; if (i != num_frame_bufs) { if (frame_bufs[i].buf.use_external_reference_buffers) { // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the // external reference buffers. Restore the buffer pointers to point to the // internally allocated memory. YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; ybf->y_buffer = ybf->store_buf_adr[0]; ybf->u_buffer = ybf->store_buf_adr[1]; ybf->v_buffer = ybf->store_buf_adr[2]; ybf->use_external_reference_buffers = 0; } frame_bufs[i].ref_count = 1; } else { // We should never run out of free buffers. If this assertion fails, there // is a reference leak. assert(0 && "Ran out of free frame buffers. Likely a reference leak."); // Reset i to be INVALID_IDX to indicate no free buffer found. i = INVALID_IDX; } unlock_buffer_pool(cm->buffer_pool); return i; } static inline RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { // Release the previously-used frame-buffer if (cm->cur_frame != NULL) { --cm->cur_frame->ref_count; cm->cur_frame = NULL; } // Assign a new framebuffer const int new_fb_idx = get_free_fb(cm); if (new_fb_idx == INVALID_IDX) return NULL; cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY aom_invalidate_pyramid(cm->cur_frame->buf.y_pyramid); av1_invalidate_corner_list(cm->cur_frame->buf.corners); #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY av1_zero(cm->cur_frame->interp_filter_selected); return cm->cur_frame; } // Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref // counts accordingly. static inline void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, RefCntBuffer *rhs_ptr) { RefCntBuffer *const old_ptr = *lhs_ptr; if (old_ptr != NULL) { assert(old_ptr->ref_count > 0); // One less reference to the buffer at 'old_ptr', so decrease ref count. --old_ptr->ref_count; } *lhs_ptr = rhs_ptr; // One more reference to the buffer at 'rhs_ptr', so increase ref count. 
++rhs_ptr->ref_count; } static inline int frame_is_intra_only(const AV1_COMMON *const cm) { return cm->current_frame.frame_type == KEY_FRAME || cm->current_frame.frame_type == INTRA_ONLY_FRAME; } static inline int frame_is_sframe(const AV1_COMMON *cm) { return cm->current_frame.frame_type == S_FRAME; } // These functions take a reference frame label between LAST_FRAME and // EXTREF_FRAME inclusive. Note that this is different to the indexing // previously used by the frame_refs[] array. static inline int get_ref_frame_map_idx(const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) ? cm->remapped_ref_idx[ref_frame - LAST_FRAME] : INVALID_IDX; } static inline RefCntBuffer *get_ref_frame_buf( const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; } // Both const and non-const versions of this function are provided so that it // can be used with a const AV1_COMMON if needed. static inline const struct scale_factors *get_ref_scale_factors_const( const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; } static inline struct scale_factors *get_ref_scale_factors( AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { const int map_idx = get_ref_frame_map_idx(cm, ref_frame); return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; } static inline RefCntBuffer *get_primary_ref_frame_buf( const AV1_COMMON *const cm) { const int primary_ref_frame = cm->features.primary_ref_frame; if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; } // Returns 1 if this frame might allow mvs from some reference frame. 
static inline int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && cm->seq_params->order_hint_info.enable_ref_frame_mvs && cm->seq_params->order_hint_info.enable_order_hint && !frame_is_intra_only(cm); } // Returns 1 if this frame might use warped_motion static inline int frame_might_allow_warped_motion(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && cm->seq_params->enable_warped_motion; } static inline void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { const int buf_rows = buf->mi_rows; const int buf_cols = buf->mi_cols; const CommonModeInfoParams *const mi_params = &cm->mi_params; if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || buf_cols != mi_params->mi_cols) { aom_free(buf->mvs); buf->mi_rows = mi_params->mi_rows; buf->mi_cols = mi_params->mi_cols; CHECK_MEM_ERROR(cm, buf->mvs, (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * ((mi_params->mi_cols + 1) >> 1), sizeof(*buf->mvs))); aom_free(buf->seg_map); CHECK_MEM_ERROR( cm, buf->seg_map, (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, sizeof(*buf->seg_map))); } const int mem_size = ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) { aom_free(cm->tpl_mvs); CHECK_MEM_ERROR(cm, cm->tpl_mvs, (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); cm->tpl_mvs_mem_size = mem_size; } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); #endif static inline int av1_num_planes(const AV1_COMMON *cm) { return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE; } static inline void av1_init_above_context(CommonContexts *above_contexts, int num_planes, int tile_row, MACROBLOCKD *xd) { for (int i = 0; i < num_planes; ++i) { xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; } xd->above_partition_context = above_contexts->partition[tile_row]; xd->above_txfm_context = above_contexts->txfm[tile_row]; } static inline void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { const int num_planes = av1_num_planes(cm); const CommonQuantParams *const quant_params = &cm->quant_params; for (int i = 0; i < num_planes; ++i) { if (xd->plane[i].plane_type == PLANE_TYPE_Y) { memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, sizeof(quant_params->y_dequant_QTX)); memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, sizeof(quant_params->y_iqmatrix)); } else { if (i == AOM_PLANE_U) { memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, sizeof(quant_params->u_dequant_QTX)); memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, sizeof(quant_params->u_iqmatrix)); } else { memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, sizeof(quant_params->v_dequant_QTX)); memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, sizeof(quant_params->v_iqmatrix)); } } } xd->mi_stride = cm->mi_params.mi_stride; xd->error_info = cm->error; #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER cfl_init(&xd->cfl, cm->seq_params); #endif } static inline void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, const int num_planes) { int i; int row_offset = mi_row; int col_offset = mi_col; for (i = 0; i < num_planes; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; // Offset the buffer pointer const BLOCK_SIZE bsize = xd->mi[0]->bsize; if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) row_offset = 
mi_row - 1; if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) col_offset = mi_col - 1; int above_idx = col_offset; int left_idx = row_offset & MAX_MIB_MASK; pd->above_entropy_context = &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; pd->left_entropy_context = &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; } } static inline int calc_mi_size(int len) { // len is in mi units. Align to a multiple of SBs. return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); } static inline void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, const int num_planes) { int i; for (i = 0; i < num_planes; i++) { xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); } } static inline void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int bh, int mi_col, int bw, int mi_rows, int mi_cols) { xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); xd->mi_row = mi_row; xd->mi_col = mi_col; // Are edges available for intra prediction? xd->up_available = (mi_row > tile->mi_row_start); const int ss_x = xd->plane[1].subsampling_x; const int ss_y = xd->plane[1].subsampling_y; xd->left_available = (mi_col > tile->mi_col_start); xd->chroma_up_available = xd->up_available; xd->chroma_left_available = xd->left_available; if (ss_x && bw < mi_size_wide[BLOCK_8X8]) xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; if (ss_y && bh < mi_size_high[BLOCK_8X8]) xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; if (xd->up_available) { xd->above_mbmi = xd->mi[-xd->mi_stride]; } else { xd->above_mbmi = NULL; } if (xd->left_available) { xd->left_mbmi = xd->mi[-1]; } else { xd->left_mbmi = NULL; } const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); xd->is_chroma_ref = chroma_ref; if (chroma_ref) { // To help calculate the "above" and "left" chroma blocks, note that the // current block may cover multiple luma blocks (e.g., if partitioned into // 4x4 luma blocks). // First, find the top-left-most luma block covered by this chroma block MB_MODE_INFO **base_mi = &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; // Then, we consider the luma region covered by the left or above 4x4 chroma // prediction. We want to point to the chroma reference block in that // region, which is the bottom-right-most mi unit. // This leads to the following offsets: MB_MODE_INFO *chroma_above_mi = xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; xd->chroma_above_mbmi = chroma_above_mi; MB_MODE_INFO *chroma_left_mi = xd->chroma_left_available ? 
base_mi[ss_y * xd->mi_stride - 1] : NULL; xd->chroma_left_mbmi = chroma_left_mi; } xd->height = bh; xd->width = bw; xd->is_last_vertical_rect = 0; if (xd->width < xd->height) { if (!((mi_col + xd->width) & (xd->height - 1))) { xd->is_last_vertical_rect = 1; } } xd->is_first_horizontal_rect = 0; if (xd->width > xd->height) if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1; } static inline aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, const MB_MODE_INFO *above_mi, const MB_MODE_INFO *left_mi) { const PREDICTION_MODE above = av1_above_block_mode(above_mi); const PREDICTION_MODE left = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[above]; const int left_ctx = intra_mode_context[left]; return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; } static inline void update_partition_context(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) { PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; PARTITION_CONTEXT *const left_ctx = xd->left_partition_context + (mi_row & MAX_MIB_MASK); const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; memset(above_ctx, partition_context_lookup[subsize].above, bw); memset(left_ctx, partition_context_lookup[subsize].left, bh); } static inline int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(bsize < BLOCK_SIZES_ALL); const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); return ref_pos; } static inline aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, size_t element) { assert(cdf != NULL); return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; } static inline void partition_gather_horz_alike(aom_cdf_prob *out, const aom_cdf_prob *const in, BLOCK_SIZE bsize) { (void)bsize; out[0] = CDF_PROB_TOP; out[0] -= cdf_element_prob(in, PARTITION_HORZ); out[0] -= cdf_element_prob(in, PARTITION_SPLIT); out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); out[0] -= cdf_element_prob(in, PARTITION_VERT_A); if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); out[0] = AOM_ICDF(out[0]); out[1] = AOM_ICDF(CDF_PROB_TOP); } static inline void partition_gather_vert_alike(aom_cdf_prob *out, const aom_cdf_prob *const in, BLOCK_SIZE bsize) { (void)bsize; out[0] = CDF_PROB_TOP; out[0] -= cdf_element_prob(in, PARTITION_VERT); out[0] -= cdf_element_prob(in, PARTITION_SPLIT); out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); out[0] -= cdf_element_prob(in, PARTITION_VERT_A); out[0] -= cdf_element_prob(in, PARTITION_VERT_B); if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); out[0] = AOM_ICDF(out[0]); out[1] = AOM_ICDF(CDF_PROB_TOP); } static inline void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize, PARTITION_TYPE partition) { if (bsize >= BLOCK_8X8) { const int hbs = mi_size_wide[bsize] / 2; BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); switch (partition) { case PARTITION_SPLIT: if (bsize != BLOCK_8X8) break; AOM_FALLTHROUGH_INTENDED; case PARTITION_NONE: case PARTITION_HORZ: case PARTITION_VERT: case PARTITION_HORZ_4: case PARTITION_VERT_4: update_partition_context(xd, mi_row, mi_col, subsize, bsize); break; case PARTITION_HORZ_A: update_partition_context(xd, mi_row, mi_col, bsize2, 
subsize); update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); break; case PARTITION_HORZ_B: update_partition_context(xd, mi_row, mi_col, subsize, subsize); update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); break; case PARTITION_VERT_A: update_partition_context(xd, mi_row, mi_col, bsize2, subsize); update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); break; case PARTITION_VERT_B: update_partition_context(xd, mi_row, mi_col, subsize, subsize); update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); break; default: assert(0 && "Invalid partition type"); } } } static inline int partition_plane_context(const MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; const PARTITION_CONTEXT *left_ctx = xd->left_partition_context + (mi_row & MAX_MIB_MASK); // Minimum partition point is 8x8. Offset the bsl accordingly. const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); assert(bsl >= 0); return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } // Return the number of elements in the partition CDF when // partitioning the (square) block with luma block size of bsize. static inline int partition_cdf_length(BLOCK_SIZE bsize) { if (bsize <= BLOCK_8X8) return PARTITION_TYPES; else if (bsize == BLOCK_128X128) return EXT_PARTITION_TYPES - 2; else return EXT_PARTITION_TYPES; } static inline int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { assert(bsize < BLOCK_SIZES_ALL); int max_blocks_wide = block_size_wide[bsize]; if (xd->mb_to_right_edge < 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); } // Scale the width in the transform block unit. return max_blocks_wide >> MI_SIZE_LOG2; } static inline int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { int max_blocks_high = block_size_high[bsize]; if (xd->mb_to_bottom_edge < 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); } // Scale the height in the transform block unit. 
return max_blocks_high >> MI_SIZE_LOG2; } static inline void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, int mi_col_start, int mi_col_end, const int tile_row) { const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); const int width = mi_col_end - mi_col_start; const int aligned_width = ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); const int offset_y = mi_col_start; const int width_y = aligned_width; const int offset_uv = offset_y >> seq_params->subsampling_x; const int width_uv = width_y >> seq_params->subsampling_x; CommonContexts *const above_contexts = &cm->above_contexts; av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y); if (num_planes > 1) { if (above_contexts->entropy[1][tile_row] && above_contexts->entropy[2][tile_row]) { av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv, width_uv); av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv, width_uv); } else { aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid value of planes"); } } av1_zero_array(above_contexts->partition[tile_row] + mi_col_start, aligned_width); memset(above_contexts->txfm[tile_row] + mi_col_start, tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); } static inline void av1_zero_left_context(MACROBLOCKD *const xd) { av1_zero(xd->left_entropy_context); av1_zero(xd->left_partition_context); memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST], sizeof(xd->left_txfm_context_buffer)); } static inline void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { int i; for (i = 0; i < len; ++i) txfm_ctx[i] = txs; } static inline void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, const MACROBLOCKD *xd) { uint8_t bw = tx_size_wide[tx_size]; uint8_t bh = tx_size_high[tx_size]; if (skip) { bw = n4_w * MI_SIZE; bh = n4_h * MI_SIZE; } set_txfm_ctx(xd->above_txfm_context, bw, n4_w); set_txfm_ctx(xd->left_txfm_context, bh, n4_h); } static inline int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { return mi_row * mi_params->mi_stride + mi_col; } static inline int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; const int mi_alloc_row = mi_row / mi_alloc_size_1d; const int mi_alloc_col = mi_col / mi_alloc_size_1d; return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col; } // For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. static inline void set_mi_offsets(const CommonModeInfoParams *const mi_params, MACROBLOCKD *const xd, int mi_row, int mi_col) { // 'mi_grid_base' should point to appropriate memory in 'mi'. const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; // 'xd->mi' should point to an offset in 'mi_grid_base'; xd->mi = mi_params->mi_grid_base + mi_grid_idx; // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'. 
xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx; xd->tx_type_map_stride = mi_params->mi_stride; } static inline void txfm_partition_update(TXFM_CONTEXT *above_ctx, TXFM_CONTEXT *left_ctx, TX_SIZE tx_size, TX_SIZE txb_size) { BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; int bh = mi_size_high[bsize]; int bw = mi_size_wide[bsize]; uint8_t txw = tx_size_wide[tx_size]; uint8_t txh = tx_size_high[tx_size]; int i; for (i = 0; i < bh; ++i) left_ctx[i] = txh; for (i = 0; i < bw; ++i) above_ctx[i] = txw; } static inline TX_SIZE get_sqr_tx_size(int tx_dim) { switch (tx_dim) { case 128: case 64: return TX_64X64; break; case 32: return TX_32X32; break; case 16: return TX_16X16; break; case 8: return TX_8X8; break; default: return TX_4X4; } } static inline TX_SIZE get_tx_size(int width, int height) { if (width == height) { return get_sqr_tx_size(width); } if (width < height) { if (width + width == height) { switch (width) { case 4: return TX_4X8; break; case 8: return TX_8X16; break; case 16: return TX_16X32; break; case 32: return TX_32X64; break; } } else { switch (width) { case 4: return TX_4X16; break; case 8: return TX_8X32; break; case 16: return TX_16X64; break; } } } else { if (height + height == width) { switch (height) { case 4: return TX_8X4; break; case 8: return TX_16X8; break; case 16: return TX_32X16; break; case 32: return TX_64X32; break; } } else { switch (height) { case 4: return TX_16X4; break; case 8: return TX_32X8; break; case 16: return TX_64X16; break; } } } assert(0); return TX_4X4; } static inline int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, const TXFM_CONTEXT *const left_ctx, BLOCK_SIZE bsize, TX_SIZE tx_size) { const uint8_t txw = tx_size_wide[tx_size]; const uint8_t txh = tx_size_high[tx_size]; const int above = *above_ctx < txw; const int left = *left_ctx < txh; int category = TXFM_PARTITION_CONTEXTS; // dummy return, not used by others. if (tx_size <= TX_4X4) return 0; TX_SIZE max_tx_size = get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); if (max_tx_size >= TX_8X8) { category = (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + (TX_SIZES - 1 - max_tx_size) * 2; } assert(category != TXFM_PARTITION_CONTEXTS); return category * 3 + above + left; } // Compute the next partition in the direction of the sb_type stored in the mi // array, starting with bsize. static inline PARTITION_TYPE get_partition(const AV1_COMMON *const cm, int mi_row, int mi_col, BLOCK_SIZE bsize) { const CommonModeInfoParams *const mi_params = &cm->mi_params; if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return PARTITION_INVALID; const int offset = mi_row * mi_params->mi_stride + mi_col; MB_MODE_INFO **mi = mi_params->mi_grid_base + offset; const BLOCK_SIZE subsize = mi[0]->bsize; assert(bsize < BLOCK_SIZES_ALL); if (subsize == bsize) return PARTITION_NONE; const int bhigh = mi_size_high[bsize]; const int bwide = mi_size_wide[bsize]; const int sshigh = mi_size_high[subsize]; const int sswide = mi_size_wide[subsize]; if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows && mi_col + bhigh / 2 < mi_params->mi_cols) { // In this case, the block might be using an extended partition // type. const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride]; if (sswide == bwide) { // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or // PARTITION_HORZ_B. To distinguish the latter two, check if the lower // half was split. 
if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; assert(sshigh * 2 == bhigh); if (mbmi_below->bsize == subsize) return PARTITION_HORZ; else return PARTITION_HORZ_B; } else if (sshigh == bhigh) { // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or // PARTITION_VERT_B. To distinguish the latter two, check if the right // half was split. if (sswide * 4 == bwide) return PARTITION_VERT_4; assert(sswide * 2 == bwide); if (mbmi_right->bsize == subsize) return PARTITION_VERT; else return PARTITION_VERT_B; } else { // Smaller width and smaller height. Might be PARTITION_SPLIT or could be // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both // dimensions, we immediately know this is a split (which will recurse to // get to subsize). Otherwise look down and to the right. With // PARTITION_VERT_A, the right block will have height bhigh; with // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise // it's PARTITION_SPLIT. if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A; if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A; return PARTITION_SPLIT; } } const int vert_split = sswide < bwide; const int horz_split = sshigh < bhigh; const int split_idx = (vert_split << 1) | horz_split; assert(split_idx != 0); static const PARTITION_TYPE base_partitions[4] = { PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT }; return base_partitions[split_idx]; } static inline void set_sb_size(SequenceHeader *const seq_params, BLOCK_SIZE sb_size) { seq_params->sb_size = sb_size; seq_params->mib_size = mi_size_wide[seq_params->sb_size]; seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; } // Returns true if the frame is fully lossless at the coded resolution. // Note: If super-resolution is used, such a frame will still NOT be lossless at // the upscaled resolution. static inline int is_coded_lossless(const AV1_COMMON *cm, const MACROBLOCKD *xd) { int coded_lossless = 1; if (cm->seg.enabled) { for (int i = 0; i < MAX_SEGMENTS; ++i) { if (!xd->lossless[i]) { coded_lossless = 0; break; } } } else { coded_lossless = xd->lossless[0]; } return coded_lossless; } static inline int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { return seq_level_idx == SEQ_LEVEL_MAX || (seq_level_idx < SEQ_LEVELS && // The following levels are currently undefined. seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 && seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 && seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 #if !CONFIG_CWG_C013 && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 && seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 && seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 && seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3 #endif ); } /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_ aom-3.12.1/av1/common/av1_inv_txfm1d.c000066400000000000000000002350241477627663500173600ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_txfm.h" void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 4; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[4]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[0]; bf1[1] = input[2]; bf1[2] = input[1]; bf1[3] = input[3]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); } void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[0]; bf1[1] = input[4]; bf1[2] = input[2]; bf1[3] = input[6]; bf1[4] = input[1]; bf1[5] = input[5]; bf1[6] = input[3]; bf1[7] = input[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); bf1[1] = 
clamp_value(bf0[1] + bf0[6], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); } void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 16; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[0]; bf1[1] = input[8]; bf1[2] = input[4]; bf1[3] = input[12]; bf1[4] = input[2]; bf1[5] = input[10]; bf1[6] = input[6]; bf1[7] = input[14]; bf1[8] = input[1]; bf1[9] = input[9]; bf1[10] = input[5]; bf1[11] = input[13]; bf1[12] = input[3]; bf1[13] = input[11]; bf1[14] = input[7]; bf1[15] = input[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = 
bf0[12]; bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); } void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 32; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[32]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[0]; bf1[1] = input[16]; bf1[2] = input[8]; bf1[3] = 
input[24]; bf1[4] = input[4]; bf1[5] = input[20]; bf1[6] = input[12]; bf1[7] = input[28]; bf1[8] = input[2]; bf1[9] = input[18]; bf1[10] = input[10]; bf1[11] = input[26]; bf1[12] = input[6]; bf1[13] = input[22]; bf1[14] = input[14]; bf1[15] = input[30]; bf1[16] = input[1]; bf1[17] = input[17]; bf1[18] = input[9]; bf1[19] = input[25]; bf1[20] = input[5]; bf1[21] = input[21]; bf1[22] = input[13]; bf1[23] = input[29]; bf1[24] = input[3]; bf1[25] = input[19]; bf1[26] = input[11]; bf1[27] = input[27]; bf1[28] = input[7]; bf1[29] = input[23]; bf1[30] = input[15]; bf1[31] = input[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); bf1[26] = clamp_value(-bf0[26] + bf0[27], 
stage_range[stage]); bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); bf1[25] = clamp_value(bf0[25] + bf0[26], 
stage_range[stage]); bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); bf1[24] = 
clamp_value(-bf0[24] + bf0[31], stage_range[stage]); bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); bf1[19] = 
clamp_value(bf0[12] - bf0[19], stage_range[stage]); bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); } void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { int bit = cos_bit; const int32_t *sinpi = sinpi_arr(bit); int32_t s0, s1, s2, s3, s4, s5, s6, s7; int32_t x0 = input[0]; int32_t x1 = input[1]; int32_t x2 = input[2]; int32_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { output[0] = output[1] = output[2] = output[3] = 0; return; } assert(sinpi[1] + sinpi[2] == sinpi[4]); // stage 1 s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit); s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit); s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit); s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit); s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit); s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit); s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit); // stage 2 // NOTICE: (x0 - x2) here may use one extra bit compared to the // opt_range_row/col specified in av1_gen_inv_stage_range() s7 = range_check_value((x0 - x2) + x3, stage_range[2]); // stage 3 s0 = range_check_value(s0 + s3, stage_range[3] + bit); s1 = range_check_value(s1 - s4, stage_range[3] + bit); s3 = range_check_value(s2, stage_range[3] + bit); s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit); // stage 4 s0 = range_check_value(s0 + s5, stage_range[4] + bit); s1 = range_check_value(s1 - s6, stage_range[4] + bit); // stage 5 x0 = range_check_value(s0 + s3, stage_range[5] + bit); x1 = range_check_value(s1 + s3, stage_range[5] + bit); x2 = range_check_value(s2, stage_range[5] + bit); x3 = range_check_value(s0 + s1, stage_range[5] + bit); // stage 6 x3 = range_check_value(x3 - s3, stage_range[6] + bit); output[0] = round_shift(x0, bit); output[1] = round_shift(x1, bit); output[2] = round_shift(x2, bit); output[3] = round_shift(x3, bit); } void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 8; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[7]; bf1[1] = input[0]; bf1[2] = input[5]; bf1[3] = input[2]; bf1[4] = input[3]; bf1[5] = input[4]; bf1[6] = input[1]; bf1[7] = input[6]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); bf1[5] = half_btf(cospi[28], bf0[4], 
-cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = -bf0[4]; bf1[2] = bf0[6]; bf1[3] = -bf0[2]; bf1[4] = bf0[3]; bf1[5] = -bf0[7]; bf1[6] = bf0[5]; bf1[7] = -bf0[1]; } void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 16; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[15]; bf1[1] = input[0]; bf1[2] = input[13]; bf1[3] = input[2]; bf1[4] = input[11]; bf1[5] = input[4]; bf1[6] = input[9]; bf1[7] = input[6]; bf1[8] = input[7]; bf1[9] = input[8]; bf1[10] = input[5]; bf1[11] = input[10]; bf1[12] = input[3]; bf1[13] = input[12]; bf1[14] = input[1]; bf1[15] = input[14]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); bf1[5] = half_btf(cospi[46], 
bf0[4], -cospi[18], bf0[5], cos_bit); bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]); bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]); bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]); bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]); bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]); bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]); bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]); bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); bf1[14] = 
clamp_value(bf0[10] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]); bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]); bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = -bf0[8]; bf1[2] = bf0[12]; bf1[3] = -bf0[4]; bf1[4] = bf0[6]; bf1[5] = -bf0[14]; bf1[6] = bf0[10]; bf1[7] = -bf0[2]; bf1[8] = bf0[3]; bf1[9] = -bf0[11]; bf1[10] = bf0[15]; bf1[11] = -bf0[7]; bf1[12] = bf0[5]; bf1[13] = -bf0[13]; bf1[14] = bf0[9]; bf1[15] = -bf0[1]; } void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; (void)stage_range; for (int i = 0; i < 4; ++i) { output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits); } assert(stage_range[0] + NewSqrt2Bits <= 32); } void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t 
*stage_range) { (void)cos_bit; (void)stage_range; for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2); } void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; (void)stage_range; for (int i = 0; i < 16; ++i) output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); } void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; (void)stage_range; for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); } void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { assert(output != input); const int32_t size = 64; const int32_t *cospi = cospi_arr(cos_bit); int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[64]; // stage 0; // stage 1; stage++; bf1 = output; bf1[0] = input[0]; bf1[1] = input[32]; bf1[2] = input[16]; bf1[3] = input[48]; bf1[4] = input[8]; bf1[5] = input[40]; bf1[6] = input[24]; bf1[7] = input[56]; bf1[8] = input[4]; bf1[9] = input[36]; bf1[10] = input[20]; bf1[11] = input[52]; bf1[12] = input[12]; bf1[13] = input[44]; bf1[14] = input[28]; bf1[15] = input[60]; bf1[16] = input[2]; bf1[17] = input[34]; bf1[18] = input[18]; bf1[19] = input[50]; bf1[20] = input[10]; bf1[21] = input[42]; bf1[22] = input[26]; bf1[23] = input[58]; bf1[24] = input[6]; bf1[25] = input[38]; bf1[26] = input[22]; bf1[27] = input[54]; bf1[28] = input[14]; bf1[29] = input[46]; bf1[30] = input[30]; bf1[31] = input[62]; bf1[32] = input[1]; bf1[33] = input[33]; bf1[34] = input[17]; bf1[35] = input[49]; bf1[36] = input[9]; bf1[37] = input[41]; bf1[38] = input[25]; bf1[39] = input[57]; bf1[40] = input[5]; bf1[41] = input[37]; bf1[42] = input[21]; bf1[43] = input[53]; bf1[44] = input[13]; bf1[45] = input[45]; bf1[46] = input[29]; bf1[47] = input[61]; bf1[48] = input[3]; bf1[49] = input[35]; bf1[50] = input[19]; bf1[51] = input[51]; bf1[52] = input[11]; bf1[53] = input[43]; bf1[54] = input[27]; bf1[55] = input[59]; bf1[56] = input[7]; bf1[57] = input[39]; bf1[58] = input[23]; bf1[59] = input[55]; bf1[60] = input[15]; bf1[61] = input[47]; bf1[62] = input[31]; bf1[63] = input[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = bf0[21]; bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = bf0[26]; bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); bf1[40] = half_btf(cospi[59], bf0[40], 
-cospi[5], bf0[55], cos_bit); bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit); bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit); bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); bf1[38] = clamp_value(-bf0[38] + bf0[39], 
stage_range[stage]); bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]); bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); bf1[35] = bf0[35]; bf1[36] 
= bf0[36]; bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); bf1[39] = bf0[39]; bf1[40] = bf0[40]; bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); bf1[43] = bf0[43]; bf1[44] = bf0[44]; bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); bf1[51] = bf0[51]; bf1[52] = bf0[52]; bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); bf1[55] = bf0[55]; bf1[56] = bf0[56]; bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); bf1[59] = bf0[59]; bf1[60] = bf0[60]; bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); bf1[31] = bf0[31]; bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]); bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); bf1[42] = clamp_value(bf0[41] - bf0[42], 
stage_range[stage]); bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]); bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]); bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); bf1[15] = bf0[15]; bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); bf1[37] = 
half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); bf1[38] = bf0[38]; bf1[39] = bf0[39]; bf1[40] = bf0[40]; bf1[41] = bf0[41]; bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); bf1[46] = bf0[46]; bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = bf0[49]; bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); bf1[54] = bf0[54]; bf1[55] = bf0[55]; bf1[56] = bf0[56]; bf1[57] = bf0[57]; bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[7] = bf0[7]; bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); bf1[42] = clamp_value(-bf0[42] + bf0[45], 
stage_range[stage]); bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]); bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; bf1[35] = bf0[35]; bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); bf1[39] = 
half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); bf1[44] = bf0[44]; bf1[45] = bf0[45]; bf1[46] = bf0[46]; bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = bf0[49]; bf1[50] = bf0[50]; bf1[51] = bf0[51]; bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); bf1[60] = bf0[60]; bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); bf1[42] = 
clamp_value(bf0[37] - bf0[42], stage_range[stage]); bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]); bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]); bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]); bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; bf0 = output; bf1 = step; bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; bf1[35] = bf0[35]; bf1[36] = bf0[36]; bf1[37] = bf0[37]; bf1[38] = 
bf0[38]; bf1[39] = bf0[39]; bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); bf1[56] = bf0[56]; bf1[57] = bf0[57]; bf1[58] = bf0[58]; bf1[59] = bf0[59]; bf1[60] = bf0[60]; bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; bf0 = step; bf1 = output; bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]); bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]); bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); 
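// The remaining outputs of this final butterfly stage mirror the sums computed
// above: for k = 35..63, bf1[k] = clamp_value(bf0[63 - k] - bf0[k], stage_range[stage]).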
bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]); bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]); bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]); bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); } aom-3.12.1/av1/common/av1_inv_txfm1d.h000066400000000000000000000051011477627663500173540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ #define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ #include "av1/common/av1_txfm.h" #ifdef __cplusplus extern "C" { #endif static inline int32_t clamp_value(int32_t value, int8_t bit) { if (bit <= 0) return value; // Do nothing for invalid clamp bit. 
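/* Illustrative example: clamp_value() saturates to the signed range of
 * 'bit' bits, so for bit == 16 the result lies in
 * [-(1 << 15), (1 << 15) - 1] = [-32768, 32767]. */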
const int64_t max_value = (1LL << (bit - 1)) - 1; const int64_t min_value = -(1LL << (bit - 1)); return (int32_t)clamp64(value, min_value, max_value); } static inline void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); } void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); #ifdef __cplusplus } #endif #endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ aom-3.12.1/av1/common/av1_inv_txfm1d_cfg.h000066400000000000000000000026541477627663500202050ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ #define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ #include "av1/common/av1_inv_txfm1d.h" // sum of fwd_shift_## static const int8_t inv_start_range[TX_SIZES_ALL] = { 5, // 4x4 transform 6, // 8x8 transform 7, // 16x16 transform 7, // 32x32 transform 7, // 64x64 transform 5, // 4x8 transform 5, // 8x4 transform 6, // 8x16 transform 6, // 16x8 transform 6, // 16x32 transform 6, // 32x16 transform 6, // 32x64 transform 6, // 64x32 transform 6, // 4x16 transform 6, // 16x4 transform 7, // 8x32 transform 7, // 32x8 transform 7, // 16x64 transform 7, // 64x16 transform }; extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; // Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 // for each valid row and col combination #define INV_COS_BIT 12 #endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ aom-3.12.1/av1/common/av1_inv_txfm2d.c000066400000000000000000000460421477627663500173610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/av1_inv_txfm1d.h" #include "av1/common/av1_inv_txfm1d_cfg.h" void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ int i; tran_low_t output[16]; tran_low_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[4 * 0] >> UNIT_QUANT_SHIFT; c1 = ip[4 * 1] >> UNIT_QUANT_SHIFT; d1 = ip[4 * 2] >> UNIT_QUANT_SHIFT; b1 = ip[4 * 3] >> UNIT_QUANT_SHIFT; a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1; b1 = e1 - b1; c1 = e1 - c1; a1 -= b1; d1 += c1; op[4 * 0] = a1; op[4 * 1] = b1; op[4 * 2] = c1; op[4 * 3] = d1; ip++; op++; } ip = output; for (i = 0; i < 4; i++) { a1 = ip[0]; c1 = ip[1]; d1 = ip[2]; b1 = ip[3]; a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1; b1 = e1 - b1; c1 = e1 - c1; a1 -= b1; d1 += c1; range_check_value(a1, bd + 1); range_check_value(b1, bd + 1); range_check_value(c1, bd + 1); range_check_value(d1, bd + 1); dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); ip += 4; dest++; } } void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, int dest_stride, int bd) { int i; tran_low_t a1, e1; tran_low_t tmp[4]; const tran_low_t *ip = in; tran_low_t *op = tmp; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; op[0] = a1; op[1] = op[2] = op[3] = e1; ip = tmp; for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; dest[dest_stride * 0] = highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); dest[dest_stride * 1] = highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); dest[dest_stride * 2] = highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); dest[dest_stride * 3] = highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); ip++; dest++; } } static inline TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: return av1_idct4; case TXFM_TYPE_DCT8: return av1_idct8; case TXFM_TYPE_DCT16: return av1_idct16; case TXFM_TYPE_DCT32: return av1_idct32; case TXFM_TYPE_DCT64: return av1_idct64; case TXFM_TYPE_ADST4: return av1_iadst4; case TXFM_TYPE_ADST8: return av1_iadst8; case TXFM_TYPE_ADST16: return av1_iadst16; case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; default: assert(0); return NULL; } } static const int8_t inv_shift_4x4[2] = { 0, -4 }; static const int8_t inv_shift_8x8[2] = { -1, -4 }; static const int8_t inv_shift_16x16[2] = { -2, -4 }; static const int8_t inv_shift_32x32[2] = { -2, -4 }; static const int8_t inv_shift_64x64[2] = { -2, -4 }; static const int8_t inv_shift_4x8[2] = { 0, -4 }; static const int8_t inv_shift_8x4[2] = { 0, -4 }; static const int8_t inv_shift_8x16[2] = { -1, -4 }; static const int8_t inv_shift_16x8[2] = { -1, -4 }; static const int8_t 
inv_shift_16x32[2] = { -1, -4 }; static const int8_t inv_shift_32x16[2] = { -1, -4 }; static const int8_t inv_shift_32x64[2] = { -1, -4 }; static const int8_t inv_shift_64x32[2] = { -1, -4 }; static const int8_t inv_shift_4x16[2] = { -1, -4 }; static const int8_t inv_shift_16x4[2] = { -1, -4 }; static const int8_t inv_shift_8x32[2] = { -2, -4 }; static const int8_t inv_shift_32x8[2] = { -2, -4 }; static const int8_t inv_shift_16x64[2] = { -2, -4 }; static const int8_t inv_shift_64x16[2] = { -2, -4 }; const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, }; static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg) { assert(cfg != NULL); cfg->tx_size = tx_size; av1_zero(cfg->stage_range_col); av1_zero(cfg->stage_range_row); set_flip_cfg(tx_type, cfg); const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; cfg->shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); cfg->cos_bit_col = INV_COS_BIT; cfg->cos_bit_row = INV_COS_BIT; cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); } cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); } cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; } void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, int bd) { const int fwd_shift = inv_start_range[tx_size]; const int8_t *shift = cfg->shift; int8_t opt_range_row, opt_range_col; if (bd == 8) { opt_range_row = 16; opt_range_col = 16; } else if (bd == 10) { opt_range_row = 18; opt_range_col = 16; } else { assert(bd == 12); opt_range_row = 20; opt_range_col = 18; } // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; (void)real_range_row; if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 // so opt_range_row >= real_range_row will not hold stage_range_row[i] = opt_range_row; } else { assert(opt_range_row >= real_range_row); stage_range_row[i] = opt_range_row; } } // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { int real_range_col = cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; (void)real_range_col; if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 // so opt_range_col >= real_range_col will not hold stage_range_col[i] = opt_range_col; } else { assert(opt_range_col >= real_range_col); stage_range_col[i] = opt_range_col; } } } static inline void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, int stride, TXFM_2D_FLIP_CFG 
*cfg, int32_t *txfm_buf, TX_SIZE tx_size, int bd) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to // accurately perform rectangular transforms. When the transform is // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. const int txfm_size_col = tx_size_wide[cfg->tx_size]; const int txfm_size_row = tx_size_high[cfg->tx_size]; // Take the shift from the larger dimension in the rectangular case. const int8_t *shift = cfg->shift; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); const int8_t cos_bit_col = cfg->cos_bit_col; const int8_t cos_bit_row = cfg->cos_bit_row; const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * // AOMMAX(txfm_size_row, txfm_size_col) // it is used for intermediate data buffering const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); int32_t *temp_in = txfm_buf; int32_t *temp_out = temp_in + buf_offset; int32_t *buf = temp_out + buf_offset; int32_t *buf_ptr = buf; int c, r; // Rows for (r = 0; r < txfm_size_row; ++r) { if (abs(rect_type) == 1) { for (c = 0; c < txfm_size_col; ++c) { temp_in[c] = round_shift( (int64_t)input[c * txfm_size_row + r] * NewInvSqrt2, NewSqrt2Bits); } clamp_buf(temp_in, txfm_size_col, bd + 8); txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); } else { for (c = 0; c < txfm_size_col; ++c) { temp_in[c] = input[c * txfm_size_row + r]; } clamp_buf(temp_in, txfm_size_col, bd + 8); txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); } av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); buf_ptr += txfm_size_col; } // Columns for (c = 0; c < txfm_size_col; ++c) { if (cfg->lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c]; } else { // flip left right for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; } clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (cfg->ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); } } else { // flip upside down for (r = 0; r < txfm_size_row; ++r) { output[r * stride + c] = highbd_clip_pixel_add( output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); } } } } static inline void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, int stride, int32_t *txfm_buf, TX_TYPE tx_type, TX_SIZE tx_size, int bd) { TXFM_2D_FLIP_CFG cfg; av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); // Forward shift sum uses larger square size, to be consistent with what // av1_gen_inv_stage_range() does for inverse shifts. 
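/* txfm_buf supplies the intermediate storage described in inv_txfm2d_add_c():
 * txfm_size_row * txfm_size_col + 2 * AOMMAX(txfm_size_row, txfm_size_col)
 * elements. The per-size wrappers below size it accordingly, e.g.
 * txfm_buf[4 * 8 + 8 + 8] for TX_4X8. */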
inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); } void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); } void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); } void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); } void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); } void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); } void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); } void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); } void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); } void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); } void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); } void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { // TODO(urvang): Can the same array be reused, instead of using a new array? // Remap 32x32 input into a modified 64x64 by: // - Copying over these values in top-left 32x32 locations. // - Setting the rest of the locations to 0. int32_t mod_input[64 * 64]; for (int col = 0; col < 32; ++col) { memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); } memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, bd); } void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { // Remap 32x32 input into a modified 64x32 by: // - Copying over these values in top-left 32x32 locations. // - Setting the rest of the locations to 0. 
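/* As with the 64x64 case above, only the 32x32 block of lower-frequency
 * coefficients is present in the input for 64-point transforms; the rest of
 * the 64x32 buffer is zero-filled before running the inverse transform. */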
int32_t mod_input[32 * 64]; memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, bd); } void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { // Remap 32x32 input into a modified 32x64 input by: // - Copying over these values in top-left 32x32 locations. // - Setting the rest of the locations to 0. int32_t mod_input[64 * 32]; for (int col = 0; col < 32; ++col) { memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); } DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, bd); } void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { // Remap 16x32 input into a modified 16x64 input by: // - Copying over these values in top-left 16x32 locations. // - Setting the rest of the locations to 0. int32_t mod_input[64 * 16]; for (int col = 0; col < 16; ++col) { memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); } DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, bd); } void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { // Remap 32x16 input into a modified 64x16 by: // - Copying over these values in top-left 32x16 locations. // - Setting the rest of the locations to 0. int32_t mod_input[16 * 64]; memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, bd); } void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); } void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); } void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); } void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); } aom-3.12.1/av1/common/av1_loopfilter.c000066400000000000000000002402141477627663500174550ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" enum { USE_SINGLE, USE_DUAL, USE_QUAD, } UENUM1BYTE(USE_FILTER_TYPE); static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } }; static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, { 2, 2 }, { 3, 3 } }; static const int mode_lf_lut[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) }; static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; // For each possible value for the loop filter fill out limits for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { // Set loop filter parameters that control sharpness. int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); if (sharpness_lvl > 0) { if (block_inside_limit > (9 - sharpness_lvl)) block_inside_limit = (9 - sharpness_lvl); } if (block_inside_limit < 1) block_inside_limit = 1; memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } static uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n, const int dir_idx, int plane, const MB_MODE_INFO *mbmi) { const int segment_id = mbmi->segment_id; if (cm->delta_q_info.delta_lf_present_flag) { int8_t delta_lf; if (cm->delta_q_info.delta_lf_multi) { const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; delta_lf = mbmi->delta_lf[delta_lf_idx]; } else { delta_lf = mbmi->delta_lf_from_base; } int base_level; if (plane == 0) base_level = cm->lf.filter_level[dir_idx]; else if (plane == 1) base_level = cm->lf.filter_level_u; else base_level = cm->lf.filter_level_v; int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); assert(plane >= 0 && plane <= 2); const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); } if (cm->lf.mode_ref_delta_enabled) { const int scale = 1 << (lvl_seg >> 5); lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; if (mbmi->ref_frame[0] > INTRA_FRAME) lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); } return lvl_seg; } else { return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] [mode_lf_lut[mbmi->mode]]; } } void av1_loop_filter_init(AV1_COMMON *cm) { assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; int lvl; // init limits for given sharpness update_sharpness(lfi, lf->sharpness_level); // init hev threshold const vectors for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); } // Update the loop filter for the current frame. 
// This should be called before loop_filter_rows(), // av1_loop_filter_frame() calls this function directly. void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, int plane_end) { int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; int plane; int seg_id; // n_shift is the multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 loop_filter_info_n *const lfi = &cm->lf_info; struct loopfilter *const lf = &cm->lf; const struct segmentation *const seg = &cm->seg; // update sharpness limits update_sharpness(lfi, lf->sharpness_level); filt_lvl[0] = cm->lf.filter_level[0]; filt_lvl[1] = cm->lf.filter_level_u; filt_lvl[2] = cm->lf.filter_level_v; filt_lvl_r[0] = cm->lf.filter_level[1]; filt_lvl_r[1] = cm->lf.filter_level_u; filt_lvl_r[2] = cm->lf.filter_level_v; assert(plane_start >= AOM_PLANE_Y); assert(plane_end <= MAX_MB_PLANE); for (plane = plane_start; plane < plane_end; plane++) { if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) break; else if (plane == 1 && !filt_lvl[1]) continue; else if (plane == 2 && !filt_lvl[2]) continue; for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { for (int dir = 0; dir < 2; ++dir) { int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas memset(lfi->lvl[plane][seg_id][dir], lvl_seg, sizeof(lfi->lvl[plane][seg_id][dir])); } else { int ref, mode; const int scale = 1 << (lvl_seg >> 5); const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + lf->mode_deltas[mode] * scale; lfi->lvl[plane][seg_id][dir][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } } } } } } } static AOM_FORCE_INLINE TX_SIZE get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, const int mi_row, const int mi_col, const int plane, const int ss_x, const int ss_y) { assert(mbmi != NULL); if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; TX_SIZE tx_size = (plane == AOM_PLANE_Y) ? 
mbmi->tx_size : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y); assert(tx_size < TX_SIZES_ALL); if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) { const BLOCK_SIZE sb_type = mbmi->bsize; const int blk_row = mi_row & (mi_size_high[sb_type] - 1); const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); const TX_SIZE mb_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; assert(mb_tx_size < TX_SIZES_ALL); tx_size = mb_tx_size; } return tx_size; } static const int tx_dim_to_filter_length[TX_SIZES] = { 4, 8, 14, 14, 14 }; // Return TX_SIZE from get_transform_size(), so it is plane and direction // aware static TX_SIZE set_lpf_parameters( AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, const int plane, const struct macroblockd_plane *const plane_ptr) { // reset to initial values params->filter_length = 0; // no deblocking is required const uint32_t width = plane_ptr->dst.width; const uint32_t height = plane_ptr->dst.height; if ((width <= x) || (height <= y)) { // just return the smallest transform unit size return TX_4X4; } const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; // for sub8x8 block, chroma prediction mode is obtained from the bottom/right // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row // and mi_col should map to the bottom/right mi structure, i.e, both mi_row // and mi_col should be odd number for chroma plane. const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; const MB_MODE_INFO *mbmi = mi[0]; // If current mbmi is not correctly setup, return an invalid value to stop // filtering. One example is that if this tile is not coded, then its mbmi // it not set up. if (mbmi == NULL) return TX_INVALID; const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, scale_horz, scale_vert); { const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); const uint32_t transform_masks = edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); if (!tu_edge) return ts; // prepare outer edge parameters. deblock the edge if it's an edge of a TU { const uint32_t curr_level = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); uint32_t level = curr_level; if (coord) { { const MB_MODE_INFO *const mi_prev = *(mi - mode_step); if (mi_prev == NULL) return TX_INVALID; const int pv_row = (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); const int pv_col = (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col); const TX_SIZE pv_ts = get_transform_size( xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert); const uint32_t pv_lvl = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); const int pv_skip_txfm = mi_prev->skip_txfm && is_inter_block(mi_prev); const BLOCK_SIZE bsize = get_plane_block_size( mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y); assert(bsize < BLOCK_SIZES_ALL); const int prediction_masks = edge_dir == VERT_EDGE ? 
block_size_wide[bsize] - 1 : block_size_high[bsize] - 1; const int32_t pu_edge = !(coord & prediction_masks); // if the current and the previous blocks are skipped, // deblock the edge if the edge belongs to a PU's edge only. if ((curr_level || pv_lvl) && (!pv_skip_txfm || !curr_skipped || pu_edge)) { const int dim = (VERT_EDGE == edge_dir) ? AOMMIN(tx_size_wide_unit_log2[ts], tx_size_wide_unit_log2[pv_ts]) : AOMMIN(tx_size_high_unit_log2[ts], tx_size_high_unit_log2[pv_ts]); if (plane) { params->filter_length = (dim == 0) ? 4 : 6; } else { assert(dim < TX_SIZES); assert(dim >= 0); params->filter_length = tx_dim_to_filter_length[dim]; } // update the level if the current block is skipped, // but the previous one is not level = (curr_level) ? (curr_level) : (pv_lvl); } } } // prepare common parameters if (params->filter_length) { const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; params->lfthr = limits; } } } return ts; } static const uint32_t vert_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { // TX_4X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X8 { 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, }, // TX_16X16 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_32X32 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_64X64 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_4X8 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X4 { 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, }, // TX_8X16 { 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, }, // TX_16X8 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_16X32 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_32X16 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_32X64 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_64X32 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_4X16 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_16X4 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_8X32 { 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, }, // TX_32X8 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_16X64 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, // TX_64X16 { 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, }, }; static const uint32_t horz_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { // TX_4X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X8 { 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, }, // TX_16X16 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_32X32 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_64X64 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_4X8 { 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, }, // TX_8X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X16 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_16X8 { 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, }, // TX_16X32 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_32X16 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_32X64 { 4, 8, 14, 14, 14, 
8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_64X32 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_4X16 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_16X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X32 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_32X8 { 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, }, // TX_16X64 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, // TX_64X16 { 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, }, }; static const uint32_t vert_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { // TX_4X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X8 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_16X16 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_32X32 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_64X64 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_4X8 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X4 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_8X16 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_16X8 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_16X32 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_32X16 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_32X64 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_64X32 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_4X16 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_16X4 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_8X32 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_32X8 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_16X64 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, // TX_64X16 { 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, }, }; static const uint32_t horz_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { // TX_4X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X8 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_16X16 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_32X32 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_64X64 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_4X8 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_8X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X16 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_16X8 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_16X32 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_32X16 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_32X64 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_64X32 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_4X16 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_16X4 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }, // TX_8X32 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_32X8 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_16X64 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, }, // TX_64X16 { 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 
6, 6, 4, 6, 6, 6, 6, }, }; static AOM_FORCE_INLINE void set_one_param_for_line_luma( AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, const struct macroblockd_plane *const plane_ptr, int coord, bool is_first_block, TX_SIZE prev_tx_size, const ptrdiff_t mode_step, int *min_dim) { (void)plane_ptr; assert(mi_col << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.width && mi_row << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.height); const int is_vert = edge_dir == VERT_EDGE; // reset to initial values params->filter_length = 0; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; const MB_MODE_INFO *mbmi = mi[0]; assert(mbmi); const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, AOM_PLANE_Y, 0, 0); #ifndef NDEBUG const uint32_t transform_masks = is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); assert(tu_edge); #endif // NDEBUG // If we are not the first block, then coord is always true, so // !is_first_block is technically redundant. But we are keeping it here so the // compiler can compile away this conditional if we pass in is_first_block := // false bool curr_skipped = false; if (!is_first_block || coord) { const MB_MODE_INFO *const mi_prev = *(mi - mode_step); const int pv_row = is_vert ? mi_row : (mi_row - 1); const int pv_col = is_vert ? (mi_col - 1) : mi_col; const TX_SIZE pv_ts = is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, AOM_PLANE_Y, 0, 0) : prev_tx_size; if (is_first_block) { *min_dim = is_vert ? block_size_high[mi_prev->bsize] : block_size_wide[mi_prev->bsize]; } assert(mi_prev); uint8_t level = get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi); if (!level) { level = get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mi_prev); } const int32_t pu_edge = mi_prev != mbmi; // The quad loop filter assumes that all the transform blocks within a // 8x16/16x8/16x16 prediction block are of the same size. assert(IMPLIES( !pu_edge && (mbmi->bsize >= BLOCK_8X16 && mbmi->bsize <= BLOCK_16X16), pv_ts == ts)); if (!pu_edge) { curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); } if ((pu_edge || !curr_skipped) && level) { params->filter_length = is_vert ? vert_filter_length_luma[ts][pv_ts] : horz_filter_length_luma[ts][pv_ts]; // prepare common parameters const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; params->lfthr = limits; } } const int block_dim = is_vert ? block_size_high[mbmi->bsize] : block_size_wide[mbmi->bsize]; *min_dim = AOMMIN(*min_dim, block_dim); *tx_size = ts; } // Similar to set_lpf_parameters, but does so one row/col at a time to reduce // calls to \ref get_transform_size and \ref get_filter_level static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma( AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, const ptrdiff_t mode_step, int *min_dim) { const int is_vert = edge_dir == VERT_EDGE; AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; uint32_t *counter_ptr = is_vert ? 
&mi_col : &mi_row; TX_SIZE prev_tx_size = TX_INVALID; // Unroll the first iteration of the loop set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, plane_ptr, *counter_ptr, true, prev_tx_size, mode_step, min_dim); // Advance int advance_units = is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; prev_tx_size = *tx_size; *counter_ptr += advance_units; params += advance_units; tx_size += advance_units; while (*counter_ptr < mi_range) { set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, plane_ptr, *counter_ptr, false, prev_tx_size, mode_step, min_dim); // Advance advance_units = is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; prev_tx_size = *tx_size; *counter_ptr += advance_units; params += advance_units; tx_size += advance_units; } } static AOM_FORCE_INLINE void set_one_param_for_line_chroma( AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, int coord, bool is_first_block, TX_SIZE prev_tx_size, const struct macroblockd_plane *const plane_ptr, const ptrdiff_t mode_step, const int scale_horz, const int scale_vert, int *min_dim, int plane, int joint_filter_chroma) { const int is_vert = edge_dir == VERT_EDGE; (void)plane_ptr; assert((mi_col << MI_SIZE_LOG2) < (uint32_t)(plane_ptr->dst.width << scale_horz) && (mi_row << MI_SIZE_LOG2) < (uint32_t)(plane_ptr->dst.height << scale_vert)); // reset to initial values params->filter_length = 0; // for sub8x8 block, chroma prediction mode is obtained from the // bottom/right mi structure of the co-located 8x8 luma block. so for chroma // plane, mi_row and mi_col should map to the bottom/right mi structure, // i.e, both mi_row and mi_col should be odd number for chroma plane. mi_row |= scale_vert; mi_col |= scale_horz; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; const MB_MODE_INFO *mbmi = mi[0]; assert(mbmi); const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, scale_horz, scale_vert); *tx_size = ts; #ifndef NDEBUG const uint32_t transform_masks = is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); assert(tu_edge); #endif // NDEBUG // If we are not the first block, then coord is always true, so // !is_first_block is technically redundant. But we are keeping it here so the // compiler can compile away this conditional if we pass in is_first_block := // false bool curr_skipped = false; if (!is_first_block || coord) { const MB_MODE_INFO *const mi_prev = *(mi - mode_step); assert(mi_prev); const int pv_row = is_vert ? (mi_row) : (mi_row - (1 << scale_vert)); const int pv_col = is_vert ? (mi_col - (1 << scale_horz)) : (mi_col); const TX_SIZE pv_ts = is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert) : prev_tx_size; if (is_first_block) { *min_dim = is_vert ? 
tx_size_high[pv_ts] : tx_size_wide[pv_ts]; } uint8_t level = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); if (!level) { level = get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); } #ifndef NDEBUG if (joint_filter_chroma) { uint8_t v_level = get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi); if (!v_level) { v_level = get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mi_prev); } assert(level == v_level); } #else (void)joint_filter_chroma; #endif // NDEBUG const int32_t pu_edge = mi_prev != mbmi; if (!pu_edge) { curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); } // For realtime mode, u and v have the same level if ((!curr_skipped || pu_edge) && level) { params->filter_length = is_vert ? vert_filter_length_chroma[ts][pv_ts] : horz_filter_length_chroma[ts][pv_ts]; const loop_filter_thresh *const limits = cm->lf_info.lfthr; params->lfthr = limits + level; } } const int tx_dim = is_vert ? tx_size_high[ts] : tx_size_wide[ts]; *min_dim = AOMMIN(*min_dim, tx_dim); } static AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma( AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, const ptrdiff_t mode_step, const int scale_horz, const int scale_vert, int *min_dim, int plane, int joint_filter_chroma) { const int is_vert = edge_dir == VERT_EDGE; AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row; const uint32_t scale = is_vert ? scale_horz : scale_vert; TX_SIZE prev_tx_size = TX_INVALID; // Unroll the first iteration of the loop set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, *counter_ptr, true, prev_tx_size, plane_ptr, mode_step, scale_horz, scale_vert, min_dim, plane, joint_filter_chroma); // Advance int advance_units = is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; prev_tx_size = *tx_size; *counter_ptr += advance_units << scale; params += advance_units; tx_size += advance_units; while (*counter_ptr < mi_range) { set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, *counter_ptr, false, prev_tx_size, plane_ptr, mode_step, scale_horz, scale_vert, min_dim, plane, joint_filter_chroma); // Advance advance_units = is_vert ? 
tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; prev_tx_size = *tx_size; *counter_ptr += advance_units << scale; params += advance_units; tx_size += advance_units; } } static inline void filter_vert(uint8_t *dst, int dst_stride, const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; const aom_bit_depth_t bit_depth = seq_params->bit_depth; if (use_highbitdepth) { uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); if (use_filter_type == USE_QUAD) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_vertical_4_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4_dual( dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6_dual( dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_vertical_8_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_8_dual( dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 14-tap filtering case 14: aom_highbd_lpf_vertical_14_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_14_dual( dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_vertical_4_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_vertical_8_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 14-tap filtering case 14: aom_highbd_lpf_vertical_14_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6(dst_shortptr, 
dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 14-tap filtering case 14: aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } return; } #endif // CONFIG_AV1_HIGHBITDEPTH if (use_filter_type == USE_QUAD) { // Only one set of loop filter parameters (mblim, lim and hev_thr) is // passed as argument to quad loop filter because quad loop filter is // called for those cases where all the 4 set of loop filter parameters // are equal. switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_vertical_14(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } #if !CONFIG_AV1_HIGHBITDEPTH (void)seq_params; #endif // !CONFIG_AV1_HIGHBITDEPTH } static inline void filter_vert_chroma(uint8_t *u_dst, uint8_t *v_dst, int dst_stride, const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *u_limits = params->lfthr; const loop_filter_thresh *v_limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; const aom_bit_depth_t bit_depth = seq_params->bit_depth; if (use_highbitdepth) { uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); if (use_filter_type == USE_QUAD) { switch (params->filter_length) { // apply 4-tap filtering case 4: 
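/* This high-bitdepth quad path has no dedicated quad kernel to call, so each
 * chroma plane is filtered with two *_dual calls, the second offset by
 * 2 * MI_SIZE rows, covering the same 16 rows a quad call would. */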
aom_highbd_lpf_vertical_4_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4_dual( u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4_dual( v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6_dual( u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6_dual( v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_vertical_4_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_vertical_4(u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_4(v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_vertical_6(u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_vertical_6(v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); break; // no filtering default: break; } } return; } #endif // CONFIG_AV1_HIGHBITDEPTH if (use_filter_type == USE_QUAD) { // Only one set of loop filter parameters (mblim, lim and hev_thr) is // passed as argument to quad loop filter because quad loop filter is // called for those cases where all the 4 set of 
loop filter parameters // are equal. switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4_quad(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_4_quad(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6_quad(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_6_quad(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4_dual(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_4_dual(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6_dual(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_6_dual(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_vertical_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, u_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_vertical_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_vertical_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); break; // no filtering default: break; } } #if !CONFIG_AV1_HIGHBITDEPTH (void)seq_params; #endif // !CONFIG_AV1_HIGHBITDEPTH } void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int plane_mi_rows = ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); const int plane_mi_cols = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), (MAX_MIB_SIZE >> scale_vert)); const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), (MAX_MIB_SIZE >> scale_horz)); for (int y = 0; y < y_range; y++) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
// If 4x4 transform is used, it will then filter the internal edge // aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; uint32_t advance_units; TX_SIZE tx_size; AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); tx_size = set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, VERT_EDGE, curr_x, curr_y, plane, plane_ptr); if (tx_size == TX_INVALID) { params.filter_length = 0; tx_size = TX_4X4; } filter_vert(p, dst_stride, ¶ms, cm->seq_params, USE_SINGLE); // advance the destination pointer advance_units = tx_size_wide_unit[tx_size]; x += advance_units; p += advance_units * MI_SIZE; } } } void av1_filter_block_plane_vert_opt( const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned // to MI_SIZE. const int plane_mi_cols = CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); const int plane_mi_rows = CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); // Whenever 'pipeline_lpf_mt_with_enc' is enabled, height of the unit to // filter (i.e., y_range) is calculated based on the size of the superblock // used. const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), (1 << num_mis_in_lpf_unit_height_log2)); // Width of the unit to filter (i.e., x_range) should always be calculated // based on maximum superblock size as this function is called for mi_col = 0, // MAX_MIB_SIZE, 2 * MAX_MIB_SIZE etc. const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); const ptrdiff_t mode_step = 1; for (int y = 0; y < y_range; y++) { const uint32_t curr_y = mi_row + y; const uint32_t x_start = mi_col; const uint32_t x_end = mi_col + x_range; int min_block_height = block_size_high[BLOCK_128X128]; set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, VERT_EDGE, x_start, curr_y, plane_ptr, x_end, mode_step, &min_block_height); AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; USE_FILTER_TYPE use_filter_type = USE_SINGLE; uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; if ((y & 3) == 0 && (y + 3) < y_range && min_block_height >= 16) { // If we are on a row which is a multiple of 4, and the minimum height is // 16 pixels, then the current and right 3 cols must contain the same // prediction block. This is because dim 16 can only happen every unit of // 4 mi's. 
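// Illustrative walk-through (assuming MI_SIZE == 4, values chosen for
// example only): at y == 4 with min_block_height >= 16, the QUAD path below
// filters the vertical edges of MI rows 4..7, i.e. 16 pixel rows, with a
// single filter_vert() call and then skips ahead:
//   use_filter_type = USE_QUAD;  // one call covers 4 MI rows
//   y += 3;                      // with the loop's y++ the next row is 8
// The DUAL path covers 2 MI rows (8 pixel rows); SINGLE covers 1 (4 rows).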
use_filter_type = USE_QUAD; y += 3; } else if ((y + 1) < y_range && min_block_height >= 8) { use_filter_type = USE_DUAL; y += 1; } for (int x = 0; x < x_range;) { if (*tx_size == TX_INVALID) { params->filter_length = 0; *tx_size = TX_4X4; } filter_vert(p, dst_stride, params, cm->seq_params, use_filter_type); // advance the destination pointer const uint32_t advance_units = tx_size_wide_unit[*tx_size]; x += advance_units; p += advance_units * MI_SIZE; params += advance_units; tx_size += advance_units; } } } void av1_filter_block_plane_vert_opt_chroma( const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, int num_mis_in_lpf_unit_height_log2) { const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; const int dst_stride = plane_ptr->dst.stride; // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned // to MI_SIZE. const int mi_cols = ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; const int mi_rows = ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), (MAX_MIB_SIZE >> scale_horz)); const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz; for (int y = 0; y < y_range; y++) { const uint32_t curr_y = mi_row + (y << scale_vert); const uint32_t x_start = mi_col + (0 << scale_horz); const uint32_t x_end = mi_col + (x_range << scale_horz); int min_height = tx_size_high[TX_64X64]; set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, VERT_EDGE, x_start, curr_y, plane_ptr, x_end, mode_step, scale_horz, scale_vert, &min_height, plane, joint_filter_chroma); AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; int use_filter_type = USE_SINGLE; int y_inc = 0; if ((y & 3) == 0 && (y + 3) < y_range && min_height >= 16) { // If we are on a row which is a multiple of 4, and the minimum height is // 16 pixels, then the current and below 3 rows must contain the same tx // block. This is because dim 16 can only happen every unit of 4 mi's. use_filter_type = USE_QUAD; y_inc = 3; } else if (y % 2 == 0 && (y + 1) < y_range && min_height >= 8) { // If we are on an even row, and the minimum height is 8 pixels, then the // current and below rows must contain the same tx block. This is because // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, // etc. use_filter_type = USE_DUAL; y_inc = 1; } for (int x = 0; x < x_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
// If 4x4 transform is used, it will then filter the internal edge // aligned with a 4x4 block if (*tx_size == TX_INVALID) { params->filter_length = 0; *tx_size = TX_4X4; } const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; if (joint_filter_chroma) { uint8_t *u_dst = plane_ptr[0].dst.buf + offset; uint8_t *v_dst = plane_ptr[1].dst.buf + offset; filter_vert_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, use_filter_type); } else { uint8_t *dst_ptr = plane_ptr->dst.buf + offset; filter_vert(dst_ptr, dst_stride, params, cm->seq_params, use_filter_type); } // advance the destination pointer const uint32_t advance_units = tx_size_wide_unit[*tx_size]; x += advance_units; params += advance_units; tx_size += advance_units; } y += y_inc; } } static inline void filter_horz(uint8_t *dst, int dst_stride, const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; const aom_bit_depth_t bit_depth = seq_params->bit_depth; if (use_highbitdepth) { uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); if (use_filter_type == USE_QUAD) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4_dual( dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_horizontal_6_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6_dual( dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_horizontal_8_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_8_dual( dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 14-tap filtering case 14: aom_highbd_lpf_horizontal_14_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_14_dual( dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_horizontal_6_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_horizontal_8_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 
14-tap filtering case 14: aom_highbd_lpf_horizontal_14_dual( dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 8-tap filtering case 8: aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // apply 14-tap filtering case 14: aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim, limits->lim, limits->hev_thr, bit_depth); break; // no filtering default: break; } } return; } #endif // CONFIG_AV1_HIGHBITDEPTH if (use_filter_type == USE_QUAD) { // Only one set of loop filter parameters (mblim, lim and hev_thr) is // passed as argument to quad loop filter because quad loop filter is // called for those cases where all the 4 set of loop filter parameters // are equal. switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 8-tap filtering case 8: aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // apply 14-tap filtering case 14: aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim, limits->hev_thr); break; // no filtering default: break; } } #if !CONFIG_AV1_HIGHBITDEPTH (void)seq_params; #endif // !CONFIG_AV1_HIGHBITDEPTH } static inline void filter_horz_chroma(uint8_t *u_dst, uint8_t *v_dst, int dst_stride, const AV1_DEBLOCKING_PARAMETERS 
*params, const SequenceHeader *seq_params, USE_FILTER_TYPE use_filter_type) { const loop_filter_thresh *u_limits = params->lfthr; const loop_filter_thresh *v_limits = params->lfthr; #if CONFIG_AV1_HIGHBITDEPTH const int use_highbitdepth = seq_params->use_highbitdepth; const aom_bit_depth_t bit_depth = seq_params->bit_depth; if (use_highbitdepth) { uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); if (use_filter_type == USE_QUAD) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4_dual( u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4_dual( v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_horizontal_6_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6_dual( u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6_dual( v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only aom_highbd_lpf_horizontal_6_dual( u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6_dual( v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_highbd_lpf_horizontal_4(u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_4(v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 6: // apply 6-tap filter for chroma plane only 
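// Note: AV1 limits chroma deblocking to 4-tap and 6-tap filters, which is
// why the 8- and 14-tap cases in these chroma helpers are unreachable and
// simply assert(0).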
aom_highbd_lpf_horizontal_6(u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); aom_highbd_lpf_horizontal_6(v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); break; case 8: case 14: assert(0); break; // no filtering default: break; } } return; } #endif // CONFIG_AV1_HIGHBITDEPTH if (use_filter_type == USE_QUAD) { // Only one set of loop filter parameters (mblim, lim and hev_thr) is // passed as argument to quad loop filter because quad loop filter is // called for those cases where all the 4 set of loop filter parameters // are equal. switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4_quad(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_4_quad(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6_quad(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_6_quad(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); // no filtering default: break; } } else if (use_filter_type == USE_DUAL) { switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4_dual(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_4_dual(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6_dual(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_6_dual(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); // no filtering default: break; } } else { assert(use_filter_type == USE_SINGLE); switch (params->filter_length) { // apply 4-tap filtering case 4: aom_lpf_horizontal_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, u_limits->hev_thr); break; case 6: // apply 6-tap filter for chroma plane only aom_lpf_horizontal_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, u_limits->hev_thr); aom_lpf_horizontal_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, v_limits->hev_thr); break; case 8: case 14: assert(0); break; // no filtering default: break; } } #if !CONFIG_AV1_HIGHBITDEPTH (void)seq_params; #endif // !CONFIG_AV1_HIGHBITDEPTH } void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col) { const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; const int plane_mi_rows = ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); const int plane_mi_cols = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), (MAX_MIB_SIZE >> scale_vert)); const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), (MAX_MIB_SIZE >> scale_horz)); for (int x = 0; x < x_range; x++) { uint8_t *p = 
dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will first filter the vertical edge aligned with a 8x8 // block. If 4x4 transform is used, it will then filter the internal // edge aligned with a 4x4 block const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; uint32_t advance_units; TX_SIZE tx_size; AV1_DEBLOCKING_PARAMETERS params; memset(¶ms, 0, sizeof(params)); tx_size = set_lpf_parameters( ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, curr_x, curr_y, plane, plane_ptr); if (tx_size == TX_INVALID) { params.filter_length = 0; tx_size = TX_4X4; } filter_horz(p, dst_stride, ¶ms, cm->seq_params, USE_SINGLE); // advance the destination pointer advance_units = tx_size_high_unit[tx_size]; y += advance_units; p += advance_units * dst_stride * MI_SIZE; } } } void av1_filter_block_plane_horz_opt( const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned // to MI_SIZE. const int plane_mi_cols = CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); const int plane_mi_rows = CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), (1 << num_mis_in_lpf_unit_height_log2)); const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); const ptrdiff_t mode_step = cm->mi_params.mi_stride; for (int x = 0; x < x_range; x++) { const uint32_t curr_x = mi_col + x; const uint32_t y_start = mi_row; const uint32_t y_end = mi_row + y_range; int min_block_width = block_size_high[BLOCK_128X128]; set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, HORZ_EDGE, curr_x, y_start, plane_ptr, y_end, mode_step, &min_block_width); AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; USE_FILTER_TYPE filter_type = USE_SINGLE; uint8_t *p = dst_ptr + x * MI_SIZE; if ((x & 3) == 0 && (x + 3) < x_range && min_block_width >= 16) { // If we are on a col which is a multiple of 4, and the minimum width is // 16 pixels, then the current and right 3 cols must contain the same // prediction block. This is because dim 16 can only happen every unit of // 4 mi's. 
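// Illustrative note (assuming MI_SIZE == 4): for horizontal edges the
// grouping is across columns, so the QUAD path filters a 16-pixel-wide
// stretch of the same horizontal edge starting at column x * MI_SIZE; in the
// high-bitdepth build filter_horz() realises this as two *_dual calls offset
// by 2 * MI_SIZE samples.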
filter_type = USE_QUAD; x += 3; } else if ((x + 1) < x_range && min_block_width >= 8) { filter_type = USE_DUAL; x += 1; } for (int y = 0; y < y_range;) { if (*tx_size == TX_INVALID) { params->filter_length = 0; *tx_size = TX_4X4; } filter_horz(p, dst_stride, params, cm->seq_params, filter_type); // advance the destination pointer const uint32_t advance_units = tx_size_high_unit[*tx_size]; y += advance_units; p += advance_units * dst_stride * MI_SIZE; params += advance_units; tx_size += advance_units; } } } void av1_filter_block_plane_horz_opt_chroma( const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, int num_mis_in_lpf_unit_height_log2) { const uint32_t scale_horz = plane_ptr->subsampling_x; const uint32_t scale_vert = plane_ptr->subsampling_y; const int dst_stride = plane_ptr->dst.stride; // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned // to MI_SIZE. const int mi_cols = ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; const int mi_rows = ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), (MAX_MIB_SIZE >> scale_horz)); const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert; for (int x = 0; x < x_range; x++) { const uint32_t y_start = mi_row + (0 << scale_vert); const uint32_t curr_x = mi_col + (x << scale_horz); const uint32_t y_end = mi_row + (y_range << scale_vert); int min_width = tx_size_wide[TX_64X64]; set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, HORZ_EDGE, curr_x, y_start, plane_ptr, y_end, mode_step, scale_horz, scale_vert, &min_width, plane, joint_filter_chroma); AV1_DEBLOCKING_PARAMETERS *params = params_buf; TX_SIZE *tx_size = tx_buf; USE_FILTER_TYPE use_filter_type = USE_SINGLE; int x_inc = 0; if ((x & 3) == 0 && (x + 3) < x_range && min_width >= 16) { // If we are on a col which is a multiple of 4, and the minimum width is // 16 pixels, then the current and right 3 cols must contain the same tx // block. This is because dim 16 can only happen every unit of 4 mi's. use_filter_type = USE_QUAD; x_inc = 3; } else if (x % 2 == 0 && (x + 1) < x_range && min_width >= 8) { // If we are on an even col, and the minimum width is 8 pixels, then the // current and left cols must contain the same tx block. This is because // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, // etc. use_filter_type = USE_DUAL; x_inc = 1; } for (int y = 0; y < y_range;) { // inner loop always filter vertical edges in a MI block. If MI size // is 8x8, it will first filter the vertical edge aligned with a 8x8 // block. 
If 4x4 transform is used, it will then filter the internal // edge aligned with a 4x4 block if (*tx_size == TX_INVALID) { params->filter_length = 0; *tx_size = TX_4X4; } const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; if (joint_filter_chroma) { uint8_t *u_dst = plane_ptr[0].dst.buf + offset; uint8_t *v_dst = plane_ptr[1].dst.buf + offset; filter_horz_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, use_filter_type); } else { uint8_t *dst_ptr = plane_ptr->dst.buf + offset; filter_horz(dst_ptr, dst_stride, params, cm->seq_params, use_filter_type); } // advance the destination pointer const int advance_units = tx_size_high_unit[*tx_size]; y += advance_units; params += advance_units; tx_size += advance_units; } x += x_inc; } } aom-3.12.1/av1/common/av1_loopfilter.h000066400000000000000000000114061477627663500174610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ #define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ #include "config/aom_config.h" #include "aom/internal/aom_codec_internal.h" #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/seg_common.h" #ifdef __cplusplus extern "C" { #endif #define MAX_LOOP_FILTER 63 #define MAX_SHARPNESS 7 #define SIMD_WIDTH 16 enum lf_path { LF_PATH_420, LF_PATH_444, LF_PATH_SLOW, }; /*!\cond */ enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); typedef struct { uint64_t bits[4]; } FilterMask; struct loopfilter { int filter_level[2]; int filter_level_u; int filter_level_v; int sharpness_level; uint8_t mode_ref_delta_enabled; uint8_t mode_ref_delta_update; // 0 = Intra, Last, Last2+Last3, // GF, BRF, ARF2, ARF int8_t ref_deltas[REF_FRAMES]; // 0 = ZERO_MV, MV int8_t mode_deltas[MAX_MODE_LF_DELTAS]; }; // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. typedef struct { DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); } loop_filter_thresh; typedef struct { loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; } loop_filter_info_n; typedef struct AV1_DEBLOCKING_PARAMETERS { // length of the filter applied to the outer edge uint8_t filter_length; // deblocking limits const loop_filter_thresh *lfthr; } AV1_DEBLOCKING_PARAMETERS; typedef struct LoopFilterWorkerData { YV12_BUFFER_CONFIG *frame_buffer; struct AV1Common *cm; struct macroblockd_plane planes[MAX_MB_PLANE]; // TODO(Ranjit): When the filter functions are modified to use xd->lossless // add lossless as a member here. 
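// The scratch buffers below (params_buf / tx_buf) hold one entry per 4x4 MI
// unit along the edge line currently being filtered (MAX_MIB_SIZE entries).
// They are filled by set_lpf_parameters_for_line_*() and then walked one
// transform block at a time by the av1_filter_block_plane_*_opt() loops.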
MACROBLOCKD *xd; AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; TX_SIZE tx_buf[MAX_MIB_SIZE]; struct aom_internal_error_info error_info; } LFWorkerData; /*!\endcond */ /* assorted loopfilter functions which get used elsewhere */ struct AV1Common; struct macroblockd; struct AV1LfSyncData; void av1_loop_filter_init(struct AV1Common *cm); void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, int plane_end); void av1_filter_block_plane_vert(const struct AV1Common *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col); void av1_filter_block_plane_horz(const struct AV1Common *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col); void av1_filter_block_plane_vert_opt( const struct AV1Common *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); void av1_filter_block_plane_vert_opt_chroma( const struct AV1Common *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, int num_mis_in_lpf_unit_height_log2); void av1_filter_block_plane_horz_opt( const struct AV1Common *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); void av1_filter_block_plane_horz_opt_chroma( const struct AV1Common *const cm, const MACROBLOCKD *const xd, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, int num_mis_in_lpf_unit_height_log2); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ aom-3.12.1/av1/common/av1_rtcd.c000066400000000000000000000012671477627663500162350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #define RTCD_C #include "config/av1_rtcd.h" #include "aom_ports/aom_once.h" void av1_rtcd(void) { aom_once(setup_rtcd_internal); } aom-3.12.1/av1/common/av1_rtcd_defs.pl000066400000000000000000001351561477627663500174340ustar00rootroot00000000000000## ## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. 
If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## sub av1_common_forward_decls() { print < 0) { for (i = 0; i < size; i++) { arr[i] = round_shift(arr[i], bit); } } else { for (i = 0; i < size; i++) { arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, INT32_MAX); } } } } const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_IDENTITY32 }, { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } }; const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { 4, // TXFM_TYPE_DCT4 6, // TXFM_TYPE_DCT8 8, // TXFM_TYPE_DCT16 10, // TXFM_TYPE_DCT32 12, // TXFM_TYPE_DCT64 7, // TXFM_TYPE_ADST4 8, // TXFM_TYPE_ADST8 10, // TXFM_TYPE_ADST16 1, // TXFM_TYPE_IDENTITY4 1, // TXFM_TYPE_IDENTITY8 1, // TXFM_TYPE_IDENTITY16 1, // TXFM_TYPE_IDENTITY32 }; void av1_range_check_buf(int32_t stage, const int32_t *input, const int32_t *buf, int32_t size, int8_t bit) { #if CONFIG_COEFFICIENT_RANGE_CHECKING const int64_t max_value = (1LL << (bit - 1)) - 1; const int64_t min_value = -(1LL << (bit - 1)); int in_range = 1; for (int i = 0; i < size; ++i) { if (buf[i] < min_value || buf[i] > max_value) { in_range = 0; } } if (!in_range) { fprintf(stderr, "Error: coeffs contain out-of-range values\n"); fprintf(stderr, "size: %d\n", size); fprintf(stderr, "stage: %d\n", stage); fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, max_value); fprintf(stderr, "coeffs: "); fprintf(stderr, "["); for (int j = 0; j < size; j++) { if (j > 0) fprintf(stderr, ", "); fprintf(stderr, "%d", input[j]); } fprintf(stderr, "]\n"); fprintf(stderr, " buf: "); fprintf(stderr, "["); for (int j = 0; j < size; j++) { if (j > 0) fprintf(stderr, ", "); fprintf(stderr, "%d", buf[j]); } fprintf(stderr, "]\n\n"); } assert(in_range); #else (void)stage; (void)input; (void)buf; (void)size; (void)bit; #endif } aom-3.12.1/av1/common/av1_txfm.h000066400000000000000000000171751477627663500162710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_AV1_TXFM_H_ #define AOM_AV1_COMMON_AV1_TXFM_H_ #include #include #include #include "config/aom_config.h" #include "av1/common/enums.h" #include "av1/common/blockd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif #if !defined(DO_RANGE_CHECK_CLAMP) #define DO_RANGE_CHECK_CLAMP 0 #endif extern const int32_t av1_cospi_arr_data[4][64]; extern const int32_t av1_sinpi_arr_data[4][5]; #define MAX_TXFM_STAGE_NUM 12 static const int cos_bit_min = 10; #define NewSqrt2Bits ((int32_t)12) // 2^12 * sqrt(2) static const int32_t NewSqrt2 = 5793; // 2^12 / sqrt(2) static const int32_t NewInvSqrt2 = 2896; static inline const int32_t *cospi_arr(int n) { return av1_cospi_arr_data[n - cos_bit_min]; } static inline const int32_t *sinpi_arr(int n) { return av1_sinpi_arr_data[n - cos_bit_min]; } // The reduced bit-width and permuted arrays are only used in the Arm Neon // implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now. #if HAVE_NEON // Store cospi/sinpi costants in Q2.13 format. // See: https://en.wikipedia.org/wiki/Q_(number_format) extern const int16_t av1_cospi_arr_q13_data[4][128]; extern const int16_t av1_sinpi_arr_q13_data[4][4]; extern const int32_t av1_cospi_arr_s32_data[4][66]; static inline const int16_t *cospi_arr_q13(int n) { return av1_cospi_arr_q13_data[n - cos_bit_min]; } static inline const int16_t *sinpi_arr_q13(int n) { return av1_sinpi_arr_q13_data[n - cos_bit_min]; } static inline const int32_t *cospi_arr_s32(int n) { return av1_cospi_arr_s32_data[n - cos_bit_min]; } #endif // HAVE_NEON static inline int32_t range_check_value(int32_t value, int8_t bit) { #if CONFIG_COEFFICIENT_RANGE_CHECKING const int64_t max_value = (1LL << (bit - 1)) - 1; const int64_t min_value = -(1LL << (bit - 1)); if (value < min_value || value > max_value) { fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); #if !CONFIG_AV1_ENCODER assert(0); #endif } #endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP bit = AOMMIN(bit, 31); return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); #endif // DO_RANGE_CHECK_CLAMP (void)bit; return value; } static inline int32_t round_shift(int64_t value, int bit) { assert(bit >= 1); return (int32_t)((value + (1ll << (bit - 1))) >> bit); } static inline int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, int bit) { int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); int64_t intermediate = result_64 + (1LL << (bit - 1)); // NOTE(rachelbarker): The value 'result_64' may not necessarily fit // into 32 bits. However, the result of this function is nominally // ROUND_POWER_OF_TWO_64(result_64, bit) // and that is required to fit into stage_range[stage] many bits // (checked by range_check_buf()). // // Here we've unpacked that rounding operation, and it can be shown // that the value of 'intermediate' here *does* fit into 32 bits // for any conformant bitstream. // The upshot is that, if you do all this calculation using // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, // then you'll still get the correct result. // To provide a check on this logic, we assert that 'intermediate' // would fit into an int32 if range checking is enabled. 
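// Worked example (illustrative; 2896 == NewInvSqrt2 == round(2^12 / sqrt(2))):
//   half_btf(2896, a, 2896, b, 12)
//     == round_shift((int64_t)2896 * a + (int64_t)2896 * b, 12)
//     ~= (a + b) / sqrt(2)
// i.e. the 45-degree rotation butterfly used by the DCT stages.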
#if CONFIG_COEFFICIENT_RANGE_CHECKING assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); #endif return (int32_t)(intermediate >> bit); } static inline uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, int bd) { return clip_pixel_highbd(dest + (int)trans, bd); } typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); enum { TXFM_TYPE_DCT4, TXFM_TYPE_DCT8, TXFM_TYPE_DCT16, TXFM_TYPE_DCT32, TXFM_TYPE_DCT64, TXFM_TYPE_ADST4, TXFM_TYPE_ADST8, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY4, TXFM_TYPE_IDENTITY8, TXFM_TYPE_IDENTITY16, TXFM_TYPE_IDENTITY32, TXFM_TYPES, TXFM_TYPE_INVALID, } UENUM1BYTE(TXFM_TYPE); typedef struct TXFM_2D_FLIP_CFG { TX_SIZE tx_size; int ud_flip; // flip upside down int lr_flip; // flip left to right const int8_t *shift; int8_t cos_bit_col; int8_t cos_bit_row; int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; TXFM_TYPE txfm_type_col; TXFM_TYPE txfm_type_row; int stage_num_col; int stage_num_row; } TXFM_2D_FLIP_CFG; static inline void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: *ud_flip = 0; *lr_flip = 0; break; case IDTX: case V_DCT: case H_DCT: case V_ADST: case H_ADST: *ud_flip = 0; *lr_flip = 0; break; case FLIPADST_DCT: case FLIPADST_ADST: case V_FLIPADST: *ud_flip = 1; *lr_flip = 0; break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: *ud_flip = 0; *lr_flip = 1; break; case FLIPADST_FLIPADST: *ud_flip = 1; *lr_flip = 1; break; default: *ud_flip = 0; *lr_flip = 0; assert(0); } } static inline void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); } // Utility function that returns the log of the ratio of the col and row // sizes. static inline int get_rect_tx_log_ratio(int col, int row) { if (col == row) return 0; if (col > row) { if (col == row * 2) return 1; if (col == row * 4) return 2; assert(0 && "Unsupported transform size"); } else { if (row == col * 2) return -1; if (row == col * 4) return -2; assert(0 && "Unsupported transform size"); } return 0; // Invalid } void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, const TXFM_2D_FLIP_CFG *cfg, int bd); void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, int bd); void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg); void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg); extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; static inline int get_txw_idx(TX_SIZE tx_size) { return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; } static inline int get_txh_idx(TX_SIZE tx_size) { return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; } void av1_range_check_buf(int32_t stage, const int32_t *input, const int32_t *buf, int32_t size, int8_t bit); #define MAX_TXWH_IDX 5 #ifdef __cplusplus } #endif // __cplusplus #endif // AOM_AV1_COMMON_AV1_TXFM_H_ aom-3.12.1/av1/common/blockd.c000066400000000000000000000074441477627663500157730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { if (!left_mi) return DC_PRED; assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi)); return left_mi->mode; } PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { if (!above_mi) return DC_PRED; assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi)); return above_mi->mode; } void av1_set_entropy_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; const int txs_wide = tx_size_wide_unit[tx_size]; const int txs_high = tx_size_high_unit[tx_size]; // above if (has_eob && xd->mb_to_right_edge < 0) { const int blocks_wide = max_block_wide(xd, plane_bsize, plane); const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff); memset(a, has_eob, sizeof(*a) * above_contexts); memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts)); } else { memset(a, has_eob, sizeof(*a) * txs_wide); } // left if (has_eob && xd->mb_to_bottom_edge < 0) { const int blocks_high = max_block_high(xd, plane_bsize, plane); const int left_contexts = AOMMIN(txs_high, blocks_high - loff); memset(l, has_eob, sizeof(*l) * left_contexts); memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts)); } else { memset(l, has_eob, sizeof(*l) * txs_high); } } void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, const int num_planes) { assert(bsize < BLOCK_SIZES_ALL); const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; for (int i = 0; i < nplanes; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const int txs_wide = mi_size_wide[plane_bsize]; const int txs_high = mi_size_high[plane_bsize]; memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); } } void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) { xd->delta_lf_from_base = 0; const int frame_lf_count = num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0; } void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) { for (int p = 0; p < num_planes; ++p) { set_default_wiener(xd->wiener_info + p); set_default_sgrproj(xd->sgrproj_info + p); } } void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, const int num_planes) { int i; for (i = 0; i < num_planes; i++) { xd->plane[i].plane_type = get_plane_type(i); xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? 
ss_y : 0; } for (i = num_planes; i < MAX_MB_PLANE; i++) { xd->plane[i].subsampling_x = 1; xd->plane[i].subsampling_y = 1; } } aom-3.12.1/av1/common/blockd.h000066400000000000000000001567271477627663500160110ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_BLOCKD_H_ #define AOM_AV1_COMMON_BLOCKD_H_ #include "config/aom_config.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" #include "av1/common/common_data.h" #include "av1/common/quant_common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/mv.h" #include "av1/common/scale.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" #ifdef __cplusplus extern "C" { #endif #define USE_B_QUANT_NO_TRELLIS 1 #define MAX_MB_PLANE 3 #define MAX_DIFFWTD_MASK_BITS 1 #define INTERINTRA_WEDGE_SIGN 0 #define DEFAULT_INTER_TX_TYPE DCT_DCT #define MAX_PALETTE_BLOCK_WIDTH 64 #define MAX_PALETTE_BLOCK_HEIGHT 64 /*!\cond */ // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS enum { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, } UENUM1BYTE(DIFFWTD_MASK_TYPE); enum { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only S_FRAME = 3, FRAME_TYPES, } UENUM1BYTE(FRAME_TYPE); static inline int is_comp_ref_allowed(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } static inline int is_inter_mode(PREDICTION_MODE mode) { return mode >= INTER_MODE_START && mode < INTER_MODE_END; } typedef struct { uint8_t *plane[MAX_MB_PLANE]; int stride[MAX_MB_PLANE]; } BUFFER_SET; static inline int is_inter_singleref_mode(PREDICTION_MODE mode) { return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; } static inline int is_inter_compound_mode(PREDICTION_MODE mode) { return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; } static inline PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { static const PREDICTION_MODE lut[] = { DC_PRED, // DC_PRED V_PRED, // V_PRED H_PRED, // H_PRED D45_PRED, // D45_PRED D135_PRED, // D135_PRED D113_PRED, // D113_PRED D157_PRED, // D157_PRED D203_PRED, // D203_PRED D67_PRED, // D67_PRED SMOOTH_PRED, // SMOOTH_PRED SMOOTH_V_PRED, // SMOOTH_V_PRED SMOOTH_H_PRED, // SMOOTH_H_PRED PAETH_PRED, // PAETH_PRED NEARESTMV, // NEARESTMV NEARMV, // NEARMV GLOBALMV, // GLOBALMV NEWMV, // NEWMV NEARESTMV, // NEAREST_NEARESTMV NEARMV, // NEAR_NEARMV NEARESTMV, // NEAREST_NEWMV NEWMV, // NEW_NEARESTMV NEARMV, // NEAR_NEWMV NEWMV, // NEW_NEARMV GLOBALMV, // GLOBAL_GLOBALMV NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); return lut[mode]; } static inline PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { static const PREDICTION_MODE lut[] = { MB_MODE_COUNT, // DC_PRED MB_MODE_COUNT, // V_PRED MB_MODE_COUNT, // H_PRED MB_MODE_COUNT, // D45_PRED MB_MODE_COUNT, // D135_PRED MB_MODE_COUNT, // D113_PRED MB_MODE_COUNT, // D157_PRED 
MB_MODE_COUNT, // D203_PRED MB_MODE_COUNT, // D67_PRED MB_MODE_COUNT, // SMOOTH_PRED MB_MODE_COUNT, // SMOOTH_V_PRED MB_MODE_COUNT, // SMOOTH_H_PRED MB_MODE_COUNT, // PAETH_PRED MB_MODE_COUNT, // NEARESTMV MB_MODE_COUNT, // NEARMV MB_MODE_COUNT, // GLOBALMV MB_MODE_COUNT, // NEWMV NEARESTMV, // NEAREST_NEARESTMV NEARMV, // NEAR_NEARMV NEWMV, // NEAREST_NEWMV NEARESTMV, // NEW_NEARESTMV NEWMV, // NEAR_NEWMV NEARMV, // NEW_NEARMV GLOBALMV, // GLOBAL_GLOBALMV NEWMV, // NEW_NEWMV }; assert(NELEMENTS(lut) == MB_MODE_COUNT); assert(is_inter_compound_mode(mode)); return lut[mode]; } static inline int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } static inline int have_newmv_in_inter_mode(PREDICTION_MODE mode) { return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); } static inline int is_masked_compound_type(COMPOUND_TYPE type) { return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); } /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ typedef struct { // Value of base colors for Y, U, and V uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; // Number of base colors for Y (0) and UV (1) uint8_t palette_size[2]; } PALETTE_MODE_INFO; typedef struct { FILTER_INTRA_MODE filter_intra_mode; uint8_t use_filter_intra; } FILTER_INTRA_MODE_INFO; static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED }; #if CONFIG_RD_DEBUG #define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) #endif typedef struct RD_STATS { int rate; int zero_rate; int64_t dist; // Please be careful of using rdcost, it's not guaranteed to be set all the // time. // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In // these functions, make sure rdcost is always up-to-date according to // rate/dist. int64_t rdcost; int64_t sse; uint8_t skip_txfm; // sse should equal to dist when skip_txfm == 1 #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; #endif // CONFIG_RD_DEBUG } RD_STATS; // This struct is used to group function args that are commonly // sent together in functions related to interinter compound modes typedef struct { uint8_t *seg_mask; int8_t wedge_index; int8_t wedge_sign; DIFFWTD_MASK_TYPE mask_type; COMPOUND_TYPE type; } INTERINTER_COMPOUND_DATA; #define INTER_TX_SIZE_BUF_LEN 16 #define TXK_TYPE_BUF_LEN 64 /*!\endcond */ /*! \brief Stores the prediction/txfm mode of the current coding block */ typedef struct MB_MODE_INFO { /***************************************************************************** * \name General Info of the Coding Block ****************************************************************************/ /**@{*/ /*! \brief The block size of the current coding block */ BLOCK_SIZE bsize; /*! \brief The partition type of the current coding block. */ PARTITION_TYPE partition; /*! \brief The prediction mode used */ PREDICTION_MODE mode; /*! \brief The UV mode when intra is used */ UV_PREDICTION_MODE uv_mode; /*! \brief The q index for the current coding block. */ int current_qindex; /**@}*/ /***************************************************************************** * \name Inter Mode Info ****************************************************************************/ /**@{*/ /*! 
\brief The motion vectors used by the current inter mode */ int_mv mv[2]; /*! \brief The reference frames for the MV */ MV_REFERENCE_FRAME ref_frame[2]; /*! \brief Filter used in subpel interpolation. */ int_interpfilters interp_filters; /*! \brief The motion mode used by the inter prediction. */ MOTION_MODE motion_mode; /*! \brief Number of samples used by warp causal */ uint8_t num_proj_ref; /*! \brief The number of overlapped neighbors above/left for obmc/warp motion * mode. */ uint8_t overlappable_neighbors; /*! \brief The parameters used in warp motion mode. */ WarpedMotionParams wm_params; /*! \brief The type of intra mode used by inter-intra */ INTERINTRA_MODE interintra_mode; /*! \brief The type of wedge used in interintra mode. */ int8_t interintra_wedge_index; /*! \brief Struct that stores the data used in interinter compound mode. */ INTERINTER_COMPOUND_DATA interinter_comp; /**@}*/ /***************************************************************************** * \name Intra Mode Info ****************************************************************************/ /**@{*/ /*! \brief Directional mode delta: the angle is base angle + (angle_delta * * step). */ int8_t angle_delta[PLANE_TYPES]; /*! \brief The type of filter intra mode used (if applicable). */ FILTER_INTRA_MODE_INFO filter_intra_mode_info; /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */ int8_t cfl_alpha_signs; /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */ uint8_t cfl_alpha_idx; /*! \brief Stores the size and colors of palette mode */ PALETTE_MODE_INFO palette_mode_info; /**@}*/ /***************************************************************************** * \name Transform Info ****************************************************************************/ /**@{*/ /*! \brief Whether to skip transforming and sending. */ uint8_t skip_txfm; /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */ TX_SIZE tx_size; /*! \brief Transform size when recursive txfm tree is on. */ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; /**@}*/ /***************************************************************************** * \name Loop Filter Info ****************************************************************************/ /**@{*/ /*! \copydoc MACROBLOCKD::delta_lf_from_base */ int8_t delta_lf_from_base; /*! \copydoc MACROBLOCKD::delta_lf */ int8_t delta_lf[FRAME_LF_COUNT]; /**@}*/ /***************************************************************************** * \name Bitfield for Memory Reduction ****************************************************************************/ /**@{*/ /*! \brief The segment id */ uint8_t segment_id : 3; /*! \brief Only valid when temporal update if off. */ uint8_t seg_id_predicted : 1; /*! \brief Which ref_mv to use */ uint8_t ref_mv_idx : 2; /*! \brief Inter skip mode */ uint8_t skip_mode : 1; /*! \brief Whether intrabc is used. */ uint8_t use_intrabc : 1; /*! \brief Indicates if masked compound is used(1) or not (0). */ uint8_t comp_group_idx : 1; /*! \brief Indicates whether dist_wtd_comp(0) is used or not (0). */ uint8_t compound_idx : 1; /*! \brief Whether to use interintra wedge */ uint8_t use_wedge_interintra : 1; /*! \brief CDEF strength per BLOCK_64X64 */ int8_t cdef_strength : 4; /**@}*/ #if CONFIG_RD_DEBUG /*! \brief RD info used for debugging */ RD_STATS rd_stats; /*! \brief The current row in unit of 4x4 blocks for debugging */ int mi_row; /*! 
\brief The current col in unit of 4x4 blocks for debugging */ int mi_col; #endif #if CONFIG_INSPECTION /*! \brief Whether we are skipping the current rows or columns. */ int16_t tx_skip[TXK_TYPE_BUF_LEN]; #endif } MB_MODE_INFO; /*!\cond */ static inline int is_intrabc_block(const MB_MODE_INFO *mbmi) { return mbmi->use_intrabc; } static inline PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { assert(mode < UV_INTRA_MODES); static const PREDICTION_MODE uv2y[] = { DC_PRED, // UV_DC_PRED V_PRED, // UV_V_PRED H_PRED, // UV_H_PRED D45_PRED, // UV_D45_PRED D135_PRED, // UV_D135_PRED D113_PRED, // UV_D113_PRED D157_PRED, // UV_D157_PRED D203_PRED, // UV_D203_PRED D67_PRED, // UV_D67_PRED SMOOTH_PRED, // UV_SMOOTH_PRED SMOOTH_V_PRED, // UV_SMOOTH_V_PRED SMOOTH_H_PRED, // UV_SMOOTH_H_PRED PAETH_PRED, // UV_PAETH_PRED DC_PRED, // UV_CFL_PRED INTRA_INVALID, // UV_INTRA_MODES INTRA_INVALID, // UV_MODE_INVALID }; return uv2y[mode]; } static inline int is_inter_block(const MB_MODE_INFO *mbmi) { return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; } static inline int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } static inline int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ (mbmi->ref_frame[1] >= BWDREF_FRAME))); } static inline MV_REFERENCE_FRAME comp_ref0(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { LAST_FRAME, // LAST_LAST2_FRAMES, LAST_FRAME, // LAST_LAST3_FRAMES, LAST_FRAME, // LAST_GOLDEN_FRAMES, BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, LAST2_FRAME, // LAST2_LAST3_FRAMES LAST2_FRAME, // LAST2_GOLDEN_FRAMES, LAST3_FRAME, // LAST3_GOLDEN_FRAMES, BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES, ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES, }; assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); return lut[ref_idx]; } static inline MV_REFERENCE_FRAME comp_ref1(int ref_idx) { static const MV_REFERENCE_FRAME lut[] = { LAST2_FRAME, // LAST_LAST2_FRAMES, LAST3_FRAME, // LAST_LAST3_FRAMES, GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, LAST3_FRAME, // LAST2_LAST3_FRAMES GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, }; assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); return lut[ref_idx]; } PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); static inline int is_global_mv_block(const MB_MODE_INFO *const mbmi, TransformationType type) { const PREDICTION_MODE mode = mbmi->mode; const BLOCK_SIZE bsize = mbmi->bsize; const int block_size_allowed = AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && block_size_allowed; } #if CONFIG_MISMATCH_DEBUG static inline void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, int mi_row, int tx_blk_col, int tx_blk_row, int subsampling_x, int subsampling_y) { *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + (tx_blk_col << MI_SIZE_LOG2); *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + (tx_blk_row << MI_SIZE_LOG2); } #endif enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); struct buf_2d { uint8_t *buf; uint8_t *buf0; int width; int height; int stride; }; typedef struct eob_info { uint16_t eob; uint16_t max_scan_line; } eob_info; typedef struct { DECLARE_ALIGNED(32, tran_low_t, 
dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); eob_info eob_data[MAX_MB_PLANE] [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); } CB_BUFFER; typedef struct macroblockd_plane { PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; ENTROPY_CONTEXT *above_entropy_context; ENTROPY_CONTEXT *left_entropy_context; // The dequantizers below are true dequantizers used only in the // dequantization process. They have the same coefficient // shift/scale as TX. int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; // Pointer to color index map of: // - Current coding block, on encoder side. // - Current superblock, on decoder side. uint8_t *color_index_map; // block size in pixels uint8_t width, height; qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; } MACROBLOCKD_PLANE; #define BLOCK_OFFSET(i) ((i) << 4) /*!\endcond */ /*!\brief Parameters related to Wiener Filter */ typedef struct { /*! * Vertical filter kernel. */ DECLARE_ALIGNED(16, InterpKernel, vfilter); /*! * Horizontal filter kernel. */ DECLARE_ALIGNED(16, InterpKernel, hfilter); } WienerInfo; /*!\brief Parameters related to Sgrproj Filter */ typedef struct { /*! * Parameter index. */ int ep; /*! * Weights for linear combination of filtered versions */ int xqd[2]; } SgrprojInfo; /*!\cond */ #define CFL_MAX_BLOCK_SIZE (BLOCK_32X32) #define CFL_BUF_LINE (32) #define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3) #define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4) #define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE) typedef struct cfl_ctx { // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid // shifts) uint16_t recon_buf_q3[CFL_BUF_SQUARE]; // Q3 AC contributions (reconstructed luma pixels - tx block avg) int16_t ac_buf_q3[CFL_BUF_SQUARE]; // Cache the DC_PRED when performing RDO, so it does not have to be recomputed // for every scaling parameter bool dc_pred_is_cached[CFL_PRED_PLANES]; // Whether the DC_PRED cache is enabled. The DC_PRED cache is disabled when // decoding. bool use_dc_pred_cache; // Only cache the first row of the DC_PRED int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE]; // Height and width currently used in the CfL prediction buffer. int buf_height, buf_width; int are_parameters_computed; // Chroma subsampling int subsampling_x, subsampling_y; // Whether the reconstructed luma pixels need to be stored int store_y; } CFL_CTX; typedef struct dist_wtd_comp_params { int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; } DIST_WTD_COMP_PARAMS; struct scale_factors; /*!\endcond */ /*! \brief Variables related to current coding block. * * This is a common set of variables used by both encoder and decoder. * Most/all of the pointers are mere pointers to actual arrays are allocated * elsewhere. This is mostly for coding convenience. */ typedef struct macroblockd { /** * \name Position of current macroblock in mi units */ /**@{*/ int mi_row; /*!< Row position in mi units. */ int mi_col; /*!< Column position in mi units. */ /**@}*/ /*! * Same as cm->mi_params.mi_stride, copied here for convenience. */ int mi_stride; /*! * True if current block transmits chroma information. * More detail: * Smallest supported block size for both luma and chroma plane is 4x4. Hence, * in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma * blocks smaller than 8x8 maybe combined into one chroma block. 
* For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4 * luma blocks. Then, a single chroma block of size 4x4 will cover the area of * these four luma blocks. This is implemented in bitstream as follows: * - There are four MB_MODE_INFO structs for the four luma blocks. * - First 3 MB_MODE_INFO have is_chroma_ref = false, and so do not transmit * any information for chroma planes. * - Last block will have is_chroma_ref = true and transmits chroma * information for the 4x4 chroma block that covers whole 8x8 area covered by * four luma blocks. * Similar logic applies for chroma blocks that cover 2 or 3 luma blocks. */ bool is_chroma_ref; /*! * Info specific to each plane. */ struct macroblockd_plane plane[MAX_MB_PLANE]; /*! * Tile related info. */ TileInfo tile; /*! * Appropriate offset inside cm->mi_params.mi_grid_base based on current * mi_row and mi_col. */ MB_MODE_INFO **mi; /*! * True if 4x4 block above the current block is available. */ bool up_available; /*! * True if 4x4 block to the left of the current block is available. */ bool left_available; /*! * True if the above chrome reference block is available. */ bool chroma_up_available; /*! * True if the left chrome reference block is available. */ bool chroma_left_available; /*! * MB_MODE_INFO for 4x4 block to the left of the current block, if * left_available == true; otherwise NULL. */ MB_MODE_INFO *left_mbmi; /*! * MB_MODE_INFO for 4x4 block above the current block, if * up_available == true; otherwise NULL. */ MB_MODE_INFO *above_mbmi; /*! * Above chroma reference block if is_chroma_ref == true for the current block * and chroma_up_available == true; otherwise NULL. * See also: the special case logic when current chroma block covers more than * one luma blocks in set_mi_row_col(). */ MB_MODE_INFO *chroma_left_mbmi; /*! * Left chroma reference block if is_chroma_ref == true for the current block * and chroma_left_available == true; otherwise NULL. * See also: the special case logic when current chroma block covers more than * one luma blocks in set_mi_row_col(). */ MB_MODE_INFO *chroma_above_mbmi; /*! * Appropriate offset based on current 'mi_row' and 'mi_col', inside * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or * 'MACROBLOCK' structs. */ uint8_t *tx_type_map; /*! * Stride for 'tx_type_map'. Note that this may / may not be same as * 'mi_stride', depending on which actual array 'tx_type_map' points to. */ int tx_type_map_stride; /** * \name Distance of this macroblock from frame edges in 1/8th pixel units. */ /**@{*/ int mb_to_left_edge; /*!< Distance from left edge */ int mb_to_right_edge; /*!< Distance from right edge */ int mb_to_top_edge; /*!< Distance from top edge */ int mb_to_bottom_edge; /*!< Distance from bottom edge */ /**@}*/ /*! * Scale factors for reference frames of the current block. * These are pointers into 'cm->ref_scale_factors'. */ const struct scale_factors *block_ref_scale_factors[2]; /*! * - On encoder side: points to cpi->source, which is the buffer containing * the current *source* frame (maybe filtered). * - On decoder side: points to cm->cur_frame->buf, which is the buffer into * which current frame is being *decoded*. */ const YV12_BUFFER_CONFIG *cur_buf; /*! * Entropy contexts for the above blocks. * above_entropy_context[i][j] corresponds to above entropy context for ith * plane and jth mi column of this *frame*, wrt current 'mi_row'. * These are pointers into 'cm->above_contexts.entropy'. */ ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE]; /*! 
* Entropy contexts for the left blocks. * left_entropy_context[i][j] corresponds to left entropy context for ith * plane and jth mi row of this *superblock*, wrt current 'mi_col'. * Note: These contain actual data, NOT pointers. */ ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE]; /*! * Partition contexts for the above blocks. * above_partition_context[i] corresponds to above partition context for ith * mi column of this *frame*, wrt current 'mi_row'. * This is a pointer into 'cm->above_contexts.partition'. */ PARTITION_CONTEXT *above_partition_context; /*! * Partition contexts for the left blocks. * left_partition_context[i] corresponds to left partition context for ith * mi row of this *superblock*, wrt current 'mi_col'. * Note: These contain actual data, NOT pointers. */ PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE]; /*! * Transform contexts for the above blocks. * above_txfm_context[i] corresponds to above transform context for ith mi col * from the current position (mi row and mi column) for this *frame*. * This is a pointer into 'cm->above_contexts.txfm'. */ TXFM_CONTEXT *above_txfm_context; /*! * Transform contexts for the left blocks. * left_txfm_context[i] corresponds to left transform context for ith mi row * from the current position (mi_row and mi_col) for this *superblock*. * This is a pointer into 'left_txfm_context_buffer'. */ TXFM_CONTEXT *left_txfm_context; /*! * left_txfm_context_buffer[i] is the left transform context for ith mi_row * in this *superblock*. * Behaves like an internal actual buffer which 'left_txt_context' points to, * and never accessed directly except to fill in initial default values. */ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE]; /** * \name Default values for the two restoration filters for each plane. * Default values for the two restoration filters for each plane. * These values are used as reference values when writing the bitstream. That * is, we transmit the delta between the actual values in * cm->rst_info[plane].unit_info[unit_idx] and these reference values. */ /**@{*/ WienerInfo wiener_info[MAX_MB_PLANE]; /*!< Defaults for Wiener filter*/ SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */ /**@}*/ /** * \name Block dimensions in MB_MODE_INFO units. */ /**@{*/ uint8_t width; /*!< Block width in MB_MODE_INFO units */ uint8_t height; /*!< Block height in MB_MODE_INFO units */ /**@}*/ /*! * Contains the motion vector candidates found during motion vector prediction * process. ref_mv_stack[i] contains the candidates for ith type of * reference frame (single/compound). The actual number of candidates found in * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side) * or mbmi_ext->ref_mv_count[i] (encoder side). */ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; /*! * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the * DRL (dynamic reference list) mode contexts. */ uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; /*! * True if this is the last vertical rectangular block in a VERTICAL or * VERTICAL_4 partition. */ bool is_last_vertical_rect; /*! * True if this is the 1st horizontal rectangular block in a HORIZONTAL or * HORIZONTAL_4 partition. */ bool is_first_horizontal_rect; /*! * Counts of each reference frame in the above and left neighboring blocks. * NOTE: Take into account both single and comp references. */ uint8_t neighbors_ref_counts[REF_FRAMES]; /*! * Current CDFs of all the symbols for the current tile. 
*/ FRAME_CONTEXT *tile_ctx; /*! * Bit depth: copied from cm->seq_params->bit_depth for convenience. */ int bd; /*! * Quantizer index for each segment (base qindex + delta for each segment). */ int qindex[MAX_SEGMENTS]; /*! * lossless[s] is true if segment 's' is coded losslessly. */ int lossless[MAX_SEGMENTS]; /*! * Q index for the coding blocks in this superblock will be stored in * mbmi->current_qindex. Now, when cm->delta_q_info.delta_q_present_flag is * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as * the base, and adding any transmitted delta qindex on top of it. * Precisely, this is the latest qindex used by the first coding block of a * non-skip superblock in the current tile; OR * same as cm->quant_params.base_qindex (if not explicitly set yet). * Note: This is 'CurrentQIndex' in the AV1 spec. */ int current_base_qindex; /*! * Same as cm->features.cur_frame_force_integer_mv. */ int cur_frame_force_integer_mv; /*! * Pointer to cm->error. */ struct aom_internal_error_info *error_info; /*! * Same as cm->global_motion. */ const WarpedMotionParams *global_motion; /*! * Since actual frame level loop filtering level value is not available * at the beginning of the tile (only available during actual filtering) * at encoder side.we record the delta_lf (against the frame level loop * filtering level) and code the delta between previous superblock's delta * lf and current delta lf. It is equivalent to the delta between previous * superblock's actual lf and current lf. */ int8_t delta_lf_from_base; /*! * We have four frame filter levels for different plane and direction. So, to * support the per superblock update, we need to add a few more params: * 0. delta loop filter level for y plane vertical * 1. delta loop filter level for y plane horizontal * 2. delta loop filter level for u plane * 3. delta loop filter level for v plane * To make it consistent with the reference to each filter level in segment, * we need to -1, since * - SEG_LVL_ALT_LF_Y_V = 1; * - SEG_LVL_ALT_LF_Y_H = 2; * - SEG_LVL_ALT_LF_U = 3; * - SEG_LVL_ALT_LF_V = 4; */ int8_t delta_lf[FRAME_LF_COUNT]; /*! * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the * current superblock has already been read from (decoder) / written to * (encoder) the bitstream; and false otherwise. * More detail: * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st * non-skip coding block. So, we need this array to keep track of whether CDEF * strengths for the given CDEF units have been transmitted yet or not. * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if * superblock size is 128x128). Hence the array size is 4. * 3. In the current implementation, CDEF strength for this CDEF unit is * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside * cm->mi_params.mi_grid_base). */ bool cdef_transmitted[4]; /*! * Mask for this block used for compound prediction. */ uint8_t *seg_mask; /*! * CFL (chroma from luma) related parameters. */ CFL_CTX cfl; /*! * Offset to plane[p].color_index_map. * Currently: * - On encoder side, this is always 0 as 'color_index_map' is allocated per * *coding block* there. * - On decoder side, this may be non-zero, as 'color_index_map' is a (static) * memory pointing to the base of a *superblock* there, and we need an offset * to it to get the color index map for current coding block. */ uint16_t color_index_map_offset[2]; /*! 
* Temporary buffer used for convolution in case of compound reference only * for (weighted or uniform) averaging operation. * There are pointers to actual buffers allocated elsewhere: e.g. * - In decoder, 'pbi->td.tmp_conv_dst' or * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and * - In encoder, 'x->tmp_conv_dst' or * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'. */ CONV_BUF_TYPE *tmp_conv_dst; /*! * Temporary buffers used to build OBMC prediction by above (index 0) and left * (index 1) predictors respectively. * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'. * There are pointers to actual buffers allocated elsewhere: e.g. * - In decoder, 'pbi->td.tmp_obmc_bufs' or * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and * -In encoder, 'x->tmp_pred_bufs' or * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'. */ uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; /*!\cond */ static inline int is_cur_buf_hbd(const MACROBLOCKD *xd) { #if CONFIG_AV1_HIGHBITDEPTH return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; #else (void)xd; return 0; #endif } static inline uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { #if CONFIG_AV1_HIGHBITDEPTH return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(buf16) : buf16; #else (void)xd; return buf16; #endif } typedef struct BitDepthInfo { int bit_depth; /*! Is the image buffer high bit depth? * Low bit depth buffer uses uint8_t. * High bit depth buffer uses uint16_t. * Equivalent to cm->seq_params->use_highbitdepth */ int use_highbitdepth_buf; } BitDepthInfo; static inline BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { BitDepthInfo bit_depth_info; bit_depth_info.bit_depth = xd->bd; bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf, bit_depth_info.bit_depth == 8)); return bit_depth_info; } static inline int get_sqr_bsize_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_4X4: return 0; case BLOCK_8X8: return 1; case BLOCK_16X16: return 2; case BLOCK_32X32: return 3; case BLOCK_64X64: return 4; case BLOCK_128X128: return 5; default: return SQR_BLOCK_SIZES; } } // For a square block size 'bsize', returns the size of the sub-blocks used by // the given partition type. If the partition produces sub-blocks of different // sizes, then the function returns the largest sub-block size. // Implements the Partition_Subsize lookup table in the spec (Section 9.3. // Conversion tables). // Note: the input block size should be square. // Otherwise it's considered invalid. static inline BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { if (partition == PARTITION_INVALID) { return BLOCK_INVALID; } else { const int sqr_bsize_idx = get_sqr_bsize_idx(bsize); return sqr_bsize_idx >= SQR_BLOCK_SIZES ? BLOCK_INVALID : subsize_lookup[partition][sqr_bsize_idx]; } } static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, PLANE_TYPE plane_type) { static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { DCT_DCT, // DC_PRED ADST_DCT, // V_PRED DCT_ADST, // H_PRED DCT_DCT, // D45_PRED ADST_ADST, // D135_PRED ADST_DCT, // D113_PRED DCT_ADST, // D157_PRED DCT_ADST, // D203_PRED ADST_DCT, // D67_PRED ADST_ADST, // SMOOTH_PRED ADST_DCT, // SMOOTH_V_PRED DCT_ADST, // SMOOTH_H_PRED ADST_ADST, // PAETH_PRED }; const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ? 
mbmi->mode : get_uv_mode(mbmi->uv_mode); assert(mode < INTRA_MODES); return _intra_mode_to_tx_type[mode]; } static inline int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } static inline int block_signals_txsize(BLOCK_SIZE bsize) { return bsize > BLOCK_4X4; } // Number of transform types in each set type static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { 1, 2, 5, 7, 12, 16, }; static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, }; // The bitmask corresponds to the transform types as defined in // enums.h TX_TYPE enumeration type. Setting the bit 0 means to disable // the use of the corresponding transform type in that table. // The av1_derived_intra_tx_used_flag table is used when // use_reduced_intra_txset is set to 2, where one only searches // the transform types derived from residual statistics. static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = { 0x0209, // DC_PRED: 0000 0010 0000 1001 0x0403, // V_PRED: 0000 0100 0000 0011 0x0805, // H_PRED: 0000 1000 0000 0101 0x020F, // D45_PRED: 0000 0010 0000 1111 0x0009, // D135_PRED: 0000 0000 0000 1001 0x0009, // D113_PRED: 0000 0000 0000 1001 0x0009, // D157_PRED: 0000 0000 0000 1001 0x0805, // D203_PRED: 0000 1000 0000 0101 0x0403, // D67_PRED: 0000 0100 0000 0011 0x0205, // SMOOTH_PRED: 0000 0010 0000 1001 0x0403, // SMOOTH_V_PRED: 0000 0100 0000 0011 0x0805, // SMOOTH_H_PRED: 0000 1000 0000 0101 0x0209, // PAETH_PRED: 0000 0010 0000 1001 }; static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { 0x080F, // DC_PRED: 0000 1000 0000 1111 0x040F, // V_PRED: 0000 0100 0000 1111 0x080F, // H_PRED: 0000 1000 0000 1111 0x020F, // D45_PRED: 0000 0010 0000 1111 0x080F, // D135_PRED: 0000 1000 0000 1111 0x040F, // D113_PRED: 0000 0100 0000 1111 0x080F, // D157_PRED: 0000 1000 0000 1111 0x080F, // D203_PRED: 0000 1000 0000 1111 0x040F, // D67_PRED: 0000 0100 0000 1111 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 }; static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { 0x0001, // 0000 0000 0000 0001 0x0201, // 0000 0010 0000 0001 0x020F, // 0000 0010 0000 1111 0x0E0F, // 0000 1110 0000 1111 0x0FFF, // 0000 1111 1111 1111 0xFFFF, // 1111 1111 1111 1111 }; static const TxSetType av1_ext_tx_set_lookup[2][2] = { { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, }; static inline TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; if (tx_size_sqr_up == TX_32X32) return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY; if (use_reduced_set) return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; } // Maps tx set types to the indices. 
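// Entries of -1 below mark set types that cannot occur for that prediction
// class (intra vs. inter); compare av1_get_ext_tx_set_type() above.
//
// Illustrative sketch (an editor addition, not part of the original header;
// the helper name is hypothetical): the set-type lookup combines with the
// av1_ext_tx_used_flag bitmask defined above to test whether a particular
// transform type is available for a block.
static inline int example_is_tx_type_available(TX_SIZE tx_size, int is_inter,
                                               int use_reduced_set,
                                               TX_TYPE tx_type) {
  const TxSetType set_type =
      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
  // Bit i of the flag corresponds to TX_TYPE i, so this is equivalent to
  // checking av1_ext_tx_used[set_type][tx_type].
  return (av1_ext_tx_used_flag[set_type] >> tx_type) & 1;
}
// The mapping itself, indexed as [is_inter][set_type]: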
static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { { // Intra 0, -1, 2, 1, -1, -1 }, { // Inter 0, 3, -1, -1, 2, 1 }, }; static inline int get_ext_tx_set(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const TxSetType set_type = av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); return ext_tx_set_index[is_inter][set_type]; } static inline int get_ext_tx_types(TX_SIZE tx_size, int is_inter, int use_reduced_set) { const int set_type = av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); return av1_num_ext_tx_set[set_type]; } #define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) #define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2)) static inline TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; if (bsize == BLOCK_4X4) return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) return max_rect_tx_size; else return largest_tx_size; } static const uint8_t mode_to_angle_map[INTRA_MODES] = { 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, }; // Converts block_index for given transform size to index of the block in raster // order. static inline int av1_block_index_to_raster_order(TX_SIZE tx_size, int block_idx) { // For transform size 4x8, the possible block_idx values are 0 & 2, because // block_idx values are incremented in steps of size 'tx_width_unit x // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to // block number 1 in raster order, inside an 8x8 MI block. // For any other transform size, the two indices are equivalent. return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx; } // Inverse of above function. // Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. static inline int av1_raster_order_to_block_index(TX_SIZE tx_size, int raster_order) { assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; } static inline TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, TX_SIZE tx_size, int use_screen_content_tools) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || use_screen_content_tools) return DEFAULT_INTER_TX_TYPE; return intra_mode_to_tx_type(mbmi, plane_type); } // Implements the get_plane_residual_size() function in the spec (Section // 5.11.38. Get plane residual size function). 
static inline BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(bsize < BLOCK_SIZES_ALL); assert(subsampling_x >= 0 && subsampling_x < 2); assert(subsampling_y >= 0 && subsampling_y < 2); return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y]; } /* * Logic to generate the lookup tables: * * TX_SIZE txs = max_txsize_rect_lookup[bsize]; * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) * txs = sub_tx_size_map[txs]; * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; * const int bw_uint_log2 = mi_size_wide_log2[bsize]; * const int stride_log2 = bw_uint_log2 - tx_w_log2; */ static inline int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, }; static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, }; static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, }; const int index = ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + (blk_col >> tw_w_log2_table[bsize]); assert(index < INTER_TX_SIZE_BUF_LEN); return index; } #if CONFIG_INSPECTION /* * Here is the logic to generate the lookup tables: * * TX_SIZE txs = max_txsize_rect_lookup[bsize]; * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) * txs = sub_tx_size_map[txs]; * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; * const int bw_uint_log2 = mi_size_wide_log2[bsize]; * const int stride_log2 = bw_uint_log2 - tx_w_log2; */ static inline int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, int blk_col) { static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, }; static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, }; static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, }; const int index = ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + (blk_col >> tw_w_log2_table[bsize]); assert(index < TXK_TYPE_BUF_LEN); return index; } #endif // CONFIG_INSPECTION static inline void update_txk_array(MACROBLOCKD *const xd, int blk_row, int blk_col, TX_SIZE tx_size, TX_TYPE tx_type) { const int stride = xd->tx_type_map_stride; xd->tx_type_map[blk_row * stride + blk_col] = tx_type; const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; // The 16x16 unit is due to the constraint from tx_64x64 which sets the // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block // size, the constraint takes effect in 32x16 / 16x32 size too. To solve // the intricacy, cover all the 16x16 units inside a 64 level transform. 
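// In 4x4 (MI) units, tx_size_wide_unit[TX_64X64] is 16 and
// tx_size_wide_unit[TX_16X16] is 4, so the loops below stamp the transform
// type once for every 16x16 pixel region covered by a 64-wide or 64-tall
// transform block.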
if (txw == tx_size_wide_unit[TX_64X64] || txh == tx_size_high_unit[TX_64X64]) { const int tx_unit = tx_size_wide_unit[TX_16X16]; for (int idy = 0; idy < txh; idy += tx_unit) { for (int idx = 0; idx < txw; idx += tx_unit) { xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; } } } } static inline TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, PLANE_TYPE plane_type, int blk_row, int blk_col, TX_SIZE tx_size, int reduced_tx_set) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { return DCT_DCT; } TX_TYPE tx_type; if (plane_type == PLANE_TYPE_Y) { tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { if (is_inter_block(mbmi)) { // scale back to y plane's coordinate const struct macroblockd_plane *const pd = &xd->plane[plane_type]; blk_row <<= pd->subsampling_y; blk_col <<= pd->subsampling_x; tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } else { // In intra mode, uv planes don't share the same prediction mode as y // plane, so the tx_type should not be shared tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); } const TxSetType tx_set_type = av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; } assert(tx_type < TX_TYPES); assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set)][tx_type]); return tx_type; } void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, const int num_planes); /* * Logic to generate the lookup table: * * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; * int depth = 0; * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { * depth++; * tx_size = sub_tx_size_map[tx_size]; * } */ static inline int bsize_to_max_depth(BLOCK_SIZE bsize) { static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, }; return bsize_to_max_depth_table[bsize]; } /* * Logic to generate the lookup table: * * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; * assert(tx_size != TX_4X4); * int depth = 0; * while (tx_size != TX_4X4) { * depth++; * tx_size = sub_tx_size_map[tx_size]; * } * assert(depth < 10); */ static inline int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, }; const int depth = bsize_to_tx_size_depth_table[bsize]; assert(depth <= MAX_TX_CATS); return depth - 1; } static inline TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; TX_SIZE tx_size = max_tx_size; for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; return tx_size; } static inline TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { switch (tx_size) { case TX_64X64: case TX_64X32: case TX_32X64: return TX_32X32; case TX_64X16: return TX_32X16; case TX_16X64: return TX_16X32; default: return tx_size; } } static inline TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, subsampling_x, subsampling_y); assert(plane_bsize < BLOCK_SIZES_ALL); const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize]; return av1_get_adjusted_tx_size(uv_tx); } static inline TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; if 
(xd->lossless[mbmi->segment_id]) return TX_4X4; if (plane == 0) return mbmi->tx_size; const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); } void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, const int num_planes); void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes); typedef void (*foreach_transformed_block_visitor)(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void av1_set_entropy_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); #define MAX_INTERINTRA_SB_SQUARE 32 * 32 static inline int is_interintra_mode(const MB_MODE_INFO *mbmi) { return (mbmi->ref_frame[0] > INTRA_FRAME && mbmi->ref_frame[1] == INTRA_FRAME); } static inline int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); } static inline int is_interintra_allowed_mode(const PREDICTION_MODE mode) { return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); } static inline int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME); } static inline int is_interintra_allowed(const MB_MODE_INFO *mbmi) { return is_interintra_allowed_bsize(mbmi->bsize) && is_interintra_allowed_mode(mbmi->mode) && is_interintra_allowed_ref(mbmi->ref_frame); } static inline int is_interintra_allowed_bsize_group(int group) { int i; for (i = 0; i < BLOCK_SIZES_ALL; i++) { if (size_group_lookup[i] == group && is_interintra_allowed_bsize((BLOCK_SIZE)i)) { return 1; } } return 0; } static inline int is_interintra_pred(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME && mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi); } static inline int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane) { if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; if (plane == 0) return max_txsize; // luma return av1_get_adjusted_tx_size(max_txsize); // chroma } static inline int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; } static inline int is_motion_variation_allowed_compound( const MB_MODE_INFO *mbmi) { return !has_second_ref(mbmi); } // input: log2 of length, 0(4), 1(8), ... 
static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; static inline int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { return mbmi->overlappable_neighbors != 0; } static inline MOTION_MODE motion_mode_allowed( const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, int allow_warped_motion) { if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; if (xd->cur_frame_force_integer_mv == 0) { const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; } if (is_motion_variation_allowed_bsize(mbmi->bsize) && is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && is_motion_variation_allowed_compound(mbmi)) { assert(!has_second_ref(mbmi)); if (mbmi->num_proj_ref >= 1 && allow_warped_motion && !xd->cur_frame_force_integer_mv && !av1_is_scaled(xd->block_ref_scale_factors[0])) { return WARPED_CAUSAL; } return OBMC_CAUSAL; } return SIMPLE_TRANSLATION; } static inline int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { return (is_inter_block(mbmi)); } static inline int av1_allow_palette(int allow_screen_content_tools, BLOCK_SIZE sb_type) { assert(sb_type < BLOCK_SIZES_ALL); return allow_screen_content_tools && block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH && block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT && sb_type >= BLOCK_8X8; } // Returns sub-sampled dimensions of the given block. // The output values for 'rows_within_bounds' and 'cols_within_bounds' will // differ from 'height' and 'width' when part of the block is outside the // right // and/or bottom image boundary. static inline void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, const MACROBLOCKD *xd, int *width, int *height, int *rows_within_bounds, int *cols_within_bounds) { const int block_height = block_size_high[bsize]; const int block_width = block_size_wide[bsize]; const int block_rows = (xd->mb_to_bottom_edge >= 0) ? block_height : (xd->mb_to_bottom_edge >> 3) + block_height; const int block_cols = (xd->mb_to_right_edge >= 0) ? block_width : (xd->mb_to_right_edge >> 3) + block_width; const struct macroblockd_plane *const pd = &xd->plane[plane]; assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0)); assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); assert(block_width >= block_cols); assert(block_height >= block_rows); const int plane_block_width = block_width >> pd->subsampling_x; const int plane_block_height = block_height >> pd->subsampling_y; // Special handling for chroma sub8x8. const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; if (width) { *width = plane_block_width + 2 * is_chroma_sub8_x; assert(*width >= 0); } if (height) { *height = plane_block_height + 2 * is_chroma_sub8_y; assert(*height >= 0); } if (rows_within_bounds) { *rows_within_bounds = (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; assert(*rows_within_bounds >= 0); } if (cols_within_bounds) { *cols_within_bounds = (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; assert(*cols_within_bounds >= 0); } } /* clang-format off */ // Pointer to a three-dimensional array whose first dimension is PALETTE_SIZES. typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS] [CDF_SIZE(PALETTE_COLORS)]; // Pointer to a const three-dimensional array whose first dimension is // PALETTE_SIZES. 
typedef const int (*ColorCost)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS]; /* clang-format on */ typedef struct { int rows; int cols; int n_colors; int plane_width; int plane_height; uint8_t *color_map; MapCdf map_cdf; ColorCost color_cost; } Av1ColorMapParam; static inline int is_nontrans_global_motion(const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { int ref; // First check if all modes are GLOBALMV if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2) return 0; // Now check if all global motion is non translational for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; } return 1; } static inline PLANE_TYPE get_plane_type(int plane) { return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; } static inline int av1_get_max_eob(TX_SIZE tx_size) { if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { return 1024; } if (tx_size == TX_16X64 || tx_size == TX_64X16) { return 512; } return tx_size_2d[tx_size]; } /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_BLOCKD_H_ aom-3.12.1/av1/common/cdef.c000066400000000000000000000461011477627663500154270ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/enums.h" #include "av1/common/reconinter.h" #include "av1/common/thread_common.h" static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { if (!mbmi[c]->skip_txfm) return 0; } } return 1; } int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, cdef_list *dlist, BLOCK_SIZE bs) { MB_MODE_INFO **grid = mi_params->mi_grid_base; int maxc = mi_params->mi_cols - mi_col; int maxr = mi_params->mi_rows - mi_row; if (bs == BLOCK_128X128 || bs == BLOCK_128X64) maxc = AOMMIN(maxc, MI_SIZE_128X128); else maxc = AOMMIN(maxc, MI_SIZE_64X64); if (bs == BLOCK_128X128 || bs == BLOCK_64X128) maxr = AOMMIN(maxr, MI_SIZE_128X128); else maxr = AOMMIN(maxr, MI_SIZE_64X64); const int r_step = 2; // mi_size_high[BLOCK_8X8] const int c_step = 2; // mi_size_wide[BLOCK_8X8] const int r_shift = 1; const int c_shift = 1; int count = 0; for (int r = 0; r < maxr; r += r_step) { for (int c = 0; c < maxc; c += c_step) { if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, mi_params->mi_stride)) { dlist[count].by = r >> r_shift; dlist[count].bx = c >> c_shift; count++; } } } return count; } void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } #if CONFIG_AV1_HIGHBITDEPTH void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height) { for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } #endif // CONFIG_AV1_HIGHBITDEPTH void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { const uint8_t *base = &src[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } #if CONFIG_AV1_HIGHBITDEPTH void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } #endif // CONFIG_AV1_HIGHBITDEPTH void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { #if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params->use_highbitdepth) { av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset, sstride, vsize, hsize); return; } #else (void)cm; #endif // CONFIG_AV1_HIGHBITDEPTH av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset, sstride, vsize, hsize); } static inline void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h) { for (int i = 0; i < v; i++) { for (int j 
= 0; j < h; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } // Prepares intermediate input buffer for CDEF. // Inputs: // cm: Pointer to common structure. // fb_info: Pointer to the CDEF block-level parameter structure. // colbuf: Left column buffer for CDEF. // cdef_left: Left block is filtered or not. // fbc, fbr: col and row index of a block. // plane: plane index Y/CB/CR. // Returns: // Nothing will be returned. static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, uint16_t **const colbuf, const int cdef_left, int fbc, int fbr, int plane) { const CommonModeInfoParams *const mi_params = &cm->mi_params; uint16_t *src = fb_info->src; const int luma_stride = ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4); const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; int cstart = 0; if (!cdef_left) cstart = -CDEF_HBORDER; int rend, cend; const int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); const int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); const int hsize = nhb << fb_info->mi_wide_l2; const int vsize = nvb << fb_info->mi_high_l2; const uint16_t *top_linebuf = fb_info->top_linebuf[plane]; const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane]; const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE; const int stride = luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x); if (fbc == nhfb - 1) cend = hsize; else cend = hsize + CDEF_HBORDER; if (fbr == nvfb - 1) rend = vsize; else rend = vsize + CDEF_VBORDER; /* Copy in the pixels we need from the current superblock for deringing.*/ av1_cdef_copy_sb8_16( cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart, fb_info->dst_stride, vsize, cend - cstart); /* Copy in the pixels we need for the current superblock from bottom buffer.*/ if (fbr < nvfb - 1) { copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); } else { fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, CDEF_VERY_LARGE); } if (fbr < nvfb - 1 && fbc > 0) { copy_rect(&src[bot_offset], CDEF_BSTRIDE, &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (fbr < nvfb - 1 && fbc < nhfb - 1) { copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } /* Copy in the pixels we need from the current superblock from top buffer.*/ if (fbr > 0) { copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); } else { fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, CDEF_VERY_LARGE); } if (fbr > 0 && fbc > 0) { copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (fbr > 0 && fbc < nhfb - 1) { copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(&src[hsize + CDEF_HBORDER], 
CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (cdef_left) { /* If we deringed the superblock on the left then we need to copy in saved pixels. */ copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER, rend + CDEF_VBORDER, CDEF_HBORDER); } /* Saving pixels in case we need to dering the superblock on the right. */ copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, rend + CDEF_VBORDER, CDEF_HBORDER); if (fb_info->frame_boundary[LEFT]) { fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (fb_info->frame_boundary[RIGHT]) { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } } static inline void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { ptrdiff_t offset = (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset; if (use_highbitdepth) { av1_cdef_filter_fb( NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride, &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, fb_info->dlist, fb_info->cdef_count, fb_info->level, fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); } else { av1_cdef_filter_fb( fb_info->dst + offset, NULL, fb_info->dst_stride, &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, fb_info->dlist, fb_info->cdef_count, fb_info->level, fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); } } // Initializes block-level parameters for CDEF. static inline void cdef_init_fb_col(const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, int *level, int *sec_strength, int fbc, int fbr, int plane) { const PLANE_TYPE plane_type = get_plane_type(plane); fb_info->level = level[plane_type]; fb_info->sec_strength = sec_strength[plane_type]; fb_info->dst = xd->plane[plane].dst.buf; fb_info->dst_stride = xd->plane[plane].dst.stride; fb_info->xdec = xd->plane[plane].subsampling_x; fb_info->ydec = xd->plane[plane].subsampling_y; fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x; fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2; fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2; } static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const colbuf, int *cdef_left, int fbc, int fbr) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mbmi_cdef_strength = mi_params ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc] ->cdef_strength; const int num_planes = av1_num_planes(cm); int is_zero_level[PLANE_TYPES] = { 1, 1 }; int level[PLANE_TYPES] = { 0 }; int sec_strength[PLANE_TYPES] = { 0 }; const CdefInfo *const cdef_info = &cm->cdef_info; if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc] == NULL || mbmi_cdef_strength == -1) { av1_zero_array(cdef_left, num_planes); return; } // Compute level and secondary strength for planes level[PLANE_TYPE_Y] = cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; sec_strength[PLANE_TYPE_Y] = cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3; is_zero_level[PLANE_TYPE_Y] = (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 
0); if (num_planes > 1) { level[PLANE_TYPE_UV] = cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; sec_strength[PLANE_TYPE_UV] = cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3; is_zero_level[PLANE_TYPE_UV] = (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0); } if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) { av1_zero_array(cdef_left, num_planes); return; } fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, fb_info->dlist, BLOCK_64X64); if (!fb_info->cdef_count) { av1_zero_array(cdef_left, num_planes); return; } for (int plane = 0; plane < num_planes; plane++) { // Do not skip cdef filtering for luma plane as filter direction is // computed based on luma. if (plane && is_zero_level[get_plane_type(plane)]) { cdef_left[plane] = 0; continue; } cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane); cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane); cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth); cdef_left[plane] = 1; } } // Initializes row-level parameters for CDEF frame. void av1_cdef_init_fb_row(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, struct AV1CdefSyncData *const cdef_sync, int fbr) { (void)cdef_sync; const int num_planes = av1_num_planes(cm); const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int luma_stride = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); const bool ping_pong = fbr & 1; // for the current filter block, it's top left corner mi structure (mi_tl) // is first accessed to check whether the top and left boundaries are // frame boundaries. Then bottom-left and top-right mi structures are // accessed to check whether the bottom and right boundaries // (respectively) are frame boundaries. // // Note that we can't just check the bottom-right mi structure - eg. if // we're at the right-hand edge of the frame but not the bottom, then // the bottom-right mi is NULL but the bottom-left is not. fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; if (fbr != nvfb - 1) fb_info->frame_boundary[BOTTOM] = (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; else fb_info->frame_boundary[BOTTOM] = 1; fb_info->src = src; fb_info->damping = cm->cdef_info.cdef_damping; fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); av1_zero(fb_info->dir); av1_zero(fb_info->var); for (int plane = 0; plane < num_planes; plane++) { const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; const int stride = luma_stride >> xd->plane[plane].subsampling_x; // here ping-pong buffers are maintained for top linebuf // to avoid linebuf over-write by consecutive row. 
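// With two CDEF_VBORDER-tall slots per plane, the row being filtered reads
// its top context from slot (!ping_pong), written while processing row
// (fbr - 1), and saves the last unfiltered lines of the current row into
// slot ping_pong for use by row (fbr + 1).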
uint16_t *const top_linebuf = &linebuf[plane][ping_pong * CDEF_VBORDER * stride]; fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride]; if (fbr != nvfb - 1) // top line buffer copy av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf, offset - CDEF_VBORDER, 0, xd->plane[plane].dst.stride, CDEF_VBORDER, stride); fb_info->top_linebuf[plane] = &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride]; if (fbr != nvfb - 1) // bottom line buffer copy av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride, xd->plane[plane].dst.buf, offset, 0, xd->plane[plane].dst.stride, CDEF_VBORDER, stride); } } void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, uint16_t **const linebuf, uint16_t **const colbuf, uint16_t *const src, int fbr, cdef_init_fb_row_t cdef_init_fb_row_fn, struct AV1CdefSyncData *const cdef_sync, struct aom_internal_error_info *error_info) { // TODO(aomedia:3276): Pass error_info to the low-level functions as required // in future to handle error propagation. (void)error_info; CdefBlockInfo fb_info; int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 }; const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr); #if CONFIG_MULTITHREAD if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) { pthread_mutex_lock(cdef_sync->mutex_); const bool cdef_mt_exit = cdef_sync->cdef_mt_exit; pthread_mutex_unlock(cdef_sync->mutex_); // Exit in case any worker has encountered an error. if (cdef_mt_exit) return; } #endif for (int fbc = 0; fbc < nhfb; fbc++) { fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; if (fbc != nhfb - 1) fb_info.frame_boundary[RIGHT] = (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0; else fb_info.frame_boundary[RIGHT] = 1; cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr); } } // Perform CDEF on input frame. // Inputs: // frame: Pointer to input frame buffer. // cm: Pointer to common structure. // xd: Pointer to common current coding block structure. // Returns: // Nothing will be returned. void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) { const int num_planes = av1_num_planes(cm); const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, num_planes); for (int fbr = 0; fbr < nvfb; fbr++) av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf, cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL, xd->error_info); } aom-3.12.1/av1/common/cdef.h000066400000000000000000000106521477627663500154360ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_CDEF_H_ #define AOM_AV1_COMMON_CDEF_H_ #define CDEF_STRENGTH_BITS 6 #define CDEF_PRI_STRENGTHS 16 #define CDEF_SEC_STRENGTHS 4 #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/cdef_block.h" enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); struct AV1CdefSyncData; /*!\brief Parameters related to CDEF Block */ typedef struct { uint16_t *src; /*!< CDEF intermediate buffer */ uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */ uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */ uint8_t *dst; /*!< CDEF destination buffer */ cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */ int xdec; /*!< Sub-sampling X */ int ydec; /*!< Sub-sampling X */ int mi_wide_l2; /*!< Pixels per mi unit in width */ int mi_high_l2; /*!< Pixels per mi unit in height */ int frame_boundary[BOUNDARIES]; /*!< frame boundaries */ int damping; /*!< CDEF damping factor */ int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */ int level; /*!< CDEF filtering level */ int sec_strength; /*!< CDEF secondary strength */ int cdef_count; /*!< Number of CDEF sub-blocks in superblock */ int dir[CDEF_NBLOCKS] [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/ int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */ int dst_stride; /*!< CDEF destination buffer stride */ int coffset; /*!< current superblock offset in a row */ int roffset; /*!< current row offset */ } CdefBlockInfo; static inline int sign(int i) { return i < 0 ? -1 : 1; } static inline int constrain(int diff, int threshold, int damping) { if (!threshold) return 0; const int shift = AOMMAX(0, damping - get_msb(threshold)); return sign(diff) * AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift))); } #ifdef __cplusplus extern "C" { #endif int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, cdef_list *dlist, BLOCK_SIZE bsize); typedef void (*cdef_init_fb_row_t)( const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, struct AV1CdefSyncData *const cdef_sync, int fbr); /*!\brief Function for applying CDEF to a frame * * \ingroup in_loop_cdef * This function applies CDEF to a frame. * * \param[in, out] frame Compressed frame buffer * \param[in, out] cm Pointer to top level common structure * \param[in] xd Pointer to common current coding block structure * \param[in] cdef_init_fb_row_fn Function Pointer * * \remark Nothing is returned. Instead, the filtered frame is output in * \c frame. 
*/ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn); void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, uint16_t **const linebuf, uint16_t **const colbuf, uint16_t *const src, int fbr, cdef_init_fb_row_t cdef_init_fb_row_fn, struct AV1CdefSyncData *const cdef_sync, struct aom_internal_error_info *error_info); void av1_cdef_init_fb_row(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, struct AV1CdefSyncData *const cdef_sync, int fbr); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_CDEF_H_ aom-3.12.1/av1/common/cdef_block.c000066400000000000000000000437271477627663500166140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "av1/common/cdef.h" /* This is Cdef_Directions (section 7.15.3) with 2 padding entries at the beginning and end of the table. The cdef direction range is [0, 7] and the first index is offset +/-2. This removes the need to constrain the first index to the same range using e.g., & 7. */ DECLARE_ALIGNED(16, static const int, cdef_directions_padded[12][2]) = { /* Padding: cdef_directions[6] */ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, /* Padding: cdef_directions[7] */ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, /* Begin cdef_directions */ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 }, { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 }, { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 }, { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 }, { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, /* End cdef_directions */ /* Padding: cdef_directions[0] */ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, /* Padding: cdef_directions[1] */ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, }; const int (*const cdef_directions)[2] = cdef_directions_padded + 2; /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. The search minimizes the weighted variance along all the lines in a particular direction, i.e. the squared error between the input and a "predicted" block where each pixel is replaced by the average along a line in a particular direction. Since each direction have the same sum(x^2) term, that term is never computed. See Section 2, step 2, of: http://jmvalin.ca/notes/intra_paint.pdf */ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { int i; int32_t cost[8] = { 0 }; int partial[8][15] = { { 0 } }; int32_t best_cost = 0; int best_dir = 0; /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n. The output is then 840 times larger, but we don't care for finding the max. 
*/ static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 }; for (i = 0; i < 8; i++) { int j; for (j = 0; j < 8; j++) { int x; /* We subtract 128 here to reduce the maximum range of the squared partial sums. */ x = (img[i * stride + j] >> coeff_shift) - 128; partial[0][i + j] += x; partial[1][i + j / 2] += x; partial[2][i] += x; partial[3][3 + i - j / 2] += x; partial[4][7 + i - j] += x; partial[5][3 - i / 2 + j] += x; partial[6][j] += x; partial[7][i / 2 + j] += x; } } for (i = 0; i < 8; i++) { cost[2] += partial[2][i] * partial[2][i]; cost[6] += partial[6][i] * partial[6][i]; } cost[2] *= div_table[8]; cost[6] *= div_table[8]; for (i = 0; i < 7; i++) { cost[0] += (partial[0][i] * partial[0][i] + partial[0][14 - i] * partial[0][14 - i]) * div_table[i + 1]; cost[4] += (partial[4][i] * partial[4][i] + partial[4][14 - i] * partial[4][14 - i]) * div_table[i + 1]; } cost[0] += partial[0][7] * partial[0][7] * div_table[8]; cost[4] += partial[4][7] * partial[4][7] * div_table[8]; for (i = 1; i < 8; i += 2) { int j; for (j = 0; j < 4 + 1; j++) { cost[i] += partial[i][3 + j] * partial[i][3 + j]; } cost[i] *= div_table[8]; for (j = 0; j < 4 - 1; j++) { cost[i] += (partial[i][j] * partial[i][j] + partial[i][10 - j] * partial[i][10 - j]) * div_table[2 * j + 2]; } } for (i = 0; i < 8; i++) { if (cost[i] > best_cost) { best_cost = cost[i]; best_dir = i; } } /* Difference between the optimal variance and the variance along the orthogonal direction. Again, the sum(x^2) terms cancel out. */ *var = best_cost - cost[(best_dir + 4) & 7]; /* We'd normally divide by 840, but dividing by 1024 is close enough for what we're going to do with this. */ *var >>= 10; return best_dir; } void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var1, int32_t *var2, int coeff_shift, int *out1, int *out2) { *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift); *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift); } const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; const int cdef_sec_taps[2] = { 2, 1 }; /* Smooth in the direction detected. 
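   The primary taps are applied at two offsets on each side of the centre
   pixel along the detected direction, the secondary taps along the two
   directions 45 degrees away from it. Each tap difference is passed through
   constrain() before being weighted and summed, and when both primary and
   secondary filtering are enabled the result is clamped to the minimum and
   maximum of the surrounding taps (CDEF_VERY_LARGE padding pixels are
   excluded from the maximum).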
*/ static void cdef_filter_block_internal( uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height, int enable_primary, int enable_secondary) { const int clipping_required = (enable_primary && enable_secondary); int i, j, k; const int s = CDEF_BSTRIDE; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; for (i = 0; i < block_height; i++) { for (j = 0; j < block_width; j++) { int16_t sum = 0; int16_t y; int16_t x = in[i * s + j]; int max = x; int min = x; for (k = 0; k < 2; k++) { if (enable_primary) { int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); if (clipping_required) { if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); min = AOMMIN(p0, min); min = AOMMIN(p1, min); } } if (enable_secondary) { int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]]; int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]]; int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]]; int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]]; if (clipping_required) { if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); min = AOMMIN(s0, min); min = AOMMIN(s1, min); min = AOMMIN(s2, min); min = AOMMIN(s3, min); } sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); } } y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4)); if (clipping_required) { y = clamp(y, min, max); } if (dst8) dst8[i * dstride + j] = (uint8_t)y; else dst16[i * dstride + j] = (uint16_t)y; } } } void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int 
coeff_shift, int block_width, int block_height) { cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/0, /*enable_secondary=*/0); } void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_width, block_height, /*enable_primary=*/0, /*enable_secondary=*/0); } /* Compute the primary filter strength for an 8x8 block based on the directional variance difference. A high variance difference means that we have a highly directional pattern (e.g. a high contrast edge), so we can apply more deringing. A low variance means that we either have a low contrast edge, or a non-directional texture, so we want to be careful not to blur. */ static inline int adjust_strength(int strength, int32_t var) { const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; /* We use the variance of 8x8 blocks to adjust the strength. */ return var ? (strength * (4 + i) + 8) >> 4 : 0; } static inline void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int cdef_count, int coeff_shift, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { int bi; // Find direction of two 8x8 blocks together. for (bi = 0; bi < cdef_count - 1; bi += 2) { const int by = dlist[bi].by; const int bx = dlist[bi].bx; const int by2 = dlist[bi + 1].by; const int bx2 = dlist[bi + 1].bx; const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx; const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2; cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx], &var[by2][bx2], coeff_shift, &dir[by][bx], &dir[by2][bx2]); } // Process remaining 8x8 blocks here. One 8x8 at a time. 
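  // For example, with cdef_count == 5 the dual loop above covers blocks 0..3
  // in pairs and leaves bi == 4, so the remaining odd block is handled by the
  // single-block cdef_find_dir() call below.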
if (cdef_count % 2) { const int by = dlist[bi].by; const int bx = dlist[bi].bx; dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], CDEF_BSTRIDE, &var[by][bx], coeff_shift); } } void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, cdef_list *dlist, int cdef_count, int level, int sec_strength, int damping, int coeff_shift) { int bi; int bx; int by; const int pri_strength = level << coeff_shift; sec_strength <<= coeff_shift; damping += coeff_shift - (pli != AOM_PLANE_Y); const int bw_log2 = 3 - xdec; const int bh_log2 = 3 - ydec; if (dirinit && pri_strength == 0 && sec_strength == 0) { // If we're here, both primary and secondary strengths are 0, and // we still haven't written anything to y[] yet, so we just copy // the input to y[]. This is necessary only for av1_cdef_search() // and only av1_cdef_search() sets dirinit. for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; // TODO(stemidts/jmvalin): SIMD optimisations for (int iy = 0; iy < 1 << bh_log2; iy++) { memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], ((size_t)1 << bw_log2) * sizeof(*dst16)); } } return; } if (pli == 0) { if (!dirinit || !*dirinit) { aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir); if (dirinit) *dirinit = 1; } } if (pli == 1 && xdec != ydec) { for (bi = 0; bi < cdef_count; bi++) { static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; by = dlist[bi].by; bx = dlist[bi].bx; dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; } } if (dst8) { const int block_width = 8 >> xdec; const int block_height = 8 >> ydec; /* * strength_index == 0 : enable_primary = 1, enable_secondary = 1 * strength_index == 1 : enable_primary = 1, enable_secondary = 0 * strength_index == 2 : enable_primary = 0, enable_secondary = 1 * strength_index == 3 : enable_primary = 0, enable_secondary = 0 */ const cdef_filter_block_func cdef_filter_fn[4] = { cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3 }; for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; const int t = (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); const int strength_index = (sec_strength == 0) | ((t == 0) << 1); cdef_filter_fn[strength_index]( &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride, &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, coeff_shift, block_width, block_height); } } else { const int block_width = 8 >> xdec; const int block_height = 8 >> ydec; /* * strength_index == 0 : enable_primary = 1, enable_secondary = 1 * strength_index == 1 : enable_primary = 1, enable_secondary = 0 * strength_index == 2 : enable_primary = 0, enable_secondary = 1 * strength_index == 3 : enable_primary = 0, enable_secondary = 0 */ const cdef_filter_block_func cdef_filter_fn[4] = { cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3 }; for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; const int t = (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); const int strength_index = (sec_strength == 0) | ((t == 0) << 1); cdef_filter_fn[strength_index]( &dst16[dirinit ? bi << (bw_log2 + bh_log2) : (by << bh_log2) * dstride + (bx << bw_log2)], dirinit ? 
1 << bw_log2 : dstride, &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, coeff_shift, block_width, block_height); } } } aom-3.12.1/av1/common/cdef_block.h000066400000000000000000000047371477627663500166170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ #define AOM_AV1_COMMON_CDEF_BLOCK_H_ #include "aom_dsp/odintrin.h" #define CDEF_BLOCKSIZE 64 #define CDEF_BLOCKSIZE_LOG2 6 #define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) /* We need to buffer two vertical lines. */ #define CDEF_VBORDER (2) /* We only need to buffer three horizontal pixels too, but let's align to 16 bytes (8 x 16 bits) to make vectorization easier. */ #define CDEF_HBORDER (8) #define CDEF_BSTRIDE \ ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) #define CDEF_VERY_LARGE (0x4000) #define CDEF_INBUF_SIZE \ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) extern const int cdef_pri_taps[2][2]; extern const int cdef_sec_taps[2]; extern const int (*const cdef_directions)[2]; typedef struct { uint8_t by; uint8_t bx; } cdef_list; typedef void (*cdef_filter_block_func)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height); void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, cdef_list *dlist, int cdef_count, int level, int sec_strength, int damping, int coeff_shift); static inline void fill_rect(uint16_t *dst, int dstride, int v, int h, uint16_t x) { for (int i = 0; i < v; i++) { for (int j = 0; j < h; j++) { dst[i * dstride + j] = x; } } } #endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ aom-3.12.1/av1/common/cdef_block_simd.h000066400000000000000000001115021477627663500176200ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ #define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/cdef_block.h" /* partial A is a 16-bit vector of the form: [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: [0 y1 y2 y3 y4 y5 y6 y7]. This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 and const2. 
*/ static inline v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, v128 const2) { v128 tmp; /* Reverse partial B. */ partialb = v128_shuffle_8( partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); /* Interleave the x and y values of identical indices and pair x8 with 0. */ tmp = partiala; partiala = v128_ziplo_16(partialb, partiala); partialb = v128_ziphi_16(partialb, tmp); /* Square and add the corresponding x and y values. */ partiala = v128_madd_s16(partiala, partiala); partialb = v128_madd_s16(partialb, partialb); /* Multiply by constant. */ partiala = v128_mullo_s32(partiala, const1); partialb = v128_mullo_s32(partialb, const2); /* Sum all results. */ partiala = v128_add_32(partiala, partialb); return partiala; } static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { v128 t0, t1, t2, t3; t0 = v128_ziplo_32(x1, x0); t1 = v128_ziplo_32(x3, x2); t2 = v128_ziphi_32(x1, x0); t3 = v128_ziphi_32(x3, x2); x0 = v128_ziplo_64(t1, t0); x1 = v128_ziphi_64(t1, t0); x2 = v128_ziplo_64(t3, t2); x3 = v128_ziphi_64(t3, t2); return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); } /* Computes cost for directions 0, 5, 6 and 7. We can call this function again to compute the remaining directions. */ static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; v128 partial6; v128 tmp; /* Partial sums for lines 0 and 1. */ partial4a = v128_shl_n_byte(lines[0], 14); partial4b = v128_shr_n_byte(lines[0], 2); partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); tmp = v128_add_16(lines[0], lines[1]); partial5a = v128_shl_n_byte(tmp, 10); partial5b = v128_shr_n_byte(tmp, 6); partial7a = v128_shl_n_byte(tmp, 4); partial7b = v128_shr_n_byte(tmp, 12); partial6 = tmp; /* Partial sums for lines 2 and 3. */ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); tmp = v128_add_16(lines[2], lines[3]); partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); partial6 = v128_add_16(partial6, tmp); /* Partial sums for lines 4 and 5. */ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); tmp = v128_add_16(lines[4], lines[5]); partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); partial6 = v128_add_16(partial6, tmp); /* Partial sums for lines 6 and 7. 
*/ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); partial4a = v128_add_16(partial4a, lines[7]); tmp = v128_add_16(lines[6], lines[7]); partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); partial6 = v128_add_16(partial6, tmp); /* Compute costs in terms of partial sums. */ partial4a = fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), v128_from_32(105, 120, 140, 168)); partial7a = fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), v128_from_32(105, 105, 105, 140)); partial5a = fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), v128_from_32(105, 105, 105, 140)); partial6 = v128_madd_s16(partial6, partial6); partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); partial4a = hsum4(partial4a, partial5a, partial6, partial7a); v128_store_unaligned(tmp_cost1, partial4a); return partial4a; } /* transpose and reverse the order of the lines -- equivalent to a 90-degree counter-clockwise rotation of the pixels. */ static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) { const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); res[7] = v128_ziplo_64(tr1_1, tr1_0); res[6] = v128_ziphi_64(tr1_1, tr1_0); res[5] = v128_ziplo_64(tr1_3, tr1_2); res[4] = v128_ziphi_64(tr1_3, tr1_2); res[3] = v128_ziplo_64(tr1_5, tr1_4); res[2] = v128_ziphi_64(tr1_5, tr1_4); res[1] = v128_ziplo_64(tr1_7, tr1_6); res[0] = v128_ziphi_64(tr1_7, tr1_6); } int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { int i; int32_t cost[8]; int32_t best_cost = 0; int best_dir = 0; v128 lines[8]; for (i = 0; i < 8; i++) { lines[i] = v128_load_unaligned(&img[i * stride]); lines[i] = v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); } /* Compute "mostly vertical" directions. */ v128 dir47 = compute_directions(lines, cost + 4); array_reverse_transpose_8x8(lines, lines); /* Compute "mostly horizontal" directions. */ v128 dir03 = compute_directions(lines, cost); v128 max = v128_max_s32(dir03, dir47); max = v128_max_s32(max, v128_align(max, max, 8)); max = v128_max_s32(max, v128_align(max, max, 4)); best_cost = v128_low_u32(max); v128 t = v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros /* Difference between the optimal variance and the variance along the orthogonal direction. Again, the sum(x^2) terms cancel out. 
*/ *var = best_cost - cost[(best_dir + 4) & 7]; /* We'd normally divide by 840, but dividing by 1024 is close enough for what we're going to do with this. */ *var >>= 10; return best_dir; } // Work around compiler out of memory issues with Win32 builds. This issue has // been observed with Visual Studio 2017, 2019, and 2022 (version 17.10.3). #if defined(_MSC_VER) && defined(_M_IX86) #define CDEF_INLINE static inline #else #define CDEF_INLINE SIMD_INLINE #endif // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, unsigned int adjdamp) { v256 diff = v256_sub_16(a, b); const v256 sign = v256_shr_n_s16(diff, 15); diff = v256_abs_s16(diff); const v256 s = v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); } SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max, v256 cdef_large_value_mask) { if (is_lowbd) { v256 max_u8; max_u8 = tap[0]; max_u8 = v256_max_u8(max_u8, tap[1]); max_u8 = v256_max_u8(max_u8, tap[2]); max_u8 = v256_max_u8(max_u8, tap[3]); /* The source is 16 bits, however, we only really care about the lower 8 bits. The upper 8 bits contain the "large" flag. After the final primary max has been calculated, zero out the upper 8 bits. Use this to find the "16 bit" max. */ max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); } else { /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); } return max; } SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max, v256 cdef_large_value_mask) { if (is_lowbd) { v256 max_u8; max_u8 = tap[0]; max_u8 = v256_max_u8(max_u8, tap[1]); max_u8 = v256_max_u8(max_u8, tap[2]); max_u8 = v256_max_u8(max_u8, tap[3]); max_u8 = v256_max_u8(max_u8, tap[4]); max_u8 = v256_max_u8(max_u8, tap[5]); max_u8 = v256_max_u8(max_u8, tap[6]); max_u8 = v256_max_u8(max_u8, tap[7]); /* The source is 16 bits, however, we only really care about the lower 8 bits. The upper 8 bits contain the "large" flag. After the final primary max has been calculated, zero out the upper 8 bits. Use this to find the "16 bit" max. */ max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); } else { /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); } return max; } // MSVC takes far too much time optimizing these. 
// https://bugs.chromium.org/p/aomedia/issues/detail?id=3395 #if defined(_MSC_VER) && !defined(__clang__) #pragma optimize("", off) #endif CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int height, int enable_primary, int enable_secondary) { uint8_t *dst8 = (uint8_t *)dest; uint16_t *dst16 = (uint16_t *)dest; const int clipping_required = enable_primary && enable_secondary; v256 p0, p1, p2, p3; v256 sum, row, res; v256 max, min; const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; int i; if (enable_primary && pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); if (enable_secondary && sec_strength) sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); for (i = 0; i < height; i += 4) { sum = v256_zero(); row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); max = min = row; if (enable_primary) { v256 tap[4]; // Primary near taps tap[0] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); p0 = constrain16(tap[0], row, pri_strength, pri_damping); tap[1] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); p1 = constrain16(tap[1], row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) sum = v256_add_16( sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); // Primary far taps tap[2] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); p0 = constrain16(tap[2], row, pri_strength, pri_damping); tap[3] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); p1 = constrain16(tap[3], row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) sum = v256_add_16( sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); if (clipping_required) { max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); min = v256_min_s16(min, tap[0]); min = v256_min_s16(min, tap[1]); min = v256_min_s16(min, tap[2]); min = v256_min_s16(min, tap[3]); } } if (enable_secondary) { v256 tap[8]; // Secondary near taps tap[0] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), v64_load_unaligned(&in[(i + 3) * 
CDEF_BSTRIDE + s1o1])); p0 = constrain16(tap[0], row, sec_strength, sec_damping); tap[1] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); p1 = constrain16(tap[1], row, sec_strength, sec_damping); tap[2] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); p2 = constrain16(tap[2], row, sec_strength, sec_damping); tap[3] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); p3 = constrain16(tap[3], row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), v256_add_16(v256_add_16(p0, p1), v256_add_16(p2, p3)))); // Secondary far taps tap[4] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); p0 = constrain16(tap[4], row, sec_strength, sec_damping); tap[5] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); p1 = constrain16(tap[5], row, sec_strength, sec_damping); tap[6] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); p2 = constrain16(tap[6], row, sec_strength, sec_damping); tap[7] = v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); p3 = constrain16(tap[7], row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), v256_add_16(v256_add_16(p0, p1), v256_add_16(p2, p3)))); if (clipping_required) { max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); min = v256_min_s16(min, tap[0]); min = v256_min_s16(min, tap[1]); min = v256_min_s16(min, tap[2]); min = v256_min_s16(min, tap[3]); min = v256_min_s16(min, tap[4]); min = v256_min_s16(min, tap[5]); min = v256_min_s16(min, tap[6]); min = v256_min_s16(min, tap[7]); } } // res = row + ((sum - (sum < 0) + 8) >> 4) sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); res = v256_add_16(sum, v256_dup_16(8)); res = v256_shr_n_s16(res, 4); res = v256_add_16(row, res); if (clipping_required) { res = v256_min_s16(v256_max_s16(res, min), max); } if (is_lowbd) { const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); u32_store_aligned(&dst8[(i + 0) * dstride], v64_high_u32(v128_high_v64(res_128))); u32_store_aligned(&dst8[(i + 1) * dstride], v64_low_u32(v128_high_v64(res_128))); u32_store_aligned(&dst8[(i + 2) * dstride], v64_high_u32(v128_low_v64(res_128))); 
u32_store_aligned(&dst8[(i + 3) * dstride], v64_low_u32(v128_low_v64(res_128))); } else { v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(v256_high_v128(res))); v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(v256_high_v128(res))); v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(v256_low_v128(res))); v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(v256_low_v128(res))); } } } CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int height, int enable_primary, int enable_secondary) { uint8_t *dst8 = (uint8_t *)dest; uint16_t *dst16 = (uint16_t *)dest; const int clipping_required = enable_primary && enable_secondary; int i; v256 sum, p0, p1, p2, p3, row, res; const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); v256 max, min; const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; if (enable_primary && pri_strength) pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); if (enable_secondary && sec_strength) sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); for (i = 0; i < height; i += 2) { v256 tap[8]; sum = v256_zero(); row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); min = max = row; if (enable_primary) { // Primary near taps tap[0] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); tap[1] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); p0 = constrain16(tap[0], row, pri_strength, pri_damping); p1 = constrain16(tap[1], row, pri_strength, pri_damping); // sum += pri_taps[0] * (p0 + p1) sum = v256_add_16( sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); // Primary far taps tap[2] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); tap[3] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); p0 = constrain16(tap[2], row, pri_strength, pri_damping); p1 = constrain16(tap[3], row, pri_strength, pri_damping); // sum += pri_taps[1] * (p0 + p1) sum = v256_add_16( sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); if (clipping_required) { max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); min = v256_min_s16(min, tap[0]); min = v256_min_s16(min, tap[1]); min = v256_min_s16(min, tap[2]); min = v256_min_s16(min, tap[3]); } // End primary } if (enable_secondary) { // Secondary near taps tap[0] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); tap[1] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); tap[2] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); tap[3] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), 
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); p0 = constrain16(tap[0], row, sec_strength, sec_damping); p1 = constrain16(tap[1], row, sec_strength, sec_damping); p2 = constrain16(tap[2], row, sec_strength, sec_damping); p3 = constrain16(tap[3], row, sec_strength, sec_damping); // sum += sec_taps[0] * (p0 + p1 + p2 + p3) sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), v256_add_16(v256_add_16(p0, p1), v256_add_16(p2, p3)))); // Secondary far taps tap[4] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); tap[5] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); tap[6] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); tap[7] = v256_from_v128( v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); p0 = constrain16(tap[4], row, sec_strength, sec_damping); p1 = constrain16(tap[5], row, sec_strength, sec_damping); p2 = constrain16(tap[6], row, sec_strength, sec_damping); p3 = constrain16(tap[7], row, sec_strength, sec_damping); // sum += sec_taps[1] * (p0 + p1 + p2 + p3) sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), v256_add_16(v256_add_16(p0, p1), v256_add_16(p2, p3)))); if (clipping_required) { max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); min = v256_min_s16(min, tap[0]); min = v256_min_s16(min, tap[1]); min = v256_min_s16(min, tap[2]); min = v256_min_s16(min, tap[3]); min = v256_min_s16(min, tap[4]); min = v256_min_s16(min, tap[5]); min = v256_min_s16(min, tap[6]); min = v256_min_s16(min, tap[7]); } // End secondary } // res = row + ((sum - (sum < 0) + 8) >> 4) sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); res = v256_add_16(sum, v256_dup_16(8)); res = v256_shr_n_s16(res, 4); res = v256_add_16(row, res); if (clipping_required) { res = v256_min_s16(v256_max_s16(res, min), max); } if (is_lowbd) { const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); } else { v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); } } } #if defined(_MSC_VER) && !defined(__clang__) #pragma optimize("", on) #endif SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, const uint16_t *in, int height) { uint8_t *dst8 = (uint8_t *)dest; uint16_t *dst16 = (uint16_t *)dest; int i; for (i = 0; i < height; i += 4) { const v128 row0 = v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); const v128 row1 = v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); if (is_lowbd) { /* Note: v128_pack_s16_u8(). 
The parameter order is swapped internally */ const v128 res_128 = v128_pack_s16_u8(row1, row0); u32_store_aligned(&dst8[(i + 0) * dstride], v64_high_u32(v128_low_v64(res_128))); u32_store_aligned(&dst8[(i + 1) * dstride], v64_low_u32(v128_low_v64(res_128))); u32_store_aligned(&dst8[(i + 2) * dstride], v64_high_u32(v128_high_v64(res_128))); u32_store_aligned(&dst8[(i + 3) * dstride], v64_low_u32(v128_high_v64(res_128))); } else { v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0)); v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0)); v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1)); v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1)); } } } SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, const uint16_t *in, int height) { uint8_t *dst8 = (uint8_t *)dest; uint16_t *dst16 = (uint16_t *)dest; int i; for (i = 0; i < height; i += 2) { const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); if (is_lowbd) { /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ const v128 res_128 = v128_pack_s16_u8(row1, row0); v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); } else { v128_store_unaligned(&dst16[i * dstride], row0); v128_store_unaligned(&dst16[(i + 1) * dstride], row1); } } } void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } else { filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } } void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } else { filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } } void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } else { filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } } void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; 
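  /* Variant 3 disables both the primary and the secondary filter (both
     strengths are zero for this block), so all filtering parameters are
     unused and the block is simply copied to the destination below. */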
(void)pri_damping; (void)sec_damping; (void)coeff_shift; (void)block_width; if (block_width == 8) { copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); } else { copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); } } void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } else { filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/1); } } void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } else { filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/1, /*enable_secondary=*/0); } } void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { if (block_width == 8) { filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } else { filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, sec_strength, dir, pri_damping, sec_damping, coeff_shift, block_height, /*enable_primary=*/0, /*enable_secondary=*/1); } } void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; (void)pri_damping; (void)sec_damping; (void)coeff_shift; (void)block_width; if (block_width == 8) { copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); } else { copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); } } #if CONFIG_AV1_HIGHBITDEPTH void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height) { int i, j; for (i = 0; i < height; i++) { for (j = 0; j < (width & ~0x7); j += 8) { v128 row = v128_load_unaligned(&src[i * sstride + j]); v128_store_unaligned(&dst[i * dstride + j], row); } for (; j < width; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } #endif // CONFIG_AV1_HIGHBITDEPTH #undef CDEF_INLINE #endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ aom-3.12.1/av1/common/cfl.c000066400000000000000000000375061477627663500153030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/common_data.h" #include "config/av1_rtcd.h" void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); cfl->subsampling_x = seq_params->subsampling_x; cfl->subsampling_y = seq_params->subsampling_y; cfl->are_parameters_computed = 0; cfl->store_y = 0; // The DC_PRED cache is disabled by default and is only enabled in // cfl_rd_pick_alpha clear_cfl_dc_pred_cache_flags(cfl); } void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, CFL_PRED_TYPE pred_plane, int width) { assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); if (is_cur_buf_hbd(xd)) { uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); return; } memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); } static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, int dst_stride, int width, int height) { for (int j = 0; j < height; j++) { memcpy(dst, dc_pred_cache, width); dst += dst_stride; } } static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, int dst_stride, int width, int height) { const size_t num_bytes = width << 1; for (int j = 0; j < height; j++) { memcpy(dst, dc_pred_cache, num_bytes); dst += dst_stride; } } void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); assert(height <= CFL_BUF_LINE); if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, width, height); return; } cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, width, height); } // Due to frame boundary issues, it is possible that the total area covered by // chroma exceeds that of luma. When this happens, we fill the missing pixels by // repeating the last columns and/or rows. 
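// Illustrative example: at the right edge of the frame a 16x16 chroma
// transform may be covered by, say, only 14 columns of stored (subsampled)
// luma; cfl_pad() then repeats the last stored column into the 2 missing
// columns, and does the analogous fill row-wise for missing rows, before the
// block average is subtracted.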
static inline void cfl_pad(CFL_CTX *cfl, int width, int height) { const int diff_width = width - cfl->buf_width; const int diff_height = height - cfl->buf_height; if (diff_width > 0) { const int min_height = height - diff_height; uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); for (int j = 0; j < min_height; j++) { const uint16_t last_pixel = recon_buf_q3[-1]; assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); for (int i = 0; i < diff_width; i++) { recon_buf_q3[i] = last_pixel; } recon_buf_q3 += CFL_BUF_LINE; } cfl->buf_width = width; } if (diff_height > 0) { uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); for (int j = 0; j < diff_height; j++) { const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); for (int i = 0; i < width; i++) { recon_buf_q3[i] = last_row_q3[i]; } recon_buf_q3 += CFL_BUF_LINE; } cfl->buf_height = height; } } static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, int height, int round_offset, int num_pel_log2) { int sum = round_offset; const uint16_t *recon = src; for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { sum += recon[i]; } recon += CFL_BUF_LINE; } const int avg = sum >> num_pel_log2; for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = src[i] - avg; } src += CFL_BUF_LINE; dst += CFL_BUF_LINE; } } CFL_SUB_AVG_FN(c) static inline int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, CFL_PRED_TYPE pred_type) { const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) : CFL_SIGN_V(joint_sign); if (alpha_sign == CFL_SIGN_ZERO) return 0; const int abs_alpha_q3 = (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx); return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; } static inline void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); } dst += dst_stride; ac_buf_q3 += CFL_BUF_LINE; } } CFL_PREDICT_FN(c, lbd) #if CONFIG_AV1_HIGHBITDEPTH static inline void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bit_depth, int width, int height) { for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { dst[i] = clip_pixel_highbd( get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); } dst += dst_stride; ac_buf_q3 += CFL_BUF_LINE; } } CFL_PREDICT_FN(c, hbd) #endif static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { CFL_CTX *const cfl = &xd->cfl; // Do not call cfl_compute_parameters multiple time on the same values. 
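  // The "parameters" are the padded, DC-subtracted luma values written into
  // ac_buf_q3; they remain valid until cfl_store() clears
  // are_parameters_computed again.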
assert(cfl->are_parameters_computed == 0); cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); cfl->are_parameters_computed = 1; } void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, int plane) { CFL_CTX *const cfl = &xd->cfl; MB_MODE_INFO *mbmi = xd->mi[0]; assert(is_cfl_allowed(xd)); if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); const int alpha_q3 = cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= CFL_BUF_SQUARE); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3, xd->bd); return; } #endif cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); } static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, int input_stride, uint16_t *output_q3, int width, int height) { for (int j = 0; j < height; j += 2) { for (int i = 0; i < width; i += 2) { const int bot = i + input_stride; output_q3[i >> 1] = (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; } input += input_stride << 1; output_q3 += CFL_BUF_LINE; } } static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, int input_stride, uint16_t *output_q3, int width, int height) { assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += 2) { output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; } input += input_stride; output_q3 += CFL_BUF_LINE; } } static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, int input_stride, uint16_t *output_q3, int width, int height) { assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { output_q3[i] = input[i] << 3; } input += input_stride; output_q3 += CFL_BUF_LINE; } } #if CONFIG_AV1_HIGHBITDEPTH static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, int input_stride, uint16_t *output_q3, int width, int height) { for (int j = 0; j < height; j += 2) { for (int i = 0; i < width; i += 2) { const int bot = i + input_stride; output_q3[i >> 1] = (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; } input += input_stride << 1; output_q3 += CFL_BUF_LINE; } } static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, int input_stride, uint16_t *output_q3, int width, int height) { assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += 2) { output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; } input += input_stride; output_q3 += CFL_BUF_LINE; } } static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, int input_stride, uint16_t *output_q3, int width, int height) { assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { output_q3[i] = input[i] << 3; } input += input_stride; output_q3 += CFL_BUF_LINE; } } #endif CFL_GET_SUBSAMPLE_FUNCTION(c) #if CONFIG_AV1_HIGHBITDEPTH static inline cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { if (sub_y == 1) { return cfl_get_luma_subsampling_420_hbd(tx_size); } return cfl_get_luma_subsampling_422_hbd(tx_size); } return cfl_get_luma_subsampling_444_hbd(tx_size); } #endif static 
inline cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, int sub_x, int sub_y) { if (sub_x == 1) { if (sub_y == 1) { return cfl_get_luma_subsampling_420_lbd(tx_size); } return cfl_get_luma_subsampling_422_lbd(tx_size); } return cfl_get_luma_subsampling_444_lbd(tx_size); } static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row, int col, TX_SIZE tx_size, int use_hbd) { const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const int tx_off_log2 = MI_SIZE_LOG2; const int sub_x = cfl->subsampling_x; const int sub_y = cfl->subsampling_y; const int store_row = row << (tx_off_log2 - sub_y); const int store_col = col << (tx_off_log2 - sub_x); const int store_height = height >> sub_y; const int store_width = width >> sub_x; // Invalidate current parameters cfl->are_parameters_computed = 0; // Store the surface of the pixel buffer that was written to, this way we // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the // frame boundary) if (col == 0 && row == 0) { cfl->buf_width = store_width; cfl->buf_height = store_height; } else { cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width); cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height); } // Check that we will remain inside the pixel buffer. assert(store_row + store_height <= CFL_BUF_LINE); assert(store_col + store_width <= CFL_BUF_LINE); // Store the input into the CfL pixel buffer uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), input_stride, recon_buf_q3); } else { cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); } #else (void)use_hbd; cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); #endif } // Adjust the row and column of blocks smaller than 8X8, as chroma-referenced // and non-chroma-referenced blocks are stored together in the CfL buffer. static inline void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, int mi_col, int *row_out, int *col_out) { // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. if ((mi_row & 0x01) && cfl->subsampling_y) { assert(*row_out == 0); (*row_out)++; } // Increment col index for right: 4x8, 4x16 or both right 4x4s. if ((mi_col & 0x01) && cfl->subsampling_x) { assert(*col_out == 0); (*col_out)++; } } void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize) { CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { // Only dimensions of size 4 can have an odd offset. 
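// (Illustrative: a col of 1 can only come from a 4-wide transform, e.g. the
// right half of a BLOCK_8X4, which is what the asserts below check. The
// sub8x8_adjust_offset() call then bumps row/col for sub-8x8 luma blocks at
// odd mi_row/mi_col so their pixels are stored next to, rather than on top
// of, their top-left neighbours, since all of them feed the same
// chroma-referenced block.)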
assert(!((col & 1) && tx_size_wide[tx_size] != 4)); assert(!((row & 1) && tx_size_high[tx_size] != 4)); sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); } static inline int max_intra_block_width(const MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, TX_SIZE tx_size) { const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) << MI_SIZE_LOG2; return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); } static inline int max_intra_block_height(const MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, TX_SIZE tx_size) { const int max_blocks_high = max_block_high(xd, plane_bsize, plane) << MI_SIZE_LOG2; return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); } void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { CFL_CTX *const cfl = &xd->cfl; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; int row = 0; int col = 0; if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); } const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); tx_size = get_tx_size(width, height); cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); } aom-3.12.1/av1/common/cfl.h000066400000000000000000000413341477627663500153020ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_CFL_H_ #define AOM_AV1_COMMON_CFL_H_ #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" // Can we use CfL for the current block? static inline CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; assert(bsize < BLOCK_SIZES_ALL); if (xd->lossless[mbmi->segment_id]) { // In lossless, CfL is available when the partition size is equal to the // transform size. const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; const int plane_bsize = get_plane_block_size(bsize, ssx, ssy); return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4); } // Spec: CfL is available to luma partitions lesser than or equal to 32x32 return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 && block_size_high[bsize] <= 32); } // Do we need to save the luma pixels from the current block, // for a possible future CfL prediction? static inline CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; if (cm->seq_params->monochrome) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, // in case the corresponding chroma-reference block uses CfL. // Note that this can only happen for block sizes which are <8 on // their shortest side, as otherwise they would be chroma reference // blocks. 
return CFL_ALLOWED; } // If this block has chroma information, we know whether we're // actually going to perform a CfL prediction return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) && mbmi->uv_mode == UV_CFL_PRED); } static inline int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { int scaled_luma_q6 = alpha_q3 * pred_buf_q3; return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); } static inline CFL_PRED_TYPE get_cfl_pred_type(int plane) { assert(plane > 0); return (CFL_PRED_TYPE)(plane - 1); } static inline void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) { cfl->use_dc_pred_cache = false; cfl->dc_pred_is_cached[CFL_PRED_U] = false; cfl->dc_pred_is_cached[CFL_PRED_V] = false; } void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, int plane); void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size); void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, BLOCK_SIZE bsize); void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, CFL_PRED_TYPE pred_plane, int width); void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); // Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. #define CFL_lbd_TYPE uint8_t *cfl_type #define CFL_hbd_TYPE uint16_t *cfl_type // Declare a size-specific wrapper for the size-generic function. The compiler // will inline the size generic function in here, the advantage is that the size // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3); \ void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ output_q3, width, height); \ } // Declare size-specific wrappers for all valid CfL sizes. #define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \ CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \ CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \ CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \ CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \ CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \ CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \ CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \ CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \ CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \ CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \ CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \ CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \ CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \ CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \ TX_SIZE tx_size) { \ CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ return subfn_##sub[tx_size]; \ } // Declare an architecture-specific array of function pointers for size-specific // wrappers. 
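// (For orientation, an illustrative expansion of the machinery above:
//   CFL_SUBSAMPLE(c, 420, lbd, 4, 4)
// defines cfl_subsample_lbd_420_4x4_c(), a fixed-size wrapper around
// cfl_luma_subsampling_420_lbd_c(..., 4, 4), and the array generated below is
// what lets cfl_get_luma_subsampling_420_lbd_c() map a TX_SIZE to that
// wrapper.)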
#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ NULL, /* 64x64 (invalid CFL size) */ \ cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ NULL, /* 32x64 (invalid CFL size) */ \ NULL, /* 64x32 (invalid CFL size) */ \ cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ NULL, /* 16x64 (invalid CFL size) */ \ NULL, /* 64x16 (invalid CFL size) */ \ }; // The RTCD script does not support passing in an array, so we wrap it in this // function. #if CONFIG_AV1_HIGHBITDEPTH #define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) #else #define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) #endif // Declare a size-specific wrapper for the size-generic function. The compiler // will inline the size generic function in here, the advantage is that the size // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ int16_t *dst); \ void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ int16_t *dst) { \ subtract_average_##arch(src, dst, width, height, round_offset, \ num_pel_log2); \ } // Declare size-specific wrappers for all valid CfL sizes. 
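// (Example instantiation, purely for illustration:
//   CFL_SUB_AVG_X(c, 8, 8, 32, 6)  ->  cfl_subtract_average_8x8_c()
// where round_offset = 32 is half of the 64 pixels, so the mean computed by
// subtract_average_c() with num_pel_log2 = 6 is rounded rather than
// truncated.)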
#define CFL_SUB_AVG_FN(arch) \ CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ TX_SIZE tx_size) { \ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ cfl_subtract_average_4x4_##arch, /* 4x4 */ \ cfl_subtract_average_8x8_##arch, /* 8x8 */ \ cfl_subtract_average_16x16_##arch, /* 16x16 */ \ cfl_subtract_average_32x32_##arch, /* 32x32 */ \ NULL, /* 64x64 (invalid CFL size) */ \ cfl_subtract_average_4x8_##arch, /* 4x8 */ \ cfl_subtract_average_8x4_##arch, /* 8x4 */ \ cfl_subtract_average_8x16_##arch, /* 8x16 */ \ cfl_subtract_average_16x8_##arch, /* 16x8 */ \ cfl_subtract_average_16x32_##arch, /* 16x32 */ \ cfl_subtract_average_32x16_##arch, /* 32x16 */ \ NULL, /* 32x64 (invalid CFL size) */ \ NULL, /* 64x32 (invalid CFL size) */ \ cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ NULL, /* 16x64 (invalid CFL size) */ \ NULL, /* 64x16 (invalid CFL size) */ \ }; \ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ /* index the function pointer array out of bounds. */ \ return sub_avg[tx_size % TX_SIZES_ALL]; \ } #define CFL_PREDICT_lbd(arch, width, height) \ void cfl_predict_lbd_##width##x##height##_##arch( \ const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); \ void cfl_predict_lbd_##width##x##height##_##arch( \ const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ int alpha_q3) { \ cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ height); \ } #if CONFIG_AV1_HIGHBITDEPTH #define CFL_PREDICT_hbd(arch, width, height) \ void cfl_predict_hbd_##width##x##height##_##arch( \ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ int bd); \ void cfl_predict_hbd_##width##x##height##_##arch( \ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ int bd) { \ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ height); \ } #endif // This wrapper exists because clang format does not like calling macros with // lowercase letters. 
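// (So, illustratively, CFL_PREDICT_X(c, 4, 4, lbd) turns into
// CFL_PREDICT_lbd(c, 4, 4), which defines cfl_predict_lbd_4x4_c() on top of
// the size-generic cfl_predict_lbd_c().)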
#define CFL_PREDICT_X(arch, width, height, bd) \ CFL_PREDICT_##bd(arch, width, height) #define CFL_PREDICT_FN(arch, bd) \ CFL_PREDICT_X(arch, 4, 4, bd) \ CFL_PREDICT_X(arch, 4, 8, bd) \ CFL_PREDICT_X(arch, 4, 16, bd) \ CFL_PREDICT_X(arch, 8, 4, bd) \ CFL_PREDICT_X(arch, 8, 8, bd) \ CFL_PREDICT_X(arch, 8, 16, bd) \ CFL_PREDICT_X(arch, 8, 32, bd) \ CFL_PREDICT_X(arch, 16, 4, bd) \ CFL_PREDICT_X(arch, 16, 8, bd) \ CFL_PREDICT_X(arch, 16, 16, bd) \ CFL_PREDICT_X(arch, 16, 32, bd) \ CFL_PREDICT_X(arch, 32, 8, bd) \ CFL_PREDICT_X(arch, 32, 16, bd) \ CFL_PREDICT_X(arch, 32, 32, bd) \ cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ NULL, /* 64x64 (invalid CFL size) */ \ cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ NULL, /* 32x64 (invalid CFL size) */ \ NULL, /* 64x32 (invalid CFL size) */ \ cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ NULL, /* 16x64 (invalid CFL size) */ \ NULL, /* 64x16 (invalid CFL size) */ \ }; \ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ /* index the function pointer array out of bounds. */ \ return pred[tx_size % TX_SIZES_ALL]; \ } #endif // AOM_AV1_COMMON_CFL_H_ aom-3.12.1/av1/common/common.h000066400000000000000000000035261477627663500160270ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_COMMON_H_ #define AOM_AV1_COMMON_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom/aom_integer.h" #include "aom_ports/bitops.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif // Only need this for fixed-size arrays, for structs just assign. #define av1_copy(dest, src) \ do { \ assert(sizeof(dest) == sizeof(src)); \ memcpy(dest, src, sizeof(src)); \ } while (0) // Use this for variably-sized arrays. #define av1_copy_array(dest, src, n) \ do { \ assert(sizeof(*(dest)) == sizeof(*(src))); \ memcpy(dest, src, n * sizeof(*(src))); \ } while (0) #define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) #define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) static inline int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } #define CHECK_MEM_ERROR(cm, lval, expr) \ AOM_CHECK_MEM_ERROR((cm)->error, lval, expr) #define AOM_FRAME_MARKER 0x2 #define AV1_MIN_TILE_SIZE_BYTES 1 #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_COMMON_H_ aom-3.12.1/av1/common/common_data.c000066400000000000000000000046411477627663500170120ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/common_data.h" // The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual // size function). /* clang-format off */ const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } }; /* clang-format on */ aom-3.12.1/av1/common/common_data.h000066400000000000000000000312201477627663500170100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_COMMON_DATA_H_ #define AOM_AV1_COMMON_COMMON_DATA_H_ #include "av1/common/enums.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif // Log 2 conversion lookup tables in units of mode info (4x4). 
// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 }; // The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 }; // Width/height lookup tables in units of mode info (4x4). // The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 }; // The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 }; // Width/height lookup tables in units of samples. // The Block_Width table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 }; // The Block_Height table in the spec (Section 9.3. Conversion tables). static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 }; // Maps a block size to a context. // The Size_Group table in the spec (Section 9.3. Conversion tables). // AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 }; static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 }; // A compressed version of the Partition_Subsize table in the spec (9.3. // Conversion tables), for square block sizes only. 
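// (Reading example, illustrative only: the PARTITION_HORZ row maps the 32x32
// column to BLOCK_32X16, and the PARTITION_VERT_4 row maps the same column to
// BLOCK_8X32; rows are partition types, columns are the square parent block
// sizes 4x4 .. 128x128.)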
/* clang-format off */ static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { { // PARTITION_NONE BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 }, { // PARTITION_HORZ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_SPLIT BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 }, { // PARTITION_HORZ_A BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_HORZ_B BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 }, { // PARTITION_VERT_A BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_VERT_B BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 }, { // PARTITION_HORZ_4 BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID }, { // PARTITION_VERT_4 BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID } }; static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { // 4X4 TX_4X4, // 4X8, 8X4, 8X8 TX_4X4, TX_4X4, TX_8X8, // 8X16, 16X8, 16X16 TX_8X8, TX_8X8, TX_16X16, // 16X32, 32X16, 32X32 TX_16X16, TX_16X16, TX_32X32, // 32X64, 64X32, TX_32X32, TX_32X32, // 64X64 TX_64X64, // 64x128, 128x64, 128x128 TX_64X64, TX_64X64, TX_64X64, // 4x16, 16x4, 8x32 TX_4X4, TX_4X4, TX_8X8, // 32x8, 16x64 64x16 TX_8X8, TX_16X16, TX_16X16 }; static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { // 4X4 TX_4X4, // 4X8, 8X4, 8X8 TX_4X8, TX_8X4, TX_8X8, // 8X16, 16X8, 16X16 TX_8X16, TX_16X8, TX_16X16, // 16X32, 32X16, 32X32 TX_16X32, TX_32X16, TX_32X32, // 32X64, 64X32, TX_32X64, TX_64X32, // 64X64 TX_64X64, // 64x128, 128x64, 128x128 TX_64X64, TX_64X64, TX_64X64, // 4x16, 16x4, TX_4X16, TX_16X4, // 8x32, 32x8 TX_8X32, TX_32X8, // 16x64, 64x16 TX_16X64, TX_64X16 }; static const TX_TYPE_1D vtx_tab[TX_TYPES] = { DCT_1D, ADST_1D, DCT_1D, ADST_1D, FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, }; static const TX_TYPE_1D htx_tab[TX_TYPES] = { DCT_1D, DCT_1D, ADST_1D, ADST_1D, DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, }; #define TXSIZE_CAT_INVALID (-1) /* clang-format on */ static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { TX_4X4, // TX_4X4 TX_4X4, // TX_8X8 TX_8X8, // TX_16X16 TX_16X16, // TX_32X32 TX_32X32, // TX_64X64 TX_4X4, // TX_4X8 TX_4X4, // TX_8X4 TX_8X8, // TX_8X16 TX_8X8, // TX_16X8 TX_16X16, // TX_16X32 TX_16X16, // TX_32X16 TX_32X32, // TX_32X64 TX_32X32, // TX_64X32 TX_4X8, // TX_4X16 TX_8X4, // TX_16X4 TX_8X16, // TX_8X32 TX_16X8, // TX_32X8 TX_16X32, // TX_16X64 TX_32X16, // TX_64X16 }; static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 TX_64X64, // TX_64X64 TX_4X4, // TX_4X8 TX_8X8, // TX_8X4 TX_8X8, // TX_8X16 TX_16X16, // TX_16X8 TX_16X16, // TX_16X32 TX_32X32, // TX_32X16 TX_32X32, // TX_32X64 TX_64X64, // TX_64X32 TX_4X4, // TX_4X16 TX_16X16, // TX_16X4 TX_8X8, // TX_8X32 TX_32X32, // TX_32X8 TX_16X16, // TX_16X64 TX_64X64, // TX_64X16 }; static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 TX_64X64, // TX_64X64 TX_8X8, // TX_4X8 TX_4X4, // TX_8X4 
TX_16X16, // TX_8X16 TX_8X8, // TX_16X8 TX_32X32, // TX_16X32 TX_16X16, // TX_32X16 TX_64X64, // TX_32X64 TX_32X32, // TX_64X32 TX_16X16, // TX_4X16 TX_4X4, // TX_16X4 TX_32X32, // TX_8X32 TX_8X8, // TX_32X8 TX_64X64, // TX_16X64 TX_16X16, // TX_64X16 }; #define TX_SIZE_W_MIN 4 // Transform block width in pixels static const int tx_size_wide[TX_SIZES_ALL] = { 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, }; #define TX_SIZE_H_MIN 4 // Transform block height in pixels static const int tx_size_high[TX_SIZES_ALL] = { 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, }; // Transform block width in unit static const int tx_size_wide_unit[TX_SIZES_ALL] = { 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, }; // Transform block height in unit static const int tx_size_high_unit[TX_SIZES_ALL] = { 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, }; // Transform block width in log2 static const int tx_size_wide_log2[TX_SIZES_ALL] = { 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, }; // Transform block width in log2 unit static const int tx_size_wide_unit_log2[TX_SIZES_ALL] = { 0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4, }; // Transform block height in log2 static const int tx_size_high_log2[TX_SIZES_ALL] = { 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, }; // Transform block height in log2 unit static const int tx_size_high_unit_log2[TX_SIZES_ALL] = { 0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2, }; static const int tx_size_2d[TX_SIZES_ALL + 1] = { 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, }; static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { BLOCK_4X4, // TX_4X4 BLOCK_8X8, // TX_8X8 BLOCK_16X16, // TX_16X16 BLOCK_32X32, // TX_32X32 BLOCK_64X64, // TX_64X64 BLOCK_4X8, // TX_4X8 BLOCK_8X4, // TX_8X4 BLOCK_8X16, // TX_8X16 BLOCK_16X8, // TX_16X8 BLOCK_16X32, // TX_16X32 BLOCK_32X16, // TX_32X16 BLOCK_32X64, // TX_32X64 BLOCK_64X32, // TX_64X32 BLOCK_4X16, // TX_4X16 BLOCK_16X4, // TX_16X4 BLOCK_8X32, // TX_8X32 BLOCK_32X8, // TX_32X8 BLOCK_16X64, // TX_16X64 BLOCK_64X16, // TX_64X16 }; static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 TX_64X64, // TX_64X64 TX_4X4, // TX_4X8 TX_4X4, // TX_8X4 TX_8X8, // TX_8X16 TX_8X8, // TX_16X8 TX_16X16, // TX_16X32 TX_16X16, // TX_32X16 TX_32X32, // TX_32X64 TX_32X32, // TX_64X32 TX_4X4, // TX_4X16 TX_4X4, // TX_16X4 TX_8X8, // TX_8X32 TX_8X8, // TX_32X8 TX_16X16, // TX_16X64 TX_16X16, // TX_64X16 }; static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { TX_4X4, // TX_4X4 TX_8X8, // TX_8X8 TX_16X16, // TX_16X16 TX_32X32, // TX_32X32 TX_64X64, // TX_64X64 TX_8X8, // TX_4X8 TX_8X8, // TX_8X4 TX_16X16, // TX_8X16 TX_16X16, // TX_16X8 TX_32X32, // TX_16X32 TX_32X32, // TX_32X16 TX_64X64, // TX_32X64 TX_64X64, // TX_64X32 TX_16X16, // TX_4X16 TX_16X16, // TX_16X4 TX_32X32, // TX_8X32 TX_32X32, // TX_32X8 TX_64X64, // TX_16X64 TX_64X64, // TX_64X16 }; static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { 0, // TX_4X4 2, // TX_8X8 4, // TX_16X16 6, // TX_32X32 6, // TX_64X64 1, // TX_4X8 1, // TX_8X4 3, // TX_8X16 3, // TX_16X8 5, // TX_16X32 5, // TX_32X16 6, // TX_32X64 6, // TX_64X32 2, // TX_4X16 2, // TX_16X4 4, // TX_8X32 4, // TX_32X8 5, // TX_16X64 5, // TX_64X16 }; static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, // ONLY_4X4 TX_64X64, // TX_MODE_LARGEST TX_64X64, // TX_MODE_SELECT }; // The 
Subsampled_Size table in the spec (Section 5.11.38. Get plane residual // size function). extern const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2]; // Generates 5 bit field in which each bit set to 1 represents // a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 // and 8x8. 10000 means we just split the 128x128 to 64x64 /* clang-format off */ static const struct { PARTITION_CONTEXT above; PARTITION_CONTEXT left; } partition_context_lookup[BLOCK_SIZES_ALL] = { { 31, 31 }, // 4X4 - {0b11111, 0b11111} { 31, 30 }, // 4X8 - {0b11111, 0b11110} { 30, 31 }, // 8X4 - {0b11110, 0b11111} { 30, 30 }, // 8X8 - {0b11110, 0b11110} { 30, 28 }, // 8X16 - {0b11110, 0b11100} { 28, 30 }, // 16X8 - {0b11100, 0b11110} { 28, 28 }, // 16X16 - {0b11100, 0b11100} { 28, 24 }, // 16X32 - {0b11100, 0b11000} { 24, 28 }, // 32X16 - {0b11000, 0b11100} { 24, 24 }, // 32X32 - {0b11000, 0b11000} { 24, 16 }, // 32X64 - {0b11000, 0b10000} { 16, 24 }, // 64X32 - {0b10000, 0b11000} { 16, 16 }, // 64X64 - {0b10000, 0b10000} { 16, 0 }, // 64X128- {0b10000, 0b00000} { 0, 16 }, // 128X64- {0b00000, 0b10000} { 0, 0 }, // 128X128-{0b00000, 0b00000} { 31, 28 }, // 4X16 - {0b11111, 0b11100} { 28, 31 }, // 16X4 - {0b11100, 0b11111} { 30, 24 }, // 8X32 - {0b11110, 0b11000} { 24, 30 }, // 32X8 - {0b11000, 0b11110} { 28, 16 }, // 16X64 - {0b11100, 0b10000} { 16, 28 }, // 64X16 - {0b10000, 0b11100} }; /* clang-format on */ static const int intra_mode_context[INTRA_MODES] = { 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0, }; // Note: this is also used in unit tests. So whenever one changes the table, // the unit tests need to be changed accordingly. static const int quant_dist_weight[4][2] = { { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } }; static const int quant_dist_lookup_table[4][2] = { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }, }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_COMMON_DATA_H_ aom-3.12.1/av1/common/convolve.c000066400000000000000000001731611477627663500163700ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/resize.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn) { src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_qn = x0_qn; for (int x = 0; x < w; ++x) { const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; const int x_filter_idx = (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; assert(x_filter_idx <= RS_SUBPEL_MASK); const int16_t *const x_filter = &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; int sum = 0; for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); x_qn += x_step_qn; } src += src_stride; dst += dst_stride; } } #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd) { src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_qn = x0_qn; for (int x = 0; x < w; ++x) { const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; const int x_filter_idx = (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; assert(x_filter_idx <= RS_SUBPEL_MASK); const int16_t *const x_filter = &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; int sum = 0; for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_qn += x_step_qn; } src += src_stride; dst += dst_stride; } } #endif // CONFIG_AV1_HIGHBITDEPTH void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bd = 8; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can // be beyond the following range. For better prediction, a clamping can be // added for 12 tap filter to ensure the horizontal filtering result is // within 16 bit. The same applies to the vertical filtering. 
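// (For the existing up-to-8-tap filters with bd = 8, the running offset of
// 1 << (bd + FILTER_BITS - 1) keeps sum non-negative and below
// 1 << (bd + FILTER_BITS + 1), which is what the assert below encodes and
// what allows the rounded value to be stored in the int16_t im_block.)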
assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); } } } void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { const int fo_vert = filter_params_y->taps / 2 - 1; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_y->taps; ++k) { res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); } } } void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_x->taps; ++k) { res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } res = ROUND_POWER_OF_TWO(res, conv_params->round_0); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); } } } // This function is exactly the same as av1_convolve_2d_sr_c, and is an // optimized version for intrabc. 
Use the following 2-tap filter: // DECLARE_ALIGNED(256, static const int16_t, // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // }; void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { assert(subpel_x_qn == 8); assert(subpel_y_qn == 8); assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)filter_params_y; (void)subpel_y_qn; (void)conv_params; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + 1; int im_stride = w; assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int bd = 8; // horizontal filter // explicitly operate for subpel_x_qn = 8. int16_t *im = im_block; for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { const int32_t sum = (1 << bd) + src[x] + src[x + 1]; assert(0 <= sum && sum < (1 << (bd + 2))); im[x] = sum; } src += src_stride; im += im_stride; } // vertical filter // explicitly operate for subpel_y_qn = 8. int16_t *src_vert = im_block; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const int32_t sum = (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x]; assert(0 <= sum && sum < (1 << (bd + 4))); const int16_t res = ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1))); dst[x] = clip_pixel(res); } src_vert += im_stride; dst += dst_stride; } } // This function is exactly the same as av1_convolve_y_sr_c, and is an // optimized version for intrabc. void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { assert(subpel_y_qn == 8); assert(filter_params_y->taps == 2); (void)filter_params_y; (void)subpel_y_qn; // vertical filter // explicitly operate for subpel_y_qn = 8. for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const int32_t res = src[x] + src[src_stride + x]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); } src += src_stride; dst += dst_stride; } } // This function is exactly the same as av1_convolve_x_sr_c, and is an // optimized version for intrabc. void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { assert(subpel_x_qn == 8); assert(filter_params_x->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)conv_params; // horizontal filter // explicitly operate for subpel_x_qn = 8. 
for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const int32_t res = src[x] + src[x + 1]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); } src += src_stride; dst += dst_stride; } } void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bd = 8; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_y->taps; ++k) { res += y_filter[k] * src[(y - 
fo_vert + k) * src_stride + x]; } res *= (1 << bits); res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_x->taps; ++k) { res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); res += round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int bd = 8; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { CONV_BUF_TYPE res = src[y * src_stride + x] << bits; res += round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + 
filter_params_y->taps; CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; assert(bits >= 0); int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bd = 8; // horizontal filter const uint8_t *src_horiz = src - fo_vert * src_stride; for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; for (int y = 0; y < h; ++y, y_qn += y_step_qn) { const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } /* Subtract round offset and convolve round */ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { dst16[y * dst16_stride + x] = res; } } else { /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } } src_vert++; } } static void convolve_2d_scale_wrapper( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { if (conv_params->is_compound) { assert(conv_params->dst != NULL); } av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn, conv_params); } static void convolve_2d_facade_compound( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const 
int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const bool need_x = subpel_x_qn != 0; const bool need_y = subpel_y_qn != 0; if (!need_x && !need_y) { av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, conv_params); } else if (need_x && !need_y) { av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else if (!need_x && need_y) { av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, conv_params); } else { assert(need_y && need_x); av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } static void convolve_2d_facade_single( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const bool need_x = subpel_x_qn != 0; const bool need_y = subpel_y_qn != 0; if (!need_x && !need_y) { aom_convolve_copy(src, src_stride, dst, dst_stride, w, h); } else if (need_x && !need_y) { av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else if (!need_x && need_y) { av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } else { assert(need_x && need_y); av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *interp_filters[2], const int subpel_x_qn, int x_step_q4, const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params) { (void)x_step_q4; (void)y_step_q4; (void)dst; (void)dst_stride; const InterpFilterParams *filter_params_x = interp_filters[0]; const InterpFilterParams *filter_params_y = interp_filters[1]; // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case. // 2-tap filter indicates that it is for IntraBC. 
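// (Dispatch overview, as a reading aid: 2-tap IntraBC requests are routed to
// the *_sr_intrabc helpers below, scaled prediction goes through
// convolve_2d_scale_wrapper(), and the remaining cases pick copy, x-only,
// y-only or full 2D convolution in the compound / single-reference facades
// depending on which sub-pixel offsets are non-zero.)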
if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert(!scaled); if (subpel_x_qn && subpel_y_qn) { av1_convolve_2d_sr_intrabc(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); return; } else if (subpel_x_qn) { av1_convolve_x_sr_intrabc(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); return; } else if (subpel_y_qn) { av1_convolve_y_sr_intrabc(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); return; } } if (scaled) { convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_q4, subpel_y_qn, y_step_q4, conv_params); } else if (conv_params->is_compound) { convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } else { convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_x->taps; ++k) { res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } res = ROUND_POWER_OF_TWO(res, conv_params->round_0); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } } } void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { const int fo_vert = filter_params_y->taps / 2 - 1; // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_y->taps; ++k) { res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); } } } void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + filter_params_y->taps - 1; int im_stride = w; assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & 
SUBPEL_MASK); for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); im_block[y * im_stride + x] = ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } } } // This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an // optimized version for intrabc. Use the following 2-tap filter: // DECLARE_ALIGNED(256, static const int16_t, // av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { // 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // }; void av1_highbd_convolve_2d_sr_intrabc_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; assert(bits >= 0); assert(subpel_x_qn == 8); assert(subpel_y_qn == 8); assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; (void)filter_params_y; (void)subpel_y_qn; (void)conv_params; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; int im_h = h + 1; int im_stride = w; assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); // horizontal filter // explicitly operate for subpel_x_qn = 8. int16_t *im = im_block; for (int y = 0; y < im_h; ++y) { for (int x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]); assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0); im[x] = sum; } src += src_stride; im += im_stride; } // vertical filter // explicitly operate for subpel_y_qn = 8. int16_t *src_vert = im_block; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const int32_t sum = (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]); assert(0 <= sum && sum < (1 << (offset_bits + 2))); const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } src_vert += im_stride; dst += dst_stride; } } // This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an // optimized version for intrabc. 
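// At the half-pel phase the two bilinear taps are equal (64, 64), so the
// vertical filter below reduces to averaging vertically adjacent pixels with
// a single rounding shift: ROUND_POWER_OF_TWO(src[x] + src[src_stride + x], 1).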
void av1_highbd_convolve_y_sr_intrabc_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, int bd) { assert(subpel_y_qn == 8); assert(filter_params_y->taps == 2); (void)filter_params_y; (void)subpel_y_qn; // vertical filter // explicitly operate for subpel_y_qn = 8. for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { const int32_t res = src[x] + src[src_stride + x]; dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd); } src += src_stride; dst += dst_stride; } } // This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an // optimized version for intrabc. void av1_highbd_convolve_x_sr_intrabc_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { const int bits = FILTER_BITS - conv_params->round_0; assert(bits >= 0); assert(subpel_x_qn == 8); assert(filter_params_x->taps == 2); assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); (void)filter_params_x; (void)subpel_x_qn; // horizontal filter // explicitly operate for subpel_x_qn = 8. for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 64 * (src[x] + src[x + 1]); res = ROUND_POWER_OF_TWO(res, conv_params->round_0); dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); } src += src_stride; dst += dst_stride; } } void av1_highbd_dist_wtd_convolve_2d_c( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int x, y, k; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (y = 0; y < im_h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; } assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); (void)bd; im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { int32_t sum = 1 << offset_bits; for (k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * 
conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); assert(bits >= 0); // horizontal filter const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_x->taps; ++k) { res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; } res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); res += round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const int bits = FILTER_BITS - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; assert(round_bits >= 0); assert(bits >= 0); // vertical filter const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int32_t res = 0; for (int k = 0; k < filter_params_y->taps; ++k) { res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; } res *= (1 << bits); res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int 
src_stride, uint16_t *dst, int dst_stride, int w, int h, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst16 = conv_params->dst; int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int round_offset = (1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1)); assert(bits >= 0); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { CONV_BUF_TYPE res = src[y * src_stride + x] << bits; res += round_offset; if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } tmp -= round_offset; dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } else { dst16[y * dst16_stride + x] = res; } } } } void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; int im_stride = w; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; assert(bits >= 0); // horizontal filter const uint16_t *src_horiz = src - fo_vert * src_stride; for (int y = 0; y < im_h; ++y) { int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(x_filter_idx < SUBPEL_SHIFTS); const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < filter_params_x->taps; ++k) { sum += x_filter[k] * src_x[k - fo_horiz]; } assert(filter_params_x->taps > 8 || (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); } src_horiz += src_stride; } // vertical filter int16_t *src_vert = im_block + fo_vert * im_stride; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; for (int x = 0; x < w; ++x) { int y_qn = subpel_y_qn; for (int y = 0; y < h; ++y, y_qn += y_step_qn) { const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(y_filter_idx < SUBPEL_SHIFTS); const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); int32_t sum = 1 << offset_bits; for (int k = 0; k < filter_params_y->taps; ++k) { sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; } assert(filter_params_y->taps > 8 || (0 <= sum && sum < (1 << (offset_bits + 2)))); CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if 
(conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } /* Subtract round offset and convolve round */ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } else { dst16[y * dst16_stride + x] = res; } } else { /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } } src_vert++; } } static void highbd_convolve_2d_facade_compound( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const int w, const int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const bool need_x = subpel_x_qn != 0; const bool need_y = subpel_y_qn != 0; if (!need_x && !need_y) { av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, conv_params, bd); } else if (need_x && !need_y) { av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); } else if (!need_x && need_y) { av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, conv_params, bd); } else { assert(need_x && need_y); av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } } static void highbd_convolve_2d_facade_single( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const int w, const int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { const bool need_x = subpel_x_qn != 0; const bool need_y = subpel_y_qn != 0; if (!need_x && !need_y) { aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h); } else if (need_x && !need_y) { av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); } else if (!need_x && need_y) { av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); } else { assert(need_x && need_y); av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } } void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w, int h, const InterpFilterParams *interp_filters[2], const int subpel_x_qn, int x_step_q4, const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, int bd) { (void)x_step_q4; (void)y_step_q4; (void)dst_stride; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const InterpFilterParams *filter_params_x = interp_filters[0]; const InterpFilterParams *filter_params_y = interp_filters[1]; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); // 2-tap filter indicates that it is for IntraBC. 
if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); assert(!scaled); if (subpel_x_qn && subpel_y_qn) { av1_highbd_convolve_2d_sr_intrabc_c( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } else if (subpel_x_qn) { av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params, bd); return; } else if (subpel_y_qn) { av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn, bd); return; } } if (scaled) { if (conv_params->is_compound) { assert(conv_params->dst != NULL); } av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, x_step_q4, subpel_y_qn, y_step_q4, conv_params, bd); } else if (conv_params->is_compound) { highbd_convolve_2d_facade_compound( src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } else { highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); } } #endif // CONFIG_AV1_HIGHBITDEPTH // Note: Fixed size intermediate buffers, place limits on parameters // of some functions. 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. // (2) Interpolate temp vertically to derive the sub-pixel result. // Deriving the maximum number of rows in the temp buffer (135): // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). // --Largest block size is 128x128 pixels. // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the // original frame (in 1/16th pixel units). // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((128 - 1) * 32 + 15) >> 4 + 8 = 263. #define WIENER_MAX_EXT_SIZE 263 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline int horz_scalar_product(const uint8_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } #if CONFIG_AV1_HIGHBITDEPTH static inline int highbd_horz_scalar_product(const uint16_t *a, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; return sum; } #endif static inline int highbd_vert_scalar_product(const uint16_t *a, ptrdiff_t a_stride, const int16_t *b) { int sum = 0; for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; return sum; } static const InterpKernel *get_filter_base(const int16_t *filter) { // NOTE: This assumes that the filter table is 256-byte aligned. // TODO(agrange) Modify to make independent of table alignment. 
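// Each kernel table holds SUBPEL_SHIFTS (16) kernels of SUBPEL_TAPS (8)
// int16_t taps, i.e. 16 * 8 * 2 = 256 bytes, so for an aligned table the low
// 8 address bits encode only the sub-pel phase. Masking them off recovers the
// table base; get_filter_offset() below recovers the phase as the kernel
// distance from that base.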
return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); } static int get_filter_offset(const int16_t *f, const InterpKernel *base) { return (int)((const InterpKernel *)(intptr_t)f - base); } static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int round0_bits) { const int bd = 8; src -= SUBPEL_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; for (int x = 0; x < w; ++x) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + (1 << (bd + FILTER_BITS - 1)); const int sum = horz_scalar_product(src_x, x_filter) + rounding; dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); x_q4 += x_step_q4; } src += src_stride; dst += dst_stride; } } static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int round1_bits) { const int bd = 8; src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; for (int y = 0; y < h; ++y) { const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; const int rounding = ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - (1 << (bd + round1_bits - 1)); const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); y_q4 += y_step_q4; } ++src; ++dst; } } void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, x_step_q4, w, intermediate_height, conv_params->round_0); convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #if CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static void highbd_convolve_add_src_horiz_hip( const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int round0_bits, int bd) { const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); uint16_t *src = CONVERT_TO_SHORTPTR(src8); src -= SUBPEL_TAPS / 2 - 1; for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; for (int x = 0; x < w; ++x) { const uint16_t *const src_x = &src[x_q4 >> 
SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + (1 << (bd + FILTER_BITS - 1)); const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, extraprec_clamp_limit - 1); x_q4 += x_step_q4; } src += src_stride; dst += dst_stride; } } static void highbd_convolve_add_src_vert_hip( const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int round1_bits, int bd) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; for (int y = 0; y < h; ++y) { const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; const int rounding = ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - (1 << (bd + round1_bits - 1)); const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); y_q4 += y_step_q4; } ++src; ++dst; } } void av1_highbd_wiener_convolve_add_src_c( const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, x_step_q4, w, intermediate_height, conv_params->round_0, bd); highbd_convolve_add_src_vert_hip( temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/convolve.h000066400000000000000000000115521477627663500163700ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_CONVOLVE_H_ #define AOM_AV1_COMMON_CONVOLVE_H_ #include "av1/common/filter.h" #ifdef __cplusplus extern "C" { #endif typedef uint16_t CONV_BUF_TYPE; typedef struct ConvolveParams { int do_average; CONV_BUF_TYPE *dst; int dst_stride; int round_0; int round_1; int plane; int is_compound; int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; } ConvolveParams; typedef struct WienerConvolveParams { int round_0; int round_1; } WienerConvolveParams; #define ROUND0_BITS 3 #define COMPOUND_ROUND1_BITS 7 #define WIENER_ROUND0_BITS 3 #define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params); typedef void (*aom_highbd_convolve_fn_t)( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd); struct AV1Common; struct scale_factors; void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *interp_filters[2], const int subpel_x_qn, int x_step_q4, const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params); static inline ConvolveParams get_conv_params_no_round(int cmp_index, int plane, CONV_BUF_TYPE *dst, int dst_stride, int is_compound, int bd) { ConvolveParams conv_params; assert(IMPLIES(cmp_index, is_compound)); conv_params.is_compound = is_compound; conv_params.use_dist_wtd_comp_avg = 0; conv_params.round_0 = ROUND0_BITS; conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS : 2 * FILTER_BITS - conv_params.round_0; #if CONFIG_AV1_HIGHBITDEPTH const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; assert(IMPLIES(bd < 12, intbufrange <= 16)); if (intbufrange > 16) { conv_params.round_0 += intbufrange - 16; if (!is_compound) conv_params.round_1 -= intbufrange - 16; } #else (void)bd; #endif // CONFIG_AV1_HIGHBITDEPTH // TODO(yunqing): The following dst should only be valid while // is_compound = 1; conv_params.dst = dst; conv_params.dst_stride = dst_stride; conv_params.plane = plane; // By default, set do average to 1 if this is the second single prediction // in a compound mode. 
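// cmp_index is 0 for the first prediction of a compound pair (its rounded
// intermediate result is written straight into the CONV_BUF_TYPE buffer) and
// 1 for the second, which is averaged or distance-weighted against the stored
// first prediction.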
conv_params.do_average = cmp_index; return conv_params; } static inline ConvolveParams get_conv_params(int do_average, int plane, int bd) { return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); } static inline WienerConvolveParams get_conv_params_wiener(int bd) { WienerConvolveParams conv_params; conv_params.round_0 = WIENER_ROUND0_BITS; conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0; const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; assert(IMPLIES(bd < 12, intbufrange <= 16)); if (intbufrange > 16) { conv_params.round_0 += intbufrange - 16; conv_params.round_1 -= intbufrange - 16; } return conv_params; } void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *interp_filters[2], const int subpel_x_qn, int x_step_q4, const int subpel_y_qn, int y_step_q4, int scaled, ConvolveParams *conv_params, int bd); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_CONVOLVE_H_ aom-3.12.1/av1/common/debugmodes.c000066400000000000000000000100001477627663500166310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/debugmodes.h" #include #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #if 0 static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { fprintf(f, "%s", str); fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number, cm->show_frame, cm->quant_params.base_qindex); } /* This function dereferences a pointer to the mbmi structure * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. */ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { const CommonModeInfoParams *const mi_params = &cm->mi_params; MB_MODE_INFO **mi = mi_params->mi_grid_base; int rows = mi_params->mi_rows; int cols = mi_params->mi_cols; char prefix = descriptor[0]; log_frame_info(cm, descriptor, file); for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); mi += mi_params->mi_stride - cols; } fprintf(file, "\n"); } void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { CommonModeInfoParams *mi_params = &cm->mi_params; FILE *mvs = fopen(file, "a"); MB_MODE_INFO **mi = mi_params->mi_grid_base; const int rows = mi_params->mi_rows; const int cols = mi_params->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); // output skip infomation. 
log_frame_info(cm, "Skips:", mvs); for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "S "); for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%2d ", mi[0]->skip_txfm); mi++; } fprintf(mvs, "\n"); mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); // output motion vectors. log_frame_info(cm, "Vectors ", mvs); mi = mi_params->mi_grid_base; for (int mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); for (int mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); mi++; } fprintf(mvs, "\n"); mi += mi_params->mi_stride - cols; } fprintf(mvs, "\n"); fclose(mvs); } #endif // 0 void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename) { FILE *hdrFile = fopen(filename, "w"); fwrite(data, size, sizeof(uint8_t), hdrFile); // Reset order hints(7bit + a previous bit) to 0, so that all camera frame // headers are identical in large scale coding. uint8_t zero = 0; fseek(hdrFile, 1, SEEK_SET); // Reset second byte. fwrite(&zero, 1, sizeof(uint8_t), hdrFile); fclose(hdrFile); } void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) { FILE *fcFile = fopen(filename, "w"); const uint16_t *fcp = (uint16_t *)fc; const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t); unsigned int i; for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++); fclose(fcFile); } aom-3.12.1/av1/common/debugmodes.h000066400000000000000000000020151477627663500166450ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_DEBUGMODES_H_ #define AOM_AV1_COMMON_DEBUGMODES_H_ #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file); void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename); void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename); #endif // AOM_AV1_COMMON_DEBUGMODES_H_ aom-3.12.1/av1/common/entropy.c000066400000000000000000000176251477627663500162370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/scan.h" #include "av1/common/token_cdfs.h" #include "av1/common/txb_common.h" static int get_q_ctx(int q) { if (q <= 20) return 0; if (q <= 60) return 1; if (q <= 120) return 2; return 3; } void av1_default_coef_probs(AV1_COMMON *cm) { const int index = get_q_ctx(cm->quant_params.base_qindex); #if CONFIG_ENTROPY_STATS cm->coef_cdf_category = index; #endif av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); av1_copy(cm->fc->coeff_base_eob_cdf, av1_default_coeff_base_eob_multi_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); } static inline void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs, int cdf_stride, int nsymbs) { for (int i = 0; i < num_cdfs; i++) { cdf_ptr[i * cdf_stride + nsymbs] = 0; } } #define RESET_CDF_COUNTER(cname, nsymbs) \ RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) #define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ do { \ aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ int num_cdfs = array_size / cdf_stride; \ reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ } while (0) static inline void reset_nmv_counter(nmv_context *nmv) { RESET_CDF_COUNTER(nmv->joints_cdf, 4); for (int i = 0; i < 2; i++) { RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); } } void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); RESET_CDF_COUNTER(fc->newmv_cdf, 2); RESET_CDF_COUNTER(fc->zeromv_cdf, 2); RESET_CDF_COUNTER(fc->refmv_cdf, 2); RESET_CDF_COUNTER(fc->drl_cdf, 2); RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); 
RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); RESET_CDF_COUNTER(fc->interintra_cdf, 2); RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); RESET_CDF_COUNTER(fc->obmc_cdf, 2); RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); for (int j = 0; j < PALETTE_SIZES; j++) { int nsymbs = j + PALETTE_MIN_SIZE; RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, CDF_SIZE(PALETTE_COLORS)); RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, CDF_SIZE(PALETTE_COLORS)); } RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); RESET_CDF_COUNTER(fc->single_ref_cdf, 2); RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); RESET_CDF_COUNTER(fc->compound_index_cdf, 2); RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2); RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); reset_nmv_counter(&fc->nmvc); reset_nmv_counter(&fc->ndvc); RESET_CDF_COUNTER(fc->intrabc_cdf, 2); RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); for (int i = 0; i < PARTITION_CONTEXTS; i++) { if (i < 4) { RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); } else if (i < 16) { RESET_CDF_COUNTER(fc->partition_cdf[i], 10); } else { RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); } } RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, CDF_SIZE(MAX_TX_DEPTH + 1)); RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); for (int i = 0; i < FRAME_LF_COUNT; i++) { RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); } RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); } aom-3.12.1/av1/common/entropy.h000066400000000000000000000120421477627663500162300ustar00rootroot00000000000000/* * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ENTROPY_H_ #define AOM_AV1_COMMON_ENTROPY_H_ #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/prob.h" #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif #define TOKEN_CDF_Q_CTXS 4 #define TXB_SKIP_CONTEXTS 13 #define EOB_COEF_CONTEXTS 9 #define SIG_COEF_CONTEXTS_2D 26 #define SIG_COEF_CONTEXTS_1D 16 #define SIG_COEF_CONTEXTS_EOB 4 #define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) #define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) #define DC_SIGN_CONTEXTS 3 #define BR_TMP_OFFSET 12 #define BR_REF_CAT 4 #define LEVEL_CONTEXTS 21 #define NUM_BASE_LEVELS 2 #define BR_CDF_SIZE (4) #define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) #define COEFF_CONTEXT_BITS 3 #define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) #define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) #define BASE_CONTEXT_POSITION_NUM 12 enum { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, TX_CLASS_VERT = 2, TX_CLASSES = 3, } UENUM1BYTE(TX_CLASS); #define DCT_MAX_VALUE 16384 #define DCT_MAX_VALUE_HIGH10 65536 #define DCT_MAX_VALUE_HIGH12 262144 /* Coefficients are predicted via a 3-dimensional probability table indexed on * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ #define REF_TYPES 2 // intra=0, inter=1 struct AV1Common; struct frame_contexts; void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); void av1_default_coef_probs(struct AV1Common *cm); void av1_init_mode_probs(struct frame_contexts *fc); struct frame_contexts; typedef char ENTROPY_CONTEXT; static inline int combine_entropy_contexts(ENTROPY_CONTEXT a, ENTROPY_CONTEXT b) { return (a != 0) + (b != 0); } static inline int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { case TX_4X4: above_ec = a[0] != 0; left_ec = l[0] != 0; break; case TX_4X8: above_ec = a[0] != 0; left_ec = !!*(const uint16_t *)l; break; case TX_8X4: above_ec = !!*(const uint16_t *)a; left_ec = l[0] != 0; break; case TX_8X16: above_ec = !!*(const uint16_t *)a; left_ec = !!*(const uint32_t *)l; break; case TX_16X8: above_ec = !!*(const uint32_t *)a; left_ec = !!*(const uint16_t *)l; break; case TX_16X32: above_ec = !!*(const uint32_t *)a; left_ec = !!*(const uint64_t *)l; break; case TX_32X16: above_ec = !!*(const uint64_t *)a; left_ec = !!*(const uint32_t *)l; break; case TX_8X8: above_ec = !!*(const uint16_t *)a; left_ec = !!*(const uint16_t *)l; break; case TX_16X16: above_ec = !!*(const uint32_t *)a; left_ec = !!*(const uint32_t *)l; break; case TX_32X32: above_ec = !!*(const uint64_t *)a; left_ec = !!*(const uint64_t *)l; break; case TX_64X64: above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); break; case TX_32X64: above_ec = !!*(const uint64_t *)a; left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); break; case TX_64X32: above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); left_ec = !!*(const uint64_t *)l; break; case TX_4X16: above_ec = a[0] != 0; left_ec = !!*(const uint32_t *)l; break; case TX_16X4: above_ec = !!*(const uint32_t *)a; left_ec = l[0] != 0; break; case TX_8X32: above_ec = !!*(const uint16_t *)a; left_ec = !!*(const uint64_t *)l; break; case TX_32X8: above_ec = !!*(const uint64_t *)a; left_ec = !!*(const uint16_t *)l; break; case TX_16X64: above_ec = !!*(const uint32_t *)a; left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); break; case TX_64X16: above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); left_ec = !!*(const uint32_t *)l; break; default: assert(0 && "Invalid transform size."); break; } return combine_entropy_contexts(above_ec, left_ec); } static inline TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> 1); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_ENTROPY_H_ aom-3.12.1/av1/common/entropymode.c000066400000000000000000001464531477627663500171060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "aom_mem/aom_mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/reconinter.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" #include "av1/common/txb_common.h" static const aom_cdf_prob default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( INTRA_MODES)] = { { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, 24189, 28165, 29093, 30466) }, { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, 24434, 28658, 30172, 31409) }, { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, 26160, 29336, 29929, 31567) }, { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, 24746, 29585, 30958, 32462) }, { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, 26437, 30261, 31073, 32475) } }, { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, 25381, 29014, 30482, 31436) }, { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, 27610, 29905, 31276, 31794) }, { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, 24469, 27915, 29090, 30492) }, { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, 24649, 29153, 31096, 32210) }, { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, 26001, 29675, 30981, 31994) } }, { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, 25729, 29538, 30305, 32077) }, { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, 23219, 27743, 29211, 30907) }, { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, 30467, 30794, 32086) }, { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, 23878, 28975, 30287, 32252) }, { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, 30072, 30737, 32463) } }, { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, 25060, 29696, 30917, 32409) }, { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, 25225, 29485, 31158, 32342) }, { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, 29118, 30078, 32018) }, { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, 30389, 31536, 32528) }, { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, 25769, 29953, 30983, 32485) } }, { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, 26219, 30214, 31150, 32477) }, { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, 25380, 29653, 31143, 32277) }, { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, 29900, 30523, 32261) }, { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, 24615, 29489, 30883, 32482) }, { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, 31355, 31802, 32593) } } }; static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( 2 * MAX_ANGLE_DELTA + 1)] = { { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } }; static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, 26606, 27418, 27945, 29228, 29685, 30349) }, { AOM_CDF13(18673, 
19845, 22631, 23318, 23950, 24649, 25527, 27364, 28152, 29701, 29984, 30852) }, { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, 25136, 27073, 27830, 29360, 29730, 30659) }, { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, 23703, 24804, 25352, 26575, 27016, 28049) } }; static const aom_cdf_prob default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( UV_INTRA_MODES)] = { { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, 28244, 30059, 30941, 31961) }, { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, 28359, 29505, 29800, 31796) }, { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, 30764, 31777, 32029) }, { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, 28577, 30612, 31355, 32493) }, { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, 31101, 31744, 32363) }, { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, 29711, 31161, 31441, 32550) }, { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, 30245, 31837, 32342, 32667) }, { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, 29267, 30643, 31961, 32461) }, { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, 28443, 30388, 30767, 32416) }, { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, 23174, 28861, 30379, 32175) }, { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, 23527, 27053, 31397, 32148) }, { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, 22482, 25896, 26541, 31819) }, { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, 15255, 15753, 16039, 16606) } }, { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, 15986, 20086, 20995, 22455, 24212) }, { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, 22099, 24228, 24693, 27032, 29472) }, { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, 23138, 24256, 24703, 26679) }, { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, 21520, 22206, 23389, 24182) }, { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, 24911, 25380, 26027, 26376) }, { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, 24780, 25386, 26517, 27176) }, { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, 23188, 23763, 24455, 24940) }, { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, 22336, 23204, 23964, 24793) }, { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, 22494, 23139, 24764, 25989) }, { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, 15534, 20714, 21789, 23443, 24861) }, { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, 15902, 20102, 22696, 23774, 25838) }, { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, 15636, 19676, 20474, 23519, 25208) }, { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, 9875, 10521, 29048) } } }; static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( EXT_PARTITION_TYPES)] = { { AOM_CDF4(19132, 25510, 30392) }, { AOM_CDF4(13928, 19855, 28540) }, { AOM_CDF4(12522, 23679, 28629) }, { AOM_CDF4(9896, 18783, 25853) }, { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 
31171) }, { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) }, { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) }, { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, }; static const aom_cdf_prob default_intra_ext_tx_cdf [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { { { { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, }, { { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, }, { { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, }, { { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, }, }, { { { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) }, { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, }, { { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, }, { { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, }, { { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 
28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, }, }, { { { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, }, { { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, }, { { AOM_CDF5(1127, 12814, 22772, 27483) }, { AOM_CDF5(145, 6761, 11980, 26667) }, { AOM_CDF5(362, 5887, 11678, 16725) }, { AOM_CDF5(385, 15213, 18587, 30693) }, { AOM_CDF5(25, 2914, 23134, 27903) }, { AOM_CDF5(60, 4470, 11749, 23991) }, { AOM_CDF5(37, 3332, 14511, 21448) }, { AOM_CDF5(157, 6320, 13036, 17439) }, { AOM_CDF5(119, 6719, 12906, 29396) }, { AOM_CDF5(47, 5537, 12576, 21499) }, { AOM_CDF5(269, 6076, 11258, 23115) }, { AOM_CDF5(83, 5615, 12001, 17228) }, { AOM_CDF5(1968, 5556, 12023, 18547) }, }, { { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, { AOM_CDF5(6554, 13107, 19661, 26214) }, }, }, }; static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( TX_TYPES)] = { { { 0 }, { 0 }, { 0 }, { 0 }, }, { { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, 22848, 23934, 25474, 27727, 28915, 30631) }, { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, 20408, 22517, 25010, 27116, 28856, 30749) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, }, { { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037) }, { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037) }, { AOM_CDF12(770, 2421, 
5225, 12907, 15819, 18927, 21561, 24089, 26595, 28526, 30529) }, { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037) }, }, { { AOM_CDF2(16384) }, { AOM_CDF2(4167) }, { AOM_CDF2(1998) }, { AOM_CDF2(748) }, }, }; static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294) }; static const aom_cdf_prob default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, 32704, 32708, 32712, 32716, 32720, 32724) }, { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, 32647, 32668, 32672, 32676, 32680, 32684) }, { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, 32677, 32681, 32685, 32689, 32693, 32697) }, { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, 32712, 32716, 32720, 32724, 32728, 32732) }, { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, 32464, 32516, 32560, 32576, 32593, 32622) }, { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, 32413, 32520, 32594, 32622, 32656, 32660) } }; static const aom_cdf_prob default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( SWITCHABLE_FILTERS)] = { { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } }; static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } }; static const aom_cdf_prob default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( INTER_COMPOUND_MODES)] = { { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } }; static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( 2)] = { { AOM_CDF2(16384) }, { AOM_CDF2(26887) }, { AOM_CDF2(27597) }, { AOM_CDF2(30237) } }; static const aom_cdf_prob default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(1875, 11082, 27332) }, { AOM_CDF4(2473, 9996, 26388) }, { AOM_CDF4(4238, 11537, 25926) } }; static const aom_cdf_prob 
default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }; static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( MASKED_COMPOUND_TYPES)] = { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }; static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367, 18452, 19422, 22839, 26127, 29629) }, { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332, 24520, 27470, 29456, 30529, 31656) }, { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730, 18114, 19313, 22521, 26012, 29550) }, { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284, 24985, 25684, 27259, 28883, 30911) }, { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057, 27251, 29173, 30089, 30960, 31933) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 
22528, 24576, 26624, 28672, 30720) }, { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, { AOM_CDF2(26879) } }; static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] [CDF_SIZE(2)] = { { AOM_CDF2(806) }, { AOM_CDF2(16662) }, { AOM_CDF2(20186) }, { AOM_CDF2(26538) } }; static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(26828) }, { AOM_CDF2(24035) }, { AOM_CDF2(12031) }, { AOM_CDF2(10640) }, { AOM_CDF2(2901) } }; static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] [CDF_SIZE(2)] = { { AOM_CDF2(1198) }, { AOM_CDF2(2070) }, { AOM_CDF2(9166) }, { AOM_CDF2(7499) }, { AOM_CDF2(22475) } }; static const aom_cdf_prob default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][CDF_SIZE(2)] = { { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } }; static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] [CDF_SIZE(2)] = { { { AOM_CDF2(4897) }, { AOM_CDF2(1555) }, { AOM_CDF2(4236) }, { AOM_CDF2(8650) }, { AOM_CDF2(904) }, { AOM_CDF2(1444) } }, { { AOM_CDF2(16973) }, { AOM_CDF2(16751) }, { AOM_CDF2(19647) }, { AOM_CDF2(24773) }, { AOM_CDF2(11014) }, { AOM_CDF2(15087) } }, { { AOM_CDF2(29744) }, { AOM_CDF2(30279) }, { AOM_CDF2(31194) }, { AOM_CDF2(31895) }, { AOM_CDF2(26875) }, { AOM_CDF2(30304) } } }; static const aom_cdf_prob default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } }; static const aom_cdf_prob default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } }; static const aom_cdf_prob default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) 
}, { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } }; static const aom_cdf_prob default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } }; static const aom_cdf_prob default_palette_y_mode_cdf [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } } }; static const aom_cdf_prob default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(32461) }, { AOM_CDF2(21488) } }; static const aom_cdf_prob default_palette_y_color_index_cdf [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { { { AOM_CDF2(28710) }, { AOM_CDF2(16384) }, { AOM_CDF2(10553) }, { AOM_CDF2(27036) }, { AOM_CDF2(31603) }, }, { { AOM_CDF3(27877, 30490) }, { AOM_CDF3(11532, 25697) }, { AOM_CDF3(6544, 30234) }, { AOM_CDF3(23018, 28072) }, { AOM_CDF3(31915, 32385) }, }, { { AOM_CDF4(25572, 28046, 30045) }, { AOM_CDF4(9478, 21590, 27256) }, { AOM_CDF4(7248, 26837, 29824) }, { AOM_CDF4(19167, 24486, 28349) }, { AOM_CDF4(31400, 31825, 32250) }, }, { { AOM_CDF5(24779, 26955, 28576, 30282) }, { AOM_CDF5(8669, 20364, 24073, 28093) }, { AOM_CDF5(4255, 27565, 29377, 31067) }, { AOM_CDF5(19864, 23674, 26716, 29530) }, { AOM_CDF5(31646, 31893, 32147, 32426) }, }, { { AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, }, { { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, }, { { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, }, }; static const aom_cdf_prob default_palette_uv_color_index_cdf [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { { { AOM_CDF2(29089) }, { AOM_CDF2(16384) }, { AOM_CDF2(8713) }, { AOM_CDF2(29257) }, { AOM_CDF2(31610) }, }, { { AOM_CDF3(25257, 29145) }, { AOM_CDF3(12287, 27293) }, { AOM_CDF3(7033, 27960) }, { AOM_CDF3(20145, 25405) }, { AOM_CDF3(30608, 31639) }, }, { { AOM_CDF4(24210, 27175, 29903) }, { AOM_CDF4(9888, 22386, 27214) }, { AOM_CDF4(5901, 26053, 29293) }, { 
AOM_CDF4(18318, 22152, 28333) }, { AOM_CDF4(30459, 31136, 31926) }, }, { { AOM_CDF5(22980, 25479, 27781, 29986) }, { AOM_CDF5(8413, 21408, 24859, 28874) }, { AOM_CDF5(2257, 29449, 30594, 31598) }, { AOM_CDF5(19189, 21202, 25915, 28620) }, { AOM_CDF5(31844, 32044, 32281, 32518) }, }, { { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, }, { { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, }, { { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, }, }; static const aom_cdf_prob default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } }; static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } }; static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; static const aom_cdf_prob default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } }; static const aom_cdf_prob default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } }; static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( 30531) }; static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }; static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( 11570) }; static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( 16855) }; static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { 
AOM_CDF4(28160, 32120, 32677) }; static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, { AOM_CDF4(28160, 32120, 32677) }, { AOM_CDF4(28160, 32120, 32677) }, { AOM_CDF4(28160, 32120, 32677) } }; static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { AOM_CDF4(28160, 32120, 32677) }; static const aom_cdf_prob default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } }; static const aom_cdf_prob default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( MAX_SEGMENTS)] = { { AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533), }, { AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), }, { AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), }, }; static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] [CDF_SIZE(MAX_TX_DEPTH + 1)] = { { { AOM_CDF2(19968) }, { AOM_CDF2(19968) }, { AOM_CDF2(24320) } }, { { AOM_CDF3(12272, 30172) }, { AOM_CDF3(12272, 30172) }, { AOM_CDF3(18677, 30848) } }, { { AOM_CDF3(12986, 15180) }, { AOM_CDF3(12986, 15180) }, { AOM_CDF3(24302, 25602) } }, { { AOM_CDF3(5782, 11475) }, { AOM_CDF3(5782, 11475) }, { AOM_CDF3(16803, 22759) } }, }; // Negative values are invalid const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1] = { -1, -1, 0, -1, -1, 4, 3, 2, 1 }; int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, int r, int c, int palette_size, uint8_t *color_order, int *color_idx) { assert(palette_size <= PALETTE_MAX_SIZE); assert(r > 0 || c > 0); // Get color indices of neighbors. int color_neighbors[NUM_PALETTE_NEIGHBORS]; color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; color_neighbors[1] = (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; // The +10 below should not be needed. But we get a warning "array subscript // is above array bounds [-Werror=array-bounds]" without it, possibly due to // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 int scores[PALETTE_MAX_SIZE + 10] = { 0 }; int i; static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { if (color_neighbors[i] >= 0) { scores[color_neighbors[i]] += weights[i]; } } int inverse_color_order[PALETTE_MAX_SIZE]; for (i = 0; i < PALETTE_MAX_SIZE; ++i) { color_order[i] = i; inverse_color_order[i] = i; } // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { int max = scores[i]; int max_idx = i; for (int j = i + 1; j < palette_size; ++j) { if (scores[j] > max) { max = scores[j]; max_idx = j; } } if (max_idx != i) { // Move the score at index 'max_idx' to index 'i', and shift the scores // from 'i' to 'max_idx - 1' by 1. const int max_score = scores[max_idx]; const uint8_t max_color_order = color_order[max_idx]; for (int k = max_idx; k > i; --k) { scores[k] = scores[k - 1]; color_order[k] = color_order[k - 1]; inverse_color_order[color_order[k]] = k; } scores[i] = max_score; color_order[i] = max_color_order; inverse_color_order[color_order[i]] = i; } } if (color_idx != NULL) *color_idx = inverse_color_order[color_map[r * stride + c]]; // Get hash value of context. 
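// Worked example of the hash/lookup step below (illustrative only, derived
// from the weights and multipliers defined in this function): with the
// neighbor weights { 2, 1, 2 } (left, top-left, top) above, three neighbors
// carrying three distinct colors yield sorted scores (2, 2, 1). With the
// hash_multipliers { 1, 2, 2 } below, the hash is
// 2 * 1 + 2 * 2 + 1 * 2 = 8, and
// av1_palette_color_index_context_lookup[8] == 1, so the returned context
// is 1. The five reachable score triples (2,0,0), (2,2,1), (3,2,0), (4,1,0)
// and (5,0,0) hash to 2, 8, 7, 6 and 5 respectively, which the lookup table
// maps to contexts 0 through 4.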
int color_index_ctx_hash = 0; static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { color_index_ctx_hash += scores[i] * hash_multipliers[i]; } assert(color_index_ctx_hash > 0); assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); // Lookup context from hash. const int color_index_ctx = av1_palette_color_index_context_lookup[color_index_ctx_hash]; assert(color_index_ctx >= 0); assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); return color_index_ctx; } void av1_init_mode_probs(FRAME_CONTEXT *fc) { av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); av1_copy(fc->single_ref_cdf, default_single_ref_cdf); av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); av1_copy(fc->newmv_cdf, default_newmv_cdf); av1_copy(fc->zeromv_cdf, default_zeromv_cdf); av1_copy(fc->refmv_cdf, default_refmv_cdf); av1_copy(fc->drl_cdf, default_drl_cdf); av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); av1_copy(fc->obmc_cdf, default_obmc_cdf); av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); av1_copy(fc->compound_type_cdf, default_compound_type_cdf); av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); av1_copy(fc->interintra_cdf, default_interintra_cdf); av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); av1_copy(fc->partition_cdf, default_partition_cdf); av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs); av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) av1_copy(fc->seg.spatial_pred_seg_cdf[i], default_spatial_pred_seg_tree_cdf[i]); av1_copy(fc->tx_size_cdf, default_tx_size_cdf); av1_copy(fc->delta_q_cdf, default_delta_q_cdf); av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); av1_copy(fc->cfl_alpha_cdf, 
default_cfl_alpha_cdf); av1_copy(fc->intrabc_cdf, default_intrabc_cdf); } void av1_set_default_ref_deltas(int8_t *ref_deltas) { assert(ref_deltas != NULL); ref_deltas[INTRA_FRAME] = 1; ref_deltas[LAST_FRAME] = 0; ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; ref_deltas[GOLDEN_FRAME] = -1; ref_deltas[ALTREF2_FRAME] = -1; ref_deltas[ALTREF_FRAME] = -1; } void av1_set_default_mode_deltas(int8_t *mode_deltas) { assert(mode_deltas != NULL); mode_deltas[0] = 0; mode_deltas[1] = 0; } static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_ref_delta_enabled = 1; lf->mode_ref_delta_update = 1; av1_set_default_ref_deltas(lf->ref_deltas); av1_set_default_mode_deltas(lf->mode_deltas); } void av1_setup_frame_contexts(AV1_COMMON *cm) { // Store the frame context into a special slot (not associated with any // reference buffer), so that we can set up cm->pre_fc correctly later // This function must ONLY be called when cm->fc has been initialized with // default probs, either by av1_setup_past_independence or after manually // initializing them *cm->default_frame_context = *cm->fc; // TODO(jack.haughton@argondesign.com): don't think this should be necessary, // but could do with fuller testing if (cm->tiles.large_scale) { for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { RefCntBuffer *const buf = get_ref_frame_buf(cm, i); if (buf != NULL) buf->frame_context = *cm->fc; } for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; } } void av1_setup_past_independence(AV1_COMMON *cm) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). av1_clearall_segfeatures(&cm->seg); if (cm->cur_frame->seg_map) { memset(cm->cur_frame->seg_map, 0, (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols)); } // reset mode ref deltas av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); set_default_lf_deltas(&cm->lf); av1_default_coef_probs(cm); av1_init_mode_probs(cm->fc); av1_init_mv_probs(cm); cm->fc->initialized = 1; av1_setup_frame_contexts(cm); } aom-3.12.1/av1/common/entropymode.h000066400000000000000000000232171477627663500171030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ #define AOM_AV1_COMMON_ENTROPYMODE_H_ #include "aom_ports/bitops.h" #include "av1/common/entropy.h" #include "av1/common/entropymv.h" #include "av1/common/filter.h" #include "av1/common/seg_common.h" #ifdef __cplusplus extern "C" { #endif #define BLOCK_SIZE_GROUPS 4 #define TX_SIZE_CONTEXTS 3 #define INTER_OFFSET(mode) ((mode)-NEARESTMV) #define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) // Number of possible contexts for a color index. // As can be seen from av1_get_palette_color_index_context(), the possible // contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). 
These are mapped to // a value from 0 to 4 using 'av1_palette_color_index_context_lookup' table. #define PALETTE_COLOR_INDEX_CONTEXTS 5 // Palette Y mode context for a block is determined by number of neighboring // blocks (top and/or left) using a palette for Y plane. So, possible Y mode' // context values are: // 0 if neither left nor top block uses palette for Y plane, // 1 if exactly one of left or top block uses palette for Y plane, and // 2 if both left and top blocks use palette for Y plane. #define PALETTE_Y_MODE_CONTEXTS 3 // Palette UV mode context for a block is determined by whether this block uses // palette for the Y plane. So, possible values are: // 0 if this block doesn't use palette for Y plane. // 1 if this block uses palette for Y plane (i.e. Y palette size > 0). #define PALETTE_UV_MODE_CONTEXTS 2 // Map the number of pixels in a block size to a context // 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0 // 128(BLOCK_8X16, BLOCK_16x8) -> 1 // ... // 4096(BLOCK_64X64) -> 6 #define PALATTE_BSIZE_CTXS 7 #define MAX_COLOR_CONTEXT_HASH 8 #define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. #define KF_MODE_CONTEXTS 5 struct AV1Common; typedef struct { const int16_t *scan; const int16_t *iscan; } SCAN_ORDER; typedef struct frame_contexts { aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] [CDF_SIZE(2)]; aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] [CDF_SIZE(3)]; aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] [CDF_SIZE(4)]; aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] [CDF_SIZE(BR_CDF_SIZE)]; aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] [CDF_SIZE(INTER_COMPOUND_MODES)]; aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] [CDF_SIZE(MASKED_COMPOUND_TYPES)]; aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] [CDF_SIZE(INTERINTRA_MODES)]; aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [CDF_SIZE(PALETTE_COLORS)]; aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [CDF_SIZE(PALETTE_COLORS)]; aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] [CDF_SIZE(2)]; aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob 
comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] [CDF_SIZE(2)]; aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; nmv_context nmvc; nmv_context ndvc; aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; struct segmentation_probs seg; aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] [CDF_SIZE(UV_INTRA_MODES)]; aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] [CDF_SIZE(SWITCHABLE_FILTERS)]; /* kf_y_cdf is discarded after use, so does not require persistent storage. However, we keep it with the other CDFs in this struct since it needs to be copied to each tile to support parallelism just like the others. */ aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] [CDF_SIZE(INTRA_MODES)]; aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] [CDF_SIZE(MAX_TX_DEPTH + 1)]; aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [CDF_SIZE(TX_TYPES)]; aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] [CDF_SIZE(TX_TYPES)]; aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; int initialized; } FRAME_CONTEXT; static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, }; static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 }, }; void av1_set_default_ref_deltas(int8_t *ref_deltas); void av1_set_default_mode_deltas(int8_t *mode_deltas); void av1_setup_frame_contexts(struct AV1Common *cm); void av1_setup_past_independence(struct AV1Common *cm); // 
Returns (int)ceil(log2(n)). static inline int av1_ceil_log2(int n) { if (n < 2) return 0; return get_msb(n - 1) + 1; } // Returns the context for palette color index at row 'r' and column 'c', // along with the 'color_order' of neighbors and the 'color_idx'. // The 'color_map' is a 2D array with the given 'stride'. int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, int r, int c, int palette_size, uint8_t *color_order, int *color_idx); extern const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1]; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_ENTROPYMODE_H_ aom-3.12.1/av1/common/entropymv.c000066400000000000000000000053201477627663500165670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/av1_common_int.h" #include "av1/common/entropymv.h" static const nmv_context default_nmv_context = { { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf { { // Vertical component { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762, 32767) }, // class_cdf // fp { { AOM_CDF4(16384, 24576, 26624) }, { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf { AOM_CDF2(128 * 128) }, // sign_cdf { AOM_CDF2(160 * 128) }, // class0_hp_cdf { AOM_CDF2(128 * 128) }, // hp_cdf { AOM_CDF2(216 * 128) }, // class0_cdf { { AOM_CDF2(128 * 136) }, { AOM_CDF2(128 * 140) }, { AOM_CDF2(128 * 148) }, { AOM_CDF2(128 * 160) }, { AOM_CDF2(128 * 176) }, { AOM_CDF2(128 * 192) }, { AOM_CDF2(128 * 224) }, { AOM_CDF2(128 * 234) }, { AOM_CDF2(128 * 234) }, { AOM_CDF2(128 * 240) } }, // bits_cdf }, { // Horizontal component { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762, 32767) }, // class_cdf // fp { { AOM_CDF4(16384, 24576, 26624) }, { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf { AOM_CDF2(128 * 128) }, // sign_cdf { AOM_CDF2(160 * 128) }, // class0_hp_cdf { AOM_CDF2(128 * 128) }, // hp_cdf { AOM_CDF2(216 * 128) }, // class0_cdf { { AOM_CDF2(128 * 136) }, { AOM_CDF2(128 * 140) }, { AOM_CDF2(128 * 148) }, { AOM_CDF2(128 * 160) }, { AOM_CDF2(128 * 176) }, { AOM_CDF2(128 * 192) }, { AOM_CDF2(128 * 224) }, { AOM_CDF2(128 * 234) }, { AOM_CDF2(128 * 234) }, { AOM_CDF2(128 * 240) } }, // bits_cdf } }, }; void av1_init_mv_probs(AV1_COMMON *cm) { // NB: this sets CDFs too cm->fc->nmvc = default_nmv_context; cm->fc->ndvc = default_nmv_context; } aom-3.12.1/av1/common/entropymv.h000066400000000000000000000062171477627663500166020ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_ENTROPYMV_H_ #define AOM_AV1_COMMON_ENTROPYMV_H_ #include "config/aom_config.h" #include "aom_dsp/prob.h" #include "av1/common/mv.h" #ifdef __cplusplus extern "C" { #endif struct AV1Common; void av1_init_mv_probs(struct AV1Common *cm); #define MV_UPDATE_PROB 252 /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 enum { MV_JOINT_ZERO = 0, /* Zero vector */ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ } UENUM1BYTE(MV_JOINT_TYPE); static inline int mv_joint_vertical(MV_JOINT_TYPE type) { return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; } static inline int mv_joint_horizontal(MV_JOINT_TYPE type) { return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; } /* Symbols for coding magnitude class of nonzero components */ #define MV_CLASSES 11 enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ MV_CLASS_2 = 2, /* (4, 8] integer pel */ MV_CLASS_3 = 3, /* (8, 16] integer pel */ MV_CLASS_4 = 4, /* (16, 32] integer pel */ MV_CLASS_5 = 5, /* (32, 64] integer pel */ MV_CLASS_6 = 6, /* (64, 128] integer pel */ MV_CLASS_7 = 7, /* (128, 256] integer pel */ MV_CLASS_8 = 8, /* (256, 512] integer pel */ MV_CLASS_9 = 9, /* (512, 1024] integer pel */ MV_CLASS_10 = 10, /* (1024,2048] integer pel */ } UENUM1BYTE(MV_CLASS_TYPE); #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) #define MV_BITS_CONTEXTS 6 #define MV_FP_SIZE 4 #define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) #define MV_MAX ((1 << MV_MAX_BITS) - 1) #define MV_VALS ((MV_MAX << 1) + 1) #define MV_IN_USE_BITS 14 #define MV_UPP (1 << MV_IN_USE_BITS) #define MV_LOW (-(1 << MV_IN_USE_BITS)) typedef struct { aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; aom_cdf_prob sign_cdf[CDF_SIZE(2)]; aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; aom_cdf_prob hp_cdf[CDF_SIZE(2)]; aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; } nmv_component; typedef struct { aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; nmv_component comps[2]; } nmv_context; enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, } SENUM1BYTE(MvSubpelPrecision); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_ENTROPYMV_H_ aom-3.12.1/av1/common/enums.h000066400000000000000000000451201477627663500156620ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_ENUMS_H_ #define AOM_AV1_COMMON_ENUMS_H_ #include "config/aom_config.h" #include "aom/aom_codec.h" #include "aom/aom_integer.h" #include "aom_dsp/txfm_common.h" #include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { #endif /*! @file */ /*!\cond */ // Max superblock size #define MAX_SB_SIZE_LOG2 7 #define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) #define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) // Min superblock size #define MIN_SB_SIZE_LOG2 6 // Pixels per Mode Info (MI) unit #define MI_SIZE_LOG2 2 #define MI_SIZE (1 << MI_SIZE_LOG2) // MI-units per max superblock (MI Block - MIB) #define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) #define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2) // MI-units per min superblock #define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2) // Mask to extract MI offset within max MIB #define MAX_MIB_MASK (MAX_MIB_SIZE - 1) // Maximum number of tile rows and tile columns #define MAX_TILE_ROWS 64 #define MAX_TILE_COLS 64 #define MAX_VARTX_DEPTH 2 #define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) #define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) #define MAX_PALETTE_SQUARE (64 * 64) // Maximum number of colors in a palette. #define PALETTE_MAX_SIZE 8 // Minimum number of colors in a palette. #define PALETTE_MIN_SIZE 2 #define FRAME_OFFSET_BITS 5 #define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) // 4 frame filter levels: y plane vertical, y plane horizontal, // u plane, and v plane #define FRAME_LF_COUNT 4 #define DEFAULT_DELTA_LF_MULTI 0 #define MAX_MODE_LF_DELTAS 2 #define DIST_PRECISION_BITS 4 #define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 #define PROFILE_BITS 3 // The following three profiles are currently defined. // Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. // Profile 1. 8-bit and 10-bit 4:4:4 // Profile 2. 8-bit and 10-bit 4:2:2 // 12-bit 4:0:0, 4:2:2 and 4:4:4 // Since we have three bits for the profiles, it can be extended later. enum { PROFILE_0, PROFILE_1, PROFILE_2, MAX_PROFILES, } SENUM1BYTE(BITSTREAM_PROFILE); #define OP_POINTS_CNT_MINUS_1_BITS 5 #define OP_POINTS_IDC_BITS 12 // Note: Some enums use the attribute 'packed' to use smallest possible integer // type, so that we can save memory when they are used in structs/arrays. typedef enum ATTRIBUTE_PACKED { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, BLOCK_SIZES_ALL, BLOCK_SIZES = BLOCK_4X16, BLOCK_INVALID = 255, BLOCK_LARGEST = (BLOCK_SIZES - 1) } BLOCK_SIZE; // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 #define SQR_BLOCK_SIZES 6 // Partition types. 
R: Recursive // // NONE HORZ VERT SPLIT // +-------+ +-------+ +---+---+ +---+---+ // | | | | | | | | R | R | // | | +-------+ | | | +---+---+ // | | | | | | | | R | R | // +-------+ +-------+ +---+---+ +---+---+ // // HORZ_A HORZ_B VERT_A VERT_B // +---+---+ +-------+ +---+---+ +---+---+ // | | | | | | | | | | | // +---+---+ +---+---+ +---+ | | +---+ // | | | | | | | | | | | // +-------+ +---+---+ +---+---+ +---+---+ // // HORZ_4 VERT_4 // +-----+ +-+-+-+ // +-----+ | | | | // +-----+ | | | | // +-----+ +-+-+-+ enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, PARTITION_HORZ_A, // HORZ split and the top partition is split again PARTITION_HORZ_B, // HORZ split and the bottom partition is split again PARTITION_VERT_A, // VERT split and the left partition is split again PARTITION_VERT_B, // VERT split and the right partition is split again PARTITION_HORZ_4, // 4:1 horizontal partition PARTITION_VERT_4, // 4:1 vertical partition EXT_PARTITION_TYPES, PARTITION_TYPES = PARTITION_SPLIT + 1, PARTITION_INVALID = 255 } UENUM1BYTE(PARTITION_TYPE); typedef char PARTITION_CONTEXT; #define PARTITION_PLOFFSET 4 // number of probability models per block size #define PARTITION_BLOCK_SIZES 5 #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) #define TX_SIZE_LUMA_MIN (TX_4X4) /* We don't need to code a transform size unless the allowed size is at least one more than the minimum. */ #define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) // Maximum tx_size categories #define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) #define MAX_TX_DEPTH 2 #define MAX_TX_SIZE_LOG2 (6) #define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) #define MIN_TX_SIZE_LOG2 2 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) #define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) // Pad 4 extra columns to remove horizontal availability check. #define TX_PAD_HOR_LOG2 2 #define TX_PAD_HOR 4 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. #define TX_PAD_TOP 0 #define TX_PAD_BOTTOM 4 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) // Pad 16 extra bytes to avoid reading overflow in SIMD optimization. 
#define TX_PAD_END 16 #define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END) // Number of maximum size transform blocks in the maximum size superblock #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2) #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) // frame transform mode enum { ONLY_4X4, // use only 4x4 transform TX_MODE_LARGEST, // transform size is the largest possible for pu size TX_MODE_SELECT, // transform specified for each block TX_MODES, } UENUM1BYTE(TX_MODE); // 1D tx types enum { DCT_1D, ADST_1D, FLIPADST_1D, IDTX_1D, TX_TYPES_1D, } UENUM1BYTE(TX_TYPE_1D); enum { REG_REG, REG_SMOOTH, REG_SHARP, SMOOTH_REG, SMOOTH_SMOOTH, SMOOTH_SHARP, SHARP_REG, SHARP_SMOOTH, SHARP_SHARP, } UENUM1BYTE(DUAL_FILTER_TYPE); #define EXT_TX_SIZES 4 // number of sizes that use extended transforms #define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER #define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA enum { AOM_LAST_FLAG = 1 << 0, AOM_LAST2_FLAG = 1 << 1, AOM_LAST3_FLAG = 1 << 2, AOM_GOLD_FLAG = 1 << 3, AOM_BWD_FLAG = 1 << 4, AOM_ALT2_FLAG = 1 << 5, AOM_ALT_FLAG = 1 << 6, AOM_REFFRAME_ALL = (1 << 7) - 1 } UENUM1BYTE(AOM_REFFRAME); enum { UNIDIR_COMP_REFERENCE, BIDIR_COMP_REFERENCE, COMP_REFERENCE_TYPES, } UENUM1BYTE(COMP_REFERENCE_TYPE); enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); #define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) #define CFL_INDEX_ZERO CFL_ALPHABET_SIZE #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); enum { CFL_SIGN_ZERO, CFL_SIGN_NEG, CFL_SIGN_POS, CFL_SIGNS } UENUM1BYTE(CFL_SIGN_TYPE); enum { CFL_DISALLOWED, CFL_ALLOWED, CFL_ALLOWED_TYPES } UENUM1BYTE(CFL_ALLOWED_TYPE); // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) // CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8 #define CFL_SIGN_U(js) (((js + 1) * 11) >> 5) // CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8 #define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js)) // There is no context when the alpha for a given plane is zero. // So there are 2 fewer contexts than joint signs. #define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS) #define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS) // Also, the contexts are symmetric under swapping the planes. #define CFL_CONTEXT_V(js) \ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) enum { PALETTE_MAP, COLOR_MAP_TYPES, } UENUM1BYTE(COLOR_MAP_TYPE); enum { TWO_COLORS, THREE_COLORS, FOUR_COLORS, FIVE_COLORS, SIX_COLORS, SEVEN_COLORS, EIGHT_COLORS, PALETTE_SIZES } UENUM1BYTE(PALETTE_SIZE); enum { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, PALETTE_COLOR_FOUR, PALETTE_COLOR_FIVE, PALETTE_COLOR_SIX, PALETTE_COLOR_SEVEN, PALETTE_COLOR_EIGHT, PALETTE_COLORS } UENUM1BYTE(PALETTE_COLOR); // Note: All directional predictors must be between V_PRED and D67_PRED (both // inclusive). 
enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal D45_PRED, // Directional 45 degree D135_PRED, // Directional 135 degree D113_PRED, // Directional 113 degree D157_PRED, // Directional 157 degree D203_PRED, // Directional 203 degree D67_PRED, // Directional 67 degree SMOOTH_PRED, // Combination of horizontal and vertical interpolation SMOOTH_V_PRED, // Vertical interpolation SMOOTH_H_PRED, // Horizontal interpolation PAETH_PRED, // Predict from the direction of smallest gradient NEARESTMV, NEARMV, GLOBALMV, NEWMV, // Compound ref compound modes NEAREST_NEARESTMV, NEAR_NEARMV, NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, NEW_NEARMV, GLOBAL_GLOBALMV, NEW_NEWMV, MB_MODE_COUNT, PRED_MODE_INVALID = MB_MODE_COUNT, INTRA_MODE_START = DC_PRED, INTRA_MODE_END = NEARESTMV, DIR_MODE_START = V_PRED, DIR_MODE_END = D67_PRED + 1, INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, SINGLE_INTER_MODE_START = NEARESTMV, SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, COMP_INTER_MODE_START = NEAREST_NEARESTMV, COMP_INTER_MODE_END = MB_MODE_COUNT, COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, INTER_MODE_START = NEARESTMV, INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks } UENUM1BYTE(PREDICTION_MODE); // TODO(ltrudeau) Do we really want to pack this? // TODO(ltrudeau) Do we match with PREDICTION_MODE? enum { UV_DC_PRED, // Average of above and left pixels UV_V_PRED, // Vertical UV_H_PRED, // Horizontal UV_D45_PRED, // Directional 45 degree UV_D135_PRED, // Directional 135 degree UV_D113_PRED, // Directional 113 degree UV_D157_PRED, // Directional 157 degree UV_D203_PRED, // Directional 203 degree UV_D67_PRED, // Directional 67 degree UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation UV_SMOOTH_V_PRED, // Vertical interpolation UV_SMOOTH_H_PRED, // Horizontal interpolation UV_PAETH_PRED, // Predict from the direction of smallest gradient UV_CFL_PRED, // Chroma-from-Luma UV_INTRA_MODES, UV_MODE_INVALID, // For uv_mode in inter blocks } UENUM1BYTE(UV_PREDICTION_MODE); // Number of top model rd to store for pruning y modes in intra mode decision #define TOP_INTRA_MODEL_COUNT 4 // Total number of luma intra prediction modes (include both directional and // non-directional modes) // Because there are 8 directional modes, each has additional 6 delta angles. 
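// Worked out with the constants defined further below (DIRECTIONAL_MODES = 8,
// MAX_ANGLE_DELTA = 3): PAETH_PRED - DC_PRED + 1 = 13 base luma modes, and each
// directional mode adds 2 * MAX_ANGLE_DELTA = 6 non-zero angle deltas, giving
// 13 + 8 * 6 = 61 in total.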
#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8) enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED MOTION_MODES } UENUM1BYTE(MOTION_MODE); enum { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES } UENUM1BYTE(INTERINTRA_MODE); enum { COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD, COMPOUND_TYPES, MASKED_COMPOUND_TYPES = 2, } UENUM1BYTE(COMPOUND_TYPE); enum { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, FILTER_D157_PRED, FILTER_PAETH_PRED, FILTER_INTRA_MODES, } UENUM1BYTE(FILTER_INTRA_MODE); enum { SEQ_LEVEL_2_0, SEQ_LEVEL_2_1, SEQ_LEVEL_2_2, SEQ_LEVEL_2_3, SEQ_LEVEL_3_0, SEQ_LEVEL_3_1, SEQ_LEVEL_3_2, SEQ_LEVEL_3_3, SEQ_LEVEL_4_0, SEQ_LEVEL_4_1, SEQ_LEVEL_4_2, SEQ_LEVEL_4_3, SEQ_LEVEL_5_0, SEQ_LEVEL_5_1, SEQ_LEVEL_5_2, SEQ_LEVEL_5_3, SEQ_LEVEL_6_0, SEQ_LEVEL_6_1, SEQ_LEVEL_6_2, SEQ_LEVEL_6_3, SEQ_LEVEL_7_0, SEQ_LEVEL_7_1, SEQ_LEVEL_7_2, SEQ_LEVEL_7_3, SEQ_LEVEL_8_0, SEQ_LEVEL_8_1, SEQ_LEVEL_8_2, SEQ_LEVEL_8_3, SEQ_LEVELS, SEQ_LEVEL_MAX = 31, SEQ_LEVEL_KEEP_STATS = 32, } UENUM1BYTE(AV1_LEVEL); #define LEVEL_BITS 5 #define DIRECTIONAL_MODES 8 #define MAX_ANGLE_DELTA 3 #define ANGLE_STEP 3 #define INTER_MODES (1 + NEWMV - NEARESTMV) #define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) #define SKIP_CONTEXTS 3 #define SKIP_MODE_CONTEXTS 3 #define COMP_INDEX_CONTEXTS 6 #define COMP_GROUP_IDX_CONTEXTS 6 #define NMV_CONTEXTS 3 #define NEWMV_MODE_CONTEXTS 6 #define GLOBALMV_MODE_CONTEXTS 2 #define REFMV_MODE_CONTEXTS 6 #define DRL_MODE_CONTEXTS 3 #define GLOBALMV_OFFSET 3 #define REFMV_OFFSET 4 #define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) #define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) #define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) #define COMP_NEWMV_CTXS 5 #define INTER_MODE_CONTEXTS 8 #define DELTA_Q_SMALL 3 #define DELTA_Q_PROBS (DELTA_Q_SMALL) #define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 #define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 #define DEFAULT_DELTA_Q_RES_DUCKY_ENCODE 4 #define DELTA_LF_SMALL 3 #define DELTA_LF_PROBS (DELTA_LF_SMALL) #define DEFAULT_DELTA_LF_RES 2 /* Segment Feature Masks */ #define MAX_MV_REF_CANDIDATES 2 #define MAX_REF_MV_STACK_SIZE 8 #define USABLE_REF_MV_STACK_SIZE 4 #define REF_CAT_LEVEL 640 #define INTRA_INTER_CONTEXTS 4 #define COMP_INTER_CONTEXTS 5 #define REF_CONTEXTS 3 #define COMP_REF_TYPE_CONTEXTS 5 #define UNI_COMP_REF_CONTEXTS 3 #define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) typedef uint8_t TXFM_CONTEXT; // An enum for single reference types (and some derived values). enum { NONE_FRAME = -1, INTRA_FRAME, LAST_FRAME, LAST2_FRAME, LAST3_FRAME, GOLDEN_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME, REF_FRAMES, // Extra/scratch reference frame. It may be: // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()). EXTREF_FRAME = REF_FRAMES, // Number of inter (non-intra) reference types. INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1, // Number of forward (aka past) reference types. FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1, // Number of backward (aka future) reference types. BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1, SINGLE_REFS = FWD_REFS + BWD_REFS, }; #define REF_FRAMES_LOG2 3 // REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new // frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the // encoder in the cpi->scaled_ref_buf array. 
// The encoder uses FRAME_BUFFERS only in GOOD and REALTIME encoding modes. // The decoder also uses FRAME_BUFFERS. #define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME) // During allintra encoding, one reference frame buffer is free to be used again // only after another frame buffer is stored as the reference frame. Hence, it // is necessary and sufficient to maintain only two reference frame buffers in // this case. #define FRAME_BUFFERS_ALLINTRA 2 #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) // Select all the decoded frame buffer slots #define SELECT_ALL_BUF_SLOTS 0xFF enum { LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME } LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME } LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME } LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME } BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME } ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME } TOTAL_UNIDIR_COMP_REFS, // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs // that are explicitly signaled. UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, } UENUM1BYTE(UNIDIR_COMP_REF); #define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) #define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS) // NOTE: A limited number of unidirectional reference pairs can be signalled for // compound prediction. The use of skip mode, on the other hand, makes it // possible to have a reference pair not listed for explicit signaling. #define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS) // Note: It includes single and compound references. So, it can take values from // NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum. typedef int8_t MV_REFERENCE_FRAME; /*!\endcond */ /*!\enum RestorationType * \brief This enumeration defines various restoration types supported */ typedef enum { RESTORE_NONE, /**< No restoration */ RESTORE_WIENER, /**< Separable Wiener restoration */ RESTORE_SGRPROJ, /**< Selfguided restoration */ RESTORE_SWITCHABLE, /**< Switchable restoration */ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */ RESTORE_TYPES = 4, /**< Num Restore types */ } RestorationType; /*!\cond */ // Picture prediction structures (0-13 are predefined) in scalability metadata. enum { SCALABILITY_L1T2 = 0, SCALABILITY_L1T3 = 1, SCALABILITY_L2T1 = 2, SCALABILITY_L2T2 = 3, SCALABILITY_L2T3 = 4, SCALABILITY_S2T1 = 5, SCALABILITY_S2T2 = 6, SCALABILITY_S2T3 = 7, SCALABILITY_L2T1h = 8, SCALABILITY_L2T2h = 9, SCALABILITY_L2T3h = 10, SCALABILITY_S2T1h = 11, SCALABILITY_S2T2h = 12, SCALABILITY_S2T3h = 13, SCALABILITY_SS = 14 } UENUM1BYTE(SCALABILITY_STRUCTURES); #define SUPERRES_SCALE_BITS 3 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) // In large_scale_tile coding, external references are used. #define MAX_EXTERNAL_REFERENCES 128 #define MAX_TILES 512 /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_ENUMS_H_ aom-3.12.1/av1/common/filter.h000066400000000000000000000310151477627663500160160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_FILTER_H_ #define AOM_AV1_COMMON_FILTER_H_ #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif #define MAX_FILTER_TAP 12 typedef enum ATTRIBUTE_PACKED { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH, MULTITAP_SHARP, BILINEAR, // Encoder side only filters MULTITAP_SHARP2, INTERP_FILTERS_ALL, SWITCHABLE_FILTERS = BILINEAR, SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, INTERP_INVALID = 0xff, } InterpFilter; enum { USE_2_TAPS_ORIG = 0, // This is used in temporal filtering. USE_2_TAPS, USE_4_TAPS, USE_8_TAPS, } UENUM1BYTE(SUBPEL_SEARCH_TYPE); enum { INTERP_EVAL_LUMA_EVAL_CHROMA = 0, INTERP_SKIP_LUMA_EVAL_CHROMA, INTERP_EVAL_INVALID, INTERP_SKIP_LUMA_SKIP_CHROMA, } UENUM1BYTE(INTERP_EVAL_PLANE); enum { INTERP_HORZ_NEQ_VERT_NEQ = 0, INTERP_HORZ_EQ_VERT_NEQ, INTERP_HORZ_NEQ_VERT_EQ, INTERP_HORZ_EQ_VERT_EQ, INTERP_PRED_TYPE_ALL, } UENUM1BYTE(INTERP_PRED_TYPE); // Pack two InterpFilter's into a uint32_t: since there are at most 10 filters, // we can use 16 bits for each and have more than enough space. This reduces // argument passing and unifies the operation of setting a (pair of) filters. typedef struct InterpFilters { uint16_t y_filter; uint16_t x_filter; } InterpFilters; typedef union int_interpfilters { uint32_t as_int; InterpFilters as_filters; } int_interpfilters; static inline InterpFilter av1_extract_interp_filter(int_interpfilters filters, int dir) { return (InterpFilter)((dir) ? filters.as_filters.x_filter : filters.as_filters.y_filter); } static inline int_interpfilters av1_broadcast_interp_filter( InterpFilter filter) { int_interpfilters filters; filters.as_filters.x_filter = filter; filters.as_filters.y_filter = filter; return filters; } static inline InterpFilter av1_unswitchable_filter(InterpFilter filter) { return filter == SWITCHABLE ? 
EIGHTTAP_REGULAR : filter; } /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ #define LOG_SWITCHABLE_FILTERS 2 #define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) #define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) #define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) #define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) typedef struct InterpFilterParams { const int16_t *filter_ptr; uint16_t taps; InterpFilter interp_filter; } InterpFilterParams; DECLARE_ALIGNED(256, static const InterpKernel, av1_bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } }; DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } }; DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } }; DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } }; DECLARE_ALIGNED(256, static const int16_t, av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = { { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 }, { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 }, { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 }, { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 }, { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 }, { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 }, { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 }, { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 }, { -2, 4, -7, 13, -24, 71, 91, -26, 13, 
-7, 4, -2 }, { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 }, { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 }, { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 }, { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 }, { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 }, { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 } }; static const InterpFilterParams av1_interp_filter_params_list[INTERP_FILTERS_ALL] = { { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR }, { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, EIGHTTAP_SMOOTH }, { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, MULTITAP_SHARP }, { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, // The following filters are for encoder only, and now they are used in // temporal filtering. The predictor block size >= 16 in temporal filter. { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 }, }; // A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel // MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static const InterpFilterParams av1_intrabc_filter_params = { av1_intrabc_bilinear_filter, 2, BILINEAR }; DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } }; DECLARE_ALIGNED(256, static const InterpKernel, av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } }; static const uint16_t av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } }; // For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR }, { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, EIGHTTAP_SMOOTH }, { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR }, { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, }; static inline const InterpFilterParams * 
av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, const int w) { if (w <= 4 && interp_filter != MULTITAP_SHARP2) return &av1_interp_4tap[interp_filter]; return &av1_interp_filter_params_list[interp_filter]; } static inline const int16_t *av1_get_interp_filter_kernel( const InterpFilter interp_filter, int subpel_search) { assert(subpel_search >= USE_2_TAPS); return (subpel_search == USE_2_TAPS) ? av1_interp_4tap[BILINEAR].filter_ptr : ((subpel_search == USE_4_TAPS) ? av1_interp_4tap[interp_filter].filter_ptr : av1_interp_filter_params_list[interp_filter].filter_ptr); } static inline const int16_t *av1_get_interp_filter_subpel_kernel( const InterpFilterParams *const filter_params, const int subpel) { return filter_params->filter_ptr + filter_params->taps * subpel; } static inline const InterpFilterParams *av1_get_filter(int subpel_search) { assert(subpel_search >= USE_2_TAPS); switch (subpel_search) { case USE_2_TAPS: return &av1_interp_4tap[BILINEAR]; case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR]; case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR]; default: assert(0); return NULL; } } static inline void reset_interp_filter_allowed_mask( uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { uint16_t tmp = (~(1 << filt_type)) & 0xffff; *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK); } static inline void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { *allow_interp_mask |= (1 << filt_type); } static inline uint8_t get_interp_filter_allowed_mask( uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) { return (allow_interp_mask >> filt_type) & 1; } static inline int get_filter_tap(const InterpFilterParams *const filter_params, int subpel_qn) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_qn & SUBPEL_MASK); if (filter_params->taps == 12) { return 12; } if (filter[0] | filter[7]) { return 8; } if (filter[1] | filter[6]) { return 6; } #if CONFIG_SVT_AV1 if (filter[2] | filter[5]) { return 4; } return 2; #else return 4; #endif } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_FILTER_H_ aom-3.12.1/av1/common/frame_buffers.c000066400000000000000000000061001477627663500173270ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "av1/common/frame_buffers.h" #include "aom_mem/aom_mem.h" int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) { assert(list != NULL); av1_free_internal_frame_buffers(list); list->num_internal_frame_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; list->int_fb = (InternalFrameBuffer *)aom_calloc( list->num_internal_frame_buffers, sizeof(*list->int_fb)); if (list->int_fb == NULL) { list->num_internal_frame_buffers = 0; return 1; } return 0; } void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { int i; assert(list != NULL); for (i = 0; i < list->num_internal_frame_buffers; ++i) { aom_free(list->int_fb[i].data); list->int_fb[i].data = NULL; } aom_free(list->int_fb); list->int_fb = NULL; list->num_internal_frame_buffers = 0; } void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { int i; assert(list != NULL); for (i = 0; i < list->num_internal_frame_buffers; ++i) { if (list->int_fb[i].data && !list->int_fb[i].in_use) memset(list->int_fb[i].data, 0, list->int_fb[i].size); } } int av1_get_frame_buffer(void *cb_priv, size_t min_size, aom_codec_frame_buffer_t *fb) { int i; InternalFrameBufferList *const int_fb_list = (InternalFrameBufferList *)cb_priv; if (int_fb_list == NULL) return -1; // Find a free frame buffer. for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { if (!int_fb_list->int_fb[i].in_use) break; } if (i == int_fb_list->num_internal_frame_buffers) return -1; if (int_fb_list->int_fb[i].size < min_size) { aom_free(int_fb_list->int_fb[i].data); // The data must be zeroed to fix a valgrind error from the C loop filter // due to access uninitialized memory in frame border. It could be // skipped if border were totally removed. int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size); if (!int_fb_list->int_fb[i].data) { int_fb_list->int_fb[i].size = 0; return -1; } int_fb_list->int_fb[i].size = min_size; } fb->data = int_fb_list->int_fb[i].data; fb->size = int_fb_list->int_fb[i].size; int_fb_list->int_fb[i].in_use = 1; // Set the frame buffer's private data to point at the internal frame buffer. fb->priv = &int_fb_list->int_fb[i]; return 0; } int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; (void)cb_priv; if (int_fb) int_fb->in_use = 0; return 0; } aom-3.12.1/av1/common/frame_buffers.h000066400000000000000000000041711477627663500173420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_ #define AOM_AV1_COMMON_FRAME_BUFFERS_H_ #include "aom/aom_frame_buffer.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif typedef struct InternalFrameBuffer { uint8_t *data; size_t size; int in_use; } InternalFrameBuffer; typedef struct InternalFrameBufferList { int num_internal_frame_buffers; InternalFrameBuffer *int_fb; } InternalFrameBufferList; // Initializes |list|. Returns 0 on success. 
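// Rough usage sketch (illustration only, not code used by the library): when no
// external callbacks are registered via aom_codec_set_frame_buffer_functions(),
// the decoder uses the functions declared below with an internal list in an
// equivalent way.
//
//   InternalFrameBufferList list = { 0, NULL };
//   if (av1_alloc_internal_frame_buffers(&list)) { /* allocation failed */ }
//   aom_codec_frame_buffer_t fb;
//   if (av1_get_frame_buffer(&list, /*min_size=*/1024, &fb) == 0) {
//     /* ... write decoded pixels into fb.data ... */
//     av1_release_frame_buffer(&list, &fb);
//   }
//   av1_free_internal_frame_buffers(&list);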
int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list); // Free any data allocated to the frame buffers. void av1_free_internal_frame_buffers(InternalFrameBufferList *list); // Zeros all unused internal frame buffers. In particular, this zeros the // frame borders. Call this function after a sequence header change to // re-initialize the frame borders for the different width, height, or bit // depth. void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list); // Callback used by libaom to request an external frame buffer. |cb_priv| // Callback private data, which points to an InternalFrameBufferList. // |min_size| is the minimum size in bytes needed to decode the next frame. // |fb| pointer to the frame buffer. int av1_get_frame_buffer(void *cb_priv, size_t min_size, aom_codec_frame_buffer_t *fb); // Callback used by libaom when there are no references to the frame buffer. // |cb_priv| is not used. |fb| pointer to the frame buffer. int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_ aom-3.12.1/av1/common/idct.c000066400000000000000000000323471477627663500154600ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_ports/mem.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/av1_txfm.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #include "av1/common/idct.h" int av1_get_tx_scale(const TX_SIZE tx_size) { const int pels = tx_size_2d[tx_size]; // Largest possible pels is 4096 (64x64). return (pels > 256) + (pels > 1024); } // NOTE: The implementation of all inverses need to be aware of the fact // that input and output could be the same buffer. 
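// Worked example for av1_get_tx_scale() above: a 16x16 transform covers 256
// pels, so (256 > 256) + (256 > 1024) = 0; a 32x32 transform (1024 pels)
// yields 1; and a 64x64 transform (4096 pels) yields the maximum scale of 2.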
// idct void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd) { if (eob > 1) av1_highbd_iwht4x4_16_add(input, dest, stride, bd); else av1_highbd_iwht4x4_1_add(input, dest, stride, bd); } static void highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; int lossless = txfm_param->lossless; const int32_t *src = cast_to_int32(input); const TX_TYPE tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); 
av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); assert(tx_type == DCT_DCT); av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type, int eob, int reduced_tx_set, TxfmParam *txfm_param) { (void)plane; txfm_param->tx_type = tx_type; txfm_param->tx_size = tx_size; txfm_param->eob = eob; txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; txfm_param->bd = xd->bd; txfm_param->is_hbd = is_cur_buf_hbd(xd); txfm_param->tx_set_type = av1_get_ext_tx_set_type( txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_32X32: highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); break; case TX_16X16: highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); break; case TX_8X8: highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); break; case TX_4X8: highbd_inv_txfm_add_4x8_c(input, dest, 
stride, txfm_param); break; case TX_8X4: highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); break; case TX_8X16: highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); break; case TX_16X8: highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); break; case TX_16X32: highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); break; case TX_32X16: highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); break; case TX_64X64: highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); break; case TX_32X64: highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); break; case TX_64X32: highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); break; case TX_16X64: highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); break; case TX_64X16: highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); break; case TX_4X4: // this is like av1_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); break; case TX_16X4: highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); break; case TX_4X16: highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); break; case TX_8X32: highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); break; case TX_32X8: highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); break; default: assert(0 && "Invalid transform size"); break; } } void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { const TX_SIZE tx_size = txfm_param->tx_size; DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]); int tmp_stride = MAX_TX_SIZE; int w = tx_size_wide[tx_size]; int h = tx_size_high[tx_size]; for (int r = 0; r < h; ++r) { for (int c = 0; c < w; ++c) { tmp[r * tmp_stride + c] = dst[r * stride + c]; } } av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, txfm_param); for (int r = 0; r < h; ++r) { for (int c = 0; c < w; ++c) { dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c]; } } } void av1_inverse_transform_block(const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, int stride, int eob, int reduced_tx_set) { if (!eob) return; assert(eob <= av1_get_max_eob(tx_size)); TxfmParam txfm_param; init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, &txfm_param); assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); if (txfm_param.is_hbd) { av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } else { av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); } } aom-3.12.1/av1/common/idct.h000066400000000000000000000031721477627663500154570ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_IDCT_H_ #define AOM_AV1_COMMON_IDCT_H_ #include "config/aom_config.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/enums.h" #include "aom_dsp/txfm_common.h" #ifdef __cplusplus extern "C" { #endif typedef void (*transform_1d)(const tran_low_t *, tran_low_t *); typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; #define MAX_TX_SCALE 1 int av1_get_tx_scale(const TX_SIZE tx_size); void av1_inverse_transform_block(const MACROBLOCKD *xd, const tran_low_t *dqcoeff, int plane, TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, int stride, int eob, int reduced_tx_set); void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, int eob, int bd); static inline const int32_t *cast_to_int32(const tran_low_t *input) { assert(sizeof(int32_t) == sizeof(tran_low_t)); return (const int32_t *)input; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_IDCT_H_ aom-3.12.1/av1/common/mv.h000066400000000000000000000241671477627663500151650ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_MV_H_ #define AOM_AV1_COMMON_MV_H_ #include #include "av1/common/common.h" #include "av1/common/common_data.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #ifdef __cplusplus extern "C" { #endif #define INVALID_MV 0x80008000 #define INVALID_MV_ROW_COL -32768 #define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) #define GET_MV_SUBPEL(x) ((x)*8) #define MARK_MV_INVALID(mv) \ do { \ ((int_mv *)(mv))->as_int = INVALID_MV; \ } while (0) #define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col)) // The motion vector in units of full pixel typedef struct fullpel_mv { int16_t row; int16_t col; } FULLPEL_MV; // The motion vector in units of 1/8-pel typedef struct mv { int16_t row; int16_t col; } MV; static const MV kZeroMv = { 0, 0 }; static const FULLPEL_MV kZeroFullMv = { 0, 0 }; typedef union int_mv { uint32_t as_int; MV as_mv; FULLPEL_MV as_fullmv; } int_mv; /* facilitates faster equality tests and copies */ typedef struct mv32 { int32_t row; int32_t col; } MV32; // The mv limit for fullpel mvs typedef struct { int col_min; int col_max; int row_min; int row_max; } FullMvLimits; // The mv limit for subpel mvs typedef struct { int col_min; int col_max; int row_min; int row_max; } SubpelMvLimits; static inline FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; return full_mv; } static inline MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), (int16_t)GET_MV_SUBPEL(full_mv->col) }; return subpel_mv; } static inline void convert_fullmv_to_mv(int_mv *mv) { mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); } // Bits of precision used for the model #define WARPEDMODEL_PREC_BITS 16 #define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) #define 
WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) // Bits of subpel precision for warped interpolation #define WARPEDPIXEL_PREC_BITS 6 #define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) #define WARP_PARAM_REDUCE_BITS 6 #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) typedef struct { int global_warp_allowed; int local_warp_allowed; } WarpTypesAllowed; // The order of values in the wmmat matrix below is best described // by the affine transformation: // [x' (m2 m3 m0 [x // z . y' = m4 m5 m1 * y // 1] 0 0 1) 1] typedef struct { int32_t wmmat[MAX_PARAMDIM]; int16_t alpha, beta, gamma, delta; TransformationType wmtype; int8_t invalid; } WarpedMotionParams; /* clang-format off */ static const WarpedMotionParams default_warp_params = { { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS) }, 0, 0, 0, 0, IDENTITY, 0, }; /* clang-format on */ // The following constants describe the various precisions // of different parameters in the global motion experiment. // // Given the general homography: // [x' (a b c [x // z . y' = d e f * y // 1] g h i) 1] // // Constants using the name ALPHA here are related to parameters // a, b, d, e. Constants using the name TRANS are related // to parameters c and f. // // Anything ending in PREC_BITS is the number of bits of precision // to maintain when converting from double to integer. // // The ABS parameters are used to create an upper and lower bound // for each parameter. In other words, after a parameter is integerized // it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS). // // XXX_PREC_DIFF and XXX_DECODE_FACTOR // are computed once here to prevent repetitive // computation on the decoder side. These are // to allow the global motion parameters to be encoded in a lower // precision than the warped model precision. This means that they // need to be changed to warped precision when they are decoded. 
// // XX_MIN, XX_MAX are also computed to avoid repeated computation #define SUBEXPFIN_K 3 #define GM_TRANS_PREC_BITS 6 #define GM_ABS_TRANS_BITS 12 #define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3) #define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS) #define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3) #define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF) #define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF) #define GM_ALPHA_PREC_BITS 15 #define GM_ABS_ALPHA_BITS 12 #define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS) #define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF) #define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS) #define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS) #define GM_TRANS_MIN -GM_TRANS_MAX #define GM_ALPHA_MIN -GM_ALPHA_MAX static inline int block_center_x(int mi_col, BLOCK_SIZE bs) { const int bw = block_size_wide[bs]; return mi_col * MI_SIZE + bw / 2 - 1; } static inline int block_center_y(int mi_row, BLOCK_SIZE bs) { const int bh = block_size_high[bs]; return mi_row * MI_SIZE + bh / 2 - 1; } static inline int convert_to_trans_prec(int allow_hp, int coor) { if (allow_hp) return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); else return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; } static inline void integer_mv_precision(MV *mv) { int mod = (mv->row % 8); if (mod != 0) { mv->row -= mod; if (abs(mod) > 4) { if (mod > 0) { mv->row += 8; } else { mv->row -= 8; } } } mod = (mv->col % 8); if (mod != 0) { mv->col -= mod; if (abs(mod) > 4) { if (mod > 0) { mv->col += 8; } else { mv->col -= 8; } } } } // Convert a global motion vector into a motion vector at the centre of the // given block. // // The resulting motion vector will have three fractional bits of precision. If // allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and // is_integer is true, the bottom three bits will be zero (so the motion vector // represents an integer) static inline int_mv gm_get_motion_vector(const WarpedMotionParams *gm, int allow_hp, BLOCK_SIZE bsize, int mi_col, int mi_row, int is_integer) { int_mv res; if (gm->wmtype == IDENTITY) { res.as_int = 0; return res; } const int32_t *mat = gm->wmmat; int x, y, tx, ty; if (gm->wmtype == TRANSLATION) { // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) // bits of fractional precision. The offset for a translation is stored in // entries 0 and 1. For translations, all but the top three (two if // cm->features.allow_high_precision_mv is false) fractional bits are always // zero. // // After the right shifts, there are 3 fractional bits of precision. If // allow_hp is false, the bottom bit is always zero (so we don't need a // call to convert_to_trans_prec here) // // Note: There is an AV1 specification bug here: // // gm->wmmat[0] is supposed to be the horizontal translation, and so should // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical // translation and so should go into res.as_mv.row // // However, in the spec, these assignments are accidentally reversed, and so // we must keep this incorrect logic to match the spec. 
// // See also: https://crbug.com/aomedia/3328 res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); if (is_integer) { integer_mv_precision(&res.as_mv); } return res; } x = block_center_x(mi_col, bsize); y = block_center_y(mi_row, bsize); if (gm->wmtype == ROTZOOM) { assert(gm->wmmat[5] == gm->wmmat[2]); assert(gm->wmmat[4] == -gm->wmmat[3]); } const int xc = (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; const int yc = mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; tx = convert_to_trans_prec(allow_hp, xc); ty = convert_to_trans_prec(allow_hp, yc); res.as_mv.row = ty; res.as_mv.col = tx; if (is_integer) { integer_mv_precision(&res.as_mv); } return res; } static inline TransformationType get_wmtype(const WarpedMotionParams *gm) { if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION); } if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4]) return ROTZOOM; else return AFFINE; } typedef struct candidate_mv { int_mv this_mv; int_mv comp_mv; } CANDIDATE_MV; static inline int is_zero_mv(const MV *mv) { return *((const uint32_t *)mv) == 0; } static inline int is_equal_mv(const MV *a, const MV *b) { return *((const uint32_t *)a) == *((const uint32_t *)b); } static inline void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } static inline void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_MV_H_ aom-3.12.1/av1/common/mvref_common.c000066400000000000000000001614511477627663500172230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/mvref_common.h" #include "av1/common/warped_motion.h" // Although we assign 32 bit integers, all the values are strictly under 14 // bits. static const int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, 1024, 963, 910, 862, 819, 780, 744, 712, 682, 655, 630, 606, 585, 564, 546, 528 }; // TODO(jingning): Consider the use of lookup table for (num / den) // altogether. static inline void get_mv_projection(MV *output, MV ref, int num, int den) { den = AOMMIN(den, MAX_FRAME_DISTANCE); num = num > 0 ? 
AOMMIN(num, MAX_FRAME_DISTANCE) : AOMMAX(num, -MAX_FRAME_DISTANCE); const int mv_row = ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); const int mv_col = ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); const int clamp_max = MV_UPP - 1; const int clamp_min = MV_LOW + 1; output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); } void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); y_mis = ROUND_POWER_OF_TWO(y_mis, 1); int w, h; for (h = 0; h < y_mis; h++) { MV_REF *mv = frame_mvs; for (w = 0; w < x_mis; w++) { mv->ref_frame = NONE_FRAME; mv->mv.as_int = 0; for (int idx = 0; idx < 2; ++idx) { MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; if (ref_frame > INTRA_FRAME) { int8_t ref_idx = cm->ref_frame_side[ref_frame]; if (ref_idx) continue; if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) continue; mv->ref_frame = ref_frame; mv->mv.as_int = mi->mv[idx].as_int; } } mv++; } frame_mvs += frame_mvs_stride; } } static inline void add_ref_mv_candidate( const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, uint16_t weight) { if (!is_inter_block(candidate)) return; assert(weight % 2 == 0); int index, ref; if (rf[1] == NONE_FRAME) { // single reference frame for (ref = 0; ref < 2; ++ref) { if (candidate->ref_frame[ref] == rf[0]) { const int is_gm_block = is_global_mv_block(candidate, gm_params[rf[0]].wmtype); const int_mv this_refmv = is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref); for (index = 0; index < *refmv_count; ++index) { if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { ref_mv_weight[index] += weight; break; } } // Add a new item to the list. if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv; ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; ++*ref_match_count; } } } else { // compound reference frame if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) { int_mv this_refmv[2]; for (ref = 0; ref < 2; ++ref) { if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) this_refmv[ref] = gm_mv_candidates[ref]; else this_refmv[ref] = get_block_mv(candidate, ref); } for (index = 0; index < *refmv_count; ++index) { if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { ref_mv_weight[index] += weight; break; } } // Add a new item to the list. 
if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[index].this_mv = this_refmv[0]; ref_mv_stack[index].comp_mv = this_refmv[1]; ref_mv_weight[index] = weight; ++(*refmv_count); } if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; ++*ref_match_count; } } } static inline void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, int *processed_rows) { int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); const int width_8x8 = mi_size_wide[BLOCK_8X8]; const int width_16x16 = mi_size_wide[BLOCK_16X16]; int col_offset = 0; // TODO(jingning): Revisit this part after cb4x4 is stable. if (abs(row_offset) > 1) { col_offset = 1; if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; } const int use_step_16 = (xd->width >= 16); MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; for (int i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; const int candidate_bsize = candidate->bsize; const int n4_w = mi_size_wide[candidate_bsize]; int len = AOMMIN(xd->width, n4_w); if (use_step_16) len = AOMMAX(width_16x16, len); else if (abs(row_offset) > 1) len = AOMMAX(len, width_8x8); uint16_t weight = 2; if (xd->width >= width_8x8 && xd->width <= n4_w) { uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, mi_size_high[candidate_bsize]); // Obtain range used in weight calculation. weight = AOMMAX(weight, inc); // Update processed rows. *processed_rows = inc - row_offset - 1; } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, newmv_count, ref_mv_stack, ref_mv_weight, gm_mv_candidates, cm->global_motion, len * weight); i += len; } } static inline void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, int *processed_cols) { int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); const int n8_h_8 = mi_size_high[BLOCK_8X8]; const int n8_h_16 = mi_size_high[BLOCK_16X16]; int i; int row_offset = 0; if (abs(col_offset) > 1) { row_offset = 1; if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; } const int use_step_16 = (xd->height >= 16); for (i = 0; i < end_mi;) { const MB_MODE_INFO *const candidate = xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; const int candidate_bsize = candidate->bsize; const int n4_h = mi_size_high[candidate_bsize]; int len = AOMMIN(xd->height, n4_h); if (use_step_16) len = AOMMAX(n8_h_16, len); else if (abs(col_offset) > 1) len = AOMMAX(len, n8_h_8); int weight = 2; if (xd->height >= n8_h_8 && xd->height <= n4_h) { int inc = AOMMIN(-max_col_offset + col_offset + 1, mi_size_wide[candidate_bsize]); // Obtain range used in weight calculation. weight = AOMMAX(weight, inc); // Update processed cols. 
*processed_cols = inc - col_offset - 1; } add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, newmv_count, ref_mv_stack, ref_mv_weight, gm_mv_candidates, cm->global_motion, len * weight); i += len; } } static inline void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, uint8_t *refmv_count) { const TileInfo *const tile = &xd->tile; POSITION mi_pos; mi_pos.row = row_offset; mi_pos.col = col_offset; if (is_inside(tile, mi_col, mi_row, &mi_pos)) { const MB_MODE_INFO *const candidate = xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; const int len = mi_size_wide[BLOCK_8X8]; add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, newmv_count, ref_mv_stack, ref_mv_weight, gm_mv_candidates, cm->global_motion, 2 * len); } // Analyze a single 8x8 block motion information. } static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, int bs) { const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int mask_row = mi_row & (sb_mi_size - 1); const int mask_col = mi_col & (sb_mi_size - 1); if (bs > mi_size_wide[BLOCK_64X64]) return 0; // In a split partition all apart from the bottom right has a top right int has_tr = !((mask_row & bs) && (mask_col & bs)); // bs > 0 and bs is a power of 2 assert(bs > 0 && !(bs & (bs - 1))); // For each 4x4 group of blocks, when the bottom right is decoded the blocks // to the right have not been decoded therefore the bottom right does // not have a top right while (bs < sb_mi_size) { if (mask_col & bs) { if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) { has_tr = 0; break; } } else { break; } bs <<= 1; } // In a VERTICAL or VERTICAL_4 partition, all partition before the last one // always have a top right (as the block above will have been decoded). if (xd->width < xd->height) { if (!xd->is_last_vertical_rect) has_tr = 1; } // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one // never have a top right (as the block to the right won't have been decoded). if (xd->width > xd->height) { if (!xd->is_first_horizontal_rect) has_tr = 0; } // The bottom left square of a Vertical A (in the old format) does // not have a top right as it is decoded before the right hand // rectangle of the partition if (xd->mi[0]->partition == PARTITION_VERT_A) { if (xd->width == xd->height) if (mask_row & bs) has_tr = 0; } return has_tr; } static int check_sb_border(const int mi_row, const int mi_col, const int row_offset, const int col_offset) { const int sb_mi_size = mi_size_wide[BLOCK_64X64]; const int row = mi_row & (sb_mi_size - 1); const int col = mi_col & (sb_mi_size - 1); if (row + row_offset < 0 || row + row_offset >= sb_mi_size || col + col_offset < 0 || col + col_offset >= sb_mi_size) return 0; return 1; } static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, int blk_row, int blk_col, int_mv *gm_mv_candidates, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], int16_t *mode_context) { POSITION mi_pos; mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; const TPL_MV_REF *prev_frame_mvs = cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + ((mi_col + mi_pos.col) >> 1); if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; const int cur_frame_index = cm->cur_frame->order_hint; const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); const int frame0_index = buf_0->order_hint; const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame0_index); int idx; const int allow_high_precision_mv = cm->features.allow_high_precision_mv; const int force_integer_mv = cm->features.cur_frame_force_integer_mv; int_mv this_refmv; get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, cur_offset_0, prev_frame_mvs->ref_frame_offset); lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, force_integer_mv); if (rf[1] == NONE_FRAME) { if (blk_row == 0 && blk_col == 0) { if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); } for (idx = 0; idx < *refmv_count; ++idx) if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; ref_mv_weight[idx] = 2 * weight_unit; ++(*refmv_count); } } else { // Process compound inter mode const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); const int frame1_index = buf_1->order_hint; const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame1_index); int_mv comp_refmv; get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, cur_offset_1, prev_frame_mvs->ref_frame_offset); lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, force_integer_mv); if (blk_row == 0 && blk_col == 0) { if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); } for (idx = 0; idx < *refmv_count; ++idx) { if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) break; } if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; ref_mv_weight[idx] = 2 * weight_unit; ++(*refmv_count); } } return 1; } static inline void process_compound_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; ++ref_id_count[cmp_idx]; } else if (can_rf > INTRA_FRAME && 
ref_diff_count[cmp_idx] < 2) { int_mv this_mv = candidate->mv[rf_idx]; if (cm->ref_frame_sign_bias[can_rf] != cm->ref_frame_sign_bias[rf[cmp_idx]]) { this_mv.as_mv.row = -this_mv.as_mv.row; this_mv.as_mv.col = -this_mv.as_mv.col; } ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; ++ref_diff_count[cmp_idx]; } } } } static inline void process_single_ref_mv_candidate( const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { int_mv this_mv = candidate->mv[rf_idx]; if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != cm->ref_frame_sign_bias[ref_frame]) { this_mv.as_mv.row = -this_mv.as_mv.row; this_mv.as_mv.col = -this_mv.as_mv.col; } int stack_idx; for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; if (this_mv.as_int == stack_mv.as_int) break; } if (stack_idx == *refmv_count) { ref_mv_stack[stack_idx].this_mv = this_mv; // TODO(jingning): Set an arbitrary small number here. The weight // doesn't matter as long as it is properly initialized. ref_mv_weight[stack_idx] = 2; ++(*refmv_count); } } } } static inline void setup_ref_mv_list( const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, int mi_row, int mi_col, int16_t *mode_context) { const int bs = AOMMAX(xd->width, xd->height); const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); MV_REFERENCE_FRAME rf[2]; const TileInfo *const tile = &xd->tile; int max_row_offset = 0, max_col_offset = 0; const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); int processed_rows = 0; int processed_cols = 0; av1_set_ref_frame(rf, ref_frame); mode_context[ref_frame] = 0; *refmv_count = 0; // Find valid maximum row/col offset. if (xd->up_available) { max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; if (xd->height < mi_size_high[BLOCK_8X8]) max_row_offset = -(2 << 1) + row_adj; max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); } if (xd->left_available) { max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; if (xd->width < mi_size_wide[BLOCK_8X8]) max_col_offset = -(2 << 1) + col_adj; max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); } uint8_t col_match_count = 0; uint8_t row_match_count = 0; uint8_t newmv_count = 0; // Scan the first above row mode info. row_offset = -1; if (abs(max_row_offset) >= 1) scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, max_row_offset, &processed_rows); // Scan the first left column mode info. 
col_offset = -1; if (abs(max_col_offset) >= 1) scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, max_col_offset, &processed_cols); // Check top-right boundary if (has_tr) scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, ref_mv_weight, &row_match_count, &newmv_count, gm_mv_candidates, refmv_count); const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); const uint8_t nearest_refmv_count = *refmv_count; // TODO(yunqing): for comp_search, do it for all 3 cases. for (int idx = 0; idx < nearest_refmv_count; ++idx) ref_mv_weight[idx] += REF_CAT_LEVEL; if (cm->features.allow_ref_frame_mvs) { int is_available = 0; const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); const int tpl_sample_pos[3][2] = { { voffset, -2 }, { voffset, hoffset }, { voffset - 2, hoffset }, }; const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && (xd->height < mi_size_high[BLOCK_64X64]) && (xd->width >= mi_size_wide[BLOCK_8X8]) && (xd->width < mi_size_wide[BLOCK_64X64]); const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) ? mi_size_high[BLOCK_16X16] : mi_size_high[BLOCK_8X8]; const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) ? mi_size_wide[BLOCK_16X16] : mi_size_wide[BLOCK_8X8]; for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, mode_context); if (blk_row == 0 && blk_col == 0) is_available = ret; } } if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); for (int i = 0; i < 3 && allow_extension; ++i) { const int blk_row = tpl_sample_pos[i][0]; const int blk_col = tpl_sample_pos[i][1]; if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, mode_context); } } uint8_t dummy_newmv_count = 0; // Scan the second outer area. 
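/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * Both the spatial scans and add_tpl_ref_mv() merge candidates into the
 * reference MV stack with a "find or append" rule: if an identical motion
 * vector is already present only its weight grows, otherwise the candidate is
 * appended while the stack has room.  The hypothetical helper below shows the
 * rule for plain integers. ---- */
#define TOY_STACK_CAP 8
static void toy_merge_candidate(int stack[TOY_STACK_CAP],
                                int weight[TOY_STACK_CAP], int *count, int mv,
                                int w) {
  int idx;
  for (idx = 0; idx < *count; ++idx)
    if (stack[idx] == mv) break;  // duplicate found
  if (idx < *count) {
    weight[idx] += w;             // bump the existing entry
  } else if (*count < TOY_STACK_CAP) {
    stack[idx] = mv;              // append a new entry
    weight[idx] = w;
    ++(*count);
  }
}
/* ---- end of editor's aside ---- */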
scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, &row_match_count, &dummy_newmv_count, gm_mv_candidates, refmv_count); for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { const int row_offset = -(idx << 1) + 1 + row_adj; const int col_offset = -(idx << 1) + 1 + col_adj; if (abs(row_offset) <= abs(max_row_offset) && abs(row_offset) > processed_rows) scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, refmv_count, &row_match_count, &dummy_newmv_count, gm_mv_candidates, max_row_offset, &processed_rows); if (abs(col_offset) <= abs(max_col_offset) && abs(col_offset) > processed_cols) scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight, refmv_count, &col_match_count, &dummy_newmv_count, gm_mv_candidates, max_col_offset, &processed_cols); } const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); switch (nearest_match) { case 0: if (ref_match_count >= 1) mode_context[ref_frame] |= 1; if (ref_match_count == 1) mode_context[ref_frame] |= (1 << REFMV_OFFSET); else if (ref_match_count >= 2) mode_context[ref_frame] |= (2 << REFMV_OFFSET); break; case 1: mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; if (ref_match_count == 1) mode_context[ref_frame] |= (3 << REFMV_OFFSET); else if (ref_match_count >= 2) mode_context[ref_frame] |= (4 << REFMV_OFFSET); break; case 2: default: if (newmv_count >= 1) mode_context[ref_frame] |= 4; else mode_context[ref_frame] |= 5; mode_context[ref_frame] |= (5 << REFMV_OFFSET); break; } // Rank the likelihood and assign nearest and near mvs. int len = nearest_refmv_count; while (len > 0) { int nr_len = 0; for (int idx = 1; idx < len; ++idx) { if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; ref_mv_stack[idx - 1] = ref_mv_stack[idx]; ref_mv_stack[idx] = tmp_mv; ref_mv_weight[idx - 1] = ref_mv_weight[idx]; ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } len = *refmv_count; while (len > nearest_refmv_count) { int nr_len = nearest_refmv_count; for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; ref_mv_stack[idx - 1] = ref_mv_stack[idx]; ref_mv_stack[idx] = tmp_mv; ref_mv_weight[idx - 1] = ref_mv_weight[idx]; ref_mv_weight[idx] = tmp_ref_mv_weight; nr_len = idx; } } len = nr_len; } int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); const int mi_size = AOMMIN(mi_width, mi_height); if (rf[1] > NONE_FRAME) { // TODO(jingning, yunqing): Refactor and consolidate the compound and // single reference frame modes. Reduce unnecessary redundancy. 
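/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * The two "while (len > 0)" loops above are a bubble sort: they order the
 * candidates by descending weight, first inside the nearest group and then
 * over the remaining entries, shrinking the range to the position of the last
 * swap on each pass.  The same pass over plain arrays, with hypothetical
 * names, looks like this. ---- */
static void toy_sort_by_weight(int *mv, int *weight, int count) {
  int len = count;
  while (len > 0) {
    int last_swap = 0;
    for (int idx = 1; idx < len; ++idx) {
      if (weight[idx - 1] < weight[idx]) {
        const int tmp_mv = mv[idx - 1];
        const int tmp_w = weight[idx - 1];
        mv[idx - 1] = mv[idx];
        weight[idx - 1] = weight[idx];
        mv[idx] = tmp_mv;
        weight[idx] = tmp_w;
        last_swap = idx;  // entries past this index are already in order
      }
    }
    len = last_swap;
  }
}
/* ---- end of editor's aside ---- */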
if (*refmv_count < MAX_MV_REF_CANDIDATES) { int_mv ref_id[2][2], ref_diff[2][2]; int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_compound_ref_mv_candidate( candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); idx += mi_size_wide[candidate->bsize]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; process_compound_ref_mv_candidate( candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); idx += mi_size_high[candidate->bsize]; } // Build up the compound mv predictor int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; for (int idx = 0; idx < 2; ++idx) { int comp_idx = 0; for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_id[idx][list_idx]; for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; ++list_idx, ++comp_idx) comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) comp_list[comp_idx][idx] = gm_mv_candidates[idx]; } if (*refmv_count) { assert(*refmv_count == 1); if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; } else { ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; } ref_mv_weight[*refmv_count] = 2; ++*refmv_count; } else { for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; ref_mv_weight[*refmv_count] = 2; ++*refmv_count; } } } assert(*refmv_count >= 2); for (int idx = 0; idx < *refmv_count; ++idx) { clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, xd->height << MI_SIZE_LOG2, xd); clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, xd->height << MI_SIZE_LOG2, xd); } } else { // Handle single reference frame extension for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight); idx += mi_size_wide[candidate->bsize]; } for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && *refmv_count < MAX_MV_REF_CANDIDATES;) { const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, ref_mv_stack, ref_mv_weight); idx += mi_size_high[candidate->bsize]; } for (int idx = 0; idx < *refmv_count; ++idx) { clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, xd->height << MI_SIZE_LOG2, xd); } if (mv_ref_list != NULL) { for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) { mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; } } } } void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV 
ref_mv_stack[][MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *global_mvs, int16_t *mode_context) { const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int_mv gm_mv[2]; if (ref_frame == INTRA_FRAME) { gm_mv[0].as_int = gm_mv[1].as_int = 0; if (global_mvs != NULL) { global_mvs[ref_frame].as_int = INVALID_MV; } } else { const BLOCK_SIZE bsize = mi->bsize; const int allow_high_precision_mv = cm->features.allow_high_precision_mv; const int force_integer_mv = cm->features.cur_frame_force_integer_mv; if (ref_frame < REF_FRAMES) { gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], allow_high_precision_mv, bsize, mi_col, mi_row, force_integer_mv); gm_mv[1].as_int = 0; if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; } else { MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], allow_high_precision_mv, bsize, mi_col, mi_row, force_integer_mv); gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], allow_high_precision_mv, bsize, mi_col, mi_row, force_integer_mv); } } setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, mi_col, mode_context); } void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv, int is_integer) { int i; // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); } *nearest_mv = mvlist[0]; *near_mv = mvlist[1]; } void av1_setup_frame_buf_refs(AV1_COMMON *cm) { cm->cur_frame->order_hint = cm->current_frame.order_hint; cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (buf != NULL) { cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = buf->display_order_hint; } } } void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) { const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint, (int)cm->current_frame.order_hint) <= 0) ? 0 : 1; } else { cm->ref_frame_sign_bias[ref_frame] = 0; } } } #define MAX_OFFSET_WIDTH 64 #define MAX_OFFSET_HEIGHT 0 static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, int blk_col, MV mv, int sign_bias) { const int base_blk_row = (blk_row >> 3) << 3; const int base_blk_col = (blk_col >> 3) << 3; const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2)) : -((-mv.row) >> (4 + MI_SIZE_LOG2)); const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2)) : -((-mv.col) >> (4 + MI_SIZE_LOG2)); const int row = (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; const int col = (sign_bias == 1) ? 
blk_col - col_offset : blk_col + col_offset; if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 || col >= (cm->mi_params.mi_cols >> 1)) return 0; if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) || col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) || col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3)) return 0; *mi_r = row; *mi_c = col; return 1; } // Note: motion_filed_projection finds motion vectors of current frame's // reference frame, and projects them to current frame. To make it clear, // let's call current frame's reference frame as start frame. // Call Start frame's reference frames as reference frames. // Call ref_offset as frame distances between start frame and its reference // frames. static int motion_field_projection(AV1_COMMON *cm, MV_REFERENCE_FRAME start_frame, int dir) { TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; const RefCntBuffer *const start_frame_buf = get_ref_frame_buf(cm, start_frame); if (start_frame_buf == NULL) return 0; if (start_frame_buf->frame_type == KEY_FRAME || start_frame_buf->frame_type == INTRA_ONLY_FRAME) return 0; if (start_frame_buf->mi_rows != cm->mi_params.mi_rows || start_frame_buf->mi_cols != cm->mi_params.mi_cols) return 0; const int start_frame_order_hint = start_frame_buf->order_hint; const unsigned int *const ref_order_hints = &start_frame_buf->ref_order_hints[0]; const int cur_order_hint = cm->cur_frame->order_hint; int start_to_current_frame_offset = get_relative_dist( &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info, start_frame_order_hint, ref_order_hints[rf - LAST_FRAME]); } if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; MV_REF *mv_ref_base = start_frame_buf->mvs; const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1; const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1; for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col]; MV fwd_mv = mv_ref->mv.as_mv; if (mv_ref->ref_frame > INTRA_FRAME) { int_mv this_mv; int mi_r, mi_c; const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; int pos_valid = abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && ref_frame_offset > 0 && abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE; if (pos_valid) { get_mv_projection(&this_mv.as_mv, fwd_mv, start_to_current_frame_offset, ref_frame_offset); pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, this_mv.as_mv, dir >> 1); } if (pos_valid) { const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c; tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset; } } } } return 1; } // cm->ref_frame_side is calculated here, and will be used in // av1_copy_frame_mvs() to affect how mvs are copied. 
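/* ---- Editor's aside (conceptual sketch, not part of the libaom sources).
 * motion_field_projection() above scales each stored motion vector by the
 * ratio of frame distances, roughly
 *   projected ~= mv * start_to_current_frame_offset / ref_frame_offset,
 * before dropping it onto the current frame's 8x8 grid via
 * get_block_position().  The library does the division with fixed-point
 * arithmetic and clamps the distances; the plain-integer helper below (a
 * hypothetical name) only illustrates the proportionality with
 * round-to-nearest division. ---- */
static int toy_project_mv_component(int mv, int cur_offset, int ref_offset) {
  if (ref_offset == 0) return 0;
  const int num = mv * cur_offset;
  // Round to nearest, symmetric for negative values: e.g. 10 * 3 / 4 -> 8.
  return (num >= 0 ? num + ref_offset / 2 : num - ref_offset / 2) / ref_offset;
}
/* ---- end of editor's aside ---- */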
void av1_calculate_ref_frame_side(AV1_COMMON *cm) { const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); if (!order_hint_info->enable_order_hint) return; const int cur_order_hint = cm->cur_frame->order_hint; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); int order_hint = 0; if (buf != NULL) order_hint = buf->order_hint; if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) cm->ref_frame_side[ref_frame] = 1; else if (order_hint == cur_order_hint) cm->ref_frame_side[ref_frame] = -1; } } void av1_setup_motion_field(AV1_COMMON *cm) { const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; if (!order_hint_info->enable_order_hint) return; TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_params.mi_stride >> 1); for (int idx = 0; idx < size; ++idx) { tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; tpl_mvs_base[idx].ref_frame_offset = 0; } const int cur_order_hint = cm->cur_frame->order_hint; const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME]; int ref_order_hint[INTER_REFS_PER_FRAME]; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { const int ref_idx = ref_frame - LAST_FRAME; const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); int order_hint = 0; if (buf != NULL) order_hint = buf->order_hint; ref_buf[ref_idx] = buf; ref_order_hint[ref_idx] = order_hint; } int ref_stamp = MFMV_STACK_SIZE - 1; if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { const int alt_of_lst_order_hint = ref_buf[LAST_FRAME - LAST_FRAME] ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; const int is_lst_overlay = (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); --ref_stamp; } if (get_relative_dist(order_hint_info, ref_order_hint[BWDREF_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; } if (get_relative_dist(order_hint_info, ref_order_hint[ALTREF2_FRAME - LAST_FRAME], cur_order_hint) > 0) { if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; } if (get_relative_dist(order_hint_info, ref_order_hint[ALTREF_FRAME - LAST_FRAME], cur_order_hint) > 0 && ref_stamp >= 0) if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); } static inline void record_samples(const MB_MODE_INFO *mbmi, int *pts, int *pts_inref, int row_offset, int sign_r, int col_offset, int sign_c) { const int bw = block_size_wide[mbmi->bsize]; const int bh = block_size_high[mbmi->bsize]; const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1; const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1; pts[0] = GET_MV_SUBPEL(x); pts[1] = GET_MV_SUBPEL(y); pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col; pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row; } // Select samples according to the motion vector difference. uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int thresh = clamp(AOMMAX(bw, bh), 16, 112); uint8_t ret = 0; assert(len <= LEAST_SQUARES_SAMPLES_MAX); // Only keep the samples with MV differences within threshold. 
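/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * The selection loop that follows in av1_selectSamples() is an in-place
 * "keep if close enough" compaction: samples whose predicted position differs
 * from the candidate MV by more than a size-dependent threshold are dropped,
 * and the survivors are copied down so they stay contiguous.  Stripped of the
 * 1/8-pel details it is the usual stable filter loop, sketched below with a
 * hypothetical helper. ---- */
static int toy_keep_if(int *values, int count, int threshold) {
  int kept = 0;
  for (int i = 0; i < count; ++i) {
    if (values[i] > threshold) continue;       // reject this sample
    if (kept != i) values[kept] = values[i];   // compact the survivors
    ++kept;
  }
  return kept > 0 ? kept : 1;  // report at least one, as av1_selectSamples does
}
/* ---- end of editor's aside ---- */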
for (int i = 0; i < len; ++i) { const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row); if (diff > thresh) continue; if (ret != i) { memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0])); memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0])); } ++ret; } // Keep at least 1 sample. return AOMMAX(ret, 1); } // Note: Samples returned are at 1/8-pel precision // Sample are the neighbor block center point's coordinates relative to the // left-top pixel of current block. uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, int *pts_inref) { const MB_MODE_INFO *const mbmi0 = xd->mi[0]; const int ref_frame = mbmi0->ref_frame[0]; const int up_available = xd->up_available; const int left_available = xd->left_available; uint8_t np = 0; int do_tl = 1; int do_tr = 1; const int mi_stride = xd->mi_stride; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // scan the nearest above rows if (up_available) { const int mi_row_offset = -1; const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride]; uint8_t superblock_width = mi_size_wide[mbmi->bsize]; if (xd->width <= superblock_width) { // Handle "current block width <= above block width" case. const int col_offset = -mi_col % superblock_width; if (col_offset < 0) do_tl = 0; if (col_offset + superblock_width > xd->width) do_tr = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); pts += 2; pts_inref += 2; if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } else { // Handle "current block width > above block width" case. for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); i += superblock_width) { mbmi = xd->mi[i + mi_row_offset * mi_stride]; superblock_width = mi_size_wide[mbmi->bsize]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, i, 1); pts += 2; pts_inref += 2; if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } } assert(np <= LEAST_SQUARES_SAMPLES_MAX); // scan the nearest left columns if (left_available) { const int mi_col_offset = -1; const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; uint8_t superblock_height = mi_size_high[mbmi->bsize]; if (xd->height <= superblock_height) { // Handle "current block height <= above block height" case. const int row_offset = -mi_row % superblock_height; if (row_offset < 0) do_tl = 0; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1); pts += 2; pts_inref += 2; np++; if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } else { // Handle "current block height > above block height" case. 
for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); i += superblock_height) { mbmi = xd->mi[mi_col_offset + i * mi_stride]; superblock_height = mi_size_high[mbmi->bsize]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, i, 1, 0, -1); pts += 2; pts_inref += 2; if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } } assert(np <= LEAST_SQUARES_SAMPLES_MAX); // Top-left block if (do_tl && left_available && up_available) { const int mi_row_offset = -1; const int mi_col_offset = -1; MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); pts += 2; pts_inref += 2; if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } assert(np <= LEAST_SQUARES_SAMPLES_MAX); // Top-right block if (do_tr && has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { const POSITION trb_pos = { -1, xd->width }; const TileInfo *const tile = &xd->tile; if (is_inside(tile, mi_col, mi_row, &trb_pos)) { const int mi_row_offset = -1; const int mi_col_offset = xd->width; const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; } } } assert(np <= LEAST_SQUARES_SAMPLES_MAX); return np; } void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; skip_mode_info->skip_mode_allowed = 0; skip_mode_info->ref_frame_idx_0 = INVALID_IDX; skip_mode_info->ref_frame_idx_1 = INVALID_IDX; if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || cm->current_frame.reference_mode == SINGLE_REFERENCE) return; const int cur_order_hint = cm->current_frame.order_hint; int ref_order_hints[2] = { -1, INT_MAX }; int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; // Identify the nearest forward and backward references. for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); if (buf == NULL) continue; const int ref_order_hint = buf->order_hint; if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < 0) { // Forward reference if (ref_order_hints[0] == -1 || get_relative_dist(order_hint_info, ref_order_hint, ref_order_hints[0]) > 0) { ref_order_hints[0] = ref_order_hint; ref_idx[0] = i; } } else if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) > 0) { // Backward reference if (ref_order_hints[1] == INT_MAX || get_relative_dist(order_hint_info, ref_order_hint, ref_order_hints[1]) < 0) { ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } } if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { // == Bi-directional prediction == skip_mode_info->skip_mode_allowed = 1; skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { // == Forward prediction only == // Identify the second nearest forward reference. 
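/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * Skip mode needs a pair of references: the closest forward (past) and
 * closest backward (future) frame when both exist, otherwise the two closest
 * forward frames, as the branch below goes on to compute.  With signed
 * distances (negative = past, positive = future) picking the two nearest past
 * frames reduces to the hypothetical helper below. ---- */
static void toy_two_nearest_past(const int *dist, int n, int *best,
                                 int *second) {
  *best = -1;
  *second = -1;
  for (int i = 0; i < n; ++i) {
    if (dist[i] >= 0) continue;  // not a past frame
    if (*best < 0 || dist[i] > dist[*best]) {
      *second = *best;           // previous best becomes second nearest
      *best = i;
    } else if (*second < 0 || dist[i] > dist[*second]) {
      *second = i;
    }
  }
}
/* ---- end of editor's aside ---- */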
ref_order_hints[1] = -1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); if (buf == NULL) continue; const int ref_order_hint = buf->order_hint; if ((ref_order_hints[0] != -1 && get_relative_dist(order_hint_info, ref_order_hint, ref_order_hints[0]) < 0) && (ref_order_hints[1] == -1 || get_relative_dist(order_hint_info, ref_order_hint, ref_order_hints[1]) > 0)) { // Second closest forward reference ref_order_hints[1] = ref_order_hint; ref_idx[1] = i; } } if (ref_order_hints[1] != -1) { skip_mode_info->skip_mode_allowed = 1; skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); } } } typedef struct { int map_idx; // frame map index RefCntBuffer *buf; // frame buffer int sort_idx; // index based on the offset to be used for sorting } REF_FRAME_INFO; // Compares the sort_idx fields. If they are equal, then compares the map_idx // fields to break the tie. This ensures a stable sort. static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; if (sort_idx_diff != 0) return sort_idx_diff; return info_a->map_idx - info_b->map_idx; } static inline void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, REF_FRAME_INFO *ref_info) { assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); remapped_ref_idx[frame_idx] = ref_info->map_idx; } void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, int lst_map_idx, int gld_map_idx) { int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; assert(cm->seq_params->order_hint_info.enable_order_hint); assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0); const int cur_order_hint = (int)cm->current_frame.order_hint; const int cur_frame_sort_idx = 1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1; REF_FRAME_INFO ref_frame_info[REF_FRAMES]; int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; for (int i = 0; i < REF_FRAMES; ++i) { const int map_idx = i; ref_frame_info[i].map_idx = map_idx; ref_frame_info[i].sort_idx = -1; RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; ref_frame_info[i].buf = buf; if (buf == NULL) continue; // If this assertion fails, there is a reference leak. assert(buf->ref_count > 0); const int offset = (int)buf->order_hint; ref_frame_info[i].sort_idx = (offset == -1) ? -1 : cur_frame_sort_idx + get_relative_dist(&cm->seq_params->order_hint_info, offset, cur_order_hint); assert(ref_frame_info[i].sort_idx >= -1); if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx; } // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference // frames. if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as LAST"); } if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as GOLDEN"); } // Sort ref frames based on their frame_offset values. qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO), compare_ref_frame_info); // Identify forward and backward reference frames. 
// Forward reference: offset < order_hint // Backward reference: offset >= order_hint int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; for (int i = 0; i < REF_FRAMES; i++) { if (ref_frame_info[i].sort_idx == -1) { fwd_start_idx++; continue; } if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) { fwd_end_idx = i - 1; break; } } int bwd_start_idx = fwd_end_idx + 1; int bwd_end_idx = REF_FRAMES - 1; // === Backward Reference Frames === // == ALTREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_end_idx]); ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; bwd_end_idx--; } // == BWDREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; bwd_start_idx++; } // == ALTREF2_FRAME == if (bwd_start_idx <= bwd_end_idx) { set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; } // === Forward Reference Frames === for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { // == LAST_FRAME == if (ref_frame_info[i].map_idx == lst_map_idx) { set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]); ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; } // == GOLDEN_FRAME == if (ref_frame_info[i].map_idx == gld_map_idx) { set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]); ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; } } assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 && ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1); // == LAST2_FRAME == // == LAST3_FRAME == // == BWDREF_FRAME == // == ALTREF2_FRAME == // == ALTREF_FRAME == // Set up the reference frames in the anti-chronological order. static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = { LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME }; int ref_idx; for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; while (fwd_start_idx <= fwd_end_idx && (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx || ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) { fwd_end_idx--; } if (fwd_start_idx > fwd_end_idx) break; set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_end_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; fwd_end_idx--; } // Assign all the remaining frame(s), if any, to the earliest reference // frame. for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_start_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; } for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { assert(ref_flag_list[i] == 1); } } aom-3.12.1/av1/common/mvref_common.h000066400000000000000000000312101477627663500172150ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ #define AOM_AV1_COMMON_MVREF_COMMON_H_ #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #ifdef __cplusplus extern "C" { #endif #define MVREF_ROW_COLS 3 // Set the upper limit of the motion vector component magnitude. // This would make a motion vector fit in 26 bits. Plus 3 bits for the // reference frame index. A tuple of motion vector can hence be stored within // 32 bit range for efficient load/store operations. #define REFMVS_LIMIT ((1 << 12) - 1) typedef struct position { int row; int col; } POSITION; // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units static inline int get_relative_dist(const OrderHintInfo *oh, int a, int b) { if (!oh->enable_order_hint) return 0; const int bits = oh->order_hint_bits_minus_1 + 1; assert(bits >= 1); assert(a >= 0 && a < (1 << bits)); assert(b >= 0 && b < (1 << bits)); int diff = a - b; const int m = 1 << (bits - 1); diff = (diff & (m - 1)) - (diff & m); return diff; } static inline void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER }; clamp_mv(mv, &mv_limits); } static inline int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { return candidate->mv[which_mv]; } // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. static inline int is_inside(const TileInfo *const tile, int mi_col, int mi_row, const POSITION *mi_pos) { return !(mi_row + mi_pos->row < tile->mi_row_start || mi_col + mi_pos->col < tile->mi_col_start || mi_row + mi_pos->row >= tile->mi_row_end || mi_col + mi_pos->col >= tile->mi_col_end); } static inline int find_valid_row_offset(const TileInfo *const tile, int mi_row, int row_offset) { return clamp(row_offset, tile->mi_row_start - mi_row, tile->mi_row_end - mi_row - 1); } static inline int find_valid_col_offset(const TileInfo *const tile, int mi_col, int col_offset) { return clamp(col_offset, tile->mi_col_start - mi_col, tile->mi_col_end - mi_col - 1); } static inline void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { if (is_integer) { integer_mv_precision(mv); } else { if (!allow_hp) { if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); if (mv->col & 1) mv->col += (mv->col > 0 ? 
-1 : 1); } } } static inline int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { // Single ref pred if (rf[1] <= INTRA_FRAME) return -1; // Bi-directional comp ref pred if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1; for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) { if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) return ref_idx; } return -1; } static inline int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { if (rf[1] > INTRA_FRAME) { const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); if (uni_comp_ref_idx >= 0) { assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < MODE_CTX_REF_FRAMES); return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx; } else { return REF_FRAMES + FWD_RF_OFFSET(rf[0]) + BWD_RF_OFFSET(rf[1]) * FWD_REFS; } } return rf[0]; } // clang-format off static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME }, { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME }, { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME }, { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME }, { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, // NOTE: Following reference frame pairs are not supported to be explicitly // signalled, but they are possibly chosen by the use of skip_mode, // which may use the most recent one-sided reference frame pair. { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME }, { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME}, { ALTREF2_FRAME, ALTREF_FRAME } }; // clang-format on static inline void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, MV_REFERENCE_FRAME ref_frame_type) { if (ref_frame_type >= REF_FRAMES) { rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; } else { assert(ref_frame_type > NONE_FRAME); rf[0] = ref_frame_type; rf[1] = NONE_FRAME; } } static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { { 0, 1, 1, 1, 1 }, { 1, 2, 3, 4, 4 }, { 4, 4, 5, 6, 7 }, }; static inline int16_t av1_mode_context_analyzer( const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { const int8_t ref_frame = av1_ref_frame_type(rf); if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame]; const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK; const int16_t refmv_ctx = (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK; const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( newmv_ctx, COMP_NEWMV_CTXS - 1)]; return comp_ctx; } static inline uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) return 0; if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 1; if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) return 2; return 0; } void av1_setup_frame_buf_refs(AV1_COMMON *cm); void av1_setup_frame_sign_bias(AV1_COMMON *cm); void av1_setup_skip_mode_allowed(AV1_COMMON *cm); void av1_calculate_ref_frame_side(AV1_COMMON *cm); void av1_setup_motion_field(AV1_COMMON *cm); void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, int lst_map_idx, int gld_map_idx); static inline void 
av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { av1_zero(xd->neighbors_ref_counts); uint8_t *const ref_counts = xd->neighbors_ref_counts; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int above_in_image = xd->up_available; const int left_in_image = xd->left_available; // Above neighbor if (above_in_image && is_inter_block(above_mbmi)) { ref_counts[above_mbmi->ref_frame[0]]++; if (has_second_ref(above_mbmi)) { ref_counts[above_mbmi->ref_frame[1]]++; } } // Left neighbor if (left_in_image && is_inter_block(left_mbmi)) { ref_counts[left_mbmi->ref_frame[0]]++; if (has_second_ref(left_mbmi)) { ref_counts[left_mbmi->ref_frame[1]]++; } } } void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis); // The global_mvs output parameter points to an array of REF_FRAMES elements. // The caller may pass a null global_mvs if it does not need the global_mvs // output. void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *global_mvs, int16_t *mode_context); // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best // score to use as ref motion vector void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv, int is_integer); uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize); uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, int *pts_inref); #define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64) static inline void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row) { if (mi_row - mib_size < tile->mi_row_start) { ref_dv->as_fullmv.row = 0; ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS; } else { ref_dv->as_fullmv.row = -MI_SIZE * mib_size; ref_dv->as_fullmv.col = 0; } convert_fullmv_to_mv(ref_dv); } static inline int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize, int mib_size_log2) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int SCALE_PX_TO_MV = 8; // Disallow subpixel for now // SUBPEL_MASK is not the correct scale if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1)))) return 0; const TileInfo *const tile = &xd->tile; // Is the source top-left inside the current tile? const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row; const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV; if (src_top_edge < tile_top_edge) return 0; const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col; const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV; if (src_left_edge < tile_left_edge) return 0; // Is the bottom right inside the current tile? 
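/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * av1_is_dv_valid() works in 1/8-pel units: a position in mi units is scaled
 * by MI_SIZE (assumed 4 pixels per mi unit) and then by 8 subpel steps per
 * pixel, after which the displacement vector can be added directly.  For
 * example, a block at mi_row = 3 whose DV row component is -32 (-4 pixels)
 * has its source top edge at 3 * 4 * 8 + (-32) = 64 eighth-pels, i.e. 8
 * pixels below the tile origin. ---- */
static int toy_src_edge_subpel(int mi_pos, int dv_subpel) {
  const int kMiSizePixels = 4;    // assumed MI_SIZE
  const int kSubpelPerPixel = 8;  // 1/8-pel precision
  return mi_pos * kMiSizePixels * kSubpelPerPixel + dv_subpel;
}
/* ---- end of editor's aside ---- */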
const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row; const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV; if (src_bottom_edge > tile_bottom_edge) return 0; const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col; const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV; if (src_right_edge > tile_right_edge) return 0; // Special case for sub 8x8 chroma cases, to prevent referring to chroma // pixels outside current tile. if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { const struct macroblockd_plane *const pd = &xd->plane[1]; if (bw < 8 && pd->subsampling_x) if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; if (bh < 8 && pd->subsampling_y) if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; } // Is the bottom right within an already coded SB? Also consider additional // constraints to facilitate HW decoder. const int max_mib_size = 1 << mib_size_log2; const int active_sb_row = mi_row >> mib_size_log2; const int active_sb64_col = (mi_col * MI_SIZE) >> 6; const int sb_size = max_mib_size * MI_SIZE; const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size; const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6; const int total_sb64_per_row = ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1; const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col; const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; // Wavefront constraint: use only top left area of frame for reference. const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); const int wf_offset = gradient * (active_sb_row - src_sb_row); if (src_sb_row > active_sb_row || src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) return 0; return 1; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_MVREF_COMMON_H_ aom-3.12.1/av1/common/obmc.h000066400000000000000000000074031477627663500154550ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_OBMC_H_ #define AOM_AV1_COMMON_OBMC_H_ typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes); static inline void foreach_overlappable_nb_above(const AV1_COMMON *cm, MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { if (!xd->up_available) return; const int num_planes = av1_num_planes(cm); int nb_count = 0; const int mi_col = xd->mi_col; // prev_row_mi points into the mi array, starting at the beginning of the // previous row. 
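/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * xd->mi points at the current block inside a row-major grid of mode-info
 * pointers with pitch xd->mi_stride, so "back up mi_col entries and one full
 * stride", as the declaration that follows does, lands on the first entry of
 * the previous row.  In plain index arithmetic (hypothetical helper): ---- */
static int toy_prev_row_start(int cur_row, int cur_col, int stride) {
  const int cur_index = cur_row * stride + cur_col;  // where xd->mi points
  const int prev_row_start = cur_index - cur_col - stride;
  return prev_row_start;  // equals (cur_row - 1) * stride
}
/* ---- end of editor's aside ---- */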
MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; above_mi_col += mi_step) { MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; mi_step = AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]); // If we're considering a block with width 4, it should be treated as // half of a pair of blocks with chroma information in the second. Move // above_mi_col back to the start of the pair if needed, set above_mbmi // to point at the block with chroma information, and set mi_step to 2 to // step over the entire pair at the end of the iteration. if (mi_step == 1) { above_mi_col &= ~1; above_mi = prev_row_mi + above_mi_col + 1; mi_step = 2; } if (is_neighbor_overlappable(*above_mi)) { ++nb_count; fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, *above_mi, fun_ctxt, num_planes); } } } static inline void foreach_overlappable_nb_left(const AV1_COMMON *cm, MACROBLOCKD *xd, int nb_max, overlappable_nb_visitor_t fun, void *fun_ctxt) { if (!xd->left_available) return; const int num_planes = av1_num_planes(cm); int nb_count = 0; // prev_col_mi points into the mi array, starting at the top of the // previous column const int mi_row = xd->mi_row; MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; left_mi_row += mi_step) { MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; mi_step = AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]); if (mi_step == 1) { left_mi_row &= ~1; left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride; mi_step = 2; } if (is_neighbor_overlappable(*left_mi)) { ++nb_count; fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, fun_ctxt, num_planes); } } } #endif // AOM_AV1_COMMON_OBMC_H_ aom-3.12.1/av1/common/obu_util.c000066400000000000000000000116321477627663500163510ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/obu_util.h" #include "aom_dsp/bitreader_buffer.h" static aom_codec_err_t read_obu_size(const uint8_t *data, size_t bytes_available, size_t *const obu_size, size_t *const length_field_size) { uint64_t u_obu_size = 0; if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != 0) { return AOM_CODEC_CORRUPT_FRAME; } if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; *obu_size = (size_t)u_obu_size; return AOM_CODEC_OK; } // Parses OBU header and stores values in 'header'. 
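/* ---- Editor's aside (illustrative sketch, not part of the libaom sources).
 * The first OBU header byte that read_obu_header() consumes bit by bit has
 * the layout (MSB first): forbidden bit, 4-bit obu_type, extension flag,
 * has_size_field flag, reserved bit.  Decoding it without a bit reader, with
 * a hypothetical helper: ---- */
static int toy_parse_obu_byte(unsigned char byte, int *type,
                              int *has_extension, int *has_size_field) {
  if (byte & 0x80) return -1;        // forbidden bit must be zero
  *type = (byte >> 3) & 0x0F;        // 4-bit OBU type
  *has_extension = (byte >> 2) & 1;  // extension header byte follows
  *has_size_field = (byte >> 1) & 1; // obu_size field follows the header
  return 0;                          // lowest bit is obu_reserved_1bit
}
/* ---- end of editor's aside ---- */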
static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, int is_annexb, ObuHeader *header) { if (!rb || !header) return AOM_CODEC_INVALID_PARAM; const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; header->size = 1; if (aom_rb_read_bit(rb) != 0) { // Forbidden bit. Must not be set. return AOM_CODEC_CORRUPT_FRAME; } header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); header->has_extension = aom_rb_read_bit(rb); header->has_size_field = aom_rb_read_bit(rb); if (!header->has_size_field && !is_annexb) { // section 5 obu streams must have obu_size field set. return AOM_CODEC_UNSUP_BITSTREAM; } // obu_reserved_1bit must be set to 0. The value is ignored by a decoder. aom_rb_read_bit(rb); if (header->has_extension) { if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; header->size += 1; header->temporal_layer_id = aom_rb_read_literal(rb, 3); header->spatial_layer_id = aom_rb_read_literal(rb, 2); // extension_header_reserved_3bits must be set to 0. The value is ignored by // a decoder. aom_rb_read_literal(rb, 3); } else { header->temporal_layer_id = 0; header->spatial_layer_id = 0; } return AOM_CODEC_OK; } aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, size_t *consumed, ObuHeader *header, int is_annexb) { if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; // TODO(tomfinegan): Set the error handler here and throughout this file, and // confirm parsing work done via aom_read_bit_buffer is successful. struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, NULL }; aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); if (parse_result == AOM_CODEC_OK) *consumed = header->size; return parse_result; } aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, size_t bytes_available, int is_annexb, ObuHeader *obu_header, size_t *const payload_size, size_t *const bytes_read) { size_t length_field_size_obu = 0; size_t length_field_size_payload = 0; size_t obu_size = 0; aom_codec_err_t status; if (is_annexb) { // Size field comes before the OBU header, and includes the OBU header status = read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); if (status != AOM_CODEC_OK) return status; } struct aom_read_bit_buffer rb = { data + length_field_size_obu, data + bytes_available, 0, NULL, NULL }; status = read_obu_header(&rb, is_annexb, obu_header); if (status != AOM_CODEC_OK) return status; if (!obu_header->has_size_field) { assert(is_annexb); // Derive the payload size from the data we've already read if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; *payload_size = obu_size - obu_header->size; } else { // Size field comes after the OBU header, and is just the payload size status = read_obu_size( data + length_field_size_obu + obu_header->size, bytes_available - length_field_size_obu - obu_header->size, payload_size, &length_field_size_payload); if (status != AOM_CODEC_OK) return status; } *bytes_read = length_field_size_obu + obu_header->size + length_field_size_payload; return AOM_CODEC_OK; } aom-3.12.1/av1/common/obu_util.h000066400000000000000000000034231477627663500163550ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_OBU_UTIL_H_ #define AOM_AV1_COMMON_OBU_UTIL_H_ #include "aom/aom_codec.h" #ifdef __cplusplus extern "C" { #endif typedef struct { size_t size; // Size (1 or 2 bytes) of the OBU header (including the // optional OBU extension header) in the bitstream. OBU_TYPE type; int has_size_field; int has_extension; // Whether the optional OBU extension header is present. // The following fields come from the OBU extension header. They are set to 0 // if has_extension is false. int temporal_layer_id; int spatial_layer_id; } ObuHeader; aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, size_t *consumed, ObuHeader *header, int is_annexb); aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, size_t bytes_available, int is_annexb, ObuHeader *obu_header, size_t *const payload_size, size_t *const bytes_read); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_OBU_UTIL_H_ aom-3.12.1/av1/common/ppc/000077500000000000000000000000001477627663500151425ustar00rootroot00000000000000aom-3.12.1/av1/common/ppc/cfl_ppc.c000066400000000000000000000157301477627663500167220ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "av1/common/cfl.h" #define OFF_0 0 #define OFF_1 16 #define OFF_2 32 #define OFF_3 48 #define CFL_LINE_1 64 #define CFL_LINE_2 128 #define CFL_LINE_3 192 typedef vector signed char int8x16_t; // NOLINT(runtime/int) typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int) typedef vector signed short int16x8_t; // NOLINT(runtime/int) typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) typedef vector signed int int32x4_t; // NOLINT(runtime/int) typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) static inline void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, int width, int height, int round_offset, int num_pel_log2) { const int16_t *sum_buf = (const int16_t *)src_ptr; const int16_t *end = sum_buf + height * CFL_BUF_LINE; const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B }; int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset }; int32x4_t sum_32x4_1 = { 0, 0, 0, 0 }; do { sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0); sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1); if (width >= 16) { sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0); sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1); } if (width == 32) { sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0); sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1); sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0); sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); } sum_buf += CFL_BUF_LINE * 2; } while (sum_buf < end); int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); sum_32x4 = vec_add(sum_32x4, perm_64); const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); sum_32x4 = vec_add(sum_32x4, perm_32); const int32x4_t avg = vec_sr(sum_32x4, div_shift); const int16x8_t vec_avg = vec_pack(avg, avg); const int16_t *src = (const int16_t *)src_ptr; do { vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, src), vec_avg), OFF_0, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, src), vec_avg), OFF_0 + CFL_LINE_1, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, src), vec_avg), OFF_0 + CFL_LINE_2, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, src), vec_avg), OFF_0 + CFL_LINE_3, dst); if (width >= 16) { vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, src), vec_avg), OFF_1, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, src), vec_avg), OFF_1 + CFL_LINE_1, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, src), vec_avg), OFF_1 + CFL_LINE_2, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, src), vec_avg), OFF_1 + CFL_LINE_3, dst); } if (width == 32) { vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, src), vec_avg), OFF_2, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, src), vec_avg), OFF_2 + CFL_LINE_1, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, src), vec_avg), OFF_2 + CFL_LINE_2, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, src), vec_avg), OFF_2 + CFL_LINE_3, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, src), vec_avg), OFF_3, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + 
CFL_LINE_1, src), vec_avg), OFF_3 + CFL_LINE_1, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, src), vec_avg), OFF_3 + CFL_LINE_2, dst); vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, src), vec_avg), OFF_3 + CFL_LINE_3, dst); } src += CFL_BUF_LINE * 4; dst += CFL_BUF_LINE * 4; } while (src < end); } // Declare wrappers for VSX sizes CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) // Based on observation, for small blocks VSX does not outperform C (no 64bit // load and store intrinsics). So we call the C code for block widths 4. extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { cfl_subtract_average_4x4_c, /* 4x4 */ cfl_subtract_average_8x8_vsx, /* 8x8 */ cfl_subtract_average_16x16_vsx, /* 16x16 */ cfl_subtract_average_32x32_vsx, /* 32x32 */ NULL, /* 64x64 (invalid CFL size) */ cfl_subtract_average_4x8_c, /* 4x8 */ cfl_subtract_average_8x4_vsx, /* 8x4 */ cfl_subtract_average_8x16_vsx, /* 8x16 */ cfl_subtract_average_16x8_vsx, /* 16x8 */ cfl_subtract_average_16x32_vsx, /* 16x32 */ cfl_subtract_average_32x16_vsx, /* 32x16 */ NULL, /* 32x64 (invalid CFL size) */ NULL, /* 64x32 (invalid CFL size) */ cfl_subtract_average_4x16_c, /* 4x16 */ cfl_subtract_average_16x4_vsx, /* 16x4 */ cfl_subtract_average_8x32_vsx, /* 8x32 */ cfl_subtract_average_32x8_vsx, /* 32x8 */ NULL, /* 16x64 (invalid CFL size) */ NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the function pointer array out of bounds. return sub_avg[tx_size % TX_SIZES_ALL]; } aom-3.12.1/av1/common/pred_common.c000066400000000000000000000457221477627663500170400ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/common.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/seg_common.h" // Returns a context number for the given MB prediction signal static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, const MACROBLOCKD *xd, int dir, MV_REFERENCE_FRAME ref_frame) { (void)xd; return ((ref_mbmi->ref_frame[0] == ref_frame || ref_mbmi->ref_frame[1] == ref_frame) ? 
av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) : SWITCHABLE_FILTERS); } int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx_offset = (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; assert(dir == 0 || dir == 1); const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; // Note: // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET; int left_type = SWITCHABLE_FILTERS; int above_type = SWITCHABLE_FILTERS; if (xd->left_available) left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame); if (xd->up_available) above_type = get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame); if (left_type == above_type) { filter_type_ctx += left_type; } else if (left_type == SWITCHABLE_FILTERS) { assert(above_type != SWITCHABLE_FILTERS); filter_type_ctx += above_type; } else if (above_type == SWITCHABLE_FILTERS) { assert(left_type != SWITCHABLE_FILTERS); filter_type_ctx += left_type; } else { filter_type_ctx += SWITCHABLE_FILTERS; } return filter_type_ctx; } static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) { // Do not add an already existing value if (*n > 0 && val == cache[*n - 1]) return; cache[(*n)++] = val; } int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache) { const int row = -xd->mb_to_top_edge >> 3; // Do not refer to above SB row when on SB boundary. const MB_MODE_INFO *const above_mi = (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int above_n = 0, left_n = 0; if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; if (above_n == 0 && left_n == 0) return 0; int above_idx = plane * PALETTE_MAX_SIZE; int left_idx = plane * PALETTE_MAX_SIZE; int n = 0; const uint16_t *above_colors = above_mi ? above_mi->palette_mode_info.palette_colors : NULL; const uint16_t *left_colors = left_mi ? left_mi->palette_mode_info.palette_colors : NULL; // Merge the sorted lists of base colors from above and left to get // combined sorted color cache. while (above_n > 0 && left_n > 0) { uint16_t v_above = above_colors[above_idx]; uint16_t v_left = left_colors[left_idx]; if (v_left < v_above) { palette_add_to_cache(cache, &n, v_left); ++left_idx, --left_n; } else { palette_add_to_cache(cache, &n, v_above); ++above_idx, --above_n; if (v_left == v_above) ++left_idx, --left_n; } } while (above_n-- > 0) { uint16_t val = above_colors[above_idx++]; palette_add_to_cache(cache, &n, val); } while (left_n-- > 0) { uint16_t val = left_colors[left_idx++]; palette_add_to_cache(cache, &n, val); } assert(n <= 2 * PALETTE_MAX_SIZE); return n; } // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. 
// 0 - inter/inter, inter/--, --/inter, --/-- // 1 - intra/inter, inter/intra // 2 - intra/--, --/intra // 3 - intra/intra int av1_get_intra_inter_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int has_above = xd->up_available; const int has_left = xd->left_available; if (has_above && has_left) { // both edges available const int above_intra = !is_inter_block(above_mbmi); const int left_intra = !is_inter_block(left_mbmi); return left_intra && above_intra ? 3 : left_intra || above_intra; } else if (has_above || has_left) { // one edge available return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi); } else { return 0; } } #define CHECK_BACKWARD_REFS(ref_frame) \ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) #define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) int av1_get_reference_mode_context(const MACROBLOCKD *xd) { int ctx; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int has_above = xd->up_available; const int has_left = xd->left_available; // Note: // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. if (has_above && has_left) { // both edges available if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) // neither edge uses comp pred (0/1) ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^ IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]); else if (!has_second_ref(above_mbmi)) // one of two edges uses comp pred (2/3) ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) || !is_inter_block(above_mbmi)); else if (!has_second_ref(left_mbmi)) // one of two edges uses comp pred (2/3) ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) || !is_inter_block(left_mbmi)); else // both edges use comp pred (4) ctx = 4; } else if (has_above || has_left) { // one edge available const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!has_second_ref(edge_mbmi)) // edge does not use comp pred (0/1) ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]); else // edge uses comp pred (3) ctx = 3; } else { // no edges available (1) ctx = 1; } assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); return ctx; } int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { int pred_context; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int above_in_image = xd->up_available; const int left_in_image = xd->left_available; if (above_in_image && left_in_image) { // both edges available const int above_intra = !is_inter_block(above_mbmi); const int left_intra = !is_inter_block(left_mbmi); if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter const MB_MODE_INFO *inter_mbmi = above_intra ? 
left_mbmi : above_mbmi; if (!has_second_ref(inter_mbmi)) // single pred pred_context = 2; else // comp pred pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi); } else { // inter/inter const int a_sg = !has_second_ref(above_mbmi); const int l_sg = !has_second_ref(left_mbmi); const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0]; const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; if (a_sg && l_sg) { // single/single pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ IS_BACKWARD_REF_FRAME(frfl))); } else if (l_sg || a_sg) { // single/comp const int uni_rfc = a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); if (!uni_rfc) // comp bidir pred_context = 1; else // comp unidir pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^ IS_BACKWARD_REF_FRAME(frfl))); } else { // comp/comp const int a_uni_rfc = has_uni_comp_refs(above_mbmi); const int l_uni_rfc = has_uni_comp_refs(left_mbmi); if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir pred_context = 0; else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir pred_context = 2; else // unidir/unidir pred_context = 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME))); } } } else if (above_in_image || left_in_image) { // one edge available const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi)) { // intra pred_context = 2; } else { // inter if (!has_second_ref(edge_mbmi)) // single pred pred_context = 2; else // comp pred pred_context = 4 * has_uni_comp_refs(edge_mbmi); } } else { // no edges available pred_context = 2; } assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS); return pred_context; } // Returns a context number for the given MB prediction signal // // Signal the uni-directional compound reference frame pair as either // (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN), // conditioning on the pair is known as uni-directional. // // 3 contexts: Voting is used to compare the count of forward references with // that of backward references from the spatial neighbors. int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of forward references (L, L2, L3, or G) const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; // Count of backward references (B or A) const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + ref_counts[ALTREF_FRAME]; const int pred_context = (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); return pred_context; } // Returns a context number for the given MB prediction signal // // Signal the uni-directional compound reference frame pair as // either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN), // conditioning on the pair is known as one of the above three. // // 3 contexts: Voting is used to compare the count of LAST2_FRAME with the // total count of LAST3/GOLDEN from the spatial neighbors. int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST2 const int last2_count = ref_counts[LAST2_FRAME]; // Count of LAST3 or GOLDEN const int last3_or_gld_count = ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; const int pred_context = (last2_count == last3_or_gld_count) ? 1 : ((last2_count < last3_or_gld_count) ? 
0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); return pred_context; } // Returns a context number for the given MB prediction signal // // Signal the uni-directional compound reference frame pair as // either (LAST, LAST3) or (LAST, GOLDEN), // conditioning on the pair is known as one of the above two. // // 3 contexts: Voting is used to compare the count of LAST3_FRAME with the // total count of GOLDEN_FRAME from the spatial neighbors. int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST3 const int last3_count = ref_counts[LAST3_FRAME]; // Count of GOLDEN const int gld_count = ref_counts[GOLDEN_FRAME]; const int pred_context = (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); return pred_context; } // == Common context functions for both comp and single ref == // // Obtain contexts to signal a reference frame to be either LAST/LAST2 or // LAST3/GOLDEN. static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST + LAST2 const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; // Count of LAST3 + GOLDEN const int last3_gld_count = ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; const int pred_context = (last_last2_count == last3_gld_count) ? 1 : ((last_last2_count < last3_gld_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // Obtain contexts to signal a reference frame to be either LAST or LAST2. static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST const int last_count = ref_counts[LAST_FRAME]; // Count of LAST2 const int last2_count = ref_counts[LAST2_FRAME]; const int pred_context = (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of LAST3 const int last3_count = ref_counts[LAST3_FRAME]; // Count of GOLDEN const int gld_count = ref_counts[GOLDEN_FRAME]; const int pred_context = (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or // ALTREF. static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) const int brfarf2_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; const int arf_count = ref_counts[ALTREF_FRAME]; const int pred_context = (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. 
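// Editor's aside (not part of upstream libaom): each of the voting helpers in
// this block maps a pair of neighbour reference counts onto the same 3-way
// context, so the shared pattern can be stated once. The helper name below is
// invented for the illustration.
static inline int sketch_ref_vote_context(int count_a, int count_b) {
  // 0: the first group is rarer among the spatial neighbours, 1: tie,
  // 2: the first group is more common -- the ternary used by every helper here.
  return (count_a == count_b) ? 1 : ((count_a < count_b) ? 0 : 2);
}
// The BWDREF-vs-ALTREF2 helper below is one more instance of that pattern.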
static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of BWDREF frames (B) const int brf_count = ref_counts[BWDREF_FRAME]; // Count of ALTREF2 frames (A2) const int arf2_count = ref_counts[ALTREF2_FRAME]; const int pred_context = (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // == Context functions for comp ref == // // Returns a context number for the given MB prediction signal // Signal the first reference frame for a compound mode be either // GOLDEN/LAST3, or LAST/LAST2. int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { return get_pred_context_ll2_or_l3gld(xd); } // Returns a context number for the given MB prediction signal // Signal the first reference frame for a compound mode be LAST, // conditioning on that it is known either LAST/LAST2. int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { return get_pred_context_last_or_last2(xd); } // Returns a context number for the given MB prediction signal // Signal the first reference frame for a compound mode be GOLDEN, // conditioning on that it is known either GOLDEN or LAST3. int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { return get_pred_context_last3_or_gld(xd); } // Signal the 2nd reference frame for a compound mode be either // ALTREF, or ALTREF2/BWDREF. int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { return get_pred_context_brfarf2_or_arf(xd); } // Signal the 2nd reference frame for a compound mode be either // ALTREF2 or BWDREF. int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { return get_pred_context_brf_or_arf2(xd); } // == Context functions for single ref == // // For the bit to signal whether the single reference is a forward reference // frame or a backward reference frame. int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; // Count of forward reference frames const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; // Count of backward reference frames const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + ref_counts[ALTREF_FRAME]; const int pred_context = (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2); assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } // For the bit to signal whether the single reference is ALTREF_FRAME or // non-ALTREF backward reference frame, knowing that it shall be either of // these 2 choices. int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { return get_pred_context_brfarf2_or_arf(xd); } // For the bit to signal whether the single reference is LAST3/GOLDEN or // LAST2/LAST, knowing that it shall be either of these 2 choices. int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { return get_pred_context_ll2_or_l3gld(xd); } // For the bit to signal whether the single reference is LAST2_FRAME or // LAST_FRAME, knowing that it shall be either of these 2 choices. int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { return get_pred_context_last_or_last2(xd); } // For the bit to signal whether the single reference is GOLDEN_FRAME or // LAST3_FRAME, knowing that it shall be either of these 2 choices. 
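// Editor's sketch (not part of upstream libaom): taken together, p1..p6 are
// the contexts for the bits of a fixed decision tree over the single
// reference frames: p1 splits forward vs backward, p3/p4/p5 refine the
// forward side (LAST/LAST2 vs LAST3/GOLDEN, then the leaf), and p2/p6 refine
// the backward side (ALTREF vs the rest, then BWDREF vs ALTREF2). The helper
// below is invented only to show how many of those bits each frame needs.
static inline int sketch_single_ref_bits(MV_REFERENCE_FRAME ref) {
  if (ref == ALTREF_FRAME) return 2;  // p1, then p2
  return 3;  // forward frames: p1, p3, then p4 or p5; BWDREF/ALTREF2: p1, p2, p6
}
// The GOLDEN-vs-LAST3 leaf is the p5 context defined next.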
int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { return get_pred_context_last3_or_gld(xd); } // For the bit to signal whether the single reference is ALTREF2_FRAME or // BWDREF_FRAME, knowing that it shall be either of these 2 choices. int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) { return get_pred_context_brf_or_arf2(xd); } aom-3.12.1/av1/common/pred_common.h000066400000000000000000000322411477627663500170350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_PRED_COMMON_H_ #define AOM_AV1_COMMON_PRED_COMMON_H_ #include #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus extern "C" { #endif static inline uint8_t get_segment_id( const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * mi_params->mi_cols + mi_col; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); const int seg_stride = mi_params->mi_cols; uint8_t segment_id = MAX_SEGMENTS; for (int y = 0; y < ymis; ++y) { for (int x = 0; x < xmis; ++x) { segment_id = AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]); } } assert(segment_id < MAX_SEGMENTS); return segment_id; } static inline uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, int *cdf_index, int skip_over4x4) { const int step_size = skip_over4x4 ? 2 : 1; uint8_t prev_ul = UINT8_MAX; // top left segment_id uint8_t prev_l = UINT8_MAX; // left segment_id uint8_t prev_u = UINT8_MAX; // top segment_id const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const CommonModeInfoParams *const mi_params = &cm->mi_params; const uint8_t *seg_map = cm->cur_frame->seg_map; if ((xd->up_available) && (xd->left_available)) { prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, mi_col - step_size); } if (xd->up_available) { prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, mi_col - 0); } if (xd->left_available) { prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - step_size); } assert(IMPLIES(prev_ul != UINT8_MAX, prev_u != UINT8_MAX && prev_l != UINT8_MAX)); // Pick CDF index based on number of matching/out-of-bounds segment IDs. if (prev_ul == UINT8_MAX) /* Edge cases */ *cdf_index = 0; else if ((prev_ul == prev_u) && (prev_ul == prev_l)) *cdf_index = 2; else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l)) *cdf_index = 1; else *cdf_index = 0; // If 2 or more are identical returns that as predictor, otherwise prev_l. if (prev_u == UINT8_MAX) // edge case return prev_l == UINT8_MAX ? 0 : prev_l; if (prev_l == UINT8_MAX) // edge case return prev_u; return (prev_ul == prev_u) ? 
prev_u : prev_l; } static inline uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0; const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0; return above_sip + left_sip; } static inline int get_comp_index_context(const AV1_COMMON *cm, const MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); int bck_frame_index = 0, fwd_frame_index = 0; int cur_frame_index = cm->cur_frame->order_hint; if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)); int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)); const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int above_ctx = 0, left_ctx = 0; const int offset = (fwd == bck); if (above_mi != NULL) { if (has_second_ref(above_mi)) above_ctx = above_mi->compound_idx; else if (above_mi->ref_frame[0] == ALTREF_FRAME) above_ctx = 1; } if (left_mi != NULL) { if (has_second_ref(left_mi)) left_ctx = left_mi->compound_idx; else if (left_mi->ref_frame[0] == ALTREF_FRAME) left_ctx = 1; } return above_ctx + left_ctx + 3 * offset; } static inline int get_comp_group_idx_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int above_ctx = 0, left_ctx = 0; if (above_mi) { if (has_second_ref(above_mi)) above_ctx = above_mi->comp_group_idx; else if (above_mi->ref_frame[0] == ALTREF_FRAME) above_ctx = 3; } if (left_mi) { if (has_second_ref(left_mi)) left_ctx = left_mi->comp_group_idx; else if (left_mi->ref_frame[0] == ALTREF_FRAME) left_ctx = 3; } return AOMMIN(5, above_ctx + left_ctx); } static inline aom_cdf_prob *av1_get_pred_cdf_seg_id( struct segmentation_probs *segp, const MACROBLOCKD *xd) { return segp->pred_cdf[av1_get_pred_context_seg_id(xd)]; } static inline int av1_get_skip_mode_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_skip_mode = above_mi ? above_mi->skip_mode : 0; const int left_skip_mode = left_mi ? left_mi->skip_mode : 0; return above_skip_mode + left_skip_mode; } static inline int av1_get_skip_txfm_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0; const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0; return above_skip_txfm + left_skip_txfm; } int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir); // Get a list of palette base colors that are used in the above and left blocks, // referred to as "color cache". The return value is the number of colors in the // cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache" // in ascending order. 
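// Editor's sketch (not part of upstream libaom): the color cache is the
// duplicate-free sorted merge of the above and left palettes, e.g. above
// {3, 9, 14} merged with left {3, 10} gives {3, 9, 10, 14} and a count of 4.
// A standalone version of that merge (names invented) looks like:
static inline int sketch_merge_palettes(const uint16_t *a, int na,
                                        const uint16_t *b, int nb,
                                        uint16_t *out) {
  int n = 0, ia = 0, ib = 0;
  while (ia < na || ib < nb) {
    uint16_t v;
    if (ib == nb || (ia < na && a[ia] <= b[ib])) {
      v = a[ia++];
      if (ib < nb && b[ib] == v) ++ib;  // skip the duplicate on the other side
    } else {
      v = b[ib++];
    }
    if (n == 0 || out[n - 1] != v) out[n++] = v;  // drop repeated values
  }
  return n;  // number of cached colors, at most na + nb
}
// The actual cache builder, declared next, additionally handles missing
// neighbours and the per-plane palette sizes.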
int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, uint16_t *cache); static inline int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; } static inline int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; int ctx = 0; if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); return ctx; } int av1_get_intra_inter_context(const MACROBLOCKD *xd); int av1_get_reference_mode_context(const MACROBLOCKD *xd); static inline aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; } static inline aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) { return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)]; } int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); // == Uni-directional contexts == int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); static inline aom_cdf_prob *av1_get_comp_reference_type_cdf( const MACROBLOCKD *xd) { const int pred_context = av1_get_comp_reference_type_context(xd); return xd->tile_ctx->comp_ref_type_cdf[pred_context]; } static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; } static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; } static inline aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; } // == Bi-directional contexts == int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][0]; } static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p1(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][1]; } static inline aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_ref_p2(xd); return xd->tile_ctx->comp_ref_cdf[pred_context][2]; } static inline aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; } static inline aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( const MACROBLOCKD *xd) { const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd); return xd->tile_ctx->comp_bwdref_cdf[pred_context][1]; } // == Single contexts == int 
av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd); int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd); static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0]; } static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1]; } static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2]; } static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3]; } static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4]; } static inline aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( const MACROBLOCKD *xd) { return xd->tile_ctx ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5]; } // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. static inline int get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize]; const int max_tx_wide = tx_size_wide[max_tx_size]; const int max_tx_high = tx_size_high[max_tx_size]; const int has_above = xd->up_available; const int has_left = xd->left_available; int above = xd->above_txfm_context[0] >= max_tx_wide; int left = xd->left_txfm_context[0] >= max_tx_high; if (has_above) if (is_inter_block(above_mbmi)) above = block_size_wide[above_mbmi->bsize] >= max_tx_wide; if (has_left) if (is_inter_block(left_mbmi)) left = block_size_high[left_mbmi->bsize] >= max_tx_high; if (has_above && has_left) return (above + left); else if (has_above) return above; else if (has_left) return left; else return 0; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_PRED_COMMON_H_ aom-3.12.1/av1/common/quant_common.c000066400000000000000000034454661477627663500172520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "config/aom_config.h" #include "aom/aom_frame_buffer.h" #include "aom_scale/yv12config.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/filter.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, }; static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, }; static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, 1444, 1469, 
1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, }; static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372, 
5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, }; static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, }; // Coefficient scaling and quantization with AV1 TX are tailored to // the AV1 TX transforms. Regardless of the bit-depth of the input, // the transform stages scale the coefficient values up by a factor of // 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit // input, the coefficients have effectively 11 bits of scale depth // (8+3), 10-bit input pixels result in 13-bit coefficient depth // (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. // All quantizers are built using this invariant of x8, 3-bit scaling, // thus the Q3 suffix. // A partial exception to this rule is large transforms; to avoid // overflow, TX blocks with > 256 pels (>16x16) are scaled only // 4-times unity (2 bits) over the pixel depth, and TX blocks with // over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). // This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 // and 32x32 transforms actually return Q2 coefficients, and 32x64, // 64x32 and 64x64 transforms return Q1 coefficients. However, the // quantizers are de-scaled down on-the-fly by the same amount // (av1_tx_get_scale()) during quantization, and as such the // dequantized/decoded coefficients, even for large TX blocks, are always // effectively Q3. Meanwhile, quantized/coded coefficients are Q0 // because Qn quantizers are applied to Qn tx coefficients. // Note that encoder decision making (which uses the quantizer to // generate several bespoke lamdas for RDO and other heuristics) // expects quantizers to be larger for higher-bitdepth input. In // addition, the minimum allowable quantizer is 4; smaller values will // underflow to 0 in the actual quantization routines. 
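// Editor's worked example (not part of upstream libaom): restating the
// per-size descaling described above as code. The helper name is invented;
// tx_size_2d[] is the pixel-count table already used by av1_qm_init() below.
static inline int sketch_tx_descale_bits(TX_SIZE tx_size) {
  const int pels = tx_size_2d[tx_size];
  // <= 256 pels: full x8 (Q3) scaling; > 256 pels: x4 (Q2); > 1024 pels: x2
  // (Q1), so 16x16 keeps 3 bits of headroom, 32x32 keeps 2 and 64x64 keeps 1.
  return (pels > 256) + (pels > 1024);
}
// av1_dc_quant_QTX() and av1_ac_quant_QTX() below simply return the table
// values; the descaling happens later, on the fly during (de)quantization, as
// the comment above notes.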
int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { case AOM_BITS_8: return dc_qlookup_QTX[q_clamped]; case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped]; case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { const int q_clamped = clamp(qindex + delta, 0, MAXQ); switch (bit_depth) { case AOM_BITS_8: return ac_qlookup_QTX[q_clamped]; case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped]; case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped]; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); const int seg_qindex = base_qindex + data; return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } } bool av1_use_qmatrix(const CommonQuantParams *quant_params, const struct macroblockd *xd, int segment_id) { // True if explicit Q matrix levels and this is not a lossless segment. return quant_params->using_qmatrix && !xd->lossless[segment_id]; } // Returns true if the tx_type corresponds to non-identity transform in both // horizontal and vertical directions. static inline bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const MB_MODE_INFO *const mbmi = xd->mi[0]; const int seg_id = mbmi->segment_id; const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms return is_2d_transform(tx_type) ? pd->seg_iqmatrix[seg_id][qm_tx_size] : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const MB_MODE_INFO *const mbmi = xd->mi[0]; const int seg_id = mbmi->segment_id; const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms return is_2d_transform(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size] : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } #if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER #define QM_TOTAL_SIZE 3344 // We only use wt_matrix_ref[q] and iwt_matrix_ref[q] // for q = 0, ..., NUM_QM_LEVELS - 2. 
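// Editor's note (not part of upstream libaom): QM_TOTAL_SIZE is the storage
// needed for one QM level of one plane class, i.e. the sum of tx_size_2d[]
// over the transform sizes that are their own av1_get_adjusted_tx_size();
// the 64-wide/-tall sizes instead reuse their adjusted size's matrices in
// av1_qm_init(). A sketch of that accounting (helper name invented, expected
// to evaluate to 3344):
static inline int sketch_qm_storage_per_plane(void) {
  int total = 0;
  for (int t = 0; t < TX_SIZES_ALL; ++t) {
    if (av1_get_adjusted_tx_size((TX_SIZE)t) == (TX_SIZE)t)
      total += tx_size_2d[t];  // one run of tx_size_2d[t] values per stored size
  }
  return total;
}
// The reference matrices below are packed with exactly that layout.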
static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; #endif void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { #if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER for (int q = 0; q < NUM_QM_LEVELS; ++q) { for (int c = 0; c < num_planes; ++c) { int current = 0; for (int t = 0; t < TX_SIZES_ALL; ++t) { const int size = tx_size_2d[t]; const int qm_tx_size = av1_get_adjusted_tx_size(t); if (q == NUM_QM_LEVELS - 1) { quant_params->gqmatrix[q][c][t] = NULL; quant_params->giqmatrix[q][c][t] = NULL; } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' assert(t > qm_tx_size); quant_params->gqmatrix[q][c][t] = quant_params->gqmatrix[q][c][qm_tx_size]; quant_params->giqmatrix[q][c][t] = quant_params->giqmatrix[q][c][qm_tx_size]; } else { assert(current + size <= QM_TOTAL_SIZE); quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; quant_params->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current]; current += size; } } } } #else (void)quant_params; (void)num_planes; #endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER } /* Provide 15 sets of quantization matrices for chroma and luma and each TX size. Matrices for different TX sizes are in fact sub-sampled from the 32x32 and 16x16 sizes, but explicitly defined here for convenience. Intra and inter matrix sets are the same but changing DEFAULT_QM_INTER_OFFSET from zero allows for different matrices for inter and intra blocks in the same frame. Matrices for different QM levels have been rescaled in the frequency domain according to different nominal viewing distances. Matrices for QM level 15 are omitted because they are not used. */ #if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, /* Size 8x8 */ 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38, 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63, 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89, 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220, /* Size 16x16 */ 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31, 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32, 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35, 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48, 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63, 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71, 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92, 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96, 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100, 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119, 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220, 231, /* Size 32x32 */ 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71, 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 
115, 119, 123, 31, 32, 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77, 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32, 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83, 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35, 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91, 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88, 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98, 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102, 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110, 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117, 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131, 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133, 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137, 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142, 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148, 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197, 204, 104, 99, 97, 94, 93, 93, 90, 
92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230, 232, 242, /* Size 4x8 */ 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75, 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190, /* Size 8x4 */ 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190, /* Size 8x16 */ 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86, 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99, 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, /* Size 16x8 */ 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203, /* Size 16x32 */ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 
105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, /* Size 32x16 */ 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217, /* Size 4x16 */ 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 
115, 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, /* Size 16x4 */ 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197, /* Size 8x32 */ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, /* Size 32x8 */ 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204 }, { /* Chroma */ /* Size 4x4 */ 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, /* Size 8x8 */ 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46, 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72, 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95, 104, 107, 71, 67, 68, 75, 84, 95, 107, 113, /* Size 16x16 */ 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32, 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45, 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49, 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67, 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 
71, 71, 74, 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78, 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83, 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58, 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, 88, 93, 98, 104, 109, 112, 116, /* Size 32x32 */ 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33, 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44, 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46, 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50, 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 36, 38, 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69, 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51, 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70, 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74, 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69, 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57, 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92, 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94, 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95, 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 98, 99, 
100, 98, 99, 98, 98, 98, 97, 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63, 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118, /* Size 4x8 */ 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54, 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105, /* Size 8x4 */ 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105, /* Size 8x16 */ 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, 104, 106, 109, /* Size 16x8 */ 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109, /* Size 16x32 */ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 
48, 52, 54, 55, 59, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, /* Size 32x16 */ 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113, /* Size 4x16 */ 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, /* Size 16x4 */ 31, 49, 
63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107, /* Size 8x32 */ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63, 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, 108, /* Size 32x8 */ 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108 }, }, { { /* Luma */ /* Size 4x4 */ 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184, /* Size 8x8 */ 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39, 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71, 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93, 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201, /* Size 16x16 */ 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31, 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33, 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40, 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70, 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97, 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 
109, 119, 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104, 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102, 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98, 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210, /* Size 32x32 */ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65, 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76, 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82, 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 56, 59, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79, 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90, 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100, 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79, 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110, 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113, 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118, 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122, 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70, 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165, 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 
97, 100, 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169, 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174, 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176, 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211, 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219, /* Size 4x8 */ 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64, 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177, /* Size 8x4 */ 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177, /* Size 8x16 */ 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87, 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, /* Size 16x8 */ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188, /* Size 16x32 */ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 
101, 105, 108, 111, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, 186, 192, 193, 201, /* Size 32x16 */ 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 
188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166, 189, 190, 201, /* Size 4x16 */ 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, /* Size 16x4 */ 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183, /* Size 8x32 */ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190, /* Size 32x8 */ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111, 127, 145, 166, 190 }, { /* Chroma */ /* Size 4x4 */ 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, /* Size 8x8 */ 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47, 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67, 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92, 101, 103, 69, 65, 66, 73, 82, 92, 103, 109, /* Size 16x16 */ 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31, 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44, 47, 46, 
46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48, 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54, 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61, 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76, 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 80, 80, 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58, 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, 85, 90, 95, 100, 105, 108, 111, /* Size 32x32 */ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66, 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58, 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64, 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68, 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72, 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56, 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88, 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 
91, 93, 91, 91, 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92, 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104, 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106, 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108, 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, /* Size 4x8 */ 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52, 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102, /* Size 8x4 */ 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102, /* Size 8x16 */ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, 101, 103, 105, /* Size 16x8 */ 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105, /* Size 16x32 */ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 
56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, /* Size 32x16 */ 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109, /* Size 4x16 */ 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 
49, 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, /* Size 16x4 */ 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103, /* Size 8x32 */ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, /* Size 32x8 */ 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105 }, }, { { /* Luma */ /* Size 4x4 */ 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, /* Size 8x8 */ 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37, 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64, 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87, 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184, /* Size 16x16 */ 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32, 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40, 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53, 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91, 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103, 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 
118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125, 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148, 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141, 152, 163, 177, 184, 191, /* Size 32x32 */ 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59, 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68, 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81, 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37, 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89, 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50, 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92, 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41, 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91, 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99, 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105, 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112, 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130, 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134, 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136, 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 142, 138, 143, 140, 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 
110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148, 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151, 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, /* Size 4x8 */ 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58, 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165, /* Size 8x4 */ 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165, /* Size 8x16 */ 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163, 169, 175, /* Size 16x8 */ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175, /* Size 16x32 */ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 
61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, /* Size 32x16 */ 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79, 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 
146, 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187, /* Size 4x16 */ 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, /* Size 16x4 */ 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170, /* Size 8x32 */ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, /* Size 32x8 */ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104, 118, 135, 155, 176 }, { /* Chroma */ /* Size 4x4 */ 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, /* Size 8x8 */ 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47, 47, 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65, 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89, 97, 99, 67, 63, 64, 71, 79, 89, 99, 104, /* Size 16x16 */ 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31, 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 
63, 65, 36, 39, 43, 47, 47, 46, 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51, 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60, 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68, 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74, 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77, 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55, 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, 87, 92, 96, 101, 104, 106, /* Size 32x32 */ 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31, 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60, 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42, 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45, 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64, 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36, 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63, 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67, 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70, 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63, 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69, 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, 77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53, 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84, 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88, 87, 88, 86, 87, 87, 85, 
63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89, 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108, /* Size 4x8 */ 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50, 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99, /* Size 8x4 */ 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99, /* Size 8x16 */ 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, 100, 102, /* Size 16x8 */ 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102, /* Size 16x32 */ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64, 65, 
66, 67, 68, 69, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, /* Size 32x16 */ 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105, /* Size 4x16 */ 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50, 54, 60, 
64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, /* Size 16x4 */ 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100, /* Size 8x32 */ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, /* Size 32x8 */ 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101 }, }, { { /* Luma */ /* Size 4x4 */ 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, /* Size 8x8 */ 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36, 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51, 60, 73, 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92, 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168, /* Size 16x16 */ 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32, 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33, 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38, 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50, 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64, 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83, 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100, 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110, 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117, 117, 121, 65, 61, 59, 58, 63, 68, 75, 
84, 92, 98, 105, 114, 120, 125, 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162, 168, 174, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56, 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68, 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77, 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86, 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60, 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73, 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86, 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97, 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 106, 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106, 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112, 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118, 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120, 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127, 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 
118, 125, 128, 134, 136, 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, /* Size 4x8 */ 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57, 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154, /* Size 8x4 */ 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154, /* Size 8x16 */ 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157, 163, /* Size 16x8 */ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157, 97, 88, 86, 97, 111, 128, 147, 163, /* Size 16x32 */ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, 47, 
45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, 161, 162, 166, 167, 173, /* Size 32x16 */ 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144, 163, 163, 173, /* Size 4x16 */ 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35, 36, 
38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, /* Size 16x4 */ 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159, /* Size 8x32 */ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163, 163, /* Size 32x8 */ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, 163 }, { /* Chroma */ /* Size 4x4 */ 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, /* Size 8x8 */ 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47, 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62, 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86, 91, 95, 65, 61, 62, 68, 76, 86, 95, 100, /* Size 16x16 */ 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31, 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48, 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50, 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64, 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 
70, 72, 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75, 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53, 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, 93, 97, 100, 102, /* Size 32x32 */ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57, 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60, 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62, 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54, 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65, 66, 66, 67, 67, 67, 68, 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66, 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51, 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78, 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83, 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86, 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 
71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, 101, 104, /* Size 4x8 */ 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50, 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95, /* Size 8x4 */ 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95, /* Size 8x16 */ 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, 97, 98, /* Size 16x8 */ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98, /* Size 16x32 */ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 
66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99, 99, 102, /* Size 32x16 */ 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98, 98, 102, /* Size 4x16 */ 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, /* Size 16x4 */ 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 
63, 66, 81, 97, /* Size 8x32 */ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, /* Size 32x8 */ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98 }, }, { { /* Luma */ /* Size 4x4 */ 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, /* Size 8x8 */ 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35, 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65, 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86, 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153, /* Size 16x16 */ 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32, 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37, 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60, 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74, 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91, 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101, 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112, 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66, 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 
148, 153, 159, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52, 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62, 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75, 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80, 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32, 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53, 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68, 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38, 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91, 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 61, 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75, 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106, 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115, 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 
120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, /* Size 4x8 */ 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50, 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144, /* Size 8x4 */ 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144, /* Size 8x16 */ 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151, /* Size 16x8 */ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91, 82, 80, 90, 103, 119, 137, 151, /* Size 16x32 */ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 
85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, 155, 160, /* Size 32x16 */ 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160, /* Size 4x16 */ 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, /* Size 16x4 */ 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148, /* Size 8x32 */ 32, 31, 31, 31, 
31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79, 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152, /* Size 32x8 */ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152 }, { /* Chroma */ /* Size 4x4 */ 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, /* Size 8x8 */ 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45, 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58, 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82, 89, 92, 64, 59, 60, 66, 74, 83, 92, 96, /* Size 16x16 */ 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31, 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39, 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47, 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48, 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68, 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72, 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51, 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, 89, 94, 96, 98, 
/* Size 32x32 */ 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31, 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39, 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61, 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34, 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61, 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62, 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73, 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83, 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78, 
79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, /* Size 4x8 */ 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47, 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93, /* Size 8x4 */ 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93, /* Size 8x16 */ 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, 94, 95, /* Size 16x8 */ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95, /* Size 16x32 */ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 
89, 89, 90, 91, 91, 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96, 96, 98, /* Size 32x16 */ 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95, 95, 98, /* Size 4x16 */ 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, /* Size 16x4 */ 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94, /* Size 8x32 */ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 65, 65, 51, 
49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, /* Size 32x8 */ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95 }, }, { { /* Luma */ /* Size 4x4 */ 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, /* Size 8x8 */ 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35, 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61, 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90, 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140, /* Size 16x16 */ 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32, 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32, 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35, 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40, 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51, 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64, 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78, 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54, 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, 81, 86, 92, 99, 106, 113, 121, 128, 137, 140, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 
32, 32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 
83, 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, /* Size 4x8 */ 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50, 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136, /* Size 8x4 */ 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136, /* Size 8x16 */ 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, /* Size 16x8 */ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, /* Size 16x32 */ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, 79, 79, 84, 84, 
89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, /* Size 32x16 */ 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96, 109, 109, 124, 124, 141, 141, 149, /* Size 4x16 */ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, /* Size 16x4 */ 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136, /* Size 8x32 */ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 
104, 106, 106, 109, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133, 136, 136, 141, /* Size 32x8 */ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141 }, { /* Chroma */ /* Size 4x4 */ 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, /* Size 8x8 */ 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45, 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56, 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75, 82, 86, 61, 57, 58, 64, 71, 79, 86, 91, /* Size 16x16 */ 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31, 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35, 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45, 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46, 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50, 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61, 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68, 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50, 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, 83, 86, 90, 91, /* Size 32x32 */ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 
59, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, /* Size 4x8 */ 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47, 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90, /* Size 8x4 */ 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 
46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90, /* Size 8x16 */ 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, /* Size 16x8 */ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90, /* Size 16x32 */ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, /* Size 32x16 */ 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 
46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92, 92, 95, /* Size 4x16 */ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, /* Size 16x4 */ 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90, /* Size 8x32 */ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, /* Size 32x8 */ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 
37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92 }, }, { { /* Luma */ /* Size 4x4 */ 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, /* Size 8x8 */ 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33, 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54, 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89, 98, 108, 69, 65, 64, 73, 85, 97, 108, 119, /* Size 16x16 */ 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32, 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32, 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35, 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39, 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59, 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71, 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86, 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46, 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, 91, 96, 104, 110, 118, 125, 134, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37, 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 
45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72, 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50, 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97, 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, 134, 134, /* Size 4x8 */ 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42, 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111, /* Size 8x4 */ 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111, /* Size 8x16 */ 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 
79, 83, 90, 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127, /* Size 16x8 */ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133, /* Size 32x16 */ 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, 47, 48, 
52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, /* Size 4x16 */ 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, /* Size 16x4 */ 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, /* Size 32x8 */ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 
45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127 }, { /* Chroma */ /* Size 4x4 */ 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, /* Size 8x8 */ 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42, 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53, 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69, 73, 77, 57, 54, 52, 58, 65, 72, 77, 82, /* Size 16x16 */ 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31, 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35, 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45, 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47, 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53, 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58, 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65, 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48, 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, 78, 82, 85, 89, /* Size 32x32 */ 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49, 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31, 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35, 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32, 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58, 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53, 53, 
53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72, 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, /* Size 4x8 */ 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45, 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79, /* Size 8x4 */ 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79, /* Size 8x16 */ 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, 82, 86, /* Size 16x8 */ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 
47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86, /* Size 16x32 */ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, /* Size 32x16 */ 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46, 46, 50, 54, 55, 59, 60, 
64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, /* Size 4x16 */ 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, /* Size 16x4 */ 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82, /* Size 8x32 */ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, /* Size 32x8 */ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86 }, }, { { /* Luma */ /* Size 4x4 */ 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, /* Size 8x8 */ 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32, 35, 37, 40, 45, 
49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47, 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75, 82, 92, 63, 59, 58, 65, 73, 84, 92, 105, /* Size 16x16 */ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32, 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32, 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33, 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41, 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51, 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63, 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42, 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, 92, 97, 101, 105, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46, 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51, 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53, 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63, 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, 41, 40, 40, 39, 38, 
38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84, 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, 109, 114, /* Size 4x8 */ 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41, 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97, /* Size 8x4 */ 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97, /* Size 8x16 */ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, 100, 105, /* Size 16x8 */ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 
32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, /* Size 32x16 */ 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60, 59, 58, 
58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109, /* Size 4x16 */ 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, /* Size 16x4 */ 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, /* Size 32x8 */ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109 }, { /* Chroma */ /* Size 4x4 */ 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71, /* Size 8x8 */ 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40, 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51, 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63, 66, 70, 55, 52, 50, 54, 60, 66, 70, 76, /* Size 16x16 */ 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31, 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34, 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42, 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47, 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46, 
45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50, 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55, 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, 70, 73, 74, 76, /* Size 32x32 */ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48, 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49, 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53, 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66, 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 
54, 54, 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, /* Size 4x8 */ 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45, 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73, /* Size 8x4 */ 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73, /* Size 8x16 */ 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, 74, 76, /* Size 16x8 */ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76, /* Size 16x32 */ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, 53, 53, 53, 
53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, /* Size 32x16 */ 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74, 78, 78, /* Size 4x16 */ 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, /* Size 16x4 */ 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 
55, 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73, /* Size 8x32 */ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, /* Size 32x8 */ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, /* Size 8x8 */ 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32, 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42, 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65, 71, 77, 53, 50, 51, 55, 61, 70, 77, 85, /* Size 16x16 */ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 
87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41, 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46, 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52, 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52, 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70, 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, 53, 51, 51, 
51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, /* Size 4x8 */ 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36, 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83, /* Size 8x4 */ 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83, /* Size 8x16 */ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, /* Size 16x8 */ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 
45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, /* Size 32x16 */ 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, /* Size 4x16 */ 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, /* Size 16x4 */ 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 
40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, /* Size 32x8 */ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87 }, { /* Chroma */ /* Size 4x4 */ 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, /* Size 8x8 */ 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36, 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50, 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59, 61, 64, 51, 48, 48, 51, 54, 60, 64, 68, /* Size 16x16 */ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31, 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32, 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47, 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, /* Size 32x32 */ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49, 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47, 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34, 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50, 52, 52, 
30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31, 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47, 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 
64, 65, 66, 68, 68, 69, 71, 71, /* Size 4x8 */ 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46, 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67, /* Size 8x4 */ 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67, /* Size 8x16 */ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, /* Size 16x8 */ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68, /* Size 16x32 */ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69, 71, 71, /* Size 32x16 */ 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31, 31, 32, 36, 38, 39, 46, 
47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, /* Size 4x16 */ 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, /* Size 16x4 */ 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68, /* Size 8x32 */ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, /* Size 32x8 */ 
32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, /* Size 8x8 */ 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32, 32, 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37, 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56, 63, 67, 47, 44, 45, 46, 52, 59, 67, 71, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, 35, 35, 36, 
36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59, 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, /* Size 4x8 */ 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34, 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67, /* Size 8x4 */ 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67, /* Size 8x16 */ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 
35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, /* Size 16x8 */ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76, 76, 79, /* Size 32x16 */ 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 
42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69, 73, 79, /* Size 4x16 */ 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, /* Size 16x4 */ 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, /* Size 32x8 */ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, 35, 34, 36, 38, 
45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73 }, { /* Chroma */ /* Size 4x4 */ 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, /* Size 8x8 */ 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35, 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47, 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55, 58, 60, 49, 46, 46, 46, 50, 55, 60, 61, /* Size 16x16 */ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31, 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47, 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, /* Size 32x32 */ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43, 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48, 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 
42, 43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, /* Size 4x8 */ 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46, 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59, /* Size 8x4 */ 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59, /* Size 8x16 */ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, /* Size 16x8 */ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49, 
46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, /* Size 16x32 */ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64, 64, 65, /* Size 32x16 */ 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 
47, 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60, 62, 65, /* Size 4x16 */ 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, /* Size 16x4 */ 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59, /* Size 8x32 */ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, /* Size 32x8 */ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, /* Size 8x8 */ 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 
34, 36, 39, 32, 32, 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35, 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46, 51, 54, 41, 39, 40, 41, 44, 49, 54, 58, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 
44, 45, 45, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49, 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, /* Size 4x8 */ 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34, 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56, /* Size 8x4 */ 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56, /* Size 8x16 */ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, /* Size 16x8 */ 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, /* Size 32x16 */ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 
42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, /* Size 4x16 */ 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, /* Size 16x4 */ 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, /* Size 32x8 */ 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63 }, { /* Chroma */ /* Size 4x4 */ 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, /* Size 8x8 */ 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33, 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45, 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51, 53, 54, 48, 46, 45, 46, 47, 51, 54, 56, /* Size 16x16 */ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 
46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39, 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 
50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, /* Size 4x8 */ 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42, 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55, /* Size 8x4 */ 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55, /* Size 8x16 */ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, /* Size 16x8 */ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 
48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, /* Size 32x16 */ 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, /* Size 4x16 */ 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, /* Size 16x4 */ 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 
46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, /* Size 32x8 */ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, /* Size 8x8 */ 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34, 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38, 39, 42, 35, 35, 34, 36, 38, 40, 42, 48, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 36, 35, 35, 
35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40, 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, 35, 35, 35, 35, 35, 35, 34, 
34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, /* Size 4x8 */ 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32, 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48, /* Size 8x4 */ 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, /* Size 16x8 */ 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 
35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49, 49, 49, /* Size 4x16 */ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, /* Size 16x4 */ 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 
34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, /* Size 32x8 */ 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49 }, { /* Chroma */ /* Size 4x4 */ 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, /* Size 8x8 */ 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31, 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42, 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48, 48, 50, 48, 47, 46, 47, 47, 49, 50, 53, /* Size 16x16 */ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31, 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36, 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 
53, 53, 53, 53, /* Size 4x8 */ 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38, 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53, /* Size 8x4 */ 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53, /* Size 8x16 */ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, /* Size 16x8 */ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 
45, 48, 48, 48, 31, 31, 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53, 53, 53, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, /* Size 16x4 */ 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, /* Size 32x8 */ 32, 31, 31, 33, 
37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53 }, }, { { /* Luma */ /* Size 4x4 */ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, /* Size 8x8 */ 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34, 35, 36, 33, 33, 33, 33, 35, 35, 36, 38, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, /* Size 4x8 */ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36, /* Size 8x4 */ 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, /* Size 16x8 */ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 
31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, /* Size 4x16 */ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, /* Size 16x4 */ 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, /* Size 32x8 */ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 
32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38 }, { /* Chroma */ /* Size 4x4 */ 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, /* Size 8x8 */ 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35, 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44, 47, 47, 40, 41, 41, 42, 44, 45, 47, 48, /* Size 16x16 */ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 
32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45, 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, /* Size 4x8 */ 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36, 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47, /* Size 8x4 */ 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47, /* Size 8x16 */ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, /* Size 16x8 */ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 
43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, 36, 
36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, /* Size 16x4 */ 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, /* Size 32x8 */ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48 }, }, { { /* Luma */ /* Size 4x4 */ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, /* Size 8x8 */ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 
32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 
31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, /* Size 4x8 */ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, /* Size 8x4 */ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, /* Size 16x8 */ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, /* Size 16x4 */ 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, /* Size 32x8 */ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34 }, { /* Chroma */ /* Size 4x4 */ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, /* Size 8x8 */ 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31, 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36, 39, 39, 33, 34, 34, 35, 35, 36, 39, 39, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36, 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 
34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, /* Size 4x8 */ 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31, 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40, /* Size 8x4 */ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, 41, 41, /* Size 16x8 */ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 
36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43, 43, 44, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40, 42, 44, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, /* Size 16x4 */ 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 
36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, /* Size 32x8 */ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42 }, }, { { /* Luma */ /* Size 4x4 */ 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, /* Size 8x8 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x8 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, /* Size 8x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 16x8 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 16x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x8 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ /* Size 4x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, /* Size 8x8 */ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, /* Size 16x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, /* Size 32x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, /* 
Size 4x8 */ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, /* Size 8x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32, /* Size 8x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, /* Size 16x8 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, /* Size 16x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x16 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x16 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, /* Size 16x4 */ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32, /* Size 8x32 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x8 */ 32, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32 }, }, }; static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ /* Size 4x4 */ 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, /* Size 8x8 */ 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26, 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9, 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10, 10, 8, 7, 6, 5, 5, /* Size 16x16 */ 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32, 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29, 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22, 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16, 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11, 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6, 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4, /* Size 32x32 */ 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19, 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28, 27, 26, 26, 23, 22, 20, 19, 18, 
17, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17, 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14, 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, /* Size 4x8 */ 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15, 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5, /* Size 8x4 */ 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12, 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5, /* Size 8x16 */ 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31, 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24, 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14, 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6, 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, /* Size 16x8 */ 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30, 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17, 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 
10, 8, 8, 8, 13, 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, /* Size 16x32 */ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, /* Size 32x16 */ 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32, 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31, 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25, 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19, 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15, 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13, 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12, 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11, 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25, 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21, 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15, 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12, 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15, 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9, 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6, 6, 
6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, /* Size 4x16 */ 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25, 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14, 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, /* Size 16x4 */ 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19, 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14, 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9, 7, 5, 9, 9, 7, 5, /* Size 8x32 */ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30, 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, /* Size 32x8 */ 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31, 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20, 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11, 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23, 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12, 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10, 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12, 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6, 5 }, { /* Chroma */ /* Size 4x4 */ 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9, /* Size 8x8 */ 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22, 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14, 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11, 10, 10, 14, 15, 15, 14, 12, 11, 10, 9, /* Size 16x16 */ 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32, 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23, 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 
21, 19, 18, 17, 17, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14, 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18, 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10, 9, 9, 9, /* Size 32x32 */ 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31, 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23, 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22, 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21, 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 
16, 15, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, /* Size 4x8 */ 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19, 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10, /* Size 8x4 */ 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16, 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10, /* Size 8x16 */ 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26, 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10, 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, /* Size 16x8 */ 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24, 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14, 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11, 10, 9, /* Size 16x32 */ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30, 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22, 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10, 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, /* Size 32x16 */ 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33, 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24, 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23, 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21, 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19, 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18, 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17, 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23, 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21, 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15, 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13, 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13, 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13, 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13, 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15, 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13, 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13, 13, 11, 11, 10, 10, 9, 9, 9, /* Size 4x16 */ 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23, 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, /* Size 16x4 */ 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19, 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13, 18, 15, 
12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, 15, 14, 12, 10, /* Size 8x32 */ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16, 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, /* Size 32x8 */ 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26, 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22, 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22, 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16, 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12, 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13, 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14, 13, 11, 10, 9 }, }, { { /* Luma */ /* Size 4x4 */ 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, /* Size 8x8 */ 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26, 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12, 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, /* Size 16x16 */ 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32, 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30, 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24, 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13, 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8, 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7, 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, /* Size 32x32 */ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 
23, 22, 21, 19, 17, 17, 16, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15, 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10, 11, 10, 10, 
10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, /* Size 4x8 */ 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16, 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6, /* Size 8x4 */ 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13, 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6, /* Size 8x16 */ 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32, 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26, 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, /* Size 16x8 */ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30, 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18, 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14, 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5, /* Size 16x32 */ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, /* Size 32x16 */ 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32, 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31, 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 
31, 29, 25, 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20, 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16, 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13, 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13, 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28, 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23, 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16, 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10, 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12, 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8, 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12, 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, /* Size 4x16 */ 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25, 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16, 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6, 6, /* Size 16x4 */ 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19, 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9, 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6, 11, 10, 8, 6, 10, 9, 7, 6, /* Size 8x32 */ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, /* Size 32x8 */ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23, 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12, 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24, 19, 16, 13, 
11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14, 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5 }, { /* Chroma */ /* Size 4x4 */ 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, /* Size 8x8 */ 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22, 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15, 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11, 10, 10, 15, 16, 16, 14, 12, 11, 10, 9, /* Size 16x16 */ 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33, 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23, 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17, 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13, 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18, 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15, 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 9, 9, /* Size 32x32 */ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 
22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18, 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, /* Size 4x8 */ 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10, /* Size 8x4 */ 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16, 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10, /* Size 8x16 */ 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27, 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, /* Size 16x8 */ 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25, 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19, 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 
19, 17, 16, 15, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13, 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, /* Size 16x32 */ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14, 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 9, /* Size 32x16 */ 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33, 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26, 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23, 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22, 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20, 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17, 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17, 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16, 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14, 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13, 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13, 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13, 17, 19, 
19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18, 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15, 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11, 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, /* Size 4x16 */ 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23, 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19, 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, /* Size 16x4 */ 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19, 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13, 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16, 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, /* Size 8x32 */ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27, 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, /* Size 32x8 */ 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27, 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22, 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19, 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17, 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13, 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13, 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14, 13, 12, 11, 10 }, }, { { /* Luma */ /* Size 4x4 */ 32, 27, 
16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, /* Size 8x8 */ 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28, 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13, 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11, 11, 11, 10, 8, 7, 6, 6, /* Size 16x16 */ 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19, 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10, 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, /* Size 32x32 */ 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27, 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30, 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24, 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22, 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18, 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 
9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, /* Size 4x8 */ 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18, 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6, /* Size 8x4 */ 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15, 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6, /* Size 8x16 */ 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32, 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27, 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8, 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, /* Size 16x8 */ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31, 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20, 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11, 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16, 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12, 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, /* Size 16x32 */ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25, 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 
18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17, 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, /* Size 32x16 */ 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32, 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32, 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28, 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20, 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18, 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15, 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13, 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12, 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28, 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25, 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14, 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10, 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9, 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, /* Size 4x16 */ 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, /* Size 16x4 */ 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22, 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 
22, 17, 12, 10, 19, 15, 11, 9, 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6, 11, 10, 8, 6, /* Size 8x32 */ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, /* Size 32x8 */ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32, 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24, 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12, 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26, 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16, 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10, 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17, 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14, 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 8, 7, 6 }, { /* Chroma */ /* Size 4x4 */ 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, /* Size 8x8 */ 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22, 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16, 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 14, 13, 12, 10, 10, /* Size 16x16 */ 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33, 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14, 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19, 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, /* Size 32x32 */ 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 
21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33, 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23, 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 
16, 16, 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, /* Size 4x8 */ 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20, 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10, /* Size 8x4 */ 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17, 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10, /* Size 8x16 */ 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28, 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18, 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, 10, 10, /* Size 16x8 */ 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26, 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13, 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, /* Size 16x32 */ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32, 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 
18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, /* Size 32x16 */ 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33, 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23, 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22, 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20, 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18, 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22, 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16, 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14, 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13, 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13, 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11, 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, 10, 10, /* Size 4x16 */ 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 10, 10, /* Size 16x4 */ 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20, 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13, 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16, 12, 11, 16, 16, 13, 10, 16, 15, 12, 10, /* Size 8x32 */ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 
15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, /* Size 32x8 */ 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28, 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22, 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19, 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17, 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14, 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13, 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10 }, }, { { /* Luma */ /* Size 4x4 */ 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, /* Size 8x8 */ 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28, 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14, 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, /* Size 16x16 */ 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32, 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31, 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27, 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20, 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12, 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10, 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17, 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12, 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 
23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, /* Size 4x8 */ 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18, 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7, /* Size 8x4 */ 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17, 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7, /* Size 8x16 */ 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32, 31, 30, 29, 
28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28, 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19, 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, /* Size 16x8 */ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31, 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13, 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9, 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14, 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, /* Size 16x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17, 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, /* Size 32x16 */ 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32, 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32, 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30, 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23, 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18, 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15, 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13, 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13, 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30, 28, 27, 22, 21, 
19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26, 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20, 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16, 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12, 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11, 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10, 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18, 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6, /* Size 4x16 */ 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18, 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 6, /* Size 16x4 */ 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24, 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10, 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7, 12, 12, 9, 7, 12, 11, 8, 6, /* Size 8x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, /* Size 32x8 */ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32, 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24, 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26, 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16, 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12, 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18, 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 10, 9, 8, 
7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13, 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11, 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6 }, { /* Chroma */ /* Size 4x4 */ 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, /* Size 8x8 */ 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22, 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 19, 17, 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 10, /* Size 16x16 */ 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33, 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14, 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, /* Size 32x32 */ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 
15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, /* Size 4x8 */ 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20, 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11, /* Size 8x4 */ 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19, 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11, /* Size 8x16 */ 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30, 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, /* Size 16x8 */ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28, 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17, 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12, 11, 10, /* Size 16x32 */ 
32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, /* Size 32x16 */ 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33, 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26, 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22, 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20, 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19, 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14, 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14, 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13, 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20, 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20, 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 
17, 17, 15, 15, 14, 13, 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, /* Size 4x16 */ 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, /* Size 16x4 */ 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20, 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14, 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16, 12, 11, 17, 16, 13, 11, 16, 16, 13, 11, /* Size 8x32 */ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, /* Size 32x8 */ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30, 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23, 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17, 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18, 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14, 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10 }, }, { { /* Luma */ /* Size 4x4 */ 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, /* Size 8x8 */ 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29, 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16, 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, /* Size 
16x16 */ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32, 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28, 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10, 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18, 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16, 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11, 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30, 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23, 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 
9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, /* Size 4x8 */ 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20, 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7, /* Size 8x4 */ 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17, 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7, /* Size 8x16 */ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32, 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28, 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20, 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, /* Size 16x8 */ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23, 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16, 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, /* Size 16x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29, 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 
16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, /* Size 32x16 */ 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32, 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32, 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30, 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20, 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17, 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15, 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13, 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30, 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27, 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20, 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14, 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12, 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10, 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10, 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19, 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18, 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10, 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14, 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10, 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, /* Size 4x16 */ 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29, 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, /* Size 16x4 */ 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10, 21, 
17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10, 7, 13, 12, 9, 7, 12, 12, 9, 7, /* Size 8x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, /* Size 32x8 */ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27, 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13, 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29, 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18, 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10, 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10, 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8, 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7 }, { /* Chroma */ /* Size 4x4 */ 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, /* Size 8x8 */ 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23, 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18, 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, /* Size 16x16 */ 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33, 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26, 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20, 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, /* Size 32x32 */ 32, 33, 33, 34, 33, 
31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33, 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30, 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25, 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22, 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 
11, 11, 11, 16, 16, 17, 17, 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, /* Size 4x8 */ 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22, 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11, /* Size 8x4 */ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19, 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11, /* Size 8x16 */ 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31, 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22, 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, /* Size 16x8 */ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29, 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, /* Size 16x32 */ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32, 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 12, 12, 16, 17, 
17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, /* Size 32x16 */ 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33, 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27, 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22, 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22, 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20, 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19, 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18, 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20, 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18, 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 11, 11, 10, /* Size 4x16 */ 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, /* Size 16x4 */ 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17, 14, 11, 17, 17, 13, 11, 17, 16, 13, 11, /* Size 8x32 */ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31, 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 
19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, /* Size 32x8 */ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31, 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22, 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14, 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17, 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16, 14, 13, 12, 11 }, }, { { /* Luma */ /* Size 4x4 */ 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, /* Size 8x8 */ 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29, 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17, 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10, 9, 8, 13, 14, 13, 12, 10, 9, 8, 7, /* Size 16x16 */ 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32, 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29, 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26, 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 
33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, /* Size 4x8 */ 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20, 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8, /* Size 8x4 */ 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 
14, 28, 20, 16, 12, 23, 18, 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8, /* Size 8x16 */ 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30, 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, /* Size 16x8 */ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, 8, /* Size 16x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, /* Size 32x16 */ 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32, 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30, 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 
30, 28, 28, 24, 24, 20, 20, 17, 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27, 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21, 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7, /* Size 4x16 */ 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, /* Size 16x4 */ 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27, 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14, 11, 8, 14, 13, 10, 8, 14, 13, 10, 8, /* Size 8x32 */ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, /* Size 32x8 */ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, 15, 13, 
28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, 7 }, { /* Chroma */ /* Size 4x4 */ 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, /* Size 8x8 */ 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23, 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18, 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14, 12, 12, 17, 18, 18, 16, 14, 13, 12, 11, /* Size 16x16 */ 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29, 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22, 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20, 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, /* Size 32x32 */ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 
18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, /* Size 4x8 */ 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22, 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11, /* Size 8x4 */ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19, 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11, /* Size 8x16 */ 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24, 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, /* Size 16x8 */ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 
22, 22, 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, 12, 11, /* Size 16x32 */ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, /* Size 32x16 */ 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33, 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22, 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 20, 22, 
22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18, 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15, 15, 13, 13, 12, 12, 11, 11, 11, /* Size 4x16 */ 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, /* Size 16x4 */ 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22, 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18, 14, 12, 18, 17, 14, 12, 17, 17, 14, 11, /* Size 8x32 */ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, /* Size 32x8 */ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 
18, 16, 15, 13, 12, 11 }, }, { { /* Luma */ /* Size 4x4 */ 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, /* Size 8x8 */ 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19, 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12, 10, 9, 15, 16, 16, 14, 12, 11, 9, 9, /* Size 16x16 */ 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32, 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29, 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26, 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17, 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22, 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9, 9, 8, 8, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28, 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14, 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27, 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25, 22, 22, 20, 20, 19, 19, 18, 
17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, /* Size 4x8 */ 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24, 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9, /* Size 8x4 */ 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21, 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9, /* Size 8x16 */ 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32, 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18, 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11, 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, 8, /* Size 16x8 */ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 
32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, /* Size 32x16 */ 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32, 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31, 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28, 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24, 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20, 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17, 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15, 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30, 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24, 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21, 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18, 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16, 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13, 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23, 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19, 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16, 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14, 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15, 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 
14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, /* Size 4x16 */ 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, /* Size 16x4 */ 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28, 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13, 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16, 13, 10, 16, 15, 12, 9, 14, 14, 11, 9, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, /* Size 32x8 */ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30, 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21, 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16, 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12, 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17, 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16, 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8 }, { /* Chroma */ /* Size 4x4 */ 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13, /* Size 8x8 */ 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24, 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15, 14, 13, 18, 19, 20, 18, 16, 14, 13, 12, /* Size 16x16 */ 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29, 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23, 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22, 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 
23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21, 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, /* Size 32x32 */ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33, 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29, 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32, 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 29, 28, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 22, 22, 22, 
22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, /* Size 4x8 */ 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23, 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13, /* Size 8x4 */ 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21, 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13, /* Size 8x16 */ 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13, 12, 12, /* Size 16x8 */ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, /* Size 16x32 */ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 
20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, /* Size 32x16 */ 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33, 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22, 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23, 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20, 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24, 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17, 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15, 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19, 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, /* Size 4x16 */ 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24, 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, /* Size 16x4 */ 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22, 22, 20, 24, 21, 21, 19, 
22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17, 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19, 16, 13, 19, 19, 16, 13, 18, 18, 15, 12, /* Size 8x32 */ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, /* Size 32x8 */ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22, 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18, 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12 }, }, { { /* Luma */ /* Size 4x4 */ 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, /* Size 8x8 */ 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32, 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22, 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14, 12, 11, 16, 17, 18, 16, 14, 12, 11, 10, /* Size 16x16 */ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32, 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31, 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25, 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20, 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16, 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24, 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23, 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 
15, 14, 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, 11, 11, 10, 10, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22, 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20, 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21, 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 
11, 11, 11, 10, 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18, 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, /* Size 4x8 */ 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25, 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11, /* Size 8x4 */ 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26, 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11, /* Size 8x16 */ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28, 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, /* Size 16x8 */ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21, 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, 11, 10, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23, 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21, 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 
20, 20, 21, 21, 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, /* Size 32x16 */ 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32, 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32, 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21, 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18, 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29, 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28, 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23, 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15, 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25, 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23, 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15, 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18, 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14, 13, 12, 12, 11, 11, 10, 9, 9, /* Size 4x16 */ 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20, 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, /* Size 16x4 */ 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30, 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15, 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19, 14, 11, 18, 18, 13, 11, 17, 18, 13, 11, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 
26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, /* Size 32x8 */ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31, 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24, 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14, 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25, 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19, 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15, 13, 12, 11, 9 }, { /* Chroma */ /* Size 4x4 */ 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, /* Size 8x8 */ 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26, 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20, 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16, 16, 15, 19, 20, 20, 19, 17, 16, 15, 13, /* Size 16x16 */ 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30, 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22, 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17, 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, /* Size 32x32 */ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 
20, 19, 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 
16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, /* Size 4x8 */ 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23, 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14, /* Size 8x4 */ 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22, 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14, /* Size 8x16 */ 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14, 14, 13, /* Size 16x8 */ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, 15, 13, /* Size 16x32 */ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, /* Size 32x16 */ 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33, 33, 30, 
27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30, 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26, 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, /* Size 4x16 */ 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, /* Size 16x4 */ 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23, 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21, 17, 15, 20, 21, 16, 14, 19, 20, 16, 14, /* Size 8x32 */ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, /* 
Size 32x8 */ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33, 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23, 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24, 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20, 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17, 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18, 17, 16, 14, 13 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, /* Size 8x8 */ 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32, 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24, 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16, 14, 13, 19, 20, 20, 19, 17, 15, 13, 12, /* Size 16x16 */ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32, 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27, 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25, 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, 13, 13, 12, 11, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28, 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20, 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 28, 
28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24, 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20, 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, /* Size 4x8 */ 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28, 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12, /* Size 8x4 */ 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27, 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12, /* Size 8x16 */ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, 22, 
21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, /* Size 16x8 */ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17, 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21, 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14, 13, 12, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, /* Size 32x16 */ 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32, 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25, 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31, 29, 29, 28, 27, 27, 
26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28, 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20, 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27, 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24, 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13, 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13, 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20, 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20, 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, /* Size 4x16 */ 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28, 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, /* Size 16x4 */ 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30, 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17, 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22, 17, 13, 20, 20, 16, 12, 19, 19, 15, 12, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, /* Size 32x8 */ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20, 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17, 27, 
28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28, 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23, 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13, 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20, 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12 }, { /* Chroma */ /* Size 4x4 */ 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, /* Size 8x8 */ 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28, 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20, 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17, 17, 16, 20, 21, 21, 20, 19, 17, 16, 15, /* Size 16x16 */ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32, 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, /* Size 32x32 */ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30, 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33, 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 
24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, /* Size 4x8 */ 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22, 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15, /* Size 8x4 */ 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21, 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15, /* Size 8x16 */ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27, 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, /* Size 16x8 */ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23, 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 22, 21, 19, 19, 
19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17, 16, 15, /* Size 16x32 */ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, /* Size 32x16 */ 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32, 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23, 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22, 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23, 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22, 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 
18, 17, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21, 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, /* Size 4x16 */ 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, /* Size 16x4 */ 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24, 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22, 19, 16, 21, 22, 18, 16, 20, 21, 18, 15, /* Size 8x32 */ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, /* Size 32x8 */ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22, 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22, 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21, 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, /* Size 8x8 */ 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 
32, 32, 30, 28, 25, 23, 32, 32, 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28, 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18, 16, 15, 22, 23, 23, 22, 20, 17, 15, 14, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28, 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30, 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 
21, 21, 20, 20, 20, 19, 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17, 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27, 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, /* Size 4x8 */ 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30, 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15, /* Size 8x4 */ 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28, 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15, /* Size 8x16 */ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, /* Size 16x8 */ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, 15, 15, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, 23, 
22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21, 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, /* Size 32x16 */ 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25, 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21, 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26, 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19, 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29, 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26, 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18, 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23, 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 
23, 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20, 18, 18, 18, 16, 15, 15, 14, 13, /* Size 4x16 */ 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, /* Size 16x4 */ 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30, 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21, 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24, 19, 16, 23, 23, 18, 16, 22, 22, 18, 15, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, /* Size 32x8 */ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20, 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30, 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26, 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16, 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21, 18, 18, 15, 14 }, { /* Chroma */ /* Size 4x4 */ 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, /* Size 8x8 */ 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29, 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19, 18, 17, 21, 22, 22, 22, 20, 19, 17, 17, /* Size 16x16 */ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33, 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, 22, 
22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, /* Size 32x32 */ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 
21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, /* Size 4x8 */ 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17, /* Size 8x4 */ 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22, 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17, /* Size 8x16 */ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33, 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, /* Size 16x8 */ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, /* Size 16x32 */ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 22, 22, 22, 22, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, /* Size 32x16 */ 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33, 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26, 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, /* Size 4x16 */ 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, /* Size 16x4 */ 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 
24, 22, 20, 21, 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 18, 21, 22, 19, 17, 21, 22, 19, 17, /* Size 8x32 */ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, /* Size 32x8 */ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22, 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27, 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20, 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, /* Size 8x8 */ 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29, 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22, 20, 19, 25, 26, 26, 25, 23, 21, 19, 18, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 
23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20, 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, 
27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19, 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, /* Size 4x8 */ 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30, 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18, /* Size 8x4 */ 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30, 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18, /* Size 8x16 */ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, /* Size 16x8 */ 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19, 19, 16, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 
29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, /* Size 32x16 */ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28, 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20, 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26, 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24, 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, /* Size 4x16 */ 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, /* Size 16x4 */ 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32, 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26, 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26, 23, 19, 27, 26, 23, 19, 24, 24, 21, 18, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 
28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, /* Size 32x8 */ 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26, 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24, 24, 19, 19, 16 }, { /* Chroma */ /* Size 4x4 */ 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, /* Size 8x8 */ 33, 33, 32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31, 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23, 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20, 19, 19, 21, 22, 23, 22, 22, 20, 19, 18, /* Size 16x16 */ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 
19, 19, 19, 19, 18, 18, 18, /* Size 4x8 */ 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24, 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, /* Size 8x4 */ 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23, 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19, /* Size 8x16 */ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33, 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, /* Size 16x8 */ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, /* Size 32x16 */ 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 30, 28, 28, 28, 
24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33, 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30, 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22, 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, /* Size 4x16 */ 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, /* Size 16x4 */ 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29, 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22, 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22, 20, 19, 22, 22, 20, 19, 22, 23, 21, 18, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, /* Size 32x8 */ 32, 
33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23, 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18 }, }, { { /* Luma */ /* Size 4x4 */ 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, /* Size 8x8 */ 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30, 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 30, 31, 29, 28, 27, 26, 24, 29, 29, 30, 28, 27, 26, 24, 21, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20, /* Size 4x8 */ 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32, 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21, /* Size 8x4 */ 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31, 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 
32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, /* Size 16x8 */ 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31, 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 
30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23, 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29, 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 24, 22, 21, 21, 21, /* Size 4x16 */ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, /* Size 16x4 */ 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28, 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30, 28, 23, 29, 30, 27, 21, 29, 30, 27, 21, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, /* Size 32x8 */ 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27, 32, 32, 31, 30, 29, 
29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28, 26, 26, 22, 21 }, { /* Chroma */ /* Size 4x4 */ 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, /* Size 8x8 */ 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33, 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24, 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21, 21, 20, 21, 22, 22, 22, 22, 21, 20, 19, /* Size 16x16 */ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33, 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 
29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, /* Size 4x8 */ 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27, 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19, /* Size 8x4 */ 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26, 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19, /* Size 8x16 */ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, /* Size 16x8 */ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 
26, 24, 22, 22, 22, 22, 25, 24, 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33, 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25, 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31, 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22, 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 
26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, /* Size 16x4 */ 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32, 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22, 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23, 22, 20, 21, 22, 21, 19, 21, 22, 21, 19, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, /* Size 32x8 */ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31, 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 22, 22, 20, 19 }, }, { { /* Luma */ /* Size 4x4 */ 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, /* Size 8x8 */ 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 
32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30, 29, 28, 31, 31, 31, 31, 29, 29, 28, 27, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 
29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, /* Size 4x8 */ 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28, /* Size 8x4 */ 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, /* Size 16x8 */ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 31, 31, 
30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, /* Size 4x16 */ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, /* Size 16x4 */ 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31, 30, 29, 31, 31, 29, 28, 30, 30, 28, 28, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, /* Size 32x8 */ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30, 29, 28, 28, 27 }, { /* Chroma */ /* Size 4x4 */ 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, /* Size 8x8 */ 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29, 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, 26, 25, 25, 24, 23, 23, 22, 21, /* Size 16x16 */ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 
24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28, 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23, 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 26, 25, 25, 25, 
25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, /* Size 4x8 */ 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28, 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, /* Size 8x4 */ 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28, 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22, /* Size 8x16 */ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, /* Size 16x8 */ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28, 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 
25, 25, 25, 25, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27, 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25, 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24, 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, /* Size 16x4 */ 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24, 
30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26, 23, 22, 26, 25, 23, 22, 24, 24, 22, 22, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, /* Size 32x8 */ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25, 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27, 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25, 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21 }, }, { { /* Luma */ /* Size 4x4 */ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, /* Size 8x8 */ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, /* Size 4x8 */ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, /* Size 8x4 */ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, /* Size 16x8 */ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, /* Size 16x4 */ 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, /* Size 32x8 */ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30 }, { /* Chroma */ /* Size 4x4 */ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, /* Size 8x8 */ 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33, 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28, 26, 26, 31, 30, 30, 29, 29, 28, 26, 26, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29, 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 
26, 26, 26, /* Size 4x8 */ 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33, 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26, /* Size 8x4 */ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, 25, 25, /* Size 16x8 */ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32, 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 
30, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 24, 23, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 26, /* Size 16x4 */ 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28, 28, 26, 30, 28, 28, 26, 30, 28, 28, 26, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, /* Size 32x8 */ 32, 33, 33, 33, 33, 
33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31, 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28, 28, 28, 26, 24 }, }, { { /* Luma */ /* Size 4x4 */ 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, /* Size 8x8 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x8 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, /* Size 8x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 16x8 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 16x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x8 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 
33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32 }, { /* Chroma */ /* Size 4x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, /* Size 8x8 */ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, /* Size 16x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, /* Size 32x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, /* Size 4x8 */ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, /* Size 8x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32, /* Size 8x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, /* Size 16x8 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, /* Size 16x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x16 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 4x16 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, /* Size 16x4 */ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 32, 32, 33, 33, 32, 32, 34, 33, 32, 32, /* Size 8x32 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* Size 32x8 */ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32 }, }, }; #endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER aom-3.12.1/av1/common/quant_common.h000066400000000000000000000154001477627663500172310ustar00rootroot00000000000000/* * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_QUANT_COMMON_H_ #define AOM_AV1_COMMON_QUANT_COMMON_H_ #include #include "aom/aom_codec.h" #include "av1/common/seg_common.h" #include "av1/common/enums.h" #include "av1/common/entropy.h" #ifdef __cplusplus extern "C" { #endif #define MINQ 0 #define MAXQ 255 #define QINDEX_RANGE (MAXQ - MINQ + 1) #define QINDEX_BITS 8 // Total number of QM sets stored #define QM_LEVEL_BITS 4 #define NUM_QM_LEVELS (1 << QM_LEVEL_BITS) /* Range of QMS is between first and last value, with offset applied to inter * blocks*/ #define DEFAULT_QM_Y 10 #define DEFAULT_QM_U 11 #define DEFAULT_QM_V 12 #define DEFAULT_QM_FIRST 5 #define DEFAULT_QM_LAST 9 #define DEFAULT_QM_FIRST_ALLINTRA 4 #define DEFAULT_QM_LAST_ALLINTRA 10 #define QM_FIRST_IQ 2 #define QM_LAST_IQ 10 #define LOSSLESS_Q_STEP 4 // this should equal to dc/ac_qlookup_QTX[0] struct AV1Common; struct CommonQuantParams; struct macroblockd; int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth); int av1_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); // Returns true if we are using quantization matrix. bool av1_use_qmatrix(const struct CommonQuantParams *quant_params, const struct macroblockd *xd, int segment_id); // Reduce the large number of quantizers to a smaller number of levels for which // different matrices may be defined. This is an increasing function in qindex. static inline int aom_get_qmlevel(int qindex, int first, int last) { return first + (qindex * (last + 1 - first)) / QINDEX_RANGE; } // QM levels tuned for all intra mode (including still images) // This formula was empirically derived by encoding the CID22 validation // testset for each QP/QM tuple, and building a convex hull that // maximizes SSIMULACRA 2 scores, and a final subjective visual quality pass // as a quick validation. This is a decreasing function in qindex. // There are a total of 16 luma QM levels, and the higher the level, the // flatter these QMs are. // QM level 15 is a completely-flat matrix and level 0 is the steepest. // This formula only uses levels 4 through 10, unless qm-min and qm-max are // both set below or above this range. // For more information on quantization matrices, please refer to // https://arxiv.org/pdf/2008.06091, section F. 
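// Informative examples (not normative): with the default all-intra range
// [DEFAULT_QM_FIRST_ALLINTRA = 4, DEFAULT_QM_LAST_ALLINTRA = 10], the
// thresholds below map qindex 32 -> level 10, qindex 128 -> level 8 and
// qindex 255 -> level 4, i.e. progressively steeper matrices as qindex grows.
// For comparison, the linear helper aom_get_qmlevel() above with the default
// inter range [DEFAULT_QM_FIRST = 5, DEFAULT_QM_LAST = 9] gives
// aom_get_qmlevel(0, 5, 9) == 5, aom_get_qmlevel(128, 5, 9) == 7 and
// aom_get_qmlevel(255, 5, 9) == 9.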
static inline int aom_get_qmlevel_allintra(int qindex, int first, int last) { int qm_level = 0; if (qindex <= 40) { qm_level = 10; } else if (qindex <= 100) { qm_level = 9; } else if (qindex <= 160) { qm_level = 8; } else if (qindex <= 200) { qm_level = 7; } else if (qindex <= 220) { qm_level = 6; } else if (qindex <= 240) { qm_level = 5; } else { qm_level = 4; } return clamp(qm_level, first, last); } // Luma QM levels tuned for image quality (IQ) // This formula was empirically derived by encoding Daala's subset1 validation // testset for each QP/QM tuple, and building a convex hull that maximizes // SSIMULACRA 2 scores, and a final subjective visual quality pass as a quick // validation. This is a decreasing function in qindex. // There are a total of 16 luma QM levels, and the higher the level, the // flatter these QMs are. // QM level 15 is a completely-flat matrix and level 0 is the steepest. // This formula only uses levels 2 through 10, unless qm-min and qm-max are // both set below or above this range. // For more information on quantization matrices, please refer to // https://arxiv.org/pdf/2008.06091, section F. static inline int aom_get_qmlevel_luma_iq(int qindex, int first, int last) { int qm_level = 0; if (qindex <= 40) { qm_level = 10; } else if (qindex <= 60) { qm_level = 9; } else if (qindex <= 100) { qm_level = 8; } else if (qindex <= 120) { qm_level = 7; } else if (qindex <= 140) { qm_level = 6; } else if (qindex <= 160) { qm_level = 5; } else if (qindex <= 200) { qm_level = 4; } else if (qindex <= 220) { qm_level = 3; } else { qm_level = 2; } return clamp(qm_level, first, last); } // Chroma QM levels for 4:4:4 subsampling tuned for image quality (IQ) // This formula was empirically derived by encoding Daala's subset1 validation // testset for each QP/QM tuple, and building a convex hull that maximizes // SSIMULACRA 2 scores, and a final subjective visual quality pass as a quick // validation. This is a decreasing function in qindex. // Like with luma QMs, there are a total of 16 chroma QM levels, and the higher // the level, the flatter these QMs are. // QM level 15 is a completely-flat matrix and level 0 is the steepest. // This formula only uses levels 2 through 10, unless qm-min and qm-max are // both set below or above this range. // For more information on quantization matrices, please refer to // https://arxiv.org/pdf/2008.06091, section F. static inline int aom_get_qmlevel_444_chroma_iq(int qindex, int first, int last) { int chroma_qm_level = 0; if (qindex <= 12) { chroma_qm_level = 10; } else if (qindex <= 24) { chroma_qm_level = 9; } else if (qindex <= 32) { chroma_qm_level = 8; } else if (qindex <= 36) { chroma_qm_level = 7; } else if (qindex <= 44) { chroma_qm_level = 6; } else if (qindex <= 48) { chroma_qm_level = 5; } else if (qindex <= 56) { chroma_qm_level = 4; } else if (qindex <= 88) { chroma_qm_level = 3; } else { chroma_qm_level = 2; } return clamp(chroma_qm_level, first, last); } // Initialize all global quant/dequant matrices. void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes); // Get either local / global dequant matrix as appropriate. const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params, const struct macroblockd *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type); // Get either local / global quant matrix as appropriate. 
const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params, const struct macroblockd *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_QUANT_COMMON_H_ aom-3.12.1/av1/common/reconinter.c000066400000000000000000001373151477627663500167060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_ports/aom_once.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" #include "av1/common/obmc.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" // This function will determine whether or not to create a warped // prediction. static int allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, int build_for_obmc, const struct scale_factors *const sf, WarpedMotionParams *final_warp_params) { // Note: As per the spec, we must test the fixed point scales here, which are // at a higher precision (1 << 14) than the xs and ys in subpel_params (that // have 1 << 10 precision). 
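// (Informative summary of the checks below: scaled references and
// OBMC-directed builds always fall back to translational prediction; when a
// warp is possible, a valid local warp model takes precedence over a valid
// global motion model.)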
if (av1_is_scaled(sf)) return 0; if (final_warp_params != NULL) *final_warp_params = default_warp_params; if (build_for_obmc) return 0; if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) { if (final_warp_params != NULL) memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params)); return 1; } else if (warp_types->global_warp_allowed && !gm_params->invalid) { if (final_warp_params != NULL) memcpy(final_warp_params, gm_params, sizeof(*final_warp_params)); return 1; } return 0; } void av1_init_warp_params(InterPredParams *inter_pred_params, const WarpTypesAllowed *warp_types, int ref, const MACROBLOCKD *xd, const MB_MODE_INFO *mi) { if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8) return; if (xd->cur_frame_force_integer_mv) return; if (allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0, inter_pred_params->scale_factors, &inter_pred_params->warp_params)) { #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_FEATURE, "Warped motion is disabled in realtime only build."); #endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER inter_pred_params->mode = WARP_PRED; } } void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, InterPredParams *inter_pred_params, const SubpelParams *subpel_params) { assert(IMPLIES(inter_pred_params->conv_params.is_compound, inter_pred_params->conv_params.dst != NULL)); if (inter_pred_params->mode == TRANSLATION_PRED) { #if CONFIG_AV1_HIGHBITDEPTH if (inter_pred_params->use_hbd_buf) { highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, inter_pred_params->block_width, inter_pred_params->block_height, &inter_pred_params->conv_params, inter_pred_params->interp_filter_params, inter_pred_params->bit_depth); } else { inter_predictor(src, src_stride, dst, dst_stride, subpel_params, inter_pred_params->block_width, inter_pred_params->block_height, &inter_pred_params->conv_params, inter_pred_params->interp_filter_params); } #else inter_predictor(src, src_stride, dst, dst_stride, subpel_params, inter_pred_params->block_width, inter_pred_params->block_height, &inter_pred_params->conv_params, inter_pred_params->interp_filter_params); #endif } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // TODO(jingning): av1_warp_plane() can be further cleaned up. 
else if (inter_pred_params->mode == WARP_PRED) { av1_warp_plane( &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf, inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0, inter_pred_params->ref_frame_buf.width, inter_pred_params->ref_frame_buf.height, inter_pred_params->ref_frame_buf.stride, dst, inter_pred_params->pix_col, inter_pred_params->pix_row, inter_pred_params->block_width, inter_pred_params->block_height, dst_stride, inter_pred_params->subsampling_x, inter_pred_params->subsampling_y, &inter_pred_params->conv_params); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER else { assert(0 && "Unsupported inter_pred_params->mode"); } } static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; static inline void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) { if (shift >= 0) { memcpy(dst + shift, src, width - shift); memset(dst, src[0], shift); } else { shift = -shift; memcpy(dst, src + shift, width - shift); memset(dst + width - shift, src[width - 1], shift); } } /* clang-format off */ DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, }, { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used }; /* clang-format on */ // [negative][direction] DECLARE_ALIGNED( 16, static uint8_t, wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]); // 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound // on the 
sum of all mask sizes up to an including MAX_WEDGE_SQUARE. DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]); DECLARE_ALIGNED(16, static uint8_t, smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL] [MAX_WEDGE_SQUARE]); static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; static const wedge_code_type wedge_codebook_16_hgtw[16] = { { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; static const wedge_code_type wedge_codebook_16_hltw[16] = { { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; static const wedge_code_type wedge_codebook_16_heqw[16] = { { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], wedge_masks[BLOCK_8X8] }, { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], wedge_masks[BLOCK_8X16] }, { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], wedge_masks[BLOCK_16X8] }, { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], wedge_masks[BLOCK_16X16] }, { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], wedge_masks[BLOCK_16X32] }, { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], wedge_masks[BLOCK_32X16] }, { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], wedge_masks[BLOCK_32X32] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], wedge_masks[BLOCK_8X32] }, { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], wedge_masks[BLOCK_32X8] }, { 0, NULL, NULL, NULL }, { 0, NULL, NULL, NULL }, }; static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, BLOCK_SIZE sb_type) { const uint8_t *master; const int bh = block_size_high[sb_type]; const int bw = block_size_wide[sb_type]; const wedge_code_type *a = av1_wedge_params_lookup[sb_type].codebook + wedge_index; int woff, hoff; const uint8_t wsignflip = 
av1_wedge_params_lookup[sb_type].signflip[wedge_index]; assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); woff = (a->x_offset * bw) >> 3; hoff = (a->y_offset * bh) >> 3; master = wedge_mask_obl[neg ^ wsignflip][a->direction] + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + MASK_MASTER_SIZE / 2 - woff; return master; } const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { (void)sb_type; switch (comp_data->type) { case COMPOUND_WEDGE: return av1_get_contiguous_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, sb_type); default: return comp_data->seg_mask; } } static inline void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); int i, j, m, diff; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); diff = ROUND_POWER_OF_TWO(diff, round); m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; } } } void av1_build_compound_diffwtd_mask_d16_c( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { switch (mask_type) { case DIFFWTD_38: diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, conv_params, bd); break; case DIFFWTD_38_INV: diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, conv_params, bd); break; default: assert(0); } } static inline void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) { int i, j, m, diff; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { diff = abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; } } } void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) { switch (mask_type) { case DIFFWTD_38: diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); break; case DIFFWTD_38_INV: diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); break; default: assert(0); } } #if CONFIG_AV1_HIGHBITDEPTH static AOM_FORCE_INLINE void diffwtd_mask_highbd( uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, const unsigned int bd) { assert(bd >= 8); if (bd == 8) { if (which_inverse) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; unsigned int m = negative_to_zero(mask_base + diff); m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; } src0 += src0_stride; src1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; unsigned int m = negative_to_zero(mask_base + diff); m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); mask[j] = m; } src0 += src0_stride; src1 += src1_stride; mask += w; } } } else { const unsigned int bd_shift = bd - 8; if (which_inverse) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; unsigned int m = negative_to_zero(mask_base + diff); m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; } src0 += src0_stride; src1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { int diff = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; unsigned int m = negative_to_zero(mask_base + diff); m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); mask[j] = m; } src0 += src0_stride; src1 += src1_stride; mask += w; } } } } void av1_build_compound_diffwtd_mask_highbd_c( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd) { switch (mask_type) { case DIFFWTD_38: diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); break; case DIFFWTD_38_INV: diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); break; default: assert(0); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void init_wedge_master_masks(void) { int i, j; const int w = MASK_MASTER_SIZE; const int h = MASK_MASTER_SIZE; const int stride = MASK_MASTER_STRIDE; // Note: index [0] stores the masters, and [1] its complement. 
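// (Informative: wedge_master_oblique_even/odd and wedge_master_vertical are
// the only hand-coded ramps; the second loop below derives WEDGE_OBLIQUE27
// and WEDGE_HORIZONTAL as transposes, and WEDGE_OBLIQUE117 / WEDGE_OBLIQUE153
// as flipped complements, i.e. (1 << WEDGE_WEIGHT_BITS) - m.)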
// Generate prototype by shifting the masters int shift = h / 4; for (i = 0; i < h; i += 2) { shift_copy(wedge_master_oblique_even, &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, MASK_MASTER_SIZE); shift--; shift_copy(wedge_master_oblique_odd, &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, MASK_MASTER_SIZE); memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], wedge_master_vertical, MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], wedge_master_vertical, MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); } for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = (1 << WEDGE_WEIGHT_BITS) - msk; wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = (1 << WEDGE_WEIGHT_BITS) - msk; wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = (1 << WEDGE_WEIGHT_BITS) - mskx; } } } static inline void init_wedge_masks(void) { uint8_t *dst = wedge_mask_buf; BLOCK_SIZE bsize; memset(wedge_masks, 0, sizeof(wedge_masks)); for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; const int wtypes = wedge_params->wedge_types; if (wtypes == 0) continue; const uint8_t *mask; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; int w; for (w = 0; w < wtypes; ++w) { mask = get_wedge_mask_inplace(w, 0, bsize); aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, bh); wedge_params->masks[0][w] = dst; dst += bw * bh; mask = get_wedge_mask_inplace(w, 1, bsize); aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, bh); wedge_params->masks[1][w] = dst; dst += bw * bh; } assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); } } /* clang-format off */ static const uint8_t ii_weights1d[MAX_SB_SIZE] = { 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { 32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1, 8, 8, 4, 4, 2, 2 }; /* clang-format on */ static inline void build_smooth_interintra_mask(uint8_t *mask, int stride, BLOCK_SIZE plane_bsize, INTERINTRA_MODE mode) { int i, j; const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const int size_scale = ii_size_scales[plane_bsize]; switch (mode) { case II_V_PRED: for (i = 0; i < bh; ++i) { memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); mask += stride; } break; case II_H_PRED: for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) mask[j] = 
ii_weights1d[j * size_scale]; mask += stride; } break; case II_SMOOTH_PRED: for (i = 0; i < bh; ++i) { for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; mask += stride; } break; case II_DC_PRED: default: for (i = 0; i < bh; ++i) { memset(mask, 32, bw * sizeof(mask[0])); mask += stride; } break; } } static inline void init_smooth_interintra_masks(void) { for (int m = 0; m < INTERINTRA_MODES; ++m) { for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { const int bw = block_size_wide[bs]; const int bh = block_size_high[bs]; if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, m); } } } // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 static void init_all_wedge_masks(void) { init_wedge_master_masks(); init_wedge_masks(); init_smooth_interintra_masks(); } void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); } static inline void build_masked_compound_no_round( uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, int w, InterPredParams *inter_pred_params) { const int ssy = inter_pred_params->subsampling_y; const int ssx = inter_pred_params->subsampling_x; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); const int mask_stride = block_size_wide[sb_type]; #if CONFIG_AV1_HIGHBITDEPTH if (inter_pred_params->use_hbd_buf) { aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, ssx, ssy, &inter_pred_params->conv_params, inter_pred_params->bit_depth); } else { aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, ssx, ssy, &inter_pred_params->conv_params); } #else aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h, ssx, ssy, &inter_pred_params->conv_params); #endif } void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, InterPredParams *inter_pred_params, const SubpelParams *subpel_params) { const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; BLOCK_SIZE sb_type = inter_pred_params->sb_type; // We're going to call av1_make_inter_predictor to generate a prediction into // a temporary buffer, then will blend that temporary buffer with that from // the other reference. DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); uint8_t *tmp_dst = inter_pred_params->use_hbd_buf ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; const int tmp_buf_stride = MAX_SB_SIZE; CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; int org_dst_stride = inter_pred_params->conv_params.dst_stride; CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; inter_pred_params->conv_params.dst = tmp_buf16; inter_pred_params->conv_params.dst_stride = tmp_buf_stride; assert(inter_pred_params->conv_params.do_average == 0); // This will generate a prediction in tmp_buf for the second reference av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, inter_pred_params, subpel_params); if (!inter_pred_params->conv_params.plane && comp_data->type == COMPOUND_DIFFWTD) { av1_build_compound_diffwtd_mask_d16( comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, inter_pred_params->block_width, &inter_pred_params->conv_params, inter_pred_params->bit_depth); } build_masked_compound_no_round( dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, comp_data, sb_type, inter_pred_params->block_height, inter_pred_params->block_width, inter_pred_params); } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int *fwd_offset, int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound) { assert(fwd_offset != NULL && bck_offset != NULL); if (!is_compound || mbmi->compound_idx) { *fwd_offset = 8; *bck_offset = 8; *use_dist_wtd_comp_avg = 0; return; } *use_dist_wtd_comp_avg = 1; const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); const int cur_frame_index = cm->cur_frame->order_hint; int bck_frame_index = 0, fwd_frame_index = 0; if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE); int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE); const int order = d0 <= d1; if (d0 == 0 || d1 == 0) { *fwd_offset = quant_dist_lookup_table[3][order]; *bck_offset = quant_dist_lookup_table[3][1 - order]; return; } int i; for (i = 0; i < 3; ++i) { int c0 = quant_dist_weight[i][order]; int c1 = quant_dist_weight[i][!order]; int d0_c0 = d0 * c0; int d1_c1 = d1 * c1; if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; } *fwd_offset = quant_dist_lookup_table[i][order]; *bck_offset = quant_dist_lookup_table[i][1 - order]; } void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const int plane_start, const int plane_end) { // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { struct macroblockd_plane *const pd = &planes[i]; const int is_uv = i > 0; setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, pd->subsampling_x, pd->subsampling_y); } } void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *sf, const int num_planes) { if (src != NULL) { // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. 
for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; const int is_uv = i > 0; setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i], src->crop_widths[is_uv], src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); } } } // obmc_mask_N[overlap_position] static const uint8_t obmc_mask_1[1] = { 64 }; DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64 }; static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60, 61, 62, 64, 64, 64, 64, 64, 64, 64, 64 }; static const uint8_t obmc_mask_64[64] = { 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; const uint8_t *av1_get_obmc_mask(int length) { switch (length) { case 1: return obmc_mask_1; case 2: return obmc_mask_2; case 4: return obmc_mask_4; case 8: return obmc_mask_8; case 16: return obmc_mask_16; case 32: return obmc_mask_32; case 64: return obmc_mask_64; default: assert(0); return NULL; } } static inline void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *mi, void *fun_ctxt, const int num_planes) { (void)xd; (void)rel_mi_row; (void)rel_mi_col; (void)op_mi_size; (void)dir; (void)mi; ++*(uint8_t *)fun_ctxt; (void)num_planes; } void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->overlappable_neighbors = 0; if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return; foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors); if (mbmi->overlappable_neighbors) return; foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, &mbmi->overlappable_neighbors); } // HW does not support < 4x4 prediction. To limit the bandwidth requirement, if // block-size of current plane is smaller than 8x8, always only blend with the // left neighbor(s) (skip blending with the above side). 
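//
// Worked example (derived from av1_skip_u4x4_pred_in_obmc() below, with the
// default DISABLE_CHROMA_U8X8_OBMC == 0): a BLOCK_8X8 luma block with 4:2:0
// subsampling maps to a BLOCK_4X4 chroma plane block, so the function returns
// 1 for the above pass (dir == 0) and 0 for the left pass (dir == 1), i.e.
// that chroma plane is blended only with its left neighbor.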
#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, int dir) { assert(is_motion_variation_allowed_bsize(bsize)); const BLOCK_SIZE bsize_plane = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); switch (bsize_plane) { #if DISABLE_CHROMA_U8X8_OBMC case BLOCK_4X4: case BLOCK_8X4: case BLOCK_4X8: return 1; #else case BLOCK_4X4: case BLOCK_8X4: case BLOCK_4X8: return dir == 0; #endif default: return 0; } } #if CONFIG_AV1_DECODER static void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { mbmi->ref_frame[1] = NONE_FRAME; mbmi->interinter_comp.type = COMPOUND_AVERAGE; } #endif // CONFIG_AV1_DECODER struct obmc_inter_pred_ctxt { uint8_t **adjacent; int *adjacent_stride; }; static inline void build_obmc_inter_pred_above( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { (void)above_mi; (void)rel_mi_row; (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; const int bh = overlap >> pd->subsampling_y; const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; const int dst_stride = pd->dst.stride; uint8_t *const dst = &pd->dst.buf[plane_col]; const int tmp_stride = ctxt->adjacent_stride[plane]; const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; const uint8_t *const mask = av1_get_obmc_mask(bh); #if CONFIG_AV1_HIGHBITDEPTH const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); #else aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); #endif } } static inline void build_obmc_inter_pred_left( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { (void)left_mi; (void)rel_mi_col; (void)dir; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const int bw = overlap >> pd->subsampling_x; const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; const int dst_stride = pd->dst.stride; uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; const int tmp_stride = ctxt->adjacent_stride[plane]; const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; const uint8_t *const mask = av1_get_obmc_mask(bw); #if CONFIG_AV1_HIGHBITDEPTH const int is_hbd = is_cur_buf_hbd(xd); if (is_hbd) aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh, xd->bd); else aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); #else 
aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, bw, bh); #endif } } // This function combines motion compensated predictions that are generated by // top/left neighboring blocks' inter predictors with the regular inter // prediction. We assume the original prediction (bmc) is stored in // xd->plane[].dst.buf void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]) { const BLOCK_SIZE bsize = xd->mi[0]->bsize; // handle above row struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; foreach_overlappable_nb_above(cm, xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], build_obmc_inter_pred_above, &ctxt_above); // handle left column struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; foreach_overlappable_nb_left(cm, xd, max_neighbor_obmc[mi_size_high_log2[bsize]], build_obmc_inter_pred_left, &ctxt_left); } void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, uint8_t **dst_buf2) { if (is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); dst_buf1[2] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); dst_buf2[2] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); } else { dst_buf1[0] = xd->tmp_obmc_bufs[0]; dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; dst_buf2[0] = xd->tmp_obmc_bufs[1]; dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; } } #if CONFIG_AV1_DECODER void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize); const int above_mi_col = xd->mi_col + rel_mi_col; modify_neighbor_predictor_for_obmc(above_mbmi); for (int j = 0; j < num_planes; ++j) { struct macroblockd_plane *const pd = &xd->plane[j]; setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, NULL, pd->subsampling_x, pd->subsampling_y); } const int num_refs = 1 + has_second_ref(above_mbmi); for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); const struct scale_factors *const sf = get_ref_scale_factors_const(ctxt->cm, frame); xd->block_ref_scale_factors[ref] = sf; if ((!av1_is_valid_scale(sf))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, num_planes); } xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); xd->mb_to_right_edge = ctxt->mb_to_far_edge + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; } void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, MB_MODE_INFO *left_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize); const int left_mi_row = xd->mi_row + 
rel_mi_row; modify_neighbor_predictor_for_obmc(left_mbmi); for (int j = 0; j < num_planes; ++j) { struct macroblockd_plane *const pd = &xd->plane[j]; setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, NULL, pd->subsampling_x, pd->subsampling_y); } const int num_refs = 1 + has_second_ref(left_mbmi); for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); const struct scale_factors *const ref_scale_factors = get_ref_scale_factors_const(ctxt->cm, frame); xd->block_ref_scale_factors[ref] = ref_scale_factors; if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, ref_scale_factors, num_planes); } xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); xd->mb_to_bottom_edge = ctxt->mb_to_far_edge + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); } #endif // CONFIG_AV1_DECODER static inline void combine_interintra( INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred, int compstride, const uint8_t *interpred, int interstride, const uint8_t *intrapred, int intrastride) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subw = 2 * mi_size_wide[bsize] == bw; const int subh = 2 * mi_size_high[bsize] == bh; aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, interstride, mask, block_size_wide[bsize], bw, bh, subw, subh); } return; } const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, interstride, mask, bw, bw, bh, 0, 0); } #if CONFIG_AV1_HIGHBITDEPTH static inline void combine_interintra_highbd( INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, const uint8_t *interpred8, int interstride, const uint8_t *intrapred8, int intrastride, int bd) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; if (use_wedge_interintra) { if (av1_is_wedge_used(bsize)) { const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); const int subh = 2 * mi_size_high[bsize] == bh; const int subw = 2 * mi_size_wide[bsize] == bw; aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, interpred8, interstride, mask, block_size_wide[bsize], bw, bh, subw, subh, bd); } return; } uint8_t mask[MAX_SB_SQUARE]; build_smooth_interintra_mask(mask, bw, plane_bsize, mode); aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, interpred8, interstride, mask, bw, bw, bh, 0, 0, bd); } #endif void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const BUFFER_SET *ctx, uint8_t *dst, int dst_stride) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; BLOCK_SIZE plane_bsize = 
get_plane_block_size(bsize, ssx, ssy); PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); assert(xd->mi[0]->use_intrabc == 0); const SequenceHeader *seq_params = cm->seq_params; av1_predict_intra_block(xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, pd->height, max_txsize_rect_lookup[plane_bsize], mode, 0, 0, FILTER_INTRA_MODES, ctx->plane[plane], ctx->stride[plane], dst, dst_stride, 0, 0, plane); } void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride) { const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { combine_interintra_highbd( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); return; } #endif combine_interintra( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred, intra_stride); } // build interintra_predictors for one plane void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, const BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); av1_combine_interintra(xd, bsize, plane, pred, stride, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); } else { DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, intrapredictor, MAX_SB_SIZE); av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, MAX_SB_SIZE); } } aom-3.12.1/av1/common/reconinter.h000066400000000000000000000456101477627663500167070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_RECONINTER_H_ #define AOM_AV1_COMMON_RECONINTER_H_ #include "av1/common/av1_common_int.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/warped_motion.h" #include "aom/aom_integer.h" // Work out how many pixels off the edge of a reference frame we're allowed // to go when forming an inter prediction. 
// The outermost row/col of each referernce frame is extended by // (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep // at least AOM_INTERP_EXTEND pixels within that to account for filtering. // // We have to break this up into two macros to keep both clang-format and // tools/lint-hunks.py happy. #define AOM_LEFT_TOP_MARGIN_PX(subsampling) \ ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND) #define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \ (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS) #ifdef __cplusplus extern "C" { #endif #define MAX_WEDGE_TYPES 16 #define MAX_WEDGE_SIZE_LOG2 5 // 32x32 #define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) #define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE) #define WEDGE_WEIGHT_BITS 6 #define WEDGE_NONE -1 // Angles are with respect to horizontal anti-clockwise enum { WEDGE_HORIZONTAL = 0, WEDGE_VERTICAL = 1, WEDGE_OBLIQUE27 = 2, WEDGE_OBLIQUE63 = 3, WEDGE_OBLIQUE117 = 4, WEDGE_OBLIQUE153 = 5, WEDGE_DIRECTIONS } UENUM1BYTE(WedgeDirectionType); // 3-tuple: {direction, x_offset, y_offset} typedef struct { WedgeDirectionType direction; int x_offset; int y_offset; } wedge_code_type; typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; typedef struct { int wedge_types; const wedge_code_type *codebook; uint8_t *signflip; wedge_masks_type *masks; } wedge_params_type; extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; typedef struct SubpelParams { int xs; int ys; int subpel_x; int subpel_y; int pos_x; int pos_y; } SubpelParams; struct build_prediction_ctxt { const AV1_COMMON *cm; uint8_t **tmp_buf; int *tmp_width; int *tmp_height; int *tmp_stride; int mb_to_far_edge; void *dcb; // Decoder-only coding block. }; typedef enum InterPredMode { TRANSLATION_PRED, WARP_PRED, } InterPredMode; typedef enum InterCompMode { UNIFORM_SINGLE, UNIFORM_COMP, MASK_COMP, } InterCompMode; typedef struct InterPredParams { InterPredMode mode; InterCompMode comp_mode; WarpedMotionParams warp_params; ConvolveParams conv_params; const InterpFilterParams *interp_filter_params[2]; int block_width; int block_height; int pix_row; int pix_col; struct buf_2d ref_frame_buf; int subsampling_x; int subsampling_y; const struct scale_factors *scale_factors; int bit_depth; int use_hbd_buf; INTERINTER_COMPOUND_DATA mask_comp; BLOCK_SIZE sb_type; int is_intrabc; int top; int left; } InterPredParams; // Initialize sub-pel params required for inter prediction. 
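// init_subpel_params() below clamps the source position (kept at
// SCALE_SUBPEL_BITS precision) to the margins set up in
// init_inter_block_params(): vertically to
// [-AOM_LEFT_TOP_MARGIN_SCALED(ss_y), (height + AOM_INTERP_EXTEND) <<
// SCALE_SUBPEL_BITS], and likewise horizontally, so the interpolation filter
// never reads past the extended reference border. As a rough numeric sketch
// (assuming the library's usual AOM_BORDER_IN_PIXELS = 288,
// AOM_INTERP_EXTEND = 4 and SCALE_SUBPEL_BITS = 10, all defined elsewhere),
// the luma top/left limit works out to -(288 - 4) << 10 = -290816.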
static inline void init_subpel_params(const MV *const src_mv, InterPredParams *const inter_pred_params, SubpelParams *subpel_params, int width, int height) { const struct scale_factors *sf = inter_pred_params->scale_factors; int ssx = inter_pred_params->subsampling_x; int ssy = inter_pred_params->subsampling_y; int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; orig_pos_y += src_mv->row * (1 << (1 - ssy)); int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; orig_pos_x += src_mv->col * (1 << (1 - ssx)); const int is_scaled = av1_is_scaled(sf); int pos_x, pos_y; if (LIKELY(!is_scaled)) { pos_y = av1_unscaled_value(orig_pos_y, sf); pos_x = av1_unscaled_value(orig_pos_x, sf); } else { pos_y = av1_scaled_y(orig_pos_y, sf); pos_x = av1_scaled_x(orig_pos_x, sf); } pos_x += SCALE_EXTRA_OFF; pos_y += SCALE_EXTRA_OFF; const int bottom = (height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; const int right = (width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; pos_y = clamp(pos_y, inter_pred_params->top, bottom); pos_x = clamp(pos_x, inter_pred_params->left, right); subpel_params->pos_x = pos_x; subpel_params->pos_y = pos_y; subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; subpel_params->xs = sf->x_step_q4; subpel_params->ys = sf->y_step_q4; } // Initialize interp filter required for inter prediction. static inline void init_interp_filter_params( const InterpFilterParams *interp_filter_params[2], const InterpFilters *filter, int block_width, int block_height, int is_intrabc) { if (UNLIKELY(is_intrabc)) { interp_filter_params[0] = &av1_intrabc_filter_params; interp_filter_params[1] = &av1_intrabc_filter_params; } else { interp_filter_params[0] = av1_get_interp_filter_params_with_block_size( (InterpFilter)filter->x_filter, block_width); interp_filter_params[1] = av1_get_interp_filter_params_with_block_size( (InterpFilter)filter->y_filter, block_height); } } // Initialize parameters required for inter prediction at mode level. static inline void init_inter_mode_params( const MV *const src_mv, InterPredParams *const inter_pred_params, SubpelParams *subpel_params, const struct scale_factors *sf, int width, int height) { inter_pred_params->scale_factors = sf; init_subpel_params(src_mv, inter_pred_params, subpel_params, width, height); } // Initialize parameters required for inter prediction at block level. static inline void init_inter_block_params(InterPredParams *inter_pred_params, int block_width, int block_height, int pix_row, int pix_col, int subsampling_x, int subsampling_y, int bit_depth, int use_hbd_buf, int is_intrabc) { inter_pred_params->block_width = block_width; inter_pred_params->block_height = block_height; inter_pred_params->pix_row = pix_row; inter_pred_params->pix_col = pix_col; inter_pred_params->subsampling_x = subsampling_x; inter_pred_params->subsampling_y = subsampling_y; inter_pred_params->bit_depth = bit_depth; inter_pred_params->use_hbd_buf = use_hbd_buf; inter_pred_params->is_intrabc = is_intrabc; inter_pred_params->mode = TRANSLATION_PRED; inter_pred_params->comp_mode = UNIFORM_SINGLE; inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y); inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x); } // Initialize params required for inter prediction. 
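// A rough sketch of how these initializers fit together (mirroring the
// predictor builders in reconinter_template.inc; the local variable names
// here are illustrative only):
//
//   InterPredParams params;
//   av1_init_inter_params(&params, bw, bh, pre_y, pre_x, pd->subsampling_x,
//                         pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
//                         mi->use_intrabc, sf, pre_buf, mi->interp_filters);
//   if (is_compound) av1_init_comp_mode(&params);
//   params.conv_params = get_conv_params_no_round(ref, plane,
//                                                 xd->tmp_conv_dst,
//                                                 MAX_SB_SIZE, is_compound,
//                                                 xd->bd);
//   // ... a SubpelParams struct is then derived from the MV, and the
//   // prediction is written with av1_make_inter_predictor() or
//   // av1_make_masked_inter_predictor().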
static inline void av1_init_inter_params( InterPredParams *inter_pred_params, int block_width, int block_height, int pix_row, int pix_col, int subsampling_x, int subsampling_y, int bit_depth, int use_hbd_buf, int is_intrabc, const struct scale_factors *sf, const struct buf_2d *ref_buf, int_interpfilters interp_filters) { init_inter_block_params(inter_pred_params, block_width, block_height, pix_row, pix_col, subsampling_x, subsampling_y, bit_depth, use_hbd_buf, is_intrabc); init_interp_filter_params(inter_pred_params->interp_filter_params, &interp_filters.as_filters, block_width, block_height, is_intrabc); inter_pred_params->scale_factors = sf; inter_pred_params->ref_frame_buf = *ref_buf; } static inline void av1_init_comp_mode(InterPredParams *inter_pred_params) { inter_pred_params->comp_mode = UNIFORM_COMP; } void av1_init_warp_params(InterPredParams *inter_pred_params, const WarpTypesAllowed *warp_types, int ref, const MACROBLOCKD *xd, const MB_MODE_INFO *mi); static inline int has_scale(int xs, int ys) { return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; } static inline void revert_scale_extra_bits(SubpelParams *sp) { sp->subpel_x >>= SCALE_EXTRA_BITS; sp->subpel_y >>= SCALE_EXTRA_BITS; sp->xs >>= SCALE_EXTRA_BITS; sp->ys >>= SCALE_EXTRA_BITS; assert(sp->subpel_x < SUBPEL_SHIFTS); assert(sp->subpel_y < SUBPEL_SHIFTS); assert(sp->xs <= SUBPEL_SHIFTS); assert(sp->ys <= SUBPEL_SHIFTS); } static inline void inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, int w, int h, ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); if (is_scaled) { av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, subpel_params->ys, 1, conv_params); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, sp.ys, 0, conv_params); } } static inline void highbd_inter_predictor( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const SubpelParams *subpel_params, int w, int h, ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2], int bd) { assert(conv_params->do_average == 0 || conv_params->do_average == 1); const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); if (is_scaled) { av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y, subpel_params->ys, 1, conv_params, bd); } else { SubpelParams sp = *subpel_params; revert_scale_extra_bits(&sp); av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, sp.ys, 0, conv_params, bd); } } int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, int dir); static inline int is_interinter_compound_used(COMPOUND_TYPE type, BLOCK_SIZE sb_type) { const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { case COMPOUND_AVERAGE: case COMPOUND_DISTWTD: case COMPOUND_DIFFWTD: return comp_allowed; case COMPOUND_WEDGE: return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; default: assert(0); return 0; } } static inline int is_any_masked_compound_used(BLOCK_SIZE 
sb_type) { COMPOUND_TYPE comp_type; int i; if (!is_comp_ref_allowed(sb_type)) return 0; for (i = 0; i < COMPOUND_TYPES; i++) { comp_type = (COMPOUND_TYPE)i; if (is_masked_compound_type(comp_type) && is_interinter_compound_used(comp_type, sb_type)) return 1; } return 0; } static inline int get_wedge_types_lookup(BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].wedge_types; } static inline int av1_is_wedge_used(BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].wedge_types > 0; } void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, InterPredParams *inter_pred_params, const SubpelParams *subpel_params); void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride, InterPredParams *inter_pred_params, const SubpelParams *subpel_params); // TODO(jkoleszar): yet another mv clamping function :-( static inline MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, int bh, int ss_x, int ss_y) { // If the MV points so far into the UMV border that no visible pixels // are used for reconstruction, the subpel part of the MV can be // discarded and the MV limited to 16 pixels with equivalent results. const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS; const int spel_right = spel_left - SUBPEL_SHIFTS; const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; const int spel_bottom = spel_top - SUBPEL_SHIFTS; MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; assert(ss_x <= 1); assert(ss_y <= 1); const SubpelMvLimits mv_limits = { xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom }; clamp_mv(&clamped_mv, &mv_limits); return clamped_mv; } static inline int64_t scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *sf) { int x, y; if (!sf) { x = x_offset; y = y_offset; } else if (av1_is_scaled(sf)) { x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS; y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS; } else { x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS; y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS; } return (int64_t)y * stride + x; } static inline void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, uint8_t *src, int width, int height, int stride, int mi_row, int mi_col, const struct scale_factors *scale, int subsampling_x, int subsampling_y) { // Offset the buffer pointer if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) mi_row -= 1; if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) mi_col -= 1; const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); dst->buf0 = src; dst->width = width; dst->height = height; dst->stride = stride; } void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const int plane_start, const int plane_end); void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const struct scale_factors *sf, const int num_planes); static inline void set_default_interp_filters( MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { mbmi->interp_filters = 
av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); } static inline int av1_is_interp_needed(const MACROBLOCKD *const xd) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (mbmi->skip_mode) return 0; if (mbmi->motion_mode == WARPED_CAUSAL) return 0; if (is_nontrans_global_motion(xd, xd->mi[0])) return 0; return 1; } // Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for // subsequent use in OBMC prediction. void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, uint8_t **dst_buf2); void av1_setup_build_prediction_by_above_pred( MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes); void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height, MB_MODE_INFO *left_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes); void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *above[MAX_MB_PLANE], int above_stride[MAX_MB_PLANE], uint8_t *left[MAX_MB_PLANE], int left_stride[MAX_MB_PLANE]); const uint8_t *av1_get_obmc_mask(int length); void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) void av1_init_wedge_masks(void); static inline const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, int8_t wedge_sign, BLOCK_SIZE sb_type) { return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, int *fwd_offset, int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound); const uint8_t *av1_get_compound_type_mask( const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); // build interintra_predictors for one plane void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, const BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize); void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const BUFFER_SET *ctx, uint8_t *dst, int dst_stride); void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_RECONINTER_H_ aom-3.12.1/av1/common/reconinter_template.inc000066400000000000000000000264341477627663500211270ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef IS_DEC #error "IS_DEC must be defined for reconinter_template.inc." 
#endif #if IS_DEC static inline void build_one_inter_predictor(uint8_t *dst, int dst_stride, const MV *src_mv, InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf) { #else static inline void build_one_inter_predictor( uint8_t *dst, int dst_stride, const MV *src_mv, InterPredParams *inter_pred_params) { #endif // IS_DEC SubpelParams subpel_params; uint8_t *src; int src_stride; #if IS_DEC dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, mc_buf, &src, &subpel_params, &src_stride); #else enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params, &src_stride); #endif // IS_DEC if (inter_pred_params->comp_mode == UNIFORM_SINGLE || inter_pred_params->comp_mode == UNIFORM_COMP) { av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params, &subpel_params); } else { av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params, &subpel_params); } } // True if the following hold: // 1. Not intrabc and not build_for_obmc // 2. At least one dimension is size 4 with subsampling // 3. If sub-sampled, none of the previous blocks around the sub-sample // are intrabc or inter-blocks static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, int is_intrabc, int build_for_obmc) { if (is_intrabc || build_for_obmc) { return false; } const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x; const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y; if (!is_sub4_x && !is_sub4_y) { return false; } // For sub8x8 chroma blocks, we may be covering more than one luma block's // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for // the top-left corner of the prediction source - the correct top-left corner // is at (pre_x, pre_y). const int row_start = is_sub4_y ? -1 : 0; const int col_start = is_sub4_x ? -1 : 0; for (int row = row_start; row <= 0; ++row) { for (int col = col_start; col <= 0; ++col) { const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; if (!is_inter_block(this_mbmi)) return false; if (is_intrabc_block(this_mbmi)) return false; } } return true; } #if IS_DEC static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int mi_x, int mi_y, uint8_t **mc_buf) { #else static inline void build_inter_predictors_sub8x8(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int mi_x, int mi_y) { #endif // IS_DEC const BLOCK_SIZE bsize = mi->bsize; struct macroblockd_plane *const pd = &xd->plane[plane]; const bool ss_x = pd->subsampling_x; const bool ss_y = pd->subsampling_y; const int b4_w = block_size_wide[bsize] >> ss_x; const int b4_h = block_size_high[bsize] >> ss_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); const int b8_w = block_size_wide[plane_bsize]; const int b8_h = block_size_high[plane_bsize]; const int is_compound = has_second_ref(mi); assert(!is_compound); assert(!is_intrabc_block(mi)); // For sub8x8 chroma blocks, we may be covering more than one luma block's // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for // the top-left corner of the prediction source - the correct top-left corner // is at (pre_x, pre_y). const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; const int col_start = (block_size_wide[bsize] == 4) && ss_x ? 
-1 : 0; const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; int row = row_start; for (int y = 0; y < b8_h; y += b4_h) { int col = col_start; for (int x = 0; x < b8_w; x += b4_w) { MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; int ref = 0; const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); const struct scale_factors *ref_scale_factors = get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); const struct scale_factors *const sf = ref_scale_factors; const struct buf_2d pre_buf = { NULL, (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, ref_buf->buf.uv_crop_width, ref_buf->buf.uv_crop_height, ref_buf->buf.uv_stride, }; const MV mv = this_mbmi->mv[ref].as_mv; InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, pre_x + x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, &pre_buf, this_mbmi->interp_filters); inter_pred_params.conv_params = get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd); #if IS_DEC build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd, mi_x + x, mi_y + y, ref, mc_buf); #else build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params); #endif // IS_DEC ++col; } ++row; } } #if IS_DEC static inline void build_inter_predictors_8x8_and_bigger( const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) { #else static inline void build_inter_predictors_8x8_and_bigger( const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int build_for_obmc, int bw, int bh, int mi_x, int mi_y) { #endif // IS_DEC const int is_compound = has_second_ref(mi); const int is_intrabc = is_intrabc_block(mi); assert(IMPLIES(is_intrabc, !is_compound)); struct macroblockd_plane *const pd = &xd->plane[plane]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf; int is_global[2] = { 0, 0 }; for (int ref = 0; ref < 1 + is_compound; ++ref) { const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; is_global[ref] = is_global_mv_block(mi, wm->wmtype); } const BLOCK_SIZE bsize = mi->bsize; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int row_start = (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; const int col_start = (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; for (int ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; const MV mv = mi->mv[ref].as_mv; const WarpTypesAllowed warp_types = { is_global[ref], mi->motion_mode == WARPED_CAUSAL }; InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, mi->interp_filters); if (is_compound) av1_init_comp_mode(&inter_pred_params); inter_pred_params.conv_params = get_conv_params_no_round( ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); av1_dist_wtd_comp_weight_assign( cm, mi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); if (!build_for_obmc) av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); if (is_masked_compound_type(mi->interinter_comp.type)) { inter_pred_params.sb_type = mi->bsize; inter_pred_params.mask_comp = mi->interinter_comp; if (ref == 1) { inter_pred_params.conv_params.do_average = 0; inter_pred_params.comp_mode = MASK_COMP; } // Assign physical buffer. inter_pred_params.mask_comp.seg_mask = xd->seg_mask; } #if IS_DEC build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd, mi_x, mi_y, ref, mc_buf); #else build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params); #endif // IS_DEC } } #if IS_DEC static inline void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) { if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi), build_for_obmc)) { assert(bw < 8 || bh < 8); build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf); } else { build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x, mi_y, mc_buf); } } #else static inline void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int build_for_obmc, int bw, int bh, int mi_x, int mi_y) { if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi), build_for_obmc)) { assert(bw < 8 || bh < 8); build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y); } else { build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x, mi_y); } } #endif // IS_DEC aom-3.12.1/av1/common/reconintra.c000066400000000000000000002112701477627663500166730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_once.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/reconintra.h" enum { NEED_LEFT = 1 << 1, NEED_ABOVE = 1 << 2, NEED_ABOVERIGHT = 1 << 3, NEED_ABOVELEFT = 1 << 4, NEED_BOTTOMLEFT = 1 << 5, }; #define INTRA_EDGE_FILT 3 #define INTRA_EDGE_TAPS 5 #define MAX_UPSAMPLE_SZ 16 #define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32) static const uint8_t extend_modes[INTRA_MODES] = { NEED_ABOVE | NEED_LEFT, // DC NEED_ABOVE, // V NEED_LEFT, // H NEED_ABOVE | NEED_ABOVERIGHT, // D45 NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135 NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113 NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157 NEED_LEFT | NEED_BOTTOMLEFT, // D203 NEED_ABOVE | NEED_ABOVERIGHT, // D67 NEED_LEFT | NEED_ABOVE, // SMOOTH NEED_LEFT | NEED_ABOVE, // SMOOTH_V NEED_LEFT | NEED_ABOVE, // SMOOTH_H NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH }; // Tables to store if the top-right reference pixels are available. The flags // are represented with bits, packed into 8-bit integers. E.g., for the 32x32 // blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster // order), so its flag is stored at the 3rd bit of the 2nd entry in the table, // i.e. (table[10 / 8] >> (10 % 8)) & 1. // . . . . // . . . . // . . o . // . . . . static uint8_t has_tr_4x4[128] = { 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, }; static uint8_t has_tr_4x8[64] = { 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, }; static uint8_t has_tr_8x4[64] = { 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, }; static uint8_t has_tr_8x8[32] = { 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, }; static uint8_t has_tr_8x16[16] = { 255, 255, 119, 119, 127, 127, 119, 119, 255, 127, 119, 119, 127, 127, 119, 119, }; static uint8_t has_tr_16x8[16] = { 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0, }; static uint8_t has_tr_16x16[8] = { 255, 85, 119, 85, 127, 85, 119, 85, }; static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 }; static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 }; static uint8_t has_tr_32x32[2] = { 95, 87 }; static uint8_t has_tr_32x64[1] = { 127 }; static uint8_t has_tr_64x32[1] = { 19 }; static uint8_t has_tr_64x64[1] = { 7 }; static uint8_t 
has_tr_64x128[1] = { 3 }; static uint8_t has_tr_128x64[1] = { 1 }; static uint8_t has_tr_128x128[1] = { 1 }; static uint8_t has_tr_4x16[32] = { 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, }; static uint8_t has_tr_16x4[32] = { 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, }; static uint8_t has_tr_8x32[8] = { 255, 255, 127, 127, 255, 127, 127, 127, }; static uint8_t has_tr_32x8[8] = { 15, 0, 5, 0, 7, 0, 5, 0, }; static uint8_t has_tr_16x64[2] = { 255, 127 }; static uint8_t has_tr_64x16[2] = { 3, 1 }; static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = { // 4X4 has_tr_4x4, // 4X8, 8X4, 8X8 has_tr_4x8, has_tr_8x4, has_tr_8x8, // 8X16, 16X8, 16X16 has_tr_8x16, has_tr_16x8, has_tr_16x16, // 16X32, 32X16, 32X32 has_tr_16x32, has_tr_32x16, has_tr_32x32, // 32X64, 64X32, 64X64 has_tr_32x64, has_tr_64x32, has_tr_64x64, // 64x128, 128x64, 128x128 has_tr_64x128, has_tr_128x64, has_tr_128x128, // 4x16, 16x4, 8x32 has_tr_4x16, has_tr_16x4, has_tr_8x32, // 32x8, 16x64, 64x16 has_tr_32x8, has_tr_16x64, has_tr_64x16 }; static uint8_t has_tr_vert_8x8[32] = { 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, }; static uint8_t has_tr_vert_16x16[8] = { 255, 0, 119, 0, 127, 0, 119, 0, }; static uint8_t has_tr_vert_32x32[2] = { 15, 7 }; static uint8_t has_tr_vert_64x64[1] = { 3 }; // The _vert_* tables are like the ordinary tables above, but describe the // order we visit square blocks when doing a PARTITION_VERT_A or // PARTITION_VERT_B. This is the same order as normal except for on the last // split where we go vertically (TL, BL, TR, BR). We treat the rectangular block // as a pair of squares, which means that these tables work correctly for both // mixed vertical partition types. // // There are tables for each of the square sizes. Vertical rectangles (like // BLOCK_16X32) use their respective "non-vert" table static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { // 4X4 NULL, // 4X8, 8X4, 8X8 has_tr_4x8, NULL, has_tr_vert_8x8, // 8X16, 16X8, 16X16 has_tr_8x16, NULL, has_tr_vert_16x16, // 16X32, 32X16, 32X32 has_tr_16x32, NULL, has_tr_vert_32x32, // 32X64, 64X32, 64X64 has_tr_32x64, NULL, has_tr_vert_64x64, // 64x128, 128x64, 128x128 has_tr_64x128, NULL, has_tr_128x128 }; static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, BLOCK_SIZE bsize) { const uint8_t *ret = NULL; // If this is a mixed vertical partition, look up bsize in orders_vert. if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { assert(bsize < BLOCK_SIZES); ret = has_tr_vert_tables[bsize]; } else { ret = has_tr_tables[bsize]; } assert(ret); return ret; } static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int top_available, int right_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { if (!top_available || !right_available) return 0; const int bw_unit = mi_size_wide[bsize]; const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); const int top_right_count_unit = tx_size_wide_unit[txsz]; if (row_off > 0) { // Just need to check if enough pixels on the right. 
if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { // Special case: For 128x128 blocks, the transform unit whose // top-right corner is at the center of the block does in fact have // pixels available at its top-right corner. if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { return 1; } const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; const int col_off_64 = col_off % plane_bw_unit_64; return col_off_64 + top_right_count_unit < plane_bw_unit_64; } return col_off + top_right_count_unit < plane_bw_unit; } else { // All top-right pixels are in the block above, which is already available. if (col_off + top_right_count_unit < plane_bw_unit) return 1; const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; // Top row of superblock: so top-right pixels are in the top and/or // top-right superblocks, both of which are already available. if (blk_row_in_sb == 0) return 1; // Rightmost column of superblock (and not the top row): so top-right pixels // fall in the right superblock, which is not available yet. if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { return 0; } // General case (neither top row nor rightmost column): check if the // top-right block is coded before the current block. const int this_blk_index = ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb + 0; const int idx1 = this_blk_index / 8; const int idx2 = this_blk_index % 8; const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); return (has_tr_table[idx1] >> idx2) & 1; } } // Similar to the has_tr_* tables, but store if the bottom-left reference // pixels are available. 
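// They are packed and indexed the same way as the has_tr_* tables: for a
// square block whose raster index within its superblock is i, availability is
// (table[i / 8] >> (i % 8)) & 1, which is how has_bottom_left() below reads
// them back.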
static uint8_t has_bl_4x4[128] = { 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, }; static uint8_t has_bl_4x8[64] = { 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, }; static uint8_t has_bl_8x4[64] = { 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, }; static uint8_t has_bl_8x8[32] = { 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, }; static uint8_t has_bl_8x16[16] = { 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, }; static uint8_t has_bl_16x8[16] = { 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, }; static uint8_t has_bl_16x16[8] = { 84, 16, 84, 0, 84, 16, 84, 0, }; static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; static uint8_t has_bl_32x32[2] = { 4, 4 }; static uint8_t has_bl_32x64[1] = { 0 }; static uint8_t has_bl_64x32[1] = { 34 }; static uint8_t has_bl_64x64[1] = { 0 }; static uint8_t has_bl_64x128[1] = { 0 }; static uint8_t has_bl_128x64[1] = { 0 }; static uint8_t has_bl_128x128[1] = { 0 }; static uint8_t has_bl_4x16[32] = { 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, }; static uint8_t has_bl_16x4[32] = { 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, }; static uint8_t has_bl_8x32[8] = { 0, 1, 0, 0, 0, 1, 0, 0, }; static uint8_t has_bl_32x8[8] = { 238, 78, 238, 14, 238, 78, 238, 14, }; static uint8_t has_bl_16x64[2] = { 0, 0 }; static uint8_t has_bl_64x16[2] = { 42, 42 }; static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { // 4X4 has_bl_4x4, // 4X8, 8X4, 8X8 has_bl_4x8, has_bl_8x4, has_bl_8x8, // 8X16, 16X8, 16X16 has_bl_8x16, has_bl_16x8, has_bl_16x16, // 16X32, 32X16, 32X32 has_bl_16x32, has_bl_32x16, has_bl_32x32, // 32X64, 64X32, 64X64 has_bl_32x64, has_bl_64x32, has_bl_64x64, // 64x128, 128x64, 128x128 has_bl_64x128, has_bl_128x64, has_bl_128x128, // 4x16, 16x4, 8x32 has_bl_4x16, has_bl_16x4, has_bl_8x32, // 32x8, 16x64, 64x16 has_bl_32x8, has_bl_16x64, has_bl_64x16 }; static uint8_t has_bl_vert_8x8[32] = { 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, }; static uint8_t has_bl_vert_16x16[8] = { 254, 16, 254, 0, 254, 16, 254, 0, }; static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; static uint8_t has_bl_vert_64x64[1] = { 2 }; // The _vert_* tables are like the ordinary tables above, but describe the // order we visit square blocks when doing a PARTITION_VERT_A or // PARTITION_VERT_B. 
This is the same order as normal except for on the last // split where we go vertically (TL, BL, TR, BR). We treat the rectangular block // as a pair of squares, which means that these tables work correctly for both // mixed vertical partition types. // // There are tables for each of the square sizes. Vertical rectangles (like // BLOCK_16X32) use their respective "non-vert" table static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { // 4X4 NULL, // 4X8, 8X4, 8X8 has_bl_4x8, NULL, has_bl_vert_8x8, // 8X16, 16X8, 16X16 has_bl_8x16, NULL, has_bl_vert_16x16, // 16X32, 32X16, 32X32 has_bl_16x32, NULL, has_bl_vert_32x32, // 32X64, 64X32, 64X64 has_bl_32x64, NULL, has_bl_vert_64x64, // 64x128, 128x64, 128x128 has_bl_64x128, NULL, has_bl_128x128 }; static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, BLOCK_SIZE bsize) { const uint8_t *ret = NULL; // If this is a mixed vertical partition, look up bsize in orders_vert. if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { assert(bsize < BLOCK_SIZES); ret = has_bl_vert_tables[bsize]; } else { ret = has_bl_tables[bsize]; } assert(ret); return ret; } static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int bottom_available, int left_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { if (!bottom_available || !left_available) return 0; // Special case for 128x* blocks, when col_off is half the block width. // This is needed because 128x* superblocks are divided into 64x* blocks in // raster order if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; const int col_off_64 = col_off % plane_bw_unit_64; if (col_off_64 == 0) { // We are at the left edge of top-right or bottom-right 64x* block. const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; const int row_off_64 = row_off % plane_bh_unit_64; const int plane_bh_unit = AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); // Check if all bottom-left pixels are in the left 64x* block (which is // already coded). return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; } } if (col_off > 0) { // Bottom-left pixels are in the bottom-left block, which is not available. return 0; } else { const int bh_unit = mi_size_high[bsize]; const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); const int bottom_left_count_unit = tx_size_high_unit[txsz]; // All bottom-left pixels are in the left block, which is already available. if (row_off + bottom_left_count_unit < plane_bh_unit) return 1; const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; // Leftmost column of superblock: so bottom-left pixels maybe in the left // and/or bottom-left superblocks. But only the left superblock is // available, so check if all required pixels fall in that superblock. 
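// The shift below converts the block's first row within the superblock from
// block units into the same (possibly chroma-subsampled) 4-sample units as
// row_off, so that the sum can be compared against the superblock height.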
if (blk_col_in_sb == 0) { const int blk_start_row_off = blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >> ss_y; const int row_off_in_sb = blk_start_row_off + row_off; const int sb_height_unit = sb_mi_size >> ss_y; return row_off_in_sb + bottom_left_count_unit < sb_height_unit; } // Bottom row of superblock (and not the leftmost column): so bottom-left // pixels fall in the bottom superblock, which is not available yet. if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0; // General case (neither leftmost column nor bottom row): check if the // bottom-left block is coded before the current block. const int this_blk_index = ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb + 0; const int idx1 = this_blk_index / 8; const int idx2 = this_blk_index % 8; const uint8_t *has_bl_table = get_has_bl_table(partition, bsize); return (has_bl_table[idx1] >> idx2) & 1; } } typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; #if CONFIG_AV1_HIGHBITDEPTH typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; #endif static void init_intra_predictors_internal(void) { assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER #define INIT_RECTANGULAR(p, type) \ p[TX_4X8] = aom_##type##_predictor_4x8; \ p[TX_8X4] = aom_##type##_predictor_8x4; \ p[TX_8X16] = aom_##type##_predictor_8x16; \ p[TX_16X8] = aom_##type##_predictor_16x8; \ p[TX_16X32] = aom_##type##_predictor_16x32; \ p[TX_32X16] = aom_##type##_predictor_32x16; \ p[TX_32X64] = aom_##type##_predictor_32x64; \ p[TX_64X32] = aom_##type##_predictor_64x32; #else #define INIT_RECTANGULAR(p, type) \ p[TX_4X8] = aom_##type##_predictor_4x8; \ p[TX_8X4] = aom_##type##_predictor_8x4; \ p[TX_8X16] = aom_##type##_predictor_8x16; \ p[TX_16X8] = aom_##type##_predictor_16x8; \ p[TX_16X32] = aom_##type##_predictor_16x32; \ p[TX_32X16] = aom_##type##_predictor_32x16; \ p[TX_32X64] = aom_##type##_predictor_32x64; \ p[TX_64X32] = aom_##type##_predictor_64x32; \ p[TX_4X16] = aom_##type##_predictor_4x16; \ p[TX_16X4] = aom_##type##_predictor_16x4; \ p[TX_8X32] = aom_##type##_predictor_8x32; \ p[TX_32X8] = aom_##type##_predictor_32x8; \ p[TX_16X64] = aom_##type##_predictor_16x64; \ p[TX_64X16] = aom_##type##_predictor_64x16; #endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER #define INIT_NO_4X4(p, type) \ p[TX_8X8] = aom_##type##_predictor_8x8; \ p[TX_16X16] = aom_##type##_predictor_16x16; \ p[TX_32X32] = aom_##type##_predictor_32x32; \ p[TX_64X64] = aom_##type##_predictor_64x64; \ INIT_RECTANGULAR(p, type) #define INIT_ALL_SIZES(p, type) \ p[TX_4X4] = aom_##type##_predictor_4x4; \ INIT_NO_4X4(p, type) INIT_ALL_SIZES(pred[V_PRED], v) INIT_ALL_SIZES(pred[H_PRED], h) INIT_ALL_SIZES(pred[PAETH_PRED], paeth) INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth) INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v) INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h) INIT_ALL_SIZES(dc_pred[0][0], dc_128) INIT_ALL_SIZES(dc_pred[0][1], dc_top) INIT_ALL_SIZES(dc_pred[1][0], dc_left) INIT_ALL_SIZES(dc_pred[1][1], dc) #if CONFIG_AV1_HIGHBITDEPTH INIT_ALL_SIZES(pred_high[V_PRED], highbd_v) INIT_ALL_SIZES(pred_high[H_PRED], highbd_h) INIT_ALL_SIZES(pred_high[PAETH_PRED], 
highbd_paeth) INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth) INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v) INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h) INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128) INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top) INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left) INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc) #endif #undef intra_pred_allsizes } // Directional prediction, zone 1: 0 < angle < 90 void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy) { int r, c, x, base, shift, val; (void)left; (void)dy; assert(dy == 1); assert(dx > 0); const int max_base_x = ((bw + bh) - 1) << upsample_above; const int frac_bits = 6 - upsample_above; const int base_inc = 1 << upsample_above; x = dx; for (r = 0; r < bh; ++r, dst += stride, x += dx) { base = x >> frac_bits; shift = ((x << upsample_above) & 0x3F) >> 1; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { memset(dst, above[max_base_x], bw * sizeof(dst[0])); dst += stride; } return; } for (c = 0; c < bw; ++c, base += base_inc) { if (base < max_base_x) { val = above[base] * (32 - shift) + above[base + 1] * shift; dst[c] = ROUND_POWER_OF_TWO(val, 5); } else { dst[c] = above[max_base_x]; } } } } // Directional prediction, zone 2: 90 < angle < 180 void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy) { assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); (void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; for (int r = 0; r < bh; ++r) { for (int c = 0; c < bw; ++c) { int val; int y = r + 1; int x = (c << 6) - y * dx; const int base_x = x >> frac_bits_x; if (base_x >= min_base_x) { const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { x = c + 1; y = (r << 6) - x * dy; const int base_y = y >> frac_bits_y; assert(base_y >= min_base_y); const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; } dst += stride; } } // Directional prediction, zone 3: 180 < angle < 270 void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { int r, c, y, base, shift, val; (void)above; (void)dx; assert(dx == 1); assert(dy > 0); const int max_base_y = (bw + bh - 1) << upsample_left; const int frac_bits = 6 - upsample_left; const int base_inc = 1 << upsample_left; y = dy; for (c = 0; c < bw; ++c, y += dy) { base = y >> frac_bits; shift = ((y << upsample_left) & 0x3F) >> 1; for (r = 0; r < bh; ++r, base += base_inc) { if (base < max_base_y) { val = left[base] * (32 - shift) + left[base + 1] * shift; dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); } else { for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; break; } } } } static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int angle) { const int dx = av1_get_dx(angle); const int dy = av1_get_dy(angle); const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; 
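// Dispatch on the prediction angle: zones 1-3 use the directional kernels
// above, while angles of exactly 90 and 180 degrees reduce to the plain
// vertical and horizontal predictors.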
assert(angle > 0 && angle < 270); if (angle > 0 && angle < 90) { av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, dy); } else if (angle > 90 && angle < 180) { av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy); } else if (angle > 180 && angle < 270) { av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, dy); } else if (angle == 90) { pred[V_PRED][tx_size](dst, stride, above, left); } else if (angle == 180) { pred[H_PRED][tx_size](dst, stride, above, left); } } #if CONFIG_AV1_HIGHBITDEPTH // Directional prediction, zone 1: 0 < angle < 90 void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd) { int r, c, x, base, shift, val; (void)left; (void)dy; (void)bd; assert(dy == 1); assert(dx > 0); const int max_base_x = ((bw + bh) - 1) << upsample_above; const int frac_bits = 6 - upsample_above; const int base_inc = 1 << upsample_above; x = dx; for (r = 0; r < bh; ++r, dst += stride, x += dx) { base = x >> frac_bits; shift = ((x << upsample_above) & 0x3F) >> 1; if (base >= max_base_x) { for (int i = r; i < bh; ++i) { aom_memset16(dst, above[max_base_x], bw); dst += stride; } return; } for (c = 0; c < bw; ++c, base += base_inc) { if (base < max_base_x) { val = above[base] * (32 - shift) + above[base + 1] * shift; dst[c] = ROUND_POWER_OF_TWO(val, 5); } else { dst[c] = above[max_base_x]; } } } } // Directional prediction, zone 2: 90 < angle < 180 void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { (void)bd; assert(dx > 0); assert(dy > 0); const int min_base_x = -(1 << upsample_above); const int min_base_y = -(1 << upsample_left); (void)min_base_y; const int frac_bits_x = 6 - upsample_above; const int frac_bits_y = 6 - upsample_left; for (int r = 0; r < bh; ++r) { for (int c = 0; c < bw; ++c) { int val; int y = r + 1; int x = (c << 6) - y * dx; const int base_x = x >> frac_bits_x; if (base_x >= min_base_x) { const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } else { x = c + 1; y = (r << 6) - x * dy; const int base_y = y >> frac_bits_y; assert(base_y >= min_base_y); const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; val = ROUND_POWER_OF_TWO(val, 5); } dst[c] = val; } dst += stride; } } // Directional prediction, zone 3: 180 < angle < 270 void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd) { int r, c, y, base, shift, val; (void)above; (void)dx; (void)bd; assert(dx == 1); assert(dy > 0); const int max_base_y = (bw + bh - 1) << upsample_left; const int frac_bits = 6 - upsample_left; const int base_inc = 1 << upsample_left; y = dy; for (c = 0; c < bw; ++c, y += dy) { base = y >> frac_bits; shift = ((y << upsample_left) & 0x3F) >> 1; for (r = 0; r < bh; ++r, base += base_inc) { if (base < max_base_y) { val = left[base] * (32 - shift) + left[base + 1] * shift; dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); } else { for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; break; } } } } static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, 
const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int angle, int bd) { const int dx = av1_get_dx(angle); const int dy = av1_get_dy(angle); const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; assert(angle > 0 && angle < 270); if (angle > 0 && angle < 90) { av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd); } else if (angle > 90 && angle < 180) { av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy, bd); } else if (angle > 180 && angle < 270) { av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd); } else if (angle == 90) { pred_high[V_PRED][tx_size](dst, stride, above, left, bd); } else if (angle == 180) { pred_high[H_PRED][tx_size](dst, stride, above, left, bd); } } #endif // CONFIG_AV1_HIGHBITDEPTH DECLARE_ALIGNED(16, const int8_t, av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { { { -6, 10, 0, 0, 0, 12, 0, 0 }, { -5, 2, 10, 0, 0, 9, 0, 0 }, { -3, 1, 1, 10, 0, 7, 0, 0 }, { -3, 1, 1, 2, 10, 5, 0, 0 }, { -4, 6, 0, 0, 0, 2, 12, 0 }, { -3, 2, 6, 0, 0, 2, 9, 0 }, { -3, 2, 2, 6, 0, 2, 7, 0 }, { -3, 1, 2, 2, 6, 3, 5, 0 }, }, { { -10, 16, 0, 0, 0, 10, 0, 0 }, { -6, 0, 16, 0, 0, 6, 0, 0 }, { -4, 0, 0, 16, 0, 4, 0, 0 }, { -2, 0, 0, 0, 16, 2, 0, 0 }, { -10, 16, 0, 0, 0, 0, 10, 0 }, { -6, 0, 16, 0, 0, 0, 6, 0 }, { -4, 0, 0, 16, 0, 0, 4, 0 }, { -2, 0, 0, 0, 16, 0, 2, 0 }, }, { { -8, 8, 0, 0, 0, 16, 0, 0 }, { -8, 0, 8, 0, 0, 16, 0, 0 }, { -8, 0, 0, 8, 0, 16, 0, 0 }, { -8, 0, 0, 0, 8, 16, 0, 0 }, { -4, 4, 0, 0, 0, 0, 16, 0 }, { -4, 0, 4, 0, 0, 0, 16, 0 }, { -4, 0, 0, 4, 0, 0, 16, 0 }, { -4, 0, 0, 0, 4, 0, 16, 0 }, }, { { -2, 8, 0, 0, 0, 10, 0, 0 }, { -1, 3, 8, 0, 0, 6, 0, 0 }, { -1, 2, 3, 8, 0, 4, 0, 0 }, { 0, 1, 2, 3, 8, 2, 0, 0 }, { -1, 4, 0, 0, 0, 3, 10, 0 }, { -1, 3, 4, 0, 0, 4, 6, 0 }, { -1, 2, 3, 4, 0, 4, 4, 0 }, { -1, 2, 2, 3, 4, 3, 3, 0 }, }, { { -12, 14, 0, 0, 0, 14, 0, 0 }, { -10, 0, 14, 0, 0, 12, 0, 0 }, { -9, 0, 0, 14, 0, 11, 0, 0 }, { -8, 0, 0, 0, 14, 10, 0, 0 }, { -10, 12, 0, 0, 0, 0, 14, 0 }, { -9, 1, 12, 0, 0, 0, 12, 0 }, { -8, 0, 0, 12, 0, 1, 11, 0 }, { -7, 0, 0, 1, 12, 1, 9, 0 }, }, }; void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode) { int r, c; uint8_t buffer[33][33]; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; assert(bw <= 32 && bh <= 32); for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); for (r = 1; r < bh + 1; r += 2) for (c = 1; c < bw + 1; c += 4) { const uint8_t p0 = buffer[r - 1][c - 1]; const uint8_t p1 = buffer[r - 1][c]; const uint8_t p2 = buffer[r - 1][c + 1]; const uint8_t p3 = buffer[r - 1][c + 2]; const uint8_t p4 = buffer[r - 1][c + 3]; const uint8_t p5 = buffer[r][c - 1]; const uint8_t p6 = buffer[r + 1][c - 1]; for (int k = 0; k < 8; ++k) { int r_offset = k >> 2; int c_offset = k & 0x03; int pr = av1_filter_intra_taps[mode][k][0] * p0 + av1_filter_intra_taps[mode][k][1] * p1 + av1_filter_intra_taps[mode][k][2] * p2 + av1_filter_intra_taps[mode][k][3] * p3 + av1_filter_intra_taps[mode][k][4] * p4 + av1_filter_intra_taps[mode][k][5] * p5 + av1_filter_intra_taps[mode][k][6] * p6; // Section 7.11.2.3 specifies the right-hand side of the assignment as // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). // Since Clip1() clips a negative value to 0, it is safe to replace // Round2Signed() with Round2(). 
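// With FILTER_INTRA_SCALE_BITS == 4 this amounts to (pr + 8) >> 4, clipped
// to the valid pixel range.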
buffer[r + r_offset][c + c_offset] = clip_pixel(ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS)); } } for (r = 0; r < bh; ++r) { memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); dst += stride; } } #if CONFIG_AV1_HIGHBITDEPTH static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int mode, int bd) { int r, c; uint16_t buffer[33][33]; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; assert(bw <= 32 && bh <= 32); for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); for (r = 1; r < bh + 1; r += 2) for (c = 1; c < bw + 1; c += 4) { const uint16_t p0 = buffer[r - 1][c - 1]; const uint16_t p1 = buffer[r - 1][c]; const uint16_t p2 = buffer[r - 1][c + 1]; const uint16_t p3 = buffer[r - 1][c + 2]; const uint16_t p4 = buffer[r - 1][c + 3]; const uint16_t p5 = buffer[r][c - 1]; const uint16_t p6 = buffer[r + 1][c - 1]; for (int k = 0; k < 8; ++k) { int r_offset = k >> 2; int c_offset = k & 0x03; int pr = av1_filter_intra_taps[mode][k][0] * p0 + av1_filter_intra_taps[mode][k][1] * p1 + av1_filter_intra_taps[mode][k][2] * p2 + av1_filter_intra_taps[mode][k][3] * p3 + av1_filter_intra_taps[mode][k][4] * p4 + av1_filter_intra_taps[mode][k][5] * p5 + av1_filter_intra_taps[mode][k][6] * p6; // Section 7.11.2.3 specifies the right-hand side of the assignment as // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). // Since Clip1() clips a negative value to 0, it is safe to replace // Round2Signed() with Round2(). buffer[r + r_offset][c + c_offset] = clip_pixel_highbd( ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd); } } for (r = 0; r < bh; ++r) { memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); dst += stride; } } #endif // CONFIG_AV1_HIGHBITDEPTH static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { if (plane == 0) { const PREDICTION_MODE mode = mbmi->mode; return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || mode == SMOOTH_H_PRED); } else { // uv_mode is not set for inter blocks, so need to explicitly // detect that case. 
if (is_inter_block(mbmi)) return 0; const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || uv_mode == UV_SMOOTH_H_PRED); } } static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { const MB_MODE_INFO *above; const MB_MODE_INFO *left; if (plane == 0) { above = xd->above_mbmi; left = xd->left_mbmi; } else { above = xd->chroma_above_mbmi; left = xd->chroma_left_mbmi; } return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane)); } static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { const int d = abs(delta); int strength = 0; const int blk_wh = bs0 + bs1; if (type == 0) { if (blk_wh <= 8) { if (d >= 56) strength = 1; } else if (blk_wh <= 12) { if (d >= 40) strength = 1; } else if (blk_wh <= 16) { if (d >= 40) strength = 1; } else if (blk_wh <= 24) { if (d >= 8) strength = 1; if (d >= 16) strength = 2; if (d >= 32) strength = 3; } else if (blk_wh <= 32) { if (d >= 1) strength = 1; if (d >= 4) strength = 2; if (d >= 32) strength = 3; } else { if (d >= 1) strength = 3; } } else { if (blk_wh <= 8) { if (d >= 40) strength = 1; if (d >= 64) strength = 2; } else if (blk_wh <= 16) { if (d >= 20) strength = 1; if (d >= 48) strength = 2; } else if (blk_wh <= 24) { if (d >= 4) strength = 3; } else { if (d >= 1) strength = 3; } } return strength; } void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { if (!strength) return; const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint8_t edge[129]; memcpy(edge, p, sz * sizeof(*p)); for (int i = 1; i < sz; i++) { int s = 0; for (int j = 0; j < INTRA_EDGE_TAPS; j++) { int k = i - 2 + j; k = (k < 0) ? 0 : k; k = (k > sz - 1) ? 
sz - 1 : k; s += edge[k] * kernel[filt][j]; } s = (s + 8) >> 4; p[i] = s; } } static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { const int kernel[3] = { 5, 6, 5 }; int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + (p_above[0] * kernel[2]); s = (s + 8) >> 4; p_above[-1] = s; p_left[-1] = s; } void av1_upsample_intra_edge_c(uint8_t *p, int sz) { // interpolate half-sample positions assert(sz <= MAX_UPSAMPLE_SZ); uint8_t in[MAX_UPSAMPLE_SZ + 3]; // copy p[-1..(sz-1)] and extend first and last samples in[0] = p[-1]; in[1] = p[-1]; for (int i = 0; i < sz; i++) { in[i + 2] = p[i]; } in[sz + 2] = p[sz - 1]; // interpolate half-sample edge positions p[-2] = in[0]; for (int i = 0; i < sz; i++) { int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; s = clip_pixel((s + 8) >> 4); p[2 * i - 1] = s; p[2 * i] = in[i + 2]; } } static void build_directional_and_filter_intra_predictors( const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) { int i; const uint8_t *above_ref = ref - ref_stride; const uint8_t *left_ref = ref - 1; DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint8_t *const above_row = above_data + 16; uint8_t *const left_col = left_data + 16; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; assert(use_filter_intra || is_dr_mode); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to // be the potential reason for this issue. memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); // The default values if ref pixels are not available: // 128 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z // 129 C D .. W X // 129 E F .. U V // 129 G H .. S T T T T T // .. if (is_dr_mode) { if (p_angle <= 90) need_above = 1, need_left = 0, need_above_left = 1; else if (p_angle < 180) need_above = 1, need_left = 1, need_above_left = 1; else need_above = 0, need_left = 1, need_above_left = 1; } if (use_filter_intra) need_left = need_above = need_above_left = 1; assert(n_top_px >= 0); assert(n_topright_px >= -1); assert(n_left_px >= 0); assert(n_bottomleft_px >= -1); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { int val; if (need_left) { val = (n_top_px > 0) ? above_ref[0] : 129; } else { val = (n_left_px > 0) ? left_ref[0] : 127; } for (i = 0; i < txhpx; ++i) { memset(dst, val, txwpx); dst += dst_stride; } return; } // NEED_LEFT if (need_left) { const int num_left_pixels_needed = txhpx + (n_bottomleft_px >= 0 ? 
txwpx : 0); i = 0; if (n_left_px > 0) { for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (n_bottomleft_px > 0) { assert(i == txhpx); for (; i < txhpx + n_bottomleft_px; i++) left_col[i] = left_ref[i * ref_stride]; } if (i < num_left_pixels_needed) memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); } else if (n_top_px > 0) { memset(left_col, above_ref[0], num_left_pixels_needed); } } // NEED_ABOVE if (need_above) { const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px); i = n_top_px; if (n_topright_px > 0) { assert(n_top_px == txwpx); memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px); i += n_topright_px; } if (i < num_top_pixels_needed) memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); } else if (n_left_px > 0) { memset(above_row, left_ref[0], num_top_pixels_needed); } } if (need_above_left) { if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { above_row[-1] = left_ref[0]; } else { above_row[-1] = 128; } left_col[-1] = above_row[-1]; } if (use_filter_intra) { av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, filter_intra_mode); return; } assert(is_dr_mode); int upsample_above = 0; int upsample_left = 0; if (!disable_edge_filter) { const int need_right = p_angle < 90; const int need_bottom = p_angle > 180; if (p_angle != 90 && p_angle != 180) { assert(need_above_left); const int ab_le = 1; if (need_above && need_left && (txwpx + txhpx >= 24)) { filter_intra_edge_corner(above_row, left_col); } if (need_above && n_top_px > 0) { const int strength = intra_edge_filter_strength( txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_filter_intra_edge(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_filter_intra_edge(left_col - ab_le, n_px, strength); } } upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_upsample_intra_edge(above_row, n_px); } upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); av1_upsample_intra_edge(left_col, n_px); } } dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, upsample_left, p_angle); } // This function generates the pred data of a given block for non-directional // intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH). 
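// These modes only read the row above, the column to the left and the
// above-left sample, so no top-right / bottom-left extension and no intra
// edge filtering or upsampling is needed here, which keeps this path much
// simpler than build_directional_and_filter_intra_predictors().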
static void build_non_directional_intra_predictors( const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) { const uint8_t *above_ref = ref - ref_stride; const uint8_t *left_ref = ref - 1; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; const int need_left = extend_modes[mode] & NEED_LEFT; const int need_above = extend_modes[mode] & NEED_ABOVE; const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; int i = 0; assert(n_top_px >= 0); assert(n_left_px >= 0); assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || mode == SMOOTH_H_PRED || mode == PAETH_PRED); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { int val = 0; if (need_left) { val = (n_top_px > 0) ? above_ref[0] : 129; } else { val = (n_left_px > 0) ? left_ref[0] : 127; } for (i = 0; i < txhpx; ++i) { memset(dst, val, txwpx); dst += dst_stride; } return; } DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint8_t *const above_row = above_data + 16; uint8_t *const left_col = left_data + 16; if (need_left) { memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); if (n_left_px > 0) { for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i); } else if (n_top_px > 0) { memset(left_col, above_ref[0], txhpx); } } if (need_above) { memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px); i = n_top_px; if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i); } else if (n_left_px > 0) { memset(above_row, left_ref[0], txwpx); } } if (need_above_left) { if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { above_row[-1] = left_ref[0]; } else { above_row[-1] = 128; } left_col[-1] = above_row[-1]; } if (mode == DC_PRED) { dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, left_col); } else { pred[mode][tx_size](dst, dst_stride, above_row, left_col); } } #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) { if (!strength) return; const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 } }; const int filt = strength - 1; uint16_t edge[129]; memcpy(edge, p, sz * sizeof(*p)); for (int i = 1; i < sz; i++) { int s = 0; for (int j = 0; j < INTRA_EDGE_TAPS; j++) { int k = i - 2 + j; k = (k < 0) ? 0 : k; k = (k > sz - 1) ? 
sz - 1 : k; s += edge[k] * kernel[filt][j]; } s = (s + 8) >> 4; p[i] = s; } } static void highbd_filter_intra_edge_corner(uint16_t *p_above, uint16_t *p_left) { const int kernel[3] = { 5, 6, 5 }; int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + (p_above[0] * kernel[2]); s = (s + 8) >> 4; p_above[-1] = s; p_left[-1] = s; } void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) { // interpolate half-sample positions assert(sz <= MAX_UPSAMPLE_SZ); uint16_t in[MAX_UPSAMPLE_SZ + 3]; // copy p[-1..(sz-1)] and extend first and last samples in[0] = p[-1]; in[1] = p[-1]; for (int i = 0; i < sz; i++) { in[i + 2] = p[i]; } in[sz + 2] = p[sz - 1]; // interpolate half-sample edge positions p[-2] = in[0]; for (int i = 0; i < sz; i++) { int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; s = (s + 8) >> 4; s = clip_pixel_highbd(s, bd); p[2 * i - 1] = s; p[2 * i] = in[i + 2]; } } static void highbd_build_directional_and_filter_intra_predictors( const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, int n_bottomleft_px, int intra_edge_filter_type, int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint16_t *const above_row = above_data + 16; uint16_t *const left_col = left_data + 16; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const uint16_t *above_ref = ref - ref_stride; const uint16_t *left_ref = ref - 1; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; assert(use_filter_intra || is_dr_mode); const int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are // seen to be the potential reason for this issue. aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); // The default values if ref pixels are not available: // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 // base+1 A B .. Y Z // base+1 C D .. W X // base+1 E F .. U V // base+1 G H .. S T T T T T if (is_dr_mode) { if (p_angle <= 90) need_above = 1, need_left = 0, need_above_left = 1; else if (p_angle < 180) need_above = 1, need_left = 1, need_above_left = 1; else need_above = 0, need_left = 1, need_above_left = 1; } if (use_filter_intra) need_left = need_above = need_above_left = 1; assert(n_top_px >= 0); assert(n_topright_px >= -1); assert(n_left_px >= 0); assert(n_bottomleft_px >= -1); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { int val; if (need_left) { val = (n_top_px > 0) ? above_ref[0] : base + 1; } else { val = (n_left_px > 0) ? left_ref[0] : base - 1; } for (i = 0; i < txhpx; ++i) { aom_memset16(dst, val, txwpx); dst += dst_stride; } return; } // NEED_LEFT if (need_left) { const int num_left_pixels_needed = txhpx + (n_bottomleft_px >= 0 ? 
txwpx : 0); i = 0; if (n_left_px > 0) { for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (n_bottomleft_px > 0) { assert(i == txhpx); for (; i < txhpx + n_bottomleft_px; i++) left_col[i] = left_ref[i * ref_stride]; } if (i < num_left_pixels_needed) aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); } else if (n_top_px > 0) { aom_memset16(left_col, above_ref[0], num_left_pixels_needed); } } // NEED_ABOVE if (need_above) { const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); i = n_top_px; if (n_topright_px > 0) { assert(n_top_px == txwpx); memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px * sizeof(above_ref[0])); i += n_topright_px; } if (i < num_top_pixels_needed) aom_memset16(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); } else if (n_left_px > 0) { aom_memset16(above_row, left_ref[0], num_top_pixels_needed); } } if (need_above_left) { if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { above_row[-1] = left_ref[0]; } else { above_row[-1] = base; } left_col[-1] = above_row[-1]; } if (use_filter_intra) { highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, filter_intra_mode, bit_depth); return; } assert(is_dr_mode); int upsample_above = 0; int upsample_left = 0; if (!disable_edge_filter) { const int need_right = p_angle < 90; const int need_bottom = p_angle > 180; if (p_angle != 90 && p_angle != 180) { assert(need_above_left); const int ab_le = 1; if (need_above && need_left && (txwpx + txhpx >= 24)) { highbd_filter_intra_edge_corner(above_row, left_col); } if (need_above && n_top_px > 0) { const int strength = intra_edge_filter_strength( txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); } } upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); } upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); } } highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, upsample_left, p_angle, bit_depth); } // For HBD encode/decode, this function generates the pred data of a given // block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, // SMOOTH_V and PAETH). 
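// Mirrors build_non_directional_intra_predictors() above, but works on
// CONVERT_TO_SHORTPTR() buffers and uses base = 128 << (bit_depth - 8) in
// place of the fixed 8-bit defaults (127/128/129) for unavailable
// neighbouring samples.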
static void highbd_build_non_directional_intra_predictors( const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px, int bit_depth) { int i = 0; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; int need_left = extend_modes[mode] & NEED_LEFT; int need_above = extend_modes[mode] & NEED_ABOVE; int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const uint16_t *above_ref = ref - ref_stride; const uint16_t *left_ref = ref - 1; const int base = 128 << (bit_depth - 8); assert(n_top_px >= 0); assert(n_left_px >= 0); assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || mode == SMOOTH_H_PRED || mode == PAETH_PRED); if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { int val = 0; if (need_left) { val = (n_top_px > 0) ? above_ref[0] : base + 1; } else { val = (n_left_px > 0) ? left_ref[0] : base - 1; } for (i = 0; i < txhpx; ++i) { aom_memset16(dst, val, txwpx); dst += dst_stride; } return; } DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint16_t *const above_row = above_data + 16; uint16_t *const left_col = left_data + 16; if (need_left) { aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); if (n_left_px > 0) { for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i); } else if (n_top_px > 0) { aom_memset16(left_col, above_ref[0], txhpx); } } if (need_above) { aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); if (n_top_px > 0) { memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); i = n_top_px; if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i)); } else if (n_left_px > 0) { aom_memset16(above_row, left_ref[0], txwpx); } } if (need_above_left) { if (n_top_px > 0 && n_left_px > 0) { above_row[-1] = above_ref[-1]; } else if (n_top_px > 0) { above_row[-1] = above_ref[0]; } else if (n_left_px > 0) { above_row[-1] = left_ref[0]; } else { above_row[-1] = base; } left_col[-1] = above_row[-1]; } if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( dst, dst_stride, above_row, left_col, bit_depth); } else { pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, int subsampling_y) { assert(subsampling_x >= 0 && subsampling_x < 2); assert(subsampling_y >= 0 && subsampling_y < 2); BLOCK_SIZE bs = bsize; switch (bsize) { case BLOCK_4X4: if (subsampling_x == 1 && subsampling_y == 1) bs = BLOCK_8X8; else if (subsampling_x == 1) bs = BLOCK_8X4; else if (subsampling_y == 1) bs = BLOCK_4X8; break; case BLOCK_4X8: if (subsampling_x == 1 && subsampling_y == 1) bs = BLOCK_8X8; else if (subsampling_x == 1) bs = BLOCK_8X8; else if (subsampling_y == 1) bs = BLOCK_4X8; break; case BLOCK_8X4: if (subsampling_x == 1 && subsampling_y == 1) bs = BLOCK_8X8; else if (subsampling_x == 1) bs = BLOCK_8X4; else if (subsampling_y == 1) bs = BLOCK_8X8; break; case BLOCK_4X16: if (subsampling_x == 1 && subsampling_y == 1) bs = BLOCK_8X16; else if (subsampling_x == 1) bs = BLOCK_8X16; else if (subsampling_y == 1) bs = BLOCK_4X16; break; case BLOCK_16X4: if (subsampling_x == 1 && subsampling_y == 1) 
bs = BLOCK_16X8; else if (subsampling_x == 1) bs = BLOCK_16X4; else if (subsampling_y == 1) bs = BLOCK_16X8; break; default: break; } return bs; } void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, int enable_intra_edge_filter, int wpx, int hpx, TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; const int x = col_off << MI_SIZE_LOG2; const int y = row_off << MI_SIZE_LOG2; const int is_hbd = is_cur_buf_hbd(xd); assert(mode < INTRA_MODES); if (use_palette) { int r, c; const uint8_t *const map = xd->plane[plane != 0].color_index_map + xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; if (is_hbd) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; } } } else { for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { dst[r * dst_stride + c] = (uint8_t)palette[map[(r + y) * wpx + c + x]]; } } } return; } const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int have_top = row_off || (ss_y ? xd->chroma_up_available : xd->up_available); const int have_left = col_off || (ss_x ? xd->chroma_left_available : xd->left_available); // Distance between the right edge of this prediction block to // the frame right edge const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx; // Distance between the bottom edge of this prediction block to // the frame bottom edge const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx; const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; const int is_dr_mode = av1_is_directional_mode(mode); // The computations in this function, as well as in build_intra_predictors(), // are generalized for all intra modes. Some of these operations are not // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H, // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a // separate function build_non_directional_intra_predictors() is introduced // for these modes to avoid redundant computations while generating pred data. const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0; const int n_left_px = have_left ? 
AOMMIN(txhpx, yd + txhpx) : 0; if (!use_filter_intra && !is_dr_mode) { #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { highbd_build_non_directional_intra_predictors( ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px, xd->bd); return; } #endif // CONFIG_AV1_HIGHBITDEPTH build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px); return; } const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); const int right_available = mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; const int bottom_available = (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); const PARTITION_TYPE partition = mbmi->partition; BLOCK_SIZE bsize = mbmi->bsize; // force 4x4 chroma component block size. if (ss_x || ss_y) { bsize = scale_chroma_bsize(bsize, ss_x, ss_y); } int p_angle = 0; int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT; int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT; if (use_filter_intra) { need_top_right = 0; need_bottom_left = 0; } if (is_dr_mode) { p_angle = mode_to_angle_map[mode] + angle_delta; need_top_right = p_angle < 90; need_bottom_left = p_angle > 180; } // Possible states for have_top_right(TR) and have_bottom_left(BL) // -1 : TR and BL are not needed // 0 : TR and BL are needed but not available // > 0 : TR and BL are needed and pixels are available const int have_top_right = need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, row_off, col_off, ss_x, ss_y) : -1; const int have_bottom_left = need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col, bottom_available, have_left, partition, tx_size, row_off, col_off, ss_x, ss_y) : -1; const int disable_edge_filter = !enable_intra_edge_filter; const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); const int n_topright_px = have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right; const int n_bottomleft_px = have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left; #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { highbd_build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, n_bottomleft_px, intra_edge_filter_type, xd->bd); return; } #endif build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, n_bottomleft_px, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size) { const MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; const PREDICTION_MODE mode = (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; const FILTER_INTRA_MODE filter_intra_mode = (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) ? 
mbmi->filter_intra_mode_info.filter_intra_mode : FILTER_INTRA_MODES; const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; const SequenceHeader *seq_params = cm->seq_params; #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { #if CONFIG_DEBUG assert(is_cfl_allowed(xd)); const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); (void)plane_bsize; assert(plane_bsize < BLOCK_SIZES_ALL); if (!xd->lossless[mbmi->segment_id]) { assert(blk_col == 0); assert(blk_row == 0); assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); } #endif CFL_CTX *const cfl = &xd->cfl; CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); if (!cfl->dc_pred_is_cached[pred_plane]) { av1_predict_intra_block(xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); if (cfl->use_dc_pred_cache) { cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); cfl->dc_pred_is_cached[pred_plane] = true; } } else { cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); } av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane); return; } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); } void av1_init_intra_predictors(void) { aom_once(init_intra_predictors_internal); } aom-3.12.1/av1/common/reconintra.h000066400000000000000000000132261477627663500167010ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_RECONINTRA_H_ #define AOM_AV1_COMMON_RECONINTRA_H_ #include #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #ifdef __cplusplus extern "C" { #endif void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size); void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, int enable_intra_edge_filter, int wpx, int hpx, TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int col_off, int row_off, int plane); // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { DC_PRED, V_PRED, H_PRED, SMOOTH_PRED }; // Mapping of intra mode to the interintra mode static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = { II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED, II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED }; #define FILTER_INTRA_SCALE_BITS 4 static inline int av1_is_directional_mode(PREDICTION_MODE mode) { return mode >= V_PRED && mode <= D67_PRED; } static inline int av1_is_diagonal_mode(PREDICTION_MODE mode) { return mode >= D45_PRED && mode <= D67_PRED; } static inline int av1_use_angle_delta(BLOCK_SIZE bsize) { return bsize >= BLOCK_8X8; } static inline int av1_allow_intrabc(const AV1_COMMON *const cm) { return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && cm->features.allow_intrabc; } static inline int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, BLOCK_SIZE bs) { if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; } static inline int av1_filter_intra_allowed(const AV1_COMMON *const cm, const MB_MODE_INFO *mbmi) { return mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0 && av1_filter_intra_allowed_bsize(cm, mbmi->bsize); } extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; static const int16_t dr_intra_derivative[90] = { // More evenly spread out angles and limited to 10-bit // Values that are 0 will never be used // Approx angle 0, 0, 0, // 1023, 0, 0, // 3, ... 547, 0, 0, // 6, ... 372, 0, 0, 0, 0, // 9, ... 273, 0, 0, // 14, ... 215, 0, 0, // 17, ... 178, 0, 0, // 20, ... 151, 0, 0, // 23, ... (113 & 203 are base angles) 132, 0, 0, // 26, ... 116, 0, 0, // 29, ... 102, 0, 0, 0, // 32, ... 90, 0, 0, // 36, ... 80, 0, 0, // 39, ... 71, 0, 0, // 42, ... 64, 0, 0, // 45, ... (45 & 135 are base angles) 57, 0, 0, // 48, ... 51, 0, 0, // 51, ... 45, 0, 0, 0, // 54, ... 40, 0, 0, // 58, ... 35, 0, 0, // 61, ... 31, 0, 0, // 64, ... 27, 0, 0, // 67, ... (67 & 157 are base angles) 23, 0, 0, // 70, ... 19, 0, 0, // 73, ... 15, 0, 0, 0, 0, // 76, ... 11, 0, 0, // 81, ... 7, 0, 0, // 84, ... 3, 0, 0, // 87, ... }; // Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. // If angle > 0 && angle < 90, dx = -((int)(256 / t)); // If angle > 90 && angle < 180, dx = (int)(256 / t); // If angle > 180 && angle < 270, dx = 1; static inline int av1_get_dx(int angle) { if (angle > 0 && angle < 90) { return dr_intra_derivative[angle]; } else if (angle > 90 && angle < 180) { return dr_intra_derivative[180 - angle]; } else { // In this case, we are not really going to use dx. 
We may return any value. return 1; } } // Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. // If angle > 0 && angle < 90, dy = 1; // If angle > 90 && angle < 180, dy = (int)(256 * t); // If angle > 180 && angle < 270, dy = -((int)(256 * t)); static inline int av1_get_dy(int angle) { if (angle > 90 && angle < 180) { return dr_intra_derivative[angle - 90]; } else if (angle > 180 && angle < 270) { return dr_intra_derivative[270 - angle]; } else { // In this case, we are not really going to use dy. We may return any value. return 1; } } static inline int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, int type) { const int d = abs(delta); const int blk_wh = bs0 + bs1; if (d == 0 || d >= 40) return 0; return type ? (blk_wh <= 8) : (blk_wh <= 16); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_RECONINTRA_H_ aom-3.12.1/av1/common/resize.c000066400000000000000000001631041477627663500160320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_ports/mem.h" #include "av1/common/common.h" #include "av1/common/resize.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" // Filters for interpolation (0.5-band) - note this also filters integer pels. 
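// Each row below is one 8-tap kernel per subpel phase (1 << RS_SUBPEL_BITS
// entries). The taps of each kernel sum to 128 (e.g. for phase 0:
// -3 + 0 + 35 + 64 + 35 + 0 - 3 + 0 == 128), so filtering a flat (DC) region
// leaves it unchanged.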
static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 }, { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 }, { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 }, { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 }, { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 }, { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 }, { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 }, { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 }, { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 }, { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 }, { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 }, { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 }, { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 }, { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 }, { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 }, { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 }, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 }, { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 }, { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 }, { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 }, }; // Filters for interpolation (0.625-band) - note this also filters integer pels. 
static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 }, { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 }, { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 }, { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 }, { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 }, { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 }, { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 }, { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 }, { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 }, { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 }, { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 }, { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 }, { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 }, { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 }, { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 }, { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 }, { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 }, { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 }, { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 }, { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 }, { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 }, { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 }, { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 }, { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 }, { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 }, { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 }, { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 }, { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 }, { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 }, }; // Filters for interpolation (0.75-band) - note this also filters integer pels. 
static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 }, { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 }, { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 }, { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 }, { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 }, { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 }, { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 }, { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 }, { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 }, { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 }, { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 }, { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 }, { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 }, { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 }, { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 }, { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 }, { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 }, { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 }, { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 }, { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 }, { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 }, { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 }, { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 }, { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 }, { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 }, { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 }, { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 }, { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 }, { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 }, }; // Filters for interpolation (0.875-band) - note this also filters integer pels. 
static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 }, { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 }, { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 }, { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 }, { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 }, { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 }, { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 }, { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 }, { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 }, { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 }, { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 }, { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 }, { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 }, { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 }, { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 }, { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 }, { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 }, { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 }, { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 }, { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 }, { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 }, { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 }, { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 }, { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 }, { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 }, { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 }, { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 }, { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 }, { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, }; const int16_t av1_resize_filter_normative[( 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { #if UPSCALE_NORMATIVE_TAPS == 8 { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, 
-20, 81, 76, -20, 7, -1 }, { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, #else #error "Invalid value of UPSCALE_NORMATIVE_TAPS" #endif // UPSCALE_NORMATIVE_TAPS == 8 }; // Filters for interpolation (full-band) - no filtering for integer pixels #define filteredinterp_filters1000 av1_resize_filter_normative static const InterpKernel *choose_interp_filter(int in_length, int out_length) { int out_length16 = out_length * 16; if (out_length16 >= in_length * 16) return filteredinterp_filters1000; else if (out_length16 >= in_length * 13) return filteredinterp_filters875; else if (out_length16 >= in_length * 11) return filteredinterp_filters750; else if (out_length16 >= in_length * 9) return filteredinterp_filters625; else return filteredinterp_filters500; } static void interpolate_core(const uint8_t *const input, int in_length, uint8_t *output, int out_length, const int16_t *interp_filters, int interp_taps) { const int32_t delta = (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; const int32_t offset = in_length > out_length ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + out_length / 2) / out_length : -(((int32_t)(out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + out_length / 2) / out_length; uint8_t *optr = output; int x, x1, x2, sum, k, int_pel, sub_pel; int32_t y; x = 0; y = offset + RS_SCALE_EXTRA_OFF; while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { x++; y += delta; } x1 = x; x = out_length - 1; y = delta * x + offset + RS_SCALE_EXTRA_OFF; while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= in_length) { x--; y -= delta; } x2 = x; if (x1 > x2) { for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) { const int pk = int_pel - interp_taps / 2 + 1 + k; sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; } *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } } else { // Initial part. 
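// Illustrative sketch (reference only, not built): which filter bank choose_interp_filter() above selects for a few common length pairs, given the band thresholds of 16/16, 13/16, 11/16 and 9/16.
#if 0
static void choose_interp_filter_examples(void) {
  // 1920 -> 1440 (3/4): 1440 * 16 = 23040 >= 1920 * 11 = 21120, so the
  // 0.75-band table is used.
  assert(choose_interp_filter(1920, 1440) == filteredinterp_filters750);
  // 1920 -> 960 (1/2): 960 * 16 = 15360 < 1920 * 9 = 17280, so the
  // 0.5-band table is used.
  assert(choose_interp_filter(1920, 960) == filteredinterp_filters500);
  // Upscaling (out_length >= in_length) always uses the full-band
  // normative filter.
  assert(choose_interp_filter(960, 1920) == filteredinterp_filters1000);
}
#endif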
for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } // Middle part. for (; x <= x2; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } // End part. for (; x < out_length; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } } } static void interpolate(const uint8_t *const input, int in_length, uint8_t *output, int out_length) { const InterpKernel *interp_filters = choose_interp_filter(in_length, out_length); interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], SUBPEL_TAPS); } int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } static int32_t get_upscale_convolve_x0(int in_length, int out_length, int32_t x_step_qn) { const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); const int32_t x0 = (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + out_length / 2) / out_length + RS_SCALE_EXTRA_OFF - err / 2; return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); } void down2_symeven(const uint8_t *const input, int length, uint8_t *output, int start_offset) { // Actual filter len = 2 * filter_len_half. const int16_t *filter = av1_down2_symeven_half_filter; const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; int i, j; uint8_t *optr = output; int l1 = filter_len_half; int l2 = (length - filter_len_half); l1 += (l1 & 1); l2 += (l2 & 1); if (l1 > l2) { // Short input length. for (i = start_offset; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } } else { // Initial part. for (i = start_offset; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } // Middle part. for (; i < l2; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[i - j] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } // End part. for (; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } } } static void down2_symodd(const uint8_t *const input, int length, uint8_t *output) { // Actual filter len = 2 * filter_len_half - 1. 
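// Illustrative sketch (reference only, not built): both down2 half-filters describe symmetric kernels whose taps sum to 1 << FILTER_BITS, so a flat input is reproduced exactly after the rounding and shift used in these functions.
#if 0
static void down2_filter_gain_check(void) {
  // Even-length kernel: every tap of the half filter is mirrored, so the DC
  // gain is 2 * (56 + 12 - 3 - 1) = 128.
  int even_gain = 0;
  for (int j = 0; j < 4; ++j) even_gain += 2 * av1_down2_symeven_half_filter[j];
  assert(even_gain == (1 << FILTER_BITS));
  // Odd-length kernel: the centre tap is used once, the others twice, so the
  // DC gain is 64 + 2 * (35 + 0 - 3) = 128.
  int odd_gain = av1_down2_symodd_half_filter[0];
  for (int j = 1; j < 4; ++j) odd_gain += 2 * av1_down2_symodd_half_filter[j];
  assert(odd_gain == (1 << FILTER_BITS));
}
#endif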
const int16_t *filter = av1_down2_symodd_half_filter; const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; int i, j; uint8_t *optr = output; int l1 = filter_len_half - 1; int l2 = (length - filter_len_half + 1); l1 += (l1 & 1); l2 += (l2 & 1); if (l1 > l2) { // Short input length. for (i = 0; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[(i - j < 0 ? 0 : i - j)] + input[(i + j >= length ? length - 1 : i + j)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } } else { // Initial part. for (i = 0; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } // Middle part. for (; i < l2; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[i - j] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } // End part. for (; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel(sum); } } } static int get_down2_length(int length, int steps) { for (int s = 0; s < steps; ++s) length = (length + 1) >> 1; return length; } static int get_down2_steps(int in_length, int out_length) { int steps = 0; int proj_in_length; while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; if (in_length == 1) { // Special case: we break because any further calls to get_down2_length() // with be with length == 1, which return 1, resulting in an infinite // loop. break; } } return steps; } static void resize_multistep(const uint8_t *const input, int length, uint8_t *output, int olength, uint8_t *otmp) { if (length == olength) { memcpy(output, input, sizeof(output[0]) * length); return; } const int steps = get_down2_steps(length, olength); if (steps > 0) { uint8_t *out = NULL; int filteredlength = length; assert(otmp != NULL); uint8_t *otmp2 = otmp + get_down2_length(length, 1); for (int s = 0; s < steps; ++s) { const int proj_filteredlength = get_down2_length(filteredlength, 1); const uint8_t *const in = (s == 0 ? input : out); if (s == steps - 1 && proj_filteredlength == olength) out = output; else out = (s & 1 ? 
otmp2 : otmp); if (filteredlength & 1) down2_symodd(in, filteredlength, out); else down2_symeven(in, filteredlength, out, 0); filteredlength = proj_filteredlength; } if (filteredlength != olength) { interpolate(out, filteredlength, output, olength); } } else { interpolate(input, length, output, olength); } } static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; uint8_t *aptr = arr; for (i = 0; i < len; ++i, iptr += stride) { *aptr++ = *iptr; } } static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; uint8_t *aptr = arr; for (i = 0; i < len; ++i, iptr += stride) { *iptr = *aptr++; } } bool av1_resize_vert_dir_c(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int width2, int start_col) { bool mem_status = true; uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(*arrbuf) * height); uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(*arrbuf2) * height2); if (arrbuf == NULL || arrbuf2 == NULL) { mem_status = false; goto Error; } for (int i = start_col; i < width2; ++i) { fill_col_to_arr(intbuf + i, width2, height, arrbuf); down2_symeven(arrbuf, height, arrbuf2, 0); fill_arr_to_col(output + i, out_stride, height2, arrbuf2); } Error: aom_free(arrbuf); aom_free(arrbuf2); return mem_status; } void av1_resize_horz_dir_c(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { for (int i = 0; i < height; ++i) down2_symeven(input + in_stride * i, filtered_length, intbuf + width2 * i, 0); } bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(*intbuf) * width2 * height); if (intbuf == NULL) { return false; } // Resize in the horizontal direction av1_resize_horz_dir(input, in_stride, intbuf, height, width, width2); // Resize in the vertical direction bool mem_status = av1_resize_vert_dir(intbuf, output, out_stride, height, height2, width2, 0 /*start_col*/); aom_free(intbuf); return mem_status; } // Check if both the output width and height are half of input width and // height respectively. 
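// Illustrative sketch (reference only, not built): when the dedicated half-resolution path applies, given get_down2_length()'s rounding of (length + 1) >> 1.
#if 0
static void half_resize_examples(void) {
  // 1920x1080 -> 960x540: both dimensions equal (dim + 1) >> 1.
  assert(should_resize_by_half(1080, 1920, 540, 960));
  // An odd source width rounds up, so 1919 -> 960 also qualifies.
  assert(should_resize_by_half(1080, 1919, 540, 960));
  // 1920x1080 -> 960x544 does not qualify and goes through the generic
  // av1_resize_plane() path instead.
  assert(!should_resize_by_half(1080, 1920, 544, 960));
}
#endif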
bool should_resize_by_half(int height, int width, int height2, int width2) { const bool is_width_by_2 = get_down2_length(width, 1) == width2; const bool is_height_by_2 = get_down2_length(height, 1) == height2; return (is_width_by_2 && is_height_by_2); } bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; bool mem_status = true; uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); uint8_t *tmpbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height)); uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) { mem_status = false; goto Error; } assert(width > 0); assert(height > 0); assert(width2 > 0); assert(height2 > 0); for (i = 0; i < height; ++i) resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, tmpbuf); for (i = 0; i < width2; ++i) { fill_col_to_arr(intbuf + i, width2, height, arrbuf); resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); fill_arr_to_col(output + i, out_stride, height2, arrbuf2); } Error: aom_free(intbuf); aom_free(tmpbuf); aom_free(arrbuf); aom_free(arrbuf2); return mem_status; } static bool upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int x_step_qn, int x0_qn, int pad_left, int pad_right) { assert(width > 0); assert(height > 0); assert(width2 > 0); assert(height2 > 0); assert(height2 == height); // Extend the left/right pixels of the tile column if needed // (either because we can't sample from other tiles, or because we're at // a frame edge). // Save the overwritten pixels into tmp_left and tmp_right. // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra // column of border pixels compared to what we'd naively think. 
const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; uint8_t *tmp_left = NULL; // Silence spurious "may be used uninitialized" warnings uint8_t *tmp_right = NULL; uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' uint8_t *const in_tr = (uint8_t *)(input + width); if (pad_left) { tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); if (!tmp_left) return false; for (int i = 0; i < height; i++) { memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); memset(in_tl + i * in_stride, input[i * in_stride], border_cols); } } if (pad_right) { tmp_right = (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); if (!tmp_right) { aom_free(tmp_left); return false; } for (int i = 0; i < height; i++) { memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); memset(in_tr + i * in_stride, input[i * in_stride + width - 1], border_cols); } } av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, height2, &av1_resize_filter_normative[0][0], x0_qn, x_step_qn); // Restore the left/right border pixels if (pad_left) { for (int i = 0; i < height; i++) { memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); } aom_free(tmp_left); } if (pad_right) { for (int i = 0; i < height; i++) { memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); } aom_free(tmp_right); } return true; } #if CONFIG_AV1_HIGHBITDEPTH static void highbd_interpolate_core(const uint16_t *const input, int in_length, uint16_t *output, int out_length, int bd, const int16_t *interp_filters, int interp_taps) { const int32_t delta = (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; const int32_t offset = in_length > out_length ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + out_length / 2) / out_length : -(((int32_t)(out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + out_length / 2) / out_length; uint16_t *optr = output; int x, x1, x2, sum, k, int_pel, sub_pel; int32_t y; x = 0; y = offset + RS_SCALE_EXTRA_OFF; while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { x++; y += delta; } x1 = x; x = out_length - 1; y = delta * x + offset + RS_SCALE_EXTRA_OFF; while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= in_length) { x--; y -= delta; } x2 = x; if (x1 > x2) { for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) { const int pk = int_pel - interp_taps / 2 + 1 + k; sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; } *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } else { // Initial part. for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // Middle part. 
for (; x <= x2; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // End part. for (; x < out_length; ++x, y += delta) { int_pel = y >> RS_SCALE_SUBPEL_BITS; sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; const int16_t *filter = &interp_filters[sub_pel * interp_taps]; sum = 0; for (k = 0; k < interp_taps; ++k) sum += filter[k] * input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } } } static void highbd_interpolate(const uint16_t *const input, int in_length, uint16_t *output, int out_length, int bd) { const InterpKernel *interp_filters = choose_interp_filter(in_length, out_length); highbd_interpolate_core(input, in_length, output, out_length, bd, &interp_filters[0][0], SUBPEL_TAPS); } static void highbd_down2_symeven(const uint16_t *const input, int length, uint16_t *output, int bd) { // Actual filter len = 2 * filter_len_half. static const int16_t *filter = av1_down2_symeven_half_filter; const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; int i, j; uint16_t *optr = output; int l1 = filter_len_half; int l2 = (length - filter_len_half); l1 += (l1 & 1); l2 += (l2 & 1); if (l1 > l2) { // Short input length. for (i = 0; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } } else { // Initial part. for (i = 0; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } // Middle part. for (; i < l2; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[i - j] + input[i + 1 + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } // End part. for (; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)); for (j = 0; j < filter_len_half; ++j) { sum += (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } } } static void highbd_down2_symodd(const uint16_t *const input, int length, uint16_t *output, int bd) { // Actual filter len = 2 * filter_len_half - 1. static const int16_t *filter = av1_down2_symodd_half_filter; const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; int i, j; uint16_t *optr = output; int l1 = filter_len_half - 1; int l2 = (length - filter_len_half + 1); l1 += (l1 & 1); l2 += (l2 & 1); if (l1 > l2) { // Short input length. for (i = 0; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } } else { // Initial part. 
for (i = 0; i < l1; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } // Middle part. for (; i < l2; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[i - j] + input[i + j]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } // End part. for (; i < length; i += 2) { int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; for (j = 1; j < filter_len_half; ++j) { sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j]; } sum >>= FILTER_BITS; *optr++ = clip_pixel_highbd(sum, bd); } } } static void highbd_resize_multistep(const uint16_t *const input, int length, uint16_t *output, int olength, uint16_t *otmp, int bd) { if (length == olength) { memcpy(output, input, sizeof(output[0]) * length); return; } const int steps = get_down2_steps(length, olength); if (steps > 0) { uint16_t *out = NULL; int filteredlength = length; assert(otmp != NULL); uint16_t *otmp2 = otmp + get_down2_length(length, 1); for (int s = 0; s < steps; ++s) { const int proj_filteredlength = get_down2_length(filteredlength, 1); const uint16_t *const in = (s == 0 ? input : out); if (s == steps - 1 && proj_filteredlength == olength) out = output; else out = (s & 1 ? otmp2 : otmp); if (filteredlength & 1) highbd_down2_symodd(in, filteredlength, out, bd); else highbd_down2_symeven(in, filteredlength, out, bd); filteredlength = proj_filteredlength; } if (filteredlength != olength) { highbd_interpolate(out, filteredlength, output, olength, bd); } } else { highbd_interpolate(input, length, output, olength, bd); } } static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, uint16_t *arr) { int i; uint16_t *iptr = img; uint16_t *aptr = arr; for (i = 0; i < len; ++i, iptr += stride) { *aptr++ = *iptr; } } static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, uint16_t *arr) { int i; uint16_t *iptr = img; uint16_t *aptr = arr; for (i = 0; i < len; ++i, iptr += stride) { *iptr = *aptr++; } } static void highbd_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int bd) { int i; uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); uint16_t *tmpbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height)); uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); } for (i = 0; i < width2; ++i) { highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, arrbuf2); } Error: aom_free(intbuf); aom_free(tmpbuf); aom_free(arrbuf); aom_free(arrbuf2); } static bool highbd_upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int x_step_qn, int x0_qn, int pad_left, int pad_right, int bd) { assert(width > 0); assert(height > 0); assert(width2 > 0); assert(height2 > 0); 
assert(height2 == height); // Extend the left/right pixels of the tile column if needed // (either because we can't sample from other tiles, or because we're at // a frame edge). // Save the overwritten pixels into tmp_left and tmp_right. // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra // column of border pixels compared to what we'd naively think. const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; const int border_size = border_cols * sizeof(uint16_t); uint16_t *tmp_left = NULL; // Silence spurious "may be used uninitialized" warnings uint16_t *tmp_right = NULL; uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); uint16_t *const in_tl = input16 - border_cols; uint16_t *const in_tr = input16 + width; if (pad_left) { tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); if (!tmp_left) return false; for (int i = 0; i < height; i++) { memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); } } if (pad_right) { tmp_right = (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); if (!tmp_right) { aom_free(tmp_left); return false; } for (int i = 0; i < height; i++) { memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], border_cols); } } av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, CONVERT_TO_SHORTPTR(output), out_stride, width2, height2, &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd); // Restore the left/right border pixels if (pad_left) { for (int i = 0; i < height; i++) { memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); } aom_free(tmp_left); } if (pad_right) { for (int i = 0; i < height; i++) { memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); } aom_free(tmp_right); } return true; } #endif // CONFIG_AV1_HIGHBITDEPTH void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase_scaler, const int num_planes) { assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || filter == EIGHTTAP_REGULAR); const InterpKernel *const kernel = (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr; for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; const int src_w = src->crop_widths[is_uv]; const int src_h = src->crop_heights[is_uv]; const uint8_t *src_buffer = src->buffers[i]; const int src_stride = src->strides[is_uv]; const int dst_w = dst->crop_widths[is_uv]; const int dst_h = dst->crop_heights[is_uv]; uint8_t *dst_buffer = dst->buffers[i]; const int dst_stride = dst->strides[is_uv]; for (int y = 0; y < dst_h; y += 16) { const int y_q4 = src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler; for (int x = 0; x < dst_w; x += 16) { const int x_q4 = src_w == dst_w ? 0 : x * 16 * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w; uint8_t *dst_ptr = dst_buffer + y * dst_stride + x; // Width and height of the actual working area. const int work_w = AOMMIN(16, dst_w - x); const int work_h = AOMMIN(16, dst_h - y); // SIMD versions of aom_scaled_2d() have some trouble handling // nonstandard sizes, so fall back on the C version to handle borders. 
if (work_w != 16 || work_h != 16) { aom_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, 16 * src_h / dst_h, work_w, work_h); } else { aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, 16 * src_h / dst_h, 16, 16); } } } } aom_extend_frame_borders(dst, num_planes); } bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, int num_planes) { // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; #if CONFIG_AV1_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], src->crop_widths[is_uv], src->strides[is_uv], dst->buffers[i], dst->crop_heights[is_uv], dst->crop_widths[is_uv], dst->strides[is_uv], bd); } else if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], src->crop_widths[is_uv], src->strides[is_uv], dst->buffers[i], dst->crop_heights[is_uv], dst->crop_widths[is_uv], dst->strides[is_uv])) { return false; } #else (void)bd; if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], src->crop_widths[is_uv], src->strides[is_uv], dst->buffers[i], dst->crop_heights[is_uv], dst->crop_widths[is_uv], dst->strides[is_uv])) return false; #endif } aom_extend_frame_borders(dst, num_planes); return true; } void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int plane, int rows) { const int is_uv = (plane > 0); const int ss_x = is_uv && cm->seq_params->subsampling_x; const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); const int upscaled_plane_width = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); const int superres_denom = cm->superres_scale_denominator; TileInfo tile_col; const int32_t x_step_qn = av1_get_upscale_convolve_step( downscaled_plane_width, upscaled_plane_width); int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, upscaled_plane_width, x_step_qn); for (int j = 0; j < cm->tiles.cols; j++) { av1_tile_set_col(&tile_col, cm, j); // Determine the limits of this tile column in both the source // and destination images. // Note: The actual location which we start sampling from is // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); const int src_width = downscaled_x1 - downscaled_x0; const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; int upscaled_x1; if (j == cm->tiles.cols - 1) { // Note that we can't just use AOMMIN here - due to rounding, // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than // upscaled_plane_width. 
upscaled_x1 = upscaled_plane_width; } else { upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; } const uint8_t *const src_ptr = src + downscaled_x0; uint8_t *const dst_ptr = dst + upscaled_x0; const int dst_width = upscaled_x1 - upscaled_x0; const int pad_left = (j == 0); const int pad_right = (j == cm->tiles.cols - 1); bool success; #if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params->use_highbitdepth) success = highbd_upscale_normative_rect( src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right, cm->seq_params->bit_depth); else success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right); #else success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right); #endif if (!success) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error upscaling frame"); } // Update the fractional pixel offset to prepare for the next tile column. x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); } } static void upscale_normative_and_extend_frame(const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { const int num_planes = av1_num_planes(cm); for (int i = 0; i < num_planes; ++i) { const int is_uv = (i > 0); av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], i, src->crop_heights[is_uv]); } aom_extend_frame_borders(dst, num_planes); } YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid) { // If scaling is performed for the sole purpose of calculating PSNR, then our // target dimensions are superres upscaled width/height. Otherwise our target // dimensions are coded width/height. const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width; const int scaled_height = for_psnr ? cm->superres_upscaled_height : cm->height; const bool scaling_required = (scaled_width != unscaled->y_crop_width) || (scaled_height != unscaled->y_crop_height); if (scaling_required) { const int num_planes = av1_num_planes(cm); const SequenceHeader *seq_params = cm->seq_params; // Reallocate the frame buffer based on the target dimensions when scaling // is required. 
if (aom_realloc_frame_buffer( scaled, scaled_width, scaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); bool has_optimized_scaler = av1_has_optimized_scaler( unscaled->y_crop_width, unscaled->y_crop_height, scaled_width, scaled_height); if (num_planes > 1) { has_optimized_scaler = has_optimized_scaler && av1_has_optimized_scaler(unscaled->uv_crop_width, unscaled->uv_crop_height, scaled->uv_crop_width, scaled->uv_crop_height); } #if CONFIG_AV1_HIGHBITDEPTH if (use_optimized_scaler && has_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { if (!av1_resize_and_extend_frame_nonnormative( unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffers during resize"); } #else if (use_optimized_scaler && has_optimized_scaler) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { if (!av1_resize_and_extend_frame_nonnormative( unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffers during resize"); } #endif if (unscaled->metadata && aom_copy_metadata_to_frame_buffer(scaled, unscaled->metadata)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to copy source metadata to scaled frame"); } return scaled; } return unscaled; } // Calculates the scaled dimension given the original dimension and the scale // denominator. static void calculate_scaled_size_helper(int *dim, int denom) { if (denom != SCALE_NUMERATOR) { // We need to ensure the constraint in "Appendix A" of the spec: // * FrameWidth is greater than or equal to 16 // * FrameHeight is greater than or equal to 16 // For this, we clamp the downscaled dimension to at least 16. One // exception: if original dimension itself was < 16, then we keep the // downscaled dimension to be same as the original, to ensure that resizing // is valid. const int min_dim = AOMMIN(16, *dim); // Use this version if we need *dim to be even // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); // *width <<= 1; *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); *dim = AOMMAX(*dim, min_dim); } } void av1_calculate_scaled_size(int *width, int *height, int resize_denom) { calculate_scaled_size_helper(width, resize_denom); calculate_scaled_size_helper(height, resize_denom); } void av1_calculate_scaled_superres_size(int *width, int *height, int superres_denom) { (void)height; calculate_scaled_size_helper(width, superres_denom); } // Copy only the config data from 'src' to 'dst'. static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const dst) { dst->bit_depth = src->bit_depth; dst->color_primaries = src->color_primaries; dst->transfer_characteristics = src->transfer_characteristics; dst->matrix_coefficients = src->matrix_coefficients; dst->monochrome = src->monochrome; dst->chroma_sample_position = src->chroma_sample_position; dst->color_range = src->color_range; } // TODO(afergs): Look for in-place upscaling // TODO(afergs): aom_ vs av1_ functions? Which can I use? // Upscale decoded image. 
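// Illustrative sketch (reference only, not built): worked examples of the round-to-nearest scaling in calculate_scaled_size_helper() above, assuming SCALE_NUMERATOR == 8.
#if 0
static void scaled_size_examples(void) {
  int w;
  w = 1920;
  calculate_scaled_size_helper(&w, 12);  // (1920 * 8 + 6) / 12 = 1280
  assert(w == 1280);
  w = 1920;
  calculate_scaled_size_helper(&w, 16);  // (1920 * 8 + 8) / 16 = 960
  assert(w == 960);
  w = 20;
  calculate_scaled_size_helper(&w, 16);  // would give 10, clamped up to 16
  assert(w == 16);
}
#endif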
void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; const SequenceHeader *const seq_params = cm->seq_params; const int byte_alignment = cm->features.byte_alignment; YV12_BUFFER_CONFIG copy_buffer; memset(&copy_buffer, 0, sizeof(copy_buffer)); YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); if (aom_alloc_frame_buffer( &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); // Copy function assumes the frames are the same size. // Note that it does not copy YV12_BUFFER_CONFIG config data. aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes); assert(copy_buffer.y_crop_width == aligned_width); assert(copy_buffer.y_crop_height == cm->height); // Realloc the current frame buffer at a higher resolution in place. if (pool != NULL) { // Use callbacks if on the decoder. aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer; aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb; aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb; void *cb_priv = pool->cb_priv; lock_buffer_pool(pool); // Realloc with callback does not release the frame buffer - release first. if (release_fb_cb(cb_priv, fb)) { unlock_buffer_pool(pool); aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); } // aom_realloc_frame_buffer() leaves config data for frame_to_show intact if (aom_realloc_frame_buffer( frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, alloc_pyramid, 0)) { unlock_buffer_pool(pool); aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); } unlock_buffer_pool(pool); } else { // Make a copy of the config data for frame_to_show in copy_buffer copy_buffer_config(frame_to_show, &copy_buffer); // Don't use callbacks on the encoder. // aom_alloc_frame_buffer() clears the config data for frame_to_show if (aom_alloc_frame_buffer( frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment, alloc_pyramid, 0)) aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); // Restore config data back to frame_to_show copy_buffer_config(&copy_buffer, frame_to_show); } // TODO(afergs): verify frame_to_show is correct after realloc // encoder: // decoder: assert(frame_to_show->y_crop_width == cm->superres_upscaled_width); assert(frame_to_show->y_crop_height == cm->superres_upscaled_height); // Scale up and back into frame_to_show. assert(frame_to_show->y_crop_width != cm->width); upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show); // Free the copy buffer aom_free_frame_buffer(&copy_buffer); } aom-3.12.1/av1/common/resize.h000066400000000000000000000114721477627663500160370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_RESIZE_H_ #define AOM_AV1_COMMON_RESIZE_H_ #include #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #ifdef __cplusplus extern "C" { #endif // Filters for factor of 2 downsampling. static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int plane, int rows); YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid); bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, int num_planes); // Calculates the scaled dimensions from the given original dimensions and the // resize scale denominator. void av1_calculate_scaled_size(int *width, int *height, int resize_denom); // Similar to above, but calculates scaled dimensions after superres from the // given original dimensions and superres scale denominator. void av1_calculate_scaled_superres_size(int *width, int *height, int superres_denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, bool alloc_pyramid); bool av1_resize_plane_to_half(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); void down2_symeven(const uint8_t *const input, int length, uint8_t *output, int start_offset); bool should_resize_by_half(int height, int width, int height2, int width2); // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static inline int av1_superres_scaled(const AV1_COMMON *cm) { // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling // required even though cm->superres_scale_denominator != SCALE_NUMERATOR. // So, the following check is more accurate. return (cm->width != cm->superres_upscaled_width); } // The optimized scaler av1_resize_and_extend_frame() can only handle scaling // ratios >= 1/4 and <= 16. See comment in aom_scaled_2d_c() for detail. // Visual assessment shows that if the scaling ratio or its reciprocal is not a // multiple of 1/16, there are some artifacts in the output of the optimized // scaler, especially on lines, due to non-exact ratio representation. SSSE3 // and NEON have a specialized 3/4 version of av1_resize_and_extend_frame() // that does not have this issue. // // Use the non-normative scaler av1_resize_and_extend_frame_nonnormative() // for other scaling ratios. 
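// Illustrative sketch (reference only, not built): a few scaling ratios and the result expected from av1_has_optimized_scaler() below.
#if 0
static void optimized_scaler_examples(void) {
  // 2:1 downscale; both ratios are multiples of 1/16, so the optimized
  // scaler can be used.
  assert(av1_has_optimized_scaler(1920, 1080, 960, 540));
  // 1920x1080 -> 1280x720: 16 * 1280 is not a multiple of 1920 and this is
  // not the special-cased 3/4 ratio, so the non-normative C path is used.
  assert(!av1_has_optimized_scaler(1920, 1080, 1280, 720));
#if HAVE_SSSE3 || HAVE_NEON
  // The 3/4 ratio is accepted by the specialized SSSE3/NEON version.
  assert(av1_has_optimized_scaler(1920, 1080, 1440, 810));
#endif
}
#endif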
static inline bool av1_has_optimized_scaler(const int src_width, const int src_height, const int dst_width, const int dst_height) { bool has_optimized_scaler = (dst_width * 4 >= src_width && dst_height * 4 >= src_height) && (dst_width <= src_width * 16 && dst_height <= src_height * 16) && (16 * dst_width % src_width == 0) && (16 * src_width % dst_width == 0) && (16 * dst_height % src_height == 0) && (16 * src_height % dst_height == 0); #if HAVE_SSSE3 || HAVE_NEON has_optimized_scaler = has_optimized_scaler || (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); #endif return has_optimized_scaler; } #define UPSCALE_NORMATIVE_TAPS 8 extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS] [UPSCALE_NORMATIVE_TAPS]; int32_t av1_get_upscale_convolve_step(int in_length, int out_length); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_RESIZE_H_ aom-3.12.1/av1/common/restoration.c000066400000000000000000001715631477627663500171120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. * */ #include #include #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/internal/aom_codec_internal.h" #include "aom_mem/aom_mem.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/convolve.h" #include "av1/common/enums.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/thread_common.h" // The 's' values are calculated based on original 'r' and 'e' values in the // spec using GenSgrprojVtable(). // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = { { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } }, { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } }, { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } }, { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } }, { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } }, }; void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w, int *plane_h) { int ss_x = is_uv && cm->seq_params->subsampling_x; int ss_y = is_uv && cm->seq_params->subsampling_y; *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y); } // Count horizontal or vertical units in a plane (use a width or height for // plane_size, respectively). We basically want to divide the plane size by the // size of a restoration unit. Rather than rounding up unconditionally as you // might expect, we round to nearest, which models the way a right or bottom // restoration unit can extend to up to 150% its normal width or height. // // The max with 1 is to deal with small frames, which may be smaller than // half of an LR unit in size. 
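// Illustrative sketch (reference only, not built): with a 256-pixel unit size, a 1150-pixel plane maps to (1150 + 128) / 256 = 4 units, so the last unit spans 1150 - 3 * 256 = 382 pixels (roughly 1.5x the nominal size), while a 1160-pixel plane rounds up to 5 units.
#if 0
static void lr_count_units_examples(void) {
  assert(av1_lr_count_units(256, 1150) == 4);
  assert(av1_lr_count_units(256, 1160) == 5);
  // Very small planes still get one unit.
  assert(av1_lr_count_units(256, 64) == 1);
}
#endif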
int av1_lr_count_units(int unit_size, int plane_size) { return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1); } void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, int is_uv) { int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); const int unit_size = rsi->restoration_unit_size; const int horz_units = av1_lr_count_units(unit_size, plane_w); const int vert_units = av1_lr_count_units(unit_size, plane_h); rsi->num_rest_units = horz_units * vert_units; rsi->horz_units = horz_units; rsi->vert_units = vert_units; aom_free(rsi->unit_info); CHECK_MEM_ERROR(cm, rsi->unit_info, (RestorationUnitInfo *)aom_memalign( 16, sizeof(*rsi->unit_info) * rsi->num_rest_units)); } void av1_free_restoration_struct(RestorationInfo *rst_info) { aom_free(rst_info->unit_info); rst_info->unit_info = NULL; } #if 0 // Pair of values for each sgrproj parameter: // Index 0 corresponds to r[0], e[0] // Index 1 corresponds to r[1], e[1] int sgrproj_mtable[SGRPROJ_PARAMS][2]; static void GenSgrprojVtable(void) { for (int i = 0; i < SGRPROJ_PARAMS; ++i) { const sgr_params_type *const params = &av1_sgr_params[i]; for (int j = 0; j < 2; ++j) { const int e = params->e[j]; const int r = params->r[j]; if (r == 0) { // filter is disabled sgrproj_mtable[i][j] = -1; // mark invalid } else { // filter is enabled const int n = (2 * r + 1) * (2 * r + 1); const int n2e = n * n * e; assert(n2e != 0); sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); } } } } #endif void av1_loop_restoration_precal(void) { #if 0 GenSgrprojVtable(); #endif } static void extend_frame_lowbd(uint8_t *data, int width, int height, ptrdiff_t stride, int border_horz, int border_vert) { uint8_t *data_p; int i; for (i = 0; i < height; ++i) { data_p = data + i * stride; memset(data_p - border_horz, data_p[0], border_horz); memset(data_p + width, data_p[width - 1], border_horz); } data_p = data - border_horz; for (i = -border_vert; i < 0; ++i) { memcpy(data_p + i * stride, data_p, width + 2 * border_horz); } for (i = height; i < height + border_vert; ++i) { memcpy(data_p + i * stride, data_p + (height - 1) * stride, width + 2 * border_horz); } } #if CONFIG_AV1_HIGHBITDEPTH static void extend_frame_highbd(uint16_t *data, int width, int height, ptrdiff_t stride, int border_horz, int border_vert) { uint16_t *data_p; int i, j; for (i = 0; i < height; ++i) { data_p = data + i * stride; for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; } data_p = data - border_horz; for (i = -border_vert; i < 0; ++i) { memcpy(data_p + i * stride, data_p, (width + 2 * border_horz) * sizeof(uint16_t)); } for (i = height; i < height + border_vert; ++i) { memcpy(data_p + i * stride, data_p + (height - 1) * stride, (width + 2 * border_horz) * sizeof(uint16_t)); } } static void copy_rest_unit_highbd(int width, int height, const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride) { for (int i = 0; i < height; ++i) memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); } #endif void av1_extend_frame(uint8_t *data, int width, int height, int stride, int border_horz, int border_vert, int highbd) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, border_horz, border_vert); return; } #endif (void)highbd; extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); } static void copy_rest_unit_lowbd(int width, int height, const uint8_t 
*src, int src_stride, uint8_t *dst, int dst_stride) { for (int i = 0; i < height; ++i) memcpy(dst + i * dst_stride, src + i * src_stride, width); } static void copy_rest_unit(int width, int height, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int highbd) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(dst), dst_stride); return; } #endif (void)highbd; copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride); } #define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) // With striped loop restoration, the filtering for each 64-pixel stripe gets // most of its input from the output of CDEF (stored in data8), but we need to // fill out a border of 3 pixels above/below the stripe according to the // following rules: // // * At the top and bottom of the frame, we copy the outermost row of CDEF // pixels three times. This extension is done by a call to av1_extend_frame() // at the start of the loop restoration process, so the value of // copy_above/copy_below doesn't strictly matter. // // * All other boundaries are stripe boundaries within the frame. In that case, // we take 2 rows of deblocked pixels and extend them to 3 rows of context. static void get_stripe_boundary_info(const RestorationTileLimits *limits, int plane_w, int plane_h, int ss_y, int *copy_above, int *copy_below) { (void)plane_w; *copy_above = 1; *copy_below = 1; const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; const int first_stripe_in_plane = (limits->v_start == 0); const int this_stripe_height = full_stripe_height - (first_stripe_in_plane ? runit_offset : 0); const int last_stripe_in_plane = (limits->v_start + this_stripe_height >= plane_h); if (first_stripe_in_plane) *copy_above = 0; if (last_stripe_in_plane) *copy_below = 0; } // Overwrite the border pixels around a processing stripe so that the conditions // listed above get_stripe_boundary_info() are preserved. // We save the pixels which get overwritten into a temporary buffer, so that // they can be restored by restore_processing_stripe_boundary() after we've // processed the stripe. // // limits gives the rectangular limits of the remaining stripes for the current // restoration unit. rsb is the stored stripe boundaries (taken from either // deblock or CDEF output as necessary). static void setup_processing_stripe_boundary( const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { // Offsets within the line buffers. The buffer logically starts at column // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) // has column x0 in the buffer. const int buf_stride = rsb->stripe_boundary_stride; const int buf_x0_off = limits->h_start; const int line_width = (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; const int line_size = line_width << use_highbd; const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; // Replace RESTORATION_BORDER pixels above the top of the stripe // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by // duplicating the topmost of the 2 lines (see the AOMMAX call when // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). 
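// Illustrative sketch (reference only, not built): the mapping from border row i to saved context row described above, with RESTORATION_BORDER == 3 and RESTORATION_CTX_VERT == 2.
#if 0
static void stripe_context_row_examples(void) {
  // Above the stripe: i = -3, -2, -1 read context rows 0, 0, 1.
  for (int i = -RESTORATION_BORDER; i < 0; ++i) {
    const int src_row = AOMMAX(i + RESTORATION_CTX_VERT, 0);
    assert(src_row == (i == -1 ? 1 : 0));
  }
  // Below the stripe: i = 0, 1, 2 read context rows 0, 1, 1.
  for (int i = 0; i < RESTORATION_BORDER; ++i) {
    const int src_row = AOMMIN(i, RESTORATION_CTX_VERT - 1);
    assert(src_row == (i == 0 ? 0 : 1));
  }
}
#endif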
if (!opt) { if (copy_above) { uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; for (int i = -RESTORATION_BORDER; i < 0; ++i) { const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); const int buf_off = buf_x0_off + buf_row * buf_stride; const uint8_t *buf = rsb->stripe_boundary_above + (buf_off << use_highbd); uint8_t *dst8 = data8_tl + i * data_stride; // Save old pixels, then replace with data from stripe_boundary_above memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], REAL_PTR(use_highbd, dst8), line_size); memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); } } // Replace RESTORATION_BORDER pixels below the bottom of the stripe. // The second buffer row is repeated, so src_row gets the values 0, 1, 1 // for i = 0, 1, 2. if (copy_below) { const int stripe_end = limits->v_start + h; uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; for (int i = 0; i < RESTORATION_BORDER; ++i) { const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); const int buf_off = buf_x0_off + buf_row * buf_stride; const uint8_t *src = rsb->stripe_boundary_below + (buf_off << use_highbd); uint8_t *dst8 = data8_bl + i * data_stride; // Save old pixels, then replace with data from stripe_boundary_below memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); memcpy(REAL_PTR(use_highbd, dst8), src, line_size); } } } else { if (copy_above) { uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; // Only save and overwrite i=-RESTORATION_BORDER line. uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; // Save old pixels, then replace with data from stripe_boundary_above memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); memcpy(REAL_PTR(use_highbd, dst8), REAL_PTR(use_highbd, data8_tl + (-RESTORATION_BORDER + 1) * data_stride), line_size); } if (copy_below) { const int stripe_end = limits->v_start + h; uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; // Only save and overwrite i=2 line. 
uint8_t *dst8 = data8_bl + 2 * data_stride; // Save old pixels, then replace with data from stripe_boundary_below memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); memcpy(REAL_PTR(use_highbd, dst8), REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); } } } // Once a processing stripe is finished, this function sets the boundary // pixels which were overwritten by setup_processing_stripe_boundary() // back to their original values static void restore_processing_stripe_boundary( const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs, int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above, int copy_below, int opt) { const int line_width = (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; const int line_size = line_width << use_highbd; const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; if (!opt) { if (copy_above) { uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; for (int i = -RESTORATION_BORDER; i < 0; ++i) { uint8_t *dst8 = data8_tl + i * data_stride; memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size); } } if (copy_below) { const int stripe_bottom = limits->v_start + h; uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; for (int i = 0; i < RESTORATION_BORDER; ++i) { if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break; uint8_t *dst8 = data8_bl + i * data_stride; memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size); } } } else { if (copy_above) { uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; // Only restore i=-RESTORATION_BORDER line. uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size); } if (copy_below) { const int stripe_bottom = limits->v_start + h; uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; // Only restore i=2 line. if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) { uint8_t *dst8 = data8_bl + 2 * data_stride; memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size); } } } } static void wiener_filter_stripe(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, struct aom_internal_error_info *error_info) { (void)tmpbuf; (void)bit_depth; (void)error_info; assert(bit_depth == 8); const WienerConvolveParams conv_params = get_conv_params_wiener(8); for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); const uint8_t *src_p = src + j; uint8_t *dst_p = dst + j; av1_wiener_convolve_add_src( src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16, rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params); } } /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1) over the input. The window is of size (2r + 1)x(2r + 1), and we specialize to r = 1, 2, 3. A default function is used for r > 3. Each loop follows the same format: We keep a window's worth of input in individual variables and select data out of that as appropriate. */ static void boxsum1(int32_t *src, int width, int height, int src_stride, int sqr, int32_t *dst, int dst_stride) { int i, j, a, b, c; assert(width > 2 * SGRPROJ_BORDER_HORZ); assert(height > 2 * SGRPROJ_BORDER_VERT); // Vertical sum over 3-pixel regions, from src into dst. 
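  // The vertical pass below slides a 3-tap window down each column; the first
  // and last output rows sum only the two in-range samples. The horizontal
  // pass further down truncates the window at the left/right edges in the same
  // way. These truncated sums fall inside the SGRPROJ_BORDER_* padding that
  // callers provide (see the asserts above), so they presumably never reach
  // the final filter output.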
if (!sqr) { for (j = 0; j < width; ++j) { a = src[j]; b = src[src_stride + j]; c = src[2 * src_stride + j]; dst[j] = a + b; for (i = 1; i < height - 2; ++i) { // Loop invariant: At the start of each iteration, // a = src[(i - 1) * src_stride + j] // b = src[(i ) * src_stride + j] // c = src[(i + 1) * src_stride + j] dst[i * dst_stride + j] = a + b + c; a = b; b = c; c = src[(i + 2) * src_stride + j]; } dst[i * dst_stride + j] = a + b + c; dst[(i + 1) * dst_stride + j] = b + c; } } else { for (j = 0; j < width; ++j) { a = src[j] * src[j]; b = src[src_stride + j] * src[src_stride + j]; c = src[2 * src_stride + j] * src[2 * src_stride + j]; dst[j] = a + b; for (i = 1; i < height - 2; ++i) { dst[i * dst_stride + j] = a + b + c; a = b; b = c; c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j]; } dst[i * dst_stride + j] = a + b + c; dst[(i + 1) * dst_stride + j] = b + c; } } // Horizontal sum over 3-pixel regions of dst for (i = 0; i < height; ++i) { a = dst[i * dst_stride]; b = dst[i * dst_stride + 1]; c = dst[i * dst_stride + 2]; dst[i * dst_stride] = a + b; for (j = 1; j < width - 2; ++j) { // Loop invariant: At the start of each iteration, // a = src[i * src_stride + (j - 1)] // b = src[i * src_stride + (j )] // c = src[i * src_stride + (j + 1)] dst[i * dst_stride + j] = a + b + c; a = b; b = c; c = dst[i * dst_stride + (j + 2)]; } dst[i * dst_stride + j] = a + b + c; dst[i * dst_stride + (j + 1)] = b + c; } } static void boxsum2(int32_t *src, int width, int height, int src_stride, int sqr, int32_t *dst, int dst_stride) { int i, j, a, b, c, d, e; assert(width > 2 * SGRPROJ_BORDER_HORZ); assert(height > 2 * SGRPROJ_BORDER_VERT); // Vertical sum over 5-pixel regions, from src into dst. if (!sqr) { for (j = 0; j < width; ++j) { a = src[j]; b = src[src_stride + j]; c = src[2 * src_stride + j]; d = src[3 * src_stride + j]; e = src[4 * src_stride + j]; dst[j] = a + b + c; dst[dst_stride + j] = a + b + c + d; for (i = 2; i < height - 3; ++i) { // Loop invariant: At the start of each iteration, // a = src[(i - 2) * src_stride + j] // b = src[(i - 1) * src_stride + j] // c = src[(i ) * src_stride + j] // d = src[(i + 1) * src_stride + j] // e = src[(i + 2) * src_stride + j] dst[i * dst_stride + j] = a + b + c + d + e; a = b; b = c; c = d; d = e; e = src[(i + 3) * src_stride + j]; } dst[i * dst_stride + j] = a + b + c + d + e; dst[(i + 1) * dst_stride + j] = b + c + d + e; dst[(i + 2) * dst_stride + j] = c + d + e; } } else { for (j = 0; j < width; ++j) { a = src[j] * src[j]; b = src[src_stride + j] * src[src_stride + j]; c = src[2 * src_stride + j] * src[2 * src_stride + j]; d = src[3 * src_stride + j] * src[3 * src_stride + j]; e = src[4 * src_stride + j] * src[4 * src_stride + j]; dst[j] = a + b + c; dst[dst_stride + j] = a + b + c + d; for (i = 2; i < height - 3; ++i) { dst[i * dst_stride + j] = a + b + c + d + e; a = b; b = c; c = d; d = e; e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j]; } dst[i * dst_stride + j] = a + b + c + d + e; dst[(i + 1) * dst_stride + j] = b + c + d + e; dst[(i + 2) * dst_stride + j] = c + d + e; } } // Horizontal sum over 5-pixel regions of dst for (i = 0; i < height; ++i) { a = dst[i * dst_stride]; b = dst[i * dst_stride + 1]; c = dst[i * dst_stride + 2]; d = dst[i * dst_stride + 3]; e = dst[i * dst_stride + 4]; dst[i * dst_stride] = a + b + c; dst[i * dst_stride + 1] = a + b + c + d; for (j = 2; j < width - 3; ++j) { // Loop invariant: At the start of each iteration, // a = src[i * src_stride + (j - 2)] // b = src[i * src_stride + (j 
- 1)] // c = src[i * src_stride + (j )] // d = src[i * src_stride + (j + 1)] // e = src[i * src_stride + (j + 2)] dst[i * dst_stride + j] = a + b + c + d + e; a = b; b = c; c = d; d = e; e = dst[i * dst_stride + (j + 3)]; } dst[i * dst_stride + j] = a + b + c + d + e; dst[i * dst_stride + (j + 1)] = b + c + d + e; dst[i * dst_stride + (j + 2)] = c + d + e; } } static void boxsum(int32_t *src, int width, int height, int src_stride, int r, int sqr, int32_t *dst, int dst_stride) { if (r == 1) boxsum1(src, width, height, src_stride, sqr, dst, dst_stride); else if (r == 2) boxsum2(src, width, height, src_stride, sqr, dst, dst_stride); else assert(0 && "Invalid value of r in self-guided filter"); } void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { if (params->r[0] == 0) { xq[0] = 0; xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; } else if (params->r[1] == 0) { xq[0] = xqd[0]; xq[1] = 0; } else { xq[0] = xqd[0]; xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1]; } } const int32_t av1_x_by_xplus1[256] = { // Special case: Map 0 -> 1 (corresponding to a value of 1/256) // instead of 0. See comments in selfguided_restoration_internal() for why 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256, }; const int32_t av1_one_by_x[MAX_NELEM] = { 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, }; static void calculate_intermediate_result(int32_t *dgd, int width, int height, int dgd_stride, int bit_depth, int sgr_params_idx, int radius_idx, int pass, int32_t *A, int32_t *B) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. int buf_stride = ((width_ext + 3) & ~3) + 16; const int step = pass == 0 ? 
1 : 2; int i, j; assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && "Need SGRPROJ_BORDER_* >= r+1"); boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. for (i = -1; i < height + 1; i += step) { for (j = -1; j < width + 1; ++j) { const int k = i * buf_stride + j; const int n = (2 * r + 1) * (2 * r + 1); // a < 2^16 * n < 2^22 regardless of bit depth uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); // b < 2^8 * n < 2^14 regardless of bit depth uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, // and p itself satisfies p < 2^14 * n^2 < 2^26. // This bound on p is due to: // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances // // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. // This is an artefact of rounding, and can only happen if all pixels // are (almost) identical, so in this case we saturate to p=0. uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; const uint32_t s = params->s[radius_idx]; // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 // (this holds even after accounting for the rounding in s) const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); // Note: We have to be quite careful about the value of A[k]. // This is used as a blend factor between individual pixel values and the // local mean. So it logically has a range of [0, 256], including both // endpoints. // // This is a pain for hardware, as we'd like something which can be stored // in exactly 8 bits. // Further, in the calculation of B[k] below, if z == 0 and r == 2, // then A[k] "should be" 0. But then we can end up setting B[k] to a value // slightly above 2^(8 + bit depth), due to rounding in the value of // av1_one_by_x[25-1]. // // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. // This fixes the above issues (256 - A[k] fits in a uint8, and we can't // overflow), without significantly affecting the final result: z == 0 // implies that the image is essentially "flat", so the local mean and // individual pixel values are very similar. // // Note that saturating on the other side, ie. requring A[k] <= 255, // would be a bad idea, as that corresponds to the case where the image // is very variable, when we want to preserve the local pixel value as // much as possible. A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, // av1_one_by_x[n - 1] = round(2^12 / n) // => the product here is < 2^(20 + bit_depth) <= 2^32, // and B[k] is set to a value < 2^(8 + bit depth) // This holds even with the rounding in av1_one_by_x and in the overall // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. 
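      // Interpretation: av1_one_by_x[n - 1] is approximately
      // 2^SGRPROJ_RECIP_BITS / n, so the value computed below is roughly
      //   B[k] ~= (SGRPROJ_SGR - A[k]) * (box_sum / n)
      //         = (256 - A[k]) * local_mean
      // (up to rounding). Combined with the blend-factor role of A[k]
      // described above, the later output a * dgd + b mixes the pixel value
      // and the local mean with weights A[k] and 256 - A[k].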
B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * (uint32_t)B[k] * (uint32_t)av1_one_by_x[n - 1], SGRPROJ_RECIP_BITS); } } } static void selfguided_restoration_fast_internal( int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. int buf_stride = ((width_ext + 3) & ~3) + 16; int32_t A_[RESTORATION_PROC_UNIT_PELS]; int32_t B_[RESTORATION_PROC_UNIT_PELS]; int32_t *A = A_; int32_t *B = B_; int i, j; calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, sgr_params_idx, radius_idx, 1, A, B); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Use the A[] and B[] arrays to calculate the filtered image (void)r; assert(r == 2); for (i = 0; i < height; ++i) { if (!(i & 1)) { // even row for (j = 0; j < width; ++j) { const int k = i * buf_stride + j; const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * 5; const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * 5; const int32_t v = a * dgd[l] + b; dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); } } else { // odd row for (j = 0; j < width; ++j) { const int k = i * buf_stride + j; const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 4; const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; const int32_t v = a * dgd[l] + b; dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); } } } } static void selfguided_restoration_internal(int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes, for consistency // with the SIMD version of this function. 
int buf_stride = ((width_ext + 3) & ~3) + 16; int32_t A_[RESTORATION_PROC_UNIT_PELS]; int32_t B_[RESTORATION_PROC_UNIT_PELS]; int32_t *A = A_; int32_t *B = B_; int i, j; calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, sgr_params_idx, radius_idx, 0, A, B); A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; // Use the A[] and B[] arrays to calculate the filtered image for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int k = i * buf_stride + j; const int l = i * dgd_stride + j; const int m = i * dst_stride + j; const int nb = 5; const int32_t a = (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * 4 + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * 3; const int32_t b = (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * 4 + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * 3; const int32_t v = a * dgd[l] + b; dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); } } } int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; int32_t *dgd32 = dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; if (highbd) { const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; } } } else { for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; } } } const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to // skipping SGR entirely. assert(!(params->r[0] == 0 && params->r[1] == 0)); if (params->r[0] > 0) selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, flt0, flt_stride, bit_depth, sgr_params_idx, 0); if (params->r[1] > 0) selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, flt_stride, bit_depth, sgr_params_idx, 1); return 0; } int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); const int ret = av1_selfguided_restoration_c( dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); if (ret != 0) return ret; const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; av1_decode_xq(xqd, xq, params); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int k = i * width + j; uint8_t *dst8ij = dst8 + i * dst_stride + j; const uint8_t *dat8ij = dat8 + i * stride + j; const uint16_t pre_u = highbd ? 
*CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; int32_t v = u << SGRPROJ_PRJ_BITS; // If params->r == 0 then we skipped the filtering in // av1_selfguided_restoration_c, i.e. flt[k] == u if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); const int16_t w = (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const uint16_t out = clip_pixel_highbd(w, bit_depth); if (highbd) *CONVERT_TO_SHORTPTR(dst8ij) = out; else *dst8ij = (uint8_t)out; } } return 0; } static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, struct aom_internal_error_info *error_info) { (void)bit_depth; assert(bit_depth == 8); for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); if (av1_apply_selfguided_restoration( src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0) != 0) { aom_internal_error( error_info, AOM_CODEC_MEM_ERROR, "Error allocating buffer in av1_apply_selfguided_restoration"); } } } #if CONFIG_AV1_HIGHBITDEPTH static void wiener_filter_stripe_highbd( const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, struct aom_internal_error_info *error_info) { (void)tmpbuf; (void)error_info; const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth); for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); const uint8_t *src8_p = src8 + j; uint8_t *dst8_p = dst8 + j; av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, rui->wiener_info.hfilter, 16, rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params, bit_depth); } } static void sgrproj_filter_stripe_highbd( const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, struct aom_internal_error_info *error_info) { for (int j = 0; j < stripe_width; j += procunit_width) { int w = AOMMIN(procunit_width, stripe_width - j); if (av1_apply_selfguided_restoration( src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1) != 0) { aom_internal_error( error_info, AOM_CODEC_MEM_ERROR, "Error allocating buffer in av1_apply_selfguided_restoration"); } } } #endif // CONFIG_AV1_HIGHBITDEPTH typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, int stripe_width, int stripe_height, int procunit_width, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, struct aom_internal_error_info *error_info); #if CONFIG_AV1_HIGHBITDEPTH #define NUM_STRIPE_FILTERS 4 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, sgrproj_filter_stripe_highbd }; #else #define NUM_STRIPE_FILTERS 2 static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { wiener_filter_stripe, sgrproj_filter_stripe }; #endif // CONFIG_AV1_HIGHBITDEPTH // Filter one restoration unit void av1_loop_restoration_filter_unit( const RestorationTileLimits 
*limits, const RestorationUnitInfo *rui, const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int optimized_lr, struct aom_internal_error_info *error_info) { RestorationType unit_rtype = rui->restoration_type; int unit_h = limits->v_end - limits->v_start; int unit_w = limits->h_end - limits->h_start; uint8_t *data8_tl = data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start; uint8_t *dst8_tl = dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start; if (unit_rtype == RESTORE_NONE) { copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd); return; } const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ); assert(filter_idx < NUM_STRIPE_FILTERS); const stripe_filter_fun stripe_filter = stripe_filters[filter_idx]; const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; // Filter the whole image one stripe at a time RestorationTileLimits remaining_stripes = *limits; int i = 0; while (i < unit_h) { int copy_above, copy_below; remaining_stripes.v_start = limits->v_start + i; get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y, ©_above, ©_below); const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; // Work out where this stripe's boundaries are within // rsb->stripe_boundary_{above,below} const int frame_stripe = (remaining_stripes.v_start + runit_offset) / full_stripe_height; const int rsb_row = RESTORATION_CTX_VERT * frame_stripe; // Calculate this stripe's height, based on two rules: // * The topmost stripe in the frame is 8 luma pixels shorter than usual. // * We can't extend past the end of the current restoration unit const int nominal_stripe_height = full_stripe_height - ((frame_stripe == 0) ? 
runit_offset : 0); const int h = AOMMIN(nominal_stripe_height, remaining_stripes.v_end - remaining_stripes.v_start); setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, h, data8, stride, rlbs, copy_above, copy_below, optimized_lr); stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth, error_info); restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, data8, stride, copy_above, copy_below, optimized_lr); i += h; } } static void filter_frame_on_unit(const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; const RestorationInfo *rsi = ctxt->rsi; av1_loop_restoration_filter_unit( limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info); } void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, int num_planes) { const SequenceHeader *const seq_params = cm->seq_params; const int bit_depth = seq_params->bit_depth; const int highbd = seq_params->use_highbitdepth; lr_ctxt->dst = &cm->rst_frame; const int frame_width = frame->crop_widths[0]; const int frame_height = frame->crop_heights[0]; if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL, NULL, false, 0) != AOM_CODEC_OK) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); lr_ctxt->on_rest_unit = filter_frame_on_unit; lr_ctxt->frame = frame; for (int plane = 0; plane < num_planes; ++plane) { RestorationInfo *rsi = &cm->rst_info[plane]; RestorationType rtype = rsi->frame_restoration_type; rsi->optimized_lr = optimized_lr; lr_ctxt->ctxt[plane].rsi = rsi; if (rtype == RESTORE_NONE) { continue; } const int is_uv = plane > 0; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); assert(plane_w == frame->crop_widths[is_uv]); assert(plane_h == frame->crop_heights[is_uv]); av1_extend_frame(frame->buffers[plane], plane_w, plane_h, frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, highbd); FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; lr_plane_ctxt->plane_w = plane_w; lr_plane_ctxt->plane_h = plane_h; lr_plane_ctxt->highbd = highbd; lr_plane_ctxt->bit_depth = bit_depth; lr_plane_ctxt->data8 = frame->buffers[plane]; lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; lr_plane_ctxt->data_stride = frame->strides[is_uv]; lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; } } static void loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, AV1_COMMON *cm, int num_planes) { typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u, aom_yv12_partial_coloc_copy_v }; assert(num_planes <= 3); for (int plane = 0; plane < num_planes; ++plane) { if (cm->rst_info[plane].frame_restoration_type == 
RESTORE_NONE) continue; FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane]; copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0, lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h); } } // Call on_rest_unit for each loop restoration unit in the plane. static void foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, rest_unit_visitor_t on_rest_unit, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs) { const RestorationInfo *rsi = &cm->rst_info[plane]; const int hnum_rest_units = rsi->horz_units; const int vnum_rest_units = rsi->vert_units; const int unit_size = rsi->restoration_unit_size; const int is_uv = plane > 0; const int ss_y = is_uv && cm->seq_params->subsampling_y; const int ext_size = unit_size * 3 / 2; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); int y0 = 0, i = 0; while (y0 < plane_h) { int remaining_h = plane_h - y0; int h = (remaining_h < ext_size) ? remaining_h : unit_size; RestorationTileLimits limits; limits.v_start = y0; limits.v_end = y0 + h; assert(limits.v_end <= plane_h); // Offset upwards to align with the restoration processing stripe const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; limits.v_start = AOMMAX(0, limits.v_start - voffset); if (limits.v_end < plane_h) limits.v_end -= voffset; av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size, hnum_rest_units, vnum_rest_units, plane, priv, tmpbuf, rlbs, av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL, cm->error); y0 += h; ++i; } } static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, int num_planes) { FilterFrameCtxt *ctxt = lr_ctxt->ctxt; for (int plane = 0; plane < num_planes; ++plane) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) { continue; } foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, &ctxt[plane], cm->rst_tmpbuf, cm->rlbs); } } void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, void *lr_ctxt) { assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, optimized_lr, num_planes); foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes); loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes); } void av1_foreach_rest_unit_in_row( RestorationTileLimits *limits, int plane_w, rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, int hnum_rest_units, int vnum_rest_units, int plane, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, struct aom_internal_error_info *error_info) { const int ext_size = unit_size * 3 / 2; int x0 = 0, j = 0; while (x0 < plane_w) { int remaining_w = plane_w - x0; int w = (remaining_w < ext_size) ? 
remaining_w : unit_size; limits->h_start = x0; limits->h_end = x0 + w; assert(limits->h_end <= plane_w); const int unit_idx = row_number * hnum_rest_units + j; // No sync for even numbered rows // For odd numbered rows, Loop Restoration of current block requires the LR // of top-right and bottom-right blocks to be completed // top-right sync on_sync_read(lr_sync, row_number, j, plane); if ((row_number + 1) < vnum_rest_units) // bottom-right sync on_sync_read(lr_sync, row_number + 2, j, plane); #if CONFIG_MULTITHREAD if (lr_sync && lr_sync->num_workers > 1) { pthread_mutex_lock(lr_sync->job_mutex); const bool lr_mt_exit = lr_sync->lr_mt_exit; pthread_mutex_unlock(lr_sync->job_mutex); // Exit in case any worker has encountered an error. if (lr_mt_exit) return; } #endif on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info); on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane); x0 += w; ++j; } } void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) { (void)lr_sync; (void)r; (void)c; (void)plane; } void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, const int sb_cols, int plane) { (void)lr_sync; (void)r; (void)c; (void)sb_cols; (void)plane; } int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, int *rrow1) { assert(rcol0 && rcol1 && rrow0 && rrow1); if (bsize != cm->seq_params->sb_size) return 0; assert(!cm->features.all_lossless); const int is_uv = plane > 0; // Compute the mi-unit corners of the superblock const int mi_row0 = mi_row; const int mi_col0 = mi_col; const int mi_row1 = mi_row0 + mi_size_high[bsize]; const int mi_col1 = mi_col0 + mi_size_wide[bsize]; const RestorationInfo *rsi = &cm->rst_info[plane]; const int size = rsi->restoration_unit_size; const int horz_units = rsi->horz_units; const int vert_units = rsi->vert_units; // The size of an MI-unit on this plane of the image const int ss_x = is_uv && cm->seq_params->subsampling_x; const int ss_y = is_uv && cm->seq_params->subsampling_y; const int mi_size_x = MI_SIZE >> ss_x; const int mi_size_y = MI_SIZE >> ss_y; // Write m for the relative mi column or row, D for the superres denominator // and N for the superres numerator. If u is the upscaled pixel offset then // we can write the downscaled pixel offset in two ways as: // // MI_SIZE * m = N / D u // // from which we get u = D * MI_SIZE * m / N const int mi_to_num_x = av1_superres_scaled(cm) ? mi_size_x * cm->superres_scale_denominator : mi_size_x; const int mi_to_num_y = mi_size_y; const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size; const int denom_y = size; const int rnd_x = denom_x - 1; const int rnd_y = denom_y - 1; // rcol0/rrow0 should be the first column/row of restoration units that // doesn't start left/below of mi_col/mi_row. For this calculation, we need // to round up the division (if the sb starts at runit column 10.1, the first // matching runit has column index 11) *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x; *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y; // rel_col1/rel_row1 is the equivalent calculation, but for the superblock // below-right. If we're at the bottom or right of the frame, this restoration // unit might not exist, in which case we'll clamp accordingly. 
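  // Worked example (illustrative, assuming SCALE_NUMERATOR == 8 and MI_SIZE
  // == 4): with a 64-pixel unit size and superres denominator 16, mi_col0 ==
  // 32 gives rcol0 = ceil(32 * 4 * 16 / (64 * 8)) = 4. Equivalently, the
  // downscaled x offset of 128 maps to an upscaled x of 256, which is exactly
  // the left edge of restoration unit column 4.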
*rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); return *rcol0 < *rcol1 && *rrow0 < *rrow1; } // Extend to left and right static void extend_lines(uint8_t *buf, int width, int height, int stride, int extend, int use_highbitdepth) { for (int i = 0; i < height; ++i) { if (use_highbitdepth) { uint16_t *buf16 = (uint16_t *)buf; aom_memset16(buf16 - extend, buf16[0], extend); aom_memset16(buf16 + width, buf16[width - 1], extend); } else { memset(buf - extend, buf[0], extend); memset(buf + width, buf[width - 1], extend); } buf += stride; } } static void save_deblock_boundary_lines( const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, int stripe, int use_highbd, int is_above, RestorationStripeBoundaries *boundaries) { const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below; uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; // There is a rare case in which a processing stripe can end 1px above the // crop border. In this case, we do want to use deblocked pixels from below // the stripe (hence why we ended up in this function), but instead of // fetching 2 "below" rows we need to fetch one and duplicate it. // This is equivalent to clamping the sample locations against the crop border const int lines_to_save = AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); assert(lines_to_save == 1 || lines_to_save == 2); int upscaled_width; int line_bytes; if (av1_superres_scaled(cm)) { const int ss_x = is_uv && cm->seq_params->subsampling_x; upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; line_bytes = upscaled_width << use_highbd; if (use_highbd) av1_upscale_normative_rows( cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, plane, lines_to_save); else av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, boundaries->stripe_boundary_stride, plane, lines_to_save); } else { upscaled_width = frame->crop_widths[is_uv]; line_bytes = upscaled_width << use_highbd; for (int i = 0; i < lines_to_save; i++) { memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, line_bytes); } } // If we only saved one line, then copy it into the second line buffer if (lines_to_save == 1) memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, RESTORATION_EXTRA_HORZ, use_highbd); } static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, int stripe, int use_highbd, int is_above, RestorationStripeBoundaries *boundaries) { const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? 
boundaries->stripe_boundary_above : boundaries->stripe_boundary_below; uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; const int src_width = frame->crop_widths[is_uv]; // At the point where this function is called, we've already applied // superres. So we don't need to extend the lines here, we can just // pull directly from the topmost row of the upscaled frame. const int ss_x = is_uv && cm->seq_params->subsampling_x; const int upscaled_width = av1_superres_scaled(cm) ? (cm->superres_upscaled_width + ss_x) >> ss_x : src_width; const int line_bytes = upscaled_width << use_highbd; for (int i = 0; i < RESTORATION_CTX_VERT; i++) { // Copy the line at 'src_rows' into both context lines memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes); } extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, RESTORATION_EXTRA_HORZ, use_highbd); } static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, int plane, AV1_COMMON *cm, int after_cdef) { const int is_uv = plane > 0; const int ss_y = is_uv && cm->seq_params->subsampling_y; const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries; const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y); int stripe_idx; for (stripe_idx = 0;; ++stripe_idx) { const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off); const int y0 = rel_y0; if (y0 >= plane_h) break; const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off; const int y1 = AOMMIN(rel_y1, plane_h); // Extend using CDEF pixels at the top and bottom of the frame, // and deblocked pixels at internal stripe boundaries const int use_deblock_above = (stripe_idx > 0); const int use_deblock_below = (y1 < plane_height); if (!after_cdef) { // Save deblocked context at internal stripe boundaries if (use_deblock_above) { save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT, stripe_idx, use_highbd, 1, boundaries); } if (use_deblock_below) { save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx, use_highbd, 0, boundaries); } } else { // Save CDEF context at frame boundaries if (!use_deblock_above) { save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd, 1, boundaries); } if (!use_deblock_below) { save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx, use_highbd, 0, boundaries); } } } } // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan // lines to be used as boundary in the loop restoration process. The // lines are saved in rst_internal.stripe_boundary_lines void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int after_cdef) { const int num_planes = av1_num_planes(cm); const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { save_boundary_lines(frame, use_highbd, p, cm, after_cdef); } } aom-3.12.1/av1/common/restoration.h000066400000000000000000000413251477627663500171070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_RESTORATION_H_ #define AOM_AV1_COMMON_RESTORATION_H_ #include "aom_ports/mem.h" #include "config/aom_config.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif /*! @file */ /*!\cond */ // Border for Loop restoration buffer #define AOM_RESTORATION_FRAME_BORDER 32 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) #define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) #define RESTORATION_PROC_UNIT_SIZE 64 // Filter stripe grid offset upwards compared to the superblock grid #define RESTORATION_UNIT_OFFSET 8 #define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr #define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr #define WIENER_BORDER_VERT 2 // Vertical border used for Wiener #define WIENER_HALFWIN 3 #define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener // RESTORATION_BORDER_VERT determines line buffer requirement for LR. // Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT. // Note the line buffer needed is twice the value of this macro. #if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT #define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT) #else #define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT) #endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT #if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ #define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ) #else #define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ) #endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT // How many border pixels do we need for each processing unit? #define RESTORATION_BORDER 3 // How many rows of deblocked pixels do we save above/below each processing // stripe? #define RESTORATION_CTX_VERT 2 // Additional pixels to the left and right in above/below buffers // It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment #define RESTORATION_EXTRA_HORZ 4 // Pad up to 20 more (may be much less is needed) #define RESTORATION_PADDING 20 #define RESTORATION_PROC_UNIT_PELS \ ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \ RESTORATION_PADDING) * \ (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \ RESTORATION_PADDING)) #define RESTORATION_UNITSIZE_MAX 256 #define RESTORATION_UNITPELS_HORZ_MAX \ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) #define RESTORATION_UNITPELS_VERT_MAX \ ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \ RESTORATION_UNIT_OFFSET)) #define RESTORATION_UNITPELS_MAX \ (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX) // Two 32-bit buffers needed for the restored versions from two filters // TODO(debargha, rupert): Refactor to not need the large tile size to be // stored on the decoder side. 
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t)) #define SGRPROJ_EXTBUF_SIZE (0) #define SGRPROJ_PARAMS_BITS 4 #define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS) // Precision bits for projection #define SGRPROJ_PRJ_BITS 7 // Restoration precision bits generated higher than source before projection #define SGRPROJ_RST_BITS 4 // Internal precision bits for core selfguided_restoration #define SGRPROJ_SGR_BITS 8 #define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS) #define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4) #define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1) #define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4) #define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1) #define SGRPROJ_PRJ_SUBEXP_K 4 #define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS) #define MAX_RADIUS 2 // Only 1, 2, 3 allowed #define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) #define SGRPROJ_MTABLE_BITS 20 #define SGRPROJ_RECIP_BITS 12 #define WIENER_HALFWIN1 (WIENER_HALFWIN + 1) #define WIENER_WIN (2 * WIENER_HALFWIN + 1) #define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN)) #define WIENER_TMPBUF_SIZE (0) #define WIENER_EXTBUF_SIZE (0) // If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for // chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. #define WIENER_WIN_CHROMA (WIENER_WIN - 2) #define WIENER_WIN_REDUCED (WIENER_WIN - 2) #define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) #define WIENER_STATS_DOWNSAMPLE_FACTOR 4 #define WIENER_FILT_PREC_BITS 7 #define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) // Central values for the taps #define WIENER_FILT_TAP0_MIDV (3) #define WIENER_FILT_TAP1_MIDV (-7) #define WIENER_FILT_TAP2_MIDV (15) #define WIENER_FILT_TAP3_MIDV \ (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \ WIENER_FILT_TAP2_MIDV)) #define WIENER_FILT_TAP0_BITS 4 #define WIENER_FILT_TAP1_BITS 5 #define WIENER_FILT_TAP2_BITS 6 #define WIENER_FILT_BITS \ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2) #define WIENER_FILT_TAP0_MINV \ (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2) #define WIENER_FILT_TAP1_MINV \ (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2) #define WIENER_FILT_TAP2_MINV \ (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2) #define WIENER_FILT_TAP0_MAXV \ (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2) #define WIENER_FILT_TAP1_MAXV \ (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2) #define WIENER_FILT_TAP2_MAXV \ (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2) #define WIENER_FILT_TAP0_SUBEXP_K 1 #define WIENER_FILT_TAP1_SUBEXP_K 2 #define WIENER_FILT_TAP2_SUBEXP_K 3 // Max of SGRPROJ_TMPBUF_SIZE, DOMAINTXFMRF_TMPBUF_SIZE, WIENER_TMPBUF_SIZE #define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE) // Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE #define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE) // Check the assumptions of the existing code #if SUBPEL_TAPS != WIENER_WIN + 1 #error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1" #endif #if WIENER_FILT_PREC_BITS != 7 #error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7" #endif typedef struct { int r[2]; // radii int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable() } sgr_params_type; /*!\endcond */ /*!\brief Parameters related to Restoration Unit Info */ typedef struct { /*! 
* restoration type */ RestorationType restoration_type; /*! * Wiener filter parameters if restoration_type indicates Wiener */ WienerInfo wiener_info; /*! * Sgrproj filter parameters if restoration_type indicates Sgrproj */ SgrprojInfo sgrproj_info; } RestorationUnitInfo; /*!\cond */ // A restoration line buffer needs space for two lines plus a horizontal filter // margin of RESTORATION_EXTRA_HORZ on each side. #define RESTORATION_LINEBUFFER_WIDTH \ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) typedef struct { // Temporary buffers to save/restore 3 lines above/below the restoration // stripe. uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; } RestorationLineBuffers; /*!\endcond */ /*!\brief Parameters related to Restoration Stripe boundaries */ typedef struct { /*! * stripe boundary above */ uint8_t *stripe_boundary_above; /*! * stripe boundary below */ uint8_t *stripe_boundary_below; /*! * strides for stripe boundaries above and below */ int stripe_boundary_stride; /*! * size of stripe boundaries above and below */ int stripe_boundary_size; } RestorationStripeBoundaries; /*!\brief Parameters related to Restoration Info */ typedef struct { /*! * Restoration type for frame */ RestorationType frame_restoration_type; /*! * Restoration unit size */ int restoration_unit_size; /** * \name Fields allocated and initialised by av1_alloc_restoration_struct. */ /**@{*/ /*! * Total number of restoration units in this plane */ int num_rest_units; /*! * Number of vertical restoration units in this plane */ int vert_units; /*! * Number of horizontal restoration units in this plane */ int horz_units; /**@}*/ /*! * Parameters for each restoration unit in this plane */ RestorationUnitInfo *unit_info; /*! * Restoration Stripe boundary info */ RestorationStripeBoundaries boundaries; /*! * Whether optimized lr can be used for speed. * That includes cases of no cdef and no superres, or if fast trial runs * are used on the encoder side. 
*/ int optimized_lr; } RestorationInfo; /*!\cond */ static inline void set_default_sgrproj(SgrprojInfo *sgrproj_info) { sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; } static inline void set_default_wiener(WienerInfo *wiener_info) { wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] = -2 * (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV); wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV; wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV; wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; } typedef struct { int h_start, h_end, v_start, v_end; } RestorationTileLimits; typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info); typedef struct FilterFrameCtxt { const RestorationInfo *rsi; int ss_x, ss_y; int plane_w, plane_h; int highbd, bit_depth; uint8_t *data8, *dst8; int data_stride, dst_stride; } FilterFrameCtxt; typedef struct AV1LrStruct { rest_unit_visitor_t on_rest_unit; FilterFrameCtxt ctxt[MAX_MB_PLANE]; YV12_BUFFER_CONFIG *frame; YV12_BUFFER_CONFIG *dst; } AV1LrStruct; extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS]; extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; extern const int32_t av1_x_by_xplus1[256]; extern const int32_t av1_one_by_x[MAX_NELEM]; void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, int is_uv); void av1_free_restoration_struct(RestorationInfo *rst_info); void av1_extend_frame(uint8_t *data, int width, int height, int stride, int border_horz, int border_vert, int highbd); void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params); /*!\endcond */ /*!\brief Function for applying loop restoration filter to a single unit. * * \ingroup in_loop_restoration * This function applies the loop restoration filter to a single * loop restoration unit. * * \param[in] limits Limits of the unit * \param[in] rui The parameters to use for this unit and its * coefficients * \param[in] rsb Deblocked pixels to use for stripe boundaries * \param[in] rlbs Space to use as a scratch buffer * \param[in] ss_x Horizontal subsampling for plane * \param[in] ss_y Vertical subsampling for plane * \param[in] plane_w Width of the current plane * \param[in] plane_h Height of the current plane * \param[in] highbd Whether high bitdepth pipeline is used * \param[in] bit_depth Bit-depth of the video * \param[in] data8 Frame data (pointing at the top-left corner of * the frame, not the restoration unit). * \param[in] stride Stride of \c data8 * \param[out] dst8 Buffer where the results will be written. Like * \c data8, \c dst8 should point at the top-left * corner of the frame * \param[in] dst_stride Stride of \c dst8 * \param[in] tmpbuf Scratch buffer used by the sgrproj filter * which should be at least SGRPROJ_TMPBUF_SIZE * big. * \param[in] optimized_lr Whether to use fast optimized Loop Restoration * \param[in,out] error_info Error info for reporting errors * * \remark Nothing is returned. Instead, the filtered unit is output in * \c dst8 at the proper restoration unit offset. 
*/ void av1_loop_restoration_filter_unit( const RestorationTileLimits *limits, const RestorationUnitInfo *rui, const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int optimized_lr, struct aom_internal_error_info *error_info); /*!\brief Function for applying loop restoration filter to a frame * * \ingroup in_loop_restoration * This function applies the loop restoration filter to a frame. * * \param[in,out] frame Compressed frame buffer * \param[in,out] cm Pointer to top level common structure * \param[in] optimized_lr Whether to use fast optimized Loop Restoration * \param[in] lr_ctxt Loop restoration context * * \remark Nothing is returned. Instead, the filtered frame is output in * \c frame. */ void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int optimized_lr, void *lr_ctxt); /*!\cond */ void av1_loop_restoration_precal(void); struct AV1LrSyncData; typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane); typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c, const int sb_cols, int plane); // Return 1 iff the block at mi_row, mi_col with size bsize is a // top-level superblock containing the top-left corner of at least one // loop restoration unit. // // If the block is a top-level superblock, the function writes to // *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all // restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1) // are signaled in this superblock. int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rcol0, int *rcol1, int *rrow0, int *rrow1); void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int after_cdef); void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int optimized_lr, int num_planes); void av1_foreach_rest_unit_in_row( RestorationTileLimits *limits, int plane_w, rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, int hnum_rest_units, int vnum_rest_units, int plane, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, struct aom_internal_error_info *error_info); void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv, int *plane_w, int *plane_h); int av1_lr_count_units(int unit_size, int plane_size); void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, const int sb_cols, int plane); /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_RESTORATION_H_ aom-3.12.1/av1/common/riscv/000077500000000000000000000000001477627663500155065ustar00rootroot00000000000000aom-3.12.1/av1/common/riscv/cdef_block_rvv.c000066400000000000000000001450151477627663500206300ustar00rootroot00000000000000/* * Copyright (c) 2025, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/cdef_block.h" // partial A is a 16-bit vector of the form: // [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: // [0 y1 y2 y3 y4 y5 y6 y7]. // This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... // (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 // and const2. static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala, vint16m1_t partialb, vuint32m1_t const1, vuint32m1_t const2) { // Square and add the corresponding x and y values. vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8); cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8); // Multiply by constant. vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost); vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1( __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4); tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8); vuint32m1_t ret = __riscv_vmacc_vv_u32m1( cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4); return ret; } // This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal // down-right, 6 is vertical). // // For each direction the lines are shifted so that we can perform a // basic sum on each vector element. For example, direction 5 is "south by // southeast", so we need to add the pixels along each line i below: // // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // For this to fit nicely in vectors, the lines need to be shifted like so: // 0 1 2 3 4 5 6 7 // 0 1 2 3 4 5 6 7 // 8 0 1 2 3 4 5 6 // 8 0 1 2 3 4 5 6 // 9 8 0 1 2 3 4 5 // 9 8 0 1 2 3 4 5 // 10 9 8 0 1 2 3 4 // 10 9 8 0 1 2 3 4 // // In this configuration we can now perform SIMD additions to get the cost // along direction 5. Since this won't fit into a single 128-bit vector, we use // two of them to compute each half of the new configuration, and pad the empty // spaces with zeros. Similar shifting is done for other directions, except // direction 6 which is straightforward as it's the vertical direction. static vuint32m1_t compute_vert_directions_rvv( vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); // Partial sums for lines 0 and 1. 
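  // Throughout this function, vslideup_vx(vec_zero, line, 8 - m, vl) keeps
  // the first m elements of a line in the top m lanes (the remaining lanes
  // stay zero), while the matching vslidedown_vx(line, m, ...) keeps the
  // other 8 - m elements in the bottom lanes.  Accumulating these shifted
  // halves gives, lane by lane, the sums along the down-right diagonals for
  // direction 4; directions 5 and 7 apply the same idea to pre-summed pairs
  // of rows, and direction 6 (vertical) just accumulates the rows unshifted.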
vint16m1_t partial4a = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl); vint16m1_t tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl); partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl); tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN); partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN); vint16m1_t partial5a = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl); vint16m1_t partial5b = __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN); vint16m1_t partial7a = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl); vint16m1_t partial7b = __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN); vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl); // Partial sums for lines 2 and 3. tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl); partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl); partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN); partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN); partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN); partial5a = __riscv_vadd_vv_i16m1( partial5a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); partial5b = __riscv_vadd_vv_i16m1( partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); partial7a = __riscv_vadd_vv_i16m1( partial7a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); partial7b = __riscv_vadd_vv_i16m1( partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); // Partial sums for lines 4 and 5. partial4a = __riscv_vadd_vv_i16m1( partial4a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl); partial4a = __riscv_vadd_vv_i16m1( partial4a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); partial4b = __riscv_vadd_vv_i16m1( partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl); partial4b = __riscv_vadd_vv_i16m1( partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl); tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN); partial5a = __riscv_vadd_vv_i16m1( partial5a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); partial5b = __riscv_vadd_vv_i16m1( partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); partial7a = __riscv_vadd_vv_i16m1( partial7a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); partial7b = __riscv_vadd_vv_i16m1( partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); // Partial sums for lines 6 and 7. 
partial4a = __riscv_vadd_vv_i16m1( partial4a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl); partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl); partial4b = __riscv_vadd_vv_i16m1( partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl); tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN); partial5a = __riscv_vadd_vv_i16m1( partial5a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl); partial5b = __riscv_vadd_vv_i16m1( partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl); partial7a = __riscv_vadd_vv_i16m1( partial7a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl); partial7b = __riscv_vadd_vv_i16m1( partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl); partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); // const0 = { 840, 420, 280, 210, } vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); // const1 = { 168, 140, 120, 105, } vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); // const2 = { 0, 0, 420, 210, } vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4); const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4); const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4); // const3 = { 140, 105, 105, 105, }; vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4); const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4); // Compute costs in terms of partial sums. vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl); vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl); partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4); // Reverse partial B. // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }. 
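  // The reversal lines the "b" half up with the "a" half so both can share a
  // single set of per-lane weights.  Each nonzero weight is 840 / n, where n
  // is the number of pixels summed in that lane and 840 = lcm(1, ..., 8),
  // which puts short and long diagonals on a comparable scale (hence the
  // "divide by 840" remark in cdef_find_dir_rvv below).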
vuint32m1_t costs_0, costs_1, costs_2, costs_3; static const uint16_t tab_u16[8] = { 6, 5, 4, 3, 2, 1, 0, 7, }; vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); vint16m1_t partial4b_rv = __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8); costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1); vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1( __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32)); costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4); vint16m1_t partial5b_rv = __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8); costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3); vint16m1_t partial7b_rv = __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8); costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3); // combine values vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); vuint32m1_t cost0_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); vuint32m1_t cost1_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); vuint32m1_t cost2_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); vuint32m1_t cost3_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4); cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4); __riscv_vse32_v_u32m1(&cost[0], cost47, 4); return cost47; } static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala, vint16m1_t partialb, vint16m1_t partialc, vuint32m1_t const0) { vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4); vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4); vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8); vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8); tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8); vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8); tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8); vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8); partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8); partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8); vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4)); partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8); partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8); vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4)); vuint32m1_t cost = __riscv_vmul_vx_u32m1( __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4); cost = __riscv_vmacc_vv_u32m1( cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4); return cost; } static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, vint16m1_t lines_3) { vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1); vint32m1_t lines0_sum = __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8); vint32m1_t lines1_sum = __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8); vint32m1_t lines2_sum = __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8); vint32m1_t lines3_sum = __riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8); vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4); ret = 
__riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4); ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4); return ret; } // This function computes the cost along directions 0, 1, 2, 3. (0 means // 45-degree up-right, 2 is horizontal). // // For direction 1 and 3 ("east northeast" and "east southeast") the shifted // lines need three vectors instead of two. For direction 1 for example, we need // to compute the sums along the line i below: // 0 0 1 1 2 2 3 3 // 1 1 2 2 3 3 4 4 // 2 2 3 3 4 4 5 5 // 3 3 4 4 5 5 6 6 // 4 4 5 5 6 6 7 7 // 5 5 6 6 7 7 8 8 // 6 6 7 7 8 8 9 9 // 7 7 8 8 9 9 10 10 // // Which means we need the following configuration: // 0 0 1 1 2 2 3 3 // 1 1 2 2 3 3 4 4 // 2 2 3 3 4 4 5 5 // 3 3 4 4 5 5 6 6 // 4 4 5 5 6 6 7 7 // 5 5 6 6 7 7 8 8 // 6 6 7 7 8 8 9 9 // 7 7 8 8 9 9 10 10 // // Three vectors are needed to compute this, as well as some extra pairwise // additions. static vuint32m1_t compute_horiz_directions_rvv( vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { // Compute diagonal directions (1, 2, 3). // Partial sums for lines 0 and 1. size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl); partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl); vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN); vint16m1_t partial1a = __riscv_vadd_vv_i16m1( lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl), vl); vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN); vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN); partial3a = __riscv_vadd_vv_i16m1( partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl); vint16m1_t partial3b = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl); partial3b = __riscv_vadd_vv_i16m1( partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl); // Partial sums for lines 2 and 3. partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl); partial1a = __riscv_vadd_vv_i16m1( partial1a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl); partial1a = __riscv_vadd_vv_i16m1( partial1a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl); partial1b = __riscv_vadd_vv_i16m1( partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl); partial1b = __riscv_vadd_vv_i16m1( partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl); partial3a = __riscv_vadd_vv_i16m1( partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); partial3b = __riscv_vadd_vv_i16m1( partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl); // Partial sums for lines 4 and 5. 
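  // From here on the direction 1 and direction 3 sums no longer fit in two
  // vectors, so the overflow from the remaining rows is collected in the
  // extra partial1c / partial3c vectors (the "three vectors" mentioned in the
  // comment above this function).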
partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl); partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl); partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl); partial1b = __riscv_vadd_vv_i16m1( partial1b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN); partial3b = __riscv_vadd_vv_i16m1( partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl); partial3b = __riscv_vadd_vv_i16m1( partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl); vint16m1_t partial3c = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl); partial3c = __riscv_vadd_vv_i16m1( partial3c, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl); // Partial sums for lines 6 and 7. partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl); partial0a = __riscv_vadd_vv_i16m1( partial0a, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl); partial0b = __riscv_vadd_vv_i16m1( partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl); partial1b = __riscv_vadd_vv_i16m1( partial1b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl); partial1b = __riscv_vadd_vv_i16m1( partial1b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl); partial1c = __riscv_vadd_vv_i16m1( partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl); partial1c = __riscv_vadd_vv_i16m1( partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl); partial3b = __riscv_vadd_vv_i16m1( partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl); partial3c = __riscv_vadd_vv_i16m1( partial3c, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl); partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl); // Special case for direction 2 as it's just a sum along each line. 
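  // Every row holds exactly 8 pixels, so the direction 2 cost is simply
  // 105 * sum_over_rows(row_sum ^ 2), with 105 = 840 / 8 matching the
  // normalization used for the other directions.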
vint32m1_t partial2a = horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3); vint32m1_t partial2b = horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7); vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1( __riscv_vmul_vv_i32m1(partial2a, partial2a, 4)); vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1( __riscv_vmul_vv_i32m1(partial2b, partial2b, 4)); // const0 = { 840, 420, 280, 210, } vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); // const1 = { 168, 140, 120, 105, } vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); // const2 = { 420, 210, 140, 105, }; vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4); const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4); const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4); const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4); static const uint16_t tab_u16[8] = { 0, 6, 5, 4, 3, 2, 1, 0, }; vuint32m1_t costs_0, costs_1, costs_2, costs_3; vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); // Reverse partial c. // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, } vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8); vint16m1_t partial0b_rv = __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8); costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1); // Reverse partial c. // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, } vuint16m1_t index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8); index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8); vint16m1_t partialc_rv = __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8); costs_1 = fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2); costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4); costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4); vint16m1_t partial3a_rv = __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8); costs_3 = fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2); // combine values vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); vuint32m1_t cost0_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); vuint32m1_t cost1_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); vuint32m1_t cost2_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); vuint32m1_t cost3_sum = __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4); costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4); __riscv_vse32_v_u32m1(&cost[0], costs_0, 4); return costs_0; } int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { size_t vl = 8; size_t vlmax = __riscv_vsetvlmax_e16m1(); vuint16m1_t s; vint16m1_t lines_0, lines_1, lines_2, lines_3; vint16m1_t lines_4, lines_5, lines_6, lines_7; vuint16m1_t vec_zero_u16m1 = __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16)); if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_0 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl); img += 
stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_1 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_2 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_3 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_4 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_5 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_6 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl); img += stride; if (vlmax == 8) s = __riscv_vle16_v_u16m1(img, vl); else s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); lines_7 = __riscv_vreinterpret_v_u16m1_i16m1( __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl); // Compute "mostly vertical" directions. uint32_t cost[8]; vuint32m1_t cost47 = compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, lines_5, lines_6, lines_7, cost + 4, vl); // Compute "mostly horizontal" directions. vuint32m1_t cost03 = compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, lines_5, lines_6, lines_7, cost, vl); // Find max cost as well as its index to get best_dir. // The max cost needs to be propagated in the whole vector to find its // position in the original cost vectors cost03 and cost47. vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4); uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32( __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4)); vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4); long best_dir = __riscv_vfirst_m_b32(mask_cost, 4); if (best_dir == -1) { mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4); best_dir = __riscv_vfirst_m_b32(mask_cost, 4); best_dir += 4; } // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. *var = best_cost - cost[(best_dir + 4) & 7]; // We'd normally divide by 840, but dividing by 1024 is close enough // for what we're going to do with this. 
*var >>= 10; return (int)best_dir; } void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { do { int w = 0; size_t num_cols = width; while (num_cols > 0) { size_t vl = __riscv_vsetvl_e8mf2(num_cols); vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl); vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl); __riscv_vse16_v_u16m1(dst + w, u16_src, vl); w += vl; num_cols -= vl; } src += sstride; dst += dstride; } while (--height != 0); } void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int width, int height) { do { int w = 0; size_t num_cols = width; while (num_cols > 0) { size_t vl = __riscv_vsetvl_e16m1(num_cols); vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl); __riscv_vse16_v_u16m1(dst + w, u16_src, vl); w += vl; num_cols -= vl; } src += sstride; dst += dstride; } while (--height != 0); } static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b, int16_t threshold, int16_t adjdamp, size_t vl) { if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl); const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl); const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl); const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl); const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl); const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl); const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl); const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl); const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl); return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl); } static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) { const vbool16_t mask = __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl); const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl); return __riscv_vmax_vv_i16m1(val, b, vl); } static inline vint16m1_t load_strided_i16_4x2(int16_t *addr, const ptrdiff_t stride, size_t vl) { const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl); const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl); return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl); } static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst, const ptrdiff_t stride, size_t vl) { __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1); vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl); __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1); } static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst, const ptrdiff_t stride, size_t vl) { __riscv_vse16_v_u16m1(addr, vdst, vl >> 1); vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl); __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1); } #define LOAD_PIX(addr) \ const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \ vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) #define LOAD_PIX4(addr) \ const vint16m1_t px = \ load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \ vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) #define LOAD_DIR(p, addr, o0, o1) \ const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \ const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \ const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \ const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl) #define LOAD_DIR4(p, addr, o0, o1) \ const vint16m1_t p##0 = \ load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \ const vint16m1_t p##1 = \ load_strided_i16_4x2((int16_t 
*)addr - o0, CDEF_BSTRIDE, vl); \ const vint16m1_t p##2 = \ load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \ const vint16m1_t p##3 = \ load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl) #define MAKE_TAPS \ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \ const int16_t tap0 = (int16_t)(pri_taps[0]); \ const int16_t tap1 = (int16_t)(pri_taps[1]) #define CONSTRAIN(p, strength, shift) \ vint16m1_t p##_c0 = \ constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \ vint16m1_t p##_c1 = \ constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \ vint16m1_t p##_c2 = \ constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \ vint16m1_t p##_c3 = \ constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl) #define SETUP_MINMAX \ vint16m1_t max = px; \ vint16m1_t min = px #define MIN_MAX(p) \ do { \ max = vmax_mask(p##0, max, vl); \ min = __riscv_vmin_vv_i16m1(p##0, min, vl); \ max = vmax_mask(p##1, max, vl); \ min = __riscv_vmin_vv_i16m1(p##1, min, vl); \ max = vmax_mask(p##2, max, vl); \ min = __riscv_vmin_vv_i16m1(p##2, min, vl); \ max = vmax_mask(p##3, max, vl); \ min = __riscv_vmin_vv_i16m1(p##3, min, vl); \ } while (0) #define PRI_0_UPDATE_SUM(p) \ const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl); \ sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl) #define UPDATE_SUM(p) \ const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl); \ sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl) #define SEC_0_UPDATE_SUM(p) \ const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \ sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl) #define BIAS \ const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl); \ const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl); \ const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl); \ const vint16m1_t unclamped = __riscv_vadd_vv_i16m1( \ px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \ vl) #define STORE4 \ do { \ store_strided_u8_4x2(dst8, vdst, dstride, vl); \ \ in += (CDEF_BSTRIDE << 1); \ dst8 += (dstride << 1); \ } while (0) #define STORE4_CLAMPED \ do { \ BIAS; \ vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ STORE4; \ } while (0) #define STORE4_UNCLAMPED \ do { \ BIAS; \ vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ STORE4; \ } while (0) #define STORE8 \ do { \ __riscv_vse8_v_u8mf2(dst8, vdst, vl); \ \ in += CDEF_BSTRIDE; \ dst8 += dstride; \ } while (0) #define STORE8_CLAMPED \ do { \ BIAS; \ vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ STORE8; \ } while (0) #define STORE8_UNCLAMPED \ do { \ BIAS; \ vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ STORE8; \ } while (0) #define STORE16_4 
\ do { \ store_strided_u16_4x2(dst16, vdst, dstride, vl); \ \ in += (CDEF_BSTRIDE << 1); \ dst16 += (dstride << 1); \ } while (0) #define STORE16_4_CLAMPED \ do { \ BIAS; \ vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ STORE16_4; \ } while (0) #define STORE16_4_UNCLAMPED \ do { \ BIAS; \ vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ STORE16_4; \ } while (0) #define STORE16 \ do { \ __riscv_vse16_v_u16m1(dst16, vdst, vl); \ \ in += CDEF_BSTRIDE; \ dst16 += dstride; \ } while (0) #define STORE16_CLAMPED \ do { \ BIAS; \ vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ STORE16; \ } while (0) #define STORE16_UNCLAMPED \ do { \ BIAS; \ vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ STORE16; \ } while (0) void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; MAKE_TAPS; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); SETUP_MINMAX; // Primary pass LOAD_DIR(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); MIN_MAX(p); PRI_0_UPDATE_SUM(p); // Secondary pass 1 LOAD_DIR(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); MIN_MAX(s); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); MIN_MAX(s2); UPDATE_SUM(s2); // Store STORE8_CLAMPED; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { LOAD_PIX4(in); SETUP_MINMAX; // Primary pass LOAD_DIR4(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); MIN_MAX(p); PRI_0_UPDATE_SUM(p); // Secondary pass 1 LOAD_DIR4(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); MIN_MAX(s); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR4(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); MIN_MAX(s2); UPDATE_SUM(s2); // Store STORE4_CLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)sec_strength; (void)sec_damping; const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; MAKE_TAPS; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); // Primary pass LOAD_DIR(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); PRI_0_UPDATE_SUM(p); // Store STORE8_UNCLAMPED; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { 
LOAD_PIX4(in); // Primary pass LOAD_DIR4(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); PRI_0_UPDATE_SUM(p); // Store STORE4_UNCLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)pri_damping; (void)coeff_shift; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); // Secondary pass 1 LOAD_DIR(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); UPDATE_SUM(s2); // Store STORE8_UNCLAMPED; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { LOAD_PIX4(in); // Secondary pass 1 LOAD_DIR4(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR4(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); UPDATE_SUM(s2); // Store STORE4_UNCLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; (void)pri_damping; (void)sec_damping; (void)coeff_shift; if (block_width == 8) { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width; do { const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl); __riscv_vse8_v_u8mf2(dst8, vdst, vl); in += CDEF_BSTRIDE; dst8 += dstride; } while (--h != 0); } else { uint8_t *dst8 = (uint8_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { const vint16m1_t px = load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl); store_strided_u8_4x2(dst8, vdst, dstride, vl); in += 2 * CDEF_BSTRIDE; dst8 += 2 * dstride; h -= 2; } while (h != 0); } } void cdef_filter_16_0_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; MAKE_TAPS; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); SETUP_MINMAX; // Primary pass LOAD_DIR(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); MIN_MAX(p); PRI_0_UPDATE_SUM(p); // Secondary pass 1 LOAD_DIR(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); MIN_MAX(s); 
SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); MIN_MAX(s2); UPDATE_SUM(s2); // Store STORE16_CLAMPED; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { LOAD_PIX4(in); SETUP_MINMAX; // Primary pass LOAD_DIR4(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); MIN_MAX(p); PRI_0_UPDATE_SUM(p); // Secondary pass 1 LOAD_DIR4(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); MIN_MAX(s); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR4(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); MIN_MAX(s2); UPDATE_SUM(s2); // Store STORE16_4_CLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)sec_strength; (void)sec_damping; const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; MAKE_TAPS; if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); // Primary pass LOAD_DIR(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); PRI_0_UPDATE_SUM(p); // Store STORE16_UNCLAMPED; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { LOAD_PIX4(in); // Primary pass LOAD_DIR4(p, in, po1, po2); CONSTRAIN(p, pri_strength, pri_damping); PRI_0_UPDATE_SUM(p); // Store STORE16_4_UNCLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)pri_damping; (void)coeff_shift; const int s1o1 = cdef_directions[dir + 2][0]; const int s1o2 = cdef_directions[dir + 2][1]; const int s2o1 = cdef_directions[dir - 2][0]; const int s2o2 = cdef_directions[dir - 2][1]; if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width; do { LOAD_PIX(in); // Secondary pass 1 LOAD_DIR(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); UPDATE_SUM(s2); // Store STORE16_UNCLAMPED; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { LOAD_PIX4(in); // Secondary pass 1 LOAD_DIR4(s, in, s1o1, s2o1); CONSTRAIN(s, sec_strength, sec_damping); SEC_0_UPDATE_SUM(s); // Secondary pass 2 LOAD_DIR4(s2, in, s1o2, s2o2); CONSTRAIN(s2, sec_strength, sec_damping); UPDATE_SUM(s2); // Store STORE16_4_UNCLAMPED; h -= 2; } while (h != 0); } } void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { (void)pri_strength; (void)sec_strength; (void)dir; (void)pri_damping; (void)sec_damping; (void)coeff_shift; if (block_width == 8) { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width; do { const vuint16m1_t px = __riscv_vle16_v_u16m1(in, 
vl); __riscv_vse16_v_u16m1(dst16, px, vl); in += CDEF_BSTRIDE; dst16 += dstride; } while (--h != 0); } else { uint16_t *dst16 = (uint16_t *)dest; int h = block_height; const size_t vl = block_width << 1; do { const vint16m1_t px = load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px); store_strided_u16_4x2(dst16, vdst, dstride, vl); in += 2 * CDEF_BSTRIDE; dst16 += 2 * dstride; h -= 2; } while (h != 0); } } aom-3.12.1/av1/common/scale.c000066400000000000000000000045171477627663500156220ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "av1/common/filter.h" #include "av1/common/scale.h" #include "aom_dsp/aom_filter.h" static int get_fixed_point_scale_factor(int other_size, int this_size) { // Calculate scaling factor once for each reference frame // and use fixed point scaling factors in decoding and encoding routines. // Hardware implementations can calculate scale factor in device driver // and use multiplication and shifting on hardware instead of division. return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; } // Given the fixed point scale, calculate coarse point scale. static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); } // Note: x and y are integer precision, mvq4 is q4 precision. MV32 av1_scale_mv(const MV *mvq4, int x, int y, const struct scale_factors *sf) { const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf); const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf); const MV32 res = { av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4, av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 }; return res; } void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { sf->x_scale_fp = REF_INVALID_SCALE; sf->y_scale_fp = REF_INVALID_SCALE; return; } sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); } aom-3.12.1/av1/common/scale.h000066400000000000000000000057461477627663500156340ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_SCALE_H_ #define AOM_AV1_COMMON_SCALE_H_ #include "av1/common/convolve.h" #include "av1/common/mv.h" #ifdef __cplusplus extern "C" { #endif #define SCALE_NUMERATOR 8 #define REF_SCALE_SHIFT 14 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor int x_step_q4; int y_step_q4; }; // Note: Expect val to be in q4 precision static inline int av1_scaled_x(int val, const struct scale_factors *sf) { const int off = (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); const int64_t tval = (int64_t)val * sf->x_scale_fp + off; return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS); } // Note: Expect val to be in q4 precision static inline int av1_scaled_y(int val, const struct scale_factors *sf) { const int off = (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); const int64_t tval = (int64_t)val * sf->y_scale_fp + off; return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS); } // Note: Expect val to be in q4 precision static inline int av1_unscaled_value(int val, const struct scale_factors *sf) { (void)sf; return val * (1 << SCALE_EXTRA_BITS); } MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); static inline int av1_is_valid_scale(const struct scale_factors *sf) { assert(sf != NULL); return sf->x_scale_fp != REF_INVALID_SCALE && sf->y_scale_fp != REF_INVALID_SCALE; } static inline int av1_is_scaled(const struct scale_factors *sf) { assert(sf != NULL); return av1_is_valid_scale(sf) && (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); } // See AV1 spec, Section 6.8.6. Frame size with refs semantics. static inline int valid_ref_frame_size(int ref_width, int ref_height, int this_width, int this_height) { return 2 * this_width >= ref_width && 2 * this_height >= ref_height && this_width <= 16 * ref_width && this_height <= 16 * ref_height; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_SCALE_H_ aom-3.12.1/av1/common/scan.c000066400000000000000000003721771477627663500154710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "av1/common/common_data.h" #include "av1/common/scan.h" DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = { 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = { 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = { 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = { 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { 0, 32, 1, 64, 33, 2, 
96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 
253, 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = { 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 
207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = { 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = { 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 
119, 120, 121, 122, 123, 124, 125, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = { 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = { 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 503, 472, 
441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 
171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 
377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 
449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, 495, 511, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, 30, 15, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, 141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, 188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190, 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, }; DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 
251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 
962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, }; DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 
84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, 991, 1023, }; DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 514, 545, 576, 608, 577, 546, 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423, 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 22, 53, 84, 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 
490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966, 997, 998, 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, 348, 379, 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908, 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791, 760, 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734, 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015, 1016, 985, 954, 923, 892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 
11, 14, 6, 12, 13, 15, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = { 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = { 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 
203, 211, 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74, 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83, 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140, 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149, 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158, 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167, 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176, 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185, 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194, 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, }; 
DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = { 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 
220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113, 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122, 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 
125, 126, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = { 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = { 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = { 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 408, 422, 435, 447, 458, 
468, 477, 485, 492, 498, 503, 507, 510, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506, 509, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107, 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108, 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109, 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110, 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111, 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112, 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113, 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114, 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115, 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116, 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117, 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118, 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119, 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344, 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347, 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348, 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349, 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350, 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351, 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352, 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353, 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354, 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355, 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356, 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357, 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358, 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359, 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 
172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 
377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, 495, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 
355, 387, 419, 451, 483, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 
208, 224, 240, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = { 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, 104, 105, 135, 1, 4, 8, 11, 19, 22, 34, 37, 53, 56, 76, 79, 103, 106, 134, 136, 5, 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, 133, 137, 164, 6, 13, 17, 24, 32, 39, 51, 58, 74, 81, 101, 108, 132, 138, 163, 165, 14, 16, 25, 31, 40, 50, 59, 73, 82, 100, 109, 131, 139, 162, 166, 189, 15, 26, 30, 41, 49, 60, 72, 83, 99, 110, 130, 140, 161, 167, 188, 190, 27, 29, 42, 48, 61, 71, 84, 98, 111, 129, 141, 160, 168, 187, 191, 210, 28, 43, 47, 62, 70, 85, 97, 112, 128, 142, 159, 169, 186, 192, 209, 211, 44, 46, 63, 69, 86, 96, 113, 127, 143, 158, 170, 185, 193, 208, 212, 227, 45, 64, 68, 87, 95, 114, 126, 144, 157, 171, 184, 194, 207, 213, 226, 228, 65, 67, 88, 94, 115, 125, 145, 156, 172, 183, 195, 206, 214, 225, 229, 240, 66, 89, 93, 116, 124, 146, 155, 173, 182, 196, 205, 215, 224, 230, 239, 241, 90, 92, 117, 123, 147, 154, 174, 181, 197, 204, 216, 223, 231, 238, 242, 249, 91, 118, 122, 148, 153, 175, 180, 198, 203, 217, 222, 232, 237, 243, 248, 250, 119, 121, 149, 152, 176, 179, 199, 202, 218, 221, 233, 236, 244, 247, 251, 254, 120, 150, 151, 177, 178, 200, 201, 219, 220, 234, 235, 245, 246, 252, 253, 255, }; DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 
245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 
956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, }; DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = { 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 
883, 915, 947, 979, 1011, 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, 991, 1023, }; DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, 104, 105, 135, 136, 170, 171, 209, 210, 252, 253, 299, 300, 350, 351, 405, 406, 464, 465, 527, 1, 4, 8, 11, 19, 22, 34, 37, 53, 56, 76, 79, 103, 106, 134, 137, 169, 172, 208, 211, 251, 254, 298, 301, 349, 352, 404, 407, 463, 466, 526, 528, 5, 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, 133, 138, 168, 173, 207, 212, 250, 255, 297, 302, 348, 353, 403, 408, 462, 467, 525, 529, 588, 6, 13, 17, 24, 32, 39, 51, 58, 74, 81, 101, 108, 132, 139, 167, 174, 206, 213, 249, 256, 296, 303, 347, 354, 402, 409, 461, 468, 524, 530, 587, 589, 14, 16, 25, 31, 40, 50, 59, 73, 82, 100, 109, 131, 140, 166, 175, 205, 214, 248, 257, 295, 304, 346, 355, 401, 410, 460, 469, 523, 531, 586, 590, 645, 15, 26, 30, 41, 49, 60, 72, 83, 99, 110, 130, 141, 165, 176, 204, 215, 247, 258, 294, 305, 345, 356, 400, 411, 459, 470, 522, 532, 585, 591, 644, 646, 27, 29, 42, 48, 61, 71, 84, 98, 111, 129, 142, 164, 177, 203, 216, 246, 259, 293, 306, 344, 357, 399, 412, 458, 471, 521, 533, 584, 592, 643, 647, 698, 28, 43, 47, 62, 70, 85, 97, 112, 128, 143, 163, 178, 202, 217, 245, 260, 292, 307, 343, 358, 398, 413, 457, 472, 520, 534, 583, 593, 642, 648, 697, 699, 44, 46, 63, 69, 86, 96, 113, 127, 144, 162, 179, 201, 218, 244, 261, 291, 308, 342, 359, 397, 414, 456, 473, 519, 535, 582, 594, 641, 649, 696, 700, 747, 45, 64, 68, 87, 95, 114, 126, 145, 161, 180, 200, 219, 243, 262, 290, 309, 341, 360, 396, 415, 455, 474, 518, 536, 581, 595, 640, 650, 695, 701, 746, 748, 65, 67, 88, 94, 115, 125, 146, 160, 181, 199, 220, 
242, 263, 289, 310, 340, 361, 395, 416, 454, 475, 517, 537, 580, 596, 639, 651, 694, 702, 745, 749, 792, 66, 89, 93, 116, 124, 147, 159, 182, 198, 221, 241, 264, 288, 311, 339, 362, 394, 417, 453, 476, 516, 538, 579, 597, 638, 652, 693, 703, 744, 750, 791, 793, 90, 92, 117, 123, 148, 158, 183, 197, 222, 240, 265, 287, 312, 338, 363, 393, 418, 452, 477, 515, 539, 578, 598, 637, 653, 692, 704, 743, 751, 790, 794, 833, 91, 118, 122, 149, 157, 184, 196, 223, 239, 266, 286, 313, 337, 364, 392, 419, 451, 478, 514, 540, 577, 599, 636, 654, 691, 705, 742, 752, 789, 795, 832, 834, 119, 121, 150, 156, 185, 195, 224, 238, 267, 285, 314, 336, 365, 391, 420, 450, 479, 513, 541, 576, 600, 635, 655, 690, 706, 741, 753, 788, 796, 831, 835, 870, 120, 151, 155, 186, 194, 225, 237, 268, 284, 315, 335, 366, 390, 421, 449, 480, 512, 542, 575, 601, 634, 656, 689, 707, 740, 754, 787, 797, 830, 836, 869, 871, 152, 154, 187, 193, 226, 236, 269, 283, 316, 334, 367, 389, 422, 448, 481, 511, 543, 574, 602, 633, 657, 688, 708, 739, 755, 786, 798, 829, 837, 868, 872, 903, 153, 188, 192, 227, 235, 270, 282, 317, 333, 368, 388, 423, 447, 482, 510, 544, 573, 603, 632, 658, 687, 709, 738, 756, 785, 799, 828, 838, 867, 873, 902, 904, 189, 191, 228, 234, 271, 281, 318, 332, 369, 387, 424, 446, 483, 509, 545, 572, 604, 631, 659, 686, 710, 737, 757, 784, 800, 827, 839, 866, 874, 901, 905, 932, 190, 229, 233, 272, 280, 319, 331, 370, 386, 425, 445, 484, 508, 546, 571, 605, 630, 660, 685, 711, 736, 758, 783, 801, 826, 840, 865, 875, 900, 906, 931, 933, 230, 232, 273, 279, 320, 330, 371, 385, 426, 444, 485, 507, 547, 570, 606, 629, 661, 684, 712, 735, 759, 782, 802, 825, 841, 864, 876, 899, 907, 930, 934, 957, 231, 274, 278, 321, 329, 372, 384, 427, 443, 486, 506, 548, 569, 607, 628, 662, 683, 713, 734, 760, 781, 803, 824, 842, 863, 877, 898, 908, 929, 935, 956, 958, 275, 277, 322, 328, 373, 383, 428, 442, 487, 505, 549, 568, 608, 627, 663, 682, 714, 733, 761, 780, 804, 823, 843, 862, 878, 897, 909, 928, 936, 955, 959, 978, 276, 323, 327, 374, 382, 429, 441, 488, 504, 550, 567, 609, 626, 664, 681, 715, 732, 762, 779, 805, 822, 844, 861, 879, 896, 910, 927, 937, 954, 960, 977, 979, 324, 326, 375, 381, 430, 440, 489, 503, 551, 566, 610, 625, 665, 680, 716, 731, 763, 778, 806, 821, 845, 860, 880, 895, 911, 926, 938, 953, 961, 976, 980, 995, 325, 376, 380, 431, 439, 490, 502, 552, 565, 611, 624, 666, 679, 717, 730, 764, 777, 807, 820, 846, 859, 881, 894, 912, 925, 939, 952, 962, 975, 981, 994, 996, 377, 379, 432, 438, 491, 501, 553, 564, 612, 623, 667, 678, 718, 729, 765, 776, 808, 819, 847, 858, 882, 893, 913, 924, 940, 951, 963, 974, 982, 993, 997, 1008, 378, 433, 437, 492, 500, 554, 563, 613, 622, 668, 677, 719, 728, 766, 775, 809, 818, 848, 857, 883, 892, 914, 923, 941, 950, 964, 973, 983, 992, 998, 1007, 1009, 434, 436, 493, 499, 555, 562, 614, 621, 669, 676, 720, 727, 767, 774, 810, 817, 849, 856, 884, 891, 915, 922, 942, 949, 965, 972, 984, 991, 999, 1006, 1010, 1017, 435, 494, 498, 556, 561, 615, 620, 670, 675, 721, 726, 768, 773, 811, 816, 850, 855, 885, 890, 916, 921, 943, 948, 966, 971, 985, 990, 1000, 1005, 1011, 1016, 1018, 495, 497, 557, 560, 616, 619, 671, 674, 722, 725, 769, 772, 812, 815, 851, 854, 886, 889, 917, 920, 944, 947, 967, 970, 986, 989, 1001, 1004, 1012, 1015, 1019, 1022, 496, 558, 559, 617, 618, 672, 673, 723, 724, 770, 771, 813, 814, 852, 853, 887, 888, 918, 919, 945, 946, 968, 969, 987, 988, 1002, 1003, 1013, 1014, 1020, 1021, 1023, }; const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { { // 
TX_4X4 { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { default_scan_4x4, av1_default_iscan_4x4 }, { mrow_scan_4x4, av1_mrow_iscan_4x4 }, { mcol_scan_4x4, av1_mcol_iscan_4x4 }, { mrow_scan_4x4, av1_mrow_iscan_4x4 }, { mcol_scan_4x4, av1_mcol_iscan_4x4 }, { mrow_scan_4x4, av1_mrow_iscan_4x4 }, { mcol_scan_4x4, av1_mcol_iscan_4x4 }, }, { // TX_8X8 { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { default_scan_8x8, av1_default_iscan_8x8 }, { mrow_scan_8x8, av1_mrow_iscan_8x8 }, { mcol_scan_8x8, av1_mcol_iscan_8x8 }, { mrow_scan_8x8, av1_mrow_iscan_8x8 }, { mcol_scan_8x8, av1_mcol_iscan_8x8 }, { mrow_scan_8x8, av1_mrow_iscan_8x8 }, { mcol_scan_8x8, av1_mcol_iscan_8x8 }, }, { // TX_16X16 { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { default_scan_16x16, av1_default_iscan_16x16 }, { mrow_scan_16x16, av1_mrow_iscan_16x16 }, { mcol_scan_16x16, av1_mcol_iscan_16x16 }, { mrow_scan_16x16, av1_mrow_iscan_16x16 }, { mcol_scan_16x16, av1_mcol_iscan_16x16 }, { mrow_scan_16x16, av1_mrow_iscan_16x16 }, { mcol_scan_16x16, av1_mcol_iscan_16x16 }, }, { // TX_32X32 { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
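// Only the top-left 32x32 quadrant of a 64x64 transform block carries coded coefficients in AV1 (the remaining positions are forced to zero), which is why the 32x32 scan below covers every coefficient that can be non-zero.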
{ default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X8 { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { default_scan_4x8, av1_default_iscan_4x8 }, { mrow_scan_4x8, av1_mrow_iscan_4x8 }, { mcol_scan_4x8, av1_mcol_iscan_4x8 }, { mrow_scan_4x8, av1_mrow_iscan_4x8 }, { mcol_scan_4x8, av1_mcol_iscan_4x8 }, { mrow_scan_4x8, av1_mrow_iscan_4x8 }, { mcol_scan_4x8, av1_mcol_iscan_4x8 }, }, { // TX_8X4 { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { default_scan_8x4, av1_default_iscan_8x4 }, { mrow_scan_8x4, av1_mrow_iscan_8x4 }, { mcol_scan_8x4, av1_mcol_iscan_8x4 }, { mrow_scan_8x4, av1_mrow_iscan_8x4 }, { mcol_scan_8x4, av1_mcol_iscan_8x4 }, { mrow_scan_8x4, av1_mrow_iscan_8x4 }, { mcol_scan_8x4, av1_mcol_iscan_8x4 }, }, { // TX_8X16 { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { default_scan_8x16, av1_default_iscan_8x16 }, { mrow_scan_8x16, av1_mrow_iscan_8x16 }, { mcol_scan_8x16, av1_mcol_iscan_8x16 }, { mrow_scan_8x16, av1_mrow_iscan_8x16 }, { mcol_scan_8x16, av1_mcol_iscan_8x16 }, { mrow_scan_8x16, av1_mrow_iscan_8x16 }, { mcol_scan_8x16, av1_mcol_iscan_8x16 }, }, { // TX_16X8 { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { default_scan_16x8, av1_default_iscan_16x8 }, { mrow_scan_16x8, av1_mrow_iscan_16x8 }, { mcol_scan_16x8, av1_mcol_iscan_16x8 }, { mrow_scan_16x8, av1_mrow_iscan_16x8 }, { mcol_scan_16x8, av1_mcol_iscan_16x8 }, { mrow_scan_16x8, 
av1_mrow_iscan_16x8 }, { mcol_scan_16x8, av1_mcol_iscan_16x8 }, }, { // TX_16X32 { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_32X16 { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, { // TX_32X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_64X32 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
{ default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { default_scan_32x32, av1_default_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, { mrow_scan_32x32, av1_mrow_iscan_32x32 }, { mcol_scan_32x32, av1_mcol_iscan_32x32 }, }, { // TX_4X16 { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { default_scan_4x16, av1_default_iscan_4x16 }, { mrow_scan_4x16, av1_mrow_iscan_4x16 }, { mcol_scan_4x16, av1_mcol_iscan_4x16 }, { mrow_scan_4x16, av1_mrow_iscan_4x16 }, { mcol_scan_4x16, av1_mcol_iscan_4x16 }, { mrow_scan_4x16, av1_mrow_iscan_4x16 }, { mcol_scan_4x16, av1_mcol_iscan_4x16 }, }, { // TX_16X4 { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { default_scan_16x4, av1_default_iscan_16x4 }, { mrow_scan_16x4, av1_mrow_iscan_16x4 }, { mcol_scan_16x4, av1_mcol_iscan_16x4 }, { mrow_scan_16x4, av1_mrow_iscan_16x4 }, { mcol_scan_16x4, av1_mcol_iscan_16x4 }, { mrow_scan_16x4, av1_mrow_iscan_16x4 }, { mcol_scan_16x4, av1_mcol_iscan_16x4 }, }, { // TX_8X32 { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { default_scan_8x32, av1_default_iscan_8x32 }, { mrow_scan_8x32, av1_mrow_iscan_8x32 }, { mcol_scan_8x32, av1_mcol_iscan_8x32 }, { mrow_scan_8x32, av1_mrow_iscan_8x32 }, { mcol_scan_8x32, av1_mcol_iscan_8x32 }, { mrow_scan_8x32, av1_mrow_iscan_8x32 }, { mcol_scan_8x32, av1_mcol_iscan_8x32 }, }, { // TX_32X8 { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { default_scan_32x8, av1_default_iscan_32x8 }, { mrow_scan_32x8, av1_mrow_iscan_32x8 }, { mcol_scan_32x8, av1_mcol_iscan_32x8 }, { mrow_scan_32x8, av1_mrow_iscan_32x8 }, { 
mcol_scan_32x8, av1_mcol_iscan_32x8 }, { mrow_scan_32x8, av1_mrow_iscan_32x8 }, { mcol_scan_32x8, av1_mcol_iscan_32x8 }, }, { // TX_16X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { default_scan_16x32, av1_default_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, { mrow_scan_16x32, av1_mrow_iscan_16x32 }, { mcol_scan_16x32, av1_mcol_iscan_16x32 }, }, { // TX_64X16 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { default_scan_32x16, av1_default_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, { mrow_scan_32x16, av1_mrow_iscan_32x16 }, { mcol_scan_32x16, av1_mcol_iscan_32x16 }, }, }; aom-3.12.1/av1/common/scan.h000066400000000000000000000027231477627663500154610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_SCAN_H_ #define AOM_AV1_COMMON_SCAN_H_ #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif #define MAX_NEIGHBORS 2 enum { SCAN_MODE_ZIG_ZAG, SCAN_MODE_COL_DIAG, SCAN_MODE_ROW_DIAG, SCAN_MODE_COL_1D, SCAN_MODE_ROW_1D, SCAN_MODES } UENUM1BYTE(SCAN_MODE); extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd); static inline const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, TX_TYPE tx_type) { return &av1_scan_orders[tx_size][tx_type]; } static inline const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { return get_default_scan(tx_size, tx_type); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_SCAN_H_ aom-3.12.1/av1/common/seg_common.c000066400000000000000000000063771477627663500166670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include "av1/common/av1_loopfilter.h" #include "av1/common/blockd.h" #include "av1/common/seg_common.h" #include "av1/common/quant_common.h" static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0, 0 }; static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0, 0 }; // These functions provide access to new segment level features. // Eventually these functions may be "optimized out" but for the moment, // the coding mechanism is still subject to change so these provide a // convenient single point of change. void av1_clearall_segfeatures(struct segmentation *seg) { av1_zero(seg->feature_data); av1_zero(seg->feature_mask); } void av1_calculate_segdata(struct segmentation *seg) { seg->segid_preskip = 0; seg->last_active_segid = 0; for (int i = 0; i < MAX_SEGMENTS; i++) { for (int j = 0; j < SEG_LVL_MAX; j++) { if (seg->feature_mask[i] & (1 << j)) { seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME); seg->last_active_segid = i; } } } } void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_mask[segment_id] |= 1 << feature_id; } int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { return seg_feature_data_max[feature_id]; } int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } // The 'seg_data' given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. // // Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for // SEGMENT_ALT_LF) // Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for // SEGMENT_ALT_LF) // // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use // the absolute values given). void av1_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { if (seg_data < 0) { assert(seg_feature_data_signed[feature_id]); assert(-seg_data <= seg_feature_data_max[feature_id]); } else { assert(seg_data <= seg_feature_data_max[feature_id]); } seg->feature_data[segment_id][feature_id] = seg_data; } // TBD? Functions to read and write segment data with range / validity checking aom-3.12.1/av1/common/seg_common.h000066400000000000000000000076541477627663500166710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #ifndef AOM_AV1_COMMON_SEG_COMMON_H_ #define AOM_AV1_COMMON_SEG_COMMON_H_ #include "aom_dsp/prob.h" #ifdef __cplusplus extern "C" { #endif #define MAX_SEGMENTS 8 #define SEG_TREE_PROBS (MAX_SEGMENTS - 1) #define SEG_TEMPORAL_PRED_CTXS 3 #define SPATIAL_PREDICTION_PROBS 3 enum { SEG_LVL_ALT_Q, // Use alternate Quantizer .... SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal SEG_LVL_ALT_LF_U, // Use alternate loop filter value on u plane SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane SEG_LVL_REF_FRAME, // Optional Segment reference frame SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode SEG_LVL_GLOBALMV, SEG_LVL_MAX } UENUM1BYTE(SEG_LVL_FEATURES); struct segmentation { uint8_t enabled; uint8_t update_map; uint8_t update_data; uint8_t temporal_update; int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; unsigned int feature_mask[MAX_SEGMENTS]; int last_active_segid; // The highest numbered segment id that has some // enabled feature. uint8_t segid_preskip; // Whether the segment id will be read before the // skip syntax element. // 1: the segment id will be read first. // 0: the skip syntax element will be read first. }; struct segmentation_probs { aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)]; aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS] [CDF_SIZE(MAX_SEGMENTS)]; }; static inline int segfeature_active(const struct segmentation *seg, uint8_t segment_id, SEG_LVL_FEATURES feature_id) { return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); } static inline void segfeatures_copy(struct segmentation *dst, const struct segmentation *src) { int i, j; for (i = 0; i < MAX_SEGMENTS; i++) { dst->feature_mask[i] = src->feature_mask[i]; for (j = 0; j < SEG_LVL_MAX; j++) { dst->feature_data[i][j] = src->feature_data[i][j]; } } dst->segid_preskip = src->segid_preskip; dst->last_active_segid = src->last_active_segid; } void av1_clearall_segfeatures(struct segmentation *seg); void av1_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); void av1_calculate_segdata(struct segmentation *seg); int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); void av1_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data); static inline int get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { return seg->feature_data[segment_id][feature_id]; } static inline void set_segment_id(uint8_t *segment_ids, int mi_offset, int x_mis, int y_mis, int mi_stride, uint8_t segment_id) { segment_ids += mi_offset; for (int y = 0; y < y_mis; ++y) { memset(&segment_ids[y * mi_stride], segment_id, x_mis * sizeof(segment_ids[0])); } } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_SEG_COMMON_H_ aom-3.12.1/av1/common/thread_common.c000066400000000000000000001311321477627663500173440ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom/aom_image.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/blockd.h" #include "av1/common/cdef.h" #include "av1/common/entropymode.h" #include "av1/common/enums.h" #include "av1/common/thread_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/restoration.h" // Set up nsync by width. static inline int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) return 1; else if (width <= 1280) return 2; else if (width <= 4096) return 4; else return 8; } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline int get_lr_sync_range(int width) { #if 0 // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) return 1; else if (width <= 1280) return 2; else if (width <= 4096) return 4; else return 8; #else (void)width; return 1; #endif } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Allocate memory for lf row synchronization void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, int width, int num_workers) { lf_sync->rows = rows; #if CONFIG_MULTITHREAD { int i, j; for (j = 0; j < MAX_MB_PLANE; j++) { CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); if (lf_sync->mutex_[j]) { for (i = 0; i < rows; ++i) { pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); } } CHECK_MEM_ERROR(cm, lf_sync->cond_[j], aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); if (lf_sync->cond_[j]) { for (i = 0; i < rows; ++i) { pthread_cond_init(&lf_sync->cond_[j][i], NULL); } } } CHECK_MEM_ERROR(cm, lf_sync->job_mutex, aom_malloc(sizeof(*(lf_sync->job_mutex)))); if (lf_sync->job_mutex) { pthread_mutex_init(lf_sync->job_mutex, NULL); } } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, lf_sync->lfdata, aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); lf_sync->num_workers = num_workers; for (int j = 0; j < MAX_MB_PLANE; j++) { CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); } CHECK_MEM_ERROR( cm, lf_sync->job_queue, aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); // Set up nsync. 
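// For example, with get_sync_range() above a 1920-wide frame gets a sync range of 4, so sync_read()/sync_write() only synchronize on every 4th superblock column of a row instead of on every column.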
lf_sync->sync_range = get_sync_range(width); } // Deallocate lf synchronization related mutex and data void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { if (lf_sync != NULL) { int j; #if CONFIG_MULTITHREAD int i; for (j = 0; j < MAX_MB_PLANE; j++) { if (lf_sync->mutex_[j] != NULL) { for (i = 0; i < lf_sync->rows; ++i) { pthread_mutex_destroy(&lf_sync->mutex_[j][i]); } aom_free(lf_sync->mutex_[j]); } if (lf_sync->cond_[j] != NULL) { for (i = 0; i < lf_sync->rows; ++i) { pthread_cond_destroy(&lf_sync->cond_[j][i]); } aom_free(lf_sync->cond_[j]); } } if (lf_sync->job_mutex != NULL) { pthread_mutex_destroy(lf_sync->job_mutex); aom_free(lf_sync->job_mutex); } #endif // CONFIG_MULTITHREAD aom_free(lf_sync->lfdata); for (j = 0; j < MAX_MB_PLANE; j++) { aom_free(lf_sync->cur_sb_col[j]); } aom_free(lf_sync->job_queue); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. av1_zero(*lf_sync); } } void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, int num_workers) { if (num_workers < 1) return; #if CONFIG_MULTITHREAD if (cdef_sync->mutex_ == NULL) { CHECK_MEM_ERROR(cm, cdef_sync->mutex_, aom_malloc(sizeof(*(cdef_sync->mutex_)))); if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); } #else (void)cm; (void)cdef_sync; #endif // CONFIG_MULTITHREAD } void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { if (cdef_sync == NULL) return; #if CONFIG_MULTITHREAD if (cdef_sync->mutex_ != NULL) { pthread_mutex_destroy(cdef_sync->mutex_); aom_free(cdef_sync->mutex_); } #endif // CONFIG_MULTITHREAD } static inline void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, int row) { if (!row) return; #if CONFIG_MULTITHREAD AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_); while (cdef_row_mt[row - 1].is_row_done != 1) pthread_cond_wait(cdef_row_mt[row - 1].row_cond_, cdef_row_mt[row - 1].row_mutex_); cdef_row_mt[row - 1].is_row_done = 0; pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_); #else (void)cdef_sync; #endif // CONFIG_MULTITHREAD } static inline void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, int row) { #if CONFIG_MULTITHREAD AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; pthread_mutex_lock(cdef_row_mt[row].row_mutex_); pthread_cond_signal(cdef_row_mt[row].row_cond_); cdef_row_mt[row].is_row_done = 1; pthread_mutex_unlock(cdef_row_mt[row].row_mutex_); #else (void)cdef_sync; (void)row; #endif // CONFIG_MULTITHREAD } static inline void sync_read(AV1LfSync *const lf_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; pthread_mutex_lock(mutex); while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) { pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); } pthread_mutex_unlock(mutex); } #else (void)lf_sync; (void)r; (void)c; (void)plane; #endif // CONFIG_MULTITHREAD } static inline void sync_write(AV1LfSync *const lf_sync, int r, int c, const int sb_cols, int plane) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; int cur; // Only signal when there are enough filtered SB for next row to run. int sig = 1; if (c < sb_cols - 1) { cur = c; if (c % nsync) sig = 0; } else { cur = sb_cols + nsync; } if (sig) { pthread_mutex_lock(&lf_sync->mutex_[plane][r]); // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum // column number. 
In this case, the AOMMAX operation here ensures that // cur_sb_col[plane][r] is not overwritten with a smaller value thus // preventing the infinite waiting of threads in the relevant sync_read() // function. lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&lf_sync->cond_[plane][r]); pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); } #else (void)lf_sync; (void)r; (void)c; (void)sb_cols; (void)plane; #endif // CONFIG_MULTITHREAD } // One job of row loopfiltering. void av1_thread_loop_filter_rows( const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane, int dir, int lpf_opt_level, AV1LfSync *const lf_sync, struct aom_internal_error_info *error_info, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { // TODO(aomedia:3276): Pass error_info to the low-level functions as required // in future to handle error propagation. (void)error_info; const int sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2); const int r = mi_row >> num_mis_in_lpf_unit_height_log2; int mi_col, c; const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y; const int num_planes = joint_filter_chroma ? 2 : 1; assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U)); if (dir == 0) { for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + num_planes); if (lpf_opt_level) { if (plane == AOM_PLANE_Y) { av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, num_mis_in_lpf_unit_height_log2); } else { av1_filter_block_plane_vert_opt_chroma( cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, joint_filter_chroma, num_mis_in_lpf_unit_height_log2); } } else { av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, mi_col); } if (lf_sync != NULL) { sync_write(lf_sync, r, c, sb_cols, plane); } } } else if (dir == 1) { for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; if (lf_sync != NULL) { // Wait for vertical edge filtering of the top-right block to be // completed sync_read(lf_sync, r, c, plane); // Wait for vertical edge filtering of the right block to be completed sync_read(lf_sync, r + 1, c, plane); } #if CONFIG_MULTITHREAD if (lf_sync && lf_sync->num_workers > 1) { pthread_mutex_lock(lf_sync->job_mutex); const bool lf_mt_exit = lf_sync->lf_mt_exit; pthread_mutex_unlock(lf_sync->job_mutex); // Exit in case any worker has encountered an error. 
if (lf_mt_exit) return; } #endif av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + num_planes); if (lpf_opt_level) { if (plane == AOM_PLANE_Y) { av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, num_mis_in_lpf_unit_height_log2); } else { av1_filter_block_plane_horz_opt_chroma( cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, joint_filter_chroma, num_mis_in_lpf_unit_height_log2); } } else { av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, mi_col); } } } } void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync, int num_mis_in_lpf_unit_height_log2) { int plane, sb_row; const int sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, num_mis_in_lpf_unit_height_log2); const int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2); // In case of loopfilter row-multithreading, the worker on an SB row waits for // the vertical edge filtering of the right and top-right SBs. Hence, in case // a thread (main/worker) encounters an error, update that vertical // loopfiltering of every SB row in the frame is complete in order to avoid // dependent workers waiting indefinitely. for (sb_row = 0; sb_row < sb_rows; ++sb_row) for (plane = 0; plane < MAX_MB_PLANE; ++plane) sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane); } static inline void sync_lf_workers(AVxWorker *const workers, AV1_COMMON *const cm, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int had_error = workers[0].had_error; struct aom_internal_error_info error_info; // Read the error_info of main thread. if (had_error) { AVxWorker *const worker = &workers[0]; error_info = ((LFWorkerData *)worker->data2)->error_info; } // Wait till all rows are finished. for (int i = num_workers - 1; i > 0; --i) { AVxWorker *const worker = &workers[i]; if (!winterface->sync(worker)) { had_error = 1; error_info = ((LFWorkerData *)worker->data2)->error_info; } } if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Row-based multi-threaded loopfilter hook static int loop_filter_row_worker(void *arg1, void *arg2) { AV1LfSync *const lf_sync = (AV1LfSync *)arg1; LFWorkerData *const lf_data = (LFWorkerData *)arg2; AV1LfMTInfo *cur_job_info; #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex_ = lf_sync->job_mutex; #endif struct aom_internal_error_info *const error_info = &lf_data->error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
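  /* Editor's note: the sketch below shows, in isolation, the setjmp()/longjmp()
   * error-handling pattern this worker hook (and the other hooks in this file)
   * relies on: the error path longjmp()s back to the armed jmp_buf, and the
   * hook disarms it before returning.  It is NOT part of libaom; the names
   * (report_error, do_row_work, setjmp_armed, error_ctx) are hypothetical
   * stand-ins for aom_internal_error() and the 'setjmp' field of
   * aom_internal_error_info. */
#if 0
#include <setjmp.h>
#include <stdio.h>

struct error_ctx {
  jmp_buf jmp;
  int setjmp_armed;
};

/* Stand-in for aom_internal_error(): jump back into the worker hook. */
static void report_error(struct error_ctx *ec) {
  if (ec->setjmp_armed) longjmp(ec->jmp, 1);
}

static void do_row_work(struct error_ctx *ec, int row) {
  if (row == 3) report_error(ec);  /* simulated failure on one row */
}

static int worker_hook(struct error_ctx *ec) {
  if (setjmp(ec->jmp)) {
    /* Error path: the jmp_buf is only valid while this function is live, so
     * disarm it before returning (cf. the comment above). */
    ec->setjmp_armed = 0;
    return 0;
  }
  ec->setjmp_armed = 1;
  for (int row = 0; row < 8; ++row) do_row_work(ec, row);
  ec->setjmp_armed = 0;
  return 1;
}

int main(void) {
  struct error_ctx ec;
  ec.setjmp_armed = 0;
  printf("worker hook returned %d\n", worker_hook(&ec));  /* prints 0 */
  return 0;
}
#endif  /* end of illustrative sketch */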
if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(job_mutex_); lf_sync->lf_mt_exit = true; pthread_mutex_unlock(job_mutex_); #endif av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2); return 0; } error_info->setjmp = 1; while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) { const int lpf_opt_level = cur_job_info->lpf_opt_level; av1_thread_loop_filter_rows( lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir, lpf_opt_level, lf_sync, error_info, lf_data->params_buf, lf_data->tx_buf, MAX_MIB_SIZE_LOG2); } error_info->setjmp = 0; return 1; } static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, const int planes_to_lf[MAX_MB_PLANE], AVxWorker *workers, int num_workers, AV1LfSync *lf_sync, int lpf_opt_level) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync, lpf_opt_level, MAX_MIB_SIZE_LOG2); // Set up loopfilter thread data. for (i = num_workers - 1; i >= 0; --i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; // Loopfilter data loop_filter_data_reset(lf_data, frame, cm, xd); // Start loopfiltering worker->had_error = 0; if (i == 0) { winterface->execute(worker); } else { winterface->launch(worker); } } sync_lf_workers(workers, cm, num_workers); } static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int start, int stop, const int planes_to_lf[MAX_MB_PLANE], int lpf_opt_level) { // Filter top rows of all planes first, in case the output can be partially // reconstructed row by row. int mi_row, plane, dir; AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; TX_SIZE tx_buf[MAX_MIB_SIZE]; for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (plane = 0; plane < MAX_MB_PLANE; ++plane) { if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) { continue; } for (dir = 0; dir < 2; ++dir) { av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, dir, lpf_opt_level, /*lf_sync=*/NULL, xd->error_info, params_buf, tx_buf, MAX_MIB_SIZE_LOG2); } } } } void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, int plane_start, int plane_end, int partial_frame, AVxWorker *workers, int num_workers, AV1LfSync *lf_sync, int lpf_opt_level) { int start_mi_row, end_mi_row, mi_rows_to_filter; int planes_to_lf[MAX_MB_PLANE]; if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start, plane_end)) return; start_mi_row = 0; mi_rows_to_filter = cm->mi_params.mi_rows; if (partial_frame && cm->mi_params.mi_rows > 8) { start_mi_row = cm->mi_params.mi_rows >> 1; start_mi_row &= 0xfffffff8; mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; av1_loop_filter_frame_init(cm, plane_start, plane_end); if (num_workers > 1) { // Enqueue and execute loopfiltering jobs. loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, workers, num_workers, lf_sync, lpf_opt_level); } else { // Directly filter in the main thread. 
loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, lpf_opt_level); } } #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER static inline void lr_sync_read(void *const lr_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; const int nsync = loop_res_sync->sync_range; if (r && !(c & (nsync - 1))) { pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; pthread_mutex_lock(mutex); while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); } pthread_mutex_unlock(mutex); } #else (void)lr_sync; (void)r; (void)c; (void)plane; #endif // CONFIG_MULTITHREAD } static inline void lr_sync_write(void *const lr_sync, int r, int c, const int sb_cols, int plane) { #if CONFIG_MULTITHREAD AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; const int nsync = loop_res_sync->sync_range; int cur; // Only signal when there are enough filtered SB for next row to run. int sig = 1; if (c < sb_cols - 1) { cur = c; if (c % nsync) sig = 0; } else { cur = sb_cols + nsync; } if (sig) { pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum // column number. In this case, the AOMMAX operation here ensures that // cur_sb_col[plane][r] is not overwritten with a smaller value thus // preventing the infinite waiting of threads in the relevant sync_read() // function. loop_res_sync->cur_sb_col[plane][r] = AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); } #else (void)lr_sync; (void)r; (void)c; (void)sb_cols; (void)plane; #endif // CONFIG_MULTITHREAD } // Allocate memory for loop restoration row synchronization void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, int num_workers, int num_rows_lr, int num_planes, int width) { lr_sync->rows = num_rows_lr; lr_sync->num_planes = num_planes; #if CONFIG_MULTITHREAD { int i, j; for (j = 0; j < num_planes; j++) { CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); if (lr_sync->mutex_[j]) { for (i = 0; i < num_rows_lr; ++i) { pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); } } CHECK_MEM_ERROR(cm, lr_sync->cond_[j], aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); if (lr_sync->cond_[j]) { for (i = 0; i < num_rows_lr; ++i) { pthread_cond_init(&lr_sync->cond_[j][i], NULL); } } } CHECK_MEM_ERROR(cm, lr_sync->job_mutex, aom_malloc(sizeof(*(lr_sync->job_mutex)))); if (lr_sync->job_mutex) { pthread_mutex_init(lr_sync->job_mutex, NULL); } } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata)))); lr_sync->num_workers = num_workers; for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { if (worker_idx < num_workers - 1) { CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, aom_malloc(sizeof(RestorationLineBuffers))); } else { lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; } } for (int j = 0; j < num_planes; j++) { CHECK_MEM_ERROR( cm, lr_sync->cur_sb_col[j], aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); } CHECK_MEM_ERROR( cm, lr_sync->job_queue, aom_malloc(sizeof(*(lr_sync->job_queue)) * 
num_rows_lr * num_planes)); // Set up nsync. lr_sync->sync_range = get_lr_sync_range(width); } // Deallocate loop restoration synchronization related mutex and data void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) { if (lr_sync != NULL) { int j; #if CONFIG_MULTITHREAD int i; for (j = 0; j < MAX_MB_PLANE; j++) { if (lr_sync->mutex_[j] != NULL) { for (i = 0; i < lr_sync->rows; ++i) { pthread_mutex_destroy(&lr_sync->mutex_[j][i]); } aom_free(lr_sync->mutex_[j]); } if (lr_sync->cond_[j] != NULL) { for (i = 0; i < lr_sync->rows; ++i) { pthread_cond_destroy(&lr_sync->cond_[j][i]); } aom_free(lr_sync->cond_[j]); } } if (lr_sync->job_mutex != NULL) { pthread_mutex_destroy(lr_sync->job_mutex); aom_free(lr_sync->job_mutex); } #endif // CONFIG_MULTITHREAD for (j = 0; j < MAX_MB_PLANE; j++) { aom_free(lr_sync->cur_sb_col[j]); } aom_free(lr_sync->job_queue); if (lr_sync->lrworkerdata) { for (int worker_idx = 0; worker_idx < lr_sync->num_workers - 1; worker_idx++) { LRWorkerData *const workerdata_data = lr_sync->lrworkerdata + worker_idx; aom_free(workerdata_data->rst_tmpbuf); aom_free(workerdata_data->rlbs); } aom_free(lr_sync->lrworkerdata); } // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. av1_zero(*lr_sync); } } static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, AV1_COMMON *cm) { FilterFrameCtxt *ctxt = lr_ctxt->ctxt; const int num_planes = av1_num_planes(cm); AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; int32_t lr_job_counter[2], num_even_lr_jobs = 0; lr_sync->jobs_enqueued = 0; lr_sync->jobs_dequeued = 0; for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; num_even_lr_jobs = num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1); } lr_job_counter[0] = 0; lr_job_counter[1] = num_even_lr_jobs; for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int is_uv = plane > 0; const int ss_y = is_uv && cm->seq_params->subsampling_y; const int unit_size = ctxt[plane].rsi->restoration_unit_size; const int plane_h = ctxt[plane].plane_h; const int ext_size = unit_size * 3 / 2; int y0 = 0, i = 0; while (y0 < plane_h) { int remaining_h = plane_h - y0; int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; RestorationTileLimits limits; limits.v_start = y0; limits.v_end = y0 + h; assert(limits.v_end <= plane_h); // Offset upwards to align with the restoration processing stripe const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; limits.v_start = AOMMAX(0, limits.v_start - voffset); if (limits.v_end < plane_h) limits.v_end -= voffset; assert(lr_job_counter[0] <= num_even_lr_jobs); lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; lr_job_queue[lr_job_counter[i & 1]].plane = plane; lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; if ((i & 1) == 0) { lr_job_queue[lr_job_counter[i & 1]].v_copy_start = limits.v_start + RESTORATION_BORDER; lr_job_queue[lr_job_counter[i & 1]].v_copy_end = limits.v_end - RESTORATION_BORDER; if (i == 0) { assert(limits.v_start == 0); lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0; } if (i == (ctxt[plane].rsi->vert_units - 1)) { assert(limits.v_end == plane_h); lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h; } } else { lr_job_queue[lr_job_counter[i & 1]].v_copy_start = AOMMAX(limits.v_start - RESTORATION_BORDER, 0); lr_job_queue[lr_job_counter[i & 1]].v_copy_end = AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h); } lr_job_counter[i & 1]++; lr_sync->jobs_enqueued++; y0 += h; ++i; } } } static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { AV1LrMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD pthread_mutex_lock(lr_sync->job_mutex); if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; lr_sync->jobs_dequeued++; } pthread_mutex_unlock(lr_sync->job_mutex); #else (void)lr_sync; #endif return cur_job_info; } static void set_loop_restoration_done(AV1LrSync *const lr_sync, FilterFrameCtxt *const ctxt) { for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { if (ctxt[plane].rsi->frame_restoration_type == RESTORE_NONE) continue; int y0 = 0, row_number = 0; const int unit_size = ctxt[plane].rsi->restoration_unit_size; const int plane_h = ctxt[plane].plane_h; const int ext_size = unit_size * 3 / 2; const int hnum_rest_units = ctxt[plane].rsi->horz_units; while (y0 < plane_h) { const int remaining_h = plane_h - y0; const int h = (remaining_h < ext_size) ? remaining_h : unit_size; lr_sync_write(lr_sync, row_number, hnum_rest_units - 1, hnum_rest_units, plane); y0 += h; ++row_number; } } } // Implement row loop restoration for each thread. static int loop_restoration_row_worker(void *arg1, void *arg2) { AV1LrSync *const lr_sync = (AV1LrSync *)arg1; LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; FilterFrameCtxt *ctxt = lr_ctxt->ctxt; int lr_unit_row; int plane; int plane_w; #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex_ = lr_sync->job_mutex; #endif struct aom_internal_error_info *const error_info = &lrworkerdata->error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(job_mutex_); lr_sync->lr_mt_exit = true; pthread_mutex_unlock(job_mutex_); #endif // In case of loop restoration multithreading, the worker on an even lr // block row waits for the completion of the filtering of the top-right and // bottom-right blocks. 
Hence, in case a thread (main/worker) encounters an // error, update that filtering of every row in the frame is complete in // order to avoid the dependent workers from waiting indefinitely. set_loop_restoration_done(lr_sync, lr_ctxt->ctxt); return 0; } error_info->setjmp = 1; typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, int vstart, int vend); static const copy_fun copy_funs[MAX_MB_PLANE] = { aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u, aom_yv12_partial_coloc_copy_v }; while (1) { AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); if (cur_job_info != NULL) { RestorationTileLimits limits; sync_read_fn_t on_sync_read; sync_write_fn_t on_sync_write; limits.v_start = cur_job_info->v_start; limits.v_end = cur_job_info->v_end; lr_unit_row = cur_job_info->lr_unit_row; plane = cur_job_info->plane; plane_w = ctxt[plane].plane_w; // sync_mode == 1 implies only sync read is required in LR Multi-threading // sync_mode == 0 implies only sync write is required. on_sync_read = cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write : av1_lr_sync_write_dummy; av1_foreach_rest_unit_in_row( &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row, ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units, ctxt[plane].rsi->vert_units, plane, &ctxt[plane], lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, on_sync_write, lr_sync, error_info); copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w, cur_job_info->v_copy_start, cur_job_info->v_copy_end); if (lrworkerdata->do_extend_border) { aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane, cur_job_info->v_copy_start, cur_job_info->v_copy_end); } } else { break; } } error_info->setjmp = 0; return 1; } static inline void sync_lr_workers(AVxWorker *const workers, AV1_COMMON *const cm, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int had_error = workers[0].had_error; struct aom_internal_error_info error_info; // Read the error_info of main thread. if (had_error) { AVxWorker *const worker = &workers[0]; error_info = ((LRWorkerData *)worker->data2)->error_info; } // Wait till all rows are finished. 
for (int i = num_workers - 1; i > 0; --i) { AVxWorker *const worker = &workers[i]; if (!winterface->sync(worker)) { had_error = 1; error_info = ((LRWorkerData *)worker->data2)->error_info; } } if (had_error) aom_internal_error_copy(cm->error, &error_info); } static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, AVxWorker *workers, int num_workers, AV1LrSync *lr_sync, AV1_COMMON *cm, int do_extend_border) { FilterFrameCtxt *ctxt = lr_ctxt->ctxt; const int num_planes = av1_num_planes(cm); const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int num_rows_lr = 0; for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int plane_h = ctxt[plane].plane_h; const int unit_size = cm->rst_info[plane].restoration_unit_size; num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h)); } int i; assert(MAX_MB_PLANE == 3); if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) { av1_loop_restoration_dealloc(lr_sync); av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, cm->width); } lr_sync->lr_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. for (i = 0; i < num_planes; i++) { memset(lr_sync->cur_sb_col[i], -1, sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); } enqueue_lr_jobs(lr_sync, lr_ctxt, cm); // Set up looprestoration thread data. for (i = num_workers - 1; i >= 0; --i) { AVxWorker *const worker = &workers[i]; lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; lr_sync->lrworkerdata[i].do_extend_border = do_extend_border; worker->hook = loop_restoration_row_worker; worker->data1 = lr_sync; worker->data2 = &lr_sync->lrworkerdata[i]; // Start loop restoration worker->had_error = 0; if (i == 0) { winterface->execute(worker); } else { winterface->launch(worker); } } sync_lr_workers(workers, cm, num_workers); } void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, AVxWorker *workers, int num_workers, AV1LrSync *lr_sync, void *lr_ctxt, int do_extend_border) { assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, optimized_lr, num_planes); foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, cm, do_extend_border); } #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER // Initializes cdef_sync parameters. static inline void reset_cdef_job_info(AV1CdefSync *const cdef_sync) { cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; cdef_sync->cdef_mt_exit = false; } static inline void launch_cdef_workers(AVxWorker *const workers, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &workers[i]; worker->had_error = 0; if (i == 0) winterface->execute(worker); else winterface->launch(worker); } } static inline void sync_cdef_workers(AVxWorker *const workers, AV1_COMMON *const cm, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int had_error = workers[0].had_error; struct aom_internal_error_info error_info; // Read the error_info of main thread. if (had_error) { AVxWorker *const worker = &workers[0]; error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; } // Wait till all rows are finished. 
for (int i = num_workers - 1; i > 0; --i) { AVxWorker *const worker = &workers[i]; if (!winterface->sync(worker)) { had_error = 1; error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; } } if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Updates the row index of the next job to be processed. // Also updates end_of_frame flag when the processing of all rows is complete. static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync, const int nvfb) { cdef_sync->fbr++; if (cdef_sync->fbr == nvfb) { cdef_sync->end_of_frame = 1; } } // Checks if a job is available. If job is available, // populates next job information and returns 1, else returns 0. static inline int get_cdef_row_next_job(AV1CdefSync *const cdef_sync, volatile int *cur_fbr, const int nvfb) { #if CONFIG_MULTITHREAD pthread_mutex_lock(cdef_sync->mutex_); #endif // CONFIG_MULTITHREAD int do_next_row = 0; // Populates information needed for current job and update the row // index of the next row to be processed. if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) { do_next_row = 1; *cur_fbr = cdef_sync->fbr; update_cdef_row_next_job_info(cdef_sync, nvfb); } #if CONFIG_MULTITHREAD pthread_mutex_unlock(cdef_sync->mutex_); #endif // CONFIG_MULTITHREAD return do_next_row; } static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) { for (int fbr = 0; fbr < nvfb; fbr++) cdef_row_mt_sync_write(cdef_sync, fbr); } // Hook function for each thread in CDEF multi-threading. static int cdef_sb_row_worker_hook(void *arg1, void *arg2) { AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1; AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2; AV1_COMMON *cm = cdef_worker->cm; const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex_ = cdef_sync->mutex_; #endif struct aom_internal_error_info *const error_info = &cdef_worker->error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(job_mutex_); cdef_sync->cdef_mt_exit = true; pthread_mutex_unlock(job_mutex_); #endif // In case of cdef row-multithreading, the worker on a filter block row // (fbr) waits for the line buffers (top and bottom) copy of the above row. // Hence, in case a thread (main/worker) encounters an error before copying // of the line buffers, update that line buffer copy is complete in order to // avoid dependent workers waiting indefinitely. 
set_cdef_init_fb_row_done(cdef_sync, nvfb); return 0; } error_info->setjmp = 1; volatile int cur_fbr; const int num_planes = av1_num_planes(cm); while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) { MACROBLOCKD *xd = cdef_worker->xd; av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr, cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info); if (cdef_worker->do_extend_border) { for (int plane = 0; plane < num_planes; ++plane) { const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf; const int is_uv = plane > 0; const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; const int unit_height = MI_SIZE_64X64 << mi_high; const int v_start = cur_fbr * unit_height; const int v_end = AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]); aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end); } } } error_info->setjmp = 0; return 1; } // Assigns CDEF hook function and thread data to each worker. static void prepare_cdef_frame_workers( AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker, AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync, int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, int do_extend_border) { const int num_planes = av1_num_planes(cm); cdef_worker[0].srcbuf = cm->cdef_info.srcbuf; for (int plane = 0; plane < num_planes; plane++) cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane]; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &workers[i]; cdef_worker[i].cm = cm; cdef_worker[i].xd = xd; cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn; cdef_worker[i].do_extend_border = do_extend_border; for (int plane = 0; plane < num_planes; plane++) cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane]; worker->hook = hook; worker->data1 = cdef_sync; worker->data2 = &cdef_worker[i]; } } // Initializes row-level parameters for CDEF frame. void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, struct AV1CdefSyncData *const cdef_sync, int fbr) { const int num_planes = av1_num_planes(cm); const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int luma_stride = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); // for the current filter block, it's top left corner mi structure (mi_tl) // is first accessed to check whether the top and left boundaries are // frame boundaries. Then bottom-left and top-right mi structures are // accessed to check whether the bottom and right boundaries // (respectively) are frame boundaries. // // Note that we can't just check the bottom-right mi structure - eg. if // we're at the right-hand edge of the frame but not the bottom, then // the bottom-right mi is NULL but the bottom-left is not. fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; if (fbr != nvfb - 1) fb_info->frame_boundary[BOTTOM] = (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 
1 : 0; else fb_info->frame_boundary[BOTTOM] = 1; fb_info->src = src; fb_info->damping = cm->cdef_info.cdef_damping; fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); av1_zero(fb_info->dir); av1_zero(fb_info->var); for (int plane = 0; plane < num_planes; plane++) { const int stride = luma_stride >> xd->plane[plane].subsampling_x; uint16_t *top_linebuf = &linebuf[plane][0]; uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride]; { const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; if (fbr != nvfb - 1) // if (fbr != 0) // top line buffer copy av1_cdef_copy_sb8_16( cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride, xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0, xd->plane[plane].dst.stride, CDEF_VBORDER, stride); if (fbr != nvfb - 1) // bottom line buffer copy av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride], stride, xd->plane[plane].dst.buf, bot_offset, 0, xd->plane[plane].dst.stride, CDEF_VBORDER, stride); } fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride]; fb_info->bot_linebuf[plane] = &linebuf[plane] [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)]; } cdef_row_mt_sync_write(cdef_sync, fbr); cdef_row_mt_sync_read(cdef_sync, fbr); } // Implements multi-threading for CDEF. // Perform CDEF on input frame. // Inputs: // frame: Pointer to input frame buffer. // cm: Pointer to common structure. // xd: Pointer to common current coding block structure. // Returns: // Nothing will be returned. void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, AV1CdefWorkerData *const cdef_worker, AVxWorker *const workers, AV1CdefSync *const cdef_sync, int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, int do_extend_border) { YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf; const int num_planes = av1_num_planes(cm); av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, num_planes); reset_cdef_job_info(cdef_sync); prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook, workers, cdef_sync, num_workers, cdef_init_fb_row_fn, do_extend_border); launch_cdef_workers(workers, num_workers); sync_cdef_workers(workers, cm, num_workers); } int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) { // No additional top-right delay when intraBC tool is not enabled. if (!av1_allow_intrabc(cm)) return 0; // Due to the hardware constraints on processing the intraBC tool with row // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5 // superblocks of size 64x64 is mandated. However, a minimum top-right delay // of 1 superblock is assured with 'sync_range'. Hence return only the // additional superblock delay when the intraBC tool is enabled. return cm->seq_params->sb_size == BLOCK_128X128 ? 2 : 4; } aom-3.12.1/av1/common/thread_common.h000066400000000000000000000310471477627663500173550ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ #define AOM_AV1_COMMON_THREAD_COMMON_H_ #include "config/aom_config.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/cdef.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus extern "C" { #endif struct AV1Common; typedef struct AV1LfMTInfo { int mi_row; int plane; int dir; int lpf_opt_level; } AV1LfMTInfo; // Loopfilter row synchronization typedef struct AV1LfSyncData { #if CONFIG_MULTITHREAD pthread_mutex_t *mutex_[MAX_MB_PLANE]; pthread_cond_t *cond_[MAX_MB_PLANE]; #endif // Allocate memory to store the loop-filtered superblock index in each row. int *cur_sb_col[MAX_MB_PLANE]; // The optimal sync_range for different resolution and platform should be // determined by testing. Currently, it is chosen to be a power-of-2 number. int sync_range; int rows; // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex; #endif AV1LfMTInfo *job_queue; int jobs_enqueued; int jobs_dequeued; // Initialized to false, set to true by the worker thread that encounters an // error in order to abort the processing of other worker threads. bool lf_mt_exit; } AV1LfSync; typedef struct AV1LrMTInfo { int v_start; int v_end; int lr_unit_row; int plane; int sync_mode; int v_copy_start; int v_copy_end; } AV1LrMTInfo; typedef struct LoopRestorationWorkerData { int32_t *rst_tmpbuf; void *rlbs; void *lr_ctxt; int do_extend_border; struct aom_internal_error_info error_info; } LRWorkerData; // Looprestoration row synchronization typedef struct AV1LrSyncData { #if CONFIG_MULTITHREAD pthread_mutex_t *mutex_[MAX_MB_PLANE]; pthread_cond_t *cond_[MAX_MB_PLANE]; #endif // Allocate memory to store the loop-restoration block index in each row. int *cur_sb_col[MAX_MB_PLANE]; // The optimal sync_range for different resolution and platform should be // determined by testing. Currently, it is chosen to be a power-of-2 number. int sync_range; int rows; int num_planes; int num_workers; #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex; #endif // Row-based parallel loopfilter data LRWorkerData *lrworkerdata; AV1LrMTInfo *job_queue; int jobs_enqueued; int jobs_dequeued; // Initialized to false, set to true by the worker thread that encounters // an error in order to abort the processing of other worker threads. bool lr_mt_exit; } AV1LrSync; typedef struct AV1CdefWorker { AV1_COMMON *cm; MACROBLOCKD *xd; uint16_t *colbuf[MAX_MB_PLANE]; uint16_t *srcbuf; uint16_t *linebuf[MAX_MB_PLANE]; cdef_init_fb_row_t cdef_init_fb_row_fn; int do_extend_border; struct aom_internal_error_info error_info; } AV1CdefWorkerData; typedef struct AV1CdefRowSync { #if CONFIG_MULTITHREAD pthread_mutex_t *row_mutex_; pthread_cond_t *row_cond_; #endif // CONFIG_MULTITHREAD int is_row_done; } AV1CdefRowSync; // Data related to CDEF search multi-thread synchronization. typedef struct AV1CdefSyncData { #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. 
pthread_mutex_t *mutex_; #endif // CONFIG_MULTITHREAD // Data related to CDEF row mt sync information AV1CdefRowSync *cdef_row_mt; // Flag to indicate all blocks are processed and end of frame is reached int end_of_frame; // Row index in units of 64x64 block int fbr; // Column index in units of 64x64 block int fbc; // Initialized to false, set to true by the worker thread that encounters // an error in order to abort the processing of other worker threads. bool cdef_mt_exit; } AV1CdefSync; void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, AV1CdefWorkerData *const cdef_worker, AVxWorker *const workers, AV1CdefSync *const cdef_sync, int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, int do_extend_border); void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, struct AV1CdefSyncData *const cdef_sync, int fbr); void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize); void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize); #if CONFIG_AV1_HIGHBITDEPTH void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize); #endif // CONFIG_AV1_HIGHBITDEPTH void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, int num_workers); void av1_free_cdef_sync(AV1CdefSync *cdef_sync); // Deallocate loopfilter synchronization related mutex and data. void av1_loop_filter_dealloc(AV1LfSync *lf_sync); void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, int width, int num_workers); void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync, int num_mis_in_lpf_unit_height_log2); void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *xd, int plane_start, int plane_end, int partial_frame, AVxWorker *workers, int num_workers, AV1LfSync *lf_sync, int lpf_opt_level); #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int optimized_lr, AVxWorker *workers, int num_workers, AV1LrSync *lr_sync, void *lr_ctxt, int do_extend_border); void av1_loop_restoration_dealloc(AV1LrSync *lr_sync); void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, int num_workers, int num_rows_lr, int num_planes, int width); #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm); void av1_thread_loop_filter_rows( const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane, int dir, int lpf_opt_level, AV1LfSync *const lf_sync, struct aom_internal_error_info *error_info, AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2); static AOM_FORCE_INLINE bool skip_loop_filter_plane( const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) { // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both // chroma planes together if (lpf_opt_level == 2) { if (plane == AOM_PLANE_Y) { return !planes_to_lf[plane]; } if (plane == AOM_PLANE_U) { // U and V are handled together return !planes_to_lf[1] && !planes_to_lf[2]; } assert(plane == AOM_PLANE_V); 
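    /* Worked example (editor's note): with lpf_opt_level == 2 and
     * planes_to_lf = {1, 0, 1}, luma is filtered, the U pass is not skipped
     * (V still needs filtering and the two chroma planes are handled
     * together), and V always reports "skip" here because it was already
     * covered by the joint U pass. */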
if (plane == AOM_PLANE_V) { // V is handled when u is filtered return true; } } // Normal operation mode return !planes_to_lf[plane]; } static inline void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop, const int planes_to_lf[MAX_MB_PLANE], int lpf_opt_level, int num_mis_in_lpf_unit_height) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; lf_sync->jobs_dequeued = 0; // Launch all vertical jobs first, as they are blocking the horizontal ones. // Launch top row jobs for all planes first, in case the output can be // partially reconstructed row by row. for (dir = 0; dir < 2; ++dir) { for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) { for (plane = 0; plane < MAX_MB_PLANE; ++plane) { if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) { continue; } if (!planes_to_lf[plane]) continue; lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; lf_job_queue->lpf_opt_level = lpf_opt_level; lf_job_queue++; lf_sync->jobs_enqueued++; } } } } static inline void loop_filter_frame_mt_init( AV1_COMMON *cm, int start_mi_row, int end_mi_row, const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync, int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) { // Number of superblock rows const int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2); if (!lf_sync->sync_range || sb_rows != lf_sync->rows || num_workers > lf_sync->num_workers) { av1_loop_filter_dealloc(lf_sync); av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } lf_sync->lf_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. for (int i = 0; i < MAX_MB_PLANE; i++) { memset(lf_sync->cur_sb_col[i], -1, sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); } enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf, lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2)); } static inline AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { AV1LfMTInfo *cur_job_info = NULL; #if CONFIG_MULTITHREAD pthread_mutex_lock(lf_sync->job_mutex); if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) { cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued; lf_sync->jobs_dequeued++; } pthread_mutex_unlock(lf_sync->job_mutex); #else (void)lf_sync; #endif return cur_job_info; } static inline void loop_filter_data_reset(LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct AV1Common *cm, MACROBLOCKD *xd) { struct macroblockd_plane *pd = xd->plane; lf_data->frame_buffer = frame_buffer; lf_data->cm = cm; lf_data->xd = xd; for (int i = 0; i < MAX_MB_PLANE; i++) { memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst)); lf_data->planes[i].subsampling_x = pd[i].subsampling_x; lf_data->planes[i].subsampling_y = pd[i].subsampling_y; } } static inline void set_planes_to_loop_filter(const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE], int plane_start, int plane_end) { // For each luma and chroma plane, whether to filter it or not. 
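  /* Worked example (editor's note): with plane_start = 0, plane_end = 3,
   * filter_level = {6, 0}, filter_level_u = 0 and filter_level_v = 2, this
   * gives planes_to_lf = {1, 0, 1}; check_planes_to_loop_filter() below then
   * returns 1 because luma is still filtered. */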
planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) && plane_start <= 0 && 0 < plane_end; planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end; planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end; } static inline int check_planes_to_loop_filter(const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE], int plane_start, int plane_end) { set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); // If the luma plane is purposely not filtered, neither are the chroma // planes. if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0; // Early exit. if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0; return 1; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_THREAD_COMMON_H_ aom-3.12.1/av1/common/tile_common.c000066400000000000000000000204601477627663500170330ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/av1_common_int.h" #include "av1/common/resize.h" #include "av1/common/tile_common.h" #include "aom_dsp/aom_dsp_common.h" void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { av1_tile_set_row(tile, cm, row); av1_tile_set_col(tile, cm, col); } // Find smallest k>=0 such that (blk_size << k) >= target static int tile_log2(int blk_size, int target) { int k; for (k = 0; (blk_size << k) < target; k++) { } return k; } void av1_get_tile_limits(AV1_COMMON *const cm) { const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; const int sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); const int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; #if CONFIG_CWG_C013 bool use_level_7_above = false; for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 && seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) { // Currently it is assumed that levels 7.x and 8.x are either used for all // operating points, or none of them. if (i != 0 && !use_level_7_above) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Either all the operating points are levels 7.x or " "8.x, or none of them are."); } use_level_7_above = true; } } const int max_tile_area_sb = (use_level_7_above ? 
MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >> (2 * sb_size_log2); #else const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); #endif tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); } void av1_calculate_tile_cols(const SequenceHeader *const seq_params, int cm_mi_rows, int cm_mi_cols, CommonTileParams *const tiles) { int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); int i; // This will be overridden if there is at least two columns of tiles // (otherwise there is no inner tile width) tiles->min_inner_width = -1; if (tiles->uniform_spacing) { int start_sb; int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols); assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { tiles->col_start_sb[i] = start_sb; start_sb += size_sb; } tiles->cols = i; tiles->col_start_sb[i] = sb_cols; tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; tiles->width = size_sb << seq_params->mib_size_log2; tiles->width = AOMMIN(tiles->width, cm_mi_cols); if (tiles->cols > 1) { tiles->min_inner_width = tiles->width; } } else { int max_tile_area_sb = (sb_rows * sb_cols); int widest_tile_sb = 1; int narrowest_inner_tile_sb = 65536; tiles->log2_cols = tile_log2(1, tiles->cols); for (i = 0; i < tiles->cols; i++) { int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); // ignore the rightmost tile in frame for determining the narrowest if (i < tiles->cols - 1) narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); } if (tiles->min_log2) { max_tile_area_sb >>= (tiles->min_log2 + 1); } tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); if (tiles->cols > 1) { tiles->min_inner_width = narrowest_inner_tile_sb << seq_params->mib_size_log2; } } } void av1_calculate_tile_rows(const SequenceHeader *const seq_params, int cm_mi_rows, CommonTileParams *const tiles) { int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); int start_sb, size_sb, i; if (tiles->uniform_spacing) { size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows); assert(size_sb > 0); for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { tiles->row_start_sb[i] = start_sb; start_sb += size_sb; } tiles->rows = i; tiles->row_start_sb[i] = sb_rows; tiles->height = size_sb << seq_params->mib_size_log2; tiles->height = AOMMIN(tiles->height, cm_mi_rows); } else { tiles->log2_rows = tile_log2(1, tiles->rows); } } void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { assert(row < cm->tiles.rows); int mi_row_start = cm->tiles.row_start_sb[row] << cm->seq_params->mib_size_log2; int mi_row_end = cm->tiles.row_start_sb[row + 1] << cm->seq_params->mib_size_log2; tile->tile_row = row; tile->mi_row_start = mi_row_start; tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); assert(tile->mi_row_end > tile->mi_row_start); } void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(col < cm->tiles.cols); int mi_col_start = cm->tiles.col_start_sb[col] << cm->seq_params->mib_size_log2; int mi_col_end = cm->tiles.col_start_sb[col + 1] << 
cm->seq_params->mib_size_log2; tile->tile_col = col; tile->mi_col_start = mi_col_start; tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); assert(tile->mi_col_end > tile->mi_col_start); } int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) { return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, cm->seq_params->mib_size_log2); } int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) { return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, cm->seq_params->mib_size_log2); } // Section 7.3.1 of the AV1 spec says, on pages 200-201: // It is a requirement of bitstream conformance that the following conditions // are met: // ... // * TileHeight is equal to (use_128x128_superblock ? 128 : 64) for all // tiles (i.e. the tile is exactly one superblock high) // * TileWidth is identical for all tiles and is an integer multiple of // TileHeight (i.e. the tile is an integer number of superblocks wide) // ... bool av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { const CommonTileParams *const tiles = &cm->tiles; if (tiles->uniform_spacing) { *w = tiles->width; *h = tiles->height; } else { for (int i = 0; i < tiles->cols; ++i) { const int tile_width_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; const int tile_w = tile_width_sb * cm->seq_params->mib_size; // ensure all tiles have same dimension if (i != 0 && tile_w != *w) { return false; } *w = tile_w; } for (int i = 0; i < tiles->rows; ++i) { const int tile_height_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; const int tile_h = tile_height_sb * cm->seq_params->mib_size; // ensure all tiles have same dimension if (i != 0 && tile_h != *h) { return false; } *h = tile_h; } } return true; } int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { // Disable check if there is a single tile col in the frame if (cm->tiles.cols == 1) return 1; return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= (64 << av1_superres_scaled(cm))); } aom-3.12.1/av1/common/tile_common.h000066400000000000000000000052541477627663500170440ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_TILE_COMMON_H_ #define AOM_AV1_COMMON_TILE_COMMON_H_ #include #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif struct AV1Common; struct SequenceHeader; struct CommonTileParams; #define DEFAULT_MAX_NUM_TG 1 typedef struct TileInfo { int mi_row_start, mi_row_end; int mi_col_start, mi_col_end; int tile_row; int tile_col; } TileInfo; // initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on // 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, int col); void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile); int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile); // Define tile maximum width and area // There is no maximum height since height is limited by area and width limits // The minimum tile width or height is fixed at one superblock #define MAX_TILE_WIDTH (4096) // Max Tile width in pixels #define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels #if CONFIG_CWG_C013 #define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608) #endif // Gets the width and height (in units of MI_SIZE) of the tiles in a tile list. // Returns true on success, false on failure. bool av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, int cm_mi_rows, int cm_mi_cols, struct CommonTileParams *const tiles); void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params, int cm_mi_rows, struct CommonTileParams *const tiles); // Checks if the minimum tile_width requirement is satisfied int av1_is_min_tile_width_satisfied(const struct AV1Common *cm); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_TILE_COMMON_H_ aom-3.12.1/av1/common/timing.c000066400000000000000000000100021477627663500160040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/timing.h" /* Tables for AV1 max bitrates for different levels of main and high tier. * The tables are in Kbps instead of Mbps in the specification. * Note that depending on the profile, a multiplier is needed. */ #define UNDEFINED_RATE \ (1 << 21) // Placeholder rate for levels with undefined rate #define INVALID_RATE \ (0) // For invalid profile-level configuration, set rate to 0 /* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. 
*/ static const int32_t main_kbps[1 << LEVEL_BITS] = { 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ /* is a dummy value. The decoder model is not applicable for level 31. */ static const int32_t high_kbps[1 << LEVEL_BITS] = { INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, 100000, 160000, 240000, 240000, 240000, 480000, 800000, 800000, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE }; /* BitrateProfileFactor */ static const int bitrate_profile_factor[1 << PROFILE_BITS] = { 1, 2, 3, 0, 0, 0, 0, 0 }; int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, int seq_tier) { int64_t bitrate; if (seq_tier) { bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; } else { bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; } return bitrate * 1000; } void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { decoder_model->encoder_decoder_buffer_delay_length = 16; decoder_model->buffer_removal_time_length = 10; decoder_model->frame_presentation_time_length = 10; } void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 1; op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s op_params->low_delay_mode_flag = 0; op_params->display_model_param_present_flag = 1; op_params->initial_display_delay = 8; // 8 frames delay } void av1_set_resource_availability_parameters( aom_dec_model_op_parameters_t *op_params) { op_params->decoder_model_param_present_flag = 0; op_params->decoder_buffer_delay = 70000; // Resource availability mode default op_params->encoder_buffer_delay = 20000; // Resource availability mode default op_params->low_delay_mode_flag = 0; // Resource availability mode default op_params->display_model_param_present_flag = 1; op_params->initial_display_delay = 8; // 8 frames delay } aom-3.12.1/av1/common/timing.h000066400000000000000000000034221477627663500160210ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#ifndef AOM_AV1_COMMON_TIMING_H_
#define AOM_AV1_COMMON_TIMING_H_

#include "aom/aom_integer.h"
#include "av1/common/enums.h"

#define MAX_NUM_OP_POINTS 32

typedef struct aom_timing {
  uint32_t num_units_in_display_tick;
  uint32_t time_scale;
  int equal_picture_interval;
  uint32_t num_ticks_per_picture;
} aom_timing_info_t;

typedef struct aom_dec_model_info {
  uint32_t num_units_in_decoding_tick;
  int encoder_decoder_buffer_delay_length;
  int buffer_removal_time_length;
  int frame_presentation_time_length;
} aom_dec_model_info_t;

typedef struct aom_dec_model_op_parameters {
  int decoder_model_param_present_flag;
  int64_t bitrate;
  int64_t buffer_size;
  uint32_t decoder_buffer_delay;
  uint32_t encoder_buffer_delay;
  int low_delay_mode_flag;
  int display_model_param_present_flag;
  int initial_display_delay;
} aom_dec_model_op_parameters_t;

void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);

void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);

void av1_set_resource_availability_parameters(
    aom_dec_model_op_parameters_t *op_params);

int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
                              int seq_tier);

#endif  // AOM_AV1_COMMON_TIMING_H_

aom-3.12.1/av1/common/token_cdfs.h

/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ #define AOM_AV1_COMMON_TOKEN_CDFS_H_ #include "config/aom_config.h" #include "av1/common/entropy.h" static const aom_cdf_prob av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] [CDF_SIZE(2)] = { { { { AOM_CDF2(128 * 125) }, { AOM_CDF2(128 * 102) }, { AOM_CDF2(128 * 147) }, }, { { AOM_CDF2(128 * 119) }, { AOM_CDF2(128 * 101) }, { AOM_CDF2(128 * 135) }, } }, { { { AOM_CDF2(128 * 125) }, { AOM_CDF2(128 * 102) }, { AOM_CDF2(128 * 147) }, }, { { AOM_CDF2(128 * 119) }, { AOM_CDF2(128 * 101) }, { AOM_CDF2(128 * 135) }, } }, { { { AOM_CDF2(128 * 125) }, { AOM_CDF2(128 * 102) }, { AOM_CDF2(128 * 147) }, }, { { AOM_CDF2(128 * 119) }, { AOM_CDF2(128 * 101) }, { AOM_CDF2(128 * 135) }, } }, { { { AOM_CDF2(128 * 125) }, { AOM_CDF2(128 * 102) }, { AOM_CDF2(128 * 147) }, }, { { AOM_CDF2(128 * 119) }, { AOM_CDF2(128 * 101) }, { AOM_CDF2(128 * 135) }, } }, }; static const aom_cdf_prob av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, { AOM_CDF2(5892) }, { AOM_CDF2(12112) }, { AOM_CDF2(21935) }, { AOM_CDF2(20289) }, { AOM_CDF2(27473) }, { AOM_CDF2(32487) }, { AOM_CDF2(7654) }, { AOM_CDF2(19473) }, { AOM_CDF2(29984) }, { AOM_CDF2(9961) }, { AOM_CDF2(30242) }, { AOM_CDF2(32117) } }, { { AOM_CDF2(31548) }, { AOM_CDF2(1549) }, { AOM_CDF2(10130) }, { AOM_CDF2(16656) }, { AOM_CDF2(18591) }, { AOM_CDF2(26308) }, { AOM_CDF2(32537) }, { AOM_CDF2(5403) }, { AOM_CDF2(18096) }, { AOM_CDF2(30003) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(29957) }, { AOM_CDF2(5391) }, { AOM_CDF2(18039) }, { AOM_CDF2(23566) }, { AOM_CDF2(22431) }, { AOM_CDF2(25822) }, { AOM_CDF2(32197) }, { AOM_CDF2(3778) }, { AOM_CDF2(15336) }, { AOM_CDF2(28981) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(17920) }, { AOM_CDF2(1818) }, { AOM_CDF2(7282) }, { AOM_CDF2(25273) }, { AOM_CDF2(10923) }, { AOM_CDF2(31554) }, { AOM_CDF2(32624) }, { AOM_CDF2(1366) }, { AOM_CDF2(15628) }, { AOM_CDF2(30462) }, { AOM_CDF2(146) }, { AOM_CDF2(5132) }, { AOM_CDF2(31657) } }, { { AOM_CDF2(6308) }, { AOM_CDF2(117) }, { AOM_CDF2(1638) }, { AOM_CDF2(2161) }, { AOM_CDF2(16384) }, { AOM_CDF2(10923) }, { AOM_CDF2(30247) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } } }, { { { AOM_CDF2(30371) }, { AOM_CDF2(7570) }, { AOM_CDF2(13155) }, { AOM_CDF2(20751) }, { AOM_CDF2(20969) }, { AOM_CDF2(27067) }, { AOM_CDF2(32013) }, { AOM_CDF2(5495) }, { AOM_CDF2(17942) }, { AOM_CDF2(28280) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(31782) }, { AOM_CDF2(1836) }, { AOM_CDF2(10689) }, { AOM_CDF2(17604) }, { AOM_CDF2(21622) }, { AOM_CDF2(27518) }, { AOM_CDF2(32399) }, { AOM_CDF2(4419) }, { AOM_CDF2(16294) }, { AOM_CDF2(28345) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(31901) }, { AOM_CDF2(10311) }, { AOM_CDF2(18047) }, { AOM_CDF2(24806) }, { AOM_CDF2(23288) }, { AOM_CDF2(27914) }, { AOM_CDF2(32296) }, { AOM_CDF2(4215) }, { AOM_CDF2(15756) }, { AOM_CDF2(28341) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(26726) }, { AOM_CDF2(1045) }, { AOM_CDF2(11703) }, { AOM_CDF2(20590) }, { AOM_CDF2(18554) }, { AOM_CDF2(25970) }, { AOM_CDF2(31938) }, { AOM_CDF2(5583) }, { AOM_CDF2(21313) }, { AOM_CDF2(29390) }, { AOM_CDF2(641) }, { AOM_CDF2(22265) }, { AOM_CDF2(31452) } }, { { AOM_CDF2(26584) }, { AOM_CDF2(188) }, { 
AOM_CDF2(8847) }, { AOM_CDF2(24519) }, { AOM_CDF2(22938) }, { AOM_CDF2(30583) }, { AOM_CDF2(32608) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } } }, { { { AOM_CDF2(29614) }, { AOM_CDF2(9068) }, { AOM_CDF2(12924) }, { AOM_CDF2(19538) }, { AOM_CDF2(17737) }, { AOM_CDF2(24619) }, { AOM_CDF2(30642) }, { AOM_CDF2(4119) }, { AOM_CDF2(16026) }, { AOM_CDF2(25657) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(31957) }, { AOM_CDF2(3230) }, { AOM_CDF2(11153) }, { AOM_CDF2(18123) }, { AOM_CDF2(20143) }, { AOM_CDF2(26536) }, { AOM_CDF2(31986) }, { AOM_CDF2(3050) }, { AOM_CDF2(14603) }, { AOM_CDF2(25155) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(32363) }, { AOM_CDF2(10692) }, { AOM_CDF2(19090) }, { AOM_CDF2(24357) }, { AOM_CDF2(24442) }, { AOM_CDF2(28312) }, { AOM_CDF2(32169) }, { AOM_CDF2(3648) }, { AOM_CDF2(15690) }, { AOM_CDF2(26815) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(30669) }, { AOM_CDF2(3832) }, { AOM_CDF2(11663) }, { AOM_CDF2(18889) }, { AOM_CDF2(19782) }, { AOM_CDF2(23313) }, { AOM_CDF2(31330) }, { AOM_CDF2(5124) }, { AOM_CDF2(18719) }, { AOM_CDF2(28468) }, { AOM_CDF2(3082) }, { AOM_CDF2(20982) }, { AOM_CDF2(29443) } }, { { AOM_CDF2(28573) }, { AOM_CDF2(3183) }, { AOM_CDF2(17802) }, { AOM_CDF2(25977) }, { AOM_CDF2(26677) }, { AOM_CDF2(27832) }, { AOM_CDF2(32387) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } } }, { { { AOM_CDF2(26887) }, { AOM_CDF2(6729) }, { AOM_CDF2(10361) }, { AOM_CDF2(17442) }, { AOM_CDF2(15045) }, { AOM_CDF2(22478) }, { AOM_CDF2(29072) }, { AOM_CDF2(2713) }, { AOM_CDF2(11861) }, { AOM_CDF2(20773) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(31903) }, { AOM_CDF2(2044) }, { AOM_CDF2(7528) }, { AOM_CDF2(14618) }, { AOM_CDF2(16182) }, { AOM_CDF2(24168) }, { AOM_CDF2(31037) }, { AOM_CDF2(2786) }, { AOM_CDF2(11194) }, { AOM_CDF2(20155) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(32510) }, { AOM_CDF2(8430) }, { AOM_CDF2(17318) }, { AOM_CDF2(24154) }, { AOM_CDF2(23674) }, { AOM_CDF2(28789) }, { AOM_CDF2(32139) }, { AOM_CDF2(3440) }, { AOM_CDF2(13117) }, { AOM_CDF2(22702) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } }, { { AOM_CDF2(31671) }, { AOM_CDF2(2056) }, { AOM_CDF2(11746) }, { AOM_CDF2(16852) }, { AOM_CDF2(18635) }, { AOM_CDF2(24715) }, { AOM_CDF2(31484) }, { AOM_CDF2(4656) }, { AOM_CDF2(16074) }, { AOM_CDF2(24704) }, { AOM_CDF2(1806) }, { AOM_CDF2(14645) }, { AOM_CDF2(25336) } }, { { AOM_CDF2(31539) }, { AOM_CDF2(8433) }, { AOM_CDF2(20576) }, { AOM_CDF2(27904) }, { AOM_CDF2(27852) }, { AOM_CDF2(30026) }, { AOM_CDF2(32441) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) } } } }; static const aom_cdf_prob av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { { { { { AOM_CDF2(16961) }, { AOM_CDF2(17223) }, { AOM_CDF2(7621) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(19069) }, { AOM_CDF2(22525) }, { AOM_CDF2(13377) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(20401) }, { 
AOM_CDF2(17025) }, { AOM_CDF2(12845) }, { AOM_CDF2(12873) }, { AOM_CDF2(14094) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(20681) }, { AOM_CDF2(20701) }, { AOM_CDF2(15250) }, { AOM_CDF2(15017) }, { AOM_CDF2(14928) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(23905) }, { AOM_CDF2(17194) }, { AOM_CDF2(16170) }, { AOM_CDF2(17695) }, { AOM_CDF2(13826) }, { AOM_CDF2(15810) }, { AOM_CDF2(12036) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(23959) }, { AOM_CDF2(20799) }, { AOM_CDF2(19021) }, { AOM_CDF2(16203) }, { AOM_CDF2(17886) }, { AOM_CDF2(14144) }, { AOM_CDF2(12010) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(27399) }, { AOM_CDF2(16327) }, { AOM_CDF2(18071) }, { AOM_CDF2(19584) }, { AOM_CDF2(20721) }, { AOM_CDF2(18432) }, { AOM_CDF2(19560) }, { AOM_CDF2(10150) }, { AOM_CDF2(8805) }, }, { { AOM_CDF2(24932) }, { AOM_CDF2(20833) }, { AOM_CDF2(12027) }, { AOM_CDF2(16670) }, { AOM_CDF2(19914) }, { AOM_CDF2(15106) }, { AOM_CDF2(17662) }, { AOM_CDF2(13783) }, { AOM_CDF2(28756) }, } }, { { { AOM_CDF2(23406) }, { AOM_CDF2(21845) }, { AOM_CDF2(18432) }, { AOM_CDF2(16384) }, { AOM_CDF2(17096) }, { AOM_CDF2(12561) }, { AOM_CDF2(17320) }, { AOM_CDF2(22395) }, { AOM_CDF2(21370) }, }, { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } } }, { { { { AOM_CDF2(17471) }, { AOM_CDF2(20223) }, { AOM_CDF2(11357) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(20335) }, { AOM_CDF2(21667) }, { AOM_CDF2(14818) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(20430) }, { AOM_CDF2(20662) }, { AOM_CDF2(15367) }, { AOM_CDF2(16970) }, { AOM_CDF2(14657) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(22117) }, { AOM_CDF2(22028) }, { AOM_CDF2(18650) }, { AOM_CDF2(16042) }, { AOM_CDF2(15885) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(22409) }, { AOM_CDF2(21012) }, { AOM_CDF2(15650) }, { AOM_CDF2(17395) }, { AOM_CDF2(15469) }, { AOM_CDF2(20205) }, { AOM_CDF2(19511) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(24220) }, { AOM_CDF2(22480) }, { AOM_CDF2(17737) }, { AOM_CDF2(18916) }, { AOM_CDF2(19268) }, { AOM_CDF2(18412) }, { AOM_CDF2(18844) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(25991) }, { AOM_CDF2(20314) }, { AOM_CDF2(17731) }, { AOM_CDF2(19678) }, { AOM_CDF2(18649) }, { AOM_CDF2(17307) }, { AOM_CDF2(21798) }, { AOM_CDF2(17549) }, { AOM_CDF2(15630) }, }, { { AOM_CDF2(26585) }, { AOM_CDF2(21469) }, { AOM_CDF2(20432) }, { AOM_CDF2(17735) }, { AOM_CDF2(19280) }, { AOM_CDF2(15235) }, { AOM_CDF2(20297) }, { AOM_CDF2(22471) }, { AOM_CDF2(28997) }, } }, { { { AOM_CDF2(26605) }, { AOM_CDF2(11304) }, { AOM_CDF2(16726) }, { AOM_CDF2(16560) }, { AOM_CDF2(20866) }, { AOM_CDF2(23524) }, { AOM_CDF2(19878) }, { AOM_CDF2(13469) }, { AOM_CDF2(23084) }, }, { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } } }, { { { { AOM_CDF2(18983) }, { AOM_CDF2(20512) }, 
{ AOM_CDF2(14885) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(20090) }, { AOM_CDF2(19444) }, { AOM_CDF2(17286) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(19139) }, { AOM_CDF2(21487) }, { AOM_CDF2(18959) }, { AOM_CDF2(20910) }, { AOM_CDF2(19089) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(20536) }, { AOM_CDF2(20664) }, { AOM_CDF2(20625) }, { AOM_CDF2(19123) }, { AOM_CDF2(14862) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(19833) }, { AOM_CDF2(21502) }, { AOM_CDF2(17485) }, { AOM_CDF2(20267) }, { AOM_CDF2(18353) }, { AOM_CDF2(23329) }, { AOM_CDF2(21478) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(22041) }, { AOM_CDF2(23434) }, { AOM_CDF2(20001) }, { AOM_CDF2(20554) }, { AOM_CDF2(20951) }, { AOM_CDF2(20145) }, { AOM_CDF2(15562) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(23312) }, { AOM_CDF2(21607) }, { AOM_CDF2(16526) }, { AOM_CDF2(18957) }, { AOM_CDF2(18034) }, { AOM_CDF2(18934) }, { AOM_CDF2(24247) }, { AOM_CDF2(16921) }, { AOM_CDF2(17080) }, }, { { AOM_CDF2(26579) }, { AOM_CDF2(24910) }, { AOM_CDF2(18637) }, { AOM_CDF2(19800) }, { AOM_CDF2(20388) }, { AOM_CDF2(9887) }, { AOM_CDF2(15642) }, { AOM_CDF2(30198) }, { AOM_CDF2(24721) }, } }, { { { AOM_CDF2(26998) }, { AOM_CDF2(16737) }, { AOM_CDF2(17838) }, { AOM_CDF2(18922) }, { AOM_CDF2(19515) }, { AOM_CDF2(18636) }, { AOM_CDF2(17333) }, { AOM_CDF2(15776) }, { AOM_CDF2(22658) }, }, { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } } }, { { { { AOM_CDF2(20177) }, { AOM_CDF2(20789) }, { AOM_CDF2(20262) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(21416) }, { AOM_CDF2(20855) }, { AOM_CDF2(23410) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(20238) }, { AOM_CDF2(21057) }, { AOM_CDF2(19159) }, { AOM_CDF2(22337) }, { AOM_CDF2(20159) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(20125) }, { AOM_CDF2(20559) }, { AOM_CDF2(21707) }, { AOM_CDF2(22296) }, { AOM_CDF2(17333) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(19941) }, { AOM_CDF2(20527) }, { AOM_CDF2(21470) }, { AOM_CDF2(22487) }, { AOM_CDF2(19558) }, { AOM_CDF2(22354) }, { AOM_CDF2(20331) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, }, { { AOM_CDF2(22752) }, { AOM_CDF2(25006) }, { AOM_CDF2(22075) }, { AOM_CDF2(21576) }, { AOM_CDF2(17740) }, { AOM_CDF2(21690) }, { AOM_CDF2(19211) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } }, { { { AOM_CDF2(21442) }, { AOM_CDF2(22358) }, { AOM_CDF2(18503) }, { AOM_CDF2(20291) }, { AOM_CDF2(19945) }, { AOM_CDF2(21294) }, { AOM_CDF2(21178) }, { AOM_CDF2(19400) }, { AOM_CDF2(10556) }, }, { { AOM_CDF2(24648) }, { AOM_CDF2(24949) }, { AOM_CDF2(20708) }, { AOM_CDF2(23905) }, { AOM_CDF2(20501) }, { AOM_CDF2(9558) }, { AOM_CDF2(9423) }, { AOM_CDF2(30365) }, { AOM_CDF2(19253) }, } }, { { { AOM_CDF2(26064) }, { AOM_CDF2(22098) }, { AOM_CDF2(19613) }, { 
AOM_CDF2(20525) }, { AOM_CDF2(17595) }, { AOM_CDF2(16618) }, { AOM_CDF2(20497) }, { AOM_CDF2(18989) }, { AOM_CDF2(15513) }, }, { { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, } } } }; static const aom_cdf_prob av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, { AOM_CDF5(370, 671, 1883, 4471) } }, { { AOM_CDF5(3247, 4950, 9688, 14563) }, { AOM_CDF5(1904, 3354, 7763, 14647) } } }, { { { AOM_CDF5(2125, 2551, 5165, 8946) }, { AOM_CDF5(513, 765, 1859, 6339) } }, { { AOM_CDF5(7637, 9498, 14259, 19108) }, { AOM_CDF5(2497, 4096, 8866, 16993) } } }, { { { AOM_CDF5(4016, 4897, 8881, 14968) }, { AOM_CDF5(716, 1105, 2646, 10056) } }, { { AOM_CDF5(11139, 13270, 18241, 23566) }, { AOM_CDF5(3192, 5032, 10297, 19755) } } }, { { { AOM_CDF5(6708, 8958, 14746, 22133) }, { AOM_CDF5(1222, 2074, 4783, 15410) } }, { { AOM_CDF5(19575, 21766, 26044, 29709) }, { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; static const aom_cdf_prob av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } }; static const aom_cdf_prob av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; static const aom_cdf_prob av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 8)] = { { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, { AOM_CDF8(2310, 4160, 7471, 
14997, 17931, 20768, 30240) } } }, { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } }; static const aom_cdf_prob av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 9)] = { { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, 32708) } } }, { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, 32695) } } } }; static const aom_cdf_prob av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, 23412, 26061) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } }, { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, 26129, 29140) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } } }, { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, 24584, 28749) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } }, { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, 28396, 31811) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } } }, { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, 28410, 31078) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } }, { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, 27300, 29219, 32114) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } } }, { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, 31142, 32060) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } }, { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, 30073, 30820, 31956) }, { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491) } } } }; static const aom_cdf_prob av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, 22571, 25830) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } }, { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, 27255, 28546, 31784) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 
26810, 29789) } } }, { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, 21856, 25692, 28034) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } }, { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, 28027, 28377, 30876) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } } }, { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, 26682, 29229, 31045) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } }, { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, 25483, 25843, 32056) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } } }, { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, 29326, 31082, 32050) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } }, { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, 29486, 29724, 29807, 32570) }, { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789) } } } }; static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] [CDF_SIZE(BR_CDF_SIZE)] = { { { { { AOM_CDF4(14298, 20718, 24174) }, { AOM_CDF4(12536, 19601, 23789) }, { AOM_CDF4(8712, 15051, 19503) }, { AOM_CDF4(6170, 11327, 15434) }, { AOM_CDF4(4742, 8926, 12538) }, { AOM_CDF4(3803, 7317, 10546) }, { AOM_CDF4(1696, 3317, 4871) }, { AOM_CDF4(14392, 19951, 22756) }, { AOM_CDF4(15978, 23218, 26818) }, { AOM_CDF4(12187, 19474, 23889) }, { AOM_CDF4(9176, 15640, 20259) }, { AOM_CDF4(7068, 12655, 17028) }, { AOM_CDF4(5656, 10442, 14472) }, { AOM_CDF4(2580, 4992, 7244) }, { AOM_CDF4(12136, 18049, 21426) }, { AOM_CDF4(13784, 20721, 24481) }, { AOM_CDF4(10836, 17621, 21900) }, { AOM_CDF4(8372, 14444, 18847) }, { AOM_CDF4(6523, 11779, 16000) }, { AOM_CDF4(5337, 9898, 13760) }, { AOM_CDF4(3034, 5860, 8462) } }, { { AOM_CDF4(15967, 22905, 26286) }, { AOM_CDF4(13534, 20654, 24579) }, { AOM_CDF4(9504, 16092, 20535) }, { AOM_CDF4(6975, 12568, 16903) }, { AOM_CDF4(5364, 10091, 14020) }, { AOM_CDF4(4357, 8370, 11857) }, { AOM_CDF4(2506, 4934, 7218) }, { AOM_CDF4(23032, 28815, 30936) }, { AOM_CDF4(19540, 26704, 29719) }, { AOM_CDF4(15158, 22969, 27097) }, { AOM_CDF4(11408, 18865, 23650) }, { AOM_CDF4(8885, 15448, 20250) }, { AOM_CDF4(7108, 12853, 17416) }, { AOM_CDF4(4231, 8041, 11480) }, { AOM_CDF4(19823, 26490, 29156) }, { AOM_CDF4(18890, 25929, 28932) }, { AOM_CDF4(15660, 23491, 27433) }, { AOM_CDF4(12147, 19776, 24488) }, { AOM_CDF4(9728, 16774, 21649) }, { AOM_CDF4(7919, 14277, 19066) }, { AOM_CDF4(5440, 10170, 14185) } } }, { { { AOM_CDF4(14406, 20862, 24414) }, { AOM_CDF4(11824, 18907, 23109) }, { AOM_CDF4(8257, 14393, 18803) }, { AOM_CDF4(5860, 10747, 14778) }, { AOM_CDF4(4475, 8486, 11984) }, { AOM_CDF4(3606, 6954, 10043) }, { AOM_CDF4(1736, 3410, 5048) }, { AOM_CDF4(14430, 20046, 22882) }, { AOM_CDF4(15593, 22899, 26709) }, { AOM_CDF4(12102, 19368, 23811) }, { AOM_CDF4(9059, 15584, 20262) }, { AOM_CDF4(6999, 12603, 17048) }, { AOM_CDF4(5684, 10497, 14553) }, { AOM_CDF4(2822, 5438, 7862) }, { AOM_CDF4(15785, 21585, 24359) }, { AOM_CDF4(18347, 25229, 28266) }, { AOM_CDF4(14974, 22487, 26389) }, { AOM_CDF4(11423, 18681, 23271) }, { AOM_CDF4(8863, 15350, 20008) }, { AOM_CDF4(7153, 12852, 17278) }, { AOM_CDF4(3707, 7036, 9982) } }, { { AOM_CDF4(15460, 21696, 25469) }, { AOM_CDF4(12170, 19249, 23191) }, { AOM_CDF4(8723, 15027, 19332) }, { AOM_CDF4(6428, 11704, 15874) }, { AOM_CDF4(4922, 9292, 13052) }, { AOM_CDF4(4139, 7695, 
11010) }, { AOM_CDF4(2291, 4508, 6598) }, { AOM_CDF4(19856, 26920, 29828) }, { AOM_CDF4(17923, 25289, 28792) }, { AOM_CDF4(14278, 21968, 26297) }, { AOM_CDF4(10910, 18136, 22950) }, { AOM_CDF4(8423, 14815, 19627) }, { AOM_CDF4(6771, 12283, 16774) }, { AOM_CDF4(4074, 7750, 11081) }, { AOM_CDF4(19852, 26074, 28672) }, { AOM_CDF4(19371, 26110, 28989) }, { AOM_CDF4(16265, 23873, 27663) }, { AOM_CDF4(12758, 20378, 24952) }, { AOM_CDF4(10095, 17098, 21961) }, { AOM_CDF4(8250, 14628, 19451) }, { AOM_CDF4(5205, 9745, 13622) } } }, { { { AOM_CDF4(10563, 16233, 19763) }, { AOM_CDF4(9794, 16022, 19804) }, { AOM_CDF4(6750, 11945, 15759) }, { AOM_CDF4(4963, 9186, 12752) }, { AOM_CDF4(3845, 7435, 10627) }, { AOM_CDF4(3051, 6085, 8834) }, { AOM_CDF4(1311, 2596, 3830) }, { AOM_CDF4(11246, 16404, 19689) }, { AOM_CDF4(12315, 18911, 22731) }, { AOM_CDF4(10557, 17095, 21289) }, { AOM_CDF4(8136, 14006, 18249) }, { AOM_CDF4(6348, 11474, 15565) }, { AOM_CDF4(5196, 9655, 13400) }, { AOM_CDF4(2349, 4526, 6587) }, { AOM_CDF4(13337, 18730, 21569) }, { AOM_CDF4(19306, 26071, 28882) }, { AOM_CDF4(15952, 23540, 27254) }, { AOM_CDF4(12409, 19934, 24430) }, { AOM_CDF4(9760, 16706, 21389) }, { AOM_CDF4(8004, 14220, 18818) }, { AOM_CDF4(4138, 7794, 10961) } }, { { AOM_CDF4(10870, 16684, 20949) }, { AOM_CDF4(9664, 15230, 18680) }, { AOM_CDF4(6886, 12109, 15408) }, { AOM_CDF4(4825, 8900, 12305) }, { AOM_CDF4(3630, 7162, 10314) }, { AOM_CDF4(3036, 6429, 9387) }, { AOM_CDF4(1671, 3296, 4940) }, { AOM_CDF4(13819, 19159, 23026) }, { AOM_CDF4(11984, 19108, 23120) }, { AOM_CDF4(10690, 17210, 21663) }, { AOM_CDF4(7984, 14154, 18333) }, { AOM_CDF4(6868, 12294, 16124) }, { AOM_CDF4(5274, 8994, 12868) }, { AOM_CDF4(2988, 5771, 8424) }, { AOM_CDF4(19736, 26647, 29141) }, { AOM_CDF4(18933, 26070, 28984) }, { AOM_CDF4(15779, 23048, 27200) }, { AOM_CDF4(12638, 20061, 24532) }, { AOM_CDF4(10692, 17545, 22220) }, { AOM_CDF4(9217, 15251, 20054) }, { AOM_CDF4(5078, 9284, 12594) } } }, { { { AOM_CDF4(2331, 3662, 5244) }, { AOM_CDF4(2891, 4771, 6145) }, { AOM_CDF4(4598, 7623, 9729) }, { AOM_CDF4(3520, 6845, 9199) }, { AOM_CDF4(3417, 6119, 9324) }, { AOM_CDF4(2601, 5412, 7385) }, { AOM_CDF4(600, 1173, 1744) }, { AOM_CDF4(7672, 13286, 17469) }, { AOM_CDF4(4232, 7792, 10793) }, { AOM_CDF4(2915, 5317, 7397) }, { AOM_CDF4(2318, 4356, 6152) }, { AOM_CDF4(2127, 4000, 5554) }, { AOM_CDF4(1850, 3478, 5275) }, { AOM_CDF4(977, 1933, 2843) }, { AOM_CDF4(18280, 24387, 27989) }, { AOM_CDF4(15852, 22671, 26185) }, { AOM_CDF4(13845, 20951, 24789) }, { AOM_CDF4(11055, 17966, 22129) }, { AOM_CDF4(9138, 15422, 19801) }, { AOM_CDF4(7454, 13145, 17456) }, { AOM_CDF4(3370, 6393, 9013) } }, { { AOM_CDF4(5842, 9229, 10838) }, { AOM_CDF4(2313, 3491, 4276) }, { AOM_CDF4(2998, 6104, 7496) }, { AOM_CDF4(2420, 7447, 9868) }, { AOM_CDF4(3034, 8495, 10923) }, { AOM_CDF4(4076, 8937, 10975) }, { AOM_CDF4(1086, 2370, 3299) }, { AOM_CDF4(9714, 17254, 20444) }, { AOM_CDF4(8543, 13698, 17123) }, { AOM_CDF4(4918, 9007, 11910) }, { AOM_CDF4(4129, 7532, 10553) }, { AOM_CDF4(2364, 5533, 8058) }, { AOM_CDF4(1834, 3546, 5563) }, { AOM_CDF4(1473, 2908, 4133) }, { AOM_CDF4(15405, 21193, 25619) }, { AOM_CDF4(15691, 21952, 26561) }, { AOM_CDF4(12962, 19194, 24165) }, { AOM_CDF4(10272, 17855, 22129) }, { AOM_CDF4(8588, 15270, 20718) }, { AOM_CDF4(8682, 14669, 19500) }, { AOM_CDF4(4870, 9636, 13205) } } }, { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { 
AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(14995, 21341, 24749) }, { AOM_CDF4(13158, 20289, 24601) }, { AOM_CDF4(8941, 15326, 19876) }, { AOM_CDF4(6297, 11541, 15807) }, { AOM_CDF4(4817, 9029, 12776) }, { AOM_CDF4(3731, 7273, 10627) }, { AOM_CDF4(1847, 3617, 5354) }, { AOM_CDF4(14472, 19659, 22343) }, { AOM_CDF4(16806, 24162, 27533) }, { AOM_CDF4(12900, 20404, 24713) }, { AOM_CDF4(9411, 16112, 20797) }, { AOM_CDF4(7056, 12697, 17148) }, { AOM_CDF4(5544, 10339, 14460) }, { AOM_CDF4(2954, 5704, 8319) }, { AOM_CDF4(12464, 18071, 21354) }, { AOM_CDF4(15482, 22528, 26034) }, { AOM_CDF4(12070, 19269, 23624) }, { AOM_CDF4(8953, 15406, 20106) }, { AOM_CDF4(7027, 12730, 17220) }, { AOM_CDF4(5887, 10913, 15140) }, { AOM_CDF4(3793, 7278, 10447) } }, { { AOM_CDF4(15571, 22232, 25749) }, { AOM_CDF4(14506, 21575, 25374) }, { AOM_CDF4(10189, 17089, 21569) }, { AOM_CDF4(7316, 13301, 17915) }, { AOM_CDF4(5783, 10912, 15190) }, { AOM_CDF4(4760, 9155, 13088) }, { AOM_CDF4(2993, 5966, 8774) }, { AOM_CDF4(23424, 28903, 30778) }, { AOM_CDF4(20775, 27666, 30290) }, { AOM_CDF4(16474, 24410, 28299) }, { AOM_CDF4(12471, 20180, 24987) }, { AOM_CDF4(9410, 16487, 21439) }, { AOM_CDF4(7536, 13614, 18529) }, { AOM_CDF4(5048, 9586, 13549) }, { AOM_CDF4(21090, 27290, 29756) }, { AOM_CDF4(20796, 27402, 30026) }, { AOM_CDF4(17819, 25485, 28969) }, { AOM_CDF4(13860, 21909, 26462) }, { AOM_CDF4(11002, 18494, 23529) }, { AOM_CDF4(8953, 15929, 20897) }, { AOM_CDF4(6448, 11918, 16454) } } }, { { { AOM_CDF4(15999, 22208, 25449) }, { AOM_CDF4(13050, 19988, 24122) }, { AOM_CDF4(8594, 14864, 19378) }, { AOM_CDF4(6033, 11079, 15238) }, { AOM_CDF4(4554, 8683, 12347) }, { AOM_CDF4(3672, 7139, 10337) }, { AOM_CDF4(1900, 3771, 5576) }, { AOM_CDF4(15788, 21340, 23949) }, { AOM_CDF4(16825, 24235, 27758) }, { AOM_CDF4(12873, 20402, 24810) }, { AOM_CDF4(9590, 16363, 21094) }, { AOM_CDF4(7352, 13209, 17733) }, { AOM_CDF4(5960, 10989, 15184) }, { AOM_CDF4(3232, 6234, 9007) }, { AOM_CDF4(15761, 20716, 23224) }, { AOM_CDF4(19318, 25989, 28759) }, { AOM_CDF4(15529, 23094, 26929) }, { AOM_CDF4(11662, 18989, 23641) }, { AOM_CDF4(8955, 15568, 20366) }, { AOM_CDF4(7281, 13106, 17708) }, { AOM_CDF4(4248, 8059, 11440) } }, { { AOM_CDF4(14899, 21217, 24503) }, { AOM_CDF4(13519, 20283, 24047) }, { AOM_CDF4(9429, 15966, 20365) }, { 
AOM_CDF4(6700, 12355, 16652) }, { AOM_CDF4(5088, 9704, 13716) }, { AOM_CDF4(4243, 8154, 11731) }, { AOM_CDF4(2702, 5364, 7861) }, { AOM_CDF4(22745, 28388, 30454) }, { AOM_CDF4(20235, 27146, 29922) }, { AOM_CDF4(15896, 23715, 27637) }, { AOM_CDF4(11840, 19350, 24131) }, { AOM_CDF4(9122, 15932, 20880) }, { AOM_CDF4(7488, 13581, 18362) }, { AOM_CDF4(5114, 9568, 13370) }, { AOM_CDF4(20845, 26553, 28932) }, { AOM_CDF4(20981, 27372, 29884) }, { AOM_CDF4(17781, 25335, 28785) }, { AOM_CDF4(13760, 21708, 26297) }, { AOM_CDF4(10975, 18415, 23365) }, { AOM_CDF4(9045, 15789, 20686) }, { AOM_CDF4(6130, 11199, 15423) } } }, { { { AOM_CDF4(13549, 19724, 23158) }, { AOM_CDF4(11844, 18382, 22246) }, { AOM_CDF4(7919, 13619, 17773) }, { AOM_CDF4(5486, 10143, 13946) }, { AOM_CDF4(4166, 7983, 11324) }, { AOM_CDF4(3364, 6506, 9427) }, { AOM_CDF4(1598, 3160, 4674) }, { AOM_CDF4(15281, 20979, 23781) }, { AOM_CDF4(14939, 22119, 25952) }, { AOM_CDF4(11363, 18407, 22812) }, { AOM_CDF4(8609, 14857, 19370) }, { AOM_CDF4(6737, 12184, 16480) }, { AOM_CDF4(5506, 10263, 14262) }, { AOM_CDF4(2990, 5786, 8380) }, { AOM_CDF4(20249, 25253, 27417) }, { AOM_CDF4(21070, 27518, 30001) }, { AOM_CDF4(16854, 24469, 28074) }, { AOM_CDF4(12864, 20486, 25000) }, { AOM_CDF4(9962, 16978, 21778) }, { AOM_CDF4(8074, 14338, 19048) }, { AOM_CDF4(4494, 8479, 11906) } }, { { AOM_CDF4(13960, 19617, 22829) }, { AOM_CDF4(11150, 17341, 21228) }, { AOM_CDF4(7150, 12964, 17190) }, { AOM_CDF4(5331, 10002, 13867) }, { AOM_CDF4(4167, 7744, 11057) }, { AOM_CDF4(3480, 6629, 9646) }, { AOM_CDF4(1883, 3784, 5686) }, { AOM_CDF4(18752, 25660, 28912) }, { AOM_CDF4(16968, 24586, 28030) }, { AOM_CDF4(13520, 21055, 25313) }, { AOM_CDF4(10453, 17626, 22280) }, { AOM_CDF4(8386, 14505, 19116) }, { AOM_CDF4(6742, 12595, 17008) }, { AOM_CDF4(4273, 8140, 11499) }, { AOM_CDF4(22120, 27827, 30233) }, { AOM_CDF4(20563, 27358, 29895) }, { AOM_CDF4(17076, 24644, 28153) }, { AOM_CDF4(13362, 20942, 25309) }, { AOM_CDF4(10794, 17965, 22695) }, { AOM_CDF4(9014, 15652, 20319) }, { AOM_CDF4(5708, 10512, 14497) } } }, { { { AOM_CDF4(5705, 10930, 15725) }, { AOM_CDF4(7946, 12765, 16115) }, { AOM_CDF4(6801, 12123, 16226) }, { AOM_CDF4(5462, 10135, 14200) }, { AOM_CDF4(4189, 8011, 11507) }, { AOM_CDF4(3191, 6229, 9408) }, { AOM_CDF4(1057, 2137, 3212) }, { AOM_CDF4(10018, 17067, 21491) }, { AOM_CDF4(7380, 12582, 16453) }, { AOM_CDF4(6068, 10845, 14339) }, { AOM_CDF4(5098, 9198, 12555) }, { AOM_CDF4(4312, 8010, 11119) }, { AOM_CDF4(3700, 6966, 9781) }, { AOM_CDF4(1693, 3326, 4887) }, { AOM_CDF4(18757, 24930, 27774) }, { AOM_CDF4(17648, 24596, 27817) }, { AOM_CDF4(14707, 22052, 26026) }, { AOM_CDF4(11720, 18852, 23292) }, { AOM_CDF4(9357, 15952, 20525) }, { AOM_CDF4(7810, 13753, 18210) }, { AOM_CDF4(3879, 7333, 10328) } }, { { AOM_CDF4(8278, 13242, 15922) }, { AOM_CDF4(10547, 15867, 18919) }, { AOM_CDF4(9106, 15842, 20609) }, { AOM_CDF4(6833, 13007, 17218) }, { AOM_CDF4(4811, 9712, 13923) }, { AOM_CDF4(3985, 7352, 11128) }, { AOM_CDF4(1688, 3458, 5262) }, { AOM_CDF4(12951, 21861, 26510) }, { AOM_CDF4(9788, 16044, 20276) }, { AOM_CDF4(6309, 11244, 14870) }, { AOM_CDF4(5183, 9349, 12566) }, { AOM_CDF4(4389, 8229, 11492) }, { AOM_CDF4(3633, 6945, 10620) }, { AOM_CDF4(3600, 6847, 9907) }, { AOM_CDF4(21748, 28137, 30255) }, { AOM_CDF4(19436, 26581, 29560) }, { AOM_CDF4(16359, 24201, 27953) }, { AOM_CDF4(13961, 21693, 25871) }, { AOM_CDF4(11544, 18686, 23322) }, { AOM_CDF4(9372, 16462, 20952) }, { AOM_CDF4(6138, 11210, 15390) } } }, { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(16138, 22223, 25509) }, { AOM_CDF4(15347, 22430, 26332) }, { AOM_CDF4(9614, 16736, 21332) }, { AOM_CDF4(6600, 12275, 16907) }, { AOM_CDF4(4811, 9424, 13547) }, { AOM_CDF4(3748, 7809, 11420) }, { AOM_CDF4(2254, 4587, 6890) }, { AOM_CDF4(15196, 20284, 23177) }, { AOM_CDF4(18317, 25469, 28451) }, { AOM_CDF4(13918, 21651, 25842) }, { AOM_CDF4(10052, 17150, 21995) }, { AOM_CDF4(7499, 13630, 18587) }, { AOM_CDF4(6158, 11417, 16003) }, { AOM_CDF4(4014, 7785, 11252) }, { AOM_CDF4(15048, 21067, 24384) }, { AOM_CDF4(18202, 25346, 28553) }, { AOM_CDF4(14302, 22019, 26356) }, { AOM_CDF4(10839, 18139, 23166) }, { AOM_CDF4(8715, 15744, 20806) }, { AOM_CDF4(7536, 13576, 18544) }, { AOM_CDF4(5413, 10335, 14498) } }, { { AOM_CDF4(17394, 24501, 27895) }, { AOM_CDF4(15889, 23420, 27185) }, { AOM_CDF4(11561, 19133, 23870) }, { AOM_CDF4(8285, 14812, 19844) }, { AOM_CDF4(6496, 12043, 16550) }, { AOM_CDF4(4771, 9574, 13677) }, { AOM_CDF4(3603, 6830, 10144) }, { AOM_CDF4(21656, 27704, 30200) }, { AOM_CDF4(21324, 27915, 30511) }, { AOM_CDF4(17327, 25336, 28997) }, { AOM_CDF4(13417, 21381, 26033) }, { AOM_CDF4(10132, 17425, 22338) }, { AOM_CDF4(8580, 15016, 19633) }, { AOM_CDF4(5694, 11477, 16411) }, { AOM_CDF4(24116, 29780, 31450) }, { AOM_CDF4(23853, 29695, 31591) }, { AOM_CDF4(20085, 27614, 30428) }, { AOM_CDF4(15326, 24335, 28575) }, { AOM_CDF4(11814, 19472, 24810) }, { AOM_CDF4(10221, 18611, 24767) }, { AOM_CDF4(7689, 14558, 20321) } } }, { { { AOM_CDF4(16214, 22380, 25770) }, { AOM_CDF4(14213, 21304, 25295) }, { AOM_CDF4(9213, 15823, 20455) }, { AOM_CDF4(6395, 11758, 16139) }, { AOM_CDF4(4779, 9187, 13066) }, { AOM_CDF4(3821, 7501, 10953) }, { AOM_CDF4(2293, 4567, 6795) }, { AOM_CDF4(15859, 21283, 23820) }, { AOM_CDF4(18404, 25602, 28726) }, { AOM_CDF4(14325, 21980, 26206) }, { AOM_CDF4(10669, 17937, 22720) }, { AOM_CDF4(8297, 14642, 19447) }, { AOM_CDF4(6746, 12389, 16893) }, { AOM_CDF4(4324, 8251, 11770) }, { AOM_CDF4(16532, 21631, 24475) }, { AOM_CDF4(20667, 27150, 29668) }, { AOM_CDF4(16728, 24510, 28175) }, { AOM_CDF4(12861, 20645, 25332) }, { AOM_CDF4(10076, 17361, 22417) }, { AOM_CDF4(8395, 14940, 19963) }, { AOM_CDF4(5731, 
10683, 14912) } }, { { AOM_CDF4(14433, 21155, 24938) }, { AOM_CDF4(14658, 21716, 25545) }, { AOM_CDF4(9923, 16824, 21557) }, { AOM_CDF4(6982, 13052, 17721) }, { AOM_CDF4(5419, 10503, 15050) }, { AOM_CDF4(4852, 9162, 13014) }, { AOM_CDF4(3271, 6395, 9630) }, { AOM_CDF4(22210, 27833, 30109) }, { AOM_CDF4(20750, 27368, 29821) }, { AOM_CDF4(16894, 24828, 28573) }, { AOM_CDF4(13247, 21276, 25757) }, { AOM_CDF4(10038, 17265, 22563) }, { AOM_CDF4(8587, 14947, 20327) }, { AOM_CDF4(5645, 11371, 15252) }, { AOM_CDF4(22027, 27526, 29714) }, { AOM_CDF4(23098, 29146, 31221) }, { AOM_CDF4(19886, 27341, 30272) }, { AOM_CDF4(15609, 23747, 28046) }, { AOM_CDF4(11993, 20065, 24939) }, { AOM_CDF4(9637, 18267, 23671) }, { AOM_CDF4(7625, 13801, 19144) } } }, { { { AOM_CDF4(14438, 20798, 24089) }, { AOM_CDF4(12621, 19203, 23097) }, { AOM_CDF4(8177, 14125, 18402) }, { AOM_CDF4(5674, 10501, 14456) }, { AOM_CDF4(4236, 8239, 11733) }, { AOM_CDF4(3447, 6750, 9806) }, { AOM_CDF4(1986, 3950, 5864) }, { AOM_CDF4(16208, 22099, 24930) }, { AOM_CDF4(16537, 24025, 27585) }, { AOM_CDF4(12780, 20381, 24867) }, { AOM_CDF4(9767, 16612, 21416) }, { AOM_CDF4(7686, 13738, 18398) }, { AOM_CDF4(6333, 11614, 15964) }, { AOM_CDF4(3941, 7571, 10836) }, { AOM_CDF4(22819, 27422, 29202) }, { AOM_CDF4(22224, 28514, 30721) }, { AOM_CDF4(17660, 25433, 28913) }, { AOM_CDF4(13574, 21482, 26002) }, { AOM_CDF4(10629, 17977, 22938) }, { AOM_CDF4(8612, 15298, 20265) }, { AOM_CDF4(5607, 10491, 14596) } }, { { AOM_CDF4(13569, 19800, 23206) }, { AOM_CDF4(13128, 19924, 23869) }, { AOM_CDF4(8329, 14841, 19403) }, { AOM_CDF4(6130, 10976, 15057) }, { AOM_CDF4(4682, 8839, 12518) }, { AOM_CDF4(3656, 7409, 10588) }, { AOM_CDF4(2577, 5099, 7412) }, { AOM_CDF4(22427, 28684, 30585) }, { AOM_CDF4(20913, 27750, 30139) }, { AOM_CDF4(15840, 24109, 27834) }, { AOM_CDF4(12308, 20029, 24569) }, { AOM_CDF4(10216, 16785, 21458) }, { AOM_CDF4(8309, 14203, 19113) }, { AOM_CDF4(6043, 11168, 15307) }, { AOM_CDF4(23166, 28901, 30998) }, { AOM_CDF4(21899, 28405, 30751) }, { AOM_CDF4(18413, 26091, 29443) }, { AOM_CDF4(15233, 23114, 27352) }, { AOM_CDF4(12683, 20472, 25288) }, { AOM_CDF4(10702, 18259, 23409) }, { AOM_CDF4(8125, 14464, 19226) } } }, { { { AOM_CDF4(9040, 14786, 18360) }, { AOM_CDF4(9979, 15718, 19415) }, { AOM_CDF4(7913, 13918, 18311) }, { AOM_CDF4(5859, 10889, 15184) }, { AOM_CDF4(4593, 8677, 12510) }, { AOM_CDF4(3820, 7396, 10791) }, { AOM_CDF4(1730, 3471, 5192) }, { AOM_CDF4(11803, 18365, 22709) }, { AOM_CDF4(11419, 18058, 22225) }, { AOM_CDF4(9418, 15774, 20243) }, { AOM_CDF4(7539, 13325, 17657) }, { AOM_CDF4(6233, 11317, 15384) }, { AOM_CDF4(5137, 9656, 13545) }, { AOM_CDF4(2977, 5774, 8349) }, { AOM_CDF4(21207, 27246, 29640) }, { AOM_CDF4(19547, 26578, 29497) }, { AOM_CDF4(16169, 23871, 27690) }, { AOM_CDF4(12820, 20458, 25018) }, { AOM_CDF4(10224, 17332, 22214) }, { AOM_CDF4(8526, 15048, 19884) }, { AOM_CDF4(5037, 9410, 13118) } }, { { AOM_CDF4(12339, 17329, 20140) }, { AOM_CDF4(13505, 19895, 23225) }, { AOM_CDF4(9847, 16944, 21564) }, { AOM_CDF4(7280, 13256, 18348) }, { AOM_CDF4(4712, 10009, 14454) }, { AOM_CDF4(4361, 7914, 12477) }, { AOM_CDF4(2870, 5628, 7995) }, { AOM_CDF4(20061, 25504, 28526) }, { AOM_CDF4(15235, 22878, 26145) }, { AOM_CDF4(12985, 19958, 24155) }, { AOM_CDF4(9782, 16641, 21403) }, { AOM_CDF4(9456, 16360, 20760) }, { AOM_CDF4(6855, 12940, 18557) }, { AOM_CDF4(5661, 10564, 15002) }, { AOM_CDF4(25656, 30602, 31894) }, { AOM_CDF4(22570, 29107, 31092) }, { AOM_CDF4(18917, 26423, 29541) }, { AOM_CDF4(15940, 23649, 27754) }, { 
AOM_CDF4(12803, 20581, 25219) }, { AOM_CDF4(11082, 18695, 23376) }, { AOM_CDF4(7939, 14373, 19005) } } }, { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(18315, 24289, 27551) }, { AOM_CDF4(16854, 24068, 27835) }, { AOM_CDF4(10140, 17927, 23173) }, { AOM_CDF4(6722, 12982, 18267) }, { AOM_CDF4(4661, 9826, 14706) }, { AOM_CDF4(3832, 8165, 12294) }, { AOM_CDF4(2795, 6098, 9245) }, { AOM_CDF4(17145, 23326, 26672) }, { AOM_CDF4(20733, 27680, 30308) }, { AOM_CDF4(16032, 24461, 28546) }, { AOM_CDF4(11653, 20093, 25081) }, { AOM_CDF4(9290, 16429, 22086) }, { AOM_CDF4(7796, 14598, 19982) }, { AOM_CDF4(6502, 12378, 17441) }, { AOM_CDF4(21681, 27732, 30320) }, { AOM_CDF4(22389, 29044, 31261) }, { AOM_CDF4(19027, 26731, 30087) }, { AOM_CDF4(14739, 23755, 28624) }, { AOM_CDF4(11358, 20778, 25511) }, { AOM_CDF4(10995, 18073, 24190) }, { AOM_CDF4(9162, 14990, 20617) } }, { { AOM_CDF4(21425, 27952, 30388) }, { AOM_CDF4(18062, 25838, 29034) }, { AOM_CDF4(11956, 19881, 24808) }, { AOM_CDF4(7718, 15000, 20980) }, { AOM_CDF4(5702, 11254, 16143) }, { AOM_CDF4(4898, 9088, 16864) }, { AOM_CDF4(3679, 6776, 11907) }, { AOM_CDF4(23294, 30160, 31663) }, { AOM_CDF4(24397, 29896, 31836) }, { AOM_CDF4(19245, 27128, 30593) }, { AOM_CDF4(13202, 19825, 26404) }, { AOM_CDF4(11578, 19297, 23957) }, { AOM_CDF4(8073, 13297, 21370) }, { AOM_CDF4(5461, 10923, 19745) }, { AOM_CDF4(27367, 30521, 31934) }, { AOM_CDF4(24904, 30671, 31940) }, { AOM_CDF4(23075, 28460, 31299) }, { AOM_CDF4(14400, 23658, 30417) }, { AOM_CDF4(13885, 23882, 28325) }, { AOM_CDF4(14746, 22938, 27853) }, { AOM_CDF4(5461, 16384, 27307) } } }, { { { AOM_CDF4(18274, 24813, 27890) }, { AOM_CDF4(15537, 23149, 27003) }, { AOM_CDF4(9449, 16740, 21827) }, { AOM_CDF4(6700, 12498, 17261) }, { AOM_CDF4(4988, 9866, 14198) }, { AOM_CDF4(4236, 8147, 11902) }, { AOM_CDF4(2867, 5860, 8654) }, { AOM_CDF4(17124, 23171, 26101) }, { AOM_CDF4(20396, 27477, 30148) }, { AOM_CDF4(16573, 24629, 28492) }, { AOM_CDF4(12749, 20846, 25674) }, { AOM_CDF4(10233, 17878, 22818) }, { AOM_CDF4(8525, 15332, 20363) }, { AOM_CDF4(6283, 11632, 16255) }, { AOM_CDF4(20466, 26511, 29286) }, { AOM_CDF4(23059, 29174, 
31191) }, { AOM_CDF4(19481, 27263, 30241) }, { AOM_CDF4(15458, 23631, 28137) }, { AOM_CDF4(12416, 20608, 25693) }, { AOM_CDF4(10261, 18011, 23261) }, { AOM_CDF4(8016, 14655, 19666) } }, { { AOM_CDF4(17616, 24586, 28112) }, { AOM_CDF4(15809, 23299, 27155) }, { AOM_CDF4(10767, 18890, 23793) }, { AOM_CDF4(7727, 14255, 18865) }, { AOM_CDF4(6129, 11926, 16882) }, { AOM_CDF4(4482, 9704, 14861) }, { AOM_CDF4(3277, 7452, 11522) }, { AOM_CDF4(22956, 28551, 30730) }, { AOM_CDF4(22724, 28937, 30961) }, { AOM_CDF4(18467, 26324, 29580) }, { AOM_CDF4(13234, 20713, 25649) }, { AOM_CDF4(11181, 17592, 22481) }, { AOM_CDF4(8291, 18358, 24576) }, { AOM_CDF4(7568, 11881, 14984) }, { AOM_CDF4(24948, 29001, 31147) }, { AOM_CDF4(25674, 30619, 32151) }, { AOM_CDF4(20841, 26793, 29603) }, { AOM_CDF4(14669, 24356, 28666) }, { AOM_CDF4(11334, 23593, 28219) }, { AOM_CDF4(8922, 14762, 22873) }, { AOM_CDF4(8301, 13544, 20535) } } }, { { { AOM_CDF4(17113, 23733, 27081) }, { AOM_CDF4(14139, 21406, 25452) }, { AOM_CDF4(8552, 15002, 19776) }, { AOM_CDF4(5871, 11120, 15378) }, { AOM_CDF4(4455, 8616, 12253) }, { AOM_CDF4(3469, 6910, 10386) }, { AOM_CDF4(2255, 4553, 6782) }, { AOM_CDF4(18224, 24376, 27053) }, { AOM_CDF4(19290, 26710, 29614) }, { AOM_CDF4(14936, 22991, 27184) }, { AOM_CDF4(11238, 18951, 23762) }, { AOM_CDF4(8786, 15617, 20588) }, { AOM_CDF4(7317, 13228, 18003) }, { AOM_CDF4(5101, 9512, 13493) }, { AOM_CDF4(22639, 28222, 30210) }, { AOM_CDF4(23216, 29331, 31307) }, { AOM_CDF4(19075, 26762, 29895) }, { AOM_CDF4(15014, 23113, 27457) }, { AOM_CDF4(11938, 19857, 24752) }, { AOM_CDF4(9942, 17280, 22282) }, { AOM_CDF4(7167, 13144, 17752) } }, { { AOM_CDF4(15820, 22738, 26488) }, { AOM_CDF4(13530, 20885, 25216) }, { AOM_CDF4(8395, 15530, 20452) }, { AOM_CDF4(6574, 12321, 16380) }, { AOM_CDF4(5353, 10419, 14568) }, { AOM_CDF4(4613, 8446, 12381) }, { AOM_CDF4(3440, 7158, 9903) }, { AOM_CDF4(24247, 29051, 31224) }, { AOM_CDF4(22118, 28058, 30369) }, { AOM_CDF4(16498, 24768, 28389) }, { AOM_CDF4(12920, 21175, 26137) }, { AOM_CDF4(10730, 18619, 25352) }, { AOM_CDF4(10187, 16279, 22791) }, { AOM_CDF4(9310, 14631, 22127) }, { AOM_CDF4(24970, 30558, 32057) }, { AOM_CDF4(24801, 29942, 31698) }, { AOM_CDF4(22432, 28453, 30855) }, { AOM_CDF4(19054, 25680, 29580) }, { AOM_CDF4(14392, 23036, 28109) }, { AOM_CDF4(12495, 20947, 26650) }, { AOM_CDF4(12442, 20326, 26214) } } }, { { { AOM_CDF4(12162, 18785, 22648) }, { AOM_CDF4(12749, 19697, 23806) }, { AOM_CDF4(8580, 15297, 20346) }, { AOM_CDF4(6169, 11749, 16543) }, { AOM_CDF4(4836, 9391, 13448) }, { AOM_CDF4(3821, 7711, 11613) }, { AOM_CDF4(2228, 4601, 7070) }, { AOM_CDF4(16319, 24725, 28280) }, { AOM_CDF4(15698, 23277, 27168) }, { AOM_CDF4(12726, 20368, 25047) }, { AOM_CDF4(9912, 17015, 21976) }, { AOM_CDF4(7888, 14220, 19179) }, { AOM_CDF4(6777, 12284, 17018) }, { AOM_CDF4(4492, 8590, 12252) }, { AOM_CDF4(23249, 28904, 30947) }, { AOM_CDF4(21050, 27908, 30512) }, { AOM_CDF4(17440, 25340, 28949) }, { AOM_CDF4(14059, 22018, 26541) }, { AOM_CDF4(11288, 18903, 23898) }, { AOM_CDF4(9411, 16342, 21428) }, { AOM_CDF4(6278, 11588, 15944) } }, { { AOM_CDF4(13981, 20067, 23226) }, { AOM_CDF4(16922, 23580, 26783) }, { AOM_CDF4(11005, 19039, 24487) }, { AOM_CDF4(7389, 14218, 19798) }, { AOM_CDF4(5598, 11505, 17206) }, { AOM_CDF4(6090, 11213, 15659) }, { AOM_CDF4(3820, 7371, 10119) }, { AOM_CDF4(21082, 26925, 29675) }, { AOM_CDF4(21262, 28627, 31128) }, { AOM_CDF4(18392, 26454, 30437) }, { AOM_CDF4(14870, 22910, 27096) }, { AOM_CDF4(12620, 19484, 24908) }, { AOM_CDF4(9290, 16553, 22802) }, { 
AOM_CDF4(6668, 14288, 20004) }, { AOM_CDF4(27704, 31055, 31949) }, { AOM_CDF4(24709, 29978, 31788) }, { AOM_CDF4(21668, 29264, 31657) }, { AOM_CDF4(18295, 26968, 30074) }, { AOM_CDF4(16399, 24422, 29313) }, { AOM_CDF4(14347, 23026, 28104) }, { AOM_CDF4(12370, 19806, 24477) } } }, { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } } }; static const aom_cdf_prob av1_default_coeff_base_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] [CDF_SIZE(NUM_BASE_LEVELS + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, { AOM_CDF4(18082, 29741, 31877) }, { AOM_CDF4(12596, 26124, 30493) }, { AOM_CDF4(9446, 21118, 27005) }, { AOM_CDF4(6308, 15141, 21279) }, { AOM_CDF4(2463, 6357, 9783) }, { AOM_CDF4(20667, 30546, 31929) }, { AOM_CDF4(13043, 26123, 30134) }, { AOM_CDF4(8151, 18757, 24778) }, { AOM_CDF4(5255, 12839, 18632) }, { AOM_CDF4(2820, 7206, 11161) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(15736, 27553, 30604) }, { AOM_CDF4(11210, 23794, 28787) }, { AOM_CDF4(5947, 13874, 19701) }, { AOM_CDF4(4215, 9323, 13891) }, { AOM_CDF4(2833, 6462, 10059) }, { AOM_CDF4(19605, 30393, 31582) }, { AOM_CDF4(13523, 26252, 30248) }, { AOM_CDF4(8446, 18622, 24512) }, { AOM_CDF4(3818, 10343, 15974) }, { AOM_CDF4(1481, 4117, 6796) }, { AOM_CDF4(22649, 31302, 32190) }, { AOM_CDF4(14829, 27127, 30449) }, { AOM_CDF4(8313, 17702, 23304) }, { AOM_CDF4(3022, 8301, 12786) }, { AOM_CDF4(1536, 4412, 7184) }, { AOM_CDF4(22354, 29774, 31372) }, { AOM_CDF4(14723, 25472, 29214) }, { AOM_CDF4(6673, 13745, 18662) }, { AOM_CDF4(2068, 5766, 9322) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(6302, 16444, 21761) }, { AOM_CDF4(23040, 31538, 32475) }, { AOM_CDF4(15196, 28452, 31496) }, { AOM_CDF4(10020, 22946, 28514) }, { AOM_CDF4(6533, 16862, 23501) }, { AOM_CDF4(3538, 9816, 15076) }, { AOM_CDF4(24444, 31875, 
32525) }, { AOM_CDF4(15881, 28924, 31635) }, { AOM_CDF4(9922, 22873, 28466) }, { AOM_CDF4(6527, 16966, 23691) }, { AOM_CDF4(4114, 11303, 17220) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(20201, 30770, 32209) }, { AOM_CDF4(14754, 28071, 31258) }, { AOM_CDF4(8378, 20186, 26517) }, { AOM_CDF4(5916, 15299, 21978) }, { AOM_CDF4(4268, 11583, 17901) }, { AOM_CDF4(24361, 32025, 32581) }, { AOM_CDF4(18673, 30105, 31943) }, { AOM_CDF4(10196, 22244, 27576) }, { AOM_CDF4(5495, 14349, 20417) }, { AOM_CDF4(2676, 7415, 11498) }, { AOM_CDF4(24678, 31958, 32585) }, { AOM_CDF4(18629, 29906, 31831) }, { AOM_CDF4(9364, 20724, 26315) }, { AOM_CDF4(4641, 12318, 18094) }, { AOM_CDF4(2758, 7387, 11579) }, { AOM_CDF4(25433, 31842, 32469) }, { AOM_CDF4(18795, 29289, 31411) }, { AOM_CDF4(7644, 17584, 23592) }, { AOM_CDF4(3408, 9014, 15047) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(4536, 10072, 14001) }, { AOM_CDF4(25459, 31416, 32206) }, { AOM_CDF4(16605, 28048, 30818) }, { AOM_CDF4(11008, 22857, 27719) }, { AOM_CDF4(6915, 16268, 22315) }, { AOM_CDF4(2625, 6812, 10537) }, { AOM_CDF4(24257, 31788, 32499) }, { AOM_CDF4(16880, 29454, 31879) }, { AOM_CDF4(11958, 25054, 29778) }, { AOM_CDF4(7916, 18718, 25084) }, { AOM_CDF4(3383, 8777, 13446) }, { AOM_CDF4(22720, 31603, 32393) }, { AOM_CDF4(14960, 28125, 31335) }, { AOM_CDF4(9731, 22210, 27928) }, { AOM_CDF4(6304, 15832, 22277) }, { AOM_CDF4(2910, 7818, 12166) }, { AOM_CDF4(20375, 30627, 32131) }, { AOM_CDF4(13904, 27284, 30887) }, { AOM_CDF4(9368, 21558, 27144) }, { AOM_CDF4(5937, 14966, 21119) }, { AOM_CDF4(2667, 7225, 11319) }, { AOM_CDF4(23970, 31470, 32378) }, { AOM_CDF4(17173, 29734, 32018) }, { AOM_CDF4(12795, 25441, 29965) }, { AOM_CDF4(8981, 19680, 25893) }, { AOM_CDF4(4728, 11372, 16902) }, { AOM_CDF4(24287, 31797, 32439) }, { AOM_CDF4(16703, 29145, 31696) }, { AOM_CDF4(10833, 23554, 28725) }, { AOM_CDF4(6468, 16566, 23057) }, { AOM_CDF4(2415, 6562, 10278) }, { AOM_CDF4(26610, 32395, 32659) }, { AOM_CDF4(18590, 30498, 32117) }, { AOM_CDF4(12420, 25756, 29950) }, { AOM_CDF4(7639, 18746, 24710) }, { AOM_CDF4(3001, 8086, 12347) }, { AOM_CDF4(25076, 32064, 32580) }, { AOM_CDF4(17946, 30128, 32028) }, { AOM_CDF4(12024, 24985, 29378) }, { AOM_CDF4(7517, 18390, 24304) }, { AOM_CDF4(3243, 8781, 13331) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(6037, 16771, 21957) }, { AOM_CDF4(24774, 31704, 32426) }, { AOM_CDF4(16830, 28589, 31056) }, { AOM_CDF4(10602, 22828, 27760) }, { AOM_CDF4(6733, 16829, 23071) }, { AOM_CDF4(3250, 8914, 13556) }, { AOM_CDF4(25582, 32220, 32668) }, { AOM_CDF4(18659, 30342, 32223) }, { AOM_CDF4(12546, 26149, 30515) }, { AOM_CDF4(8420, 20451, 26801) }, { AOM_CDF4(4636, 12420, 18344) }, { AOM_CDF4(27581, 32362, 32639) }, { AOM_CDF4(18987, 30083, 31978) }, { AOM_CDF4(11327, 24248, 29084) }, { AOM_CDF4(7264, 17719, 24120) }, { AOM_CDF4(3995, 10768, 16169) }, { AOM_CDF4(25893, 31831, 32487) }, { AOM_CDF4(16577, 28587, 31379) }, { AOM_CDF4(10189, 22748, 28182) }, { AOM_CDF4(6832, 17094, 23556) }, { AOM_CDF4(3708, 10110, 15334) }, { AOM_CDF4(25904, 32282, 32656) }, { AOM_CDF4(19721, 30792, 32276) }, { AOM_CDF4(12819, 26243, 30411) }, { AOM_CDF4(8572, 20614, 26891) }, { AOM_CDF4(5364, 14059, 
20467) }, { AOM_CDF4(26580, 32438, 32677) }, { AOM_CDF4(20852, 31225, 32340) }, { AOM_CDF4(12435, 25700, 29967) }, { AOM_CDF4(8691, 20825, 26976) }, { AOM_CDF4(4446, 12209, 17269) }, { AOM_CDF4(27350, 32429, 32696) }, { AOM_CDF4(21372, 30977, 32272) }, { AOM_CDF4(12673, 25270, 29853) }, { AOM_CDF4(9208, 20925, 26640) }, { AOM_CDF4(5018, 13351, 18732) }, { AOM_CDF4(27351, 32479, 32713) }, { AOM_CDF4(21398, 31209, 32387) }, { AOM_CDF4(12162, 25047, 29842) }, { AOM_CDF4(7896, 18691, 25319) }, { AOM_CDF4(4670, 12882, 18881) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(5487, 10460, 13708) }, { AOM_CDF4(21597, 28303, 30674) }, { AOM_CDF4(11037, 21953, 26476) }, { AOM_CDF4(8147, 17962, 22952) }, { AOM_CDF4(5242, 13061, 18532) }, { AOM_CDF4(1889, 5208, 8182) }, { AOM_CDF4(26774, 32133, 32590) }, { AOM_CDF4(17844, 29564, 31767) }, { AOM_CDF4(11690, 24438, 29171) }, { AOM_CDF4(7542, 18215, 24459) }, { AOM_CDF4(2993, 8050, 12319) }, { AOM_CDF4(28023, 32328, 32591) }, { AOM_CDF4(18651, 30126, 31954) }, { AOM_CDF4(12164, 25146, 29589) }, { AOM_CDF4(7762, 18530, 24771) }, { AOM_CDF4(3492, 9183, 13920) }, { AOM_CDF4(27591, 32008, 32491) }, { AOM_CDF4(17149, 28853, 31510) }, { AOM_CDF4(11485, 24003, 28860) }, { AOM_CDF4(7697, 18086, 24210) }, { AOM_CDF4(3075, 7999, 12218) }, { AOM_CDF4(28268, 32482, 32654) }, { AOM_CDF4(19631, 31051, 32404) }, { AOM_CDF4(13860, 27260, 31020) }, { AOM_CDF4(9605, 21613, 27594) }, { AOM_CDF4(4876, 12162, 17908) }, { AOM_CDF4(27248, 32316, 32576) }, { AOM_CDF4(18955, 30457, 32075) }, { AOM_CDF4(11824, 23997, 28795) }, { AOM_CDF4(7346, 18196, 24647) }, { AOM_CDF4(3403, 9247, 14111) }, { AOM_CDF4(29711, 32655, 32735) }, { AOM_CDF4(21169, 31394, 32417) }, { AOM_CDF4(13487, 27198, 30957) }, { AOM_CDF4(8828, 21683, 27614) }, { AOM_CDF4(4270, 11451, 17038) }, { AOM_CDF4(28708, 32578, 32731) }, { AOM_CDF4(20120, 31241, 32482) }, { AOM_CDF4(13692, 27550, 31321) }, { AOM_CDF4(9418, 22514, 28439) }, { AOM_CDF4(4999, 13283, 19462) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(5673, 14302, 19711) }, { AOM_CDF4(26251, 30701, 31834) }, { AOM_CDF4(12782, 23783, 27803) }, { AOM_CDF4(9127, 20657, 25808) }, { AOM_CDF4(6368, 16208, 21462) }, { AOM_CDF4(2465, 7177, 10822) }, { AOM_CDF4(29961, 32563, 32719) }, { AOM_CDF4(18318, 29891, 31949) }, { AOM_CDF4(11361, 24514, 29357) }, { AOM_CDF4(7900, 19603, 25607) }, { AOM_CDF4(4002, 10590, 15546) }, { AOM_CDF4(29637, 32310, 32595) }, { AOM_CDF4(18296, 29913, 31809) }, { AOM_CDF4(10144, 21515, 26871) }, { AOM_CDF4(5358, 14322, 20394) }, { AOM_CDF4(3067, 8362, 13346) }, { AOM_CDF4(28652, 32470, 32676) }, { AOM_CDF4(17538, 30771, 32209) }, { AOM_CDF4(13924, 26882, 30494) }, { AOM_CDF4(10496, 22837, 27869) }, { AOM_CDF4(7236, 16396, 21621) }, { AOM_CDF4(30743, 32687, 32746) }, { AOM_CDF4(23006, 31676, 32489) }, { AOM_CDF4(14494, 27828, 31120) }, { AOM_CDF4(10174, 22801, 28352) }, { AOM_CDF4(6242, 15281, 21043) }, { AOM_CDF4(25817, 32243, 32720) }, { AOM_CDF4(18618, 31367, 32325) }, { AOM_CDF4(13997, 28318, 31878) }, { AOM_CDF4(12255, 26534, 31383) }, { AOM_CDF4(9561, 21588, 28450) }, { AOM_CDF4(28188, 32635, 32724) }, { AOM_CDF4(22060, 32365, 32728) }, { AOM_CDF4(18102, 30690, 32528) }, { AOM_CDF4(14196, 28864, 31999) }, { AOM_CDF4(12262, 25792, 30865) }, { AOM_CDF4(24176, 32109, 32628) }, { AOM_CDF4(18280, 29681, 31963) }, { AOM_CDF4(10205, 23703, 29664) }, { AOM_CDF4(7889, 20025, 27676) }, { AOM_CDF4(6060, 16743, 23970) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(5141, 7096, 8260) }, { AOM_CDF4(27186, 29022, 29789) }, { 
AOM_CDF4(6668, 12568, 15682) }, { AOM_CDF4(2172, 6181, 8638) }, { AOM_CDF4(1126, 3379, 4531) }, { AOM_CDF4(443, 1361, 2254) }, { AOM_CDF4(26083, 31153, 32436) }, { AOM_CDF4(13486, 24603, 28483) }, { AOM_CDF4(6508, 14840, 19910) }, { AOM_CDF4(3386, 8800, 13286) }, { AOM_CDF4(1530, 4322, 7054) }, { AOM_CDF4(29639, 32080, 32548) }, { AOM_CDF4(15897, 27552, 30290) }, { AOM_CDF4(8588, 20047, 25383) }, { AOM_CDF4(4889, 13339, 19269) }, { AOM_CDF4(2240, 6871, 10498) }, { AOM_CDF4(28165, 32197, 32517) }, { AOM_CDF4(20735, 30427, 31568) }, { AOM_CDF4(14325, 24671, 27692) }, { AOM_CDF4(5119, 12554, 17805) }, { AOM_CDF4(1810, 5441, 8261) }, { AOM_CDF4(31212, 32724, 32748) }, { AOM_CDF4(23352, 31766, 32545) }, { AOM_CDF4(14669, 27570, 31059) }, { AOM_CDF4(8492, 20894, 27272) }, { AOM_CDF4(3644, 10194, 15204) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(2461, 7013, 9371) }, { AOM_CDF4(24749, 29600, 30986) }, { AOM_CDF4(9466, 19037, 22417) }, { AOM_CDF4(3584, 9280, 14400) }, { AOM_CDF4(1505, 3929, 5433) }, { AOM_CDF4(677, 1500, 2736) }, { AOM_CDF4(23987, 30702, 32117) }, { AOM_CDF4(13554, 24571, 29263) }, { AOM_CDF4(6211, 14556, 21155) }, { AOM_CDF4(3135, 10972, 15625) }, { AOM_CDF4(2435, 7127, 11427) }, { AOM_CDF4(31300, 32532, 32550) }, { AOM_CDF4(14757, 30365, 31954) }, { AOM_CDF4(4405, 11612, 18553) }, { AOM_CDF4(580, 4132, 7322) }, { AOM_CDF4(1695, 10169, 14124) }, { AOM_CDF4(30008, 32282, 32591) }, { AOM_CDF4(19244, 30108, 31748) }, { AOM_CDF4(11180, 24158, 29555) }, { AOM_CDF4(5650, 14972, 19209) }, { AOM_CDF4(2114, 5109, 8456) }, { AOM_CDF4(31856, 32716, 32748) }, { AOM_CDF4(23012, 31664, 32572) }, { AOM_CDF4(13694, 26656, 30636) }, { AOM_CDF4(8142, 19508, 26093) }, { AOM_CDF4(4253, 10955, 16724) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(601, 983, 1311) }, { AOM_CDF4(18725, 23406, 28087) }, { AOM_CDF4(5461, 8192, 10923) }, { AOM_CDF4(3781, 15124, 21425) }, { AOM_CDF4(2587, 7761, 12072) }, { AOM_CDF4(106, 458, 810) }, { AOM_CDF4(22282, 29710, 31894) }, { AOM_CDF4(8508, 20926, 25984) }, { AOM_CDF4(3726, 12713, 18083) }, { AOM_CDF4(1620, 7112, 10893) }, { AOM_CDF4(729, 2236, 3495) }, { AOM_CDF4(30163, 32474, 32684) }, { AOM_CDF4(18304, 30464, 32000) }, { AOM_CDF4(11443, 26526, 29647) }, { AOM_CDF4(6007, 15292, 21299) }, { AOM_CDF4(2234, 6703, 8937) }, { AOM_CDF4(30954, 32177, 32571) }, { AOM_CDF4(17363, 29562, 31076) }, { AOM_CDF4(9686, 22464, 27410) }, { AOM_CDF4(8192, 16384, 21390) }, { AOM_CDF4(1755, 8046, 11264) }, { AOM_CDF4(31168, 32734, 32748) }, { AOM_CDF4(22486, 31441, 
32471) }, { AOM_CDF4(12833, 25627, 29738) }, { AOM_CDF4(6980, 17379, 23122) }, { AOM_CDF4(3111, 8887, 13479) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(6041, 11854, 15927) }, { AOM_CDF4(20326, 30905, 32251) }, { AOM_CDF4(14164, 26831, 30725) }, { AOM_CDF4(9760, 20647, 26585) }, { AOM_CDF4(6416, 14953, 21219) }, { AOM_CDF4(2966, 7151, 10891) }, { AOM_CDF4(23567, 31374, 32254) }, { AOM_CDF4(14978, 27416, 30946) }, { AOM_CDF4(9434, 20225, 26254) }, { AOM_CDF4(6658, 14558, 20535) }, { AOM_CDF4(3916, 8677, 12989) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(18088, 29545, 31587) }, { AOM_CDF4(13062, 25843, 30073) }, { AOM_CDF4(8940, 16827, 22251) }, { AOM_CDF4(7654, 13220, 17973) }, { AOM_CDF4(5733, 10316, 14456) }, { AOM_CDF4(22879, 31388, 32114) }, { AOM_CDF4(15215, 27993, 30955) }, { AOM_CDF4(9397, 19445, 24978) }, { AOM_CDF4(3442, 9813, 15344) }, { AOM_CDF4(1368, 3936, 6532) }, { AOM_CDF4(25494, 32033, 32406) }, { AOM_CDF4(16772, 27963, 30718) }, { AOM_CDF4(9419, 18165, 23260) }, { AOM_CDF4(2677, 7501, 11797) }, { AOM_CDF4(1516, 4344, 7170) }, { AOM_CDF4(26556, 31454, 32101) }, { AOM_CDF4(17128, 27035, 30108) }, { AOM_CDF4(8324, 15344, 20249) }, { AOM_CDF4(1903, 5696, 9469) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8455, 
19003, 24368) }, { AOM_CDF4(23563, 32021, 32604) }, { AOM_CDF4(16237, 29446, 31935) }, { AOM_CDF4(10724, 23999, 29358) }, { AOM_CDF4(6725, 17528, 24416) }, { AOM_CDF4(3927, 10927, 16825) }, { AOM_CDF4(26313, 32288, 32634) }, { AOM_CDF4(17430, 30095, 32095) }, { AOM_CDF4(11116, 24606, 29679) }, { AOM_CDF4(7195, 18384, 25269) }, { AOM_CDF4(4726, 12852, 19315) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(22822, 31648, 32483) }, { AOM_CDF4(16724, 29633, 31929) }, { AOM_CDF4(10261, 23033, 28725) }, { AOM_CDF4(7029, 17840, 24528) }, { AOM_CDF4(4867, 13886, 21502) }, { AOM_CDF4(25298, 31892, 32491) }, { AOM_CDF4(17809, 29330, 31512) }, { AOM_CDF4(9668, 21329, 26579) }, { AOM_CDF4(4774, 12956, 18976) }, { AOM_CDF4(2322, 7030, 11540) }, { AOM_CDF4(25472, 31920, 32543) }, { AOM_CDF4(17957, 29387, 31632) }, { AOM_CDF4(9196, 20593, 26400) }, { AOM_CDF4(4680, 12705, 19202) }, { AOM_CDF4(2917, 8456, 13436) }, { AOM_CDF4(26471, 32059, 32574) }, { AOM_CDF4(18458, 29783, 31909) }, { AOM_CDF4(8400, 19464, 25956) }, { AOM_CDF4(3812, 10973, 17206) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(6779, 13743, 17678) }, { AOM_CDF4(24806, 31797, 32457) }, { AOM_CDF4(17616, 29047, 31372) }, { AOM_CDF4(11063, 23175, 28003) }, { AOM_CDF4(6521, 16110, 22324) }, { AOM_CDF4(2764, 7504, 11654) }, { AOM_CDF4(25266, 32367, 32637) }, { AOM_CDF4(19054, 30553, 32175) }, { AOM_CDF4(12139, 25212, 29807) }, { AOM_CDF4(7311, 18162, 24704) }, { AOM_CDF4(3397, 9164, 14074) }, { AOM_CDF4(25988, 32208, 32522) }, { AOM_CDF4(16253, 28912, 31526) }, { AOM_CDF4(9151, 21387, 27372) }, { AOM_CDF4(5688, 14915, 21496) }, { AOM_CDF4(2717, 7627, 12004) }, { AOM_CDF4(23144, 31855, 32443) }, { AOM_CDF4(16070, 28491, 31325) }, { AOM_CDF4(8702, 20467, 26517) }, { AOM_CDF4(5243, 13956, 20367) }, { AOM_CDF4(2621, 7335, 11567) }, { AOM_CDF4(26636, 32340, 32630) }, { AOM_CDF4(19990, 31050, 32341) }, { AOM_CDF4(13243, 26105, 30315) }, { AOM_CDF4(8588, 19521, 25918) }, { AOM_CDF4(4717, 11585, 17304) }, { AOM_CDF4(25844, 32292, 32582) }, { AOM_CDF4(19090, 30635, 32097) }, { AOM_CDF4(11963, 24546, 28939) }, { AOM_CDF4(6218, 16087, 22354) }, { AOM_CDF4(2340, 6608, 10426) }, { AOM_CDF4(28046, 32576, 32694) }, { AOM_CDF4(21178, 31313, 32296) }, { AOM_CDF4(13486, 26184, 29870) }, { AOM_CDF4(7149, 17871, 23723) }, { AOM_CDF4(2833, 7958, 12259) }, { AOM_CDF4(27710, 32528, 32686) }, { AOM_CDF4(20674, 31076, 32268) }, { AOM_CDF4(12413, 24955, 29243) }, { AOM_CDF4(6676, 16927, 23097) }, { AOM_CDF4(2966, 8333, 12919) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8639, 19339, 24429) }, { AOM_CDF4(24404, 31837, 32525) }, { AOM_CDF4(16997, 29425, 31784) }, { AOM_CDF4(11253, 24234, 29149) }, { AOM_CDF4(6751, 17394, 24028) }, { AOM_CDF4(3490, 9830, 15191) }, { AOM_CDF4(26283, 32471, 32714) }, { AOM_CDF4(19599, 31168, 32442) }, { AOM_CDF4(13146, 26954, 30893) }, { AOM_CDF4(8214, 20588, 26890) }, { AOM_CDF4(4699, 13081, 19300) }, { AOM_CDF4(28212, 32458, 32669) }, { AOM_CDF4(18594, 30316, 32100) }, { AOM_CDF4(11219, 24408, 29234) }, { AOM_CDF4(6865, 17656, 24149) }, { AOM_CDF4(3678, 10362, 16006) }, { AOM_CDF4(25825, 32136, 32616) }, { AOM_CDF4(17313, 29853, 32021) }, { AOM_CDF4(11197, 24471, 29472) }, { AOM_CDF4(6947, 
17781, 24405) }, { AOM_CDF4(3768, 10660, 16261) }, { AOM_CDF4(27352, 32500, 32706) }, { AOM_CDF4(20850, 31468, 32469) }, { AOM_CDF4(14021, 27707, 31133) }, { AOM_CDF4(8964, 21748, 27838) }, { AOM_CDF4(5437, 14665, 21187) }, { AOM_CDF4(26304, 32492, 32698) }, { AOM_CDF4(20409, 31380, 32385) }, { AOM_CDF4(13682, 27222, 30632) }, { AOM_CDF4(8974, 21236, 26685) }, { AOM_CDF4(4234, 11665, 16934) }, { AOM_CDF4(26273, 32357, 32711) }, { AOM_CDF4(20672, 31242, 32441) }, { AOM_CDF4(14172, 27254, 30902) }, { AOM_CDF4(9870, 21898, 27275) }, { AOM_CDF4(5164, 13506, 19270) }, { AOM_CDF4(26725, 32459, 32728) }, { AOM_CDF4(20991, 31442, 32527) }, { AOM_CDF4(13071, 26434, 30811) }, { AOM_CDF4(8184, 20090, 26742) }, { AOM_CDF4(4803, 13255, 19895) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(7555, 14942, 18501) }, { AOM_CDF4(24410, 31178, 32287) }, { AOM_CDF4(14394, 26738, 30253) }, { AOM_CDF4(8413, 19554, 25195) }, { AOM_CDF4(4766, 12924, 18785) }, { AOM_CDF4(2029, 5806, 9207) }, { AOM_CDF4(26776, 32364, 32663) }, { AOM_CDF4(18732, 29967, 31931) }, { AOM_CDF4(11005, 23786, 28852) }, { AOM_CDF4(6466, 16909, 23510) }, { AOM_CDF4(3044, 8638, 13419) }, { AOM_CDF4(29208, 32582, 32704) }, { AOM_CDF4(20068, 30857, 32208) }, { AOM_CDF4(12003, 25085, 29595) }, { AOM_CDF4(6947, 17750, 24189) }, { AOM_CDF4(3245, 9103, 14007) }, { AOM_CDF4(27359, 32465, 32669) }, { AOM_CDF4(19421, 30614, 32174) }, { AOM_CDF4(11915, 25010, 29579) }, { AOM_CDF4(6950, 17676, 24074) }, { AOM_CDF4(3007, 8473, 13096) }, { AOM_CDF4(29002, 32676, 32735) }, { AOM_CDF4(22102, 31849, 32576) }, { AOM_CDF4(14408, 28009, 31405) }, { AOM_CDF4(9027, 21679, 27931) }, { AOM_CDF4(4694, 12678, 18748) }, { AOM_CDF4(28216, 32528, 32682) }, { AOM_CDF4(20849, 31264, 32318) }, { AOM_CDF4(12756, 25815, 29751) }, { AOM_CDF4(7565, 18801, 24923) }, { AOM_CDF4(3509, 9533, 14477) }, { AOM_CDF4(30133, 32687, 32739) }, { AOM_CDF4(23063, 31910, 32515) }, { AOM_CDF4(14588, 28051, 31132) }, { AOM_CDF4(9085, 21649, 27457) }, { AOM_CDF4(4261, 11654, 17264) }, { AOM_CDF4(29518, 32691, 32748) }, { AOM_CDF4(22451, 31959, 32613) }, { AOM_CDF4(14864, 28722, 31700) }, { AOM_CDF4(9695, 22964, 28716) }, { AOM_CDF4(4932, 13358, 19502) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(6465, 16958, 21688) }, { AOM_CDF4(25199, 31514, 32360) }, { AOM_CDF4(14774, 27149, 30607) }, { AOM_CDF4(9257, 21438, 26972) }, { AOM_CDF4(5723, 15183, 21882) }, { AOM_CDF4(3150, 8879, 13731) }, { AOM_CDF4(26989, 32262, 32682) }, { AOM_CDF4(17396, 29937, 32085) }, { AOM_CDF4(11387, 24901, 29784) }, { AOM_CDF4(7289, 18821, 25548) }, { AOM_CDF4(3734, 10577, 16086) }, { AOM_CDF4(29728, 32501, 32695) }, { AOM_CDF4(17431, 29701, 31903) }, { AOM_CDF4(9921, 22826, 28300) }, { AOM_CDF4(5896, 15434, 22068) }, { AOM_CDF4(3430, 9646, 14757) }, { AOM_CDF4(28614, 32511, 32705) }, { AOM_CDF4(19364, 30638, 32263) }, { AOM_CDF4(13129, 26254, 30402) }, { AOM_CDF4(8754, 20484, 26440) }, { AOM_CDF4(4378, 11607, 17110) }, { AOM_CDF4(30292, 32671, 32744) }, { AOM_CDF4(21780, 31603, 32501) }, { AOM_CDF4(14314, 27829, 31291) }, { AOM_CDF4(9611, 22327, 28263) }, { AOM_CDF4(4890, 13087, 19065) }, { AOM_CDF4(25862, 32567, 32733) }, { AOM_CDF4(20794, 32050, 32567) }, { AOM_CDF4(17243, 30625, 32254) }, { AOM_CDF4(13283, 27628, 31474) }, { AOM_CDF4(9669, 22532, 28918) }, { AOM_CDF4(27435, 32697, 32748) }, { AOM_CDF4(24922, 32390, 32714) }, { AOM_CDF4(21449, 31504, 32536) }, { AOM_CDF4(16392, 29729, 31832) }, { AOM_CDF4(11692, 24884, 29076) }, { AOM_CDF4(24193, 32290, 32735) }, { AOM_CDF4(18909, 31104, 32563) }, { 
AOM_CDF4(12236, 26841, 31403) }, { AOM_CDF4(8171, 21840, 29082) }, { AOM_CDF4(7224, 17280, 25275) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(3078, 6839, 9890) }, { AOM_CDF4(13837, 20450, 24479) }, { AOM_CDF4(5914, 14222, 19328) }, { AOM_CDF4(3866, 10267, 14762) }, { AOM_CDF4(2612, 7208, 11042) }, { AOM_CDF4(1067, 2991, 4776) }, { AOM_CDF4(25817, 31646, 32529) }, { AOM_CDF4(13708, 26338, 30385) }, { AOM_CDF4(7328, 18585, 24870) }, { AOM_CDF4(4691, 13080, 19276) }, { AOM_CDF4(1825, 5253, 8352) }, { AOM_CDF4(29386, 32315, 32624) }, { AOM_CDF4(17160, 29001, 31360) }, { AOM_CDF4(9602, 21862, 27396) }, { AOM_CDF4(5915, 15772, 22148) }, { AOM_CDF4(2786, 7779, 12047) }, { AOM_CDF4(29246, 32450, 32663) }, { AOM_CDF4(18696, 29929, 31818) }, { AOM_CDF4(10510, 23369, 28560) }, { AOM_CDF4(6229, 16499, 23125) }, { AOM_CDF4(2608, 7448, 11705) }, { AOM_CDF4(30753, 32710, 32748) }, { AOM_CDF4(21638, 31487, 32503) }, { AOM_CDF4(12937, 26854, 30870) }, { AOM_CDF4(8182, 20596, 26970) }, { AOM_CDF4(3637, 10269, 15497) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(5244, 12150, 16906) }, { AOM_CDF4(20486, 26858, 29701) }, { AOM_CDF4(7756, 18317, 23735) }, { AOM_CDF4(3452, 9256, 13146) }, { AOM_CDF4(2020, 5206, 8229) }, { AOM_CDF4(1801, 4993, 7903) }, { AOM_CDF4(27051, 31858, 32531) }, { AOM_CDF4(15988, 27531, 30619) }, { AOM_CDF4(9188, 21484, 26719) }, { AOM_CDF4(6273, 17186, 23800) }, { AOM_CDF4(3108, 9355, 14764) }, { AOM_CDF4(31076, 32520, 32680) }, { AOM_CDF4(18119, 30037, 31850) }, { AOM_CDF4(10244, 22969, 27472) }, { AOM_CDF4(4692, 14077, 19273) }, { AOM_CDF4(3694, 11677, 17556) }, { AOM_CDF4(30060, 32581, 32720) }, { AOM_CDF4(21011, 30775, 32120) }, { AOM_CDF4(11931, 24820, 29289) }, { AOM_CDF4(7119, 17662, 24356) }, { AOM_CDF4(3833, 10706, 16304) }, { AOM_CDF4(31954, 32731, 32748) }, { AOM_CDF4(23913, 31724, 32489) }, { AOM_CDF4(15520, 28060, 31286) }, { AOM_CDF4(11517, 23008, 28571) }, { AOM_CDF4(6193, 14508, 20629) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(1035, 2807, 4156) }, { AOM_CDF4(13162, 18138, 20939) }, { AOM_CDF4(2696, 6633, 8755) }, { AOM_CDF4(1373, 4161, 6853) }, { AOM_CDF4(1099, 2746, 4716) }, { AOM_CDF4(340, 1021, 1599) }, { AOM_CDF4(22826, 30419, 32135) }, { AOM_CDF4(10395, 21762, 26942) }, { AOM_CDF4(4726, 12407, 17361) }, { AOM_CDF4(2447, 7080, 10593) }, { AOM_CDF4(1227, 3717, 6011) }, { AOM_CDF4(28156, 31424, 31934) }, { AOM_CDF4(16915, 27754, 30373) }, { AOM_CDF4(9148, 20990, 26431) }, { AOM_CDF4(5950, 15515, 21148) }, { AOM_CDF4(2492, 7327, 11526) }, 
{ AOM_CDF4(30602, 32477, 32670) }, { AOM_CDF4(20026, 29955, 31568) }, { AOM_CDF4(11220, 23628, 28105) }, { AOM_CDF4(6652, 17019, 22973) }, { AOM_CDF4(3064, 8536, 13043) }, { AOM_CDF4(31769, 32724, 32748) }, { AOM_CDF4(22230, 30887, 32373) }, { AOM_CDF4(12234, 25079, 29731) }, { AOM_CDF4(7326, 18816, 25353) }, { AOM_CDF4(3933, 10907, 16616) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(8896, 16227, 20630) }, { AOM_CDF4(23629, 31782, 32527) }, { AOM_CDF4(15173, 27755, 31321) }, { AOM_CDF4(10158, 21233, 27382) }, { AOM_CDF4(6420, 14857, 21558) }, { AOM_CDF4(3269, 8155, 12646) }, { AOM_CDF4(24835, 32009, 32496) }, { AOM_CDF4(16509, 28421, 31579) }, { AOM_CDF4(10957, 21514, 27418) }, { AOM_CDF4(7881, 15930, 22096) }, { AOM_CDF4(5388, 10960, 15918) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(20745, 30773, 32093) }, { AOM_CDF4(15200, 27221, 30861) }, { AOM_CDF4(13032, 20873, 25667) }, { AOM_CDF4(12285, 18663, 23494) }, { AOM_CDF4(11563, 17481, 21489) }, { AOM_CDF4(26260, 31982, 32320) }, { AOM_CDF4(15397, 28083, 31100) }, { AOM_CDF4(9742, 19217, 24824) }, { AOM_CDF4(3261, 9629, 15362) }, { AOM_CDF4(1480, 4322, 7499) }, { AOM_CDF4(27599, 32256, 32460) }, { AOM_CDF4(16857, 27659, 30774) }, { AOM_CDF4(9551, 18290, 23748) }, { AOM_CDF4(3052, 8933, 14103) }, { AOM_CDF4(2021, 
5910, 9787) }, { AOM_CDF4(29005, 32015, 32392) }, { AOM_CDF4(17677, 27694, 30863) }, { AOM_CDF4(9204, 17356, 23219) }, { AOM_CDF4(2403, 7516, 12814) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(10808, 22056, 26896) }, { AOM_CDF4(25739, 32313, 32676) }, { AOM_CDF4(17288, 30203, 32221) }, { AOM_CDF4(11359, 24878, 29896) }, { AOM_CDF4(6949, 17767, 24893) }, { AOM_CDF4(4287, 11796, 18071) }, { AOM_CDF4(27880, 32521, 32705) }, { AOM_CDF4(19038, 31004, 32414) }, { AOM_CDF4(12564, 26345, 30768) }, { AOM_CDF4(8269, 19947, 26779) }, { AOM_CDF4(5674, 14657, 21674) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(25742, 32319, 32671) }, { AOM_CDF4(19557, 31164, 32454) }, { AOM_CDF4(13381, 26381, 30755) }, { AOM_CDF4(10101, 21466, 26722) }, { AOM_CDF4(9209, 19650, 26825) }, { AOM_CDF4(27107, 31917, 32432) }, { AOM_CDF4(18056, 28893, 31203) }, { AOM_CDF4(10200, 21434, 26764) }, { AOM_CDF4(4660, 12913, 19502) }, { AOM_CDF4(2368, 6930, 12504) }, { AOM_CDF4(26960, 32158, 32613) }, { AOM_CDF4(18628, 30005, 32031) }, { AOM_CDF4(10233, 22442, 28232) }, { AOM_CDF4(5471, 14630, 21516) }, { AOM_CDF4(3235, 10767, 17109) }, { AOM_CDF4(27696, 32440, 32692) }, { AOM_CDF4(20032, 31167, 32438) }, { AOM_CDF4(8700, 21341, 28442) }, { AOM_CDF4(5662, 14831, 21795) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(9704, 17294, 21132) }, { AOM_CDF4(26762, 32278, 32633) }, { AOM_CDF4(18382, 29620, 31819) }, { AOM_CDF4(10891, 23475, 28723) }, { AOM_CDF4(6358, 16583, 23309) }, { AOM_CDF4(3248, 9118, 14141) }, { AOM_CDF4(27204, 32573, 32699) }, { AOM_CDF4(19818, 30824, 32329) }, { AOM_CDF4(11772, 25120, 30041) }, { AOM_CDF4(6995, 18033, 25039) }, { AOM_CDF4(3752, 10442, 16098) }, { AOM_CDF4(27222, 32256, 32559) }, { AOM_CDF4(15356, 28399, 31475) }, { AOM_CDF4(8821, 20635, 27057) }, { AOM_CDF4(5511, 14404, 21239) }, { AOM_CDF4(2935, 8222, 13051) }, { AOM_CDF4(24875, 32120, 32529) }, { AOM_CDF4(15233, 28265, 31445) }, { AOM_CDF4(8605, 20570, 26932) }, { AOM_CDF4(5431, 14413, 21196) }, { AOM_CDF4(2994, 8341, 13223) }, { AOM_CDF4(28201, 32604, 32700) }, { AOM_CDF4(21041, 31446, 32456) }, { AOM_CDF4(13221, 26213, 30475) }, { AOM_CDF4(8255, 19385, 26037) }, { AOM_CDF4(4930, 12585, 18830) }, { AOM_CDF4(28768, 32448, 32627) }, { AOM_CDF4(19705, 30561, 32021) }, { AOM_CDF4(11572, 23589, 28220) }, { AOM_CDF4(5532, 15034, 21446) }, { AOM_CDF4(2460, 7150, 11456) }, { AOM_CDF4(29874, 32619, 32699) }, { AOM_CDF4(21621, 31071, 32201) }, { AOM_CDF4(12511, 24747, 28992) }, { AOM_CDF4(6281, 16395, 22748) }, { AOM_CDF4(3246, 9278, 14497) }, { AOM_CDF4(29715, 32625, 32712) }, { AOM_CDF4(20958, 31011, 32283) }, { AOM_CDF4(11233, 23671, 28806) }, { AOM_CDF4(6012, 16128, 22868) }, { AOM_CDF4(3427, 9851, 15414) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(11016, 22111, 26794) }, { AOM_CDF4(25946, 32357, 32677) }, { AOM_CDF4(17890, 30452, 32252) }, { AOM_CDF4(11678, 25142, 29816) }, { AOM_CDF4(6720, 17534, 24584) }, { AOM_CDF4(4230, 11665, 17820) }, { AOM_CDF4(28400, 32623, 32747) }, { AOM_CDF4(21164, 31668, 32575) }, { AOM_CDF4(13572, 27388, 31182) }, { AOM_CDF4(8234, 20750, 27358) }, { AOM_CDF4(5065, 14055, 20897) }, { AOM_CDF4(28981, 32547, 32705) }, { 
AOM_CDF4(18681, 30543, 32239) }, { AOM_CDF4(10919, 24075, 29286) }, { AOM_CDF4(6431, 17199, 24077) }, { AOM_CDF4(3819, 10464, 16618) }, { AOM_CDF4(26870, 32467, 32693) }, { AOM_CDF4(19041, 30831, 32347) }, { AOM_CDF4(11794, 25211, 30016) }, { AOM_CDF4(6888, 18019, 24970) }, { AOM_CDF4(4370, 12363, 18992) }, { AOM_CDF4(29578, 32670, 32744) }, { AOM_CDF4(23159, 32007, 32613) }, { AOM_CDF4(15315, 28669, 31676) }, { AOM_CDF4(9298, 22607, 28782) }, { AOM_CDF4(6144, 15913, 22968) }, { AOM_CDF4(28110, 32499, 32669) }, { AOM_CDF4(21574, 30937, 32015) }, { AOM_CDF4(12759, 24818, 28727) }, { AOM_CDF4(6545, 16761, 23042) }, { AOM_CDF4(3649, 10597, 16833) }, { AOM_CDF4(28163, 32552, 32728) }, { AOM_CDF4(22101, 31469, 32464) }, { AOM_CDF4(13160, 25472, 30143) }, { AOM_CDF4(7303, 18684, 25468) }, { AOM_CDF4(5241, 13975, 20955) }, { AOM_CDF4(28400, 32631, 32744) }, { AOM_CDF4(22104, 31793, 32603) }, { AOM_CDF4(13557, 26571, 30846) }, { AOM_CDF4(7749, 19861, 26675) }, { AOM_CDF4(4873, 14030, 21234) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(9800, 17635, 21073) }, { AOM_CDF4(26153, 31885, 32527) }, { AOM_CDF4(15038, 27852, 31006) }, { AOM_CDF4(8718, 20564, 26486) }, { AOM_CDF4(5128, 14076, 20514) }, { AOM_CDF4(2636, 7566, 11925) }, { AOM_CDF4(27551, 32504, 32701) }, { AOM_CDF4(18310, 30054, 32100) }, { AOM_CDF4(10211, 23420, 29082) }, { AOM_CDF4(6222, 16876, 23916) }, { AOM_CDF4(3462, 9954, 15498) }, { AOM_CDF4(29991, 32633, 32721) }, { AOM_CDF4(19883, 30751, 32201) }, { AOM_CDF4(11141, 24184, 29285) }, { AOM_CDF4(6420, 16940, 23774) }, { AOM_CDF4(3392, 9753, 15118) }, { AOM_CDF4(28465, 32616, 32712) }, { AOM_CDF4(19850, 30702, 32244) }, { AOM_CDF4(10983, 24024, 29223) }, { AOM_CDF4(6294, 16770, 23582) }, { AOM_CDF4(3244, 9283, 14509) }, { AOM_CDF4(30023, 32717, 32748) }, { AOM_CDF4(22940, 32032, 32626) }, { AOM_CDF4(14282, 27928, 31473) }, { AOM_CDF4(8562, 21327, 27914) }, { AOM_CDF4(4846, 13393, 19919) }, { AOM_CDF4(29981, 32590, 32695) }, { AOM_CDF4(20465, 30963, 32166) }, { AOM_CDF4(11479, 23579, 28195) }, { AOM_CDF4(5916, 15648, 22073) }, { AOM_CDF4(3031, 8605, 13398) }, { AOM_CDF4(31146, 32691, 32739) }, { AOM_CDF4(23106, 31724, 32444) }, { AOM_CDF4(13783, 26738, 30439) }, { AOM_CDF4(7852, 19468, 25807) }, { AOM_CDF4(3860, 11124, 16853) }, { AOM_CDF4(31014, 32724, 32748) }, { AOM_CDF4(23629, 32109, 32628) }, { AOM_CDF4(14747, 28115, 31403) }, { AOM_CDF4(8545, 21242, 27478) }, { AOM_CDF4(4574, 12781, 19067) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(9185, 19694, 24688) }, { AOM_CDF4(26081, 31985, 32621) }, { AOM_CDF4(16015, 29000, 31787) }, { AOM_CDF4(10542, 23690, 29206) }, { AOM_CDF4(6732, 17945, 24677) }, { AOM_CDF4(3916, 11039, 16722) }, { AOM_CDF4(28224, 32566, 32744) }, { AOM_CDF4(19100, 31138, 32485) }, { AOM_CDF4(12528, 26620, 30879) }, { AOM_CDF4(7741, 20277, 26885) }, { AOM_CDF4(4566, 12845, 18990) }, { AOM_CDF4(29933, 32593, 32718) }, { AOM_CDF4(17670, 30333, 32155) }, { AOM_CDF4(10385, 23600, 28909) }, { AOM_CDF4(6243, 16236, 22407) }, { AOM_CDF4(3976, 10389, 16017) }, { AOM_CDF4(28377, 32561, 32738) }, { AOM_CDF4(19366, 31175, 32482) }, { AOM_CDF4(13327, 27175, 31094) }, { AOM_CDF4(8258, 20769, 27143) }, { AOM_CDF4(4703, 13198, 19527) }, { AOM_CDF4(31086, 32706, 32748) }, { AOM_CDF4(22853, 31902, 32583) }, { AOM_CDF4(14759, 28186, 31419) }, { AOM_CDF4(9284, 22382, 28348) }, { AOM_CDF4(5585, 15192, 21868) }, { AOM_CDF4(28291, 32652, 32746) }, { AOM_CDF4(19849, 32107, 32571) }, { AOM_CDF4(14834, 26818, 29214) }, { AOM_CDF4(10306, 22594, 28672) }, { AOM_CDF4(6615, 
17384, 23384) }, { AOM_CDF4(28947, 32604, 32745) }, { AOM_CDF4(25625, 32289, 32646) }, { AOM_CDF4(18758, 28672, 31403) }, { AOM_CDF4(10017, 23430, 28523) }, { AOM_CDF4(6862, 15269, 22131) }, { AOM_CDF4(23933, 32509, 32739) }, { AOM_CDF4(19927, 31495, 32631) }, { AOM_CDF4(11903, 26023, 30621) }, { AOM_CDF4(7026, 20094, 27252) }, { AOM_CDF4(5998, 18106, 24437) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(4456, 11274, 15533) }, { AOM_CDF4(21219, 29079, 31616) }, { AOM_CDF4(11173, 23774, 28567) }, { AOM_CDF4(7282, 18293, 24263) }, { AOM_CDF4(4890, 13286, 19115) }, { AOM_CDF4(1890, 5508, 8659) }, { AOM_CDF4(26651, 32136, 32647) }, { AOM_CDF4(14630, 28254, 31455) }, { AOM_CDF4(8716, 21287, 27395) }, { AOM_CDF4(5615, 15331, 22008) }, { AOM_CDF4(2675, 7700, 12150) }, { AOM_CDF4(29954, 32526, 32690) }, { AOM_CDF4(16126, 28982, 31633) }, { AOM_CDF4(9030, 21361, 27352) }, { AOM_CDF4(5411, 14793, 21271) }, { AOM_CDF4(2943, 8422, 13163) }, { AOM_CDF4(29539, 32601, 32730) }, { AOM_CDF4(18125, 30385, 32201) }, { AOM_CDF4(10422, 24090, 29468) }, { AOM_CDF4(6468, 17487, 24438) }, { AOM_CDF4(2970, 8653, 13531) }, { AOM_CDF4(30912, 32715, 32748) }, { AOM_CDF4(20666, 31373, 32497) }, { AOM_CDF4(12509, 26640, 30917) }, { AOM_CDF4(8058, 20629, 27290) }, { AOM_CDF4(4231, 12006, 18052) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(10202, 20633, 25484) }, { AOM_CDF4(27336, 31445, 32352) }, { AOM_CDF4(12420, 24384, 28552) }, { AOM_CDF4(7648, 18115, 23856) }, { AOM_CDF4(5662, 14341, 19902) }, { AOM_CDF4(3611, 10328, 15390) }, { AOM_CDF4(30945, 32616, 32736) }, { AOM_CDF4(18682, 30505, 32253) }, { AOM_CDF4(11513, 25336, 30203) }, { AOM_CDF4(7449, 19452, 26148) }, { AOM_CDF4(4482, 13051, 18886) }, { AOM_CDF4(32022, 32690, 32747) }, { AOM_CDF4(18578, 30501, 32146) }, { AOM_CDF4(11249, 23368, 28631) }, { AOM_CDF4(5645, 16958, 22158) }, { AOM_CDF4(5009, 11444, 16637) }, { AOM_CDF4(31357, 32710, 32748) }, { AOM_CDF4(21552, 31494, 32504) }, { AOM_CDF4(13891, 27677, 31340) }, { AOM_CDF4(9051, 22098, 28172) }, { AOM_CDF4(5190, 13377, 19486) }, { AOM_CDF4(32364, 32740, 32748) }, { AOM_CDF4(24839, 31907, 32551) }, { AOM_CDF4(17160, 28779, 31696) }, { AOM_CDF4(12452, 24137, 29602) }, { AOM_CDF4(6165, 15389, 22477) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(2575, 7281, 11077) }, { AOM_CDF4(14002, 20866, 25402) }, { AOM_CDF4(6343, 15056, 19658) }, { AOM_CDF4(4474, 11858, 17041) }, { AOM_CDF4(2865, 8299, 12534) }, { AOM_CDF4(1344, 3949, 6391) }, { AOM_CDF4(24720, 31239, 32459) }, { AOM_CDF4(12585, 
25356, 29968) }, { AOM_CDF4(7181, 18246, 24444) }, { AOM_CDF4(5025, 13667, 19885) }, { AOM_CDF4(2521, 7304, 11605) }, { AOM_CDF4(29908, 32252, 32584) }, { AOM_CDF4(17421, 29156, 31575) }, { AOM_CDF4(9889, 22188, 27782) }, { AOM_CDF4(5878, 15647, 22123) }, { AOM_CDF4(2814, 8665, 13323) }, { AOM_CDF4(30183, 32568, 32713) }, { AOM_CDF4(18528, 30195, 32049) }, { AOM_CDF4(10982, 24606, 29657) }, { AOM_CDF4(6957, 18165, 25231) }, { AOM_CDF4(3508, 10118, 15468) }, { AOM_CDF4(31761, 32736, 32748) }, { AOM_CDF4(21041, 31328, 32546) }, { AOM_CDF4(12568, 26732, 31166) }, { AOM_CDF4(8052, 20720, 27733) }, { AOM_CDF4(4336, 12192, 18396) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } }, { { { { AOM_CDF4(7062, 16472, 22319) }, { AOM_CDF4(24538, 32261, 32674) }, { AOM_CDF4(13675, 28041, 31779) }, { AOM_CDF4(8590, 20674, 27631) }, { AOM_CDF4(5685, 14675, 22013) }, { AOM_CDF4(3655, 9898, 15731) }, { AOM_CDF4(26493, 32418, 32658) }, { AOM_CDF4(16376, 29342, 32090) }, { AOM_CDF4(10594, 22649, 28970) }, { AOM_CDF4(8176, 17170, 24303) }, { AOM_CDF4(5605, 12694, 19139) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(23888, 31902, 32542) }, { AOM_CDF4(18612, 29687, 31987) }, { AOM_CDF4(16245, 24852, 29249) }, { AOM_CDF4(15765, 22608, 27559) }, { AOM_CDF4(19895, 24699, 27510) }, { AOM_CDF4(28401, 32212, 32457) }, { 
AOM_CDF4(15274, 27825, 30980) }, { AOM_CDF4(9364, 18128, 24332) }, { AOM_CDF4(2283, 8193, 15082) }, { AOM_CDF4(1228, 3972, 7881) }, { AOM_CDF4(29455, 32469, 32620) }, { AOM_CDF4(17981, 28245, 31388) }, { AOM_CDF4(10921, 20098, 26240) }, { AOM_CDF4(3743, 11829, 18657) }, { AOM_CDF4(2374, 9593, 15715) }, { AOM_CDF4(31068, 32466, 32635) }, { AOM_CDF4(20321, 29572, 31971) }, { AOM_CDF4(10771, 20255, 27119) }, { AOM_CDF4(2795, 10410, 17361) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(9320, 22102, 27840) }, { AOM_CDF4(27057, 32464, 32724) }, { AOM_CDF4(16331, 30268, 32309) }, { AOM_CDF4(10319, 23935, 29720) }, { AOM_CDF4(6189, 16448, 24106) }, { AOM_CDF4(3589, 10884, 18808) }, { AOM_CDF4(29026, 32624, 32748) }, { AOM_CDF4(19226, 31507, 32587) }, { AOM_CDF4(12692, 26921, 31203) }, { AOM_CDF4(7049, 19532, 27635) }, { AOM_CDF4(7727, 15669, 23252) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(28056, 32625, 32748) }, { AOM_CDF4(22383, 32075, 32669) }, { AOM_CDF4(15417, 27098, 31749) }, { AOM_CDF4(18127, 26493, 27190) }, { AOM_CDF4(5461, 16384, 21845) }, { AOM_CDF4(27982, 32091, 32584) }, { AOM_CDF4(19045, 29868, 31972) }, { AOM_CDF4(10397, 22266, 27932) }, { AOM_CDF4(5990, 13697, 21500) }, { AOM_CDF4(1792, 6912, 15104) }, { AOM_CDF4(28198, 32501, 32718) }, { AOM_CDF4(21534, 31521, 32569) }, { AOM_CDF4(11109, 25217, 30017) }, { AOM_CDF4(5671, 15124, 26151) }, { AOM_CDF4(4681, 14043, 18725) }, { AOM_CDF4(28688, 32580, 32741) }, { AOM_CDF4(22576, 32079, 32661) }, { AOM_CDF4(10627, 22141, 28340) }, { AOM_CDF4(9362, 14043, 28087) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(7754, 16948, 22142) }, { AOM_CDF4(25670, 32330, 32691) }, { AOM_CDF4(15663, 29225, 31994) }, { AOM_CDF4(9878, 23288, 29158) }, { AOM_CDF4(6419, 17088, 24336) }, { AOM_CDF4(3859, 11003, 17039) }, { AOM_CDF4(27562, 32595, 32725) }, { AOM_CDF4(17575, 30588, 32399) }, { AOM_CDF4(10819, 24838, 30309) }, { AOM_CDF4(7124, 18686, 25916) }, { AOM_CDF4(4479, 12688, 19340) }, { AOM_CDF4(28385, 32476, 32673) }, { AOM_CDF4(15306, 29005, 31938) }, { AOM_CDF4(8937, 21615, 28322) }, { AOM_CDF4(5982, 15603, 22786) }, { AOM_CDF4(3620, 10267, 16136) }, { AOM_CDF4(27280, 32464, 32667) }, { AOM_CDF4(15607, 29160, 32004) }, { AOM_CDF4(9091, 22135, 28740) }, { AOM_CDF4(6232, 16632, 24020) }, { AOM_CDF4(4047, 11377, 17672) }, { AOM_CDF4(29220, 32630, 32718) }, { AOM_CDF4(19650, 31220, 32462) }, { AOM_CDF4(13050, 26312, 30827) }, { AOM_CDF4(9228, 20870, 27468) }, { AOM_CDF4(6146, 15149, 21971) }, { AOM_CDF4(30169, 32481, 32623) }, { AOM_CDF4(17212, 29311, 31554) }, { AOM_CDF4(9911, 21311, 26882) }, { AOM_CDF4(4487, 13314, 20372) }, { AOM_CDF4(2570, 7772, 12889) }, { AOM_CDF4(30924, 32613, 32708) }, { AOM_CDF4(19490, 30206, 32107) }, { AOM_CDF4(11232, 23998, 29276) }, { AOM_CDF4(6769, 17955, 25035) }, { AOM_CDF4(4398, 12623, 19214) }, { AOM_CDF4(30609, 32627, 32722) }, { AOM_CDF4(19370, 30582, 32287) }, { AOM_CDF4(10457, 23619, 29409) }, { AOM_CDF4(6443, 17637, 24834) }, { AOM_CDF4(4645, 13236, 20106) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8626, 20271, 26216) }, { AOM_CDF4(26707, 32406, 32711) }, { AOM_CDF4(16999, 30329, 32286) }, { AOM_CDF4(11445, 25123, 
30286) }, { AOM_CDF4(6411, 18828, 25601) }, { AOM_CDF4(6801, 12458, 20248) }, { AOM_CDF4(29918, 32682, 32748) }, { AOM_CDF4(20649, 31739, 32618) }, { AOM_CDF4(12879, 27773, 31581) }, { AOM_CDF4(7896, 21751, 28244) }, { AOM_CDF4(5260, 14870, 23698) }, { AOM_CDF4(29252, 32593, 32731) }, { AOM_CDF4(17072, 30460, 32294) }, { AOM_CDF4(10653, 24143, 29365) }, { AOM_CDF4(6536, 17490, 23983) }, { AOM_CDF4(4929, 13170, 20085) }, { AOM_CDF4(28137, 32518, 32715) }, { AOM_CDF4(18171, 30784, 32407) }, { AOM_CDF4(11437, 25436, 30459) }, { AOM_CDF4(7252, 18534, 26176) }, { AOM_CDF4(4126, 13353, 20978) }, { AOM_CDF4(31162, 32726, 32748) }, { AOM_CDF4(23017, 32222, 32701) }, { AOM_CDF4(15629, 29233, 32046) }, { AOM_CDF4(9387, 22621, 29480) }, { AOM_CDF4(6922, 17616, 25010) }, { AOM_CDF4(28838, 32265, 32614) }, { AOM_CDF4(19701, 30206, 31920) }, { AOM_CDF4(11214, 22410, 27933) }, { AOM_CDF4(5320, 14177, 23034) }, { AOM_CDF4(5049, 12881, 17827) }, { AOM_CDF4(27484, 32471, 32734) }, { AOM_CDF4(21076, 31526, 32561) }, { AOM_CDF4(12707, 26303, 31211) }, { AOM_CDF4(8169, 21722, 28219) }, { AOM_CDF4(6045, 19406, 27042) }, { AOM_CDF4(27753, 32572, 32745) }, { AOM_CDF4(20832, 31878, 32653) }, { AOM_CDF4(13250, 27356, 31674) }, { AOM_CDF4(7718, 21508, 29858) }, { AOM_CDF4(7209, 18350, 25559) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(7876, 16901, 21741) }, { AOM_CDF4(24001, 31898, 32625) }, { AOM_CDF4(14529, 27959, 31451) }, { AOM_CDF4(8273, 20818, 27258) }, { AOM_CDF4(5278, 14673, 21510) }, { AOM_CDF4(2983, 8843, 14039) }, { AOM_CDF4(28016, 32574, 32732) }, { AOM_CDF4(17471, 30306, 32301) }, { AOM_CDF4(10224, 24063, 29728) }, { AOM_CDF4(6602, 17954, 25052) }, { AOM_CDF4(4002, 11585, 17759) }, { AOM_CDF4(30190, 32634, 32739) }, { AOM_CDF4(17497, 30282, 32270) }, { AOM_CDF4(10229, 23729, 29538) }, { AOM_CDF4(6344, 17211, 24440) }, { AOM_CDF4(3849, 11189, 17108) }, { AOM_CDF4(28570, 32583, 32726) }, { AOM_CDF4(17521, 30161, 32238) }, { AOM_CDF4(10153, 23565, 29378) }, { AOM_CDF4(6455, 17341, 24443) }, { AOM_CDF4(3907, 11042, 17024) }, { AOM_CDF4(30689, 32715, 32748) }, { AOM_CDF4(21546, 31840, 32610) }, { AOM_CDF4(13547, 27581, 31459) }, { AOM_CDF4(8912, 21757, 28309) }, { AOM_CDF4(5548, 15080, 22046) }, { AOM_CDF4(30783, 32540, 32685) }, { AOM_CDF4(17540, 29528, 31668) }, { AOM_CDF4(10160, 21468, 26783) }, { AOM_CDF4(4724, 13393, 20054) }, { AOM_CDF4(2702, 8174, 13102) }, { AOM_CDF4(31648, 32686, 32742) }, { AOM_CDF4(20954, 31094, 32337) }, { AOM_CDF4(12420, 25698, 30179) }, { AOM_CDF4(7304, 19320, 26248) }, { AOM_CDF4(4366, 12261, 18864) }, { AOM_CDF4(31581, 32723, 32748) }, { AOM_CDF4(21373, 31586, 32525) }, { AOM_CDF4(12744, 26625, 30885) }, { AOM_CDF4(7431, 20322, 26950) }, { AOM_CDF4(4692, 13323, 20111) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(7833, 18369, 24095) }, { AOM_CDF4(26650, 32273, 32702) }, { AOM_CDF4(16371, 29961, 32191) }, { AOM_CDF4(11055, 24082, 29629) }, { AOM_CDF4(6892, 18644, 25400) }, { AOM_CDF4(5006, 13057, 19240) }, { AOM_CDF4(29834, 32666, 32748) }, { AOM_CDF4(19577, 31335, 32570) }, { AOM_CDF4(12253, 26509, 31122) }, { AOM_CDF4(7991, 20772, 27711) }, { AOM_CDF4(5677, 15910, 23059) }, { AOM_CDF4(30109, 32532, 32720) }, { AOM_CDF4(16747, 30166, 32252) }, { AOM_CDF4(10134, 23542, 29184) }, { AOM_CDF4(5791, 16176, 23556) }, { AOM_CDF4(4362, 10414, 17284) }, { AOM_CDF4(29492, 32626, 32748) }, { AOM_CDF4(19894, 31402, 32525) }, { AOM_CDF4(12942, 27071, 30869) }, { AOM_CDF4(8346, 21216, 27405) }, { AOM_CDF4(6572, 17087, 23859) }, { AOM_CDF4(32035, 32735, 32748) }, { 
AOM_CDF4(22957, 31838, 32618) }, { AOM_CDF4(14724, 28572, 31772) }, { AOM_CDF4(10364, 23999, 29553) }, { AOM_CDF4(7004, 18433, 25655) }, { AOM_CDF4(27528, 32277, 32681) }, { AOM_CDF4(16959, 31171, 32096) }, { AOM_CDF4(10486, 23593, 27962) }, { AOM_CDF4(8192, 16384, 23211) }, { AOM_CDF4(8937, 17873, 20852) }, { AOM_CDF4(27715, 32002, 32615) }, { AOM_CDF4(15073, 29491, 31676) }, { AOM_CDF4(11264, 24576, 28672) }, { AOM_CDF4(2341, 18725, 23406) }, { AOM_CDF4(7282, 18204, 25486) }, { AOM_CDF4(28547, 32213, 32657) }, { AOM_CDF4(20788, 29773, 32239) }, { AOM_CDF4(6780, 21469, 30508) }, { AOM_CDF4(5958, 14895, 23831) }, { AOM_CDF4(16384, 21845, 27307) }, { AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(5992, 14304, 19765) }, { AOM_CDF4(22612, 31238, 32456) }, { AOM_CDF4(13456, 27162, 31087) }, { AOM_CDF4(8001, 20062, 26504) }, { AOM_CDF4(5168, 14105, 20764) }, { AOM_CDF4(2632, 7771, 12385) }, { AOM_CDF4(27034, 32344, 32709) }, { AOM_CDF4(15850, 29415, 31997) }, { AOM_CDF4(9494, 22776, 28841) }, { AOM_CDF4(6151, 16830, 23969) }, { AOM_CDF4(3461, 10039, 15722) }, { AOM_CDF4(30134, 32569, 32731) }, { AOM_CDF4(15638, 29422, 31945) }, { AOM_CDF4(9150, 21865, 28218) }, { AOM_CDF4(5647, 15719, 22676) }, { AOM_CDF4(3402, 9772, 15477) }, { AOM_CDF4(28530, 32586, 32735) }, { AOM_CDF4(17139, 30298, 32292) }, { AOM_CDF4(10200, 24039, 29685) }, { AOM_CDF4(6419, 17674, 24786) }, { AOM_CDF4(3544, 10225, 15824) }, { AOM_CDF4(31333, 32726, 32748) }, { AOM_CDF4(20618, 31487, 32544) }, { AOM_CDF4(12901, 27217, 31232) }, { AOM_CDF4(8624, 21734, 28171) }, { AOM_CDF4(5104, 14191, 20748) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(11206, 21090, 26561) }, { AOM_CDF4(28759, 32279, 32671) }, { AOM_CDF4(14171, 27952, 31569) }, { AOM_CDF4(9743, 22907, 29141) }, { AOM_CDF4(6871, 17886, 24868) }, { AOM_CDF4(4960, 13152, 19315) }, { AOM_CDF4(31077, 32661, 32748) }, { AOM_CDF4(19400, 31195, 32515) }, { AOM_CDF4(12752, 26858, 31040) }, { AOM_CDF4(8370, 22098, 28591) }, { AOM_CDF4(5457, 15373, 22298) }, { AOM_CDF4(31697, 32706, 32748) }, { AOM_CDF4(17860, 30657, 32333) }, { AOM_CDF4(12510, 24812, 29261) }, { AOM_CDF4(6180, 19124, 24722) }, { AOM_CDF4(5041, 13548, 17959) }, { AOM_CDF4(31552, 32716, 32748) }, { AOM_CDF4(21908, 31769, 32623) }, { AOM_CDF4(14470, 28201, 31565) }, { AOM_CDF4(9493, 22982, 28608) }, { AOM_CDF4(6858, 17240, 24137) }, { AOM_CDF4(32543, 32752, 32756) }, { AOM_CDF4(24286, 32097, 32666) }, { AOM_CDF4(15958, 29217, 32024) }, { AOM_CDF4(10207, 24234, 29958) }, { AOM_CDF4(6929, 18305, 25652) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { 
AOM_CDF4(8192, 16384, 24576) } } }, { { { AOM_CDF4(4137, 10847, 15682) }, { AOM_CDF4(17824, 27001, 30058) }, { AOM_CDF4(10204, 22796, 28291) }, { AOM_CDF4(6076, 15935, 22125) }, { AOM_CDF4(3852, 10937, 16816) }, { AOM_CDF4(2252, 6324, 10131) }, { AOM_CDF4(25840, 32016, 32662) }, { AOM_CDF4(15109, 28268, 31531) }, { AOM_CDF4(9385, 22231, 28340) }, { AOM_CDF4(6082, 16672, 23479) }, { AOM_CDF4(3318, 9427, 14681) }, { AOM_CDF4(30594, 32574, 32718) }, { AOM_CDF4(16836, 29552, 31859) }, { AOM_CDF4(9556, 22542, 28356) }, { AOM_CDF4(6305, 16725, 23540) }, { AOM_CDF4(3376, 9895, 15184) }, { AOM_CDF4(29383, 32617, 32745) }, { AOM_CDF4(18891, 30809, 32401) }, { AOM_CDF4(11688, 25942, 30687) }, { AOM_CDF4(7468, 19469, 26651) }, { AOM_CDF4(3909, 11358, 17012) }, { AOM_CDF4(31564, 32736, 32748) }, { AOM_CDF4(20906, 31611, 32600) }, { AOM_CDF4(13191, 27621, 31537) }, { AOM_CDF4(8768, 22029, 28676) }, { AOM_CDF4(5079, 14109, 20906) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } }, { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) } } } } }; static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, { AOM_CDF3(29600, 31446) }, { AOM_CDF3(30844, 31878) }, { AOM_CDF3(24926, 28948) } }, { { AOM_CDF3(21365, 30026) }, { AOM_CDF3(30512, 32423) }, { AOM_CDF3(31658, 32621) }, { AOM_CDF3(29630, 31881) } } }, { { { AOM_CDF3(5717, 26477) }, { AOM_CDF3(30491, 31703) }, { AOM_CDF3(31550, 32158) }, { AOM_CDF3(29648, 31491) } }, { { AOM_CDF3(12608, 27820) }, { AOM_CDF3(30680, 32225) }, { AOM_CDF3(30809, 32335) }, { AOM_CDF3(31299, 32423) } } }, 
{ { { AOM_CDF3(1786, 12612) }, { AOM_CDF3(30663, 31625) }, { AOM_CDF3(32339, 32468) }, { AOM_CDF3(31148, 31833) } }, { { AOM_CDF3(18857, 23865) }, { AOM_CDF3(31428, 32428) }, { AOM_CDF3(31744, 32373) }, { AOM_CDF3(31775, 32526) } } }, { { { AOM_CDF3(1787, 2532) }, { AOM_CDF3(30832, 31662) }, { AOM_CDF3(31824, 32682) }, { AOM_CDF3(32133, 32569) } }, { { AOM_CDF3(13751, 22235) }, { AOM_CDF3(32089, 32409) }, { AOM_CDF3(27084, 27920) }, { AOM_CDF3(29291, 32594) } } }, { { { AOM_CDF3(1725, 3449) }, { AOM_CDF3(31102, 31935) }, { AOM_CDF3(32457, 32613) }, { AOM_CDF3(32412, 32649) } }, { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } }, { { { { AOM_CDF3(17560, 29888) }, { AOM_CDF3(29671, 31549) }, { AOM_CDF3(31007, 32056) }, { AOM_CDF3(27286, 30006) } }, { { AOM_CDF3(26594, 31212) }, { AOM_CDF3(31208, 32582) }, { AOM_CDF3(31835, 32637) }, { AOM_CDF3(30595, 32206) } } }, { { { AOM_CDF3(15239, 29932) }, { AOM_CDF3(31315, 32095) }, { AOM_CDF3(32130, 32434) }, { AOM_CDF3(30864, 31996) } }, { { AOM_CDF3(26279, 30968) }, { AOM_CDF3(31142, 32495) }, { AOM_CDF3(31713, 32540) }, { AOM_CDF3(31929, 32594) } } }, { { { AOM_CDF3(2644, 25198) }, { AOM_CDF3(32038, 32451) }, { AOM_CDF3(32639, 32695) }, { AOM_CDF3(32166, 32518) } }, { { AOM_CDF3(17187, 27668) }, { AOM_CDF3(31714, 32550) }, { AOM_CDF3(32283, 32678) }, { AOM_CDF3(31930, 32563) } } }, { { { AOM_CDF3(1044, 2257) }, { AOM_CDF3(30755, 31923) }, { AOM_CDF3(32208, 32693) }, { AOM_CDF3(32244, 32615) } }, { { AOM_CDF3(21317, 26207) }, { AOM_CDF3(29133, 30868) }, { AOM_CDF3(29311, 31231) }, { AOM_CDF3(29657, 31087) } } }, { { { AOM_CDF3(478, 1834) }, { AOM_CDF3(31005, 31987) }, { AOM_CDF3(32317, 32724) }, { AOM_CDF3(30865, 32648) } }, { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } }, { { { { AOM_CDF3(20092, 30774) }, { AOM_CDF3(30695, 32020) }, { AOM_CDF3(31131, 32103) }, { AOM_CDF3(28666, 30870) } }, { { AOM_CDF3(27258, 31095) }, { AOM_CDF3(31804, 32623) }, { AOM_CDF3(31763, 32528) }, { AOM_CDF3(31438, 32506) } } }, { { { AOM_CDF3(18049, 30489) }, { AOM_CDF3(31706, 32286) }, { AOM_CDF3(32163, 32473) }, { AOM_CDF3(31550, 32184) } }, { { AOM_CDF3(27116, 30842) }, { AOM_CDF3(31971, 32598) }, { AOM_CDF3(32088, 32576) }, { AOM_CDF3(32067, 32664) } } }, { { { AOM_CDF3(12854, 29093) }, { AOM_CDF3(32272, 32558) }, { AOM_CDF3(32667, 32729) }, { AOM_CDF3(32306, 32585) } }, { { AOM_CDF3(25476, 30366) }, { AOM_CDF3(32169, 32687) }, { AOM_CDF3(32479, 32689) }, { AOM_CDF3(31673, 32634) } } }, { { { AOM_CDF3(2809, 19301) }, { AOM_CDF3(32205, 32622) }, { AOM_CDF3(32338, 32730) }, { AOM_CDF3(31786, 32616) } }, { { AOM_CDF3(22737, 29105) }, { AOM_CDF3(30810, 32362) }, { AOM_CDF3(30014, 32627) }, { AOM_CDF3(30528, 32574) } } }, { { { AOM_CDF3(935, 3382) }, { AOM_CDF3(30789, 31909) }, { AOM_CDF3(32466, 32756) }, { AOM_CDF3(30860, 32513) } }, { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } }, { { { { AOM_CDF3(22497, 31198) }, { AOM_CDF3(31715, 32495) }, { AOM_CDF3(31606, 32337) }, { AOM_CDF3(30388, 31990) } }, { { AOM_CDF3(27877, 31584) }, { AOM_CDF3(32170, 32728) }, { AOM_CDF3(32155, 32688) }, { AOM_CDF3(32219, 32702) } } }, { { { AOM_CDF3(21457, 31043) }, { AOM_CDF3(31951, 32483) }, { AOM_CDF3(32153, 32562) }, { AOM_CDF3(31473, 32215) } }, { { AOM_CDF3(27558, 31151) }, { AOM_CDF3(32020, 32640) }, { AOM_CDF3(32097, 32575) }, { AOM_CDF3(32242, 32719) } } }, { { { 
AOM_CDF3(19980, 30591) }, { AOM_CDF3(32219, 32597) }, { AOM_CDF3(32581, 32706) }, { AOM_CDF3(31803, 32287) } }, { { AOM_CDF3(26473, 30507) }, { AOM_CDF3(32431, 32723) }, { AOM_CDF3(32196, 32611) }, { AOM_CDF3(31588, 32528) } } }, { { { AOM_CDF3(24647, 30463) }, { AOM_CDF3(32412, 32695) }, { AOM_CDF3(32468, 32720) }, { AOM_CDF3(31269, 32523) } }, { { AOM_CDF3(28482, 31505) }, { AOM_CDF3(32152, 32701) }, { AOM_CDF3(31732, 32598) }, { AOM_CDF3(31767, 32712) } } }, { { { AOM_CDF3(12358, 24977) }, { AOM_CDF3(31331, 32385) }, { AOM_CDF3(32634, 32756) }, { AOM_CDF3(30411, 32548) } }, { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) } } } } }; #endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ aom-3.12.1/av1/common/txb_common.c000066400000000000000000000571031477627663500166770ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" // The ctx offset table when TX is TX_CLASS_2D. // TX col and row indices are clamped to 4 static const int8_t av1_nz_map_ctx_offset_4x4[16] = { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_8x8[64] = { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_16x16[256] = { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_32x32[1024] = { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_4x8[32] = { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_8x16[128] = { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_16x32[512] = { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_32x16[512] = { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_32x64[1024] = { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_64x32[1024] = { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_4x16[64] = { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_16x4[64] = { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_8x32[256] = { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; static const int8_t av1_nz_map_ctx_offset_32x8[256] = { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, }; const int8_t *av1_nz_map_ctx_offset[19] = { av1_nz_map_ctx_offset_4x4, // TX_4x4 av1_nz_map_ctx_offset_8x8, // TX_8x8 av1_nz_map_ctx_offset_16x16, // TX_16x16 av1_nz_map_ctx_offset_32x32, // TX_32x32 av1_nz_map_ctx_offset_32x32, // TX_64x64 av1_nz_map_ctx_offset_4x8, // TX_4x8 av1_nz_map_ctx_offset_16x4, // TX_8x4 av1_nz_map_ctx_offset_8x16, // TX_8x16 av1_nz_map_ctx_offset_32x8, // TX_16x8 av1_nz_map_ctx_offset_16x32, // TX_16x32 av1_nz_map_ctx_offset_32x16, // TX_32x16 av1_nz_map_ctx_offset_32x64, // TX_32x64 av1_nz_map_ctx_offset_64x32, // TX_64x32 av1_nz_map_ctx_offset_4x16, // TX_4x16 av1_nz_map_ctx_offset_16x4, // TX_16x4 av1_nz_map_ctx_offset_8x32, // TX_8x32 av1_nz_map_ctx_offset_32x8, // TX_32x8 av1_nz_map_ctx_offset_32x64, // TX_16x64 av1_nz_map_ctx_offset_32x16, // TX_64x16 }; const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513 }; const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; aom-3.12.1/av1/common/txb_common.h000066400000000000000000000476261477627663500167150ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_TXB_COMMON_H_ #define AOM_AV1_COMMON_TXB_COMMON_H_ #include "av1/common/av1_common_int.h" extern const int16_t av1_eob_group_start[12]; extern const int16_t av1_eob_offset_bits[12]; extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; typedef struct txb_ctx { int txb_skip_ctx; int dc_sign_ctx; } TXB_CTX; static const int base_level_count_to_index[13] = { 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, }; static const TX_CLASS tx_type_to_class[TX_TYPES] = { TX_CLASS_2D, // DCT_DCT TX_CLASS_2D, // ADST_DCT TX_CLASS_2D, // DCT_ADST TX_CLASS_2D, // ADST_ADST TX_CLASS_2D, // FLIPADST_DCT TX_CLASS_2D, // DCT_FLIPADST TX_CLASS_2D, // FLIPADST_FLIPADST TX_CLASS_2D, // ADST_FLIPADST TX_CLASS_2D, // FLIPADST_ADST TX_CLASS_2D, // IDTX TX_CLASS_VERT, // V_DCT TX_CLASS_HORIZ, // H_DCT TX_CLASS_VERT, // V_ADST TX_CLASS_HORIZ, // H_ADST TX_CLASS_VERT, // V_FLIPADST TX_CLASS_HORIZ, // H_FLIPADST }; static inline int get_txb_bhl(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_high_log2[tx_size]; } static inline int get_txb_wide(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_wide[tx_size]; } static inline int get_txb_high(TX_SIZE tx_size) { tx_size = av1_get_adjusted_tx_size(tx_size); return tx_size_high[tx_size]; } static inline uint8_t *set_levels(uint8_t *const levels_buf, const int height) { return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR); } static inline int get_padded_idx(const int idx, const int bhl) { return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2); } static inline int get_br_ctx_2d(const uint8_t *const levels, const int c, // raster order const int bhl) { assert(c > 0); const int col = c >> bhl; const int row = c - (col << bhl); const int stride = (1 << bhl) + TX_PAD_HOR; const int pos = col * stride + row; int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); mag = AOMMIN((mag + 1) >> 1, 6); //((row | col) < 2) is equivalent to ((row < 2) && (col < 2)) if ((row | col) < 2) return mag + 7; return mag + 14; } static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order const int bhl, const TX_CLASS tx_class) { const int col = c >> bhl; const int row = c - (col << bhl); if (c == 0) return 0; if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || (tx_class == TX_CLASS_HORIZ && col == 0) || (tx_class == TX_CLASS_VERT && row == 0)) return 7; return 14; } static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, const int c, // raster order const int bhl, const TX_CLASS tx_class) { const int col = c >> bhl; const int row = c - (col << bhl); const int stride = (1 << bhl) + TX_PAD_HOR; const int pos = col * stride + row; int mag = levels[pos + 1]; mag += levels[pos + stride]; switch (tx_class) { case TX_CLASS_2D: mag += levels[pos + stride + 1]; mag = AOMMIN((mag + 1) >> 1, 6); if (c == 0) return mag; if ((row < 2) && (col < 2)) return mag + 7; break; case TX_CLASS_HORIZ: mag += levels[pos + (stride << 1)]; mag = AOMMIN((mag + 1) >> 1, 6); if (c == 0) return mag; if (col == 0) return mag + 7; break; case TX_CLASS_VERT: mag += levels[pos + 2]; mag = AOMMIN((mag + 1) >> 1, 6); if (c == 0) return mag; if (row == 0) return mag + 7; break; default: break; } return mag + 14; } static const uint8_t clip_max3[256] = { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, const int bhl, const TX_CLASS tx_class) { int mag; // Note: AOMMIN(level, 3) is useless for decoder since level < 3. mag = clip_max3[levels[(1 << bhl) + TX_PAD_HOR]]; // { 0, 1 } mag += clip_max3[levels[1]]; // { 1, 0 } if (tx_class == TX_CLASS_2D) { mag += clip_max3[levels[(1 << bhl) + TX_PAD_HOR + 1]]; // { 1, 1 } mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } mag += clip_max3[levels[2]]; // { 2, 0 } } else if (tx_class == TX_CLASS_VERT) { mag += clip_max3[levels[2]]; // { 2, 0 } mag += clip_max3[levels[3]]; // { 3, 0 } mag += clip_max3[levels[4]]; // { 4, 0 } } else { mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]]; // { 0, 3 } mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]]; // { 0, 4 } } return mag; } #define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D #define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) #define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) static const int nz_map_ctx_offset_1d[32] = { NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, }; static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( const int stats, const int coeff_idx, // raster order const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) { // tx_class == 0(TX_CLASS_2D) if ((tx_class | coeff_idx) == 0) return 0; int ctx = (stats + 1) >> 1; ctx = AOMMIN(ctx, 4); switch (tx_class) { case TX_CLASS_2D: { // This is the algorithm to generate av1_nz_map_ctx_offset[][] // const int width = tx_size_wide[tx_size]; // const int height = tx_size_high[tx_size]; // if (width < height) { // if (row < 2) return 11 + ctx; // } else if (width > height) { // if (col < 2) return 16 + ctx; // } // if (row + col < 2) return ctx + 1; // if (row + col < 4) return 5 + ctx + 1; // return 21 + ctx; return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; } case TX_CLASS_HORIZ: { const int col = coeff_idx >> bhl; return ctx + nz_map_ctx_offset_1d[col]; } case TX_CLASS_VERT: { const int col = coeff_idx >> bhl; const int row = coeff_idx - (col << bhl); return ctx + nz_map_ctx_offset_1d[row]; } default: break; } return 0; } typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; static inline int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) { if (scan_idx == 0) return 0; if (scan_idx <= (width << bhl) / 8) return 1; if (scan_idx <= (width << bhl) / 4) return 2; return 3; } static inline int 
get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, int bhl, TX_SIZE tx_size) { assert(coeff_idx > 0); int mag; // Note: AOMMIN(level, 3) is useless for decoder since level < 3. levels = levels + get_padded_idx(coeff_idx, bhl); mag = AOMMIN(levels[(1 << bhl) + TX_PAD_HOR], 3); // { 0, 1 } mag += AOMMIN(levels[1], 3); // { 1, 0 } mag += AOMMIN(levels[(1 << bhl) + TX_PAD_HOR + 1], 3); // { 1, 1 } mag += AOMMIN(levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 0, 2 } mag += AOMMIN(levels[2], 3); // { 2, 0 } const int ctx = AOMMIN((mag + 1) >> 1, 4); return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; } static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, int coeff_idx, int bhl, TX_SIZE tx_size, TX_CLASS tx_class) { const int stats = get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); } static inline int get_lower_levels_ctx_general(int is_last, int scan_idx, int bhl, int width, const uint8_t *levels, int coeff_idx, TX_SIZE tx_size, TX_CLASS tx_class) { if (is_last) { if (scan_idx == 0) return 0; if (scan_idx <= (width << bhl) >> 3) return 1; if (scan_idx <= (width << bhl) >> 2) return 2; return 3; } return get_lower_levels_ctx(levels, coeff_idx, bhl, tx_size, tx_class); } static inline void set_dc_sign(int *cul_level, int dc_val) { if (dc_val < 0) *cul_level |= 1 << COEFF_CONTEXT_BITS; else if (dc_val > 0) *cul_level += 2 << COEFF_CONTEXT_BITS; } static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize, const TX_SIZE tx_size, const int plane, const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, TXB_CTX *const txb_ctx) { #define MAX_TX_SIZE_UNIT 16 static const int8_t signs[3] = { 0, -1, 1 }; static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; const int txb_w_unit = tx_size_wide_unit[tx_size]; const int txb_h_unit = tx_size_high_unit[tx_size]; int dc_sign = 0; int k = 0; do { const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; assert(sign <= 2); dc_sign += signs[sign]; } while (++k < txb_w_unit); k = 0; do { const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; assert(sign <= 2); dc_sign += signs[sign]; } while (++k < txb_h_unit); txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; if (plane == 0) { if (plane_bsize == txsize_to_bsize[tx_size]) { txb_ctx->txb_skip_ctx = 0; } else { // This is the algorithm to generate table skip_contexts[top][left]. // const int max = AOMMIN(top | left, 4); // const int min = AOMMIN(AOMMIN(top, left), 4); // if (!max) // txb_skip_ctx = 1; // else if (!min) // txb_skip_ctx = 2 + (max > 3); // else if (max <= 3) // txb_skip_ctx = 4; // else if (min <= 3) // txb_skip_ctx = 5; // else // txb_skip_ctx = 6; static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, { 2, 4, 4, 4, 5 }, { 2, 4, 4, 4, 5 }, { 2, 4, 4, 4, 5 }, { 3, 5, 5, 5, 6 } }; // For top and left, we only care about which of the following three // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The // spec calculates top and left with the Max() function. We can calculate // an approximate max with bitwise OR because the real max and the // approximate max belong to the same category. 
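// As an illustrative example of the above: if the two contexts being combined
// are 1 and 2, Max() gives 2 while bitwise OR gives 3; both land in the
// { 1, 2, 3 } category, and rows 1..3 (and columns 1..3) of skip_contexts[][]
// are identical, so the resulting txb_skip_ctx is the same either way.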
int top = 0; int left = 0; k = 0; do { top |= a[k]; } while (++k < txb_w_unit); top &= COEFF_CONTEXT_MASK; top = AOMMIN(top, 4); k = 0; do { left |= l[k]; } while (++k < txb_h_unit); left &= COEFF_CONTEXT_MASK; left = AOMMIN(left, 4); txb_ctx->txb_skip_ctx = skip_contexts[top][left]; } } else { const int ctx_base = get_entropy_context(tx_size, a, l); const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > num_pels_log2_lookup[txsize_to_bsize[tx_size]]) ? 10 : 7; txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; } } #define SPECIALIZE_GET_TXB_CTX(w, h) \ static void get_txb_ctx_##w##x##h( \ const BLOCK_SIZE plane_bsize, const int plane, \ const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \ TXB_CTX *const txb_ctx) { \ static const int8_t signs[3] = { 0, -1, 1 }; \ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \ }; \ const TX_SIZE tx_size = TX_##w##X##h; \ const int txb_w_unit = tx_size_wide_unit[tx_size]; \ const int txb_h_unit = tx_size_high_unit[tx_size]; \ int dc_sign = 0; \ int k = 0; \ \ do { \ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \ assert(sign <= 2); \ dc_sign += signs[sign]; \ } while (++k < txb_w_unit); \ \ k = 0; \ do { \ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \ assert(sign <= 2); \ dc_sign += signs[sign]; \ } while (++k < txb_h_unit); \ \ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \ \ if (plane == 0) { \ if (plane_bsize == txsize_to_bsize[tx_size]) { \ txb_ctx->txb_skip_ctx = 0; \ } else { \ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \ { 2, 4, 4, 4, 5 }, \ { 2, 4, 4, 4, 5 }, \ { 2, 4, 4, 4, 5 }, \ { 3, 5, 5, 5, 6 } }; \ int top = 0; \ int left = 0; \ \ k = 0; \ do { \ top |= a[k]; \ } while (++k < txb_w_unit); \ top &= COEFF_CONTEXT_MASK; \ top = AOMMIN(top, 4); \ \ k = 0; \ do { \ left |= l[k]; \ } while (++k < txb_h_unit); \ left &= COEFF_CONTEXT_MASK; \ left = AOMMIN(left, 4); \ \ txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \ } \ } else { \ const int ctx_base = get_entropy_context(tx_size, a, l); \ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > \ num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \ ? 10 \ : 7; \ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \ } \ } SPECIALIZE_GET_TXB_CTX(4, 4) SPECIALIZE_GET_TXB_CTX(8, 8) SPECIALIZE_GET_TXB_CTX(16, 16) SPECIALIZE_GET_TXB_CTX(32, 32) // Wrapper for get_txb_ctx that calls the specialized version of get_txb_ctc_* // so that the compiler can compile away the while loops. static inline void get_txb_ctx(const BLOCK_SIZE plane_bsize, const TX_SIZE tx_size, const int plane, const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, TXB_CTX *const txb_ctx) { switch (tx_size) { case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break; case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break; case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break; case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break; default: get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx); break; } } #undef MAX_TX_SIZE_UNIT #endif // AOM_AV1_COMMON_TXB_COMMON_H_ aom-3.12.1/av1/common/warped_motion.c000066400000000000000000001225341477627663500174020ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include "config/av1_rtcd.h" #include "av1/common/av1_common_int.h" #include "av1/common/warped_motion.h" #include "av1/common/scale.h" // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels // at a time. The zoom/rotation/shear in the model are applied to the // "fractional" position of each pixel, which therefore varies within // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. // We need an extra 2 taps to fit this in, for a total of 8 taps. /* clang-format off */ const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1] [8] = { // [-1, 0) { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 }, { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 }, { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 }, { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 }, { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 }, { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, // [0, 1) { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0}, { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1}, {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1}, 
{-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1}, {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1}, {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2}, {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2}, {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2}, {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2}, {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2}, {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2}, {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2}, {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2}, {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2}, {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2}, {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1}, {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2}, {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1}, {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1}, {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1}, { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0}, { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0}, // [1, 2) { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 }, { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 }, { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 }, { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 }, { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 }, { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 }, { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 }, { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 
40, 109, -16, 3 }, { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 }, { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 }, { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 }, { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 }, { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 }, // dummy (replicate row index 191) { 0, 0, 0, 0, 2, 127, - 1, 0 }, }; /* clang-format on */ #define DIV_LUT_PREC_BITS 14 #define DIV_LUT_BITS 8 #define DIV_LUT_NUM (1 << DIV_LUT_BITS) static const uint16_t div_lut[DIV_LUT_NUM + 1] = { 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, 8240, 8224, 8208, 8192, }; // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned // at precision of DIV_LUT_PREC_BITS along with the shift. static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { int64_t f; *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 : get_msb((unsigned int)D)); // e is obtained from D after resetting the most significant 1 bit. const int64_t e = D - ((uint64_t)1 << *shift); // Get the most significant DIV_LUT_BITS (8) bits of e into f if (*shift > DIV_LUT_BITS) f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); else f = e << (DIV_LUT_BITS - *shift); assert(f <= DIV_LUT_NUM); *shift += DIV_LUT_PREC_BITS; // Use f as lookup into the precomputed table of multipliers return div_lut[f]; } static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { int32_t f; *shift = get_msb(D); // e is obtained from D after resetting the most significant 1 bit. 
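// Worked example, for illustration only: with D = 5, get_msb(5) = 2, so
// e = 1 and f = 1 << (DIV_LUT_BITS - 2) = 64; div_lut[64] = 13107 and the
// returned shift becomes 2 + DIV_LUT_PREC_BITS = 16, i.e. 1/5 is approximated
// as 13107 / 2^16 ~= 0.2.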
const int32_t e = D - ((uint32_t)1 << *shift); // Get the most significant DIV_LUT_BITS (8) bits of e into f if (*shift > DIV_LUT_BITS) f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); else f = e << (DIV_LUT_BITS - *shift); assert(f <= DIV_LUT_NUM); *shift += DIV_LUT_PREC_BITS; // Use f as lookup into the precomputed table of multipliers return div_lut[f]; } static int is_affine_valid(const WarpedMotionParams *const wm) { const int32_t *mat = wm->wmmat; return (mat[2] > 0); } static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) return 0; else return 1; } #ifndef NDEBUG // Check that the given warp model satisfies the relevant constraints for // its stated model type static void check_model_consistency(WarpedMotionParams *wm) { switch (wm->wmtype) { case IDENTITY: assert(wm->wmmat[0] == 0); assert(wm->wmmat[1] == 0); AOM_FALLTHROUGH_INTENDED; case TRANSLATION: assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS); assert(wm->wmmat[3] == 0); AOM_FALLTHROUGH_INTENDED; case ROTZOOM: assert(wm->wmmat[4] == -wm->wmmat[3]); assert(wm->wmmat[5] == wm->wmmat[2]); AOM_FALLTHROUGH_INTENDED; case AFFINE: break; default: assert(0 && "Bad wmtype"); } } #endif // NDEBUG // Returns 1 on success or 0 on an invalid affine set int av1_get_shear_params(WarpedMotionParams *wm) { #ifndef NDEBUG // Check that models have been constructed sensibly // This is a good place to check, because this function does not need to // be called until after model construction is complete, but must be called // before the model can be used for prediction. check_model_consistency(wm); #endif // NDEBUG const int32_t *mat = wm->wmmat; if (!is_affine_valid(wm)) return 0; wm->alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); int16_t shift; int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1); int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; wm->gamma = clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); v = ((int64_t)mat[3] * mat[4]) * y; wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * (1 << WARP_PARAM_REDUCE_BITS); if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) return 0; return 1; } #if CONFIG_AV1_HIGHBITDEPTH /* Note: For an explanation of the warp algorithm, and some notes on bit widths for hardware implementations, see the comments above av1_warp_affine_c */ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? 
conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); for (int i = p_row; i < p_row + p_height; i += 8) { for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, // then convert back to the original coordinates (if necessary) const int32_t src_x = (j + 4) << subsampling_x; const int32_t src_y = (i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4); sy4 += gamma * (-4) + delta * (-4); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter for (int k = -7; k < 8; ++k) { const int iy = clamp(iy4 + k, 0, height - 1); int sx = sx4 + beta * (k + 4); for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { const int sample_x = clamp(ix + m, 0, width - 1); sum += ref[iy * stride + sample_x] * coeffs[m]; } sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); assert(0 <= sum && sum < (1 << max_bits_horiz)); tmp[(k + 7) * 8 + (l + 4)] = sum; sx += alpha; } } // Vertical filter for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } if (conv_params->is_compound) { CONV_BUF_TYPE *p = &conv_params ->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); if (conv_params->do_average) { uint16_t *dst16 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; } else { tmp32 += sum; tmp32 = tmp32 >> 1; } tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - 
conv_params->round_1 - 1)); *dst16 = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd); } else { *p = sum; } } else { uint16_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); } sy += gamma; } } } } } void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, int width, int height, int stride, uint16_t *const pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params) { const int32_t *const mat = wm->wmmat; const int16_t alpha = wm->alpha; const int16_t beta = wm->beta; const int16_t gamma = wm->gamma; const int16_t delta = wm->delta; av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params, alpha, beta, gamma, delta); } #endif // CONFIG_AV1_HIGHBITDEPTH /* The warp filter for ROTZOOM and AFFINE models works as follows: * Split the input into 8x8 blocks * For each block, project the point (4, 4) within the block, to get the overall block position. Split into integer and fractional coordinates, maintaining full WARPEDMODEL precision * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a variable horizontal offset. This means that, while the rows of the intermediate buffer align with the rows of the *reference* image, the columns align with the columns of the *destination* image. * Filter vertically: Generate the output block (up to 8x8 pixels, but if the destination is too small we crop the output at this stage). Each pixel has a variable vertical offset, so that the resulting rows are aligned with the rows of the destination image. To accomplish these alignments, we factor the warp matrix as a product of two shear / asymmetric zoom matrices: / a b \ = / 1 0 \ * / 1+alpha beta \ \ c d / \ gamma 1+delta / \ 0 1 / where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively. The horizontal shear (with alpha and beta) is applied first, then the vertical shear (with gamma and delta) is applied second. The only limitation is that, to fit this in a fixed 8-tap filter size, the fractional pixel offsets must be at most +-1. Since the horizontal filter generates 15 rows of 8 columns, and the initial point we project is at (4, 4) within the block, the parameters must satisfy 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1 for this filter to be applicable. Note: This function assumes that the caller has done all of the relevant checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5] are set appropriately (if using a ROTZOOM model), and that alpha, beta, gamma, delta are all in range. TODO(rachelbarker): Maybe support scaled references? */ /* A note on hardware implementation: The warp filter is intended to be implementable using the same hardware as the high-precision convolve filters from the loop-restoration and convolve-round experiments. For a single filter stage, considering all of the coefficient sets for the warp filter and the regular convolution filter, an input in the range [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)] before rounding. Allowing for some changes to the filter coefficient sets, call the range [-64 * 2^k, 192 * 2^k]. 
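(Worked example, for illustration: with 8-bit input, k = 8, so the raw post-filter range is [-56 * (2^8 - 1), 184 * (2^8 - 1)] = [-14280, 46920], which the widened bound covers as [-64 * 256, 192 * 256] = [-16384, 49152].)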
Then, if we initialize the accumulator to 64 * 2^k, we can replace this by the range [0, 256 * 2^k], which can be stored in an unsigned value with 8 + k bits. This allows the derivation of the appropriate bit widths and offsets for the various intermediate values: If F := FILTER_BITS = 7 (or else the above ranges need adjusting) So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit intermediate value. H := ROUND0_BITS V := VERSHEAR_REDUCE_PREC_BITS (and note that we must have H + V = 2*F for the output to have the same scale as the input) then we end up with the following offsets and ranges: Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a uint{bd + F + 1} After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}. Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a uint{bd + 2*F + 2 - H} After rounding: The final value, before undoing the offset, fits into a uint{bd + 2}. Then we need to undo the offsets before clamping to a pixel. Note that, if we do this at the end, the amount to subtract is actually independent of H and V: offset to subtract = (1 << ((bd + F - 1) - H + F - V)) + (1 << ((bd + 2*F - H) - V)) == (1 << (bd - 1)) + (1 << bd) This allows us to entirely avoid clamping in both the warp filter and the convolve-round experiment. As of the time of writing, the Wiener filter from loop-restoration can encode a central coefficient up to 216, which leads to a maximum value of about 282 * 2^k after applying the offset. So in that case we still need to clamp. */ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; const int bd = 8; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? 
conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); for (int i = p_row; i < p_row + p_height; i += 8) { for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, // then convert back to the original coordinates (if necessary) const int32_t src_x = (j + 4) << subsampling_x; const int32_t src_y = (i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4); sy4 += gamma * (-4) + delta * (-4); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter for (int k = -7; k < 8; ++k) { // Clamp to top/bottom edge of the frame const int iy = clamp(iy4 + k, 0, height - 1); int sx = sx4 + beta * (k + 4); for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; // At this point, sx = sx4 + alpha * l + beta * k const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { // Clamp to left/right edge of the frame const int sample_x = clamp(ix + m, 0, width - 1); sum += ref[iy * stride + sample_x] * coeffs[m]; } sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); assert(0 <= sum && sum < (1 << max_bits_horiz)); tmp[(k + 7) * 8 + (l + 4)] = sum; sx += alpha; } } // Vertical filter for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { // At this point, sy = sy4 + gamma * l + delta * k const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS; assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); const WarpedFilterCoeff *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_vert; for (int m = 0; m < 8; ++m) { sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; } if (conv_params->is_compound) { CONV_BUF_TYPE *p = &conv_params ->dst[(i - p_row + k + 4) * conv_params->dst_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); if (conv_params->do_average) { uint8_t *dst8 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; } else { tmp32 += sum; tmp32 = tmp32 >> 1; } tmp32 = tmp32 - (1 << 
(offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1)); *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); } else { *p = sum; } } else { uint8_t *p = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); assert(0 <= sum && sum < (1 << (bd + 2))); *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); } sy += gamma; } } } } } void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params) { const int32_t *const mat = wm->wmmat; const int16_t alpha = wm->alpha; const int16_t beta = wm->beta; const int16_t gamma = wm->gamma; const int16_t delta = wm->delta; av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params, alpha, beta, gamma, delta); } void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params) { #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride, CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, bd, conv_params); else warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params); #else (void)use_hbd; (void)bd; warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y, conv_params); #endif } #define LS_MV_MAX 256 // max mv in 1/8-pel // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By. #define LS_STEP 8 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8, // the precision needed is: // (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] + // (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] + // 1 [for sign] + // LEAST_SQUARES_SAMPLES_MAX_BITS // [for adding up to LEAST_SQUARES_SAMPLES_MAX samples] // The value is 23 #define LS_MAT_RANGE_BITS \ ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS) // Bit-depth reduction from the full-range #define LS_MAT_DOWN_BITS 2 // bits range of A, Bx and By after downshifting #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS) #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1))) #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1) // By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are // 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here. #define LS_SQUARE(a) \ (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ (2 + LS_MAT_DOWN_BITS)) #define LS_PRODUCT1(a, b) \ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \ (2 + LS_MAT_DOWN_BITS)) #define LS_PRODUCT2(a, b) \ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ (2 + LS_MAT_DOWN_BITS)) #define USE_LIMITED_PREC_MULT 0 #if USE_LIMITED_PREC_MULT #define MUL_PREC_BITS 16 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) { int msb = 0; uint16_t mult = 0; *shift = 0; if (D != 0) { msb = (int16_t)((D >> 32) ? 
get_msb((unsigned int)(D >> 32)) + 32 : get_msb((unsigned int)D)); if (msb >= MUL_PREC_BITS) { mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS); *shift = msb + 1 - MUL_PREC_BITS; } else { mult = (uint16_t)D; *shift = 0; } } return mult; } static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { int32_t ret; int16_t mshift; uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); shift -= mshift; if (shift > 0) { return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift), -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } else { return (int32_t)clamp(v * (1 << (-shift)), -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } return ret; } static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { int16_t mshift; uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); shift -= mshift; if (shift > 0) { return (int32_t)clamp( ROUND_POWER_OF_TWO_SIGNED(v, shift), (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } else { return (int32_t)clamp( v * (1 << (-shift)), (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } } #else static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { int64_t v = Px * (int64_t)iDet; return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift), -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { int64_t v = Px * (int64_t)iDet; return (int32_t)clamp64( ROUND_POWER_OF_TWO_SIGNED_64(v, shift), (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); } #endif // USE_LIMITED_PREC_MULT static int find_affine_int(int np, const int *pts1, const int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm, int mi_row, int mi_col) { int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; int32_t Bx[2] = { 0, 0 }; int32_t By[2] = { 0, 0 }; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int rsuy = bh / 2 - 1; const int rsux = bw / 2 - 1; const int suy = rsuy * 8; const int sux = rsux * 8; const int duy = suy + mvy; const int dux = sux + mvx; // Assume the center pixel of the block has exactly the same motion vector // as transmitted for the block. First shift the origin of the source // points to the block center, and the origin of the destination points to // the block center added to the motion vector transmitted. // Let (xi, yi) denote the source points and (xi', yi') denote destination // points after origin shfifting, for i = 0, 1, 2, .... n-1. // Then if P = [x0, y0, // x1, y1 // x2, y1, // .... // ] // q = [x0', x1', x2', ... ]' // r = [y0', y1', y2', ... ]' // the least squares problems that need to be solved are: // [h1, h2]' = inv(P'P)P'q and // [h3, h4]' = inv(P'P)P'r // where the affine transformation is given by: // x' = h1.x + h2.y // y' = h3.x + h4.y // // The loop below computes: A = P'P, Bx = P'q, By = P'r // We need to just compute inv(A).Bx and inv(A).By for the solutions. 
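  /* For concreteness, and ignoring the LS_STEP rounding offsets and the
     LS_MAT_DOWN_BITS downshift applied by the LS_SQUARE / LS_PRODUCT macros,
     the quantities accumulated in the loop below are
       A  = | sum(xi*xi)  sum(xi*yi) |   Bx = | sum(xi*xi') |   By = | sum(xi*yi') |
            | sum(xi*yi)  sum(yi*yi) |        | sum(yi*xi') |        | sum(yi*yi') |
     so that [h1, h2]' = inv(A).Bx and [h3, h4]' = inv(A).By. The solve further
     down realizes this via the adjugate of A scaled by 1/Det, i.e.
       Px = adj(A).Bx and Py = adj(A).By, with wmmat[2..5] ~= Px/Det, Py/Det
     (up to the WARPEDMODEL_PREC_BITS fixed-point scaling and clamping). */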
// Contribution from neighbor block for (int i = 0; i < np; i++) { const int dx = pts2[i * 2] - dux; const int dy = pts2[i * 2 + 1] - duy; const int sx = pts1[i * 2] - sux; const int sy = pts1[i * 2 + 1] - suy; // (TODO)yunqing: This comparison wouldn't be necessary if the sample // selection is done in find_samples(). Also, global offset can be removed // while collecting samples. if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) { A[0][0] += LS_SQUARE(sx); A[0][1] += LS_PRODUCT1(sx, sy); A[1][1] += LS_SQUARE(sy); Bx[0] += LS_PRODUCT2(sx, dx); Bx[1] += LS_PRODUCT1(sy, dx); By[0] += LS_PRODUCT1(sx, dy); By[1] += LS_PRODUCT2(sy, dy); } } // Just for debugging, and can be removed later. assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX); assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX); assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX); assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX); assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX); assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); // Compute Determinant of A const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; if (Det == 0) return 1; int16_t shift; int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); shift -= WARPEDMODEL_PREC_BITS; if (shift < 0) { iDet <<= (-shift); shift = 0; } int64_t Px[2], Py[2]; // These divided by the Det, are the least squares solutions Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1]; Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1]; wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); const int isuy = (mi_row * MI_SIZE + rsuy); const int isux = (mi_col * MI_SIZE + rsux); // Note: In the vx, vy expressions below, the max value of each of the // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room // for the first term so that the overall sum in the worst case fits // within 32 bits overall. const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + isuy * wm->wmmat[3]); const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - (isux * wm->wmmat[4] + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); wm->wmmat[0] = clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); wm->wmmat[1] = clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); return 0; } int av1_find_projection(int np, const int *pts1, const int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm_params, int mi_row, int mi_col) { assert(wm_params->wmtype == AFFINE); if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, mi_col)) return 1; // check compatibility with the fast warp filter if (!av1_get_shear_params(wm_params)) return 1; return 0; } aom-3.12.1/av1/common/warped_motion.h000066400000000000000000000110201477627663500173720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_WARPED_MOTION_H_ #define AOM_AV1_COMMON_WARPED_MOTION_H_ #include #include #include #include #include #include "config/aom_config.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "av1/common/mv.h" #include "av1/common/convolve.h" #define LEAST_SQUARES_SAMPLES_MAX_BITS 3 #define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS) #define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) #define WARPED_MOTION_DEBUG 0 #define DEFAULT_WMTYPE AFFINE #define WARP_ERROR_BLOCK_LOG 5 #define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) #if AOM_ARCH_ARM || AOM_ARCH_AARCH64 || AOM_ARCH_X86 || AOM_ARCH_X86_64 typedef int16_t WarpedFilterCoeff; #else typedef int8_t WarpedFilterCoeff; #endif extern const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; DECLARE_ALIGNED(8, extern const int8_t, av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); static const uint8_t warp_pad_left[14][16] = { { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, }; static const uint8_t warp_pad_right[14][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } }; void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, int width, int height, int stride, uint16_t *const pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params); void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params); void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, 
const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params); int av1_find_projection(int np, const int *pts1, const int *pts2, BLOCK_SIZE bsize, int mvy, int mvx, WarpedMotionParams *wm_params, int mi_row, int mi_col); int av1_get_shear_params(WarpedMotionParams *wm); #endif // AOM_AV1_COMMON_WARPED_MOTION_H_ aom-3.12.1/av1/common/x86/000077500000000000000000000000001477627663500150055ustar00rootroot00000000000000aom-3.12.1/av1/common/x86/av1_convolve_horiz_rs_sse4.c000066400000000000000000000233021477627663500224300ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "av1/common/resize.h" #include "aom_dsp/x86/synonyms.h" // Note: If the crop width is not a multiple of 4, then, unlike the C version, // this function will overwrite some of the padding on the right hand side of // the frame. This padding appears to be trashed anyway, so this should not // affect the running of the decoder. void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn) { assert(UPSCALE_NORMATIVE_TAPS == 8); src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); const __m128i zero = _mm_setzero_si128(); const uint8_t *src_y; uint8_t *dst_y; int x_qn = x0_qn; for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { const int x_filter_idx0 = ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx1 = ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx2 = ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx3 = ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; assert(x_filter_idx0 <= RS_SUBPEL_MASK); assert(x_filter_idx1 <= RS_SUBPEL_MASK); assert(x_filter_idx2 <= RS_SUBPEL_MASK); assert(x_filter_idx3 <= RS_SUBPEL_MASK); const int16_t *const x_filter0 = &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter1 = &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter2 = &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter3 = &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; const __m128i fil0_16 = xx_loadu_128(x_filter0); const __m128i fil1_16 = xx_loadu_128(x_filter1); const __m128i fil2_16 = xx_loadu_128(x_filter2); const __m128i fil3_16 = xx_loadu_128(x_filter3); src_y = src; dst_y = dst; for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { const uint8_t *const src_x0 = &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint8_t *const src_x1 = &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint8_t *const src_x2 = &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint8_t 
*const src_x3 = &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; // Load up the source data. This is 8-bit input data, so each load // gets 8 pixels. const __m128i src0_8 = xx_loadl_64(src_x0); const __m128i src1_8 = xx_loadl_64(src_x1); const __m128i src2_8 = xx_loadl_64(src_x2); const __m128i src3_8 = xx_loadl_64(src_x3); // Now zero-extend up to 16-bit precision, i.e. // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ] const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8); const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8); const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8); const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8); // Multiply by filter coefficients (results in a 32-bit value), // and add adjacent pairs, i.e. // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); // Reduce horizontally and add, i.e. // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); // Divide down by (1 << FILTER_BITS), rounding to nearest. const __m128i shifted_32 = _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); // Pack 32-bit values into 16-bit values, i.e. // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); // Pack 16-bit values into 8-bit values, i.e. // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) // -> [ 0 0 0 0 0 0 DC BA ] const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); // Write to the output xx_storel_32(&dst_y[x], shifted_8); } } } #if CONFIG_AV1_HIGHBITDEPTH // Note: If the crop width is not a multiple of 4, then, unlike the C version, // this function will overwrite some of the padding on the right hand side of // the frame. This padding appears to be trashed anyway, so this should not // affect the running of the decoder. 
void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd) { assert(UPSCALE_NORMATIVE_TAPS == 8); assert(bd == 8 || bd == 10 || bd == 12); src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); const __m128i zero = _mm_setzero_si128(); const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); const uint16_t *src_y; uint16_t *dst_y; int x_qn = x0_qn; for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { const int x_filter_idx0 = ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx1 = ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx2 = ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; const int x_filter_idx3 = ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; assert(x_filter_idx0 <= RS_SUBPEL_MASK); assert(x_filter_idx1 <= RS_SUBPEL_MASK); assert(x_filter_idx2 <= RS_SUBPEL_MASK); assert(x_filter_idx3 <= RS_SUBPEL_MASK); const int16_t *const x_filter0 = &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter1 = &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter2 = &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; const int16_t *const x_filter3 = &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; const __m128i fil0_16 = xx_loadu_128(x_filter0); const __m128i fil1_16 = xx_loadu_128(x_filter1); const __m128i fil2_16 = xx_loadu_128(x_filter2); const __m128i fil3_16 = xx_loadu_128(x_filter3); src_y = src; dst_y = dst; for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { const uint16_t *const src_x0 = &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint16_t *const src_x1 = &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint16_t *const src_x2 = &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; const uint16_t *const src_x3 = &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; // Load up the source data. This is 16-bit input data, so each load // gets 8 pixels. const __m128i src0_16 = xx_loadu_128(src_x0); const __m128i src1_16 = xx_loadu_128(src_x1); const __m128i src2_16 = xx_loadu_128(src_x2); const __m128i src3_16 = xx_loadu_128(src_x3); // Multiply by filter coefficients (results in a 32-bit value), // and add adjacent pairs, i.e. // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); // Reduce horizontally and add, i.e. // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); // Divide down by (1 << FILTER_BITS), rounding to nearest. const __m128i shifted_32 = _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); // Pack 32-bit values into 16-bit values, i.e. 
// ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); // Clip the values at (1 << bd) - 1 const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum); // Write to the output xx_storel_64(&dst_y[x], clipped_16); } } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/av1_convolve_scale_sse4.c000066400000000000000000000520641477627663500216670ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" // A specialised version of hfilter, the horizontal filter for // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, int h, int subpel_x_qn, int x_step_qn, const InterpFilterParams *filter_params, int round) { const int bd = 8; const int ntaps = 8; src -= ntaps / 2 - 1; int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); const __m128i round_add = _mm_set1_epi32(round_add32); const __m128i round_shift = _mm_cvtsi32_si128(round); int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); // Load the filter coefficients const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); int y; for (y = 0; y <= h - 4; y += 4) { const uint8_t *const src0 = src_col + y * src_stride; const uint8_t *const src1 = src0 + 1 * src_stride; const uint8_t *const src2 = src0 + 2 * src_stride; const uint8_t *const src3 = src0 + 3 * src_stride; // Load up source data. This is 8-bit input data; each load is just // loading the lower half of the register and gets 8 pixels const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); // Now zero-extend up to 16-bit precision by interleaving with // zeros. 
Drop the upper half of each register (which just had zeros) const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); // Multiply by coefficients const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); // Reduce horizontally and add const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); // Divide down by (1 << round), rounding to nearest. __m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); shifted = _mm_packus_epi32(shifted, shifted); // Write transposed to the output _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); } for (; y < h; ++y) { const uint8_t *const src_row = src_col + y * src_stride; int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < ntaps; ++k) { sum += filter[k] * src_row[k]; } dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); } } } static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { __m128i data = _mm_loadu_si128((__m128i *)src); return _mm_madd_epi16(data, coeff); } // A specialised version of vfilter, the vertical filter for // av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int subpel_y_qn, int y_step_qn, const InterpFilterParams *filter_params, const ConvolveParams *conv_params, int bd) { const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int ntaps = 8; const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); const __m128i sub = _mm_set1_epi16(sub32); CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const __m128i bits_shift = _mm_cvtsi32_si128(bits); const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); const __m128i round_shift_add = _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi16((short)w0); const __m128i wt1 = _mm_set1_epi16((short)w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); int y_qn = subpel_y_qn; for (int y = 0; y < h; ++y, y_qn += y_step_qn) { const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); int x; for (x = 0; x <= w - 4; x += 4) { const int16_t *const src0 = src_y + x * src_stride; const int16_t *const src1 = src0 + 1 * src_stride; const int16_t *const src2 = src0 + 2 * src_stride; const int16_t *const src3 = src0 + 3 * src_stride; // Load the source data for the three rows, adding the three registers of // convolved products to one as we go (conv0..conv3) to avoid the // register pressure 
getting too high. const __m128i conv0 = convolve_16_8(src0, coeff0716); const __m128i conv1 = convolve_16_8(src1, coeff0716); const __m128i conv2 = convolve_16_8(src2, coeff0716); const __m128i conv3 = convolve_16_8(src3, coeff0716); // Now reduce horizontally to get one lane for each result const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); __m128i conv = _mm_hadd_epi32(conv01, conv23); conv = _mm_add_epi32(conv, res_add_const); // Divide down by (1 << round_1), rounding to nearest and subtract sub32. __m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); uint8_t *dst_x = dst + y * dst_stride + x; __m128i result; __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); if (conv_params->is_compound) { CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; if (conv_params->do_average) { const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); const __m128i shifted_32 = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); } else { shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); } const __m128i subbed = _mm_sub_epi16(shifted_16, sub); result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); const __m128i result_8 = _mm_packus_epi16(result, result); *(int *)dst_x = _mm_cvtsi128_si32(result_8); } else { _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); } } else { const __m128i subbed = _mm_sub_epi16(shifted_16, sub); result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); const __m128i result_8 = _mm_packus_epi16(result, result); *(int *)dst_x = _mm_cvtsi128_si32(result_8); } } for (; x < w; ++x) { const int16_t *src_x = src_y + x * src_stride; int32_t sum = 1 << offset_bits; for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } /* Subtract round offset and convolve round */ tmp = tmp - sub32; dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } else { dst16[y * dst16_stride + x] = res; } } else { /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); } } } } void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params) { int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; const int xtaps = filter_params_x->taps; const int ytaps = filter_params_y->taps; const int fo_vert = ytaps / 2 - 1; assert((xtaps == 8) && (ytaps == 8)); (void)xtaps; // horizontal filter hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, x_step_qn, 
filter_params_x, conv_params->round_0); // vertical filter (input is transposed) vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, filter_params_y, conv_params, 8); } #if CONFIG_AV1_HIGHBITDEPTH // A specialised version of hfilter, the horizontal filter for // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap // filters. static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, int w, int h, int subpel_x_qn, int x_step_qn, const InterpFilterParams *filter_params, int round, int bd) { const int ntaps = 8; src -= ntaps / 2 - 1; int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); const __m128i round_add = _mm_set1_epi32(round_add32); const __m128i round_shift = _mm_cvtsi32_si128(round); int x_qn = subpel_x_qn; for (int x = 0; x < w; ++x, x_qn += x_step_qn) { const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); // Load the filter coefficients const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); int y; for (y = 0; y <= h - 4; y += 4) { const uint16_t *const src0 = src_col + y * src_stride; const uint16_t *const src1 = src0 + 1 * src_stride; const uint16_t *const src2 = src0 + 2 * src_stride; const uint16_t *const src3 = src0 + 3 * src_stride; // Load up source data. This is 16-bit input data, so each load gets the 8 // pixels we need. const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); // Multiply by coefficients const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); // Reduce horizontally and add const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); // Divide down by (1 << round), rounding to nearest. __m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); shifted = _mm_packus_epi32(shifted, shifted); // Write transposed to the output _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); } for (; y < h; ++y) { const uint16_t *const src_row = src_col + y * src_stride; int32_t sum = (1 << (bd + FILTER_BITS - 1)); for (int k = 0; k < ntaps; ++k) { sum += filter[k] * src_row[k]; } dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); } } } // A specialised version of vfilter, the vertical filter for // av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap // filters. 
static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, int subpel_y_qn, int y_step_qn, const InterpFilterParams *filter_params, const ConvolveParams *conv_params, int bd) { const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const int ntaps = 8; const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); const __m128i sub = _mm_set1_epi32(sub32); CONV_BUF_TYPE *dst16 = conv_params->dst; const int dst16_stride = conv_params->dst_stride; const __m128i clip_pixel_ = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const __m128i bits_shift = _mm_cvtsi32_si128(bits); const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); const __m128i round_shift_add = _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); int y_qn = subpel_y_qn; for (int y = 0; y < h; ++y, y_qn += y_step_qn) { const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; assert(filter_idx < SUBPEL_SHIFTS); const int16_t *filter = av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); int x; for (x = 0; x <= w - 4; x += 4) { const int16_t *const src0 = src_y + x * src_stride; const int16_t *const src1 = src0 + 1 * src_stride; const int16_t *const src2 = src0 + 2 * src_stride; const int16_t *const src3 = src0 + 3 * src_stride; // Load the source data for the three rows, adding the three registers of // convolved products to one as we go (conv0..conv3) to avoid the // register pressure getting too high. const __m128i conv0 = convolve_16_8(src0, coeff0716); const __m128i conv1 = convolve_16_8(src1, coeff0716); const __m128i conv2 = convolve_16_8(src2, coeff0716); const __m128i conv3 = convolve_16_8(src3, coeff0716); // Now reduce horizontally to get one lane for each result const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); __m128i conv = _mm_hadd_epi32(conv01, conv23); conv = _mm_add_epi32(conv, res_add_const); // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
__m128i shifted = _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); uint16_t *dst_x = dst + y * dst_stride + x; __m128i result; if (conv_params->is_compound) { CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; if (conv_params->do_average) { __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); if (conv_params->use_dist_wtd_comp_avg) { shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(shifted, wt1)); shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); } else { shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); } result = _mm_sub_epi32(shifted, sub); result = _mm_sra_epi32(_mm_add_epi32(result, round_bits_const), round_bits_shift); result = _mm_packus_epi32(result, result); result = _mm_min_epi16(result, clip_pixel_); _mm_storel_epi64((__m128i *)dst_x, result); } else { __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); } } else { result = _mm_sub_epi32(shifted, sub); result = _mm_sra_epi16(_mm_add_epi32(result, bits_const), bits_shift); result = _mm_packus_epi32(result, result); result = _mm_min_epi16(result, clip_pixel_); _mm_storel_epi64((__m128i *)dst_x, result); } } for (; x < w; ++x) { const int16_t *src_x = src_y + x * src_stride; int32_t sum = 1 << offset_bits; for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { tmp += res; tmp = tmp >> 1; } /* Subtract round offset and convolve round */ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } else { dst16[y * dst16_stride + x] = res; } } else { /* Subtract round offset and convolve round */ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + (1 << (offset_bits - conv_params->round_1 - 1))); dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); } } } } void av1_highbd_convolve_2d_scale_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_qn, const int y_step_qn, ConvolveParams *conv_params, int bd) { // TODO(yaowu): Move this out of stack DECLARE_ALIGNED(16, int16_t, tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps; const int xtaps = filter_params_x->taps; const int ytaps = filter_params_y->taps; const int fo_vert = ytaps / 2 - 1; memset(tmp, 0, sizeof(tmp)); assert((xtaps == 8) && (ytaps == 8)); (void)xtaps; // horizontal filter highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, bd); // vertical filter (input is transposed) highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, filter_params_y, conv_params, bd); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/av1_inv_txfm_avx2.c000066400000000000000000002575451477627663500205340ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/av1_txfm_sse2.h" #include "av1/common/x86/av1_inv_txfm_avx2.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" // TODO(venkatsanampudi@ittiam.com): move this to header file // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793 }; static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x1[0], &x1[3]); btf_16_adds_subs_avx2(&x1[1], &x1[2]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[11]); btf_16_adds_subs_avx2(&x1[9], &x1[10]); btf_16_adds_subs_avx2(&x1[15], &x1[12]); btf_16_adds_subs_avx2(&x1[14], &x1[13]); } static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x[0], &x[7]); btf_16_adds_subs_avx2(&x[1], &x[6]); btf_16_adds_subs_avx2(&x[2], &x[5]); btf_16_adds_subs_avx2(&x[3], &x[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); } static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) { btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); } static void idct16_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); __m256i 
cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); // stage 1 __m256i x1[16]; x1[0] = input[0]; x1[1] = input[8]; x1[2] = input[4]; x1[3] = input[12]; x1[4] = input[2]; x1[5] = input[10]; x1[6] = input[6]; x1[7] = input[14]; x1[8] = input[1]; x1[9] = input[9]; x1[10] = input[5]; x1[11] = input[13]; x1[12] = input[3]; x1[13] = input[11]; x1[14] = input[7]; x1[15] = input[15]; // stage 2 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, INV_COS_BIT); // stage 3 btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x1[8], &x1[9]); btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, INV_COS_BIT); idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); idct16_stage7_avx2(output, x1); } static void idct16_low8_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); // stage 1 __m256i x1[16]; x1[0] = input[0]; x1[2] = input[4]; x1[4] = input[2]; x1[6] = input[6]; x1[8] = input[1]; x1[10] = input[5]; x1[12] = input[3]; x1[14] = input[7]; // stage 2 btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); // stage 3 btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); btf_16_adds_subs_avx2(&x1[8], &x1[9]); btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 4 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], 
&x1[13], _r, INV_COS_BIT); idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); idct16_stage7_avx2(output, x1); } static void idct16_low1_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m256i x1[2]; x1[0] = input[0]; // stage 2 // stage 3 // stage 4 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); // stage 5 // stage 6 output[0] = x1[0]; output[1] = x1[1]; output[2] = x1[1]; output[3] = x1[0]; output[4] = x1[0]; output[5] = x1[1]; output[6] = x1[1]; output[7] = x1[0]; output[8] = x1[0]; output[9] = x1[1]; output[10] = x1[1]; output[11] = x1[0]; output[12] = x1[0]; output[13] = x1[1]; output[14] = x1[1]; output[15] = x1[0]; } static inline void iadst16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[8]); btf_16_adds_subs_avx2(&x[1], &x[9]); btf_16_adds_subs_avx2(&x[2], &x[10]); btf_16_adds_subs_avx2(&x[3], &x[11]); btf_16_adds_subs_avx2(&x[4], &x[12]); btf_16_adds_subs_avx2(&x[5], &x[13]); btf_16_adds_subs_avx2(&x[6], &x[14]); btf_16_adds_subs_avx2(&x[7], &x[15]); } static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); } static inline void iadst16_stage5_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[4]); btf_16_adds_subs_avx2(&x[1], &x[5]); btf_16_adds_subs_avx2(&x[2], &x[6]); btf_16_adds_subs_avx2(&x[3], &x[7]); btf_16_adds_subs_avx2(&x[8], &x[12]); btf_16_adds_subs_avx2(&x[9], &x[13]); btf_16_adds_subs_avx2(&x[10], &x[14]); btf_16_adds_subs_avx2(&x[11], &x[15]); } static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); } static inline void iadst16_stage7_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[0], &x[2]); btf_16_adds_subs_avx2(&x[1], &x[3]); btf_16_adds_subs_avx2(&x[4], &x[6]); btf_16_adds_subs_avx2(&x[5], &x[7]); btf_16_adds_subs_avx2(&x[8], &x[10]); btf_16_adds_subs_avx2(&x[9], &x[11]); btf_16_adds_subs_avx2(&x[12], &x[14]); btf_16_adds_subs_avx2(&x[13], &x[15]); } static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); 
btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); } static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { const __m256i __zero = _mm256_setzero_si256(); output[0] = x1[0]; output[1] = _mm256_subs_epi16(__zero, x1[8]); output[2] = x1[12]; output[3] = _mm256_subs_epi16(__zero, x1[4]); output[4] = x1[6]; output[5] = _mm256_subs_epi16(__zero, x1[14]); output[6] = x1[10]; output[7] = _mm256_subs_epi16(__zero, x1[2]); output[8] = x1[3]; output[9] = _mm256_subs_epi16(__zero, x1[11]); output[10] = x1[15]; output[11] = _mm256_subs_epi16(__zero, x1[7]); output[12] = x1[5]; output[13] = _mm256_subs_epi16(__zero, x1[13]); output[14] = x1[9]; output[15] = _mm256_subs_epi16(__zero, x1[1]); } static void iadst16_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); // stage 1 __m256i x1[16]; x1[0] = input[15]; x1[1] = input[0]; x1[2] = input[13]; x1[3] = input[2]; x1[4] = input[11]; x1[5] = input[4]; x1[6] = input[9]; x1[7] = input[6]; x1[8] = input[7]; x1[9] = input[8]; x1[10] = input[5]; x1[11] = input[10]; x1[12] = input[3]; x1[13] = input[12]; x1[14] = input[1]; x1[15] = input[14]; // stage 2 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, INV_COS_BIT); iadst16_stage3_avx2(x1); iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage5_avx2(x1); iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage7_avx2(x1); iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage9_avx2(output, x1); } static void iadst16_low8_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = 
cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x1[16]; x1[1] = input[0]; x1[3] = input[2]; x1[5] = input[4]; x1[7] = input[6]; x1[8] = input[7]; x1[10] = input[5]; x1[12] = input[3]; x1[14] = input[1]; // stage 2 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); iadst16_stage3_avx2(x1); iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage5_avx2(x1); iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage7_avx2(x1); iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage9_avx2(output, x1); } static void iadst16_low1_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); // stage 1 __m256i x1[16]; x1[1] = input[0]; // stage 2 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); // stage 3 x1[8] = x1[0]; x1[9] = x1[1]; // stage 4 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, INV_COS_BIT); // stage 5 x1[4] = x1[0]; x1[5] = x1[1]; x1[12] = x1[8]; x1[13] = x1[9]; // stage 6 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, INV_COS_BIT); // stage 7 x1[2] = x1[0]; x1[3] = x1[1]; x1[6] = x1[4]; x1[7] = x1[5]; x1[10] = x1[8]; x1[11] = x1[9]; x1[14] = x1[12]; x1[15] = x1[13]; iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); iadst16_stage9_avx2(output, x1); } static inline void idct32_high16_stage3_avx2(__m256i *x) { btf_16_adds_subs_avx2(&x[16], &x[17]); btf_16_adds_subs_avx2(&x[19], &x[18]); btf_16_adds_subs_avx2(&x[20], &x[21]); btf_16_adds_subs_avx2(&x[23], &x[22]); btf_16_adds_subs_avx2(&x[24], &x[25]); btf_16_adds_subs_avx2(&x[27], &x[26]); btf_16_adds_subs_avx2(&x[28], &x[29]); btf_16_adds_subs_avx2(&x[31], &x[30]); } static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); } static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t 
cos_bit) { const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); btf_16_adds_subs_avx2(&x[16], &x[19]); btf_16_adds_subs_avx2(&x[17], &x[18]); btf_16_adds_subs_avx2(&x[23], &x[20]); btf_16_adds_subs_avx2(&x[22], &x[21]); btf_16_adds_subs_avx2(&x[24], &x[27]); btf_16_adds_subs_avx2(&x[25], &x[26]); btf_16_adds_subs_avx2(&x[31], &x[28]); btf_16_adds_subs_avx2(&x[30], &x[29]); } static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); btf_16_adds_subs_avx2(&x[8], &x[11]); btf_16_adds_subs_avx2(&x[9], &x[10]); btf_16_adds_subs_avx2(&x[15], &x[12]); btf_16_adds_subs_avx2(&x[14], &x[13]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); } static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x[0], &x[7]); btf_16_adds_subs_avx2(&x[1], &x[6]); btf_16_adds_subs_avx2(&x[2], &x[5]); btf_16_adds_subs_avx2(&x[3], &x[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); btf_16_adds_subs_avx2(&x[16], &x[23]); btf_16_adds_subs_avx2(&x[17], &x[22]); btf_16_adds_subs_avx2(&x[18], &x[21]); btf_16_adds_subs_avx2(&x[19], &x[20]); btf_16_adds_subs_avx2(&x[31], &x[24]); btf_16_adds_subs_avx2(&x[30], &x[25]); btf_16_adds_subs_avx2(&x[29], &x[26]); btf_16_adds_subs_avx2(&x[28], &x[27]); } static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x[0], &x[15]); btf_16_adds_subs_avx2(&x[1], &x[14]); btf_16_adds_subs_avx2(&x[2], &x[13]); btf_16_adds_subs_avx2(&x[3], &x[12]); btf_16_adds_subs_avx2(&x[4], &x[11]); btf_16_adds_subs_avx2(&x[5], &x[10]); btf_16_adds_subs_avx2(&x[6], &x[9]); btf_16_adds_subs_avx2(&x[7], &x[8]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); } static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[0], 
&output[31], x[0], x[31]); btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); } static void idct32_low1_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m256i x[2]; x[0] = input[0]; // stage 2 // stage 3 // stage 4 // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 6 // stage 7 // stage 8 // stage 9 output[0] = x[0]; output[31] = x[0]; output[1] = x[1]; output[30] = x[1]; output[2] = x[1]; output[29] = x[1]; output[3] = x[0]; output[28] = x[0]; output[4] = x[0]; output[27] = x[0]; output[5] = x[1]; output[26] = x[1]; output[6] = x[1]; output[25] = x[1]; output[7] = x[0]; output[24] = x[0]; output[8] = x[0]; output[23] = x[0]; output[9] = x[1]; output[22] = x[1]; output[10] = x[1]; output[21] = x[1]; output[11] = x[0]; output[20] = x[0]; output[12] = x[0]; output[19] = x[0]; output[13] = x[1]; output[18] = x[1]; output[14] = x[1]; output[17] = x[1]; output[15] = x[0]; output[16] = x[0]; } static void idct32_low8_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; x[0] = input[0]; x[4] = input[4]; x[8] = input[2]; x[12] = input[6]; x[16] = input[1]; x[20] = input[5]; x[24] = input[3]; x[28] = input[7]; // stage 2 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); // stage 3 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); x[17] = x[16]; x[18] = x[19]; x[21] = x[20]; x[22] = x[23]; x[25] = x[24]; x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; // stage 4 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); x[9] = x[8]; x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); // stage 6 x[3] = x[0]; x[2] = x[1]; idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage9_avx2(output, x); } static void idct32_low16_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m256i x[32]; x[0] = input[0]; 
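// Note on the _low16 variant: it is selected through
// lowbd_txfm_all_1d_zeros_w16_arr when the scan-derived eob bounds the number
// of nonzero input rows to 16, so stage 1 gathers only those rows into their
// bit-reverse-ordered even slots and the btf_16_w16_0_avx2 calls use the
// single-input butterfly form (the partner input of each pair is known to be
// zero).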
x[2] = input[8]; x[4] = input[4]; x[6] = input[12]; x[8] = input[2]; x[10] = input[10]; x[12] = input[6]; x[14] = input[14]; x[16] = input[1]; x[18] = input[9]; x[20] = input[5]; x[22] = input[13]; x[24] = input[3]; x[26] = input[11]; x[28] = input[7]; x[30] = input[15]; // stage 2 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); // stage 3 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); idct32_high16_stage3_avx2(x); // stage 4 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); btf_16_adds_subs_avx2(&x[8], &x[9]); btf_16_adds_subs_avx2(&x[11], &x[10]); btf_16_adds_subs_avx2(&x[12], &x[13]); btf_16_adds_subs_avx2(&x[15], &x[14]); idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); // stage 5 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); btf_16_adds_subs_avx2(&x[4], &x[5]); btf_16_adds_subs_avx2(&x[7], &x[6]); idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x[0], &x[3]); btf_16_adds_subs_avx2(&x[1], &x[2]); idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); idct32_stage9_avx2(output, x); } static void idct32_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); 
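// Each pair_set_w16_epi16(w0, w1) constant packs two 16-bit cosine weights into
// every 32-bit lane, so that inside btf_16_w16_avx2 a single _mm256_madd_epi16
// against the two interleaved input rows yields w0 * a + w1 * b per lane before
// the rounding add of _r and the arithmetic shift by INV_COS_BIT.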
__m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); // stage 1 __m256i x1[32]; x1[0] = input[0]; x1[1] = input[16]; x1[2] = input[8]; x1[3] = input[24]; x1[4] = input[4]; x1[5] = input[20]; x1[6] = input[12]; x1[7] = input[28]; x1[8] = input[2]; x1[9] = input[18]; x1[10] = input[10]; x1[11] = input[26]; x1[12] = input[6]; x1[13] = input[22]; x1[14] = input[14]; x1[15] = input[30]; x1[16] = input[1]; x1[17] = input[17]; x1[18] = input[9]; x1[19] = input[25]; x1[20] = input[5]; x1[21] = input[21]; x1[22] = input[13]; x1[23] = input[29]; x1[24] = input[3]; x1[25] = input[19]; x1[26] = input[11]; x1[27] = input[27]; x1[28] = input[7]; x1[29] = input[23]; x1[30] = input[15]; x1[31] = input[31]; // stage 2 btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, INV_COS_BIT); // stage 3 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, INV_COS_BIT); idct32_high16_stage3_avx2(x1); // stage 4 btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x1[8], &x1[9]); btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); // stage 5 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT); // stage 6 btf_16_adds_subs_avx2(&x1[0], &x1[3]); btf_16_adds_subs_avx2(&x1[1], &x1[2]); idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT); idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT); idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT); idct32_stage9_avx2(output, x1); } static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = 
pair_set_w16_epi16(cospi[60], cospi[4]); const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); } static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); btf_16_adds_subs_avx2(&x[32], &x[35]); btf_16_adds_subs_avx2(&x[33], &x[34]); btf_16_adds_subs_avx2(&x[39], &x[36]); btf_16_adds_subs_avx2(&x[38], &x[37]); btf_16_adds_subs_avx2(&x[40], &x[43]); btf_16_adds_subs_avx2(&x[41], &x[42]); btf_16_adds_subs_avx2(&x[47], &x[44]); btf_16_adds_subs_avx2(&x[46], &x[45]); btf_16_adds_subs_avx2(&x[48], &x[51]); btf_16_adds_subs_avx2(&x[49], &x[50]); btf_16_adds_subs_avx2(&x[55], &x[52]); btf_16_adds_subs_avx2(&x[54], &x[53]); btf_16_adds_subs_avx2(&x[56], &x[59]); btf_16_adds_subs_avx2(&x[57], &x[58]); btf_16_adds_subs_avx2(&x[63], &x[60]); btf_16_adds_subs_avx2(&x[62], &x[61]); } static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); btf_16_w16_avx2(cospi_m08_p56, 
cospi_p56_p08, &x[35], &x[60], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); } static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { btf_16_adds_subs_avx2(&x[16], &x[19]); btf_16_adds_subs_avx2(&x[17], &x[18]); btf_16_adds_subs_avx2(&x[23], &x[20]); btf_16_adds_subs_avx2(&x[22], &x[21]); btf_16_adds_subs_avx2(&x[24], &x[27]); btf_16_adds_subs_avx2(&x[25], &x[26]); btf_16_adds_subs_avx2(&x[31], &x[28]); btf_16_adds_subs_avx2(&x[30], &x[29]); idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); } static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); btf_16_adds_subs_avx2(&x[32], &x[39]); btf_16_adds_subs_avx2(&x[33], &x[38]); btf_16_adds_subs_avx2(&x[34], &x[37]); btf_16_adds_subs_avx2(&x[35], &x[36]); btf_16_adds_subs_avx2(&x[47], &x[40]); btf_16_adds_subs_avx2(&x[46], &x[41]); btf_16_adds_subs_avx2(&x[45], &x[42]); btf_16_adds_subs_avx2(&x[44], &x[43]); btf_16_adds_subs_avx2(&x[48], &x[55]); btf_16_adds_subs_avx2(&x[49], &x[54]); btf_16_adds_subs_avx2(&x[50], &x[53]); btf_16_adds_subs_avx2(&x[51], &x[52]); btf_16_adds_subs_avx2(&x[63], &x[56]); btf_16_adds_subs_avx2(&x[62], &x[57]); btf_16_adds_subs_avx2(&x[61], &x[58]); btf_16_adds_subs_avx2(&x[60], &x[59]); } static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); btf_16_adds_subs_avx2(&x[16], &x[23]); btf_16_adds_subs_avx2(&x[17], &x[22]); btf_16_adds_subs_avx2(&x[18], &x[21]); btf_16_adds_subs_avx2(&x[19], &x[20]); btf_16_adds_subs_avx2(&x[31], &x[24]); btf_16_adds_subs_avx2(&x[30], &x[25]); btf_16_adds_subs_avx2(&x[29], &x[26]); btf_16_adds_subs_avx2(&x[28], &x[27]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); } static inline void 
idct64_stage9_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x[0], &x[15]); btf_16_adds_subs_avx2(&x[1], &x[14]); btf_16_adds_subs_avx2(&x[2], &x[13]); btf_16_adds_subs_avx2(&x[3], &x[12]); btf_16_adds_subs_avx2(&x[4], &x[11]); btf_16_adds_subs_avx2(&x[5], &x[10]); btf_16_adds_subs_avx2(&x[6], &x[9]); btf_16_adds_subs_avx2(&x[7], &x[8]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); btf_16_adds_subs_avx2(&x[32], &x[47]); btf_16_adds_subs_avx2(&x[33], &x[46]); btf_16_adds_subs_avx2(&x[34], &x[45]); btf_16_adds_subs_avx2(&x[35], &x[44]); btf_16_adds_subs_avx2(&x[36], &x[43]); btf_16_adds_subs_avx2(&x[37], &x[42]); btf_16_adds_subs_avx2(&x[38], &x[41]); btf_16_adds_subs_avx2(&x[39], &x[40]); btf_16_adds_subs_avx2(&x[63], &x[48]); btf_16_adds_subs_avx2(&x[62], &x[49]); btf_16_adds_subs_avx2(&x[61], &x[50]); btf_16_adds_subs_avx2(&x[60], &x[51]); btf_16_adds_subs_avx2(&x[59], &x[52]); btf_16_adds_subs_avx2(&x[58], &x[53]); btf_16_adds_subs_avx2(&x[57], &x[54]); btf_16_adds_subs_avx2(&x[56], &x[55]); } static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, const __m256i _r, int8_t cos_bit) { (void)cos_bit; const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); btf_16_adds_subs_avx2(&x[0], &x[31]); btf_16_adds_subs_avx2(&x[1], &x[30]); btf_16_adds_subs_avx2(&x[2], &x[29]); btf_16_adds_subs_avx2(&x[3], &x[28]); btf_16_adds_subs_avx2(&x[4], &x[27]); btf_16_adds_subs_avx2(&x[5], &x[26]); btf_16_adds_subs_avx2(&x[6], &x[25]); btf_16_adds_subs_avx2(&x[7], &x[24]); btf_16_adds_subs_avx2(&x[8], &x[23]); btf_16_adds_subs_avx2(&x[9], &x[22]); btf_16_adds_subs_avx2(&x[10], &x[21]); btf_16_adds_subs_avx2(&x[11], &x[20]); btf_16_adds_subs_avx2(&x[12], &x[19]); btf_16_adds_subs_avx2(&x[13], &x[18]); btf_16_adds_subs_avx2(&x[14], &x[17]); btf_16_adds_subs_avx2(&x[15], &x[16]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); } static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) { btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); btf_16_adds_subs_out_avx2(&output[7], &output[56], 
x[7], x[56]); btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]); btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); } static void idct64_low1_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m256i x[32]; x[0] = input[0]; // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 7 // stage 8 // stage 9 // stage 10 // stage 11 output[0] = x[0]; output[63] = x[0]; output[1] = x[1]; output[62] = x[1]; output[2] = x[1]; output[61] = x[1]; output[3] = x[0]; output[60] = x[0]; output[4] = x[0]; output[59] = x[0]; output[5] = x[1]; output[58] = x[1]; output[6] = x[1]; output[57] = x[1]; output[7] = x[0]; output[56] = x[0]; output[8] = x[0]; output[55] = x[0]; output[9] = x[1]; output[54] = x[1]; output[10] = x[1]; output[53] = x[1]; output[11] = x[0]; output[52] = x[0]; output[12] = x[0]; output[51] = x[0]; output[13] = x[1]; output[50] = x[1]; output[14] = x[1]; output[49] = x[1]; output[15] = x[0]; output[48] = x[0]; output[16] = x[0]; output[47] = x[0]; output[17] = x[1]; output[46] = x[1]; output[18] = x[1]; output[45] = x[1]; output[19] = x[0]; output[44] = x[0]; output[20] = x[0]; output[43] = x[0]; output[21] = x[1]; output[42] = x[1]; output[22] = x[1]; output[41] = x[1]; output[23] = x[0]; output[40] = x[0]; output[24] = x[0]; output[39] = x[0]; output[25] = x[1]; output[38] = x[1]; output[26] = x[1]; output[37] = x[1]; output[27] = x[0]; output[36] = x[0]; output[28] = x[0]; output[35] = x[0]; output[29] = x[1]; output[34] = x[1]; output[30] = x[1]; output[33] = x[1]; output[31] = x[0]; output[32] = x[0]; } static void idct64_low8_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); 
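// idct64_low8 covers the eob range where only the first 8 input rows can be
// nonzero: stage 1 loads just inputs 0..7 into their bit-reverse-ordered slots,
// and every butterfly whose partner is known to be zero collapses to the
// single-input btf_16_w16_0_avx2 form, which is why this variant needs far
// fewer weight pairs than the full idct64 path.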
const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); // stage 1 __m256i x[64]; x[0] = input[0]; x[8] = input[4]; x[16] = input[2]; x[24] = input[6]; x[32] = input[1]; x[40] = input[5]; x[48] = input[3]; x[56] = input[7]; // stage 2 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); x[33] = x[32]; x[38] = x[39]; x[41] = x[40]; x[46] = x[47]; x[49] = x[48]; x[54] = x[55]; x[57] = x[56]; x[62] = x[63]; // stage 4 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); x[17] = x[16]; x[22] = x[23]; x[25] = x[24]; x[30] = x[31]; btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, INV_COS_BIT); // stage 5 x[9] = x[8]; x[14] = x[15]; btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, INV_COS_BIT); x[35] = x[32]; x[34] = x[33]; x[36] = x[39]; x[37] = x[38]; x[43] = x[40]; x[42] = x[41]; x[44] = x[47]; x[45] = x[46]; x[51] = x[48]; x[50] = x[49]; x[52] = x[55]; x[53] = x[54]; x[59] = x[56]; x[58] = x[57]; x[60] = x[63]; x[61] = x[62]; // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); x[19] = x[16]; x[18] = x[17]; x[20] = x[23]; x[21] = x[22]; x[27] = x[24]; x[26] = x[25]; x[28] = x[31]; x[29] = x[30]; idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT); // stage 7 x[3] = x[0]; x[2] = x[1]; x[11] = x[8]; x[10] = x[9]; x[12] = x[15]; x[13] = x[14]; idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 8 x[7] = x[0]; x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, INV_COS_BIT); idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage11_avx2(output, x); } static void idct64_low16_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << 
(INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); // stage 1 __m256i x[64]; x[0] = input[0]; x[4] = input[8]; x[8] = input[4]; x[12] = input[12]; x[16] = input[2]; x[20] = input[10]; x[24] = input[6]; x[28] = input[14]; x[32] = input[1]; x[36] = input[9]; x[40] = input[5]; x[44] = input[13]; x[48] = input[3]; x[52] = input[11]; x[56] = input[7]; x[60] = input[15]; // stage 2 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); x[33] = x[32]; x[34] = x[35]; x[37] = x[36]; x[38] = x[39]; x[41] = x[40]; x[42] = x[43]; x[45] = x[44]; x[46] = x[47]; x[49] = x[48]; x[50] = x[51]; x[53] = x[52]; x[54] = x[55]; x[57] = x[56]; x[58] = x[59]; x[61] = x[60]; x[62] = x[63]; // stage 4 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); x[17] = x[16]; x[18] = x[19]; x[21] = x[20]; x[22] = x[23]; x[25] = x[24]; x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); x[9] = x[8]; x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, INV_COS_BIT); idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 7 x[3] = x[0]; x[2] = x[1]; btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x[8], &x[11]); btf_16_adds_subs_avx2(&x[9], &x[10]); btf_16_adds_subs_avx2(&x[15], &x[12]); btf_16_adds_subs_avx2(&x[14], &x[13]); idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 8 btf_16_adds_subs_avx2(&x[0], &x[7]); btf_16_adds_subs_avx2(&x[1], &x[6]); btf_16_adds_subs_avx2(&x[2], &x[5]); btf_16_adds_subs_avx2(&x[3], &x[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, INV_COS_BIT); idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage11_avx2(output, x); } static void idct64_low32_avx2(const __m256i *input, __m256i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); const __m256i cospi_p32_p32 = 
pair_set_w16_epi16(cospi[32], cospi[32]); const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); // stage 1 __m256i x[64]; x[0] = input[0]; x[2] = input[16]; x[4] = input[8]; x[6] = input[24]; x[8] = input[4]; x[10] = input[20]; x[12] = input[12]; x[14] = input[28]; x[16] = input[2]; x[18] = input[18]; x[20] = input[10]; x[22] = input[26]; x[24] = input[6]; x[26] = input[22]; x[28] = input[14]; x[30] = input[30]; x[32] = input[1]; x[34] = input[17]; x[36] = input[9]; x[38] = input[25]; x[40] = input[5]; x[42] = input[21]; x[44] = input[13]; x[46] = input[29]; x[48] = input[3]; x[50] = input[19]; x[52] = input[11]; x[54] = input[27]; x[56] = input[7]; x[58] = input[23]; x[60] = input[15]; x[62] = input[31]; // stage 2 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); btf_16_adds_subs_avx2(&x[32], &x[33]); btf_16_adds_subs_avx2(&x[35], &x[34]); btf_16_adds_subs_avx2(&x[36], &x[37]); btf_16_adds_subs_avx2(&x[39], &x[38]); btf_16_adds_subs_avx2(&x[40], &x[41]); btf_16_adds_subs_avx2(&x[43], &x[42]); btf_16_adds_subs_avx2(&x[44], &x[45]); btf_16_adds_subs_avx2(&x[47], &x[46]); btf_16_adds_subs_avx2(&x[48], &x[49]); btf_16_adds_subs_avx2(&x[51], &x[50]); btf_16_adds_subs_avx2(&x[52], &x[53]); btf_16_adds_subs_avx2(&x[55], &x[54]); btf_16_adds_subs_avx2(&x[56], &x[57]); btf_16_adds_subs_avx2(&x[59], &x[58]); btf_16_adds_subs_avx2(&x[60], &x[61]); btf_16_adds_subs_avx2(&x[63], &x[62]); // stage 4 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); btf_16_adds_subs_avx2(&x[16], &x[17]); btf_16_adds_subs_avx2(&x[19], &x[18]); btf_16_adds_subs_avx2(&x[20], &x[21]); btf_16_adds_subs_avx2(&x[23], &x[22]); btf_16_adds_subs_avx2(&x[24], &x[25]); 
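// btf_16_adds_subs_avx2(&a, &b) is the multiply-free half of the butterfly: it
// overwrites the pair in place with a' = sat16(a + b) and b' = sat16(a - b)
// using saturating 16-bit adds/subs (e.g. a = 30000, b = 10000 gives
// a' = 32767 after saturation and b' = 20000).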
btf_16_adds_subs_avx2(&x[27], &x[26]); btf_16_adds_subs_avx2(&x[28], &x[29]); btf_16_adds_subs_avx2(&x[31], &x[30]); idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); // stage 5 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); btf_16_adds_subs_avx2(&x[8], &x[9]); btf_16_adds_subs_avx2(&x[11], &x[10]); btf_16_adds_subs_avx2(&x[12], &x[13]); btf_16_adds_subs_avx2(&x[15], &x[14]); idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 6 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); btf_16_adds_subs_avx2(&x[4], &x[5]); btf_16_adds_subs_avx2(&x[7], &x[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, INV_COS_BIT); idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 7 btf_16_adds_subs_avx2(&x[0], &x[3]); btf_16_adds_subs_avx2(&x[1], &x[2]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); btf_16_adds_subs_avx2(&x[8], &x[11]); btf_16_adds_subs_avx2(&x[9], &x[10]); btf_16_adds_subs_avx2(&x[15], &x[12]); btf_16_adds_subs_avx2(&x[14], &x[13]); idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 8 btf_16_adds_subs_avx2(&x[0], &x[7]); btf_16_adds_subs_avx2(&x[1], &x[6]); btf_16_adds_subs_avx2(&x[2], &x[5]); btf_16_adds_subs_avx2(&x[3], &x[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, INV_COS_BIT); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, INV_COS_BIT); idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); // stage 9~11 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); idct64_stage11_avx2(output, x); } typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output); // 1D functions process 16 pixels at one time. 
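// Editorial sketch (illustrative only; this helper is hypothetical and is not
// part of libaom or referenced by the tables and drivers below): every btf_*
// macro above vectorizes the rounded butterfly shown here across 16 lanes, and
// the 2D drivers that follow apply the per-stage shifts with
// _mm256_mulhrs_epi16(x, 1 << (15 + shift)), which for the negative AV1 shift
// values amounts to a rounding right shift by -shift. Assuming the usual
// 12-bit INV_COS_BIT:
static inline int32_t btf_scalar_sketch(int32_t w0, int32_t in0, int32_t w1,
                                        int32_t in1, int bit) {
  // round((w0 * in0 + w1 * in1) / 2^bit); the AVX2 code additionally saturates
  // the result back to 16 bits when repacking.
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + (1LL << (bit - 1))) >> bit);
}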
static const transform_1d_avx2 lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, { { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_low32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; // only process w >= 16 h >= 16 static inline void lowbd_inv_txfm2d_add_no_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m256i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4; const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_avx2 row_txfm = lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_avx2 col_txfm = lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); for (int i = 0; i < buf_size_nonzero_h_div16; i++) { __m256i buf0[64]; load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code } row_txfm(buf0, buf0); for (int j = 0; j < txfm_size_col; ++j) { buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); } __m256i *buf1_cur = buf1 + (i << 4); if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; flip_buf_avx2(buf0 + 16 * j, temp, 16); int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); transpose_16bit_16x16_avx2(temp, buf1_cur + offset); } } else { for (int j = 0; j < buf_size_w_div16; ++j) { transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); } } } const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); for (int i = 0; i < buf_size_w_div16; i++) { __m256i *buf1_cur = buf1 + i * txfm_size_row; col_txfm(buf1_cur, buf1_cur); for (int j = 0; j < txfm_size_row; ++j) { buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); } } for (int i = 0; i < buf_size_w_div16; i++) { lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, stride, ud_flip, txfm_size_row); } } static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, int stride, int shift, int height, int txw_idx, int rect_type) { const int32_t *input_row = input; const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) 
+ (1 << (NewSqrt2Bits - shift - 1))); const __m256i one = _mm256_set1_epi16(1); const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); if (rect_type != 1 && rect_type != -1) { for (int i = 0; i < height; ++i) { const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); lo = _mm256_madd_epi16(lo, scale__r); hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); } } else { const __m256i rect_scale = _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); for (int i = 0; i < height; ++i) { __m256i src = load_32bit_to_16bit_w16_avx2(input_row); src = _mm256_mulhrs_epi16(src, rect_scale); input_row += stride; __m256i lo = _mm256_unpacklo_epi16(src, one); __m256i hi = _mm256_unpackhi_epi16(src, one); lo = _mm256_madd_epi16(lo, scale__r); hi = _mm256_madd_epi16(hi, scale__r); lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm256_packs_epi32(lo, hi); } } } static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride, __m256i *buf, int shift, int height, int txh_idx) { const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); const __m256i one = _mm256_set1_epi16(1); const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); for (int h = 0; h < height; ++h) { __m256i lo = _mm256_unpacklo_epi16(buf[h], one); __m256i hi = _mm256_unpackhi_epi16(buf[h], one); lo = _mm256_madd_epi16(lo, scale_coeff); hi = _mm256_madd_epi16(hi, scale_coeff); lo = _mm256_srai_epi32(lo, NewSqrt2Bits); hi = _mm256_srai_epi32(hi, NewSqrt2Bits); lo = _mm256_add_epi32(lo, shift__r); hi = _mm256_add_epi32(hi, shift__r); lo = _mm256_srai_epi32(lo, -shift); hi = _mm256_srai_epi32(hi, -shift); const __m256i x = _mm256_packs_epi32(lo, hi); write_recon_w16_avx2(x, output); output += stride; } } static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size, int32_t eob) { (void)eob; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int col_max = AOMMIN(32, txfm_size_col); const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); __m256i buf[32]; for (int i = 0; i < (col_max >> 4); ++i) { for (int j = 0; j < (row_max >> 4); j++) { iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride, row_max, shift[0], 16, txw_idx, rect_type); transpose_16bit_16x16_avx2(buf, buf); iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf, shift[1], 16, txh_idx); } } } static inline void lowbd_inv_txfm2d_add_h_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int 
txfm_size_row_notzero = AOMMIN(32, txfm_size_row); const int input_stride = txfm_size_row_notzero; const int buf_size_w_div16 = (eobx + 16) >> 4; const int buf_size_h_div16 = (eoby + 16) >> 4; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_avx2 col_txfm = lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_w_div16; i++) { __m256i buf0[64]; for (int j = 0; j < buf_size_h_div16; j++) { __m256i *buf0_cur = buf0 + j * 16; const int32_t *input_cur = input + i * 16 * input_stride + j * 16; iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16, txw_idx, rect_type); transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); } col_txfm(buf0, buf0); __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); int k = ud_flip ? (txfm_size_row - 1) : 0; const int step = ud_flip ? -1 : 1; for (int j = 0; j < txfm_size_row; ++j, k += step) { __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); write_recon_w16_avx2(res, output + (i << 4) + j * stride); } } } static inline void lowbd_inv_txfm2d_add_v_identity_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m256i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div16 = txfm_size_col >> 4; const int buf_size_h_div16 = (eoby + 16) >> 4; const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const transform_1d_avx2 row_txfm = lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_h_div16; i++) { __m256i buf0[64]; load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code } row_txfm(buf0, buf0); round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); __m256i *_buf1 = buf1; if (lr_flip) { for (int j = 0; j < buf_size_w_div16; ++j) { __m256i temp[16]; flip_buf_avx2(buf0 + 16 * j, temp, 16); transpose_16bit_16x16_avx2(temp, _buf1 + 16 * (buf_size_w_div16 - 1 - j)); } } else { for (int j = 0; j < buf_size_w_div16; ++j) { transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); } } for (int j = 0; j < buf_size_w_div16; ++j) { iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, buf1 + j * 16, shift[1], 16, txh_idx); } } } static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = { { av1_idct8_low1_ssse3, av1_idct8_sse2 }, { av1_iadst8_low1_ssse3, av1_iadst8_sse2 } }; static inline void load_buffer_avx2(const int32_t *in, int stride, __m128i *out) { const __m256i a = _mm256_load_si256((const __m256i *)in); const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1)); const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2)); const __m256i d = 
_mm256_load_si256((const __m256i *)(in + stride * 3)); const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4)); const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5)); const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6)); const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7)); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 const __m256i ab_16bit = _mm256_packs_epi32(a, b); // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7 const __m256i cd_16bit = _mm256_packs_epi32(c, d); // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7 const __m256i ef_16bit = _mm256_packs_epi32(e, f); // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7 const __m256i gh_16bit = _mm256_packs_epi32(g, h); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8); // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7 const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8); // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7 const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8); // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7 const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8); out[0] = _mm256_castsi256_si128(ab); out[1] = _mm256_extractf128_si256(ab, 1); out[2] = _mm256_castsi256_si128(cd); out[3] = _mm256_extractf128_si256(cd, 1); out[4] = _mm256_castsi256_si128(ef); out[5] = _mm256_extractf128_si256(ef, 1); out[6] = _mm256_castsi256_si128(gh); out[7] = _mm256_extractf128_si256(gh, 1); } static inline void round_and_transpose_avx2(const __m128i *const in, __m128i *const out, int bit, int *lr_flip) { __m256i buf_temp[4]; const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); int j = *lr_flip ? 7 : 0; const int step = *lr_flip ? -1 : 1; // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + 4 * step], 1); j += step; // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + 4 * step], 1); j += step; // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + 4 * step], 1); j += step; // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + 4 * step], 1); // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale); // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale); // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale); // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale); // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23 const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]); // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27 const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]); // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03 const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]); // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07 const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]); // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01 const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1); // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03 const __m256i unpckhi00 = 
_mm256_unpackhi_epi32(unpcklo0, unpcklo1); // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05 const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1); // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07 const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1); // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01 const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8); // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03 const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8); // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05 const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8); // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07 const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8); // 70 60 50 40 30 20 10 00 out[0] = _mm256_castsi256_si128(reg_00); // 71 61 51 41 31 21 11 01 out[1] = _mm256_extracti128_si256(reg_00, 1); // 72 62 52 42 32 22 12 02 out[2] = _mm256_castsi256_si128(reg_01); // 73 63 53 43 33 23 13 03 out[3] = _mm256_extracti128_si256(reg_01, 1); // 74 64 54 44 34 24 14 04 out[4] = _mm256_castsi256_si128(reg_10); // 75 65 55 45 35 25 15 05 out[5] = _mm256_extracti128_si256(reg_10, 1); // 76 66 56 46 36 26 16 06 out[6] = _mm256_castsi256_si128(reg_11); // 77 67 57 47 37 27 17 07 out[7] = _mm256_extracti128_si256(reg_11, 1); } static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, uint8_t *output, int stride, int flipud) { __m256i in_256[4], v_256[4]; int j = flipud ? 7 : 0; const int step = flipud ? -1 : 1; const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); const __m256i zero = _mm256_setzero_si256(); // in[0], in[1] in_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); j += 2 * step; // in[2], in[3] in_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); j += 2 * step; // in[4], in[5] in_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); j += 2 * step; // in[6], in[7] in_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17 in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale); // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37 in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale); // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57 in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale); // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77 in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale); const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output)); const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride)); const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride)); const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride)); const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride)); const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride)); v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1); v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1); v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1); v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1); const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero); const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], 
zero);
  const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
  const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
  // 00 01 10 11
  const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
  // 20 21 30 31
  const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
  // 40 41 50 51
  const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
  // 60 61 70 71
  const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
  // 00 01 20 21 10 11 30 31
  const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
  // 40 41 60 61 50 51 70 71
  const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
  // 00 01 20 21
  const __m128i res_02 = _mm256_castsi256_si128(res_0123);
  // 10 11 30 31
  const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
  // 40 41 60 61
  const __m128i res_46 = _mm256_castsi256_si128(res_4567);
  // 50 51 70 71
  const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
  // 00 01
  _mm_storel_epi64((__m128i *)(output), res_02);
  // 10 11
  _mm_storel_epi64((__m128i *)(output + stride), res_13);
  // 20 21
  _mm_storel_epi64((__m128i *)(output + 2 * stride),
                   _mm_unpackhi_epi64(res_02, res_02));
  // 30 31
  _mm_storel_epi64((__m128i *)(output + 3 * stride),
                   _mm_unpackhi_epi64(res_13, res_13));
  // 40 41
  _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
  // 50 51
  _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
  // 60 61
  _mm_storel_epi64((__m128i *)(output + 6 * stride),
                   _mm_unpackhi_epi64(res_46, res_46));
  // 70 71
  _mm_storel_epi64((__m128i *)(output + 7 * stride),
                   _mm_unpackhi_epi64(res_57, res_57));
}

// The AVX2 implementation has an advantage when multiple operations are
// combined together.
static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[8];
  const int input_stride = 8;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  assert(hitx_1d_tab[tx_type] < 2);
  assert(vitx_1d_tab[tx_type] < 2);
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  __m128i buf0[8];
  __m128i *buf0_cur = buf0;
  load_buffer_avx2(input, input_stride, buf0_cur);
  row_txfm(buf0, buf0);
  assert(shift[0] < 0);
  __m128i *_buf1 = buf1;
  round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
  assert(shift[1] < 0);
  col_txfm(buf1, buf1);
  round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
}

// AVX2 implementation of the 8x8 inverse transform. It was observed that
// coding AVX2 for tx_types with identity in either direction has no
// advantage.
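// A plain-language summary of the dispatch below, derived from its switch
// statement: IDTX (identity in both directions) is routed to
// av1_lowbd_inv_txfm2d_add_idtx_ssse3; V_DCT/V_ADST/V_FLIPADST (1-D transform
// on columns, identity on rows) go to
// av1_lowbd_inv_txfm2d_add_h_identity_ssse3; H_DCT/H_ADST/H_FLIPADST (1-D
// transform on rows, identity on columns) go to
// av1_lowbd_inv_txfm2d_add_v_identity_ssse3; every remaining DCT/ADST
// combination uses lowbd_inv_txfm2d_8x8_no_identity_avx2 above.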
static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_type) { case IDTX: av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); break; case V_DCT: case V_ADST: case V_FLIPADST: av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, tx_size, eob); break; case H_DCT: case H_ADST: case H_FLIPADST: av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, tx_size, eob); break; default: lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type, tx_size, eob); } } // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 static inline void lowbd_inv_txfm2d_add_universe_avx2( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { (void)eob; switch (tx_type) { case DCT_DCT: case ADST_DCT: // ADST in vertical, DCT in horizontal case DCT_ADST: // DCT in vertical, ADST in horizontal case ADST_ADST: // ADST in both directions case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, tx_size, eob); break; case IDTX: lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); break; case V_DCT: case V_ADST: case V_FLIPADST: lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, tx_size, eob); break; case H_DCT: case H_ADST: case H_FLIPADST: lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, tx_size, eob); break; default: av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, eob); break; } } void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_size) { case TX_4X4: case TX_4X8: case TX_8X4: case TX_8X16: case TX_16X8: case TX_4X16: case TX_16X4: case TX_8X32: case TX_32X8: av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, eob); break; case TX_8X8: lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size, eob); break; case TX_16X16: case TX_32X32: case TX_64X64: case TX_16X32: case TX_32X16: case TX_32X64: case TX_64X32: case TX_16X64: case TX_64X16: default: lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, tx_size, eob); break; } } void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { const TX_TYPE tx_type = txfm_param->tx_type; if (!txfm_param->lossless) { av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, txfm_param->tx_size, txfm_param->eob); } else { av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); } } aom-3.12.1/av1/common/x86/av1_inv_txfm_avx2.h000066400000000000000000000050701477627663500205210ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_

#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

#ifdef __cplusplus
extern "C" {
#endif

// half input is zero
#define btf_16_w16_0_avx2(w0, w1, in, out0, out1)    \
  do {                                               \
    const __m256i _w0 = _mm256_set1_epi16(w0 * 8);   \
    const __m256i _w1 = _mm256_set1_epi16(w1 * 8);   \
    const __m256i _in = in;                          \
    out0 = _mm256_mulhrs_epi16(_in, _w0);            \
    out1 = _mm256_mulhrs_epi16(_in, _w1);            \
  } while (0)

static inline void round_shift_avx2(const __m256i *input, __m256i *output,
                                    int size) {
  const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
  for (int i = 0; i < size; ++i) {
    output[i] = _mm256_mulhrs_epi16(input[i], scale);
  }
}

static inline void write_recon_w16_avx2(__m256i res, uint8_t *output) {
  __m128i pred = _mm_loadu_si128((__m128i const *)(output));
  __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
  __m128i y = _mm256_castsi256_si128(
      _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
  _mm_storeu_si128((__m128i *)(output), y);
}

static inline void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
                                                int stride, int flipud,
                                                int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    write_recon_w16_avx2(in[j], output + i * stride);
  }
}

void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type,
                                   TX_SIZE tx_size, int eob);

#ifdef __cplusplus
}
#endif

#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
aom-3.12.1/av1/common/x86/av1_inv_txfm_ssse3.c000066400000000000000000003364501477627663500207050ustar00rootroot00000000000000/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/av1_txfm_sse2.h" // TODO(venkatsanampudi@ittiam.com): move this to header file // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, 4 * 5793 }; // TODO(binpengsmail@gmail.com): replace some for loop with do {} while static void idct4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); // stage 1 __m128i x[4]; x[0] = input[0]; x[1] = input[2]; x[2] = input[1]; x[3] = input[3]; // stage 2 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); // stage 3 btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } static void idct4_w4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); // stage 1 __m128i x[4]; x[0] = input[0]; x[1] = input[2]; x[2] = input[1]; x[3] = input[3]; // stage 2 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); // stage 3 btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); } void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m128i x[2]; x[0] = input[0]; // stage 2 // stage 3 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 4 // stage 5 output[0] = x[0]; output[7] = x[0]; output[1] = x[1]; output[6] = x[1]; output[2] = x[1]; output[5] = x[1]; output[3] = x[0]; output[4] = x[0]; } void av1_idct8_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[8]; x[0] = input[0]; x[1] = input[4]; x[2] = input[2]; x[3] = input[6]; x[4] = input[1]; x[5] = input[5]; 
x[6] = input[3]; x[7] = input[7]; // stage 2 btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); // stage 3 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); // stage 4 btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); // stage 5 btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); } static void idct8_w4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[8]; x[0] = input[0]; x[1] = input[4]; x[2] = input[2]; x[3] = input[6]; x[4] = input[1]; x[5] = input[5]; x[6] = input[3]; x[7] = input[7]; // stage 2 btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); // stage 3 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); // stage 4 btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); // stage 5 btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); } static inline void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[11]); btf_16_adds_subs_sse2(x[9], x[10]); btf_16_subs_adds_sse2(x[15], x[12]); btf_16_subs_adds_sse2(x[14], x[13]); } static inline void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[7]); btf_16_adds_subs_sse2(x[1], x[6]); btf_16_adds_subs_sse2(x[2], x[5]); btf_16_adds_subs_sse2(x[3], x[4]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], 
x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); } static inline void idct16_stage7_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]); btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); } static void idct16_low1_ssse3(const __m128i *input, __m128i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m128i x[2]; x[0] = input[0]; // stage 2 // stage 3 // stage 4 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 5 // stage 6 // stage 7 output[0] = x[0]; output[15] = x[0]; output[1] = x[1]; output[14] = x[1]; output[2] = x[1]; output[13] = x[1]; output[3] = x[0]; output[12] = x[0]; output[4] = x[0]; output[11] = x[0]; output[5] = x[1]; output[10] = x[1]; output[6] = x[1]; output[9] = x[1]; output[7] = x[0]; output[8] = x[0]; } static void idct16_low8_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); // stage 1 __m128i x[16]; x[0] = input[0]; x[2] = input[4]; x[4] = input[2]; x[6] = input[6]; x[8] = input[1]; x[10] = input[5]; x[12] = input[3]; x[14] = input[7]; // stage 2 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); // stage 3 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[9]); btf_16_subs_adds_sse2(x[11], x[10]); btf_16_adds_subs_sse2(x[12], x[13]); btf_16_subs_adds_sse2(x[15], x[14]); // stage 4 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); idct16_stage5_sse2(x, cospi, __rounding, cos_bit); idct16_stage6_sse2(x, cospi, __rounding, cos_bit); idct16_stage7_sse2(output, x); } static void idct16_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], 
cospi[12]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); // stage 1 __m128i x[16]; x[0] = input[0]; x[1] = input[8]; x[2] = input[4]; x[3] = input[12]; x[4] = input[2]; x[5] = input[10]; x[6] = input[6]; x[7] = input[14]; x[8] = input[1]; x[9] = input[9]; x[10] = input[5]; x[11] = input[13]; x[12] = input[3]; x[13] = input[11]; x[14] = input[7]; x[15] = input[15]; // stage 2 btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); // stage 3 btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[9]); btf_16_subs_adds_sse2(x[11], x[10]); btf_16_adds_subs_sse2(x[12], x[13]); btf_16_subs_adds_sse2(x[15], x[14]); // stage 4 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); // stage 5~7 idct16_stage5_sse2(x, cospi, __rounding, cos_bit); idct16_stage6_sse2(x, cospi, __rounding, cos_bit); idct16_stage7_sse2(output, x); } static void idct16_w4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i 
cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[16]; x[0] = input[0]; x[1] = input[8]; x[2] = input[4]; x[3] = input[12]; x[4] = input[2]; x[5] = input[10]; x[6] = input[6]; x[7] = input[14]; x[8] = input[1]; x[9] = input[9]; x[10] = input[5]; x[11] = input[13]; x[12] = input[3]; x[13] = input[11]; x[14] = input[7]; x[15] = input[15]; // stage 2 btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); // stage 3 btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[9]); btf_16_subs_adds_sse2(x[11], x[10]); btf_16_adds_subs_sse2(x[12], x[13]); btf_16_subs_adds_sse2(x[15], x[14]); // stage 4 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); // stage 5 btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[11]); btf_16_adds_subs_sse2(x[9], x[10]); btf_16_subs_adds_sse2(x[15], x[12]); btf_16_subs_adds_sse2(x[14], x[13]); // stage 6 btf_16_adds_subs_sse2(x[0], x[7]); btf_16_adds_subs_sse2(x[1], x[6]); btf_16_adds_subs_sse2(x[2], x[5]); btf_16_adds_subs_sse2(x[3], x[4]); btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); // stage 7 idct16_stage7_sse2(output, x); } static inline void idct32_high16_stage3_sse2(__m128i *x) { btf_16_adds_subs_sse2(x[16], x[17]); btf_16_subs_adds_sse2(x[19], x[18]); btf_16_adds_subs_sse2(x[20], x[21]); btf_16_subs_adds_sse2(x[23], x[22]); btf_16_adds_subs_sse2(x[24], x[25]); btf_16_subs_adds_sse2(x[27], x[26]); btf_16_adds_subs_sse2(x[28], x[29]); btf_16_subs_adds_sse2(x[31], x[30]); } static inline void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); } static inline void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const 
__m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); btf_16_adds_subs_sse2(x[16], x[19]); btf_16_adds_subs_sse2(x[17], x[18]); btf_16_subs_adds_sse2(x[23], x[20]); btf_16_subs_adds_sse2(x[22], x[21]); btf_16_adds_subs_sse2(x[24], x[27]); btf_16_adds_subs_sse2(x[25], x[26]); btf_16_subs_adds_sse2(x[31], x[28]); btf_16_subs_adds_sse2(x[30], x[29]); } static inline void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[11]); btf_16_adds_subs_sse2(x[9], x[10]); btf_16_subs_adds_sse2(x[15], x[12]); btf_16_subs_adds_sse2(x[14], x[13]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); } static inline void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[7]); btf_16_adds_subs_sse2(x[1], x[6]); btf_16_adds_subs_sse2(x[2], x[5]); btf_16_adds_subs_sse2(x[3], x[4]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); btf_16_adds_subs_sse2(x[16], x[23]); btf_16_adds_subs_sse2(x[17], x[22]); btf_16_adds_subs_sse2(x[18], x[21]); btf_16_adds_subs_sse2(x[19], x[20]); btf_16_subs_adds_sse2(x[31], x[24]); btf_16_subs_adds_sse2(x[30], x[25]); btf_16_subs_adds_sse2(x[29], x[26]); btf_16_subs_adds_sse2(x[28], x[27]); } static inline void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[15]); btf_16_adds_subs_sse2(x[1], x[14]); btf_16_adds_subs_sse2(x[2], x[13]); btf_16_adds_subs_sse2(x[3], x[12]); btf_16_adds_subs_sse2(x[4], x[11]); btf_16_adds_subs_sse2(x[5], x[10]); btf_16_adds_subs_sse2(x[6], x[9]); btf_16_adds_subs_sse2(x[7], x[8]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); } static inline void idct32_stage9_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); btf_16_adds_subs_out_sse2(output[5], 
output[26], x[5], x[26]); btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); } static void idct32_low1_ssse3(const __m128i *input, __m128i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m128i x[2]; x[0] = input[0]; // stage 2 // stage 3 // stage 4 // stage 5 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 6 // stage 7 // stage 8 // stage 9 output[0] = x[0]; output[31] = x[0]; output[1] = x[1]; output[30] = x[1]; output[2] = x[1]; output[29] = x[1]; output[3] = x[0]; output[28] = x[0]; output[4] = x[0]; output[27] = x[0]; output[5] = x[1]; output[26] = x[1]; output[6] = x[1]; output[25] = x[1]; output[7] = x[0]; output[24] = x[0]; output[8] = x[0]; output[23] = x[0]; output[9] = x[1]; output[22] = x[1]; output[10] = x[1]; output[21] = x[1]; output[11] = x[0]; output[20] = x[0]; output[12] = x[0]; output[19] = x[0]; output[13] = x[1]; output[18] = x[1]; output[14] = x[1]; output[17] = x[1]; output[15] = x[0]; output[16] = x[0]; } static void idct32_low8_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m128i x[32]; x[0] = input[0]; x[4] = input[4]; x[8] = input[2]; x[12] = input[6]; x[16] = input[1]; x[20] = input[5]; x[24] = input[3]; x[28] = input[7]; // stage 2 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); // stage 3 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); x[17] = x[16]; x[18] = x[19]; x[21] = x[20]; x[22] = x[23]; x[25] = x[24]; x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; // stage 4 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); x[9] = x[8]; x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); // stage 5 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); // stage 6 x[3] = x[0]; x[2] = x[1]; idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); idct32_stage7_sse2(x, cospi, __rounding, cos_bit); idct32_stage8_sse2(x, cospi, __rounding, cos_bit); idct32_stage9_sse2(output, x); } static void idct32_low16_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m128i x[32]; x[0] = input[0]; x[2] = input[8]; x[4] = input[4]; x[6] = input[12]; x[8] = input[2]; x[10] = input[10]; x[12] = input[6]; x[14] = input[14]; x[16] = input[1]; x[18] = input[9]; x[20] = input[5]; x[22] = input[13]; x[24] = input[3]; x[26] = input[11]; x[28] = input[7]; x[30] = input[15]; // stage 2 
btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); // stage 3 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); idct32_high16_stage3_sse2(x); // stage 4 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[9]); btf_16_subs_adds_sse2(x[11], x[10]); btf_16_adds_subs_sse2(x[12], x[13]); btf_16_subs_adds_sse2(x[15], x[14]); idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); // stage 5 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); idct32_stage7_sse2(x, cospi, __rounding, cos_bit); idct32_stage8_sse2(x, cospi, __rounding, cos_bit); idct32_stage9_sse2(output, x); } static void idct32_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const 
__m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1
  __m128i x[32];
  x[0] = input[0];
  x[1] = input[16];
  x[2] = input[8];
  x[3] = input[24];
  x[4] = input[4];
  x[5] = input[20];
  x[6] = input[12];
  x[7] = input[28];
  x[8] = input[2];
  x[9] = input[18];
  x[10] = input[10];
  x[11] = input[26];
  x[12] = input[6];
  x[13] = input[22];
  x[14] = input[14];
  x[15] = input[30];
  x[16] = input[1];
  x[17] = input[17];
  x[18] = input[9];
  x[19] = input[25];
  x[20] = input[5];
  x[21] = input[21];
  x[22] = input[13];
  x[23] = input[29];
  x[24] = input[3];
  x[25] = input[19];
  x[26] = input[11];
  x[27] = input[27];
  x[28] = input[7];
  x[29] = input[23];
  x[30] = input[15];
  x[31] = input[31];

  // stage 2
  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);

  // stage 3
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stage 7~8
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}

static inline void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 =
pair_set_epi16(-cospi[44], -cospi[20]); const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); } static inline void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); btf_16_adds_subs_sse2(x[32], x[35]); btf_16_adds_subs_sse2(x[33], x[34]); btf_16_subs_adds_sse2(x[39], x[36]); btf_16_subs_adds_sse2(x[38], x[37]); btf_16_adds_subs_sse2(x[40], x[43]); btf_16_adds_subs_sse2(x[41], x[42]); btf_16_subs_adds_sse2(x[47], x[44]); btf_16_subs_adds_sse2(x[46], x[45]); btf_16_adds_subs_sse2(x[48], x[51]); btf_16_adds_subs_sse2(x[49], x[50]); btf_16_subs_adds_sse2(x[55], x[52]); btf_16_subs_adds_sse2(x[54], x[53]); btf_16_adds_subs_sse2(x[56], x[59]); btf_16_adds_subs_sse2(x[57], x[58]); btf_16_subs_adds_sse2(x[63], x[60]); btf_16_subs_adds_sse2(x[62], x[61]); } static inline void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); } static inline void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { btf_16_adds_subs_sse2(x[16], x[19]); btf_16_adds_subs_sse2(x[17], x[18]); 
btf_16_subs_adds_sse2(x[23], x[20]); btf_16_subs_adds_sse2(x[22], x[21]); btf_16_adds_subs_sse2(x[24], x[27]); btf_16_adds_subs_sse2(x[25], x[26]); btf_16_subs_adds_sse2(x[31], x[28]); btf_16_subs_adds_sse2(x[30], x[29]); idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); } static inline void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); btf_16_adds_subs_sse2(x[32], x[39]); btf_16_adds_subs_sse2(x[33], x[38]); btf_16_adds_subs_sse2(x[34], x[37]); btf_16_adds_subs_sse2(x[35], x[36]); btf_16_subs_adds_sse2(x[47], x[40]); btf_16_subs_adds_sse2(x[46], x[41]); btf_16_subs_adds_sse2(x[45], x[42]); btf_16_subs_adds_sse2(x[44], x[43]); btf_16_adds_subs_sse2(x[48], x[55]); btf_16_adds_subs_sse2(x[49], x[54]); btf_16_adds_subs_sse2(x[50], x[53]); btf_16_adds_subs_sse2(x[51], x[52]); btf_16_subs_adds_sse2(x[63], x[56]); btf_16_subs_adds_sse2(x[62], x[57]); btf_16_subs_adds_sse2(x[61], x[58]); btf_16_subs_adds_sse2(x[60], x[59]); } static inline void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); btf_16_adds_subs_sse2(x[16], x[23]); btf_16_adds_subs_sse2(x[17], x[22]); btf_16_adds_subs_sse2(x[18], x[21]); btf_16_adds_subs_sse2(x[19], x[20]); btf_16_subs_adds_sse2(x[31], x[24]); btf_16_subs_adds_sse2(x[30], x[25]); btf_16_subs_adds_sse2(x[29], x[26]); btf_16_subs_adds_sse2(x[28], x[27]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); } static inline void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[15]); btf_16_adds_subs_sse2(x[1], x[14]); btf_16_adds_subs_sse2(x[2], x[13]); btf_16_adds_subs_sse2(x[3], x[12]); btf_16_adds_subs_sse2(x[4], x[11]); btf_16_adds_subs_sse2(x[5], x[10]); btf_16_adds_subs_sse2(x[6], x[9]); btf_16_adds_subs_sse2(x[7], x[8]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); btf_16_adds_subs_sse2(x[32], x[47]); 
btf_16_adds_subs_sse2(x[33], x[46]); btf_16_adds_subs_sse2(x[34], x[45]); btf_16_adds_subs_sse2(x[35], x[44]); btf_16_adds_subs_sse2(x[36], x[43]); btf_16_adds_subs_sse2(x[37], x[42]); btf_16_adds_subs_sse2(x[38], x[41]); btf_16_adds_subs_sse2(x[39], x[40]); btf_16_subs_adds_sse2(x[63], x[48]); btf_16_subs_adds_sse2(x[62], x[49]); btf_16_subs_adds_sse2(x[61], x[50]); btf_16_subs_adds_sse2(x[60], x[51]); btf_16_subs_adds_sse2(x[59], x[52]); btf_16_subs_adds_sse2(x[58], x[53]); btf_16_subs_adds_sse2(x[57], x[54]); btf_16_subs_adds_sse2(x[56], x[55]); } static inline void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); btf_16_adds_subs_sse2(x[0], x[31]); btf_16_adds_subs_sse2(x[1], x[30]); btf_16_adds_subs_sse2(x[2], x[29]); btf_16_adds_subs_sse2(x[3], x[28]); btf_16_adds_subs_sse2(x[4], x[27]); btf_16_adds_subs_sse2(x[5], x[26]); btf_16_adds_subs_sse2(x[6], x[25]); btf_16_adds_subs_sse2(x[7], x[24]); btf_16_adds_subs_sse2(x[8], x[23]); btf_16_adds_subs_sse2(x[9], x[22]); btf_16_adds_subs_sse2(x[10], x[21]); btf_16_adds_subs_sse2(x[11], x[20]); btf_16_adds_subs_sse2(x[12], x[19]); btf_16_adds_subs_sse2(x[13], x[18]); btf_16_adds_subs_sse2(x[14], x[17]); btf_16_adds_subs_sse2(x[15], x[16]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); } static inline void idct64_stage11_sse2(__m128i *output, __m128i *x) { btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); btf_16_adds_subs_out_sse2(output[23], 
output[40], x[23], x[40]); btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); } static void idct64_low1_ssse3(const __m128i *input, __m128i *output) { const int32_t *cospi = cospi_arr(INV_COS_BIT); // stage 1 __m128i x[32]; x[0] = input[0]; // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); // stage 7 // stage 8 // stage 9 // stage 10 // stage 11 output[0] = x[0]; output[63] = x[0]; output[1] = x[1]; output[62] = x[1]; output[2] = x[1]; output[61] = x[1]; output[3] = x[0]; output[60] = x[0]; output[4] = x[0]; output[59] = x[0]; output[5] = x[1]; output[58] = x[1]; output[6] = x[1]; output[57] = x[1]; output[7] = x[0]; output[56] = x[0]; output[8] = x[0]; output[55] = x[0]; output[9] = x[1]; output[54] = x[1]; output[10] = x[1]; output[53] = x[1]; output[11] = x[0]; output[52] = x[0]; output[12] = x[0]; output[51] = x[0]; output[13] = x[1]; output[50] = x[1]; output[14] = x[1]; output[49] = x[1]; output[15] = x[0]; output[48] = x[0]; output[16] = x[0]; output[47] = x[0]; output[17] = x[1]; output[46] = x[1]; output[18] = x[1]; output[45] = x[1]; output[19] = x[0]; output[44] = x[0]; output[20] = x[0]; output[43] = x[0]; output[21] = x[1]; output[42] = x[1]; output[22] = x[1]; output[41] = x[1]; output[23] = x[0]; output[40] = x[0]; output[24] = x[0]; output[39] = x[0]; output[25] = x[1]; output[38] = x[1]; output[26] = x[1]; output[37] = x[1]; output[27] = x[0]; output[36] = x[0]; output[28] = x[0]; output[35] = x[0]; output[29] = x[1]; output[34] = x[1]; output[30] = x[1]; output[33] = x[1]; output[31] = x[0]; output[32] = x[0]; } static void idct64_low8_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[64]; x[0] = input[0]; x[8] = input[4]; x[16] = input[2]; x[24] = input[6]; x[32] = input[1]; x[40] = input[5]; x[48] = input[3]; 
x[56] = input[7]; // stage 2 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); x[33] = x[32]; x[38] = x[39]; x[41] = x[40]; x[46] = x[47]; x[49] = x[48]; x[54] = x[55]; x[57] = x[56]; x[62] = x[63]; // stage 4 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); x[17] = x[16]; x[22] = x[23]; x[25] = x[24]; x[30] = x[31]; btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); // stage 5 x[9] = x[8]; x[14] = x[15]; btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); x[35] = x[32]; x[34] = x[33]; x[36] = x[39]; x[37] = x[38]; x[43] = x[40]; x[42] = x[41]; x[44] = x[47]; x[45] = x[46]; x[51] = x[48]; x[50] = x[49]; x[52] = x[55]; x[53] = x[54]; x[59] = x[56]; x[58] = x[57]; x[60] = x[63]; x[61] = x[62]; // stage 6 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); x[19] = x[16]; x[18] = x[17]; x[20] = x[23]; x[21] = x[22]; x[27] = x[24]; x[26] = x[25]; x[28] = x[31]; x[29] = x[30]; idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); // stage 7 x[3] = x[0]; x[2] = x[1]; x[11] = x[8]; x[10] = x[9]; x[12] = x[15]; x[13] = x[14]; idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); // stage 8 x[7] = x[0]; x[6] = x[1]; x[5] = x[2]; x[4] = x[3]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); idct64_stage9_sse2(x, cospi, __rounding, cos_bit); idct64_stage10_sse2(x, cospi, __rounding, cos_bit); idct64_stage11_sse2(output, x); } static void idct64_low16_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[64]; x[0] = input[0]; x[4] = input[8]; x[8] = input[4]; x[12] = input[12]; x[16] = input[2]; x[20] = input[10]; x[24] = input[6]; x[28] = input[14]; x[32] = input[1]; x[36] = input[9]; x[40] = input[5]; x[44] = input[13]; x[48] = input[3]; x[52] = input[11]; x[56] = input[7]; x[60] = input[15]; // stage 2 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 
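/* In this low16 path many butterflies degenerate because one of their two
 * inputs is known to be zero: the full graph computes (a, b) -> (a + b, a - b),
 * and with b == 0 both outputs equal a. That is why the stages below contain
 * plain copies such as x[33] = x[32] where the full idct64 uses
 * btf_16_adds_subs_sse2. A minimal sketch of the saturating butterfly the
 * copies stand in for (names here are illustrative, not part of the library):
 *
 *   static inline void butterfly16(__m128i *a, __m128i *b) {
 *     const __m128i t0 = *a, t1 = *b;
 *     *a = _mm_adds_epi16(t0, t1);
 *     *b = _mm_subs_epi16(t0, t1);
 *   }
 */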
btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); x[33] = x[32]; x[34] = x[35]; x[37] = x[36]; x[38] = x[39]; x[41] = x[40]; x[42] = x[43]; x[45] = x[44]; x[46] = x[47]; x[49] = x[48]; x[50] = x[51]; x[53] = x[52]; x[54] = x[55]; x[57] = x[56]; x[58] = x[59]; x[61] = x[60]; x[62] = x[63]; // stage 4 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); x[17] = x[16]; x[18] = x[19]; x[21] = x[20]; x[22] = x[23]; x[25] = x[24]; x[26] = x[27]; x[29] = x[28]; x[30] = x[31]; idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); // stage 5 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); x[9] = x[8]; x[10] = x[11]; x[13] = x[12]; x[14] = x[15]; idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); // stage 6 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); x[5] = x[4]; x[6] = x[7]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); // stage 7 x[3] = x[0]; x[2] = x[1]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[11]); btf_16_adds_subs_sse2(x[9], x[10]); btf_16_subs_adds_sse2(x[15], x[12]); btf_16_subs_adds_sse2(x[14], x[13]); idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); // stage 8 btf_16_adds_subs_sse2(x[0], x[7]); btf_16_adds_subs_sse2(x[1], x[6]); btf_16_adds_subs_sse2(x[2], x[5]); btf_16_adds_subs_sse2(x[3], x[4]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); idct64_stage9_sse2(x, cospi, __rounding, cos_bit); idct64_stage10_sse2(x, cospi, __rounding, cos_bit); idct64_stage11_sse2(output, x); } static void idct64_low32_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); // stage 1 __m128i x[64]; x[0] = input[0]; x[2] = input[16]; x[4] = input[8]; x[6] = input[24]; x[8] = input[4]; x[10] = input[20]; x[12] = input[12]; x[14] = input[28]; x[16] = input[2]; x[18] = input[18]; x[20] = input[10]; x[22] = input[26]; x[24] = input[6]; x[26] = input[22]; x[28] = input[14]; x[30] = input[30]; x[32] = input[1]; x[34] = input[17]; x[36] = input[9]; x[38] = input[25]; x[40] = input[5]; x[42] = input[21]; x[44] = input[13]; x[46] = input[29]; x[48] = input[3]; x[50] = input[19]; x[52] = input[11]; x[54] = input[27]; x[56] = input[7]; x[58] = input[23]; x[60] = input[15]; x[62] = input[31]; // stage 2 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); btf_16_ssse3(-cospi[41], cospi[23], 
x[58], x[37], x[58]); btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); // stage 3 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); btf_16_adds_subs_sse2(x[32], x[33]); btf_16_subs_adds_sse2(x[35], x[34]); btf_16_adds_subs_sse2(x[36], x[37]); btf_16_subs_adds_sse2(x[39], x[38]); btf_16_adds_subs_sse2(x[40], x[41]); btf_16_subs_adds_sse2(x[43], x[42]); btf_16_adds_subs_sse2(x[44], x[45]); btf_16_subs_adds_sse2(x[47], x[46]); btf_16_adds_subs_sse2(x[48], x[49]); btf_16_subs_adds_sse2(x[51], x[50]); btf_16_adds_subs_sse2(x[52], x[53]); btf_16_subs_adds_sse2(x[55], x[54]); btf_16_adds_subs_sse2(x[56], x[57]); btf_16_subs_adds_sse2(x[59], x[58]); btf_16_adds_subs_sse2(x[60], x[61]); btf_16_subs_adds_sse2(x[63], x[62]); // stage 4 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); btf_16_adds_subs_sse2(x[16], x[17]); btf_16_subs_adds_sse2(x[19], x[18]); btf_16_adds_subs_sse2(x[20], x[21]); btf_16_subs_adds_sse2(x[23], x[22]); btf_16_adds_subs_sse2(x[24], x[25]); btf_16_subs_adds_sse2(x[27], x[26]); btf_16_adds_subs_sse2(x[28], x[29]); btf_16_subs_adds_sse2(x[31], x[30]); idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); // stage 5 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[9]); btf_16_subs_adds_sse2(x[11], x[10]); btf_16_adds_subs_sse2(x[12], x[13]); btf_16_subs_adds_sse2(x[15], x[14]); idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); // stage 6 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); btf_16_adds_subs_sse2(x[4], x[5]); btf_16_subs_adds_sse2(x[7], x[6]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); // stage 7 btf_16_adds_subs_sse2(x[0], x[3]); btf_16_adds_subs_sse2(x[1], x[2]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); btf_16_adds_subs_sse2(x[8], x[11]); btf_16_adds_subs_sse2(x[9], x[10]); btf_16_subs_adds_sse2(x[15], x[12]); btf_16_subs_adds_sse2(x[14], x[13]); idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); // stage 8 btf_16_adds_subs_sse2(x[0], x[7]); btf_16_adds_subs_sse2(x[1], x[6]); btf_16_adds_subs_sse2(x[2], x[5]); btf_16_adds_subs_sse2(x[3], x[4]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); 
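/* The btf_16_sse2(Wa, Wb, in0, in1, out0, out1) calls above and below are
 * full two-input rotations. A sketch of what each 16-bit lane evaluates to
 * (rounding to nearest, with cos_bit == INV_COS_BIT == 12):
 *
 *   out0 = (Wa.first * in0 + Wa.second * in1 + 2048) >> 12
 *   out1 = (Wb.first * in0 + Wb.second * in1 + 2048) >> 12
 *
 * where Wx.first/Wx.second are the two 16-bit weights handed to
 * pair_set_epi16(). With Wa = (-cospi[32], cospi[32]) and
 * Wb = (cospi[32], cospi[32]) this is the 45-degree rotation
 * (in0, in1) -> (in1 - in0, in1 + in0) scaled by cos(pi/4). The macro picks
 * up the __rounding and cos_bit values from the enclosing function, which is
 * why each of these routines declares them near the top.
 */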
btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); // stage 9~11 idct64_stage9_sse2(x, cospi, __rounding, cos_bit); idct64_stage10_sse2(x, cospi, __rounding, cos_bit); idct64_stage11_sse2(output, x); } static void iadst4_sse2(const __m128i *input, __m128i *output) { const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); __m128i x0[4]; x0[0] = input[0]; x0[1] = input[1]; x0[2] = input[2]; x0[3] = input[3]; __m128i u[4]; u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); __m128i x1[16]; x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3 x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); __m128i x2[8]; x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 x2[1] = _mm_add_epi32(x1[1], x1[5]); x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 x2[3] = _mm_add_epi32(x1[3], x1[7]); x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 x2[5] = _mm_add_epi32(x1[9], x1[11]); x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1 x2[7] = _mm_add_epi32(x1[13], x1[15]); const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); for (int i = 0; i < 4; ++i) { __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); out0 = _mm_srai_epi32(out0, INV_COS_BIT); out1 = _mm_srai_epi32(out1, INV_COS_BIT); output[i] = _mm_packs_epi32(out0, out1); } } static void iadst4_w4_sse2(const __m128i *input, __m128i *output) { const int32_t *sinpi = sinpi_arr(INV_COS_BIT); const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); __m128i x0[4]; x0[0] = input[0]; x0[1] = 
input[1]; x0[2] = input[2]; x0[3] = input[3]; __m128i u[2]; u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); __m128i x1[8]; x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 __m128i x2[4]; x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); for (int i = 0; i < 4; ++i) { __m128i out0 = _mm_add_epi32(x2[i], rounding); out0 = _mm_srai_epi32(out0, INV_COS_BIT); output[i] = _mm_packs_epi32(out0, out0); } } void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); // stage 1 __m128i x[8]; x[1] = input[0]; // stage 2 btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); // stage 3 x[4] = x[0]; x[5] = x[1]; // stage 4 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); // stage 5 x[2] = x[0]; x[3] = x[1]; x[6] = x[4]; x[7] = x[5]; // stage 6 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); // stage 7 output[0] = x[0]; output[1] = _mm_subs_epi16(__zero, x[4]); output[2] = x[6]; output[3] = _mm_subs_epi16(__zero, x[2]); output[4] = x[3]; output[5] = _mm_subs_epi16(__zero, x[7]); output[6] = x[5]; output[7] = _mm_subs_epi16(__zero, x[1]); } void av1_iadst8_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); // stage 1 __m128i x[8]; x[0] = input[7]; x[1] = input[0]; x[2] = input[5]; x[3] = input[2]; x[4] = input[3]; x[5] = input[4]; x[6] = input[1]; x[7] = input[6]; // stage 2 btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); // stage 3 btf_16_adds_subs_sse2(x[0], x[4]); btf_16_adds_subs_sse2(x[1], x[5]); btf_16_adds_subs_sse2(x[2], x[6]); btf_16_adds_subs_sse2(x[3], x[7]); // stage 4 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); // stage 5 btf_16_adds_subs_sse2(x[0], x[2]); btf_16_adds_subs_sse2(x[1], x[3]); btf_16_adds_subs_sse2(x[4], x[6]); btf_16_adds_subs_sse2(x[5], x[7]); // stage 6 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); // stage 7 output[0] = x[0]; output[1] = _mm_subs_epi16(__zero, x[4]); output[2] = x[6]; output[3] = _mm_subs_epi16(__zero, x[2]); output[4] = x[3]; output[5] = _mm_subs_epi16(__zero, x[7]); output[6] = x[5]; output[7] = _mm_subs_epi16(__zero, x[1]); } static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); // stage 1 __m128i x[8]; x[0] = input[7]; x[1] = input[0]; x[2] = input[5]; x[3] = input[2]; x[4] = input[3]; x[5] = input[4]; x[6] = input[1]; x[7] = input[6]; // stage 2 btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); // stage 3 btf_16_adds_subs_sse2(x[0], x[4]); btf_16_adds_subs_sse2(x[1], x[5]); btf_16_adds_subs_sse2(x[2], x[6]); btf_16_adds_subs_sse2(x[3], x[7]); // stage 4 btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); // stage 5 btf_16_adds_subs_sse2(x[0], x[2]); btf_16_adds_subs_sse2(x[1], x[3]); btf_16_adds_subs_sse2(x[4], x[6]); btf_16_adds_subs_sse2(x[5], x[7]); // stage 6 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); // stage 7 output[0] = 
x[0]; output[1] = _mm_subs_epi16(__zero, x[4]); output[2] = x[6]; output[3] = _mm_subs_epi16(__zero, x[2]); output[4] = x[3]; output[5] = _mm_subs_epi16(__zero, x[7]); output[6] = x[5]; output[7] = _mm_subs_epi16(__zero, x[1]); } static inline void iadst16_stage3_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[8]); btf_16_adds_subs_sse2(x[1], x[9]); btf_16_adds_subs_sse2(x[2], x[10]); btf_16_adds_subs_sse2(x[3], x[11]); btf_16_adds_subs_sse2(x[4], x[12]); btf_16_adds_subs_sse2(x[5], x[13]); btf_16_adds_subs_sse2(x[6], x[14]); btf_16_adds_subs_sse2(x[7], x[15]); } static inline void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); } static inline void iadst16_stage5_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[4]); btf_16_adds_subs_sse2(x[1], x[5]); btf_16_adds_subs_sse2(x[2], x[6]); btf_16_adds_subs_sse2(x[3], x[7]); btf_16_adds_subs_sse2(x[8], x[12]); btf_16_adds_subs_sse2(x[9], x[13]); btf_16_adds_subs_sse2(x[10], x[14]); btf_16_adds_subs_sse2(x[11], x[15]); } static inline void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); } static inline void iadst16_stage7_ssse3(__m128i *x) { btf_16_adds_subs_sse2(x[0], x[2]); btf_16_adds_subs_sse2(x[1], x[3]); btf_16_adds_subs_sse2(x[4], x[6]); btf_16_adds_subs_sse2(x[5], x[7]); btf_16_adds_subs_sse2(x[8], x[10]); btf_16_adds_subs_sse2(x[9], x[11]); btf_16_adds_subs_sse2(x[12], x[14]); btf_16_adds_subs_sse2(x[13], x[15]); } static inline void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, const __m128i __rounding, int8_t cos_bit) { const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); } static inline void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { const __m128i __zero = _mm_setzero_si128(); output[0] = x[0]; output[1] = _mm_subs_epi16(__zero, x[8]); output[2] = x[12]; output[3] = _mm_subs_epi16(__zero, x[4]); output[4] = x[6]; output[5] = _mm_subs_epi16(__zero, x[14]); output[6] = x[10]; output[7] = _mm_subs_epi16(__zero, 
x[2]); output[8] = x[3]; output[9] = _mm_subs_epi16(__zero, x[11]); output[10] = x[15]; output[11] = _mm_subs_epi16(__zero, x[7]); output[12] = x[5]; output[13] = _mm_subs_epi16(__zero, x[13]); output[14] = x[9]; output[15] = _mm_subs_epi16(__zero, x[1]); } static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); // stage 1 __m128i x[16]; x[1] = input[0]; // stage 2 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); // stage 3 x[8] = x[0]; x[9] = x[1]; // stage 4 btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); // stage 5 x[4] = x[0]; x[5] = x[1]; x[12] = x[8]; x[13] = x[9]; // stage 6 btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); // stage 7 x[2] = x[0]; x[3] = x[1]; x[6] = x[4]; x[7] = x[5]; x[10] = x[8]; x[11] = x[9]; x[14] = x[12]; x[15] = x[13]; iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage9_ssse3(output, x); } static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // stage 1 __m128i x[16]; x[1] = input[0]; x[3] = input[2]; x[5] = input[4]; x[7] = input[6]; x[8] = input[7]; x[10] = input[5]; x[12] = input[3]; x[14] = input[1]; // stage 2 btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); // stage 3 iadst16_stage3_ssse3(x); iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage5_ssse3(x); iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage7_ssse3(x); iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage9_ssse3(output, x); } static void iadst16_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); 
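/* Reading aid for the constant names in this block (a naming convention,
 * not new API): cospi_pAA_mBB is pair_set_epi16(+cospi[AA], -cospi[BB]),
 * i.e. every 32-bit lane holds the fixed-point 16-bit pair
 * (+cos(AA*pi/128), -cos(BB*pi/128)), so one _mm_madd_epi16 against
 * interleaved (in0, in1) lanes produces cospi[AA]*in0 - cospi[BB]*in1.
 * For example, the first stage-2 butterfly below uses cospi_p02_p62 and
 * cospi_p62_m02 to rotate the pair (x[0], x[1]).
 */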
const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); // stage 1 __m128i x[16]; x[0] = input[15]; x[1] = input[0]; x[2] = input[13]; x[3] = input[2]; x[4] = input[11]; x[5] = input[4]; x[6] = input[9]; x[7] = input[6]; x[8] = input[7]; x[9] = input[8]; x[10] = input[5]; x[11] = input[10]; x[12] = input[3]; x[13] = input[12]; x[14] = input[1]; x[15] = input[14]; // stage 2 btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); // stage 3~9 iadst16_stage3_ssse3(x); iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage5_ssse3(x); iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage7_ssse3(x); iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); iadst16_stage9_ssse3(output, x); } static void iadst16_w4_sse2(const __m128i *input, __m128i *output) { const int8_t cos_bit = INV_COS_BIT; const int32_t *cospi = cospi_arr(INV_COS_BIT); const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); // stage 1 __m128i x[16]; x[0] = input[15]; x[1] = input[0]; x[2] = 
input[13]; x[3] = input[2]; x[4] = input[11]; x[5] = input[4]; x[6] = input[9]; x[7] = input[6]; x[8] = input[7]; x[9] = input[8]; x[10] = input[5]; x[11] = input[10]; x[12] = input[3]; x[13] = input[12]; x[14] = input[1]; x[15] = input[14]; // stage 2 btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); // stage 3 iadst16_stage3_ssse3(x); // stage 4 btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); // stage 5 iadst16_stage5_ssse3(x); // stage 6 btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); // stage 7 iadst16_stage7_ssse3(x); // stage 8 btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); // stage 9 iadst16_stage9_ssse3(output, x); } static void iidentity4_ssse3(const __m128i *input, __m128i *output) { const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); for (int i = 0; i < 4; ++i) { __m128i x = _mm_mulhrs_epi16(input[i], scale); output[i] = _mm_adds_epi16(x, input[i]); } } static void iidentity8_sse2(const __m128i *input, __m128i *output) { for (int i = 0; i < 8; ++i) { output[i] = _mm_adds_epi16(input[i], input[i]); } } static void iidentity16_ssse3(const __m128i *input, __m128i *output) { const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); for (int i = 0; i < 16; ++i) { __m128i x = _mm_mulhrs_epi16(input[i], scale); __m128i srcx2 = _mm_adds_epi16(input[i], input[i]); output[i] = _mm_adds_epi16(x, srcx2); } } static inline __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, __m128i res) { const __m128i zero = _mm_setzero_si128(); __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); return _mm_packus_epi16(x0, x0); } static inline void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, const int height) { int j = flipud ? (height - 1) : 0; const int step = flipud ? 
-1 : 1; const __m128i zero = _mm_setzero_si128(); for (int i = 0; i < height; ++i, j += step) { const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride))); __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); u = _mm_packus_epi16(u, zero); *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u); } } static inline void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, const int height) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; for (int i = 0; i < height; ++i, j += step) { const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]); _mm_storel_epi64((__m128i *)(output + i * stride), u); } } // 1D functions process 8 pixels at one time. static const transform_1d_ssse3 lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { { idct4_sse2, iadst4_sse2, iidentity4_ssse3 }, { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 }, { idct16_sse2, iadst16_sse2, iidentity16_ssse3 }, { idct32_sse2, NULL, NULL }, { idct64_low32_ssse3, NULL, NULL }, }; // functions for blocks with eob at DC and within // topleft 8x8, 16x16, 32x32 corner static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { idct4_sse2, idct4_sse2, NULL, NULL }, { iadst4_sse2, iadst4_sse2, NULL, NULL }, { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL }, }, { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL }, { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL }, { iidentity8_sse2, iidentity8_sse2, NULL, NULL } }, { { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL }, { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3, idct32_sse2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3, idct64_low32_ssse3 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; // 1D functions process 4 pixels at one time.
// used in 4x4, 4x8, 4x16, 8x4, 16x4 static const transform_1d_ssse3 lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, { NULL, NULL, NULL }, { NULL, NULL, NULL }, }; static inline void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, int stride, int shift, int height, int txw_idx, int rect_type) { const int32_t *input_row = input; const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + (1 << (NewSqrt2Bits - shift - 1))); const __m128i one = _mm_set1_epi16(1); const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); if (rect_type != 1 && rect_type != -1) { for (int i = 0; i < height; ++i) { const __m128i src = load_32bit_to_16bit(input_row); input_row += stride; __m128i lo = _mm_unpacklo_epi16(src, one); __m128i hi = _mm_unpackhi_epi16(src, one); lo = _mm_madd_epi16(lo, scale_rounding); hi = _mm_madd_epi16(hi, scale_rounding); lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm_packs_epi32(lo, hi); } } else { const __m128i rect_scale = _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); for (int i = 0; i < height; ++i) { __m128i src = load_32bit_to_16bit(input_row); src = _mm_mulhrs_epi16(src, rect_scale); input_row += stride; __m128i lo = _mm_unpacklo_epi16(src, one); __m128i hi = _mm_unpackhi_epi16(src, one); lo = _mm_madd_epi16(lo, scale_rounding); hi = _mm_madd_epi16(hi, scale_rounding); lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); out[i] = _mm_packs_epi32(lo, hi); } } } static inline void iidentity_col_8xn_ssse3(uint8_t *output, int stride, __m128i *buf, int shift, int height, int txh_idx) { const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); const __m128i one = _mm_set1_epi16(1); const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); const __m128i zero = _mm_setzero_si128(); for (int h = 0; h < height; ++h) { __m128i lo = _mm_unpacklo_epi16(buf[h], one); __m128i hi = _mm_unpackhi_epi16(buf[h], one); lo = _mm_madd_epi16(lo, scale_coeff); hi = _mm_madd_epi16(hi, scale_coeff); lo = _mm_srai_epi32(lo, NewSqrt2Bits); hi = _mm_srai_epi32(hi, NewSqrt2Bits); lo = _mm_add_epi32(lo, shift_rounding); hi = _mm_add_epi32(hi, shift_rounding); lo = _mm_srai_epi32(lo, -shift); hi = _mm_srai_epi32(hi, -shift); __m128i x = _mm_packs_epi32(lo, hi); const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); const __m128i u = _mm_packus_epi16(x, x); _mm_storel_epi64((__m128i *)(output), u); output += stride; } } void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size) { const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int col_max = AOMMIN(32, txfm_size_col); const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); for (int i = 0; i < (col_max >> 3); ++i) { for (int j = 0; j < (row_max >> 3); 
j++) { __m128i buf[8]; iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride, row_max, shift[0], 8, txw_idx, rect_type); transpose_16bit_8x8(buf, buf); iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf, shift[1], 8, txh_idx); } } } static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size_, int eob) { (void)tx_size_; (void)eob; __m128i buf[4]; const TX_SIZE tx_size = TX_4X4; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); row_txfm(buf, buf); if (lr_flip) { __m128i temp[4]; flip_buf_sse2(buf, temp, txfm_size_col); transpose_16bit_4x4(temp, buf); } else { transpose_16bit_4x4(buf, buf); } col_txfm(buf, buf); round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } static inline __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, __m128i res0, __m128i res1) { const __m128i zero = _mm_setzero_si128(); __m128i x0 = _mm_unpacklo_epi8(pred, zero); __m128i x1 = _mm_unpackhi_epi8(pred, zero); x0 = _mm_adds_epi16(res0, x0); x1 = _mm_adds_epi16(res1, x1); return _mm_packus_epi16(x0, x1); } static inline void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, int stride, int flipud, int height) { int j = flipud ? (height - 1) : 0; const int step = flipud ? 
-1 : 1; for (int i = 0; i < height; ++i, j += step) { __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); _mm_storeu_si128((__m128i *)(output + i * stride), u); } } static inline void round_shift_ssse3(const __m128i *input, __m128i *output, int size) { const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); for (int i = 0; i < size; ++i) { output[i] = _mm_mulhrs_epi16(input[i], scale); } } static inline void lowbd_inv_txfm2d_add_no_identity_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m128i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_nonzero_h_div8; i++) { __m128i buf0[64]; load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code } row_txfm(buf0, buf0); round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); __m128i *_buf1 = buf1 + i * 8; if (lr_flip) { for (int j = 0; j < buf_size_w_div8; ++j) { __m128i temp[8]; flip_buf_sse2(buf0 + 8 * j, temp, 8); transpose_16bit_8x8(temp, _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); } } else { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); } } } for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row); round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); } if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, output + 16 * i, stride, ud_flip, txfm_size_row); } } else if (txfm_size_col == 8) { lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); } } void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = (eobx + 8) >> 3; const int buf_size_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int 
fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; assert(fun_idx < 5); const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; assert(col_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_w_div8; i++) { __m128i buf0[64]; for (int j = 0; j < buf_size_h_div8; j++) { __m128i *buf0_cur = buf0 + j * 8; const int32_t *input_cur = input + i * 8 * input_stride + j * 8; iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8, txw_idx, rect_type); transpose_16bit_8x8(buf0_cur, buf0_cur); } col_txfm(buf0, buf0); __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); int k = ud_flip ? (txfm_size_row - 1) : 0; const int step = ud_flip ? -1 : 1; uint8_t *out = output + 8 * i; for (int j = 0; j < txfm_size_row; ++j, k += step) { const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); const __m128i u = lowbd_get_recon_8x8_sse2(v, res); _mm_storel_epi64((__m128i *)(out), u); out += stride; } } } void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { __m128i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; const int buf_size_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < buf_size_h_div8; i++) { __m128i buf0[64]; load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code } row_txfm(buf0, buf0); round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); __m128i *_buf1 = buf1; if (lr_flip) { for (int j = 0; j < buf_size_w_div8; ++j) { __m128i temp[8]; flip_buf_sse2(buf0 + 8 * j, temp, 8); transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); } } else { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); } } for (int j = 0; j < buf_size_w_div8; ++j) { iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride, buf1 + j * 8, shift[1], 8, txh_idx); } } } // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 static inline void lowbd_inv_txfm2d_add_universe_ssse3( const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_type) { case DCT_DCT: lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, tx_size, eob); break; case IDTX: av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); break; case V_DCT: case V_ADST: case V_FLIPADST: av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, tx_size, eob); break; case H_DCT: case H_ADST: case H_FLIPADST: av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, 
tx_size, eob); break; default: lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, tx_size, eob); break; } } static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size_, int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_4X8; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col); round_shift_ssse3(buf, buf, txfm_size_col); // rect special code row_txfm(buf, buf); // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 if (lr_flip) { __m128i temp[4]; flip_buf_sse2(buf, temp, txfm_size_col); transpose_16bit_8x4(temp, buf); } else { transpose_16bit_8x4(buf, buf); } col_txfm(buf, buf); round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size_, int eob) { (void)tx_size_; (void)eob; __m128i buf[8]; const TX_SIZE tx_size = TX_8X4; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); round_shift_ssse3(buf, buf, txfm_size_col); // rect special code row_txfm(buf, buf); // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0 if (lr_flip) { __m128i temp[8]; flip_buf_sse2(buf, temp, txfm_size_col); transpose_16bit_4x8(temp, buf); } else { transpose_16bit_4x8(buf, buf); } col_txfm(buf, buf); round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size_, int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_4X16; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); const int row_one_loop = 8; for (int i = 0; i < 2; ++i) { const int32_t *input_cur = input + i * row_one_loop; __m128i *buf_cur = buf + i * row_one_loop; load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur, txfm_size_col); if (row_txfm == 
iidentity4_ssse3) { const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); const __m128i ones = _mm_set1_epi16(1); for (int j = 0; j < 4; ++j) { const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); } } else { row_txfm(buf_cur, buf_cur); round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); } if (lr_flip) { __m128i temp[8]; flip_buf_sse2(buf_cur, temp, txfm_size_col); transpose_16bit_8x4(temp, buf_cur); } else { transpose_16bit_8x4(buf_cur, buf_cur); } } col_txfm(buf, buf); round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); } static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size_, int eob) { (void)tx_size_; (void)eob; __m128i buf[16]; const TX_SIZE tx_size = TX_16X4; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; const transform_1d_ssse3 row_txfm = lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; const transform_1d_ssse3 col_txfm = lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); const int row_one_loop = 8; load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); if (row_txfm == iidentity16_ssse3) { const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); const __m128i ones = _mm_set1_epi16(1); for (int j = 0; j < 16; ++j) { const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); const __m128i buf_32_lo = _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); const __m128i buf_32_hi = _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); } } else { row_txfm(buf, buf); round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); } if (lr_flip) { __m128i temp[16]; flip_buf_sse2(buf, temp, 16); transpose_16bit_4x8(temp, buf); transpose_16bit_4x8(temp + 8, buf + 8); } else { transpose_16bit_4x8(buf, buf); transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); } for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf + i * row_one_loop, buf + i * row_one_loop); round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); } lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); } void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob) { switch (tx_size) { case TX_4X4: lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, eob); break; case TX_4X8: lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, eob); break; case TX_8X4: lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, eob); break; case TX_4X16: lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, eob); break; case TX_16X4: 
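/* TX_16X4, like the other narrow sizes handled above, gets a dedicated
 * routine; every other transform size is routed through the default branch
 * to lowbd_inv_txfm2d_add_universe_ssse3(), which then picks an identity or
 * general path from the TX_TYPE. A typical call looks roughly like the
 * following (a sketch; dqcoeff, dst, stride and eob are assumed to come from
 * the caller's transform-block context):
 *
 *   av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride,
 *                                  DCT_DCT, TX_16X16, eob);
 */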
lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, eob); break; default: lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, tx_size, eob); break; } } void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { if (!txfm_param->lossless) { const TX_TYPE tx_type = txfm_param->tx_type; av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, txfm_param->tx_size, txfm_param->eob); } else { av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); } } aom-3.12.1/av1/common/x86/av1_inv_txfm_ssse3.h000066400000000000000000000214051477627663500207010ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ #define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ #include // SSE2 #include // SSSE3 #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/transpose_sse2.h" #ifdef __cplusplus extern "C" { #endif #define btf_16_ssse3(w0, w1, in, out0, out1) \ do { \ const __m128i _w0 = _mm_set1_epi16(w0 * 8); \ const __m128i _w1 = _mm_set1_epi16(w1 * 8); \ const __m128i _in = in; \ out0 = _mm_mulhrs_epi16(_in, _w0); \ out1 = _mm_mulhrs_epi16(_in, _w1); \ } while (0) #define btf_16_adds_subs_sse2(in0, in1) \ do { \ const __m128i _in0 = in0; \ const __m128i _in1 = in1; \ in0 = _mm_adds_epi16(_in0, _in1); \ in1 = _mm_subs_epi16(_in0, _in1); \ } while (0) #define btf_16_subs_adds_sse2(in0, in1) \ do { \ const __m128i _in0 = in0; \ const __m128i _in1 = in1; \ in1 = _mm_subs_epi16(_in0, _in1); \ in0 = _mm_adds_epi16(_in0, _in1); \ } while (0) #define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \ do { \ const __m128i _in0 = in0; \ const __m128i _in1 = in1; \ out0 = _mm_adds_epi16(_in0, _in1); \ out1 = _mm_subs_epi16(_in0, _in1); \ } while (0) static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { if (bit < 0) { const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); for (int i = 0; i < size; ++i) { in[i] = _mm_mulhrs_epi16(in[i], scale); } } else if (bit > 0) { for (int i = 0; i < size; ++i) { in[i] = _mm_slli_epi16(in[i], bit); } } } // 1D itx types enum { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, } UENUM1BYTE(ITX_TYPE_1D); static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, }; static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x16_default[16]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 
0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x32_default[32]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x32_default[32]) = { 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x16_default[16]) = { 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, }; DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, }; DECLARE_ALIGNED(16, static const int16_t *, av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { NULL, av1_eob_to_eobxy_8x8_default, av1_eob_to_eobxy_16x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x16_default, av1_eob_to_eobxy_16x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, av1_eob_to_eobxy_32x32_default, av1_eob_to_eobxy_32x32_default, NULL, NULL, av1_eob_to_eobxy_8x32_default, av1_eob_to_eobxy_32x8_default, av1_eob_to_eobxy_16x32_default, av1_eob_to_eobxy_32x16_default, }; static const int lowbd_txfm_all_1d_zeros_idx[32] = { 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, }; // Transform block width in log2 for eob (size of 64 map to 32) static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, }; static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { if (eob == 1) { *eobx = 0; *eoby = 0; return; } const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; const int eob_row = (eob - 1) >> tx_w_log2; const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; *eobx = eobxy & 0xFF; *eoby = eobxy >> 8; } static const int eob_fill[32] = { 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, }; static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_col = tx_size_wide[tx_size]; const int eobx_max = AOMMIN(32, txfm_size_col) - 1; *eobx = (eob >= eobx_max) ? 
eobx_max : eob_fill[eob]; const int temp_eoby = eob / (eobx_max + 1); assert(temp_eoby < 32); *eoby = eob_fill[temp_eoby]; } static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, TX_SIZE tx_size, int eob) { eob -= 1; const int txfm_size_row = tx_size_high[tx_size]; const int eoby_max = AOMMIN(32, txfm_size_row) - 1; *eobx = eob_fill[eob / (eoby_max + 1)]; *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; } typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output); void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob); void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, int stride, TX_SIZE tx_size); void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob); void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob); void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output); void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ aom-3.12.1/av1/common/x86/av1_txfm_sse2.h000066400000000000000000000315271477627663500176470ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ #include // SSE2 #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/transpose_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "av1/common/av1_txfm.h" #ifdef __cplusplus extern "C" { #endif static inline void btf_16_w4_sse2( const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, __m128i *const out0, __m128i *const out1) { const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); const __m128i u0 = _mm_madd_epi16(t0, *w0); const __m128i v0 = _mm_madd_epi16(t0, *w1); const __m128i a0 = _mm_add_epi32(u0, __rounding); const __m128i b0 = _mm_add_epi32(v0, __rounding); const __m128i c0 = _mm_srai_epi32(a0, cos_bit); const __m128i d0 = _mm_srai_epi32(b0, cos_bit); *out0 = _mm_packs_epi32(c0, c0); *out1 = _mm_packs_epi32(d0, c0); } #define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ do { \ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ __m128i u0 = _mm_madd_epi16(t0, w0); \ __m128i v0 = _mm_madd_epi16(t0, w1); \ \ __m128i a0 = _mm_add_epi32(u0, __rounding); \ __m128i b0 = _mm_add_epi32(v0, __rounding); \ \ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ \ out0 = _mm_packs_epi32(c0, c0); \ out1 = _mm_packs_epi32(d0, d0); \ } while (0) #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ do { \ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ __m128i u0 = _mm_madd_epi16(t0, w0); \ __m128i u1 = _mm_madd_epi16(t1, w0); \ __m128i v0 = _mm_madd_epi16(t0, w1); \ __m128i v1 = _mm_madd_epi16(t1, w1); \ \ __m128i a0 = _mm_add_epi32(u0, __rounding); \ __m128i a1 = _mm_add_epi32(u1, __rounding); \ __m128i b0 = _mm_add_epi32(v0, __rounding); \ __m128i b1 = _mm_add_epi32(v1, __rounding); \ \ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ \ out0 = _mm_packs_epi32(c0, c1); \ out1 = _mm_packs_epi32(d0, d1); \ } while (0) static inline __m128i load_16bit_to_16bit(const int16_t *a) { return _mm_load_si128((const __m128i *)a); } static inline __m128i load_32bit_to_16bit(const int32_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); } static inline __m128i load_32bit_to_16bit_w4(const int32_t *a) { const __m128i a_low = _mm_load_si128((const __m128i *)a); return _mm_packs_epi32(a_low, a_low); } // Store 4 16 bit values. Sign extend the values. static inline void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { const __m128i a_lo = _mm_unpacklo_epi16(a, a); const __m128i a_1 = _mm_srai_epi32(a_lo, 16); _mm_store_si128((__m128i *)b, a_1); } // Store 8 16 bit values. Sign extend the values. 
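// (The sign extension below uses a common SSE2 idiom: unpacking a register with itself,
// e.g. _mm_unpacklo_epi16(a, a), duplicates each 16-bit lane into a 32-bit lane, and the
// arithmetic shift right by 16 then leaves the sign-extended 32-bit value.)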
static inline void store_16bit_to_32bit(__m128i a, int32_t *b) { const __m128i a_lo = _mm_unpacklo_epi16(a, a); const __m128i a_hi = _mm_unpackhi_epi16(a, a); const __m128i a_1 = _mm_srai_epi32(a_lo, 16); const __m128i a_2 = _mm_srai_epi32(a_hi, 16); _mm_store_si128((__m128i *)b, a_1); _mm_store_si128((__m128i *)(b + 4), a_2); } static inline __m128i scale_round_sse2(const __m128i a, const int scale) { const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); const __m128i b = _mm_madd_epi16(a, scale_rounding); return _mm_srai_epi32(b, NewSqrt2Bits); } static inline void store_rect_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { const __m128i one = _mm_set1_epi16(1); const __m128i a_lo = _mm_unpacklo_epi16(a, one); const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); _mm_store_si128((__m128i *)b, b_lo); } static inline void store_rect_16bit_to_32bit(const __m128i a, int32_t *const b) { const __m128i one = _mm_set1_epi16(1); const __m128i a_lo = _mm_unpacklo_epi16(a, one); const __m128i a_hi = _mm_unpackhi_epi16(a, one); const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); _mm_store_si128((__m128i *)b, b_lo); _mm_store_si128((__m128i *)(b + 4), b_hi); } static inline void load_buffer_16bit_to_16bit_w4(const int16_t *const in, const int stride, __m128i *const out, const int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); } } static inline void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, const int stride, __m128i *const out, const int out_size) { for (int i = 0; i < out_size; ++i) { out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); } } static inline void load_buffer_16bit_to_16bit(const int16_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_16bit_to_16bit(in + i * stride); } } static inline void load_buffer_16bit_to_16bit_flip(const int16_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); } } static inline void load_buffer_32bit_to_16bit(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_32bit_to_16bit(in + i * stride); } } static inline void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = load_32bit_to_16bit_w4(in + i * stride); } } static inline void load_buffer_32bit_to_16bit_flip(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); } } static inline void store_buffer_16bit_to_32bit_w4(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_16bit_to_32bit_w4(in[i], out + i * stride); } } static inline void store_buffer_16bit_to_32bit_w8(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_16bit_to_32bit(in[i], out + i * stride); } } static inline void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_rect_16bit_to_32bit_w4(in[i], out + i * stride); } } static inline void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, int32_t *const 
out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_rect_16bit_to_32bit(in[i], out + i * stride); } } static inline void store_buffer_16bit_to_16bit_8x8(const __m128i *in, uint16_t *out, const int stride) { for (int i = 0; i < 8; ++i) { _mm_store_si128((__m128i *)(out + i * stride), in[i]); } } static inline void round_shift_16bit(__m128i *in, int size, int bit) { if (bit < 0) { bit = -bit; __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); for (int i = 0; i < size; ++i) { in[i] = _mm_adds_epi16(in[i], rounding); in[i] = _mm_srai_epi16(in[i], bit); } } else if (bit > 0) { for (int i = 0; i < size; ++i) { in[i] = _mm_slli_epi16(in[i], bit); } } } static inline void flip_buf_sse2(__m128i *in, __m128i *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, int8_t cos_bit); void av1_iadst8_sse2(const __m128i *input, __m128i *output); void av1_idct8_sse2(const __m128i *input, __m128i *output); typedef struct { transform_1d_sse2 col, row; // vertical and horizontal } transform_2d_sse2; #ifdef __cplusplus } #endif // __cplusplus #endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ aom-3.12.1/av1/common/x86/av1_txfm_sse4.c000066400000000000000000000015741477627663500176430ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse4.h" // This function assumes `arr` is 16-byte aligned. void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { __m128i *const vec = (__m128i *)arr; const int vec_size = size >> 2; av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); } aom-3.12.1/av1/common/x86/av1_txfm_sse4.h000066400000000000000000000045311477627663500176440ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ #define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ #include #ifdef __cplusplus extern "C" { #endif static inline __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { __m128i tmp, round; round = _mm_set1_epi32(1 << (bit - 1)); tmp = _mm_add_epi32(vec, round); return _mm_srai_epi32(tmp, bit); } static inline void av1_round_shift_array_32_sse4_1(const __m128i *input, __m128i *output, const int size, const int bit) { if (bit > 0) { int i; for (i = 0; i < size; i++) { output[i] = av1_round_shift_32_sse4_1(input[i], bit); } } else { int i; for (i = 0; i < size; i++) { output[i] = _mm_slli_epi32(input[i], -bit); } } } static inline void av1_round_shift_rect_array_32_sse4_1(const __m128i *input, __m128i *output, const int size, const int bit, const int val) { const __m128i sqrt2 = _mm_set1_epi32(val); if (bit > 0) { int i; for (i = 0; i < size; i++) { const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); } } else { int i; for (i = 0; i < size; i++) { const __m128i r0 = _mm_slli_epi32(input[i], -bit); const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); } } } #ifdef __cplusplus } #endif #endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ aom-3.12.1/av1/common/x86/cdef_block_avx2.c000066400000000000000000000364301477627663500201720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_avx2 #include "av1/common/cdef_block_simd.h" /* partial A is a 16-bit vector of the form: [x8 - - x1 | x16 - - x9] and partial B has the form: [0 y1 - y7 | 0 y9 - y15]. This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants are in const1 and const2. 
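   (Equivalently: the cost is the sum over i of (x_i^2 + y_i^2) * C_i, with y8 taken as 0.
   The constants loaded below are 840, 420, 280, 210, 168, 140, 120 and 105, i.e. 840 / i
   for i = 1..8, consistent with normalising each line sum by the number of pixels on that
   line and with the divide-by-840 note further down in this file.)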
*/ static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala, __m256i *partialb, const __m256i *const1, const __m256i *const2) { // Mask used to shuffle the elements present in 256bit register. static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100, 0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100 }; __m256i tmp; /* Reverse partial B. */ *partialb = _mm256_shuffle_epi8( *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit)); /* Interleave the x and y values of identical indices and pair x8 with 0. */ tmp = *partiala; *partiala = _mm256_unpacklo_epi16(*partiala, *partialb); *partialb = _mm256_unpackhi_epi16(tmp, *partialb); /* Square and add the corresponding x and y values. */ *partiala = _mm256_madd_epi16(*partiala, *partiala); *partialb = _mm256_madd_epi16(*partialb, *partialb); /* Multiply by constant. */ *partiala = _mm256_mullo_epi32(*partiala, *const1); *partialb = _mm256_mullo_epi32(*partialb, *const2); /* Sum all results. */ *partiala = _mm256_add_epi32(*partiala, *partialb); return *partiala; } static inline __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, __m256i *x3) { const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1); const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3); const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1); const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3); *x0 = _mm256_unpacklo_epi64(t0, t1); *x1 = _mm256_unpackhi_epi64(t0, t1); *x2 = _mm256_unpacklo_epi64(t2, t3); *x3 = _mm256_unpackhi_epi64(t2, t3); return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1), _mm256_add_epi32(*x2, *x3)); } /* Computes cost for directions 0, 5, 6 and 7. We can call this function again to compute the remaining directions. */ static inline __m256i compute_directions_avx2(__m256i *lines, int32_t cost_frist_8x8[4], int32_t cost_second_8x8[4]) { __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; __m256i partial6; __m256i tmp; /* Partial sums for lines 0 and 1. */ partial4a = _mm256_slli_si256(lines[0], 14); partial4b = _mm256_srli_si256(lines[0], 2); partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4)); tmp = _mm256_add_epi16(lines[0], lines[1]); partial5a = _mm256_slli_si256(tmp, 10); partial5b = _mm256_srli_si256(tmp, 6); partial7a = _mm256_slli_si256(tmp, 4); partial7b = _mm256_srli_si256(tmp, 12); partial6 = tmp; /* Partial sums for lines 2 and 3. */ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6)); partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8)); tmp = _mm256_add_epi16(lines[2], lines[3]); partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8)); partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8)); partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6)); partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10)); partial6 = _mm256_add_epi16(partial6, tmp); /* Partial sums for lines 4 and 5. 
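   As with the earlier lines, the left shift applied to partial4 drops by one 16-bit element
   per input line, so pixels lying on the same diagonal accumulate into the same lane;
   partial5 and partial7 advance by one element per pair of lines, and partial6 is the
   plain unshifted column sum.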
*/ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10)); partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12)); tmp = _mm256_add_epi16(lines[4], lines[5]); partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6)); partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10)); partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8)); partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8)); partial6 = _mm256_add_epi16(partial6, tmp); /* Partial sums for lines 6 and 7. */ partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2)); partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14)); partial4a = _mm256_add_epi16(partial4a, lines[7]); tmp = _mm256_add_epi16(lines[6], lines[7]); partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4)); partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12)); partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10)); partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6)); partial6 = _mm256_add_epi16(partial6, tmp); const __m256i const_reg_1 = _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840); const __m256i const_reg_2 = _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168); const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0); const __m256i const_reg_4 = _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140); /* Compute costs in terms of partial sums. */ partial4a = fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2); partial7a = fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4); partial5a = fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4); partial6 = _mm256_madd_epi16(partial6, partial6); partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105)); partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a); _mm_storeu_si128((__m128i *)cost_frist_8x8, _mm256_castsi256_si128(partial4a)); _mm_storeu_si128((__m128i *)cost_second_8x8, _mm256_extractf128_si256(partial4a, 1)); return partial4a; } /* transpose and reverse the order of the lines -- equivalent to a 90-degree counter-clockwise rotation of the pixels. 
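   Rotating the pixels this way lets the caller reuse compute_directions_avx2(), which
   handles the mostly vertical directions, to measure the mostly horizontal ones as well.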
*/ static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]); const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]); const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]); const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]); const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5); const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5); const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3); const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3); const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1); res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1); res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3); res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3); res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5); res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5); res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7); res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7); } void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var_out_1st, int32_t *var_out_2nd, int coeff_shift, int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { int32_t cost_first_8x8[8]; int32_t cost_second_8x8[8]; // Used to store the best cost for 2 8x8's. int32_t best_cost[2] = { 0 }; // Best direction for 2 8x8's. int best_dir[2] = { 0 }; const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift); const __m256i const_128_reg = _mm256_set1_epi16(128); __m256i lines[8]; for (int i = 0; i < 8; i++) { const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]); const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]); lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1); lines[i] = _mm256_sub_epi16( _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg); } /* Compute "mostly vertical" directions. */ const __m256i dir47 = compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4); /* Transpose and reverse the order of the lines. */ array_reverse_transpose_8x8_avx2(lines, lines); /* Compute "mostly horizontal" directions. 
*/ const __m256i dir03 = compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8); __m256i max = _mm256_max_epi32(dir03, dir47); max = _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8), _mm256_slli_si256(max, 16 - (8)))); max = _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4), _mm256_slli_si256(max, 16 - (4)))); const __m128i first_8x8_output = _mm256_castsi256_si128(max); const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1); const __m128i cmpeg_res_00 = _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47)); const __m128i cmpeg_res_01 = _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03)); const __m128i cmpeg_res_10 = _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1)); const __m128i cmpeg_res_11 = _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1)); const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00); const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10); best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max)); best_cost[1] = _mm_cvtsi128_si32(second_8x8_output); best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8)); best_dir[0] = get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8)); best_dir[1] = get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros /* Difference between the optimal variance and the variance along the orthogonal direction. Again, the sum(x^2) terms cancel out. */ *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7]; *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7]; /* We'd normally divide by 840, but dividing by 1024 is close enough for what we're going to do with this. */ *var_out_1st >>= 10; *var_out_2nd >>= 10; *out_dir_1st_8x8 = best_dir[0]; *out_dir_2nd_8x8 = best_dir[1]; } void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { int j = 0; int remaining_width = width; assert(height % 2 == 0); assert(height > 0); assert(width > 0); // Process multiple 32 pixels at a time. if (remaining_width > 31) { int i = 0; do { j = 0; do { __m128i row00 = _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]); __m128i row01 = _mm_loadu_si128( (const __m128i *)&src[(i + 0) * sstride + (j + 16)]); __m128i row10 = _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]); __m128i row11 = _mm_loadu_si128( (const __m128i *)&src[(i + 1) * sstride + (j + 16)]); _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)], _mm256_cvtepu8_epi16(row00)); _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)], _mm256_cvtepu8_epi16(row01)); _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)], _mm256_cvtepu8_epi16(row10)); _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)], _mm256_cvtepu8_epi16(row11)); j += 32; } while (j <= width - 32); i += 2; } while (i < height); remaining_width = width & 31; } // Process 16 pixels at a time. 
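// Each of the narrower paths below resumes at column j, the first column the wider path
// above did not copy, and remaining_width holds the width left over from that path.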
if (remaining_width > 15) { int i = 0; do { __m128i row0 = _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]); __m128i row1 = _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]); _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j], _mm256_cvtepu8_epi16(row0)); _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j], _mm256_cvtepu8_epi16(row1)); i += 2; } while (i < height); remaining_width = width & 15; j += 16; } // Process 8 pixels at a time. if (remaining_width > 7) { int i = 0; do { __m128i row0 = _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]); __m128i row1 = _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]); _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j], _mm_unpacklo_epi8(row0, _mm_setzero_si128())); _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j], _mm_unpacklo_epi8(row1, _mm_setzero_si128())); i += 2; } while (i < height); remaining_width = width & 7; j += 8; } // Process 4 pixels at a time. if (remaining_width > 3) { int i = 0; do { __m128i row0 = _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j])); __m128i row1 = _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j])); _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j], _mm_unpacklo_epi8(row0, _mm_setzero_si128())); _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j], _mm_unpacklo_epi8(row1, _mm_setzero_si128())); i += 2; } while (i < height); remaining_width = width & 3; j += 4; } // Process the remaining pixels. if (remaining_width) { for (int i = 0; i < height; i++) { for (int k = j; k < width; k++) { dst[i * dstride + k] = src[i * sstride + k]; } } } } aom-3.12.1/av1/common/x86/cdef_block_sse4.c000066400000000000000000000032111477627663500201570ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_sse4_1 #include "av1/common/cdef_block_simd.h" void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var_out_1st, int32_t *var_out_2nd, int coeff_shift, int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { // Process first 8x8. *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); // Process second 8x8. *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); } void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { int j = 0; for (int i = 0; i < height; i++) { for (j = 0; j < (width & ~0x7); j += 8) { v64 row = v64_load_unaligned(&src[i * sstride + j]); v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); } for (; j < width; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } aom-3.12.1/av1/common/x86/cdef_block_ssse3.c000066400000000000000000000040571477627663500203520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Include SSSE3 CDEF code only for 32-bit x86, to support Valgrind. // For normal use, we require SSE4.1, so cdef_*_sse4_1 will be used instead of // these functions. However, 32-bit Valgrind does not support SSE4.1, so we // include a fallback to SSSE3 to improve performance #include "config/aom_config.h" #if !AOM_ARCH_X86 #error "cdef_block_ssse3.c is included for compatibility with 32-bit x86 only" #endif // !AOM_ARCH_X86 #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_ssse3 #include "av1/common/cdef_block_simd.h" void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, int stride, int32_t *var_out_1st, int32_t *var_out_2nd, int coeff_shift, int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { // Process first 8x8. *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); // Process second 8x8. *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); } void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { int j; for (int i = 0; i < height; i++) { for (j = 0; j < (width & ~0x7); j += 8) { v64 row = v64_load_unaligned(&src[i * sstride + j]); v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); } for (; j < width; j++) { dst[i * dstride + j] = src[i * sstride + j]; } } } aom-3.12.1/av1/common/x86/cfl_avx2.c000066400000000000000000000515741477627663500166710ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "av1/common/cfl.h" #include "av1/common/x86/cfl_simd.h" #define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ TX_SIZE tx_size) { \ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ NULL, /* 64x64 (invalid CFL size) */ \ cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ NULL, /* 32x64 (invalid CFL size) */ \ NULL, /* 64x32 (invalid CFL size) */ \ cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ NULL, /* 16x64 (invalid CFL size) */ \ NULL, /* 64x16 (invalid CFL size) */ \ }; \ return subfn_##sub[tx_size]; \ } /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. * * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. */ static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos const int luma_stride = input_stride << 1; __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; do { __m256i top = _mm256_loadu_si256((__m256i *)input); __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); _mm256_storeu_si256(row, sum_16x16); input += luma_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) /** * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more * precise version of a box filter 4:2:2 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. 
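* Implementation note: _mm256_maddubs_epi16 against a vector of fours produces 4 * (a + b)
* for each horizontal pair, i.e. the exact pair average kept at 8x (Q3) scale, which is why
* this is more precise than rounding an integer box-filter average first.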
*/ static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do { __m256i top = _mm256_loadu_si256((__m256i *)input); __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); _mm256_storeu_si256(row, top_16x16); input += input_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) /** * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only * performed on block of width 32. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; const __m256i zeros = _mm256_setzero_si256(); do { __m256i top = _mm256_loadu_si256((__m256i *)input); top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); row_lo = _mm256_slli_epi16(row_lo, 3); __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); row_hi = _mm256_slli_epi16(row_hi, 3); _mm256_storeu_si256(row, row_lo); _mm256_storeu_si256(row + 1, row_hi); input += input_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) #if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. * * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. */ static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 const int luma_stride = input_stride << 1; __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; do { __m256i top = _mm256_loadu_si256((__m256i *)input); __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); __m256i sum = _mm256_add_epi16(top, bot); __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); __m256i hsum = _mm256_hadd_epi16(sum, sum_1); hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); hsum = _mm256_add_epi16(hsum, hsum); _mm256_storeu_si256(row, hsum); input += luma_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) /** * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more * precise version of a box filter 4:2:2 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. 
* * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. * */ static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do { __m256i top = _mm256_loadu_si256((__m256i *)input); __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); __m256i hsum = _mm256_hadd_epi16(top, top_1); hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); hsum = _mm256_slli_epi16(hsum, 2); _mm256_storeu_si256(row, hsum); input += input_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { (void)width; // Forever 32 __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do { __m256i top = _mm256_loadu_si256((__m256i *)input); __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); input += input_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) #endif // CONFIG_AV1_HIGHBITDEPTH static inline __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, __m256i alpha_sign, __m256i dc_q0) { __m256i ac_q3 = _mm256_loadu_si256(input); __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); __m256i scaled_luma_q0 = _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); return _mm256_add_epi16(scaled_luma_q0, dc_q0); } static inline void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { (void)width; const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); const __m256i dc_q0 = _mm256_set1_epi16(*dst); __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do { __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); res = _mm256_packus_epi16(res, next); res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); _mm256_storeu_si256((__m256i *)dst, res); dst += dst_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_PREDICT_X(avx2, 32, 8, lbd) CFL_PREDICT_X(avx2, 32, 16, lbd) CFL_PREDICT_X(avx2, 32, 32, lbd) cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { cfl_predict_lbd_4x4_ssse3, /* 4x4 */ cfl_predict_lbd_8x8_ssse3, /* 8x8 */ cfl_predict_lbd_16x16_ssse3, /* 16x16 */ cfl_predict_lbd_32x32_avx2, /* 32x32 */ NULL, /* 64x64 (invalid CFL size) */ cfl_predict_lbd_4x8_ssse3, /* 4x8 */ cfl_predict_lbd_8x4_ssse3, /* 8x4 */ cfl_predict_lbd_8x16_ssse3, /* 8x16 */ cfl_predict_lbd_16x8_ssse3, /* 16x8 */ cfl_predict_lbd_16x32_ssse3, /* 16x32 */ cfl_predict_lbd_32x16_avx2, /* 32x16 */ NULL, /* 32x64 (invalid CFL size) */ NULL, /* 64x32 (invalid CFL size) */ cfl_predict_lbd_4x16_ssse3, /* 4x16 */ cfl_predict_lbd_16x4_ssse3, /* 16x4 */ cfl_predict_lbd_8x32_ssse3, /* 8x32 */ cfl_predict_lbd_32x8_avx2, /* 32x8 */ NULL, /* 16x64 (invalid CFL size) */ NULL, /* 
64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } #if CONFIG_AV1_HIGHBITDEPTH static __m256i highbd_max_epi16(int bd) { const __m256i neg_one = _mm256_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); } static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); } static inline void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { // Use SSSE3 version for smaller widths assert(width == 16 || width == 32); const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); const __m256i max = highbd_max_epi16(bd); __m256i *row = (__m256i *)pred_buf_q3; const __m256i *row_end = row + height * CFL_BUF_LINE_I256; do { const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); _mm256_storeu_si256((__m256i *)dst, highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); if (width == 32) { const __m256i res_1 = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); _mm256_storeu_si256( (__m256i *)(dst + 16), highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); } dst += dst_stride; } while ((row += CFL_BUF_LINE_I256) < row_end); } CFL_PREDICT_X(avx2, 16, 4, hbd) CFL_PREDICT_X(avx2, 16, 8, hbd) CFL_PREDICT_X(avx2, 16, 16, hbd) CFL_PREDICT_X(avx2, 16, 32, hbd) CFL_PREDICT_X(avx2, 32, 8, hbd) CFL_PREDICT_X(avx2, 32, 16, hbd) CFL_PREDICT_X(avx2, 32, 32, hbd) cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { cfl_predict_hbd_4x4_ssse3, /* 4x4 */ cfl_predict_hbd_8x8_ssse3, /* 8x8 */ cfl_predict_hbd_16x16_avx2, /* 16x16 */ cfl_predict_hbd_32x32_avx2, /* 32x32 */ NULL, /* 64x64 (invalid CFL size) */ cfl_predict_hbd_4x8_ssse3, /* 4x8 */ cfl_predict_hbd_8x4_ssse3, /* 8x4 */ cfl_predict_hbd_8x16_ssse3, /* 8x16 */ cfl_predict_hbd_16x8_avx2, /* 16x8 */ cfl_predict_hbd_16x32_avx2, /* 16x32 */ cfl_predict_hbd_32x16_avx2, /* 32x16 */ NULL, /* 32x64 (invalid CFL size) */ NULL, /* 64x32 (invalid CFL size) */ cfl_predict_hbd_4x16_ssse3, /* 4x16 */ cfl_predict_hbd_16x4_avx2, /* 16x4 */ cfl_predict_hbd_8x32_ssse3, /* 8x32 */ cfl_predict_hbd_32x8_avx2, /* 32x8 */ NULL, /* 16x64 (invalid CFL size) */ NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. return pred[tx_size % TX_SIZES_ALL]; } #endif // CONFIG_AV1_HIGHBITDEPTH // Returns a vector where all the (32-bits) elements are the sum of all the // lanes in a. 
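// For example, a == [1, 2, 3, 4, 5, 6, 7, 8] yields [36, 36, 36, 36, 36, 36, 36, 36].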
static inline __m256i fill_sum_epi32(__m256i a) { // Given that a == [A, B, C, D, E, F, G, H] a = _mm256_hadd_epi32(a, a); // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H // a == [A', C', A', C', E', G', E', G'] a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); // a == [A', C', E', G', A', C', E', G'] a = _mm256_hadd_epi32(a, a); // Given that A'' == A' + C' and E'' == E' + G' // a == [A'', E'', A'', E'', A'', E'', A'', E''] return _mm256_hadd_epi32(a, a); // Given that A''' == A'' + E'' // a == [A''', A''', A''', A''', A''', A''', A''', A'''] } static inline __m256i _mm256_addl_epi16(__m256i a) { return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); } static inline void subtract_average_avx2(const uint16_t *src_ptr, int16_t *dst_ptr, int width, int height, int round_offset, int num_pel_log2) { // Use SSE2 version for smaller widths assert(width == 16 || width == 32); const __m256i *src = (__m256i *)src_ptr; const __m256i *const end = src + height * CFL_BUF_LINE_I256; // To maximize usage of the AVX2 registers, we sum two rows per loop // iteration const int step = 2 * CFL_BUF_LINE_I256; __m256i sum = _mm256_setzero_si256(); // For width 32, we use a second sum accumulator to reduce accumulator // dependencies in the loop. __m256i sum2; if (width == 32) sum2 = _mm256_setzero_si256(); do { // Add top row to the bottom row __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); if (width == 32) { /* Don't worry, this if it gets optimized out. */ // Add the second part of the top row to the second part of the bottom row __m256i l1 = _mm256_add_epi16(_mm256_loadu_si256(src + 1), _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); } src += step; } while (src < end); // Combine both sum accumulators if (width == 32) sum = _mm256_add_epi32(sum, sum2); __m256i fill = fill_sum_epi32(sum); __m256i avg_epi16 = _mm256_srli_epi32( _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); // Store and subtract loop src = (__m256i *)src_ptr; __m256i *dst = (__m256i *)dst_ptr; do { _mm256_storeu_si256(dst, _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); if (width == 32) { _mm256_storeu_si256( dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); } src += CFL_BUF_LINE_I256; dst += CFL_BUF_LINE_I256; } while (src < end); } // Declare wrappers for AVX2 sizes CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) // Based on the observation that for small blocks AVX2 does not outperform // SSE2, we call the SSE2 code for block widths 4 and 8. 
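// (Block widths 16 and 32 dispatch to the cfl_subtract_average_*_avx2 wrappers declared
// just above via CFL_SUB_AVG_X.)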
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { cfl_subtract_average_4x4_sse2, /* 4x4 */ cfl_subtract_average_8x8_sse2, /* 8x8 */ cfl_subtract_average_16x16_avx2, /* 16x16 */ cfl_subtract_average_32x32_avx2, /* 32x32 */ NULL, /* 64x64 (invalid CFL size) */ cfl_subtract_average_4x8_sse2, /* 4x8 */ cfl_subtract_average_8x4_sse2, /* 8x4 */ cfl_subtract_average_8x16_sse2, /* 8x16 */ cfl_subtract_average_16x8_avx2, /* 16x8 */ cfl_subtract_average_16x32_avx2, /* 16x32 */ cfl_subtract_average_32x16_avx2, /* 32x16 */ NULL, /* 32x64 (invalid CFL size) */ NULL, /* 64x32 (invalid CFL size) */ cfl_subtract_average_4x16_sse2, /* 4x16 */ cfl_subtract_average_16x4_avx2, /* 16x4 */ cfl_subtract_average_8x32_sse2, /* 8x32 */ cfl_subtract_average_32x8_avx2, /* 32x8 */ NULL, /* 16x64 (invalid CFL size) */ NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the function pointer array out of bounds. return sub_avg[tx_size % TX_SIZES_ALL]; } aom-3.12.1/av1/common/x86/cfl_simd.h000066400000000000000000000347651477627663500167550ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ #define AOM_AV1_COMMON_X86_CFL_SIMD_H_ #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3); // SSSE3 version is 
optimal for width == 8, we reuse it in AVX2
void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 16, we reuse it in AVX2
void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 4, we reuse them in AVX2
void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 16, we reuse it in AVX2
void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, int input_stride, uint16_t *output_q3);
#if CONFIG_AV1_HIGHBITDEPTH
void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is optimal for width == 8, we reuse it in AVX2
void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
// SSSE3 version is faster for width == 16, we reuse it in AVX2
void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, int input_stride, uint16_t *output_q3);
#endif  // CONFIG_AV1_HIGHBITDEPTH
// SSE2 version is optimal for width == 4, we reuse them in AVX2
void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
// SSE2 version is optimal for width == 8, we reuse them in AVX2
void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3);
void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3);
void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3);
void cfl_predict_lbd_8x4_ssse3(const int16_t
*pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3); #if CONFIG_AV1_HIGHBITDEPTH void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd); #endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ aom-3.12.1/av1/common/x86/cfl_sse2.c000066400000000000000000000067651477627663500166670ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "av1/common/cfl.h" #include "config/av1_rtcd.h" static inline __m128i fill_sum_epi32(__m128i l0) { l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); } static inline void subtract_average_sse2(const uint16_t *src_ptr, int16_t *dst_ptr, int width, int height, int round_offset, int num_pel_log2) { const __m128i zeros = _mm_setzero_si128(); const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); const __m128i *src = (__m128i *)src_ptr; const __m128i *const end = src + height * CFL_BUF_LINE_I128; const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); __m128i sum = zeros; do { __m128i l0; if (width == 4) { l0 = _mm_add_epi16(_mm_loadl_epi64(src), _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), _mm_unpacklo_epi16(l1, zeros))); } else { if (width == 8) { l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + CFL_BUF_LINE_I128)); } else { l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); } sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), _mm_unpackhi_epi16(l0, zeros))); if (width == 32) { l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), _mm_unpackhi_epi16(l0, zeros))); } } src += step; } while (src < end); sum = fill_sum_epi32(sum); __m128i avg_epi16 = _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); src = (__m128i *)src_ptr; __m128i *dst = (__m128i *)dst_ptr; do { if (width == 4) { _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); } else { _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); if (width > 8) { _mm_storeu_si128(dst + 1, _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); if (width == 32) { _mm_storeu_si128(dst + 2, _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); _mm_storeu_si128(dst + 3, _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); } } } src += CFL_BUF_LINE_I128; dst += CFL_BUF_LINE_I128; } while (src < end); } CFL_SUB_AVG_FN(sse2) aom-3.12.1/av1/common/x86/cfl_ssse3.c000066400000000000000000000411321477627663500170360ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "av1/common/cfl.h" #include "av1/common/x86/cfl_simd.h" // Load 32-bit integer from memory into the first element of dst. static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) { return _mm_cvtsi32_si128(*((int *)mem_addr)); } // Store 32-bit integer from the first element of a into memory. static inline void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { *((int *)mem_addr) = _mm_cvtsi128_si32(a); } /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. 
Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ static inline void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const __m128i twos = _mm_set1_epi8(2); __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; const int luma_stride = input_stride << 1; do { if (width == 4) { __m128i top = _mm_loadh_epi32((__m128i *)input); top = _mm_maddubs_epi16(top, twos); __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); bot = _mm_maddubs_epi16(bot, twos); const __m128i sum = _mm_add_epi16(top, bot); _mm_storeh_epi32(pred_buf_m128i, sum); } else if (width == 8) { __m128i top = _mm_loadl_epi64((__m128i *)input); top = _mm_maddubs_epi16(top, twos); __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); bot = _mm_maddubs_epi16(bot, twos); const __m128i sum = _mm_add_epi16(top, bot); _mm_storel_epi64(pred_buf_m128i, sum); } else { __m128i top = _mm_loadu_si128((__m128i *)input); top = _mm_maddubs_epi16(top, twos); __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); bot = _mm_maddubs_epi16(bot, twos); const __m128i sum = _mm_add_epi16(top, bot); _mm_storeu_si128(pred_buf_m128i, sum); if (width == 32) { __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); __m128i bot_1 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); top_1 = _mm_maddubs_epi16(top_1, twos); bot_1 = _mm_maddubs_epi16(bot_1, twos); __m128i sum_1 = _mm_add_epi16(top_1, bot_1); _mm_storeu_si128(pred_buf_m128i + 1, sum_1); } } input += luma_stride; pred_buf_m128i += CFL_BUF_LINE_I128; } while (pred_buf_m128i < end); } /** * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more * precise version of a box filter 4:2:2 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ static inline void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const __m128i fours = _mm_set1_epi8(4); __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; do { if (width == 4) { __m128i top = _mm_loadh_epi32((__m128i *)input); top = _mm_maddubs_epi16(top, fours); _mm_storeh_epi32(pred_buf_m128i, top); } else if (width == 8) { __m128i top = _mm_loadl_epi64((__m128i *)input); top = _mm_maddubs_epi16(top, fours); _mm_storel_epi64(pred_buf_m128i, top); } else { __m128i top = _mm_loadu_si128((__m128i *)input); top = _mm_maddubs_epi16(top, fours); _mm_storeu_si128(pred_buf_m128i, top); if (width == 32) { __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); top_1 = _mm_maddubs_epi16(top_1, fours); _mm_storeu_si128(pred_buf_m128i + 1, top_1); } } input += input_stride; pred_buf_m128i += CFL_BUF_LINE_I128; } while (pred_buf_m128i < end); } /** * Multiplies the pixels by 8 (scaling in Q3). * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. 
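 *
 * As a rough scalar sketch (illustrative only, not part of the SIMD path),
 * this 4:4:4 subsampler is equivalent to:
 *
 *   for (int j = 0; j < height; j++)
 *     for (int i = 0; i < width; i++)
 *       pred_buf_q3[j * CFL_BUF_LINE + i] = input[j * input_stride + i] << 3;
 *
 * i.e. every luma sample is promoted to Q3 by multiplying by 8.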
* * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. */ static inline void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const __m128i zeros = _mm_setzero_si128(); const int luma_stride = input_stride; __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; do { if (width == 4) { __m128i row = _mm_loadh_epi32((__m128i *)input); row = _mm_unpacklo_epi8(row, zeros); _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); } else if (width == 8) { __m128i row = _mm_loadl_epi64((__m128i *)input); row = _mm_unpacklo_epi8(row, zeros); _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); } else { __m128i row = _mm_loadu_si128((__m128i *)input); const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); if (width == 32) { __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); } } input += luma_stride; pred_buf_m128i += CFL_BUF_LINE_I128; } while (pred_buf_m128i < end); } #if CONFIG_AV1_HIGHBITDEPTH /** * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more * precise version of a box filter 4:2:0 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. 
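 *
 * Scalar sketch (illustrative only): with top/bot denoting the two luma rows
 * that feed one chroma row,
 *
 *   pred_buf_q3[i] = (top[2 * i] + top[2 * i + 1] +
 *                     bot[2 * i] + bot[2 * i + 1]) * 2;
 *
 * i.e. the 2x2 average expressed in Q3, since avg * 8 == sum * 2.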
*/ static inline void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; const int luma_stride = input_stride << 1; do { if (width == 4) { const __m128i top = _mm_loadl_epi64((__m128i *)input); const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); __m128i sum = _mm_add_epi16(top, bot); sum = _mm_hadd_epi16(sum, sum); *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); } else { const __m128i top = _mm_loadu_si128((__m128i *)input); const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); __m128i sum = _mm_add_epi16(top, bot); if (width == 8) { sum = _mm_hadd_epi16(sum, sum); _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); } else { const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); const __m128i bot_1 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); if (width == 32) { const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); const __m128i bot_2 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); const __m128i bot_3 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, _mm_add_epi16(next_sum, next_sum)); } } } input += luma_stride; } while ((pred_buf_q3 += CFL_BUF_LINE) < end); } /** * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more * precise version of a box filter 4:2:2 pixel subsampling in Q3. * * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the * active area is specified using width and height. * * Note: We don't need to worry about going over the active area, as long as we * stay inside the CfL prediction buffer. 
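 *
 * Scalar sketch (illustrative only):
 *
 *   pred_buf_q3[i] = (input[2 * i] + input[2 * i + 1]) * 4;
 *
 * i.e. the horizontal pair average expressed in Q3, since avg * 8 == sum * 4.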
*/ static inline void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; do { if (width == 4) { const __m128i top = _mm_loadl_epi64((__m128i *)input); const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); _mm_storeh_epi32(pred_buf_m128i, sum); } else { const __m128i top = _mm_loadu_si128((__m128i *)input); if (width == 8) { const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); _mm_storel_epi64(pred_buf_m128i, sum); } else { const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); _mm_storeu_si128(pred_buf_m128i, sum); if (width == 32) { const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); _mm_storeu_si128(pred_buf_m128i + 1, sum_1); } } } pred_buf_m128i += CFL_BUF_LINE_I128; input += input_stride; } while (pred_buf_m128i < end); } static inline void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, int input_stride, uint16_t *pred_buf_q3, int width, int height) { const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; do { if (width == 4) { const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); _mm_storel_epi64((__m128i *)pred_buf_q3, row); } else { const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); _mm_storeu_si128((__m128i *)pred_buf_q3, row); if (width >= 16) { __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); row_1 = _mm_slli_epi16(row_1, 3); _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); if (width == 32) { __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); row_2 = _mm_slli_epi16(row_2, 3); _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); row_3 = _mm_slli_epi16(row_3, 3); _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); } } } input += input_stride; pred_buf_q3 += CFL_BUF_LINE; } while (pred_buf_q3 < end); } #endif // CONFIG_AV1_HIGHBITDEPTH CFL_GET_SUBSAMPLE_FUNCTION(ssse3) static inline __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, __m128i alpha_sign, __m128i dc_q0) { __m128i ac_q3 = _mm_loadu_si128(input); __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); return _mm_add_epi16(scaled_luma_q0, dc_q0); } static inline void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, int alpha_q3, int width, int height) { const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); const __m128i dc_q0 = _mm_set1_epi16(*dst); __m128i *row = (__m128i *)pred_buf_q3; const __m128i *row_end = row + height * CFL_BUF_LINE_I128; do { __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); if (width < 16) { res = _mm_packus_epi16(res, res); if (width == 4) _mm_storeh_epi32((__m128i *)dst, res); else _mm_storel_epi64((__m128i *)dst, res); } else { __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); res = _mm_packus_epi16(res, next); _mm_storeu_si128((__m128i *)dst, res); if (width == 32) { res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); next = 
predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); res = _mm_packus_epi16(res, next); _mm_storeu_si128((__m128i *)(dst + 16), res); } } dst += dst_stride; } while ((row += CFL_BUF_LINE_I128) < row_end); } CFL_PREDICT_FN(ssse3, lbd) #if CONFIG_AV1_HIGHBITDEPTH static inline __m128i highbd_max_epi16(int bd) { const __m128i neg_one = _mm_set1_epi16(-1); // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); } static inline __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { return _mm_max_epi16(_mm_min_epi16(u, max), zero); } static inline void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, int bd, int width, int height) { const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); const __m128i dc_q0 = _mm_set1_epi16(*dst); const __m128i max = highbd_max_epi16(bd); const __m128i zeros = _mm_setzero_si128(); __m128i *row = (__m128i *)pred_buf_q3; const __m128i *row_end = row + height * CFL_BUF_LINE_I128; do { __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); res = highbd_clamp_epi16(res, zeros, max); if (width == 4) { _mm_storel_epi64((__m128i *)dst, res); } else { _mm_storeu_si128((__m128i *)dst, res); } if (width >= 16) { const __m128i res_1 = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128(((__m128i *)dst) + 1, highbd_clamp_epi16(res_1, zeros, max)); } if (width == 32) { const __m128i res_2 = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128((__m128i *)(dst + 16), highbd_clamp_epi16(res_2, zeros, max)); const __m128i res_3 = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); _mm_storeu_si128((__m128i *)(dst + 24), highbd_clamp_epi16(res_3, zeros, max)); } dst += dst_stride; } while ((row += CFL_BUF_LINE_I128) < row_end); } CFL_PREDICT_FN(ssse3, hbd) #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/convolve_2d_avx2.c000066400000000000000000000145651477627663500203440ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #if CONFIG_SVT_AV1 #include "third_party/SVT-AV1/convolve_2d_avx2.h" #endif #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/synonyms.h" #include "av1/common/convolve.h" static void convolve_2d_sr_general_avx2( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (filter_params_x->taps > 8) { const int bd = 8; int im_stride = 8, i; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(conv_params->round_0 > 0); const __m256i round_const_h12 = _mm256_set1_epi32( ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0); const __m256i sum_round_v = _mm256_set1_epi32( (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); const __m256i round_const_v = _mm256_set1_epi32( ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift_v = _mm_cvtsi32_si128(bits); __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 }; int horiz_tap = 12; int vert_tap = 12; prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h); prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v); int im_h = h + vert_tap - 1; const int fo_vert = vert_tap / 2 - 1; const int fo_horiz = horiz_tap / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; for (int j = 0; j < w; j += 8) { CONVOLVE_SR_HORIZONTAL_FILTER_12TAP CONVOLVE_SR_VERTICAL_FILTER_12TAP } } else { const int bd = 8; int im_stride = 8, i; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(conv_params->round_0 > 0); const __m256i round_const_h = _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); const __m256i sum_round_v = _mm256_set1_epi32( (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); const __m256i round_const_v = _mm256_set1_epi32( ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift_v = _mm_cvtsi32_si128(bits); __m256i filt[4], coeffs_h[4], coeffs_v[4]; prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); if (horiz_tap == 6) prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h); else prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); if (vert_tap == 6) prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); else prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); int im_h = h + vert_tap - 1; const int fo_vert = vert_tap / 2 - 1; const int fo_horiz = horiz_tap / 2 - 1; const 
uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); for (int j = 0; j < w; j += 8) { if (horiz_tap == 4) { CONVOLVE_SR_HORIZONTAL_FILTER_4TAP } else if (horiz_tap == 6) { CONVOLVE_SR_HORIZONTAL_FILTER_6TAP } else { CONVOLVE_SR_HORIZONTAL_FILTER_8TAP } if (vert_tap == 4) { CONVOLVE_SR_VERTICAL_FILTER_4TAP } else if (vert_tap == 6) { CONVOLVE_SR_VERTICAL_FILTER_6TAP } else { CONVOLVE_SR_VERTICAL_FILTER_8TAP } } } } void av1_convolve_2d_sr_avx2( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn, const int32_t subpel_y_qn, ConvolveParams *conv_params) { #if CONFIG_SVT_AV1 const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn); const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn); const bool use_general = (tap_x == 12 || tap_y == 12); if (use_general) { convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } else { av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } #else convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); #endif } aom-3.12.1/av1/common/x86/convolve_2d_sse2.c000066400000000000000000000563771477627663500203470ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "av1/common/convolve.h" static void convolve_2d_sr_12tap_sse2( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); int im_h = h + filter_params_y->taps - 1; int im_stride = w; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(conv_params->round_0 > 0); __m128i coeffs[6]; /* Horizontal filter */ { prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); const __m128i round_const = _mm_set1_epi32( (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); for (i = 0; i < im_h; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i data_2 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]); // Filter even-index pixels const __m128i src_0 = _mm_unpacklo_epi8(data, zero); const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero); const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero); const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); const __m128i src_8 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero); const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); const __m128i src_10 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero); const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); // Filter odd-index pixels const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero); const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero); const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); const __m128i src_9 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero); const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); const __m128i src_11 = _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero); const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]); const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); __m128i res_odd = 
_mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); } } } /* Vertical filter */ { prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); const __m128i sum_round = _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m128i round_const = _mm_set1_epi32( ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift = _mm_cvtsi32_si128(bits); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const int16_t *data = &im_block[i * im_stride + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i src_8 = _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride), *(__m128i *)(data + 9 * im_stride)); const __m128i src_10 = _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride), *(__m128i *)(data + 11 * im_stride)); const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i src_9 = _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride), *(__m128i *)(data + 9 * im_stride)); const __m128i src_11 = _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride), *(__m128i *)(data + 11 * im_stride)); const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]); const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); __m128i res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), round_shift); const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); const __m128i res = _mm_packus_epi16(res16, res16); // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storel_epi64(p, res); } } } } void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { if (filter_params_x->taps > 8) { if (w < 8) { av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } else { convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); } } else { const int bd = 8; DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); int im_h = h + filter_params_y->taps - 1; int im_stride = MAX_SB_SIZE; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(conv_params->round_0 > 0); /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); for (i = 0; i < im_h; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); // Filter even-index pixels const __m128i src_0 = _mm_unpacklo_epi8(data, zero); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, 
res_6)); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); // Filter odd-index pixels const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); } } } /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i sum_round = _mm_set1_epi32( (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m128i round_const = _mm_set1_epi32( ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - ((1 << (offset_bits - conv_params->round_1)) >> 1)); const __m128i round_shift = _mm_cvtsi32_si128(bits); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const int16_t *data = &im_block[i * im_stride + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); 
const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); __m128i res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), round_shift); const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); const __m128i res = _mm_packus_epi16(res16, res16); // Accumulate values into the destination buffer __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; if (w == 2) { *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); } else if (w == 4) { *(int *)p = _mm_cvtsi128_si32(res); } else { _mm_storel_epi64(p, res); } } } } } } void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m128i zero = _mm_setzero_si128(); const __m128i left_shift = _mm_cvtsi32_si128(bits); int i, j; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi16(w0); const __m128i wt1 = _mm_set1_epi16(w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); assert((w % 4) == 0); if (!(w % 16)) { for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const); if (do_average) { const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j])); const __m128i data_ref_0_hi = _mm_loadu_si128((__m128i *)(&dst[j + 8])); const __m128i comp_avg_res_lo = comp_avg( &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); const __m128i round_result_lo = convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m128i comp_avg_res_hi = comp_avg( &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); const __m128i round_result_hi = convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result_lo, round_result_hi); _mm_store_si128((__m128i *)(&dst0[j]), res_8); } else { _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); 
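// No averaging on this pass: the offset-biased intermediate values stay in
// the CONV_BUF destination; the upper 8 pixels of this 16-wide block are
// stored next.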
_mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); } } src += src_stride; dst += dst_stride; dst0 += dst_stride0; } } else { for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); const __m128i res = _mm_sll_epi16(d16_0, left_shift); const __m128i res_unsigned = _mm_add_epi16(res, offset_const); if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); if (w > 4) _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); else *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); } else { _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); } } src += src_stride; dst += dst_stride; dst0 += dst_stride0; } } } aom-3.12.1/av1/common/x86/convolve_avx2.c000066400000000000000000001154371477627663500177570ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #if CONFIG_SVT_AV1 #include "third_party/SVT-AV1/convolve_avx2.h" #endif #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/synonyms.h" static inline void av1_convolve_y_sr_general_avx2( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { // right shift is F-1 because we are already dividing // filter co-efficients by 2 const int right_shift_bits = (FILTER_BITS - 1); __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); __m256i coeffs[6], s[12]; __m128i d[10]; int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); if (vert_tap == 6) prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); else if (vert_tap == 12) { prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); } else { prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); } // vert_filt as 4 tap if (vert_tap == 4) { const int fo_vert = 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); // Load lines a and b. 
Line a to lower 128, line b to upper 128 const __m256i src_01a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); const __m256i src_12a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); const __m256i src_23a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); const __m256i src_34a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); const __m256i src_45a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); const __m256i src_56a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); /* rounding code */ // shift by F - 1 const __m256i res_16b_lo = _mm256_sra_epi16( _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); if (w - j > 8) { const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); /* rounding code */ // shift by F - 1 const __m256i res_16b_hi = _mm256_sra_epi16( _mm256_add_epi16(res_hi, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); const __m128i res_0 = _mm256_castsi256_si128(res_a); const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else { const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); if (w - j > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else if (w - j > 2) { xx_storel_32(&dst[i * dst_stride + j], res_0); xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); } } s[0] = s[1]; s[1] = s[2]; s[3] = s[4]; s[4] = s[5]; } } } else if (vert_tap == 6) { const int fo_vert = vert_tap / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); // Load lines a and b. 
Line a to lower 128, line b to upper 128 const __m256i src_01a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); const __m256i src_12a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); const __m256i src_23a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 4 * src_stride))); const __m256i src_34a = _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20); s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; const __m256i src_45a = _mm256_permute2x128_si256( src6, _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); const __m256i src_56a = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), src6, 0x20); s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); const __m256i res_lo = convolve_lowbd_6tap(s, coeffs); /* rounding code */ // shift by F - 1 const __m256i res_16b_lo = _mm256_sra_epi16( _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); if (w - j > 8) { const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs); /* rounding code */ // shift by F - 1 const __m256i res_16b_hi = _mm256_sra_epi16( _mm256_add_epi16(res_hi, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); const __m128i res_0 = _mm256_castsi256_si128(res_a); const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else { const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); if (w - j > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else if (w - j > 2) { xx_storel_32(&dst[i * dst_stride + j], res_0); xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); } } s[0] = s[1]; s[1] = s[2]; s[3] = s[4]; s[4] = s[5]; } } } else if (vert_tap == 12) { // vert_tap == 12 const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; const __m256i v_zero = _mm256_setzero_si256(); right_shift = _mm_cvtsi32_si128(FILTER_BITS); right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); for (int j = 0; j < w; j += 8) { const uint8_t *data = &src_ptr[j]; __m256i src10; d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); d[3] = 
_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); // Load lines a and b. Line a to lower 128, line b to upper 128 const __m256i src_01a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); const __m256i src_12a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); const __m256i src_23a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); const __m256i src_34a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); const __m256i src_45a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); const __m256i src_56a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); const __m256i src_67a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); const __m256i src_78a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); const __m256i src_89a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); src10 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); const __m256i src_910a = _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero); const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); s[0] = _mm256_unpacklo_epi16(src_01, src_12); s[1] = _mm256_unpacklo_epi16(src_23, src_34); s[2] = _mm256_unpacklo_epi16(src_45, src_56); s[3] = _mm256_unpacklo_epi16(src_67, src_78); s[4] = _mm256_unpacklo_epi16(src_89, src_910); s[6] = _mm256_unpackhi_epi16(src_01, src_12); s[7] = _mm256_unpackhi_epi16(src_23, src_34); s[8] = _mm256_unpackhi_epi16(src_45, src_56); s[9] = _mm256_unpackhi_epi16(src_67, src_78); s[10] = _mm256_unpackhi_epi16(src_89, src_910); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; const __m256i src_1011a = _mm256_permute2x128_si256( src10, _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), 0x20); src10 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); const __m256i src_1112a = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), src10, 0x20); const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); const __m256i res_lo = convolve_12taps(s, coeffs); const __m256i 
res_32b_lo = _mm256_sra_epi32( _mm256_add_epi32(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); if (w - j > 4) { const __m256i res_hi = convolve_12taps(s + 6, coeffs); const __m256i res_32b_hi = _mm256_sra_epi32( _mm256_add_epi32(res_hi, right_shift_const), right_shift); __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); // 8 bit conversion and saturation to uint8 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else { const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); if (w - j > 2) { *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); *(int *)&dst[i * dst_stride + j + dst_stride] = _mm_cvtsi128_si32(res_1); } else { *(uint16_t *)&dst[i * dst_stride + j] = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)&dst[i * dst_stride + j + dst_stride] = (uint16_t)_mm_cvtsi128_si32(res_1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[3] = s[4]; s[4] = s[5]; s[6] = s[7]; s[7] = s[8]; s[8] = s[9]; s[9] = s[10]; s[10] = s[11]; } } } else { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; for (int j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); // Load lines a and b. 
Line a to lower 128, line b to upper 128 const __m256i src_01a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); const __m256i src_12a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); const __m256i src_23a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); const __m256i src_34a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); const __m256i src_45a = _mm256_permute2x128_si256( _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); const __m256i src_56a = _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; const __m256i src_67a = _mm256_permute2x128_si256( src6, _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); const __m256i src_78a = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), src6, 0x20); s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); const __m256i res_lo = convolve_lowbd(s, coeffs); /* rounding code */ // shift by F - 1 const __m256i res_16b_lo = _mm256_sra_epi16( _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); if (w - j > 8) { const __m256i res_hi = convolve_lowbd(s + 4, coeffs); /* rounding code */ // shift by F - 1 const __m256i res_16b_hi = _mm256_sra_epi16( _mm256_add_epi16(res_hi, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); const __m128i res_0 = _mm256_castsi256_si128(res_a); const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else { const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); if (w - j > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); } else if (w - j > 2) { xx_storel_32(&dst[i * dst_stride + j], res_0); xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t h, const InterpFilterParams *filter_params_y, const int32_t subpel_y_qn) { #if CONFIG_SVT_AV1 const int vert_tap = 
get_filter_tap(filter_params_y, subpel_y_qn); if (vert_tap == 12) { av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } else { av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } #else av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); #endif } static inline void av1_convolve_x_sr_general_avx2( const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_shift = _mm_cvtsi32_si128(bits); __m256i round_0_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); assert(conv_params->round_0 > 0); __m256i coeffs[6], filt[4]; filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); if (horiz_tap == 6) prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); else if (horiz_tap == 12) { prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); } else { prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); } // horz_filt as 4 tap if (horiz_tap == 4) { const int fo_horiz = 1; const uint8_t *const src_ptr = src - fo_horiz; if (w <= 8) { for (i = 0; i < h; i += 2) { const __m256i data = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 0x20); __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); const __m128i res_0 = _mm256_castsi256_si128(res_8b); const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); if (w > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); } else if (w > 2) { xx_storel_32(&dst[i * dst_stride], res_0); xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); } } } else { for (i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 // 18 19 20 21 22 23 const __m256i data = _mm256_inserti128_si256( _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 1); __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit 
conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); // Store values into the destination buffer // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 res_8b = _mm256_permute4x64_epi64(res_8b, 216); __m128i res = _mm256_castsi256_si128(res_8b); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); } } } } else if (horiz_tap == 6) { const int fo_horiz = horiz_tap / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); if (w <= 8) { for (i = 0; i < h; i += 2) { const __m256i data = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 0x20); __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); const __m128i res_0 = _mm256_castsi256_si128(res_8b); const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); if (w > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); } else if (w > 2) { xx_storel_32(&dst[i * dst_stride], res_0); xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); } } } else { for (i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 // 18 19 20 21 22 23 const __m256i data = _mm256_inserti128_si256( _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 1); __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); // Store values into the destination buffer // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 res_8b = _mm256_permute4x64_epi64(res_8b, 216); __m128i res = _mm256_castsi256_si128(res_8b); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); } } } } else if (horiz_tap == 12) { // horiz_tap == 12 const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; const __m256i v_zero = _mm256_setzero_si256(); round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); round_const = _mm256_set1_epi32((1 << bits) >> 1); round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); __m256i s[6]; if (w <= 4) { for (i = 0; i < h; i += 2) { const __m256i data = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 0x20); // row0 0..7 row1 0..7 const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); // row0 8..F row1 8..F const __m256i s_16hi = 
_mm256_unpackhi_epi8(data, v_zero); // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); // row0 04 04 .. 07 07 row1 04 04 .. 07 07 const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); const __m256i res_lo = convolve_12taps(s, coeffs); __m256i res_32b_lo = _mm256_sra_epi32( _mm256_add_epi32(res_lo, round_0_const), round_0_shift); // 00 01 02 03 10 12 13 14 res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), round_shift); // 8 bit conversion and saturation to uint8 // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); if (w > 2) { // 00 01 02 03 *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); // 10 11 12 13 *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); } else { // 00 01 *(uint16_t *)&dst[i * dst_stride] = (uint16_t)_mm_cvtsi128_si32(res_0); // 10 11 *(uint16_t *)&dst[i * dst_stride + dst_stride] = (uint16_t)_mm_cvtsi128_si32(res_1); } } } else { for (i = 0; i < h; i++) { for (int j = 0; j < w; j += 8) { const __m256i data = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&src_ptr[i * src_stride + j + 4]))), 0x20); // row0 0..7 4..B const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); // row0 8..F C..13 const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); // row0 04 04 .. 07 07 08 08 .. 0B 0B const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 
13 13 const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); const __m256i res_lo = convolve_12taps(s, coeffs); __m256i res_32b_lo = _mm256_sra_epi32( _mm256_add_epi32(res_lo, round_0_const), round_0_shift); res_32b_lo = _mm256_sra_epi32( _mm256_add_epi32(res_32b_lo, round_const), round_shift); // 8 bit conversion and saturation to uint8 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); } } } } else { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); if (w <= 8) { for (i = 0; i < h; i += 2) { const __m256i data = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 0x20); __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); const __m128i res_0 = _mm256_castsi256_si128(res_8b); const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); if (w > 4) { _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); } else if (w > 2) { xx_storel_32(&dst[i * dst_stride], res_0); xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); } else { __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); } } } else { for (i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 // 18 19 20 21 22 23 const __m256i data = _mm256_inserti128_si256( _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 1); __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), round_0_shift); res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); /* rounding code */ // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); // Store values into the destination buffer // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 res_8b = _mm256_permute4x64_epi64(res_8b, 216); __m128i res = _mm256_castsi256_si128(res_8b); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); } } } } } void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t w, int32_t 
h, const InterpFilterParams *filter_params_x, const int32_t subpel_x_qn, ConvolveParams *conv_params) { #if CONFIG_SVT_AV1 const int horz_tap = get_filter_tap(filter_params_x, subpel_x_qn); if (horz_tap == 12) { av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } #else av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); #endif } aom-3.12.1/av1/common/x86/convolve_sse2.c000066400000000000000000000464671477627663500177610ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/synonyms.h" #include "av1/common/convolve.h" static inline void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 } static inline __m128i convolve(const __m128i *const s, const __m128i *const coeffs) { const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); return d; } static inline __m128i convolve_lo_x(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); return convolve(ss, coeffs); } static inline __m128i convolve_lo_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); return convolve(ss, coeffs); } static inline __m128i convolve_hi_y(const __m128i *const s, const __m128i *const coeffs) { __m128i ss[4]; ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); ss[2] = 
_mm_unpackhi_epi8(s[4], _mm_setzero_si128()); ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); return convolve(ss, coeffs); } static void convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, int subpel_y_qn) { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); __m128i coeffs[6]; prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); int j = 0; do { __m128i s[12], src10, res_lo, res_hi; __m128i res_lo_round, res_hi_round, res16, res; const uint8_t *data = &src_ptr[j]; src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)); s[0] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); s[1] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); s[2] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); s[3] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); s[4] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); s[5] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 6 * src_stride))); s[6] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); s[7] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 8 * src_stride))); s[8] = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 9 * src_stride))); s[9] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10); int i = 0; do { data = &src_ptr[i * src_stride + j]; s[10] = _mm_unpacklo_epi8( src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))); src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)); s[11] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10); res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); res16 = _mm_packs_epi32(res_lo_round, res_hi_round); res = _mm_packus_epi16(res16, res16); _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); i++; res_lo = convolve_lo_y_12tap(s + 1, coeffs); // Filter low index pixels res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); res16 = _mm_packs_epi32(res_lo_round, res_hi_round); res = _mm_packus_epi16(res16, res16); _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); i++; s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[3] = s[5]; s[4] = s[6]; s[5] = s[7]; s[6] = s[8]; s[7] = s[9]; s[8] = s[10]; s[9] = s[11]; } while (i < h); j += 8; } while (j < w); } void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t 
*dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { if (filter_params_y->taps > 8) { if (w < 8) { av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } else { convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, filter_params_y, subpel_y_qn); } } else { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); __m128i coeffs[4]; prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w <= 4) { __m128i s[8], src6, res, res_round, res16; int res_int; s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), xx_loadl_32(src_ptr + 1 * src_stride)); s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), xx_loadl_32(src_ptr + 2 * src_stride)); s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), xx_loadl_32(src_ptr + 3 * src_stride)); s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), xx_loadl_32(src_ptr + 4 * src_stride)); s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), xx_loadl_32(src_ptr + 5 * src_stride)); src6 = xx_loadl_32(src_ptr + 6 * src_stride); s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); src6 = xx_loadl_32(src_ptr + 8 * src_stride); s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); res16 = _mm_packs_epi32(res_round, res_round); res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) *(uint16_t *)dst = (uint16_t)res_int; else *(int *)dst = res_int; src_ptr += src_stride; dst += dst_stride; res = convolve_lo_y(s + 1, coeffs); res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); res16 = _mm_packs_epi32(res_round, res_round); res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); if (w == 2) *(uint16_t *)dst = (uint16_t)res_int; else *(int *)dst = res_int; src_ptr += src_stride; dst += dst_stride; s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[3] = s[5]; s[4] = s[6]; s[5] = s[7]; h -= 2; } while (h); } else { assert(!(w % 8)); int j = 0; do { __m128i s[8], src6, res_lo, res_hi; __m128i res_lo_round, res_hi_round, res16, res; const uint8_t *data = &src_ptr[j]; src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); s[0] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); s[1] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); s[2] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); s[3] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); s[4] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); s[5] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); int i = 0; do { data = &src_ptr[i * src_stride + j]; s[6] = _mm_unpacklo_epi8( src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); s[7] = _mm_unpacklo_epi8( 
_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); res16 = _mm_packs_epi32(res_lo_round, res_hi_round); res = _mm_packus_epi16(res16, res16); _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); i++; res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); res16 = _mm_packs_epi32(res_lo_round, res_hi_round); res = _mm_packus_epi16(res16, res16); _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); i++; s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[3] = s[5]; s[4] = s[6]; s[5] = s[7]; } while (i < h); j += 8; } while (j < w); } } } static void convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, int subpel_x_qn, ConvolveParams *conv_params) { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_0_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); const __m128i round_shift = _mm_cvtsi32_si128(bits); const __m128i zero = _mm_setzero_si128(); __m128i coeffs[6]; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); int i = 0; do { int j = 0; do { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); __m128i s[4]; s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1)); s[1] = _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); s[2] = _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); s[3] = _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero); __m128i res32_round = _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift); res32_round = _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift); const __m128i res16 = _mm_packs_epi32(res32_round, zero); const __m128i res = _mm_packus_epi16(res16, zero); const int val = _mm_cvtsi128_si32(res); memcpy((dst + i * dst_stride + j), &val, sizeof(val)); j += 4; } while (j < w); } while (++i < h); } void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { if (filter_params_x->taps > 8) { if (w < 4) { av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } else { convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, filter_params_x, subpel_x_qn, conv_params); } } else { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; const __m128i round_0_const = _mm_set1_epi32((1 << 
conv_params->round_0) >> 1); const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); const __m128i round_shift = _mm_cvtsi32_si128(bits); __m128i coeffs[4]; assert(bits >= 0); assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w <= 4) { do { const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); __m128i s[4]; s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); s[1] = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); s[2] = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); s[3] = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); const __m128i res_lo = convolve_lo_x(s, coeffs); __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); const __m128i res = _mm_packus_epi16(res16, res16); int r = _mm_cvtsi128_si32(res); if (w == 2) *(uint16_t *)dst = (uint16_t)r; else *(int *)dst = r; src_ptr += src_stride; dst += dst_stride; } while (--h); } else { assert(!(w % 8)); int i = 0; do { int j = 0; do { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); __m128i s[4]; // Filter even-index pixels s[0] = data; s[1] = _mm_srli_si128(data, 2); s[2] = _mm_srli_si128(data, 4); s[3] = _mm_srli_si128(data, 6); const __m128i res_even = convolve_lo_x(s, coeffs); // Filter odd-index pixels s[0] = _mm_srli_si128(data, 1); s[1] = _mm_srli_si128(data, 3); s[2] = _mm_srli_si128(data, 5); s[3] = _mm_srli_si128(data, 7); const __m128i res_odd = convolve_lo_x(s, coeffs); // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); __m128i res_lo_round = _mm_sra_epi32( _mm_add_epi32(res_lo, round_0_const), round_0_shift); res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); __m128i res_hi_round = _mm_sra_epi32( _mm_add_epi32(res_hi, round_0_const), round_0_shift); res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), round_shift); const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); const __m128i res = _mm_packus_epi16(res16, res16); _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); j += 8; } while (j < w); } while (++i < h); } } } aom-3.12.1/av1/common/x86/filterintra_sse4.c000066400000000000000000000356671477627663500204530ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" //------------------------------------------------------------------------------ // filter_intra_predictor_sse4_1 // This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which // duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. #define DUPLICATE_FIRST_HALF 0x44 // Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th // at zero to preserve the sum. static inline void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, const __m128i *pixels, const __m128i *taps_0_1, const __m128i *taps_2_3, const __m128i *taps_4_5, const __m128i *taps_6_7) { const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1); const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3); // |output_half| contains 8 partial sums. __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); __m128i output = _mm_hadd_epi16(output_half, output_half); const __m128i output_row0 = _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), /* arbitrary pack arg */ output); xx_storel_32(dst, output_row0); const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5); const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7); output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); output = _mm_hadd_epi16(output_half, output_half); const __m128i output_row1 = _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), /* arbitrary pack arg */ output); xx_storel_32(dst + stride, output_row1); } // 4xH transform sizes are given special treatment because xx_loadl_64 goes out // of bounds and every block involves the left column. This implementation // loads TL from the top row for the first block, so it is not static inline void filter_4xh(uint8_t *dest, ptrdiff_t stride, const uint8_t *const top_ptr, const uint8_t *const left_ptr, int mode, const int height) { const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); __m128i top = xx_loadl_32(top_ptr - 1); __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4); __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr)); left = _mm_slli_si128(left, 5); // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], // left[2], left[3], left[4], left[5], left[6], left[7] pixels = _mm_or_si128(left, pixels); // Duplicate first 8 bytes. pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dest += stride; // Move to y = 1. pixels = xx_loadl_32(dest); // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], // left[0], left[1], ... pixels = _mm_or_si128(left, pixels); // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last // byte is an unused value, which shall be multiplied by 0 when we apply the // filter. const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; // Insert left[-1] in front as TL and put left[0] and left[1] at the end. const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); pixels = _mm_shuffle_epi8(pixels, pixel_order1); dest += stride; // Move to y = 2. 
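// At this point each 64-bit half of |pixels| holds the seven neighbours for the y = 2/3 block: the new top-left, the four pixels produced at y = 1, and the next two left-column pixels; the trailing byte is ignored because the eighth filter tap is zero.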
filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dest += stride; // Move to y = 3. // Compute the middle 8 rows before using common code for the final 4 rows. // Because the common code below this block assumes that if (height == 16) { // This shift allows us to use pixel_order2 twice after shifting by 2 later. left = _mm_slli_si128(left, 1); pixels = xx_loadl_32(dest); // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] pixels = _mm_or_si128(left, pixels); // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The // last byte is an unused value, as above. The top-left was shifted to // position nine to keep two empty spaces after the top pixels. const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at // the end. const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); pixels = _mm_shuffle_epi8(pixels, pixel_order2); dest += stride; // Move to y = 4. // First 4x2 in the if body. filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); // Clear all but final pixel in the first 8 of left column. __m128i keep_top_left = _mm_srli_si128(left, 13); dest += stride; // Move to y = 5. pixels = xx_loadl_32(dest); left = _mm_srli_si128(left, 2); // Relative pixels: top[0], top[1], top[2], top[3], left[-6], // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] pixels = _mm_or_si128(left, pixels); left = xx_loadl_64(left_ptr + 8); pixels = _mm_shuffle_epi8(pixels, pixel_order2); dest += stride; // Move to y = 6. // Second 4x2 in the if body. filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); // Position TL value so we can use pixel_order1. keep_top_left = _mm_slli_si128(keep_top_left, 6); dest += stride; // Move to y = 7. pixels = xx_loadl_32(dest); left = _mm_slli_si128(left, 7); left = _mm_or_si128(left, keep_top_left); // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, // left[-1], left[0], left[1], left[2], left[3], ... pixels = _mm_or_si128(left, pixels); pixels = _mm_shuffle_epi8(pixels, pixel_order1); dest += stride; // Move to y = 8. // Third 4x2 in the if body. filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dest += stride; // Move to y = 9. // Prepare final inputs. pixels = xx_loadl_32(dest); left = _mm_srli_si128(left, 2); // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] // left[-1], left[0], left[1], left[2], left[3], ... pixels = _mm_or_si128(left, pixels); pixels = _mm_shuffle_epi8(pixels, pixel_order1); dest += stride; // Move to y = 10. // Fourth 4x2 in the if body. filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dest += stride; // Move to y = 11. } // In both the 8 and 16 case, we assume that the left vector has the next TL // at position 8. if (height > 4) { // Erase prior left pixels by shifting TL to position 0. left = _mm_srli_si128(left, 8); left = _mm_slli_si128(left, 6); pixels = xx_loadl_32(dest); // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, // left[-1], left[0], left[1], left[2], left[3], ... pixels = _mm_or_si128(left, pixels); pixels = _mm_shuffle_epi8(pixels, pixel_order1); dest += stride; // Move to y = 12 or 4. // First of final two 4x2 blocks. 
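// (For height 16 this pair covers rows 12-15; for height 8 it covers rows 4-7, matching the "y = 12 or 4" bookkeeping above.)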
filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dest += stride; // Move to y = 13 or 5. pixels = xx_loadl_32(dest); left = _mm_srli_si128(left, 2); // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] // left[-1], left[0], left[1], left[2], left[3], ... pixels = _mm_or_si128(left, pixels); pixels = _mm_shuffle_epi8(pixels, pixel_order1); dest += stride; // Move to y = 14 or 6. // Last of final two 4x2 blocks. filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); } } static inline void filter_intra_predictor_sse4_1(void *const dest, ptrdiff_t stride, const void *const top_row, const void *const left_column, int mode, const int width, const int height) { const uint8_t *const top_ptr = (const uint8_t *)top_row; const uint8_t *const left_ptr = (const uint8_t *)left_column; uint8_t *dst = (uint8_t *)dest; if (width == 4) { filter_4xh(dst, stride, top_ptr, left_ptr, mode, height); return; } // There is one set of 7 taps for each of the 4x2 output pixels. const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at // the end is an unused value, which shall be multiplied by 0 when we apply // the filter. const int64_t kCondenseLeftMask = 0x0F09080403020100; // Takes the "left section" and puts it right after p0-p4. const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last // byte is unused as above. const int64_t kInsertTopLeftMask = 0x0F0A090302010008; // Shuffles the "top left" from the left section, to the front. Used when // grabbing data from left_column and not top_row. const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); // This first pass takes care of the cases where the top left pixel comes from // top_row. __m128i pixels = xx_loadl_64(top_ptr - 1); __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8); pixels = _mm_or_si128(pixels, left); // Two sets of the same pixels to multiply with two sets of taps. pixels = _mm_shuffle_epi8(pixels, pixel_order1); filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); left = _mm_srli_si128(left, 1); // Load pixels = xx_loadl_32(dst + stride); // Because of the above shift, this OR 'invades' the final of the first 8 // bytes of |pixels|. This is acceptable because the 8th filter tap is always // a padded 0. pixels = _mm_or_si128(pixels, left); pixels = _mm_shuffle_epi8(pixels, pixel_order2); const ptrdiff_t stride2 = stride << 1; const ptrdiff_t stride4 = stride << 2; filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dst += 4; for (int x = 3; x < width - 4; x += 4) { pixels = xx_loadl_32(top_ptr + x); pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4); pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); // Duplicate bottom half into upper half. 
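// (Both output rows of the 4x2 block are filtered from this one register: taps_0_1/taps_2_3 produce the first row and taps_4_5/taps_6_7 the second, so the same eight packed neighbours are simply mirrored into both 64-bit halves.)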
pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); pixels = xx_loadl_32(dst + stride - 1); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6); // Duplicate bottom half into upper half. pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dst += 4; } // Now we handle heights that reference previous blocks rather than top_row. for (int y = 4; y < height; y += 4) { // Leftmost 4x4 block for this height. dst -= width; dst += stride4; // Top Left is not available by offset in these leftmost blocks. pixels = xx_loadl_32(dst - stride); left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8); left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12); pixels = _mm_or_si128(pixels, left); pixels = _mm_shuffle_epi8(pixels, pixel_order2); filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. left = _mm_srli_si128(left, 2); pixels = xx_loadl_32(dst + stride); pixels = _mm_or_si128(pixels, left); pixels = _mm_shuffle_epi8(pixels, pixel_order2); filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dst += 4; // Remaining 4x4 blocks for this height. for (int x = 4; x < width; x += 4) { pixels = xx_loadl_32(dst - stride - 1); pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4); pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); // Duplicate bottom half into upper half. pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); pixels = xx_loadl_32(dst + stride - 1); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6); // Duplicate bottom half into upper half. pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, &taps_6_7); dst += 4; } } } void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode) { const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh); } aom-3.12.1/av1/common/x86/highbd_convolve_2d_avx2.c000066400000000000000000000201701477627663500216360ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" void av1_highbd_convolve_2d_sr_ssse3( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd); void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { if (filter_params_x->taps == 12) { av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h, filter_params_x, filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); return; } DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; int im_stride = 8; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); __m256i s[8], coeffs_y[4], coeffs_x[4]; const __m256i round_const_x = _mm256_set1_epi32( ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const __m256i round_const_y = _mm256_set1_epi32( ((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m256i zero = _mm256_setzero_si256(); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < im_h; i += 2) { const __m256i row0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); __m256i row1 = _mm256_setzero_si256(); if (i + 1 < im_h) row1 = _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); // even pixels s[0] = _mm256_alignr_epi8(r1, r0, 0); s[1] = _mm256_alignr_epi8(r1, r0, 4); s[2] = _mm256_alignr_epi8(r1, r0, 8); s[3] = _mm256_alignr_epi8(r1, r0, 12); __m256i res_even = convolve(s, coeffs_x); res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm256_alignr_epi8(r1, r0, 2); s[1] = _mm256_alignr_epi8(r1, r0, 6); s[2] = _mm256_alignr_epi8(r1, r0, 10); s[3] = _mm256_alignr_epi8(r1, r0, 14); __m256i res_odd = convolve(s, coeffs_x); res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), round_shift_x); __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } } /* Vertical filter */ { __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); s[2] = _mm256_unpacklo_epi16(s4, s5); s[4] = _mm256_unpackhi_epi16(s0, s1); s[5] = _mm256_unpackhi_epi16(s2, s3); s[6] = _mm256_unpackhi_epi16(s4, s5); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); s[3] = _mm256_unpacklo_epi16(s6, s7); s[7] = _mm256_unpackhi_epi16(s6, s7); const __m256i res_a = convolve(s, coeffs_y); __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_y), round_shift_y); res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); if (w - j > 4) { const __m256i res_b = convolve(s + 4, coeffs_y); __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_y), round_shift_y); res_b_round = _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), round_shift_bits); __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); res_16bit = _mm256_max_epi16(res_16bit, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], _mm256_castsi256_si128(res_16bit)); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res_16bit, 1)); } else if (w == 4) { res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); res_a_round = _mm256_max_epi16(res_a_round, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], 
_mm256_castsi256_si128(res_a_round)); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res_a_round, 1)); } else { res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); res_a_round = _mm256_max_epi16(res_a_round, zero); xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res_a_round)); xx_storel_32(&dst[i * dst_stride + j + dst_stride], _mm256_extracti128_si256(res_a_round, 1)); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } aom-3.12.1/av1/common/x86/highbd_convolve_2d_sse4.c000066400000000000000000000434751477627663500216510ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" #include "av1/common/convolve.h" void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); const __m128i zero = _mm_setzero_si128(); int i, j; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi32(offset); const __m128i offset_const_16b = _mm_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); const __m128i clip_pixel_to_bd = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); assert(bits <= 4); if (!(w % 8)) { for (i = 0; i < h; i += 1) { for (j = 0; j < w; j += 8) { const __m128i src_16bit = _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); const __m128i res = _mm_sll_epi16(src_16bit, left_shift); if (do_average) { const __m128i data_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); const __m128i res_unsigned_lo = _mm_add_epi32(res_32b_lo, offset_const); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); const __m128i res_unsigned_hi = _mm_add_epi32(res_32b_hi, offset_const); const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi = highbd_convolve_rounding_sse2( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result_lo, round_result_hi); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { const __m128i res_unsigned_16b = _mm_adds_epu16(res, offset_const_16b); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned_16b); } } } } else if (!(w % 4)) { for (i = 0; i < h; i += 2) { for (j = 0; j < w; j += 4) { const __m128i src_row_0 = _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); const __m128i src_row_1 = _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); const __m128i res = _mm_sll_epi16(src_10, left_shift); if (do_average) { const __m128i data_0 = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_1 = _mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride])); const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); const __m128i res_32b = _mm_unpacklo_epi16(res, zero); const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); const __m128i res_unsigned_hi = _mm_add_epi32(res_32b_hi, offset_const); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi = highbd_convolve_rounding_sse2( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result_lo, round_result_hi); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_1 = _mm_srli_si128(res_clip, 8); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { const __m128i res_unsigned_16b = _mm_adds_epu16(res, offset_const_16b); const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); 
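// The two 4-pixel rows were packed into one 128-bit register above (row i in
// the low 64 bits, row i + 1 in the high 64 bits); res_1 shifts the upper
// half down so each row can be written with its own 64-bit store below.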
_mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_unsigned_16b); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } } } } void av1_highbd_dist_wtd_convolve_2d_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = MAX_SB_SIZE; int i, j; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); const __m128i clip_pixel_to_bd = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); for (i = 0; i < im_h; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i data2 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); // Filter even-index pixels const __m128i res_0 = _mm_madd_epi16(data, coeff_01); const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); // Filter odd-index pixels const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); const __m128i res_3 = 
_mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); } } } /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( ((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const int16_t *data = &im_block[i * im_stride + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); const __m128i res_unsigned_lo = _mm_add_epi32(res_lo_round, offset_const); if (w < 8) { if (do_average) { const __m128i data_0 = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); const __m128i comp_avg_res = highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result, round_result); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { const __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); } } else { const __m128i res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); const __m128i res_unsigned_hi = _mm_add_epi32(res_hi_round, offset_const); if (do_average) { const __m128i data_lo = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_hi = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi = highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result_lo, round_result_hi); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { const __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); } } } } } } aom-3.12.1/av1/common/x86/highbd_convolve_2d_ssse3.c000066400000000000000000000412451477627663500220240ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" #include "av1/common/convolve.h" #include "aom_dsp/x86/convolve_common_intrin.h" void av1_highbd_convolve_2d_sr_ssse3( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_h = h + filter_params_y->taps - 1; int im_stride = 8; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); const __m128i round_const_x = _mm_set1_epi32( ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const __m128i round_const_y = _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); if (filter_params_x->taps == 12) { __m128i coeffs_x[6], coeffs_y[6], s[24]; prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x); prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < im_h; i += 1) { const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i row01 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); const __m128i row02 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]); // even pixels s[0] = _mm_alignr_epi8(row01, row00, 0); s[1] = _mm_alignr_epi8(row01, row00, 4); s[2] = _mm_alignr_epi8(row01, row00, 8); s[3] = _mm_alignr_epi8(row01, row00, 12); s[4] = _mm_alignr_epi8(row02, row01, 0); s[5] = _mm_alignr_epi8(row02, row01, 4); __m128i res_even = convolve_12tap(s, coeffs_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm_alignr_epi8(row01, row00, 2); s[1] = _mm_alignr_epi8(row01, row00, 6); s[2] = _mm_alignr_epi8(row01, row00, 10); s[3] = _mm_alignr_epi8(row01, row00, 14); s[4] = _mm_alignr_epi8(row02, row01, 2); s[5] = _mm_alignr_epi8(row02, row01, 6); __m128i res_odd = convolve_12tap(s, coeffs_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); __m128i res_even1 = _mm_packs_epi32(res_even, res_even); __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); _mm_store_si128((__m128i *)&im_block[i * im_stride], res); } } /* Vertical filter */ { __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); __m128i s3 = 
_mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride)); __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride)); __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride)); __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride)); s[0] = _mm_unpacklo_epi16(s0, s1); s[1] = _mm_unpacklo_epi16(s2, s3); s[2] = _mm_unpacklo_epi16(s4, s5); s[3] = _mm_unpacklo_epi16(s6, s7); s[4] = _mm_unpacklo_epi16(s8, s9); s[6] = _mm_unpackhi_epi16(s0, s1); s[7] = _mm_unpackhi_epi16(s2, s3); s[8] = _mm_unpackhi_epi16(s4, s5); s[9] = _mm_unpackhi_epi16(s6, s7); s[10] = _mm_unpackhi_epi16(s8, s9); s[12] = _mm_unpacklo_epi16(s1, s2); s[13] = _mm_unpacklo_epi16(s3, s4); s[14] = _mm_unpacklo_epi16(s5, s6); s[15] = _mm_unpacklo_epi16(s7, s8); s[16] = _mm_unpacklo_epi16(s9, s10); s[18] = _mm_unpackhi_epi16(s1, s2); s[19] = _mm_unpackhi_epi16(s3, s4); s[20] = _mm_unpackhi_epi16(s5, s6); s[21] = _mm_unpackhi_epi16(s7, s8); s[22] = _mm_unpackhi_epi16(s9, s10); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride)); __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride)); s[5] = _mm_unpacklo_epi16(s10, s11); s[11] = _mm_unpackhi_epi16(s10, s11); s[17] = _mm_unpacklo_epi16(s11, s12); s[23] = _mm_unpackhi_epi16(s11, s12); const __m128i res_a0 = convolve_12tap(s, coeffs_y); __m128i res_a_round0 = _mm_sra_epi32( _mm_add_epi32(res_a0, round_const_y), round_shift_y); res_a_round0 = _mm_sra_epi32( _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y); __m128i res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a1, round_const_y), round_shift_y); res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); if (w - j > 4) { const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y); __m128i res_b_round0 = _mm_sra_epi32( _mm_add_epi32(res_b0, round_const_y), round_shift_y); res_b_round0 = _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y); __m128i res_b_round1 = _mm_sra_epi32( _mm_add_epi32(res_b1, round_const_y), round_shift_y); res_b_round1 = _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); res_16bit0 = _mm_max_epi16(res_16bit0, zero); __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); res_16bit1 = _mm_max_epi16(res_16bit1, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_16bit1); } else if (w == 4) { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j 
+ dst_stride], res_a_round1); } else { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res_a_round0); *((int *)(&dst[i * dst_stride + j + dst_stride])) = _mm_cvtsi128_si32(res_a_round1); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[3] = s[4]; s[4] = s[5]; s[6] = s[7]; s[7] = s[8]; s[8] = s[9]; s[9] = s[10]; s[10] = s[11]; s[12] = s[13]; s[13] = s[14]; s[14] = s[15]; s[15] = s[16]; s[16] = s[17]; s[18] = s[19]; s[19] = s[20]; s[20] = s[21]; s[21] = s[22]; s[22] = s[23]; s10 = s12; } } } } else { __m128i coeffs_x[4], coeffs_y[4], s[16]; prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < im_h; i += 1) { const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i row01 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); // even pixels s[0] = _mm_alignr_epi8(row01, row00, 0); s[1] = _mm_alignr_epi8(row01, row00, 4); s[2] = _mm_alignr_epi8(row01, row00, 8); s[3] = _mm_alignr_epi8(row01, row00, 12); __m128i res_even = convolve(s, coeffs_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm_alignr_epi8(row01, row00, 2); s[1] = _mm_alignr_epi8(row01, row00, 6); s[2] = _mm_alignr_epi8(row01, row00, 10); s[3] = _mm_alignr_epi8(row01, row00, 14); __m128i res_odd = convolve(s, coeffs_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); __m128i res_even1 = _mm_packs_epi32(res_even, res_even); __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); _mm_store_si128((__m128i *)&im_block[i * im_stride], res); } } /* Vertical filter */ { __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); s[0] = _mm_unpacklo_epi16(s0, s1); s[1] = _mm_unpacklo_epi16(s2, s3); s[2] = _mm_unpacklo_epi16(s4, s5); s[4] = _mm_unpackhi_epi16(s0, s1); s[5] = _mm_unpackhi_epi16(s2, s3); s[6] = _mm_unpackhi_epi16(s4, s5); s[0 + 8] = _mm_unpacklo_epi16(s1, s2); s[1 + 8] = _mm_unpacklo_epi16(s3, s4); s[2 + 8] = _mm_unpacklo_epi16(s5, s6); s[4 + 8] = _mm_unpackhi_epi16(s1, s2); s[5 + 8] = _mm_unpackhi_epi16(s3, s4); s[6 + 8] = _mm_unpackhi_epi16(s5, s6); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); s[3] = _mm_unpacklo_epi16(s6, s7); s[7] = _mm_unpackhi_epi16(s6, s7); s[3 + 8] = _mm_unpacklo_epi16(s7, s8); s[7 + 8] = _mm_unpackhi_epi16(s7, s8); const __m128i res_a0 = convolve(s, coeffs_y); __m128i res_a_round0 = _mm_sra_epi32( _mm_add_epi32(res_a0, round_const_y), round_shift_y); res_a_round0 = _mm_sra_epi32( 
_mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); const __m128i res_a1 = convolve(s + 8, coeffs_y); __m128i res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a1, round_const_y), round_shift_y); res_a_round1 = _mm_sra_epi32( _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); if (w - j > 4) { const __m128i res_b0 = convolve(s + 4, coeffs_y); __m128i res_b_round0 = _mm_sra_epi32( _mm_add_epi32(res_b0, round_const_y), round_shift_y); res_b_round0 = _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); __m128i res_b_round1 = _mm_sra_epi32( _mm_add_epi32(res_b1, round_const_y), round_shift_y); res_b_round1 = _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); res_16bit0 = _mm_max_epi16(res_16bit0, zero); __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); res_16bit1 = _mm_max_epi16(res_16bit1, zero); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_16bit1); } else if (w == 4) { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_a_round1); } else { res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); res_a_round0 = _mm_max_epi16(res_a_round0, zero); res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); res_a_round1 = _mm_max_epi16(res_a_round1, zero); *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res_a_round0); *((int *)(&dst[i * dst_stride + j + dst_stride])) = _mm_cvtsi128_si32(res_a_round1); } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; s[0 + 8] = s[1 + 8]; s[1 + 8] = s[2 + 8]; s[2 + 8] = s[3 + 8]; s[4 + 8] = s[5 + 8]; s[5 + 8] = s[6 + 8]; s[6 + 8] = s[7 + 8]; s6 = s8; } } } } } aom-3.12.1/av1/common/x86/highbd_inv_txfm_avx2.c000066400000000000000000005124431477627663500212610ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "aom_dsp/x86/txfm_common_avx2.h" // Note: // Total 32x4 registers to represent 32x32 block coefficients. 
// For high bit depth, each coefficient is 4-byte. // Each __m256i register holds 8 coefficients. // So each "row" we needs 4 register. Totally 32 rows // Register layout: // v0, v1, v2, v3, // v4, v5, v6, v7, // ... ... // v124, v125, v126, v127 static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); __m256i clamped, mask; mask = _mm256_cmpgt_epi16(u, max); clamped = _mm256_andnot_si256(mask, u); mask = _mm256_and_si256(mask, max); clamped = _mm256_or_si256(mask, clamped); mask = _mm256_cmpgt_epi16(clamped, zero); clamped = _mm256_and_si256(clamped, mask); return clamped; } static inline void round_shift_4x4_avx2(__m256i *in, int shift) { if (shift != 0) { __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); in[0] = _mm256_add_epi32(in[0], rnding); in[1] = _mm256_add_epi32(in[1], rnding); in[2] = _mm256_add_epi32(in[2], rnding); in[3] = _mm256_add_epi32(in[3], rnding); in[0] = _mm256_srai_epi32(in[0], shift); in[1] = _mm256_srai_epi32(in[1], shift); in[2] = _mm256_srai_epi32(in[2], shift); in[3] = _mm256_srai_epi32(in[3], shift); } } static inline void round_shift_8x8_avx2(__m256i *in, int shift) { round_shift_4x4_avx2(in, shift); round_shift_4x4_avx2(in + 4, shift); round_shift_4x4_avx2(in + 8, shift); round_shift_4x4_avx2(in + 12, shift); } static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, const __m256i *clamp_lo, const __m256i *clamp_hi, int size) { __m256i a0, a1; for (int i = 0; i < size; i += 4) { a0 = _mm256_max_epi32(in[i], *clamp_lo); out[i] = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); } } static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred, __m256i res0, __m256i res1, const int bd) { __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); x0 = _mm256_add_epi32(res0, x0); x1 = _mm256_add_epi32(res1, x1); x0 = _mm256_packus_epi32(x0, x1); x0 = _mm256_permute4x64_epi64(x0, 0xd8); x0 = highbd_clamp_epi16_avx2(x0, bd); return x0; } static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; for (int i = 0; i < height; ++i, j += step) { __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); _mm256_storeu_si256((__m256i *)(output + i * stride), u); } } static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, const int bd) { __m256i x0 = pred; x0 = _mm256_add_epi32(res, x0); x0 = _mm256_packus_epi32(x0, x0); x0 = _mm256_permute4x64_epi64(x0, 0xd8); x0 = highbd_clamp_epi16_avx2(x0, bd); return x0; } static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; __m128i temp; const int step = flipud ? 
-1 : 1; for (int i = 0; i < height; ++i, j += step) { temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); __m256i v = _mm256_cvtepi16_epi32(temp); __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); __m128i u1 = _mm256_castsi256_si128(u); _mm_storeu_si128((__m128i *)(output + i * stride), u1); } } static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, __m256i *out1, const __m256i *clamp_lo, const __m256i *clamp_hi, int shift) { __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); __m256i a0 = _mm256_add_epi32(offset, in0); __m256i a1 = _mm256_sub_epi32(offset, in1); a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); a0 = _mm256_max_epi32(a0, *clamp_lo); a0 = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(a1, *clamp_lo); a1 = _mm256_min_epi32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i x0, x1; u0 = _mm256_unpacklo_epi32(in[0], in[1]); u1 = _mm256_unpackhi_epi32(in[0], in[1]); u2 = _mm256_unpacklo_epi32(in[2], in[3]); u3 = _mm256_unpackhi_epi32(in[2], in[3]); u4 = _mm256_unpacklo_epi32(in[4], in[5]); u5 = _mm256_unpackhi_epi32(in[4], in[5]); u6 = _mm256_unpacklo_epi32(in[6], in[7]); u7 = _mm256_unpackhi_epi32(in[6], in[7]); x0 = _mm256_unpacklo_epi64(u0, u2); x1 = _mm256_unpacklo_epi64(u4, u6); out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u0, u2); x1 = _mm256_unpackhi_epi64(u4, u6); out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpacklo_epi64(u1, u3); x1 = _mm256_unpacklo_epi64(u5, u7); out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u1, u3); x1 = _mm256_unpackhi_epi64(u5, u7); out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i x0, x1; u0 = _mm256_unpacklo_epi32(in[7], in[6]); u1 = _mm256_unpackhi_epi32(in[7], in[6]); u2 = _mm256_unpacklo_epi32(in[5], in[4]); u3 = _mm256_unpackhi_epi32(in[5], in[4]); u4 = _mm256_unpacklo_epi32(in[3], in[2]); u5 = _mm256_unpackhi_epi32(in[3], in[2]); u6 = _mm256_unpacklo_epi32(in[1], in[0]); u7 = _mm256_unpackhi_epi32(in[1], in[0]); x0 = _mm256_unpacklo_epi64(u0, u2); x1 = _mm256_unpacklo_epi64(u4, u6); out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u0, u2); x1 = _mm256_unpackhi_epi64(u4, u6); out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpacklo_epi64(u1, u3); x1 = _mm256_unpacklo_epi64(u5, u7); out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u1, u3); x1 = _mm256_unpackhi_epi64(u5, u7); out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); } static inline void load_buffer_32bit_input(const int32_t *in, int stride, __m256i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); } } static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, const __m256i *rounding, int bit) { __m256i x; x = 
_mm256_mullo_epi32(*w0, *n0); x = _mm256_add_epi32(x, *rounding); x = _mm256_srai_epi32(x, bit); return x; } static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, const __m256i *w1, const __m256i *n1, const __m256i *rounding, int bit) { __m256i x, y; x = _mm256_mullo_epi32(*w0, *n0); y = _mm256_mullo_epi32(*w1, *n1); x = _mm256_add_epi32(x, y); x = _mm256_add_epi32(x, *rounding); x = _mm256_srai_epi32(x, bit); return x; } static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, __m256i *out1, const __m256i *clamp_lo, const __m256i *clamp_hi) { __m256i a0 = _mm256_add_epi32(in0, in1); __m256i a1 = _mm256_sub_epi32(in0, in1); a0 = _mm256_max_epi32(a0, *clamp_lo); a0 = _mm256_min_epi32(a0, *clamp_hi); a1 = _mm256_max_epi32(a1, *clamp_lo); a1 = _mm256_min_epi32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static inline void idct32_stage4_avx2( __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, const __m256i *rounding, int bit) { __m256i temp1, temp2; temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); bf1[17] = temp1; temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); bf1[18] = temp2; temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); bf1[21] = temp1; temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); bf1[22] = temp2; } static inline void idct32_stage5_avx2( __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rounding, int bit) { __m256i temp1, temp2; temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); bf1[9] = temp1; temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); bf1[10] = temp2; addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); } static inline void idct32_stage6_avx2( __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rounding, int bit) { __m256i temp1, temp2; temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); bf1[5] = temp1; addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, 
clamp_hi); addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); bf1[18] = temp1; temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); bf1[19] = temp2; temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); bf1[20] = temp1; temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); bf1[21] = temp2; } static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rounding, int bit) { __m256i temp1, temp2; addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); bf1[10] = temp1; temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); bf1[11] = temp2; addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rounding, int bit) { __m256i temp1, temp2; addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); bf1[20] = temp1; temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); bf1[21] = temp2; temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], 
rounding, bit); bf1[22] = temp1; temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); bf1[23] = temp2; } static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out, const int do_cols, const int bd, const int out_shift, const __m256i *clamp_lo, const __m256i *clamp_hi) { addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); round_shift_8x8_avx2(out + 16, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i x; // stage 0 // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 x = _mm256_mullo_epi32(in[0], cospi32); x = _mm256_add_epi32(x, rounding); x = _mm256_srai_epi32(x, bit); // stage 6 // stage 7 // stage 8 // stage 9 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); x = _mm256_add_epi32(offset, x); x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); } x = _mm256_max_epi32(x, clamp_lo); x = _mm256_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; out[8] = x; out[9] = x; out[10] = x; out[11] = x; out[12] = x; out[13] = x; out[14] = x; out[15] = x; out[16] = x; out[17] = x; out[18] = x; out[19] = x; out[20] = x; out[21] = x; out[22] = x; out[23] = x; out[24] = x; out[25] = x; out[26] = x; out[27] = x; out[28] = x; out[29] = x; out[30] = x; out[31] = x; } static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32]; { // stage 0 // stage 1 bf1[0] = in[0]; bf1[4] = in[4]; bf1[8] = in[2]; bf1[12] = in[6]; bf1[16] = in[1]; bf1[20] = in[5]; bf1[24] = in[3]; bf1[28] = in[7]; // stage 2 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); // stage 3 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); bf1[17] = bf1[16]; bf1[18] = bf1[19]; bf1[21] = bf1[20]; bf1[22] = bf1[23]; bf1[25] = bf1[24]; bf1[26] = bf1[27]; bf1[29] = bf1[28]; bf1[30] = bf1[31]; // stage 4 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); bf1[9] = bf1[8]; bf1[10] = bf1[11]; bf1[13] = bf1[12]; bf1[14] = bf1[15]; idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, &cospi24, &cospi40, &cospim24, &rounding, bit); // stage 5 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); bf1[1] = bf1[0]; bf1[5] = bf1[4]; bf1[6] = bf1[7]; idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 6 bf1[3] = bf1[0]; bf1[2] = bf1[1]; idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 7 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 8 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 9 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i 
cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32]; { // stage 0 // stage 1 bf1[0] = in[0]; bf1[2] = in[8]; bf1[4] = in[4]; bf1[6] = in[12]; bf1[8] = in[2]; bf1[10] = in[10]; bf1[12] = in[6]; bf1[14] = in[14]; bf1[16] = in[1]; bf1[18] = in[9]; bf1[20] = in[5]; bf1[22] = in[13]; bf1[24] = in[3]; bf1[26] = in[11]; bf1[28] = in[7]; bf1[30] = in[15]; // stage 2 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); // stage 3 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_avx2(bf1[31], 
bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, &cospi24, &cospi40, &cospim24, &rounding, bit); // stage 5 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); bf1[1] = bf1[0]; bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 6 addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 7 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 8 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 9 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const 
__m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i bf1[32], bf0[32]; { // stage 0 // stage 1 bf1[0] = in[0]; bf1[1] = in[16]; bf1[2] = in[8]; bf1[3] = in[24]; bf1[4] = in[4]; bf1[5] = in[20]; bf1[6] = in[12]; bf1[7] = in[28]; bf1[8] = in[2]; bf1[9] = in[18]; bf1[10] = in[10]; bf1[11] = in[26]; bf1[12] = in[6]; bf1[13] = in[22]; bf1[14] = in[14]; bf1[15] = in[30]; bf1[16] = in[1]; bf1[17] = in[17]; bf1[18] = in[9]; bf1[19] = in[25]; bf1[20] = in[5]; bf1[21] = in[21]; bf1[22] = in[13]; bf1[23] = in[29]; bf1[24] = in[3]; bf1[25] = in[19]; bf1[26] = in[11]; bf1[27] = in[27]; bf1[28] = in[7]; bf1[29] = in[23]; bf1[30] = in[15]; bf1[31] = in[31]; // stage 2 bf0[0] = bf1[0]; bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; bf0[4] = bf1[4]; bf0[5] = bf1[5]; bf0[6] = bf1[6]; bf0[7] = bf1[7]; bf0[8] = bf1[8]; bf0[9] = bf1[9]; bf0[10] = bf1[10]; bf0[11] = bf1[11]; bf0[12] = bf1[12]; bf0[13] = bf1[13]; bf0[14] = bf1[14]; bf0[15] = bf1[15]; bf0[16] = half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); bf0[17] = half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); bf0[18] = half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); bf0[19] = half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); bf0[20] = half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); bf0[21] = half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); bf0[22] = half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); bf0[23] = half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); bf0[24] = half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); bf0[25] = half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); bf0[26] = half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); bf0[27] = half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); bf0[28] = half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); bf0[29] = half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); bf0[30] = half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); bf0[31] = half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); // stage 3 bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = 
bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); bf1[9] = half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); bf1[10] = half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); bf1[11] = half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); bf1[12] = half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); bf1[13] = half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); bf1[14] = half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); bf1[15] = half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf0[0] = bf1[0]; bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; bf0[4] = half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); bf0[5] = half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); bf0[6] = half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); bf0[18] = half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); bf0[19] = bf1[19]; bf0[20] = bf1[20]; bf0[21] = half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); bf0[22] = half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); bf0[26] = half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); bf0[27] = bf1[27]; bf0[28] = bf1[28]; bf0[29] = half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); bf0[30] = half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); bf0[31] = bf1[31]; // stage 5 bf1[0] = half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); bf1[1] = half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); bf1[2] = half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); bf1[3] = half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); bf1[10] = half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], 
&rounding, bit); bf1[14] = half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); bf1[15] = bf0[15]; addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); // stage 6 addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); bf0[4] = bf1[4]; bf0[5] = half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[6] = half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[7] = bf1[7]; addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); bf0[19] = half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); bf0[20] = half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); bf0[21] = half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); bf0[22] = bf1[22]; bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = bf1[25]; bf0[26] = half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); bf0[27] = half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); bf0[28] = half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); bf0[29] = half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 7 addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[11] = half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); bf1[12] = half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); bf1[13] = half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); // stage 8 addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, 
&clamp_hi); addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = bf1[18]; bf0[19] = bf1[19]; bf0[20] = half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[21] = half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); bf0[22] = half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); bf0[23] = half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); bf0[24] = half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); bf0[25] = half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); bf0[26] = half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); bf0[27] = half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[28] = bf1[28]; bf0[29] = bf1[29]; bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 9 addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); round_shift_8x8_avx2(out + 16, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } } static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); { // stage 0 // stage 1 // stage 2 // stage 3 // stage 4 in[0] = _mm256_mullo_epi32(in[0], cospi32); in[0] = _mm256_add_epi32(in[0], rnding); in[0] = _mm256_srai_epi32(in[0], bit); // stage 5 // stage 6 // stage 7 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); in[0] = _mm256_add_epi32(in[0], offset); in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); } in[0] = _mm256_max_epi32(in[0], clamp_lo); in[0] = _mm256_min_epi32(in[0], clamp_hi); out[0] = in[0]; out[1] = in[0]; out[2] = in[0]; out[3] = in[0]; out[4] = in[0]; out[5] = in[0]; out[6] = in[0]; out[7] = in[0]; out[8] = in[0]; out[9] = in[0]; out[10] = in[0]; out[11] = in[0]; out[12] = in[0]; out[13] = in[0]; out[14] = in[0]; out[15] = in[0]; } } static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i u[16], x, y; { // stage 0 // stage 1 u[0] = in[0]; u[2] = in[4]; u[4] = in[2]; u[6] = in[6]; u[8] = in[1]; u[10] = in[5]; u[12] = in[3]; u[14] = in[7]; // stage 2 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); // stage 3 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit); u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit); addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = _mm256_mullo_epi32(u[0], cospi32); u[0] = _mm256_add_epi32(x, rnding); u[0] = _mm256_srai_epi32(u[0], bit); u[1] = u[0]; u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = x; y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); u[10] = y; // stage 5 addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); x = _mm256_mullo_epi32(u[5], cospi32); y = _mm256_mullo_epi32(u[6], cospi32); u[5] = _mm256_sub_epi32(y, x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_add_epi32(y, x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); x = _mm256_mullo_epi32(u[10], cospi32); y = _mm256_mullo_epi32(u[13], cospi32); u[10] = _mm256_sub_epi32(y, x); u[10] = _mm256_add_epi32(u[10], rnding); u[10] = _mm256_srai_epi32(u[10], bit); u[13] = _mm256_add_epi32(x, y); u[13] = _mm256_add_epi32(u[13], rnding); u[13] = _mm256_srai_epi32(u[13], bit); x = _mm256_mullo_epi32(u[11], cospi32); y = _mm256_mullo_epi32(u[12], cospi32); u[11] = _mm256_sub_epi32(y, x); u[11] = _mm256_add_epi32(u[11], rnding); u[11] = _mm256_srai_epi32(u[11], bit); u[12] = _mm256_add_epi32(x, y); u[12] = _mm256_add_epi32(u[12], rnding); u[12] = _mm256_srai_epi32(u[12], bit); // stage 7 addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, 
&clamp_hi); addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i u[16], v[16], x, y; { // stage 0 // stage 1 u[0] = in[0]; u[1] = in[8]; u[2] = in[4]; u[3] = in[12]; u[4] = in[2]; u[5] = in[10]; u[6] = in[6]; u[7] = in[14]; u[8] = in[1]; u[9] = in[9]; u[10] = in[5]; u[11] = in[13]; u[12] = in[3]; u[13] = in[11]; u[14] = in[7]; u[15] = in[15]; // stage 2 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = _mm256_mullo_epi32(u[0], cospi32); y = _mm256_mullo_epi32(u[1], cospi32); v[0] = _mm256_add_epi32(x, y); v[0] = _mm256_add_epi32(v[0], rnding); v[0] = _mm256_srai_epi32(v[0], bit); v[1] = _mm256_sub_epi32(x, y); v[1] = _mm256_add_epi32(v[1], rnding); v[1] = _mm256_srai_epi32(v[1], bit); v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[11] = u[11]; v[12] = u[12]; v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); v[15] = u[15]; // stage 5 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; x = _mm256_mullo_epi32(v[5], cospi32); y = _mm256_mullo_epi32(v[6], cospi32); u[5] = _mm256_sub_epi32(y, x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_add_epi32(y, x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = v[7]; addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); 
addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = u[9]; x = _mm256_mullo_epi32(u[10], cospi32); y = _mm256_mullo_epi32(u[13], cospi32); v[10] = _mm256_sub_epi32(y, x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[13] = _mm256_add_epi32(x, y); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); x = _mm256_mullo_epi32(u[11], cospi32); y = _mm256_mullo_epi32(u[12], cospi32); v[11] = _mm256_sub_epi32(y, x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = _mm256_add_epi32(x, y); v[12] = _mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); v[14] = u[14]; v[15] = u[15]; // stage 7 addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const __m256i zero = _mm256_setzero_si256(); __m256i v[16], x, y, temp1, temp2; // Calculate the column 0, 1, 2, 3 { // stage 0 // stage 1 // stage 2 x = _mm256_mullo_epi32(in[0], cospi62); v[0] = _mm256_add_epi32(x, rnding); v[0] = _mm256_srai_epi32(v[0], bit); x = _mm256_mullo_epi32(in[0], cospi2); v[1] = _mm256_sub_epi32(zero, x); v[1] = _mm256_add_epi32(v[1], rnding); v[1] = _mm256_srai_epi32(v[1], bit); // stage 3 v[8] = v[0]; v[9] = v[1]; // stage 4 temp1 = _mm256_mullo_epi32(v[8], cospi8); x = _mm256_mullo_epi32(v[9], cospi56); temp1 = _mm256_add_epi32(temp1, x); temp1 = _mm256_add_epi32(temp1, rnding); temp1 = _mm256_srai_epi32(temp1, bit); temp2 = _mm256_mullo_epi32(v[8], cospi56); x = _mm256_mullo_epi32(v[9], cospi8); temp2 = _mm256_sub_epi32(temp2, x); temp2 = _mm256_add_epi32(temp2, rnding); temp2 = _mm256_srai_epi32(temp2, bit); v[8] = temp1; v[9] = temp2; // stage 5 v[4] = v[0]; v[5] = v[1]; v[12] = v[8]; v[13] = v[9]; // stage 6 temp1 = _mm256_mullo_epi32(v[4], cospi16); x = _mm256_mullo_epi32(v[5], cospi48); temp1 = _mm256_add_epi32(temp1, x); temp1 = _mm256_add_epi32(temp1, rnding); temp1 = _mm256_srai_epi32(temp1, bit); temp2 = _mm256_mullo_epi32(v[4], cospi48); x = _mm256_mullo_epi32(v[5], cospi16); temp2 = _mm256_sub_epi32(temp2, x); temp2 = _mm256_add_epi32(temp2, rnding); temp2 = _mm256_srai_epi32(temp2, bit); v[4] = temp1; v[5] = 
temp2; temp1 = _mm256_mullo_epi32(v[12], cospi16); x = _mm256_mullo_epi32(v[13], cospi48); temp1 = _mm256_add_epi32(temp1, x); temp1 = _mm256_add_epi32(temp1, rnding); temp1 = _mm256_srai_epi32(temp1, bit); temp2 = _mm256_mullo_epi32(v[12], cospi48); x = _mm256_mullo_epi32(v[13], cospi16); temp2 = _mm256_sub_epi32(temp2, x); temp2 = _mm256_add_epi32(temp2, rnding); temp2 = _mm256_srai_epi32(temp2, bit); v[12] = temp1; v[13] = temp2; // stage 7 v[2] = v[0]; v[3] = v[1]; v[6] = v[4]; v[7] = v[5]; v[10] = v[8]; v[11] = v[9]; v[14] = v[12]; v[15] = v[13]; // stage 8 y = _mm256_mullo_epi32(v[2], cospi32); x = _mm256_mullo_epi32(v[3], cospi32); v[2] = _mm256_add_epi32(y, x); v[2] = _mm256_add_epi32(v[2], rnding); v[2] = _mm256_srai_epi32(v[2], bit); v[3] = _mm256_sub_epi32(y, x); v[3] = _mm256_add_epi32(v[3], rnding); v[3] = _mm256_srai_epi32(v[3], bit); y = _mm256_mullo_epi32(v[6], cospi32); x = _mm256_mullo_epi32(v[7], cospi32); v[6] = _mm256_add_epi32(y, x); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); v[7] = _mm256_sub_epi32(y, x); v[7] = _mm256_add_epi32(v[7], rnding); v[7] = _mm256_srai_epi32(v[7], bit); y = _mm256_mullo_epi32(v[10], cospi32); x = _mm256_mullo_epi32(v[11], cospi32); v[10] = _mm256_add_epi32(y, x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[11] = _mm256_sub_epi32(y, x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); y = _mm256_mullo_epi32(v[14], cospi32); x = _mm256_mullo_epi32(v[15], cospi32); v[14] = _mm256_add_epi32(y, x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_sub_epi32(y, x); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); out[2] = v[12]; out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); out[4] = v[6]; out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); out[6] = v[10]; out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); out[8] = v[3]; out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); out[10] = v[15]; out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); out[12] = v[5]; out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); out[14] = v[9]; out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } } static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); 
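/* Note on the constants loaded below: cospi_arr(bit) returns the table of
   cos(i * PI / 64) values in Q(bit) fixed point, and the cospimXX variants
   are simply the negated entries, broadcast once here so the butterfly
   stages can consume them directly without extra sign flips. */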
const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i u[16], x, y; { // stage 0 // stage 1 // stage 2 __m256i zero = _mm256_setzero_si256(); x = _mm256_mullo_epi32(in[0], cospi62); u[0] = _mm256_add_epi32(x, rnding); u[0] = _mm256_srai_epi32(u[0], bit); x = _mm256_mullo_epi32(in[0], cospi2); u[1] = _mm256_sub_epi32(zero, x); u[1] = _mm256_add_epi32(u[1], rnding); u[1] = _mm256_srai_epi32(u[1], bit); x = _mm256_mullo_epi32(in[2], cospi54); u[2] = _mm256_add_epi32(x, rnding); u[2] = _mm256_srai_epi32(u[2], bit); x = _mm256_mullo_epi32(in[2], cospi10); u[3] = _mm256_sub_epi32(zero, x); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); x = _mm256_mullo_epi32(in[4], cospi46); u[4] = _mm256_add_epi32(x, rnding); u[4] = _mm256_srai_epi32(u[4], bit); x = _mm256_mullo_epi32(in[4], cospi18); u[5] = _mm256_sub_epi32(zero, x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); x = _mm256_mullo_epi32(in[6], cospi38); u[6] = _mm256_add_epi32(x, rnding); u[6] = _mm256_srai_epi32(u[6], bit); x = _mm256_mullo_epi32(in[6], cospi26); u[7] = _mm256_sub_epi32(zero, x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); u[8] = _mm256_mullo_epi32(in[7], cospi34); u[8] = _mm256_add_epi32(u[8], rnding); u[8] = _mm256_srai_epi32(u[8], bit); u[9] = _mm256_mullo_epi32(in[7], cospi30); u[9] = _mm256_add_epi32(u[9], rnding); u[9] = _mm256_srai_epi32(u[9], bit); u[10] = _mm256_mullo_epi32(in[5], cospi42); u[10] = _mm256_add_epi32(u[10], rnding); u[10] = _mm256_srai_epi32(u[10], bit); u[11] = _mm256_mullo_epi32(in[5], cospi22); u[11] = _mm256_add_epi32(u[11], rnding); u[11] = _mm256_srai_epi32(u[11], bit); u[12] = _mm256_mullo_epi32(in[3], cospi50); u[12] = _mm256_add_epi32(u[12], rnding); u[12] = _mm256_srai_epi32(u[12], bit); u[13] = _mm256_mullo_epi32(in[3], cospi14); u[13] = _mm256_add_epi32(u[13], rnding); u[13] = _mm256_srai_epi32(u[13], bit); u[14] = _mm256_mullo_epi32(in[1], cospi58); u[14] = _mm256_add_epi32(u[14], rnding); u[14] = _mm256_srai_epi32(u[14], 
bit); u[15] = _mm256_mullo_epi32(in[1], cospi6); u[15] = _mm256_add_epi32(u[15], rnding); u[15] = _mm256_srai_epi32(u[15], bit); // stage 3 addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 y = _mm256_mullo_epi32(u[8], cospi56); x = _mm256_mullo_epi32(u[9], cospi56); u[8] = _mm256_mullo_epi32(u[8], cospi8); u[8] = _mm256_add_epi32(u[8], x); u[8] = _mm256_add_epi32(u[8], rnding); u[8] = _mm256_srai_epi32(u[8], bit); x = _mm256_mullo_epi32(u[9], cospi8); u[9] = _mm256_sub_epi32(y, x); u[9] = _mm256_add_epi32(u[9], rnding); u[9] = _mm256_srai_epi32(u[9], bit); x = _mm256_mullo_epi32(u[11], cospi24); y = _mm256_mullo_epi32(u[10], cospi24); u[10] = _mm256_mullo_epi32(u[10], cospi40); u[10] = _mm256_add_epi32(u[10], x); u[10] = _mm256_add_epi32(u[10], rnding); u[10] = _mm256_srai_epi32(u[10], bit); x = _mm256_mullo_epi32(u[11], cospi40); u[11] = _mm256_sub_epi32(y, x); u[11] = _mm256_add_epi32(u[11], rnding); u[11] = _mm256_srai_epi32(u[11], bit); x = _mm256_mullo_epi32(u[13], cospi8); y = _mm256_mullo_epi32(u[12], cospi8); u[12] = _mm256_mullo_epi32(u[12], cospim56); u[12] = _mm256_add_epi32(u[12], x); u[12] = _mm256_add_epi32(u[12], rnding); u[12] = _mm256_srai_epi32(u[12], bit); x = _mm256_mullo_epi32(u[13], cospim56); u[13] = _mm256_sub_epi32(y, x); u[13] = _mm256_add_epi32(u[13], rnding); u[13] = _mm256_srai_epi32(u[13], bit); x = _mm256_mullo_epi32(u[15], cospi40); y = _mm256_mullo_epi32(u[14], cospi40); u[14] = _mm256_mullo_epi32(u[14], cospim24); u[14] = _mm256_add_epi32(u[14], x); u[14] = _mm256_add_epi32(u[14], rnding); u[14] = _mm256_srai_epi32(u[14], bit); x = _mm256_mullo_epi32(u[15], cospim24); u[15] = _mm256_sub_epi32(y, x); u[15] = _mm256_add_epi32(u[15], rnding); u[15] = _mm256_srai_epi32(u[15], bit); // stage 5 addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 x = _mm256_mullo_epi32(u[5], cospi48); y = _mm256_mullo_epi32(u[4], cospi48); u[4] = _mm256_mullo_epi32(u[4], cospi16); u[4] = _mm256_add_epi32(u[4], x); u[4] = _mm256_add_epi32(u[4], rnding); u[4] = _mm256_srai_epi32(u[4], bit); x = _mm256_mullo_epi32(u[5], cospi16); u[5] = _mm256_sub_epi32(y, x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); x = _mm256_mullo_epi32(u[7], cospi16); y = _mm256_mullo_epi32(u[6], cospi16); u[6] = _mm256_mullo_epi32(u[6], cospim48); u[6] = _mm256_add_epi32(u[6], x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); x = _mm256_mullo_epi32(u[7], cospim48); u[7] = _mm256_sub_epi32(y, x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); x = _mm256_mullo_epi32(u[13], cospi48); y = _mm256_mullo_epi32(u[12], 
cospi48); u[12] = _mm256_mullo_epi32(u[12], cospi16); u[12] = _mm256_add_epi32(u[12], x); u[12] = _mm256_add_epi32(u[12], rnding); u[12] = _mm256_srai_epi32(u[12], bit); x = _mm256_mullo_epi32(u[13], cospi16); u[13] = _mm256_sub_epi32(y, x); u[13] = _mm256_add_epi32(u[13], rnding); u[13] = _mm256_srai_epi32(u[13], bit); x = _mm256_mullo_epi32(u[15], cospi16); y = _mm256_mullo_epi32(u[14], cospi16); u[14] = _mm256_mullo_epi32(u[14], cospim48); u[14] = _mm256_add_epi32(u[14], x); u[14] = _mm256_add_epi32(u[14], rnding); u[14] = _mm256_srai_epi32(u[14], bit); x = _mm256_mullo_epi32(u[15], cospim48); u[15] = _mm256_sub_epi32(y, x); u[15] = _mm256_add_epi32(u[15], rnding); u[15] = _mm256_srai_epi32(u[15], bit); // stage 7 addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 y = _mm256_mullo_epi32(u[2], cospi32); x = _mm256_mullo_epi32(u[3], cospi32); u[2] = _mm256_add_epi32(y, x); u[2] = _mm256_add_epi32(u[2], rnding); u[2] = _mm256_srai_epi32(u[2], bit); u[3] = _mm256_sub_epi32(y, x); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); y = _mm256_mullo_epi32(u[6], cospi32); x = _mm256_mullo_epi32(u[7], cospi32); u[6] = _mm256_add_epi32(y, x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = _mm256_sub_epi32(y, x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); y = _mm256_mullo_epi32(u[10], cospi32); x = _mm256_mullo_epi32(u[11], cospi32); u[10] = _mm256_add_epi32(y, x); u[10] = _mm256_add_epi32(u[10], rnding); u[10] = _mm256_srai_epi32(u[10], bit); u[11] = _mm256_sub_epi32(y, x); u[11] = _mm256_add_epi32(u[11], rnding); u[11] = _mm256_srai_epi32(u[11], bit); y = _mm256_mullo_epi32(u[14], cospi32); x = _mm256_mullo_epi32(u[15], cospi32); u[14] = _mm256_add_epi32(y, x); u[14] = _mm256_add_epi32(u[14], rnding); u[14] = _mm256_srai_epi32(u[14], bit); u[15] = _mm256_sub_epi32(y, x); u[15] = _mm256_add_epi32(u[15], rnding); u[15] = _mm256_srai_epi32(u[15], bit); // stage 9 if (do_cols) { out[0] = u[0]; out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]); out[2] = u[12]; out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]); out[4] = u[6]; out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]); out[6] = u[10]; out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]); out[8] = u[3]; out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]); out[10] = u[15]; out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]); out[12] = u[5]; out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]); out[14] = u[9]; out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[10], u[2], out + 
6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } } static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i u[16], v[16], x, y; { // stage 0 // stage 1 // stage 2 v[0] = _mm256_mullo_epi32(in[15], cospi2); x = _mm256_mullo_epi32(in[0], cospi62); v[0] = _mm256_add_epi32(v[0], x); v[0] = _mm256_add_epi32(v[0], rnding); v[0] = _mm256_srai_epi32(v[0], bit); v[1] = _mm256_mullo_epi32(in[15], cospi62); x = _mm256_mullo_epi32(in[0], cospi2); v[1] = _mm256_sub_epi32(v[1], x); v[1] = _mm256_add_epi32(v[1], rnding); v[1] = _mm256_srai_epi32(v[1], bit); v[2] = _mm256_mullo_epi32(in[13], cospi10); x = _mm256_mullo_epi32(in[2], cospi54); v[2] = _mm256_add_epi32(v[2], x); v[2] = _mm256_add_epi32(v[2], rnding); v[2] = _mm256_srai_epi32(v[2], bit); v[3] = _mm256_mullo_epi32(in[13], cospi54); x = _mm256_mullo_epi32(in[2], cospi10); v[3] = _mm256_sub_epi32(v[3], x); v[3] = _mm256_add_epi32(v[3], rnding); v[3] = _mm256_srai_epi32(v[3], bit); v[4] = _mm256_mullo_epi32(in[11], cospi18); x = _mm256_mullo_epi32(in[4], cospi46); v[4] = _mm256_add_epi32(v[4], x); v[4] = _mm256_add_epi32(v[4], rnding); v[4] = _mm256_srai_epi32(v[4], bit); v[5] = _mm256_mullo_epi32(in[11], cospi46); x = _mm256_mullo_epi32(in[4], cospi18); v[5] = _mm256_sub_epi32(v[5], x); v[5] = _mm256_add_epi32(v[5], rnding); v[5] = _mm256_srai_epi32(v[5], bit); v[6] = _mm256_mullo_epi32(in[9], cospi26); x = _mm256_mullo_epi32(in[6], cospi38); v[6] = _mm256_add_epi32(v[6], x); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); v[7] = _mm256_mullo_epi32(in[9], cospi38); x = _mm256_mullo_epi32(in[6], cospi26); v[7] = _mm256_sub_epi32(v[7], x); v[7] = _mm256_add_epi32(v[7], rnding); v[7] = _mm256_srai_epi32(v[7], bit); v[8] = _mm256_mullo_epi32(in[7], cospi34); x = _mm256_mullo_epi32(in[8], cospi30); v[8] = _mm256_add_epi32(v[8], x); v[8] = _mm256_add_epi32(v[8], rnding); v[8] = _mm256_srai_epi32(v[8], bit); v[9] = _mm256_mullo_epi32(in[7], cospi30); x = _mm256_mullo_epi32(in[8], cospi34); v[9] = _mm256_sub_epi32(v[9], x); v[9] = _mm256_add_epi32(v[9], rnding); v[9] = _mm256_srai_epi32(v[9], bit); v[10] = _mm256_mullo_epi32(in[5], cospi42); x = _mm256_mullo_epi32(in[10], cospi22); v[10] = _mm256_add_epi32(v[10], x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[11] = _mm256_mullo_epi32(in[5], cospi22); x = _mm256_mullo_epi32(in[10], cospi42); v[11] = _mm256_sub_epi32(v[11], x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = _mm256_mullo_epi32(in[3], cospi50); x = _mm256_mullo_epi32(in[12], cospi14); v[12] = _mm256_add_epi32(v[12], x); v[12] = _mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); v[13] = _mm256_mullo_epi32(in[3], cospi14); x = _mm256_mullo_epi32(in[12], cospi50); v[13] = _mm256_sub_epi32(v[13], x); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[14] = _mm256_mullo_epi32(in[1], cospi58); x = _mm256_mullo_epi32(in[14], cospi6); v[14] = _mm256_add_epi32(v[14], x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_mullo_epi32(in[1], cospi6); x = _mm256_mullo_epi32(in[14], cospi58); v[15] = _mm256_sub_epi32(v[15], x); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 3 addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); 
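    /* addsub_avx2() (defined earlier in this file) is the clamped butterfly
       used by every stage: it writes in0 + in1 and in0 - in1, each saturated
       to [clamp_lo, clamp_hi], so intermediates stay inside the bit-depth
       dependent range derived from log_range above. */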
addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = _mm256_mullo_epi32(u[8], cospi8); x = _mm256_mullo_epi32(u[9], cospi56); v[8] = _mm256_add_epi32(v[8], x); v[8] = _mm256_add_epi32(v[8], rnding); v[8] = _mm256_srai_epi32(v[8], bit); v[9] = _mm256_mullo_epi32(u[8], cospi56); x = _mm256_mullo_epi32(u[9], cospi8); v[9] = _mm256_sub_epi32(v[9], x); v[9] = _mm256_add_epi32(v[9], rnding); v[9] = _mm256_srai_epi32(v[9], bit); v[10] = _mm256_mullo_epi32(u[10], cospi40); x = _mm256_mullo_epi32(u[11], cospi24); v[10] = _mm256_add_epi32(v[10], x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[11] = _mm256_mullo_epi32(u[10], cospi24); x = _mm256_mullo_epi32(u[11], cospi40); v[11] = _mm256_sub_epi32(v[11], x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = _mm256_mullo_epi32(u[12], cospim56); x = _mm256_mullo_epi32(u[13], cospi8); v[12] = _mm256_add_epi32(v[12], x); v[12] = _mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); v[13] = _mm256_mullo_epi32(u[12], cospi8); x = _mm256_mullo_epi32(u[13], cospim56); v[13] = _mm256_sub_epi32(v[13], x); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[14] = _mm256_mullo_epi32(u[14], cospim24); x = _mm256_mullo_epi32(u[15], cospi40); v[14] = _mm256_add_epi32(v[14], x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_mullo_epi32(u[14], cospi40); x = _mm256_mullo_epi32(u[15], cospim24); v[15] = _mm256_sub_epi32(v[15], x); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 5 addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = _mm256_mullo_epi32(u[4], cospi16); x = _mm256_mullo_epi32(u[5], cospi48); v[4] = _mm256_add_epi32(v[4], x); v[4] = _mm256_add_epi32(v[4], rnding); v[4] = _mm256_srai_epi32(v[4], bit); v[5] = _mm256_mullo_epi32(u[4], cospi48); x = _mm256_mullo_epi32(u[5], cospi16); v[5] = _mm256_sub_epi32(v[5], x); v[5] = _mm256_add_epi32(v[5], rnding); v[5] = _mm256_srai_epi32(v[5], bit); v[6] = _mm256_mullo_epi32(u[6], cospim48); x = _mm256_mullo_epi32(u[7], cospi16); v[6] = _mm256_add_epi32(v[6], x); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); v[7] = _mm256_mullo_epi32(u[6], cospi16); x = _mm256_mullo_epi32(u[7], cospim48); v[7] = _mm256_sub_epi32(v[7], x); v[7] = _mm256_add_epi32(v[7], rnding); v[7] = _mm256_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; v[12] = _mm256_mullo_epi32(u[12], cospi16); x = _mm256_mullo_epi32(u[13], cospi48); v[12] = _mm256_add_epi32(v[12], x); v[12] = 
_mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); v[13] = _mm256_mullo_epi32(u[12], cospi48); x = _mm256_mullo_epi32(u[13], cospi16); v[13] = _mm256_sub_epi32(v[13], x); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[14] = _mm256_mullo_epi32(u[14], cospim48); x = _mm256_mullo_epi32(u[15], cospi16); v[14] = _mm256_add_epi32(v[14], x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_mullo_epi32(u[14], cospi16); x = _mm256_mullo_epi32(u[15], cospim48); v[15] = _mm256_sub_epi32(v[15], x); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 7 addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 v[0] = u[0]; v[1] = u[1]; y = _mm256_mullo_epi32(u[2], cospi32); x = _mm256_mullo_epi32(u[3], cospi32); v[2] = _mm256_add_epi32(y, x); v[2] = _mm256_add_epi32(v[2], rnding); v[2] = _mm256_srai_epi32(v[2], bit); v[3] = _mm256_sub_epi32(y, x); v[3] = _mm256_add_epi32(v[3], rnding); v[3] = _mm256_srai_epi32(v[3], bit); v[4] = u[4]; v[5] = u[5]; y = _mm256_mullo_epi32(u[6], cospi32); x = _mm256_mullo_epi32(u[7], cospi32); v[6] = _mm256_add_epi32(y, x); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); v[7] = _mm256_sub_epi32(y, x); v[7] = _mm256_add_epi32(v[7], rnding); v[7] = _mm256_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; y = _mm256_mullo_epi32(u[10], cospi32); x = _mm256_mullo_epi32(u[11], cospi32); v[10] = _mm256_add_epi32(y, x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[11] = _mm256_sub_epi32(y, x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = u[12]; v[13] = u[13]; y = _mm256_mullo_epi32(u[14], cospi32); x = _mm256_mullo_epi32(u[15], cospi32); v[14] = _mm256_add_epi32(y, x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_sub_epi32(y, x); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); out[2] = v[12]; out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); out[4] = v[6]; out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); out[6] = v[10]; out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); out[8] = v[3]; out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); out[10] = v[15]; out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); out[12] = v[5]; out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); out[14] = v[9]; out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(v[6], v[14], out + 4, out + 5, 
                     &clamp_lo_out, &clamp_hi_out, out_shift);
      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    }
  }
}

static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm256_mullo_epi32(in[0], cospi32);
  x = _mm256_add_epi32(x, rnding);
  x = _mm256_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
    clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
    x = _mm256_add_epi32(x, offset);
    x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }
  x = _mm256_max_epi32(x, clamp_lo);
  x = _mm256_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                         int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
  __m256i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm256_mullo_epi32(in[1], cospi56);
  y = _mm256_mullo_epi32(in[7], cospim8);
  u4 = _mm256_add_epi32(x, y);
  u4 = _mm256_add_epi32(u4, rnding);
  u4 = _mm256_srai_epi32(u4, bit);

  x = _mm256_mullo_epi32(in[1], cospi8);
  y = _mm256_mullo_epi32(in[7], cospi56);
  u7 = _mm256_add_epi32(x, y);
  u7 = _mm256_add_epi32(u7, rnding);
  u7 = _mm256_srai_epi32(u7, bit);

  x = _mm256_mullo_epi32(in[5], cospi24);
  y = _mm256_mullo_epi32(in[3], cospim40);
  u5 = _mm256_add_epi32(x, y);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  x = _mm256_mullo_epi32(in[5], cospi40);
  y = _mm256_mullo_epi32(in[3], cospi24);
  u6 = _mm256_add_epi32(x, y);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  // stage 3
  x = _mm256_mullo_epi32(u0, cospi32);
  y = _mm256_mullo_epi32(u1, cospi32);
  v0 = _mm256_add_epi32(x, y);
  v0 = _mm256_add_epi32(v0, rnding);
  v0 = _mm256_srai_epi32(v0, bit);

  v1 = _mm256_sub_epi32(x, y);
  v1 = _mm256_add_epi32(v1, rnding);
  v1 = _mm256_srai_epi32(v1, bit);

  x = _mm256_mullo_epi32(u2, cospi48);
  y = _mm256_mullo_epi32(u3, cospim16);
  v2 = _mm256_add_epi32(x, y);
  v2 = _mm256_add_epi32(v2, rnding);
  v2 = _mm256_srai_epi32(v2, bit);

  x = _mm256_mullo_epi32(u2, cospi16);
  y = _mm256_mullo_epi32(u3, cospi48);
  v3 = _mm256_add_epi32(x, y);
  v3 = _mm256_add_epi32(v3, rnding);
  v3 = _mm256_srai_epi32(v3, bit);

  addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm256_mullo_epi32(v5, cospi32);
  y = _mm256_mullo_epi32(v6, cospi32);
  u6 = _mm256_add_epi32(y, x);
  u6 = _mm256_add_epi32(u6, rnding);
  u6 = _mm256_srai_epi32(u6, bit);

  u5 = _mm256_sub_epi32(y, x);
  u5 = _mm256_add_epi32(u5, rnding);
  u5 = _mm256_srai_epi32(u5, bit);

  addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);

  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m256i clamp_lo_out =
        _mm256_set1_epi32(-(1 << (log_range_out - 1)));
    const __m256i clamp_hi_out =
        _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_4x4_avx2(out, out_shift);
    round_shift_4x4_avx2(out + 4, out_shift);
    highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}

static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
  const __m256i kZero = _mm256_setzero_si256();
  __m256i u[8], x;

  // stage 0
  // stage 1
  // stage 2
  x = _mm256_mullo_epi32(in[0], cospi60);
  u[0] = _mm256_add_epi32(x, rnding);
  u[0] = _mm256_srai_epi32(u[0], bit);

  x = _mm256_mullo_epi32(in[0], cospi4);
  u[1] = _mm256_sub_epi32(kZero, x);
  u[1] =
_mm256_add_epi32(u[1], rnding); u[1] = _mm256_srai_epi32(u[1], bit); // stage 3 // stage 4 __m256i temp1, temp2; temp1 = _mm256_mullo_epi32(u[0], cospi16); x = _mm256_mullo_epi32(u[1], cospi48); temp1 = _mm256_add_epi32(temp1, x); temp1 = _mm256_add_epi32(temp1, rnding); temp1 = _mm256_srai_epi32(temp1, bit); u[4] = temp1; temp2 = _mm256_mullo_epi32(u[0], cospi48); x = _mm256_mullo_epi32(u[1], cospi16); u[5] = _mm256_sub_epi32(temp2, x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); // stage 5 // stage 6 temp1 = _mm256_mullo_epi32(u[0], cospi32); x = _mm256_mullo_epi32(u[1], cospi32); u[2] = _mm256_add_epi32(temp1, x); u[2] = _mm256_add_epi32(u[2], rnding); u[2] = _mm256_srai_epi32(u[2], bit); u[3] = _mm256_sub_epi32(temp1, x); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); temp1 = _mm256_mullo_epi32(u[4], cospi32); x = _mm256_mullo_epi32(u[5], cospi32); u[6] = _mm256_add_epi32(temp1, x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = _mm256_sub_epi32(temp1, x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); // stage 7 if (do_cols) { out[0] = u[0]; out[1] = _mm256_sub_epi32(kZero, u[4]); out[2] = u[6]; out[3] = _mm256_sub_epi32(kZero, u[2]); out[4] = u[3]; out[5] = _mm256_sub_epi32(kZero, u[7]); out[6] = u[5]; out[7] = _mm256_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const __m256i kZero = _mm256_setzero_si256(); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); __m256i u[8], v[8], x; // stage 0 // stage 1 // stage 2 u[0] = _mm256_mullo_epi32(in[7], cospi4); x = _mm256_mullo_epi32(in[0], cospi60); u[0] = _mm256_add_epi32(u[0], x); u[0] = _mm256_add_epi32(u[0], rnding); u[0] = _mm256_srai_epi32(u[0], bit); u[1] = _mm256_mullo_epi32(in[7], cospi60); x = _mm256_mullo_epi32(in[0], cospi4); u[1] = _mm256_sub_epi32(u[1], x); u[1] = _mm256_add_epi32(u[1], rnding); u[1] = _mm256_srai_epi32(u[1], bit); u[2] = _mm256_mullo_epi32(in[5], cospi20); x = _mm256_mullo_epi32(in[2], cospi44); u[2] = _mm256_add_epi32(u[2], x); u[2] = _mm256_add_epi32(u[2], rnding); u[2] = _mm256_srai_epi32(u[2], bit); u[3] = _mm256_mullo_epi32(in[5], cospi44); x = _mm256_mullo_epi32(in[2], cospi20); u[3] = _mm256_sub_epi32(u[3], x); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); u[4] = _mm256_mullo_epi32(in[3], cospi36); x = _mm256_mullo_epi32(in[4], cospi28); u[4] = _mm256_add_epi32(u[4], x); u[4] = _mm256_add_epi32(u[4], rnding); u[4] = _mm256_srai_epi32(u[4], bit); u[5] = _mm256_mullo_epi32(in[3], cospi28); x = _mm256_mullo_epi32(in[4], cospi36); u[5] = _mm256_sub_epi32(u[5], x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_mullo_epi32(in[1], cospi52); x = _mm256_mullo_epi32(in[6], cospi12); u[6] = _mm256_add_epi32(u[6], x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = _mm256_mullo_epi32(in[1], cospi12); x = _mm256_mullo_epi32(in[6], cospi52); u[7] = _mm256_sub_epi32(u[7], x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); // stage 3 addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm256_mullo_epi32(v[4], cospi16); x = _mm256_mullo_epi32(v[5], cospi48); u[4] = _mm256_add_epi32(u[4], x); u[4] = _mm256_add_epi32(u[4], rnding); u[4] = _mm256_srai_epi32(u[4], bit); u[5] = _mm256_mullo_epi32(v[4], cospi48); x = _mm256_mullo_epi32(v[5], cospi16); u[5] = _mm256_sub_epi32(u[5], x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_mullo_epi32(v[6], cospim48); x = _mm256_mullo_epi32(v[7], cospi16); u[6] = _mm256_add_epi32(u[6], x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = _mm256_mullo_epi32(v[6], cospi16); x = _mm256_mullo_epi32(v[7], cospim48); u[7] = _mm256_sub_epi32(u[7], x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); // stage 5 addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = _mm256_mullo_epi32(v[2], cospi32); x = _mm256_mullo_epi32(v[3], cospi32); u[2] = _mm256_add_epi32(v[0], x); u[2] = _mm256_add_epi32(u[2], rnding); u[2] = _mm256_srai_epi32(u[2], bit); u[3] = _mm256_sub_epi32(v[0], x); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); v[0] = _mm256_mullo_epi32(v[6], cospi32); x = _mm256_mullo_epi32(v[7], cospi32); u[6] = 
_mm256_add_epi32(v[0], x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = _mm256_sub_epi32(v[0], x); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); // stage 7 if (do_cols) { out[0] = u[0]; out[1] = _mm256_sub_epi32(kZero, u[4]); out[2] = u[6]; out[3] = _mm256_sub_epi32(kZero, u[2]); out[4] = u[3]; out[5] = _mm256_sub_epi32(kZero, u[7]); out[6] = u[5]; out[7] = _mm256_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); } } static inline void idct64_stage8_avx2( __m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rnding, int bit) { int i; __m256i temp1, temp2, temp3, temp4; temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); u[10] = temp1; temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); u[11] = temp2; for (i = 16; i < 20; ++i) { addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); } temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); u[36] = temp1; u[37] = temp2; u[38] = temp3; u[39] = temp4; temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; } static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rnding, int bit) { int i; __m256i temp1, temp2, temp3, temp4; for (i = 0; i < 8; ++i) { addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); } temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, 
bit); temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); u[20] = temp1; u[21] = temp2; u[22] = temp3; u[23] = temp4; for (i = 32; i < 40; i++) { addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); } for (i = 48; i < 56; i++) { addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); } } static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, const __m256i *cospi32, const __m256i *clamp_lo, const __m256i *clamp_hi, const __m256i *rnding, int bit) { __m256i temp1, temp2, temp3, temp4; for (int i = 0; i < 16; i++) { addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); } temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); u[44] = temp1; u[45] = temp2; u[46] = temp3; u[47] = temp4; } static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, int bd, int out_shift, const __m256i *clamp_lo, const __m256i *clamp_hi) { for (int i = 0; i < 32; i++) { addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); round_shift_8x8_avx2(out + 16, out_shift); round_shift_8x8_avx2(out + 32, out_shift); round_shift_8x8_avx2(out + 48, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); } } static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); { __m256i x; // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); // stage 8 // stage 9 // stage 10 // stage 11 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); if (out_shift != 0) { __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); x = _mm256_add_epi32(x, offset); x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); } } x = _mm256_max_epi32(x, clamp_lo); x = _mm256_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; out[8] = x; out[9] = x; out[10] = x; out[11] = x; out[12] = x; out[13] = x; out[14] = x; out[15] = x; out[16] = x; out[17] = x; out[18] = x; out[19] = x; out[20] = x; out[21] = x; out[22] = x; out[23] = x; out[24] = x; out[25] = x; out[26] = x; out[27] = x; out[28] = x; out[29] = x; out[30] = x; out[31] = x; out[32] = x; out[33] = x; out[34] = x; out[35] = x; out[36] = x; out[37] = x; out[38] = x; out[39] = x; out[40] = x; out[41] = x; out[42] = x; out[43] = x; out[44] = x; out[45] = x; out[46] = x; out[47] = x; out[48] = x; out[49] = x; out[50] = x; out[51] = x; out[52] = x; out[53] = x; out[54] = x; out[55] = x; out[56] = x; out[57] = x; out[58] = x; out[59] = x; out[60] = x; out[61] = x; out[62] = x; out[63] = x; } } static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); { __m256i u[64]; // stage 1 u[0] = in[0]; u[8] = in[4]; u[16] = in[2]; u[24] = in[6]; u[32] = in[1]; u[40] = in[5]; u[48] = in[3]; u[56] = in[7]; // stage 2 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); // stage 3 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); u[33] = u[32]; u[38] = u[39]; u[41] = u[40]; u[46] = u[47]; u[49] = u[48]; u[54] = u[55]; u[57] = u[56]; u[62] = u[63]; // stage 4 __m256i temp1, temp2; u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); u[17] = u[16]; u[22] = u[23]; u[25] = u[24]; u[30] = u[31]; temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); u[33] = temp1; temp2 = 
half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); u[57] = temp2; temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); u[41] = temp1; temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); u[46] = temp2; // stage 5 u[9] = u[8]; u[14] = u[15]; temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); u[17] = temp1; temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); u[22] = temp2; u[35] = u[32]; u[34] = u[33]; u[36] = u[39]; u[37] = u[38]; u[43] = u[40]; u[42] = u[41]; u[44] = u[47]; u[45] = u[46]; u[51] = u[48]; u[50] = u[49]; u[52] = u[55]; u[53] = u[54]; u[59] = u[56]; u[58] = u[57]; u[60] = u[63]; u[61] = u[62]; // stage 6 temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); u[0] = temp1; temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = temp2; u[19] = u[16]; u[18] = u[17]; u[20] = u[23]; u[21] = u[22]; u[27] = u[24]; u[26] = u[25]; u[28] = u[31]; u[29] = u[30]; temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); u[34] = temp1; temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); u[35] = temp2; temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); u[36] = temp1; temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); u[37] = temp2; temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); u[42] = temp1; temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); u[43] = temp2; temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); u[44] = temp1; temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); u[45] = temp2; // stage 7 u[3] = u[0]; u[2] = u[1]; u[11] = u[8]; u[10] = u[9]; u[12] = u[15]; u[13] = u[14]; temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); u[18] = temp1; temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); u[19] = temp2; temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); u[20] = temp1; temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); u[21] = temp2; for (i = 
32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 u[7] = u[0]; u[6] = u[1]; u[5] = u[2]; u[4] = u[3]; idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); // stage 9 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 10 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 11 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospim49 = 
_mm256_set1_epi32(-cospi[49]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); { __m256i u[64]; __m256i tmp1, tmp2, tmp3, tmp4; // stage 1 u[0] = in[0]; u[32] = in[1]; u[36] = in[9]; u[40] = in[5]; u[44] = in[13]; u[48] = in[3]; u[52] = in[11]; u[56] = in[7]; u[60] = in[15]; u[16] = in[2]; u[20] = in[10]; u[24] = in[6]; u[28] = in[14]; u[4] = in[8]; u[8] = in[4]; u[12] = in[12]; // stage 2 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); // stage 3 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); u[33] = u[32]; u[34] = u[35]; u[37] = u[36]; u[38] = u[39]; u[41] = u[40]; u[42] = u[43]; u[45] = u[44]; u[46] = u[47]; u[49] = u[48]; u[50] = u[51]; u[53] = u[52]; u[54] = u[55]; u[57] = u[56]; u[58] = u[59]; u[61] = u[60]; u[62] = u[63]; // stage 4 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); u[17] = u[16]; u[18] = u[19]; u[21] = u[20]; u[22] = u[23]; u[25] = u[24]; u[26] = u[27]; u[29] = u[28]; u[30] = u[31]; tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); u[33] = tmp1; u[34] = tmp2; u[37] = tmp3; u[38] = tmp4; tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); 
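/* As in the u[33]/u[34]/u[37]/u[38] group above, tmp1..tmp4 buffer the first
   output of each rotation so the mirrored pairs can be rewritten in place
   without clobbering operands the partner half_btf_avx2 call still reads. */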
tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); u[41] = tmp1; u[42] = tmp2; u[45] = tmp3; u[46] = tmp4; // stage 5 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); u[9] = u[8]; u[10] = u[11]; u[13] = u[12]; u[14] = u[15]; tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); u[17] = tmp1; u[18] = tmp2; u[21] = tmp3; u[22] = tmp4; for (i = 32; i < 64; i += 8) { addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); u[0] = tmp1; u[5] = u[4]; u[6] = u[7]; tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = tmp1; tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); u[10] = tmp2; for (i = 16; i < 32; i += 8) { addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); u[34] = tmp1; u[35] = tmp2; u[36] = tmp3; u[37] = tmp4; tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); 
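/* Each mirrored pair (u[42],u[53]), (u[43],u[52]), (u[44],u[51]), (u[45],u[50])
   is passed through one 2x2 rotation: the two weight rows (for example
   (cospim40, cospi24) paired with (cospi24, cospi40)) produce its two outputs,
   with tmp1..tmp4 again holding the first outputs until both are computed. */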
u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); u[42] = tmp1; u[43] = tmp2; u[44] = tmp3; u[45] = tmp4; // stage 7 u[3] = u[0]; u[2] = u[1]; tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); u[5] = tmp1; addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); u[18] = tmp1; u[19] = tmp2; u[20] = tmp3; u[21] = tmp4; for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); } idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); // stage 9 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 10 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 11 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); 
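/* Each cospiN constant broadcasts the fixed-point cosine value cospi_arr(bit)[N]
   to all eight 32-bit lanes, and cospimN is its negation.  Judging from the
   inline expansions earlier in this file, the butterfly helpers these constants
   feed compute, per 32-bit lane, roughly:
     half_btf(w0, n0, w1, n1) = (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
     half_btf_0(w0, n0)       = (w0 * n0 + (1 << (bit - 1))) >> bit
   i.e. rounded fixed-point rotations by the named angles. */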
const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); { __m256i u[64], v[64]; // stage 1 u[32] = in[1]; u[34] = in[17]; u[36] = in[9]; u[38] = in[25]; u[40] = in[5]; u[42] = in[21]; u[44] = in[13]; u[46] = in[29]; u[48] = in[3]; u[50] = in[19]; u[52] = in[11]; u[54] = in[27]; u[56] = in[7]; u[58] = in[23]; u[60] = in[15]; u[62] = in[31]; v[16] = in[2]; v[18] = in[18]; v[20] = in[10]; v[22] = in[26]; v[24] = in[6]; v[26] = in[22]; v[28] = in[14]; v[30] = in[30]; u[8] = in[4]; u[10] = in[20]; u[12] = in[12]; u[14] = in[28]; v[4] = in[8]; v[6] = in[24]; u[0] = in[0]; u[2] = in[16]; // stage 2 v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); // stage 3 u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); u[21] = half_btf_0_avx2(&cospim42, &v[26], 
&rnding, bit); u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit); u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); for (i = 32; i < 64; i += 4) { addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } // stage 4 v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); for (i = 16; i < 32; i += 4) { addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); // stage 5 u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); for (i = 8; i < 16; i += 4) { addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } for (i = 16; i < 32; i += 4) { u[i + 0] = v[i + 0]; u[i + 3] = v[i + 3]; } u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); u[26] = 
half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); for (i = 32; i < 64; i += 8) { addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); for (i = 8; i < 16; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); for (i = 16; i < 32; i += 8) { addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, &clamp_hi); addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 8) { v[i + 0] = u[i + 0]; v[i + 1] = u[i + 1]; v[i + 6] = u[i + 6]; v[i + 7] = u[i + 7]; } v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); // stage 7 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; u[7] = v[7]; u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); for (i = 16; i < 32; i += 8) { u[i + 0] = v[i + 0]; u[i + 1] = v[i + 1]; u[i + 6] = v[i + 6]; 
u[i + 7] = v[i + 7]; } u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); } v[8] = u[8]; v[9] = u[9]; v[14] = u[14]; v[15] = u[15]; v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); for (i = 16; i < 20; ++i) { addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, &clamp_hi); } for (i = 32; i < 36; ++i) { v[i] = u[i]; v[i + 12] = u[i + 12]; v[i + 16] = u[i + 16]; v[i + 28] = u[i + 28]; } v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); // stage 9 for (i = 0; i < 8; ++i) { addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); } for (i = 16; i < 20; ++i) { u[i] = v[i]; u[i + 12] = v[i + 12]; } u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); for (i = 32; i < 40; i++) { 
addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); } for (i = 48; i < 56; i++) { addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); } // stage 10 for (i = 0; i < 16; i++) { addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); } for (i = 32; i < 40; i++) v[i] = u[i]; v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); for (i = 56; i < 64; i++) v[i] = u[i]; // stage 11 for (i = 0; i < 32; i++) { addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, &clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); const __m256i clamp_hi_out = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8_avx2(out, out_shift); round_shift_8x8_avx2(out + 16, out_shift); round_shift_8x8_avx2(out + 32, out_shift); round_shift_8x8_avx2(out + 48, out_shift); highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); } } } typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, int do_cols, int bd, int out_shift); static const transform_1d_avx2 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, { NULL, NULL, NULL, NULL }, }, { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } }, { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { __m256i buf1[64 * 8]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 3; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 
3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_avx2 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_avx2 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8; i++) { __m256i buf0[64]; load_buffer_32bit_input(input + i * 8, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0, NewInvSqrt2); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m256i *_buf1 = buf1 + i * 8; if (lr_flip) { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_8x8_flip_avx2( &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); } } else { for (int j = 0; j < buf_size_w_div8; ++j) { transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); round_shift_array_32_avx2(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer if (txfm_size_col >= 16) { for (int i = 0; i < (txfm_size_col >> 4); i++) { highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, output + 16 * i, stride, ud_flip, txfm_size_row, bd); } } else if (txfm_size_col == 8) { highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, bd); } } static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; case IDTX: case H_DCT: case H_ADST: case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, tx_size, eob, bd); break; default: assert(0); break; } } void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_4X8: case TX_8X4: case TX_4X4: case TX_16X4: case TX_4X16: av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param); break; default: av1_highbd_inv_txfm2d_add_universe_avx2( input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd); break; } } aom-3.12.1/av1/common/x86/highbd_inv_txfm_sse4.c000066400000000000000000006652041477627663500212630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include <smmintrin.h> /* SSE4.1 */ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/av1_inv_txfm1d_cfg.h" #include "av1/common/idct.h" #include "av1/common/x86/av1_inv_txfm_ssse3.h" #include "av1/common/x86/av1_txfm_sse2.h" #include "av1/common/x86/av1_txfm_sse4.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" static inline __m128i highbd_clamp_epi16(__m128i u, int bd) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); __m128i clamped, mask; mask = _mm_cmpgt_epi16(u, max); clamped = _mm_andnot_si128(mask, u); mask = _mm_and_si128(mask, max); clamped = _mm_or_si128(mask, clamped); mask = _mm_cmpgt_epi16(clamped, zero); clamped = _mm_and_si128(clamped, mask); return clamped; } static inline void round_shift_4x4(__m128i *in, int shift) { if (shift != 0) { __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rnding); in[1] = _mm_add_epi32(in[1], rnding); in[2] = _mm_add_epi32(in[2], rnding); in[3] = _mm_add_epi32(in[3], rnding); in[0] = _mm_srai_epi32(in[0], shift); in[1] = _mm_srai_epi32(in[1], shift); in[2] = _mm_srai_epi32(in[2], shift); in[3] = _mm_srai_epi32(in[3], shift); } } static void round_shift_8x8(__m128i *in, int shift) { round_shift_4x4(&in[0], shift); round_shift_4x4(&in[4], shift); round_shift_4x4(&in[8], shift); round_shift_4x4(&in[12], shift); } static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, const __m128i *clamp_lo, const __m128i *clamp_hi, int size) { __m128i a0, a1; for (int i = 0; i < size; i += 4) { a0 = _mm_max_epi32(in[i], *clamp_lo); out[i] = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(in[i + 1], *clamp_lo); out[i + 1] = _mm_min_epi32(a1, *clamp_hi); a0 = _mm_max_epi32(in[i + 2], *clamp_lo); out[i + 2] = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(in[i + 3], *clamp_lo); out[i + 3] = _mm_min_epi32(a1, *clamp_hi); } } static inline __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, __m128i res0, __m128i res1, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); __m128i min_clip_val = _mm_setzero_si128(); __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); x0 = _mm_add_epi32(res0, x0); x1 = _mm_add_epi32(res1, x1); x0 = _mm_max_epi32(x0, min_clip_val); x0 = _mm_min_epi32(x0, max_clip_val); x1 = _mm_max_epi32(x1, min_clip_val); x1 = _mm_min_epi32(x1, max_clip_val); x0 = _mm_packus_epi32(x0, x1); return x0; } static inline __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, __m128i res0, const int bd) { __m128i x0 = _mm_cvtepi16_epi32(pred); x0 = _mm_add_epi32(res0, x0); x0 = _mm_packus_epi32(x0, x0); x0 = highbd_clamp_epi16(x0, bd); return x0; } static inline void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; const int step = flipud ?
-1 : 1; for (int i = 0; i < height; ++i, j += step) { __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); _mm_storel_epi64((__m128i *)(output + i * stride), u); } } static inline void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, int stride, int flipud, int height, const int bd) { int j = flipud ? (height - 1) : 0; const int step = flipud ? -1 : 1; for (int i = 0; i < height; ++i, j += step) { __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); _mm_storeu_si128((__m128i *)(output + i * stride), u); } } static inline void load_buffer_32bit_input(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); } } static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) { in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); } void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ __m128i op[4]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); load_buffer_4x4(input, op); // Shift before-hand. op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT); op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT); op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT); op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT); for (int i = 0; i < 2; ++i) { __m128i a1 = op[0]; __m128i c1 = op[1]; __m128i d1 = op[2]; __m128i b1 = op[3]; a1 = _mm_add_epi32(a1, c1); // a1 += c1 d1 = _mm_sub_epi32(d1, b1); // d1 -= b1 __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 e1 = _mm_srai_epi32(e1, 1); b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 a1 = _mm_sub_epi32(a1, b1); // a1 -= b1 d1 = _mm_add_epi32(d1, c1); // d1 += c1 op[0] = a1; op[1] = b1; op[2] = c1; op[3] = d1; if (i == 0) { transpose_32bit_4x4(op, op); } } // Convert to int16_t. The C code checks that we are in range. op[0] = _mm_packs_epi32(op[0], op[1]); op[1] = _mm_packs_epi32(op[2], op[3]); // Load uint16_t. __m128i dst[2]; __m128i tmp[4]; tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]); tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]); // Add to the previous results. dst[0] = _mm_add_epi16(dst[0], op[0]); dst[1] = _mm_add_epi16(dst[1], op[1]); // Clamp. dst[0] = highbd_clamp_epi16(dst[0], bd); dst[1] = highbd_clamp_epi16(dst[1], bd); // Store. 
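/* The packed rows are written back 64 bits (one 4-pixel row) at a time:
 * dst[0] holds rows 0 and 1 in its low and high halves, dst[1] holds rows 2
 * and 3, and _mm_srli_si128(dst[i], 8) moves the upper row into the low half
 * for the second store of each pair. */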
_mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]); dst[0] = _mm_srli_si128(dst[0], 8); _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]); _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]); dst[1] = _mm_srli_si128(dst[1], 8); _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]); } static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, __m128i *out1, const __m128i *clamp_lo, const __m128i *clamp_hi) { __m128i a0 = _mm_add_epi32(in0, in1); __m128i a1 = _mm_sub_epi32(in0, in1); a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, const __m128i *clamp_lo, const __m128i *clamp_hi, int shift) { __m128i offset = _mm_set1_epi32((1 << shift) >> 1); __m128i in0_w_offset = _mm_add_epi32(*in0, offset); __m128i in1_w_offset = _mm_add_epi32(*in1, offset); in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); *in0 = in0_w_offset; *in1 = in1_w_offset; } static inline void idct32_stage4_sse4_1( __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, const __m128i *rounding, int bit) { __m128i temp1, temp2; temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); bf1[17] = temp1; temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); bf1[29] = half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); bf1[18] = temp2; temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); bf1[26] = half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); bf1[21] = temp1; temp2 = half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); bf1[25] = half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); bf1[22] = temp2; } static inline void idct32_stage5_sse4_1( __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rounding, int bit) { __m128i temp1, temp2; temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); bf1[9] = temp1; temp2 = half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); bf1[13] = half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); bf1[10] = temp2; addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); addsub_sse4_1(bf1[30], bf1[29], bf1 + 
30, bf1 + 29, clamp_lo, clamp_hi); } static inline void idct32_stage6_sse4_1( __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rounding, int bit) { __m128i temp1, temp2; temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); bf1[5] = temp1; addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); bf1[29] = half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); bf1[18] = temp1; temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); bf1[28] = half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); bf1[19] = temp2; temp1 = half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); bf1[27] = half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); bf1[20] = temp1; temp2 = half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); bf1[26] = half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); bf1[21] = temp2; } static inline void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rounding, int bit) { __m128i temp1, temp2; addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); bf1[13] = half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); bf1[10] = temp1; temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); bf1[12] = half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); bf1[11] = temp2; addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); } static inline void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rounding, int bit) { __m128i temp1, temp2; addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 
10, clamp_lo, clamp_hi); addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); bf1[27] = half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); bf1[20] = temp1; temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); bf1[26] = half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); bf1[21] = temp2; temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); bf1[25] = half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); bf1[22] = temp1; temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); bf1[24] = half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); bf1[23] = temp2; } static inline void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, const int do_cols, const int bd, const int out_shift, const __m128i *clamp_lo, const __m128i *clamp_hi) { addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); for (int i = 0; i < 32; i += 8) { round_shift_4x4(out + i, out_shift); round_shift_4x4(out + i + 4, out_shift); } highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, __m128i *out1, const __m128i *clamp_lo, const __m128i *clamp_hi, int shift) { __m128i offset = _mm_set1_epi32((1 << shift) >> 1); __m128i a0 = _mm_add_epi32(offset, in0); __m128i a1 = _mm_sub_epi32(offset, in1); a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); a0 = _mm_max_epi32(a0, *clamp_lo); a0 = _mm_min_epi32(a0, *clamp_hi); a1 = _mm_max_epi32(a1, *clamp_lo); a1 = _mm_min_epi32(a1, *clamp_hi); *out0 = a0; *out1 = a1; } static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); 
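/* Each butterfly below is evaluated as (a * c0 +/- b * c1 + rnding) >> bit,
 * i.e. fixed-point rounding to 'bit' fractional bits, and the add/sub results
 * are clamped to a signed log_range-bit interval.  As a worked example,
 * assuming bd == 10:
 *   first pass (do_cols == 0): log_range = AOMMAX(16, 10 + 8) = 18
 *                              -> clamp to [-(1 << 17), (1 << 17) - 1]
 *   final pass (do_cols == 1): log_range = AOMMAX(16, 10 + 6) = 16
 *                              -> clamp to [-(1 << 15), (1 << 15) - 1] */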
int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3, x, y; // Stage 0 // Stage 1 // Stage 2 u0 = in[0]; u1 = in[1]; u2 = in[2]; u3 = in[3]; x = _mm_mullo_epi32(u0, cospi32); y = _mm_mullo_epi32(u2, cospi32); v0 = _mm_add_epi32(x, y); v0 = _mm_add_epi32(v0, rnding); v0 = _mm_srai_epi32(v0, bit); v1 = _mm_sub_epi32(x, y); v1 = _mm_add_epi32(v1, rnding); v1 = _mm_srai_epi32(v1, bit); x = _mm_mullo_epi32(u1, cospi48); y = _mm_mullo_epi32(u3, cospim16); v2 = _mm_add_epi32(x, y); v2 = _mm_add_epi32(v2, rnding); v2 = _mm_srai_epi32(v2, bit); x = _mm_mullo_epi32(u1, cospi16); y = _mm_mullo_epi32(u3, cospi48); v3 = _mm_add_epi32(x, y); v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); // Stage 3 addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); if (!do_cols) { log_range = AOMMAX(16, bd + 6); clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); } } static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *sinpi = sinpi_arr(bit); const __m128i zero = _mm_setzero_si128(); __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); rnding = _mm_unpacklo_epi32(rnding, zero); const __m128i mul = _mm_set1_epi32(1 << 4); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); __m128i t; __m128i s0, s1, s2, s3, s4, s5, s6, s7; __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; __m128i u0_low, u1_low, u2_low, u3_low; __m128i u0_high, u1_high, u2_high, u3_high; x0 = in[0]; x1 = in[1]; x2 = in[2]; x3 = in[3]; s0 = _mm_mullo_epi32(x0, sinpi1); s1 = _mm_mullo_epi32(x0, sinpi2); s2 = _mm_mullo_epi32(x1, sinpi3); s3 = _mm_mullo_epi32(x2, sinpi4); s4 = _mm_mullo_epi32(x2, sinpi1); s5 = _mm_mullo_epi32(x3, sinpi2); s6 = _mm_mullo_epi32(x3, sinpi4); t = _mm_sub_epi32(x0, x2); s7 = _mm_add_epi32(t, x3); t = _mm_add_epi32(s0, s3); s0 = _mm_add_epi32(t, s5); t = _mm_sub_epi32(s1, s4); s1 = _mm_sub_epi32(t, s6); s3 = s2; s2 = _mm_mullo_epi32(s7, sinpi3); u0 = _mm_add_epi32(s0, s3); u1 = _mm_add_epi32(s1, s3); u2 = s2; t = _mm_add_epi32(s0, s1); u3 = _mm_sub_epi32(t, s3); // u0 u0_low = _mm_mul_epi32(u0, mul); u0_low = _mm_add_epi64(u0_low, rnding); u0 = _mm_srli_si128(u0, 4); u0_high = _mm_mul_epi32(u0, mul); u0_high = _mm_add_epi64(u0_high, rnding); u0_low = _mm_srli_si128(u0_low, 2); u0_high = _mm_srli_si128(u0_high, 2); u0 = _mm_unpacklo_epi32(u0_low, u0_high); u0_high = _mm_unpackhi_epi32(u0_low, u0_high); u0 = _mm_unpacklo_epi64(u0, u0_high); // u1 u1_low = _mm_mul_epi32(u1, mul); u1_low = _mm_add_epi64(u1_low, rnding); u1 = _mm_srli_si128(u1, 4); u1_high = _mm_mul_epi32(u1, mul); u1_high = _mm_add_epi64(u1_high, rnding); u1_low = _mm_srli_si128(u1_low, 2); u1_high = _mm_srli_si128(u1_high, 2); u1 = _mm_unpacklo_epi32(u1_low, u1_high); u1_high = _mm_unpackhi_epi32(u1_low, u1_high); u1 = _mm_unpacklo_epi64(u1, u1_high); // u2 u2_low = _mm_mul_epi32(u2, mul); u2_low = _mm_add_epi64(u2_low, rnding); u2 = _mm_srli_si128(u2, 4); u2_high = _mm_mul_epi32(u2, mul); 
u2_high = _mm_add_epi64(u2_high, rnding); u2_low = _mm_srli_si128(u2_low, 2); u2_high = _mm_srli_si128(u2_high, 2); u2 = _mm_unpacklo_epi32(u2_low, u2_high); u2_high = _mm_unpackhi_epi32(u2_low, u2_high); u2 = _mm_unpacklo_epi64(u2, u2_high); // u3 u3_low = _mm_mul_epi32(u3, mul); u3_low = _mm_add_epi64(u3_low, rnding); u3 = _mm_srli_si128(u3, 4); u3_high = _mm_mul_epi32(u3, mul); u3_high = _mm_add_epi64(u3_high, rnding); u3_low = _mm_srli_si128(u3_low, 2); u3_high = _mm_srli_si128(u3_high, 2); u3 = _mm_unpacklo_epi32(u3_low, u3_high); u3_high = _mm_unpackhi_epi32(u3_low, u3_high); u3 = _mm_unpacklo_epi64(u3, u3_high); out[0] = u0; out[1] = u1; out[2] = u2; out[3] = u3; if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); round_shift_4x4(out, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); } } static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, int fliplr, int flipud, int shift, int bd) { const __m128i zero = _mm_setzero_si128(); __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; round_shift_4x4(in, shift); v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); v0 = _mm_unpacklo_epi16(v0, zero); v1 = _mm_unpacklo_epi16(v1, zero); v2 = _mm_unpacklo_epi16(v2, zero); v3 = _mm_unpacklo_epi16(v3, zero); if (fliplr) { in[0] = _mm_shuffle_epi32(in[0], 0x1B); in[1] = _mm_shuffle_epi32(in[1], 0x1B); in[2] = _mm_shuffle_epi32(in[2], 0x1B); in[3] = _mm_shuffle_epi32(in[3], 0x1B); } if (flipud) { u0 = _mm_add_epi32(in[3], v0); u1 = _mm_add_epi32(in[2], v1); u2 = _mm_add_epi32(in[1], v2); u3 = _mm_add_epi32(in[0], v3); } else { u0 = _mm_add_epi32(in[0], v0); u1 = _mm_add_epi32(in[1], v1); u2 = _mm_add_epi32(in[2], v2); u3 = _mm_add_epi32(in[3], v3); } v0 = _mm_packus_epi32(u0, u1); v2 = _mm_packus_epi32(u2, u3); u0 = highbd_clamp_epi16(v0, bd); u2 = highbd_clamp_epi16(v2, bd); v0 = _mm_unpacklo_epi64(u0, u0); v1 = _mm_unpackhi_epi64(u0, u0); v2 = _mm_unpacklo_epi64(u2, u2); v3 = _mm_unpackhi_epi64(u2, u2); _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); } static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; __m128i zero = _mm_setzero_si128(); __m128i fact = _mm_set1_epi32(NewSqrt2); __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); __m128i a0_low, a1_low; __m128i a0_high, a1_high; offset = _mm_unpacklo_epi32(offset, zero); for (int i = 0; i < 4; i++) { a0_low = _mm_mul_epi32(in[i], fact); a0_low = _mm_add_epi32(a0_low, offset); a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); a0_high = _mm_srli_si128(in[i], 4); a0_high = _mm_mul_epi32(a0_high, fact); a0_high = _mm_add_epi32(a0_high, offset); a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); a1_low = _mm_unpacklo_epi32(a0_low, a0_high); a1_high = _mm_unpackhi_epi32(a0_low, a0_high); out[i] = _mm_unpacklo_epi64(a1_low, a1_high); } if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); round_shift_4x4(out, 
out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); } } void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; switch (tx_type) { case DCT_DCT: load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case IDTX: load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_DCT: load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_DCT: load_buffer_4x4(input, in); idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case V_ADST: load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); break; case H_ADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 0, 
-shift[1], bd); break; case V_FLIPADST: load_buffer_4x4(input, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; case H_FLIPADST: load_buffer_4x4(input, in); iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); transpose_32bit_4x4(in, in); iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); break; default: assert(0); } } // 8x8 static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); } static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x, y; int col; // Note: // Even column: 0, 2, ..., 14 // Odd column: 1, 3, ..., 15 // one even column plus one odd column constructs one row (8 coeffs) // total we have 8 rows (8x8). 
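/* Concretely: each __m128i carries four 32-bit coefficients, so row r of the
 * 8x8 block is the register pair (in[2 * r], in[2 * r + 1]).  The loop below
 * therefore runs the same 1-D transform twice, once for col = 0 (the first
 * four columns) and once for col = 1 (the remaining four columns). */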
for (col = 0; col < 2; ++col) { // stage 0 // stage 1 // stage 2 u0 = in[0 * 2 + col]; u1 = in[4 * 2 + col]; u2 = in[2 * 2 + col]; u3 = in[6 * 2 + col]; x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); u4 = _mm_add_epi32(x, y); u4 = _mm_add_epi32(u4, rnding); u4 = _mm_srai_epi32(u4, bit); x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); u7 = _mm_add_epi32(x, y); u7 = _mm_add_epi32(u7, rnding); u7 = _mm_srai_epi32(u7, bit); x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); u5 = _mm_add_epi32(x, y); u5 = _mm_add_epi32(u5, rnding); u5 = _mm_srai_epi32(u5, bit); x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); u6 = _mm_add_epi32(x, y); u6 = _mm_add_epi32(u6, rnding); u6 = _mm_srai_epi32(u6, bit); // stage 3 x = _mm_mullo_epi32(u0, cospi32); y = _mm_mullo_epi32(u1, cospi32); v0 = _mm_add_epi32(x, y); v0 = _mm_add_epi32(v0, rnding); v0 = _mm_srai_epi32(v0, bit); v1 = _mm_sub_epi32(x, y); v1 = _mm_add_epi32(v1, rnding); v1 = _mm_srai_epi32(v1, bit); x = _mm_mullo_epi32(u2, cospi48); y = _mm_mullo_epi32(u3, cospim16); v2 = _mm_add_epi32(x, y); v2 = _mm_add_epi32(v2, rnding); v2 = _mm_srai_epi32(v2, bit); x = _mm_mullo_epi32(u2, cospi16); y = _mm_mullo_epi32(u3, cospi48); v3 = _mm_add_epi32(x, y); v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); // stage 4 addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); u4 = v4; u7 = v7; x = _mm_mullo_epi32(v5, cospi32); y = _mm_mullo_epi32(v6, cospi32); u6 = _mm_add_epi32(y, x); u6 = _mm_add_epi32(u6, rnding); u6 = _mm_srai_epi32(u6, bit); u5 = _mm_sub_epi32(y, x); u5 = _mm_add_epi32(u5, rnding); u5 = _mm_srai_epi32(u5, bit); // stage 5 addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, &clamp_hi); addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, &clamp_hi); addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, &clamp_hi); addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, &clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i kZero = _mm_setzero_si128(); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[8], v[8], x; // Even 8 points: 0, 2, ..., 14 // stage 0 // stage 1 // stage 2 // (1) u[0] = _mm_mullo_epi32(in[14], cospi4); x = _mm_mullo_epi32(in[0], cospi60); u[0] = _mm_add_epi32(u[0], x); u[0] = _mm_add_epi32(u[0], rnding); u[0] = _mm_srai_epi32(u[0], bit); u[1] = _mm_mullo_epi32(in[14], cospi60); x = _mm_mullo_epi32(in[0], cospi4); u[1] = _mm_sub_epi32(u[1], x); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); // (2) u[2] = _mm_mullo_epi32(in[10], cospi20); x = _mm_mullo_epi32(in[4], cospi44); u[2] = _mm_add_epi32(u[2], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_mullo_epi32(in[10], cospi44); x = _mm_mullo_epi32(in[4], cospi20); u[3] = _mm_sub_epi32(u[3], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); // (3) u[4] = _mm_mullo_epi32(in[6], cospi36); x = _mm_mullo_epi32(in[8], cospi28); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(in[6], cospi28); x = _mm_mullo_epi32(in[8], cospi36); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); // (4) u[6] = _mm_mullo_epi32(in[2], cospi52); x = _mm_mullo_epi32(in[12], cospi12); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(in[2], cospi12); x = _mm_mullo_epi32(in[12], cospi52); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 3 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm_mullo_epi32(v[4], cospi16); x = _mm_mullo_epi32(v[5], cospi48); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(v[4], cospi48); x = _mm_mullo_epi32(v[5], cospi16); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_mullo_epi32(v[6], cospim48); x = _mm_mullo_epi32(v[7], cospi16); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(v[6], cospi16); x = _mm_mullo_epi32(v[7], cospim48); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 5 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = _mm_mullo_epi32(v[2], cospi32); x = _mm_mullo_epi32(v[3], cospi32); u[2] = _mm_add_epi32(v[0], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_sub_epi32(v[0], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); v[0] = _mm_mullo_epi32(v[6], cospi32); x = _mm_mullo_epi32(v[7], cospi32); u[6] = _mm_add_epi32(v[0], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_sub_epi32(v[0], x); u[7] = 
_mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 7 if (do_cols) { out[0] = u[0]; out[2] = _mm_sub_epi32(kZero, u[4]); out[4] = u[6]; out[6] = _mm_sub_epi32(kZero, u[2]); out[8] = u[3]; out[10] = _mm_sub_epi32(kZero, u[7]); out[12] = u[5]; out[14] = _mm_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, &clamp_hi_out, out_shift); } // Odd 8 points: 1, 3, ..., 15 // stage 0 // stage 1 // stage 2 // (1) u[0] = _mm_mullo_epi32(in[15], cospi4); x = _mm_mullo_epi32(in[1], cospi60); u[0] = _mm_add_epi32(u[0], x); u[0] = _mm_add_epi32(u[0], rnding); u[0] = _mm_srai_epi32(u[0], bit); u[1] = _mm_mullo_epi32(in[15], cospi60); x = _mm_mullo_epi32(in[1], cospi4); u[1] = _mm_sub_epi32(u[1], x); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); // (2) u[2] = _mm_mullo_epi32(in[11], cospi20); x = _mm_mullo_epi32(in[5], cospi44); u[2] = _mm_add_epi32(u[2], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_mullo_epi32(in[11], cospi44); x = _mm_mullo_epi32(in[5], cospi20); u[3] = _mm_sub_epi32(u[3], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); // (3) u[4] = _mm_mullo_epi32(in[7], cospi36); x = _mm_mullo_epi32(in[9], cospi28); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(in[7], cospi28); x = _mm_mullo_epi32(in[9], cospi36); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); // (4) u[6] = _mm_mullo_epi32(in[3], cospi52); x = _mm_mullo_epi32(in[13], cospi12); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(in[3], cospi12); x = _mm_mullo_epi32(in[13], cospi52); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 3 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm_mullo_epi32(v[4], cospi16); x = _mm_mullo_epi32(v[5], cospi48); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(v[4], cospi48); x = _mm_mullo_epi32(v[5], cospi16); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_mullo_epi32(v[6], cospim48); x = _mm_mullo_epi32(v[7], cospi16); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(v[6], cospi16); x = _mm_mullo_epi32(v[7], cospim48); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 5 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, 
&clamp_hi); addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = _mm_mullo_epi32(v[2], cospi32); x = _mm_mullo_epi32(v[3], cospi32); u[2] = _mm_add_epi32(v[0], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_sub_epi32(v[0], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); v[0] = _mm_mullo_epi32(v[6], cospi32); x = _mm_mullo_epi32(v[7], cospi32); u[6] = _mm_add_epi32(v[0], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_sub_epi32(v[0], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 7 if (do_cols) { out[1] = u[0]; out[3] = _mm_sub_epi32(kZero, u[4]); out[5] = u[6]; out[7] = _mm_sub_epi32(kZero, u[2]); out[9] = u[3]; out[11] = _mm_sub_epi32(kZero, u[7]); out[13] = u[5]; out[15] = _mm_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; out[0] = _mm_add_epi32(in[0], in[0]); out[1] = _mm_add_epi32(in[1], in[1]); out[2] = _mm_add_epi32(in[2], in[2]); out[3] = _mm_add_epi32(in[3], in[3]); out[4] = _mm_add_epi32(in[4], in[4]); out[5] = _mm_add_epi32(in[5], in[5]); out[6] = _mm_add_epi32(in[6], in[6]); out[7] = _mm_add_epi32(in[7], in[7]); if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); round_shift_4x4(out, out_shift); round_shift_4x4(out + 4, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); } } static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, int fliplr, int bd) { __m128i x0, x1; const __m128i zero = _mm_setzero_si128(); x0 = _mm_unpacklo_epi16(pred, zero); x1 = _mm_unpackhi_epi16(pred, zero); if (fliplr) { res_lo = _mm_shuffle_epi32(res_lo, 0x1B); res_hi = _mm_shuffle_epi32(res_hi, 0x1B); x0 = _mm_add_epi32(res_hi, x0); x1 = _mm_add_epi32(res_lo, x1); } else { x0 = _mm_add_epi32(res_lo, x0); x1 = _mm_add_epi32(res_hi, x1); } x0 = _mm_packus_epi32(x0, x1); return highbd_clamp_epi16(x0, bd); } static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, int fliplr, int flipud, int shift, int bd) { __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; round_shift_8x8(in, shift); v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); v7 = _mm_load_si128((__m128i const *)(output + 7 * 
stride)); if (flipud) { u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); } else { u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); } _mm_store_si128((__m128i *)(output + 0 * stride), u0); _mm_store_si128((__m128i *)(output + 1 * stride), u1); _mm_store_si128((__m128i *)(output + 2 * stride), u2); _mm_store_si128((__m128i *)(output + 3 * stride), u3); _mm_store_si128((__m128i *)(output + 4 * stride), u4); _mm_store_si128((__m128i *)(output + 5 * stride), u5); _mm_store_si128((__m128i *)(output + 6 * stride), u6); _mm_store_si128((__m128i *)(output + 7 * stride), u7); } void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case DCT_ADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case ADST_DCT: load_buffer_8x8(input, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case ADST_ADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); break; case FLIPADST_DCT: load_buffer_8x8(input, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); break; case DCT_FLIPADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); break; case ADST_FLIPADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); break; case FLIPADST_ADST: load_buffer_8x8(input, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); 
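/* Every case in this switch follows the same pattern: load the 8x8
 * coefficients, run the first 1-D pass (do_cols == 0, rounded by -shift[0]),
 * transpose, run the second pass (do_cols == 1), then write_buffer_8x8()
 * applies -shift[1] together with the fliplr/flipud flags implied by the
 * tx_type. */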
transpose_8x8(out, in); iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); break; default: assert(0); } } static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i x; // stage 0 // stage 1 // stage 2 // stage 3 x = _mm_mullo_epi32(in[0], cospi32); x = _mm_add_epi32(x, rnding); x = _mm_srai_epi32(x, bit); // stage 4 // stage 5 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); x = _mm_add_epi32(x, offset); x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); } x = _mm_max_epi32(x, clamp_lo); x = _mm_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; } static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x, y; // stage 0 // stage 1 // stage 2 u0 = in[0]; u1 = in[4]; u2 = in[2]; u3 = in[6]; x = _mm_mullo_epi32(in[1], cospi56); y = _mm_mullo_epi32(in[7], cospim8); u4 = _mm_add_epi32(x, y); u4 = _mm_add_epi32(u4, rnding); u4 = _mm_srai_epi32(u4, bit); x = _mm_mullo_epi32(in[1], cospi8); y = _mm_mullo_epi32(in[7], cospi56); u7 = _mm_add_epi32(x, y); u7 = _mm_add_epi32(u7, rnding); u7 = _mm_srai_epi32(u7, bit); x = _mm_mullo_epi32(in[5], cospi24); y = _mm_mullo_epi32(in[3], cospim40); u5 = _mm_add_epi32(x, y); u5 = _mm_add_epi32(u5, rnding); u5 = _mm_srai_epi32(u5, bit); x = _mm_mullo_epi32(in[5], cospi40); y = _mm_mullo_epi32(in[3], cospi24); u6 = _mm_add_epi32(x, y); u6 = _mm_add_epi32(u6, rnding); u6 = _mm_srai_epi32(u6, bit); // stage 3 x = _mm_mullo_epi32(u0, cospi32); y = _mm_mullo_epi32(u1, cospi32); v0 = _mm_add_epi32(x, y); v0 = _mm_add_epi32(v0, rnding); v0 = _mm_srai_epi32(v0, bit); v1 = _mm_sub_epi32(x, y); v1 = _mm_add_epi32(v1, rnding); v1 = _mm_srai_epi32(v1, bit); x = _mm_mullo_epi32(u2, cospi48); y = _mm_mullo_epi32(u3, cospim16); v2 = _mm_add_epi32(x, y); v2 = _mm_add_epi32(v2, rnding); v2 = _mm_srai_epi32(v2, bit); x = _mm_mullo_epi32(u2, cospi16); y = _mm_mullo_epi32(u3, cospi48); v3 = _mm_add_epi32(x, y); v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); // stage 4 addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); u4 = v4; u7 = v7; x = _mm_mullo_epi32(v5, cospi32); y = _mm_mullo_epi32(v6, cospi32); u6 = _mm_add_epi32(y, x); u6 = _mm_add_epi32(u6, rnding); u6 = _mm_srai_epi32(u6, bit); u5 = _mm_sub_epi32(y, x); u5 = _mm_add_epi32(u5, rnding); u5 = _mm_srai_epi32(u5, bit); // stage 5 addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_4x4(out, out_shift); round_shift_4x4(out + 4, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); } } static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i kZero = _mm_setzero_si128(); __m128i u[8], x; // stage 0 // stage 1 // stage 2 x = _mm_mullo_epi32(in[0], cospi60); u[0] = _mm_add_epi32(x, rnding); u[0] = _mm_srai_epi32(u[0], bit); x = _mm_mullo_epi32(in[0], cospi4); u[1] = _mm_sub_epi32(kZero, x); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); // stage 3 // stage 4 __m128i temp1, temp2; temp1 = _mm_mullo_epi32(u[0], cospi16); x = _mm_mullo_epi32(u[1], 
cospi48); temp1 = _mm_add_epi32(temp1, x); temp1 = _mm_add_epi32(temp1, rnding); temp1 = _mm_srai_epi32(temp1, bit); u[4] = temp1; temp2 = _mm_mullo_epi32(u[0], cospi48); x = _mm_mullo_epi32(u[1], cospi16); u[5] = _mm_sub_epi32(temp2, x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); // stage 5 // stage 6 temp1 = _mm_mullo_epi32(u[0], cospi32); x = _mm_mullo_epi32(u[1], cospi32); u[2] = _mm_add_epi32(temp1, x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_sub_epi32(temp1, x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); temp1 = _mm_mullo_epi32(u[4], cospi32); x = _mm_mullo_epi32(u[5], cospi32); u[6] = _mm_add_epi32(temp1, x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_sub_epi32(temp1, x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 7 if (do_cols) { out[0] = u[0]; out[1] = _mm_sub_epi32(kZero, u[4]); out[2] = u[6]; out[3] = _mm_sub_epi32(kZero, u[2]); out[4] = u[3]; out[5] = _mm_sub_epi32(kZero, u[7]); out[6] = u[5]; out[7] = _mm_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i kZero = _mm_setzero_si128(); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[8], v[8], x; // stage 0 // stage 1 // stage 2 u[0] = _mm_mullo_epi32(in[7], cospi4); x = _mm_mullo_epi32(in[0], cospi60); u[0] = _mm_add_epi32(u[0], x); u[0] = _mm_add_epi32(u[0], rnding); u[0] = _mm_srai_epi32(u[0], bit); u[1] = _mm_mullo_epi32(in[7], cospi60); x = _mm_mullo_epi32(in[0], cospi4); u[1] = _mm_sub_epi32(u[1], x); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); // (2) u[2] = _mm_mullo_epi32(in[5], cospi20); x = _mm_mullo_epi32(in[2], cospi44); u[2] = _mm_add_epi32(u[2], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_mullo_epi32(in[5], cospi44); x = _mm_mullo_epi32(in[2], cospi20); u[3] = _mm_sub_epi32(u[3], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); // (3) u[4] = _mm_mullo_epi32(in[3], cospi36); x = _mm_mullo_epi32(in[4], cospi28); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(in[3], cospi28); x = _mm_mullo_epi32(in[4], cospi36); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); // (4) u[6] = _mm_mullo_epi32(in[1], cospi52); x = _mm_mullo_epi32(in[6], cospi12); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(in[1], cospi12); x = _mm_mullo_epi32(in[6], cospi52); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 3 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); // stage 4 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm_mullo_epi32(v[4], cospi16); x = _mm_mullo_epi32(v[5], cospi48); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[5] = _mm_mullo_epi32(v[4], cospi48); x = _mm_mullo_epi32(v[5], cospi16); u[5] = _mm_sub_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_mullo_epi32(v[6], cospim48); x = _mm_mullo_epi32(v[7], cospi16); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_mullo_epi32(v[6], cospi16); x = _mm_mullo_epi32(v[7], cospim48); u[7] = _mm_sub_epi32(u[7], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); // stage 5 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); // stage 6 u[0] = v[0]; u[1] = v[1]; u[4] = v[4]; u[5] = v[5]; v[0] = _mm_mullo_epi32(v[2], cospi32); x = _mm_mullo_epi32(v[3], cospi32); u[2] = _mm_add_epi32(v[0], x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_sub_epi32(v[0], x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); v[0] = _mm_mullo_epi32(v[6], cospi32); x = _mm_mullo_epi32(v[7], cospi32); u[6] = _mm_add_epi32(v[0], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = _mm_sub_epi32(v[0], x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); 
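// Descriptive note on the final stage that follows: it reorders the
// intermediate values into the ADST-8 output order, negating the odd-indexed
// outputs; for row transforms (!do_cols) neg_shift_sse4_1() additionally
// applies the out_shift rounding shift and clamps each pair to the output
// range derived from bd.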
// stage 7 if (do_cols) { out[0] = u[0]; out[1] = _mm_sub_epi32(kZero, u[4]); out[2] = u[6]; out[3] = _mm_sub_epi32(kZero, u[2]); out[4] = u[3]; out[5] = _mm_sub_epi32(kZero, u[7]); out[6] = u[5]; out[7] = _mm_sub_epi32(kZero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); // stage 0 // stage 1 // stage 2 // stage 3 // stage 4 in[0] = _mm_mullo_epi32(in[0], cospi32); in[0] = _mm_add_epi32(in[0], rnding); in[0] = _mm_srai_epi32(in[0], bit); // stage 5 // stage 6 // stage 7 if (!do_cols) { log_range = AOMMAX(16, bd + 6); clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); if (out_shift != 0) { __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); in[0] = _mm_add_epi32(in[0], offset); in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); } } in[0] = _mm_max_epi32(in[0], clamp_lo); in[0] = _mm_min_epi32(in[0], clamp_hi); out[0] = in[0]; out[1] = in[0]; out[2] = in[0]; out[3] = in[0]; out[4] = in[0]; out[5] = in[0]; out[6] = in[0]; out[7] = in[0]; out[8] = in[0]; out[9] = in[0]; out[10] = in[0]; out[11] = in[0]; out[12] = in[0]; out[13] = in[0]; out[14] = in[0]; out[15] = in[0]; } static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], x, y; // stage 0 // stage 1 u[0] = in[0]; u[2] = in[4]; u[4] = in[2]; u[6] = in[6]; u[8] = in[1]; u[10] = in[5]; u[12] = in[3]; u[14] = in[7]; // stage 2 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); // stage 3 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = _mm_mullo_epi32(u[0], cospi32); u[0] = _mm_add_epi32(x, rnding); u[0] = _mm_srai_epi32(u[0], bit); u[1] = u[0]; u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = x; y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); u[10] = y; // stage 5 addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); x = _mm_mullo_epi32(u[5], cospi32); y = _mm_mullo_epi32(u[6], cospi32); u[5] = _mm_sub_epi32(y, x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_add_epi32(y, x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); x = _mm_mullo_epi32(u[10], cospi32); y = _mm_mullo_epi32(u[13], cospi32); u[10] = _mm_sub_epi32(y, x); u[10] = _mm_add_epi32(u[10], rnding); u[10] = _mm_srai_epi32(u[10], bit); u[13] = _mm_add_epi32(x, y); u[13] = _mm_add_epi32(u[13], rnding); u[13] = _mm_srai_epi32(u[13], bit); x = _mm_mullo_epi32(u[11], cospi32); y = _mm_mullo_epi32(u[12], cospi32); u[11] = _mm_sub_epi32(y, x); u[11] = _mm_add_epi32(u[11], rnding); u[11] = _mm_srai_epi32(u[11], bit); u[12] = _mm_add_epi32(x, y); u[12] = _mm_add_epi32(u[12], rnding); u[12] = _mm_srai_epi32(u[12], bit); // stage 7 addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); 
addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i zero = _mm_setzero_si128(); __m128i v[16], x, y, temp1, temp2; // stage 0 // stage 1 // stage 2 x = _mm_mullo_epi32(in[0], cospi62); v[0] = _mm_add_epi32(x, rnding); v[0] = _mm_srai_epi32(v[0], bit); x = _mm_mullo_epi32(in[0], cospi2); v[1] = _mm_sub_epi32(zero, x); v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); // stage 3 v[8] = v[0]; v[9] = v[1]; // stage 4 temp1 = _mm_mullo_epi32(v[8], cospi8); x = _mm_mullo_epi32(v[9], cospi56); temp1 = _mm_add_epi32(temp1, x); temp1 = _mm_add_epi32(temp1, rnding); temp1 = _mm_srai_epi32(temp1, bit); temp2 = _mm_mullo_epi32(v[8], cospi56); x = _mm_mullo_epi32(v[9], cospi8); temp2 = _mm_sub_epi32(temp2, x); temp2 = _mm_add_epi32(temp2, rnding); temp2 = _mm_srai_epi32(temp2, bit); v[8] = temp1; v[9] = temp2; // stage 5 v[4] = v[0]; v[5] = v[1]; v[12] = v[8]; v[13] = v[9]; // stage 6 temp1 = _mm_mullo_epi32(v[4], cospi16); x = _mm_mullo_epi32(v[5], cospi48); temp1 = _mm_add_epi32(temp1, x); temp1 = _mm_add_epi32(temp1, rnding); temp1 = _mm_srai_epi32(temp1, bit); temp2 = _mm_mullo_epi32(v[4], cospi48); x = _mm_mullo_epi32(v[5], cospi16); temp2 = _mm_sub_epi32(temp2, x); temp2 = _mm_add_epi32(temp2, rnding); temp2 = _mm_srai_epi32(temp2, bit); v[4] = temp1; v[5] = temp2; temp1 = _mm_mullo_epi32(v[12], cospi16); x = _mm_mullo_epi32(v[13], cospi48); temp1 = _mm_add_epi32(temp1, x); temp1 = _mm_add_epi32(temp1, rnding); temp1 = _mm_srai_epi32(temp1, bit); temp2 = _mm_mullo_epi32(v[12], cospi48); x = _mm_mullo_epi32(v[13], cospi16); temp2 = _mm_sub_epi32(temp2, x); temp2 = _mm_add_epi32(temp2, rnding); temp2 = _mm_srai_epi32(temp2, bit); v[12] = temp1; v[13] = temp2; // stage 7 v[2] = v[0]; v[3] = v[1]; v[6] = v[4]; v[7] = v[5]; v[10] = v[8]; v[11] = v[9]; v[14] = v[12]; v[15] = v[13]; // stage 8 y = _mm_mullo_epi32(v[2], cospi32); x = _mm_mullo_epi32(v[3], cospi32); v[2] = _mm_add_epi32(y, x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); v[3] = _mm_sub_epi32(y, x); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); y = _mm_mullo_epi32(v[6], cospi32); x = _mm_mullo_epi32(v[7], cospi32); v[6] = _mm_add_epi32(y, x); v[6] = _mm_add_epi32(v[6], rnding); 
v[6] = _mm_srai_epi32(v[6], bit); v[7] = _mm_sub_epi32(y, x); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); y = _mm_mullo_epi32(v[10], cospi32); x = _mm_mullo_epi32(v[11], cospi32); v[10] = _mm_add_epi32(y, x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[11] = _mm_sub_epi32(y, x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); y = _mm_mullo_epi32(v[14], cospi32); x = _mm_mullo_epi32(v[15], cospi32); v[14] = _mm_add_epi32(y, x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_sub_epi32(y, x); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = _mm_sub_epi32(zero, v[8]); out[2] = v[12]; out[3] = _mm_sub_epi32(zero, v[4]); out[4] = v[6]; out[5] = _mm_sub_epi32(zero, v[14]); out[6] = v[10]; out[7] = _mm_sub_epi32(zero, v[2]); out[8] = v[3]; out[9] = _mm_sub_epi32(zero, v[11]); out[10] = v[15]; out[11] = _mm_sub_epi32(zero, v[7]); out[12] = v[5]; out[13] = _mm_sub_epi32(zero, v[13]); out[14] = v[9]; out[15] = _mm_sub_epi32(zero, v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const 
__m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i zero = _mm_setzero_si128(); __m128i u[16], x, y; // stage 0 // stage 1 // stage 2 x = _mm_mullo_epi32(in[0], cospi62); u[0] = _mm_add_epi32(x, rnding); u[0] = _mm_srai_epi32(u[0], bit); x = _mm_mullo_epi32(in[0], cospi2); u[1] = _mm_sub_epi32(zero, x); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); x = _mm_mullo_epi32(in[2], cospi54); u[2] = _mm_add_epi32(x, rnding); u[2] = _mm_srai_epi32(u[2], bit); x = _mm_mullo_epi32(in[2], cospi10); u[3] = _mm_sub_epi32(zero, x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); x = _mm_mullo_epi32(in[4], cospi46); u[4] = _mm_add_epi32(x, rnding); u[4] = _mm_srai_epi32(u[4], bit); x = _mm_mullo_epi32(in[4], cospi18); u[5] = _mm_sub_epi32(zero, x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); x = _mm_mullo_epi32(in[6], cospi38); u[6] = _mm_add_epi32(x, rnding); u[6] = _mm_srai_epi32(u[6], bit); x = _mm_mullo_epi32(in[6], cospi26); u[7] = _mm_sub_epi32(zero, x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); u[8] = _mm_mullo_epi32(in[7], cospi34); u[8] = _mm_add_epi32(u[8], rnding); u[8] = _mm_srai_epi32(u[8], bit); u[9] = _mm_mullo_epi32(in[7], cospi30); u[9] = _mm_add_epi32(u[9], rnding); u[9] = _mm_srai_epi32(u[9], bit); u[10] = _mm_mullo_epi32(in[5], cospi42); u[10] = _mm_add_epi32(u[10], rnding); u[10] = _mm_srai_epi32(u[10], bit); u[11] = _mm_mullo_epi32(in[5], cospi22); u[11] = _mm_add_epi32(u[11], rnding); u[11] = _mm_srai_epi32(u[11], bit); u[12] = _mm_mullo_epi32(in[3], cospi50); u[12] = _mm_add_epi32(u[12], rnding); u[12] = _mm_srai_epi32(u[12], bit); u[13] = _mm_mullo_epi32(in[3], cospi14); u[13] = _mm_add_epi32(u[13], rnding); u[13] = _mm_srai_epi32(u[13], bit); u[14] = _mm_mullo_epi32(in[1], cospi58); u[14] = _mm_add_epi32(u[14], rnding); u[14] = _mm_srai_epi32(u[14], bit); u[15] = _mm_mullo_epi32(in[1], cospi6); u[15] = _mm_add_epi32(u[15], rnding); u[15] = _mm_srai_epi32(u[15], bit); // stage 3 addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 y = _mm_mullo_epi32(u[8], cospi56); x = _mm_mullo_epi32(u[9], cospi56); u[8] = _mm_mullo_epi32(u[8], cospi8); u[8] = _mm_add_epi32(u[8], x); u[8] = _mm_add_epi32(u[8], rnding); u[8] = _mm_srai_epi32(u[8], bit); x = _mm_mullo_epi32(u[9], cospi8); u[9] = _mm_sub_epi32(y, x); u[9] = _mm_add_epi32(u[9], rnding); u[9] = _mm_srai_epi32(u[9], bit); x = _mm_mullo_epi32(u[11], cospi24); y = _mm_mullo_epi32(u[10], cospi24); u[10] = _mm_mullo_epi32(u[10], cospi40); u[10] = _mm_add_epi32(u[10], x); u[10] = _mm_add_epi32(u[10], rnding); u[10] = _mm_srai_epi32(u[10], bit); x = _mm_mullo_epi32(u[11], cospi40); u[11] = _mm_sub_epi32(y, x); u[11] = _mm_add_epi32(u[11], rnding); u[11] = _mm_srai_epi32(u[11], bit); x = _mm_mullo_epi32(u[13], cospi8); y = _mm_mullo_epi32(u[12], cospi8); u[12] = _mm_mullo_epi32(u[12], 
cospim56); u[12] = _mm_add_epi32(u[12], x); u[12] = _mm_add_epi32(u[12], rnding); u[12] = _mm_srai_epi32(u[12], bit); x = _mm_mullo_epi32(u[13], cospim56); u[13] = _mm_sub_epi32(y, x); u[13] = _mm_add_epi32(u[13], rnding); u[13] = _mm_srai_epi32(u[13], bit); x = _mm_mullo_epi32(u[15], cospi40); y = _mm_mullo_epi32(u[14], cospi40); u[14] = _mm_mullo_epi32(u[14], cospim24); u[14] = _mm_add_epi32(u[14], x); u[14] = _mm_add_epi32(u[14], rnding); u[14] = _mm_srai_epi32(u[14], bit); x = _mm_mullo_epi32(u[15], cospim24); u[15] = _mm_sub_epi32(y, x); u[15] = _mm_add_epi32(u[15], rnding); u[15] = _mm_srai_epi32(u[15], bit); // stage 5 addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 x = _mm_mullo_epi32(u[5], cospi48); y = _mm_mullo_epi32(u[4], cospi48); u[4] = _mm_mullo_epi32(u[4], cospi16); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); x = _mm_mullo_epi32(u[5], cospi16); u[5] = _mm_sub_epi32(y, x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); x = _mm_mullo_epi32(u[7], cospi16); y = _mm_mullo_epi32(u[6], cospi16); u[6] = _mm_mullo_epi32(u[6], cospim48); u[6] = _mm_add_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); x = _mm_mullo_epi32(u[7], cospim48); u[7] = _mm_sub_epi32(y, x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); x = _mm_mullo_epi32(u[13], cospi48); y = _mm_mullo_epi32(u[12], cospi48); u[12] = _mm_mullo_epi32(u[12], cospi16); u[12] = _mm_add_epi32(u[12], x); u[12] = _mm_add_epi32(u[12], rnding); u[12] = _mm_srai_epi32(u[12], bit); x = _mm_mullo_epi32(u[13], cospi16); u[13] = _mm_sub_epi32(y, x); u[13] = _mm_add_epi32(u[13], rnding); u[13] = _mm_srai_epi32(u[13], bit); x = _mm_mullo_epi32(u[15], cospi16); y = _mm_mullo_epi32(u[14], cospi16); u[14] = _mm_mullo_epi32(u[14], cospim48); u[14] = _mm_add_epi32(u[14], x); u[14] = _mm_add_epi32(u[14], rnding); u[14] = _mm_srai_epi32(u[14], bit); x = _mm_mullo_epi32(u[15], cospim48); u[15] = _mm_sub_epi32(y, x); u[15] = _mm_add_epi32(u[15], rnding); u[15] = _mm_srai_epi32(u[15], bit); // stage 7 addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 y = _mm_mullo_epi32(u[2], cospi32); x = _mm_mullo_epi32(u[3], cospi32); u[2] = _mm_add_epi32(y, x); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); u[3] = _mm_sub_epi32(y, x); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); y = _mm_mullo_epi32(u[6], cospi32); x = _mm_mullo_epi32(u[7], cospi32); u[6] = _mm_add_epi32(y, x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], 
bit); u[7] = _mm_sub_epi32(y, x); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); y = _mm_mullo_epi32(u[10], cospi32); x = _mm_mullo_epi32(u[11], cospi32); u[10] = _mm_add_epi32(y, x); u[10] = _mm_add_epi32(u[10], rnding); u[10] = _mm_srai_epi32(u[10], bit); u[11] = _mm_sub_epi32(y, x); u[11] = _mm_add_epi32(u[11], rnding); u[11] = _mm_srai_epi32(u[11], bit); y = _mm_mullo_epi32(u[14], cospi32); x = _mm_mullo_epi32(u[15], cospi32); u[14] = _mm_add_epi32(y, x); u[14] = _mm_add_epi32(u[14], rnding); u[14] = _mm_srai_epi32(u[14], bit); u[15] = _mm_sub_epi32(y, x); u[15] = _mm_add_epi32(u[15], rnding); u[15] = _mm_srai_epi32(u[15], bit); // stage 9 if (do_cols) { out[0] = u[0]; out[1] = _mm_sub_epi32(zero, u[8]); out[2] = u[12]; out[3] = _mm_sub_epi32(zero, u[4]); out[4] = u[6]; out[5] = _mm_sub_epi32(zero, u[14]); out[6] = u[10]; out[7] = _mm_sub_epi32(zero, u[2]); out[8] = u[3]; out[9] = _mm_sub_epi32(zero, u[11]); out[10] = u[15]; out[11] = _mm_sub_epi32(zero, u[7]); out[12] = u[5]; out[13] = _mm_sub_epi32(zero, u[13]); out[14] = u[9]; out[15] = _mm_sub_epi32(zero, u[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i u[16], v[16], x, y; { // stage 0 // stage 1 u[0] = in[0]; u[1] = in[8]; u[2] = in[4]; u[3] = in[12]; u[4] = in[2]; u[5] = in[10]; u[6] = in[6]; u[7] = in[14]; u[8] = in[1]; u[9] = in[9]; u[10] = in[5]; u[11] = in[13]; u[12] = in[3]; u[13] = in[11]; u[14] = in[7]; u[15] = in[15]; // stage 2 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); // stage 3 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); // stage 4 x = _mm_mullo_epi32(u[0], cospi32); y = _mm_mullo_epi32(u[1], cospi32); v[0] = _mm_add_epi32(x, y); v[0] = _mm_add_epi32(v[0], rnding); v[0] = _mm_srai_epi32(v[0], bit); v[1] = _mm_sub_epi32(x, y); v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[11] = u[11]; v[12] = u[12]; v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); v[15] = u[15]; // stage 5 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; x = _mm_mullo_epi32(v[5], cospi32); y = _mm_mullo_epi32(v[6], cospi32); u[5] = _mm_sub_epi32(y, x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_add_epi32(y, x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = v[7]; addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); // stage 6 addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, 
&clamp_hi); addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); v[8] = u[8]; v[9] = u[9]; x = _mm_mullo_epi32(u[10], cospi32); y = _mm_mullo_epi32(u[13], cospi32); v[10] = _mm_sub_epi32(y, x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[13] = _mm_add_epi32(x, y); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); x = _mm_mullo_epi32(u[11], cospi32); y = _mm_mullo_epi32(u[12], cospi32); v[11] = _mm_sub_epi32(y, x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = _mm_add_epi32(x, y); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); v[14] = u[14]; v[15] = u[15]; // stage 7 addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); } } } static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i zero = _mm_setzero_si128(); __m128i u[16], v[16], x, y; // Calculate the column 0, 1, 2, 3 // stage 0 // stage 1 // stage 2 v[0] = _mm_mullo_epi32(in[15], cospi2); x = _mm_mullo_epi32(in[0], cospi62); v[0] = _mm_add_epi32(v[0], x); v[0] = _mm_add_epi32(v[0], rnding); v[0] = _mm_srai_epi32(v[0], bit); v[1] = _mm_mullo_epi32(in[15], cospi62); x = _mm_mullo_epi32(in[0], cospi2); v[1] = _mm_sub_epi32(v[1], x); v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); v[2] = _mm_mullo_epi32(in[13], cospi10); x = _mm_mullo_epi32(in[2], cospi54); v[2] = _mm_add_epi32(v[2], x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); v[3] = _mm_mullo_epi32(in[13], cospi54); x = _mm_mullo_epi32(in[2], cospi10); v[3] = _mm_sub_epi32(v[3], x); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); v[4] = _mm_mullo_epi32(in[11], cospi18); x = _mm_mullo_epi32(in[4], cospi46); v[4] = _mm_add_epi32(v[4], x); v[4] = _mm_add_epi32(v[4], rnding); v[4] = _mm_srai_epi32(v[4], bit); v[5] = _mm_mullo_epi32(in[11], cospi46); x = _mm_mullo_epi32(in[4], cospi18); v[5] = _mm_sub_epi32(v[5], x); v[5] = _mm_add_epi32(v[5], rnding); v[5] = _mm_srai_epi32(v[5], bit); v[6] = _mm_mullo_epi32(in[9], cospi26); x = _mm_mullo_epi32(in[6], cospi38); v[6] = _mm_add_epi32(v[6], x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); v[7] = _mm_mullo_epi32(in[9], cospi38); x = _mm_mullo_epi32(in[6], cospi26); v[7] = _mm_sub_epi32(v[7], x); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); v[8] = _mm_mullo_epi32(in[7], cospi34); x = _mm_mullo_epi32(in[8], cospi30); v[8] = _mm_add_epi32(v[8], x); v[8] = _mm_add_epi32(v[8], rnding); v[8] = _mm_srai_epi32(v[8], bit); v[9] = _mm_mullo_epi32(in[7], cospi30); x = _mm_mullo_epi32(in[8], cospi34); v[9] = _mm_sub_epi32(v[9], x); v[9] = _mm_add_epi32(v[9], rnding); v[9] = _mm_srai_epi32(v[9], bit); v[10] = _mm_mullo_epi32(in[5], cospi42); x = _mm_mullo_epi32(in[10], cospi22); v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[11] = _mm_mullo_epi32(in[5], cospi22); x = _mm_mullo_epi32(in[10], cospi42); v[11] = _mm_sub_epi32(v[11], x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = _mm_mullo_epi32(in[3], cospi50); x = _mm_mullo_epi32(in[12], cospi14); v[12] = _mm_add_epi32(v[12], x); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); v[13] = _mm_mullo_epi32(in[3], cospi14); x = _mm_mullo_epi32(in[12], cospi50); v[13] = _mm_sub_epi32(v[13], x); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[14] = _mm_mullo_epi32(in[1], cospi58); x = _mm_mullo_epi32(in[14], cospi6); v[14] = _mm_add_epi32(v[14], x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_mullo_epi32(in[1], cospi6); x = _mm_mullo_epi32(in[14], cospi58); v[15] = _mm_sub_epi32(v[15], x); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(v[5], v[13], &u[5], &u[13], 
&clamp_lo, &clamp_hi); addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = _mm_mullo_epi32(u[8], cospi8); x = _mm_mullo_epi32(u[9], cospi56); v[8] = _mm_add_epi32(v[8], x); v[8] = _mm_add_epi32(v[8], rnding); v[8] = _mm_srai_epi32(v[8], bit); v[9] = _mm_mullo_epi32(u[8], cospi56); x = _mm_mullo_epi32(u[9], cospi8); v[9] = _mm_sub_epi32(v[9], x); v[9] = _mm_add_epi32(v[9], rnding); v[9] = _mm_srai_epi32(v[9], bit); v[10] = _mm_mullo_epi32(u[10], cospi40); x = _mm_mullo_epi32(u[11], cospi24); v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[11] = _mm_mullo_epi32(u[10], cospi24); x = _mm_mullo_epi32(u[11], cospi40); v[11] = _mm_sub_epi32(v[11], x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = _mm_mullo_epi32(u[12], cospim56); x = _mm_mullo_epi32(u[13], cospi8); v[12] = _mm_add_epi32(v[12], x); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); v[13] = _mm_mullo_epi32(u[12], cospi8); x = _mm_mullo_epi32(u[13], cospim56); v[13] = _mm_sub_epi32(v[13], x); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[14] = _mm_mullo_epi32(u[14], cospim24); x = _mm_mullo_epi32(u[15], cospi40); v[14] = _mm_add_epi32(v[14], x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_mullo_epi32(u[14], cospi40); x = _mm_mullo_epi32(u[15], cospim24); v[15] = _mm_sub_epi32(v[15], x); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 5 addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = _mm_mullo_epi32(u[4], cospi16); x = _mm_mullo_epi32(u[5], cospi48); v[4] = _mm_add_epi32(v[4], x); v[4] = _mm_add_epi32(v[4], rnding); v[4] = _mm_srai_epi32(v[4], bit); v[5] = _mm_mullo_epi32(u[4], cospi48); x = _mm_mullo_epi32(u[5], cospi16); v[5] = _mm_sub_epi32(v[5], x); v[5] = _mm_add_epi32(v[5], rnding); v[5] = _mm_srai_epi32(v[5], bit); v[6] = _mm_mullo_epi32(u[6], cospim48); x = _mm_mullo_epi32(u[7], cospi16); v[6] = _mm_add_epi32(v[6], x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); v[7] = _mm_mullo_epi32(u[6], cospi16); x = _mm_mullo_epi32(u[7], cospim48); v[7] = _mm_sub_epi32(v[7], x); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; v[12] = _mm_mullo_epi32(u[12], cospi16); x = _mm_mullo_epi32(u[13], cospi48); v[12] = _mm_add_epi32(v[12], x); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); v[13] = _mm_mullo_epi32(u[12], cospi48); x = _mm_mullo_epi32(u[13], cospi16); v[13] = _mm_sub_epi32(v[13], x); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[14] = _mm_mullo_epi32(u[14], cospim48); x = _mm_mullo_epi32(u[15], cospi16); v[14] = 
_mm_add_epi32(v[14], x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_mullo_epi32(u[14], cospi16); x = _mm_mullo_epi32(u[15], cospim48); v[15] = _mm_sub_epi32(v[15], x); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 7 addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); // stage 8 v[0] = u[0]; v[1] = u[1]; y = _mm_mullo_epi32(u[2], cospi32); x = _mm_mullo_epi32(u[3], cospi32); v[2] = _mm_add_epi32(y, x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); v[3] = _mm_sub_epi32(y, x); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); v[4] = u[4]; v[5] = u[5]; y = _mm_mullo_epi32(u[6], cospi32); x = _mm_mullo_epi32(u[7], cospi32); v[6] = _mm_add_epi32(y, x); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); v[7] = _mm_sub_epi32(y, x); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; y = _mm_mullo_epi32(u[10], cospi32); x = _mm_mullo_epi32(u[11], cospi32); v[10] = _mm_add_epi32(y, x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[11] = _mm_sub_epi32(y, x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = u[12]; v[13] = u[13]; y = _mm_mullo_epi32(u[14], cospi32); x = _mm_mullo_epi32(u[15], cospi32); v[14] = _mm_add_epi32(y, x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_sub_epi32(y, x); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 9 if (do_cols) { out[0] = v[0]; out[1] = _mm_sub_epi32(zero, v[8]); out[2] = v[12]; out[3] = _mm_sub_epi32(zero, v[4]); out[4] = v[6]; out[5] = _mm_sub_epi32(zero, v[14]); out[6] = v[10]; out[7] = _mm_sub_epi32(zero, v[2]); out[8] = v[3]; out[9] = _mm_sub_epi32(zero, v[11]); out[10] = v[15]; out[11] = _mm_sub_epi32(zero, v[7]); out[12] = v[5]; out[13] = _mm_sub_epi32(zero, v[13]); out[14] = v[9]; out[15] = _mm_sub_epi32(zero, v[1]); } else { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, &clamp_hi_out, out_shift); neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, &clamp_hi_out, out_shift); } } static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; __m128i fact = _mm_set1_epi32(2 
* NewSqrt2); __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); __m128i a0_low, a0_high, a1_low, a1_high; __m128i zero = _mm_setzero_si128(); offset = _mm_unpacklo_epi32(offset, zero); for (int i = 0; i < 16; i++) { a0_low = _mm_mul_epi32(in[i], fact); a0_low = _mm_add_epi32(a0_low, offset); a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); a0_high = _mm_srli_si128(in[i], 4); a0_high = _mm_mul_epi32(a0_high, fact); a0_high = _mm_add_epi32(a0_high, offset); a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); a1_low = _mm_unpacklo_epi32(a0_low, a0_high); a1_high = _mm_unpackhi_epi32(a0_low, a0_high); out[i] = _mm_unpacklo_epi64(a1_low, a1_high); } if (!do_cols) { const int log_range = AOMMAX(16, bd + 6); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); round_shift_8x8(out, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); } } static inline void idct64_stage8_sse4_1( __m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rnding, int bit) { int i; __m128i temp1, temp2, temp3, temp4; temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); u[10] = temp1; temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); u[11] = temp2; for (i = 16; i < 20; ++i) { addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); } temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); u[36] = temp1; u[37] = temp2; u[38] = temp3; u[39] = temp4; temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; } static inline void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rnding, int bit) { int i; __m128i temp1, temp2, temp3, temp4; for (i = 0; i < 8; ++i) { addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); } temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); temp3 = 
half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); u[20] = temp1; u[21] = temp2; u[22] = temp3; u[23] = temp4; for (i = 32; i < 40; i++) { addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); } for (i = 48; i < 56; i++) { addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); } } static inline void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *clamp_lo, const __m128i *clamp_hi, const __m128i *rnding, int bit) { __m128i temp1, temp2, temp3, temp4; for (int i = 0; i < 16; i++) { addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); } temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); u[40] = temp1; u[41] = temp2; u[42] = temp3; u[43] = temp4; temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); u[44] = temp1; u[45] = temp2; u[46] = temp3; u[47] = temp4; } static inline void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, int bd, int out_shift, const __m128i *clamp_lo, const __m128i *clamp_hi) { for (int i = 0; i < 32; i++) { addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); for (int i = 0; i < 64; i += 4) { round_shift_4x4(out + i, out_shift); highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4); } } } static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); { __m128i x; // stage 1 // stage 2 // stage 3 // stage 4 // stage 5 // stage 6 x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); // stage 8 // stage 9 // stage 10 // stage 11 if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); if (out_shift != 0) { __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); x = _mm_add_epi32(x, offset); x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); } } x = _mm_max_epi32(x, clamp_lo); x = _mm_min_epi32(x, clamp_hi); out[0] = x; out[1] = x; out[2] = x; out[3] = x; out[4] = x; out[5] = x; out[6] = x; out[7] = x; out[8] = x; out[9] = x; out[10] = x; out[11] = x; out[12] = x; out[13] = x; out[14] = x; out[15] = x; out[16] = x; out[17] = x; out[18] = x; out[19] = x; out[20] = x; out[21] = x; out[22] = x; out[23] = x; out[24] = x; out[25] = x; out[26] = x; out[27] = x; out[28] = x; out[29] = x; out[30] = x; out[31] = x; out[32] = x; out[33] = x; out[34] = x; out[35] = x; out[36] = x; out[37] = x; out[38] = x; out[39] = x; out[40] = x; out[41] = x; out[42] = x; out[43] = x; out[44] = x; out[45] = x; out[46] = x; out[47] = x; out[48] = x; out[49] = x; out[50] = x; out[51] = x; out[52] = x; out[53] = x; out[54] = x; out[55] = x; out[56] = x; out[57] = x; out[58] = x; out[59] = x; out[60] = x; out[61] = x; out[62] = x; out[63] = x; } } static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi1 = _mm_set1_epi32(cospi[1]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi3 = _mm_set1_epi32(cospi[3]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospi63 = _mm_set1_epi32(cospi[63]); const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); const __m128i cospi7 = _mm_set1_epi32(cospi[7]); const __m128i cospi5 = _mm_set1_epi32(cospi[5]); const __m128i cospi59 = _mm_set1_epi32(cospi[59]); const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); { __m128i u[64]; // stage 1 u[0] = in[0]; u[8] = in[4]; u[16] = in[2]; u[24] = in[6]; u[32] = in[1]; u[40] = in[5]; u[48] = in[3]; u[56] = in[7]; // stage 2 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); // stage 3 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); u[33] = u[32]; u[38] = u[39]; u[41] = u[40]; u[46] = u[47]; u[49] = u[48]; u[54] = u[55]; u[57] = u[56]; u[62] = u[63]; // stage 4 __m128i temp1, temp2; u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); u[17] = u[16]; u[22] = u[23]; u[25] = u[24]; u[30] = u[31]; temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); u[33] = temp1; temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); u[38] = 
half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); u[57] = temp2; temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); u[41] = temp1; temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); u[46] = temp2; // stage 5 u[9] = u[8]; u[14] = u[15]; temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); u[17] = temp1; temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); u[22] = temp2; u[35] = u[32]; u[34] = u[33]; u[36] = u[39]; u[37] = u[38]; u[43] = u[40]; u[42] = u[41]; u[44] = u[47]; u[45] = u[46]; u[51] = u[48]; u[50] = u[49]; u[52] = u[55]; u[53] = u[54]; u[59] = u[56]; u[58] = u[57]; u[60] = u[63]; u[61] = u[62]; // stage 6 temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); u[0] = temp1; temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = temp2; u[19] = u[16]; u[18] = u[17]; u[20] = u[23]; u[21] = u[22]; u[27] = u[24]; u[26] = u[25]; u[28] = u[31]; u[29] = u[30]; temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); u[34] = temp1; temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); u[35] = temp2; temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); u[36] = temp1; temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); u[37] = temp2; temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); u[42] = temp1; temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); u[43] = temp2; temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); u[44] = temp1; temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); u[45] = temp2; // stage 7 u[3] = u[0]; u[2] = u[1]; u[11] = u[8]; u[10] = u[9]; u[12] = u[15]; u[13] = u[14]; temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); u[18] = temp1; temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); u[19] = temp2; temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); u[20] = temp1; temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); u[21] = temp2; for (i = 
32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 u[7] = u[0]; u[6] = u[1]; u[5] = u[2]; u[4] = u[3]; idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); // stage 9 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 10 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 11 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi1 = _mm_set1_epi32(cospi[1]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi3 = _mm_set1_epi32(cospi[3]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi5 = _mm_set1_epi32(cospi[5]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi7 = _mm_set1_epi32(cospi[7]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi9 = _mm_set1_epi32(cospi[9]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi11 = _mm_set1_epi32(cospi[11]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi13 = _mm_set1_epi32(cospi[13]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi15 = _mm_set1_epi32(cospi[15]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi51 = _mm_set1_epi32(cospi[51]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi55 = _mm_set1_epi32(cospi[55]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi59 = _mm_set1_epi32(cospi[59]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi63 = _mm_set1_epi32(cospi[63]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i 
cospim53 = _mm_set1_epi32(-cospi[53]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); { __m128i u[64]; __m128i tmp1, tmp2, tmp3, tmp4; // stage 1 u[0] = in[0]; u[32] = in[1]; u[36] = in[9]; u[40] = in[5]; u[44] = in[13]; u[48] = in[3]; u[52] = in[11]; u[56] = in[7]; u[60] = in[15]; u[16] = in[2]; u[20] = in[10]; u[24] = in[6]; u[28] = in[14]; u[4] = in[8]; u[8] = in[4]; u[12] = in[12]; // stage 2 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); // stage 3 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); u[33] = u[32]; u[34] = u[35]; u[37] = u[36]; u[38] = u[39]; u[41] = u[40]; u[42] = u[43]; u[45] = u[44]; u[46] = u[47]; u[49] = u[48]; u[50] = u[51]; u[53] = u[52]; u[54] = u[55]; u[57] = u[56]; u[58] = u[59]; u[61] = u[60]; u[62] = u[63]; // stage 4 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); u[17] = u[16]; u[18] = u[19]; u[21] = u[20]; u[22] = u[23]; u[25] = u[24]; u[26] = u[27]; u[29] = u[28]; u[30] = u[31]; tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); u[33] = tmp1; u[34] = tmp2; u[37] = tmp3; u[38] = tmp4; tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); tmp4 = 
half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); u[41] = tmp1; u[42] = tmp2; u[45] = tmp3; u[46] = tmp4; // stage 5 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); u[9] = u[8]; u[10] = u[11]; u[13] = u[12]; u[14] = u[15]; tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); u[17] = tmp1; u[18] = tmp2; u[21] = tmp3; u[22] = tmp4; for (i = 32; i < 64; i += 8) { addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); u[0] = tmp1; u[5] = u[4]; u[6] = u[7]; tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); u[9] = tmp1; tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); u[10] = tmp2; for (i = 16; i < 32; i += 8) { addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); u[34] = tmp1; u[35] = tmp2; u[36] = tmp3; u[37] = tmp4; tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, 
bit); u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); u[42] = tmp1; u[43] = tmp2; u[44] = tmp3; u[45] = tmp4; // stage 7 u[3] = u[0]; u[2] = u[1]; tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); u[5] = tmp1; addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); u[18] = tmp1; u[19] = tmp2; u[20] = tmp3; u[21] = tmp4; for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); } idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); // stage 9 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 10 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, bit); // stage 11 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } } static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { int i, j; const int32_t *cospi = cospi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); const __m128i cospi1 = _mm_set1_epi32(cospi[1]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi3 = _mm_set1_epi32(cospi[3]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi5 = _mm_set1_epi32(cospi[5]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi7 = _mm_set1_epi32(cospi[7]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi9 = _mm_set1_epi32(cospi[9]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi11 = _mm_set1_epi32(cospi[11]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi13 = _mm_set1_epi32(cospi[13]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi15 = _mm_set1_epi32(cospi[15]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi17 = _mm_set1_epi32(cospi[17]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi19 = _mm_set1_epi32(cospi[19]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi21 = _mm_set1_epi32(cospi[21]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospi23 = _mm_set1_epi32(cospi[23]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi25 = _mm_set1_epi32(cospi[25]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi27 = _mm_set1_epi32(cospi[27]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi29 = _mm_set1_epi32(cospi[29]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospi31 = _mm_set1_epi32(cospi[31]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi35 = _mm_set1_epi32(cospi[35]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospi39 = _mm_set1_epi32(cospi[39]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi43 = _mm_set1_epi32(cospi[43]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospi47 = _mm_set1_epi32(cospi[47]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi51 = _mm_set1_epi32(cospi[51]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi55 = _mm_set1_epi32(cospi[55]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi59 = _mm_set1_epi32(cospi[59]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi63 = _mm_set1_epi32(cospi[63]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospim44 = 
_mm_set1_epi32(-cospi[44]); const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); { __m128i u[64], v[64]; // stage 1 u[32] = in[1]; u[34] = in[17]; u[36] = in[9]; u[38] = in[25]; u[40] = in[5]; u[42] = in[21]; u[44] = in[13]; u[46] = in[29]; u[48] = in[3]; u[50] = in[19]; u[52] = in[11]; u[54] = in[27]; u[56] = in[7]; u[58] = in[23]; u[60] = in[15]; u[62] = in[31]; v[16] = in[2]; v[18] = in[18]; v[20] = in[10]; v[22] = in[26]; v[24] = in[6]; v[26] = in[22]; v[28] = in[14]; v[30] = in[30]; u[8] = in[4]; u[10] = in[20]; u[12] = in[12]; u[14] = in[28]; v[4] = in[8]; v[6] = in[24]; u[0] = in[0]; u[2] = in[16]; // stage 2 v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); // stage 3 u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); u[24] = 
half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); for (i = 32; i < 64; i += 4) { addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } // stage 4 v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); for (i = 16; i < 32; i += 4) { addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); // stage 5 u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); for (i = 8; i < 16; i += 4) { addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, &clamp_hi); addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, &clamp_hi); } for (i = 16; i < 32; i += 4) { u[i + 0] = v[i + 0]; u[i + 3] = v[i + 3]; } u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], 
&rnding, bit); u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); for (i = 32; i < 64; i += 8) { addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, &clamp_hi); addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, &clamp_hi); addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, &clamp_hi); addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, &clamp_hi); } // stage 6 v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); for (i = 8; i < 16; i += 4) { v[i + 0] = u[i + 0]; v[i + 3] = u[i + 3]; } v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); for (i = 16; i < 32; i += 8) { addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, &clamp_hi); } for (i = 32; i < 64; i += 8) { v[i + 0] = u[i + 0]; v[i + 1] = u[i + 1]; v[i + 6] = u[i + 6]; v[i + 7] = u[i + 7]; } v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); // stage 7 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); u[4] = v[4]; u[7] = v[7]; u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); for (i = 16; i < 32; i += 8) { u[i + 0] = v[i + 0]; u[i 
+ 1] = v[i + 1]; u[i + 6] = v[i + 6]; u[i + 7] = v[i + 7]; } u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); for (i = 32; i < 64; i += 16) { for (j = i; j < i + 4; j++) { addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, &clamp_hi); } } // stage 8 for (i = 0; i < 4; ++i) { addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); } v[8] = u[8]; v[9] = u[9]; v[14] = u[14]; v[15] = u[15]; v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); for (i = 16; i < 20; ++i) { addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, &clamp_hi); } for (i = 32; i < 36; ++i) { v[i] = u[i]; v[i + 12] = u[i + 12]; v[i + 16] = u[i + 16]; v[i + 28] = u[i + 28]; } v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); // stage 9 for (i = 0; i < 8; ++i) { addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); } for (i = 16; i < 20; ++i) { u[i] = v[i]; u[i + 12] = v[i + 12]; } u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, 
&v[26], &rnding, bit); u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); for (i = 32; i < 40; i++) { addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); } for (i = 48; i < 56; i++) { addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); } // stage 10 for (i = 0; i < 16; i++) { addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); } for (i = 32; i < 40; i++) v[i] = u[i]; v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); for (i = 56; i < 64; i++) v[i] = u[i]; // stage 11 for (i = 0; i < 32; i++) { addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, &clamp_hi); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); for (i = 0; i < 64; i += 4) { round_shift_4x4(out + i, out_shift); highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4); } } } } static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1; // stage 0 // stage 1 bf1 = in[0]; // stage 2 // stage 3 // stage 4 // stage 5 bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); // stage 6 // stage 7 // stage 8 // stage 9 if (do_cols) { bf1 = _mm_max_epi32(bf1, clamp_lo); bf1 = _mm_min_epi32(bf1, clamp_hi); } else { const int log_range_out = AOMMAX(16, bd + 6); clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); if (out_shift != 0) { __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); bf1 = _mm_add_epi32(bf1, offset); bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); } } bf1 = _mm_max_epi32(bf1, clamp_lo); bf1 = _mm_min_epi32(bf1, clamp_hi); out[0] = bf1; out[1] = bf1; out[2] = bf1; out[3] = bf1; out[4] = bf1; out[5] = bf1; out[6] = bf1; out[7] = bf1; out[8] = bf1; out[9] = bf1; out[10] = bf1; out[11] = bf1; out[12] = bf1; out[13] = bf1; out[14] = bf1; out[15] = bf1; out[16] = bf1; out[17] = bf1; out[18] = bf1; out[19] = bf1; out[20] = bf1; out[21] = bf1; out[22] = bf1; out[23] = bf1; out[24] = bf1; out[25] = bf1; out[26] = bf1; out[27] = bf1; out[28] = bf1; out[29] = bf1; out[30] = bf1; out[31] = bf1; } static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1[32]; // stage 0 // stage 1 bf1[0] = in[0]; bf1[4] = in[4]; bf1[8] = in[2]; bf1[12] = in[6]; bf1[16] = in[1]; bf1[20] = in[5]; bf1[24] = in[3]; bf1[28] = in[7]; // stage 2 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); // stage 3 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); bf1[17] = bf1[16]; bf1[18] = bf1[19]; bf1[21] = bf1[20]; bf1[22] = bf1[23]; bf1[25] = bf1[24]; bf1[26] = bf1[27]; bf1[29] = bf1[28]; bf1[30] = bf1[31]; // stage 4 : bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); bf1[9] = bf1[8]; bf1[10] = bf1[11]; bf1[13] = bf1[12]; bf1[14] = bf1[15]; idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, &cospi24, &cospi40, &cospim24, &rounding, bit); // stage 5 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); bf1[1] = bf1[0]; bf1[5] = bf1[4]; bf1[6] = bf1[7]; idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 6 bf1[3] = bf1[0]; bf1[2] = bf1[1]; idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 7 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 8 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 9 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi4 = 
_mm_set1_epi32(cospi[4]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1[32]; // stage 0 // stage 1 bf1[0] = in[0]; bf1[2] = in[8]; bf1[4] = in[4]; bf1[6] = in[12]; bf1[8] = in[2]; bf1[10] = in[10]; bf1[12] = in[6]; bf1[14] = in[14]; bf1[16] = in[1]; bf1[18] = in[9]; bf1[20] = in[5]; bf1[22] = in[13]; bf1[24] = in[3]; bf1[26] = in[11]; bf1[28] = in[7]; bf1[30] = in[15]; // stage 2 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); // stage 3 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[31], 
bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, &cospi24, &cospi40, &cospim24, &rounding, bit); // stage 5 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); bf1[1] = bf1[0]; bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 6 addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); // stage 7 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 8 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rounding, bit); // stage 9 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); } static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); 
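  // Reader's note on the butterfly helpers used throughout these transforms
  // (they are defined elsewhere in this file/tree, so their exact bodies are
  // not shown here; the behaviour below is what the call sites assume,
  // per 32-bit lane):
  //   half_btf_sse4_1(&w0, &n0, &w1, &n1, &rnding, bit)
  //     ~ (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
  //   half_btf_0_sse4_1(&w0, &n0, &rnding, bit)
  //     ~ (w0 * n0 + (1 << (bit - 1))) >> bit   (single-input variant)
  //   addsub_sse4_1(a, b, &sum, &diff, &clamp_lo, &clamp_hi)
  //     ~ sum = clamp(a + b), diff = clamp(a - b)
  // Each numbered "stage" therefore applies one level of the AV1 inverse-DCT
  // butterfly network, with intermediate values clamped to
  // [clamp_lo, clamp_hi] to keep them within the 16/bd+6|8-bit dynamic range.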
const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); __m128i bf1[32], bf0[32]; // stage 0 // stage 1 bf1[0] = in[0]; bf1[1] = in[16]; bf1[2] = in[8]; bf1[3] = in[24]; bf1[4] = in[4]; bf1[5] = in[20]; bf1[6] = in[12]; bf1[7] = in[28]; bf1[8] = in[2]; bf1[9] = in[18]; bf1[10] = in[10]; bf1[11] = in[26]; bf1[12] = in[6]; bf1[13] = in[22]; bf1[14] = in[14]; bf1[15] = in[30]; bf1[16] = in[1]; bf1[17] = in[17]; bf1[18] = in[9]; bf1[19] = in[25]; bf1[20] = in[5]; bf1[21] = in[21]; bf1[22] = in[13]; bf1[23] = in[29]; bf1[24] = in[3]; bf1[25] = in[19]; bf1[26] = in[11]; bf1[27] = in[27]; bf1[28] = in[7]; bf1[29] = in[23]; bf1[30] = in[15]; bf1[31] = in[31]; // stage 2 bf0[0] = bf1[0]; bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; bf0[4] = bf1[4]; bf0[5] = bf1[5]; bf0[6] = bf1[6]; bf0[7] = bf1[7]; bf0[8] = bf1[8]; bf0[9] = bf1[9]; bf0[10] = bf1[10]; bf0[11] = bf1[11]; bf0[12] = bf1[12]; bf0[13] = bf1[13]; bf0[14] = bf1[14]; bf0[15] = bf1[15]; bf0[16] = half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); bf0[17] = half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); bf0[18] = half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); bf0[19] = half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); bf0[20] = half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); bf0[21] = half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); bf0[22] = half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); bf0[23] = half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); bf0[24] = half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); bf0[25] = half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); bf0[26] = half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); bf0[27] = half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); bf0[28] = half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); bf0[29] = half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); bf0[30] = half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); bf0[31] = half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); // stage 3 bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = 
bf0[7]; bf1[8] = half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); bf1[9] = half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); bf1[10] = half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); bf1[11] = half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); bf1[12] = half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); bf1[13] = half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); bf1[14] = half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); bf1[15] = half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); // stage 4 bf0[0] = bf1[0]; bf0[1] = bf1[1]; bf0[2] = bf1[2]; bf0[3] = bf1[3]; bf0[4] = half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); bf0[5] = half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); bf0[6] = half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); bf0[18] = half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); bf0[19] = bf1[19]; bf0[20] = bf1[20]; bf0[21] = half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); bf0[22] = half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); bf0[26] = half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); bf0[27] = bf1[27]; bf0[28] = bf1[28]; bf0[29] = half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); bf0[30] = half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); bf0[31] = bf1[31]; // stage 5 bf1[0] = half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); bf1[1] = half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); bf1[2] = half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); bf1[3] = half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); bf1[10] = half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], 
&rounding, bit); bf1[14] = half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); bf1[15] = bf0[15]; addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); // stage 6 addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); bf0[4] = bf1[4]; bf0[5] = half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[6] = half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); bf0[7] = bf1[7]; addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); bf0[19] = half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); bf0[20] = half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); bf0[21] = half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); bf0[22] = bf1[22]; bf0[23] = bf1[23]; bf0[24] = bf1[24]; bf0[25] = bf1[25]; bf0[26] = half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); bf0[27] = half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); bf0[28] = half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); bf0[29] = half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 7 addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[11] = half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); bf1[12] = half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); bf1[13] = half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); // stage 8 addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, 
&clamp_lo, &clamp_hi); addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); bf0[16] = bf1[16]; bf0[17] = bf1[17]; bf0[18] = bf1[18]; bf0[19] = bf1[19]; bf0[20] = half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[21] = half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); bf0[22] = half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); bf0[23] = half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); bf0[24] = half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); bf0[25] = half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); bf0[26] = half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); bf0[27] = half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); bf0[28] = bf1[28]; bf0[29] = bf1[29]; bf0[30] = bf1[30]; bf0[31] = bf1[31]; // stage 9 addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); round_shift_8x8(out + 16, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { case IDTX: case H_DCT: case H_ADST: case H_FLIPADST: case V_DCT: case V_ADST: case V_FLIPADST: av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, txfm_param->tx_size, txfm_param->eob, bd); break; default: av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); break; } } static void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, 
uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); int eob = txfm_param->eob; int bd = txfm_param->bd; int lossless = txfm_param->lossless; const int32_t *src = cast_to_int32(input); const TX_TYPE tx_type = txfm_param->tx_type; if (lossless) { assert(tx_type == DCT_DCT); av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); } static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift) { (void)bit; for (int i = 0; i < 32; i += 16) { out[i] = _mm_slli_epi32(in[i], 2); out[i + 1] = _mm_slli_epi32(in[i + 1], 2); out[i + 2] = _mm_slli_epi32(in[i + 2], 2); out[i + 3] = _mm_slli_epi32(in[i + 3], 2); out[i + 4] = _mm_slli_epi32(in[i + 4], 2); out[i + 5] = _mm_slli_epi32(in[i + 5], 2); out[i + 6] = _mm_slli_epi32(in[i + 6], 2); out[i + 7] = _mm_slli_epi32(in[i + 7], 2); out[i + 8] = _mm_slli_epi32(in[i + 8], 2); out[i + 9] = _mm_slli_epi32(in[i + 9], 2); out[i + 10] = _mm_slli_epi32(in[i + 10], 2); out[i + 11] = _mm_slli_epi32(in[i + 11], 2); out[i + 12] = _mm_slli_epi32(in[i + 12], 2); out[i + 13] = _mm_slli_epi32(in[i + 13], 2); out[i + 14] = _mm_slli_epi32(in[i + 14], 2); out[i + 15] = _mm_slli_epi32(in[i + 15], 2); } if (!do_cols) { const int log_range_out = AOMMAX(16, bd + 6); const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); round_shift_8x8(out, out_shift); round_shift_8x8(out + 16, out_shift); highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); } } static const transform_1d_sse4_1 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { idct4x4_sse4_1, NULL, NULL, NULL }, { iadst4x4_sse4_1, NULL, NULL, NULL }, { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, }, { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, { { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, NULL }, { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, NULL }, { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, }, { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, idct32x32_sse4_1 }, { NULL, NULL, NULL, NULL }, { iidentity32_sse4_1, NULL, NULL, NULL } }, { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, idct64x64_sse4_1 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { __m128i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w = AOMMIN(32, txfm_size_col); const int buf_size_w_div4 = buf_size_w >> 2; const int buf_size_h_div8 = (eoby + 8) >> 3; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_sse4_1 row_txfm = 
highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { __m128i buf0[16]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); if (rect_type == 1 || rect_type == -1) { av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, NewInvSqrt2); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *_buf1 = buf1 + i * 4; for (int j = 0; j < buf_size_w_div4; ++j) { __m128i *buf0_cur = buf0 + j * 4; TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); _buf1[j * txfm_size_row + 0] = buf0_cur[0]; _buf1[j * txfm_size_row + 1] = buf0_cur[1]; _buf1[j * txfm_size_row + 2] = buf0_cur[2]; _buf1[j * txfm_size_row + 3] = buf0_cur[3]; } } for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { __m128i buf1[64]; int eobx, eoby; get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < (row_max >> 2); ++i) { __m128i buf0[16]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, NewInvSqrt2); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *_buf1 = buf1 + i * 4; if (lr_flip) { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); } } else { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4( buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } for (int i = 0; i < 
buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } } static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { (void)eob; __m128i buf1[64 * 4]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int row_max = AOMMIN(32, txfm_size_row); const int input_stride = row_max; const int buf_size_w = AOMMIN(32, txfm_size_col); const int buf_size_w_div4 = buf_size_w >> 2; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; for (int i = 0; i < (row_max >> 2); ++i) { __m128i buf0[32]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); if (rect_type == 1 || rect_type == -1) { av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, NewInvSqrt2); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *_buf1 = buf1 + i * 4; for (int j = 0; j < buf_size_w_div4; ++j) { __m128i *buf0_cur = buf0 + j * 4; TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); _buf1[j * txfm_size_row + 0] = buf0_cur[0]; _buf1[j * txfm_size_row + 1] = buf0_cur[1]; _buf1[j * txfm_size_row + 2] = buf0_cur[2]; _buf1[j * txfm_size_row + 3] = buf0_cur[3]; } } for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, 0, txfm_size_row, bd); } } } static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { __m128i buf1[64 * 16]; int eobx, eoby; get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div4 = txfm_size_col >> 2; const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; const int input_stride = AOMMIN(32, txfm_size_row); const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; 
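// Kernel-selection note: fun_idx_x / fun_idx_y are derived from the
// end-of-block coordinates (eobx, eoby) through lowbd_txfm_all_1d_zeros_idx
// and index into highbd_txfm_all_1d_zeros_w8_arr above. For the larger
// transform sizes that table offers reduced "low1" / "low8" / "low16"
// kernels which only process the leading nonzero rows or columns, falling
// back to the full-length 1-D transform when the eob reaches far enough; the
// NULL entries are type/size combinations this path should never select,
// which the asserts below verify.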
assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { __m128i buf0[64]; load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_nonzero_w); if (rect_type == 1 || rect_type == -1) { av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, NewInvSqrt2); } row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *_buf1 = buf1 + i * 4; if (lr_flip) { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); } } else { for (int j = 0; j < buf_size_w_div4; ++j) { TRANSPOSE_4X4( buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); } } } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div4; i++) { col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, txfm_size_row, -shift[1]); } // write to buffer { for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } } static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { (void)eob; __m128i buf1[8]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; const int input_stride = AOMMIN(32, txfm_size_row); assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform __m128i buf0[8]; load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, NewInvSqrt2); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); if (lr_flip) { TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], buf1[3]); TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], buf1[7]); } else { TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], buf1[3]); TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], buf1[7]); } // 2nd stage: column transform col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); // write to buffer highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, bd); } static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int 
bd) { (void)eob; __m128i buf1[8]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform __m128i buf0[8]; const int32_t *input_row = input; load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0, NewInvSqrt2); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *buf1_ptr; if (lr_flip) { flip_buf_sse2(buf0, buf1, txfm_size_col); buf1_ptr = buf1; } else { buf1_ptr = buf0; } // 2nd stage: column transform for (int i = 0; i < 2; i++) { __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; transpose_32bit_4x4(buf1_cur, buf1_cur); col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); } av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); // write to buffer highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, txfm_size_row, bd); } static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { (void)eob; __m128i buf1[16]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_h_div8 = txfm_size_row >> 2; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; const int input_stride = AOMMIN(32, txfm_size_row); assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform __m128i buf0[16]; for (int i = 0; i < (txfm_size_row >> 2); i++) { const int32_t *input_row = input + i * 4; __m128i *buf0_cur = buf0 + i * 4; load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]); } if (lr_flip) { for (int j = 0; j < buf_size_h_div8; ++j) { TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); } } else { for (int j = 0; j < buf_size_h_div8; ++j) { TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); } } // 2nd stage: column transform col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); // write to buffer highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, bd); } static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { (void)eob; __m128i buf1[16]; const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int txfm_size_col = 
tx_size_wide[tx_size]; const int txfm_size_row = tx_size_high[tx_size]; const int buf_size_w_div8 = txfm_size_col >> 2; const transform_1d_sse4_1 row_txfm = highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; const transform_1d_sse4_1 col_txfm = highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; assert(col_txfm != NULL); assert(row_txfm != NULL); int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // 1st stage: column transform __m128i buf0[16]; const int32_t *input_row = input; load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); __m128i *buf1_ptr; if (lr_flip) { flip_buf_sse2(buf0, buf1, txfm_size_col); buf1_ptr = buf1; } else { buf1_ptr = buf0; } // 2nd stage: column transform for (int i = 0; i < buf_size_w_div8; i++) { __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; transpose_32bit_4x4(buf1_cur, buf1_cur); col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); } av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); // write to buffer for (int i = 0; i < (txfm_size_col >> 3); i++) { highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, output + 8 * i, stride, ud_flip, txfm_size_row, bd); } } void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: case FLIPADST_DCT: case DCT_FLIPADST: case FLIPADST_FLIPADST: case ADST_FLIPADST: case FLIPADST_ADST: highbd_inv_txfm2d_add_no_identity_sse41( input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; case V_DCT: case V_ADST: case V_FLIPADST: highbd_inv_txfm2d_add_h_identity_ssse41( input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; case H_DCT: case H_ADST: case H_FLIPADST: highbd_inv_txfm2d_add_v_identity_ssse41( input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; case IDTX: highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; default: assert(0); break; } } static void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; int eob = txfm_param->eob; highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type, tx_size, eob, bd); } static void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; int eob = txfm_param->eob; highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type, tx_size, eob, bd); } static void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; int eob = txfm_param->eob; highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type, tx_size, eob, bd); } static void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = 
txfm_param->tx_size; int eob = txfm_param->eob; highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type, tx_size, eob, bd); } void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_8X8: av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); break; case TX_4X8: av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; case TX_16X4: av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); break; case TX_4X16: av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; default: av1_highbd_inv_txfm2d_add_universe_sse4_1( input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, txfm_param->bd); break; } } aom-3.12.1/av1/common/x86/highbd_jnt_convolve_avx2.c000066400000000000000000001065571477627663500221420ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); const __m256i wt1 = _mm256_set1_epi32(w1); const __m256i zero = _mm256_setzero_si256(); int i, j; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi32(offset); const __m256i offset_const_16b = _mm256_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); assert(bits <= 4); if (!(w % 16)) { for (i = 0; i < h; i += 1) { for (j = 0; j < w; j += 16) { const __m256i src_16bit = _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); if (do_average) { const __m256i data_0 = _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); const __m256i comp_avg_res_lo = highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); const __m256i comp_avg_res_hi = highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = highbd_convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result_lo, round_result_hi); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { const __m256i res_unsigned_16b = _mm256_adds_epu16(res, offset_const_16b); _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), res_unsigned_16b); } } } } else if (!(w % 4)) { for (i = 0; i < h; i += 2) { for (j = 0; j < w; j += 8) { const __m128i src_row_0 = _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); const __m128i src_row_1 = _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); // since not all compilers yet support _mm256_set_m128i() const __m256i src_10 = _mm256_insertf128_si256( _mm256_castsi128_si256(src_row_0), src_row_1, 1); const __m256i res = _mm256_sll_epi16(src_10, left_shift); if (w - j < 8) { if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b, offset_const); const __m256i comp_avg_res = highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result, round_result); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { const __m256i res_unsigned_16b = _mm256_adds_epu16(res, offset_const_16b); const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); 
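// The matching store for the second row follows. In this do_average == 0
// branch the offset-biased 16-bit prediction is written to the CONV_BUF
// (dst) so a later compound call can blend it; the do_average branch above
// instead reloads that stored prediction and combines it with the current
// one via highbd_comp_avg before rounding and clamping. Roughly, per pixel
// (an illustrative scalar sketch, not the library's exact helper):
//   blended = use_dist_wtd_comp_avg
//                 ? (ref * w0 + res * w1) >> DIST_PRECISION_BITS
//                 : (ref + res) >> 1;
//   out = clamp((blended - offset + ((1 << rounding_shift) >> 1))
//                   >> rounding_shift,
//               0, (1 << bd) - 1);
// where w0 / w1 come from conv_params->fwd_offset / bck_offset and offset /
// rounding_shift are the constants computed at the top of this function.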
_mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); const __m256i comp_avg_res_lo = highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); const __m256i comp_avg_res_hi = highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result_lo, round_result_hi); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { const __m256i res_unsigned_16b = _mm256_adds_epu16(res, offset_const_16b); const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } } } } } void av1_highbd_dist_wtd_convolve_2d_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; int im_h = h + filter_params_y->taps - 1; int im_stride = 8; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. 
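// (The horizontal pass adds an offset of 1 << (bd + FILTER_BITS - 1) so the
// filtered value stays non-negative, which makes the worst case need roughly
// bd + FILTER_BITS + 2 bits before the round_0 shift; hence the bound
// checked below must hold for the 16-bit im_block staging buffer to be safe
// even at bd == 12.)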
assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); __m256i s[8], coeffs_y[4], coeffs_x[4]; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); const __m256i wt1 = _mm256_set1_epi32(w1); const __m256i zero = _mm256_setzero_si256(); const __m256i round_const_x = _mm256_set1_epi32( ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const __m256i round_const_y = _mm256_set1_epi32( ((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { /* Horizontal filter */ { for (i = 0; i < im_h; i += 2) { const __m256i row0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); __m256i row1 = _mm256_setzero_si256(); if (i + 1 < im_h) row1 = _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); // even pixels s[0] = _mm256_alignr_epi8(r1, r0, 0); s[1] = _mm256_alignr_epi8(r1, r0, 4); s[2] = _mm256_alignr_epi8(r1, r0, 8); s[3] = _mm256_alignr_epi8(r1, r0, 12); __m256i res_even = convolve(s, coeffs_x); res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm256_alignr_epi8(r1, r0, 2); s[1] = _mm256_alignr_epi8(r1, r0, 6); s[2] = _mm256_alignr_epi8(r1, r0, 10); s[3] = _mm256_alignr_epi8(r1, r0, 14); __m256i res_odd = convolve(s, coeffs_x); res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), round_shift_x); __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } } /* Vertical filter */ { __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); s[2] = _mm256_unpacklo_epi16(s4, s5); s[4] = _mm256_unpackhi_epi16(s0, s1); s[5] = _mm256_unpackhi_epi16(s2, s3); s[6] = _mm256_unpackhi_epi16(s4, s5); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); const __m256i s7 = _mm256_loadu_si256((__m256i 
*)(data + 7 * im_stride)); s[3] = _mm256_unpacklo_epi16(s6, s7); s[7] = _mm256_unpackhi_epi16(s6, s7); const __m256i res_a = convolve(s, coeffs_y); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_y), round_shift_y); const __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); if (w - j < 8) { if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result, round_result); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { const __m256i res_b = convolve(s + 4, coeffs_y); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_y), round_shift_y); __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); const __m256i comp_avg_res_lo = highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i comp_avg_res_hi = highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result_lo, round_result_hi); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); 
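// The matching store for the second output row follows. Once both rows are
// written, the shuffle at the end of this loop (s[0] = s[1]; ... s[6] = s[7];)
// slides the vertical 8-tap window down by two rows, so each iteration only
// has to load and unpack one new pair of rows from im_block instead of
// re-reading all eight taps' worth of data.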
_mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } void av1_highbd_dist_wtd_convolve_x_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; int i, j; __m256i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); const __m256i wt1 = _mm256_set1_epi32(w1); const __m256i zero = _mm256_setzero_si256(); const __m256i round_const_x = _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); assert(bits >= 0); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ for (i = 0; i < h; i += 2) { const __m256i row0 = _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); __m256i row1 = _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); // even pixels s[0] = _mm256_alignr_epi8(r1, r0, 0); s[1] = _mm256_alignr_epi8(r1, r0, 4); s[2] = _mm256_alignr_epi8(r1, r0, 8); s[3] = _mm256_alignr_epi8(r1, r0, 12); __m256i res_even = convolve(s, coeffs_x); res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm256_alignr_epi8(r1, r0, 2); s[1] = _mm256_alignr_epi8(r1, r0, 6); s[2] = _mm256_alignr_epi8(r1, r0, 10); s[3] = _mm256_alignr_epi8(r1, r0, 14); __m256i res_odd = convolve(s, coeffs_x); res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), round_shift_x); res_even = _mm256_sll_epi32(res_even, round_shift_bits); res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); if (w - j < 8) { if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg( &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, 
&rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result, round_result); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); const __m256i comp_avg_res_lo = highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i comp_avg_res_hi = highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = highbd_convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result_lo, round_result_hi); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } } } } void av1_highbd_dist_wtd_convolve_y_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; assert(bits >= 0); int i, j; __m256i s[8], coeffs_y[4]; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); const __m256i wt1 = _mm256_set1_epi32(w1); const __m256i round_const_y = _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); const __m128i round_shift_y = 
_mm_cvtsi32_si128(conv_params->round_1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); const __m256i clip_pixel_to_bd = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; /* Vertical filter */ { __m256i src6; __m256i s01 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), 0x20); __m256i s12 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), 0x20); __m256i s23 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), 0x20); __m256i s34 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), 0x20); __m256i s45 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); __m256i s56 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), src6, 0x20); s[0] = _mm256_unpacklo_epi16(s01, s12); s[1] = _mm256_unpacklo_epi16(s23, s34); s[2] = _mm256_unpacklo_epi16(s45, s56); s[4] = _mm256_unpackhi_epi16(s01, s12); s[5] = _mm256_unpackhi_epi16(s23, s34); s[6] = _mm256_unpackhi_epi16(s45, s56); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; const __m256i s67 = _mm256_permute2x128_si256( src6, _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); const __m256i s78 = _mm256_permute2x128_si256( _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), src6, 0x20); s[3] = _mm256_unpacklo_epi16(s67, s78); s[7] = _mm256_unpackhi_epi16(s67, s78); const __m256i res_a = convolve(s, coeffs_y); __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); if (w - j < 8) { if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i 
round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result, round_result); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { const __m256i res_b = convolve(s + 4, coeffs_y); __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); if (do_average) { const __m256i data_0 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); const __m256i data_01 = _mm256_permute2x128_si256(data_0, data_1, 0x20); const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); const __m256i comp_avg_res_lo = highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i comp_avg_res_hi = highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_16b = _mm256_packus_epi32(round_result_lo, round_result_hi); const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); const __m128i res_0 = _mm256_castsi256_si128(res_clip); const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); } else { __m256i res_16b = _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); const __m128i res_0 = _mm256_castsi256_si128(res_16b); const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } aom-3.12.1/av1/common/x86/highbd_jnt_convolve_sse4.c000066400000000000000000000403521477627663500221260ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" void av1_highbd_dist_wtd_convolve_y_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; assert(bits >= 0); int i, j; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); const __m128i round_const_y = _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); const __m128i clip_pixel_to_bd = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); __m128i s[16], coeffs_y[4]; prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; /* Vertical filter */ { __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); s[0] = _mm_unpacklo_epi16(s0, s1); s[1] = _mm_unpacklo_epi16(s2, s3); s[2] = _mm_unpacklo_epi16(s4, s5); s[4] = _mm_unpackhi_epi16(s0, s1); s[5] = _mm_unpackhi_epi16(s2, s3); s[6] = _mm_unpackhi_epi16(s4, s5); s[0 + 8] = _mm_unpacklo_epi16(s1, s2); s[1 + 8] = _mm_unpacklo_epi16(s3, s4); s[2 + 8] = _mm_unpacklo_epi16(s5, s6); s[4 + 8] = _mm_unpackhi_epi16(s1, s2); s[5 + 8] = _mm_unpackhi_epi16(s3, s4); s[6 + 8] = _mm_unpackhi_epi16(s5, s6); for (i = 0; i < h; i += 2) { data = &src_ptr[i * src_stride + j]; __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); s[3] = _mm_unpacklo_epi16(s6, s7); s[7] = _mm_unpackhi_epi16(s6, s7); s[3 + 8] = _mm_unpacklo_epi16(s7, s8); s[7 + 8] = _mm_unpackhi_epi16(s7, s8); const __m128i res_a0 = convolve(s, coeffs_y); __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits); res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y), round_shift_y); const __m128i res_a1 = convolve(s + 8, coeffs_y); __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); res_a_round1 = 
_mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), round_shift_y); __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); if (w - j < 8) { if (do_average) { const __m128i data_0 = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_1 = _mm_loadl_epi64( (__m128i *)(&dst[i * dst_stride + j + dst_stride])); const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_0 = highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_1 = highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b_0 = _mm_packus_epi32(round_result_0, round_result_0); const __m128i res_clip_0 = _mm_min_epi16(res_16b_0, clip_pixel_to_bd); const __m128i res_16b_1 = _mm_packus_epi32(round_result_1, round_result_1); const __m128i res_clip_1 = _mm_min_epi16(res_16b_1, clip_pixel_to_bd); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip_0); _mm_storel_epi64( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_clip_1); } else { __m128i res_16b_0 = _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); __m128i res_16b_1 = _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], res_16b_1); } } else { const __m128i res_b0 = convolve(s + 4, coeffs_y); __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); res_b_round0 = _mm_sra_epi32( _mm_add_epi32(res_b_round0, round_const_y), round_shift_y); const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); res_b_round1 = _mm_sra_epi32( _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); if (do_average) { const __m128i data_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_1 = _mm_loadu_si128( (__m128i *)(&dst[i * dst_stride + j + dst_stride])); const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); const __m128i comp_avg_res_lo_0 = highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_lo_1 = highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_0 = highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_1 = highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo_0 = highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_lo_1 = 
highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi_0 = highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi_1 = highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b_0 = _mm_packus_epi32(round_result_lo_0, round_result_hi_0); const __m128i res_clip_0 = _mm_min_epi16(res_16b_0, clip_pixel_to_bd); const __m128i res_16b_1 = _mm_packus_epi32(round_result_lo_1, round_result_hi_1); const __m128i res_clip_1 = _mm_min_epi16(res_16b_1, clip_pixel_to_bd); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip_0); _mm_store_si128( (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_clip_1); } else { __m128i res_16bit0 = _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); __m128i res_16bit1 = _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_16bit1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; s[0 + 8] = s[1 + 8]; s[1 + 8] = s[2 + 8]; s[2 + 8] = s[3 + 8]; s[4 + 8] = s[5 + 8]; s[5 + 8] = s[6 + 8]; s[6 + 8] = s[7 + 8]; s6 = s8; } } } } void av1_highbd_dist_wtd_convolve_x_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; int i, j; __m128i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); const __m128i zero = _mm_setzero_si128(); const __m128i round_const_x = _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi32(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); const __m128i clip_pixel_to_bd = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); assert(bits >= 0); prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ for (i = 0; i < h; i += 1) { const __m128i row00 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i row01 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); // even pixels s[0] = _mm_alignr_epi8(row01, row00, 0); s[1] = _mm_alignr_epi8(row01, row00, 4); s[2] = _mm_alignr_epi8(row01, row00, 8); s[3] = _mm_alignr_epi8(row01, row00, 12); __m128i res_even = convolve(s, coeffs_x); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); // odd pixels s[0] = _mm_alignr_epi8(row01, row00, 2); s[1] = _mm_alignr_epi8(row01, row00, 6); s[2] = _mm_alignr_epi8(row01, row00, 10); s[3] = _mm_alignr_epi8(row01, row00, 14); __m128i res_odd = convolve(s, coeffs_x); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); res_even = _mm_sll_epi32(res_even, round_shift_bits); res_odd = _mm_sll_epi32(res_odd, round_shift_bits); __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); if (w - j < 8) { if (do_average) { const __m128i data_0 = _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i comp_avg_res = highbd_comp_avg_sse4_1( &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result, round_result); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); } } else { __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); if (do_average) { const __m128i data_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m128i round_result_hi = highbd_convolve_rounding_sse2( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m128i res_16b = _mm_packus_epi32(round_result_lo, round_result_hi); const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); } else { __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); } } } } } aom-3.12.1/av1/common/x86/highbd_txfm_utility_sse4.h000066400000000000000000000127351477627663500221720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ #define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ #include /* SSE4.1 */ #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ do { \ __m128i u0, u1, u2, u3; \ u0 = _mm_unpacklo_epi32(x0, x1); \ u1 = _mm_unpackhi_epi32(x0, x1); \ u2 = _mm_unpacklo_epi32(x2, x3); \ u3 = _mm_unpackhi_epi32(x2, x3); \ y0 = _mm_unpacklo_epi64(u0, u2); \ y1 = _mm_unpackhi_epi64(u0, u2); \ y2 = _mm_unpacklo_epi64(u1, u3); \ y3 = _mm_unpackhi_epi64(u1, u3); \ } while (0) static inline void transpose_8x8(const __m128i *in, __m128i *out) { TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], out[15]); } static inline void transpose_16x16(const __m128i *in, __m128i *out) { // Upper left 8x8 TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]); TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24], out[28]); TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9], out[13]); TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25], out[29]); // Upper right 8x8 TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40], out[44]); TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56], out[60]); TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41], out[45]); TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57], out[61]); // Lower left 8x8 TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10], out[14]); TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26], out[30]); TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11], out[15]); TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27], out[31]); // Lower right 8x8 TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42], out[46]); TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58], out[62]); TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43], out[47]); TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59], out[63]); } static inline void transpose_8nx8n(const __m128i *input, __m128i *output, const int width, const int height) { const int numcol = height >> 2; const int numrow = width >> 2; for (int j = 0; j < numrow; j++) { for (int i = 0; i < numcol; i++) { TRANSPOSE_4X4(input[i * width + j + (numrow * 0)], input[i * width + j + (numrow * 1)], input[i * width + j + (numrow * 2)], input[i * width + j + (numrow * 3)], output[j * height + i + (numcol * 0)], output[j * height + i + (numcol * 1)], output[j * height + i + (numcol * 2)], output[j * height + i + (numcol * 3)]); } } } // Note: // rounding = 1 << (bit - 1) static inline __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, const __m128i *w1, const __m128i *n1, const __m128i *rounding, int bit) { __m128i x, y; x = _mm_mullo_epi32(*w0, *n0); y = _mm_mullo_epi32(*w1, *n1); x = _mm_add_epi32(x, y); x = _mm_add_epi32(x, 
*rounding); x = _mm_srai_epi32(x, bit); return x; } static inline __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, const __m128i *rounding, int bit) { __m128i x; x = _mm_mullo_epi32(*w0, *n0); x = _mm_add_epi32(x, *rounding); x = _mm_srai_epi32(x, bit); return x; } typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, int do_cols, int bd, int out_shift); typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, const int num_cols); void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, TX_SIZE tx_size, int eob, const int bd); #endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ aom-3.12.1/av1/common/x86/highbd_warp_affine_avx2.c000066400000000000000000000706571477627663500217160ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "av1/common/warped_motion.h" void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m256i tmp[15]; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); const __m256i reduce_bits_vert_const = _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); const __m256i res_sub_const = _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1))); __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1)); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); const __m256i wt1 = _mm256_set1_epi32(w1); __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1)); __m256i v_zeros = _mm256_setzero_si256(); int ohoriz = 1 << offset_bits_horiz; int mhoriz = 1 << max_bits_horiz; (void)mhoriz; int sx; for (int i = 0; i < p_height; i += 8) { for (int j = 0; j < p_width; j += 8) { // Calculate the center of this 8x8 block, // project to luma coordinates (if in a subsampled chroma plane), // apply the affine transformation, // then convert back to the original coordinates (if necessary) const int32_t src_x = (p_col + j + 4) << subsampling_x; const int32_t src_y = (p_row + i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter if (ix4 <= -7) { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16( (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)))); } } else if (ix4 >= width + 6) { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm256_cvtepi16_epi32( _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - reduce_bits_horiz)))); } } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { int32_t tmp1[8]; for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { const int iy = clamp(iy4 + k, 0, height - 1); sx = sx4 + beta * (k + 4); for (int l = -4; l < 4; ++l) { int ix = ix4 + l - 3; const int offs = sx >> WARPEDDIFF_PREC_BITS; const int16_t *coeffs = av1_warped_filter[offs]; int32_t sum = 1 << offset_bits_horiz; for (int m = 0; m < 8; ++m) { const int sample_x = clamp(ix + m, 0, width - 1); sum += ref[iy * stride + sample_x] * coeffs[m]; } sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum; sx += alpha; } tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1); } } else { if (beta == 0 && alpha == 0) { sx = sx4; __m128i v_01 
= _mm_loadu_si128( (__m128i *) av1_warped_filter[sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 __m256i v_c23 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 __m256i v_c45 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 __m256i v_c67 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; iy = iy * stride; __m256i v_refl = _mm256_inserti128_si256( _mm256_setzero_si256(), _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); v_refl = _mm256_inserti128_si256( v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), 1); // R15 .. R0 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); __m256i v_refu = _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 v_refl = _mm256_inserti128_si256( v_refl, _mm256_extracti128_si256(v_refu, 0), 1); v_refu = _mm256_inserti128_si256( v_refu, _mm256_extracti128_si256(v_ref, 0), 0); __m256i v_sum = _mm256_set1_epi32(ohoriz); __m256i parsum = _mm256_madd_epi16( v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); // R8R7R6..R1R7R6R5..R1R0 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); parsum = _mm256_madd_epi16( v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); parsum = _mm256_madd_epi16( v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); // R12R11..R5R11R10..R5R4 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); parsum = _mm256_madd_epi16( v_c67, _mm256_alignr_epi8(v_refu, v_refl, 12)); // R14R13..R7R13R12..R7R6 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), reduce_bits_horiz); } } else if (alpha == 0) { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; iy = iy * stride; sx = sx4 + beta * (k + 4); __m128i v_01 = _mm_loadu_si128( (__m128i *)av1_warped_filter [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 __m256i v_c23 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 __m256i v_c45 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 __m256i v_c67 = _mm256_broadcastd_epi32( _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 __m256i v_refl = _mm256_inserti128_si256( _mm256_setzero_si256(), _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); v_refl = _mm256_inserti128_si256( v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), 1); // R15 .. 
R0 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); __m256i v_refu = _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 v_refl = _mm256_inserti128_si256( v_refl, _mm256_extracti128_si256(v_refu, 0), 1); v_refu = _mm256_inserti128_si256( v_refu, _mm256_extracti128_si256(v_ref, 0), 0); __m256i v_sum = _mm256_set1_epi32(ohoriz); __m256i parsum = _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); parsum = _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); parsum = _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); parsum = _mm256_madd_epi16(v_c67, _mm256_alignr_epi8(v_refu, v_refl, 12)); __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), reduce_bits_horiz); } } else if (beta == 0) { sx = sx4; __m256i v_coeff01 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff01 = _mm256_inserti128_si256( v_coeff01, _mm_loadu_si128( (__m128i *) av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]), 1); // B7B6..B1B0A7A6..A1A0 __m256i v_coeff23 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff23 = _mm256_inserti128_si256( v_coeff23, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // D7D6..D1D0C7C6..C1C0 __m256i v_coeff45 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff45 = _mm256_inserti128_si256( v_coeff45, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // F7F6..F1F0E7E6..E1E0 __m256i v_coeff67 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff67 = _mm256_inserti128_si256( v_coeff67, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // H7H6..H1H0G7G6..G1G0 __m256i v_c0123 = _mm256_unpacklo_epi32( v_coeff01, v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 __m256i v_c0123u = _mm256_unpackhi_epi32( v_coeff01, v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 __m256i v_c4567 = _mm256_unpacklo_epi32( v_coeff45, v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 __m256i v_c4567u = _mm256_unpackhi_epi32( v_coeff45, v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 __m256i v_c01 = _mm256_unpacklo_epi64( v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 __m256i v_c23 = _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 __m256i v_c45 = _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 __m256i v_c67 = _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6 for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; iy = iy * stride; __m256i v_refl = _mm256_inserti128_si256( _mm256_setzero_si256(), _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); v_refl = _mm256_inserti128_si256( v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), 1); // R15 .. 
R0 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); __m256i v_refu = _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 v_refl = _mm256_inserti128_si256( v_refl, _mm256_extracti128_si256(v_refu, 0), 1); v_refu = _mm256_inserti128_si256( v_refu, _mm256_extracti128_si256(v_ref, 0), 0); __m256i v_sum = _mm256_set1_epi32(ohoriz); __m256i parsum = _mm256_madd_epi16( v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); // R8R7R6..R1R7R6R5..R1R0 __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); parsum = _mm256_madd_epi16( v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); parsum = _mm256_madd_epi16( v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); // R12R11..R5R11R10..R5R4 __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); parsum = _mm256_madd_epi16( v_c67, _mm256_alignr_epi8(v_refu, v_refl, 12)); // R14R13..R7R13R12..R7R6 __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), reduce_bits_horiz); } } else { for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; iy = iy * stride; sx = sx4 + beta * (k + 4); __m256i v_coeff01 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff01 = _mm256_inserti128_si256( v_coeff01, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]), 1); // B7B6..B1B0A7A6..A1A0 __m256i v_coeff23 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff23 = _mm256_inserti128_si256( v_coeff23, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // D7D6..D1D0C7C6..C1C0 __m256i v_coeff45 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff45 = _mm256_inserti128_si256( v_coeff45, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // F7F6..F1F0E7E6..E1E0 __m256i v_coeff67 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff67 = _mm256_inserti128_si256( v_coeff67, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]), 1); // H7H6..H1H0G7G6..G1G0 __m256i v_c0123 = _mm256_unpacklo_epi32( v_coeff01, v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 __m256i v_c0123u = _mm256_unpackhi_epi32( v_coeff01, v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 __m256i v_c4567 = _mm256_unpacklo_epi32( v_coeff45, v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 __m256i v_c4567u = _mm256_unpackhi_epi32( v_coeff45, v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 __m256i v_c01 = _mm256_unpacklo_epi64( v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 __m256i v_c23 = _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 __m256i v_c45 = _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 __m256i v_c67 = _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6 __m256i v_refl = _mm256_inserti128_si256( _mm256_setzero_si256(), _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); v_refl = _mm256_inserti128_si256( v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), 1); // R15 .. 
R0 __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); __m256i v_refu = _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 v_refl = _mm256_inserti128_si256( v_refl, _mm256_extracti128_si256(v_refu, 0), 1); v_refu = _mm256_inserti128_si256( v_refu, _mm256_extracti128_si256(v_ref, 0), 0); __m256i v_sum = _mm256_set1_epi32(ohoriz); __m256i parsum = _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); parsum = _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); parsum = _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); parsum = _mm256_madd_epi16(v_c67, _mm256_alignr_epi8(v_refu, v_refl, 12)); __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), reduce_bits_horiz); } } } // Vertical filter for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); const __m256i *src = tmp + (k + 4); __m256i v_coeff01 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128( (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff01 = _mm256_inserti128_si256( v_coeff01, _mm_loadu_si128( (__m128i *) av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]), 1); __m256i v_coeff23 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff23 = _mm256_inserti128_si256( v_coeff23, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS]), 1); __m256i v_coeff45 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff45 = _mm256_inserti128_si256( v_coeff45, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS]), 1); __m256i v_coeff67 = _mm256_inserti128_si256( v_zeros, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS]), 0); v_coeff67 = _mm256_inserti128_si256( v_coeff67, _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS]), 1); __m256i v_c0123 = _mm256_unpacklo_epi32( v_coeff01, v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 __m256i v_c0123u = _mm256_unpackhi_epi32( v_coeff01, v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 __m256i v_c4567 = _mm256_unpacklo_epi32( v_coeff45, v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 __m256i v_c4567u = _mm256_unpackhi_epi32( v_coeff45, v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 __m256i v_c01 = _mm256_unpacklo_epi64( v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 __m256i v_c23 = _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 __m256i v_c45 = _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 __m256i v_c67 = _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 __m256i v_src01l = _mm256_unpacklo_epi32(src[0], src[1]); // T13T03T11T01T12T02T10T00 __m256i v_src01u = _mm256_unpackhi_epi32(src[0], src[1]); // T17T07T15T05T16T06T14T04 __m256i v_sum = _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u), v_c01); // S7S5S3S1S6S4S2S0 __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]); __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]); v_sum = _mm256_add_epi32( v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23)); __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]); __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]); v_sum = _mm256_add_epi32( v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45)); __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]); __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]); v_sum = _mm256_add_epi32( v_sum, _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67)); // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0 __m256i v_suml = _mm256_permute4x64_epi64(v_sum, 0xD8); // S7S5S6S4S3S1S2S0 __m256i v_sumh = _mm256_permute4x64_epi64(v_sum, 0x32); // S2S0S7S5S2S0S3S1 v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh); // S7S6S5S4S3S2S1S0 if (conv_params->is_compound) { __m128i *const p = (__m128i *)&conv_params ->dst[(i + k + 4) * conv_params->dst_stride + j]; v_sum = _mm256_add_epi32(v_sum, res_add_const); v_sum = _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const), reduce_bits_vert_shift); if (conv_params->do_average) { __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p)); if (conv_params->use_dist_wtd_comp_avg) { v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0), _mm256_mullo_epi32(v_sum, wt1)); v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS); } else { v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1); } __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const); v_sum1 = _mm256_sra_epi32( _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift); __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1); v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8); v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel); _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0)); } else { v_sum = _mm256_packus_epi32(v_sum, v_sum); __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8); _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); } } else { // Round and pack into 8 bits const __m256i round_const = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + ((1 << reduce_bits_vert) >> 1)); __m256i v_sum1 = _mm256_srai_epi32( _mm256_add_epi32(v_sum, round_const), reduce_bits_vert); v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1); __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8); // Clamp res_16bit to the range [0, 2^bd - 1] const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1); const __m256i zero = _mm256_setzero_si256(); v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero); __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); } } } } } aom-3.12.1/av1/common/x86/highbd_warp_plane_sse4.c000066400000000000000000000660431477627663500215550ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "av1/common/warped_motion.h" static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; static const uint8_t highbd_shuffle_alpha0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }; static const uint8_t highbd_shuffle_alpha0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }; static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; static inline void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_2 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_4 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_6 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Filter odd-index pixels const __m128i tmp_1 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_3 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_5 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_7 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11); coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); } static inline void highbd_prepare_horizontal_filter_coeff_alpha0( int sx, __m128i *coeff) { // Filter coeff const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); coeff[0] = _mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); coeff[2] = 
_mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); coeff[4] = _mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); coeff[6] = _mm_shuffle_epi8( tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); coeff[1] = coeff[0]; coeff[3] = coeff[2]; coeff[5] = coeff[4]; coeff[7] = coeff[6]; } static inline void highbd_filter_src_pixels( const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_1 = *src; const __m128i src2_1 = *src2; const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), _mm_cvtsi32_si128(reduce_bits_horiz)); const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), _mm_cvtsi32_si128(reduce_bits_horiz)); // Combine results into one register. // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 // as this order helps with the vertical filter. 
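  // A rough scalar sketch of what the madd/alignr sequence above computes for
  // one output row (illustrative only; `f` below is shorthand for
  // av1_warped_filter[(sx + c * alpha) >> WARPEDDIFF_PREC_BITS], and `ref`,
  // `iy`, `ix4` live in the callers, not in this helper):
  //   for (int c = 0; c < 8; ++c) {
  //     int32_t sum = 1 << offset_bits_horiz;
  //     for (int m = 0; m < 8; ++m)
  //       sum += ref[iy * stride + ix4 - 7 + c + m] * f[c][m];
  //     out[c] = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
  //   }
  // The packs below keeps the even columns in the low 64 bits and the odd
  // columns in the high 64 bits, which is the 0 2 4 6 1 3 5 7 order noted
  // above.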
tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); } static inline void highbd_horiz_filter(const __m128i *src, const __m128i *src2, __m128i *tmp, int sx, int alpha, int k, const int offset_bits_horiz, const int reduce_bits_horiz) { __m128i coeff[8]; highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } static inline void highbd_warp_horizontal_filter_alpha0_beta0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)beta; (void)alpha; int k; __m128i coeff[8]; highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void highbd_warp_horizontal_filter_alpha0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)alpha; int k; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); __m128i coeff[8]; highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void highbd_warp_horizontal_filter_beta0( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)beta; int k; __m128i coeff[8]; highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void highbd_warp_horizontal_filter( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { int k; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, reduce_bits_horiz); } } static inline void highbd_prepare_warp_horizontal_filter( const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, 
int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { if (alpha == 0 && beta == 0) highbd_warp_horizontal_filter_alpha0_beta0( ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else if (alpha == 0 && beta != 0) highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else if (alpha != 0 && beta == 0) highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); } void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); assert(!(bd == 12 && reduce_bits_horiz < 5)); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); // Check that, even with 12-bit input, the intermediate values will fit // into an unsigned 16-bit intermediate array. assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); const __m128i reduce_bits_vert_const = _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; const __m128i res_sub_const = _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1))); __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); const __m128i wt1 = _mm_set1_epi32(w1); /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. By the time we get here, other code will have set up this border, but we allow an explicit check for debugging purposes. 
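     The commented-out loop below is that check: for every row it asserts that
     the 13 samples immediately to the left of column 0 replicate
     ref[i * stride] and that the 13 samples to the right of column
     (width - 1) replicate ref[i * stride + (width - 1)], i.e. plain edge
     replication of the frame border.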
*/ /*for (i = 0; i < height; ++i) { for (j = 0; j < 13; ++j) { assert(ref[i * stride - 13 + j] == ref[i * stride]); assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); } }*/ for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { const int32_t src_x = (p_col + j + 4) << subsampling_x; const int32_t src_y = (p_row + i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter // If the block is aligned such that, after clamping, every sample // would be taken from the leftmost/rightmost column, then we can // skip the expensive horizontal filter. if (ix4 <= -7) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - reduce_bits_horiz))); } } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m128i src2 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); const __m128i src_01 = _mm_shuffle_epi8( src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); const __m128i src2_01 = _mm_shuffle_epi8( src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); if (out_of_boundary_left >= 0) { const __m128i shuffle_reg_left = _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); } if (out_of_boundary_right >= 0) { const __m128i shuffle_reg_right = _mm_loadu_si128( (__m128i *)warp_pad_right[out_of_boundary_right]); src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right); } const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, 
offset_bits_horiz, reduce_bits_horiz); } } else { highbd_prepare_warp_horizontal_filter( ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); } // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 const __m128i *src = tmp + (k + 4); const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); // Filter even-index pixels const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_2 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_4 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_6 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); const __m128i tmp_1 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_3 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_5 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_7 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); if (conv_params->is_compound) { __m128i *const p = (__m128i *)&conv_params ->dst[(i + k + 4) * conv_params->dst_stride + j]; res_lo = _mm_add_epi32(res_lo, res_add_const); res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), reduce_bits_vert_shift); if (conv_params->do_average) { __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); if (conv_params->use_dist_wtd_comp_avg) { res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(res_lo, wt1)); res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); } else { res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); } __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), round_bits_shift); __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); res16_lo = _mm_min_epi16(res16_lo, clip_pixel); _mm_storel_epi64(dst16, res16_lo); } else { res_lo = _mm_packus_epi32(res_lo, res_lo); _mm_storel_epi64(p, res_lo); } if (p_width > 4) { __m128i *const p4 = (__m128i *)&conv_params ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; res_hi = _mm_add_epi32(res_hi, res_add_const); res_hi = _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), reduce_bits_vert_shift); if (conv_params->do_average) { __m128i *const dst16_4 = (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); if (conv_params->use_dist_wtd_comp_avg) { res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), _mm_mullo_epi32(res_hi, wt1)); res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); } else { res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); } __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); res32_hi = _mm_sra_epi32( _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); res16_hi = _mm_min_epi16(res16_hi, clip_pixel); _mm_storel_epi64(dst16_4, res16_hi); } else { res_hi = _mm_packus_epi32(res_hi, res_hi); _mm_storel_epi64(p4, res_hi); } } } else { // Round and pack into 8 bits const __m128i round_const = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + ((1 << reduce_bits_vert) >> 1)); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), reduce_bits_vert); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), reduce_bits_vert); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); // Clamp res_16bit to the range [0, 2^bd - 1] const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); const __m128i zero = _mm_setzero_si128(); res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); // Store, blending with 'pred' if needed __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. if (p_width == 4) { _mm_storel_epi64(p, res_16bit); } else { _mm_storeu_si128(p, res_16bit); } } } } } } aom-3.12.1/av1/common/x86/highbd_wiener_convolve_avx2.c000066400000000000000000000265221477627663500226310ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" // 128-bit xmmwords are written as [ ... ] with the MSB on the left. // 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB // on the left. // A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be // loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. void av1_highbd_wiener_convolve_add_src_avx2( const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd) { assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); (void)x_step_q4; (void)y_step_q4; const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); DECLARE_ALIGNED(32, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); int intermediate_height = h + SUBPEL_TAPS - 1; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero_128 = _mm_setzero_si128(); const __m256i zero_256 = _mm256_setzero_si256(); // Add an offset to account for the "add_src" part of the convolve function. 
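/*
 * Illustrative aside (not part of the upstream file): the "add_src" variant
 * computes src plus the response of the transmitted Wiener taps in a single
 * pass. The taps are stored in FILTER_BITS precision with the center tap at
 * index 3, so adding 1 << FILTER_BITS to that lane (which is what the
 * 'offset' vector initialised just below does) folds the source pixel into
 * the dot product at unit weight. A guarded-out scalar sketch of the same
 * idea, using a hypothetical helper name:
 */
#if 0
static int32_t wiener_add_src_sketch(const uint16_t *src, const int16_t *f) {
  // Copy the 8 taps and fold the source pixel into the center tap.
  int16_t taps[8];
  for (int k = 0; k < 8; ++k) taps[k] = f[k];
  taps[3] += 1 << FILTER_BITS;  // same effect as adding the 'offset' vector
  // Plain 8-tap dot product; src points 3 samples before the center pixel.
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += taps[k] * src[k];
  return sum;  // caller still applies the round_0 rounding and clamping
}
#endif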
const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); const __m256i clamp_low = zero_256; /* Horizontal filter */ { const __m256i clamp_high_ep = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); const __m256i round_const = _mm256_set1_epi32( (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); for (int i = 0; i < intermediate_height; ++i) { for (int j = 0; j < w; j += 16) { const uint16_t *src_ij = src_ptr + i * src_stride + j; // Load 16-bit src data const __m256i src_0 = yy_loadu_256(src_ij + 0); const __m256i src_1 = yy_loadu_256(src_ij + 1); const __m256i src_2 = yy_loadu_256(src_ij + 2); const __m256i src_3 = yy_loadu_256(src_ij + 3); const __m256i src_4 = yy_loadu_256(src_ij + 4); const __m256i src_5 = yy_loadu_256(src_ij + 5); const __m256i src_6 = yy_loadu_256(src_ij + 6); const __m256i src_7 = yy_loadu_256(src_ij + 7); // Multiply src data by filter coeffs and sum pairs const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); // Calculate scalar product for even- and odd-indices separately, // increasing to 32-bit precision const __m256i res_even_sum = _mm256_add_epi32( _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); const __m256i res_even = _mm256_srai_epi32( _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); const __m256i res_odd_sum = _mm256_add_epi32( _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); const __m256i res_odd = _mm256_srai_epi32( _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); // Reduce to 16-bit precision and pack even- and odd-index results // back into one register. 
The _mm256_packs_epi32 intrinsic returns // a register with the pixels ordered as follows: // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] const __m256i res = _mm256_packs_epi32(res_even, res_odd); const __m256i res_clamped = _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); // Store in a temporary array yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); } } } /* Vertical filter */ { const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); const __m256i round_const = _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - (1 << (bd + conv_params->round_1 - 1))); for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; // Load 16-bit data from the output of the horizontal filter in // which the pixels are ordered as follows: // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); // Filter the even-indices, increasing to 32-bit precision const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); const __m256i res_even = _mm256_add_epi32( _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); // Filter the odd-indices, increasing to 32-bit precision const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); const __m256i src_5 = 
_mm256_unpackhi_epi16(data_4, data_5); const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); const __m256i res_odd = _mm256_add_epi32( _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); // Pixels are currently in the following order: // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] // // Rearrange the pixels into the following order: // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); const __m256i res_lo_round = _mm256_srai_epi32( _mm256_add_epi32(res_lo, round_const), conv_params->round_1); const __m256i res_hi_round = _mm256_srai_epi32( _mm256_add_epi32(res_hi, round_const), conv_params->round_1); // Reduce to 16-bit precision and pack into the correct order: // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); const __m256i res_16bit_clamped = _mm256_min_epi16( _mm256_max_epi16(res_16bit, clamp_low), clamp_high); // Store in the dst array yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); } } } } aom-3.12.1/av1/common/x86/highbd_wiener_convolve_ssse3.c000066400000000000000000000206521477627663500230070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" void av1_highbd_wiener_convolve_add_src_ssse3( const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params, int bd) { assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); (void)x_step_q4; (void)y_step_q4; const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); DECLARE_ALIGNED(16, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); int intermediate_height = h + SUBPEL_TAPS - 1; int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero = _mm_setzero_si128(); // Add an offset to account for the "add_src" part of the convolve function. 
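/*
 * Illustrative aside (not part of the upstream file): the kernels below
 * evaluate the 8-tap filter as four two-tap partial sums. Each
 * _mm_madd_epi16 lane computes s0 * f0 + s1 * f1 for one pair of taps
 * (adjacent columns in the horizontal pass, adjacent filtered rows in the
 * vertical pass), and the four partial sums are added in 32-bit precision
 * before rounding. A guarded-out scalar sketch of the same decomposition,
 * using a hypothetical helper name:
 */
#if 0
static int32_t eight_tap_as_pairs_sketch(const int16_t s[8],
                                         const int16_t f[8]) {
  /* Same grouping as the SIMD code: pairs (0,1), (2,3), (4,5), (6,7). */
  const int32_t pair01 = s[0] * f[0] + s[1] * f[1];
  const int32_t pair23 = s[2] * f[2] + s[3] * f[3];
  const int32_t pair45 = s[4] * f[4] + s[5] * f[5];
  const int32_t pair67 = s[6] * f[6] + s[7] * f[7];
  /* Matches the res_even/res_odd accumulation order used below. */
  return (pair01 + pair45) + (pair23 + pair67);
}
#endif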
const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); /* Horizontal filter */ { const __m128i coeffs_x = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i data2 = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); // Filter even-index pixels const __m128i res_0 = _mm_madd_epi16(data, coeff_01); const __m128i res_2 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); const __m128i res_4 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); const __m128i res_6 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), conv_params->round_0); // Filter odd-index pixels const __m128i res_1 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); const __m128i res_3 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); const __m128i res_5 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); const __m128i res_7 = _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), conv_params->round_0); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 const __m128i maxval = _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); __m128i res = _mm_packs_epi32(res_even, res_odd); res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); } } } /* Vertical filter */ { const __m128i coeffs_y = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - (1 << (bd + conv_params->round_1 - 1))); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_4 = 
_mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), conv_params->round_1); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), conv_params->round_1); const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storeu_si128(p, res_16bit); } } } } aom-3.12.1/av1/common/x86/intra_edge_sse4.c000066400000000000000000000265261477627663500202230ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { if (!strength) return; DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 }; DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, }; // Extend the first and last samples to simplify the loop for the 5-tap case p[-1] = p[0]; __m128i last = _mm_set1_epi8((char)p[sz - 1]); _mm_storeu_si128((__m128i *)&p[sz], last); // Adjust input pointer for filter support area uint8_t *in = (strength == 3) ? p - 1 : p; // Avoid modifying first sample uint8_t *out = p + 1; int len = sz - 1; const int use_3tap_filter = (strength < 3); if (use_3tap_filter) { __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); __m128i in0 = _mm_lddqu_si128((__m128i *)in); while (len > 0) { int n_out = (len < 8) ? len : 8; __m128i d0 = _mm_shuffle_epi8(in0, shuf0); __m128i d1 = _mm_shuffle_epi8(in0, shuf1); d0 = _mm_maddubs_epi16(d0, coef0); d1 = _mm_maddubs_epi16(d1, coef0); d0 = _mm_hadd_epi16(d0, d1); __m128i eight = _mm_set1_epi16(8); d0 = _mm_add_epi16(d0, eight); d0 = _mm_srai_epi16(d0, 4); d0 = _mm_packus_epi16(d0, d0); __m128i out0 = _mm_lddqu_si128((__m128i *)out); __m128i n0 = _mm_set1_epi8(n_out); __m128i mask = _mm_cmpgt_epi8(n0, iden); out0 = _mm_blendv_epi8(out0, d0, mask); _mm_storel_epi64((__m128i *)out, out0); __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); in0 = _mm_alignr_epi8(in1, in0, 8); in += 8; out += 8; len -= n_out; } } else { // 5-tap filter __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); __m128i two = _mm_set1_epi8(2); __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); __m128i shuf_b = _mm_add_epi8(shuf_a, two); __m128i shuf_c = _mm_add_epi8(shuf_b, two); __m128i shuf_d = _mm_add_epi8(shuf_c, two); __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); __m128i in0 = _mm_lddqu_si128((__m128i *)in); while (len > 0) { int n_out = (len < 8) ? 
len : 8; __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); d0 = _mm_maddubs_epi16(d0, coef0); d1 = _mm_maddubs_epi16(d1, coef0); d2 = _mm_maddubs_epi16(d2, coef0); d3 = _mm_maddubs_epi16(d3, coef0); d0 = _mm_hadd_epi16(d0, d1); d2 = _mm_hadd_epi16(d2, d3); d0 = _mm_hadd_epi16(d0, d2); __m128i eight = _mm_set1_epi16(8); d0 = _mm_add_epi16(d0, eight); d0 = _mm_srai_epi16(d0, 4); d0 = _mm_packus_epi16(d0, d0); __m128i out0 = _mm_lddqu_si128((__m128i *)out); __m128i n0 = _mm_set1_epi8(n_out); __m128i mask = _mm_cmpgt_epi8(n0, iden); out0 = _mm_blendv_epi8(out0, d0, mask); _mm_storel_epi64((__m128i *)out, out0); __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); in0 = _mm_alignr_epi8(in1, in0, 8); in += 8; out += 8; len -= n_out; } } } void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { // interpolate half-sample positions assert(sz <= 24); DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } }; DECLARE_ALIGNED( 16, static const int8_t, v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; // Extend first/last samples (upper-left p[-1], last p[sz-1]) // to support 4-tap filter p[-2] = p[-1]; p[sz] = p[sz - 1]; uint8_t *in = &p[-2]; uint8_t *out = &p[-2]; int n = sz + 1; // Input length including upper-left sample __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); while (n > 0) { __m128i in8 = _mm_alignr_epi8(in16, in0, 8); __m128i d0 = _mm_shuffle_epi8(in0, shuf0); __m128i d1 = _mm_shuffle_epi8(in0, shuf1); __m128i d2 = _mm_shuffle_epi8(in8, shuf0); __m128i d3 = _mm_shuffle_epi8(in8, shuf1); d0 = _mm_maddubs_epi16(d0, coef0); d1 = _mm_maddubs_epi16(d1, coef0); d2 = _mm_maddubs_epi16(d2, coef0); d3 = _mm_maddubs_epi16(d3, coef0); d0 = _mm_hadd_epi16(d0, d1); d2 = _mm_hadd_epi16(d2, d3); __m128i eight = _mm_set1_epi16(8); d0 = _mm_add_epi16(d0, eight); d2 = _mm_add_epi16(d2, eight); d0 = _mm_srai_epi16(d0, 4); d2 = _mm_srai_epi16(d2, 4); d0 = _mm_packus_epi16(d0, d2); __m128i in1 = _mm_alignr_epi8(in16, in0, 1); __m128i out0 = _mm_unpacklo_epi8(in1, d0); __m128i out1 = _mm_unpackhi_epi8(in1, d0); _mm_storeu_si128((__m128i *)&out[0], out0); _mm_storeu_si128((__m128i *)&out[16], out1); in0 = in16; in16 = _mm_setzero_si128(); out += 32; n -= 16; } } #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) { if (!strength) return; DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 }; DECLARE_ALIGNED(16, static const int16_t, v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; // Extend the first and last samples to simplify the loop for the 5-tap case p[-1] = p[0]; __m128i last = _mm_set1_epi16(p[sz - 1]); _mm_storeu_si128((__m128i *)&p[sz], last); // Adjust input pointer for filter support area uint16_t *in = (strength == 3) ? 
p - 1 : p; // Avoid modifying first sample uint16_t *out = p + 1; int len = sz - 1; const int use_3tap_filter = (strength < 3); if (use_3tap_filter) { __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); while (len > 0) { int n_out = (len < 8) ? len : 8; __m128i in1 = _mm_alignr_epi8(in8, in0, 2); __m128i in2 = _mm_alignr_epi8(in8, in0, 4); __m128i in02 = _mm_add_epi16(in0, in2); __m128i d0 = _mm_unpacklo_epi16(in02, in1); __m128i d1 = _mm_unpackhi_epi16(in02, in1); d0 = _mm_mullo_epi16(d0, coef0); d1 = _mm_mullo_epi16(d1, coef0); d0 = _mm_hadd_epi16(d0, d1); __m128i eight = _mm_set1_epi16(8); d0 = _mm_add_epi16(d0, eight); d0 = _mm_srli_epi16(d0, 4); __m128i out0 = _mm_lddqu_si128((__m128i *)out); __m128i n0 = _mm_set1_epi16(n_out); __m128i mask = _mm_cmpgt_epi16(n0, iden); out0 = _mm_blendv_epi8(out0, d0, mask); _mm_storeu_si128((__m128i *)out, out0); in += 8; in0 = in8; in8 = _mm_lddqu_si128((__m128i *)&in[8]); out += 8; len -= n_out; } } else { // 5-tap filter __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); while (len > 0) { int n_out = (len < 8) ? len : 8; __m128i in1 = _mm_alignr_epi8(in8, in0, 2); __m128i in2 = _mm_alignr_epi8(in8, in0, 4); __m128i in3 = _mm_alignr_epi8(in8, in0, 6); __m128i in4 = _mm_alignr_epi8(in8, in0, 8); __m128i in04 = _mm_add_epi16(in0, in4); __m128i in123 = _mm_add_epi16(in1, in2); in123 = _mm_add_epi16(in123, in3); __m128i d0 = _mm_unpacklo_epi16(in04, in123); __m128i d1 = _mm_unpackhi_epi16(in04, in123); d0 = _mm_mullo_epi16(d0, coef0); d1 = _mm_mullo_epi16(d1, coef0); d0 = _mm_hadd_epi16(d0, d1); __m128i eight = _mm_set1_epi16(8); d0 = _mm_add_epi16(d0, eight); d0 = _mm_srli_epi16(d0, 4); __m128i out0 = _mm_lddqu_si128((__m128i *)out); __m128i n0 = _mm_set1_epi16(n_out); __m128i mask = _mm_cmpgt_epi16(n0, iden); out0 = _mm_blendv_epi8(out0, d0, mask); _mm_storeu_si128((__m128i *)out, out0); in += 8; in0 = in8; in8 = _mm_lddqu_si128((__m128i *)&in[8]); out += 8; len -= n_out; } } } void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) { // interpolate half-sample positions assert(sz <= 24); DECLARE_ALIGNED(16, static const int16_t, kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; // Extend first/last samples (upper-left p[-1], last p[sz-1]) // to support 4-tap filter p[-2] = p[-1]; p[sz] = p[sz - 1]; uint16_t *in = &p[-2]; uint16_t *out = in; int n = sz + 1; __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); while (n > 0) { __m128i in1 = _mm_alignr_epi8(in8, in0, 2); __m128i in2 = _mm_alignr_epi8(in8, in0, 4); __m128i in3 = _mm_alignr_epi8(in8, in0, 6); __m128i sum0 = _mm_add_epi16(in0, in3); __m128i sum1 = _mm_add_epi16(in1, in2); __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); d0 = _mm_madd_epi16(d0, coef0); d1 = _mm_madd_epi16(d1, coef0); __m128i eight = _mm_set1_epi32(8); d0 = _mm_add_epi32(d0, eight); d1 = _mm_add_epi32(d1, eight); d0 = _mm_srai_epi32(d0, 4); d1 = _mm_srai_epi32(d1, 4); d0 = _mm_packus_epi32(d0, d1); __m128i max0 = 
_mm_set1_epi16((1 << bd) - 1); d0 = _mm_min_epi16(d0, max0); __m128i out0 = _mm_unpacklo_epi16(in1, d0); __m128i out1 = _mm_unpackhi_epi16(in1, d0); _mm_storeu_si128((__m128i *)&out[0], out0); _mm_storeu_si128((__m128i *)&out[8], out1); in0 = in8; in8 = in16; in16 = in24; in24 = _mm_setzero_si128(); out += 16; n -= 8; } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/jnt_convolve_avx2.c000066400000000000000000001513051477627663500206240ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" #include "aom_dsp/x86/convolve_sse4_1.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "av1/common/convolve.h" static inline __m256i unpack_weights_avx2(ConvolveParams *conv_params) { const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); return wt; } static inline __m256i load_line2_avx2(const void *a, const void *b) { return _mm256_permute2x128_si256( _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); } void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; int i, j, is_horiz_4tap = 0; const int bits = FILTER_BITS - conv_params->round_1; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); assert(bits >= 0); assert(conv_params->round_0 > 0); const __m256i round_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); __m256i filt[4], coeffs[4]; filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); // Condition for checking valid horz_filt taps if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) is_horiz_4tap = 1; // horz_filt as 4 tap if (is_horiz_4tap) { const int fo_horiz = 1; const uint8_t *const src_ptr = src - fo_horiz; for (i = 0; i < h; i += 2) { 
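/*
 * Illustrative aside (not part of the upstream file): in compound prediction
 * the first reference's result is kept in conv_params->dst at intermediate
 * precision with a positive offset so it stays unsigned; on the second pass
 * (do_average set) it is blended with the current result either as a plain
 * average or with the distance weights fwd_offset / bck_offset, whose sum is
 * 1 << DIST_PRECISION_BITS, and the offset is removed when rounding back to
 * 8-bit pixels. A guarded-out scalar sketch of that blend, mirroring what
 * comp_avg() and convolve_rounding() do, with a hypothetical helper name:
 */
#if 0
static uint8_t dist_wtd_blend_sketch(int32_t ref0, int32_t cur, int w0, int w1,
                                     int use_dist_wtd, int offset,
                                     int rounding_shift) {
  const int32_t avg = use_dist_wtd
                          ? (ref0 * w0 + cur * w1) >> DIST_PRECISION_BITS
                          : (ref0 + cur) >> 1;
  /* Subtract the compound offset, then round down to pixel precision. */
  const int32_t px = ROUND_POWER_OF_TWO(avg - offset, rounding_shift);
  return (uint8_t)clamp(px, 0, 255);
}
#endif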
const uint8_t *src_data = src_ptr + i * src_stride; CONV_BUF_TYPE *dst_data = dst + i * dst_stride; for (j = 0; j < w; j += 8) { const __m256i data = load_line2_avx2(&src_data[j], &src_data[j + src_stride]); __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); res = _mm256_slli_epi16(res, bits); const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); if (w > 4) { _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = _mm_cvtsi128_si32(res_1); } } else { const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } } } else { const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_horiz; filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); for (i = 0; i < h; i += 2) { const uint8_t *src_data = src_ptr + i * src_stride; CONV_BUF_TYPE *dst_data = dst + i * dst_stride; for (j = 0; j < w; j += 8) { const __m256i data = load_line2_avx2(&src_data[j], &src_data[j + src_stride]); __m256i res = convolve_lowbd_x(data, coeffs, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); res = _mm256_slli_epi16(res, bits); const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); if (w > 4) { _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = _mm_cvtsi128_si32(res_1); } } else { const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } } } } void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, 
int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; int i, j, is_vert_4tap = 0; // +1 to compensate for dividing the filter coeffs by 2 const int left_shift = FILTER_BITS - conv_params->round_0 + 1; const __m256i round_const = _mm256_set1_epi32((1 << conv_params->round_1) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi16(offset); const int offset_1 = (1 << (bd + FILTER_BITS - 2)); const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); const __m256i zero = _mm256_setzero_si256(); __m256i coeffs[4], s[8]; assert((FILTER_BITS - conv_params->round_0) >= 0); prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); // Condition for checking valid vert_filt taps if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) is_vert_4tap = 1; if (is_vert_4tap) { const int fo_vert = 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; for (j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src4; // Load lines a and b. Line a to lower 128, line b to upper 128 { __m256i src_ab[4]; __m256i src_a[5]; src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); for (int kk = 0; kk < 4; ++kk) { data += src_stride; src_a[kk + 1] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); } src4 = src_a[4]; s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); } for (i = 0; i < h; i += 2) { data = &src_ptr[(i + 5) * src_stride + j]; const __m256i src5 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); src4 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + src_stride))); const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); res_lo = _mm256_add_epi16(res_lo, offset_const_1); const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); const __m256i res_lo_0_shift = _mm256_slli_epi32(res_lo_0_32b, left_shift); const __m256i res_lo_0_round = _mm256_sra_epi32( _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); const __m256i res_lo_1_shift = _mm256_slli_epi32(res_lo_1_32b, left_shift); const __m256i res_lo_1_round = _mm256_sra_epi32( _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); const __m256i res_lo_round = _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); const __m256i res_lo_unsigned = _mm256_add_epi16(res_lo_round, offset_const_2); if 
(w - j < 16) { if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); if (w - j > 4) { _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = _mm_cvtsi128_si32(res_1); } } else { const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); res_hi = _mm256_add_epi16(res_hi, offset_const_1); const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); const __m256i res_hi_0_shift = _mm256_slli_epi32(res_hi_0_32b, left_shift); const __m256i res_hi_0_round = _mm256_sra_epi32( _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); const __m256i res_hi_1_shift = _mm256_slli_epi32(res_hi_1_32b, left_shift); const __m256i res_hi_1_round = _mm256_sra_epi32( _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); const __m256i res_hi_round = _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); const __m256i res_hi_unsigned = _mm256_add_epi16(res_hi_round, offset_const_2); if (do_average) { const __m256i data_ref_0_lo = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i data_ref_0_hi = load_line2_avx2(&dst[i * dst_stride + j + 8], &dst[i * dst_stride + j + 8 + dst_stride]); const __m256i comp_avg_res_lo = comp_avg( &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i comp_avg_res_hi = comp_avg( &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result_lo = convolve_rounding(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = convolve_rounding(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result_lo, round_result_hi); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_lo_1); const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); _mm_store_si128( (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); } } s[0] = s[1]; s[1] = 
s[2]; s[3] = s[4]; s[4] = s[5]; } } } else { const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride; for (j = 0; j < w; j += 16) { const uint8_t *data = &src_ptr[j]; __m256i src6; // Load lines a and b. Line a to lower 128, line b to upper 128 { __m256i src_ab[7]; __m256i src_a[7]; src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); for (int kk = 0; kk < 6; ++kk) { data += src_stride; src_a[kk + 1] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); } src6 = src_a[6]; s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); } for (i = 0; i < h; i += 2) { data = &src_ptr[(i + 7) * src_stride + j]; const __m256i src7 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); src6 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(data + src_stride))); const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); __m256i res_lo = convolve_lowbd(s, coeffs); res_lo = _mm256_add_epi16(res_lo, offset_const_1); const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); const __m256i res_lo_0_shift = _mm256_slli_epi32(res_lo_0_32b, left_shift); const __m256i res_lo_0_round = _mm256_sra_epi32( _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); const __m256i res_lo_1_shift = _mm256_slli_epi32(res_lo_1_32b, left_shift); const __m256i res_lo_1_round = _mm256_sra_epi32( _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); const __m256i res_lo_round = _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); const __m256i res_lo_unsigned = _mm256_add_epi16(res_lo_round, offset_const_2); if (w - j < 16) { if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); if (w - j > 4) { _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = _mm_cvtsi128_si32(res_1); } } else { const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { __m256i res_hi = convolve_lowbd(s + 4, coeffs); res_hi = _mm256_add_epi16(res_hi, offset_const_1); const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); const __m256i res_hi_0_shift = _mm256_slli_epi32(res_hi_0_32b, left_shift); const 
__m256i res_hi_0_round = _mm256_sra_epi32( _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); const __m256i res_hi_1_shift = _mm256_slli_epi32(res_hi_1_32b, left_shift); const __m256i res_hi_1_round = _mm256_sra_epi32( _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); const __m256i res_hi_round = _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); const __m256i res_hi_unsigned = _mm256_add_epi16(res_hi_round, offset_const_2); if (do_average) { const __m256i data_ref_0_lo = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i data_ref_0_hi = load_line2_avx2(&dst[i * dst_stride + j + 8], &dst[i * dst_stride + j + 8 + dst_stride]); const __m256i comp_avg_res_lo = comp_avg( &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i comp_avg_res_hi = comp_avg( &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result_lo = convolve_rounding(&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); const __m256i round_result_hi = convolve_rounding(&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result_lo, round_result_hi); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_store_si128( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_lo_1); const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); _mm_store_si128( (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); } } s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } } } } void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); int im_stride = 8; int i, is_horiz_4tap = 0, is_vert_4tap = 0; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); assert(conv_params->round_0 > 0); const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); const __m256i round_const_v = 
_mm256_set1_epi32( ((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); __m256i filt[4], coeffs_x[4], coeffs_y[4]; filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); // Condition for checking valid horz_filt taps if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) is_horiz_4tap = 1; // Condition for checking valid vert_filt taps if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) is_vert_4tap = 1; if (is_horiz_4tap) { int im_h = h + filter_params_y->taps - 1; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; for (int j = 0; j < w; j += 8) { /* Horizontal filter */ const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); if (i + 1 < im_h) data = _mm256_inserti128_si256( data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); src_h += (src_stride << 1); __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; } } else if (is_vert_4tap) { int im_h = h + 3; const int fo_vert = 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); for (int j = 0; j < w; j += 8) { /* Horizontal filter */ const uint8_t *src_h = src_ptr + j; DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; /* Vertical filter */ __m256i s[6]; __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); s[3] = _mm256_unpackhi_epi16(s0, s1); s[4] = _mm256_unpackhi_epi16(s2, s3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); s[2] = _mm256_unpacklo_epi16(s4, s5); s[5] = _mm256_unpackhi_epi16(s4, s5); const __m256i res_a = convolve_4tap(s, coeffs_y + 1); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_v), round_shift_v); if (w - j > 4) { const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_v), round_shift_v); const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( 
&comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); _mm_storel_epi64( (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); } else { const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } else { const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); if (do_average) { const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); const __m128i res_0 = _mm256_castsi256_si128(res_8); const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = _mm_cvtsi128_si32(res_1); } else { const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), res_1); } } s[0] = s[1]; s[1] = s[2]; s[3] = s[4]; s[4] = s[5]; } } } else { int im_h = h + filter_params_y->taps - 1; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); for (int j = 0; j < w; j += 8) { /* Horizontal filter */ const uint8_t *src_h = src_ptr + j; DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; } } } #define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ do { \ src_0 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ src_1 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ src_2 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ src_3 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ \ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ \ src_0 = _mm256_add_epi16(src_0, offset_const); \ src_1 = _mm256_add_epi16(src_1, offset_const); \ src_2 = _mm256_add_epi16(src_2, offset_const); \ src_3 = _mm256_add_epi16(src_3, offset_const); \ \ _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ } while (0) #define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) static 
inline void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, const __m256i offset_const) { int i = h; if (w >= 16) { __m256i src_0, src_1, src_2, src_3; if (w == 128) { do { DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); src += 1 * src_stride; dst += 1 * dst_stride; i -= 1; } while (i); } else if (w == 64) { do { DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); src += 1 * src_stride; dst += 1 * dst_stride; i -= 1; } while (i); } else if (w == 32) { do { DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); src += 2 * src_stride; dst += 2 * dst_stride; i -= 2; } while (i); } else if (w == 16) { do { DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); src += 4 * src_stride; dst += 4 * dst_stride; i -= 4; } while (i); } } else { const __m256i zero = _mm256_setzero_si256(); do { const __m128i src_row_0 = _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); const __m128i src_row_1 = _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); const __m128i src_row_2 = _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); const __m128i src_row_3 = _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); __m256i src_10 = _mm256_insertf128_si256( _mm256_castsi128_si256(src_row_0), src_row_1, 1); __m256i src_32 = _mm256_insertf128_si256( _mm256_castsi128_si256(src_row_2), src_row_3, 1); src_10 = _mm256_unpacklo_epi8(src_10, zero); src_32 = _mm256_unpacklo_epi8(src_32, zero); src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); src_10 = _mm256_add_epi16(src_10, offset_const); src_32 = _mm256_add_epi16(src_32, offset_const); // Accumulate values into the destination buffer _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), _mm256_castsi256_si128(src_10)); _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), _mm256_extracti128_si256(src_10, 1)); _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), _mm256_castsi256_si128(src_32)); _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), _mm256_extracti128_si256(src_32, 1)); src += 4 * src_stride; dst += 4 * dst_stride; i -= 4; } while (i); } } #define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ do { \ src_0 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ src_1 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ src_2 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ src_3 = _mm256_cvtepu8_epi16( \ _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ \ src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ src_0 = _mm256_add_epi16(src_0, offset_const); \ src_1 = _mm256_add_epi16(src_1, offset_const); \ src_2 = _mm256_add_epi16(src_2, offset_const); \ src_3 = _mm256_add_epi16(src_3, offset_const); \ \ ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ \ res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ \ res_0 = 
convolve_rounding(&res_0, &offset_const, &rounding_const, \ rounding_shift); \ res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ rounding_shift); \ res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ rounding_shift); \ res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ rounding_shift); \ \ res_10 = _mm256_packus_epi16(res_0, res_1); \ res_32 = _mm256_packus_epi16(res_2, res_3); \ res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ \ _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ _mm256_castsi256_si128(res_10)); \ _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ _mm256_extracti128_si256(res_10, 1)); \ _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ _mm256_castsi256_si128(res_32)); \ _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ _mm256_extracti128_si256(res_32, 1)); \ } while (0) #define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ int i = h; \ if (w >= 16) { \ __m256i src_0, src_1, src_2, src_3; \ __m256i ref_0, ref_1, ref_2, ref_3; \ __m256i res_0, res_1, res_2, res_3; \ __m256i res_10, res_32; \ if (w == 128) { \ do { \ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ i -= 1; \ src += 1 * src_stride; \ dst += 1 * dst_stride; \ dst0 += 1 * dst_stride0; \ } while (i); \ } else if (w == 64) { \ do { \ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ \ i -= 1; \ src += 1 * src_stride; \ dst += 1 * dst_stride; \ dst0 += 1 * dst_stride0; \ } while (i); \ } else if (w == 32) { \ do { \ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ \ i -= 2; \ src += 2 * src_stride; \ dst += 2 * dst_stride; \ dst0 += 2 * dst_stride0; \ } while (i); \ } else { \ assert(w == 16); \ do { \ DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ \ i -= 4; \ src += 4 * src_stride; \ dst += 4 * dst_stride; \ dst0 += 4 * dst_stride0; \ } while (i); \ } \ } else if (w == 8) { \ do { \ const __m128i src_0 = \ _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ const __m128i src_1 = \ _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ const __m128i src_2 = \ _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ const __m128i src_3 = \ _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ __m256i src_10 = \ _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ __m256i src_32 = \ _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ \ src_10 = _mm256_unpacklo_epi8(src_10, zero); \ src_32 = _mm256_unpacklo_epi8(src_32, zero); \ \ src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ \ src_10 = _mm256_add_epi16(src_10, offset_const); \ src_32 = _mm256_add_epi16(src_32, offset_const); \ \ const __m256i ref_10 = \ load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ const __m256i ref_32 = \ load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ \ res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ rounding_shift); \ res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ rounding_shift); \ \ __m256i res = _mm256_packus_epi16(res_10, res_32); \ const __m128i res_20 = _mm256_castsi256_si128(res); \ const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ \ 
_mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ i -= 4; \ src += 4 * src_stride; \ dst += 4 * dst_stride; \ dst0 += 4 * dst_stride0; \ } while (i); \ } else { \ assert(w == 4); \ do { \ __m256i src_3210_8bit = \ _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ loadu_int32(src + 1 * src_stride), 0, 0, \ loadu_int32(src + 2 * src_stride), \ loadu_int32(src + 3 * src_stride), 0, 0); \ \ __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ src_3210 = _mm256_add_epi16(src_3210, offset_const); \ \ __m256i ref_3210 = \ _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ *(int64_t *)(dst + 1 * dst_stride), \ *(int64_t *)(dst + 2 * dst_stride), \ *(int64_t *)(dst + 3 * dst_stride)); \ __m256i res_3210 = \ comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ \ res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ rounding_shift); \ \ res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ \ *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ i -= 4; \ src += 4 * src_stride; \ dst += 4 * dst_stride; \ dst0 += 4 * dst_stride0; \ } while (i); \ } void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; assert(conv_params->round_0 == 3); assert(conv_params->round_1 == 7); assert(w % 4 == 0); assert(h % 4 == 0); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m256i offset_const = _mm256_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); if (do_average) { if (use_dist_wtd_comp_avg) { DO_AVG_2D_COPY(1) } else { DO_AVG_2D_COPY(0) } } else { av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, w, h, offset_const); } } #undef LEFT_SHIFT aom-3.12.1/av1/common/x86/jnt_convolve_sse2.c000066400000000000000000000357431477627663500206270ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/synonyms.h" void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint8_t *src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_1; const __m128i left_shift = _mm_cvtsi32_si128(bits); const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi16(w0); const __m128i wt1 = _mm_set1_epi16(w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); __m128i coeffs[4]; prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); if (w == 4) { do { const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); __m128i s[4]; s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); s[1] = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); s[2] = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); s[3] = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); const __m128i res_lo = convolve_lo_x(s, coeffs); const __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); } else { _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); } src_ptr += src_stride; dst += dst_stride; dst0 += dst_stride0; } while (--h); } else { assert(!(w % 8)); int i = 0; do { int j = 0; do { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); __m128i s[4]; // Filter even-index pixels s[0] = data; s[1] = _mm_srli_si128(data, 2); s[2] = _mm_srli_si128(data, 4); s[3] = _mm_srli_si128(data, 6); const __m128i res_even = convolve_lo_x(s, coeffs); // Filter odd-index pixels s[0] = _mm_srli_si128(data, 1); s[1] = _mm_srli_si128(data, 3); s[2] = _mm_srli_si128(data, 5); s[3] = _mm_srli_si128(data, 7); const __m128i res_odd = convolve_lo_x(s, coeffs); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); const __m128i res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); } else { _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); } j += 8; } while (j < w); } while (++i < h); } } void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; const uint8_t *src_ptr = src - fo_vert * src_stride; const int bits = FILTER_BITS - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); __m128i coeffs[4]; prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); if (w == 4) { __m128i s[8], src6, res, res_shift; s[0] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 0 * src_stride), xx_loadl_32(src_ptr + 1 * src_stride)); s[1] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 1 * src_stride), xx_loadl_32(src_ptr + 2 * src_stride)); s[2] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 2 * src_stride), xx_loadl_32(src_ptr + 3 * src_stride)); s[3] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 3 * src_stride), xx_loadl_32(src_ptr + 4 * src_stride)); s[4] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 4 * src_stride), xx_loadl_32(src_ptr + 5 * src_stride)); src6 = xx_loadl_32(src_ptr + 6 * src_stride); s[5] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 5 * src_stride), src6); do { s[6] = _mm_unpacklo_epi8(src6, xx_loadl_32(src_ptr + 7 * src_stride)); src6 = xx_loadl_32(src_ptr + 8 * src_stride); s[7] = _mm_unpacklo_epi8(xx_loadl_32(src_ptr + 7 * src_stride), src6); res = convolve_lo_y(s + 0, coeffs); res_shift = 
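/* Descriptive note (not in the original source): vertical-only convolve,
   w == 4 path. The filter output is shifted left by FILTER_BITS - round_0 and
   then rounded by round_1 before offset_const is added, which puts it on the
   same scale as the two-pass convolve output in the compound buffer. */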
_mm_sll_epi32(res, left_shift); res_shift = _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); } else { _mm_store_si128((__m128i *)dst, res_unsigned); } src_ptr += src_stride; dst += dst_stride; dst0 += dst_stride0; res = convolve_lo_y(s + 1, coeffs); res_shift = _mm_sll_epi32(res, left_shift); res_shift = _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); res_16b = _mm_packs_epi32(res_shift, res_shift); res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); } else { _mm_store_si128((__m128i *)dst, res_unsigned); } src_ptr += src_stride; dst += dst_stride; dst0 += dst_stride0; s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[3] = s[5]; s[4] = s[6]; s[5] = s[7]; h -= 2; } while (h); } else { assert(!(w % 8)); int j = 0; do { __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; const uint8_t *data = &src_ptr[j]; src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); s[0] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); s[1] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); s[2] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); s[3] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); s[4] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); s[5] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); int i = 0; do { data = &src_ptr[i * src_stride + j]; s[6] = _mm_unpacklo_epi8( src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); s[7] = _mm_unpacklo_epi8( _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels res_lo_shift = _mm_sll_epi32(res_lo, left_shift); res_hi_shift = _mm_sll_epi32(res_hi, left_shift); res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), round_shift); res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), round_shift); __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { 
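/* Descriptive note (not in the original source): compound-average branch. The
   16-bit prediction already stored in the CONV_BUF_TYPE buffer (dst) is blended
   with the current result by comp_avg(), either a plain average or a
   distance-weighted one using fwd_offset/bck_offset when use_dist_wtd_comp_avg
   is set. convolve_rounding() then removes the compound offset and rounds back
   to pixel precision, and the packed 8-bit pixels are written to dst0. */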
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); } else { _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); } i++; res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels res_lo_shift = _mm_sll_epi32(res_lo, left_shift); res_hi_shift = _mm_sll_epi32(res_hi, left_shift); res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), round_shift); res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), round_shift); res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); } else { _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); } i++; s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[3] = s[5]; s[4] = s[6]; s[5] = s[7]; } while (i < h); j += 8; } while (j < w); } } aom-3.12.1/av1/common/x86/jnt_convolve_ssse3.c000066400000000000000000000233771477627663500210130ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" void av1_dist_wtd_convolve_2d_ssse3( const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; DECLARE_ALIGNED(16, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); int im_h = h + filter_params_y->taps - 1; int im_stride = MAX_SB_SIZE; int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi16(w0); const __m128i wt1 = _mm_set1_epi16(w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); const __m128i offset_const = _mm_set1_epi16(offset); const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); /* Horizontal filter */ { const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( filter_params_x, subpel_x_qn & SUBPEL_MASK); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); for (i = 0; i < im_h; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); const __m128i src_lo = _mm_unpacklo_epi8(data, zero); const __m128i src_hi = _mm_unpackhi_epi8(data, zero); // Filter even-index pixels const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); // Filter odd-index pixels const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); const __m128i res_3 = _mm_madd_epi16(src_3, 
coeff_23); const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); } } } /* Vertical filter */ { const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( filter_params_y, subpel_y_qn & SUBPEL_MASK); const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( ((1 << conv_params->round_1) >> 1) - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const int16_t *data = &im_block[i * im_stride + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), *(__m128i *)(data + 1 * im_stride)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), *(__m128i *)(data + 3 * im_stride)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), *(__m128i *)(data + 5 * im_stride)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), *(__m128i *)(data + 7 * im_stride)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); const __m128i res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); // Accumulate values into the destination buffer if (do_average) { const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); const __m128i res_8 = _mm_packus_epi16(round_result, round_result); if (w > 4) _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); else *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); } else { _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); } } } } } aom-3.12.1/av1/common/x86/reconinter_avx2.c000066400000000000000000000657411477627663500202760ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "av1/common/blockd.h" static inline __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, const __m256i s1) { const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); return _mm256_abs_epi16( _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) } void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? 
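/* Descriptive note (not in the original source): mb folds the mask inversion
   into the base value. For DIFFWTD_38_INV the base becomes 38 - 64 = -26, so
   the abs() inside calc_mask_avx2 yields 64 - (38 + diff / 16), i.e. the
   inverted mask, without needing a separate subtraction. */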
AOM_BLEND_A64_MAX_ALPHA : 0; const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); int i = 0; if (4 == w) { do { const __m128i s0A = xx_loadl_32(src0); const __m128i s0B = xx_loadl_32(src0 + src0_stride); const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); const __m128i s1A = xx_loadl_32(src1); const __m128i s1B = xx_loadl_32(src1 + src1_stride); const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); const __m128i x_m8 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); xx_storeu_128(mask, x_m8); src0 += (src0_stride << 2); src1 += (src1_stride << 2); mask += 16; i += 4; } while (i < h); } else if (8 == w) { do { const __m128i s0A = xx_loadl_64(src0); const __m128i s0B = xx_loadl_64(src0 + src0_stride); const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); const __m128i s1A = xx_loadl_64(src1); const __m128i s1B = xx_loadl_64(src1 + src1_stride); const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); yy_storeu_256(mask, m8); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (16 == w) { do { const __m128i s0A = xx_load_128(src0); const __m128i s0B = xx_load_128(src0 + src0_stride); const __m128i s1A = xx_load_128(src1); const __m128i s1B = xx_load_128(src1 + src1_stride); const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); yy_storeu_256(mask, m8); src0 += src0_stride << 1; src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); } else { do { int j = 0; do { const __m256i s0 = yy_loadu_256(src0 + j); const __m256i s1 = yy_loadu_256(src1 + j); const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); const __m256i s0H = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); const __m256i s1H = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); const __m256i m16H = 
calc_mask_avx2(y_mask_base, s0H, s1H); const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); yy_storeu_256(mask + j, m8); j += 32; } while (j < w); src0 += src0_stride; src1 += src1_stride; mask += w; i += 1; } while (i < h); } } static inline __m256i calc_mask_d16_avx2(const __m256i *data_src0, const __m256i *data_src1, const __m256i *round_const, const __m256i *mask_base_16, const __m256i *clip_diff, int round) { const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); const __m256i diff = _mm256_max_epu16(diffa, diffb); const __m256i diff_round = _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); return diff_clamp; } static inline __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, const __m256i *data_src1, const __m256i *round_const, const __m256i *mask_base_16, const __m256i *clip_diff, int round) { const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); const __m256i diff = _mm256_max_epu16(diffa, diffb); const __m256i diff_round = _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); return diff_const_16; } static inline void build_compound_diffwtd_mask_d16_avx2( uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { const int mask_base = 38; const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); const __m256i y38 = _mm256_set1_epi16(mask_base); const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); int i = 0; if (w == 4) { do { const __m128i s0A = xx_loadl_64(src0); const __m128i s0B = xx_loadl_64(src0 + src0_stride); const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); const __m128i s1A = xx_loadl_64(src1); const __m128i s1B = xx_loadl_64(src1 + src1_stride); const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), _mm_unpacklo_epi64(s0A, s0B)); const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), _mm_unpacklo_epi64(s1A, s1B)); const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); xx_storeu_128(mask, _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 16; i += 4; } while (i < h); } else if (w == 8) { do { const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); const __m256i s0CD = yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); const __m256i s1CD = yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); const __m256i m16AB = calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); const __m256i m16CD = calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); const 
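/* Descriptive note (not in the original source): _mm256_packus_epi16 packs
   within each 128-bit lane, so the packed bytes come out lane-interleaved; the
   _mm256_permute4x64_epi64(..., 0xd8) applied afterwards restores the rows to
   sequential order before the 32-byte store. */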
__m256i m8 = _mm256_packus_epi16(m16AB, m16CD); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (w == 16) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + src0_stride); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + src1_stride); const __m256i m16A = calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 1; src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); } else if (w == 32) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i m16A = calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 32; i += 1; } while (i < h); } else if (w == 64) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i m16A = calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 64; i += 1; } while (i < h); } else { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s0E = yy_loadu_256(src0 + 64); const __m256i s0F = yy_loadu_256(src0 + 80); const __m256i s0G = yy_loadu_256(src0 + 96); const __m256i s0H = yy_loadu_256(src0 + 112); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i s1E = yy_loadu_256(src1 + 64); const __m256i s1F = yy_loadu_256(src1 + 80); const __m256i s1G = yy_loadu_256(src1 + 96); const __m256i s1H = yy_loadu_256(src1 + 112); const __m256i m16A = calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m16E = calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); const __m256i m16F = calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); const __m256i m16G = calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, 
shift); const __m256i m16H = calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 128; i += 1; } while (i < h); } } static inline void build_compound_diffwtd_mask_d16_inv_avx2( uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { const int mask_base = 38; const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); const __m256i y38 = _mm256_set1_epi16(mask_base); const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); int i = 0; if (w == 4) { do { const __m128i s0A = xx_loadl_64(src0); const __m128i s0B = xx_loadl_64(src0 + src0_stride); const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); const __m128i s1A = xx_loadl_64(src1); const __m128i s1B = xx_loadl_64(src1 + src1_stride); const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), _mm_unpacklo_epi64(s0A, s0B)); const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), _mm_unpacklo_epi64(s1A, s1B)); const __m256i m16 = calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); xx_storeu_128(mask, _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 16; i += 4; } while (i < h); } else if (w == 8) { do { const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); const __m256i s0CD = yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); const __m256i s1CD = yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); const __m256i m16AB = calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); const __m256i m16CD = calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (w == 16) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + src0_stride); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + src1_stride); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 1; src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); } else if (w == 32) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = 
calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 32; i += 1; } while (i < h); } else if (w == 64) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 64; i += 1; } while (i < h); } else { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s0E = yy_loadu_256(src0 + 64); const __m256i s0F = yy_loadu_256(src0 + 80); const __m256i s0G = yy_loadu_256(src0 + 96); const __m256i s0H = yy_loadu_256(src0 + 112); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i s1E = yy_loadu_256(src1 + 64); const __m256i s1F = yy_loadu_256(src1 + 80); const __m256i s1G = yy_loadu_256(src1 + 96); const __m256i s1H = yy_loadu_256(src1 + 112); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m16E = calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); const __m256i m16F = calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); const __m256i m16G = calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); const __m256i m16H = calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 128; i += 1; } while (i < h); } } void av1_build_compound_diffwtd_mask_d16_avx2( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { const int shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); // When rounding constant is added, there is a 
possibility of overflow. // However that much precision is not required. Code should very well work for // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But // there is a possibility of corner case bugs. assert(DIFF_FACTOR_LOG2 == 4); assert(AOM_BLEND_A64_MAX_ALPHA == 64); if (mask_type == DIFFWTD_38) { build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, src1_stride, h, w, shift); } else { build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, src1_stride, h, w, shift); } } #if CONFIG_AV1_HIGHBITDEPTH void av1_build_compound_diffwtd_mask_highbd_avx2( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd) { if (w < 16) { av1_build_compound_diffwtd_mask_highbd_ssse3( mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); } else { assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); assert(bd >= 8); assert((w % 16) == 0); const __m256i y0 = _mm256_setzero_si256(); const __m256i yAOM_BLEND_A64_MAX_ALPHA = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const int mask_base = 38; const __m256i ymask_base = _mm256_set1_epi16(mask_base); const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); if (bd == 8) { if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); __m256i diff = _mm256_srai_epi16( _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); __m256i m = _mm256_min_epi16( _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), yAOM_BLEND_A64_MAX_ALPHA); m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); m = _mm256_packus_epi16(m, m); m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); __m128i m0 = _mm256_castsi256_si128(m); _mm_storeu_si128((__m128i *)&mask[j], m0); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); __m256i diff = _mm256_srai_epi16( _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); __m256i m = _mm256_min_epi16( _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), yAOM_BLEND_A64_MAX_ALPHA); m = _mm256_packus_epi16(m, m); m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); __m128i m0 = _mm256_castsi256_si128(m); _mm_storeu_si128((__m128i *)&mask[j], m0); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } } else { const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); __m256i diff = _mm256_sra_epi16( _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); __m256i m = _mm256_min_epi16( _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), yAOM_BLEND_A64_MAX_ALPHA); m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); m = _mm256_packus_epi16(m, m); m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); __m128i m0 = _mm256_castsi256_si128(m); _mm_storeu_si128((__m128i *)&mask[j], m0); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 16) { __m256i s0 = 
_mm256_loadu_si256((const __m256i *)&ssrc0[j]); __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); __m256i diff = _mm256_sra_epi16( _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); __m256i m = _mm256_min_epi16( _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), yAOM_BLEND_A64_MAX_ALPHA); m = _mm256_packus_epi16(m, m); m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); __m128i m0 = _mm256_castsi256_si128(m); _mm_storeu_si128((__m128i *)&mask[j], m0); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } } } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/reconinter_sse4.c000066400000000000000000000141311477627663500202570ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include /* SSE4.1 */ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "av1/common/blockd.h" #include "config/av1_rtcd.h" static inline __m128i calc_mask(const __m128i mask_base, const __m128i s0, const __m128i s1) { const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) } void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int stride0, const uint8_t *src1, int stride1, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; const __m128i mask_base = _mm_set1_epi16(38 - mb); int i = 0; if (4 == w) { do { const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0); const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0)); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0 = _mm_cvtepu8_epi16(s0AB); const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1); const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1)); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1 = _mm_cvtepu8_epi16(s1AB); const __m128i m16 = calc_mask(mask_base, s0, s1); const __m128i m8 = _mm_packus_epi16(m16, m16); *(int *)mask = _mm_cvtsi128_si32(m8); *(int *)(mask + w) = _mm_extract_epi32(m8, 1); src0 += (stride0 << 1); src1 += (stride1 << 1); mask += 8; i += 2; } while (i < h); } else if (8 == w) { do { __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); s0 = _mm_cvtepu8_epi16(s0); s1 = _mm_cvtepu8_epi16(s1); const __m128i m16 = calc_mask(mask_base, s0, s1); const __m128i m8 = _mm_packus_epi16(m16, m16); _mm_storel_epi64((__m128i *)mask, m8); src0 += stride0; src1 += stride1; mask += 8; i += 1; } while (i < h); } else { const __m128i zero = _mm_setzero_si128(); do { int j = 0; do { const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); const __m128i s0L = _mm_cvtepu8_epi16(s0); const __m128i s1L = _mm_cvtepu8_epi16(s1); const __m128i s0H = _mm_unpackhi_epi8(s0, zero); const __m128i s1H = _mm_unpackhi_epi8(s1, zero); const __m128i m16L = calc_mask(mask_base, s0L, s1L); const __m128i m16H = calc_mask(mask_base, s0H, s1H); const __m128i m8 = _mm_packus_epi16(m16L, m16H); _mm_store_si128((__m128i *)(mask + j), m8); j += 16; } while (j < w); src0 += stride0; src1 += stride1; mask += w; i += 1; } while (i < h); } } void av1_build_compound_diffwtd_mask_d16_sse4_1( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd) { const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; const int mask_base = 38; int round = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); const __m128i mask_base_16 = _mm_set1_epi16(mask_base); const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const __m128i add_const = _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); int i, j; // When rounding constant is added, there is a possibility of overflow. // However that much precision is not required. Code should very well work for // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But // there is a possibility of corner case bugs. 
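/* Descriptive note (not in the original source), summarizing the loop below:
   diff = |src0 - src1| (saturating), rounded back toward pixel precision by the
   shift "round" (which accounts for the compound-buffer precision and bit
   depth), scaled by 1/(1 << DIFF_FACTOR_LOG2), added to the base of 38 and
   clamped to AOM_BLEND_A64_MAX_ALPHA; for DIFFWTD_38_INV the clamped value is
   negated via add_sign and offset by add_const, giving 64 - mask. */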
assert(DIFF_FACTOR_LOG2 == 4); assert(AOM_BLEND_A64_MAX_ALPHA == 64); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const __m128i data_src0 = _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); const __m128i data_src1 = _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); const __m128i diff = _mm_max_epu16(diffa, diffb); const __m128i diff_round = _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); // clamp to 0 can be skipped since we are using add and saturate // instruction const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); // 8 bit conversion and saturation to uint8 const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); // Store values into the destination buffer __m128i *const dst = (__m128i *)&mask[i * w + j]; if ((w - j) > 4) { _mm_storel_epi64(dst, res_8); } else { // w==4 *(int *)dst = _mm_cvtsi128_si32(res_8); } } } } aom-3.12.1/av1/common/x86/reconinter_ssse3.c000066400000000000000000000112231477627663500204400ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "config/av1_rtcd.h" #if CONFIG_AV1_HIGHBITDEPTH #include #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "av1/common/blockd.h" void av1_build_compound_diffwtd_mask_highbd_ssse3( uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd) { if (w < 8) { av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); } else { assert(bd >= 8); assert((w % 8) == 0); assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); const __m128i x0 = _mm_setzero_si128(); const __m128i xAOM_BLEND_A64_MAX_ALPHA = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); const int mask_base = 38; const __m128i xmask_base = _mm_set1_epi16(mask_base); const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); if (bd == 8) { if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) { __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); __m128i m = _mm_min_epi16( _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), xAOM_BLEND_A64_MAX_ALPHA); m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); m = _mm_packus_epi16(m, m); _mm_storel_epi64((__m128i *)&mask[j], m); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) { __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); __m128i m = _mm_min_epi16( _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), xAOM_BLEND_A64_MAX_ALPHA); m = _mm_packus_epi16(m, m); _mm_storel_epi64((__m128i *)&mask[j], m); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } } else { const __m128i xshift = _mm_set1_epi64x(bd - 8 + DIFF_FACTOR_LOG2); if (mask_type == DIFFWTD_38_INV) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) { __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); __m128i diff = _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); __m128i m = _mm_min_epi16( _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), xAOM_BLEND_A64_MAX_ALPHA); m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); m = _mm_packus_epi16(m, m); _mm_storel_epi64((__m128i *)&mask[j], m); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } else { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; j += 8) { __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); __m128i diff = _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); __m128i m = _mm_min_epi16( _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), xAOM_BLEND_A64_MAX_ALPHA); m = _mm_packus_epi16(m, m); _mm_storel_epi64((__m128i *)&mask[j], m); } ssrc0 += src0_stride; ssrc1 += src1_stride; mask += w; } } } } } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/common/x86/resize_avx2.c000066400000000000000000001075121477627663500174200ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/resize.h" #include "aom_dsp/x86/synonyms.h" #define ROW_OFFSET 5 #define CAST_HI(x) _mm256_castsi128_si256(x) #define CAST_LOW(x) _mm256_castsi256_si128(x) #define PROCESS_RESIZE_Y_WD16 \ const int idx1 = AOMMIN(height - 1, i + 5); \ const int idx2 = AOMMIN(height - 1, i + 6); \ l6 = l10; \ l7 = l11; \ l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride)); \ l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride)); \ \ /* g0... g15 | i0... i15 */ \ const __m256i s68 = \ _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20); \ /* h0... h15 | j0... j15 */ \ const __m256i s79 = \ _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20); \ \ /* g0h0... g7g7 | i0j0... i7j */ \ s[3] = _mm256_unpacklo_epi8(s68, s79); \ /* g8h8... g15g15 | i8j8... i15j15 */ \ s[8] = _mm256_unpackhi_epi8(s68, s79); \ \ __m256i res_out[2] = { 0 }; \ resize_convolve(s, coeffs_y, res_out); \ \ /* r00... r07 */ \ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ /* r20... r27 */ \ __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ \ res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ \ __m256i res_out_b[2] = { 0 }; \ resize_convolve(s + 5, coeffs_y, res_out_b); \ \ /* r08... r015 */ \ __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \ /* r28... r215 */ \ __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \ res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits); \ res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits); \ \ /* r00... r03 r20... r23 | r04... r07 r24... r27 */ \ __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ /* r08... r012 r28... r212 | r013... r015 r213... r215 */ \ __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2); \ /* r00... r07 | r20... r27 */ \ res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8); \ /* r08... r015 | r28... r215 */ \ res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8); \ /* r00... r015 | r20... r215 */ \ res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1); \ res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel); \ res_8bit0 = _mm256_max_epu8(res_8bit0, zero); #define PROCESS_RESIZE_Y_WD8 \ const int idx1 = AOMMIN(height - 1, i + 5); \ const int idx2 = AOMMIN(height - 1, i + 6); \ l6 = l10; \ l7 = l11; \ l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride)); \ l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride)); \ \ /* g0h0... g7h7 */ \ s67 = _mm_unpacklo_epi8(l6, l7); \ /* i0j0...i7j7 */ \ __m128i s89 = _mm_unpacklo_epi8(l8, l9); \ \ /* g0h0...g7g7 | i0j0...i7j7 */ \ s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20); \ \ __m256i res_out[2] = { 0 }; \ resize_convolve(s, coeffs_y, res_out); \ \ /* r00... 
r07 */ \ __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \ /* r20...r27 */ \ __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \ res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits); \ res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits); \ \ /* r00...r03 r20...r23 | r04...r07 r24...r27 */ \ res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2); \ /* r00...r07 | r20...r27 */ \ res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8); \ res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1); \ res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel); \ res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero); #define PROCESS_RESIZE_X_WD32 \ /* a0 a1 ..... a30 a31 */ \ __m256i row0 = _mm256_loadu_si256( \ (__m256i *)&input[i * in_stride + j - filter_offset]); \ /* b0 b1 ..... b30 b31 */ \ __m256i row1 = _mm256_loadu_si256( \ (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]); \ /* a0 .... a15 || b0.... b15 */ \ __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); \ /* a16 .... a31 || b16 .... b31 */ \ __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); \ filter_offset = 3; \ \ /* Pad start pixels to the left, while processing the first pixels in the \ * row. */ \ if (j == 0) { \ /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */ \ row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); \ /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */ \ row1 = _mm256_alignr_epi8(r1, r0, 13); \ r0 = row0; \ r1 = row1; \ } \ const int is_last_cols32 = (j + 32 == filtered_length); \ /* Avoid loading extra pixels at frame boundary.*/ \ if (is_last_cols32) row_offset = ROW_OFFSET; \ /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/ \ __m128i row0_0 = _mm_loadl_epi64( \ (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \ /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */ \ __m128i row1_0 = \ _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j - \ filter_offset - row_offset]); \ __m256i r2 = _mm256_permute2x128_si256( \ _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20); \ \ /* Pad end pixels to the right, while processing the last pixels in the \ * row. */ \ if (is_last_cols32) { \ r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET), \ wd32_end_pad_mask); \ } \ \ /* Process even pixels of the first row */ \ /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */ \ s0[0] = _mm256_alignr_epi8(r1, r0, 0); \ /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */ \ s0[1] = _mm256_alignr_epi8(r1, r0, 2); \ /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */ \ s0[2] = _mm256_alignr_epi8(r1, r0, 4); \ /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */ \ s0[3] = _mm256_alignr_epi8(r1, r0, 6); \ \ /* Process even pixels of the second row */ \ /* a13 a14 a15 a16 ..... a28 | b13 b14 b15 b16 ..... b28 */ \ s1[0] = _mm256_alignr_epi8(r2, r1, 0); \ /* a15 a16 a17 a18 ..... a30 | b15 b16 b17 b18 ..... b30 */ \ s1[1] = _mm256_alignr_epi8(r2, r1, 2); \ /* a17 a18 a19 a20 ..... a32 | b17 b18 b19 b20 ..... b32 */ \ s1[2] = _mm256_alignr_epi8(r2, r1, 4); \ /* a19 a20 a21 a22 ..... a34 | b19 b20 b21 b22 ..... b34 */ \ s1[3] = _mm256_alignr_epi8(r2, r1, 6); \ \ /* The register res_out_0 stores the result of start-16 pixels corresponding \ * to the first and second rows whereas res_out_1 stores the end-16 \ * pixels. 
*/ \ __m256i res_out_0[2], res_out_1[2]; \ res_out_1[0] = res_out_1[1] = zero; \ res_out_0[0] = res_out_0[1] = zero; \ resize_convolve(s0, coeffs_x, res_out_0); \ resize_convolve(s1, coeffs_x, res_out_1); \ \ /* Result of 32 pixels of row0 (a0 to a32) */ \ res_out_0[0] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); \ res_out_1[0] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits); \ /* r00-r03 r08-r011 | r04-r07 r012-r015 */ \ __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]); \ \ /* Result of 32 pixels of row1 (b0 to b32) */ \ res_out_0[1] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); \ res_out_1[1] = _mm256_sra_epi32( \ _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits); \ /* r10-r13 r18-r111 | r14-r17 r112-r115 */ \ __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]); \ \ /* Convert the result from 16bit to 8bit */ \ /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115 \ */ \ __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1); \ __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel); \ res_out_row01 = _mm256_max_epu8(res_out_r01, zero); \ __m128i low_128 = CAST_LOW(res_out_row01); \ __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1); \ \ _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2], \ _mm_unpacklo_epi32(low_128, high_128)); \ _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2], \ _mm_unpackhi_epi32(low_128, high_128)); static inline void resize_convolve(const __m256i *const s, const __m256i *const coeffs, __m256i *res_out) { const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]); const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]); const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]); const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]); const __m256i dst_0 = _mm256_add_epi16(res_0, res_1); const __m256i dst_1 = _mm256_add_epi16(res_2, res_3); // The sum of convolve operation crosses signed 16bit. Hence, the addition // should happen in 32bit. const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0)); const __m256i dst_01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1)); const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1)); const __m256i dst_11 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1)); res_out[0] = _mm256_add_epi32(dst_00, dst_10); res_out[1] = _mm256_add_epi32(dst_01, dst_11); } static inline void prepare_filter_coeffs(const int16_t *filter, __m256i *const coeffs /* [4] */) { // f0 f1 f2 f3 x x x x const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); // f0 f1 f2 f3 f0 f1 f2 f3 const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44); // f0 f1 f2 f3 f1 f0 f3 f2 const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1); const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1); // f0 f1 f0 f1 .. coeffs[2] = _mm256_broadcastw_epi16(filter_8bit); // f2 f3 f2 f3 .. coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2)); // f3 f2 f3 f2 .. coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6)); // f1 f0 f1 f0 .. 
coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4)); } bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int stride, int start_col) { assert(start_col <= stride); // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD // optimization of the same is not implemented. // When the input height is less than 8 and even, the potential input // heights are limited to 2, 4, or 6. These scenarios require seperate // handling due to padding requirements. Invoking the C function here will // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. if (height & 1 || height < 8) { return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, stride, start_col); } __m256i s[10], coeffs_y[4]; const int bits = FILTER_BITS; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const uint8_t max_pixel = 255; const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); const __m256i zero = _mm256_setzero_si256(); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); const int num_col16 = stride / 16; int remain_col = stride % 16; // The core vertical SIMD processes 4 input rows simultaneously to generate // output corresponding to 2 rows. To streamline the core loop and eliminate // the need for conditional checks, the remaining rows (4 or 6) are processed // separately. const int remain_row = (height % 4 == 0) ? 4 : 6; for (int j = start_col; j < stride - remain_col; j += 16) { const uint8_t *data = &intbuf[j]; const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride)); // Padding top 3 rows with the last available row at the top. 
const __m128i l0 = l3; const __m128i l1 = l3; const __m128i l2 = l3; const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride)); __m128i l6, l7, l8, l9; __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride)); __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride)); __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride)); // a0...a15 | c0...c15 const __m256i s02 = _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20); // b0...b15 | d0...d15 const __m256i s13 = _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20); // c0...c15 | e0...e15 const __m256i s24 = _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20); // d0...d15 | f0...f15 const __m256i s35 = _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20); // e0...e15 | g0...g15 const __m256i s46 = _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20); // f0...f15 | h0...h15 const __m256i s57 = _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20); // a0b0...a7b7 | c0d0...c7d7 s[0] = _mm256_unpacklo_epi8(s02, s13); // c0d0...c7d7 | e0f0...e7f7 s[1] = _mm256_unpacklo_epi8(s24, s35); // e0f0...e7f7 | g0h0...g7h7 s[2] = _mm256_unpacklo_epi8(s46, s57); // a8b8...a15b15 | c8d8...c15d15 s[5] = _mm256_unpackhi_epi8(s02, s13); // c8d8...c15d15 | e8f8...e15f15 s[6] = _mm256_unpackhi_epi8(s24, s35); // e8f8...e15f15 | g8h8...g15h15 s[7] = _mm256_unpackhi_epi8(s46, s57); // height to be processed here const int process_ht = height - remain_row; for (int i = 0; i < process_ht; i += 4) { PROCESS_RESIZE_Y_WD16 _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], CAST_LOW(res_8bit0)); _mm_storeu_si128( (__m128i *)&output[(i / 2) * out_stride + j + out_stride], _mm256_extracti128_si256(res_8bit0, 1)); // Load the required data for processing of next 4 input rows. const int idx7 = AOMMIN(height - 1, i + 7); const int idx8 = AOMMIN(height - 1, i + 8); l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride)); l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride)); const __m256i s810 = _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); const __m256i s911 = _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); // i0j0... i7j7 | k0l0... k7l7 s[4] = _mm256_unpacklo_epi8(s810, s911); // i8j8... i15j15 | k8l8... k15l15 s[9] = _mm256_unpackhi_epi8(s810, s911); s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[5] = s[7]; s[6] = s[8]; s[7] = s[9]; } // Process the remaining last 4 or 6 rows here. int i = process_ht; while (i < height - 1) { PROCESS_RESIZE_Y_WD16 _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], CAST_LOW(res_8bit0)); i += 2; const int is_store_valid = (i < height - 1); if (is_store_valid) _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j], _mm256_extracti128_si256(res_8bit0, 1)); i += 2; // Check if there is any remaining height to process. If so, perform the // necessary data loading for processing the next row. if (i < height - 1) { l10 = l11 = l9; const __m256i s810 = _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20); const __m256i s911 = _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20); // i0j0... i7j7 | k0l0... k7l7 s[4] = _mm256_unpacklo_epi8(s810, s911); // i8j8... i15j15 | k8l8... 
k15l15 s[9] = _mm256_unpackhi_epi8(s810, s911); s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; s[5] = s[7]; s[6] = s[8]; s[7] = s[9]; } } } if (remain_col > 7) { const int processed_wd = num_col16 * 16; remain_col = stride % 8; const uint8_t *data = &intbuf[processed_wd]; const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); // Padding top 3 rows with available top-most row. const __m128i l0 = l3; const __m128i l1 = l3; const __m128i l2 = l3; const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); __m128i l6, l7, l8, l9; __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride)); __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride)); // a0b0...a7b7 const __m128i s01 = _mm_unpacklo_epi8(l0, l1); // c0d0...c7d7 const __m128i s23 = _mm_unpacklo_epi8(l2, l3); // e0f0...e7f7 const __m128i s45 = _mm_unpacklo_epi8(l4, l5); // g0h0...g7h7 __m128i s67 = _mm_unpacklo_epi8(l10, l11); // a0b0...a7b7 | c0d0...c7d7 s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20); // c0d0...c7d7 | e0f0...e7f7 s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20); // e0f0...e7f7 | g0h0...g7h7 s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20); // height to be processed here const int process_ht = height - remain_row; for (int i = 0; i < process_ht; i += 4) { PROCESS_RESIZE_Y_WD8 _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], CAST_LOW(res_a_round_1)); _mm_storel_epi64( (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride], _mm256_extracti128_si256(res_a_round_1, 1)); const int idx7 = AOMMIN(height - 1, i + 7); const int idx8 = AOMMIN(height - 1, i + 8); l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride)); l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride)); // k0l0... k7l7 const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); // i0j0... i7j7 | k0l0... k7l7 s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; } // Process the remaining last 4 or 6 rows here. int i = process_ht; while (i < height - 1) { PROCESS_RESIZE_Y_WD8 _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd], CAST_LOW(res_a_round_1)); i += 2; const int is_store_valid = (i < height - 1); if (is_store_valid) _mm_storel_epi64( (__m128i *)&output[(i / 2) * out_stride + processed_wd], _mm256_extracti128_si256(res_a_round_1, 1)); i += 2; // Check rows are still remaining for processing. If yes do the required // load of data for the next iteration. if (i < height - 1) { l10 = l11 = l9; // k0l0... k7l7 const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11); // i0j0... i7j7 | k0l0... 
k7l7 s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20); s[0] = s[2]; s[1] = s[3]; s[2] = s[4]; } } } if (remain_col) return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, stride, stride - remain_col); return true; } // Masks used for width 32 and 8 pixels, with left and right padding // requirements static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; static const uint8_t wd8_right_padding_mask[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }; void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); // Invoke SSE2 for width less than 32. if (filtered_length < 32) { av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length, width2); return; } const int filt_length = sizeof(av1_down2_symeven_half_filter); assert(filt_length % 2 == 0); (void)filt_length; __m256i s0[4], s1[4], coeffs_x[4]; const int bits = FILTER_BITS; const int dst_stride = width2; const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); const uint8_t max_pixel = 255; const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel); const __m256i zero = _mm256_setzero_si256(); const __m256i wd32_start_pad_mask = _mm256_loadu_si256((__m256i *)wd32_left_padding_mask); const __m256i wd32_end_pad_mask = _mm256_loadu_si256((__m256i *)wd32_right_padding_mask); const __m256i wd8_end_pad_mask = _mm256_loadu_si256((__m256i *)wd8_right_padding_mask); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); // The core horizontal SIMD processes 32 input pixels of 2 rows simultaneously // to generate output corresponding to 2 rows. To streamline the core loop and // eliminate the need for conditional checks, the remaining columns (16 or 8) // are processed separately. if (filtered_length % 32 == 0) { for (int i = 0; i < height; i += 2) { int filter_offset = 0; int row_offset = 0; for (int j = 0; j < filtered_length; j += 32) { PROCESS_RESIZE_X_WD32 } } } else { for (int i = 0; i < height; i += 2) { int filter_offset = 0; int remain_col = filtered_length; int row_offset = 0; // To avoid pixel over-read at frame boundary, processing of 32 pixels // is done using the core loop only if sufficient number of pixels // required for the load are present. The remaining pixels are processed // separately. for (int j = 0; j <= filtered_length - 32; j += 32) { if (remain_col == 34 || remain_col == 36) { break; } PROCESS_RESIZE_X_WD32 remain_col -= 32; } int wd_processed = filtered_length - remain_col; // To avoid pixel over-read at frame boundary, processing of 16 pixels // is done only if sufficient number of pixels required for the // load are present. The remaining pixels are processed separately. 
if (remain_col > 15 && remain_col != 18 && remain_col != 20) { remain_col = filtered_length - wd_processed - 16; const int in_idx = i * in_stride + wd_processed; const int out_idx = (i * dst_stride) + wd_processed / 2; // a0 a1 --- a15 __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]); // b0 b1 --- b15 __m128i row1 = _mm_loadu_si128( (__m128i *)&input[in_idx + in_stride - filter_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); if (filter_offset == 0) { r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask); } filter_offset = 3; const int is_last_cols16 = wd_processed + 16 == filtered_length; if (is_last_cols16) row_offset = ROW_OFFSET; // a16 a17 --- a23 row0 = _mm_loadl_epi64( (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]); // b16 b17 --- b23 row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride - row_offset - filter_offset]); // a16-a23 x x x x| b16-b23 x x x x __m256i r1 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); // Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols16) { r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET), wd32_end_pad_mask); } // a0 a1 --- a15 || b0 b1 --- b15 s0[0] = r0; // a2 a3 --- a17 || b2 b3 --- b17 s0[1] = _mm256_alignr_epi8(r1, r0, 2); // a4 a5 --- a19 || b4 b5 --- b19 s0[2] = _mm256_alignr_epi8(r1, r0, 4); // a6 a7 --- a21 || b6 b7 --- b21 s0[3] = _mm256_alignr_epi8(r1, r0, 6); // result for 16 pixels (a0 to a15) of row0 and row1 __m256i res_out_0[2]; res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); // r00-r07 res_out_0[0] = _mm256_sra_epi32( _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits); // r10-r17 res_out_0[1] = _mm256_sra_epi32( _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits); // r00-r03 r10-r13 r04-r07 r14-r17 __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]); // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17 res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); res_out_row01 = _mm256_max_epu8(res_out_row01, zero); // r00-r03 r10-r13 r04-r07 r14-r17 __m128i low_result = CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8)); // r00-r03 r04-r07 r10-r13 r14-r17 low_result = _mm_shuffle_epi32(low_result, 0xd8); _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result); _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride], _mm_unpackhi_epi64(low_result, low_result)); } // To avoid pixel over-read at frame boundary, processing of 8 pixels // is done only if sufficient number of pixels required for the // load are present. The remaining pixels are processed by C function. 
wd_processed = filtered_length - remain_col; if (remain_col > 7 && remain_col != 10 && remain_col != 12) { remain_col = filtered_length - wd_processed - 8; const int in_idx = i * in_stride + wd_processed - filter_offset; const int out_idx = (i * dst_stride) + wd_processed / 2; const int is_last_cols_8 = wd_processed + 8 == filtered_length; if (is_last_cols_8) row_offset = ROW_OFFSET; // a0 a1 --- a15 __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]); // b0 b1 --- b15 __m128i row1 = _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]); // a0 a1 --- a15 || b0 b1 --- b15 __m256i r0 = _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20); // Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols_8) r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET), wd8_end_pad_mask); // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7 s0[0] = r0; // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9 s0[1] = _mm256_bsrli_epi128(r0, 2); // a4 a5 a6 a7 a8 a9 a10 a10 | b4 b5 b6 b7 b8 b9 b10 b10 s0[2] = _mm256_bsrli_epi128(r0, 4); // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10 s0[3] = _mm256_bsrli_epi128(r0, 6); __m256i res_out_0[2]; res_out_0[0] = res_out_0[1] = zero; resize_convolve(s0, coeffs_x, res_out_0); // r00 - r03 | r10 - r13 __m256i res_out = _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20); // r00 - r03 | r10 - r13 res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits), round_shift_bits); // r00-r03 r00-r03 r10-r13 r10-r13 __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out); // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13 res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01); res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel); res_out_row01 = _mm256_max_epu8(res_out_row01, zero); xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01)); xx_storel_32(intbuf + out_idx + dst_stride, _mm256_extracti128_si256(res_out_row01, 1)); } wd_processed = filtered_length - remain_col; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i; down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, wd_processed); down2_symeven(input + in_idx + in_stride, filtered_length, intbuf + out_idx + width2, wd_processed); } } } } aom-3.12.1/av1/common/x86/resize_sse2.c000066400000000000000000000365411477627663500174170ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "av1/common/resize.h" #include "aom_dsp/x86/synonyms.h" #define ROW_OFFSET 5 #define PROCESS_RESIZE_Y_WD8 \ /* ah0 ah1 ... ah7 */ \ const __m128i AH = _mm_add_epi16(l0, l7); \ /* bg0 bg1 ... bh7 */ \ const __m128i BG = _mm_add_epi16(l1, l6); \ /* cf0 cf1 ... cf7 */ \ const __m128i CF = _mm_add_epi16(l2, l5); \ /* de0 de1 ... de7 */ \ const __m128i DE = _mm_add_epi16(l3, l4); \ \ /* ah0 bg0 ... ah3 bg3 */ \ const __m128i AHBG_low = _mm_unpacklo_epi16(AH, BG); \ /*cf0 de0 ... 
cf2 de2 */ \ const __m128i CFDE_low = _mm_unpacklo_epi16(CF, DE); \ \ /* ah4 bg4... ah7 bg7 */ \ const __m128i AHBG_hi = _mm_unpackhi_epi16(AH, BG); \ /* cf4 de4... cf7 de7 */ \ const __m128i CFDE_hi = _mm_unpackhi_epi16(CF, DE); \ \ /* r00 r01 r02 r03 */ \ const __m128i r00 = _mm_madd_epi16(AHBG_low, coeffs_y[0]); \ const __m128i r01 = _mm_madd_epi16(CFDE_low, coeffs_y[1]); \ __m128i r0 = _mm_add_epi32(r00, r01); \ /* r04 r05 r06 r07 */ \ const __m128i r10 = _mm_madd_epi16(AHBG_hi, coeffs_y[0]); \ const __m128i r11 = _mm_madd_epi16(CFDE_hi, coeffs_y[1]); \ __m128i r1 = _mm_add_epi32(r10, r11); \ \ r0 = _mm_add_epi32(r0, round_const_bits); \ r1 = _mm_add_epi32(r1, round_const_bits); \ r0 = _mm_sra_epi32(r0, round_shift_bits); \ r1 = _mm_sra_epi32(r1, round_shift_bits); \ \ /* r00 ... r07 (8 values of each 16bit) */ \ const __m128i res_16b = _mm_packs_epi32(r0, r1); \ /* r00 ... r07 | r00 ... r07 (16 values of each 8bit) */ \ const __m128i res_8b0 = _mm_packus_epi16(res_16b, res_16b); \ \ __m128i res = _mm_min_epu8(res_8b0, clip_pixel); \ res = _mm_max_epu8(res, zero); \ _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + j], res); \ \ l0 = l2; \ l1 = l3; \ l2 = l4; \ l3 = l5; \ l4 = l6; \ l5 = l7; \ data += 2 * stride; static inline void prepare_filter_coeffs(const int16_t *filter, __m128i *const coeffs /* [2] */) { // f0 f1 f2 f3 x x x x const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter); // f1 f0 f3 f2 x x x x const __m128i tmp1 = _mm_shufflelo_epi16(sym_even_filter, 0xb1); // f3 f2 f3 f2 ... coeffs[0] = _mm_shuffle_epi32(tmp1, 0x55); // f1 f0 f1 f0 ... coeffs[1] = _mm_shuffle_epi32(tmp1, 0x00); } bool av1_resize_vert_dir_sse2(uint8_t *intbuf, uint8_t *output, int out_stride, int height, int height2, int stride, int start_col) { // For the GM tool, the input layer height or width is assured to be an even // number. Hence the function 'down2_symodd()' is not invoked and SIMD // optimization of the same is not implemented. // When the input height is less than 8 and even, the potential input // heights are limited to 2, 4, or 6. These scenarios require seperate // handling due to padding requirements. Invoking the C function here will // eliminate the need for conditional statements within the subsequent SIMD // code to manage these cases. if (height & 1 || height < 8) { return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, stride, start_col); } __m128i coeffs_y[2]; const int bits = FILTER_BITS; const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const uint8_t max_pixel = 255; const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); const __m128i zero = _mm_setzero_si128(); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y); const int remain_col = stride % 8; for (int j = start_col; j < stride - remain_col; j += 8) { uint8_t *data = &intbuf[j]; // d0 ... d7 const __m128i l8_3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride)); // Padding top 3 rows with the last available row at the top. // a0 ... a7 const __m128i l8_0 = l8_3; // b0 ... b7 const __m128i l8_1 = l8_3; // c0 ... c7 const __m128i l8_2 = l8_3; // e0 ... e7 const __m128i l8_4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride)); // f0 ... f7 const __m128i l8_5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride)); // Convert to 16bit as addition of 2 source pixel crosses 8 bit. 
__m128i l0 = _mm_unpacklo_epi8(l8_0, zero); // A(128bit) = a0 - a7(16 bit) __m128i l1 = _mm_unpacklo_epi8(l8_1, zero); // B(128bit) = b0 - b7(16 bit) __m128i l2 = _mm_unpacklo_epi8(l8_2, zero); // C(128bit) = c0 - c7(16 bit) __m128i l3 = _mm_unpacklo_epi8(l8_3, zero); // D(128bit) = d0 - d7(16 bit) __m128i l4 = _mm_unpacklo_epi8(l8_4, zero); // E(128bit) = e0 - e7(16 bit) __m128i l5 = _mm_unpacklo_epi8(l8_5, zero); // F(128bit) = f0 - f7(16 bit) // Increment the pointer such that the loading starts from row G. data = data + 3 * stride; // The core vertical SIMD processes 2 input rows simultaneously to generate // output corresponding to 1 row. To streamline the core loop and eliminate // the need for conditional checks, the remaining rows 4 are processed // separately. for (int i = 0; i < height - 4; i += 2) { // g0 ... g7 __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); // h0 ... h7 __m128i l8_7 = _mm_loadl_epi64((__m128i *)(data + stride)); __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // G(128bit):g0-g7(16b) __m128i l7 = _mm_unpacklo_epi8(l8_7, zero); // H(128bit):h0-h7(16b) PROCESS_RESIZE_Y_WD8 } __m128i l8_6 = _mm_loadl_epi64((__m128i *)(data)); __m128i l6 = _mm_unpacklo_epi8(l8_6, zero); // Process the last 4 input rows here. for (int i = height - 4; i < height; i += 2) { __m128i l7 = l6; PROCESS_RESIZE_Y_WD8 } } if (remain_col) return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2, stride, stride - remain_col); return true; } // Blends a and b using mask and returns the result. static inline __m128i blend(__m128i a, __m128i b, __m128i mask) { const __m128i masked_b = _mm_and_si128(mask, b); const __m128i masked_a = _mm_andnot_si128(mask, a); return (_mm_or_si128(masked_a, masked_b)); } // Masks used for width 16 pixels, with left and right padding // requirements. static const uint8_t left_padding_mask[16] = { 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; static const uint8_t right_padding_mask[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255 }; static const uint8_t mask_16[16] = { 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, }; void av1_resize_horz_dir_sse2(const uint8_t *const input, int in_stride, uint8_t *intbuf, int height, int filtered_length, int width2) { assert(height % 2 == 0); // Invoke C for width less than 16. if (filtered_length < 16) { av1_resize_horz_dir_c(input, in_stride, intbuf, height, filtered_length, width2); return; } __m128i coeffs_x[2]; const int bits = FILTER_BITS; const int dst_stride = width2; const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); const uint8_t max_pixel = 255; const __m128i clip_pixel = _mm_set1_epi8((char)max_pixel); const __m128i zero = _mm_setzero_si128(); const __m128i start_pad_mask = _mm_loadu_si128((__m128i *)left_padding_mask); const __m128i end_pad_mask = _mm_loadu_si128((__m128i *)right_padding_mask); const __m128i mask_even = _mm_loadu_si128((__m128i *)mask_16); prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x); for (int i = 0; i < height; ++i) { int filter_offset = 0; int row01_offset = ROW_OFFSET; int remain_col = filtered_length; // To avoid pixel over-read at frame boundary, processing of 16 pixels // is done using the core loop only if sufficient number of pixels required // for the load are present.The remaining pixels are processed separately. 
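    // Added note: scalar reference for one output pixel of this 2:1
    // horizontal filter (it matches down2_symeven(), which handles the tail):
    //   out[j] = clip_pixel((f[0] * (in[2 * j]     + in[2 * j + 1]) +
    //                        f[1] * (in[2 * j - 1] + in[2 * j + 2]) +
    //                        f[2] * (in[2 * j - 2] + in[2 * j + 3]) +
    //                        f[3] * (in[2 * j - 3] + in[2 * j + 4]) +
    //                        (1 << (FILTER_BITS - 1))) >> FILTER_BITS)
    // where f = av1_down2_symeven_half_filter and out-of-range input indices
    // are clamped to the row edges.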
for (int j = 0; j <= filtered_length - 16; j += 16) { if (remain_col == 18 || remain_col == 20) { break; } const int is_last_cols16 = (j == filtered_length - 16); // While processing the last 16 pixels of the row, ensure that only valid // pixels are loaded. if (is_last_cols16) row01_offset = 0; const int in_idx = i * in_stride + j - filter_offset; const int out_idx = i * dst_stride + j / 2; remain_col -= 16; // a0 a1 a2 a3 .... a15 __m128i row00 = _mm_loadu_si128((__m128i *)&input[in_idx]); // a8 a9 a10 a11 .... a23 __m128i row01 = _mm_loadu_si128( (__m128i *)&input[in_idx + row01_offset + filter_offset]); filter_offset = 3; // Pad start pixels to the left, while processing the first pixels in the // row. if (j == 0) { const __m128i start_pixel_row0 = _mm_set1_epi8((char)input[i * in_stride]); row00 = blend(_mm_slli_si128(row00, 3), start_pixel_row0, start_pad_mask); } // Pad end pixels to the right, while processing the last pixels in the // row. if (is_last_cols16) { const __m128i end_pixel_row0 = _mm_set1_epi8((char)input[i * in_stride + filtered_length - 1]); row01 = blend(_mm_srli_si128(row01, ROW_OFFSET), end_pixel_row0, end_pad_mask); } // a2 a3 a4 a5 a6 a7 a8 a9 .... a17 const __m128i row0_1 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 2), _mm_srli_si128(row01, 2)); // a4 a5 a6 a7 a9 10 a11 a12 .... a19 const __m128i row0_2 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 4), _mm_srli_si128(row01, 4)); // a6 a7 a8 a9 a10 a11 a12 a13 .... a21 const __m128i row0_3 = _mm_unpacklo_epi64(_mm_srli_si128(row00, 6), _mm_srli_si128(row01, 6)); // a0 a2 a4 a6 a8 a10 a12 a14 (each 16 bit) const __m128i s0 = _mm_and_si128(row00, mask_even); // a1 a3 a5 a7 a9 a11 a13 a15 const __m128i s1 = _mm_and_si128(_mm_srli_epi16(row00, 8), mask_even); // a2 a4 a6 a8 a10 a12 a14 a16 const __m128i s2 = _mm_and_si128(row0_1, mask_even); // a3 a5 a7 a9 a11 a13 a15 a17 const __m128i s3 = _mm_and_si128(_mm_srli_epi16(row0_1, 8), mask_even); // a4 a6 a8 a10 a12 a14 a16 a18 const __m128i s4 = _mm_and_si128(row0_2, mask_even); // a5 a7 a9 a11 a13 a15 a17 a19 const __m128i s5 = _mm_and_si128(_mm_srli_epi16(row0_2, 8), mask_even); // a6 a8 a10 a12 a14 a16 a18 a20 const __m128i s6 = _mm_and_si128(row0_3, mask_even); // a7 a9 a11 a13 a15 a17 a19 a21 const __m128i s7 = _mm_and_si128(_mm_srli_epi16(row0_3, 8), mask_even); // a0a7 a2a9 a4a11 .... a12a19 a14a21 const __m128i s07 = _mm_add_epi16(s0, s7); // a1a6 a3a8 a5a10 .... a13a18 a15a20 const __m128i s16 = _mm_add_epi16(s1, s6); // a2a5 a4a7 a6a9 .... a14a17 a16a19 const __m128i s25 = _mm_add_epi16(s2, s5); // a3a4 a5a6 a7a8 .... a15a16 a17a18 const __m128i s34 = _mm_add_epi16(s3, s4); // a0a7 a1a6 a2a9 a3a8 a4a11 a5a10 a6a13 a7a12 const __m128i s1607_low = _mm_unpacklo_epi16(s07, s16); // a2a5 a3a4 a4a7 a5a6 a6a9 a7a8 a8a11 a9a10 const __m128i s3425_low = _mm_unpacklo_epi16(s25, s34); // a8a15 a9a14 a10a17 a11a16 a12a19 a13a18 a14a21 a15a20 const __m128i s1607_high = _mm_unpackhi_epi16(s07, s16); // a10a13 a11a12 a12a15 a13a14 a14a17 a15a16 a16a19 a17a18 const __m128i s3425_high = _mm_unpackhi_epi16(s25, s34); const __m128i r01_0 = _mm_madd_epi16(s3425_low, coeffs_x[1]); const __m128i r01_1 = _mm_madd_epi16(s1607_low, coeffs_x[0]); const __m128i r01_2 = _mm_madd_epi16(s3425_high, coeffs_x[1]); const __m128i r01_3 = _mm_madd_epi16(s1607_high, coeffs_x[0]); // Result of first 8 pixels of row0 (a0 to a7). 
// r0_0 r0_1 r0_2 r0_3 __m128i r00 = _mm_add_epi32(r01_0, r01_1); r00 = _mm_add_epi32(r00, round_const_bits); r00 = _mm_sra_epi32(r00, round_shift_bits); // Result of next 8 pixels of row0 (a8 to 15). // r0_4 r0_5 r0_6 r0_7 __m128i r01 = _mm_add_epi32(r01_2, r01_3); r01 = _mm_add_epi32(r01, round_const_bits); r01 = _mm_sra_epi32(r01, round_shift_bits); // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7 const __m128i res_16 = _mm_packs_epi32(r00, r01); const __m128i res_8 = _mm_packus_epi16(res_16, res_16); __m128i res = _mm_min_epu8(res_8, clip_pixel); res = _mm_max_epu8(res, zero); // r0_0 r0_1 r1_2 r0_3 r0_4 r0_5 r0_6 r0_7 _mm_storel_epi64((__m128i *)&intbuf[out_idx], res); } int wd_processed = filtered_length - remain_col; if (remain_col) { const int in_idx = (in_stride * i); const int out_idx = (wd_processed / 2) + width2 * i; down2_symeven(input + in_idx, filtered_length, intbuf + out_idx, wd_processed); } } } aom-3.12.1/av1/common/x86/resize_ssse3.c000066400000000000000000001142361477627663500176010ustar00rootroot00000000000000/* * * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSSE3 #include "config/av1_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_ssse3.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/transpose_sse2.h" #include "av1/common/resize.h" static inline __m128i scale_plane_2_to_1_phase_0_kernel( const uint8_t *const src, const __m128i *const mask) { const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); const __m128i a_and = _mm_and_si128(a, *mask); const __m128i b_and = _mm_and_si128(b, *mask); return _mm_packus_epi16(a_and, b_and); } static inline void shuffle_filter_odd_ssse3(const int16_t *const filter, __m128i *const f) { const __m128i f_values = _mm_load_si128((const __m128i *)filter); // pack and duplicate the filter values // It utilizes the fact that the high byte of filter[3] is always 0 to clean // half of f[0] and f[4]. assert(filter[3] >= 0 && filter[3] < 256); f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); } static inline __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); // compensate the subtracted 64 in f[1]. x4 is always non negative. 
const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); // add and saturate the results together __m128i temp = _mm_adds_epi16(x0, x3); temp = _mm_adds_epi16(temp, x1); temp = _mm_adds_epi16(temp, x2); temp = _mm_adds_epi16(temp, x4); // round and shift by 7 bit each 16 bit temp = _mm_adds_epi16(temp, k_64); temp = _mm_srai_epi16(temp, 7); return temp; } static inline __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, const __m128i *const f) { // multiply 2 adjacent elements with the filter and add the result const __m128i k_64 = _mm_set1_epi16(1 << 6); const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); // compensate the subtracted 64 in f[2]. x5 is always non negative. const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); __m128i temp; // add and saturate the results together temp = _mm_adds_epi16(x0, x1); temp = _mm_adds_epi16(temp, x2); temp = _mm_adds_epi16(temp, x3); temp = _mm_adds_epi16(temp, x4); temp = _mm_adds_epi16(temp, x5); // round and shift by 7 bit each 16 bit temp = _mm_adds_epi16(temp, k_64); temp = _mm_srai_epi16(temp, 7); return temp; } static void scale_plane_2_to_1_phase_0(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const int dst_w, const int dst_h) { const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); int y = dst_h; do { int x = max_width; do { const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); _mm_storeu_si128((__m128i *)dst, d); src += 32; dst += 16; x -= 16; } while (x); src += 2 * (src_stride - max_width); dst += dst_stride - max_width; } while (--y); } static void scale_plane_4_to_1_phase_0(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const int dst_w, const int dst_h) { const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi32(0x000000FF); int y = dst_h; do { int x = max_width; do { const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); const __m128i d2 = _mm_packus_epi16(d0, d1); _mm_storeu_si128((__m128i *)dst, d2); src += 64; dst += 16; x -= 16; } while (x); src += 4 * (src_stride - max_width); dst += dst_stride - max_width; } while (--y); } static inline __m128i scale_plane_bilinear_kernel(const __m128i *const s, const __m128i c0c1) { const __m128i k_64 = _mm_set1_epi16(1 << 6); const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); // round and shift by 7 bit each 16 bit const __m128i t2 = _mm_adds_epi16(t0, k_64); const __m128i t3 = _mm_adds_epi16(t1, k_64); const __m128i t4 = _mm_srai_epi16(t2, 7); const __m128i t5 = _mm_srai_epi16(t3, 7); return _mm_packus_epi16(t4, t5); } static void scale_plane_2_to_1_bilinear(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const int dst_w, const int dst_h, const __m128i c0c1) { const int max_width = (dst_w + 15) & ~15; int y = dst_h; do { int x = max_width; do { __m128i s[2], d[2]; // Horizontal // Even rows s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); d[0] = scale_plane_bilinear_kernel(s, c0c1); // odd rows s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); s[1] = _mm_loadu_si128((const __m128i 
*)(src + src_stride + 16)); d[1] = scale_plane_bilinear_kernel(s, c0c1); // Vertical s[0] = _mm_unpacklo_epi8(d[0], d[1]); s[1] = _mm_unpackhi_epi8(d[0], d[1]); d[0] = scale_plane_bilinear_kernel(s, c0c1); _mm_storeu_si128((__m128i *)dst, d[0]); src += 32; dst += 16; x -= 16; } while (x); src += 2 * (src_stride - max_width); dst += dst_stride - max_width; } while (--y); } static void scale_plane_4_to_1_bilinear(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const int dst_w, const int dst_h, const __m128i c0c1) { const int max_width = (dst_w + 15) & ~15; int y = dst_h; do { int x = max_width; do { __m128i s[8], d[8]; // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. // Here we tried to not use shuffle instructions which would be slow // on some x86 CPUs. // Horizontal // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx d[0] = _mm_unpacklo_epi16(s[0], s[4]); d[1] = _mm_unpackhi_epi16(s[0], s[4]); d[2] = _mm_unpacklo_epi16(s[1], s[5]); d[3] = _mm_unpackhi_epi16(s[1], s[5]); d[4] = _mm_unpacklo_epi16(s[2], s[6]); d[5] = _mm_unpackhi_epi16(s[2], s[6]); d[6] = _mm_unpacklo_epi16(s[3], s[7]); d[7] = _mm_unpackhi_epi16(s[3], s[7]); // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx s[0] = _mm_unpacklo_epi32(d[0], d[1]); s[1] = _mm_unpackhi_epi32(d[0], d[1]); s[2] = _mm_unpacklo_epi32(d[2], d[3]); s[3] = _mm_unpackhi_epi32(d[2], d[3]); s[4] = _mm_unpacklo_epi32(d[4], d[5]); s[5] = _mm_unpackhi_epi32(d[4], d[5]); s[6] = _mm_unpacklo_epi32(d[6], d[7]); s[7] = _mm_unpackhi_epi32(d[6], d[7]); // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D // 030 
031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D d[0] = _mm_unpacklo_epi32(s[0], s[1]); d[1] = _mm_unpacklo_epi32(s[2], s[3]); d[2] = _mm_unpacklo_epi32(s[4], s[5]); d[3] = _mm_unpacklo_epi32(s[6], s[7]); d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); // Vertical d[0] = scale_plane_bilinear_kernel(d, c0c1); _mm_storeu_si128((__m128i *)dst, d[0]); src += 64; dst += 16; x -= 16; } while (x); src += 4 * (src_stride - max_width); dst += dst_stride - max_width; } while (--y); } static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const int16_t *const coef, uint8_t *const temp_buffer) { const int width_hor = (w + 1) & ~1; const int width_ver = (w + 7) & ~7; const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; const int height_ver = (h + 1) & ~1; int x, y = height_hor; uint8_t *t = temp_buffer; __m128i s[11], d[4]; __m128i f[4]; assert(w && h); shuffle_filter_ssse3(coef, f); src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; // horizontal 2x8 do { load_8bit_8x8(src + 4, src_stride, s); // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) transpose_16bit_4x8(s, s); x = width_hor; do { src += 8; load_8bit_8x8(src, src_stride, &s[2]); // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B transpose_16bit_4x8(&s[2], &s[2]); d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx d[0] = _mm_packus_epi16(d[0], d[0]); d[1] = _mm_packus_epi16(d[1], d[1]); // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 d[0] = _mm_unpacklo_epi16(d[0], d[1]); store_8bit_4x4_sse2(d[0], t, 2 * width_hor); s[0] = s[4]; s[1] = s[5]; t += 4; x -= 2; } while (x); src += 8 * src_stride - 4 * width_hor; t += 6 * width_hor; y -= 8; } while (y); // vertical 8x2 x = width_ver; t = temp_buffer; do { // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); t += 4 * width_hor; y = height_ver; do { // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 loadu_8bit_16x4(t, 2 * width_hor, &s[2]); t += 8 * width_hor; d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 d[0] = _mm_packus_epi16(d[0], d[1]); _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); s[0] = s[4]; s[1] = s[5]; dst += 2 * dst_stride; y -= 2; } while (y); t -= width_hor * (4 * height_ver + 4); t += 16; dst -= height_ver * dst_stride; dst += 8; x -= 8; } while (x); } static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const 
int16_t *const coef, uint8_t *const temp_buffer) { const int width_hor = (w + 3) & ~3; const int width_ver = (w + 7) & ~7; const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; const int height_ver = (h + 3) & ~3; int x, y = height_hor; uint8_t *t = temp_buffer; __m128i s[11], d[4]; __m128i f[4]; assert(w && h); shuffle_filter_ssse3(coef, f); src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; // horizontal 4x8 do { load_8bit_8x8(src + 2, src_stride, s); // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) transpose_16bit_4x8(s, s); x = width_hor; do { src += 8; load_8bit_8x8(src, src_stride, &s[3]); // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D transpose_16bit_4x8(&s[3], &s[3]); d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 d[0] = _mm_packus_epi16(d[0], d[2]); d[1] = _mm_packus_epi16(d[1], d[3]); // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 d[2] = _mm_unpacklo_epi16(d[0], d[1]); d[3] = _mm_unpackhi_epi16(d[0], d[1]); // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 d[0] = _mm_unpacklo_epi32(d[2], d[3]); d[1] = _mm_unpackhi_epi32(d[2], d[3]); store_8bit_8x4_from_16x2(d, t, 2 * width_hor); s[0] = s[4]; s[1] = s[5]; s[2] = s[6]; t += 8; x -= 4; } while (x); src += 8 * src_stride - 2 * width_hor; t += 6 * width_hor; y -= 8; } while (y); // vertical 8x4 x = width_ver; t = temp_buffer; do { // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); t += 6 * width_hor; y = height_ver; do { // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 loadu_8bit_16x4(t, 2 * width_hor, &s[3]); t += 8 * width_hor; d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 d[0] = _mm_packus_epi16(d[0], d[1]); d[1] = _mm_packus_epi16(d[2], d[3]); store_8bit_8x4_from_16x2(d, dst, dst_stride); s[0] = s[4]; s[1] = s[5]; s[2] = s[6]; dst += 4 * dst_stride; y -= 4; } while (y); t -= width_hor * (2 * height_ver + 6); t += 16; dst -= height_ver * dst_stride; dst += 8; x -= 8; } while (x); } typedef void (*shuffle_filter_funcs)(const int16_t *const filter, __m128i *const f); typedef __m128i (*convolve8_funcs)(const __m128i *const s, const __m128i *const f); 
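// Added note: scalar sketch of what the convolve8_8_*_ssse3() helpers above
// compute for one output sample, where f[] is one 8-tap kernel row and s[]
// holds the 8 contributing pixels:
//   out = clip_pixel((f[0] * s[0] + f[1] * s[1] + ... + f[7] * s[7] + 64) >> 7)
// The even/odd "offset" variants differ only in how the byte pairs are
// grouped for PMADDUBSW and in which register carries the 64-compensated tap.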
static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, uint8_t *dst, const int dst_stride, const int w, const int h, const InterpKernel *const coef, const int phase, uint8_t *const temp_buffer) { static const int step_q4 = 16 * 4 / 3; const int width_hor = (w + 5) - ((w + 5) % 6); const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels const int width_ver = (w + 7) & ~7; // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows // above and (SUBPEL_TAPS / 2) extra rows below. const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; const int height_ver = (h + 5) - ((h + 5) % 6); int x, y = height_hor; uint8_t *t = temp_buffer; __m128i s[12], d[6], dd[4]; __m128i f0[4], f1[5], f2[5]; // The offset of the first row is always less than 1 pixel. const int offset1_q4 = phase + 1 * step_q4; const int offset2_q4 = phase + 2 * step_q4; // offset_idxx indicates the pixel offset is even (0) or odd (1). // It's used to choose the src offset and filter coefficient offset. const int offset_idx1 = (offset1_q4 >> 4) & 1; const int offset_idx2 = (offset2_q4 >> 4) & 1; static const shuffle_filter_funcs shuffle_filter_func_list[2] = { shuffle_filter_ssse3, shuffle_filter_odd_ssse3 }; static const convolve8_funcs convolve8_func_list[2] = { convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 }; assert(w && h); shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0); shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); // Sub 64 to avoid overflow. // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. // When filter phase idx is 1, the two biggest coefficients are shuffled // together, and the sum of them are always no less than 128. Sub 64 here. // After the subtraction, when the sum of all positive coefficients are no // larger than 128, and the sum of all negative coefficients are no // less than -128, there will be no overflow in the convolve8 functions. 
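// Note (illustrative): _mm_maddubs_epi16 multiplies unsigned 8-bit pixels
// (<= 255) by signed 8-bit coefficients and sums adjacent products into a
// signed 16-bit lane, so a coefficient of +128 is not representable and an
// adjacent pair of positive taps may sum to at most 128
// (255 * 128 = 32640 <= 32767, while 255 * 129 = 32895 would overflow).
// Subtracting 64 from the largest tap below keeps the sums inside that range.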
f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; // horizontal 6x8 do { load_8bit_8x8(src, src_stride, s); // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 transpose_16bit_4x8(s, s); x = width_hor; do { src += 8; load_8bit_8x8(src, src_stride, &s[4]); // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F transpose_16bit_4x8(&s[4], &s[4]); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx dd[0] = _mm_packus_epi16(d[0], d[2]); dd[1] = _mm_packus_epi16(d[1], d[3]); dd[2] = _mm_packus_epi16(d[4], d[4]); dd[3] = _mm_packus_epi16(d[5], d[5]); // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx dd[0] = _mm_unpacklo_epi32(d[0], d[1]); dd[1] = _mm_unpackhi_epi32(d[0], d[1]); dd[2] = _mm_unpacklo_epi32(d[2], d[2]); dd[3] = _mm_unpackhi_epi32(d[2], d[2]); // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); // store 4 extra pixels storeu_8bit_16x4(d, t, stride_hor); s[0] = s[4]; s[1] = s[5]; s[2] = s[6]; s[3] = s[7]; t += 12; x -= 6; } while (x); src += 8 * src_stride - 4 * width_hor / 3; t += 3 * stride_hor + 4; y -= 8; } while (y); // vertical 8x6 x = width_ver; t = temp_buffer; do { // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 loadu_8bit_16x4(t, stride_hor, s); y = height_ver; do { // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 t += 4 * stride_hor; loadu_8bit_16x4(t, 
stride_hor, &s[4]); d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 d[0] = _mm_packus_epi16(d[0], d[1]); d[2] = _mm_packus_epi16(d[2], d[3]); d[4] = _mm_packus_epi16(d[4], d[5]); _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); s[0] = s[4]; s[1] = s[5]; s[2] = s[6]; s[3] = s[7]; dst += 6 * dst_stride; y -= 6; } while (y); t -= stride_hor * 2 * height_ver / 3; t += 16; dst -= height_ver * dst_stride; dst += 8; x -= 8; } while (x); } static inline __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, const __m128i *const f) { __m128i ss[4], temp; ss[0] = _mm_unpacklo_epi8(s[0], s[1]); ss[1] = _mm_unpacklo_epi8(s[2], s[3]); ss[2] = _mm_unpacklo_epi8(s[4], s[5]); ss[3] = _mm_unpacklo_epi8(s[6], s[7]); temp = convolve8_8_ssse3(ss, f); return _mm_packus_epi16(temp, temp); } // Only calculate odd columns since even columns are just src pixels' copies. static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, const int w, const __m128i *const f) { int x = w; do { __m128i s[8], temp; s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); temp = scale_1_to_2_phase_0_kernel(s, f); _mm_storel_epi64((__m128i *)dst, temp); src += 8; dst += 8; x -= 8; } while (x); } static void scale_plane_1_to_2_phase_0(const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, const ptrdiff_t dst_stride, const int src_w, const int src_h, const int16_t *const coef, uint8_t *const temp_buffer) { int max_width; int y; uint8_t *tmp[9]; __m128i f[4]; max_width = (src_w + 7) & ~7; tmp[0] = temp_buffer + 0 * max_width; tmp[1] = temp_buffer + 1 * max_width; tmp[2] = temp_buffer + 2 * max_width; tmp[3] = temp_buffer + 3 * max_width; tmp[4] = temp_buffer + 4 * max_width; tmp[5] = temp_buffer + 5 * max_width; tmp[6] = temp_buffer + 6 * max_width; tmp[7] = temp_buffer + 7 * max_width; shuffle_filter_ssse3(coef, f); scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); y = src_h; do { int x; scale_1_to_2_phase_0_row(src + 4 * 
src_stride - 3, tmp[7], max_width, f); for (x = 0; x < max_width; x += 8) { __m128i s[8], C, D, CD; // Even rows const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); const __m128i ab = _mm_unpacklo_epi8(a, b); _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); // Odd rows // Even columns load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); C = scale_1_to_2_phase_0_kernel(s, f); // Odd columns s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); D = scale_1_to_2_phase_0_kernel(s, f); CD = _mm_unpacklo_epi8(C, D); _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } src += src_stride; dst += 2 * dst_stride; tmp[8] = tmp[0]; tmp[0] = tmp[1]; tmp[1] = tmp[2]; tmp[2] = tmp[3]; tmp[3] = tmp[4]; tmp[4] = tmp[5]; tmp[5] = tmp[6]; tmp[6] = tmp[7]; tmp[7] = tmp[8]; } while (--y); } // There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling // in SSSE3. static inline bool has_normative_scaler_ssse3(const int src_width, const int src_height, const int dst_width, const int dst_height) { const bool has_normative_scaler = (2 * dst_width == src_width && 2 * dst_height == src_height) || (4 * dst_width == src_width && 4 * dst_height == src_height) || (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) || (dst_width == src_width * 2 && dst_height == src_height * 2); return has_normative_scaler; } void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, const InterpFilter filter, const int phase, const int num_planes) { bool has_normative_scaler = has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height, dst->y_crop_width, dst->y_crop_height); if (num_planes > 1) { has_normative_scaler = has_normative_scaler && has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height, dst->uv_crop_width, dst->uv_crop_height); } if (!has_normative_scaler) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); return; } // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. 
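// Per-plane dispatch below; each supported ratio picks a phase-0, bilinear or
// general 8-tap path as applicable, and any temp-buffer allocation failure
// falls back to av1_resize_and_extend_frame_c():
//   src:dst == 2:1 -> scale_plane_2_to_1_{phase_0,bilinear,general}
//   src:dst == 4:1 -> scale_plane_4_to_1_{phase_0,bilinear,general}
//   src:dst == 4:3 -> scale_plane_4_to_3_general
//   src:dst == 1:2 -> scale_plane_1_to_2_phase_0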
int malloc_failed = 0; for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { const int is_uv = i > 0; const int src_w = src->crop_widths[is_uv]; const int src_h = src->crop_heights[is_uv]; const int src_y_w = (src->crop_widths[0] + 1) & ~1; const int dst_w = dst->crop_widths[is_uv]; const int dst_h = dst->crop_heights[is_uv]; const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; if (2 * dst_w == src_w && 2 * dst_h == src_h) { // 2 to 1 if (phase == 0) { scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h); } else if (filter == BILINEAR) { const int16_t c0 = av1_bilinear_filters[phase][3]; const int16_t c1 = av1_bilinear_filters[phase][4]; const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, c0c1); } else { const int buffer_stride = (dst_y_w + 3) & ~3; const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { // 4 to 1 if (phase == 0) { scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h); } else if (filter == BILINEAR) { const int16_t c0 = av1_bilinear_filters[phase][3]; const int16_t c1 = av1_bilinear_filters[phase][4]; const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, c0c1); } else { const int buffer_stride = (dst_y_w + 1) & ~1; const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; // When dst_w is 1 or 2, we need extra padding to avoid heap read // overflow const int extra_padding = 16; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel[phase], temp_buffer); free(temp_buffer); } } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { // 4 to 3 const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; const int buffer_stride_ver = (dst_y_w + 7) & ~7; const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; // When the vertical filter reads more pixels than the horizontal filter // generated in each row, we need extra padding to avoid heap read // overflow. For example, the horizontal filter generates 18 pixels but // the vertical filter reads 24 pixels in a row. The difference is // multiplied by 2 since two rows are interlaced together in the // optimization. const int extra_padding = (buffer_stride_ver > buffer_stride_hor) ? 
2 * (buffer_stride_ver - buffer_stride_hor) : 0; const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], dst_w, dst_h, interp_kernel, phase, temp_buffer); free(temp_buffer); } else { assert(dst_w == src_w * 2 && dst_h == src_h * 2); // 1 to 2 uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7)); if (!temp_buffer) { malloc_failed = 1; break; } const InterpKernel *interp_kernel = (const InterpKernel *)av1_interp_filter_params_list[filter] .filter_ptr; scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv], dst->buffers[i], dst->strides[is_uv], src_w, src_h, interp_kernel[8], temp_buffer); free(temp_buffer); } } if (malloc_failed) { av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); } else { aom_extend_frame_borders(dst, num_planes); } } aom-3.12.1/av1/common/x86/selfguided_avx2.c000066400000000000000000000700101477627663500202220ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/restoration.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" // Load 8 bytes from the possibly-misaligned pointer p, extend each byte to // 32-bit precision and return them in an AVX2 register. static __m256i yy256_load_extend_8_32(const void *p) { return _mm256_cvtepu8_epi32(xx_loadl_64(p)); } // Load 8 halfwords from the possibly-misaligned pointer p, extend each // halfword to 32-bit precision and return them in an AVX2 register. static __m256i yy256_load_extend_16_32(const void *p) { return _mm256_cvtepu16_epi32(xx_loadu_128(p)); } // Compute the scan of an AVX2 register holding 8 32-bit integers. If the // register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., // x0+x1+...+x7 // // Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers // (assumed small enough to be able to add them without overflow). // // Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. 
// // x = [h g f e][d c b a] // x01 = [g f e 0][c b a 0] // x02 = [g+h f+g e+f e][c+d b+c a+b a] // x03 = [e+f e 0 0][a+b a 0 0] // x04 = [e->h e->g e->f e][a->d a->c a->b a] // s = a->d // s01 = [a->d a->d a->d a->d] // s02 = [a->d a->d a->d a->d][0 0 0 0] // ret = [a->h a->g a->f a->e][a->d a->c a->b a] static __m256i scan_32(__m256i x) { const __m256i x01 = _mm256_slli_si256(x, 4); const __m256i x02 = _mm256_add_epi32(x, x01); const __m256i x03 = _mm256_slli_si256(x02, 8); const __m256i x04 = _mm256_add_epi32(x02, x03); const int32_t s = _mm256_extract_epi32(x04, 3); const __m128i s01 = _mm_set1_epi32(s); const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); return _mm256_add_epi32(x04, s02); } // Compute two integral images from src. B sums elements; A sums their // squares. The images are offset by one pixel, so will have width and height // equal to width + 1, height + 1 and the first row and column will be zero. // // A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple // of 8. static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { unsigned int i = 0; for (i = 0; i < (count & 0xffffffe0); i += 32) { _mm256_storeu_si256((__m256i *)(dest + i), *zero); _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); } for (; i < (count & 0xfffffff8); i += 8) { _mm256_storeu_si256((__m256i *)(dest + i), *zero); } for (; i < count; i++) { dest[i] = 0; } return dest; } static void integral_images(const uint8_t *src, int src_stride, int width, int height, int32_t *A, int32_t *B, int buf_stride) { const __m256i zero = _mm256_setzero_si256(); // Write out the zero top row memset_zero_avx(A, &zero, (width + 8)); memset_zero_avx(B, &zero, (width + 8)); for (int i = 0; i < height; ++i) { // Zero the left column. A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; // ldiff is the difference H - D where H is the output sample immediately // to the left and D is the output sample above it. These are scalars, // replicated across the eight lanes. __m256i ldiff1 = zero, ldiff2 = zero; for (int j = 0; j < width; j += 8) { const int ABj = 1 + j; const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); const __m256i x2 = _mm256_madd_epi16(x1, x1); const __m256i sc1 = scan_32(x1); const __m256i sc2 = scan_32(x2); const __m256i row1 = _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); const __m256i row2 = _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); yy_store_256(B + ABj + (i + 1) * buf_stride, row1); yy_store_256(A + ABj + (i + 1) * buf_stride, row2); // Calculate the new H - D. ldiff1 = _mm256_set1_epi32( _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); ldiff2 = _mm256_set1_epi32( _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); } } } // Compute two integral images from src. B sums elements; A sums their squares // // A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. static void integral_images_highbd(const uint16_t *src, int src_stride, int width, int height, int32_t *A, int32_t *B, int buf_stride) { const __m256i zero = _mm256_setzero_si256(); // Write out the zero top row memset_zero_avx(A, &zero, (width + 8)); memset_zero_avx(B, &zero, (width + 8)); for (int i = 0; i < height; ++i) { // Zero the left column. 
A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; // ldiff is the difference H - D where H is the output sample immediately // to the left and D is the output sample above it. These are scalars, // replicated across the eight lanes. __m256i ldiff1 = zero, ldiff2 = zero; for (int j = 0; j < width; j += 8) { const int ABj = 1 + j; const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); const __m256i x2 = _mm256_madd_epi16(x1, x1); const __m256i sc1 = scan_32(x1); const __m256i sc2 = scan_32(x2); const __m256i row1 = _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); const __m256i row2 = _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); yy_store_256(B + ABj + (i + 1) * buf_stride, row1); yy_store_256(A + ABj + (i + 1) * buf_stride, row2); // Calculate the new H - D. ldiff1 = _mm256_set1_epi32( _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); ldiff2 = _mm256_set1_epi32( _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); } } } // Compute 8 values of boxsum from the given integral image. ii should point // at the middle of the box (for the first value). r is the box radius. static inline __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); const __m256i u = _mm256_sub_epi32(tr, tl); const __m256i v = _mm256_sub_epi32(br, bl); return _mm256_sub_epi32(v, u); } static __m256i round_for_shift(unsigned shift) { return _mm256_set1_epi32((1 << shift) >> 1); } static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { __m256i an, bb; if (bit_depth > 8) { const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); const __m256i rounding_b = round_for_shift(bit_depth - 8); const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); const __m256i a = _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); const __m256i b = _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); // b < 2^14, so we can use a 16-bit madd rather than a 32-bit // mullo to square it bb = _mm256_madd_epi16(b, b); an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); } else { bb = _mm256_madd_epi16(sum1, sum1); an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); } return _mm256_sub_epi32(an, bb); } // Assumes that C, D are integral images for the original buffer which has been // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels // on the sides. A, B, C, D point at logical position (0, 0). 
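// For reference (a sketch, not part of the build): each row of the integral
// images above is built 8 columns at a time from an in-register prefix scan
// (scan_32), the row above, and a carried horizontal running total, so that a
// whole (2r+1)x(2r+1) box sum later needs only the four corner reads done by
// boxsum_from_ii(), and compute_p() then forms n * sum_sq - sum * sum, i.e.
// n^2 times the window variance (after the bit-depth-dependent shifts for
// > 8 bit input).
#if 0
static int32_t boxsum_ref(const int32_t *ii /* box centre */, int stride,
                          int r) {
  return ii[r * stride + r] - ii[r * stride - (r + 1)] -
         ii[-(r + 1) * stride + r] + ii[-(r + 1) * stride - (r + 1)];
}
#endif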
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); // Set up masks const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); __m256i mask[8]; for (int idx = 0; idx < 8; idx++) { const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); } for (int i = -1; i < height + 1; ++i) { for (int j = -1; j < width + 1; j += 8) { const int32_t *Cij = C + i * buf_stride + j; const int32_t *Dij = D + i * buf_stride + j; __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain // some uninitialised data in their upper words. We use a mask to // ensure that these bits are set to 0. int idx = AOMMIN(8, width + 1 - j); assert(idx >= 1); if (idx < 8) { sum1 = _mm256_and_si256(mask[idx], sum1); sum2 = _mm256_and_si256(mask[idx], sum2); } const __m256i p = compute_p(sum1, sum2, bit_depth, n); const __m256i z = _mm256_min_epi32( _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); const __m256i a_complement = _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); // sum1 might have lanes greater than 2^15, so we can't use madd to do // multiplication involving sum1. However, a_complement and one_over_n // are both less than 256, so we can multiply them first. const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); yy_storeu_256(B + i * buf_stride + j, b_res); } } } // Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter // where the outer four corners have weight 3 and all other pixels have weight // 4. 
// // Pixels are indexed as follows: // xtl xt xtr // xl x xr // xbl xb xbr // // buf points to x // // fours = xl + xt + xr + xb + x // threes = xtl + xtr + xbr + xbl // cross_sum = 4 * fours + 3 * threes // = 4 * (fours + threes) - threes // = (fours + threes) << 2 - threes static inline __m256i cross_sum(const int32_t *buf, int stride) { const __m256i xtl = yy_loadu_256(buf - 1 - stride); const __m256i xt = yy_loadu_256(buf - stride); const __m256i xtr = yy_loadu_256(buf + 1 - stride); const __m256i xl = yy_loadu_256(buf - 1); const __m256i x = yy_loadu_256(buf); const __m256i xr = yy_loadu_256(buf + 1); const __m256i xbl = yy_loadu_256(buf - 1 + stride); const __m256i xb = yy_loadu_256(buf + stride); const __m256i xbr = yy_loadu_256(buf + 1 + stride); const __m256i fours = _mm256_add_epi32( xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); const __m256i threes = _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), threes); } // The final filter for self-guided restoration. Computes a weighted average // across A, B with "cross sums" (see cross_sum implementation above). static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, const int32_t *B, int buf_stride, const void *dgd8, int dgd_stride, int width, int height, int highbd) { const int nb = 5; const __m256i rounding = round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); const uint8_t *dgd_real = highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); const __m128i raw = xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); const __m256i src = highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); yy_storeu_256(dst + i * dst_stride + j, w); } } } // Assumes that C, D are integral images for the original buffer which has been // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels // on the sides. A, B, C, D point at logical position (0, 0). 
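// Note on the normalisation shifts used by the final filters: the cross_sum()
// weights above sum to 5 * 4 + 4 * 3 = 32 = 2^5, which is the nb = 5 term in
// final_filter(). In the "fast" variants below, the even-row weights
// (5 6 5 in the rows above and below) also sum to 32 (nb0 = 5), while the
// odd-row weights (5 6 5 within a single row) sum to 16 (nb1 = 4). The
// even-row sum is 5 * fives + 6 * sixes = 5 * (fives + sixes) + sixes
// = ((fives + sixes) << 2) + (fives + sixes) + sixes, which is what
// cross_sum_fast_even_row() computes.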
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); // Set up masks const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); __m256i mask[8]; for (int idx = 0; idx < 8; idx++) { const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); } for (int i = -1; i < height + 1; i += 2) { for (int j = -1; j < width + 1; j += 8) { const int32_t *Cij = C + i * buf_stride + j; const int32_t *Dij = D + i * buf_stride + j; __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain // some uninitialised data in their upper words. We use a mask to // ensure that these bits are set to 0. int idx = AOMMIN(8, width + 1 - j); assert(idx >= 1); if (idx < 8) { sum1 = _mm256_and_si256(mask[idx], sum1); sum2 = _mm256_and_si256(mask[idx], sum2); } const __m256i p = compute_p(sum1, sum2, bit_depth, n); const __m256i z = _mm256_min_epi32( _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), SGRPROJ_MTABLE_BITS), _mm256_set1_epi32(255)); const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); yy_storeu_256(A + i * buf_stride + j, a_res); const __m256i a_complement = _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); // sum1 might have lanes greater than 2^15, so we can't use madd to do // multiplication involving sum1. However, a_complement and one_over_n // are both less than 256, so we can multiply them first. const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); yy_storeu_256(B + i * buf_stride + j, b_res); } } } // Calculate 8 values of the "cross sum" starting at buf. // // Pixels are indexed like this: // xtl xt xtr // - buf - // xbl xb xbr // // Pixels are weighted like this: // 5 6 5 // 0 0 0 // 5 6 5 // // fives = xtl + xtr + xbl + xbr // sixes = xt + xb // cross_sum = 6 * sixes + 5 * fives // = 5 * (fives + sixes) - sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes static inline __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { const __m256i xtl = yy_loadu_256(buf - 1 - stride); const __m256i xt = yy_loadu_256(buf - stride); const __m256i xtr = yy_loadu_256(buf + 1 - stride); const __m256i xbl = yy_loadu_256(buf - 1 + stride); const __m256i xb = yy_loadu_256(buf + stride); const __m256i xbr = yy_loadu_256(buf + 1 + stride); const __m256i fives = _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); const __m256i sixes = _mm256_add_epi32(xt, xb); const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); return _mm256_add_epi32( _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } // Calculate 8 values of the "cross sum" starting at buf. 
// // Pixels are indexed like this: // xl x xr // // Pixels are weighted like this: // 5 6 5 // // buf points to x // // fives = xl + xr // sixes = x // cross_sum = 5 * fives + 6 * sixes // = 4 * (fives + sixes) + (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes static inline __m256i cross_sum_fast_odd_row(const int32_t *buf) { const __m256i xl = yy_loadu_256(buf - 1); const __m256i x = yy_loadu_256(buf); const __m256i xr = yy_loadu_256(buf + 1); const __m256i fives = _mm256_add_epi32(xl, xr); const __m256i sixes = x; const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); return _mm256_add_epi32( _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } // The final filter for the self-guided restoration. Computes a // weighted average across A, B with "cross sums" (see cross_sum_... // implementations above). static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, const int32_t *B, int buf_stride, const void *dgd8, int dgd_stride, int width, int height, int highbd) { const int nb0 = 5; const int nb1 = 4; const __m256i rounding0 = round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); const __m256i rounding1 = round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); const uint8_t *dgd_real = highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; for (int i = 0; i < height; ++i) { if (!(i & 1)) { // even row for (int j = 0; j < width; j += 8) { const __m256i a = cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); const __m256i b = cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); const __m128i raw = xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); const __m256i src = highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); yy_storeu_256(dst + i * dst_stride + j, w); } } else { // odd row for (int j = 0; j < width; j += 8) { const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); const __m128i raw = xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); const __m256i src = highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); yy_storeu_256(dst + i * dst_stride + j, w); } } } } int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, // Ctl and Dtl is 32-byte aligned. const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); int32_t *buf = aom_memalign( 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); if (!buf) return -1; const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 32 bytes for efficiency. int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); // The "tl" pointers point at the top-left of the initialised data for the // array. 
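// (Why "+ 7": buf comes from aom_memalign(32, ...) and buf_elts is a multiple
// of 8 int32_t values, so offsetting each plane by 7 elements places its
// column 1 -- i.e. base + 8 int32_t = base + 32 bytes -- on a 32-byte
// boundary, as assumed by the aligned accesses in integral_images().)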
int32_t *Atl = buf + 0 * buf_elts + 7; int32_t *Btl = buf + 1 * buf_elts + 7; int32_t *Ctl = buf + 2 * buf_elts + 7; int32_t *Dtl = buf + 3 * buf_elts + 7; // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note // there's a zero row and column in A, B (integral images), so we move down // and right one for them. const int buf_diag_border = SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; int32_t *A0 = Atl + 1 + buf_stride; int32_t *B0 = Btl + 1 + buf_stride; int32_t *C0 = Ctl + 1 + buf_stride; int32_t *D0 = Dtl + 1 + buf_stride; // Finally, A, B, C, D point at position (0, 0). int32_t *A = A0 + buf_diag_border; int32_t *B = B0 + buf_diag_border; int32_t *C = C0 + buf_diag_border; int32_t *D = D0 + buf_diag_border; const int dgd_diag_border = SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; const uint8_t *dgd0 = dgd8 - dgd_diag_border; // Generate integral images from the input. C will contain sums of squares; D // will contain just sums if (highbd) integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); else integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to // skipping SGR entirely. assert(!(params->r[0] == 0 && params->r[1] == 0)); assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); if (params->r[0] > 0) { calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, 0); final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } if (params->r[1] > 0) { calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, 1); final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } aom_free(buf); return 0; } int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); const int ret = av1_selfguided_restoration_avx2( dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); if (ret != 0) return ret; const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; av1_decode_xq(xqd, xq, params); __m256i xq0 = _mm256_set1_epi32(xq[0]); __m256i xq1 = _mm256_set1_epi32(xq[1]); for (int i = 0; i < height; ++i) { // Calculate output in batches of 16 pixels for (int j = 0; j < width; j += 16) { const int k = i * width + j; const int m = i * dst_stride + j; const uint8_t *dat8ij = dat8 + i * stride + j; __m256i ep_0, ep_1; __m128i src_0, src_1; if (highbd) { src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); ep_0 = _mm256_cvtepu16_epi32(src_0); ep_1 = _mm256_cvtepu16_epi32(src_1); } else { src_0 = xx_loadu_128(dat8ij); ep_0 = _mm256_cvtepu8_epi32(src_0); ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); } const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); __m256i v_1 = _mm256_slli_epi32(u_1, 
SGRPROJ_PRJ_BITS); if (params->r[0] > 0) { const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); } if (params->r[1] > 0) { const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); } const __m256i rounding = round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m256i w_0 = _mm256_srai_epi32( _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m256i w_1 = _mm256_srai_epi32( _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); if (highbd) { // Pack into 16 bits and clamp to [0, 2^bit_depth) // Note that packing into 16 bits messes up the order of the bits, // so we use a permute function to correct this const __m256i tmp = _mm256_packus_epi32(w_0, w_1); const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); const __m256i res = _mm256_min_epi16(tmp2, max); yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); } else { // Pack into 8 bits and clamp to [0, 256) // Note that each pack messes up the order of the bits, // so we use a permute function to correct this const __m256i tmp = _mm256_packs_epi32(w_0, w_1); const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); const __m256i res = _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); const __m128i res2 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); xx_storeu_128(dst8 + m, res2); } } } return 0; } aom-3.12.1/av1/common/x86/selfguided_sse4.c000066400000000000000000000635531477627663500202360ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/restoration.h" #include "aom_dsp/x86/synonyms.h" // Load 4 bytes from the possibly-misaligned pointer p, extend each byte to // 32-bit precision and return them in an SSE register. static __m128i xx_load_extend_8_32(const void *p) { return _mm_cvtepu8_epi32(xx_loadl_32(p)); } // Load 4 halfwords from the possibly-misaligned pointer p, extend each // halfword to 32-bit precision and return them in an SSE register. static __m128i xx_load_extend_16_32(const void *p) { return _mm_cvtepu16_epi32(xx_loadl_64(p)); } // Compute the scan of an SSE register holding 4 32-bit integers. If the // register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, // x0+x1+x2+x3 static __m128i scan_32(__m128i x) { const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); } // Compute two integral images from src. B sums elements; A sums their // squares. 
The images are offset by one pixel, so will have width and height // equal to width + 1, height + 1 and the first row and column will be zero. // // A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple // of 4. static void integral_images(const uint8_t *src, int src_stride, int width, int height, int32_t *A, int32_t *B, int buf_stride) { // Write out the zero top row memset(A, 0, sizeof(*A) * (width + 1)); memset(B, 0, sizeof(*B) * (width + 1)); const __m128i zero = _mm_setzero_si128(); for (int i = 0; i < height; ++i) { // Zero the left column. A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; // ldiff is the difference H - D where H is the output sample immediately // to the left and D is the output sample above it. These are scalars, // replicated across the four lanes. __m128i ldiff1 = zero, ldiff2 = zero; for (int j = 0; j < width; j += 4) { const int ABj = 1 + j; const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); const __m128i x2 = _mm_madd_epi16(x1, x1); const __m128i sc1 = scan_32(x1); const __m128i sc2 = scan_32(x2); const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); xx_store_128(B + ABj + (i + 1) * buf_stride, row1); xx_store_128(A + ABj + (i + 1) * buf_stride, row2); // Calculate the new H - D. ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); } } } // Compute two integral images from src. B sums elements; A sums their squares // // A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. static void integral_images_highbd(const uint16_t *src, int src_stride, int width, int height, int32_t *A, int32_t *B, int buf_stride) { // Write out the zero top row memset(A, 0, sizeof(*A) * (width + 1)); memset(B, 0, sizeof(*B) * (width + 1)); const __m128i zero = _mm_setzero_si128(); for (int i = 0; i < height; ++i) { // Zero the left column. A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; // ldiff is the difference H - D where H is the output sample immediately // to the left and D is the output sample above it. These are scalars, // replicated across the four lanes. __m128i ldiff1 = zero, ldiff2 = zero; for (int j = 0; j < width; j += 4) { const int ABj = 1 + j; const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); const __m128i x2 = _mm_madd_epi16(x1, x1); const __m128i sc1 = scan_32(x1); const __m128i sc2 = scan_32(x2); const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); xx_store_128(B + ABj + (i + 1) * buf_stride, row1); xx_store_128(A + ABj + (i + 1) * buf_stride, row2); // Calculate the new H - D. ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); } } } // Compute 4 values of boxsum from the given integral image. ii should point // at the middle of the box (for the first value). r is the box radius. 
static inline __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); const __m128i u = _mm_sub_epi32(tr, tl); const __m128i v = _mm_sub_epi32(br, bl); return _mm_sub_epi32(v, u); } static __m128i round_for_shift(unsigned shift) { return _mm_set1_epi32((1 << shift) >> 1); } static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { __m128i an, bb; if (bit_depth > 8) { const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); const __m128i rounding_b = round_for_shift(bit_depth - 8); const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); // b < 2^14, so we can use a 16-bit madd rather than a 32-bit // mullo to square it bb = _mm_madd_epi16(b, b); an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); } else { bb = _mm_madd_epi16(sum1, sum1); an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); } return _mm_sub_epi32(an, bb); } // Assumes that C, D are integral images for the original buffer which has been // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels // on the sides. A, B, C, D point at logical position (0, 0). static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = _mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); // Set up masks const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); __m128i mask[4]; for (int idx = 0; idx < 4; idx++) { const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); } for (int i = -1; i < height + 1; ++i) { for (int j = -1; j < width + 1; j += 4) { const int32_t *Cij = C + i * buf_stride + j; const int32_t *Dij = D + i * buf_stride + j; __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain // some uninitialised data in their upper words. We use a mask to // ensure that these bits are set to 0. int idx = AOMMIN(4, width + 1 - j); assert(idx >= 1); if (idx < 4) { sum1 = _mm_and_si128(mask[idx], sum1); sum2 = _mm_and_si128(mask[idx], sum2); } const __m128i p = compute_p(sum1, sum2, bit_depth, n); const __m128i z = _mm_min_epi32( _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), SGRPROJ_MTABLE_BITS), _mm_set1_epi32(255)); // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. 
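// (Illustrative scalar equivalent of the synthesized gather just below:
//  a_res[k] = av1_x_by_xplus1[z[k]] for each of the four 32-bit lanes k.)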
const __m128i a_res = _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], av1_x_by_xplus1[_mm_extract_epi32(z, 2)], av1_x_by_xplus1[_mm_extract_epi32(z, 1)], av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); const __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); // sum1 might have lanes greater than 2^15, so we can't use madd to do // multiplication involving sum1. However, a_complement and one_over_n // are both less than 256, so we can multiply them first. const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); const __m128i b_res = _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); xx_storeu_128(B + i * buf_stride + j, b_res); } } } // Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter // where the outer four corners have weight 3 and all other pixels have weight // 4. // // Pixels are indexed like this: // xtl xt xtr // xl x xr // xbl xb xbr // // buf points to x // // fours = xl + xt + xr + xb + x // threes = xtl + xtr + xbr + xbl // cross_sum = 4 * fours + 3 * threes // = 4 * (fours + threes) - threes // = (fours + threes) << 2 - threes static inline __m128i cross_sum(const int32_t *buf, int stride) { const __m128i xtl = xx_loadu_128(buf - 1 - stride); const __m128i xt = xx_loadu_128(buf - stride); const __m128i xtr = xx_loadu_128(buf + 1 - stride); const __m128i xl = xx_loadu_128(buf - 1); const __m128i x = xx_loadu_128(buf); const __m128i xr = xx_loadu_128(buf + 1); const __m128i xbl = xx_loadu_128(buf - 1 + stride); const __m128i xb = xx_loadu_128(buf + stride); const __m128i xbr = xx_loadu_128(buf + 1 + stride); const __m128i fours = _mm_add_epi32( xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); const __m128i threes = _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); } // The final filter for self-guided restoration. Computes a weighted average // across A, B with "cross sums" (see cross_sum implementation above). static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, const int32_t *B, int buf_stride, const void *dgd8, int dgd_stride, int width, int height, int highbd) { const int nb = 5; const __m128i rounding = round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); const uint8_t *dgd_real = highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); const __m128i src = highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); xx_storeu_128(dst + i * dst_stride + j, w); } } } // Assumes that C, D are integral images for the original buffer which has been // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels // on the sides. A, B, C, D point at logical position (0, 0). 
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, int width, int height, int buf_stride, int bit_depth, int sgr_params_idx, int radius_idx) { const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; const int r = params->r[radius_idx]; const int n = (2 * r + 1) * (2 * r + 1); const __m128i s = _mm_set1_epi32(params->s[radius_idx]); // one_over_n[n-1] is 2^12/n, so easily fits in an int16 const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); // Set up masks const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); __m128i mask[4]; for (int idx = 0; idx < 4; idx++) { const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); } for (int i = -1; i < height + 1; i += 2) { for (int j = -1; j < width + 1; j += 4) { const int32_t *Cij = C + i * buf_stride + j; const int32_t *Dij = D + i * buf_stride + j; __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain // some uninitialised data in their upper words. We use a mask to // ensure that these bits are set to 0. int idx = AOMMIN(4, width + 1 - j); assert(idx >= 1); if (idx < 4) { sum1 = _mm_and_si128(mask[idx], sum1); sum2 = _mm_and_si128(mask[idx], sum2); } const __m128i p = compute_p(sum1, sum2, bit_depth, n); const __m128i z = _mm_min_epi32( _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), SGRPROJ_MTABLE_BITS), _mm_set1_epi32(255)); // 'Gather' type instructions are not available pre-AVX2, so synthesize a // gather using scalar loads. const __m128i a_res = _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], av1_x_by_xplus1[_mm_extract_epi32(z, 2)], av1_x_by_xplus1[_mm_extract_epi32(z, 1)], av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); xx_storeu_128(A + i * buf_stride + j, a_res); const __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); // sum1 might have lanes greater than 2^15, so we can't use madd to do // multiplication involving sum1. However, a_complement and one_over_n // are both less than 256, so we can multiply them first. const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); const __m128i b_res = _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); xx_storeu_128(B + i * buf_stride + j, b_res); } } } // Calculate 4 values of the "cross sum" starting at buf. 
// // Pixels are indexed like this: // xtl xt xtr // - buf - // xbl xb xbr // // Pixels are weighted like this: // 5 6 5 // 0 0 0 // 5 6 5 // // fives = xtl + xtr + xbl + xbr // sixes = xt + xb // cross_sum = 6 * sixes + 5 * fives // = 5 * (fives + sixes) - sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes static inline __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { const __m128i xtl = xx_loadu_128(buf - 1 - stride); const __m128i xt = xx_loadu_128(buf - stride); const __m128i xtr = xx_loadu_128(buf + 1 - stride); const __m128i xbl = xx_loadu_128(buf - 1 + stride); const __m128i xb = xx_loadu_128(buf + stride); const __m128i xbr = xx_loadu_128(buf + 1 + stride); const __m128i fives = _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); const __m128i sixes = _mm_add_epi32(xt, xb); const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); return _mm_add_epi32( _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } // Calculate 4 values of the "cross sum" starting at buf. // // Pixels are indexed like this: // xl x xr // // Pixels are weighted like this: // 5 6 5 // // buf points to x // // fives = xl + xr // sixes = x // cross_sum = 5 * fives + 6 * sixes // = 4 * (fives + sixes) + (fives + sixes) + sixes // = (fives + sixes) << 2 + (fives + sixes) + sixes static inline __m128i cross_sum_fast_odd_row(const int32_t *buf) { const __m128i xl = xx_loadu_128(buf - 1); const __m128i x = xx_loadu_128(buf); const __m128i xr = xx_loadu_128(buf + 1); const __m128i fives = _mm_add_epi32(xl, xr); const __m128i sixes = x; const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); return _mm_add_epi32( _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), sixes); } // The final filter for the self-guided restoration. Computes a // weighted average across A, B with "cross sums" (see cross_sum_... // implementations above). static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, const int32_t *B, int buf_stride, const void *dgd8, int dgd_stride, int width, int height, int highbd) { const int nb0 = 5; const int nb1 = 4; const __m128i rounding0 = round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); const __m128i rounding1 = round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); const uint8_t *dgd_real = highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; for (int i = 0; i < height; ++i) { if (!(i & 1)) { // even row for (int j = 0; j < width; j += 4) { const __m128i a = cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); const __m128i b = cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); const __m128i src = highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); xx_storeu_128(dst + i * dst_stride + j, w); } } else { // odd row for (int j = 0; j < width; j += 4) { const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); const __m128i raw = xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); const __m128i src = highbd ? 
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); xx_storeu_128(dst + i * dst_stride + j, w); } } } } int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, int sgr_params_idx, int bit_depth, int highbd) { int32_t *buf = (int32_t *)aom_memalign( 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); if (!buf) return -1; memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; // Adjusting the stride of A and B here appears to avoid bad cache effects, // leading to a significant speed improvement. // We also align the stride to a multiple of 16 bytes for efficiency. int buf_stride = ((width_ext + 3) & ~3) + 16; // The "tl" pointers point at the top-left of the initialised data for the // array. Adding 3 here ensures that column 1 is 16-byte aligned. int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note // there's a zero row and column in A, B (integral images), so we move down // and right one for them. const int buf_diag_border = SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; int32_t *A0 = Atl + 1 + buf_stride; int32_t *B0 = Btl + 1 + buf_stride; int32_t *C0 = Ctl + 1 + buf_stride; int32_t *D0 = Dtl + 1 + buf_stride; // Finally, A, B, C, D point at position (0, 0). int32_t *A = A0 + buf_diag_border; int32_t *B = B0 + buf_diag_border; int32_t *C = C0 + buf_diag_border; int32_t *D = D0 + buf_diag_border; const int dgd_diag_border = SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; const uint8_t *dgd0 = dgd8 - dgd_diag_border; // Generate integral images from the input. C will contain sums of squares; D // will contain just sums if (highbd) integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); else integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, buf_stride); const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; // Write to flt0 and flt1 // If params->r == 0 we skip the corresponding filter. We only allow one of // the radii to be 0, as having both equal to 0 would be equivalent to // skipping SGR entirely. 
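// When r[0] is non-zero it is run through the subsampled pipeline
// (calc_ab_fast + final_filter_fast, with A/B computed on every other row)
// and written to flt0; when r[1] is non-zero it uses the full calc_ab +
// final_filter pair and is written to flt1.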
assert(!(params->r[0] == 0 && params->r[1] == 0)); assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); if (params->r[0] > 0) { calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, 0); final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } if (params->r[1] > 0) { calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, 1); final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, height, highbd); } aom_free(buf); return 0; } int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd) { int32_t *flt0 = tmpbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; assert(width * height <= RESTORATION_UNITPELS_MAX); const int ret = av1_selfguided_restoration_sse4_1( dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); if (ret != 0) return ret; const sgr_params_type *const params = &av1_sgr_params[eps]; int xq[2]; av1_decode_xq(xqd, xq, params); __m128i xq0 = _mm_set1_epi32(xq[0]); __m128i xq1 = _mm_set1_epi32(xq[1]); for (int i = 0; i < height; ++i) { // Calculate output in batches of 8 pixels for (int j = 0; j < width; j += 8) { const int k = i * width + j; const int m = i * dst_stride + j; const uint8_t *dat8ij = dat8 + i * stride + j; __m128i src; if (highbd) { src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); } else { src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); } const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); const __m128i u_0 = _mm_cvtepu16_epi32(u); const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); if (params->r[0] > 0) { const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); } if (params->r[1] > 0) { const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); } const __m128i rounding = round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); if (highbd) { // Pack into 16 bits and clamp to [0, 2^bit_depth) const __m128i tmp = _mm_packus_epi32(w_0, w_1); const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); const __m128i res = _mm_min_epi16(tmp, max); xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); } else { // Pack into 8 bits and clamp to [0, 256) const __m128i tmp = _mm_packs_epi32(w_0, w_1); const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); xx_storel_64(dst8 + m, res); } } } return 0; } aom-3.12.1/av1/common/x86/warp_plane_avx2.c000066400000000000000000001511021477627663500202410ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "av1/common/warped_motion.h" #include "aom_dsp/x86/synonyms.h" DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; DECLARE_ALIGNED(32, static const uint8_t, shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; static inline void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, __m256i *coeff, const __m256i *shuffle_src, const __m256i *round_const, const __m128i *shift, int row) { const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); const __m256i res_46 = _mm256_maddubs_epi16(src_1, coeff[1]); const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); const __m256i res_even = _mm256_add_epi16(res_02, res_46); const __m256i res_odd = _mm256_add_epi16(res_13, res_57); const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); horz_out[row] = _mm256_srl_epi16(res, *shift); } static inline void 
prepare_horizontal_filter_coeff_avx2(int alpha, int beta, int sx, __m256i *coeff) { __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_1 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_2 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_3 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_4 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_5 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_6 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_7 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> WARPEDDIFF_PREC_BITS]); __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); __m128i tmp_8 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); __m128i tmp_9 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); __m128i tmp_10 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); __m128i tmp_11 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); tmp_2 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); tmp_3 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); tmp_6 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); tmp_7 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); const __m256i tmp_13 = _mm256_unpacklo_epi16(tmp1_256, tmp3_256); const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); } static inline void 
prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, __m256i *coeff) { __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_1 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_2 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_3 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_4 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_5 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_6 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); __m128i tmp_7 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); } static inline void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, __m256i *coeff) { const __m128i tmp_0 = _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_1 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); const __m256i res_0 = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); coeff[0] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); coeff[1] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); coeff[2] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); coeff[3] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); } static inline void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, int sx, int alpha, int beta, int row, const __m256i *shuffle_src, const __m256i *round_const, const __m128i *shift) { __m256i coeff[4]; prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, row); } static inline void prepare_horizontal_filter_coeff(int alpha, int sx, __m256i *coeff) { const __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_1 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_2 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_3 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_4 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); 
const __m128i tmp_5 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_6 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_7 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14)); coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14)); coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15)); coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); } static inline void warp_horizontal_filter_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, const __m256i *shuffle_src) { int k, iy, sx, row = 0; __m256i coeff[4]; for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m128i src_0 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m128i src_1 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m256i src_01 = _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); sx = sx4 + beta * (k + 4); horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src, round_const, shift); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i src_01 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); sx = sx4 + beta * (k + 4); prepare_horizontal_filter_coeff(alpha, sx, coeff); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); } static inline void warp_horizontal_filter_alpha0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, const __m256i *shuffle_src) { (void)alpha; int k, iy, sx, row = 0; __m256i coeff[4]; for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m128i src_0 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m128i src_1 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m256i src_01 = _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); sx = sx4 + beta * (k + 4); prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i src_01 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); sx = sx4 + beta * (k + 4); prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); } static inline void 
warp_horizontal_filter_beta0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, const __m256i *shuffle_src) { (void)beta; int k, iy, row = 0; __m256i coeff[4]; prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff); for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m128i src_0 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m128i src_1 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m256i src_01 = _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i src_01 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); } static inline void warp_horizontal_filter_alpha0_beta0_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, const __m256i *shuffle_src) { (void)alpha; int k, iy, row = 0; __m256i coeff[4]; prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff); for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m128i src0 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m128i src1 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); const __m256i src_01 = _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i src_01 = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, shift, row); } static inline void unpack_weights_and_set_round_const_avx2( ConvolveParams *conv_params, const int round_bits, const int offset_bits, __m256i *res_sub_const, __m256i *round_bits_const, __m256i *wt) { *res_sub_const = _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1))); *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1)); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi16((short)w0); const __m256i wt1 = _mm256_set1_epi16((short)w1); *wt = _mm256_unpacklo_epi16(wt0, wt1); } static inline void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, int sy, __m256i *coeffs) { __m128i filt_00 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_01 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_02 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_03 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_10 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); 
__m128i filt_11 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_12 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_13 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); __m256i filt_0 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); __m256i filt_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); __m256i filt_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); __m256i filt_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); filt_00 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_01 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_02 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_03 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_10 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_11 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_12 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_13 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_0 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); filt_1 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); filt_2 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); filt_3 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); } static inline void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, __m256i *coeffs) { __m128i filt_00 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_01 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_02 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); __m128i filt_03 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00); __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01); __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02); __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03); __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); 
__m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); filt_00 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_01 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_02 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_03 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); filt_0 = _mm256_broadcastsi128_si256(filt_00); filt_1 = _mm256_broadcastsi128_si256(filt_01); filt_2 = _mm256_broadcastsi128_si256(filt_02); filt_3 = _mm256_broadcastsi128_si256(filt_03); res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); } static inline void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, __m256i *coeffs) { const __m128i filt_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); const __m128i filt_1 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS))); __m256i res_0 = _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1); coeffs[0] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2)); coeffs[1] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2)); coeffs[2] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2)); coeffs[3] = _mm256_shuffle_epi8( res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2)); coeffs[4] = coeffs[0]; coeffs[5] = coeffs[1]; coeffs[6] = coeffs[2]; coeffs[7] = coeffs[3]; } static inline void filter_src_pixels_vertical_avx2(__m256i *horz_out, __m256i *src, __m256i *coeffs, __m256i *res_lo, __m256i *res_hi, int row) { const __m256i src_6 = horz_out[row + 3]; const __m256i src_7 = _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21); src[6] = _mm256_unpacklo_epi16(src_6, src_7); const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]); const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]); const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]); const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]); const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); src[7] = _mm256_unpackhi_epi16(src_6, src_7); const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]); const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]); const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]); const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]); const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 
7 *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); } static inline void store_vertical_filter_output_avx2( const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, const __m256i *wt, const __m256i *res_sub_const, const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, const int round_bits) { __m256i res_lo_1 = *res_lo; __m256i res_hi_1 = *res_hi; if (conv_params->is_compound) { __m128i *const p_0 = (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; __m128i *const p_1 = (__m128i *)&conv_params ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); __m256i res_lo_16; if (conv_params->do_average) { __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i *const dst8_1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; const __m128i p_16_0 = _mm_loadl_epi64(p_0); const __m128i p_16_1 = _mm_loadl_epi64(p_1); const __m256i p_16 = _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); if (conv_params->use_dist_wtd_comp_avg) { const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); const __m256i shifted_32 = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); } else { res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); } res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); res_lo_16 = _mm256_srai_epi16( _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo); const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); } else { const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); _mm_storel_epi64(p_0, temp_lo_16_0); _mm_storel_epi64(p_1, temp_lo_16_1); } if (p_width > 4) { __m128i *const p4_0 = (__m128i *)&conv_params ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; __m128i *const p4_1 = (__m128i *)&conv_params ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); __m256i res_hi_16; if (conv_params->do_average) { __m128i *const dst8_4_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; __m128i *const dst8_4_1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); const __m256i p4_16 = _mm256_inserti128_si256( _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); if (conv_params->use_dist_wtd_comp_avg) { const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); const __m256i shifted_32 = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); } else { res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); } res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); res_hi_16 
= _mm256_srai_epi16( _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1); *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); } else { const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); _mm_storel_epi64(p4_0, temp_hi_16_0); _mm_storel_epi64(p4_1, temp_hi_16_1); } } } else { const __m256i res_lo_round = _mm256_srai_epi32( _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); const __m256i res_hi_round = _mm256_srai_epi32( _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); // Store, blending with 'pred' if needed __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; if (p_width == 4) { *(int *)p = _mm_cvtsi128_si32(res_8bit0); *(int *)p1 = _mm_cvtsi128_si32(res_8bit1); } else { _mm_storel_epi64(p, res_8bit0); _mm_storel_epi64(p1, res_8bit1); } } } static inline void warp_vertical_filter_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m256i *res_add_const, const int round_bits, const __m256i *res_sub_const, const __m256i *round_bits_const, const __m256i *wt) { int k, row = 0; __m256i src[8]; const __m256i src_0 = horz_out[0]; const __m256i src_1 = _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); const __m256i src_2 = horz_out[1]; const __m256i src_3 = _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); const __m256i src_4 = horz_out[2]; const __m256i src_5 = _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); src[0] = _mm256_unpacklo_epi16(src_0, src_1); src[2] = _mm256_unpacklo_epi16(src_2, src_3); src[4] = _mm256_unpacklo_epi16(src_4, src_5); src[1] = _mm256_unpackhi_epi16(src_0, src_1); src[3] = _mm256_unpackhi_epi16(src_2, src_3); src[5] = _mm256_unpackhi_epi16(src_4, src_5); for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { int sy = sy4 + delta * (k + 4); __m256i coeffs[8]; prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); __m256i res_lo, res_hi; filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, row); store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, res_sub_const, round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); src[0] = src[2]; src[2] = src[4]; src[4] = src[6]; src[1] = src[3]; src[3] = src[5]; src[5] = src[7]; row += 1; } } static inline void warp_vertical_filter_gamma0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m256i *res_add_const, const int round_bits, const __m256i *res_sub_const, const __m256i *round_bits_const, const __m256i *wt) { (void)gamma; int k, row = 0; __m256i src[8]; const __m256i src_0 = horz_out[0]; const __m256i src_1 = 
_mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); const __m256i src_2 = horz_out[1]; const __m256i src_3 = _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); const __m256i src_4 = horz_out[2]; const __m256i src_5 = _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); src[0] = _mm256_unpacklo_epi16(src_0, src_1); src[2] = _mm256_unpacklo_epi16(src_2, src_3); src[4] = _mm256_unpacklo_epi16(src_4, src_5); src[1] = _mm256_unpackhi_epi16(src_0, src_1); src[3] = _mm256_unpackhi_epi16(src_2, src_3); src[5] = _mm256_unpackhi_epi16(src_4, src_5); for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { int sy = sy4 + delta * (k + 4); __m256i coeffs[8]; prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); __m256i res_lo, res_hi; filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, row); store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, res_sub_const, round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); src[0] = src[2]; src[2] = src[4]; src[4] = src[6]; src[1] = src[3]; src[3] = src[5]; src[5] = src[7]; row += 1; } } static inline void warp_vertical_filter_delta0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m256i *res_add_const, const int round_bits, const __m256i *res_sub_const, const __m256i *round_bits_const, const __m256i *wt) { (void)delta; int k, row = 0; __m256i src[8], coeffs[8]; const __m256i src_0 = horz_out[0]; const __m256i src_1 = _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); const __m256i src_2 = horz_out[1]; const __m256i src_3 = _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); const __m256i src_4 = horz_out[2]; const __m256i src_5 = _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); src[0] = _mm256_unpacklo_epi16(src_0, src_1); src[2] = _mm256_unpacklo_epi16(src_2, src_3); src[4] = _mm256_unpacklo_epi16(src_4, src_5); src[1] = _mm256_unpackhi_epi16(src_0, src_1); src[3] = _mm256_unpackhi_epi16(src_2, src_3); src[5] = _mm256_unpackhi_epi16(src_4, src_5); prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { __m256i res_lo, res_hi; filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, row); store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, res_sub_const, round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); src[0] = src[2]; src[2] = src[4]; src[4] = src[6]; src[1] = src[3]; src[3] = src[5]; src[5] = src[7]; row += 1; } } static inline void warp_vertical_filter_gamma0_delta0_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m256i *res_add_const, const int round_bits, const __m256i *res_sub_const, const __m256i *round_bits_const, const __m256i *wt) { (void)gamma; int k, row = 0; __m256i src[8], coeffs[8]; const __m256i src_0 = horz_out[0]; const __m256i src_1 = _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); const __m256i src_2 = horz_out[1]; const __m256i src_3 = _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); const __m256i src_4 = horz_out[2]; const __m256i src_5 = _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); src[0] = _mm256_unpacklo_epi16(src_0, src_1); src[2] = 
_mm256_unpacklo_epi16(src_2, src_3); src[4] = _mm256_unpacklo_epi16(src_4, src_5); src[1] = _mm256_unpackhi_epi16(src_0, src_1); src[3] = _mm256_unpackhi_epi16(src_2, src_3); src[5] = _mm256_unpackhi_epi16(src_4, src_5); prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { __m256i res_lo, res_hi; filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, row); store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, res_sub_const, round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); src[0] = src[2]; src[2] = src[4]; src[4] = src[6]; src[1] = src[3]; src[3] = src[5]; src[5] = src[7]; row += 1; } } static inline void prepare_warp_vertical_filter_avx2( uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m256i *res_add_const, const int round_bits, const __m256i *res_sub_const, const __m256i *round_bits_const, const __m256i *wt) { if (gamma == 0 && delta == 0) warp_vertical_filter_gamma0_delta0_avx2( pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, round_bits_const, wt); else if (gamma == 0 && delta != 0) warp_vertical_filter_gamma0_avx2( pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, round_bits_const, wt); else if (gamma != 0 && delta == 0) warp_vertical_filter_delta0_avx2( pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, round_bits_const, wt); else warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, round_bits_const, wt); } static inline void prepare_warp_horizontal_filter_avx2( const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const __m256i *round_const, const __m128i *shift, const __m256i *shuffle_src) { if (alpha == 0 && beta == 0) warp_horizontal_filter_alpha0_beta0_avx2( ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, round_const, shift, shuffle_src); else if (alpha == 0 && beta != 0) warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, round_const, shift, shuffle_src); else if (alpha != 0 && beta == 0) warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, round_const, shift, shuffle_src); else warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, round_const, shift, shuffle_src); } void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m256i horz_out[8]; int i, j, k; const int bd = 8; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? 
conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const __m256i reduce_bits_vert_const = _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); const __m256i round_const = _mm256_set1_epi16( (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); __m256i res_sub_const, round_bits_const, wt; unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, &res_sub_const, &round_bits_const, &wt); __m256i res_add_const_1; if (conv_params->is_compound == 1) { res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); } else { res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + ((1 << reduce_bits_vert) >> 1)); } const int32_t const1 = alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); const int32_t const2 = gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); __m256i shuffle_src[4]; shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { const int32_t src_x = (p_col + j + 4) << subsampling_x; const int32_t src_y = (p_row + i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += const1; sy4 += const2; sx4 &= ~const3; sy4 &= ~const3; // Horizontal filter // If the block is aligned such that, after clamping, every sample // would be taken from the leftmost/rightmost column, then we can // skip the expensive horizontal filter. 
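// Four cases, depending on where the 16-pixel load window
// [ix4 - 7, ix4 + 8] lies relative to the frame:
//  * ix4 <= -7:        every row reduces to the leftmost reference pixel,
//                      broadcast after scaling by const5 and offsetting by
//                      const4
//  * ix4 >= width + 6: same, using the rightmost pixel (column width - 1)
//  * window straddles a frame edge: load, then replicate the edge pixels
//    into the out-of-range lanes with the warp_pad_left / warp_pad_right
//    shuffle masks before filtering
//  * otherwise:        the fully in-bounds path in
//                      prepare_warp_horizontal_filter_avx2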
if (ix4 <= -7) { int iy, row = 0; for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i temp_0 = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m256i temp_1 = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); } else if (ix4 >= width + 6) { int iy, row = 0; for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); const __m256i temp_0 = _mm256_set1_epi16( const4 + ref[iy * stride + (width - 1)] * const5); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); const __m256i temp_1 = _mm256_set1_epi16( const4 + ref[iy * stride + (width - 1)] * const5); horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; int iy, sx, row = 0; for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { iy = iy4 + k; iy = clamp(iy, 0, height - 1); __m128i src0 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); iy = iy4 + k + 1; iy = clamp(iy, 0, height - 1); __m128i src1 = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); if (out_of_boundary_left >= 0) { const __m128i shuffle_reg_left = _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); } if (out_of_boundary_right >= 0) { const __m128i shuffle_reg_right = _mm_loadu_si128( (__m128i *)warp_pad_right[out_of_boundary_right]); src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); } sx = sx4 + beta * (k + 4); const __m256i src_01 = _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src, &round_const, &shift); row += 1; } iy = iy4 + k; iy = clamp(iy, 0, height - 1); __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); if (out_of_boundary_left >= 0) { const __m128i shuffle_reg_left = _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); src = _mm_shuffle_epi8(src, shuffle_reg_left); } if (out_of_boundary_right >= 0) { const __m128i shuffle_reg_right = _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); src = _mm_shuffle_epi8(src, shuffle_reg_right); } sx = sx4 + beta * (k + 4); const __m256i src_01 = _mm256_castsi128_si256(src); __m256i coeff[4]; prepare_horizontal_filter_coeff(alpha, sx, coeff); filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, &round_const, &shift, row); } else { prepare_warp_horizontal_filter_avx2( ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, &round_const, &shift, shuffle_src); } // Vertical filter prepare_warp_vertical_filter_avx2( pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, &res_sub_const, &round_bits_const, &wt); } } } aom-3.12.1/av1/common/x86/warp_plane_sse4.c000066400000000000000000001220171477627663500202420ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for 
Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/warped_motion.h" /* This is a modified version of 'av1_warped_filter' from warped_motion.c: * Each coefficient is stored in 8 bits instead of 16 bits * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 This is done in order to avoid overflow: Since the tap with the largest coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular convolve functions. Instead, we use the summation order ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). The rearrangement of coefficients in this table is so that we can get the coefficients into the correct order more quickly. */ /* clang-format off */ DECLARE_ALIGNED(8, const int8_t, av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { // [-1, 0) { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, // [0, 1) { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 
1, 1, 127, -2, 0}, { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, // [1, 2) { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, { 0, 3, 47, -17, 0, -13, 104, 4}, { 
0, 3, 45, -17, 0, -13, 106, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, // dummy (replicate row index 191) { 0, 0, 2, -1, 0, 0, 127, 0}, }; /* clang-format on */ // Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 // in an SSE register into two sequences: // 0, 2, 2, 4, ..., 12, 12, 14, // 1, 3, 3, 5, ..., 13, 13, 15, DECLARE_ALIGNED(16, static const uint8_t, even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 0 }; DECLARE_ALIGNED(16, static const uint8_t, odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, 0 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 }; DECLARE_ALIGNED(16, static const uint8_t, shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 }; static inline void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, const int offset_bits_horiz, const int reduce_bits_horiz, int k) { const __m128i src_even = _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); const __m128i src_odd = _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); // The pixel order we need for 'src' is: // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), _mm_srli_si128(src_odd, 4)); const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 const __m128i src_13 = _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), _mm_srli_si128(src_even, 6)); const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); // Note: The values res_02 + res_46 and res_13 + res_57 both // fit into int16s at this point, but their sum may be too wide to fit // into an int16. 
However, once we also add round_const, the sum of // all of these fits into a uint16. // // The wrapping behaviour of _mm_add_* is used here to make sure we // get the correct result despite converting between different // (implicit) types. const __m128i res_even = _mm_add_epi16(res_02, res_46); const __m128i res_odd = _mm_add_epi16(res_13, res_57); const __m128i res = _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); } static inline void prepare_horizontal_filter_coeff(int alpha, int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_1 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_2 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_3 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_4 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_5 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_6 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); const __m128i tmp_7 = _mm_loadl_epi64( (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); } static inline void prepare_horizontal_filter_coeff_alpha0(int sx, __m128i *coeff) { // Filter even-index pixels const __m128i tmp_0 = _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 coeff[0] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 coeff[1] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 coeff[2] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 coeff[3] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); } static inline void 
horizontal_filter(__m128i src, __m128i *tmp, int sx, int alpha, int k, const int offset_bits_horiz, const int reduce_bits_horiz) { __m128i coeff[4]; prepare_horizontal_filter_coeff(alpha, sx, coeff); filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } static inline void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { int k; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, reduce_bits_horiz); } } static inline void warp_horizontal_filter_alpha0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)alpha; int k; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); __m128i coeff[4]; prepare_horizontal_filter_coeff_alpha0(sx, coeff); filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void warp_horizontal_filter_beta0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)beta; int k; __m128i coeff[4]; prepare_horizontal_filter_coeff(alpha, sx4, coeff); for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void warp_horizontal_filter_alpha0_beta0( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { (void)beta; (void)alpha; int k; __m128i coeff[4]; prepare_horizontal_filter_coeff_alpha0(sx4, coeff); for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; // Load source pixels const __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); } } static inline void unpack_weights_and_set_round_const( ConvolveParams *conv_params, const int round_bits, const int offset_bits, __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { *res_sub_const = _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - (1 << (offset_bits - conv_params->round_1 - 1))); *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi16((int16_t)w0); const __m128i wt1 = _mm_set1_epi16((int16_t)w1); *wt = _mm_unpacklo_epi16(wt0, wt1); } static inline void prepare_vertical_filter_coeffs(int gamma, int sy, __m128i 
*coeffs) { const __m128i tmp_0 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_2 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_4 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_6 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); // even coeffs coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); const __m128i tmp_1 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_3 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_5 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_7 = _mm_loadu_si128((__m128i *)(av1_warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); // odd coeffs coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); } static inline void prepare_vertical_filter_coeffs_gamma0(int sy, __m128i *coeffs) { const __m128i tmp_0 = _mm_loadu_si128( (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); // even coeffs coeffs[0] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); coeffs[1] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); coeffs[2] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); coeffs[3] = _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); // odd coeffs coeffs[4] = coeffs[0]; coeffs[5] = coeffs[1]; coeffs[6] = coeffs[2]; coeffs[7] = coeffs[3]; } static inline void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, __m128i *res_lo, __m128i *res_hi, int k) { // Load from tmp and rearrange pairs of consecutive rows into the // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 const __m128i *src = tmp + (k + 4); const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); const 
__m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 *res_lo = _mm_unpacklo_epi32(res_even, res_odd); *res_hi = _mm_unpackhi_epi32(res_even, res_odd); } static inline void store_vertical_filter_output( __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, const int round_bits) { __m128i res_lo_1 = *res_lo; __m128i res_hi_1 = *res_hi; if (conv_params->is_compound) { __m128i *const p = (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); __m128i res_lo_16; if (conv_params->do_average) { __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; const __m128i p_16 = _mm_loadl_epi64(p); if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); const __m128i shifted_32 = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); } else { res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); } res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), round_bits); __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo); } else { _mm_storel_epi64(p, temp_lo_16); } if (p_width > 4) { __m128i *const p4 = (__m128i *)&conv_params ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); __m128i res_hi_16; if (conv_params->do_average) { __m128i *const dst8_4 = (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; const __m128i p4_16 = _mm_loadl_epi64(p4); if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); const __m128i shifted_32 = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); } else { res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); } res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), round_bits); __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); } else { _mm_storel_epi64(p4, temp_hi_16); } } } else { const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); // Store, blending with 'pred' if needed __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; // Note: If we're outputting a 4x4 block, we need to be very careful // 
to only output 4 pixels at this point, to avoid encode/decode // mismatches when encoding with multiple threads. if (p_width == 4) { *(int *)p = _mm_cvtsi128_si32(res_8bit); } else { _mm_storel_epi64(p, res_8bit); } } } static inline void warp_vertical_filter( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, const int round_bits, const int offset_bits) { int k; __m128i res_sub_const, round_bits_const, wt; unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, &res_sub_const, &round_bits_const, &wt); // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); __m128i coeffs[8]; prepare_vertical_filter_coeffs(gamma, sy, coeffs); __m128i res_lo; __m128i res_hi; filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, &res_sub_const, &round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); } } static inline void warp_vertical_filter_gamma0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, const int round_bits, const int offset_bits) { int k; (void)gamma; __m128i res_sub_const, round_bits_const, wt; unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, &res_sub_const, &round_bits_const, &wt); // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { int sy = sy4 + delta * (k + 4); __m128i coeffs[8]; prepare_vertical_filter_coeffs_gamma0(sy, coeffs); __m128i res_lo; __m128i res_hi; filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, &res_sub_const, &round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); } } static inline void warp_vertical_filter_delta0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, const int round_bits, const int offset_bits) { (void)delta; int k; __m128i res_sub_const, round_bits_const, wt; unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, &res_sub_const, &round_bits_const, &wt); __m128i coeffs[8]; prepare_vertical_filter_coeffs(gamma, sy4, coeffs); // Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { __m128i res_lo; __m128i res_hi; filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, &res_sub_const, &round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); } } static inline void warp_vertical_filter_gamma0_delta0( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, const int round_bits, const int offset_bits) { (void)delta; (void)gamma; int k; __m128i res_sub_const, round_bits_const, wt; unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, &res_sub_const, &round_bits_const, &wt); __m128i coeffs[8]; prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); // 
Vertical filter for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { __m128i res_lo; __m128i res_hi; filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, &res_sub_const, &round_bits_const, pred, conv_params, i, j, k, reduce_bits_vert, p_stride, p_width, round_bits); } } static inline void prepare_warp_vertical_filter( uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, int i, int j, int sy4, const int reduce_bits_vert, const __m128i *res_add_const, const int round_bits, const int offset_bits) { if (gamma == 0 && delta == 0) warp_vertical_filter_gamma0_delta0( pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); else if (gamma == 0 && delta != 0) warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); else if (gamma != 0 && delta == 0) warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); else warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); } static inline void prepare_warp_horizontal_filter( const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, int32_t sx4, int alpha, int beta, int p_height, int height, int i, const int offset_bits_horiz, const int reduce_bits_horiz) { if (alpha == 0 && beta == 0) warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else if (alpha == 0 && beta != 0) warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else if (alpha != 0 && beta == 0) warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); else warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); } void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; const int bd = 8; const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; const int offset_bits_horiz = bd + FILTER_BITS - 1; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const __m128i reduce_bits_vert_const = _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); /* Note: For this code to work, the left/right frame borders need to be extended by at least 13 pixels each. 
By the time we get here, other code will have set up this border, but we allow an explicit check for debugging purposes. */ /*for (i = 0; i < height; ++i) { for (j = 0; j < 13; ++j) { assert(ref[i * stride - 13 + j] == ref[i * stride]); assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); } }*/ __m128i res_add_const_1; if (conv_params->is_compound == 1) { res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); } else { res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + ((1 << reduce_bits_vert) >> 1)); } for (i = 0; i < p_height; i += 8) { for (j = 0; j < p_width; j += 8) { const int32_t src_x = (p_col + j + 4) << subsampling_x; const int32_t src_y = (p_row + i + 4) << subsampling_y; const int64_t dst_x = (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; const int64_t dst_y = (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; const int64_t x4 = dst_x >> subsampling_x; const int64_t y4 = dst_y >> subsampling_y; int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // Add in all the constant terms, including rounding and offset sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // Horizontal filter // If the block is aligned such that, after clamping, every sample // would be taken from the leftmost/rightmost column, then we can // skip the expensive horizontal filter. 
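// The eight output columns of this tile draw on source columns
// [ix4 - 7, ix4 + 7].  When ix4 <= -7, every one of those columns clamps
// to column 0, so each tap reads ref[iy * stride]; and since every warp
// filter row sums to 1 << FILTER_BITS, the horizontal output collapses to
// ((1 << offset_bits_horiz) + ref[iy * stride] * (1 << FILTER_BITS)) >>
// reduce_bits_horiz, which is exactly the constant splatted below.  The
// ix4 >= width + 6 branch is the mirror image using the rightmost column.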
if (ix4 <= -7) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16( (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); } } else if (ix4 >= width + 6) { for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; tmp[k + 7] = _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - reduce_bits_horiz))); } } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { const int out_of_boundary_left = -(ix4 - 6); const int out_of_boundary_right = (ix4 + 8) - width; for (k = -7; k < AOMMIN(8, p_height - i); ++k) { int iy = iy4 + k; if (iy < 0) iy = 0; else if (iy > height - 1) iy = height - 1; int sx = sx4 + beta * (k + 4); // Load source pixels __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); if (out_of_boundary_left >= 0) { const __m128i shuffle_reg_left = _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); src = _mm_shuffle_epi8(src, shuffle_reg_left); } if (out_of_boundary_right >= 0) { const __m128i shuffle_reg_right = _mm_loadu_si128( (__m128i *)warp_pad_right[out_of_boundary_right]); src = _mm_shuffle_epi8(src, shuffle_reg_right); } horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, reduce_bits_horiz); } } else { prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, offset_bits_horiz, reduce_bits_horiz); } // Vertical filter prepare_warp_vertical_filter( pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); } } } aom-3.12.1/av1/common/x86/wiener_convolve_avx2.c000066400000000000000000000233441477627663500213230ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" // 128-bit xmmwords are written as [ ... ] with the MSB on the left. // 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB // on the left. // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be // loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. // Exploiting the range of wiener filter coefficients, // horizontal filtering can be done in 16 bit intermediate precision. 
// The details are as follows : // Consider the horizontal wiener filter coefficients of the following form : // [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] // Subtracting 2^(FILTER_BITS) from the centre tap we get the following : // [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] // The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 // + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit // precision. Finally, after rounding the above result by round_0, we multiply // the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the // horizontal filter output. void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params) { const int bd = 8; assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); (void)x_step_q4; (void)y_step_q4; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); int im_h = h + SUBPEL_TAPS - 2; int im_stride = 8; memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); int i, j; const int center_tap = (SUBPEL_TAPS - 1) / 2; const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; assert(conv_params->round_0 > 0); filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); // coeffs 0 1 0 1 0 1 0 1 coeffs_h[0] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); // coeffs 2 3 2 3 2 3 2 3 coeffs_h[1] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); // coeffs 4 5 4 5 4 5 4 5 coeffs_h[2] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); // coeffs 6 7 6 7 6 7 6 7 coeffs_h[3] = _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); const __m256i round_const_h = _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); const __m256i round_const_horz = _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); const __m256i clamp_low = _mm256_setzero_si256(); const __m256i clamp_high = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); // Add an offset to account for the "add_src" part of the convolve function. 
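// The incoming taps follow the "add_src" convention described above: the
// centre tap arrives with 2^FILTER_BITS already subtracted.  Inserting
// 1 << FILTER_BITS into lane 3 of the vertical filter restores the full
// centre tap, so the vertical pass folds the source-pixel contribution
// back in; the horizontal pass instead re-adds the centre pixel
// explicitly (scaled by 2^(FILTER_BITS - round_0)) inside the loop below.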
const __m128i zero_128 = _mm_setzero_si128(); const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); // coeffs 0 1 0 1 0 1 0 1 coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); // coeffs 2 3 2 3 2 3 2 3 coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); // coeffs 4 5 4 5 4 5 4 5 coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); // coeffs 6 7 6 7 6 7 6 7 coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); const __m256i round_const_v = _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - (1 << (bd + conv_params->round_1 - 1))); const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); for (j = 0; j < w; j += 8) { for (i = 0; i < im_h; i += 2) { __m256i data = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); // Load the next line if (i + 1 < im_h) data = _mm256_inserti128_si256( data, _mm_loadu_si128( (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), 1); __m256i res = convolve_lowbd_x(data, coeffs_h, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to // the result data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); res = _mm256_add_epi16(res, data_0); res = _mm256_add_epi16(res, round_const_horz); const __m256i res_clamped = _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); } /* Vertical filter */ { __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); __m256i s[8]; s[0] = _mm256_unpacklo_epi16(src_0, src_1); s[1] = _mm256_unpacklo_epi16(src_2, src_3); s[2] = _mm256_unpacklo_epi16(src_4, src_5); s[4] = _mm256_unpackhi_epi16(src_0, src_1); s[5] = _mm256_unpackhi_epi16(src_2, src_3); s[6] = _mm256_unpackhi_epi16(src_4, src_5); for (i = 0; i < h - 1; i += 2) { const int16_t *data = &im_block[i * im_stride]; const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); s[3] = _mm256_unpacklo_epi16(s6, s7); s[7] = _mm256_unpackhi_epi16(s6, s7); __m256i res_a = convolve(s, coeffs_v); __m256i res_b = convolve(s + 4, coeffs_v); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_v), round_shift_v); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_v), round_shift_v); /* rounding code */ // 16 bit conversion const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); // 8 bit conversion and saturation to uint8 const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); const __m128i res_0 = _mm256_castsi256_si128(res_8b); const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); // Store values into the destination buffer __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + 
dst_stride]; _mm_storel_epi64(p_0, res_0); _mm_storel_epi64(p_1, res_1); s[0] = s[1]; s[1] = s[2]; s[2] = s[3]; s[4] = s[5]; s[5] = s[6]; s[6] = s[7]; } if (h - i) { s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); const int16_t *data = &im_block[i * im_stride]; const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); __m256i convolveres = convolve(s, coeffs_v); const __m256i res_round = _mm256_sra_epi32( _mm256_add_epi32(convolveres, round_const_v), round_shift_v); /* rounding code */ // 16 bit conversion __m128i reslo = _mm256_castsi256_si128(res_round); __m128i reshi = _mm256_extracti128_si256(res_round, 1); const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); // 8 bit conversion and saturation to uint8 const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; _mm_storel_epi64(p_0, res_8b); } } } } aom-3.12.1/av1/common/x86/wiener_convolve_sse2.c000066400000000000000000000212201477627663500213060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const WienerConvolveParams *conv_params) { const int bd = 8; assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); (void)x_step_q4; (void)y_step_q4; DECLARE_ALIGNED(16, uint16_t, temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); int intermediate_height = h + SUBPEL_TAPS - 2; memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero = _mm_setzero_si128(); // Add an offset to account for the "add_src" part of the convolve function. 
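// As in the AVX2 version, the centre tap of each incoming filter arrives
// with 2^FILTER_BITS subtracted (the "add_src" convention).  Here the
// offset is added back to both filter_x and filter_y below, so both
// passes run as ordinary convolutions with the full centre tap.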
const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); /* Horizontal filter */ { const __m128i coeffs_x = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32( (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); // Filter even-index pixels const __m128i src_0 = _mm_unpacklo_epi8(data, zero); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), conv_params->round_0); // Filter odd-index pixels const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), conv_params->round_0); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); res = _mm_min_epi16( _mm_max_epi16(res, zero), _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); } } } /* Vertical filter */ { const __m128i coeffs_y = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - (1 << (bd + conv_params->round_1 - 1))); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; const __m128i src_0 = 
_mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), conv_params->round_1); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), conv_params->round_1); const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storel_epi64(p, res_8bit); } } } } aom-3.12.1/av1/decoder/000077500000000000000000000000001477627663500144755ustar00rootroot00000000000000aom-3.12.1/av1/decoder/accounting.c000066400000000000000000000114661477627663500170030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom/aom_integer.h" #include "av1/decoder/accounting.h" static int accounting_hash(const char *str) { uint32_t val; const unsigned char *ustr; val = 0; ustr = (const unsigned char *)str; /* This is about the worst hash one can design, but it should be good enough here. */ while (*ustr) val += *ustr++; return val % AOM_ACCOUNTING_HASH_SIZE; } /* Dictionary lookup based on an open-addressing hash table. 
*/ int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) { int hash; size_t len; AccountingDictionary *dictionary; dictionary = &accounting->syms.dictionary; hash = accounting_hash(str); while (accounting->hash_dictionary[hash] != -1) { if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) { return accounting->hash_dictionary[hash]; } hash++; if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0; } /* No match found. */ assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES); accounting->hash_dictionary[hash] = dictionary->num_strs; len = strlen(str); dictionary->strs[dictionary->num_strs] = malloc(len + 1); if (!dictionary->strs[dictionary->num_strs]) abort(); snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str); dictionary->num_strs++; return dictionary->num_strs - 1; } void aom_accounting_init(Accounting *accounting) { int i; accounting->num_syms_allocated = 1000; accounting->syms.syms = malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated); if (!accounting->syms.syms) abort(); accounting->syms.dictionary.num_strs = 0; assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES); for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++) accounting->hash_dictionary[i] = -1; aom_accounting_reset(accounting); } void aom_accounting_reset(Accounting *accounting) { accounting->syms.num_syms = 0; accounting->syms.num_binary_syms = 0; accounting->syms.num_multi_syms = 0; accounting->context.x = -1; accounting->context.y = -1; accounting->last_tell_frac = 0; } void aom_accounting_clear(Accounting *accounting) { int i; AccountingDictionary *dictionary; free(accounting->syms.syms); dictionary = &accounting->syms.dictionary; for (i = 0; i < dictionary->num_strs; i++) { free(dictionary->strs[i]); } } void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) { accounting->context.x = x; accounting->context.y = y; } void aom_accounting_record(Accounting *accounting, const char *str, uint32_t bits) { AccountingSymbol sym; // Reuse previous symbol if it has the same context and symbol id. 
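// For example, three consecutive calls with the same (x, y) context and
// the same syntax label yield a single AccountingSymbol whose bits field
// holds the combined cost (in 1/8-bit units) and whose samples field is 3,
// rather than three separate entries.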
if (accounting->syms.num_syms) { AccountingSymbol *last_sym; last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1]; if (memcmp(&last_sym->context, &accounting->context, sizeof(AccountingSymbolContext)) == 0) { uint32_t id; id = aom_accounting_dictionary_lookup(accounting, str); if (id == last_sym->id) { last_sym->bits += bits; last_sym->samples++; return; } } } sym.context = accounting->context; sym.samples = 1; sym.bits = bits; sym.id = aom_accounting_dictionary_lookup(accounting, str); assert(sym.id <= 255); if (accounting->syms.num_syms == accounting->num_syms_allocated) { accounting->num_syms_allocated *= 2; accounting->syms.syms = realloc(accounting->syms.syms, sizeof(AccountingSymbol) * accounting->num_syms_allocated); if (!accounting->syms.syms) abort(); } accounting->syms.syms[accounting->syms.num_syms++] = sym; } void aom_accounting_dump(Accounting *accounting) { int i; AccountingSymbol *sym; printf("\n----- Number of recorded syntax elements = %d -----\n", accounting->syms.num_syms); printf("----- Total number of symbol calls = %d (%d binary) -----\n", accounting->syms.num_multi_syms + accounting->syms.num_binary_syms, accounting->syms.num_binary_syms); for (i = 0; i < accounting->syms.num_syms; i++) { sym = &accounting->syms.syms[i]; printf("%s x: %d, y: %d bits: %f samples: %d\n", accounting->syms.dictionary.strs[sym->id], sym->context.x, sym->context.y, (float)sym->bits / 8.0, sym->samples); } } aom-3.12.1/av1/decoder/accounting.h000066400000000000000000000050421477627663500170010ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_ACCOUNTING_H_ #define AOM_AV1_DECODER_ACCOUNTING_H_ #include #include "aom/aomdx.h" #ifdef __cplusplus extern "C" { #endif // __cplusplus #define AOM_ACCOUNTING_HASH_SIZE (1021) /* Max number of entries for symbol types in the dictionary (increase as necessary). */ #define MAX_SYMBOL_TYPES (256) /*The resolution of fractional-precision bit usage measurements, i.e., 3 => 1/8th bits.*/ #define AOM_ACCT_BITRES (3) typedef struct { int16_t x; int16_t y; } AccountingSymbolContext; typedef struct { AccountingSymbolContext context; uint32_t id; /** Number of bits in units of 1/8 bit. */ uint32_t bits; uint32_t samples; } AccountingSymbol; /** Dictionary for translating strings into id. */ typedef struct { char *strs[MAX_SYMBOL_TYPES]; int num_strs; } AccountingDictionary; typedef struct { /** All recorded symbols decoded. */ AccountingSymbol *syms; /** Number of syntax actually recorded. */ int num_syms; /** Raw symbol decoding calls for non-binary values. */ int num_multi_syms; /** Raw binary symbol decoding calls. */ int num_binary_syms; /** Dictionary for translating strings into id. */ AccountingDictionary dictionary; } AccountingSymbols; struct Accounting { AccountingSymbols syms; /** Size allocated for symbols (not all may be used). 
*/ int num_syms_allocated; int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE]; AccountingSymbolContext context; uint32_t last_tell_frac; }; void aom_accounting_init(Accounting *accounting); void aom_accounting_reset(Accounting *accounting); void aom_accounting_clear(Accounting *accounting); void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y); int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str); void aom_accounting_record(Accounting *accounting, const char *str, uint32_t bits); void aom_accounting_dump(Accounting *accounting); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // AOM_AV1_DECODER_ACCOUNTING_H_ aom-3.12.1/av1/decoder/decodeframe.c000066400000000000000000006412151477627663500171100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom/aom_image.h" #include "aom/internal/aom_codec_internal.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_reader.h" #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/cdef.h" #include "av1/common/cfl.h" #include "av1/common/common_data.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/enums.h" #include "av1/common/frame_buffers.h" #include "av1/common/idct.h" #include "av1/common/mv.h" #include "av1/common/mvref_common.h" #include "av1/common/obmc.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/scale.h" #include "av1/common/seg_common.h" #include "av1/common/thread_common.h" #include "av1/common/tile_common.h" #include "av1/common/warped_motion.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodetxb.h" #include "av1/decoder/detokenize.h" #if CONFIG_INSPECTION #include "av1/decoder/inspection.h" #endif #define ACCT_STR __func__ #define AOM_MIN_THREADS_PER_TILE 1 #define AOM_MAX_THREADS_PER_TILE 2 // This is needed by ext_tile related unit tests. 
#define EXT_TILE_DEBUG 1 #define MC_TEMP_BUF_PELS \ (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \ ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2)) // Checks that the remaining bits start with a 1 and ends with 0s. // It consumes an additional byte, if already byte aligned before the check. int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { // bit_offset is set to 0 (mod 8) when the reader is already byte aligned int bits_before_alignment = 8 - rb->bit_offset % 8; int trailing = aom_rb_read_literal(rb, bits_before_alignment); if (trailing != (1 << (bits_before_alignment - 1))) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } return 0; } // Use only_chroma = 1 to only set the chroma planes static inline void set_planes_to_neutral_grey( const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, int only_chroma) { if (seq_params->use_highbitdepth) { const int val = 1 << (seq_params->bit_depth - 1); for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { const int is_uv = plane > 0; uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); // Set the first row to neutral grey. Then copy the first row to all // subsequent rows. if (buf->crop_heights[is_uv] > 0) { aom_memset16(base, val, buf->crop_widths[is_uv]); for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) { memcpy(&base[row_idx * buf->strides[is_uv]], base, sizeof(*base) * buf->crop_widths[is_uv]); } } } } else { for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { const int is_uv = plane > 0; for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7, buf->crop_widths[is_uv]); } } } } static inline void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, int runit_idx); static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); } static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb, int coded_lossless) { if (coded_lossless) return ONLY_4X4; return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST; } static REFERENCE_MODE read_frame_reference_mode( const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { if (frame_is_intra_only(cm)) { return SINGLE_REFERENCE; } else { return aom_rb_read_bit(rb) ? 
REFERENCE_MODE_SELECT : SINGLE_REFERENCE; } } static inline void inverse_transform_block(DecoderCodingBlock *dcb, int plane, const TX_TYPE tx_type, const TX_SIZE tx_size, uint8_t *dst, int stride, int reduced_tx_set) { tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane]; eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; uint16_t scan_line = eob_data->max_scan_line; uint16_t eob = eob_data->eob; av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst, stride, eob, reduced_tx_set); memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); } static inline void read_coeffs_tx_intra_block( const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { MB_MODE_INFO *mbmi = dcb->xd.mi[0]; if (!mbmi->skip_txfm) { #if TXCOEFF_TIMER struct aom_usec_timer timer; aom_usec_timer_start(&timer); #endif av1_read_coeffs_txb(cm, dcb, r, plane, row, col, tx_size); #if TXCOEFF_TIMER aom_usec_timer_mark(&timer); const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); cm->txcoeff_timer += elapsed_time; ++cm->txb_count; #endif } } static inline void decode_block_void(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { (void)cm; (void)dcb; (void)r; (void)plane; (void)row; (void)col; (void)tx_size; } static inline void predict_inter_block_void(AV1_COMMON *const cm, DecoderCodingBlock *dcb, BLOCK_SIZE bsize) { (void)cm; (void)dcb; (void)bsize; } static inline void cfl_store_inter_block_void(AV1_COMMON *const cm, MACROBLOCKD *const xd) { (void)cm; (void)xd; } static inline void predict_and_reconstruct_intra_block( const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { (void)r; MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *mbmi = xd->mi[0]; PLANE_TYPE plane_type = get_plane_type(plane); av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); if (!mbmi->skip_txfm) { eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; if (eob_data->eob) { const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; // tx_type was read out in av1_read_coeffs_txb. const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, reduced_tx_set_used); struct macroblockd_plane *const pd = &xd->plane[plane]; uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride, reduced_tx_set_used); } } if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) { cfl_store_tx(xd, row, col, tx_size, mbmi->bsize); } } static inline void inverse_transform_inter_block( const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int blk_row, const int blk_col, const TX_SIZE tx_size) { (void)r; MACROBLOCKD *const xd = &dcb->xd; PLANE_TYPE plane_type = get_plane_type(plane); const struct macroblockd_plane *const pd = &xd->plane[plane]; const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; // tx_type was read out in av1_read_coeffs_txb. 
const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, reduced_tx_set_used); uint8_t *dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride, reduced_tx_set_used); #if CONFIG_MISMATCH_DEBUG int pixel_c, pixel_r; BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; int blk_w = block_size_wide[bsize]; int blk_h = block_size_high[bsize]; const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, pd->subsampling_x, pd->subsampling_y); mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, plane, pixel_c, pixel_r, blk_w, blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); #endif } static inline void set_cb_buffer_offsets(DecoderCodingBlock *dcb, TX_SIZE tx_size, int plane) { dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size]; dcb->txb_offset[plane] = dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); } static inline void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td, aom_reader *r, MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, TX_SIZE tx_size, int *eob_total) { DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE plane_tx_size = plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, pd->subsampling_y) : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, blk_col)]; // Scale to match transform block unit. const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (tx_size == plane_tx_size || plane) { td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, tx_size); td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, tx_size); eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; *eob_total += eob_data->eob; set_cb_buffer_offsets(dcb, tx_size, plane); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int sub_step = bsw * bsh; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); for (int row = 0; row < row_end; row += bsh) { const int offsetr = blk_row + row; for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, offsetc, block, sub_txs, eob_total); block += sub_step; } } } } static inline void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis) { const int num_planes = av1_num_planes(cm); const CommonModeInfoParams *const mi_params = &cm->mi_params; const TileInfo *const tile = &xd->tile; set_mi_offsets(mi_params, xd, mi_row, mi_col); xd->mi[0]->bsize = bsize; #if CONFIG_RD_DEBUG xd->mi[0]->mi_row = mi_row; xd->mi[0]->mi_col = mi_col; #endif assert(x_mis && y_mis); for (int x = 1; x < x_mis; ++x) xd->mi[x] 
= xd->mi[0]; int idx = mi_params->mi_stride; for (int y = 1; y < y_mis; ++y) { memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0])); idx += mi_params->mi_stride; } set_plane_n4(xd, bw, bh, num_planes); set_entropy_context(xd, mi_row, mi_col, num_planes); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, mi_params->mi_cols); av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); } static inline void decode_mbmi_block(AV1Decoder *const pbi, DecoderCodingBlock *dcb, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; const SequenceHeader *const seq_params = cm->seq_params; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row); MACROBLOCKD *const xd = &dcb->xd; #if CONFIG_ACCOUNTING aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); #endif set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); xd->mi[0]->partition = partition; av1_read_mode_info(pbi, dcb, r, x_mis, y_mis); if (bsize >= BLOCK_8X8 && (seq_params->subsampling_x || seq_params->subsampling_y)) { const BLOCK_SIZE uv_subsize = av1_ss_size_lookup[bsize][seq_params->subsampling_x] [seq_params->subsampling_y]; if (uv_subsize == BLOCK_INVALID) aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid block size."); } } typedef struct PadBlock { int x0; int x1; int y0; int y1; } PadBlock; #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_build_mc_border(const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int x, int y, int b_w, int b_h, int w, int h) { // Get a pointer to the start of the real data for this row. const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); const uint16_t *ref_row = src - x - y * src_stride; if (y >= h) ref_row += (h - 1) * src_stride; else if (y > 0) ref_row += y * src_stride; do { int right = 0, copy; int left = x < 0 ? -x : 0; if (left > b_w) left = b_w; if (x + b_w > w) right = x + b_w - w; if (right > b_w) right = b_w; copy = b_w - left - right; if (left) aom_memset16(dst, ref_row[0], left); if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right); dst += dst_stride; ++y; if (y > 0 && y < h) ref_row += src_stride; } while (--b_h); } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int x, int y, int b_w, int b_h, int w, int h) { // Get a pointer to the start of the real data for this row. const uint8_t *ref_row = src - x - y * src_stride; if (y >= h) ref_row += (h - 1) * src_stride; else if (y > 0) ref_row += y * src_stride; do { int right = 0, copy; int left = x < 0 ? 
-x : 0; if (left > b_w) left = b_w; if (x + b_w > w) right = x + b_w - w; if (right > b_w) right = b_w; copy = b_w - left - right; if (left) memset(dst, ref_row[0], left); if (copy) memcpy(dst + left, ref_row + x + left, copy); if (right) memset(dst + left + copy, ref_row[w - 1], right); dst += dst_stride; ++y; if (y > 0 && y < h) ref_row += src_stride; } while (--b_h); } static inline int update_extend_mc_border_params( const struct scale_factors *const sf, struct buf_2d *const pre_buf, MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, int do_warp, int is_intrabc, int *x_pad, int *y_pad) { const int is_scaled = av1_is_scaled(sf); // Get reference width and height. int frame_width = pre_buf->width; int frame_height = pre_buf->height; // Do border extension if there is motion or // width/height is not a multiple of 8 pixels. if ((!is_intrabc) && (!do_warp) && (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7))) { if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) { block->x0 -= AOM_INTERP_EXTEND - 1; block->x1 += AOM_INTERP_EXTEND; *x_pad = 1; } if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) { block->y0 -= AOM_INTERP_EXTEND - 1; block->y1 += AOM_INTERP_EXTEND; *y_pad = 1; } // Skip border extension if block is inside the frame. if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 || block->y1 > frame_height - 1) { return 1; } } return 0; } static inline void extend_mc_border(const struct scale_factors *const sf, struct buf_2d *const pre_buf, MV32 scaled_mv, PadBlock block, int subpel_x_mv, int subpel_y_mv, int do_warp, int is_intrabc, int highbd, uint8_t *mc_buf, uint8_t **pre, int *src_stride) { int x_pad = 0, y_pad = 0; if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block, subpel_x_mv, subpel_y_mv, do_warp, is_intrabc, &x_pad, &y_pad)) { // Get reference block pointer. const uint8_t *const buf_ptr = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; int buf_stride = pre_buf->stride; const int b_w = block.x1 - block.x0; const int b_h = block.y1 - block.y0; #if CONFIG_AV1_HIGHBITDEPTH // Extend the border. 
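    // build_mc_border()/highbd_build_mc_border() copy the part of the
    // reference block that lies inside the frame into mc_buf and replicate
    // the frame's edge pixels into the part that lies outside it, so the
    // interpolation below can read a full b_w x b_h block without going
    // out of bounds.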
if (highbd) { highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, b_h, pre_buf->width, pre_buf->height); } else { build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, b_h, pre_buf->width, pre_buf->height); } #else (void)highbd; build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, b_h, pre_buf->width, pre_buf->height); #endif *src_stride = b_w; *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w + x_pad * (AOM_INTERP_EXTEND - 1); } } static inline void dec_calc_subpel_params( const MV *const src_mv, InterPredParams *const inter_pred_params, const MACROBLOCKD *const xd, int mi_x, int mi_y, uint8_t **pre, SubpelParams *subpel_params, int *src_stride, PadBlock *block, MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) { const struct scale_factors *sf = inter_pred_params->scale_factors; struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; const int bw = inter_pred_params->block_width; const int bh = inter_pred_params->block_height; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int ssx = inter_pred_params->subsampling_x; int ssy = inter_pred_params->subsampling_y; int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; orig_pos_y += src_mv->row * (1 << (1 - ssy)); int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; orig_pos_x += src_mv->col * (1 << (1 - ssx)); int pos_y = av1_scaled_y(orig_pos_y, sf); int pos_x = av1_scaled_x(orig_pos_x, sf); pos_x += SCALE_EXTRA_OFF; pos_y += SCALE_EXTRA_OFF; const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; pos_y = clamp(pos_y, top, bottom); pos_x = clamp(pos_x, left, right); subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; subpel_params->xs = sf->x_step_q4; subpel_params->ys = sf->y_step_q4; // Get reference block top left coordinate. block->x0 = pos_x >> SCALE_SUBPEL_BITS; block->y0 = pos_y >> SCALE_SUBPEL_BITS; // Get reference block bottom right coordinate. block->x1 = ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1; block->y1 = ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1; MV temp_mv; temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh, inter_pred_params->subsampling_x, inter_pred_params->subsampling_y); *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf); scaled_mv->row += SCALE_EXTRA_OFF; scaled_mv->col += SCALE_EXTRA_OFF; *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK; *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK; } else { // Get block position in current frame. int pos_x = inter_pred_params->pix_col << SUBPEL_BITS; int pos_y = inter_pred_params->pix_row << SUBPEL_BITS; const MV mv_q4 = clamp_mv_to_umv_border_sb( xd, src_mv, bw, bh, inter_pred_params->subsampling_x, inter_pred_params->subsampling_y); subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; // Get reference block top left coordinate. pos_x += mv_q4.col; pos_y += mv_q4.row; block->x0 = pos_x >> SUBPEL_BITS; block->y0 = pos_y >> SUBPEL_BITS; // Get reference block bottom right coordinate. 
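    // x1/y1 are exclusive bounds (one past the last whole-pel sample of the
    // block); update_extend_mc_border_params() later widens them by
    // AOM_INTERP_EXTEND when subpel or scaled filtering needs extra samples.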
block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1; block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1; scaled_mv->row = mv_q4.row; scaled_mv->col = mv_q4.col; *subpel_x_mv = scaled_mv->col & SUBPEL_MASK; *subpel_y_mv = scaled_mv->row & SUBPEL_MASK; } *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0; *src_stride = pre_buf->stride; } static inline void dec_calc_subpel_params_and_extend( const MV *const src_mv, InterPredParams *const inter_pred_params, MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf, uint8_t **pre, SubpelParams *subpel_params, int *src_stride) { PadBlock block; MV32 scaled_mv; int subpel_x_mv, subpel_y_mv; dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre, subpel_params, src_stride, &block, &scaled_mv, &subpel_x_mv, &subpel_y_mv); extend_mc_border( inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv, inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc, inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride); } #define IS_DEC 1 #include "av1/common/reconinter_template.inc" #undef IS_DEC static void dec_build_inter_predictors(const AV1_COMMON *cm, DecoderCodingBlock *dcb, int plane, const MB_MODE_INFO *mi, int build_for_obmc, int bw, int bh, int mi_x, int mi_y) { build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x, mi_y, dcb->mc_buf); } static inline void dec_build_inter_predictor(const AV1_COMMON *cm, DecoderCodingBlock *dcb, int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &dcb->xd; const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0, xd->plane[plane].width, xd->plane[plane].height, mi_x, mi_y); if (is_interintra_pred(xd->mi[0])) { BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, { xd->plane[0].dst.stride, xd->plane[1].dst.stride, xd->plane[2].dst.stride } }; av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, &ctx, plane, bsize); } } } static inline void dec_build_prediction_by_above_pred( MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; const int above_mi_col = xd->mi_col + rel_mi_col; int mi_x, mi_y; MB_MODE_INFO backup_mbmi = *above_mbmi; (void)rel_mi_row; (void)dir; av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size, &backup_mbmi, ctxt, num_planes); mi_x = above_mi_col << MI_SIZE_LOG2; mi_y = xd->mi_row << MI_SIZE_LOG2; const BLOCK_SIZE bsize = xd->mi[0]->bsize; for (int j = 0; j < num_planes; ++j) { const struct macroblockd_plane *pd = &xd->plane[j]; int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j, &backup_mbmi, 1, bw, bh, mi_x, mi_y); } } static inline void dec_build_prediction_by_above_preds( const AV1_COMMON *cm, DecoderCodingBlock *dcb, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) 
{ MACROBLOCKD *const xd = &dcb->xd; if (!xd->up_available) return; // Adjust mb_to_bottom_edge to have the correct value for the OBMC // prediction block. This is half the height of the original block, // except for 128-wide blocks, where we only use a height of 32. const int this_height = xd->height * MI_SIZE; const int pred_height = AOMMIN(this_height / 2, 32); xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height); struct build_prediction_ctxt ctxt = { cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb }; const BLOCK_SIZE bsize = xd->mi[0]->bsize; foreach_overlappable_nb_above(cm, xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], dec_build_prediction_by_above_pred, &ctxt); xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE); xd->mb_to_right_edge = ctxt.mb_to_far_edge; xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height); } static inline void dec_build_prediction_by_left_pred( MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; const int left_mi_row = xd->mi_row + rel_mi_row; int mi_x, mi_y; MB_MODE_INFO backup_mbmi = *left_mbmi; (void)rel_mi_col; (void)dir; av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size, &backup_mbmi, ctxt, num_planes); mi_x = xd->mi_col << MI_SIZE_LOG2; mi_y = left_mi_row << MI_SIZE_LOG2; const BLOCK_SIZE bsize = xd->mi[0]->bsize; for (int j = 0; j < num_planes; ++j) { const struct macroblockd_plane *pd = &xd->plane[j]; int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j, &backup_mbmi, 1, bw, bh, mi_x, mi_y); } } static inline void dec_build_prediction_by_left_preds( const AV1_COMMON *cm, DecoderCodingBlock *dcb, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { MACROBLOCKD *const xd = &dcb->xd; if (!xd->left_available) return; // Adjust mb_to_right_edge to have the correct value for the OBMC // prediction block. This is half the width of the original block, // except for 128-wide blocks, where we only use a width of 32. 
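  // The mb_to_*_edge offsets changed here are restored after
  // foreach_overlappable_nb_left() below has visited the left neighbors.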
const int this_width = xd->width * MI_SIZE; const int pred_width = AOMMIN(this_width / 2, 32); xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width); struct build_prediction_ctxt ctxt = { cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb }; const BLOCK_SIZE bsize = xd->mi[0]->bsize; foreach_overlappable_nb_left(cm, xd, max_neighbor_obmc[mi_size_high_log2[bsize]], dec_build_prediction_by_left_pred, &ctxt); xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE); xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width); xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; } static inline void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, DecoderCodingBlock *dcb) { const int num_planes = av1_num_planes(cm); uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; MACROBLOCKD *const xd = &dcb->xd; av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2); dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1, dst_height1, dst_stride1); dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2, dst_stride2); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, dst_stride2); } static inline void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) { MB_MODE_INFO *mbmi = xd->mi[0]; if (store_cfl_required(cm, xd)) { cfl_store_block(xd, mbmi->bsize, mbmi->tx_size); } } static inline void predict_inter_block(AV1_COMMON *const cm, DecoderCodingBlock *dcb, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *mbmi = xd->mi[0]; const int num_planes = av1_num_planes(cm); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; if (frame < LAST_FRAME) { assert(is_intrabc_block(mbmi)); assert(frame == INTRA_FRAME); assert(ref == 0); } else { const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame); const struct scale_factors *ref_scale_factors = get_ref_scale_factors_const(cm, frame); xd->block_ref_scale_factors[ref] = ref_scale_factors; av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col, ref_scale_factors, num_planes); } } dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize); if (mbmi->motion_mode == OBMC_CAUSAL) { dec_build_obmc_inter_predictors_sb(cm, dcb); } #if CONFIG_MISMATCH_DEBUG for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; int pixel_c, pixel_r; mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, pd->subsampling_y); if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, pd->subsampling_y)) continue; mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->current_frame.order_hint, plane, pixel_c, pixel_r, pd->width, pd->height, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); } #endif } static inline void set_color_index_map_offset(MACROBLOCKD *const xd, 
int plane, aom_reader *r) {
  (void)r;
  Av1ColorMapParam params;
  const MB_MODE_INFO *const mbmi = xd->mi[0];
  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
                           &params.plane_height, NULL, NULL);
  xd->color_index_map_offset[plane] +=
      params.plane_width * params.plane_height;
}

static inline void decode_token_recon_block(AV1Decoder *const pbi,
                                            ThreadData *const td,
                                            aom_reader *r, BLOCK_SIZE bsize) {
  AV1_COMMON *const cm = &pbi->common;
  DecoderCodingBlock *const dcb = &td->dcb;
  MACROBLOCKD *const xd = &dcb->xd;
  const int num_planes = av1_num_planes(cm);
  MB_MODE_INFO *mbmi = xd->mi[0];

  if (!is_inter_block(mbmi)) {
    int row, col;
    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
                                         xd->plane[0].subsampling_y));
    const int max_blocks_wide = max_block_wide(xd, bsize, 0);
    const int max_blocks_high = max_block_high(xd, bsize, 0);
    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
    int mu_blocks_high = mi_size_high[max_unit_bsize];
    mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
    mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);

    for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
      for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
        for (int plane = 0; plane < num_planes; ++plane) {
          if (plane && !xd->is_chroma_ref) break;
          const struct macroblockd_plane *const pd = &xd->plane[plane];
          const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
          const int stepr = tx_size_high_unit[tx_size];
          const int stepc = tx_size_wide_unit[tx_size];
          const int unit_height = ROUND_POWER_OF_TWO(
              AOMMIN(mu_blocks_high + row, max_blocks_high),
              pd->subsampling_y);
          const int unit_width = ROUND_POWER_OF_TWO(
              AOMMIN(mu_blocks_wide + col, max_blocks_wide),
              pd->subsampling_x);
          for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
               blk_row += stepr) {
            for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
                 blk_col += stepc) {
              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
                                                   blk_col, tx_size);
              td->predict_and_recon_intra_block_visit(
                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
              set_cb_buffer_offsets(dcb, tx_size, plane);
            }
          }
        }
      }
    }
  } else {
    td->predict_inter_block_visit(cm, dcb, bsize);
    // Reconstruction
    if (!mbmi->skip_txfm) {
      int eobtotal = 0;
      const int max_blocks_wide = max_block_wide(xd, bsize, 0);
      const int max_blocks_high = max_block_high(xd, bsize, 0);
      int row, col;
      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
      assert(max_unit_bsize ==
             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
                                  xd->plane[0].subsampling_y));
      int mu_blocks_wide = mi_size_wide[max_unit_bsize];
      int mu_blocks_high = mi_size_high[max_unit_bsize];
      mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
      mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);

      for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
        for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
          for (int plane = 0; plane < num_planes; ++plane) {
            if (plane && !xd->is_chroma_ref) break;
            const struct macroblockd_plane *const pd = &xd->plane[plane];
            const int ss_x = pd->subsampling_x;
            const int ss_y = pd->subsampling_y;
            const BLOCK_SIZE plane_bsize =
                get_plane_block_size(bsize, ss_x, ss_y);
            const TX_SIZE max_tx_size =
                get_vartx_max_txsize(xd, plane_bsize, plane);
            const int bh_var_tx = tx_size_high_unit[max_tx_size];
            const int bw_var_tx = tx_size_wide_unit[max_tx_size];
            int block = 0;
            int step = tx_size_wide_unit[max_tx_size] *
                       tx_size_high_unit[max_tx_size];
            int blk_row, blk_col;
            const int unit_height = ROUND_POWER_OF_TWO(
AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y); const int unit_width = ROUND_POWER_OF_TWO( AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x); for (blk_row = row >> ss_y; blk_row < unit_height; blk_row += bh_var_tx) { for (blk_col = col >> ss_x; blk_col < unit_width; blk_col += bw_var_tx) { decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, blk_row, blk_col, block, max_tx_size, &eobtotal); block += step; } } } } } } td->cfl_store_inter_block_visit(cm, xd); } av1_visit_palette(pbi, xd, r, set_color_index_map_offset); } static inline void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2, int tx_w_log2, int tx_h_log2, int min_txs, int split_size, int txs, int blk_row, int blk_col) { for (int idy = 0; idy < tx_size_high_unit[split_size]; idy += tx_size_high_unit[min_txs]) { for (int idx = 0; idx < tx_size_wide_unit[split_size]; idx += tx_size_wide_unit[min_txs]) { const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) + ((blk_col + idx) >> tx_w_log2); mbmi->inter_tx_size[index] = txs; } } } static inline void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, TX_SIZE tx_size, int depth, int blk_row, int blk_col, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; int is_split = 0; const BLOCK_SIZE bsize = mbmi->bsize; const int max_blocks_high = max_block_high(xd, bsize, 0); const int max_blocks_wide = max_block_wide(xd, bsize, 0); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; assert(tx_size > TX_4X4); TX_SIZE txs = max_txsize_rect_lookup[bsize]; for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) txs = sub_tx_size_map[txs]; const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; const int bw_log2 = mi_size_wide_log2[bsize]; const int stride_log2 = bw_log2 - tx_w_log2; if (depth == MAX_VARTX_DEPTH) { set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, tx_size, blk_row, blk_col); mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); return; } const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, mbmi->bsize, tx_size); is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); if (is_split) { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; if (sub_txs == TX_4X4) { set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, sub_txs, blk_row, blk_col); mbmi->tx_size = sub_txs; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, sub_txs, tx_size); return; } assert(bsw > 0 && bsh > 0); for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { int offsetr = blk_row + row; int offsetc = blk_col + col; read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r); } } } else { set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, tx_size, blk_row, blk_col); mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } } static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd, aom_reader *r) { // TODO(debargha): Clean up the logic here. This function should only // be called for intra. 
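  // The coded symbol is a split depth in [0, max_depths]: depth 0 selects
  // the largest transform size available for this block size, and each
  // additional depth steps down one level via sub_tx_size_map
  // (see depth_to_tx_size()).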
const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); const int max_depths = bsize_to_max_depth(bsize); const int ctx = get_tx_size_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], max_depths + 1, ACCT_STR); assert(depth >= 0 && depth <= max_depths); const TX_SIZE tx_size = depth_to_tx_size(depth, bsize); return tx_size; } static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode, int is_inter, int allow_select_inter, aom_reader *r) { const BLOCK_SIZE bsize = xd->mi[0]->bsize; if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; if (block_signals_txsize(bsize)) { if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r); return coded_tx_size; } else { return tx_size_from_tx_mode(bsize, tx_mode); } } else { assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); return max_txsize_rect_lookup[bsize]; } } static inline void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize) { DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize); av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens); AV1_COMMON *cm = &pbi->common; const int num_planes = av1_num_planes(cm); MB_MODE_INFO *mbmi = xd->mi[0]; int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) { const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; const int width = mi_size_wide[bsize]; const int height = mi_size_high[bsize]; for (int idy = 0; idy < height; idy += bh) for (int idx = 0; idx < width; idx += bw) read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r); } else { mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip_txfm, r); if (inter_block_tx) memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, mbmi->skip_txfm && is_inter_block(mbmi), xd); } if (cm->delta_q_info.delta_q_present_flag) { for (int i = 0; i < MAX_SEGMENTS; i++) { const int current_qindex = av1_get_qindex(&cm->seg, i, xd->current_base_qindex); const CommonQuantParams *const quant_params = &cm->quant_params; for (int j = 0; j < num_planes; ++j) { const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q : (j == 1 ? quant_params->u_dc_delta_q : quant_params->v_dc_delta_q); const int ac_delta_q = j == 0 ? 0 : (j == 1 ? 
quant_params->u_ac_delta_q : quant_params->v_ac_delta_q); xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( current_qindex, dc_delta_q, cm->seq_params->bit_depth); xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( current_qindex, ac_delta_q, cm->seq_params->bit_depth); } } } if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes); decode_token_recon_block(pbi, td, r, bsize); } static inline void set_offsets_for_pred_and_recon(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int num_planes = av1_num_planes(cm); const int offset = mi_row * mi_params->mi_stride + mi_col; const TileInfo *const tile = &xd->tile; xd->mi = mi_params->mi_grid_base + offset; xd->tx_type_map = &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col]; xd->tx_type_map_stride = mi_params->mi_stride; set_plane_n4(xd, bw, bh, num_planes); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, mi_params->mi_cols); av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); } static inline void decode_block(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize) { (void)partition; set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); decode_token_recon_block(pbi, td, r, bsize); } static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, aom_reader *r, int has_rows, int has_cols, BLOCK_SIZE bsize) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!has_rows && !has_cols) return PARTITION_SPLIT; assert(ctx >= 0); aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx]; if (has_rows && has_cols) { return (PARTITION_TYPE)aom_read_symbol( r, partition_cdf, partition_cdf_length(bsize), ACCT_STR); } else if (!has_rows && has_cols) { assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; partition_gather_vert_alike(cdf, partition_cdf, bsize); assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; } else { assert(has_rows && !has_cols); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; partition_gather_horz_alike(cdf, partition_cdf, bsize); assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); return aom_read_cdf(r, cdf, 2, ACCT_STR) ? 
PARTITION_SPLIT : PARTITION_VERT; } } // TODO(slavarnway): eliminate bsize and subsize in future commits static inline void decode_partition(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *reader, BLOCK_SIZE bsize, int parse_decode_flag) { assert(bsize < BLOCK_SIZES_ALL); AV1_COMMON *const cm = &pbi->common; DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; const int bw = mi_size_wide[bsize]; const int hbs = bw >> 1; PARTITION_TYPE partition; BLOCK_SIZE subsize; const int quarter_step = bw / 4; BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) return; // parse_decode_flag takes the following values : // 01 - do parse only // 10 - do decode only // 11 - do parse and decode static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block, decode_block, parse_decode_block }; if (parse_decode_flag & 1) { const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; ++plane) { int rcol0, rcol1, rrow0, rrow1; // Skip some unnecessary work if loop restoration is disabled if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, &rcol0, &rcol1, &rrow0, &rrow1)) { const int rstride = cm->rst_info[plane].horz_units; for (int rrow = rrow0; rrow < rrow1; ++rrow) { for (int rcol = rcol0; rcol < rcol1; ++rcol) { const int runit_idx = rcol + rrow * rstride; loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx); } } } } partition = (bsize < BLOCK_8X8) ? PARTITION_NONE : read_partition(xd, mi_row, mi_col, reader, has_rows, has_cols, bsize); } else { partition = get_partition(cm, mi_row, mi_col, bsize); } subsize = get_partition_subsize(bsize, partition); if (subsize == BLOCK_INVALID) { // When an internal error occurs ensure that xd->mi_row is set appropriately // w.r.t. current tile, which is used to signal processing of current row is // done. xd->mi_row = mi_row; aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Partition is invalid for block size %dx%d", block_size_wide[bsize], block_size_high[bsize]); } // Check the bitstream is conformant: if there is subsampling on the // chroma planes, subsize must subsample to a valid block size. const struct macroblockd_plane *const pd_u = &xd->plane[1]; if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == BLOCK_INVALID) { // When an internal error occurs ensure that xd->mi_row is set appropriately // w.r.t. current tile, which is used to signal processing of current row is // done. 
xd->mi_row = mi_row; aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Block size %dx%d invalid with this subsampling mode", block_size_wide[subsize], block_size_high[subsize]); } #define DEC_BLOCK_STX_ARG #define DEC_BLOCK_EPT_ARG partition, #define DEC_BLOCK(db_r, db_c, db_subsize) \ block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \ reader, DEC_BLOCK_EPT_ARG(db_subsize)) #define DEC_PARTITION(db_r, db_c, db_subsize) \ decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ (db_subsize), parse_decode_flag) switch (partition) { case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; case PARTITION_HORZ: DEC_BLOCK(mi_row, mi_col, subsize); if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); break; case PARTITION_VERT: DEC_BLOCK(mi_row, mi_col, subsize); if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); break; case PARTITION_SPLIT: DEC_PARTITION(mi_row, mi_col, subsize); DEC_PARTITION(mi_row, mi_col + hbs, subsize); DEC_PARTITION(mi_row + hbs, mi_col, subsize); DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); break; case PARTITION_HORZ_A: DEC_BLOCK(mi_row, mi_col, bsize2); DEC_BLOCK(mi_row, mi_col + hbs, bsize2); DEC_BLOCK(mi_row + hbs, mi_col, subsize); break; case PARTITION_HORZ_B: DEC_BLOCK(mi_row, mi_col, subsize); DEC_BLOCK(mi_row + hbs, mi_col, bsize2); DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); break; case PARTITION_VERT_A: DEC_BLOCK(mi_row, mi_col, bsize2); DEC_BLOCK(mi_row + hbs, mi_col, bsize2); DEC_BLOCK(mi_row, mi_col + hbs, subsize); break; case PARTITION_VERT_B: DEC_BLOCK(mi_row, mi_col, subsize); DEC_BLOCK(mi_row, mi_col + hbs, bsize2); DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); break; case PARTITION_HORZ_4: for (int i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break; DEC_BLOCK(this_mi_row, mi_col, subsize); } break; case PARTITION_VERT_4: for (int i = 0; i < 4; ++i) { int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break; DEC_BLOCK(mi_row, this_mi_col, subsize); } break; default: assert(0 && "Invalid partition type"); } #undef DEC_PARTITION #undef DEC_BLOCK #undef DEC_BLOCK_EPT_ARG #undef DEC_BLOCK_STX_ARG if (parse_decode_flag & 1) update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } static inline void setup_bool_decoder( MACROBLOCKD *const xd, const uint8_t *data, const uint8_t *data_end, const size_t read_size, struct aom_internal_error_info *error_info, aom_reader *r, uint8_t allow_update_cdf) { // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. if (!read_is_valid(data, read_size, data_end)) { // When internal error occurs ensure that xd->mi_row is set appropriately // w.r.t. current tile, which is used to signal processing of current row is // done in row-mt decoding. xd->mi_row = xd->tile.mi_row_start; aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); } if (aom_reader_init(r, data, read_size)) { // When internal error occurs ensure that xd->mi_row is set appropriately // w.r.t. current tile, which is used to signal processing of current row is // done in row-mt decoding. 
xd->mi_row = xd->tile.mi_row_start; aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); } r->allow_update_cdf = allow_update_cdf; } static inline void setup_segmentation(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { struct segmentation *const seg = &cm->seg; seg->update_map = 0; seg->update_data = 0; seg->temporal_update = 0; seg->enabled = aom_rb_read_bit(rb); if (!seg->enabled) { if (cm->cur_frame->seg_map) { memset(cm->cur_frame->seg_map, 0, (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols)); } memset(seg, 0, sizeof(*seg)); segfeatures_copy(&cm->cur_frame->seg, seg); return; } if (cm->seg.enabled && cm->prev_frame && (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { cm->last_frame_seg_map = cm->prev_frame->seg_map; } else { cm->last_frame_seg_map = NULL; } // Read update flags if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { // These frames can't use previous frames, so must signal map + features seg->update_map = 1; seg->temporal_update = 0; seg->update_data = 1; } else { seg->update_map = aom_rb_read_bit(rb); if (seg->update_map) { seg->temporal_update = aom_rb_read_bit(rb); } else { seg->temporal_update = 0; } seg->update_data = aom_rb_read_bit(rb); } // Segmentation data update if (seg->update_data) { av1_clearall_segfeatures(seg); for (int i = 0; i < MAX_SEGMENTS; i++) { for (int j = 0; j < SEG_LVL_MAX; j++) { int data = 0; const int feature_enabled = aom_rb_read_bit(rb); if (feature_enabled) { av1_enable_segfeature(seg, i, j); const int data_max = av1_seg_feature_data_max(j); const int data_min = -data_max; const int ubits = get_unsigned_bits(data_max); if (av1_is_segfeature_signed(j)) { data = aom_rb_read_inv_signed_literal(rb, ubits); } else { data = aom_rb_read_literal(rb, ubits); } data = clamp(data, data_min, data_max); } av1_set_segdata(seg, i, j, data); } } av1_calculate_segdata(seg); } else if (cm->prev_frame) { segfeatures_copy(seg, &cm->prev_frame->seg); } segfeatures_copy(&cm->cur_frame->seg, seg); } static inline void decode_restoration_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { assert(!cm->features.all_lossless); const int num_planes = av1_num_planes(cm); if (cm->features.allow_intrabc) return; int all_none = 1, chroma_none = 1; for (int p = 0; p < num_planes; ++p) { RestorationInfo *rsi = &cm->rst_info[p]; if (aom_rb_read_bit(rb)) { rsi->frame_restoration_type = aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; } else { rsi->frame_restoration_type = aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE; } if (rsi->frame_restoration_type != RESTORE_NONE) { all_none = 0; chroma_none &= p == 0; } } if (!all_none) { assert(cm->seq_params->sb_size == BLOCK_64X64 || cm->seq_params->sb_size == BLOCK_128X128); const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 
128 : 64; for (int p = 0; p < num_planes; ++p) cm->rst_info[p].restoration_unit_size = sb_size; RestorationInfo *rsi = &cm->rst_info[0]; if (sb_size == 64) { rsi->restoration_unit_size <<= aom_rb_read_bit(rb); } if (rsi->restoration_unit_size > 64) { rsi->restoration_unit_size <<= aom_rb_read_bit(rb); } } else { const int size = RESTORATION_UNITSIZE_MAX; for (int p = 0; p < num_planes; ++p) cm->rst_info[p].restoration_unit_size = size; } if (num_planes > 1) { int s = AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { cm->rst_info[1].restoration_unit_size = cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); } else { cm->rst_info[1].restoration_unit_size = cm->rst_info[0].restoration_unit_size; } cm->rst_info[2].restoration_unit_size = cm->rst_info[1].restoration_unit_size; } } static inline void read_wiener_filter(int wiener_win, WienerInfo *wiener_info, WienerInfo *ref_wiener_info, aom_reader *rb) { memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter)); memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter)); if (wiener_win == WIENER_WIN) wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + WIENER_FILT_TAP0_MINV; else wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0; wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + WIENER_FILT_TAP1_MINV; wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + WIENER_FILT_TAP2_MINV; // The central element has an implicit +WIENER_FILT_STEP wiener_info->vfilter[WIENER_HALFWIN] = -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] + wiener_info->vfilter[2]); if (wiener_win == WIENER_WIN) wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + WIENER_FILT_TAP0_MINV; else wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0; wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + WIENER_FILT_TAP1_MINV; wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] = aom_read_primitive_refsubexpfin( rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + WIENER_FILT_TAP2_MINV; // The central element has an implicit +WIENER_FILT_STEP wiener_info->hfilter[WIENER_HALFWIN] = -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] + wiener_info->hfilter[2]); memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); } static inline void read_sgrproj_filter(SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info, aom_reader *rb) { sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR); const sgr_params_type *params = 
&av1_sgr_params[sgrproj_info->ep]; if (params->r[0] == 0) { sgrproj_info->xqd[0] = 0; sgrproj_info->xqd[1] = aom_read_primitive_refsubexpfin( rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + SGRPROJ_PRJ_MIN1; } else if (params->r[1] == 0) { sgrproj_info->xqd[0] = aom_read_primitive_refsubexpfin( rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + SGRPROJ_PRJ_MIN0; sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); } else { sgrproj_info->xqd[0] = aom_read_primitive_refsubexpfin( rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + SGRPROJ_PRJ_MIN0; sgrproj_info->xqd[1] = aom_read_primitive_refsubexpfin( rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + SGRPROJ_PRJ_MIN1; } memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } static inline void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, int runit_idx) { const RestorationInfo *rsi = &cm->rst_info[plane]; RestorationUnitInfo *rui = &rsi->unit_info[runit_idx]; assert(rsi->frame_restoration_type != RESTORE_NONE); assert(!cm->features.all_lossless); const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; WienerInfo *wiener_info = xd->wiener_info + plane; SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { rui->restoration_type = aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES, ACCT_STR); switch (rui->restoration_type) { case RESTORE_WIENER: read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); break; case RESTORE_SGRPROJ: read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); break; default: assert(rui->restoration_type == RESTORE_NONE); break; } } else if (rsi->frame_restoration_type == RESTORE_WIENER) { if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) { rui->restoration_type = RESTORE_WIENER; read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); } else { rui->restoration_type = RESTORE_NONE; } } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) { rui->restoration_type = RESTORE_SGRPROJ; read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); } else { rui->restoration_type = RESTORE_NONE; } } } static inline void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { const int num_planes = av1_num_planes(cm); struct loopfilter *lf = &cm->lf; if (cm->features.allow_intrabc || cm->features.coded_lossless) { // write default deltas to frame buffer av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); return; } assert(!cm->features.coded_lossless); if (cm->prev_frame) { // write deltas to frame buffer memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); } else { av1_set_default_ref_deltas(lf->ref_deltas); av1_set_default_mode_deltas(lf->mode_deltas); } lf->filter_level[0] = aom_rb_read_literal(rb, 6); lf->filter_level[1] = aom_rb_read_literal(rb, 6); if (num_planes > 1) { if (lf->filter_level[0] || lf->filter_level[1]) { 
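      // Chroma filter levels are only coded when at least one of the luma
      // filter levels is nonzero.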
lf->filter_level_u = aom_rb_read_literal(rb, 6); lf->filter_level_v = aom_rb_read_literal(rb, 6); } } lf->sharpness_level = aom_rb_read_literal(rb, 3); // Read in loop filter deltas applied at the MB level based on mode or ref // frame. lf->mode_ref_delta_update = 0; lf->mode_ref_delta_enabled = aom_rb_read_bit(rb); if (lf->mode_ref_delta_enabled) { lf->mode_ref_delta_update = aom_rb_read_bit(rb); if (lf->mode_ref_delta_update) { for (int i = 0; i < REF_FRAMES; i++) if (aom_rb_read_bit(rb)) lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) if (aom_rb_read_bit(rb)) lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); } } // write deltas to frame buffer memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES); memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS); } static inline void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { const int num_planes = av1_num_planes(cm); CdefInfo *const cdef_info = &cm->cdef_info; if (cm->features.allow_intrabc) return; cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3; cdef_info->cdef_bits = aom_rb_read_literal(rb, 2); cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits; for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) { cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS); cdef_info->cdef_uv_strengths[i] = num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0; } } static inline int read_delta_q(struct aom_read_bit_buffer *rb) { return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0; } static inline void setup_quantization(CommonQuantParams *quant_params, int num_planes, bool separate_uv_delta_q, struct aom_read_bit_buffer *rb) { quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); quant_params->y_dc_delta_q = read_delta_q(rb); if (num_planes > 1) { int diff_uv_delta = 0; if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); quant_params->u_dc_delta_q = read_delta_q(rb); quant_params->u_ac_delta_q = read_delta_q(rb); if (diff_uv_delta) { quant_params->v_dc_delta_q = read_delta_q(rb); quant_params->v_ac_delta_q = read_delta_q(rb); } else { quant_params->v_dc_delta_q = quant_params->u_dc_delta_q; quant_params->v_ac_delta_q = quant_params->u_ac_delta_q; } } else { quant_params->u_dc_delta_q = 0; quant_params->u_ac_delta_q = 0; quant_params->v_dc_delta_q = 0; quant_params->v_ac_delta_q = 0; } quant_params->using_qmatrix = aom_rb_read_bit(rb); if (quant_params->using_qmatrix) { quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); if (!separate_uv_delta_q) quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; else quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); } else { quant_params->qmatrix_level_y = 0; quant_params->qmatrix_level_u = 0; quant_params->qmatrix_level_v = 0; } } // Get global dequant matrix. static const qm_val_t *get_iqmatrix(const CommonQuantParams *quant_params, int qmlevel, int plane, TX_SIZE tx_size) { assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL || qmlevel == NUM_QM_LEVELS - 1); return quant_params->giqmatrix[qmlevel][plane][tx_size]; } // Build y/uv dequant values based on segmentation. static inline void setup_segmentation_dequant(AV1_COMMON *const cm, MACROBLOCKD *const xd) { const int bit_depth = cm->seq_params->bit_depth; // When segmentation is disabled, only the first value is used. 
The // remaining are don't cares. const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1; CommonQuantParams *const quant_params = &cm->quant_params; for (int i = 0; i < max_segments; ++i) { const int qindex = xd->qindex[i]; quant_params->y_dequant_QTX[i][0] = av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth); quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth); quant_params->u_dequant_QTX[i][0] = av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth); quant_params->u_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth); quant_params->v_dequant_QTX[i][0] = av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth); quant_params->v_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth); const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i); // NB: depends on base index so there is only 1 set per frame // No quant weighting when lossless or signalled not using QM const int qmlevel_y = use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; for (int j = 0; j < TX_SIZES_ALL; ++j) { quant_params->y_iqmatrix[i][j] = get_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j); } const int qmlevel_u = use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; for (int j = 0; j < TX_SIZES_ALL; ++j) { quant_params->u_iqmatrix[i][j] = get_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j); } const int qmlevel_v = use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; for (int j = 0; j < TX_SIZES_ALL; ++j) { quant_params->v_iqmatrix[i][j] = get_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j); } } } static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) { return aom_rb_read_bit(rb) ? SWITCHABLE : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS); } static void read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, int num_bits_height, int *width, int *height) { *width = aom_rb_read_literal(rb, num_bits_width) + 1; *height = aom_rb_read_literal(rb, num_bits_height) + 1; } static inline void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { cm->render_width = cm->superres_upscaled_width; cm->render_height = cm->superres_upscaled_height; if (aom_rb_read_bit(rb)) read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height); } // TODO(afergs): make "struct aom_read_bit_buffer *const rb"? static inline void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb, int *width, int *height) { cm->superres_upscaled_width = *width; cm->superres_upscaled_height = *height; const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) return; if (aom_rb_read_bit(rb)) { cm->superres_scale_denominator = (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS); cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN; // Don't edit cm->width or cm->height directly, or the buffers won't get // resized correctly av1_calculate_scaled_superres_size(width, height, cm->superres_scale_denominator); } else { // 1:1 scaling - ie. 
no scaling, scale not provided cm->superres_scale_denominator = SCALE_NUMERATOR; } } static inline void resize_context_buffers(AV1_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Dimensions of %dx%d beyond allowed size of %dx%d.", width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); #endif if (cm->width != width || cm->height != height) { const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2); const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2); // Allocations in av1_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_params.mi_cols || new_mi_rows > cm->mi_params.mi_rows) { if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) { // The cm->mi_* values have been cleared and any existing context // buffers have been freed. Clear cm->width and cm->height to be // consistent and to force a realloc next time. cm->width = 0; cm->height = 0; aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } else { cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4); } av1_init_mi_buffers(&cm->mi_params); cm->width = width; cm->height = height; } ensure_mv_buffer(cm->cur_frame, cm); cm->cur_frame->width = cm->width; cm->cur_frame->height = cm->height; } static inline void setup_buffer_pool(AV1_COMMON *cm) { BufferPool *const pool = cm->buffer_pool; const SequenceHeader *const seq_params = cm->seq_params; lock_buffer_pool(pool); if (aom_realloc_frame_buffer( &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false, 0)) { unlock_buffer_pool(pool); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth; cm->cur_frame->buf.color_primaries = seq_params->color_primaries; cm->cur_frame->buf.transfer_characteristics = seq_params->transfer_characteristics; cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; cm->cur_frame->buf.monochrome = seq_params->monochrome; cm->cur_frame->buf.chroma_sample_position = seq_params->chroma_sample_position; cm->cur_frame->buf.color_range = seq_params->color_range; cm->cur_frame->buf.render_width = cm->render_width; cm->cur_frame->buf.render_height = cm->render_height; } static inline void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag, struct aom_read_bit_buffer *rb) { const SequenceHeader *const seq_params = cm->seq_params; int width, height; if (frame_size_override_flag) { int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); if (width > seq_params->max_frame_width || height > seq_params->max_frame_height) { aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Frame dimensions are larger than the maximum values"); } } else { width = seq_params->max_frame_width; height = seq_params->max_frame_height; } setup_superres(cm, rb, &width, &height); resize_context_buffers(cm, width, height); setup_render_size(cm, rb); setup_buffer_pool(cm); } static inline void setup_sb_size(SequenceHeader *seq_params, struct aom_read_bit_buffer *rb) { 
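/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   Two worked examples for the functions above, under the usual AV1 constants
   (treated here as assumptions): setup_superres() adds
   SUPERRES_SCALE_DENOMINATOR_MIN (9) to the 3-bit coded value to form the
   downscaling denominator against the fixed numerator SCALE_NUMERATOR (8),
   and resize_context_buffers() converts pixel dimensions to 4x4 mode-info
   units with CEIL_POWER_OF_TWO(dim, MI_SIZE_LOG2), i.e. a round-up divide
   by 4 (assuming MI_SIZE_LOG2 == 2).

     int coded = 7;                         // 3-bit value from the bitstream
     int denom = coded + 9;                 // 16: 1920 -> ~960 coded width,
                                            // rounding done by
                                            // av1_calculate_scaled_superres_size()
     int height = 727;
     int new_mi_rows = (height + 3) >> 2;   // 182 mi rows for 727 pixels
*/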
set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); } static inline int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, int ref_xss, int ref_yss, aom_bit_depth_t this_bit_depth, int this_xss, int this_yss) { return ref_bit_depth == this_bit_depth && ref_xss == this_xss && ref_yss == this_yss; } static inline void setup_frame_size_with_refs(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { int width, height; int found = 0; int has_valid_ref_frame = 0; for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { if (aom_rb_read_bit(rb)) { const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); // This will never be NULL in a normal stream, as streams are required to // have a shown keyframe before any inter frames, which would refresh all // the reference buffers. However, it might be null if we're starting in // the middle of a stream, and static analysis will error if we don't do // a null check here. if (ref_buf == NULL) { aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid condition: invalid reference buffer"); } else { const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; width = buf->y_crop_width; height = buf->y_crop_height; cm->render_width = buf->render_width; cm->render_height = buf->render_height; setup_superres(cm, rb, &width, &height); resize_context_buffers(cm, width, height); found = 1; break; } } } const SequenceHeader *const seq_params = cm->seq_params; if (!found) { int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); setup_superres(cm, rb, &width, &height); resize_context_buffers(cm, width, height); setup_render_size(cm, rb); } if (width <= 0 || height <= 0) aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame size"); // Check to make sure at least one of frames that this frame references // has valid dimensions. 
for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf.y_crop_width, ref_frame->buf.y_crop_height, width, height); } if (!has_valid_ref_frame) aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); if (!valid_ref_frame_img_fmt( ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, ref_frame->buf.subsampling_y, seq_params->bit_depth, seq_params->subsampling_x, seq_params->subsampling_y)) aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } setup_buffer_pool(cm); } // Same function as av1_read_uniform but reading from uncompresses header wb static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; const int v = aom_rb_read_literal(rb, l - 1); assert(l != 0); if (v < m) return v; else return (v << 1) - m + aom_rb_read_bit(rb); } static inline void read_tile_info_max_tile( AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int width_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); int height_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); av1_get_tile_limits(cm); tiles->uniform_spacing = aom_rb_read_bit(rb); // Read tile columns if (tiles->uniform_spacing) { tiles->log2_cols = tiles->min_log2_cols; while (tiles->log2_cols < tiles->max_log2_cols) { if (!aom_rb_read_bit(rb)) { break; } tiles->log2_cols++; } } else { int i; int start_sb; for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) { const int size_sb = 1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb)); tiles->col_start_sb[i] = start_sb; start_sb += size_sb; width_sb -= size_sb; } tiles->cols = i; tiles->col_start_sb[i] = start_sb + width_sb; } av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows, cm->mi_params.mi_cols, tiles); // Read tile rows if (tiles->uniform_spacing) { tiles->log2_rows = tiles->min_log2_rows; while (tiles->log2_rows < tiles->max_log2_rows) { if (!aom_rb_read_bit(rb)) { break; } tiles->log2_rows++; } } else { int i; int start_sb; for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) { const int size_sb = 1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb)); tiles->row_start_sb[i] = start_sb; start_sb += size_sb; height_sb -= size_sb; } tiles->rows = i; tiles->row_start_sb[i] = start_sb + height_sb; } av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles); } void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) { cm->tiles.single_tile_decoding = 0; if (cm->tiles.large_scale) { struct loopfilter *lf = &cm->lf; RestorationInfo *const rst_info = cm->rst_info; const CdefInfo *const cdef_info = &cm->cdef_info; // Figure out single_tile_decoding by loopfilter_level. 
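/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   rb_read_uniform() above implements the quasi-uniform code from the AV1
   spec: for an alphabet of n symbols, with l = get_unsigned_bits(n) and
   m = (1 << l) - n, the first m symbols use l - 1 bits and the remaining
   symbols use l bits. Assuming get_unsigned_bits(5) == 3, the n == 5 case
   works out as:

     // m = (1 << 3) - 5 = 3
     // v in {0, 1, 2} -> returned directly, coded in 2 bits
     // v == 3         -> one extra bit b is read, value (3 << 1) - 3 + b,
     //                   i.e. 3 or 4, coded in 3 bits total
*/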
const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]); const int no_cdef = cdef_info->cdef_bits == 0 && cdef_info->cdef_strengths[0] == 0 && cdef_info->cdef_uv_strengths[0] == 0; const int no_restoration = rst_info[0].frame_restoration_type == RESTORE_NONE && rst_info[1].frame_restoration_type == RESTORE_NONE && rst_info[2].frame_restoration_type == RESTORE_NONE; assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef)); assert(IMPLIES(cm->features.all_lossless, no_restoration)); cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration; } } static inline void read_tile_info(AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) { AV1_COMMON *const cm = &pbi->common; read_tile_info_max_tile(cm, rb); pbi->context_update_tile_id = 0; if (cm->tiles.rows * cm->tiles.cols > 1) { // tile to use for cdf update pbi->context_update_tile_id = aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid context_update_tile_id"); } // tile size magnitude pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; } } #if EXT_TILE_DEBUG static inline void read_ext_tile_info(AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) { AV1_COMMON *const cm = &pbi->common; // This information is stored as a separate byte. int mod = rb->bit_offset % CHAR_BIT; if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod); assert(rb->bit_offset % CHAR_BIT == 0); if (cm->tiles.cols * cm->tiles.rows > 1) { // Read the number of bytes used to store tile size pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; } } #endif // EXT_TILE_DEBUG static size_t mem_get_varsize(const uint8_t *src, int sz) { switch (sz) { case 1: return src[0]; case 2: return mem_get_le16(src); case 3: return mem_get_le24(src); case 4: return mem_get_le32(src); default: assert(0 && "Invalid size"); return -1; } } #if EXT_TILE_DEBUG // Reads the next tile returning its size and adjusting '*data' accordingly // based on 'is_last'. On return, '*data' is updated to point to the end of the // raw tile buffer in the bit stream. static inline void get_ls_tile_buffer( const uint8_t *const data_end, struct aom_internal_error_info *error_info, const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int tile_size_bytes, int col, int row, int tile_copy_mode) { size_t size; size_t copy_size = 0; const uint8_t *copy_data = NULL; if (!read_is_valid(*data, tile_size_bytes, data_end)) aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); size = mem_get_varsize(*data, tile_size_bytes); // If tile_copy_mode = 1, then the top bit of the tile header indicates copy // mode. if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { // The remaining bits in the top byte signal the row offset int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; if (offset > row) { aom_internal_error( error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid row offset in tile copy mode: row=%d offset=%d", row, offset); } // Currently, only use tiles in same column as reference tiles. 
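/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   In get_ls_tile_buffer() above (large-scale tile path), when
   tile_copy_mode is enabled the most significant bit of the size field
   signals "copy" and the remaining bits of its top byte give a row offset
   within the same tile column. A worked example with tile_size_bytes == 2
   (the field is read little-endian by mem_get_varsize()):

     size_t size = 0x8102;              // MSB set -> copy mode
     int offset = (size >> 8) & 0x7f;   // offset == 1
     // the tile data is taken from tile_buffers[row - 1][col]
*/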
copy_data = tile_buffers[row - offset][col].data; copy_size = tile_buffers[row - offset][col].size; size = 0; } else { size += AV1_MIN_TILE_SIZE_BYTES; } *data += tile_size_bytes; if (size > (size_t)(data_end - *data)) aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile size"); if (size > 0) { tile_buffers[row][col].data = *data; tile_buffers[row][col].size = size; } else { tile_buffers[row][col].data = copy_data; tile_buffers[row][col].size = copy_size; } *data += size; } // Returns the end of the last tile buffer // (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]). static const uint8_t *get_ls_tile_buffers( AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { AV1_COMMON *const cm = &pbi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; const int have_tiles = tile_cols * tile_rows > 1; const uint8_t *raw_data_end; // The end of the last tile buffer if (!have_tiles) { const size_t tile_size = data_end - data; tile_buffers[0][0].data = data; tile_buffers[0][0].size = tile_size; raw_data_end = NULL; } else { // We locate only the tile buffers that are required, which are the ones // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always // need the last (bottom right) tile buffer, as we need to know where the // end of the compressed frame buffer is for proper superframe decoding. const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL }; const uint8_t *const data_start = data; const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); const int single_row = pbi->dec_tile_row >= 0; const int tile_rows_start = single_row ? dec_tile_row : 0; const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; const int tile_cols_start = single_col ? dec_tile_col : 0; const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; const int tile_col_size_bytes = pbi->tile_col_size_bytes; const int tile_size_bytes = pbi->tile_size_bytes; int tile_width, tile_height; if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { aom_internal_error( &pbi->error, AOM_CODEC_CORRUPT_FRAME, "Not all the tiles in the tile list have the same size."); } const int tile_copy_mode = ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0; // Read tile column sizes for all columns (we need the last tile buffer) for (int c = 0; c < tile_cols; ++c) { const int is_last = c == tile_cols - 1; size_t tile_col_size; if (!is_last) { if (tile_col_size_bytes > data_end - data) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Not enough data to read tile_col_size"); } tile_col_size = mem_get_varsize(data, tile_col_size_bytes); data += tile_col_size_bytes; if (tile_col_size > (size_t)(data_end - data)) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tile_col_data_end[%d] is out of bound", c); } tile_col_data_end[c] = data + tile_col_size; } else { tile_col_size = data_end - data; tile_col_data_end[c] = data_end; } data += tile_col_size; } data = data_start; // Read the required tile sizes. for (int c = tile_cols_start; c < tile_cols_end; ++c) { const int is_last = c == tile_cols - 1; if (c > 0) data = tile_col_data_end[c - 1]; if (!is_last) data += tile_col_size_bytes; // Get the whole of the last column, otherwise stop at the required tile. for (int r = 0; r < (is_last ? 
tile_rows : tile_rows_end); ++r) { get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } // If we have not read the last column, then read it to get the last tile. if (tile_cols_end != tile_cols) { const int c = tile_cols - 1; data = tile_col_data_end[c - 1]; for (int r = 0; r < tile_rows; ++r) { get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } raw_data_end = data; } return raw_data_end; } #endif // EXT_TILE_DEBUG static const uint8_t *get_ls_single_tile_buffer( AV1Decoder *pbi, const uint8_t *data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0); tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data; tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size = (size_t)pbi->coded_tile_data_size; return data + pbi->coded_tile_data_size; } // Reads the next tile returning its size and adjusting '*data' accordingly // based on 'is_last'. static inline void get_tile_buffer(const uint8_t *const data_end, const int tile_size_bytes, int is_last, struct aom_internal_error_info *error_info, const uint8_t **data, TileBufferDec *const buf) { size_t size; if (!is_last) { if (!read_is_valid(*data, tile_size_bytes, data_end)) aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Not enough data to read tile size"); size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES; *data += tile_size_bytes; if (size > (size_t)(data_end - *data)) aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile size"); } else { size = data_end - *data; } buf->data = *data; buf->size = size; *data += size; } static inline void get_tile_buffers( AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int tc = 0; for (int r = 0; r < tile_rows; ++r) { for (int c = 0; c < tile_cols; ++c, ++tc) { TileBufferDec *const buf = &tile_buffers[r][c]; const int is_last = (tc == end_tile); const size_t hdr_offset = 0; if (tc < start_tile || tc > end_tile) continue; if (data + hdr_offset >= data_end) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Data ended before all tiles were read."); data += hdr_offset; get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error, &data, buf); } } } static inline void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb, CB_BUFFER *cb_buffer_base, const int num_planes, int mi_row, int mi_col) { AV1_COMMON *const cm = &pbi->common; int mib_size_log2 = cm->seq_params->mib_size_log2; int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); CB_BUFFER *cb_buffer = cb_buffer_base + offset; for (int plane = 0; plane < num_planes; ++plane) { dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane]; dcb->eob_data[plane] = cb_buffer->eob_data[plane]; dcb->cb_offset[plane] = 0; dcb->txb_offset[plane] = 0; } MACROBLOCKD *const xd = &dcb->xd; xd->plane[0].color_index_map = cb_buffer->color_index_map[0]; xd->plane[1].color_index_map = cb_buffer->color_index_map[1]; xd->color_index_map_offset[0] = 0; xd->color_index_map_offset[1] = 0; } static inline void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) { AV1_COMMON *const cm = &pbi->common; 
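/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   get_tile_buffer() above reads, for every tile except the last one in the
   tile group, a little-endian length of tile_size_bytes bytes and adds
   AV1_MIN_TILE_SIZE_BYTES (assumed to be 1) to obtain the payload size;
   the last tile simply extends to data_end. For example, with
   tile_size_bytes == 2:

     // bytes 0x0F 0x01 in the stream
     size_t coded = 0x010F;        // value returned by mem_get_le16()
     size_t tile_size = coded + 1; // 272-byte tile payload follows
*/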
aom_free(pbi->tile_data); pbi->allocated_tiles = 0; CHECK_MEM_ERROR(cm, pbi->tile_data, aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); pbi->allocated_tiles = n_tiles; for (int i = 0; i < n_tiles; i++) { TileDataDec *const tile_data = pbi->tile_data + i; av1_zero(tile_data->dec_row_mt_sync); } pbi->allocated_row_mt_sync_rows = 0; } // Set up nsync by width. static inline int get_sync_range(int width) { // nsync numbers are picked by testing. #if 0 if (width < 640) return 1; else if (width <= 1280) return 2; else if (width <= 4096) return 4; else return 8; #else (void)width; #endif return 1; } // Allocate memory for decoder row synchronization static inline void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm, int rows) { dec_row_mt_sync->allocated_sb_rows = rows; #if CONFIG_MULTITHREAD { int i; CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_, aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows)); if (dec_row_mt_sync->mutex_) { for (i = 0; i < rows; ++i) { pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL); } } CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_, aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows)); if (dec_row_mt_sync->cond_) { for (i = 0; i < rows; ++i) { pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL); } } } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col, aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows)); // Set up nsync. dec_row_mt_sync->sync_range = get_sync_range(cm->width); } // Deallocate decoder row synchronization related mutex and data void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { if (dec_row_mt_sync != NULL) { #if CONFIG_MULTITHREAD int i; if (dec_row_mt_sync->mutex_ != NULL) { for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]); } aom_free(dec_row_mt_sync->mutex_); } if (dec_row_mt_sync->cond_ != NULL) { for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { pthread_cond_destroy(&dec_row_mt_sync->cond_[i]); } aom_free(dec_row_mt_sync->cond_); } #endif // CONFIG_MULTITHREAD aom_free(dec_row_mt_sync->cur_sb_col); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. 
av1_zero(*dec_row_mt_sync); } } static inline void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, int c) { #if CONFIG_MULTITHREAD const int nsync = dec_row_mt_sync->sync_range; if (r && !(c & (nsync - 1))) { pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1]; pthread_mutex_lock(mutex); while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync - dec_row_mt_sync->intrabc_extra_top_right_sb_delay) { pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex); } pthread_mutex_unlock(mutex); } #else (void)dec_row_mt_sync; (void)r; (void)c; #endif // CONFIG_MULTITHREAD } static inline void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, int c, const int sb_cols) { #if CONFIG_MULTITHREAD const int nsync = dec_row_mt_sync->sync_range; int cur; int sig = 1; if (c < sb_cols - 1) { cur = c; if (c % nsync) sig = 0; } else { cur = sb_cols + nsync + dec_row_mt_sync->intrabc_extra_top_right_sb_delay; } if (sig) { pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]); dec_row_mt_sync->cur_sb_col[r] = cur; pthread_cond_signal(&dec_row_mt_sync->cond_[r]); pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]); } #else (void)dec_row_mt_sync; (void)r; (void)c; (void)sb_cols; #endif // CONFIG_MULTITHREAD } static inline void signal_decoding_done_for_erroneous_row( AV1Decoder *const pbi, const MACROBLOCKD *const xd) { AV1_COMMON *const cm = &pbi->common; const TileInfo *const tile = &xd->tile; const int sb_row_in_tile = ((xd->mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2); const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile); TileDataDec *const tile_data = pbi->tile_data + tile->tile_row * cm->tiles.cols + tile->tile_col; AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; sync_write(dec_row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1, sb_cols_in_tile); } static inline void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, const TileInfo *tile_info, const int mi_row) { AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); TileDataDec *const tile_data = pbi->tile_data + tile_info->tile_row * cm->tiles.cols + tile_info->tile_col; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); const int sb_row_in_tile = (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; int sb_col_in_tile = 0; int row_mt_exit = 0; for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += cm->seq_params->mib_size, sb_col_in_tile++) { set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile); #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif row_mt_exit = pbi->frame_row_mt_info.row_mt_exit; #if CONFIG_MULTITHREAD pthread_mutex_unlock(pbi->row_mt_mutex_); #endif if (!row_mt_exit) { // Decoding of the super-block decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, cm->seq_params->sb_size, 0x2); } sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, sb_cols_in_tile); } } static int check_trailing_bits_after_symbol_coder(aom_reader *r) { if (aom_reader_has_overflowed(r)) return -1; uint32_t nb_bits = aom_reader_tell(r); uint32_t nb_bytes = (nb_bits + 7) >> 3; const uint8_t *p = aom_reader_find_begin(r) + nb_bytes; // aom_reader_tell() returns 1 for a newly initialized decoder, and the // return value only increases as values are decoded. So nb_bits > 0, and // thus p > p_begin. Therefore accessing p[-1] is safe. 
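/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   The surrounding check_trailing_bits_after_symbol_coder() verifies the
   trailing-bit pattern that terminates the arithmetic-coded tile data:
   after the last coded bit there must be a single 1 bit followed by zero
   padding, and any whole padding bytes must be zero. Applying the formula
   from the code to nb_bits == 10:

     // pattern = 128 >> ((10 - 1) & 7) = 0x40, mask = 2 * 0x40 - 1 = 0x7f
     // so the low 7 bits of the final byte must be exactly 0x40,
     // i.e. one 1 bit followed by zeros, and every later byte must be 0x00.
*/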
uint8_t last_byte = p[-1]; uint8_t pattern = 128 >> ((nb_bits - 1) & 7); if ((last_byte & (2 * pattern - 1)) != pattern) return -1; // Make sure that all padding bytes are zero as required by the spec. const uint8_t *p_end = aom_reader_find_end(r); while (p < p_end) { if (*p != 0) return -1; p++; } return 0; } static inline void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) { td->read_coeffs_tx_intra_block_visit = decode_block_void; td->predict_and_recon_intra_block_visit = decode_block_void; td->read_coeffs_tx_inter_block_visit = decode_block_void; td->inverse_tx_inter_block_visit = decode_block_void; td->predict_inter_block_visit = predict_inter_block_void; td->cfl_store_inter_block_visit = cfl_store_inter_block_void; if (parse_decode_flag & 0x1) { td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block; td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb; } if (parse_decode_flag & 0x2) { td->predict_and_recon_intra_block_visit = predict_and_reconstruct_intra_block; td->inverse_tx_inter_block_visit = inverse_transform_inter_block; td->predict_inter_block_visit = predict_inter_block; td->cfl_store_inter_block_visit = cfl_store_inter_block; } } static inline void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row, int tile_col) { TileInfo tile_info; AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); av1_tile_set_row(&tile_info, cm, tile_row); av1_tile_set_col(&tile_info, cm, tile_col); DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end, tile_row); av1_reset_loop_filter_delta(xd, num_planes); av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0); // Bit-stream parsing and decoding of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, cm->seq_params->sb_size, 0x3); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); return; } } } int corrupted = (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0; aom_merge_corrupted_flag(&dcb->corrupted, corrupted); } static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; ThreadData *const td = &pbi->td; CommonTileParams *const tiles = &cm->tiles; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; const int n_tiles = tile_cols * tile_rows; TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); const int single_row = pbi->dec_tile_row >= 0; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; int tile_rows_start; int tile_rows_end; int tile_cols_start; int tile_cols_end; int inv_col_order; int inv_row_order; int tile_row, tile_col; uint8_t allow_update_cdf; const uint8_t *raw_data_end = NULL; if (tiles->large_scale) { tile_rows_start = single_row ? dec_tile_row : 0; tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; tile_cols_start = single_col ? dec_tile_col : 0; tile_cols_end = single_col ? 
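/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   set_decode_func_pointers() above switches the per-block visitor callbacks
   with a 2-bit mask: 0x1 installs the coefficient/bitstream parsing
   visitors and 0x2 installs the prediction/reconstruction visitors. The
   single-threaded decode_tile() path uses 0x3 (parse and reconstruct in one
   pass), while the row-multithreaded path runs a parse-only phase followed
   by a reconstruct-only phase:

     set_decode_func_pointers(td, 0x3);   // parse + reconstruct
     set_decode_func_pointers(td, 0x1);   // parse only (row-MT phase 1)
     set_decode_func_pointers(td, 0x2);   // reconstruct only (row-MT phase 2)
*/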
tile_cols_start + 1 : tile_cols; inv_col_order = pbi->inv_tile_order && !single_col; inv_row_order = pbi->inv_tile_order && !single_row; allow_update_cdf = 0; } else { tile_rows_start = 0; tile_rows_end = tile_rows; tile_cols_start = 0; tile_cols_end = tile_cols; inv_col_order = pbi->inv_tile_order; inv_row_order = pbi->inv_tile_order; allow_update_cdf = 1; } // No tiles to decode. if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || // First tile is larger than end_tile. tile_rows_start * tiles->cols + tile_cols_start > end_tile || // Last tile is smaller than start_tile. (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile) return data; allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); #if EXT_TILE_DEBUG if (tiles->large_scale && !pbi->ext_tile_debug) raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers); else if (tiles->large_scale && pbi->ext_tile_debug) raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); else #endif // EXT_TILE_DEBUG get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } if (pbi->dcb.xd.seg_mask == NULL) CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, (uint8_t *)aom_memalign( 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { aom_accounting_reset(&pbi->accounting); } #endif set_decode_func_pointers(&pbi->td, 0x3); // Load all tile information into thread_data. td->dcb = pbi->dcb; td->dcb.corrupted = 0; td->dcb.mc_buf[0] = td->mc_buf[0]; td->dcb.mc_buf[1] = td->mc_buf[1]; td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst; for (int j = 0; j < 2; ++j) { td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j]; } for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { const int col = inv_col_order ? 
tile_cols - 1 - tile_col : tile_col; TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col; const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; if (row * tiles->cols + col < start_tile || row * tiles->cols + col > end_tile) continue; td->bit_reader = &tile_data->bit_reader; av1_zero(td->cb_buffer_base.dqcoeff); av1_tile_init(&td->dcb.xd.tile, cm, row, col); td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex; setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end, tile_bs_buf->size, &pbi->error, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { td->bit_reader->accounting = &pbi->accounting; td->bit_reader->accounting->last_tell_frac = aom_reader_tell_frac(td->bit_reader); } else { td->bit_reader->accounting = NULL; } #endif av1_init_macroblockd(cm, &td->dcb.xd); av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row, &td->dcb.xd); // Initialise the tile context from the frame context tile_data->tctx = *cm->fc; td->dcb.xd.tile_ctx = &tile_data->tctx; // decode tile decode_tile(pbi, td, row, col); aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted); if (pbi->dcb.corrupted) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); } } if (tiles->large_scale) { if (n_tiles == 1) { // Find the end of the single tile buffer return aom_reader_find_end(&pbi->tile_data->bit_reader); } // Return the end of the last tile buffer return raw_data_end; } TileDataDec *const tile_data = pbi->tile_data + end_tile; return aom_reader_find_end(&tile_data->bit_reader); } static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { TileJobsDec *cur_job_info = NULL; #if CONFIG_MULTITHREAD pthread_mutex_lock(tile_mt_info->job_mutex); if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) { cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued; tile_mt_info->jobs_dequeued++; } pthread_mutex_unlock(tile_mt_info->job_mutex); #else (void)tile_mt_info; #endif return cur_job_info; } static inline void tile_worker_hook_init(AV1Decoder *const pbi, DecWorkerData *const thread_data, const TileBufferDec *const tile_buffer, TileDataDec *const tile_data, uint8_t allow_update_cdf) { AV1_COMMON *cm = &pbi->common; ThreadData *const td = thread_data->td; int tile_row = tile_data->tile_info.tile_row; int tile_col = tile_data->tile_info.tile_col; td->bit_reader = &tile_data->bit_reader; av1_zero(td->cb_buffer_base.dqcoeff); MACROBLOCKD *const xd = &td->dcb.xd; av1_tile_init(&xd->tile, cm, tile_row, tile_col); xd->current_base_qindex = cm->quant_params.base_qindex; setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end, tile_buffer->size, &thread_data->error_info, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { td->bit_reader->accounting = &pbi->accounting; td->bit_reader->accounting->last_tell_frac = aom_reader_tell_frac(td->bit_reader); } else { td->bit_reader->accounting = NULL; } #endif av1_init_macroblockd(cm, xd); xd->error_info = &thread_data->error_info; av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd); // Initialise the tile context from the frame context tile_data->tctx = *cm->fc; xd->tile_ctx = &tile_data->tctx; #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { tile_data->bit_reader.accounting->last_tell_frac = aom_reader_tell_frac(&tile_data->bit_reader); } #endif } static int tile_worker_hook(void *arg1, void *arg2) { DecWorkerData *const thread_data = (DecWorkerData *)arg1; AV1Decoder 
*const pbi = (AV1Decoder *)arg2; AV1_COMMON *cm = &pbi->common; ThreadData *const td = thread_data->td; uint8_t allow_update_cdf; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(thread_data->error_info.jmp)) { thread_data->error_info.setjmp = 0; thread_data->td->dcb.corrupted = 1; return 0; } thread_data->error_info.setjmp = 1; allow_update_cdf = cm->tiles.large_scale ? 0 : 1; allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; set_decode_func_pointers(td, 0x3); assert(cm->tiles.cols > 0); while (!td->dcb.corrupted) { TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); if (cur_job_info != NULL) { const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; TileDataDec *const tile_data = cur_job_info->tile_data; tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, allow_update_cdf); // decode tile int tile_row = tile_data->tile_info.tile_row; int tile_col = tile_data->tile_info.tile_col; decode_tile(pbi, td, tile_row, tile_col); } else { break; } } thread_data->error_info.setjmp = 0; return !td->dcb.corrupted; } static inline int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, const TileInfo *tile) { // NOTE: Currently value of max workers is calculated based // on the parse and decode time. As per the theoretical estimate // when percentage of parse time is equal to percentage of decode // time, number of workers needed to parse + decode a tile can not // exceed more than 2. // TODO(any): Modify this value if parsing is optimized in future. int sb_rows = av1_get_sb_rows_in_tile(cm, tile); int max_workers = sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE; return max_workers; } // The caller must hold pbi->row_mt_mutex_ when calling this function. // Returns 1 if either the next job is stored in *next_job_info or 1 is stored // in *end_of_frame. // NOTE: The caller waits on pbi->row_mt_cond_ if this function returns 0. // The return value of this function depends on the following variables: // - frame_row_mt_info->mi_rows_parse_done // - frame_row_mt_info->mi_rows_decode_started // - frame_row_mt_info->row_mt_exit // Therefore we may need to signal or broadcast pbi->row_mt_cond_ if any of // these variables is modified. static int get_next_job_info(AV1Decoder *const pbi, AV1DecRowMTJobInfo *next_job_info, int *end_of_frame) { AV1_COMMON *cm = &pbi->common; TileDataDec *tile_data; AV1DecRowMTSync *dec_row_mt_sync; AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; const int tile_rows_start = frame_row_mt_info->tile_rows_start; const int tile_rows_end = frame_row_mt_info->tile_rows_end; const int tile_cols_start = frame_row_mt_info->tile_cols_start; const int tile_cols_end = frame_row_mt_info->tile_cols_end; const int start_tile = frame_row_mt_info->start_tile; const int end_tile = frame_row_mt_info->end_tile; const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; int num_mis_to_decode, num_threads_working; int num_mis_waiting_for_decode; int min_threads_working = INT_MAX; int max_mis_to_decode = 0; int tile_row_idx, tile_col_idx; int tile_row = -1; int tile_col = -1; memset(next_job_info, 0, sizeof(*next_job_info)); // Frame decode is completed or error is encountered. 
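/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   tile_worker_hook() above uses the setjmp()/longjmp() error-handling
   convention shared by the decoder workers: the worker arms error_info.jmp
   with setjmp() and sets error_info.setjmp = 1; any aom_internal_error()
   raised during decoding longjmp()s back, the worker marks its
   DecoderCodingBlock as corrupted and returns 0, and the setjmp flag is
   cleared again before every normal return. A minimal sketch of the same
   pattern (my_worker_hook is a hypothetical name, for illustration only):

     static int my_worker_hook(void *arg1, void *arg2) {
       DecWorkerData *thread_data = (DecWorkerData *)arg1;
       (void)arg2;
       if (setjmp(thread_data->error_info.jmp)) {   // error path
         thread_data->error_info.setjmp = 0;
         return 0;
       }
       thread_data->error_info.setjmp = 1;
       // ... decoding work that may call aom_internal_error() ...
       thread_data->error_info.setjmp = 0;
       return 1;
     }
*/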
*end_of_frame = (frame_row_mt_info->mi_rows_decode_started == frame_row_mt_info->mi_rows_to_decode) || (frame_row_mt_info->row_mt_exit == 1); if (*end_of_frame) { return 1; } // Decoding cannot start as bit-stream parsing is not complete. assert(frame_row_mt_info->mi_rows_parse_done >= frame_row_mt_info->mi_rows_decode_started); if (frame_row_mt_info->mi_rows_parse_done == frame_row_mt_info->mi_rows_decode_started) return 0; // Choose the tile to decode. for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end; ++tile_row_idx) { for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end; ++tile_col_idx) { if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile || tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile) continue; tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx; dec_row_mt_sync = &tile_data->dec_row_mt_sync; num_threads_working = dec_row_mt_sync->num_threads_working; num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done - dec_row_mt_sync->mi_rows_decode_started) * dec_row_mt_sync->mi_cols; num_mis_to_decode = (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) * dec_row_mt_sync->mi_cols; assert(num_mis_to_decode >= num_mis_waiting_for_decode); // Pick the tile which has minimum number of threads working on it. if (num_mis_waiting_for_decode > 0) { if (num_threads_working < min_threads_working) { min_threads_working = num_threads_working; max_mis_to_decode = 0; } if (num_threads_working == min_threads_working && num_mis_to_decode > max_mis_to_decode && num_threads_working < get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) { max_mis_to_decode = num_mis_to_decode; tile_row = tile_row_idx; tile_col = tile_col_idx; } } } } // No job found to process if (tile_row == -1 || tile_col == -1) return 0; tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; dec_row_mt_sync = &tile_data->dec_row_mt_sync; next_job_info->tile_row = tile_row; next_job_info->tile_col = tile_col; next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started + tile_data->tile_info.mi_row_start; dec_row_mt_sync->num_threads_working++; dec_row_mt_sync->mi_rows_decode_started += sb_mi_size; frame_row_mt_info->mi_rows_decode_started += sb_mi_size; assert(frame_row_mt_info->mi_rows_parse_done >= frame_row_mt_info->mi_rows_decode_started); #if CONFIG_MULTITHREAD if (frame_row_mt_info->mi_rows_decode_started == frame_row_mt_info->mi_rows_to_decode) { pthread_cond_broadcast(pbi->row_mt_cond_); } #endif return 1; } static inline void signal_parse_sb_row_done(AV1Decoder *const pbi, TileDataDec *const tile_data, const int sb_mi_size) { AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif assert(frame_row_mt_info->mi_rows_parse_done >= frame_row_mt_info->mi_rows_decode_started); tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size; frame_row_mt_info->mi_rows_parse_done += sb_mi_size; #if CONFIG_MULTITHREAD // A new decode job is available. Wake up one worker thread to handle the // new decode job. // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started // by the same increment (sb_mi_size). pthread_cond_signal(pbi->row_mt_cond_); pthread_mutex_unlock(pbi->row_mt_mutex_); #endif } // This function is very similar to decode_tile(). It would be good to figure // out how to share code. 
static inline void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, TileDataDec *const tile_data) { AV1_COMMON *const cm = &pbi->common; const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int num_planes = av1_num_planes(cm); const TileInfo *const tile_info = &tile_data->tile_info; int tile_row = tile_info->tile_row; DecoderCodingBlock *const dcb = &td->dcb; MACROBLOCKD *const xd = &dcb->xd; av1_zero_above_context(cm, xd, tile_info->mi_col_start, tile_info->mi_col_end, tile_row); av1_reset_loop_filter_delta(xd, num_planes); av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); // Bit-stream parsing of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, cm->seq_params->sb_size, 0x1); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); return; } } signal_parse_sb_row_done(pbi, tile_data, sb_mi_size); } int corrupted = (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0; aom_merge_corrupted_flag(&dcb->corrupted, corrupted); } static int row_mt_worker_hook(void *arg1, void *arg2) { DecWorkerData *const thread_data = (DecWorkerData *)arg1; AV1Decoder *const pbi = (AV1Decoder *)arg2; ThreadData *const td = thread_data->td; uint8_t allow_update_cdf; AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; td->dcb.corrupted = 0; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(thread_data->error_info.jmp)) { thread_data->error_info.setjmp = 0; thread_data->td->dcb.corrupted = 1; #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif frame_row_mt_info->row_mt_exit = 1; #if CONFIG_MULTITHREAD pthread_cond_broadcast(pbi->row_mt_cond_); pthread_mutex_unlock(pbi->row_mt_mutex_); #endif // If any SB row (erroneous row) processed by a thread encounters an // internal error, there is a need to indicate other threads that decoding // of the erroneous row is complete. This ensures that other threads which // wait upon the completion of SB's present in erroneous row are not waiting // indefinitely. signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd); return 0; } thread_data->error_info.setjmp = 1; AV1_COMMON *cm = &pbi->common; allow_update_cdf = cm->tiles.large_scale ? 
0 : 1; allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; set_decode_func_pointers(td, 0x1); assert(cm->tiles.cols > 0); while (!td->dcb.corrupted) { TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); if (cur_job_info != NULL) { const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; TileDataDec *const tile_data = cur_job_info->tile_data; tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, allow_update_cdf); #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif tile_data->dec_row_mt_sync.num_threads_working++; #if CONFIG_MULTITHREAD pthread_mutex_unlock(pbi->row_mt_mutex_); #endif // decode tile parse_tile_row_mt(pbi, td, tile_data); #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif tile_data->dec_row_mt_sync.num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(pbi->row_mt_mutex_); #endif } else { break; } } if (td->dcb.corrupted) { thread_data->error_info.setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif frame_row_mt_info->row_mt_exit = 1; #if CONFIG_MULTITHREAD pthread_cond_broadcast(pbi->row_mt_cond_); pthread_mutex_unlock(pbi->row_mt_mutex_); #endif return 0; } set_decode_func_pointers(td, 0x2); while (1) { AV1DecRowMTJobInfo next_job_info; int end_of_frame = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) { #if CONFIG_MULTITHREAD pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_); #endif } #if CONFIG_MULTITHREAD pthread_mutex_unlock(pbi->row_mt_mutex_); #endif if (end_of_frame) break; int tile_row = next_job_info.tile_row; int tile_col = next_job_info.tile_col; int mi_row = next_job_info.mi_row; TileDataDec *tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col); av1_init_macroblockd(cm, &td->dcb.xd); td->dcb.xd.error_info = &thread_data->error_info; decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row); #if CONFIG_MULTITHREAD pthread_mutex_lock(pbi->row_mt_mutex_); #endif dec_row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(pbi->row_mt_mutex_); #endif } thread_data->error_info.setjmp = 0; return !td->dcb.corrupted; } // sorts in descending order static int compare_tile_buffers(const void *a, const void *b) { const TileJobsDec *const buf1 = (const TileJobsDec *)a; const TileJobsDec *const buf2 = (const TileJobsDec *)b; return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size)); } static inline void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm, int tile_rows_start, int tile_rows_end, int tile_cols_start, int tile_cols_end, int start_tile, int end_tile) { AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info; TileJobsDec *tile_job_queue = tile_mt_info->job_queue; tile_mt_info->jobs_enqueued = 0; tile_mt_info->jobs_dequeued = 0; for (int row = tile_rows_start; row < tile_rows_end; row++) { for (int col = tile_cols_start; col < tile_cols_end; col++) { if (row * cm->tiles.cols + col < start_tile || row * cm->tiles.cols + col > end_tile) continue; tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col]; tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col; tile_job_queue++; tile_mt_info->jobs_enqueued++; } } } static inline void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm, int tile_rows, int tile_cols) { tile_mt_info->alloc_tile_rows = 
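/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   compare_tile_buffers() above orders tile jobs by descending compressed
   size, and tile_mt_queue() below qsort()s the job queue with it before the
   workers start pulling jobs; presumably this helps load balancing, since
   the largest (typically slowest) tiles are picked up first. Usage,
   paraphrased from tile_mt_queue():

     qsort(tile_mt_info->job_queue, tile_mt_info->jobs_enqueued,
           sizeof(tile_mt_info->job_queue[0]),
           compare_tile_buffers);   // biggest tile buffers first
*/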
tile_rows; tile_mt_info->alloc_tile_cols = tile_cols; int num_tiles = tile_rows * tile_cols; #if CONFIG_MULTITHREAD { CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex, aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles)); for (int i = 0; i < num_tiles; i++) { pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL); } } #endif CHECK_MEM_ERROR(cm, tile_mt_info->job_queue, aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); } void av1_free_mc_tmp_buf(ThreadData *thread_data) { int ref; for (ref = 0; ref < 2; ref++) { if (thread_data->mc_buf_use_highbd) aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref])); else aom_free(thread_data->mc_buf[ref]); thread_data->mc_buf[ref] = NULL; } thread_data->mc_buf_size = 0; thread_data->mc_buf_use_highbd = 0; aom_free(thread_data->tmp_conv_dst); thread_data->tmp_conv_dst = NULL; aom_free(thread_data->seg_mask); thread_data->seg_mask = NULL; for (int i = 0; i < 2; ++i) { aom_free(thread_data->tmp_obmc_bufs[i]); thread_data->tmp_obmc_bufs[i] = NULL; } } static inline void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data, int buf_size, int use_highbd) { for (int ref = 0; ref < 2; ref++) { // The mc_buf/hbd_mc_buf must be zeroed to fix a intermittent valgrind error // 'Conditional jump or move depends on uninitialised value' from the loop // filter. Uninitialized reads in convolve function (e.g. horiz_4tap path in // av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are seen to be the // potential reason for this issue. if (use_highbd) { uint16_t *hbd_mc_buf; CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size)); memset(hbd_mc_buf, 0, buf_size); thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf); } else { CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref], (uint8_t *)aom_memalign(16, buf_size)); memset(thread_data->mc_buf[ref], 0, buf_size); } } thread_data->mc_buf_size = buf_size; thread_data->mc_buf_use_highbd = use_highbd; CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*thread_data->tmp_conv_dst))); CHECK_MEM_ERROR(cm, thread_data->seg_mask, (uint8_t *)aom_memalign( 16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask))); for (int i = 0; i < 2; ++i) { CHECK_MEM_ERROR( cm, thread_data->tmp_obmc_bufs[i], aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*thread_data->tmp_obmc_bufs[i]))); } } static inline void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); // Reset tile decoding hook for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { AVxWorker *const worker = &pbi->tile_workers[worker_idx]; DecWorkerData *const thread_data = pbi->thread_data + worker_idx; thread_data->td->dcb = pbi->dcb; thread_data->td->dcb.corrupted = 0; thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0]; thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1]; thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; if (worker_idx) thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask; for (int j = 0; j < 2; ++j) { thread_data->td->dcb.xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; } winterface->sync(worker); worker->hook = worker_hook; worker->data1 = thread_data; worker->data2 = pbi; } #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { aom_accounting_reset(&pbi->accounting); } #endif } static inline void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end, int num_workers) { const AVxWorkerInterface *const winterface 
= aom_get_worker_interface(); for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) { AVxWorker *const worker = &pbi->tile_workers[worker_idx]; DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; thread_data->data_end = data_end; worker->had_error = 0; if (worker_idx == 0) { winterface->execute(worker); } else { winterface->launch(worker); } } } static inline void sync_dec_workers(AV1Decoder *pbi, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int corrupted = 0; for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) { AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); } pbi->dcb.corrupted = corrupted; } static inline void decode_mt_init(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int worker_idx; // Create workers and thread_data if (pbi->num_workers == 0) { const int num_threads = pbi->max_threads; CHECK_MEM_ERROR(cm, pbi->tile_workers, aom_malloc(num_threads * sizeof(*pbi->tile_workers))); CHECK_MEM_ERROR(cm, pbi->thread_data, aom_calloc(num_threads, sizeof(*pbi->thread_data))); for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { AVxWorker *const worker = &pbi->tile_workers[worker_idx]; DecWorkerData *const thread_data = pbi->thread_data + worker_idx; winterface->init(worker); worker->thread_name = "aom tile worker"; if (worker_idx != 0 && !winterface->reset(worker)) { aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Tile decoder thread creation failed"); } ++pbi->num_workers; if (worker_idx != 0) { // Allocate thread data. CHECK_MEM_ERROR(cm, thread_data->td, aom_memalign(32, sizeof(*thread_data->td))); av1_zero(*thread_data->td); } else { // Main thread acts as a worker and uses the thread data in pbi thread_data->td = &pbi->td; } thread_data->error_info.error_code = AOM_CODEC_OK; thread_data->error_info.setjmp = 0; } } const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; if (thread_data->td->mc_buf_size != buf_size) { av1_free_mc_tmp_buf(thread_data->td); allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); } } } static inline void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows, int tile_rows_start, int tile_rows_end, int tile_cols_start, int tile_cols_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || pbi->tile_mt_info.alloc_tile_rows != tile_rows) { av1_dealloc_dec_jobs(&pbi->tile_mt_info); alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); } enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, tile_cols_end, start_tile, end_tile); qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); } static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; CommonTileParams *const tiles = &cm->tiles; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; const int n_tiles = tile_cols * tile_rows; TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; const int dec_tile_row = AOMMIN(pbi->dec_tile_row, 
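/* [Editor's note: illustrative sketch only, not part of the libaom sources.]
   launch_dec_workers() above starts the workers in reverse order and runs
   worker 0 with winterface->execute(), i.e. synchronously on the calling
   thread, while the remaining workers are launched asynchronously with
   winterface->launch(); sync_dec_workers() then joins them and merges their
   corruption status into pbi->dcb.corrupted. The calling pattern, as used
   by decode_tiles_mt():

     reset_dec_workers(pbi, tile_worker_hook, num_workers);
     launch_dec_workers(pbi, data_end, num_workers);
     sync_dec_workers(pbi, num_workers);   // blocks until all workers finish
*/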
tile_rows); const int single_row = pbi->dec_tile_row >= 0; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; int tile_rows_start; int tile_rows_end; int tile_cols_start; int tile_cols_end; int tile_count_tg; int num_workers; const uint8_t *raw_data_end = NULL; if (tiles->large_scale) { tile_rows_start = single_row ? dec_tile_row : 0; tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; tile_cols_start = single_col ? dec_tile_col : 0; tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; } else { tile_rows_start = 0; tile_rows_end = tile_rows; tile_cols_start = 0; tile_cols_end = tile_cols; } tile_count_tg = end_tile - start_tile + 1; num_workers = AOMMIN(pbi->max_threads, tile_count_tg); // No tiles to decode. if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || // First tile is larger than end_tile. tile_rows_start * tile_cols + tile_cols_start > end_tile || // Last tile is smaller than start_tile. (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) return data; assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); assert(tile_count_tg > 0); assert(num_workers > 0); assert(start_tile <= end_tile); assert(start_tile >= 0 && end_tile < n_tiles); decode_mt_init(pbi); // get tile size in tile group #if EXT_TILE_DEBUG if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); if (tiles->large_scale) raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); else #endif // EXT_TILE_DEBUG get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } if (pbi->dcb.xd.seg_mask == NULL) CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, (uint8_t *)aom_memalign( 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; av1_tile_init(&tile_data->tile_info, cm, row, col); } } tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, tile_cols_start, tile_cols_end, start_tile, end_tile); reset_dec_workers(pbi, tile_worker_hook, num_workers); launch_dec_workers(pbi, data_end, num_workers); sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { if (n_tiles == 1) { // Find the end of the single tile buffer return aom_reader_find_end(&pbi->tile_data->bit_reader); } // Return the end of the last tile buffer return raw_data_end; } TileDataDec *const tile_data = pbi->tile_data + end_tile; return aom_reader_find_end(&tile_data->bit_reader); } static inline void dec_alloc_cb_buf(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); if (pbi->cb_buffer_alloc_size < size) { av1_dec_free_cb_buf(pbi); CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size); pbi->cb_buffer_alloc_size = size; } } static inline void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, int tile_rows_end, int tile_cols_start, int tile_cols_end, int start_tile, int end_tile, int max_sb_rows) { AV1_COMMON *const cm = &pbi->common; AV1DecRowMTInfo 
*frame_row_mt_info = &pbi->frame_row_mt_info; frame_row_mt_info->tile_rows_start = tile_rows_start; frame_row_mt_info->tile_rows_end = tile_rows_end; frame_row_mt_info->tile_cols_start = tile_cols_start; frame_row_mt_info->tile_cols_end = tile_cols_end; frame_row_mt_info->start_tile = start_tile; frame_row_mt_info->end_tile = end_tile; frame_row_mt_info->mi_rows_to_decode = 0; frame_row_mt_info->mi_rows_parse_done = 0; frame_row_mt_info->mi_rows_decode_started = 0; frame_row_mt_info->row_mt_exit = 0; for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { if (tile_row * cm->tiles.cols + tile_col < start_tile || tile_row * cm->tiles.cols + tile_col > end_tile) continue; TileDataDec *const tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; const TileInfo *const tile_info = &tile_data->tile_info; tile_data->dec_row_mt_sync.mi_rows_parse_done = 0; tile_data->dec_row_mt_sync.mi_rows_decode_started = 0; tile_data->dec_row_mt_sync.num_threads_working = 0; tile_data->dec_row_mt_sync.mi_rows = ALIGN_POWER_OF_TWO(tile_info->mi_row_end - tile_info->mi_row_start, cm->seq_params->mib_size_log2); tile_data->dec_row_mt_sync.mi_cols = ALIGN_POWER_OF_TWO(tile_info->mi_col_end - tile_info->mi_col_start, cm->seq_params->mib_size_log2); tile_data->dec_row_mt_sync.intrabc_extra_top_right_sb_delay = av1_get_intrabc_extra_top_right_sb_delay(cm); frame_row_mt_info->mi_rows_to_decode += tile_data->dec_row_mt_sync.mi_rows; // Initialize cur_sb_col to -1 for all SB rows. memset(tile_data->dec_row_mt_sync.cur_sb_col, -1, sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows); } } #if CONFIG_MULTITHREAD if (pbi->row_mt_mutex_ == NULL) { CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_, aom_malloc(sizeof(*(pbi->row_mt_mutex_)))); if (pbi->row_mt_mutex_) { pthread_mutex_init(pbi->row_mt_mutex_, NULL); } } if (pbi->row_mt_cond_ == NULL) { CHECK_MEM_ERROR(cm, pbi->row_mt_cond_, aom_malloc(sizeof(*(pbi->row_mt_cond_)))); if (pbi->row_mt_cond_) { pthread_cond_init(pbi->row_mt_cond_, NULL); } } #endif } static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, int start_tile, int end_tile) { AV1_COMMON *const cm = &pbi->common; CommonTileParams *const tiles = &cm->tiles; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; const int n_tiles = tile_cols * tile_rows; TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); const int single_row = pbi->dec_tile_row >= 0; const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); const int single_col = pbi->dec_tile_col >= 0; int tile_rows_start; int tile_rows_end; int tile_cols_start; int tile_cols_end; int tile_count_tg; int num_workers = 0; int max_threads; const uint8_t *raw_data_end = NULL; int max_sb_rows = 0; if (tiles->large_scale) { tile_rows_start = single_row ? dec_tile_row : 0; tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; tile_cols_start = single_col ? dec_tile_col : 0; tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; } else { tile_rows_start = 0; tile_rows_end = tile_rows; tile_cols_start = 0; tile_cols_end = tile_cols; } tile_count_tg = end_tile - start_tile + 1; max_threads = pbi->max_threads; // No tiles to decode. if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || // First tile is larger than end_tile. 
tile_rows_start * tile_cols + tile_cols_start > end_tile || // Last tile is smaller than start_tile. (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) return data; assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); assert(tile_count_tg > 0); assert(max_threads > 0); assert(start_tile <= end_tile); assert(start_tile >= 0 && end_tile < n_tiles); (void)tile_count_tg; decode_mt_init(pbi); // get tile size in tile group #if EXT_TILE_DEBUG if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); if (tiles->large_scale) raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); else #endif // EXT_TILE_DEBUG get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { if (pbi->tile_data != NULL) { for (int i = 0; i < pbi->allocated_tiles; i++) { TileDataDec *const tile_data = pbi->tile_data + i; av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); } } decoder_alloc_tile_data(pbi, n_tiles); } if (pbi->dcb.xd.seg_mask == NULL) CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, (uint8_t *)aom_memalign( 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; av1_tile_init(&tile_data->tile_info, cm, row, col); max_sb_rows = AOMMAX(max_sb_rows, av1_get_sb_rows_in_tile(cm, &tile_data->tile_info)); num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info); } } num_workers = AOMMIN(num_workers, max_threads); if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { for (int i = 0; i < n_tiles; ++i) { TileDataDec *const tile_data = pbi->tile_data + i; av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows); } pbi->allocated_row_mt_sync_rows = max_sb_rows; } tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, tile_cols_start, tile_cols_end, start_tile, end_tile); dec_alloc_cb_buf(pbi); row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start, tile_cols_end, start_tile, end_tile, max_sb_rows); reset_dec_workers(pbi, row_mt_worker_hook, num_workers); launch_dec_workers(pbi, data_end, num_workers); sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { if (n_tiles == 1) { // Find the end of the single tile buffer return aom_reader_find_end(&pbi->tile_data->bit_reader); } // Return the end of the last tile buffer return raw_data_end; } TileDataDec *const tile_data = pbi->tile_data + end_tile; return aom_reader_find_end(&tile_data->bit_reader); } static inline void error_handler(void *data) { AV1_COMMON *const cm = (AV1_COMMON *)data; aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); } // Reads the high_bitdepth and twelve_bit fields in color_config() and sets // seq_params->bit_depth based on the values of those fields and // seq_params->profile. Reports errors by calling rb->error_handler() or // aom_internal_error(). static inline void read_bitdepth(struct aom_read_bit_buffer *rb, SequenceHeader *seq_params, struct aom_internal_error_info *error_info) { const int high_bitdepth = aom_rb_read_bit(rb); if (seq_params->profile == PROFILE_2 && high_bitdepth) { const int twelve_bit = aom_rb_read_bit(rb); seq_params->bit_depth = twelve_bit ? 
AOM_BITS_12 : AOM_BITS_10; } else if (seq_params->profile <= PROFILE_2) { seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8; } else { aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, "Unsupported profile/bit-depth combination"); } #if !CONFIG_AV1_HIGHBITDEPTH if (seq_params->bit_depth > AOM_BITS_8) { aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, "Bit-depth %d not supported", seq_params->bit_depth); } #endif } static void read_film_grain_params(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; const SequenceHeader *const seq_params = cm->seq_params; pars->apply_grain = aom_rb_read_bit(rb); if (!pars->apply_grain) { memset(pars, 0, sizeof(*pars)); return; } pars->random_seed = aom_rb_read_literal(rb, 16); if (cm->current_frame.frame_type == INTER_FRAME) pars->update_parameters = aom_rb_read_bit(rb); else pars->update_parameters = 1; pars->bit_depth = seq_params->bit_depth; if (!pars->update_parameters) { // inherit parameters from a previous reference frame int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); // Section 6.8.20: It is a requirement of bitstream conformance that // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value // of j in the range 0 to REFS_PER_FRAME - 1. int found = 0; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) { found = 1; break; } } if (!found) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid film grain reference idx %d. ref_frame_idx = " "{%d, %d, %d, %d, %d, %d, %d}", film_grain_params_ref_idx, cm->remapped_ref_idx[0], cm->remapped_ref_idx[1], cm->remapped_ref_idx[2], cm->remapped_ref_idx[3], cm->remapped_ref_idx[4], cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]); } RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; if (buf == NULL) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid Film grain reference idx"); } if (!buf->film_grain_params_present) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Film grain reference parameters not available"); } uint16_t random_seed = pars->random_seed; *pars = buf->film_grain_params; // inherit paramaters pars->random_seed = random_seed; // with new random seed return; } // Scaling functions parameters pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 if (pars->num_y_points > 14) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain luma scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_y_points; i++) { pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); } if (!seq_params->monochrome) pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); else pars->chroma_scaling_from_luma = 0; if (seq_params->monochrome || pars->chroma_scaling_from_luma || ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { pars->num_cb_points = 0; pars->num_cr_points = 0; } else { pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cb_points > 10) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain cb scaling function " "exceeds the maximum value."); for (int i = 0; i 
< pars->num_cb_points; i++) { pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); } pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cr_points > 10) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain cr scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_cr_points; i++) { pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); } if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "In YCbCr 4:2:0, film grain shall be applied " "to both chroma components or neither."); } pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value // AR coefficients // Only sent if the corresponsing scaling function has // more than 0 points pars->ar_coeff_lag = aom_rb_read_literal(rb, 2); int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); int num_pos_chroma = num_pos_luma; if (pars->num_y_points > 0) ++num_pos_chroma; if (pars->num_y_points) for (int i = 0; i < num_pos_luma; i++) pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128; if (pars->num_cb_points || pars->chroma_scaling_from_luma) for (int i = 0; i < num_pos_chroma; i++) pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128; if (pars->num_cr_points || pars->chroma_scaling_from_luma) for (int i = 0; i < num_pos_chroma; i++) pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128; pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value pars->grain_scale_shift = aom_rb_read_literal(rb, 2); if (pars->num_cb_points) { pars->cb_mult = aom_rb_read_literal(rb, 8); pars->cb_luma_mult = aom_rb_read_literal(rb, 8); pars->cb_offset = aom_rb_read_literal(rb, 9); } if (pars->num_cr_points) { pars->cr_mult = aom_rb_read_literal(rb, 8); pars->cr_luma_mult = aom_rb_read_literal(rb, 8); pars->cr_offset = aom_rb_read_literal(rb, 9); } pars->overlap_flag = aom_rb_read_bit(rb); pars->clip_to_restricted_range = aom_rb_read_bit(rb); } static inline void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { if (cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, sizeof(aom_film_grain_t)); } void av1_read_color_config(struct aom_read_bit_buffer *rb, int allow_lowbitdepth, SequenceHeader *seq_params, struct aom_internal_error_info *error_info) { read_bitdepth(rb, seq_params, error_info); seq_params->use_highbitdepth = seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; // monochrome bit (not needed for PROFILE_1) const int is_monochrome = seq_params->profile != PROFILE_1 ? 
aom_rb_read_bit(rb) : 0; seq_params->monochrome = is_monochrome; int color_description_present_flag = aom_rb_read_bit(rb); if (color_description_present_flag) { seq_params->color_primaries = aom_rb_read_literal(rb, 8); seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8); seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8); } else { seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED; seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; } if (is_monochrome) { // [16,235] (including xvycc) vs [0,255] range seq_params->color_range = aom_rb_read_bit(rb); seq_params->subsampling_y = seq_params->subsampling_x = 1; seq_params->chroma_sample_position = AOM_CSP_UNKNOWN; seq_params->separate_uv_delta_q = 0; return; } if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { seq_params->subsampling_y = seq_params->subsampling_x = 0; seq_params->color_range = 1; // assume full color-range if (!(seq_params->profile == PROFILE_1 || (seq_params->profile == PROFILE_2 && seq_params->bit_depth == AOM_BITS_12))) { aom_internal_error( error_info, AOM_CODEC_UNSUP_BITSTREAM, "sRGB colorspace not compatible with specified profile"); } } else { // [16,235] (including xvycc) vs [0,255] range seq_params->color_range = aom_rb_read_bit(rb); if (seq_params->profile == PROFILE_0) { // 420 only seq_params->subsampling_x = seq_params->subsampling_y = 1; } else if (seq_params->profile == PROFILE_1) { // 444 only seq_params->subsampling_x = seq_params->subsampling_y = 0; } else { assert(seq_params->profile == PROFILE_2); if (seq_params->bit_depth == AOM_BITS_12) { seq_params->subsampling_x = aom_rb_read_bit(rb); if (seq_params->subsampling_x) seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 else seq_params->subsampling_y = 0; // 444 } else { // 422 seq_params->subsampling_x = 1; seq_params->subsampling_y = 0; } } if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY && (seq_params->subsampling_x || seq_params->subsampling_y)) { aom_internal_error( error_info, AOM_CODEC_UNSUP_BITSTREAM, "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); } if (seq_params->subsampling_x && seq_params->subsampling_y) { seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2); } } seq_params->separate_uv_delta_q = aom_rb_read_bit(rb); } void av1_read_timing_info_header(aom_timing_info_t *timing_info, struct aom_internal_error_info *error, struct aom_read_bit_buffer *rb) { timing_info->num_units_in_display_tick = aom_rb_read_unsigned_literal(rb, 32); // Number of units in a display tick timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32); // Time scale if (timing_info->num_units_in_display_tick == 0 || timing_info->time_scale == 0) { aom_internal_error( error, AOM_CODEC_UNSUP_BITSTREAM, "num_units_in_display_tick and time_scale must be greater than 0."); } timing_info->equal_picture_interval = aom_rb_read_bit(rb); // Equal picture interval bit if (timing_info->equal_picture_interval) { const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); if (num_ticks_per_picture_minus_1 == UINT32_MAX) { aom_internal_error( error, AOM_CODEC_UNSUP_BITSTREAM, "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1."); } timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1; } } void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, struct 
aom_read_bit_buffer *rb) { decoder_model_info->encoder_decoder_buffer_delay_length = aom_rb_read_literal(rb, 5) + 1; decoder_model_info->num_units_in_decoding_tick = aom_rb_read_unsigned_literal(rb, 32); // Number of units in a decoding tick decoder_model_info->buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1; decoder_model_info->frame_presentation_time_length = aom_rb_read_literal(rb, 5) + 1; } void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, struct aom_read_bit_buffer *rb) { op_params->decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, buffer_delay_length); op_params->encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, buffer_delay_length); op_params->low_delay_mode_flag = aom_rb_read_bit(rb); } static inline void read_temporal_point_info(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { cm->frame_presentation_time = aom_rb_read_unsigned_literal( rb, cm->seq_params->decoder_model_info.frame_presentation_time_length); } void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, SequenceHeader *seq_params) { const int num_bits_width = aom_rb_read_literal(rb, 4) + 1; const int num_bits_height = aom_rb_read_literal(rb, 4) + 1; const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; seq_params->num_bits_width = num_bits_width; seq_params->num_bits_height = num_bits_height; seq_params->max_frame_width = max_frame_width; seq_params->max_frame_height = max_frame_height; if (seq_params->reduced_still_picture_hdr) { seq_params->frame_id_numbers_present_flag = 0; } else { seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); } if (seq_params->frame_id_numbers_present_flag) { // We must always have delta_frame_id_length < frame_id_length, // in order for a frame to be referenced with a unique delta. // Avoid wasting bits by using a coding that enforces this restriction. seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2; seq_params->frame_id_length = aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; if (seq_params->frame_id_length > 16) aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame_id_length"); } setup_sb_size(seq_params, rb); seq_params->enable_filter_intra = aom_rb_read_bit(rb); seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb); if (seq_params->reduced_still_picture_hdr) { seq_params->enable_interintra_compound = 0; seq_params->enable_masked_compound = 0; seq_params->enable_warped_motion = 0; seq_params->enable_dual_filter = 0; seq_params->order_hint_info.enable_order_hint = 0; seq_params->order_hint_info.enable_dist_wtd_comp = 0; seq_params->order_hint_info.enable_ref_frame_mvs = 0; seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV seq_params->order_hint_info.order_hint_bits_minus_1 = -1; } else { seq_params->enable_interintra_compound = aom_rb_read_bit(rb); seq_params->enable_masked_compound = aom_rb_read_bit(rb); seq_params->enable_warped_motion = aom_rb_read_bit(rb); seq_params->enable_dual_filter = aom_rb_read_bit(rb); seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb); seq_params->order_hint_info.enable_dist_wtd_comp = seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; seq_params->order_hint_info.enable_ref_frame_mvs = seq_params->order_hint_info.enable_order_hint ? 
aom_rb_read_bit(rb) : 0; if (aom_rb_read_bit(rb)) { seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS } else { seq_params->force_screen_content_tools = aom_rb_read_bit(rb); } if (seq_params->force_screen_content_tools > 0) { if (aom_rb_read_bit(rb)) { seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV } else { seq_params->force_integer_mv = aom_rb_read_bit(rb); } } else { seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV } seq_params->order_hint_info.order_hint_bits_minus_1 = seq_params->order_hint_info.enable_order_hint ? aom_rb_read_literal(rb, 3) : -1; } seq_params->enable_superres = aom_rb_read_bit(rb); seq_params->enable_cdef = aom_rb_read_bit(rb); seq_params->enable_restoration = aom_rb_read_bit(rb); } static int read_global_motion_params(WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_read_bit_buffer *rb, int allow_hp) { TransformationType type = aom_rb_read_bit(rb); if (type != IDENTITY) { if (aom_rb_read_bit(rb)) type = ROTZOOM; else type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE; } *params = default_warp_params; params->wmtype = type; if (type >= ROTZOOM) { params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)) * GM_ALPHA_DECODE_FACTOR + (1 << WARPEDMODEL_PREC_BITS); params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * GM_ALPHA_DECODE_FACTOR; } if (type >= AFFINE) { params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * GM_ALPHA_DECODE_FACTOR; params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)) * GM_ALPHA_DECODE_FACTOR + (1 << WARPEDMODEL_PREC_BITS); } else { params->wmmat[4] = -params->wmmat[3]; params->wmmat[5] = params->wmmat[2]; } if (type >= TRANSLATION) { const int trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp : GM_ABS_TRANS_BITS; const int trans_dec_factor = (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) : GM_TRANS_DECODE_FACTOR; const int trans_prec_diff = (type == TRANSLATION) ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp : GM_TRANS_PREC_DIFF; params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( rb, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_params->wmmat[0] >> trans_prec_diff)) * trans_dec_factor; params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( rb, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_params->wmmat[1] >> trans_prec_diff)) * trans_dec_factor; } int good_shear_params = av1_get_shear_params(params); if (!good_shear_params) return 0; return 1; } static inline void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const WarpedMotionParams *ref_params = cm->prev_frame ? &cm->prev_frame->global_motion[frame] : &default_warp_params; int good_params = read_global_motion_params(&cm->global_motion[frame], ref_params, rb, cm->features.allow_high_precision_mv); if (!good_params) { #if WARPED_MOTION_DEBUG printf("Warning: unexpected global motion shear params from aomenc\n"); #endif cm->global_motion[frame].invalid = 1; } // TODO(sarahparker, debargha): The logic in the commented out code below // does not work currently and causes mismatches when resize is on. 
Fix it // before turning the optimization back on. /* YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame); if (cm->width == ref_buf->y_crop_width && cm->height == ref_buf->y_crop_height) { read_global_motion_params(&cm->global_motion[frame], &cm->prev_frame->global_motion[frame], rb, cm->features.allow_high_precision_mv); } else { cm->global_motion[frame] = default_warp_params; } */ /* printf("Dec Ref %d [%d/%d]: %d %d %d %d\n", frame, cm->current_frame.frame_number, cm->show_frame, cm->global_motion[frame].wmmat[0], cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], cm->global_motion[frame].wmmat[3]); */ } memcpy(cm->cur_frame->global_motion, cm->global_motion, REF_FRAMES * sizeof(WarpedMotionParams)); } // Release the references to the frame buffers in cm->ref_frame_map and reset // all elements of cm->ref_frame_map to NULL. static inline void reset_ref_frame_map(AV1_COMMON *const cm) { BufferPool *const pool = cm->buffer_pool; for (int i = 0; i < REF_FRAMES; i++) { decrease_ref_count(cm->ref_frame_map[i], pool); cm->ref_frame_map[i] = NULL; } } // If the refresh_frame_flags bitmask is set, update reference frame id values // and mark frames as valid for reference. static inline void update_ref_frame_id(AV1Decoder *const pbi) { AV1_COMMON *const cm = &pbi->common; int refresh_frame_flags = cm->current_frame.refresh_frame_flags; for (int i = 0; i < REF_FRAMES; i++) { if ((refresh_frame_flags >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; pbi->valid_for_referencing[i] = 1; } } } static inline void show_existing_frame_reset(AV1Decoder *const pbi, int existing_frame_idx) { AV1_COMMON *const cm = &pbi->common; assert(cm->show_existing_frame); cm->current_frame.frame_type = KEY_FRAME; cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { cm->remapped_ref_idx[i] = INVALID_IDX; } if (pbi->need_resync) { reset_ref_frame_map(cm); pbi->need_resync = 0; } // Note that the displayed frame must be valid for referencing in order to // have been selected. cm->current_frame_id = cm->ref_frame_id[existing_frame_idx]; update_ref_frame_id(pbi); cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; } static inline void reset_frame_buffers(AV1_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; lock_buffer_pool(cm->buffer_pool); reset_ref_frame_map(cm); assert(cm->cur_frame->ref_count == 1); for (i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) { // Reset all unreferenced frame buffers. We can also reset cm->cur_frame // because we are the sole owner of cm->cur_frame. if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) { continue; } frame_bufs[i].order_hint = 0; av1_zero(frame_bufs[i].ref_order_hints); } av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); unlock_buffer_pool(cm->buffer_pool); } // On success, returns 0. On failure, calls aom_internal_error and does not // return. 
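// This routine implements the uncompressed_header() syntax from the AV1
// specification: show_existing_frame handling, frame type and refresh flags,
// frame size and reference selection, tile info, quantization, segmentation,
// loop filter, CDEF, loop restoration, global motion and film grain
// parameters are all parsed here.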
static int read_uncompressed_header(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; const SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; MACROBLOCKD *const xd = &pbi->dcb.xd; BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = pool->frame_bufs; aom_s_frame_info *sframe_info = &pbi->sframe_info; sframe_info->is_s_frame = 0; sframe_info->is_s_frame_at_altref = 0; if (!pbi->sequence_header_ready) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "No sequence header"); } if (seq_params->reduced_still_picture_hdr) { cm->show_existing_frame = 0; cm->show_frame = 1; current_frame->frame_type = KEY_FRAME; if (pbi->sequence_header_changed) { // This is the start of a new coded video sequence. pbi->sequence_header_changed = 0; pbi->decoding_first_frame = 1; reset_frame_buffers(cm); } features->error_resilient_mode = 1; } else { cm->show_existing_frame = aom_rb_read_bit(rb); pbi->reset_decoder_state = 0; if (cm->show_existing_frame) { if (pbi->sequence_header_changed) { aom_internal_error( &pbi->error, AOM_CODEC_CORRUPT_FRAME, "New sequence header starts with a show_existing_frame."); } // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; if (frame_to_show == NULL) { aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a decoded frame"); } if (seq_params->decoder_model_info_present_flag && seq_params->timing_info.equal_picture_interval == 0) { read_temporal_point_info(cm, rb); } if (seq_params->frame_id_numbers_present_flag) { int frame_id_length = seq_params->frame_id_length; int display_frame_id = aom_rb_read_literal(rb, frame_id_length); /* Compare display_frame_id with ref_frame_id and check valid for * referencing */ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || pbi->valid_for_referencing[existing_frame_idx] == 0) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } lock_buffer_pool(pool); assert(frame_to_show->ref_count > 0); // cm->cur_frame should be the buffer referenced by the return value // of the get_free_fb() call in assign_cur_frame_new_fb() (called by // av1_receive_compressed_data()), so the ref_count should be 1. assert(cm->cur_frame->ref_count == 1); // assign_frame_buffer_p() decrements ref_count directly rather than // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has // already been allocated, it will not be released by // assign_frame_buffer_p()! assert(!cm->cur_frame->raw_frame_buffer.data); assign_frame_buffer_p(&cm->cur_frame, frame_to_show); pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME; unlock_buffer_pool(pool); cm->lf.filter_level[0] = 0; cm->lf.filter_level[1] = 0; cm->show_frame = 1; current_frame->order_hint = frame_to_show->order_hint; // Section 6.8.2: It is a requirement of bitstream conformance that when // show_existing_frame is used to show a previous frame, that the value // of showable_frame for the previous frame was equal to 1. 
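// The check below enforces that requirement: a frame can only be output via
// show_existing_frame if it was marked showable when it was originally
// decoded.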
if (!frame_to_show->showable_frame) { aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a showable frame"); } // Section 6.8.2: It is a requirement of bitstream conformance that when // show_existing_frame is used to show a previous frame with // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the // frame is output via the show_existing_frame mechanism at most once. if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0; cm->film_grain_params = frame_to_show->film_grain_params; if (pbi->reset_decoder_state) { show_existing_frame_reset(pbi, existing_frame_idx); } else { current_frame->refresh_frame_flags = 0; } return 0; } current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); if (pbi->sequence_header_changed) { if (current_frame->frame_type == KEY_FRAME) { // This is the start of a new coded video sequence. pbi->sequence_header_changed = 0; pbi->decoding_first_frame = 1; reset_frame_buffers(cm); } else { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Sequence header has changed without a keyframe."); } } cm->show_frame = aom_rb_read_bit(rb); if (cm->show_frame == 0) pbi->is_arf_frame_present = 1; if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME) pbi->is_fwd_kf_present = 1; if (cm->current_frame.frame_type == S_FRAME) { sframe_info->is_s_frame = 1; sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1; } if (seq_params->still_picture && (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Still pictures must be coded as shown keyframes"); } cm->showable_frame = current_frame->frame_type != KEY_FRAME; if (cm->show_frame) { if (seq_params->decoder_model_info_present_flag && seq_params->timing_info.equal_picture_interval == 0) read_temporal_point_info(cm, rb); } else { // See if this frame can be used as show_existing_frame in future cm->showable_frame = aom_rb_read_bit(rb); } cm->cur_frame->showable_frame = cm->showable_frame; features->error_resilient_mode = frame_is_sframe(cm) || (current_frame->frame_type == KEY_FRAME && cm->show_frame) ? 
1 : aom_rb_read_bit(rb); } if (current_frame->frame_type == KEY_FRAME && cm->show_frame) { /* All frames need to be marked as not valid for referencing */ for (int i = 0; i < REF_FRAMES; i++) { pbi->valid_for_referencing[i] = 0; } } features->disable_cdf_update = aom_rb_read_bit(rb); if (seq_params->force_screen_content_tools == 2) { features->allow_screen_content_tools = aom_rb_read_bit(rb); } else { features->allow_screen_content_tools = seq_params->force_screen_content_tools; } if (features->allow_screen_content_tools) { if (seq_params->force_integer_mv == 2) { features->cur_frame_force_integer_mv = aom_rb_read_bit(rb); } else { features->cur_frame_force_integer_mv = seq_params->force_integer_mv; } } else { features->cur_frame_force_integer_mv = 0; } int frame_size_override_flag = 0; features->allow_intrabc = 0; features->primary_ref_frame = PRIMARY_REF_NONE; if (!seq_params->reduced_still_picture_hdr) { if (seq_params->frame_id_numbers_present_flag) { int frame_id_length = seq_params->frame_id_length; int diff_len = seq_params->delta_frame_id_length; int prev_frame_id = 0; int have_prev_frame_id = !pbi->decoding_first_frame && !(current_frame->frame_type == KEY_FRAME && cm->show_frame); if (have_prev_frame_id) { prev_frame_id = cm->current_frame_id; } cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); if (have_prev_frame_id) { int diff_frame_id; if (cm->current_frame_id > prev_frame_id) { diff_frame_id = cm->current_frame_id - prev_frame_id; } else { diff_frame_id = (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; } /* Check current_frame_id for conformance */ if (prev_frame_id == cm->current_frame_id || diff_frame_id >= (1 << (frame_id_length - 1))) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid value of current_frame_id"); } } /* Check if some frames need to be marked as not valid for referencing */ for (int i = 0; i < REF_FRAMES; i++) { if (cm->current_frame_id - (1 << diff_len) > 0) { if (cm->ref_frame_id[i] > cm->current_frame_id || cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) pbi->valid_for_referencing[i] = 0; } else { if (cm->ref_frame_id[i] > cm->current_frame_id && cm->ref_frame_id[i] < (1 << frame_id_length) + cm->current_frame_id - (1 << diff_len)) pbi->valid_for_referencing[i] = 0; } } } frame_size_override_flag = frame_is_sframe(cm) ? 
1 : aom_rb_read_bit(rb); current_frame->order_hint = aom_rb_read_literal( rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); if (seq_params->order_hint_info.enable_order_hint) current_frame->frame_number = current_frame->order_hint; if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS); } } if (seq_params->decoder_model_info_present_flag) { pbi->buffer_removal_time_present = aom_rb_read_bit(rb); if (pbi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { if (seq_params->operating_point_idc[op_num] == 0 || (((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1) && ((seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & 0x1))) { cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( rb, seq_params->decoder_model_info.buffer_removal_time_length); } else { cm->buffer_removal_times[op_num] = 0; } } else { cm->buffer_removal_times[op_num] = 0; } } } } if (current_frame->frame_type == KEY_FRAME) { if (!cm->show_frame) { // unshown keyframe (forward keyframe) current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); } else { // shown keyframe current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1; } for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { cm->remapped_ref_idx[i] = INVALID_IDX; } if (pbi->need_resync) { reset_ref_frame_map(cm); pbi->need_resync = 0; } } else { if (current_frame->frame_type == INTRA_ONLY_FRAME) { current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); if (current_frame->refresh_frame_flags == 0xFF) { aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Intra only frames cannot have refresh flags 0xFF"); } if (pbi->need_resync) { reset_ref_frame_map(cm); pbi->need_resync = 0; } } else if (pbi->need_resync != 1) { /* Skip if need resync */ current_frame->refresh_frame_flags = frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES); } } if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) { // Read all ref frame order hints if error_resilient_mode == 1 if (features->error_resilient_mode && seq_params->order_hint_info.enable_order_hint) { for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { // Read order hint from bit stream unsigned int order_hint = aom_rb_read_literal( rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); // Get buffer RefCntBuffer *buf = cm->ref_frame_map[ref_idx]; if (buf == NULL || order_hint != buf->order_hint) { if (buf != NULL) { lock_buffer_pool(pool); decrease_ref_count(buf, pool); unlock_buffer_pool(pool); cm->ref_frame_map[ref_idx] = NULL; } // If no corresponding buffer exists, allocate a new buffer with all // pixels set to neutral grey. 
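// The replacement buffer is allocated at the sequence-level maximum frame
// dimensions (seq_params->max_frame_width x max_frame_height) so that any
// later frame referencing this slot has a fully sized buffer to predict
// from.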
int buf_idx = get_free_fb(cm); if (buf_idx == INVALID_IDX) { aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } buf = &frame_bufs[buf_idx]; lock_buffer_pool(pool); #if CONFIG_SIZE_LIMIT if (seq_params->max_frame_width > DECODE_WIDTH_LIMIT || seq_params->max_frame_height > DECODE_HEIGHT_LIMIT) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool); aom_internal_error( cm->error, AOM_CODEC_CORRUPT_FRAME, "Dimensions of %dx%d beyond allowed size of %dx%d.", seq_params->max_frame_width, seq_params->max_frame_height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); } #endif if (aom_realloc_frame_buffer( &buf->buf, seq_params->max_frame_width, seq_params->max_frame_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, features->byte_alignment, &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false, 0)) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool); aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); // According to the specification, valid bitstreams are required to // never use missing reference frames so the filling process for // missing frames is not normatively defined and RefValid for missing // frames is set to 0. // To make libaom more robust when the bitstream has been corrupted // by the loss of some frames of data, this code adds a neutral grey // buffer in place of missing frames, i.e. // set_planes_to_neutral_grey(seq_params, &buf->buf, 0); // // and allows the frames to be used for referencing, i.e. // pbi->valid_for_referencing[ref_idx] = 1; // // Please note such behavior is not normative and other decoders may // use a different approach. cm->ref_frame_map[ref_idx] = buf; buf->order_hint = order_hint; } } } } if (current_frame->frame_type == KEY_FRAME) { setup_frame_size(cm, frame_size_override_flag, rb); if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) features->allow_intrabc = aom_rb_read_bit(rb); features->allow_ref_frame_mvs = 0; cm->prev_frame = NULL; } else { features->allow_ref_frame_mvs = 0; if (current_frame->frame_type == INTRA_ONLY_FRAME) { cm->cur_frame->film_grain_params_present = seq_params->film_grain_params_present; setup_frame_size(cm, frame_size_override_flag, rb); if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) features->allow_intrabc = aom_rb_read_bit(rb); } else if (pbi->need_resync != 1) { /* Skip if need resync */ int frame_refs_short_signaling = 0; // Frame refs short signaling is off when error resilient mode is on. if (seq_params->order_hint_info.enable_order_hint) frame_refs_short_signaling = aom_rb_read_bit(rb); if (frame_refs_short_signaling) { // == LAST_FRAME == const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref]; // == GOLDEN_FRAME == const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref]; // Most of the time, streams start with a keyframe. In that case, // ref_frame_map will have been filled in at that point and will not // contain any NULLs. However, streams are explicitly allowed to start // with an intra-only frame, so long as they don't then signal a // reference to a slot that hasn't been set yet. That's what we are // checking here. 
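// The NULL checks below reject a frame_refs_short_signaling request whose
// LAST_FRAME or GOLDEN_FRAME slot has never been populated; the remaining
// references are then derived from order hints in av1_set_frame_refs().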
if (lst_buf == NULL) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); if (gld_buf == NULL) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); } for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { int ref = 0; if (!frame_refs_short_signaling) { ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); // Most of the time, streams start with a keyframe. In that case, // ref_frame_map will have been filled in at that point and will not // contain any NULLs. However, streams are explicitly allowed to start // with an intra-only frame, so long as they don't then signal a // reference to a slot that hasn't been set yet. That's what we are // checking here. if (cm->ref_frame_map[ref] == NULL) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); cm->remapped_ref_idx[i] = ref; } else { ref = cm->remapped_ref_idx[i]; } // Check valid for referencing if (pbi->valid_for_referencing[ref] == 0) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame not valid for referencing"); cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; if (seq_params->frame_id_numbers_present_flag) { int frame_id_length = seq_params->frame_id_length; int diff_len = seq_params->delta_frame_id_length; int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); int ref_frame_id = ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + (1 << frame_id_length)) % (1 << frame_id_length)); // Compare values derived from delta_frame_id_minus_1 and // refresh_frame_flags. if (ref_frame_id != cm->ref_frame_id[ref]) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } } if (!features->error_resilient_mode && frame_size_override_flag) { setup_frame_size_with_refs(cm, rb); } else { setup_frame_size(cm, frame_size_override_flag, rb); } if (features->cur_frame_force_integer_mv) { features->allow_high_precision_mv = 0; } else { features->allow_high_precision_mv = aom_rb_read_bit(rb); } features->interp_filter = read_frame_interp_filter(rb); features->switchable_motion_mode = aom_rb_read_bit(rb); } cm->prev_frame = get_primary_ref_frame_buf(cm); if (features->primary_ref_frame != PRIMARY_REF_NONE && get_primary_ref_frame_buf(cm) == NULL) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame containing this frame's initial " "frame context is unavailable."); } if (!(current_frame->frame_type == INTRA_ONLY_FRAME) && pbi->need_resync != 1) { if (frame_might_allow_ref_frame_mvs(cm)) features->allow_ref_frame_mvs = aom_rb_read_bit(rb); else features->allow_ref_frame_mvs = 0; for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); struct scale_factors *const ref_scale_factors = get_ref_scale_factors(cm, i); av1_setup_scale_factors_for_frame( ref_scale_factors, ref_buf->buf.y_crop_width, ref_buf->buf.y_crop_height, cm->width, cm->height); if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); } } } av1_setup_frame_buf_refs(cm); av1_setup_frame_sign_bias(cm); cm->cur_frame->frame_type = current_frame->frame_type; update_ref_frame_id(pbi); const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && !(features->disable_cdf_update); if (might_bwd_adapt) { features->refresh_frame_context = aom_rb_read_bit(rb) ? 
REFRESH_FRAME_CONTEXT_DISABLED : REFRESH_FRAME_CONTEXT_BACKWARD; } else { features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; } cm->cur_frame->buf.bit_depth = seq_params->bit_depth; cm->cur_frame->buf.color_primaries = seq_params->color_primaries; cm->cur_frame->buf.transfer_characteristics = seq_params->transfer_characteristics; cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; cm->cur_frame->buf.monochrome = seq_params->monochrome; cm->cur_frame->buf.chroma_sample_position = seq_params->chroma_sample_position; cm->cur_frame->buf.color_range = seq_params->color_range; cm->cur_frame->buf.render_width = cm->render_width; cm->cur_frame->buf.render_height = cm->render_height; if (pbi->need_resync) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Keyframe / intra-only frame required to reset decoder" " state"); } if (features->allow_intrabc) { // Set parameters corresponding to no filtering. struct loopfilter *lf = &cm->lf; lf->filter_level[0] = 0; lf->filter_level[1] = 0; cm->cdef_info.cdef_bits = 0; cm->cdef_info.cdef_strengths[0] = 0; cm->cdef_info.nb_cdef_strengths = 1; cm->cdef_info.cdef_uv_strengths[0] = 0; cm->rst_info[0].frame_restoration_type = RESTORE_NONE; cm->rst_info[1].frame_restoration_type = RESTORE_NONE; cm->rst_info[2].frame_restoration_type = RESTORE_NONE; } read_tile_info(pbi, rb); if (!av1_is_min_tile_width_satisfied(cm)) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Minimum tile width requirement not satisfied"); } CommonQuantParams *const quant_params = &cm->quant_params; setup_quantization(quant_params, av1_num_planes(cm), cm->seq_params->separate_uv_delta_q, rb); xd->bd = (int)seq_params->bit_depth; CommonContexts *const above_contexts = &cm->above_contexts; if (above_contexts->num_planes < av1_num_planes(cm) || above_contexts->num_mi_cols < cm->mi_params.mi_cols || above_contexts->num_tile_rows < cm->tiles.rows) { av1_free_above_context_buffers(above_contexts); if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) { aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } if (features->primary_ref_frame == PRIMARY_REF_NONE) { av1_setup_past_independence(cm); } setup_segmentation(cm, rb); cm->delta_q_info.delta_q_res = 1; cm->delta_q_info.delta_lf_res = 1; cm->delta_q_info.delta_lf_present_flag = 0; cm->delta_q_info.delta_lf_multi = 0; cm->delta_q_info.delta_q_present_flag = quant_params->base_qindex > 0 ? 
aom_rb_read_bit(rb) : 0; if (cm->delta_q_info.delta_q_present_flag) { xd->current_base_qindex = quant_params->base_qindex; cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2); if (!features->allow_intrabc) cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb); if (cm->delta_q_info.delta_lf_present_flag) { cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2); cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb); av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv; for (int i = 0; i < MAX_SEGMENTS; ++i) { const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex); xd->lossless[i] = qindex == 0 && quant_params->y_dc_delta_q == 0 && quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; xd->qindex[i] = qindex; } features->coded_lossless = is_coded_lossless(cm, xd); features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); setup_segmentation_dequant(cm, xd); if (features->coded_lossless) { cm->lf.filter_level[0] = 0; cm->lf.filter_level[1] = 0; } if (features->coded_lossless || !seq_params->enable_cdef) { cm->cdef_info.cdef_bits = 0; cm->cdef_info.cdef_strengths[0] = 0; cm->cdef_info.cdef_uv_strengths[0] = 0; } if (features->all_lossless || !seq_params->enable_restoration) { cm->rst_info[0].frame_restoration_type = RESTORE_NONE; cm->rst_info[1].frame_restoration_type = RESTORE_NONE; cm->rst_info[2].frame_restoration_type = RESTORE_NONE; } setup_loopfilter(cm, rb); if (!features->coded_lossless && seq_params->enable_cdef) { setup_cdef(cm, rb); } if (!features->all_lossless && seq_params->enable_restoration) { decode_restoration_mode(cm, rb); } features->tx_mode = read_tx_mode(rb, features->coded_lossless); current_frame->reference_mode = read_frame_reference_mode(cm, rb); av1_setup_skip_mode_allowed(cm); current_frame->skip_mode_info.skip_mode_flag = current_frame->skip_mode_info.skip_mode_allowed ? 
aom_rb_read_bit(rb) : 0; if (frame_might_allow_warped_motion(cm)) features->allow_warped_motion = aom_rb_read_bit(rb); else features->allow_warped_motion = 0; features->reduced_tx_set_used = aom_rb_read_bit(rb); if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Frame wrongly requests reference frame MVs"); } if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); cm->cur_frame->film_grain_params_present = seq_params->film_grain_params_present; read_film_grain(cm, rb); #if EXT_TILE_DEBUG if (pbi->ext_tile_debug && cm->tiles.large_scale) { read_ext_tile_info(pbi, rb); av1_set_single_tile_decoding_mode(cm); } #endif // EXT_TILE_DEBUG return 0; } struct aom_read_bit_buffer *av1_init_read_bit_buffer( AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end) { rb->bit_offset = 0; rb->error_handler = error_handler; rb->error_handler_data = &pbi->common; rb->bit_buffer = data; rb->bit_buffer_end = data_end; return rb; } BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) { int profile = aom_rb_read_literal(rb, PROFILE_BITS); return (BITSTREAM_PROFILE)profile; } static inline void superres_post_decode(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; if (!av1_superres_scaled(cm)) return; assert(!cm->features.all_lossless); av1_superres_upscale(cm, pool, 0); } uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int trailing_bits_present) { AV1_COMMON *const cm = &pbi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &pbi->dcb.xd; #if CONFIG_BITSTREAM_DEBUG if (cm->seq_params->order_hint_info.enable_order_hint) { aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 + cm->show_frame); } else { // This is currently used in RTC encoding. cm->show_frame is always 1. assert(cm->show_frame); aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number); } #endif #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_r(); #endif for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { cm->global_motion[i] = default_warp_params; cm->cur_frame->global_motion[i] = default_warp_params; } xd->global_motion = cm->global_motion; read_uncompressed_header(pbi, rb); if (trailing_bits_present) av1_check_trailing_bits(pbi, rb); if (!cm->tiles.single_tile_decoding && (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) { pbi->dec_tile_row = -1; pbi->dec_tile_col = -1; } const uint32_t uncomp_hdr_size = (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf; xd->cur_buf = new_fb; if (av1_allow_intrabc(cm)) { av1_setup_scale_factors_for_frame( &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); } // Showing a frame directly. if (cm->show_existing_frame) { if (pbi->reset_decoder_state) { // Use the default frame context values. 
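// Showing a previously decoded key frame acts as a decoder reset
// (pbi->reset_decoder_state was set above in read_uncompressed_header), so
// the entropy context is restored to the default CDFs instead of being
// carried over from the last decoded frame.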
*cm->fc = *cm->default_frame_context; if (!cm->fc->initialized) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); } return uncomp_hdr_size; } cm->mi_params.setup_mi(&cm->mi_params); av1_calculate_ref_frame_side(cm); if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm); av1_setup_block_planes(xd, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, num_planes); if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { // use the default frame context values *cm->fc = *cm->default_frame_context; } else { *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; } if (!cm->fc->initialized) aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); pbi->dcb.corrupted = 0; return uncomp_hdr_size; } // Once-per-frame initialization static inline void setup_frame_info(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || cm->rst_info[1].frame_restoration_type != RESTORE_NONE || cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true); for (int p = 0; p < av1_num_planes(cm); p++) { av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0); } } const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { av1_free_mc_tmp_buf(&pbi->td); allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); } } void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, int start_tile, int end_tile, int initialize_flag) { AV1_COMMON *const cm = &pbi->common; CommonTileParams *const tiles = &cm->tiles; MACROBLOCKD *const xd = &pbi->dcb.xd; const int tile_count_tg = end_tile - start_tile + 1; xd->error_info = cm->error; if (initialize_flag) setup_frame_info(pbi); const int num_planes = av1_num_planes(cm); if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) && pbi->row_mt) *p_data_end = decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile); else if (pbi->max_threads > 1 && tile_count_tg > 1 && !(tiles->large_scale && !pbi->ext_tile_debug)) *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); else *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); // If the bit stream is monochrome, set the U and V buffers to a constant. 
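// For monochrome streams num_planes is 1, but the output frame buffer still
// carries chroma planes; they are filled with a constant mid-grey value
// (128 for 8-bit content) so that downstream consumers see well-defined
// chroma data.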
if (num_planes < 3) { set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1); } if (end_tile != tiles->rows * tiles->cols - 1) { return; } av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync, pbi->num_workers, 1); av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers); if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0, num_planes, 0, pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync, 0); } const int do_cdef = !pbi->skip_loop_filter && !cm->features.coded_lossless && (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] || cm->cdef_info.cdef_uv_strengths[0]); const int do_superres = av1_superres_scaled(cm); const int optimized_loop_restoration = !do_cdef && !do_superres; const int do_loop_restoration = cm->rst_info[0].frame_restoration_type != RESTORE_NONE || cm->rst_info[1].frame_restoration_type != RESTORE_NONE || cm->rst_info[2].frame_restoration_type != RESTORE_NONE; // Frame border extension is not required in the decoder // as it happens in extend_mc_border(). int do_extend_border_mt = 0; if (!optimized_loop_restoration) { if (do_loop_restoration) av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, cm, 0); if (do_cdef) { if (pbi->num_workers > 1) { av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, pbi->tile_workers, &pbi->cdef_sync, pbi->num_workers, av1_cdef_init_fb_row_mt, do_extend_border_mt); } else { av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, av1_cdef_init_fb_row); } } superres_post_decode(pbi); if (do_loop_restoration) { av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, cm, 1); if (pbi->num_workers > 1) { av1_loop_restoration_filter_frame_mt( (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, &pbi->lr_ctxt, do_extend_border_mt); } else { av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, &pbi->lr_ctxt); } } } else { // In no cdef and no superres case. Provide an optimized version of // loop_restoration_filter. if (do_loop_restoration) { if (pbi->num_workers > 1) { av1_loop_restoration_filter_frame_mt( (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, &pbi->lr_ctxt, do_extend_border_mt); } else { av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, &pbi->lr_ctxt); } } } } if (!pbi->dcb.corrupted) { if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { assert(pbi->context_update_tile_id < pbi->allocated_tiles); *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); } } else { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Decode failed. Frame data is corrupted."); } #if CONFIG_INSPECTION if (pbi->inspect_cb != NULL) { (*pbi->inspect_cb)(pbi, pbi->inspect_ctx); } #endif // Non frame parallel update frame context here. if (!tiles->large_scale) { cm->cur_frame->frame_context = *cm->fc; } if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) { ++cm->current_frame.frame_number; } } aom-3.12.1/av1/decoder/decodeframe.h000066400000000000000000000067521477627663500171160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_DECODEFRAME_H_ #define AOM_AV1_DECODER_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { #endif struct AV1Decoder; struct aom_read_bit_buffer; struct ThreadData; // Reads the middle part of the sequence header OBU (from // frame_width_bits_minus_1 to enable_restoration) into seq_params. // Reports errors by calling rb->error_handler() or aom_internal_error(). void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, SequenceHeader *seq_params); BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); // Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on // failure. int av1_check_trailing_bits(struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb); // On success, returns the frame header size. On failure, calls // aom_internal_error and does not return. uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int trailing_bits_present); void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, int start_tile, int end_tile, int initialize_flag); // Implements the color_config() function in the spec. Reports errors by // calling rb->error_handler() or aom_internal_error(). void av1_read_color_config(struct aom_read_bit_buffer *rb, int allow_lowbitdepth, SequenceHeader *seq_params, struct aom_internal_error_info *error_info); // Implements the timing_info() function in the spec. Reports errors by calling // rb->error_handler() or aom_internal_error(). void av1_read_timing_info_header(aom_timing_info_t *timing_info, struct aom_internal_error_info *error, struct aom_read_bit_buffer *rb); // Implements the decoder_model_info() function in the spec. Reports errors by // calling rb->error_handler(). void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, struct aom_read_bit_buffer *rb); // Implements the operating_parameters_info() function in the spec. Reports // errors by calling rb->error_handler(). void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, struct aom_read_bit_buffer *rb); struct aom_read_bit_buffer *av1_init_read_bit_buffer( struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end); void av1_free_mc_tmp_buf(struct ThreadData *thread_data); void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_DECODER_DECODEFRAME_H_ aom-3.12.1/av1/decoder/decodemv.c000066400000000000000000001665661477627663500164530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/cfl.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/mvref_common.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/seg_common.h" #include "av1/common/warped_motion.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" #include "aom_dsp/aom_dsp_common.h" #define ACCT_STR __func__ #define DEC_MISMATCH_DEBUG 0 static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) { return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR); } static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { const int skip_txfm = xd->mi[0]->skip_txfm; if (cm->features.coded_lossless) return; if (cm->features.allow_intrabc) { assert(cm->cdef_info.cdef_bits == 0); return; } // At the start of a superblock, mark that we haven't yet read CDEF strengths // for any of the CDEF units contained in this superblock. const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; } // CDEF unit size is 64x64 irrespective of the superblock size. const int cdef_size = 1 << (6 - MI_SIZE_LOG2); // Find index of this CDEF unit in this superblock. const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; // Read CDEF strength from the first non-skip coding block in this CDEF unit. if (!xd->cdef_transmitted[index] && !skip_txfm) { // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO // of the 1st block in this CDEF unit. const int first_block_mask = ~(cdef_size - 1); CommonModeInfoParams *const mi_params = &cm->mi_params; const int grid_idx = get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, xd->mi_col & first_block_mask); MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; mbmi->cdef_strength = aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR); xd->cdef_transmitted[index] = true; } } static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, aom_reader *r, MB_MODE_INFO *const mbmi) { int sign, abs, reduced_delta_qindex = 0; BLOCK_SIZE bsize = mbmi->bsize; const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1); const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1); const int read_delta_q_flag = (b_col == 0 && b_row == 0); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_q_flag) { abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_Q_SMALL); if (!smallval) { const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; const int thr = (1 << rem_bits) + 1; abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; } if (abs) { sign = aom_read_bit(r, ACCT_STR); } else { sign = 1; } reduced_delta_qindex = sign ? 
-abs : abs; } return reduced_delta_qindex; } static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, aom_cdf_prob *const cdf, const MB_MODE_INFO *const mbmi, int mi_col, int mi_row) { int reduced_delta_lflevel = 0; const BLOCK_SIZE bsize = mbmi->bsize; const int b_col = mi_col & (cm->seq_params->mib_size - 1); const int b_row = mi_row & (cm->seq_params->mib_size - 1); const int read_delta_lf_flag = (b_col == 0 && b_row == 0); if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_lf_flag) { int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_LF_SMALL); if (!smallval) { const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; const int thr = (1 << rem_bits) + 1; abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; } const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1; reduced_delta_lflevel = sign ? -abs : abs; } return reduced_delta_lflevel; } static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx, aom_reader *r, CFL_ALLOWED_TYPE cfl_allowed, PREDICTION_MODE y_mode) { const UV_PREDICTION_MODE uv_mode = aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode], UV_INTRA_MODES - !cfl_allowed, ACCT_STR); return uv_mode; } static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, int8_t *signs_out) { const int8_t joint_sign = aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs"); uint8_t idx = 0; // Magnitudes are only coded for nonzero values if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u") << CFL_ALPHABET_SIZE_LOG2; } if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v"); } *signs_out = joint_sign; return idx; } static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r, int size_group) { const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol( r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES, ACCT_STR); return ii_mode; } static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r, int16_t ctx) { int16_t mode_ctx = ctx & NEWMV_CTX_MASK; int is_newmv, is_zeromv, is_refmv; is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0; if (is_newmv) return NEWMV; mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; is_zeromv = aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0; if (is_zeromv) return GLOBALMV; mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0; if (is_refmv) return NEARESTMV; else return NEARMV; } static void read_drl_idx(FRAME_CONTEXT *ec_ctx, DecoderCodingBlock *dcb, MB_MODE_INFO *mbmi, aom_reader *r) { MACROBLOCKD *const xd = &dcb->xd; uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); mbmi->ref_mv_idx = 0; if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { for (int idx = 0; idx < 2; ++idx) { if (dcb->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); mbmi->ref_mv_idx = idx + drl_idx; if (!drl_idx) return; } } } if (have_nearmv_in_inter_mode(mbmi->mode)) { // Offset the NEARESTMV mode. 
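/* Illustrative sketch (not part of the upstream file; excluded from the build
 * with #if 0): the DRL (dynamic reference list) index is coded as at most two
 * flag symbols.  Each decoded 1 moves one entry further down the candidate
 * stack and a 0 stops.  For the NEAR* modes handled just below, the loop
 * starts at stack position 1 because position 0 is already NEARESTMV, which
 * is why the stored index is idx + drl_idx - 1 and the consumer later adds 1
 * back when indexing ref_mv_stack[].  The toy_* names are hypothetical. */
#if 0
static int toy_read_drl_idx(const int *flags, int num_flags,
                            int ref_mv_count) {
  /* Mirrors the NEWMV-style loop above; flags[] stands in for the decoded
   * drl symbols, of which at most two are ever read. */
  int ref_mv_idx = 0;
  for (int idx = 0; idx < 2 && idx < num_flags; ++idx) {
    if (ref_mv_count <= idx + 1) break;
    const int drl_idx = flags[idx];
    ref_mv_idx = idx + drl_idx;
    if (!drl_idx) break;
  }
  return ref_mv_idx; /* flags {1, 1} with 3+ candidates -> 2 */
}
#endif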
// TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV // mode is factored in. for (int idx = 1; idx < 3; ++idx) { if (dcb->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); mbmi->ref_mv_idx = idx + drl_idx - 1; if (!drl_idx) return; } } } } static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, aom_reader *r) { if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION; if (mbmi->skip_mode) return SIMPLE_TRANSLATION; const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( xd->global_motion, xd, mbmi, cm->features.allow_warped_motion); int motion_mode; if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION; if (last_motion_mode_allowed == OBMC_CAUSAL) { motion_mode = aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR); return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); } else { motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize], MOTION_MODES, ACCT_STR); return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); } } static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r, int16_t ctx) { const int mode = aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx], INTER_COMPOUND_MODES, ACCT_STR); assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode)); return NEAREST_NEARESTMV + mode; } int av1_neg_deinterleave(int diff, int ref, int max) { if (!ref) return diff; if (ref >= (max - 1)) return max - diff - 1; if (2 * ref < max) { if (diff <= 2 * ref) { if (diff & 1) return ref + ((diff + 1) >> 1); else return ref - (diff >> 1); } return diff; } else { if (diff <= 2 * (max - ref - 1)) { if (diff & 1) return ref + ((diff + 1) >> 1); else return ref - (diff >> 1); } return max - (diff + 1); } } static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, aom_reader *r, int skip) { int cdf_num; const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0); if (skip) return pred; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR); const int segment_id = av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); if (segment_id < 0 || segment_id > seg->last_active_segid) { aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Corrupted segment_ids"); } return segment_id; } static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids, int mi_offset, int x_mis, int y_mis) { int segment_id = INT_MAX; for (int y = 0; y < y_mis; y++) for (int x = 0; x < x_mis; x++) segment_id = AOMMIN( segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]); assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; } static int read_intra_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, BLOCK_SIZE bsize, aom_reader *r, int skip) { struct segmentation *const seg = &cm->seg; if (!seg->enabled) return 0; // Default for disabled segmentation assert(seg->update_map && !seg->temporal_update); const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int mi_stride = cm->mi_params.mi_cols; const int mi_offset = mi_row * mi_stride + mi_col; const int bw = 
mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); const int segment_id = read_segment_id(cm, xd, r, skip); set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride, segment_id); return segment_id; } static void copy_segment_id(const CommonModeInfoParams *const mi_params, const uint8_t *last_segment_ids, uint8_t *current_segment_ids, int mi_offset, int x_mis, int y_mis) { const int stride = mi_params->mi_cols; if (last_segment_ids) { assert(last_segment_ids != current_segment_ids); for (int y = 0; y < y_mis; y++) { memcpy(¤t_segment_ids[mi_offset + y * stride], &last_segment_ids[mi_offset + y * stride], sizeof(current_segment_ids[0]) * x_mis); } } else { for (int y = 0; y < y_mis; y++) { memset(¤t_segment_ids[mi_offset + y * stride], 0, sizeof(current_segment_ids[0]) * x_mis); } } } static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset, int x_mis, int y_mis) { return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis, y_mis) : 0; } static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, int preskip, aom_reader *r) { struct segmentation *const seg = &cm->seg; const CommonModeInfoParams *const mi_params = &cm->mi_params; MB_MODE_INFO *const mbmi = xd->mi[0]; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int mi_offset = mi_row * mi_params->mi_cols + mi_col; const int bw = mi_size_wide[mbmi->bsize]; const int bh = mi_size_high[mbmi->bsize]; // TODO(slavarnway): move x_mis, y_mis into xd ????? const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); if (!seg->enabled) return 0; // Default for disabled segmentation if (!seg->update_map) { copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map, mi_offset, x_mis, y_mis); return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); } uint8_t segment_id; const int mi_stride = cm->mi_params.mi_cols; if (preskip) { if (!seg->segid_preskip) return 0; } else { if (mbmi->skip_txfm) { if (seg->temporal_update) { mbmi->seg_id_predicted = 0; } segment_id = read_segment_id(cm, xd, r, 1); set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride, segment_id); return segment_id; } } if (seg->temporal_update) { const uint8_t ctx = av1_get_pred_context_seg_id(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; struct segmentation_probs *const segp = &ec_ctx->seg; aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx]; mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR); if (mbmi->seg_id_predicted) { segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); } else { segment_id = read_segment_id(cm, xd, r, 0); } } else { segment_id = read_segment_id(cm, xd, r, 0); } set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride, segment_id); return segment_id; } static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, aom_reader *r) { if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 0; } if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0; if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { // These features imply single-reference mode, while skip mode implies // compound reference. Hence, the two are mutually exclusive. 
// In other words, skip_mode is implicitly 0 here. return 0; } const int ctx = av1_get_skip_mode_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int skip_mode = aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR); return skip_mode; } static int read_skip_txfm(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, aom_reader *r) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int ctx = av1_get_skip_txfm_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int skip_txfm = aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR); return skip_txfm; } } // Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1]) // and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into // one single sorted list(colors[...]). static void merge_colors(uint16_t *colors, uint16_t *cached_colors, int n_colors, int n_cached_colors) { if (n_cached_colors == 0) return; int cache_idx = 0, trans_idx = n_cached_colors; for (int i = 0; i < n_colors; ++i) { if (cache_idx < n_cached_colors && (trans_idx >= n_colors || cached_colors[cache_idx] <= colors[trans_idx])) { colors[i] = cached_colors[cache_idx++]; } else { assert(trans_idx < n_colors); colors[i] = colors[trans_idx++]; } } } static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, PALETTE_MODE_INFO *const pmi, aom_reader *r) { uint16_t color_cache[2 * PALETTE_MAX_SIZE]; uint16_t cached_colors[PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); const int n = pmi->palette_size[0]; int idx = 0; for (int i = 0; i < n_cache && idx < n; ++i) if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; if (idx < n) { const int n_cached_colors = idx; pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); if (idx < n) { const int min_bits = bit_depth - 3; int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1; for (; idx < n; ++idx) { assert(range >= 0); const int delta = aom_read_literal(r, bits, ACCT_STR) + 1; pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, 0, (1 << bit_depth) - 1); range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); bits = AOMMIN(bits, av1_ceil_log2(range)); } } merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors); } else { memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0])); } } static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth, PALETTE_MODE_INFO *const pmi, aom_reader *r) { const int n = pmi->palette_size[1]; // U channel colors. 
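/* Illustrative sketch (not part of the upstream file; excluded from the build
 * with #if 0): read_palette_colors_y() above transmits the luma palette in
 * ascending order.  After the first literal colour, each later colour is
 * coded as a positive delta, and the delta width keeps shrinking to
 * ceil(log2(remaining range)) so late deltas cost fewer bits; the U colours
 * read below reuse the same range-adaptive width but allow zero deltas.  The
 * toy_* code only re-derives the width adaptation; names are hypothetical. */
#if 0
static int toy_ceil_log2(int n) {
  int bits = 0;
  while ((1 << bits) < n) ++bits;
  return bits; /* toy_ceil_log2(1) == 0, toy_ceil_log2(240) == 8 */
}

/* Rebuild a luma palette from the first colour plus the raw (delta - 1)
 * codes, tracking the adaptive delta width for an 8- or 10-bit depth. */
static int toy_decode_y_palette(int first_color, const int *delta_minus_1,
                                int n_colors, int bit_depth, int initial_bits,
                                int *colors_out) {
  int bits = initial_bits; /* bitstream sends (bit_depth - 3) + a 2-bit value */
  int range = (1 << bit_depth) - first_color - 1;
  colors_out[0] = first_color;
  for (int i = 1; i < n_colors; ++i) {
    /* Every luma delta is at least 1, so the colours stay strictly ascending. */
    colors_out[i] = colors_out[i - 1] + delta_minus_1[i - 1] + 1;
    range -= colors_out[i] - colors_out[i - 1];
    bits = bits < toy_ceil_log2(range) ? bits : toy_ceil_log2(range);
  }
  return bits; /* width the next delta would use; it only ever shrinks */
}
#endif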
uint16_t color_cache[2 * PALETTE_MAX_SIZE]; uint16_t cached_colors[PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); int idx = 0; for (int i = 0; i < n_cache && idx < n; ++i) if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; if (idx < n) { const int n_cached_colors = idx; idx += PALETTE_MAX_SIZE; pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); if (idx < PALETTE_MAX_SIZE + n) { const int min_bits = bit_depth - 3; int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); int range = (1 << bit_depth) - pmi->palette_colors[idx - 1]; for (; idx < PALETTE_MAX_SIZE + n; ++idx) { assert(range >= 0); const int delta = aom_read_literal(r, bits, ACCT_STR); pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, 0, (1 << bit_depth) - 1); range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); bits = AOMMIN(bits, av1_ceil_log2(range)); } } merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n, n_cached_colors); } else { memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n * sizeof(cached_colors[0])); } // V channel colors. if (aom_read_bit(r, ACCT_STR)) { // Delta encoding. const int min_bits_v = bit_depth - 4; const int max_val = 1 << bit_depth; int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR); pmi->palette_colors[2 * PALETTE_MAX_SIZE] = aom_read_literal(r, bit_depth, ACCT_STR); for (int i = 1; i < n; ++i) { int delta = aom_read_literal(r, bits, ACCT_STR); if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta; int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta; if (val < 0) val += max_val; if (val >= max_val) val -= max_val; pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val; } } else { for (int i = 0; i < n; ++i) { pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = aom_read_literal(r, bit_depth, ACCT_STR); } } } static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *r) { const int num_planes = av1_num_planes(cm); MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); if (mbmi->mode == DC_PRED) { const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); const int modev = aom_read_symbol( r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2, ACCT_STR); if (modev) { pmi->palette_size[0] = aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r); } } if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) { const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); const int modev = aom_read_symbol( r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR); if (modev) { pmi->palette_size[1] = aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r); } } } static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) { const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR); return sym - MAX_ANGLE_DELTA; } static void read_filter_intra_mode_info(const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *r) { MB_MODE_INFO *const mbmi = xd->mi[0]; FILTER_INTRA_MODE_INFO *filter_intra_mode_info = &mbmi->filter_intra_mode_info; if 
(av1_filter_intra_allowed(cm, mbmi)) { filter_intra_mode_info->use_filter_intra = aom_read_symbol( r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR); if (filter_intra_mode_info->use_filter_intra) { filter_intra_mode_info->filter_intra_mode = aom_read_symbol( r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR); } } else { filter_intra_mode_info->use_filter_intra = 0; } } void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, aom_reader *r) { MB_MODE_INFO *mbmi = xd->mi[0]; uint8_t *tx_type = &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; *tx_type = DCT_DCT; // No need to read transform type if block is skipped. if (mbmi->skip_txfm || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return; // No need to read transform type for lossless mode(qindex==0). const int qindex = xd->qindex[mbmi->segment_id]; if (qindex == 0) return; const int inter_block = is_inter_block(mbmi); if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) > 1) { const TxSetType tx_set_type = av1_get_ext_tx_set_type( tx_size, inter_block, cm->features.reduced_tx_set_used); const int eset = get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used); // eset == 0 should correspond to a set with only DCT_DCT and // there is no need to read the tx_type assert(eset != 0); const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (inter_block) { *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } else { const PREDICTION_MODE intra_mode = mbmi->filter_intra_mode_info.use_filter_intra ? fimode_to_intradir[mbmi->filter_intra_mode_info .filter_intra_mode] : mbmi->mode; *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode], av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; } } } static inline void read_mv(aom_reader *r, MV *mv, const MV *ref, nmv_context *ctx, MvSubpelPrecision precision); static inline int is_mv_valid(const MV *mv); static inline int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, const int_mv *ref_mv, int mi_row, int mi_col, BLOCK_SIZE bsize, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE); // DV should not have sub-pel. 
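/* Illustrative note (not part of the upstream file; excluded from the build
 * with #if 0): motion vectors are stored in 1/8-pel units, so a whole-pel
 * intra-block-copy displacement must have its three low bits clear.  The
 * (v >> 3) * 8 pattern just below forces that alignment; the hypothetical
 * helper here shows the same operation on a single component. */
#if 0
#include <stdint.h>

static int16_t toy_force_fullpel(int16_t v_eighth_pel) {
  /* A no-op for values that are already multiples of 8; otherwise rounds
   * toward negative infinity to the nearest whole pel (13 -> 8, -13 -> -16). */
  return (int16_t)((v_eighth_pel >> 3) * 8);
}
#endif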
assert((mv->as_mv.col & 7) == 0); assert((mv->as_mv.row & 7) == 0); mv->as_mv.col = (mv->as_mv.col >> 3) * 8; mv->as_mv.row = (mv->as_mv.row >> 3) * 8; int valid = is_mv_valid(&mv->as_mv) && av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, cm->seq_params->mib_size_log2); return valid; } static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *r) { MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); if (mbmi->use_intrabc) { BLOCK_SIZE bsize = mbmi->bsize; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); mbmi->motion_mode = SIMPLE_TRANSLATION; int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count, xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx); int_mv nearestmv, nearmv; av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row); // Ref DV should not have sub-pel. int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8; valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, xd->mi_col, bsize, r); if (!valid_dv) { // Intra bc motion vectors are not valid - signal corrupt frame aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid intrabc dv"); } } } // If delta q is present, reads delta_q index. // Also reads delta_q loop filter levels, if present. static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *r) { DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { MB_MODE_INFO *const mbmi = xd->mi[0]; xd->current_base_qindex += read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res; /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ); FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; if (delta_q_info->delta_lf_present_flag) { const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; if (delta_q_info->delta_lf_multi) { const int frame_lf_count = av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { const int tmp_lvl = xd->delta_lf[lf_id] + read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi, mi_col, mi_row) * delta_q_info->delta_lf_res; mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } } else { const int tmp_lvl = xd->delta_lf_from_base + read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, mbmi, mi_col, mi_row) * delta_q_info->delta_lf_res; mbmi->delta_lf_from_base = xd->delta_lf_from_base = clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); } } } } static void read_intra_frame_mode_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *r) { MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO *above_mi = xd->above_mbmi; const MB_MODE_INFO *left_mi = xd->left_mbmi; const BLOCK_SIZE bsize = mbmi->bsize; struct segmentation *const seg = &cm->seg; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (seg->segid_preskip) mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0); mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r); if (!seg->segid_preskip) mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm); read_cdef(cm, r, xd); read_delta_q_params(cm, xd, r); mbmi->current_qindex = xd->current_base_qindex; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); if (av1_allow_intrabc(cm)) { read_intrabc_info(cm, dcb, r); if (is_intrabc_block(mbmi)) return; } mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi)); const int use_angle_delta = av1_use_angle_delta(bsize); mbmi->angle_delta[PLANE_TYPE_Y] = (use_angle_delta && av1_is_directional_mode(mbmi->mode)) ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); } const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode); mbmi->angle_delta[PLANE_TYPE_UV] = (use_angle_delta && av1_is_directional_mode(intra_mode)) ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED]) : 0; } else { // Avoid decoding angle_info if there is no chroma prediction mbmi->uv_mode = UV_DC_PRED; } xd->cfl.store_y = store_cfl_required(cm, xd); if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) read_palette_mode_info(cm, xd, r); read_filter_intra_mode_info(cm, xd, r); } static int read_mv_component(aom_reader *r, nmv_component *mvcomp, int use_subpel, int usehp) { int mag, d, fr, hp; const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR); const int mv_class = aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR); const int class0 = mv_class == MV_CLASS_0; // Integer part if (class0) { d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR); mag = 0; } else { const int n = mv_class + CLASS0_BITS - 1; // number of bits d = 0; for (int i = 0; i < n; ++i) d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i; mag = CLASS0_SIZE << (mv_class + 2); } if (use_subpel) { // Fractional part fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, MV_FP_SIZE, ACCT_STR); // High precision part (if hp is not used, the default value of the hp is 1) hp = usehp ? aom_read_symbol( r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, ACCT_STR) : 1; } else { fr = 3; hp = 1; } // Result mag += ((d << 3) | (fr << 1) | hp) + 1; return sign ? -mag : mag; } static inline void read_mv(aom_reader *r, MV *mv, const MV *ref, nmv_context *ctx, MvSubpelPrecision precision) { MV diff = kZeroMv; const MV_JOINT_TYPE joint_type = (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR); if (mv_joint_vertical(joint_type)) diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE, precision > MV_SUBPEL_LOW_PRECISION); if (mv_joint_horizontal(joint_type)) diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE, precision > MV_SUBPEL_LOW_PRECISION); mv->row = ref->row + diff.row; mv->col = ref->col + diff.col; } static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, aom_reader *r) { if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE; if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { const int ctx = av1_get_reference_mode_context(xd); const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol( r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR); return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE } else { assert(cm->current_frame.reference_mode == SINGLE_REFERENCE); return cm->current_frame.reference_mode; } } #define READ_REF_BIT(pname) \ aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR) static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd, aom_reader *r) { const int ctx = av1_get_comp_reference_type_context(xd); const COMP_REFERENCE_TYPE comp_ref_type = (COMP_REFERENCE_TYPE)aom_read_symbol( r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR); return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE } static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame[2]) { ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0; ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1; } // Read the referncence frame static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { if (xd->mi[0]->skip_mode) { set_ref_frames_for_skip_mode(cm, ref_frame); return; } if 
(segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE_FRAME; } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { ref_frame[0] = LAST_FRAME; ref_frame[1] = NONE_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); if (mode == COMPOUND_REFERENCE) { const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r); if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = READ_REF_BIT(uni_comp_ref_p); if (bit) { ref_frame[0] = BWDREF_FRAME; ref_frame[1] = ALTREF_FRAME; } else { const int bit1 = READ_REF_BIT(uni_comp_ref_p1); if (bit1) { const int bit2 = READ_REF_BIT(uni_comp_ref_p2); if (bit2) { ref_frame[0] = LAST_FRAME; ref_frame[1] = GOLDEN_FRAME; } else { ref_frame[0] = LAST_FRAME; ref_frame[1] = LAST3_FRAME; } } else { ref_frame[0] = LAST_FRAME; ref_frame[1] = LAST2_FRAME; } } return; } assert(comp_ref_type == BIDIR_COMP_REFERENCE); const int idx = 1; const int bit = READ_REF_BIT(comp_ref_p); // Decode forward references. if (!bit) { const int bit1 = READ_REF_BIT(comp_ref_p1); ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME; } else { const int bit2 = READ_REF_BIT(comp_ref_p2); ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME; } // Decode backward references. const int bit_bwd = READ_REF_BIT(comp_bwdref_p); if (!bit_bwd) { const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1); ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME; } else { ref_frame[idx] = ALTREF_FRAME; } } else if (mode == SINGLE_REFERENCE) { const int bit0 = READ_REF_BIT(single_ref_p1); if (bit0) { const int bit1 = READ_REF_BIT(single_ref_p2); if (!bit1) { const int bit5 = READ_REF_BIT(single_ref_p6); ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME; } else { ref_frame[0] = ALTREF_FRAME; } } else { const int bit2 = READ_REF_BIT(single_ref_p3); if (bit2) { const int bit4 = READ_REF_BIT(single_ref_p5); ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME; } else { const int bit3 = READ_REF_BIT(single_ref_p4); ref_frame[0] = bit3 ? 
LAST2_FRAME : LAST_FRAME; } } ref_frame[1] = NONE_FRAME; } else { assert(0 && "Invalid prediction mode."); } } } static inline void read_mb_interp_filter(const MACROBLOCKD *const xd, InterpFilter interp_filter, bool enable_dual_filter, MB_MODE_INFO *const mbmi, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!av1_is_interp_needed(xd)) { set_default_interp_filters(mbmi, interp_filter); return; } if (interp_filter != SWITCHABLE) { mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter); } else { InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }; for (int dir = 0; dir < 2; ++dir) { const int ctx = av1_get_pred_context_switchable_interp(xd, dir); ref0_filter[dir] = (InterpFilter)aom_read_symbol( r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); if (!enable_dual_filter) { ref0_filter[1] = ref0_filter[0]; break; } } // The index system works as: (0, 1) -> (vertical, horizontal) filter types mbmi->interp_filters.as_filters.x_filter = ref0_filter[1]; mbmi->interp_filters.as_filters.y_filter = ref0_filter[0]; } } static void read_intra_block_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, MB_MODE_INFO *const mbmi, aom_reader *r) { const BLOCK_SIZE bsize = mbmi->bsize; const int use_angle_delta = av1_use_angle_delta(bsize); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); mbmi->angle_delta[PLANE_TYPE_Y] = use_angle_delta && av1_is_directional_mode(mbmi->mode) ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { mbmi->cfl_alpha_idx = read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); } const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode); mbmi->angle_delta[PLANE_TYPE_UV] = use_angle_delta && av1_is_directional_mode(intra_mode) ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED]) : 0; } else { // Avoid decoding angle_info if there is no chroma prediction mbmi->uv_mode = UV_DC_PRED; } xd->cfl.store_y = store_cfl_required(cm, xd); mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) read_palette_mode_info(cm, xd, r); read_filter_intra_mode_info(cm, xd, r); } static inline int is_mv_valid(const MV *mv) { return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && mv->col < MV_UPP; } static inline int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], int_mv ref_mv[2], int_mv nearest_mv[2], int_mv near_mv[2], int is_compound, int allow_hp, aom_reader *r) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; MB_MODE_INFO *mbmi = xd->mi[0]; BLOCK_SIZE bsize = mbmi->bsize; FeatureFlags *const features = &cm->features; if (features->cur_frame_force_integer_mv) { allow_hp = MV_SUBPEL_NONE; } switch (mode) { case NEWMV: { nmv_context *const nmvc = &ec_ctx->nmvc; read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); break; } case NEARESTMV: { mv[0].as_int = nearest_mv[0].as_int; break; } case NEARMV: { mv[0].as_int = near_mv[0].as_int; break; } case GLOBALMV: { mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], features->allow_high_precision_mv, bsize, xd->mi_col, xd->mi_row, features->cur_frame_force_integer_mv) .as_int; break; } case NEW_NEWMV: { assert(is_compound); for (int i = 0; i < 2; ++i) { nmv_context *const nmvc = &ec_ctx->nmvc; read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp); } break; } case NEAREST_NEARESTMV: { assert(is_compound); mv[0].as_int = nearest_mv[0].as_int; mv[1].as_int = nearest_mv[1].as_int; break; } case NEAR_NEARMV: { assert(is_compound); mv[0].as_int = near_mv[0].as_int; mv[1].as_int = near_mv[1].as_int; break; } case NEW_NEARESTMV: { nmv_context *const nmvc = &ec_ctx->nmvc; read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); assert(is_compound); mv[1].as_int = nearest_mv[1].as_int; break; } case NEAREST_NEWMV: { nmv_context *const nmvc = &ec_ctx->nmvc; mv[0].as_int = nearest_mv[0].as_int; read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); assert(is_compound); break; } case NEAR_NEWMV: { nmv_context *const nmvc = &ec_ctx->nmvc; mv[0].as_int = near_mv[0].as_int; read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); assert(is_compound); break; } case NEW_NEARMV: { nmv_context *const nmvc = &ec_ctx->nmvc; read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); assert(is_compound); mv[1].as_int = near_mv[1].as_int; break; } case GLOBAL_GLOBALMV: { assert(is_compound); mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], features->allow_high_precision_mv, bsize, xd->mi_col, xd->mi_row, features->cur_frame_force_integer_mv) .as_int; mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], features->allow_high_precision_mv, bsize, xd->mi_col, xd->mi_row, features->cur_frame_force_integer_mv) .as_int; break; } default: { return 0; } } int ret = is_mv_valid(&mv[0].as_mv); if (is_compound) { ret = ret && is_mv_valid(&mv[1].as_mv); } return ret; } static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, int segment_id, aom_reader *r) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (frame < LAST_FRAME) return 0; return frame != 
INTRA_FRAME; } if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { return 1; } const int ctx = av1_get_intra_inter_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const int is_inter = aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); return is_inter; } #if DEC_MISMATCH_DEBUG static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, int mi_col, int16_t mode_ctx) { int_mv mv[2] = { { 0 } }; for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) mv[ref].as_mv = mbmi->mv[ref].as_mv; const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; int16_t zeromv_ctx = -1; int16_t refmv_ctx = -1; if (mbmi->mode != NEWMV) { zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; if (mbmi->mode != GLOBALMV) refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; } #define FRAME_TO_CHECK 11 if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { printf( "=== DECODER ===: " "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, mbmi->mode, mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, refmv_ctx, mbmi->tx_size); } } #endif // DEC_MISMATCH_DEBUG static void read_inter_block_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, MB_MODE_INFO *const mbmi, aom_reader *r) { AV1_COMMON *const cm = &pbi->common; FeatureFlags *const features = &cm->features; const BLOCK_SIZE bsize = mbmi->bsize; const int allow_hp = features->allow_high_precision_mv; int_mv nearestmv[2], nearmv[2]; int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; MACROBLOCKD *const xd = &dcb->xd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; mbmi->uv_mode = UV_DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; av1_collect_neighbors_ref_counts(xd); read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); const int is_compound = has_second_ref(mbmi); const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx); mbmi->ref_mv_idx = 0; if (mbmi->skip_mode) { assert(is_compound); mbmi->mode = NEAREST_NEARESTMV; } else { if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) { mbmi->mode = GLOBALMV; } else { const int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); if (is_compound) mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx); else mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx); if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || have_nearmv_in_inter_mode(mbmi->mode)) read_drl_idx(ec_ctx, dcb, mbmi, r); } } if (is_compound != is_inter_compound_mode(mbmi->mode)) { aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Prediction mode %d invalid with ref frame %d %d", mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); } if (!is_compound && mbmi->mode != GLOBALMV) { av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0], &nearmv[0], 
features->cur_frame_force_integer_mv); } if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) { const int ref_mv_idx = mbmi->ref_mv_idx + 1; nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv; nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv; nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; lower_mv_precision(&nearestmv[0].as_mv, allow_hp, features->cur_frame_force_integer_mv); lower_mv_precision(&nearestmv[1].as_mv, allow_hp, features->cur_frame_force_integer_mv); lower_mv_precision(&nearmv[0].as_mv, allow_hp, features->cur_frame_force_integer_mv); lower_mv_precision(&nearmv[1].as_mv, allow_hp, features->cur_frame_force_integer_mv); } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) { nearmv[0] = xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv; } int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] }; if (is_compound) { int ref_mv_idx = mbmi->ref_mv_idx; // Special case: NEAR_NEWMV and NEW_NEARMV modes use // 1 + mbmi->ref_mv_idx (like NEARMV) instead of // mbmi->ref_mv_idx (like NEWMV) if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) ref_mv_idx = 1 + mbmi->ref_mv_idx; // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here? if (compound_ref0_mode(mbmi->mode) == NEWMV) ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; if (compound_ref1_mode(mbmi->mode) == NEWMV) ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; } else { if (mbmi->mode == NEWMV) { if (dcb->ref_mv_count[ref_frame] > 1) ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv; } } if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV); const int mv_corrupted_flag = !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, nearestmv, nearmv, is_compound, allow_hp, r); aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag); mbmi->use_wedge_interintra = 0; if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; const int interintra = aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR); assert(mbmi->ref_frame[1] == NONE_FRAME); if (interintra) { const INTERINTRA_MODE interintra_mode = read_interintra_mode(xd, r, bsize_group); mbmi->ref_frame[1] = INTRA_FRAME; mbmi->interintra_mode = interintra_mode; mbmi->angle_delta[PLANE_TYPE_Y] = 0; mbmi->angle_delta[PLANE_TYPE_UV] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; if (av1_is_wedge_used(bsize)) { mbmi->use_wedge_interintra = aom_read_symbol( r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR); if (mbmi->use_wedge_interintra) { mbmi->interintra_wedge_index = (int8_t)aom_read_symbol( r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); } } } } for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame); } mbmi->motion_mode = SIMPLE_TRANSLATION; if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode && !has_second_ref(mbmi)) { mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); } av1_count_overlappable_neighbors(cm, xd); if (mbmi->ref_frame[1] != INTRA_FRAME) mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); // init mbmi->comp_group_idx = 0; mbmi->compound_idx = 1; mbmi->interinter_comp.type = COMPOUND_AVERAGE; if (has_second_ref(mbmi) && !mbmi->skip_mode) { // Read idx to indicate current compound inter prediction mode group 
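/* Illustrative sketch (not part of the upstream file; excluded from the build
 * with #if 0): the compound prediction type is signalled as a small tree,
 * which the code below this comment walks.  comp_group_idx == 0 selects the
 * uniform group (plain average, or distance-weighted average when
 * compound_idx == 0 and dist-wtd compound is enabled); comp_group_idx == 1
 * selects the masked group (wedge mask if the block size supports it,
 * otherwise difference-weighted mask).  The toy_* enum and function are
 * hypothetical restatements of that tree. */
#if 0
enum toy_compound_type {
  TOY_COMPOUND_AVERAGE,
  TOY_COMPOUND_DISTWTD,
  TOY_COMPOUND_WEDGE,
  TOY_COMPOUND_DIFFWTD
};

static enum toy_compound_type toy_compound_tree(int comp_group_idx,
                                                int compound_idx,
                                                int wedge_allowed,
                                                int is_wedge) {
  if (comp_group_idx == 0)
    return compound_idx ? TOY_COMPOUND_AVERAGE : TOY_COMPOUND_DISTWTD;
  if (wedge_allowed && is_wedge) return TOY_COMPOUND_WEDGE;
  return TOY_COMPOUND_DIFFWTD;
}
#endif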
const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); mbmi->comp_group_idx = (uint8_t)aom_read_symbol( r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR); } if (mbmi->comp_group_idx == 0) { if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); mbmi->compound_idx = (uint8_t)aom_read_symbol( r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); mbmi->interinter_comp.type = mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD; } else { // Distance-weighted compound is disabled, so always use average mbmi->compound_idx = 1; mbmi->interinter_comp.type = COMPOUND_AVERAGE; } } else { assert(cm->current_frame.reference_mode != SINGLE_REFERENCE && is_inter_compound_mode(mbmi->mode) && mbmi->motion_mode == SIMPLE_TRANSLATION); assert(masked_compound_used); // compound_diffwtd, wedge if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { mbmi->interinter_comp.type = COMPOUND_WEDGE + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize], MASKED_COMPOUND_TYPES, ACCT_STR); } else { mbmi->interinter_comp.type = COMPOUND_DIFFWTD; } if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol( r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR); } else { assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); mbmi->interinter_comp.mask_type = aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR); } } } read_mb_interp_filter(xd, features->interp_filter, cm->seq_params->enable_dual_filter, mbmi, r); if (mbmi->motion_mode == WARPED_CAUSAL) { const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; mbmi->wm_params.wmtype = DEFAULT_WMTYPE; mbmi->wm_params.invalid = 0; if (mbmi->num_proj_ref > 1) { mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); } if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, &mbmi->wm_params, mi_row, mi_col)) { #if WARPED_MOTION_DEBUG printf("Warning: unexpected warped model from aomenc\n"); #endif mbmi->wm_params.invalid = 1; } } xd->cfl.store_y = store_cfl_required(cm, xd); #if DEC_MISMATCH_DEBUG dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx); #endif // DEC_MISMATCH_DEBUG } static void read_inter_frame_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, aom_reader *r) { AV1_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; int inter_block = 1; mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r); mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r); if (mbmi->skip_mode) mbmi->skip_txfm = 1; else mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r); if (!cm->seg.segid_preskip) mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r); read_cdef(cm, r, xd); read_delta_q_params(cm, xd, r); if (!mbmi->skip_mode) inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); mbmi->current_qindex = xd->current_base_qindex; xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); if (inter_block) 
read_inter_block_mode_info(pbi, dcb, mbmi, r); else read_intra_block_mode_info(cm, xd, mbmi, r); } static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col, int x_mis, int y_mis) { const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); MV_REF *frame_mvs = cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); x_mis = ROUND_POWER_OF_TWO(x_mis, 1); y_mis = ROUND_POWER_OF_TWO(y_mis, 1); for (int h = 0; h < y_mis; h++) { MV_REF *mv = frame_mvs; for (int w = 0; w < x_mis; w++) { mv->ref_frame = NONE_FRAME; mv++; } frame_mvs += frame_mvs_stride; } } void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, aom_reader *r, int x_mis, int y_mis) { AV1_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mi = xd->mi[0]; mi->use_intrabc = 0; if (frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, dcb, r); if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); } else { read_inter_frame_mode_info(pbi, dcb, r); if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); } } aom-3.12.1/av1/decoder/decodemv.h000066400000000000000000000021541477627663500164360ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_DECODEMV_H_ #define AOM_AV1_DECODER_DECODEMV_H_ #include "aom_dsp/bitreader.h" #include "av1/decoder/decoder.h" #ifdef __cplusplus extern "C" { #endif int av1_neg_deinterleave(int diff, int ref, int max); void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, aom_reader *r, int x_mis, int y_mis); #ifdef __cplusplus } // extern "C" #endif void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, int blk_col, TX_SIZE tx_size, aom_reader *r); #endif // AOM_AV1_DECODER_DECODEMV_H_ aom-3.12.1/av1/decoder/decoder.c000066400000000000000000000421111477627663500162450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/decoder.h" #include "av1/decoder/detokenize.h" #include "av1/decoder/obu.h" static void initialize_dec(void) { av1_rtcd(); aom_dsp_rtcd(); aom_scale_rtcd(); av1_init_intra_predictors(); av1_init_wedge_masks(); } static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width, int height, BLOCK_SIZE min_partition_size) { (void)min_partition_size; // Ensure that the decoded width and height are both multiples of // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if // subsampling is used). // This simplifies the implementation of various experiments, // eg. cdef, which operates on units of 8x8 luma pixels. const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2); mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2); mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; mi_params->mi_alloc_bsize = BLOCK_4X4; mi_params->mi_alloc_stride = mi_params->mi_stride; assert(mi_size_wide[mi_params->mi_alloc_bsize] == mi_size_high[mi_params->mi_alloc_bsize]); } static void dec_setup_mi(CommonModeInfoParams *mi_params) { const int mi_grid_size = mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); memset(mi_params->mi_grid_base, 0, mi_grid_size * sizeof(*mi_params->mi_grid_base)); } static void dec_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; } AV1Decoder *av1_decoder_create(BufferPool *const pool) { AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi)); if (!pbi) return NULL; av1_zero(*pbi); AV1_COMMON *volatile const cm = &pbi->common; cm->seq_params = &pbi->seq_params; cm->error = &pbi->error; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(pbi->error.jmp)) { pbi->error.setjmp = 0; av1_decoder_remove(pbi); return NULL; } pbi->error.setjmp = 1; CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); CHECK_MEM_ERROR( cm, cm->default_frame_context, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); memset(cm->fc, 0, sizeof(*cm->fc)); memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); pbi->need_resync = 1; initialize_dec(); // Initialize the references to not point to any frame buffers. 
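// Illustrative note (not part of libaom): a worked example of the
// dec_set_mb_mi() sizing logic defined earlier in this file, assuming
// MI_SIZE_LOG2 == 2 (4x4 mode-info units). For a decoded width of 1921 luma
// pixels:
//   aligned_width = ALIGN_POWER_OF_TWO(1921, 3) = 1928  (next multiple of 8)
//   mi_cols       = 1928 >> MI_SIZE_LOG2        = 482
//   mb_cols       = ROUND_POWER_OF_TWO(482, 2)  = 121   (16x16 macroblock units)
// The same arithmetic is applied to the height to obtain mi_rows and mb_rows.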
for (int i = 0; i < REF_FRAMES; i++) { cm->ref_frame_map[i] = NULL; } cm->current_frame.frame_number = 0; pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; cm->seq_params->bit_depth = AOM_BITS_8; cm->mi_params.free_mi = dec_free_mi; cm->mi_params.setup_mi = dec_setup_mi; cm->mi_params.set_mb_mi = dec_set_mb_mi; av1_loop_filter_init(cm); av1_qm_init(&cm->quant_params, av1_num_planes(cm)); av1_loop_restoration_precal(); #if CONFIG_ACCOUNTING pbi->acct_enabled = 1; aom_accounting_init(&pbi->accounting); #endif pbi->error.setjmp = 0; aom_get_worker_interface()->init(&pbi->lf_worker); pbi->lf_worker.thread_name = "aom lf worker"; return pbi; } void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { if (tile_mt_info != NULL) { #if CONFIG_MULTITHREAD if (tile_mt_info->job_mutex != NULL) { pthread_mutex_destroy(tile_mt_info->job_mutex); aom_free(tile_mt_info->job_mutex); } #endif aom_free(tile_mt_info->job_queue); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. av1_zero(*tile_mt_info); } } void av1_dec_free_cb_buf(AV1Decoder *pbi) { aom_free(pbi->cb_buffer_base); pbi->cb_buffer_base = NULL; pbi->cb_buffer_alloc_size = 0; } void av1_decoder_remove(AV1Decoder *pbi) { int i; if (!pbi) return; // Free the tile list output buffer. aom_free_frame_buffer(&pbi->tile_list_outbuf); aom_get_worker_interface()->end(&pbi->lf_worker); aom_free(pbi->lf_worker.data1); if (pbi->thread_data) { for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; if (thread_data->td != NULL) { av1_free_mc_tmp_buf(thread_data->td); aom_free(thread_data->td); } } aom_free(pbi->thread_data); } aom_free(pbi->dcb.xd.seg_mask); for (i = 0; i < pbi->num_workers; ++i) { AVxWorker *const worker = &pbi->tile_workers[i]; aom_get_worker_interface()->end(worker); } #if CONFIG_MULTITHREAD if (pbi->row_mt_mutex_ != NULL) { pthread_mutex_destroy(pbi->row_mt_mutex_); aom_free(pbi->row_mt_mutex_); } if (pbi->row_mt_cond_ != NULL) { pthread_cond_destroy(pbi->row_mt_cond_); aom_free(pbi->row_mt_cond_); } #endif for (i = 0; i < pbi->allocated_tiles; i++) { TileDataDec *const tile_data = pbi->tile_data + i; av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); } aom_free(pbi->tile_data); aom_free(pbi->tile_workers); if (pbi->num_workers > 0) { av1_loop_filter_dealloc(&pbi->lf_row_sync); av1_loop_restoration_dealloc(&pbi->lr_row_sync); av1_dealloc_dec_jobs(&pbi->tile_mt_info); } av1_dec_free_cb_buf(pbi); #if CONFIG_ACCOUNTING aom_accounting_clear(&pbi->accounting); #endif av1_free_mc_tmp_buf(&pbi->td); aom_img_metadata_array_free(pbi->metadata); av1_remove_common(&pbi->common); aom_free(pbi); } void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, aom_reader *r, palette_visitor_fn_t visit) { if (!is_inter_block(xd->mi[0])) { for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common)); ++plane) { if (plane == 0 || xd->is_chroma_ref) { if (xd->mi[0]->palette_mode_info.palette_size[plane]) visit(xd, plane, r); } else { assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); } } } } static int equal_dimensions(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { return a->y_height == b->y_height && a->y_width == b->y_width && a->uv_height == b->uv_height && a->uv_width == b->uv_width; } aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *cm = &pbi->common; const int 
num_planes = av1_num_planes(cm); const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); if (cfg == NULL) { aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!equal_dimensions(cfg, sd)) aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(cfg, sd, num_planes); return pbi->error.error_code; } static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { return a->y_height == b->y_height && a->y_width == b->y_width && a->uv_height == b->uv_height && a->uv_width == b->uv_width && a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && a->border == b->border && (a->flags & YV12_FLAG_HIGHBITDEPTH) == (b->flags & YV12_FLAG_HIGHBITDEPTH); } aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, int use_external_ref, YV12_BUFFER_CONFIG *sd) { const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *ref_buf = NULL; // Get the destination reference buffer. ref_buf = get_ref_frame(cm, idx); if (ref_buf == NULL) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!use_external_ref) { if (!equal_dimensions(ref_buf, sd)) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. aom_yv12_copy_frame(sd, ref_buf, num_planes); } } else { if (!equal_dimensions_and_border(ref_buf, sd)) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer pointers. // Once we no longer need the external reference buffer, these pointers // are restored. ref_buf->store_buf_adr[0] = ref_buf->y_buffer; ref_buf->store_buf_adr[1] = ref_buf->u_buffer; ref_buf->store_buf_adr[2] = ref_buf->v_buffer; ref_buf->y_buffer = sd->y_buffer; ref_buf->u_buffer = sd->u_buffer; ref_buf->v_buffer = sd->v_buffer; ref_buf->use_external_reference_buffers = 1; } } return cm->error->error_code; } aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *sd) { const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); return cm->error->error_code; } static void release_current_frame(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; cm->cur_frame->buf.corrupted = 1; lock_buffer_pool(pool); decrease_ref_count(cm->cur_frame, pool); unlock_buffer_pool(pool); cm->cur_frame = NULL; } // If any buffer updating is signaled it should be done here. // Consumes a reference to cm->cur_frame. // // This functions returns void. It reports failure by setting // pbi->error.error_code. static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int ref_index = 0, mask; AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; if (frame_decoded) { lock_buffer_pool(pool); // In ext-tile decoding, the camera frame header is only decoded once. So, // we don't update the references here. if (!pbi->camera_frame_header_ready) { // The following for loop needs to release the reference stored in // cm->ref_frame_map[ref_index] before storing a reference to // cm->cur_frame in cm->ref_frame_map[ref_index]. 
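// Illustrative note (not part of libaom): refresh_frame_flags is an 8-bit
// mask with one bit per ref_frame_map[] slot. For example, a frame coded with
// refresh_frame_flags == 0x21 (binary 00100001) overwrites slots 0 and 5 with
// cm->cur_frame in the loop below, after releasing whatever reference each of
// those slots previously held; the other six slots keep their references.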
for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { decrease_ref_count(cm->ref_frame_map[ref_index], pool); cm->ref_frame_map[ref_index] = cm->cur_frame; ++cm->cur_frame->ref_count; } ++ref_index; } } if (cm->show_existing_frame || cm->show_frame) { if (pbi->output_all_layers) { // Append this frame to the output queue if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { // We can't store the new frame anywhere, so drop it and return an // error cm->cur_frame->buf.corrupted = 1; decrease_ref_count(cm->cur_frame, pool); pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; } else { pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; pbi->num_output_frames++; } } else { // Replace any existing output frame assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); if (pbi->num_output_frames > 0) { decrease_ref_count(pbi->output_frames[0], pool); } pbi->output_frames[0] = cm->cur_frame; pbi->num_output_frames = 1; } } else { decrease_ref_count(cm->cur_frame, pool); } unlock_buffer_pool(pool); } else { // Nothing was decoded, so just drop this frame buffer lock_buffer_pool(pool); decrease_ref_count(cm->cur_frame, pool); unlock_buffer_pool(pool); } cm->cur_frame = NULL; if (!pbi->camera_frame_header_ready) { // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { cm->remapped_ref_idx[ref_index] = INVALID_IDX; } } } int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, const uint8_t **psource) { AV1_COMMON *volatile const cm = &pbi->common; const uint8_t *source = *psource; pbi->error.error_code = AOM_CODEC_OK; pbi->error.has_detail = 0; if (size == 0) { // This is used to signal that we are missing frames. // We do not know if the missing frame(s) was supposed to update // any of the reference buffers, but we act conservative and // mark only the last buffer as corrupted. // // TODO(jkoleszar): Error concealment is undefined and non-normative // at this point, but if it becomes so, [0] may not always be the correct // thing to do here. RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME); if (ref_buf != NULL) ref_buf->buf.corrupted = 1; } if (assign_cur_frame_new_fb(cm) == NULL) { pbi->error.error_code = AOM_CODEC_MEM_ERROR; return 1; } // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(pbi->error.jmp)) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; pbi->error.setjmp = 0; // Synchronize all threads immediately as a subsequent decode call may // cause a resize invalidating some allocations. winterface->sync(&pbi->lf_worker); for (i = 0; i < pbi->num_workers; ++i) { winterface->sync(&pbi->tile_workers[i]); } release_current_frame(pbi); return -1; } pbi->error.setjmp = 1; int frame_decoded = aom_decode_frame_from_obus(pbi, source, source + size, psource); if (frame_decoded < 0) { assert(pbi->error.error_code != AOM_CODEC_OK); release_current_frame(pbi); pbi->error.setjmp = 0; return 1; } #if TXCOEFF_TIMER cm->cum_txcoeff_timer += cm->txcoeff_timer; fprintf(stderr, "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n", cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer); cm->txcoeff_timer = 0; cm->txb_count = 0; #endif // Note: At this point, this function holds a reference to cm->cur_frame // in the buffer pool. This reference is consumed by update_frame_buffers(). 
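// Illustrative note (not part of libaom's sources): this function is driven
// from the aom_codec_decode() wrapper; a typical caller-side sequence is
//   av1_receive_compressed_data(pbi, size, &data);    // parse + decode one temporal unit
//   av1_get_raw_frame(pbi, 0, &sd, &grain_params);    // fetch a decoded frame
// where av1_get_raw_frame() reads pbi->output_frames[], which is filled in by
// update_frame_buffers() below.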
update_frame_buffers(pbi, frame_decoded); if (frame_decoded) { pbi->decoding_first_frame = 0; } if (pbi->error.error_code != AOM_CODEC_OK) { pbi->error.setjmp = 0; return 1; } if (!cm->show_existing_frame) { if (cm->seg.enabled) { if (cm->prev_frame && (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { cm->last_frame_seg_map = cm->prev_frame->seg_map; } else { cm->last_frame_seg_map = NULL; } } } // Update progress in frame parallel decode. pbi->error.setjmp = 0; return 0; } // Get the frame at a particular index in the output queue int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, aom_film_grain_t **grain_params) { if (index >= pbi->num_output_frames) return -1; *sd = &pbi->output_frames[index]->buf; *grain_params = &pbi->output_frames[index]->film_grain_params; return 0; } // Get the highest-spatial-layer output // TODO(rachelbarker): What should this do? int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { if (pbi->num_output_frames == 0) return -1; *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf; return 0; } aom-3.12.1/av1/decoder/decoder.h000066400000000000000000000354501477627663500162620ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_DECODER_H_ #define AOM_AV1_DECODER_DECODER_H_ #include "config/aom_config.h" #include "aom/aom_codec.h" #include "aom_dsp/bitreader.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_thread.h" #include "av1/common/av1_common_int.h" #include "av1/common/thread_common.h" #include "av1/decoder/dthread.h" #if CONFIG_ACCOUNTING #include "av1/decoder/accounting.h" #endif #if CONFIG_INSPECTION #include "av1/decoder/inspection.h" #endif #ifdef __cplusplus extern "C" { #endif /*! * \brief Contains coding block data required by the decoder. * * This includes: * - Coding block info that is common between encoder and decoder. * - Other coding block info only needed by the decoder. * Contrast this with a similar struct MACROBLOCK on encoder side. * This data is also common between ThreadData and AV1Decoder structs. */ typedef struct DecoderCodingBlock { /*! * Coding block info that is common between encoder and decoder. */ DECLARE_ALIGNED(32, MACROBLOCKD, xd); /*! * True if the at least one of the coding blocks decoded was corrupted. */ int corrupted; /*! * Pointer to 'mc_buf' inside 'pbi->td' (single-threaded decoding) or * 'pbi->thread_data[i].td' (multi-threaded decoding). */ uint8_t *mc_buf[2]; /*! * Pointer to 'dqcoeff' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base' * with appropriate offset for the current superblock, for each plane. */ tran_low_t *dqcoeff_block[MAX_MB_PLANE]; /*! * cb_offset[p] is the offset into the dqcoeff_block[p] for the current coding * block, for each plane 'p'. */ uint16_t cb_offset[MAX_MB_PLANE]; /*! * Pointer to 'eob_data' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base' * with appropriate offset for the current superblock, for each plane. */ eob_info *eob_data[MAX_MB_PLANE]; /*! 
* txb_offset[p] is the offset into the eob_data[p] for the current coding * block, for each plane 'p'. */ uint16_t txb_offset[MAX_MB_PLANE]; /*! * ref_mv_count[i] specifies the number of number of motion vector candidates * in xd->ref_mv_stack[i]. */ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; } DecoderCodingBlock; /*!\cond */ typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size); typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm, DecoderCodingBlock *dcb, BLOCK_SIZE bsize); typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, MACROBLOCKD *const xd); typedef struct ThreadData { DecoderCodingBlock dcb; // Coding block buffer for the current superblock. // Used only for single-threaded decoding and multi-threaded decoding with // row_mt == 1 cases. // See also: similar buffer in 'AV1Decoder'. CB_BUFFER cb_buffer_base; aom_reader *bit_reader; // Motion compensation buffer used to get a prediction buffer with extended // borders. One buffer for each of the two possible references. uint8_t *mc_buf[2]; // Mask for this block used for compound prediction. uint8_t *seg_mask; // Allocated size of 'mc_buf'. int32_t mc_buf_size; // If true, the pointers in 'mc_buf' were converted from highbd pointers. int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in // mc_buf were converted from highbd pointers. CONV_BUF_TYPE *tmp_conv_dst; uint8_t *tmp_obmc_bufs[2]; decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; decode_block_visitor_fn_t predict_and_recon_intra_block_visit; decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit; decode_block_visitor_fn_t inverse_tx_inter_block_visit; predict_inter_block_visitor_fn_t predict_inter_block_visit; cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit; } ThreadData; typedef struct AV1DecRowMTJobInfo { int tile_row; int tile_col; int mi_row; } AV1DecRowMTJobInfo; typedef struct AV1DecRowMTSyncData { #if CONFIG_MULTITHREAD pthread_mutex_t *mutex_; pthread_cond_t *cond_; #endif int allocated_sb_rows; int *cur_sb_col; // Denotes the superblock interval at which conditional signalling should // happen. Also denotes the minimum number of extra superblocks of the top row // to be complete to start decoding the current superblock. A value of 1 // indicates top-right dependency. int sync_range; // Denotes the additional number of superblocks in the previous row to be // complete to start decoding the current superblock when intraBC tool is // enabled. This additional top-right delay is required to satisfy the // hardware constraints for intraBC tool when row multithreading is enabled. int intrabc_extra_top_right_sb_delay; int mi_rows; int mi_cols; int mi_rows_parse_done; int mi_rows_decode_started; int num_threads_working; } AV1DecRowMTSync; typedef struct AV1DecRowMTInfo { int tile_rows_start; int tile_rows_end; int tile_cols_start; int tile_cols_end; int start_tile; int end_tile; int mi_rows_to_decode; // Invariant: // mi_rows_parse_done >= mi_rows_decode_started. // mi_rows_parse_done and mi_rows_decode_started are both initialized to 0. // mi_rows_parse_done is incremented freely. mi_rows_decode_started may only // be incremented to catch up with mi_rows_parse_done but is not allowed to // surpass mi_rows_parse_done. // // When mi_rows_decode_started reaches mi_rows_to_decode, there are no more // decode jobs. 
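// Illustrative note (not part of libaom): assuming 64x64 superblocks
// (sb_mi_size == 16), after the first superblock row of a tile is parsed,
// mi_rows_parse_done may be 16 while mi_rows_decode_started is still 0;
// decode workers may then advance mi_rows_decode_started up to 16, but never
// past it, until parsing publishes the next superblock row.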
// Indicates the progress of the bit-stream parsing of superblocks. // Initialized to 0. Incremented by sb_mi_size when parse sb row is done. int mi_rows_parse_done; // Indicates the progress of the decoding of superblocks. // Initialized to 0. Incremented by sb_mi_size when decode sb row is started. int mi_rows_decode_started; // Boolean: Initialized to 0 (false). Set to 1 (true) on error to abort // decoding. int row_mt_exit; } AV1DecRowMTInfo; typedef struct TileDataDec { TileInfo tile_info; aom_reader bit_reader; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); AV1DecRowMTSync dec_row_mt_sync; } TileDataDec; typedef struct TileBufferDec { const uint8_t *data; size_t size; } TileBufferDec; typedef struct DataBuffer { const uint8_t *data; size_t size; } DataBuffer; typedef struct EXTERNAL_REFERENCES { YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES]; int num; } EXTERNAL_REFERENCES; typedef struct TileJobsDec { TileBufferDec *tile_buffer; TileDataDec *tile_data; } TileJobsDec; typedef struct AV1DecTileMTData { #if CONFIG_MULTITHREAD pthread_mutex_t *job_mutex; #endif TileJobsDec *job_queue; int jobs_enqueued; int jobs_dequeued; int alloc_tile_rows; int alloc_tile_cols; } AV1DecTileMT; typedef struct AV1Decoder { DecoderCodingBlock dcb; DECLARE_ALIGNED(32, AV1_COMMON, common); AVxWorker lf_worker; AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; AV1LrStruct lr_ctxt; AV1CdefSync cdef_sync; AV1CdefWorkerData *cdef_worker; AVxWorker *tile_workers; int num_workers; DecWorkerData *thread_data; ThreadData td; TileDataDec *tile_data; int allocated_tiles; TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; AV1DecTileMT tile_mt_info; // Each time the decoder is called, we expect to receive a full temporal unit. // This can contain up to one shown frame per spatial layer in the current // operating point (note that some layers may be entirely omitted). // If the 'output_all_layers' option is true, we save all of these shown // frames so that they can be returned to the application. If the // 'output_all_layers' option is false, then we only output one image per // temporal unit. // // Note: The saved buffers are released at the start of the next time the // application calls aom_codec_decode(). int output_all_layers; RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS]; size_t num_output_frames; // How many frames are queued up so far? // In order to properly support random-access decoding, we need // to behave slightly differently for the very first frame we decode. // So we track whether this is the first frame or not. int decoding_first_frame; int allow_lowbitdepth; int max_threads; int inv_tile_order; int need_resync; // wait for key/intra-only frame. int reset_decoder_state; int tile_size_bytes; int tile_col_size_bytes; int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding #if CONFIG_ACCOUNTING int acct_enabled; Accounting accounting; #endif int sequence_header_ready; int sequence_header_changed; #if CONFIG_INSPECTION aom_inspect_cb inspect_cb; void *inspect_ctx; #endif int operating_point; int current_operating_point; int seen_frame_header; // The expected start_tile (tg_start syntax element) of the next tile group. int next_start_tile; // State if the camera frame header is already decoded while // large_scale_tile = 1. 
int camera_frame_header_ready; size_t frame_header_size; DataBuffer obu_size_hdr; int output_frame_width_in_tiles_minus_1; int output_frame_height_in_tiles_minus_1; int tile_count_minus_1; uint32_t coded_tile_data_size; unsigned int ext_tile_debug; // for ext-tile software debug & testing // Decoder has 3 modes of operation: // (1) Single-threaded decoding. // (2) Multi-threaded decoding with each tile decoded in parallel. // (3) In addition to (2), each thread decodes 1 superblock row in parallel. // row_mt = 1 triggers mode (3) above, while row_mt = 0, will trigger mode (1) // or (2) depending on 'max_threads'. unsigned int row_mt; EXTERNAL_REFERENCES ext_refs; YV12_BUFFER_CONFIG tile_list_outbuf; // Coding block buffer for the current frame. // Allocated and used only for multi-threaded decoding with 'row_mt == 0'. // See also: similar buffer in 'ThreadData' struct. CB_BUFFER *cb_buffer_base; // Allocated size of 'cb_buffer_base'. Currently same as the number of // superblocks in the coded frame. int cb_buffer_alloc_size; int allocated_row_mt_sync_rows; #if CONFIG_MULTITHREAD pthread_mutex_t *row_mt_mutex_; pthread_cond_t *row_mt_cond_; #endif AV1DecRowMTInfo frame_row_mt_info; aom_metadata_array_t *metadata; int context_update_tile_id; int skip_loop_filter; int skip_film_grain; int is_annexb; int valid_for_referencing[REF_FRAMES]; int is_fwd_kf_present; int is_arf_frame_present; int num_tile_groups; aom_s_frame_info sframe_info; /*! * Elements part of the sequence header, that are applicable for all the * frames in the video. */ SequenceHeader seq_params; /*! * If true, buffer removal times are present. */ bool buffer_removal_time_present; /*! * Code and details about current error status. */ struct aom_internal_error_info error; /*! * Number of temporal layers: may be > 1 for SVC (scalable vector coding). */ unsigned int number_temporal_layers; /*! * Number of spatial layers: may be > 1 for SVC (scalable vector coding). */ unsigned int number_spatial_layers; } AV1Decoder; // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error // code and returns a nonzero value on failure. int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, const uint8_t **psource); // Get the frame at a particular index in the output queue int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, aom_film_grain_t **grain_params); int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame); aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx, YV12_BUFFER_CONFIG *sd); aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, int use_external_ref, YV12_BUFFER_CONFIG *sd); aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *sd); struct AV1Decoder *av1_decoder_create(BufferPool *const pool); void av1_decoder_remove(struct AV1Decoder *pbi); void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info); void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); void av1_dec_free_cb_buf(AV1Decoder *pbi); static inline void decrease_ref_count(RefCntBuffer *const buf, BufferPool *const pool) { if (buf != NULL) { --buf->ref_count; // Reference counts should never become negative. If this assertion fails, // there is a bug in our reference count management. assert(buf->ref_count >= 0); // A worker may only get a free framebuffer index when calling get_free_fb. // But the raw frame buffer is not set up until we finish decoding header. 
// So if any error happens during decoding header, frame_bufs[idx] will not // have a valid raw frame buffer. if (buf->ref_count == 0 && buf->raw_frame_buffer.data) { pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer); buf->raw_frame_buffer.data = NULL; buf->raw_frame_buffer.size = 0; buf->raw_frame_buffer.priv = NULL; } } } #define ACCT_STR __func__ static inline int av1_read_uniform(aom_reader *r, int n) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; const int v = aom_read_literal(r, l - 1, ACCT_STR); assert(l != 0); if (v < m) return v; else return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR); } typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, aom_reader *r); void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, aom_reader *r, palette_visitor_fn_t visit); typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, int mi_row, int mi_col, aom_reader *r, PARTITION_TYPE partition, BLOCK_SIZE bsize); /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_DECODER_DECODER_H_ aom-3.12.1/av1/decoder/decodetxb.c000066400000000000000000000322571477627663500166130ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/decoder/decodetxb.h" #include "aom_ports/mem.h" #include "av1/common/idct.h" #include "av1/common/scan.h" #include "av1/common/txb_common.h" #include "av1/decoder/decodemv.h" #define ACCT_STR __func__ static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { int x = 1; int length = 0; int i = 0; while (!i) { i = aom_read_bit(r, ACCT_STR); ++length; if (length > 20) { aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, "Invalid length in read_golomb"); break; } } for (i = 0; i < length - 1; ++i) { x <<= 1; x += aom_read_bit(r, ACCT_STR); } return x - 1; } static inline int rec_eob_pos(const int eob_token, const int extra) { int eob = av1_eob_group_start[eob_token]; if (eob > 2) { eob += extra; } return eob; } static inline int get_dqv(const int16_t *dequant, int coeff_idx, const qm_val_t *iqmatrix) { int dqv = dequant[!!coeff_idx]; if (iqmatrix != NULL) dqv = ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; return dqv; } static inline void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, int start_si, int end_si, const int16_t *scan, int bhl, uint8_t *levels, base_cdf_arr base_cdf, br_cdf_arr br_cdf) { for (int c = end_si; c >= start_si; --c) { const int pos = scan[c]; const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bhl, tx_size); const int nsymbs = 4; int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); if (level > NUM_BASE_LEVELS) { const int br_ctx = get_br_ctx_2d(levels, pos, bhl); aom_cdf_prob *cdf = br_cdf[br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); level += k; if (k < BR_CDF_SIZE - 1) break; } } levels[get_padded_idx(pos, bhl)] = level; } } static inline void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, 
TX_CLASS tx_class, int start_si, int end_si, const int16_t *scan, int bhl, uint8_t *levels, base_cdf_arr base_cdf, br_cdf_arr br_cdf) { for (int c = end_si; c >= start_si; --c) { const int pos = scan[c]; const int coeff_ctx = get_lower_levels_ctx(levels, pos, bhl, tx_size, tx_class); const int nsymbs = 4; int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); if (level > NUM_BASE_LEVELS) { const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); aom_cdf_prob *cdf = br_cdf[br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); level += k; if (k < BR_CDF_SIZE - 1) break; } } levels[get_padded_idx(pos, bhl)] = level; } } static uint8_t read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int blk_row, const int blk_col, const int plane, const TXB_CTX *const txb_ctx, const TX_SIZE tx_size) { MACROBLOCKD *const xd = &dcb->xd; FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; const int32_t max_value = (1 << (7 + xd->bd)) - 1; const int32_t min_value = -(1 << (7 + xd->bd)); const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id]; tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane]; const int shift = av1_get_tx_scale(tx_size); const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); int cul_level = 0; int dc_val = 0; uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); const int all_zero = aom_read_symbol( r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR); eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; uint16_t *const eob = &(eob_data->eob); uint16_t *const max_scan_line = &(eob_data->max_scan_line); *max_scan_line = 0; *eob = 0; #if CONFIG_INSPECTION if (plane == 0) { const int txk_type_idx = av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col); mbmi->tx_skip[txk_type_idx] = all_zero; } #endif if (all_zero) { *max_scan_line = 0; if (plane == 0) { xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT; } return 0; } if (plane == AOM_PLANE_Y) { // only y plane's tx_type is transmitted av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); } const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); const TX_CLASS tx_class = tx_type_to_class[tx_type]; const qm_val_t *iqmatrix = av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); const int16_t *const scan = scan_order->scan; int eob_extra = 0; int eob_pt = 1; const int eob_multi_size = txsize_log2_minus4[tx_size]; const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; switch (eob_multi_size) { case 0: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5, ACCT_STR) + 1; break; case 1: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6, ACCT_STR) + 1; break; case 2: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7, ACCT_STR) + 1; break; case 3: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8, ACCT_STR) + 1; break; case 4: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9, ACCT_STR) + 1; break; case 5: eob_pt = aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10, ACCT_STR) + 1; break; case 6: default: eob_pt = aom_read_symbol( r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11, ACCT_STR) + 1; break; } const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; if (eob_offset_bits > 0) { const int eob_ctx = eob_pt - 3; int bit = aom_read_symbol( r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); if (bit) { eob_extra += (1 << (eob_offset_bits - 1)); } for (int i = 1; i < eob_offset_bits; i++) { bit = aom_read_bit(r, ACCT_STR); if (bit) { eob_extra += (1 << (eob_offset_bits - 1 - i)); } } } *eob = rec_eob_pos(eob_pt, eob_extra); if (*eob > 1) { memset(levels_buf, 0, sizeof(*levels_buf) * ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END)); } { // Read the non-zero coefficient with scan index eob-1 // TODO(angiebird): Put this into a function const int c = *eob - 1; const int pos = scan[c]; const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, c); const int nsymbs = 3; aom_cdf_prob *cdf = ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; if (level > NUM_BASE_LEVELS) { const int br_ctx = get_br_ctx_eob(pos, bhl, tx_class); cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); level += k; if (k < BR_CDF_SIZE - 1) break; } } levels[get_padded_idx(pos, bhl)] = level; } if (*eob > 1) { base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type]; br_cdf_arr br_cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type]; if (tx_class == TX_CLASS_2D) { read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bhl, levels, base_cdf, br_cdf); read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bhl, levels, base_cdf, br_cdf); } else { read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bhl, levels, base_cdf, br_cdf); } } for (int c = 0; c < *eob; ++c) { const int pos = scan[c]; uint8_t sign; tran_low_t level = levels[get_padded_idx(pos, bhl)]; if (level) { *max_scan_line = AOMMAX(*max_scan_line, pos); if (c == 0) { const int dc_sign_ctx = txb_ctx->dc_sign_ctx; sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2, ACCT_STR); } else { sign = aom_read_bit(r, ACCT_STR); } if (level >= MAX_BASE_BR_RANGE) { level += read_golomb(xd, r); } if (c == 0) dc_val = sign ? 
-level : level; // Bitmasking to clamp level to valid range: // The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit level &= 0xfffff; cul_level += level; tran_low_t dq_coeff; // Bitmasking to clamp dq_coeff to valid range: // The valid range for 8/10/12 bit video is at most 17/19/21 bit dq_coeff = (tran_low_t)((int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); dq_coeff = dq_coeff >> shift; if (sign) { dq_coeff = -dq_coeff; } tcoeffs[pos] = clamp(dq_coeff, min_value, max_value); } } cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); // DC value set_dc_sign(&cul_level, dc_val); return cul_level; } void av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size) { #if TXCOEFF_TIMER struct aom_usec_timer timer; aom_usec_timer_start(&timer); #endif MACROBLOCKD *const xd = &dcb->xd; MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize = mbmi->bsize; assert(bsize < BLOCK_SIZES_ALL); const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col, pd->left_entropy_context + row, &txb_ctx); const uint8_t cul_level = read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size); av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row); if (is_inter_block(mbmi)) { const PLANE_TYPE plane_type = get_plane_type(plane); // tx_type will be read out in av1_read_coeffs_txb_facade const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, cm->features.reduced_tx_set_used); if (plane == 0) { const int txw = tx_size_wide_unit[tx_size]; const int txh = tx_size_high_unit[tx_size]; // The 16x16 unit is due to the constraint from tx_64x64 which sets the // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block // size, the constraint takes effect in 32x16 / 16x32 size too. To solve // the intricacy, cover all the 16x16 units inside a 64 level transform. if (txw == tx_size_wide_unit[TX_64X64] || txh == tx_size_high_unit[TX_64X64]) { const int tx_unit = tx_size_wide_unit[TX_16X16]; const int stride = xd->tx_type_map_stride; for (int idy = 0; idy < txh; idy += tx_unit) { for (int idx = 0; idx < txw; idx += tx_unit) { xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type; } } } } } #if TXCOEFF_TIMER aom_usec_timer_mark(&timer); const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); cm->txcoeff_timer += elapsed_time; ++cm->txb_count; #endif } aom-3.12.1/av1/decoder/decodetxb.h000066400000000000000000000017771477627663500166230ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_DECODER_DECODETXB_H_ #define AOM_AV1_DECODER_DECODETXB_H_ #include "av1/common/enums.h" struct aom_reader; struct AV1Common; struct DecoderCodingBlock; struct txb_ctx; void av1_read_coeffs_txb(const struct AV1Common *const cm, struct DecoderCodingBlock *dcb, struct aom_reader *const r, const int plane, const int row, const int col, const TX_SIZE tx_size); #endif // AOM_AV1_DECODER_DECODETXB_H_ aom-3.12.1/av1/decoder/detokenize.c000066400000000000000000000057641477627663500170160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/decoder/detokenize.h" #define ACCT_STR __func__ #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/idct.h" static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { uint8_t color_order[PALETTE_MAX_SIZE]; const int n = param->n_colors; uint8_t *const color_map = param->color_map; MapCdf color_map_cdf = param->map_cdf; int plane_block_width = param->plane_width; int plane_block_height = param->plane_height; int rows = param->rows; int cols = param->cols; // The first color index. color_map[0] = av1_read_uniform(r, n); assert(color_map[0] < n); // Run wavefront on the palette map index decoding. for (int i = 1; i < rows + cols - 1; ++i) { for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) { const int color_ctx = av1_get_palette_color_index_context( color_map, plane_block_width, (i - j), j, n, color_order, NULL); const int color_idx = aom_read_symbol( r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR); assert(color_idx >= 0 && color_idx < n); color_map[(i - j) * plane_block_width + j] = color_order[color_idx]; } } // Copy last column to extra columns. if (cols < plane_block_width) { for (int i = 0; i < rows; ++i) { memset(color_map + i * plane_block_width + cols, color_map[i * plane_block_width + cols - 1], (plane_block_width - cols)); } } // Copy last row to extra rows. for (int i = rows; i < plane_block_height; ++i) { memcpy(color_map + i * plane_block_width, color_map + (rows - 1) * plane_block_width, plane_block_width); } } void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r) { assert(plane == 0 || plane == 1); Av1ColorMapParam params; params.color_map = xd->plane[plane].color_index_map + xd->color_index_map_offset[plane]; params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf : xd->tile_ctx->palette_y_color_index_cdf; const MB_MODE_INFO *const mbmi = xd->mi[0]; params.n_colors = mbmi->palette_mode_info.palette_size[plane]; av1_get_block_dimensions(mbmi->bsize, plane, xd, ¶ms.plane_width, ¶ms.plane_height, ¶ms.rows, ¶ms.cols); decode_color_map_tokens(¶ms, r); } aom-3.12.1/av1/decoder/detokenize.h000066400000000000000000000016151477627663500170120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_DETOKENIZE_H_ #define AOM_AV1_DECODER_DETOKENIZE_H_ #include "config/aom_config.h" #include "av1/common/scan.h" #include "av1/decoder/decoder.h" #ifdef __cplusplus extern "C" { #endif void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_DECODER_DETOKENIZE_H_ aom-3.12.1/av1/decoder/dthread.h000066400000000000000000000026731477627663500162710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_DTHREAD_H_ #define AOM_AV1_DECODER_DTHREAD_H_ #include "config/aom_config.h" #include "aom/internal/aom_codec_internal.h" #ifdef __cplusplus extern "C" { #endif struct AV1Common; struct AV1Decoder; struct ThreadData; typedef struct DecWorkerData { struct ThreadData *td; const uint8_t *data_end; struct aom_internal_error_info error_info; } DecWorkerData; // WorkerData for the FrameWorker thread. It contains all the information of // the worker and decode structures for decoding a frame. typedef struct FrameWorkerData { struct AV1Decoder *pbi; const uint8_t *data; const uint8_t *data_end; size_t data_size; void *user_priv; int received_frame; int frame_context_ready; // Current frame's context is ready to read. int frame_decoded; // Finished decoding current frame. } FrameWorkerData; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_DECODER_DTHREAD_H_ aom-3.12.1/av1/decoder/grain_synthesis.c000066400000000000000000001766171477627663500200740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Describes film grain parameters and film grain synthesis * */ #include #include #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "av1/decoder/grain_synthesis.h" // Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits) // with zero mean and standard deviation of about 512. // should be divided by 4 for 10-bit range and 16 for 8-bit range. 
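// Illustrative sketch (not part of libaom): one way to narrow a 12-bit-range
// Gaussian sample from the table below to the working bit depth, per the
// comment above (divide by 16 for 8-bit content, by 4 for 10-bit, keep as-is
// for 12-bit). The function name and plain right-shift are assumptions for
// illustration only; the actual grain generator applies its own shift while
// filling the grain blocks.
static inline int example_scale_gaussian_sample(int gauss_12bit, int bit_depth) {
  const int shift = 12 - bit_depth;  // 4 for 8-bit, 2 for 10-bit, 0 for 12-bit
  return gauss_12bit >> shift;
}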
static const int gaussian_sequence[2048] = { 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, 908, -248, 
500, 716, -576, 492, -576, 16, 720, -108, 384, 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, 64, 184, 352, 600, 460, 164, 
604, -196, 320, -64, 588, -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, -136, 
304, 160, -64, -580, 248, 336, -720, 560, -348, -288, -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, 428, -484 }; static const int gauss_bits = 11; static int luma_subblock_size_y = 32; static int luma_subblock_size_x = 32; static int chroma_subblock_size_y = 16; static int chroma_subblock_size_x = 16; static const int min_luma_legal_range = 16; static const int max_luma_legal_range = 235; static const int min_chroma_legal_range = 16; static const int max_chroma_legal_range = 240; static int scaling_lut_y[256]; static int scaling_lut_cb[256]; static int scaling_lut_cr[256]; static int grain_min; static int grain_max; static uint16_t random_register = 0; // random number generator register static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, int ***pred_pos_chroma, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, int **y_line_buf, int **cb_line_buf, int **cr_line_buf, int **y_col_buf, int **cb_col_buf, int **cr_col_buf) { int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); int num_pos_chroma = num_pos_luma; if (params->num_y_points > 0) ++num_pos_chroma; if (*pred_pos_luma) { for (int row = 0; row < num_pos_luma; row++) { aom_free((*pred_pos_luma)[row]); } aom_free(*pred_pos_luma); *pred_pos_luma = NULL; } if (*pred_pos_chroma) { for (int row = 0; row < num_pos_chroma; row++) { aom_free((*pred_pos_chroma)[row]); } aom_free(*pred_pos_chroma); *pred_pos_chroma = NULL; } aom_free(*y_line_buf); *y_line_buf = NULL; aom_free(*cb_line_buf); *cb_line_buf = NULL; aom_free(*cr_line_buf); *cr_line_buf = NULL; aom_free(*y_col_buf); *y_col_buf = NULL; aom_free(*cb_col_buf); *cb_col_buf = NULL; aom_free(*cr_col_buf); *cr_col_buf = NULL; aom_free(*luma_grain_block); *luma_grain_block = NULL; aom_free(*cb_grain_block); *cb_grain_block = NULL; aom_free(*cr_grain_block); *cr_grain_block = NULL; } static bool init_arrays(const aom_film_grain_t *params, int luma_stride, int chroma_stride, int ***pred_pos_luma_p, int ***pred_pos_chroma_p, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, int **y_line_buf, int **cb_line_buf, int **cr_line_buf, int **y_col_buf, int **cb_col_buf, int **cr_col_buf, int luma_grain_samples, int chroma_grain_samples, int chroma_subsamp_y, int chroma_subsamp_x) { *pred_pos_luma_p = NULL; *pred_pos_chroma_p = NULL; *luma_grain_block = NULL; *cb_grain_block = NULL; *cr_grain_block = NULL; *y_line_buf = NULL; *cb_line_buf = NULL; *cr_line_buf = NULL; *y_col_buf = NULL; *cb_col_buf = NULL; *cr_col_buf = NULL; memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256); int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); int num_pos_chroma = num_pos_luma; if (params->num_y_points > 0) ++num_pos_chroma; int **pred_pos_luma; int **pred_pos_chroma; pred_pos_luma = (int **)aom_calloc(num_pos_luma, sizeof(*pred_pos_luma)); if (!pred_pos_luma) return false; for (int row = 0; row < num_pos_luma; row++) { pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); if (!pred_pos_luma[row]) { dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, 
cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); return false; } } pred_pos_chroma = (int **)aom_calloc(num_pos_chroma, sizeof(*pred_pos_chroma)); if (!pred_pos_chroma) { dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); return false; } for (int row = 0; row < num_pos_chroma; row++) { pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); if (!pred_pos_chroma[row]) { dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); return false; } } int pos_ar_index = 0; for (int row = -params->ar_coeff_lag; row < 0; row++) { for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; col++) { pred_pos_luma[pos_ar_index][0] = row; pred_pos_luma[pos_ar_index][1] = col; pred_pos_luma[pos_ar_index][2] = 0; pred_pos_chroma[pos_ar_index][0] = row; pred_pos_chroma[pos_ar_index][1] = col; pred_pos_chroma[pos_ar_index][2] = 0; ++pos_ar_index; } } for (int col = -params->ar_coeff_lag; col < 0; col++) { pred_pos_luma[pos_ar_index][0] = 0; pred_pos_luma[pos_ar_index][1] = col; pred_pos_luma[pos_ar_index][2] = 0; pred_pos_chroma[pos_ar_index][0] = 0; pred_pos_chroma[pos_ar_index][1] = col; pred_pos_chroma[pos_ar_index][2] = 0; ++pos_ar_index; } if (params->num_y_points > 0) { pred_pos_chroma[pos_ar_index][0] = 0; pred_pos_chroma[pos_ar_index][1] = 0; pred_pos_chroma[pos_ar_index][2] = 1; } *pred_pos_luma_p = pred_pos_luma; *pred_pos_chroma_p = pred_pos_chroma; *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * (2 >> chroma_subsamp_y)); *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * (2 >> chroma_subsamp_y)); *y_col_buf = (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); *cb_col_buf = (int *)aom_malloc(sizeof(**cb_col_buf) * (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * (2 >> chroma_subsamp_x)); *cr_col_buf = (int *)aom_malloc(sizeof(**cr_col_buf) * (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * (2 >> chroma_subsamp_x)); *luma_grain_block = (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); *cb_grain_block = (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); *cr_grain_block = (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); if (!(*pred_pos_luma_p && *pred_pos_chroma_p && *y_line_buf && *cb_line_buf && *cr_line_buf && *y_col_buf && *cb_col_buf && *cr_col_buf && *luma_grain_block && *cb_grain_block && *cr_grain_block)) { dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); return false; } return true; } // get a number between 0 and 2^bits - 1 static inline int get_random_number(int bits) { uint16_t bit; bit = ((random_register >> 0) ^ (random_register >> 1) ^ (random_register >> 3) ^ (random_register >> 12)) & 1; random_register = (random_register >> 1) | (bit << 15); return (random_register >> (16 - bits)) & ((1 << bits) - 1); } static void init_random_generator(int luma_line, uint16_t seed) { // same for the picture uint16_t msb = (seed >> 8) & 255; uint16_t lsb = seed & 255; random_register = (msb << 8) + lsb; // changes for each row int 
luma_num = luma_line >> 5; random_register ^= ((luma_num * 37 + 178) & 255) << 8; random_register ^= ((luma_num * 173 + 105) & 255); } static void generate_luma_grain_block( const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad) { if (params->num_y_points == 0) { memset(luma_grain_block, 0, sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); return; } int bit_depth = params->bit_depth; int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); int rounding_offset = (1 << (params->ar_coeff_shift - 1)); for (int i = 0; i < luma_block_size_y; i++) for (int j = 0; j < luma_block_size_x; j++) luma_grain_block[i * luma_grain_stride + j] = (gaussian_sequence[get_random_number(gauss_bits)] + ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++) for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { int wsum = 0; for (int pos = 0; pos < num_pos_luma; pos++) { wsum = wsum + params->ar_coeffs_y[pos] * luma_grain_block[(i + pred_pos_luma[pos][0]) * luma_grain_stride + j + pred_pos_luma[pos][1]]; } luma_grain_block[i * luma_grain_stride + j] = clamp(luma_grain_block[i * luma_grain_stride + j] + ((wsum + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } } static bool generate_chroma_grain_blocks( const aom_film_grain_t *params, int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { int bit_depth = params->bit_depth; int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); if (params->num_y_points > 0) ++num_pos_chroma; int rounding_offset = (1 << (params->ar_coeff_shift - 1)); int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; if (params->num_cb_points || params->chroma_scaling_from_luma) { init_random_generator(7 << 5, params->random_seed); for (int i = 0; i < chroma_block_size_y; i++) for (int j = 0; j < chroma_block_size_x; j++) cb_grain_block[i * chroma_grain_stride + j] = (gaussian_sequence[get_random_number(gauss_bits)] + ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_block_size); } if (params->num_cr_points || params->chroma_scaling_from_luma) { init_random_generator(11 << 5, params->random_seed); for (int i = 0; i < chroma_block_size_y; i++) for (int j = 0; j < chroma_block_size_x; j++) cr_grain_block[i * chroma_grain_stride + j] = (gaussian_sequence[get_random_number(gauss_bits)] + ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_block_size); } for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { int wsum_cb = 0; int wsum_cr = 0; for (int pos = 0; pos < num_pos_chroma; pos++) { if (pred_pos_chroma[pos][2] == 0) { wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * cb_grain_block[(i + pred_pos_chroma[pos][0]) * chroma_grain_stride + j + pred_pos_chroma[pos][1]]; wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] 
* cr_grain_block[(i + pred_pos_chroma[pos][0]) * chroma_grain_stride + j + pred_pos_chroma[pos][1]]; } else if (pred_pos_chroma[pos][2] == 1) { int av_luma = 0; int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; k++) for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; l++) av_luma += luma_grain_block[k * luma_grain_stride + l]; av_luma = (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> (chroma_subsamp_y + chroma_subsamp_x); wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; } else { fprintf( stderr, "Grain synthesis: prediction between two chroma components is " "not supported!"); return false; } } if (params->num_cb_points || params->chroma_scaling_from_luma) cb_grain_block[i * chroma_grain_stride + j] = clamp(cb_grain_block[i * chroma_grain_stride + j] + ((wsum_cb + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); if (params->num_cr_points || params->chroma_scaling_from_luma) cr_grain_block[i * chroma_grain_stride + j] = clamp(cr_grain_block[i * chroma_grain_stride + j] + ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } return true; } static void init_scaling_function(const int scaling_points[][2], int num_points, int scaling_lut[]) { if (num_points == 0) return; for (int i = 0; i < scaling_points[0][0]; i++) scaling_lut[i] = scaling_points[0][1]; for (int point = 0; point < num_points - 1; point++) { int delta_y = scaling_points[point + 1][1] - scaling_points[point][1]; int delta_x = scaling_points[point + 1][0] - scaling_points[point][0]; int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); for (int x = 0; x < delta_x; x++) { scaling_lut[scaling_points[point][0] + x] = scaling_points[point][1] + (int)((x * delta + 32768) >> 16); } } for (int i = scaling_points[num_points - 1][0]; i < 256; i++) scaling_lut[i] = scaling_points[num_points - 1][1]; } // function that extracts samples from a LUT (and interpolates intemediate // frames for 10- and 12-bit video) static int scale_LUT(int *scaling_lut, int index, int bit_depth) { int x = index >> (bit_depth - 8); if (!(bit_depth - 8) || x == 255) return scaling_lut[x]; else return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) * (index & ((1 << (bit_depth - 8)) - 1)) + (1 << (bit_depth - 9))) >> (bit_depth - 8)); } static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, int luma_grain_stride, int chroma_grain_stride, int half_luma_height, int half_luma_width, int bit_depth, int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { int cb_mult = params->cb_mult - 128; // fixed scale int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale int cb_offset = params->cb_offset - 256; int cr_mult = params->cr_mult - 128; // fixed scale int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale int cr_offset = params->cr_offset - 256; int rounding_offset = (1 << (params->scaling_shift - 1)); int apply_y = params->num_y_points > 0 ? 1 : 0; int apply_cb = (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; int apply_cr = (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 
1 : 0; if (params->chroma_scaling_from_luma) { cb_mult = 0; // fixed scale cb_luma_mult = 64; // fixed scale cb_offset = 0; cr_mult = 0; // fixed scale cr_luma_mult = 64; // fixed scale cr_offset = 0; } int min_luma, max_luma, min_chroma, max_chroma; if (params->clip_to_restricted_range) { min_luma = min_luma_legal_range; max_luma = max_luma_legal_range; if (mc_identity) { min_chroma = min_luma_legal_range; max_chroma = max_luma_legal_range; } else { min_chroma = min_chroma_legal_range; max_chroma = max_chroma_legal_range; } } else { min_luma = min_chroma = 0; max_luma = max_chroma = 255; } for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { int average_luma = 0; if (chroma_subsamp_x) { average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + (j << chroma_subsamp_x)] + luma[(i << chroma_subsamp_y) * luma_stride + (j << chroma_subsamp_x) + 1] + 1) >> 1; } else { average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; } if (apply_cb) { cb[i * chroma_stride + j] = clamp( cb[i * chroma_stride + j] + ((scale_LUT(scaling_lut_cb, clamp(((average_luma * cb_luma_mult + cb_mult * cb[i * chroma_stride + j]) >> 6) + cb_offset, 0, (256 << (bit_depth - 8)) - 1), 8) * cb_grain[i * chroma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_chroma, max_chroma); } if (apply_cr) { cr[i * chroma_stride + j] = clamp( cr[i * chroma_stride + j] + ((scale_LUT(scaling_lut_cr, clamp(((average_luma * cr_luma_mult + cr_mult * cr[i * chroma_stride + j]) >> 6) + cr_offset, 0, (256 << (bit_depth - 8)) - 1), 8) * cr_grain[i * chroma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_chroma, max_chroma); } } } if (apply_y) { for (int i = 0; i < (half_luma_height << 1); i++) { for (int j = 0; j < (half_luma_width << 1); j++) { luma[i * luma_stride + j] = clamp(luma[i * luma_stride + j] + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * luma_grain[i * luma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_luma, max_luma); } } } } static void add_noise_to_block_hbd( const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, int luma_grain_stride, int chroma_grain_stride, int half_luma_height, int half_luma_width, int bit_depth, int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { int cb_mult = params->cb_mult - 128; // fixed scale int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale // offset value depends on the bit depth int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); int cr_mult = params->cr_mult - 128; // fixed scale int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale // offset value depends on the bit depth int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); int rounding_offset = (1 << (params->scaling_shift - 1)); int apply_y = params->num_y_points > 0 ? 1 : 0; int apply_cb = (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 : 0; int apply_cr = (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 
1 : 0; if (params->chroma_scaling_from_luma) { cb_mult = 0; // fixed scale cb_luma_mult = 64; // fixed scale cb_offset = 0; cr_mult = 0; // fixed scale cr_luma_mult = 64; // fixed scale cr_offset = 0; } int min_luma, max_luma, min_chroma, max_chroma; if (params->clip_to_restricted_range) { min_luma = min_luma_legal_range << (bit_depth - 8); max_luma = max_luma_legal_range << (bit_depth - 8); if (mc_identity) { min_chroma = min_luma_legal_range << (bit_depth - 8); max_chroma = max_luma_legal_range << (bit_depth - 8); } else { min_chroma = min_chroma_legal_range << (bit_depth - 8); max_chroma = max_chroma_legal_range << (bit_depth - 8); } } else { min_luma = min_chroma = 0; max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; } for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { int average_luma = 0; if (chroma_subsamp_x) { average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + (j << chroma_subsamp_x)] + luma[(i << chroma_subsamp_y) * luma_stride + (j << chroma_subsamp_x) + 1] + 1) >> 1; } else { average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; } if (apply_cb) { cb[i * chroma_stride + j] = clamp( cb[i * chroma_stride + j] + ((scale_LUT(scaling_lut_cb, clamp(((average_luma * cb_luma_mult + cb_mult * cb[i * chroma_stride + j]) >> 6) + cb_offset, 0, (256 << (bit_depth - 8)) - 1), bit_depth) * cb_grain[i * chroma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_chroma, max_chroma); } if (apply_cr) { cr[i * chroma_stride + j] = clamp( cr[i * chroma_stride + j] + ((scale_LUT(scaling_lut_cr, clamp(((average_luma * cr_luma_mult + cr_mult * cr[i * chroma_stride + j]) >> 6) + cr_offset, 0, (256 << (bit_depth - 8)) - 1), bit_depth) * cr_grain[i * chroma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_chroma, max_chroma); } } } if (apply_y) { for (int i = 0; i < (half_luma_height << 1); i++) { for (int j = 0; j < (half_luma_width << 1); j++) { luma[i * luma_stride + j] = clamp(luma[i * luma_stride + j] + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], bit_depth) * luma_grain[i * luma_grain_stride + j] + rounding_offset) >> params->scaling_shift), min_luma, max_luma); } } } } static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width, int height, int use_high_bit_depth) { int hbd_coeff = use_high_bit_depth ? 
2 : 1; while (height) { memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); src += src_stride; dst += dst_stride; --height; } return; } static void copy_area(int *src, int src_stride, int *dst, int dst_stride, int width, int height) { while (height) { memcpy(dst, src, width * sizeof(*src)); src += src_stride; dst += dst_stride; --height; } return; } static void extend_even(uint8_t *dst, int dst_stride, int width, int height, int use_high_bit_depth) { if ((width & 1) == 0 && (height & 1) == 0) return; if (use_high_bit_depth) { uint16_t *dst16 = (uint16_t *)dst; int dst16_stride = dst_stride / 2; if (width & 1) { for (int i = 0; i < height; ++i) dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; } width = (width + 1) & (~1); if (height & 1) { memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], sizeof(*dst16) * width); } } else { if (width & 1) { for (int i = 0; i < height; ++i) dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; } width = (width + 1) & (~1); if (height & 1) { memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], sizeof(*dst) * width); } } } static void ver_boundary_overlap(int *left_block, int left_stride, int *right_block, int right_stride, int *dst_block, int dst_stride, int width, int height) { if (width == 1) { while (height) { *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, grain_min, grain_max); left_block += left_stride; right_block += right_stride; dst_block += dst_stride; --height; } return; } else if (width == 2) { while (height) { dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, grain_min, grain_max); dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, grain_min, grain_max); left_block += left_stride; right_block += right_stride; dst_block += dst_stride; --height; } return; } } static void hor_boundary_overlap(int *top_block, int top_stride, int *bottom_block, int bottom_stride, int *dst_block, int dst_stride, int width, int height) { if (height == 1) { while (width) { *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, grain_min, grain_max); ++top_block; ++bottom_block; ++dst_block; --width; } return; } else if (height == 2) { while (width) { dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, grain_min, grain_max); dst_block[dst_stride] = clamp((17 * top_block[top_stride] + 27 * bottom_block[bottom_stride] + 16) >> 5, grain_min, grain_max); ++top_block; ++bottom_block; ++dst_block; --width; } return; } } /*!\brief Add film grain * * Add film grain to an image * * Returns 0 for success, -1 for failure * * \param[in] grain_params Grain parameters * \param[in] luma luma plane * \param[in] cb cb plane * \param[in] cr cr plane * \param[in] height luma plane height * \param[in] width luma plane width * \param[in] luma_stride luma plane stride * \param[in] chroma_stride chroma plane stride */ static int add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int height, int width, int luma_stride, int chroma_stride, int use_high_bit_depth, int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { int **pred_pos_luma; int **pred_pos_chroma; int *luma_grain_block; int *cb_grain_block; int *cr_grain_block; int *y_line_buf; int *cb_line_buf; int *cr_line_buf; int *y_col_buf; int *cb_col_buf; int *cr_col_buf; random_register = params->random_seed; int left_pad = 3; int right_pad = 3; // padding to offset for AR coefficients int top_pad = 3; int bottom_pad = 
0; int ar_padding = 3; // maximum lag used for stabilization of AR coefficients luma_subblock_size_y = 32; luma_subblock_size_x = 32; chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y; chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; // Initial padding is only needed for generation of // film grain templates (to stabilize the AR process) // Only a 64x64 luma and 32x32 chroma part of a template // is used later for adding grain, padding can be discarded int luma_block_size_y = top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + 2 * ar_padding + right_pad; int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + chroma_subblock_size_y * 2 + bottom_pad; int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + chroma_subblock_size_x * 2 + (2 >> chroma_subsamp_x) * ar_padding + right_pad; int luma_grain_stride = luma_block_size_x; int chroma_grain_stride = chroma_block_size_x; int overlap = params->overlap_flag; int bit_depth = params->bit_depth; const int grain_center = 128 << (bit_depth - 8); grain_min = 0 - grain_center; grain_max = grain_center - 1; if (!init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf, luma_block_size_y * luma_block_size_x, chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, chroma_subsamp_x)) return -1; generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, luma_block_size_y, luma_block_size_x, luma_grain_stride, left_pad, top_pad, right_pad, bottom_pad); if (!generate_chroma_grain_blocks( params, pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, luma_grain_stride, chroma_block_size_y, chroma_block_size_x, chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, chroma_subsamp_y, chroma_subsamp_x)) return -1; init_scaling_function(params->scaling_points_y, params->num_y_points, scaling_lut_y); if (params->chroma_scaling_from_luma) { memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256); memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); } else { init_scaling_function(params->scaling_points_cb, params->num_cb_points, scaling_lut_cb); init_scaling_function(params->scaling_points_cr, params->num_cr_points, scaling_lut_cr); } for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { init_random_generator(y * 2, params->random_seed); for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { int offset_y = get_random_number(8); int offset_x = (offset_y >> 4) & 15; offset_y &= 15; int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + offset_y * (2 >> chroma_subsamp_y); int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + offset_x * (2 >> chroma_subsamp_x); if (overlap && x) { ver_boundary_overlap( y_col_buf, 2, luma_grain_block + luma_offset_y * luma_grain_stride + luma_offset_x, luma_grain_stride, y_col_buf, 2, 2, AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); ver_boundary_overlap( cb_col_buf, 2 >> chroma_subsamp_x, cb_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x, chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_x, 
AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), (height - (y << 1)) >> chroma_subsamp_y)); ver_boundary_overlap( cr_col_buf, 2 >> chroma_subsamp_x, cr_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x, chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_x, AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), (height - (y << 1)) >> chroma_subsamp_y)); int i = y ? 1 : 0; if (use_high_bit_depth) { add_noise_to_block_hbd( params, (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), (uint16_t *)cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + (x << (1 - chroma_subsamp_x)), (uint16_t *)cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + (x << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, y_col_buf + i * 4, cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), 2, (2 - chroma_subsamp_x), AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } else { add_noise_to_block( params, luma + ((y + i) << 1) * luma_stride + (x << 1), cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + (x << (1 - chroma_subsamp_x)), cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + (x << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, y_col_buf + i * 4, cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), 2, (2 - chroma_subsamp_x), AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } } if (overlap && y) { if (x) { hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, y_line_buf + (x << 1), luma_stride, 2, 2); hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, cb_line_buf + x * (2 >> chroma_subsamp_x), chroma_stride, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, cr_line_buf + x * (2 >> chroma_subsamp_x), chroma_stride, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); } hor_boundary_overlap( y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, luma_grain_block + luma_offset_y * luma_grain_stride + luma_offset_x + (x ? 2 : 0), luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), width - ((x ? x + 1 : 0) << 1)), 2); hor_boundary_overlap( cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, cb_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), chroma_grain_stride, cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, AOMMIN(chroma_subblock_size_x - ((x ? 1 : 0) << (1 - chroma_subsamp_x)), (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), 2 >> chroma_subsamp_y); hor_boundary_overlap( cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, cr_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), chroma_grain_stride, cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, AOMMIN(chroma_subblock_size_x - ((x ? 1 : 0) << (1 - chroma_subsamp_x)), (width - ((x ? 
x + 1 : 0) << 1)) >> chroma_subsamp_x), 2 >> chroma_subsamp_y); if (use_high_bit_depth) { add_noise_to_block_hbd( params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1), (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + (x << ((1 - chroma_subsamp_x))), (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + (x << ((1 - chroma_subsamp_x))), luma_stride, chroma_stride, y_line_buf + (x << 1), cb_line_buf + (x << (1 - chroma_subsamp_x)), cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, 1, AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } else { add_noise_to_block( params, luma + (y << 1) * luma_stride + (x << 1), cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + (x << ((1 - chroma_subsamp_x))), cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + (x << ((1 - chroma_subsamp_x))), luma_stride, chroma_stride, y_line_buf + (x << 1), cb_line_buf + (x << (1 - chroma_subsamp_x)), cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, 1, AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } } int i = overlap && y ? 1 : 0; int j = overlap && x ? 1 : 0; if (use_high_bit_depth) { add_noise_to_block_hbd( params, (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), (uint16_t *)cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + ((x + j) << (1 - chroma_subsamp_x)), (uint16_t *)cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + ((x + j) << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + luma_offset_x + (j << 1), cb_grain_block + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * chroma_grain_stride + chroma_offset_x + (j << (1 - chroma_subsamp_x)), cr_grain_block + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * chroma_grain_stride + chroma_offset_x + (j << (1 - chroma_subsamp_x)), luma_grain_stride, chroma_grain_stride, AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } else { add_noise_to_block( params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + ((x + j) << (1 - chroma_subsamp_x)), cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + ((x + j) << (1 - chroma_subsamp_x)), luma_stride, chroma_stride, luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + luma_offset_x + (j << 1), cb_grain_block + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * chroma_grain_stride + chroma_offset_x + (j << (1 - chroma_subsamp_x)), cr_grain_block + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * chroma_grain_stride + chroma_offset_x + (j << (1 - chroma_subsamp_x)), luma_grain_stride, chroma_grain_stride, AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } if (overlap) { if (x) { // Copy overlapped column bufer to line buffer copy_area(y_col_buf + (luma_subblock_size_y << 1), 2, y_line_buf + (x << 1), luma_stride, 2, 2); copy_area( cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), 2 >> chroma_subsamp_x, cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); copy_area( cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), 2 
>> chroma_subsamp_x, cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); } // Copy grain to the line buffer for overlap with a bottom block copy_area( luma_grain_block + (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + luma_offset_x + ((x ? 2 : 0)), luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); copy_area(cb_grain_block + (chroma_offset_y + chroma_subblock_size_y) * chroma_grain_stride + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), chroma_grain_stride, cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, AOMMIN(chroma_subblock_size_x, ((width - (x << 1)) >> chroma_subsamp_x)) - (x ? 2 >> chroma_subsamp_x : 0), 2 >> chroma_subsamp_y); copy_area(cr_grain_block + (chroma_offset_y + chroma_subblock_size_y) * chroma_grain_stride + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), chroma_grain_stride, cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), chroma_stride, AOMMIN(chroma_subblock_size_x, ((width - (x << 1)) >> chroma_subsamp_x)) - (x ? 2 >> chroma_subsamp_x : 0), 2 >> chroma_subsamp_y); // Copy grain to the column buffer for overlap with the next block to // the right copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + luma_offset_x + luma_subblock_size_x, luma_grain_stride, y_col_buf, 2, 2, AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x + chroma_subblock_size_x, chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_x, AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), (height - (y << 1)) >> chroma_subsamp_y)); copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + chroma_offset_x + chroma_subblock_size_x, chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_x, AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), (height - (y << 1)) >> chroma_subsamp_y)); } } } dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); return 0; } int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, aom_image_t *dst) { uint8_t *luma, *cb, *cr; int height, width, luma_stride, chroma_stride; int use_high_bit_depth = 0; int chroma_subsamp_x = 0; int chroma_subsamp_y = 0; int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 
1 : 0; switch (src->fmt) { case AOM_IMG_FMT_AOMI420: case AOM_IMG_FMT_I420: use_high_bit_depth = 0; chroma_subsamp_x = 1; chroma_subsamp_y = 1; break; case AOM_IMG_FMT_I42016: use_high_bit_depth = 1; chroma_subsamp_x = 1; chroma_subsamp_y = 1; break; // case AOM_IMG_FMT_444A: case AOM_IMG_FMT_I444: use_high_bit_depth = 0; chroma_subsamp_x = 0; chroma_subsamp_y = 0; break; case AOM_IMG_FMT_I44416: use_high_bit_depth = 1; chroma_subsamp_x = 0; chroma_subsamp_y = 0; break; case AOM_IMG_FMT_I422: use_high_bit_depth = 0; chroma_subsamp_x = 1; chroma_subsamp_y = 0; break; case AOM_IMG_FMT_I42216: use_high_bit_depth = 1; chroma_subsamp_x = 1; chroma_subsamp_y = 0; break; default: // unknown input format fprintf(stderr, "Film grain error: input format is not supported!"); return -1; } assert(params->bit_depth == src->bit_depth); dst->fmt = src->fmt; dst->bit_depth = src->bit_depth; dst->r_w = src->r_w; dst->r_h = src->r_h; dst->d_w = src->d_w; dst->d_h = src->d_h; dst->cp = src->cp; dst->tc = src->tc; dst->mc = src->mc; dst->monochrome = src->monochrome; dst->csp = src->csp; dst->range = src->range; dst->x_chroma_shift = src->x_chroma_shift; dst->y_chroma_shift = src->y_chroma_shift; dst->temporal_id = src->temporal_id; dst->spatial_id = src->spatial_id; width = src->d_w % 2 ? src->d_w + 1 : src->d_w; height = src->d_h % 2 ? src->d_h + 1 : src->d_h; copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, src->d_h, use_high_bit_depth); // Note that dst is already assumed to be aligned to even. extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, src->d_h, use_high_bit_depth); if (!src->monochrome) { copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], width >> chroma_subsamp_x, height >> chroma_subsamp_y, use_high_bit_depth); copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], width >> chroma_subsamp_x, height >> chroma_subsamp_y, use_high_bit_depth); } luma = dst->planes[AOM_PLANE_Y]; cb = dst->planes[AOM_PLANE_U]; cr = dst->planes[AOM_PLANE_V]; // luma and chroma strides in samples luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; return add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, chroma_stride, use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } aom-3.12.1/av1/decoder/grain_synthesis.h000066400000000000000000000024041477627663500200570ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ /*!\file * \brief Describes film grain synthesis * */ #ifndef AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ #define AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ #ifdef __cplusplus extern "C" { #endif #include #include "aom_dsp/grain_params.h" #include "aom/aom_image.h" /*!\brief Add film grain * * Add film grain to an image * * Returns 0 for success, -1 for failure * * \param[in] grain_params Grain parameters * \param[in] src Source image * \param[out] dst Resulting image with grain */ int av1_add_film_grain(const aom_film_grain_t *grain_params, const aom_image_t *src, aom_image_t *dst); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ aom-3.12.1/av1/decoder/inspection.c000066400000000000000000000140111477627663500170110ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "av1/decoder/decoder.h" #include "av1/decoder/inspection.h" #include "av1/common/enums.h" #include "av1/common/cdef.h" static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) { fd->mi_cols = mi_cols; fd->mi_rows = mi_rows; fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows * fd->mi_cols); if (!fd->mi_grid) { fprintf(stderr, "Error allocating inspection data\n"); abort(); } } void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) { int mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2; int mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2; ifd_init_mi_rc(fd, mi_cols, mi_rows); } void ifd_clear(insp_frame_data *fd) { aom_free(fd->mi_grid); fd->mi_grid = NULL; } /* TODO(negge) This function may be called by more than one thread when using a multi-threaded decoder and this may cause a data race. 
*/ int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) { struct AV1Decoder *pbi = (struct AV1Decoder *)decoder; AV1_COMMON *const cm = &pbi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const CommonQuantParams *quant_params = &cm->quant_params; if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) { ifd_clear(fd); ifd_init_mi_rc(fd, mi_params->mi_rows, mi_params->mi_cols); } fd->show_existing_frame = cm->show_existing_frame; fd->frame_number = cm->current_frame.frame_number; fd->show_frame = cm->show_frame; fd->frame_type = cm->current_frame.frame_type; fd->base_qindex = quant_params->base_qindex; // Set width and height of the first tile until generic support can be added TileInfo tile_info; av1_tile_set_row(&tile_info, cm, 0); av1_tile_set_col(&tile_info, cm, 0); fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start; fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start; fd->delta_q_present_flag = cm->delta_q_info.delta_q_present_flag; fd->delta_q_res = cm->delta_q_info.delta_q_res; #if CONFIG_ACCOUNTING fd->accounting = &pbi->accounting; #endif // TODO(negge): copy per frame CDEF data int i, j; for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < 2; j++) { fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j]; fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j]; fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j]; } } for (j = 0; j < mi_params->mi_rows; j++) { for (i = 0; i < mi_params->mi_cols; i++) { const MB_MODE_INFO *mbmi = mi_params->mi_grid_base[j * mi_params->mi_stride + i]; insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i]; // Segment mi->segment_id = mbmi->segment_id; // Motion Vectors mi->mv[0].row = mbmi->mv[0].as_mv.row; mi->mv[0].col = mbmi->mv[0].as_mv.col; mi->mv[1].row = mbmi->mv[1].as_mv.row; mi->mv[1].col = mbmi->mv[1].as_mv.col; // Reference Frames mi->ref_frame[0] = mbmi->ref_frame[0]; mi->ref_frame[1] = mbmi->ref_frame[1]; // Prediction Mode mi->mode = mbmi->mode; mi->intrabc = (int16_t)mbmi->use_intrabc; mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0]; mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1]; // Prediction Mode for Chromatic planes if (mi->mode < INTRA_MODES) { mi->uv_mode = mbmi->uv_mode; } else { mi->uv_mode = UV_MODE_INVALID; } mi->motion_mode = mbmi->motion_mode; mi->compound_type = mbmi->interinter_comp.type; // Block Size mi->bsize = mbmi->bsize; // Skip Flag mi->skip = mbmi->skip_txfm; mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0); mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1); mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1]; // Transform // TODO(anyone): extract tx type info from mbmi->txk_type[]. 
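// The per-4x4 transform info extracted below is taken from the mode-info
// unit at row offset r, column offset c inside the coding block: inter (and
// intrabc) blocks store one transform size per transform sub-block in
// inter_tx_size[], indexed via av1_get_txb_size_index(), while intra blocks
// carry a single tx_size. The transform type is then read from tx_type_map[]
// at the top-left mi unit of the enclosing transform block.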
const BLOCK_SIZE bsize = mbmi->bsize; const int c = i % mi_size_wide[bsize]; const int r = j % mi_size_high[bsize]; if (is_inter_block(mbmi) || is_intrabc_block(mbmi)) mi->tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(bsize, r, c)]; else mi->tx_size = mbmi->tx_size; if (skip_not_transform && mi->skip) mi->tx_size = -1; if (mi->skip) { const int tx_type_row = j - j % tx_size_high_unit[mi->tx_size]; const int tx_type_col = i - i % tx_size_wide_unit[mi->tx_size]; const int tx_type_map_idx = tx_type_row * mi_params->mi_stride + tx_type_col; mi->tx_type = mi_params->tx_type_map[tx_type_map_idx]; } else { mi->tx_type = 0; } if (skip_not_transform && (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)])) mi->tx_type = -1; mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS; mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS; mi->cdef_strength += mi->cdef_strength == 3; if (mbmi->uv_mode == UV_CFL_PRED) { mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; mi->cfl_alpha_sign = mbmi->cfl_alpha_signs; } else { mi->cfl_alpha_idx = 0; mi->cfl_alpha_sign = 0; } // delta_q mi->current_qindex = mbmi->current_qindex; } } return 1; } aom-3.12.1/av1/decoder/inspection.h000066400000000000000000000043521477627663500170250ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_DECODER_INSPECTION_H_ #define AOM_AV1_DECODER_INSPECTION_H_ #ifdef __cplusplus extern "C" { #endif // __cplusplus #include "av1/common/seg_common.h" #if CONFIG_ACCOUNTING #include "av1/decoder/accounting.h" #endif #ifndef AOM_AOM_AOMDX_H_ typedef void (*aom_inspect_cb)(void *decoder, void *data); #endif typedef struct insp_mv insp_mv; struct insp_mv { int16_t row; int16_t col; }; typedef struct insp_mi_data insp_mi_data; struct insp_mi_data { insp_mv mv[2]; int16_t ref_frame[2]; int16_t mode; int16_t uv_mode; int16_t bsize; int16_t skip; int16_t segment_id; int16_t dual_filter_type; int16_t filter[2]; int16_t tx_type; int16_t tx_size; int16_t cdef_level; int16_t cdef_strength; int16_t cfl_alpha_idx; int16_t cfl_alpha_sign; int16_t current_qindex; int16_t compound_type; int16_t motion_mode; int16_t intrabc; int16_t palette; int16_t uv_palette; }; typedef struct insp_frame_data insp_frame_data; struct insp_frame_data { #if CONFIG_ACCOUNTING Accounting *accounting; #endif insp_mi_data *mi_grid; int16_t frame_number; int show_frame; int frame_type; int base_qindex; int mi_rows; int mi_cols; int tile_mi_rows; int tile_mi_cols; int16_t y_dequant[MAX_SEGMENTS][2]; int16_t u_dequant[MAX_SEGMENTS][2]; int16_t v_dequant[MAX_SEGMENTS][2]; // TODO(negge): add per frame CDEF data int delta_q_present_flag; int delta_q_res; int show_existing_frame; }; void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); void ifd_clear(insp_frame_data *fd); int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // AOM_AV1_DECODER_INSPECTION_H_ aom-3.12.1/av1/decoder/obu.c000066400000000000000000001261521477627663500154350ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_ports/mem_ops.h" #include "av1/common/common.h" #include "av1/common/obu_util.h" #include "av1/common/timing.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" #include "av1/decoder/obu.h" aom_codec_err_t aom_get_num_layers_from_operating_point_idc( int operating_point_idc, unsigned int *number_spatial_layers, unsigned int *number_temporal_layers) { // derive number of spatial/temporal layers from operating_point_idc if (!number_spatial_layers || !number_temporal_layers) return AOM_CODEC_INVALID_PARAM; if (operating_point_idc == 0) { *number_temporal_layers = 1; *number_spatial_layers = 1; } else { *number_spatial_layers = 0; *number_temporal_layers = 0; for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) { *number_spatial_layers += (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1; } for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) { *number_temporal_layers += (operating_point_idc >> j) & 0x1; } } return AOM_CODEC_OK; } static int is_obu_in_current_operating_point(AV1Decoder *pbi, const ObuHeader *obu_header) { if (!pbi->current_operating_point || !obu_header->has_extension) { return 1; } if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 && (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) & 0x1) { return 1; } return 0; } static int byte_alignment(AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { while (rb->bit_offset & 7) { if (aom_rb_read_bit(rb)) { cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } return 0; } static uint32_t read_temporal_delimiter_obu(void) { return 0; } // Returns a boolean that indicates success. static int read_bitstream_level(AV1_LEVEL *seq_level_idx, struct aom_read_bit_buffer *rb) { *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); if (!is_valid_seq_level_idx(*seq_level_idx)) return 0; return 1; } // Returns whether two sequence headers are consistent with each other. // Note that the 'op_params' field is not compared per Section 7.5 in the spec: // Within a particular coded video sequence, the contents of // sequence_header_obu must be bit-identical each time the sequence header // appears except for the contents of operating_parameters_info. static int are_seq_headers_consistent(const SequenceHeader *seq_params_old, const SequenceHeader *seq_params_new) { return !memcmp(seq_params_old, seq_params_new, offsetof(SequenceHeader, op_params)); } // On success, sets pbi->sequence_header_ready to 1 and returns the number of // bytes read from 'rb'. // On failure, sets pbi->common.error.error_code and returns 0. static uint32_t read_sequence_header_obu(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; const uint32_t saved_bit_offset = rb->bit_offset; // Verify rb has been configured to report errors. assert(rb->error_handler); // Use a local variable to store the information as we decode. At the end, // if no errors have occurred, cm->seq_params is updated. 
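// Decode-then-commit: the header is parsed into the local copy 'sh' and is
// only copied back into cm->seq_params once the trailing bits have been
// validated, so a corrupt or unsupported header cannot partially overwrite a
// previously accepted sequence header.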
SequenceHeader sh = *cm->seq_params; SequenceHeader *const seq_params = &sh; seq_params->profile = av1_read_profile(rb); if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } // Still picture or not seq_params->still_picture = aom_rb_read_bit(rb); seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); // Video must have reduced_still_picture_hdr = 0 if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } if (seq_params->reduced_still_picture_hdr) { seq_params->timing_info_present = 0; seq_params->decoder_model_info_present_flag = 0; seq_params->display_model_info_present_flag = 0; seq_params->operating_points_cnt_minus_1 = 0; seq_params->operating_point_idc[0] = 0; seq_params->has_nonzero_operating_point_idc = false; if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } seq_params->tier[0] = 0; seq_params->op_params[0].decoder_model_param_present_flag = 0; seq_params->op_params[0].display_model_param_present_flag = 0; } else { seq_params->timing_info_present = aom_rb_read_bit(rb); if (seq_params->timing_info_present) { av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb); seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); if (seq_params->decoder_model_info_present_flag) av1_read_decoder_model_info(&seq_params->decoder_model_info, rb); } else { seq_params->decoder_model_info_present_flag = 0; } seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); seq_params->operating_points_cnt_minus_1 = aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); seq_params->has_nonzero_operating_point_idc = false; for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { seq_params->operating_point_idc[i] = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); if (seq_params->operating_point_idc[i] != 0) seq_params->has_nonzero_operating_point_idc = true; if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7 // is equivalent to level 3.3. 
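// The tier flag is only coded for levels 4.0 (seq_level_idx 8) and above;
// lower levels have a single (Main) tier, so tier[i] defaults to 0.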
if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) seq_params->tier[i] = aom_rb_read_bit(rb); else seq_params->tier[i] = 0; if (seq_params->decoder_model_info_present_flag) { seq_params->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb); if (seq_params->op_params[i].decoder_model_param_present_flag) av1_read_op_parameters_info(&seq_params->op_params[i], seq_params->decoder_model_info .encoder_decoder_buffer_delay_length, rb); } else { seq_params->op_params[i].decoder_model_param_present_flag = 0; } if (seq_params->timing_info_present && (seq_params->timing_info.equal_picture_interval || seq_params->op_params[i].decoder_model_param_present_flag)) { seq_params->op_params[i].bitrate = av1_max_level_bitrate( seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass // the check if (seq_params->op_params[i].bitrate == 0) aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of " "profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; } if (seq_params->timing_info_present && seq_params->timing_info.equal_picture_interval && !seq_params->op_params[i].decoder_model_param_present_flag) { // When the decoder_model_parameters are not sent for this op, set // the default ones that can be used with the resource availability mode seq_params->op_params[i].decoder_buffer_delay = 70000; seq_params->op_params[i].encoder_buffer_delay = 20000; seq_params->op_params[i].low_delay_mode_flag = 0; } if (seq_params->display_model_info_present_flag) { seq_params->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb); if (seq_params->op_params[i].display_model_param_present_flag) { seq_params->op_params[i].initial_display_delay = aom_rb_read_literal(rb, 4) + 1; if (seq_params->op_params[i].initial_display_delay > 10) aom_internal_error( &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support more than 10 decoded frames delay"); } else { seq_params->op_params[i].initial_display_delay = 10; } } else { seq_params->op_params[i].display_model_param_present_flag = 0; seq_params->op_params[i].initial_display_delay = 10; } } } // This decoder supports all levels. Choose operating point provided by // external means int operating_point = pbi->operating_point; if (operating_point < 0 || operating_point > seq_params->operating_points_cnt_minus_1) operating_point = 0; pbi->current_operating_point = seq_params->operating_point_idc[operating_point]; if (aom_get_num_layers_from_operating_point_idc( pbi->current_operating_point, &pbi->number_spatial_layers, &pbi->number_temporal_layers) != AOM_CODEC_OK) { pbi->error.error_code = AOM_CODEC_ERROR; return 0; } av1_read_sequence_header(cm, rb, seq_params); av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error); if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " "%d %d subsampling is not supported.\n", seq_params->subsampling_x, seq_params->subsampling_y); } seq_params->film_grain_params_present = aom_rb_read_bit(rb); if (av1_check_trailing_bits(pbi, rb) != 0) { // pbi->error.error_code is already set. 
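// The trailing bits that pad the OBU payload to a byte boundary must be a
// single 1 bit followed by zero bits; anything else indicates a corrupt
// header, so decoding of this OBU stops here.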
return 0; } // If a sequence header has been decoded before, we check if the new // one is consistent with the old one. if (pbi->sequence_header_ready) { if (!are_seq_headers_consistent(cm->seq_params, seq_params)) pbi->sequence_header_changed = 1; } *cm->seq_params = *seq_params; pbi->sequence_header_ready = 1; return ((rb->bit_offset - saved_bit_offset + 7) >> 3); } // On success, returns the frame header size. On failure, calls // aom_internal_error and does not return. If show existing frame, // also marks the data processing to end after the frame header. static uint32_t read_frame_header_obu(AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t **p_data_end, int trailing_bits_present) { const uint32_t hdr_size = av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present); const AV1_COMMON *cm = &pbi->common; if (cm->show_existing_frame) { *p_data_end = data + hdr_size; } return hdr_size; } // On success, returns the tile group header size. On failure, calls // aom_internal_error() and returns -1. static int32_t read_tile_group_header(AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int *start_tile, int *end_tile, int tile_start_implicit) { AV1_COMMON *const cm = &pbi->common; CommonTileParams *const tiles = &cm->tiles; uint32_t saved_bit_offset = rb->bit_offset; int tile_start_and_end_present_flag = 0; const int num_tiles = tiles->rows * tiles->cols; if (!tiles->large_scale && num_tiles > 1) { tile_start_and_end_present_flag = aom_rb_read_bit(rb); if (tile_start_implicit && tile_start_and_end_present_flag) { aom_internal_error( &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); return -1; } } if (tiles->large_scale || num_tiles == 1 || !tile_start_and_end_present_flag) { *start_tile = 0; *end_tile = num_tiles - 1; } else { int tile_bits = tiles->log2_rows + tiles->log2_cols; *start_tile = aom_rb_read_literal(rb, tile_bits); *end_tile = aom_rb_read_literal(rb, tile_bits); } if (*start_tile != pbi->next_start_tile) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_start (%d) must be equal to %d", *start_tile, pbi->next_start_tile); return -1; } if (*start_tile > *end_tile) { aom_internal_error( &pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, *start_tile); return -1; } if (*end_tile >= num_tiles) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be less than NumTiles (%d)", *end_tile, num_tiles); return -1; } pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1; return ((rb->bit_offset - saved_bit_offset + 7) >> 3); } // On success, returns the tile group OBU size. On failure, sets // pbi->common.error.error_code and returns 0. 
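// The returned size covers both the tile group header and the tile payload
// bytes consumed by av1_decode_tg_tiles_and_wrapup(); *is_last_tg is set when
// the tile group ends with the last tile of the frame.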
static uint32_t read_one_tile_group_obu( AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, int *is_last_tg, int tile_start_implicit) { AV1_COMMON *const cm = &pbi->common; int start_tile, end_tile; int32_t header_size, tg_payload_size; assert((rb->bit_offset & 7) == 0); assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data); header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile, tile_start_implicit); if (header_size == -1 || byte_alignment(cm, rb)) return 0; data += header_size; av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, end_tile, is_first_tg); tg_payload_size = (uint32_t)(*p_data_end - data); *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1; return header_size + tg_payload_size; } static void alloc_tile_list_buffer(AV1Decoder *pbi, int tile_width_in_pixels, int tile_height_in_pixels) { // The resolution of the output frame is read out from the bitstream. The data // are stored in the order of Y plane, U plane and V plane. As an example, for // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the // output frame. AV1_COMMON *const cm = &pbi->common; const int output_frame_width = (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; const int output_frame_height = (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels; // The output frame is used to store the decoded tile list. The decoded tile // list has to fit into 1 output frame. assert((pbi->tile_count_minus_1 + 1) <= (pbi->output_frame_width_in_tiles_minus_1 + 1) * (pbi->output_frame_height_in_tiles_minus_1 + 1)); // Allocate the tile list output buffer. // Note: if cm->seq_params->use_highbitdepth is 1 and // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8 // bits/pixel. if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, output_frame_height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, (cm->seq_params->use_highbitdepth && (cm->seq_params->bit_depth > AOM_BITS_8)), 0, cm->features.byte_alignment, false, 0)) aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate the tile list output buffer"); } static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1, int hend1, int vstart1, int vend1, YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2, int plane) { const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0]; const int dst_stride = (plane > 0) ? 
dst->strides[1] : dst->strides[0]; int row, col; assert(src->flags & YV12_FLAG_HIGHBITDEPTH); assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH)); const uint16_t *src16 = CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1); uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2; for (row = vstart1; row < vend1; ++row) { for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++); src16 += src_stride - (hend1 - hstart1); dst8 += dst_stride - (hend1 - hstart1); } return; } static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int tile_idx, int tile_width_in_pixels, int tile_height_in_pixels) { AV1_COMMON *const cm = &pbi->common; const int ssy = cm->seq_params->subsampling_y; const int ssx = cm->seq_params->subsampling_x; const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; const int tr = tile_idx / (pbi->output_frame_width_in_tiles_minus_1 + 1); const int tc = tile_idx % (pbi->output_frame_width_in_tiles_minus_1 + 1); int plane; // Copy decoded tile to the tile list output buffer. for (plane = 0; plane < num_planes; ++plane) { const int shift_x = plane > 0 ? ssx : 0; const int shift_y = plane > 0 ? ssy : 0; const int h = tile_height_in_pixels >> shift_y; const int w = tile_width_in_pixels >> shift_x; // src offset int vstart1 = pbi->dec_tile_row * h; int vend1 = vstart1 + h; int hstart1 = pbi->dec_tile_col * w; int hend1 = hstart1 + w; // dst offset int vstart2 = tr * h; int hstart2 = tc * w; if (cm->seq_params->use_highbitdepth && cm->seq_params->bit_depth == AOM_BITS_8) { yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2, plane); } else { switch (plane) { case 0: aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2); break; case 1: aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2); break; case 2: aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2); break; default: assert(0); } } } } // Only called while large_scale_tile = 1. // // On success, returns the tile list OBU size. On failure, sets // pbi->common.error.error_code and returns 0. static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, int *frame_decoding_finished) { AV1_COMMON *const cm = &pbi->common; uint32_t tile_list_payload_size = 0; const int num_tiles = cm->tiles.cols * cm->tiles.rows; const int start_tile = 0; const int end_tile = num_tiles - 1; int i = 0; // Process the tile list info. pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); // The output frame is used to store the decoded tile list. The decoded tile // list has to fit into 1 output frame. 
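// For example, with output_frame_width_in_tiles_minus_1 = 3 and
// output_frame_height_in_tiles_minus_1 = 1, the output frame holds a 4x2 tile
// grid, so tile_count_minus_1 may be at most 7.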
if ((pbi->tile_count_minus_1 + 1) > (pbi->output_frame_width_in_tiles_minus_1 + 1) * (pbi->output_frame_height_in_tiles_minus_1 + 1)) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } if (pbi->tile_count_minus_1 > MAX_TILES - 1) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } int tile_width, tile_height; if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } const int tile_width_in_pixels = tile_width * MI_SIZE; const int tile_height_in_pixels = tile_height * MI_SIZE; // Allocate output frame buffer for the tile list. alloc_tile_list_buffer(pbi, tile_width_in_pixels, tile_height_in_pixels); uint32_t tile_list_info_bytes = 4; tile_list_payload_size += tile_list_info_bytes; data += tile_list_info_bytes; int tile_idx = 0; for (i = 0; i <= pbi->tile_count_minus_1; i++) { // Process 1 tile. // Reset the bit reader. rb->bit_offset = 0; rb->bit_buffer = data; // Read out the tile info. uint32_t tile_info_bytes = 5; // Set reference for each tile. int ref_idx = aom_rb_read_literal(rb, 8); if (ref_idx >= MAX_EXTERNAL_REFERENCES) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, &pbi->ext_refs.refs[ref_idx]); pbi->dec_tile_row = aom_rb_read_literal(rb, 8); pbi->dec_tile_col = aom_rb_read_literal(rb, 8); if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || pbi->dec_tile_row >= cm->tiles.rows || pbi->dec_tile_col >= cm->tiles.cols) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; data += tile_info_bytes; if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size, p_data_end, start_tile, end_tile, 0); uint32_t tile_payload_size = (uint32_t)(*p_data_end - data); tile_list_payload_size += tile_info_bytes + tile_payload_size; // Update data ptr for next tile decoding. data = *p_data_end; assert(data <= data_end); // Copy the decoded tile to the tile list output buffer. copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx, tile_width_in_pixels, tile_height_in_pixels); tile_idx++; } *frame_decoding_finished = 1; return tile_list_payload_size; } // Returns the last nonzero byte index in 'data'. If there is no nonzero byte in // 'data', returns -1. static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) { // Scan backward and return on the first nonzero byte. int i = (int)sz - 1; while (i >= 0 && data[i] == 0) { --i; } return i; } // Allocates metadata that was read and adds it to the decoders metadata array. 
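// Each call grows pbi->metadata->metadata_array by one entry; the array itself
// is lazily created on the first metadata OBU.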
static void alloc_read_metadata(AV1Decoder *const pbi, OBU_METADATA_TYPE metadata_type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag) { if (!pbi->metadata) { pbi->metadata = aom_img_metadata_array_alloc(0); if (!pbi->metadata) { aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate metadata array"); } } aom_metadata_t *metadata = aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); if (!metadata) { aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error allocating metadata"); } aom_metadata_t **metadata_array = (aom_metadata_t **)realloc(pbi->metadata->metadata_array, (pbi->metadata->sz + 1) * sizeof(metadata)); if (!metadata_array) { aom_img_metadata_free(metadata); aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error growing metadata array"); } pbi->metadata->metadata_array = metadata_array; pbi->metadata->metadata_array[pbi->metadata->sz] = metadata; pbi->metadata->sz++; } // On failure, calls aom_internal_error() and does not return. static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { if (sz == 0) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code is missing"); } int country_code_size = 1; if (*data == 0xFF) { if (sz == 1) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code_extension_byte is missing"); } ++country_code_size; } int end_index = get_last_nonzero_byte_index(data, sz); if (end_index < country_code_size) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "No trailing bits found in ITU-T T.35 metadata OBU"); } // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says: // itu_t_t35_payload_bytes shall be bytes containing data registered as // specified in Recommendation ITU-T T.35. // Therefore the first trailing byte should be 0x80. if (data[end_index] != 0x80) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "The last nonzero byte of the ITU-T T.35 metadata OBU " "is 0x%02x, should be 0x80.", data[end_index]); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index, AOM_MIF_ANY_FRAME); } // On success, returns the number of bytes read from 'data'. On failure, calls // aom_internal_error() and does not return. static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kHdrCllPayloadSize = 4; if (sz < kHdrCllPayloadSize) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR CLL metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize, AOM_MIF_ANY_FRAME); return kHdrCllPayloadSize; } // On success, returns the number of bytes read from 'data'. On failure, calls // aom_internal_error() and does not return. 
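// As with the HDR CLL case above, the payload has a fixed size (24 bytes for
// MDCV versus 4 for CLL), and only that many bytes are consumed even if the
// OBU carries more data.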
static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kMdcvPayloadSize = 24; if (sz < kMdcvPayloadSize) { aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR MDCV metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize, AOM_MIF_ANY_FRAME); return kMdcvPayloadSize; } static void scalability_structure(struct aom_read_bit_buffer *rb) { const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2); const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb); const int spatial_layer_description_present_flag = aom_rb_read_bit(rb); const int temporal_group_description_present_flag = aom_rb_read_bit(rb); // scalability_structure_reserved_3bits must be set to zero and be ignored by // decoders. aom_rb_read_literal(rb, 3); if (spatial_layer_dimensions_present_flag) { for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { aom_rb_read_literal(rb, 16); aom_rb_read_literal(rb, 16); } } if (spatial_layer_description_present_flag) { for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { aom_rb_read_literal(rb, 8); } } if (temporal_group_description_present_flag) { const int temporal_group_size = aom_rb_read_literal(rb, 8); for (int i = 0; i < temporal_group_size; i++) { aom_rb_read_literal(rb, 3); aom_rb_read_bit(rb); aom_rb_read_bit(rb); const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); for (int j = 0; j < temporal_group_ref_cnt; j++) { aom_rb_read_literal(rb, 8); } } } } static void read_metadata_scalability(struct aom_read_bit_buffer *rb) { const int scalability_mode_idc = aom_rb_read_literal(rb, 8); if (scalability_mode_idc == SCALABILITY_SS) { scalability_structure(rb); } } static void read_metadata_timecode(struct aom_read_bit_buffer *rb) { aom_rb_read_literal(rb, 5); // counting_type f(5) const int full_timestamp_flag = aom_rb_read_bit(rb); // full_timestamp_flag f(1) aom_rb_read_bit(rb); // discontinuity_flag (f1) aom_rb_read_bit(rb); // cnt_dropped_flag f(1) aom_rb_read_literal(rb, 9); // n_frames f(9) if (full_timestamp_flag) { aom_rb_read_literal(rb, 6); // seconds_value f(6) aom_rb_read_literal(rb, 6); // minutes_value f(6) aom_rb_read_literal(rb, 5); // hours_value f(5) } else { const int seconds_flag = aom_rb_read_bit(rb); // seconds_flag f(1) if (seconds_flag) { aom_rb_read_literal(rb, 6); // seconds_value f(6) const int minutes_flag = aom_rb_read_bit(rb); // minutes_flag f(1) if (minutes_flag) { aom_rb_read_literal(rb, 6); // minutes_value f(6) const int hours_flag = aom_rb_read_bit(rb); // hours_flag f(1) if (hours_flag) { aom_rb_read_literal(rb, 5); // hours_value f(5) } } } } // time_offset_length f(5) const int time_offset_length = aom_rb_read_literal(rb, 5); if (time_offset_length) { // time_offset_value f(time_offset_length) aom_rb_read_literal(rb, time_offset_length); } } // Returns the last nonzero byte in 'data'. If there is no nonzero byte in // 'data', returns 0. // // Call this function to check the following requirement in the spec: // This implies that when any payload data is present for this OBU type, at // least one byte of the payload data (including the trailing bit) shall not // be equal to 0. static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { // Scan backward and return on the first nonzero byte. size_t i = sz; while (i != 0) { --i; if (data[i] != 0) return data[i]; } return 0; } // Checks the metadata for correct syntax but ignores the parsed metadata. 
// // On success, returns the number of bytes read from 'data'. On failure, sets // pbi->common.error.error_code and returns 0, or calls aom_internal_error() // and does not return. static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { size_t type_length; uint64_t type_value; if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; if (metadata_type == 0 || metadata_type >= 6) { // If metadata_type is reserved for future use or a user private value, // ignore the entire OBU and just check trailing bits. if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; } if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) { // read_metadata_itut_t35() checks trailing bits. read_metadata_itut_t35(pbi, data + type_length, sz - type_length); return sz; } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) { size_t bytes_read = type_length + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) { size_t bytes_read = type_length + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; } struct aom_read_bit_buffer rb; av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz); if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) { read_metadata_scalability(&rb); } else { assert(metadata_type == OBU_METADATA_TYPE_TIMECODE); read_metadata_timecode(&rb); } if (av1_check_trailing_bits(pbi, &rb) != 0) { // pbi->error.error_code is already set. return 0; } assert((rb.bit_offset & 7) == 0); return type_length + (rb.bit_offset >> 3); } // On success, returns 'sz'. On failure, sets pbi->common.error.error_code and // returns 0. static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, size_t sz) { // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So // check trailing bits only if sz > 0. if (sz > 0) { // The payload of a padding OBU is byte aligned. Therefore the first // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); if (last_nonzero_byte != 0x80) { cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } } return sz; } // On success, returns a boolean that indicates whether the decoding of the // current frame is finished. On failure, sets pbi->error.error_code and // returns -1. int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end) { AV1_COMMON *const cm = &pbi->common; int frame_decoding_finished = 0; int is_first_tg_obu_received = 1; // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the // beginning of the frame_header_obu and frame_header_size is set to its // size. This allows us to check if a redundant frame_header_obu is a copy // of the previous frame_header_obu. // // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is // passed as an argument to a 'nonnull' parameter of memcmp(). 
The initial // value will not be used. const uint8_t *frame_header = data; uint32_t frame_header_size = 0; ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); pbi->seen_frame_header = 0; pbi->next_start_tile = 0; pbi->num_tile_groups = 0; if (data_end < data) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0. if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; // decode frame as a series of OBUs while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) { struct aom_read_bit_buffer rb; size_t payload_size = 0; size_t decoded_payload_size = 0; size_t obu_payload_offset = 0; size_t bytes_read = 0; const size_t bytes_available = data_end - data; if (bytes_available == 0 && !pbi->seen_frame_header) { *p_data_end = data; pbi->error.error_code = AOM_CODEC_OK; break; } aom_codec_err_t status = aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) { pbi->error.error_code = status; return -1; } // Record obu size header information. pbi->obu_size_hdr.data = data + obu_header.size; pbi->obu_size_hdr.size = bytes_read - obu_header.size; // Note: aom_read_obu_header_and_size() takes care of checking that this // doesn't cause 'data' to advance past 'data_end'. data += bytes_read; if ((size_t)(data_end - data) < payload_size) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } cm->temporal_layer_id = obu_header.temporal_layer_id; cm->spatial_layer_id = obu_header.spatial_layer_id; if (obu_header.type != OBU_TEMPORAL_DELIMITER && obu_header.type != OBU_SEQUENCE_HEADER) { // don't decode obu if it's not in current operating mode if (!is_obu_in_current_operating_point(pbi, &obu_header)) { data += payload_size; continue; } } av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size); switch (obu_header.type) { case OBU_TEMPORAL_DELIMITER: decoded_payload_size = read_temporal_delimiter_obu(); if (pbi->seen_frame_header) { // A new temporal unit has started, but the frame in the previous // temporal unit is incomplete. pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; case OBU_SEQUENCE_HEADER: decoded_payload_size = read_sequence_header_obu(pbi, &rb); if (pbi->error.error_code != AOM_CODEC_OK) return -1; // The sequence header should not change in the middle of a frame. if (pbi->sequence_header_changed && pbi->seen_frame_header) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; case OBU_FRAME_HEADER: case OBU_REDUNDANT_FRAME_HEADER: case OBU_FRAME: if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { if (!pbi->seen_frame_header) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } else { // OBU_FRAME_HEADER or OBU_FRAME. if (pbi->seen_frame_header) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } // Only decode first frame header received if (!pbi->seen_frame_header || (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) { frame_header_size = read_frame_header_obu( pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); frame_header = data; pbi->seen_frame_header = 1; if (!pbi->ext_tile_debug && cm->tiles.large_scale) pbi->camera_frame_header_ready = 1; } else { // Verify that the frame_header_obu is identical to the original // frame_header_obu. 
if (frame_header_size > payload_size || memcmp(data, frame_header, frame_header_size) != 0) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } assert(rb.bit_offset == 0); rb.bit_offset = 8 * frame_header_size; } decoded_payload_size = frame_header_size; pbi->frame_header_size = frame_header_size; cm->cur_frame->temporal_id = obu_header.temporal_layer_id; cm->cur_frame->spatial_id = obu_header.spatial_layer_id; if (cm->show_existing_frame) { if (obu_header.type == OBU_FRAME) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } frame_decoding_finished = 1; pbi->seen_frame_header = 0; if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) { ++cm->current_frame.frame_number; } break; } // In large scale tile coding, decode the common camera frame header // before any tile list OBU. if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { frame_decoding_finished = 1; // Skip the rest of the frame data. decoded_payload_size = payload_size; // Update data_end. *p_data_end = data_end; break; } if (obu_header.type != OBU_FRAME) break; obu_payload_offset = frame_header_size; // Byte align the reader before reading the tile group. // byte_alignment() has set pbi->error.error_code if it returns -1. if (byte_alignment(cm, &rb)) return -1; AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. case OBU_TILE_GROUP: if (!pbi->seen_frame_header) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } if (obu_payload_offset > payload_size) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size += read_one_tile_group_obu( pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, data + payload_size, p_data_end, &frame_decoding_finished, obu_header.type == OBU_FRAME); if (pbi->error.error_code != AOM_CODEC_OK) return -1; is_first_tg_obu_received = 0; if (frame_decoding_finished) { pbi->seen_frame_header = 0; pbi->next_start_tile = 0; } pbi->num_tile_groups++; break; case OBU_METADATA: decoded_payload_size = read_metadata(pbi, data, payload_size); if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_TILE_LIST: if (CONFIG_NORMAL_TILE_MODE) { pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } // This OBU type is purely for the large scale tile coding mode. // The common camera frame header has to be already decoded. 
if (!pbi->camera_frame_header_ready) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } cm->tiles.large_scale = 1; av1_set_single_tile_decoding_mode(cm); decoded_payload_size = read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, p_data_end, &frame_decoding_finished); if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_PADDING: decoded_payload_size = read_padding(cm, data, payload_size); if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; default: // Skip unrecognized OBUs if (payload_size > 0 && get_last_nonzero_byte(data, payload_size) == 0) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size = payload_size; break; } // Check that the signalled OBU size matches the actual amount of data read if (decoded_payload_size > payload_size) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } // If there are extra padding bytes, they should all be zero while (decoded_payload_size < payload_size) { uint8_t padding_byte = data[decoded_payload_size++]; if (padding_byte != 0) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } data += payload_size; } if (pbi->error.error_code != AOM_CODEC_OK) return -1; return frame_decoding_finished; } aom-3.12.1/av1/decoder/obu.h000066400000000000000000000023651477627663500154410ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_DECODER_OBU_H_ #define AOM_AV1_DECODER_OBU_H_ #include "aom/aom_codec.h" #include "av1/decoder/decoder.h" // Try to decode one frame from a buffer. // Returns 1 if we decoded a frame, // 0 if we didn't decode a frame but that's okay // (eg, if there was a frame but we skipped it), // or -1 on error int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end); aom_codec_err_t aom_get_num_layers_from_operating_point_idc( int operating_point_idc, unsigned int *number_spatial_layers, unsigned int *number_temporal_layers); #endif // AOM_AV1_DECODER_OBU_H_ aom-3.12.1/av1/encoder/000077500000000000000000000000001477627663500145075ustar00rootroot00000000000000aom-3.12.1/av1/encoder/allintra_vis.c000066400000000000000000001244401477627663500173470ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
 */

#include <math.h>

#include "config/aom_config.h"

#include "aom_util/aom_pthread.h"

#if CONFIG_TFLITE
#include "tensorflow/lite/c/c_api.h"
#include "av1/encoder/deltaq4_model.c"
#endif

#include "av1/common/common_data.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#include "av1/common/reconinter.h"
#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/aq_variance.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/ethread.h"
#include "av1/encoder/hybrid_fwd_txfm.h"
#include "av1/encoder/model_rd.h"
#include "av1/encoder/rdopt_utils.h"

#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
#define MB_WIENER_PRED_BUF_STRIDE 128

// Maximum delta-q range allowed for Variance Boost after scaling
#define VAR_BOOST_MAX_DELTAQ_RANGE 80
// Maximum quantization step boost allowed for Variance Boost
#define VAR_BOOST_MAX_BOOST 8.0

void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
  const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd);
  assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
  const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
  const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
  assert(buf_width == MB_WIENER_PRED_BUF_STRIDE);
  const size_t buf_size =
      (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf))
      << is_high_bitdepth;
  CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size));
}

void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
  aom_free(td->wiener_tmp_pred_buf);
  td->wiener_tmp_pred_buf = NULL;
}

void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;

  // This block size is also used to determine number of workers in
  // multi-threading. If it is changed, one needs to change it accordingly in
  // "compute_num_ai_workers()".
  cpi->weber_bsize = BLOCK_8X8;

  if (cpi->oxcf.enable_rate_guide_deltaq) {
    if (cpi->mb_weber_stats && cpi->prep_rate_estimates &&
        cpi->ext_rate_distribution)
      return;
  } else {
    if (cpi->mb_weber_stats) return;
  }

  CHECK_MEM_ERROR(cm, cpi->mb_weber_stats,
                  aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
                             sizeof(*cpi->mb_weber_stats)));
  if (cpi->oxcf.enable_rate_guide_deltaq) {
    CHECK_MEM_ERROR(
        cm, cpi->prep_rate_estimates,
        aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
                   sizeof(*cpi->prep_rate_estimates)));
    CHECK_MEM_ERROR(
        cm, cpi->ext_rate_distribution,
        aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
                   sizeof(*cpi->ext_rate_distribution)));
  }
}

static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
                        int mi_col) {
  AV1_COMMON *const cm = &cpi->common;
  const int mi_wide = mi_size_wide[bsize];
  const int mi_high = mi_size_high[bsize];
  const int mi_step = mi_size_wide[cpi->weber_bsize];
  int mb_stride = cpi->frame_info.mi_cols;
  int mb_count = 0;
  int64_t satd = 0;

  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
        continue;

      satd +=
          cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
              .satd;
      ++mb_count;
    }
  }

  if (mb_count) satd = (int)(satd / mb_count);
  satd = AOMMAX(1, satd);
  return (int)satd;
}

static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
                       int mi_col) {
  AV1_COMMON *const cm = &cpi->common;
  const int mi_wide = mi_size_wide[bsize];
  const int mi_high = mi_size_high[bsize];
  const int mi_step = mi_size_wide[cpi->weber_bsize];
  int mb_stride = cpi->frame_info.mi_cols;
  int mb_count = 0;
  int64_t distortion = 0;

  for (int row = mi_row;
row < mi_row + mi_high; row += mi_step) { for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) continue; distortion += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)] .distortion; ++mb_count; } } if (mb_count) distortion = (int)(distortion / mb_count); distortion = AOMMAX(1, distortion); return (int)distortion; } static double get_max_scale(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int mi_step = mi_size_wide[cpi->weber_bsize]; int mb_stride = cpi->frame_info.mi_cols; double min_max_scale = 10.0; for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) continue; const WeberStats *weber_stats = &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; if (weber_stats->max_scale < 1.0) continue; if (weber_stats->max_scale < min_max_scale) min_max_scale = weber_stats->max_scale; } } return min_max_scale; } static int get_window_wiener_var(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int mi_step = mi_size_wide[cpi->weber_bsize]; int sb_wiener_var = 0; int mb_stride = cpi->frame_info.mi_cols; int mb_count = 0; double base_num = 1; double base_den = 1; double base_reg = 1; for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) continue; const WeberStats *weber_stats = &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; base_num += ((double)weber_stats->distortion) * sqrt((double)weber_stats->src_variance) * weber_stats->rec_pix_max; base_den += fabs( weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) - weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance)); base_reg += sqrt((double)weber_stats->distortion) * sqrt((double)weber_stats->src_pix_max) * 0.1; ++mb_count; } } sb_wiener_var = (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count); sb_wiener_var = AOMMAX(1, sb_wiener_var); return (int)sb_wiener_var; } static int get_var_perceptual_ai(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col); if (mi_row >= (mi_high / 2)) { sb_wiener_var = AOMMIN(sb_wiener_var, get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col)); } if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) { sb_wiener_var = AOMMIN(sb_wiener_var, get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col)); } if (mi_col >= (mi_wide / 2)) { sb_wiener_var = AOMMIN(sb_wiener_var, get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2)); } if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) { sb_wiener_var = AOMMIN(sb_wiener_var, get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2)); } return sb_wiener_var; } static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { const SCAN_ORDER *const scan_order = 
&av1_scan_orders[tx_size][DCT_DCT]; assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); int rate_cost = 1; for (int idx = 0; idx < eob; ++idx) { int abs_level = abs(qcoeff[scan_order->scan[idx]]); rate_cost += (int)(log1p(abs_level) / log(2.0)) + 1 + (abs_level > 0); } return (rate_cost << AV1_PROB_COST_SHIFT); } void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_row, int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, double *sum_rec_distortion, double *sum_est_rate, uint8_t *pred_buffer) { AV1_COMMON *const cm = &cpi->common; uint8_t *buffer = cpi->source->y_buffer; int buf_stride = cpi->source->y_stride; MB_MODE_INFO mbmi; memset(&mbmi, 0, sizeof(mbmi)); MB_MODE_INFO *mbmi_ptr = &mbmi; xd->mi = &mbmi_ptr; const BLOCK_SIZE bsize = cpi->weber_bsize; const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int block_size = tx_size_wide[tx_size]; const int coeff_count = block_size * block_size; const int mb_step = mi_size_wide[bsize]; const BitDepthInfo bd_info = get_bit_depth_info(xd); const MultiThreadInfo *const mt_info = &cpi->mt_info; const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; const int mi_cols = cm->mi_params.mi_cols; const int mt_thread_id = mi_row / mb_step; // TODO(chengchen): test different unit step size const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step; int mt_unit_col = 0; const int is_high_bitdepth = is_cur_buf_hbd(xd); uint8_t *dst_buffer = pred_buffer; const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE; if (is_high_bitdepth) { uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer; dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16); } for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) { if (mi_col % mt_unit_step == 0) { intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id, mt_unit_col); #if CONFIG_MULTITHREAD const int num_workers = AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); if (num_workers > 1) { const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; pthread_mutex_lock(enc_row_mt->mutex_); const bool exit = enc_row_mt->mb_wiener_mt_exit; pthread_mutex_unlock(enc_row_mt->mutex_); // Stop further processing in case any worker has encountered an error. if (exit) break; } #endif } PREDICTION_MODE best_mode = DC_PRED; int best_intra_cost = INT_MAX; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col); set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows), AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols)); set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], av1_num_planes(cm)); xd->mi[0]->bsize = bsize; xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; // Set above and left mbmi to NULL as they are not available in the // preprocessing stage. // They are used to detemine intra edge filter types in intra prediction. 
if (xd->up_available) { xd->above_mbmi = NULL; } if (xd->left_available) { xd->left_mbmi = NULL; } uint8_t *mb_buffer = buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE; for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END; ++mode) { // TODO(chengchen): Here we use src instead of reconstructed frame as // the intra predictor to make single and multithread version match. // Ideally we want to use the reconstructed. av1_predict_intra_block( xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES, mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); int intra_cost = aom_satd(coeff, coeff_count); if (intra_cost < best_intra_cost) { best_intra_cost = intra_cost; best_mode = mode; } } av1_predict_intra_block( xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES, mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); const struct macroblock_plane *const p = &x->plane[0]; uint16_t eob; const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; QUANT_PARAM quant_param; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order, &quant_param); } else { av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order, &quant_param); } #else av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order, &quant_param); #endif // CONFIG_AV1_HIGHBITDEPTH if (cpi->oxcf.enable_rate_guide_deltaq) { const int rate_cost = rate_estimator(qcoeff, eob, tx_size); cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols + (mi_col / mb_step)] = rate_cost; } av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer, dst_buffer_stride, eob, 0); WeberStats *weber_stats = &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols + (mi_col / mb_step)]; weber_stats->rec_pix_max = 1; weber_stats->rec_variance = 0; weber_stats->src_pix_max = 1; weber_stats->src_variance = 0; weber_stats->distortion = 0; int64_t src_mean = 0; int64_t rec_mean = 0; int64_t dist_mean = 0; for (int pix_row = 0; pix_row < block_size; ++pix_row) { for (int pix_col = 0; pix_col < block_size; ++pix_col) { int src_pix, rec_pix; #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer); uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer); src_pix = src[pix_row * buf_stride + pix_col]; rec_pix = rec[pix_row * dst_buffer_stride + pix_col]; } else { src_pix = mb_buffer[pix_row * buf_stride + pix_col]; rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col]; } #else src_pix = mb_buffer[pix_row * buf_stride + pix_col]; rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col]; #endif src_mean += src_pix; rec_mean += rec_pix; dist_mean += src_pix - rec_pix; weber_stats->src_variance += src_pix * src_pix; weber_stats->rec_variance += rec_pix * 
rec_pix; weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix); weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix); weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix); } } if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { *sum_rec_distortion += weber_stats->distortion; int est_block_rate = 0; int64_t est_block_dist = 0; model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion, pix_num, &est_block_rate, &est_block_dist); *sum_est_rate += est_block_rate; } weber_stats->src_variance -= (src_mean * src_mean) / pix_num; weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num; weber_stats->distortion -= (dist_mean * dist_mean) / pix_num; weber_stats->satd = best_intra_cost; qcoeff[0] = 0; int max_scale = 0; for (int idx = 1; idx < coeff_count; ++idx) { const int abs_qcoeff = abs(qcoeff[idx]); max_scale = AOMMAX(max_scale, abs_qcoeff); } weber_stats->max_scale = max_scale; if ((mi_col + mb_step) % mt_unit_step == 0 || (mi_col + mb_step) >= mi_cols) { intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, mt_unit_col, mt_unit_cols); ++mt_unit_col; } } // Set the pointer to null since mbmi is only allocated inside this function. xd->mi = NULL; } static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion, double *sum_est_rate) { MACROBLOCK *x = &cpi->td.mb; MACROBLOCKD *xd = &x->e_mbd; const BLOCK_SIZE bsize = cpi->weber_bsize; const int mb_step = mi_size_wide[bsize]; DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) { av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff, dqcoeff, sum_rec_distortion, sum_est_rate, cpi->td.wiener_tmp_pred_buf); } } static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi, const BLOCK_SIZE norm_block_size) { const AV1_COMMON *const cm = &cpi->common; int64_t norm_factor = 1; assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128); const int norm_step = mi_size_wide[norm_block_size]; double sb_wiener_log = 0; double sb_count = 0; for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) { const int sb_wiener_var = get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); const double scaled_satd = (double)satd / sqrt((double)sse); sb_wiener_log += scaled_satd * log(sb_wiener_var); sb_count += scaled_satd; } } if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count)); norm_factor = AOMMAX(1, norm_factor); return norm_factor; } static void automatic_intra_tools_off(AV1_COMP *cpi, const double sum_rec_distortion, const double sum_est_rate) { if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return; // Thresholds const int high_quality_qindex = 128; const double high_quality_bpp = 2.0; const double high_quality_dist_per_pix = 4.0; AV1_COMMON *const cm = &cpi->common; const int qindex = cm->quant_params.base_qindex; const double dist_per_pix = (double)sum_rec_distortion / (cm->width * cm->height); // The estimate bpp is not accurate, an empirical constant 100 is divided. 
const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100); if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp && dist_per_pix < high_quality_dist_per_pix) { cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0; cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0; cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0; cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0; } } static void ext_rate_guided_quantization(AV1_COMP *cpi) { // Calculation uses 8x8. const int mb_step = mi_size_wide[cpi->weber_bsize]; // Accumulate to 16x16, step size is in the unit of mi. const int block_step = 4; const char *filename = cpi->oxcf.rate_distribution_info; FILE *pfile = fopen(filename, "r"); if (pfile == NULL) { assert(pfile != NULL); return; } double ext_rate_sum = 0.0; for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { float val; const int fields_converted = fscanf(pfile, "%f", &val); if (fields_converted != 1) { assert(fields_converted == 1); fclose(pfile); return; } ext_rate_sum += val; cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + (col / mb_step)] = val; } } fclose(pfile); int uniform_rate_sum = 0; for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { int rate_sum = 0; for (int r = 0; r < block_step; r += mb_step) { for (int c = 0; c < block_step; c += mb_step) { const int mi_row = row + r; const int mi_col = col + c; rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols + (mi_col / mb_step)]; } } uniform_rate_sum += rate_sum; } } const double scale = uniform_rate_sum / ext_rate_sum; cpi->ext_rate_scale = scale; } void av1_set_mb_wiener_variance(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; if (aom_realloc_frame_buffer( &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td); cpi->norm_wiener_variance = 0; MACROBLOCK *x = &cpi->td.mb; MACROBLOCKD *xd = &x->e_mbd; // xd->mi needs to be setup since it is used in av1_frame_init_quantizer. MB_MODE_INFO mbmi; memset(&mbmi, 0, sizeof(mbmi)); MB_MODE_INFO *mbmi_ptr = &mbmi; xd->mi = &mbmi_ptr; cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level; av1_frame_init_quantizer(cpi); double sum_rec_distortion = 0.0; double sum_est_rate = 0.0; MultiThreadInfo *const mt_info = &cpi->mt_info; const int num_workers = AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy; intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy; // Calculate differential contrast for each block for the entire image. // TODO(chengchen): properly accumulate the distortion and rate in // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if // auto_intra_tools_off is true. 
if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read; intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write; av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion, &sum_est_rate); } else { calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate); } // Determine whether to turn off several intra coding tools. automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate); // Read external rate distribution and use it to guide delta quantization if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi); const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size; cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size); const int norm_step = mi_size_wide[norm_block_size]; double sb_wiener_log = 0; double sb_count = 0; for (int its_cnt = 0; its_cnt < 2; ++its_cnt) { sb_wiener_log = 0; sb_count = 0; for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) { int sb_wiener_var = get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; double min_max_scale = AOMMAX( 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col)); beta = AOMMIN(beta, 4); beta = AOMMAX(beta, 0.25); if (beta < 1 / min_max_scale) continue; sb_wiener_var = (int)(cpi->norm_wiener_variance / beta); int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); double scaled_satd = (double)satd / sqrt((double)sse); sb_wiener_log += scaled_satd * log(sb_wiener_var); sb_count += scaled_satd; } } if (sb_count > 0) cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count)); cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance); } // Set the pointer to null since mbmi is only allocated inside this function. xd->mi = NULL; aom_free_frame_buffer(&cm->cur_frame->buf); av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); } static int get_rate_guided_quantizer(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { // Calculation uses 8x8. 
const int mb_step = mi_size_wide[cpi->weber_bsize]; // Accumulate to 16x16 const int block_step = mi_size_wide[BLOCK_16X16]; double sb_rate_hific = 0.0; double sb_rate_uniform = 0.0; for (int row = mi_row; row < mi_row + mi_size_wide[bsize]; row += block_step) { for (int col = mi_col; col < mi_col + mi_size_high[bsize]; col += block_step) { sb_rate_hific += cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + (col / mb_step)]; for (int r = 0; r < block_step; r += mb_step) { for (int c = 0; c < block_step; c += mb_step) { const int this_row = row + r; const int this_col = col + c; sb_rate_uniform += cpi->prep_rate_estimates[(this_row / mb_step) * cpi->frame_info.mi_cols + (this_col / mb_step)]; } } } } sb_rate_hific *= cpi->ext_rate_scale; const double weight = 1.0; const double rate_diff = weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform; double scale = pow(2, rate_diff); scale = scale * scale; double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale); const AV1_COMMON *const cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; int offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; const int max_offset = delta_q_info->delta_q_res * 10; offset = AOMMIN(offset, max_offset - 1); offset = AOMMAX(offset, -max_offset + 1); int qindex = cm->quant_params.base_qindex + offset; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ); if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); return qindex; } int av1_get_sbq_perceptual_ai(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { if (cpi->oxcf.enable_rate_guide_deltaq) { return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col); } const AV1_COMMON *const cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col); int offset = 0; double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale); // Cap beta such that the delta q value is not much far away from the base q. beta = AOMMIN(beta, 4); beta = AOMMAX(beta, 0.25); offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1); offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1); int qindex = cm->quant_params.base_qindex + offset; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ); if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); return qindex; } void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; if (cpi->mb_delta_q) return; CHECK_MEM_ERROR(cm, cpi->mb_delta_q, aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols, sizeof(*cpi->mb_delta_q))); } #if CONFIG_TFLITE static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows, int bit_depth, uint8_t *y_buffer, int y_stride, float *predicts0, float *predicts1) { // Create the model and interpreter options. 
TfLiteModel *model = TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize); if (model == NULL) return 1; TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate(); TfLiteInterpreterOptionsSetNumThreads(options, 2); if (options == NULL) { TfLiteModelDelete(model); return 1; } // Create the interpreter. TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options); if (interpreter == NULL) { TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); return 1; } // Allocate tensors and populate the input tensor data. TfLiteInterpreterAllocateTensors(interpreter); TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0); if (input_tensor == NULL) { TfLiteInterpreterDelete(interpreter); TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); return 1; } size_t input_size = TfLiteTensorByteSize(input_tensor); float *input_data = aom_calloc(input_size, 1); if (input_data == NULL) { TfLiteInterpreterDelete(interpreter); TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); return 1; } const int num_mi_w = mi_size_wide[block_size]; const int num_mi_h = mi_size_high[block_size]; for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int row_offset = (row * num_mi_h) << 2; const int col_offset = (col * num_mi_w) << 2; uint8_t *buf = y_buffer + row_offset * y_stride + col_offset; int r = row_offset, pos = 0; const float base = (float)((1 << bit_depth) - 1); while (r < row_offset + (num_mi_h << 2)) { for (int c = 0; c < (num_mi_w << 2); ++c) { input_data[pos++] = bit_depth > 8 ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base : (float)*(buf + c) / base; } buf += y_stride; ++r; } TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size); // Execute inference. if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) { TfLiteInterpreterDelete(interpreter); TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); return 1; } // Extract the output tensor data. const TfLiteTensor *output_tensor = TfLiteInterpreterGetOutputTensor(interpreter, 0); if (output_tensor == NULL) { TfLiteInterpreterDelete(interpreter); TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); return 1; } size_t output_size = TfLiteTensorByteSize(output_tensor); float output_data[2]; TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size); predicts0[row * num_cols + col] = output_data[0]; predicts1[row * num_cols + col] = output_data[1]; } } // Dispose of the model and interpreter objects. TfLiteInterpreterDelete(interpreter); TfLiteInterpreterOptionsDelete(options); TfLiteModelDelete(model); aom_free(input_data); return 0; } void av1_set_mb_ur_variance(AV1_COMP *cpi) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; uint8_t *y_buffer = cpi->source->y_buffer; const int y_stride = cpi->source->y_stride; const int block_size = cpi->common.seq_params->sb_size; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; const int num_mi_w = mi_size_wide[block_size]; const int num_mi_h = mi_size_high[block_size]; const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; // TODO(sdeng): fit a better model_1; disable it at this time. 
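// Both model outputs are copied into mb_delta_q0/mb_delta_q1 below, but only
// the first output currently feeds the final delta-q values.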
float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f; CHECK_MEM_ERROR(cm, mb_delta_q0, aom_calloc(num_rows * num_cols, sizeof(float))); CHECK_MEM_ERROR(cm, mb_delta_q1, aom_calloc(num_rows * num_cols, sizeof(float))); if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer, y_stride, mb_delta_q0, mb_delta_q1)) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to call TFlite functions."); } // Loop through each SB block. for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; delta_q_avg0 += mb_delta_q0[index]; } } delta_q_avg0 /= (float)(num_rows * num_cols); float scaling_factor; const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ; if (cq_level < delta_q_avg0) { scaling_factor = cq_level / delta_q_avg0; } else { scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0); } for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; cpi->mb_delta_q[index] = RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ * scaling_factor * (mb_delta_q0[index] - delta_q_avg0)); } } aom_free(mb_delta_q0); aom_free(mb_delta_q1); } #else // !CONFIG_TFLITE void av1_set_mb_ur_variance(AV1_COMP *cpi) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; uint8_t *y_buffer = cpi->source->y_buffer; const int y_stride = cpi->source->y_stride; const int block_size = cpi->common.seq_params->sb_size; const int num_mi_w = mi_size_wide[block_size]; const int num_mi_h = mi_size_high[block_size]; const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; int *mb_delta_q[2]; CHECK_MEM_ERROR(cm, mb_delta_q[0], aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0]))); CHECK_MEM_ERROR(cm, mb_delta_q[1], aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1]))); // Approximates the model change between current version (Spet 2021) and the // baseline (July 2021). const double model_change[] = { 3.0, 3.0 }; // The following parameters are fitted from user labeled data. const double a[] = { -24.50 * 4.0, -17.20 * 4.0 }; const double b[] = { 0.004898, 0.003093 }; const double c[] = { (29.932 + model_change[0]) * 4.0, (42.100 + model_change[1]) * 4.0 }; int delta_q_avg[2] = { 0, 0 }; // Loop through each SB block. for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { double var = 0.0, num_of_var = 0.0; const int index = row * num_cols + col; // Loop through each 8x8 block. 
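/*
 * The loop below averages log-variances and exponentiates the mean, i.e. it
 * forms the geometric mean of the per-8x8 variances.  Illustrative
 * arithmetic (made-up values): variances { 4, 16, 64, 256 } give
 * exp((ln 4 + ln 16 + ln 64 + ln 256) / 4) = 32, whereas the arithmetic
 * mean would be 85; the geometric mean keeps one busy sub-block from
 * dominating an otherwise flat superblock.
 */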
for (int mi_row = row * num_mi_h; mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; mi_row += 2) { for (int mi_col = col * num_mi_w; mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; mi_col += 2) { struct buf_2d buf; const int row_offset_y = mi_row << 2; const int col_offset_y = mi_col << 2; buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; buf.stride = y_stride; unsigned int block_variance; block_variance = av1_get_perpixel_variance_facade( cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y); block_variance = AOMMAX(block_variance, 1); var += log((double)block_variance); num_of_var += 1.0; } } var = exp(var / num_of_var); mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]); mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]); delta_q_avg[0] += mb_delta_q[0][index]; delta_q_avg[1] += mb_delta_q[1][index]; } } delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols)); delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols)); int model_idx; double scaling_factor; const int cq_level = cpi->oxcf.rc_cfg.cq_level; if (cq_level < delta_q_avg[0]) { model_idx = 0; scaling_factor = (double)cq_level / delta_q_avg[0]; } else if (cq_level < delta_q_avg[1]) { model_idx = 2; scaling_factor = (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]); } else { model_idx = 1; scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]); } const double new_delta_q_avg = delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; if (model_idx == 2) { const double delta_q = mb_delta_q[0][index] + scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]); cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * (delta_q - new_delta_q_avg)); } else { cpi->mb_delta_q[index] = RINT( (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor * (mb_delta_q[model_idx][index] - delta_q_avg[model_idx])); } } } aom_free(mb_delta_q[0]); aom_free(mb_delta_q[1]); } #endif int av1_get_sbq_user_rating_based(const AV1_COMP *const cpi, int mi_row, int mi_col) { const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; const AV1_COMMON *const cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex; const int num_mi_w = mi_size_wide[bsize]; const int num_mi_h = mi_size_high[bsize]; const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w); const int delta_q = cpi->mb_delta_q[index]; int qindex = base_qindex + delta_q; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ + 1); return qindex; } #if !CONFIG_REALTIME_ONLY // Variance Boost: a variance adaptive quantization implementation // SVT-AV1 appendix with an overview and a graphical, step-by-step explanation // of the implementation // https://gitlab.com/AOMediaCodec/SVT-AV1/-/blob/master/Docs/Appendix-Variance-Boost.md int av1_get_sbq_variance_boost(const AV1_COMP *cpi, const MACROBLOCK *x) { const AV1_COMMON *cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; // Variance Boost only supports 64x64 SBs. 
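/*
 * Illustrative arithmetic for the boost curve evaluated below, using the
 * hard-coded strength of 3 (example values, not measurements):
 *   variance 1024: qstep_ratio = 0.45 * (-log2(1024) + 10) + 1 = 1.0
 *                  (the crossover point, i.e. no boost)
 *   variance 16:   qstep_ratio = 0.45 * (-4 + 10) + 1 = 3.7
 *   variance 1:    qstep_ratio = 0.45 * (0 + 10) + 1 = 5.5
 * all subject to the VAR_BOOST_MAX_BOOST clamp.  The target q is then
 * base_q / qstep_ratio, so flatter (lower-variance) superblocks are coded
 * with a finer quantizer.
 */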
assert(cm->seq_params->sb_size == BLOCK_64X64); // Strength is currently hard-coded and optimized for still pictures. In the // future, we might want to expose this as a parameter that can be fine-tuned // by the caller. const int strength = 3; unsigned int variance = av1_get_variance_boost_block_variance(cpi, x); // Variance = 0 areas are either completely flat patches or have very fine // gradients. Boost these blocks as if they have a variance of 1. if (variance == 0) { variance = 1; } // Compute a boost based on a fast-growing formula. // High and medium variance SBs essentially get no boost, while lower variance // SBs get increasingly stronger boosts. assert(strength >= 1 && strength <= 4); // Still picture curve, with variance crossover point at 1024. double qstep_ratio = 0.15 * strength * (-log2((double)variance) + 10.0) + 1.0; qstep_ratio = fclamp(qstep_ratio, 1.0, VAR_BOOST_MAX_BOOST); double base_q = av1_convert_qindex_to_q(base_qindex, bit_depth); double target_q = base_q / qstep_ratio; int target_qindex = av1_convert_q_to_qindex(target_q, bit_depth); // Determine the SB's delta_q boost by computing an (unscaled) delta_q from // the base and target q values, then scale that delta_q according to the // frame's base qindex. // The scaling coefficients were chosen empirically to maximize SSIMULACRA 2 // scores, 10th percentile scores, and subjective quality. Boosts become // smaller (for a given variance) the lower the base qindex. int boost = (int)round((base_qindex + 544.0) * (base_qindex - target_qindex) / 1279.0); boost = AOMMIN(VAR_BOOST_MAX_DELTAQ_RANGE, boost); // Variance Boost was designed to always operate in the lossy domain, so MINQ // is excluded. int sb_qindex = AOMMAX(base_qindex - boost, MINQ + 1); return sb_qindex; } #endif aom-3.12.1/av1/encoder/allintra_vis.h000066400000000000000000000035121477627663500173500ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_ #define AOM_AV1_ENCODER_ALLINTRA_VIS_H_ #include "config/aom_dsp_rtcd.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" #define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64 void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi); void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, const int mi_row, int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, double *sum_rec_distortion, double *sum_est_rate, uint8_t *pred_buffer); void av1_set_mb_wiener_variance(AV1_COMP *cpi); int av1_get_sbq_perceptual_ai(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col); // User rating based mode void av1_init_mb_ur_var_buffer(AV1_COMP *cpi); void av1_set_mb_ur_variance(AV1_COMP *cpi); int av1_get_sbq_user_rating_based(const AV1_COMP *const cpi, int mi_row, int mi_col); #if !CONFIG_REALTIME_ONLY int av1_get_sbq_variance_boost(const AV1_COMP *const cpi, const MACROBLOCK *x); #endif #endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_ aom-3.12.1/av1/encoder/aq_complexity.c000066400000000000000000000152471477627663500175420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/encodeframe.h" #include "av1/common/seg_common.h" #include "av1/encoder/segmentation.h" #include "aom_dsp/aom_dsp_common.h" #define AQ_C_SEGMENTS 5 #define DEFAULT_AQ2_SEG 3 // Neutral Q segment #define AQ_C_STRENGTHS 3 static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { { 1.75, 1.25, 1.05, 1.00, 0.90 }, { 2.00, 1.50, 1.15, 1.00, 0.85 }, { 2.50, 1.75, 1.25, 1.00, 0.80 } }; static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { { 0.15, 0.30, 0.55, 2.00, 100.0 }, { 0.20, 0.40, 0.65, 2.00, 100.0 }, { 0.25, 0.50, 0.75, 2.00, 100.0 } }; static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { { -4.0, -3.0, -2.0, 100.00, 100.0 }, { -3.5, -2.5, -1.5, 100.00, 100.0 }, { -3.0, -2.0, -1.0, 100.00, 100.0 } }; static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { // Approximate base quatizer (truncated to int) const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4; return (base_quant > 10) + (base_quant > 25); } static bool is_frame_aq_enabled(const AV1_COMP *const cpi) { const AV1_COMMON *const cm = &cpi->common; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; return frame_is_intra_only(cm) || cm->features.error_resilient_mode || refresh_frame->alt_ref_frame || (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref); } // Segmentation only makes sense if the target bits per SB is above a threshold. // Below this the overheads will usually outweigh any benefit. 
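/*
 * Concretely, the helper below enables the per-SB segmentation only when
 * rc.sb64_target_rate >= 256 (roughly, the number of bits budgeted for a
 * 64x64 superblock), so the segmentation overhead is skipped for very
 * low-rate frames.
 */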
static bool is_sb_aq_enabled(const AV1_COMP *const cpi) { return cpi->rc.sb64_target_rate >= 256; } void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; struct segmentation *const seg = &cm->seg; const int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); // Make SURE use of floating point in this function is safe. if (resolution_change) { memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); av1_clearall_segfeatures(seg); av1_disable_segmentation(seg); return; } if (is_frame_aq_enabled(cpi)) { int segment; const int aq_strength = get_aq_c_strength(base_qindex, cm->seq_params->bit_depth); // Clear down the segment map. memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, cm->mi_params.mi_rows * cm->mi_params.mi_cols); av1_clearall_segfeatures(seg); if (!is_sb_aq_enabled(cpi)) { av1_disable_segmentation(seg); return; } av1_enable_segmentation(seg); // Default segment "Q" feature is disabled so it defaults to the baseline Q. av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); // Use some of the segments for in frame Q adjustment. for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { int qindex_delta; if (segment == DEFAULT_AQ2_SEG) continue; qindex_delta = av1_compute_qdelta_by_rate( cpi, cm->current_frame.frame_type, base_qindex, aq_c_q_adj_factor[aq_strength][segment]); // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { qindex_delta = -base_qindex + 1; } if ((base_qindex + qindex_delta) > 0) { av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); } } } } #define DEFAULT_LV_THRESH 10.0 #define MIN_DEFAULT_LV_THRESH 8.0 // Select a segment for the current block. // The choice of segment for a block depends on the ratio of the projected // bits for the block vs a target average and its spatial complexity. void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, int mi_row, int mi_col, int projected_rate) { if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return; const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col; const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]); const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]); int i; unsigned char segment; // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). // It is converted to bits << AV1_PROB_COST_SHIFT units. const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) << AV1_PROB_COST_SHIFT; const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size; const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, cm->seq_params->bit_depth); low_var_thresh = (is_stat_consumption_stage_twopass(cpi)) ? 
AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH) : DEFAULT_LV_THRESH; av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); logvar = av1_log_block_var(cpi, mb, bs); segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. for (i = 0; i < AQ_C_SEGMENTS; ++i) { // Test rate against a threshold value and variance against a threshold. // Increasing segment number (higher variance and complexity) = higher Q. if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { segment = i; break; } } // Fill in the entires in the segment map corresponding to this SB64. const int mi_stride = cm->mi_params.mi_cols; set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment); } aom-3.12.1/av1/encoder/aq_complexity.h000066400000000000000000000022641477627663500175420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ #define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/common/enums.h" struct AV1_COMP; struct macroblock; // Select a segment for the current Block. void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *, BLOCK_SIZE bs, int mi_row, int mi_col, int projected_rate); // This function sets up a set of segments with delta Q values around // the baseline frame quantizer. void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ aom-3.12.1/av1/encoder/aq_cyclicrefresh.c000066400000000000000000000715061477627663500201720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" #include "aom_dsp/aom_dsp_common.h" CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr)); if (cr == NULL) return NULL; cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map)); cr->counter_encode_maxq_scene_change = 0; cr->percent_refresh_adjustment = 5; cr->rate_ratio_qdelta_adjustment = 0.25; if (cr->map == NULL) { av1_cyclic_refresh_free(cr); return NULL; } return cr; } void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { if (cr != NULL) { aom_free(cr->map); aom_free(cr); } } // Check if this coding block, of size bsize, should be considered for refresh // (lower-qp coding). Decision can be based on various factors, such as // size of the coding block (i.e., below min_block size rejected), coding // mode, and rate/distortion. static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, const MB_MODE_INFO *mbmi, int64_t rate, int64_t dist, BLOCK_SIZE bsize, int noise_level) { MV mv = mbmi->mv[0].as_mv; int is_compound = has_second_ref(mbmi); // Reject the block for lower-qp coding for non-compound mode if // projected distortion is above the threshold, and any of the following // is true: // 1) mode uses large mv // 2) mode is an intra-mode // Otherwise accept for refresh. if (!is_compound && dist > cr->thresh_dist_sb && (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || !is_inter_block(mbmi))) return CR_SEGMENT_ID_BASE; else if ((is_compound && noise_level < kMedium) || (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 && cr->rate_boost_fac > 10)) // More aggressive delta-q for bigger blocks with zero motion. return CR_SEGMENT_ID_BOOST2; else return CR_SEGMENT_ID_BOOST1; } // Compute delta-q for the segment. static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int deltaq = av1_compute_qdelta_by_rate( cpi, cpi->common.current_frame.frame_type, q, rate_factor); if ((-deltaq) > cr->max_qdelta_perc * q / 100) { deltaq = -cr->max_qdelta_perc * q / 100; } return deltaq; } int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, double correction_factor) { const AV1_COMMON *const cm = &cpi->common; const int base_qindex = cm->quant_params.base_qindex; const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int mbs = cm->mi_params.MBs; const int num4x4bl = mbs << 4; // Weight for non-base segments: use actual number of blocks refreshed in // previous/just encoded frame. Note number of blocks here is in 4x4 units. double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; if (cpi->rc.rtc_external_ratectrl) { weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100) / num4x4bl; weight_segment2 = 0; } // Take segment weighted average for estimated bits. 
const int estimated_bits = (int)round( (1.0 - weight_segment1 - weight_segment2) * av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) + weight_segment1 * av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1], correction_factor) + weight_segment2 * av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2], correction_factor)); return estimated_bits; } int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, double correction_factor) { const AV1_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; int num4x4bl = cm->mi_params.MBs << 4; // Weight for segment prior to encoding: take the average of the target // number for the frame to be encoded and the actual from the previous frame. double weight_segment = (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) / num4x4bl; if (cpi->rc.rtc_external_ratectrl) { weight_segment = (double)((cr->target_num_seg_blocks + cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100) >> 1) / num4x4bl; } // Compute delta-q corresponding to qindex i. int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate; // Take segment weighted average for bits per mb. bits_per_mb = (int)round( (1.0 - weight_segment) * av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i, correction_factor, accurate_estimate) + weight_segment * av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i + deltaq, correction_factor, accurate_estimate)); return bits_per_mb; } void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int cdf_num; const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int prev_segment_id = mbmi->segment_id; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); assert(cm->seg.enabled); if (!cr->skip_over4x4) { mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4); if (prev_segment_id != mbmi->segment_id) { const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; const int mi_stride = cm->mi_params.mi_cols; const uint8_t segment_id = mbmi->segment_id; for (int mi_y = 0; mi_y < ymis; mi_y++) { const int map_offset = block_index + mi_y * mi_stride; memset(&cr->map[map_offset], 0, xmis); memset(&cpi->enc_seg.map[map_offset], segment_id, xmis); memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); } } } if (!dry_run) { if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1) x->actual_num_seg1_blocks -= xmis * ymis; else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2) x->actual_num_seg2_blocks -= xmis * ymis; } } void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int64_t rate, int64_t dist, int skip, RUN_TYPE dry_run) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); const int 
block_index = mi_row * cm->mi_params.mi_cols + mi_col; int noise_level = 0; if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level; const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level); int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1; // Default is to not update the refresh map. int new_map_value = cr->map[block_index]; // If this block is labeled for refresh, check if we should reset the // segment_id. if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { mbmi->segment_id = refresh_this_block; // Reset segment_id if will be skipped. if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE; } const uint8_t segment_id = mbmi->segment_id; // Update the cyclic refresh map, to be used for setting segmentation map // for the next frame. If the block will be refreshed this frame, mark it // as clean. The magnitude of the -ve influences how long before we consider // it for refresh again. if (cyclic_refresh_segment_id_boosted(segment_id)) { new_map_value = -cr->time_for_refresh; } else if (refresh_this_block) { // Else if it is accepted as candidate for refresh, and has not already // been refreshed (marked as 1) then mark it as a candidate for cleanup // for future time (marked as 0), otherwise don't update it. if (cr->map[block_index] == 1) new_map_value = 0; } else { // Leave it marked as block that is not candidate for refresh. new_map_value = 1; } // Update entries in the cyclic refresh map with new_map_value, and // copy mbmi->segment_id into global segmentation map. const int mi_stride = cm->mi_params.mi_cols; for (int mi_y = 0; mi_y < ymis; mi_y += sh) { const int map_offset = block_index + mi_y * mi_stride; memset(&cr->map[map_offset], new_map_value, xmis); memset(&cpi->enc_seg.map[map_offset], segment_id, xmis); memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); } // Accumulate cyclic refresh update counters. if (!dry_run) { if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1) x->actual_num_seg1_blocks += xmis * ymis; else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2) x->actual_num_seg2_blocks += xmis * ymis; } } // Initializes counters used for cyclic refresh. void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) { x->actual_num_seg1_blocks = 0; x->actual_num_seg2_blocks = 0; } // Accumulate cyclic refresh counters. void av1_accumulate_cyclic_refresh_counters( CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) { cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks; cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks; } void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; // Set minimum gf_interval for GF update to a multiple of the refresh period, // with some max limit. Depending on past encoding stats, GF flag may be // reset and update may not occur until next baseline_gf_interval. const int gf_length_mult[2] = { 8, 4 }; if (cr->percent_refresh > 0) p_rc->baseline_gf_interval = AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / cr->percent_refresh), MAX_GF_INTERVAL_RT); else p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 16; } // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed. 
// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to // 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. // Blocks labeled as BOOST1 may later get set to BOOST2 (during the // encoding of the superblock). static void cyclic_refresh_update_map(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->enc_seg.map; unsigned char *const active_map_4x4 = cpi->active_map.map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; uint64_t sb_sad = 0; uint64_t thresh_sad_low = 0; uint64_t thresh_sad = INT64_MAX; const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols; const int mi_stride = mi_cols; // Don't set seg_map to 0 if active_maps is enabled. Active_maps will set // seg_map to either 7 or 0 (AM_SEGMENT_ID_INACTIVE/ACTIVE), and cyclic // refresh set below (segment 1 or 2) will only be set for ACTIVE blocks. if (!cpi->active_map.enabled) { memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); } sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; // Number of target blocks to get the q delta (segment 1). block_count = cr->percent_refresh * mi_rows * mi_cols / 100; // Set the segmentation map: cycle through the superblocks, starting at // cr->mb_index, and stopping when either block_count blocks have been found // to be refreshed, or we have passed through whole frame. if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0; assert(cr->sb_index < sbs_in_frame); i = cr->sb_index; cr->last_sb_index = cr->sb_index; cr->target_num_seg_blocks = 0; do { int sum_map = 0; // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * cm->seq_params->mib_size; int mi_col = sb_col_index * cm->seq_params->mib_size; assert(mi_row >= 0 && mi_row < mi_rows); assert(mi_col >= 0 && mi_col < mi_cols); bl_index = mi_row * mi_stride + mi_col; // Loop through all MI blocks in superblock and update map. xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size); ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size); if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 && cr->counter_encode_maxq_scene_change > 30 && cpi->src_sad_blk_64x64 != NULL && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index]; int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8; int scale_low = 2; thresh_sad = (scale * 64 * 64); thresh_sad_low = (scale_low * 64 * 64); // For temporal layers: the base temporal layer (temporal_layer_id = 0) // has larger frame separation (2 or 4 frames apart), so use larger sad // thresholds to compensate for larger frame sad. The larger thresholds // also increase the amount of refresh, which is needed for the base // temporal layer. if (cpi->svc.number_temporal_layers > 1 && cpi->svc.temporal_layer_id == 0) { thresh_sad <<= 4; thresh_sad_low <<= 2; } } // cr_map only needed at 8x8 blocks. for (y = 0; y < ymis; y += 2) { for (x = 0; x < xmis; x += 2) { const int bl_index2 = bl_index + y * mi_stride + x; // If the block is as a candidate for clean up then mark it // for possible boost/refresh (segment 1). 
The segment id may get // reset to 0 later if block gets coded anything other than low motion. // If the block_sad (sb_sad) is very low label it for refresh anyway. // If active_maps is enabled, only allow for setting on ACTIVE blocks. if ((cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) && (!cpi->active_map.enabled || active_map_4x4[bl_index2] == AM_SEGMENT_ID_ACTIVE)) { sum_map += 4; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; } } } // Enforce constant segment over superblock. // If segment is at least half of superblock, set to 1. // Enforce that block sad (sb_sad) is not too high. if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) { set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride, CR_SEGMENT_ID_BOOST1); cr->target_num_seg_blocks += xmis * ymis; } i++; if (i == sbs_in_frame) { i = 0; } } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); cr->sb_index = i; if (cr->target_num_seg_blocks == 0) { // Disable segmentation, seg_map is already set to 0 above. // Don't disable if active_map is being used. if (!cpi->active_map.enabled) av1_disable_segmentation(&cm->seg); } } static int is_scene_change_detected(AV1_COMP *const cpi) { return cpi->rc.high_source_sad; } // Set cyclic refresh parameters. void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { // TODO(marpan): Parameters need to be tuned. const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; SVC *const svc = &cpi->svc; const int qp_thresh = AOMMAX(16, rc->best_quality + 4); const int qp_max_thresh = 118 * MAXQ >> 7; const int scene_change_detected = is_scene_change_detected(cpi); const int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); // A scene change or key frame marks the start of a cyclic refresh cycle. const int frames_since_scene_change = (cpi->ppi->use_svc || !is_screen_content) ? cpi->rc.frames_since_key : AOMMIN(cpi->rc.frames_since_key, cr->counter_encode_maxq_scene_change); // Cases to reset the cyclic refresh adjustment parameters. if (frame_is_intra_only(cm) || scene_change_detected || cpi->ppi->rtc_ref.bias_recovery_frame) { // Reset adaptive elements for intra only frames and scene changes. cr->percent_refresh_adjustment = 5; cr->rate_ratio_qdelta_adjustment = 0.25; } // Although this segment feature for RTC is only used for // blocks >= 8X8, for more efficient coding of the seg map // cur_frame->seg_map needs to set at 4x4 along with the // function av1_cyclic_reset_segment_skip(). Skipping over // 4x4 will therefore have small bdrate loss (~0.2%), so // we use it only for speed > 9 for now. cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0; // should we enable cyclic refresh on this frame. 
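/*
 * Summary of the gating below: refresh is turned off for intra-only or
 * lossless frames, detected scene changes and recovery frames, high-motion
 * screen content, enhancement temporal layers, a change in the number of
 * spatial layers, spatial-layer key frames, very low or persistently very
 * high average inter-frame QP, and content that has stayed low-motion for a
 * long stretch.
 */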
cr->apply_cyclic_refresh = 1; if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || cpi->rc.high_motion_content_screen_rtc || scene_change_detected || svc->temporal_layer_id > 0 || svc->prev_number_spatial_layers != svc->number_spatial_layers || p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || (svc->number_spatial_layers > 1 && svc->layer_context[svc->temporal_layer_id].is_key_frame) || (frames_since_scene_change > 20 && p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 && frames_since_scene_change > 40) || cpi->ppi->rtc_ref.bias_recovery_frame) { cr->apply_cyclic_refresh = 0; return; } // Increase the amount of refresh for #temporal_layers > 2 if (svc->number_temporal_layers > 2) cr->percent_refresh = 15; else cr->percent_refresh = 10 + cr->percent_refresh_adjustment; if (cpi->active_map.enabled) { // Scale down the percent_refresh to target the active blocks only. cr->percent_refresh = cr->percent_refresh * (100 - cpi->rc.percent_blocks_inactive) / 100; if (cr->percent_refresh == 0) { cr->apply_cyclic_refresh = 0; } } cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->use_block_sad_scene_det = (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 0; cr->motion_thresh = 32; cr->rate_boost_fac = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15; // Use larger delta-qp (increase rate_ratio_qdelta) for first few // refresh cycles after a key frame (svc) or scene change (non svc). // For non svc screen content, after a scene change gradually reduce // this boost and supress it further if either of the previous two // frames overshot. if (cr->percent_refresh > 0) { if (cpi->ppi->use_svc || !is_screen_content) { if (frames_since_scene_change < ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) { cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment; } else { cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment; } } else { double distance_from_sc_factor = AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1); cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor; if ((frames_since_scene_change < 10) && ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) { cr->rate_ratio_qdelta -= 0.25; } } } else { cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment; } // Adjust some parameters for low resolutions. if (cm->width * cm->height <= 352 * 288) { if (cpi->svc.number_temporal_layers > 1) { cr->motion_thresh = 32; cr->rate_boost_fac = 13; } else { if (rc->avg_frame_bandwidth < 3000) { cr->motion_thresh = 16; cr->rate_boost_fac = 13; } else { cr->max_qdelta_perc = 50; cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0); } } } if (cpi->oxcf.rc_cfg.mode == AOM_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. // For now use smaller qp-delta (than CBR), no second boosted seg, and // turn-off (no refresh) on golden refresh (since it's already boosted). 
cr->percent_refresh = 10; cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; if (cpi->refresh_frame.golden_frame) { cr->percent_refresh = 0; cr->rate_ratio_qdelta = 1.0; } } if (rc->rtc_external_ratectrl) { cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100; cr->actual_num_seg2_blocks = 0; } } static void cyclic_refresh_reset_resize(AV1_COMP *const cpi) { const AV1_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); cr->sb_index = 0; cr->last_sb_index = 0; cpi->refresh_frame.golden_frame = true; cr->apply_cyclic_refresh = 0; cr->counter_encode_maxq_scene_change = 0; cr->percent_refresh_adjustment = 5; cr->rate_ratio_qdelta_adjustment = 0.25; } // Setup cyclic background refresh: set delta q and segmentation map. void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; const int scene_change_detected = is_scene_change_detected(cpi); const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; // Set resolution_change flag: for svc only set it when the // number of spatial layers has not changed. const int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height) && cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers; if (resolution_change) cyclic_refresh_reset_resize(cpi); if (!cr->apply_cyclic_refresh) { // Don't disable and set seg_map to 0 if active_maps is enabled, unless // whole frame is set as inactive (since we only apply cyclic_refresh to // active blocks). if (!cpi->active_map.enabled || cpi->rc.percent_blocks_inactive == 100) { unsigned char *const seg_map = cpi->enc_seg.map; memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); av1_disable_segmentation(&cm->seg); } if (frame_is_intra_only(cm) || scene_change_detected || cpi->ppi->rtc_ref.bias_recovery_frame) { cr->sb_index = 0; cr->last_sb_index = 0; cr->counter_encode_maxq_scene_change = 0; cr->actual_num_seg1_blocks = 0; cr->actual_num_seg2_blocks = 0; } return; } else { cr->counter_encode_maxq_scene_change++; const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, cm->seq_params->bit_depth); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; // Distortion threshold, quadratic in Q, scale factor to be adjusted. // q will not exceed 457, so (q * q) is within 32bit; see: // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[]. cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; // For low-resoln or lower speeds, the rate/dist thresholds need to be // tuned/updated. if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) { cr->thresh_dist_sb = 0; cr->thresh_rate_sb = INT64_MAX; } // Set up segmentation. av1_enable_segmentation(&cm->seg); if (!cpi->active_map.enabled) { // Clear down the segment map, only if active_maps is not enabled. 
av1_clearall_segfeatures(seg); } // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in // av1_choose_segmap_coding_method(), // based on the coding cost of each method. For error_resilient mode on the // last_frame_seg_map is set to 0, so if temporal coding is used, it is // relative to 0 previous map. // seg->temporal_update = 0; // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); // Use segment BOOST1 for in-frame Q adjustment. av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); // Use segment BOOST2 for more aggressive in-frame Q adjustment. av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); // Set the q delta for segment BOOST1. const CommonQuantParams *const quant_params = &cm->quant_params; int qindex_delta = compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta); cr->qindex_delta[1] = qindex_delta; // Compute rd-mult for segment BOOST1. const int qindex2 = clamp( quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta, 0, MAXQ); cr->rdmult = av1_compute_rd_mult( qindex2, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); // Set a more aggressive (higher) q delta for segment BOOST2. qindex_delta = compute_deltaq( cpi, quant_params->base_qindex, AOMMIN(CR_MAX_RATE_TARGET_RATIO, 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); cr->qindex_delta[2] = qindex_delta; av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); // Update the segmentation and refresh map. cyclic_refresh_update_map(cpi); } } int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { return cr->rdmult; } int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int qindex = cpi->common.quant_params.base_qindex; if (cpi->active_map.enabled && cpi->rc.percent_blocks_inactive > cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef) return 1; if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 && cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh && cpi->rc.frame_source_sad < 1000 && qindex < 7 * (cpi->rc.worst_quality >> 3)) return 1; // More aggressive skip. else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad && cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality) return 1; return 0; } aom-3.12.1/av1/encoder/aq_cyclicrefresh.h000066400000000000000000000247611477627663500202000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ #define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ #include "av1/common/blockd.h" #include "av1/encoder/block.h" #include "av1/encoder/tokenize.h" #ifdef __cplusplus extern "C" { #endif // The segment ids used in cyclic refresh: from base (no boost) to increasing // boost (higher delta-qp). #define CR_SEGMENT_ID_BASE 0 #define CR_SEGMENT_ID_BOOST1 1 #define CR_SEGMENT_ID_BOOST2 2 // Maximum rate target ratio for setting segment delta-qp. #define CR_MAX_RATE_TARGET_RATIO 4.0 /*! * \brief The stucture of CYCLIC_REFRESH. * \ingroup cyclic_refresh */ struct CYCLIC_REFRESH { /*! * Percentage of blocks per frame that are targeted as candidates * for cyclic refresh. */ int percent_refresh; /*! * Active adjustment delta for cyclic refresh for rate control. */ int percent_refresh_adjustment; /*! * Maximum q-delta as percentage of base q. */ int max_qdelta_perc; /*! *Superblock starting index for cycling through the frame. */ int sb_index; /*! *Superblock index cyclic refresh index last frame */ int last_sb_index; /*! * Controls how long block will need to wait to be refreshed again, in * excess of the cycle time, i.e., in the case of all zero motion, block * will be refreshed every (100/percent_refresh + time_for_refresh) frames. */ int time_for_refresh; /*! * Target number of (4x4) blocks that are set for delta-q. */ int target_num_seg_blocks; /*! * Actual number of (4x4) blocks that were applied delta-q, * for segment 1. */ int actual_num_seg1_blocks; /*! * Actual number of (4x4) blocks that were applied delta-q, * for segment 2. */ int actual_num_seg2_blocks; /*! * RD mult. parameters for segment 1. */ int rdmult; /*! * Cyclic refresh map. */ int8_t *map; /*! * Threshold applied to the projected rate of the coding block, * when deciding whether block should be refreshed. */ int64_t thresh_rate_sb; /*! * Threshold applied to the projected distortion of the coding block, * when deciding whether block should be refreshed. */ int64_t thresh_dist_sb; /*! * Threshold applied to the motion vector (in units of 1/8 pel) of the * coding block, when deciding whether block should be refreshed. */ int16_t motion_thresh; /*! * Rate target ratio to set q delta. */ double rate_ratio_qdelta; /*! * Active adjustment of qdelta rate ratio for enhanced rate control */ double rate_ratio_qdelta_adjustment; /*! * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. */ int rate_boost_fac; /*!\cond */ int qindex_delta[3]; int apply_cyclic_refresh; int skip_over4x4; int counter_encode_maxq_scene_change; int use_block_sad_scene_det; /*!\endcond */ }; struct AV1_COMP; typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols); void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr); /*!\brief Estimate the bits, incorporating the delta-q from the segments. * * For the just encoded frame, estimate the bits, incorporating the delta-q * from non-base segment(s). Note this function is called in the postencode * (called from rc_update_rate_correction_factors()). * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] correction_factor rate correction factor * * \return Return the estimated bits at given q. */ int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, double correction_factor); /*!\brief Estimate the bits per mb, for given q = i and delta-q. 
* * Prior to encoding the frame, estimate the bits per mb, for a given q = i and * a corresponding delta-q (for segment 1). This function is called in the * rc_regulate_q() to set the base qp index. Note: the segment map is set to * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1 * (refresh) for each superblock, prior to encoding. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] i q index * \param[in] correction_factor rate correction factor * * \return Return the estimated bits for q = i and delta-q (segment 1). */ int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, double correction_factor); /*!\brief Update segment_id for blocks are skipped. * * After encoding a given prediction block, of size bsize at (mi_row, mi_col), * check if we should reset the segment_id based on skip_txfm, * and update the cyclic_refresh map and segmentation counters. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] x Pointer to MACROBLOCK structure * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE * \param[in] bsize Block size * \param[in] dry_run A code indicating whether it is part of the final * pass for reconstructing the superblock * * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and * the \c cm->cpi->enc_seg.map. */ void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run); /*!\brief Update segment_id for block based on mode selected. * * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), * check if we should reset the segment_id (based on mode/motion/skip selected * for that block) and update the cyclic_refresh map and segmentation map. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] x Pointer to MACROBLOCK structure * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE * \param[in] bsize Block size * \param[in] rate Projected block rate from pickmode * \param[in] dist Projected block dist from pickmode * \param[in] skip Skip flag set from picmode * \param[in] dry_run A code indicating whether it is part of the final * pass for reconstructing the superblock * * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and * the \c cm->cpi->enc_seg.map. */ void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int64_t rate, int64_t dist, int skip, RUN_TYPE dry_run); /*!\brief Initialize counters used for cyclic refresh. * * Initializes cyclic refresh counters actual_num_seg1_blocks and * actual_num_seg2_blocks. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] x Pointer to MACROBLOCK structure * * \remark Update the \c x->actual_num_seg1_blocks and the * \c x->actual_num_seg2_blocks. */ void av1_init_cyclic_refresh_counters(MACROBLOCK *const x); /*!\brief Accumulate cyclic refresh counters. * * Accumulates cyclic refresh counters actual_num_seg1_blocks and * actual_num_seg2_blocks from MACROBLOCK strcture to CYCLIC_REFRESH strcture. 
* * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure * \param[in] x Pointer to MACROBLOCK structure * * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the * \c cyclic_refresh->actual_num_seg2_blocks. */ void av1_accumulate_cyclic_refresh_counters( CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x); /*!\brief Set golden frame update interval nased on cyclic refresh. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Returns the interval in \c cpi->rc.baseline_gf_interval. */ void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi); /*!\brief Set the global/frame level parameters for cyclic refresh. * * First call to the cyclic refresh, before encoding the frame. * Sets the flag on whether cyclic refresh should be applied, sets * the amount/percent of refresh, and the amount of boost applied to * the two segments (set by rate_ratio_qdelta and rate_boost_fac). * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Updates the \c cpi->cyclic_refresh with the settings. */ void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi); /*!\brief Setup the cyclic background refresh. * * Set the delta q for the segment(s), and set the segmentation map. * * \ingroup cyclic_refresh * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh * parameters and the \c cm->seg with the segmentation data. */ void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi); int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi); static inline int cyclic_refresh_segment_id_boosted(int segment_id) { return segment_id == CR_SEGMENT_ID_BOOST1 || segment_id == CR_SEGMENT_ID_BOOST2; } static inline int cyclic_refresh_segment_id(int segment_id) { if (segment_id == CR_SEGMENT_ID_BOOST1) return CR_SEGMENT_ID_BOOST1; else if (segment_id == CR_SEGMENT_ID_BOOST2) return CR_SEGMENT_ID_BOOST2; else return CR_SEGMENT_ID_BASE; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ aom-3.12.1/av1/encoder/aq_variance.c000066400000000000000000000264031477627663500171310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" #include "av1/encoder/aq_variance.h" #include "av1/common/seg_common.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/dwt.h" #include "config/aom_config.h" #if !CONFIG_REALTIME_ONLY static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0, 0.9, .8, .7, .6 }; static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0 }; #define ENERGY_MIN (-4) #define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy) \ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; #define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] void av1_vaq_frame_setup(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const int base_qindex = cm->quant_params.base_qindex; struct segmentation *seg = &cm->seg; int i; int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2); double avg_ratio; if (avg_energy > 7) avg_energy = 7; if (avg_energy < 0) avg_energy = 0; avg_ratio = rate_ratio[avg_energy]; if (resolution_change) { memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); av1_clearall_segfeatures(seg); av1_disable_segmentation(seg); return; } if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || refresh_frame->alt_ref_frame || (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) { cpi->vaq_refresh = 1; av1_enable_segmentation(seg); av1_clearall_segfeatures(seg); for (i = 0; i < MAX_SEGMENTS; ++i) { // Set up avg segment id to be 1.0 and adjust the other segments around // it. int qindex_delta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex, rate_ratio[i] / avg_ratio); // We don't allow qindex 0 in a segment if the base value is not 0. // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. 
if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { qindex_delta = -base_qindex + 1; } av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q); } } } int av1_log_block_avg(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs, int mi_row, int mi_col) { // This functions returns the block average of luma block unsigned int sum, avg, num_pix; int r, c; const int pic_w = cpi->common.width; const int pic_h = cpi->common.height; const int bw = MI_SIZE * mi_size_wide[bs]; const int bh = MI_SIZE * mi_size_high[bs]; const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); sum = 0; num_pix = 0; avg = 0; int row = mi_row << MI_SIZE_LOG2; int col = mi_col << MI_SIZE_LOG2; for (r = row; (r < (row + bh)) && (r < pic_h); r++) { for (c = col; (c < (col + bw)) && (c < pic_w); c++) { sum += *(x16 + r * x->plane[0].src.stride + c); num_pix++; } } if (num_pix != 0) { avg = sum / num_pix; } return avg; } #define DEFAULT_E_MIDPOINT 10.0 static unsigned int haar_ac_energy(const MACROBLOCK *x, BLOCK_SIZE bs) { const MACROBLOCKD *xd = &x->e_mbd; int stride = x->plane[0].src.stride; const uint8_t *buf = x->plane[0].src.buf; const int num_8x8_cols = block_size_wide[bs] / 8; const int num_8x8_rows = block_size_high[bs] / 8; const int hbd = is_cur_buf_hbd(xd); int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows, num_8x8_cols); return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } static double log_block_wavelet_energy(const MACROBLOCK *x, BLOCK_SIZE bs) { unsigned int haar_sad = haar_ac_energy(x, bs); return log1p(haar_sad); } int av1_block_wavelet_energy_level(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs) { double energy, energy_midpoint; energy_midpoint = (is_stat_consumption_stage_twopass(cpi)) ? cpi->twopass_frame.frame_avg_haar_energy : DEFAULT_E_MIDPOINT; energy = log_block_wavelet_energy(x, bs) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, int block_var_level) { int rate_level; const AV1_COMMON *const cm = &cpi->common; if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { ENERGY_IN_BOUNDS(block_var_level); rate_level = SEGMENT_ID(block_var_level); } else { rate_level = block_var_level; } const int base_qindex = cm->quant_params.base_qindex; int qindex_delta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex, deltaq_rate_ratio[rate_level]); if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { qindex_delta = -base_qindex + 1; } return base_qindex + qindex_delta; } // Comparer used by qsort() to order an array of unsigned int from smallest to // largest. static int comp_unsigned_int(const void *a, const void *b) { unsigned int arg1 = *(const unsigned int *)a; unsigned int arg2 = *(const unsigned int *)b; return (arg1 > arg2) - (arg1 < arg2); } unsigned int av1_get_variance_boost_block_variance(const AV1_COMP *cpi, const MACROBLOCK *x) { #define SUPERBLOCK_SIZE 64 #define SUBBLOCK_SIZE 8 #define SUBBLOCKS_IN_SB_DIM (SUPERBLOCK_SIZE / SUBBLOCK_SIZE) #define SUBBLOCKS_IN_SB (SUBBLOCKS_IN_SB_DIM * SUBBLOCKS_IN_SB_DIM) #define SUBBLOCKS_IN_OCTILE (SUBBLOCKS_IN_SB / 8) DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[SUBBLOCK_SIZE]) = { 0 }; DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[SUBBLOCK_SIZE]) = { 0 }; const MACROBLOCKD *xd = &x->e_mbd; unsigned int sse; // Octile is currently hard-coded and optimized for still pictures. 
In the // future, we might want to expose this as a parameter that can be fine-tuned // by the caller. // An octile of 5 was chosen because it was found to strike the best balance // between quality and consistency. Lower octiles tend to score lower in // SSIMU2, while higher octiles tend to harm subjective quality consistency, // especially in <1 MP images. const int octile = 5; const uint8_t *all_zeros = is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(av1_highbd_all_zeros) : av1_all_zeros; unsigned int variances[SUBBLOCKS_IN_SB]; // Calculate subblock variances. aom_variance_fn_t vf = cpi->ppi->fn_ptr[BLOCK_8X8].vf; for (int subb_i = 0; subb_i < SUBBLOCKS_IN_SB_DIM; subb_i++) { int i = subb_i * SUBBLOCK_SIZE; for (int subb_j = 0; subb_j < SUBBLOCKS_IN_SB_DIM; subb_j++) { int j = subb_j * SUBBLOCK_SIZE; // Truncating values to integers (i.e. the 64 term) was found to perform // better than rounding, or returning them as doubles. variances[subb_i * SUBBLOCKS_IN_SB_DIM + subb_j] = vf(x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, all_zeros, 0, &sse) / 64; } } // Order the 8x8 SB values from smallest to largest variance. qsort(variances, SUBBLOCKS_IN_SB, sizeof(unsigned int), comp_unsigned_int); // Sample three 8x8 variance values: at the specified octile, previous octile, // and next octile. Make sure we use the last subblock in each octile as the // representative of the octile. assert(octile >= 1 && octile <= 8); const int middle_index = octile * SUBBLOCKS_IN_OCTILE - 1; const int lower_index = AOMMAX(SUBBLOCKS_IN_OCTILE - 1, middle_index - SUBBLOCKS_IN_OCTILE); const int upper_index = AOMMIN(SUBBLOCKS_IN_SB - 1, middle_index + SUBBLOCKS_IN_OCTILE); // Weigh the three variances in a 1:2:1 ratio, with rounding (the +2 term). // This allows for smoother delta-q transitions among superblocks with // mixed-variance features. const unsigned int variance = (variances[lower_index] + (variances[middle_index] * 2) + variances[upper_index] + 2) / 4; return variance; } #endif // !CONFIG_REALTIME_ONLY int av1_log_block_var(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs) { DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; // This function returns a score for the blocks local variance as calculated // by: sum of the log of the (4x4 variances) of each subblock to the current // block (x,bs) // * 32 / number of pixels in the block_size. // This is used for segmentation because to avoid situations in which a large // block with a gentle gradient gets marked high variance even though each // subblock has a low variance. This allows us to assign the same segment // number for the same sorts of area regardless of how the partitioning goes. const MACROBLOCKD *xd = &x->e_mbd; double var = 0; unsigned int sse; int i, j; int right_overflow = (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? 
((-xd->mb_to_bottom_edge) >> 3) : 0; const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; aom_variance_fn_t vf = cpi->ppi->fn_ptr[BLOCK_4X4].vf; for (i = 0; i < bh; i += 4) { for (j = 0; j < bw; j += 4) { if (is_cur_buf_hbd(xd)) { var += log1p(vf(x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / 16.0); } else { var += log1p(vf(x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, av1_all_zeros, 0, &sse) / 16.0); } } } // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561. var /= (bw / 4 * bh / 4); if (var > 7) var = 7; return (int)(var); } aom-3.12.1/av1/encoder/aq_variance.h000066400000000000000000000027711477627663500171400ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_ #define AOM_AV1_ENCODER_AQ_VARIANCE_H_ #include "av1/encoder/encoder.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif #if !CONFIG_REALTIME_ONLY void av1_vaq_frame_setup(AV1_COMP *cpi); int av1_log_block_avg(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs, int mi_row, int mi_col); int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, int block_var_level); int av1_block_wavelet_energy_level(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs); unsigned int av1_get_variance_boost_block_variance(const AV1_COMP *cpi, const MACROBLOCK *x); #endif // !CONFIG_REALTIME_ONLY int av1_log_block_var(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_ aom-3.12.1/av1/encoder/arm/000077500000000000000000000000001477627663500152665ustar00rootroot00000000000000aom-3.12.1/av1/encoder/arm/av1_error_neon.c000066400000000000000000000071551477627663500203610ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { uint64x2_t err_u64 = vdupq_n_u64(0); int64x2_t ssz_s64 = vdupq_n_s64(0); assert(block_size >= 16); assert((block_size % 16) == 0); do { const int16x8_t c0 = load_tran_low_to_s16q(coeff); const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); // By operating on unsigned integers we can store up to 4 squared diff in a // 32-bit element before having to widen to 64 bits. uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); err_u64 = vpadalq_u32(err_u64, err); // We can't do the same here as we're operating on signed integers, so we // can only accumulate 2 squares. int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); ssz_s64 = vpadalq_s32(ssz_s64, ssz0); int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); ssz_s64 = vpadalq_s32(ssz_s64, ssz1); coeff += 16; dqcoeff += 16; block_size -= 16; } while (block_size != 0); *ssz = horizontal_add_s64x2(ssz_s64); return (int64_t)horizontal_add_u64x2(err_u64); } int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size) { uint64x2_t err_u64 = vdupq_n_u64(0); assert(block_size >= 16); assert((block_size % 16) == 0); do { const int16x8_t c0 = vld1q_s16(coeff); const int16x8_t c1 = vld1q_s16(coeff + 8); const int16x8_t d0 = vld1q_s16(dqcoeff); const int16x8_t d1 = vld1q_s16(dqcoeff + 8); const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); // By operating on unsigned integers we can store up to 4 squared diff in a // 32-bit element before having to widen to 64 bits. uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); err_u64 = vpadalq_u32(err_u64, err); coeff += 16; dqcoeff += 16; block_size -= 16; } while (block_size != 0); return (int64_t)horizontal_add_u64x2(err_u64); } aom-3.12.1/av1/encoder/arm/av1_error_sve.c000066400000000000000000000073731477627663500202210ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; assert(block_size >= 16); assert((block_size % 16) == 0); do { const int16x8_t c0 = load_tran_low_to_s16q(coeff); const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); const int16x8_t diff0 = vsubq_s16(c0, d0); const int16x8_t diff1 = vsubq_s16(c1, d1); error[0] = aom_sdotq_s16(error[0], diff0, diff0); error[1] = aom_sdotq_s16(error[1], diff1, diff1); sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0); sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1); coeff += 16; dqcoeff += 16; block_size -= 16; } while (block_size != 0); *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1])); return vaddvq_s64(vaddq_s64(error[0], error[1])); } int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size) { if (block_size % 32 == 0) { int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0) }; do { const int16x8_t c0 = vld1q_s16(coeff); const int16x8_t c1 = vld1q_s16(coeff + 8); const int16x8_t c2 = vld1q_s16(coeff + 16); const int16x8_t c3 = vld1q_s16(coeff + 24); const int16x8_t d0 = vld1q_s16(dqcoeff); const int16x8_t d1 = vld1q_s16(dqcoeff + 8); const int16x8_t d2 = vld1q_s16(dqcoeff + 16); const int16x8_t d3 = vld1q_s16(dqcoeff + 24); const int16x8_t diff0 = vsubq_s16(c0, d0); const int16x8_t diff1 = vsubq_s16(c1, d1); const int16x8_t diff2 = vsubq_s16(c2, d2); const int16x8_t diff3 = vsubq_s16(c3, d3); error[0] = aom_sdotq_s16(error[0], diff0, diff0); error[1] = aom_sdotq_s16(error[1], diff1, diff1); error[2] = aom_sdotq_s16(error[2], diff2, diff2); error[3] = aom_sdotq_s16(error[3], diff3, diff3); coeff += 32; dqcoeff += 32; block_size -= 32; } while (block_size != 0); error[0] = vaddq_s64(error[0], error[1]); error[2] = vaddq_s64(error[2], error[3]); error[0] = vaddq_s64(error[0], error[2]); return vaddvq_s64(error[0]); } assert(block_size == 16); int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; do { const int16x8_t c0 = vld1q_s16(coeff); const int16x8_t c1 = vld1q_s16(coeff + 8); const int16x8_t d0 = vld1q_s16(dqcoeff); const int16x8_t d1 = vld1q_s16(dqcoeff + 8); const int16x8_t diff0 = vsubq_s16(c0, d0); const int16x8_t diff1 = vsubq_s16(c1, d1); error[0] = aom_sdotq_s16(error[0], diff0, diff0); error[1] = aom_sdotq_s16(error[1], diff1, diff1); coeff += 16; dqcoeff += 16; block_size -= 16; } while (block_size != 0); return vaddvq_s64(vaddq_s64(error[0], error[1])); } aom-3.12.1/av1/encoder/arm/av1_fwd_txfm2d_neon.c000066400000000000000000003700521477627663500212730ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/txfm_common.h" #include "aom_ports/mem.h" #include "av1/common/av1_txfm.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "shift_neon.h" #include "txfm_neon.h" #define TXFM_COS_BIT_MAX 13 // A note on butterfly helper naming: // // butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon // e.g. butterfly_s32_s32_x4_0231_neon // | | | ^ Weights are applied as indices 0, 2, 3, 1 // | | | (see more detail below) // | | ^ (int32)x4 input/output parameters // | ^ 32-bit accumulators internally // ^ 32-bit input/output parameters // // Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to // avoid needing separate negation instructions. This is represented in the // helper naming by referring to the lane index in the loaded tuple that each // multiply is performed with: // // in0 in1 // /---------- // out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1 // out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3 // // So for indices 0331 from the earlier example, we end up with: // // in0 in1 // /------------------ // out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0 // out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0) static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon( const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, int32x4_t *out0, int32x4_t *out1) { int32x4_t w0101 = vmovl_s16(w0101_s16); int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1); int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0); *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); } static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon( const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, int32x4_t *out0, int32x4_t *out1) { int32x4_t w0101 = vmovl_s16(w0101_s16); int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1); int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1); o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0); *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); } static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon( const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, int32x4_t *out0, int32x4_t *out1) { int32x4_t w0101 = vmovl_s16(w0101_s16); int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0); int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1); *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); } static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon( const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, int32x4_t *out0, int32x4_t *out1) { int32x4_t w0101 = vmovl_s16(w0101_s16); int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0); int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0); o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1); *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); } #define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ out0, out1) \ do { \ 
int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \ u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \ int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \ v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \ *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ } while (0) static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon( const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, int16x4_t *out0, int16x4_t *out1) { butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon( const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, int16x4_t *out0, int16x4_t *out1) { butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon( const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, int16x4_t *out0, int16x4_t *out1) { butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon( const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, int16x4_t *out0, int16x4_t *out1) { butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1); } #define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ out0, out1) \ do { \ int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \ u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \ int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \ u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \ int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \ v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \ int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \ v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \ const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \ const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \ *out0 = vcombine_s16(c0, c1); \ *out1 = vcombine_s16(d0, d1); \ } while (0) static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon( const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, int16x8_t *out0, int16x8_t *out1) { butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon( const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, int16x8_t *out0, int16x8_t *out1) { butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon( const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, int16x8_t *out0, int16x8_t *out1) { butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); } static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon( const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, int16x8_t *out0, int16x8_t *out1) { butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1); } static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; } } static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8( int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2, const int stride, const int out_size) { for 
(int i = 0; i < out_size; ++i) { vst1q_s32(out + stride * i, in1[i]); vst1q_s32(out + stride * i + 4, in2[i]); } } static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in, const int stride, int16x4_t *const out, const int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = vld1_s16(in); in += stride; } } static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride, int16x8_t *out, int out_size) { for (int i = 0; i < out_size; ++i) { out[i] = vld1q_s16(in + i * stride); } } static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { vst1q_s32(out + i * stride, vmovl_s16(in[i])); } } static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i]))); vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i]))); } } // A note on naming: // round_shift_[sqrt2]_s16_s32_4x1_neon(...) // | | | ^ 1 => a single vector // | | | n => an array of vectors // | | | ^ input/output vector element count // | | ^ output type // | ^ input type // ^ multiplicand and shift identifier static AOM_FORCE_INLINE int16x4_t round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) { return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits); } static AOM_FORCE_INLINE int16x8_t round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) { return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)), round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a))); } static AOM_FORCE_INLINE int16x4_t round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) { return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits); } static AOM_FORCE_INLINE int16x8_t round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) { return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)), round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a))); } static AOM_FORCE_INLINE int32x4_t round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) { return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits); } static AOM_FORCE_INLINE int32x4_t round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) { return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits); } #define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \ static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \ for (int i = 0; i < size; ++i) { \ out[i] = fn(in[i]); \ } \ } ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t, int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon) ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t, int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon) ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t, int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon) ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t, int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon) ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t, int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon) static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i])); } } static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 
0; i < out_size; ++i) { vst1q_s32(out + i * stride + 0, round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i]))); vst1q_s32(out + i * stride + 4, round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i]))); } } static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { int32x4_t u[6], v[6]; const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit)); const int16x4_t u01 = vqadd_s16(input[0], input[1]); v[5] = vmull_lane_s16(input[2], sinpi, 2); v[0] = vmull_lane_s16(input[1], sinpi, 1); v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0); v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3); v[2] = vmull_lane_s16(u01, sinpi, 2); v[3] = vmull_lane_s16(input[0], sinpi, 3); v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0); v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1); u[0] = vaddq_s32(v[0], v[1]); u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2); u[2] = vsubq_s32(v[3], v[4]); u[3] = vsubq_s32(u[2], u[0]); u[3] = vmlaq_n_s32(u[3], v[5], 3); output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); } static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); // stage 1-2 int16x4_t x2[8]; butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); // stage 3 int16x4_t x3[8]; x3[0] = vqadd_s16(input[0], x2[2]); x3[1] = vqsub_s16(x2[3], input[7]); x3[2] = vqsub_s16(input[0], x2[2]); x3[3] = vqadd_s16(input[7], x2[3]); x3[4] = vqsub_s16(x2[6], input[1]); x3[5] = vqadd_s16(input[6], x2[7]); x3[6] = vqadd_s16(input[1], x2[6]); x3[7] = vqsub_s16(input[6], x2[7]); // stage 4 int16x4_t x4[8]; butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]); butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]); // stage 5 int16x4_t x5[8]; x5[0] = vqadd_s16(x3[0], x4[4]); x5[1] = vqadd_s16(x3[1], x4[5]); x5[2] = vqadd_s16(x3[2], x4[6]); x5[3] = vqsub_s16(x4[7], x3[3]); x5[4] = vqsub_s16(x3[0], x4[4]); x5[5] = vqsub_s16(x3[1], x4[5]); x5[6] = vqsub_s16(x3[2], x4[6]); x5[7] = vqadd_s16(x3[3], x4[7]); // stage 6-7 butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); } static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { int32x4_t u_lo[4], u_hi[4]; const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit)); const int16x8_t u01 = vqaddq_s16(input[0], input[1]); u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1); u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1); u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0); u_hi[0] = 
vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0); u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3); u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3); u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2); u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2); u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2); u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2); u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3); u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3); u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0); u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0); u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1); u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1); u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2); u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2); u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2); u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2); u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]); u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]); const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3); u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2); u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2); output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX), vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX)); output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX), vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX)); output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX), vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX)); output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX), vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX)); } static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]); int16x4_t in12a = vadd_s16(input[1], input[2]); int16x4_t in12s = vsub_s16(input[1], input[2]); int16x4_t in03a = vadd_s16(input[0], input[3]); int16x4_t in03s = vsub_s16(input[0], input[3]); int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]); int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]); int32x4_t u[4]; u[0] = vaddq_s32(u0ad1, u0ad2); u[1] = vsubq_s32(u0ad2, u0ad1); u[2] = vmull_lane_s16(in12s, cospi16, 1); u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0); u[3] = vmull_lane_s16(in03s, cospi16, 1); u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0); output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); } // Butterfly pre-processing: // e.g. 
n=4: // out[0] = in[0] + in[3] // out[1] = in[1] + in[2] // out[2] = in[1] - in[2] // out[3] = in[0] - in[3] static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input, int16x4_t *output, int n) { for (int i = 0; i < n / 2; ++i) { output[i] = vqadd_s16(input[i], input[n - i - 1]); } for (int i = 0; i < n / 2; ++i) { output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]); } } static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input, int16x8_t *output, int n) { for (int i = 0; i < n / 2; ++i) { output[i] = vqaddq_s16(input[i], input[n - i - 1]); } for (int i = 0; i < n / 2; ++i) { output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]); } } static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input, int32x4_t *output, int n) { for (int i = 0; i < n / 2; ++i) { output[i] = vqaddq_s32(input[i], input[n - i - 1]); } for (int i = 0; i < n / 2; ++i) { output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); } } // Butterfly post-processing: // e.g. n=8: // out[0] = in0[0] + in1[3]; // out[1] = in0[1] + in1[2]; // out[2] = in0[1] - in1[2]; // out[3] = in0[0] - in1[3]; // out[4] = in0[7] - in1[4]; // out[5] = in0[6] - in1[5]; // out[6] = in0[6] + in1[5]; // out[7] = in0[7] + in1[4]; static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0, const int16x4_t *in1, int16x4_t *output, int n) { for (int i = 0; i < n / 4; ++i) { output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]); } for (int i = 0; i < n / 4; ++i) { output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); } for (int i = 0; i < n / 4; ++i) { output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]); } for (int i = 0; i < n / 4; ++i) { output[(3 * n) / 4 + i] = vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); } } static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0, const int16x8_t *in1, int16x8_t *output, int n) { for (int i = 0; i < n / 4; ++i) { output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]); } for (int i = 0; i < n / 4; ++i) { output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); } for (int i = 0; i < n / 4; ++i) { output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]); } for (int i = 0; i < n / 4; ++i) { output[(3 * n) / 4 + i] = vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); } } static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0, const int32x4_t *in1, int32x4_t *output, int n) { for (int i = 0; i < n / 4; ++i) { output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]); } for (int i = 0; i < n / 4; ++i) { output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); } for (int i = 0; i < n / 4; ++i) { output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]); } for (int i = 0; i < n / 4; ++i) { output[(3 * n) / 4 + i] = vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); } } static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); // stage 1 int16x8_t x1[4]; butterfly_dct_pre_s16_x8(input, x1, 4); // stage 2 int16x8_t x2[4]; butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]); butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]); // stage 3 output[0] = x2[0]; output[1] = x2[2]; output[2] = x2[1]; output[3] = x2[3]; } 
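// [Editorial sketch, not part of the original source.] For exposition, the
// following hypothetical scalar model shows the arithmetic the 4-point
// forward DCT kernels above (fdct4x4_neon / fdct8x4_neon) perform per
// column: a +/- pre-butterfly (stage 1) followed by weighted butterflies
// with a rounding shift by TXFM_COS_BIT_MAX = 13 (stage 2). The name
// fdct4_scalar_model is invented here; the constants are
// round(cos(k * pi / 128) * 2^13), i.e. the same Q2.13 weights the NEON
// code loads through cospi_arr_q13(); int16_t/int32_t are assumed to be
// available from <stdint.h> as in the rest of the file.
static inline void fdct4_scalar_model(const int16_t in[4], int16_t out[4]) {
  const int32_t c32 = 5793;     // ~round(cos(32 * pi / 128) * 2^13)
  const int32_t c16 = 7568;     // ~round(cos(16 * pi / 128) * 2^13)
  const int32_t c48 = 3135;     // ~round(cos(48 * pi / 128) * 2^13)
  const int32_t rnd = 1 << 12;  // rounding term for the final >> 13
  // Stage 1: butterfly pre-processing (cf. butterfly_dct_pre_s16_x*, n = 4).
  const int32_t a0 = in[0] + in[3];
  const int32_t a1 = in[1] + in[2];
  const int32_t s0 = in[0] - in[3];
  const int32_t s1 = in[1] - in[2];
  // Stage 2: weighted butterflies, mirroring the vrshrn_n_s32 rounding shift.
  out[0] = (int16_t)(((a0 + a1) * c32 + rnd) >> 13);
  out[2] = (int16_t)(((a0 - a1) * c32 + rnd) >> 13);
  out[1] = (int16_t)((s1 * c48 + s0 * c16 + rnd) >> 13);
  out[3] = (int16_t)((s0 * c48 - s1 * c16 + rnd) >> 13);
}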
static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); // stage 1 int16x4_t x1[8]; butterfly_dct_pre_s16_x4(input, x1, 8); // stage 2 int16x4_t x2[8]; butterfly_dct_pre_s16_x4(x1, x2, 4); butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); // stage 3 int16x4_t x3[8]; butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4); // stage 4-5 butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); } static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); // stage 1 int16x8_t x1[8]; butterfly_dct_pre_s16_x8(input, x1, 8); // stage 2 int16x8_t x2[8]; butterfly_dct_pre_s16_x8(x1, x2, 4); butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); // stage 3 int16x8_t x3[8]; butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4); // stage 4-5 butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); } static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); // stage 1 int16x4_t x1[16]; butterfly_dct_pre_s16_x4(input, x1, 16); // stage 2 int16x4_t x2[16]; butterfly_dct_pre_s16_x4(x1, x2, 8); butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); // stage 3 int16x4_t x3[16]; butterfly_dct_pre_s16_x4(x2, x3, 4); butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8); // stage 4 int16x4_t x4[16]; butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); 
butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4], &output[12]); butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4); butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); // stage 5 int16x4_t x5[16]; butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10], &output[6]); butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4); butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4); // stage 6-7 butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1], &output[15]); butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9], &output[7]); butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5], &output[11]); butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13], &output[3]); } static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); // stage 1 int16x8_t x1[16]; butterfly_dct_pre_s16_x8(input, x1, 16); // stage 2 int16x8_t x2[16]; butterfly_dct_pre_s16_x8(x1, x2, 8); butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); // stage 3 int16x8_t x3[16]; butterfly_dct_pre_s16_x8(x2, x3, 4); butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8); // stage 4 int16x8_t x4[16]; butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4], &output[12]); butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4); butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); // stage 5 int16x8_t x5[16]; butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10], &output[6]); butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4); butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4); // stage 6-7 butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1], &output[15]); butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9], &output[7]); butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5], &output[11]); butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13], &output[3]); } static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = 
vld1q_s16(&cospi[4 * 6]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); // stage 1 int16x8_t x1[32]; butterfly_dct_pre_s16_x8(input, x1, 32); // stage 2 int16x8_t x2[32]; butterfly_dct_pre_s16_x8(x1, x2, 16); butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]); // stage 3 int16x8_t x3[32]; butterfly_dct_pre_s16_x8(x2, x3, 8); butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]); butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]); butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16); // stage 4 int16x8_t x4[32]; butterfly_dct_pre_s16_x8(x3, x4, 4); butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]); butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8); butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]); // stage 5 int16x8_t x5[32]; butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0], &output[16]); butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8], &output[24]); butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4); butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]); butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]); butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8); butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8); // stage 6 int16x8_t x6[32]; butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20], &output[12]); butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4); butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4); butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]); butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]); butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]); // stage 7 int16x8_t x7[32]; butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2], &output[30]); butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], 
&output[18], &output[14]); butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10], &output[22]); butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26], &output[6]); butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4); butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4); butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4); butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4); butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1], &output[31]); butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17], &output[15]); butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9], &output[23]); butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25], &output[7]); butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5], &output[27]); butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21], &output[11]); butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13], &output[19]); butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29], &output[3]); } static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); const int16x4_t cospi1 = vget_low_s16(cospi1_3); const int16x4_t cospi3 = vget_high_s16(cospi1_3); const int16x4_t cospi5 = vget_low_s16(cospi5_7); const int16x4_t cospi7 = vget_high_s16(cospi5_7); const int16x4_t cospi9 = vget_low_s16(cospi9_11); const int16x4_t cospi11 = vget_high_s16(cospi9_11); const int16x4_t cospi13 = vget_low_s16(cospi13_15); const int16x4_t cospi15 = vget_high_s16(cospi13_15); const int16x4_t cospi17 = vget_low_s16(cospi17_19); const int16x4_t cospi19 = vget_high_s16(cospi17_19); const int16x4_t cospi21 = vget_low_s16(cospi21_23); const int16x4_t cospi23 = vget_high_s16(cospi21_23); 
const int16x4_t cospi25 = vget_low_s16(cospi25_27); const int16x4_t cospi27 = vget_high_s16(cospi25_27); const int16x4_t cospi29 = vget_low_s16(cospi29_31); const int16x4_t cospi31 = vget_high_s16(cospi29_31); // stage 1 int16x8_t x1[64]; butterfly_dct_pre_s16_x8(input, x1, 64); // stage 2 int16x8_t x2[64]; butterfly_dct_pre_s16_x8(x1, x2, 32); butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); // stage 3 int16x8_t x3[64]; butterfly_dct_pre_s16_x8(x2, x3, 16); x3[16] = x2[16]; x3[17] = x2[17]; x3[18] = x2[18]; x3[19] = x2[19]; butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); x3[28] = x2[28]; x3[29] = x2[29]; x3[30] = x2[30]; x3[31] = x2[31]; butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32); // stage 4 int16x8_t x4[64]; butterfly_dct_pre_s16_x8(x3, x4, 8); butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16); butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); // stage 5 int16x8_t x5[64]; butterfly_dct_pre_s16_x8(x4, x5, 4); butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8); butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16); butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16); // stage 6 int16x8_t x6[64]; butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]); butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4); butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8); butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8); butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); 
butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); // stage 7 int16x8_t x7[64]; butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4); butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4); butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8); butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8); butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8); butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8); // stage 8 int16x8_t x8[64]; butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4); butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4); butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4); butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4); butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); // stage 9 int16x8_t x9[64]; butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4); butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4); butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4); butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4); butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4); butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4); butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4); butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 
60, 4); // stage 10 butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1], &output[63]); butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33], &output[31]); butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17], &output[47]); butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49], &output[15]); butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9], &output[55]); butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41], &output[23]); butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25], &output[39]); butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57], &output[7]); butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5], &output[59]); butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37], &output[27]); butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21], &output[43]); butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53], &output[11]); butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13], &output[51]); butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45], &output[19]); butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29], &output[35]); butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61], &output[3]); // stage 11 output[0] = x6[0]; output[2] = x9[16]; output[4] = x8[8]; output[6] = x9[24]; output[8] = x7[4]; output[10] = x9[20]; output[12] = x8[12]; output[14] = x9[28]; output[16] = x6[2]; output[18] = x9[18]; output[20] = x8[10]; output[22] = x9[26]; output[24] = x7[6]; output[26] = x9[22]; output[28] = x8[14]; output[30] = x9[30]; output[32] = x6[1]; output[34] = x9[17]; output[36] = x8[9]; output[38] = x9[25]; output[40] = x7[5]; output[42] = x9[21]; output[44] = x8[13]; output[46] = x9[29]; output[48] = x6[3]; output[52] = x8[11]; output[54] = x9[27]; output[56] = x7[7]; output[58] = x9[23]; output[60] = x8[15]; output[62] = x9[31]; } static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); // stage 2 int16x8_t x2[8]; butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); // stage 3 int16x8_t x3[8]; x3[0] = vqaddq_s16(input[0], x2[2]); x3[1] = vqsubq_s16(x2[3], input[7]); x3[2] = vqsubq_s16(input[0], x2[2]); x3[3] = vqaddq_s16(input[7], x2[3]); x3[4] = vqsubq_s16(x2[6], input[1]); x3[5] = vqaddq_s16(input[6], x2[7]); x3[6] = vqaddq_s16(input[1], x2[6]); x3[7] = vqsubq_s16(input[6], x2[7]); // stage 4 butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); // stage 5 int16x8_t x5[8]; x5[0] = vqaddq_s16(x3[0], x3[4]); x5[1] = vqaddq_s16(x3[1], x3[5]); x5[2] = vqaddq_s16(x3[2], x3[6]); x5[3] = vqsubq_s16(x3[7], x3[3]); x5[4] = vqsubq_s16(x3[0], x3[4]); x5[5] = vqsubq_s16(x3[1], x3[5]); x5[6] = vqsubq_s16(x3[2], x3[6]); x5[7] = 
vqaddq_s16(x3[3], x3[7]); // stage 6 butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); } static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); // stage 2 int16x4_t x2[8]; butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); // stage 3 int16x4_t x3[16]; x3[0] = vqadd_s16(input[0], x2[0]); x3[1] = vqsub_s16(x2[1], input[15]); x3[2] = vqsub_s16(input[0], x2[0]); x3[3] = vqadd_s16(input[15], x2[1]); x3[4] = vqsub_s16(x2[2], input[3]); x3[5] = vqadd_s16(input[12], x2[3]); x3[6] = vqadd_s16(input[3], x2[2]); x3[7] = vqsub_s16(input[12], x2[3]); x3[8] = vqsub_s16(x2[4], input[1]); x3[9] = vqadd_s16(input[14], x2[5]); x3[10] = vqadd_s16(input[1], x2[4]); x3[11] = vqsub_s16(input[14], x2[5]); x3[12] = vqadd_s16(input[2], x2[6]); x3[13] = vqsub_s16(x2[7], input[13]); x3[14] = vqsub_s16(input[2], x2[6]); x3[15] = vqadd_s16(input[13], x2[7]); // stage 4 butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); // stage 5 int16x4_t x5[16]; x5[0] = vqadd_s16(x3[0], x3[4]); x5[1] = vqadd_s16(x3[1], x3[5]); x5[2] = vqadd_s16(x3[2], x3[6]); x5[3] = vqsub_s16(x3[7], x3[3]); x5[4] = vqsub_s16(x3[0], x3[4]); x5[5] = vqsub_s16(x3[1], x3[5]); x5[6] = vqsub_s16(x3[2], x3[6]); x5[7] = vqadd_s16(x3[3], x3[7]); x5[8] = vqadd_s16(x3[8], x3[12]); x5[9] = vqadd_s16(x3[9], x3[13]); x5[10] = vqsub_s16(x3[14], x3[10]); x5[11] = vqadd_s16(x3[11], x3[15]); x5[12] = vqsub_s16(x3[8], x3[12]); x5[13] = vqsub_s16(x3[9], x3[13]); x5[14] = vqadd_s16(x3[10], x3[14]); x5[15] = vqsub_s16(x3[11], x3[15]); // stage 6 butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); 
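// Note: every butterfly_s16_s32_x*_*_neon call above and below is a Q-format
// planar rotation using a cosine pair loaded from cospi_arr_q13(). As a rough
// scalar illustration only (the exact operand/sign permutation encoded by the
// 0112/1003/1223/0332 suffixes is defined by the helpers earlier in this
// file), one such rotation computes
//   out0 ~= round_shift(w0 * in0 + w1 * in1, cos_bit)
//   out1 ~= round_shift(w1 * in0 - w0 * in1, cos_bit)
// with w0/w1 taken from the two halves of the loaded cospi pair, widening the
// products to 32 bits before narrowing back to int16.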
butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); // stage 7 int16x4_t x7[16]; x7[0] = vqadd_s16(x5[0], x5[8]); x7[1] = vqadd_s16(x5[1], x5[9]); x7[2] = vqadd_s16(x5[2], x5[10]); x7[3] = vqadd_s16(x5[3], x5[11]); x7[4] = vqadd_s16(x5[4], x5[12]); x7[5] = vqadd_s16(x5[5], x5[13]); x7[6] = vqadd_s16(x5[6], x5[14]); x7[7] = vqsub_s16(x5[15], x5[7]); x7[8] = vqsub_s16(x5[0], x5[8]); x7[9] = vqsub_s16(x5[1], x5[9]); x7[10] = vqsub_s16(x5[2], x5[10]); x7[11] = vqsub_s16(x5[3], x5[11]); x7[12] = vqsub_s16(x5[4], x5[12]); x7[13] = vqsub_s16(x5[5], x5[13]); x7[14] = vqsub_s16(x5[6], x5[14]); x7[15] = vqadd_s16(x5[7], x5[15]); // stage 8 butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13], &output[2]); butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11], &output[4]); butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5], &output[10]); butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3], &output[12]); butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14], &output[1]); } static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); // stage 2 int16x8_t x2[8]; butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); // stage 3 int16x8_t x3[16]; x3[0] = vqaddq_s16(input[0], x2[0]); x3[1] = vqsubq_s16(x2[1], input[15]); x3[2] = vqsubq_s16(input[0], x2[0]); x3[3] = vqaddq_s16(input[15], x2[1]); x3[4] = vqsubq_s16(x2[2], input[3]); x3[5] = vqaddq_s16(input[12], x2[3]); x3[6] = vqaddq_s16(input[3], x2[2]); x3[7] = vqsubq_s16(input[12], x2[3]); x3[8] = vqsubq_s16(x2[4], input[1]); x3[9] = vqaddq_s16(input[14], x2[5]); x3[10] = vqaddq_s16(input[1], x2[4]); x3[11] = vqsubq_s16(input[14], x2[5]); x3[12] = vqaddq_s16(input[2], x2[6]); x3[13] = vqsubq_s16(x2[7], input[13]); x3[14] = vqsubq_s16(input[2], x2[6]); x3[15] = vqaddq_s16(input[13], x2[7]); // stage 4 butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); 
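// This function (fadst8x16_neon) mirrors fadst4x16_neon above stage for
// stage, but works on int16x8_t vectors, i.e. eight columns per call; the
// saturating vqaddq/vqsubq sums keep the intermediates inside the int16 range
// expected by the later stages. In stage 4 only four of the stage-3 pairs are
// rotated in place with cospi16; the remaining lanes pass straight through to
// stage 5.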
butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); // stage 5 int16x8_t x5[16]; x5[0] = vqaddq_s16(x3[0], x3[4]); x5[1] = vqaddq_s16(x3[1], x3[5]); x5[2] = vqaddq_s16(x3[2], x3[6]); x5[3] = vqsubq_s16(x3[7], x3[3]); x5[4] = vqsubq_s16(x3[0], x3[4]); x5[5] = vqsubq_s16(x3[1], x3[5]); x5[6] = vqsubq_s16(x3[2], x3[6]); x5[7] = vqaddq_s16(x3[3], x3[7]); x5[8] = vqaddq_s16(x3[8], x3[12]); x5[9] = vqaddq_s16(x3[9], x3[13]); x5[10] = vqsubq_s16(x3[14], x3[10]); x5[11] = vqaddq_s16(x3[11], x3[15]); x5[12] = vqsubq_s16(x3[8], x3[12]); x5[13] = vqsubq_s16(x3[9], x3[13]); x5[14] = vqaddq_s16(x3[10], x3[14]); x5[15] = vqsubq_s16(x3[11], x3[15]); // stage 6 butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); // stage 7 int16x8_t x7[16]; x7[0] = vqaddq_s16(x5[0], x5[8]); x7[1] = vqaddq_s16(x5[1], x5[9]); x7[2] = vqaddq_s16(x5[2], x5[10]); x7[3] = vqaddq_s16(x5[3], x5[11]); x7[4] = vqaddq_s16(x5[4], x5[12]); x7[5] = vqaddq_s16(x5[5], x5[13]); x7[6] = vqaddq_s16(x5[6], x5[14]); x7[7] = vqsubq_s16(x5[15], x5[7]); x7[8] = vqsubq_s16(x5[0], x5[8]); x7[9] = vqsubq_s16(x5[1], x5[9]); x7[10] = vqsubq_s16(x5[2], x5[10]); x7[11] = vqsubq_s16(x5[3], x5[11]); x7[12] = vqsubq_s16(x5[4], x5[12]); x7[13] = vqsubq_s16(x5[5], x5[13]); x7[14] = vqsubq_s16(x5[6], x5[14]); x7[15] = vqaddq_s16(x5[7], x5[15]); // stage 8 butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13], &output[2]); butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11], &output[4]); butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5], &output[10]); butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3], &output[12]); butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14], &output[1]); } static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input, int16x4_t *const output, const int cos_bit) { (void)cos_bit; round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4); } static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input, int16x8_t *const output, const int cos_bit) { (void)cos_bit; round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4); } static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { (void)cos_bit; shift_left_1_s16_x4(input, output, 8); } static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { (void)cos_bit; shift_left_1_s16_x8(input, output, 8); } static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input, int16x4_t *output, int cos_bit) { (void)cos_bit; round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16); } static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { (void)cos_bit; round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16); } static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output, int cos_bit) { (void)cos_bit; shift_left_2_s16_x8(input, output, 32); } #define TRANSFORM_COL(name, tw, 
n) \ static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \ int stride, int cos_bit) { \ int16x##tw##_t buf0[n]; \ load_buffer_s16_x##tw(input, stride, buf0, n); \ shift_left_2_s16_x##tw(buf0, buf0, n); \ name##_neon(buf0, output, cos_bit); \ } TRANSFORM_COL(fadst4x4, 4, 4) TRANSFORM_COL(fadst4x8, 4, 8) TRANSFORM_COL(fadst4x16, 4, 16) TRANSFORM_COL(fadst8x4, 8, 4) TRANSFORM_COL(fadst8x8, 8, 8) TRANSFORM_COL(fadst8x16, 8, 16) TRANSFORM_COL(fdct4x4, 4, 4) TRANSFORM_COL(fdct4x8, 4, 8) TRANSFORM_COL(fdct4x16, 4, 16) TRANSFORM_COL(fdct8x4, 8, 4) TRANSFORM_COL(fdct8x8, 8, 8) TRANSFORM_COL(fdct8x16, 8, 16) TRANSFORM_COL(fdct8x32, 8, 32) TRANSFORM_COL(fidentity4x4, 4, 4) TRANSFORM_COL(fidentity4x8, 4, 8) TRANSFORM_COL(fidentity4x16, 4, 16) TRANSFORM_COL(fidentity8x4, 8, 4) TRANSFORM_COL(fidentity8x8, 8, 8) TRANSFORM_COL(fidentity8x16, 8, 16) TRANSFORM_COL(fidentity8x32, 8, 32) #define TRANSFORM_ROW(name, tw, n) \ static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \ int stride, int cos_bit) { \ int16x##tw##_t buf0[n]; \ name##_neon(input, buf0, cos_bit); \ store_buffer_s16_x##tw(buf0, output, stride, n); \ } #define TRANSFORM_ROW_RECT(name, tw, n) \ static void name##_row_rect_neon(const int16x##tw##_t *input, \ int32_t *output, int stride, int cos_bit) { \ int16x##tw##_t buf0[n]; \ name##_neon(input, buf0, cos_bit); \ store_rect_buffer_s16_x##tw(buf0, output, stride, n); \ } TRANSFORM_ROW(fadst4x4, 4, 4) TRANSFORM_ROW(fadst4x16, 4, 16) TRANSFORM_ROW(fadst8x4, 8, 4) TRANSFORM_ROW(fadst8x8, 8, 8) TRANSFORM_ROW(fadst8x16, 8, 16) TRANSFORM_ROW(fdct4x4, 4, 4) TRANSFORM_ROW(fdct4x16, 4, 16) TRANSFORM_ROW(fdct8x4, 8, 4) TRANSFORM_ROW(fdct8x8, 8, 8) TRANSFORM_ROW(fdct8x16, 8, 16) TRANSFORM_ROW(fdct8x32, 8, 32) TRANSFORM_ROW(fidentity4x4, 4, 4) TRANSFORM_ROW(fidentity4x16, 4, 16) TRANSFORM_ROW(fidentity8x4, 8, 4) TRANSFORM_ROW(fidentity8x8, 8, 8) TRANSFORM_ROW(fidentity8x16, 8, 16) TRANSFORM_ROW(fidentity8x32, 8, 32) TRANSFORM_ROW_RECT(fadst4x8, 4, 8) TRANSFORM_ROW_RECT(fadst8x4, 8, 4) TRANSFORM_ROW_RECT(fadst8x8, 8, 8) TRANSFORM_ROW_RECT(fadst8x16, 8, 16) TRANSFORM_ROW_RECT(fdct4x8, 4, 8) TRANSFORM_ROW_RECT(fdct8x4, 8, 4) TRANSFORM_ROW_RECT(fdct8x8, 8, 8) TRANSFORM_ROW_RECT(fdct8x16, 8, 16) TRANSFORM_ROW_RECT(fdct8x32, 8, 32) TRANSFORM_ROW_RECT(fidentity4x8, 4, 8) TRANSFORM_ROW_RECT(fidentity8x4, 8, 4) TRANSFORM_ROW_RECT(fidentity8x8, 8, 8) TRANSFORM_ROW_RECT(fidentity8x16, 8, 16) TRANSFORM_ROW_RECT(fidentity8x32, 8, 32) typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input, int16x4_t *output, int cos_bit); typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input, int16x8_t *output, int cos_bit); typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input, int16x4_t *output, int stride, int cos_bit); typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input, int16x8_t *output, int stride, int cos_bit); typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input, int32_t *output, int stride, int cos_bit); typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input, int32_t *output, int stride, int cos_bit); static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = { fdct4x8_col_neon, // DCT_DCT fadst4x8_col_neon, // ADST_DCT fdct4x8_col_neon, // DCT_ADST fadst4x8_col_neon, // ADST_ADST fadst4x8_col_neon, // FLIPADST_DCT fdct4x8_col_neon, // DCT_FLIPADST fadst4x8_col_neon, // FLIPADST_FLIPADST fadst4x8_col_neon, // ADST_FLIPADST fadst4x8_col_neon, // FLIPADST_ADST fidentity4x8_col_neon, // IDTX fdct4x8_col_neon, 
// V_DCT fidentity4x8_col_neon, // H_DCT fadst4x8_col_neon, // V_ADST fidentity4x8_col_neon, // H_ADST fadst4x8_col_neon, // V_FLIPADST fidentity4x8_col_neon // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = { fdct8x4_row_neon, // DCT_DCT fdct8x4_row_neon, // ADST_DCT fadst8x4_row_neon, // DCT_ADST fadst8x4_row_neon, // ADST_ADST fdct8x4_row_neon, // FLIPADST_DCT fadst8x4_row_neon, // DCT_FLIPADST fadst8x4_row_neon, // FLIPADST_FLIPADST fadst8x4_row_neon, // ADST_FLIPADST fadst8x4_row_neon, // FLIPADST_ADST fidentity8x4_row_neon, // IDTX fidentity8x4_row_neon, // V_DCT fdct8x4_row_neon, // H_DCT fidentity8x4_row_neon, // V_ADST fadst8x4_row_neon, // H_ADST fidentity8x4_row_neon, // V_FLIPADST fadst8x4_row_neon // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = { fdct8x4_row_rect_neon, // DCT_DCT fdct8x4_row_rect_neon, // ADST_DCT fadst8x4_row_rect_neon, // DCT_ADST fadst8x4_row_rect_neon, // ADST_ADST fdct8x4_row_rect_neon, // FLIPADST_DCT fadst8x4_row_rect_neon, // DCT_FLIPADST fadst8x4_row_rect_neon, // FLIPADST_FLIPADST fadst8x4_row_rect_neon, // ADST_FLIPADST fadst8x4_row_rect_neon, // FLIPADST_ADST fidentity8x4_row_rect_neon, // IDTX fidentity8x4_row_rect_neon, // V_DCT fdct8x4_row_rect_neon, // H_DCT fidentity8x4_row_rect_neon, // V_ADST fadst8x4_row_rect_neon, // H_ADST fidentity8x4_row_rect_neon, // V_FLIPADST fadst8x4_row_rect_neon // H_FLIPADST }; static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = { fdct8x4_col_neon, // DCT_DCT fadst8x4_col_neon, // ADST_DCT fdct8x4_col_neon, // DCT_ADST fadst8x4_col_neon, // ADST_ADST fadst8x4_col_neon, // FLIPADST_DCT fdct8x4_col_neon, // DCT_FLIPADST fadst8x4_col_neon, // FLIPADST_FLIPADST fadst8x4_col_neon, // ADST_FLIPADST fadst8x4_col_neon, // FLIPADST_ADST fidentity8x4_col_neon, // IDTX fdct8x4_col_neon, // V_DCT fidentity8x4_col_neon, // H_DCT fadst8x4_col_neon, // V_ADST fidentity8x4_col_neon, // H_ADST fadst8x4_col_neon, // V_FLIPADST fidentity8x4_col_neon // H_FLIPADST }; static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = { fdct4x8_row_rect_neon, // DCT_DCT fdct4x8_row_rect_neon, // ADST_DCT fadst4x8_row_rect_neon, // DCT_ADST fadst4x8_row_rect_neon, // ADST_ADST fdct4x8_row_rect_neon, // FLIPADST_DCT fadst4x8_row_rect_neon, // DCT_FLIPADST fadst4x8_row_rect_neon, // FLIPADST_FLIPADST fadst4x8_row_rect_neon, // ADST_FLIPADST fadst4x8_row_rect_neon, // FLIPADST_ADST fidentity4x8_row_rect_neon, // IDTX fidentity4x8_row_rect_neon, // V_DCT fdct4x8_row_rect_neon, // H_DCT fidentity4x8_row_rect_neon, // V_ADST fadst4x8_row_rect_neon, // H_ADST fidentity4x8_row_rect_neon, // V_FLIPADST fadst4x8_row_rect_neon // H_FLIPADST }; static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = { fdct8x8_col_neon, // DCT_DCT fadst8x8_col_neon, // ADST_DCT fdct8x8_col_neon, // DCT_ADST fadst8x8_col_neon, // ADST_ADST fadst8x8_col_neon, // FLIPADST_DCT fdct8x8_col_neon, // DCT_FLIPADST fadst8x8_col_neon, // FLIPADST_FLIPADST fadst8x8_col_neon, // ADST_FLIPADST fadst8x8_col_neon, // FLIPADST_ADST fidentity8x8_col_neon, // IDTX fdct8x8_col_neon, // V_DCT fidentity8x8_col_neon, // H_DCT fadst8x8_col_neon, // V_ADST fidentity8x8_col_neon, // H_ADST fadst8x8_col_neon, // V_FLIPADST fidentity8x8_col_neon, // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = { fdct8x8_row_neon, // DCT_DCT fdct8x8_row_neon, // ADST_DCT fadst8x8_row_neon, // DCT_ADST fadst8x8_row_neon, // ADST_ADST fdct8x8_row_neon, // 
FLIPADST_DCT fadst8x8_row_neon, // DCT_FLIPADST fadst8x8_row_neon, // FLIPADST_FLIPADST fadst8x8_row_neon, // ADST_FLIPADST fadst8x8_row_neon, // FLIPADST_ADST fidentity8x8_row_neon, // IDTX fidentity8x8_row_neon, // V_DCT fdct8x8_row_neon, // H_DCT fidentity8x8_row_neon, // V_ADST fadst8x8_row_neon, // H_ADST fidentity8x8_row_neon, // V_FLIPADST fadst8x8_row_neon // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = { fdct8x8_row_rect_neon, // DCT_DCT fdct8x8_row_rect_neon, // ADST_DCT fadst8x8_row_rect_neon, // DCT_ADST fadst8x8_row_rect_neon, // ADST_ADST fdct8x8_row_rect_neon, // FLIPADST_DCT fadst8x8_row_rect_neon, // DCT_FLIPADST fadst8x8_row_rect_neon, // FLIPADST_FLIPADST fadst8x8_row_rect_neon, // ADST_FLIPADST fadst8x8_row_rect_neon, // FLIPADST_ADST fidentity8x8_row_rect_neon, // IDTX fidentity8x8_row_rect_neon, // V_DCT fdct8x8_row_rect_neon, // H_DCT fidentity8x8_row_rect_neon, // V_ADST fadst8x8_row_rect_neon, // H_ADST fidentity8x8_row_rect_neon, // V_FLIPADST fadst8x8_row_rect_neon // H_FLIPADST }; static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = { fdct4x16_col_neon, // DCT_DCT fadst4x16_col_neon, // ADST_DCT fdct4x16_col_neon, // DCT_ADST fadst4x16_col_neon, // ADST_ADST fadst4x16_col_neon, // FLIPADST_DCT fdct4x16_col_neon, // DCT_FLIPADST fadst4x16_col_neon, // FLIPADST_FLIPADST fadst4x16_col_neon, // ADST_FLIPADST fadst4x16_col_neon, // FLIPADST_ADST fidentity4x16_col_neon, // IDTX fdct4x16_col_neon, // V_DCT fidentity4x16_col_neon, // H_DCT fadst4x16_col_neon, // V_ADST fidentity4x16_col_neon, // H_ADST fadst4x16_col_neon, // V_FLIPADST fidentity4x16_col_neon // H_FLIPADST }; static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = { fdct4x16_row_neon, // DCT_DCT fdct4x16_row_neon, // ADST_DCT fadst4x16_row_neon, // DCT_ADST fadst4x16_row_neon, // ADST_ADST fdct4x16_row_neon, // FLIPADST_DCT fadst4x16_row_neon, // DCT_FLIPADST fadst4x16_row_neon, // FLIPADST_FLIPADST fadst4x16_row_neon, // ADST_FLIPADST fadst4x16_row_neon, // FLIPADST_ADST fidentity4x16_row_neon, // IDTX fidentity4x16_row_neon, // V_DCT fdct4x16_row_neon, // H_DCT fidentity4x16_row_neon, // V_ADST fadst4x16_row_neon, // H_ADST fidentity4x16_row_neon, // V_FLIPADST fadst4x16_row_neon // H_FLIPADST }; static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = { fdct8x16_col_neon, // DCT_DCT fadst8x16_col_neon, // ADST_DCT fdct8x16_col_neon, // DCT_ADST fadst8x16_col_neon, // ADST_ADST fadst8x16_col_neon, // FLIPADST_DCT fdct8x16_col_neon, // DCT_FLIPADST fadst8x16_col_neon, // FLIPADST_FLIPADST fadst8x16_col_neon, // ADST_FLIPADST fadst8x16_col_neon, // FLIPADST_ADST fidentity8x16_col_neon, // IDTX fdct8x16_col_neon, // V_DCT fidentity8x16_col_neon, // H_DCT fadst8x16_col_neon, // V_ADST fidentity8x16_col_neon, // H_ADST fadst8x16_col_neon, // V_FLIPADST fidentity8x16_col_neon // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = { fdct8x16_row_neon, // DCT_DCT fdct8x16_row_neon, // ADST_DCT fadst8x16_row_neon, // DCT_ADST fadst8x16_row_neon, // ADST_ADST fdct8x16_row_neon, // FLIPADST_DCT fadst8x16_row_neon, // DCT_FLIPADST fadst8x16_row_neon, // FLIPADST_FLIPADST fadst8x16_row_neon, // ADST_FLIPADST fadst8x16_row_neon, // FLIPADST_ADST fidentity8x16_row_neon, // IDTX fidentity8x16_row_neon, // V_DCT fdct8x16_row_neon, // H_DCT fidentity8x16_row_neon, // V_ADST fadst8x16_row_neon, // H_ADST fidentity8x16_row_neon, // V_FLIPADST fadst8x16_row_neon // H_FLIPADST }; static const 
row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = { fdct8x16_row_rect_neon, // DCT_DCT fdct8x16_row_rect_neon, // ADST_DCT fadst8x16_row_rect_neon, // DCT_ADST fadst8x16_row_rect_neon, // ADST_ADST fdct8x16_row_rect_neon, // FLIPADST_DCT fadst8x16_row_rect_neon, // DCT_FLIPADST fadst8x16_row_rect_neon, // FLIPADST_FLIPADST fadst8x16_row_rect_neon, // ADST_FLIPADST fadst8x16_row_rect_neon, // FLIPADST_ADST fidentity8x16_row_rect_neon, // IDTX fidentity8x16_row_rect_neon, // V_DCT fdct8x16_row_rect_neon, // H_DCT fidentity8x16_row_rect_neon, // V_ADST fadst8x16_row_rect_neon, // H_ADST fidentity8x16_row_rect_neon, // V_FLIPADST fadst8x16_row_rect_neon // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = { fdct8x32_row_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity8x32_row_neon, // IDTX fidentity8x32_row_neon, // V_DCT fdct8x32_row_neon, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = { fdct8x32_row_rect_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity8x32_row_rect_neon, // IDTX fidentity8x32_row_rect_neon, // V_DCT fdct8x32_row_rect_neon, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = { fdct8x32_col_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity8x32_col_neon, // IDTX fdct8x32_col_neon, // V_DCT fidentity8x32_col_neon, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); int16x4_t buf0[4], buf1[4]; switch (tx_type) { case DCT_DCT: fdct4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fdct4x4_row_neon(buf1, output, 4, 13); break; case ADST_DCT: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fdct4x4_row_neon(buf1, output, 4, 13); break; case DCT_ADST: fdct4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fadst4x4_row_neon(buf1, output, 4, 13); break; case ADST_ADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fadst4x4_row_neon(buf1, output, 4, 13); break; case FLIPADST_DCT: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fdct4x4_row_neon(buf1, output, 4, 13); break; case DCT_FLIPADST: fdct4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); flip_buf_4_neon(buf1, buf0, 4); fadst4x4_row_neon(buf0, output, 4, 13); break; case FLIPADST_FLIPADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); flip_buf_4_neon(buf1, buf0, 4); fadst4x4_row_neon(buf0, output, 4, 13); break; case ADST_FLIPADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); flip_buf_4_neon(buf1, buf0, 4); 
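// For tx types whose horizontal component is FLIPADST, the left/right flip is
// applied here by reversing the transposed buffer (flip_buf_4_neon) before the
// row transform; the vertical flip for FLIPADST_* types was already handled by
// ud_adjust_input_and_stride() before the switch.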
fadst4x4_row_neon(buf0, output, 4, 13); break; case FLIPADST_ADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fadst4x4_row_neon(buf1, output, 4, 13); break; case IDTX: fidentity4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fidentity4x4_row_neon(buf1, output, 4, 13); break; case V_DCT: fdct4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fidentity4x4_row_neon(buf1, output, 4, 13); break; case H_DCT: fidentity4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fdct4x4_row_neon(buf1, output, 4, 13); break; case V_ADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fidentity4x4_row_neon(buf1, output, 4, 13); break; case H_ADST: fidentity4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fadst4x4_row_neon(buf1, output, 4, 13); break; case V_FLIPADST: fadst4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); fidentity4x4_row_neon(buf1, output, 4, 13); break; case H_FLIPADST: fidentity4x4_col_neon(input, buf0, stride, 13); transpose_arrays_s16_4x4(buf0, buf1); flip_buf_4_neon(buf1, buf0, 4); fadst4x4_row_neon(buf0, output, 4, 13); break; } } static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x4_t buf0[8]; int16x8_t buf1[8]; const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); col_txfm(input, buf0, stride, 13); shift_right_1_round_s16_x4(buf0, buf0, 8); transpose_arrays_s16_4x8(buf0, buf1); if (lr_flip) { int16x8_t buf2[8]; flip_buf_8_neon(buf1, buf2, 4); row_txfm(buf2, output, 8, 13); } else { row_txfm(buf1, output, 8, 13); } } static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x4_t buf0[16]; int16x8_t buf1[16]; const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); col_txfm(input, buf0, stride, 13); shift_right_1_round_s16_x4(buf0, buf0, 16); transpose_arrays_s16_4x8(buf0, buf1); transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8); for (int i = 0; i < 2; i++) { if (lr_flip) { int16x8_t buf2[16]; flip_buf_8_neon(buf1 + 8 * i, buf2, 4); row_txfm(buf2, output + 8 * i, 16, 12); } else { int16x8_t *buf = buf1 + 8 * i; row_txfm(buf, output + 8 * i, 16, 12); } } } static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[8]; int16x4_t buf1[8]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); col_txfm(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 4); transpose_arrays_s16_8x4(buf0, buf1); if (lr_flip) { int16x4_t buf2[8]; flip_buf_4_neon(buf1, buf2, 8); row_txfm(buf2, output, 4, 13); } else { row_txfm(buf1, output, 4, 13); } } static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output, int stride, 
TX_TYPE tx_type, int bd) { (void)bd; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); int16x8_t buf0[8], buf1[8]; switch (tx_type) { case DCT_DCT: fdct8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fdct8x8_row_neon(buf1, output, 8, 13); break; case ADST_DCT: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fdct8x8_row_neon(buf1, output, 8, 13); break; case DCT_ADST: fdct8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fadst8x8_row_neon(buf1, output, 8, 13); break; case ADST_ADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fadst8x8_row_neon(buf1, output, 8, 13); break; case FLIPADST_DCT: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fdct8x8_row_neon(buf1, output, 8, 13); break; case DCT_FLIPADST: fdct8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); flip_buf_8_neon(buf1, buf0, 8); fadst8x8_row_neon(buf0, output, 8, 13); break; case FLIPADST_FLIPADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); flip_buf_8_neon(buf1, buf0, 8); fadst8x8_row_neon(buf0, output, 8, 13); break; case ADST_FLIPADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); flip_buf_8_neon(buf1, buf0, 8); fadst8x8_row_neon(buf0, output, 8, 13); break; case FLIPADST_ADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fadst8x8_row_neon(buf1, output, 8, 13); break; case IDTX: fidentity8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fidentity8x8_row_neon(buf1, output, 8, 13); break; case V_DCT: fdct8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fidentity8x8_row_neon(buf1, output, 8, 13); break; case H_DCT: fidentity8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fdct8x8_row_neon(buf1, output, 8, 13); break; case V_ADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fidentity8x8_row_neon(buf1, output, 8, 13); break; case H_ADST: fidentity8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fadst8x8_row_neon(buf1, output, 8, 13); break; case V_FLIPADST: fadst8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); fidentity8x8_row_neon(buf1, output, 8, 13); break; case H_FLIPADST: fidentity8x8_col_neon(input, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1); flip_buf_8_neon(buf1, buf0, 8); fadst8x8_row_neon(buf0, output, 8, 13); break; } } static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[16], buf1[16]; const 
col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); col_txfm(input, buf0, stride, 13); shift_right_2_round_s16_x8(buf0, buf0, 16); transpose_arrays_s16_8x8(buf0, buf1); transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); for (int i = 0; i < 2; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 8 * i, buf0, 8); row_txfm(buf0, output + 8 * i, 16, 13); } else { int16x8_t *buf = buf1 + 8 * i; row_txfm(buf, output + 8 * i, 16, 13); } } } static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[32], buf1[32]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); col_txfm(input, buf0, stride, 12); shift_right_2_round_s16_x8(buf0, buf0, 32); transpose_arrays_s16_8x8(buf0, buf1); transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16); transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24); for (int i = 0; i < 4; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 8 * i, buf0, 8); row_txfm(buf0, output + 8 * i, 32, 12); } else { int16x8_t *buf = buf1 + 8 * i; row_txfm(buf, output + 8 * i, 32, 12); } } } static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[16]; int16x4_t buf1[16]; int16x4_t buf2[16]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); for (int i = 0; i < 2; i++) { col_txfm(input + 8 * i, buf0, stride, 13); shift_right_1_round_s16_x8(buf0, buf0, 4); transpose_arrays_s16_8x4(buf0, buf1 + 8 * i); } if (lr_flip) { flip_buf_4_neon(buf1, buf2, 16); row_txfm(buf2, output, 4, 13); } else { row_txfm(buf1, output, 4, 13); } } static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[16], buf1[16]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); for (int i = 0; i < 2; i++) { col_txfm(input + 8 * i, buf0, stride, 13); shift_right_2_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1 + 8 * i); } if (lr_flip) { flip_buf_8_neon(buf1, buf0, 16); row_txfm(buf0, output, 8, 13); } else { row_txfm(buf1, output, 8, 13); } } static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[16], buf1[32]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); for (int i = 0; i < 2; i++) { col_txfm(input + 8 * i, buf0, stride, 13); shift_right_2_round_s16_x8(buf0, buf0, 16); 
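// 16x16 column pass: each iteration transforms a slab of eight columns with
// the length-16 column kernel, applies the 2-bit mid-stage rounding shift for
// this block size, and transposes the two resulting 8x8 tiles so that the row
// pass below can consume whole rows and emit 32-bit coefficients (cos_bit 12).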
transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i); transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i); } for (int i = 0; i < 2; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 16 * i, buf0, 16); row_txfm(buf0, output + 8 * i, 16, 12); } else { int16x8_t *buf = buf1 + 16 * i; row_txfm(buf, output + 8 * i, 16, 12); } } } static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[32], buf1[64]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; if (col_txfm == NULL || row_txfm == NULL) { av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); return; } int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); for (int i = 0; i < 2; i++) { col_txfm(input + 8 * i, buf0, stride, 12); shift_right_4_round_s16_x8(buf0, buf0, 32); transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i); transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i); transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i); transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i); } for (int i = 0; i < 4; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 16 * i, buf0, 16); row_txfm(buf0, output + 8 * i, 32, 13); } else { int16x8_t *buf = buf1 + 16 * i; row_txfm(buf, output + 8 * i, 32, 13); } } } static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[32], buf1[32]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type]; if (col_txfm == NULL || row_txfm == NULL) { av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); return; } int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); for (int i = 0; i < 4; i++) { col_txfm(input + 8 * i, buf0, stride, 13); shift_right_2_round_s16_x8(buf0, buf0, 8); transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i); } if (lr_flip) { flip_buf_8_neon(buf1, buf0, 32); row_txfm(buf0, output, 8, 12); } else { row_txfm(buf1, output, 8, 12); } } static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[32], buf1[64]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type]; if (col_txfm == NULL || row_txfm == NULL) { av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); return; } int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); for (int i = 0; i < 4; i++) { col_txfm(input + 8 * i, buf0, stride, 13); shift_right_4_round_s16_x8(buf0, buf0, 16); transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i); transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i); } for (int i = 0; i < 2; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 32 * i, buf0, 32); row_txfm(buf0, output + 8 * i, 16, 13); } else { int16x8_t *buf = buf1 + 32 * i; row_txfm(buf, output + 8 * i, 16, 13); } } } static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[32], buf1[128]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; const row_transform_1d_lbd_8_neon row_txfm = 
row_txfm8x32_arr[tx_type]; if (col_txfm == NULL || row_txfm == NULL) { av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); return; } int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); for (int i = 0; i < 4; i++) { col_txfm(input + 8 * i, buf0, stride, 12); shift_right_4_round_s16_x8(buf0, buf0, 32); transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i); transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i); transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i); transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i); } for (int i = 0; i < 4; i++) { if (lr_flip) { flip_buf_8_neon(buf1 + 32 * i, buf0, 32); row_txfm(buf0, output + 8 * i, 32, 12); } else { int16x8_t *buf = buf1 + 32 * i; row_txfm(buf, output + 8 * i, 32, 12); } } } static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); int16x8_t buf0[64], buf1[128]; const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon; const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon; for (int i = 0; i < 8; i++) { load_buffer_s16_x8(input + 8 * i, stride, buf0, 16); shift_left_2_s16_x8(buf0, buf0, 16); col_txfm(buf0, buf0, 13); shift_right_4_round_s16_x8(buf0, buf0, 16); for (int j = 0; j < 2; ++j) { transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); } } for (int i = 0; i < 2; i++) { int16x8_t *buf = buf1 + 64 * i; row_txfm(buf, buf, 12); store_buffer_s16_x8(buf, output + 8 * i, 16, 32); } // Zero out the bottom 16x32 area. memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); } static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); int16x8_t buf0[64], buf1[128]; const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon; for (int i = 0; i < 2; i++) { load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); col_txfm(buf0, buf0, 13); shift_right_2_round_s16_x8(buf0, buf0, 64); for (int j = 0; j < 8; ++j) { transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i); } } for (int i = 0; i < 4; i++) { int16x8_t *buf = buf1 + 16 * i; row_txfm(buf, buf, 12); store_buffer_s16_x8(buf, output + 8 * i, 32, 16); } } static void fdct32_neon(const int32x4_t *input, int32x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = 
vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); int32x4_t buf0[32]; int32x4_t buf1[32]; // stage 1 butterfly_dct_pre_s32_x4(input, buf1, 32); // stage 2 butterfly_dct_pre_s32_x4(buf1, buf0, 16); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27], &buf0[20]); butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26], &buf0[21]); butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25], &buf0[22]); butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24], &buf0[23]); buf0[28] = buf1[28]; buf0[29] = buf1[29]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 3 butterfly_dct_pre_s32_x4(buf0, buf1, 8); buf1[8] = buf0[8]; buf1[9] = buf0[9]; butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13], &buf1[10]); butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12], &buf1[11]); buf1[14] = buf0[14]; buf1[15] = buf0[15]; butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16); // stage 4 butterfly_dct_pre_s32_x4(buf1, buf0, 4); buf0[4] = buf1[4]; butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]); buf0[7] = buf1[7]; butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8); buf0[16] = buf1[16]; buf0[17] = buf1[17]; butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29], &buf0[18]); butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28], &buf0[19]); butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27], &buf0[20]); butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26], &buf0[21]); buf0[22] = buf1[22]; buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 5 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]); butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]); butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4); buf1[8] = buf0[8]; butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14], &buf1[9]); butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13], &buf1[10]); buf1[11] = buf0[11]; buf1[12] = buf0[12]; buf1[15] = buf0[15]; butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8); butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8); // stage 6 buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]); butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]); butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4); butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4); buf0[16] = buf1[16]; butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30], &buf0[17]); butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29], &buf0[18]); buf0[19] = buf1[19]; buf0[20] = buf1[20]; butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26], &buf0[21]); butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25], &buf0[22]); buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[27] = buf1[27]; buf0[28] = buf1[28]; buf0[31] = buf1[31]; // stage 7 buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; buf1[4] = buf0[4]; buf1[5] 
= buf0[5]; buf1[6] = buf0[6]; buf1[7] = buf0[7]; butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8], &buf1[15]); butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9], &buf1[14]); butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10], &buf1[13]); butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11], &buf1[12]); butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4); butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4); butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4); butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4); // stage 8 buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16], &buf0[31]); butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17], &buf0[30]); butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18], &buf0[29]); butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19], &buf0[28]); butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20], &buf0[27]); butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21], &buf0[26]); butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22], &buf0[25]); butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23], &buf0[24]); // stage 9 output[0] = buf0[0]; output[1] = buf0[16]; output[2] = buf0[8]; output[3] = buf0[24]; output[4] = buf0[4]; output[5] = buf0[20]; output[6] = buf0[12]; output[7] = buf0[28]; output[8] = buf0[2]; output[9] = buf0[18]; output[10] = buf0[10]; output[11] = buf0[26]; output[12] = buf0[6]; output[13] = buf0[22]; output[14] = buf0[14]; output[15] = buf0[30]; output[16] = buf0[1]; output[17] = buf0[17]; output[18] = buf0[9]; output[19] = buf0[25]; output[20] = buf0[5]; output[21] = buf0[21]; output[22] = buf0[13]; output[23] = buf0[29]; output[24] = buf0[3]; output[25] = buf0[19]; output[26] = buf0[11]; output[27] = buf0[27]; output[28] = buf0[7]; output[29] = buf0[23]; output[30] = buf0[15]; output[31] = buf0[31]; } static void fdct64_neon(const int32x4_t *input, int32x4_t *output, int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); const int16x4_t cospi32 = vget_low_s16(cospi32_16); const int16x4_t cospi16 = vget_high_s16(cospi32_16); const int16x4_t cospi8 = vget_low_s16(cospi8_24); const int16x4_t cospi24 = 
vget_high_s16(cospi8_24); const int16x4_t cospi4 = vget_low_s16(cospi4_12); const int16x4_t cospi12 = vget_high_s16(cospi4_12); const int16x4_t cospi20 = vget_low_s16(cospi20_28); const int16x4_t cospi28 = vget_high_s16(cospi20_28); const int16x4_t cospi2 = vget_low_s16(cospi2_6); const int16x4_t cospi6 = vget_high_s16(cospi2_6); const int16x4_t cospi10 = vget_low_s16(cospi10_14); const int16x4_t cospi14 = vget_high_s16(cospi10_14); const int16x4_t cospi18 = vget_low_s16(cospi18_22); const int16x4_t cospi22 = vget_high_s16(cospi18_22); const int16x4_t cospi26 = vget_low_s16(cospi26_30); const int16x4_t cospi30 = vget_high_s16(cospi26_30); const int16x4_t cospi1 = vget_low_s16(cospi1_3); const int16x4_t cospi3 = vget_high_s16(cospi1_3); const int16x4_t cospi5 = vget_low_s16(cospi5_7); const int16x4_t cospi7 = vget_high_s16(cospi5_7); const int16x4_t cospi9 = vget_low_s16(cospi9_11); const int16x4_t cospi11 = vget_high_s16(cospi9_11); const int16x4_t cospi13 = vget_low_s16(cospi13_15); const int16x4_t cospi15 = vget_high_s16(cospi13_15); const int16x4_t cospi17 = vget_low_s16(cospi17_19); const int16x4_t cospi19 = vget_high_s16(cospi17_19); const int16x4_t cospi21 = vget_low_s16(cospi21_23); const int16x4_t cospi23 = vget_high_s16(cospi21_23); const int16x4_t cospi25 = vget_low_s16(cospi25_27); const int16x4_t cospi27 = vget_high_s16(cospi25_27); const int16x4_t cospi29 = vget_low_s16(cospi29_31); const int16x4_t cospi31 = vget_high_s16(cospi29_31); // stage 1 int32x4_t x1[64]; butterfly_dct_pre_s32_x4(input, x1, 64); // stage 2 int32x4_t x2[64]; butterfly_dct_pre_s32_x4(x1, x2, 32); butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); // stage 3 int32x4_t x3[64]; butterfly_dct_pre_s32_x4(x2, x3, 16); butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32); // stage 4 int32x4_t x4[64]; butterfly_dct_pre_s32_x4(x3, x4, 8); butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16); butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); // stage 5 int32x4_t x5[64]; 
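// This 32-bit fdct64_neon repeats the lattice of the int16 fdct8x64_neon
// above; it serves the 64-point row passes of the 64x64 and 64x32 blocks,
// where the extra headroom of int32 intermediates is needed after the column
// pass and rounding shifts.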
butterfly_dct_pre_s32_x4(x4, x5, 4); butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8); butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16); butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16); // stage 6 int32x4_t x6[64]; butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]); butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4); butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8); butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8); butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); // stage 7 int32x4_t x7[64]; butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4); butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4); butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8); butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8); butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8); butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8); // stage 8 int32x4_t x8[64]; butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4); butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4); butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4); butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4); butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); 
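// Of the x8[8..15] terms formed at the start of stage 8, the even-indexed
// ones are final: stage 11 stores x8[8], x8[12], x8[10] and x8[14] at output
// indices 4, 12, 20 and 28. Their odd-indexed siblings would land at indices
// 36..60, which are dropped because only the first 32 coefficients are kept.
// The rotations below keep refining x[32..63], which becomes the odd-indexed
// coefficients via stage 10.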
butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); // stage 9 int32x4_t x9[64]; butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4); butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4); butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4); butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4); butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4); butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4); butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4); butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4); // stage 10 int32x4_t x10[64]; butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]); butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]); butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]); butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]); butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]); butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]); butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]); butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]); butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]); butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]); butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]); butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]); butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]); butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]); butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]); butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]); // stage 11, only store into the low 32 output indices. 
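// Only the first 32 of the 64 forward-DCT outputs are retained for AV1's
// 64-point transforms, so stage 11 writes output[0..31] in natural
// coefficient order, picking each value from whichever stage produced it (x6
// for the coarsest terms, x7/x8/x9 for the intermediate ones, x10 for the odd
// coefficients rotated in stage 10).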
output[0] = x6[0]; output[1] = x10[32]; output[2] = x9[16]; output[3] = x10[48]; output[4] = x8[8]; output[5] = x10[40]; output[6] = x9[24]; output[7] = x10[56]; output[8] = x7[4]; output[9] = x10[36]; output[10] = x9[20]; output[11] = x10[52]; output[12] = x8[12]; output[13] = x10[44]; output[14] = x9[28]; output[15] = x10[60]; output[16] = x6[2]; output[17] = x10[34]; output[18] = x9[18]; output[19] = x10[50]; output[20] = x8[10]; output[21] = x10[42]; output[22] = x9[26]; output[23] = x10[58]; output[24] = x7[6]; output[25] = x10[38]; output[26] = x9[22]; output[27] = x10[54]; output[28] = x8[14]; output[29] = x10[46]; output[30] = x9[30]; output[31] = x10[62]; } static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); int16x8_t buf0[64], buf1[512]; const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; for (int i = 0; i < 8; i++) { load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); col_txfm(buf0, buf0, 13); shift_right_2_round_s16_x8(buf0, buf0, 64); for (int j = 0; j < 4; ++j) { transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); } } for (int i = 0; i < 4; i++) { int32x4_t bufA[64]; int32x4_t bufB[64]; int16x8_t *buf = buf1 + 64 * i; for (int j = 0; j < 64; ++j) { bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } fdct64_neon(bufA, bufA, 10); fdct64_neon(bufB, bufB, 10); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; int16x8_t buf0[64], buf1[256]; const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; for (int i = 0; i < 8; i++) { col_txfm(input + 8 * i, buf0, stride, 12); shift_right_4_round_s16_x8(buf0, buf0, 32); for (int j = 0; j < 4; ++j) { transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); } } assert(tx_type == DCT_DCT); for (int i = 0; i < 4; i++) { int32x4_t bufA[64]; int32x4_t bufB[64]; int16x8_t *buf = buf1 + 64 * i; for (int j = 0; j < 64; ++j) { bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } fdct64_neon(bufA, bufA, 11); fdct64_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); int16x8_t buf0[64], buf1[256]; const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; for (int i = 0; i < 4; i++) { load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); col_txfm(buf0, buf0, 13); shift_right_2_round_s16_x8(buf0, buf0, 64); for (int j = 0; j < 4; ++j) { transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i); } } for (int i = 0; i < 4; i++) { int32x4_t bufA[32]; int32x4_t bufB[32]; int16x8_t *buf = buf1 + 32 * i; for (int j = 0; j < 32; ++j) { bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } fdct32_neon(bufA, bufA, 11); fdct32_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); 
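// Transform sizes with a 2:1 aspect ratio need an extra sqrt(2)
// normalization to keep the 2-D gain consistent with the square sizes;
// round_shift_sqrt2_s32_s32_4xn_neon below folds that factor into a
// fixed-point multiply with rounding.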
round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); } } static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = { lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform }; void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size]; if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); } else { fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } } aom-3.12.1/av1/encoder/arm/av1_highbd_quantize_neon.c000066400000000000000000000142231477627663500223670ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" #include "av1/common/quant_common.h" #include "av1/encoder/av1_quantize.h" static inline uint16x4_t quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, int32x4_t v_dequant_s32, int32x4_t v_round_s32, int log_scale) { const int32x4_t v_coeff = vld1q_s32(coeff_ptr); const int32x4_t v_coeff_sign = vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); const int32x4_t v_log_scale = vdupq_n_s32(log_scale); const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) const int32x4_t v_abs_coeff_scaled = vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale)); const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); // const int64_t tmp = vmask ? 
(int64_t)abs_coeff + log_scaled_round : 0 const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), vreinterpretq_s32_u32(v_mask)); // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); const int32x4_t v_abs_qcoeff = vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32); // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const int32x4_t v_qcoeff = vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); // vshlq_s32 will shift right if shift value is negative. const int32x4_t v_abs_dqcoeff = vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale)); // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); const int32x4_t v_dqcoeff = vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); vst1q_s32(qcoeff_ptr, v_qcoeff); vst1q_s32(dqcoeff_ptr, v_dqcoeff); // Used to find eob. const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0)); return vmovn_u32(nz_qcoeff_mask); } static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); return vmaxq_s16(v_eobmax, v_nz_iscan); } static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); #endif } void av1_highbd_quantize_fp_neon( const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { (void)scan; (void)zbin_ptr; (void)quant_shift_ptr; const int16x4_t v_quant = vld1_s16(quant_ptr); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); const int16x4_t v_round_no_scale = vld1_s16(round_ptr); const int16x4_t v_round_log_scale = vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); const int16x4_t v_round = vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); // DC and first 3 AC v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, log_scale); // overwrite the DC constants with AC constants v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); // 4 more AC v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, 
v_round_s32, log_scale); // Find the max lane eob for the first 8 coeffs. v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); count -= 8; do { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, v_round_s32, log_scale); v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, v_quant_s32, v_dequant_s32, v_round_s32, log_scale); // Find the max lane eob for 8 coeffs. v_eobmax = get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); count -= 8; } while (count); *eob_ptr = get_max_eob(v_eobmax); } aom-3.12.1/av1/encoder/arm/av1_k_means_neon.c000066400000000000000000000100541477627663500206350ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/arm/sum_neon.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" static int32x4_t k_means_multiply_add_neon(const int16x8_t a) { const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a)); const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a)); #if AOM_ARCH_AARCH64 return vpaddq_s32(l, h); #else const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l)); const int32x2_t dh = vpadd_s32(vget_low_s32(h), vget_high_s32(h)); return vcombine_s32(dl, dh); #endif } void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { int64x2_t sum = vdupq_n_s64(0); int16x8_t cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { cents[j] = vdupq_n_s16(centroids[j]); } for (int i = 0; i < n; i += 8) { const int16x8_t in = vld1q_s16(data); uint16x8_t ind = vdupq_n_u16(0); // Compute the distance to the first centroid. int16x8_t dist_min = vabdq_s16(in, cents[0]); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. const int16x8_t dist = vabdq_s16(in, cents[j]); // Compare to the minimal one. const uint16x8_t cmp = vcgtq_s16(dist_min, dist); dist_min = vminq_s16(dist_min, dist); const uint16x8_t ind1 = vdupq_n_u16(j); ind = vbslq_u16(cmp, ind1, ind); } if (total_dist) { // Square, convert to 32 bit and add together. const int32x4_t l = vmull_s16(vget_low_s16(dist_min), vget_low_s16(dist_min)); const int32x4_t sum32_tmp = vmlal_s16(l, vget_high_s16(dist_min), vget_high_s16(dist_min)); // Pairwise sum, convert to 64 bit and add to sum. 
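// (For 1-D data the ordering of centroids under |x - c| matches the
// ordering under (x - c)^2, so the squared distances are only needed for
// this optional total_dist accumulation, not for picking the indices.)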
sum = vpadalq_s32(sum, sum32_tmp); } vst1_u8(indices, vmovn_u16(ind)); indices += 8; data += 8; } if (total_dist) { *total_dist = horizontal_add_s64x2(sum); } } void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { int64x2_t sum = vdupq_n_s64(0); uint32x4_t ind[2]; int16x8_t cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; const int16_t cxcy[8] = { cx, cy, cx, cy, cx, cy, cx, cy }; cents[j] = vld1q_s16(cxcy); } for (int i = 0; i < n; i += 8) { for (int l = 0; l < 2; ++l) { const int16x8_t in = vld1q_s16(data); ind[l] = vdupq_n_u32(0); // Compute the distance to the first centroid. int16x8_t d1 = vsubq_s16(in, cents[0]); int32x4_t dist_min = k_means_multiply_add_neon(d1); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. d1 = vsubq_s16(in, cents[j]); const int32x4_t dist = k_means_multiply_add_neon(d1); // Compare to the minimal one. const uint32x4_t cmp = vcgtq_s32(dist_min, dist); dist_min = vminq_s32(dist_min, dist); const uint32x4_t ind1 = vdupq_n_u32(j); ind[l] = vbslq_u32(cmp, ind1, ind[l]); } if (total_dist) { // Pairwise sum, convert to 64 bit and add to sum. sum = vpadalq_s32(sum, dist_min); } data += 8; } // Cast to 8 bit and store. vst1_u8(indices, vmovn_u16(vcombine_u16(vmovn_u32(ind[0]), vmovn_u32(ind[1])))); indices += 8; } if (total_dist) { *total_dist = horizontal_add_s64x2(sum); } } aom-3.12.1/av1/encoder/arm/av1_temporal_denoiser_neon.c000066400000000000000000000352001477627663500227330ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_ports/mem.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/reconinter.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/av1_temporal_denoiser.h" // Compute the sum of all pixel differences of this MB. static inline int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { #if AOM_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), vget_low_s64(fedcba98_76543210)); const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); return sum_diff; #endif } // Denoise a 16x1 vector. 
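// Per pixel the filter is, in pseudo-code:
//   diff = mc_running_avg_y[i] - sig[i];
//   adj  = abs(diff) < t1 ? abs(diff)
//                         : base + (abs(diff) >= 8) + 2 * (abs(diff) >= 16);
//   running_avg_y[i] = sig[i] + (diff > 0 ? adj : -adj);  // saturating u8
// where t1 is the level-1 threshold (4, +1 for strong denoising) and base is
// the level-1 adjustment (4 plus the same increment when the motion
// magnitude is small, 3 otherwise). The signed per-pixel adjustments are
// accumulated so the caller can decide whether to keep the filtered block.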
static inline int8x16_t denoiser_16x1_neon( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, const uint8x16_t v_delta_level_1_and_2, const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); /* Calculate absolute difference and sign masks. */ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); /* Figure out which level that put us in. */ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); /* Calculate absolute adjustments for level 1, 2 and 3. */ const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, v_delta_level_1_and_2); const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, v_delta_level_2_and_3); const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, v_level2_adjustment); const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); /* Figure adjustment absolute value by selecting between the absolute * difference if in level0 or the value for level 1, 2 and 3. */ const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); /* Calculate positive and negative adjustments. Apply them to the signal * and accumulate them. Adjustments are less than eight and the maximum * sum of them (7 * 16) can fit in a signed char. */ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, v_abs_adjustment); const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, v_abs_adjustment); uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); /* Store results. */ vst1q_u8(running_avg_y, v_running_avg_y); /* Sum all the accumulators to have the sum of all pixel differences * for this macroblock. */ { const int8x16_t v_sum_diff = vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), vreinterpretq_s8_u8(v_neg_adjustment)); v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); } return v_sum_diff_total; } static inline int8x16_t denoiser_adjust_16x1_neon( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); /* Calculate absolute difference and sign masks. */ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); // Clamp absolute difference to delta to get the adjustment. 
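// (This is the fall-back pass: when the first pass over-adjusted the block,
// i.e. abs(sum_diff) exceeded the threshold, the running average is nudged
// back toward the source by at most delta per pixel so that some weaker
// temporal filtering can still be applied instead of copying the block
// unmodified.)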
const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, v_abs_adjustment); const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, v_abs_adjustment); v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); /* Store results. */ vst1q_u8(running_avg_y, v_running_avg_y); { const int8x16_t v_sum_diff = vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), vreinterpretq_s8_u8(v_pos_adjustment)); v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); } return v_sum_diff_total; } // Denoise 8x8 and 8x16 blocks. static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) { int sum_diff_thresh, r, sum_diff = 0; const int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; const uint8x16_t v_level1_adjustment = vmovq_n_u8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); const uint8x16_t v_level2_threshold = vdupq_n_u8(8); const uint8x16_t v_level3_threshold = vdupq_n_u8(16); const int b_height = block_size_high[bs] >> 1; int8x16_t v_sum_diff_total = vdupq_n_s8(0); for (r = 0; r < b_height; ++r) { memcpy(sig_buffer[r], sig, width); memcpy(sig_buffer[r] + width, sig + sig_stride, width); memcpy(mc_running_buffer[r], mc_running_avg_y, width); memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, width); memcpy(running_buffer[r], running_avg_y, width); memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); v_sum_diff_total = denoiser_16x1_neon( sig_buffer[r], mc_running_buffer[r], running_buffer[r], v_level1_threshold, v_level2_threshold, v_level3_threshold, v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, v_sum_diff_total); { const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); vst1_u8(running_avg_y, v_running_buffer_low); vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); } // Update pointers for next iteration. sig += (sig_stride << 1); mc_running_avg_y += (mc_avg_y_stride << 1); running_avg_y += (avg_y_stride << 1); } { sum_diff = horizontal_add_s8x16(v_sum_diff_total); sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. The adjustment is capped by a maximum delta, and // chosen such that in most cases the resulting sum_diff will be // within the acceptable range given by sum_diff_thresh. // The delta is set by the excess of absolute pixel diff over the // threshold. const int delta = ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. 
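// E.g. for an 8x16 block (128 pixels, num_pels_log2 = 7): an excess of 300
// over the threshold gives delta = (300 >> 7) + 1 = 3 and the weaker
// adjustment below is attempted, while an excess of 400 gives delta = 4 and
// the block is copied unmodified instead.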
if (delta < 4) { const uint8x16_t k_delta = vmovq_n_u8(delta); running_avg_y -= avg_y_stride * (b_height << 1); for (r = 0; r < b_height; ++r) { v_sum_diff_total = denoiser_adjust_16x1_neon( sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, v_sum_diff_total); { const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); vst1_u8(running_avg_y, v_running_buffer_low); vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); } // Update pointers for next iteration. running_avg_y += (avg_y_stride << 1); } sum_diff = horizontal_add_s8x16(v_sum_diff_total); if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } } return FILTER_BLOCK; } // Denoise 16x16, to 128x128 blocks. static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { const int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; const uint8x16_t v_level1_adjustment = vmovq_n_u8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); const uint8x16_t v_level2_threshold = vdupq_n_u8(8); const uint8x16_t v_level3_threshold = vdupq_n_u8(16); const int b_width = block_size_wide[bs]; const int b_height = block_size_high[bs]; const int b_width_shift4 = b_width >> 4; int8x16_t v_sum_diff_total[8][8]; int r, c, sum_diff = 0; for (r = 0; r < 8; ++r) { for (c = 0; c < b_width_shift4; ++c) { v_sum_diff_total[c][r] = vdupq_n_s8(0); } } for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( sig, mc_running_avg_y, running_avg_y, v_level1_threshold, v_level2_threshold, v_level3_threshold, v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, v_sum_diff_total[c][r >> 4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); } } // Update pointers for next iteration. sig = sig - b_width + sig_stride; mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; running_avg_y = running_avg_y - b_width + avg_y_stride; } { const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { const int delta = ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const uint8x16_t k_delta = vdupq_n_u8(delta); sig -= sig_stride * b_height; mc_running_avg_y -= mc_avg_y_stride * b_height; running_avg_y -= avg_y_stride * b_height; sum_diff = 0; for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { v_sum_diff_total[c][r >> 4] = denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, k_delta, v_sum_diff_total[c][r >> 4]); // Update pointers for next iteration. 
sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); } } sig = sig - b_width + sig_stride; mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; running_avg_y = running_avg_y - b_width + avg_y_stride; } if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } } return FILTER_BLOCK; } int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { // Rank by frequency of the block type to have an early termination. if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 || bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || bs == BLOCK_32X64 || bs == BLOCK_64X32) { return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, avg_stride, increase_denoising, bs, motion_magnitude); } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, avg_stride, increase_denoising, bs, motion_magnitude, 8); } return COPY_BLOCK; } aom-3.12.1/av1/encoder/arm/cnn_neon.c000066400000000000000000002127701477627663500172400ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/sum_neon.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/cnn.h" #include "av1/encoder/partition_cnn_weights.h" // The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are // declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in // partition_cnn_weights.h. However, to enable linear memory access, rearrange // the weight tables here. 
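// Keeping each layer's taps contiguous in the order the convolution code
// consumes them lets the NEON kernels fetch weights with plain sequential
// loads rather than strided gathers from the per-kernel tables.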
static const float weights_layer_1[] = { 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f, -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f, 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f, -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f, 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f, 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f, 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f, -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f, -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f, 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f, -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f, 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f, 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f, -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f, 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f, -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f, -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f, -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f, -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f, -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f, 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f, 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f, -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f, -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f, -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f, 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f, 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f, 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f, 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f, 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f, 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f, 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f, 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f, 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f, 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f, 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f, 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f, 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f, 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f, 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f, 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f, 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f, -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f, 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f, 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f, 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f, 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f, 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f, 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f, -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f, 0.006872f, -0.041453f, 0.007845f, 0.007463f, 
0.467299f, -0.476372f, -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f, -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f, 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f, 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f, -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f, -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f, -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f, 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f, -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f, 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f, -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f, 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f, -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f, 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f, -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f, 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f, -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f, -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f, 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f, 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f, 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f, 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f, -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f, 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f, 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f, 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f, -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f, 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f, 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f, 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f, 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f, 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f, 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f, -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f, 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f, 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f, 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f, 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f, -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f, -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f, 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f, -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f, -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f, 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f, 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f, 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f, 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f, 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f, -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f, 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f, -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f, -0.088713f, 0.105729f, -0.027871f, 
-0.095047f, 0.012429f, 0.014244f, -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f, 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f, 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f, 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f, -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f, -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f, -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f, 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f, 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f, -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f, -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f, 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f, 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f, 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f, 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f, 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f, -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f, 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f, -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f, 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f, -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f, -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f, 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f, -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f, 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f, -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f, -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f, -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f, -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f, -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f, 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f, -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f, 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f, -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f, 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f, -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f, -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f, -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f, -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f, -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f, 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f, -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f, 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f, -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f, -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f, 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f, 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f, -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f, -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f, 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f, -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f, -0.827838f, 
-1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f, 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f, -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f, -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f, -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f, 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f, -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f, -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f, 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f, 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f, -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f, -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f, -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f, -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f, -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f, 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f, -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f, 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f, -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f, 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f, -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f, 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f, -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f, 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f, 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f, 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f, 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f, 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f, 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f, 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f, 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f, 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f, 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f, -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f, 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f, 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f, -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f, 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f, 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f, -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f, -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f, -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f, -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f, 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f, -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f, 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f, 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f, 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f, 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f, 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f, -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f, 0.595613f, -0.449278f, -0.669756f, 0.941452f, 
0.356897f, -0.723720f, -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f, 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f, 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f, 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f, 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f, -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f, -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f, -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f, -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f, -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f, 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f, 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f, -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f, 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f, -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f, -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f, 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f, 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f, -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f, 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f, -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f, 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f, -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f, 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f, 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f, 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f, 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f, 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f, 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f, 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f, 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f, -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f, -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f, -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f, -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f, 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f, -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f, -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f, 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f, -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f, 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f, 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f, -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f, 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f, 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f, -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f, 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f, -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f, 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f, -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f, 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f, -0.104817f, 
-0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f, -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f, -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f, -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f, -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f, 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f, 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f, -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f, 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f, 0.018088f, 0.115791f, -0.079165f, 0.139388f, }; static const float weights_layer_2[] = { 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f, 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f, 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f, -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f, -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f, -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f, 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f, -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f, 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f, 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f, 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f, -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f, 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f, 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f, 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f, -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f, -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f, 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f, 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f, 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f, 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f, 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f, 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f, -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f, 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f, 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f, 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f, 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f, -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f, 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f, 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f, -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f, -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f, 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f, -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f, -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f, 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f, 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f, 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f, -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f, -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f, 0.121601f, -0.016727f, 0.573853f, 
-0.080196f, 0.193053f, 0.053012f, -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f, 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f, 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f, -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f, -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f, -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f, -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f, -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f, -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f, -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f, -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f, -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f, 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f, 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f, 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f, -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f, 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f, -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f, 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f, -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f, 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f, -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f, -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f, -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f, 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f, -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f, 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f, -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f, -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f, -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f, -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f, -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f, -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f, -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f, 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f, -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f, -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f, -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f, 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f, 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f, 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f, 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f, 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f, -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f, 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f, 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f, 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f, 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f, 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f, 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f, 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f, 
-0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f, -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f, -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f, 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f, -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f, 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f, -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f, -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f, -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f, 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f, -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f, -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f, 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f, -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f, -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f, 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f, -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f, -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f, 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f, -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f, -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f, -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f, -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f, 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f, -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f, 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f, -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f, -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f, -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f, -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f, -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f, 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f, -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f, -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f, 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f, -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f, -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f, -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f, -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f, -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f, 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f, 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f, -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f, -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f, -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f, 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f, -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f, 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f, -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f, 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f, -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f, 
0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f, 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f, 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f, 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f, -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f, 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f, -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f, -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f, -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f, 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f, -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f, 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f, -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f, 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f, 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f, 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f, 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f, 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f, 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f, 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f, -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f, -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f, 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f, -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f, 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f, 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f, 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f, -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f, 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f, 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f, -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f, 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f, 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f, -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f, -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f, -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f, 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f, -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f, 0.089161f, 0.355811f, -0.078245f, -0.148490f, -0.407301f, -1.296870f, -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f, 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f, -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f, -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f, 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f, 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f, 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f, 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f, -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f, 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f, 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f, -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f, 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, 
-0.204272f, -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f, 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f, -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f, 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f, 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f, 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f, 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f, -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f, 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f, -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f, 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f, 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f, 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f, -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f, 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f, -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f, 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f, -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f, -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f, 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f, -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f, -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f, 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f, -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f, 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f, 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f, -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f, 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f, -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f, 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f, -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f, 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f, -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f, 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f, -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f, 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f, -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f, 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f, -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f, -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f, -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f, 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f, 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f, 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f, -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f, -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f, -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f, -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f, -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f, 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f, -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f, -0.135690f, -0.001989f, -0.227574f, 
-0.132799f, -0.359742f, -0.137197f, 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f, 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f, -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f, 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f, -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f, -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f, -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f, 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f, 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f, 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f, 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f, 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f, 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f, 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f, 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f, 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f, -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f, -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f, -0.212384f, -0.229157f, -0.283428f, -0.184891f, }; static const float weights_layer_3[] = { -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f, 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f, -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f, 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f, -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f, -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f, -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f, -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f, -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f, 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f, -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f, -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f, 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f, -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f, 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f, -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f, 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f, -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f, -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f, 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f, 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f, 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f, 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f, -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f, -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f, -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f, -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f, -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f, -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f, 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f, 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f, 0.115446f, 0.155673f, -0.044648f, 
-0.027455f, -0.015473f, -0.026657f, 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f, -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f, 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f, -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f, 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f, 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f, 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f, -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f, -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f, 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f, 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f, -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f, 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f, -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f, -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f, 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f, 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f, 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f, -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f, -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f, -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f, -0.071383f, -0.075005f, }; static const float weights_layer_4[] = { -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f, -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f, 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f, -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f, -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f, -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f, -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f, -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f, -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f, -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f, 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f, -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f, -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f, -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f, -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f, -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f, -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f, -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f, 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f, 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f, -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f, -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f, -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f, -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f, -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f, -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f, 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f, 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f, 0.497701f, -0.448835f, -0.245079f, -0.014336f, 
-0.174907f, -0.409633f, 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f, 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f, -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f, 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f, 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f, -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f, -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f, 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f, 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f, -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f, 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f, -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f, 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f, -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f, 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f, 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f, 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f, -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f, -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f, -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f, 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f, -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f, -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f, 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f, -0.421885f, -0.293573f, }; static const float weights_layer_5[] = { 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f, 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f, 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f, 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f, -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f, 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f, -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f, 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f, 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f, -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f, -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f, 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f, 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f, -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f, -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f, 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f, -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f, -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f, 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f, 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f, -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f, -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f, -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f, 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f, 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f, -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f, 
0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f, -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f, -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f, -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f, -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f, -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f, -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f, -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f, -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f, 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f, -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f, -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f, -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f, 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f, -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f, 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f, -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f, -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f, 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f, 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f, 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f, -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f, -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f, 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f, 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f, 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f, 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f, 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f, 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f, -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f, 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f, 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f, -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f, -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f, -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f, -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f, 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f, 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f, 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f, -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f, -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f, -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f, 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f, 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f, -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f, 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f, 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f, -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f, -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f, 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f, 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f, -0.305035f, -0.243600f, -0.020588f, 
-0.326324f, -0.417534f, -0.425868f, -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f, -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f, -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f, -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f, -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f, 0.565984f, 0.592690f, }; static inline float32x4_t add_f32x4_x4(const float32x4_t a[4]) { float32x4_t sum01 = vaddq_f32(a[0], a[1]); float32x4_t sum23 = vaddq_f32(a[2], a[3]); return vaddq_f32(sum01, sum23); } static inline void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, const int out_channels, float **output, int out_stride, int start_idx, const float *weights) { assert(filter_height == 2 && filter_width == 2); assert(skip_width == 2 && skip_height == 2); assert(in_width >= 16); const int in_size = in_height * in_width; do { const float32x4_t bias_v = vdupq_n_f32(bias[0]); const float *weight_ptr0 = weights; const float *in_ptr0 = *input; float *out_ptr0 = *output; int h = 0; do { const float *in_ptr1 = in_ptr0; float *out_ptr1 = out_ptr0; int w = 0; do { const float *weight_ptr1 = weight_ptr0; const float *in_ptr2 = in_ptr1; int k = 0; float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; do { const float32x4_t weights0 = vld1q_f32(weight_ptr1); const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); const float32x2_t weights0_lo = vget_low_f32(weights0); const float32x2_t weights0_hi = vget_high_f32(weights0); const float32x2_t weights1_lo = vget_low_f32(weights1); const float32x2_t weights1_hi = vget_high_f32(weights1); const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2); const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride); const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size); const float32x4x2_t in1_hi_0 = vld2q_f32(in_ptr2 + in_size + in_stride); sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0); sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1); sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0); sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1); sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0); sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1); sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0); sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1); const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8); const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8); const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8); const float32x4x2_t in1_hi_1 = vld2q_f32(in_ptr2 + in_size + in_stride + 8); sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0); sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1); sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0); sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1); sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0); sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1); sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0); sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], 
weights1_hi, 1); weight_ptr1 += 8; in_ptr2 += 2 * in_size; k += 2; } while (k < in_channels); vst1q_f32(out_ptr1, add_f32x4_x4(sum0)); vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1)); out_ptr1 += 8; in_ptr1 += 8 * skip_width; w += 8 * skip_width; } while (w < in_width - filter_width + 1); out_ptr0 += out_stride; in_ptr0 += skip_height * in_stride; h += skip_height; } while (h < in_height - filter_height + 1); ++bias; ++output; weights += in_channels * filter_height * filter_width; } while (++start_idx < out_channels); } static inline void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, const int out_channels, float **output, int out_stride, int start_idx, const float *weights) { assert(filter_height == 2 && filter_width == 2); assert(skip_width == 2 && skip_height == 2); assert(in_width == 8); const int in_size = in_height * in_width; do { const float32x4_t bias_v = vdupq_n_f32(*bias); const float *weight_ptr0 = weights; const float *in_ptr0 = *input; float *out_ptr0 = *output; int h = 0; do { const float *in_ptr1 = in_ptr0; float *out_ptr1 = out_ptr0; int w = 0; do { const float *weight_ptr1 = weight_ptr0; const float *in_ptr2 = in_ptr1; int k = 0; float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; do { const float32x4_t weights0 = vld1q_f32(weight_ptr1); const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); const float32x2_t weights0_lo = vget_low_f32(weights0); const float32x2_t weights0_hi = vget_high_f32(weights0); const float32x2_t weights1_lo = vget_low_f32(weights1); const float32x2_t weights1_hi = vget_high_f32(weights1); const float32x4x2_t in0_lo = vld2q_f32(in_ptr2); const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride); const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size); const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride); sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0); sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1); sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1); sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0); sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1); sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0); sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1); weight_ptr1 += 8; in_ptr2 += 2 * in_size; k += 2; } while (k < in_channels); vst1q_f32(out_ptr1, add_f32x4_x4(sum)); out_ptr1 += 4; in_ptr1 += 4 * skip_width; w += 4 * skip_width; } while (w < in_width - filter_width + 1); out_ptr0 += out_stride; in_ptr0 += skip_height * in_stride; h += skip_height; } while (h < in_height - filter_height + 1); ++bias; ++output; weights += in_channels * filter_height * filter_width; } while (++start_idx < out_channels); } static inline void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( const float **input, int in_width, int in_height, int in_stride, const float *bias, const int skip_width, const int skip_height, const int filter_width, const int filter_height, const int in_channels, const int out_channels, float **output, int out_stride, int start_idx, const float *weights) { assert(filter_height == 5 && filter_width == 5); assert(skip_width == 4 && skip_height == 4); assert(in_width >= 16); assert(in_channels == 1); (void)in_channels; do { const float32x4_t 
bias_v = vdupq_n_f32(*bias); const float *in_ptr0 = *input; const float *weights_ptr0 = weights; float *out_ptr0 = *output; int h = 0; do { const float *in_ptr1 = in_ptr0; float *out_ptr1 = out_ptr0; int w = 0; do { float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) }; const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0); const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4); const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8); const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12); const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16); const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20); const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3); const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3); const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7); const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7); const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11); const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11); const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15); const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15); const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19); const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19); const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23); const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23); const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride); const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride); const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride); const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride); const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride); const float32x4_t in0_4 = vextq_f32( in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1); const float32x4_t in1_4 = vextq_f32( in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1); const float32x4_t in2_4 = vextq_f32( in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1); const float32x4_t in3_4 = vextq_f32( in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1); const float32x4_t in4_4 = vextq_f32( in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1); // Kernel row 0. sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0); sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1); sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1); sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0); // Kernel row 1. sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1); sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1); sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0); sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1); // Kernel row 2. sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1); sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0); sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1); sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0); // Kernel row 3. sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1); sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0); sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1); sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1); // Kernel row 4. 
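// The 5x5 kernel has 25 taps, so its last weight (index 24) does not fit in
// the six float32x4_t weight vectors loaded above. It is applied after the
// four lane multiplies below by broadcasting *(weights_ptr0 + 24) with
// vdupq_n_f32() against in4_4, the shifted input column built earlier with
// vextq_f32(). The partial accumulators sum[0] and sum[1] are then added
// together just before the store.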
sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0); sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1); sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0); sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1); sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4); vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1])); out_ptr1 += 4; in_ptr1 += 4 * skip_width; w += 4 * skip_width; } while (w < in_width - filter_width + 1); out_ptr0 += out_stride; in_ptr0 += skip_height * in_stride; h += skip_height; } while (h < in_height - filter_height + 1); ++output; ++bias; weights += 25; } while (++start_idx < out_channels); } // Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). // As per the current encoder, av1_cnn_convolve function gets called for // block size equal to 64x64. av1_cnn_convolve() uses layer config values // set by av1_intra_mode_cnn_partition_cnn_config. The following are a few // details related to each layer's config parameters. // Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht // 0 64x64 16x16 5 5 4 4 // 1 16x16 8x8 2 2 2 2 // 2 8x8 4x4 2 2 2 2 // 3 4x4 2x2 2 2 2 2 // 4 2x2 1x1 2 2 2 2 // Here, // filter_wd = filter_width and filter_ht = filter_height, // skip_wd = skip_width and skip_ht = skip_height. void av1_cnn_convolve_no_maxpool_padding_valid_neon( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step) { assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || !layer_config->maxpool); assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); assert(layer_config->pad == PADDING_VALID); assert(channel_step == 1); assert(cstep == layer_config->in_channels * layer_config->out_channels); if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && layer_config->skip_width == 4 && layer_config->skip_height == 4) { av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( input, in_width, in_height, in_stride, layer_config->bias, layer_config->skip_width, layer_config->skip_height, layer_config->filter_width, layer_config->filter_height, layer_config->in_channels, layer_config->out_channels, output, out_stride, start_idx, weights_layer_5); } else if (layer_config->filter_width == 2 && layer_config->filter_height == 2 && layer_config->skip_width == 2 && layer_config->skip_height == 2) { const float *weights = weights_layer_1; if (layer_config->output_num == av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) { weights = weights_layer_2; } else if ((layer_config->output_num == av1_intra_mode_cnn_partition_cnn_config.layer_config[3] .output_num)) { weights = weights_layer_3; } else if ((layer_config->output_num == av1_intra_mode_cnn_partition_cnn_config.layer_config[4] .output_num)) { weights = weights_layer_4; } if (in_width >= 16) { av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( input, in_width, in_height, in_stride, layer_config->bias, layer_config->skip_width, layer_config->skip_height, layer_config->filter_width, layer_config->filter_height, layer_config->in_channels, layer_config->out_channels, output, out_stride, start_idx, weights); } else if (in_width == 8) { av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( input, in_width, in_height, in_stride, layer_config->bias, layer_config->skip_width, layer_config->skip_height, layer_config->filter_width, layer_config->filter_height, 
layer_config->in_channels, layer_config->out_channels, output, out_stride, start_idx, weights); } else { av1_cnn_convolve_no_maxpool_padding_valid_c( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } } else { av1_cnn_convolve_no_maxpool_padding_valid_c( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } } aom-3.12.1/av1/encoder/arm/encodetxb_neon.c000066400000000000000000000545301477627663500204330ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" #include "av1/common/txb_common.h" #include "av1/encoder/encodetxb.h" void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = height + TX_PAD_HOR; memset(levels - TX_PAD_TOP * stride, 0, sizeof(*levels) * TX_PAD_TOP * stride); memset(levels + stride * width, 0, sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); const int32x4_t zeros = vdupq_n_s32(0); int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (height == 4) { do { const int32x4_t coeffA = vld1q_s32(cf); const int32x4_t coeffB = vld1q_s32(cf + height); const int16x8_t coeffAB = vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); const int16x8_t absAB = vqabsq_s16(coeffAB); const int8x8_t absABs = vqmovn_s16(absAB); #if AOM_ARCH_AARCH64 const int8x16_t absAB8 = vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros))); const uint8x16_t lsAB = vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros)); #else const int32x2x2_t absAB8 = vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros)); const uint8x16_t lsAB = vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1])); #endif vst1q_u8(ls, lsAB); ls += (stride << 1); cf += (height << 1); i += 2; } while (i < width); } else if (height == 8) { do { const int16x8_t coeffAB = load_tran_low_to_s16q(cf); const int16x8_t absAB = vqabsq_s16(coeffAB); const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8( vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros)))); vst1q_u8(ls, absAB8); ls += stride; cf += height; i += 1; } while (i < width); } else { do { int j = 0; do { const int16x8_t coeffAB = load_tran_low_to_s16q(cf); const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8); const int16x8_t absAB = vqabsq_s16(coeffAB); const int16x8_t absCD = vqabsq_s16(coeffCD); const uint8x16_t absABCD = vreinterpretq_u8_s8( vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD))); vst1q_u8((ls + j), absABCD); j += 16; cf += 16; } while (j < height); *(int32_t *)(ls + height) = 0; ls += stride; i += 1; } while (i < width); } } // get_4_nz_map_contexts_2d coefficients: static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = { { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 }, { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 } }; // get_4_nz_map_contexts_hor coefficients: /* clang-format off */ #define 
SIG_COEF_CONTEXTS_2D_X4_051010 \ (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \ ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24)) /* clang-format on */ // get_4_nz_map_contexts_ver coefficients: static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = { SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 }; // get_8_coeff_contexts_2d coefficients: // if (width == 8) static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = { { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 }, { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } }; // if (width < 8) static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = { { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 }, { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 } }; // if (width > 8) static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = { { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } }; // get_4_nz_map_contexts_ver coefficients: static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = { SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 }; // get_16n_coeff_contexts_2d coefficients: // real_width == real_height static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = { { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } }; // real_width < real_height static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = { { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } }; // real_width > real_height static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = { { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } }; // get_16n_coeff_contexts_hor coefficients: static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = { SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 }; // end of coefficients declaration area static inline uint8x16_t 
load_8bit_4x4_to_1_reg(const uint8_t *const src, const int byte_stride) { #if AOM_ARCH_AARCH64 uint32x4_t v_data = vld1q_u32((uint32_t *)src); v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1); v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2); v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3); return vreinterpretq_u8_u32(v_data); #else return load_unaligned_u8q(src, byte_stride); #endif } static inline uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, const int byte_stride) { #if AOM_ARCH_AARCH64 uint64x2_t v_data = vld1q_u64((uint64_t *)src); v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1); return vreinterpretq_u8_u64(v_data); #else uint8x8_t v_data_low = vld1_u8(src); uint8x8_t v_data_high = vld1_u8(src + byte_stride); return vcombine_u8(v_data_low, v_data_high); #endif } static inline uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, const int byte_stride) { (void)byte_stride; return vld1q_u8(src); } static inline void load_levels_4x4x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride); level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride); level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride); level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); } static inline void load_levels_8x2x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride); level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride); level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride); level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); } static inline void load_levels_16x1x5(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, uint8x16_t *const level) { level[0] = load_8bit_16x1_to_1_reg(&src[1], stride); level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride); level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride); level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride); level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); } static inline uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { const uint8x16_t const_3 = vdupq_n_u8(3); const uint8x16_t const_4 = vdupq_n_u8(4); uint8x16_t count; count = vminq_u8(level[0], const_3); level[1] = vminq_u8(level[1], const_3); level[2] = vminq_u8(level[2], const_3); level[3] = vminq_u8(level[3], const_3); level[4] = vminq_u8(level[4], const_3); count = vaddq_u8(count, level[1]); count = vaddq_u8(count, level[2]); count = vaddq_u8(count, level[3]); count = vaddq_u8(count, level[4]); count = vrshrq_n_u8(count, 1); count = vminq_u8(count, const_4); return count; } static inline void get_4_nz_map_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *const coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const uint8x16_t pos_to_offset_large = vdupq_n_u8(21); uint8x16_t pos_to_offset = (width == 4) ? 
vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]); uint8x16_t count; uint8x16_t level[5]; uint8_t *cc = coeff_contexts; assert(!(width % 4)); int col = width; do { load_levels_4x4x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(cc, count); pos_to_offset = pos_to_offset_large; levels += 4 * stride; cc += 16; col -= 4; } while (col); coeff_contexts[0] = 0; } static inline void get_4_nz_map_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const uint8x16_t pos_to_offset = vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010)); uint8x16_t count; uint8x16_t level[5]; assert(!(width % 4)); int col = width; do { load_levels_4x4x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(coeff_contexts, count); levels += 4 * stride; coeff_contexts += 16; col -= 4; } while (col); } static inline void get_4_nz_map_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor); uint8x16_t count; uint8x16_t level[5]; assert(!(width % 4)); int col = width; do { load_levels_4x4x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 4 * stride; coeff_contexts += 16; col -= 4; } while (col); } static inline void get_8_coeff_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; uint8_t *cc = coeff_contexts; uint8x16_t count; uint8x16_t level[5]; uint8x16_t pos_to_offset[3]; assert(!(width % 2)); if (width == 8) { pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]); pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]); } else if (width < 8) { pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]); pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]); } else { pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]); pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]); } pos_to_offset[2] = vdupq_n_u8(21); int col = width; do { load_levels_8x2x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset[0]); vst1q_u8(cc, count); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; levels += 2 * stride; cc += 16; col -= 2; } while (col); coeff_contexts[0] = 0; } static inline void get_8_coeff_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver); uint8x16_t count; uint8x16_t level[5]; assert(!(width % 2)); int col = width; do { load_levels_8x2x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(coeff_contexts, count); levels += 2 * stride; coeff_contexts += 16; col -= 2; } while (col); } static inline void get_8_coeff_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); uint8x16_t pos_to_offset = 
vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0), vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5)); uint8x16_t count; uint8x16_t level[5]; assert(!(width % 2)); int col = width; do { load_levels_8x2x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 2 * stride; coeff_contexts += 16; col -= 2; } while (col); } static inline void get_16n_coeff_contexts_2d(const uint8_t *levels, const int real_width, const int real_height, const int width, const int height, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; uint8_t *cc = coeff_contexts; int col = width; uint8x16_t pos_to_offset[5]; uint8x16_t pos_to_offset_large[3]; uint8x16_t count; uint8x16_t level[5]; assert(!(height % 16)); pos_to_offset_large[2] = vdupq_n_u8(21); if (real_width == real_height) { pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]); pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]); pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]); pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]); pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; } else if (real_width < real_height) { pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]); pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]); pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = vld1q_u8(c_16_po_2d_g[2]); pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; } else { // real_width > real_height pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]); pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]); pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]); pos_to_offset[4] = pos_to_offset_large[2]; pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16); } do { int h = height; do { load_levels_16x1x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset[0]); vst1q_u8(cc, count); levels += 16; cc += 16; h -= 16; pos_to_offset[0] = pos_to_offset_large[0]; } while (h); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; pos_to_offset[2] = pos_to_offset[3]; pos_to_offset[3] = pos_to_offset[4]; pos_to_offset_large[0] = pos_to_offset_large[1]; pos_to_offset_large[1] = pos_to_offset_large[2]; levels += TX_PAD_HOR; } while (--col); coeff_contexts[0] = 0; } static inline void get_16n_coeff_contexts_ver(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); uint8x16_t count; uint8x16_t level[5]; assert(!(height % 16)); int col = width; do { uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver); int h = height; do { load_levels_16x1x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset); vst1q_u8(coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 16; coeff_contexts += 16; h -= 16; } while (h); levels += TX_PAD_HOR; } while (--col); } static inline void get_16n_coeff_contexts_hor(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, uint8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; uint8x16_t pos_to_offset[3]; uint8x16_t count; uint8x16_t level[5]; assert(!(height % 16)); pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0); pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5); 
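// pos_to_offset[2] below supplies the offset for every remaining column: the
// rotation at the end of the outer loop shifts these three broadcast vectors
// along, so the first column of contexts gets SIG_COEF_CONTEXTS_2D + 0, the
// second + 5, and all later columns + 10.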
pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); int col = width; do { int h = height; do { load_levels_16x1x5(levels, stride, offsets, level); count = get_coeff_contexts_kernel(level); count = vaddq_u8(count, pos_to_offset[0]); vst1q_u8(coeff_contexts, count); levels += 16; coeff_contexts += 16; h -= 16; } while (h); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; levels += TX_PAD_HOR; } while (--col); } // Note: levels[] must be in the range [0, 127], inclusive. void av1_get_nz_map_contexts_neon(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts) { const int last_idx = eob - 1; if (!last_idx) { coeff_contexts[0] = 0; return; } uint8_t *const coefficients = (uint8_t *const)coeff_contexts; const int real_width = tx_size_wide[tx_size]; const int real_height = tx_size_high[tx_size]; const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); const int stride = height + TX_PAD_HOR; ptrdiff_t offsets[3]; /* coeff_contexts must be 16 byte aligned. */ assert(!((intptr_t)coeff_contexts & 0xf)); if (tx_class == TX_CLASS_2D) { offsets[0] = 0 * stride + 2; offsets[1] = 1 * stride + 1; offsets[2] = 2 * stride + 0; if (height == 4) { get_4_nz_map_contexts_2d(levels, width, offsets, coefficients); } else if (height == 8) { get_8_coeff_contexts_2d(levels, width, offsets, coefficients); } else { get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, offsets, coefficients); } } else if (tx_class == TX_CLASS_HORIZ) { offsets[0] = 2 * stride; offsets[1] = 3 * stride; offsets[2] = 4 * stride; if (height == 4) { get_4_nz_map_contexts_hor(levels, width, offsets, coefficients); } else if (height == 8) { get_8_coeff_contexts_hor(levels, width, offsets, coefficients); } else { get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients); } } else { // TX_CLASS_VERT offsets[0] = 2; offsets[1] = 3; offsets[2] = 4; if (height == 4) { get_4_nz_map_contexts_ver(levels, width, offsets, coefficients); } else if (height == 8) { get_8_coeff_contexts_ver(levels, width, offsets, coefficients); } else { get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients); } } const int bhl = get_txb_bhl(tx_size); const int pos = scan[last_idx]; if (last_idx <= (width << bhl) / 8) coeff_contexts[pos] = 1; else if (last_idx <= (width << bhl) / 4) coeff_contexts[pos] = 2; else coeff_contexts[pos] = 3; } aom-3.12.1/av1/encoder/arm/hash_arm_crc32.c000066400000000000000000000035311477627663500202120ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
#else
#include <arm_acle.h>
#endif

#include <stddef.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#define CRC_LOOP(op, crc, type, buf, len) \
  while ((len) >= sizeof(type)) {         \
    (crc) = op((crc), *(type *)(buf));    \
    (len) -= sizeof(type);                \
    buf += sizeof(type);                  \
  }

#define CRC_SINGLE(op, crc, type, buf, len) \
  if ((len) >= sizeof(type)) {              \
    (crc) = op((crc), *(type *)(buf));      \
    (len) -= sizeof(type);                  \
    buf += sizeof(type);                    \
  }

/* Return 32-bit CRC for the input buffer.
 * Polynomial is 0x1EDC6F41. */
uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
                                        size_t len) {
  (void)crc_calculator;
  const uint8_t *buf = p;
  uint32_t crc = 0xFFFFFFFF;

#if !AOM_ARCH_AARCH64
  // Align input to 8-byte boundary (only necessary for 32-bit builds.)
  while (len && ((uintptr_t)buf & 7)) {
    crc = __crc32cb(crc, *buf++);
    len--;
  }
#endif

  CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
  CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
  CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
  CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
  return ~crc;
}
aom-3.12.1/av1/encoder/arm/highbd_fwd_txfm_neon.c000066400000000000000000002776221477627663500216100ustar00rootroot00000000000000
/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "shift_neon.h"
#include "txfm_neon.h"

static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
                                                        int32x4_t *out) {
  // This is not quite the same as the other transposes defined in
  // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
  // unused by the following row transform.
  for (int j = 0; j < 8; ++j) {
    for (int i = 0; i < 16; ++i) {
      transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
    }
  }
}

// A note on butterfly helper naming:
//
// butterfly_[weight_indices]_neon
// e.g. butterfly_0312_neon
//                ^ Weights are applied as indices 0, 3, 2, 1
//                  (see more detail below)
//
// Weight indices are treated as an index into the 4-tuple of the weight
// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
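// (The first two entries of this tuple are what vld1_s32(cospi + 2 * widx0)
// loads in the helpers below; butterfly_half_neon() negates that pair with
// vneg_s32() to provide the remaining two entries, so a helper's lane
// indices simply select one of these four values for each multiply.)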
// This is then represented in the helper naming by referring to the lane index // in the loaded tuple that each multiply is performed with: // // in0 in1 // /------------ // out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1] // out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3] // // So for indices 0321 from the earlier example, we end up with: // // in0 in1 // /------------------ // out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1) // out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0) #define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \ do { \ int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \ int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \ x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \ *out = vrshlq_s32(x, v_bit); \ } while (false) static AOM_FORCE_INLINE void butterfly_0112_neon( const int32_t *cospi, const int widx0, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * widx0); butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void butterfly_2312_neon( const int32_t *cospi, const int widx0, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * widx0); butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit); butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void butterfly_0332_neon( const int32_t *cospi, const int widx0, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * widx0); butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit); butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void butterfly_0130_neon( const int32_t *cospi, const int widx0, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * widx0); butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon( const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * 32); butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit); butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon( const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { int32x2_t w01 = vld1_s32(cospi + 2 * 32); butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit); butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit); } static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input, int32x4_t *output, const int size) { const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); int i = 0; do { const int32x4_t r1 = vmulq_s32(input[i], sqrt2); output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); } while (++i < size); } static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon( const int32x4_t *input, int32x4_t *output, const int size) { const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); int i = 0; do { const int32x4_t r0 = vrshrq_n_s32(input[i], 2); const int32x4_t r1 = vmulq_s32(r0, sqrt2); output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); } while (++i < size); } #define LOAD_BUFFER_4XH(h) \ static AOM_FORCE_INLINE void 
load_buffer_4x##h( \ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ if (fliplr) { \ for (int i = 0; i < (h); ++i) { \ int16x4_t a = vld1_s16(input + i * stride); \ a = vrev64_s16(a); \ in[i] = vshll_n_s16(a, 2); \ } \ } else { \ for (int i = 0; i < (h); ++i) { \ int16x4_t a = vld1_s16(input + i * stride); \ in[i] = vshll_n_s16(a, 2); \ } \ } \ } // AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to // avoid the expression even though the compiler can prove that the code path // is never taken if `shift == 0`. #define shift_left_long_s16(a, shift) \ ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift))) #define LOAD_BUFFER_WXH(w, h, shift) \ static AOM_FORCE_INLINE void load_buffer_##w##x##h( \ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ assert(w >= 8); \ if (fliplr) { \ for (int i = 0; i < (h); ++i) { \ for (int j = 0; j < (w) / 8; ++j) { \ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ a = vrev64q_s16(a); \ int j2 = (w) / 8 - j - 1; \ in[i + (h) * (2 * j2 + 0)] = \ shift_left_long_s16(vget_high_s16(a), (shift)); \ in[i + (h) * (2 * j2 + 1)] = \ shift_left_long_s16(vget_low_s16(a), (shift)); \ } \ } \ } else { \ for (int i = 0; i < (h); ++i) { \ for (int j = 0; j < (w) / 8; ++j) { \ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ in[i + (h) * (2 * j + 0)] = \ shift_left_long_s16(vget_low_s16(a), (shift)); \ in[i + (h) * (2 * j + 1)] = \ shift_left_long_s16(vget_high_s16(a), (shift)); \ } \ } \ } \ } LOAD_BUFFER_4XH(4) LOAD_BUFFER_4XH(8) LOAD_BUFFER_4XH(16) LOAD_BUFFER_4XH(32) LOAD_BUFFER_WXH(8, 8, 2) LOAD_BUFFER_WXH(16, 16, 2) LOAD_BUFFER_WXH(32, 64, 0) LOAD_BUFFER_WXH(64, 32, 2) LOAD_BUFFER_WXH(64, 64, 0) #if !CONFIG_REALTIME_ONLY LOAD_BUFFER_WXH(16, 64, 0) LOAD_BUFFER_WXH(64, 16, 2) #endif // !CONFIG_REALTIME_ONLY #define STORE_BUFFER_WXH(w, h) \ static AOM_FORCE_INLINE void store_buffer_##w##x##h( \ const int32x4_t *in, int32_t *out, int stride) { \ for (int i = 0; i < (w); ++i) { \ for (int j = 0; j < (h) / 4; ++j) { \ vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \ } \ } \ } STORE_BUFFER_WXH(4, 4) STORE_BUFFER_WXH(8, 4) STORE_BUFFER_WXH(8, 8) STORE_BUFFER_WXH(16, 4) STORE_BUFFER_WXH(16, 16) STORE_BUFFER_WXH(32, 4) STORE_BUFFER_WXH(32, 32) STORE_BUFFER_WXH(64, 32) #if !CONFIG_REALTIME_ONLY STORE_BUFFER_WXH(16, 32) STORE_BUFFER_WXH(64, 16) #endif // !CONFIG_REALTIME_ONLY static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32_t *const cospi = cospi_arr_s32(bit); const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]); const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]); const int32x4_t a0 = vaddq_s32(in[0], in[3]); const int32x4_t a1 = vsubq_s32(in[0], in[3]); const int32x4_t a2 = vaddq_s32(in[1], in[2]); const int32x4_t a3 = vsubq_s32(in[1], in[2]); const int32x4_t b0 = vmulq_s32(a0, cospi32); const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1); const int32x4_t b2 = vmulq_s32(a2, cospi32); const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1); const int32x4_t c0 = vaddq_s32(b0, b2); const int32x4_t c1 = vsubq_s32(b0, b2); const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0); const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0); const int32x4_t v_bit = vdupq_n_s32(-bit); const int32x4_t d0 = vrshlq_s32(c0, v_bit); const int32x4_t d1 = vrshlq_s32(c1, v_bit); const int32x4_t d2 = vrshlq_s32(c2, v_bit); const int32x4_t d3 = vrshlq_s32(c3, v_bit); out[0] = d0; out[1] = d2; out[2] = d1; out[3] = d3; } static 
AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1); const int32x4_t a0 = vaddq_s32(in[0], in[1]); const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0); const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1); const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0); const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1); const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0); const int32x4_t b2 = vsubq_s32(a0, in[3]); const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1); const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1); const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0); const int32x4_t d0 = vaddq_s32(c0, a3); const int32x4_t d1 = vsubq_s32(c1, a3); const int32x4_t d2 = vsubq_s32(c1, c0); const int32x4_t e0 = vaddq_s32(d2, a3); const int32x4_t v_bit = vdupq_n_s32(-bit); out[0] = vrshlq_s32(d0, v_bit); out[1] = vrshlq_s32(c2, v_bit); out[2] = vrshlq_s32(d1, v_bit); out[3] = vrshlq_s32(e0, v_bit); } static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { (void)bit; int32x4_t fact = vdupq_n_s32(NewSqrt2); for (int i = 0; i < 4; i++) { const int32x4_t a_low = vmulq_s32(in[i], fact); out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits); } } void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff, int input_stride, TX_TYPE tx_type, int bd) { (void)bd; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4); // Workspace for column/row-wise transforms. int32x4_t buf[4]; switch (tx_type) { case DCT_DCT: load_buffer_4x4(input, buf, input_stride, 0); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case ADST_DCT: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case DCT_ADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case ADST_ADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case FLIPADST_DCT: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case DCT_FLIPADST: load_buffer_4x4(input, buf, input_stride, 1); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case FLIPADST_FLIPADST: load_buffer_4x4(input, buf, input_stride, 1); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, 
av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case ADST_FLIPADST: load_buffer_4x4(input, buf, input_stride, 1); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case FLIPADST_ADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case IDTX: load_buffer_4x4(input, buf, input_stride, 0); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case V_DCT: load_buffer_4x4(input, buf, input_stride, 0); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case H_DCT: load_buffer_4x4(input, buf, input_stride, 0); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case V_ADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case H_ADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case V_FLIPADST: load_buffer_4x4(input, buf, input_stride, 0); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; case H_FLIPADST: load_buffer_4x4(input, buf, input_stride, 1); highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); transpose_arrays_s32_4x4(buf, buf); highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); store_buffer_4x4(buf, coeff, /*stride=*/4); break; default: assert(0); } } // Butterfly pre-processing: // e.g. n=4: // out[0] = in[0] + in[3] // out[1] = in[1] + in[2] // out[2] = in[1] - in[2] // out[3] = in[0] - in[3] static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input, int32x4_t *output, int n) { for (int i = 0; i < n / 2; ++i) { output[i] = vaddq_s32(input[i], input[n - i - 1]); } for (int i = 0; i < n / 2; ++i) { output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); } } // Butterfly post-processing: // e.g. 
n=8: // out[0] = in0[0] + in1[3]; // out[1] = in0[1] + in1[2]; // out[2] = in0[1] - in1[2]; // out[3] = in0[0] - in1[3]; // out[4] = in0[7] - in1[4]; // out[5] = in0[6] - in1[5]; // out[6] = in0[6] + in1[5]; // out[7] = in0[7] + in1[4]; static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0, const int32x4_t *in1, int32x4_t *output, int n) { for (int i = 0; i < n / 4; ++i) { output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]); } for (int i = 0; i < n / 4; ++i) { output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); } for (int i = 0; i < n / 4; ++i) { output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]); } for (int i = 0; i < n / 4; ++i) { output[(3 * n) / 4 + i] = vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); } } static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32_t *const cospi = cospi_arr_s32(bit); const int32x4_t v_bit = vdupq_n_s32(-bit); // stage 1 int32x4_t a[8]; butterfly_dct_pre(in, a, 8); // stage 2 int32x4_t b[8]; butterfly_dct_pre(a, b, 4); butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit); // stage 3 int32x4_t c[8]; butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit); butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit); butterfly_dct_post(a + 4, b + 4, c + 4, 4); // stage 4-5 butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit); butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit); out[0] = c[0]; out[2] = c[2]; out[4] = c[1]; out[6] = c[3]; } static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32_t *const cospi = cospi_arr_s32(bit); const int32x4_t v_bit = vdupq_n_s32(-bit); int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; // stage 0-1 u0 = in[0]; u1 = in[7]; u2 = in[3]; u3 = in[4]; u4 = in[1]; u5 = in[6]; u6 = in[2]; u7 = in[5]; // stage 2 v0 = u0; v1 = u1; butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit); v4 = u4; v5 = u5; butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit); // stage 3 u0 = vaddq_s32(v0, v2); u1 = vsubq_s32(v3, v1); u2 = vsubq_s32(v0, v2); u3 = vaddq_s32(v1, v3); u4 = vsubq_s32(v6, v4); u5 = vaddq_s32(v5, v7); u6 = vaddq_s32(v4, v6); u7 = vsubq_s32(v5, v7); // stage 4 v0 = u0; v1 = u1; v2 = u2; v3 = u3; butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit); butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit); // stage 5 u0 = vaddq_s32(v0, v4); u1 = vaddq_s32(v1, v5); u2 = vaddq_s32(v2, v6); u3 = vsubq_s32(v7, v3); u4 = vsubq_s32(v0, v4); u5 = vsubq_s32(v1, v5); u6 = vsubq_s32(v2, v6); u7 = vaddq_s32(v3, v7); // stage 6 butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit); butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit); butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit); butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit); // stage 7 out[0] = v1; out[1] = v6; out[2] = v3; out[3] = v4; out[4] = v5; out[5] = v2; out[6] = v7; out[7] = v0; } static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { (void)bit; out[0] = vshlq_n_s32(in[0], 1); out[1] = vshlq_n_s32(in[1], 1); out[2] = vshlq_n_s32(in[2], 1); out[3] = vshlq_n_s32(in[3], 1); out[4] = vshlq_n_s32(in[4], 1); out[5] = vshlq_n_s32(in[5], 1); out[6] = vshlq_n_s32(in[6], 1); out[7] = vshlq_n_s32(in[7], 1); } static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, int howmany) { const int stride = 8; int i = 0; do { 
highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, int howmany) { const int stride = 8; int i = 0; do { highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, int howmany) { (void)bit; const int stride = 8; int i = 0; do { highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); // Workspaces for column/row-wise transforms. int32x4_t buf0[16], buf1[16]; switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, buf0, stride, 0); highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case ADST_DCT: load_buffer_8x8(input, buf0, stride, 0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case DCT_ADST: load_buffer_8x8(input, buf0, stride, 0); highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case ADST_ADST: load_buffer_8x8(input, buf0, stride, 0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case FLIPADST_DCT: load_buffer_8x8(input, buf0, stride, 0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case DCT_FLIPADST: load_buffer_8x8(input, buf0, stride, 1); highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, buf0, stride, 1); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case ADST_FLIPADST: load_buffer_8x8(input, buf0, stride, 1); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case FLIPADST_ADST: load_buffer_8x8(input, buf0, stride, 
0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case IDTX: load_buffer_8x8(input, buf0, stride, 0); highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case V_DCT: load_buffer_8x8(input, buf0, stride, 0); highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case H_DCT: load_buffer_8x8(input, buf0, stride, 0); highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case V_ADST: load_buffer_8x8(input, buf0, stride, 0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case H_ADST: load_buffer_8x8(input, buf0, stride, 0); highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case V_FLIPADST: load_buffer_8x8(input, buf0, stride, 0); highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; case H_FLIPADST: load_buffer_8x8(input, buf0, stride, 1); highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_8x8(buf0, buf1); highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); store_buffer_8x8(buf1, coeff, /*stride=*/8); break; default: assert(0); } } static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32_t *const cospi = cospi_arr_s32(bit); const int32x4_t v_bit = vdupq_n_s32(-bit); int32x4_t u[16], v[16]; // stage 1 butterfly_dct_pre(in, u, 16); // stage 2 butterfly_dct_pre(u, v, 8); v[8] = u[8]; v[9] = u[9]; butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit); butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit); v[14] = u[14]; v[15] = u[15]; // stage 3 butterfly_dct_pre(v, u, 4); u[4] = v[4]; butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit); u[7] = v[7]; butterfly_dct_post(v + 8, v + 8, u + 8, 8); // stage 4 butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit); butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit); butterfly_dct_post(u + 4, u + 4, v + 4, 4); v[8] = u[8]; butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit); butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit); v[11] 
= u[11]; v[12] = u[12]; v[15] = u[15]; // stage 5 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit); butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit); butterfly_dct_post(v + 8, v + 8, u + 8, 4); butterfly_dct_post(v + 12, v + 12, u + 12, 4); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit); butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit); butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit); butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit); out[0] = v[0]; out[1] = v[8]; out[2] = v[4]; out[3] = v[12]; out[4] = v[2]; out[5] = v[10]; out[6] = v[6]; out[7] = v[14]; out[8] = v[1]; out[9] = v[9]; out[10] = v[5]; out[11] = v[13]; out[12] = v[3]; out[13] = v[11]; out[14] = v[7]; out[15] = v[15]; } static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { const int32_t *const cospi = cospi_arr_s32(bit); const int32x4_t v_bit = vdupq_n_s32(-bit); int32x4_t u[16], v[16]; // stage 0-1 u[0] = in[0]; u[1] = in[15]; u[2] = in[7]; u[3] = in[8]; u[4] = in[3]; u[5] = in[12]; u[6] = in[4]; u[7] = in[11]; u[8] = in[1]; u[9] = in[14]; u[10] = in[6]; u[11] = in[9]; u[12] = in[2]; u[13] = in[13]; u[14] = in[5]; u[15] = in[10]; // stage 2 v[0] = u[0]; v[1] = u[1]; butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit); v[4] = u[4]; v[5] = u[5]; butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit); v[8] = u[8]; v[9] = u[9]; butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit); v[12] = u[12]; v[13] = u[13]; butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit); // stage 3 u[0] = vaddq_s32(v[0], v[2]); u[1] = vsubq_s32(v[3], v[1]); u[2] = vsubq_s32(v[0], v[2]); u[3] = vaddq_s32(v[1], v[3]); u[4] = vsubq_s32(v[6], v[4]); u[5] = vaddq_s32(v[5], v[7]); u[6] = vaddq_s32(v[4], v[6]); u[7] = vsubq_s32(v[5], v[7]); u[8] = vsubq_s32(v[10], v[8]); u[9] = vaddq_s32(v[9], v[11]); u[10] = vaddq_s32(v[8], v[10]); u[11] = vsubq_s32(v[9], v[11]); u[12] = vaddq_s32(v[12], v[14]); u[13] = vsubq_s32(v[15], v[13]); u[14] = vsubq_s32(v[12], v[14]); u[15] = vaddq_s32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit); butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit); butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit); // stage 5 u[0] = vaddq_s32(v[0], v[4]); u[1] = vaddq_s32(v[1], v[5]); u[2] = vaddq_s32(v[2], v[6]); u[3] = vsubq_s32(v[7], v[3]); u[4] = vsubq_s32(v[0], v[4]); u[5] = vsubq_s32(v[1], v[5]); u[6] = vsubq_s32(v[2], v[6]); u[7] = vaddq_s32(v[3], v[7]); u[8] = vaddq_s32(v[8], v[12]); u[9] = vaddq_s32(v[9], v[13]); u[10] = vsubq_s32(v[14], v[10]); u[11] = vaddq_s32(v[11], v[15]); u[12] = vsubq_s32(v[8], v[12]); u[13] = vsubq_s32(v[9], v[13]); u[14] = vaddq_s32(v[10], v[14]); u[15] = vsubq_s32(v[11], v[15]); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit); butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit); butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit); 
butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit); // stage 7 u[0] = vaddq_s32(v[0], v[8]); u[1] = vaddq_s32(v[1], v[9]); u[2] = vaddq_s32(v[2], v[10]); u[3] = vaddq_s32(v[3], v[11]); u[4] = vaddq_s32(v[4], v[12]); u[5] = vaddq_s32(v[5], v[13]); u[6] = vaddq_s32(v[6], v[14]); u[7] = vsubq_s32(v[15], v[7]); u[8] = vsubq_s32(v[0], v[8]); u[9] = vsubq_s32(v[1], v[9]); u[10] = vsubq_s32(v[2], v[10]); u[11] = vsubq_s32(v[3], v[11]); u[12] = vsubq_s32(v[4], v[12]); u[13] = vsubq_s32(v[5], v[13]); u[14] = vsubq_s32(v[6], v[14]); u[15] = vaddq_s32(v[7], v[15]); // stage 8 butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit); butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit); butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit); butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit); butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit); butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit); butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit); butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit); // stage 9 out[0] = v[1]; out[1] = v[14]; out[2] = v[3]; out[3] = v[12]; out[4] = v[5]; out[5] = v[10]; out[6] = v[7]; out[7] = v[8]; out[8] = v[9]; out[9] = v[6]; out[10] = v[11]; out[11] = v[4]; out[12] = v[13]; out[13] = v[2]; out[14] = v[15]; out[15] = v[0]; } static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out, int bit) { (void)bit; const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2); const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1)); for (int i = 0; i < 16; i++) { int32x4_t a = vmulq_s32(in[i], fact); a = vaddq_s32(a, offset); out[i] = vshrq_n_s32(a, NewSqrt2Bits); } } static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, const int howmany) { const int stride = 16; int i = 0; do { highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, int howmany) { const int stride = 16; int i = 0; do { highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, int howmany) { const int stride = 16; int i = 0; do { highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit); } while (++i < howmany); } void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); // Workspaces for column/row-wise transforms. 
int32x4_t buf0[64], buf1[64]; switch (tx_type) { case DCT_DCT: load_buffer_16x16(input, buf0, stride, 0); highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case ADST_DCT: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case DCT_ADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case ADST_ADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case FLIPADST_DCT: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case DCT_FLIPADST: load_buffer_16x16(input, buf0, stride, 1); highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, buf0, stride, 1); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case ADST_FLIPADST: load_buffer_16x16(input, buf0, stride, 1); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case FLIPADST_ADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case IDTX: load_buffer_16x16(input, buf0, stride, 0); highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case V_DCT: load_buffer_16x16(input, buf0, stride, 0); highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); 
transpose_arrays_s32_16x16(buf0, buf1); highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case H_DCT: load_buffer_16x16(input, buf0, stride, 0); highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case V_ADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case H_ADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case V_FLIPADST: load_buffer_16x16(input, buf0, stride, 0); highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; case H_FLIPADST: load_buffer_16x16(input, buf0, stride, 1); highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); shift_right_2_round_s32_x4(buf0, buf0, 64); transpose_arrays_s32_16x16(buf0, buf1); highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); store_buffer_16x16(buf1, coeff, /*stride=*/16); break; default: assert(0); } } typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out, int stride, int bit, int lr_flip); typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in, int32x4_t *out, int stride, int bit, int lr_flip, int howmany, int hm_stride); typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out, int bit, int stride); typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in, int32_t *out, int bit, int howmany, int hm_stride, int stride); // Construct component kernels that include the load_buffer and store_buffer // stages to avoid the need to spill loaded data to the stack between these and // the txfm kernel calls. // The TRANSFORM_*_ONE cases are only ever called in situations where the // howmany parameter would be one, so no need for the loop at all in these // cases. 
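// As an illustration only (derived from the macro below, not part of the
// build), the instantiation TRANSFORM_COL_ONE(fdct8, 8) expands to roughly:
//
//   static void highbd_fdct8_col_neon(const int16_t *input, int32x4_t *output,
//                                     int stride, int cos_bit, int lr_flip) {
//     int32x4_t buf0[8];
//     load_buffer_4x8(input, buf0, stride, lr_flip);
//     highbd_fdct8_x4_neon(buf0, output, cos_bit);
//   }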
#define TRANSFORM_COL_ONE(name, n) \ static void highbd_##name##_col_neon(const int16_t *input, \ int32x4_t *output, int stride, \ int cos_bit, int lr_flip) { \ int32x4_t buf0[n]; \ load_buffer_4x##n(input, buf0, stride, lr_flip); \ highbd_##name##_x4_neon(buf0, output, cos_bit); \ } #define TRANSFORM_COL_MANY(name, n) \ static void highbd_##name##_col_many_neon( \ const int16_t *input, int32x4_t *output, int stride, int cos_bit, \ int lr_flip, int howmany, int hm_stride) { \ int i = 0; \ do { \ int32x4_t buf0[n]; \ load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \ highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \ } while (++i < howmany); \ } #define TRANSFORM_ROW_ONE(name, n) \ static void highbd_##name##_row_neon( \ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ int32x4_t buf0[n]; \ highbd_##name##_x4_neon(input, buf0, cos_bit); \ store_buffer_##n##x4(buf0, output, stride); \ } #define TRANSFORM_ROW_RECT_ONE(name, n) \ static void highbd_##name##_row_rect_neon( \ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ int32x4_t buf0[n]; \ highbd_##name##_x4_neon(input, buf0, cos_bit); \ round_rect_array_s32_neon(buf0, buf0, (n)); \ store_buffer_##n##x4(buf0, output, stride); \ } #define TRANSFORM_ROW_MANY(name, n) \ static void highbd_##name##_row_many_neon( \ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ int hm_stride, int stride) { \ int i = 0; \ do { \ int32x4_t buf0[n]; \ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ store_buffer_##n##x4(buf0, output + 4 * i, stride); \ } while (++i < howmany); \ } #define TRANSFORM_ROW_RECT_MANY(name, n) \ static void highbd_##name##_row_rect_many_neon( \ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ int hm_stride, int stride) { \ int i = 0; \ do { \ int32x4_t buf0[n]; \ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ round_rect_array_s32_neon(buf0, buf0, (n)); \ store_buffer_##n##x4(buf0, output + 4 * i, stride); \ } while (++i < howmany); \ } TRANSFORM_COL_ONE(fdct8, 8) TRANSFORM_COL_ONE(fadst8, 8) TRANSFORM_COL_ONE(fidentity8, 8) TRANSFORM_COL_MANY(fdct4, 4) TRANSFORM_COL_MANY(fdct8, 8) TRANSFORM_COL_MANY(fdct16, 16) TRANSFORM_COL_MANY(fadst4, 4) TRANSFORM_COL_MANY(fadst8, 8) TRANSFORM_COL_MANY(fadst16, 16) TRANSFORM_COL_MANY(fidentity4, 4) TRANSFORM_COL_MANY(fidentity8, 8) TRANSFORM_COL_MANY(fidentity16, 16) TRANSFORM_ROW_ONE(fdct16, 16) TRANSFORM_ROW_ONE(fadst16, 16) TRANSFORM_ROW_ONE(fidentity16, 16) TRANSFORM_ROW_RECT_ONE(fdct8, 8) TRANSFORM_ROW_RECT_ONE(fadst8, 8) TRANSFORM_ROW_RECT_ONE(fidentity8, 8) #if !CONFIG_REALTIME_ONLY TRANSFORM_ROW_MANY(fdct4, 4) TRANSFORM_ROW_MANY(fdct8, 8) TRANSFORM_ROW_MANY(fadst4, 4) TRANSFORM_ROW_MANY(fadst8, 8) TRANSFORM_ROW_MANY(fidentity4, 4) TRANSFORM_ROW_MANY(fidentity8, 8) #endif TRANSFORM_ROW_RECT_MANY(fdct4, 4) TRANSFORM_ROW_RECT_MANY(fdct8, 8) TRANSFORM_ROW_RECT_MANY(fdct16, 16) TRANSFORM_ROW_RECT_MANY(fadst4, 4) TRANSFORM_ROW_RECT_MANY(fadst8, 8) TRANSFORM_ROW_RECT_MANY(fadst16, 16) TRANSFORM_ROW_RECT_MANY(fidentity4, 4) TRANSFORM_ROW_RECT_MANY(fidentity8, 8) TRANSFORM_ROW_RECT_MANY(fidentity16, 16) static const fwd_transform_1d_col_many_neon col_highbd_txfm8_xn_arr[TX_TYPES] = { highbd_fdct8_col_many_neon, // DCT_DCT highbd_fadst8_col_many_neon, // ADST_DCT highbd_fdct8_col_many_neon, // DCT_ADST highbd_fadst8_col_many_neon, // ADST_ADST highbd_fadst8_col_many_neon, // FLIPADST_DCT highbd_fdct8_col_many_neon, // DCT_FLIPADST highbd_fadst8_col_many_neon, // 
FLIPADST_FLIPADST highbd_fadst8_col_many_neon, // ADST_FLIPADST highbd_fadst8_col_many_neon, // FLIPADST_ADST highbd_fidentity8_col_many_neon, // IDTX highbd_fdct8_col_many_neon, // V_DCT highbd_fidentity8_col_many_neon, // H_DCT highbd_fadst8_col_many_neon, // V_ADST highbd_fidentity8_col_many_neon, // H_ADST highbd_fadst8_col_many_neon, // V_FLIPADST highbd_fidentity8_col_many_neon // H_FLIPADST }; static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = { highbd_fdct8_col_neon, // DCT_DCT highbd_fadst8_col_neon, // ADST_DCT highbd_fdct8_col_neon, // DCT_ADST highbd_fadst8_col_neon, // ADST_ADST highbd_fadst8_col_neon, // FLIPADST_DCT highbd_fdct8_col_neon, // DCT_FLIPADST highbd_fadst8_col_neon, // FLIPADST_FLIPADST highbd_fadst8_col_neon, // ADST_FLIPADST highbd_fadst8_col_neon, // FLIPADST_ADST highbd_fidentity8_col_neon, // IDTX highbd_fdct8_col_neon, // V_DCT highbd_fidentity8_col_neon, // H_DCT highbd_fadst8_col_neon, // V_ADST highbd_fidentity8_col_neon, // H_ADST highbd_fadst8_col_neon, // V_FLIPADST highbd_fidentity8_col_neon // H_FLIPADST }; static const fwd_transform_1d_col_many_neon col_highbd_txfm16_xn_arr[TX_TYPES] = { highbd_fdct16_col_many_neon, // DCT_DCT highbd_fadst16_col_many_neon, // ADST_DCT highbd_fdct16_col_many_neon, // DCT_ADST highbd_fadst16_col_many_neon, // ADST_ADST highbd_fadst16_col_many_neon, // FLIPADST_DCT highbd_fdct16_col_many_neon, // DCT_FLIPADST highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST highbd_fadst16_col_many_neon, // ADST_FLIPADST highbd_fadst16_col_many_neon, // FLIPADST_ADST highbd_fidentity16_col_many_neon, // IDTX highbd_fdct16_col_many_neon, // V_DCT highbd_fidentity16_col_many_neon, // H_DCT highbd_fadst16_col_many_neon, // V_ADST highbd_fidentity16_col_many_neon, // H_ADST highbd_fadst16_col_many_neon, // V_FLIPADST highbd_fidentity16_col_many_neon // H_FLIPADST }; static const fwd_transform_1d_col_many_neon col_highbd_txfm4_xn_arr[TX_TYPES] = { highbd_fdct4_col_many_neon, // DCT_DCT highbd_fadst4_col_many_neon, // ADST_DCT highbd_fdct4_col_many_neon, // DCT_ADST highbd_fadst4_col_many_neon, // ADST_ADST highbd_fadst4_col_many_neon, // FLIPADST_DCT highbd_fdct4_col_many_neon, // DCT_FLIPADST highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST highbd_fadst4_col_many_neon, // ADST_FLIPADST highbd_fadst4_col_many_neon, // FLIPADST_ADST highbd_fidentity4_col_many_neon, // IDTX highbd_fdct4_col_many_neon, // V_DCT highbd_fidentity4_col_many_neon, // H_DCT highbd_fadst4_col_many_neon, // V_ADST highbd_fidentity4_col_many_neon, // H_ADST highbd_fadst4_col_many_neon, // V_FLIPADST highbd_fidentity4_col_many_neon // H_FLIPADST }; static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = { highbd_fdct16_row_neon, // DCT_DCT highbd_fdct16_row_neon, // ADST_DCT highbd_fadst16_row_neon, // DCT_ADST highbd_fadst16_row_neon, // ADST_ADST highbd_fdct16_row_neon, // FLIPADST_DCT highbd_fadst16_row_neon, // DCT_FLIPADST highbd_fadst16_row_neon, // FLIPADST_FLIPADST highbd_fadst16_row_neon, // ADST_FLIPADST highbd_fadst16_row_neon, // FLIPADST_ADST highbd_fidentity16_row_neon, // IDTX highbd_fidentity16_row_neon, // V_DCT highbd_fdct16_row_neon, // H_DCT highbd_fidentity16_row_neon, // V_ADST highbd_fadst16_row_neon, // H_ADST highbd_fidentity16_row_neon, // V_FLIPADST highbd_fadst16_row_neon // H_FLIPADST }; static const fwd_transform_1d_row_many_neon row_rect_highbd_txfm16_xn_arr[TX_TYPES] = { highbd_fdct16_row_rect_many_neon, // DCT_DCT highbd_fdct16_row_rect_many_neon, // ADST_DCT 
highbd_fadst16_row_rect_many_neon, // DCT_ADST highbd_fadst16_row_rect_many_neon, // ADST_ADST highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST highbd_fidentity16_row_rect_many_neon, // IDTX highbd_fidentity16_row_rect_many_neon, // V_DCT highbd_fdct16_row_rect_many_neon, // H_DCT highbd_fidentity16_row_rect_many_neon, // V_ADST highbd_fadst16_row_rect_many_neon, // H_ADST highbd_fidentity16_row_rect_many_neon, // V_FLIPADST highbd_fadst16_row_rect_many_neon // H_FLIPADST }; #if !CONFIG_REALTIME_ONLY static const fwd_transform_1d_row_many_neon row_highbd_txfm8_xn_arr[TX_TYPES] = { highbd_fdct8_row_many_neon, // DCT_DCT highbd_fdct8_row_many_neon, // ADST_DCT highbd_fadst8_row_many_neon, // DCT_ADST highbd_fadst8_row_many_neon, // ADST_ADST highbd_fdct8_row_many_neon, // FLIPADST_DCT highbd_fadst8_row_many_neon, // DCT_FLIPADST highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST highbd_fadst8_row_many_neon, // ADST_FLIPADST highbd_fadst8_row_many_neon, // FLIPADST_ADST highbd_fidentity8_row_many_neon, // IDTX highbd_fidentity8_row_many_neon, // V_DCT highbd_fdct8_row_many_neon, // H_DCT highbd_fidentity8_row_many_neon, // V_ADST highbd_fadst8_row_many_neon, // H_ADST highbd_fidentity8_row_many_neon, // V_FLIPADST highbd_fadst8_row_many_neon // H_FLIPADST }; #endif static const fwd_transform_1d_row_many_neon row_rect_highbd_txfm8_xn_arr[TX_TYPES] = { highbd_fdct8_row_rect_many_neon, // DCT_DCT highbd_fdct8_row_rect_many_neon, // ADST_DCT highbd_fadst8_row_rect_many_neon, // DCT_ADST highbd_fadst8_row_rect_many_neon, // ADST_ADST highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST highbd_fidentity8_row_rect_many_neon, // IDTX highbd_fidentity8_row_rect_many_neon, // V_DCT highbd_fdct8_row_rect_many_neon, // H_DCT highbd_fidentity8_row_rect_many_neon, // V_ADST highbd_fadst8_row_rect_many_neon, // H_ADST highbd_fidentity8_row_rect_many_neon, // V_FLIPADST highbd_fadst8_row_rect_many_neon // H_FLIPADST }; static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = { highbd_fdct8_row_rect_neon, // DCT_DCT highbd_fdct8_row_rect_neon, // ADST_DCT highbd_fadst8_row_rect_neon, // DCT_ADST highbd_fadst8_row_rect_neon, // ADST_ADST highbd_fdct8_row_rect_neon, // FLIPADST_DCT highbd_fadst8_row_rect_neon, // DCT_FLIPADST highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST highbd_fadst8_row_rect_neon, // ADST_FLIPADST highbd_fadst8_row_rect_neon, // FLIPADST_ADST highbd_fidentity8_row_rect_neon, // IDTX highbd_fidentity8_row_rect_neon, // V_DCT highbd_fdct8_row_rect_neon, // H_DCT highbd_fidentity8_row_rect_neon, // V_ADST highbd_fadst8_row_rect_neon, // H_ADST highbd_fidentity8_row_rect_neon, // V_FLIPADST highbd_fadst8_row_rect_neon // H_FLIPADST }; #if !CONFIG_REALTIME_ONLY static const fwd_transform_1d_row_many_neon row_highbd_txfm4_xn_arr[TX_TYPES] = { highbd_fdct4_row_many_neon, // DCT_DCT highbd_fdct4_row_many_neon, // ADST_DCT highbd_fadst4_row_many_neon, // DCT_ADST highbd_fadst4_row_many_neon, // ADST_ADST highbd_fdct4_row_many_neon, // FLIPADST_DCT highbd_fadst4_row_many_neon, // DCT_FLIPADST highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST highbd_fadst4_row_many_neon, // 
ADST_FLIPADST highbd_fadst4_row_many_neon, // FLIPADST_ADST highbd_fidentity4_row_many_neon, // IDTX highbd_fidentity4_row_many_neon, // V_DCT highbd_fdct4_row_many_neon, // H_DCT highbd_fidentity4_row_many_neon, // V_ADST highbd_fadst4_row_many_neon, // H_ADST highbd_fidentity4_row_many_neon, // V_FLIPADST highbd_fadst4_row_many_neon // H_FLIPADST }; #endif static const fwd_transform_1d_row_many_neon row_rect_highbd_txfm4_xn_arr[TX_TYPES] = { highbd_fdct4_row_rect_many_neon, // DCT_DCT highbd_fdct4_row_rect_many_neon, // ADST_DCT highbd_fadst4_row_rect_many_neon, // DCT_ADST highbd_fadst4_row_rect_many_neon, // ADST_ADST highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST highbd_fidentity4_row_rect_many_neon, // IDTX highbd_fidentity4_row_rect_many_neon, // V_DCT highbd_fdct4_row_rect_many_neon, // H_DCT highbd_fidentity4_row_rect_many_neon, // V_ADST highbd_fadst4_row_rect_many_neon, // H_ADST highbd_fidentity4_row_rect_many_neon, // V_FLIPADST highbd_fadst4_row_rect_many_neon // H_FLIPADST }; static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output, int cos_bit) { const int32_t *const cospi = cospi_arr_s32(cos_bit); const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); // Workspaces for intermediate transform steps. int32x4_t buf0[32]; int32x4_t buf1[32]; // stage 1 butterfly_dct_pre(input, buf1, 32); // stage 2 butterfly_dct_pre(buf1, buf0, 16); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20], v_cos_bit); butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21], v_cos_bit); butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22], v_cos_bit); butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23], v_cos_bit); buf0[28] = buf1[28]; buf0[29] = buf1[29]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 3 butterfly_dct_pre(buf0, buf1, 8); buf1[8] = buf0[8]; buf1[9] = buf0[9]; butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10], v_cos_bit); butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11], v_cos_bit); buf1[14] = buf0[14]; buf1[15] = buf0[15]; butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16); // stage 4 butterfly_dct_pre(buf1, buf0, 4); buf0[4] = buf1[4]; butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5], v_cos_bit); buf0[7] = buf1[7]; butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8); buf0[16] = buf1[16]; buf0[17] = buf1[17]; butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18], v_cos_bit); butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19], v_cos_bit); butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27], v_cos_bit); butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26], v_cos_bit); buf0[22] = buf1[22]; buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 5 butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1], v_cos_bit); butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3], v_cos_bit); butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4); buf1[8] = buf0[8]; butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9], v_cos_bit); butterfly_2312_neon(cospi, 16, buf0[13], 
buf0[10], &buf1[10], &buf1[13], v_cos_bit); buf1[11] = buf0[11]; buf1[12] = buf0[12]; buf1[15] = buf0[15]; butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8); butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8); // stage 6 buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7], v_cos_bit); butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17], v_cos_bit); butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29], v_cos_bit); butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4); butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4); buf0[16] = buf1[16]; buf0[19] = buf1[19]; buf0[20] = buf1[20]; butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6], v_cos_bit); butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21], v_cos_bit); butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22], v_cos_bit); buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[27] = buf1[27]; buf0[28] = buf1[28]; buf0[31] = buf1[31]; // stage 7 buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; buf1[4] = buf0[4]; buf1[5] = buf0[5]; buf1[6] = buf0[6]; buf1[7] = buf0[7]; butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15], v_cos_bit); butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14], v_cos_bit); butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13], v_cos_bit); butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12], v_cos_bit); butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4); butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4); butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4); butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4); // stage 8 buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31], v_cos_bit); butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30], v_cos_bit); butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29], v_cos_bit); butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28], v_cos_bit); butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27], v_cos_bit); butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26], v_cos_bit); butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25], v_cos_bit); butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24], v_cos_bit); // stage 9 output[0] = buf0[0]; output[1] = buf0[16]; output[2] = buf0[8]; output[3] = buf0[24]; output[4] = buf0[4]; output[5] = buf0[20]; output[6] = buf0[12]; output[7] = buf0[28]; output[8] = buf0[2]; output[9] = buf0[18]; output[10] = buf0[10]; output[11] = buf0[26]; output[12] = buf0[6]; output[13] = buf0[22]; output[14] = buf0[14]; output[15] = buf0[30]; output[16] = buf0[1]; output[17] = buf0[17]; output[18] = buf0[9]; output[19] = buf0[25]; output[20] = buf0[5]; output[21] = buf0[21]; output[22] = buf0[13]; output[23] = buf0[29]; output[24] = buf0[3]; output[25] = buf0[19]; output[26] = buf0[11]; output[27] = buf0[27]; output[28] = buf0[7]; output[29] = buf0[23]; output[30] = buf0[15]; output[31] = buf0[31]; } static void 
highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output, int8_t cos_bit) { const int32_t *const cospi = cospi_arr_s32(cos_bit); const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); // stage 1 int32x4_t x1[64]; butterfly_dct_pre(input, x1, 64); // stage 2 int32x4_t x2[64]; butterfly_dct_pre(x1, x2, 32); x2[32] = x1[32]; x2[33] = x1[33]; x2[34] = x1[34]; x2[35] = x1[35]; x2[36] = x1[36]; x2[37] = x1[37]; x2[38] = x1[38]; x2[39] = x1[39]; butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit); butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit); x2[56] = x1[56]; x2[57] = x1[57]; x2[58] = x1[58]; x2[59] = x1[59]; x2[60] = x1[60]; x2[61] = x1[61]; x2[62] = x1[62]; x2[63] = x1[63]; // stage 3 int32x4_t x3[64]; butterfly_dct_pre(x2, x3, 16); x3[16] = x2[16]; x3[17] = x2[17]; x3[18] = x2[18]; x3[19] = x2[19]; butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit); butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit); butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit); butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit); x3[28] = x2[28]; x3[29] = x2[29]; x3[30] = x2[30]; x3[31] = x2[31]; butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32); // stage 4 int32x4_t x4[64]; butterfly_dct_pre(x3, x4, 8); x4[8] = x3[8]; x4[9] = x3[9]; butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit); butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit); x4[14] = x3[14]; x4[15] = x3[15]; butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16); x4[32] = x3[32]; x4[33] = x3[33]; x4[34] = x3[34]; x4[35] = x3[35]; butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit); butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit); butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit); butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit); butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit); butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit); butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit); butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit); x4[44] = x3[44]; x4[45] = x3[45]; x4[46] = x3[46]; x4[47] = x3[47]; x4[48] = x3[48]; x4[49] = x3[49]; x4[50] = x3[50]; x4[51] = x3[51]; x4[60] = x3[60]; x4[61] = x3[61]; x4[62] = x3[62]; x4[63] = x3[63]; // stage 5 int32x4_t x5[64]; butterfly_dct_pre(x4, x5, 4); x5[4] = x4[4]; butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit); x5[7] = x4[7]; butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8); x5[16] = x4[16]; x5[17] = x4[17]; butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit); butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit); butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit); butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit); x5[22] = x4[22]; x5[23] = x4[23]; 
x5[24] = x4[24]; x5[25] = x4[25]; x5[30] = x4[30]; x5[31] = x4[31]; butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16); butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16); // stage 6 int32x4_t x6[64]; butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit); butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit); butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4); x6[8] = x5[8]; butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit); butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit); x6[11] = x5[11]; x6[12] = x5[12]; x6[15] = x5[15]; butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8); butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8); x6[32] = x5[32]; x6[33] = x5[33]; butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit); butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit); butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit); butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit); x6[38] = x5[38]; x6[39] = x5[39]; x6[40] = x5[40]; x6[41] = x5[41]; butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit); butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit); butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit); butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit); x6[46] = x5[46]; x6[47] = x5[47]; x6[48] = x5[48]; x6[49] = x5[49]; x6[54] = x5[54]; x6[55] = x5[55]; x6[56] = x5[56]; x6[57] = x5[57]; x6[62] = x5[62]; x6[63] = x5[63]; // stage 7 int32x4_t x7[64]; x7[0] = x6[0]; x7[1] = x6[1]; x7[2] = x6[2]; x7[3] = x6[3]; butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit); butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit); butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4); butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4); x7[16] = x6[16]; butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit); butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit); x7[19] = x6[19]; x7[20] = x6[20]; butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit); butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit); x7[23] = x6[23]; x7[24] = x6[24]; x7[27] = x6[27]; x7[28] = x6[28]; x7[31] = x6[31]; butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8); butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8); butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8); butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8); // stage 8 int32x4_t x8[64]; x8[0] = x7[0]; x8[1] = x7[1]; x8[2] = x7[2]; x8[3] = x7[3]; x8[4] = x7[4]; x8[5] = x7[5]; x8[6] = x7[6]; x8[7] = x7[7]; butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit); butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit); butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit); butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit); butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4); butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4); butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4); butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4); x8[32] = x7[32]; butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit); butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit); x8[35] = x7[35]; x8[36] = x7[36]; butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit); butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], 
&x8[38], v_cos_bit); x8[39] = x7[39]; x8[40] = x7[40]; butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit); butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit); x8[43] = x7[43]; x8[44] = x7[44]; butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit); butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit); x8[47] = x7[47]; x8[48] = x7[48]; x8[51] = x7[51]; x8[52] = x7[52]; x8[55] = x7[55]; x8[56] = x7[56]; x8[59] = x7[59]; x8[60] = x7[60]; x8[63] = x7[63]; // stage 9 int32x4_t x9[64]; x9[0] = x8[0]; x9[1] = x8[1]; x9[2] = x8[2]; x9[3] = x8[3]; x9[4] = x8[4]; x9[5] = x8[5]; x9[6] = x8[6]; x9[7] = x8[7]; x9[8] = x8[8]; x9[9] = x8[9]; x9[10] = x8[10]; x9[11] = x8[11]; x9[12] = x8[12]; x9[13] = x8[13]; x9[14] = x8[14]; x9[15] = x8[15]; butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit); butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit); butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit); butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit); butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit); butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit); butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit); butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit); butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4); butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4); butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4); butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4); butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4); butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4); butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4); butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4); // stage 10 int32x4_t x10[64]; x10[0] = x9[0]; x10[1] = x9[1]; x10[2] = x9[2]; x10[3] = x9[3]; x10[4] = x9[4]; x10[5] = x9[5]; x10[6] = x9[6]; x10[7] = x9[7]; x10[8] = x9[8]; x10[9] = x9[9]; x10[10] = x9[10]; x10[11] = x9[11]; x10[12] = x9[12]; x10[13] = x9[13]; x10[14] = x9[14]; x10[15] = x9[15]; x10[16] = x9[16]; x10[17] = x9[17]; x10[18] = x9[18]; x10[19] = x9[19]; x10[20] = x9[20]; x10[21] = x9[21]; x10[22] = x9[22]; x10[23] = x9[23]; x10[24] = x9[24]; x10[25] = x9[25]; x10[26] = x9[26]; x10[27] = x9[27]; x10[28] = x9[28]; x10[29] = x9[29]; x10[30] = x9[30]; x10[31] = x9[31]; butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit); butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit); butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit); butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit); butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit); butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit); butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit); butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit); butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit); butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit); butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit); butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit); butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit); butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit); 
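// The final stage below (stage 11) only reorders x10[]: the 64 outputs are
// written in bit-reversed index order, i.e. conceptually
//   for (int k = 0; k < 64; k++) output[k] = x10[bitrev6(k)];
// where bitrev6() reverses the low 6 bits of k (0 -> 0, 1 -> 32, 2 -> 16,
// 3 -> 48, ...), matching the unrolled assignments.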
butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit); butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit); // stage 11 output[0] = x10[0]; output[1] = x10[32]; output[2] = x10[16]; output[3] = x10[48]; output[4] = x10[8]; output[5] = x10[40]; output[6] = x10[24]; output[7] = x10[56]; output[8] = x10[4]; output[9] = x10[36]; output[10] = x10[20]; output[11] = x10[52]; output[12] = x10[12]; output[13] = x10[44]; output[14] = x10[28]; output[15] = x10[60]; output[16] = x10[2]; output[17] = x10[34]; output[18] = x10[18]; output[19] = x10[50]; output[20] = x10[10]; output[21] = x10[42]; output[22] = x10[26]; output[23] = x10[58]; output[24] = x10[6]; output[25] = x10[38]; output[26] = x10[22]; output[27] = x10[54]; output[28] = x10[14]; output[29] = x10[46]; output[30] = x10[30]; output[31] = x10[62]; output[32] = x10[1]; output[33] = x10[33]; output[34] = x10[17]; output[35] = x10[49]; output[36] = x10[9]; output[37] = x10[41]; output[38] = x10[25]; output[39] = x10[57]; output[40] = x10[5]; output[41] = x10[37]; output[42] = x10[21]; output[43] = x10[53]; output[44] = x10[13]; output[45] = x10[45]; output[46] = x10[29]; output[47] = x10[61]; output[48] = x10[3]; output[49] = x10[35]; output[50] = x10[19]; output[51] = x10[51]; output[52] = x10[11]; output[53] = x10[43]; output[54] = x10[27]; output[55] = x10[59]; output[56] = x10[7]; output[57] = x10[39]; output[58] = x10[23]; output[59] = x10[55]; output[60] = x10[15]; output[61] = x10[47]; output[62] = x10[31]; output[63] = x10[63]; } static void highbd_fidentity32_x4_neon(const int32x4_t *input, int32x4_t *output, int cos_bit) { (void)cos_bit; for (int i = 0; i < 32; i++) { output[i] = vshlq_n_s32(input[i], 2); } } TRANSFORM_COL_MANY(fdct32, 32) TRANSFORM_COL_MANY(fidentity32, 32) static const fwd_transform_1d_col_many_neon col_highbd_txfm32_x4_arr[TX_TYPES] = { highbd_fdct32_col_many_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST highbd_fidentity32_col_many_neon, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; TRANSFORM_ROW_MANY(fdct32, 32) TRANSFORM_ROW_MANY(fidentity32, 32) static const fwd_transform_1d_row_many_neon row_highbd_txfm32_x4_arr[TX_TYPES] = { highbd_fdct32_row_many_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST highbd_fidentity32_row_many_neon, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; TRANSFORM_ROW_RECT_MANY(fdct32, 32) TRANSFORM_ROW_RECT_MANY(fidentity32, 32) static const fwd_transform_1d_row_many_neon row_rect_highbd_txfm32_x4_arr[TX_TYPES] = { highbd_fdct32_row_rect_many_neon, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST highbd_fidentity32_row_rect_many_neon, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm8_xn_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = 
row_rect_highbd_txfm16_xn_arr[tx_type]; int bit = av1_fwd_cos_bit_col[2][1]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); // Column-wise transform. int32x4_t buf0[32]; if (lr_flip) { col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4, /*hm_stride=*/-8); } else { col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4, /*hm_stride=*/8); } shift_right_2_round_s32_x4(buf0, buf0, 32); int32x4_t buf1[32]; transpose_arrays_s32_16x8(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8); } void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm16_xn_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_rect_highbd_txfm8_xn_arr[tx_type]; int bit = av1_fwd_cos_bit_col[1][2]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); // Column-wise transform. int32x4_t buf0[32]; if (lr_flip) { col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2, /*hm_stride=*/-16); } else { col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2, /*hm_stride=*/16); } shift_right_2_round_s32_x4(buf0, buf0, 32); int32x4_t buf1[32]; transpose_arrays_s32_8x16(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16); } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; int bitcol = av1_fwd_cos_bit_col[0][2]; int bitrow = av1_fwd_cos_bit_row[0][2]; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm16_xn_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_highbd_txfm4_xn_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); // Column-wise transform. int32x4_t buf0[16]; if (lr_flip) { col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1, /*hm_stride=*/0); } else { col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1, /*hm_stride=*/0); } shift_right_1_round_s32_x4(buf0, buf0, 16); int32x4_t buf1[16]; transpose_arrays_s32_4x16(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16); } #endif void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; int bitcol = av1_fwd_cos_bit_col[2][0]; int bitrow = av1_fwd_cos_bit_row[2][0]; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm4_xn_arr[tx_type]; const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); // Column-wise transform. int32x4_t buf0[16]; if (lr_flip) { col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4, /*hm_stride=*/-4); } else { col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, /*hm_stride=*/4); } shift_right_1_round_s32_x4(buf0, buf0, 16); transpose_arrays_s32_4x16(buf0, buf0); // Row-wise transform. 
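// The call below completes the pattern shared by all of the
// av1_fwd_txfm2d_*_neon kernels in this file:
//   column transform -> rounding shift -> transpose -> row transform -> coeff
// The transpose lets the second pass also operate on four columns per
// int32x4_t register, just like the first pass.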
row_txfm(buf0, coeff, bitrow, /*stride=*/4); } void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm32_x4_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_rect_highbd_txfm16_xn_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[2][3]; int bitrow = av1_fwd_cos_bit_row[2][3]; // Column-wise transform. int32x4_t buf0[128]; col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, /*hm_stride=*/32); shift_right_4_round_s32_x4(buf0, buf0, 128); int32x4_t buf1[128]; transpose_arrays_s32_16x32(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32); } void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; int bitcol = av1_fwd_cos_bit_col[3][4]; int bitrow = av1_fwd_cos_bit_row[3][4]; // Column-wise transform. int32x4_t buf0[512]; load_buffer_32x64(input, buf0, stride, 0); for (int i = 0; i < 8; i++) { highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); } shift_right_2_round_s32_x4(buf0, buf0, 512); int32x4_t buf1[512]; transpose_arrays_s32_32x64(buf0, buf1); // Row-wise transform. for (int i = 0; i < 16; i++) { highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow); } round_shift2_rect_array_s32_neon(buf1, buf1, 512); store_buffer_32x32(buf1, coeff, /*stride=*/32); } void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; int bitcol = av1_fwd_cos_bit_col[4][3]; int bitrow = av1_fwd_cos_bit_row[4][3]; // Column-wise transform. int32x4_t buf0[512]; load_buffer_64x32(input, buf0, stride, 0); for (int i = 0; i < 16; i++) { highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol); } shift_right_4_round_s32_x4(buf0, buf0, 512); int32x4_t buf1[512]; transpose_arrays_s32_64x32(buf0, buf1); // Row-wise transform. for (int i = 0; i < 8; i++) { highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); } round_shift2_rect_array_s32_neon(buf1, buf1, 512); store_buffer_64x32(buf1, coeff, /*stride=*/32); } void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm16_xn_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_rect_highbd_txfm32_x4_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[3][2]; int bitrow = av1_fwd_cos_bit_row[3][2]; // Column-wise transform. int32x4_t buf0[128]; col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, /*hm_stride=*/16); shift_right_4_round_s32_x4(buf0, buf0, 128); int32x4_t buf1[128]; transpose_arrays_s32_32x16(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16); } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm32_x4_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_highbd_txfm8_xn_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[1][3]; int bitrow = av1_fwd_cos_bit_row[1][3]; // Column-wise transform. 
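// In the col_txfm/row_txfm "many" helpers used below, the transform appears
// to be applied to `howmany` batches of four columns at a time: each batch
// reads four adjacent input columns and writes its results `hm_stride`
// int32x4_t vectors further into the output buffer (a negative hm_stride,
// combined with an offset start pointer, is used on the lr_flip paths of the
// other kernels). Conceptually something like:
//   for (int i = 0; i < howmany; i++)
//     transform_4_cols(input + 4 * i, buf + i * hm_stride);
// (illustrative sketch only; transform_4_cols is not a real helper here.)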
int32x4_t buf0[64]; col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, /*hm_stride=*/32); shift_right_2_round_s32_x4(buf0, buf0, 64); int32x4_t buf1[64]; transpose_arrays_s32_8x32(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32); } void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm8_xn_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_highbd_txfm32_x4_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[3][1]; int bitrow = av1_fwd_cos_bit_row[3][1]; // Column-wise transform. int32x4_t buf0[64]; col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, /*hm_stride=*/8); shift_right_2_round_s32_x4(buf0, buf0, 64); int32x4_t buf1[64]; transpose_arrays_s32_32x8(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8); } #endif void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; int bitcol = av1_fwd_cos_bit_col[0][1]; int bitrow = av1_fwd_cos_bit_row[0][1]; const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_rect_highbd_txfm4_xn_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); // Column-wise transform. int32x4_t buf0[8]; col_txfm(input, buf0, stride, bitcol, lr_flip); shift_right_1_round_s32_x4(buf0, buf0, 8); int32x4_t buf1[8]; transpose_arrays_s32_4x8(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8); } void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const int bitcol = av1_fwd_cos_bit_col[1][0]; const int bitrow = av1_fwd_cos_bit_row[1][0]; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm4_xn_arr[tx_type]; const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); // Column-wise transform. int32x4_t buf0[8]; if (lr_flip) { col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2, /*hm_stride=*/-4); } else { col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, /*hm_stride=*/4); } shift_right_1_round_s32_x4(buf0, buf0, 8); int32x4_t buf1[8]; transpose_arrays_s32_8x4(buf0, buf1); // Row-wise transform. row_txfm(buf1, coeff, bitrow, /*stride=*/4); } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const int bitcol = av1_fwd_cos_bit_col[2][4]; const int bitrow = av1_fwd_cos_bit_row[2][4]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 64); // Column-wise transform. int32x4_t buf0[256]; load_buffer_16x64(input, buf0, stride, lr_flip); for (int i = 0; i < 4; i++) { highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); } shift_right_2_round_s32_x4(buf0, buf0, 256); int32x4_t buf1[256]; transpose_arrays_s32_16x64(buf0, buf1); // Row-wise transform. 
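// Note: only the 32 lowest-frequency outputs of the 64-point column
// transform are row-transformed and stored below (store_buffer_16x32).
// AV1 codes at most 32 coefficients along a 64-sample transform dimension,
// so the higher-frequency half is dropped.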
highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8); store_buffer_16x32(buf1, coeff, /*stride=*/32); } void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)bd; const int bitcol = av1_fwd_cos_bit_col[4][2]; const int bitrow = av1_fwd_cos_bit_row[4][2]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); // Column-wise transform. int32x4_t buf0[256]; load_buffer_64x16(input, buf0, stride, lr_flip); highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16); shift_right_4_round_s32_x4(buf0, buf0, 256); int32x4_t buf1[256]; transpose_arrays_s32_64x16(buf0, buf1); // Row-wise transform. for (int i = 0; i < 4; i++) { highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); } store_buffer_64x16(buf1, coeff, /*stride=*/16); memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff)); } #endif void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const fwd_transform_1d_col_many_neon col_txfm = col_highbd_txfm32_x4_arr[tx_type]; const fwd_transform_1d_row_many_neon row_txfm = row_highbd_txfm32_x4_arr[tx_type]; // Column-wise transform. int32x4_t buf0[256]; col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8, /*hm_stride=*/32); shift_right_4_round_s32_x4(buf0, buf0, 256); int32x4_t buf1[256]; transpose_arrays_s32_32x32(buf0, buf1); // Row-wise transform. row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32, /*stride=*/32); } void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; // Column-wise transform. int32x4_t buf0[1024]; load_buffer_64x64(input, buf0, stride, 0); for (int col = 0; col < 16; col++) { highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13); } shift_right_2_round_s32_x4(buf0, buf0, 1024); int32x4_t buf1[1024]; transpose_arrays_s32_64x64(buf0, buf1); // Row-wise transform. for (int col = 0; col < 8; col++) { highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10); } shift_right_2_round_s32_x4(buf1, buf1, 512); store_buffer_64x32(buf1, output, /*stride=*/32); } aom-3.12.1/av1/encoder/arm/highbd_pickrst_neon.c000066400000000000000000002316061477627663500214450ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/pickrst.h" static inline void highbd_calc_proj_params_r0_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64x2_t h00_lo = vdupq_n_s64(0); int64x2_t h00_hi = vdupq_n_s64(0); int64x2_t h11_lo = vdupq_n_s64(0); int64x2_t h11_hi = vdupq_n_s64(0); int64x2_t h01_lo = vdupq_n_s64(0); int64x2_t h01_hi = vdupq_n_s64(0); int64x2_t c0_lo = vdupq_n_s64(0); int64x2_t c0_hi = vdupq_n_s64(0); int64x2_t c1_lo = vdupq_n_s64(0); int64x2_t c1_hi = vdupq_n_s64(0); do { const uint16_t *src_ptr = src; const uint16_t *dat_ptr = dat; int32_t *flt0_ptr = flt0; int32_t *flt1_ptr = flt1; int w = width; do { uint16x8_t s = vld1q_u16(src_ptr); uint16x8_t d = vld1q_u16(dat_ptr); int32x4_t f0_lo = vld1q_s32(flt0_ptr); int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); int32x4_t f1_lo = vld1q_s32(flt1_ptr); int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); int32x4_t u_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); int32x4_t u_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); int32x4_t s_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); int32x4_t s_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); s_lo = vsubq_s32(s_lo, u_lo); s_hi = vsubq_s32(s_hi, u_hi); f0_lo = vsubq_s32(f0_lo, u_lo); f0_hi = vsubq_s32(f0_hi, u_hi); f1_lo = vsubq_s32(f1_lo, u_lo); f1_hi = vsubq_s32(f1_hi, u_hi); h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt0_ptr += 8; flt1_ptr += 8; w -= 8; } while (w != 0); src += src_stride; dat += dat_stride; flt0 += flt0_stride; flt1 += flt1_stride; } while (--height != 0); H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; H[0][1] = 
horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; H[1][0] = H[0][1]; C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; } static inline void highbd_calc_proj_params_r0_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64x2_t h00_lo = vdupq_n_s64(0); int64x2_t h00_hi = vdupq_n_s64(0); int64x2_t c0_lo = vdupq_n_s64(0); int64x2_t c0_hi = vdupq_n_s64(0); do { const uint16_t *src_ptr = src; const uint16_t *dat_ptr = dat; int32_t *flt0_ptr = flt0; int w = width; do { uint16x8_t s = vld1q_u16(src_ptr); uint16x8_t d = vld1q_u16(dat_ptr); int32x4_t f0_lo = vld1q_s32(flt0_ptr); int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); int32x4_t u_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); int32x4_t u_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); int32x4_t s_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); int32x4_t s_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); s_lo = vsubq_s32(s_lo, u_lo); s_hi = vsubq_s32(s_hi, u_hi); f0_lo = vsubq_s32(f0_lo, u_lo); f0_hi = vsubq_s32(f0_hi, u_hi); h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt0_ptr += 8; w -= 8; } while (w != 0); src += src_stride; dat += dat_stride; flt0 += flt0_stride; } while (--height != 0); H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; } static inline void highbd_calc_proj_params_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64x2_t h11_lo = vdupq_n_s64(0); int64x2_t h11_hi = vdupq_n_s64(0); int64x2_t c1_lo = vdupq_n_s64(0); int64x2_t c1_hi = vdupq_n_s64(0); do { const uint16_t *src_ptr = src; const uint16_t *dat_ptr = dat; int32_t *flt1_ptr = flt1; int w = width; do { uint16x8_t s = vld1q_u16(src_ptr); uint16x8_t d = vld1q_u16(dat_ptr); int32x4_t f1_lo = vld1q_s32(flt1_ptr); int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); int32x4_t u_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); int32x4_t u_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); int32x4_t s_lo = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); int32x4_t s_hi = vreinterpretq_s32_u32( vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); s_lo = vsubq_s32(s_lo, u_lo); s_hi = vsubq_s32(s_hi, u_hi); f1_lo = 
vsubq_s32(f1_lo, u_lo); f1_hi = vsubq_s32(f1_hi, u_hi); h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt1_ptr += 8; w -= 8; } while (w != 0); src += src_stride; dat += dat_stride; flt1 += flt1_stride; } while (--height != 0); H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; } // The function calls 3 subfunctions for the following cases : // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements // of C and H need to be computed. // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } static inline void hadd_update_4_stats_neon(const int64_t *const src, const int32x4_t *deltas, int64_t *const dst) { int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]); int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]); int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]); int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]); #if AOM_ARCH_AARCH64 int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64); int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64); int64x2_t src0 = vld1q_s64(src); int64x2_t src1 = vld1q_s64(src + 2); vst1q_s64(dst, vaddq_s64(src0, delta01)); vst1q_s64(dst + 2, vaddq_s64(src1, delta23)); #else dst[0] = src[0] + horizontal_add_s64x2(delta0_s64); dst[1] = src[1] + horizontal_add_s64x2(delta1_s64); dst[2] = src[2] + horizontal_add_s64x2(delta2_s64); dst[3] = src[3] + horizontal_add_s64x2(delta3_s64); #endif } static inline void compute_stats_win5_highbd_neon( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) { const int32_t wiener_win = WIENER_WIN_CHROMA; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t w16 = width & ~15; const int32_t h8 = height & ~7; int16x8_t mask[2]; mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); int32_t i, j, x, y; const int32_t num_bit_left = 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; const int32_t h_allowed = (1 << num_bit_left) / (w16 + ((w16 
!= width) ? 16 : 0)); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { src[0] = vld1q_s16(s_t + x + 0); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x + 0); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); x += 16; } if (w16 != width) { src[0] = vld1q_s16(s_t + w16 + 0); src[1] = vld1q_s16(s_t + w16 + 8); dgd[0] = vld1q_s16(d_t + w16 + 0); dgd[1] = vld1q_s16(d_t + w16 + 8); src[0] = vandq_s16(src[0], mask[0]); src[1] = vandq_s16(src[1], mask[1]); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); } s_t += s_stride; d_t += d_stride; } while (--y); sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]); int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]); vst1q_s64(&M[wiener_win * j + 0], sum_m0); vst1q_s64(&M[wiener_win * j + 2], sum_m2); M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]); int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); vst1q_s64(&H[wiener_win * j + 0], sum_h0); vst1q_s64(&H[wiener_win * j + 2], sum_h2); H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]); #else M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. j = 1; do { const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? 
(height - height_t) : h_allowed; int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); x += 16; } if (w16 != width) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); } d_t += d_stride; } while (--y); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1)); #else H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. { const int16_t *d_t = d; if (height % 2) { int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]); load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]); d_t += 4 * d_stride; step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0), H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } else { int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN_CHROMA * 2]; ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width); ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width); ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width); ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width); step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas); transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2], &deltas[3]); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas[0], vgetq_lane_s32(deltas[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas[1], vgetq_lane_s32(deltas[4], 1), H + 2 * wiener_win * wiener_win2 + 2 * 
wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas[2], vgetq_lane_s32(deltas[4], 2), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas[3], vgetq_lane_s32(deltas[4], 3), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. { y = h8; int16x4_t d_s[12]; int16x4_t d_e[12]; const int16_t *d_t = d; int16x4_t zeros = vdup_n_s16(0); load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]); load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]); int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; while (y >= 8) { load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = s_tr[0]; start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1); start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2); start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3); start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4); start_col1[0] = s_tr[1]; start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1); start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2); start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3); start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4); start_col2[0] = s_tr[2]; start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1); start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2); start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3); start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4); start_col3[0] = s_tr[3]; start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1); start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2); start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3); start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4); // i = 1, j = 2; sub_deltas_step4(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4(start_col0, start_col3, deltas[2]); // i = 2, j =3 sub_deltas_step4(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = e_tr[0]; end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1); end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2); end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3); end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4); end_col1[0] = e_tr[1]; end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1); end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2); end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3); end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4); end_col2[0] = e_tr[2]; end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1); end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2); end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3); end_col2[4] = 
vextq_s16(e_tr[2], e_tr[6], 4); end_col3[0] = e_tr[3]; end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1); end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2); end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3); end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4); // i = 1, j = 2; add_deltas_step4(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4(end_col0, end_col2, deltas[1]); // i = 1, j = 4 add_deltas_step4(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4(end_col2, end_col3, deltas[5]); d_s[0] = d_s[8]; d_s[1] = d_s[9]; d_s[2] = d_s[10]; d_s[3] = d_s[11]; d_e[0] = d_e[8]; d_e[1] = d_e[9]; d_e[2] = d_e[10]; d_e[3] = d_e[11]; d_t += 8 * d_stride; y -= 8; } if (h8 != height) { const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8)); load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = vandq_s16(s_tr[0], mask_h); start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h); start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h); start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h); start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h); start_col1[0] = vandq_s16(s_tr[1], mask_h); start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h); start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h); start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h); start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h); start_col2[0] = vandq_s16(s_tr[2], mask_h); start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h); start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h); start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h); start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h); start_col3[0] = vandq_s16(s_tr[3], mask_h); start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h); start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h); start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h); start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h); // i = 1, j = 2; sub_deltas_step4(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4(start_col0, start_col3, deltas[2]); // i = 2, j = 3 sub_deltas_step4(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = vandq_s16(e_tr[0], mask_h); end_col0[1] = vandq_s16(vextq_s16(e_tr[0], 
e_tr[4], 1), mask_h); end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h); end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h); end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h); end_col1[0] = vandq_s16(e_tr[1], mask_h); end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h); end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h); end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h); end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h); end_col2[0] = vandq_s16(e_tr[2], mask_h); end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h); end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h); end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h); end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h); end_col3[0] = vandq_s16(e_tr[3], mask_h); end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h); end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h); end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h); end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h); // i = 1, j = 2; add_deltas_step4(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4(end_col0, end_col2, deltas[1]); // i = 1, j = 4 add_deltas_step4(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4(end_col2, end_col3, deltas[5]); } int32x4_t delta[6][2]; int32_t single_delta[6]; delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]); delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]); delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]); delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]); delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]); delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]); delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]); delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]); delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]); delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]); delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]); delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]); single_delta[0] = horizontal_add_s32x4(deltas[0][4]); single_delta[1] = horizontal_add_s32x4(deltas[1][4]); single_delta[2] = horizontal_add_s32x4(deltas[2][4]); single_delta[3] = horizontal_add_s32x4(deltas[3][4]); single_delta[4] = horizontal_add_s32x4(deltas[4][4]); single_delta[5] = horizontal_add_s32x4(deltas[5][4]); int idx = 0; for (i = 1; i < wiener_win - 1; i++) { for (j = i + 1; j < wiener_win; j++) { update_4_stats_neon( H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win); H[i * wiener_win * wiener_win2 + j * wiener_win + 4] = H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] + single_delta[idx]; H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 2); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 
3); idx++; } } } // Step 5: Derive other points of each square. No square in bottom row. i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA]; x = 0; while (x < w16) { load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); x += 16; } if (w16 != width) { load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); } hadd_update_4_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. 
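// As with steps 3-5, these entries are not summed from scratch: H is the
// autocorrelation matrix of the wiener_win x wiener_win window of dgd
// samples (and M its cross-correlation with the source), and moving the
// window by one sample only changes the sums at the rows/columns entering
// and leaving the window. Steps 1-2 therefore compute full sums only for
// the top and left edges, and the remaining entries are derived from the
// up-left neighbour plus a small delta, e.g. in step 4:
//   H[(i, j)] = H[(i - 1, j - 1)] + delta
// (indices shown schematically; the actual array is flattened).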
i = 0; do { const int16_t *const di = d + i; int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; x = 0; while (x < w16) { load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win5_neon(d_is, d_ie, deltas); x += 16; } if (w16 != width) { load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); derive_triangle_win5_neon(d_is, d_ie, deltas); } // Row 1: 4 points hadd_update_4_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); // Row 2: 3 points int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]); int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]); #if AOM_ARCH_AARCH64 int64x2_t deltas45 = vpaddq_s64(delta4_s64, delta5_s64); int64x2_t src = vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); int64x2_t dst = vaddq_s64(src, deltas45); vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst); #else H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 0] + horizontal_add_s64x2(delta4_s64); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1 + 1] + horizontal_add_s64x2(delta5_s64); #endif // AOM_ARCH_AARCH64 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] + horizontal_long_add_s32x4(deltas[6]); // Row 3: 2 points int64x2_t delta7_s64 = vpaddlq_s32(deltas[7]); int64x2_t delta8_s64 = vpaddlq_s32(deltas[8]); #if AOM_ARCH_AARCH64 int64x2_t deltas78 = vpaddq_s64(delta7_s64, delta8_s64); vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3, vaddq_s64(dst, deltas78)); #else H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] = H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 0] + horizontal_add_s64x2(delta7_s64); H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] = H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2 + 1] + horizontal_add_s64x2(delta8_s64); #endif // AOM_ARCH_AARCH64 // Row 4: 1 point H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] + horizontal_long_add_s32x4(deltas[9]); } while (++i < wiener_win); } static inline void hadd_update_6_stats_neon(const int64_t *const src, const int32x4_t *deltas, int64_t *const dst) { int64x2_t delta0_s64 = vpaddlq_s32(deltas[0]); int64x2_t delta1_s64 = vpaddlq_s32(deltas[1]); int64x2_t delta2_s64 = vpaddlq_s32(deltas[2]); int64x2_t delta3_s64 = vpaddlq_s32(deltas[3]); int64x2_t delta4_s64 = vpaddlq_s32(deltas[4]); int64x2_t delta5_s64 = vpaddlq_s32(deltas[5]); #if AOM_ARCH_AARCH64 int64x2_t delta01 = vpaddq_s64(delta0_s64, delta1_s64); int64x2_t delta23 = vpaddq_s64(delta2_s64, delta3_s64); int64x2_t 
delta45 = vpaddq_s64(delta4_s64, delta5_s64); int64x2_t src0 = vld1q_s64(src); int64x2_t src1 = vld1q_s64(src + 2); int64x2_t src2 = vld1q_s64(src + 4); vst1q_s64(dst, vaddq_s64(src0, delta01)); vst1q_s64(dst + 2, vaddq_s64(src1, delta23)); vst1q_s64(dst + 4, vaddq_s64(src2, delta45)); #else dst[0] = src[0] + horizontal_add_s64x2(delta0_s64); dst[1] = src[1] + horizontal_add_s64x2(delta1_s64); dst[2] = src[2] + horizontal_add_s64x2(delta2_s64); dst[3] = src[3] + horizontal_add_s64x2(delta3_s64); dst[4] = src[4] + horizontal_add_s64x2(delta4_s64); dst[5] = src[5] + horizontal_add_s64x2(delta5_s64); #endif } static inline void compute_stats_win7_highbd_neon( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H, aom_bit_depth_t bit_depth) { const int32_t wiener_win = WIENER_WIN; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t w16 = width & ~15; const int32_t h8 = height & ~7; int16x8_t mask[2]; mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); int32_t i, j, x, y; const int32_t num_bit_left = 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; const int32_t h_allowed = (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0)); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) }; int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { src[0] = vld1q_s16(s_t + x); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); x += 16; } if (w16 != width) { src[0] = vld1q_s16(s_t + w16); src[1] = vld1q_s16(s_t + w16 + 8); dgd[0] = vld1q_s16(d_t + w16); dgd[1] = vld1q_s16(d_t + w16 + 8); src[0] = vandq_s16(src[0], mask[0]); src[1] = vandq_s16(src[1], mask[1]); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); } s_t += s_stride; d_t += d_stride; } while (--y); sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]); sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1])); vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3])); vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5])); M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]); vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1])); vst1q_s64(H + 
wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3])); vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5])); H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]); #else M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]); M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]); H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]); H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. j = 1; do { const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); x += 16; } if (w16 != width) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); } d_t += d_stride; } while (--y); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2)); vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4)); vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4)); #else H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]); H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. { const int16_t *d_t = d; // Pad to call transpose function. 
int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8], &ds[10]); load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9], &ds[11]); d_t += 6 * d_stride; step3_win7_neon(d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], deltas_tr[4], H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], deltas_tr[5], H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], deltas_tr[6], H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], deltas_tr[7], H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win, deltas_tr[8], deltas_tr[12], H + 5 * wiener_win * wiener_win2 + 5 * wiener_win); update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win, deltas_tr[9], deltas_tr[13], H + 6 * wiener_win * wiener_win2 + 6 * wiener_win); } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. i = 1; do { j = i + 1; do { const int16_t *di = d + i - 1; const int16_t *dj = d + j - 1; int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) }; int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2]; dd[5] = vdupq_n_s16(0); // Initialize to avoid warning. const int16_t dd0_values[] = { di[0 * d_stride], di[1 * d_stride], di[2 * d_stride], di[3 * d_stride], di[4 * d_stride], di[5 * d_stride], 0, 0 }; dd[0] = vld1q_s16(dd0_values); const int16_t dd1_values[] = { di[0 * d_stride + width], di[1 * d_stride + width], di[2 * d_stride + width], di[3 * d_stride + width], di[4 * d_stride + width], di[5 * d_stride + width], 0, 0 }; dd[1] = vld1q_s16(dd1_values); const int16_t ds0_values[] = { dj[0 * d_stride], dj[1 * d_stride], dj[2 * d_stride], dj[3 * d_stride], dj[4 * d_stride], dj[5 * d_stride], 0, 0 }; ds[0] = vld1q_s16(ds0_values); int16_t ds1_values[] = { dj[0 * d_stride + width], dj[1 * d_stride + width], dj[2 * d_stride + width], dj[3 * d_stride + width], dj[4 * d_stride + width], dj[5 * d_stride + width], 0, 0 }; ds[1] = vld1q_s16(ds1_values); y = 0; while (y < h8) { // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6); dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7); dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6); dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7); // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6); ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6); ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7); load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]); load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]); load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]); load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]); load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]); load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]); 
load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]); load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]); load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]); load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]); load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]); load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]); madd_neon(&deltas[0], dd[0], ds[0]); madd_neon(&deltas[1], dd[1], ds[1]); madd_neon(&deltas[2], dd[0], ds[2]); madd_neon(&deltas[3], dd[1], ds[3]); madd_neon(&deltas[4], dd[0], ds[4]); madd_neon(&deltas[5], dd[1], ds[5]); madd_neon(&deltas[6], dd[0], ds[6]); madd_neon(&deltas[7], dd[1], ds[7]); madd_neon(&deltas[8], dd[0], ds[8]); madd_neon(&deltas[9], dd[1], ds[9]); madd_neon(&deltas[10], dd[0], ds[10]); madd_neon(&deltas[11], dd[1], ds[11]); madd_neon(&deltas[12], dd[0], ds[12]); madd_neon(&deltas[13], dd[1], ds[13]); madd_neon(&deltas[14], dd[2], ds[0]); madd_neon(&deltas[15], dd[3], ds[1]); madd_neon(&deltas[16], dd[4], ds[0]); madd_neon(&deltas[17], dd[5], ds[1]); madd_neon(&deltas[18], dd[6], ds[0]); madd_neon(&deltas[19], dd[7], ds[1]); madd_neon(&deltas[20], dd[8], ds[0]); madd_neon(&deltas[21], dd[9], ds[1]); madd_neon(&deltas[22], dd[10], ds[0]); madd_neon(&deltas[23], dd[11], ds[1]); madd_neon(&deltas[24], dd[12], ds[0]); madd_neon(&deltas[25], dd[13], ds[1]); dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2); dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2); ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2); ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2); di += 8 * d_stride; dj += 8 * d_stride; y += 8; } deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]); deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]); deltas[2] = hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]); deltas[3] = hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]); deltas[4] = hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]); deltas[5] = hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]); deltas[6] = hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]); deltas[7] = hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]); deltas[0] = vsubq_s32(deltas[1], deltas[0]); deltas[1] = vsubq_s32(deltas[3], deltas[2]); deltas[2] = vsubq_s32(deltas[5], deltas[4]); deltas[3] = vsubq_s32(deltas[7], deltas[6]); if (h8 != height) { const int16_t ds0_vals[] = { dj[0 * d_stride], dj[0 * d_stride + width], dj[1 * d_stride], dj[1 * d_stride + width], dj[2 * d_stride], dj[2 * d_stride + width], dj[3 * d_stride], dj[3 * d_stride + width] }; ds[0] = vld1q_s16(ds0_vals); ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0); ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1); ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2); ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3); const int16_t dd4_vals[] = { -di[1 * d_stride], di[1 * d_stride + width], -di[2 * d_stride], di[2 * d_stride + width], -di[3 * d_stride], di[3 * d_stride + width], -di[4 * d_stride], di[4 * d_stride + width] }; dd[4] = vld1q_s16(dd4_vals); dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0); dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1); do { dd[0] = vdupq_n_s16(-di[0 * d_stride]); dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]); dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0]; ds[4] = vdupq_n_s16(dj[0 * d_stride]); ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]); ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0]; dd[5] = vsetq_lane_s16(-di[6 * 
d_stride], dd[5], 2); dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3); ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5); madd_neon_pairwise(&deltas[0], dd[0], ds[0]); madd_neon_pairwise(&deltas[1], dd[1], ds[1]); madd_neon_pairwise(&deltas[2], dd[4], ds[4]); madd_neon_pairwise(&deltas[3], dd[5], ds[5]); int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0); ds[0] = vextq_s16(ds[0], ds[1], 2); ds[1] = vextq_s16(ds[1], ds[0], 2); ds[1] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3)); int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0); dd[4] = vextq_s16(dd[4], dd[5], 2); dd[5] = vextq_s16(dd[5], dd[4], 2); dd[5] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3)); di += d_stride; dj += d_stride; } while (++y < height); } // Writing one more element on the top edge of a square falls to // the next square in the same row or the first element in the next // row, which will just be overwritten later. update_8_stats_neon( H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, deltas[0], deltas[1], H + i * wiener_win * wiener_win2 + j * wiener_win); H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 2); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 3); H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[3], 0); H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[3], 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 5: Derive other points of each square. No square in bottom row. 
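// Scalar view of the Step 5 update performed below (an illustrative sketch
// for documentation only; `hsum` denotes a horizontal reduction and is not a
// real symbol in this file). Each interior point of the square block (i, j)
// is the point one step up-left plus a horizontally summed delta:
//   for (int row = 0; row < 6; row++)
//     for (int col = 0; col < 6; col++)
//       H[(i * wiener_win + row + 1) * wiener_win2 + j * wiener_win + col + 1] =
//           H[(i * wiener_win + row) * wiener_win2 + j * wiener_win + col] +
//           hsum(deltas[row][col]);
// The deltas themselves are built by derive_square_win7_neon() from the first
// (d_is/d_js) and last (d_ie/d_je) rows of the sliding column window.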
i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; int16x8_t d_is[WIN_7]; int16x8_t d_ie[WIN_7]; int16x8_t d_js[WIN_7]; int16x8_t d_je[WIN_7]; x = 0; while (x < w16) { load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); x += 16; } if (w16 != width) { load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_is[8] = vandq_s16(d_is[8], mask[0]); d_is[9] = vandq_s16(d_is[9], mask[1]); d_is[10] = vandq_s16(d_is[10], mask[0]); d_is[11] = vandq_s16(d_is[11], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); d_ie[8] = vandq_s16(d_ie[8], mask[0]); d_ie[9] = vandq_s16(d_ie[9], mask[1]); d_ie[10] = vandq_s16(d_ie[10], mask[0]); d_ie[11] = vandq_s16(d_ie[11], mask[1]); derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); } hadd_update_6_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4], H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5], H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. 
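// Step 6 mirrors the Step 5 recurrence for the diagonal (triangle) blocks,
// processing rows of 6, 5, 4, 3, 2 and finally 1 point. As a hedged scalar
// sketch of the last row only (`hsum` is shorthand, not a real symbol):
//   H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
//       H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
//       hsum(deltas[20]);
// i.e. the final corner of each 7x7 diagonal block is the previous diagonal
// entry plus the last accumulated triangle delta.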
i = 0; do { const int16_t *const di = d + i; int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) }; int16x8_t d_is[WIN_7], d_ie[WIN_7]; x = 0; while (x < w16) { load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win7_neon(d_is, d_ie, deltas); x += 16; } if (w16 != width) { load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_is[8] = vandq_s16(d_is[8], mask[0]); d_is[9] = vandq_s16(d_is[9], mask[1]); d_is[10] = vandq_s16(d_is[10], mask[0]); d_is[11] = vandq_s16(d_is[11], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); d_ie[8] = vandq_s16(d_ie[8], mask[0]); d_ie[9] = vandq_s16(d_ie[9], mask[1]); d_ie[10] = vandq_s16(d_ie[10], mask[0]); d_ie[11] = vandq_s16(d_ie[11], mask[1]); derive_triangle_win7_neon(d_is, d_ie, deltas); } // Row 1: 6 points hadd_update_6_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); // Row 2: 5 points hadd_update_4_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6, H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] + horizontal_long_add_s32x4(deltas[10]); // Row 3: 4 points hadd_update_4_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, deltas + 11, H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); // Row 4: 3 points #if AOM_ARCH_AARCH64 int64x2_t delta15_s64 = vpaddlq_s32(deltas[15]); int64x2_t delta16_s64 = vpaddlq_s32(deltas[16]); int64x2_t delta1516 = vpaddq_s64(delta15_s64, delta16_s64); int64x2_t h0 = vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4, vaddq_s64(h0, delta1516)); #else H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 0] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 0] + horizontal_long_add_s32x4(deltas[15]); H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3 + 1] + horizontal_long_add_s32x4(deltas[16]); #endif // AOM_ARCH_AARCH64 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] + horizontal_long_add_s32x4(deltas[17]); // Row 5: 2 points int64x2_t delta18_s64 = vpaddlq_s32(deltas[18]); int64x2_t delta19_s64 = vpaddlq_s32(deltas[19]); #if AOM_ARCH_AARCH64 int64x2_t delta1819 = vpaddq_s64(delta18_s64, delta19_s64); int64x2_t h1 = vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4); vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5, vaddq_s64(h1, delta1819)); #else H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] = H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] + horizontal_add_s64x2(delta18_s64); H[(i * wiener_win + 5) * 
wiener_win2 + i * wiener_win + 5 + 1] = H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4 + 1] + horizontal_add_s64x2(delta19_s64); #endif // AOM_ARCH_AARCH64 // Row 6: 1 points H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] + horizontal_long_add_s32x4(deltas[20]); } while (++i < wiener_win); } static inline void sub_avg_block_highbd_neon(const uint16_t *src, const int32_t src_stride, const uint16_t avg, const int32_t width, const int32_t height, int16_t *dst, const int32_t dst_stride) { const uint16x8_t a = vdupq_n_u16(avg); int32_t i = height + 1; do { int32_t j = 0; while (j < width) { const uint16x8_t s = vld1q_u16(src + j); const uint16x8_t d = vsubq_u16(s, a); vst1q_s16(dst + j, vreinterpretq_s16_u16(d)); j += 8; } src += src_stride; dst += dst_stride; } while (--i); } static inline uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, int width, int height) { assert(width > 0); assert(height > 0); uint64x2_t sum_u64 = vdupq_n_u64(0); uint64_t sum = 0; const uint16x8_t mask = vreinterpretq_u16_s16(vld1q_s16(&mask_16bit[16] - (width % 8))); int h = height; do { uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int w = width; const uint16_t *row = src; while (w >= 32) { uint16x8_t s0 = vld1q_u16(row + 0); uint16x8_t s1 = vld1q_u16(row + 8); uint16x8_t s2 = vld1q_u16(row + 16); uint16x8_t s3 = vld1q_u16(row + 24); s0 = vaddq_u16(s0, s1); s2 = vaddq_u16(s2, s3); sum_u32[0] = vpadalq_u16(sum_u32[0], s0); sum_u32[1] = vpadalq_u16(sum_u32[1], s2); row += 32; w -= 32; } if (w >= 16) { uint16x8_t s0 = vld1q_u16(row + 0); uint16x8_t s1 = vld1q_u16(row + 8); s0 = vaddq_u16(s0, s1); sum_u32[0] = vpadalq_u16(sum_u32[0], s0); row += 16; w -= 16; } if (w >= 8) { uint16x8_t s0 = vld1q_u16(row); sum_u32[1] = vpadalq_u16(sum_u32[1], s0); row += 8; w -= 8; } if (w) { uint16x8_t s0 = vandq_u16(vld1q_u16(row), mask); sum_u32[1] = vpadalq_u16(sum_u32[1], s0); row += 8; w -= 8; } sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1])); src += src_stride; } while (--h != 0); return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width)); } void av1_compute_stats_highbd_neon(int32_t wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int32_t h_start, int32_t h_end, int32_t v_start, int32_t v_end, int32_t dgd_stride, int32_t src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; const uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height); const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; const int32_t s_stride = (width + 15) & ~15; sub_avg_block_highbd_neon(src + v_start * src_stride + h_start, src_stride, avg, width, height, src_avg, s_stride); sub_avg_block_highbd_neon( dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, dgd_avg, d_stride); if (wiener_win == WIENER_WIN) { compute_stats_win7_highbd_neon(dgd_avg, d_stride, src_avg, s_stride, width, height, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { compute_stats_win5_highbd_neon(dgd_avg, 
d_stride, src_avg, s_stride, width, height, M, H, bit_depth); } // H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops. if (bit_depth == AOM_BITS_8) { diagonal_copy_stats_neon(wiener_win2, H); } else if (bit_depth == AOM_BITS_10) { // bit_depth == AOM_BITS_10 const int32_t k4 = wiener_win2 & ~3; int32_t k = 0; do { int64x2_t dst = div4_neon(vld1q_s64(M + k)); vst1q_s64(M + k, dst); dst = div4_neon(vld1q_s64(M + k + 2)); vst1q_s64(M + k + 2, dst); H[k * wiener_win2 + k] /= 4; k += 4; } while (k < k4); H[k * wiener_win2 + k] /= 4; for (; k < wiener_win2; ++k) { M[k] /= 4; } div4_diagonal_copy_stats_neon(wiener_win2, H); } else { // bit_depth == AOM_BITS_12 const int32_t k4 = wiener_win2 & ~3; int32_t k = 0; do { int64x2_t dst = div16_neon(vld1q_s64(M + k)); vst1q_s64(M + k, dst); dst = div16_neon(vld1q_s64(M + k + 2)); vst1q_s64(M + k + 2, dst); H[k * wiener_win2 + k] /= 16; k += 4; } while (k < k4); H[k * wiener_win2 + k] /= 16; for (; k < wiener_win2; ++k) { M[k] /= 16; } div16_diagonal_copy_stats_neon(wiener_win2, H); } } int64_t av1_highbd_pixel_proj_error_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64_t sse = 0; int64x2_t sse_s64 = vdupq_n_s64(0); if (params->r[0] > 0 && params->r[1] > 0) { int32x2_t xq_v = vld1_s32(xq); int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4); do { int j = 0; int32x4_t sse_s32 = vdupq_n_s32(0); do { const uint16x8_t d = vld1q_u16(&dat[j]); const uint16x8_t s = vld1q_u16(&src[j]); int32x4_t flt0_0 = vld1q_s32(&flt0[j]); int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); int32x4_t flt1_0 = vld1q_s32(&flt1[j]); int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); int32x4_t d_s32_lo = vreinterpretq_s32_u32( vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16( vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); int32x4_t v0 = vsubq_s32( vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), d_s32_lo); int32x4_t v1 = vsubq_s32( vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), d_s32_hi); v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0); v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0); v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(vsubq_u16(d, s))); int16x4_t e_lo = vget_low_s16(e); int16x4_t e_hi = vget_high_s16(e); sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); j += 8; } while (j <= width - 8); for (int k = j; k < width; ++k) { int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]); v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4); int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; sse += ((int64_t)e * e); } sse_s64 = vpadalq_s32(sse_s64, sse_s32); dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; } while (--height != 0); } else if (params->r[0] > 0 || params->r[1] > 0) { int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; int32_t *flt = (params->r[0] > 0) ? 
flt0 : flt1; int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; int32x4_t xq_v = vdupq_n_s32(xq_active); do { int j = 0; int32x4_t sse_s32 = vdupq_n_s32(0); do { const uint16x8_t d0 = vld1q_u16(&dat[j]); const uint16x8_t s0 = vld1q_u16(&src[j]); int32x4_t flt0_0 = vld1q_s32(&flt[j]); int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]); uint16x8_t d_u16 = vshlq_n_u16(d0, 4); int32x4_t sub0 = vreinterpretq_s32_u32( vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16))); int32x4_t sub1 = vreinterpretq_s32_u32( vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16))); int32x4_t v0 = vmlaq_s32( vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0, xq_v); int32x4_t v1 = vmlaq_s32( vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1, xq_v); int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(vsubq_u16(d0, s0))); int16x4_t e_lo = vget_low_s16(e); int16x4_t e_hi = vget_high_s16(e); sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); j += 8; } while (j <= width - 8); for (int k = j; k < width; ++k) { int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4)); const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; sse += ((int64_t)e * e); } sse_s64 = vpadalq_s32(sse_s64, sse_s32); dat += dat_stride; flt += flt_stride; src += src_stride; } while (--height != 0); } else { do { int j = 0; do { const uint16x8_t d = vld1q_u16(&dat[j]); const uint16x8_t s = vld1q_u16(&src[j]); uint16x8_t diff = vabdq_u16(d, s); uint16x4_t diff_lo = vget_low_u16(diff); uint16x4_t diff_hi = vget_high_u16(diff); uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo); uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi); sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo)); sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi)); j += 8; } while (j <= width - 8); for (int k = j; k < width; ++k) { int32_t e = dat[k] - src[k]; sse += e * e; } dat += dat_stride; src += src_stride; } while (--height != 0); } sse += horizontal_add_s64x2(sse_s64); return sse; } aom-3.12.1/av1/encoder/arm/highbd_pickrst_sve.c000066400000000000000000000133101477627663500212710ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/arm/pickrst_sve.h" #include "av1/encoder/pickrst.h" static inline uint16_t highbd_find_average_sve(const uint16_t *src, int src_stride, int width, int height) { uint64x2_t avg_u64 = vdupq_n_u64(0); uint16x8_t ones = vdupq_n_u16(1); // Use a predicate to compute the last columns. svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 
8 : width % 8); int h = height; do { int j = width; const uint16_t *src_ptr = src; while (j > 8) { uint16x8_t s = vld1q_u16(src_ptr); avg_u64 = aom_udotq_u16(avg_u64, s, ones); j -= 8; src_ptr += 8; } uint16x8_t s_end = svget_neonq_u16(svld1_u16(pattern, src_ptr)); avg_u64 = aom_udotq_u16(avg_u64, s_end, ones); src += src_stride; } while (--h != 0); return (uint16_t)(vaddvq_u64(avg_u64) / (width * height)); } static inline void sub_avg_block_highbd_sve(const uint16_t *buf, int buf_stride, int16_t avg, int width, int height, int16_t *buf_avg, int buf_avg_stride) { uint16x8_t avg_u16 = vdupq_n_u16(avg); // Use a predicate to compute the last columns. svbool_t pattern = svwhilelt_b16_u32(0, width % 8 == 0 ? 8 : width % 8); uint16x8_t avg_end = svget_neonq_u16(svdup_n_u16_z(pattern, avg)); do { int j = width; const uint16_t *buf_ptr = buf; int16_t *buf_avg_ptr = buf_avg; while (j > 8) { uint16x8_t d = vld1q_u16(buf_ptr); vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d, avg_u16))); j -= 8; buf_ptr += 8; buf_avg_ptr += 8; } uint16x8_t d_end = svget_neonq_u16(svld1_u16(pattern, buf_ptr)); vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubq_u16(d_end, avg_end))); buf += buf_stride; buf_avg += buf_avg_stride; } while (--height > 0); } void av1_compute_stats_highbd_sve(int32_t wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int32_t h_start, int32_t h_end, int32_t v_start, int32_t v_end, int32_t dgd_stride, int32_t src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; const int32_t s_stride = (width + 15) & ~15; const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; const uint16_t *src_start = src + h_start + v_start * src_stride; const uint16_t avg = highbd_find_average_sve(dgd_start, dgd_stride, width, height); sub_avg_block_highbd_sve(src_start, src_stride, avg, width, height, src_avg, s_stride); sub_avg_block_highbd_sve( dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, dgd_avg, d_stride); if (wiener_win == WIENER_WIN) { compute_stats_win7_sve(dgd_avg, d_stride, src_avg, s_stride, width, height, M, H); } else { assert(wiener_win == WIENER_WIN_CHROMA); compute_stats_win5_sve(dgd_avg, d_stride, src_avg, s_stride, width, height, M, H); } // H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops. 
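// Scalar equivalent of that diagonal copy (an illustrative sketch only, not
// build code; the div4/div16 variants used below for 10- and 12-bit input
// additionally apply the bit-depth scaling to each element):
//   for (int r = 0; r < wiener_win2; r++)
//     for (int c = r + 1; c < wiener_win2; c++)
//       H[c * wiener_win2 + r] = H[r * wiener_win2 + c];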
if (bit_depth == AOM_BITS_8) { diagonal_copy_stats_neon(wiener_win2, H); } else if (bit_depth == AOM_BITS_10) { // bit_depth == EB_TEN_BIT const int32_t k4 = wiener_win2 & ~3; int32_t k = 0; do { int64x2_t dst = div4_neon(vld1q_s64(M + k)); vst1q_s64(M + k, dst); dst = div4_neon(vld1q_s64(M + k + 2)); vst1q_s64(M + k + 2, dst); H[k * wiener_win2 + k] /= 4; k += 4; } while (k < k4); H[k * wiener_win2 + k] /= 4; for (; k < wiener_win2; ++k) { M[k] /= 4; } div4_diagonal_copy_stats_neon(wiener_win2, H); } else { // bit_depth == AOM_BITS_12 const int32_t k4 = wiener_win2 & ~3; int32_t k = 0; do { int64x2_t dst = div16_neon(vld1q_s64(M + k)); vst1q_s64(M + k, dst); dst = div16_neon(vld1q_s64(M + k + 2)); vst1q_s64(M + k + 2, dst); H[k * wiener_win2 + k] /= 16; k += 4; } while (k < k4); H[k * wiener_win2 + k] /= 16; for (; k < wiener_win2; ++k) { M[k] /= 16; } div16_diagonal_copy_stats_neon(wiener_win2, H); } } aom-3.12.1/av1/encoder/arm/highbd_rdopt_neon.c000066400000000000000000000033061477627663500211100ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/arm/sum_neon.h" int64_t av1_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { uint64x2_t err_u64 = vdupq_n_u64(0); int64x2_t ssz_s64 = vdupq_n_s64(0); const int shift = 2 * (bd - 8); const int rounding = (1 << shift) >> 1; assert(block_size >= 16); assert((block_size % 16) == 0); do { const int32x4_t c = vld1q_s32(coeff); const int32x4_t d = vld1q_s32(dqcoeff); const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); coeff += 4; dqcoeff += 4; block_size -= 4; } while (block_size != 0); *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift; return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift; } aom-3.12.1/av1/encoder/arm/highbd_temporal_filter_neon.c000066400000000000000000000546521477627663500231620ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" #include "aom_dsp/mathutils.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" static inline void get_squared_error( const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, uint32_t *frame_sse, const unsigned int dst_stride) { uint32_t *dst = frame_sse; uint32_t i = 0; do { uint32_t j = 0; do { uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j); uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j); uint16x8_t abs_diff = vabdq_u16(s, r); uint32x4_t sse_lo = vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff)); uint32x4_t sse_hi = vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff)); vst1q_u32(dst + j, sse_lo); vst1q_u32(dst + j + 4, sse_hi); j += 8; } while (j < block_width); dst += dst_stride; i++; } while (i < block_height); } static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2], const uint32x4_t mask_single) { uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single); vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single); vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single); vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single); vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single); return horizontal_add_u32x4(vsums); } static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2], const uint32x4_t mask1, const uint32x4_t mask2) { uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1); vsums = vmlaq_u32(vsums, vsrc[1][0], mask1); vsums = vmlaq_u32(vsums, vsrc[2][0], mask1); vsums = vmlaq_u32(vsums, vsrc[3][0], mask1); vsums = vmlaq_u32(vsums, vsrc[4][0], mask1); vsums = vmlaq_u32(vsums, vsrc[0][1], mask2); vsums = vmlaq_u32(vsums, vsrc[1][1], mask2); vsums = vmlaq_u32(vsums, vsrc[2][1], mask2); vsums = vmlaq_u32(vsums, vsrc[3][1], mask2); vsums = vmlaq_u32(vsums, vsrc[4][1], mask2); return vsums; } static void highbd_apply_temporal_filter( const uint16_t *frame, const unsigned int stride, const uint32_t block_width, const uint32_t block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, const uint32_t *frame_sse, const uint32_t frame_sse_stride, const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl, int bd) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t acc_5x5_neon[BH][BW] = { 0 }; const int half_window = TF_WINDOW_LENGTH >> 1; uint32x4_t vsrc[5][2] = { 0 }; const uint32x4_t k0000 = vdupq_n_u32(0); const uint32x4_t k1111 = vdupq_n_u32(1); const uint32_t k3110_u32[4] = { 0, 1, 1, 3 }; const uint32_t k2111_u32[4] = { 1, 1, 1, 2 }; const uint32_t k1112_u32[4] = { 2, 1, 1, 1 }; const uint32_t k0113_u32[4] = { 3, 1, 1, 0 }; const uint32x4_t k3110 = vld1q_u32(k3110_u32); const uint32x4_t k2111 = vld1q_u32(k2111_u32); const uint32x4_t k1112 = vld1q_u32(k1112_u32); const uint32x4_t k0113 = vld1q_u32(k0113_u32); uint32x4_t vmask1[4], vmask2[4]; vmask1[0] = k1111; vmask2[0] = vextq_u32(k1111, k0000, 3); vmask1[1] = vextq_u32(k0000, k1111, 3); vmask2[1] = vextq_u32(k1111, k0000, 2); vmask1[2] = vextq_u32(k0000, k1111, 2); vmask2[2] = vextq_u32(k1111, k0000, 1); vmask1[3] = vextq_u32(k0000, k1111, 1); vmask2[3] = k1111; uint32_t row = 0; do { uint32_t col = 0; const uint32_t *src = frame_sse + row * 
frame_sse_stride; if (row == 0) { vsrc[2][0] = vld1q_u32(src); vsrc[3][0] = vld1q_u32(src + frame_sse_stride); vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); // First 2 rows of the 5x5 matrix are padded from the 1st. vsrc[0][0] = vsrc[2][0]; vsrc[1][0] = vsrc[2][0]; } else if (row == 1) { vsrc[1][0] = vld1q_u32(src - frame_sse_stride); vsrc[2][0] = vld1q_u32(src); vsrc[3][0] = vld1q_u32(src + frame_sse_stride); vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); // First row of the 5x5 matrix are padded from the 1st. vsrc[0][0] = vsrc[1][0]; } else if (row == block_height - 2) { vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][0] = vld1q_u32(src - frame_sse_stride); vsrc[2][0] = vld1q_u32(src); vsrc[3][0] = vld1q_u32(src + frame_sse_stride); // Last row of the 5x5 matrix are padded from the one before. vsrc[4][0] = vsrc[3][0]; } else if (row == block_height - 1) { vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][0] = vld1q_u32(src - frame_sse_stride); vsrc[2][0] = vld1q_u32(src); // Last 2 rows of the 5x5 matrix are padded from the 3rd. vsrc[3][0] = vsrc[2][0]; vsrc[4][0] = vsrc[2][0]; } else { vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][0] = vld1q_u32(src - frame_sse_stride); vsrc[2][0] = vld1q_u32(src); vsrc[3][0] = vld1q_u32(src + frame_sse_stride); vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); } acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113); acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112); col += 4; src += 4; // Traverse 4 columns at a time do { if (row == 0) { vsrc[2][1] = vld1q_u32(src); vsrc[3][1] = vld1q_u32(src + frame_sse_stride); vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); // First 2 rows of the 5x5 matrix are padded from the 1st. vsrc[0][1] = vsrc[2][1]; vsrc[1][1] = vsrc[2][1]; } else if (row == 1) { vsrc[1][1] = vld1q_u32(src - frame_sse_stride); vsrc[2][1] = vld1q_u32(src); vsrc[3][1] = vld1q_u32(src + frame_sse_stride); vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); // First row of the 5x5 matrix are padded from the 1st. vsrc[0][1] = vsrc[1][1]; } else if (row == block_height - 2) { vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][1] = vld1q_u32(src - frame_sse_stride); vsrc[2][1] = vld1q_u32(src); vsrc[3][1] = vld1q_u32(src + frame_sse_stride); // Last row of the 5x5 matrix are padded from the one before. vsrc[4][1] = vsrc[3][1]; } else if (row == block_height - 1) { vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][1] = vld1q_u32(src - frame_sse_stride); vsrc[2][1] = vld1q_u32(src); // Last 2 rows of the 5x5 matrix are padded from the 3rd. 
vsrc[3][1] = vsrc[2][1]; vsrc[4][1] = vsrc[2][1]; } else { vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); vsrc[1][1] = vld1q_u32(src - frame_sse_stride); vsrc[2][1] = vld1q_u32(src); vsrc[3][1] = vld1q_u32(src + frame_sse_stride); vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); } uint32x4_t sums[4]; sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]); sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]); sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]); sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]); vst1q_u32(&acc_5x5_neon[row][col - half_window], horizontal_add_4d_u32x4(sums)); vsrc[0][0] = vsrc[0][1]; vsrc[1][0] = vsrc[1][1]; vsrc[2][0] = vsrc[2][1]; vsrc[3][0] = vsrc[3][1]; vsrc[4][0] = vsrc[4][1]; src += 4; col += 4; } while (col <= block_width - 4); acc_5x5_neon[row][col - half_window] = sum_kernel5x5_mask_single(vsrc, k2111); acc_5x5_neon[row][col - half_window + 1] = sum_kernel5x5_mask_single(vsrc, k3110); row++; } while (row < block_height); // Perform filtering. if (tf_wgt_calc_lvl == 0) { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; // Scale down the difference for high bit depth input. const uint32_t diff_sse = (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); accumulator[k] += weight * pixel_value; count[k] += weight; } } } else { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; // Scale down the difference for high bit depth input. const uint32_t diff_sse = (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. 
double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); accumulator[k] += weight * pixel_value; count[k] += weight; } } } } void av1_highbd_apply_temporal_filter_neon( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; assert(is_high_bitdepth); // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; // Frame information. const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint32_t frame_sse[BW * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; const uint32_t frame_sse_stride = plane_w; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint16_t *ref = CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? 
(1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. const int ww = frame_sse_stride << ss_x_shift; // Width of Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx]; } } } } } get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, plane_h, frame_sse, frame_sse_stride); highbd_apply_temporal_filter( pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride, luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd); plane_offset += plane_h * plane_w; } } double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src, int height, int width, int stride, int bitdepth, int edge_thresh) { uint16x8_t thresh = vdupq_n_u16(edge_thresh); uint64x2_t acc = vdupq_n_u64(0); // Count is in theory positive as it counts the number of times we're under // the threshold, but it will be counted negatively in order to make best use // of the vclt instruction, which sets every bit of a lane to 1 when the // condition is true. int32x4_t count = vdupq_n_s32(0); int final_count = 0; uint64_t final_acc = 0; const uint16_t *src_start = src + stride + 1; int h = 1; do { int w = 1; const uint16_t *src_ptr = src_start; while (w <= (width - 1) - 8) { uint16x8_t mat[3][3]; mat[0][0] = vld1q_u16(src_ptr - stride - 1); mat[0][1] = vld1q_u16(src_ptr - stride); mat[0][2] = vld1q_u16(src_ptr - stride + 1); mat[1][0] = vld1q_u16(src_ptr - 1); mat[1][1] = vld1q_u16(src_ptr); mat[1][2] = vld1q_u16(src_ptr + 1); mat[2][0] = vld1q_u16(src_ptr + stride - 1); mat[2][1] = vld1q_u16(src_ptr + stride); mat[2][2] = vld1q_u16(src_ptr + stride + 1); // Compute Sobel gradients. uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]); uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]); gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0])); gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2])); uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]); uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]); gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1])); gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1])); uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth)); // Check which vector elements are under the threshold. The Laplacian is // then unconditionnally computed and we accumulate zeros if we're not // under the threshold. This is much faster than using an if statement. 
uint16x8_t thresh_u16 = vcltq_u16(ga, thresh); uint16x8_t center = vshlq_n_u16(mat[1][1], 2); uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]); uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]); uint16x8_t adj = vaddq_u16(adj0, adj1); adj = vaddq_u16(adj, adj); uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]); uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]); uint16x8_t diag = vaddq_u16(diag0, diag1); uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj); v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16); uint32x4_t v_u32 = vpaddlq_u16(v); acc = vpadalq_u32(acc, v_u32); // Add -1 for each lane where the gradient is under the threshold. count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); w += 8; src_ptr += 8; } if (w <= (width - 1) - 4) { uint16x4_t mat[3][3]; mat[0][0] = vld1_u16(src_ptr - stride - 1); mat[0][1] = vld1_u16(src_ptr - stride); mat[0][2] = vld1_u16(src_ptr - stride + 1); mat[1][0] = vld1_u16(src_ptr - 1); mat[1][1] = vld1_u16(src_ptr); mat[1][2] = vld1_u16(src_ptr + 1); mat[2][0] = vld1_u16(src_ptr + stride - 1); mat[2][1] = vld1_u16(src_ptr + stride); mat[2][2] = vld1_u16(src_ptr + stride + 1); // Compute Sobel gradients. uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]); uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]); gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0])); gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2])); uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]); uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]); gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1])); gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1])); uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb); ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth)); // Check which vector elements are under the threshold. The Laplacian is // then unconditionnally computed and we accumulate zeros if we're not // under the threshold. This is much faster than using an if statement. uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh)); uint16x4_t center = vshl_n_u16(mat[1][1], 2); uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]); uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]); uint16x4_t adj = vadd_u16(adj0, adj1); adj = vadd_u16(adj, adj); uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]); uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]); uint16x4_t diag = vadd_u16(diag0, diag1); uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj); v = vand_u16(v, thresh_u16); uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth))); acc = vpadalq_u32(acc, v_u32); // Add -1 for each lane where the gradient is under the threshold. count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16)); w += 4; src_ptr += 4; } while (w < width - 1) { int mat[3][3]; mat[0][0] = *(src_ptr - stride - 1); mat[0][1] = *(src_ptr - stride); mat[0][2] = *(src_ptr - stride + 1); mat[1][0] = *(src_ptr - 1); mat[1][1] = *(src_ptr); mat[1][2] = *(src_ptr + 1); mat[2][0] = *(src_ptr + stride - 1); mat[2][1] = *(src_ptr + stride); mat[2][2] = *(src_ptr + stride + 1); // Compute Sobel gradients. const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + 2 * (mat[1][0] - mat[1][2]); const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + 2 * (mat[0][1] - mat[2][1]); const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8); // Accumulate Laplacian. 
const int is_under = ga < edge_thresh; const int v = 4 * mat[1][1] - 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under; final_count += is_under; src_ptr++; w++; } src_start += stride; } while (++h < height - 1); // We counted negatively, so subtract to get the final value. final_count -= horizontal_add_s32x4(count); final_acc += horizontal_add_u64x2(acc); return (final_count < 16) ? -1.0 : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2; } aom-3.12.1/av1/encoder/arm/hybrid_fwd_txfm_neon.c000066400000000000000000000043531477627663500216350ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/txfm_common.h" #include "config/av1_rtcd.h" static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) { int32x4x2_t b0 = vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1])); int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])), vreinterpret_s16_s32(vget_high_s32(b0.val[0]))); int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])), vreinterpret_s16_s32(vget_high_s32(b0.val[1]))); out[0] = c0.val[0]; out[1] = c0.val[1]; out[2] = c1.val[0]; out[3] = c1.val[1]; } void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) { // Load the 4x4 source in transposed form. int16x4_t a1, b1, c1, d1, e; a1 = vld1_s16(&input[0]); b1 = vld1_s16(&input[1 * stride]); c1 = vld1_s16(&input[2 * stride]); d1 = vld1_s16(&input[3 * stride]); // WHT. // Row transforms. a1 = vadd_s16(a1, b1); d1 = vsub_s16(d1, c1); e = vhsub_s16(a1, d1); b1 = vsub_s16(e, b1); c1 = vsub_s16(e, c1); a1 = vsub_s16(a1, c1); d1 = vadd_s16(d1, b1); int16x8_t x[2]; x[0] = vcombine_s16(a1, c1); x[1] = vcombine_s16(d1, b1); int16x4_t s[4]; transpose4x4(x, s); a1 = s[0]; b1 = s[1]; c1 = s[2]; d1 = s[3]; // Row transforms. a1 = vadd_s16(a1, b1); d1 = vsub_s16(d1, c1); e = vhsub_s16(a1, d1); b1 = vsub_s16(e, b1); c1 = vsub_s16(e, c1); a1 = vsub_s16(a1, c1); d1 = vadd_s16(d1, b1); vst1q_s32(&output[0], vshll_n_s16(a1, UNIT_QUANT_SHIFT)); vst1q_s32(&output[4], vshll_n_s16(c1, UNIT_QUANT_SHIFT)); vst1q_s32(&output[8], vshll_n_s16(d1, UNIT_QUANT_SHIFT)); vst1q_s32(&output[12], vshll_n_s16(b1, UNIT_QUANT_SHIFT)); } aom-3.12.1/av1/encoder/arm/ml_neon.c000066400000000000000000000321101477627663500170560ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/encoder/ml.h" static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l, const float32x4_t *zero) { *out_h = vmaxq_f32(*out_h, *zero); *out_l = vmaxq_f32(*out_l, *zero); } static void nn_activate4(float32x4_t *x, const float32x4_t *zero) { *x = vmaxq_f32(*x, *zero); } #define CLAMP_0(x) (x = x > 0 ? x : 0) static void nn_propagate_8to1(int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes, bool output_layer) { const float32x4_t zero = vdupq_n_f32(0); float32x4_t vadd = zero; float total = *layer_bias; for (int in = 0; in < num_inputs; in += 8) { const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); const float32x4_t inputs_l = vld1q_f32(&inputs[in]); const float32x4_t weights_h = vld1q_f32(&weights[in + 4]); const float32x4_t weights_l = vld1q_f32(&weights[in]); vadd = vmlaq_f32(vadd, inputs_h, weights_h); vadd = vmlaq_f32(vadd, inputs_l, weights_l); } #if AOM_ARCH_AARCH64 total += vaddvq_f32(vadd); #else float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); vadd_lo = vpadd_f32(vadd_lo, vadd_lo); total += vget_lane_f32(vadd_lo, 0); #endif if (!output_layer) CLAMP_0(total); *output_nodes = total; } static void nn_propagate_xto1(int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes) { float32x4_t vadd = vdupq_n_f32(0); float total = *layer_bias; int j = num_inputs; int in = 0; while (j > 7) { const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); const float32x4_t inputs_l = vld1q_f32(&inputs[in]); const float32x4_t weights_h = vld1q_f32(&weights[in + 4]); const float32x4_t weights_l = vld1q_f32(&weights[in]); vadd = vmlaq_f32(vadd, inputs_h, weights_h); vadd = vmlaq_f32(vadd, inputs_l, weights_l); in += 8; j -= 8; } #if AOM_ARCH_AARCH64 total += vaddvq_f32(vadd); #else float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); vadd_lo = vpadd_f32(vadd_lo, vadd_lo); total += vget_lane_f32(vadd_lo, 0); #endif for (; in < num_inputs; in++) total += weights[in] * inputs[in]; *output_nodes = CLAMP_0(total); } static void nn_propagate_xsto1(int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes) { float total = *layer_bias; #if AOM_ARCH_AARCH64 const float32x4_t v_inputs = vld1q_f32(inputs); const float32x4_t v_weights = vld1q_f32(weights); const float32x4_t vadd = vmulq_f32(v_inputs, v_weights); total += vaddvq_f32(vadd); int in = 4; #else int in = 0; #endif for (; in < num_inputs; in++) total += weights[in] * inputs[in]; *output_nodes = CLAMP_0(total); } static void nn_propagate_4to1(int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes, bool output_layer) { const float32x4_t zero = vdupq_n_f32(0); float32x4_t vadd = zero; float total = *layer_bias; for (int in = 0; in < num_inputs; in += 4) { const float32x4_t v_inputs = vld1q_f32(&inputs[in]); const float32x4_t v_weights = vld1q_f32(&weights[in]); vadd = vmlaq_f32(vadd, v_inputs, v_weights); } #if AOM_ARCH_AARCH64 total += vaddvq_f32(vadd); #else float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); vadd_lo = vpadd_f32(vadd_lo, vadd_lo); total += vget_lane_f32(vadd_lo, 0); #endif if (!output_layer) CLAMP_0(total); *output_nodes = total; } static void nn_propagate_4to4(int num_inputs, const float *const inputs, 
const float *const weights, const float *layer_bias, float *const output_nodes, bool output_layer) { float32x4_t outputs = vld1q_f32(layer_bias); const float32x4_t zero = vdupq_n_f32(0); float32x4_t mul0[2] = { zero, zero }; float32x4_t mul1[2] = { zero, zero }; for (int in = 0; in < num_inputs; in += 4) { const float32x4_t v_input = vld1q_f32(&inputs[in]); for (int i = 0; i < 2; i++) { const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); mul0[i] = vmlaq_f32(mul0[i], weight0, v_input); const float32x4_t weight1 = vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); mul1[i] = vmlaq_f32(mul1[i], weight1, v_input); } } for (int i = 0; i < 2; i++) #if AOM_ARCH_AARCH64 mul0[i] = vpaddq_f32(mul0[i], mul1[i]); const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]); #else mul0[i] = vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); const float32x4_t hh = vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); #endif outputs = vaddq_f32(outputs, hh); if (!output_layer) nn_activate4(&outputs, &zero); vst1q_f32(output_nodes, outputs); } static void nn_propagate_4to8(const int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes, bool output_layer) { float32x4_t out_h = vld1q_f32(&layer_bias[4]); float32x4_t out_l = vld1q_f32(layer_bias); const float32x4_t zero = vdupq_n_f32(0); float32x4_t mul0[4] = { zero, zero, zero, zero }; float32x4_t mul1[4] = { zero, zero, zero, zero }; for (int in = 0; in < num_inputs; in += 4) { const float32x4_t v_input = vld1q_f32(&inputs[in]); for (int i = 0; i < 4; i++) { const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); const float32x4_t weight1 = vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); mul0[i] = vmlaq_f32(mul0[i], v_input, weight0); mul1[i] = vmlaq_f32(mul1[i], v_input, weight1); } } for (int i = 0; i < 4; i++) #if AOM_ARCH_AARCH64 mul0[i] = vpaddq_f32(mul0[i], mul1[i]); const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]); const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]); #else mul0[i] = vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); const float32x4_t hh0 = vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); const float32x4_t hh1 = vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])), vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3]))); #endif out_h = vaddq_f32(out_h, hh1); out_l = vaddq_f32(out_l, hh0); if (!output_layer) nn_activate8(&out_h, &out_l, &zero); vst1q_f32(&output_nodes[4], out_h); vst1q_f32(output_nodes, out_l); } static void nn_propagate_8to4(const int num_inputs, const float *const inputs, const float *const weights, const float *layer_bias, float *const output_nodes, bool output_layer) { float32x4_t outputs = vld1q_f32(layer_bias); const float32x4_t zero = vdupq_n_f32(0); float32x4_t add[4] = { zero, zero, zero, zero }; for (int in = 0; in < num_inputs; in += 8) { const float32x4_t inputs_l = vld1q_f32(&inputs[in]); const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); for (int i = 0; i < 4; i++) { const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]); const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]); add[i] = vmlaq_f32(add[i], inputs_l, weight_l); add[i] = 
vmlaq_f32(add[i], inputs_h, weight_h); } } #if AOM_ARCH_AARCH64 const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]); const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]); const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h); #else const float32x4_t hadd_h = vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])), vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3]))); const float32x4_t hadd_l = vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])), vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1]))); const float32x4_t haddhadd = vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)), vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h))); #endif outputs = vaddq_f32(outputs, haddhadd); if (!output_layer) nn_activate4(&outputs, &zero); vst1q_f32(output_nodes, outputs); } // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. void av1_nn_predict_neon(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output) { float buf[2][NN_MAX_NODES_PER_LAYER]; int buf_index = 0; int num_inputs = nn_config->num_inputs; // Hidden layers, except the final iteration is the output layer. for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { const float *layer_weights = nn_config->weights[layer]; const float *layer_bias = nn_config->bias[layer]; bool output_layer = (layer == nn_config->num_hidden_layers); float *const output_nodes = output_layer ? output : buf[buf_index]; const int num_outputs = output_layer ? nn_config->num_outputs : nn_config->num_hidden_nodes[layer]; if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { for (int out = 0; out < num_outputs; out += 8) { nn_propagate_4to8(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out], output_layer); } } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) { nn_propagate_8to4(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out], output_layer); } } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) { nn_propagate_4to4(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out], output_layer); } } else if (num_inputs % 8 == 0) { for (int out = 0; out < num_outputs; out++) { nn_propagate_8to1(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out], output_layer); } } else if (num_inputs % 4 == 0) { for (int out = 0; out < num_outputs; out++) { nn_propagate_4to1(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out], output_layer); } } else if (num_inputs > 8) { for (int out = 0; out < num_outputs; out++) { nn_propagate_xto1(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out]); } } else if (num_inputs >= 4) { for (int out = 0; out < num_outputs; out++) { nn_propagate_xsto1(num_inputs, input_nodes, &layer_weights[out * num_inputs], &layer_bias[out], &output_nodes[out]); } } else { for (int node = 0; node < num_outputs; ++node) { float val = layer_bias[node]; for (int i = 0; i < num_inputs; ++i) val += layer_weights[node * num_inputs + i] * input_nodes[i]; // ReLU as activation function. val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). 
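        // [Editorial note, illustrative; not part of the original source.]
        // For example, with num_inputs == 3 this fallback computes
        //   val = bias + w0*x0 + w1*x1 + w2*x2, then val = max(val, 0.0f).
        // The NEON paths above (nn_propagate_4to8, nn_propagate_8to4, ...)
        // form the same weighted sums four or eight lanes at a time and, for
        // hidden layers, apply the ReLU with vmaxq_f32.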
output_nodes[node] = val; } } input_nodes = output_nodes; num_inputs = num_outputs; buf_index = 1 - buf_index; } if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); } aom-3.12.1/av1/encoder/arm/pickrst_neon.c000066400000000000000000003237231477627663500201420ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/restoration.h" #include "av1/encoder/arm/pickrst_neon.h" #include "av1/encoder/pickrst.h" int64_t av1_lowbd_pixel_proj_error_neon( const uint8_t *src, int width, int height, int src_stride, const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int64_t sse = 0; int64x2_t sse_s64 = vdupq_n_s64(0); if (params->r[0] > 0 && params->r[1] > 0) { int32x2_t xq_v = vld1_s32(xq); int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS); do { int j = 0; int32x4_t sse_s32 = vdupq_n_s32(0); do { const uint8x8_t d = vld1_u8(&dat[j]); const uint8x8_t s = vld1_u8(&src[j]); int32x4_t flt0_0 = vld1q_s32(&flt0[j]); int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); int32x4_t flt1_0 = vld1q_s32(&flt1[j]); int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); int32x4_t offset = vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0); int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0); v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d)); v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16), vreinterpret_s16_s32(xq_sum_v), 0); v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16), vreinterpret_s16_s32(xq_sum_v), 0); int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); int16x4_t e_lo = vget_low_s16(e); int16x4_t e_hi = vget_high_s16(e); sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); j += 8; } while (j <= width - 8); for (int k = j; k < width; ++k) { int32_t u = (dat[k] << SGRPROJ_RST_BITS); int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) + xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]); int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; sse += e * e; } sse_s64 = vpadalq_s32(sse_s64, sse_s32); dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; } while (--height != 0); } else if (params->r[0] > 0 || params->r[1] > 0) { int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; int32x2_t xq_v = vdup_n_s32(xq_active); do { int32x4_t sse_s32 = vdupq_n_s32(0); int j = 0; do { const uint8x8_t d = vld1_u8(&dat[j]); const uint8x8_t s = vld1_u8(&src[j]); int32x4_t flt_0 = vld1q_s32(&flt[j]); int32x4_t flt_1 = vld1q_s32(&flt[j + 4]); int16x8_t d_s16 = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16)); int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16)); int32x4_t offset = vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0); int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0); int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); int16x4_t e_lo = vget_low_s16(e); int16x4_t e_hi = vget_high_s16(e); sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); j += 8; } while (j <= width - 8); for (int k = j; k < width; ++k) { int32_t u = dat[k] << SGRPROJ_RST_BITS; int32_t v = xq_active * (flt[k] - u); int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + dat[k] - src[k]; sse += e * e; } sse_s64 = vpadalq_s32(sse_s64, sse_s32); dat += dat_stride; src += src_stride; flt += flt_stride; } while (--height != 0); } else { uint32x4_t sse_s32 = vdupq_n_u32(0); do { int j = 0; do { const uint8x16_t d = vld1q_u8(&dat[j]); const uint8x16_t s = vld1q_u8(&src[j]); uint8x16_t diff = vabdq_u8(d, s); uint8x8_t diff_lo = vget_low_u8(diff); uint8x8_t diff_hi = vget_high_u8(diff); sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo)); sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi)); j += 16; } while (j <= width - 16); for (int k = j; k < width; ++k) { int32_t e = dat[k] - src[k]; sse += e * e; } dat += dat_stride; src += src_stride; } while (--height != 0); sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32)); } sse += horizontal_add_s64x2(sse_s64); return sse; } // We can accumulate up to 32768 8-bit multiplication results in a signed // 32-bit integer. We are processing 2 pixels at a time, so the accumulator max // can be as high as 16384 for the compute stats. #define STAT_ACCUMULATOR_MAX 16384 static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { #if AOM_ARCH_AARCH64 uint8x16x2_t table = { { a, b } }; return vqtbl2_u8(table, idx); #else uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), vget_high_u8(b) } }; return vtbl4_u8(table, idx); #endif } static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { #if AOM_ARCH_AARCH64 uint8x16x2_t table = { { a, b } }; return vqtbl2q_u8(table, idx); #else uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), vget_high_u8(b) } }; return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), vtbl4_u8(table, vget_high_u8(idx))); #endif } // The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the // computation. This function computes the final M from the accumulated // (src_s64) and the residual parts (src_s32). It also transposes the result as // the output needs to be column-major. 
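// [Editorial example, not in the original source.] With wiener_win == 3 the
// loop below writes the destination in order dst[i * 3 + j] while reading the
// transposed index, i.e.
//   dst[i * 3 + j] += (src_s64[j * 3 + i] + src_s32[j * 3 + i]) * scale,
// so for instance dst[1] accumulates element 3 of the accumulators.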
static inline void acc_transpose_M(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, const int wiener_win, int scale) { for (int i = 0; i < wiener_win; ++i) { for (int j = 0; j < wiener_win; ++j) { int tr_idx = j * wiener_win + i; *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; } } } // The resulting H is a column-major matrix accumulated from the transposed // (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single // vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This // function transforms back to the originally expected format (double // transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to // speed-up the computation. This function computes the final H from the // accumulated (src_s64) and the residual parts (src_s32). The computed H is // only an upper triangle matrix, this function also fills the lower triangle of // the resulting matrix. static void update_H(int64_t *dst, const int64_t *src_s64, const int32_t *src_s32, const int wiener_win, int stride, int scale) { // For a simplified theoretical 3x3 case where `wiener_win` is 3 and // `wiener_win2` is 9, the M matrix is 3x3: // 0, 3, 6 // 1, 4, 7 // 2, 5, 8 // // This is viewed as a vector to compute H (9x9) by vector outer product: // 0, 3, 6, 1, 4, 7, 2, 5, 8 // // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: // 0, 3, 6, 1, 4, 7, 2, 5, 8, // 3, 30, 33, 12, 31, 34, 21, 32, 35, // 6, 33, 60, 15, 42, 61, 24, 51, 62, // 1, 12, 15, 10, 13, 16, 11, 14, 17, // 4, 31, 42, 13, 40, 43, 22, 41, 44, // 7, 34, 61, 16, 43, 70, 25, 52, 71, // 2, 21, 24, 11, 22, 25, 20, 23, 26, // 5, 32, 51, 14, 41, 52, 23, 50, 53, // 8, 35, 62, 17, 44, 71, 26, 53, 80, const int wiener_win2 = wiener_win * wiener_win; // Loop through the indices according to the remapping above, along the // columns: // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., // wiener_win - 1, wiener_win - 1 + wiener_win, ... // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. for (int i = 0; i < wiener_win; ++i) { for (int j = i; j < wiener_win2; j += wiener_win) { // These two inner loops are the same as the two outer loops, but running // along rows instead of columns. For the 3x3 case `l` will be: // 0, 3, 6, 1, 4, 7, 2, 5, 8. for (int k = 0; k < wiener_win; ++k) { for (int l = k; l < wiener_win2; l += wiener_win) { // The nominal double transpose indexing would be: // int idx = stride * j + l; // However we need the upper-triangle indices, it is easy with some // min/max operations. int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); // Resulting matrix is filled by combining the 64-bit and the residual // 32-bit matrices together with scaling. *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; } } } } } // Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the // last load address is offset to prevent out-of-bounds access. 
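// [Editorial note, illustrative; not part of the original source.] dst[0..2]
// each pack two consecutive 8-byte rows (rows 0-5 of the two overlapping 7x7
// windows), while dst[3] holds only row 6 in its low half, loaded one byte
// early (src - 1 after the pointer has advanced six rows) so the final 8-byte
// load stays inside the buffer. The shuffle_stats7 tables used by the callers
// start that last row at table index 17 (18 for the second pixel) to undo the
// one-byte offset.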
static inline void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, ptrdiff_t stride) { dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); } static inline void compute_stats_win7_downsampled_neon( const uint8_t *dgd, const uint8_t *src, int width, int height, int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H, int downsample_factor) { // Matrix names are capitalized to help readability. DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); memset(M_s32, 0, sizeof(M_s32)); memset(M_s64, 0, sizeof(M_s64)); memset(H_s32, 0, sizeof(H_s32)); memset(H_s64, 0, sizeof(H_s64)); // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 // matrices. // clang-format off DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = { 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, }; // clang-format on const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0); const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16); const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32); const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48); const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64); const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80); int acc_cnt = STAT_ACCUMULATOR_MAX; const int src_next = downsample_factor * src_stride - width; const int dgd_next = downsample_factor * dgd_stride - width; const uint8x8_t avg_u8 = vdup_n_u8(avg); do { int j = width; while (j >= 2) { // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the // middle 6x7 elements being shared. uint8x16_t dgd_rows[4]; load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); const uint8_t *dgd_ptr = dgd + dgd_stride * 6; dgd += 2; // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 // matrices (1 for each of the 2 pixels) separated into distinct // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7). // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 // consecutive elements. 
int16x8_t dgd_avg0[6]; int16x8_t dgd_avg1[6]; uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3); dgd_avg0[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); dgd_avg0[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); dgd_avg1[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8)); dgd_avg1[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8)); vst1q_s16(DGD_AVG0, dgd_avg0[0]); vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); vst1q_s16(DGD_AVG1, dgd_avg1[0]); vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4); dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); dgd_avg0[3] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); dgd_avg1[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8)); dgd_avg1[3] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8)); vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5); dgd_avg0[4] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); dgd_avg0[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); dgd_avg1[4] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8)); dgd_avg1[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8)); vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); // The remaining last (49th) elements of `dgd - avg`. DGD_AVG0[48] = dgd_ptr[6] - avg; DGD_AVG1[48] = dgd_ptr[7] - avg; // Accumulate into row-major variant of matrix M (cross-correlation) for 2 // output pixels at a time. M is of size 7 * 7. It needs to be filled such // that multiplying one element from src with each element of a row of the // wiener window will fill one column of M. However this is not very // convenient in terms of memory access, as it means we do contiguous // loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int src_avg1 = *src++ - avg; int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], dgd_avg1[0]); update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], dgd_avg1[1]); update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], dgd_avg1[2]); update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], dgd_avg1[3]); update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], dgd_avg1[4]); update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], dgd_avg1[5]); // Last (49th) element of M_s32 can be computed as scalar more efficiently // for 2 output pixels. M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; // Start accumulating into row-major version of matrix H // (auto-covariance), it expects the DGD_AVG[01] matrices to also be // row-major. H is of size 49 * 49. 
It is filled by multiplying every pair // of elements of the wiener window together (vector outer product). Since // it is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work with column-major matrices, // so we accumulate into a row-major matrix H_s32. At the end of the // algorithm a double transpose transformation will convert H_s32 back to // the expected output layout. update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); // The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently. H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent // overflow. if (--acc_cnt == 0) { acc_cnt = STAT_ACCUMULATOR_MAX; accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); // The widening accumulation is only needed for the upper triangle part // of the matrix. int64_t *lh = H_s64; int32_t *lh32 = H_s32; for (int k = 0; k < WIENER_WIN2; ++k) { // The widening accumulation is only run for the relevant parts // (upper-right triangle) in a row 4-element aligned. int k4 = k / 4 * 4; accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); // Last element of the row is computed separately. lh[48] += lh32[48]; lh32[48] = 0; lh += WIENER_WIN2_ALIGN2; lh32 += WIENER_WIN2_ALIGN2; } } j -= 2; } // Computations for odd pixel in the row. if (width & 1) { // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the // middle 6x7 elements being shared. uint8x16_t dgd_rows[4]; load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); const uint8_t *dgd_ptr = dgd + dgd_stride * 6; ++dgd; // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7 // matrix tightly packed into a int16x8_t[6] array. This array contains // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer. // The DGD_AVG buffer contains 49 consecutive elements. int16x8_t dgd_avg0[6]; uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); dgd_avg0[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); dgd_avg0[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); vst1q_s16(DGD_AVG0, dgd_avg0[0]); vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); dgd_avg0[3] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); dgd_avg0[4] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); dgd_avg0[5] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); // The remaining last (49th) element of `dgd - avg`. DGD_AVG0[48] = dgd_ptr[6] - avg; // Accumulate into row-major order variant of matrix M (cross-correlation) // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled // such that multiplying one element from src with each element of a row // of the wiener window will fill one column of M. 
However this is not // very convenient in terms of memory access, as it means we do // contiguous loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); // Last (49th) element of M_s32 can be computed as scalar more efficiently // for 1 output pixel. M_s32[48] += DGD_AVG0[48] * src_avg0; // Start accumulating into row-major order version of matrix H // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. // H is of size 49 * 49. It is filled by multiplying every pair of // elements of the wiener window together (vector outer product). Since it // is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work column-major matrices, so we // accumulate into a row-major matrix H_s32. At the end of the algorithm a // double transpose transformation will convert H_s32 back to the expected // output layout. update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); // The last element of the triangle of H_s32 matrix can be computed as // scalar more efficiently. H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; } src += src_next; dgd += dgd_next; } while (--height != 0); acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor); update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor); } // Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the // last load address is offset to prevent out-of-bounds access. static inline void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, ptrdiff_t stride) { dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); src += 2 * stride; dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0)); } static inline void compute_stats_win5_downsampled_neon( const uint8_t *dgd, const uint8_t *src, int width, int height, int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H, int downsample_factor) { // Matrix names are capitalized to help readability. DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); memset(M_s32, 0, sizeof(M_s32)); memset(M_s64, 0, sizeof(M_s64)); memset(H_s32, 0, sizeof(H_s32)); memset(H_s64, 0, sizeof(H_s64)); // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5 // matrices. 
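  // [Editorial note, illustrative; not part of the original source.] lut0
  // gathers, for the first pixel, columns 0-4 of rows 0-2 plus the first
  // element of row 3 (16 of the 25 window elements); lut1 is the same pattern
  // shifted one column right for the second pixel; lut2 picks up the
  // remaining elements of rows 3 and 4 for both pixels, leaving only the 25th
  // element of each window to the scalar code further down.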
// clang-format off DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = { 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25, 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23, }; // clang-format on const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0); const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16); const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32); int acc_cnt = STAT_ACCUMULATOR_MAX; const int src_next = downsample_factor * src_stride - width; const int dgd_next = downsample_factor * dgd_stride - width; const uint8x8_t avg_u8 = vdup_n_u8(avg); do { int j = width; while (j >= 2) { // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the // middle 4x5 elements being shared. uint8x16_t dgd_rows[3]; load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); const uint8_t *dgd_ptr = dgd + dgd_stride * 4; dgd += 2; // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 // matrices (1 for each of the 2 pixels) separated into distinct // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 // consecutive elements. int16x8_t dgd_avg0[3]; int16x8_t dgd_avg1[3]; uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1); uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2); dgd_avg0[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); dgd_avg0[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); dgd_avg1[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); dgd_avg1[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); dgd_avg1[2] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]); vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); // The remaining last (25th) elements of `dgd - avg`. DGD_AVG0[24] = dgd_ptr[4] - avg; DGD_AVG1[24] = dgd_ptr[5] - avg; // Accumulate into row-major variant of matrix M (cross-correlation) for 2 // output pixels at a time. M is of size 5 * 5. It needs to be filled such // that multiplying one element from src with each element of a row of the // wiener window will fill one column of M. However this is not very // convenient in terms of memory access, as it means we do contiguous // loads of dgd but strided stores to M. As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int src_avg1 = *src++ - avg; int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], dgd_avg1[0]); update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], dgd_avg1[1]); update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], dgd_avg1[2]); // Last (25th) element of M_s32 can be computed as scalar more efficiently // for 2 output pixels. 
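      // [Editorial note, illustrative; not part of the original source.] In
      // scalar terms every entry of M is updated as
      //   M[k] += (dgd_window[k] - avg) * (src_pixel - avg),
      // summed over all processed pixels; the update_M_* helpers above do
      // this for eight window elements per call, so only this single
      // left-over element is handled with plain integer arithmetic.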
M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; // Start accumulating into row-major version of matrix H // (auto-covariance), it expects the DGD_AVG[01] matrices to also be // row-major. H is of size 25 * 25. It is filled by multiplying every pair // of elements of the wiener window together (vector outer product). Since // it is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work with column-major matrices, // so we accumulate into a row-major matrix H_s32. At the end of the // algorithm a double transpose transformation will convert H_s32 back to // the expected output layout. update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); // The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently. H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent // overflow. if (--acc_cnt == 0) { acc_cnt = STAT_ACCUMULATOR_MAX; accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); // The widening accumulation is only needed for the upper triangle part // of the matrix. int64_t *lh = H_s64; int32_t *lh32 = H_s32; for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { // The widening accumulation is only run for the relevant parts // (upper-right triangle) in a row 4-element aligned. int k4 = k / 4 * 4; accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); // Last element of the row is computed separately. lh[24] += lh32[24]; lh32[24] = 0; lh += WIENER_WIN2_REDUCED_ALIGN2; lh32 += WIENER_WIN2_REDUCED_ALIGN2; } } j -= 2; } // Computations for odd pixel in the row. if (width & 1) { // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the // middle 4x5 elements being shared. uint8x16_t dgd_rows[3]; load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); const uint8_t *dgd_ptr = dgd + dgd_stride * 4; ++dgd; // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 // matrix tightly packed into a int16x8_t[3] array. This array contains // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. // The DGD_AVG buffer contains 25 consecutive elements. int16x8_t dgd_avg0[3]; uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2)); dgd_avg0[0] = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); dgd_avg0[1] = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8)); vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); // The remaining last (25th) element of `dgd - avg`. DGD_AVG0[24] = dgd_ptr[4] - avg; // Accumulate into row-major order variant of matrix M (cross-correlation) // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled // such that multiplying one element from src with each element of a row // of the wiener window will fill one column of M. However this is not // very convenient in terms of memory access, as it means we do // contiguous loads of dgd but strided stores to M. 
As a result, we use an // intermediate matrix M_s32 which is instead filled such that one row of // the wiener window gives one row of M_s32. Once fully computed, M_s32 is // then transposed to return M. int src_avg0 = *src++ - avg; int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); // Last (25th) element of M_s32 can be computed as scalar more efficiently // for 1 output pixel. M_s32[24] += DGD_AVG0[24] * src_avg0; // Start accumulating into row-major order version of matrix H // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. // H is of size 25 * 25. It is filled by multiplying every pair of // elements of the wiener window together (vector outer product). Since it // is a symmetric matrix, we only compute the upper-right triangle, and // then copy it down to the lower-left later. The upper triangle is // covered by 4x4 tiles. The original algorithm assumes the M matrix is // column-major and the resulting H matrix is also expected to be // column-major. It is not efficient to work column-major matrices, so we // accumulate into a row-major matrix H_s32. At the end of the algorithm a // double transpose transformation will convert H_s32 back to the expected // output layout. update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24); // The last element of the triangle of H_s32 matrix can be computed as a // scalar more efficiently. H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += DGD_AVG0[24] * DGD_AVG0[24]; } src += src_next; dgd += dgd_next; } while (--height != 0); acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor); update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, downsample_factor); } static inline void hadd_update_6_stats_neon(const int64_t *const src, const int32x4_t *deltas, int64_t *const dst) { int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]); int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]); int32x4_t delta45 = horizontal_add_2d_s32(deltas[4], deltas[5]); int64x2_t delta01_s64 = vpaddlq_s32(delta01); int64x2_t delta23_s64 = vpaddlq_s32(delta23); int64x2_t delta45_s64 = vpaddlq_s32(delta45); int64x2_t src0 = vld1q_s64(src); int64x2_t src1 = vld1q_s64(src + 2); int64x2_t src2 = vld1q_s64(src + 4); vst1q_s64(dst, vaddq_s64(src0, delta01_s64)); vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64)); vst1q_s64(dst + 4, vaddq_s64(src2, delta45_s64)); } static inline void hadd_update_4_stats_neon(const int64_t *const src, const int32x4_t *deltas, int64_t *const dst) { int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]); int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]); int64x2_t delta01_s64 = vpaddlq_s32(delta01); int64x2_t delta23_s64 = vpaddlq_s32(delta23); int64x2_t src0 = vld1q_s64(src); int64x2_t src1 = vld1q_s64(src + 2); vst1q_s64(dst, vaddq_s64(src0, delta01_s64)); vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64)); } static inline void compute_stats_win5_neon( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H) { const int32_t wiener_win = WIENER_WIN_CHROMA; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t w16 = width & ~15; const int32_t h8 = height & ~7; int16x8_t mask[2]; mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); mask[1] = 
vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); const int bit_depth = 8; int32_t i, j, x, y; const int32_t num_bit_left = 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; const int32_t h_allowed = (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0)); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed; int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { src[0] = vld1q_s16(s_t + x + 0); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x + 0); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); x += 16; } if (w16 != width) { src[0] = vld1q_s16(s_t + w16 + 0); src[1] = vld1q_s16(s_t + w16 + 8); dgd[0] = vld1q_s16(d_t + w16 + 0); dgd[1] = vld1q_s16(d_t + w16 + 8); src[0] = vandq_s16(src[0], mask[0]); src[1] = vandq_s16(src[1], mask[1]); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); } s_t += s_stride; d_t += d_stride; } while (--y); sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]); int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]); vst1q_s64(&M[wiener_win * j + 0], sum_m0); vst1q_s64(&M[wiener_win * j + 2], sum_m2); M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]); int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); vst1q_s64(&H[wiener_win * j + 0], sum_h0); vst1q_s64(&H[wiener_win * j + 2], sum_h2); H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]); #else M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. j = 1; do { const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? 
(height - height_t) : h_allowed; int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); x += 16; } if (w16 != width) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_left_win5_neon(dgd, d_t + x, d_stride, row_h); } d_t += d_stride; } while (--y); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1)); #else H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. { const int16_t *d_t = d; if (height % 2) { int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]); load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]); d_t += 4 * d_stride; step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0), H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } else { int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN_CHROMA * 2]; ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width); ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width); ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width); ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width); step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas); transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2], &deltas[3]); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas[0], vgetq_lane_s32(deltas[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas[1], vgetq_lane_s32(deltas[4], 1), H + 2 * wiener_win * wiener_win2 + 2 * 
wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas[2], vgetq_lane_s32(deltas[4], 2), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas[3], vgetq_lane_s32(deltas[4], 3), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. { y = h8; int16x4_t d_s[12]; int16x4_t d_e[12]; const int16_t *d_t = d; int16x4_t zeros = vdup_n_s16(0); load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]); load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]); int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; while (y >= 8) { load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = s_tr[0]; start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1); start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2); start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3); start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4); start_col1[0] = s_tr[1]; start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1); start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2); start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3); start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4); start_col2[0] = s_tr[2]; start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1); start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2); start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3); start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4); start_col3[0] = s_tr[3]; start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1); start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2); start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3); start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4); // i = 1, j = 2; sub_deltas_step4(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4(start_col0, start_col3, deltas[2]); // i = 2, j =3 sub_deltas_step4(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = e_tr[0]; end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1); end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2); end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3); end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4); end_col1[0] = e_tr[1]; end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1); end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2); end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3); end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4); end_col2[0] = e_tr[2]; end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1); end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2); end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3); end_col2[4] = 
vextq_s16(e_tr[2], e_tr[6], 4); end_col3[0] = e_tr[3]; end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1); end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2); end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3); end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4); // i = 1, j = 2; add_deltas_step4(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4(end_col0, end_col2, deltas[1]); // i = 1, j = 4 add_deltas_step4(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4(end_col2, end_col3, deltas[5]); d_s[0] = d_s[8]; d_s[1] = d_s[9]; d_s[2] = d_s[10]; d_s[3] = d_s[11]; d_e[0] = d_e[8]; d_e[1] = d_e[9]; d_e[2] = d_e[10]; d_e[3] = d_e[11]; d_t += 8 * d_stride; y -= 8; } if (h8 != height) { const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8)); load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = vandq_s16(s_tr[0], mask_h); start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h); start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h); start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h); start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h); start_col1[0] = vandq_s16(s_tr[1], mask_h); start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h); start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h); start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h); start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h); start_col2[0] = vandq_s16(s_tr[2], mask_h); start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h); start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h); start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h); start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h); start_col3[0] = vandq_s16(s_tr[3], mask_h); start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h); start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h); start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h); start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h); // i = 1, j = 2; sub_deltas_step4(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4(start_col0, start_col3, deltas[2]); // i = 2, j = 3 sub_deltas_step4(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = vandq_s16(e_tr[0], mask_h); end_col0[1] = vandq_s16(vextq_s16(e_tr[0], 
e_tr[4], 1), mask_h); end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h); end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h); end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h); end_col1[0] = vandq_s16(e_tr[1], mask_h); end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h); end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h); end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h); end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h); end_col2[0] = vandq_s16(e_tr[2], mask_h); end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h); end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h); end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h); end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h); end_col3[0] = vandq_s16(e_tr[3], mask_h); end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h); end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h); end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h); end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h); // i = 1, j = 2; add_deltas_step4(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4(end_col0, end_col2, deltas[1]); // i = 1, j = 4 add_deltas_step4(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4(end_col2, end_col3, deltas[5]); } int32x4_t delta[6][2]; int32_t single_delta[6]; delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]); delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]); delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]); delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]); delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]); delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]); delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]); delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]); delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]); delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]); delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]); delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]); single_delta[0] = horizontal_add_s32x4(deltas[0][4]); single_delta[1] = horizontal_add_s32x4(deltas[1][4]); single_delta[2] = horizontal_add_s32x4(deltas[2][4]); single_delta[3] = horizontal_add_s32x4(deltas[3][4]); single_delta[4] = horizontal_add_s32x4(deltas[4][4]); single_delta[5] = horizontal_add_s32x4(deltas[5][4]); int idx = 0; for (i = 1; i < wiener_win - 1; i++) { for (j = i + 1; j < wiener_win; j++) { update_4_stats_neon( H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win); H[i * wiener_win * wiener_win2 + j * wiener_win + 4] = H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] + single_delta[idx]; H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 2); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(delta[idx][1], 
3); idx++; } } } // Step 5: Derive other points of each square. No square in bottom row. i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA]; x = 0; while (x < w16) { load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); x += 16; } if (w16 != width) { load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas); } hadd_update_4_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_neon( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. 
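// Each diagonal block of H handled in this step is symmetric, so only the
// entries on and above the block diagonal are derived: every new row reuses
// the previous row plus the accumulated deltas. A rough scalar sketch of one
// row update (illustrative only, not part of the original implementation):
//   H[(i * wiener_win + r) * wiener_win2 + i * wiener_win + r + k] =
//       H[(i * wiener_win + r - 1) * wiener_win2 + i * wiener_win + (r - 1) + k]
//           + delta_r[k];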
i = 0; do { const int16_t *const di = d + i; int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; x = 0; while (x < w16) { load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win5_neon(d_is, d_ie, deltas); x += 16; } if (w16 != width) { load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); derive_triangle_win5_neon(d_is, d_ie, deltas); } // Row 1: 4 points hadd_update_4_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); // Row 2: 3 points int32x4_t deltas45 = horizontal_add_2d_s32(deltas[4], deltas[5]); int32x4_t deltas78 = horizontal_add_2d_s32(deltas[7], deltas[8]); int64x2_t deltas45_s64 = vpaddlq_s32(deltas45); int64x2_t deltas78_s64 = vpaddlq_s32(deltas78); int64x2_t src = vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); int64x2_t dst = vaddq_s64(src, deltas45_s64); vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst); int32x4_t delta69 = horizontal_add_2d_s32(deltas[6], deltas[9]); int64x2_t delta69_s64 = vpaddlq_s32(delta69); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] + vgetq_lane_s64(delta69_s64, 0); // Row 3: 2 points vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3, vaddq_s64(dst, deltas78_s64)); // Row 4: 1 point H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] + vgetq_lane_s64(delta69_s64, 1); } while (++i < wiener_win); } static inline void compute_stats_win7_neon( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H) { const int32_t wiener_win = WIENER_WIN; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t w16 = width & ~15; const int32_t h8 = height & ~7; int16x8_t mask[2]; mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16); mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8); const int bit_depth = 8; int32_t i, j, x, y; const int32_t num_bit_left = 32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */; const int32_t h_allowed = (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0)); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? 
(height - height_t) : h_allowed; int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) }; int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { src[0] = vld1q_s16(s_t + x); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h); x += 16; } if (w16 != width) { src[0] = vld1q_s16(s_t + w16); src[1] = vld1q_s16(s_t + w16 + 8); dgd[0] = vld1q_s16(d_t + w16); dgd[1] = vld1q_s16(d_t + w16 + 8); src[0] = vandq_s16(src[0], mask[0]); src[1] = vandq_s16(src[1], mask[1]); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h); } s_t += s_stride; d_t += d_stride; } while (--y); sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]); sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]); sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]); sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]); sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]); sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]); sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1])); vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3])); vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5])); M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]); vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1])); vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3])); vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5])); H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]); #else M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]); M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]); M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]); M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]); M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]); M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]); M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]); H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]); H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]); H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]); H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]); H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]); H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]); H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. j = 1; do { const int16_t *d_t = d; int32_t height_t = 0; int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; do { const int32_t h_t = ((height - height_t) < h_allowed) ? 
(height - height_t) : h_allowed; int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) }; y = h_t; do { x = 0; while (x < w16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); x += 16; } if (w16 != width) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); dgd[0] = vandq_s16(dgd[0], mask[0]); dgd[1] = vandq_s16(dgd[1], mask[1]); stats_left_win7_neon(dgd, d_t + x, d_stride, row_h); } d_t += d_stride; } while (--y); sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]); sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]); sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]); sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]); sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]); sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]); height_t += h_t; } while (height_t < height); #if AOM_ARCH_AARCH64 int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]); int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2)); vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4)); vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4)); #else H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]); H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]); H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]); H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]); H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]); H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]); #endif // AOM_ARCH_AARCH64 } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. { const int16_t *d_t = d; // Pad to call transpose function. int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8], &ds[10]); load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9], &ds[11]); d_t += 6 * d_stride; step3_win7_neon(d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], deltas_tr[4], H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], deltas_tr[5], H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], deltas_tr[6], H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], deltas_tr[7], H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win, deltas_tr[8], deltas_tr[12], H + 5 * wiener_win * wiener_win2 + 5 * wiener_win); update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win, deltas_tr[9], deltas_tr[13], H + 6 * wiener_win * wiener_win2 + 6 * wiener_win); } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. 
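// The square whose top-left tap pair is (i, j) reuses the stats of the square
// at (i - 1, j - 1): only boundary samples differ, so each output is the
// corresponding previous-square value plus a delta built from a single leading
// column and the column `width` entries to its right (the dd[]/ds[] vectors
// loaded below), rather than a full re-accumulation. Illustrative scalar form
// of the left-edge updates (not part of the original implementation):
//   H[(i * wiener_win + r) * wiener_win2 + j * wiener_win] =
//       H[((i - 1) * wiener_win + r) * wiener_win2 + (j - 1) * wiener_win]
//           + delta[r];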
i = 1; do { j = i + 1; do { const int16_t *di = d + i - 1; const int16_t *dj = d + j - 1; int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) }; int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2]; dd[5] = vdupq_n_s16(0); // Initialize to avoid warning. const int16_t dd0_values[] = { di[0 * d_stride], di[1 * d_stride], di[2 * d_stride], di[3 * d_stride], di[4 * d_stride], di[5 * d_stride], 0, 0 }; dd[0] = vld1q_s16(dd0_values); const int16_t dd1_values[] = { di[0 * d_stride + width], di[1 * d_stride + width], di[2 * d_stride + width], di[3 * d_stride + width], di[4 * d_stride + width], di[5 * d_stride + width], 0, 0 }; dd[1] = vld1q_s16(dd1_values); const int16_t ds0_values[] = { dj[0 * d_stride], dj[1 * d_stride], dj[2 * d_stride], dj[3 * d_stride], dj[4 * d_stride], dj[5 * d_stride], 0, 0 }; ds[0] = vld1q_s16(ds0_values); int16_t ds1_values[] = { dj[0 * d_stride + width], dj[1 * d_stride + width], dj[2 * d_stride + width], dj[3 * d_stride + width], dj[4 * d_stride + width], dj[5 * d_stride + width], 0, 0 }; ds[1] = vld1q_s16(ds1_values); y = 0; while (y < h8) { // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6); dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7); dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6); dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7); // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6); ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6); ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7); load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]); load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]); load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]); load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]); load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]); load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]); load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]); load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]); load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]); load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]); load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]); load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]); madd_neon(&deltas[0], dd[0], ds[0]); madd_neon(&deltas[1], dd[1], ds[1]); madd_neon(&deltas[2], dd[0], ds[2]); madd_neon(&deltas[3], dd[1], ds[3]); madd_neon(&deltas[4], dd[0], ds[4]); madd_neon(&deltas[5], dd[1], ds[5]); madd_neon(&deltas[6], dd[0], ds[6]); madd_neon(&deltas[7], dd[1], ds[7]); madd_neon(&deltas[8], dd[0], ds[8]); madd_neon(&deltas[9], dd[1], ds[9]); madd_neon(&deltas[10], dd[0], ds[10]); madd_neon(&deltas[11], dd[1], ds[11]); madd_neon(&deltas[12], dd[0], ds[12]); madd_neon(&deltas[13], dd[1], ds[13]); madd_neon(&deltas[14], dd[2], ds[0]); madd_neon(&deltas[15], dd[3], ds[1]); madd_neon(&deltas[16], dd[4], ds[0]); madd_neon(&deltas[17], dd[5], ds[1]); madd_neon(&deltas[18], dd[6], ds[0]); madd_neon(&deltas[19], dd[7], ds[1]); madd_neon(&deltas[20], dd[8], ds[0]); madd_neon(&deltas[21], dd[9], ds[1]); madd_neon(&deltas[22], dd[10], ds[0]); madd_neon(&deltas[23], dd[11], ds[1]); madd_neon(&deltas[24], dd[12], ds[0]); madd_neon(&deltas[25], dd[13], ds[1]); dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2); dd[1] = vextq_s16(dd[13], 
vdupq_n_s16(0), 2); ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2); ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2); di += 8 * d_stride; dj += 8 * d_stride; y += 8; } deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]); deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]); deltas[2] = hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]); deltas[3] = hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]); deltas[4] = hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]); deltas[5] = hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]); deltas[6] = hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]); deltas[7] = hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]); deltas[0] = vsubq_s32(deltas[1], deltas[0]); deltas[1] = vsubq_s32(deltas[3], deltas[2]); deltas[2] = vsubq_s32(deltas[5], deltas[4]); deltas[3] = vsubq_s32(deltas[7], deltas[6]); if (h8 != height) { const int16_t ds0_vals[] = { dj[0 * d_stride], dj[0 * d_stride + width], dj[1 * d_stride], dj[1 * d_stride + width], dj[2 * d_stride], dj[2 * d_stride + width], dj[3 * d_stride], dj[3 * d_stride + width] }; ds[0] = vld1q_s16(ds0_vals); ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0); ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1); ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2); ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3); const int16_t dd4_vals[] = { -di[1 * d_stride], di[1 * d_stride + width], -di[2 * d_stride], di[2 * d_stride + width], -di[3 * d_stride], di[3 * d_stride + width], -di[4 * d_stride], di[4 * d_stride + width] }; dd[4] = vld1q_s16(dd4_vals); dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0); dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1); do { dd[0] = vdupq_n_s16(-di[0 * d_stride]); dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]); dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0]; ds[4] = vdupq_n_s16(dj[0 * d_stride]); ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]); ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0]; dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2); dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3); ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5); madd_neon_pairwise(&deltas[0], dd[0], ds[0]); madd_neon_pairwise(&deltas[1], dd[1], ds[1]); madd_neon_pairwise(&deltas[2], dd[4], ds[4]); madd_neon_pairwise(&deltas[3], dd[5], ds[5]); int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0); ds[0] = vextq_s16(ds[0], ds[1], 2); ds[1] = vextq_s16(ds[1], ds[0], 2); ds[1] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3)); int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0); dd[4] = vextq_s16(dd[4], dd[5], 2); dd[5] = vextq_s16(dd[5], dd[4], 2); dd[5] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3)); di += d_stride; dj += d_stride; } while (++y < height); } // Writing one more element on the top edge of a square falls to // the next square in the same row or the first element in the next // row, which will just be overwritten later. 
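// (Concretely, update_8_stats_neon() below stores 8 consecutive values even
// though only wiener_win == 7 of them belong to this square's top edge; the
// extra lane lands on the next square's edge, or on the first element of the
// next H row when j == wiener_win - 1, and is simply overwritten later.)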
update_8_stats_neon( H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, deltas[0], deltas[1], H + i * wiener_win * wiener_win2 + j * wiener_win); H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 2); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[2], 3); H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[3], 0); H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s32(deltas[3], 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 5: Derive other points of each square. No square in bottom row. i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } }; int16x8_t d_is[WIN_7]; int16x8_t d_ie[WIN_7]; int16x8_t d_js[WIN_7]; int16x8_t d_je[WIN_7]; x = 0; while (x < w16) { load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); x += 16; } if (w16 != width) { load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_is[8] = vandq_s16(d_is[8], mask[0]); d_is[9] = vandq_s16(d_is[9], mask[1]); d_is[10] = vandq_s16(d_is[10], mask[0]); d_is[11] = vandq_s16(d_is[11], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); d_ie[8] = vandq_s16(d_ie[8], mask[0]); d_ie[9] = vandq_s16(d_ie[9], mask[1]); d_ie[10] = vandq_s16(d_ie[10], mask[0]); d_ie[11] = vandq_s16(d_ie[11], mask[1]); derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas); } hadd_update_6_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_neon( H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4], H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 
1); hadd_update_6_stats_neon( H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5], H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. i = 0; do { const int16_t *const di = d + i; int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) }; int16x8_t d_is[WIN_7], d_ie[WIN_7]; x = 0; while (x < w16) { load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win7_neon(d_is, d_ie, deltas); x += 16; } if (w16 != width) { load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); d_is[0] = vandq_s16(d_is[0], mask[0]); d_is[1] = vandq_s16(d_is[1], mask[1]); d_is[2] = vandq_s16(d_is[2], mask[0]); d_is[3] = vandq_s16(d_is[3], mask[1]); d_is[4] = vandq_s16(d_is[4], mask[0]); d_is[5] = vandq_s16(d_is[5], mask[1]); d_is[6] = vandq_s16(d_is[6], mask[0]); d_is[7] = vandq_s16(d_is[7], mask[1]); d_is[8] = vandq_s16(d_is[8], mask[0]); d_is[9] = vandq_s16(d_is[9], mask[1]); d_is[10] = vandq_s16(d_is[10], mask[0]); d_is[11] = vandq_s16(d_is[11], mask[1]); d_ie[0] = vandq_s16(d_ie[0], mask[0]); d_ie[1] = vandq_s16(d_ie[1], mask[1]); d_ie[2] = vandq_s16(d_ie[2], mask[0]); d_ie[3] = vandq_s16(d_ie[3], mask[1]); d_ie[4] = vandq_s16(d_ie[4], mask[0]); d_ie[5] = vandq_s16(d_ie[5], mask[1]); d_ie[6] = vandq_s16(d_ie[6], mask[0]); d_ie[7] = vandq_s16(d_ie[7], mask[1]); d_ie[8] = vandq_s16(d_ie[8], mask[0]); d_ie[9] = vandq_s16(d_ie[9], mask[1]); d_ie[10] = vandq_s16(d_ie[10], mask[0]); d_ie[11] = vandq_s16(d_ie[11], mask[1]); derive_triangle_win7_neon(d_is, d_ie, deltas); } // Row 1: 6 points hadd_update_6_stats_neon( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); int32x4_t delta1710 = horizontal_add_2d_s32(deltas[17], deltas[10]); int32x4_t delta1516 = horizontal_add_2d_s32(deltas[15], deltas[16]); int64x2_t delta1710_s64 = vpaddlq_s32(delta1710); int64x2_t delta1516_s64 = vpaddlq_s32(delta1516); // Row 2: 5 points hadd_update_4_stats_neon( H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6, H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] + vgetq_lane_s64(delta1710_s64, 1); // Row 3: 4 points hadd_update_4_stats_neon( H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, deltas + 11, H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); // Row 4: 3 points int64x2_t h0 = vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4, vaddq_s64(h0, delta1516_s64)); H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] + vgetq_lane_s64(delta1710_s64, 0); int32x4_t delta1819 = horizontal_add_2d_s32(deltas[18], deltas[19]); int64x2_t delta1819_s64 = vpaddlq_s32(delta1819); // Row 5: 2 points int64x2_t h1 = vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4); vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5, vaddq_s64(h1, delta1819_s64)); // Row 6: 1 points H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] + horizontal_long_add_s32x4(deltas[20]); } while (++i < wiener_win); } static inline uint8_t find_average_neon(const uint8_t *src, 
int src_stride, int width, int height) { uint64_t sum = 0; if (width >= 16) { int h = 0; // We can accumulate up to 257 8-bit values in a 16-bit value, given // that each 16-bit vector has 8 elements, that means we can process up to // int(257*8/width) rows before we need to widen to 32-bit vector // elements. int h_overflow = 257 * 8 / width; int h_limit = height > h_overflow ? h_overflow : height; uint32x4_t avg_u32 = vdupq_n_u32(0); do { uint16x8_t avg_u16 = vdupq_n_u16(0); do { int j = width; const uint8_t *src_ptr = src; do { uint8x16_t s = vld1q_u8(src_ptr); avg_u16 = vpadalq_u8(avg_u16, s); j -= 16; src_ptr += 16; } while (j >= 16); if (j >= 8) { uint8x8_t s = vld1_u8(src_ptr); avg_u16 = vaddw_u8(avg_u16, s); j -= 8; src_ptr += 8; } // Scalar tail case. while (j > 0) { sum += src[width - j]; j--; } src += src_stride; } while (++h < h_limit); avg_u32 = vpadalq_u16(avg_u32, avg_u16); h_limit += h_overflow; h_limit = height > h_overflow ? h_overflow : height; } while (h < height); return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) / (width * height)); } if (width >= 8) { int h = 0; // We can accumulate up to 257 8-bit values in a 16-bit value, given // that each 16-bit vector has 4 elements, that means we can process up to // int(257*4/width) rows before we need to widen to 32-bit vector // elements. int h_overflow = 257 * 4 / width; int h_limit = height > h_overflow ? h_overflow : height; uint32x2_t avg_u32 = vdup_n_u32(0); do { uint16x4_t avg_u16 = vdup_n_u16(0); do { int j = width; const uint8_t *src_ptr = src; uint8x8_t s = vld1_u8(src_ptr); avg_u16 = vpadal_u8(avg_u16, s); j -= 8; src_ptr += 8; // Scalar tail case. while (j > 0) { sum += src[width - j]; j--; } src += src_stride; } while (++h < h_limit); avg_u32 = vpadal_u16(avg_u32, avg_u16); h_limit += h_overflow; h_limit = height > h_overflow ? h_overflow : height; } while (h < height); return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) / (width * height)); } int i = height; do { int j = 0; do { sum += src[j]; } while (++j < width); src += src_stride; } while (--i != 0); return (uint8_t)(sum / (width * height)); } static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, int16_t *buf_avg, int buf_avg_stride, int width, int height, int downsample_factor) { uint8x8_t avg_u8 = vdup_n_u8(avg); if (width > 8) { int i = 0; do { int j = width; const uint8_t *buf_ptr = buf; int16_t *buf_avg_ptr = buf_avg; do { uint8x8_t d = vld1_u8(buf_ptr); vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); j -= 8; buf_ptr += 8; buf_avg_ptr += 8; } while (j >= 8); while (j > 0) { *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg; buf_avg_ptr++; j--; } buf += buf_stride; buf_avg += buf_avg_stride; i += downsample_factor; } while (i < height); } else { // For width < 8, don't use Neon. 
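// (Note this scalar fallback is also taken for width == 8, since the vector
// path above is only entered when width > 8.)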
for (int i = 0; i < height; i = i + downsample_factor) { for (int j = 0; j < width; j++) { buf_avg[j] = (int16_t)buf[j] - (int16_t)avg; } buf += buf_stride; buf_avg += buf_avg_stride; } } } static inline void av1_compute_stats_downsampled_neon( int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); (void)dgd_avg; (void)src_avg; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = wiener_win >> 1; const int width = h_end - h_start; const int height = v_end - v_start; const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride; const uint8_t *src_start = src + h_start + v_start * src_stride; // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. const int vert_offset = v_start - wiener_halfwin; const int horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); // Since the height is not necessarily a multiple of the downsample factor, // the last line of src will be scaled according to how many rows remain. int downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int downsampled_height = height / downsample_factor; int downsample_remainder = height % downsample_factor; memset(M, 0, wiener_win2 * sizeof(*M)); memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H)); // Calculate the M and H matrices for the normal and downsampled cases. if (downsampled_height > 0) { if (wiener_win == WIENER_WIN) { compute_stats_win7_downsampled_neon( dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride, avg, M, H, downsample_factor); } else { compute_stats_win5_downsampled_neon( dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride, avg, M, H, downsample_factor); } } // Accumulate the remaining last rows in the downsampled case. 
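// As a worked example (hypothetical numbers): height = 21 with
// WIENER_STATS_DOWNSAMPLE_FACTOR = 4 gives downsampled_height = 5 and
// downsample_remainder = 1, so the call above covers rows 0, 4, 8, 12 and 16
// (each scaled by the factor of 4), and the call below accumulates the single
// remaining row at remainder_offset = 20 with a scale of 1.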
if (downsample_remainder > 0) { int remainder_offset = height - downsample_remainder; if (wiener_win == WIENER_WIN) { compute_stats_win7_downsampled_neon( dgd_win + remainder_offset * dgd_stride, src_start + remainder_offset * src_stride, width, 1, dgd_stride, src_stride, avg, M, H, downsample_remainder); } else { compute_stats_win5_downsampled_neon( dgd_win + remainder_offset * dgd_stride, src_start + remainder_offset * src_stride, width, 1, dgd_stride, src_stride, avg, M, H, downsample_remainder); } } } void av1_compute_stats_neon(int32_t wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int32_t h_start, int32_t h_end, int32_t v_start, int32_t v_end, int32_t dgd_stride, int32_t src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); if (use_downsampled_wiener_stats) { av1_compute_stats_downsampled_neon( wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, use_downsampled_wiener_stats); return; } const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t wiener_halfwin = (wiener_win >> 1); const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride; const uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; const int32_t s_stride = (width + 15) & ~15; compute_sub_avg(src + v_start * src_stride + h_start, src_stride, avg, src_avg, s_stride, width, height, 1); compute_sub_avg( dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, dgd_stride, avg, dgd_avg, d_stride, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); if (wiener_win == WIENER_WIN) { compute_stats_win7_neon(dgd_avg, d_stride, src_avg, s_stride, width, height, M, H); } else if (wiener_win == WIENER_WIN_CHROMA) { compute_stats_win5_neon(dgd_avg, d_stride, src_avg, s_stride, width, height, M, H); } // H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops. 
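// A scalar equivalent of that mirroring copy would be (illustrative only,
// assuming diagonal_copy_stats_neon() fills the lower triangle from the upper
// one as the comment above describes):
//   for (int k = 0; k < wiener_win2; ++k)
//     for (int l = k + 1; l < wiener_win2; ++l)
//       H[l * wiener_win2 + k] = H[k * wiener_win2 + l];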
diagonal_copy_stats_neon(wiener_win2, H); } static inline void calc_proj_params_r0_r1_neon( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; int64x2_t h00_lo = vdupq_n_s64(0); int64x2_t h00_hi = vdupq_n_s64(0); int64x2_t h11_lo = vdupq_n_s64(0); int64x2_t h11_hi = vdupq_n_s64(0); int64x2_t h01_lo = vdupq_n_s64(0); int64x2_t h01_hi = vdupq_n_s64(0); int64x2_t c0_lo = vdupq_n_s64(0); int64x2_t c0_hi = vdupq_n_s64(0); int64x2_t c1_lo = vdupq_n_s64(0); int64x2_t c1_hi = vdupq_n_s64(0); do { const uint8_t *src_ptr = src8; const uint8_t *dat_ptr = dat8; int32_t *flt0_ptr = flt0; int32_t *flt1_ptr = flt1; int w = width; do { uint8x8_t s = vld1_u8(src_ptr); uint8x8_t d = vld1_u8(dat_ptr); int32x4_t f0_lo = vld1q_s32(flt0_ptr); int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); int32x4_t f1_lo = vld1q_s32(flt1_ptr); int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt0_ptr += 8; flt1_ptr += 8; w -= 8; } while (w != 0); src8 += src_stride; dat8 += dat_stride; flt0 += flt0_stride; flt1 += flt1_stride; } while (--height != 0); H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; H[1][0] = H[0][1]; C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; } static inline void calc_proj_params_r0_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t 
*dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; int64x2_t h00_lo = vdupq_n_s64(0); int64x2_t h00_hi = vdupq_n_s64(0); int64x2_t c0_lo = vdupq_n_s64(0); int64x2_t c0_hi = vdupq_n_s64(0); do { const uint8_t *src_ptr = src8; const uint8_t *dat_ptr = dat8; int32_t *flt0_ptr = flt0; int w = width; do { uint8x8_t s = vld1_u8(src_ptr); uint8x8_t d = vld1_u8(dat_ptr); int32x4_t f0_lo = vld1q_s32(flt0_ptr); int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt0_ptr += 8; w -= 8; } while (w != 0); src8 += src_stride; dat8 += dat_stride; flt0 += flt0_stride; } while (--height != 0); H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; } static inline void calc_proj_params_r1_neon(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { assert(width % 8 == 0); const int size = width * height; int64x2_t h11_lo = vdupq_n_s64(0); int64x2_t h11_hi = vdupq_n_s64(0); int64x2_t c1_lo = vdupq_n_s64(0); int64x2_t c1_hi = vdupq_n_s64(0); do { const uint8_t *src_ptr = src8; const uint8_t *dat_ptr = dat8; int32_t *flt1_ptr = flt1; int w = width; do { uint8x8_t s = vld1_u8(src_ptr); uint8x8_t d = vld1_u8(dat_ptr); int32x4_t f1_lo = vld1q_s32(flt1_ptr); int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); src_ptr += 8; dat_ptr += 8; flt1_ptr += 8; w -= 8; } while (w != 0); src8 += src_stride; dat8 += dat_stride; flt1 += flt1_stride; } while (--height != 0); H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; C[1] 
= horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; }
// The function calls 3 subfunctions for the following cases:
// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
// of C and H need to be computed.
// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
// non-zero and need to be computed.
// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
// non-zero and need to be computed.
void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
                               int src_stride, const uint8_t *dat8,
                               int dat_stride, int32_t *flt0, int flt0_stride,
                               int32_t *flt1, int flt1_stride, int64_t H[2][2],
                               int64_t C[2], const sgr_params_type *params) {
  if ((params->r[0] > 0) && (params->r[1] > 0)) {
    calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
                                dat_stride, flt0, flt0_stride, flt1,
                                flt1_stride, H, C);
  } else if (params->r[0] > 0) {
    calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
                             flt0, flt0_stride, H, C);
  } else if (params->r[1] > 0) {
    calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
                             flt1, flt1_stride, H, C);
  }
}
aom-3.12.1/av1/encoder/arm/pickrst_neon.h000066400000000000000000001427621477627663500201500ustar00rootroot00000000000000/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
#define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
#include <arm_neon.h>
#include "av1/common/restoration.h"
#define WIN_7 ((WIENER_WIN - 1) * 2)
#define WIN_CHROMA ((WIENER_WIN_CHROMA - 1) * 2)
// Aligned sizes for Wiener filters.
#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
// Compute 8 values of M (cross correlation) for a single source pixel and
// accumulate.
static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
                                   int16x8_t dgd_avg) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);
  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);
  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}
// Compute 8 values of M (cross correlation) for two source pixels and
// accumulate.
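// In scalar form, and assuming src_avg0/src_avg1 each hold one broadcast
// source sample (as the "two source pixels" description above suggests), this
// helper is roughly (illustrative only):
//   for (int k = 0; k < 8; ++k)
//     M_s32[k] += src0 * dgd_avg0[k] + src1 * dgd_avg1[k];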
static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, int16x4_t src_avg1, int16x8_t dgd_avg0, int16x8_t dgd_avg1) { int32x4_t lo = vld1q_s32(M_s32 + 0); int32x4_t hi = vld1q_s32(M_s32 + 4); lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0); hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0); lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1); hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1); vst1q_s32(M_s32 + 0, lo); vst1q_s32(M_s32 + 4, hi); } static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, int width, int height) { for (int i = 0; i < height; i += 4) { int16x4_t di = vld1_s16(dgd_avg + i); for (int j = i; j < width; j += 4) { int16x4_t dj = vld1_s16(dgd_avg + j); int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j); int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j); int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j); int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j); h0 = vmlal_lane_s16(h0, dj, di, 0); h1 = vmlal_lane_s16(h1, dj, di, 1); h2 = vmlal_lane_s16(h2, dj, di, 2); h3 = vmlal_lane_s16(h3, dj, di, 3); vst1q_s32(H_s32 + 0 * width + j, h0); vst1q_s32(H_s32 + 1 * width + j, h1); vst1q_s32(H_s32 + 2 * width + j, h2); vst1q_s32(H_s32 + 3 * width + j, h3); } H_s32 += 4 * width; } } static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, const int16_t *dgd_avg1) { for (int i = 0; i < 24; i += 4) { int16x4_t di0 = vld1_s16(dgd_avg0 + i); int16x4_t di1 = vld1_s16(dgd_avg1 + i); for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) { int16x4_t dj0 = vld1_s16(dgd_avg0 + j); int16x4_t dj1 = vld1_s16(dgd_avg1 + j); int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j); int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j); int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j); int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j); h0 = vmlal_lane_s16(h0, dj0, di0, 0); h0 = vmlal_lane_s16(h0, dj1, di1, 0); h1 = vmlal_lane_s16(h1, dj0, di0, 1); h1 = vmlal_lane_s16(h1, dj1, di1, 1); h2 = vmlal_lane_s16(h2, dj0, di0, 2); h2 = vmlal_lane_s16(h2, dj1, di1, 2); h3 = vmlal_lane_s16(h3, dj0, di0, 3); h3 = vmlal_lane_s16(h3, dj1, di1, 3); vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0); vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1); vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2); vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3); } H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2; } } static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, const int16_t *dgd_avg1) { for (int i = 0; i < 48; i += 4) { int16x4_t di0 = vld1_s16(dgd_avg0 + i); int16x4_t di1 = vld1_s16(dgd_avg1 + i); int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i); int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i); int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i); int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i); h0 = vmlal_lane_s16(h0, di0, di0, 0); h0 = vmlal_lane_s16(h0, di1, di1, 0); h1 = vmlal_lane_s16(h1, di0, di0, 1); h1 = vmlal_lane_s16(h1, di1, di1, 1); h2 = vmlal_lane_s16(h2, di0, di0, 2); h2 = vmlal_lane_s16(h2, di1, di1, 2); h3 = vmlal_lane_s16(h3, di0, di0, 3); h3 = vmlal_lane_s16(h3, di1, di1, 3); vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0); vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1); vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2); vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3); for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) { int16x4_t dj0 = vld1_s16(dgd_avg0 + j); 
int16x4_t dj1 = vld1_s16(dgd_avg1 + j); h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j); h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j); h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j); h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j); h0 = vmlal_lane_s16(h0, dj0, di0, 0); h0 = vmlal_lane_s16(h0, dj1, di1, 0); h1 = vmlal_lane_s16(h1, dj0, di0, 1); h1 = vmlal_lane_s16(h1, dj1, di1, 1); h2 = vmlal_lane_s16(h2, dj0, di0, 2); h2 = vmlal_lane_s16(h2, dj1, di1, 2); h3 = vmlal_lane_s16(h3, dj0, di0, 3); h3 = vmlal_lane_s16(h3, dj1, di1, 3); vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0); vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1); vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2); vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3); } H_s32 += 4 * WIENER_WIN2_ALIGN2; } } // Widen 32-bit src data and accumulate into 64-bit dst. Clear src data. static inline void accumulate_and_clear(int64_t *dst, int32_t *src, int length) { do { int32x4_t s32 = vld1q_s32(src); vst1q_s32(src, vdupq_n_s32(0)); src += 4; int64x2_t d_lo = vld1q_s64(dst + 0); int64x2_t d_hi = vld1q_s64(dst + 2); d_lo = vaddw_s32(d_lo, vget_low_s32(s32)); d_hi = vaddw_s32(d_hi, vget_high_s32(s32)); vst1q_s64(dst + 0, d_lo); vst1q_s64(dst + 2, d_hi); dst += 4; length -= 4; } while (length > 0); } // clang-format off // Constant pool to act as a mask to zero n top elements in an int16x8_t vector. // The index we load from depends on n. static const int16_t mask_16bit[32] = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; // clang-format on static inline void madd_neon_pairwise(int32x4_t *sum, const int16x8_t src, const int16x8_t dgd) { const int32x4_t sd = horizontal_add_2d_s32(vmull_s16(vget_low_s16(src), vget_low_s16(dgd)), vmull_s16(vget_high_s16(src), vget_high_s16(dgd))); *sum = vaddq_s32(*sum, sd); } static inline void madd_neon(int32x4_t *sum, const int16x8_t src, const int16x8_t dgd) { *sum = vmlal_s16(*sum, vget_low_s16(src), vget_low_s16(dgd)); *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd)); } static inline void msub_neon(int32x4_t *sum, const int16x8_t src, const int16x8_t dgd) { *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd)); *sum = vmlsl_s16(*sum, vget_high_s16(src), vget_high_s16(dgd)); } static inline void compute_delta_step3(int32x4_t *sum0, int32x4_t *sum1, const int16x8_t src0, const int16x8_t src1, const int16x8_t dgd0, const int16x8_t dgd1) { *sum0 = vmlsl_s16(*sum0, vget_low_s16(src0), vget_low_s16(dgd0)); *sum0 = vmlal_s16(*sum0, vget_low_s16(src1), vget_low_s16(dgd1)); *sum1 = vmlsl_s16(*sum1, vget_high_s16(src0), vget_high_s16(dgd0)); *sum1 = vmlal_s16(*sum1, vget_high_s16(src1), vget_high_s16(dgd1)); } static inline int32x4_t hadd_four_32_neon(const int32x4_t src0, const int32x4_t src1, const int32x4_t src2, const int32x4_t src3) { int32x4_t src[4] = { src0, src1, src2, src3 }; return horizontal_add_4d_s32x4(src); } static inline void update_4_stats_neon(const int64_t *const src, const int32x4_t delta, int64_t *const dst) { const int64x2_t s1 = vld1q_s64(src); const int64x2_t s2 = vld1q_s64(src + 2); const int64x2_t d1 = vaddw_s32(s1, vget_low_s32(delta)); const int64x2_t d2 = vaddw_s32(s2, vget_high_s32(delta)); vst1q_s64(dst, d1); vst1q_s64(dst + 2, d2); } static inline void load_more_16_neon(const int16_t *const src, const int32_t width, const int16x8_t org[2], int16x8_t dst[2]) { int16x8_t s0 = vld1q_dup_s16(src); int16x8_t s1 = 
vld1q_dup_s16(src + width); dst[0] = vextq_s16(org[0], s0, 1); dst[1] = vextq_s16(org[1], s1, 1); } static inline void stats_top_win5_neon(const int16x8_t src[2], const int16x8_t dgd[2], const int16_t *const d, const int32_t d_stride, int32x4_t *sum_m, int32x4_t *sum_h) { int16x8_t dgds[WIENER_WIN_CHROMA * 2]; load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8]); load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9]); madd_neon(&sum_m[0], src[0], dgds[0]); madd_neon(&sum_m[0], src[1], dgds[1]); madd_neon(&sum_m[1], src[0], dgds[2]); madd_neon(&sum_m[1], src[1], dgds[3]); madd_neon(&sum_m[2], src[0], dgds[4]); madd_neon(&sum_m[2], src[1], dgds[5]); madd_neon(&sum_m[3], src[0], dgds[6]); madd_neon(&sum_m[3], src[1], dgds[7]); madd_neon(&sum_m[4], src[0], dgds[8]); madd_neon(&sum_m[4], src[1], dgds[9]); madd_neon(&sum_h[0], dgd[0], dgds[0]); madd_neon(&sum_h[0], dgd[1], dgds[1]); madd_neon(&sum_h[1], dgd[0], dgds[2]); madd_neon(&sum_h[1], dgd[1], dgds[3]); madd_neon(&sum_h[2], dgd[0], dgds[4]); madd_neon(&sum_h[2], dgd[1], dgds[5]); madd_neon(&sum_h[3], dgd[0], dgds[6]); madd_neon(&sum_h[3], dgd[1], dgds[7]); madd_neon(&sum_h[4], dgd[0], dgds[8]); madd_neon(&sum_h[4], dgd[1], dgds[9]); } static inline void stats_left_win5_neon(const int16x8_t src[2], const int16_t *d, const int32_t d_stride, int32x4_t *sum) { int16x8_t dgds[WIN_CHROMA]; load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6]); load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7]); madd_neon(&sum[0], src[0], dgds[0]); madd_neon(&sum[0], src[1], dgds[1]); madd_neon(&sum[1], src[0], dgds[2]); madd_neon(&sum[1], src[1], dgds[3]); madd_neon(&sum[2], src[0], dgds[4]); madd_neon(&sum[2], src[1], dgds[5]); madd_neon(&sum[3], src[0], dgds[6]); madd_neon(&sum[3], src[1], dgds[7]); } static inline void derive_square_win5_neon( const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js, const int16x8_t *d_je, int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) { msub_neon(&deltas[0][0], d_is[0], d_js[0]); msub_neon(&deltas[0][0], d_is[1], d_js[1]); msub_neon(&deltas[0][1], d_is[0], d_js[2]); msub_neon(&deltas[0][1], d_is[1], d_js[3]); msub_neon(&deltas[0][2], d_is[0], d_js[4]); msub_neon(&deltas[0][2], d_is[1], d_js[5]); msub_neon(&deltas[0][3], d_is[0], d_js[6]); msub_neon(&deltas[0][3], d_is[1], d_js[7]); msub_neon(&deltas[1][0], d_is[2], d_js[0]); msub_neon(&deltas[1][0], d_is[3], d_js[1]); msub_neon(&deltas[1][1], d_is[2], d_js[2]); msub_neon(&deltas[1][1], d_is[3], d_js[3]); msub_neon(&deltas[1][2], d_is[2], d_js[4]); msub_neon(&deltas[1][2], d_is[3], d_js[5]); msub_neon(&deltas[1][3], d_is[2], d_js[6]); msub_neon(&deltas[1][3], d_is[3], d_js[7]); msub_neon(&deltas[2][0], d_is[4], d_js[0]); msub_neon(&deltas[2][0], d_is[5], d_js[1]); msub_neon(&deltas[2][1], d_is[4], d_js[2]); msub_neon(&deltas[2][1], d_is[5], d_js[3]); msub_neon(&deltas[2][2], d_is[4], d_js[4]); msub_neon(&deltas[2][2], d_is[5], d_js[5]); msub_neon(&deltas[2][3], d_is[4], d_js[6]); msub_neon(&deltas[2][3], d_is[5], d_js[7]); msub_neon(&deltas[3][0], d_is[6], d_js[0]); msub_neon(&deltas[3][0], d_is[7], d_js[1]); msub_neon(&deltas[3][1], d_is[6], d_js[2]); msub_neon(&deltas[3][1], d_is[7], d_js[3]); msub_neon(&deltas[3][2], d_is[6], d_js[4]); msub_neon(&deltas[3][2], d_is[7], d_js[5]); msub_neon(&deltas[3][3], d_is[6], d_js[6]); msub_neon(&deltas[3][3], d_is[7], d_js[7]); madd_neon(&deltas[0][0], d_ie[0], d_je[0]); madd_neon(&deltas[0][0], 
d_ie[1], d_je[1]); madd_neon(&deltas[0][1], d_ie[0], d_je[2]); madd_neon(&deltas[0][1], d_ie[1], d_je[3]); madd_neon(&deltas[0][2], d_ie[0], d_je[4]); madd_neon(&deltas[0][2], d_ie[1], d_je[5]); madd_neon(&deltas[0][3], d_ie[0], d_je[6]); madd_neon(&deltas[0][3], d_ie[1], d_je[7]); madd_neon(&deltas[1][0], d_ie[2], d_je[0]); madd_neon(&deltas[1][0], d_ie[3], d_je[1]); madd_neon(&deltas[1][1], d_ie[2], d_je[2]); madd_neon(&deltas[1][1], d_ie[3], d_je[3]); madd_neon(&deltas[1][2], d_ie[2], d_je[4]); madd_neon(&deltas[1][2], d_ie[3], d_je[5]); madd_neon(&deltas[1][3], d_ie[2], d_je[6]); madd_neon(&deltas[1][3], d_ie[3], d_je[7]); madd_neon(&deltas[2][0], d_ie[4], d_je[0]); madd_neon(&deltas[2][0], d_ie[5], d_je[1]); madd_neon(&deltas[2][1], d_ie[4], d_je[2]); madd_neon(&deltas[2][1], d_ie[5], d_je[3]); madd_neon(&deltas[2][2], d_ie[4], d_je[4]); madd_neon(&deltas[2][2], d_ie[5], d_je[5]); madd_neon(&deltas[2][3], d_ie[4], d_je[6]); madd_neon(&deltas[2][3], d_ie[5], d_je[7]); madd_neon(&deltas[3][0], d_ie[6], d_je[0]); madd_neon(&deltas[3][0], d_ie[7], d_je[1]); madd_neon(&deltas[3][1], d_ie[6], d_je[2]); madd_neon(&deltas[3][1], d_ie[7], d_je[3]); madd_neon(&deltas[3][2], d_ie[6], d_je[4]); madd_neon(&deltas[3][2], d_ie[7], d_je[5]); madd_neon(&deltas[3][3], d_ie[6], d_je[6]); madd_neon(&deltas[3][3], d_ie[7], d_je[7]); } static inline void load_square_win5_neon(const int16_t *const di, const int16_t *const dj, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js, int16x8_t *d_je) { load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]); load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]); load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]); load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]); load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6]); load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7]); load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6]); load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7]); } static inline void update_5_stats_neon(const int64_t *const src, const int32x4_t delta, const int64_t delta4, int64_t *const dst) { update_4_stats_neon(src + 0, delta, dst + 0); dst[4] = src[4] + delta4; } static inline void compute_delta_step3_two_lines(int32x4_t *sum, const int16x8_t src, const int16x8_t dgd) { *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd)); *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd)); } static inline void step3_win5_neon(const int16_t *d, const int32_t d_stride, const int32_t width, const int32_t height, int16x8_t *ds, int32x4_t *deltas) { int32_t y = height; do { ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width); ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width); compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]); compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]); compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]); compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]); compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]); compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]); compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]); compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]); compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]); compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]); ds[0] = ds[2]; ds[1] = ds[3]; ds[2] = 
ds[4]; ds[3] = ds[5]; d += 2 * d_stride; y -= 2; } while (y); } static inline void step3_win5_oneline_neon(const int16_t **const d, const int32_t d_stride, const int32_t width, const int32_t height, int16x8_t *ds, int32x4_t *deltas) { int32_t y = height; do { ds[8] = vld1q_s16(*d); ds[9] = vld1q_s16(*d + width); compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]); compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]); compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]); compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]); compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]); ds[0] = ds[2]; ds[1] = ds[3]; ds[2] = ds[4]; ds[3] = ds[5]; ds[4] = ds[6]; ds[5] = ds[7]; ds[6] = ds[8]; ds[7] = ds[9]; *d += d_stride; } while (--y); } static inline void derive_triangle_win5_neon(const int16x8_t *d_is, const int16x8_t *d_ie, int32x4_t *deltas) { msub_neon(&deltas[0], d_is[0], d_is[0]); msub_neon(&deltas[0], d_is[1], d_is[1]); msub_neon(&deltas[1], d_is[0], d_is[2]); msub_neon(&deltas[1], d_is[1], d_is[3]); msub_neon(&deltas[2], d_is[0], d_is[4]); msub_neon(&deltas[2], d_is[1], d_is[5]); msub_neon(&deltas[3], d_is[0], d_is[6]); msub_neon(&deltas[3], d_is[1], d_is[7]); msub_neon(&deltas[4], d_is[2], d_is[2]); msub_neon(&deltas[4], d_is[3], d_is[3]); msub_neon(&deltas[5], d_is[2], d_is[4]); msub_neon(&deltas[5], d_is[3], d_is[5]); msub_neon(&deltas[6], d_is[2], d_is[6]); msub_neon(&deltas[6], d_is[3], d_is[7]); msub_neon(&deltas[7], d_is[4], d_is[4]); msub_neon(&deltas[7], d_is[5], d_is[5]); msub_neon(&deltas[8], d_is[4], d_is[6]); msub_neon(&deltas[8], d_is[5], d_is[7]); msub_neon(&deltas[9], d_is[6], d_is[6]); msub_neon(&deltas[9], d_is[7], d_is[7]); madd_neon(&deltas[0], d_ie[0], d_ie[0]); madd_neon(&deltas[0], d_ie[1], d_ie[1]); madd_neon(&deltas[1], d_ie[0], d_ie[2]); madd_neon(&deltas[1], d_ie[1], d_ie[3]); madd_neon(&deltas[2], d_ie[0], d_ie[4]); madd_neon(&deltas[2], d_ie[1], d_ie[5]); madd_neon(&deltas[3], d_ie[0], d_ie[6]); madd_neon(&deltas[3], d_ie[1], d_ie[7]); madd_neon(&deltas[4], d_ie[2], d_ie[2]); madd_neon(&deltas[4], d_ie[3], d_ie[3]); madd_neon(&deltas[5], d_ie[2], d_ie[4]); madd_neon(&deltas[5], d_ie[3], d_ie[5]); madd_neon(&deltas[6], d_ie[2], d_ie[6]); madd_neon(&deltas[6], d_ie[3], d_ie[7]); madd_neon(&deltas[7], d_ie[4], d_ie[4]); madd_neon(&deltas[7], d_ie[5], d_ie[5]); madd_neon(&deltas[8], d_ie[4], d_ie[6]); madd_neon(&deltas[8], d_ie[5], d_ie[7]); madd_neon(&deltas[9], d_ie[6], d_ie[6]); madd_neon(&deltas[9], d_ie[7], d_ie[7]); } static inline void load_triangle_win5_neon(const int16_t *const di, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie) { load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]); load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]); load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6]); load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7]); } static inline void sub_deltas_step4(int16x8_t *A, int16x8_t *B, int32x4_t *deltas) { deltas[0] = vmlsl_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0])); deltas[0] = vmlsl_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0])); deltas[1] = vmlsl_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1])); deltas[1] = vmlsl_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1])); deltas[2] = vmlsl_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2])); deltas[2] = 
vmlsl_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2])); deltas[3] = vmlsl_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3])); deltas[3] = vmlsl_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3])); deltas[4] = vmlsl_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4])); deltas[4] = vmlsl_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4])); deltas[5] = vmlsl_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0])); deltas[5] = vmlsl_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0])); deltas[6] = vmlsl_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0])); deltas[6] = vmlsl_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0])); deltas[7] = vmlsl_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0])); deltas[7] = vmlsl_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0])); deltas[8] = vmlsl_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0])); deltas[8] = vmlsl_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0])); } static inline void add_deltas_step4(int16x8_t *A, int16x8_t *B, int32x4_t *deltas) { deltas[0] = vmlal_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0])); deltas[0] = vmlal_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0])); deltas[1] = vmlal_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1])); deltas[1] = vmlal_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1])); deltas[2] = vmlal_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2])); deltas[2] = vmlal_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2])); deltas[3] = vmlal_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3])); deltas[3] = vmlal_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3])); deltas[4] = vmlal_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4])); deltas[4] = vmlal_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4])); deltas[5] = vmlal_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0])); deltas[5] = vmlal_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0])); deltas[6] = vmlal_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0])); deltas[6] = vmlal_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0])); deltas[7] = vmlal_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0])); deltas[7] = vmlal_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0])); deltas[8] = vmlal_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0])); deltas[8] = vmlal_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0])); } static inline void stats_top_win7_neon(const int16x8_t src[2], const int16x8_t dgd[2], const int16_t *const d, const int32_t d_stride, int32x4_t *sum_m, int32x4_t *sum_h) { int16x8_t dgds[WIENER_WIN * 2]; load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8], &dgds[10], &dgds[12]); load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9], &dgds[11], &dgds[13]); madd_neon(&sum_m[0], src[0], dgds[0]); madd_neon(&sum_m[0], src[1], dgds[1]); madd_neon(&sum_m[1], src[0], dgds[2]); madd_neon(&sum_m[1], src[1], dgds[3]); madd_neon(&sum_m[2], src[0], dgds[4]); madd_neon(&sum_m[2], src[1], dgds[5]); madd_neon(&sum_m[3], src[0], dgds[6]); madd_neon(&sum_m[3], src[1], dgds[7]); madd_neon(&sum_m[4], src[0], dgds[8]); madd_neon(&sum_m[4], src[1], dgds[9]); madd_neon(&sum_m[5], src[0], dgds[10]); madd_neon(&sum_m[5], src[1], dgds[11]); madd_neon(&sum_m[6], src[0], dgds[12]); madd_neon(&sum_m[6], src[1], dgds[13]); madd_neon(&sum_h[0], dgd[0], dgds[0]); madd_neon(&sum_h[0], dgd[1], dgds[1]); madd_neon(&sum_h[1], dgd[0], dgds[2]); madd_neon(&sum_h[1], dgd[1], dgds[3]); madd_neon(&sum_h[2], dgd[0], 
dgds[4]); madd_neon(&sum_h[2], dgd[1], dgds[5]); madd_neon(&sum_h[3], dgd[0], dgds[6]); madd_neon(&sum_h[3], dgd[1], dgds[7]); madd_neon(&sum_h[4], dgd[0], dgds[8]); madd_neon(&sum_h[4], dgd[1], dgds[9]); madd_neon(&sum_h[5], dgd[0], dgds[10]); madd_neon(&sum_h[5], dgd[1], dgds[11]); madd_neon(&sum_h[6], dgd[0], dgds[12]); madd_neon(&sum_h[6], dgd[1], dgds[13]); } static inline void derive_square_win7_neon(const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js, const int16x8_t *d_je, int32x4_t deltas[][WIN_7]) { msub_neon(&deltas[0][0], d_is[0], d_js[0]); msub_neon(&deltas[0][0], d_is[1], d_js[1]); msub_neon(&deltas[0][1], d_is[0], d_js[2]); msub_neon(&deltas[0][1], d_is[1], d_js[3]); msub_neon(&deltas[0][2], d_is[0], d_js[4]); msub_neon(&deltas[0][2], d_is[1], d_js[5]); msub_neon(&deltas[0][3], d_is[0], d_js[6]); msub_neon(&deltas[0][3], d_is[1], d_js[7]); msub_neon(&deltas[0][4], d_is[0], d_js[8]); msub_neon(&deltas[0][4], d_is[1], d_js[9]); msub_neon(&deltas[0][5], d_is[0], d_js[10]); msub_neon(&deltas[0][5], d_is[1], d_js[11]); msub_neon(&deltas[1][0], d_is[2], d_js[0]); msub_neon(&deltas[1][0], d_is[3], d_js[1]); msub_neon(&deltas[1][1], d_is[2], d_js[2]); msub_neon(&deltas[1][1], d_is[3], d_js[3]); msub_neon(&deltas[1][2], d_is[2], d_js[4]); msub_neon(&deltas[1][2], d_is[3], d_js[5]); msub_neon(&deltas[1][3], d_is[2], d_js[6]); msub_neon(&deltas[1][3], d_is[3], d_js[7]); msub_neon(&deltas[1][4], d_is[2], d_js[8]); msub_neon(&deltas[1][4], d_is[3], d_js[9]); msub_neon(&deltas[1][5], d_is[2], d_js[10]); msub_neon(&deltas[1][5], d_is[3], d_js[11]); msub_neon(&deltas[2][0], d_is[4], d_js[0]); msub_neon(&deltas[2][0], d_is[5], d_js[1]); msub_neon(&deltas[2][1], d_is[4], d_js[2]); msub_neon(&deltas[2][1], d_is[5], d_js[3]); msub_neon(&deltas[2][2], d_is[4], d_js[4]); msub_neon(&deltas[2][2], d_is[5], d_js[5]); msub_neon(&deltas[2][3], d_is[4], d_js[6]); msub_neon(&deltas[2][3], d_is[5], d_js[7]); msub_neon(&deltas[2][4], d_is[4], d_js[8]); msub_neon(&deltas[2][4], d_is[5], d_js[9]); msub_neon(&deltas[2][5], d_is[4], d_js[10]); msub_neon(&deltas[2][5], d_is[5], d_js[11]); msub_neon(&deltas[3][0], d_is[6], d_js[0]); msub_neon(&deltas[3][0], d_is[7], d_js[1]); msub_neon(&deltas[3][1], d_is[6], d_js[2]); msub_neon(&deltas[3][1], d_is[7], d_js[3]); msub_neon(&deltas[3][2], d_is[6], d_js[4]); msub_neon(&deltas[3][2], d_is[7], d_js[5]); msub_neon(&deltas[3][3], d_is[6], d_js[6]); msub_neon(&deltas[3][3], d_is[7], d_js[7]); msub_neon(&deltas[3][4], d_is[6], d_js[8]); msub_neon(&deltas[3][4], d_is[7], d_js[9]); msub_neon(&deltas[3][5], d_is[6], d_js[10]); msub_neon(&deltas[3][5], d_is[7], d_js[11]); msub_neon(&deltas[4][0], d_is[8], d_js[0]); msub_neon(&deltas[4][0], d_is[9], d_js[1]); msub_neon(&deltas[4][1], d_is[8], d_js[2]); msub_neon(&deltas[4][1], d_is[9], d_js[3]); msub_neon(&deltas[4][2], d_is[8], d_js[4]); msub_neon(&deltas[4][2], d_is[9], d_js[5]); msub_neon(&deltas[4][3], d_is[8], d_js[6]); msub_neon(&deltas[4][3], d_is[9], d_js[7]); msub_neon(&deltas[4][4], d_is[8], d_js[8]); msub_neon(&deltas[4][4], d_is[9], d_js[9]); msub_neon(&deltas[4][5], d_is[8], d_js[10]); msub_neon(&deltas[4][5], d_is[9], d_js[11]); msub_neon(&deltas[5][0], d_is[10], d_js[0]); msub_neon(&deltas[5][0], d_is[11], d_js[1]); msub_neon(&deltas[5][1], d_is[10], d_js[2]); msub_neon(&deltas[5][1], d_is[11], d_js[3]); msub_neon(&deltas[5][2], d_is[10], d_js[4]); msub_neon(&deltas[5][2], d_is[11], d_js[5]); msub_neon(&deltas[5][3], d_is[10], d_js[6]); msub_neon(&deltas[5][3], d_is[11], d_js[7]); 
msub_neon(&deltas[5][4], d_is[10], d_js[8]); msub_neon(&deltas[5][4], d_is[11], d_js[9]); msub_neon(&deltas[5][5], d_is[10], d_js[10]); msub_neon(&deltas[5][5], d_is[11], d_js[11]); madd_neon(&deltas[0][0], d_ie[0], d_je[0]); madd_neon(&deltas[0][0], d_ie[1], d_je[1]); madd_neon(&deltas[0][1], d_ie[0], d_je[2]); madd_neon(&deltas[0][1], d_ie[1], d_je[3]); madd_neon(&deltas[0][2], d_ie[0], d_je[4]); madd_neon(&deltas[0][2], d_ie[1], d_je[5]); madd_neon(&deltas[0][3], d_ie[0], d_je[6]); madd_neon(&deltas[0][3], d_ie[1], d_je[7]); madd_neon(&deltas[0][4], d_ie[0], d_je[8]); madd_neon(&deltas[0][4], d_ie[1], d_je[9]); madd_neon(&deltas[0][5], d_ie[0], d_je[10]); madd_neon(&deltas[0][5], d_ie[1], d_je[11]); madd_neon(&deltas[1][0], d_ie[2], d_je[0]); madd_neon(&deltas[1][0], d_ie[3], d_je[1]); madd_neon(&deltas[1][1], d_ie[2], d_je[2]); madd_neon(&deltas[1][1], d_ie[3], d_je[3]); madd_neon(&deltas[1][2], d_ie[2], d_je[4]); madd_neon(&deltas[1][2], d_ie[3], d_je[5]); madd_neon(&deltas[1][3], d_ie[2], d_je[6]); madd_neon(&deltas[1][3], d_ie[3], d_je[7]); madd_neon(&deltas[1][4], d_ie[2], d_je[8]); madd_neon(&deltas[1][4], d_ie[3], d_je[9]); madd_neon(&deltas[1][5], d_ie[2], d_je[10]); madd_neon(&deltas[1][5], d_ie[3], d_je[11]); madd_neon(&deltas[2][0], d_ie[4], d_je[0]); madd_neon(&deltas[2][0], d_ie[5], d_je[1]); madd_neon(&deltas[2][1], d_ie[4], d_je[2]); madd_neon(&deltas[2][1], d_ie[5], d_je[3]); madd_neon(&deltas[2][2], d_ie[4], d_je[4]); madd_neon(&deltas[2][2], d_ie[5], d_je[5]); madd_neon(&deltas[2][3], d_ie[4], d_je[6]); madd_neon(&deltas[2][3], d_ie[5], d_je[7]); madd_neon(&deltas[2][4], d_ie[4], d_je[8]); madd_neon(&deltas[2][4], d_ie[5], d_je[9]); madd_neon(&deltas[2][5], d_ie[4], d_je[10]); madd_neon(&deltas[2][5], d_ie[5], d_je[11]); madd_neon(&deltas[3][0], d_ie[6], d_je[0]); madd_neon(&deltas[3][0], d_ie[7], d_je[1]); madd_neon(&deltas[3][1], d_ie[6], d_je[2]); madd_neon(&deltas[3][1], d_ie[7], d_je[3]); madd_neon(&deltas[3][2], d_ie[6], d_je[4]); madd_neon(&deltas[3][2], d_ie[7], d_je[5]); madd_neon(&deltas[3][3], d_ie[6], d_je[6]); madd_neon(&deltas[3][3], d_ie[7], d_je[7]); madd_neon(&deltas[3][4], d_ie[6], d_je[8]); madd_neon(&deltas[3][4], d_ie[7], d_je[9]); madd_neon(&deltas[3][5], d_ie[6], d_je[10]); madd_neon(&deltas[3][5], d_ie[7], d_je[11]); madd_neon(&deltas[4][0], d_ie[8], d_je[0]); madd_neon(&deltas[4][0], d_ie[9], d_je[1]); madd_neon(&deltas[4][1], d_ie[8], d_je[2]); madd_neon(&deltas[4][1], d_ie[9], d_je[3]); madd_neon(&deltas[4][2], d_ie[8], d_je[4]); madd_neon(&deltas[4][2], d_ie[9], d_je[5]); madd_neon(&deltas[4][3], d_ie[8], d_je[6]); madd_neon(&deltas[4][3], d_ie[9], d_je[7]); madd_neon(&deltas[4][4], d_ie[8], d_je[8]); madd_neon(&deltas[4][4], d_ie[9], d_je[9]); madd_neon(&deltas[4][5], d_ie[8], d_je[10]); madd_neon(&deltas[4][5], d_ie[9], d_je[11]); madd_neon(&deltas[5][0], d_ie[10], d_je[0]); madd_neon(&deltas[5][0], d_ie[11], d_je[1]); madd_neon(&deltas[5][1], d_ie[10], d_je[2]); madd_neon(&deltas[5][1], d_ie[11], d_je[3]); madd_neon(&deltas[5][2], d_ie[10], d_je[4]); madd_neon(&deltas[5][2], d_ie[11], d_je[5]); madd_neon(&deltas[5][3], d_ie[10], d_je[6]); madd_neon(&deltas[5][3], d_ie[11], d_je[7]); madd_neon(&deltas[5][4], d_ie[10], d_je[8]); madd_neon(&deltas[5][4], d_ie[11], d_je[9]); madd_neon(&deltas[5][5], d_ie[10], d_je[10]); madd_neon(&deltas[5][5], d_ie[11], d_je[11]); } static inline void update_8_stats_neon(const int64_t *const src, const int32x4_t delta0, const int32x4_t delta1, int64_t *const dst) { update_4_stats_neon(src + 0, delta0, dst + 
0); update_4_stats_neon(src + 4, delta1, dst + 4); } static inline void load_square_win7_neon(const int16_t *const di, const int16_t *const dj, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js, int16x8_t *d_je) { load_s16_8x6(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6], &d_is[8], &d_is[10]); load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7], &d_is[9], &d_is[11]); load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6], &d_js[8], &d_js[10]); load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7], &d_js[9], &d_js[11]); load_s16_8x6(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6], &d_ie[8], &d_ie[10]); load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]); load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6], &d_je[8], &d_je[10]); load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7], &d_je[9], &d_je[11]); } static inline void load_triangle_win7_neon(const int16_t *const di, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie) { load_s16_8x6(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6], &d_is[8], &d_is[10]); load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7], &d_is[9], &d_is[11]); load_s16_8x6(di + height * d_stride, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6], &d_ie[8], &d_ie[10]); load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]); } static inline void stats_left_win7_neon(const int16x8_t src[2], const int16_t *d, const int32_t d_stride, int32x4_t *sum) { int16x8_t dgds[WIN_7]; load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8], &dgds[10]); load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9], &dgds[11]); madd_neon(&sum[0], src[0], dgds[0]); madd_neon(&sum[0], src[1], dgds[1]); madd_neon(&sum[1], src[0], dgds[2]); madd_neon(&sum[1], src[1], dgds[3]); madd_neon(&sum[2], src[0], dgds[4]); madd_neon(&sum[2], src[1], dgds[5]); madd_neon(&sum[3], src[0], dgds[6]); madd_neon(&sum[3], src[1], dgds[7]); madd_neon(&sum[4], src[0], dgds[8]); madd_neon(&sum[4], src[1], dgds[9]); madd_neon(&sum[5], src[0], dgds[10]); madd_neon(&sum[5], src[1], dgds[11]); } static inline void step3_win7_neon(const int16_t *d, const int32_t d_stride, const int32_t width, const int32_t height, int16x8_t *ds, int32x4_t *deltas) { int32_t y = height; do { ds[12] = vld1q_s16(d); ds[13] = vld1q_s16(d + width); compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]); compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]); compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]); compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]); compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]); compute_delta_step3(&deltas[9], &deltas[13], ds[0], ds[1], ds[10], ds[11]); compute_delta_step3(&deltas[10], &deltas[14], ds[0], ds[1], ds[12], ds[13]); ds[0] = ds[2]; ds[1] = ds[3]; ds[2] = ds[4]; ds[3] = ds[5]; ds[4] = ds[6]; ds[5] = ds[7]; ds[6] = ds[8]; ds[7] = ds[9]; ds[8] = ds[10]; ds[9] = ds[11]; ds[10] = ds[12]; ds[11] = ds[13]; d += d_stride; } while (--y); } static inline void derive_triangle_win7_neon(const int16x8_t *d_is, const int16x8_t *d_ie, int32x4_t *deltas) { msub_neon(&deltas[0], 
d_is[0], d_is[0]); msub_neon(&deltas[0], d_is[1], d_is[1]); msub_neon(&deltas[1], d_is[0], d_is[2]); msub_neon(&deltas[1], d_is[1], d_is[3]); msub_neon(&deltas[2], d_is[0], d_is[4]); msub_neon(&deltas[2], d_is[1], d_is[5]); msub_neon(&deltas[3], d_is[0], d_is[6]); msub_neon(&deltas[3], d_is[1], d_is[7]); msub_neon(&deltas[4], d_is[0], d_is[8]); msub_neon(&deltas[4], d_is[1], d_is[9]); msub_neon(&deltas[5], d_is[0], d_is[10]); msub_neon(&deltas[5], d_is[1], d_is[11]); msub_neon(&deltas[6], d_is[2], d_is[2]); msub_neon(&deltas[6], d_is[3], d_is[3]); msub_neon(&deltas[7], d_is[2], d_is[4]); msub_neon(&deltas[7], d_is[3], d_is[5]); msub_neon(&deltas[8], d_is[2], d_is[6]); msub_neon(&deltas[8], d_is[3], d_is[7]); msub_neon(&deltas[9], d_is[2], d_is[8]); msub_neon(&deltas[9], d_is[3], d_is[9]); msub_neon(&deltas[10], d_is[2], d_is[10]); msub_neon(&deltas[10], d_is[3], d_is[11]); msub_neon(&deltas[11], d_is[4], d_is[4]); msub_neon(&deltas[11], d_is[5], d_is[5]); msub_neon(&deltas[12], d_is[4], d_is[6]); msub_neon(&deltas[12], d_is[5], d_is[7]); msub_neon(&deltas[13], d_is[4], d_is[8]); msub_neon(&deltas[13], d_is[5], d_is[9]); msub_neon(&deltas[14], d_is[4], d_is[10]); msub_neon(&deltas[14], d_is[5], d_is[11]); msub_neon(&deltas[15], d_is[6], d_is[6]); msub_neon(&deltas[15], d_is[7], d_is[7]); msub_neon(&deltas[16], d_is[6], d_is[8]); msub_neon(&deltas[16], d_is[7], d_is[9]); msub_neon(&deltas[17], d_is[6], d_is[10]); msub_neon(&deltas[17], d_is[7], d_is[11]); msub_neon(&deltas[18], d_is[8], d_is[8]); msub_neon(&deltas[18], d_is[9], d_is[9]); msub_neon(&deltas[19], d_is[8], d_is[10]); msub_neon(&deltas[19], d_is[9], d_is[11]); msub_neon(&deltas[20], d_is[10], d_is[10]); msub_neon(&deltas[20], d_is[11], d_is[11]); madd_neon(&deltas[0], d_ie[0], d_ie[0]); madd_neon(&deltas[0], d_ie[1], d_ie[1]); madd_neon(&deltas[1], d_ie[0], d_ie[2]); madd_neon(&deltas[1], d_ie[1], d_ie[3]); madd_neon(&deltas[2], d_ie[0], d_ie[4]); madd_neon(&deltas[2], d_ie[1], d_ie[5]); madd_neon(&deltas[3], d_ie[0], d_ie[6]); madd_neon(&deltas[3], d_ie[1], d_ie[7]); madd_neon(&deltas[4], d_ie[0], d_ie[8]); madd_neon(&deltas[4], d_ie[1], d_ie[9]); madd_neon(&deltas[5], d_ie[0], d_ie[10]); madd_neon(&deltas[5], d_ie[1], d_ie[11]); madd_neon(&deltas[6], d_ie[2], d_ie[2]); madd_neon(&deltas[6], d_ie[3], d_ie[3]); madd_neon(&deltas[7], d_ie[2], d_ie[4]); madd_neon(&deltas[7], d_ie[3], d_ie[5]); madd_neon(&deltas[8], d_ie[2], d_ie[6]); madd_neon(&deltas[8], d_ie[3], d_ie[7]); madd_neon(&deltas[9], d_ie[2], d_ie[8]); madd_neon(&deltas[9], d_ie[3], d_ie[9]); madd_neon(&deltas[10], d_ie[2], d_ie[10]); madd_neon(&deltas[10], d_ie[3], d_ie[11]); madd_neon(&deltas[11], d_ie[4], d_ie[4]); madd_neon(&deltas[11], d_ie[5], d_ie[5]); madd_neon(&deltas[12], d_ie[4], d_ie[6]); madd_neon(&deltas[12], d_ie[5], d_ie[7]); madd_neon(&deltas[13], d_ie[4], d_ie[8]); madd_neon(&deltas[13], d_ie[5], d_ie[9]); madd_neon(&deltas[14], d_ie[4], d_ie[10]); madd_neon(&deltas[14], d_ie[5], d_ie[11]); madd_neon(&deltas[15], d_ie[6], d_ie[6]); madd_neon(&deltas[15], d_ie[7], d_ie[7]); madd_neon(&deltas[16], d_ie[6], d_ie[8]); madd_neon(&deltas[16], d_ie[7], d_ie[9]); madd_neon(&deltas[17], d_ie[6], d_ie[10]); madd_neon(&deltas[17], d_ie[7], d_ie[11]); madd_neon(&deltas[18], d_ie[8], d_ie[8]); madd_neon(&deltas[18], d_ie[9], d_ie[9]); madd_neon(&deltas[19], d_ie[8], d_ie[10]); madd_neon(&deltas[19], d_ie[9], d_ie[11]); madd_neon(&deltas[20], d_ie[10], d_ie[10]); madd_neon(&deltas[20], d_ie[11], d_ie[11]); } static inline void diagonal_copy_stats_neon(const int32_t 
wiener_win2, int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { int64x2_t in[8], out[8]; in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 1); in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 3); in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1); in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 3); in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 1); in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 3); in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 1); in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 3); transpose_arrays_s64_4x4(in, out); vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(out[0])); vst1q_s64(H + (i + 2) * wiener_win2 + i, out[2]); vst1q_s64(H + (i + 3) * wiener_win2 + i, out[4]); vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]); vst1q_s64(H + (i + 4) * wiener_win2 + i, out[6]); vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]); for (int32_t j = i + 5; j < wiener_win2; j += 4) { in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + j); in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + j + 2); in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + j); in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + j + 2); in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + j); in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + j + 2); in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + j); in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + j + 2); transpose_arrays_s64_4x4(in, out); vst1q_s64(H + (j + 0) * wiener_win2 + i, out[0]); vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]); vst1q_s64(H + (j + 1) * wiener_win2 + i, out[2]); vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]); vst1q_s64(H + (j + 2) * wiener_win2 + i, out[4]); vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]); vst1q_s64(H + (j + 3) * wiener_win2 + i, out[6]); vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]); } } } static inline int64x2_t div4_neon(const int64x2_t src) { #if AOM_ARCH_AARCH64 uint64x2_t sign = vcltzq_s64(src); int64x2_t abs = vabsq_s64(src); // divide by 4 abs = vshrq_n_s64(abs, 2); // re-apply sign return vbslq_s64(sign, vnegq_s64(abs), abs); #else int64x2_t sign = vshrq_n_s64(src, 63); int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign); // divide by 4 abs = vshrq_n_s64(abs, 2); // re-apply sign return vsubq_s64(veorq_s64(abs, sign), sign); #endif // AOM_ARCH_AARCH64 } static inline void div4_4x4_neon(const int32_t wiener_win2, int64_t *const H, int64x2_t out[8]) { out[0] = vld1q_s64(H + 0 * wiener_win2 + 0); out[1] = vld1q_s64(H + 0 * wiener_win2 + 2); out[2] = vld1q_s64(H + 1 * wiener_win2 + 0); out[3] = vld1q_s64(H + 1 * wiener_win2 + 2); out[4] = vld1q_s64(H + 2 * wiener_win2 + 0); out[5] = vld1q_s64(H + 2 * wiener_win2 + 2); out[6] = vld1q_s64(H + 3 * wiener_win2 + 0); out[7] = vld1q_s64(H + 3 * wiener_win2 + 2); out[0] = div4_neon(out[0]); out[1] = div4_neon(out[1]); out[2] = div4_neon(out[2]); out[3] = div4_neon(out[3]); out[4] = div4_neon(out[4]); out[5] = div4_neon(out[5]); out[6] = div4_neon(out[6]); out[7] = div4_neon(out[7]); vst1q_s64(H + 0 * wiener_win2 + 0, out[0]); vst1q_s64(H + 0 * wiener_win2 + 2, out[1]); vst1q_s64(H + 1 * wiener_win2 + 0, out[2]); vst1q_s64(H + 1 * wiener_win2 + 2, out[3]); vst1q_s64(H + 2 * wiener_win2 + 0, out[4]); vst1q_s64(H + 2 * wiener_win2 + 2, out[5]); vst1q_s64(H + 3 * wiener_win2 + 0, out[6]); vst1q_s64(H + 3 * wiener_win2 + 2, out[7]); } static inline int64x2_t div16_neon(const int64x2_t src) { #if AOM_ARCH_AARCH64 uint64x2_t sign = vcltzq_s64(src); int64x2_t abs = vabsq_s64(src); // divide by 16 abs = vshrq_n_s64(abs, 4); // re-apply sign return 
vbslq_s64(sign, vnegq_s64(abs), abs); #else int64x2_t sign = vshrq_n_s64(src, 63); int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign); // divide by 16 abs = vshrq_n_s64(abs, 4); // re-apply sign return vsubq_s64(veorq_s64(abs, sign), sign); #endif // AOM_ARCH_AARCH64 } static inline void div16_4x4_neon(const int32_t wiener_win2, int64_t *const H, int64x2_t out[8]) { out[0] = vld1q_s64(H + 0 * wiener_win2 + 0); out[1] = vld1q_s64(H + 0 * wiener_win2 + 2); out[2] = vld1q_s64(H + 1 * wiener_win2 + 0); out[3] = vld1q_s64(H + 1 * wiener_win2 + 2); out[4] = vld1q_s64(H + 2 * wiener_win2 + 0); out[5] = vld1q_s64(H + 2 * wiener_win2 + 2); out[6] = vld1q_s64(H + 3 * wiener_win2 + 0); out[7] = vld1q_s64(H + 3 * wiener_win2 + 2); out[0] = div16_neon(out[0]); out[1] = div16_neon(out[1]); out[2] = div16_neon(out[2]); out[3] = div16_neon(out[3]); out[4] = div16_neon(out[4]); out[5] = div16_neon(out[5]); out[6] = div16_neon(out[6]); out[7] = div16_neon(out[7]); vst1q_s64(H + 0 * wiener_win2 + 0, out[0]); vst1q_s64(H + 0 * wiener_win2 + 2, out[1]); vst1q_s64(H + 1 * wiener_win2 + 0, out[2]); vst1q_s64(H + 1 * wiener_win2 + 2, out[3]); vst1q_s64(H + 2 * wiener_win2 + 0, out[4]); vst1q_s64(H + 2 * wiener_win2 + 2, out[5]); vst1q_s64(H + 3 * wiener_win2 + 0, out[6]); vst1q_s64(H + 3 * wiener_win2 + 2, out[7]); } static inline void div4_diagonal_copy_stats_neon(const int32_t wiener_win2, int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { int64x2_t in[8], out[8]; div4_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in); transpose_arrays_s64_4x4(in, out); vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0])); vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]); vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]); vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]); vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]); vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]); for (int32_t j = i + 5; j < wiener_win2; j += 4) { div4_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in); transpose_arrays_s64_4x4(in, out); vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]); vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]); vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]); vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]); vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]); vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]); vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]); vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]); } } } static inline void div16_diagonal_copy_stats_neon(const int32_t wiener_win2, int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { int64x2_t in[8], out[8]; div16_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in); transpose_arrays_s64_4x4(in, out); vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0])); vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]); vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]); vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]); vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]); vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]); for (int32_t j = i + 5; j < wiener_win2; j += 4) { div16_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in); transpose_arrays_s64_4x4(in, out); vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]); vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]); vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]); vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]); vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]); vst1q_s64(H + (j + 2) * 
wiener_win2 + i + 2, out[5]); vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]); vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]); } } } #endif // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_ aom-3.12.1/av1/encoder/arm/pickrst_sve.c000066400000000000000000000534141477627663500177750ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include <arm_sve.h> #include <assert.h> #include <string.h> #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_dsp/arm/transpose_neon.h" #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" #include "av1/encoder/arm/pickrst_sve.h" static inline uint8_t find_average_sve(const uint8_t *src, int src_stride, int width, int height) { uint32x4_t avg_u32 = vdupq_n_u32(0); uint8x16_t ones = vdupq_n_u8(1); // Use a predicate to compute the last columns. svbool_t pattern = svwhilelt_b8_u32(0, width % 16); int h = height; do { int j = width; const uint8_t *src_ptr = src; while (j >= 16) { uint8x16_t s = vld1q_u8(src_ptr); avg_u32 = vdotq_u32(avg_u32, s, ones); j -= 16; src_ptr += 16; } uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr)); avg_u32 = vdotq_u32(avg_u32, s_end, ones); src += src_stride; } while (--h != 0); return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height)); } static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, int16_t *buf_avg, int buf_avg_stride, int width, int height, int downsample_factor) { uint8x8_t avg_u8 = vdup_n_u8(avg); // Use a predicate to compute the last columns. svbool_t pattern = svwhilelt_b8_u32(0, width % 8); uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg))); do { int j = width; const uint8_t *buf_ptr = buf; int16_t *buf_avg_ptr = buf_avg; while (j >= 8) { uint8x8_t d = vld1_u8(buf_ptr); vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); j -= 8; buf_ptr += 8; buf_avg_ptr += 8; } uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr))); vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end))); buf += buf_stride; buf_avg += buf_avg_stride; height -= downsample_factor; } while (height > 0); } static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp, const int wiener_win2, const int scale) { for (int i = 0; i < wiener_win2 - 2; i = i + 2) { // Transpose the first 2x2 square. It needs a special case as the element // of the bottom left is on the diagonal. int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1); int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1); int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row); // Transpose and store all the remaining 2x2 squares of the line.
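// Each 2x2 block H_tmp[i..i+1][j..j+1] of the upper triangle is transposed
// with aom_vtrn1q_s64/aom_vtrn2q_s64 and written to H_tmp[j..j+1][i..i+1] in
// the lower triangle, so the final scaled accumulation into H can scan the
// whole matrix linearly.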
for (int j = i + 3; j < wiener_win2; j = j + 2) { row0 = vld1q_s64(H_tmp + i * wiener_win2 + j); row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j); int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0); vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1); } } for (int i = 0; i < wiener_win2 * wiener_win2; i++) { H[i] += H_tmp[i] * scale; } } // Transpose the matrix that has just been computed and accumulate it in M. static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn, const int wiener_win, int scale) { for (int i = 0; i < wiener_win; ++i) { for (int j = 0; j < wiener_win; ++j) { int tr_idx = j * wiener_win + i; *M++ += (int64_t)(M_trn[tr_idx] * scale); } } } // This function computes two matrices: the cross-correlation between the src // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). // // M is of size 7 * 7. It needs to be filled such that multiplying one element // from src with each element of a row of the wiener window will fill one // column of M. However this is not very convenient in terms of memory // accesses, as it means we do contiguous loads of dgd but strided stores to M. // As a result, we use an intermediate matrix M_trn which is instead filled // such that one row of the wiener window gives one row of M_trn. Once fully // computed, M_trn is then transposed to return M. // // H is of size 49 * 49. It is filled by multiplying every pair of elements of // the wiener window together. Since it is a symmetric matrix, we only compute // the upper triangle, and then copy it down to the lower one. Here we fill it // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. static inline void compute_stats_win7_downsampled_sve( int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { const int wiener_win = 7; const int wiener_win2 = wiener_win * wiener_win; // Use a predicate to compute the last columns of the block for H. svbool_t pattern = svwhilelt_b16_u32(0, width % 8); // Use intermediate matrices for H and M to perform the computation, they // will be accumulated into the original H and M at the end. int64_t M_trn[49]; memset(M_trn, 0, sizeof(M_trn)); int64_t H_tmp[49 * 49]; memset(H_tmp, 0, sizeof(H_tmp)); assert(height > 0); do { // Cross-correlation (M). for (int row = 0; row < wiener_win; row++) { int j = 0; while (j < width) { int16x8_t dgd[7]; load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]); int16x8_t s = vld1q_s16(src_avg + j); // Compute all the elements of one row of M. compute_M_one_row_win7(s, dgd, M_trn, row); j += 8; } } // Auto-covariance (H). int j = 0; while (j <= width - 8) { for (int col0 = 0; col0 < wiener_win; col0++) { int16x8_t dgd0[7]; load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]); // Perform computation of the first column with itself (28 elements). // For the first column this will fill the upper triangle of the 7x7 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 7x7 matrices around H's // diagonal. 
compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); // All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor. int16x8_t dgd1[7]; load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); // Compute all elements from the combination of both columns (49 // elements). compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); } } j += 8; } if (j < width) { // Process remaining columns using a predicate to discard excess elements. for (int col0 = 0; col0 < wiener_win; col0++) { // Load first column. int16x8_t dgd0[7]; dgd0[0] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); dgd0[1] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); dgd0[2] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); dgd0[3] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); dgd0[4] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); dgd0[5] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0)); dgd0[6] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0)); // Perform computation of the first column with itself (28 elements). // For the first column this will fill the upper triangle of the 7x7 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 7x7 matrices around H's // diagonal. compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); // All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor. int16x8_t dgd1[7]; load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]); // Compute all elements from the combination of both columns (49 // elements). compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp); } } } dgd_avg += downsample_factor * dgd_avg_stride; src_avg += src_avg_stride; } while (--height != 0); // Transpose M_trn. acc_transpose_M(M, M_trn, 7, downsample_factor); // Copy upper triangle of H in the lower one. copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); } // This function computes two matrices: the cross-correlation between the src // buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). // // M is of size 5 * 5. It needs to be filled such that multiplying one element // from src with each element of a row of the wiener window will fill one // column of M. However this is not very convenient in terms of memory // accesses, as it means we do contiguous loads of dgd but strided stores to M. // As a result, we use an intermediate matrix M_trn which is instead filled // such that one row of the wiener window gives one row of M_trn. Once fully // computed, M_trn is then transposed to return M. // // H is of size 25 * 25. It is filled by multiplying every pair of elements of // the wiener window together. Since it is a symmetric matrix, we only compute // the upper triangle, and then copy it down to the lower one. Here we fill it // by taking each different pair of columns, and multiplying all the elements of // the first one with all the elements of the second one, with a special case // when multiplying a column by itself. 
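// Note that the downsampling factor is applied exactly once: the per-call
// M_trn / H_tmp accumulators hold unscaled sums, and acc_transpose_M() /
// copy_upper_triangle() multiply by the scale while folding them into M and H.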
static inline void compute_stats_win5_downsampled_sve( int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride, int width, int height, int64_t *M, int64_t *H, int downsample_factor) { const int wiener_win = 5; const int wiener_win2 = wiener_win * wiener_win; // Use a predicate to compute the last columns of the block for H. svbool_t pattern = svwhilelt_b16_u32(0, width % 8); // Use intermediate matrices for H and M to perform the computation, they // will be accumulated into the original H and M at the end. int64_t M_trn[25]; memset(M_trn, 0, sizeof(M_trn)); int64_t H_tmp[25 * 25]; memset(H_tmp, 0, sizeof(H_tmp)); assert(height > 0); do { // Cross-correlation (M). for (int row = 0; row < wiener_win; row++) { int j = 0; while (j < width) { int16x8_t dgd[5]; load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1], &dgd[2], &dgd[3], &dgd[4]); int16x8_t s = vld1q_s16(src_avg + j); // Compute all the elements of one row of M. compute_M_one_row_win5(s, dgd, M_trn, row); j += 8; } } // Auto-covariance (H). int j = 0; while (j <= width - 8) { for (int col0 = 0; col0 < wiener_win; col0++) { // Load first column. int16x8_t dgd0[5]; load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1], &dgd0[2], &dgd0[3], &dgd0[4]); // Perform computation of the first column with itself (15 elements). // For the first column this will fill the upper triangle of the 5x5 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 5x5 matrices around H's // diagonal. compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); // All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor. int16x8_t dgd1[5]; load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], &dgd1[2], &dgd1[3], &dgd1[4]); // Compute all elements from the combination of both columns (25 // elements). compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); } } j += 8; } // Process remaining columns using a predicate to discard excess elements. if (j < width) { for (int col0 = 0; col0 < wiener_win; col0++) { int16x8_t dgd0[5]; dgd0[0] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0)); dgd0[1] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0)); dgd0[2] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0)); dgd0[3] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0)); dgd0[4] = svget_neonq_s16( svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0)); // Perform computation of the first column with itself (15 elements). // For the first column this will fill the upper triangle of the 5x5 // matrix at the top left of the H matrix. For the next columns this // will fill the upper triangle of the other 5x5 matrices around H's // diagonal. compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2); // All computation next to the matrix diagonal has already been done. for (int col1 = col0 + 1; col1 < wiener_win; col1++) { // Load second column and scale based on downsampling factor. int16x8_t dgd1[5]; load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1], &dgd1[2], &dgd1[3], &dgd1[4]); // Compute all elements from the combination of both columns (25 // elements). 
compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp); } } } dgd_avg += downsample_factor * dgd_avg_stride; src_avg += src_avg_stride; } while (--height != 0); // Transpose M_trn. acc_transpose_M(M, M_trn, 5, downsample_factor); // Copy upper triangle of H in the lower one. copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor); } static inline void av1_compute_stats_downsampled_sve( int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = wiener_win >> 1; const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); memset(M, 0, sizeof(*M) * wiener_win * wiener_win); const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); const int downsample_factor = WIENER_STATS_DOWNSAMPLE_FACTOR; // dgd_avg and src_avg have been memset to zero before calling this // function, so round up the stride to the next multiple of 8 so that we // don't have to worry about a tail loop when computing M. const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; const int src_avg_stride = (width & ~7) + 8; // Compute (dgd - avg) and store it in dgd_avg. // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. const int vert_offset = v_start - wiener_halfwin; const int horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); // Compute (src - avg), downsample and store in src-avg. const uint8_t *src_start = src + h_start + v_start * src_stride; compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, src_avg_stride, width, height, downsample_factor); const int downsample_height = height / downsample_factor; // Since the height is not necessarily a multiple of the downsample factor, // the last line of src will be scaled according to how many rows remain. 
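// For example, with downsample_factor == 4 and height == 22, the main pass
// below accumulates 5 downsampled rows with a weight of 4 each, and the
// remainder pass adds the final row with a weight of 2.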
const int downsample_remainder = height % downsample_factor; if (downsample_height > 0) { if (wiener_win == WIENER_WIN) { compute_stats_win7_downsampled_sve( dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width, downsample_height, M, H, downsample_factor); } else { compute_stats_win5_downsampled_sve( dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width, downsample_height, M, H, downsample_factor); } } if (downsample_remainder > 0) { const int remainder_offset = height - downsample_remainder; if (wiener_win == WIENER_WIN) { compute_stats_win7_downsampled_sve( dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, src_avg + downsample_height * src_avg_stride, src_avg_stride, width, 1, M, H, downsample_remainder); } else { compute_stats_win5_downsampled_sve( dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride, src_avg + downsample_height * src_avg_stride, src_avg_stride, width, 1, M, H, downsample_remainder); } } } void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); if (use_downsampled_wiener_stats) { av1_compute_stats_downsampled_sve(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H); return; } const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = wiener_win >> 1; const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); memset(M, 0, sizeof(*M) * wiener_win * wiener_win); const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height); // dgd_avg and src_avg have been memset to zero before calling this // function, so round up the stride to the next multiple of 8 so that we // don't have to worry about a tail loop when computing M. const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8; const int src_avg_stride = (width & ~7) + 8; // Compute (dgd - avg) and store it in dgd_avg. // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual // buffer that we need to subtract the avg from will be 2 * wiener_halfwin // wider and 2 * wiener_halfwin higher than the original dgd buffer. const int vert_offset = v_start - wiener_halfwin; const int horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); // Compute (src - avg), and store in src-avg. const uint8_t *src_start = src + h_start + v_start * src_stride; compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width, height, 1); if (wiener_win == WIENER_WIN) { compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width, height, M, H); } else { compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width, height, M, H); } // H is a symmetric matrix, so we only need to fill out the upper triangle. // We can copy it down to the lower triangle outside the (i, j) loops. 
diagonal_copy_stats_neon(wiener_win2, H); } aom-3.12.1/av1/encoder/arm/pickrst_sve.h000066400000000000000000002611201477627663500177750ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ #define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ #include <arm_neon.h> #include <stdint.h> #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "av1/encoder/arm/pickrst_neon.h" // Swap each half of the dgd vectors so that we can accumulate the result of // the dot-products directly in the destination matrix. static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) { int16x8_t dgd_trn0 = vreinterpretq_s16_s64( vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); int16x8_t dgd_trn1 = vreinterpretq_s16_s64( vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1))); return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 }; } static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5], int64_t *M, int row) { const int wiener_win = 5; int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); vst1q_s64(M + row * wiener_win + 0, cross_corr01); int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); vst1q_s64(M + row * wiener_win + 2, cross_corr23); int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]); M[row * wiener_win + 4] += vaddvq_s64(m4); } static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7], int64_t *M, int row) { const int wiener_win = 7; int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0); int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]); int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0); cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1); vst1q_s64(M + row * wiener_win + 0, cross_corr01); int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2); int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]); int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0); cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1); vst1q_s64(M + row * wiener_win + 2, cross_corr23); int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4); int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]); int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0); cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1); vst1q_s64(M + row * wiener_win + 4, cross_corr45); int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]); M[row * wiener_win + 6] += vaddvq_s64(m6); } static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, const int wiener_win, const int wiener_win2) { for (int row0 = 0; row0 < wiener_win; row0++) { for (int row1 = row0; row1 < wiener_win; row1++) { int auto_cov_idx =
(col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]); H[auto_cov_idx] += vaddvq_s64(auto_cov); } } } static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1, int row0, int row1, int64_t *H) { for (int col0 = 0; col0 < 5; col0++) { int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5); int64x2_t h01 = vld1q_s64(H + auto_cov_idx); int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); vst1q_s64(H + auto_cov_idx, auto_cov01); int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); vst1q_s64(H + auto_cov_idx + 2, auto_cov23); int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]); H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4); } } static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1, int row0, int row1, int64_t *H) { for (int col0 = 0; col0 < 7; col0++) { int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7); int64x2_t h01 = vld1q_s64(H + auto_cov_idx); int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]); int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0); auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1); vst1q_s64(H + auto_cov_idx, auto_cov01); int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2); int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]); int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0); auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1); vst1q_s64(H + auto_cov_idx + 2, auto_cov23); int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4); int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]); int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0); auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1); vst1q_s64(H + auto_cov_idx + 4, auto_cov45); int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]); H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6); } } static inline void stats_top_win5_sve(const int16x8_t src[2], const int16x8_t dgd[2], const int16_t *const d, const int32_t d_stride, int64x2_t *sum_m, int64x2_t *sum_h) { int16x8_t dgds[WIENER_WIN_CHROMA * 2]; load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8]); load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9]); sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]); sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]); sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]); sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]); sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]); sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]); sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]); sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]); sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]); sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]); sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]); sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]); sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]); sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]); sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]); sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]); sum_h[3] 
= aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]); sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]); sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]); sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]); } static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d, const int32_t d_stride, int64x2_t *sum) { int16x8_t dgds[WIN_CHROMA]; load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6]); load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7]); sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]); sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]); sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]); sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]); sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]); sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]); sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]); sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]); } static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B, int64x2_t *deltas) { deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(A[0]), B[0]); deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(A[0]), B[1]); deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(A[0]), B[2]); deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(A[0]), B[3]); deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(A[0]), B[4]); deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(A[1]), B[0]); deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(A[2]), B[0]); deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(A[3]), B[0]); deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(A[4]), B[0]); } static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B, int64x2_t *deltas) { deltas[0] = aom_sdotq_s16(deltas[0], A[0], B[0]); deltas[1] = aom_sdotq_s16(deltas[1], A[0], B[1]); deltas[2] = aom_sdotq_s16(deltas[2], A[0], B[2]); deltas[3] = aom_sdotq_s16(deltas[3], A[0], B[3]); deltas[4] = aom_sdotq_s16(deltas[4], A[0], B[4]); deltas[5] = aom_sdotq_s16(deltas[5], A[1], B[0]); deltas[6] = aom_sdotq_s16(deltas[6], A[2], B[0]); deltas[7] = aom_sdotq_s16(deltas[7], A[3], B[0]); deltas[8] = aom_sdotq_s16(deltas[8], A[4], B[0]); } static inline void load_square_win5_sve( const int16_t *const di, const int16_t *const dj, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js, int16x8_t *d_je, svbool_t p0, svbool_t p1) { d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0)); d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]); load_s16_8x4(dj + 8, 
d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]); load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6]); load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7]); } static inline void update_4_stats_sve(const int64_t *const src, const int64x2_t *delta, int64_t *const dst) { const int64x2_t s1 = vld1q_s64(src); const int64x2_t s2 = vld1q_s64(src + 2); vst1q_s64(dst + 0, vaddq_s64(s1, delta[0])); vst1q_s64(dst + 2, vaddq_s64(s2, delta[1])); } static inline void derive_square_win5_sve( int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js, const int16x8_t *d_je, int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) { d_is[0] = vnegq_s16(d_is[0]); d_is[1] = vnegq_s16(d_is[1]); d_is[2] = vnegq_s16(d_is[2]); d_is[3] = vnegq_s16(d_is[3]); d_is[4] = vnegq_s16(d_is[4]); d_is[5] = vnegq_s16(d_is[5]); d_is[6] = vnegq_s16(d_is[6]); d_is[7] = vnegq_s16(d_is[7]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], 
d_je[1]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]); } static inline void hadd_update_4_stats_sve(const int64_t *const src, const int64x2_t *deltas, int64_t *const dst) { int64x2_t src0 = vld1q_s64(src); int64x2_t src1 = vld1q_s64(src + 2); vst1q_s64(dst + 0, vaddq_s64(src0, vpaddq_s64(deltas[0], deltas[1]))); vst1q_s64(dst + 2, vaddq_s64(src1, vpaddq_s64(deltas[2], deltas[3]))); } static inline void load_triangle_win5_sve(const int16_t *const di, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, svbool_t p0, svbool_t p1) { d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0)); d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); } static inline void derive_triangle_win5_sve(const int16x8_t *d_is, const int16x8_t *d_ie, int64x2_t *deltas) { deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]); deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]); deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]); deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]); deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]); deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]); deltas[3] = 
aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]); deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]); deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[2]), d_is[2]); deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[3]), d_is[3]); deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[2]), d_is[4]); deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[3]), d_is[5]); deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[6]); deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[7]); deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[4]), d_is[4]); deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[5]), d_is[5]); deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[4]), d_is[6]); deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[5]), d_is[7]); deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[6]), d_is[6]); deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[7]), d_is[7]); deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]); deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]); deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]); deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]); deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]); deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]); deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]); deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]); deltas[4] = aom_sdotq_s16(deltas[4], d_ie[2], d_ie[2]); deltas[4] = aom_sdotq_s16(deltas[4], d_ie[3], d_ie[3]); deltas[5] = aom_sdotq_s16(deltas[5], d_ie[2], d_ie[4]); deltas[5] = aom_sdotq_s16(deltas[5], d_ie[3], d_ie[5]); deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[6]); deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[7]); deltas[7] = aom_sdotq_s16(deltas[7], d_ie[4], d_ie[4]); deltas[7] = aom_sdotq_s16(deltas[7], d_ie[5], d_ie[5]); deltas[8] = aom_sdotq_s16(deltas[8], d_ie[4], d_ie[6]); deltas[8] = aom_sdotq_s16(deltas[8], d_ie[5], d_ie[7]); deltas[9] = aom_sdotq_s16(deltas[9], d_ie[6], d_ie[6]); deltas[9] = aom_sdotq_s16(deltas[9], d_ie[7], d_ie[7]); } static inline void compute_stats_win5_sve( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H) { const int32_t wiener_win = WIENER_WIN_CHROMA; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t h8 = height & ~7; int32_t i, j, x, y; // Use a predicate to compute the last columns. svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16); svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. 
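// Overview of steps 1-6 (explanatory note): only the top row and left edges
// of the block matrix H (and all of M) are accumulated with full dot
// products. The remaining entries are then derived incrementally from
// previously computed entries plus small corrections ("deltas") built from
// the samples entering and leaving the summation window, which avoids
// recomputing every block from scratch.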
j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; y = height; do { x = 0; while (x < width - 16) { src[0] = vld1q_s16(s_t + x + 0); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x + 0); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); x += 16; } src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0)); src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8)); dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0)); dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8)); stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); s_t += s_stride; d_t += d_stride; } while (--y); vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1])); vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3])); M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]); vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1])); vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3])); H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]); } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. j = 1; do { const int16_t *d_t = d; int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; y = height; do { x = 0; while (x < width - 16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h); x += 16; } dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0)); dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8)); stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h); d_t += d_stride; } while (--y); int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23)); } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. 
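// Each diagonal block H[(k, k)] is derived from H[(k - 1, k - 1)] plus a
// delta term rather than being recomputed in full; the deltas are
// accumulated in 32-bit lanes (int32x4_t) by the shared Neon helpers.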
{ const int16_t *d_t = d; if (height % 2) { int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]); load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]); d_t += 4 * d_stride; step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0), H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } else { int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN_CHROMA * 2]; ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width); ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width); ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width); ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width); step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas); transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2], &deltas[3]); update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas[0], vgetq_lane_s32(deltas[4], 0), H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas[1], vgetq_lane_s32(deltas[4], 1), H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas[2], vgetq_lane_s32(deltas[4], 2), H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas[3], vgetq_lane_s32(deltas[4], 3), H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); } } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. 
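// Each square block H[(i, j)] with i < j is derived from H[(i - 1, j - 1)]:
// sub_deltas_step4_sve() removes the contribution of the samples that leave
// the summation and add_deltas_step4_sve() adds the contribution of the
// samples that enter it, before update_4_stats_sve() applies the result.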
{ y = h8; int16x4_t d_s[12]; int16x4_t d_e[12]; const int16_t *d_t = d; int16x4_t zeros = vdup_n_s16(0); load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]); load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]); int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } }; while (y >= 8) { load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = s_tr[0]; start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1); start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2); start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3); start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4); start_col1[0] = s_tr[1]; start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1); start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2); start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3); start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4); start_col2[0] = s_tr[2]; start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1); start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2); start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3); start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4); start_col3[0] = s_tr[3]; start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1); start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2); start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3); start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4); // i = 1, j = 2; sub_deltas_step4_sve(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4_sve(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]); // i = 2, j =3 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = e_tr[0]; end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1); end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2); end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3); end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4); end_col1[0] = e_tr[1]; end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1); end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2); end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3); end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4); end_col2[0] = e_tr[2]; end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1); end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2); end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3); end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4); end_col3[0] = e_tr[3]; end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1); end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2); end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3); end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4); // i = 1, j = 2; add_deltas_step4_sve(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4_sve(end_col0, end_col2, deltas[1]); // i = 1, j = 4 
add_deltas_step4_sve(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4_sve(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4_sve(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4_sve(end_col2, end_col3, deltas[5]); d_s[0] = d_s[8]; d_s[1] = d_s[9]; d_s[2] = d_s[10]; d_s[3] = d_s[11]; d_e[0] = d_e[8]; d_e[1] = d_e[9]; d_e[2] = d_e[10]; d_e[3] = d_e[11]; d_t += 8 * d_stride; y -= 8; } if (h8 != height) { const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8)); load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6], &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]); load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5], &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]); int16x8_t s_tr[8], e_tr[8]; transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5], d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2], &s_tr[3]); transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros, zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6], &s_tr[7]); transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5], d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2], &e_tr[3]); transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros, zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6], &e_tr[7]); int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5]; start_col0[0] = vandq_s16(s_tr[0], mask_h); start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h); start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h); start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h); start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h); start_col1[0] = vandq_s16(s_tr[1], mask_h); start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h); start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h); start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h); start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h); start_col2[0] = vandq_s16(s_tr[2], mask_h); start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h); start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h); start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h); start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h); start_col3[0] = vandq_s16(s_tr[3], mask_h); start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h); start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h); start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h); start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h); // i = 1, j = 2; sub_deltas_step4_sve(start_col0, start_col1, deltas[0]); // i = 1, j = 3; sub_deltas_step4_sve(start_col0, start_col2, deltas[1]); // i = 1, j = 4 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]); // i = 2, j = 3 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]); // i = 2, j = 4 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]); // i = 3, j = 4 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]); int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5]; end_col0[0] = vandq_s16(e_tr[0], mask_h); end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h); end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h); end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h); end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h); end_col1[0] = vandq_s16(e_tr[1], mask_h); end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h); end_col1[2] = 
vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h); end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h); end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h); end_col2[0] = vandq_s16(e_tr[2], mask_h); end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h); end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h); end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h); end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h); end_col3[0] = vandq_s16(e_tr[3], mask_h); end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h); end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h); end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h); end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h); // i = 1, j = 2; add_deltas_step4_sve(end_col0, end_col1, deltas[0]); // i = 1, j = 3; add_deltas_step4_sve(end_col0, end_col2, deltas[1]); // i = 1, j = 4 add_deltas_step4_sve(end_col0, end_col3, deltas[2]); // i = 2, j =3 add_deltas_step4_sve(end_col1, end_col2, deltas[3]); // i = 2, j = 4 add_deltas_step4_sve(end_col1, end_col3, deltas[4]); // i = 3, j = 4 add_deltas_step4_sve(end_col2, end_col3, deltas[5]); } int64_t single_delta[6]; deltas[0][0] = vpaddq_s64(deltas[0][0], deltas[0][1]); deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]); deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]); deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]); deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]); deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]); deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]); deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]); deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]); deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]); deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]); deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]); deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]); deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]); deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]); deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]); deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]); deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]); deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]); deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]); deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]); deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]); deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]); deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]); vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4])); vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4])); vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4])); int idx = 0; for (i = 1; i < wiener_win - 1; i++) { for (j = i + 1; j < wiener_win; j++) { update_4_stats_sve( H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win, deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win); H[i * wiener_win * wiener_win2 + j * wiener_win + 4] = H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] + single_delta[idx]; H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas[idx][5], 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas[idx][5], 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * 
wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas[idx][7], 0); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas[idx][7], 1); idx++; } } } // Step 5: Derive other points of each square. No square in bottom row. i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA]; x = 0; while (x < width - 16) { load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas); x += 16; } load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je, p0, p1); derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas); hadd_update_4_stats_sve( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_sve( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_sve( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_4_stats_sve( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. i = 0; do { const int16_t *const di = d + i; int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) }; int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA]; x = 0; while (x < width - 16) { load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win5_sve(d_is, d_ie, deltas); x += 16; } load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1); derive_triangle_win5_sve(d_is, d_ie, deltas); // Row 1: 4 points hadd_update_4_stats_sve( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); // Row 2: 3 points int64x2_t src0 = vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5]))); int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] + vgetq_lane_s64(deltas69, 0); // Row 3: 2 points int64x2_t src1 = vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3, vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8]))); // Row 4: 1 point H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] + vgetq_lane_s64(deltas69, 1); } while (++i < wiener_win); } static inline void stats_top_win7_sve(const int16x8_t src[2], const int16x8_t dgd[2], const int16_t *const d, const int32_t d_stride, int64x2_t *sum_m, int64x2_t *sum_h) { int16x8_t dgds[WIENER_WIN * 2]; load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8], &dgds[10], &dgds[12]); load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9], &dgds[11], 
&dgds[13]); sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]); sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]); sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]); sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]); sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]); sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]); sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]); sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]); sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]); sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]); sum_m[5] = aom_sdotq_s16(sum_m[5], src[0], dgds[10]); sum_m[5] = aom_sdotq_s16(sum_m[5], src[1], dgds[11]); sum_m[6] = aom_sdotq_s16(sum_m[6], src[0], dgds[12]); sum_m[6] = aom_sdotq_s16(sum_m[6], src[1], dgds[13]); sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]); sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]); sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]); sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]); sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]); sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]); sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]); sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]); sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]); sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]); sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[0], dgds[10]); sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[1], dgds[11]); sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[0], dgds[12]); sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[1], dgds[13]); } static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d, const int32_t d_stride, int64x2_t *sum) { int16x8_t dgds[WIN_7]; load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8], &dgds[10]); load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7], &dgds[9], &dgds[11]); sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]); sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]); sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]); sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]); sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]); sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]); sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]); sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]); sum[4] = aom_sdotq_s16(sum[4], src[0], dgds[8]); sum[4] = aom_sdotq_s16(sum[4], src[1], dgds[9]); sum[5] = aom_sdotq_s16(sum[5], src[0], dgds[10]); sum[5] = aom_sdotq_s16(sum[5], src[1], dgds[11]); } static inline void load_square_win7_sve( const int16_t *const di, const int16_t *const dj, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js, int16x8_t *d_je, svbool_t p0, svbool_t p1) { d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0)); d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8)); d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0)); d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8)); d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0)); d_ie[1] = 
svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0)); d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8)); d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0)); d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8)); load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6], &d_js[8], &d_js[10]); load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7], &d_js[9], &d_js[11]); load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6], &d_je[8], &d_je[10]); load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7], &d_je[9], &d_je[11]); } static inline void derive_square_win7_sve(int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js, const int16x8_t *d_je, int64x2_t deltas[][WIN_7]) { d_is[0] = vnegq_s16(d_is[0]); d_is[1] = vnegq_s16(d_is[1]); d_is[2] = vnegq_s16(d_is[2]); d_is[3] = vnegq_s16(d_is[3]); d_is[4] = vnegq_s16(d_is[4]); d_is[5] = vnegq_s16(d_is[5]); d_is[6] = vnegq_s16(d_is[6]); d_is[7] = vnegq_s16(d_is[7]); d_is[8] = vnegq_s16(d_is[8]); d_is[9] = vnegq_s16(d_is[9]); d_is[10] = vnegq_s16(d_is[10]); d_is[11] = vnegq_s16(d_is[11]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]); deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[0], d_js[8]); deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[1], d_js[9]); deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[0], d_js[10]); deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[1], d_js[11]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]); deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[2], d_js[8]); deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[3], d_js[9]); deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[2], d_js[10]); deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[3], d_js[11]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], 
d_js[4]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]); deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[4], d_js[8]); deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[5], d_js[9]); deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[4], d_js[10]); deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[5], d_js[11]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]); deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[6], d_js[8]); deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[7], d_js[9]); deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[6], d_js[10]); deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[7], d_js[11]); deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[8], d_js[0]); deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[9], d_js[1]); deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[8], d_js[2]); deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[9], d_js[3]); deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[8], d_js[4]); deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[9], d_js[5]); deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[8], d_js[6]); deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[9], d_js[7]); deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[8], d_js[8]); deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[9], d_js[9]); deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[8], d_js[10]); deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[9], d_js[11]); deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[10], d_js[0]); deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[11], d_js[1]); deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[10], d_js[2]); deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[11], d_js[3]); deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[10], d_js[4]); deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[11], d_js[5]); deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[10], d_js[6]); deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[11], d_js[7]); deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[10], d_js[8]); deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[11], d_js[9]); deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[10], d_js[10]); deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[11], d_js[11]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]); deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]); deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]); deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]); deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]); deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[0], d_je[8]); deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[1], d_je[9]); deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[0], d_je[10]); deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[1], d_je[11]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]); deltas[1][0] = aom_sdotq_s16(deltas[1][0], 
d_ie[3], d_je[1]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]); deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]); deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]); deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]); deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[2], d_je[8]); deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[3], d_je[9]); deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[2], d_je[10]); deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[3], d_je[11]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]); deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]); deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]); deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]); deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]); deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[4], d_je[8]); deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[5], d_je[9]); deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[4], d_je[10]); deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[5], d_je[11]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]); deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]); deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]); deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]); deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]); deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[6], d_je[8]); deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[7], d_je[9]); deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[6], d_je[10]); deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[7], d_je[11]); deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[8], d_je[0]); deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[9], d_je[1]); deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[8], d_je[2]); deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[9], d_je[3]); deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[8], d_je[4]); deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[9], d_je[5]); deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[8], d_je[6]); deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[9], d_je[7]); deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[8], d_je[8]); deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[9], d_je[9]); deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[8], d_je[10]); deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[9], d_je[11]); deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[10], d_je[0]); deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[11], d_je[1]); deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[10], d_je[2]); deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[11], d_je[3]); deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[10], d_je[4]); deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[11], d_je[5]); deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[10], d_je[6]); deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[11], d_je[7]); deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[10], d_je[8]); deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[11], d_je[9]); deltas[5][5] = aom_sdotq_s16(deltas[5][5], 
d_ie[10], d_je[10]); deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[11], d_je[11]); } static inline void hadd_update_6_stats_sve(const int64_t *const src, const int64x2_t *deltas, int64_t *const dst) { int64x2_t src0 = vld1q_s64(src + 0); int64x2_t src1 = vld1q_s64(src + 2); int64x2_t src2 = vld1q_s64(src + 4); int64x2_t deltas01 = vpaddq_s64(deltas[0], deltas[1]); int64x2_t deltas23 = vpaddq_s64(deltas[2], deltas[3]); int64x2_t deltas45 = vpaddq_s64(deltas[4], deltas[5]); vst1q_s64(dst + 0, vaddq_s64(src0, deltas01)); vst1q_s64(dst + 2, vaddq_s64(src1, deltas23)); vst1q_s64(dst + 4, vaddq_s64(src2, deltas45)); } static inline void load_triangle_win7_sve(const int16_t *const di, const int32_t d_stride, const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, svbool_t p0, svbool_t p1) { d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0)); d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8)); d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0)); d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8)); d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0)); d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8)); d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0)); d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8)); d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0)); d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8)); d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0)); d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8)); d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0)); d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8)); d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0)); d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8)); d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0)); d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8)); d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0)); d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8)); d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0)); d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8)); d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0)); d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8)); } static inline void derive_triangle_win7_sve(const int16x8_t *d_is, const int16x8_t *d_ie, int64x2_t *deltas) { deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]); deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]); deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]); deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]); deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]); deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]); deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]); deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]); deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[0]), d_is[8]); deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[1]), d_is[9]); deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[0]), d_is[10]); deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[1]), d_is[11]); deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[2]); deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[3]); 
deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[2]), d_is[4]); deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[3]), d_is[5]); deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[2]), d_is[6]); deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[3]), d_is[7]); deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[2]), d_is[8]); deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[3]), d_is[9]); deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[2]), d_is[10]); deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[3]), d_is[11]); deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[4]), d_is[4]); deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[5]), d_is[5]); deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[4]), d_is[6]); deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[5]), d_is[7]); deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[4]), d_is[8]); deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[5]), d_is[9]); deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[4]), d_is[10]); deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[5]), d_is[11]); deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[6]), d_is[6]); deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[7]), d_is[7]); deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[6]), d_is[8]); deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[7]), d_is[9]); deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[6]), d_is[10]); deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[7]), d_is[11]); deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[8]), d_is[8]); deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[9]), d_is[9]); deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[8]), d_is[10]); deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[9]), d_is[11]); deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[10]), d_is[10]); deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[11]), d_is[11]); deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]); deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]); deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]); deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]); deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]); deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]); deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]); deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]); deltas[4] = aom_sdotq_s16(deltas[4], d_ie[0], d_ie[8]); deltas[4] = aom_sdotq_s16(deltas[4], d_ie[1], d_ie[9]); deltas[5] = aom_sdotq_s16(deltas[5], d_ie[0], d_ie[10]); deltas[5] = aom_sdotq_s16(deltas[5], d_ie[1], d_ie[11]); deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[2]); deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[3]); deltas[7] = aom_sdotq_s16(deltas[7], d_ie[2], d_ie[4]); deltas[7] = aom_sdotq_s16(deltas[7], d_ie[3], d_ie[5]); deltas[8] = aom_sdotq_s16(deltas[8], d_ie[2], d_ie[6]); deltas[8] = aom_sdotq_s16(deltas[8], d_ie[3], d_ie[7]); deltas[9] = aom_sdotq_s16(deltas[9], d_ie[2], d_ie[8]); deltas[9] = aom_sdotq_s16(deltas[9], d_ie[3], d_ie[9]); deltas[10] = aom_sdotq_s16(deltas[10], d_ie[2], d_ie[10]); deltas[10] = aom_sdotq_s16(deltas[10], d_ie[3], d_ie[11]); deltas[11] = aom_sdotq_s16(deltas[11], d_ie[4], d_ie[4]); deltas[11] = aom_sdotq_s16(deltas[11], d_ie[5], d_ie[5]); deltas[12] = aom_sdotq_s16(deltas[12], d_ie[4], d_ie[6]); deltas[12] = aom_sdotq_s16(deltas[12], d_ie[5], d_ie[7]); deltas[13] = aom_sdotq_s16(deltas[13], d_ie[4], d_ie[8]); deltas[13] = aom_sdotq_s16(deltas[13], d_ie[5], d_ie[9]); deltas[14] = 
aom_sdotq_s16(deltas[14], d_ie[4], d_ie[10]); deltas[14] = aom_sdotq_s16(deltas[14], d_ie[5], d_ie[11]); deltas[15] = aom_sdotq_s16(deltas[15], d_ie[6], d_ie[6]); deltas[15] = aom_sdotq_s16(deltas[15], d_ie[7], d_ie[7]); deltas[16] = aom_sdotq_s16(deltas[16], d_ie[6], d_ie[8]); deltas[16] = aom_sdotq_s16(deltas[16], d_ie[7], d_ie[9]); deltas[17] = aom_sdotq_s16(deltas[17], d_ie[6], d_ie[10]); deltas[17] = aom_sdotq_s16(deltas[17], d_ie[7], d_ie[11]); deltas[18] = aom_sdotq_s16(deltas[18], d_ie[8], d_ie[8]); deltas[18] = aom_sdotq_s16(deltas[18], d_ie[9], d_ie[9]); deltas[19] = aom_sdotq_s16(deltas[19], d_ie[8], d_ie[10]); deltas[19] = aom_sdotq_s16(deltas[19], d_ie[9], d_ie[11]); deltas[20] = aom_sdotq_s16(deltas[20], d_ie[10], d_ie[10]); deltas[20] = aom_sdotq_s16(deltas[20], d_ie[11], d_ie[11]); } static inline void compute_stats_win7_sve( const int16_t *const d, const int32_t d_stride, const int16_t *const s, const int32_t s_stride, const int32_t width, const int32_t height, int64_t *const M, int64_t *const H) { const int32_t wiener_win = WIENER_WIN; const int32_t wiener_win2 = wiener_win * wiener_win; const int32_t h8 = height & ~7; int32_t i, j, x, y; // Use a predicate to compute the last columns. svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16); svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16); // Step 1: Calculate the top edge of the whole matrix, i.e., the top // edge of each triangle and square on the top row. j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) }; int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) }; int16x8_t src[2], dgd[2]; y = height; do { x = 0; while (x < width - 16) { src[0] = vld1q_s16(s_t + x + 0); src[1] = vld1q_s16(s_t + x + 8); dgd[0] = vld1q_s16(d_t + x + 0); dgd[1] = vld1q_s16(d_t + x + 8); stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); x += 16; } src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0)); src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8)); dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0)); dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8)); stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h); s_t += s_stride; d_t += d_stride; } while (--y); vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1])); vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3])); vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5])); M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]); vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1])); vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3])); vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5])); H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]); } while (++j < wiener_win); // Step 2: Calculate the left edge of each square on the top row. 
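// As in step 1, the inner loops below consume full 16-wide chunks with plain
// NEON loads; the ragged final chunk (width % 16 elements) is loaded through
// the SVE predicates p0/p1, so no scalar tail loop is needed.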
j = 1; do { const int16_t *d_t = d; int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) }; int16x8_t dgd[2]; y = height; do { x = 0; while (x < width - 16) { dgd[0] = vld1q_s16(d_t + j + x + 0); dgd[1] = vld1q_s16(d_t + j + x + 8); stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h); x += 16; } dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0)); dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8)); stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h); d_t += d_stride; } while (--y); int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]); int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]); int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]); vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01)); vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01)); vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23)); vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23)); vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45)); vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45)); } while (++j < wiener_win); // Step 3: Derive the top edge of each triangle along the diagonal. No // triangle in top row. { const int16_t *d_t = d; // Pad to call transpose function. int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) }; int16x8_t ds[WIENER_WIN * 2]; load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8], &ds[10]); load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9], &ds[11]); d_t += 6 * d_stride; step3_win7_neon(d_t, d_stride, width, height, ds, deltas); transpose_arrays_s32_8x8(deltas, deltas_tr); update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win, deltas_tr[0], deltas_tr[4], H + 1 * wiener_win * wiener_win2 + 1 * wiener_win); update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win, deltas_tr[1], deltas_tr[5], H + 2 * wiener_win * wiener_win2 + 2 * wiener_win); update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win, deltas_tr[2], deltas_tr[6], H + 3 * wiener_win * wiener_win2 + 3 * wiener_win); update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win, deltas_tr[3], deltas_tr[7], H + 4 * wiener_win * wiener_win2 + 4 * wiener_win); update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win, deltas_tr[8], deltas_tr[12], H + 5 * wiener_win * wiener_win2 + 5 * wiener_win); update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win, deltas_tr[9], deltas_tr[13], H + 6 * wiener_win * wiener_win2 + 6 * wiener_win); } // Step 4: Derive the top and left edge of each square. No square in top and // bottom row. i = 1; do { j = i + 1; do { const int16_t *di = d + i - 1; const int16_t *dj = d + j - 1; int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) }; int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2]; dd[5] = vdupq_n_s16(0); // Initialize to avoid warning. 
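// dd[0]/dd[1] hold the first six rows of column i - 1 sampled at x = 0 and
// x = width respectively; ds[0]/ds[1] hold the same for column j - 1. The
// deltas below accumulate products of these leading/trailing boundary
// columns, i.e. how the window sums change as the window slides down.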
const int16_t dd0_values[] = { di[0 * d_stride], di[1 * d_stride], di[2 * d_stride], di[3 * d_stride], di[4 * d_stride], di[5 * d_stride], 0, 0 }; dd[0] = vld1q_s16(dd0_values); const int16_t dd1_values[] = { di[0 * d_stride + width], di[1 * d_stride + width], di[2 * d_stride + width], di[3 * d_stride + width], di[4 * d_stride + width], di[5 * d_stride + width], 0, 0 }; dd[1] = vld1q_s16(dd1_values); const int16_t ds0_values[] = { dj[0 * d_stride], dj[1 * d_stride], dj[2 * d_stride], dj[3 * d_stride], dj[4 * d_stride], dj[5 * d_stride], 0, 0 }; ds[0] = vld1q_s16(ds0_values); int16_t ds1_values[] = { dj[0 * d_stride + width], dj[1 * d_stride + width], dj[2 * d_stride + width], dj[3 * d_stride + width], dj[4 * d_stride + width], dj[5 * d_stride + width], 0, 0 }; ds[1] = vld1q_s16(ds1_values); y = 0; while (y < h8) { // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6); dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7); dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6); dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7); // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6); ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6); ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7); load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]); load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]); load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]); load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]); load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]); load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]); load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]); load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]); load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]); load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]); load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]); load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]); deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]); deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]); deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]); deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]); deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]); deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]); deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]); deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]); deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]); deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]); deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]); deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]); deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]); deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]); deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]); deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]); deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]); deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]); deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]); deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]); deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]); deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]); deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]); deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]); deltas[24] = aom_sdotq_s16(deltas[24], dd[12], 
ds[0]); deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]); dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2); dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2); ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2); ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2); di += 8 * d_stride; dj += 8 * d_stride; y += 8; } int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]); int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]); int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]); int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]); int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]); int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]); int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]); int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]); int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]); int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]); int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]); int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]); int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]); int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]); deltas02 = vsubq_s64(deltas13, deltas02); deltas46 = vsubq_s64(deltas57, deltas46); deltas810 = vsubq_s64(deltas911, deltas810); deltas1212 = vsubq_s64(deltas1313, deltas1212); deltas1416 = vsubq_s64(deltas1517, deltas1416); deltas1820 = vsubq_s64(deltas1921, deltas1820); deltas2224 = vsubq_s64(deltas2325, deltas2224); if (h8 != height) { const int16_t ds0_vals[] = { dj[0 * d_stride], dj[0 * d_stride + width], dj[1 * d_stride], dj[1 * d_stride + width], dj[2 * d_stride], dj[2 * d_stride + width], dj[3 * d_stride], dj[3 * d_stride + width] }; ds[0] = vld1q_s16(ds0_vals); ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0); ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1); ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2); ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3); const int16_t dd4_vals[] = { -di[1 * d_stride], di[1 * d_stride + width], -di[2 * d_stride], di[2 * d_stride + width], -di[3 * d_stride], di[3 * d_stride + width], -di[4 * d_stride], di[4 * d_stride + width] }; dd[4] = vld1q_s16(dd4_vals); dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0); dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1); do { dd[0] = vdupq_n_s16(-di[0 * d_stride]); dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]); dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]); ds[4] = vdupq_n_s16(dj[0 * d_stride]); ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]); ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]); dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2); dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3); ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4); ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5); const int32x4_t res0 = vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])), vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0]))); deltas02 = vaddw_s32(deltas02, vget_low_s32(res0)); deltas46 = vaddw_s32(deltas46, vget_high_s32(res0)); const int32x4_t res1 = vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])), vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1]))); deltas810 = vaddw_s32(deltas810, vget_low_s32(res1)); deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1)); const int32x4_t res2 = vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])), vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4]))); deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2)); deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2)); const int32x4_t res3 = 
vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])), vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5]))); deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3)); int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0); ds[0] = vextq_s16(ds[0], ds[1], 2); ds[1] = vextq_s16(ds[1], ds[0], 2); ds[1] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3)); int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0); dd[4] = vextq_s16(dd[4], dd[5], 2); dd[5] = vextq_s16(dd[5], dd[4], 2); dd[5] = vreinterpretq_s16_s32( vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3)); di += d_stride; dj += d_stride; } while (++y < height); } // Writing one more element on the top edge of a square falls to // the next square in the same row or the first element in the next // row, which will just be overwritten later. int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 0); int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 2); int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4); int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 6); vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0, vaddq_s64(s0, deltas02)); vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2, vaddq_s64(s1, deltas46)); vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4, vaddq_s64(s2, deltas810)); vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6, vaddq_s64(s3, deltas1212)); H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas1416, 0); H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas1416, 1); H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas1820, 0); H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas1820, 1); H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas2224, 0); H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] = H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] + vgetq_lane_s64(deltas2224, 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 5: Derive other points of each square. No square in bottom row. 
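// Each entry here is derived from the entry diagonally up and to the left of
// it: the delta added is the dot product of the rows entering the window at
// the bottom (d_ie/d_je) minus that of the rows leaving at the top
// (d_is/d_js), reduced and accumulated by hadd_update_6_stats_sve().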
i = 0; do { const int16_t *const di = d + i; j = i + 1; do { const int16_t *const dj = d + j; int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } }; int16x8_t d_is[WIN_7]; int16x8_t d_ie[WIN_7]; int16x8_t d_js[WIN_7]; int16x8_t d_je[WIN_7]; x = 0; while (x < width - 16) { load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je); derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas); x += 16; } load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js, d_je, p0, p1); derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas); hadd_update_6_stats_sve( H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0], H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_sve( H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1], H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_sve( H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2], H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_sve( H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3], H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_sve( H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4], H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1); hadd_update_6_stats_sve( H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5], H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1); } while (++j < wiener_win); } while (++i < wiener_win - 1); // Step 6: Derive other points of each upper triangle along the diagonal. i = 0; do { const int16_t *const di = d + i; int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) }; int16x8_t d_is[WIN_7], d_ie[WIN_7]; x = 0; while (x < width - 16) { load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie); derive_triangle_win7_sve(d_is, d_ie, deltas); x += 16; } load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1); derive_triangle_win7_sve(d_is, d_ie, deltas); // Row 1: 6 points hadd_update_6_stats_sve( H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas, H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1); int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]); // Row 2: 5 points hadd_update_4_stats_sve( H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6, H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2); H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] + vgetq_lane_s64(deltas1017, 0); // Row 3: 4 points hadd_update_4_stats_sve( H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, deltas + 11, H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); // Row 4: 3 points int64x2_t h0 = vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3); vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4, vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16]))); H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] + vgetq_lane_s64(deltas1017, 1); // Row 5: 2 points int64x2_t h1 = vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4); vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5, vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19]))); // Row 6: 1 points H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] = H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 
5] + vaddvq_s64(deltas[20]); } while (++i < wiener_win); } #endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_ aom-3.12.1/av1/encoder/arm/quantize_neon.c000066400000000000000000001171111477627663500203130ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include <assert.h> #include <string.h> #include "config/aom_config.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #include "aom_mem/aom_mem.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" static inline uint16_t get_max_eob(int16x8_t v_eobmax) { #if AOM_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); #endif } static inline int16x8_t get_max_lane_eob(const int16_t *iscan, int16x8_t v_eobmax, uint16x8_t v_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); return vmaxq_s16(v_eobmax, v_nz_iscan); } static inline uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, int16x8_t v_round, int16x8_t v_zero) { const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_abs = vabsq_s16(v_coeff); const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1); const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff); store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff); return v_nz_mask; } void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero.
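// In scalar terms, quantize_fp_8() computes, per coefficient:
//   tmp     = saturate(|coeff| + round);
//   abs_q   = (tmp * quant) >> 16;     // vqdmulh followed by >> 1
//   qcoeff  = sign(coeff) * abs_q;
//   dqcoeff = qcoeff * dequant;
// with the non-zero mask feeding the end-of-block (eob) tracking below.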
const int16x8_t v_zero = vdupq_n_s16(0); int16x8_t v_quant = vld1q_s16(quant_ptr); int16x8_t v_dequant = vld1q_s16(dequant_ptr); int16x8_t v_round = vld1q_s16(round_ptr); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); uint16x8_t v_nz_mask; // process dc and the first seven ac coeffs v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask); // overwrite the dc constants with ac constants v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); count -= 8; // now process the rest of the ac coeffs do { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); count -= 8; } while (count > 0); *eob_ptr = get_max_eob(v_eobmax_76543210); } static inline uint16x8_t quantize_lp_8(const int16_t *coeff_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, int16x8_t v_round, int16x8_t v_zero) { const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_abs = vabsq_s16(v_coeff); const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1); const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); vst1q_s16(qcoeff_ptr, v_qcoeff); vst1q_s16(dqcoeff_ptr, v_dqcoeff); return v_nz_mask; } void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
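// Same per-coefficient math as av1_quantize_fp_neon(), but this low-precision
// variant operates directly on int16_t buffers, so loads and stores skip the
// tran_low_t conversion helpers.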
const int16x8_t v_zero = vdupq_n_s16(0); int16x8_t v_quant = vld1q_s16(quant_ptr); int16x8_t v_dequant = vld1q_s16(dequant_ptr); int16x8_t v_round = vld1q_s16(round_ptr); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); uint16x8_t v_nz_mask; intptr_t count = n_coeffs; // process dc and the first seven ac coeffs v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); // overwrite the dc constants with ac constants v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); count -= 8; // now process the rest of the ac coeffs do { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); count -= 8; } while (count != 0); *eob_ptr = get_max_eob(v_eobmax_76543210); } static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8( const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, int16x8_t v_round, int16x8_t v_zero, int log_scale) { const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1); const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale)); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); const uint16x8_t v_mask = vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1)); // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), vreinterpretq_s16_u16(v_mask)); const int16x8_t v_tmp2 = vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant); const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); const int16x8_t v_qcoeff = vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); // Multiplying by dequant here will use all 16 bits. Cast to unsigned before // shifting right. (vshlq_s16 will shift right if shift value is negative) const uint16x8_t v_abs_dqcoeff = vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), vdupq_n_s16(-log_scale)); const int16x8_t v_dqcoeff = vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign), v_coeff_sign); store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); return v_nz_mask; } static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8( const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, int16x8_t v_round, int16x8_t v_zero) { const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); const uint16x8_t v_mask = vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1), vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2)); // abs_coeff = vmask ? 
(int64_t)abs_coeff + log_scaled_round : 0 const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), vreinterpretq_s16_u16(v_mask)); // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); const int16x8_t v_tmp2 = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1), vreinterpretq_s16_u16(vshrq_n_u16( vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14))); const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); const int16x8_t v_qcoeff = vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale; const int16x8_t v_abs_dqcoeff = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13), vreinterpretq_s16_u16(vshrq_n_u16( vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2))); const int16x8_t v_dqcoeff = vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); return v_nz_mask; } static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan, int log_scale) { const int16x8_t v_zero = vdupq_n_s16(0); int16x8_t v_quant = vld1q_s16(quant_ptr); int16x8_t v_dequant = vld1q_s16(dequant_ptr); const int16x8_t v_round_no_scale = vld1q_s16(round_ptr); int16x8_t v_round = vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); intptr_t non_zero_count = n_coeffs; assert(n_coeffs > 16); // Pre-scan pass const int16x8_t v_dequant_scaled = vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale))); const int16x8_t v_zbin_s16 = vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1); intptr_t i = n_coeffs; do { const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8); const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16); const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a); const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b); const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16); const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16); // If the coefficient is in the base ZBIN range, then discard. 
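// Pre-scan: walk the coefficients backwards in blocks of 16 and drop trailing
// blocks whose magnitudes are all below the zbin threshold, so the main
// quantization loop only runs up to the last potentially non-zero coefficient.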
if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) { non_zero_count -= 16; } else { break; } i -= 16; } while (i > 0); const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; memset(qcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr + non_zero_count, 0, remaining_zcoeffs * sizeof(*dqcoeff_ptr)); // process dc and the first seven ac coeffs uint16x8_t v_nz_mask; if (log_scale == 2) { v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); } else { v_nz_mask = quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero, log_scale); } v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); // overwrite the dc constants with ac constants v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) { coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; iscan += 8; if (log_scale == 2) { v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero); } else { v_nz_mask = quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, v_dequant, v_round, v_zero, log_scale); } v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); } *eob_ptr = get_max_eob(v_eobmax_76543210); } void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 1); } void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 2); } void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)quant_shift_ptr; (void)scan; const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); const int16x8_t zero = vdupq_n_s16(0); int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); int16x8_t v_abs = vabsq_s16(v_coeff); vzbins = vsetq_lane_s16(zbins[0], 
vzbins, 0); uint16x8_t vcond = vcgeq_s16(v_abs, vzbins); uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { vround = vsetq_lane_s16(round_ptr[0], vround, 0); vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); vround = vsetq_lane_s16(round_ptr[1], vround, 0); vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[0]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); for (int i = 8; i < n_coeffs; i += 8) { v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); v_coeff_sign = vshrq_n_s16(v_coeff, 15); v_abs = vabsq_s16(v_coeff); vcond = vcgeq_s16(v_abs, vzbins); nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[i]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; } #define QM_MULL_SHIFT(x0, x1) \ vreinterpretq_s16_u16(vorrq_u16( \ vreinterpretq_u16_s16(vshlq_n_s16( \ vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \ vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS))) static void aom_quantize_b_helper_16x16_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { (void)scan; uint16x8_t vwt, viwt; const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; 
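// When a quantization matrix is present (qm_ptr/iqm_ptr != NULL), each
// coefficient is weighted by its per-position entry: QM_MULL_SHIFT() above
// computes (x * w) >> AOM_QM_BITS, applied to |coeff| before the zbin test and
// to dequant before reconstruction.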
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); const int16x8_t zero = vdupq_n_s16(0); int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); int16x8_t v_abs = vabsq_s16(v_coeff); vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); uint16x8_t vcond; if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { vround = vsetq_lane_s16(round_ptr[0], vround, 0); vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); vdequant = QM_MULL_SHIFT(vdequant, viwt); } int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); vround = vsetq_lane_s16(round_ptr[1], vround, 0); vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[0]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); for (int i = 8; i < n_coeffs; i += 8) { v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); v_coeff_sign = vshrq_n_s16(v_coeff, 15); v_abs = vabsq_s16(v_coeff); if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); vdequant = QM_MULL_SHIFT(vdequant, 
viwt); } int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[i]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; } static void aom_quantize_b_helper_32x32_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { (void)scan; uint16x8_t vwt, viwt; const int log_scale = 1; const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); const int16x8_t zero = vdupq_n_s16(0); int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); const int16x8_t v_log_scale = v_eobmax_76543210; int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); int16x8_t v_abs = vabsq_s16(v_coeff); vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); uint16x8_t vcond; if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { vround = vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); vdequant = QM_MULL_SHIFT(vdequant, viwt); } int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); vround = vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[1], 
vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[0]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); for (int i = 8; i < n_coeffs; i += 8) { v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); v_coeff_sign = vshrq_n_s16(v_coeff, 15); v_abs = vabsq_s16(v_coeff); if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); vdequant = QM_MULL_SHIFT(vdequant, viwt); } int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[i]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; } static void aom_quantize_b_helper_64x64_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { (void)scan; uint16x8_t vwt, viwt; const int log_scale = 2; const int16x8_t v_log_scale = vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE)); const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); const int16x8_t zero = vdupq_n_s16(0); int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); int16x8_t v_ones = vnegq_s16(v_eobmax_76543210); int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); int16x8_t v_abs = vabsq_s16(v_coeff); vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); uint16x8_t vcond; if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = 
vmovl_u8(vld1_u8(&qm_ptr[0])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { vround = vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } int16x8_t ones = vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); vtmp2 = vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); vdequant = QM_MULL_SHIFT(vdequant, viwt); } int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); v_deq_abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); vround = vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[0]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); for (int i = 8; i < n_coeffs; i += 8) { v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); v_coeff_sign = vshrq_n_s16(v_coeff, 15); v_abs = vabsq_s16(v_coeff); if (qm_ptr == NULL) { vcond = vcgeq_s16(v_abs, vzbins); } else { vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); } nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); if (nz_check) { int16x8_t vtmp = vqaddq_s16(v_abs, vround); int16x8_t vtmp2; if (qm_ptr == NULL) { vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); } else { vtmp2 = QM_MULL_SHIFT(vtmp, vwt); vtmp2 = vaddq_s16(vtmp2, vtmp); } int16x8_t ones = vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); vtmp2 = vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); int16x8_t coeff_nz_mask = vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); if (iqm_ptr != NULL) { viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); vdequant = QM_MULL_SHIFT(vdequant, viwt); } int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); v_deq_abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); coeff_nz_mask = vbslq_s16(vcond, 
vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); int16x8_t v_iscan = vld1q_s16(&iscan[i]); vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); } } *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; } void aom_quantize_b_helper_neon( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { switch (log_scale) { // log_scale for AV1 encoder can be only 0, 1, 2 case 0: aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, qm_ptr, iqm_ptr); break; case 1: aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, qm_ptr, iqm_ptr); break; case 2: aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, qm_ptr, iqm_ptr); break; } } void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } aom-3.12.1/av1/encoder/arm/rdopt_neon.c000066400000000000000000000442171477627663500176110ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <arm_neon.h> #include <math.h> #include "av1/encoder/rdopt.h" #include "config/aom_config.h" #include "config/av1_rtcd.h" // Process horizontal and vertical correlations in a 4x4 block of pixels. // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time.
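// In scalar terms, for each pixel x in the top-left 3x3 of a 4x4 tile, with y
// the pixel to its right and z the pixel below, the helper accumulates:
//   x_sum  += x;       x2_sum += x * x;
//   xy_sum += x * y;   xz_sum += x * z;
// The 32-bit per-tile partial sums are widened and combined across the block
// before the horizontal/vertical correlation coefficients are formed.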
static inline void horver_correlation_4x4(const int16_t *diff, int stride, int32x4_t *xy_sum_32, int32x4_t *xz_sum_32, int32x4_t *x_sum_32, int32x4_t *x2_sum_32) { // Pixels in this 4x4 [ a b c d ] // are referred to as: [ e f g h ] // [ i j k l ] // [ m n o p ] const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride)); const int16x4_t pixelsa_2_sli = vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16)); const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride)); const int16x4_t pixelsb_2_sli = vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16)); const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride)); const int16x4_t pixelsa_1_sli = vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16)); const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride)); const int16x4_t pixelsb_1_sli = vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16)); const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli); *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli); *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli); *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli); *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli); *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli); *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli); // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k // (sum up every element in slli_a and swap_b) *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a); *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli); // Also sum their squares *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli); *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli); *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli); } void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride, int width, int height, float *hcorr, float *vcorr) { // The following notation is used: // x - current pixel // y - right neighbour pixel // z - below neighbour pixel // w - down-right neighbour pixel int64_t xy_sum = 0, xz_sum = 0; int64_t x_sum = 0, x2_sum = 0; int32x4_t zero = vdupq_n_s32(0); int64x2_t v_x_sum = vreinterpretq_s64_s32(zero); int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero); int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero); int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero); // Process horizontal and vertical correlations through the body in 4x4 // blocks. 
This excludes the final row and column and possibly one extra // column depending how 3 divides into width and height for (int i = 0; i <= height - 4; i += 3) { int32x4_t xy_sum_32 = zero; int32x4_t xz_sum_32 = zero; int32x4_t x_sum_32 = zero; int32x4_t x2_sum_32 = zero; for (int j = 0; j <= width - 4; j += 3) { horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, &xz_sum_32, &x_sum_32, &x2_sum_32); } v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32); v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32); v_x_sum = vpadalq_s32(v_x_sum, x_sum_32); v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32); } #if AOM_ARCH_AARCH64 xy_sum = vaddvq_s64(v_xy_sum); xz_sum = vaddvq_s64(v_xz_sum); x2_sum = vaddvq_s64(v_x2_sum); x_sum = vaddvq_s64(v_x_sum); #else xy_sum = vget_lane_s64( vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0); xz_sum = vget_lane_s64( vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0); x2_sum = vget_lane_s64( vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0); x_sum = vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0); #endif // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; // Do we have 2 rows remaining or just the one? Note that width and height // are powers of 2, so each modulo 3 must be 1 or 2. if (height % 3 == 1) { // Just horiz corrs on the final row const int16_t x0 = diff[(height - 1) * stride]; x_sum += x0; x_finalrow += x0; x2_sum += x0 * x0; x2_finalrow += x0 * x0; if (width >= 8) { int32x4_t v_y_sum = zero; int32x4_t v_y2_sum = zero; int32x4_t v_xy_sum_a = zero; int k = width - 1; int j = 0; while ((k - 8) > 0) { const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]); const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]); const int16x4_t v_x_lo = vget_low_s16(v_x); const int16x4_t v_x_hi = vget_high_s16(v_x); const int16x4_t v_y_lo = vget_low_s16(v_y); const int16x4_t v_y_hi = vget_high_s16(v_y); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); v_y_sum = vpadalq_s16(v_y_sum, v_y); k -= 8; j += 8; } const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j); const int16x8_t v_x = vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), vreinterpretq_s16_s32(zero), 1); const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); const int16x4_t v_x_lo = vget_low_s16(v_x); const int16x4_t v_x_hi = vget_high_s16(v_x); const int16x4_t v_y_lo = vget_low_s16(v_y); const int16x4_t v_y_hi = vget_high_s16(v_y); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y); const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); #if AOM_ARCH_AARCH64 const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); xy_sum += vaddvq_s64(v_xy_sum2); const int32_t y = vaddvq_s32(v_y_sum_a); const int64_t y2 = vaddvq_s64(v_y2_sum_a); #else xy_sum += vget_lane_s64( vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a); const int64_t y = vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0); const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum); int64_t y2 = vget_lane_s64( vadd_s64(vget_low_s64(v_y2_sum_b), 
vget_high_s64(v_y2_sum_b)), 0); #endif x_sum += y; x2_sum += y2; x_finalrow += y; x2_finalrow += y2; } else { for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 1) * stride + j]; const int16_t y = diff[(height - 1) * stride + j + 1]; xy_sum += x * y; x_sum += y; x2_sum += y * y; x_finalrow += y; x2_finalrow += y * y; } } } else { // Two rows remaining to do const int16_t x0 = diff[(height - 2) * stride]; const int16_t z0 = diff[(height - 1) * stride]; x_sum += x0 + z0; x2_sum += x0 * x0 + z0 * z0; x_finalrow += z0; x2_finalrow += z0 * z0; if (width >= 8) { int32x4_t v_y2_sum = zero; int32x4_t v_w2_sum = zero; int32x4_t v_xy_sum_a = zero; int32x4_t v_xz_sum_a = zero; int32x4_t v_x_sum_a = zero; int32x4_t v_w_sum = zero; int k = width - 1; int j = 0; while ((k - 8) > 0) { const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]); const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]); const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]); const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]); const int16x4_t v_x_lo = vget_low_s16(v_x); const int16x4_t v_y_lo = vget_low_s16(v_y); const int16x4_t v_z_lo = vget_low_s16(v_z); const int16x4_t v_w_lo = vget_low_s16(v_w); const int16x4_t v_x_hi = vget_high_s16(v_x); const int16x4_t v_y_hi = vget_high_s16(v_y); const int16x4_t v_z_hi = vget_high_s16(v_z); const int16x4_t v_w_hi = vget_high_s16(v_w); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); v_w_sum = vpadalq_s16(v_w_sum, v_w); v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w); k -= 8; j += 8; } const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j); const int16x8_t v_x = vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), vreinterpretq_s16_s32(zero), 1); const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j); const int16x8_t v_z = vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7), vreinterpretq_s16_s32(zero), 1); const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1); const int16x4_t v_x_lo = vget_low_s16(v_x); const int16x4_t v_y_lo = vget_low_s16(v_y); const int16x4_t v_z_lo = vget_low_s16(v_z); const int16x4_t v_w_lo = vget_low_s16(v_w); const int16x4_t v_x_hi = vget_high_s16(v_x); const int16x4_t v_y_hi = vget_high_s16(v_y); const int16x4_t v_z_hi = vget_high_s16(v_z); const int16x4_t v_w_hi = vget_high_s16(v_w); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); v_w_sum = vpadalq_s16(v_w_sum, v_w); v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); v_x_sum_a = vpadalq_s16(v_x_sum_a, 
v_w); #if AOM_ARCH_AARCH64 xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a)); xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a)); x_sum += vaddvq_s32(v_x_sum_a); x_finalrow += vaddvq_s32(v_w_sum); int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum)); int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum)); #else const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); xy_sum += vget_lane_s64( vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a); xz_sum += vget_lane_s64( vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0); const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a); x_sum += vget_lane_s64( vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0); const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum); x_finalrow += vget_lane_s64( vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0); const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); int64_t y2 = vget_lane_s64( vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0); const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum); int64_t w2 = vget_lane_s64( vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0); #endif x2_sum += y2 + w2; x2_finalrow += w2; } else { for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 2) * stride + j]; const int16_t y = diff[(height - 2) * stride + j + 1]; const int16_t z = diff[(height - 1) * stride + j]; const int16_t w = diff[(height - 1) * stride + j + 1]; // Horizontal and vertical correlations for the penultimate row: xy_sum += x * y; xz_sum += x * z; // Now just horizontal correlations for the final row: xy_sum += z * w; x_sum += y + w; x2_sum += y * y + w * w; x_finalrow += w; x2_finalrow += w * w; } } } // Do we have 2 columns remaining or just the one? if (width % 3 == 1) { // Just vert corrs on the final col const int16_t x0 = diff[width - 1]; x_sum += x0; x_finalcol += x0; x2_sum += x0 * x0; x2_finalcol += x0 * x0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 1]; xz_sum += x * z; x_finalcol += z; x2_finalcol += z * z; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 2 : 3)) { x_sum += z; x2_sum += z * z; } } } else { // Two cols remaining const int16_t x0 = diff[width - 2]; const int16_t y0 = diff[width - 1]; x_sum += x0 + y0; x2_sum += x0 * x0 + y0 * y0; x_finalcol += y0; x2_finalcol += y0 * y0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 2]; const int16_t y = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 2]; const int16_t w = diff[(i + 1) * stride + width - 1]; // Horizontal and vertical correlations for the penultimate col: // Skip these on the last iteration of this loop if we also had two // rows remaining, otherwise the final horizontal and vertical correlation // get erroneously processed twice if (i < height - 2 || height % 3 == 1) { xy_sum += x * y; xz_sum += x * z; } x_finalcol += w; x2_finalcol += w * w; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 
2 : 3)) { x_sum += z + w; x2_sum += z * z + w * w; } // Now just vertical correlations for the final column: xz_sum += y * w; } } // Calculate the simple sums and squared-sums int64_t x_firstrow = 0, x_firstcol = 0; int64_t x2_firstrow = 0, x2_firstcol = 0; if (width >= 8) { int32x4_t v_x_firstrow = zero; int32x4_t v_x2_firstrow = zero; for (int j = 0; j < width; j += 8) { const int16x8_t v_diff = vld1q_s16(diff + j); const int16x4_t v_diff_lo = vget_low_s16(v_diff); const int16x4_t v_diff_hi = vget_high_s16(v_diff); v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff); v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo); v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi); } #if AOM_ARCH_AARCH64 x_firstrow += vaddvq_s32(v_x_firstrow); x2_firstrow += vaddvq_s32(v_x2_firstrow); #else const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow); x_firstrow += vget_lane_s64( vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)), 0); const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow); x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64), vget_high_s64(v_x2_firstrow_64)), 0); #endif } else { for (int j = 0; j < width; ++j) { x_firstrow += diff[j]; x2_firstrow += diff[j] * diff[j]; } } for (int i = 0; i < height; ++i) { x_firstcol += diff[i * stride]; x2_firstcol += diff[i * stride] * diff[i * stride]; } int64_t xhor_sum = x_sum - x_finalcol; int64_t xver_sum = x_sum - x_finalrow; int64_t y_sum = x_sum - x_firstcol; int64_t z_sum = x_sum - x_firstrow; int64_t x2hor_sum = x2_sum - x2_finalcol; int64_t x2ver_sum = x2_sum - x2_finalrow; int64_t y2_sum = x2_sum - x2_firstcol; int64_t z2_sum = x2_sum - x2_firstrow; const float num_hor = (float)(height * (width - 1)); const float num_ver = (float)((height - 1) * width); const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; if (xhor_var_n > 0 && y_var_n > 0) { *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); *hcorr = *hcorr < 0 ? 0 : *hcorr; } else { *hcorr = 1.0; } if (xver_var_n > 0 && z_var_n > 0) { *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); *vcorr = *vcorr < 0 ? 0 : *vcorr; } else { *vcorr = 1.0; } } aom-3.12.1/av1/encoder/arm/reconinter_enc_neon.c000066400000000000000000000246051477627663500214550ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/arm/mem_neon.h" #include "av1/encoder/reconinter_enc.h" void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter_params = av1_get_filter(subpel_search); if (!subpel_x_q3 && !subpel_y_q3) { if (width > 8) { assert(width % 16 == 0); int i = height; do { int j = 0; do { uint8x16_t r = vld1q_u8(ref + j); vst1q_u8(comp_pred + j, r); j += 16; } while (j < width); ref += ref_stride; comp_pred += width; } while (--i != 0); } else if (width == 8) { int i = height; do { uint8x8_t r = vld1_u8(ref); vst1_u8(comp_pred, r); ref += ref_stride; comp_pred += width; } while (--i != 0); } else { assert(width == 4); int i = height / 2; do { uint8x8_t r = load_unaligned_u8(ref, ref_stride); vst1_u8(comp_pred, r); ref += 2 * ref_stride; comp_pred += 2 * width; } while (--i != 0); } } else if (!subpel_y_q3) { const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1); aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL, -1, width, height); } else if (!subpel_x_q3) { const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1); aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y, 16, width, height); } else { DECLARE_ALIGNED(16, uint8_t, im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1); const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1); const int im_stride = MAX_SB_SIZE; const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS; const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1); const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1); assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block, MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height); aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width, NULL, -1, filter_y, 16, width, height); } } void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const 
cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width); } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred8, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter = av1_get_filter(subpel_search); if (!subpel_x_q3 && !subpel_y_q3) { const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width > 4) { assert(width % 8 == 0); int i = height; do { int j = 0; do { uint16x8_t r = vld1q_u16(ref + j); vst1q_u16(comp_pred + j, r); j += 8; } while (j < width); ref += ref_stride; comp_pred += width; } while (--i != 0); } else if (width == 4) { int i = height; do { uint16x4_t r = vld1_u16(ref); vst1_u16(comp_pred, r); ref += ref_stride; comp_pred += width; } while (--i != 0); } else { assert(width == 2); int i = height / 2; do { uint16x4_t r = load_u16_2x2(ref, ref_stride); store_u16x2_strided_x2(comp_pred, width, r); ref += 2 * ref_stride; comp_pred += 2 * width; } while (--i != 0); } } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel, 16, NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL, -1, kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_highbd_convolve8_horiz_neon( ref8 - ref_stride * 
((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert_neon( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, bd); } } void aom_highbd_comp_avg_upsampled_pred_neon( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd, subpel_search); aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8, width); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/encoder/arm/shift_neon.h000066400000000000000000000042431477627663500175760ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ #define AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ #include #define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) \ static inline void name(const type *in, type *out, int size) { \ int i = 0; \ do { \ out[i] = intrinsic(in[i], arg); \ } while (++i < size); \ } SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2) SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2) SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2) SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2) SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2) SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4) SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4) // Addition instructions have slightly better performance compared to shift // instructions on some micro-architectures, so use these for shifts by one. SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i]) SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i]) SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16, vdup_n_s16(0)) SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16, vdupq_n_s16(0)) SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32, vdupq_n_s32(0)) #undef SHIFT_LOOP_HELPER #endif // AOM_AV1_ENCODER_ARM_SHIFT_NEON_H_ aom-3.12.1/av1/encoder/arm/temporal_filter_neon.c000066400000000000000000000551601477627663500216500ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" #include "aom_dsp/mathutils.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" // For the squared error buffer, add padding for 4 samples. #define SSE_STRIDE (BW + 4) // When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits. DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }; static inline void get_squared_error( const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, uint16_t *frame_sse, const unsigned int dst_stride) { uint16_t *dst = frame_sse; uint32_t i = 0; do { uint32_t j = 0; do { uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j); uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j); uint8x16_t abs_diff = vabdq_u8(s, r); uint16x8_t sse_lo = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); uint16x8_t sse_hi = vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff)); vst1q_u16(dst + j + 2, sse_lo); vst1q_u16(dst + j + 10, sse_hi); j += 16; } while (j < block_width); dst += dst_stride; } while (++i < block_height); } static inline uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col, const uint32_t block_width) { uint16x8_t s = vld1q_u16(src); if (col == 0) { const uint16_t lane2 = vgetq_lane_u16(s, 2); s = vsetq_lane_u16(lane2, s, 0); s = vsetq_lane_u16(lane2, s, 1); } else if (col >= block_width - 4) { const uint16_t lane5 = vgetq_lane_u16(s, 5); s = vsetq_lane_u16(lane5, s, 6); s = vsetq_lane_u16(lane5, s, 7); } return s; } static void apply_temporal_filter( const uint8_t *frame, const unsigned int stride, const uint32_t block_width, const uint32_t block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse, const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t diff_sse[BH][BW]; const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask); // Traverse 4 columns at a time - first and last two columns need padding. for (uint32_t col = 0; col < block_width; col += 4) { uint16x8_t vsrc[5]; const uint16_t *src = frame_sse + col; // Load and pad (for first and last two columns) 3 rows from the top. for (int i = 2; i < 5; i++) { vsrc[i] = load_and_pad(src, col, block_width); src += SSE_STRIDE; } // Pad the top 2 rows. 
vsrc[0] = vsrc[2]; vsrc[1] = vsrc[2]; uint32x4_t vsum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; for (int i = 0; i < 4; i++) { vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[0], vmask.val[i])); vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[1], vmask.val[i])); vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[2], vmask.val[i])); vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[3], vmask.val[i])); vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[4], vmask.val[i])); } for (unsigned int row = 0; row < block_height; row++) { uint32x4_t sum_luma = vld1q_u32(luma_sse_sum + row * BW + col); uint32x4_t sum_src = horizontal_add_4d_u32x4(vsum); vst1q_u32(&diff_sse[row][col], vaddq_u32(sum_src, sum_luma)); for (int i = 0; i < 4; i++) { uint32x4_t vsum_0 = vpaddlq_u16(vandq_u16(vsrc[0], vmask.val[i])); vsum[i] = vsubq_u32(vsum[i], vsum_0); } // Push all rows in the sliding window up one. for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } if (row <= block_height - 4) { // Load next row into the bottom of the sliding window. vsrc[4] = load_and_pad(src, col, block_width); src += SSE_STRIDE; } else { // Pad the bottom 2 rows. vsrc[4] = vsrc[3]; } for (int i = 0; i < 4; i++) { vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[4], vmask.val[i])); } } } // Perform filtering. if (tf_wgt_calc_lvl == 0) { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; const double window_error = diff_sse[i][j] * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); accumulator[k] += weight * pixel_value; count[k] += weight; } } } else { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; const double window_error = diff_sse[i][j] * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. 
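// In effect weight = (int)(exp(-min(scaled_error, 7)) * TF_WEIGHT_SCALE), so a
// larger windowed/subblock error or a larger motion-based d_factor gives this
// pixel a smaller filter weight.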
double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); accumulator[k] += weight * pixel_value; count[k] += weight; } } } } void av1_apply_temporal_filter_neon( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; // Frame information. const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? 
(1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; } } } } } get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, plane_h, frame_sse, SSE_STRIDE); apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, int width, int stride, int edge_thresh) { uint16x8_t thresh = vdupq_n_u16(edge_thresh); uint32x4_t acc = vdupq_n_u32(0); // Count is in theory positive as it counts the number of times we're under // the threshold, but it will be counted negatively in order to make best use // of the vclt instruction, which sets every bit of a lane to 1 when the // condition is true. int32x4_t count = vdupq_n_s32(0); int final_count = 0; int64_t final_acc = 0; const uint8_t *src_start = src + stride + 1; int h = 1; do { int w = 1; const uint8_t *src_ptr = src_start; while (w <= (width - 1) - 16) { uint8x16_t mat[3][3]; mat[0][0] = vld1q_u8(src_ptr - stride - 1); mat[0][1] = vld1q_u8(src_ptr - stride); mat[0][2] = vld1q_u8(src_ptr - stride + 1); mat[1][0] = vld1q_u8(src_ptr - 1); mat[1][1] = vld1q_u8(src_ptr); mat[1][2] = vld1q_u8(src_ptr + 1); mat[2][0] = vld1q_u8(src_ptr + stride - 1); mat[2][1] = vld1q_u8(src_ptr + stride); mat[2][2] = vld1q_u8(src_ptr + stride + 1); // Compute Sobel gradients. 
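// The gradients below are the usual 3x3 Sobel responses applied to mat[][]:
//   Gx = [ 1 0 -1 ]   Gy = [  1  2  1 ]
//        [ 2 0 -2 ]        [  0  0  0 ]
//        [ 1 0 -1 ]        [ -1 -2 -1 ]
// and ga approximates the gradient magnitude as |Gx| + |Gy|.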
uint16x8_t gxa_lo = vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0])); uint16x8_t gxa_hi = vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0])); uint16x8_t gxb_lo = vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2])); uint16x8_t gxb_hi = vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2])); gxa_lo = vaddq_u16( gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0]))); gxa_hi = vaddq_u16( gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0]))); gxb_lo = vaddq_u16( gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2]))); gxb_hi = vaddq_u16( gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2]))); uint16x8_t gya_lo = vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); uint16x8_t gya_hi = vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); uint16x8_t gyb_lo = vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); uint16x8_t gyb_hi = vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); gya_lo = vaddq_u16( gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1]))); gya_hi = vaddq_u16( gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1]))); gyb_lo = vaddq_u16( gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1]))); gyb_hi = vaddq_u16( gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1]))); uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo); uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi); // Check which vector elements are under the threshold. The Laplacian is // then unconditionally computed and we accumulate zeros if we're not // under the threshold. This is much faster than using an if statement. uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh); uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh); uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2); uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2); uint16x8_t adj0_lo = vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1])); uint16x8_t adj0_hi = vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1])); uint16x8_t adj1_lo = vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2])); uint16x8_t adj1_hi = vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2])); uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo); adj_lo = vaddq_u16(adj_lo, adj_lo); uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi); adj_hi = vaddq_u16(adj_hi, adj_hi); uint16x8_t diag0_lo = vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); uint16x8_t diag0_hi = vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); uint16x8_t diag1_lo = vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); uint16x8_t diag1_hi = vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo); uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi); uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo); v_lo = vabdq_u16(v_lo, adj_lo); uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi); v_hi = vabdq_u16(v_hi, adj_hi); acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo)); acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi)); // Add -1 for each lane where the gradient is under the threshold. 
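// vcltq_u16 sets a passing lane to 0xFFFF, i.e. -1 when reinterpreted as
// signed, so the count accumulates negatively here and is subtracted back out
// after the loop.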
count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo)); count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi)); w += 16; src_ptr += 16; } if (w <= (width - 1) - 8) { uint8x8_t mat[3][3]; mat[0][0] = vld1_u8(src_ptr - stride - 1); mat[0][1] = vld1_u8(src_ptr - stride); mat[0][2] = vld1_u8(src_ptr - stride + 1); mat[1][0] = vld1_u8(src_ptr - 1); mat[1][1] = vld1_u8(src_ptr); mat[1][2] = vld1_u8(src_ptr + 1); mat[2][0] = vld1_u8(src_ptr + stride - 1); mat[2][1] = vld1_u8(src_ptr + stride); mat[2][2] = vld1_u8(src_ptr + stride + 1); // Compute Sobel gradients. uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); // Check which vector elements are under the threshold. The Laplacian is // then unconditionally computed and we accumulate zeros if we're not // under the threshold. This is much faster than using an if statement. uint16x8_t thresh_u16 = vcltq_u16(ga, thresh); uint16x8_t center = vshll_n_u8(mat[1][1], 2); uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); uint16x8_t adj = vaddq_u16(adj0, adj1); adj = vaddq_u16(adj, adj); uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); uint16x8_t diag = vaddq_u16(diag0, diag1); uint16x8_t v = vaddq_u16(center, diag); v = vabdq_u16(v, adj); acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); // Add -1 for each lane where the gradient is under the threshold. count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); w += 8; src_ptr += 8; } if (w <= (width - 1) - 4) { uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0)); uint8x8_t mat[3][3]; mat[0][0] = load_unaligned_u8_4x1(src_ptr - stride - 1); mat[0][1] = load_unaligned_u8_4x1(src_ptr - stride); mat[0][2] = load_unaligned_u8_4x1(src_ptr - stride + 1); mat[1][0] = load_unaligned_u8_4x1(src_ptr - 1); mat[1][1] = load_unaligned_u8_4x1(src_ptr); mat[1][2] = load_unaligned_u8_4x1(src_ptr + 1); mat[2][0] = load_unaligned_u8_4x1(src_ptr + stride - 1); mat[2][1] = load_unaligned_u8_4x1(src_ptr + stride); mat[2][2] = load_unaligned_u8_4x1(src_ptr + stride + 1); // Compute Sobel gradients. uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); // Check which vector elements are under the threshold. The Laplacian is // then unconditionally computed and we accumulate zeros if we're not // under the threshold. This is much faster than using an if statement. 
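// The value accumulated is |4*c - 2*(up + down + left + right) + (sum of the
// four diagonals)|, i.e. the response of the 3x3 kernel
//   [  1 -2  1 ]
//   [ -2  4 -2 ]
//   [  1 -2  1 ]
// commonly used for fast noise estimation; the final SQRT_PI_BY_2 /
// (6 * count) scaling turns the accumulated absolute response into a noise
// sigma estimate (note added for clarity).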
uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask); uint16x8_t center = vshll_n_u8(mat[1][1], 2); uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); uint16x8_t adj = vaddq_u16(adj0, adj1); adj = vaddq_u16(adj, adj); uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); uint16x8_t diag = vaddq_u16(diag0, diag1); uint16x8_t v = vaddq_u16(center, diag); v = vabdq_u16(v, adj); acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); // Add -1 for each lane where the gradient is under the threshold. count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); w += 4; src_ptr += 4; } while (w < width - 1) { int mat[3][3]; mat[0][0] = *(src_ptr - stride - 1); mat[0][1] = *(src_ptr - stride); mat[0][2] = *(src_ptr - stride + 1); mat[1][0] = *(src_ptr - 1); mat[1][1] = *(src_ptr); mat[1][2] = *(src_ptr + 1); mat[2][0] = *(src_ptr + stride - 1); mat[2][1] = *(src_ptr + stride); mat[2][2] = *(src_ptr + stride + 1); // Compute Sobel gradients. const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + 2 * (mat[1][0] - mat[1][2]); const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + 2 * (mat[0][1] - mat[2][1]); const int ga = abs(gx) + abs(gy); // Accumulate Laplacian. const int is_under = ga < edge_thresh; const int v = 4 * mat[1][1] - 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); final_acc += abs(v) * is_under; final_count += is_under; src_ptr++; w++; } src_start += stride; } while (++h < height - 1); // We counted negatively, so subtract to get the final value. final_count -= horizontal_add_s32x4(count); final_acc += horizontal_long_add_u32x4(acc); return (final_count < 16) ? -1.0 : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2; } aom-3.12.1/av1/encoder/arm/temporal_filter_neon_dotprod.c000066400000000000000000000324711477627663500234030ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" #include "aom_dsp/mathutils.h" #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" // For the squared error buffer, add padding for 4 samples. #define SSE_STRIDE (BW + 4) // clang-format off // Table used to pad the first and last columns and apply the sliding window. DECLARE_ALIGNED(16, static const uint8_t, kLoadPad[4][16]) = { { 2, 2, 2, 3, 4, 255, 255, 255, 255, 2, 2, 3, 4, 5, 255, 255 }, { 255, 255, 2, 3, 4, 5, 6, 255, 255, 255, 255, 3, 4, 5, 6, 7 }, { 0, 1, 2, 3, 4, 255, 255, 255, 255, 1, 2, 3, 4, 5, 255, 255 }, { 255, 255, 2, 3, 4, 5, 5, 255, 255, 255, 255, 3, 4, 5, 5, 5 } }; // For columns that don't need to be padded it's just a simple mask. 
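// Added explanatory note: each 8 bytes of absolute differences are duplicated
// into a 16-byte vector and then either shuffled through kLoadPad (replicating
// the edge sample for the first/last output columns) or masked with
// kSlidingWindowMask, so that the 5-sample window of each of the four output
// columns lands in its own group of lanes. A pair of vdotq_u32 instructions
// then squares and accumulates one row for all four windows at once, and five
// such rows give each column its 5x5 windowed sum of squared errors.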
DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; // clang-format on static inline void get_abs_diff(const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2, const uint32_t stride2, const uint32_t block_width, const uint32_t block_height, uint8_t *frame_abs_diff, const unsigned int dst_stride) { uint8_t *dst = frame_abs_diff; uint32_t i = 0; do { uint32_t j = 0; do { uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j); uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j); uint8x16_t abs_diff = vabdq_u8(s, r); vst1q_u8(dst + j + 2, abs_diff); j += 16; } while (j < block_width); dst += dst_stride; } while (++i < block_height); } static void apply_temporal_filter( const uint8_t *frame, const unsigned int stride, const uint32_t block_width, const uint32_t block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff, const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t diff_sse[BH][BW]; const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask); const uint8x16_t pad_tbl0 = vld1q_u8(kLoadPad[0]); const uint8x16_t pad_tbl1 = vld1q_u8(kLoadPad[1]); const uint8x16_t pad_tbl2 = vld1q_u8(kLoadPad[2]); const uint8x16_t pad_tbl3 = vld1q_u8(kLoadPad[3]); // Traverse 4 columns at a time - first and last two columns need padding. for (uint32_t col = 0; col < block_width; col += 4) { uint8x16_t vsrc[5][2]; const uint8_t *src = frame_abs_diff + col; // Load, pad (for first and last two columns) and mask 3 rows from the top. for (int i = 2; i < 5; i++) { uint8x8_t s = vld1_u8(src); uint8x16_t s_dup = vcombine_u8(s, s); if (col == 0) { vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl0); vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl1); } else if (col >= block_width - 4) { vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl2); vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl3); } else { vsrc[i][0] = vandq_u8(s_dup, vmask.val[0]); vsrc[i][1] = vandq_u8(s_dup, vmask.val[1]); } src += SSE_STRIDE; } // Pad the top 2 rows. 
vsrc[0][0] = vsrc[2][0]; vsrc[0][1] = vsrc[2][1]; vsrc[1][0] = vsrc[2][0]; vsrc[1][1] = vsrc[2][1]; uint32x4_t sum_01 = vdupq_n_u32(0); uint32x4_t sum_23 = vdupq_n_u32(0); sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]); sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]); sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]); sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]); sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]); sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]); sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]); sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]); sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]); sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]); for (unsigned int row = 0; row < block_height; row++) { uint32x4_t sum_luma = vld1q_u32(luma_sse_sum + row * BW + col); uint32x4_t sum_0123 = vpaddq_u32(sum_01, sum_23); vst1q_u32(&diff_sse[row][col], vaddq_u32(sum_0123, sum_luma)); uint32x4_t sub_01 = vdotq_u32(vdupq_n_u32(0), vsrc[0][0], vsrc[0][0]); uint32x4_t sub_23 = vdotq_u32(vdupq_n_u32(0), vsrc[0][1], vsrc[0][1]); sum_01 = vsubq_u32(sum_01, sub_01); sum_23 = vsubq_u32(sum_23, sub_23); // Push all rows in the sliding window up one. for (int i = 0; i < 4; i++) { vsrc[i][0] = vsrc[i + 1][0]; vsrc[i][1] = vsrc[i + 1][1]; } if (row <= block_height - 4) { // Load next row into the bottom of the sliding window. uint8x8_t s = vld1_u8(src); uint8x16_t s_dup = vcombine_u8(s, s); if (col == 0) { vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl0); vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl1); } else if (col >= block_width - 4) { vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl2); vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl3); } else { vsrc[4][0] = vandq_u8(s_dup, vmask.val[0]); vsrc[4][1] = vandq_u8(s_dup, vmask.val[1]); } src += SSE_STRIDE; } else { // Pad the bottom 2 rows. vsrc[4][0] = vsrc[3][0]; vsrc[4][1] = vsrc[3][1]; } sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]); sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]); } } // Perform filtering. if (tf_wgt_calc_lvl == 0) { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; const double window_error = diff_sse[i][j] * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); accumulator[k] += weight * pixel_value; count[k] += weight; } } } else { for (unsigned int i = 0, k = 0; i < block_height; i++) { for (unsigned int j = 0; j < block_width; j++, k++) { const int pixel_value = frame[i * stride + j]; const double window_error = diff_sse[i][j] * inv_num_ref_pixels; const int subblock_idx = (i >= block_height / 2) * 2 + (j >= block_width / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. 
double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); accumulator[k] += weight * pixel_value; count[k] += weight; } } } } void av1_apply_temporal_filter_neon_dotprod( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; // Frame information. const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? 
(1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += (frame_abs_diff[yy * SSE_STRIDE + xx + 2] * frame_abs_diff[yy * SSE_STRIDE + xx + 2]); } } } } } get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w, plane_h, frame_abs_diff, SSE_STRIDE); apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_abs_diff, luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } aom-3.12.1/av1/encoder/arm/txfm_neon.h000066400000000000000000000017221477627663500174360ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ #define AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ #include static inline void ud_adjust_input_and_stride(int ud_flip, const int16_t **input, int *stride, int out_size) { if (ud_flip) { *input = *input + (out_size - 1) * *stride; *stride = -*stride; } } #endif // AOM_AV1_ENCODER_ARM_TXFM_NEON_H_ aom-3.12.1/av1/encoder/arm/wedge_utils_neon.c000066400000000000000000000113151477627663500207650ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/sum_neon.h" #include "av1/common/reconinter.h" #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) /** * See av1_wedge_sse_from_residuals_c for details of the parameters and * computation. 
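 * In scalar terms this is approximately
 *   csse += t * t, with t = (r1[i] << WEDGE_WEIGHT_BITS) + m[i] * d[i]
 * accumulated over all N samples (t is saturated to 16 bits, matching the
 * vqmovn_s32 narrowing below), and the result is
 * ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS). (Note added for clarity.)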
*/ uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { assert(N % 64 == 0); uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; int i = 0; do { int32x4_t sum[4]; int32x4_t sse[2]; int16x4_t sum_s16[4]; const int16x8_t r1_l = vld1q_s16(r1 + i); const int16x8_t r1_h = vld1q_s16(r1 + i + 8); const int16x8_t d_l = vld1q_s16(d + i); const int16x8_t d_h = vld1q_s16(d + i + 8); // The following three lines are a bit inelegant compared to using a pair // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair - // which can be executed in parallel with the subsequent SSHL instructions. // (SSHL can only be executed on half of the Neon pipes in modern Arm // cores, whereas ZIP1/2 can be executed on all of them.) const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0)); const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]); const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]); sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS); sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS); sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS); sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS); sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l)); sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l)); sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h)); sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h)); sum_s16[0] = vqmovn_s32(sum[0]); sum_s16[1] = vqmovn_s32(sum[1]); sum_s16[2] = vqmovn_s32(sum[2]); sum_s16[3] = vqmovn_s32(sum[3]); sse[0] = vmull_s16(sum_s16[0], sum_s16[0]); sse[1] = vmull_s16(sum_s16[2], sum_s16[2]); sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]); sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]); v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0])); v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1])); i += 16; } while (i < N); uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1])); return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); } int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0) }; do { int16x8_t ds_l = vld1q_s16(ds); int16x8_t ds_h = vld1q_s16(ds + 8); int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m)); int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8)); int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8)); acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l)); acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l)); acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h)); acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h)); ds += 16; m += 16; N -= 16; } while (N != 0); int64x2_t sum = vpaddlq_s32(acc[0]); sum = vpadalq_s32(sum, acc[1]); sum = vpadalq_s32(sum, acc[2]); sum = vpadalq_s32(sum, acc[3]); return (horizontal_add_s64x2(sum) > limit); } void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr, const int16_t *b_ptr, int N) { do { int16x8_t a = vld1q_s16(a_ptr); int16x8_t b = vld1q_s16(b_ptr); int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a)); int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a)); sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b)); sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b)); int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi)); vst1q_s16(d_ptr, res); d_ptr += 8; a_ptr += 8; b_ptr 
+= 8; N -= 8; } while (N != 0); } aom-3.12.1/av1/encoder/arm/wedge_utils_sve.c000066400000000000000000000064531477627663500206320ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/arm/aom_neon_sve_bridge.h" #include "aom_dsp/arm/sum_neon.h" #include "av1/common/reconinter.h" uint64_t av1_wedge_sse_from_residuals_sve(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { assert(N % 64 == 0); // Predicate pattern with first 8 elements true. const svbool_t pattern = svptrue_pat_b16(SV_VL8); int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; int i = 0; do { int32x4_t sum[4]; int16x8_t sum_s16[2]; const int16x8_t r1_l = vld1q_s16(r1 + i); const int16x8_t r1_h = vld1q_s16(r1 + i + 8); const int16x8_t d_l = vld1q_s16(d + i); const int16x8_t d_h = vld1q_s16(d + i + 8); // Use a zero-extending load to widen the vector elements. const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m + i)); const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + i + 8)); sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS); sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS); sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS); sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS); sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l)); sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l)); sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h)); sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h)); sum_s16[0] = vcombine_s16(vqmovn_s32(sum[0]), vqmovn_s32(sum[1])); sum_s16[1] = vcombine_s16(vqmovn_s32(sum[2]), vqmovn_s32(sum[3])); sse[0] = aom_sdotq_s16(sse[0], sum_s16[0], sum_s16[0]); sse[1] = aom_sdotq_s16(sse[1], sum_s16[1], sum_s16[1]); i += 16; } while (i < N); const uint64_t csse = (uint64_t)horizontal_add_s64x2(vaddq_s64(sse[0], sse[1])); return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); } int8_t av1_wedge_sign_from_residuals_sve(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { assert(N % 16 == 0); // Predicate pattern with first 8 elements true. svbool_t pattern = svptrue_pat_b16(SV_VL8); int64x2_t acc_l = vdupq_n_s64(0); int64x2_t acc_h = vdupq_n_s64(0); do { const int16x8_t ds_l = vld1q_s16(ds); const int16x8_t ds_h = vld1q_s16(ds + 8); // Use a zero-extending load to widen the vector elements. const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m)); const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + 8)); acc_l = aom_sdotq_s16(acc_l, ds_l, m_l); acc_h = aom_sdotq_s16(acc_h, ds_h, m_h); ds += 16; m += 16; N -= 16; } while (N != 0); const int64x2_t sum = vaddq_s64(acc_l, acc_h); return horizontal_add_s64x2(sum) > limit; } aom-3.12.1/av1/encoder/av1_fwd_txfm1d.c000066400000000000000000001740411477627663500174740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/encoder/av1_fwd_txfm1d.h" #include "av1/common/av1_txfm.h" void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 4; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[4]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; bf1 = output; bf1[0] = input[0] + input[3]; bf1[1] = input[1] + input[2]; bf1[2] = -input[2] + input[1]; bf1[3] = -input[3] + input[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[2]; bf1[2] = bf0[1]; bf1[3] = bf0[3]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 8; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; bf1 = output; bf1[0] = input[0] + input[7]; bf1[1] = input[1] + input[6]; bf1[2] = input[2] + input[5]; bf1[3] = input[3] + input[4]; bf1[4] = -input[4] + input[3]; bf1[5] = -input[5] + input[2]; bf1[6] = -input[6] + input[1]; bf1[7] = -input[7] + input[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[3]; bf1[1] = bf0[1] + bf0[2]; bf1[2] = -bf0[2] + bf0[1]; bf1[3] = -bf0[3] + bf0[0]; bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); bf1[4] = bf0[4] + bf0[5]; bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; 
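  /* Final stage of the 8-point DCT: permute the butterfly outputs into
   * natural frequency order (a bit-reversal of the indices); no further
   * arithmetic is performed here. */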
bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[4]; bf1[2] = bf0[2]; bf1[3] = bf0[6]; bf1[4] = bf0[1]; bf1[5] = bf0[5]; bf1[6] = bf0[3]; bf1[7] = bf0[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 16; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; bf1 = output; bf1[0] = input[0] + input[15]; bf1[1] = input[1] + input[14]; bf1[2] = input[2] + input[13]; bf1[3] = input[3] + input[12]; bf1[4] = input[4] + input[11]; bf1[5] = input[5] + input[10]; bf1[6] = input[6] + input[9]; bf1[7] = input[7] + input[8]; bf1[8] = -input[8] + input[7]; bf1[9] = -input[9] + input[6]; bf1[10] = -input[10] + input[5]; bf1[11] = -input[11] + input[4]; bf1[12] = -input[12] + input[3]; bf1[13] = -input[13] + input[2]; bf1[14] = -input[14] + input[1]; bf1[15] = -input[15] + input[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[7]; bf1[1] = bf0[1] + bf0[6]; bf1[2] = bf0[2] + bf0[5]; bf1[3] = bf0[3] + bf0[4]; bf1[4] = -bf0[4] + bf0[3]; bf1[5] = -bf0[5] + bf0[2]; bf1[6] = -bf0[6] + bf0[1]; bf1[7] = -bf0[7] + bf0[0]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[3]; bf1[1] = bf0[1] + bf0[2]; bf1[2] = -bf0[2] + bf0[1]; bf1[3] = -bf0[3] + bf0[0]; bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; bf1[8] = bf0[8] + bf0[11]; bf1[9] = bf0[9] + bf0[10]; bf1[10] = -bf0[10] + bf0[9]; bf1[11] = -bf0[11] + bf0[8]; bf1[12] = -bf0[12] + bf0[15]; bf1[13] = -bf0[13] + bf0[14]; bf1[14] = bf0[14] + bf0[13]; bf1[15] = bf0[15] + bf0[12]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); bf1[4] = bf0[4] + bf0[5]; bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); bf1[15] = bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], 
bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); bf1[8] = bf0[8] + bf0[9]; bf1[9] = -bf0[9] + bf0[8]; bf1[10] = -bf0[10] + bf0[11]; bf1[11] = bf0[11] + bf0[10]; bf1[12] = bf0[12] + bf0[13]; bf1[13] = -bf0[13] + bf0[12]; bf1[14] = -bf0[14] + bf0[15]; bf1[15] = bf0[15] + bf0[14]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[8]; bf1[2] = bf0[4]; bf1[3] = bf0[12]; bf1[4] = bf0[2]; bf1[5] = bf0[10]; bf1[6] = bf0[6]; bf1[7] = bf0[14]; bf1[8] = bf0[1]; bf1[9] = bf0[9]; bf1[10] = bf0[5]; bf1[11] = bf0[13]; bf1[12] = bf0[3]; bf1[13] = bf0[11]; bf1[14] = bf0[7]; bf1[15] = bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 32; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[32]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; bf1 = output; bf1[0] = input[0] + input[31]; bf1[1] = input[1] + input[30]; bf1[2] = input[2] + input[29]; bf1[3] = input[3] + input[28]; bf1[4] = input[4] + input[27]; bf1[5] = input[5] + input[26]; bf1[6] = input[6] + input[25]; bf1[7] = input[7] + input[24]; bf1[8] = input[8] + input[23]; bf1[9] = input[9] + input[22]; bf1[10] = input[10] + input[21]; bf1[11] = input[11] + input[20]; bf1[12] = input[12] + input[19]; bf1[13] = input[13] + input[18]; bf1[14] = input[14] + input[17]; bf1[15] = input[15] + input[16]; bf1[16] = -input[16] + input[15]; bf1[17] = -input[17] + input[14]; bf1[18] = -input[18] + input[13]; bf1[19] = -input[19] + input[12]; bf1[20] = -input[20] + input[11]; bf1[21] = -input[21] + input[10]; bf1[22] = -input[22] + input[9]; bf1[23] = -input[23] + input[8]; bf1[24] = -input[24] + input[7]; bf1[25] = -input[25] + input[6]; bf1[26] = -input[26] + input[5]; bf1[27] = -input[27] + input[4]; bf1[28] = -input[28] + input[3]; bf1[29] = -input[29] + input[2]; bf1[30] = -input[30] + input[1]; bf1[31] = -input[31] + input[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[15]; bf1[1] = bf0[1] + bf0[14]; bf1[2] = bf0[2] + bf0[13]; bf1[3] = bf0[3] + bf0[12]; bf1[4] = bf0[4] + bf0[11]; bf1[5] = bf0[5] + bf0[10]; bf1[6] = bf0[6] + bf0[9]; bf1[7] = bf0[7] + bf0[8]; bf1[8] = -bf0[8] + bf0[7]; bf1[9] = -bf0[9] + bf0[6]; bf1[10] = -bf0[10] + bf0[5]; bf1[11] = -bf0[11] + bf0[4]; bf1[12] = -bf0[12] + bf0[3]; bf1[13] = 
-bf0[13] + bf0[2]; bf1[14] = -bf0[14] + bf0[1]; bf1[15] = -bf0[15] + bf0[0]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[7]; bf1[1] = bf0[1] + bf0[6]; bf1[2] = bf0[2] + bf0[5]; bf1[3] = bf0[3] + bf0[4]; bf1[4] = -bf0[4] + bf0[3]; bf1[5] = -bf0[5] + bf0[2]; bf1[6] = -bf0[6] + bf0[1]; bf1[7] = -bf0[7] + bf0[0]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = bf0[16] + bf0[23]; bf1[17] = bf0[17] + bf0[22]; bf1[18] = bf0[18] + bf0[21]; bf1[19] = bf0[19] + bf0[20]; bf1[20] = -bf0[20] + bf0[19]; bf1[21] = -bf0[21] + bf0[18]; bf1[22] = -bf0[22] + bf0[17]; bf1[23] = -bf0[23] + bf0[16]; bf1[24] = -bf0[24] + bf0[31]; bf1[25] = -bf0[25] + bf0[30]; bf1[26] = -bf0[26] + bf0[29]; bf1[27] = -bf0[27] + bf0[28]; bf1[28] = bf0[28] + bf0[27]; bf1[29] = bf0[29] + bf0[26]; bf1[30] = bf0[30] + bf0[25]; bf1[31] = bf0[31] + bf0[24]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[3]; bf1[1] = bf0[1] + bf0[2]; bf1[2] = -bf0[2] + bf0[1]; bf1[3] = -bf0[3] + bf0[0]; bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; bf1[8] = bf0[8] + bf0[11]; bf1[9] = bf0[9] + bf0[10]; bf1[10] = -bf0[10] + bf0[9]; bf1[11] = -bf0[11] + bf0[8]; bf1[12] = -bf0[12] + bf0[15]; bf1[13] = -bf0[13] + bf0[14]; bf1[14] = bf0[14] + bf0[13]; bf1[15] = bf0[15] + bf0[12]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(-cospi[32], 
bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); bf1[4] = bf0[4] + bf0[5]; bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); bf1[15] = bf0[15]; bf1[16] = bf0[16] + bf0[19]; bf1[17] = bf0[17] + bf0[18]; bf1[18] = -bf0[18] + bf0[17]; bf1[19] = -bf0[19] + bf0[16]; bf1[20] = -bf0[20] + bf0[23]; bf1[21] = -bf0[21] + bf0[22]; bf1[22] = bf0[22] + bf0[21]; bf1[23] = bf0[23] + bf0[20]; bf1[24] = bf0[24] + bf0[27]; bf1[25] = bf0[25] + bf0[26]; bf1[26] = -bf0[26] + bf0[25]; bf1[27] = -bf0[27] + bf0[24]; bf1[28] = -bf0[28] + bf0[31]; bf1[29] = -bf0[29] + bf0[30]; bf1[30] = bf0[30] + bf0[29]; bf1[31] = bf0[31] + bf0[28]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); bf1[8] = bf0[8] + bf0[9]; bf1[9] = -bf0[9] + bf0[8]; bf1[10] = -bf0[10] + bf0[11]; bf1[11] = bf0[11] + bf0[10]; bf1[12] = bf0[12] + bf0[13]; bf1[13] = -bf0[13] + bf0[12]; bf1[14] = -bf0[14] + bf0[15]; bf1[15] = bf0[15] + bf0[14]; bf1[16] = bf0[16]; bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); bf1[16] = bf0[16] + bf0[17]; bf1[17] = -bf0[17] + bf0[16]; bf1[18] = -bf0[18] + bf0[19]; bf1[19] = bf0[19] + bf0[18]; bf1[20] = bf0[20] + bf0[21]; bf1[21] = -bf0[21] + bf0[20]; bf1[22] = -bf0[22] + bf0[23]; bf1[23] = 
bf0[23] + bf0[22]; bf1[24] = bf0[24] + bf0[25]; bf1[25] = -bf0[25] + bf0[24]; bf1[26] = -bf0[26] + bf0[27]; bf1[27] = bf0[27] + bf0[26]; bf1[28] = bf0[28] + bf0[29]; bf1[29] = -bf0[29] + bf0[28]; bf1[30] = -bf0[30] + bf0[31]; bf1[31] = bf0[31] + bf0[30]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[16]; bf1[2] = bf0[8]; bf1[3] = bf0[24]; bf1[4] = bf0[4]; bf1[5] = bf0[20]; bf1[6] = bf0[12]; bf1[7] = bf0[28]; bf1[8] = bf0[2]; bf1[9] = bf0[18]; bf1[10] = bf0[10]; bf1[11] = bf0[26]; bf1[12] = bf0[6]; bf1[13] = bf0[22]; bf1[14] = bf0[14]; bf1[15] = bf0[30]; bf1[16] = bf0[1]; bf1[17] = bf0[17]; bf1[18] = bf0[9]; bf1[19] = bf0[25]; bf1[20] = bf0[5]; bf1[21] = bf0[21]; bf1[22] = bf0[13]; bf1[23] = bf0[29]; bf1[24] = bf0[3]; bf1[25] = bf0[19]; bf1[26] = bf0[11]; bf1[27] = bf0[27]; bf1[28] = bf0[7]; bf1[29] = bf0[23]; bf1[30] = bf0[15]; bf1[31] = bf0[31]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { int bit = cos_bit; const int32_t *sinpi = sinpi_arr(bit); int32_t x0, x1, x2, x3; int32_t s0, s1, s2, s3, s4, s5, s6, s7; // stage 0 av1_range_check_buf(0, input, input, 4, stage_range[0]); x0 = input[0]; x1 = input[1]; x2 = input[2]; x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { output[0] = output[1] = output[2] = output[3] = 0; return; } // stage 1 s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); s7 = range_check_value(x0 + x1, stage_range[1]); // stage 2 s7 = range_check_value(s7 - x3, stage_range[2]); // stage 3 x0 
= range_check_value(s0 + s2, bit + stage_range[3]); x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); x2 = range_check_value(s1 - s3, bit + stage_range[3]); x3 = range_check_value(s4, bit + stage_range[3]); // stage 4 x0 = range_check_value(x0 + s5, bit + stage_range[4]); x2 = range_check_value(x2 + s6, bit + stage_range[4]); // stage 5 s0 = range_check_value(x0 + x3, bit + stage_range[5]); s1 = range_check_value(x1, bit + stage_range[5]); s2 = range_check_value(x2 - x3, bit + stage_range[5]); s3 = range_check_value(x2 - x0, bit + stage_range[5]); // stage 6 s3 = range_check_value(s3 + x3, bit + stage_range[6]); // 1-D transform scaling factor is sqrt(2). output[0] = round_shift(s0, bit); output[1] = round_shift(s1, bit); output[2] = round_shift(s2, bit); output[3] = round_shift(s3, bit); av1_range_check_buf(6, input, output, 4, stage_range[6]); } void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 8; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[8]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = -input[7]; bf1[2] = -input[3]; bf1[3] = input[4]; bf1[4] = -input[1]; bf1[5] = input[6]; bf1[6] = input[2]; bf1[7] = -input[5]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[2]; bf1[1] = bf0[1] + bf0[3]; bf1[2] = bf0[0] - bf0[2]; bf1[3] = bf0[1] - bf0[3]; bf1[4] = bf0[4] + bf0[6]; bf1[5] = bf0[5] + bf0[7]; bf1[6] = bf0[4] - bf0[6]; bf1[7] = bf0[5] - bf0[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[4]; bf1[1] = bf0[1] + bf0[5]; bf1[2] = bf0[2] + bf0[6]; bf1[3] = bf0[3] + bf0[7]; bf1[4] = bf0[0] - bf0[4]; bf1[5] = bf0[1] - bf0[5]; bf1[6] = bf0[2] - bf0[6]; bf1[7] = bf0[3] - bf0[7]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], 
cos_bit); bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[1]; bf1[1] = bf0[6]; bf1[2] = bf0[3]; bf1[3] = bf0[4]; bf1[4] = bf0[5]; bf1[5] = bf0[2]; bf1[6] = bf0[7]; bf1[7] = bf0[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 16; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[16]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; assert(output != input); bf1 = output; bf1[0] = input[0]; bf1[1] = -input[15]; bf1[2] = -input[7]; bf1[3] = input[8]; bf1[4] = -input[3]; bf1[5] = input[12]; bf1[6] = input[4]; bf1[7] = -input[11]; bf1[8] = -input[1]; bf1[9] = input[14]; bf1[10] = input[6]; bf1[11] = -input[9]; bf1[12] = input[2]; bf1[13] = -input[13]; bf1[14] = -input[5]; bf1[15] = input[10]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[2]; bf1[1] = bf0[1] + bf0[3]; bf1[2] = bf0[0] - bf0[2]; bf1[3] = bf0[1] - bf0[3]; bf1[4] = bf0[4] + bf0[6]; bf1[5] = bf0[5] + bf0[7]; bf1[6] = bf0[4] - bf0[6]; bf1[7] = bf0[5] - bf0[7]; bf1[8] = bf0[8] + bf0[10]; bf1[9] = bf0[9] + bf0[11]; bf1[10] = bf0[8] - bf0[10]; bf1[11] = bf0[9] - bf0[11]; bf1[12] = bf0[12] + bf0[14]; bf1[13] = bf0[13] + bf0[15]; bf1[14] = bf0[12] - bf0[14]; bf1[15] = bf0[13] - bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 5 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[4]; bf1[1] = bf0[1] + bf0[5]; bf1[2] = bf0[2] + bf0[6]; bf1[3] = bf0[3] + bf0[7]; bf1[4] = bf0[0] - bf0[4]; bf1[5] = bf0[1] - bf0[5]; bf1[6] = bf0[2] - bf0[6]; 
bf1[7] = bf0[3] - bf0[7]; bf1[8] = bf0[8] + bf0[12]; bf1[9] = bf0[9] + bf0[13]; bf1[10] = bf0[10] + bf0[14]; bf1[11] = bf0[11] + bf0[15]; bf1[12] = bf0[8] - bf0[12]; bf1[13] = bf0[9] - bf0[13]; bf1[14] = bf0[10] - bf0[14]; bf1[15] = bf0[11] - bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[8]; bf1[1] = bf0[1] + bf0[9]; bf1[2] = bf0[2] + bf0[10]; bf1[3] = bf0[3] + bf0[11]; bf1[4] = bf0[4] + bf0[12]; bf1[5] = bf0[5] + bf0[13]; bf1[6] = bf0[6] + bf0[14]; bf1[7] = bf0[7] + bf0[15]; bf1[8] = bf0[0] - bf0[8]; bf1[9] = bf0[1] - bf0[9]; bf1[10] = bf0[2] - bf0[10]; bf1[11] = bf0[3] - bf0[11]; bf1[12] = bf0[4] - bf0[12]; bf1[13] = bf0[5] - bf0[13]; bf1[14] = bf0[6] - bf0[14]; bf1[15] = bf0[7] - bf0[15]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[1]; bf1[1] = bf0[14]; bf1[2] = bf0[3]; bf1[3] = bf0[12]; bf1[4] = bf0[5]; bf1[5] = bf0[10]; bf1[6] = bf0[7]; bf1[7] = bf0[8]; bf1[8] = bf0[9]; bf1[9] = bf0[6]; bf1[10] = bf0[11]; bf1[11] = bf0[4]; bf1[12] = bf0[13]; bf1[13] = bf0[2]; bf1[14] = bf0[15]; bf1[15] = bf0[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 4; ++i) output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + 
NewSqrt2Bits <= 32); av1_range_check_buf(0, input, output, 4, stage_range[0]); } void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; av1_range_check_buf(0, input, output, 8, stage_range[0]); } void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 16; ++i) output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); assert(stage_range[0] + NewSqrt2Bits <= 32); av1_range_check_buf(0, input, output, 16, stage_range[0]); } void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { (void)cos_bit; for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; av1_range_check_buf(0, input, output, 32, stage_range[0]); } void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range) { const int32_t size = 64; const int32_t *cospi; int32_t stage = 0; int32_t *bf0, *bf1; int32_t step[64]; // stage 0; av1_range_check_buf(stage, input, input, size, stage_range[stage]); // stage 1; stage++; bf1 = output; bf1[0] = input[0] + input[63]; bf1[1] = input[1] + input[62]; bf1[2] = input[2] + input[61]; bf1[3] = input[3] + input[60]; bf1[4] = input[4] + input[59]; bf1[5] = input[5] + input[58]; bf1[6] = input[6] + input[57]; bf1[7] = input[7] + input[56]; bf1[8] = input[8] + input[55]; bf1[9] = input[9] + input[54]; bf1[10] = input[10] + input[53]; bf1[11] = input[11] + input[52]; bf1[12] = input[12] + input[51]; bf1[13] = input[13] + input[50]; bf1[14] = input[14] + input[49]; bf1[15] = input[15] + input[48]; bf1[16] = input[16] + input[47]; bf1[17] = input[17] + input[46]; bf1[18] = input[18] + input[45]; bf1[19] = input[19] + input[44]; bf1[20] = input[20] + input[43]; bf1[21] = input[21] + input[42]; bf1[22] = input[22] + input[41]; bf1[23] = input[23] + input[40]; bf1[24] = input[24] + input[39]; bf1[25] = input[25] + input[38]; bf1[26] = input[26] + input[37]; bf1[27] = input[27] + input[36]; bf1[28] = input[28] + input[35]; bf1[29] = input[29] + input[34]; bf1[30] = input[30] + input[33]; bf1[31] = input[31] + input[32]; bf1[32] = -input[32] + input[31]; bf1[33] = -input[33] + input[30]; bf1[34] = -input[34] + input[29]; bf1[35] = -input[35] + input[28]; bf1[36] = -input[36] + input[27]; bf1[37] = -input[37] + input[26]; bf1[38] = -input[38] + input[25]; bf1[39] = -input[39] + input[24]; bf1[40] = -input[40] + input[23]; bf1[41] = -input[41] + input[22]; bf1[42] = -input[42] + input[21]; bf1[43] = -input[43] + input[20]; bf1[44] = -input[44] + input[19]; bf1[45] = -input[45] + input[18]; bf1[46] = -input[46] + input[17]; bf1[47] = -input[47] + input[16]; bf1[48] = -input[48] + input[15]; bf1[49] = -input[49] + input[14]; bf1[50] = -input[50] + input[13]; bf1[51] = -input[51] + input[12]; bf1[52] = -input[52] + input[11]; bf1[53] = -input[53] + input[10]; bf1[54] = -input[54] + input[9]; bf1[55] = -input[55] + input[8]; bf1[56] = -input[56] + input[7]; bf1[57] = -input[57] + input[6]; bf1[58] = -input[58] + input[5]; bf1[59] = -input[59] + input[4]; bf1[60] = -input[60] + input[3]; bf1[61] = -input[61] + input[2]; bf1[62] = -input[62] + input[1]; bf1[63] = -input[63] + input[0]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 2 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[31]; bf1[1] = bf0[1] + bf0[30]; bf1[2] = bf0[2] + bf0[29]; bf1[3] = bf0[3] 
+ bf0[28]; bf1[4] = bf0[4] + bf0[27]; bf1[5] = bf0[5] + bf0[26]; bf1[6] = bf0[6] + bf0[25]; bf1[7] = bf0[7] + bf0[24]; bf1[8] = bf0[8] + bf0[23]; bf1[9] = bf0[9] + bf0[22]; bf1[10] = bf0[10] + bf0[21]; bf1[11] = bf0[11] + bf0[20]; bf1[12] = bf0[12] + bf0[19]; bf1[13] = bf0[13] + bf0[18]; bf1[14] = bf0[14] + bf0[17]; bf1[15] = bf0[15] + bf0[16]; bf1[16] = -bf0[16] + bf0[15]; bf1[17] = -bf0[17] + bf0[14]; bf1[18] = -bf0[18] + bf0[13]; bf1[19] = -bf0[19] + bf0[12]; bf1[20] = -bf0[20] + bf0[11]; bf1[21] = -bf0[21] + bf0[10]; bf1[22] = -bf0[22] + bf0[9]; bf1[23] = -bf0[23] + bf0[8]; bf1[24] = -bf0[24] + bf0[7]; bf1[25] = -bf0[25] + bf0[6]; bf1[26] = -bf0[26] + bf0[5]; bf1[27] = -bf0[27] + bf0[4]; bf1[28] = -bf0[28] + bf0[3]; bf1[29] = -bf0[29] + bf0[2]; bf1[30] = -bf0[30] + bf0[1]; bf1[31] = -bf0[31] + bf0[0]; bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; bf1[35] = bf0[35]; bf1[36] = bf0[36]; bf1[37] = bf0[37]; bf1[38] = bf0[38]; bf1[39] = bf0[39]; bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); bf1[56] = bf0[56]; bf1[57] = bf0[57]; bf1[58] = bf0[58]; bf1[59] = bf0[59]; bf1[60] = bf0[60]; bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 3 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[15]; bf1[1] = bf0[1] + bf0[14]; bf1[2] = bf0[2] + bf0[13]; bf1[3] = bf0[3] + bf0[12]; bf1[4] = bf0[4] + bf0[11]; bf1[5] = bf0[5] + bf0[10]; bf1[6] = bf0[6] + bf0[9]; bf1[7] = bf0[7] + bf0[8]; bf1[8] = -bf0[8] + bf0[7]; bf1[9] = -bf0[9] + bf0[6]; bf1[10] = -bf0[10] + bf0[5]; bf1[11] = -bf0[11] + bf0[4]; bf1[12] = -bf0[12] + bf0[3]; bf1[13] = -bf0[13] + bf0[2]; bf1[14] = -bf0[14] + bf0[1]; bf1[15] = -bf0[15] + bf0[0]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = bf0[32] + bf0[47]; 
bf1[33] = bf0[33] + bf0[46]; bf1[34] = bf0[34] + bf0[45]; bf1[35] = bf0[35] + bf0[44]; bf1[36] = bf0[36] + bf0[43]; bf1[37] = bf0[37] + bf0[42]; bf1[38] = bf0[38] + bf0[41]; bf1[39] = bf0[39] + bf0[40]; bf1[40] = -bf0[40] + bf0[39]; bf1[41] = -bf0[41] + bf0[38]; bf1[42] = -bf0[42] + bf0[37]; bf1[43] = -bf0[43] + bf0[36]; bf1[44] = -bf0[44] + bf0[35]; bf1[45] = -bf0[45] + bf0[34]; bf1[46] = -bf0[46] + bf0[33]; bf1[47] = -bf0[47] + bf0[32]; bf1[48] = -bf0[48] + bf0[63]; bf1[49] = -bf0[49] + bf0[62]; bf1[50] = -bf0[50] + bf0[61]; bf1[51] = -bf0[51] + bf0[60]; bf1[52] = -bf0[52] + bf0[59]; bf1[53] = -bf0[53] + bf0[58]; bf1[54] = -bf0[54] + bf0[57]; bf1[55] = -bf0[55] + bf0[56]; bf1[56] = bf0[56] + bf0[55]; bf1[57] = bf0[57] + bf0[54]; bf1[58] = bf0[58] + bf0[53]; bf1[59] = bf0[59] + bf0[52]; bf1[60] = bf0[60] + bf0[51]; bf1[61] = bf0[61] + bf0[50]; bf1[62] = bf0[62] + bf0[49]; bf1[63] = bf0[63] + bf0[48]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 4 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0] + bf0[7]; bf1[1] = bf0[1] + bf0[6]; bf1[2] = bf0[2] + bf0[5]; bf1[3] = bf0[3] + bf0[4]; bf1[4] = -bf0[4] + bf0[3]; bf1[5] = -bf0[5] + bf0[2]; bf1[6] = -bf0[6] + bf0[1]; bf1[7] = -bf0[7] + bf0[0]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = bf0[16] + bf0[23]; bf1[17] = bf0[17] + bf0[22]; bf1[18] = bf0[18] + bf0[21]; bf1[19] = bf0[19] + bf0[20]; bf1[20] = -bf0[20] + bf0[19]; bf1[21] = -bf0[21] + bf0[18]; bf1[22] = -bf0[22] + bf0[17]; bf1[23] = -bf0[23] + bf0[16]; bf1[24] = -bf0[24] + bf0[31]; bf1[25] = -bf0[25] + bf0[30]; bf1[26] = -bf0[26] + bf0[29]; bf1[27] = -bf0[27] + bf0[28]; bf1[28] = bf0[28] + bf0[27]; bf1[29] = bf0[29] + bf0[26]; bf1[30] = bf0[30] + bf0[25]; bf1[31] = bf0[31] + bf0[24]; bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = bf0[34]; bf1[35] = bf0[35]; bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); bf1[44] = bf0[44]; bf1[45] = bf0[45]; bf1[46] = bf0[46]; bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = bf0[49]; bf1[50] = bf0[50]; bf1[51] = bf0[51]; bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); bf1[60] = bf0[60]; bf1[61] = bf0[61]; bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, 
size, stage_range[stage]); // stage 5 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0] + bf0[3]; bf1[1] = bf0[1] + bf0[2]; bf1[2] = -bf0[2] + bf0[1]; bf1[3] = -bf0[3] + bf0[0]; bf1[4] = bf0[4]; bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); bf1[7] = bf0[7]; bf1[8] = bf0[8] + bf0[11]; bf1[9] = bf0[9] + bf0[10]; bf1[10] = -bf0[10] + bf0[9]; bf1[11] = -bf0[11] + bf0[8]; bf1[12] = -bf0[12] + bf0[15]; bf1[13] = -bf0[13] + bf0[14]; bf1[14] = bf0[14] + bf0[13]; bf1[15] = bf0[15] + bf0[12]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = bf0[32] + bf0[39]; bf1[33] = bf0[33] + bf0[38]; bf1[34] = bf0[34] + bf0[37]; bf1[35] = bf0[35] + bf0[36]; bf1[36] = -bf0[36] + bf0[35]; bf1[37] = -bf0[37] + bf0[34]; bf1[38] = -bf0[38] + bf0[33]; bf1[39] = -bf0[39] + bf0[32]; bf1[40] = -bf0[40] + bf0[47]; bf1[41] = -bf0[41] + bf0[46]; bf1[42] = -bf0[42] + bf0[45]; bf1[43] = -bf0[43] + bf0[44]; bf1[44] = bf0[44] + bf0[43]; bf1[45] = bf0[45] + bf0[42]; bf1[46] = bf0[46] + bf0[41]; bf1[47] = bf0[47] + bf0[40]; bf1[48] = bf0[48] + bf0[55]; bf1[49] = bf0[49] + bf0[54]; bf1[50] = bf0[50] + bf0[53]; bf1[51] = bf0[51] + bf0[52]; bf1[52] = -bf0[52] + bf0[51]; bf1[53] = -bf0[53] + bf0[50]; bf1[54] = -bf0[54] + bf0[49]; bf1[55] = -bf0[55] + bf0[48]; bf1[56] = -bf0[56] + bf0[63]; bf1[57] = -bf0[57] + bf0[62]; bf1[58] = -bf0[58] + bf0[61]; bf1[59] = -bf0[59] + bf0[60]; bf1[60] = bf0[60] + bf0[59]; bf1[61] = bf0[61] + bf0[58]; bf1[62] = bf0[62] + bf0[57]; bf1[63] = bf0[63] + bf0[56]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 6 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); bf1[4] = bf0[4] + bf0[5]; bf1[5] = -bf0[5] + bf0[4]; bf1[6] = -bf0[6] + bf0[7]; bf1[7] = bf0[7] + bf0[6]; bf1[8] = bf0[8]; bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); bf1[15] = bf0[15]; bf1[16] = bf0[16] + bf0[19]; bf1[17] = bf0[17] + bf0[18]; bf1[18] = -bf0[18] + bf0[17]; bf1[19] = -bf0[19] + bf0[16]; bf1[20] = -bf0[20] + bf0[23]; bf1[21] = -bf0[21] + bf0[22]; bf1[22] = bf0[22] + bf0[21]; bf1[23] = bf0[23] + bf0[20]; bf1[24] = bf0[24] + bf0[27]; bf1[25] = bf0[25] + bf0[26]; bf1[26] = -bf0[26] + bf0[25]; bf1[27] = -bf0[27] + bf0[24]; bf1[28] = -bf0[28] + bf0[31]; bf1[29] = -bf0[29] + bf0[30]; bf1[30] = bf0[30] 
+ bf0[29]; bf1[31] = bf0[31] + bf0[28]; bf1[32] = bf0[32]; bf1[33] = bf0[33]; bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); bf1[38] = bf0[38]; bf1[39] = bf0[39]; bf1[40] = bf0[40]; bf1[41] = bf0[41]; bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); bf1[46] = bf0[46]; bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = bf0[49]; bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); bf1[54] = bf0[54]; bf1[55] = bf0[55]; bf1[56] = bf0[56]; bf1[57] = bf0[57]; bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); bf1[62] = bf0[62]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 7 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); bf1[8] = bf0[8] + bf0[9]; bf1[9] = -bf0[9] + bf0[8]; bf1[10] = -bf0[10] + bf0[11]; bf1[11] = bf0[11] + bf0[10]; bf1[12] = bf0[12] + bf0[13]; bf1[13] = -bf0[13] + bf0[12]; bf1[14] = -bf0[14] + bf0[15]; bf1[15] = bf0[15] + bf0[14]; bf1[16] = bf0[16]; bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); bf1[31] = bf0[31]; bf1[32] = bf0[32] + bf0[35]; bf1[33] = bf0[33] + bf0[34]; bf1[34] = -bf0[34] + bf0[33]; bf1[35] = -bf0[35] + bf0[32]; bf1[36] = -bf0[36] + bf0[39]; bf1[37] = -bf0[37] + bf0[38]; bf1[38] = bf0[38] + bf0[37]; bf1[39] = bf0[39] + bf0[36]; bf1[40] = bf0[40] + bf0[43]; bf1[41] = bf0[41] + bf0[42]; bf1[42] = -bf0[42] + bf0[41]; bf1[43] = -bf0[43] + bf0[40]; bf1[44] = -bf0[44] + bf0[47]; bf1[45] = -bf0[45] + bf0[46]; bf1[46] = bf0[46] + bf0[45]; bf1[47] = bf0[47] + bf0[44]; bf1[48] = bf0[48] + bf0[51]; bf1[49] = bf0[49] + bf0[50]; bf1[50] = -bf0[50] + bf0[49]; bf1[51] = -bf0[51] + bf0[48]; bf1[52] = -bf0[52] + bf0[55]; bf1[53] = -bf0[53] + bf0[54]; bf1[54] = bf0[54] + bf0[53]; bf1[55] = 
bf0[55] + bf0[52]; bf1[56] = bf0[56] + bf0[59]; bf1[57] = bf0[57] + bf0[58]; bf1[58] = -bf0[58] + bf0[57]; bf1[59] = -bf0[59] + bf0[56]; bf1[60] = -bf0[60] + bf0[63]; bf1[61] = -bf0[61] + bf0[62]; bf1[62] = bf0[62] + bf0[61]; bf1[63] = bf0[63] + bf0[60]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 8 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); bf1[16] = bf0[16] + bf0[17]; bf1[17] = -bf0[17] + bf0[16]; bf1[18] = -bf0[18] + bf0[19]; bf1[19] = bf0[19] + bf0[18]; bf1[20] = bf0[20] + bf0[21]; bf1[21] = -bf0[21] + bf0[20]; bf1[22] = -bf0[22] + bf0[23]; bf1[23] = bf0[23] + bf0[22]; bf1[24] = bf0[24] + bf0[25]; bf1[25] = -bf0[25] + bf0[24]; bf1[26] = -bf0[26] + bf0[27]; bf1[27] = bf0[27] + bf0[26]; bf1[28] = bf0[28] + bf0[29]; bf1[29] = -bf0[29] + bf0[28]; bf1[30] = -bf0[30] + bf0[31]; bf1[31] = bf0[31] + bf0[30]; bf1[32] = bf0[32]; bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); bf1[35] = bf0[35]; bf1[36] = bf0[36]; bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); bf1[39] = bf0[39]; bf1[40] = bf0[40]; bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); bf1[43] = bf0[43]; bf1[44] = bf0[44]; bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); bf1[47] = bf0[47]; bf1[48] = bf0[48]; bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit); bf1[51] = bf0[51]; bf1[52] = bf0[52]; bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); bf1[55] = bf0[55]; bf1[56] = bf0[56]; bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); bf1[59] = bf0[59]; bf1[60] = bf0[60]; bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 9 stage++; cospi = cospi_arr(cos_bit); bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); bf1[18] = 
half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); bf1[32] = bf0[32] + bf0[33]; bf1[33] = -bf0[33] + bf0[32]; bf1[34] = -bf0[34] + bf0[35]; bf1[35] = bf0[35] + bf0[34]; bf1[36] = bf0[36] + bf0[37]; bf1[37] = -bf0[37] + bf0[36]; bf1[38] = -bf0[38] + bf0[39]; bf1[39] = bf0[39] + bf0[38]; bf1[40] = bf0[40] + bf0[41]; bf1[41] = -bf0[41] + bf0[40]; bf1[42] = -bf0[42] + bf0[43]; bf1[43] = bf0[43] + bf0[42]; bf1[44] = bf0[44] + bf0[45]; bf1[45] = -bf0[45] + bf0[44]; bf1[46] = -bf0[46] + bf0[47]; bf1[47] = bf0[47] + bf0[46]; bf1[48] = bf0[48] + bf0[49]; bf1[49] = -bf0[49] + bf0[48]; bf1[50] = -bf0[50] + bf0[51]; bf1[51] = bf0[51] + bf0[50]; bf1[52] = bf0[52] + bf0[53]; bf1[53] = -bf0[53] + bf0[52]; bf1[54] = -bf0[54] + bf0[55]; bf1[55] = bf0[55] + bf0[54]; bf1[56] = bf0[56] + bf0[57]; bf1[57] = -bf0[57] + bf0[56]; bf1[58] = -bf0[58] + bf0[59]; bf1[59] = bf0[59] + bf0[58]; bf1[60] = bf0[60] + bf0[61]; bf1[61] = -bf0[61] + bf0[60]; bf1[62] = -bf0[62] + bf0[63]; bf1[63] = bf0[63] + bf0[62]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 10 stage++; cospi = cospi_arr(cos_bit); bf0 = output; bf1 = step; bf1[0] = bf0[0]; bf1[1] = bf0[1]; bf1[2] = bf0[2]; bf1[3] = bf0[3]; bf1[4] = bf0[4]; bf1[5] = bf0[5]; bf1[6] = bf0[6]; bf1[7] = bf0[7]; bf1[8] = bf0[8]; bf1[9] = bf0[9]; bf1[10] = bf0[10]; bf1[11] = bf0[11]; bf1[12] = bf0[12]; bf1[13] = bf0[13]; bf1[14] = bf0[14]; bf1[15] = bf0[15]; bf1[16] = bf0[16]; bf1[17] = bf0[17]; bf1[18] = bf0[18]; bf1[19] = bf0[19]; bf1[20] = bf0[20]; bf1[21] = bf0[21]; bf1[22] = bf0[22]; bf1[23] = bf0[23]; bf1[24] = bf0[24]; bf1[25] = bf0[25]; bf1[26] = bf0[26]; bf1[27] = bf0[27]; bf1[28] = bf0[28]; bf1[29] = bf0[29]; bf1[30] = bf0[30]; bf1[31] = bf0[31]; bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); bf1[45] = 
half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); // stage 11 stage++; bf0 = step; bf1 = output; bf1[0] = bf0[0]; bf1[1] = bf0[32]; bf1[2] = bf0[16]; bf1[3] = bf0[48]; bf1[4] = bf0[8]; bf1[5] = bf0[40]; bf1[6] = bf0[24]; bf1[7] = bf0[56]; bf1[8] = bf0[4]; bf1[9] = bf0[36]; bf1[10] = bf0[20]; bf1[11] = bf0[52]; bf1[12] = bf0[12]; bf1[13] = bf0[44]; bf1[14] = bf0[28]; bf1[15] = bf0[60]; bf1[16] = bf0[2]; bf1[17] = bf0[34]; bf1[18] = bf0[18]; bf1[19] = bf0[50]; bf1[20] = bf0[10]; bf1[21] = bf0[42]; bf1[22] = bf0[26]; bf1[23] = bf0[58]; bf1[24] = bf0[6]; bf1[25] = bf0[38]; bf1[26] = bf0[22]; bf1[27] = bf0[54]; bf1[28] = bf0[14]; bf1[29] = bf0[46]; bf1[30] = bf0[30]; bf1[31] = bf0[62]; bf1[32] = bf0[1]; bf1[33] = bf0[33]; bf1[34] = bf0[17]; bf1[35] = bf0[49]; bf1[36] = bf0[9]; bf1[37] = bf0[41]; bf1[38] = bf0[25]; bf1[39] = bf0[57]; bf1[40] = bf0[5]; bf1[41] = bf0[37]; bf1[42] = bf0[21]; bf1[43] = bf0[53]; bf1[44] = bf0[13]; bf1[45] = bf0[45]; bf1[46] = bf0[29]; bf1[47] = bf0[61]; bf1[48] = bf0[3]; bf1[49] = bf0[35]; bf1[50] = bf0[19]; bf1[51] = bf0[51]; bf1[52] = bf0[11]; bf1[53] = bf0[43]; bf1[54] = bf0[27]; bf1[55] = bf0[59]; bf1[56] = bf0[7]; bf1[57] = bf0[39]; bf1[58] = bf0[23]; bf1[59] = bf0[55]; bf1[60] = bf0[15]; bf1[61] = bf0[47]; bf1[62] = bf0[31]; bf1[63] = bf0[63]; av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); } aom-3.12.1/av1/encoder/av1_fwd_txfm1d.h000066400000000000000000000042221477627663500174720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ #define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ #include "av1/common/av1_txfm.h" #ifdef __cplusplus extern "C" { #endif void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, const int8_t *stage_range); #ifdef __cplusplus } #endif #endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ aom-3.12.1/av1/encoder/av1_fwd_txfm1d_cfg.h000066400000000000000000000015761477627663500203220ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ #define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ #include "av1/common/enums.h" #include "av1/encoder/av1_fwd_txfm1d.h" extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL]; extern const int8_t av1_fwd_cos_bit_col[5][5]; extern const int8_t av1_fwd_cos_bit_row[5][5]; #endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ aom-3.12.1/av1/encoder/av1_fwd_txfm2d.c000066400000000000000000000425111477627663500174710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <assert.h> #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/txfm_common.h" #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/encoder/av1_fwd_txfm1d.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" static inline TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: return av1_fdct4; case TXFM_TYPE_DCT8: return av1_fdct8; case TXFM_TYPE_DCT16: return av1_fdct16; case TXFM_TYPE_DCT32: return av1_fdct32; case TXFM_TYPE_DCT64: return av1_fdct64; case TXFM_TYPE_ADST4: return av1_fadst4; case TXFM_TYPE_ADST8: return av1_fadst8; case TXFM_TYPE_ADST16: return av1_fadst16; case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; default: assert(0); return NULL; } } void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, const TXFM_2D_FLIP_CFG *cfg, int bd) { // Take the shift from the larger dimension in the rectangular case. const int8_t *shift = cfg->shift; // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; } // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; } } static inline void fwd_txfm2d_c(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *buf, int bd) { int c, r; // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to // accurately perform rectangular transforms. When the transform is // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. const int txfm_size_col = tx_size_wide[cfg->tx_size]; const int txfm_size_row = tx_size_high[cfg->tx_size]; // Take the shift from the larger dimension in the rectangular case.
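// The shift[] table holds three entries that manage intermediate precision
// in this separable pass: shift[0] is applied to the residual before the
// column transform, shift[1] between the column and row transforms, and
// shift[2] to the final row output. The calls negate the table entry, so
// through av1_round_shift_array() a positive entry acts as a left shift
// (gain) and a negative entry as a rounded right shift, where
// round_shift(x, n) == (x + (1 << (n - 1))) >> n. A small sketch using the
// fwd_shift_8x8 table { 2, -1, 0 } defined later in this file, with an
// illustrative intermediate value:
//
//   int32_t v = 37;
//   v <<= 2;            // -shift[0] == -2: scale up by 4 -> 148
//   v = (v + 1) >> 1;   // -shift[1] ==  1: rounded halving -> 74
//                       // -shift[2] ==  0: row output left unchanged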
const int8_t *shift = cfg->shift; const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); const int8_t cos_bit_col = cfg->cos_bit_col; const int8_t cos_bit_row = cfg->cos_bit_row; const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); // use output buffer as temp buffer int32_t *temp_in = output; int32_t *temp_out = output + txfm_size_row; // Columns for (c = 0; c < txfm_size_col; ++c) { if (cfg->ud_flip == 0) { for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; } else { for (r = 0; r < txfm_size_row; ++r) // flip upside down temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; } av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); if (cfg->lr_flip == 0) { for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r]; } else { for (r = 0; r < txfm_size_row; ++r) // flip from left to right buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; } } DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]); // Rows for (r = 0; r < txfm_size_row; ++r) { txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row, stage_range_row); av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]); if (abs(rect_type) == 1) { // Multiply everything by Sqrt2 if the transform is rectangular and the // size difference is a factor of 2. 
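// A 2:1 rectangular transform needs an extra gain of sqrt(2) so that its
// overall scaling stays consistent with the square sizes. NewSqrt2 is the
// fixed-point constant 5793 ~= sqrt(2) * 2^12 and NewSqrt2Bits is 12, so the
// multiply-and-round in the loop below computes roughly coeff * 1.4142.
// Sketch with an illustrative coefficient value:
//
//   int32_t c0 = 1000;
//   int64_t t  = (int64_t)c0 * 5793;           // 5793000
//   int32_t s  = (int32_t)((t + 2048) >> 12);  // 1414 ~= 1000 * sqrt(2)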
for (c = 0; c < txfm_size_col; ++c) { row_buffer[c] = round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits); } } for (c = 0; c < txfm_size_col; ++c) { output[c * txfm_size_row + r] = row_buffer[c]; } } } void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[8 * 4]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[16 * 8]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[32 * 16]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } #endif // !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[16 * 4]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[32 * 8]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } #endif // !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[4 * 4]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[8 * 8]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void 
av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[16 * 16]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[32 * 32]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); } void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[64 * 64]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); // Zero out top-right 32x32 area. for (int col = 0; col < 32; ++col) { memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); } // Zero out the bottom 64x32 area. memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); // Re-pack non-zero coeffs in the first 32x32 indices. for (int col = 1; col < 32; ++col) { memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); } } void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); // Zero out right 32x32 area. for (int col = 0; col < 32; ++col) { memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); } // Re-pack non-zero coeffs in the first 32x32 indices. for (int col = 1; col < 32; ++col) { memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); } } void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[64 * 32]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); // Zero out the bottom 32x32 area. memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); // Note: no repacking needed here. } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); // Zero out right 32x16 area. for (int row = 0; row < 16; ++row) { memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); } // Re-pack non-zero coeffs in the first 32x16 indices. for (int row = 1; row < 16; ++row) { memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); } } void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { int32_t txfm_buf[64 * 16]; TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); // Zero out the bottom 16x32 area. memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); // Note: no repacking needed here. 
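// For every transform with a 64-sample dimension, only the 32 lowest
// frequencies of that dimension are coded by AV1, so the upper half of the
// coefficients is forced to zero here. In the 64x64, 32x64 and 16x64 cases
// above, the surviving 32-coefficient runs are also packed back-to-back with
// memcpy so they sit contiguously at the start of the output buffer; a run
// that started at offset k * 64 in the padded layout ends up at k * 32. For
// this 64x16 case the retained coefficients already occupy the first
// 32 * 16 entries of the buffer, so zeroing the tail is sufficient and no
// repacking pass is needed.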
} #endif // !CONFIG_REALTIME_ONLY static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = { fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, }; const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] [MAX_TXWH_IDX /*txh_idx*/] = { { 13, 13, 13, 0, 0 }, { 13, 13, 13, 12, 0 }, { 13, 13, 13, 12, 13 }, { 0, 13, 13, 12, 13 }, { 0, 0, 13, 12, 13 } }; const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] [MAX_TXWH_IDX /*txh_idx*/] = { { 13, 13, 12, 0, 0 }, { 13, 13, 13, 12, 0 }, { 13, 13, 12, 13, 12 }, { 0, 12, 13, 12, 11 }, { 0, 0, 12, 11, 10 } }; static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, 11, 11, 11, 11, 11, 11 }; static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; static const int8_t fidtx4_range_mult2[1] = { 1 }; static const int8_t fidtx8_range_mult2[1] = { 2 }; static const int8_t fidtx16_range_mult2[1] = { 3 }; static const int8_t fidtx32_range_mult2[1] = { 4 }; static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 }; static inline void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { av1_zero(cfg->stage_range_col); av1_zero(cfg->stage_range_row); const int8_t *const range_mult2_col = fwd_txfm_range_mult2_list[cfg->txfm_type_col]; const int stage_num_col = cfg->stage_num_col; // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. 
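// The *_range_mult2 tables above store twice the worst-case log2 growth of
// each 1-D transform stage, so (range_mult2[i] + 1) >> 1 rounds that up to a
// whole number of extra bits. As a sketch, fdct8_range_mult2
// { 0, 2, 4, 5, 5, 5 } maps to per-stage column ranges of
// { 0, 1, 2, 3, 3, 3 } extra bits; the row loop further down additionally
// folds the growth left over from the final column stage into every row
// stage.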
for (int i = 0; i < stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; const int8_t *const range_mult2_row = fwd_txfm_range_mult2_list[cfg->txfm_type_row]; const int stage_num_row = cfg->stage_num_row; // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. for (int i = 0; i < stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { cfg->stage_range_row[i] = (range_mult2_col[stage_num_col - 1] + range_mult2_row[i] + 1) >> 1; } } void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, TXFM_2D_FLIP_CFG *cfg) { assert(cfg != NULL); cfg->tx_size = tx_size; set_flip_cfg(tx_type, cfg); const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); cfg->shift = av1_fwd_txfm_shift_ls[tx_size]; cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; assert(cfg->txfm_type_col != TXFM_TYPE_INVALID); cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; assert(cfg->txfm_type_row != TXFM_TYPE_INVALID); cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; set_fwd_txfm_non_scale_range(cfg); } aom-3.12.1/av1/encoder/av1_ml_partition_models.h000066400000000000000000000144661477627663500215060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ #define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/ml.h" // TODO(kyslov): Replace with proper weights after training AV1 models #define FEATURES 6 static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = { 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f, 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f, -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f, 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f, -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f, 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f, -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f, 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f, 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f, -0.33805936f, -0.02449707f, 0.67203692f }; static const float av1_var_part_nn_bias_64_layer0[8] = { 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f, 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f }; static const float av1_var_part_nn_weights_64_layer1[8] = { -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f, 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f }; static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f }; static const float av1_var_part_means_64[FEATURES] = { 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f }; static const float av1_var_part_vars_64[FEATURES] = { 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f }; static const NN_CONFIG av1_var_part_nnconfig_64 = { FEATURES, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_var_part_nn_weights_64_layer0, av1_var_part_nn_weights_64_layer1, }, { av1_var_part_nn_bias_64_layer0, av1_var_part_nn_bias_64_layer1, }, }; static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = { 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f, 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f, -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f, 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f, -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f, -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f, 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f, -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f, -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f, -1.36576732f, -1.30257508f, -1.30575106f }; static const float av1_var_part_nn_bias_32_layer0[8] = { -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f, -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f }; static const float av1_var_part_nn_weights_32_layer1[8] = { 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f, -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f }; static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f }; static const float av1_var_part_means_32[FEATURES] = { 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f }; static const float av1_var_part_vars_32[FEATURES] = { 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f }; static const NN_CONFIG av1_var_part_nnconfig_32 = { FEATURES, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_var_part_nn_weights_32_layer0, av1_var_part_nn_weights_32_layer1, }, { 
av1_var_part_nn_bias_32_layer0, av1_var_part_nn_bias_32_layer1, }, }; static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = { 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f, 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f, -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f, 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f, 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f, -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f, 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f, 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f, 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f, 0.15059544f, 0.09596755f, 0.26247133f }; static const float av1_var_part_nn_bias_16_layer0[8] = { 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f, 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f }; static const float av1_var_part_nn_weights_16_layer1[8] = { -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f, -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f }; static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f }; static const float av1_var_part_means_16[FEATURES] = { 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f }; static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f, 0.01958579f, 0.02437927f, 0.02420755f, 0.0192003f }; static const NN_CONFIG av1_var_part_nnconfig_16 = { FEATURES, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_var_part_nn_weights_16_layer0, av1_var_part_nn_weights_16_layer1, }, { av1_var_part_nn_bias_16_layer0, av1_var_part_nn_bias_16_layer1, }, }; #undef FEATURES #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ aom-3.12.1/av1/encoder/av1_noise_estimate.c000066400000000000000000000261421477627663500204370ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/av1_noise_estimate.h" #include "av1/encoder/encoder.h" #if CONFIG_AV1_TEMPORAL_DENOISING #include "av1/encoder/av1_temporal_denoiser.h" #endif #if CONFIG_AV1_TEMPORAL_DENOISING // For SVC: only do noise estimation on top spatial layer. static inline int noise_est_svc(const struct AV1_COMP *const cpi) { return (!cpi->ppi->use_svc || (cpi->ppi->use_svc && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); } #endif void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { const int64_t area = (int64_t)width * height; ne->enabled = 0; ne->level = (area < 1280 * 720) ? 
kLowLow : kLow; ne->value = 0; ne->count = 0; ne->thresh = 90; ne->last_w = 0; ne->last_h = 0; if (area >= 1920 * 1080) { ne->thresh = 200; } else if (area >= 1280 * 720) { ne->thresh = 140; } else if (area >= 640 * 360) { ne->thresh = 115; } ne->num_frames_estimate = 15; ne->adapt_thresh = (3 * ne->thresh) >> 1; } static int enable_noise_estimation(AV1_COMP *const cpi) { const int resize_pending = is_frame_resize_pending(cpi); #if CONFIG_AV1_HIGHBITDEPTH if (cpi->common.seq_params->use_highbitdepth) return 0; #endif // Enable noise estimation if denoising is on. #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif // Only allow noise estimate under certain encoding mode. // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. // Not enabled for SVC mode and screen_content_mode. // Not enabled for low resolutions. if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && resize_pending == 0 && !cpi->ppi->use_svc && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; } #if CONFIG_AV1_TEMPORAL_DENOISING static void copy_frame(YV12_BUFFER_CONFIG *const dest, const YV12_BUFFER_CONFIG *const src) { const uint8_t *srcbuf = src->y_buffer; uint8_t *destbuf = dest->y_buffer; assert(dest->y_width == src->y_width); assert(dest->y_height == src->y_height); for (int r = 0; r < dest->y_height; ++r) { memcpy(destbuf, srcbuf, dest->y_width); destbuf += dest->y_stride; srcbuf += src->y_stride; } } #endif // CONFIG_AV1_TEMPORAL_DENOISING NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { int noise_level = kLowLow; if (ne->value > (ne->thresh << 1)) { noise_level = kHigh; } else { if (ne->value > ne->thresh) noise_level = kMedium; else if (ne->value > (ne->thresh >> 1)) noise_level = kLow; else noise_level = kLowLow; } return noise_level; } void av1_update_noise_estimate(AV1_COMP *const cpi) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. int frame_period = 8; int thresh_consec_zeromv = 2; int frame_counter = cm->current_frame.frame_number; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->last_source; #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { last_source = &cpi->denoiser.last_source; // Tune these thresholds for different resolutions when denoising is // enabled. 
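// thresh_consec_zeromv is the number of consecutive frames a 16x16 block
// must have been coded with (near) zero motion before it is treated as
// steady background further down; only such blocks are allowed to feed the
// variance histogram, which keeps real motion from being mistaken for sensor
// noise. With the default of 2, all four 8x8 entries of consec_zero_mv[]
// covering a block have to exceed the threshold before its variance counts.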
if (cm->width > 640 && cm->width <= 1920) { thresh_consec_zeromv = 2; } } #endif ne->enabled = enable_noise_estimation(cpi); if (cpi->svc.number_spatial_layers > 1) frame_counter = cpi->svc.current_superframe; if (!ne->enabled || frame_counter % frame_period != 0 || last_source == NULL || (cpi->svc.number_spatial_layers == 1 && (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->source); #endif if (last_source != NULL) { ne->last_w = cm->width; ne->last_h = cm->height; } return; } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; ne->count = 0; ne->num_frames_estimate = 10; #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { av1_denoiser_set_noise_level(cpi, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->source); } #endif return; } else { unsigned int bin_size = 100; unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; unsigned int hist_avg[MAX_VAR_HIST_BINS]; unsigned int max_bin = 0; unsigned int max_bin_count = 0; unsigned int bin_cnt; BLOCK_SIZE bsize = BLOCK_16X16; // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have // been encoded as zero/small mv at least x consecutive frames, compute // the variance to update estimate of noise in the source. const uint8_t *src_y = cpi->source->y_buffer; const int src_ystride = cpi->source->y_stride; const uint8_t *last_src_y = last_source->y_buffer; const int last_src_ystride = last_source->y_stride; int mi_row, mi_col; int num_low_motion = 0; int frame_low_motion = 1; for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) { for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) { int bl_index = (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv) num_low_motion++; } } if (num_low_motion < (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3)) frame_low_motion = 0; for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) { for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) { // 16x16 blocks, 1/4 sample of frame. if (mi_row % 8 == 0 && mi_col % 8 == 0 && mi_row < mi_params->mi_rows - 3 && mi_col < mi_params->mi_cols - 3) { int bl_index = (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); int bl_index1 = bl_index + 1; int bl_index2 = bl_index + (mi_params->mi_cols >> 1); int bl_index3 = bl_index2 + 1; int consec_zeromv = AOMMIN(cpi->consec_zero_mv[bl_index], AOMMIN(cpi->consec_zero_mv[bl_index1], AOMMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); // Only consider blocks that are likely steady background. i.e, have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. And exclude this frame if // high_source_sad is true (i.e., scene/content change). if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && !cpi->rc.high_source_sad) { unsigned int sse; // Compute variance between co-located blocks from current and // last input frames. 
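// On blocks that have been static for several frames this variance is
// dominated by temporal sensor noise, so it is accumulated into hist[] in
// bins of bin_size (100) and, after the smoothing pass further down, the
// most populated bin becomes the per-frame noise measurement. Sketch of the
// running update with an illustrative winning bin:
//
//   unsigned int max_bin = 3;  // variances mostly in the 300..399 range
//   ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
//   // i.e. a moving average, with the bin index scaled by 40 to match the
//   // thresholds set up in av1_noise_estimate_init().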
unsigned int variance = cpi->ppi->fn_ptr[bsize].vf( src_y, src_ystride, last_src_y, last_src_ystride, &sse); unsigned int hist_index = variance / bin_size; if (hist_index < MAX_VAR_HIST_BINS) hist[hist_index]++; else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail } } src_y += 4; last_src_y += 4; } src_y += (src_ystride << 2) - (mi_params->mi_cols << 2); last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2); } ne->last_w = cm->width; ne->last_h = cm->height; // Adjust histogram to account for effect that histogram flattens // and shifts to zero as scene darkens. if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { hist[0] = 0; hist[1] >>= 2; hist[2] >>= 2; hist[3] >>= 2; hist[4] >>= 1; hist[5] >>= 1; hist[6] = 3 * hist[6] >> 1; hist[MAX_VAR_HIST_BINS - 1] >>= 1; } // Average hist[] and find largest bin for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { if (bin_cnt == 0) hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; else if (bin_cnt == MAX_VAR_HIST_BINS - 1) hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; else if (bin_cnt == MAX_VAR_HIST_BINS - 2) hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + (hist[bin_cnt + 1] >> 1) + 2) >> 2; else hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> 2; if (hist_avg[bin_cnt] > max_bin_count) { max_bin_count = hist_avg[bin_cnt]; max_bin = bin_cnt; } } // Scale by 40 to work with existing thresholds ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); // Quickly increase VNR strength when the noise level increases suddenly. if (ne->level < kMedium && ne->value > ne->adapt_thresh) { ne->count = ne->num_frames_estimate; } else { ne->count++; } if (ne->count == ne->num_frames_estimate) { // Reset counter and check noise level condition. ne->num_frames_estimate = 30; ne->count = 0; ne->level = av1_noise_estimate_extract_level(ne); #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) av1_denoiser_set_noise_level(cpi, ne->level); #endif } } #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->source); #endif } aom-3.12.1/av1/encoder/av1_noise_estimate.h000066400000000000000000000025201477627663500204360ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ #define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ #include "av1/encoder/block.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus extern "C" { #endif #define MAX_VAR_HIST_BINS 20 typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; typedef struct noise_estimate { int enabled; NOISE_LEVEL level; int value; int thresh; int adapt_thresh; int count; int last_w; int last_h; int num_frames_estimate; } NOISE_ESTIMATE; struct AV1_COMP; void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height); NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne); void av1_update_noise_estimate(struct AV1_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ aom-3.12.1/av1/encoder/av1_quantize.c000066400000000000000000001315111477627663500172640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" #include "aom_ports/mem.h" #include "av1/common/idct.h" #include "av1/common/quant_common.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); *eob_ptr = 0; } int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], const int16_t dequant_ptr[2], const int16_t round_ptr[2], int log_scale, const int16_t *scan, int coeff_count, const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr)); const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; int eob = 0; for (int i = 0; i < coeff_count; i++) { const int rc = scan[i]; const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32 = 0; if ((abs_coeff << (1 + log_scale)) >= thresh) { abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); if (tmp32) { qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale; dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; } } if (tmp32) eob = i + 1; } return eob; } static void quantize_fp_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const 
int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, int log_scale) { int i, eob = -1; const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (qm_ptr == NULL && iqm_ptr == NULL) { *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, log_scale, scan, (int)n_coeffs, coeff_ptr, qcoeff_ptr, dqcoeff_ptr); } else { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. for (i = 0; i < n_coeffs; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS); const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const int coeff_sign = AOMSIGN(coeff); int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32 = 0; if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { abs_coeff += rounding[rc != 0]; abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; } if (tmp32) eob = i; } *eob_ptr = eob + 1; } } #if CONFIG_AV1_HIGHBITDEPTH static void highbd_quantize_fp_helper_c( const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, int log_scale) { int i; int eob = -1; const int shift = 16 - log_scale; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)iscan; if (qm_ptr || iqm_ptr) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. for (i = 0; i < count; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const qm_val_t iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const int coeff_sign = AOMSIGN(coeff); const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int abs_qcoeff = 0; if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); abs_qcoeff = (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = i; } else { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } } } else { const int log_scaled_round_arr[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), ROUND_POWER_OF_TWO(round_ptr[1], log_scale), }; for (i = 0; i < count; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int rc01 = (rc != 0); const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int log_scaled_round = log_scaled_round_arr[rc01]; if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { const int quant = quant_ptr[rc01]; const int dequant = dequant_ptr[rc01]; const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; const int abs_qcoeff = (int)((tmp * quant) >> shift); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; if (abs_qcoeff) eob = i; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); } else { qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } } } *eob_ptr = eob + 1; } #endif // CONFIG_AV1_HIGHBITDEPTH void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)iscan; int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
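// Each coefficient is quantized as ((|coeff| + round) * quant) >> 16 with
// the sign restored afterwards, and reconstructed as qcoeff * dequant; eob
// ends up one past the last nonzero coefficient in scan order. Sketch with
// illustrative table values (dequant = 32, hence quant = (1 << 16) / 32 =
// 2048 and round = (64 * 32) >> 7 = 16, following how av1_build_quantizer()
// later in this file derives the fast-path tables):
//
//   int coeff = 100;
//   int tmp   = ((100 + 16) * 2048) >> 16;  // == 3
//   int dq    = tmp * 32;                   // == 96, the reconstructed value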
for (int i = 0; i < n_coeffs; i++) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant_ptr[rc != 0]) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; if (tmp) eob = i; } *eob_ptr = eob + 1; } void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 1: av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 2: av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } } } void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; #if !CONFIG_REALTIME_ONLY if (qparam->use_quant_b_adapt) { // TODO(sarahparker) These quantize_b optimizations need SIMD // implementations if (qm_ptr != NULL && iqm_ptr != NULL) { aom_quantize_b_adaptive_helper_c( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, 
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 1: aom_quantize_b_32x32_adaptive( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 2: aom_quantize_b_64x64_adaptive( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } } return; } #endif // !CONFIG_REALTIME_ONLY if (qm_ptr != NULL && iqm_ptr != NULL) { aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 1: aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 2: aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } } } static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int64_t tmp; int eob = -1; int32_t tmp32; int dequant; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), INT16_MIN, INT16_MAX); tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) eob = 0; } *eob_ptr = eob + 1; } void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; (void)sc; assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); } #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; if (qm_ptr != NULL && iqm_ptr != NULL) { highbd_quantize_fp_helper_c( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qparam->log_scale); } } void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; #if !CONFIG_REALTIME_ONLY if (qparam->use_quant_b_adapt) { if (qm_ptr != NULL && iqm_ptr != NULL) { aom_highbd_quantize_b_adaptive_helper_c( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); } else { switch (qparam->log_scale) { case 0: aom_highbd_quantize_b_adaptive( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 1: aom_highbd_quantize_b_32x32_adaptive( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 2: aom_highbd_quantize_b_64x64_adaptive( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } } return; } #endif // !CONFIG_REALTIME_ONLY if (qm_ptr != NULL && iqm_ptr != NULL) { aom_highbd_quantize_b_helper_c( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, qm_ptr, iqm_ptr, 
qparam->log_scale); } else { switch (qparam->log_scale) { case 0: aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 1: aom_highbd_quantize_b_32x32( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; case 2: aom_highbd_quantize_b_64x64( coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); break; default: assert(0); } } } static inline void highbd_quantize_dc( const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS); const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[0]; const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale); const int64_t tmpw = tmp * wt; const int abs_qcoeff = (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); const int dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { // obsolete skip_block const int skip_block = 0; const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; (void)sc; highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); } void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, log_scale); } #endif // CONFIG_AV1_HIGHBITDEPTH static void invert_quant(int16_t *quant, int16_t *shift, int d) { uint32_t t; int l, m; t = d; l = get_msb(t); m = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(m - (1 << 16)); *shift = 1 << (16 - l); } static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { const int quant = av1_dc_quant_QTX(q, 0, bit_depth); switch (bit_depth) { case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); case AOM_BITS_12: return q == 0 ? 
64 : (quant < 2368 ? 84 : 80); default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, int v_ac_delta_q, QUANTS *const quants, Dequants *const deq) { int i, q, quant_QTX; for (q = 0; q < QINDEX_RANGE; q++) { const int qzbin_factor = get_qzbin_factor(q, bit_depth); const int qrounding_factor = q == 0 ? 64 : 48; for (i = 0; i < 2; ++i) { const int qrounding_factor_fp = 64; // y quantizer with TX scale quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) : av1_ac_quant_QTX(q, 0, bit_depth); invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant_QTX); quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; deq->y_dequant_QTX[q][i] = quant_QTX; // u quantizer with TX scale quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], quant_QTX); quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; deq->u_dequant_QTX[q][i] = quant_QTX; // v quantizer with TX scale quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth) : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth); invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i], quant_QTX); quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX; quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7; deq->v_dequant_QTX[q][i] = quant_QTX; } for (i = 2; i < 8; i++) { // 8: SIMD width quants->y_quant[q][i] = quants->y_quant[q][1]; quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1]; quants->u_quant[q][i] = quants->u_quant[q][1]; quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1]; quants->u_round_fp[q][i] = quants->u_round_fp[q][1]; quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1]; quants->u_zbin[q][i] = quants->u_zbin[q][1]; quants->u_round[q][i] = quants->u_round[q][1]; deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1]; quants->v_quant[q][i] = quants->v_quant[q][1]; quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1]; quants->v_round_fp[q][i] = quants->v_round_fp[q][1]; quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1]; quants->v_zbin[q][i] = quants->v_zbin[q][1]; quants->v_round[q][i] = quants->v_round[q][1]; deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1]; } } } static inline bool deltaq_params_have_changed( const DeltaQuantParams *prev_deltaq_params, const CommonQuantParams *quant_params) { return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q || prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q || prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q || prev_deltaq_params->u_ac_delta_q 
!= quant_params->u_ac_delta_q || prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q); } void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, const CommonQuantParams *quant_params, aom_bit_depth_t bit_depth) { DeltaQuantParams *const prev_deltaq_params = &enc_quant_dequant_params->prev_deltaq_params; // Re-initialize the quantizer only if any of the dc/ac deltaq parameters // change. if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return; QUANTS *const quants = &enc_quant_dequant_params->quants; Dequants *const dequants = &enc_quant_dequant_params->dequants; av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q, quant_params->u_dc_delta_q, quant_params->u_ac_delta_q, quant_params->v_dc_delta_q, quant_params->v_ac_delta_q, quants, dequants); // Record the state of deltaq parameters. prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q; prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q; prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q; prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q; prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q; } /*!\brief Update quantize parameters in MACROBLOCK * * \param[in] enc_quant_dequant_params This parameter cached the quantize and * dequantize parameters for all q * indices. * \param[in] qindex Quantize index used for the current * superblock. * \param[out] x A superblock data structure for * encoder. */ static void set_q_index(const EncQuantDequantParams *enc_quant_dequant_params, int qindex, MACROBLOCK *x) { const QUANTS *const quants = &enc_quant_dequant_params->quants; const Dequants *const dequants = &enc_quant_dequant_params->dequants; x->qindex = qindex; x->seg_skip_block = 0; // TODO(angiebird): Find a proper place to init this variable. // Y x->plane[0].quant_QTX = quants->y_quant[qindex]; x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; x->plane[0].zbin_QTX = quants->y_zbin[qindex]; x->plane[0].round_QTX = quants->y_round[qindex]; x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex]; // U x->plane[1].quant_QTX = quants->u_quant[qindex]; x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; x->plane[1].zbin_QTX = quants->u_zbin[qindex]; x->plane[1].round_QTX = quants->u_round[qindex]; x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex]; // V x->plane[2].quant_QTX = quants->v_quant[qindex]; x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; x->plane[2].zbin_QTX = quants->v_zbin[qindex]; x->plane[2].round_QTX = quants->v_round[qindex]; x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex]; } /*!\brief Update quantize matrix in MACROBLOCKD based on segment id * * \param[in] quant_params Quantize parameters used by encoder and decoder * \param[in] segment_id Segment id. * \param[out] xd A superblock data structure used by encoder and * decoder. */ static void set_qmatrix(const CommonQuantParams *quant_params, int segment_id, MACROBLOCKD *xd) { const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id); const int qmlevel_y = use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; const int qmlevel_u = use_qmatrix ? 
quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; const int qmlevel_v = use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v }; for (int i = 0; i < MAX_MB_PLANE; ++i) { const int qmlevel = qmlevel_ls[i]; memcpy(&xd->plane[i].seg_qmatrix[segment_id], quant_params->gqmatrix[qmlevel][i], sizeof(quant_params->gqmatrix[qmlevel][i])); memcpy(&xd->plane[i].seg_iqmatrix[segment_id], quant_params->giqmatrix[qmlevel][i], sizeof(quant_params->giqmatrix[qmlevel][i])); } } void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, int segment_id, const int do_update) { const AV1_COMMON *const cm = &cpi->common; const CommonQuantParams *const quant_params = &cm->quant_params; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; int qindex_rd; const int current_qindex = AOMMAX( 0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag ? quant_params->base_qindex + x->delta_qindex : quant_params->base_qindex)); const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex); if (cpi->oxcf.sb_qp_sweep) { const int current_rd_qindex = AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag ? quant_params->base_qindex + x->rdmult_delta_qindex : quant_params->base_qindex)); qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex); } else { qindex_rd = qindex; } const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q; const int rdmult = av1_compute_rd_mult( qindex_rdmult, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); const int qindex_change = x->qindex != qindex; if (qindex_change || do_update) { set_q_index(&cpi->enc_quant_dequant_params, qindex, x); } MACROBLOCKD *const xd = &x->e_mbd; if ((segment_id != x->prev_segment_id) || av1_use_qmatrix(quant_params, xd, segment_id)) { set_qmatrix(quant_params, segment_id, xd); } x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); av1_set_error_per_bit(&x->errorperbit, rdmult); av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd); x->prev_segment_id = segment_id; } void av1_frame_init_quantizer(AV1_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; x->prev_segment_id = -1; av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1); } static int adjust_hdr_cb_deltaq(int base_qindex) { double baseQp = base_qindex / QP_SCALE_FACTOR; const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR; int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5)); dqpCb = AOMMIN(0, dqpCb); dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); return dqpCb; } static int adjust_hdr_cr_deltaq(int base_qindex) { double baseQp = base_qindex / QP_SCALE_FACTOR; const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR; int dqpCr = (int)(dcrQP + (dcrQP < 0 ? 
-0.5 : 0.5)); dqpCr = AOMMIN(0, dqpCr); dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); return dqpCr; } void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, int q, int enable_chroma_deltaq, int enable_hdr_deltaq, bool is_allintra, aom_tune_metric tuning) { // quantizer has to be reinitialized with av1_init_quantizer() if any // delta_q changes. CommonQuantParams *quant_params = &cm->quant_params; quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q); quant_params->y_dc_delta_q = 0; if (enable_chroma_deltaq) { if (is_allintra && tuning == AOM_TUNE_IQ) { int chroma_dc_delta_q = 0; int chroma_ac_delta_q = 0; if (cm->seq_params->subsampling_x == 1 && cm->seq_params->subsampling_y == 1) { // 4:2:0 subsampling: Constant chroma delta_q decrease (i.e. improved // chroma quality relative to luma) with gradual ramp-down for very low // qindexes. // Lowering chroma delta_q by 16 was found to improve SSIMULACRA 2 // BD-Rate by 1.5-2% on Daala's subset1, as well as reducing chroma // artifacts (smudging, discoloration) during subjective quality // evaluations. // The ramp-down of chroma increase was determined by generating the // convex hull of SSIMULACRA 2 scores (for all boosts from 0-16), and // finding a linear equation that fits the convex hull. chroma_dc_delta_q = -clamp((quant_params->base_qindex / 2) - 14, 0, 16); chroma_ac_delta_q = chroma_dc_delta_q; } else if (cm->seq_params->subsampling_x == 1 && cm->seq_params->subsampling_y == 0) { // 4:2:2 subsampling: Constant chroma AC delta_q increase (i.e. improved // luma quality relative to chroma) with gradual ramp-down for very low // qindexes. // SSIMULACRA 2 appears to have some issues correctly scoring 4:2:2 // material. Solely optimizing for maximum scores suggests a chroma AC // delta_q of 12 is the most efficient. However, visual inspection on // difficult-to-encode material resulted in chroma quality degrading too // much relative to luma, and chroma channels ending up being too small // compared to equivalent 4:4:4 or 4:2:0 encodes. // A chroma AC delta_q of 6 was selected because encoded chroma channels // have a much closer size to 4:4:4 and 4:2:0 encodes, and have more // favorable visual quality characteristics. // The ramp-down of chroma decrease was put into place to match 4:2:0 // and 4:4:4 behavior. There were no special considerations on // SSIMULACRA 2 scores. chroma_dc_delta_q = 0; chroma_ac_delta_q = clamp((quant_params->base_qindex / 2), 0, 6); } else if (cm->seq_params->subsampling_x == 0 && cm->seq_params->subsampling_y == 0) { // 4:4:4 subsampling: Constant chroma AC delta_q increase (i.e. improved // luma quality relative to chroma) with gradual ramp-down for very low // qindexes. // Raising chroma AC delta_q by 24 was found to improve SSIMULACRA 2 // BD-Rate by 2.5-3% on Daala's subset1, as well as providing a more // balanced bit allocation between the (relatively-starved) luma and // chroma channels. // Raising chroma DC delta_q appears to be harmful, both for SSIMULACRA // 2 scores and subjective quality (harshens blocking artifacts). // The ramp-down of chroma decrease was put into place so (lossy) QP 0 // encodes still score within 0.1 SSIMULACRA 2 points of the equivalent // with no chroma delta_q (with a small efficiency improvement), while // encodes in the SSIMULACRA 2 <=90 range yield full benefits from this // adjustment. 
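          // Worked example of the ramp-down applied below: a base_qindex of 0
          // yields clamp(0, 0, 24) = 0 (no chroma AC delta_q), base_qindex 20
          // yields clamp(10, 0, 24) = 10, and any base_qindex >= 48 saturates
          // at the full +24 adjustment.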
chroma_dc_delta_q = 0; chroma_ac_delta_q = clamp((quant_params->base_qindex / 2), 0, 24); } // TODO: bug https://crbug.com/aomedia/375221136 - find chroma_delta_q // values for 4:2:2 subsampling mode. quant_params->u_dc_delta_q = chroma_dc_delta_q; quant_params->u_ac_delta_q = chroma_ac_delta_q; quant_params->v_dc_delta_q = chroma_dc_delta_q; quant_params->v_ac_delta_q = chroma_ac_delta_q; } else { // TODO(aomedia:2717): need to design better delta quant_params->u_dc_delta_q = 2; quant_params->u_ac_delta_q = 2; quant_params->v_dc_delta_q = 2; quant_params->v_ac_delta_q = 2; } } else { quant_params->u_dc_delta_q = 0; quant_params->u_ac_delta_q = 0; quant_params->v_dc_delta_q = 0; quant_params->v_ac_delta_q = 0; } // following section 8.3.2 in T-REC-H.Sup15 document // to apply to AV1 qindex in the range of [0, 255] if (enable_hdr_deltaq) { int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex); int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex); quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb; quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr; if (dqpCb != dqpCr) { cm->seq_params->separate_uv_delta_q = 1; } } // Select the best luma and chroma QM formulas based on encoding mode and // tuning int (*get_luma_qmlevel)(int, int, int); int (*get_chroma_qmlevel)(int, int, int); if (is_allintra) { if (tuning == AOM_TUNE_IQ) { // Use luma QM formula specifically tailored for tune IQ get_luma_qmlevel = aom_get_qmlevel_luma_iq; if (cm->seq_params->subsampling_x == 0 && cm->seq_params->subsampling_y == 0) { // 4:4:4 subsampling mode has 4x the number of chroma coefficients // compared to 4:2:0 (2x on each dimension). This means the encoder // should use lower chroma QM levels that more closely match the scaling // of an equivalent 4:2:0 chroma QM. get_chroma_qmlevel = aom_get_qmlevel_444_chroma_iq; } else { // For all other chroma subsampling modes, use the all intra QM formula get_chroma_qmlevel = aom_get_qmlevel_allintra; } } else { get_luma_qmlevel = aom_get_qmlevel_allintra; get_chroma_qmlevel = aom_get_qmlevel_allintra; } } else { get_luma_qmlevel = aom_get_qmlevel; get_chroma_qmlevel = aom_get_qmlevel; } quant_params->qmatrix_level_y = get_luma_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel); quant_params->qmatrix_level_u = get_chroma_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, min_qmlevel, max_qmlevel); if (cm->seq_params->separate_uv_delta_q) { quant_params->qmatrix_level_v = get_chroma_qmlevel( quant_params->base_qindex + quant_params->v_ac_delta_q, min_qmlevel, max_qmlevel); } else { quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; } } // Table that converts 0-63 Q-range values passed in outside to the Qindex // range used internally. static const int quantizer_to_qindex[] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, }; int av1_quantizer_to_qindex(int quantizer) { return quantizer_to_qindex[quantizer]; } int av1_qindex_to_quantizer(int qindex) { int quantizer; for (quantizer = 0; quantizer < 64; ++quantizer) if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; return 63; } aom-3.12.1/av1/encoder/av1_quantize.h000066400000000000000000000210611477627663500172670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ #define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ #include #include "config/aom_config.h" #include "aom/aomcx.h" #include "av1/common/quant_common.h" #include "av1/common/scan.h" #include "av1/encoder/block.h" #ifdef __cplusplus extern "C" { #endif typedef struct QUANT_PARAM { int log_scale; TX_SIZE tx_size; const qm_val_t *qmatrix; const qm_val_t *iqmatrix; int use_quant_b_adapt; int use_optimize_b; int xform_quant_idx; } QUANT_PARAM; typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); // The QUANTS structure is used only for internal quantizer setup in // av1_quantize.c. // All of its fields use the same coefficient shift/scaling at TX. typedef struct { // 0: dc 1: ac 2-8: ac repeated to SIMD width DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); // TODO(jingning): in progress of re-working the quantization. will decide // if we want to deprecate the current use of y_quant. DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); } QUANTS; // The Dequants structure is used only for internal quantizer setup in // av1_quantize.c. // Fields are suffixed according to whether or not they're expressed in // the same coefficient shift/precision as TX or a fixed Q3 format. typedef struct { DECLARE_ALIGNED(16, int16_t, y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width DECLARE_ALIGNED(16, int16_t, u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width DECLARE_ALIGNED(16, int16_t, v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width } Dequants; // The DeltaQuantParams structure holds the dc/ac deltaq parameters. typedef struct { int y_dc_delta_q; int u_dc_delta_q; int u_ac_delta_q; int v_dc_delta_q; int v_ac_delta_q; } DeltaQuantParams; typedef struct { // Quantization parameters for internal quantizer setup. QUANTS quants; // Dequantization parameters for internal quantizer setup. Dequants dequants; // Deltaq parameters to track the state of the dc/ac deltaq parameters in // cm->quant_params. 
It is used to decide whether the quantizer tables need // to be re-initialized. DeltaQuantParams prev_deltaq_params; } EncQuantDequantParams; struct AV1_COMP; struct AV1Common; void av1_frame_init_quantizer(struct AV1_COMP *cpi); void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, int segment_id, const int do_update); void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, int v_ac_delta_q, QUANTS *const quants, Dequants *const deq); void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, const CommonQuantParams *quant_params, aom_bit_depth_t bit_depth); void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel, int max_qmlevel, int q, int enable_chroma_deltaq, int enable_hdr_deltaq, bool is_allintra, aom_tune_metric tuning); int av1_quantizer_to_qindex(int quantizer); int av1_qindex_to_quantizer(int qindex); void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); /*!\brief Quantize transform coefficients without using qmatrix * * quant_ptr, dequant_ptr and round_ptr are size 2 arrays, * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs. * * \param[in] quant_ptr 16-bit fixed point representation of inverse * quantize step size, i.e. 2^16/dequant * \param[in] dequant_ptr quantize step size * \param[in] round_ptr rounding * \param[in] log_scale the relative log scale of the transform * coefficients * \param[in] scan scan[i] indicates the position of ith to-be-coded * coefficient * \param[in] coeff_count number of coefficients * \param[out] qcoeff_ptr quantized coefficients * \param[out] dqcoeff_ptr dequantized coefficients * * \return The last non-zero coefficient's scan index plus 1 */ int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], const int16_t dequant_ptr[2], const int16_t round_ptr[2], int log_scale, const int16_t *scan, int coeff_count, const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr); void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); #if CONFIG_AV1_HIGHBITDEPTH void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc, const QUANT_PARAM *qparam); #endif #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ 
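/* Usage sketch (illustrative only, assuming a caller that has already set up
 * the MACROBLOCK_PLANE `p`, the coefficient buffers and the SCAN_ORDER `sc`
 * for the current transform size):
 *
 *   QUANT_PARAM qparam;
 *   qparam.log_scale = av1_get_tx_scale(tx_size);  // 0, 1 or 2
 *   qparam.tx_size = tx_size;
 *   qparam.qmatrix = NULL;   // bypass quantization matrices
 *   qparam.iqmatrix = NULL;
 *   qparam.use_quant_b_adapt = 0;
 *   av1_quantize_b_facade(coeff, n_coeffs, p, qcoeff, dqcoeff, &eob, sc,
 *                         &qparam);
 *
 * With qmatrix/iqmatrix left NULL and use_quant_b_adapt == 0, the facade
 * dispatches to the plain aom_quantize_b/_32x32/_64x64 kernels selected by
 * log_scale.
 */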
aom-3.12.1/av1/encoder/av1_temporal_denoiser.c000066400000000000000000000704571477627663500211520ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" #include "av1/common/reconinter.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/av1_temporal_denoiser.h" #include "av1/encoder/encoder.h" #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv); #endif static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { (void)bs; return 3 + (increase_denoising ? 1 : 0); } static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { (void)bs; (void)increase_denoising; return 4; } static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { (void)bs; (void)increase_denoising; return 625; } static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40); } static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, int motion_magnitude) { if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) { if (increase_denoising) return (1 << num_pels_log2_lookup[bs]) << 2; else return 0; } else { return (1 << num_pels_log2_lookup[bs]) << 4; } } static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); } // TODO(kyslov): If increase_denoising is enabled in the future, // we might need to update the code for calculating 'total_adj' in // case the C code is not bit-exact with corresponding sse2 code. int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { int r, c; const uint8_t *sig_start = sig; const uint8_t *mc_avg_start = mc_avg; uint8_t *avg_start = avg; int diff, adj, absdiff, delta; int adj_val[] = { 3, 4, 6 }; int total_adj = 0; int shift_inc = 1; // If motion_magnitude is small, making the denoiser more aggressive by // increasing the adjustment for each level. Add another increment for // blocks that are labeled for increase denoising. if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { if (increase_denoising) { shift_inc = 2; } adj_val[0] += shift_inc; adj_val[1] += shift_inc; adj_val[2] += shift_inc; } // First attempt to apply a strong temporal denoising filter. 
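  // Worked example of the first pass below, assuming low motion
  // (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) and
  // increase_denoising == 0, so adj_val has become { 4, 5, 7 }: a pixel with
  // |mc_avg - sig| == 2 (within absdiff_thresh == 3) is replaced by the
  // motion-compensated average outright, while a difference of 10 only moves
  // the source pixel 5 steps toward that average.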
for (r = 0; r < block_size_high[bs]; ++r) { for (c = 0; c < block_size_wide[bs]; ++c) { diff = mc_avg[c] - sig[c]; absdiff = abs(diff); if (absdiff <= absdiff_thresh(bs, increase_denoising)) { avg[c] = mc_avg[c]; total_adj += diff; } else { switch (absdiff) { case 4: case 5: case 6: case 7: adj = adj_val[0]; break; case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: adj = adj_val[1]; break; default: adj = adj_val[2]; } if (diff > 0) { avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj); total_adj += adj; } else { avg[c] = AOMMAX(0, sig[c] - adj); total_adj -= adj; } } } sig += sig_stride; avg += avg_stride; mc_avg += mc_avg_stride; } // If the strong filter did not modify the signal too much, we're all set. if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { return FILTER_BLOCK; } // Otherwise, we try to dampen the filter if the delta is not too high. delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >> num_pels_log2_lookup[bs]) + 1; if (delta >= delta_thresh(bs, increase_denoising)) { return COPY_BLOCK; } mc_avg = mc_avg_start; avg = avg_start; sig = sig_start; for (r = 0; r < block_size_high[bs]; ++r) { for (c = 0; c < block_size_wide[bs]; ++c) { diff = mc_avg[c] - sig[c]; adj = abs(diff); if (adj > delta) { adj = delta; } if (diff > 0) { // Diff positive means we made positive adjustment above // (in first try/attempt), so now make negative adjustment to bring // denoised signal down. avg[c] = AOMMAX(0, avg[c] - adj); total_adj -= adj; } else { // Diff negative means we made negative adjustment above // (in first try/attempt), so now make positive adjustment to bring // denoised signal up. avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj); total_adj += adj; } } sig += sig_stride; avg += avg_stride; mc_avg += mc_avg_stride; } // We can use the filter if it has been sufficiently dampened if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { return FILTER_BLOCK; } return COPY_BLOCK; } static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, int mi_col) { return framebuf + (stride * mi_row << 2) + (mi_col << 2); } static AV1_DENOISER_DECISION perform_motion_compensation( AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer, int use_gf_temporal_ref) { const int sse_diff = (ctx->newmv_sse == UINT_MAX) ? 0 : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); int frame; int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MB_MODE_INFO *mi = filter_mbd->mi[0]; MB_MODE_INFO saved_mi; int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; // const RefBuffer *saved_block_refs[2]; MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; saved_mi = *mi; // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, // denoise 16x16 blocks. if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || (bs == BLOCK_16X16 && width > 480 && denoiser->denoising_level <= kDenLow)) return COPY_BLOCK; // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. 
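  // Note on the threshold below: for low-motion blocks sse_diff_thresh() is
  // 16 * the block's pixel count, so the non-zero motion vector is only
  // preferred when its SSE beats the zero-MV SSE by a clear per-pixel margin.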
if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; mi->mv[0] = ctx->best_sse_mv; } else { // Otherwise, use the zero reference frame. frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || frame == ALTREF_FRAME || (frame == GOLDEN_FRAME && use_gf_temporal_ref) || (frame != LAST_FRAME && ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; } mi->ref_frame[0] = frame; mi->mode = GLOBALMV; mi->mv[0].as_int = 0; ctx->best_sse_inter_mode = GLOBALMV; ctx->best_sse_mv.as_int = 0; *zeromv_filter = 1; if (denoiser->denoising_level > kDenMedium) { motion_magnitude = 0; } } saved_frame = frame; // When using SVC, we need to map REF_FRAME to the frame buffer index. if (use_svc) { if (frame == LAST_FRAME) frame = lst_fb_idx + 1; else if (frame == GOLDEN_FRAME) frame = gld_fb_idx + 1; // Shift for the second spatial layer. if (num_spatial_layers - spatial_layer == 2) frame = frame + denoiser->num_ref_frames; denoise_layer_idx = num_spatial_layers - spatial_layer - 1; } // Force copy (no denoise, copy source in denoised buffer) if // running_avg_y[frame] is NULL. if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { // Restore everything to its original state *mi = saved_mi; return COPY_BLOCK; } if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; return COPY_BLOCK; } if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) { // Restore everything to its original state *mi = saved_mi; return COPY_BLOCK; } // We will restore these after motion compensation. for (i = 0; i < MAX_MB_PLANE; ++i) { saved_pre[i] = filter_mbd->plane[i].pre[0]; saved_dst[i] = filter_mbd->plane[i].dst; } // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser // struct. set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row, mi_col, filter_mbd->block_ref_scale_factors[0], 1); av1_setup_dst_planes(filter_mbd->plane, bs, &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row, mi_col, 0, 1); av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col); // Restore everything to its original state *mi = saved_mi; for (i = 0; i < MAX_MB_PLANE; ++i) { filter_mbd->plane[i].pre[0] = saved_pre[i]; filter_mbd->plane[i].dst = saved_dst[i]; } return FILTER_BLOCK; } void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, AV1_DENOISER_DECISION *denoiser_decision, int use_gf_temporal_ref) { int mv_col, mv_row; int motion_magnitude = 0; int zeromv_filter = 0; AV1_DENOISER *denoiser = &cpi->denoiser; AV1_DENOISER_DECISION decision = COPY_BLOCK; const int shift = cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 
denoiser->num_ref_frames : 0; YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; const int denoise_layer_index = cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; int increase_denoising = 0; int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; motion_magnitude = mv_row * mv_row + mv_col * mv_col; if (denoiser->denoising_level == kDenHigh) increase_denoising = 1; // Copy block if LAST_FRAME is not a reference. // Last doesn't always exist when SVC layers are dynamically changed, e.g. top // spatial layer doesn't have last reference when it's brought up for the // first time on the fly. if (last_is_reference && denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) decision = perform_motion_compensation( &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers, cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0], cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref); if (decision == FILTER_BLOCK) { decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, avg.y_stride, increase_denoising, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, block_size_wide[bs], block_size_high[bs]); } else { // COPY_BLOCK aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, block_size_wide[bs], block_size_high[bs]); } *denoiser_decision = decision; if (decision == FILTER_BLOCK && zeromv_filter == 1) *denoiser_decision = FILTER_ZEROMV_BLOCK; } static void copy_frame(YV12_BUFFER_CONFIG *const dest, const YV12_BUFFER_CONFIG *const src) { int r; const uint8_t *srcbuf = src->y_buffer; uint8_t *destbuf = dest->y_buffer; assert(dest->y_width == src->y_width); assert(dest->y_height == src->y_height); for (r = 0; r < dest->y_height; ++r) { memcpy(destbuf, srcbuf, dest->y_width); destbuf += dest->y_stride; srcbuf += src->y_stride; } } static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, YV12_BUFFER_CONFIG *const src) { uint8_t *tmp_buf = dest->y_buffer; assert(dest->y_width == src->y_width); assert(dest->y_height == src->y_height); dest->y_buffer = src->y_buffer; src->y_buffer = tmp_buf; } void av1_denoiser_update_frame_info( AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) { const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or // if the just encoded frame was resized. For SVC, copy source if the base // spatial layer was key frame. 
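  // Buffer layout reminder: running_avg_y[0 + shift] is the current denoised
  // frame (INTRA_FRAME slot) and indices 1 + shift through
  // num_ref_frames - 1 + shift are the per-reference copies; `shift` selects
  // the second spatial layer's set of buffers when that layer is also
  // denoised.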
if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || svc_refresh_denoiser_buffers) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME for (i = 1; i < denoiser->num_ref_frames; ++i) { if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) copy_frame(&denoiser->running_avg_y[i + shift], &src); } denoiser->reset = 0; return; } if (rtc_ref->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) copy_frame(&denoiser->running_avg_y[i + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { // If more than one refresh occurs, must copy frame buffer. if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if (refresh_alt_ref_frame) { copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { if (refresh_alt_ref_frame) { swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], &denoiser->running_avg_y[INTRA_FRAME + shift]); } } } } void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->zeromv_sse = INT64_MAX; ctx->newmv_sse = INT64_MAX; ctx->zeromv_lastref_sse = INT64_MAX; ctx->best_sse_mv.as_int = 0; } void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx) { if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { ctx->zeromv_sse = sse; ctx->best_zeromv_reference_frame = mi->ref_frame[0]; if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse; } if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) { ctx->newmv_sse = sse; ctx->best_sse_inter_mode = mode; ctx->best_sse_mv = mi->mv[0]; ctx->best_reference_frame = mi->ref_frame[0]; } } static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm, AV1_DENOISER *denoiser, int fb_idx) { int fail = 0; if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[fb_idx], cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; } } return 0; } int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, struct RTC_REF *rtc_ref, struct SVC *svc, int svc_buf_shift, int refresh_alt, int refresh_gld, int refresh_lst, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { int fail = 0; if (rtc_ref->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if (cm->current_frame.frame_type == KEY_FRAME || rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) { fail = av1_denoiser_realloc_svc_helper(cm, denoiser, i + 1 + svc_buf_shift); } } } else { if (refresh_alt) { // Increase the frame buffer index by 1 to map it to the buffer index in // the denoiser. 
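      // e.g. a refreshed ALTREF sitting in frame-buffer slot 6 on the second
      // denoised spatial layer (svc_buf_shift == num_ref_frames) maps to
      // running_avg_y[6 + 1 + num_ref_frames].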
fail = av1_denoiser_realloc_svc_helper(cm, denoiser, alt_fb_idx + 1 + svc_buf_shift); if (fail) return 1; } if (refresh_gld) { fail = av1_denoiser_realloc_svc_helper(cm, denoiser, gld_fb_idx + 1 + svc_buf_shift); if (fail) return 1; } if (refresh_lst) { fail = av1_denoiser_realloc_svc_helper(cm, denoiser, lst_fb_idx + 1 + svc_buf_shift); if (fail) return 1; } } return 0; } int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, int use_svc, int noise_sen, int width, int height, int ssx, int ssy, int use_highbitdepth, int border) { int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; int num_layers = 1; int scaled_width = width; int scaled_height = height; if (use_svc) { LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + svc->temporal_layer_id]; av1_get_layer_resolution(width, height, lc->scaling_factor_num, lc->scaling_factor_den, &scaled_width, &scaled_height); // For SVC: only denoise at most 2 spatial (highest) layers. if (noise_sen >= 2) // Denoise from one spatial layer below the top. svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0); else // Only denoise the top spatial layer. svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0); num_layers = svc->number_spatial_layers - svc->first_layer_denoise; } assert(denoiser != NULL); denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_layers = num_layers; CHECK_MEM_ERROR(cm, denoiser->running_avg_y, aom_calloc(denoiser->num_ref_frames * num_layers, sizeof(denoiser->running_avg_y[0]))); CHECK_MEM_ERROR( cm, denoiser->mc_running_avg_y, aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); for (layer = 0; layer < num_layers; ++layer) { const int denoise_width = (layer == 0) ? width : scaled_width; const int denoise_height = (layer == 0) ? height : scaled_height; for (i = 0; i < init_num_ref_frames; ++i) { fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; } #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif } fail = aom_alloc_frame_buffer( &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; } } // denoiser->last_source only used for noise_estimation, so only for top // layer. 
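  // Allocation summary: running_avg_y holds num_ref_frames buffers for each
  // denoised layer, mc_running_avg_y holds one scratch buffer per layer, and
  // last_source below is a single buffer at the width/height passed in, since
  // noise estimation only runs on the top layer.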
fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, use_highbitdepth, border, legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; } #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif denoiser->frame_buffer_initialized = 1; denoiser->denoising_level = kDenMedium; denoiser->prev_denoising_level = kDenMedium; denoiser->reset = 0; denoiser->current_denoiser_frame = 0; return 0; } void av1_denoiser_free(AV1_DENOISER *denoiser) { int i; if (denoiser == NULL) { return; } denoiser->frame_buffer_initialized = 0; for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { aom_free_frame_buffer(&denoiser->running_avg_y[i]); } aom_free(denoiser->running_avg_y); denoiser->running_avg_y = NULL; for (i = 0; i < denoiser->num_layers; ++i) { aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]); } aom_free(denoiser->mc_running_avg_y); denoiser->mc_running_avg_y = NULL; aom_free_frame_buffer(&denoiser->last_source); } // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 static void force_refresh_longterm_ref(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; // If long term reference is used, force refresh of that slot, so // denoiser buffer for long term reference stays in sync. if (svc->use_gf_temporal_ref_current_layer) { int index = svc->spatial_layer_id; if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; assert(index >= 0); cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; cpi->refresh_alt_ref_frame = 1; } } #endif void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) { AV1_DENOISER *const denoiser = &cpi->denoiser; denoiser->denoising_level = noise_level; if (denoiser->denoising_level > kDenLowLow && denoiser->prev_denoising_level == kDenLowLow) { denoiser->reset = 1; // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 force_refresh_longterm_ref(cpi); #endif } else { denoiser->reset = 0; } denoiser->prev_denoising_level = denoiser->denoising_level; } // Scale/increase the partition threshold // for denoiser speed-up. int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, CONTENT_STATE_SB content_state, int temporal_layer_id) { if ((content_state.source_sad_nonrd <= kLowSad && content_state.low_sumdiff) || (content_state.source_sad_nonrd == kHighSad && content_state.low_sumdiff) || (content_state.lighting_change && !content_state.low_sumdiff) || (noise_level == kDenHigh) || (temporal_layer_id != 0)) { int64_t scaled_thr = (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; return scaled_thr; } else { return (5 * threshold) >> 2; } } // Scale/increase the ac skip threshold for // denoiser speed-up. int64_t av1_scale_acskip_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { if (noise_level >= kDenLow && abs_sumdiff < 5) threshold *= (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 
10 : 6; return threshold; } void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) { if (/*av1_denoise_svc_non_key(cpi) &&*/ cpi->denoiser.current_denoiser_frame == 0) { cpi->denoiser.reset = 1; // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 force_refresh_longterm_ref(cpi); #endif } } void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; SVC *const svc = &cpi->svc; if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { int svc_refresh_denoiser_buffers = 0; int denoise_svc_second_layer = 0; FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME ? KEY_FRAME : cm->current_frame.frame_type; cpi->denoiser.current_denoiser_frame++; const int resize_pending = is_frame_resize_pending(cpi); if (cpi->ppi->use_svc) { // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 const int svc_buf_shift = svc->number_spatial_layers - svc->spatial_layer_id == 2 ? cpi->denoiser.num_ref_frames : 0; int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; svc_refresh_denoiser_buffers = lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; denoise_svc_second_layer = svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; // Check if we need to allocate extra buffers in the denoiser // for refreshed frames. if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref, svc, svc_buf_shift, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, cpi->lst_fb_idx)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to re-allocate denoiser for SVC"); #endif } av1_denoiser_update_frame_info( &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type, cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1, rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0], resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer); } } #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; uint8_t *u = yuv->u_buffer; uint8_t *v = yuv->v_buffer; for (r = 0; r < yuv->uv_height; ++r) { for (c = 0; c < yuv->uv_width; ++c) { u[c] = UINT8_MAX / 2; v[c] = UINT8_MAX / 2; } u += yuv->uv_stride; v += yuv->uv_stride; } } void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { unsigned char *src = s->y_buffer; int h = s->y_crop_height; do { fwrite(src, s->y_width, 1, yuv_file); src += s->y_stride; } while (--h); src = s->u_buffer; h = s->uv_crop_height; do { fwrite(src, s->uv_width, 1, yuv_file); src += s->uv_stride; } while (--h); src = s->v_buffer; h = s->uv_crop_height; do { fwrite(src, s->uv_width, 1, yuv_file); src += s->uv_stride; } while (--h); } #endif aom-3.12.1/av1/encoder/av1_temporal_denoiser.h000066400000000000000000000112001477627663500211340ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ #define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ #include "av1/encoder/block.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus extern "C" { #endif #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) // Denoiser is used in non svc real-time mode which does not use alt-ref, so no // need to allocate for it, and hence we need MAX_REF_FRAME - 1 #define NONSVC_REF_FRAMES REF_FRAMES - 1 // Number of frame buffers when SVC is used. [0] for current denoised buffer and // [1..8] for REF_FRAMES #define SVC_REF_FRAMES 9 typedef enum av1_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, FILTER_ZEROMV_BLOCK } AV1_DENOISER_DECISION; typedef enum av1_denoiser_level { kDenLowLow, kDenLow, kDenMedium, kDenHigh } AV1_DENOISER_LEVEL; typedef struct av1_denoiser { YV12_BUFFER_CONFIG *running_avg_y; YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; int frame_buffer_initialized; int reset; int num_ref_frames; int num_layers; unsigned int current_denoiser_frame; AV1_DENOISER_LEVEL denoising_level; AV1_DENOISER_LEVEL prev_denoising_level; } AV1_DENOISER; typedef struct { int64_t zero_last_cost_orig; unsigned int *ref_frame_cost; int_mv (*frame_mv)[REF_FRAMES]; int reuse_inter_pred; TX_SIZE best_tx_size; PREDICTION_MODE best_mode; MV_REFERENCE_FRAME best_ref_frame; int_interpfilters best_pred_filter; uint8_t best_mode_skip_txfm; } AV1_PICKMODE_CTX_DEN; struct AV1_COMP; struct SVC; struct RTC_REF; void av1_denoiser_update_frame_info( AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer); void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, AV1_DENOISER_DECISION *denoiser_decision, int use_gf_temporal_ref); void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, struct RTC_REF *rtc, struct SVC *svc, int svc_buf_shift, int refresh_alt, int refresh_gld, int refresh_lst, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx); int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, int use_svc, int noise_sen, int width, int height, int ssx, int ssy, int use_highbitdepth, int border); #if CONFIG_AV1_TEMPORAL_DENOISING // This function is used by both c and sse2 denoiser implementations. // Define it as a static function within the scope where av1_denoiser.h // is referenced. static inline int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 
3 : 2); } #endif void av1_denoiser_free(AV1_DENOISER *denoiser); void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level); void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi); int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, CONTENT_STATE_SB content_state, int temporal_layer_id); int64_t av1_scale_acskip_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id); void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi); void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ aom-3.12.1/av1/encoder/bitstream.c000066400000000000000000005163471477627663500166650ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include #include "aom/aom_encoder.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/bitwriter_buffer.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" #include "aom_ports/mem_ops.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG #include "av1/common/cdef.h" #include "av1/common/cfl.h" #include "av1/common/debugmodes.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" #include "av1/common/mvref_common.h" #include "av1/common/pred_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/palette.h" #include "av1/encoder/pickrst.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" #define ENC_MISMATCH_DEBUG 0 #define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker #define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile static inline void write_uniform(aom_writer *w, int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return; if (v < m) { aom_write_literal(w, v, l - 1); } else { aom_write_literal(w, m + ((v - m) >> 1), l - 1); aom_write_literal(w, (v - m) & 1, 1); } } #if !CONFIG_REALTIME_ONLY static inline void loop_restoration_write_sb_coeffs( const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, aom_writer *const w, int plane, FRAME_COUNTS *counts); #endif static inline void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx, const MB_MODE_INFO *mi, const MB_MODE_INFO *above_mi, const MB_MODE_INFO *left_mi, PREDICTION_MODE mode, aom_writer *w) { assert(!is_intrabc_block(mi)); (void)mi; aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), INTRA_MODES); } static inline void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, FRAME_CONTEXT *ec_ctx, const int16_t 
mode_ctx) { const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); if (mode != NEWMV) { const int16_t zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); if (mode != GLOBALMV) { int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); } } } static inline void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { assert(mbmi->ref_mv_idx < 3); const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; if (new_mv) { int idx; for (idx = 0; idx < 2; ++idx) { if (mbmi_ext_frame->ref_mv_count > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], 2); if (mbmi->ref_mv_idx == idx) return; } } return; } if (have_nearmv_in_inter_mode(mbmi->mode)) { int idx; // TODO(jingning): Temporary solution to compensate the NEARESTMV offset. for (idx = 1; idx < 3; ++idx) { if (mbmi_ext_frame->ref_mv_count > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), ec_ctx->drl_cdf[drl_ctx], 2); if (mbmi->ref_mv_idx == (idx - 1)) return; } } return; } } static inline void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, PREDICTION_MODE mode, const int16_t mode_ctx) { assert(is_inter_compound_mode(mode)); aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], INTER_COMPOUND_MODES); } static inline void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, TX_SIZE tx_size, int depth, int blk_row, int blk_col, aom_writer *w) { FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; if (depth == MAX_VARTX_DEPTH) { txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); return; } const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, mbmi->bsize, tx_size); const int txb_size_index = av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col); const int write_txfm_partition = tx_size == mbmi->inter_tx_size[txb_size_index]; if (write_txfm_partition) { aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); // TODO(yuec): set correct txfm partition update for qttx } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); if (sub_txs == TX_4X4) { txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, sub_txs, tx_size); return; } assert(bsw > 0 && bsh > 0); for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { const int offsetr = blk_row + row; for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { const int offsetc = blk_col + col; write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); } } } } static inline void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) { const MB_MODE_INFO *const 
mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (block_signals_txsize(bsize)) { const TX_SIZE tx_size = mbmi->tx_size; const int tx_size_ctx = get_tx_size_context(xd); const int depth = tx_size_to_depth(tx_size, bsize); const int max_depths = bsize_to_max_depth(bsize); const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); assert(depth >= 0 && depth <= max_depths); assert(!is_inter_block(mbmi)); assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], max_depths + 1); } } static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t segment_id, const MB_MODE_INFO *mi, aom_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip_txfm = mi->skip_txfm; const int ctx = av1_get_skip_txfm_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2); return skip_txfm; } } static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t segment_id, const MB_MODE_INFO *mi, aom_writer *w) { if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 0; } const int skip_mode = mi->skip_mode; if (!is_comp_ref_allowed(mi->bsize)) { assert(!skip_mode); return 0; } if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { // These features imply single-reference mode, while skip mode implies // compound reference. Hence, the two are mutually exclusive. // In other words, skip_mode is implicitly 0 here. assert(!skip_mode); return 0; } const int ctx = av1_get_skip_mode_context(xd); aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); return skip_mode; } static inline void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd, uint8_t segment_id, aom_writer *w, const int is_inter) { if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { assert(is_inter); return; } const int ctx = av1_get_intra_inter_context(xd); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); } } static inline void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, aom_writer *w) { MOTION_MODE last_motion_mode_allowed = cm->features.switchable_motion_mode ? motion_mode_allowed(cm->global_motion, xd, mbmi, cm->features.allow_warped_motion) : SIMPLE_TRANSLATION; assert(mbmi->motion_mode <= last_motion_mode_allowed); switch (last_motion_mode_allowed) { case SIMPLE_TRANSLATION: break; case OBMC_CAUSAL: aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2); break; default: aom_write_symbol(w, mbmi->motion_mode, xd->tile_ctx->motion_mode_cdf[mbmi->bsize], MOTION_MODES); } } static inline void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex, aom_writer *w) { int sign = delta_qindex < 0; int abs = sign ? -delta_qindex : delta_qindex; int rem_bits, thr; int smallval = abs < DELTA_Q_SMALL ? 
1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1); if (!smallval) { rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits - 1, 3); aom_write_literal(w, abs - thr, rem_bits); } if (abs > 0) { aom_write_bit(w, sign); } } static inline void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, int lf_id, int delta_lflevel, int delta_lf_multi, aom_writer *w) { int sign = delta_lflevel < 0; int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; (void)cm; if (delta_lf_multi) { assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2)); aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); } else { aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, DELTA_LF_PROBS + 1); } if (!smallval) { rem_bits = get_msb(abs - 1); thr = (1 << rem_bits) + 1; aom_write_literal(w, rem_bits - 1, 3); aom_write_literal(w, abs - thr, rem_bits); } if (abs > 0) { aom_write_bit(w, sign); } } static inline void pack_map_tokens(aom_writer *w, const TokenExtra **tp, int n, int num, MapCdf map_pb_cdf) { const TokenExtra *p = *tp; const int palette_size_idx = n - PALETTE_MIN_SIZE; write_uniform(w, n, p->token); // The first color index. ++p; --num; for (int i = 0; i < num; ++i) { assert((p->color_ctx >= 0) && (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS)); aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx]; aom_write_symbol(w, p->token, color_map_cdf, n); ++p; } *tp = p; } static inline void pack_txb_tokens( aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp, const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE plane_tx_size = plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, pd->subsampling_y) : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, blk_col)]; if (tx_size == plane_tx_size || plane) { av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size); #if CONFIG_RD_DEBUG TOKEN_STATS tmp_token_stats; init_token_stats(&tmp_token_stats); token_stats->cost += tmp_token_stats.cost; #endif } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); for (int r = 0; r < row_end; r += bsh) { const int offsetr = blk_row + r; for (int c = 0; c < col_end; c += bsw) { const int offsetc = blk_col + c; pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, offsetc, sub_txs, token_stats); block += step; } } } } static inline void set_spatial_segment_id( const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) { const int mi_offset = mi_row * mi_params->mi_cols + mi_col; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); const int mi_stride = mi_params->mi_cols; set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id); } int av1_neg_interleave(int x, int ref, int max) { assert(x < max); const int diff = x - ref; if (!ref) return x; if (ref >= (max - 1)) return -x + max - 1; if (2 * ref < max) { if (abs(diff) <= ref) { if (diff > 0) return (diff << 1) - 1; else return ((-diff) << 1); } return x; } else { if (abs(diff) < (max - ref)) { if (diff > 0) return (diff << 1) - 1; else return ((-diff) << 1); } return (max - x) - 1; } } static inline void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, aom_writer *w, const struct segmentation *seg, struct segmentation_probs *segp, int skip_txfm) { if (!seg->enabled || !seg->update_map) return; AV1_COMMON *const cm = &cpi->common; int cdf_num; const uint8_t pred = av1_get_spatial_seg_pred( cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; if (skip_txfm) { // Still need to transmit tx size for intra blocks even if skip_txfm is // true. Changing segment_id may make the tx size become invalid, e.g // changing from lossless to lossy. 
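  // When skip_txfm is set here, no segment id symbol is coded at all: the
  // spatially predicted id is written into both segmentation maps and copied
  // into mbmi->segment_id below, so later spatial prediction contexts are
  // derived from the predicted id rather than the originally chosen one.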
assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment); set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, mi_row, mi_col, pred); set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize, mi_row, mi_col, pred); /* mbmi is read only but we need to update segment_id */ ((MB_MODE_INFO *)mbmi)->segment_id = pred; return; } const int coded_id = av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, mi_row, mi_col, mbmi->segment_id); } #define WRITE_REF_BIT(bname, pname) \ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) // This function encodes the reference frame static inline void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd, aom_writer *w) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); const uint8_t segment_id = mbmi->segment_id; // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { assert(!is_compound); assert(mbmi->ref_frame[0] == get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { assert(!is_compound); assert(mbmi->ref_frame[0] == LAST_FRAME); } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { if (is_comp_ref_allowed(mbmi->bsize)) aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); } else { assert((!is_compound) == (cm->current_frame.reference_mode == SINGLE_REFERENCE)); } if (is_compound) { const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? 
UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), 2); if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; WRITE_REF_BIT(bit, uni_comp_ref_p); if (!bit) { assert(mbmi->ref_frame[0] == LAST_FRAME); const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || mbmi->ref_frame[1] == GOLDEN_FRAME; WRITE_REF_BIT(bit1, uni_comp_ref_p1); if (bit1) { const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; WRITE_REF_BIT(bit2, uni_comp_ref_p2); } } else { assert(mbmi->ref_frame[1] == ALTREF_FRAME); } return; } assert(comp_ref_type == BIDIR_COMP_REFERENCE); const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || mbmi->ref_frame[0] == LAST3_FRAME); WRITE_REF_BIT(bit, comp_ref_p); if (!bit) { const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; WRITE_REF_BIT(bit1, comp_ref_p1); } else { const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; WRITE_REF_BIT(bit2, comp_ref_p2); } const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; WRITE_REF_BIT(bit_bwd, comp_bwdref_p); if (!bit_bwd) { WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); } } else { const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && mbmi->ref_frame[0] >= BWDREF_FRAME); WRITE_REF_BIT(bit0, single_ref_p1); if (bit0) { const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; WRITE_REF_BIT(bit1, single_ref_p2); if (!bit1) { WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); } } else { const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[0] == GOLDEN_FRAME); WRITE_REF_BIT(bit2, single_ref_p3); if (!bit2) { const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; WRITE_REF_BIT(bit3, single_ref_p4); } else { const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; WRITE_REF_BIT(bit4, single_ref_p5); } } } } } static inline void write_filter_intra_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, aom_writer *w) { if (av1_filter_intra_allowed(cm, mbmi)) { aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2); if (mbmi->filter_intra_mode_info.use_filter_intra) { const FILTER_INTRA_MODE mode = mbmi->filter_intra_mode_info.filter_intra_mode; aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES); } } } static inline void write_angle_delta(aom_writer *w, int angle_delta, aom_cdf_prob *cdf) { aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, 2 * MAX_ANGLE_DELTA + 1); } static inline void write_mb_interp_filter(AV1_COMMON *const cm, ThreadData *td, aom_writer *w) { const MACROBLOCKD *xd = &td->mb.e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!av1_is_interp_needed(xd)) { int_interpfilters filters = av1_broadcast_interp_filter( av1_unswitchable_filter(cm->features.interp_filter)); assert(mbmi->interp_filters.as_int == filters.as_int); (void)filters; return; } if (cm->features.interp_filter == SWITCHABLE) { int dir; for (dir = 0; dir < 2; ++dir) { const int ctx = av1_get_pred_context_switchable_interp(xd, dir); InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); ++td->interp_filter_selected[filter]; if (cm->seq_params->enable_dual_filter == 0) return; } } } // Transmit color values with delta encoding. Write the first value as // literal, and the deltas between each value and the previous one. 
"min_val" is // the smallest possible value of the deltas. static inline void delta_encode_palette_colors(const int *colors, int num, int bit_depth, int min_val, aom_writer *w) { if (num <= 0) return; assert(colors[0] < (1 << bit_depth)); aom_write_literal(w, colors[0], bit_depth); if (num == 1) return; int max_delta = 0; int deltas[PALETTE_MAX_SIZE]; memset(deltas, 0, sizeof(deltas)); for (int i = 1; i < num; ++i) { assert(colors[i] < (1 << bit_depth)); const int delta = colors[i] - colors[i - 1]; deltas[i - 1] = delta; assert(delta >= min_val); if (delta > max_delta) max_delta = delta; } const int min_bits = bit_depth - 3; int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); assert(bits <= bit_depth); int range = (1 << bit_depth) - colors[0] - min_val; aom_write_literal(w, bits - min_bits, 2); for (int i = 0; i < num - 1; ++i) { aom_write_literal(w, deltas[i] - min_val, bits); range -= deltas[i]; bits = AOMMIN(bits, av1_ceil_log2(range)); } } // Transmit luma palette color values. First signal if each color in the color // cache is used. Those colors that are not in the cache are transmitted with // delta encoding. static inline void write_palette_colors_y(const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, int bit_depth, aom_writer *w) { const int n = pmi->palette_size[0]; uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); int out_cache_colors[PALETTE_MAX_SIZE]; uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; const int n_out_cache = av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, cache_color_found, out_cache_colors); int n_in_cache = 0; for (int i = 0; i < n_cache && n_in_cache < n; ++i) { const int found = cache_color_found[i]; aom_write_bit(w, found); n_in_cache += found; } assert(n_in_cache + n_out_cache == n); delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); } // Write chroma palette color values. U channel is handled similarly to the luma // channel. For v channel, either use delta encoding or transmit raw values // directly, whichever costs less. static inline void write_palette_colors_uv(const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, int bit_depth, aom_writer *w) { const int n = pmi->palette_size[1]; const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; // U channel colors. uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); int out_cache_colors[PALETTE_MAX_SIZE]; uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; const int n_out_cache = av1_index_color_cache( color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); int n_in_cache = 0; for (int i = 0; i < n_cache && n_in_cache < n; ++i) { const int found = cache_color_found[i]; aom_write_bit(w, found); n_in_cache += found; } delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); // V channel colors. Don't use color cache as the colors are not sorted. 
const int max_val = 1 << bit_depth; int zero_count = 0, min_bits_v = 0; int bits_v = av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); const int rate_using_delta = 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; const int rate_using_raw = bit_depth * n; if (rate_using_delta < rate_using_raw) { // delta encoding assert(colors_v[0] < (1 << bit_depth)); aom_write_bit(w, 1); aom_write_literal(w, bits_v - min_bits_v, 2); aom_write_literal(w, colors_v[0], bit_depth); for (int i = 1; i < n; ++i) { assert(colors_v[i] < (1 << bit_depth)); if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. aom_write_literal(w, 0, bits_v); continue; } const int delta = abs((int)colors_v[i] - colors_v[i - 1]); const int sign_bit = colors_v[i] < colors_v[i - 1]; if (delta <= max_val - delta) { aom_write_literal(w, delta, bits_v); aom_write_bit(w, sign_bit); } else { aom_write_literal(w, max_val - delta, bits_v); aom_write_bit(w, !sign_bit); } } } else { // Transmit raw values. aom_write_bit(w, 0); for (int i = 0; i < n; ++i) { assert(colors_v[i] < (1 << bit_depth)); aom_write_literal(w, colors_v[i], bit_depth); } } } static inline void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, aom_writer *w) { const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = mbmi->bsize; assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); if (mbmi->mode == DC_PRED) { const int n = pmi->palette_size[0]; const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); aom_write_symbol( w, n > 0, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w); } } const int uv_dc_pred = num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref; if (uv_dc_pred) { const int n = pmi->palette_size[1]; const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); aom_write_symbol(w, n > 0, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); if (n > 0) { aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w); } } } void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) { MB_MODE_INFO *mbmi = xd->mi[0]; const FeatureFlags *const features = &cm->features; const int is_inter = is_inter_block(mbmi); if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 && ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) || (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && !mbmi->skip_txfm && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; const TxSetType tx_set_type = av1_get_ext_tx_set_type( tx_size, is_inter, features->reduced_tx_set_used); const int eset = get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used); // eset == 0 should correspond to a set with only DCT_DCT and there // is no need to send the tx_type assert(eset > 0); assert(av1_ext_tx_used[tx_set_type][tx_type]); if (is_inter) { aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], 
av1_num_ext_tx_set[tx_set_type]); } else { PREDICTION_MODE intra_dir; if (mbmi->filter_intra_mode_info.use_filter_intra) intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; else intra_dir = mbmi->mode; aom_write_symbol( w, av1_ext_tx_ind[tx_set_type][tx_type], ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], av1_num_ext_tx_set[tx_set_type]); } } } static inline void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize, PREDICTION_MODE mode, aom_writer *w) { aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], INTRA_MODES); } static inline void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, UV_PREDICTION_MODE uv_mode, PREDICTION_MODE y_mode, CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) { aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], UV_INTRA_MODES - !cfl_allowed); } static inline void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, uint8_t idx, int8_t joint_sign, aom_writer *w) { aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); // Magnitudes are only signaled for nonzero codes. if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE); } if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); } } static inline void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w, int skip) { if (cm->features.coded_lossless || cm->features.allow_intrabc) return; // At the start of a superblock, mark that we haven't yet written CDEF // strengths for any of the CDEF units contained in this superblock. const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; } // CDEF unit size is 64x64 irrespective of the superblock size. const int cdef_size = 1 << (6 - MI_SIZE_LOG2); // Find index of this CDEF unit in this superblock. const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; // Write CDEF strength to the first non-skip coding block in this CDEF unit. if (!xd->cdef_transmitted[index] && !skip) { // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO // of the 1st block in this CDEF unit. 
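  // Masking mi_row/mi_col with ~(cdef_size - 1) rounds down to the top-left
  // mi position of the 64x64 CDEF unit, i.e. the block whose MB_MODE_INFO
  // holds the strength. Illustrative numbers only, assuming 4x4 mi units
  // (MI_SIZE_LOG2 == 2, so cdef_size == 16): mi_row == 21 and mi_col == 7
  // round down to (16, 0).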
const int first_block_mask = ~(cdef_size - 1); const CommonModeInfoParams *const mi_params = &cm->mi_params; const int grid_idx = get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, xd->mi_col & first_block_mask); const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits); xd->cdef_transmitted[index] = true; } } static inline void write_inter_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, const struct segmentation *const seg, struct segmentation_probs *const segp, int skip, int preskip) { MB_MODE_INFO *const mbmi = xd->mi[0]; AV1_COMMON *const cm = &cpi->common; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; if (seg->update_map) { if (preskip) { if (!seg->segid_preskip) return; } else { if (seg->segid_preskip) return; if (skip) { write_segment_id(cpi, xd, mbmi, w, seg, segp, 1); if (seg->temporal_update) mbmi->seg_id_predicted = 0; return; } } if (seg->temporal_update) { const int pred_flag = mbmi->seg_id_predicted; aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); aom_write_symbol(w, pred_flag, pred_cdf, 2); if (!pred_flag) { write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } if (pred_flag) { set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, mi_row, mi_col, mbmi->segment_id); } } else { write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } } } // If delta q is present, writes delta_q index. // Also writes delta_q loop filter levels, if present. static inline void write_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, int skip, aom_writer *w) { const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; const int super_block_upper_left = ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); if ((bsize != cm->seq_params->sb_size || skip == 0) && super_block_upper_left) { assert(mbmi->current_qindex > 0); const int reduced_delta_qindex = (mbmi->current_qindex - xd->current_base_qindex) / delta_q_info->delta_q_res; write_delta_qindex(xd, reduced_delta_qindex, w); xd->current_base_qindex = mbmi->current_qindex; if (delta_q_info->delta_lf_present_flag) { if (delta_q_info->delta_lf_multi) { const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { int reduced_delta_lflevel = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / delta_q_info->delta_lf_res; write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w); xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / delta_q_info->delta_lf_res; write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w); xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } } } } static inline void write_intra_prediction_modes(const AV1_COMMON *cm, MACROBLOCKD *const xd, int is_keyframe, aom_writer *w) { FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PREDICTION_MODE mode = mbmi->mode; const BLOCK_SIZE bsize = mbmi->bsize; // Y mode. if (is_keyframe) { const MB_MODE_INFO *const above_mi = xd->above_mbmi; const MB_MODE_INFO *const left_mi = xd->left_mbmi; write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); } else { write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); } // Y angle delta. 
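  // write_angle_delta() offsets the signed delta by MAX_ANGLE_DELTA so the
  // symbol index is non-negative, with an alphabet of 2 * MAX_ANGLE_DELTA + 1
  // entries. Taking MAX_ANGLE_DELTA == 3 purely for illustration, a delta of
  // -2 is coded as symbol 1 out of 7.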
const int use_angle_delta = av1_use_angle_delta(bsize); if (use_angle_delta && av1_is_directional_mode(mode)) { write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], ec_ctx->angle_delta_cdf[mode - V_PRED]); } // UV mode and UV angle delta. if (!cm->seq_params->monochrome && xd->is_chroma_ref) { const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); if (uv_mode == UV_CFL_PRED) write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); if (use_angle_delta && av1_is_directional_mode(intra_mode)) { write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], ec_ctx->angle_delta_cdf[intra_mode - V_PRED]); } } // Palette. if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { write_palette_mode_info(cm, xd, mbmi, w); } // Filter intra. write_filter_intra_mode_info(cm, xd, mbmi, w); } static inline int16_t mode_context_analyzer( const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { if (rf[1] <= INTRA_FRAME) return mode_context; const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK; const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( newmv_ctx, COMP_NEWMV_CTXS - 1)]; return comp_ctx; } static inline int_mv get_ref_mv_from_stack( int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; if (ref_frame[1] > INTRA_FRAME) { assert(ref_idx == 0 || ref_idx == 1); return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv : curr_ref_mv_stack[ref_mv_idx].this_mv; } assert(ref_idx == 0); return ref_mv_idx < mbmi_ext_frame->ref_mv_count ? curr_ref_mv_stack[ref_mv_idx].this_mv : mbmi_ext_frame->global_mvs[ref_frame_type]; } static inline int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; int ref_mv_idx = mbmi->ref_mv_idx; if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { assert(has_second_ref(mbmi)); ref_mv_idx += 1; } return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, x->mbmi_ext_frame); } static inline void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; const MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame; const PREDICTION_MODE mode = mbmi->mode; const uint8_t segment_id = mbmi->segment_id; const BLOCK_SIZE bsize = mbmi->bsize; const int allow_hp = cm->features.allow_high_precision_mv; const int is_inter = is_inter_block(mbmi); const int is_compound = has_second_ref(mbmi); int ref; write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1); write_skip_mode(cm, xd, segment_id, mbmi, w); assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm)); const int skip = mbmi->skip_mode ? 
1 : write_skip(cm, xd, segment_id, mbmi, w); write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0); write_cdef(cm, xd, w, skip); write_delta_q_params(cm, xd, skip, w); if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); if (mbmi->skip_mode) return; if (!is_inter) { write_intra_prediction_modes(cm, xd, 0, w); } else { int16_t mode_ctx; av1_collect_neighbors_ref_counts(xd); write_ref_frames(cm, xd, w); mode_ctx = mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); // If segment skip is not enabled code the mode. if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (is_inter_compound_mode(mode)) write_inter_compound_mode(xd, w, mode, mode_ctx); else if (is_inter_singleref_mode(mode)) write_inter_mode(w, mode, ec_ctx, mode_ctx); if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w); else assert(mbmi->ref_mv_idx == 0); } if (mode == NEWMV || mode == NEW_NEWMV) { for (ref = 0; ref < 1 + is_compound; ++ref) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, ref); av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 1); av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 0); av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && cpi->common.seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); if (interintra) { aom_write_symbol(w, mbmi->interintra_mode, ec_ctx->interintra_mode_cdf[bsize_group], INTERINTRA_MODES); if (av1_is_wedge_used(bsize)) { aom_write_symbol(w, mbmi->use_wedge_interintra, ec_ctx->wedge_interintra_cdf[bsize], 2); if (mbmi->use_wedge_interintra) { aom_write_symbol(w, mbmi->interintra_wedge_index, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); } } } } if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); // First write idx to indicate current compound inter prediction mode group // Group A (0): dist_wtd_comp, compound_average // Group B (1): interintra, compound_diffwtd, wedge if (has_second_ref(mbmi)) { const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); aom_write_symbol(w, mbmi->comp_group_idx, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); } else { assert(mbmi->comp_group_idx == 0); } if (mbmi->comp_group_idx == 0) { if (mbmi->compound_idx) assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); aom_write_symbol(w, mbmi->compound_idx, ec_ctx->compound_index_cdf[comp_index_ctx], 2); } else { assert(mbmi->compound_idx == 1); } } else { assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE && is_inter_compound_mode(mbmi->mode) && mbmi->motion_mode == SIMPLE_TRANSLATION); assert(masked_compound_used); // compound_diffwtd, wedge 
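        // With comp_group_idx == 1 only the two masked types remain, so the
        // symbol below (written only when wedge is usable at this block size)
        // is the type offset by COMPOUND_WEDGE; a wedge then adds an index
        // and a sign bit, while diffwtd adds its mask type.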
assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || mbmi->interinter_comp.type == COMPOUND_DIFFWTD); if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, ec_ctx->compound_type_cdf[bsize], MASKED_COMPOUND_TYPES); if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); aom_write_symbol(w, mbmi->interinter_comp.wedge_index, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); aom_write_bit(w, mbmi->interinter_comp.wedge_sign); } else { assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); aom_write_literal(w, mbmi->interinter_comp.mask_type, MAX_DIFFWTD_MASK_BITS); } } } write_mb_interp_filter(cm, td, w); } } static inline void write_intrabc_info( MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { const MB_MODE_INFO *const mbmi = xd->mi[0]; int use_intrabc = is_intrabc_block(mbmi); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); if (use_intrabc) { assert(mbmi->mode == DC_PRED); assert(mbmi->uv_mode == UV_DC_PRED); assert(mbmi->motion_mode == SIMPLE_TRANSLATION); int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv; av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); } } static inline void write_mb_modes_kf( AV1_COMP *cpi, MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; struct segmentation_probs *const segp = &ec_ctx->seg; const MB_MODE_INFO *const mbmi = xd->mi[0]; if (seg->segid_preskip && seg->update_map) write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); if (!seg->segid_preskip && seg->update_map) write_segment_id(cpi, xd, mbmi, w, seg, segp, skip); write_cdef(cm, xd, w, skip); write_delta_q_params(cm, xd, skip, w); if (av1_allow_intrabc(cm)) { write_intrabc_info(xd, mbmi_ext_frame, w); if (is_intrabc_block(mbmi)) return; } write_intra_prediction_modes(cm, xd, 1, w); } #if CONFIG_RD_DEBUG static inline void dump_mode_info(MB_MODE_INFO *mi) { printf("\nmi->mi_row == %d\n", mi->mi_row); printf("&& mi->mi_col == %d\n", mi->mi_col); printf("&& mi->bsize == %d\n", mi->bsize); printf("&& mi->tx_size == %d\n", mi->tx_size); printf("&& mi->mode == %d\n", mi->mode); } static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); return 1; } return 0; } #endif #if ENC_MISMATCH_DEBUG static inline void enc_dump_logs( const AV1_COMMON *const cm, const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { const MB_MODE_INFO *const mbmi = *( cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col)); const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, mbmi_ext_info->stride); if (is_inter_block(mbmi)) { #define FRAME_TO_CHECK 11 if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { const BLOCK_SIZE bsize = mbmi->bsize; int_mv mv[2] = { 0 }; const int is_comp_ref = has_second_ref(mbmi); for (int ref = 0; ref < 1 + is_comp_ref; ++ref) mv[ref].as_mv = mbmi->mv[ref].as_mv; if (!is_comp_ref) { mv[1].as_int = 0; } 
const int16_t mode_ctx = is_comp_ref ? 0 : mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; int16_t zeromv_ctx = -1; int16_t refmv_ctx = -1; if (mbmi->mode != NEWMV) { zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; if (mbmi->mode != GLOBALMV) refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; } printf( "=== ENCODER ===: " "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, refmv_ctx, mbmi->tx_size); } } } #endif // ENC_MISMATCH_DEBUG static inline void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, aom_writer *w) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *m = xd->mi[0]; if (frame_is_intra_only(cm)) { write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w); } else { // has_subpel_mv_component needs the ref frame buffers set up to look // up if they are scaled. has_subpel_mv_component is in turn needed by // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); #if ENC_MISMATCH_DEBUG enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); #endif // ENC_MISMATCH_DEBUG pack_inter_mode_mvs(cpi, td, w); } } static inline void write_inter_txb_coeff( AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end, TOKEN_STATS *token_stats, const int row, const int col, int *block, const int plane) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize = mbmi->bsize; assert(bsize < BLOCK_SIZES_ALL); const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); assert(plane_bsize < BLOCK_SIZES_ALL); const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); const int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; const int bkw = tx_size_wide_unit[max_tx_size]; const int bkh = tx_size_high_unit[max_tx_size]; const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, ss_x, ss_y); const int num_4x4_w = mi_size_wide[plane_bsize]; const int num_4x4_h = mi_size_high[plane_bsize]; const int mu_blocks_wide = mi_size_wide[max_unit_bsize]; const int mu_blocks_high = mi_size_high[max_unit_bsize]; const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h); const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w); for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, cm->seq_params->bit_depth, *block, blk_row, blk_col, max_tx_size, token_stats); *block += step; } } } static inline void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const 
mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; assert(!mbmi->skip_txfm); const int is_inter = is_inter_block(mbmi); if (!is_inter) { av1_write_intra_coeffs_mb(cm, x, w, bsize); } else { int block[MAX_MB_PLANE] = { 0 }; assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, xd->plane[0].subsampling_y)); const int num_4x4_w = mi_size_wide[bsize]; const int num_4x4_h = mi_size_high[bsize]; TOKEN_STATS token_stats; init_token_stats(&token_stats); const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, xd->plane[0].subsampling_y)); int mu_blocks_wide = mi_size_wide[max_unit_bsize]; int mu_blocks_high = mi_size_high[max_unit_bsize]; mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); const int num_planes = av1_num_planes(cm); for (int row = 0; row < num_4x4_h; row += mu_blocks_high) { for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) { for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row, col, &block[plane], plane); } } } #if CONFIG_RD_DEBUG for (int plane = 0; plane < num_planes; ++plane) { if (mbmi->bsize >= BLOCK_8X8 && rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { dump_mode_info(mbmi); assert(0); } } #endif // CONFIG_RD_DEBUG } } static inline void write_modes_b(AV1_COMP *cpi, ThreadData *const td, const TileInfo *const tile, aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCKD *xd = &td->mb.e_mbd; FRAME_CONTEXT *tile_ctx = xd->tile_ctx; const int grid_idx = mi_row * mi_params->mi_stride + mi_col; xd->mi = mi_params->mi_grid_base + grid_idx; td->mb.mbmi_ext_frame = cpi->mbmi_ext_info.frame_base + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, cpi->mbmi_ext_info.stride); xd->tx_type_map = mi_params->tx_type_map + grid_idx; xd->tx_type_map_stride = mi_params->mi_stride; const MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; assert(bsize <= cm->seq_params->sb_size || (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); const int bh = mi_size_high[bsize]; const int bw = mi_size_wide[bsize]; set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, mi_params->mi_cols); xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); write_mbmi_b(cpi, td, w); for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { const uint8_t palette_size_plane = mbmi->palette_mode_info.palette_size[plane]; assert(!mbmi->skip_mode || !palette_size_plane); if (palette_size_plane > 0) { assert(mbmi->use_intrabc == 0); assert(av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize)); assert(!plane || xd->is_chroma_ref); int rows, cols; av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows, &cols); assert(*tok < tok_end); MapCdf map_pb_cdf = plane ? 
tile_ctx->palette_uv_color_index_cdf : tile_ctx->palette_y_color_index_cdf; pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf); } } const int is_inter_tx = is_inter_block(mbmi); const int skip_txfm = mbmi->skip_txfm; const uint8_t segment_id = mbmi->segment_id; if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) { if (is_inter_tx) { // This implies skip flag is 0. const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); const int txbh = tx_size_high_unit[max_tx_size]; const int txbw = tx_size_wide_unit[max_tx_size]; const int width = mi_size_wide[bsize]; const int height = mi_size_high[bsize]; for (int idy = 0; idy < height; idy += txbh) { for (int idx = 0; idx < width; idx += txbw) { write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); } } } else { write_selected_tx_size(xd, w); set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd); } } else { set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip_txfm && is_inter_tx, xd); } if (!mbmi->skip_txfm) { int start = aom_tell_size(w); write_tokens_b(cpi, &td->mb, w, tok, tok_end); const int end = aom_tell_size(w); td->coefficient_size += end - start; } } static inline void write_partition(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, aom_writer *w) { const int is_partition_point = bsize >= BLOCK_8X8; if (!is_partition_point) return; const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; if (!has_rows && !has_cols) { assert(p == PARTITION_SPLIT); return; } if (has_rows && has_cols) { aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], partition_cdf_length(bsize)); } else if (!has_rows && has_cols) { assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); } else { assert(has_rows && !has_cols); assert(p == PARTITION_SPLIT || p == PARTITION_VERT); assert(bsize > BLOCK_8X8); aom_cdf_prob cdf[2]; partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); } } static inline void write_modes_sb(AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, aom_writer *const w, const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCKD *const xd = &td->mb.e_mbd; assert(bsize < BLOCK_SIZES_ALL); const int hbs = mi_size_wide[bsize] / 2; const int quarter_step = mi_size_wide[bsize] / 4; int i; const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; #if !CONFIG_REALTIME_ONLY const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; ++plane) { int rcol0, rcol1, rrow0, rrow1; // Skip some unnecessary work if loop restoration is disabled if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, &rcol0, &rcol1, &rrow0, &rrow1)) { const int 
rstride = cm->rst_info[plane].horz_units; for (int rrow = rrow0; rrow < rrow1; ++rrow) { for (int rcol = rcol0; rcol < rcol1; ++rcol) { const int runit_idx = rcol + rrow * rstride; loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane, td->counts); } } } } #endif write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); switch (partition) { case PARTITION_NONE: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); break; case PARTITION_HORZ: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_row + hbs < mi_params->mi_rows) write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_VERT: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_col + hbs < mi_params->mi_cols) write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_SPLIT: write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize); write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, subsize); break; case PARTITION_HORZ_A: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_HORZ_B: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_VERT_A: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_VERT_B: write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_HORZ_4: for (i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= mi_params->mi_rows) break; write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col); } break; case PARTITION_VERT_4: for (i = 0; i < 4; ++i) { int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= mi_params->mi_cols) break; write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col); } break; default: assert(0); } // update partition context update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } // Populate token pointers appropriately based on token_info. 
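// Tokens are laid out per (tile row, tile column, superblock row): *tok is
// pointed at the start of that superblock row's list and *tok_end one past
// its last entry, or both are set to NULL when no token buffer has been
// allocated.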
static inline void get_token_pointers(const TokenInfo *token_info, const int tile_row, int tile_col, const int sb_row_in_tile, const TokenExtra **tok, const TokenExtra **tok_end) { if (!is_token_info_allocated(token_info)) { *tok = NULL; *tok_end = NULL; return; } *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start; *tok_end = *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count; } static inline void write_modes(AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, aom_writer *const w, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; const int mi_row_start = tile->mi_row_start; const int mi_row_end = tile->mi_row_end; const int mi_col_start = tile->mi_col_start; const int mi_col_end = tile->mi_col_end; const int num_planes = av1_num_planes(cm); av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd); if (cpi->common.delta_q_info.delta_q_present_flag) { xd->current_base_qindex = cpi->common.quant_params.base_qindex; if (cpi->common.delta_q_info.delta_lf_present_flag) { av1_reset_loop_filter_delta(xd, num_planes); } } for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->seq_params->mib_size) { const int sb_row_in_tile = (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2; const TokenInfo *token_info = &cpi->token_info; const TokenExtra *tok; const TokenExtra *tok_end; get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok, &tok_end); av1_zero_left_context(xd); for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->seq_params->mib_size) { td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col, cm->seq_params->sb_size); } assert(tok == tok_end); } } static inline void encode_restoration_mode(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.all_lossless); if (!cm->seq_params->enable_restoration) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int all_none = 1, chroma_none = 1; for (int p = 0; p < num_planes; ++p) { RestorationInfo *rsi = &cm->rst_info[p]; if (rsi->frame_restoration_type != RESTORE_NONE) { all_none = 0; chroma_none &= p == 0; } switch (rsi->frame_restoration_type) { case RESTORE_NONE: aom_wb_write_bit(wb, 0); aom_wb_write_bit(wb, 0); break; case RESTORE_WIENER: aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 0); break; case RESTORE_SGRPROJ: aom_wb_write_bit(wb, 1); aom_wb_write_bit(wb, 1); break; case RESTORE_SWITCHABLE: aom_wb_write_bit(wb, 0); aom_wb_write_bit(wb, 1); break; default: assert(0); } } if (!all_none) { assert(cm->seq_params->sb_size == BLOCK_64X64 || cm->seq_params->sb_size == BLOCK_128X128); const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 
128 : 64; RestorationInfo *rsi = &cm->rst_info[0]; assert(rsi->restoration_unit_size >= sb_size); assert(RESTORATION_UNITSIZE_MAX == 256); if (sb_size == 64) { aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); } if (rsi->restoration_unit_size > 64) { aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); } } if (num_planes > 1) { int s = AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != cm->rst_info[0].restoration_unit_size); assert(cm->rst_info[1].restoration_unit_size == cm->rst_info[0].restoration_unit_size || cm->rst_info[1].restoration_unit_size == (cm->rst_info[0].restoration_unit_size >> s)); assert(cm->rst_info[2].restoration_unit_size == cm->rst_info[1].restoration_unit_size); } else if (!s) { assert(cm->rst_info[1].restoration_unit_size == cm->rst_info[0].restoration_unit_size); assert(cm->rst_info[2].restoration_unit_size == cm->rst_info[1].restoration_unit_size); } } } #if !CONFIG_REALTIME_ONLY static inline void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info, WienerInfo *ref_wiener_info, aom_writer *wb) { if (wiener_win == WIENER_WIN) aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); else assert(wiener_info->vfilter[0] == 0 && wiener_info->vfilter[WIENER_WIN - 1] == 0); aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); if (wiener_win == WIENER_WIN) aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); else assert(wiener_info->hfilter[0] == 0 && wiener_info->hfilter[WIENER_WIN - 1] == 0); aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); aom_write_primitive_refsubexpfin( wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); } static inline void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info, aom_writer *wb) { aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; if (params->r[0] == 0) { assert(sgrproj_info->xqd[0] == 0); aom_write_primitive_refsubexpfin( wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); } else if (params->r[1] == 0) { aom_write_primitive_refsubexpfin( wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); } else { aom_write_primitive_refsubexpfin( wb, 
SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); aom_write_primitive_refsubexpfin( wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); } memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); } static inline void loop_restoration_write_sb_coeffs( const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, aom_writer *const w, int plane, FRAME_COUNTS *counts) { const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; const RestorationInfo *rsi = cm->rst_info + plane; RestorationType frame_rtype = rsi->frame_restoration_type; assert(frame_rtype != RESTORE_NONE); (void)counts; assert(!cm->features.all_lossless); const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; RestorationType unit_rtype = rui->restoration_type; if (frame_rtype == RESTORE_SWITCHABLE) { aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); #if CONFIG_ENTROPY_STATS ++counts->switchable_restore[unit_rtype]; #endif switch (unit_rtype) { case RESTORE_WIENER: #if DEBUG_LR_COSTING assert(!memcmp( ref_wiener_info, &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info, sizeof(*ref_wiener_info))); #endif write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); break; case RESTORE_SGRPROJ: #if DEBUG_LR_COSTING assert(!memcmp(&ref_sgrproj_info->xqd, &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx] .sgrproj_info.xqd, sizeof(ref_sgrproj_info->xqd))); #endif write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); break; default: assert(unit_rtype == RESTORE_NONE); break; } } else if (frame_rtype == RESTORE_WIENER) { aom_write_symbol(w, unit_rtype != RESTORE_NONE, xd->tile_ctx->wiener_restore_cdf, 2); #if CONFIG_ENTROPY_STATS ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; #endif if (unit_rtype != RESTORE_NONE) { #if DEBUG_LR_COSTING assert( !memcmp(ref_wiener_info, &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info, sizeof(*ref_wiener_info))); #endif write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); } } else if (frame_rtype == RESTORE_SGRPROJ) { aom_write_symbol(w, unit_rtype != RESTORE_NONE, xd->tile_ctx->sgrproj_restore_cdf, 2); #if CONFIG_ENTROPY_STATS ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; #endif if (unit_rtype != RESTORE_NONE) { #if DEBUG_LR_COSTING assert(!memcmp( &ref_sgrproj_info->xqd, &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd, sizeof(ref_sgrproj_info->xqd))); #endif write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); } } } #endif // !CONFIG_REALTIME_ONLY // Only write out the ref delta section if any of the elements // will signal a delta. 
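// The check must mirror the decoder: the current deltas are compared against
// the values inherited from the primary reference frame, or against the
// defaults when no primary reference frame is available.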
static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { struct loopfilter *lf = &cm->lf; if (!lf->mode_ref_delta_update) { return 0; } const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); int8_t last_ref_deltas[REF_FRAMES]; int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; if (buf == NULL) { av1_set_default_ref_deltas(last_ref_deltas); av1_set_default_mode_deltas(last_mode_deltas); } else { memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); } for (int i = 0; i < REF_FRAMES; i++) { if (lf->ref_deltas[i] != last_ref_deltas[i]) { return true; } } for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { if (lf->mode_deltas[i] != last_mode_deltas[i]) { return true; } } return false; } static inline void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); struct loopfilter *lf = &cm->lf; // Encode the loop filter level and type aom_wb_write_literal(wb, lf->filter_level[0], 6); aom_wb_write_literal(wb, lf->filter_level[1], 6); if (num_planes > 1) { if (lf->filter_level[0] || lf->filter_level[1]) { aom_wb_write_literal(wb, lf->filter_level_u, 6); aom_wb_write_literal(wb, lf->filter_level_v, 6); } } aom_wb_write_literal(wb, lf->sharpness_level, 3); aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); // Write out loop filter deltas applied at the MB level based on mode or // ref frame (if they are enabled), only if there is information to write. int meaningful = is_mode_ref_delta_meaningful(cm); aom_wb_write_bit(wb, meaningful); if (!meaningful) { return; } const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); int8_t last_ref_deltas[REF_FRAMES]; int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; if (buf == NULL) { av1_set_default_ref_deltas(last_ref_deltas); av1_set_default_mode_deltas(last_mode_deltas); } else { memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); } for (int i = 0; i < REF_FRAMES; i++) { const int delta = lf->ref_deltas[i]; const int changed = delta != last_ref_deltas[i]; aom_wb_write_bit(wb, changed); if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { const int delta = lf->mode_deltas[i]; const int changed = delta != last_mode_deltas[i]; aom_wb_write_bit(wb, changed); if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); } } static inline void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); if (!cm->seq_params->enable_cdef) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int i; aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2); aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2); for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) { aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i], CDEF_STRENGTH_BITS); if (num_planes > 1) aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i], CDEF_STRENGTH_BITS); } } static inline void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { if (delta_q != 0) { aom_wb_write_bit(wb, 1); aom_wb_write_inv_signed_literal(wb, delta_q, 6); } else { aom_wb_write_bit(wb, 0); } } static inline void encode_quantization( const CommonQuantParams *const quant_params, int num_planes, bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); 
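  // The luma DC delta-q and, for multi-plane configurations, the chroma DC/AC
  // delta-q values follow; each is coded as a presence flag plus a signed
  // 6-bit literal (see write_delta_q()). The v-plane deltas are only coded
  // when they differ from the u-plane deltas.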
write_delta_q(wb, quant_params->y_dc_delta_q); if (num_planes > 1) { int diff_uv_delta = (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) || (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q); if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); write_delta_q(wb, quant_params->u_dc_delta_q); write_delta_q(wb, quant_params->u_ac_delta_q); if (diff_uv_delta) { write_delta_q(wb, quant_params->v_dc_delta_q); write_delta_q(wb, quant_params->v_ac_delta_q); } } aom_wb_write_bit(wb, quant_params->using_qmatrix); if (quant_params->using_qmatrix) { aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS); aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS); if (!separate_uv_delta_q) assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v); else aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS); } } static inline void encode_segmentation(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { int i, j; struct segmentation *seg = &cm->seg; aom_wb_write_bit(wb, seg->enabled); if (!seg->enabled) return; // Write update flags if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { aom_wb_write_bit(wb, seg->update_map); if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update); aom_wb_write_bit(wb, seg->update_data); } // Segmentation data if (seg->update_data) { for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { const int active = segfeature_active(seg, i, j); aom_wb_write_bit(wb, active); if (active) { const int data_max = av1_seg_feature_data_max(j); const int data_min = -data_max; const int ubits = get_unsigned_bits(data_max); const int data = clamp(get_segdata(seg, i, j), data_min, data_max); if (av1_is_segfeature_signed(j)) { aom_wb_write_inv_signed_literal(wb, data, ubits); } else { aom_wb_write_literal(wb, data, ubits); } } } } } } static inline void write_frame_interp_filter(InterpFilter filter, struct aom_write_bit_buffer *wb) { aom_wb_write_bit(wb, filter == SWITCHABLE); if (filter != SWITCHABLE) aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); } // Same function as write_uniform but writing to uncompresses header wb static inline void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return; if (v < m) { aom_wb_write_literal(wb, v, l - 1); } else { aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); aom_wb_write_literal(wb, (v - m) & 1, 1); } } static inline void write_tile_info_max_tile(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { int width_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); int height_sb = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); int size_sb, i; const CommonTileParams *const tiles = &cm->tiles; aom_wb_write_bit(wb, tiles->uniform_spacing); if (tiles->uniform_spacing) { int ones = tiles->log2_cols - tiles->min_log2_cols; while (ones--) { aom_wb_write_bit(wb, 1); } if (tiles->log2_cols < tiles->max_log2_cols) { aom_wb_write_bit(wb, 0); } // rows ones = tiles->log2_rows - tiles->min_log2_rows; while (ones--) { aom_wb_write_bit(wb, 1); } if (tiles->log2_rows < tiles->max_log2_rows) { aom_wb_write_bit(wb, 0); } } else { // Explicit tiles with configurable tile widths and heights // columns for (i = 0; i < tiles->cols; i++) { size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1); width_sb -= size_sb; } 
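    // Every superblock column must be accounted for by the explicit tile
    // widths written above.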
assert(width_sb == 0); // rows for (i = 0; i < tiles->rows; i++) { size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb), size_sb - 1); height_sb -= size_sb; } assert(height_sb == 0); } } static inline void write_tile_info(const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { write_tile_info_max_tile(cm, wb); *saved_wb = *wb; if (cm->tiles.rows * cm->tiles.cols > 1) { // tile id used for cdf update aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows); // Number of bytes in tile size - 1 aom_wb_write_literal(wb, 3, 2); } } static inline void write_ext_tile_info(const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { // This information is stored as a separate byte. int mod = wb->bit_offset % CHAR_BIT; if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); assert(aom_wb_is_byte_aligned(wb)); *saved_wb = *wb; if (cm->tiles.rows * cm->tiles.cols > 1) { // Note that the last item in the uncompressed header is the data // describing tile configuration. // Number of bytes in tile column size - 1 aom_wb_write_literal(wb, 0, 2); // Number of bytes in tile size - 1 aom_wb_write_literal(wb, 0, 2); } } static inline int find_identical_tile( const int tile_row, const int tile_col, TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { const MV32 candidate_offset[1] = { { 1, 0 } }; const uint8_t *const cur_tile_data = tile_buffers[tile_row][tile_col].data + 4; const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size; int i; if (tile_row == 0) return 0; // (TODO: yunqingwang) For now, only above tile is checked and used. // More candidates such as left tile can be added later. for (i = 0; i < 1; i++) { int row_offset = candidate_offset[0].row; int col_offset = candidate_offset[0].col; int row = tile_row - row_offset; int col = tile_col - col_offset; const uint8_t *tile_data; TileBufferEnc *candidate; if (row < 0 || col < 0) continue; const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data); // Read out tile-copy-mode bit: if ((tile_hdr >> 31) == 1) { // The candidate is a copy tile itself: the offset is stored in bits // 30 through 24 inclusive. 
      row_offset += (tile_hdr >> 24) & 0x7f;
      row = tile_row - row_offset;
    }

    candidate = &tile_buffers[row][col];

    if (row_offset >= 128 || candidate->size != cur_tile_size) continue;

    tile_data = candidate->data + 4;

    if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;

    // Identical tile found
    assert(row_offset > 0);
    return row_offset;
  }

  // No identical tile found
  return 0;
}

static inline void write_render_size(const AV1_COMMON *cm,
                                     struct aom_write_bit_buffer *wb) {
  const int scaling_active = av1_resize_scaled(cm);
  aom_wb_write_bit(wb, scaling_active);
  if (scaling_active) {
    aom_wb_write_literal(wb, cm->render_width - 1, 16);
    aom_wb_write_literal(wb, cm->render_height - 1, 16);
  }
}

static inline void write_superres_scale(const AV1_COMMON *const cm,
                                        struct aom_write_bit_buffer *wb) {
  const SequenceHeader *const seq_params = cm->seq_params;
  if (!seq_params->enable_superres) {
    assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
    return;
  }

  // First bit is whether to scale or not
  if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
    aom_wb_write_bit(wb, 0);  // no scaling
  } else {
    aom_wb_write_bit(wb, 1);  // scaling, write scale factor
    assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
    assert(cm->superres_scale_denominator <
           SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
    aom_wb_write_literal(
        wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
        SUPERRES_SCALE_BITS);
  }
}

static inline void write_frame_size(const AV1_COMMON *cm,
                                    int frame_size_override,
                                    struct aom_write_bit_buffer *wb) {
  const int coded_width = cm->superres_upscaled_width - 1;
  const int coded_height = cm->superres_upscaled_height - 1;

  if (frame_size_override) {
    const SequenceHeader *seq_params = cm->seq_params;
    int num_bits_width = seq_params->num_bits_width;
    int num_bits_height = seq_params->num_bits_height;
    aom_wb_write_literal(wb, coded_width, num_bits_width);
    aom_wb_write_literal(wb, coded_height, num_bits_height);
  }

  write_superres_scale(cm, wb);
  write_render_size(cm, wb);
}

static inline void write_frame_size_with_refs(const AV1_COMMON *const cm,
                                              struct aom_write_bit_buffer *wb) {
  int found = 0;

  MV_REFERENCE_FRAME ref_frame;
  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
    const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);

    if (cfg != NULL) {
      found = cm->superres_upscaled_width == cfg->y_crop_width &&
              cm->superres_upscaled_height == cfg->y_crop_height;
      found &= cm->render_width == cfg->render_width &&
               cm->render_height == cfg->render_height;
    }
    aom_wb_write_bit(wb, found);
    if (found) {
      write_superres_scale(cm, wb);
      break;
    }
  }

  if (!found) {
    int frame_size_override = 1;  // Always equal to 1 in this function
    write_frame_size(cm, frame_size_override, wb);
  }
}

static inline void write_profile(BITSTREAM_PROFILE profile,
                                 struct aom_write_bit_buffer *wb) {
  assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
  aom_wb_write_literal(wb, profile, PROFILE_BITS);
}

static inline void write_bitdepth(const SequenceHeader *const seq_params,
                                  struct aom_write_bit_buffer *wb) {
  // Profile 0/1: [0] for 8 bit, [1] 10-bit
  // Profile 2: [0] for 8 bit, [10] 10-bit, [11] - 12-bit
  aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
  if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
    aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ?
0 : 1); } } static inline void write_color_config(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { write_bitdepth(seq_params, wb); const int is_monochrome = seq_params->monochrome; // monochrome bit if (seq_params->profile != PROFILE_1) aom_wb_write_bit(wb, is_monochrome); else assert(!is_monochrome); if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { aom_wb_write_bit(wb, 0); // No color description present } else { aom_wb_write_bit(wb, 1); // Color description present aom_wb_write_literal(wb, seq_params->color_primaries, 8); aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); } if (is_monochrome) { // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] aom_wb_write_bit(wb, seq_params->color_range); return; } if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); assert(seq_params->profile == PROFILE_1 || (seq_params->profile == PROFILE_2 && seq_params->bit_depth == AOM_BITS_12)); } else { // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] aom_wb_write_bit(wb, seq_params->color_range); if (seq_params->profile == PROFILE_0) { // 420 only assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); } else if (seq_params->profile == PROFILE_1) { // 444 only assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); } else if (seq_params->profile == PROFILE_2) { if (seq_params->bit_depth == AOM_BITS_12) { // 420, 444 or 422 aom_wb_write_bit(wb, seq_params->subsampling_x); if (seq_params->subsampling_x == 0) { assert(seq_params->subsampling_y == 0 && "4:4:0 subsampling not allowed in AV1"); } else { aom_wb_write_bit(wb, seq_params->subsampling_y); } } else { // 422 only assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0); } } if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); } if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); } } aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); } static inline void write_timing_info_header( const aom_timing_info_t *const timing_info, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32); aom_wb_write_bit(wb, timing_info->equal_picture_interval); if (timing_info->equal_picture_interval) { aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1); } } static inline void write_decoder_model_info( const aom_dec_model_info_t *const decoder_model_info, struct aom_write_bit_buffer *wb) { aom_wb_write_literal( wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5); aom_wb_write_unsigned_literal( wb, decoder_model_info->num_units_in_decoding_tick, 32); aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1, 5); aom_wb_write_literal( wb, decoder_model_info->frame_presentation_time_length - 1, 5); } static inline void write_dec_model_op_parameters( const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, struct aom_write_bit_buffer *wb) { 
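  // Both buffer delays are coded with buffer_delay_length bits, the length
  // signalled in the decoder model info.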
  aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay,
                                buffer_delay_length);
  aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay,
                                buffer_delay_length);
  aom_wb_write_bit(wb, op_params->low_delay_mode_flag);
}

static inline void write_tu_pts_info(AV1_COMMON *const cm,
                                     struct aom_write_bit_buffer *wb) {
  aom_wb_write_unsigned_literal(
      wb, cm->frame_presentation_time,
      cm->seq_params->decoder_model_info.frame_presentation_time_length);
}

static inline void write_film_grain_params(const AV1_COMP *const cpi,
                                           struct aom_write_bit_buffer *wb) {
  const AV1_COMMON *const cm = &cpi->common;
  const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
  aom_wb_write_bit(wb, pars->apply_grain);
  if (!pars->apply_grain) return;

  aom_wb_write_literal(wb, pars->random_seed, 16);

  if (cm->current_frame.frame_type == INTER_FRAME)
    aom_wb_write_bit(wb, pars->update_parameters);

  if (!pars->update_parameters) {
    int ref_frame, ref_idx;
    for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
      ref_idx = get_ref_frame_map_idx(cm, ref_frame);
      assert(ref_idx != INVALID_IDX);
      const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
      if (buf->film_grain_params_present &&
          aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
        break;
      }
    }
    assert(ref_frame < REF_FRAMES);
    aom_wb_write_literal(wb, ref_idx, 3);
    return;
  }

  // Scaling functions parameters
  aom_wb_write_literal(wb, pars->num_y_points, 4);  // max 14
  for (int i = 0; i < pars->num_y_points; i++) {
    aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
    aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
  }

  if (!cm->seq_params->monochrome) {
    aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
  } else {
    assert(!pars->chroma_scaling_from_luma);
  }

  if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
      ((cm->seq_params->subsampling_x == 1) &&
       (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
    assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
  } else {
    aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
    for (int i = 0; i < pars->num_cb_points; i++) {
      aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
      aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
    }

    aom_wb_write_literal(wb, pars->num_cr_points, 4);  // max 10
    for (int i = 0; i < pars->num_cr_points; i++) {
      aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
      aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
    }
  }

  aom_wb_write_literal(wb, pars->scaling_shift - 8, 2);  // 8 + value

  // AR coefficients
  // Only sent if the corresponding scaling function has
  // more than 0 points
  aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);

  int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
  int num_pos_chroma = num_pos_luma;
  if (pars->num_y_points > 0) ++num_pos_chroma;

  if (pars->num_y_points)
    for (int i = 0; i < num_pos_luma; i++)
      aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);

  if (pars->num_cb_points || pars->chroma_scaling_from_luma)
    for (int i = 0; i < num_pos_chroma; i++)
      aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);

  if (pars->num_cr_points || pars->chroma_scaling_from_luma)
    for (int i = 0; i < num_pos_chroma; i++)
      aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);

  aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2);  // 6 + value

  aom_wb_write_literal(wb, pars->grain_scale_shift, 2);

  if (pars->num_cb_points) {
    aom_wb_write_literal(wb, pars->cb_mult, 8);
    aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
    aom_wb_write_literal(wb, pars->cb_offset, 9);
} if (pars->num_cr_points) { aom_wb_write_literal(wb, pars->cr_mult, 8); aom_wb_write_literal(wb, pars->cr_luma_mult, 8); aom_wb_write_literal(wb, pars->cr_offset, 9); } aom_wb_write_bit(wb, pars->overlap_flag); aom_wb_write_bit(wb, pars->clip_to_restricted_range); } static inline void write_sb_size(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { (void)seq_params; (void)wb; assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); assert(seq_params->sb_size == BLOCK_128X128 || seq_params->sb_size == BLOCK_64X64); aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } static inline void write_sequence_header(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); aom_wb_write_literal(wb, seq_params->max_frame_width - 1, seq_params->num_bits_width); aom_wb_write_literal(wb, seq_params->max_frame_height - 1, seq_params->num_bits_height); if (!seq_params->reduced_still_picture_hdr) { aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); if (seq_params->frame_id_numbers_present_flag) { // We must always have delta_frame_id_length < frame_id_length, // in order for a frame to be referenced with a unique delta. // Avoid wasting bits by using a coding that enforces this restriction. aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); aom_wb_write_literal( wb, seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, 3); } } write_sb_size(seq_params, wb); aom_wb_write_bit(wb, seq_params->enable_filter_intra); aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); if (!seq_params->reduced_still_picture_hdr) { aom_wb_write_bit(wb, seq_params->enable_interintra_compound); aom_wb_write_bit(wb, seq_params->enable_masked_compound); aom_wb_write_bit(wb, seq_params->enable_warped_motion); aom_wb_write_bit(wb, seq_params->enable_dual_filter); aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); if (seq_params->order_hint_info.enable_order_hint) { aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp); aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); } if (seq_params->force_screen_content_tools == 2) { aom_wb_write_bit(wb, 1); } else { aom_wb_write_bit(wb, 0); aom_wb_write_bit(wb, seq_params->force_screen_content_tools); } if (seq_params->force_screen_content_tools > 0) { if (seq_params->force_integer_mv == 2) { aom_wb_write_bit(wb, 1); } else { aom_wb_write_bit(wb, 0); aom_wb_write_bit(wb, seq_params->force_integer_mv); } } else { assert(seq_params->force_integer_mv == 2); } if (seq_params->order_hint_info.enable_order_hint) aom_wb_write_literal( wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3); } aom_wb_write_bit(wb, seq_params->enable_superres); aom_wb_write_bit(wb, seq_params->enable_cdef); aom_wb_write_bit(wb, seq_params->enable_restoration); } static inline void write_global_motion_params( const WarpedMotionParams *params, const WarpedMotionParams *ref_params, struct aom_write_bit_buffer *wb, int allow_hp) { const TransformationType type = params->wmtype; // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION // type models. Check here that we don't accidentally pick one somehow. 
// See comments in gm_get_motion_vector() for details on the bug we're // working around here assert(type != TRANSLATION); aom_wb_write_bit(wb, type != IDENTITY); if (type != IDENTITY) { aom_wb_write_bit(wb, type == ROTZOOM); if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); } if (type >= ROTZOOM) { aom_wb_write_signed_primitive_refsubexpfin( wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); aom_wb_write_signed_primitive_refsubexpfin( wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); } if (type >= AFFINE) { aom_wb_write_signed_primitive_refsubexpfin( wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); aom_wb_write_signed_primitive_refsubexpfin( wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); } if (type >= TRANSLATION) { const int trans_bits = (type == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp : GM_ABS_TRANS_BITS; const int trans_prec_diff = (type == TRANSLATION) ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp : GM_TRANS_PREC_DIFF; aom_wb_write_signed_primitive_refsubexpfin( wb, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_params->wmmat[0] >> trans_prec_diff), (params->wmmat[0] >> trans_prec_diff)); aom_wb_write_signed_primitive_refsubexpfin( wb, (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_params->wmmat[1] >> trans_prec_diff), (params->wmmat[1] >> trans_prec_diff)); } } static inline void write_global_motion(AV1_COMP *cpi, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; int frame; for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const WarpedMotionParams *ref_params = cm->prev_frame ? &cm->prev_frame->global_motion[frame] : &default_warp_params; write_global_motion_params(&cm->global_motion[frame], ref_params, wb, cm->features.allow_high_precision_mv); // TODO(sarahparker, debargha): The logic in the commented out code below // does not work currently and causes mismatches when resize is on. // Fix it before turning the optimization back on. /* YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); if (cpi->source->y_crop_width == ref_buf->y_crop_width && cpi->source->y_crop_height == ref_buf->y_crop_height) { write_global_motion_params(&cm->global_motion[frame], &cm->prev_frame->global_motion[frame], wb, cm->features.allow_high_precision_mv); } else { assert(cm->global_motion[frame].wmtype == IDENTITY && "Invalid warp type for frames of different resolutions"); } */ /* printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n", cm->current_frame.frame_number, cm->show_frame, frame, cm->global_motion[frame].wmmat[0], cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], cm->global_motion[frame].wmmat[3]); */ } } static int check_frame_refs_short_signaling(AV1_COMMON *const cm, bool enable_ref_short_signaling) { // In rtc case when res < 360p and speed >= 9, we turn on // frame_refs_short_signaling if it won't break the decoder. 
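  // "Won't break the decoder" means LAST/GOLDEN still resolve correctly: the
  // current frame and the GOLDEN frame must lie in the same order_hint
  // wrap-around group, with GOLDEN no more than 64 frames behind.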
if (enable_ref_short_signaling) { const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); const int base = 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1); const int order_hint_group_cur = cm->current_frame.display_order_hint / base; const int order_hint_group_gld = cm->ref_frame_map[gld_map_idx]->display_order_hint / base; const int relative_dist = cm->current_frame.order_hint - cm->ref_frame_map[gld_map_idx]->order_hint; // If current frame and GOLDEN frame are in the same order_hint group, and // they are not far apart (i.e., > 64 frames), then return 1. if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 && relative_dist <= 64) { return 1; } return 0; } // Check whether all references are distinct frames. const RefCntBuffer *seen_bufs[INTER_REFS_PER_FRAME] = { NULL }; int num_refs = 0; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (buf != NULL) { int seen = 0; for (int i = 0; i < num_refs; i++) { if (seen_bufs[i] == buf) { seen = 1; break; } } if (!seen) seen_bufs[num_refs++] = buf; } } // We only turn on frame_refs_short_signaling when all references are // distinct. if (num_refs < INTER_REFS_PER_FRAME) { // It indicates that there exist more than one reference frame pointing to // the same reference buffer, i.e. two or more references are duplicate. return 0; } // Check whether the encoder side ref frame choices are aligned with that to // be derived at the decoder side. int remapped_ref_idx_decoder[REF_FRAMES]; const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); // Set up the frame refs mapping indexes according to the // frame_refs_short_signaling policy. av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); // We only turn on frame_refs_short_signaling when the encoder side decision // on ref frames is identical to that at the decoder side. int frame_refs_short_signaling = 1; for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { // Compare the buffer index between two reference frames indexed // respectively by the encoder and the decoder side decisions. RefCntBuffer *ref_frame_buf_new = NULL; if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; } if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { frame_refs_short_signaling = 0; break; } } #if 0 // For debug printf("\nFrame=%d: \n", cm->current_frame.frame_number); printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { printf("enc_ref(map_idx=%d)=%d, vs. 
" "dec_ref(map_idx=%d)=%d\n", get_ref_frame_map_idx(cm, ref_frame), ref_frame, cm->remapped_ref_idx[ref_frame - LAST_FRAME], ref_frame); } #endif // 0 return frame_refs_short_signaling; } // New function based on HLS R18 static inline void write_uncompressed_header_obu( AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; const CommonQuantParams *quant_params = &cm->quant_params; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; if (!cpi->sf.rt_sf.enable_ref_short_signaling || !seq_params->order_hint_info.enable_order_hint || seq_params->order_hint_info.enable_ref_frame_mvs) { current_frame->frame_refs_short_signaling = 0; } else { current_frame->frame_refs_short_signaling = 1; } if (seq_params->still_picture) { assert(cm->show_existing_frame == 0); assert(cm->show_frame == 1); assert(current_frame->frame_type == KEY_FRAME); } if (!seq_params->reduced_still_picture_hdr) { if (encode_show_existing_frame(cm)) { aom_wb_write_bit(wb, 1); // show_existing_frame aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); if (seq_params->decoder_model_info_present_flag && seq_params->timing_info.equal_picture_interval == 0) { write_tu_pts_info(cm, wb); } if (seq_params->frame_id_numbers_present_flag) { int frame_id_len = seq_params->frame_id_length; int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; aom_wb_write_literal(wb, display_frame_id, frame_id_len); } return; } else { aom_wb_write_bit(wb, 0); // show_existing_frame } aom_wb_write_literal(wb, current_frame->frame_type, 2); aom_wb_write_bit(wb, cm->show_frame); if (cm->show_frame) { if (seq_params->decoder_model_info_present_flag && seq_params->timing_info.equal_picture_interval == 0) write_tu_pts_info(cm, wb); } else { aom_wb_write_bit(wb, cm->showable_frame); } if (frame_is_sframe(cm)) { assert(features->error_resilient_mode); } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) { aom_wb_write_bit(wb, features->error_resilient_mode); } } aom_wb_write_bit(wb, features->disable_cdf_update); if (seq_params->force_screen_content_tools == 2) { aom_wb_write_bit(wb, features->allow_screen_content_tools); } else { assert(features->allow_screen_content_tools == seq_params->force_screen_content_tools); } if (features->allow_screen_content_tools) { if (seq_params->force_integer_mv == 2) { aom_wb_write_bit(wb, features->cur_frame_force_integer_mv); } else { assert(features->cur_frame_force_integer_mv == seq_params->force_integer_mv); } } else { assert(features->cur_frame_force_integer_mv == 0); } int frame_size_override_flag = 0; if (seq_params->reduced_still_picture_hdr) { assert(cm->superres_upscaled_width == seq_params->max_frame_width && cm->superres_upscaled_height == seq_params->max_frame_height); } else { if (seq_params->frame_id_numbers_present_flag) { int frame_id_len = seq_params->frame_id_length; aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); } if (cm->superres_upscaled_width > seq_params->max_frame_width || cm->superres_upscaled_height > seq_params->max_frame_height) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Frame dimensions are larger than the maximum values"); } frame_size_override_flag = frame_is_sframe(cm) ? 
1 : (cm->superres_upscaled_width != seq_params->max_frame_width || cm->superres_upscaled_height != seq_params->max_frame_height); if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); if (seq_params->order_hint_info.enable_order_hint) aom_wb_write_literal( wb, current_frame->order_hint, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS); } } if (seq_params->decoder_model_info_present_flag) { aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present); if (cpi->ppi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { if (seq_params->operating_point_idc[op_num] == 0 || ((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1 && (seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & 0x1)) { aom_wb_write_unsigned_literal( wb, cm->buffer_removal_times[op_num], seq_params->decoder_model_info.buffer_removal_time_length); cm->buffer_removal_times[op_num]++; if (cm->buffer_removal_times[op_num] == 0) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "buffer_removal_time overflowed"); } } } } } } // Shown keyframes and switch-frames automatically refreshes all reference // frames. For all other frame types, we need to write refresh_frame_flags. if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || current_frame->frame_type == INTER_FRAME || current_frame->frame_type == INTRA_ONLY_FRAME) aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { // Write all ref frame order hints if error_resilient_mode == 1 if (features->error_resilient_mode && seq_params->order_hint_info.enable_order_hint) { for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { aom_wb_write_literal( wb, cm->ref_frame_map[ref_idx]->order_hint, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); } } } if (current_frame->frame_type == KEY_FRAME) { write_frame_size(cm, frame_size_override_flag, wb); assert(!av1_superres_scaled(cm) || !features->allow_intrabc); if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) aom_wb_write_bit(wb, features->allow_intrabc); } else { if (current_frame->frame_type == INTRA_ONLY_FRAME) { write_frame_size(cm, frame_size_override_flag, wb); assert(!av1_superres_scaled(cm) || !features->allow_intrabc); if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) aom_wb_write_bit(wb, features->allow_intrabc); } else if (current_frame->frame_type == INTER_FRAME || frame_is_sframe(cm)) { MV_REFERENCE_FRAME ref_frame; // NOTE: Error resilient mode turns off frame_refs_short_signaling // automatically. #define FRAME_REFS_SHORT_SIGNALING 0 #if FRAME_REFS_SHORT_SIGNALING current_frame->frame_refs_short_signaling = seq_params->order_hint_info.enable_order_hint; #endif // FRAME_REFS_SHORT_SIGNALING if (current_frame->frame_refs_short_signaling) { // In rtc case when cpi->sf.rt_sf.enable_ref_short_signaling is true, // we turn on frame_refs_short_signaling when the current frame and // golden frame are in the same order_hint group, and their relative // distance is <= 64 (in order to be decodable). 
// For other cases, an example solution for encoder-side // implementation on frame_refs_short_signaling is also provided in // this function, where frame_refs_short_signaling is only turned on // when the encoder side decision on ref frames is identical to that // at the decoder side. current_frame->frame_refs_short_signaling = check_frame_refs_short_signaling( cm, cpi->sf.rt_sf.enable_ref_short_signaling); } if (seq_params->order_hint_info.enable_order_hint) aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); if (current_frame->frame_refs_short_signaling) { const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); } int first_ref_map_idx = INVALID_IDX; if (cpi->ppi->rtc_ref.set_ref_frame_config) { for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { if (cpi->ppi->rtc_ref.reference[ref_frame - 1] == 1) { first_ref_map_idx = cpi->ppi->rtc_ref.ref_idx[ref_frame - 1]; break; } } } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); if (!current_frame->frame_refs_short_signaling) { if (cpi->ppi->rtc_ref.set_ref_frame_config && first_ref_map_idx != INVALID_IDX && cpi->svc.number_spatial_layers == 1 && !seq_params->order_hint_info.enable_order_hint) { // For the usage of set_ref_frame_config: // for any reference not used set their ref_map_idx // to the first used reference. const int map_idx = cpi->ppi->rtc_ref.reference[ref_frame - 1] ? get_ref_frame_map_idx(cm, ref_frame) : first_ref_map_idx; aom_wb_write_literal(wb, map_idx, REF_FRAMES_LOG2); } else { aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), REF_FRAMES_LOG2); } } if (seq_params->frame_id_numbers_present_flag) { int i = get_ref_frame_map_idx(cm, ref_frame); int frame_id_len = seq_params->frame_id_length; int diff_len = seq_params->delta_frame_id_length; int delta_frame_id_minus_1 = ((cm->current_frame_id - cm->ref_frame_id[i] + (1 << frame_id_len)) % (1 << frame_id_len)) - 1; if (delta_frame_id_minus_1 < 0 || delta_frame_id_minus_1 >= (1 << diff_len)) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid delta_frame_id_minus_1"); } aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); } } if (!features->error_resilient_mode && frame_size_override_flag) { write_frame_size_with_refs(cm, wb); } else { write_frame_size(cm, frame_size_override_flag, wb); } if (!features->cur_frame_force_integer_mv) aom_wb_write_bit(wb, features->allow_high_precision_mv); write_frame_interp_filter(features->interp_filter, wb); aom_wb_write_bit(wb, features->switchable_motion_mode); if (frame_might_allow_ref_frame_mvs(cm)) { aom_wb_write_bit(wb, features->allow_ref_frame_mvs); } else { assert(features->allow_ref_frame_mvs == 0); } } } const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && !(features->disable_cdf_update); if (cm->tiles.large_scale) assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); if (might_bwd_adapt) { aom_wb_write_bit( wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); } write_tile_info(cm, saved_wb, wb); encode_quantization(quant_params, av1_num_planes(cm), cm->seq_params->separate_uv_delta_q, wb); encode_segmentation(cm, wb); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); if 
(quant_params->base_qindex > 0) { aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag); if (delta_q_info->delta_q_present_flag) { aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2); xd->current_base_qindex = quant_params->base_qindex; if (features->allow_intrabc) assert(delta_q_info->delta_lf_present_flag == 0); else aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag); if (delta_q_info->delta_lf_present_flag) { aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2); aom_wb_write_bit(wb, delta_q_info->delta_lf_multi); av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } } if (features->all_lossless) { assert(!av1_superres_scaled(cm)); } else { if (!features->coded_lossless) { encode_loopfilter(cm, wb); encode_cdef(cm, wb); } encode_restoration_mode(cm, wb); } // Write TX mode if (features->coded_lossless) assert(features->tx_mode == ONLY_4X4); else aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT); if (!frame_is_intra_only(cm)) { const int use_hybrid_pred = current_frame->reference_mode == REFERENCE_MODE_SELECT; aom_wb_write_bit(wb, use_hybrid_pred); } if (current_frame->skip_mode_info.skip_mode_allowed) aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag); if (frame_might_allow_warped_motion(cm)) aom_wb_write_bit(wb, features->allow_warped_motion); else assert(!features->allow_warped_motion); aom_wb_write_bit(wb, features->reduced_tx_set_used); if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); if (seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) write_film_grain_params(cpi, wb); if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb); } static int choose_size_bytes(uint32_t size, int spare_msbs) { // Choose the number of bytes required to represent size, without // using the 'spare_msbs' number of most significant bits. // Make sure we will fit in 4 bytes to start with.. 
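  // For example, size = 0x1234 needs 2 bytes; with spare_msbs = 1 a size of
  // 0x80000000 cannot be represented within 4 bytes and -1 is returned.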
if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; // Normalise to 32 bits size <<= spare_msbs; if (size >> 24 != 0) return 4; else if (size >> 16 != 0) return 3; else if (size >> 8 != 0) return 2; else return 1; } static inline void mem_put_varsize(uint8_t *const dst, const int sz, const int val) { switch (sz) { case 1: dst[0] = (uint8_t)(val & 0xff); break; case 2: mem_put_le16(dst, val); break; case 3: mem_put_le24(dst, val); break; case 4: mem_put_le32(dst, val); break; default: assert(0 && "Invalid size"); break; } } static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, const uint32_t data_size, const uint32_t max_tile_size, const uint32_t max_tile_col_size, int *const tile_size_bytes, int *const tile_col_size_bytes) { // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) int tsb; int tcsb; if (tiles->large_scale) { // The top bit in the tile size field indicates tile copy mode, so we // have 1 less bit to code the tile size tsb = choose_size_bytes(max_tile_size, 1); tcsb = choose_size_bytes(max_tile_col_size, 0); } else { tsb = choose_size_bytes(max_tile_size, 0); tcsb = 4; // This is ignored (void)max_tile_col_size; } assert(tsb > 0); assert(tcsb > 0); *tile_size_bytes = tsb; *tile_col_size_bytes = tcsb; if (tsb == 4 && tcsb == 4) return data_size; uint32_t wpos = 0; uint32_t rpos = 0; if (tiles->large_scale) { int tile_row; int tile_col; for (tile_col = 0; tile_col < tiles->cols; tile_col++) { // All but the last column has a column header if (tile_col < tiles->cols - 1) { uint32_t tile_col_size = mem_get_le32(dst + rpos); rpos += 4; // Adjust the tile column size by the number of bytes removed // from the tile size fields. tile_col_size -= (4 - tsb) * tiles->rows; mem_put_varsize(dst + wpos, tcsb, tile_col_size); wpos += tcsb; } for (tile_row = 0; tile_row < tiles->rows; tile_row++) { // All, including the last row has a header uint32_t tile_header = mem_get_le32(dst + rpos); rpos += 4; // If this is a copy tile, we need to shift the MSB to the // top bit of the new width, and there is no data to copy. if (tile_header >> 31 != 0) { if (tsb < 4) tile_header >>= 32 - 8 * tsb; mem_put_varsize(dst + wpos, tsb, tile_header); wpos += tsb; } else { mem_put_varsize(dst + wpos, tsb, tile_header); wpos += tsb; tile_header += AV1_MIN_TILE_SIZE_BYTES; memmove(dst + wpos, dst + rpos, tile_header); rpos += tile_header; wpos += tile_header; } } } assert(rpos > wpos); assert(rpos == data_size); return wpos; } const int n_tiles = tiles->cols * tiles->rows; int n; for (n = 0; n < n_tiles; n++) { int tile_size; if (n == n_tiles - 1) { tile_size = data_size - rpos; } else { tile_size = mem_get_le32(dst + rpos); rpos += 4; mem_put_varsize(dst + wpos, tsb, tile_size); tile_size += AV1_MIN_TILE_SIZE_BYTES; wpos += tsb; } memmove(dst + wpos, dst + rpos, tile_size); rpos += tile_size; wpos += tile_size; } assert(rpos > wpos); assert(rpos == data_size); return wpos; } uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, bool has_nonzero_operating_point_idc, bool is_layer_specific_obu, int obu_extension, uint8_t *const dst) { assert(IMPLIES(!has_nonzero_operating_point_idc, obu_extension == 0)); if (level_params->keep_level_stats && (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) ++(*frame_header_count); uint32_t size = 0; // The AV1 spec draft version (as of git commit 5e04f) // has the following requirements on the OBU extension header: // // 6.4.1. 
General sequence header OBU semantics: // If operating_point_idc[ op ] is not equal to 0 for any value of op from 0 // to operating_points_cnt_minus_1, it is a requirement of bitstream // conformance that obu_extension_flag is equal to 1 for all layer-specific // OBUs in the coded video sequence. // (...) // It is a requirement of bitstream conformance that if OperatingPointIdc // is equal to 0, then obu_extension_flag is equal to 0 for all OBUs that // follow this sequence header until the next sequence header. // // Set obu_extension_flag to satisfy these requirements. const int obu_extension_flag = has_nonzero_operating_point_idc && is_layer_specific_obu; const int obu_has_size_field = 1; dst[0] = ((int)obu_type << 3) | (obu_extension_flag << 2) | (obu_has_size_field << 1); size++; if (obu_extension_flag) { dst[1] = obu_extension & 0xFF; size++; } return size; } int av1_write_uleb_obu_size(size_t obu_payload_size, uint8_t *dest, size_t dest_size) { size_t coded_obu_size = 0; if (aom_uleb_encode(obu_payload_size, dest_size, dest, &coded_obu_size) != 0) { return AOM_CODEC_ERROR; } if (coded_obu_size != dest_size) { return AOM_CODEC_ERROR; } return AOM_CODEC_OK; } // Deprecated. Use av1_write_uleb_obu_size() instead. static int av1_write_uleb_obu_size_unsafe(size_t obu_payload_size, uint8_t *dest) { size_t coded_obu_size = 0; if (aom_uleb_encode(obu_payload_size, sizeof(uint32_t), dest, &coded_obu_size) != 0) { return AOM_CODEC_ERROR; } return AOM_CODEC_OK; } // Returns 0 on failure. static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, uint8_t *data, size_t data_size) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = obu_header_size + length_field_size; const size_t move_src_offset = obu_header_size; const size_t move_size = obu_payload_size; if (move_size > data_size || move_src_offset > data_size - move_size) { assert(0 && "obu_memmove: output buffer overflow"); return 0; } if (move_dst_offset > data_size - move_size) { // Buffer full. return 0; } memmove(data + move_dst_offset, data + move_src_offset, move_size); return length_field_size; } // Deprecated. Use obu_memmove() instead. static size_t obu_memmove_unsafe(size_t obu_header_size, size_t obu_payload_size, uint8_t *data) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = obu_header_size + length_field_size; const size_t move_src_offset = obu_header_size; const size_t move_size = obu_payload_size; memmove(data + move_dst_offset, data + move_src_offset, move_size); return length_field_size; } static inline void add_trailing_bits(struct aom_write_bit_buffer *wb) { if (aom_wb_is_byte_aligned(wb)) { aom_wb_write_literal(wb, 0x80, 8); } else { // assumes that the other bits are already 0s aom_wb_write_bit(wb, 1); } } static inline void write_bitstream_level(AV1_LEVEL seq_level_idx, struct aom_write_bit_buffer *wb) { assert(is_valid_seq_level_idx(seq_level_idx)); aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, uint8_t *const dst, size_t dst_size) { // TODO: bug 42302568 - Use dst_size. 
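  // Until then dst_size is not checked here; the caller must provide a buffer
  // large enough to hold the written sequence header.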
(void)dst_size; struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; write_profile(seq_params->profile, &wb); // Still picture or not aom_wb_write_bit(&wb, seq_params->still_picture); assert(IMPLIES(!seq_params->still_picture, !seq_params->reduced_still_picture_hdr)); // whether to use reduced still picture header aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr); if (seq_params->reduced_still_picture_hdr) { assert(seq_params->timing_info_present == 0); assert(seq_params->decoder_model_info_present_flag == 0); assert(seq_params->display_model_info_present_flag == 0); write_bitstream_level(seq_params->seq_level_idx[0], &wb); } else { aom_wb_write_bit( &wb, seq_params->timing_info_present); // timing info present flag if (seq_params->timing_info_present) { // timing_info write_timing_info_header(&seq_params->timing_info, &wb); aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag); if (seq_params->decoder_model_info_present_flag) { write_decoder_model_info(&seq_params->decoder_model_info, &wb); } } aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag); aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1, OP_POINTS_CNT_MINUS_1_BITS); int i; for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { aom_wb_write_literal(&wb, seq_params->operating_point_idc[i], OP_POINTS_IDC_BITS); write_bitstream_level(seq_params->seq_level_idx[i], &wb); if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) aom_wb_write_bit(&wb, seq_params->tier[i]); if (seq_params->decoder_model_info_present_flag) { aom_wb_write_bit( &wb, seq_params->op_params[i].decoder_model_param_present_flag); if (seq_params->op_params[i].decoder_model_param_present_flag) { write_dec_model_op_parameters( &seq_params->op_params[i], seq_params->decoder_model_info .encoder_decoder_buffer_delay_length, &wb); } } if (seq_params->display_model_info_present_flag) { aom_wb_write_bit( &wb, seq_params->op_params[i].display_model_param_present_flag); if (seq_params->op_params[i].display_model_param_present_flag) { assert(seq_params->op_params[i].initial_display_delay >= 1); assert(seq_params->op_params[i].initial_display_delay <= 10); aom_wb_write_literal( &wb, seq_params->op_params[i].initial_display_delay - 1, 4); } } } } write_sequence_header(seq_params, &wb); write_color_config(seq_params, &wb); aom_wb_write_bit(&wb, seq_params->film_grain_params_present); add_trailing_bits(&wb); size = aom_wb_bytes_written(&wb); return size; } static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, uint8_t *const dst, int append_trailing_bits) { struct aom_write_bit_buffer wb = { dst, 0 }; write_uncompressed_header_obu(cpi, xd, saved_wb, &wb); if (append_trailing_bits) add_trailing_bits(&wb); return aom_wb_bytes_written(&wb); } static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, int end_tile, int tiles_log2, int tile_start_and_end_present_flag) { struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; if (!tiles_log2) return size; aom_wb_write_bit(&wb, tile_start_and_end_present_flag); if (tile_start_and_end_present_flag) { aom_wb_write_literal(&wb, start_tile, tiles_log2); aom_wb_write_literal(&wb, end_tile, tiles_log2); } size = aom_wb_bytes_written(&wb); return size; } typedef struct { uint32_t tg_hdr_size; uint32_t frame_header_size; } LargeTileFrameOBU; // Initialize OBU header for large scale tile case. 
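// Writes the OBU_FRAME header followed by the frame header into *data,
// records both sizes in lst_obu, and returns the frame header size.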
static uint32_t init_large_scale_tile_obu_header( AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, LargeTileFrameOBU *lst_obu) { AV1LevelParams *const level_params = &cpi->ppi->level_params; CurrentFrame *const current_frame = &cpi->common.current_frame; // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. const OBU_TYPE obu_type = OBU_FRAME; lst_obu->tg_hdr_size = av1_write_obu_header( level_params, &cpi->frame_header_count, obu_type, cpi->common.seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/true, obu_extension_header, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0); *data += frame_header_size; lst_obu->frame_header_size = frame_header_size; // (yunqing) This test ensures the correctness of large scale tile coding. if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) { char fn[20] = "./fh"; fn[4] = current_frame->frame_number / 100 + '0'; fn[5] = (current_frame->frame_number % 100) / 10 + '0'; fn[6] = (current_frame->frame_number % 10) + '0'; fn[7] = '\0'; av1_print_uncompressed_frame_header(*data - frame_header_size, frame_header_size, fn); } return frame_header_size; } // Write total buffer size and related information into the OBU header for large // scale tile case. static void write_large_scale_tile_obu_size( const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data, struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu, int have_tiles, uint32_t *total_size, int max_tile_size, int max_tile_col_size) { int tile_size_bytes = 0; int tile_col_size_bytes = 0; if (have_tiles) { *total_size = remux_tiles( tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size, max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes); *total_size += lst_obu->frame_header_size; } // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write // current tile group size before tile data(include tile column header). // Tile group size doesn't include the bytes storing tg size. *total_size += lst_obu->tg_hdr_size; const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; const size_t length_field_size = obu_memmove_unsafe(lst_obu->tg_hdr_size, obu_payload_size, dst); if (av1_write_uleb_obu_size_unsafe( obu_payload_size, dst + lst_obu->tg_hdr_size) != AOM_CODEC_OK) assert(0); *total_size += (uint32_t)length_field_size; saved_wb->bit_buffer += length_field_size; // Now fill in the gaps in the uncompressed header. if (have_tiles) { assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); } } // Store information on each large scale tile in the OBU header. 
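// Each tile is entropy coded in column-major order; per-tile and per-column
// sizes are recorded so that remux_tiles() can compact the headers later.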
static void write_large_scale_tile_obu( AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu, int *const largest_tile_id, uint32_t *total_size, const int have_tiles, unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) { AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; unsigned int tile_size = 0; av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile_info; const int is_last_col = (tile_col == tile_cols - 1); const uint32_t col_offset = *total_size; av1_tile_set_col(&tile_info, cm, tile_col); // The last column does not have a column header if (!is_last_col) *total_size += 4; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; const int data_offset = have_tiles ? 4 : 0; const int tile_idx = tile_row * tile_cols + tile_col; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; av1_tile_set_row(&tile_info, cm, tile_row); aom_writer mode_bc; buf->data = dst + *total_size + lst_obu->tg_hdr_size; // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, // even for the last one, unless no tiling is used at all. *total_size += data_offset; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; mode_bc.allow_update_cdf = !tiles->large_scale; mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); if (aom_stop_encode(&mode_bc) < 0) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes"); } tile_size = mode_bc.pos; buf->size = tile_size; // Record the maximum tile size we see, so we can compact headers later. if (tile_size > *max_tile_size) { *max_tile_size = tile_size; *largest_tile_id = tile_cols * tile_row + tile_col; } if (have_tiles) { // tile header: size of this tile, or copy offset uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; const int tile_copy_mode = ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1 : 0; // If tile_copy_mode = 1, check if this tile is a copy tile. // Very low chances to have copy tiles on the key frames, so don't // search on key frames to reduce unnecessary search. if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) { const int identical_tile_offset = find_identical_tile(tile_row, tile_col, tile_buffers); // Indicate a copy-tile by setting the most significant bit. // The row-offset to copy from is stored in the highest byte. // remux_tiles will move these around later if (identical_tile_offset > 0) { tile_size = 0; tile_header = identical_tile_offset | 0x80; tile_header <<= 24; } } mem_put_le32(buf->data, (MEM_VALUE_T)tile_header); } *total_size += tile_size; } if (!is_last_col) { uint32_t col_size = *total_size - col_offset - 4; mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size); // Record the maximum tile column size we see. *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); } } av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Packs information in the obu header for large scale tiles. 
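// Editorial aside (illustrative sketch): in write_large_scale_tile_obu()
// above, every tile that carries a header stores a 4-byte little-endian
// value that is either the tile size minus AV1_MIN_TILE_SIZE_BYTES or, with
// the most significant bit set, a "copy tile" marker whose high byte holds
// the row offset of the identical tile to copy from (see the 0x80 / << 24
// logic above). A hypothetical reader of that field could interpret it as
// follows; none of this code exists in libaom.
static void example_parse_ext_tile_header(const uint8_t *p, int *is_copy,
                                          int *copy_row_offset,
                                          uint32_t *tile_size) {
  const uint32_t hdr = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  *is_copy = (int)((hdr >> 31) & 1);             // MSB flags a copy tile
  *copy_row_offset = (int)((hdr >> 24) & 0x7f);  // rows above to copy from
  *tile_size = *is_copy ? 0 : hdr + AV1_MIN_TILE_SIZE_BYTES;
}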
static inline uint32_t pack_large_scale_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, int *const largest_tile_id) { AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; uint32_t total_size = 0; unsigned int max_tile_size = 0; unsigned int max_tile_col_size = 0; const int have_tiles = tiles->cols * tiles->rows > 1; uint8_t *data = dst; LargeTileFrameOBU lst_obu; total_size += init_large_scale_tile_obu_header( cpi, &data, saved_wb, obu_extension_header, &lst_obu); write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size, have_tiles, &max_tile_size, &max_tile_col_size); write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu, have_tiles, &total_size, max_tile_size, max_tile_col_size); return total_size; } // Writes obu, tile group and uncompressed headers to bitstream. void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, PackBSParams *const pack_bs_params, const int tile_idx) { AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size; const int tg_size = (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg; // Write Tile group, frame and OBU header // A new tile group begins at this tile. Write the obu header and // tile group header const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; *curr_tg_hdr_size = av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, cm->seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/true, pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); pack_bs_params->obu_header_size = *curr_tg_hdr_size; if (cpi->num_tg == 1) *curr_tg_hdr_size += write_frame_header_obu( cpi, xd, pack_bs_params->saved_wb, pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0); *curr_tg_hdr_size += write_tile_group_header( pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx, AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1), (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); *pack_bs_params->total_size += *curr_tg_hdr_size; } // Pack tile data in the bitstream with tile_group, frame // and OBU header. void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td, PackBSParams *const pack_bs_params) { aom_writer mode_bc; AV1_COMMON *const cm = &cpi->common; int tile_row = pack_bs_params->tile_row; int tile_col = pack_bs_params->tile_col; uint32_t *const total_size = pack_bs_params->total_size; TileInfo tile_info; av1_tile_set_col(&tile_info, cm, tile_col); av1_tile_set_row(&tile_info, cm, tile_row); mode_bc.allow_update_cdf = 1; mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; unsigned int tile_size; const int num_planes = av1_num_planes(cm); av1_reset_loop_restoration(&td->mb.e_mbd, num_planes); pack_bs_params->buf.data = pack_bs_params->dst + *total_size; // The last tile of the tile group does not have a header. 
if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4; // Pack tile data aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); if (aom_stop_encode(&mode_bc) < 0) { aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR, "Error writing modes"); } tile_size = mode_bc.pos; assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); pack_bs_params->buf.size = tile_size; // Write tile size if (!pack_bs_params->is_last_tile_in_tg) { // size of this tile mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES); } } void av1_write_last_tile_info( AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, uint8_t *curr_tg_start, uint32_t *const total_size, uint8_t **tile_data_start, int *const largest_tile_id, int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) { // write current tile group size const size_t obu_payload_size = *curr_tg_data_size - obu_header_size; const size_t length_field_size = obu_memmove_unsafe(obu_header_size, obu_payload_size, curr_tg_start); if (av1_write_uleb_obu_size_unsafe( obu_payload_size, curr_tg_start + obu_header_size) != AOM_CODEC_OK) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "av1_write_last_tile_info: output buffer full"); } *curr_tg_data_size += length_field_size; *total_size += (uint32_t)length_field_size; *tile_data_start += length_field_size; if (cpi->num_tg == 1) { // if this tg is combined with the frame header then update saved // frame header base offset according to length field size saved_wb->bit_buffer += length_field_size; } if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) { // Make room for a duplicate Frame Header OBU. memmove(curr_tg_start + fh_info->total_length, curr_tg_start, *curr_tg_data_size); // Insert a copy of the Frame Header OBU. memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length); // Force context update tile to be the first tile in error // resilient mode as the duplicate frame headers will have // context_update_tile_id set to 0 *largest_tile_id = 0; // Rewrite the OBU header to change the OBU type to Redundant Frame // Header. av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, OBU_REDUNDANT_FRAME_HEADER, cpi->common.seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/true, obu_extn_header, &curr_tg_start[fh_info->obu_header_byte_offset]); *curr_tg_data_size += fh_info->total_length; *total_size += (uint32_t)fh_info->total_length; } *is_first_tg = 0; } void av1_reset_pack_bs_thread_data(ThreadData *const td) { td->coefficient_size = 0; td->max_mv_magnitude = 0; av1_zero(td->interp_filter_selected); } void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi, ThreadData const *td) { int do_max_mv_magnitude_update = 1; cpi->rc.coefficient_size += td->coefficient_size; // Disable max_mv_magnitude update for parallel frames based on update flag. if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0; if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update) cpi->mv_search_params.max_mv_magnitude = AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude); for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) cpi->common.cur_frame->interp_filter_selected[filter] += td->interp_filter_selected[filter]; } // Store information related to each default tile in the OBU header. 
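// Editorial aside (illustrative sketch): in the default (non-large-scale)
// tile layout packed by av1_pack_tile_info() above, every tile except the
// last one of its tile group is preceded by a little-endian size field
// (4 bytes while packing, possibly shrunk to tile_size_bytes later by
// remux_tiles() and write_tile_obu_size()) holding
// tile_size - AV1_MIN_TILE_SIZE_BYTES. The hypothetical helper below shows
// how such a prefix locates the start of the following tile; it assumes the
// current tile is not the last one in its tile group.
static const uint8_t *example_skip_prefixed_tile(const uint8_t *p,
                                                 int tile_size_bytes) {
  size_t coded_size = 0;
  for (int i = 0; i < tile_size_bytes; ++i)
    coded_size |= (size_t)p[i] << (8 * i);  // little-endian size field
  return p + tile_size_bytes + coded_size + AV1_MIN_TILE_SIZE_BYTES;
}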
static void write_tile_obu( AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const CommonTileParams *const tiles = &cm->tiles; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; // Fixed size tile groups for the moment const int num_tg_hdrs = cpi->num_tg; const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; size_t curr_tg_data_size = 0; uint8_t *tile_data_curr = dst; int new_tg = 1; int is_first_tg = 1; av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { const int tile_idx = tile_row * tile_cols + tile_col; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; int is_last_tile_in_tg = 0; if (new_tg) { tile_data_curr = dst + *total_size; tile_count = 0; } tile_count++; if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) is_last_tile_in_tg = 1; xd->tile_ctx = &this_tile->tctx; // PackBSParams stores all parameters required to pack tile and header // info. PackBSParams pack_bs_params; pack_bs_params.dst = dst; pack_bs_params.curr_tg_hdr_size = 0; pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg; pack_bs_params.new_tg = new_tg; pack_bs_params.obu_extn_header = obu_extn_header; pack_bs_params.obu_header_size = 0; pack_bs_params.saved_wb = saved_wb; pack_bs_params.tile_col = tile_col; pack_bs_params.tile_row = tile_row; pack_bs_params.tile_data_curr = tile_data_curr; pack_bs_params.total_size = total_size; if (new_tg) av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx); av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params); if (new_tg) { curr_tg_data_size = pack_bs_params.curr_tg_hdr_size; *tile_data_start += pack_bs_params.curr_tg_hdr_size; *obu_header_size = pack_bs_params.obu_header_size; new_tg = 0; } if (is_last_tile_in_tg) new_tg = 1; curr_tg_data_size += (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4)); if (pack_bs_params.buf.size > *max_tile_size) { *largest_tile_id = tile_idx; *max_tile_size = (unsigned int)pack_bs_params.buf.size; } if (is_last_tile_in_tg) av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size, tile_data_curr, total_size, tile_data_start, largest_tile_id, &is_first_tg, *obu_header_size, obu_extn_header); *total_size += (uint32_t)pack_bs_params.buf.size; } } av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Write total buffer size and related information into the OBU header for // default tile case. static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, int largest_tile_id, uint32_t *const total_size, unsigned int max_tile_size, uint32_t obu_header_size, uint8_t *tile_data_start) { const CommonTileParams *const tiles = &cpi->common.tiles; // Fill in context_update_tile_id indicating the tile to use for the // cdf update. The encoder currently sets it to the largest tile // (but is up to the encoder) aom_wb_overwrite_literal(saved_wb, largest_tile_id, (tiles->log2_cols + tiles->log2_rows)); // If more than one tile group. tile_size_bytes takes the default value 4 // and does not need to be set. For a single tile group it is set in the // section below. 
if (cpi->num_tg != 1) return; int tile_size_bytes = 4, unused; const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); const uint32_t tile_data_size = *total_size - tile_data_offset; *total_size = remux_tiles(tiles, tile_data_start, tile_data_size, max_tile_size, 0, &tile_size_bytes, &unused); *total_size += tile_data_offset; assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); // Update the OBU length if remux_tiles() reduced the size. uint64_t payload_size; size_t length_field_size; int res = aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size, &payload_size, &length_field_size); assert(res == 0); (void)res; const uint64_t new_payload_size = *total_size - obu_header_size - length_field_size; if (new_payload_size != payload_size) { size_t new_length_field_size; res = aom_uleb_encode(new_payload_size, length_field_size, dst + obu_header_size, &new_length_field_size); assert(res == 0); if (new_length_field_size < length_field_size) { const size_t src_offset = obu_header_size + length_field_size; const size_t dst_offset = obu_header_size + new_length_field_size; memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); *total_size -= (int)(length_field_size - new_length_field_size); } } } // As per the experiments, single-thread bitstream packing is better for // frames with a smaller bitstream size. This behavior is due to setup time // overhead of multithread function would be more than that of time required // to pack the smaller bitstream of such frames. This function computes the // number of required number of workers based on setup time overhead and job // dispatch time overhead for given tiles and available workers. static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, int avail_workers, bool pack_bs_mt_enabled) { if (!pack_bs_mt_enabled) return 1; uint64_t frame_abs_sum_level = 0; for (int idx = 0; idx < num_tiles; idx++) frame_abs_sum_level += tile_data[idx].abs_sum_level; int ideal_num_workers = 1; const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST; float max_sum = 0.0; for (int num_workers = avail_workers; num_workers > 1; num_workers--) { const float fas_per_worker_const = ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level; const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST; const float this_sum = fas_per_worker_const - setup_time_const - job_disp_time_const / num_workers; if (this_sum > max_sum) { max_sum = this_sum; ideal_num_workers = num_workers; } } return ideal_num_workers; } static inline uint32_t pack_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id) { const CommonTileParams *const tiles = &cpi->common.tiles; uint32_t total_size = 0; unsigned int max_tile_size = 0; uint32_t obu_header_size = 0; uint8_t *tile_data_start = dst; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; const int num_tiles = tile_rows * tile_cols; const int num_workers = calc_pack_bs_mt_workers( cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS], cpi->mt_info.pack_bs_mt_enabled); if (num_workers > 1) { av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header, fh_info, largest_tile_id, &max_tile_size, &obu_header_size, &tile_data_start, num_workers); } else { write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, 
fh_info, largest_tile_id, &max_tile_size, &obu_header_size, &tile_data_start); } if (num_tiles > 1) write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size, max_tile_size, obu_header_size, tile_data_start); return total_size; } static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, size_t dst_size, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id) { // TODO: bug 42302568 - Use dst_size. (void)dst_size; AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; *largest_tile_id = 0; // Select the coding strategy (temporal or spatial) if (cm->seg.enabled && cm->seg.update_map) { if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { cm->seg.temporal_update = 0; } else { cm->seg.temporal_update = 1; if (cpi->td.rd_counts.seg_tmp_pred_cost[0] < cpi->td.rd_counts.seg_tmp_pred_cost[1]) cm->seg.temporal_update = 0; } } if (tiles->large_scale) return pack_large_scale_tiles_in_tg_obus( cpi, dst, saved_wb, obu_extension_header, largest_tile_id); return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header, fh_info, largest_tile_id); } // Returns the number of bytes written on success. Returns 0 on failure. static size_t av1_write_metadata_obu(const aom_metadata_t *metadata, uint8_t *const dst, size_t dst_size) { size_t coded_metadata_size = 0; const uint64_t metadata_type = (uint64_t)metadata->type; if (aom_uleb_encode(metadata_type, dst_size, dst, &coded_metadata_size) != 0) { return 0; } if (coded_metadata_size + metadata->sz + 1 > dst_size) { return 0; } memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz); // Add trailing bits. dst[coded_metadata_size + metadata->sz] = 0x80; return coded_metadata_size + metadata->sz + 1; } static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst, size_t dst_size) { if (!cpi->source) return 0; AV1_COMMON *const cm = &cpi->common; aom_metadata_array_t *arr = cpi->source->metadata; if (!arr) return 0; size_t obu_header_size = 0; size_t obu_payload_size = 0; size_t total_bytes_written = 0; size_t length_field_size = 0; for (size_t i = 0; i < arr->sz; i++) { aom_metadata_t *current_metadata = arr->metadata_array[i]; if (current_metadata && current_metadata->payload) { if ((cm->current_frame.frame_type == KEY_FRAME && current_metadata->insert_flag == AOM_MIF_KEY_FRAME) || (cm->current_frame.frame_type != KEY_FRAME && current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { // OBU header is either one or two bytes. if (dst_size < 2) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "av1_write_metadata_array: output buffer full"); } // According to the AV1 spec draft version (as of git commit 5e04f) // Section 6.7.1, some metadata types can be layer specific, but we // currently only support non-layer specific metadata. 
obu_header_size = av1_write_obu_header( &cpi->ppi->level_params, &cpi->frame_header_count, OBU_METADATA, cm->seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/false, 0, dst); assert(obu_header_size <= 2); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size, dst_size - obu_header_size); if (obu_payload_size == 0) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "av1_write_metadata_array: output buffer full"); } length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst, dst_size); if (length_field_size == 0) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "av1_write_metadata_array: output buffer full"); } if (av1_write_uleb_obu_size(obu_payload_size, dst + obu_header_size, length_field_size) == AOM_CODEC_OK) { const size_t obu_size = obu_header_size + length_field_size + obu_payload_size; dst += obu_size; dst_size -= obu_size; total_bytes_written += obu_size; } else { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "av1_write_metadata_array: output buffer full"); } } } } return total_bytes_written; } int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t dst_size, size_t *size, int *const largest_tile_id) { uint8_t *data = dst; size_t data_size = dst_size; AV1_COMMON *const cm = &cpi->common; AV1LevelParams *const level_params = &cpi->ppi->level_params; uint32_t obu_header_size = 0; uint32_t obu_payload_size = 0; FrameHeaderInfo fh_info = { NULL, 0, 0 }; const uint8_t obu_extension_header = cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; // If no non-zero delta_q has been used, reset delta_q_present_flag if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { cm->delta_q_info.delta_q_present_flag = 0; } #if CONFIG_BITSTREAM_DEBUG bitstream_queue_reset_write(); #endif cpi->frame_header_count = 0; // The TD is now written outside the frame encode loop // write sequence header obu at each key frame or intra_only frame, // preceded by 4-byte size if (cm->current_frame.frame_type == INTRA_ONLY_FRAME || cm->current_frame.frame_type == KEY_FRAME) { // OBU header is either one or two bytes. if (data_size < 2) { return AOM_CODEC_ERROR; } obu_header_size = av1_write_obu_header( level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, cm->seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/false, 0, data); assert(obu_header_size <= 2); obu_payload_size = av1_write_sequence_header_obu( cm->seq_params, data + obu_header_size, data_size - obu_header_size); const size_t length_field_size = obu_memmove(obu_header_size, obu_payload_size, data, data_size); if (length_field_size == 0) { return AOM_CODEC_ERROR; } if (av1_write_uleb_obu_size(obu_payload_size, data + obu_header_size, length_field_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } const size_t bytes_written = obu_header_size + length_field_size + obu_payload_size; data += bytes_written; data_size -= bytes_written; } // write metadata obus before the frame obu that has the show_frame flag set if (cm->show_frame) { const size_t bytes_written = av1_write_metadata_array(cpi, data, data_size); data += bytes_written; data_size -= bytes_written; } const int write_frame_header = (cpi->num_tg > 1 || encode_show_existing_frame(cm)); struct aom_write_bit_buffer saved_wb = { NULL, 0 }; size_t length_field = 0; if (write_frame_header) { // Write Frame Header OBU. fh_info.frame_header = data; // OBU header is either one or two bytes. 
if (data_size < 2) { return AOM_CODEC_ERROR; } obu_header_size = av1_write_obu_header( level_params, &cpi->frame_header_count, OBU_FRAME_HEADER, cm->seq_params->has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/true, obu_extension_header, data); // TODO: bug 42302568 - Pass data_size - obu_header_size to // write_frame_header_obu(). obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, data + obu_header_size, 1); length_field = obu_memmove(obu_header_size, obu_payload_size, data, data_size); if (length_field == 0) { return AOM_CODEC_ERROR; } if (av1_write_uleb_obu_size(obu_payload_size, data + obu_header_size, length_field) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } fh_info.obu_header_byte_offset = 0; fh_info.total_length = obu_header_size + length_field + obu_payload_size; // Make sure it is safe to cast fh_info.total_length to uint32_t. if (fh_info.total_length > UINT32_MAX) { return AOM_CODEC_ERROR; } data += fh_info.total_length; data_size -= fh_info.total_length; } if (!encode_show_existing_frame(cm)) { // Since length_field is determined adaptively after frame header // encoding, saved_wb must be adjusted accordingly. if (saved_wb.bit_buffer != NULL) { saved_wb.bit_buffer += length_field; } // Each tile group obu will be preceded by 4-byte size of the tile group // obu const size_t bytes_written = write_tiles_in_tg_obus(cpi, data, data_size, &saved_wb, obu_extension_header, &fh_info, largest_tile_id); data += bytes_written; data_size -= bytes_written; } *size = data - dst; (void)data_size; return AOM_CODEC_OK; } aom-3.12.1/av1/encoder/bitstream.h000066400000000000000000000130611477627663500166530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_BITSTREAM_H_ #define AOM_AV1_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #include "av1/encoder/level.h" #include "aom_dsp/bitwriter.h" #include "aom_util/aom_pthread.h" struct aom_write_bit_buffer; struct AV1_COMP; struct ThreadData; /*!\cond */ // Stores the location and size of a tile's data in the bitstream. 
Used for // later identifying identical tiles typedef struct { uint8_t *data; size_t size; } TileBufferEnc; typedef struct { uint8_t *frame_header; size_t obu_header_byte_offset; size_t total_length; } FrameHeaderInfo; typedef struct { struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure TileBufferEnc buf; // Structure to hold bitstream buffer and size uint32_t *total_size; // Size of the bitstream buffer for the tile in bytes uint8_t *dst; // Base address of tile bitstream buffer uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer size_t tile_buf_size; // Available bitstream buffer for the tile in bytes uint8_t obu_extn_header; // Presence of OBU extension header uint32_t obu_header_size; // Size of the OBU header int curr_tg_hdr_size; // Size of the obu, tg, frame headers int tile_size_mi; // Tile size in mi units int tile_row; // Number of tile rows int tile_col; // Number of tile columns int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group int new_tg; // Flag to indicate starting of a new tile-group } PackBSParams; typedef struct { uint64_t abs_sum_level; uint16_t tile_idx; } PackBSTileOrder; // Pack bitstream data for pack bitstream multi-threading. typedef struct { #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. pthread_mutex_t *mutex_; #endif // Tile order structure of pack bitstream multithreading. PackBSTileOrder pack_bs_tile_order[MAX_TILES]; // Index of next job to be processed. int next_job_idx; // Initialized to false, set to true by the worker thread that encounters an // error in order to abort the processing of other worker threads. bool pack_bs_mt_exit; } AV1EncPackBSSync; /*!\endcond */ // Writes only the OBU Sequence Header payload, and returns the size of the // payload written to 'dst'. This function does not write the OBU header, the // optional extension, or the OBU size to 'dst'. uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, uint8_t *const dst, size_t dst_size); // Writes the OBU header byte, and the OBU header extension byte when both // has_nonzero_operating_point_idc and is_layer_specific_obu are true. // Returns number of bytes written to 'dst'. uint32_t av1_write_obu_header(AV1LevelParams *const level_params, int *frame_header_count, OBU_TYPE obu_type, bool has_nonzero_operating_point_idc, bool is_layer_specific_obu, int obu_extension, uint8_t *const dst); // Encodes obu_payload_size as a leb128 integer and writes it to the dest // buffer. The output must fill the buffer exactly. Returns AOM_CODEC_OK on // success, AOM_CODEC_ERROR on failure. int av1_write_uleb_obu_size(size_t obu_payload_size, uint8_t *dest, size_t dest_size); // Pack tile data in the bitstream with tile_group, frame // and OBU header. 
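// Editorial aside (illustrative sketch, not part of the libaom API): the
// obu_extn_header byte stored in PackBSParams uses the OBU extension header
// layout from the AV1 spec: temporal_id in bits 7..5, spatial_id in bits
// 4..3, and the three low reserved bits zero. This matches the
// obu_extension_header value composed in av1_pack_bitstream(). A hypothetical
// helper composing that byte:
static inline uint8_t example_make_obu_extension_byte(int temporal_id,
                                                      int spatial_id) {
  return (uint8_t)(((temporal_id & 7) << 5) | ((spatial_id & 3) << 3));
}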
void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td, PackBSParams *const pack_bs_params); void av1_write_last_tile_info( struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, uint8_t *curr_tg_start, uint32_t *const total_size, uint8_t **tile_data_start, int *const largest_tile_id, int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header); /*!\brief Pack the bitstream for one frame * * \ingroup high_level_algo * \callgraph */ int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t dst_size, size_t *size, int *const largest_tile_id); void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); void av1_reset_pack_bs_thread_data(struct ThreadData *const td); void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi, struct ThreadData const *td); void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi, MACROBLOCKD *const xd, PackBSParams *const pack_bs_params, const int tile_idx); int av1_neg_interleave(int x, int ref, int max); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_BITSTREAM_H_ aom-3.12.1/av1/encoder/block.h000066400000000000000000001504451477627663500157630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*! \file * Declares various structs used to encode the current partition block. */ #ifndef AOM_AV1_ENCODER_BLOCK_H_ #define AOM_AV1_ENCODER_BLOCK_H_ #include "av1/common/blockd.h" #include "av1/common/entropymv.h" #include "av1/common/entropy.h" #include "av1/common/enums.h" #include "av1/common/mvref_common.h" #include "av1/encoder/enc_enums.h" #include "av1/encoder/mcomp_structs.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/partition_cnn_weights.h" #endif #include "av1/encoder/hash_motion.h" #ifdef __cplusplus extern "C" { #endif //! Minimum linear dimension of a tpl block #define MIN_TPL_BSIZE_1D 16 //! Maximum number of tpl block in a super block #define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D) //! Number of txfm hash records kept for the partition block. #define RD_RECORD_BUFFER_LEN 8 /*! Maximum value taken by transform type probabilities */ #define MAX_TX_TYPE_PROB 1024 //! Compute color sensitivity index for given plane #define COLOR_SENS_IDX(plane) ((plane)-1) //! 
Enable timer statistics of mode search in non-rd #define COLLECT_NONRD_PICK_MODE_STAT 0 /*!\cond */ #if COLLECT_NONRD_PICK_MODE_STAT #include "aom_ports/aom_timer.h" typedef struct _mode_search_stat_nonrd { int32_t num_blocks[BLOCK_SIZES]; int64_t total_block_times[BLOCK_SIZES]; int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT]; int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT]; int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT]; int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT]; int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT]; int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT]; int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT]; int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT]; struct aom_usec_timer timer1; struct aom_usec_timer timer2; struct aom_usec_timer bsize_timer; } mode_search_stat_nonrd; #endif // COLLECT_NONRD_PICK_MODE_STAT /*!\endcond */ /*! \brief Superblock level encoder info * * SuperblockEnc stores superblock level information used by the encoder for * more efficient encoding. Currently this is mostly used to store TPL data * for the current superblock. */ typedef struct { //! Maximum partition size for the sb. BLOCK_SIZE min_partition_size; //! Minimum partition size for the sb. BLOCK_SIZE max_partition_size; /***************************************************************************** * \name TPL Info * * Information gathered from tpl_model at tpl block precision for the * superblock to speed up the encoding process.. ****************************************************************************/ /**@{*/ //! Number of TPL blocks in this superblock. int tpl_data_count; //! TPL's estimate of inter cost for each tpl block. int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB]; //! TPL's estimate of tpl cost for each tpl block. int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB]; //! Motion vectors found by TPL model for each tpl block. int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME]; //! TPL's stride for the arrays in this struct. int tpl_stride; /**@}*/ } SuperBlockEnc; /*! \brief Stores the best performing modes. */ typedef struct { //! The mbmi used to reconstruct the winner mode. MB_MODE_INFO mbmi; //! Rdstats of the winner mode. RD_STATS rd_cost; //! Rdcost of the winner mode int64_t rd; //! Luma rate of the winner mode. int rate_y; //! Chroma rate of the winner mode. int rate_uv; //! The color map needed to reconstruct palette mode. uint8_t color_index_map[MAX_SB_SQUARE]; //! The current winner mode. THR_MODES mode_index; } WinnerModeStats; /*! \brief Each source plane of the current macroblock * * This struct also stores the txfm buffers and quantizer settings. */ typedef struct macroblock_plane { //! Stores source - pred so the txfm can be computed later int16_t *src_diff; //! Dequantized coefficients tran_low_t *dqcoeff; //! Quantized coefficients tran_low_t *qcoeff; //! Transformed coefficients tran_low_t *coeff; //! Location of the end of qcoeff (end of block). uint16_t *eobs; //! Contexts used to code the transform coefficients. uint8_t *txb_entropy_ctx; //! A buffer containing the source frame. struct buf_2d src; /*! \name Quantizer Settings * * \attention These are used/accessed only in the quantization process. * RDO does not and *must not* depend on any of these values. * All values below share the coefficient scale/shift used in TX. */ /**@{*/ //! Quantization step size used by AV1_XFORM_QUANT_FP. const int16_t *quant_fp_QTX; //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP. 
const int16_t *round_fp_QTX; //! Quantization step size used by AV1_XFORM_QUANT_B. const int16_t *quant_QTX; //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B. const int16_t *round_QTX; //! Scale factor to shift coefficients toward zero. Only used by QUANT_B. const int16_t *quant_shift_QTX; //! Size of the quantization bin around 0. Only Used by QUANT_B const int16_t *zbin_QTX; //! Dequantizer const int16_t *dequant_QTX; /**@}*/ } MACROBLOCK_PLANE; /*! \brief Costs for encoding the coefficients within a level. * * Covers everything including txb_skip, eob, dc_sign, */ typedef struct { //! Cost to skip txfm for the current txfm block. int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; /*! \brief Cost for encoding the base_eob of a level. * * Decoder uses base_eob to derive the base_level as base_eob := base_eob+1. */ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; /*! \brief Cost for encoding the base level of a coefficient. * * Decoder derives coeff_base as coeff_base := base_eob + 1. */ int base_cost[SIG_COEF_CONTEXTS][8]; /*! \brief Cost for encoding the last non-zero coefficient. * * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1 */ int eob_extra_cost[EOB_COEF_CONTEXTS][2]; //! Cost for encoding the dc_sign int dc_sign_cost[DC_SIGN_CONTEXTS][2]; //! Cost for encoding an increment to the coefficient int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1]; } LV_MAP_COEFF_COST; /*! \brief Costs for encoding the eob. */ typedef struct { //! eob_cost. int eob_cost[2][11]; } LV_MAP_EOB_COST; /*! \brief Stores the transforms coefficients for the whole superblock. */ typedef struct { //! The transformed coefficients. tran_low_t *tcoeff[MAX_MB_PLANE]; //! Where the transformed coefficients end. uint16_t *eobs[MAX_MB_PLANE]; /*! \brief Transform block entropy contexts. * * Each element is used as a bit field. * - Bits 0~3: txb_skip_ctx * - Bits 4~5: dc_sign_ctx. */ uint8_t *entropy_ctx[MAX_MB_PLANE]; } CB_COEFF_BUFFER; /*! \brief Extended mode info derived from mbmi. */ typedef struct { // TODO(angiebird): Reduce the buffer size according to sb_type //! The reference mv list for the current block. CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; //! The weights used to compute the ref mvs. uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; //! Number of ref mvs in the drl. uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; //! Global mvs int_mv global_mvs[REF_FRAMES]; //! Context used to encode the current mode. int16_t mode_context[MODE_CTX_REF_FRAMES]; } MB_MODE_INFO_EXT; /*! \brief Stores best extended mode information at frame level. * * The frame level in here is used in bitstream preparation stage. The * information in \ref MB_MODE_INFO_EXT are copied to this struct to save * memory. */ typedef struct { //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE]; //! \copydoc MB_MODE_INFO_EXT::weight uint16_t weight[USABLE_REF_MV_STACK_SIZE]; //! \copydoc MB_MODE_INFO_EXT::ref_mv_count uint8_t ref_mv_count; // TODO(Ravi/Remya): Reduce the buffer size of global_mvs //! \copydoc MB_MODE_INFO_EXT::global_mvs int_mv global_mvs[REF_FRAMES]; //! \copydoc MB_MODE_INFO_EXT::mode_context int16_t mode_context; //! Offset of current coding block's coeff buffer relative to the sb. uint16_t cb_offset[PLANE_TYPES]; } MB_MODE_INFO_EXT_FRAME; /*! \brief Inter-mode txfm results for a partition block. */ typedef struct { //! Txfm size used if the current mode is intra mode. 
TX_SIZE tx_size; //! Txfm sizes used if the current mode is inter mode. TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; //! Map showing which txfm block skips the txfm process. uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; //! Map showing the txfm types for each block. uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; //! Rd_stats for the whole partition block. RD_STATS rd_stats; //! Hash value of the current record. uint32_t hash_value; } MB_RD_INFO; /*! \brief Hash records of the inter-mode transform results * * Hash records of the inter-mode transform results for a whole partition block * based on the residue. Since this operates on the partition block level, this * can give us a whole txfm partition tree. */ typedef struct { /*! Circular buffer that stores the inter-mode txfm results of a partition * block. */ MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN]; //! Index to insert the newest rd record. int index_start; //! Number of info stored in this record. int num; //! Hash function CRC32C crc_calculator; } MB_RD_RECORD; //! Number of compound rd stats #define MAX_COMP_RD_STATS 64 /*! \brief Rdcost stats in compound mode. */ typedef struct { //! Rate of the compound modes. int32_t rate[COMPOUND_TYPES]; //! Distortion of the compound modes. int64_t dist[COMPOUND_TYPES]; //! Estimated rate of the compound modes. int32_t model_rate[COMPOUND_TYPES]; //! Estimated distortion of the compound modes. int64_t model_dist[COMPOUND_TYPES]; //! Rate need to send the mask type. int comp_rs2[COMPOUND_TYPES]; //! Motion vector for each predictor. int_mv mv[2]; //! Ref frame for each predictor. MV_REFERENCE_FRAME ref_frames[2]; //! Current prediction mode. PREDICTION_MODE mode; //! Current interpolation filter. int_interpfilters filter; //! Refmv index in the drl. int ref_mv_idx; //! Whether the predictors are GLOBALMV. int is_global[2]; //! Current parameters for interinter mode. INTERINTER_COMPOUND_DATA interinter_comp; } COMP_RD_STATS; /*! \brief Contains buffers used to speed up rdopt for obmc. * * See the comments for calc_target_weighted_pred for details. */ typedef struct { /*! \brief A new source weighted with the above and left predictors. * * Used to efficiently construct multiple obmc predictors during rdopt. */ int32_t *wsrc; /*! \brief A new mask constructed from the original horz/vert mask. * * \copydetails wsrc */ int32_t *mask; /*! \brief Prediction from the up predictor. * * Used to build the obmc predictor. */ uint8_t *above_pred; /*! \brief Prediction from the up predictor. * * \copydetails above_pred */ uint8_t *left_pred; } OBMCBuffer; /*! \brief Contains color maps used in palette mode. */ typedef struct { //! The best color map found. uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; //! A temporary buffer used for k-means clustering. int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; } PALETTE_BUFFER; /*! \brief Contains buffers used by av1_compound_type_rd() * * For sizes and alignment of these arrays, refer to * alloc_compound_type_rd_buffers() function. */ typedef struct { //! First prediction. uint8_t *pred0; //! Second prediction. uint8_t *pred1; //! Source - first prediction. int16_t *residual1; //! Second prediction - first prediction. int16_t *diff10; //! Backup of the best segmentation mask. uint8_t *tmp_best_mask_buf; } CompoundTypeRdBuffers; /*! \brief Holds some parameters related to partitioning schemes in AV1. 
*/ // TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE typedef struct { #if !CONFIG_REALTIME_ONLY // The following 4 parameters are used for cnn-based partitioning on intra // frame. /*! \brief Current index on the partition block quad tree. * * Used to index into the cnn buffer for partition decision. */ int quad_tree_idx; //! Whether the CNN buffer contains valid output. int cnn_output_valid; //! A buffer used by our segmentation CNN for intra-frame partitioning. float cnn_buffer[CNN_OUT_BUF_SIZE]; //! log of the quantization parameter of the ancestor BLOCK_64X64. float log_q; #endif /*! \brief Variance of the subblocks in the superblock. * * This is used by rt mode for variance based partitioning. * The indices corresponds to the following block sizes: * - 0 - 128x128 * - 1-2 - 128x64 * - 3-4 - 64x128 * - 5-8 - 64x64 * - 9-16 - 64x32 * - 17-24 - 32x64 * - 25-40 - 32x32 * - 41-104 - 16x16 */ uint8_t variance_low[105]; } PartitionSearchInfo; /*!\cond */ enum { /** * Do not prune transform depths. */ TX_PRUNE_NONE = 0, /** * Prune largest transform (depth 0) based on NN model. */ TX_PRUNE_LARGEST = 1, /** * Prune split transforms (depth>=1) based on NN model. */ TX_PRUNE_SPLIT = 2, } UENUM1BYTE(TX_PRUNE_TYPE); /*!\endcond */ /*! \brief Defines the parameters used to perform txfm search. * * For the most part, this determines how various speed features are used. */ typedef struct { /*! \brief Whether to limit the intra txfm search type to the default txfm. * * This could either be a result of either sequence parameter or speed * features. */ int use_default_intra_tx_type; /*! Probability threshold used for conditionally forcing tx type*/ int default_inter_tx_type_prob_thresh; //! Whether to prune 2d transforms based on 1d transform results. int prune_2d_txfm_mode; /*! \brief Variable from \ref WinnerModeParams based on current eval mode. * * See the documentation for \ref WinnerModeParams for more detail. */ unsigned int coeff_opt_thresholds[2]; /*! \copydoc coeff_opt_thresholds */ unsigned int tx_domain_dist_threshold; /*! \copydoc coeff_opt_thresholds */ TX_SIZE_SEARCH_METHOD tx_size_search_method; /*! \copydoc coeff_opt_thresholds */ unsigned int use_transform_domain_distortion; /*! \copydoc coeff_opt_thresholds */ unsigned int skip_txfm_level; /*! \brief How to search for the optimal tx_size * * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for * the current partition block; if TX_MODE_SELECT, search through the whole * tree. * * \attention * Although this looks suspicious similar to a bitstream element, this * tx_mode_search_type is only used internally by the encoder, and is *not* * written to the bitstream. It determines what kind of tx_mode would be * searched. For example, we might set it to TX_MODE_LARGEST to find a good * candidate, then code it as TX_MODE_SELECT. */ TX_MODE tx_mode_search_type; /*! * Determines whether a block can be predicted as transform skip or DC only * based on residual mean and variance. * Type 0 : No skip block or DC only block prediction * Type 1 : Prediction of skip block based on residual mean and variance * Type 2 : Prediction of skip block or DC only block based on residual mean * and variance */ unsigned int predict_dc_level; /*! * Whether or not we should use the quantization matrix as weights for PSNR * during RD search. */ int use_qm_dist_metric; /*! * Keep track of previous mode evaluation stage type. This will be used to * reset mb rd hash record when mode evaluation type changes. 
*/ int mode_eval_type; #if !CONFIG_REALTIME_ONLY //! Indicates the transform depths for which RD evaluation is skipped. TX_PRUNE_TYPE nn_prune_depths_for_intra_tx; /*! \brief Indicates if NN model should be invoked to prune transform depths. * * Used to signal whether NN model should be evaluated to prune the R-D * evaluation of specific transform depths. */ bool enable_nn_prune_intra_tx_depths; #endif } TxfmSearchParams; /*!\cond */ #define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)) #define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)) #define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)) #define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)) /*!\endcond */ /*! \brief Stores various encoding/search decisions related to txfm search. * * This struct contains a cache of previous txfm results, and some buffers for * the current txfm decision. */ typedef struct { //! Whether to skip transform and quantization on a partition block level. uint8_t skip_txfm; /*! \brief Whether to skip transform and quantization on a txfm block level. * * Skips transform and quantization on a transform block level inside the * current partition block. Each element of this array is used as a bit-field. * So for example, the we are skipping on the luma plane, then the last bit * would be set to 1. */ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; /*! \brief Transform types inside the partition block * * Keeps a record of what kind of transform to use for each of the transform * block inside the partition block. * \attention The buffer here is *never* directly used. Instead, this just * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the * partition block. So if we need to save memory, we could move the allocation * to pick_sb_mode instead. */ uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE]; //! Txfm hash records of inter-modes. MB_RD_RECORD *mb_rd_record; /*! \brief Number of txb splits. * * Keep track of how many times we've used split tx partition for transform * blocks. Somewhat misleadingly, this parameter doesn't actually keep track * of the count of the current block. Instead, it's a cumulative count across * of the whole frame. The main usage is that if txb_split_count is zero, then * we can signal TX_MODE_LARGEST at frame level. */ // TODO(chiyotsai@google.com): Move this to a more appropriate location such // as ThreadData. unsigned int txb_split_count; #if CONFIG_SPEED_STATS //! For debugging. Used to check how many txfm searches we are doing. unsigned int tx_search_count; #endif // CONFIG_SPEED_STATS } TxfmSearchInfo; #undef MAX_NUM_8X8_TXBS #undef MAX_NUM_16X16_TXBS #undef MAX_NUM_32X32_TXBS #undef MAX_NUM_64X64_TXBS /*! \brief Holds the entropy costs for various modes sent to the bitstream. * * \attention This does not include the costs for mv and transformed * coefficients. */ typedef struct { /***************************************************************************** * \name Partition Costs ****************************************************************************/ /**@{*/ //! Cost for coding the partition. int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; /**@}*/ /***************************************************************************** * \name Intra Costs: General ****************************************************************************/ /**@{*/ //! Luma mode cost for inter frame. int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; //! Luma mode cost for intra frame. 
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; //! Chroma mode cost int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; //! filter_intra_cost int filter_intra_cost[BLOCK_SIZES_ALL][2]; //! filter_intra_mode_cost int filter_intra_mode_cost[FILTER_INTRA_MODES]; //! angle_delta_cost int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; //! Rate rate associated with each alpha codeword int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; /**@}*/ /***************************************************************************** * \name Intra Costs: Screen Contents ****************************************************************************/ /**@{*/ //! intrabc_cost int intrabc_cost[2]; //! palette_y_size_cost int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; //! palette_uv_size_cost int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; //! palette_y_color_cost int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; //! palette_uv_color_cost int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; //! palette_y_mode_cost int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; //! palette_uv_mode_cost int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; /**@}*/ /***************************************************************************** * \name Inter Costs: MV Modes ****************************************************************************/ /**@{*/ //! skip_mode_cost int skip_mode_cost[SKIP_MODE_CONTEXTS][2]; //! newmv_mode_cost int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; //! zeromv_mode_cost int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; //! refmv_mode_cost int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; //! drl_mode_cost0 int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; /**@}*/ /***************************************************************************** * \name Inter Costs: Ref Frame Types ****************************************************************************/ /**@{*/ //! single_ref_cost int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; //! comp_inter_cost int comp_inter_cost[COMP_INTER_CONTEXTS][2]; //! comp_ref_type_cost int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] [CDF_SIZE(COMP_REFERENCE_TYPES)]; //! uni_comp_ref_cost int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] [CDF_SIZE(2)]; /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode * * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME. */ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode * * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME. */ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; /**@}*/ /***************************************************************************** * \name Inter Costs: Compound Types ****************************************************************************/ /**@{*/ //! intra_inter_cost int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; //! inter_compound_mode_cost int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; //! compound_type_cost int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; //! wedge_idx_cost int wedge_idx_cost[BLOCK_SIZES_ALL][16]; //! interintra_cost int interintra_cost[BLOCK_SIZE_GROUPS][2]; //! wedge_interintra_cost int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; //! 
interintra_mode_cost int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; /**@}*/ /***************************************************************************** * \name Inter Costs: Compound Masks ****************************************************************************/ /**@{*/ //! comp_idx_cost int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; //! comp_group_idx_cost int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; /**@}*/ /***************************************************************************** * \name Inter Costs: Motion Modes/Filters ****************************************************************************/ /**@{*/ //! motion_mode_cost int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; //! motion_mode_cost1 int motion_mode_cost1[BLOCK_SIZES_ALL][2]; //! switchable_interp_costs int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; /**@}*/ /***************************************************************************** * \name Txfm Mode Costs ****************************************************************************/ /**@{*/ //! skip_txfm_cost int skip_txfm_cost[SKIP_CONTEXTS][2]; //! tx_size_cost int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; //! txfm_partition_cost int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; //! inter_tx_type_costs int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; //! intra_tx_type_costs int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [TX_TYPES]; /**@}*/ /***************************************************************************** * \name Restoration Mode Costs ****************************************************************************/ /**@{*/ //! switchable_restore_cost int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; //! wiener_restore_cost int wiener_restore_cost[2]; //! sgrproj_restore_cost int sgrproj_restore_cost[2]; /**@}*/ /***************************************************************************** * \name Segmentation Mode Costs ****************************************************************************/ /**@{*/ //! tmp_pred_cost int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2]; //! spatial_pred_cost int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS]; /**@}*/ } ModeCosts; /*! \brief Holds mv costs for encoding and motion search. */ typedef struct { /***************************************************************************** * \name Encoding Costs * Here are the entropy costs needed to encode a given mv. * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that holds * the memory for holding the mv cost. But since the motion vectors can be * negative, we shift them to the middle and store the resulting pointer in * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref * mv_cost_stack points to the \ref nmv_cost with the mv precision we are * currently working with. In essence, only \ref mv_cost_stack is needed for * motion search, the other can be considered private. ****************************************************************************/ /**@{*/ //! Costs for coding the zero components. int nmv_joint_cost[MV_JOINTS]; //! Allocates memory for 1/4-pel motion vector costs. int nmv_cost_alloc[2][MV_VALS]; //! Allocates memory for 1/8-pel motion vector costs. int nmv_cost_hp_alloc[2][MV_VALS]; //! Points to the middle of \ref nmv_cost_alloc int *nmv_cost[2]; //! Points to the middle of \ref nmv_cost_hp_alloc int *nmv_cost_hp[2]; //! Points to the nmv_cost_hp in use. 
int **mv_cost_stack; /**@}*/ } MvCosts; /*! \brief Holds mv costs for intrabc. */ typedef struct { /*! Costs for coding the joint mv. */ int joint_mv[MV_JOINTS]; /*! \brief Cost of transmitting the actual motion vector. * dv_costs_alloc[0][i] is the cost of motion vector with horizontal * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of * motion vector with vertical component (mv_col) equal to i - MV_MAX. */ int dv_costs_alloc[2][MV_VALS]; /*! Points to the middle of \ref dv_costs_alloc. */ int *dv_costs[2]; } IntraBCMVCosts; /*! \brief Holds the costs needed to encode the coefficients */ typedef struct { //! Costs for coding the coefficients. LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; //! Costs for coding the eobs. LV_MAP_EOB_COST eob_costs[7][2]; } CoeffCosts; /*!\cond */ // 4: NEAREST, NEW, NEAR, GLOBAL #define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4) /*!\endcond */ struct inter_modes_info; /*! \brief Holds the motion samples for warp motion model estimation */ typedef struct { //! Number of samples. int num; //! Sample locations in current frame. int pts[16]; //! Sample location in the reference frame. int pts_inref[16]; } WARP_SAMPLE_INFO; /*!\cond */ typedef enum { kZeroSad = 0, kVeryLowSad = 1, kLowSad = 2, kMedSad = 3, kHighSad = 4 } SOURCE_SAD; typedef struct { //! SAD levels in non-rd path SOURCE_SAD source_sad_nonrd; //! SAD levels in rd-path for var-based part qindex thresholds SOURCE_SAD source_sad_rd; int lighting_change; int low_sumdiff; } CONTENT_STATE_SB; // Structure to hold pixel level gradient info. typedef struct { uint16_t abs_dx_abs_dy_sum; int8_t hist_bin_idx; bool is_dx_zero; } PixelLevelGradientInfo; // Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks. typedef struct { double log_var; int var; } Block4x4VarInfo; #ifndef NDEBUG typedef struct SetOffsetsLoc { int mi_row; int mi_col; BLOCK_SIZE bsize; } SetOffsetsLoc; #endif // NDEBUG /*!\endcond */ /*! \brief Encoder's parameters related to the current coding block. * * This struct contains most of the information the encoder needs to encode the * current coding block. This includes the src and pred buffer, a copy of the * decoder's view of the current block, the txfm coefficients. This struct also * contains various buffers and data used to speed up the encoding process. */ typedef struct macroblock { /***************************************************************************** * \name Source, Buffers and Decoder ****************************************************************************/ /**@{*/ /*! \brief Each of the encoding plane. * * An array holding the src buffer for each of plane of the current block. It * also contains the txfm and quantized txfm coefficients. */ struct macroblock_plane plane[MAX_MB_PLANE]; /*! \brief Decoder's view of current coding block. * * Contains the encoder's copy of what the decoder sees in the current block. * Most importantly, this struct contains pointers to mbmi that is used in * final bitstream packing. */ MACROBLOCKD e_mbd; /*! \brief Derived coding information. * * Contains extra information not transmitted in the bitstream but are * derived. For example, this contains the stack of ref_mvs. */ MB_MODE_INFO_EXT mbmi_ext; /*! \brief Finalized mbmi_ext for the whole frame. * * Contains the finalized info in mbmi_ext that gets used at the frame level * for bitstream packing. */ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame; //! Entropy context for the current row. FRAME_CONTEXT *row_ctx; /*! 
\brief Entropy context for the current tile. * * This context will be used to update the color_map_cdf pointer which would be * used during pack bitstream. For the single-thread and tile-multithreading cases * this pointer will be the same as xd->tile_ctx, but for the case of row-mt: * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point * to the accurate tile context. */ FRAME_CONTEXT *tile_pb_ctx; /*! \brief Buffer of transformed coefficients * * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized * coefficients. This is here to conveniently copy the best coefficients to * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a * superblock level, we need to combine it with cb_offset to get the proper * position for the current coding block. */ CB_COEFF_BUFFER *cb_coef_buff; //! Offset of current coding block's coeff buffer relative to the sb. uint16_t cb_offset[PLANE_TYPES]; //! Modified source and masks used for fast OBMC search. OBMCBuffer obmc_buffer; //! Buffer to store the best palette map. PALETTE_BUFFER *palette_buffer; //! Buffer used for compound_type_rd(). CompoundTypeRdBuffers comp_rd_buffer; //! Buffer to store convolution during averaging process in compound mode. CONV_BUF_TYPE *tmp_conv_dst; /*! \brief Temporary buffer to hold prediction. * * Points to a buffer that is used to hold temporary prediction results. This * is used in two ways: * - This is a temporary buffer used to ping-pong the prediction in * handle_inter_mode. * - xd->tmp_obmc_bufs also points to this buffer, and is used in obmc * prediction. */ uint8_t *tmp_pred_bufs[2]; /**@}*/ /***************************************************************************** * \name Rdopt Costs ****************************************************************************/ /**@{*/ /*! \brief Quantization index for the current partition block. * * This is used as the index to find the quantization parameters for luma and * chroma transformed coefficients. */ int qindex; /*! \brief Difference between frame-level qindex and current qindex. * * This is used to track whether a non-zero delta for qindex is used at least * once in the current frame. */ int delta_qindex; /*! \brief Difference between frame-level qindex and qindex used to * compute rdmult (lambda). * * rdmult_delta_qindex is assigned the same as delta_qindex before qp sweep. * During qp sweep, delta_qindex is changed and used to calculate the actual * quant params, while rdmult_delta_qindex remains the same, and is used to * calculate the rdmult in "set_deltaq_rdmult". */ int rdmult_delta_qindex; /*! \brief Current qindex (before being adjusted by delta_q_res) used to * derive rdmult_delta_qindex. */ int rdmult_cur_qindex; /*! \brief Rate-distortion multiplier. * * The rd multiplier used to determine the rate-distortion trade-off. This is * roughly proportional to the inverse of q-index for a given frame, but this * can be manipulated for better rate-control. For example, in tune_ssim * mode, this is scaled by a factor related to the variance of the current * block. */ int rdmult; //! Intra only, per sb rd adjustment. int intra_sb_rdmult_modifier; //! Superblock level distortion propagation factor. double rb; //! Energy in the current source coding block. Used to calculate \ref rdmult. int mb_energy; //! Energy in the current source superblock. Used to calculate \ref rdmult. int sb_energy_level; //! The rate needed to signal a mode to the bitstream. ModeCosts mode_costs; //!
The rate needed to encode a new motion vector to the bitstream and some //! multipliers for motion search. MvCosts *mv_costs; /*! The rate needed to encode a new motion vector to the bitstream in intrabc * mode. */ IntraBCMVCosts *dv_costs; //! The rate needed to signal the txfm coefficients to the bitstream. CoeffCosts coeff_costs; /**@}*/ /***************************************************************************** * \name Rate to Distortion Multipliers ****************************************************************************/ /**@{*/ //! A multiplier that converts mv cost to l2 error. int errorperbit; //! A multiplier that converts mv cost to l1 error. int sadperbit; /**@}*/ /****************************************************************************** * \name Segmentation *****************************************************************************/ /**@{*/ /*! \brief Skip mode for the segment * * A syntax element of the segmentation mode. In skip_block mode, all mvs are * set 0 and all txfms are skipped. */ int seg_skip_block; /*! \brief Number of segment 1 blocks * Actual number of (4x4) blocks that were applied delta-q, * for segment 1. */ int actual_num_seg1_blocks; /*!\brief Number of segment 2 blocks * Actual number of (4x4) blocks that were applied delta-q, * for segment 2. */ int actual_num_seg2_blocks; /*!\brief Number of zero motion vectors */ int cnt_zeromv; /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path. * * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks * in the superblock may be marked as zeromv-skip at block level. */ int force_zeromv_skip_for_sb; /*!\brief Flag to force zeromv-skip at block level, for nonrd path. */ int force_zeromv_skip_for_blk; /*! \brief Previous segment id for which qmatrices were updated. * This is used to bypass setting of qmatrices if no change in qindex. */ int prev_segment_id; /**@}*/ /***************************************************************************** * \name Superblock ****************************************************************************/ /**@{*/ //! Information on a whole superblock level. // TODO(chiyotsai@google.com): Refactor this out of macroblock SuperBlockEnc sb_enc; /*! \brief Characteristics of the current superblock. * * Characteristics like whether the block has high sad, low sad, etc. This is * only used by av1 realtime mode. */ CONTENT_STATE_SB content_state_sb; /**@}*/ /***************************************************************************** * \name Reference Frame Search ****************************************************************************/ /**@{*/ /*! \brief Sum absolute distortion of the predicted mv for each ref frame. * * This is used to measure how viable a reference frame is. */ int pred_mv_sad[REF_FRAMES]; /*! \brief The minimum of \ref pred_mv_sad. * * Index 0 stores the minimum \ref pred_mv_sad across past reference frames. * Index 1 stores the minimum \ref pred_mv_sad across future reference frames. */ int best_pred_mv_sad[2]; //! The sad of the 1st mv ref (nearest). int pred_mv0_sad[REF_FRAMES]; //! The sad of the 2nd mv ref (near). int pred_mv1_sad[REF_FRAMES]; /*! \brief Disables certain ref frame pruning based on tpl. * * Determines whether a given ref frame is "good" based on data from the TPL * model. If so, this stops selective_ref frame from pruning the given ref * frame at block level. */ uint8_t tpl_keep_ref_frame[REF_FRAMES]; /*! \brief Warp motion samples buffer. * * Store the motion samples used for warp motion. 
*/ WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES]; /*! \brief Reference frames picked by the square subblocks in a superblock. * * Keeps track of ref frames that are selected by square partition blocks * within a superblock, in MI resolution. They can be used to prune ref frames * for rectangular blocks. */ int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE]; /*! \brief Prune ref frames in real-time mode. * * Determines whether to prune reference frames in real-time mode. For the * most part, this is the same as nonrd_prune_ref_frame_search in * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively * turned off if the only frame available is GOLDEN_FRAME. */ int nonrd_prune_ref_frame_search; /**@}*/ /***************************************************************************** * \name Partition Search ****************************************************************************/ /**@{*/ //! Stores some partition-search related buffers. PartitionSearchInfo part_search_info; /*! \brief Whether to disable some features to force a mode in current block. * * In some cases, our speed features can be overly aggressive and remove all * modes search in the superblock. When this happens, we set * must_find_valid_partition to 1 to reduce the number of speed features, and * recode the superblock again. */ int must_find_valid_partition; /**@}*/ /***************************************************************************** * \name Prediction Mode Search ****************************************************************************/ /**@{*/ /*! \brief Inter skip mode. * * Skip mode tries to use the closest forward and backward references for * inter prediction. Skip here means to skip transmitting the reference * frames, not to be confused with skip_txfm. */ int skip_mode; /*! \brief Factors used for rd-thresholding. * * Determines a rd threshold to determine whether to continue searching the * current mode. If the current best rd is already <= threshold, then we skip * the current mode. */ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; /*! \brief Tracks the winner modes in the current coding block. * * Winner mode is a two-pass strategy to find the best prediction mode. In the * first pass, we search the prediction modes with a limited set of txfm * options, and keep the top modes. These modes are called the winner modes. * In the second pass, we retry the winner modes with more thorough txfm * options. */ WinnerModeStats *winner_mode_stats; //! Tracks how many winner modes there are. int winner_mode_count; /*! \brief The model used for rd-estimation to avoid txfm * * These are for inter_mode_rd_model_estimation, which is another two pass * approach. In this speed feature, we collect data in the first couple frames * to build an rd model to estimate the rdcost of a prediction model based on * the residue error. Once enough data is collected, this speed feature uses * the estimated rdcost to find the most performant prediction mode. Then we * follow up with a second pass find the best transform for the mode. * Determines if one would go with reduced complexity transform block * search model to select prediction modes, or full complexity model * to select transform kernel. */ TXFM_RD_MODEL rd_model; /*! \brief Stores the inter mode information needed to build an rd model. * * These are for inter_mode_rd_model_estimation, which is another two pass * approach. 
In this speed feature, we collect data in the first couple of frames * to build an rd model to estimate the rdcost of a prediction model based on * the residue error. Once enough data is collected, this speed feature uses * the estimated rdcost to find the most performant prediction mode. Then we * follow up with a second pass to find the best transform for the mode. */ // TODO(any): try to consolidate this speed feature with winner mode // processing. struct inter_modes_info *inter_modes_info; //! How to blend the compound predictions. uint8_t compound_idx; //! A cache of results of compound type search so they can be reused later. COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS]; //! The idx for the latest compound mode in the cache \ref comp_rd_stats. int comp_rd_stats_idx; /*! \brief Whether to recompute the luma prediction. * * In interpolation search, we can usually skip recalculating the luma * prediction because it is already calculated by a previous predictor. This * flag signifies that some modes might have been skipped, so we need to * rebuild the prediction. */ int recalc_luma_mc_data; /*! \brief Data structure to speed up intrabc search. * * Contains the hash table, hash function, and buffer used for intrabc. */ IntraBCHashInfo intrabc_hash_info; /*! \brief Whether to reuse the mode stored in mb_mode_cache. */ int use_mb_mode_cache; /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and * \ref av1_rd_pick_inter_mode. */ const MB_MODE_INFO *mb_mode_cache; /*! \brief Pointer to the buffer which caches gradient information. * * Pointer to the array of structures to store gradient information of each * pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). */ PixelLevelGradientInfo *pixel_gradient_info; /*! \brief Flags indicating the availability of cached gradient info. */ bool is_sb_gradient_cached[PLANE_TYPES]; /*! \brief Flag to reuse predicted samples of inter block. */ bool reuse_inter_pred; /**@}*/ /***************************************************************************** * \name MV Search ****************************************************************************/ /**@{*/ /*! \brief Context used to determine the initial step size in motion search. * * This context is defined as the \f$l_\infty\f$ norm of the best ref_mvs for * each frame. */ unsigned int max_mv_context[REF_FRAMES]; /*! \brief Limit for the range of motion vectors. * * These define limits to motion vector components to prevent them from * extending outside the UMV borders. */ FullMvLimits mv_limits; /*! \brief Buffer for storing the search site config. * * When resize mode or super resolution mode is on, the stride of the * reference frame does not always match what's specified in \ref * MotionVectorSearchParams::search_site_cfg. When this happens, we update the * \ref search_site_cfg_buf buffer here and use it for motion search. */ search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS]; /**@}*/ /***************************************************************************** * \name Txfm Search ****************************************************************************/ /**@{*/ /*! \brief Parameters that control how transform search is done. * * Stores various txfm search related parameters such as txfm_type, txfm_size, * trellis eob search, etc. */ TxfmSearchParams txfm_search_params; /*! \brief Results of the txfm searches that have been done.
* * Caches old txfm search results and keeps the current txfm decisions to * facilitate rdopt. */ TxfmSearchInfo txfm_search_info; /*! \brief Whether there is a strong color activity. * * Used in REALTIME coding mode to enhance the visual quality at the boundary * of moving color objects. */ uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1]; //! Color sensitivity flag for the superblock for golden reference. uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1]; //! Color sensitivity flag for the superblock for altref reference. uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1]; //! Color sensitivity flag for the coding block. uint8_t color_sensitivity[MAX_MB_PLANE - 1]; //! Coding block distortion value for uv/color, minimum over the inter modes. int64_t min_dist_inter_uv; //! Threshold on the number of colors for testing palette mode. int color_palette_thresh; //! Used in REALTIME coding mode: flag to indicate if the color_sensitivity // should be checked at the coding block level. int force_color_check_block_level; //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane // so we can keep the dqcoeff of the best tx_type. tran_low_t *dqcoeff_buf; /**@}*/ /***************************************************************************** * \name Misc ****************************************************************************/ /**@{*/ //! Variance of the source frame. unsigned int source_variance; //! Flag to indicate the coding block is zero sad. int block_is_zero_sad; //! Flag to indicate superblock ME in variance partition is determined to be // good/reliable, and so the superblock MV will be tested in the // nonrd_pickmode. This is only used for LAST_FRAME. int sb_me_partition; //! Flag to indicate to test the superblock MV for the coding block in the // nonrd_pickmode. int sb_me_block; //! Motion vector from superblock MV derived from int_pro_motion() in // the variance_partitioning. int_mv sb_me_mv; //! Flag to indicate if a fixed partition should be used, only if the // speed feature rt_sf->use_fast_fixed_part is enabled. int sb_force_fixed_part; //! SSE of the current predictor. unsigned int pred_sse[REF_FRAMES]; //! Prediction for ML based partition. #if CONFIG_RT_ML_PARTITIONING DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]); #endif /**@}*/ /*! \brief NONE partition evaluated for merge. * * In the variance based partitioning scheme, NONE & SPLIT partitions are * evaluated to check whether the SPLIT partition can be merged as NONE. This flag * signifies that the partition has been evaluated in the scheme. */ int try_merge_partition; /*! \brief Pointer to buffer which caches sub-block variances in a superblock. * * Pointer to the array of structures to store source variance information of * each 4x4 sub-block in a superblock. The Block4x4VarInfo structure is used to * store the source variance and log of source variance of each 4x4 sub-block. */ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; #ifndef NDEBUG /*! \brief A hash to make sure av1_set_offsets is called */ SetOffsetsLoc last_set_offsets_loc; #endif // NDEBUG #if COLLECT_NONRD_PICK_MODE_STAT mode_search_stat_nonrd ms_stat_nonrd; #endif // COLLECT_NONRD_PICK_MODE_STAT /*!\brief Number of pixels in the current thread that choose palette mode in the * fast encoding stage for screen content tool determination. */ int palette_pixels; /*!\brief Pointer to the structure which stores the statistics used by * sb-level multi-pass encoding.
*/ struct SB_FIRST_PASS_STATS *sb_stats_cache; /*!\brief Pointer to the structure which stores the statistics used by * first-pass when superblock is searched twice consecutively. */ struct SB_FIRST_PASS_STATS *sb_fp_stats; #if CONFIG_PARTITION_SEARCH_ORDER /*!\brief Pointer to RD_STATS structure to be used in * av1_rd_partition_search(). */ RD_STATS *rdcost; #endif // CONFIG_PARTITION_SEARCH_ORDER } MACROBLOCK; #undef SINGLE_REF_MODES /*!\cond */ // Zeroes out 'n_stats' elements in the array x->winner_mode_stats. // It only zeroes out what is necessary in 'color_index_map' (just the block // size, not the whole array). static inline void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, WinnerModeStats *stats) { // When winner mode stats are not required, the memory allocation is avoided // for x->winner_mode_stats. The stats pointer will be NULL in such cases. if (stats == NULL) return; const int block_height = block_size_high[bsize]; const int block_width = block_size_wide[bsize]; for (int i = 0; i < n_stats; ++i) { WinnerModeStats *const stat = &stats[i]; memset(&stat->mbmi, 0, sizeof(stat->mbmi)); memset(&stat->rd_cost, 0, sizeof(stat->rd_cost)); memset(&stat->rd, 0, sizeof(stat->rd)); memset(&stat->rate_y, 0, sizeof(stat->rate_y)); memset(&stat->rate_uv, 0, sizeof(stat->rate_uv)); // Do not reset the whole array as it is CPU intensive. memset(&stat->color_index_map, 0, block_width * block_height * sizeof(stat->color_index_map[0])); memset(&stat->mode_index, 0, sizeof(stat->mode_index)); } } static inline int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { static const char LUT[BLOCK_SIZES_ALL] = { 0, // BLOCK_4X4 1, // BLOCK_4X8 1, // BLOCK_8X4 0, // BLOCK_8X8 1, // BLOCK_8X16 1, // BLOCK_16X8 0, // BLOCK_16X16 1, // BLOCK_16X32 1, // BLOCK_32X16 0, // BLOCK_32X32 1, // BLOCK_32X64 1, // BLOCK_64X32 0, // BLOCK_64X64 0, // BLOCK_64X128 0, // BLOCK_128X64 0, // BLOCK_128X128 1, // BLOCK_4X16 1, // BLOCK_16X4 1, // BLOCK_8X32 1, // BLOCK_32X8 1, // BLOCK_16X64 1, // BLOCK_64X16 }; return LUT[bsize]; } static inline int is_rect_tx_allowed(const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { return is_rect_tx_allowed_bsize(mbmi->bsize) && !xd->lossless[mbmi->segment_id]; } static inline int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; int depth = 0; while (tx_size != ctx_size) { depth++; ctx_size = sub_tx_size_map[ctx_size]; assert(depth <= MAX_TX_DEPTH); } return depth; } static inline void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, int skip) { if (skip) txb_skip[blk_idx] |= 1UL << plane; else txb_skip[blk_idx] &= ~(1UL << plane); #ifndef NDEBUG // Set chroma planes to uninitialized states when luma is set to check if // it will be set later if (plane == 0) { txb_skip[blk_idx] |= 1UL << (1 + 4); txb_skip[blk_idx] |= 1UL << (2 + 4); } // Clear the initialization checking bit txb_skip[blk_idx] &= ~(1UL << (plane + 4)); #endif } static inline int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) { #ifndef NDEBUG // Check if this is initialized assert(!(txb_skip[blk_idx] & (1UL << (plane + 4)))); // The magic number is 0x77, this is to test if there is garbage data assert((txb_skip[blk_idx] & 0x88) == 0); #endif return (txb_skip[blk_idx] >> plane) & 1; } /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_BLOCK_H_ aom-3.12.1/av1/encoder/blockiness.c000066400000000000000000000106311477627663500170100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/blockiness.h" #include <stdint.h> #include <stdlib.h> static int horizontal_filter(const uint8_t *s) { return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; } static int vertical_filter(const uint8_t *s, int p) { return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; } static int variance(int sum, int sum_squared, int size) { return sum_squared / size - (sum / size) * (sum / size); } // Calculate a blockiness level for a vertical block edge. // This function returns a new blockiness metric that's defined as // p0 p1 p2 p3 // q0 q1 q2 q3 // block edge -> // r0 r1 r2 r3 // s0 s1 s2 s3 // blockiness = p0*-2+q0*6+r0*-6+s0*2 + // p1*-2+q1*6+r1*-6+s1*2 + // p2*-2+q2*6+r2*-6+s2*2 + // p3*-2+q3*6+r3*-6+s3*2 ; // reconstructed_blockiness = max(blockiness from reconstructed buffer - // blockiness from source buffer, 0) // // I make the assumption that flat blocks are much more visible than high // contrast blocks. As such, I scale the result of the blockiness calc // by dividing the blockiness by the variance of the pixels on either side // of the edge as follows: // var_0 = (q0^2+q1^2+q2^2+q3^2) / 4 - ((q0 + q1 + q2 + q3) / 4 )^2 // var_1 = (r0^2+r1^2+r2^2+r3^2) / 4 - ((r0 + r1 + r2 + r3) / 4 )^2 // The returned blockiness is the scaled value // Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, int rp, int size) { int s_blockiness = 0; int r_blockiness = 0; int sum_0 = 0; int sum_sq_0 = 0; int sum_1 = 0; int sum_sq_1 = 0; int i; int var_0; int var_1; for (i = 0; i < size; ++i, s += sp, r += rp) { s_blockiness += horizontal_filter(s); r_blockiness += horizontal_filter(r); sum_0 += s[0]; sum_sq_0 += s[0] * s[0]; sum_1 += s[-1]; sum_sq_1 += s[-1] * s[-1]; } var_0 = variance(sum_0, sum_sq_0, size); var_1 = variance(sum_1, sum_sq_1, size); r_blockiness = abs(r_blockiness); s_blockiness = abs(s_blockiness); if (r_blockiness > s_blockiness) return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); else return 0; } // Calculate a blockiness level for a horizontal block edge, // same as above. static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, int rp, int size) { int s_blockiness = 0; int r_blockiness = 0; int sum_0 = 0; int sum_sq_0 = 0; int sum_1 = 0; int sum_sq_1 = 0; int i; int var_0; int var_1; for (i = 0; i < size; ++i, ++s, ++r) { s_blockiness += vertical_filter(s, sp); r_blockiness += vertical_filter(r, rp); sum_0 += s[0]; sum_sq_0 += s[0] * s[0]; sum_1 += s[-sp]; sum_sq_1 += s[-sp] * s[-sp]; } var_0 = variance(sum_0, sum_sq_0, size); var_1 = variance(sum_1, sum_sq_1, size); r_blockiness = abs(r_blockiness); s_blockiness = abs(s_blockiness); if (r_blockiness > s_blockiness) return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); else return 0; } // This function returns the blockiness for the entire frame currently by // looking at all borders in steps of 4.
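/*
 * Illustrative usage sketch (not part of libaom): assuming `src` and `recon`
 * are 8-bit frame buffers with the same dimensions, the frame-level metric
 * defined below could be queried as shown here. The wrapper name and its
 * parameter names are hypothetical.
 */
static inline double example_frame_blockiness(const unsigned char *src,
                                              int src_stride,
                                              const unsigned char *recon,
                                              int recon_stride, int width,
                                              int height) {
  // The source buffer is passed first and the reconstruction second; larger
  // return values mean the reconstruction shows stronger 4x4 block-edge
  // artifacts than the source, and identical buffers yield 0.
  return av1_get_blockiness(src, src_stride, recon, recon_stride, width,
                            height);
}
/* The frame-level routine itself follows. */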
double av1_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, int width, int height) { double blockiness = 0; int i, j; for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4) { if (i > 0 && i < height && j > 0 && j < width) { blockiness += blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); } } } blockiness /= width * height / 16; return blockiness; } aom-3.12.1/av1/encoder/blockiness.h /* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_BLOCKINESS_H_ #define AOM_AV1_ENCODER_BLOCKINESS_H_ double av1_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, int width, int height); #endif // AOM_AV1_ENCODER_BLOCKINESS_H_ aom-3.12.1/av1/encoder/cnn.c /* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include <math.h> #include <stdbool.h> #include "aom_dsp/aom_dsp_common.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/cnn.h" #define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a))) typedef struct { const float **input; int in_width; int in_height; int in_stride; const CNN_LAYER_CONFIG *layer_config; float **output; int out_stride; int start_idx; int th_step; } CONVOLVE_OPS; static inline float softsign(float x) { return x / (fabsf(x) + 1.0f); } static inline float relu(float x) { return (x < 0) ?
0 : x; } typedef struct { int allocsize; int channels; int width, height, stride; float *buf[CNN_MAX_CHANNELS]; } TENSOR; static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } static void free_tensor(TENSOR *tensor) { if (tensor->allocsize) { aom_free(tensor->buf[0]); tensor->buf[0] = NULL; tensor->allocsize = 0; } } static bool realloc_tensor(TENSOR *tensor, int channels, int width, int height) { const int newallocsize = channels * width * height; if (tensor->allocsize < newallocsize) { free_tensor(tensor); tensor->buf[0] = (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); if (!tensor->buf[0]) return false; tensor->allocsize = newallocsize; } tensor->width = width; tensor->height = height; tensor->stride = width; tensor->channels = channels; for (int c = 1; c < channels; ++c) tensor->buf[c] = &tensor->buf[0][c * width * height]; return true; } static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, TENSOR *dst) { assert(src->width == dst->width); assert(src->height == dst->height); assert(copy_channels <= src->channels); if (src->stride == dst->width && dst->stride == dst->width) { for (int c = 0; c < copy_channels; ++c) { memcpy(dst->buf[dst_offset + c], src->buf[c], sizeof(*dst->buf[0]) * src->width * src->height); } } else { for (int c = 0; c < copy_channels; ++c) { for (int r = 0; r < dst->height; ++r) { memcpy(&dst->buf[dst_offset + c][r * dst->stride], &src->buf[c][r * src->stride], dst->width * sizeof(*dst->buf[c])); } } } } static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], int channels, int width, int height, int stride) { tensor->allocsize = 0; tensor->channels = channels; tensor->width = width; tensor->height = height; tensor->stride = stride; if (buf) { for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; } else { for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; } } static void swap_tensor(TENSOR *t1, TENSOR *t2) { TENSOR t = *t1; *t1 = *t2; *t2 = t; } // The concatenated tensor goes into dst with first the channels in // original dst followed by the channels in the src static bool concat_tensor(const TENSOR *src, TENSOR *dst) { assert(src->width == dst->width); assert(src->height == dst->height); const int dst_channels = dst->channels; const int channels = dst->channels + src->channels; const int newallocsize = channels * dst->width * dst->height; if (dst->allocsize < newallocsize) { TENSOR t; init_tensor(&t); // allocate new buffers and copy first the dst channels if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false; copy_tensor(dst, dst->channels, 0, &t); // Swap the tensors and free the old buffers swap_tensor(dst, &t); free_tensor(&t); } for (int c = 1; c < channels; ++c) dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; // Copy the channels in src after the first dst_channels channels. 
copy_tensor(src, src->channels, dst_channels, dst); return true; } #ifndef NDEBUG static int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { return (t1->width == t2->width && t1->height == t2->height); } static int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { return (t1->channels == t2->channels && t1->width == t2->width && t1->height == t2->height); } #endif // NDEBUG void av1_find_cnn_layer_output_size(int in_width, int in_height, const CNN_LAYER_CONFIG *layer_config, int *out_width, int *out_height) { assert(layer_config->skip_width > 0); assert(layer_config->skip_height > 0); if (!layer_config->deconvolve) { switch (layer_config->pad) { case PADDING_SAME_ZERO: case PADDING_SAME_REPLICATE: *out_width = (in_width + layer_config->skip_width - 1) / layer_config->skip_width; *out_height = (in_height + layer_config->skip_height - 1) / layer_config->skip_height; break; case PADDING_VALID: *out_width = (in_width - layer_config->filter_width + layer_config->skip_width) / layer_config->skip_width; *out_height = (in_height - layer_config->filter_height + layer_config->skip_height) / layer_config->skip_height; break; default: assert(0 && "Unknown padding type"); } } else { switch (layer_config->pad) { case PADDING_SAME_ZERO: case PADDING_SAME_REPLICATE: *out_width = in_width * layer_config->skip_width; *out_height = in_height * layer_config->skip_height; break; case PADDING_VALID: *out_width = (in_width - 1) * layer_config->skip_width + layer_config->filter_width; *out_height = (in_height - 1) * layer_config->skip_height + layer_config->filter_height; break; default: assert(0 && "Unknown padding type"); } } } static void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, int channels_per_branch[]) { int branch = layer_config->branch; const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->input_to_branches & (1 << b)) && b != branch) { if (layer_config->branch_copy_type == BRANCH_INPUT) { channels_per_branch[b] = layer_config->in_channels; } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) { channels_per_branch[b] = layer_config->out_channels; } else if (layer_config->branch_copy_type == BRANCH_COMBINED) { channels_per_branch[b] = layer_config->out_channels; for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { assert(channels_per_branch[c] > 0); channels_per_branch[b] += channels_per_branch[c]; } } } } } channels_per_branch[branch] = layer_config->out_channels; for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { assert(channels_per_branch[c] > 0); channels_per_branch[branch] += channels_per_branch[c]; } } } #if CONFIG_DEBUG static inline int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { const int num_layers = cnn_config->num_layers; const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; for (int idx = 0; idx < num_layers; idx++) { if (layer_configs[idx].output_num != -1) { return 1; } } return 0; } #endif void av1_find_cnn_output_size(int in_width, int in_height, const CNN_CONFIG *cnn_config, int *out_width, int *out_height, int *out_channels) { int channels_per_branch[CNN_MAX_BRANCHES] = { 0 }; int i_width[CNN_MAX_BRANCHES] = { 0 }; int i_height[CNN_MAX_BRANCHES] = { 0 }; i_width[0] = in_width + cnn_config->ext_width * 2; i_height[0] = in_height + cnn_config->ext_height * 2; #if CONFIG_DEBUG assert(cnn_has_at_least_one_output(cnn_config)); #endif 
for (int i = 0; i < cnn_config->num_layers; ++i) { const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i]; const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; const int branch = layer_config->branch; int o_width = 0, o_height = 0; if (layer_config->branch_copy_type == BRANCH_INPUT) { for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->input_to_branches & (1 << b)) && b != branch) { assert(i_width[branch] > 0 && i_height[branch] > 0); i_width[b] = i_width[branch]; i_height[b] = i_height[branch]; } } } av1_find_cnn_layer_output_size(i_width[branch], i_height[branch], layer_config, &o_width, &o_height); i_width[branch] = o_width; i_height[branch] = o_height; if (layer_config->branch_copy_type == BRANCH_OUTPUT) { for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->input_to_branches & (1 << b)) && b != branch) { i_width[b] = o_width; i_height[b] = o_height; } } } find_cnn_out_channels(layer_config, channels_per_branch); const int output_num = layer_config->output_num; if (output_num != -1) { // Current layer is an output layer out_width[output_num] = o_width; out_height[output_num] = o_height; out_channels[output_num] = channels_per_branch[layer_config->branch]; } } } static inline int get_start_shift_convolve(int width, int filt_width, int stride) { const int mod = (width % stride); const int filt_off = (filt_width - 1) / 2; const int dif = (mod ? mod - 1 : stride - 1); return AOMMIN((dif + (filt_width % 2)) / 2, filt_off); } void av1_cnn_add_c(float **output, int channels, int width, int height, int stride, const float **add) { for (int c = 0; c < channels; ++c) { for (int i = 0; i < height; ++i) for (int j = 0; j < width; ++j) output[c][i * stride + j] += add[c][i * stride + j]; } } void av1_cnn_activate_c(float **output, int channels, int width, int height, int stride, ACTIVATION layer_activation) { if (layer_activation == RELU) { for (int c = 0; c < channels; ++c) { for (int i = 0; i < height; ++i) for (int j = 0; j < width; ++j) output[c][i * stride + j] = relu(output[c][i * stride + j]); } } else if (layer_activation == SOFTSIGN) { for (int c = 0; c < channels; ++c) { for (int i = 0; i < height; ++i) for (int j = 0; j < width; ++j) output[c][i * stride + j] = softsign(output[c][i * stride + j]); } } else if (layer_activation == SIGMOID) { assert(0 && "Sigmoid has not been supported in CNN."); // TO DO } else if (layer_activation != NONE) { assert(0 && "Unknown activation type"); } } static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor, const CNN_LAYER_CONFIG *layer_config, int branch, TENSOR branch_output[]) { const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->input_to_branches & (1 << b)) && b != branch) { // Copy layer's active tensor to output tensor of branch b if set in // mask. The output becomes the input of the first layer of the branch // because the layer of the branch is not the first layer. int copy_channels = branch_config->channels_to_copy > 0 ? branch_config->channels_to_copy : layer_active_tensor->channels; if (!realloc_tensor(&branch_output[b], copy_channels, layer_active_tensor->width, layer_active_tensor->height)) { return false; } copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]); } } return true; } // CNNConvolve specific to maxpool set as 1, either skip_width or skip_height // greater than 1 and padding equal to PADDING_SAME_ZERO. 
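/*
 * Illustrative only (not part of libaom): a minimal sketch of how the output
 * dimensions produced by av1_find_cnn_layer_output_size() behave for the two
 * padding families used by the kernels below. The configuration values are
 * hypothetical; only the fields read by that function are set.
 */
static inline void example_layer_output_dims(void) {
  CNN_LAYER_CONFIG cfg = { 0 };
  cfg.filter_width = 3;
  cfg.filter_height = 3;
  cfg.skip_width = 2;
  cfg.skip_height = 2;
  cfg.deconvolve = 0;
  int out_w, out_h;
  // SAME padding keeps ceil(in / skip): a 16x16 input maps to 8x8 here.
  cfg.pad = PADDING_SAME_ZERO;
  av1_find_cnn_layer_output_size(16, 16, &cfg, &out_w, &out_h);
  assert(out_w == 8 && out_h == 8);
  // VALID padding only keeps fully overlapped filter positions:
  // (16 - 3 + 2) / 2 = 7 in each dimension.
  cfg.pad = PADDING_VALID;
  av1_find_cnn_layer_output_size(16, 16, &cfg, &out_w, &out_h);
  assert(out_w == 7 && out_h == 7);
  (void)out_w;
  (void)out_h;
}
/* The zero-padded convolve+maxpool kernel announced above follows. */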
static void convolve_maxpool_padding_zero( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, const int cstep, const int filter_width_half, const int filter_height_half) { for (int i = 0; i < layer_config->out_channels; ++i) { for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); ++hh) { for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); ++ww) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int ii = hh + l - filter_height_half; for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int jj = ww + m - filter_width_half; if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) continue; sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } const float a = sum; if (h == hh && w == ww) output[i][u * out_stride + v] = a; else output[i][u * out_stride + v] = AOMMAX(output[i][u * out_stride + v], a); } } } } } } // CNNConvolve specific to maxpool set as 1, either skip_width or skip_height // greater than 1 and padding equal to PADDING_SAME_REPLICATE. static void convolve_maxpool_padding_replicate( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, const int cstep, const int filter_width_half, const int filter_height_half) { for (int i = 0; i < layer_config->out_channels; ++i) { for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); ++hh) { for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); ++ww) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int ii = CLAMPINDEX(hh + l - filter_height_half, in_height); for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int jj = CLAMPINDEX(ww + m - filter_width_half, in_width); assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } const float a = sum; if (h == hh && w == ww) output[i][u * out_stride + v] = a; else output[i][u * out_stride + v] = AOMMAX(output[i][u * out_stride + v], a); } } } } } } // CNNConvolve specific to maxpool set as 1, either skip_width or skip_height // greater than 1 and padding equal to PADDING_VALID. 
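/*
 * Illustrative only (not part of libaom): a minimal check of how CLAMPINDEX
 * pins out-of-range taps to the nearest edge sample, which is what gives the
 * replicate-padded kernel above its border behavior. The helper name is
 * hypothetical.
 */
static inline void example_clampindex_behavior(void) {
  assert(CLAMPINDEX(-2, 8) == 0);  // below the range: clamped to index 0
  assert(CLAMPINDEX(5, 8) == 5);   // in range: unchanged
  assert(CLAMPINDEX(9, 8) == 7);   // past the end: clamped to the last index
}
/* The PADDING_VALID variant announced above needs no clamping because its
 * loops never step outside the input; it follows next. */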
static void convolve_maxpool_padding_valid( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, const int cstep) { for (int i = 0; i < layer_config->out_channels; ++i) { for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; h += layer_config->skip_height, ++u) { for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1; w += layer_config->skip_width, ++v) { for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); ++hh) { for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); ++ww) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int ii = hh + l; for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int jj = ww + m; assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } const float a = sum; if (h == hh && w == ww) output[i][u * out_stride + v] = a; else output[i][u * out_stride + v] = AOMMAX(output[i][u * out_stride + v], a); } } } } } } // CNNConvolve specific to maxpool set as 0 with filter_height and filter_width // equal to 1. static void convolve_element_wise(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, int step) { const int start_h = get_start_shift_convolve( in_height, layer_config->filter_height, layer_config->skip_height); const int start_w = get_start_shift_convolve(in_width, layer_config->filter_width, layer_config->skip_width) + start_idx * layer_config->skip_width; const int out_w_step = AOMMAX(step, 1); const int in_w_step = layer_config->skip_width * out_w_step; for (int i = 0; i < layer_config->out_channels; ++i) { for (int h = start_h, u = 0; h < in_height; h += layer_config->skip_height, ++u) { const int in_h = h * in_stride; const int out_h = u * out_stride + start_idx; for (int w = start_w, out_index = out_h; w < in_width; w += in_w_step, out_index += out_w_step) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { sum += layer_config->weights[k * layer_config->out_channels + i] * input[k][in_h + w]; } output[i][out_index] = sum; } } } } // CNNConvolve specific to maxpool set as 0 and padding equal to // PADDING_SAME_ZERO. static void convolve_no_maxpool_padding_zero( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int filter_width_half, const int filter_height_half, const int ii_shift, const int jj_shift, const int channel_step) { const int start_h = get_start_shift_convolve( in_height, layer_config->filter_height, layer_config->skip_height); const int start_w = get_start_shift_convolve( in_width, layer_config->filter_width, layer_config->skip_width); const int end_ii_shift = filter_height_half + 1; const int end_jj_shift = filter_width_half + 1; // *_filter_margin stores the number of pixels along a dimension in the // intersection of the complement of the image in the extended image // and the filter. 
const int top_filter_margin = layer_config->filter_width * ii_shift; const int right_filter_margin = end_jj_shift - in_width; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { for (int h = start_h, u = 0; h < in_height; h += layer_config->skip_height, ++u) { const int out_h = u * out_stride; const int top_cstep = AOMMAX(0, top_filter_margin - h * layer_config->filter_width) * cstep + i; const int start_ii = AOMMAX(0, h - ii_shift); const int end_ii = AOMMIN(in_height, h + end_ii_shift); for (int w = start_w, out_index = out_h; w < in_width; w += layer_config->skip_width, ++out_index) { const int left_cstep = AOMMAX(0, jj_shift - w) * cstep; const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep; const int start_jj = AOMMAX(0, w - jj_shift); const int end_jj = AOMMIN(in_width, w + end_jj_shift); float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + top_cstep; for (int ii = start_ii; ii < end_ii; ++ii) { off += left_cstep; for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) { sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } off += right_cstep; } } output[i][out_index] = sum; } } } } // CNNConvolve specific to maxpool set as 0 and padding equal to // PADDING_SAME_REPLICATE. static void convolve_no_maxpool_padding_replicate( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int ii_shift, const int jj_shift, const int channel_step) { // h and w are shifted to an offset coordinate system to reduce in-loop // computation. const int start_h = get_start_shift_convolve(in_height, layer_config->filter_height, layer_config->skip_height) - ii_shift; const int start_w = get_start_shift_convolve(in_width, layer_config->filter_width, layer_config->skip_width) - jj_shift; const int end_h = in_height - ii_shift; const int end_w = in_width - jj_shift; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { for (int h = start_h, u = 0; h < end_h; h += layer_config->skip_height, ++u) { const int out_h = u * out_stride; const int upper_ii_index = layer_config->filter_height + h; for (int w = start_w, out_index = out_h; w < end_w; w += layer_config->skip_width, ++out_index) { const int upper_jj_index = layer_config->filter_width + w; float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int ii = h; ii < upper_ii_index; ++ii) { const int clamped_ii = CLAMPINDEX(ii, in_height); for (int jj = w; jj < upper_jj_index; ++jj) { const int clamped_jj = CLAMPINDEX(jj, in_width); assert(clamped_ii >= 0 && clamped_ii < in_height && clamped_jj >= 0 && clamped_jj < in_width); sum += layer_config->weights[off] * input[k][clamped_ii * in_stride + clamped_jj]; off += cstep; } } } output[i][out_index] = sum; } } } } // CNNConvolve specific to maxpool set as 0 and padding equal to // PADDING_VALID. 
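/*
 * Illustrative only (not part of libaom): the off/cstep walk used by the
 * convolution kernels in this file implies a weight layout of
 * [filter_height][filter_width][in_channels][out_channels]. This hypothetical
 * helper spells out the equivalent flat index; it is an assumption derived
 * from the indexing in the kernels, not a library API.
 */
static inline float example_weight_at(const CNN_LAYER_CONFIG *cfg, int l,
                                      int m, int k, int i) {
  // Tap applied to input channel k at filter position (l, m) for output
  // channel i; matches off = k * out_channels + i advanced by
  // cstep = in_channels * out_channels per filter tap.
  const int idx =
      ((l * cfg->filter_width + m) * cfg->in_channels + k) *
          cfg->out_channels +
      i;
  return cfg->weights[idx];
}
/* The PADDING_VALID kernel announced above follows. */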
void av1_cnn_convolve_no_maxpool_padding_valid_c( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step) { assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || !layer_config->maxpool); assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); assert(layer_config->pad == PADDING_VALID); for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; h += layer_config->skip_height, ++u) { const int out_h = u * out_stride; const int upper_ii_index = layer_config->filter_height + h; for (int w = 0, out_index = out_h; w < in_width - layer_config->filter_width + 1; w += layer_config->skip_width, ++out_index) { const int upper_jj_index = layer_config->filter_width + w; float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int ii = h; ii < upper_ii_index; ++ii) { for (int jj = w; jj < upper_jj_index; ++jj) { assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; off += cstep; } } } output[i][out_index] = sum; } } } } static void av1_cnn_convolve(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int step) { assert(!layer_config->deconvolve); const int cstep = layer_config->in_channels * layer_config->out_channels; const int filter_height_half = layer_config->filter_height >> 1; const int filter_width_half = layer_config->filter_width >> 1; const int channel_step = AOMMAX(step, 1); if (layer_config->maxpool && (layer_config->skip_height > 1 || layer_config->skip_width > 1)) { switch (layer_config->pad) { case PADDING_SAME_ZERO: convolve_maxpool_padding_zero(input, in_width, in_height, in_stride, layer_config, output, out_stride, cstep, filter_width_half, filter_height_half); break; case PADDING_SAME_REPLICATE: convolve_maxpool_padding_replicate( input, in_width, in_height, in_stride, layer_config, output, out_stride, cstep, filter_width_half, filter_height_half); break; case PADDING_VALID: convolve_maxpool_padding_valid(input, in_width, in_height, in_stride, layer_config, output, out_stride, cstep); break; default: assert(0 && "Unknown padding type"); } } else { // Results in element-wise matrix multiplication. 
if (layer_config->filter_height == 1 && layer_config->filter_width == 1) { convolve_element_wise(input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, step); return; } const int ii_shift = filter_height_half - (layer_config->filter_height - 1) % 2; const int jj_shift = filter_width_half - (layer_config->filter_width - 1) % 2; switch (layer_config->pad) { case PADDING_SAME_ZERO: convolve_no_maxpool_padding_zero( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, filter_width_half, filter_height_half, ii_shift, jj_shift, channel_step); break; case PADDING_SAME_REPLICATE: convolve_no_maxpool_padding_replicate( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step); break; case PADDING_VALID: av1_cnn_convolve_no_maxpool_padding_valid( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); break; default: assert(0 && "Unknown padding type"); } } } static int convolve_layer(void *arg1, void *arg2) { const CONVOLVE_OPS *convolve_ops = arg1; (void)arg2; av1_cnn_convolve( convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); return 1; } static void convolve_layer_mt(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, const CNN_THREAD_DATA *thread_data, float **output, int out_stride) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const int num_workers = thread_data->num_workers; assert(thread_data->workers); CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { AVxWorker *const worker = &thread_data->workers[th]; winterface->reset(worker); CONVOLVE_OPS convolve_op = { input, in_width, in_height, in_stride, layer_config, output, out_stride, th, num_workers }; convolve_ops[th] = convolve_op; worker->hook = convolve_layer; worker->data1 = &(convolve_ops[th]); worker->data2 = NULL; // Start convolving. if (th == num_workers - 1) { winterface->execute(worker); } else { winterface->launch(worker); } } // Wait until all workers have finished. 
for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { winterface->sync(&thread_data->workers[th]); } } static inline int get_start_shift_deconvolve(int filt_width, int stride) { const int dif = AOMMAX(filt_width - stride, 0); return dif / 2; } void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, int stride, const float *gamma, const float *beta, const float *mean, const float *std) { assert(gamma && beta && mean && std && "batchnorm has null parameter!"); for (int ch = 0; ch < channels; ch++) { const float ch_gamma = gamma[ch]; const float ch_beta = beta[ch]; const float ch_mean = mean[ch]; const float ch_std = std[ch]; float *image_row = image[ch]; for (int row = 0; row < height; row++) { for (int col = 0; col < width; col++) { image_row[col] = ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta; } image_row += stride; } } } void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride) { assert(layer_config->deconvolve); const int cstep = layer_config->in_channels * layer_config->out_channels; int out_width = 0; int out_height = 0; av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width, &out_height); switch (layer_config->pad) { case PADDING_SAME_ZERO: for (int i = 0; i < layer_config->out_channels; ++i) { for (int u = 0; u < out_height; ++u) { for (int v = 0; v < out_width; ++v) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int h = u - l + get_start_shift_deconvolve(layer_config->filter_height, layer_config->skip_height); for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int w = v - m + get_start_shift_deconvolve(layer_config->filter_width, layer_config->skip_width); if ((h % layer_config->skip_height) != 0 || (w % layer_config->skip_width) != 0) continue; const int ii = h / layer_config->skip_height; const int jj = w / layer_config->skip_width; if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) continue; sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } output[i][u * out_stride + v] = sum; } } } break; case PADDING_SAME_REPLICATE: for (int i = 0; i < layer_config->out_channels; ++i) { for (int u = 0; u < out_height; ++u) { for (int v = 0; v < out_width; ++v) { float sum = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int h = u - l + get_start_shift_deconvolve(layer_config->filter_height, layer_config->skip_height); for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int w = v - m + get_start_shift_deconvolve(layer_config->filter_width, layer_config->skip_width); if ((h % layer_config->skip_height) != 0 || (w % layer_config->skip_width) != 0) continue; const int ii = CLAMPINDEX(h / layer_config->skip_height, in_height); const int jj = CLAMPINDEX(w / layer_config->skip_width, in_width); assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } output[i][u * out_stride + v] = sum; } } } break; case PADDING_VALID: for (int i = 0; i < layer_config->out_channels; ++i) { for (int u = 0; u < out_height; ++u) { for (int v = 0; v < out_width; ++v) { float sum = layer_config->bias[i]; for (int k
= 0; k < layer_config->in_channels; ++k) { int off = k * layer_config->out_channels + i; for (int l = 0; l < layer_config->filter_height; ++l) { const int h = u - l; for (int m = 0; m < layer_config->filter_width; ++m, off += cstep) { const int w = v - m; if ((h % layer_config->skip_height) != 0 || (w % layer_config->skip_width) != 0) continue; const int ii = h / layer_config->skip_height; const int jj = w / layer_config->skip_width; if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) continue; sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; } } } output[i][u * out_stride + v] = sum; } } } break; default: assert(0 && "Unknown padding type"); } } bool av1_cnn_predict_c(const float **input, int in_width, int in_height, int in_stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output_struct) { bool success = false; TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } }; TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } }; float **output[CNN_MAX_BRANCHES]; const int *out_chs = output_struct->output_channels; output[0] = output_struct->output_buffer; for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) { output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1]; } int i_width = in_width; int i_height = in_height; int o_width = 0, o_height = 0; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { init_tensor(&tensor1[b]); init_tensor(&tensor2[b]); } const int *out_stride = output_struct->output_strides; for (int layer = 0; layer < cnn_config->num_layers; ++layer) { const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; const int branch = layer_config->branch; const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; // Allocate input tensor if (layer == 0) { // First layer assert(branch == 0); // First layer must be primary branch assign_tensor(&tensor1[branch], (float **)input, layer_config->in_channels, in_width, in_height, in_stride); } else { // Non-first layer // Swap tensor1 and tensor2 swap_tensor(&tensor1[branch], &tensor2[branch]); i_width = tensor1[branch].width; i_height = tensor1[branch].height; } // Allocate output tensor av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width, &o_height); const int output_num = layer_config->output_num; if (output_num == -1) { // Non-output layer if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width, o_height)) { goto Error; } } else { // Output layer free_tensor(&tensor2[branch]); assign_tensor(&tensor2[branch], output[output_num], layer_config->out_channels, o_width, o_height, out_stride[output_num]); } // If we are combining branches make sure that the branch to combine // is different from the current branch. 
assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC, !(branch_config->branches_to_combine & (1 << branch)))); if (layer_config->branch_copy_type == BRANCH_INPUT) { if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch, tensor2)) { goto Error; } } // Check consistency of input and output channels assert(tensor1[branch].channels == layer_config->in_channels); assert(tensor2[branch].channels == layer_config->out_channels); // Convolve/Deconvolve if (!cnn_config->layer_config[layer].deconvolve) { if (thread_data->num_workers > 1) { convolve_layer_mt((const float **)tensor1[branch].buf, tensor1[branch].width, tensor1[branch].height, tensor1[branch].stride, layer_config, thread_data, tensor2[branch].buf, tensor2[branch].stride); } else { av1_cnn_convolve((const float **)tensor1[branch].buf, tensor1[branch].width, tensor1[branch].height, tensor1[branch].stride, layer_config, tensor2[branch].buf, tensor2[branch].stride, 0, 1); } } else { av1_cnn_deconvolve((const float **)tensor1[branch].buf, tensor1[branch].width, tensor1[branch].height, tensor1[branch].stride, layer_config, tensor2[branch].buf, tensor2[branch].stride); } if (layer_config->branch_copy_type == BRANCH_OUTPUT) { if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, tensor2)) { goto Error; } } // Add tensors from other branches if needed if (layer_config->branch_combine_type == BRANCH_ADD) { for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch])); av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, tensor2[branch].height, tensor2[branch].stride, (const float **)tensor2[b].buf); } } } // Non-linearity av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, tensor2[branch].height, tensor2[branch].stride, layer_config->activation); if (layer_config->bn_params.bn_gamma) { av1_cnn_batchnorm( tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, tensor2[branch].height, tensor2[branch].stride, layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta, layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std); } // Concatenate tensors if (layer_config->branch_combine_type == BRANCH_CAT) { if (output_num == -1) { // Non-output layer for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); assert(tensor2[b].channels > 0); if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error; } } } else { // Output layer const int existing_channels = tensor2[branch].channels; int num_chs = existing_channels; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); // Needed only to assign the new channel buffers num_chs += tensor2[b].channels; } } assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width, o_height, out_stride[output_num]); num_chs = existing_channels; for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); // Needed only to assign the new channel buffers copy_tensor(&tensor2[b], tensor2[b].channels, num_chs, &tensor2[branch]); num_chs += tensor2[b].channels; } } } } if (layer_config->branch_copy_type 
== BRANCH_COMBINED) { if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, tensor2)) { goto Error; } } } success = true; Error: for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { free_tensor(&tensor1[b]); free_tensor(&tensor2[b]); } return success; } // Assume output already has proper allocation // Assume input image buffers all have same resolution and strides bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, int stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, CNN_MULTI_OUT *output) { const float max_val = 255.0; const int in_width = width + 2 * cnn_config->ext_width; const int in_height = height + 2 * cnn_config->ext_height; const int in_channels = cnn_config->layer_config[0].in_channels; float *inputs[CNN_MAX_CHANNELS]; float *input_ = (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); if (!input_) return false; const int in_stride = in_width; for (int c = 0; c < in_channels; ++c) { inputs[c] = input_ + c * in_stride * in_height; float *input = inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; if (cnn_config->strict_bounds) { for (int i = 0; i < height; ++i) for (int j = 0; j < width; ++j) input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; // extend left and right for (int i = 0; i < height; ++i) { for (int j = -cnn_config->ext_width; j < 0; ++j) input[i * in_stride + j] = input[i * in_stride]; for (int j = width; j < width + cnn_config->ext_width; ++j) input[i * in_stride + j] = input[i * in_stride + width - 1]; } // extend top and bottom for (int i = -cnn_config->ext_height; i < 0; ++i) memcpy(&input[i * in_stride - cnn_config->ext_width], &input[-cnn_config->ext_width], in_width * sizeof(*input)); for (int i = height; i < height + cnn_config->ext_height; ++i) memcpy(&input[i * in_stride - cnn_config->ext_width], &input[(height - 1) * in_stride - cnn_config->ext_width], in_width * sizeof(*input)); } else { for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; ++i) for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; ++j) input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; } } bool success = av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, cnn_config, thread_data, output); aom_free(input_); return success; } // Assume output already has proper allocation // Assume input image buffers all have same resolution and strides bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, int stride, const CNN_CONFIG *cnn_config, const CNN_THREAD_DATA *thread_data, int bit_depth, CNN_MULTI_OUT *output) { const float max_val = (float)((1 << bit_depth) - 1); const int in_width = width + 2 * cnn_config->ext_width; const int in_height = height + 2 * cnn_config->ext_height; const int in_channels = cnn_config->layer_config[0].in_channels; float *inputs[CNN_MAX_CHANNELS]; float *input_ = (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); if (!input_) return false; const int in_stride = in_width; for (int c = 0; c < in_channels; ++c) { inputs[c] = input_ + c * in_stride * in_height; float *input = inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; if (cnn_config->strict_bounds) { for (int i = 0; i < height; ++i) for (int j = 0; j < width; ++j) input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; // extend left and right for (int i = 0; i < height; ++i) { for (int j = -cnn_config->ext_width; j < 0; ++j) input[i * 
in_stride + j] = input[i * in_stride]; for (int j = width; j < width + cnn_config->ext_width; ++j) input[i * in_stride + j] = input[i * in_stride + width - 1]; } // extend top and bottom for (int i = -cnn_config->ext_height; i < 0; ++i) memcpy(&input[i * in_stride - cnn_config->ext_width], &input[-cnn_config->ext_width], in_width * sizeof(*input)); for (int i = height; i < height + cnn_config->ext_height; ++i) memcpy(&input[i * in_stride - cnn_config->ext_width], &input[(height - 1) * in_stride - cnn_config->ext_width], in_width * sizeof(*input)); } else { for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; ++i) for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; ++j) input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; } } bool success = av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, cnn_config, thread_data, output); aom_free(input_); return success; } aom-3.12.1/av1/encoder/cnn.h000066400000000000000000000174501477627663500154450ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_CNN_H_ #define AOM_AV1_ENCODER_CNN_H_ #ifdef __cplusplus extern "C" { #endif #include #include #include "aom_util/aom_thread.h" #include "config/av1_rtcd.h" struct AV1Common; #define CNN_MAX_HIDDEN_LAYERS 64 #define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1) #define CNN_MAX_CHANNELS 256 #define CNN_MAX_BRANCHES 4 #define CNN_MAX_THREADS 32 #define NO_BRANCH_CONFIG \ { 0, 0, 0 } #define NO_BN_PARAMS \ { NULL, NULL, NULL, NULL } enum { PADDING_SAME_ZERO, // tensorflow's SAME padding with pixels outside // the image area assumed to be 0 (default) PADDING_SAME_REPLICATE, // tensorflow's SAME padding with pixels outside // the image area replicated from closest edge PADDING_VALID // tensorflow's VALID padding } UENUM1BYTE(PADDING_TYPE); // enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION); // Times when input tensor may be copied to branches given in input_to_branches. // BRANCH_NO_COPY: doesn't copy any tensor. // BRANCH_INPUT: copies the input tensor to branches. // BRANCH_OUTPUT: copies the convolved tensor to branches. // BRANCH_COMBINED: copies the combined (after convolving and branch combining) // tensor. If no combinations happen at this layer, then this option // has the same effect as COPY_OUTPUT. enum { BRANCH_NO_COPY, BRANCH_INPUT, BRANCH_OUTPUT, BRANCH_COMBINED } UENUM1BYTE(BRANCH_COPY); // Types of combining branches with output of current layer: // BRANCH_NOC: no branch combining // BRANCH_ADD: Add previously stored branch tensor to output of layer // BRANCH_CAT: Concatenate branch tensor to output of layer enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE); // The parameters used to scale each channel in batch // normalization. The processing in done on a per-channel basis. // e.g. bn_mean[c] is the mean for all pixels in channel c. This // is always applied after activation. 
The output is given by // out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where // norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c] // here we assume that the effect of variance_epsilon is already // taken into account when bn_std is calculated. The pointers // needs to be either all zero or all valid. If all zero, then // batchnorm is disabled, else batchnorm is applied. struct CNN_BATCHNORM_PARAMS { const float *bn_gamma; const float *bn_beta; const float *bn_mean; const float *bn_std; }; struct CNN_BRANCH_CONFIG { int input_to_branches; // If nonzero, copy the active tensor to the current // layer and store for future use in branches // specified in the field as a binary mask. For // example, if input_to_branch = 0x06, it means the // input tensor to the current branch is copied to // branches 1 and 2 (where 0 represents the primary // branch). One restriction is that the mask // cannot indicate copying to the current branch. // If greater than 0, only copies the channels up // to the given index. int channels_to_copy; // Within the layer, input a copy of active // tensor to branches given in input_to_branches. int branches_to_combine; // mask of branches to combine with output of // current layer, if // branch_combine_type != BRANCH_NOC // For example, if branches_to_combine = 0x0A, // it means that braches 1 and 3 are combined // with the current branch. }; struct CNN_LAYER_CONFIG { int in_channels; int filter_width; int filter_height; int out_channels; int skip_width; int skip_height; int maxpool; // whether to use maxpool or not (only effective when // skip width or skip_height are > 1) const float *weights; // array of length filter_height x filter_width x // in_channels x out_channels where the inner-most // scan is out_channels and the outer most scan is // filter_height. const float *bias; // array of length out_channels PADDING_TYPE pad; // padding type ACTIVATION activation; // the activation function to use after convolution int deconvolve; // whether this is a deconvolution layer. // 0: If skip_width or skip_height are > 1, then we // reduce resolution // 1: If skip_width or skip_height are > 1, then we // increase resolution int branch; // branch index in [0, CNN_MAX_BRANCHES - 1], where // 0 refers to the primary branch. BRANCH_COPY branch_copy_type; BRANCH_COMBINE branch_combine_type; struct CNN_BRANCH_CONFIG branch_config; struct CNN_BATCHNORM_PARAMS bn_params; // A struct that contains the parameters // used for batch normalization. int output_num; // The output buffer idx to which the layer output is // written. Set to -1 to disable writing it to the output. In // the case that branch_combine_type is BRANCH_CAT, all // concatenated channels will be written to output. In the // case of BRANCH_ADD, the output will be the result of // summation. }; struct CNN_CONFIG { int num_layers; // number of CNN layers ( = number of hidden layers + 1) int is_residue; // whether the output activation is a residue int ext_width, ext_height; // extension horizontally and vertically int strict_bounds; // whether the input bounds are strict or not. // If strict, the extension area is filled by // replication; if not strict, image data is // assumed available beyond the bounds. 
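  /* Illustrative example (not part of the original header), using
   * hypothetical values: with
   *
   *   ext_width = 2, ext_height = 2, strict_bounds = 1
   *
   * an 8x8 source block becomes a 12x12 float input before prediction, and
   * the 2-pixel border is filled by replicating the nearest edge pixel (see
   * av1_cnn_predict_img_multi_out). With strict_bounds = 0 the border is
   * read directly from the source buffer, which must then be readable 2
   * pixels beyond the block on every side. */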
  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
};

struct CNN_THREAD_DATA {
  int num_workers;
  AVxWorker *workers;
};

struct CNN_MULTI_OUT {
  int num_outputs;
  const int *output_channels;
  const int *output_strides;
  float **output_buffer;
};

// Function to return size of output
void av1_find_cnn_output_size(int in_width, int in_height,
                              const CNN_CONFIG *cnn_config, int *out_width,
                              int *out_height, int *out_channels);

// Function to return output width and output height of given layer.
void av1_find_cnn_layer_output_size(int in_width, int in_height,
                                    const CNN_LAYER_CONFIG *layer_config,
                                    int *out_width, int *out_height);

// Prediction functions from set of input image buffers. This function supports
// CNN with multiple outputs.
bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
                                   int stride, const CNN_CONFIG *cnn_config,
                                   const CNN_THREAD_DATA *thread_data,
                                   struct CNN_MULTI_OUT *output);
bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width,
                                          int height, int stride,
                                          const CNN_CONFIG *cnn_config,
                                          const CNN_THREAD_DATA *thread_data,
                                          int bit_depth, CNN_MULTI_OUT *output);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AV1_ENCODER_CNN_H_
aom-3.12.1/av1/encoder/compound_type.c000066400000000000000000002170411477627663500175450ustar00rootroot00000000000000/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include "av1/common/pred_common.h" #include "av1/encoder/compound_type.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/rdopt_utils.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tx_search.h" typedef int64_t (*pick_interinter_mask_type)( const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, const int16_t *const residual1, const int16_t *const diff10, uint64_t *best_sse); // Checks if characteristics of search match static inline int is_comp_rd_match(const AV1_COMP *const cpi, const MACROBLOCK *const x, const COMP_RD_STATS *st, const MB_MODE_INFO *const mi, int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, int64_t *comp_model_dist, int *comp_rs2) { // TODO(ranjit): Ensure that compound type search use regular filter always // and check if following check can be removed // Check if interp filter matches with previous case if (st->filter.as_int != mi->interp_filters.as_int) return 0; const MACROBLOCKD *const xd = &x->e_mbd; // Match MV and reference indices for (int i = 0; i < 2; ++i) { if ((st->ref_frames[i] != mi->ref_frame[i]) || (st->mv[i].as_int != mi->mv[i].as_int)) { return 0; } const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]]; if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; } int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 }; // For compound wedge, reuse data if newmv search is disabled when NEWMV is // present or if NEWMV is not present in either of the directions if ((!have_newmv_in_inter_mode(mi->mode) && !have_newmv_in_inter_mode(st->mode)) || (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) reuse_data[COMPOUND_WEDGE] = 1; // For compound diffwtd, reuse data if fast search is enabled (no newmv search // when NEWMV is present) or if NEWMV is not present in either of the // directions if (cpi->sf.inter_sf.enable_fast_compound_mode_search || (!have_newmv_in_inter_mode(mi->mode) && !have_newmv_in_inter_mode(st->mode))) reuse_data[COMPOUND_DIFFWTD] = 1; // Store the stats for the different compound types for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES; comp_type++) { if (reuse_data[comp_type]) { comp_rate[comp_type] = st->rate[comp_type]; comp_dist[comp_type] = st->dist[comp_type]; comp_model_rate[comp_type] = st->model_rate[comp_type]; comp_model_dist[comp_type] = st->model_dist[comp_type]; comp_rs2[comp_type] = st->comp_rs2[comp_type]; } } return 1; } // Checks if similar compound type search case is accounted earlier // If found, returns relevant rd data static inline int find_comp_rd_in_stats(const AV1_COMP *const cpi, const MACROBLOCK *x, const MB_MODE_INFO *const mbmi, int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, int64_t *comp_model_dist, int *comp_rs2, int *match_index) { for (int j = 0; j < x->comp_rd_stats_idx; ++j) { if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate, comp_dist, comp_model_rate, comp_model_dist, comp_rs2)) { *match_index = j; return 1; } } return 0; // no match result found } static inline bool enable_wedge_search( MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) { // Enable wedge search if source variance and edge strength are above // the thresholds. 
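  /* Illustrative note (not part of the original source): with a speed-feature
   * setting of disable_interinter_wedge_var_thresh == 100, a block whose
   * x->source_variance is 40 fails this test and COMPOUND_WEDGE is never
   * evaluated for it, while a variance of 400 lets the wedge mask search run.
   * A threshold of 0 enables the search for every block with non-zero source
   * variance, and UINT_MAX disables it entirely. */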
return x->source_variance > disable_wedge_var_thresh; } static inline bool enable_wedge_interinter_search(MACROBLOCK *const x, const AV1_COMP *const cpi) { return enable_wedge_search( x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) && cpi->oxcf.comp_type_cfg.enable_interinter_wedge; } static inline bool enable_wedge_interintra_search(MACROBLOCK *const x, const AV1_COMP *const cpi) { return enable_wedge_search( x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) && cpi->oxcf.comp_type_cfg.enable_interintra_wedge; } static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize, const uint8_t *pred0, int stride0, const uint8_t *pred1, int stride1) { static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = { // 4X4 BLOCK_INVALID, // 4X8, 8X4, 8X8 BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, // 8X16, 16X8, 16X16 BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, // 16X32, 32X16, 32X32 BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, // 32X64, 64X32, 64X64 BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, // 64x128, 128x64, 128x128 BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, // 4X16, 16X4, 8X32 BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, // 32X8, 16X64, 64X16 BLOCK_16X4, BLOCK_8X32, BLOCK_32X8 }; const struct macroblock_plane *const p = &x->plane[0]; const uint8_t *src = p->src.buf; int src_stride = p->src.stride; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int bw_by2 = bw >> 1; const int bh_by2 = bh >> 1; uint32_t esq[2][2]; int64_t tl, br; const BLOCK_SIZE f_index = split_qtr[bsize]; assert(f_index != BLOCK_INVALID); if (is_cur_buf_hbd(&x->e_mbd)) { pred0 = CONVERT_TO_BYTEPTR(pred0); pred1 = CONVERT_TO_BYTEPTR(pred1); } // Residual variance computation over relevant quandrants in order to // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1), // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0) // The 2nd and 3rd quadrants cancel out in TL + BR // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0) // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants) // for all codebooks; experiment with other quadrant combinations for // 0, 90 and 135 degrees also. cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, pred0 + bh_by2 * stride0 + bw_by2, stride0, &esq[0][1]); cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, pred1 + bh_by2 * stride1 + bw_by2, stride0, &esq[1][1]); tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]); br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]); return (tl + br > 0); } // Choose the best wedge index and sign static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const int16_t *const residual1, const int16_t *const diff10, int8_t *const best_wedge_sign, int8_t *const best_wedge_index, uint64_t *best_sse) { const MACROBLOCKD *const xd = &x->e_mbd; const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; int8_t wedge_index; int8_t wedge_sign; const int8_t wedge_types = get_wedge_types_lookup(bsize); const uint8_t *mask; uint64_t sse; const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0 #if CONFIG_AV1_HIGHBITDEPTH if (hbd) { aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p0), bw); } else { aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw); } #else (void)hbd; aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw); #endif int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) - (int64_t)aom_sum_squares_i16(residual1, N)) * (1 << WEDGE_WEIGHT_BITS) / 2; int16_t *ds = residual0; av1_wedge_compute_delta_squares(ds, residual0, residual1, N); for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, &rate, &dist); // int rate2; // int64_t dist2; // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2); // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n", // sse, rate, dist, rate2, dist2); dist = dist2; // rate = rate2; rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; *best_wedge_sign = wedge_sign; best_rd = rd; *best_sse = sse; } } return best_rd - RDCOST(x->rdmult, x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); } // Choose the best wedge index the specified sign static int64_t pick_wedge_fixed_sign( const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const int16_t *const residual1, const int16_t *const diff10, const int8_t wedge_sign, int8_t *const best_wedge_index, uint64_t *best_sse) { const MACROBLOCKD *const xd = &x->e_mbd; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = bw * bh; assert(N >= 64); int rate; int64_t dist; int64_t rd, best_rd = INT64_MAX; int8_t wedge_index; const int8_t wedge_types = get_wedge_types_lookup(bsize); const uint8_t *mask; uint64_t sse; const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, &rate, &dist); rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; rd = RDCOST(x->rdmult, rate, dist); if (rd < best_rd) { *best_wedge_index = wedge_index; best_rd = rd; *best_sse = sse; } } return best_rd - RDCOST(x->rdmult, x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); } static int64_t pick_interinter_wedge( const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, const int16_t *const residual1, const int16_t *const diff10, uint64_t *best_sse) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; int64_t rd; int8_t wedge_index = -1; int8_t wedge_sign = 0; assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); assert(cpi->common.seq_params->enable_masked_compound); if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, &wedge_index, best_sse); } else { rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, &wedge_index, best_sse); } mbmi->interinter_comp.wedge_sign = wedge_sign; mbmi->interinter_comp.wedge_index = wedge_index; return rd; } static int64_t pick_interinter_seg(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1, const int16_t *const residual1, const int16_t *const diff10, uint64_t *best_sse) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int N = 1 << num_pels_log2_lookup[bsize]; int rate; int64_t dist; DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; DIFFWTD_MASK_TYPE best_mask_type = 0; const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; // try each mask type and its inverse for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { // build mask and inverse #if CONFIG_AV1_HIGHBITDEPTH if (hbd) av1_build_compound_diffwtd_mask_highbd( tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); else av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0, bw, p1, bw, bh, bw); #else (void)hbd; av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0, bw, p1, bw, bh, bw); #endif // CONFIG_AV1_HIGHBITDEPTH // compute rd for mask uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, tmp_mask[cur_mask_type], N); sse = ROUND_POWER_OF_TWO(sse, bd_round); model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, &rate, &dist); const int64_t rd0 = RDCOST(x->rdmult, rate, dist); if (rd0 < best_rd) { best_mask_type = cur_mask_type; best_rd = rd0; *best_sse = sse; } } mbmi->interinter_comp.mask_type = best_mask_type; if (best_mask_type == DIFFWTD_38_INV) { memcpy(xd->seg_mask, seg_mask, N * 2); } return best_rd; } static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1) { const MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(av1_is_wedge_used(bsize)); assert(cpi->common.seq_params->enable_interintra_compound); const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p1), bw); aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, CONVERT_TO_BYTEPTR(p0), bw); } else { aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); } #else aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); #endif int8_t wedge_index = -1; uint64_t sse; int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index, &sse); mbmi->interintra_wedge_index = wedge_index; return rd; } static inline void get_inter_predictors_masked_compound( MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides) { MACROBLOCKD *xd = &x->e_mbd; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; // get inter predictors to use for masked compound modes av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, strides); av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, strides); const struct buf_2d *const src = &x->plane[0].src; #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(*preds1), bw); aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), bw, CONVERT_TO_BYTEPTR(*preds0), bw); } else { aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); aom_subtract_block(bh, bw, 
diff10, bw, *preds1, bw, *preds0, bw); } #else aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); #endif } // Computes the rd cost for the given interintra mode and updates the best static inline void compute_best_interintra_mode( const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, MACROBLOCK *const x, const int *const interintra_mode_cost, const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd, INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; int rate; uint8_t skip_txfm_sb; int64_t dist, skip_sse_sb; const int bw = block_size_wide[bsize]; mbmi->interintra_mode = interintra_mode; int rmode = interintra_mode_cost[interintra_mode]; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist, &skip_txfm_sb, &skip_sse_sb, NULL, NULL, NULL); int64_t rd = RDCOST(x->rdmult, rate + rmode, dist); if (rd < *best_interintra_rd) { *best_interintra_rd = rd; *best_interintra_mode = mbmi->interintra_mode; } } static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, MACROBLOCK *x, int64_t ref_best_rd, RD_STATS *rd_stats) { MACROBLOCKD *const xd = &x->e_mbd; if (ref_best_rd < 0) return INT64_MAX; av1_subtract_plane(x, bs, 0); const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs]); if (rd != INT64_MAX) { const int skip_ctx = av1_get_skip_txfm_context(xd); if (rd_stats->skip_txfm) { const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1]; rd_stats->rate = s1; } else { const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0]; rd_stats->rate += s0; } } return rd; } // Computes the rd_threshold for smooth interintra rd search. 
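// Illustrative example (not part of the original source), with made-up
// numbers: if the scaled best-so-far threshold derived from ref_best_rd is
// 12000 and merely signalling the interintra mode costs
// RDCOST(x->rdmult, total_mode_rate, 0) == 3000, the value returned below is
// 12000 - 3000 == 9000, i.e. the RD budget left for coding the residual.
// estimate_yrd_for_sb() is then free to give up and return INT64_MAX once the
// block can no longer fit under that budget.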
static inline int64_t compute_rd_thresh(MACROBLOCK *const x, int total_mode_rate, int64_t ref_best_rd) { const int64_t rd_thresh = get_rd_thresh_from_best_rd( ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), INTER_INTRA_RD_THRESH_SCALE); const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0); return (rd_thresh - mode_rd); } // Computes the best wedge interintra mode static inline int64_t compute_best_wedge_interintra( const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, MACROBLOCK *const x, const int *const interintra_mode_cost, const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const int bw = block_size_wide[bsize]; int64_t best_interintra_rd_wedge = INT64_MAX; int64_t best_total_rd = INT64_MAX; uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) { mbmi->interintra_mode = mode; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); const int rate_overhead = interintra_mode_cost[mode] + x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index]; const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0); if (total_rd < best_total_rd) { best_total_rd = total_rd; best_interintra_rd_wedge = rd; *best_mode = mbmi->interintra_mode; *best_wedge_index = mbmi->interintra_wedge_index; } } return best_interintra_rd_wedge; } static int handle_smooth_inter_intra_mode( const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv, INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd, int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf, uint8_t *intrapred, HandleInterModeArgs *args) { MACROBLOCKD *xd = &x->e_mbd; const ModeCosts *mode_costs = &x->mode_costs; const int *const interintra_mode_cost = mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; const AV1_COMMON *const cm = &cpi->common; const int bw = block_size_wide[bsize]; mbmi->use_wedge_interintra = 0; if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 || *best_interintra_mode == INTERINTRA_MODES) { int64_t best_interintra_rd = INT64_MAX; for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; ++cur_mode) { if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || cpi->sf.intra_sf.disable_smooth_intra) && cur_mode == II_SMOOTH_PRED) continue; compute_best_interintra_mode( cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf, best_interintra_mode, &best_interintra_rd, cur_mode, bsize); } args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; } assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra, *best_interintra_mode != II_SMOOTH_PRED)); // Recompute prediction if required bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode || *best_interintra_mode != INTERINTRA_MODES; if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) { mbmi->interintra_mode = *best_interintra_mode; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); } // Compute rd cost for best smooth_interintra RD_STATS rd_stats; const int is_wedge_used = av1_is_wedge_used(bsize); const int rmode = interintra_mode_cost[*best_interintra_mode] + (is_wedge_used ? 
mode_costs->wedge_interintra_cost[bsize][0] : 0); const int total_mode_rate = rmode + *rate_mv; const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd); int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats); if (rd != INT64_MAX) { rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist); } else { return IGNORE_MODE; } *best_rd = rd; *best_mode_rate = rmode; // Return early if best rd not good enough if (ref_best_rd < INT64_MAX && (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE > ref_best_rd) { return IGNORE_MODE; } return 0; } static int handle_wedge_inter_intra_mode( const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_, uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred, HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead, int_mv *tmp_mv, int64_t best_rd_no_wedge) { MACROBLOCKD *xd = &x->e_mbd; const ModeCosts *mode_costs = &x->mode_costs; const int *const interintra_mode_cost = mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; const AV1_COMMON *const cm = &cpi->common; const int bw = block_size_wide[bsize]; const int try_smooth_interintra = cpi->oxcf.comp_type_cfg.enable_smooth_interintra; mbmi->use_wedge_interintra = 1; if (!cpi->sf.inter_sf.fast_interintra_wedge_search) { // Exhaustive search of all wedge and mode combinations. int best_mode = 0; int best_wedge_index = 0; *best_rd = compute_best_wedge_interintra( cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_, &best_mode, &best_wedge_index, bsize); mbmi->interintra_mode = best_mode; mbmi->interintra_wedge_index = best_wedge_index; if (best_mode != INTERINTRA_MODES - 1) { av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); } } else if (!try_smooth_interintra) { if (*best_interintra_mode == INTERINTRA_MODES) { mbmi->interintra_mode = INTERINTRA_MODES - 1; *best_interintra_mode = INTERINTRA_MODES - 1; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); // Pick wedge mask based on INTERINTRA_MODES - 1 *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); // Find the best interintra mode for the chosen wedge mask for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; ++cur_mode) { compute_best_interintra_mode( cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize); } args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; mbmi->interintra_mode = *best_interintra_mode; // Recompute prediction if required if (*best_interintra_mode != INTERINTRA_MODES - 1) { av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); } } else { // Pick wedge mask for the best interintra mode (reused) mbmi->interintra_mode = *best_interintra_mode; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); } } else { // Pick wedge mask for the best interintra mode from smooth_interintra *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); } *rate_overhead = interintra_mode_cost[mbmi->interintra_mode] + mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] + mode_costs->wedge_interintra_cost[bsize][1]; *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0); int64_t rd = 
INT64_MAX; const int_mv mv0 = mbmi->mv[0]; // Refine motion vector for NEWMV case. if (have_newmv_in_inter_mode(mbmi->mode)) { int rate_sum; uint8_t skip_txfm_sb; int64_t dist_sum, skip_sse_sb; // get negative of mask const uint8_t *mask = av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize); av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred, mask, bw, tmp_rate_mv, 0); if (mbmi->mv[0].as_int != tmp_mv->as_int) { mbmi->mv[0].as_int = tmp_mv->as_int; // Set ref_frame[1] to NONE_FRAME temporarily so that the intra // predictor is not calculated again in av1_enc_build_inter_predictor(). mbmi->ref_frame[1] = NONE_FRAME; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); mbmi->ref_frame[1] = INTRA_FRAME; av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf, xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw); model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum); } } if (rd >= *best_rd) { tmp_mv->as_int = mv0.as_int; *tmp_rate_mv = *rate_mv; av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); } // Evaluate closer to true rd RD_STATS rd_stats; const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0); const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd; rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); if (rd != INT64_MAX) { rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate, rd_stats.dist); } else { if (*best_rd == INT64_MAX) return IGNORE_MODE; } *best_rd = rd; return 0; } int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, HandleInterModeArgs *args, int64_t ref_best_rd, int *rate_mv, int *tmp_rate2, const BUFFER_SET *orig_dst) { const int try_smooth_interintra = cpi->oxcf.comp_type_cfg.enable_smooth_interintra; const int is_wedge_used = av1_is_wedge_used(bsize); const int try_wedge_interintra = is_wedge_used && enable_wedge_interintra_search(x, cpi); const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int bw = block_size_wide[bsize]; DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // Single reference inter prediction mbmi->ref_frame[1] = NONE_FRAME; xd->plane[0].dst.buf = tmp_buf; xd->plane[0].dst.stride = bw; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); const int num_planes = av1_num_planes(cm); // Restore the buffers for intra prediction restore_dst_buf(xd, *orig_dst, num_planes); mbmi->ref_frame[1] = INTRA_FRAME; INTERINTRA_MODE best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]]; // Compute smooth_interintra int64_t best_interintra_rd_nowedge = INT64_MAX; int best_mode_rate = INT_MAX; if (try_smooth_interintra) { int ret = handle_smooth_inter_intra_mode( cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode, &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf, intrapred, args); if (ret == IGNORE_MODE) { return IGNORE_MODE; } } // Compute wedge interintra int64_t 
best_interintra_rd_wedge = INT64_MAX; const int_mv mv0 = mbmi->mv[0]; int_mv tmp_mv = mv0; int tmp_rate_mv = 0; int rate_overhead = 0; if (try_wedge_interintra) { int ret = handle_wedge_inter_intra_mode( cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode, &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_, intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv, best_interintra_rd_nowedge); if (ret == IGNORE_MODE) { return IGNORE_MODE; } } if (best_interintra_rd_nowedge == INT64_MAX && best_interintra_rd_wedge == INT64_MAX) { return IGNORE_MODE; } if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { mbmi->mv[0].as_int = tmp_mv.as_int; *tmp_rate2 += tmp_rate_mv - *rate_mv; *rate_mv = tmp_rate_mv; best_mode_rate = rate_overhead; } else if (try_smooth_interintra && try_wedge_interintra) { // If smooth was best, but we over-wrote the values when evaluating the // wedge mode, we need to recompute the smooth values. mbmi->use_wedge_interintra = 0; mbmi->interintra_mode = best_interintra_mode; mbmi->mv[0].as_int = mv0.as_int; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } *tmp_rate2 += best_mode_rate; if (num_planes > 1) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_U, num_planes - 1); } return 0; } // Computes the valid compound_types to be evaluated static inline int compute_valid_comp_types(MACROBLOCK *x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask, COMPOUND_TYPE *valid_comp_types) { const AV1_COMMON *cm = &cpi->common; int valid_type_count = 0; int comp_type, valid_check; int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 }; const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); const int try_distwtd_comp = ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 && cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; comp_type++) { valid_check = (comp_type == COMPOUND_AVERAGE) ? 
try_average_comp : try_distwtd_comp; if (valid_check && is_interinter_compound_used(comp_type, bsize)) valid_comp_types[valid_type_count++] = comp_type; } // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases if (masked_compound_used) { // enable_masked_type[0] corresponds to COMPOUND_WEDGE // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD enable_masked_type[0] = enable_wedge_interinter_search(x, cpi); enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp; for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD; comp_type++) { if ((mode_search_mask & (1 << comp_type)) && is_interinter_compound_used(comp_type, bsize) && enable_masked_type[comp_type - COMPOUND_WEDGE]) valid_comp_types[valid_type_count++] = comp_type; } } return valid_type_count; } // Calculates the cost for compound type mask static inline void calc_masked_type_cost( const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx, int comp_index_ctx, int masked_compound_used, int *masked_type_cost) { av1_zero_array(masked_type_cost, COMPOUND_TYPES); // Account for group index cost when wedge and/or diffwtd prediction are // enabled if (masked_compound_used) { // Compound group index of average and distwtd is 0 // Compound group index of wedge and diffwtd is 1 masked_type_cost[COMPOUND_AVERAGE] += mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0]; masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE]; masked_type_cost[COMPOUND_WEDGE] += mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1]; masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE]; } // Compute the cost to signal compound index/type masked_type_cost[COMPOUND_AVERAGE] += mode_costs->comp_idx_cost[comp_index_ctx][1]; masked_type_cost[COMPOUND_DISTWTD] += mode_costs->comp_idx_cost[comp_index_ctx][0]; masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0]; masked_type_cost[COMPOUND_DIFFWTD] += mode_costs->compound_type_cost[bsize][1]; } // Updates mbmi structure with the relevant compound type info static inline void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, COMPOUND_TYPE cur_type) { mbmi->interinter_comp.type = cur_type; mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD); } // When match is found, populate the compound type data // and calculate the rd cost using the stored stats and // update the mbmi appropriately. 
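// Illustrative example (not part of the original source), with hypothetical
// cached values: if an earlier evaluation of the same MVs, reference frames
// and interpolation filter stored
//   comp_rate[COMPOUND_WEDGE] = 150,
//   comp_dist[COMPOUND_WEDGE] = 9000,
//   comp_rs2[COMPOUND_WEDGE]  = 40,
// and COMPOUND_WEDGE was the winning type, the helper below simply rebuilds
//   *rd = RDCOST(x->rdmult, 40 + *rate_mv + 150, 9000);
// and copies the stored wedge parameters into mbmi, skipping both the mask
// search and the transform search.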
static inline int populate_reuse_comp_type_data( const MACROBLOCK *x, MB_MODE_INFO *mbmi, BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, int match_index) { const int winner_comp_type = x->comp_rd_stats[match_index].interinter_comp.type; if (comp_rate[winner_comp_type] == INT_MAX) return best_type_stats->best_compmode_interinter_cost; update_mbmi_for_compound_type(mbmi, winner_comp_type); mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp; *rd = RDCOST( x->rdmult, comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type], comp_dist[winner_comp_type]); mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; return comp_rs2[winner_comp_type]; } // Updates rd cost and relevant compound type data for the best compound type static inline void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, BEST_COMP_TYPE_STATS *best_type_stats, int64_t best_rd_cur, int64_t comp_model_rd_cur, int rs2) { *rd = best_rd_cur; best_type_stats->comp_best_model_rd = comp_model_rd_cur; best_type_stats->best_compound_data = mbmi->interinter_comp; best_type_stats->best_compmode_interinter_cost = rs2; } // Updates best_mv for masked compound types static inline void update_mask_best_mv(const MB_MODE_INFO *const mbmi, int_mv *best_mv, int *best_tmp_rate_mv, int tmp_rate_mv) { *best_tmp_rate_mv = tmp_rate_mv; best_mv[0].as_int = mbmi->mv[0].as_int; best_mv[1].as_int = mbmi->mv[1].as_int; } static inline void save_comp_rd_search_stat( MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, const int64_t *comp_dist, const int32_t *comp_model_rate, const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { const int offset = x->comp_rd_stats_idx; if (offset < MAX_COMP_RD_STATS) { COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate)); memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist)); memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2)); memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); rd_stats->mode = mbmi->mode; rd_stats->filter = mbmi->interp_filters; rd_stats->ref_mv_idx = mbmi->ref_mv_idx; const MACROBLOCKD *const xd = &x->e_mbd; for (int i = 0; i < 2; ++i) { const WarpedMotionParams *const wm = &xd->global_motion[mbmi->ref_frame[i]]; rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); } memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp, sizeof(rd_stats->interinter_comp)); ++x->comp_rd_stats_idx; } } static inline int get_interinter_compound_mask_rate( const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) { const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD if (compound_type == COMPOUND_WEDGE) { return av1_is_wedge_used(mbmi->bsize) ? 
av1_cost_literal(1) + mode_costs ->wedge_idx_cost[mbmi->bsize] [mbmi->interinter_comp.wedge_index] : 0; } else { assert(compound_type == COMPOUND_DIFFWTD); return av1_cost_literal(1); } } // Takes a backup of rate, distortion and model_rd for future reuse static inline void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, int64_t *comp_model_dist, int rate_sum, int64_t dist_sum, RD_STATS *rd_stats, int *comp_rs2, int rs2) { comp_rate[cur_type] = rd_stats->rate; comp_dist[cur_type] = rd_stats->dist; comp_model_rate[cur_type] = rate_sum; comp_model_dist[cur_type] = dist_sum; comp_rs2[cur_type] = rs2; } static inline int save_mask_search_results(const PREDICTION_MODE this_mode, const int reuse_level) { if (reuse_level || (this_mode == NEW_NEWMV)) return 1; else return 0; } static inline int prune_mode_by_skip_rd(const AV1_COMP *const cpi, MACROBLOCK *x, MACROBLOCKD *xd, const BLOCK_SIZE bsize, int64_t ref_skip_rd, int mode_rate) { int eval_txfm = 1; const int txfm_rd_gate_level = get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound, cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0); // Check if the mode is good enough based on skip rd if (txfm_rd_gate_level) { int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize); int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4)); eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1); } return eval_txfm; } static int64_t masked_compound_type_rd( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound, int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, int64_t *comp_model_dist, const int64_t comp_best_model_rd, int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int64_t best_rd_cur = INT64_MAX; int64_t rd = INT64_MAX; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD); int rate_sum; uint8_t tmp_skip_txfm_sb; int64_t dist_sum, tmp_skip_sse_sb; pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge, pick_interinter_seg }; // TODO(any): Save pred and mask calculation as well into records. However // this may increase memory requirements as compound segment mask needs to be // stored in each record. 
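  /* Illustrative note (not part of the original source): the single-reference
   * predictions and their residual buffers are built lazily, at most once per
   * set of MVs. The first masked type evaluated (typically COMPOUND_WEDGE)
   * pays for get_inter_predictors_masked_compound(); when COMPOUND_DIFFWTD is
   * tried next with the same MVs, *calc_pred_masked_compound has already been
   * cleared and preds0/preds1/residual1/diff10 are reused unchanged. */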
if (*calc_pred_masked_compound) { get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1, diff10, strides); *calc_pred_masked_compound = 0; } if (compound_type == COMPOUND_WEDGE) { unsigned int sse; if (is_cur_buf_hbd(xd)) (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, CONVERT_TO_BYTEPTR(*preds1), *strides, &sse); else (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse); const unsigned int mse = ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]); // If two predictors are very similar, skip wedge compound mode search if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) { *comp_model_rd_cur = INT64_MAX; return INT64_MAX; } } // Function pointer to pick the appropriate mask // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge() // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg() uint64_t cur_sse = UINT64_MAX; best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE]( cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse); *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); assert(cur_sse != UINT64_MAX); int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4)); // Although the true rate_mv might be different after motion search, but it // is unlikely to be the best mode considering the transform rd cost and other // mode overhead cost int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); if (mode_rd > rd_thresh) { *comp_model_rd_cur = INT64_MAX; return INT64_MAX; } // Check if the mode is good enough based on skip rd // TODO(nithya): Handle wedge_newmv_search if extending for lower speed // setting const int txfm_rd_gate_level = get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound, cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0); if (txfm_rd_gate_level) { int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur, txfm_rd_gate_level, 1); if (!eval_txfm) { *comp_model_rd_cur = INT64_MAX; return INT64_MAX; } } // Compute cost if matching record not found, else, reuse data if (comp_rate[compound_type] == INT_MAX) { // Check whether new MV search for wedge is to be done int wedge_newmv_search = have_newmv_in_inter_mode(this_mode) && (compound_type == COMPOUND_WEDGE) && (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search); // Search for new MV if needed and build predictor if (wedge_newmv_search) { *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } else { *out_rate_mv = rate_mv; av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); } // Get the RD cost from model RD model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); *comp_model_rd_cur = rd; // Override with best if current is worse than best for new MV if (wedge_newmv_search) { if (rd >= best_rd_cur) { mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; *out_rate_mv = rate_mv; av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); *comp_model_rd_cur = best_rd_cur; } } if (cpi->sf.inter_sf.prune_comp_type_by_model_rd && 
(*comp_model_rd_cur > comp_best_model_rd) && comp_best_model_rd != INT64_MAX) { *comp_model_rd_cur = INT64_MAX; return INT64_MAX; } // Compute RD cost for the current type RD_STATS rd_stats; const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0); const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd; rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); if (rd != INT64_MAX) { rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist); // Backup rate and distortion for future reuse backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate, comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2, *rs2); } } else { // Reuse data as matching record is found assert(comp_dist[compound_type] != INT64_MAX); // When disable_interinter_wedge_newmv_search is set, motion refinement is // disabled. Hence rate and distortion can be reused in this case as well assert(IMPLIES((have_newmv_in_inter_mode(this_mode) && (compound_type == COMPOUND_WEDGE)), cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); assert(mbmi->mv[0].as_int == cur_mv[0].as_int); assert(mbmi->mv[1].as_int == cur_mv[1].as_int); *out_rate_mv = rate_mv; // Calculate RD cost based on stored stats rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], comp_dist[compound_type]); // Recalculate model rdcost with the updated rate *comp_model_rd_cur = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type], comp_model_dist[compound_type]); } return rd; } // scaling values to be used for gating wedge/compound segment based on best // approximate rd static const int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; static const int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, int masked_compound_used, const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, const CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, int64_t ref_skip_rd, int *is_luma_interp_done, int64_t rd_thresh) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const PREDICTION_MODE this_mode = mbmi->mode; int ref_frame = av1_ref_frame_type(mbmi->ref_frame); const int bw = block_size_wide[bsize]; int rs2; int_mv best_mv[2]; int best_tmp_rate_mv = *rate_mv; BEST_COMP_TYPE_STATS best_type_stats; // Initializing BEST_COMP_TYPE_STATS best_type_stats.best_compound_data.type = COMPOUND_AVERAGE; best_type_stats.best_compmode_interinter_cost = 0; best_type_stats.comp_best_model_rd = INT64_MAX; uint8_t *preds0[1] = { buffers->pred0 }; uint8_t *preds1[1] = { buffers->pred1 }; int strides[1] = { bw }; int tmp_rate_mv; COMPOUND_TYPE cur_type; // Local array to store the mask cost for different compound types int masked_type_cost[COMPOUND_TYPES]; int calc_pred_masked_compound = 1; int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX }; int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX }; int match_index = 0; const int match_found = find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate, comp_model_dist, comp_rs2, &match_index); best_mv[0].as_int = 
cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; // Local array to store the valid compound types to be evaluated in the core // loop COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = { COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD }; int valid_type_count = 0; // compute_valid_comp_types() returns the number of valid compound types to be // evaluated and populates the same in the local array valid_comp_types[]. // It also sets the flag 'try_average_and_distwtd_comp' valid_type_count = compute_valid_comp_types( x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types); // The following context indices are independent of compound type const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); // Populates masked_type_cost local array for the 4 compound types calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx, comp_index_ctx, masked_compound_used, masked_type_cost); int64_t comp_model_rd_cur = INT64_MAX; int64_t best_rd_cur = ref_best_rd; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // If the match is found, calculate the rd cost using the // stored stats and update the mbmi appropriately. if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) { return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv, comp_rate, comp_dist, comp_rs2, rate_mv, rd, match_index); } // If COMPOUND_AVERAGE is not valid, use the spare buffer if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); // Loop over valid compound types for (int i = 0; i < valid_type_count; i++) { cur_type = valid_comp_types[i]; if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) { if (cur_type == COMPOUND_WEDGE) continue; } comp_model_rd_cur = INT64_MAX; tmp_rate_mv = *rate_mv; best_rd_cur = INT64_MAX; ref_best_rd = AOMMIN(ref_best_rd, *rd); update_mbmi_for_compound_type(mbmi, cur_type); rs2 = masked_type_cost[cur_type]; int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); if (mode_rd >= ref_best_rd) continue; // Derive the flags to indicate enabling/disabling of MV refinement process. 
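    // Summary of the conditions derived below: refinement for
    // COMPOUND_AVERAGE/COMPOUND_DISTWTD is skipped when
    // enable_fast_compound_mode_search is 3, or when it is 2 and the current
    // mode is not NEW_NEWMV; the COMPOUND_DIFFWTD flag is set only when the
    // fast compound mode search feature is disabled. Each flag routes the
    // corresponding compound type to its dedicated evaluation path below.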
const int enable_fast_compound_mode_search = cpi->sf.inter_sf.enable_fast_compound_mode_search; const bool skip_mv_refinement_for_avg_distwtd = enable_fast_compound_mode_search == 3 || (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV)); const bool skip_mv_refinement_for_diffwtd = (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD); // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD if (cur_type < COMPOUND_WEDGE) { if (skip_mv_refinement_for_avg_distwtd) { int rate_sum; uint8_t tmp_skip_txfm_sb; int64_t dist_sum, tmp_skip_sse_sb; // Reuse data if matching record is found if (comp_rate[cur_type] == INT_MAX) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; // Compute RD cost for the current type RD_STATS est_rd_stats; const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; int64_t est_rd = INT64_MAX; int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, rs2 + *rate_mv); // Evaluate further if skip rd is low enough if (eval_txfm) { est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats); } if (est_rd != INT64_MAX) { best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, est_rd_stats.dist); model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); comp_model_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); // Backup rate and distortion for future reuse backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate, comp_model_dist, rate_sum, dist_sum, &est_rd_stats, comp_rs2, rs2); } } else { // Calculate RD cost based on stored stats assert(comp_dist[cur_type] != INT64_MAX); best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], comp_dist[cur_type]); // Recalculate model rdcost with the updated rate comp_model_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type], comp_model_dist[cur_type]); } } else { tmp_rate_mv = *rate_mv; if (have_newmv_in_inter_mode(this_mode)) { InterPredParams inter_pred_params; av1_dist_wtd_comp_weight_assign( &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1); int mask_value = inter_pred_params.conv_params.fwd_offset * 4; memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, rs2 + *rate_mv); if (eval_txfm) { RD_STATS est_rd_stats; estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, est_rd_stats.dist); } } // use spare buffer for following compound type try if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); } else if (cur_type == COMPOUND_WEDGE) { int best_mask_index = 0; int best_wedge_sign = 0; int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] }; int best_rs2 = 0; int best_rate_mv = *rate_mv; int wedge_mask_size = get_wedge_types_lookup(bsize); int need_mask_search = args->wedge_index == -1; int wedge_newmv_search = have_newmv_in_inter_mode(this_mode) && 
!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search; if (need_mask_search && !wedge_newmv_search) { // short cut repeated single reference block build av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, strides); av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, strides); } for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search; ++wedge_mask) { for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) { tmp_rate_mv = *rate_mv; mbmi->interinter_comp.wedge_index = wedge_mask; mbmi->interinter_comp.wedge_sign = wedge_sign; rs2 = masked_type_cost[cur_type]; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); if (mode_rd >= ref_best_rd / 2) continue; if (wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search( cpi, x, cur_mv, bsize, this_mode); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } else { av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); } RD_STATS est_rd_stats; int64_t this_rd_cur = INT64_MAX; int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, rs2 + *rate_mv); if (eval_txfm) { this_rd_cur = estimate_yrd_for_sb( cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats); } if (this_rd_cur < INT64_MAX) { this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, est_rd_stats.dist); } if (this_rd_cur < best_rd_cur) { best_mask_index = wedge_mask; best_wedge_sign = wedge_sign; best_rd_cur = this_rd_cur; tmp_mv[0] = mbmi->mv[0]; tmp_mv[1] = mbmi->mv[1]; best_rate_mv = tmp_rate_mv; best_rs2 = rs2; } } // Consider the asymmetric partitions for oblique angle only if the // corresponding symmetric partition is the best so far. // Note: For horizontal and vertical types, both symmetric and // asymmetric partitions are always considered. if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) { // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16] // correspond to symmetric partitions of the 4 oblique angles, the // next 4 entries correspond to the vertical/horizontal // symmetric/asymmetric partitions and the last 8 entries correspond // to the asymmetric partitions of oblique types. const int idx_before_asym_oblique = 7; const int last_oblique_sym_idx = 3; if (wedge_mask == idx_before_asym_oblique) { if (best_mask_index > last_oblique_sym_idx) { break; } else { // Asymmetric (Index-1) map for the corresponding oblique masks. 
// WEDGE_OBLIQUE27: sym - 0, asym - 8, 9 // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13 // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15 // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11 const int asym_mask_idx[4] = { 7, 11, 13, 9 }; wedge_mask = asym_mask_idx[best_mask_index]; wedge_mask_size = wedge_mask + 3; } } } } if (need_mask_search) { if (save_mask_search_results( this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) { args->wedge_index = best_mask_index; args->wedge_sign = best_wedge_sign; } } else { mbmi->interinter_comp.wedge_index = args->wedge_index; mbmi->interinter_comp.wedge_sign = args->wedge_sign; rs2 = masked_type_cost[cur_type]; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); if (wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } best_mask_index = args->wedge_index; best_wedge_sign = args->wedge_sign; tmp_mv[0] = mbmi->mv[0]; tmp_mv[1] = mbmi->mv[1]; best_rate_mv = tmp_rate_mv; best_rs2 = masked_type_cost[cur_type]; best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, best_rs2 + *rate_mv); if (eval_txfm) { RD_STATS est_rd_stats; estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); best_rd_cur = RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate, est_rd_stats.dist); } } mbmi->interinter_comp.wedge_index = best_mask_index; mbmi->interinter_comp.wedge_sign = best_wedge_sign; mbmi->mv[0] = tmp_mv[0]; mbmi->mv[1] = tmp_mv[1]; tmp_rate_mv = best_rate_mv; rs2 = best_rs2; } else if (skip_mv_refinement_for_diffwtd) { int_mv tmp_mv[2]; int best_mask_index = 0; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); int need_mask_search = args->diffwtd_index == -1; for (int mask_index = 0; mask_index < 2 && need_mask_search; ++mask_index) { tmp_rate_mv = *rate_mv; mbmi->interinter_comp.mask_type = mask_index; if (have_newmv_in_inter_mode(this_mode)) { // hard coded number for diff wtd int mask_value = mask_index == 0 ? 38 : 26; memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); RD_STATS est_rd_stats; int64_t this_rd_cur = INT64_MAX; int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, rs2 + *rate_mv); if (eval_txfm) { this_rd_cur = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); } if (this_rd_cur < INT64_MAX) { this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, est_rd_stats.dist); } if (this_rd_cur < best_rd_cur) { best_rd_cur = this_rd_cur; best_mask_index = mbmi->interinter_comp.mask_type; tmp_mv[0] = mbmi->mv[0]; tmp_mv[1] = mbmi->mv[1]; } } if (need_mask_search) { if (save_mask_search_results(this_mode, 0)) args->diffwtd_index = best_mask_index; } else { mbmi->interinter_comp.mask_type = args->diffwtd_index; rs2 = masked_type_cost[cur_type]; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); int mask_value = mbmi->interinter_comp.mask_type == 0 ? 
38 : 26; memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); if (have_newmv_in_inter_mode(this_mode)) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } best_mask_index = mbmi->interinter_comp.mask_type; tmp_mv[0] = mbmi->mv[0]; tmp_mv[1] = mbmi->mv[1]; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); RD_STATS est_rd_stats; int64_t this_rd_cur = INT64_MAX; int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, rs2 + *rate_mv); if (eval_txfm) { this_rd_cur = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); } if (this_rd_cur < INT64_MAX) { best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, est_rd_stats.dist); } } mbmi->interinter_comp.mask_type = best_mask_index; mbmi->mv[0] = tmp_mv[0]; mbmi->mv[1] = tmp_mv[1]; } else { // Handle masked compound types bool eval_masked_comp_type = true; if (*rd != INT64_MAX) { // Factors to control gating of compound type selection based on best // approximate rd so far const int max_comp_type_rd_threshold_mul = comp_type_rd_threshold_mul[cpi->sf.inter_sf .prune_comp_type_by_comp_avg]; const int max_comp_type_rd_threshold_div = comp_type_rd_threshold_div[cpi->sf.inter_sf .prune_comp_type_by_comp_avg]; // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is // within threshold const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) * max_comp_type_rd_threshold_mul); if (approx_rd >= ref_best_rd) eval_masked_comp_type = false; } if (eval_masked_comp_type) { const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh); best_rd_cur = masked_compound_type_rd( cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound, comp_rate, comp_dist, comp_model_rate, comp_model_dist, best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2, ref_skip_rd); } } // Update stats for best compound type if (best_rd_cur < *rd) { update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, comp_model_rd_cur, rs2); if (have_newmv_in_inter_mode(this_mode)) update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv); } // reset to original mvs for next iteration mbmi->mv[0].as_int = cur_mv[0].as_int; mbmi->mv[1].as_int = cur_mv[1].as_int; } mbmi->comp_group_idx = (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1; mbmi->compound_idx = !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD); mbmi->interinter_comp = best_type_stats.best_compound_data; if (have_newmv_in_inter_mode(this_mode)) { mbmi->mv[0].as_int = best_mv[0].as_int; mbmi->mv[1].as_int = best_mv[1].as_int; rd_stats->rate += best_tmp_rate_mv - *rate_mv; *rate_mv = best_tmp_rate_mv; } if (this_mode == NEW_NEWMV) args->cmp_mode[ref_frame] = mbmi->interinter_comp.type; restore_dst_buf(xd, *orig_dst, 1); if (!match_found) save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, comp_model_dist, cur_mv, comp_rs2); return best_type_stats.best_compmode_interinter_cost; } aom-3.12.1/av1/encoder/compound_type.h000066400000000000000000000041271477627663500175510ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_ #define AOM_AV1_ENCODER_COMPOUND_TYPE_H_ #include "av1/encoder/encoder.h" #include "av1/encoder/interp_search.h" #ifdef __cplusplus extern "C" { #endif // Structure to store the compound type related stats for best compound type typedef struct { INTERINTER_COMPOUND_DATA best_compound_data; int64_t comp_best_model_rd; int best_compmode_interinter_cost; } BEST_COMP_TYPE_STATS; #define IGNORE_MODE -1 // Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode // is found, 0 otherwise. int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, HandleInterModeArgs *args, int64_t ref_best_rd, int *rate_mv, int *tmp_rate2, const BUFFER_SET *orig_dst); int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, int masked_compound_used, const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, const CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, int64_t ref_skip_rd, int *is_luma_interp_done, int64_t rd_thresh); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_ aom-3.12.1/av1/encoder/context_tree.c000066400000000000000000000250631477627663500173640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/context_tree.h" #include "av1/encoder/encoder.h" #include "av1/encoder/rd.h" #include void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, PICK_MODE_CONTEXT *src_ctx) { dst_ctx->mic = src_ctx->mic; dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best; dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; dst_ctx->skippable = src_ctx->skippable; #if CONFIG_INTERNAL_STATS dst_ctx->best_mode_index = src_ctx->best_mode_index; #endif // CONFIG_INTERNAL_STATS memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, sizeof(uint8_t) * src_ctx->num_4x4_blk); av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map, src_ctx->num_4x4_blk); dst_ctx->rd_stats = src_ctx->rd_stats; dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; } void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params, PC_TREE_SHARED_BUFFERS *shared_bufs, struct aom_internal_error_info *error) { const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE; const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size]; const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x + seq_params->subsampling_y); for (int i = 0; i < num_planes; i++) { const int max_num_pix = (i == AOM_PLANE_Y) ? 
max_sb_square_y : max_sb_square_uv; AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i], aom_memalign(32, max_num_pix * sizeof(tran_low_t))); AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i], aom_memalign(32, max_num_pix * sizeof(tran_low_t))); AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i], aom_memalign(32, max_num_pix * sizeof(tran_low_t))); } } void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) { for (int i = 0; i < 3; i++) { aom_free(shared_bufs->coeff_buf[i]); aom_free(shared_bufs->qcoeff_buf[i]); aom_free(shared_bufs->dqcoeff_buf[i]); shared_bufs->coeff_buf[i] = NULL; shared_bufs->qcoeff_buf[i] = NULL; shared_bufs->dqcoeff_buf[i] = NULL; } } PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi, BLOCK_SIZE bsize, PC_TREE_SHARED_BUFFERS *shared_bufs) { PICK_MODE_CONTEXT *volatile ctx = NULL; const AV1_COMMON *const cm = &cpi->common; struct aom_internal_error_info error; if (setjmp(error.jmp)) { av1_free_pmc(ctx, av1_num_planes(cm)); return NULL; } error.setjmp = 1; AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx))); ctx->rd_mode_is_ready = 0; const int num_planes = av1_num_planes(cm); const int num_pix = block_size_wide[bsize] * block_size_high[bsize]; const int num_blk = num_pix / 16; AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip, aom_calloc(num_blk, sizeof(*ctx->blk_skip))); AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map, aom_calloc(num_blk, sizeof(*ctx->tx_type_map))); ctx->num_4x4_blk = num_blk; for (int i = 0; i < num_planes; ++i) { ctx->coeff[i] = shared_bufs->coeff_buf[i]; ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i]; ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i]; AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i], aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); AOM_CHECK_MEM_ERROR( &error, ctx->txb_entropy_ctx[i], aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); } if (num_pix <= MAX_PALETTE_SQUARE) { for (int i = 0; i < 2; ++i) { if (cm->features.allow_screen_content_tools) { AOM_CHECK_MEM_ERROR( &error, ctx->color_index_map[i], aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); } else { ctx->color_index_map[i] = NULL; } } } av1_invalid_rd_stats(&ctx->rd_stats); return ctx; } void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) { av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk); av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk); av1_invalid_rd_stats(&ctx->rd_stats); } void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) { if (ctx == NULL) return; aom_free(ctx->blk_skip); ctx->blk_skip = NULL; aom_free(ctx->tx_type_map); for (int i = 0; i < num_planes; ++i) { ctx->coeff[i] = NULL; ctx->qcoeff[i] = NULL; ctx->dqcoeff[i] = NULL; aom_free(ctx->eobs[i]); ctx->eobs[i] = NULL; aom_free(ctx->txb_entropy_ctx[i]); ctx->txb_entropy_ctx[i] = NULL; } for (int i = 0; i < 2; ++i) { if (ctx->color_index_map[i]) { aom_free(ctx->color_index_map[i]); ctx->color_index_map[i] = NULL; } } aom_free(ctx); } PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) { PC_TREE *pc_tree = aom_calloc(1, sizeof(*pc_tree)); if (pc_tree == NULL) return NULL; pc_tree->partitioning = PARTITION_NONE; pc_tree->block_size = bsize; return pc_tree; } #define FREE_PMC_NODE(CTX) \ do { \ av1_free_pmc(CTX, num_planes); \ CTX = NULL; \ } while (0) void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best, int keep_none, PARTITION_SEARCH_TYPE partition_search_type) { if (pc_tree == NULL) return; // Avoid freeing of extended partitions as they are not supported when // partition_search_type is VAR_BASED_PARTITION. 
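  // In the VAR_BASED_PARTITION (real-time) case only the NONE, HORZ, VERT and
  // SPLIT contexts are ever allocated, so this fast path frees just those and,
  // in debug builds, asserts that the AB and 4-way partition contexts are NULL.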
if (partition_search_type == VAR_BASED_PARTITION && !keep_best && !keep_none) { FREE_PMC_NODE(pc_tree->none); for (int i = 0; i < 2; ++i) { FREE_PMC_NODE(pc_tree->horizontal[i]); FREE_PMC_NODE(pc_tree->vertical[i]); } #if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY for (int i = 0; i < 3; ++i) { assert(pc_tree->horizontala[i] == NULL); assert(pc_tree->horizontalb[i] == NULL); assert(pc_tree->verticala[i] == NULL); assert(pc_tree->verticalb[i] == NULL); } for (int i = 0; i < 4; ++i) { assert(pc_tree->horizontal4[i] == NULL); assert(pc_tree->vertical4[i] == NULL); } #endif for (int i = 0; i < 4; ++i) { if (pc_tree->split[i] != NULL) { av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0, partition_search_type); pc_tree->split[i] = NULL; } } aom_free(pc_tree); return; } const PARTITION_TYPE partition = pc_tree->partitioning; if (!keep_none && (!keep_best || (partition != PARTITION_NONE))) FREE_PMC_NODE(pc_tree->none); for (int i = 0; i < 2; ++i) { if (!keep_best || (partition != PARTITION_HORZ)) FREE_PMC_NODE(pc_tree->horizontal[i]); if (!keep_best || (partition != PARTITION_VERT)) FREE_PMC_NODE(pc_tree->vertical[i]); } #if !CONFIG_REALTIME_ONLY for (int i = 0; i < 3; ++i) { if (!keep_best || (partition != PARTITION_HORZ_A)) FREE_PMC_NODE(pc_tree->horizontala[i]); if (!keep_best || (partition != PARTITION_HORZ_B)) FREE_PMC_NODE(pc_tree->horizontalb[i]); if (!keep_best || (partition != PARTITION_VERT_A)) FREE_PMC_NODE(pc_tree->verticala[i]); if (!keep_best || (partition != PARTITION_VERT_B)) FREE_PMC_NODE(pc_tree->verticalb[i]); } for (int i = 0; i < 4; ++i) { if (!keep_best || (partition != PARTITION_HORZ_4)) FREE_PMC_NODE(pc_tree->horizontal4[i]); if (!keep_best || (partition != PARTITION_VERT_4)) FREE_PMC_NODE(pc_tree->vertical4[i]); } #endif if (!keep_best || (partition != PARTITION_SPLIT)) { for (int i = 0; i < 4; ++i) { if (pc_tree->split[i] != NULL) { av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0, partition_search_type); pc_tree->split[i] = NULL; } } } if (!keep_best && !keep_none) aom_free(pc_tree); } int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { // The structure 'sms_tree' is used to store the simple motion search data for // partition pruning in inter frames. Hence, the memory allocations and // initializations related to it are avoided for allintra encoding mode. if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0; AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); int sms_tree_index = 0; SIMPLE_MOTION_DATA_TREE *this_sms; int square_index = 1; int nodes; aom_free(td->sms_tree); td->sms_tree = (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree)); if (!td->sms_tree) return -1; this_sms = &td->sms_tree[0]; if (!stat_generation_stage) { const int leaf_factor = is_sb_size_128 ? 4 : 1; const int leaf_nodes = 256 * leaf_factor; // Sets up all the leaf nodes in the tree. for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) { SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; tree->block_size = square[0]; } // Each node has 4 leaf nodes, fill each block_size level of the tree // from leafs to the root. 
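    // For example, with a 128x128 superblock leaf_nodes is 1024 (256 * 4) and
    // the loop below visits 256 + 64 + 16 + 4 + 1 internal nodes, giving the
    // 1365 total entries allocated above via av1_get_pc_tree_nodes(); with a
    // 64x64 superblock the counts are 256 leaves plus 85 internal nodes = 341.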
for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { for (int i = 0; i < nodes; ++i) { SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; tree->block_size = square[square_index]; for (int j = 0; j < 4; j++) tree->split[j] = this_sms++; ++sms_tree_index; } ++square_index; } } else { // Allocation for firstpass/LAP stage // TODO(Mufaddal): refactor square_index to use a common block_size macro // from firstpass.c SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; square_index = 2; tree->block_size = square[square_index]; } // Set up the root node for the largest superblock size td->sms_root = &td->sms_tree[tree_nodes - 1]; return 0; } void av1_free_sms_tree(ThreadData *td) { aom_free(td->sms_tree); td->sms_tree = NULL; } aom-3.12.1/av1/encoder/context_tree.h000066400000000000000000000111071477627663500173630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ #define AOM_AV1_ENCODER_CONTEXT_TREE_H_ #include "config/aom_config.h" #include "av1/common/blockd.h" #include "av1/encoder/block.h" #include "av1/encoder/speed_features.h" #ifdef __cplusplus extern "C" { #endif struct AV1_PRIMARY; struct AV1_COMP; struct AV1Common; struct ThreadData; typedef struct { tran_low_t *coeff_buf[MAX_MB_PLANE]; tran_low_t *qcoeff_buf[MAX_MB_PLANE]; tran_low_t *dqcoeff_buf[MAX_MB_PLANE]; } PC_TREE_SHARED_BUFFERS; // Structure to hold snapshot of coding context during the mode picking process typedef struct PICK_MODE_CONTEXT { MB_MODE_INFO mic; MB_MODE_INFO_EXT_FRAME mbmi_ext_best; uint8_t *color_index_map[2]; uint8_t *blk_skip; tran_low_t *coeff[MAX_MB_PLANE]; tran_low_t *qcoeff[MAX_MB_PLANE]; tran_low_t *dqcoeff[MAX_MB_PLANE]; uint16_t *eobs[MAX_MB_PLANE]; uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; uint8_t *tx_type_map; int num_4x4_blk; // For current partition, only if all Y, U, and V transform blocks' // coefficients are quantized to 0, skippable is set to 1. int skippable; #if CONFIG_INTERNAL_STATS THR_MODES best_mode_index; #endif // CONFIG_INTERNAL_STATS RD_STATS rd_stats; int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has // been made. 
#if CONFIG_AV1_TEMPORAL_DENOISING int64_t newmv_sse; int64_t zeromv_sse; int64_t zeromv_lastref_sse; PREDICTION_MODE best_sse_inter_mode; int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; int sb_skip_denoising; #endif } PICK_MODE_CONTEXT; typedef struct PC_TREE { PARTITION_TYPE partitioning; BLOCK_SIZE block_size; PICK_MODE_CONTEXT *none; PICK_MODE_CONTEXT *horizontal[2]; PICK_MODE_CONTEXT *vertical[2]; #if !CONFIG_REALTIME_ONLY PICK_MODE_CONTEXT *horizontala[3]; PICK_MODE_CONTEXT *horizontalb[3]; PICK_MODE_CONTEXT *verticala[3]; PICK_MODE_CONTEXT *verticalb[3]; PICK_MODE_CONTEXT *horizontal4[4]; PICK_MODE_CONTEXT *vertical4[4]; #endif struct PC_TREE *split[4]; int index; } PC_TREE; typedef struct SIMPLE_MOTION_DATA_TREE { BLOCK_SIZE block_size; PARTITION_TYPE partitioning; struct SIMPLE_MOTION_DATA_TREE *split[4]; // Simple motion search_features FULLPEL_MV start_mvs[REF_FRAMES]; unsigned int sms_none_feat[2]; unsigned int sms_rect_feat[8]; int sms_none_valid; int sms_rect_valid; } SIMPLE_MOTION_DATA_TREE; void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params, PC_TREE_SHARED_BUFFERS *shared_bufs, struct aom_internal_error_info *error); void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs); PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize); void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best, int keep_none, PARTITION_SEARCH_TYPE partition_search_type); PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi, BLOCK_SIZE bsize, PC_TREE_SHARED_BUFFERS *shared_bufs); void av1_reset_pmc(PICK_MODE_CONTEXT *ctx); void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes); void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, PICK_MODE_CONTEXT *src_ctx); static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, }; static inline int av1_get_pc_tree_nodes(const int is_sb_size_128, int stat_generation_stage) { const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0; const int tree_nodes = stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1); return tree_nodes; } // Returns 0 on success, -1 on memory allocation failure. int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); void av1_free_sms_tree(struct ThreadData *td); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ aom-3.12.1/av1/encoder/cost.c000066400000000000000000000035631477627663500156320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/encoder/cost.h" #include "av1/common/entropy.h" // round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255. 
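// For example, i = 128 (probability 1/2) gives round(1.0 * 512) = 512, the
// first entry (one bit in AV1_PROB_COST_SHIFT units), and i = 255 gives
// round(-log2(255/256.) * 512) = 3, the last entry.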
const uint16_t av1_prob_cost[128] = { 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26, 23, 20, 18, 15, 12, 9, 6, 3, }; void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map) { int i; aom_cdf_prob prev_cdf = 0; for (i = 0;; ++i) { aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15; prev_cdf = AOM_ICDF(cdf[i]); if (inv_map) costs[inv_map[i]] = av1_cost_symbol(p15); else costs[i] = av1_cost_symbol(p15); // Stop once we reach the end of the CDF if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break; } } aom-3.12.1/av1/encoder/cost.h000066400000000000000000000034311477627663500156310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_COST_H_ #define AOM_AV1_ENCODER_COST_H_ #include "aom_dsp/prob.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif extern const uint16_t av1_prob_cost[128]; // The factor to scale from cost in bits to cost in av1_prob_cost units. #define AV1_PROB_COST_SHIFT 9 // Cost of coding an n bit literal, using 128 (i.e. 50%) probability // for each bit. #define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) // Calculate the cost of a symbol with probability p15 / 2^15 static inline int av1_cost_symbol(aom_cdf_prob p15) { // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the // following cost calculation works correctly. Otherwise, if p15 = // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. 
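  // As a worked example for an in-range value, p15 = 16384 (probability 1/2):
  // get_msb(16384) = 14, so shift = 0, prob works out to 128, and the returned
  // cost is av1_prob_cost[0] = 512, i.e. one bit in AV1_PROB_COST_SHIFT units.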
p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); assert(0 < p15 && p15 < CDF_PROB_TOP); const int shift = CDF_PROB_BITS - 1 - get_msb(p15); const int prob = get_prob(p15 << shift, CDF_PROB_TOP); assert(prob >= 128); return av1_prob_cost[prob - 128] + av1_cost_literal(shift); } void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, const int *inv_map); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_COST_H_ aom-3.12.1/av1/encoder/deltaq4_model.c000066400000000000000000022764571477627663500174200ustar00rootroot00000000000000/* Embedded file: model.tflite */ const int av1_deltaq4_model_fsize = 101032; const unsigned char av1_deltaq4_model_file[101032] = { 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c, 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00, 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00, 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01, 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00, 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc, 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff, 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64, 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77, 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd, 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60, 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a, 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7, 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd, 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 
0x48, 0x9f, 0x4a, 0x3d, 0xcb, 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76, 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02, 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c, 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9, 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68, 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a, 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d, 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31, 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00, 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba, 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd, 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf, 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5, 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7, 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d, 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d, 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab, 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e, 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d, 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71, 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63, 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45, 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd, 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37, 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c, 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3, 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c, 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74, 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32, 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16, 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd, 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee, 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31, 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26, 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 0x7b, 0xef, 0xc8, 0x3d, 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7, 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94, 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3, 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd, 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf, 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c, 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a, 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba, 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90, 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a, 0xfd, 0x3c, 
0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf, 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc, 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66, 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79, 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17, 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc, 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1, 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c, 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f, 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c, 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4, 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a, 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68, 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd, 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb, 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88, 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54, 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d, 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b, 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58, 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20, 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d, 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f, 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03, 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3, 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc, 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40, 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29, 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb, 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d, 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b, 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c, 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5, 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d, 0x30, 0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda, 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88, 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96, 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c, 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72, 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad, 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54, 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d, 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47, 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27, 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd, 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 
0xbb, 0x60, 0x22, 0xbd, 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45, 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd, 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1, 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d, 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e, 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc, 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7, 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d, 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad, 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38, 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b, 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d, 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76, 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f, 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde, 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd, 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf, 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a, 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68, 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d, 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d, 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3, 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f, 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c, 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21, 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82, 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2, 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d, 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6, 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8, 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2, 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d, 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55, 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24, 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba, 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d, 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32, 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59, 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99, 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd, 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16, 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f, 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52, 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d, 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e, 0x58, 0x98, 0x3d, 
0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8, 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef, 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d, 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06, 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e, 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f, 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d, 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a, 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69, 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5, 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d, 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac, 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0, 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5, 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e, 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe, 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54, 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c, 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d, 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe, 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea, 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94, 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d, 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38, 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68, 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f, 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd, 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea, 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b, 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d, 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d, 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 0x3d, 0xd6, 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8, 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73, 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd, 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89, 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68, 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a, 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd, 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0, 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7, 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c, 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc, 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a, 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12, 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 
0x38, 0xaa, 0x04, 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9, 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f, 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55, 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05, 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd, 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0, 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47, 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90, 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd, 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30, 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34, 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99, 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d, 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26, 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a, 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2, 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe, 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94, 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa, 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac, 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd, 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2, 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40, 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2, 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d, 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c, 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4, 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85, 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d, 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16, 0x46, 0x04, 0xbd, 0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4, 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70, 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd, 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7, 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc, 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6, 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d, 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f, 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c, 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f, 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d, 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7, 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0, 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e, 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd, 0x18, 0xc1, 0xa0, 0x3c, 
0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4, 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89, 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f, 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d, 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9, 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b, 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb, 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d, 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3, 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b, 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac, 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd, 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a, 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e, 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde, 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba, 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06, 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d, 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1, 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d, 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c, 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf, 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1, 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d, 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a, 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76, 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73, 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc, 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a, 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 0xd6, 0x4a, 0xbd, 0xb2, 0xb3, 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63, 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d, 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c, 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e, 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92, 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd, 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8, 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5, 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf, 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d, 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17, 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19, 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1, 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c, 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b, 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 
0xe8, 0xdd, 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba, 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd, 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0, 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c, 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4, 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd, 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e, 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22, 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c, 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d, 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7, 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19, 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7, 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c, 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d, 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37, 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf, 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd, 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13, 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19, 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf, 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd, 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26, 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe, 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5, 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc, 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c, 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29, 0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56, 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc, 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22, 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf, 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04, 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc, 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e, 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95, 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c, 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd, 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb, 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2, 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b, 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d, 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52, 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97, 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f, 0xbc, 0x51, 0x99, 0x79, 0x3d, 
0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c, 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd, 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca, 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15, 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc, 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35, 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf, 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c, 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd, 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f, 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6, 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48, 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd, 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d, 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16, 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45, 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d, 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda, 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb, 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4, 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd, 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01, 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f, 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7, 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d, 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b, 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32, 0x8f, 0xbd, 0xed, 0x11, 0xbe, 0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee, 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d, 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a, 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba, 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc, 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd, 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce, 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4, 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a, 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd, 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44, 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31, 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e, 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d, 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71, 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f, 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e, 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b, 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 
0x2f, 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99, 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f, 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd, 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97, 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe, 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf, 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d, 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9, 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85, 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55, 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd, 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8, 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71, 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a, 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b, 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51, 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2, 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53, 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d, 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa, 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb, 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96, 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d, 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f, 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce, 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 0x35, 0x5e, 0x9e, 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd, 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2, 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d, 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22, 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d, 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16, 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda, 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0, 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd, 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46, 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6, 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3, 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc, 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f, 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9, 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d, 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d, 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d, 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0, 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 
0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0, 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9, 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c, 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8, 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f, 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d, 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1, 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18, 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11, 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d, 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06, 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67, 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68, 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b, 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae, 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde, 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a, 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d, 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed, 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f, 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62, 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d, 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5, 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd, 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a, 0xbd, 0xfe, 0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd, 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77, 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1, 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5, 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd, 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b, 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62, 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97, 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd, 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea, 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b, 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17, 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd, 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20, 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc, 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08, 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d, 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57, 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36, 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e, 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe, 
0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6, 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40, 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04, 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd, 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43, 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23, 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24, 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd, 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88, 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8, 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e, 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd, 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4, 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0, 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd, 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd, 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a, 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e, 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce, 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d, 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc, 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85, 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd, 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd, 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd, 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc, 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a, 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d, 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54, 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61, 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94, 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd, 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8, 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52, 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a, 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d, 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c, 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81, 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa, 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd, 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b, 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96, 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88, 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd, 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad, 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 
0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01, 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46, 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d, 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f, 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6, 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc, 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd, 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24, 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08, 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5, 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd, 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43, 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc, 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51, 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c, 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5, 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f, 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88, 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d, 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60, 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e, 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b, 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 0xbc, 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0, 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04, 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98, 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd, 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68, 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83, 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a, 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d, 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f, 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21, 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93, 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d, 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6, 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea, 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3, 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e, 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1, 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15, 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c, 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d, 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5, 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35, 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf, 0xbd, 
0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd, 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19, 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d, 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5, 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd, 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85, 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c, 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88, 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd, 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d, 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15, 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb, 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd, 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15, 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2, 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d, 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e, 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51, 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7, 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2, 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd, 0x8c, 0xfc, 0xca, 0xbc, 0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa, 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca, 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c, 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe, 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a, 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa, 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e, 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d, 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f, 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf, 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57, 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d, 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31, 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24, 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03, 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d, 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85, 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47, 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7, 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe, 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11, 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75, 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61, 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d, 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 
0x25, 0xcb, 0x0c, 0xbe, 0x3a, 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f, 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa, 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc, 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35, 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57, 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07, 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d, 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92, 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01, 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f, 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d, 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89, 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e, 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff, 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d, 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63, 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d, 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6, 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc, 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 0xe2, 0x4f, 0xbd, 0x54, 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39, 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56, 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd, 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51, 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75, 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad, 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc, 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5, 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a, 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5, 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd, 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63, 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2, 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5, 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd, 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e, 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6, 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67, 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d, 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff, 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf, 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b, 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d, 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0, 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f, 0xca, 0xbd, 
0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7, 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b, 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0, 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93, 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2, 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d, 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24, 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53, 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16, 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc, 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72, 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6, 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb, 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd, 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0, 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42, 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f, 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb, 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43, 0x2a, 0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8, 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4, 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d, 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7, 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb, 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef, 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c, 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea, 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22, 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8, 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb, 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e, 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45, 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2, 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e, 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25, 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec, 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53, 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd, 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7, 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd, 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76, 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d, 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0, 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed, 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7, 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 
0x30, 0x1e, 0x7d, 0x3d, 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3, 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf, 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21, 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc, 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89, 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe, 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0, 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d, 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3, 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36, 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29, 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d, 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e, 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75, 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca, 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb, 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63, 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a, 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f, 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc, 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41, 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63, 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78, 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd, 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4, 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4, 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8, 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd, 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09, 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11, 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7, 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c, 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4, 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc, 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a, 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb, 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2, 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d, 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97, 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d, 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2, 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e, 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46, 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d, 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea, 0x53, 0x75, 0x3d, 
0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0, 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f, 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c, 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6, 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01, 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce, 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c, 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5, 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15, 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16, 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc, 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b, 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d, 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe, 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc, 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9, 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 0x88, 0x2f, 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93, 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc, 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87, 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f, 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90, 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c, 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3, 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16, 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f, 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d, 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1, 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87, 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5, 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd, 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac, 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95, 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c, 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd, 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba, 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1, 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c, 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d, 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90, 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23, 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e, 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd, 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32, 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b, 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 
0x9e, 0x4f, 0x9b, 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d, 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b, 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17, 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5, 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc, 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e, 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9, 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92, 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d, 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76, 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28, 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32, 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd, 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2, 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2, 0x23, 0x3d, 0x17, 0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08, 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc, 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f, 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b, 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16, 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc, 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9, 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef, 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34, 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d, 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf, 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6, 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde, 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd, 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec, 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78, 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1, 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe, 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79, 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e, 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa, 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd, 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89, 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d, 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24, 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd, 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81, 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c, 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41, 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d, 0xf1, 0x20, 0xe3, 0xbc, 
0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20, 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5, 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c, 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d, 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed, 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf, 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9, 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd, 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e, 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb, 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d, 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc, 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96, 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b, 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 0x12, 0x3d, 0x14, 0x07, 0x96, 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d, 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07, 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6, 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1, 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d, 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e, 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88, 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0, 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc, 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60, 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3, 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce, 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c, 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e, 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2, 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00, 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c, 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49, 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24, 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5, 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d, 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d, 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01, 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00, 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b, 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0, 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b, 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f, 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d, 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b, 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 
0x2f, 0x66, 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48, 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd, 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26, 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23, 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc, 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d, 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6, 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18, 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a, 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc, 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05, 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58, 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00, 0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd, 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81, 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac, 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0, 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc, 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27, 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d, 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c, 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d, 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91, 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e, 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8, 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc, 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d, 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7, 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e, 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd, 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d, 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33, 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42, 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d, 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9, 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45, 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2, 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd, 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7, 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90, 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06, 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d, 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64, 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1, 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7, 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 
0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d, 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e, 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69, 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64, 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd, 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a, 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8, 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d, 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc, 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38, 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f, 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb, 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb, 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65, 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d, 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1, 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd, 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0, 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a, 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce, 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe, 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10, 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f, 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf, 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd, 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8, 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49, 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f, 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c, 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c, 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99, 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03, 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d, 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a, 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c, 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2, 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd, 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe, 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78, 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b, 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c, 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88, 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26, 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb, 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe, 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 
0xc5, 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc, 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c, 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd, 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41, 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55, 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96, 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d, 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8, 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5, 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18, 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 0x72, 0xf3, 0x3d, 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44, 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1, 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0, 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d, 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52, 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45, 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b, 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d, 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21, 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1, 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf, 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d, 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68, 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b, 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09, 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d, 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8, 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24, 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd, 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd, 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44, 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86, 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26, 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd, 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6, 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13, 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14, 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d, 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec, 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4, 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99, 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d, 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3, 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06, 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 
0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac, 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd, 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5, 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63, 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb, 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd, 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a, 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e, 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6, 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d, 0x14, 0xc4, 0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42, 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d, 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1, 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd, 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4, 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a, 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4, 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d, 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9, 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12, 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93, 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c, 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09, 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42, 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b, 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d, 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92, 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65, 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01, 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d, 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33, 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4, 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc, 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc, 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37, 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27, 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96, 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe, 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92, 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f, 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3, 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd, 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe, 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe, 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75, 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd, 
0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca, 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d, 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3, 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb, 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc, 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73, 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab, 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c, 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c, 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27, 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee, 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8, 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45, 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab, 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b, 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c, 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2, 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7, 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40, 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd, 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf, 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a, 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31, 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d, 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c, 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5, 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3, 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d, 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b, 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12, 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14, 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b, 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21, 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb, 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85, 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d, 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e, 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e, 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3, 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb, 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18, 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b, 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45, 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d, 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62, 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 
0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46, 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95, 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d, 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d, 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4, 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12, 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd, 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 0x94, 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25, 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1, 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b, 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23, 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24, 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe, 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c, 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39, 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63, 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50, 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd, 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50, 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97, 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96, 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e, 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b, 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65, 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf, 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd, 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0, 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83, 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b, 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd, 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf, 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46, 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30, 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d, 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3, 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14, 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b, 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd, 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda, 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b, 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84, 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd, 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28, 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34, 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42, 0xbd, 
0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c, 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed, 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9, 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab, 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d, 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57, 0xf4, 0xdc, 0x3d, 0x25, 0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7, 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb, 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd, 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32, 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6, 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79, 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd, 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b, 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58, 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf, 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d, 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc, 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb, 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24, 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd, 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4, 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8, 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96, 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d, 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62, 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e, 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4, 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc, 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf, 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54, 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1, 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d, 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d, 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a, 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84, 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d, 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07, 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5, 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba, 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d, 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f, 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b, 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a, 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb, 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 
0x68, 0x44, 0x13, 0x3d, 0x3b, 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6, 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86, 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d, 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83, 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 0xd5, 0x3c, 0x25, 0xec, 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91, 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c, 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89, 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd, 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b, 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c, 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80, 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1, 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9, 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c, 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6, 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f, 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97, 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd, 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3, 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb, 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca, 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba, 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7, 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1, 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20, 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b, 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda, 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27, 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82, 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd, 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d, 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c, 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70, 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc, 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e, 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80, 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4, 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d, 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf, 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d, 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73, 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d, 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9, 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae, 0xa3, 0xbb, 
0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf, 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc, 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81, 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27, 0xa9, 0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51, 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd, 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57, 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b, 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20, 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d, 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4, 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71, 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01, 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d, 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e, 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d, 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84, 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e, 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5, 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3, 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d, 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd, 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98, 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28, 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f, 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e, 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b, 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba, 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1, 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d, 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88, 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6, 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0, 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd, 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6, 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33, 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09, 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d, 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4, 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff, 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02, 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc, 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12, 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54, 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23, 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 
0x08, 0x69, 0x95, 0x3d, 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d, 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d, 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde, 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd, 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9, 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c, 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89, 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc, 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4, 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98, 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e, 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d, 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52, 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95, 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5, 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d, 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea, 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba, 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32, 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd, 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55, 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde, 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83, 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc, 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae, 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2, 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6, 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d, 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85, 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0, 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6, 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d, 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a, 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b, 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12, 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b, 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec, 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f, 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59, 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d, 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71, 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c, 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53, 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd, 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3, 0x40, 0xb4, 0xbd, 
0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc, 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 0x69, 0xa1, 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d, 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb, 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e, 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae, 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc, 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab, 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07, 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8, 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d, 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29, 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a, 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d, 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d, 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa, 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24, 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9, 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d, 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87, 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9, 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab, 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd, 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b, 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7, 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7, 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd, 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0, 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04, 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c, 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d, 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f, 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b, 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7, 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d, 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e, 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91, 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e, 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd, 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88, 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a, 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9, 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d, 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13, 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc, 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 
0x2c, 0x76, 0x80, 0xbd, 0x65, 0x2e, 0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d, 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca, 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf, 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab, 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d, 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06, 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06, 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc, 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc, 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d, 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4, 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea, 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d, 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2, 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94, 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6, 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd, 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d, 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83, 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf, 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c, 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb, 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0, 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4, 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e, 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a, 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95, 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5, 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c, 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13, 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66, 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d, 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d, 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3, 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9, 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a, 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d, 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48, 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f, 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8, 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d, 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95, 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79, 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88, 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 0x3d, 0xb4, 0xe9, 0x59, 0x3d, 0xae, 0x3b, 0x9d, 0x3d, 
0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca, 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02, 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef, 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd, 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9, 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d, 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f, 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd, 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1, 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37, 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02, 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd, 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0, 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42, 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2, 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d, 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda, 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed, 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70, 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd, 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b, 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e, 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29, 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d, 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc, 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3, 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a, 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd, 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9, 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3, 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10, 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd, 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8, 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4, 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a, 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd, 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe, 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84, 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83, 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd, 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd, 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36, 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83, 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd, 0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25, 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 
0x44, 0xcc, 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a, 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd, 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00, 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9, 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43, 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb, 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35, 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2, 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a, 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d, 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe, 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc, 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88, 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d, 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f, 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40, 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f, 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe, 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80, 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92, 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50, 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e, 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad, 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b, 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9, 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd, 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74, 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5, 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab, 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d, 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb, 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f, 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b, 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e, 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1, 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a, 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f, 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d, 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81, 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb, 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3, 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d, 0xf1, 0xf8, 0x16, 0x3e, 0x24, 0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d, 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4, 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb, 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 
0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe, 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc, 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4, 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7, 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d, 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20, 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92, 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5, 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d, 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34, 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43, 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13, 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d, 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20, 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea, 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98, 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e, 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f, 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88, 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b, 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e, 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01, 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c, 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e, 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe, 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8, 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd, 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e, 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd, 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f, 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0, 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b, 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d, 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e, 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa, 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde, 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe, 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda, 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f, 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27, 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd, 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 0x0f, 0xbe, 0x21, 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0, 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b, 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe, 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 
0x28, 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0, 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc, 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd, 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37, 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2, 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b, 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e, 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54, 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6, 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2, 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d, 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89, 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2, 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0, 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d, 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c, 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa, 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82, 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd, 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f, 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0, 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5, 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe, 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9, 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b, 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea, 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc, 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6, 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49, 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12, 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e, 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80, 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33, 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22, 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe, 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68, 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64, 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0, 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e, 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97, 0x05, 0x1c, 0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30, 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c, 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d, 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51, 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb, 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 
0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8, 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d, 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21, 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a, 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf, 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e, 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86, 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78, 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99, 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb, 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37, 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb, 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95, 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d, 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1, 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6, 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46, 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd, 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83, 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84, 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23, 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d, 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde, 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5, 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07, 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d, 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4, 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81, 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76, 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e, 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93, 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7, 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21, 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc, 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1, 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3, 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b, 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd, 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05, 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90, 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c, 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd, 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9, 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9, 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7, 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d, 
0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3, 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6, 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29, 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd, 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9, 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31, 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff, 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d, 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a, 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f, 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b, 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d, 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12, 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb, 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d, 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd, 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53, 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01, 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce, 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe, 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d, 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07, 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a, 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb, 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48, 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90, 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7, 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d, 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11, 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26, 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c, 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e, 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a, 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e, 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48, 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c, 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8, 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 0x87, 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a, 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd, 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d, 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84, 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26, 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe, 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce, 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 
0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8, 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b, 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd, 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74, 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18, 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde, 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd, 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78, 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce, 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda, 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d, 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a, 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84, 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6, 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d, 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0, 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d, 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9, 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd, 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17, 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38, 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20, 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd, 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d, 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5, 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5, 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe, 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27, 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde, 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab, 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd, 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7, 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24, 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01, 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd, 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50, 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c, 0x76, 0x3b, 0x99, 0x36, 0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39, 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a, 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5, 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74, 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4, 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d, 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c, 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a, 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f, 0xbd, 
0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d, 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16, 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a, 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09, 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e, 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74, 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1, 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10, 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb, 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e, 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5, 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4, 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe, 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc, 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29, 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e, 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c, 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b, 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4, 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe, 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a, 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4, 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf, 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25, 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d, 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19, 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1, 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d, 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d, 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef, 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34, 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75, 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc, 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40, 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62, 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 0x3a, 0x54, 0x99, 0xea, 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd, 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4, 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1, 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e, 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a, 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b, 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9, 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe, 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 
0x17, 0x1c, 0x01, 0x3e, 0x48, 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9, 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88, 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe, 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53, 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd, 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4, 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e, 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33, 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b, 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd, 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd, 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81, 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2, 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45, 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c, 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68, 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83, 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe, 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c, 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f, 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe, 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e, 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12, 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7, 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c, 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8, 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83, 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30, 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc, 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20, 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd, 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12, 0x3b, 0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb, 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa, 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d, 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe, 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c, 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1, 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c, 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29, 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd, 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7, 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97, 0x7b, 0xbd, 
0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f, 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b, 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79, 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9, 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b, 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd, 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0, 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c, 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c, 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd, 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91, 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4, 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83, 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c, 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95, 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74, 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb, 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd, 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c, 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1, 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64, 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c, 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4, 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06, 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e, 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c, 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0, 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79, 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d, 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd, 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68, 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d, 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d, 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c, 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95, 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6, 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03, 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d, 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8, 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b, 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66, 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd, 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc, 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1, 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f, 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 
0x41, 0x1e, 0xa0, 0x3c, 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1, 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab, 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26, 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd, 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47, 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf, 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a, 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd, 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e, 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68, 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47, 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd, 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4, 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2, 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c, 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c, 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde, 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6, 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed, 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc, 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd, 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf, 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58, 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c, 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48, 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f, 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52, 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd, 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9, 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90, 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8, 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 0xe3, 0xbc, 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4, 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d, 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09, 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a, 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d, 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a, 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44, 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc, 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55, 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59, 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62, 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c, 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34, 0x0c, 0x84, 0xbd, 
0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4, 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f, 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d, 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71, 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a, 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79, 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d, 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3, 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e, 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80, 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d, 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd, 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43, 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb, 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d, 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61, 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf, 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80, 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd, 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54, 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde, 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43, 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b, 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8, 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c, 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd, 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba, 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57, 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4, 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c, 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd, 0x68, 0xc2, 0x24, 0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43, 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3, 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70, 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd, 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41, 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3, 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80, 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c, 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a, 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a, 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8, 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba, 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc, 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5, 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 
0x83, 0xc6, 0x6b, 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d, 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2, 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b, 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91, 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c, 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52, 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a, 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1, 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d, 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97, 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce, 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01, 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d, 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10, 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09, 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78, 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d, 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98, 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17, 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f, 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc, 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8, 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62, 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e, 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd, 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42, 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3, 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb, 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d, 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 0xf8, 0x65, 0xdd, 0x3b, 0x1c, 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4, 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10, 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd, 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d, 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9, 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83, 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d, 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b, 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0, 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06, 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd, 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc, 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1, 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c, 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c, 0x71, 0x44, 0x6c, 0x3d, 
0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc, 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57, 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f, 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d, 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f, 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6, 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23, 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc, 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd, 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1, 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c, 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d, 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4, 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d, 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6, 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc, 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30, 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f, 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b, 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd, 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02, 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3, 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18, 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c, 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84, 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99, 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9, 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb, 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa, 0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f, 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20, 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d, 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd, 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53, 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9, 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc, 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68, 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a, 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56, 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd, 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89, 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47, 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66, 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d, 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e, 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 
0x5e, 0x09, 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85, 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d, 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd, 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87, 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3, 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb, 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11, 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91, 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c, 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d, 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c, 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b, 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f, 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc, 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4, 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b, 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91, 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb, 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29, 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90, 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde, 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d, 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd, 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41, 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51, 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd, 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58, 0x94, 0x53, 0xbd, 0xa0, 0xcf, 0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40, 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49, 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd, 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75, 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56, 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05, 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd, 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9, 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64, 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90, 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b, 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e, 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf, 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0, 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b, 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1, 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6, 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96, 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 
0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c, 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9, 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59, 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99, 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb, 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70, 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4, 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3, 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba, 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd, 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7, 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87, 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd, 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb, 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22, 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70, 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd, 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf, 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b, 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7, 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c, 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40, 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2, 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d, 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c, 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84, 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 0x3d, 0x90, 0xf8, 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65, 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc, 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5, 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb, 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83, 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d, 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2, 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc, 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d, 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc, 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e, 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83, 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29, 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc, 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd, 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06, 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e, 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d, 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 
0x9a, 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf, 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9, 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb, 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31, 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e, 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7, 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd, 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90, 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc, 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59, 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd, 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10, 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8, 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f, 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc, 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0, 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec, 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6, 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c, 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1, 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7, 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44, 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc, 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0, 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54, 0x10, 0xbd, 0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d, 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d, 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4, 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98, 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f, 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b, 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9, 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4, 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd, 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d, 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c, 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02, 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75, 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd, 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40, 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e, 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50, 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c, 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93, 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e, 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 
0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47, 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d, 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88, 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4, 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15, 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd, 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2, 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26, 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53, 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd, 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8, 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3, 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a, 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd, 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16, 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf, 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b, 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd, 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c, 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15, 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83, 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b, 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53, 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f, 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09, 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c, 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6, 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb, 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d, 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c, 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c, 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22, 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82, 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d, 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb, 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6, 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92, 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc, 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e, 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81, 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3, 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c, 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80, 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4, 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b, 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd, 
0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f, 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f, 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00, 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd, 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46, 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3, 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85, 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d, 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb, 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe, 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7, 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d, 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84, 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e, 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c, 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c, 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11, 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f, 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e, 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd, 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a, 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d, 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 0x30, 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd, 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76, 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a, 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46, 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc, 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a, 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a, 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb, 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c, 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2, 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4, 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34, 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd, 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9, 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46, 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86, 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc, 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46, 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e, 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f, 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd, 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c, 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 
0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79, 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48, 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c, 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f, 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8, 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88, 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd, 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a, 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc, 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71, 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd, 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3, 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18, 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28, 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc, 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47, 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67, 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45, 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c, 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02, 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11, 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96, 0xbd, 0xa4, 0xbe, 0x57, 0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb, 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e, 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8, 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76, 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d, 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2, 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8, 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe, 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c, 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d, 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb, 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd, 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58, 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c, 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9, 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9, 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2, 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7, 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd, 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc, 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9, 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf, 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89, 0xbc, 
0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d, 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e, 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e, 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c, 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd, 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf, 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b, 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d, 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd, 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54, 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe, 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c, 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd, 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf, 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f, 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62, 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c, 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a, 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8, 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c, 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 0xdc, 0xa1, 0x10, 0xbd, 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75, 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c, 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35, 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc, 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65, 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd, 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b, 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb, 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05, 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9, 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69, 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd, 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96, 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d, 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe, 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc, 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2, 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63, 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4, 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb, 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62, 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d, 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a, 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d, 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 
0xe7, 0x57, 0xf9, 0xbc, 0x27, 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf, 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a, 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd, 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27, 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b, 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e, 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c, 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4, 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6, 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59, 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c, 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8, 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f, 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42, 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d, 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69, 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04, 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77, 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd, 0x8b, 0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5, 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69, 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60, 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc, 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40, 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e, 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09, 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc, 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28, 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65, 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa, 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd, 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12, 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25, 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e, 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd, 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b, 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7, 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92, 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d, 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8, 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4, 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee, 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd, 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4, 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35, 0x5e, 0x3c, 
0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f, 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d, 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a, 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5, 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8, 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d, 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67, 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c, 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54, 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d, 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18, 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a, 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23, 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd, 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde, 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c, 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c, 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d, 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e, 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a, 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4, 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c, 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5, 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0, 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a, 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d, 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e, 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71, 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91, 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b, 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62, 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57, 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3, 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd, 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c, 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45, 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45, 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b, 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d, 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac, 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94, 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d, 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e, 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20, 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95, 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 
0x84, 0x1d, 0x03, 0xbd, 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94, 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b, 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad, 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c, 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba, 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11, 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb, 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd, 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10, 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc, 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15, 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c, 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34, 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12, 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0, 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d, 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 0xbc, 0x78, 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03, 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84, 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd, 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9, 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1, 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91, 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb, 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70, 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa, 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96, 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c, 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d, 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13, 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56, 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c, 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa, 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c, 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6, 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd, 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80, 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20, 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef, 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd, 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38, 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82, 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a, 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd, 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3, 0x9e, 0xd8, 0xbb, 
0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6, 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94, 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd, 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3, 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99, 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52, 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd, 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30, 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f, 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3, 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d, 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17, 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b, 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b, 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d, 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a, 0x60, 0x70, 0x3d, 0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83, 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e, 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd, 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02, 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b, 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59, 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd, 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf, 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49, 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff, 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c, 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee, 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e, 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20, 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d, 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87, 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94, 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c, 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb, 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42, 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2, 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b, 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd, 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c, 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a, 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54, 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd, 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6, 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47, 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 
0xdb, 0x11, 0x1f, 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c, 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee, 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1, 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45, 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd, 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68, 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97, 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9, 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd, 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8, 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae, 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f, 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc, 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea, 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 0xd6, 0x6a, 0x3c, 0xe0, 0x3d, 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3, 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd, 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0, 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33, 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08, 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d, 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00, 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf, 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1, 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb, 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca, 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc, 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03, 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd, 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c, 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a, 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe, 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c, 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0, 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92, 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a, 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b, 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8, 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14, 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e, 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd, 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50, 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b, 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63, 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd, 0x06, 0xbf, 0x7d, 0x3d, 
0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29, 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97, 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5, 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c, 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a, 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17, 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f, 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd, 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48, 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b, 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10, 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd, 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31, 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d, 0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24, 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c, 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6, 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6, 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45, 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd, 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e, 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e, 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83, 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc, 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0, 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28, 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f, 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd, 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4, 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e, 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29, 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c, 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8, 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41, 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f, 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd, 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0, 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf, 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84, 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd, 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0, 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6, 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb, 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd, 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2, 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 
0xff, 0x05, 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47, 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d, 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8, 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a, 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf, 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c, 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae, 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f, 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06, 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c, 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0, 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f, 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18, 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c, 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a, 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8, 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b, 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd, 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0, 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b, 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a, 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd, 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59, 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16, 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08, 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d, 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2, 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f, 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33, 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd, 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc, 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27, 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3, 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c, 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c, 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e, 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd, 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c, 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6, 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04, 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21, 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d, 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0, 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78, 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18, 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 
0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd, 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66, 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19, 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef, 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a, 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64, 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf, 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a, 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c, 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4, 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a, 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 0x3b, 0x40, 0x91, 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc, 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28, 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1, 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67, 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d, 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87, 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5, 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f, 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd, 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e, 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1, 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12, 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d, 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24, 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86, 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8, 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd, 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c, 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12, 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f, 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc, 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58, 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e, 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5, 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39, 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec, 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a, 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6, 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c, 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e, 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa, 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c, 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d, 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 
0x62, 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94, 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65, 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd, 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce, 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a, 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40, 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd, 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c, 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62, 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b, 0xbd, 0xfa, 0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c, 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66, 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d, 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68, 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d, 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60, 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32, 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc, 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc, 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b, 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92, 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a, 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc, 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4, 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b, 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6, 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c, 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00, 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e, 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c, 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a, 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40, 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb, 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a, 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c, 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d, 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5, 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a, 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd, 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17, 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85, 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21, 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc, 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1, 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d, 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 
0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f, 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b, 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80, 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31, 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08, 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd, 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98, 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd, 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e, 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d, 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72, 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5, 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f, 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc, 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae, 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a, 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f, 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc, 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf, 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65, 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40, 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb, 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b, 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67, 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee, 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d, 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50, 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5, 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90, 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb, 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b, 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5, 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05, 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc, 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61, 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e, 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84, 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d, 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a, 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81, 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87, 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc, 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5, 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7, 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79, 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c, 
0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38, 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1, 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b, 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c, 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54, 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19, 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42, 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 0xbd, 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0, 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf, 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c, 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c, 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc, 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16, 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5, 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc, 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1, 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea, 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a, 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d, 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00, 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb, 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81, 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c, 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8, 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5, 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e, 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c, 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82, 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a, 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78, 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd, 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0, 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08, 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f, 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b, 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05, 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3, 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b, 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd, 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa, 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e, 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8, 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb, 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76, 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 
0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45, 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2, 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba, 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc, 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc, 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31, 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc, 0xce, 0x5b, 0x7a, 0xbd, 0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90, 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a, 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19, 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc, 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b, 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17, 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f, 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc, 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12, 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55, 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44, 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc, 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa, 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8, 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90, 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd, 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7, 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56, 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0, 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d, 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53, 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3, 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c, 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd, 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3, 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb, 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d, 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d, 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb, 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7, 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91, 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd, 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e, 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4, 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64, 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d, 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22, 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e, 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80, 0x3d, 
0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc, 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88, 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab, 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04, 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d, 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 0x93, 0x8e, 0x3d, 0x9b, 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3, 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9, 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c, 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90, 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b, 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71, 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d, 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6, 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78, 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4, 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d, 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c, 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e, 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35, 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc, 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6, 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36, 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae, 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb, 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2, 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad, 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69, 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd, 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d, 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b, 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a, 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d, 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d, 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf, 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02, 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d, 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02, 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f, 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14, 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a, 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4, 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5, 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23, 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc, 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 
0x95, 0xe5, 0x5f, 0xbd, 0xdf, 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0, 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b, 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb, 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4, 0xaf, 0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99, 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17, 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d, 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93, 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e, 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48, 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d, 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e, 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42, 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a, 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc, 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e, 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13, 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47, 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c, 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58, 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca, 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43, 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd, 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23, 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31, 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49, 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c, 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50, 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56, 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13, 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc, 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37, 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41, 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7, 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc, 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42, 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb, 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62, 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc, 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa, 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1, 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99, 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd, 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70, 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8, 0x0e, 0x3d, 
0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02, 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d, 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3, 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41, 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70, 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d, 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11, 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc, 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0, 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc, 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89, 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c, 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb, 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc, 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9, 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3, 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d, 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc, 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12, 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37, 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21, 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c, 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff, 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39, 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d, 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c, 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba, 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d, 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e, 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c, 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41, 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95, 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c, 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c, 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99, 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8, 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15, 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc, 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99, 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e, 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52, 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c, 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e, 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05, 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83, 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 
0x26, 0xf0, 0x85, 0xbd, 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00, 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 0x8b, 0x0e, 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5, 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd, 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7, 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07, 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a, 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd, 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3, 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42, 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a, 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d, 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8, 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98, 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f, 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd, 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c, 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc, 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90, 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc, 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69, 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23, 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e, 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c, 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82, 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34, 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a, 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd, 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba, 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a, 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49, 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c, 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e, 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9, 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9, 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c, 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d, 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40, 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27, 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd, 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53, 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80, 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23, 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd, 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5, 0x84, 0x41, 0xbc, 
0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b, 0x58, 0x3d, 0x86, 0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d, 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c, 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a, 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9, 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29, 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc, 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d, 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4, 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e, 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd, 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca, 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81, 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23, 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd, 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03, 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a, 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a, 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c, 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb, 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb, 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89, 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc, 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2, 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f, 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83, 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c, 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30, 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8, 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d, 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd, 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90, 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b, 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40, 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc, 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce, 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49, 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70, 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd, 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e, 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc, 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d, 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d, 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c, 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17, 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 0x5f, 0xbc, 
0x8a, 0xde, 0x57, 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc, 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a, 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f, 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc, 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d, 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a, 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f, 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e, 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d, 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16, 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f, 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42, 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc, 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32, 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14, 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74, 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc, 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6, 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f, 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c, 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd, 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5, 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf, 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19, 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d, 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b, 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f, 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc, 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c, 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07, 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3, 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79, 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d, 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73, 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44, 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91, 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd, 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91, 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a, 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91, 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b, 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78, 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38, 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5, 0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d, 0x04, 0xcc, 0xf2, 0x3a, 
0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55, 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19, 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c, 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c, 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b, 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a, 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57, 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d, 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3, 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e, 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05, 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d, 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e, 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65, 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b, 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd, 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b, 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c, 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f, 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd, 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38, 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6, 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f, 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d, 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86, 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80, 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78, 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc, 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79, 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46, 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8, 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd, 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38, 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01, 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f, 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d, 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda, 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52, 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff, 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd, 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c, 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1, 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32, 0x3d, 0xec, 0xf1, 0x87, 0xbd, 0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d, 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6, 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 
0x52, 0x83, 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e, 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb, 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f, 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f, 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb, 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd, 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7, 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62, 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97, 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba, 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02, 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6, 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba, 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d, 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99, 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2, 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3, 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d, 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b, 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2, 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59, 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d, 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18, 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc, 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70, 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc, 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c, 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e, 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e, 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd, 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd, 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1, 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a, 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd, 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33, 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21, 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a, 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd, 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15, 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95, 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99, 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 0xea, 0xb6, 0x3d, 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7, 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c, 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b, 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 
0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc, 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53, 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe, 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80, 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d, 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a, 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a, 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb, 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d, 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb, 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32, 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77, 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd, 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61, 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f, 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7, 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d, 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed, 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52, 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8, 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d, 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4, 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b, 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f, 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd, 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4, 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14, 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb, 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d, 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2, 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62, 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8, 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c, 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e, 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca, 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80, 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc, 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52, 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67, 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a, 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd, 0xa5, 0x08, 0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07, 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99, 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44, 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9, 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 
0x77, 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29, 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15, 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd, 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7, 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e, 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82, 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd, 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa, 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90, 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e, 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c, 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf, 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59, 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0, 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb, 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c, 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7, 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf, 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c, 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9, 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf, 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d, 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c, 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a, 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20, 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28, 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd, 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71, 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8, 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad, 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc, 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92, 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4, 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4, 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c, 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd, 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6, 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0, 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c, 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82, 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52, 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c, 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc, 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba, 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72, 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 
0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f, 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c, 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1, 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10, 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35, 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d, 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00, 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9, 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa, 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d, 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40, 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2, 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b, 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d, 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7, 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06, 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f, 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc, 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00, 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65, 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95, 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c, 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13, 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7, 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0, 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd, 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42, 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe, 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99, 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d, 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92, 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81, 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28, 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd, 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd, 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82, 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a, 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb, 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 0xa5, 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19, 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb, 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d, 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73, 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30, 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82, 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd, 
0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0, 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd, 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e, 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d, 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55, 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00, 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a, 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c, 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e, 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54, 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30, 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39, 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17, 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea, 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f, 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c, 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1, 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a, 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25, 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d, 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32, 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6, 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96, 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c, 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f, 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e, 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99, 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d, 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c, 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06, 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee, 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd, 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12, 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed, 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25, 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc, 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f, 0x74, 0xea, 0x3c, 0xfc, 0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1, 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76, 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d, 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c, 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c, 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f, 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd, 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54, 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 
0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b, 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0, 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb, 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7, 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20, 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6, 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd, 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e, 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a, 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65, 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d, 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd, 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6, 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10, 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d, 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52, 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc, 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc, 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb, 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e, 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05, 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0, 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc, 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5, 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d, 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75, 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd, 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc, 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a, 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3, 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd, 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0, 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18, 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f, 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d, 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39, 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 0x20, 0xbd, 0x1a, 0x64, 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b, 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd, 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d, 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00, 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23, 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d, 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c, 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d, 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3, 0xbd, 
0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8, 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3, 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28, 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11, 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc, 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32, 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb, 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90, 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d, 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73, 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c, 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0, 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d, 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2, 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25, 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0, 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb, 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f, 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8, 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86, 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d, 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f, 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9, 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a, 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd, 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c, 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff, 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc, 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc, 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67, 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12, 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7, 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc, 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae, 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef, 0xe9, 0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37, 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c, 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2, 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7, 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf, 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d, 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe, 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48, 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d, 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c, 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 
0xe0, 0x16, 0x02, 0x3d, 0xc0, 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31, 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91, 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb, 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5, 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9, 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0, 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d, 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a, 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda, 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b, 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd, 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0, 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9, 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d, 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d, 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16, 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c, 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d, 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd, 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4, 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6, 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09, 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c, 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6, 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15, 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6, 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd, 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32, 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7, 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9, 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c, 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95, 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50, 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81, 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c, 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96, 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49, 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07, 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc, 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec, 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c, 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3, 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d, 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e, 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72, 0x05, 0x3d, 
0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e, 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d, 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2, 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8, 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32, 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd, 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99, 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae, 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56, 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd, 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90, 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05, 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a, 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c, 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7, 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0, 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16, 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb, 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec, 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4, 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45, 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc, 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2, 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a, 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f, 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d, 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d, 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45, 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86, 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c, 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08, 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98, 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 0x48, 0x17, 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d, 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa, 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac, 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55, 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d, 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0, 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d, 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae, 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c, 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98, 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf, 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86, 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 
0x4c, 0x1b, 0x89, 0x3c, 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c, 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54, 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46, 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d, 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f, 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7, 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a, 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c, 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec, 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35, 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17, 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc, 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1, 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81, 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a, 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd, 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06, 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6, 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f, 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c, 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7, 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39, 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8, 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd, 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7, 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25, 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38, 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd, 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc, 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8, 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13, 0xb9, 0xc0, 0x4b, 0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c, 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90, 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60, 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b, 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc, 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc, 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74, 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42, 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b, 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80, 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c, 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52, 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc, 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b, 0xb3, 0x3a, 0xbd, 
0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50, 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e, 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd, 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70, 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f, 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf, 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc, 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55, 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f, 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a, 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b, 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b, 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09, 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c, 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc, 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab, 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf, 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f, 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc, 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02, 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a, 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc, 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd, 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1, 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e, 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68, 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c, 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf, 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51, 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98, 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 0x3d, 0x48, 0xf3, 0x8b, 0xbc, 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31, 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53, 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90, 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd, 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e, 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c, 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44, 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d, 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf, 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51, 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5, 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd, 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22, 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5, 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 
0x80, 0xc2, 0x24, 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc, 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a, 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae, 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d, 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb, 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c, 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8, 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64, 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a, 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb, 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83, 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19, 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd, 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc, 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a, 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f, 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd, 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99, 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42, 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05, 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c, 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd, 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62, 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84, 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c, 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28, 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1, 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5, 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb, 0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea, 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06, 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26, 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc, 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42, 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e, 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f, 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd, 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78, 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4, 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d, 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d, 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f, 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4, 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb, 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd, 0xbd, 0x03, 0xcf, 0x3c, 
0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3, 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64, 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d, 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb, 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb, 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd, 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9, 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d, 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e, 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57, 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85, 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d, 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03, 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac, 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89, 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d, 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6, 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36, 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e, 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d, 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e, 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2, 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13, 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc, 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d, 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40, 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63, 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d, 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff, 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73, 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d, 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd, 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9, 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0, 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78, 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d, 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc, 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a, 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58, 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d, 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae, 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e, 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7, 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd, 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2, 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 
0xb4, 0xc8, 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c, 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c, 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59, 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93, 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9, 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d, 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d, 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a, 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56, 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb, 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f, 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5, 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e, 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd, 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2, 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99, 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2, 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb, 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5, 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea, 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f, 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd, 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d, 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f, 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f, 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d, 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 0x2d, 0xbd, 0xce, 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c, 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2, 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c, 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67, 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7, 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78, 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd, 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74, 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a, 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d, 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd, 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae, 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab, 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28, 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d, 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18, 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72, 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f, 0xbd, 0xf5, 0x54, 0x13, 0x3d, 
0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc, 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40, 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12, 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97, 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd, 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a, 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc, 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41, 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d, 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf, 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06, 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97, 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d, 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60, 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9, 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38, 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d, 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72, 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed, 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea, 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d, 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a, 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27, 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04, 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd, 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd, 0x0e, 0x4e, 0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5, 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01, 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c, 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01, 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68, 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f, 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d, 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac, 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e, 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64, 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d, 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15, 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02, 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e, 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd, 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e, 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba, 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87, 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc, 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 
0x1e, 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02, 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13, 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c, 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd, 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9, 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23, 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc, 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0, 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80, 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b, 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc, 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b, 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3, 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e, 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d, 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20, 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6, 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d, 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d, 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62, 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa, 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9, 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d, 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16, 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa, 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c, 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd, 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1, 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c, 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97, 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd, 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5, 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15, 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc, 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c, 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc, 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3, 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85, 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d, 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51, 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30, 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c, 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c, 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28, 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24, 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 
0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0, 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d, 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4, 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7, 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa, 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd, 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb, 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c, 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d, 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd, 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57, 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d, 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88, 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd, 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27, 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18, 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90, 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc, 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe, 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e, 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e, 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb, 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff, 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 0x6e, 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30, 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c, 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60, 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0, 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20, 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd, 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd, 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5, 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4, 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd, 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49, 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3, 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1, 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c, 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2, 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f, 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c, 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d, 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c, 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf, 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7, 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c, 
0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90, 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38, 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93, 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd, 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88, 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88, 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60, 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd, 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8, 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79, 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79, 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d, 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43, 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41, 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4, 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd, 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6, 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34, 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1, 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c, 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c, 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff, 0x52, 0x3d, 0xa2, 0xf2, 0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58, 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d, 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92, 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8, 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d, 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c, 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90, 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3, 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c, 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc, 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2, 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d, 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b, 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a, 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa, 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c, 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e, 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd, 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3, 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17, 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88, 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd, 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7, 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 
0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f, 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79, 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd, 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4, 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5, 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87, 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c, 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50, 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12, 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b, 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d, 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55, 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8, 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82, 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd, 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60, 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c, 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66, 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd, 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e, 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81, 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 0xbd, 0x5f, 0xcd, 0x9b, 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd, 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26, 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88, 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72, 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc, 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc, 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76, 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98, 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b, 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5, 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf, 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd, 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d, 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00, 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0, 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14, 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c, 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3, 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b, 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a, 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d, 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b, 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80, 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29, 0xbd, 
0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc, 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07, 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69, 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf, 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c, 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc, 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e, 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84, 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d, 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40, 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e, 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e, 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d, 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92, 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb, 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29, 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc, 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd, 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5, 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22, 0xbc, 0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc, 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b, 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec, 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32, 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc, 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce, 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61, 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda, 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d, 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2, 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82, 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b, 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd, 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a, 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f, 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c, 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d, 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee, 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5, 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17, 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc, 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56, 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6, 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08, 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d, 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 
0x97, 0xb9, 0x72, 0xbd, 0x35, 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79, 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29, 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc, 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46, 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69, 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19, 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c, 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8, 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29, 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a, 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd, 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c, 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0, 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6, 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d, 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c, 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f, 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c, 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c, 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a, 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4, 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0, 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d, 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4, 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92, 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0, 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd, 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf, 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb, 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd, 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c, 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1, 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5, 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53, 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd, 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c, 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61, 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23, 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd, 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5, 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf, 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6, 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d, 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe, 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05, 0x90, 0xbd, 
0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20, 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b, 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c, 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43, 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85, 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd, 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4, 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f, 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13, 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d, 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6, 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95, 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba, 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd, 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03, 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a, 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52, 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 0x7f, 0xbd, 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1, 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8, 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8, 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd, 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90, 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0, 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10, 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd, 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68, 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa, 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48, 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd, 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b, 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d, 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81, 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d, 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78, 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50, 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63, 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d, 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7, 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87, 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b, 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c, 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4, 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58, 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff, 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 
0x27, 0xc9, 0x68, 0xbd, 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30, 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0, 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab, 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b, 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c, 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70, 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71, 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd, 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6, 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0, 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb, 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd, 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf, 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a, 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76, 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d, 0xba, 0x1a, 0x81, 0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4, 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed, 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f, 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd, 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10, 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a, 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c, 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd, 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c, 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3, 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2, 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b, 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e, 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07, 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d, 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd, 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57, 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5, 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f, 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd, 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81, 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a, 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5, 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd, 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e, 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0, 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95, 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d, 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd, 0x8e, 0x46, 0xbd, 
0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44, 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05, 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c, 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4, 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d, 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34, 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc, 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c, 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2, 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99, 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd, 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb, 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22, 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf, 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c, 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 0xdb, 0x1f, 0x31, 0xbd, 0x91, 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4, 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53, 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd, 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a, 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84, 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5, 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc, 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c, 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78, 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84, 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c, 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9, 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f, 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b, 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d, 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64, 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00, 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb, 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d, 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc, 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08, 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a, 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d, 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf, 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d, 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00, 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d, 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20, 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5, 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 
0xc0, 0x25, 0x4b, 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc, 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67, 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e, 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a, 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d, 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f, 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29, 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55, 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd, 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9, 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf, 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a, 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc, 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85, 0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c, 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44, 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d, 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19, 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09, 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6, 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d, 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40, 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f, 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d, 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d, 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9, 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0, 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7, 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd, 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12, 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4, 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90, 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d, 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5, 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55, 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51, 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d, 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92, 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03, 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03, 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c, 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e, 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12, 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63, 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d, 0x8f, 0xf5, 0xe8, 0xbd, 
0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2, 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15, 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58, 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc, 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b, 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f, 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7, 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c, 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d, 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22, 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86, 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c, 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d, 0xfa, 0x80, 0x3d, 0x6b, 0x86, 0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b, 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7, 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d, 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a, 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c, 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e, 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d, 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43, 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94, 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a, 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd, 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f, 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e, 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf, 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc, 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85, 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31, 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f, 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb, 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51, 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08, 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88, 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd, 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b, 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c, 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47, 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d, 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39, 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13, 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82, 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd, 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7, 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 
0xca, 0x8f, 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09, 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc, 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b, 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69, 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad, 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d, 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf, 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c, 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde, 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c, 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40, 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 0x3d, 0x9b, 0x3f, 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3, 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc, 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15, 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7, 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15, 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c, 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16, 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab, 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85, 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d, 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde, 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5, 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87, 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c, 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42, 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc, 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7, 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d, 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae, 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92, 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87, 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d, 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21, 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d, 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40, 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b, 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff, 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e, 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a, 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c, 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34, 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3, 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4, 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 
0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd, 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f, 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3, 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82, 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb, 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27, 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c, 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f, 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb, 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8, 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6, 0xbb, 0xba, 0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e, 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc, 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90, 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c, 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e, 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd, 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14, 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a, 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28, 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d, 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f, 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d, 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f, 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d, 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47, 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9, 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d, 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d, 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb, 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77, 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18, 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd, 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5, 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88, 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b, 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc, 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b, 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8, 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0, 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d, 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe, 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa, 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13, 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d, 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 
0x25, 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8, 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a, 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d, 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4, 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e, 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93, 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc, 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20, 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20, 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01, 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc, 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86, 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f, 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41, 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd, 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b, 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a, 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf, 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d, 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3, 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae, 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73, 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c, 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4, 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50, 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d, 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd, 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70, 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15, 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf, 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd, 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f, 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e, 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f, 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c, 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02, 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf, 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1, 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d, 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47, 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21, 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e, 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d, 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38, 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6, 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 
0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55, 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd, 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36, 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a, 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d, 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d, 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef, 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7, 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 0x0f, 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d, 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2, 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c, 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e, 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c, 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a, 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4, 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff, 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d, 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51, 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58, 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21, 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b, 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e, 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd, 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44, 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc, 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07, 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e, 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1, 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c, 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43, 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90, 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24, 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc, 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0, 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45, 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31, 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd, 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8, 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51, 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c, 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd, 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38, 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45, 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79, 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc, 
0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e, 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92, 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87, 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd, 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc, 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91, 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9, 0xbb, 0xc4, 0x0b, 0x56, 0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d, 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c, 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a, 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f, 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd, 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c, 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f, 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b, 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b, 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c, 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0, 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f, 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c, 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22, 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28, 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31, 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd, 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69, 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19, 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e, 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d, 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda, 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56, 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f, 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc, 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18, 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22, 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90, 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c, 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66, 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46, 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09, 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd, 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e, 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c, 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f, 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd, 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0, 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 
0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76, 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c, 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c, 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00, 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b, 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05, 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 0x48, 0x17, 0xe9, 0x3c, 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40, 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71, 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31, 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba, 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68, 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81, 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde, 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d, 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14, 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0, 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0, 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd, 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f, 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9, 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a, 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd, 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24, 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9, 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83, 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d, 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3, 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f, 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11, 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc, 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00, 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a, 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32, 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a, 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce, 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f, 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70, 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d, 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8, 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f, 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a, 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd, 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13, 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef, 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29, 0xbd, 
0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc, 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc, 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6, 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b, 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c, 0xb6, 0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0, 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f, 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85, 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd, 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1, 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57, 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8, 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b, 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c, 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5, 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90, 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc, 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e, 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae, 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84, 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd, 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4, 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab, 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54, 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b, 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8, 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4, 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac, 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c, 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8, 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b, 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48, 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd, 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e, 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54, 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59, 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c, 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79, 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba, 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f, 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd, 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90, 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13, 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2, 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd, 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 
0xec, 0x5b, 0x2e, 0x3d, 0x28, 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8, 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a, 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd, 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0, 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7, 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68, 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d, 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0, 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde, 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19, 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c, 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52, 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32, 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b, 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c, 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa, 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75, 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3, 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d, 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81, 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd, 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea, 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d, 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6, 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66, 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e, 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb, 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84, 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d, 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80, 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c, 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3, 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a, 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c, 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d, 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70, 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb, 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8, 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d, 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd, 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c, 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c, 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd, 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c, 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed, 0x80, 0xbd, 
0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1, 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c, 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 0x3b, 0x78, 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1, 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15, 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd, 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c, 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa, 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66, 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d, 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00, 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32, 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e, 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d, 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70, 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf, 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80, 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b, 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0, 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b, 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37, 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd, 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02, 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad, 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58, 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d, 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a, 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04, 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c, 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd, 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30, 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42, 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca, 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d, 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96, 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0, 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99, 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d, 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda, 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a, 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8, 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c, 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e, 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f, 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff, 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 
0xb1, 0xb7, 0x03, 0xbd, 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68, 0xd8, 0x3e, 0x3c, 0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26, 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18, 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d, 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70, 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6, 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf, 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c, 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80, 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28, 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2, 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd, 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0, 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c, 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87, 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c, 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba, 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde, 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d, 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc, 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10, 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f, 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a, 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d, 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2, 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69, 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6, 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc, 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30, 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37, 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0, 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc, 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0, 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e, 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66, 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d, 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc, 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6, 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89, 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c, 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e, 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3, 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec, 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd, 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78, 0x09, 0x0a, 0x3c, 
0xa5, 0xbb, 0x8d, 0x3d, 0x80, 0x7e, 0xbd, 0x3a, 0x8c, 0x61, 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b, 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc, 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96, 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10, 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81, 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d, 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80, 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13, 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60, 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b, 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0, 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90, 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6, 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d, 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb, 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50, 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79, 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d, 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40, 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57, 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85, 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd, 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85, 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9, 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70, 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd, 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02, 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a, 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a, 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d, 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96, 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06, 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05, 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd, 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68, 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2, 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61, 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c, 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8, 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78, 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c, 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b, 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4, 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9, 0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 
0x20, 0xc2, 0xcb, 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c, 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8, 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a, 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01, 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c, 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90, 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90, 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11, 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c, 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40, 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e, 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d, 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd, 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2, 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc, 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75, 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc, 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca, 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf, 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85, 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc, 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0, 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b, 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6, 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d, 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5, 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4, 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17, 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c, 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc, 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64, 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8, 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd, 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3, 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b, 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87, 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd, 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6, 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1, 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c, 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a, 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c, 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a, 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d, 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd, 0x3f, 0x50, 0x91, 0x3d, 
0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4, 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7, 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d, 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd, 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30, 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd, 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac, 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d, 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e, 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38, 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe, 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d, 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24, 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde, 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6, 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b, 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a, 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d, 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e, 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba, 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e, 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40, 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f, 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c, 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30, 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8, 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88, 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b, 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10, 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92, 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e, 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d, 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42, 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1, 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6, 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc, 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2, 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b, 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0, 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c, 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb, 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf, 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 0x32, 0x25, 0x5d, 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c, 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb, 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 
0x1a, 0x2d, 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e, 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d, 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea, 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8, 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2, 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc, 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4, 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b, 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f, 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c, 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27, 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32, 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa, 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c, 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f, 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad, 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b, 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd, 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04, 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15, 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a, 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b, 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac, 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92, 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8, 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd, 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc, 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51, 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6, 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc, 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed, 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9, 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f, 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc, 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d, 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb, 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd, 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b, 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74, 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e, 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd, 0xbd, 0xc1, 0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc, 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd, 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c, 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98, 0xbc, 0x07, 0x98, 0x2f, 0x3d, 
0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d, 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe, 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b, 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51, 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c, 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b, 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15, 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b, 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd, 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90, 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14, 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c, 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd, 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c, 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61, 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89, 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc, 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55, 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4, 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4, 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c, 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12, 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd, 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90, 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc, 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5, 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d, 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79, 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd, 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a, 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f, 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad, 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd, 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25, 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97, 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2, 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d, 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6, 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a, 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1, 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d, 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e, 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e, 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78, 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c, 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 
0x84, 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0, 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7, 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb, 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5, 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10, 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b, 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d, 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28, 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1, 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66, 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d, 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d, 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3, 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10, 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a, 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72, 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda, 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c, 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd, 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea, 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd, 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3, 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c, 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99, 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56, 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2, 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc, 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1, 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec, 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45, 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd, 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b, 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48, 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3, 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d, 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9, 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec, 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09, 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 0x3c, 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26, 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0, 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e, 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c, 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87, 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97, 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 
0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee, 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd, 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a, 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2, 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92, 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c, 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0, 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88, 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd, 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c, 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8, 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f, 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7, 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc, 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f, 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e, 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27, 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe, 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6, 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37, 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7, 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc, 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e, 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4, 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87, 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c, 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e, 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99, 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f, 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd, 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01, 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59, 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a, 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb, 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3, 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52, 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0, 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd, 0xad, 0x80, 0xdf, 0x3c, 0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b, 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b, 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7, 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d, 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b, 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7, 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5, 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd, 
0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2, 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47, 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52, 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c, 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a, 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85, 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b, 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d, 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5, 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30, 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd, 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd, 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1, 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26, 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92, 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c, 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6, 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f, 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4, 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b, 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f, 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16, 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6, 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc, 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74, 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6, 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43, 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c, 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39, 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9, 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f, 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d, 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b, 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac, 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03, 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b, 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 0x10, 0x8c, 0x3c, 0x5e, 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5, 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1, 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d, 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f, 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63, 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3, 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc, 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6, 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 
0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98, 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8, 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd, 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95, 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8, 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b, 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d, 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a, 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e, 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53, 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c, 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f, 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce, 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec, 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d, 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90, 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37, 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7, 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d, 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0, 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65, 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29, 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d, 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02, 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef, 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3, 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a, 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc, 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83, 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c, 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d, 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54, 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9, 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8, 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d, 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e, 0x7e, 0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee, 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc, 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c, 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f, 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4, 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f, 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b, 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65, 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac, 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60, 0xbd, 
0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c, 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a, 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10, 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e, 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd, 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45, 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe, 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92, 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd, 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96, 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4, 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde, 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d, 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52, 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe, 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86, 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d, 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1, 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1, 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48, 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d, 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46, 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac, 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15, 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb, 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0, 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17, 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c, 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d, 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9, 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77, 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d, 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d, 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53, 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1, 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05, 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc, 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd, 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d, 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06, 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd, 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff, 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e, 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d, 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb, 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 
0xe7, 0xc1, 0x8f, 0x3d, 0xb6, 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91, 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85, 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc, 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b, 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5, 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33, 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c, 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99, 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43, 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6, 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d, 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41, 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20, 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38, 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd, 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d, 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3, 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22, 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb, 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08, 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e, 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04, 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd, 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe, 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa, 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89, 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd, 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8, 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53, 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b, 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c, 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14, 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 0x8b, 0x8e, 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88, 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd, 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4, 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2, 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1, 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d, 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e, 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2, 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c, 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c, 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe, 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c, 0xd4, 0x3c, 
0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a, 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c, 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b, 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe, 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e, 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c, 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d, 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05, 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44, 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d, 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c, 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75, 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85, 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d, 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39, 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f, 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98, 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39, 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb, 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4, 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78, 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d, 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8, 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03, 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b, 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb, 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67, 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42, 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b, 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb, 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92, 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0, 0x8b, 0x3c, 0x4a, 0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39, 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d, 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30, 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a, 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30, 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc, 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00, 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a, 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4, 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39, 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c, 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06, 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59, 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 
0xe2, 0xdc, 0x8c, 0xbd, 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d, 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d, 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91, 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc, 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8, 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c, 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33, 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d, 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0, 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38, 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4, 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc, 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42, 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a, 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12, 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c, 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24, 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42, 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01, 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c, 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22, 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69, 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36, 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd, 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43, 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91, 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53, 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9, 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75, 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5, 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 0x91, 0x3c, 0x29, 0x21, 0x40, 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d, 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49, 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98, 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91, 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c, 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25, 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5, 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54, 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b, 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd, 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17, 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19, 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c, 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67, 0xc3, 0x82, 0x3d, 
0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97, 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e, 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc, 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32, 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd, 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a, 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d, 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea, 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8, 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a, 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c, 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e, 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7, 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48, 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc, 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85, 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25, 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a, 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd, 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2, 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee, 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35, 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d, 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74, 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd, 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0, 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d, 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03, 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc, 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d, 0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c, 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4, 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f, 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2, 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d, 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15, 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0, 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf, 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc, 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3, 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89, 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda, 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c, 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba, 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3, 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 
0x10, 0x12, 0xab, 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd, 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9, 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a, 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b, 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc, 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1, 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0, 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26, 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd, 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd, 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09, 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79, 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd, 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd, 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54, 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31, 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c, 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0, 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce, 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12, 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c, 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7, 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2, 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a, 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d, 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c, 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e, 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab, 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c, 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0, 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81, 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70, 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd, 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46, 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06, 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59, 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c, 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a, 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e, 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1, 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c, 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10, 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22, 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11, 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c, 0x6a, 0x54, 0x28, 0xbd, 
0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e, 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5, 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59, 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c, 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d, 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5, 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd, 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c, 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8, 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62, 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15, 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc, 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b, 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd, 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4, 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd, 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74, 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7, 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82, 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc, 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22, 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa, 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8, 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd, 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd, 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f, 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e, 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 0xf8, 0x7e, 0xbd, 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86, 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca, 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e, 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc, 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb, 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58, 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94, 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c, 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76, 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72, 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a, 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c, 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1, 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0, 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1, 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd, 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c, 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 
0x8e, 0xde, 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e, 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd, 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c, 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a, 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18, 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd, 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50, 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8, 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7, 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d, 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e, 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b, 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18, 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c, 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35, 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8, 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86, 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d, 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16, 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf, 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e, 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d, 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08, 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d, 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4, 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd, 0x4b, 0xbd, 0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41, 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae, 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d, 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd, 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62, 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19, 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31, 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b, 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8, 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0, 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec, 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc, 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e, 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7, 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d, 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b, 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71, 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c, 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77, 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 
0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc, 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41, 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14, 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7, 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d, 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9, 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42, 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d, 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c, 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea, 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe, 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86, 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c, 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0, 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6, 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20, 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d, 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3, 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81, 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd, 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c, 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88, 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22, 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a, 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c, 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57, 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01, 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b, 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c, 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88, 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf, 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81, 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb, 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6, 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda, 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a, 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc, 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba, 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05, 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6, 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb, 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d, 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7, 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd, 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d, 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 
0x7b, 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1, 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3, 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd, 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73, 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d, 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f, 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c, 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe, 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2, 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d, 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39, 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8, 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02, 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9, 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd, 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03, 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28, 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51, 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d, 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28, 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f, 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e, 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d, 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 0xa4, 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7, 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8, 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d, 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0, 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68, 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a, 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d, 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb, 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9, 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d, 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd, 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1, 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c, 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7, 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc, 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d, 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d, 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6, 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d, 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46, 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20, 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 
0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54, 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc, 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52, 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda, 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf, 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d, 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63, 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b, 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87, 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd, 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa, 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39, 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25, 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc, 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69, 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84, 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c, 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d, 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d, 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89, 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e, 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb, 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f, 0xb4, 0xf5, 0xbc, 0x5c, 0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff, 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94, 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b, 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65, 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64, 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96, 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd, 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce, 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45, 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7, 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d, 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86, 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f, 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99, 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb, 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31, 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9, 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c, 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd, 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52, 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40, 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d, 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a, 
0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09, 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65, 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f, 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc, 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e, 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4, 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58, 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd, 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28, 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96, 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85, 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd, 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4, 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64, 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94, 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd, 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde, 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23, 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9, 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d, 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0, 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 0x89, 0xbd, 0x61, 0x99, 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f, 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d, 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86, 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71, 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21, 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d, 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f, 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49, 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4, 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd, 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84, 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9, 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86, 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d, 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83, 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb, 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b, 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc, 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79, 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd, 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0, 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd, 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb, 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 
0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69, 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78, 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc, 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26, 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7, 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88, 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d, 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59, 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8, 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b, 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c, 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1, 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e, 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07, 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c, 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19, 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58, 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91, 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d, 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66, 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac, 0x59, 0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34, 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d, 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e, 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7, 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6, 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d, 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65, 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1, 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b, 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d, 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac, 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71, 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34, 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc, 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04, 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c, 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49, 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d, 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe, 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9, 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91, 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d, 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe, 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8, 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59, 0xbd, 
0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc, 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78, 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7, 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85, 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d, 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c, 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc, 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4, 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c, 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91, 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e, 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76, 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d, 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d, 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e, 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42, 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd, 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c, 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86, 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6, 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c, 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08, 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf, 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7, 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d, 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30, 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01, 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12, 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c, 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c, 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4, 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c, 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d, 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd, 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3, 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50, 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc, 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19, 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76, 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30, 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd, 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f, 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b, 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09, 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb, 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 
0x0c, 0xe1, 0x82, 0x3c, 0x0a, 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e, 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59, 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b, 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2, 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec, 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82, 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd, 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5, 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5, 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22, 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d, 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3, 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e, 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21, 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd, 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4, 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee, 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 0x65, 0x41, 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd, 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad, 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06, 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76, 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d, 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae, 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2, 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1, 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc, 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15, 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd, 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb, 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd, 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1, 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82, 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87, 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d, 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69, 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b, 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00, 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd, 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0, 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd, 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79, 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd, 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52, 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b, 0xa9, 0xbc, 
0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02, 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd, 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9, 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0, 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e, 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc, 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07, 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9, 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52, 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d, 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93, 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9, 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80, 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc, 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1, 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3, 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86, 0x3c, 0xbf, 0x14, 0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c, 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff, 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea, 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30, 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d, 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb, 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2, 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85, 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc, 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7, 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1, 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4, 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d, 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40, 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31, 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80, 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd, 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10, 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9, 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c, 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d, 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48, 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a, 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e, 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc, 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04, 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0, 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f, 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 
0x7e, 0xca, 0x00, 0xbc, 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b, 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95, 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84, 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd, 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28, 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5, 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad, 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c, 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2, 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d, 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06, 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd, 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6, 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a, 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38, 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 0x3d, 0x61, 0x4d, 0x88, 0xbd, 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40, 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06, 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70, 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d, 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49, 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8, 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0, 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc, 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70, 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a, 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40, 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d, 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee, 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41, 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c, 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd, 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6, 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d, 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec, 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc, 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1, 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71, 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c, 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d, 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0, 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9, 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b, 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38, 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2, 0xd6, 0x13, 0xbd, 
0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86, 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84, 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb, 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99, 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb, 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88, 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd, 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40, 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95, 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf, 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c, 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48, 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2, 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a, 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d, 0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69, 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f, 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a, 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb, 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8, 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d, 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a, 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d, 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e, 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0, 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40, 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d, 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12, 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71, 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6, 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c, 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75, 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a, 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60, 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c, 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78, 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c, 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e, 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd, 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91, 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb, 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a, 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d, 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84, 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67, 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 
0xd9, 0x19, 0x0a, 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc, 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0, 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5, 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0, 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd, 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47, 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20, 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47, 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd, 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8, 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02, 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f, 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd, 0x59, 0x7e, 0x04, 0xbd, 0xf1, 0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a, 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf, 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3, 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd, 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a, 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05, 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd, 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc, 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe, 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4, 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6, 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c, 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd, 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb, 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87, 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd, 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e, 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47, 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3, 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc, 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44, 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4, 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b, 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d, 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5, 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42, 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82, 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd, 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44, 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01, 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32, 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc, 0x85, 0x89, 0x88, 0x3d, 
0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c, 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86, 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2, 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d, 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc, 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30, 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d, 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b, 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf, 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5, 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42, 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc, 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 0x32, 0xbd, 0x1f, 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1, 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f, 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d, 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58, 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63, 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59, 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc, 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61, 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f, 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa, 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d, 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74, 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec, 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf, 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb, 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda, 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84, 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98, 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d, 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2, 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89, 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25, 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b, 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32, 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b, 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80, 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc, 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03, 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde, 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4, 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc, 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e, 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 
0x78, 0x99, 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87, 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d, 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5, 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43, 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40, 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d, 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3, 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f, 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd, 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc, 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d, 0x1c, 0xed, 0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8, 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78, 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd, 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4, 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b, 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18, 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d, 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68, 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85, 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2, 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc, 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b, 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35, 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d, 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d, 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76, 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63, 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19, 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd, 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03, 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f, 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98, 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b, 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6, 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54, 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28, 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d, 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81, 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd, 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82, 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd, 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce, 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b, 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a, 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 
0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd, 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a, 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b, 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61, 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c, 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c, 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba, 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77, 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd, 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17, 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26, 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1, 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b, 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0, 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56, 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d, 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd, 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb, 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb, 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b, 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc, 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68, 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69, 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57, 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd, 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2, 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71, 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd, 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd, 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a, 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2, 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21, 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd, 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4, 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae, 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40, 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d, 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d, 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7, 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99, 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c, 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90, 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85, 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65, 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d, 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 
0x79, 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa, 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b, 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d, 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d, 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a, 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa, 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd, 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69, 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 0x79, 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89, 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c, 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d, 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2, 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac, 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc, 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d, 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3, 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37, 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc, 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6, 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb, 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10, 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c, 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6, 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b, 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1, 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd, 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e, 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28, 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45, 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd, 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49, 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40, 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17, 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b, 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d, 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f, 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10, 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd, 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d, 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77, 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06, 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd, 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49, 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd, 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 
0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5, 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c, 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12, 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4, 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81, 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d, 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53, 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d, 0x29, 0x3d, 0xd6, 0xf7, 0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7, 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc, 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2, 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17, 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72, 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c, 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f, 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae, 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3, 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d, 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65, 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35, 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6, 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b, 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67, 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6, 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02, 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d, 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d, 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad, 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94, 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc, 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7, 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36, 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5, 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c, 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72, 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14, 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85, 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc, 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82, 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a, 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91, 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d, 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f, 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20, 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a, 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39, 
0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f, 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7, 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d, 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d, 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a, 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc, 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 0x3b, 0xe9, 0xd7, 0x07, 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c, 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02, 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e, 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c, 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d, 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10, 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27, 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65, 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d, 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d, 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9, 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80, 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d, 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d, 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b, 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08, 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc, 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda, 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3, 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12, 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c, 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f, 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8, 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2, 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d, 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d, 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba, 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a, 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc, 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10, 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc, 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30, 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd, 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1, 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4, 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63, 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc, 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca, 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 
0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83, 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79, 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc, 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10, 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39, 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc, 0xbc, 0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd, 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00, 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7, 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46, 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc, 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc, 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89, 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff, 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d, 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b, 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22, 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92, 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb, 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44, 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4, 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a, 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc, 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68, 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9, 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63, 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba, 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c, 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d, 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd, 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d, 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83, 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61, 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b, 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c, 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe, 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23, 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb, 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d, 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2, 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e, 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20, 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc, 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c, 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e, 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86, 0x3d, 
0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb, 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80, 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8, 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72, 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd, 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c, 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20, 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c, 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d, 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60, 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea, 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab, 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd, 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10, 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c, 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5, 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd, 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca, 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a, 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14, 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd, 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47, 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00, 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec, 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b, 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0, 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f, 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98, 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d, 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd, 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0, 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11, 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c, 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec, 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7, 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2, 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd, 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0, 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3, 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65, 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd, 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4, 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8, 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05, 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d, 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 
0x48, 0xe9, 0x2a, 0x3c, 0xe0, 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1, 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83, 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 0x05, 0x3d, 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24, 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1, 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75, 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd, 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30, 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0, 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10, 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc, 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8, 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73, 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91, 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd, 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00, 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d, 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4, 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc, 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13, 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d, 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e, 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d, 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35, 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d, 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19, 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c, 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16, 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79, 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35, 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd, 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c, 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e, 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8, 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd, 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84, 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71, 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37, 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b, 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10, 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44, 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94, 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c, 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50, 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42, 0xf6, 0xbc, 
0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69, 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd, 0x00, 0xc4, 0x5e, 0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c, 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82, 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7, 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd, 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e, 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e, 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59, 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a, 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0, 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68, 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80, 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c, 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde, 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f, 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48, 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd, 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc, 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c, 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97, 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d, 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32, 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29, 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b, 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d, 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80, 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b, 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69, 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c, 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc, 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61, 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e, 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d, 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30, 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b, 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85, 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a, 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80, 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6, 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02, 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d, 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30, 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18, 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30, 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 
0xce, 0x14, 0x14, 0x3d, 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 0x98, 0x96, 0x21, 0xbc, 0xbe, 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40, 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05, 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d, 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0, 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93, 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d, 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd, 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3, 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4, 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82, 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a, 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb, 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c, 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21, 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd, 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12, 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9, 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08, 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d, 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53, 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13, 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44, 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc, 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f, 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89, 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d, 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b, 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8, 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0, 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54, 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c, 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0, 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca, 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71, 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb, 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a, 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e, 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62, 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d, 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e, 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c, 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b, 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d, 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8, 0x48, 0x52, 0x3c, 
0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f, 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62, 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd, 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50, 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73, 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77, 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d, 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c, 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87, 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d, 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd, 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08, 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1, 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab, 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c, 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a, 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03, 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34, 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb, 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa, 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26, 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52, 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b, 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2, 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58, 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f, 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd, 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e, 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98, 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27, 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d, 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70, 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d, 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7, 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d, 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41, 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66, 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a, 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c, 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66, 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c, 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf, 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c, 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd, 0x84, 0x22, 0xbd, 0xb2, 0x0a, 0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8, 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 
0x8c, 0x81, 0x71, 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc, 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0, 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85, 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9, 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c, 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0, 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23, 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a, 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d, 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00, 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06, 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a, 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d, 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2, 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab, 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44, 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d, 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94, 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e, 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6, 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc, 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c, 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb, 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2, 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d, 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50, 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8, 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83, 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc, 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81, 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a, 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f, 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd, 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a, 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea, 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84, 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd, 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0, 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76, 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a, 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c, 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa, 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 0xbd, 0x9a, 0xd4, 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92, 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd, 0x42, 0x45, 0x5b, 0xbd, 
0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78, 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15, 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f, 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d, 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5, 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0, 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38, 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc, 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8, 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75, 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6, 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc, 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0, 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e, 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e, 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c, 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80, 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc, 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82, 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c, 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02, 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e, 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a, 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd, 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10, 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3, 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0, 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd, 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02, 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63, 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69, 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c, 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae, 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8, 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40, 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd, 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a, 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80, 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0, 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc, 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7, 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1, 0x5c, 0x3c, 0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1, 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c, 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84, 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 
0x41, 0x1f, 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05, 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd, 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b, 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8, 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33, 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb, 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c, 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b, 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b, 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc, 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7, 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78, 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1, 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d, 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed, 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84, 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb, 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd, 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe, 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8, 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a, 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc, 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69, 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54, 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9, 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d, 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a, 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c, 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99, 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d, 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8, 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78, 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b, 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c, 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc, 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb, 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d, 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd, 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09, 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0, 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8, 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd, 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e, 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6, 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29, 0xbd, 0x08, 0xae, 0x46, 0xbc, 
0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d, 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21, 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96, 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04, 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd, 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38, 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6, 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84, 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd, 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac, 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec, 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03, 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba, 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae, 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b, 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce, 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c, 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31, 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd, 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21, 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb, 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32, 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42, 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0, 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c, 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47, 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34, 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3, 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c, 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a, 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a, 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43, 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d, 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9, 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3, 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28, 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d, 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e, 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7, 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 0xcd, 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d, 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6, 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9, 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50, 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd, 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 
0xaa, 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45, 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c, 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd, 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1, 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47, 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62, 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d, 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb, 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7, 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77, 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb, 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73, 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66, 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82, 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d, 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07, 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01, 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34, 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd, 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3, 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4, 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95, 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c, 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0, 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4, 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f, 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c, 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd, 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4, 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16, 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d, 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92, 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb, 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95, 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c, 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58, 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd, 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc, 0xbd, 0x22, 0xf9, 0x61, 0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd, 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24, 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14, 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43, 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd, 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42, 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67, 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 
0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5, 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd, 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27, 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b, 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12, 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc, 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52, 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f, 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f, 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b, 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50, 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35, 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65, 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c, 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c, 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff, 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31, 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c, 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00, 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48, 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce, 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c, 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8, 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57, 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e, 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc, 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec, 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c, 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d, 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc, 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67, 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc, 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29, 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd, 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02, 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24, 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46, 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 0x53, 0x7b, 0xa6, 0x3d, 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b, 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23, 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f, 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc, 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad, 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6, 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a, 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d, 
0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11, 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49, 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e, 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc, 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87, 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02, 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d, 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb, 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e, 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd, 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c, 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d, 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89, 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08, 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7, 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc, 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e, 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f, 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26, 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b, 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a, 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52, 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a, 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd, 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99, 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98, 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42, 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd, 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05, 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca, 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37, 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c, 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9, 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e, 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a, 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d, 0x1b, 0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9, 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb, 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1, 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd, 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35, 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1, 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e, 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc, 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b, 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 
0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70, 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44, 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc, 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac, 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34, 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b, 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb, 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10, 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1, 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76, 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd, 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15, 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b, 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47, 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd, 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9, 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50, 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54, 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd, 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84, 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e, 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c, 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c, 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3, 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9, 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18, 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd, 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f, 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01, 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81, 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d, 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5, 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4, 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24, 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc, 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6, 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b, 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57, 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd, 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f, 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85, 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73, 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc, 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe, 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0, 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb, 0xbc, 
0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd, 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a, 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1, 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc, 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d, 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88, 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37, 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0, 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c, 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f, 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70, 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4, 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd, 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b, 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84, 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b, 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b, 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c, 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69, 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3, 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c, 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d, 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70, 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26, 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b, 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c, 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa, 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f, 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d, 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87, 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1, 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8, 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc, 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 0xbc, 0xb1, 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4, 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65, 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc, 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07, 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9, 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b, 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd, 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f, 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69, 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4, 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d, 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 
0x0e, 0xaa, 0xb8, 0xbc, 0xd8, 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae, 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad, 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d, 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a, 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c, 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5, 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d, 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d, 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d, 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24, 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d, 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca, 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44, 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd, 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d, 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50, 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05, 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0, 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c, 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa, 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d, 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96, 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c, 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e, 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55, 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93, 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d, 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb, 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab, 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6, 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc, 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b, 0x90, 0x1a, 0xbb, 0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d, 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83, 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c, 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03, 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b, 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39, 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd, 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15, 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46, 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00, 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd, 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64, 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11, 0x8e, 0xbd, 
0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a, 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d, 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09, 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f, 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e, 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c, 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47, 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88, 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9, 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd, 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84, 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19, 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8, 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc, 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d, 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6, 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e, 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc, 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f, 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93, 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25, 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d, 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2, 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef, 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8, 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd, 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d, 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8, 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82, 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c, 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54, 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 0x95, 0x8d, 0x3d, 0xd1, 0x4f, 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06, 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc, 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a, 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44, 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d, 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d, 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e, 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d, 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57, 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d, 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6, 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2, 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84, 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 
0x6e, 0x12, 0x9c, 0xbd, 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c, 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76, 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2, 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd, 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74, 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a, 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b, 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd, 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8, 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4, 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1, 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd, 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60, 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46, 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e, 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc, 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c, 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26, 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f, 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc, 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7, 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d, 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85, 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb, 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a, 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0, 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b, 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d, 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24, 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66, 0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22, 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9, 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71, 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5, 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46, 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c, 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a, 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70, 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6, 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd, 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a, 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc, 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c, 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c, 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2, 0x84, 0x31, 0xbd, 
0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a, 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a, 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c, 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69, 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a, 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b, 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc, 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5, 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6, 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67, 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d, 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03, 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a, 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92, 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd, 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60, 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78, 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43, 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc, 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50, 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46, 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05, 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d, 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef, 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2, 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77, 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc, 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2, 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36, 0x77, 0x3d, 0xd9, 0xb5, 0x27, 0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74, 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d, 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce, 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b, 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d, 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc, 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8, 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7, 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14, 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c, 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11, 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84, 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71, 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d, 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08, 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92, 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 
0x32, 0x44, 0x6a, 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c, 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7, 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7, 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a, 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c, 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1, 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d, 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a, 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd, 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0, 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe, 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44, 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd, 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19, 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7, 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec, 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc, 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4, 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3, 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e, 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd, 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e, 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e, 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38, 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd, 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6, 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d, 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 0x0a, 0xad, 0x8d, 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd, 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab, 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50, 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30, 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd, 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9, 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab, 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b, 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc, 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8, 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c, 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca, 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d, 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c, 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe, 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45, 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d, 0xb8, 0xe4, 0x8a, 0x3d, 
0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda, 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa, 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90, 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b, 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72, 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0, 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee, 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c, 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77, 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22, 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62, 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd, 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81, 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5, 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94, 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d, 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09, 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9, 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24, 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc, 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4, 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16, 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86, 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc, 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a, 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde, 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb, 0x3c, 0x51, 0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd, 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10, 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c, 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe, 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d, 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9, 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4, 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46, 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d, 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b, 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01, 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17, 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd, 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41, 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03, 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4, 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd, 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47, 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 
0x74, 0xbe, 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90, 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc, 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d, 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c, 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05, 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b, 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd, 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1, 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8, 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c, 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13, 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e, 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18, 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc, 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77, 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34, 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b, 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d, 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24, 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63, 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31, 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d, 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb, 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f, 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36, 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d, 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4, 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78, 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3, 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd, 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d, 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87, 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80, 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd, 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7, 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d, 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89, 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc, 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61, 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26, 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77, 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd, 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80, 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f, 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58, 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 
0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c, 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf, 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf, 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c, 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc, 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3, 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6, 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac, 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd, 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3, 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39, 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43, 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d, 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13, 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87, 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6, 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d, 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c, 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04, 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07, 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd, 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa, 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4, 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34, 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 0x3d, 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1, 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55, 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86, 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d, 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62, 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5, 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40, 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc, 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66, 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda, 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c, 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c, 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45, 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55, 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13, 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd, 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4, 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93, 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34, 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d, 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 
0xb5, 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6, 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b, 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc, 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13, 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50, 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00, 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0, 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20, 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5, 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4, 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59, 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65, 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd, 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf, 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a, 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a, 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1, 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf, 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20, 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c, 0x2e, 0x73, 0x1a, 0xbc, 0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9, 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16, 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23, 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c, 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e, 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a, 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30, 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d, 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00, 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c, 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d, 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00, 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe, 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00, 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00, 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c, 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04, 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00, 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8, 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88, 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 
0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 
0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 
0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4, 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 }; aom-3.12.1/av1/encoder/dwt.c000066400000000000000000000101521477627663500154500ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/av1_rtcd.h" #include "av1/encoder/dwt.h" // Note: block length must be even for this implementation static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass, tran_low_t *highpass) { int n; tran_low_t r, *a, *b; n = length >> 1; b = highpass; a = lowpass; while (--n) { *a++ = (r = *x++) * 2; *b++ = *x - ((r + x[1] + 1) >> 1); x++; } *a = (r = *x++) * 2; *b = *x - r; n = length >> 1; b = highpass; a = lowpass; r = *highpass; while (n--) { *a++ += (r + (*b) + 1) >> 1; r = *b++; } } static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass, tran_low_t *highpass) { int n; tran_low_t r, *a, *b; n = length >> 1; b = highpass; a = lowpass; while (--n) { *a++ = (r = *x++); *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2; x++; } *a = (r = *x++); *b = (*x - r + 1) >> 1; n = length >> 1; b = highpass; a = lowpass; r = *highpass; while (n--) { *a++ += (r + (*b) + 1) >> 1; r = *b++; } } static void dyadic_analyze_53_uint8_input(int levels, int width, int height, const uint8_t *x, int pitch_x, tran_low_t *c, int pitch_c, int dwt_scale_bits, int hbd) { int lv, i, j, nh, nw, hh = height, hw = width; tran_low_t buffer[2 * DWT_MAX_LENGTH]; if (hbd) { const uint16_t *x16 = CONVERT_TO_SHORTPTR(x); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits; } } } else { for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits; } } } for (lv = 0; lv < levels; lv++) { nh = hh; hh = (hh + 1) >> 1; nw = hw; hw = (hw + 1) >> 1; if ((nh < 2) || (nw < 2)) return; for (i = 0; i < nh; i++) { memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t)); analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw); } for (j = 0; j < nw; j++) { for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j]; analysis_53_col(nh, buffer + nh, buffer, buffer + hh); for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i]; } } } void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, int stride, int hbd) { dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd); } static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) { int acsad = 0; for (int r = 0; r < bh; ++r) for (int c = 0; c < bw; ++c) { if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]); } return acsad; } static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd) { tran_low_t output[64]; av1_fdwt8x8_uint8_input(input, output, stride, hbd); return haar_ac_sad(output, 8, 8, 8); } int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, int hbd, int num_8x8_rows, int num_8x8_cols) { int64_t wavelet_energy = 0; for (int r8 = 0; r8 < num_8x8_rows; ++r8) { for (int c8 = 0; c8 < num_8x8_cols; ++c8) { wavelet_energy += haar_ac_sad_8x8_uint8_input( input + c8 * 8 + r8 * 8 * stride, stride, hbd); } } return wavelet_energy; } aom-3.12.1/av1/encoder/dwt.h000066400000000000000000000016261477627663500154630ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_DWT_H_ #define AOM_AV1_ENCODER_DWT_H_ #include "av1/common/common.h" #include "av1/common/enums.h" #define DWT_MAX_LENGTH 64 int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, int hbd, int num_8x8_rows, int num_8x8_cols); #endif // AOM_AV1_ENCODER_DWT_H_ aom-3.12.1/av1/encoder/enc_enums.h000066400000000000000000000131761477627663500166440ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ #define AOM_AV1_ENCODER_ENC_ENUMS_H_ #include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { #endif #define MAX_NUM_THREADS 64 // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. enum { THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, THR_NEARESTG, THR_NEWMV, THR_NEWL2, THR_NEWL3, THR_NEWB, THR_NEWA2, THR_NEWA, THR_NEWG, THR_NEARMV, THR_NEARL2, THR_NEARL3, THR_NEARB, THR_NEARA2, THR_NEARA, THR_NEARG, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, THR_GLOBALG, THR_COMP_NEAREST_NEARESTLA, THR_COMP_NEAREST_NEARESTL2A, THR_COMP_NEAREST_NEARESTL3A, THR_COMP_NEAREST_NEARESTGA, THR_COMP_NEAREST_NEARESTLB, THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL3B, THR_COMP_NEAREST_NEARESTGB, THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, THR_COMP_NEAR_NEARLB, THR_COMP_NEW_NEWLB, THR_COMP_NEW_NEARESTLB, THR_COMP_NEAREST_NEWLB, THR_COMP_NEW_NEARLB, THR_COMP_NEAR_NEWLB, THR_COMP_GLOBAL_GLOBALLB, THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEWLA, THR_COMP_NEW_NEARESTLA, THR_COMP_NEAREST_NEWLA, THR_COMP_NEW_NEARLA, THR_COMP_NEAR_NEWLA, THR_COMP_GLOBAL_GLOBALLA, THR_COMP_NEAR_NEARL2A, THR_COMP_NEW_NEWL2A, THR_COMP_NEW_NEARESTL2A, THR_COMP_NEAREST_NEWL2A, THR_COMP_NEW_NEARL2A, THR_COMP_NEAR_NEWL2A, THR_COMP_GLOBAL_GLOBALL2A, THR_COMP_NEAR_NEARL3A, THR_COMP_NEW_NEWL3A, THR_COMP_NEW_NEARESTL3A, THR_COMP_NEAREST_NEWL3A, THR_COMP_NEW_NEARL3A, THR_COMP_NEAR_NEWL3A, THR_COMP_GLOBAL_GLOBALL3A, THR_COMP_NEAR_NEARGA, THR_COMP_NEW_NEWGA, THR_COMP_NEW_NEARESTGA, THR_COMP_NEAREST_NEWGA, THR_COMP_NEW_NEARGA, THR_COMP_NEAR_NEWGA, THR_COMP_GLOBAL_GLOBALGA, THR_COMP_NEAR_NEARL2B, THR_COMP_NEW_NEWL2B, THR_COMP_NEW_NEARESTL2B, THR_COMP_NEAREST_NEWL2B, THR_COMP_NEW_NEARL2B, THR_COMP_NEAR_NEWL2B, THR_COMP_GLOBAL_GLOBALL2B, THR_COMP_NEAR_NEARL3B, THR_COMP_NEW_NEWL3B, THR_COMP_NEW_NEARESTL3B, THR_COMP_NEAREST_NEWL3B, THR_COMP_NEW_NEARL3B, THR_COMP_NEAR_NEWL3B, THR_COMP_GLOBAL_GLOBALL3B, THR_COMP_NEAR_NEARGB, 
THR_COMP_NEW_NEWGB, THR_COMP_NEW_NEARESTGB, THR_COMP_NEAREST_NEWGB, THR_COMP_NEW_NEARGB, THR_COMP_NEAR_NEWGB, THR_COMP_GLOBAL_GLOBALGB, THR_COMP_NEAR_NEARLA2, THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEARESTLA2, THR_COMP_NEAREST_NEWLA2, THR_COMP_NEW_NEARLA2, THR_COMP_NEAR_NEWLA2, THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_NEAR_NEARL2A2, THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEW_NEARL2A2, THR_COMP_NEAR_NEWL2A2, THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_NEAR_NEARL3A2, THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEW_NEARL3A2, THR_COMP_NEAR_NEWL3A2, THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_NEAR_NEARGA2, THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEARESTGA2, THR_COMP_NEAREST_NEWGA2, THR_COMP_NEW_NEARGA2, THR_COMP_NEAR_NEWGA2, THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, THR_COMP_NEW_NEARLL2, THR_COMP_NEAR_NEWLL2, THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_NEAR_NEARLL3, THR_COMP_NEW_NEWLL3, THR_COMP_NEW_NEARESTLL3, THR_COMP_NEAREST_NEWLL3, THR_COMP_NEW_NEARLL3, THR_COMP_NEAR_NEWLL3, THR_COMP_GLOBAL_GLOBALLL3, THR_COMP_NEAR_NEARLG, THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEARESTLG, THR_COMP_NEAREST_NEWLG, THR_COMP_NEW_NEARLG, THR_COMP_NEAR_NEWLG, THR_COMP_GLOBAL_GLOBALLG, THR_COMP_NEAR_NEARBA, THR_COMP_NEW_NEWBA, THR_COMP_NEW_NEARESTBA, THR_COMP_NEAREST_NEWBA, THR_COMP_NEW_NEARBA, THR_COMP_NEAR_NEWBA, THR_COMP_GLOBAL_GLOBALBA, THR_DC, THR_PAETH, THR_SMOOTH, THR_SMOOTH_V, THR_SMOOTH_H, THR_H_PRED, THR_V_PRED, THR_D135_PRED, THR_D203_PRED, THR_D157_PRED, THR_D67_PRED, THR_D113_PRED, THR_D45_PRED, MAX_MODES, SINGLE_REF_MODE_START = THR_NEARESTMV, SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA, NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START, THR_MODE_START = THR_NEARESTMV, THR_MODE_END = MAX_MODES, THR_INTER_MODE_START = THR_MODE_START, THR_INTER_MODE_END = THR_DC, THR_INVALID = 255 } UENUM1BYTE(THR_MODES); enum { THR_LAST, THR_LAST2, THR_LAST3, THR_BWDR, THR_ALTR2, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_L2A, THR_COMP_L3A, THR_COMP_GA, THR_COMP_LB, THR_COMP_L2B, THR_COMP_L3B, THR_COMP_GB, THR_COMP_LA2, THR_COMP_L2A2, THR_COMP_L3A2, THR_COMP_GA2, THR_INTRA, MAX_REFS } UENUM1BYTE(THR_MODES_SUB8X8); enum { FULL_TXFM_RD, LOW_TXFM_RD, } UENUM1BYTE(TXFM_RD_MODEL); enum { USE_FULL_RD = 0, USE_FAST_RD, USE_LARGESTALL, } UENUM1BYTE(TX_SIZE_SEARCH_METHOD); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENC_ENUMS_H_ aom-3.12.1/av1/encoder/encode_strategy.c000066400000000000000000002054661477627663500200470ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "av1/common/blockd.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #if CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_MISMATCH_DEBUG #include "av1/common/av1_common_int.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/pass2_strategy.h" #include "av1/encoder/temporal_filter.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif // CONFIG_THREE_PASS #include "av1/encoder/tpl_model.h" #if CONFIG_TUNE_VMAF #include "av1/encoder/tune_vmaf.h" #endif #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1) static inline void set_refresh_frame_flags( RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref, bool refresh_arf) { refresh_frame->golden_frame = refresh_gf; refresh_frame->bwd_ref_frame = refresh_bwdref; refresh_frame->alt_ref_frame = refresh_arf; } void av1_configure_buffer_updates(AV1_COMP *const cpi, RefreshFrameInfo *const refresh_frame, const FRAME_UPDATE_TYPE type, const REFBUF_STATE refbuf_state, int force_refresh_all) { // NOTE(weitinglin): Should we define another function to take care of // cpi->rc.is_$Source_Type to make this function as it is in the comment? const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = &cpi->ext_flags.refresh_frame; cpi->rc.is_src_frame_alt_ref = 0; switch (type) { case KF_UPDATE: set_refresh_frame_flags(refresh_frame, true, true, true); break; case LF_UPDATE: set_refresh_frame_flags(refresh_frame, false, false, false); break; case GF_UPDATE: set_refresh_frame_flags(refresh_frame, true, false, false); break; case OVERLAY_UPDATE: if (refbuf_state == REFBUF_RESET) set_refresh_frame_flags(refresh_frame, true, true, true); else set_refresh_frame_flags(refresh_frame, true, false, false); cpi->rc.is_src_frame_alt_ref = 1; break; case ARF_UPDATE: // NOTE: BWDREF does not get updated along with ALTREF_FRAME. 
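// Editorial note on the step below (grounded in the set_refresh_frame_flags(refresh, gf, bwdref, arf) helper defined above): for an ARF that also resets the reference buffer (REFBUF_RESET), the golden, bwdref and altref slots are all refreshed; otherwise only the altref slot is updated.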
if (refbuf_state == REFBUF_RESET) set_refresh_frame_flags(refresh_frame, true, true, true); else set_refresh_frame_flags(refresh_frame, false, false, true); break; case INTNL_OVERLAY_UPDATE: set_refresh_frame_flags(refresh_frame, false, false, false); cpi->rc.is_src_frame_alt_ref = 1; break; case INTNL_ARF_UPDATE: set_refresh_frame_flags(refresh_frame, false, true, false); break; default: assert(0); break; } if (ext_refresh_frame_flags->update_pending && (!is_stat_generation_stage(cpi))) { set_refresh_frame_flags(refresh_frame, ext_refresh_frame_flags->golden_frame, ext_refresh_frame_flags->bwd_ref_frame, ext_refresh_frame_flags->alt_ref_frame); GF_GROUP *gf_group = &cpi->ppi->gf_group; if (ext_refresh_frame_flags->golden_frame) gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE; if (ext_refresh_frame_flags->alt_ref_frame) gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE; if (ext_refresh_frame_flags->bwd_ref_frame) gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE; } if (force_refresh_all) set_refresh_frame_flags(refresh_frame, true, true, true); } static void set_additional_frame_flags(const AV1_COMMON *const cm, unsigned int *const frame_flags) { if (frame_is_intra_only(cm)) { *frame_flags |= FRAMEFLAGS_INTRAONLY; } if (frame_is_sframe(cm)) { *frame_flags |= FRAMEFLAGS_SWITCH; } if (cm->features.error_resilient_mode) { *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT; } } static void set_ext_overrides(AV1_COMMON *const cm, EncodeFrameParams *const frame_params, ExternalFlags *const ext_flags) { // Overrides the defaults with the externally supplied values with // av1_update_reference() and av1_update_entropy() calls // Note: The overrides are valid only for the next frame passed // to av1_encode_lowlevel() if (ext_flags->use_s_frame) { frame_params->frame_type = S_FRAME; } if (ext_flags->refresh_frame_context_pending) { cm->features.refresh_frame_context = ext_flags->refresh_frame_context; ext_flags->refresh_frame_context_pending = 0; } cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs; frame_params->error_resilient_mode = ext_flags->use_error_resilient; // A keyframe is already error resilient and keyframes with // error_resilient_mode interferes with the use of show_existing_frame // when forward reference keyframes are enabled. frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; // For bitstream conformance, s-frames must be error-resilient frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; } static int choose_primary_ref_frame( AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { const AV1_COMMON *const cm = &cpi->common; const int intra_only = frame_params->frame_type == KEY_FRAME || frame_params->frame_type == INTRA_ONLY_FRAME; if (intra_only || frame_params->error_resilient_mode || cpi->ext_flags.use_primary_ref_none) { return PRIMARY_REF_NONE; } #if !CONFIG_REALTIME_ONLY if (cpi->use_ducky_encode) { int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index]; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) return ref_frame - LAST_FRAME; } return PRIMARY_REF_NONE; } #endif // !CONFIG_REALTIME_ONLY // In large scale case, always use Last frame's frame contexts. // Note(yunqing): In other cases, primary_ref_frame is chosen based on // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls // frame bit allocation. 
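// Note on the large-scale-tile early return below: (LAST_FRAME - LAST_FRAME) evaluates to 0, i.e. the primary_ref_frame value that corresponds to LAST_FRAME under the ref_frame - LAST_FRAME mapping used at the end of this function.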
if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) return av1_svc_primary_ref_frame(cpi); // Find the most recent reference frame with the same reference type as the // current frame const int current_ref_type = get_current_frame_ref_type(cpi); int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type]; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { GF_GROUP *const gf_group = &cpi->ppi->gf_group; if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index]; // Book keep wanted_fb of frame_parallel_level 1 frame in an FP2 set. if (frame_level == 1) { cpi->wanted_fb = wanted_fb; } // Use the wanted_fb of level 1 frame in an FP2 for a level 2 frame in the // set. if (frame_level == 2 && gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) { assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1); wanted_fb = cpi->wanted_fb; } } } #endif // CONFIG_FPMT_TEST int primary_ref_frame = PRIMARY_REF_NONE; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { primary_ref_frame = ref_frame - LAST_FRAME; } } return primary_ref_frame; } static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { TimeStamps *time_stamps = &cpi->time_stamps; int64_t this_duration; int step = 0; // Clear down mmx registers if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config && cpi->svc.number_spatial_layers > 1) { // ts_start is the timestamp for the current frame and ts_end is the // expected next timestamp given the duration passed into codec_encode(). // See the setting in encoder_encode() in av1_cx_iface.c: // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol), // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + // duration). So the difference ts_end - ts_start is the duration passed // in by the user. For spatial layers SVC set the framerate based directly // on the duration, and bypass the adjustments below. this_duration = ts_end - ts_start; if (this_duration > 0) { cpi->new_framerate = 10000000.0 / this_duration; av1_new_framerate(cpi, cpi->new_framerate); time_stamps->prev_ts_start = ts_start; time_stamps->prev_ts_end = ts_end; return; } } if (ts_start == time_stamps->first_ts_start) { this_duration = ts_end - ts_start; step = 1; } else { int64_t last_duration = time_stamps->prev_ts_end - time_stamps->prev_ts_start; this_duration = ts_end - time_stamps->prev_ts_end; // do a step update if the duration changes by 10% if (last_duration) step = (int)((this_duration - last_duration) * 10 / last_duration); } if (this_duration) { if (step) { cpi->new_framerate = 10000000.0 / this_duration; av1_new_framerate(cpi, cpi->new_framerate); } else { // Average this frame's rate into the last second's average // frame rate. If we haven't seen 1 second yet, then average // over the whole interval seen. const double interval = AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0); double avg_duration = 10000000.0 / cpi->framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; cpi->new_framerate = (10000000.0 / avg_duration); // For parallel frames update cpi->framerate with new_framerate // during av1_post_encode_updates() double framerate = (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) ? 
cpi->framerate : cpi->new_framerate; av1_new_framerate(cpi, framerate); } } time_stamps->prev_ts_start = ts_start; time_stamps->prev_ts_end = ts_end; } // Determine whether there is a forced keyframe pending in the lookahead buffer int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, const int up_to_index, const COMPRESSOR_STAGE compressor_stage) { for (int i = 0; i <= up_to_index; i++) { const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i, compressor_stage); if (e == NULL) { // We have reached the end of the lookahead buffer and not early-returned // so there isn't a forced key-frame pending. return -1; } else if (e->flags == AOM_EFLAG_FORCE_KF) { return i; } else { continue; } } return -1; // Never reached } // Check if we should encode an ARF or internal ARF. If not, try a LAST // Do some setup associated with the chosen source // temporal_filtered, flush, and frame_update_type are outputs. // Return the frame source, or NULL if we couldn't find one static struct lookahead_entry *choose_frame_source( AV1_COMP *const cpi, int *const flush, int *pop_lookahead, struct lookahead_entry **last_source, int *const show_frame) { AV1_COMMON *const cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; struct lookahead_entry *source = NULL; // Source index in lookahead buffer. int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q if (src_index && (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index, cpi->compressor_stage) != -1) && cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) { src_index = 0; *flush = 1; } // If the current frame is arf, then we should not pop from the lookahead // buffer. If the current frame is not arf, then pop it. This assumes the // first frame in the GF group is not arf. May need to change if it is not // true. *pop_lookahead = (src_index == 0); // If this is a key frame and keyframe filtering is enabled with overlay, // then do not pop. if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 && gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) { if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz && (*flush || cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz == cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) { *pop_lookahead = 0; } } // LAP stage does not have ARFs or forward key-frames, // hence, always pop_lookahead here. if (is_stat_generation_stage(cpi)) { *pop_lookahead = 1; src_index = 0; } *show_frame = *pop_lookahead; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) { #else { #endif // CONFIG_FPMT_TEST // Future frame in parallel encode set if (gf_group->src_offset[cpi->gf_frame_index] != 0 && !is_stat_generation_stage(cpi)) src_index = gf_group->src_offset[cpi->gf_frame_index]; } if (*show_frame) { // show frame, pop from buffer // Get last frame source. if (cm->current_frame.frame_number > 0) { *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); } // Read in the source frame. source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, cpi->compressor_stage); } else { // no show frames are arf frames source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, cpi->compressor_stage); if (source != NULL) { cm->showable_frame = 1; } } return source; } // Don't allow a show_existing_frame to coincide with an error resilient or // S-Frame. 
An exception can be made in the case of a keyframe, since it does // not depend on any previous frames. static int allow_show_existing(const AV1_COMP *const cpi, unsigned int frame_flags) { if (cpi->common.current_frame.frame_number == 0) return 0; const struct lookahead_entry *lookahead_src = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); if (lookahead_src == NULL) return 1; const int is_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode || (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); const int is_key_frame = (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); return !(is_error_resilient || is_s_frame) || is_key_frame; } // Update frame_flags to tell the encoder's caller what sort of frame was // encoded. static void update_frame_flags(const AV1_COMMON *const cm, const RefreshFrameInfo *const refresh_frame, unsigned int *frame_flags) { if (encode_show_existing_frame(cm)) { *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; return; } if (refresh_frame->golden_frame) { *frame_flags |= FRAMEFLAGS_GOLDEN; } else { *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; } if (refresh_frame->alt_ref_frame) { *frame_flags |= FRAMEFLAGS_ALTREF; } else { *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; } if (refresh_frame->bwd_ref_frame) { *frame_flags |= FRAMEFLAGS_BWDREF; } else { *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; } if (cm->current_frame.frame_type == KEY_FRAME) { *frame_flags |= FRAMEFLAGS_KEY; } else { *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; } } #define DUMP_REF_FRAME_IMAGES 0 #if DUMP_REF_FRAME_IMAGES == 1 static int dump_one_image(AV1_COMMON *cm, const YV12_BUFFER_CONFIG *const ref_buf, char *file_name) { int h; FILE *f_ref = NULL; if (ref_buf == NULL) { printf("Frame data buffer is NULL.\n"); return AOM_CODEC_MEM_ERROR; } if ((f_ref = fopen(file_name, "wb")) == NULL) { printf("Unable to open file %s to write.\n", file_name); return AOM_CODEC_MEM_ERROR; } // --- Y --- for (h = 0; h < cm->height; ++h) { fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); } // --- U --- for (h = 0; h < (cm->height >> 1); ++h) { fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), f_ref); } // --- V --- for (h = 0; h < (cm->height >> 1); ++h) { fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), f_ref); } fclose(f_ref); return AOM_CODEC_OK; } static void dump_ref_frame_images(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { char file_name[256] = ""; snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", cm->current_frame.frame_number, ref_frame); dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); } } #endif // DUMP_REF_FRAME_IMAGES == 1 int av1_get_refresh_ref_frame_map(int refresh_frame_flags) { int ref_map_index; for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index) if ((refresh_frame_flags >> ref_map_index) & 1) break; if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX; return ref_map_index; } static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) { for (int idx = 0; idx < REF_FRAMES; ++idx) if (ref_map_pairs[idx].disp_order == -1) return idx; return INVALID_IDX; } static int 
get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], int update_arf, GF_GROUP *gf_group, int gf_index, int enable_refresh_skip, int cur_frame_disp) { int arf_count = 0; int oldest_arf_order = INT32_MAX; int oldest_arf_idx = -1; int oldest_frame_order = INT32_MAX; int oldest_idx = -1; for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; if (ref_pair.disp_order == -1) continue; const int frame_order = ref_pair.disp_order; const int reference_frame_level = ref_pair.pyr_level; // Keep future frames and three closest previous frames in output order. if (frame_order > cur_frame_disp - 3) continue; if (enable_refresh_skip) { int skip_frame = 0; // Prevent refreshing a frame in gf_group->skip_frame_refresh. for (int i = 0; i < REF_FRAMES; i++) { int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i]; if (frame_to_skip == INVALID_IDX) break; if (frame_order == frame_to_skip) { skip_frame = 1; break; } } if (skip_frame) continue; } // Keep track of the oldest level 1 frame if the current frame is also level // 1. if (reference_frame_level == 1) { // If there are more than 2 level 1 frames in the reference list, // discard the oldest. if (frame_order < oldest_arf_order) { oldest_arf_order = frame_order; oldest_arf_idx = map_idx; } arf_count++; continue; } // Update the overall oldest reference frame. if (frame_order < oldest_frame_order) { oldest_frame_order = frame_order; oldest_idx = map_idx; } } if (update_arf && arf_count > 2) return oldest_arf_idx; if (oldest_idx >= 0) return oldest_idx; if (oldest_arf_idx >= 0) return oldest_arf_idx; if (oldest_idx == -1) { assert(arf_count > 2 && enable_refresh_skip); return oldest_arf_idx; } assert(0 && "No valid refresh index found"); return -1; } // Computes the reference refresh index for INTNL_ARF_UPDATE frame. int av1_calc_refresh_idx_for_intnl_arf( AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], int gf_index) { GF_GROUP *const gf_group = &cpi->ppi->gf_group; // Search for the open slot to store the current frame. int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); // Use a free slot if available. if (free_fb_index != INVALID_IDX) { return free_fb_index; } else { int enable_refresh_skip = !is_one_pass_rt_params(cpi); int refresh_idx = get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index, enable_refresh_skip, gf_group->display_idx[gf_index]); return refresh_idx; } } int av1_get_refresh_frame_flags( const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { const AV1_COMMON *const cm = &cpi->common; const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = &cpi->ext_flags.refresh_frame; GF_GROUP *gf_group = &cpi->ppi->gf_group; if (gf_group->refbuf_state[gf_index] == REFBUF_RESET) return SELECT_ALL_BUF_SLOTS; // TODO(jingning): Deprecate the following operations. // Switch frames and shown key-frames overwrite all reference slots if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS; // show_existing_frames don't actually send refresh_frame_flags so set the // flags to 0 to keep things consistent. 
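// Throughout this function the refresh value is a bitmask over the REF_FRAMES physical buffer slots: bit i set means slot i is overwritten by the current frame. For example, a mask of (1 << 3) refreshes only slot 3, and av1_get_refresh_ref_frame_map(1 << 3) above maps it back to slot index 3.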
if (frame_params->show_existing_frame) return 0; const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0; #if !CONFIG_REALTIME_ONLY if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { int new_fb_map_idx = cpi->ppi->gf_group.update_ref_idx[gf_index]; if (new_fb_map_idx == INVALID_IDX) return 0; return 1 << new_fb_map_idx; } #endif // !CONFIG_REALTIME_ONLY int refresh_mask = 0; if (ext_refresh_frame_flags->update_pending) { if (rtc_ref->set_ref_frame_config || use_rtc_reference_structure_one_layer(cpi)) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int ref_frame_map_idx = rtc_ref->ref_idx[i]; refresh_mask |= rtc_ref->refresh[ref_frame_map_idx] << ref_frame_map_idx; } return refresh_mask; } // Unfortunately the encoder interface reflects the old refresh_*_frame // flags so we have to replicate the old refresh_frame_flags logic here in // order to preserve the behaviour of the flag overrides. int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx; ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame << ref_frame_map_idx; ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame << ref_frame_map_idx; if (frame_update_type == OVERLAY_UPDATE) { ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->golden_frame << ref_frame_map_idx; } else { ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->golden_frame << ref_frame_map_idx; ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); if (ref_frame_map_idx != INVALID_IDX) refresh_mask |= ext_refresh_frame_flags->alt_ref_frame << ref_frame_map_idx; } return refresh_mask; } // Search for the open slot to store the current frame. int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); // No refresh necessary for these frame types. if (frame_update_type == OVERLAY_UPDATE || frame_update_type == INTNL_OVERLAY_UPDATE) return refresh_mask; // If there is an open slot, refresh that one instead of replacing a // reference. if (free_fb_index != INVALID_IDX) { refresh_mask = 1 << free_fb_index; return refresh_mask; } const int enable_refresh_skip = !is_one_pass_rt_params(cpi); const int update_arf = frame_update_type == ARF_UPDATE; const int refresh_idx = get_refresh_idx(ref_frame_map_pairs, update_arf, &cpi->ppi->gf_group, gf_index, enable_refresh_skip, cur_disp_order); return 1 << refresh_idx; } #if !CONFIG_REALTIME_ONLY // Apply temporal filtering to source frames and encode the filtered frame. // If the current frame does not require filtering, this function is identical // to av1_encode() except that tpl is not performed. 
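// Order of operations below: (1) decide whether temporal filtering applies (key frames, ARFs and, optionally, second ARFs), (2) filter the source and possibly point frame_input->source at the filtered buffer, (3) run TPL at the start of a GF group on the (possibly filtered) source, (4) call av1_encode().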
static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, size_t dest_size, EncodeFrameInput *const frame_input, const EncodeFrameParams *const frame_params, size_t *const frame_size) { #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time); #endif const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; GF_GROUP *const gf_group = &cpi->ppi->gf_group; FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int is_second_arf = av1_gop_is_second_arf(gf_group, cpi->gf_frame_index); // Decide whether to apply temporal filtering to the source frame. int apply_filtering = av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi); if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) { apply_filtering = 0; } if (apply_filtering) { if (frame_params->frame_type == KEY_FRAME) { // TODO(angiebird): Move the noise level check to av1_tf_info_filtering. // Decide whether it is allowed to perform key frame filtering int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering && !frame_params->show_existing_frame && !is_lossless_requested(&oxcf->rc_cfg); if (allow_kf_filtering) { double y_noise_level = 0.0; av1_estimate_noise_level( frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y, cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD); apply_filtering = y_noise_level > 0; } else { apply_filtering = 0; } // If we are doing kf filtering, set up a few things. if (apply_filtering) { av1_setup_past_independence(cm); } } else if (is_second_arf) { apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering; } } #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time); #endif // Save the pointer to the original source image. YV12_BUFFER_CONFIG *source_buffer = frame_input->source; // apply filtering to frame if (apply_filtering) { int show_existing_alt_ref = 0; FRAME_DIFF frame_diff; int top_index = 0; int bottom_index = 0; const int q_index = av1_rc_pick_q_and_bounds( cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, cpi->gf_frame_index, &bottom_index, &top_index); // TODO(bohanli): figure out why we need frame_type in cm here. cm->current_frame.frame_type = frame_params->frame_type; if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf( &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff); if (tf_buf != NULL) { frame_input->source = tf_buf; show_existing_alt_ref = av1_check_show_filtered_frame( tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth); if (show_existing_alt_ref) { cpi->common.showable_frame |= 1; } else { cpi->common.showable_frame = 0; } } if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) { cpi->ppi->show_existing_alt_ref = show_existing_alt_ref; } } if (is_second_arf) { // Allocate the memory for tf_buf_second_arf buffer, only when it is // required. 
int ret = aom_realloc_frame_buffer( &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0); if (ret) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate tf_buf_second_arf"); YV12_BUFFER_CONFIG *tf_buf_second_arf = &cpi->ppi->tf_info.tf_buf_second_arf; // We didn't apply temporal filtering for second arf ahead in // av1_tf_info_filtering(). const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; // Right now, we are still using tf_buf_second_arf due to // implementation complexity. // TODO(angiebird): Reuse tf_info->tf_buf here. av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff, tf_buf_second_arf); show_existing_alt_ref = av1_check_show_filtered_frame( tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth); if (show_existing_alt_ref) { aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm)); frame_input->source = tf_buf_second_arf; } // Currently INTNL_ARF_UPDATE only do show_existing. cpi->common.showable_frame |= 1; } // Copy source metadata to the temporal filtered frame if (source_buffer->metadata && aom_copy_metadata_to_frame_buffer(frame_input->source, source_buffer->metadata)) { aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to copy source metadata to the temporal filtered frame"); } } #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time); #endif int set_mv_params = frame_params->frame_type == KEY_FRAME || update_type == ARF_UPDATE || update_type == GF_UPDATE; cm->show_frame = frame_params->show_frame; cm->current_frame.frame_type = frame_params->frame_type; // TODO(bohanli): Why is this? what part of it is necessary? av1_set_frame_size(cpi, cm->width, cm->height); if (set_mv_params) av1_set_mv_search_params(cpi); #if CONFIG_RD_COMMAND if (frame_params->frame_type == KEY_FRAME) { char filepath[] = "rd_command.txt"; av1_read_rd_command(filepath, &cpi->rd_command); } #endif // CONFIG_RD_COMMAND if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) { // perform tpl after filtering int allow_tpl = oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model; if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) { allow_tpl = 0; } if (frame_params->frame_type != KEY_FRAME) { // In rare case, it's possible to have non ARF/GF update_type here. 
// We should set allow_tpl to zero in the situation allow_tpl = allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE || (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL)); } if (allow_tpl) { if (!cpi->skip_tpl_setup_stats) { av1_tpl_preload_rc_estimate(cpi, frame_params); av1_tpl_setup_stats(cpi, 0, frame_params); #if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS assert(cpi->gf_frame_index == 0); av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data, gf_group, cm->seq_params->bit_depth); #endif } } else { av1_init_tpl_stats(&cpi->ppi->tpl_data); } #if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS if (cpi->oxcf.pass == AOM_RC_SECOND_PASS && cpi->second_pass_log_stream != NULL) { TPL_INFO *tpl_info; AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data); av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream, cpi->common.error); aom_free(tpl_info); } #endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS } if (av1_encode(cpi, dest, dest_size, frame_input, frame_params, frame_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } // Set frame_input source to true source for psnr calculation. if (apply_filtering && is_psnr_calc_enabled(cpi)) { cpi->source = av1_realloc_and_scale_if_required( cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0, false, true, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); cpi->unscaled_source = source_buffer; } #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time); #endif return AOM_CODEC_OK; } #endif // !CONFIG_REALTIME_ONLY /*!\cond */ // Struct to keep track of relevant reference frame data. typedef struct { int map_idx; int disp_order; int pyr_level; int used; } RefBufMapData; /*!\endcond */ // Comparison function to sort reference frames in ascending display order. static int compare_map_idx_pair_asc(const void *a, const void *b) { if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) { return 0; } else if (((const RefBufMapData *)a)->disp_order > ((const RefBufMapData *)b)->disp_order) { return 1; } else { return -1; } } // Checks to see if a particular reference frame is already in the reference // frame map. static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) { for (int i = 0; i < n_frames; i++) { if (disp_order == map[i].disp_order) return 1; } return 0; } // Add a reference buffer index to a named reference slot. static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx, int frame) { remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx; ref->used = 1; } // Threshold dictating when we are allowed to start considering // leaving lowest level frames unmapped. #define LOW_LEVEL_FRAMES_TR 5 // Find which reference buffer should be left out of the named mapping. // This is because there are 8 reference buffers and only 7 named slots. 
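// The buffer chosen for exclusion is the unused one farthest from the current frame in display order; buffers at the lowest pyramid level are only considered once at least LOW_LEVEL_FRAMES_TR of them exist.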
static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs, int n_min_level_refs, int min_level, int cur_frame_disp) { int max_dist = 0; int unmapped_idx = -1; if (n_bufs <= ALTREF_FRAME) return; for (int i = 0; i < n_bufs; i++) { if (buffer_map[i].used) continue; if (buffer_map[i].pyr_level != min_level || n_min_level_refs >= LOW_LEVEL_FRAMES_TR) { int dist = abs(cur_frame_disp - buffer_map[i].disp_order); if (dist > max_dist) { max_dist = dist; unmapped_idx = i; } } } assert(unmapped_idx >= 0 && "Unmapped reference not found"); buffer_map[unmapped_idx].used = 1; } void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], int cur_frame_disp, const AV1_COMP *cpi, int gf_index, int is_parallel_encode, int remapped_ref_idx[REF_FRAMES]) { int buf_map_idx = 0; // Initialize reference frame mappings. for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; #if !CONFIG_REALTIME_ONLY if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) { if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) { remapped_ref_idx[rf - LAST_FRAME] = cpi->ppi->gf_group.ref_frame_list[gf_index][rf]; } } int valid_rf_idx = 0; static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = { GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME, ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME }; for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) { int rf = ref_frame_type_order[i]; if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) { valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME]; break; } } for (int i = 0; i < REF_FRAMES; ++i) { if (remapped_ref_idx[i] == INVALID_IDX) { remapped_ref_idx[i] = valid_rf_idx; } } return; } #endif // !CONFIG_REALTIME_ONLY RefBufMapData buffer_map[REF_FRAMES]; int n_bufs = 0; memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0])); int min_level = MAX_ARF_LAYERS; int max_level = 0; GF_GROUP *gf_group = &cpi->ppi->gf_group; int skip_ref_unmapping = 0; int is_one_pass_rt = is_one_pass_rt_params(cpi); // Go through current reference buffers and store display order, pyr level, // and map index. for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { // Get reference frame buffer. RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; if (ref_pair.disp_order == -1) continue; const int frame_order = ref_pair.disp_order; // Avoid duplicates. if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue; const int reference_frame_level = ref_pair.pyr_level; // Keep track of the lowest and highest levels that currently exist. if (reference_frame_level < min_level) min_level = reference_frame_level; if (reference_frame_level > max_level) max_level = reference_frame_level; buffer_map[n_bufs].map_idx = map_idx; buffer_map[n_bufs].disp_order = frame_order; buffer_map[n_bufs].pyr_level = reference_frame_level; buffer_map[n_bufs].used = 0; n_bufs++; } // Sort frames in ascending display order. qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc); int n_min_level_refs = 0; int closest_past_ref = -1; int golden_idx = -1; int altref_idx = -1; // Find the GOLDEN_FRAME and BWDREF_FRAME. // Also collect various stats about the reference frames for the remaining // mappings. for (int i = n_bufs - 1; i >= 0; i--) { if (buffer_map[i].pyr_level == min_level) { // Keep track of the number of lowest level frames. 
n_min_level_refs++; if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 && remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) { // Save index for GOLDEN. golden_idx = i; } else if (buffer_map[i].disp_order > cur_frame_disp && altref_idx == -1 && remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) { // Save index for ALTREF. altref_idx = i; } } else if (buffer_map[i].disp_order == cur_frame_disp) { // Map the BWDREF_FRAME if this is the show_existing_frame. add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME); } // During parallel encodes of lower layer frames, exclude the first frame // (frame_parallel_level 1) from being used for the reference assignment of // the second frame (frame_parallel_level 2). if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 && gf_group->frame_parallel_level[gf_index - 1] == 1 && gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) { assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE); #if CONFIG_FPMT_TEST is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) ? is_parallel_encode : 0; #endif // CONFIG_FPMT_TEST // If parallel cpis are active, use ref_idx_to_skip, else, use display // index. assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX)); assert(IMPLIES(!is_parallel_encode, gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX)); buffer_map[i].used = is_parallel_encode ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip) : (buffer_map[i].disp_order == gf_group->skip_frame_as_ref[gf_index]); // In case a ref frame is excluded from being used during assignment, // skip the call to set_unmapped_ref(). Applicable in steady state. if (buffer_map[i].used) skip_ref_unmapping = 1; } // Keep track of where the frames change from being past frames to future // frames. if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0) closest_past_ref = i; } // Do not map GOLDEN and ALTREF based on their pyramid level if all reference // frames have the same level. if (n_min_level_refs <= n_bufs) { // Map the GOLDEN_FRAME. if (golden_idx > -1) add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME); // Map the ALTREF_FRAME. if (altref_idx > -1) add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME); } // Find the buffer to be excluded from the mapping. if (!skip_ref_unmapping) set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level, cur_frame_disp); // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME. for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) { // Continue if the current ref slot is already full. if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; // Find the next unmapped reference buffer // in decreasing ouptut order relative to current picture. int next_buf_max = 0; int next_disp_order = INT_MIN; for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { if (!buffer_map[buf_map_idx].used && buffer_map[buf_map_idx].disp_order < cur_frame_disp && buffer_map[buf_map_idx].disp_order > next_disp_order) { next_disp_order = buffer_map[buf_map_idx].disp_order; next_buf_max = buf_map_idx; } } buf_map_idx = next_buf_max; if (buf_map_idx < 0) break; if (buffer_map[buf_map_idx].used) break; add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); } // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME. for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) { // Continue if the current ref slot is already full. 
if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; // Find the next unmapped reference buffer // in increasing ouptut order relative to current picture. int next_buf_max = 0; int next_disp_order = INT_MAX; for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { if (!buffer_map[buf_map_idx].used && buffer_map[buf_map_idx].disp_order > cur_frame_disp && buffer_map[buf_map_idx].disp_order < next_disp_order) { next_disp_order = buffer_map[buf_map_idx].disp_order; next_buf_max = buf_map_idx; } } buf_map_idx = next_buf_max; if (buf_map_idx < 0) break; if (buffer_map[buf_map_idx].used) break; add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); } // Place remaining past frames. buf_map_idx = closest_past_ref; for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) { // Continue if the current ref slot is already full. if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; // Find the next unmapped reference buffer. for (; buf_map_idx >= 0; buf_map_idx--) { if (!buffer_map[buf_map_idx].used) break; } if (buf_map_idx < 0) break; if (buffer_map[buf_map_idx].used) break; add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); } // Place remaining future frames. buf_map_idx = n_bufs - 1; for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) { // Continue if the current ref slot is already full. if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; // Find the next unmapped reference buffer. for (; buf_map_idx > closest_past_ref; buf_map_idx--) { if (!buffer_map[buf_map_idx].used) break; } if (buf_map_idx < 0) break; if (buffer_map[buf_map_idx].used) break; add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); } // Fill any slots that are empty (should only happen for the first 7 frames). for (int i = 0; i < REF_FRAMES; ++i) if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0; } int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, uint8_t *const dest, size_t dest_size, unsigned int *frame_flags, int64_t *const time_stamp, int64_t *const time_end, const aom_rational64_t *const timestamp_ratio, int *const pop_lookahead, int flush) { AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; GF_GROUP *gf_group = &cpi->ppi->gf_group; ExternalFlags *const ext_flags = &cpi->ext_flags; GFConfig *const gf_cfg = &oxcf->gf_cfg; EncodeFrameInput frame_input; EncodeFrameParams frame_params; size_t frame_size; memset(&frame_input, 0, sizeof(frame_input)); memset(&frame_params, 0, sizeof(frame_params)); frame_size = 0; #if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info; if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) { THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; av1_open_second_pass_log(cpi, 1); FILE *second_pass_log_stream = cpi->second_pass_log_stream; fseek(second_pass_log_stream, 0, SEEK_END); size_t file_size = ftell(second_pass_log_stream); rewind(second_pass_log_stream); size_t read_size = 0; while (read_size < file_size) { THIRD_PASS_GOP_INFO gop_info; struct aom_internal_error_info *error = cpi->common.error; // Read in GOP information from the second pass file. 
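// Each record in the second-pass log holds one GOP header, its packed TPL info and the per-frame stats; the loop replays the whole log so that vbr_rc_info describes every frame before third-pass encoding begins.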
av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error); TPL_INFO *tpl_info; AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); av1_read_tpl_info(tpl_info, second_pass_log_stream, error); // Read in per-frame info from second-pass encoding av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info, gop_info.num_frames, error); av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info); read_size = ftell(second_pass_log_stream); aom_free(tpl_info); } av1_close_second_pass_log(cpi); if (cpi->oxcf.rc_cfg.mode == AOM_Q) { vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level; av1_vbr_rc_compute_q_indices( vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count, vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth, vbr_rc_info->q_index_list); } else { vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q( vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth, vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count, vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list, vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL); } vbr_rc_info->ready = 1; #if CONFIG_RATECTRL_LOG rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count); #endif // CONFIG_RATECTRL_LOG } #endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS // Check if we need to stuff more src frames if (flush == 0) { int srcbuf_size = av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); int pop_size = av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage); // Continue buffering look ahead buffer. if (srcbuf_size < pop_size) return -1; } if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) { #if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == AOM_RC_FIRST_PASS && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; } // TODO(sarahparker) finish bit allocation for one pass pyramid if (has_no_stats_stage(cpi)) { gf_cfg->gf_max_pyr_height = AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS); gf_cfg->gf_min_pyr_height = AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height); } // Allocation of mi buffers. alloc_mb_mode_info_buffers(cpi); cpi->skip_tpl_setup_stats = 0; #if !CONFIG_REALTIME_ONLY if (oxcf->pass != AOM_RC_FIRST_PASS) { TplParams *const tpl_data = &cpi->ppi->tpl_data; if (tpl_data->tpl_stats_pool[0] == NULL) { av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, 0, oxcf->gf_cfg.lag_in_frames); } } cpi->twopass_frame.this_frame = NULL; const int use_one_pass_rt_params = is_one_pass_rt_params(cpi); if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_get_second_pass_params_time); #endif // Initialise frame_level_rate_correction_factors with value previous // to the parallel frames. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { for (int i = 0; i < RATE_FACTOR_LEVELS; i++) { cpi->rc.frame_level_rate_correction_factors[i] = #if CONFIG_FPMT_TEST (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? cpi->ppi->p_rc.temp_rate_correction_factors[i] : #endif // CONFIG_FPMT_TEST cpi->ppi->p_rc.rate_correction_factors[i]; } } // copy mv_stats from ppi to frame_level cpi. 
cpi->mv_stats = cpi->ppi->mv_stats; av1_get_second_pass_params(cpi, &frame_params, *frame_flags); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_get_second_pass_params_time); #endif } #endif if (!is_stat_generation_stage(cpi)) { // TODO(jingning): fwd key frame always uses show existing frame? if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE && gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { frame_params.show_existing_frame = 1; } else { frame_params.show_existing_frame = (cpi->ppi->show_existing_alt_ref && gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) || gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE; } frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); // Special handling to reset 'show_existing_frame' in case of dropped // frames. if (oxcf->rc_cfg.drop_frames_water_mark && (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE || gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) { // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop // over the gf group to check if the corresponding // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped. int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index]; for (int idx = 0; idx < cpi->gf_frame_index; idx++) { if (cur_disp_idx == gf_group->display_idx[idx]) { assert(IMPLIES( gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE, gf_group->update_type[idx] == ARF_UPDATE)); assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE, gf_group->update_type[idx] == INTNL_ARF_UPDATE)); // Reset show_existing_frame and set cpi->is_dropped_frame to true if // the frame was dropped during its first encode. if (gf_group->is_frame_dropped[idx]) { frame_params.show_existing_frame = 0; assert(!cpi->is_dropped_frame); cpi->is_dropped_frame = true; } break; } } } // Reset show_existing_alt_ref decision to 0 after it is used. if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { cpi->ppi->show_existing_alt_ref = 0; } } else { frame_params.show_existing_frame = 0; } struct lookahead_entry *source = NULL; struct lookahead_entry *last_source = NULL; if (frame_params.show_existing_frame) { source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); *pop_lookahead = 1; frame_params.show_frame = 1; } else { source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source, &frame_params.show_frame); } if (source == NULL) { // If no source was found, we can't encode a frame. #if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == AOM_RC_FIRST_PASS && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; } // reset src_offset to allow actual encode call for this frame to get its // source. gf_group->src_offset[cpi->gf_frame_index] = 0; // Source may be changed if temporal filtered later. frame_input.source = &source->img; if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) && last_source != NULL) av1_svc_set_last_source(cpi, &frame_input, &last_source->img); else frame_input.last_source = last_source != NULL ? &last_source->img : NULL; frame_input.ts_duration = source->ts_end - source->ts_start; // Save unfiltered source. It is used in av1_get_second_pass_params(). 
cpi->unfiltered_source = frame_input.source; *time_stamp = source->ts_start; *time_end = source->ts_end; if (source->ts_start < cpi->time_stamps.first_ts_start) { cpi->time_stamps.first_ts_start = source->ts_start; cpi->time_stamps.prev_ts_end = source->ts_start; } av1_apply_encoding_flags(cpi, source->flags); *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { cpi->framerate = cpi->temp_framerate; } } #endif // CONFIG_FPMT_TEST // Shown frames and arf-overlay frames need frame-rate considering if (frame_params.show_frame) adjust_frame_rate(cpi, source->ts_start, source->ts_end); if (!frame_params.show_existing_frame) { #if !CONFIG_REALTIME_ONLY if (cpi->film_grain_table) { cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup( cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, &cm->film_grain_params); } else { cm->cur_frame->film_grain_params_present = cm->seq_params->film_grain_params_present; } #endif // only one operating point supported now const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; cm->frame_presentation_time = (uint32_t)pts64; } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_get_one_pass_rt_params_time); #endif #if CONFIG_REALTIME_ONLY av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input, *frame_flags); if (use_rtc_reference_structure_one_layer(cpi)) av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0); #else if (use_one_pass_rt_params) { av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input, *frame_flags); if (use_rtc_reference_structure_one_layer(cpi)) av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0); } #endif #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_get_one_pass_rt_params_time); #endif FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group, cpi->gf_frame_index); if (frame_params.show_existing_frame && frame_params.frame_type != KEY_FRAME) { // Force show-existing frames to be INTER, except forward keyframes frame_params.frame_type = INTER_FRAME; } // Per-frame encode speed. In theory this can vary, but things may have // been written assuming speed-level will not change within a sequence, so // this parameter should be used with caution. frame_params.speed = oxcf->speed; #if !CONFIG_REALTIME_ONLY // Set forced key frames when necessary. For two-pass encoding / lap mode, // this is already handled by av1_get_second_pass_params. However when no // stats are available, we still need to check if the new frame is a keyframe. // For one pass rt, this is already checked in av1_get_one_pass_rt_params. if (!use_one_pass_rt_params && (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) { // Current frame is coded as a key-frame for any of the following cases: // 1) First frame of a video // 2) For all-intra frame encoding // 3) When a key-frame is forced const int kf_requested = (cm->current_frame.frame_number == 0 || oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY)); if (kf_requested && frame_update_type != OVERLAY_UPDATE && frame_update_type != INTNL_OVERLAY_UPDATE) { frame_params.frame_type = KEY_FRAME; } else if (is_stat_generation_stage(cpi)) { // For stats generation, set the frame type to inter here. 
frame_params.frame_type = INTER_FRAME; } } #endif // Work out some encoding parameters specific to the pass: if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { av1_cyclic_refresh_update_parameters(cpi); } else if (is_stat_generation_stage(cpi)) { cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg); } else if (is_stat_consumption_stage(cpi)) { #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); #endif #if TXCOEFF_COST_TIMER cm->txcoeff_cost_timer = 0; cm->txcoeff_cost_count = 0; #endif } if (!is_stat_generation_stage(cpi)) set_ext_overrides(cm, &frame_params, ext_flags); // Shown keyframes and S frames refresh all reference buffers const int force_refresh_all = ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || frame_params.frame_type == S_FRAME) && !frame_params.show_existing_frame; av1_configure_buffer_updates( cpi, &frame_params.refresh_frame, frame_update_type, gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all); if (!is_stat_generation_stage(cpi)) { const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; init_ref_map_pair(cpi, ref_frame_map_pairs); const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; const int cur_frame_disp = cpi->common.current_frame.frame_number + order_offset; int get_ref_frames = 0; #if CONFIG_FPMT_TEST get_ref_frames = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0; #endif // CONFIG_FPMT_TEST if (get_ref_frames || gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) { if (!ext_flags->refresh_frame.update_pending) { av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi, cpi->gf_frame_index, 1, cm->remapped_ref_idx); } else if (cpi->ppi->rtc_ref.set_ref_frame_config || use_rtc_reference_structure_one_layer(cpi)) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i]; } } // Get the reference frames bool has_ref_frames = false; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { const RefCntBuffer *ref_frame = get_ref_frame_buf(cm, ref_frame_priority_order[i]); ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL; if (ref_frame != NULL) has_ref_frames = true; } if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME || frame_params.frame_type == S_FRAME)) { return AOM_CODEC_ERROR; } // Work out which reference frame slots may be used. frame_params.ref_frame_flags = get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf, ext_flags->ref_frame_flags); // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE. if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) { frame_params.primary_ref_frame = PRIMARY_REF_NONE; } else { frame_params.primary_ref_frame = choose_primary_ref_frame(cpi, &frame_params); } frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; // Call av1_get_refresh_frame_flags() if refresh index not available. if (!cpi->refresh_idx_available) { frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( cpi, &frame_params, frame_update_type, cpi->gf_frame_index, cur_frame_disp, ref_frame_map_pairs); } else { assert(cpi->ref_refresh_index != INVALID_IDX); frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index); } // Make the frames marked as is_frame_non_ref to non-reference frames. 
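// A refresh mask of 0 means the coded frame overwrites no buffer slot, so later frames can never use it as a reference.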
if (gf_group->is_frame_non_ref[cpi->gf_frame_index]) frame_params.refresh_frame_flags = 0; frame_params.existing_fb_idx_to_show = INVALID_IDX; // Find the frame buffer to show based on display order. if (frame_params.show_existing_frame) { for (int frame = 0; frame < REF_FRAMES; frame++) { const RefCntBuffer *const buf = cm->ref_frame_map[frame]; if (buf == NULL) continue; const int frame_order = (int)buf->display_order_hint; if (frame_order == cur_frame_disp) frame_params.existing_fb_idx_to_show = frame; } } } // The way frame_params->remapped_ref_idx is setup is a placeholder. // Currently, reference buffer assignment is done by update_ref_frame_map() // which is called by high-level strategy AFTER encoding a frame. It // modifies cm->remapped_ref_idx. If you want to use an alternative method // to determine reference buffer assignment, just put your assignments into // frame_params->remapped_ref_idx here and they will be used when encoding // this frame. If frame_params->remapped_ref_idx is setup independently of // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, REF_FRAMES * sizeof(*cm->remapped_ref_idx)); cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0; if (!frame_params.show_existing_frame) { cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm; } const int is_intra_frame = frame_params.frame_type == KEY_FRAME || frame_params.frame_type == INTRA_ONLY_FRAME; FeatureFlags *const features = &cm->features; if (!is_stat_generation_stage(cpi) && (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) && is_intra_frame) { av1_set_screen_content_options(cpi, features); } #if CONFIG_REALTIME_ONLY if (av1_encode(cpi, dest, dest_size, &frame_input, &frame_params, &frame_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } #else if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME && gf_cfg->lag_in_frames == 0) { if (av1_encode(cpi, dest, dest_size, &frame_input, &frame_params, &frame_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } } else if (denoise_and_encode(cpi, dest, dest_size, &frame_input, &frame_params, &frame_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } #endif // CONFIG_REALTIME_ONLY // This is used in rtc temporal filter case. Use true source in the PSNR // calculation. 
if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { assert(cpi->orig_source.buffer_alloc_sz > 0); cpi->source = &cpi->orig_source; } if (!is_stat_generation_stage(cpi)) { // First pass doesn't modify reference buffer assignment or produce frame // flags update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags); set_additional_frame_flags(cm, frame_flags); } #if !CONFIG_REALTIME_ONLY #if TXCOEFF_COST_TIMER if (!is_stat_generation_stage(cpi)) { cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; fprintf(stderr, "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " "in us\n", cm->txcoeff_cost_count, cm->txcoeff_cost_timer, cm->cum_txcoeff_cost_timer); } #endif #endif // !CONFIG_REALTIME_ONLY #if CONFIG_TUNE_VMAF if (!is_stat_generation_stage(cpi) && (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { av1_update_vmaf_curve(cpi); } #endif *size = frame_size; // Leave a signal for a higher level caller about if this frame is droppable if (*size > 0) { cpi->droppable = is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame); } // For SVC, or when frame-dropper is enabled: // keep track of the (unscaled) source corresponding to the refresh of LAST // reference (base temporal layer - TL0). Copy only for the // top spatial enhancement layer so all spatial layers of the next // superframe have last_source to be aligned with previous TL0 superframe. // Avoid cases where resolution changes for unscaled source (top spatial // layer). Only needs to be done for frame that are encoded (size > 0). if (*size > 0 && (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && cpi->svc.temporal_layer_id == 0 && cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width && cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) { aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); } return AOM_CODEC_OK; } aom-3.12.1/av1/encoder/encode_strategy.h000066400000000000000000000130121477627663500200340ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Declares frame encoding functions. */ #ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ #define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ #ifdef __cplusplus extern "C" { #endif #include #include "aom/aom_encoder.h" #include "av1/encoder/encoder.h" #include "av1/encoder/firstpass.h" /*!\brief Implement high-level encode strategy * * \ingroup high_level_algo * \callgraph * \callergraph * This function will implement high-level encode strategy, choosing frame type, * frame placement, etc. It populates an EncodeFrameParams struct with the * results of these decisions and then encodes the frame. The caller should use * the output parameters *time_stamp and *time_end only when this function * returns AOM_CODEC_OK. 
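* A -1 return is not a failure: it is used when no frame could be produced on this call, for example while the lookahead buffer is still being filled.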
* * \param[in] cpi Top-level encoder structure * \param[out] size Bitstream size * \param[out] dest Bitstream output buffer * \param[in] dest_size Bitstream output buffer size * \param[in] frame_flags Flags to decide how to encoding the frame * \param[out] time_stamp Time stamp of the frame * \param[out] time_end Time end * \param[in] timestamp_ratio Time base * \param[in] pop_lookahead Decide to pop the source frame from queue * \param[in] flush Decide to encode one frame or the rest of frames * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval -1 * \retval #AOM_CODEC_ERROR */ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, uint8_t *const dest, size_t dest_size, unsigned int *frame_flags, int64_t *const time_stamp, int64_t *const time_end, const aom_rational64_t *const timestamp_ratio, int *const pop_lookahead, int flush); /*!\cond */ // Set individual buffer update flags based on frame reference type. // force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all // refresh_*_frame flags to be set, because we refresh all buffers in this case. void av1_configure_buffer_updates(AV1_COMP *const cpi, RefreshFrameInfo *const refresh_frame, const FRAME_UPDATE_TYPE type, const REFBUF_STATE refbuf_state, int force_refresh_all); int av1_get_refresh_frame_flags( const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]); int av1_get_refresh_ref_frame_map(int refresh_frame_flags); /*!\brief Obtain indices of reference frames in ref_frame_map * * \callgraph * \callergraph * * \param[out] remapped_ref_idx An array for storing indices of reference * frames. The index is used to retrieve a * reference frame buffer from ref_frame_map * in AV1Common. */ void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], int cur_frame_disp, const AV1_COMP *cpi, int gf_index, int is_parallel_encode, int remapped_ref_idx[REF_FRAMES]); int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, const int up_to_index, const COMPRESSOR_STAGE compressor_stage); static inline int is_frame_droppable( const RTC_REF *const rtc_ref, const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) { // Droppable frame is only used by external refresh flags. VoD setting won't // trigger its use case. if (rtc_ref->set_ref_frame_config) return rtc_ref->non_reference_frame; else if (ext_refresh_frame_flags->update_pending) return !(ext_refresh_frame_flags->alt_ref_frame || ext_refresh_frame_flags->alt2_ref_frame || ext_refresh_frame_flags->bwd_ref_frame || ext_refresh_frame_flags->golden_frame || ext_refresh_frame_flags->last_frame); else return 0; } static inline int get_current_frame_ref_type(const AV1_COMP *const cpi) { // We choose the reference "type" of this frame from the flags which indicate // which reference frames will be refreshed by it. More than one of these // flags may be set, so the order here implies an order of precedence. This is // just used to choose the primary_ref_frame (as the most recent reference // buffer of the same reference-type as the current frame). 
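// Mapping: layer depth 0 -> type 0, depth 1 -> type 1, the deepest ARF layers (MAX_ARF_LAYERS and MAX_ARF_LAYERS + 1) -> type 4, and every other depth -> type 7.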
switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) { case 0: return 0; case 1: return 1; case MAX_ARF_LAYERS: case MAX_ARF_LAYERS + 1: return 4; default: return 7; } } int av1_calc_refresh_idx_for_intnl_arf( AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], int gf_index); /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ aom-3.12.1/av1/encoder/encodeframe.c000066400000000000000000003044021477627663500171260ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" #include "aom_ports/mem.h" #include "aom_ports/aom_timer.h" #include "aom_util/aom_pthread.h" #if CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_MISMATCH_DEBUG #include "av1/common/cfl.h" #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/idct.h" #include "av1/common/mv.h" #include "av1/common/mvref_common.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconintra.h" #include "av1/common/reconinter.h" #include "av1/common/seg_common.h" #include "av1/common/tile_common.h" #include "av1/common/warped_motion.h" #include "av1/encoder/allintra_vis.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/global_motion_facade.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/ml.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/partition_model_weights.h" #endif #include "av1/encoder/partition_search.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/var_based_part.h" #if CONFIG_TUNE_VMAF #include "av1/encoder/tune_vmaf.h" #endif /*!\cond */ // This is used as a reference when computing the source variance for the // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, // which will be faster. 
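// The tables below are flat mid-gray "reference" rows: 128 for 8-bit input,
// 128 * 4 for 10-bit and 128 * 16 for 12-bit, one entry per column of the
// widest superblock. Passing one of these (with a zero stride) to the regular
// variance kernels yields the variance of a source block against a constant
// block, which is what the activity-masking heuristics need.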
static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; #if CONFIG_AV1_HIGHBITDEPTH static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 }; static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 
16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16 }; #endif // CONFIG_AV1_HIGHBITDEPTH /*!\endcond */ // For the given bit depth, returns a constant array used to assist the // calculation of source block variance, which will then be used to decide // adaptive quantizers. static const uint8_t *get_var_offs(int use_hbd, int bd) { #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { assert(bd == 8 || bd == 10 || bd == 12); const int off_index = (bd - 8) >> 1; static const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, AV1_HIGH_VAR_OFFS_10, AV1_HIGH_VAR_OFFS_12 }; return CONVERT_TO_BYTEPTR(high_var_offs[off_index]); } #else (void)use_hbd; (void)bd; assert(!use_hbd); #endif assert(bd == 8); return AV1_VAR_OFFS; } void av1_init_rtc_counters(MACROBLOCK *const x) { av1_init_cyclic_refresh_counters(x); x->cnt_zeromv = 0; } void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) { if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x); cpi->rc.cnt_zeromv += x->cnt_zeromv; } unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, const MACROBLOCKD *xd, const struct buf_2d *ref, BLOCK_SIZE bsize, int plane, int use_hbd) { const int subsampling_x = xd->plane[plane].subsampling_x; const int subsampling_y = xd->plane[plane].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, subsampling_x, subsampling_y); unsigned int sse; const unsigned int var = cpi->ppi->fn_ptr[plane_bsize].vf( ref->buf, ref->stride, get_var_offs(use_hbd, xd->bd), 0, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[plane_bsize]); } unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi, const MACROBLOCKD *xd, const struct buf_2d *ref, BLOCK_SIZE bsize, int plane) { const int use_hbd = is_cur_buf_hbd(xd); return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane, use_hbd); } void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, const int num_planes, BLOCK_SIZE bsize) { // Set current frame pointer. x->e_mbd.cur_buf = src; // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet // the static analysis warnings. for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { const int is_uv = i > 0; setup_pred_plane( &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv], src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); } } #if !CONFIG_REALTIME_ONLY /*!\brief Assigns different quantization parameters to each super * block based on its TPL weight. * * \ingroup tpl_modelling * * \param[in] cpi Top level encoder instance structure * \param[in,out] td Thread data structure * \param[in,out] x Macro block level data for this block. * \param[in] tile_info Tile infromation / identification * \param[in] mi_row Block row (in "MI_SIZE" units) index * \param[in] mi_col Block column (in "MI_SIZE" units) index * \param[out] num_planes Number of image planes (e.g. Y,U,V) * * \remark No return value but updates macroblock and thread data * related to the q / q delta to be used. 
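 *
 * \note The per-superblock qindex chosen by the selected deltaq mode is
 * snapped to the frame's delta_q_res grid before its delta against the frame
 * base_qindex is stored in the macroblock. When delta-lf mode is enabled, a
 * matching loop-filter delta (roughly delta_qindex / 4, clamped to the legal
 * loop-filter range; e.g. a delta_qindex of -32 gives a loop-filter delta of
 * about -8) is also written into every mi unit of the superblock.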
*/ static inline void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, MACROBLOCK *const x, const TileInfo *const tile_info, int mi_row, int mi_col, int num_planes) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const DeltaQInfo *const delta_q_info = &cm->delta_q_info; assert(delta_q_info->delta_q_present_flag); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Delta-q modulation based on variance av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); const int delta_q_res = delta_q_info->delta_q_res; int current_qindex = cm->quant_params.base_qindex; if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) { const int sb_row = mi_row >> cm->seq_params->mib_size_log2; const int sb_col = mi_col >> cm->seq_params->mib_size_log2; const int sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); const int sb_index = sb_row * sb_cols + sb_col; current_qindex = cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index]; } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) { if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { const int block_wavelet_energy_level = av1_block_wavelet_energy_level(cpi, x, sb_size); x->sb_energy_level = block_wavelet_energy_level; current_qindex = av1_compute_q_from_energy_level_deltaq_mode( cpi, block_wavelet_energy_level); } else { const int block_var_level = av1_log_block_var(cpi, x, sb_size); x->sb_energy_level = block_var_level; current_qindex = av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level); } } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE && cpi->oxcf.algo_cfg.enable_tpl_model) { // Setup deltaq based on tpl stats current_qindex = av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col); } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) { current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col); } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) { current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col); } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) { current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col); } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_VARIANCE_BOOST) { current_qindex = av1_get_sbq_variance_boost(cpi, x); } x->rdmult_cur_qindex = current_qindex; MACROBLOCKD *const xd = &x->e_mbd; const int adjusted_qindex = av1_adjust_q_from_delta_q_res( delta_q_res, xd->current_base_qindex, current_qindex); if (cpi->use_ducky_encode) { assert(adjusted_qindex == current_qindex); } current_qindex = adjusted_qindex; x->delta_qindex = current_qindex - cm->quant_params.base_qindex; x->rdmult_delta_qindex = x->delta_qindex; av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); xd->mi[0]->current_qindex = current_qindex; av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0); // keep track of any non-zero delta-q used td->deltaq_used |= (x->delta_qindex != 0); if (cpi->oxcf.tool_cfg.enable_deltalf_mode) { const int delta_lf_res = delta_q_info->delta_lf_res; const int lfmask = ~(delta_lf_res - 1); const int delta_lf_from_base = ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); const int8_t delta_lf = (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; const int mib_size = cm->seq_params->mib_size; // pre-set the delta lf for loop filter. 
Note that this value is set // before mi is assigned for each block in current superblock for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; } } } } } static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCK *x = &td->mb; const int frame_idx = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; av1_zero(x->tpl_keep_ref_frame); if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return; if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; const int is_overlay = cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; if (is_overlay) { memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame)); return; } TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int tpl_stride = tpl_frame->stride; int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; const int step = 1 << block_mis_log2; const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mi_row_end = AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_col_end_sr = AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size], cm->superres_scale_denominator), mi_cols_sr); const int row_step = step; const int col_step_sr = coded_to_superres_mi(step, cm->superres_scale_denominator); for (int row = mi_row; row < mi_row_end; row += row_step) { for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 }; // Find the winner ref frame idx for the current block int64_t best_inter_cost = this_stats->pred_error[0]; int best_rf_idx = 0; for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) { if ((this_stats->pred_error[idx] < best_inter_cost) && (this_stats->pred_error[idx] != 0)) { best_inter_cost = this_stats->pred_error[idx]; best_rf_idx = idx; } } // tpl_pred_error is the pred_error reduction of best_ref w.r.t. // LAST_FRAME. 
tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] - this_stats->pred_error[LAST_FRAME - 1]; for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) inter_cost[rf_idx] += tpl_pred_error[rf_idx]; } } int rank_index[INTER_REFS_PER_FRAME - 1]; for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { rank_index[idx] = idx + 1; for (int i = idx; i > 0; --i) { if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) { const int tmp = rank_index[i - 1]; rank_index[i - 1] = rank_index[i]; rank_index[i] = tmp; } } } x->tpl_keep_ref_frame[INTRA_FRAME] = 1; x->tpl_keep_ref_frame[LAST_FRAME] = 1; int cutoff_ref = 0; for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1; if (idx > 2) { if (!cutoff_ref) { // If the predictive coding gains are smaller than the previous more // relevant frame over certain amount, discard this frame and all the // frames afterwards. if (llabs(inter_cost[rank_index[idx]]) < llabs(inter_cost[rank_index[idx - 1]]) / 8 || inter_cost[rank_index[idx]] == 0) cutoff_ref = 1; } if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0; } } } static inline void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col) { const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; const int orig_rdmult = cpi->rd.RDMULT; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int gf_group_index = cpi->gf_frame_index; if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ && cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { const int dr = av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult); x->rdmult = dr; } } #endif // !CONFIG_REALTIME_ONLY #if CONFIG_RT_ML_PARTITIONING // Get a prediction(stored in x->est_pred) for the whole superblock. 
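// For inter frames this builds a zero-mv, BILINEAR-filtered prediction of the
// whole 64x64 superblock from LAST_FRAME into x->est_pred (stride 64); for
// intra-only frames it simply fills est_pred with the mid-gray value for the
// current bit depth.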
static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile, MACROBLOCK *x, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; const int is_key_frame = frame_is_intra_only(cm); MACROBLOCKD *xd = &x->e_mbd; // TODO(kyslov) Extend to 128x128 assert(cm->seq_params->sb_size == BLOCK_64X64); av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); if (!is_key_frame) { MB_MODE_INFO *mi = xd->mi[0]; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); assert(yv12 != NULL); av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, get_ref_scale_factors(cm, LAST_FRAME), 1); mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE; mi->bsize = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); xd->plane[0].dst.buf = x->est_pred; xd->plane[0].dst.stride = 64; av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); } else { #if CONFIG_AV1_HIGHBITDEPTH switch (xd->bd) { case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; case 10: memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); break; case 12: memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); break; } #else memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); #endif // CONFIG_VP9_HIGHBITDEPTH } } #endif // CONFIG_RT_ML_PARTITIONING #define AVG_CDF_WEIGHT_LEFT 3 #define AVG_CDF_WEIGHT_TOP_RIGHT 1 /*!\brief Encode a superblock (minimal RD search involved) * * \ingroup partition_search * Encodes the superblock by a pre-determined partition pattern, only minor * rd-based searches are allowed to adjust the initial pattern. It is only used * by realtime encoding. */ static inline void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, const int mi_row, const int mi_col, const int seg_skip) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; const SPEED_FEATURES *const sf = &cpi->sf; const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; PC_TREE *const pc_root = td->pc_root; #if CONFIG_RT_ML_PARTITIONING if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) { RD_STATS dummy_rdc; get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root); return; } #endif // Set the partition if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && (!frame_is_intra_only(cm) && (!cpi->ppi->use_svc || !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; if (sf->rt_sf.use_fast_fixed_part && x->content_state_sb.source_sad_nonrd < kLowSad) { bsize_select = cm->seq_params->sb_size; } if (cpi->sf.rt_sf.skip_encoding_non_reference_slide_change && cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { bsize_select = cm->seq_params->sb_size; x->force_zeromv_skip_for_sb = 1; } const BLOCK_SIZE bsize = seg_skip ? 
sb_size : bsize_select; if (x->content_state_sb.source_sad_nonrd > kZeroSad) x->force_color_check_block_level = 1; av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { // set a variance-based partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); } assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || sf->part_sf.partition_search_type == VAR_BASED_PARTITION); set_cb_offsets(td->mb.cb_offset, 0, 0); // Initialize the flag to skip cdef to 1. if (sf->rt_sf.skip_cdef_sb) { const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1; // If 128x128 block is used, we need to set the flag for all 4 64x64 sub // "blocks". for (int r = 0; r < block64_in_sb; ++r) { for (int c = 0; c < block64_in_sb; ++c) { const int idx_in_sb = r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1; } } } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, nonrd_use_partition_time); #endif av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, pc_root); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, nonrd_use_partition_time); #endif } // This function initializes the stats for encode_rd_sb. static inline void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, const TileDataEnc *tile_data, SIMPLE_MOTION_DATA_TREE *sms_root, RD_STATS *rd_cost, int mi_row, int mi_col, int gather_tpl_data) { const AV1_COMMON *cm = &cpi->common; const TileInfo *tile_info = &tile_data->tile_info; MACROBLOCK *x = &td->mb; const SPEED_FEATURES *sf = &cpi->sf; const int use_simple_motion_search = (sf->part_sf.simple_motion_search_split || sf->part_sf.simple_motion_search_prune_rect || sf->part_sf.simple_motion_search_early_term_none || sf->part_sf.ml_early_term_after_part_split_level) && !frame_is_intra_only(cm); if (use_simple_motion_search) { av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root, mi_row, mi_col); } #if !CONFIG_REALTIME_ONLY if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && cpi->oxcf.gf_cfg.lag_in_frames == 0)) { init_ref_frame_space(cpi, td, mi_row, mi_col); x->sb_energy_level = 0; x->part_search_info.cnn_output_valid = 0; if (gather_tpl_data) { if (cm->delta_q_info.delta_q_present_flag) { const int num_planes = av1_num_planes(cm); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); } // TODO(jingning): revisit this function. 
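// The "&& (0)" below keeps this tpl-based rdmult adjustment compiled but
// never taken; see the TODO above.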
if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) { adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col); } } } #else (void)tile_info; (void)mi_row; (void)mi_col; (void)gather_tpl_data; #endif x->reuse_inter_pred = false; x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; reset_mb_rd_record(x->txfm_search_info.mb_rd_record); av1_zero(x->picked_ref_frames_mask); av1_invalid_rd_stats(rd_cost); } #if !CONFIG_REALTIME_ONLY static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td, const TileDataEnc *tile_data, SIMPLE_MOTION_DATA_TREE *sms_tree, RD_STATS *rd_cost, int mi_row, int mi_col, int delta_qp_ofs) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const TileInfo *tile_info = &tile_data->tile_info; const CommonModeInfoParams *const mi_params = &cm->mi_params; const DeltaQInfo *const delta_q_info = &cm->delta_q_info; assert(delta_q_info->delta_q_present_flag); const int delta_q_res = delta_q_info->delta_q_res; const SPEED_FEATURES *sf = &cpi->sf; const int use_simple_motion_search = (sf->part_sf.simple_motion_search_split || sf->part_sf.simple_motion_search_prune_rect || sf->part_sf.simple_motion_search_early_term_none || sf->part_sf.ml_early_term_after_part_split_level) && !frame_is_intra_only(cm); if (use_simple_motion_search) { av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree, mi_row, mi_col); } int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs; MACROBLOCKD *const xd = &x->e_mbd; current_qindex = av1_adjust_q_from_delta_q_res( delta_q_res, xd->current_base_qindex, current_qindex); x->delta_qindex = current_qindex - cm->quant_params.base_qindex; av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); xd->mi[0]->current_qindex = current_qindex; av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0); // keep track of any non-zero delta-q used td->deltaq_used |= (x->delta_qindex != 0); if (cpi->oxcf.tool_cfg.enable_deltalf_mode) { const int delta_lf_res = delta_q_info->delta_lf_res; const int lfmask = ~(delta_lf_res - 1); const int delta_lf_from_base = ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); const int8_t delta_lf = (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; const int mib_size = cm->seq_params->mib_size; // pre-set the delta lf for loop filter. 
Note that this value is set // before mi is assigned for each block in current superblock for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; } } } } x->reuse_inter_pred = false; x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; reset_mb_rd_record(x->txfm_search_info.mb_rd_record); av1_zero(x->picked_ref_frames_mask); av1_invalid_rd_stats(rd_cost); } static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, SIMPLE_MOTION_DATA_TREE *sms_tree, SB_FIRST_PASS_STATS *sb_org_stats) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; RD_STATS rdc_winner, cur_rdc; av1_invalid_rd_stats(&rdc_winner); int best_qindex = td->mb.rdmult_delta_qindex; const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12; const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12; const int step = cm->delta_q_info.delta_q_res; for (int sweep_qp_delta = start; sweep_qp_delta <= end; sweep_qp_delta += step) { sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row, mi_col, sweep_qp_delta); const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); const int backup_current_qindex = cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col); av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col); cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex; td->pc_root = av1_alloc_pc_tree_node(bsize); if (!td->pc_root) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL, SB_DRY_PASS, NULL); if ((rdc_winner.rdcost > cur_rdc.rdcost) || (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) && rdc_winner.rdcost == cur_rdc.rdcost)) { rdc_winner = cur_rdc; best_qindex = x->rdmult_delta_qindex + sweep_qp_delta; } } return best_qindex; } #endif //! CONFIG_REALTIME_ONLY /*!\brief Encode a superblock (RD-search-based) * * \ingroup partition_search * Conducts partition search for a superblock, based on rate-distortion costs, * from scratch or adjusting from a pre-calculated partition pattern. 
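 *
 * Depending on the partition search type, this either runs
 * av1_rd_use_partition() over a variance-based or fixed partitioning, or a
 * full recursive av1_rd_pick_partition() search. The recursive search
 * normally uses a single pass (SB_SINGLE_PASS); the sb_multipass unit test
 * instead runs a dry pass followed by a wet pass that restores the superblock
 * state captured after the first pass.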
*/ static inline void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, const int mi_row, const int mi_col, const int seg_skip) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const SPEED_FEATURES *const sf = &cpi->sf; const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int num_planes = av1_num_planes(cm); int dummy_rate; int64_t dummy_dist; RD_STATS dummy_rdc; SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root; #if CONFIG_REALTIME_ONLY (void)seg_skip; #endif // CONFIG_REALTIME_ONLY init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col, 1); // Encode the superblock if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { // partition search starting from a variance-based partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_use_partition_time); #endif td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, &dummy_rate, &dummy_dist, 1, td->pc_root); av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, sf->part_sf.partition_search_type); td->pc_root = NULL; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rd_use_partition_time); #endif } #if !CONFIG_REALTIME_ONLY else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { // partition search by adjusting a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); const BLOCK_SIZE bsize = seg_skip ? sb_size : sf->part_sf.fixed_partition_size; av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, &dummy_rate, &dummy_dist, 1, td->pc_root); av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, sf->part_sf.partition_search_type); td->pc_root = NULL; } else { // The most exhaustive recursive partition search SuperBlockEnc *sb_enc = &x->sb_enc; // No stats for overlay frames. Exclude key frame. av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc); // Reset the tree for simple motion search data av1_reset_simple_motion_tree_partition(sms_root, sb_size); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_pick_partition_time); #endif // Estimate the maximum square partition block size, which will be used // as the starting block size for partitioning the sb set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col); // The superblock can be searched only once, or twice consecutively for // better quality. Note that the meaning of passes here is different from // the general concept of 1-pass/2-pass encoders. const int num_passes = cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 
2 : 1; if (cpi->oxcf.sb_qp_sweep && !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && cpi->oxcf.gf_cfg.lag_in_frames == 0) && cm->delta_q_info.delta_q_present_flag) { AOM_CHECK_MEM_ERROR( x->e_mbd.error_info, td->mb.sb_stats_cache, (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache))); av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, mi_col); assert(x->rdmult_delta_qindex == x->delta_qindex); const int best_qp_diff = sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root, td->mb.sb_stats_cache) - x->rdmult_delta_qindex; sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col, best_qp_diff); const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); const int backup_current_qindex = cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, mi_col); cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex; aom_free(td->mb.sb_stats_cache); td->mb.sb_stats_cache = NULL; } if (num_passes == 1) { #if CONFIG_PARTITION_SEARCH_ORDER if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) { av1_reset_part_sf(&cpi->sf.part_sf); av1_reset_sf_for_ext_part(cpi); RD_STATS this_rdc; av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, mi_col, sb_size, &this_rdc); } else { td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, SB_SINGLE_PASS, NULL); } #else td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, SB_SINGLE_PASS, NULL); #endif // CONFIG_PARTITION_SEARCH_ORDER } else { // First pass AOM_CHECK_MEM_ERROR( x->e_mbd.error_info, td->mb.sb_fp_stats, (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats))); av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, SB_DRY_PASS, NULL); // Second pass init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col, 0); av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); av1_reset_simple_motion_tree_partition(sms_root, sb_size); av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); td->pc_root = av1_alloc_pc_tree_node(sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, SB_WET_PASS, NULL); aom_free(td->mb.sb_fp_stats); td->mb.sb_fp_stats = NULL; } // Reset to 0 so that it wouldn't be used elsewhere mistakenly. 
sb_enc->tpl_data_count = 0; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rd_pick_partition_time); #endif } #endif // !CONFIG_REALTIME_ONLY // Update the inter rd model // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && cm->tiles.cols == 1 && cm->tiles.rows == 1) { av1_inter_mode_data_fit(tile_data, x->rdmult); } } // Check if the cost update of symbols mode, coeff and dv are tile or off. static inline int is_mode_coeff_dv_upd_freq_tile_or_off( const AV1_COMP *const cpi) { const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE && inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE && cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE); } // When row-mt is enabled and cost update frequencies are set to off/tile, // processing of current SB can start even before processing of top-right SB // is finished. This function checks if it is sufficient to wait for top SB // to finish processing before current SB starts processing. static inline int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) { const MODE mode = cpi->oxcf.mode; if (mode == GOOD) return 0; if (mode == ALLINTRA) return is_mode_coeff_dv_upd_freq_tile_or_off(cpi); else if (mode == REALTIME) return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) && cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE); else return 0; } /*!\brief Calculate source SAD at superblock level using 64x64 block source SAD * * \ingroup partition_search * \callgraph * \callergraph */ static inline uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, int mi_col) { if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX; const AV1_COMMON *const cm = &cpi->common; const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128) ? (cm->seq_params->mib_size >> 1) : cm->seq_params->mib_size; const int num_blk_64x64_cols = (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis; const int num_blk_64x64_rows = (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis; const int blk_64x64_col_index = mi_col / blk_64x64_in_mis; const int blk_64x64_row_index = mi_row / blk_64x64_in_mis; uint64_t curr_sb_sad = UINT64_MAX; // Avoid the border as sad_blk_64x64 may not be set for the border // in the scene detection. 
if ((blk_64x64_row_index >= num_blk_64x64_rows - 1) || (blk_64x64_col_index >= num_blk_64x64_cols - 1)) { return curr_sb_sad; } const uint64_t *const src_sad_blk_64x64_data = &cpi->src_sad_blk_64x64[blk_64x64_col_index + blk_64x64_row_index * num_blk_64x64_cols]; if (cm->seq_params->sb_size == BLOCK_128X128) { // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the // superblock curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] + src_sad_blk_64x64_data[num_blk_64x64_cols] + src_sad_blk_64x64_data[num_blk_64x64_cols + 1]; } else if (cm->seq_params->sb_size == BLOCK_64X64) { curr_sb_sad = src_sad_blk_64x64_data[0]; } return curr_sb_sad; } /*!\brief Determine whether grading content can be skipped based on sad stat * * \ingroup partition_search * \callgraph * \callergraph */ static inline bool is_calc_src_content_needed(AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col) { if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) return true; const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col); if (curr_sb_sad == UINT64_MAX) return true; if (curr_sb_sad == 0) { x->content_state_sb.source_sad_nonrd = kZeroSad; return false; } AV1_COMMON *const cm = &cpi->common; bool do_calc_src_content = true; if (cpi->oxcf.speed < 9) return do_calc_src_content; // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size. if (AOMMIN(cm->width, cm->height) < 360) { // Derive Average 64x64 block source SAD from SB source SAD const uint64_t avg_64x64_blk_sad = (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2) : curr_sb_sad; // The threshold is determined based on kLowSad and kHighSad threshold and // test results. uint64_t thresh_low = 15000; uint64_t thresh_high = 40000; if (cpi->sf.rt_sf.increase_source_sad_thresh) { thresh_low = thresh_low << 1; thresh_high = thresh_high << 1; } if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) { do_calc_src_content = false; // Note: set x->content_state_sb.source_sad_rd as well if this is extended // to RTC rd path. 
x->content_state_sb.source_sad_nonrd = kMedSad; } } return do_calc_src_content; } /*!\brief Determine whether grading content is needed based on sf and frame stat * * \ingroup partition_search * \callgraph * \callergraph */ // TODO(any): consolidate sfs to make interface cleaner static inline void grade_source_content_sb(AV1_COMP *cpi, MACROBLOCK *const x, TileDataEnc *tile_data, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; if (cm->current_frame.frame_type == KEY_FRAME || (cpi->ppi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { assert(x->content_state_sb.source_sad_nonrd == kMedSad); assert(x->content_state_sb.source_sad_rd == kMedSad); return; } bool calc_src_content = false; if (cpi->sf.rt_sf.source_metrics_sb_nonrd) { if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) { calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col); } else { x->content_state_sb.source_sad_nonrd = kZeroSad; } } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) && (cm->width * cm->height <= 352 * 288)) { if (cpi->rc.frame_source_sad > 0) calc_src_content = true; else x->content_state_sb.source_sad_rd = kZeroSad; } if (calc_src_content) av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col); } /*!\brief Encode a superblock row by breaking it into superblocks * * \ingroup partition_search * \callgraph * \callergraph * Do partition and mode search for an sb row: one row of superblocks filling up * the width of the current tile. */ static inline void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TokenExtra **tp) { AV1_COMMON *const cm = &cpi->common; const TileInfo *const tile_info = &tile_data->tile_info; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; bool row_mt_enabled = mt_info->row_mt_enabled; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mib_size = cm->seq_params->mib_size; const int mib_size_log2 = cm->seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, encode_sb_row_time); #endif // Initialize the left context for the new SB row av1_zero_left_context(xd); // Reset delta for quantizer and loof filters at the beginning of every tile if (mi_row == tile_info->mi_row_start || row_mt_enabled) { if (cm->delta_q_info.delta_q_present_flag) xd->current_base_qindex = cm->quant_params.base_qindex; if (cm->delta_q_info.delta_lf_present_flag) { av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); } } reset_thresh_freq_fact(x); // Code each SB in the row for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0; mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) { // In realtime/allintra mode and when frequency of cost updates is off/tile, // wait for the top superblock to finish encoding. Otherwise, wait for the // top-right superblock to finish encoding. 
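// delay_wait_for_top_right_sb() returns 1 when waiting on the top superblock
// alone is sufficient, which lowers the column index passed to the row-mt
// sync read by one superblock; otherwise the read also covers the top-right
// dependency.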
enc_row_mt->sync_read_ptr( row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi)); #if CONFIG_MULTITHREAD if (row_mt_enabled) { pthread_mutex_lock(enc_row_mt->mutex_); const bool row_mt_exit = enc_row_mt->row_mt_exit; pthread_mutex_unlock(enc_row_mt->mutex_); // Exit in case any worker has encountered an error. if (row_mt_exit) return; } #endif const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled; if (update_cdf && (tile_info->mi_row_start != mi_row)) { if ((tile_info->mi_col_start == mi_col)) { // restore frame context at the 1st column sb memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); } else { // update context int wt_left = AVG_CDF_WEIGHT_LEFT; int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; if (tile_info->mi_col_end > (mi_col + mib_size)) av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left, wt_tr); else av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, wt_left, wt_tr); } } // Update the rate cost tables for some symbols av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); // Reset color coding related parameters av1_zero(x->color_sensitivity_sb); av1_zero(x->color_sensitivity_sb_g); av1_zero(x->color_sensitivity_sb_alt); av1_zero(x->color_sensitivity); x->content_state_sb.source_sad_nonrd = kMedSad; x->content_state_sb.source_sad_rd = kMedSad; x->content_state_sb.lighting_change = 0; x->content_state_sb.low_sumdiff = 0; x->force_zeromv_skip_for_sb = 0; x->sb_me_block = 0; x->sb_me_partition = 0; x->sb_me_mv.as_int = 0; x->sb_force_fixed_part = 1; x->color_palette_thresh = 64; x->force_color_check_block_level = 0; x->nonrd_prune_ref_frame_search = cpi->sf.rt_sf.nonrd_prune_ref_frame_search; if (cpi->oxcf.mode == ALLINTRA) { x->intra_sb_rdmult_modifier = 128; } xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv; x->source_variance = UINT_MAX; td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); // Get segment id and skip flag const struct segmentation *const seg = &cm->seg; int seg_skip = 0; if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; const uint8_t segment_id = map ? 
get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col) : 0; seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col); init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks, sb_size); // Grade the temporal variation of the sb, the grade will be used to decide // fast mode search strategy for coding blocks if (!seg_skip) grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); // encode the superblock if (use_nonrd_mode) { encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); } else { encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); } // Update the top-right context in row_mt coding if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) { if (sb_cols_in_tile == 1) memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); else if (sb_col_in_tile >= 1) memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, sizeof(*xd->tile_ctx)); } enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile, sb_cols_in_tile); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, encode_sb_row_time); #endif } static inline void init_encode_frame_mb_context(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; // Copy data over into macro block data structures. av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, cm->seq_params->sb_size); av1_setup_block_planes(xd, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, num_planes); } void av1_alloc_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; av1_row_mt_mem_dealloc(cpi); aom_free(cpi->tile_data); cpi->allocated_tiles = 0; enc_row_mt->allocated_tile_cols = 0; enc_row_mt->allocated_tile_rows = 0; CHECK_MEM_ERROR( cm, cpi->tile_data, aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; enc_row_mt->allocated_tile_cols = tile_cols; enc_row_mt->allocated_tile_rows = tile_rows; for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { const int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; av1_zero(this_tile->row_mt_sync); this_tile->row_ctx = NULL; } } } void av1_init_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int tile_col, tile_row; TokenInfo *const token_info = &cpi->token_info; TokenExtra *pre_tok = token_info->tile_tok[0][0]; TokenList *tplist = token_info->tplist[0][0]; unsigned int tile_tok = 0; int tplist_count = 0; if (!is_stat_generation_stage(cpi) && cm->features.allow_screen_content_tools) { // Number of tokens for which token info needs to be allocated. unsigned int tokens_required = get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, MAX_SB_SIZE_LOG2, num_planes); // Allocate/reallocate memory for token related info if the number of tokens // required is more than the number of tokens already allocated. 
This could // occur in case of the following: // 1) If the memory is not yet allocated // 2) If the frame dimensions have changed const bool realloc_tokens = tokens_required > token_info->tokens_allocated; if (realloc_tokens) { free_token_info(token_info); alloc_token_info(cm, token_info, tokens_required); pre_tok = token_info->tile_tok[0][0]; tplist = token_info->tplist[0][0]; } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; TileInfo *const tile_info = &tile_data->tile_info; av1_tile_init(tile_info, cm, tile_row, tile_col); tile_data->firstpass_top_mv = kZeroMv; tile_data->abs_sum_level = 0; if (is_token_info_allocated(token_info)) { token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = token_info->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens( tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); token_info->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = token_info->tplist[tile_row][tile_col]; tplist_count = av1_get_sb_rows_in_tile(cm, tile_info); } tile_data->allow_update_cdf = !cm->tiles.large_scale; tile_data->allow_update_cdf = tile_data->allow_update_cdf && !cm->features.disable_cdf_update && !delay_wait_for_top_right_sb(cpi); tile_data->tctx = *cm->fc; } } } // Populate the start palette token info prior to encoding an SB row. static inline void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, int tile_row, int tile_col, int mi_row, TokenExtra **tp) { const TokenInfo *token_info = &cpi->token_info; if (!is_token_info_allocated(token_info)) return; const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col]; const int sb_row_in_tile = (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; get_start_tok(cpi, tile_row, tile_col, mi_row, tp, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); assert(tplist != NULL); tplist[sb_row_in_tile].start = *tp; } // Populate the token count after encoding an SB row. 
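// The count is the number of TokenExtra entries written between the start
// pointer recorded by get_token_start() and the final token pointer, and it
// is asserted to stay within the per-row budget from get_token_alloc().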
static inline void populate_token_count(AV1_COMP *cpi, const TileInfo *tile_info, int tile_row, int tile_col, int mi_row, TokenExtra *tok) { const TokenInfo *token_info = &cpi->token_info; if (!is_token_info_allocated(token_info)) return; const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); TokenList *const tplist = token_info->tplist[tile_row][tile_col]; const int sb_row_in_tile = (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; const int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; const int num_mb_rows_in_sb = ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; tplist[sb_row_in_tile].count = (unsigned int)(tok - tplist[sb_row_in_tile].start); assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <= get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes)); (void)num_planes; (void)tile_mb_cols; (void)num_mb_rows_in_sb; } /*!\brief Encode a superblock row * * \ingroup partition_search */ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, int tile_col, int mi_row) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; TokenExtra *tok = NULL; get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok); encode_sb_row(cpi, td, this_tile, mi_row, &tok); populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok); } /*!\brief Encode a tile * * \ingroup partition_search */ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile); av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, tile_info->mi_col_end, tile_row); av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, &td->mb.e_mbd); #if !CONFIG_REALTIME_ONLY if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); #endif if (td->mb.txfm_search_info.mb_rd_record != NULL) { av1_crc32c_calculator_init( &td->mb.txfm_search_info.mb_rd_record->crc_calculator); } for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += cm->seq_params->mib_size) { av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } this_tile->abs_sum_level = td->abs_sum_level; } /*!\brief Break one frame into tiles and encode the tiles * * \ingroup partition_search * * \param[in] cpi Top-level encoder structure */ static inline void encode_tiles(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int tile_col, tile_row; MACROBLOCK *const mb = &cpi->td.mb; assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < tile_cols * tile_rows)); if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); av1_init_tile_data(cpi); av1_alloc_mb_data(cpi, mb); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; cpi->td.intrabc_used = 0; cpi->td.deltaq_used = 0; cpi->td.abs_sum_level = 0; cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0; 
cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; cpi->td.mb.tile_pb_ctx = &this_tile->tctx; av1_init_rtc_counters(&cpi->td.mb); cpi->td.mb.palette_pixels = 0; av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); if (!frame_is_intra_only(&cpi->common)) av1_accumulate_rtc_counters(cpi, &cpi->td.mb); cpi->palette_pixel_num += cpi->td.mb.palette_pixels; cpi->intrabc_used |= cpi->td.intrabc_used; cpi->deltaq_used |= cpi->td.deltaq_used; } } av1_dealloc_mb_data(mb, av1_num_planes(cm)); } // Set the relative distance of a reference frame w.r.t. current frame static inline void set_rel_frame_dist( const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info, const int ref_frame_flags) { MV_REFERENCE_FRAME ref_frame; int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX; ref_frame_dist_info->nearest_past_ref = NONE_FRAME; ref_frame_dist_info->nearest_future_ref = NONE_FRAME; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0; if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { int dist = av1_encoder_get_relative_dist( cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME], cm->current_frame.display_order_hint); ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist; // Get the nearest ref_frame in the past if (abs(dist) < min_past_dist && dist < 0) { ref_frame_dist_info->nearest_past_ref = ref_frame; min_past_dist = abs(dist); } // Get the nearest ref_frame in the future if (dist < min_future_dist && dist > 0) { ref_frame_dist_info->nearest_future_ref = ref_frame; min_future_dist = dist; } } } } static inline int refs_are_one_sided(const AV1_COMMON *cm) { assert(!frame_is_intra_only(cm)); int one_sided_refs = 1; const int cur_display_order_hint = cm->current_frame.display_order_hint; for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); if (buf == NULL) continue; if (av1_encoder_get_relative_dist(buf->display_order_hint, cur_display_order_hint) > 0) { one_sided_refs = 0; // bwd reference break; } } return one_sided_refs; } static inline void get_skip_mode_ref_offsets(const AV1_COMMON *cm, int ref_order_hint[2]) { const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; ref_order_hint[0] = ref_order_hint[1] = 0; if (!skip_mode_info->skip_mode_allowed) return; const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); assert(buf_0 != NULL && buf_1 != NULL); ref_order_hint[0] = buf_0->order_hint; ref_order_hint[1] = buf_1->order_hint; } static int check_skip_mode_enabled(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; av1_setup_skip_mode_allowed(cm); if (!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0; // Turn off skip mode if the temporal distances of the reference pair to the // current frame are different by more than 1 frame. const int cur_offset = (int)cm->current_frame.order_hint; int ref_offset[2]; get_skip_mode_ref_offsets(cm, ref_offset); const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_offset, ref_offset[0]); const int cur_to_ref1 = abs(get_relative_dist( &cm->seq_params->order_hint_info, cur_offset, ref_offset[1])); if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; // High Latency: Turn off skip mode if all refs are fwd. 
if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0; const int ref_frame[2] = { cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME }; if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) || !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) return 0; return 1; } static inline void set_default_interp_skip_flags( const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { const int num_planes = av1_num_planes(cm); interp_search_flags->default_interp_skip_flags = (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA : INTERP_SKIP_LUMA_SKIP_CHROMA; } static inline void setup_prune_ref_frame_mask(AV1_COMP *cpi) { if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || cpi->sf.inter_sf.disable_onesided_comp) && cpi->all_one_sided_refs) { // Disable all compound references cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES); } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode && cpi->sf.inter_sf.selective_ref_frame >= 2) { AV1_COMMON *const cm = &cpi->common; const int cur_frame_display_order_hint = cm->current_frame.display_order_hint; unsigned int *ref_display_order_hint = cm->cur_frame->ref_display_order_hint; const int arf2_dist = av1_encoder_get_relative_dist( ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME], cur_frame_display_order_hint); const int bwd_dist = av1_encoder_get_relative_dist( ref_display_order_hint[BWDREF_FRAME - LAST_FRAME], cur_frame_display_order_hint); for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) { MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_idx); if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) || !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) { continue; } if (!cpi->all_one_sided_refs) { int ref_dist[2]; for (int i = 0; i < 2; ++i) { ref_dist[i] = av1_encoder_get_relative_dist( ref_display_order_hint[rf[i] - LAST_FRAME], cur_frame_display_order_hint); } // One-sided compound is used only when all reference frames are // one-sided. if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) { cpi->prune_ref_frame_mask |= 1 << ref_idx; } } if (cpi->sf.inter_sf.selective_ref_frame >= 4 && (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) && (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. 
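// A positive relative distance means the reference follows the current // frame in display order; bwd_dist <= arf2_dist means BWDREF_FRAME is at // least as close to the current frame as ALTREF2_FRAME.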
if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) { // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer // reference to the current frame than ALTREF2_FRAME cpi->prune_ref_frame_mask |= 1 << ref_idx; } } } } } static int allow_deltaq_mode(AV1_COMP *cpi) { #if !CONFIG_REALTIME_ONLY AV1_COMMON *const cm = &cpi->common; BLOCK_SIZE sb_size = cm->seq_params->sb_size; int sbs_wide = mi_size_wide[sb_size]; int sbs_high = mi_size_high[sb_size]; int64_t delta_rdcost = 0; for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) { for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) { int64_t this_delta_rdcost = 0; av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size, mi_row, mi_col); delta_rdcost += this_delta_rdcost; } } return delta_rdcost < 0; #else (void)cpi; return 1; #endif // !CONFIG_REALTIME_ONLY } #define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000 #define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4 // Populates block level thresholds for force zeromv-skip decision static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) { if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return; // Threshold for forcing zeromv-skip decision is as below: // For 128x128 blocks, threshold is 10000 and per pixel threshold is 0.6103. // For 64x64 blocks, threshold is 5000 and per pixel threshold is 1.221 // allowing slightly higher error for smaller blocks. // Per Pixel Threshold of 64x64 block Area of 64x64 block 1 1 // ------------------------------------=sqrt(---------------------)=sqrt(-)=- // Per Pixel Threshold of 128x128 block Area of 128x128 block 4 2 // Thus, per pixel thresholds for blocks of size 32x32, 16x16,... can be // chosen as 2.442, 4.884,.... As the per pixel error tends to be higher for // small blocks, the same is clipped to 4. const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF; const int num_128x128_pix = block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128]; for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) { const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize]; // Calculate the threshold for zeromv-skip decision based on area of the // partition unsigned int thresh_exit_part_blk = (unsigned int)(thresh_exit_128x128_part * sqrt((double)num_block_pix / num_128x128_pix) + 0.5); thresh_exit_part_blk = AOMMIN( thresh_exit_part_blk, (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix)); cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk; } } static void free_block_hash_buffers(uint32_t *block_hash_values[2][2], int8_t *is_block_same[2][3]) { for (int k = 0; k < 2; ++k) { for (int j = 0; j < 2; ++j) { aom_free(block_hash_values[k][j]); } for (int j = 0; j < 3; ++j) { aom_free(is_block_same[k][j]); } } } /*!\brief Determines delta_q_res value for Variance Boost modulation. */ static int aom_get_variance_boost_delta_q_res(int qindex) { // Signaling delta_q changes across superblocks comes with inherent syntax // element overhead, which adds up to total payload size. This overhead // becomes proportionally bigger the higher the base qindex (i.e. lower // quality, smaller file size), so a balance needs to be struck. // - Smaller delta_q_res: more granular delta_q control, more bits spent // signaling deltas. // - Larger delta_q_res: coarser delta_q control, less bits spent signaling // deltas. 
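// (delta_q_res is the step size, in qindex units, used when coding the // per-superblock q offsets; with the thresholds below a base qindex of 60 // maps to a step of 1, 100 to 2, 140 to 4 and 200 to 8.)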
// // At the same time, SB qindex fluctuations become larger the higher // the base qindex (between lowest and highest-variance regions): // - For QP 5: up to 8 qindexes // - For QP 60: up to 52 qindexes // // With these factors in mind, it was found that the best strategy that // maximizes quality per bitrate is by having very finely-grained delta_q // values for the lowest picture qindexes (to preserve tiny qindex SB deltas), // and progressively making them coarser as base qindex increases (to reduce // total signaling overhead). int delta_q_res = 1; if (qindex >= 160) { delta_q_res = 8; } else if (qindex >= 120) { delta_q_res = 4; } else if (qindex >= 80) { delta_q_res = 2; } else { delta_q_res = 1; } return delta_q_res; } /*!\brief Encoder setup(only for the current frame), encoding, and recontruction * for a single frame * * \ingroup high_level_algo */ static inline void encode_frame_internal(AV1_COMP *cpi) { ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; AV1_COMMON *const cm = &cpi->common; CommonModeInfoParams *const mi_params = &cm->mi_params; FeatureFlags *const features = &cm->features; MACROBLOCKD *const xd = &x->e_mbd; RD_COUNTS *const rdc = &cpi->td.rd_counts; #if CONFIG_FPMT_TEST FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; FrameProbInfo *const temp_frame_probs_simulation = &cpi->ppi->temp_frame_probs_simulation; #endif FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode; int i; if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { mi_params->setup_mi(mi_params); } set_mi_offsets(mi_params, xd, 0, 0); av1_zero(*td->counts); av1_zero(rdc->tx_type_used); av1_zero(rdc->obmc_used); av1_zero(rdc->warped_used); av1_zero(rdc->seg_tmp_pred_cost); // Reset the flag. cpi->intrabc_used = 0; // Need to disable intrabc when superres is selected if (av1_superres_scaled(cm)) { features->allow_intrabc = 0; } features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc); if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int warped_probability = #if CONFIG_FPMT_TEST cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? temp_frame_probs->warped_probs[update_type] : #endif // CONFIG_FPMT_TEST frame_probs->warped_probs[update_type]; if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh) features->allow_warped_motion = 0; } int hash_table_created = 0; if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) && !cpi->sf.rt_sf.use_nonrd_pick_mode) { // TODO(any): move this outside of the recoding loop to avoid recalculating // the hash table. 
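// The hash table built below maps block content hashes to positions in the // source frame so that intra block copy search can look up exact matches // directly. Starting from the 2x2 hashes, each pass derives the hash of a // block from the hashes of its four half-size sub-blocks, so for a 128x128 // superblock hashes are produced for 4, 8, 16, 32, 64 and 128 pixel squares.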
// add to hash table const int pic_width = cpi->source->y_crop_width; const int pic_height = cpi->source->y_crop_height; uint32_t *block_hash_values[2][2] = { { NULL } }; int8_t *is_block_same[2][3] = { { NULL } }; int k, j; bool error = false; for (k = 0; k < 2 && !error; ++k) { for (j = 0; j < 2; ++j) { block_hash_values[k][j] = (uint32_t *)aom_malloc( sizeof(*block_hash_values[0][0]) * pic_width * pic_height); if (!block_hash_values[k][j]) { error = true; break; } } for (j = 0; j < 3 && !error; ++j) { is_block_same[k][j] = (int8_t *)aom_malloc( sizeof(*is_block_same[0][0]) * pic_width * pic_height); if (!is_block_same[k][j]) error = true; } } av1_hash_table_init(intrabc_hash_info); if (error || !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) { free_block_hash_buffers(block_hash_values, is_block_same); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating intrabc_hash_table and buffers"); } hash_table_created = 1; av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source, block_hash_values[0], is_block_same[0]); // Hash data generated for screen contents is used for intraBC ME const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; const int max_sb_size = (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)); int src_idx = 0; for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { const int dst_idx = !src_idx; av1_generate_block_hash_value( intrabc_hash_info, cpi->source, size, block_hash_values[src_idx], block_hash_values[dst_idx], is_block_same[src_idx], is_block_same[dst_idx]); if (size >= min_alloc_size) { if (!av1_add_to_hash_map_by_row_with_precal_data( &intrabc_hash_info->intrabc_hash_table, block_hash_values[dst_idx], is_block_same[dst_idx][2], pic_width, pic_height, size)) { error = true; break; } } } free_block_hash_buffers(block_hash_values, is_block_same); if (error) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error adding data to intrabc_hash_table"); } } const CommonQuantParams *quant_params = &cm->quant_params; for (i = 0; i < MAX_SEGMENTS; ++i) { const int qindex = cm->seg.enabled ? 
av1_get_qindex(&cm->seg, i, quant_params->base_qindex) : quant_params->base_qindex; xd->lossless[i] = qindex == 0 && quant_params->y_dc_delta_q == 0 && quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1; xd->qindex[i] = qindex; if (xd->lossless[i]) { cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT; } else { cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients; } } features->coded_lossless = is_coded_lossless(cm, xd); features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); // Fix delta q resolution for the moment cm->delta_q_info.delta_q_res = 0; if (cpi->use_ducky_encode) { cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE; } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) { if (deltaq_mode == DELTA_Q_OBJECTIVE) cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE; else if (deltaq_mode == DELTA_Q_PERCEPTUAL) cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI) cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; else if (deltaq_mode == DELTA_Q_USER_RATING_BASED) cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; else if (deltaq_mode == DELTA_Q_HDR) cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; else if (deltaq_mode == DELTA_Q_VARIANCE_BOOST) cm->delta_q_info.delta_q_res = aom_get_variance_boost_delta_q_res(quant_params->base_qindex); // Set delta_q_present_flag before it is used for the first time cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES; cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q; // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q // is used for ineligible frames. That effectively will turn off row_mt // usage. Note objective delta_q and tpl eligible frames are only altref // frames currently. 
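// (In the check below, LF_UPDATE frames, i.e. regular leaf frames of the // golden-frame group, are the ones treated as ineligible.)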
const GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cm->delta_q_info.delta_q_present_flag) { if (deltaq_mode == DELTA_Q_OBJECTIVE && gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE) cm->delta_q_info.delta_q_present_flag = 0; if (deltaq_mode == DELTA_Q_OBJECTIVE && cm->delta_q_info.delta_q_present_flag) { cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi); } } // Reset delta_q_used flag cpi->deltaq_used = 0; cm->delta_q_info.delta_lf_present_flag = cm->delta_q_info.delta_q_present_flag && oxcf->tool_cfg.enable_deltalf_mode; cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI; // update delta_q_present_flag and delta_lf_present_flag based on // base_qindex cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0; cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0; } else if (cpi->cyclic_refresh->apply_cyclic_refresh || cpi->svc.number_temporal_layers == 1) { cpi->cyclic_refresh->actual_num_seg1_blocks = 0; cpi->cyclic_refresh->actual_num_seg2_blocks = 0; } cpi->rc.cnt_zeromv = 0; av1_frame_init_quantizer(cpi); init_encode_frame_mb_context(cpi); set_default_interp_skip_flags(cm, &cpi->interp_search_flags); if (cm->prev_frame && cm->prev_frame->seg.enabled) cm->last_frame_seg_map = cm->prev_frame->seg_map; else cm->last_frame_seg_map = NULL; if (features->allow_intrabc || features->coded_lossless) { av1_set_default_ref_deltas(cm->lf.ref_deltas); av1_set_default_mode_deltas(cm->lf.mode_deltas); } else if (cm->prev_frame) { memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); } memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES); memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); cpi->all_one_sided_refs = frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm); cpi->prune_ref_frame_mask = 0; // Figure out which ref frames can be skipped at frame level. setup_prune_ref_frame_mask(cpi); x->txfm_search_info.txb_split_count = 0; #if CONFIG_SPEED_STATS x->txfm_search_info.tx_search_count = 0; #endif // CONFIG_SPEED_STATS #if !CONFIG_REALTIME_ONLY #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_compute_global_motion_time); #endif av1_compute_global_motion_facade(cpi); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_compute_global_motion_time); #endif #endif // !CONFIG_REALTIME_ONLY #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_setup_motion_field_time); #endif av1_calculate_ref_frame_side(cm); if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_setup_motion_field_time); #endif cm->current_frame.skip_mode_info.skip_mode_flag = check_skip_mode_enabled(cpi); // Initialization of skip mode cost depends on the value of // 'skip_mode_flag'. This initialization happens in the function // av1_fill_mode_rates(), which is in turn called in // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts() // has to be called after 'skip_mode_flag' is initialized. 
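// After the RD constants, the SAD-per-bit value and the zeromv-skip // thresholds are derived for this frame, and the encoder then chooses // between the row-multithreaded, tile-multithreaded and single-threaded // encoding paths below.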
av1_initialize_rd_consts(cpi); av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex); populate_thresh_to_force_zeromv_skip(cpi); enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; mt_info->row_mt_enabled = 0; mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS], cm->tiles.cols * cm->tiles.rows) > 1; if (oxcf->row_mt && (mt_info->num_workers > 1)) { mt_info->row_mt_enabled = 1; enc_row_mt->sync_read_ptr = av1_row_mt_sync_read; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write; av1_encode_tiles_row_mt(cpi); } else { if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) { av1_encode_tiles_mt(cpi); } else { // Preallocate the pc_tree for realtime coding to reduce the cost of // memory allocation. const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; if (use_nonrd_mode) { td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); } else { td->pc_root = NULL; } encode_tiles(cpi); av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; } } // If intrabc is allowed but never selected, reset the allow_intrabc flag. if (features->allow_intrabc && !cpi->intrabc_used) { features->allow_intrabc = 0; } if (features->allow_intrabc) { cm->delta_q_info.delta_lf_present_flag = 0; } if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { cm->delta_q_info.delta_q_present_flag = 0; } // Set the transform size appropriately before bitstream creation const MODE_EVAL_TYPE eval_type = cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch ? WINNER_MODE_EVAL : DEFAULT_EVAL; const TX_SIZE_SEARCH_METHOD tx_search_type = cpi->winner_mode_params.tx_size_search_methods[eval_type]; assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL); features->tx_mode = select_tx_mode(cm, tx_search_type); // Retain the frame level probability update conditions for parallel frames. // These conditions will be consumed during postencode stage to update the // probability. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] = cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats; cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] = (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX); cpi->do_update_frame_probs_warp[cpi->num_frame_recode] = (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0); cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] = (cm->current_frame.frame_type != KEY_FRAME && cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && features->interp_filter == SWITCHABLE); } if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats || ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) && (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < TX_SIZES_ALL; i++) { int sum = 0; int j; int left = MAX_TX_TYPE_PROB; for (j = 0; j < TX_TYPES; j++) sum += cpi->td.rd_counts.tx_type_used[i][j]; for (j = TX_TYPES - 1; j >= 0; j--) { int update_txtype_frameprobs = 1; const int new_prob = sum ? (int)((int64_t)MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum) : (j ? 
0 : MAX_TX_TYPE_PROB); #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { int prob = (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob; // Copy temp_frame_probs_simulation to temp_frame_probs for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; update_type_idx++) { temp_frame_probs->tx_type_probs[update_type_idx][i][j] = temp_frame_probs_simulation ->tx_type_probs[update_type_idx][i][j]; } } update_txtype_frameprobs = 0; } #endif // CONFIG_FPMT_TEST // Track the frame probabilities of parallel encode frames to update // during postencode stage. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { update_txtype_frameprobs = 0; cpi->frame_new_probs[cpi->num_frame_recode] .tx_type_probs[update_type][i][j] = new_prob; } if (update_txtype_frameprobs) { int prob = (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; frame_probs->tx_type_probs[update_type][i][j] = prob; } } } } if (cm->seg.enabled) { cm->seg.temporal_update = 1; if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1]) cm->seg.temporal_update = 0; } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < BLOCK_SIZES_ALL; i++) { int sum = 0; int update_obmc_frameprobs = 1; for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j]; const int new_prob = sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { temp_frame_probs_simulation->obmc_probs[update_type][i] = (temp_frame_probs_simulation->obmc_probs[update_type][i] + new_prob) >> 1; // Copy temp_frame_probs_simulation to temp_frame_probs for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; update_type_idx++) { temp_frame_probs->obmc_probs[update_type_idx][i] = temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; } } update_obmc_frameprobs = 0; } #endif // CONFIG_FPMT_TEST // Track the frame probabilities of parallel encode frames to update // during postencode stage. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { update_obmc_frameprobs = 0; cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] = new_prob; } if (update_obmc_frameprobs) { frame_probs->obmc_probs[update_type][i] = (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; } } } if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int update_warp_frameprobs = 1; int sum = 0; for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; const int new_prob = sum ? 
128 * cpi->td.rd_counts.warped_used[1] / sum : 0; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { temp_frame_probs_simulation->warped_probs[update_type] = (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >> 1; // Copy temp_frame_probs_simulation to temp_frame_probs for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; update_type_idx++) { temp_frame_probs->warped_probs[update_type_idx] = temp_frame_probs_simulation->warped_probs[update_type_idx]; } } update_warp_frameprobs = 0; } #endif // CONFIG_FPMT_TEST // Track the frame probabilities of parallel encode frames to update // during postencode stage. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { update_warp_frameprobs = 0; cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] = new_prob; } if (update_warp_frameprobs) { frame_probs->warped_probs[update_type] = (frame_probs->warped_probs[update_type] + new_prob) >> 1; } } if (cm->current_frame.frame_type != KEY_FRAME && cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && features->interp_filter == SWITCHABLE) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int sum = 0; int j; int left = 1536; for (j = 0; j < SWITCHABLE_FILTERS; j++) { sum += cpi->td.counts->switchable_interp[i][j]; } for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { int update_interpfilter_frameprobs = 1; const int new_prob = sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum : (j ? 0 : 1536); #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { int prob = (temp_frame_probs_simulation ->switchable_interp_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; temp_frame_probs_simulation ->switchable_interp_probs[update_type][i][j] = prob; // Copy temp_frame_probs_simulation to temp_frame_probs for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; update_type_idx++) { temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = temp_frame_probs_simulation ->switchable_interp_probs[update_type_idx][i][j]; } } update_interpfilter_frameprobs = 0; } #endif // CONFIG_FPMT_TEST // Track the frame probabilities of parallel encode frames to update // during postencode stage. 
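// (For such frames the fresh estimate is stashed in cpi->frame_new_probs // and frame_probs itself is left untouched here; the merge happens once the // parallel frame set has finished encoding.)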
if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { update_interpfilter_frameprobs = 0; cpi->frame_new_probs[cpi->num_frame_recode] .switchable_interp_probs[update_type][i][j] = new_prob; } if (update_interpfilter_frameprobs) { int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; frame_probs->switchable_interp_probs[update_type][i][j] = prob; } } } } if (hash_table_created) { av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table); } } /*!\brief Setup reference frame buffers and encode a frame * * \ingroup high_level_algo * \callgraph * \callergraph * * \param[in] cpi Top-level encoder structure */ void av1_encode_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; RD_COUNTS *const rdc = &cpi->td.rd_counts; const AV1EncoderConfig *const oxcf = &cpi->oxcf; // Indicates whether or not to use a default reduced set for ext-tx // rather than the potential full set of 16 transforms features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set; // Make sure segment_id is no larger than last_active_segid. if (cm->seg.enabled && cm->seg.update_map) { const int mi_rows = cm->mi_params.mi_rows; const int mi_cols = cm->mi_params.mi_cols; const int last_active_segid = cm->seg.last_active_segid; uint8_t *map = cpi->enc_seg.map; for (int mi_row = 0; mi_row < mi_rows; ++mi_row) { for (int mi_col = 0; mi_col < mi_cols; ++mi_col) { map[mi_col] = AOMMIN(map[mi_col], last_active_segid); } map += mi_cols; } } av1_setup_frame_buf_refs(cm); enforce_max_ref_frames(cpi, &cpi->ref_frame_flags, cm->cur_frame->ref_display_order_hint, cm->current_frame.display_order_hint); set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info, cpi->ref_frame_flags); av1_setup_frame_sign_bias(cm); // If global motion is enabled, then every buffer which is used as either // a source or a ref frame should have an image pyramid allocated. // Check here so that issues can be caught early in debug mode #if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY if (cpi->alloc_pyramid) { assert(cpi->source->y_pyramid); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (buf != NULL) { assert(buf->buf.y_pyramid); } } } #endif // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(av1_num_planes(cm)); #endif rdc->newmv_or_intra_blocks = 0; cpi->palette_pixel_num = 0; if (cpi->sf.hl_sf.frame_parameter_update || cpi->sf.rt_sf.use_comp_ref_nonrd) { if (frame_is_intra_only(cm)) current_frame->reference_mode = SINGLE_REFERENCE; else current_frame->reference_mode = REFERENCE_MODE_SELECT; features->interp_filter = SWITCHABLE; if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR; features->switchable_motion_mode = is_switchable_motion_mode_allowed( features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc); rdc->compound_ref_used_flag = 0; rdc->skip_mode_used_flag = 0; encode_frame_internal(cpi); if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { // Use a flag that includes 4x4 blocks if (rdc->compound_ref_used_flag == 0) { current_frame->reference_mode = SINGLE_REFERENCE; #if CONFIG_ENTROPY_STATS av1_zero(cpi->td.counts->comp_inter); #endif // CONFIG_ENTROPY_STATS } } // Re-check on the skip mode status as reference mode may have been // changed. 
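// If the frame is intra-only or ended up using SINGLE_REFERENCE, skip mode // is disallowed outright; the flag is also cleared when no block actually // selected skip mode during the search.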
SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info; if (frame_is_intra_only(cm) || current_frame->reference_mode == SINGLE_REFERENCE) { skip_mode_info->skip_mode_allowed = 0; skip_mode_info->skip_mode_flag = 0; } if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0) skip_mode_info->skip_mode_flag = 0; if (!cm->tiles.large_scale) { if (features->tx_mode == TX_MODE_SELECT && cpi->td.mb.txfm_search_info.txb_split_count == 0) features->tx_mode = TX_MODE_LARGEST; } } else { // This is needed if real-time speed setting is changed on the fly // from one using compound prediction to one using single reference. if (current_frame->reference_mode == REFERENCE_MODE_SELECT) current_frame->reference_mode = SINGLE_REFERENCE; encode_frame_internal(cpi); } } aom-3.12.1/av1/encoder/encodeframe.h000066400000000000000000000034421477627663500171330ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ #define AOM_AV1_ENCODER_ENCODEFRAME_H_ #include "aom/aom_integer.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #include "av1/encoder/global_motion.h" #ifdef __cplusplus extern "C" { #endif #define DELTA_Q_PERCEPTUAL_MODULATION \ 1 // 0: variance based // 1: wavelet AC energy based struct macroblock; struct yv12_buffer_config; struct AV1_COMP; struct ThreadData; void av1_init_rtc_counters(struct macroblock *const x); void av1_accumulate_rtc_counters(struct AV1_COMP *cpi, const struct macroblock *const x); void av1_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, int mi_col, const int num_planes, BLOCK_SIZE bsize); void av1_encode_frame(struct AV1_COMP *cpi); void av1_alloc_tile_data(struct AV1_COMP *cpi); void av1_init_tile_data(struct AV1_COMP *cpi); void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col, int mi_row); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ aom-3.12.1/av1/encoder/encodeframe_utils.c000066400000000000000000002133401477627663500203460ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include "av1/common/common_data.h" #include "av1/common/quant_common.h" #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/rdopt.h" void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int *const rdmult) { const AV1_COMMON *const cm = &cpi->common; const BLOCK_SIZE bsize_base = BLOCK_16X16; const int num_mi_w = mi_size_wide[bsize_base]; const int num_mi_h = mi_size_high[bsize_base]; const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; int row, col; double num_of_mi = 0.0; double geom_mean_of_scale = 1.0; // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least // BLOCK_8X8. // // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would // iterate 256 times. Considering the maximum value of // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()), // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX // (maximum value a double data type can hold). If bsize_base is modified to // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow. assert(bsize_base >= BLOCK_8X8); assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM || cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IQ); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col / num_mi_h; col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0); geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index]; num_of_mi += 1.0; } } geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi)); *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(errorperbit, *rdmult); } #if CONFIG_SALIENCY_MAP void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, int *errorperbit, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int *const rdmult) { const AV1_COMMON *const cm = &cpi->common; const int num_mi_w = mi_size_wide[bsize]; const int num_mi_h = mi_size_high[bsize]; const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; *rdmult = (int)(*rdmult * cpi->sm_scaling_factor[(mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w)]); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(errorperbit, *rdmult); } #endif // TODO(angiebird): Move this function to tpl_model.c #if !CONFIG_REALTIME_ONLY int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col) { const AV1_COMMON *const cm = &cpi->common; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int tpl_idx = cpi->gf_frame_index; int deltaq_rdmult = set_rdmult(cpi, x, -1); if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; if (x->rb == 0) return deltaq_rdmult; TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame 
*tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; int tpl_stride = tpl_frame->stride; double intra_cost_base = 0; double mc_dep_cost_base = 0; double cbcmp_base = 0; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; for (int row = mi_row; row < mi_row + mi_high; row += step) { for (int col = mi_col; col < mi_col + mi_wide; col += step) { if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) continue; TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; double cbcmp = (double)this_stats->srcrf_dist; int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); intra_cost_base += log(dist_scaled) * cbcmp; mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp; cbcmp_base += cbcmp; } } if (cbcmp_base == 0) return deltaq_rdmult; double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base); deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb)); return AOMMAX(deltaq_rdmult, 1); } #endif // !CONFIG_REALTIME_ONLY static inline void update_filter_type_count(FRAME_COUNTS *counts, const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi) { int dir; for (dir = 0; dir < 2; ++dir) { const int ctx = av1_get_pred_context_switchable_interp(xd, dir); InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); // Only allow the 3 valid SWITCHABLE_FILTERS. assert(filter < SWITCHABLE_FILTERS); ++counts->switchable_interp[ctx][filter]; } } // This function will copy the best reference mode information from // MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. 
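// Only the entries for the single ref_frame_type that won the mode search // are restored (MV stack, weights, mode context and MV count), plus the // global MVs; the other reference types in MB_MODE_INFO_EXT are left as // they were.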
static inline void copy_mbmi_ext_frame_to_mbmi_ext( MB_MODE_INFO_EXT *mbmi_ext, const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, sizeof(mbmi_ext->global_mvs)); } void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, const PICK_MODE_CONTEXT *const ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { int i, x_idx, y; const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const MB_MODE_INFO *const mi = &ctx->mic; MB_MODE_INFO *const mi_addr = xd->mi[0]; const struct segmentation *const seg = &cm->seg; assert(bsize < BLOCK_SIZES_ALL); const int bw = mi_size_wide[mi->bsize]; const int bh = mi_size_high[mi->bsize]; const int mis = mi_params->mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; TxfmSearchInfo *txfm_info = &x->txfm_search_info; assert(mi->bsize == bsize); *mi_addr = *mi; copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best, av1_ref_frame_type(ctx->mic.ref_frame)); memcpy(txfm_info->blk_skip, ctx->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); txfm_info->skip_txfm = ctx->rd_stats.skip_txfm; xd->tx_type_map = ctx->tx_type_map; xd->tx_type_map_stride = mi_size_wide[bsize]; // If not dry_run, copy the transform type data into the frame level buffer. // Encoder will fetch tx types when writing bitstream. if (!dry_run) { const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx; const int mi_stride = mi_params->mi_stride; for (int blk_row = 0; blk_row < bh; ++blk_row) { av1_copy_array(tx_type_map + blk_row * mi_stride, xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw); } xd->tx_type_map = tx_type_map; xd->tx_type_map_stride = mi_stride; } // If segmentation in use if (seg->enabled) { // For in frame complexity AQ copy the segment id from the segment map. if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) { const uint8_t *const map = seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; mi_addr->segment_id = map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0; } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. 
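// (This update is skipped for inactive segments and when an external // real-time rate controller is in use, see the condition below.)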
if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mi_addr->segment_id != AM_SEGMENT_ID_INACTIVE && !cpi->rc.rtc_external_ratectrl) { av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, ctx->rd_stats.rate, ctx->rd_stats.dist, txfm_info->skip_txfm, dry_run); } if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) mi_addr->uv_mode = UV_DC_PRED; if (!dry_run && !mi_addr->skip_txfm) { int cdf_num; const uint8_t spatial_pred = av1_get_spatial_seg_pred( cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); const uint8_t coded_id = av1_neg_interleave( mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1); int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id]; td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost; const int pred_segment_id = cm->last_frame_seg_map ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0; const int use_tmp_pred = pred_segment_id == mi_addr->segment_id; const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd); td->rd_counts.seg_tmp_pred_cost[1] += x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred]; if (!use_tmp_pred) { td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost; } } } // Count zero motion vector. if (!dry_run && !frame_is_intra_only(cm)) { const MV mv = mi->mv[0].as_mv; if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME && abs(mv.row) < 8 && abs(mv.col) < 8) { const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); // Accumulate low_content_frame. for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1; } } for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; p[i].dqcoeff = ctx->dqcoeff[i]; p[i].eobs = ctx->eobs[i]; p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; // Restore the coding context of the MB to that that was in place // when the mode was picked for it const int cols = AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width); const int rows = AOMMIN( (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height); for (y = 0; y < rows; y++) { for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr; } if (cpi->oxcf.q_cfg.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0); if (dry_run) return; #if CONFIG_INTERNAL_STATS { unsigned int *const mode_chosen_counts = (unsigned int *)cpi->mode_chosen_counts; // Cast const away. if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/, THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/, THR_D135_PRED /*D135_PRED*/, THR_D113_PRED /*D113_PRED*/, THR_D157_PRED /*D157_PRED*/, THR_D203_PRED /*D203_PRED*/, THR_D67_PRED /*D67_PRED*/, THR_SMOOTH, /*SMOOTH_PRED*/ THR_SMOOTH_V, /*SMOOTH_V_PRED*/ THR_SMOOTH_H, /*SMOOTH_H_PRED*/ THR_PAETH /*PAETH_PRED*/, }; ++mode_chosen_counts[kf_mode_index[mi_addr->mode]]; } else { // Note how often each mode chosen as best ++mode_chosen_counts[ctx->best_mode_index]; } } #endif if (!frame_is_intra_only(cm)) { if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) { // When the frame interp filter is SWITCHABLE, several cases that always // use the default type (EIGHTTAP_REGULAR) are described in // av1_is_interp_needed(). Here, we should keep the counts for all // applicable blocks, so the frame filter resetting decision in // fix_interp_filter() is made correctly. 
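// update_filter_type_count() (defined earlier in this file) increments // counts->switchable_interp for both filter directions of this block.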
update_filter_type_count(td->counts, xd, mi_addr); } } const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col); const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row); if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, PREDICTION_MODE mode, int16_t mode_context) { (void)counts; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; if (mode == NEWMV) { #if CONFIG_ENTROPY_STATS ++counts->newmv_mode[mode_ctx][0]; #endif update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); return; } #if CONFIG_ENTROPY_STATS ++counts->newmv_mode[mode_ctx][1]; #endif update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; if (mode == GLOBALMV) { #if CONFIG_ENTROPY_STATS ++counts->zeromv_mode[mode_ctx][0]; #endif update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); return; } #if CONFIG_ENTROPY_STATS ++counts->zeromv_mode[mode_ctx][1]; #endif update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; #if CONFIG_ENTROPY_STATS ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; #endif update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); } static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, FRAME_COUNTS *counts) { FRAME_CONTEXT *fc = xd->tile_ctx; const BLOCK_SIZE bsize = mbmi->bsize; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); (void)counts; if (mbmi->mode == DC_PRED) { const int n = pmi->palette_size[0]; const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); #if CONFIG_ENTROPY_STATS ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; #endif update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], n > 0, 2); if (n > 0) { #if CONFIG_ENTROPY_STATS ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; #endif update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], n - PALETTE_MIN_SIZE, PALETTE_SIZES); } } if (mbmi->uv_mode == UV_DC_PRED) { const int n = pmi->palette_size[1]; const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); #if CONFIG_ENTROPY_STATS ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; #endif update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); if (n > 0) { #if CONFIG_ENTROPY_STATS ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; #endif update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], n - PALETTE_MIN_SIZE, PALETTE_SIZES); } } } void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, const MB_MODE_INFO *above_mi, const MB_MODE_INFO *left_mi, const int intraonly) { FRAME_CONTEXT *fc = xd->tile_ctx; const PREDICTION_MODE y_mode = mbmi->mode; (void)counts; const BLOCK_SIZE bsize = mbmi->bsize; if (intraonly) { #if CONFIG_ENTROPY_STATS const PREDICTION_MODE above = av1_above_block_mode(above_mi); const PREDICTION_MODE left = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[above]; const int left_ctx = intra_mode_context[left]; ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; #endif // CONFIG_ENTROPY_STATS update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); } else { #if CONFIG_ENTROPY_STATS ++counts->y_mode[size_group_lookup[bsize]][y_mode]; #endif // CONFIG_ENTROPY_STATS update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); } if 
(av1_filter_intra_allowed(cm, mbmi)) { const int use_filter_intra_mode = mbmi->filter_intra_mode_info.use_filter_intra; #if CONFIG_ENTROPY_STATS ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode]; if (use_filter_intra_mode) { ++counts ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; } #endif // CONFIG_ENTROPY_STATS update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2); if (use_filter_intra_mode) { update_cdf(fc->filter_intra_mode_cdf, mbmi->filter_intra_mode_info.filter_intra_mode, FILTER_INTRA_MODES); } } if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { #if CONFIG_ENTROPY_STATS ++counts->angle_delta[mbmi->mode - V_PRED] [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; #endif update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, 2 * MAX_ANGLE_DELTA + 1); } if (!xd->is_chroma_ref) return; const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); #if CONFIG_ENTROPY_STATS ++counts->uv_mode[cfl_allowed][y_mode][uv_mode]; #endif // CONFIG_ENTROPY_STATS update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, UV_INTRA_MODES - !cfl_allowed); if (uv_mode == UV_CFL_PRED) { const int8_t joint_sign = mbmi->cfl_alpha_signs; const uint8_t idx = mbmi->cfl_alpha_idx; #if CONFIG_ENTROPY_STATS ++counts->cfl_sign[joint_sign]; #endif update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; #if CONFIG_ENTROPY_STATS ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; #endif update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); } if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; #if CONFIG_ENTROPY_STATS ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; #endif update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); } } const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) { #if CONFIG_ENTROPY_STATS ++counts->angle_delta[intra_mode - V_PRED] [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; #endif update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED], mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, 2 * MAX_ANGLE_DELTA + 1); } if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { update_palette_cdf(xd, mbmi, counts); } } void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, const int num_planes) { MACROBLOCKD *xd = &x->e_mbd; int p; const int num_4x4_blocks_wide = mi_size_wide[bsize]; const int num_4x4_blocks_high = mi_size_high[bsize]; int mi_width = mi_size_wide[bsize]; int mi_height = mi_size_high[bsize]; for (p = 0; p < num_planes; p++) { int tx_col = mi_col; int tx_row = mi_row & MAX_MIB_MASK; memcpy( xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), ctx->a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), ctx->l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } memcpy(xd->above_partition_context + mi_col, ctx->sa, sizeof(*xd->above_partition_context) * mi_width); memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl, 
sizeof(xd->left_partition_context[0]) * mi_height); xd->above_txfm_context = ctx->p_ta; xd->left_txfm_context = ctx->p_tl; memcpy(xd->above_txfm_context, ctx->ta, sizeof(*xd->above_txfm_context) * mi_width); memcpy(xd->left_txfm_context, ctx->tl, sizeof(*xd->left_txfm_context) * mi_height); } void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, const int num_planes) { const MACROBLOCKD *xd = &x->e_mbd; int p; int mi_width = mi_size_wide[bsize]; int mi_height = mi_size_high[bsize]; // buffer the above/left context information of the block in search. for (p = 0; p < num_planes; ++p) { int tx_col = mi_col; int tx_row = mi_row & MAX_MIB_MASK; memcpy( ctx->a + mi_width * p, xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x); memcpy(ctx->l + mi_height * p, xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y); } memcpy(ctx->sa, xd->above_partition_context + mi_col, sizeof(*xd->above_partition_context) * mi_width); memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK), sizeof(xd->left_partition_context[0]) * mi_height); memcpy(ctx->ta, xd->above_txfm_context, sizeof(*xd->above_txfm_context) * mi_width); memcpy(ctx->tl, xd->left_txfm_context, sizeof(*xd->left_txfm_context) * mi_height); ctx->p_ta = xd->above_txfm_context; ctx->p_tl = xd->left_txfm_context; } static void set_partial_sb_partition(const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in, int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize, MB_MODE_INFO **mib) { int bh = bh_in; int r, c; for (r = 0; r < cm->seq_params->mib_size; r += bh) { int bw = bw_in; for (c = 0; c < cm->seq_params->mib_size; c += bw) { const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); mib[grid_index] = mi + mi_index; mib[grid_index]->bsize = find_partition_size( bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); } } } // This function attempts to set all mode info entries in a given superblock // to the same block partition size. // However, at the bottom and right borders of the image the requested size // may not be allowed in which case this code attempts to choose the largest // allowable partition. 
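// For example, if BLOCK_64X64 is requested but only part of the superblock // lies inside the frame, set_partial_sb_partition() is used instead and // picks, via find_partition_size(), the largest partition size that still // fits in the remaining rows and columns.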
void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, MB_MODE_INFO **mib, int mi_row, int mi_col, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_rows_remaining = tile->mi_row_end - mi_row; const int mi_cols_remaining = tile->mi_col_end - mi_col; MB_MODE_INFO *const mi_upper_left = mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col); int bh = mi_size_high[bsize]; int bw = mi_size_wide[bsize]; assert(bsize >= mi_params->mi_alloc_bsize && "Attempted to use bsize < mi_params->mi_alloc_bsize"); assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); // Apply the requested partition size to the SB if it is all "in image" if ((mi_cols_remaining >= cm->seq_params->mib_size) && (mi_rows_remaining >= cm->seq_params->mib_size)) { for (int block_row = 0; block_row < cm->seq_params->mib_size; block_row += bh) { for (int block_col = 0; block_col < cm->seq_params->mib_size; block_col += bw) { const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); mib[grid_index] = mi_upper_left + mi_index; mib[grid_index]->bsize = bsize; } } } else { // Else this is a partial SB. set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining, mi_cols_remaining, bsize, mib); } } int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, BLOCK_SIZE bsize) { const int bs = mi_size_wide[bsize]; const int hbs = bs / 2; assert(bsize >= BLOCK_8X8); const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int i = 0; i < 4; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; if ((mi_row + y_idx >= cm->mi_params.mi_rows) || (mi_col + x_idx >= cm->mi_params.mi_cols)) return 0; if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) != PARTITION_NONE && subsize != BLOCK_8X8) return 0; } return 1; } #if !CONFIG_REALTIME_ONLY int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult) { AV1_COMMON *const cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int tpl_idx = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; int64_t intra_cost = 0; int64_t mc_dep_cost = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; int tpl_stride = tpl_frame->stride; if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) { return orig_rdmult; } if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { return orig_rdmult; } #ifndef NDEBUG int mi_count = 0; #endif const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_col_end_sr = coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int step = 1 << block_mis_log2; const int row_step = step; const int col_step_sr = coded_to_superres_mi(step, cm->superres_scale_denominator); for (int row = mi_row; row < mi_row + mi_high; row += row_step) { for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; TplDepStats *this_stats = 
&tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); intra_cost += this_stats->recrf_dist << RDDIV_BITS; mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; #ifndef NDEBUG mi_count++; #endif } } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); double beta = 1.0; if (mc_dep_cost > 0 && intra_cost > 0) { const double r0 = cpi->rd.r0; const double rk = (double)intra_cost / mc_dep_cost; beta = (r0 / rk); } int rdmult = av1_get_adaptive_rdmult(cpi, beta); rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); rdmult = AOMMAX(1, rdmult); return rdmult; } // Checks to see if a super block is on a horizontal image edge. // In most cases this is the "real" edge unless there are formatting // bars embedded in the stream. int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { int top_edge = 0; int bottom_edge = cpi->common.mi_params.mi_rows; int is_active_h_edge = 0; // For two pass account for any formatting bars detected. if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. top_edge += (int)(this_frame_stats->inactive_zone_rows * 4); bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4); bottom_edge = AOMMAX(top_edge, bottom_edge); } if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { is_active_h_edge = 1; } return is_active_h_edge; } // Checks to see if a super block is on a vertical image edge. // In most cases this is the "real" edge unless there are formatting // bars embedded in the stream. int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { int left_edge = 0; int right_edge = cpi->common.mi_params.mi_cols; int is_active_v_edge = 0; // For two pass account for any formatting bars detected. if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. 
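// (The factor of 4 converts 16x16 macroblock units to 4x4 mode-info units.)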
left_edge += (int)(this_frame_stats->inactive_zone_cols * 4); right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4); right_edge = AOMMAX(left_edge, right_edge); } if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { is_active_v_edge = 1; } return is_active_v_edge; } void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, SuperBlockEnc *sb_enc) { sb_enc->tpl_data_count = 0; if (!cpi->oxcf.algo_cfg.enable_tpl_model) return; if (cpi->common.current_frame.frame_type == KEY_FRAME) return; const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) return; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); AV1_COMMON *const cm = &cpi->common; const int gf_group_index = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; int tpl_stride = tpl_frame->stride; int mi_count = 0; int count = 0; const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_col_end_sr = coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); // mi_cols_sr is mi_cols at superres case. const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); // TPL store unit size is not the same as the motion estimation unit size. // Here always use motion estimation size to avoid getting repetitive inter/ // intra cost. const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]); const int row_step = mi_size_high[tpl_bsize]; const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize], cm->superres_scale_denominator); // Stride is only based on SB size, and we fill in values for every 16x16 // block in a SB. sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr; for (int row = mi_row; row < mi_row + mi_high; row += row_step) { for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); // Handle partial SB, so that no invalid values are used later. 
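// Units that fall outside the frame are filled with INT64_MAX costs and INVALID_MV sentinels so later stages can recognize and skip them.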
if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) { sb_enc->tpl_inter_cost[count] = INT64_MAX; sb_enc->tpl_intra_cost[count] = INT64_MAX; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { sb_enc->tpl_mv[count][i].as_int = INVALID_MV; } count++; continue; } TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; sb_enc->tpl_inter_cost[count] = this_stats->inter_cost << TPL_DEP_COST_SCALE_LOG2; sb_enc->tpl_intra_cost[count] = this_stats->intra_cost << TPL_DEP_COST_SCALE_LOG2; memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv)); mi_count++; count++; } } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); sb_enc->tpl_data_count = mi_count; } // analysis_type 0: Use mc_dep_cost and intra_cost // analysis_type 1: Use count of best inter predictor chosen // analysis_type 2: Use cost reduction from intra to inter for best inter // predictor chosen int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, int64_t *delta_dist, BLOCK_SIZE bsize, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int tpl_idx = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; double intra_cost = 0; double mc_dep_reg = 0; double mc_dep_cost = 0; double cbcmp_base = 1; double srcrf_dist = 0; double srcrf_sse = 0; double srcrf_rate = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int base_qindex = cm->quant_params.base_qindex; if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; int tpl_stride = tpl_frame->stride; if (!tpl_frame->is_valid) return base_qindex; #ifndef NDEBUG int mi_count = 0; #endif const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_col_end_sr = coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int step = 1 << block_mis_log2; const int row_step = step; const int col_step_sr = coded_to_superres_mi(step, cm->superres_scale_denominator); for (int row = mi_row; row < mi_row + mi_high; row += row_step) { for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; double cbcmp = (double)this_stats->srcrf_dist; int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); intra_cost += log(dist_scaled) * cbcmp; mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp; mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp; srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS); srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS); srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); #ifndef NDEBUG mi_count++; #endif cbcmp_base += cbcmp; } } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); int offset = 0; double beta = 1.0; double rk; if (mc_dep_cost > 0 && intra_cost > 0) { const double r0 = cpi->rd.r0; rk = exp((intra_cost - mc_dep_cost) / cbcmp_base); td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base); 
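// beta compares the frame-level TPL ratio r0 with this superblock's rk; when beta > 1 the block propagates more distortion than average, and the offset derived below then tends to be negative (finer quantization).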
beta = (r0 / rk); assert(beta > 0.0); } else { return base_qindex; } offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); int qindex = cm->quant_params.base_qindex + offset; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ); int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth); int sbs_qstep = av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth); if (delta_dist) { double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0); double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep); sbs_dist = AOMMIN(sbs_dist, srcrf_sse); *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk); *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0); *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0); } return qindex; } #if !DISABLE_HDR_LUMA_DELTAQ // offset table defined in Table3 of T-REC-H.Sup15 document. static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567, 634, 701, 767, 834, 1024 }; static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1, -2, -3, -4, -5, -6 }; #endif int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; assert(cm->seq_params->bit_depth == AOM_BITS_10); #if DISABLE_HDR_LUMA_DELTAQ (void)x; (void)bsize; (void)mi_row; (void)mi_col; return cm->quant_params.base_qindex; #else // calculate pixel average const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col); // adjust offset based on average of the pixel block int offset = 0; for (int i = 0; i < HDR_QP_LEVELS; i++) { if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) { offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR); break; } } const DeltaQInfo *const delta_q_info = &cm->delta_q_info; offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); int qindex = cm->quant_params.base_qindex + offset; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ); return qindex; #endif } #endif // !CONFIG_REALTIME_ONLY void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, BLOCK_SIZE bsize) { if (sms_tree == NULL) return; sms_tree->partitioning = PARTITION_NONE; if (bsize >= BLOCK_8X8) { BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int idx = 0; idx < 4; ++idx) av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize); } } // Record the ref frames that have been selected by square partition blocks. 
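// The mask is stored per 4x4 mi unit with a fixed stride of 32, enough to cover a 128x128 superblock (32x32 mi units).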
void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, BLOCK_SIZE bsize, int mib_size, int mi_row, int mi_col) { assert(mi_size_wide[bsize] == mi_size_high[bsize]); const int sb_size_mask = mib_size - 1; const int mi_row_in_sb = mi_row & sb_size_mask; const int mi_col_in_sb = mi_col & sb_size_mask; const int mi_size = mi_size_wide[bsize]; for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) { for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) { x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type; } } } static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr, int num_cdfs, int cdf_stride, int nsymbs, int wt_left, int wt_tr) { for (int i = 0; i < num_cdfs; i++) { for (int j = 0; j <= nsymbs; j++) { cdf_ptr_left[i * cdf_stride + j] = (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + ((wt_left + wt_tr) / 2)) / (wt_left + wt_tr)); assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); } } } #define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) #define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ do { \ aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ int num_cdfs = array_size / cdf_stride; \ avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ wt_left, wt_tr); \ } while (0) static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left, int wt_tr) { AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); for (int i = 0; i < 2; i++) { AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, MV_CLASSES); AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, nmv_tr->comps[i].class0_hp_cdf, 2); AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, CLASS0_SIZE); AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); } } // In case of row-based multi-threading of encoder, since we always // keep a top - right sync, we can average the top - right SB's CDFs and // the left SB's CDFs and use the same for current SB's encoding to // improve the performance. This function facilitates the averaging // of CDF and used only when row-mt is enabled in encoder. 
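// Each CDF entry is combined as a rounded weighted mean, i.e. (left * wt_left + tr * wt_tr + (wt_left + wt_tr) / 2) / (wt_left + wt_tr), as implemented in avg_cdf_symbol() above.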
void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, int wt_left, int wt_tr) { AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, MASKED_COMPOUND_TYPES); AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, INTERINTRA_MODES); AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, PALETTE_SIZES); AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, PALETTE_SIZES); for (int j = 0; j < PALETTE_SIZES; j++) { int nsymbs = j + PALETTE_MIN_SIZE; AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], ctx_tr->palette_y_color_index_cdf[j], nsymbs, CDF_SIZE(PALETTE_COLORS)); AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], ctx_tr->palette_uv_color_index_cdf[j], nsymbs, CDF_SIZE(PALETTE_COLORS)); } AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2); AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); 
AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf, FILTER_INTRA_MODES); AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); for (int i = 0; i < PARTITION_CONTEXTS; i++) { if (i < 4) { AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, CDF_SIZE(10)); } else if (i < 16) { AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); } else { AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, CDF_SIZE(10)); } } AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, SWITCHABLE_FILTERS); AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, CDF_SIZE(MAX_TX_DEPTH + 1)); AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], MAX_TX_DEPTH + 1); AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], MAX_TX_DEPTH + 1); AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], MAX_TX_DEPTH + 1); AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); for (int i = 0; i < FRAME_LF_COUNT; i++) { AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); } AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, CFL_ALPHABET_SIZE); } // Check neighbor blocks' motion information. static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride, const TileInfo *const tile_info, int mi_row, int mi_col) { int is_above_low_motion = 1; int is_left_low_motion = 1; const int thr = 24; // Check above block. if (mi_row > tile_info->mi_row_start) { const MB_MODE_INFO *above_mbmi = mi[-mi_stride]; const int_mv above_mv = above_mbmi->mv[0]; if (above_mbmi->mode >= INTRA_MODE_END && (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr)) is_above_low_motion = 0; } // Check left block. 
if (mi_col > tile_info->mi_col_start) { const MB_MODE_INFO *left_mbmi = mi[-1]; const int_mv left_mv = left_mbmi->mv[0]; if (left_mbmi->mode >= INTRA_MODE_END && (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr)) is_left_low_motion = 0; } return (is_above_low_motion && is_left_low_motion); } // Check this block's motion in a fast way. static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y, int src_ystride, const uint8_t *last_src_y, int last_src_ystride, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; const BLOCK_SIZE bsize = cm->seq_params->sb_size; unsigned int blk_sad = INT_MAX; if (cpi->src_sad_blk_64x64 != NULL) { const int sb_size_by_mb = (bsize == BLOCK_128X128) ? (cm->seq_params->mib_size >> 1) : cm->seq_params->mib_size; const int sb_cols = (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sbi_col = mi_col / sb_size_by_mb; const int sbi_row = mi_row / sb_size_by_mb; blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols]; } else { blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); } // Search 4 1-away points. const uint8_t *const search_pos[4] = { last_src_y - last_src_ystride, last_src_y - 1, last_src_y + 1, last_src_y + last_src_ystride, }; unsigned int sad_arr[4]; cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos, last_src_ystride, sad_arr); blk_sad = (blk_sad * 5) >> 3; return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] && blk_sad < sad_arr[2] && blk_sad < sad_arr[3]); } // Grade the temporal variation of the source by comparing the current sb and // its collocated block in the last frame. void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col) { if (cpi->last_source->y_width != cpi->source->y_width || cpi->last_source->y_height != cpi->source->y_height) return; #if CONFIG_AV1_HIGHBITDEPTH if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return; #endif unsigned int tmp_sse; unsigned int tmp_variance; const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; uint8_t *src_y = cpi->source->y_buffer; const int src_ystride = cpi->source->y_stride; const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2); uint8_t *last_src_y = cpi->last_source->y_buffer; const int last_src_ystride = cpi->last_source->y_stride; const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2); uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64) uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64) 36000 }; // ~3*3*(64*64) uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) if (cpi->sf.rt_sf.increase_source_sad_thresh) { avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1; avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1; avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1; } uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 src_y += src_offset; last_src_y += last_src_offset; tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, last_src_ystride, &tmp_sse); // rd thresholds if (tmp_sse < avg_source_sse_threshold_low[1]) x->content_state_sb.source_sad_rd = kLowSad; // nonrd thresholds if (tmp_sse == 0) { x->content_state_sb.source_sad_nonrd = kZeroSad; return; } if (tmp_sse < avg_source_sse_threshold_verylow) x->content_state_sb.source_sad_nonrd = kVeryLowSad; else if (tmp_sse < avg_source_sse_threshold_low[0]) x->content_state_sb.source_sad_nonrd 
= kLowSad; else if (tmp_sse > avg_source_sse_threshold_high) x->content_state_sb.source_sad_nonrd = kHighSad; // Detect large lighting change. // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh) x->content_state_sb.lighting_change = 1; if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1)) x->content_state_sb.low_sumdiff = 1; if (tmp_sse > ((avg_source_sse_threshold_high * 7) >> 3) && !x->content_state_sb.lighting_change && !x->content_state_sb.low_sumdiff) x->sb_force_fixed_part = 0; if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad || cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1) return; // In-place temporal filter. If psnr calculation is enabled, we store the // source for that. AV1_COMMON *const cm = &cpi->common; // Calculate n*mean^2 const unsigned int nmean2 = tmp_sse - tmp_variance; const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, cm->seq_params->bit_depth); const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME], 0, cm->seq_params->bit_depth); const unsigned int threshold = (cpi->sf.rt_sf.use_rtc_tf == 1) ? (clamp(avg_q_step, 250, 1000)) * ac_q_step : 250 * ac_q_step; // TODO(yunqing): use a weighted sum instead of averaging in filtering. if (tmp_variance <= threshold && nmean2 <= 15) { // Check neighbor blocks. If neighbor blocks aren't low-motion blocks, // skip temporal filtering for this block. MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); const TileInfo *const tile_info = &tile_data->tile_info; const int is_neighbor_blocks_low_motion = check_neighbor_blocks( mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col); if (!is_neighbor_blocks_low_motion) return; // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB // size. // Test several nearby points. If non-zero mv exists, don't do temporal // filtering. 
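// fast_detect_non_zero_motion() declares the block static only if the zero-mv SAD, scaled by 5/8, beats the SADs at all four one-pixel offsets of the collocated block.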
const int is_this_blk_low_motion = fast_detect_non_zero_motion( cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col); if (!is_this_blk_low_motion) return; const int shift_x[2] = { 0, cpi->source->subsampling_x }; const int shift_y[2] = { 0, cpi->source->subsampling_y }; const uint8_t h = block_size_high[bsize]; const uint8_t w = block_size_wide[bsize]; for (int plane = 0; plane < av1_num_planes(cm); ++plane) { uint8_t *src = cpi->source->buffers[plane]; const int src_stride = cpi->source->strides[plane != 0]; uint8_t *last_src = cpi->last_source->buffers[plane]; const int last_src_stride = cpi->last_source->strides[plane != 0]; src += src_stride * (mi_row << (2 - shift_y[plane != 0])) + (mi_col << (2 - shift_x[plane != 0])); last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) + (mi_col << (2 - shift_x[plane != 0])); for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) { for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) { src[j] = (last_src[j] + src[j]) >> 1; } src += src_stride; last_src += last_src_stride; } } } } // Memset the mbmis at the current superblock to 0 void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, int mi_row, int mi_col) { // size of sb in unit of mi (BLOCK_4X4) const int sb_size_mi = mi_size_wide[sb_size]; const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; // size of sb in unit of allocated mi size const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d; assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 && "mi is not allocated as a multiple of sb!"); assert(mi_params->mi_stride % sb_size_mi == 0 && "mi_grid_base is not allocated as a multiple of sb!"); const int mi_rows = mi_size_high[sb_size]; for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) { assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) < mi_params->mi_stride); const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col); const int alloc_mi_idx = get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col); memset(&mi_params->mi_grid_base[mi_grid_idx], 0, sb_size_mi * sizeof(*mi_params->mi_grid_base)); memset(&mi_params->tx_type_map[mi_grid_idx], 0, sb_size_mi * sizeof(*mi_params->tx_type_map)); if (cur_mi_row % mi_alloc_size_1d == 0) { memset(&mi_params->mi_alloc[alloc_mi_idx], 0, sb_size_alloc_mi * sizeof(*mi_params->mi_alloc)); } } } void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, ThreadData *td, const TileDataEnc *tile_data, int mi_row, int mi_col) { MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; const TileInfo *tile_info = &tile_data->tile_info; const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); sb_fp_stats->rd_count = td->rd_counts; sb_fp_stats->split_count = x->txfm_search_info.txb_split_count; sb_fp_stats->fc = *td->counts; // Don't copy in row_mt case, otherwise run into data race. No behavior change // in row_mt case. 
if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models, sizeof(sb_fp_stats->inter_mode_rd_models)); } memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact, sizeof(sb_fp_stats->thresh_freq_fact)); const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); sb_fp_stats->current_qindex = cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; #if CONFIG_INTERNAL_STATS memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts, sizeof(sb_fp_stats->mode_chosen_counts)); #endif // CONFIG_INTERNAL_STATS } void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, int mi_col) { MACROBLOCK *x = &td->mb; const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); const BLOCK_SIZE sb_size = cm->seq_params->sb_size; av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); td->rd_counts = sb_fp_stats->rd_count; x->txfm_search_info.txb_split_count = sb_fp_stats->split_count; *td->counts = sb_fp_stats->fc; if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models, sizeof(sb_fp_stats->inter_mode_rd_models)); } memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact, sizeof(sb_fp_stats->thresh_freq_fact)); const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = sb_fp_stats->current_qindex; #if CONFIG_INTERNAL_STATS memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts, sizeof(sb_fp_stats->mode_chosen_counts)); #endif // CONFIG_INTERNAL_STATS } /*! Checks whether to skip updating the entropy cost based on tile info. * * This function contains the common code used to skip the cost update of coeff, * mode, mv and dv symbols. */ static int skip_cost_update(const SequenceHeader *seq_params, const TileInfo *const tile_info, const int mi_row, const int mi_col, INTERNAL_COST_UPDATE_TYPE upd_level) { if (upd_level == INTERNAL_COST_UPD_SB) return 0; if (upd_level == INTERNAL_COST_UPD_OFF) return 1; // upd_level is at most as frequent as each sb_row in a tile. if (mi_col != tile_info->mi_col_start) return 1; if (upd_level == INTERNAL_COST_UPD_SBROW_SET) { const int mib_size_log2 = seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; const int sb_size = seq_params->mib_size * MI_SIZE; const int tile_height = (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens // once for 2, 4 sb rows for sb size 128, sb size 64 respectively. However, // as the update will not be equally spaced in smaller resolutions making // it equally spaced by calculating (mv_num_rows_cost_update) the number of // rows after which the cost update should happen. const int sb_size_update_freq_map[2] = { 2, 4 }; const int update_freq_sb_rows = sb_size_update_freq_map[sb_size != MAX_SB_SIZE]; const int update_freq_num_rows = sb_size * update_freq_sb_rows; // Round-up the division result to next integer. const int num_updates_per_tile = (tile_height + update_freq_num_rows - 1) / update_freq_num_rows; const int num_rows_update_per_tile = num_updates_per_tile * sb_size; // Round-up the division result to next integer. 
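// For instance, with 64x64 superblocks (update_freq_sb_rows = 4, update_freq_num_rows = 256) and a 720-pixel-tall tile: num_updates_per_tile = 3 and num_rows_update_per_tile = 192, so the update below fires every 4 SB rows (rows 0, 4 and 8).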
const int num_sb_rows_per_update = (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile; if ((sb_row % num_sb_rows_per_update) != 0) return 1; } return 0; } // Checks for skip status of mv cost update. static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, const int mi_row, const int mi_col) { const AV1_COMMON *cm = &cpi->common; // For intra frames, mv cdfs are not updated during the encode. Hence, the mv // cost calculation is skipped in this case. if (frame_is_intra_only(cm)) return 1; return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, cpi->sf.inter_sf.mv_cost_upd_level); } // Checks for skip status of dv cost update. static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, const int mi_row, const int mi_col) { const AV1_COMMON *cm = &cpi->common; // Intrabc is only applicable to intra frames. So skip if intrabc is not // allowed. if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) { return 1; } return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, cpi->sf.intra_sf.dv_cost_upd_level); } // Update the rate costs of some symbols according to the frequency directed // by speed features void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info, const int mi_row, const int mi_col) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; if (cm->features.disable_cdf_update) { return; } switch (cpi->sf.inter_sf.coeff_cost_upd_level) { case INTERNAL_COST_UPD_OFF: case INTERNAL_COST_UPD_TILE: // Tile level break; case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile case INTERNAL_COST_UPD_SBROW: // SB row level in tile case INTERNAL_COST_UPD_SB: // SB level if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, cpi->sf.inter_sf.coeff_cost_upd_level)) break; av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes); break; default: assert(0); } switch (cpi->sf.inter_sf.mode_cost_upd_level) { case INTERNAL_COST_UPD_OFF: case INTERNAL_COST_UPD_TILE: // Tile level break; case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile case INTERNAL_COST_UPD_SBROW: // SB row level in tile case INTERNAL_COST_UPD_SB: // SB level if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, cpi->sf.inter_sf.mode_cost_upd_level)) break; av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx); break; default: assert(0); } switch (cpi->sf.inter_sf.mv_cost_upd_level) { case INTERNAL_COST_UPD_OFF: case INTERNAL_COST_UPD_TILE: // Tile level break; case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile case INTERNAL_COST_UPD_SBROW: // SB row level in tile case INTERNAL_COST_UPD_SB: // SB level // Checks for skip status of mv cost update. if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break; av1_fill_mv_costs(&xd->tile_ctx->nmvc, cm->features.cur_frame_force_integer_mv, cm->features.allow_high_precision_mv, x->mv_costs); break; default: assert(0); } switch (cpi->sf.intra_sf.dv_cost_upd_level) { case INTERNAL_COST_UPD_OFF: case INTERNAL_COST_UPD_TILE: // Tile level break; case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile case INTERNAL_COST_UPD_SBROW: // SB row level in tile case INTERNAL_COST_UPD_SB: // SB level // Checks for skip status of dv cost update. 
if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break; av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs); break; default: assert(0); } } void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) { for (int plane = 0; plane < num_planes; ++plane) { aom_free(mb->plane[plane].src_diff); mb->plane[plane].src_diff = NULL; } } void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) { const int num_planes = av1_num_planes(cm); #ifndef NDEBUG for (int plane = 0; plane < num_planes; ++plane) { assert(!mb->plane[plane].src_diff); } #endif for (int plane = 0; plane < num_planes; ++plane) { const int subsampling_xy = plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y : 0; const int sb_size = MAX_SB_SQUARE >> subsampling_xy; CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff, (int16_t *)aom_memalign( 32, sizeof(*mb->plane[plane].src_diff) * sb_size)); } } aom-3.12.1/av1/encoder/encodeframe_utils.h000066400000000000000000000520661477627663500203610ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ #define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ #include "aom_ports/aom_timer.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" #include "av1/encoder/rdopt.h" #ifdef __cplusplus extern "C" { #endif #define WRITE_FEATURE_TO_FILE 0 #define FEATURE_SIZE_SMS_SPLIT_FAST 6 #define FEATURE_SIZE_SMS_SPLIT 17 #define FEATURE_SIZE_SMS_PRUNE_PART 25 #define FEATURE_SIZE_SMS_TERM_NONE 28 #define FEATURE_SIZE_FP_SMS_TERM_NONE 20 #define FEATURE_SIZE_MAX_MIN_PART_PRED 13 #define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 #define FEATURE_SMS_NONE_FLAG 1 #define FEATURE_SMS_SPLIT_FLAG (1 << 1) #define FEATURE_SMS_RECT_FLAG (1 << 2) #define FEATURE_SMS_PRUNE_PART_FLAG \ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) #define FEATURE_SMS_SPLIT_MODEL_FLAG \ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) // Number of sub-partitions in rectangular partition types. #define SUB_PARTITIONS_RECT 2 // Number of sub-partitions in split partition type. #define SUB_PARTITIONS_SPLIT 4 // Number of sub-partitions in AB partition types. #define SUB_PARTITIONS_AB 3 // Number of sub-partitions in 4-way partition types. #define SUB_PARTITIONS_PART4 4 // 4part partition types. enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); // AB partition types. enum { HORZ_A = 0, HORZ_B, VERT_A, VERT_B, NUM_AB_PARTS } UENUM1BYTE(AB_PART_TYPE); // Rectangular partition types. enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); // Structure to keep win flags for HORZ and VERT partition evaluations. 
typedef struct { int rect_part_win[NUM_RECT_PARTS]; } RD_RECT_PART_WIN_INFO; enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; enum { SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx } UENUM1BYTE(SB_MULTI_PASS_MODE); typedef struct { ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; PARTITION_CONTEXT sa[MAX_MIB_SIZE]; PARTITION_CONTEXT sl[MAX_MIB_SIZE]; TXFM_CONTEXT *p_ta; TXFM_CONTEXT *p_tl; TXFM_CONTEXT ta[MAX_MIB_SIZE]; TXFM_CONTEXT tl[MAX_MIB_SIZE]; } RD_SEARCH_MACROBLOCK_CONTEXT; // This struct is used to store the statistics used by sb-level multi-pass // encoding. Currently, this is only used to make a copy of the state before we // perform the first pass typedef struct SB_FIRST_PASS_STATS { RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_COUNTS rd_count; int split_count; FRAME_COUNTS fc; InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; int current_qindex; #if CONFIG_INTERNAL_STATS unsigned int mode_chosen_counts[MAX_MODES]; #endif // CONFIG_INTERNAL_STATS } SB_FIRST_PASS_STATS; // This structure contains block size related // variables for use in rd_pick_partition(). typedef struct { // Half of block width to determine block edge. int mi_step; // Block row and column indices. int mi_row; int mi_col; // Block edge row and column indices. int mi_row_edge; int mi_col_edge; // Block width of current partition block. int width; // Block width of minimum partition size allowed. int min_partition_size_1d; // Flag to indicate if partition is 8x8 or higher size. int bsize_at_least_8x8; // Indicates edge blocks in frame. int has_rows; int has_cols; // Block size of current partition. BLOCK_SIZE bsize; // Size of current sub-partition. BLOCK_SIZE subsize; // Size of split partition. BLOCK_SIZE split_bsize2; } PartitionBlkParams; #if CONFIG_COLLECT_PARTITION_STATS typedef struct PartitionTimingStats { // Tracks the number of partition decision used in the current call to \ref // av1_rd_pick_partition int partition_decisions[EXT_PARTITION_TYPES]; // Tracks the number of partition_block searched in the current call to \ref // av1_rd_pick_partition int partition_attempts[EXT_PARTITION_TYPES]; // Tracks the time spent on each partition search in the current call to \ref // av1_rd_pick_partition int64_t partition_times[EXT_PARTITION_TYPES]; // Tracks the rdcost spent on each partition search in the current call to // \ref av1_rd_pick_partition int64_t partition_rdcost[EXT_PARTITION_TYPES]; // Timer used to time the partitions. struct aom_usec_timer timer; // Whether the timer is on int timer_is_on; } PartitionTimingStats; #endif // CONFIG_COLLECT_PARTITION_STATS // Structure holding state variables for partition search. typedef struct { // Intra partitioning related info. PartitionSearchInfo *intra_part_info; // Parameters related to partition block size. PartitionBlkParams part_blk_params; // Win flags for HORZ and VERT partition evaluations. RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT]; // RD cost for the current block of given partition type. RD_STATS this_rdc; // RD cost summed across all blocks of partition type. RD_STATS sum_rdc; // Array holding partition type cost. int tmp_partition_cost[PARTITION_TYPES]; // Pointer to partition cost buffer int *partition_cost; // RD costs for different partition types. 
int64_t none_rd; int64_t split_rd[SUB_PARTITIONS_SPLIT]; // RD costs for rectangular partitions. // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ. // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT. int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT]; // Flags indicating if the corresponding partition was winner or not. // Used to bypass similar blocks during AB partition evaluation. int is_split_ctx_is_ready[2]; int is_rect_ctx_is_ready[NUM_RECT_PARTS]; // If true, skips the rest of partition evaluation at the current bsize level. int terminate_partition_search; // If false, skips rdopt on PARTITION_NONE. int partition_none_allowed; // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ, // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT. int partition_rect_allowed[NUM_RECT_PARTS]; // If false, skips searching rectangular partition unless some logic related // to edge detection holds. int do_rectangular_split; // If false, skips searching PARTITION_SPLIT. int do_square_split; // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that // this does not directly affect the extended partitions, so this can be used // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of // PARTITION_HORZ_AB4, etc. int prune_rect_part[NUM_RECT_PARTS]; // Chroma subsampling in x and y directions. int ss_x; int ss_y; // Partition plane context index. int pl_ctx_idx; // This flag will be set if best partition is found from the search. bool found_best_partition; #if CONFIG_COLLECT_PARTITION_STATS PartitionTimingStats part_timing_stats; #endif // CONFIG_COLLECT_PARTITION_STATS } PartitionSearchState; static inline void av1_disable_square_split_partition( PartitionSearchState *part_state) { part_state->do_square_split = 0; } // Disables all possible rectangular splits. This includes PARTITION_AB4 as they // depend on the corresponding partition_rect_allowed. static inline void av1_disable_rect_partitions( PartitionSearchState *part_state) { part_state->do_rectangular_split = 0; part_state->partition_rect_allowed[HORZ] = 0; part_state->partition_rect_allowed[VERT] = 0; } // Disables all possible splits so that only PARTITION_NONE *might* be allowed.
static inline void av1_disable_all_splits(PartitionSearchState *part_state) { av1_disable_square_split_partition(part_state); av1_disable_rect_partitions(part_state); } static inline void av1_set_square_split_only(PartitionSearchState *part_state) { part_state->partition_none_allowed = 0; part_state->do_square_split = 1; av1_disable_rect_partitions(part_state); } static inline bool av1_blk_has_rows_and_cols( const PartitionBlkParams *blk_params) { return blk_params->has_rows && blk_params->has_cols; } static inline bool av1_is_whole_blk_in_frame( const PartitionBlkParams *blk_params, const CommonModeInfoParams *mi_params) { const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; return mi_row + mi_size_high[bsize] <= mi_params->mi_rows && mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; } static inline void update_filter_type_cdf(const MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, int dual_filter) { for (int dir = 0; dir < 2; ++dir) { if (dir && !dual_filter) break; const int ctx = av1_get_pred_context_switchable_interp(xd, dir); InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, SWITCHABLE_FILTERS); } } static inline int set_rdmult(const AV1_COMP *const cpi, const MACROBLOCK *const x, int segment_id) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const CommonQuantParams *quant_params = &cm->quant_params; const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; const FRAME_UPDATE_TYPE update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); int qindex; if (segment_id >= 0) { qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); } else { qindex = quant_params->base_qindex + x->rdmult_delta_qindex + quant_params->y_dc_delta_q; } return av1_compute_rd_mult( qindex, bit_depth, update_type, layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); } static inline int do_split_check(BLOCK_SIZE bsize) { return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32); } #if !CONFIG_REALTIME_ONLY static inline const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) { assert(frm >= 0); if (frm < 0 || p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) { return NULL; } return &p->stats_buf_ctx->stats_in_start[frm]; } int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult); int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step); int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step); void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, SuperBlockEnc *sb_enc); int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, int64_t *delta_dist, BLOCK_SIZE bsize, int mi_row, int mi_col); int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col); int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col); #endif // !CONFIG_REALTIME_ONLY void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, const BLOCK_SIZE bsize, const int mi_row, 
const int mi_col, int *const rdmult); #if CONFIG_SALIENCY_MAP void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, int *errorperbit, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int *const rdmult); #endif void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, const PICK_MODE_CONTEXT *const ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run); void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, PREDICTION_MODE mode, int16_t mode_context); void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, const MB_MODE_INFO *above_mi, const MB_MODE_INFO *left_mi, const int intraonly); void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, const int num_planes); void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, const int num_planes); void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, MB_MODE_INFO **mib, int mi_row, int mi_col, BLOCK_SIZE bsize); int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, BLOCK_SIZE bsize); void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, BLOCK_SIZE bsize); void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, BLOCK_SIZE bsize, int mib_size, int mi_row, int mi_col); void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, int wt_left, int wt_tr); void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col); void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, int mi_row, int mi_col); void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, ThreadData *td, const TileDataEnc *tile_data, int mi_row, int mi_col); void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, int mi_col); void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info, const int mi_row, const int mi_col); void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes); static inline void av1_dealloc_mb_data(struct macroblock *mb, int num_planes) { aom_free(mb->txfm_search_info.mb_rd_record); mb->txfm_search_info.mb_rd_record = NULL; aom_free(mb->inter_modes_info); mb->inter_modes_info = NULL; av1_dealloc_src_diff_buf(mb, num_planes); aom_free(mb->e_mbd.seg_mask); mb->e_mbd.seg_mask = NULL; aom_free(mb->winner_mode_stats); mb->winner_mode_stats = NULL; aom_free(mb->dqcoeff_buf); mb->dqcoeff_buf = NULL; } static inline void allocate_winner_mode_stats(const AV1_COMP *cpi, struct macroblock *mb) { const SPEED_FEATURES *sf = &cpi->sf; // The winner_mode_stats buffer is not required in these cases. 
if (is_stat_generation_stage(cpi) || (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) || (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF)) return; const AV1_COMMON *cm = &cpi->common; const int winner_mode_count = winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; CHECK_MEM_ERROR(cm, mb->winner_mode_stats, (WinnerModeStats *)aom_malloc( winner_mode_count * sizeof(mb->winner_mode_stats[0]))); } void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb); static inline void av1_alloc_mb_data(const AV1_COMP *cpi, struct macroblock *mb) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *sf = &cpi->sf; if (!sf->rt_sf.use_nonrd_pick_mode) { // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is // enabled. if (sf->rd_sf.use_mb_rd_hash) CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record, (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD))); if (!frame_is_intra_only(cm)) CHECK_MEM_ERROR( cm, mb->inter_modes_info, (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info))); } av1_alloc_src_diff_buf(cm, mb); CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask, (uint8_t *)aom_memalign( 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0]))); allocate_winner_mode_stats(cpi, mb); const int max_sb_square_y = 1 << num_pels_log2_lookup[cm->seq_params->sb_size]; CHECK_MEM_ERROR( cm, mb->dqcoeff_buf, (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t))); } // This function will compute the number of reference frames to be disabled // based on selective_ref_frame speed feature. static inline unsigned int get_num_refs_to_disable( const AV1_COMP *cpi, const int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { unsigned int num_refs_to_disable = 0; if (cpi->sf.inter_sf.selective_ref_frame >= 3) { num_refs_to_disable++; if (cpi->sf.inter_sf.selective_ref_frame >= 6) { // Disable LAST2_FRAME and ALTREF2_FRAME num_refs_to_disable += 2; } else if (cpi->sf.inter_sf.selective_ref_frame == 5 && *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) { const int last2_frame_dist = av1_encoder_get_relative_dist( ref_display_order_hint[LAST2_FRAME - LAST_FRAME], cur_frame_display_index); // Disable LAST2_FRAME if it is a temporally distant frame if (abs(last2_frame_dist) > 2) { num_refs_to_disable++; } #if !CONFIG_REALTIME_ONLY else if (is_stat_consumption_stage_twopass(cpi)) { const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index); const double coded_error_per_mb = this_frame_stats->coded_error; // Disable LAST2_FRAME if the coded error of the current frame based on // first pass stats is very low. if (coded_error_per_mb < 100.0) num_refs_to_disable++; } #endif // CONFIG_REALTIME_ONLY } } return num_refs_to_disable; } static inline int get_max_allowed_ref_frames( const AV1_COMP *cpi, const int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { const unsigned int max_reference_frames = cpi->oxcf.ref_frm_cfg.max_reference_frames; const unsigned int num_refs_to_disable = get_num_refs_to_disable( cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index); const unsigned int max_allowed_refs_for_given_speed = INTER_REFS_PER_FRAME - num_refs_to_disable; return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames); } // Enforce the number of references for each arbitrary frame based on user // options and speed. 
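// References are dropped one at a time, following disable_order[], until the count fits within the limit returned by get_max_allowed_ref_frames().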
static inline void enforce_max_ref_frames( AV1_COMP *cpi, int *ref_frame_flags, const unsigned int *ref_display_order_hint, unsigned int cur_frame_display_index) { MV_REFERENCE_FRAME ref_frame; int total_valid_refs = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { total_valid_refs++; } } const int max_allowed_refs = get_max_allowed_ref_frames( cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index); for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) { const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i]; if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) { continue; } switch (ref_frame_to_disable) { case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break; case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break; case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break; case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break; default: assert(0); } --total_valid_refs; } assert(total_valid_refs <= max_allowed_refs); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ aom-3.12.1/av1/encoder/encodemb.c000066400000000000000000001061531477627663500164350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/bitwriter.h" #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/cfl.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/scan.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/txb_rdopt.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { assert(rows >= 4 && cols >= 4); #if CONFIG_AV1_HIGHBITDEPTH if (bd_info.use_highbitdepth_buf) { aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); return; } #endif (void)bd_info; aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); } void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; const BitDepthInfo bd_info = get_bit_depth_info(xd); struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; const int diff_stride = block_size_wide[plane_bsize]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; const int tx1d_width = tx_size_wide[tx_size]; const int tx1d_height = tx_size_high[tx_size]; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; int16_t *src_diff = &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride, src, src_stride, dst, dst_stride); } void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; assert(plane_bsize < BLOCK_SIZES_ALL); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const MACROBLOCKD *xd = &x->e_mbd; const BitDepthInfo bd_info = get_bit_depth_info(xd); av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost) { MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; const int eob = p->eobs[block]; const int segment_id = xd->mi[0]->segment_id; if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || xd->lossless[segment_id]) { *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size); return eob; } return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx, rate_cost, cpi->oxcf.algo_cfg.sharpness); } // Hyper-parameters for dropout optimization, based on following logics. // TODO(yjshen): These settings are tuned by experiments. They may still be // optimized for better performance. // (1) Coefficients which are large enough will ALWAYS be kept. 
static const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient. // (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is // NOT required. For example, `5 0 0 0 7` is treated as two continuous // coefficients if three zeros do not fulfill the dropout condition. static const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff. // (3) Dropout operation is NOT applicable to blocks with large or small // quantization index. static const int DROPOUT_Q_MAX = 128; static const int DROPOUT_Q_MIN = 16; // (4) Recall that dropout optimization will forcibly set some quantized // coefficients to zero. The key logic on determining whether a coefficient // should be dropped is to check the number of continuous zeros before AND // after this coefficient. The exact number of zeros for judgement depends // on block size and quantization index. More concretely, block size // determines the base number of zeros, while quantization index determines // the multiplier. Intuitively, larger block requires more zeros and larger // quantization index also requires more zeros (more information is lost // when using larger quantization index). static const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros. static const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros. static const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros. static const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros. static const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros. static const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros. static const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex) { const int tx_width = tx_size_wide[tx_size]; const int tx_height = tx_size_high[tx_size]; // Early return if `qindex` is out of range. if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) { return; } // Compute number of zeros used for dropout judgement. const int base_size = AOMMAX(tx_width, tx_height); const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE, DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX); const int dropout_num_before = multiplier * CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX); const int dropout_num_after = multiplier * CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX); av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before, dropout_num_after); } void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before, int dropout_num_after) { const struct macroblock_plane *const p = &mb->plane[plane]; tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); const int max_eob = av1_get_max_eob(tx_size); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); // Early return if there are not enough non-zero coefficients. if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before || max_eob <= dropout_num_before + dropout_num_after) { return; } int count_zeros_before = 0; int count_zeros_after = 0; int count_nonzeros = 0; // Index of the first non-zero coefficient after sufficient number of // continuous zeros. If equals to `-1`, it means number of leading zeros // hasn't reach `dropout_num_before`. 
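// As a concrete example of the thresholds computed in av1_dropout_qcoeff()
// above: for a 16x16 transform block at qindex 80, multiplier =
// CLIP(80 / 32, 2, 8) = 2 and the clipped base size is 16, so 32 leading and
// 32 trailing zeros are required around a run of small coefficients
// (magnitude <= DROPOUT_COEFF_MAX) before the loop below zeroes it out.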
int idx = -1; int eob = 0; // New end of block. for (int i = 0; i < p->eobs[block]; ++i) { const int scan_idx = scan_order->scan[i]; if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) { // Keep large coefficients. count_zeros_before = 0; count_zeros_after = 0; idx = -1; eob = i + 1; } else if (qcoeff[scan_idx] == 0) { // Count zeros. if (idx == -1) { ++count_zeros_before; } else { ++count_zeros_after; } } else { // Count non-zeros. if (count_zeros_before >= dropout_num_before) { idx = (idx == -1) ? i : idx; ++count_nonzeros; } else { count_zeros_before = 0; eob = i + 1; } } // Handle continuity. if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { count_zeros_before = 0; count_zeros_after = 0; count_nonzeros = 0; idx = -1; eob = i + 1; } // Handle the trailing zeros after original end of block. if (idx != -1 && i == p->eobs[block] - 1) { count_zeros_after += (max_eob - p->eobs[block]); } // Set redundant coefficients to zeros if needed. if (count_zeros_after >= dropout_num_after) { for (int j = idx; j <= i; ++j) { qcoeff[scan_order->scan[j]] = 0; dqcoeff[scan_order->scan[j]] = 0; } count_zeros_before += (i - idx + 1); count_zeros_after = 0; count_nonzeros = 0; } else if (i == p->eobs[block] - 1) { eob = i + 1; } } if (eob != p->eobs[block]) { p->eobs[block] = eob; p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(qcoeff, scan_order, eob); } } // Settings for optimization type. NOTE: To set optimization type for all intra // frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set. // TODO(yjshen): These settings are hard-coded and look okay for now. They // should be made configurable later. // Blocks of key frames ONLY. static const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; // Blocks of intra frames (key frames EXCLUSIVE). static const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; // Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default // if trellis optimization is on for inter frames.) 
static const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; enum { QUANT_FUNC_LOWBD = 0, QUANT_FUNC_HIGHBD = 1, QUANT_FUNC_TYPES = 2 } UENUM1BYTE(QUANT_FUNC); #if CONFIG_AV1_HIGHBITDEPTH static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, { NULL, NULL } }; #else static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = { av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL }; #endif // Computes the transform for DC only blocks void av1_xform_dc_only(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, int64_t per_px_mean) { assert(per_px_mean != INT64_MAX); const struct macroblock_plane *const p = &x->plane[plane]; const int block_offset = BLOCK_OFFSET(block); tran_low_t *const coeff = p->coeff + block_offset; const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); memset(coeff, 0, sizeof(*coeff) * n_coeffs); coeff[0] = (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12); } void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param, const QUANT_PARAM *qparam) { av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param); av1_quant(x, plane, block, txfm_param, qparam); } void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) { const struct macroblock_plane *const p = &x->plane[plane]; const int block_offset = BLOCK_OFFSET(block); tran_low_t *const coeff = p->coeff + block_offset; const int diff_stride = block_size_wide[plane_bsize]; const int src_offset = (blk_row * diff_stride + blk_col); const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2]; av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); } void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, const QUANT_PARAM *qparam) { const struct macroblock_plane *const p = &x->plane[plane]; const SCAN_ORDER *const scan_order = get_scan(txfm_param->tx_size, txfm_param->tx_type); const int block_offset = BLOCK_OFFSET(block); tran_low_t *const coeff = p->coeff + block_offset; tran_low_t *const qcoeff = p->qcoeff + block_offset; tran_low_t *const dqcoeff = p->dqcoeff + block_offset; uint16_t *const eob = &p->eobs[block]; if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); if (LIKELY(!x->seg_skip_block)) { #if CONFIG_AV1_HIGHBITDEPTH quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd]( coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); #else quant_func_list[qparam->xform_quant_idx]( coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); #endif } else { av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); } } // use_optimize_b is true means av1_optimze_b will be called, // thus cannot update entropy ctx now (performed in optimize_b) if (qparam->use_optimize_b) { p->txb_entropy_ctx[block] = 0; } else { p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(qcoeff, scan_order, *eob); } } void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size, TX_TYPE tx_type, TxfmParam *txfm_param) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; txfm_param->tx_type = tx_type; txfm_param->tx_size = tx_size; txfm_param->lossless = xd->lossless[mbmi->segment_id]; 
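// The extended transform set chosen below determines which transform types
// may be signalled for this transform size and block type (intra vs. inter),
// and is narrowed further when the reduced transform set is enabled.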
txfm_param->tx_set_type = av1_get_ext_tx_set_type( tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used); txfm_param->bd = xd->bd; txfm_param->is_hbd = is_cur_buf_hbd(xd); } void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx, int use_quant_b_adapt, QUANT_PARAM *qparam) { qparam->log_scale = av1_get_tx_scale(tx_size); qparam->tx_size = tx_size; qparam->use_quant_b_adapt = use_quant_b_adapt; // TODO(bohanli): optimize_b and quantization idx has relationship, // but is kind of buried and complicated in different encoding stages. // Should have a unified function to derive quant_idx, rather than // determine and pass in the quant_idx qparam->use_optimize_b = use_optimize_b; qparam->xform_quant_idx = xform_quant_idx; qparam->qmatrix = NULL; qparam->iqmatrix = NULL; } void av1_setup_qmatrix(const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type, QUANT_PARAM *qparam) { qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type); qparam->iqmatrix = av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type); } static void encode_block(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, RUN_TYPE dry_run) { (void)dry_run; struct encode_b_args *const args = arg; const AV1_COMP *const cpi = args->cpi; const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); uint8_t *dst; ENTROPY_CONTEXT *a, *l; int dummy_rate_cost = 0; const int bw = mi_size_wide[plane_bsize]; dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; a = &args->ta[blk_col]; l = &args->tl[blk_row]; TX_TYPE tx_type = DCT_DCT; const int blk_skip_idx = blk_row * bw + blk_col; if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) && !mbmi->skip_mode) { tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); TxfmParam txfm_param; QUANT_PARAM quant_param; const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run); int quant_idx; if (use_trellis) quant_idx = AV1_XFORM_QUANT_FP; else quant_idx = USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); // Whether trellis or dropout optimization is required for inter frames. 
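// Since INTER_BLOCK_OPT_TYPE is a compile-time constant (TRELLIS_DROPOUT_OPT
// by default), both flags below normally evaluate to true; dropout is still
// only applied when trellis optimization did not run for this block (see the
// !use_optimize_b check just after it).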
const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT || INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT || INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; if (quant_param.use_optimize_b && do_trellis) { TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, &dummy_rate_cost); } if (!quant_param.use_optimize_b && do_dropout) { av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, cm->quant_params.base_qindex); } } else { p->eobs[block] = 0; p->txb_entropy_ctx[block] = 0; } av1_set_txb_context(x, plane, block, tx_size, a, l); if (p->eobs[block]) { // As long as any YUV plane has non-zero quantized transform coefficients, // mbmi->skip_txfm flag is set to 0. mbmi->skip_txfm = 0; av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, pd->dst.stride, p->eobs[block], cm->features.reduced_tx_set_used); } else { // Only when YUV planes all have zero quantized transform coefficients, // mbmi->skip_txfm flag is set to 1. mbmi->skip_txfm &= 1; } // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 // case. It is possible that certain collision in hash index would cause // the assertion failure. To further optimize the rate-distortion // performance, we need to re-visit this part and enable this assert // again. if (p->eobs[block] == 0 && plane == 0) { #if 0 if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) { // TODO(jingning,angiebird,huisu@google.com): enable txk_check when // enable_optimize_b is true to detect potential RD bug. const uint8_t disable_txk_check = args->enable_optimize_b; if (!disable_txk_check) { assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] == DCT_DCT); } } #endif update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); } #if CONFIG_MISMATCH_DEBUG if (dry_run == OUTPUT_ENABLED) { int pixel_c, pixel_r; BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; int blk_w = block_size_wide[bsize]; int blk_h = block_size_high[bsize]; mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col, blk_row, pd->subsampling_x, pd->subsampling_y); mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, plane, pixel_c, pixel_r, blk_w, blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); } #endif } static void encode_block_inter(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, RUN_TYPE dry_run) { struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; const TX_SIZE plane_tx_size = plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, pd->subsampling_y) : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, blk_col)]; if (!plane) { assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] && tx_size_high[tx_size] >= tx_size_high[plane_tx_size]); } if (tx_size == plane_tx_size || plane) { encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg, dry_run); } else { assert(tx_size < TX_SIZES_ALL); const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); // This is the square transform block partition entry point. const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); for (int row = 0; row < row_end; row += bsh) { const int offsetr = blk_row + row; for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, arg, dry_run); block += step; } } } } void av1_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane, foreach_transformed_block_visitor visit, void *arg) { const struct macroblockd_plane *const pd = &xd->plane[plane]; // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. const TX_SIZE tx_size = av1_get_tx_size(plane, xd); const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; // Call visit() directly with zero offsets if the current block size is the // same as the transform block size. if (plane_bsize == tx_bsize) { visit(plane, 0, 0, 0, plane_bsize, tx_size, arg); return; } const uint8_t txw_unit = tx_size_wide_unit[tx_size]; const uint8_t txh_unit = tx_size_high_unit[tx_size]; const int step = txw_unit * txh_unit; // If mb_to_right_edge is < 0 we are in a situation in which // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); const int mu_blocks_wide = AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide); const int mu_blocks_high = AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high); // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. int i = 0; for (int r = 0; r < max_blocks_high; r += mu_blocks_high) { const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); // Skip visiting the sub blocks that are wholly within the UMV. for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) { const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) { for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) { visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); i += step; } } } } // Check if visit() is invoked at least once. 
assert(i >= 1); } typedef struct encode_block_pass1_args { AV1_COMP *cpi; MACROBLOCK *x; } encode_block_pass1_args; static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { encode_block_pass1_args *args = (encode_block_pass1_args *)arg; AV1_COMP *cpi = args->cpi; AV1_COMMON *cm = &cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); uint8_t *dst; dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; TxfmParam txfm_param; QUANT_PARAM quant_param; av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT, &quant_param); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); if (p->eobs[block] > 0) { txfm_param.eob = p->eobs[block]; if (txfm_param.is_hbd) { av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); return; } av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); } } void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1_args args = { cpi, x }; av1_subtract_plane(x, bsize, 0); av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, encode_block_pass1, &args); } void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RUN_TYPE dry_run) { assert(bsize < BLOCK_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; // In the current encoder implementation, for inter blocks, // only when YUV planes all have zero quantized transform coefficients, // mbmi->skip_txfm flag is set to 1. // For intra blocks, this flag is set to 0 since skipped blocks are so rare // that transmitting skip_txfm = 1 is very expensive. // mbmi->skip_txfm is init to 1, and will be modified in encode_block() based // on transform, quantization, and (if exists) trellis optimization. 
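// Note: if the transform search has already decided to skip the whole block
// (x->txfm_search_info.skip_txfm), the early return below leaves skip_txfm
// at 1 and no residual is coded for any plane.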
mbmi->skip_txfm = 1; if (x->txfm_search_info.skip_txfm) return; struct optimize_ctx ctx; struct encode_b_args arg = { cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id] }; const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const int subsampling_x = pd->subsampling_x; const int subsampling_y = pd->subsampling_y; if (plane && !xd->is_chroma_ref) break; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, subsampling_x, subsampling_y); assert(plane_bsize < BLOCK_SIZES_ALL); const int mi_width = mi_size_wide[plane_bsize]; const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; const int bw = mi_size_wide[txb_size]; const int bh = mi_size_high[txb_size]; int block = 0; const int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]); av1_subtract_plane(x, plane_bsize, plane); arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y); int mu_blocks_wide = mi_size_wide[max_unit_bsize]; int mu_blocks_high = mi_size_high[max_unit_bsize]; mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { int blk_row, blk_col; const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); for (blk_row = idy; blk_row < unit_height; blk_row += bh) { for (blk_col = idx; blk_col < unit_width; blk_col += bw) { encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, max_tx_size, &arg, dry_run); block += step; } } } } } } static void encode_block_intra(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; const AV1_COMP *const cpi = args->cpi; const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); PLANE_TYPE plane_type = get_plane_type(plane); uint16_t *eob = &p->eobs[block]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; int dummy_rate_cost = 0; av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); TX_TYPE tx_type = DCT_DCT; const int bw = mi_size_wide[plane_bsize]; if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_row * bw + blk_col)) { *eob = 0; p->txb_entropy_ctx[block] = 0; } else { av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); const ENTROPY_CONTEXT *a = &args->ta[blk_col]; const ENTROPY_CONTEXT *l = &args->tl[blk_row]; tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); TxfmParam txfm_param; QUANT_PARAM quant_param; const int use_trellis = is_trellis_used(args->enable_optimize_b, args->dry_run); int quant_idx; if (use_trellis) quant_idx = AV1_XFORM_QUANT_FP; else quant_idx = 
USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); // Whether trellis or dropout optimization is required for key frames and // intra frames. const bool do_trellis = (frame_is_intra_only(cm) && (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT || KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) || (!frame_is_intra_only(cm) && (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT || INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)); const bool do_dropout = (frame_is_intra_only(cm) && (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT || KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) || (!frame_is_intra_only(cm) && (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT || INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)); if (quant_param.use_optimize_b && do_trellis) { TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, &dummy_rate_cost); } if (do_dropout) { av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, cm->quant_params.base_qindex); } } if (*eob) { av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, dst_stride, *eob, cm->features.reduced_tx_set_used); } // TODO(jingning): Temporarily disable txk_type check for eob=0 case. // It is possible that certain collision in hash index would cause // the assertion failure. To further optimize the rate-distortion // performance, we need to re-visit this part and enable this assert // again. if (*eob == 0 && plane == 0) { #if 0 if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) { assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] == DCT_DCT); } #endif update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); } // For intra mode, skipped blocks are so rare that transmitting // skip_txfm = 1 is very expensive. 
mbmi->skip_txfm = 0; #if !CONFIG_REALTIME_ONLY if (plane == AOM_PLANE_Y && xd->cfl.store_y) { cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } #endif } static void encode_block_intra_and_set_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); struct encode_b_args *const args = arg; MACROBLOCK *x = args->x; ENTROPY_CONTEXT *a = &args->ta[blk_col]; ENTROPY_CONTEXT *l = &args->tl[blk_row]; av1_set_txb_context(x, plane, block, tx_size, a, l); } void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, TRELLIS_OPT_TYPE enable_optimize_b) { assert(bsize < BLOCK_SIZES_ALL); const MACROBLOCKD *const xd = &x->e_mbd; if (plane && !xd->is_chroma_ref) return; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 }; ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 }; struct encode_b_args arg = { cpi, x, NULL, ta, tl, dry_run, enable_optimize_b }; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); if (enable_optimize_b) { av1_get_entropy_contexts(plane_bsize, pd, ta, tl); } av1_foreach_transformed_block_in_plane( xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg); } aom-3.12.1/av1/encoder/encodemb.h000066400000000000000000000172001477627663500164340ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODEMB_H_ #define AOM_AV1_ENCODER_ENCODEMB_H_ #include "config/aom_config.h" #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/tokenize.h" #ifdef __cplusplus extern "C" { #endif enum { AV1_XFORM_QUANT_FP = 0, AV1_XFORM_QUANT_B = 1, AV1_XFORM_QUANT_DC = 2, AV1_XFORM_QUANT_SKIP_QUANT, AV1_XFORM_QUANT_TYPES, } UENUM1BYTE(AV1_XFORM_QUANT); // TODO(any): Merge OPT_TYPe and TRELLLIS_OPT_TYPE // Available optimization types to optimize the quantized coefficients. enum { NONE_OPT = 0, // No optimization. TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`. DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`. TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization. 
} UENUM1BYTE(OPT_TYPE); enum { NO_TRELLIS_OPT, // No trellis optimization FULL_TRELLIS_OPT, // Trellis optimization in all stages FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb } UENUM1BYTE(TRELLIS_OPT_TYPE); struct optimize_ctx { ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE]; ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE]; }; struct encode_b_args { const struct AV1_COMP *cpi; MACROBLOCK *x; struct optimize_ctx *ctx; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; RUN_TYPE dry_run; TRELLIS_OPT_TYPE enable_optimize_b; }; void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RUN_TYPE dry_run); void av1_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane, foreach_transformed_block_visitor visit, void *arg); void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize); void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size, TX_TYPE tx_type, TxfmParam *txfm_param); void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx, int use_quant_b_adapt, QUANT_PARAM *qparam); void av1_setup_qmatrix(const CommonQuantParams *quant_params, const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type, QUANT_PARAM *qparam); void av1_xform_dc_only(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, int64_t per_px_mean); void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param, const QUANT_PARAM *qparam); void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param); void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, const QUANT_PARAM *qparam); int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost); // This function can be used as (i) a further optimization to reduce the // redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis // optimization, or (ii) an alternative to trellis optimization in high-speed // compression mode (e.g., real-time mode under speed-6) due to its LOW time // complexity. The rationale behind it is to drop out possibly redundant quantized // coefficients that sit among a bunch of zeros. NOTE: This algorithm is not as // accurate as trellis optimization since the hyper-parameters are hard-coded // instead of dynamically searched. More adaptive logic may improve the performance. // This function can be applied to all transform blocks or only a subset of them. // Inputs: // mb: Pointer to the MACROBLOCK to perform dropout on. // plane: Index of the plane to which the target block belongs. // block: Index of the target block. // tx_size: Transform size of the target block. // tx_type: Transform type of the target block. This field is particularly // used to find out the scan order of the block. // qindex: Quantization index used for the target block. In general, all blocks // in the same plane share the same quantization index. This field is // particularly used to determine how many zeros should be used to // drop out a coefficient. // Returns: // Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as // `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
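// Example (mirroring the call made from encode_block() in encodemb.c, where
// dropout runs only when trellis optimization was not used for the block):
//   av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
//                      cm->quant_params.base_qindex);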
void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex); // Same as above, with the number of zeroes needed before/after a coeff to drop // it explicitly passed in, instead of being derived from qindex. void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before, int dropout_num_after); void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); static inline void av1_set_txb_context(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); } void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, TRELLIS_OPT_TYPE enable_optimize_b); static inline int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, RUN_TYPE dry_run) { if (optimize_b == NO_TRELLIS_OPT) return false; if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) return false; return true; } // Scaling terms (precision of 12 bits) to perform tx-size specific // normalization that is used in DCT_DCT forward transform. // For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used // For transform blocks of 1:4 and 4:1 - factor of 2 is used // For transform blocks TX_8x8 and below - an additional factor of 2 is used // For transform blocks max(width,height)=64 - currently not supported static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = { 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896, 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0 }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODEMB_H_ aom-3.12.1/av1/encoder/encodemv.c000066400000000000000000000307531477627663500164630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/common.h" #include "av1/common/entropymode.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/bitops.h" static void update_mv_component_stats(int comp, nmv_component *mvcomp, MvSubpelPrecision precision) { assert(comp != 0); int offset; const int sign = comp < 0; const int mag = sign ? 
-comp : comp; const int mv_class = av1_get_mv_class(mag - 1, &offset); const int d = offset >> 3; // int mv data const int fr = (offset >> 1) & 3; // fractional mv data const int hp = offset & 1; // high precision mv data // Sign update_cdf(mvcomp->sign_cdf, sign, 2); // Class update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES); // Integer bits if (mv_class == MV_CLASS_0) { update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE); } else { const int n = mv_class + CLASS0_BITS - 1; // number of bits for (int i = 0; i < n; ++i) update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2); } // Fractional bits if (precision > MV_SUBPEL_NONE) { aom_cdf_prob *fp_cdf = mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf; update_cdf(fp_cdf, fr, MV_FP_SIZE); } // High precision bit if (precision > MV_SUBPEL_LOW_PRECISION) { aom_cdf_prob *hp_cdf = mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf; update_cdf(hp_cdf, hp, 2); } } void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, MvSubpelPrecision precision) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); update_cdf(mvctx->joints_cdf, j, MV_JOINTS); if (mv_joint_vertical(j)) update_mv_component_stats(diff.row, &mvctx->comps[0], precision); if (mv_joint_horizontal(j)) update_mv_component_stats(diff.col, &mvctx->comps[1], precision); } static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, MvSubpelPrecision precision) { assert(comp != 0); int offset; const int sign = comp < 0; const int mag = sign ? -comp : comp; const int mv_class = av1_get_mv_class(mag - 1, &offset); const int d = offset >> 3; // int mv data const int fr = (offset >> 1) & 3; // fractional mv data const int hp = offset & 1; // high precision mv data // Sign aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); // Class aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); // Integer bits if (mv_class == MV_CLASS_0) { aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits for (i = 0; i < n; ++i) aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); } // Fractional bits if (precision > MV_SUBPEL_NONE) { aom_write_symbol( w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, MV_FP_SIZE); } // High precision bit if (precision > MV_SUBPEL_LOW_PRECISION) aom_write_symbol( w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2); } /* TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This * is more than most L1D caches and is a significant chunk of L2. Write * SIMD that uses streaming writes to avoid loading all of that into L1, or * just don't update the larger component costs every time this called * (or both). 
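 * The 128 KiB figure is consistent with MV_VALS being (MV_MAX << 1) + 1 =
 * 32767 entries of 4-byte ints (roughly 131 KB); the exact constants are
 * assumed to be the usual ones from av1/common/entropymv.h.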
*/ void av1_build_nmv_component_cost_table(int *mvcost, const nmv_component *const mvcomp, MvSubpelPrecision precision) { int i, j, v, o, mantissa; int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; int bits_cost[MV_OFFSET_BITS][2]; int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 }, fp_cost[MV_FP_SIZE] = { 0 }; int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 }; av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); for (i = 0; i < MV_OFFSET_BITS; ++i) { av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); } if (precision > MV_SUBPEL_NONE) { for (i = 0; i < CLASS0_SIZE; ++i) av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL); av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); } if (precision > MV_SUBPEL_LOW_PRECISION) { av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); } // Instead of accumulating the cost of each vector component's bits // individually, compute the costs based on smaller vectors. Costs for // [2^exp, 2 * 2^exp - 1] are calculated based on [0, 2^exp - 1] // respectively. Offsets are maintained to swap both 1) class costs when // treated as a complete vector component with the highest set bit when // treated as a mantissa (significand) and 2) leading zeros to account for // the current exponent. // Cost offsets int cost_swap[MV_OFFSET_BITS] = { 0 }; // Delta to convert positive vector to negative vector costs int negate_sign = sign_cost[1] - sign_cost[0]; // Initialize with offsets to swap the class costs with the costs of the // highest set bit. for (i = 1; i < MV_OFFSET_BITS; ++i) { cost_swap[i] = bits_cost[i - 1][1]; if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS]; } // Seed the fractional costs onto the output (overwritten latter). for (o = 0; o < MV_FP_SIZE; ++o) { int hp; for (hp = 0; hp < 2; ++hp) { v = 2 * o + hp + 1; mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0]; } } mvcost[0] = 0; // Fill the costs for each exponent's vectors, using the costs set in the // previous exponents. for (i = 0; i < MV_OFFSET_BITS; ++i) { const int exponent = (2 * MV_FP_SIZE) << i; int class = 0; if (i >= CLASS0_BITS) { class = class_cost[i - CLASS0_BITS + 1]; } // Iterate through mantissas, keeping track of the location // of the highest set bit for the mantissa. // To be clear: in the outer loop, the position of the highest set bit // (exponent) is tracked and, in this loop, the highest set bit of the // mantissa is tracked. 
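// As a concrete illustration (assuming MV_FP_SIZE = 4, so (2 * MV_FP_SIZE)
// is 8): the pass with i = 2 has exponent = 32 and fills v in [33, 64], each
// cost being derived from the previously computed entries mvcost[1..32] plus
// the class and bit costs, so no vector is ever costed from scratch.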
mantissa = 0; for (j = 0; j <= i; ++j) { for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { int cost = mvcost[mantissa + 1] + class + cost_swap[j]; v = exponent + mantissa + 1; mvcost[v] = cost; mvcost[-v] = cost + negate_sign; } cost_swap[j] += bits_cost[i][0]; } } // Special case to avoid buffer overrun { int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS; int class = class_cost[MV_CLASSES - 1]; mantissa = 0; for (j = 0; j < MV_OFFSET_BITS; ++j) { for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { int cost = mvcost[mantissa + 1] + class + cost_swap[j]; v = exponent + mantissa + 1; mvcost[v] = cost; mvcost[-v] = cost + negate_sign; } } // At this point: mantissa = exponent >> 1 // Manually calculate the final cost offset int cost_swap_hi = bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2]; for (; mantissa < exponent - 1; ++mantissa) { int cost = mvcost[mantissa + 1] + class + cost_swap_hi; v = exponent + mantissa + 1; mvcost[v] = cost; mvcost[-v] = cost + negate_sign; } } // Fill costs for class0 vectors, overwriting previous placeholder values // used for calculating the costs of the larger vectors. for (i = 0; i < CLASS0_SIZE; ++i) { const int top = i * 2 * MV_FP_SIZE; for (o = 0; o < MV_FP_SIZE; ++o) { int hp; int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i]; for (hp = 0; hp < 2; ++hp) { v = top + 2 * o + hp + 1; mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0]; mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1]; } } } } void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); // If the mv_diff is zero, then we should have used near or nearest instead. assert(j != MV_JOINT_ZERO); if (cpi->common.features.cur_frame_force_integer_mv) { usehp = MV_SUBPEL_NONE; } aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. if (cpi->sf.mv_sf.auto_mv_step_size) { int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude); } } void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx) { // DV and ref DV should not have sub-pel. 
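// Motion vectors, and therefore intra block copy DVs, are stored in 1/8-pel
// units, so a full-pel DV must have its three lowest bits clear; the asserts
// below check exactly that.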
assert((mv->col & 7) == 0); assert((mv->row & 7) == 0); assert((ref->col & 7) == 0); assert((ref->row & 7) == 0); const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); } void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *ctx, MvSubpelPrecision precision) { av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); } int_mv av1_get_ref_mv_from_stack(int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, const MB_MODE_INFO_EXT *mbmi_ext) { const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext->ref_mv_stack[ref_frame_type]; if (ref_frame[1] > INTRA_FRAME) { assert(ref_idx == 0 || ref_idx == 1); return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv : curr_ref_mv_stack[ref_mv_idx].this_mv; } assert(ref_idx == 0); return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type] ? curr_ref_mv_stack[ref_mv_idx].this_mv : mbmi_ext->global_mvs[ref_frame_type]; } int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; int ref_mv_idx = mbmi->ref_mv_idx; if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { assert(has_second_ref(mbmi)); ref_mv_idx += 1; } return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, &x->mbmi_ext); } void av1_find_best_ref_mvs_from_stack(int allow_hp, const MB_MODE_INFO_EXT *mbmi_ext, MV_REFERENCE_FRAME ref_frame, int_mv *nearest_mv, int_mv *near_mv, int is_integer) { const int ref_idx = 0; MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); } aom-3.12.1/av1/encoder/encodemv.h000066400000000000000000000077701477627663500164730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_ENCODEMV_H_ #define AOM_AV1_ENCODER_ENCODEMV_H_ #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, const MV *ref, nmv_context *mvctx, int usehp); void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, MvSubpelPrecision precision); void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *mvctx, MvSubpelPrecision precision); void av1_build_nmv_component_cost_table(int *mvcost, const nmv_component *const mvcomp, MvSubpelPrecision precision); void av1_update_mv_count(ThreadData *td); void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, nmv_context *mvctx); int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); int_mv av1_get_ref_mv_from_stack(int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, const MB_MODE_INFO_EXT *mbmi_ext); void av1_find_best_ref_mvs_from_stack(int allow_hp, const MB_MODE_INFO_EXT *mbmi_ext, MV_REFERENCE_FRAME ref_frame, int_mv *nearest_mv, int_mv *near_mv, int is_integer); static inline MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { // row: Z col: Z | MV_JOINT_ZERO (0) // row: Z col: NZ | MV_JOINT_HNZVZ (1) // row: NZ col: Z | MV_JOINT_HZVNZ (2) // row: NZ col: NZ | MV_JOINT_HNZVNZ (3) return (!!mv->col) | ((!!mv->row) << 1); } static inline int av1_mv_class_base(MV_CLASS_TYPE c) { return c ? CLASS0_SIZE << (c + 2) : 0; } // If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. static inline uint8_t av1_log_in_base_2(unsigned int n) { // get_msb() is only valid when n != 0. return n == 0 ? 0 : get_msb(n); } static inline MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { assert(z >= 0); const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); assert(c <= MV_CLASS_10); if (offset) *offset = z - av1_mv_class_base(c); return c; } static inline int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm, MACROBLOCK *const x) { (void)cm; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const PREDICTION_MODE this_mode = mbmi->mode; if (this_mode == NEW_NEWMV) { const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); if (mbmi->mv[0].as_int == ref_mv_0.as_int || mbmi->mv[1].as_int == ref_mv_1.as_int) { return 0; } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); if (mbmi->mv[1].as_int == ref_mv_1.as_int) { return 0; } } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); if (mbmi->mv[0].as_int == ref_mv_0.as_int) { return 0; } } else if (this_mode == NEWMV) { const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); if (mbmi->mv[0].as_int == ref_mv_0.as_int) { return 0; } } return 1; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODEMV_H_ aom-3.12.1/av1/encoder/encoder.c000066400000000000000000006254741477627663500163140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include #include #include #include #include "av1/common/scale.h" #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aomcx.h" #if CONFIG_DENOISE #include "aom_dsp/grain_table.h" #include "aom_dsp/noise_util.h" #include "aom_dsp/noise_model.h" #endif #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/psnr.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" #endif #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_util/aom_pthread.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG #include "av1/common/alloccommon.h" #include "av1/common/debugmodes.h" #include "av1/common/filter.h" #include "av1/common/idct.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/resize.h" #include "av1/common/tile_common.h" #include "av1/encoder/allintra_vis.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" #if CONFIG_INTERNAL_STATS #include "av1/encoder/blockiness.h" #endif #include "av1/encoder/context_tree.h" #include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/hash_motion.h" #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/intra_mode_search.h" #include "av1/encoder/mv_prec.h" #include "av1/encoder/pass2_strategy.h" #include "av1/encoder/pickcdef.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/pickrst.h" #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rc_utils.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #if CONFIG_SALIENCY_MAP #include "av1/encoder/saliency_map.h" #endif #include "av1/encoder/segmentation.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/superres_scale.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif #include "av1/encoder/tpl_model.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/var_based_part.h" #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 // #define OUTPUT_YUV_REC #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #define FILE_NAME_LEN 100 #endif #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif static inline void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { case AOME_NORMAL: *hr = 1; *hs = 1; break; case AOME_FOURFIVE: *hr = 4; *hs = 5; break; case AOME_THREEFIVE: *hr = 3; *hs = 5; break; case AOME_THREEFOUR: *hr = 3; *hs = 4; break; case AOME_ONEFOUR: *hr = 1; *hs = 4; break; case AOME_ONEEIGHT: *hr = 1; *hs = 8; break; case AOME_ONETWO: *hr = 1; *hs = 2; break; case AOME_TWOTHREE: *hr = 2; *hs = 3; break; case AOME_ONETHREE: *hr = 1; *hs = 3; break; default: *hr = 1; *hs = 1; assert(0); break; } } int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) { unsigned char *const active_map_4x4 = cpi->active_map.map; const int mi_rows = mi_params->mi_rows; const int mi_cols = mi_params->mi_cols; cpi->active_map.update = 0; cpi->rc.percent_blocks_inactive = 
0; assert(mi_rows % 2 == 0 && mi_rows > 0); assert(mi_cols % 2 == 0 && mi_cols > 0); if (new_map_16x16) { int num_samples = 0; int num_blocks_inactive = 0; for (int r = 0; r < mi_rows; r += 4) { for (int c = 0; c < mi_cols; c += 4) { const uint8_t val = new_map_16x16[(r >> 2) * cols + (c >> 2)] ? AM_SEGMENT_ID_ACTIVE : AM_SEGMENT_ID_INACTIVE; num_samples++; if (val == AM_SEGMENT_ID_INACTIVE) num_blocks_inactive++; const int row_max = AOMMIN(4, mi_rows - r); const int col_max = AOMMIN(4, mi_cols - c); for (int x = 0; x < row_max; ++x) { for (int y = 0; y < col_max; ++y) { active_map_4x4[(r + x) * mi_cols + (c + y)] = val; } } } } cpi->active_map.enabled = 1; cpi->active_map.update = 1; assert(num_samples); cpi->rc.percent_blocks_inactive = (num_blocks_inactive * 100) / num_samples; } return 0; } return -1; } int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; if (rows == mi_params->mb_rows && cols == mi_params->mb_cols && new_map_16x16) { unsigned char *const seg_map_8x8 = cpi->enc_seg.map; const int mi_rows = mi_params->mi_rows; const int mi_cols = mi_params->mi_cols; const int row_scale = mi_size_high_log2[BLOCK_16X16]; const int col_scale = mi_size_wide_log2[BLOCK_16X16]; assert(mi_rows % 2 == 0); assert(mi_cols % 2 == 0); memset(new_map_16x16, !cpi->active_map.enabled, rows * cols); if (cpi->active_map.enabled) { for (int r = 0; r < (mi_rows >> row_scale); ++r) { for (int c = 0; c < (mi_cols >> col_scale); ++c) { // Cyclic refresh segments are considered active despite not having // AM_SEGMENT_ID_ACTIVE uint8_t temp = 0; temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] != AM_SEGMENT_ID_INACTIVE; temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] != AM_SEGMENT_ID_INACTIVE; temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] != AM_SEGMENT_ID_INACTIVE; temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] != AM_SEGMENT_ID_INACTIVE; new_map_16x16[r * cols + c] |= temp; } } } return 0; } return -1; } void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) { bool is_allintra = usage == ALLINTRA; av1_rtcd(); aom_dsp_rtcd(); aom_scale_rtcd(); av1_init_intra_predictors(); av1_init_me_luts(); if (!is_allintra) av1_init_wedge_masks(); if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts(); } void av1_new_framerate(AV1_COMP *cpi, double framerate) { cpi->framerate = framerate < 0.1 ? 30 : framerate; av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); } double av1_get_compression_ratio(const AV1_COMMON *const cm, size_t encoded_frame_size) { const int upscaled_width = cm->superres_upscaled_width; const int height = cm->height; const int64_t luma_pic_size = (int64_t)upscaled_width * height; const SequenceHeader *const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int pic_size_profile_factor = profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); encoded_frame_size = (encoded_frame_size > 129 ? 
encoded_frame_size - 128 : 1); const int64_t uncompressed_frame_size = (luma_pic_size * pic_size_profile_factor) >> 3; return (double)uncompressed_frame_size / encoded_frame_size; } static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs, int num_tiles_lg, int tile_col_row) { CommonTileParams *const tiles = &cm->tiles; int i, start_sb; int size_sb = num_sbs >> num_tiles_lg; int res_sbs = num_sbs - (size_sb << num_tiles_lg); int num_tiles = 1 << num_tiles_lg; int inc_index = num_tiles - res_sbs; tiles->uniform_spacing = 0; for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) { if (i == inc_index) ++size_sb; if (tile_col_row) tiles->col_start_sb[i] = start_sb; else tiles->row_start_sb[i] = start_sb; start_sb += AOMMIN(size_sb, tiles->max_width_sb); } if (tile_col_row) { tiles->cols = i; tiles->col_start_sb[i] = num_sbs; } else { tiles->rows = i; tiles->row_start_sb[i] = num_sbs; } } static void set_tile_info(AV1_COMMON *const cm, const TileConfig *const tile_cfg) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int i, start_sb; av1_get_tile_limits(cm); int sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2); // configure tile columns if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) { tiles->uniform_spacing = 1; tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols); // Add a special case to handle super resolution sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator); int min_log2_cols = 0; for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) { } tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols); tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols); } else if (tile_cfg->tile_widths[0] < 0) { auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1); } else { int size_sb, j = 0; tiles->uniform_spacing = 0; for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { tiles->col_start_sb[i] = start_sb; size_sb = tile_cfg->tile_widths[j++]; if (j >= tile_cfg->tile_width_count) j = 0; start_sb += AOMMIN(size_sb, tiles->max_width_sb); } tiles->cols = i; tiles->col_start_sb[i] = sb_cols; } av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols, tiles); // configure tile rows int sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2); if (tiles->uniform_spacing) { tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows); tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows); } else if (tile_cfg->tile_heights[0] < 0) { auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0); } else { int size_sb, j = 0; for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { tiles->row_start_sb[i] = start_sb; size_sb = tile_cfg->tile_heights[j++]; if (j >= tile_cfg->tile_height_count) j = 0; start_sb += AOMMIN(size_sb, tiles->max_height_sb); } tiles->rows = i; tiles->row_start_sb[i] = sb_rows; } av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles); } void av1_update_frame_size(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; // Setup mi_params here in case we need more mi's. 
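// (set_mb_mi() is expected to recompute mi_rows/mi_cols and the related
// strides for the new dimensions before the macroblockd and superblock size
// are re-derived below.)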
CommonModeInfoParams *const mi_params = &cm->mi_params; mi_params->set_mb_mi(mi_params, cm->width, cm->height, cpi->sf.part_sf.default_min_partition_size); av1_init_macroblockd(cm, xd); if (!cpi->ppi->seq_params_locked) set_sb_size(cm->seq_params, av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, cpi->ppi->number_spatial_layers)); set_tile_info(cm, &cpi->oxcf.tile_cfg); } static inline int does_level_match(int width, int height, double fps, int lvl_width, int lvl_height, double lvl_fps, int lvl_dim_mult) { const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height; const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; const int64_t luma_pels = (int64_t)width * height; const double display_sample_rate = luma_pels * fps; return luma_pels <= lvl_luma_pels && display_sample_rate <= lvl_display_sample_rate && width <= lvl_width * lvl_dim_mult && height <= lvl_height * lvl_dim_mult; } static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width, int height, double init_framerate) { SequenceHeader *const seq_params = &ppi->seq_params; const AV1LevelParams *const level_params = &ppi->level_params; // TODO(any): This is a placeholder function that only addresses dimensions // and max display sample rates. // Need to add checks for max bit rate, max decoded luma sample rate, header // rate, etc. that are not covered by this function. AV1_LEVEL level = SEQ_LEVEL_MAX; if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) { level = SEQ_LEVEL_2_0; } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0, 4)) { level = SEQ_LEVEL_2_1; } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0, 4)) { level = SEQ_LEVEL_3_0; } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0, 4)) { level = SEQ_LEVEL_3_1; } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0, 3)) { level = SEQ_LEVEL_4_0; } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0, 3)) { level = SEQ_LEVEL_4_1; } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0, 2)) { level = SEQ_LEVEL_5_0; } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0, 2)) { level = SEQ_LEVEL_5_1; } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0, 2)) { level = SEQ_LEVEL_5_2; } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0, 2)) { level = SEQ_LEVEL_6_0; } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0, 2)) { level = SEQ_LEVEL_6_1; } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0, 2)) { level = SEQ_LEVEL_6_2; } #if CONFIG_CWG_C013 // TODO(bohanli): currently target level is only working for the 0th operating // point, so scalable coding is not supported. else if (level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 && level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) { // Only use level 7.x to 8.x when explicitly asked to. 
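    // Levels 7.x and 8.x double the 6.x picture dimensions once more:
    // 16384x8704 for 7.x and 32768x17408 for 8.x, at 30/60/120 fps for the
    // .0/.1/.2 sub-levels respectively.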
if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0, 2)) { level = SEQ_LEVEL_7_0; } else if (does_level_match(width, height, init_framerate, 16384, 8704, 60.0, 2)) { level = SEQ_LEVEL_7_1; } else if (does_level_match(width, height, init_framerate, 16384, 8704, 120.0, 2)) { level = SEQ_LEVEL_7_2; } else if (does_level_match(width, height, init_framerate, 32768, 17408, 30.0, 2)) { level = SEQ_LEVEL_8_0; } else if (does_level_match(width, height, init_framerate, 32768, 17408, 60.0, 2)) { level = SEQ_LEVEL_8_1; } else if (does_level_match(width, height, init_framerate, 32768, 17408, 120.0, 2)) { level = SEQ_LEVEL_8_2; } } #endif for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) || level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS); // If a higher target level is specified, it is then used rather than the // inferred one from resolution and framerate. seq_params->seq_level_idx[i] = level_params->target_seq_level_idx[i] < SEQ_LEVELS && level_params->target_seq_level_idx[i] > level ? level_params->target_seq_level_idx[i] : level; // Set the maximum parameters for bitrate and buffer size for this profile, // level, and tier seq_params->op_params[i].bitrate = av1_max_level_bitrate( seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the // check if (seq_params->op_params[i].bitrate == 0) aom_internal_error( &ppi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; } } static void init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int disable_frame_id_numbers) { SequenceHeader *const seq = &ppi->seq_params; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const ToolCfg *const tool_cfg = &oxcf->tool_cfg; seq->still_picture = !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1); seq->reduced_still_picture_hdr = seq->still_picture && !tool_cfg->full_still_picture_hdr; seq->force_screen_content_tools = 2; seq->force_integer_mv = 2; seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint; seq->frame_id_numbers_present_flag = !seq->reduced_still_picture_hdr && !oxcf->tile_cfg.enable_large_scale_tile && tool_cfg->error_resilient_mode && !disable_frame_id_numbers; if (seq->reduced_still_picture_hdr) { seq->order_hint_info.enable_order_hint = 0; seq->force_screen_content_tools = 2; seq->force_integer_mv = 2; } seq->order_hint_info.order_hint_bits_minus_1 = seq->order_hint_info.enable_order_hint ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1; seq->max_frame_width = frm_dim_cfg->forced_max_frame_width ? frm_dim_cfg->forced_max_frame_width : frm_dim_cfg->width; seq->max_frame_height = frm_dim_cfg->forced_max_frame_height ? frm_dim_cfg->forced_max_frame_height : frm_dim_cfg->height; seq->num_bits_width = (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; seq->num_bits_height = (seq->max_frame_height > 1) ? 
      get_msb(seq->max_frame_height - 1) + 1 : 1;
  assert(seq->num_bits_width <= 16);
  assert(seq->num_bits_height <= 16);
  seq->frame_id_length = FRAME_ID_LENGTH;
  seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;

  seq->enable_dual_filter = tool_cfg->enable_dual_filter;
  seq->order_hint_info.enable_dist_wtd_comp =
      oxcf->comp_type_cfg.enable_dist_wtd_comp;
  seq->order_hint_info.enable_dist_wtd_comp &=
      seq->order_hint_info.enable_order_hint;
  seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
  seq->order_hint_info.enable_ref_frame_mvs &=
      seq->order_hint_info.enable_order_hint;
  seq->enable_superres = oxcf->superres_cfg.enable_superres;
  seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
  seq->enable_restoration = tool_cfg->enable_restoration;
  seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
  seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
  seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
  seq->enable_intra_edge_filter =
      oxcf->intra_mode_cfg.enable_intra_edge_filter;
  seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;

  set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
                           oxcf->input_cfg.init_framerate);

  if (seq->operating_points_cnt_minus_1 == 0) {
    seq->operating_point_idc[0] = 0;
    seq->has_nonzero_operating_point_idc = false;
  } else {
    // Set operating_point_idc[] such that the i=0 point corresponds to the
    // highest quality operating point (all layers), and subsequent
    // operating points (i > 0) are lower quality corresponding to
    // skip decoding enhancement layers (temporal first).
    // Bits 0..7 of operating_point_idc select temporal layers and bits 8..15
    // select spatial layers; e.g. for 2 spatial x 2 temporal layers the four
    // points are 0x303, 0x301, 0x103 and 0x101.
    int i = 0;
    assert(seq->operating_points_cnt_minus_1 ==
           (int)(ppi->number_spatial_layers * ppi->number_temporal_layers -
                 1));
    for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
      for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
        seq->operating_point_idc[i] =
            (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
            ~(~0u << (ppi->number_temporal_layers - tl));
        assert(seq->operating_point_idc[i] != 0);
        i++;
      }
    }
    seq->has_nonzero_operating_point_idc = true;
  }
}

static void init_config_sequence(struct AV1_PRIMARY *ppi,
                                 const AV1EncoderConfig *oxcf) {
  SequenceHeader *const seq_params = &ppi->seq_params;
  const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
  const ColorCfg *const color_cfg = &oxcf->color_cfg;

  ppi->use_svc = 0;
  ppi->number_spatial_layers = 1;
  ppi->number_temporal_layers = 1;

  seq_params->profile = oxcf->profile;
  seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
  seq_params->use_highbitdepth = oxcf->use_highbitdepth;
  seq_params->color_primaries = color_cfg->color_primaries;
  seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
  seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
  seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
  seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
  seq_params->color_range = color_cfg->color_range;

  seq_params->timing_info_present = dec_model_cfg->timing_info_present;
  seq_params->timing_info.num_units_in_display_tick =
      dec_model_cfg->timing_info.num_units_in_display_tick;
  seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
  seq_params->timing_info.equal_picture_interval =
      dec_model_cfg->timing_info.equal_picture_interval;
  seq_params->timing_info.num_ticks_per_picture =
      dec_model_cfg->timing_info.num_ticks_per_picture;

  seq_params->display_model_info_present_flag =
      dec_model_cfg->display_model_info_present_flag;
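  // The decoder model below is configured in one of three ways: full decoder
  // model signaling ("schedule mode"), resource-availability mode when only
  // equal-interval timing info is present, or a default initial display
  // delay of 10 frames when neither is signaled.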
seq_params->decoder_model_info_present_flag = dec_model_cfg->decoder_model_info_present_flag; if (dec_model_cfg->decoder_model_info_present_flag) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if (seq_params->timing_info_present && seq_params->timing_info.equal_picture_interval && !seq_params->decoder_model_info_present_flag) { // set the decoder model parameters in resource availability mode av1_set_resource_availability_parameters(&seq_params->op_params[0]); } else { seq_params->op_params[0].initial_display_delay = 10; // Default value (not signaled) } if (seq_params->monochrome) { seq_params->subsampling_x = 1; seq_params->subsampling_y = 1; } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { seq_params->subsampling_x = 0; seq_params->subsampling_y = 0; } else { if (seq_params->profile == 0) { seq_params->subsampling_x = 1; seq_params->subsampling_y = 1; } else if (seq_params->profile == 1) { seq_params->subsampling_x = 0; seq_params->subsampling_y = 0; } else { if (seq_params->bit_depth == AOM_BITS_12) { seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x; seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y; } else { seq_params->subsampling_x = 1; seq_params->subsampling_y = 0; } } } av1_change_config_seq(ppi, oxcf, NULL); } static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; cpi->oxcf = *oxcf; cpi->framerate = oxcf->input_cfg.init_framerate; cm->width = oxcf->frm_dim_cfg.width; cm->height = oxcf->frm_dim_cfg.height; cpi->is_dropped_frame = false; alloc_compressor_data(cpi); cpi->data_alloc_width = cm->width; cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; // Single thread case: use counts in common. cpi->td.counts = &cpi->counts; // Init SVC parameters. cpi->svc.number_spatial_layers = 1; cpi->svc.number_temporal_layers = 1; cm->spatial_layer_id = 0; cm->temporal_layer_id = 0; // Init rtc_ref parameters. 
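  // Real-time reference-structure defaults: explicit per-frame reference
  // configuration off, non-reference-frame flag off, and all three compound
  // reference pairs disabled.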
cpi->ppi->rtc_ref.set_ref_frame_config = 0; cpi->ppi->rtc_ref.non_reference_frame = 0; cpi->ppi->rtc_ref.ref_frame_comp[0] = 0; cpi->ppi->rtc_ref.ref_frame_comp[1] = 0; cpi->ppi->rtc_ref.ref_frame_comp[2] = 0; // change includes all joint functionality av1_change_config(cpi, oxcf, false); cpi->ref_frame_flags = 0; // Reset resize pending flags resize_pending_params->width = 0; resize_pending_params->height = 0; // Setup identity scale factor av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1); init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx); av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); } void av1_change_config_seq(struct AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, bool *is_sb_size_changed) { SequenceHeader *const seq_params = &ppi->seq_params; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; const ColorCfg *const color_cfg = &oxcf->color_cfg; if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; seq_params->bit_depth = oxcf->tool_cfg.bit_depth; seq_params->color_primaries = color_cfg->color_primaries; seq_params->transfer_characteristics = color_cfg->transfer_characteristics; seq_params->matrix_coefficients = color_cfg->matrix_coefficients; seq_params->monochrome = oxcf->tool_cfg.enable_monochrome; seq_params->chroma_sample_position = color_cfg->chroma_sample_position; seq_params->color_range = color_cfg->color_range; assert(IMPLIES(seq_params->profile <= PROFILE_1, seq_params->bit_depth <= AOM_BITS_10)); seq_params->timing_info_present = dec_model_cfg->timing_info_present; seq_params->timing_info.num_units_in_display_tick = dec_model_cfg->timing_info.num_units_in_display_tick; seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale; seq_params->timing_info.equal_picture_interval = dec_model_cfg->timing_info.equal_picture_interval; seq_params->timing_info.num_ticks_per_picture = dec_model_cfg->timing_info.num_ticks_per_picture; seq_params->display_model_info_present_flag = dec_model_cfg->display_model_info_present_flag; seq_params->decoder_model_info_present_flag = dec_model_cfg->decoder_model_info_present_flag; if (dec_model_cfg->decoder_model_info_present_flag) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if (seq_params->timing_info_present && seq_params->timing_info.equal_picture_interval && !seq_params->decoder_model_info_present_flag) { // set the decoder model parameters in resource availability mode av1_set_resource_availability_parameters(&seq_params->op_params[0]); } else { seq_params->op_params[0].initial_display_delay = 10; // Default value (not signaled) } #if !CONFIG_REALTIME_ONLY av1_update_film_grain_parameters_seq(ppi, oxcf); #endif int sb_size = seq_params->sb_size; // Superblock size should not be updated after the first key frame. 
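  // Once the sequence parameters are locked (after the first key frame), the
  // superblock size and the per-operating-point tier bits are left untouched;
  // otherwise they are re-derived here and a change in superblock size is
  // reported back through *is_sb_size_changed.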
if (!ppi->seq_params_locked) { set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width, frm_dim_cfg->height, ppi->number_spatial_layers)); for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; } if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size) *is_sb_size_changed = true; // Init sequence level coding tools // This should not be called after the first key frame. if (!ppi->seq_params_locked) { seq_params->operating_points_cnt_minus_1 = (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1 : 0; init_seq_coding_tools(ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config); } seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr; #if CONFIG_AV1_HIGHBITDEPTH highbd_set_var_fns(ppi); #endif set_primary_rc_buffer_sizes(oxcf, ppi); } void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, bool is_sb_size_changed) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; MACROBLOCK *const x = &cpi->td.mb; AV1LevelParams *const level_params = &cpi->ppi->level_params; RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; FeatureFlags *const features = &cm->features; // in case of LAP, lag in frames is set according to number of lap buffers // calculated at init time. This stores and restores LAP's lag in frames to // prevent override by new cfg. int lap_lag_in_frames = -1; if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; } cpi->oxcf = *oxcf; #if !CONFIG_REALTIME_ONLY av1_update_film_grain_parameters(cpi, oxcf); #endif // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure // that any analysis (e.g. TPL) happening outside the main encoding loop still // happens at full resolution. // This value will later be set appropriately just before main encoding loop. cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO ? AOM_SUPERRES_NONE : oxcf->superres_cfg.superres_mode; // default x->e_mbd.bd = (int)seq_params->bit_depth; x->e_mbd.global_motion = cm->global_motion; memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx, sizeof(level_params->target_seq_level_idx)); level_params->keep_level_stats = 0; for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { if (level_params->target_seq_level_idx[i] < SEQ_LEVELS || level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) { level_params->keep_level_stats |= 1u << i; if (!level_params->level_info[i]) { CHECK_MEM_ERROR(cm, level_params->level_info[i], aom_calloc(1, sizeof(*level_params->level_info[i]))); } } } // TODO(huisu@): level targeting currently only works for the 0th operating // point, so scalable coding is not supported yet. if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) { // Adjust encoder config in order to meet target level. 
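    // Only the 0th operating point is adjusted; its tier is passed along so
    // that the bitrate limits match the signaled tier.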
config_target_level(cpi, level_params->target_seq_level_idx[0], seq_params->tier[0]); } if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) { p_rc->baseline_gf_interval = FIXED_GF_INTERVAL; } else if (!is_one_pass_rt_params(cpi) || cm->current_frame.frame_number == 0) { // For rtc mode: logic for setting the baseline_gf_interval is done // in av1_get_one_pass_rt_params(), and it should not be reset here in // change_config(), unless after init_config (first frame). p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; } refresh_frame->golden_frame = false; refresh_frame->bwd_ref_frame = false; features->refresh_frame_context = (oxcf->tool_cfg.frame_parallel_decoding_mode) ? REFRESH_FRAME_CONTEXT_DISABLED : REFRESH_FRAME_CONTEXT_BACKWARD; if (oxcf->tile_cfg.enable_large_scale_tile) features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; if (x->palette_buffer == NULL) { CHECK_MEM_ERROR(cm, x->palette_buffer, aom_memalign(16, sizeof(*x->palette_buffer))); } if (x->tmp_conv_dst == NULL) { CHECK_MEM_ERROR( cm, x->tmp_conv_dst, aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst))); x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; } // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames // to store intermediate inter mode prediction results and are not required // for allintra encoding mode. Hence, the memory allocations for these buffers // are avoided for allintra encoding mode. if (cpi->oxcf.kf_cfg.key_freq_max != 0) { if (x->comp_rd_buffer.pred0 == NULL) alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer); for (int i = 0; i < 2; ++i) { if (x->tmp_pred_bufs[i] == NULL) { CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i], aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*x->tmp_pred_bufs[i]))); x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i]; } } } av1_reset_segment_features(cm); av1_set_high_precision_mv(cpi, 1, 0); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. p_rc->bits_off_target = AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. av1_new_framerate(cpi, cpi->framerate); // Set absolute upper and lower quality limits rc->worst_quality = rc_cfg->worst_allowed_q; rc->best_quality = rc_cfg->best_allowed_q; // If lossless has been requested make sure average Q accumulators are reset. if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { int i; for (i = 0; i < FRAME_TYPES; ++i) { p_rc->avg_frame_qindex[i] = 0; } } features->interp_filter = oxcf->tile_cfg.enable_large_scale_tile ? 
EIGHTTAP_REGULAR : SWITCHABLE; features->switchable_motion_mode = is_switchable_motion_mode_allowed( features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc); if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) { cm->render_width = frm_dim_cfg->render_width; cm->render_height = frm_dim_cfg->render_height; } else { cm->render_width = frm_dim_cfg->width; cm->render_height = frm_dim_cfg->height; } cm->width = frm_dim_cfg->width; cm->height = frm_dim_cfg->height; if (cm->width > cpi->data_alloc_width || cm->height > cpi->data_alloc_height || is_sb_size_changed) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->data_alloc_width = cm->width; cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } av1_update_frame_size(cpi); rc->is_src_frame_alt_ref = 0; if (!cpi->ppi->rtc_ref.set_ref_frame_config) cpi->ext_flags.refresh_frame.update_pending = 0; cpi->ext_flags.refresh_frame_context_pending = 0; if (cpi->ppi->use_svc) av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth); check_reset_rc_flag(cpi); // restore the value of lag_in_frame for LAP stage. if (lap_lag_in_frames != -1) { cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; } #if CONFIG_REALTIME_ONLY assert(!oxcf->tool_cfg.enable_global_motion); cpi->alloc_pyramid = false; #else cpi->alloc_pyramid = oxcf->tool_cfg.enable_global_motion; #endif // CONFIG_REALTIME_ONLY } static inline void init_frame_info(FRAME_INFO *frame_info, const AV1_COMMON *const cm) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const SequenceHeader *const seq_params = cm->seq_params; frame_info->frame_width = cm->width; frame_info->frame_height = cm->height; frame_info->mi_cols = mi_params->mi_cols; frame_info->mi_rows = mi_params->mi_rows; frame_info->mb_cols = mi_params->mb_cols; frame_info->mb_rows = mi_params->mb_rows; frame_info->num_mbs = mi_params->MBs; frame_info->bit_depth = seq_params->bit_depth; frame_info->subsampling_x = seq_params->subsampling_x; frame_info->subsampling_y = seq_params->subsampling_y; } static inline void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) { frame_index_set->show_frame_count = 0; } static inline void update_counters_for_show_frame(AV1_COMP *const cpi) { assert(cpi->common.show_frame); cpi->frame_index_set.show_frame_count++; cpi->common.current_frame.frame_number++; } AV1_PRIMARY *av1_create_primary_compressor( struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, const AV1EncoderConfig *oxcf) { AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY)); if (!ppi) return NULL; av1_zero(*ppi); // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(ppi->error.jmp)) { ppi->error.setjmp = 0; av1_remove_primary_compressor(ppi); return 0; } ppi->error.setjmp = 1; ppi->seq_params_locked = 0; ppi->lap_enabled = num_lap_buffers > 0; ppi->output_pkt_list = pkt_list_head; ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS; ppi->frames_left = oxcf->input_cfg.limit; ppi->num_fp_contexts = 1; init_config_sequence(ppi, oxcf); #if CONFIG_ENTROPY_STATS av1_zero(ppi->aggregate_fc); #endif // CONFIG_ENTROPY_STATS av1_primary_rc_init(oxcf, &ppi->p_rc); // For two pass and lag_in_frames > 33 in LAP. 
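  // Scenecut detection defaults to its full mode and is then demoted when the
  // lookahead is short: a reduced mode with limited future-frame prediction
  // for mid-sized lags, and disabled entirely when very few lookahead buffers
  // are available.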
ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; if (ppi->lap_enabled) { if ((num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) && num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) { /* * For lag in frames >= 19 and <33, enable scenecut * with limited future frame prediction. */ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) { // Disable scenecut when lag_in_frames < 19. ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT; } } #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF) \ ppi->fn_ptr[BT].sdf = SDF; \ ppi->fn_ptr[BT].sdaf = SDAF; \ ppi->fn_ptr[BT].vf = VF; \ ppi->fn_ptr[BT].svf = SVF; \ ppi->fn_ptr[BT].svaf = SVAF; \ ppi->fn_ptr[BT].sdx4df = SDX4DF; \ ppi->fn_ptr[BT].sdx3df = SDX3DF; // Realtime mode doesn't use 4x rectangular blocks. #if !CONFIG_REALTIME_ONLY // sdaf (used in compound prediction, get_mvpred_compound_sad()) is unused // for 4xN and Nx4 blocks. BFP(BLOCK_4X16, aom_sad4x16, /*SDAF=*/NULL, aom_variance4x16, aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, aom_sad4x16x4d, aom_sad4x16x3d) // sdaf (used in compound prediction, get_mvpred_compound_sad()) is unused // for 4xN and Nx4 blocks. BFP(BLOCK_16X4, aom_sad16x4, /*SDAF=*/NULL, aom_variance16x4, aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, aom_sad16x4x4d, aom_sad16x4x3d) BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, aom_sad8x32x4d, aom_sad8x32x3d) BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, aom_sad32x8x4d, aom_sad32x8x3d) BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, aom_sad16x64x4d, aom_sad16x64x3d) BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, aom_sad64x16x4d, aom_sad64x16x3d) #endif // !CONFIG_REALTIME_ONLY BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, aom_sad128x128x4d, aom_sad128x128x3d) BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, aom_sad128x64x4d, aom_sad128x64x3d) BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, aom_sad64x128x4d, aom_sad64x128x3d) BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, aom_sad32x16x4d, aom_sad32x16x3d) BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, aom_sad16x32x4d, aom_sad16x32x3d) BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, aom_sad64x32x4d, aom_sad64x32x3d) BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, aom_sad32x64x4d, aom_sad32x64x3d) BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, aom_sad32x32x4d, aom_sad32x32x3d) BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, 
aom_sad64x64x4d, aom_sad64x64x3d) BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, aom_sad16x16x4d, aom_sad16x16x3d) BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, aom_sad16x8x4d, aom_sad16x8x3d) BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, aom_sad8x16x4d, aom_sad8x16x3d) BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, aom_sad8x8x3d) // sdaf (used in compound prediction, get_mvpred_compound_sad()) is unused // for 4xN and Nx4 blocks. BFP(BLOCK_8X4, aom_sad8x4, /*SDAF=*/NULL, aom_variance8x4, aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, aom_sad8x4x3d) // sdaf (used in compound prediction, get_mvpred_compound_sad()) is unused // for 4xN and Nx4 blocks. BFP(BLOCK_4X8, aom_sad4x8, /*SDAF=*/NULL, aom_variance4x8, aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, aom_sad4x8x3d) // sdaf (used in compound prediction, get_mvpred_compound_sad()) is unused // for 4xN and Nx4 blocks. BFP(BLOCK_4X4, aom_sad4x4, /*SDAF=*/NULL, aom_variance4x4, aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, aom_sad4x4x3d) #if !CONFIG_REALTIME_ONLY #define OBFP(BT, OSDF, OVF, OSVF) \ ppi->fn_ptr[BT].osdf = OSDF; \ ppi->fn_ptr[BT].ovf = OVF; \ ppi->fn_ptr[BT].osvf = OSVF; OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, aom_obmc_sub_pixel_variance128x128) OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, aom_obmc_sub_pixel_variance128x64) OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, aom_obmc_sub_pixel_variance64x128) OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, aom_obmc_sub_pixel_variance64x64) OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, aom_obmc_sub_pixel_variance64x32) OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64, aom_obmc_sub_pixel_variance32x64) OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32, aom_obmc_sub_pixel_variance32x32) OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16, aom_obmc_sub_pixel_variance32x16) OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32, aom_obmc_sub_pixel_variance16x32) OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16, aom_obmc_sub_pixel_variance16x16) OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8, aom_obmc_sub_pixel_variance16x8) OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16, aom_obmc_sub_pixel_variance8x16) OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8, aom_obmc_sub_pixel_variance8x8) OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8, aom_obmc_sub_pixel_variance4x8) OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4, aom_obmc_sub_pixel_variance8x4) OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, aom_obmc_sub_pixel_variance4x4) OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, aom_obmc_sub_pixel_variance4x16) OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, aom_obmc_sub_pixel_variance16x4) OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, aom_obmc_sub_pixel_variance8x32) OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, aom_obmc_sub_pixel_variance32x8) OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64, aom_obmc_sub_pixel_variance16x64) OBFP(BLOCK_64X16, aom_obmc_sad64x16, 
aom_obmc_variance64x16, aom_obmc_sub_pixel_variance64x16) #endif // !CONFIG_REALTIME_ONLY #define MBFP(BT, MCSDF, MCSVF) \ ppi->fn_ptr[BT].msdf = MCSDF; \ ppi->fn_ptr[BT].msvf = MCSVF; MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32) MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16) MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32) MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16) MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8) MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16) MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8) MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8) MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) #if !CONFIG_REALTIME_ONLY MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16) MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4) MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32) MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8) MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64) MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16) #endif #define SDSFP(BT, SDSF, SDSX4DF) \ ppi->fn_ptr[BT].sdsf = SDSF; \ ppi->fn_ptr[BT].sdsx4df = SDSX4DF; SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d) SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d) SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d) SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d) SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d) SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d) SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d) SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d) SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d) SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d) SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d) #if !CONFIG_REALTIME_ONLY SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d) SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d) SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d) SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d) #endif #undef SDSFP #if CONFIG_AV1_HIGHBITDEPTH highbd_set_var_fns(ppi); #endif { // As cm->mi_params is a part of the frame level context (cpi), it is // unavailable at this point. mi_params is created as a local temporary // variable, to be passed into the functions used for allocating tpl // buffers. The values in this variable are populated according to initial // width and height of the frame. 
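    // One scaling factor is allocated per 16x16 unit of the frame; for
    // example, a 1920x1080 input maps to a 480x270 grid of 4x4 mi units and
    // therefore a 120x68 grid of 16x16 units.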
CommonModeInfoParams mi_params; enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, BLOCK_4X4); const BLOCK_SIZE bsize = BLOCK_16X16; const int w = mi_size_wide[bsize]; const int h = mi_size_high[bsize]; const int num_cols = (mi_params.mi_cols + w - 1) / w; const int num_rows = (mi_params.mi_rows + h - 1) / h; AOM_CHECK_MEM_ERROR( &ppi->error, ppi->tpl_sb_rdmult_scaling_factors, aom_calloc(num_rows * num_cols, sizeof(*ppi->tpl_sb_rdmult_scaling_factors))); #if CONFIG_INTERNAL_STATS ppi->b_calculate_blockiness = 1; ppi->b_calculate_consistency = 1; for (int i = 0; i <= STAT_ALL; i++) { ppi->psnr[0].stat[i] = 0; ppi->psnr[1].stat[i] = 0; ppi->fastssim.stat[i] = 0; ppi->psnrhvs.stat[i] = 0; } ppi->psnr[0].worst = 100.0; ppi->psnr[1].worst = 100.0; ppi->worst_ssim = 100.0; ppi->worst_ssim_hbd = 100.0; ppi->count[0] = 0; ppi->count[1] = 0; ppi->total_bytes = 0; if (ppi->b_calculate_psnr) { ppi->total_sq_error[0] = 0; ppi->total_samples[0] = 0; ppi->total_sq_error[1] = 0; ppi->total_samples[1] = 0; ppi->total_recode_hits = 0; ppi->summed_quality = 0; ppi->summed_weights = 0; ppi->summed_quality_hbd = 0; ppi->summed_weights_hbd = 0; } ppi->fastssim.worst = 100.0; ppi->psnrhvs.worst = 100.0; if (ppi->b_calculate_blockiness) { ppi->total_blockiness = 0; ppi->worst_blockiness = 0.0; } ppi->total_inconsistency = 0; ppi->worst_consistency = 100.0; if (ppi->b_calculate_consistency) { AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars, aom_malloc(sizeof(*ppi->ssim_vars) * 4 * mi_params.mi_rows * mi_params.mi_cols)); } #endif } ppi->error.setjmp = 0; return ppi; } AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, BufferPool *const pool, COMPRESSOR_STAGE stage, int lap_lag_in_frames) { AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); if (!cpi) return NULL; av1_zero(*cpi); cpi->ppi = ppi; AV1_COMMON *volatile const cm = &cpi->common; cm->seq_params = &ppi->seq_params; cm->error = (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error)); if (!cm->error) { aom_free(cpi); return NULL; } // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(cm->error->jmp)) { cm->error->setjmp = 0; av1_remove_compressor(cpi); return NULL; } cm->error->setjmp = 1; cpi->compressor_stage = stage; cpi->do_frame_data_update = true; CommonModeInfoParams *const mi_params = &cm->mi_params; mi_params->free_mi = enc_free_mi; mi_params->setup_mi = enc_setup_mi; mi_params->set_mb_mi = (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE) ? 
stat_stage_set_mb_mi : enc_set_mb_mi; mi_params->mi_alloc_bsize = BLOCK_4X4; CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); CHECK_MEM_ERROR( cm, cm->default_frame_context, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); memset(cm->fc, 0, sizeof(*cm->fc)); memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); cpi->common.buffer_pool = pool; init_config(cpi, oxcf); if (cpi->compressor_stage == LAP_STAGE) { cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; } av1_rc_init(&cpi->oxcf, &cpi->rc); init_frame_info(&cpi->frame_info, cm); init_frame_index_set(&cpi->frame_index_set); cm->current_frame.frame_number = 0; cpi->rc.frame_number_encoded = 0; cpi->rc.prev_frame_is_dropped = 0; cpi->rc.max_consec_drop = INT_MAX; cpi->rc.drop_count_consec = 0; cm->current_frame_id = -1; cpi->tile_data = NULL; cpi->last_show_frame_buf = NULL; realloc_segmentation_maps(cpi); cpi->refresh_frame.alt_ref_frame = false; #if CONFIG_SPEED_STATS cpi->tx_search_count = 0; #endif // CONFIG_SPEED_STATS cpi->time_stamps.first_ts_start = INT64_MAX; #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "wb"); #endif #if !CONFIG_REALTIME_ONLY if (is_stat_consumption_stage(cpi)) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); if (!cpi->ppi->lap_enabled) { /*Re-initialize to stats buffer, populated by application in the case of * two pass*/ cpi->ppi->twopass.stats_buf_ctx->stats_in_start = oxcf->twopass_stats_in.buf; cpi->twopass_frame.stats_in = cpi->ppi->twopass.stats_buf_ctx->stats_in_start; cpi->ppi->twopass.stats_buf_ctx->stats_in_end = &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; // The buffer size is packets - 1 because the last packet is total_stats. av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, oxcf->twopass_stats_in.buf, packets - 1); av1_init_second_pass(cpi); } else { av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0); av1_init_single_pass_lap(cpi); } } #endif // The buffer "obmc_buffer" is used in inter frames for fast obmc search. // Hence, the memory allocation for the same is avoided for allintra encoding // mode. 
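  // A maximum key-frame interval of zero means every frame is intra coded,
  // so the OBMC search buffer is only allocated when inter frames can occur.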
if (cpi->oxcf.kf_cfg.key_freq_max != 0) alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error); for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) CHECK_MEM_ERROR( cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], (uint32_t *)aom_malloc( AOM_BUFFER_SIZE_FOR_BLOCK_HASH * sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; av1_set_speed_features_framesize_independent(cpi, oxcf->speed); av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); int max_mi_cols = mi_params->mi_cols; int max_mi_rows = mi_params->mi_rows; if (oxcf->frm_dim_cfg.forced_max_frame_width) { max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width); } if (oxcf->frm_dim_cfg.forced_max_frame_height) { max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height); } const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2; CHECK_MEM_ERROR( cm, cpi->consec_zero_mv, aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv))); cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size; cpi->mb_weber_stats = NULL; cpi->mb_delta_q = NULL; cpi->palette_pixel_num = 0; cpi->scaled_last_source_available = 0; { const BLOCK_SIZE bsize = BLOCK_16X16; const int w = mi_size_wide[bsize]; const int h = mi_size_high[bsize]; const int num_cols = (max_mi_cols + w - 1) / w; const int num_rows = (max_mi_rows + h - 1) / h; CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, aom_calloc(num_rows * num_cols, sizeof(*cpi->ssim_rdmult_scaling_factors))); CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors, aom_calloc(num_rows * num_cols, sizeof(*cpi->tpl_rdmult_scaling_factors))); } #if CONFIG_TUNE_VMAF { const BLOCK_SIZE bsize = BLOCK_64X64; const int w = mi_size_wide[bsize]; const int h = mi_size_high[bsize]; const int num_cols = (mi_params->mi_cols + w - 1) / w; const int num_rows = (mi_params->mi_rows + h - 1) / h; CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, aom_calloc(num_rows * num_cols, sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); for (int i = 0; i < MAX_ARF_LAYERS; i++) { cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; cpi->vmaf_info.last_frame_ysse[i] = -1.0; cpi->vmaf_info.last_frame_vmaf[i] = -1.0; } cpi->vmaf_info.original_qindex = -1; cpi->vmaf_info.vmaf_model = NULL; } #endif #if CONFIG_TUNE_BUTTERAUGLI { const int w = mi_size_wide[butteraugli_rdo_bsize]; const int h = mi_size_high[butteraugli_rdo_bsize]; const int num_cols = (mi_params->mi_cols + w - 1) / w; const int num_rows = (mi_params->mi_rows + h - 1) / h; CHECK_MEM_ERROR( cm, cpi->butteraugli_info.rdmult_scaling_factors, aom_malloc(num_rows * num_cols * sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); memset(&cpi->butteraugli_info.source, 0, sizeof(cpi->butteraugli_info.source)); memset(&cpi->butteraugli_info.resized_source, 0, sizeof(cpi->butteraugli_info.resized_source)); cpi->butteraugli_info.recon_set = false; } #endif #if CONFIG_SALIENCY_MAP { CHECK_MEM_ERROR(cm, cpi->saliency_map, (uint8_t *)aom_calloc(cm->height * cm->width, sizeof(*cpi->saliency_map))); // Buffer initialization based on MIN_MIB_SIZE_LOG2 to ensure that // cpi->sm_scaling_factor buffer is allocated big enough, since we have no // idea of the actual superblock size we are going to use yet. 
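    // Using MIN_MIB_SIZE_LOG2 (the 64x64-superblock case) over-allocates
    // slightly if 128x128 superblocks are chosen later, but guarantees at
    // least one slot per superblock either way.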
const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2); const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2); const int max_sb_cols = (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb; const int max_sb_rows = (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb; CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor, (double *)aom_calloc(max_sb_rows * max_sb_cols, sizeof(*cpi->sm_scaling_factor))); } #endif #if CONFIG_COLLECT_PARTITION_STATS av1_zero(cpi->partition_stats); #endif // CONFIG_COLLECT_PARTITION_STATS // Initialize the members of DeltaQuantParams with INT_MAX to ensure that // the quantizer tables are correctly initialized using the default deltaq // parameters when av1_init_quantizer is called for the first time. DeltaQuantParams *const prev_deltaq_params = &cpi->enc_quant_dequant_params.prev_deltaq_params; prev_deltaq_params->y_dc_delta_q = INT_MAX; prev_deltaq_params->u_dc_delta_q = INT_MAX; prev_deltaq_params->v_dc_delta_q = INT_MAX; prev_deltaq_params->u_ac_delta_q = INT_MAX; prev_deltaq_params->v_ac_delta_q = INT_MAX; av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_qm_init(&cm->quant_params, av1_num_planes(cm)); av1_loop_filter_init(cm); cm->superres_scale_denominator = SCALE_NUMERATOR; cm->superres_upscaled_width = oxcf->frm_dim_cfg.width; cm->superres_upscaled_height = oxcf->frm_dim_cfg.height; #if !CONFIG_REALTIME_ONLY av1_loop_restoration_precal(); #endif #if CONFIG_THREE_PASS cpi->third_pass_ctx = NULL; if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL); } #endif cpi->second_pass_log_stream = NULL; cpi->use_ducky_encode = 0; cm->error->setjmp = 0; return cpi; } #if CONFIG_INTERNAL_STATS #define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) #define SNPRINT2(H, T, V) \ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS void av1_remove_primary_compressor(AV1_PRIMARY *ppi) { if (!ppi) return; #if !CONFIG_REALTIME_ONLY av1_tf_info_free(&ppi->tf_info); #endif // !CONFIG_REALTIME_ONLY for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { aom_free(ppi->level_params.level_info[i]); } av1_lookahead_destroy(ppi->lookahead); aom_free(ppi->tpl_sb_rdmult_scaling_factors); ppi->tpl_sb_rdmult_scaling_factors = NULL; TplParams *const tpl_data = &ppi->tpl_data; aom_free(tpl_data->txfm_stats_list); for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { aom_free(tpl_data->tpl_stats_pool[frame]); aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); tpl_data->tpl_stats_pool[frame] = NULL; } #if !CONFIG_REALTIME_ONLY av1_tpl_dealloc(&tpl_data->tpl_mt_sync); #endif av1_terminate_workers(ppi); free_thread_data(ppi); aom_free(ppi->p_mt_info.tile_thr_data); ppi->p_mt_info.tile_thr_data = NULL; aom_free(ppi->p_mt_info.workers); ppi->p_mt_info.workers = NULL; ppi->p_mt_info.num_workers = 0; aom_free(ppi); } void av1_remove_compressor(AV1_COMP *cpi) { if (!cpi) return; #if CONFIG_RATECTRL_LOG if (cpi->oxcf.pass == 3) { rc_log_show(&cpi->rc_log); } #endif // CONFIG_RATECTRL_LOG AV1_COMMON *cm = &cpi->common; if (cm->current_frame.frame_number > 0) { #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi)) { fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); } #endif // CONFIG_SPEED_STATS #if CONFIG_COLLECT_PARTITION_STATS == 2 if (!is_stat_generation_stage(cpi)) { av1_print_fr_partition_timing_stats(&cpi->partition_stats, "fr_part_timing_data.csv"); } #endif } #if CONFIG_AV1_TEMPORAL_DENOISING av1_denoiser_free(&(cpi->denoiser)); 
#endif if (cm->error) { // Help detect use after free of the error detail string. memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1); cm->error->detail[sizeof(cm->error->detail) - 1] = '\0'; aom_free(cm->error); } aom_free(cpi->td.tctx); MultiThreadInfo *const mt_info = &cpi->mt_info; #if CONFIG_MULTITHREAD pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_; pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_; pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_; pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_; pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_; if (enc_row_mt_mutex_ != NULL) { pthread_mutex_destroy(enc_row_mt_mutex_); aom_free(enc_row_mt_mutex_); } if (enc_row_mt_cond_ != NULL) { pthread_cond_destroy(enc_row_mt_cond_); aom_free(enc_row_mt_cond_); } if (gm_mt_mutex_ != NULL) { pthread_mutex_destroy(gm_mt_mutex_); aom_free(gm_mt_mutex_); } if (tpl_error_mutex_ != NULL) { pthread_mutex_destroy(tpl_error_mutex_); aom_free(tpl_error_mutex_); } if (pack_bs_mt_mutex_ != NULL) { pthread_mutex_destroy(pack_bs_mt_mutex_); aom_free(pack_bs_mt_mutex_); } #endif av1_row_mt_mem_dealloc(cpi); if (mt_info->num_workers > 1) { av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync); av1_loop_filter_dealloc(&mt_info->lf_row_sync); av1_cdef_mt_dealloc(&mt_info->cdef_sync); #if !CONFIG_REALTIME_ONLY av1_loop_restoration_dealloc(&mt_info->lr_row_sync); av1_tf_mt_dealloc(&mt_info->tf_sync); #endif } #if CONFIG_THREE_PASS av1_free_thirdpass_ctx(cpi->third_pass_ctx); av1_close_second_pass_log(cpi); #endif dealloc_compressor_data(cpi); av1_ext_part_delete(&cpi->ext_part_controller); av1_remove_common(cm); aom_free(cpi); #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif } static void generate_psnr_packet(AV1_COMP *cpi) { struct aom_codec_cx_pkt pkt; int i; PSNR_STATS psnr; #if CONFIG_AV1_HIGHBITDEPTH const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr, bit_depth, in_bit_depth); #else aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); #endif for (i = 0; i < 4; ++i) { pkt.data.psnr.samples[i] = psnr.samples[i]; pkt.data.psnr.sse[i] = psnr.sse[i]; pkt.data.psnr.psnr[i] = psnr.psnr[i]; } #if CONFIG_AV1_HIGHBITDEPTH if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { for (i = 0; i < 4; ++i) { pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i]; pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i]; pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i]; } } #endif pkt.kind = AOM_CODEC_PSNR_PKT; aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt); } int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) { if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; *ext_ref_frame_flags = ref_frame_flags; return 0; } int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { aom_yv12_copy_frame(cfg, sd, num_planes); return 0; } else { return -1; } } int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); if (cfg) { aom_yv12_copy_frame(sd, cfg, num_planes); return 0; 
} else { return -1; } } #ifdef OUTPUT_YUV_REC void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { uint8_t *src = s->y_buffer; int h = cm->height; if (yuv_rec_file == NULL) return; if (s->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); do { fwrite(src16, s->y_width, 2, yuv_rec_file); src16 += s->y_stride; } while (--h); src16 = CONVERT_TO_SHORTPTR(s->u_buffer); h = s->uv_height; do { fwrite(src16, s->uv_width, 2, yuv_rec_file); src16 += s->uv_stride; } while (--h); src16 = CONVERT_TO_SHORTPTR(s->v_buffer); h = s->uv_height; do { fwrite(src16, s->uv_width, 2, yuv_rec_file); src16 += s->uv_stride; } while (--h); fflush(yuv_rec_file); return; } do { fwrite(src, s->y_width, 1, yuv_rec_file); src += s->y_stride; } while (--h); src = s->u_buffer; h = s->uv_height; do { fwrite(src, s->uv_width, 1, yuv_rec_file); src += s->uv_stride; } while (--h); src = s->v_buffer; h = s->uv_height; do { fwrite(src, s->uv_width, 1, yuv_rec_file); src += s->uv_stride; } while (--h); fflush(yuv_rec_file); } #endif // OUTPUT_YUV_REC void av1_set_mv_search_params(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; const int max_mv_def = AOMMAX(cm->width, cm->height); // Default based on max resolution. mv_search_params->mv_step_param = av1_init_search_range(max_mv_def); if (cpi->sf.mv_sf.auto_mv_step_size) { if (frame_is_intra_only(cm)) { // Initialize max_mv_magnitude for use in the first INTER frame // after a key/intra-only frame. mv_search_params->max_mv_magnitude = max_mv_def; } else { // Use adaptive mv steps based on previous frame stats for show frames and // internal arfs. FRAME_UPDATE_TYPE cur_update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; int use_auto_mv_step = (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) && mv_search_params->max_mv_magnitude != -1 && cpi->sf.mv_sf.auto_mv_step_size >= 2; if (use_auto_mv_step) { // Allow mv_steps to correspond to twice the max mv magnitude found // in the previous frame, capped by the default max_mv_magnitude based // on resolution. mv_search_params->mv_step_param = av1_init_search_range( AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); } // Reset max_mv_magnitude based on update flag. if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1; } } } void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { const AV1_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; if (cm->seq_params->force_screen_content_tools != 2) { features->allow_screen_content_tools = features->allow_intrabc = cm->seq_params->force_screen_content_tools; return; } if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { features->allow_screen_content_tools = 1; features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 0 : 1; cpi->is_screen_content_type = 1; cpi->use_screen_content_tools = 1; return; } if (cpi->oxcf.mode == REALTIME) { features->allow_screen_content_tools = features->allow_intrabc = 0; return; } // Screen content tools are not evaluated in non-RD encoding mode unless // content type is not set explicitly, i.e., when // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1 // and hybrid_intra_pickmode = 0. Hence, screen content detection is // disabled. 
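  // When the estimate further below does run, it is a coarse population count
  // over 16x16 blocks: screen content tools require more than 1/10 of the
  // frame area to be covered by blocks with only a few luma colors, and
  // intraBC additionally requires more than 1/12 of the area to be covered by
  // such blocks whose variance also exceeds var_thresh.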
if (cpi->sf.rt_sf.use_nonrd_pick_mode && !cpi->sf.rt_sf.hybrid_intra_pickmode) { features->allow_screen_content_tools = features->allow_intrabc = 0; return; } // Estimate if the source frame is screen content, based on the portion of // blocks that have few luma colors. const uint8_t *src = cpi->unfiltered_source->y_buffer; assert(src != NULL); const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; const int stride = cpi->unfiltered_source->y_stride; const int width = cpi->unfiltered_source->y_width; const int height = cpi->unfiltered_source->y_height; const int64_t area = (int64_t)width * height; const int bd = cm->seq_params->bit_depth; const int blk_w = 16; const int blk_h = 16; // These threshold values are selected experimentally. const int color_thresh = 4; const unsigned int var_thresh = 0; // Counts of blocks with no more than color_thresh colors. int64_t counts_1 = 0; // Counts of blocks with no more than color_thresh colors and variance larger // than var_thresh. int64_t counts_2 = 0; for (int r = 0; r + blk_h <= height; r += blk_h) { for (int c = 0; c + blk_w <= width; c += blk_w) { int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path. const uint8_t *const this_src = src + r * stride + c; int n_colors; if (use_hbd) av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL, count_buf, &n_colors, NULL); else av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors); if (n_colors > 1 && n_colors <= color_thresh) { ++counts_1; struct buf_2d buf; buf.stride = stride; buf.buf = (uint8_t *)this_src; const unsigned int var = av1_get_perpixel_variance( cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd); if (var > var_thresh) ++counts_2; } } } // The threshold values are selected experimentally. features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area; // IntraBC would force loop filters off, so we use more strict rules that also // requires that the block has high variance. features->allow_intrabc = features->allow_screen_content_tools && counts_2 * blk_h * blk_w * 12 > area; cpi->use_screen_content_tools = features->allow_screen_content_tools; cpi->is_screen_content_type = features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 && counts_2 * blk_h * blk_w * 30 > area); } static void init_motion_estimation(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; const int aligned_width = (cm->width + 7) & ~7; const int y_stride = aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels); const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width || cpi->oxcf.frm_dim_cfg.height != cm->height) || av1_superres_scaled(cm)) ? y_stride : cpi->ppi->lookahead->buf->img.y_stride; int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride; // Update if search_site_cfg is uninitialized or the current frame has a new // stride const int should_update = !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride || !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride || (y_stride != mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride); if (!should_update) { return; } // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS. for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) { const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 
1 : 0; av1_init_motion_compensation[i]( &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level); av1_init_motion_compensation[i]( &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src, level); } // First pass search site config initialization. av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], fpf_y_stride); for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) { memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i], &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], sizeof(search_site_config)); } } static void init_ref_frame_bufs(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int i; if (cm->cur_frame) { cm->cur_frame->ref_count--; cm->cur_frame = NULL; } for (i = 0; i < REF_FRAMES; ++i) { if (cm->ref_frame_map[i]) { cm->ref_frame_map[i]->ref_count--; cm->ref_frame_map[i] = NULL; } } #ifndef NDEBUG BufferPool *const pool = cm->buffer_pool; for (i = 0; i < pool->num_frame_bufs; ++i) { assert(pool->frame_bufs[i].ref_count == 0); } #endif } // TODO(chengchen): consider renaming this function as it is necessary // for the encoder to setup critical parameters, and it does not // deal with initial width any longer. aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; if (!cpi->frame_size_related_setup_done || seq_params->use_highbitdepth != use_highbitdepth || seq_params->subsampling_x != subsampling_x || seq_params->subsampling_y != subsampling_y) { seq_params->subsampling_x = subsampling_x; seq_params->subsampling_y = subsampling_y; seq_params->use_highbitdepth = use_highbitdepth; av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); if (!is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) return AOM_CODEC_MEM_ERROR; #endif // !CONFIG_REALTIME_ONLY } init_ref_frame_bufs(cpi); init_motion_estimation(cpi); // TODO(agrange) This can be removed. cpi->initial_mbs = cm->mi_params.MBs; cpi->frame_size_related_setup_done = true; } return AOM_CODEC_OK; } #if CONFIG_AV1_TEMPORAL_DENOISING static void setup_denoiser_buffer(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { if (av1_denoiser_alloc( cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc, cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate denoiser"); } } #endif // Returns 1 if the assigned width or height was <= 0. 
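// In addition to that validation, set_size_literal() below grows the compressor data buffers (and resets frame_size_related_setup_done) whenever the new coded size exceeds the previously allocated data_alloc_width/data_alloc_height.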
static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; aom_codec_err_t err = av1_check_initial_width( cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (err != AOM_CODEC_OK) { aom_internal_error(cm->error, err, "av1_check_initial_width() failed"); } if (width <= 0 || height <= 0) return 1; cm->width = width; cm->height = height; #if CONFIG_AV1_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif if (cm->width > cpi->data_alloc_width || cm->height > cpi->data_alloc_height) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); cpi->data_alloc_width = cm->width; cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } alloc_mb_mode_info_buffers(cpi); av1_update_frame_size(cpi); return 0; } void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; if (width != cm->width || height != cm->height) { // There has been a change in the encoded frame size set_size_literal(cpi, width, height); // Recalculate 'all_lossless' in case super-resolution was (un)selected. cm->features.all_lossless = cm->features.coded_lossless && !av1_superres_scaled(cm); av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); #if CONFIG_AV1_TEMPORAL_DENOISING // Reset the denoiser on the resized frame. if (cpi->oxcf.noise_sensitivity > 0) { av1_denoiser_free(&(cpi->denoiser)); setup_denoiser_buffer(cpi); } #endif } if (is_stat_consumption_stage(cpi)) { av1_set_target_rate(cpi, cm->width, cm->height); } alloc_frame_mvs(cm, cm->cur_frame); // Allocate above context buffers CommonContexts *const above_contexts = &cm->above_contexts; if (above_contexts->num_planes < av1_num_planes(cm) || above_contexts->num_mi_cols < cm->mi_params.mi_cols || above_contexts->num_tile_rows < cm->tiles.rows) { av1_free_above_context_buffers(above_contexts); if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } AV1EncoderConfig *oxcf = &cpi->oxcf; oxcf->border_in_pixels = av1_get_enc_border_size( av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0, cm->seq_params->sb_size); // Reset the frame pointers to the current frame size. if (aom_realloc_frame_buffer( &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi); #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { for (int i = 0; i < num_planes; ++i) cm->rst_info[i].frame_restoration_type = RESTORE_NONE; const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter; av1_alloc_restoration_buffers(cm, is_sgr_enabled); // Store the allocated restoration buffers in MT object. 
if (cpi->ppi->p_mt_info.num_workers > 1) { av1_init_lr_mt_buffers(cpi); } } #endif init_motion_estimation(cpi); int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (buf != NULL) { struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width, buf->buf.y_crop_height, cm->width, cm->height); has_valid_ref_frame |= av1_is_valid_scale(sf); if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes); } } if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { aom_internal_error( cm->error, AOM_CODEC_CORRUPT_FRAME, "Can't find at least one reference frame with valid size"); } av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, cm->width, cm->height); set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } static inline int extend_borders_mt(const AV1_COMP *cpi, MULTI_THREADED_MODULES stage, int plane) { const AV1_COMMON *const cm = &cpi->common; if (cpi->mt_info.num_mod_workers[stage] < 2) return 0; switch (stage) { // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled, // multi-thread frame border extension along with loop filter frame. // As loop-filtering of a superblock row modifies the pixels of the // above superblock row, border extension requires that loop filtering // of the current and above superblock row is complete. case MOD_LPF: return 0; case MOD_CDEF: return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame && !is_restoration_used(cm) && !av1_superres_scaled(cm); case MOD_LR: return is_restoration_used(cm) && (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE); default: assert(0); } return 0; } /*!\brief Select and apply cdef filters and switchable restoration filters * * \ingroup high_level_algo */ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, MACROBLOCKD *xd, int use_restoration, int use_cdef, unsigned int skip_apply_postproc_filters) { #if !CONFIG_REALTIME_ONLY if (use_restoration) av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); #else (void)use_restoration; #endif if (use_cdef) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, cdef_time); #endif const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF]; // Find CDEF parameters av1_cdef_search(cpi); // Apply the filter if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) { assert(!cpi->ppi->rtc_ref.non_reference_frame); if (num_workers > 1) { // Extension of frame borders is multi-threaded along with cdef. 
const int do_extend_border = extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0); av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker, cpi->mt_info.workers, &cpi->mt_info.cdef_sync, num_workers, av1_cdef_init_fb_row_mt, do_extend_border); } else { av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row); } } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, cdef_time); #endif } const int use_superres = av1_superres_scaled(cm); if (use_superres) { if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) { av1_superres_post_encode(cpi); } } #if !CONFIG_REALTIME_ONLY #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, loop_restoration_time); #endif if (use_restoration) { MultiThreadInfo *const mt_info = &cpi->mt_info; const int num_workers = mt_info->num_mod_workers[MOD_LR]; av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); av1_pick_filter_restoration(cpi->source, cpi); if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 && (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || cm->rst_info[1].frame_restoration_type != RESTORE_NONE || cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { if (num_workers > 1) { // Extension of frame borders is multi-threaded along with loop // restoration filter. const int do_extend_border = 1; av1_loop_restoration_filter_frame_mt( &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers, &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border); } else { av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, &cpi->lr_ctxt); } } } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, loop_restoration_time); #endif #endif // !CONFIG_REALTIME_ONLY } static void extend_frame_borders(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; // TODO(debargha): Fix mv search range on encoder side for (int plane = 0; plane < av1_num_planes(cm); ++plane) { const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) || extend_borders_mt(cpi, MOD_LR, plane); if (!extend_border_done) { const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf; aom_extend_frame_borders_plane_row(ybf, plane, 0, ybf->crop_heights[plane > 0]); } } } /*!\brief Select and apply deblocking filters, cdef filters, and restoration * filters. * * \ingroup high_level_algo */ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { MultiThreadInfo *const mt_info = &cpi->mt_info; const int num_workers = mt_info->num_mod_workers[MOD_LPF]; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &cpi->td.mb.e_mbd; cpi->td.mb.rdmult = cpi->rd.RDMULT; assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg), cm->features.coded_lossless && cm->features.all_lossless)); const int use_loopfilter = is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; const int use_cdef = is_cdef_used(cm); const int use_superres = av1_superres_scaled(cm); const int use_restoration = is_restoration_used(cm); const unsigned int skip_apply_postproc_filters = derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, use_superres, use_restoration); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, loop_filter_time); #endif if (use_loopfilter) { av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick); struct loopfilter *lf = &cm->lf; if ((lf->filter_level[0] || lf->filter_level[1]) && (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) { assert(!cpi->ppi->rtc_ref.non_reference_frame); // lpf_opt_level = 1 : Enables dual/quad loop-filtering. 
// lpf_opt_level is set to 1 if transform size search depth in inter // blocks is limited to one as quad loop filtering assumes that all the // transform blocks within a 16x8/8x16/16x16 prediction block are of the // same size. lpf_opt_level = 2 : Filters both chroma planes together, in // addition to enabling dual/quad loop-filtering. This is enabled when lpf // pick method is LPF_PICK_FROM_Q as u and v plane filter levels are // equal. int lpf_opt_level = get_lpf_opt_level(&cpi->sf); av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0, mt_info->workers, num_workers, &mt_info->lf_row_sync, lpf_opt_level); } } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, loop_filter_time); #endif cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef, skip_apply_postproc_filters); } static void update_motion_stat(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; RATE_CONTROL *const rc = &cpi->rc; SVC *const svc = &cpi->svc; const int avg_cnt_zeromv = 100 * cpi->rc.cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); if (!cpi->ppi->use_svc || (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { rc->avg_frame_low_motion = (rc->avg_frame_low_motion == 0) ? avg_cnt_zeromv : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4; // For SVC: set avg_frame_low_motion (only computed on top spatial layer) // to all lower spatial layers. if (cpi->ppi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) { for (int i = 0; i < svc->number_spatial_layers - 1; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, svc->number_temporal_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; lrc->avg_frame_low_motion = rc->avg_frame_low_motion; } } } } /*!\brief Encode a frame without the recode loop, usually used in one-pass * encoding and realtime coding. * * \ingroup high_level_algo * * \param[in] cpi Top-level encoder structure * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval #AOM_CODEC_ERROR */ static int encode_without_recode(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg; SVC *const svc = &cpi->svc; const int resize_pending = is_frame_resize_pending(cpi); int top_index = 0, bottom_index = 0, q = 0; YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; InterpFilter filter_scaler = cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP_SMOOTH; int phase_scaler = cpi->ppi->use_svc ? 
svc->downsample_filter_phase[svc->spatial_layer_id] : 0; if (cpi->rc.postencode_drop && allow_postencode_drop_rtc(cpi)) av1_save_all_coding_context(cpi); set_size_independent_vars(cpi); av1_setup_frame_size(cpi); cm->prev_frame = get_primary_ref_frame_buf(cm); av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); av1_set_mv_search_params(cpi); if (cm->current_frame.frame_number == 0 && (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && cpi->svc.temporal_layer_id == 0) { const SequenceHeader *seq_params = cm->seq_params; if (aom_alloc_frame_buffer( &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffer for source_last_TL0"); } } if (!cpi->ppi->use_svc) { phase_scaler = 8; // 2:1 scaling. if ((cm->width << 1) == unscaled->y_crop_width && (cm->height << 1) == unscaled->y_crop_height) { filter_scaler = BILINEAR; // For lower resolutions use eighttap_smooth. if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH; } else if ((cm->width << 2) == unscaled->y_crop_width && (cm->height << 2) == unscaled->y_crop_height) { // 4:1 scaling. filter_scaler = EIGHTTAP_SMOOTH; } else if ((cm->width << 2) == 3 * unscaled->y_crop_width && (cm->height << 2) == 3 * unscaled->y_crop_height) { // 4:3 scaling. filter_scaler = EIGHTTAP_REGULAR; } } allocate_gradient_info_for_hog(cpi); allocate_src_var_of_4x4_sub_block_buf(cpi); const SPEED_FEATURES *sf = &cpi->sf; if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) variance_partition_alloc(cpi); if (cm->current_frame.frame_type == KEY_FRAME || ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame))) copy_frame_prob_info(cpi); #if CONFIG_COLLECT_COMPONENT_TIMING printf("\n Encoding a frame: \n"); #endif #if CONFIG_TUNE_BUTTERAUGLI if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { av1_setup_butteraugli_rdmult(cpi); } #endif cpi->source = av1_realloc_and_scale_if_required( cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (frame_is_intra_only(cm) || resize_pending != 0) { const int current_size = (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2; if (cpi->consec_zero_mv && (cpi->consec_zero_mv_alloc_size < current_size)) { aom_free(cpi->consec_zero_mv); cpi->consec_zero_mv_alloc_size = 0; CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, aom_malloc(current_size * sizeof(*cpi->consec_zero_mv))); cpi->consec_zero_mv_alloc_size = current_size; } assert(cpi->consec_zero_mv != NULL); memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv)); } if (cpi->scaled_last_source_available) { cpi->last_source = &cpi->scaled_last_source; cpi->scaled_last_source_available = 0; } else if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler, phase_scaler, true, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); } if (cpi->sf.rt_sf.use_temporal_noise_estimate) { av1_update_noise_estimate(cpi); } #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc) av1_denoiser_reset_on_first_frame(cpi); #endif // For 1 spatial layer encoding: if the (non-LAST) reference has different // resolution from the source then disable 
that reference. This is to avoid // significant increase in encode time from scaling the references in // av1_scale_references. Note GOLDEN is forced to update on the (first/trigger) // resized frame and ALTREF will be refreshed ~4 frames later, so both // references become available again after a few frames. // For superres: don't disable golden reference. if (svc->number_spatial_layers == 1) { if (!cpi->oxcf.superres_cfg.enable_superres) { if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); if (ref == NULL || ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { cpi->ref_frame_flags ^= AOM_GOLD_FLAG; } } } if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); if (ref == NULL || ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { cpi->ref_frame_flags ^= AOM_ALT_FLAG; } } } int scale_references = 0; #if CONFIG_FPMT_TEST scale_references = cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0; #endif // CONFIG_FPMT_TEST if (scale_references || cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { if (!frame_is_intra_only(cm)) { av1_scale_references(cpi, filter_scaler, phase_scaler, 1); } } av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq, cpi->oxcf.mode == ALLINTRA, cpi->oxcf.tune_cfg.tuning); av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); av1_setup_frame(cpi); // Check if this high_source_sad (scene/slide change) frame should be // encoded at high/max QP, and if so, set the q and adjust some rate // control parameters. if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ && cpi->rc.high_source_sad) { if (av1_encodedframe_overshoot_cbr(cpi, &q)) { av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq, cpi->oxcf.mode == ALLINTRA, cpi->oxcf.tune_cfg.tuning); av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || cm->features.primary_ref_frame == PRIMARY_REF_NONE) av1_setup_frame(cpi); } } av1_apply_active_map(cpi); if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_setup(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); cm->seg.enabled = cm->prev_frame->seg.enabled; } else { av1_calculate_segdata(&cm->seg); } } else { memset(&cm->seg, 0, sizeof(cm->seg)); } segfeatures_copy(&cm->cur_frame->seg, &cm->seg); cm->cur_frame->seg.enabled = cm->seg.enabled; // This is for the rtc temporal filtering case. if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { const SequenceHeader *seq_params = cm->seq_params; if (cpi->orig_source.buffer_alloc_sz == 0 || cpi->rc.prev_coded_width != cpi->oxcf.frm_dim_cfg.width || cpi->rc.prev_coded_height != cpi->oxcf.frm_dim_cfg.height) { // Allocate a source buffer to store the true source for psnr calculation.
if (aom_alloc_frame_buffer( &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); } aom_yv12_copy_y(cpi->source, &cpi->orig_source, 1); aom_yv12_copy_u(cpi->source, &cpi->orig_source, 1); aom_yv12_copy_v(cpi->source, &cpi->orig_source, 1); } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_encode_frame_time); #endif // Set the motion vector precision based on mv stats from the last coded // frame. if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q); // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm)) update_motion_stat(cpi); // Adjust the refresh of the golden (longer-term) reference based on QP // selected for this frame. This is for CBR real-time mode, and only // for single layer without usage of the set_ref_frame_config (so // reference structure for 1 layer is set internally). if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 && svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl && !cpi->ppi->rtc_ref.set_ref_frame_config && sf->rt_sf.gf_refresh_based_on_qp) av1_adjust_gf_refresh_qp_one_pass_rt(cpi); // For non-svc: if scaling is required, copy scaled_source // into scaled_last_source. if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc && cpi->scaled_source.y_buffer != NULL && cpi->scaled_last_source.y_buffer != NULL && cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width && cpi->scaled_source.y_crop_height == cpi->scaled_last_source.y_crop_height && (cm->width != cpi->unscaled_source->y_crop_width || cm->height != cpi->unscaled_source->y_crop_height)) { cpi->scaled_last_source_available = 1; aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source, 1); aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source, 1); aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source, 1); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_encode_frame_time); #endif #if CONFIG_INTERNAL_STATS ++cpi->frame_recode_hits; #endif return AOM_CODEC_OK; } #if !CONFIG_REALTIME_ONLY /*!\brief Recode loop for encoding one frame. the purpose of encoding one frame * for multiple times can be approaching a target bitrate or adjusting the usage * of global motions. * * \ingroup high_level_algo * * \param[in] cpi Top-level encoder structure * \param[in] size Bitstream size * \param[out] dest Bitstream output buffer * \param[in] dest_size Bitstream output buffer size * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval -1 * \retval #AOM_CODEC_ERROR */ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; GlobalMotionInfo *const gm_info = &cpi->gm_info; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const QuantizationCfg *const q_cfg = &oxcf->q_cfg; const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE); // Must allow recode if minimum compression ratio is set. 
assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode)); set_size_independent_vars(cpi); if (is_stat_consumption_stage_twopass(cpi) && cpi->sf.interp_sf.adaptive_interp_filter_search) cpi->interp_search_flags.interp_filter_search_mask = av1_setup_interp_filter_search_mask(cpi); av1_setup_frame_size(cpi); if (av1_superres_in_recode_allowed(cpi) && cpi->superres_mode != AOM_SUPERRES_NONE && cm->superres_scale_denominator == SCALE_NUMERATOR) { // Superres mode is currently enabled, but the denominator selected will // disable superres. So no need to continue, as we will go through another // recode loop for full-resolution after this anyway. return -1; } int top_index = 0, bottom_index = 0; int q = 0, q_low = 0, q_high = 0; av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); q_low = bottom_index; q_high = top_index; av1_set_mv_search_params(cpi); allocate_gradient_info_for_hog(cpi); allocate_src_var_of_4x4_sub_block_buf(cpi); if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) variance_partition_alloc(cpi); if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); #if CONFIG_COLLECT_COMPONENT_TIMING printf("\n Encoding a frame: \n"); #endif #if !CONFIG_RD_COMMAND // Determine whether to use screen content tools using two fast encoding. if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode) av1_determine_sc_tools_with_encoding(cpi, q); #endif // !CONFIG_RD_COMMAND #if CONFIG_TUNE_VMAF if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source); } #endif #if CONFIG_TUNE_BUTTERAUGLI cpi->butteraugli_info.recon_set = false; int original_q = 0; #endif cpi->num_frame_recode = 0; // Loop variables int loop = 0; int loop_count = 0; int overshoot_seen = 0; int undershoot_seen = 0; int low_cr_seen = 0; int last_loop_allow_hp = 0; do { loop = 0; int do_mv_stats_collection = 1; // if frame was scaled calculate global_motion_search again if already // done if (loop_count > 0 && cpi->source && gm_info->search_done) { if (cpi->source->y_crop_width != cm->width || cpi->source->y_crop_height != cm->height) { gm_info->search_done = 0; } } cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); #if CONFIG_TUNE_BUTTERAUGLI if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { if (loop_count == 0) { original_q = q; // TODO(sdeng): different q here does not make big difference. Use a // faster pass instead. q = 96; av1_setup_butteraugli_source(cpi); } else { q = original_q; } } #endif if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); } int scale_references = 0; #if CONFIG_FPMT_TEST scale_references = cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 
1 : 0; #endif // CONFIG_FPMT_TEST if (scale_references || cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { if (!frame_is_intra_only(cm)) { if (loop_count > 0) { release_scaled_references(cpi); } av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0); } } #if CONFIG_TUNE_VMAF if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { cpi->vmaf_info.original_qindex = q; q = av1_get_vmaf_base_qindex(cpi, q); } #endif #if CONFIG_RD_COMMAND RD_COMMAND *rd_command = &cpi->rd_command; RD_OPTION option = rd_command->option_ls[rd_command->frame_index]; if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) { q = rd_command->q_index_ls[rd_command->frame_index]; } #endif // CONFIG_RD_COMMAND #if CONFIG_BITRATE_ACCURACY #if CONFIG_THREE_PASS if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) { int frame_coding_idx = av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) { q = cpi->vbr_rc_info.q_index_list[frame_coding_idx]; } else { // TODO(angiebird): Investigate why sometimes there is an extra frame // after the last GOP. q = cpi->vbr_rc_info.base_q_index; } } #else if (cpi->vbr_rc_info.q_index_list_ready) { q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index]; } #endif // CONFIG_THREE_PASS #endif // CONFIG_BITRATE_ACCURACY #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY // TODO(angiebird): Move this into a function. if (oxcf->pass == AOM_RC_THIRD_PASS) { int frame_coding_idx = av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx]; FRAME_UPDATE_TYPE update_type = cpi->vbr_rc_info.update_type_list[frame_coding_idx]; rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q, update_type); } #endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY if (cpi->use_ducky_encode) { const DuckyEncodeFrameInfo *frame_info = &cpi->ducky_encode_info.frame_info; if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) { q = frame_info->q_index; cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled; } } av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq, oxcf->mode == ALLINTRA, oxcf->tune_cfg.tuning); av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", // cm->current_frame.frame_number, cm->show_frame, q, // cm->current_frame.frame_type, cm->superres_scale_denominator); if (loop_count == 0) { av1_setup_frame(cpi); } else if (get_primary_ref_frame_buf(cm) == NULL) { // Base q-index may have changed, so we need to assign proper default coef // probs before every iteration. 
av1_default_coef_probs(cm); av1_setup_frame_contexts(cm); } if (q_cfg->aq_mode == VARIANCE_AQ) { av1_vaq_frame_setup(cpi); } else if (q_cfg->aq_mode == COMPLEXITY_AQ) { av1_setup_in_frame_q_adj(cpi); } if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); cm->seg.enabled = cm->prev_frame->seg.enabled; } else { av1_calculate_segdata(&cm->seg); } } else { memset(&cm->seg, 0, sizeof(cm->seg)); } segfeatures_copy(&cm->cur_frame->seg, &cm->seg); cm->cur_frame->seg.enabled = cm->seg.enabled; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_encode_frame_time); #endif // Set the motion vector precision based on mv stats from the last coded // frame. if (!frame_is_intra_only(cm)) { av1_pick_and_set_high_precision_mv(cpi, q); // If the precision has changed during different iteration of the loop, // then we need to reset the global motion vectors if (loop_count > 0 && cm->features.allow_high_precision_mv != last_loop_allow_hp) { gm_info->search_done = 0; } last_loop_allow_hp = cm->features.allow_high_precision_mv; } // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); // Disable mv_stats collection for parallel frames based on update flag. if (!cpi->do_frame_data_update) do_mv_stats_collection = 0; // Reset the mv_stats in case we are interrupted by an intraframe or an // overlay frame. if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats); // Gather the mv_stats for the next frame if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) { av1_collect_mv_stats(cpi, q); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_encode_frame_time); #endif #if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND const int do_dummy_pack = 1; #else // CONFIG_BITRATE_ACCURACY // Dummy pack of the bitstream using up to date stats to get an // accurate estimate of output frame size to determine if we need // to recode. 
const int do_dummy_pack = (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && oxcf->rc_cfg.mode != AOM_Q) || oxcf->rc_cfg.min_cr > 0; #endif // CONFIG_BITRATE_ACCURACY if (do_dummy_pack) { av1_finalize_encoded_frame(cpi); int largest_tile_id = 0; // Output from bitstream: unused here rc->coefficient_size = 0; if (av1_pack_bitstream(cpi, dest, dest_size, size, &largest_tile_id) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } // bits used for this frame rc->projected_frame_size = (int)(*size) << 3; #if CONFIG_RD_COMMAND PSNR_STATS psnr; aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT, rc->projected_frame_size, psnr.sse[0]); ++rd_command->frame_index; if (rd_command->frame_index == rd_command->frame_count) { return AOM_CODEC_ERROR; } #endif // CONFIG_RD_COMMAND #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY if (oxcf->pass == AOM_RC_THIRD_PASS) { int frame_coding_idx = av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx, rc->projected_frame_size, rc->coefficient_size); } #endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY } #if CONFIG_TUNE_VMAF if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { q = cpi->vmaf_info.original_qindex; } #endif if (allow_recode) { // Update q and decide whether to do a recode loop recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, bottom_index, &undershoot_seen, &overshoot_seen, &low_cr_seen, loop_count); } #if CONFIG_TUNE_BUTTERAUGLI if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { loop = 1; av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4); } #endif if (cpi->use_ducky_encode) { // Ducky encode currently does not support recode loop. loop = 0; } #if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on #endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND if (loop) { ++loop_count; cpi->num_frame_recode = (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1)) ? (cpi->num_frame_recode + 1) : (NUM_RECODES_PER_FRAME - 1); #if CONFIG_INTERNAL_STATS ++cpi->frame_recode_hits; #endif } #if CONFIG_COLLECT_COMPONENT_TIMING if (loop) printf("\n Recoding:"); #endif } while (loop); return AOM_CODEC_OK; } #endif // !CONFIG_REALTIME_ONLY // TODO(jingning, paulwilkins): Set up high grain level to test // hardware decoders. Need to adapt the actual noise variance // according to the difference between reconstructed frame and the // source signal. 
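// set_grain_syn_params() below signals a flat synthetic grain pattern: one scaling point per plane at (128, 100), no AR coefficients (ar_coeff_lag = 0), and a fresh random seed each time it is called. It is only invoked when tune_cfg.content == AOM_CONTENT_FILM (see encode_with_recode_loop_and_filter below).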
static void set_grain_syn_params(AV1_COMMON *cm) { aom_film_grain_t *film_grain_params = &cm->film_grain_params; film_grain_params->apply_grain = 1; film_grain_params->update_parameters = 1; film_grain_params->random_seed = rand() & 0xffff; film_grain_params->num_y_points = 1; film_grain_params->scaling_points_y[0][0] = 128; film_grain_params->scaling_points_y[0][1] = 100; if (!cm->seq_params->monochrome) { film_grain_params->num_cb_points = 1; film_grain_params->scaling_points_cb[0][0] = 128; film_grain_params->scaling_points_cb[0][1] = 100; film_grain_params->num_cr_points = 1; film_grain_params->scaling_points_cr[0][0] = 128; film_grain_params->scaling_points_cr[0][1] = 100; } else { film_grain_params->num_cb_points = 0; film_grain_params->num_cr_points = 0; } film_grain_params->chroma_scaling_from_luma = 0; film_grain_params->scaling_shift = 1; film_grain_params->ar_coeff_lag = 0; film_grain_params->ar_coeff_shift = 1; film_grain_params->overlap_flag = 1; film_grain_params->grain_scale_shift = 0; } /*!\brief Recode loop or a single loop for encoding one frame, followed by * in-loop deblocking filters, CDEF filters, and restoration filters. * * \ingroup high_level_algo * \callgraph * \callergraph * * \param[in] cpi Top-level encoder structure * \param[in] size Bitstream size * \param[out] dest Bitstream output buffer * \param[in] dest_size Bitstream output buffer size * \param[in] sse Total distortion of the frame * \param[in] rate Total rate of the frame * \param[in] largest_tile_id Tile id of the last tile * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval #AOM_CODEC_ERROR */ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size, int64_t *sse, int64_t *rate, int *largest_tile_id) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, encode_with_or_without_recode_time); #endif for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) { cpi->do_update_frame_probs_txtype[i] = 0; cpi->do_update_frame_probs_obmc[i] = 0; cpi->do_update_frame_probs_warp[i] = 0; cpi->do_update_frame_probs_interpfilter[i] = 0; } cpi->do_update_vbr_bits_off_target_fast = 0; int err; #if CONFIG_REALTIME_ONLY err = encode_without_recode(cpi); #else if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE) err = encode_without_recode(cpi); else err = encode_with_recode_loop(cpi, size, dest, dest_size); #endif #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, encode_with_or_without_recode_time); #endif if (err != AOM_CODEC_OK) { if (err == -1) { // special case as described in encode_with_recode_loop(). // Encoding was skipped. err = AOM_CODEC_OK; if (sse != NULL) *sse = INT64_MAX; if (rate != NULL) *rate = INT64_MAX; *largest_tile_id = 0; } return err; } #ifdef OUTPUT_YUV_DENOISED const AV1EncoderConfig *const oxcf = &cpi->oxcf; if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { aom_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { #if CONFIG_AV1_HIGHBITDEPTH if (seq_params->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); } else { cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); } #else cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); #endif } cm->cur_frame->buf.color_primaries = seq_params->color_primaries; cm->cur_frame->buf.transfer_characteristics = seq_params->transfer_characteristics; cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; cm->cur_frame->buf.monochrome = seq_params->monochrome; cm->cur_frame->buf.chroma_sample_position = seq_params->chroma_sample_position; cm->cur_frame->buf.color_range = seq_params->color_range; cm->cur_frame->buf.render_width = cm->render_width; cm->cur_frame->buf.render_height = cm->render_height; if (!cpi->mt_info.pipeline_lpf_mt_with_enc) set_postproc_filter_default_params(&cpi->common); if (!cm->features.allow_intrabc) { loopfilter_frame(cpi, cm); } if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) { extend_frame_borders(cpi); } #ifdef OUTPUT_YUV_REC aom_write_one_yuv_frame(cm, &cm->cur_frame->buf); #endif if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) { set_grain_syn_params(cm); } av1_finalize_encoded_frame(cpi); // Build the bitstream #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_pack_bitstream_final_time); #endif cpi->rc.coefficient_size = 0; if (av1_pack_bitstream(cpi, dest, dest_size, size, largest_tile_id) != AOM_CODEC_OK) return AOM_CODEC_ERROR; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_pack_bitstream_final_time); #endif if (cpi->rc.postencode_drop && allow_postencode_drop_rtc(cpi) && av1_postencode_drop_cbr(cpi, size)) { return AOM_CODEC_OK; } // Compute sse and rate. if (sse != NULL) { #if CONFIG_AV1_HIGHBITDEPTH *sse = (seq_params->use_highbitdepth) ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf) : aom_get_y_sse(cpi->source, &cm->cur_frame->buf); #else *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); #endif } if (rate != NULL) { const int64_t bits = (*size << 3); *rate = (bits << 5); // To match scale. 
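// The sse/rate pair computed above is consumed by encode_with_and_without_superres(), which compares the super-resolved and full-resolution encodes through RDCOST_DBL_WITH_NATIVE_BD_DIST using a common rdmult.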
} #if !CONFIG_REALTIME_ONLY if (cpi->use_ducky_encode) { PSNR_STATS psnr; aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result; frame_result->global_order_idx = cm->cur_frame->display_order_hint; frame_result->q_index = cm->quant_params.base_qindex; frame_result->rdmult = cpi->rd.RDMULT; frame_result->rate = (int)(*size) * 8; frame_result->dist = psnr.sse[0]; frame_result->psnr = psnr.psnr[0]; } #endif // !CONFIG_REALTIME_ONLY return AOM_CODEC_OK; } static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size, int *largest_tile_id) { const AV1_COMMON *const cm = &cpi->common; assert(cm->seq_params->enable_superres); assert(av1_superres_in_recode_allowed(cpi)); aom_codec_err_t err = AOM_CODEC_OK; av1_save_all_coding_context(cpi); int64_t sse1 = INT64_MAX; int64_t rate1 = INT64_MAX; int largest_tile_id1 = 0; int64_t sse2 = INT64_MAX; int64_t rate2 = INT64_MAX; int largest_tile_id2; double proj_rdcost1 = DBL_MAX; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const FRAME_UPDATE_TYPE update_type = gf_group->update_type[cpi->gf_frame_index]; const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; // Encode with superres. if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) { SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg; int64_t superres_sses[SCALE_NUMERATOR]; int64_t superres_rates[SCALE_NUMERATOR]; int superres_largest_tile_ids[SCALE_NUMERATOR]; // Use superres for Key-frames and Alt-ref frames only. if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) { for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { superres_cfg->superres_scale_denominator = denom; superres_cfg->superres_kf_scale_denominator = denom; const int this_index = denom - (SCALE_NUMERATOR + 1); cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this loop. err = encode_with_recode_loop_and_filter( cpi, size, dest, dest_size, &superres_sses[this_index], &superres_rates[this_index], &superres_largest_tile_ids[this_index]); cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). if (err != AOM_CODEC_OK) return err; restore_all_coding_context(cpi); } // Reset. superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; } else { for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { const int this_index = denom - (SCALE_NUMERATOR + 1); superres_sses[this_index] = INT64_MAX; superres_rates[this_index] = INT64_MAX; superres_largest_tile_ids[this_index] = 0; } } // Encode without superres. assert(cpi->superres_mode == AOM_SUPERRES_NONE); err = encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, &sse2, &rate2, &largest_tile_id2); if (err != AOM_CODEC_OK) return err; // Note: Both use common rdmult based on base qindex of fullres. const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( bit_depth, update_type, cm->quant_params.base_qindex, cpi->oxcf.tune_cfg.tuning); // Find the best rdcost among all superres denoms. 
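// The candidate denominators span SCALE_NUMERATOR + 1 .. 2 * SCALE_NUMERATOR; with SCALE_NUMERATOR == 8 that is 9 through 16, i.e. scaled widths from 8/9 down to 1/2 of the full frame width.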
int best_denom = -1; for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { const int this_index = denom - (SCALE_NUMERATOR + 1); const int64_t this_sse = superres_sses[this_index]; const int64_t this_rate = superres_rates[this_index]; const int this_largest_tile_id = superres_largest_tile_ids[this_index]; const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST( rdmult, this_rate, this_sse, bit_depth); if (this_rdcost < proj_rdcost1) { sse1 = this_sse; rate1 = this_rate; largest_tile_id1 = this_largest_tile_id; proj_rdcost1 = this_rdcost; best_denom = denom; } } const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); // TODO(urvang): We should avoid rerunning the recode loop by saving // previous output+state, or running encode only for the selected 'q' in // previous step. // Again, temporarily force the best denom. superres_cfg->superres_scale_denominator = best_denom; superres_cfg->superres_kf_scale_denominator = best_denom; int64_t sse3 = INT64_MAX; int64_t rate3 = INT64_MAX; cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this recode loop. err = encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, &sse3, &rate3, largest_tile_id); cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). assert(sse1 == sse3); assert(rate1 == rate3); assert(largest_tile_id1 == *largest_tile_id); // Reset. superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; } else { *largest_tile_id = largest_tile_id2; } } else { assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL); cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this recode loop. err = encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, &sse1, &rate1, &largest_tile_id1); cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). if (err != AOM_CODEC_OK) return err; restore_all_coding_context(cpi); // Encode without superres. assert(cpi->superres_mode == AOM_SUPERRES_NONE); err = encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, &sse2, &rate2, &largest_tile_id2); if (err != AOM_CODEC_OK) return err; // Note: Both use common rdmult based on base qindex of fullres. const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( bit_depth, update_type, cm->quant_params.base_qindex, cpi->oxcf.tune_cfg.tuning); proj_rdcost1 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth); const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); // TODO(urvang): We should avoid rerunning the recode loop by saving // previous output+state, or running encode only for the selected 'q' in // previous step. int64_t sse3 = INT64_MAX; int64_t rate3 = INT64_MAX; cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this recode loop. err = encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, &sse3, &rate3, largest_tile_id); cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). assert(sse1 == sse3); assert(rate1 == rate3); assert(largest_tile_id1 == *largest_tile_id); } else { *largest_tile_id = largest_tile_id2; } } return err; } // Conditions to disable cdf_update mode in selective mode for real-time. // Handle case for layers, scene change, and resizing. 
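// Returns 1 when CDF updates should be disabled for the current frame and 0 when they should stay enabled; the caller assigns the result directly to features->disable_cdf_update.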
static inline int selective_disable_cdf_rtc(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; // For single layer. if (cpi->svc.number_spatial_layers == 1 && cpi->svc.number_temporal_layers == 1) { // Don't disable on intra_only, scene change (high_source_sad = 1), // or resized frame. To avoid quality loss, force enable for ~30 frames // after a key or scene/slide change, and after 8 frames since the last // update if frame_source_sad > 0. if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) || rc->high_source_sad || rc->frames_since_key < 30 || (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30) || (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0)) return 0; else return 1; } else if (cpi->svc.number_temporal_layers > 1) { // Disable only on top temporal enhancement layer for now. return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1; } return 1; } #if !CONFIG_REALTIME_ONLY static void subtract_stats(FIRSTPASS_STATS *section, const FIRSTPASS_STATS *frame) { section->frame -= frame->frame; section->weight -= frame->weight; section->intra_error -= frame->intra_error; section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; section->pcnt_inter -= frame->pcnt_inter; section->pcnt_motion -= frame->pcnt_motion; section->pcnt_second_ref -= frame->pcnt_second_ref; section->pcnt_neutral -= frame->pcnt_neutral; section->intra_skip_pct -= frame->intra_skip_pct; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; section->mvc_abs -= frame->mvc_abs; section->MVrv -= frame->MVrv; section->MVcv -= frame->MVcv; section->mv_in_out_count -= frame->mv_in_out_count; section->new_mv_count -= frame->new_mv_count; section->count -= frame->count; section->duration -= frame->duration; } static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) { TWO_PASS *const twopass = &cpi->ppi->twopass; const FIRSTPASS_STATS *const total_stats = twopass->stats_buf_ctx->total_stats; if (is_one_pass_rt_params(cpi) || (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) || (is_fp_wavelet_energy_invalid(total_stats) == 0)) return; const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs : cpi->common.mi_params.MBs; const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source; const uint8_t *const src = unfiltered_source->y_buffer; const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; const int stride = unfiltered_source->y_stride; const BLOCK_SIZE fp_block_size = get_fp_block_size(cpi->is_screen_content_type); const int fp_block_size_width = block_size_wide[fp_block_size]; const int fp_block_size_height = block_size_high[fp_block_size]; const int num_unit_cols = get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width); const int num_unit_rows = get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height); const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8); const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8); int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input( src, stride, hbd, num_8x8_rows, num_8x8_cols); cpi->twopass_frame.frame_avg_haar_energy = log1p((double)frame_avg_wavelet_energy / num_mbs); } #endif /*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack * the bitstream * * \ingroup high_level_algo * \callgraph * \callergraph * * \param[in] cpi Top-level encoder structure * \param[in] size Bitstream size * \param[out] dest Bitstream output buffer * \param[in] dest_size Bitstream output buffer size * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval #AOM_CODEC_ERROR */ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; FeatureFlags *const features = &cm->features; const TileConfig *const tile_cfg = &oxcf->tile_cfg; assert(cpi->source != NULL); cpi->td.mb.e_mbd.cur_buf = cpi->source; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, encode_frame_to_data_rate_time); #endif #if !CONFIG_REALTIME_ONLY calculate_frame_avg_haar_energy(cpi); #endif // frame type has been decided outside of this function call cm->cur_frame->frame_type = current_frame->frame_type; cm->tiles.large_scale = tile_cfg->enable_large_scale_tile; cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding; features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); // features->allow_ref_frame_mvs needs to be written into the frame header // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case // is separated from frame_might_allow_ref_frame_mvs(). features->allow_ref_frame_mvs &= !cm->tiles.large_scale; features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion && frame_might_allow_warped_motion(cm); cpi->last_frame_type = current_frame->frame_type; if (frame_is_intra_only(cm)) { cpi->frames_since_last_update = 0; } if (frame_is_sframe(cm)) { GF_GROUP *gf_group = &cpi->ppi->gf_group; // S frame will wipe out any previously encoded altref so we cannot place // an overlay frame gf_group->update_type[gf_group->size] = GF_UPDATE; } if (encode_show_existing_frame(cm)) { #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY // TODO(angiebird): Move this into a function. 
if (oxcf->pass == AOM_RC_THIRD_PASS) { int frame_coding_idx = av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); rc_log_frame_encode_param( &cpi->rc_log, frame_coding_idx, 1, 255, cpi->ppi->gf_group.update_type[cpi->gf_frame_index]); } #endif av1_finalize_encoded_frame(cpi); // Build the bitstream int largest_tile_id = 0; // Output from bitstream: unused here cpi->rc.coefficient_size = 0; if (av1_pack_bitstream(cpi, dest, dest_size, size, &largest_tile_id) != AOM_CODEC_OK) return AOM_CODEC_ERROR; if (seq_params->frame_id_numbers_present_flag && current_frame->frame_type == KEY_FRAME) { // Displaying a forward key-frame, so reset the ref buffer IDs int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; for (int i = 0; i < REF_FRAMES; i++) cm->ref_frame_id[i] = display_frame_id; } #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. av1_dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., // for the purpose to verify no mismatch between encoder and decoder. if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; #if CONFIG_AV1_TEMPORAL_DENOISING av1_denoiser_update_ref_frame(cpi); #endif // Since we allocate a spot for the OVERLAY frame in the gf group, we need // to do post-encoding update accordingly. av1_set_target_rate(cpi, cm->width, cm->height); if (is_psnr_calc_enabled(cpi)) { cpi->source = realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width, cm->cur_frame->buf.y_crop_height); } #if !CONFIG_REALTIME_ONLY if (cpi->use_ducky_encode) { PSNR_STATS psnr; aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result; frame_result->global_order_idx = cm->cur_frame->display_order_hint; frame_result->q_index = cm->quant_params.base_qindex; frame_result->rdmult = cpi->rd.RDMULT; frame_result->rate = (int)(*size) * 8; frame_result->dist = psnr.sse[0]; frame_result->psnr = psnr.psnr[0]; } #endif // !CONFIG_REALTIME_ONLY update_counters_for_show_frame(cpi); return AOM_CODEC_OK; } // Work out whether to force_integer_mv this frame if (!is_stat_generation_stage(cpi) && cpi->common.features.allow_screen_content_tools && !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) { if (cpi->common.seq_params->force_integer_mv == 2) { // Adaptive mode: see what previous frame encoded did if (cpi->unscaled_last_source != NULL) { features->cur_frame_force_integer_mv = av1_is_integer_mv( cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info); } else { cpi->common.features.cur_frame_force_integer_mv = 0; } } else { cpi->common.features.cur_frame_force_integer_mv = cpi->common.seq_params->force_integer_mv; } } else { cpi->common.features.cur_frame_force_integer_mv = 0; } // This is used by av1_pack_bitstream. So this needs to be set in case of // row-mt where the encoding code will use a temporary structure. cpi->td.mb.e_mbd.cur_frame_force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { // Reset the loop filter deltas and segmentation map. av1_reset_segment_features(cm); // If segmentation is enabled force a map update for key frames. 
if (seg->enabled) { seg->update_map = 1; seg->update_data = 1; } } if (tile_cfg->mtu == 0) { cpi->num_tg = tile_cfg->num_tile_groups; } else { // Use a default value for the purposes of weighting costs in probability // updates cpi->num_tg = DEFAULT_MAX_NUM_TG; } // For 1 pass CBR mode: check if we are dropping this frame. if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) { // Always drop for spatial enhancement layer if layer bandwidth is 0. // Otherwise check for frame-dropping based on buffer level in // av1_rc_drop_frame(). if ((cpi->svc.spatial_layer_id > 0 && cpi->oxcf.rc_cfg.target_bandwidth == 0) || av1_rc_drop_frame(cpi)) { cpi->is_dropped_frame = true; } if (cpi->is_dropped_frame) { av1_setup_frame_size(cpi); av1_set_mv_search_params(cpi); av1_rc_postencode_update_drop_frame(cpi); release_scaled_references(cpi); cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true; // A dropped frame might not be shown but it always takes a slot in the gf // group. Therefore, even when it is not shown, we still need to update // the relevant frame counters. if (cm->show_frame) { update_counters_for_show_frame(cpi); } return AOM_CODEC_OK; } } if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM || oxcf->tune_cfg.tuning == AOM_TUNE_IQ) { av1_set_mb_ssim_rdmult_scaling(cpi); } #if CONFIG_SALIENCY_MAP else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP && !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) { if (av1_set_saliency_map(cpi) == 0) { return AOM_CODEC_MEM_ERROR; } #if !CONFIG_REALTIME_ONLY double motion_ratio = av1_setup_motion_ratio(cpi); #else double motion_ratio = 1.0; #endif if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) { return AOM_CODEC_MEM_ERROR; } } #endif #if CONFIG_TUNE_VMAF else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN || oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { av1_set_mb_vmaf_rdmult_scaling(cpi); } #endif if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI && cpi->sf.rt_sf.use_nonrd_pick_mode == 0) { av1_init_mb_wiener_var_buffer(cpi); av1_set_mb_wiener_variance(cpi); } if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) { av1_init_mb_ur_var_buffer(cpi); av1_set_mb_ur_variance(cpi); } #if CONFIG_INTERNAL_STATS memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif if (seq_params->frame_id_numbers_present_flag) { /* Non-normative definition of current_frame_id ("frame counter" with * wraparound) */ if (cm->current_frame_id == -1) { int lsb, msb; /* quasi-random initialization of current_frame_id for a key frame */ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; } else { lsb = cpi->source->y_buffer[0] & 0xff; msb = cpi->source->y_buffer[1] & 0xff; } cm->current_frame_id = ((msb << 8) + lsb) % (1 << seq_params->frame_id_length); // S_frame is meant for stitching different streams of different // resolutions together, so current_frame_id must be the // same across different streams of the same content current_frame_id // should be the same and not random. 
0x37 is a chosen number as start // point if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37; } else { cm->current_frame_id = (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) % (1 << seq_params->frame_id_length); } } switch (oxcf->algo_cfg.cdf_update_mode) { case 0: // No CDF update for any frames(4~6% compression loss). features->disable_cdf_update = 1; break; case 1: // Enable CDF update for all frames. if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame && cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2) features->disable_cdf_update = 1; else if (cpi->sf.rt_sf.selective_cdf_update) features->disable_cdf_update = selective_disable_cdf_rtc(cpi); else features->disable_cdf_update = 0; break; case 2: // Strategically determine at which frames to do CDF update. // Currently only enable CDF update for all-intra and no-show frames(1.5% // compression loss) for good qualiy or allintra mode. if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) { features->disable_cdf_update = (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; } else { features->disable_cdf_update = selective_disable_cdf_rtc(cpi); } break; } // Disable cdf update for the INTNL_ARF_UPDATE frame with // frame_parallel_level 1. if (!cpi->do_frame_data_update && cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1); features->disable_cdf_update = 1; } #if !CONFIG_REALTIME_ONLY if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) { // Flush any stale global motion information, which may be left over // from a previous frame aom_invalidate_pyramid(cpi->source->y_pyramid); av1_invalidate_corner_list(cpi->source->corners); } #endif // !CONFIG_REALTIME_ONLY int largest_tile_id = 0; if (av1_superres_in_recode_allowed(cpi)) { if (encode_with_and_without_superres(cpi, size, dest, dest_size, &largest_tile_id) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } } else { const aom_superres_mode orig_superres_mode = cpi->superres_mode; // save cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode; if (encode_with_recode_loop_and_filter(cpi, size, dest, dest_size, NULL, NULL, &largest_tile_id) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } cpi->superres_mode = orig_superres_mode; // restore } // Update reference frame ids for reference frames this frame will overwrite if (seq_params->frame_id_numbers_present_flag) { for (int i = 0; i < REF_FRAMES; i++) { if ((current_frame->refresh_frame_flags >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; } } } if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) cpi->svc.num_encoded_top_layer++; #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. av1_dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES if (cm->seg.enabled) { if (cm->seg.update_map == 0 && cm->last_frame_seg_map) { memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map, cm->cur_frame->mi_cols * cm->cur_frame->mi_rows * sizeof(*cm->cur_frame->seg_map)); } } int release_scaled_refs = 0; #if CONFIG_FPMT_TEST release_scaled_refs = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
1 : 0; #endif // CONFIG_FPMT_TEST if (release_scaled_refs || cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { if (frame_is_intra_only(cm) == 0) { release_scaled_references(cpi); } } #if CONFIG_AV1_TEMPORAL_DENOISING av1_denoiser_update_ref_frame(cpi); #endif // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., // for the purpose to verify no mismatch between encoder and decoder. if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { *cm->fc = cpi->tile_data[largest_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); } if (!cm->tiles.large_scale) { cm->cur_frame->frame_context = *cm->fc; } if (tile_cfg->enable_ext_tile_debug) { // (yunqing) This test ensures the correctness of large scale tile coding. if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) { char fn[20] = "./fc"; fn[4] = current_frame->frame_number / 100 + '0'; fn[5] = (current_frame->frame_number % 100) / 10 + '0'; fn[6] = (current_frame->frame_number % 10) + '0'; fn[7] = '\0'; av1_print_frame_contexts(cm->fc, fn); } } cpi->last_frame_type = current_frame->frame_type; if (cm->features.disable_cdf_update) { cpi->frames_since_last_update++; } else { cpi->frames_since_last_update = 1; } if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers; // Clear the one shot update flags for segmentation map and mode/ref loop // filter deltas. cm->seg.update_map = 0; cm->seg.update_data = 0; cm->lf.mode_ref_delta_update = 0; if (cm->show_frame) { update_counters_for_show_frame(cpi); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, encode_frame_to_data_rate_time); #endif return AOM_CODEC_OK; } int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, size_t dest_size, const EncodeFrameInput *const frame_input, const EncodeFrameParams *const frame_params, size_t *const frame_size) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; cpi->unscaled_source = frame_input->source; cpi->source = frame_input->source; cpi->unscaled_last_source = frame_input->last_source; current_frame->refresh_frame_flags = frame_params->refresh_frame_flags; cm->features.error_resilient_mode = frame_params->error_resilient_mode; cm->features.primary_ref_frame = frame_params->primary_ref_frame; cm->current_frame.frame_type = frame_params->frame_type; cm->show_frame = frame_params->show_frame; cpi->ref_frame_flags = frame_params->ref_frame_flags; cpi->speed = frame_params->speed; cm->show_existing_frame = frame_params->show_existing_frame; cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show; memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx, REF_FRAMES * sizeof(*cm->remapped_ref_idx)); memcpy(&cpi->refresh_frame, &frame_params->refresh_frame, sizeof(cpi->refresh_frame)); if (current_frame->frame_type == KEY_FRAME && cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { current_frame->frame_number = 0; } current_frame->order_hint = current_frame->frame_number + frame_params->order_offset; current_frame->display_order_hint = current_frame->order_hint; current_frame->order_hint %= (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1)); current_frame->pyramid_level = get_true_pyr_level( cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth); if (is_stat_generation_stage(cpi)) { #if 
!CONFIG_REALTIME_ONLY if (cpi->oxcf.q_cfg.use_fixed_qp_offsets) av1_noop_first_pass_frame(cpi, frame_input->ts_duration); else av1_first_pass(cpi, frame_input->ts_duration); #endif } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS || cpi->oxcf.pass >= AOM_RC_SECOND_PASS) { if (encode_frame_to_data_rate(cpi, frame_size, dest, dest_size) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; } } else { return AOM_CODEC_ERROR; } return AOM_CODEC_OK; } #if CONFIG_DENOISE && !CONFIG_REALTIME_ONLY static int apply_denoise_2d(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *sd, int block_size, float noise_level, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; if (!cpi->denoise_and_model) { cpi->denoise_and_model = aom_denoise_and_model_alloc( cm->seq_params->bit_depth, block_size, noise_level); if (!cpi->denoise_and_model) { aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating denoise and model"); return -1; } } if (!cpi->film_grain_table) { cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); if (!cpi->film_grain_table) { aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating grain table"); return -1; } memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); } if (aom_denoise_and_model_run(cpi->denoise_and_model, sd, &cm->film_grain_params, cpi->oxcf.enable_dnl_denoising)) { if (cm->film_grain_params.apply_grain) { aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time, &cm->film_grain_params); } } return 0; } #endif int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; #if CONFIG_TUNE_VMAF if (!is_stat_generation_stage(cpi) && cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) { av1_vmaf_frame_preprocessing(cpi, sd); } if (!is_stat_generation_stage(cpi) && cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) { av1_vmaf_blk_preprocessing(cpi, sd); } #endif #if CONFIG_INTERNAL_STATS struct aom_usec_timer timer; aom_usec_timer_start(&timer); #endif #if CONFIG_AV1_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif #if CONFIG_DENOISE // even if denoise_noise_level is > 0, we don't need need to denoise on pass // 1 of 2 if enable_dnl_denoising is disabled since the 2nd pass will be // encoding the original (non-denoised) frame if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS && !cpi->oxcf.enable_dnl_denoising)) { #if !CONFIG_REALTIME_ONLY // Choose a synthetic noise level for still images for enhanced perceptual // quality based on an estimated noise level in the source, but only if // the noise level is set on the command line to > 0. if (cpi->oxcf.mode == ALLINTRA) { // No noise synthesis if source is very clean. // Uses a low edge threshold to focus on smooth areas. // Increase output noise setting a little compared to measured value. 
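// Worked example with hypothetical numbers: an estimated luma noise level
// of 1.3 becomes 1.3 - 0.1 = 1.2; since the result is positive, 0.5 is
// added to give 1.7, and the value is finally clamped to [0.0, 5.0] before
// being used as the synthesis noise level in apply_denoise_2d() below.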
double y_noise_level = 0.0; av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y, cm->seq_params->bit_depth, 16); cpi->oxcf.noise_level = (float)(y_noise_level - 0.1); cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level); if (cpi->oxcf.noise_level > 0.0) { cpi->oxcf.noise_level += (float)0.5; } cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level); } if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, cpi->oxcf.noise_level, time_stamp, end_time) < 0) res = -1; #endif // !CONFIG_REALTIME_ONLY } #endif // CONFIG_DENOISE if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time, use_highbitdepth, cpi->alloc_pyramid, frame_flags)) { aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed"); res = -1; } #if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&timer); cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer); #endif // Note: Regarding profile setting, the following checks are added to help // choose a proper profile for the input video. The criterion is that all // bitstreams must be designated as the lowest profile that match its content. // E.G. A bitstream that contains 4:4:4 video must be designated as High // Profile in the seq header, and likewise a bitstream that contains 4:2:2 // bitstream must be designated as Professional Profile in the sequence // header. if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 1 requires 4:4:4 color format"); res = -1; } if ((seq_params->profile == PROFILE_2) && (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); res = -1; } return res; } #if CONFIG_ENTROPY_STATS void print_entropy_stats(AV1_PRIMARY *const ppi) { if (!ppi->cpi) return; if (ppi->cpi->oxcf.pass != 1 && ppi->cpi->common.current_frame.frame_number > 0) { fprintf(stderr, "Writing counts.stt\n"); FILE *f = fopen("counts.stt", "wb"); fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f); fclose(f); } } #endif // CONFIG_ENTROPY_STATS #if CONFIG_INTERNAL_STATS static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[STAT_Y] += y; s->stat[STAT_U] += u; s->stat[STAT_V] += v; s->stat[STAT_ALL] += all; s->worst = AOMMIN(s->worst, all); } static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { AV1_PRIMARY *const ppi = cpi->ppi; AV1_COMMON *const cm = &cpi->common; double samples = 0.0; const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) return; #if CONFIG_INTER_STATS_ONLY if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame #endif cpi->bytes += frame_bytes; if (cm->show_frame) { const YV12_BUFFER_CONFIG *orig = cpi->source; const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; double y, u, v, frame_all; ppi->count[0]++; ppi->count[1]++; if (cpi->ppi->b_calculate_psnr) { PSNR_STATS psnr; double weight[2] = { 0.0, 0.0 }; double frame_ssim2[2] = { 0.0, 0.0 }; #if CONFIG_AV1_HIGHBITDEPTH aom_calc_highbd_psnr(orig, recon, 
&psnr, bit_depth, in_bit_depth); #else aom_calc_psnr(orig, recon, &psnr); #endif adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], &(ppi->psnr[0])); ppi->total_sq_error[0] += psnr.sse[0]; ppi->total_samples[0] += psnr.samples[0]; samples = psnr.samples[0]; aom_calc_ssim(orig, recon, bit_depth, in_bit_depth, cm->seq_params->use_highbitdepth, weight, frame_ssim2); ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]); ppi->summed_quality += frame_ssim2[0] * weight[0]; ppi->summed_weights += weight[0]; #if CONFIG_AV1_HIGHBITDEPTH // Compute PSNR based on stream bit depth if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3], psnr.psnr_hbd[0], &ppi->psnr[1]); ppi->total_sq_error[1] += psnr.sse_hbd[0]; ppi->total_samples[1] += psnr.samples_hbd[0]; ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]); ppi->summed_quality_hbd += frame_ssim2[1] * weight[1]; ppi->summed_weights_hbd += weight[1]; } #endif #if 0 { FILE *f = fopen("q_used.stt", "a"); double y2 = psnr.psnr[1]; double u2 = psnr.psnr[2]; double v2 = psnr.psnr[3]; double frame_psnr2 = psnr.psnr[0]; fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", cm->current_frame.frame_number, y2, u2, v2, frame_psnr2, frame_ssim2); fclose(f); } #endif } if (ppi->b_calculate_blockiness) { if (!cm->seq_params->use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness); ppi->total_blockiness += frame_blockiness; } if (ppi->b_calculate_consistency) { if (!cm->seq_params->use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1); const double peak = (double)((1 << in_bit_depth) - 1); const double consistency = aom_sse_to_psnr(samples, peak, ppi->total_inconsistency); if (consistency > 0.0) ppi->worst_consistency = AOMMIN(ppi->worst_consistency, consistency); ppi->total_inconsistency += this_inconsistency; } } } frame_all = aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); adjust_image_stat(y, u, v, frame_all, &ppi->fastssim); frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs); } } void print_internal_stats(AV1_PRIMARY *ppi) { if (!ppi->cpi) return; AV1_COMP *const cpi = ppi->cpi; if (ppi->cpi->oxcf.pass != 1 && ppi->cpi->common.current_frame.frame_number > 0) { char headings[512] = { 0 }; char results[512] = { 0 }; FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / 10000000.000; double total_encode_time = (ppi->total_time_receive_data + ppi->total_time_compress_data) / 1000.000; const double dr = (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded; const double peak = (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1); const double target_rate = (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000; const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); if (ppi->b_calculate_psnr) { const double total_psnr = aom_sse_to_psnr( (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]); const double total_ssim = 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0); 
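// The aggregate figures below are sequence-wide: total_psnr comes from the
// accumulated squared error and sample counts via aom_sse_to_psnr()
// (effectively 10 * log10(samples * peak^2 / sse)), while the per-frame
// averages divide each accumulated stat by the shown-frame count in
// count[0].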
snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" "AVPsrnY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f", dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, total_ssim, total_ssim, ppi->fastssim.stat[STAT_ALL] / ppi->count[0], ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst, ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst, ppi->psnr[0].stat[STAT_Y] / ppi->count[0], ppi->psnr[0].stat[STAT_U] / ppi->count[0], ppi->psnr[0].stat[STAT_V] / ppi->count[0]); if (ppi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]); SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness); } if (ppi->b_calculate_consistency) { double consistency = aom_sse_to_psnr((double)ppi->total_samples[0], peak, (double)ppi->total_inconsistency); SNPRINT(headings, "\tConsist\tWstCons"); SNPRINT2(results, "\t%7.3f", consistency); SNPRINT2(results, "\t%7.3f", ppi->worst_consistency); } SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); SNPRINT2(results, "\t%8.0f", total_encode_time); SNPRINT2(results, " %7.2f", rate_err); SNPRINT2(results, " %7.2f", fabs(rate_err)); SNPRINT(headings, "\tAPsnr611"); SNPRINT2(results, " %7.3f", (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] + ppi->psnr[0].stat[STAT_V]) / (ppi->count[0] * 8)); #if CONFIG_AV1_HIGHBITDEPTH const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = ppi->seq_params.bit_depth; // Since cpi->source->flags is not available here, but total_samples[1] // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was // true in compute_internal_stats if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) { const double peak_hbd = (double)((1 << bit_depth) - 1); const double total_psnr_hbd = aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd, (double)ppi->total_sq_error[1]); const double total_ssim_hbd = 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0); SNPRINT(headings, "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" " AOMSSIMH VPSSIMPH WstSsimH"); SNPRINT2(results, "\t%7.3f", ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); SNPRINT2(results, " %7.3f", total_psnr_hbd); SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); SNPRINT2(results, " %7.3f", total_psnr_hbd); SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]); SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]); SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]); SNPRINT2(results, " %7.3f", ppi->psnr[1].worst); SNPRINT2(results, " %7.3f", total_ssim_hbd); SNPRINT2(results, " %7.3f", total_ssim_hbd); SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd); } #endif fprintf(f, "%s\n", headings); fprintf(f, "%s\n", results); } fclose(f); aom_free(ppi->ssim_vars); ppi->ssim_vars = NULL; } } #endif // CONFIG_INTERNAL_STATS static inline void update_keyframe_counters(AV1_COMP *cpi) { if (cpi->common.show_frame && cpi->rc.frames_to_key) { #if !CONFIG_REALTIME_ONLY FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info; if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) { 
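// Enough past stats are buffered: advance the current index and also pop
// the oldest past entry.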
av1_firstpass_info_move_cur_index_and_pop(firstpass_info); } else { // When there is not enough past stats, we move the current // index without popping the past stats av1_firstpass_info_move_cur_index(firstpass_info); } #endif if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; cpi->rc.frames_to_fwd_kf--; cpi->rc.frames_since_scene_change++; } } } static inline void update_frames_till_gf_update(AV1_COMP *cpi) { // TODO(weitinglin): Updating this counter for is_frame_droppable // is a work-around to handle the condition when a frame is drop. // We should fix the cpi->common.show_frame flag // instead of checking the other condition to update the counter properly. if (cpi->common.show_frame || is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) { // Decrement count down till next gf if (cpi->rc.frames_till_gf_update_due > 0) cpi->rc.frames_till_gf_update_due--; } } static inline void update_gf_group_index(AV1_COMP *cpi) { // Increment the gf group index ready for the next frame. if (is_one_pass_rt_params(cpi) && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { ++cpi->gf_frame_index; // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH // for real time encoding. if (cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH) cpi->gf_frame_index = 0; } else { ++cpi->gf_frame_index; } } static void update_fb_of_context_type(const AV1_COMP *const cpi, int *const fb_of_context_type) { const AV1_COMMON *const cm = &cpi->common; const int current_frame_ref_type = get_current_frame_ref_type(cpi); if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || cpi->ext_flags.use_primary_ref_none) { for (int i = 0; i < REF_FRAMES; i++) { fb_of_context_type[i] = -1; } fb_of_context_type[current_frame_ref_type] = cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME) : get_ref_frame_map_idx(cm, ALTREF_FRAME); } if (!encode_show_existing_frame(cm)) { // Refresh fb_of_context_type[]: see encoder.h for explanation if (cm->current_frame.frame_type == KEY_FRAME) { // All ref frames are refreshed, pick one that will live long enough fb_of_context_type[current_frame_ref_type] = 0; } else { // If more than one frame is refreshed, it doesn't matter which one we // pick so pick the first. LST sometimes doesn't refresh any: this is ok for (int i = 0; i < REF_FRAMES; i++) { if (cm->current_frame.refresh_frame_flags & (1 << i)) { fb_of_context_type[current_frame_ref_type] = i; break; } } } } } static void update_rc_counts(AV1_COMP *cpi) { update_keyframe_counters(cpi); update_frames_till_gf_update(cpi); update_gf_group_index(cpi); } static void update_end_of_frame_stats(AV1_COMP *cpi) { if (cpi->do_frame_data_update) { // Store current frame loopfilter levels in ppi, if update flag is set. if (!cpi->common.show_existing_frame) { AV1_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; cpi->ppi->filter_level[0] = lf->filter_level[0]; cpi->ppi->filter_level[1] = lf->filter_level[1]; cpi->ppi->filter_level_u = lf->filter_level_u; cpi->ppi->filter_level_v = lf->filter_level_v; } } // Store frame level mv_stats from cpi to ppi. 
cpi->ppi->mv_stats = cpi->mv_stats; } // Updates frame level stats related to global motion static inline void update_gm_stats(AV1_COMP *cpi) { FRAME_UPDATE_TYPE update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; int i, is_gm_present = 0; // Check if the current frame has any valid global motion model across its // reference frames for (i = 0; i < REF_FRAMES; i++) { if (cpi->common.global_motion[i].wmtype != IDENTITY) { is_gm_present = 1; break; } } int update_actual_stats = 1; #if CONFIG_FPMT_TEST update_actual_stats = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; if (!update_actual_stats) { if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) { cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present; } else { cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present; } int show_existing_between_parallel_frames = (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); if (cpi->do_frame_data_update == 1 && !show_existing_between_parallel_frames) { for (i = 0; i < FRAME_UPDATE_TYPES; i++) { cpi->ppi->valid_gm_model_found[i] = cpi->ppi->temp_valid_gm_model_found[i]; } } } #endif if (update_actual_stats) { if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) { cpi->ppi->valid_gm_model_found[update_type] = is_gm_present; } else { cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present; } } } void av1_post_encode_updates(AV1_COMP *const cpi, const AV1_COMP_DATA *const cpi_data) { AV1_PRIMARY *const ppi = cpi->ppi; AV1_COMMON *const cm = &cpi->common; update_gm_stats(cpi); #if !CONFIG_REALTIME_ONLY // Update the total stats remaining structure. if (cpi->twopass_frame.this_frame != NULL && ppi->twopass.stats_buf_ctx->total_left_stats) { subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats, cpi->twopass_frame.this_frame); } #endif #if CONFIG_OUTPUT_FRAME_SIZE FILE *f = fopen("frame_sizes.csv", "a"); fprintf(f, "%d,", 8 * (int)cpi_data->frame_size); fprintf(f, "%d\n", cm->quant_params.base_qindex); fclose(f); #endif // CONFIG_OUTPUT_FRAME_SIZE if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) { // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel // encode set of lower layer frames. // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid // copy. if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 && ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 && ppi->gf_group.update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) { memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy, sizeof(cm->ref_frame_map)); } refresh_reference_frames(cpi); // For frame_parallel_level 1 frame in a parallel encode set of lower layer // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy. 
if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 && ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map, sizeof(cm->ref_frame_map)); } av1_rc_postencode_update(cpi, cpi_data->frame_size); } if (cpi_data->pop_lookahead == 1) { av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush, cpi->compressor_stage); } if (cpi->common.show_frame) { cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start; cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end; } if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { // Initialize level info. at the beginning of each sequence. if (cm->current_frame.frame_type == KEY_FRAME && ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { av1_init_level_info(cpi); } av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start, cpi_data->ts_frame_end); } if (!is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi); #endif update_fb_of_context_type(cpi, ppi->fb_of_context_type); update_rc_counts(cpi); update_end_of_frame_stats(cpi); } #if CONFIG_THREE_PASS if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) { av1_pop_third_pass_info(cpi->third_pass_ctx); } #endif if (ppi->rtc_ref.set_ref_frame_config && !cpi->is_dropped_frame) { av1_svc_update_buffer_slot_refreshed(cpi); av1_svc_set_reference_was_previous(cpi); } if (ppi->use_svc) av1_save_layer_context(cpi); // Note *size = 0 indicates a dropped frame for which psnr is not calculated if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) { if (cm->show_existing_frame || (!is_stat_generation_stage(cpi) && cm->show_frame)) { generate_psnr_packet(cpi); } } #if CONFIG_INTERNAL_STATS if (!is_stat_generation_stage(cpi)) { compute_internal_stats(cpi, (int)cpi_data->frame_size); } #endif // CONFIG_INTERNAL_STATS #if CONFIG_THREE_PASS // Write frame info. Subtract 1 from frame index since if was incremented in // update_rc_counts. av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1); #endif } int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(cm->error->jmp)) { cm->error->setjmp = 0; return cm->error->error_code; } cm->error->setjmp = 1; #if CONFIG_INTERNAL_STATS cpi->frame_recode_hits = 0; cpi->time_compress_data = 0; cpi->bytes = 0; #endif #if CONFIG_ENTROPY_STATS if (cpi->compressor_stage == ENCODE_STAGE) { av1_zero(cpi->counts); } #endif #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads <= 1 && "bitstream debug tool does not support multithreading"); bitstream_queue_record_write(); if (cm->seq_params->order_hint_info.enable_order_hint) { aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 + cm->show_frame); } else { // This is currently used in RTC encoding. cm->show_frame is always 1. 
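// Without order hints the raw frame_number indexes the debug write queue;
// with order hints enabled (branch above) the index is
// order_hint * 2 + show_frame, e.g. a shown frame with order_hint 3 is
// recorded as 3 * 2 + 1 = 7.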
aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number); } #endif if (cpi->ppi->use_svc) { av1_one_pass_cbr_svc_start_layer(cpi); } cpi->is_dropped_frame = false; cm->showable_frame = 0; cpi_data->frame_size = 0; cpi->available_bs_size = cpi_data->cx_data_sz; #if CONFIG_INTERNAL_STATS struct aom_usec_timer cmptimer; aom_usec_timer_start(&cmptimer); #endif av1_set_high_precision_mv(cpi, 1, 0); // Normal defaults cm->features.refresh_frame_context = oxcf->tool_cfg.frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_DISABLED : REFRESH_FRAME_CONTEXT_BACKWARD; if (oxcf->tile_cfg.enable_large_scale_tile) cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; if (assign_cur_frame_new_fb(cm) == NULL) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Failed to allocate new cur_frame"); } #if CONFIG_COLLECT_COMPONENT_TIMING // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case. if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) start_timing(cpi, av1_encode_strategy_time); #endif const int result = av1_encode_strategy( cpi, &cpi_data->frame_size, cpi_data->cx_data, cpi_data->cx_data_sz, &cpi_data->lib_flags, &cpi_data->ts_frame_start, &cpi_data->ts_frame_end, cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush); #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) end_timing(cpi, av1_encode_strategy_time); // Print out timing information. // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of // show_existing_frame and lag-in-frames. if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) && cpi->frame_component_time[0] > 100) { int i; uint64_t frame_total = 0, total = 0; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group, cpi->gf_frame_index); fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update " "Type: %d, Q: %d\n", cm->current_frame.frame_number, get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame, frame_update_type, cm->quant_params.base_qindex); for (i = 0; i < kTimingComponents; i++) { cpi->component_time[i] += cpi->frame_component_time[i]; // Use av1_encode_strategy_time (i = 0) as the total time. if (i == 0) { frame_total = cpi->frame_component_time[0]; total = cpi->component_time[0]; } fprintf(stderr, " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 " us [%6.2f%%])\n", get_component_name(i), cpi->frame_component_time[i], (float)((float)cpi->frame_component_time[i] * 100.0 / (float)frame_total), cpi->component_time[i], (float)((float)cpi->component_time[i] * 100.0 / (float)total)); cpi->frame_component_time[i] = 0; } } #endif // Reset the flag to 0 afer encoding. cpi->rc.use_external_qp_one_pass = 0; if (result == -1) { cm->error->setjmp = 0; // Returning -1 indicates no frame encoded; more input is required return -1; } if (result != AOM_CODEC_OK) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Failed to encode frame"); } #if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&cmptimer); cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); #endif // CONFIG_INTERNAL_STATS #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) { cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count; cpi->td.mb.txfm_search_info.tx_search_count = 0; } #endif // CONFIG_SPEED_STATS cm->error->setjmp = 0; return AOM_CODEC_OK; } // Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode // set. 
Also sets the bitmask 'ref_buffers_used_map'. static void scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) { AV1_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, ref_frame); if (ref == NULL) { cpi->scaled_ref_buf[ref_frame - 1] = NULL; continue; } // FPMT does not support scaling yet. assert(ref->y_crop_width == cm->width && ref->y_crop_height == cm->height); RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); cpi->scaled_ref_buf[ref_frame - 1] = buf; for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) { if (&cm->buffer_pool->frame_bufs[i] == buf) { *ref_buffers_used_map |= (1 << i); } } } else { if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; } } } // Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf // corresponding to frames in a parallel encode set. static void increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool, int ref_buffers_used_map) { for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) { if (ref_buffers_used_map & (1 << i)) { ++buffer_pool->frame_bufs[i].ref_count; } } } // Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode // set. void av1_release_scaled_references_fpmt(AV1_COMP *cpi) { // TODO(isbs): only refresh the necessary frames, rather than all of them for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; if (buf != NULL) { cpi->scaled_ref_buf[i] = NULL; } } } // Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf // corresponding to frames in a parallel encode set. void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool, int ref_buffers_used_map) { for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) { if (ref_buffers_used_map & (1 << i)) { --buffer_pool->frame_bufs[i].ref_count; } } } // Initialize parallel frame contexts with screen content decisions. void av1_init_sc_decisions(AV1_PRIMARY *const ppi) { AV1_COMP *const first_cpi = ppi->cpi; for (int i = 1; i < ppi->num_fp_contexts; ++i) { AV1_COMP *cur_cpi = ppi->parallel_cpi[i]; cur_cpi->common.features.allow_screen_content_tools = first_cpi->common.features.allow_screen_content_tools; cur_cpi->common.features.allow_intrabc = first_cpi->common.features.allow_intrabc; cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools; cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type; } } AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi, AV1_COMP_DATA *const first_cpi_data) { int cpi_idx = 0; // Loop over parallel_cpi to find the cpi that processed the current // gf_frame_index ahead of time. for (int i = 1; i < ppi->num_fp_contexts; i++) { if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) { cpi_idx = i; break; } } assert(cpi_idx > 0); assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame); // Release the previously-used frame-buffer. if (ppi->cpi->common.cur_frame != NULL) { --ppi->cpi->common.cur_frame->ref_count; ppi->cpi->common.cur_frame = NULL; } // Swap the appropriate parallel_cpi with the parallel_cpi[0]. ppi->cpi = ppi->parallel_cpi[cpi_idx]; ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0]; ppi->parallel_cpi[0] = ppi->cpi; // Copy appropriate parallel_frames_data to local data. 
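// The selected parallel cpi produced its bitstream ahead of time; the block
// below copies that cached output (plus timestamps and flags) from
// ppi->parallel_frames_data into the caller's first_cpi_data, erroring out
// if the destination buffer is too small.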
{ AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1]; assert(data->frame_size > 0); if (data->frame_size > first_cpi_data->cx_data_sz) { aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "first_cpi_data->cx_data buffer full"); } first_cpi_data->lib_flags = data->lib_flags; first_cpi_data->ts_frame_start = data->ts_frame_start; first_cpi_data->ts_frame_end = data->ts_frame_end; memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size); first_cpi_data->frame_size = data->frame_size; if (ppi->cpi->common.show_frame) { first_cpi_data->pop_lookahead = 1; } } return ppi->cpi; } // Initialises frames belonging to a parallel encode set. int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data, AV1_PRIMARY *const ppi, int *ref_buffers_used_map) { AV1_COMP *const first_cpi = ppi->cpi; GF_GROUP *const gf_group = &ppi->gf_group; int gf_index_start = first_cpi->gf_frame_index; assert(gf_group->frame_parallel_level[gf_index_start] == 1); int parallel_frame_count = 0; int cur_frame_num = first_cpi->common.current_frame.frame_number; int show_frame_count = first_cpi->frame_index_set.show_frame_count; int frames_since_key = first_cpi->rc.frames_since_key; int frames_to_key = first_cpi->rc.frames_to_key; int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf; int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start]; const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in; assert(*ref_buffers_used_map == 0); // Release the previously used frame-buffer by a frame_parallel_level 1 frame. if (first_cpi->common.cur_frame != NULL) { --first_cpi->common.cur_frame->ref_count; first_cpi->common.cur_frame = NULL; } RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES]; init_ref_map_pair(first_cpi, first_ref_frame_map_pairs); memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs, sizeof(RefFrameMapPair) * REF_FRAMES); // Store the reference refresh index of frame_parallel_level 1 frame in a // parallel encode set of lower layer frames. if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf( first_cpi, ref_frame_map_pairs, gf_index_start); assert(first_cpi->ref_refresh_index != INVALID_IDX && first_cpi->ref_refresh_index < REF_FRAMES); first_cpi->refresh_idx_available = true; // Update ref_frame_map_pairs. ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order = gf_group->display_idx[gf_index_start]; ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level = gf_group->layer_depth[gf_index_start]; } // Set do_frame_data_update flag as false for frame_parallel_level 1 frame. first_cpi->do_frame_data_update = false; if (gf_group->arf_src_offset[gf_index_start] == 0) { first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame; first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame; } av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi, gf_index_start, 1, first_cpi->common.remapped_ref_idx); scale_references_fpmt(first_cpi, ref_buffers_used_map); parallel_frame_count++; // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2 // frames which are part of the current parallel encode set and initialize the // required cpi elements. for (int i = gf_index_start + 1; i < gf_group->size; i++) { // Update frame counters if previous frame was show frame or show existing // frame. 
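// arf_src_offset == 0 marks a displayed (or show-existing) frame, so the
// running frame number, show-frame count, key-frame distances and the
// first-pass stats pointer all advance; ARF frames leave them unchanged.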
if (gf_group->arf_src_offset[i - 1] == 0) { cur_frame_num++; show_frame_count++; if (frames_to_fwd_kf <= 0) frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist; if (frames_to_key) { frames_since_key++; frames_to_key--; frames_to_fwd_kf--; } stats_in++; } cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i]; if (gf_group->frame_parallel_level[i] == 2) { AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count]; AV1_COMP_DATA *cur_cpi_data = &ppi->parallel_frames_data[parallel_frame_count - 1]; cur_cpi->gf_frame_index = i; cur_cpi->framerate = first_cpi->framerate; cur_cpi->common.current_frame.frame_number = cur_frame_num; cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i]; cur_cpi->frame_index_set.show_frame_count = show_frame_count; cur_cpi->rc.frames_since_key = frames_since_key; cur_cpi->rc.frames_to_key = frames_to_key; cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf; cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality; cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth; cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth; cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth; cur_cpi->rc.intervals_till_gf_calculate_due = first_cpi->rc.intervals_till_gf_calculate_due; cur_cpi->mv_search_params.max_mv_magnitude = first_cpi->mv_search_params.max_mv_magnitude; if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) { cur_cpi->common.lf.mode_ref_delta_enabled = 1; } cur_cpi->do_frame_data_update = false; // Initialize prev_ts_start and prev_ts_end for show frame(s) and show // existing frame(s). if (gf_group->arf_src_offset[i] == 0) { // Choose source of prev frame. int src_index = gf_group->src_offset[i]; struct lookahead_entry *prev_source = av1_lookahead_peek( ppi->lookahead, src_index - 1, cur_cpi->compressor_stage); // Save timestamps of prev frame. cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start; cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end; } cur_cpi->time_stamps.first_ts_start = first_cpi->time_stamps.first_ts_start; memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map, sizeof(first_cpi->common.ref_frame_map)); cur_cpi_data->lib_flags = 0; cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio; cur_cpi_data->flush = first_cpi_data->flush; cur_cpi_data->frame_size = 0; if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { // If the first frame in a parallel encode set is INTNL_ARF_UPDATE // frame, initialize lib_flags of frame_parallel_level 2 frame in the // set with that of frame_parallel_level 1 frame. cur_cpi_data->lib_flags = first_cpi_data->lib_flags; // Store the reference refresh index of frame_parallel_level 2 frame in // a parallel encode set of lower layer frames. cur_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i); cur_cpi->refresh_idx_available = true; // Skip the reference frame which will be refreshed by // frame_parallel_level 1 frame in a parallel encode set of lower layer // frames. 
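// The slot recorded in first_cpi->ref_refresh_index (computed above for the
// level 1 frame) is excluded when this level 2 frame picks its own refresh
// index.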
cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index; } else { cur_cpi->ref_idx_to_skip = INVALID_IDX; cur_cpi->ref_refresh_index = INVALID_IDX; cur_cpi->refresh_idx_available = false; } cur_cpi->twopass_frame.stats_in = stats_in; av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i, 1, cur_cpi->common.remapped_ref_idx); scale_references_fpmt(cur_cpi, ref_buffers_used_map); parallel_frame_count++; } // Set do_frame_data_update to true for the last frame_parallel_level 2 // frame in the current parallel encode set. if (i == (gf_group->size - 1) || (gf_group->frame_parallel_level[i + 1] == 0 && (gf_group->update_type[i + 1] == ARF_UPDATE || gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) || gf_group->frame_parallel_level[i + 1] == 1) { ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true; break; } } increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool, *ref_buffers_used_map); // Return the number of frames in the parallel encode set. return parallel_frame_count; } int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { AV1_COMMON *cm = &cpi->common; if (!cm->show_frame) { return -1; } else { int ret; if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) { *dest = cm->cur_frame->buf; dest->y_width = cm->width; dest->y_height = cm->height; dest->uv_width = cm->width >> cm->seq_params->subsampling_x; dest->uv_height = cm->height >> cm->seq_params->subsampling_y; ret = 0; } else { ret = -1; } return ret; } } int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { if (cpi->last_show_frame_buf == NULL || cpi->oxcf.algo_cfg.skip_postproc_filtering) return -1; *frame = cpi->last_show_frame_buf->buf; return 0; } aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *sd) { const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); return cm->error->error_code; } int av1_set_internal_size(AV1EncoderConfig *const oxcf, ResizePendingParams *resize_pending_params, AOM_SCALING_MODE horiz_mode, AOM_SCALING_MODE vert_mode) { int hr = 0, hs = 0, vr = 0, vs = 0; // Checks for invalid AOM_SCALING_MODE values. if (horiz_mode > AOME_ONETHREE || vert_mode > AOME_ONETHREE) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); // always go to the next whole number resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs; resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs; if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) { oxcf->resize_cfg.resize_mode = RESIZE_FIXED; oxcf->algo_cfg.enable_tpl_model = 0; } return 0; } int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.quant_params.base_qindex; } int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t buffer_size, size_t *frame_size) { assert(*frame_size <= buffer_size); size_t output_size = 0; size_t remaining_size = *frame_size; uint8_t *buff_ptr = buffer; // go through each OBUs while (remaining_size > 0) { uint8_t saved_obu_header[2]; uint64_t obu_payload_size; size_t length_of_payload_size; size_t length_of_obu_size; const uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 
2 : 1; size_t obu_bytes_read = obu_header_size; // bytes read for current obu // save the obu header (1 or 2 bytes) memcpy(saved_obu_header, buff_ptr, obu_header_size); // clear the obu_has_size_field saved_obu_header[0] &= ~0x2; // get the payload_size and length of payload_size if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size - obu_header_size, &obu_payload_size, &length_of_payload_size) != 0) { return AOM_CODEC_ERROR; } obu_bytes_read += length_of_payload_size; // calculate the length of size of the obu header plus payload const uint64_t obu_size = obu_header_size + obu_payload_size; length_of_obu_size = aom_uleb_size_in_bytes(obu_size); if (length_of_obu_size + obu_header_size > buffer_size - output_size - (remaining_size - obu_bytes_read)) { return AOM_CODEC_ERROR; } // move the rest of data to new location memmove(buff_ptr + length_of_obu_size + obu_header_size, buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); obu_bytes_read += (size_t)obu_payload_size; // write the new obu size size_t coded_obu_size; if (aom_uleb_encode(obu_size, length_of_obu_size, buff_ptr, &coded_obu_size) != 0 || coded_obu_size != length_of_obu_size) { return AOM_CODEC_ERROR; } // write the saved (modified) obu_header following obu size memcpy(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); remaining_size -= obu_bytes_read; buff_ptr += length_of_obu_size + (size_t)obu_size; output_size += length_of_obu_size + (size_t)obu_size; } *frame_size = output_size; return AOM_CODEC_OK; } static void rtc_set_updates_ref_frame_config( ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, RTC_REF *const rtc_ref) { ext_refresh_frame_flags->update_pending = 1; ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]]; ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]]; ext_refresh_frame_flags->bwd_ref_frame = rtc_ref->refresh[rtc_ref->ref_idx[4]]; ext_refresh_frame_flags->alt2_ref_frame = rtc_ref->refresh[rtc_ref->ref_idx[5]]; ext_refresh_frame_flags->alt_ref_frame = rtc_ref->refresh[rtc_ref->ref_idx[6]]; rtc_ref->non_reference_frame = 1; for (int i = 0; i < REF_FRAMES; i++) { if (rtc_ref->refresh[i] == 1) { rtc_ref->non_reference_frame = 0; break; } } } static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) { // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). int ref = AOM_REFFRAME_ALL; for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i); } return ref; } void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { // TODO(yunqingwang): For what references to use, external encoding flags // should be consistent with internal reference frame selection. Need to // ensure that there is not conflict between the two. In AV1 encoder, the // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, // GOLDEN, BWDREF, ALTREF2. 
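// Illustrative example (hypothetical flag combination): passing
// AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_UPD_GF clears AOM_LAST_FLAG from
// ext_flags->ref_frame_flags in the first block below and zeroes the
// golden-frame refresh flag (with update_pending set) in the second.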
ExternalFlags *const ext_flags = &cpi->ext_flags; ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = &ext_flags->refresh_frame; ext_flags->ref_frame_flags = AOM_REFFRAME_ALL; if (flags & (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2)) { int ref = AOM_REFFRAME_ALL; if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG; if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; if (flags & AOM_EFLAG_NO_REF_ARF) { ref ^= AOM_ALT_FLAG; ref ^= AOM_BWD_FLAG; ref ^= AOM_ALT2_FLAG; } else { if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; } av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } else { if (cpi->ppi->rtc_ref.set_ref_frame_config) { int ref = rtc_set_references_external_ref_frame_config(cpi); av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } } if (flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { int upd = AOM_REFFRAME_ALL; // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; if (flags & AOM_EFLAG_NO_UPD_ARF) { upd ^= AOM_ALT_FLAG; upd ^= AOM_BWD_FLAG; upd ^= AOM_ALT2_FLAG; } ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0; ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0; ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; ext_refresh_frame_flags->update_pending = 1; } else { if (cpi->ppi->rtc_ref.set_ref_frame_config) rtc_set_updates_ref_frame_config(ext_refresh_frame_flags, &cpi->ppi->rtc_ref); else ext_refresh_frame_flags->update_pending = 0; } ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs & ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode | ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); ext_flags->use_s_frame = cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); ext_flags->use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { update_entropy(&ext_flags->refresh_frame_context, &ext_flags->refresh_frame_context_pending, 0); } } aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { if (!ppi) return NULL; uint8_t header_buf[512] = { 0 }; const uint32_t sequence_header_size = av1_write_sequence_header_obu( &ppi->seq_params, &header_buf[0], sizeof(header_buf)); assert(sequence_header_size <= sizeof(header_buf)); if (sequence_header_size == 0) return NULL; const size_t obu_header_size = 1; const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); const size_t payload_offset = obu_header_size + size_field_size; if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, OBU_SEQUENCE_HEADER, ppi->seq_params.has_nonzero_operating_point_idc, /*is_layer_specific_obu=*/false, 0, &header_buf[0]) != obu_header_size) { return NULL; } size_t coded_size_field_size = 0; if 
(aom_uleb_encode(sequence_header_size, size_field_size, &header_buf[obu_header_size], &coded_size_field_size) != 0) { return NULL; } assert(coded_size_field_size == size_field_size); aom_fixed_buf_t *global_headers = (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); if (!global_headers) return NULL; const size_t global_header_buf_size = obu_header_size + size_field_size + sequence_header_size; global_headers->buf = malloc(global_header_buf_size); if (!global_headers->buf) { free(global_headers); return NULL; } memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); global_headers->sz = global_header_buf_size; return global_headers; } aom-3.12.1/av1/encoder/encoder.h000066400000000000000000004072541477627663500163130ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Declares top-level encoder structures and functions. */ #ifndef AOM_AV1_ENCODER_ENCODER_H_ #define AOM_AV1_ENCODER_ENCODER_H_ #include #include #include "config/aom_config.h" #include "aom/aomcx.h" #include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/entropymode.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" #include "av1/common/resize.h" #include "av1/common/thread_common.h" #include "av1/common/timing.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/enc_enums.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/global_motion.h" #include "av1/encoder/level.h" #include "av1/encoder/lookahead.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/pickcdef.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/svc_layercontext.h" #include "av1/encoder/temporal_filter.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif #include "av1/encoder/tokenize.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/av1_noise_estimate.h" #include "av1/encoder/bitstream.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" #endif #include "aom_dsp/variance.h" #if CONFIG_DENOISE #include "aom_dsp/noise_model.h" #endif #if CONFIG_TUNE_VMAF #include "av1/encoder/tune_vmaf.h" #endif #if CONFIG_AV1_TEMPORAL_DENOISING #include "av1/encoder/av1_temporal_denoiser.h" #endif #if CONFIG_TUNE_BUTTERAUGLI #include "av1/encoder/tune_butteraugli.h" #endif #include "aom/internal/aom_codec_internal.h" #ifdef __cplusplus extern "C" { #endif // TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to // adjust it while we work on documentation. /*!\cond */ // Number of frames required to test for scene cut detection #define SCENE_CUT_KEY_TEST_INTERVAL 16 // Lookahead index threshold to enable temporal filtering for second arf. 
#define TF_LOOKAHEAD_IDX_THR 7 #define HDR_QP_LEVELS 10 #define CHROMA_CB_QP_SCALE 1.04 #define CHROMA_CR_QP_SCALE 1.04 #define CHROMA_QP_SCALE -0.46 #define CHROMA_QP_OFFSET 9.26 #define QP_SCALE_FACTOR 2.0 #define DISABLE_HDR_LUMA_DELTAQ 1 // Rational number with an int64 numerator // This structure holds a fractional value typedef struct aom_rational64 { int64_t num; // fraction numerator int den; // fraction denominator } aom_rational64_t; // alias for struct aom_rational enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. GOOD, // Realtime Fast Encoding. Will force some restrictions on bitrate // constraints. REALTIME, // All intra mode. All the frames are coded as intra frames. ALLINTRA } UENUM1BYTE(MODE); enum { FRAMEFLAGS_KEY = 1 << 0, FRAMEFLAGS_GOLDEN = 1 << 1, FRAMEFLAGS_BWDREF = 1 << 2, // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME FRAMEFLAGS_ALTREF = 1 << 3, FRAMEFLAGS_INTRAONLY = 1 << 4, FRAMEFLAGS_SWITCH = 1 << 5, FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, } UENUM1BYTE(FRAMETYPE_FLAGS); #if CONFIG_FPMT_TEST enum { PARALLEL_ENCODE = 0, PARALLEL_SIMULATION_ENCODE, NUM_FPMT_TEST_ENCODES } UENUM1BYTE(FPMT_TEST_ENC_CFG); #endif // CONFIG_FPMT_TEST // 0 level frames are sometimes used for rate control purposes, but for // reference mapping purposes, the minimum level should be 1. #define MIN_PYR_LEVEL 1 static inline int get_true_pyr_level(int frame_level, int frame_order, int max_layer_depth) { if (frame_order == 0) { // Keyframe case return MIN_PYR_LEVEL; } else if (frame_level == MAX_ARF_LAYERS) { // Leaves return max_layer_depth; } else if (frame_level == (MAX_ARF_LAYERS + 1)) { // Altrefs return MIN_PYR_LEVEL; } return AOMMAX(MIN_PYR_LEVEL, frame_level); } enum { NO_AQ = 0, VARIANCE_AQ = 1, COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, AQ_MODE_COUNT // This should always be the last member of the enum } UENUM1BYTE(AQ_MODE); enum { NO_DELTA_Q = 0, DELTA_Q_OBJECTIVE = 1, // Modulation to improve objective quality DELTA_Q_PERCEPTUAL = 2, // Modulation to improve video perceptual quality DELTA_Q_PERCEPTUAL_AI = 3, // Perceptual quality opt for all intra mode DELTA_Q_USER_RATING_BASED = 4, // User rating based delta q mode DELTA_Q_HDR = 5, // QP adjustment based on HDR block pixel average DELTA_Q_VARIANCE_BOOST = 6, // Variance Boost style modulation for all intra mode DELTA_Q_MODE_COUNT // This should always be the last member of the enum } UENUM1BYTE(DELTAQ_MODE); enum { RESIZE_NONE = 0, // No frame resizing allowed. RESIZE_FIXED = 1, // All frames are coded at the specified scale. RESIZE_RANDOM = 2, // All frames are coded at a random scale. RESIZE_DYNAMIC = 3, // Frames coded at lower scale based on rate control. 
RESIZE_MODES } UENUM1BYTE(RESIZE_MODE); enum { SS_CFG_SRC = 0, SS_CFG_LOOKAHEAD = 1, SS_CFG_FPF = 2, SS_CFG_TOTAL = 3 } UENUM1BYTE(SS_CFG_OFFSET); enum { DISABLE_SCENECUT, // For LAP, lag_in_frames < 19 ENABLE_SCENECUT_MODE_1, // For LAP, lag_in_frames >=19 and < 33 ENABLE_SCENECUT_MODE_2 // For twopass and LAP - lag_in_frames >=33 } UENUM1BYTE(SCENECUT_MODE); #define MAX_VBR_CORPUS_COMPLEXITY 10000 typedef enum { MOD_FP, // First pass MOD_TF, // Temporal filtering MOD_TPL, // TPL MOD_GME, // Global motion estimation MOD_ENC, // Encode stage MOD_LPF, // Deblocking loop filter MOD_CDEF_SEARCH, // CDEF search MOD_CDEF, // CDEF frame MOD_LR, // Loop restoration filtering MOD_PACK_BS, // Pack bitstream MOD_FRAME_ENC, // Frame Parallel encode MOD_AI, // All intra NUM_MT_MODULES } MULTI_THREADED_MODULES; /*!\endcond */ /*!\enum COST_UPDATE_TYPE * \brief This enum controls how often the entropy costs should be updated. * \warning In case of any modifications/additions done to the enum * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as * well. */ typedef enum { COST_UPD_SB, /*!< Update every sb. */ COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ COST_UPD_TILE, /*!< Update every tile. */ COST_UPD_OFF, /*!< Turn off cost updates. */ NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. */ } COST_UPDATE_TYPE; /*!\enum LOOPFILTER_CONTROL * \brief This enum controls to which frames loopfilter is applied. */ typedef enum { LOOPFILTER_NONE = 0, /*!< Disable loopfilter on all frames. */ LOOPFILTER_ALL = 1, /*!< Enable loopfilter for all frames. */ LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non reference frames. */ LOOPFILTER_SELECTIVELY = 3, /*!< Disable loopfilter on frames with low motion. */ } LOOPFILTER_CONTROL; /*!\enum SKIP_APPLY_POSTPROC_FILTER * \brief This enum controls the application of post-processing filters on a * reconstructed frame. */ typedef enum { SKIP_APPLY_RESTORATION = 1 << 0, SKIP_APPLY_SUPERRES = 1 << 1, SKIP_APPLY_CDEF = 1 << 2, SKIP_APPLY_LOOPFILTER = 1 << 3, } SKIP_APPLY_POSTPROC_FILTER; /*! * \brief Encoder config related to resize. */ typedef struct { /*! * Indicates the frame resize mode to be used by the encoder. */ RESIZE_MODE resize_mode; /*! * Indicates the denominator for resize of inter frames, assuming 8 as the * numerator. Its value ranges between 8-16. */ uint8_t resize_scale_denominator; /*! * Indicates the denominator for resize of key frames, assuming 8 as the * numerator. Its value ranges between 8-16. */ uint8_t resize_kf_scale_denominator; } ResizeCfg; /*! * \brief Encoder config for coding block partitioning. */ typedef struct { /*! * Flag to indicate if rectanguar partitions should be enabled. */ bool enable_rect_partitions; /*! * Flag to indicate if AB partitions should be enabled. */ bool enable_ab_partitions; /*! * Flag to indicate if 1:4 / 4:1 partitions should be enabled. */ bool enable_1to4_partitions; /*! * Indicates the minimum partition size that should be allowed. Both width and * height of a partition cannot be smaller than the min_partition_size. */ BLOCK_SIZE min_partition_size; /*! * Indicates the maximum partition size that should be allowed. Both width and * height of a partition cannot be larger than the max_partition_size. */ BLOCK_SIZE max_partition_size; } PartitionCfg; /*! * \brief Encoder flags for intra prediction. */ typedef struct { /*! * Flag to indicate if intra edge filtering process should be enabled. */ bool enable_intra_edge_filter; /*! 
* Flag to indicate if recursive filtering based intra prediction should be * enabled. */ bool enable_filter_intra; /*! * Flag to indicate if smooth intra prediction modes should be enabled. */ bool enable_smooth_intra; /*! * Flag to indicate if PAETH intra prediction mode should be enabled. */ bool enable_paeth_intra; /*! * Flag to indicate if CFL uv intra mode should be enabled. */ bool enable_cfl_intra; /*! * Flag to indicate if directional modes should be enabled. */ bool enable_directional_intra; /*! * Flag to indicate if the subset of directional modes from D45 to D203 intra * should be enabled. Has no effect if directional modes are disabled. */ bool enable_diagonal_intra; /*! * Flag to indicate if delta angles for directional intra prediction should be * enabled. */ bool enable_angle_delta; /*! * Flag to indicate whether to automatically turn off several intral coding * tools. * This flag is only used when "--deltaq-mode=3" is true. * When set to 1, the encoder will analyze the reconstruction quality * as compared to the source image in the preprocessing pass. * If the recontruction quality is considered high enough, we disable * the following intra coding tools, for better encoding speed: * "--enable_smooth_intra", * "--enable_paeth_intra", * "--enable_cfl_intra", * "--enable_diagonal_intra". */ bool auto_intra_tools_off; } IntraModeCfg; /*! * \brief Encoder flags for transform sizes and types. */ typedef struct { /*! * Flag to indicate if 64-pt transform should be enabled. */ bool enable_tx64; /*! * Flag to indicate if flip and identity transform types should be enabled. */ bool enable_flip_idtx; /*! * Flag to indicate if rectangular transform should be enabled. */ bool enable_rect_tx; /*! * Flag to indicate whether or not to use a default reduced set for ext-tx * rather than the potential full set of 16 transforms. */ bool reduced_tx_type_set; /*! * Flag to indicate if transform type for intra blocks should be limited to * DCT_DCT. */ bool use_intra_dct_only; /*! * Flag to indicate if transform type for inter blocks should be limited to * DCT_DCT. */ bool use_inter_dct_only; /*! * Flag to indicate if intra blocks should use default transform type * (mode-dependent) only. */ bool use_intra_default_tx_only; /*! * Flag to indicate if transform size search should be enabled. */ bool enable_tx_size_search; } TxfmSizeTypeCfg; /*! * \brief Encoder flags for compound prediction modes. */ typedef struct { /*! * Flag to indicate if distance-weighted compound type should be enabled. */ bool enable_dist_wtd_comp; /*! * Flag to indicate if masked (wedge/diff-wtd) compound type should be * enabled. */ bool enable_masked_comp; /*! * Flag to indicate if smooth interintra mode should be enabled. */ bool enable_smooth_interintra; /*! * Flag to indicate if difference-weighted compound type should be enabled. */ bool enable_diff_wtd_comp; /*! * Flag to indicate if inter-inter wedge compound type should be enabled. */ bool enable_interinter_wedge; /*! * Flag to indicate if inter-intra wedge compound type should be enabled. */ bool enable_interintra_wedge; } CompoundTypeCfg; /*! * \brief Encoder config related to frame super-resolution. */ typedef struct { /*! * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH * mode is used for inter frames. */ int superres_qthresh; /*! * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH * mode is used for key frames. */ int superres_kf_qthresh; /*! 
* Indicates the denominator of the fraction that specifies the ratio between * the superblock width before and after upscaling for inter frames. The * numerator of this fraction is equal to the constant SCALE_NUMERATOR. */ uint8_t superres_scale_denominator; /*! * Indicates the denominator of the fraction that specifies the ratio between * the superblock width before and after upscaling for key frames. The * numerator of this fraction is equal to the constant SCALE_NUMERATOR. */ uint8_t superres_kf_scale_denominator; /*! * Indicates the Super-resolution mode to be used by the encoder. */ aom_superres_mode superres_mode; /*! * Flag to indicate if super-resolution should be enabled for the sequence. */ bool enable_superres; } SuperResCfg; /*! * \brief Encoder config related to the coding of key frames. */ typedef struct { /*! * Indicates the minimum distance to a key frame. */ int key_freq_min; /*! * Indicates the maximum distance to a key frame. */ int key_freq_max; /*! * Indicates if temporal filtering should be applied on keyframe. */ int enable_keyframe_filtering; /*! * Indicates the number of frames after which a frame may be coded as an * S-Frame. */ int sframe_dist; /*! * Indicates how an S-Frame should be inserted. * 1: the considered frame will be made into an S-Frame only if it is an * altref frame. 2: the next altref frame will be made into an S-Frame. */ int sframe_mode; /*! * Indicates if encoder should autodetect cut scenes and set the keyframes. */ bool auto_key; /*! * Indicates the forward key frame distance. */ int fwd_kf_dist; /*! * Indicates if forward keyframe reference should be enabled. */ bool fwd_kf_enabled; /*! * Indicates if S-Frames should be enabled for the sequence. */ bool enable_sframe; /*! * Indicates if intra block copy prediction mode should be enabled or not. */ bool enable_intrabc; } KeyFrameCfg; /*! * \brief Encoder rate control configuration parameters */ typedef struct { /*!\cond */ // BUFFERING PARAMETERS /*!\endcond */ /*! * Indicates the amount of data that will be buffered by the decoding * application prior to beginning playback, and is expressed in units of * time(milliseconds). */ int64_t starting_buffer_level_ms; /*! * Indicates the amount of data that the encoder should try to maintain in the * decoder's buffer, and is expressed in units of time(milliseconds). */ int64_t optimal_buffer_level_ms; /*! * Indicates the maximum amount of data that may be buffered by the decoding * application, and is expressed in units of time(milliseconds). */ int64_t maximum_buffer_size_ms; /*! * Indicates the bandwidth to be used in bits per second. */ int64_t target_bandwidth; /*! * Indicates average complexity of the corpus in single pass vbr based on * LAP. 0 indicates that corpus complexity vbr mode is disabled. */ unsigned int vbr_corpus_complexity_lap; /*! * Indicates the maximum allowed bitrate for any intra frame as % of bitrate * target. */ unsigned int max_intra_bitrate_pct; /*! * Indicates the maximum allowed bitrate for any inter frame as % of bitrate * target. */ unsigned int max_inter_bitrate_pct; /*! * Indicates the percentage of rate boost for golden frame in CBR mode. */ unsigned int gf_cbr_boost_pct; /*! * min_cr / 100 indicates the target minimum compression ratio for each * frame. */ unsigned int min_cr; /*! * Indicates the frame drop threshold. */ int drop_frames_water_mark; /*! * under_shoot_pct indicates the tolerance of the VBR algorithm to * undershoot and is used as a trigger threshold for more aggressive * adaptation of Q. 
Its value can range from 0-100. */ int under_shoot_pct; /*! * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot * and is used as a trigger threshold for more aggressive adaptation of Q. * Its value can range from 0-1000. */ int over_shoot_pct; /*! * Indicates the maximum qindex that can be used by the quantizer i.e. the * worst quality qindex. */ int worst_allowed_q; /*! * Indicates the minimum qindex that can be used by the quantizer i.e. the * best quality qindex. */ int best_allowed_q; /*! * Indicates the Constant/Constrained Quality level. */ int cq_level; /*! * Indicates if the encoding mode is vbr, cbr, constrained quality or * constant quality. */ enum aom_rc_mode mode; /*! * Indicates the bias (expressed on a scale of 0 to 100) for determining * target size for the current frame. The value 0 indicates the optimal CBR * mode value should be used, and 100 indicates the optimal VBR mode value * should be used. */ int vbrbias; /*! * Indicates the minimum bitrate to be used for a single frame as a percentage * of the target bitrate. */ int vbrmin_section; /*! * Indicates the maximum bitrate to be used for a single frame as a percentage * of the target bitrate. */ int vbrmax_section; /*! * Indicates the maximum consecutive amount of frame drops, in units of time * (milliseconds). This is converted to frame units internally. Only used in * CBR mode. */ int max_consec_drop_ms; } RateControlCfg; /*!\cond */ typedef struct { // Indicates the number of frames lag before encoding is started. int lag_in_frames; // Indicates the minimum gf/arf interval to be used. int min_gf_interval; // Indicates the maximum gf/arf interval to be used. int max_gf_interval; // Indicates the minimum height for GF group pyramid structure to be used. int gf_min_pyr_height; // Indicates the maximum height for GF group pyramid structure to be used. int gf_max_pyr_height; // Indicates if automatic set and use of altref frames should be enabled. bool enable_auto_arf; // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be // enabled. bool enable_auto_brf; } GFConfig; typedef struct { // Indicates the number of tile groups. unsigned int num_tile_groups; // Indicates the MTU size for a tile group. If mtu is non-zero, // num_tile_groups is set to DEFAULT_MAX_NUM_TG. unsigned int mtu; // Indicates the number of tile columns in log2. int tile_columns; // Indicates the number of tile rows in log2. int tile_rows; // Indicates the number of widths in the tile_widths[] array. int tile_width_count; // Indicates the number of heights in the tile_heights[] array. int tile_height_count; // Indicates the tile widths, and may be empty. int tile_widths[MAX_TILE_COLS]; // Indicates the tile heights, and may be empty. int tile_heights[MAX_TILE_ROWS]; // Indicates if large scale tile coding should be used. bool enable_large_scale_tile; // Indicates if single tile decoding mode should be enabled. bool enable_single_tile_decoding; // Indicates if EXT_TILE_DEBUG should be enabled. bool enable_ext_tile_debug; } TileConfig; typedef struct { // Indicates the width of the input frame. int width; // Indicates the height of the input frame. int height; // If forced_max_frame_width is non-zero then it is used to force the maximum // frame width written in write_sequence_header(). int forced_max_frame_width; // If forced_max_frame_height is non-zero then it is used to force the maximum // frame height written in write_sequence_header().
int forced_max_frame_height; // Indicates the frame width after applying both super-resolution and resize // to the coded frame. int render_width; // Indicates the frame height after applying both super-resolution and resize // to the coded frame. int render_height; } FrameDimensionCfg; typedef struct { // Indicates if warped motion should be enabled. bool enable_warped_motion; // Indicates if warped motion should be evaluated or not. bool allow_warped_motion; // Indicates if OBMC motion should be enabled. bool enable_obmc; } MotionModeCfg; typedef struct { // Timing info for each frame. aom_timing_info_t timing_info; // Indicates the number of time units of a decoding clock. uint32_t num_units_in_decoding_tick; // Indicates if decoder model information is present in the coded sequence // header. bool decoder_model_info_present_flag; // Indicates if display model information is present in the coded sequence // header. bool display_model_info_present_flag; // Indicates if timing info for each frame is present. bool timing_info_present; } DecoderModelCfg; typedef struct { // Indicates the update frequency for coeff costs. COST_UPDATE_TYPE coeff; // Indicates the update frequency for mode costs. COST_UPDATE_TYPE mode; // Indicates the update frequency for mv costs. COST_UPDATE_TYPE mv; // Indicates the update frequency for dv costs. COST_UPDATE_TYPE dv; } CostUpdateFreq; typedef struct { // Indicates the maximum number of reference frames allowed per frame. unsigned int max_reference_frames; // Indicates if the reduced set of references should be enabled. bool enable_reduced_reference_set; // Indicates if one-sided compound should be enabled. bool enable_onesided_comp; } RefFrameCfg; typedef struct { // Indicates the color space that should be used. aom_color_primaries_t color_primaries; // Indicates the characteristics of transfer function to be used. aom_transfer_characteristics_t transfer_characteristics; // Indicates the matrix coefficients to be used for the transfer function. aom_matrix_coefficients_t matrix_coefficients; // Indicates the chroma 4:2:0 sample position info. aom_chroma_sample_position_t chroma_sample_position; // Indicates if a limited color range or full color range should be used. aom_color_range_t color_range; } ColorCfg; typedef struct { // Indicates if extreme motion vector unit test should be enabled or not. unsigned int motion_vector_unit_test; // Indicates if superblock multipass unit test should be enabled or not. unsigned int sb_multipass_unit_test; } UnitTestCfg; typedef struct { // Indicates the file path to the VMAF model. const char *vmaf_model_path; // Indicates the path to the film grain parameters. const char *film_grain_table_filename; // Indicates the visual tuning metric. aom_tune_metric tuning; // Indicates if the current content is screen or default type. aom_tune_content content; // Indicates the film grain parameters. int film_grain_test_vector; // Indicates the in-block distortion metric to use. aom_dist_metric dist_metric; } TuneCfg; typedef struct { // Indicates the framerate of the input video. double init_framerate; // Indicates the bit-depth of the input video. unsigned int input_bit_depth; // Indicates the maximum number of frames to be encoded. unsigned int limit; // Indicates the chroma subsampling x value. unsigned int chroma_subsampling_x; // Indicates the chroma subsampling y value.
unsigned int chroma_subsampling_y; } InputCfg; typedef struct { // If true, encoder will use fixed QP offsets, that are either: // - Given by the user, and stored in 'fixed_qp_offsets' array, OR // - Picked automatically from cq_level. int use_fixed_qp_offsets; // Indicates the minimum flatness of the quantization matrix. int qm_minlevel; // Indicates the maximum flatness of the quantization matrix. int qm_maxlevel; // Indicates if adaptive quantize_b should be enabled. int quant_b_adapt; // Indicates the Adaptive Quantization mode to be used. AQ_MODE aq_mode; // Indicates the delta q mode to be used. DELTAQ_MODE deltaq_mode; // Indicates the delta q mode strength. DELTAQ_MODE deltaq_strength; // Indicates if delta quantization should be enabled in chroma planes. bool enable_chroma_deltaq; // Indicates if delta quantization should be enabled for hdr video bool enable_hdr_deltaq; // Indicates if encoding with quantization matrices should be enabled. bool using_qm; } QuantizationCfg; /*!\endcond */ /*! * \brief Algorithm configuration parameters. */ typedef struct { /*! * Controls the level at which rate-distortion optimization of transform * coefficients favors sharpness in the block. Has no impact on RD when set * to zero (default). * * For values 1-7, eob and skip block optimization are * avoided and rdmult is adjusted in favor of block sharpness. * * In all-intra mode: it also sets the `loop_filter_sharpness` syntax element * in the bitstream. Larger values increasingly reduce how much the filtering * can change the sample values on block edges to favor perceived sharpness. */ int sharpness; /*! * Indicates the trellis optimization mode of quantized coefficients. * 0: disabled * 1: enabled * 2: enabled for rd search * 3: true for estimate yrd search */ int disable_trellis_quant; /*! * The maximum number of frames used to create an arf. */ int arnr_max_frames; /*! * The temporal filter strength for arf used when creating ARFs. */ int arnr_strength; /*! * Indicates the CDF update mode * 0: no update * 1: update on every frame(default) * 2: selectively update */ uint8_t cdf_update_mode; /*! * Indicates if RDO based on frame temporal dependency should be enabled. */ bool enable_tpl_model; /*! * Indicates if coding of overlay frames for filtered ALTREF frames is * enabled. */ bool enable_overlay; /*! * Controls loop filtering * 0: Loop filter is disabled for all frames * 1: Loop filter is enabled for all frames * 2: Loop filter is disabled for non-reference frames * 3: Loop filter is disables for the frames with low motion */ LOOPFILTER_CONTROL loopfilter_control; /*! * Indicates if the application of post-processing filters should be skipped * on reconstructed frame. */ bool skip_postproc_filtering; } AlgoCfg; /*!\cond */ typedef struct { // Indicates the codec bit-depth. aom_bit_depth_t bit_depth; // Indicates the superblock size that should be used by the encoder. aom_superblock_size_t superblock_size; // Indicates if loopfilter modulation should be enabled. bool enable_deltalf_mode; // Indicates how CDEF should be applied. CDEF_CONTROL cdef_control; // Indicates if loop restoration filter should be enabled. bool enable_restoration; // When enabled, video mode should be used even for single frame input. bool force_video_mode; // Indicates if the error resiliency features should be enabled. bool error_resilient_mode; // Indicates if frame parallel decoding feature should be enabled. bool frame_parallel_decoding_mode; // Indicates if the input should be encoded as monochrome. 
bool enable_monochrome; // When enabled, the encoder will use a full header even for still pictures. // When disabled, a reduced header is used for still pictures. bool full_still_picture_hdr; // Indicates if dual interpolation filters should be enabled. bool enable_dual_filter; // Indicates if frame order hint should be enabled or not. bool enable_order_hint; // Indicates if ref_frame_mvs should be enabled at the sequence level. bool ref_frame_mvs_present; // Indicates if ref_frame_mvs should be enabled at the frame level. bool enable_ref_frame_mvs; // Indicates if interintra compound mode is enabled. bool enable_interintra_comp; // Indicates if global motion should be enabled. bool enable_global_motion; // Indicates if palette should be enabled. bool enable_palette; } ToolCfg; /*!\endcond */ /*! * \brief Main encoder configuration data structure. */ typedef struct AV1EncoderConfig { /*!\cond */ // Configuration related to the input video. InputCfg input_cfg; // Configuration related to frame-dimensions. FrameDimensionCfg frm_dim_cfg; /*!\endcond */ /*! * Encoder algorithm configuration. */ AlgoCfg algo_cfg; /*! * Configuration related to key-frames. */ KeyFrameCfg kf_cfg; /*! * Rate control configuration */ RateControlCfg rc_cfg; /*!\cond */ // Configuration related to Quantization. QuantizationCfg q_cfg; // Internal frame size scaling. ResizeCfg resize_cfg; // Frame Super-Resolution size scaling. SuperResCfg superres_cfg; /*!\endcond */ /*! * stats_in buffer contains all of the stats packets produced in the first * pass, concatenated. */ aom_fixed_buf_t twopass_stats_in; /*!\cond */ // Configuration related to encoder toolsets. ToolCfg tool_cfg; // Configuration related to Group of frames. GFConfig gf_cfg; // Tile related configuration parameters. TileConfig tile_cfg; // Configuration related to Tune. TuneCfg tune_cfg; // Configuration related to color. ColorCfg color_cfg; // Configuration related to decoder model. DecoderModelCfg dec_model_cfg; // Configuration related to reference frames. RefFrameCfg ref_frm_cfg; // Configuration related to unit tests. UnitTestCfg unit_test_cfg; // Flags related to motion mode. MotionModeCfg motion_mode_cfg; // Flags related to intra mode search. IntraModeCfg intra_mode_cfg; // Flags related to transform size/type. TxfmSizeTypeCfg txfm_cfg; // Flags related to compound type. CompoundTypeCfg comp_type_cfg; // Partition related information. PartitionCfg part_cfg; // Configuration related to frequency of cost update. CostUpdateFreq cost_upd_freq; #if CONFIG_DENOISE // Indicates the noise level. float noise_level; // Indicates the the denoisers block size. int noise_block_size; // Indicates whether to apply denoising to the frame to be encoded int enable_dnl_denoising; #endif #if CONFIG_AV1_TEMPORAL_DENOISING // Noise sensitivity. int noise_sensitivity; #endif // Bit mask to specify which tier each of the 32 possible operating points // conforms to. unsigned int tier_mask; // Indicates the number of pixels off the edge of a reference frame we're // allowed to go when forming an inter prediction. int border_in_pixels; // Indicates the maximum number of threads that may be used by the encoder. int max_threads; // Indicates the speed preset to be used. int speed; // Indicates the target sequence level index for each operating point(OP). AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; // Indicates the bitstream profile to be used. BITSTREAM_PROFILE profile; /*!\endcond */ /*! 
* Indicates the current encoder pass : * AOM_RC_ONE_PASS = One pass encode, * AOM_RC_FIRST_PASS = First pass of multiple-pass * AOM_RC_SECOND_PASS = Second pass of multiple-pass * AOM_RC_THIRD_PASS = Third pass of multiple-pass */ enum aom_enc_pass pass; /*!\cond */ // Total number of encoding passes. int passes; // the name of the second pass output file when passes > 2 const char *two_pass_output; // the name of the second pass log file when passes > 2 const char *second_pass_log; // Indicates if the encoding is GOOD or REALTIME. MODE mode; // Indicates if row-based multi-threading should be enabled or not. bool row_mt; // Indicates if frame parallel multi-threading should be enabled or not. bool fp_mt; // Indicates if 16bit frame buffers are to be used i.e., the content is > // 8-bit. bool use_highbitdepth; // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B // format. bool save_as_annexb; // The path for partition stats reading and writing, used in the experiment // CONFIG_PARTITION_SEARCH_ORDER. const char *partition_info_path; // The flag that indicates whether we use an external rate distribution to // guide adaptive quantization. It requires --deltaq-mode=3. The rate // distribution map file name is stored in |rate_distribution_info|. unsigned int enable_rate_guide_deltaq; // The input file of rate distribution information used in all intra mode // to determine delta quantization. const char *rate_distribution_info; // Exit the encoder when it fails to encode to a given level. int strict_level_conformance; // Max depth for the GOP after a key frame int kf_max_pyr_height; // A flag to control if we enable the superblock qp sweep for a given lambda int sb_qp_sweep; /*!\endcond */ } AV1EncoderConfig; /*!\cond */ static inline int is_lossless_requested(const RateControlCfg *const rc_cfg) { return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0; } /*!\endcond */ /*! * \brief Encoder-side probabilities for pruning of various AV1 tools */ typedef struct { /*! * obmc_probs[i][j] is the probability of OBMC being the best motion mode for * jth block size and ith frame update type, averaged over past frames. If * obmc_probs[i][j] < thresh, then OBMC search is pruned. */ int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; /*! * warped_probs[i] is the probability of warped motion being the best motion * mode for ith frame update type, averaged over past frames. If * warped_probs[i] < thresh, then warped motion search is pruned. */ int warped_probs[FRAME_UPDATE_TYPES]; /*! * tx_type_probs[i][j][k] is the probability of kth tx_type being the best * for jth transform size and ith frame update type, averaged over past * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that * type is pruned. */ int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES]; /*! * switchable_interp_probs[i][j][k] is the probability of kth interpolation * filter being the best for jth filter context and ith frame update type, * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh, * then interpolation filter search is pruned for that case. 
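 *
 * A minimal sketch of the pruning rule described above; the threshold name
 * and loop context are illustrative, not the encoder's actual code:
 * \code
 *   // Skip evaluating filter k for this frame update type and filter
 *   // context when its historical win probability falls below the
 *   // pruning threshold.
 *   if (frame_probs->switchable_interp_probs[update_type][ctx][k] < thresh)
 *     continue;
 * \endcode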
*/ int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; } FrameProbInfo; /*!\cond */ typedef struct FRAME_COUNTS { // Note: This structure should only contain 'unsigned int' fields, or // aggregates built solely from 'unsigned int' fields/elements #if CONFIG_ENTROPY_STATS unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; unsigned int cfl_sign[CFL_JOINT_SIGNS]; unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; unsigned int palette_y_color_index[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; unsigned int palette_uv_color_index[PALETTE_SIZES] [PALETTE_COLOR_INDEX_CONTEXTS] [PALETTE_COLORS]; unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] [EOB_COEF_CONTEXTS][2]; unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] [2]; unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11]; unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] [LEVEL_CONTEXTS][BR_CDF_SIZE]; unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2]; unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1]; unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2]; unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; unsigned int wedge_idx[BLOCK_SIZES_ALL][16]; unsigned int interintra[BLOCK_SIZE_GROUPS][2]; unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; unsigned int obmc[BLOCK_SIZES_ALL][2]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; unsigned int intrabc[2]; unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2]; unsigned int 
intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1]; unsigned int skip_mode[SKIP_MODE_CONTEXTS][2]; unsigned int skip_txfm[SKIP_CONTEXTS][2]; unsigned int compound_index[COMP_INDEX_CONTEXTS][2]; unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2]; unsigned int delta_q[DELTA_Q_PROBS][2]; unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2]; unsigned int delta_lf[DELTA_LF_PROBS][2]; unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] [TX_TYPES]; unsigned int filter_intra_mode[FILTER_INTRA_MODES]; unsigned int filter_intra[BLOCK_SIZES_ALL][2]; unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES]; unsigned int wiener_restore[2]; unsigned int sgrproj_restore[2]; #endif // CONFIG_ENTROPY_STATS unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; } FRAME_COUNTS; #define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 typedef struct { int ready; double a; double b; double dist_mean; double ld_mean; double sse_mean; double sse_sse_mean; double sse_ld_mean; int num; double dist_sum; double ld_sum; double sse_sum; double sse_sse_sum; double sse_ld_sum; } InterModeRdModel; typedef struct { int idx; int64_t rd; } RdIdxPair; // TODO(angiebird): This is an estimated size. We still need to figure what is // the maximum number of modes. #define MAX_INTER_MODES 1024 // TODO(any): rename this struct to something else. There is already another // struct called inter_mode_info, which makes this terribly confusing. /*!\endcond */ /*! * \brief Struct used to hold inter mode data for fast tx search. * * This struct is used to perform a full transform search only on winning * candidates searched with an estimate for transform coding RD. */ typedef struct inter_modes_info { /*! * The number of inter modes for which data was stored in each of the * following arrays. */ int num; /*! * Mode info struct for each of the candidate modes. */ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; /*! * The rate for each of the candidate modes. */ int mode_rate_arr[MAX_INTER_MODES]; /*! * The sse of the predictor for each of the candidate modes. */ int64_t sse_arr[MAX_INTER_MODES]; /*! * The estimated rd of the predictor for each of the candidate modes. */ int64_t est_rd_arr[MAX_INTER_MODES]; /*! * The rate and mode index for each of the candidate modes. */ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; /*! * The full rd stats for each of the candidate modes. */ RD_STATS rd_cost_arr[MAX_INTER_MODES]; /*! * The full rd stats of luma only for each of the candidate modes. */ RD_STATS rd_cost_y_arr[MAX_INTER_MODES]; /*! * The full rd stats of chroma only for each of the candidate modes. */ RD_STATS rd_cost_uv_arr[MAX_INTER_MODES]; } InterModesInfo; /*!\cond */ typedef struct { // TODO(kyslov): consider changing to 64bit // This struct is used for computing variance in choose_partitioning(), where // the max number of samples within a superblock is 32x32 (with 4x4 avg). // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32 // * 32 = 2^26). 
For high bitdepth we need to consider changing this to 64 bit uint32_t sum_square_error; int32_t sum_error; int log2_count; int variance; } VPartVar; typedef struct { VPartVar none; VPartVar horz[2]; VPartVar vert[2]; } VPVariance; typedef struct { VPVariance part_variances; VPartVar split[4]; } VP4x4; typedef struct { VPVariance part_variances; VP4x4 split[4]; } VP8x8; typedef struct { VPVariance part_variances; VP8x8 split[4]; } VP16x16; typedef struct { VPVariance part_variances; VP16x16 split[4]; } VP32x32; typedef struct { VPVariance part_variances; VP32x32 split[4]; } VP64x64; typedef struct { VPVariance part_variances; VP64x64 *split; } VP128x128; /*!\endcond */ /*! * \brief Thresholds for variance based partitioning. */ typedef struct { /*! * If block variance > threshold, then that block is forced to split. * thresholds[0] - threshold for 128x128; * thresholds[1] - threshold for 64x64; * thresholds[2] - threshold for 32x32; * thresholds[3] - threshold for 16x16; * thresholds[4] - threshold for 8x8; */ int64_t thresholds[5]; /*! * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual * minmax > threshold_minmax, the 16x16 is forced to split. */ int64_t threshold_minmax; } VarBasedPartitionInfo; /*! * \brief Encoder parameters for synchronization of row based multi-threading */ typedef struct { #if CONFIG_MULTITHREAD /** * \name Synchronization objects for top-right dependency. */ /**@{*/ pthread_mutex_t *mutex_; /*!< Mutex lock object */ pthread_cond_t *cond_; /*!< Condition variable */ /**@}*/ #endif // CONFIG_MULTITHREAD /*! * Buffer to store the superblock whose encoding is complete. * num_finished_cols[i] stores the number of superblocks which finished * encoding in the ith superblock row. */ int *num_finished_cols; /*! * Denotes the superblock interval at which conditional signalling should * happen. Also denotes the minimum number of extra superblocks of the top row * to be complete to start encoding the current superblock. A value of 1 * indicates top-right dependency. */ int sync_range; /*! * Denotes the additional number of superblocks in the previous row to be * complete to start encoding the current superblock when intraBC tool is * enabled. This additional top-right delay is required to satisfy the * hardware constraints for intraBC tool when row multithreading is enabled. */ int intrabc_extra_top_right_sb_delay; /*! * Number of superblock rows. */ int rows; /*! * The superblock row (in units of MI blocks) to be processed next. */ int next_mi_row; /*! * Number of threads processing the current tile. */ int num_threads_working; } AV1EncRowMultiThreadSync; /*!\cond */ // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
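/*
 * Illustrative helper (editorial sketch, not used by the encoder): shows how
 * the nested variance-tree types above can be read when deciding forced
 * splits in variance based partitioning. Threshold indexing follows the
 * VarBasedPartitionInfo comment: thresholds[1] applies to 64x64 blocks.
 */
static inline int vp_force_split_64x64_example(
    const VP128x128 *vt, const VarBasedPartitionInfo *vbp, int idx) {
  // vt->split points at the four 64x64 children of the 128x128 root; the
  // "none" member of part_variances holds the variance of the unsplit block.
  const VP64x64 *node = &vt->split[idx];
  return node->part_variances.none.variance > vbp->thresholds[1];
}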
typedef struct TileDataEnc { TileInfo tile_info; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); FRAME_CONTEXT *row_ctx; uint64_t abs_sum_level; uint8_t allow_update_cdf; InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; AV1EncRowMultiThreadSync row_mt_sync; MV firstpass_top_mv; } TileDataEnc; typedef struct RD_COUNTS { int compound_ref_used_flag; int skip_mode_used_flag; int tx_type_used[TX_SIZES_ALL][TX_TYPES]; int obmc_used[BLOCK_SIZES_ALL][2]; int warped_used[2]; int newmv_or_intra_blocks; uint64_t seg_tmp_pred_cost[2]; } RD_COUNTS; typedef struct ThreadData { MACROBLOCK mb; MvCosts *mv_costs_alloc; IntraBCMVCosts *dv_costs_alloc; RD_COUNTS rd_counts; FRAME_COUNTS *counts; PC_TREE_SHARED_BUFFERS shared_coeff_buf; SIMPLE_MOTION_DATA_TREE *sms_tree; SIMPLE_MOTION_DATA_TREE *sms_root; uint32_t *hash_value_buffer[2][2]; OBMCBuffer obmc_buffer; PALETTE_BUFFER *palette_buffer; CompoundTypeRdBuffers comp_rd_buffer; CONV_BUF_TYPE *tmp_conv_dst; uint64_t abs_sum_level; uint8_t *tmp_pred_bufs[2]; uint8_t *wiener_tmp_pred_buf; int intrabc_used; int deltaq_used; int coefficient_size; int max_mv_magnitude; int interp_filter_selected[SWITCHABLE]; FRAME_CONTEXT *tctx; VP64x64 *vt64x64; int32_t num_64x64_blocks; PICK_MODE_CONTEXT *firstpass_ctx; TemporalFilterData tf_data; TplBuffers tpl_tmp_buffers; TplTxfmStats tpl_txfm_stats; GlobalMotionData gm_data; // Pointer to the array of structures to store gradient information of each // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). PixelLevelGradientInfo *pixel_gradient_info; // Pointer to the array of structures to store source variance information of // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to // store source variance and log of source variance of each 4x4 sub-block // for subsequent retrieval. Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; // Pointer to pc tree root. PC_TREE *pc_root; } ThreadData; struct EncWorkerData; /*!\endcond */ /*! * \brief Encoder data related to row-based multi-threading */ typedef struct { /*! * Number of tile rows for which row synchronization memory is allocated. */ int allocated_tile_rows; /*! * Number of tile cols for which row synchronization memory is allocated. */ int allocated_tile_cols; /*! * Number of rows for which row synchronization memory is allocated * per tile. During first-pass/look-ahead stage this equals the * maximum number of macroblock rows in a tile. During encode stage, * this equals the maximum number of superblock rows in a tile. */ int allocated_rows; /*! * Number of columns for which entropy context memory is allocated * per tile. During encode stage, this equals the maximum number of * superblock columns in a tile minus 1. The entropy context memory * is not allocated during first-pass/look-ahead stage. */ int allocated_cols; /*! * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread. */ int thread_id_to_tile_id[MAX_NUM_THREADS]; /*! * num_tile_cols_done[i] indicates the number of tile columns whose encoding * is complete in the ith superblock row. */ int *num_tile_cols_done; /*! * Number of superblock rows in a frame for which 'num_tile_cols_done' is * allocated. */ int allocated_sb_rows; /*! * Initialized to false, set to true by the worker thread that encounters an * error in order to abort the processing of other worker threads. */ bool row_mt_exit; /*! 
* Initialized to false, set to true during first pass encoding by the worker * thread that encounters an error in order to abort the processing of other * worker threads. */ bool firstpass_mt_exit; /*! * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker * thread that encounters an error in order to abort the processing of other * worker threads. */ bool mb_wiener_mt_exit; #if CONFIG_MULTITHREAD /*! * Mutex lock used while dispatching jobs. */ pthread_mutex_t *mutex_; /*! * Condition variable used to dispatch loopfilter jobs. */ pthread_cond_t *cond_; #endif /** * \name Row synchronization related function pointers. */ /**@{*/ /*! * Reader. */ void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int); /*! * Writer. */ void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int); /**@}*/ } AV1EncRowMultiThreadInfo; /*! * \brief Encoder data related to multi-threading for allintra deltaq-mode=3 */ typedef struct { #if CONFIG_MULTITHREAD /*! * Mutex lock used while dispatching jobs. */ pthread_mutex_t *mutex_; /*! * Condition variable used to dispatch loopfilter jobs. */ pthread_cond_t *cond_; #endif /** * \name Row synchronization related function pointers for all intra mode */ /**@{*/ /*! * Reader. */ void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int); /*! * Writer. */ void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int); /**@}*/ } AV1EncAllIntraMultiThreadInfo; /*! * \brief Max number of recodes used to track the frame probabilities. */ #define NUM_RECODES_PER_FRAME 10 /*! * \brief Max number of frames that can be encoded in a parallel encode set. */ #define MAX_PARALLEL_FRAMES 4 /*! * \brief Buffers to be backed up during parallel encode set to be restored * later. */ typedef struct RestoreStateBuffers { /*! * Backup of original CDEF srcbuf. */ uint16_t *cdef_srcbuf; /*! * Backup of original CDEF colbuf. */ uint16_t *cdef_colbuf[MAX_MB_PLANE]; /*! * Backup of original LR rst_tmpbuf. */ int32_t *rst_tmpbuf; /*! * Backup of original LR rlbs. */ RestorationLineBuffers *rlbs; } RestoreStateBuffers; /*! * \brief Parameters related to restoration types. */ typedef struct { /*! * Stores the best coefficients for Wiener restoration. */ WienerInfo wiener; /*! * Stores the best coefficients for Sgrproj restoration. */ SgrprojInfo sgrproj; /*! * The rtype to use for this unit given a frame rtype as index. Indices: * WIENER, SGRPROJ, SWITCHABLE. */ RestorationType best_rtype[RESTORE_TYPES - 1]; } RestUnitSearchInfo; /*! * \brief Structure to hold search parameter per restoration unit and * intermediate buffer of Wiener filter used in pick filter stage of Loop * restoration. */ typedef struct { /*! * Array of pointers to 'RestUnitSearchInfo' which holds data related to * restoration types. */ RestUnitSearchInfo *rusi[MAX_MB_PLANE]; /*! * Buffer used to hold dgd-avg data during SIMD call of Wiener filter. */ int16_t *dgd_avg; } AV1LrPickStruct; /*! * \brief Primary Encoder parameters related to multi-threading. */ typedef struct PrimaryMultiThreadInfo { /*! * Number of workers created for multi-threading. */ int num_workers; /*! * Number of workers used for different MT modules. */ int num_mod_workers[NUM_MT_MODULES]; /*! * Synchronization object used to launch job in the worker thread. */ AVxWorker *workers; /*! * Data specific to each worker in encoder multi-threading. * tile_thr_data[i] stores the worker data of the ith thread. */ struct EncWorkerData *tile_thr_data; /*! * CDEF row multi-threading data. 
*/ AV1CdefWorkerData *cdef_worker; /*! * Primary(Level 1) Synchronization object used to launch job in the worker * thread. */ AVxWorker *p_workers[MAX_PARALLEL_FRAMES]; /*! * Number of primary workers created for multi-threading. */ int p_num_workers; /*! * Tracks the number of workers in encode stage multi-threading. */ int prev_num_enc_workers; } PrimaryMultiThreadInfo; /*! * \brief Encoder parameters related to multi-threading. */ typedef struct MultiThreadInfo { /*! * Number of workers created for multi-threading. */ int num_workers; /*! * Number of workers used for different MT modules. */ int num_mod_workers[NUM_MT_MODULES]; /*! * Synchronization object used to launch job in the worker thread. */ AVxWorker *workers; /*! * Data specific to each worker in encoder multi-threading. * tile_thr_data[i] stores the worker data of the ith thread. */ struct EncWorkerData *tile_thr_data; /*! * When set, indicates that row based multi-threading of the encoder is * enabled. */ bool row_mt_enabled; /*! * When set, indicates that multi-threading for bitstream packing is enabled. */ bool pack_bs_mt_enabled; /*! * Encoder row multi-threading data. */ AV1EncRowMultiThreadInfo enc_row_mt; /*! * Encoder multi-threading data for allintra mode in the preprocessing stage * when --deltaq-mode=3. */ AV1EncAllIntraMultiThreadInfo intra_mt; /*! * Tpl row multi-threading data. */ AV1TplRowMultiThreadInfo tpl_row_mt; /*! * Loop Filter multi-threading object. */ AV1LfSync lf_row_sync; /*! * Loop Restoration multi-threading object. */ AV1LrSync lr_row_sync; /*! * Pack bitstream multi-threading object. */ AV1EncPackBSSync pack_bs_sync; /*! * Global Motion multi-threading object. */ AV1GlobalMotionSync gm_sync; /*! * Temporal Filter multi-threading object. */ AV1TemporalFilterSync tf_sync; /*! * CDEF search multi-threading object. */ AV1CdefSync cdef_sync; /*! * Pointer to CDEF row multi-threading data for the frame. */ AV1CdefWorkerData *cdef_worker; /*! * Buffers to be stored/restored before/after parallel encode. */ RestoreStateBuffers restore_state_buf; /*! * In multi-threaded realtime encoding with row-mt enabled, pipeline * loop-filtering after encoding. */ int pipeline_lpf_mt_with_enc; } MultiThreadInfo; /*!\cond */ typedef struct ActiveMap { int enabled; int update; unsigned char *map; } ActiveMap; /*!\endcond */ /*! * \brief Encoder info used for decision on forcing integer motion vectors. */ typedef struct { /*! * cs_rate_array[i] is the fraction of blocks in a frame which either match * with the collocated block or are smooth, where i is the rate_index. */ double cs_rate_array[32]; /*! * rate_index is used to index cs_rate_array. */ int rate_index; /*! * rate_size is the total number of entries populated in cs_rate_array. */ int rate_size; } ForceIntegerMVInfo; /*!\cond */ #if CONFIG_INTERNAL_STATS // types of stats enum { STAT_Y, STAT_U, STAT_V, STAT_ALL, NUM_STAT_TYPES // This should always be the last member of the enum } UENUM1BYTE(StatType); typedef struct IMAGE_STAT { double stat[NUM_STAT_TYPES]; double worst; } ImageStat; #endif // CONFIG_INTERNAL_STATS typedef struct { int ref_count; YV12_BUFFER_CONFIG buf; } EncRefCntBuffer; /*!\endcond */ /*! * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level * * This is used for bitstream preparation. */ typedef struct { /*! * frame_base[mi_row * stride + mi_col] stores the mode information of * block (mi_row,mi_col). */ MB_MODE_INFO_EXT_FRAME *frame_base; /*! * Size of frame_base buffer. */ int alloc_size; /*! 
* Stride of frame_base buffer. */ int stride; } MBMIExtFrameBufferInfo; /*!\cond */ #if CONFIG_COLLECT_PARTITION_STATS typedef struct FramePartitionTimingStats { int partition_decisions[6][EXT_PARTITION_TYPES]; int partition_attempts[6][EXT_PARTITION_TYPES]; int64_t partition_times[6][EXT_PARTITION_TYPES]; int partition_redo; } FramePartitionTimingStats; #endif // CONFIG_COLLECT_PARTITION_STATS #if CONFIG_COLLECT_COMPONENT_TIMING #include "aom_ports/aom_timer.h" // Adjust the following to add new components. enum { av1_encode_strategy_time, av1_get_one_pass_rt_params_time, av1_get_second_pass_params_time, denoise_and_encode_time, apply_filtering_time, av1_tpl_setup_stats_time, encode_frame_to_data_rate_time, encode_with_or_without_recode_time, loop_filter_time, cdef_time, loop_restoration_time, av1_pack_bitstream_final_time, av1_encode_frame_time, av1_compute_global_motion_time, av1_setup_motion_field_time, encode_sb_row_time, rd_pick_partition_time, rd_use_partition_time, choose_var_based_partitioning_time, av1_prune_partitions_time, none_partition_search_time, split_partition_search_time, rectangular_partition_search_time, ab_partitions_search_time, rd_pick_4partition_time, encode_sb_time, rd_pick_sb_modes_time, av1_rd_pick_intra_mode_sb_time, av1_rd_pick_inter_mode_sb_time, set_params_rd_pick_inter_mode_time, skip_inter_mode_time, handle_inter_mode_time, evaluate_motion_mode_for_winner_candidates_time, do_tx_search_time, handle_intra_mode_time, refine_winner_mode_tx_time, av1_search_palette_mode_time, handle_newmv_time, compound_type_rd_time, interpolation_filter_search_time, motion_mode_rd_time, nonrd_use_partition_time, pick_sb_modes_nonrd_time, hybrid_intra_mode_search_time, nonrd_pick_inter_mode_sb_time, encode_b_nonrd_time, kTimingComponents, } UENUM1BYTE(TIMING_COMPONENT); static inline char const *get_component_name(int index) { switch (index) { case av1_encode_strategy_time: return "av1_encode_strategy_time"; case av1_get_one_pass_rt_params_time: return "av1_get_one_pass_rt_params_time"; case av1_get_second_pass_params_time: return "av1_get_second_pass_params_time"; case denoise_and_encode_time: return "denoise_and_encode_time"; case apply_filtering_time: return "apply_filtering_time"; case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time"; case encode_frame_to_data_rate_time: return "encode_frame_to_data_rate_time"; case encode_with_or_without_recode_time: return "encode_with_or_without_recode_time"; case loop_filter_time: return "loop_filter_time"; case cdef_time: return "cdef_time"; case loop_restoration_time: return "loop_restoration_time"; case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time"; case av1_encode_frame_time: return "av1_encode_frame_time"; case av1_compute_global_motion_time: return "av1_compute_global_motion_time"; case av1_setup_motion_field_time: return "av1_setup_motion_field_time"; case encode_sb_row_time: return "encode_sb_row_time"; case rd_pick_partition_time: return "rd_pick_partition_time"; case rd_use_partition_time: return "rd_use_partition_time"; case choose_var_based_partitioning_time: return "choose_var_based_partitioning_time"; case av1_prune_partitions_time: return "av1_prune_partitions_time"; case none_partition_search_time: return "none_partition_search_time"; case split_partition_search_time: return "split_partition_search_time"; case rectangular_partition_search_time: return "rectangular_partition_search_time"; case ab_partitions_search_time: return "ab_partitions_search_time"; case rd_pick_4partition_time: 
return "rd_pick_4partition_time"; case encode_sb_time: return "encode_sb_time"; case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; case av1_rd_pick_intra_mode_sb_time: return "av1_rd_pick_intra_mode_sb_time"; case av1_rd_pick_inter_mode_sb_time: return "av1_rd_pick_inter_mode_sb_time"; case set_params_rd_pick_inter_mode_time: return "set_params_rd_pick_inter_mode_time"; case skip_inter_mode_time: return "skip_inter_mode_time"; case handle_inter_mode_time: return "handle_inter_mode_time"; case evaluate_motion_mode_for_winner_candidates_time: return "evaluate_motion_mode_for_winner_candidates_time"; case do_tx_search_time: return "do_tx_search_time"; case handle_intra_mode_time: return "handle_intra_mode_time"; case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time"; case av1_search_palette_mode_time: return "av1_search_palette_mode_time"; case handle_newmv_time: return "handle_newmv_time"; case compound_type_rd_time: return "compound_type_rd_time"; case interpolation_filter_search_time: return "interpolation_filter_search_time"; case motion_mode_rd_time: return "motion_mode_rd_time"; case nonrd_use_partition_time: return "nonrd_use_partition_time"; case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time"; case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time"; case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time"; case encode_b_nonrd_time: return "encode_b_nonrd_time"; default: assert(0); } return "error"; } #endif // The maximum number of internal ARFs except ALTREF_FRAME #define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1) /*!\endcond */ /*! * \brief Parameters related to global motion search */ typedef struct { /*! * Flag to indicate if global motion search needs to be rerun. */ bool search_done; /*! * Array of pointers to the frame buffers holding the reference frames. * ref_buf[i] stores the pointer to the reference frame of the ith * reference frame type. */ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; /*! * Holds the number of valid reference frames in past and future directions * w.r.t. the current frame. num_ref_frames[i] stores the total number of * valid reference frames in 'i' direction. */ int num_ref_frames[MAX_DIRECTIONS]; /*! * Array of structure which stores the valid reference frames in past and * future directions and their corresponding distance from the source frame. * reference_frames[i][j] holds the jth valid reference frame type in the * direction 'i' and its temporal distance from the source frame . */ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1]; /** * \name Dimensions for which segment map is allocated. */ /**@{*/ int segment_map_w; /*!< segment map width */ int segment_map_h; /*!< segment map height */ /**@}*/ } GlobalMotionInfo; /*! * \brief Flags related to interpolation filter search */ typedef struct { /*! * Stores the default value of skip flag depending on chroma format * Set as 1 for monochrome and 3 for other color formats */ int default_interp_skip_flags; /*! * Filter mask to allow certain interp_filter type. */ uint16_t interp_filter_search_mask; } InterpSearchFlags; /*! * \brief Parameters for motion vector search process */ typedef struct { /*! * Largest MV component used in a frame. * The value from the previous frame is used to set the full pixel search * range for the current frame. */ int max_mv_magnitude; /*! * Parameter indicating initial search window to be used in full-pixel search. * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window. 
*/ int mv_step_param; /*! * Pointer to sub-pixel search function. * In encoder: av1_find_best_sub_pixel_tree * av1_find_best_sub_pixel_tree_pruned * av1_find_best_sub_pixel_tree_pruned_more * In MV unit test: av1_return_max_sub_pixel_mv * av1_return_min_sub_pixel_mv */ fractional_mv_step_fp *find_fractional_mv_step; /*! * Search site configuration for full-pel MV search. * search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple * motion search. search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal * filter search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead */ search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS]; } MotionVectorSearchParams; /*! * \brief Refresh frame flags for different type of frames. * * If the refresh flag is true for a particular reference frame, after the * current frame is encoded, the reference frame gets refreshed (updated) to * be the current frame. Note: Usually at most one flag will be set to true at * a time. But, for key-frames, all flags are set to true at once. */ typedef struct { bool golden_frame; /*!< Refresh flag for golden frame */ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */ } RefreshFrameInfo; /*! * \brief Desired dimensions for an externally triggered resize. * * When resize is triggered externally, the desired dimensions are stored in * this struct until used in the next frame to be coded. These values are * effective only for one frame and are reset after they are used. */ typedef struct { int width; /*!< Desired resized width */ int height; /*!< Desired resized height */ } ResizePendingParams; /*! * \brief Refrence frame distance related variables. */ typedef struct { /*! * True relative distance of reference frames w.r.t. the current frame. */ int ref_relative_dist[INTER_REFS_PER_FRAME]; /*! * The nearest reference w.r.t. current frame in the past. */ int8_t nearest_past_ref; /*! * The nearest reference w.r.t. current frame in the future. */ int8_t nearest_future_ref; } RefFrameDistanceInfo; /*! * \brief Parameters used for winner mode processing. * * This is a basic two pass approach: in the first pass, we reduce the number of * transform searches based on some thresholds during the rdopt process to find * the "winner mode". In the second pass, we perform a more through tx search * on the winner mode. * There are some arrays in the struct, and their indices are used in the * following manner: * Index 0: Default mode evaluation, Winner mode processing is not applicable * (Eg : IntraBc). * Index 1: Mode evaluation. * Index 2: Winner mode evaluation * Index 1 and 2 are only used when the respective speed feature is on. */ typedef struct { /*! * Threshold to determine if trellis optimization is to be enabled * based on : * 0 : dist threshold * 1 : satd threshold * Corresponds to enable_winner_mode_for_coeff_opt speed feature. */ unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2]; /*! * Determines the tx size search method during rdopt. * Corresponds to enable_winner_mode_for_tx_size_srch speed feature. */ TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES]; /*! * Controls how often we should approximate prediction error with tx * coefficients. If it's 0, then never. If 1, then it's during the tx_type * search only. If 2, then always. * Corresponds to tx_domain_dist_level speed feature. */ unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES]; /*! 
* Threshold to approximate pixel domain distortion with transform domain * distortion. This is only used if use_transform_domain_distortion is on. * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature. */ unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES]; /*! * Controls how often we should try to skip the transform process based on * result from dct. * Corresponds to use_skip_flag_prediction speed feature. */ unsigned int skip_txfm_level[MODE_EVAL_TYPES]; /*! * Predict DC only txfm blocks for default, mode and winner mode evaluation. * Index 0: Default mode evaluation, Winner mode processing is not applicable. * Index 1: Mode evaluation, Index 2: Winner mode evaluation */ unsigned int predict_dc_level[MODE_EVAL_TYPES]; } WinnerModeParams; /*! * \brief Frame refresh flags set by the external interface. * * Flags set by external interface to determine which reference buffers are * refreshed by this frame. When set, the encoder will update the particular * reference frame buffer with the contents of the current frame. */ typedef struct { bool last_frame; /*!< Refresh flag for last frame */ bool golden_frame; /*!< Refresh flag for golden frame */ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */ bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */ /*! * Flag indicating if the update of refresh frame flags is pending. */ bool update_pending; } ExtRefreshFrameFlagsInfo; /*! * \brief Flags signalled by the external interface at frame level. */ typedef struct { /*! * Bit mask to disable certain reference frame types. */ int ref_frame_flags; /*! * Frame refresh flags set by the external interface. */ ExtRefreshFrameFlagsInfo refresh_frame; /*! * Flag to enable the update of frame contexts at the end of a frame decode. */ bool refresh_frame_context; /*! * Flag to indicate that update of refresh_frame_context from external * interface is pending. */ bool refresh_frame_context_pending; /*! * Flag to enable temporal MV prediction. */ bool use_ref_frame_mvs; /*! * Indicates whether the current frame is to be coded as error resilient. */ bool use_error_resilient; /*! * Indicates whether the current frame is to be coded as s-frame. */ bool use_s_frame; /*! * Indicates whether the current frame's primary_ref_frame is set to * PRIMARY_REF_NONE. */ bool use_primary_ref_none; } ExternalFlags; /*!\cond */ typedef struct { // Some misc info int high_prec; int q; int order; // MV counters int inter_count; int intra_count; int default_mvs; int mv_joint_count[4]; int last_bit_zero; int last_bit_nonzero; // Keep track of the rates int total_mv_rate; int hp_total_mv_rate; int lp_total_mv_rate; // Texture info int horz_text; int vert_text; int diag_text; // Whether the current struct contains valid data int valid; } MV_STATS; typedef struct WeberStats { int64_t mb_wiener_variance; int64_t src_variance; int64_t rec_variance; int16_t src_pix_max; int16_t rec_pix_max; int64_t distortion; int64_t satd; double max_scale; } WeberStats; typedef struct { struct loopfilter lf; CdefInfo cdef_info; YV12_BUFFER_CONFIG copy_buffer; RATE_CONTROL rc; MV_STATS mv_stats; } CODING_CONTEXT; typedef struct { int frame_width; int frame_height; int mi_rows; int mi_cols; int mb_rows; int mb_cols; int num_mbs; aom_bit_depth_t bit_depth; int subsampling_x; int subsampling_y; } FRAME_INFO; /*! * \brief This structure stores different types of frame indices. */ typedef struct { int show_frame_count; } FRAME_INDEX_SET; /*!\endcond */ /*! 
* \brief Segmentation related information for the current frame. */ typedef struct { /*! * 3-bit number containing the segment affiliation for each 4x4 block in the * frame. map[y * stride + x] contains the segment id of the 4x4 block at * (x,y) position. */ uint8_t *map; /*! * Flag to indicate if current frame has lossless segments or not. * 1: frame has at least one lossless segment. * 0: frame has no lossless segments. */ bool has_lossless_segment; } EncSegmentationInfo; /*! * \brief Frame time stamps. */ typedef struct { /*! * Start time stamp of the previous frame */ int64_t prev_ts_start; /*! * End time stamp of the previous frame */ int64_t prev_ts_end; /*! * Start time stamp of the first frame */ int64_t first_ts_start; } TimeStamps; /*! * Pointers to the memory allocated for frame level transform coeff related * info. */ typedef struct { /*! * Pointer to the transformed coefficients buffer. */ tran_low_t *tcoeff; /*! * Pointer to the eobs buffer. */ uint16_t *eobs; /*! * Pointer to the entropy_ctx buffer. */ uint8_t *entropy_ctx; } CoeffBufferPool; #if !CONFIG_REALTIME_ONLY /*!\cond */ // DUCKY_ENCODE_FRAME_MODE is c version of EncodeFrameMode enum { DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1 // determines rdmult DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and // rdmult } UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE); enum { DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP } UENUM1BYTE(DUCKY_ENCODE_GOP_MODE); typedef struct DuckyEncodeFrameInfo { DUCKY_ENCODE_FRAME_MODE qp_mode; DUCKY_ENCODE_GOP_MODE gop_mode; int q_index; int rdmult; // These two arrays are equivalent to std::vector int *superblock_encode_qindex; int *superblock_encode_rdmult; int delta_q_enabled; } DuckyEncodeFrameInfo; typedef struct DuckyEncodeFrameResult { int global_order_idx; int q_index; int rdmult; int rate; int64_t dist; double psnr; } DuckyEncodeFrameResult; typedef struct DuckyEncodeInfo { DuckyEncodeFrameInfo frame_info; DuckyEncodeFrameResult frame_result; } DuckyEncodeInfo; /*!\endcond */ #endif /*!\cond */ typedef struct RTC_REF { /*! * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). */ int reference[INTER_REFS_PER_FRAME]; int ref_idx[INTER_REFS_PER_FRAME]; int refresh[REF_FRAMES]; int set_ref_frame_config; int non_reference_frame; int ref_frame_comp[3]; int gld_idx_1layer; /*! * Frame number of the last frame that refreshed the buffer slot. */ unsigned int buffer_time_index[REF_FRAMES]; /*! * Spatial layer id of the last frame that refreshed the buffer slot. */ unsigned char buffer_spatial_layer[REF_FRAMES]; /*! * Flag to indicate whether closest reference was the previous frame. */ bool reference_was_previous_frame; /*! * Flag to indicate this frame is based on longer term reference only, * for recovery from past loss, and it should be biased for improved coding. */ bool bias_recovery_frame; } RTC_REF; /*!\endcond */ /*! * \brief Structure to hold data corresponding to an encoded frame. */ typedef struct AV1_COMP_DATA { /*! * Buffer to store packed bitstream data of a frame. */ unsigned char *cx_data; /*! * Allocated size of the cx_data buffer. */ size_t cx_data_sz; /*! * Size of data written in the cx_data buffer. */ size_t frame_size; /*! * Flags for the frame. */ unsigned int lib_flags; /*! * Time stamp for start of frame. 
*/ int64_t ts_frame_start; /*! * Time stamp for end of frame. */ int64_t ts_frame_end; /*! * Flag to indicate flush call. */ int flush; /*! * Time base for sequence. */ const aom_rational64_t *timestamp_ratio; /*! * Decide to pop the source for this frame from input buffer queue. */ int pop_lookahead; } AV1_COMP_DATA; /*! * \brief Top level primary encoder structure */ typedef struct AV1_PRIMARY { /*! * Array of frame level encoder stage top level structures */ struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES]; /*! * Array of structures to hold data of frames encoded in a given parallel * encode set. */ struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1]; #if CONFIG_FPMT_TEST /*! * Flag which enables/disables simulation path for fpmt unit test. * 0 - FPMT integration * 1 - FPMT simulation */ FPMT_TEST_ENC_CFG fpmt_unit_test_cfg; /*! * Temporary variable simulating the delayed frame_probability update. */ FrameProbInfo temp_frame_probs; /*! * Temporary variable holding the updated frame probability across * frames. Copy its value to temp_frame_probs for frame_parallel_level 0 * frames or last frame in parallel encode set. */ FrameProbInfo temp_frame_probs_simulation; /*! * Temporary variable simulating the delayed update of valid global motion * model across frames. */ int temp_valid_gm_model_found[FRAME_UPDATE_TYPES]; #endif // CONFIG_FPMT_TEST /*! * Copy of cm->ref_frame_map maintained to facilitate sequential update of * ref_frame_map by lower layer depth frames encoded ahead of time in a * parallel encode set. */ RefCntBuffer *ref_frame_map_copy[REF_FRAMES]; /*! * Start time stamp of the last encoded show frame */ int64_t ts_start_last_show_frame; /*! * End time stamp of the last encoded show frame */ int64_t ts_end_last_show_frame; /*! * Number of frame level contexts(cpis) */ int num_fp_contexts; /*! * Loopfilter levels of the previous encoded frame. */ int filter_level[2]; /*! * Chrominance component loopfilter level of the previous encoded frame. */ int filter_level_u; /*! * Chrominance component loopfilter level of the previous encoded frame. */ int filter_level_v; /*! * Encode stage top level structure * During frame parallel encode, this is the same as parallel_cpi[0] */ struct AV1_COMP *cpi; /*! * Lookahead processing stage top level structure */ struct AV1_COMP *cpi_lap; /*! * Look-ahead context. */ struct lookahead_ctx *lookahead; /*! * Sequence parameters have been transmitted already and locked * or not. Once locked av1_change_config cannot change the seq * parameters. */ int seq_params_locked; /*! * Pointer to internal utility functions that manipulate aom_codec_* data * structures. */ struct aom_codec_pkt_list *output_pkt_list; /*! * When set, indicates that internal ARFs are enabled. */ int internal_altref_allowed; /*! * Tell if OVERLAY frame shows existing alt_ref frame. */ int show_existing_alt_ref; /*! * Information related to a gf group. */ GF_GROUP gf_group; /*! * Track prior gf group state. */ GF_STATE gf_state; /*! * Flag indicating whether look ahead processing (LAP) is enabled. */ int lap_enabled; /*! * Parameters for AV1 bitstream levels. */ AV1LevelParams level_params; /*! * Calculates PSNR on each frame when set to 1. */ int b_calculate_psnr; /*! * Number of frames left to be encoded, is 0 if limit is not set. */ int frames_left; /*! * Information related to two pass encoding. */ TWO_PASS twopass; /*! * Rate control related parameters. */ PRIMARY_RATE_CONTROL p_rc; /*! * Info and resources used by temporal filtering. 
*/ TEMPORAL_FILTER_INFO tf_info; /*! * Elements part of the sequence header, that are applicable for all the * frames in the video. */ SequenceHeader seq_params; /*! * Indicates whether to use SVC. */ int use_svc; /*! * If true, buffer removal times are present. */ bool buffer_removal_time_present; /*! * Number of temporal layers: may be > 1 for SVC (scalable vector coding). */ unsigned int number_temporal_layers; /*! * Number of spatial layers: may be > 1 for SVC (scalable vector coding). */ unsigned int number_spatial_layers; /*! * Code and details about current error status. */ struct aom_internal_error_info error; /*! * Function pointers to variants of sse/sad/variance computation functions. * fn_ptr[i] indicates the list of function pointers corresponding to block * size i. */ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; /*! * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of * the ith 16 x 16 block in raster scan order. */ double *tpl_sb_rdmult_scaling_factors; /*! * Parameters related to tpl. */ TplParams tpl_data; /*! * Motion vector stats of the previous encoded frame. */ MV_STATS mv_stats; #if CONFIG_INTERNAL_STATS /*!\cond */ uint64_t total_time_receive_data; uint64_t total_time_compress_data; unsigned int total_mode_chosen_counts[MAX_MODES]; int count[2]; uint64_t total_sq_error[2]; uint64_t total_samples[2]; ImageStat psnr[2]; double total_blockiness; double worst_blockiness; uint64_t total_bytes; double summed_quality; double summed_weights; double summed_quality_hbd; double summed_weights_hbd; unsigned int total_recode_hits; double worst_ssim; double worst_ssim_hbd; ImageStat fastssim; ImageStat psnrhvs; int b_calculate_blockiness; int b_calculate_consistency; double total_inconsistency; double worst_consistency; Ssimv *ssim_vars; Metrics metrics; /*!\endcond */ #endif #if CONFIG_ENTROPY_STATS /*! * Aggregates frame counts for the sequence. */ FRAME_COUNTS aggregate_fc; #endif // CONFIG_ENTROPY_STATS /*! * For each type of reference frame, this contains the index of a reference * frame buffer for a reference frame of the same type. We use this to * choose our primary reference frame (which is the most recent reference * frame of the same type as the current frame). */ int fb_of_context_type[REF_FRAMES]; /*! * Primary Multi-threading parameters. */ PrimaryMultiThreadInfo p_mt_info; /*! * Probabilities for pruning of various AV1 tools. */ FrameProbInfo frame_probs; /*! * Indicates if a valid global motion model has been found in the different * frame update types of a GF group. * valid_gm_model_found[i] indicates if valid global motion model has been * found in the frame update type with enum value equal to i */ int valid_gm_model_found[FRAME_UPDATE_TYPES]; /*! * Struct for the reference structure for RTC. */ RTC_REF rtc_ref; /*! * Struct for all intra mode row multi threading in the preprocess stage * when --deltaq-mode=3. */ AV1EncRowMultiThreadSync intra_row_mt_sync; } AV1_PRIMARY; /*! * \brief Top level encoder structure. */ typedef struct AV1_COMP { /*! * Pointer to top level primary encoder structure */ AV1_PRIMARY *ppi; /*! * Quantization and dequantization parameters for internal quantizer setup * in the encoder. */ EncQuantDequantParams enc_quant_dequant_params; /*! * Structure holding thread specific variables. */ ThreadData td; /*! * Statistics collected at frame level. */ FRAME_COUNTS counts; /*! * Holds buffer storing mode information at 4x4/8x8 level. */ MBMIExtFrameBufferInfo mbmi_ext_info; /*! 
* Buffer holding the transform block related information. * coeff_buffer_base[i] stores the transform block related information of the * ith superblock in raster scan order. */ CB_COEFF_BUFFER *coeff_buffer_base; /*! * Structure holding pointers to frame level memory allocated for transform * block related information. */ CoeffBufferPool coeff_buffer_pool; /*! * Structure holding variables common to encoder and decoder. */ AV1_COMMON common; /*! * Encoder configuration related parameters. */ AV1EncoderConfig oxcf; /*! * Stores the trellis optimization type at segment level. * optimize_seg_arr[i] stores the trellis opt type for ith segment. */ TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS]; /*! * Pointer to the frame buffer holding the source frame to be used during the * current stage of encoding. It can be the raw input, temporally filtered * input or scaled input. */ YV12_BUFFER_CONFIG *source; /*! * Pointer to the frame buffer holding the last raw source frame. * last_source is NULL for the following cases: * 1) First frame * 2) Alt-ref frames * 3) All frames for all-intra frame encoding. */ YV12_BUFFER_CONFIG *last_source; /*! * Pointer to the frame buffer holding the unscaled source frame. * It can be either the raw input or temporally filtered input. */ YV12_BUFFER_CONFIG *unscaled_source; /*! * Frame buffer holding the resized source frame (cropping / superres). */ YV12_BUFFER_CONFIG scaled_source; /*! * Pointer to the frame buffer holding the unscaled last source frame. */ YV12_BUFFER_CONFIG *unscaled_last_source; /*! * Frame buffer holding the resized last source frame. */ YV12_BUFFER_CONFIG scaled_last_source; /*! * Pointer to the original source frame. This is used to determine if the * content is screen. */ YV12_BUFFER_CONFIG *unfiltered_source; /*! * Frame buffer holding the orig source frame for PSNR calculation in rtc tf * case. */ YV12_BUFFER_CONFIG orig_source; /*! * Skip tpl setup when tpl data from gop length decision can be reused. */ int skip_tpl_setup_stats; /*! * Scaling factors used in the RD multiplier modulation. * TODO(sdeng): consider merge the following arrays. * tpl_rdmult_scaling_factors is a temporary buffer used to store the * intermediate scaling factors which are used in the calculation of * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the * intermediate scaling factor of the ith 16 x 16 block in raster scan order. */ double *tpl_rdmult_scaling_factors; /*! * Temporal filter context. */ TemporalFilterCtx tf_ctx; /*! * Pointer to CDEF search context. */ CdefSearchCtx *cdef_search_ctx; /*! * Variables related to forcing integer mv decisions for the current frame. */ ForceIntegerMVInfo force_intpel_info; /*! * Pointer to the buffer holding the scaled reference frames. * scaled_ref_buf[i] holds the scaled reference frame of type i. */ RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME]; /*! * Pointer to the buffer holding the last show frame. */ RefCntBuffer *last_show_frame_buf; /*! * Refresh frame flags for golden, bwd-ref and alt-ref frames. */ RefreshFrameInfo refresh_frame; /*! * Flag to reduce the number of reference frame buffers used in rt. */ int rt_reduce_num_ref_buffers; /*! * Flags signalled by the external interface at frame level. */ ExternalFlags ext_flags; /*! * Temporary frame buffer used to store the non-loop filtered reconstructed * frame during the search of loop filter level. */ YV12_BUFFER_CONFIG last_frame_uf; /*! * Temporary frame buffer used to store the loop restored frame during loop * restoration search. 
*/ YV12_BUFFER_CONFIG trial_frame_rst; /*! * Ambient reconstruction err target for force key frames. */ int64_t ambient_err; /*! * Parameters related to rate distortion optimization. */ RD_OPT rd; /*! * Temporary coding context used to save and restore when encoding with and * without super-resolution. */ CODING_CONTEXT coding_context; /*! * Parameters related to global motion search. */ GlobalMotionInfo gm_info; /*! * Parameters related to winner mode processing. */ WinnerModeParams winner_mode_params; /*! * Frame time stamps. */ TimeStamps time_stamps; /*! * Rate control related parameters. */ RATE_CONTROL rc; /*! * Frame rate of the video. */ double framerate; /*! * Bitmask indicating which reference buffers may be referenced by this frame. */ int ref_frame_flags; /*! * speed is passed as a per-frame parameter into the encoder. */ int speed; /*! * sf contains fine-grained config set internally based on speed. */ SPEED_FEATURES sf; /*! * Parameters for motion vector search process. */ MotionVectorSearchParams mv_search_params; /*! * When set, indicates that all reference frames are forward references, * i.e., all the reference frames are output before the current frame. */ int all_one_sided_refs; /*! * Segmentation related information for current frame. */ EncSegmentationInfo enc_seg; /*! * Parameters related to cyclic refresh aq-mode. */ CYCLIC_REFRESH *cyclic_refresh; /*! * Parameters related to active map. Active maps indicate * if there is any activity on a 4x4 block basis. */ ActiveMap active_map; /*! * The frame processing order within a GOP. */ unsigned char gf_frame_index; #if CONFIG_INTERNAL_STATS /*!\cond */ uint64_t time_compress_data; unsigned int mode_chosen_counts[MAX_MODES]; int bytes; unsigned int frame_recode_hits; /*!\endcond */ #endif #if CONFIG_SPEED_STATS /*! * For debugging: number of transform searches we have performed. */ unsigned int tx_search_count; #endif // CONFIG_SPEED_STATS /*! * When set, indicates that the frame is droppable, i.e., this frame * does not update any reference buffers. */ int droppable; /*! * Stores the frame parameters during encoder initialization. */ FRAME_INFO frame_info; /*! * Stores different types of frame indices. */ FRAME_INDEX_SET frame_index_set; /*! * Stores the cm->width in the last call of alloc_compressor_data(). Helps * determine whether compressor data should be reallocated when cm->width * changes. */ int data_alloc_width; /*! * Stores the cm->height in the last call of alloc_compressor_data(). Helps * determine whether compressor data should be reallocated when cm->height * changes. */ int data_alloc_height; /*! * Number of MBs in the full-size frame; to be used to * normalize the firstpass stats. This will differ from the * number of MBs in the current frame when the frame is * scaled. */ int initial_mbs; /*! * Flag to indicate whether the frame size inforamation has been * setup and propagated to associated allocations. */ bool frame_size_related_setup_done; /*! * The width of the frame that is lastly encoded. * It is updated in the function "encoder_encode()". */ int last_coded_width; /*! * The height of the frame that is lastly encoded. * It is updated in the function "encoder_encode()". */ int last_coded_height; /*! * Resize related parameters. */ ResizePendingParams resize_pending_params; /*! * Pointer to struct holding adaptive data/contexts/models for the tile during * encoding. */ TileDataEnc *tile_data; /*! * Number of tiles for which memory has been allocated for tile_data. */ int allocated_tiles; /*! 
* Structure to store the palette token related information. */ TokenInfo token_info; /*! * VARIANCE_AQ segment map refresh. */ int vaq_refresh; /*! * Thresholds for variance based partitioning. */ VarBasedPartitionInfo vbp_info; /*! * Number of recodes in the frame. */ int num_frame_recode; /*! * Current frame probability of parallel frames, across recodes. */ FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME]; /*! * Retain condition for transform type frame_probability calculation */ int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME]; /*! * Retain condition for obmc frame_probability calculation */ int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME]; /*! * Retain condition for warped motion frame_probability calculation */ int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME]; /*! * Retain condition for interpolation filter frame_probability calculation */ int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME]; #if CONFIG_FPMT_TEST /*! * Temporary variable for simulation. * Previous frame's framerate. */ double temp_framerate; #endif /*! * Updated framerate for the current parallel frame. * cpi->framerate is updated with new_framerate during * post encode updates for parallel frames. */ double new_framerate; /*! * Retain condition for fast_extra_bits calculation. */ int do_update_vbr_bits_off_target_fast; /*! * Multi-threading parameters. */ MultiThreadInfo mt_info; /*! * Specifies the frame to be output. It is valid only if show_existing_frame * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to * INVALID_IDX. */ int existing_fb_idx_to_show; /*! * A flag to indicate if intrabc is ever used in current frame. */ int intrabc_used; /*! * Mark which ref frames can be skipped for encoding current frame during RDO. */ int prune_ref_frame_mask; /*! * Loop Restoration context. */ AV1LrStruct lr_ctxt; /*! * Loop Restoration context used during pick stage. */ AV1LrPickStruct pick_lr_ctxt; /*! * Pointer to list of tables with film grain parameters. */ aom_film_grain_table_t *film_grain_table; #if CONFIG_DENOISE /*! * Pointer to structure holding the denoised image buffers and the helper * noise models. */ struct aom_denoise_and_model_t *denoise_and_model; #endif /*! * Flags related to interpolation filter search. */ InterpSearchFlags interp_search_flags; /*! * Turn on screen content tools flag. * Note that some videos are not screen content videos, but * screen content tools could also improve coding efficiency. * For example, videos with large flat regions, gaming videos that look * like natural videos. */ int use_screen_content_tools; /*! * A flag to indicate "real" screen content videos. * For example, screen shares, screen editing. * This type is true indicates |use_screen_content_tools| must be true. * In addition, rate control strategy is adjusted when this flag is true. */ int is_screen_content_type; #if CONFIG_COLLECT_PARTITION_STATS /*! * Accumulates the partition timing stat over the whole frame. */ FramePartitionTimingStats partition_stats; #endif // CONFIG_COLLECT_PARTITION_STATS #if CONFIG_COLLECT_COMPONENT_TIMING /*! * component_time[] are initialized to zero while encoder starts. */ uint64_t component_time[kTimingComponents]; /*! * Stores timing for individual components between calls of start_timing() * and end_timing(). */ struct aom_usec_timer component_timer[kTimingComponents]; /*! * frame_component_time[] are initialized to zero at beginning of each frame. */ uint64_t frame_component_time[kTimingComponents]; #endif /*! 
* Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. */ int frame_header_count; /*! * Whether any no-zero delta_q was actually used. */ int deltaq_used; /*! * Refrence frame distance related variables. */ RefFrameDistanceInfo ref_frame_dist_info; /*! * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of * the ith 16 x 16 block in raster scan order. This scaling factor is used for * RD multiplier modulation when SSIM tuning is enabled. */ double *ssim_rdmult_scaling_factors; #if CONFIG_TUNE_VMAF /*! * Parameters for VMAF tuning. */ TuneVMAFInfo vmaf_info; #endif #if CONFIG_TUNE_BUTTERAUGLI /*! * Parameters for Butteraugli tuning. */ TuneButteraugliInfo butteraugli_info; #endif /*! * Parameters for scalable video coding. */ SVC svc; /*! * Indicates whether current processing stage is encode stage or LAP stage. */ COMPRESSOR_STAGE compressor_stage; /*! * Frame type of the last frame. May be used in some heuristics for speeding * up the encoding. */ FRAME_TYPE last_frame_type; /*! * Number of tile-groups. */ int num_tg; /*! * Super-resolution mode currently being used by the encoder. * This may / may not be same as user-supplied mode in oxcf->superres_mode * (when we are recoding to try multiple options for example). */ aom_superres_mode superres_mode; /*! * First pass related data. */ FirstPassData firstpass_data; /*! * Temporal Noise Estimate */ NOISE_ESTIMATE noise_estimate; #if CONFIG_AV1_TEMPORAL_DENOISING /*! * Temporal Denoiser */ AV1_DENOISER denoiser; #endif /*! * Count on how many consecutive times a block uses small/zeromv for encoding * in a scale of 8x8 block. */ uint8_t *consec_zero_mv; /*! * Allocated memory size for |consec_zero_mv|. */ int consec_zero_mv_alloc_size; /*! * Block size of first pass encoding */ BLOCK_SIZE fp_block_size; /*! * The counter of encoded super block, used to differentiate block names. * This number starts from 0 and increases whenever a super block is encoded. */ int sb_counter; /*! * Available bitstream buffer size in bytes */ size_t available_bs_size; /*! * The controller of the external partition model. * It is used to do partition type selection based on external models. */ ExtPartController ext_part_controller; /*! * Motion vector stats of the current encoded frame, used to update the * ppi->mv_stats during postencode. */ MV_STATS mv_stats; /*! * Stores the reference refresh index for the current frame. */ int ref_refresh_index; /*! * A flag to indicate if the reference refresh index is available for the * current frame. */ bool refresh_idx_available; /*! * Reference frame index corresponding to the frame to be excluded from being * used as a reference by frame_parallel_level 2 frame in a parallel * encode set of lower layer frames. */ int ref_idx_to_skip; #if CONFIG_FPMT_TEST /*! * Stores the wanted frame buffer index for choosing primary ref frame by a * frame_parallel_level 2 frame in a parallel encode set of lower layer * frames. */ int wanted_fb; #endif // CONFIG_FPMT_TEST /*! * A flag to indicate frames that will update their data to the primary * context at the end of the encode. It is set for non-parallel frames and the * last frame in encode order in a given parallel encode set. */ bool do_frame_data_update; #if CONFIG_RD_COMMAND /*! * A structure for assigning external q_index / rdmult for experiments */ RD_COMMAND rd_command; #endif // CONFIG_RD_COMMAND /*! * Buffer to store MB variance after Wiener filter. */ WeberStats *mb_weber_stats; /*! 
* Buffer to store rate cost estimates for each macro block (8x8) in the * preprocessing stage used in allintra mode. */ int *prep_rate_estimates; /*! * Buffer to store rate cost estimates for each 16x16 block read * from an external file, used in allintra mode. */ double *ext_rate_distribution; /*! * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate. */ double ext_rate_scale; /*! * Buffer to store MB variance after Wiener filter. */ BLOCK_SIZE weber_bsize; /*! * Frame level Wiener filter normalization. */ int64_t norm_wiener_variance; /*! * Buffer to store delta-q values for delta-q mode 4. */ int *mb_delta_q; /*! * Flag to indicate that current frame is dropped. */ bool is_dropped_frame; #if CONFIG_BITRATE_ACCURACY /*! * Structure stores information needed for bitrate accuracy experiment. */ VBR_RATECTRL_INFO vbr_rc_info; #endif #if CONFIG_RATECTRL_LOG /*! * Structure stores information of rate control decisions. */ RATECTRL_LOG rc_log; #endif // CONFIG_RATECTRL_LOG /*! * Frame level twopass status and control data */ TWO_PASS_FRAME twopass_frame; #if CONFIG_THREE_PASS /*! * Context needed for third pass encoding. */ THIRD_PASS_DEC_CTX *third_pass_ctx; #endif /*! * File pointer to second pass log */ FILE *second_pass_log_stream; /*! * Buffer to store 64x64 SAD */ uint64_t *src_sad_blk_64x64; /*! * SSE between the current frame and the reconstructed last frame * It is only used for CBR mode. * It is not used if the reference frame has a different frame size. */ uint64_t rec_sse; /*! * A flag to indicate whether the encoder is controlled by DuckyEncode or not. * 1:yes 0:no */ int use_ducky_encode; #if !CONFIG_REALTIME_ONLY /*! A structure that facilitates the communication between DuckyEncode and AV1 * encoder. */ DuckyEncodeInfo ducky_encode_info; #endif // CONFIG_REALTIME_ONLY // /*! * Frames since last frame with cdf update. */ int frames_since_last_update; /*! * Block level thresholds to force zeromv-skip at partition level. */ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL]; /*! * Should we allocate a downsampling pyramid for each frame buffer? * This is currently only used for global motion */ bool alloc_pyramid; #if CONFIG_SALIENCY_MAP /*! * Pixel level saliency map for each frame. */ uint8_t *saliency_map; /*! * Superblock level rdmult scaling factor driven by saliency map. */ double *sm_scaling_factor; #endif /*! * Number of pixels that choose palette mode for luma in the * fast encoding pass in av1_determine_sc_tools_with_encoding(). */ int palette_pixel_num; /*! * Flag to indicate scaled_last_source is available, * so scaling is not needed for last_source. */ int scaled_last_source_available; } AV1_COMP; /*! * \brief Input frames and last input frame */ typedef struct EncodeFrameInput { /*!\cond */ YV12_BUFFER_CONFIG *source; YV12_BUFFER_CONFIG *last_source; int64_t ts_duration; /*!\endcond */ } EncodeFrameInput; /*! * \brief contains per-frame encoding parameters decided upon by * av1_encode_strategy() and passed down to av1_encode(). */ typedef struct EncodeFrameParams { /*! * Is error resilient mode enabled */ int error_resilient_mode; /*! * Frame type (eg KF vs inter frame etc) */ FRAME_TYPE frame_type; /*!\cond */ int primary_ref_frame; int order_offset; /*!\endcond */ /*! * Should the current frame be displayed after being decoded */ int show_frame; /*!\cond */ int refresh_frame_flags; int show_existing_frame; int existing_fb_idx_to_show; /*!\endcond */ /*! * Bitmask of which reference buffers may be referenced by this frame. 
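 * The bits are expected to follow the per-reference AOM_*_FLAG layout used by
 * av1_ref_frame_flag_list later in this header.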
*/ int ref_frame_flags; /*! * Reference buffer assignment for this frame. */ int remapped_ref_idx[REF_FRAMES]; /*! * Flags which determine which reference buffers are refreshed by this * frame. */ RefreshFrameInfo refresh_frame; /*! * Speed level to use for this frame: Bigger number means faster. */ int speed; } EncodeFrameParams; /*!\cond */ void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage); struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, BufferPool *const pool, COMPRESSOR_STAGE stage, int lap_lag_in_frames); struct AV1_PRIMARY *av1_create_primary_compressor( struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, const AV1EncoderConfig *oxcf); void av1_remove_compressor(AV1_COMP *cpi); void av1_remove_primary_compressor(AV1_PRIMARY *ppi); #if CONFIG_ENTROPY_STATS void print_entropy_stats(AV1_PRIMARY *const ppi); #endif #if CONFIG_INTERNAL_STATS void print_internal_stats(AV1_PRIMARY *ppi); #endif void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, bool *sb_size_changed); void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf, bool sb_size_changed); aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y); void av1_post_encode_updates(AV1_COMP *const cpi, const AV1_COMP_DATA *const cpi_data); void av1_release_scaled_references_fpmt(AV1_COMP *cpi); void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool, int ref_buffers_used_map); void av1_init_sc_decisions(AV1_PRIMARY *const ppi); AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi, AV1_COMP_DATA *const first_cpi_data); int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data, AV1_PRIMARY *const ppi, int *ref_buffers_used_map); /*!\endcond */ /*!\brief Obtain the raw frame data * * \ingroup high_level_algo * This function receives the raw frame data from input. * * \param[in] cpi Top-level encoder structure * \param[in] frame_flags Flags to decide how to encoding the frame * \param[in,out] sd Contain raw frame data * \param[in] time_stamp Time stamp of the frame * \param[in] end_time_stamp End time stamp * * \return Returns a value to indicate if the frame data is received * successfully. * \note The caller can assume that a copy of this frame is made and not just a * copy of the pointer. */ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); /*!\brief Encode a frame * * \ingroup high_level_algo * \callgraph * \callergraph * This function encodes the raw frame data, and outputs the frame bit stream * to the designated buffer. The caller should use the output parameters * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function * returns AOM_CODEC_OK. * * \param[in] cpi Top-level encoder structure * \param[in,out] cpi_data Data corresponding to a frame encode * * \return Returns a value to indicate if the encoding is done successfully. * \retval #AOM_CODEC_OK * \retval -1 * No frame encoded; more input is required. * \retval "A nonzero (positive) aom_codec_err_t code" * The encoding failed with the error. Sets the error code and error message * in \c cpi->common.error. 
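 *
 * A minimal call-sequence sketch (illustrative only; real callers perform
 * additional buffer management, flushing and error handling; out_buf,
 * out_buf_size, raw_frame, flags, ts_start, ts_end and timestamp_ratio are
 * assumed to be provided by the caller):
 * \code
 *   AV1_COMP_DATA cpi_data = { 0 };
 *   cpi_data.cx_data = out_buf;          // caller-owned bitstream buffer
 *   cpi_data.cx_data_sz = out_buf_size;  // allocated size of out_buf
 *   cpi_data.timestamp_ratio = &timestamp_ratio;
 *   if (av1_receive_raw_frame(cpi, flags, &raw_frame, ts_start, ts_end) == 0 &&
 *       av1_get_compressed_data(cpi, &cpi_data) == AOM_CODEC_OK) {
 *     // cpi_data.frame_size bytes of bitstream are available in cx_data, and
 *     // cpi_data.ts_frame_start / ts_frame_end hold the output time stamps.
 *   }
 * \endcode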
*/ int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data); /*!\brief Run 1-pass/2-pass encoding * * \ingroup high_level_algo * \callgraph * \callergraph */ int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, size_t dest_size, const EncodeFrameInput *const frame_input, const EncodeFrameParams *const frame_params, size_t *const frame_size); /*!\cond */ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *sd); int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags); int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); void av1_set_frame_size(AV1_COMP *cpi, int width, int height); void av1_set_mv_search_params(AV1_COMP *cpi); int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); int av1_set_internal_size(AV1EncoderConfig *const oxcf, ResizePendingParams *resize_pending_params, AOM_SCALING_MODE horiz_mode, AOM_SCALING_MODE vert_mode); int av1_get_quantizer(struct AV1_COMP *cpi); // This function assumes that the input buffer contains valid OBUs. It should // not be called on untrusted input. int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t buffer_size, size_t *input_size); void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td); void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td); // Set screen content options. // This function estimates whether to use screen content tools, by counting // the portion of blocks that have few luma colors. // Modifies: // cpi->commom.features.allow_screen_content_tools // cpi->common.features.allow_intrabc // cpi->use_screen_content_tools // cpi->is_screen_content_type // However, the estimation is not accurate and may misclassify videos. // A slower but more accurate approach that determines whether to use screen // content tools is employed later. See av1_determine_sc_tools_with_encoding(). void av1_set_screen_content_options(struct AV1_COMP *cpi, FeatureFlags *features); void av1_update_frame_size(AV1_COMP *cpi); typedef struct { int pyr_level; int disp_order; } RefFrameMapPair; static inline void init_ref_map_pair( AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) { memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES); return; } memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES); for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { // Get reference frame buffer. const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx]; if (ref_frame_map_pairs[map_idx].disp_order == -1) continue; if (buf == NULL) { ref_frame_map_pairs[map_idx].disp_order = -1; ref_frame_map_pairs[map_idx].pyr_level = -1; continue; } else if (buf->ref_count > 1) { // Once the keyframe is coded, the slots in ref_frame_map will all // point to the same frame. In that case, all subsequent pointers // matching the current are considered "free" slots. This will find // the next occurrence of the current pointer if ref_count indicates // there are multiple instances of it and mark it as free. 
for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) { const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2]; if (buf2 == buf) { ref_frame_map_pairs[idx2].disp_order = -1; ref_frame_map_pairs[idx2].pyr_level = -1; } } } ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint; ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level; } } #if CONFIG_FPMT_TEST static inline void calc_frame_data_update_flag( GF_GROUP *const gf_group, int gf_frame_index, bool *const do_frame_data_update) { *do_frame_data_update = true; // Set the flag to false for all frames in a given parallel encode set except // the last frame in the set with frame_parallel_level = 2. if (gf_group->frame_parallel_level[gf_frame_index] == 1) { *do_frame_data_update = false; } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) { // Check if this is the last frame in the set with frame_parallel_level = 2. for (int i = gf_frame_index + 1; i < gf_group->size; i++) { if ((gf_group->frame_parallel_level[i] == 0 && (gf_group->update_type[i] == ARF_UPDATE || gf_group->update_type[i] == INTNL_ARF_UPDATE)) || gf_group->frame_parallel_level[i] == 1) { break; } else if (gf_group->frame_parallel_level[i] == 2) { *do_frame_data_update = false; break; } } } } #endif // av1 uses 10,000,000 ticks/second as time stamp #define TICKS_PER_SEC 10000000LL static inline int64_t timebase_units_to_ticks( const aom_rational64_t *timestamp_ratio, int64_t n) { return n * timestamp_ratio->num / timestamp_ratio->den; } static inline int64_t ticks_to_timebase_units( const aom_rational64_t *timestamp_ratio, int64_t n) { int64_t round = timestamp_ratio->num / 2; if (round > 0) --round; return (n * timestamp_ratio->den + round) / timestamp_ratio->num; } static inline int frame_is_kf_gf_arf(const AV1_COMP *cpi) { const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const FRAME_UPDATE_TYPE update_type = gf_group->update_type[cpi->gf_frame_index]; return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE || update_type == GF_UPDATE; } // TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. static inline int av1_use_hash_me(const AV1_COMP *const cpi) { return (cpi->common.features.allow_screen_content_tools && cpi->common.features.allow_intrabc && frame_is_intra_only(&cpi->common)); } static inline const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf( const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); return buf != NULL ? &buf->buf : NULL; } static inline void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { assert(buf != NULL); ensure_mv_buffer(buf, cm); buf->width = cm->width; buf->height = cm->height; } // Get the allocated token size for a tile. It does the same calculation as in // the frame token allocation. 
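// Illustrative arithmetic for the conversion below (not part of the
// interface): the tile extents are given in 4x4 mi units, and
// ROUND_POWER_OF_TWO(x, 2) converts them to 16x16 macroblock units. For a
// hypothetical tile spanning 70 mi rows and 38 mi columns, allocated_tokens()
// sizes the buffer for (70 + 2) >> 2 = 18 MB rows and (38 + 2) >> 2 = 10 MB
// columns before handing them to get_token_alloc().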
static inline unsigned int allocated_tokens(const TileInfo *tile, int sb_size_log2, int num_planes) { int tile_mb_rows = ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2); int tile_mb_cols = ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2); return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); } static inline void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, int mi_row, TokenExtra **tok, int sb_size_log2, int num_planes) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; const int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2; *tok = cpi->token_info.tile_tok[tile_row][tile_col] + get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes); } void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); #define ALT_MIN_LAG 3 static inline int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; } static inline int can_disable_altref(const GFConfig *gf_cfg) { return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) && (gf_cfg->gf_min_pyr_height == 0); } // Helper function to compute number of blocks on either side of the frame. static inline int get_num_blocks(const int frame_length, const int mb_length) { return (frame_length + mb_length - 1) / mb_length; } // Check if statistics generation stage static inline int is_stat_generation_stage(const AV1_COMP *const cpi) { assert(IMPLIES(cpi->compressor_stage == LAP_STAGE, cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled)); return (cpi->oxcf.pass == AOM_RC_FIRST_PASS || (cpi->compressor_stage == LAP_STAGE)); } // Check if statistics consumption stage static inline int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS); } // Check if statistics consumption stage static inline int is_stat_consumption_stage(const AV1_COMP *const cpi) { return (is_stat_consumption_stage_twopass(cpi) || (cpi->oxcf.pass == AOM_RC_ONE_PASS && (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled)); } // Decide whether 'dv_costs' need to be allocated/stored during the encoding. static inline bool av1_need_dv_costs(const AV1_COMP *const cpi) { return !cpi->sf.rt_sf.use_nonrd_pick_mode && av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi); } /*!\endcond */ /*!\brief Check if the current stage has statistics * *\ingroup two_pass_algo * * \param[in] cpi Top - level encoder instance structure * * \return 0 if no stats for current stage else 1 */ static inline int has_no_stats_stage(const AV1_COMP *const cpi) { assert( IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled); } /*!\cond */ static inline int is_one_pass_rt_params(const AV1_COMP *cpi) { return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && cpi->oxcf.gf_cfg.lag_in_frames == 0; } // Use default/internal reference structure for single-layer RTC. static inline int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) { return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 && cpi->ppi->number_temporal_layers == 1 && !cpi->ppi->rtc_ref.set_ref_frame_config; } // Check if postencode drop is allowed. 
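// For example (an illustrative configuration, not an exhaustive statement of
// the conditions): a one-pass real-time CBR encode with
// pass == AOM_RC_ONE_PASS, mode == REALTIME, lag_in_frames == 0,
// rc_cfg.mode == AOM_CBR and a nonzero rc_cfg.drop_frames_water_mark passes
// the checks below, provided external rate control is not used, the frame is
// not intra-only and svc.spatial_layer_id is 0.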
static inline int allow_postencode_drop_rtc(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; return is_one_pass_rt_params(cpi) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.rc_cfg.drop_frames_water_mark > 0 && !cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm) && cpi->svc.spatial_layer_id == 0; } // Function return size of frame stats buffer static inline int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */ return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer); } // TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf static inline void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_ref_scale_factors[0] = get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1); xd->block_ref_scale_factors[1] = get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1); } static inline int get_chessboard_index(int frame_index) { return frame_index & 0x1; } static inline const int *cond_cost_list_const(const struct AV1_COMP *cpi, const int *cost_list) { const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && cpi->sf.mv_sf.use_fullpel_costlist; return use_cost_list ? cost_list : NULL; } static inline int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && cpi->sf.mv_sf.use_fullpel_costlist; return use_cost_list ? cost_list : NULL; } // Compression ratio of current frame. double av1_get_compression_ratio(const AV1_COMMON *const cm, size_t encoded_frame_size); void av1_new_framerate(AV1_COMP *cpi, double framerate); void av1_setup_frame_size(AV1_COMP *cpi); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) // Returns 1 if a frame is scaled and 0 otherwise. static inline int av1_resize_scaled(const AV1_COMMON *cm) { return cm->superres_upscaled_width != cm->render_width || cm->superres_upscaled_height != cm->render_height; } static inline int av1_frame_scaled(const AV1_COMMON *cm) { return av1_superres_scaled(cm) || av1_resize_scaled(cm); } // Don't allow a show_existing_frame to coincide with an error resilient // frame. An exception can be made for a forward keyframe since it has no // previous dependencies. static inline int encode_show_existing_frame(const AV1_COMMON *cm) { return cm->show_existing_frame && (!cm->features.error_resilient_mode || cm->current_frame.frame_type == KEY_FRAME); } // Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given // 'mi_row' and 'mi_col'. static inline int get_mi_ext_idx(const int mi_row, const int mi_col, const BLOCK_SIZE mi_alloc_bsize, const int mbmi_ext_stride) { const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize]; const int mi_ext_row = mi_row / mi_ext_size_1d; const int mi_ext_col = mi_col / mi_ext_size_1d; return mi_ext_row * mbmi_ext_stride + mi_ext_col; } // Lighter version of set_offsets that only sets the mode info // pointers. 
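// Worked example for get_mi_ext_idx() above, as used by
// set_mode_info_offsets() below (illustrative values): with
// mi_alloc_bsize == BLOCK_16X16 we have mi_size_wide[BLOCK_16X16] == 4, so a
// block at mi_row == 17, mi_col == 9 maps to index
// (17 / 4) * mbmi_ext_stride + 9 / 4, i.e. 4 * mbmi_ext_stride + 2 into the
// mbmi_ext_info->frame_base array.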
static inline void set_mode_info_offsets( const CommonModeInfoParams *const mi_params, const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x, MACROBLOCKD *const xd, int mi_row, int mi_col) { set_mi_offsets(mi_params, xd, mi_row, mi_col); const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize, mbmi_ext_info->stride); x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx; } // Check to see if the given partition size is allowed for a specified number // of mi block rows and columns remaining in the image. // If not then return the largest allowed partition size static inline BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, int cols_left, int *bh, int *bw) { int int_size = (int)bsize; if (rows_left <= 0 || cols_left <= 0) { return AOMMIN(bsize, BLOCK_8X8); } else { for (; int_size > 0; int_size -= 3) { *bh = mi_size_high[int_size]; *bw = mi_size_wide[int_size]; if ((*bh <= rows_left) && (*bw <= cols_left)) { break; } } } return (BLOCK_SIZE)int_size; } static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0, AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG, AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG }; // When more than 'max_allowed_refs' are available, we reduce the number of // reference frames one at a time based on this order. static const MV_REFERENCE_FRAME disable_order[] = { LAST3_FRAME, LAST2_FRAME, ALTREF2_FRAME, BWDREF_FRAME, }; static const MV_REFERENCE_FRAME ref_frame_priority_order[INTER_REFS_PER_FRAME] = { LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME, ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, }; static inline int get_ref_frame_flags(const SPEED_FEATURES *const sf, const int use_one_pass_rt_params, const YV12_BUFFER_CONFIG **ref_frames, const int ext_ref_frame_flags) { // cpi->ext_flags.ref_frame_flags allows certain reference types to be // disabled by the external interface. These are set by // av1_apply_encoding_flags(). Start with what the external interface allows, // then suppress any reference types which we have found to be duplicates. int flags = ext_ref_frame_flags; for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) { const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i]; // If this_ref has appeared before, mark the corresponding ref frame as // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd). int index = (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME) ? (1 + sf->rt_sf.use_nonrd_altref_frame) : i; for (int j = 0; j < index; ++j) { // If this_ref has appeared before (same as the reference corresponding // to lower index j), remove it as a reference only if that reference // (for index j) is actually used as a reference. if (this_ref == ref_frames[j] && (flags & (1 << (ref_frame_priority_order[j] - 1)))) { flags &= ~(1 << (ref_frame_priority_order[i] - 1)); break; } } } return flags; } // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this // function, the memory must be freed by the caller. Both the buf member of the // aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory // returned must be freed via call to free(). // // Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, // the obu_has_size_field bit is set, and the buffer contains the obu_size // field. 
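// A usage sketch for av1_get_global_headers() (illustrative; consumer-side
// handling of the OBU payload is application specific):
//   aom_fixed_buf_t *hdr = av1_get_global_headers(ppi);
//   if (hdr != NULL) {
//     // hdr->buf holds hdr->sz bytes of Sequence Header OBU data.
//     ...
//     free(hdr->buf);  // free the payload first,
//     free(hdr);       // then the aom_fixed_buf_t itself.
//   }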
aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); #define MAX_GFUBOOST_FACTOR 10.0 #define MIN_GFUBOOST_FACTOR 4.0 static inline int is_frame_tpl_eligible(const GF_GROUP *const gf_group, uint8_t index) { const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index]; return update_type == ARF_UPDATE || update_type == GF_UPDATE || update_type == KF_UPDATE; } static inline int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, int selective_ref_frame, int prune_ref_frames, int gf_index) { return (selective_ref_frame > 0) && (prune_ref_frames > 0) && !is_frame_tpl_eligible(gf_group, gf_index); } // Get update type of the current frame. static inline FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, int gf_frame_index) { return gf_group->update_type[gf_frame_index]; } static inline int av1_pixels_to_mi(int pixels) { return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; } static inline int is_psnr_calc_enabled(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && cm->show_frame && !cpi->is_dropped_frame; } static inline int is_frame_resize_pending(const AV1_COMP *const cpi) { const ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; return (resize_pending_params->width && resize_pending_params->height && (cpi->common.width != resize_pending_params->width || cpi->common.height != resize_pending_params->height)); } // Check if loop filter is used. static inline int is_loopfilter_used(const AV1_COMMON *const cm) { return !cm->features.coded_lossless && !cm->tiles.large_scale; } // Check if CDEF is used. static inline int is_cdef_used(const AV1_COMMON *const cm) { return cm->seq_params->enable_cdef && !cm->features.coded_lossless && !cm->tiles.large_scale; } // Check if loop restoration filter is used. static inline int is_restoration_used(const AV1_COMMON *const cm) { return cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; } // Checks if post-processing filters need to be applied. // NOTE: This function decides if the application of different post-processing // filters on the reconstructed frame can be skipped at the encoder side. // However the computation of different filter parameters that are signaled in // the bitstream is still required. static inline unsigned int derive_skip_apply_postproc_filters( const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres, int use_restoration) { // Though CDEF parameter selection should be dependent on // deblocked/loop-filtered pixels for cdef_pick_method <= // CDEF_FAST_SEARCH_LVL5, CDEF strength values are calculated based on the // pixel values that are not loop-filtered in svc real-time encoding mode. // Hence this case is handled separately using the condition below. if (cpi->ppi->rtc_ref.non_reference_frame) return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF); if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr) return 0; assert(cpi->oxcf.mode == ALLINTRA); // The post-processing filters are applied one after the other in the // following order: deblocking->cdef->superres->restoration. In case of // ALLINTRA encoding, the reconstructed frame is not used as a reference // frame. Hence, the application of these filters can be skipped when // 1. filter parameters of the subsequent stages are not dependent on the // filtered output of the current stage or // 2. 
subsequent filtering stages are disabled if (use_restoration) return SKIP_APPLY_RESTORATION; if (use_superres) return SKIP_APPLY_SUPERRES; if (use_cdef) { // CDEF parameter selection is not dependent on the deblocked frame if // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking // filters and cdef filters can be skipped in this case. return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q && use_loopfilter) ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF) : SKIP_APPLY_CDEF; } if (use_loopfilter) return SKIP_APPLY_LOOPFILTER; // If we reach here, all post-processing stages are disabled, so none need to // be skipped. return 0; } static inline void set_postproc_filter_default_params(AV1_COMMON *cm) { struct loopfilter *const lf = &cm->lf; CdefInfo *const cdef_info = &cm->cdef_info; RestorationInfo *const rst_info = cm->rst_info; lf->filter_level[0] = 0; lf->filter_level[1] = 0; cdef_info->cdef_bits = 0; cdef_info->cdef_strengths[0] = 0; cdef_info->nb_cdef_strengths = 1; cdef_info->cdef_uv_strengths[0] = 0; rst_info[0].frame_restoration_type = RESTORE_NONE; rst_info[1].frame_restoration_type = RESTORE_NONE; rst_info[2].frame_restoration_type = RESTORE_NONE; } static inline int is_inter_tx_size_search_level_one( const TX_SPEED_FEATURES *tx_sf) { return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 && tx_sf->inter_tx_size_search_init_depth_sqr >= 1); } static inline int get_lpf_opt_level(const SPEED_FEATURES *sf) { int lpf_opt_level = 0; if (is_inter_tx_size_search_level_one(&sf->tx_sf)) lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1; return lpf_opt_level; } // Enable switchable motion mode only if warp and OBMC tools are allowed static inline bool is_switchable_motion_mode_allowed(bool allow_warped_motion, bool enable_obmc) { return (allow_warped_motion || enable_obmc); } #if CONFIG_AV1_TEMPORAL_DENOISING static inline int denoise_svc(const struct AV1_COMP *const cpi) { return (!cpi->ppi->use_svc || (cpi->ppi->use_svc && cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); } #endif #if CONFIG_COLLECT_PARTITION_STATS == 2 static inline void av1_print_fr_partition_timing_stats( const FramePartitionTimingStats *part_stats, const char *filename) { FILE *f = fopen(filename, "w"); if (!f) { return; } fprintf(f, "bsize,redo,"); for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "decision_%d,", part); } for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "attempt_%d,", part); } for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "time_%d,", part); } fprintf(f, "\n"); static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); } for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); } for (int part = 0; part < EXT_PARTITION_TYPES; part++) { fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); } fprintf(f, "\n"); } fclose(f); } #endif // CONFIG_COLLECT_PARTITION_STATS == 2 #if CONFIG_COLLECT_PARTITION_STATS static inline int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || bsize == BLOCK_4X4); switch (bsize) { case BLOCK_128X128: return 0; case 
BLOCK_64X64: return 1; case BLOCK_32X32: return 2; case BLOCK_16X16: return 3; case BLOCK_8X8: return 4; case BLOCK_4X4: return 5; default: assert(0 && "Invalid bsize for partition_stats."); return -1; } } #endif // CONFIG_COLLECT_PARTITION_STATS #if CONFIG_COLLECT_COMPONENT_TIMING static inline void start_timing(AV1_COMP *cpi, int component) { aom_usec_timer_start(&cpi->component_timer[component]); } static inline void end_timing(AV1_COMP *cpi, int component) { aom_usec_timer_mark(&cpi->component_timer[component]); cpi->frame_component_time[component] += aom_usec_timer_elapsed(&cpi->component_timer[component]); } static inline char const *get_frame_type_enum(int type) { switch (type) { case 0: return "KEY_FRAME"; case 1: return "INTER_FRAME"; case 2: return "INTRA_ONLY_FRAME"; case 3: return "S_FRAME"; default: assert(0); } return "error"; } #endif /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODER_H_ aom-3.12.1/av1/encoder/encoder_alloc.h000066400000000000000000000451741477627663500174640ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_ #define AOM_AV1_ENCODER_ENCODER_ALLOC_H_ #include "av1/encoder/block.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/global_motion_facade.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/pickcdef.h" #ifdef __cplusplus extern "C" { #endif static inline void dealloc_context_buffers_ext( MBMIExtFrameBufferInfo *mbmi_ext_info) { aom_free(mbmi_ext_info->frame_base); mbmi_ext_info->frame_base = NULL; mbmi_ext_info->alloc_size = 0; } static inline void alloc_context_buffers_ext( AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; const int mi_alloc_rows = (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d; const int mi_alloc_cols = (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d; const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols; if (new_ext_mi_size > mbmi_ext_info->alloc_size) { dealloc_context_buffers_ext(mbmi_ext_info); CHECK_MEM_ERROR( cm, mbmi_ext_info->frame_base, aom_malloc(new_ext_mi_size * sizeof(*mbmi_ext_info->frame_base))); mbmi_ext_info->alloc_size = new_ext_mi_size; } // The stride needs to be updated regardless of whether new allocation // happened or not. mbmi_ext_info->stride = mi_alloc_cols; } static inline void alloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *const mi_params = &cm->mi_params; // Setup mi_params mi_params->set_mb_mi(mi_params, cm->width, cm->height, cpi->sf.part_sf.default_min_partition_size); if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi); aom_free(cpi->td.mv_costs_alloc); cpi->td.mv_costs_alloc = NULL; // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding // mode. 
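// (A key_freq_max of 0 forces every frame to be a key frame, i.e. all-intra
// encoding, in which case no motion-vector rate costs are needed; the guard
// below therefore only allocates 'mv_costs_alloc' when key_freq_max != 0.)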
if (cpi->oxcf.kf_cfg.key_freq_max != 0) { CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc, (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc))); cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc; } av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf, cm->error); if (av1_setup_sms_tree(cpi, &cpi->td)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate SMS tree"); } cpi->td.firstpass_ctx = av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf); if (!cpi->td.firstpass_ctx) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } // Allocate mbmi buffers which are used to store mode information at block // level. static inline void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; if (av1_alloc_context_buffers(cm, cm->width, cm->height, cpi->sf.part_sf.default_min_partition_size)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } if (!is_stat_generation_stage(cpi)) alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); } static inline void realloc_segmentation_maps(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CommonModeInfoParams *const mi_params = &cm->mi_params; // Create the encoder segmentation map and set all entries to 0 aom_free(cpi->enc_seg.map); CHECK_MEM_ERROR(cm, cpi->enc_seg.map, aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh); CHECK_MEM_ERROR( cm, cpi->cyclic_refresh, av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols)); // Create a map used to mark inactive areas. aom_free(cpi->active_map.map); CHECK_MEM_ERROR(cm, cpi->active_map.map, aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); } static inline void alloc_obmc_buffers(OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) { AOM_CHECK_MEM_ERROR( error, obmc_buffer->wsrc, (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc))); AOM_CHECK_MEM_ERROR( error, obmc_buffer->mask, (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask))); AOM_CHECK_MEM_ERROR( error, obmc_buffer->above_pred, (uint8_t *)aom_memalign( 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred))); AOM_CHECK_MEM_ERROR( error, obmc_buffer->left_pred, (uint8_t *)aom_memalign( 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred))); } static inline void release_obmc_buffers(OBMCBuffer *obmc_buffer) { aom_free(obmc_buffer->mask); aom_free(obmc_buffer->above_pred); aom_free(obmc_buffer->left_pred); aom_free(obmc_buffer->wsrc); obmc_buffer->mask = NULL; obmc_buffer->above_pred = NULL; obmc_buffer->left_pred = NULL; obmc_buffer->wsrc = NULL; } static inline void alloc_compound_type_rd_buffers( struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) { AOM_CHECK_MEM_ERROR( error, bufs->pred0, (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); AOM_CHECK_MEM_ERROR( error, bufs->pred1, (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); AOM_CHECK_MEM_ERROR( error, bufs->residual1, (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); AOM_CHECK_MEM_ERROR( error, bufs->diff10, (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf, (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf))); } static inline void 
release_compound_type_rd_buffers( CompoundTypeRdBuffers *const bufs) { aom_free(bufs->pred0); aom_free(bufs->pred1); aom_free(bufs->residual1); aom_free(bufs->diff10); aom_free(bufs->tmp_best_mask_buf); av1_zero(*bufs); // Set all pointers to NULL for safety. } static inline void dealloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; TokenInfo *token_info = &cpi->token_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int num_planes = av1_num_planes(cm); dealloc_context_buffers_ext(&cpi->mbmi_ext_info); aom_free(cpi->tile_data); cpi->tile_data = NULL; cpi->allocated_tiles = 0; enc_row_mt->allocated_tile_cols = 0; enc_row_mt->allocated_tile_rows = 0; // Delete sementation map aom_free(cpi->enc_seg.map); cpi->enc_seg.map = NULL; av1_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; aom_free(cpi->active_map.map); cpi->active_map.map = NULL; aom_free(cpi->ssim_rdmult_scaling_factors); cpi->ssim_rdmult_scaling_factors = NULL; aom_free(cpi->tpl_rdmult_scaling_factors); cpi->tpl_rdmult_scaling_factors = NULL; #if CONFIG_TUNE_VMAF aom_free(cpi->vmaf_info.rdmult_scaling_factors); cpi->vmaf_info.rdmult_scaling_factors = NULL; aom_close_vmaf_model(cpi->vmaf_info.vmaf_model); #endif #if CONFIG_TUNE_BUTTERAUGLI aom_free(cpi->butteraugli_info.rdmult_scaling_factors); cpi->butteraugli_info.rdmult_scaling_factors = NULL; aom_free_frame_buffer(&cpi->butteraugli_info.source); aom_free_frame_buffer(&cpi->butteraugli_info.resized_source); #endif #if CONFIG_SALIENCY_MAP aom_free(cpi->saliency_map); aom_free(cpi->sm_scaling_factor); #endif release_obmc_buffers(&cpi->td.mb.obmc_buffer); aom_free(cpi->td.mv_costs_alloc); cpi->td.mv_costs_alloc = NULL; aom_free(cpi->td.dv_costs_alloc); cpi->td.dv_costs_alloc = NULL; aom_free(cpi->td.mb.sb_stats_cache); cpi->td.mb.sb_stats_cache = NULL; aom_free(cpi->td.mb.sb_fp_stats); cpi->td.mb.sb_fp_stats = NULL; #if CONFIG_PARTITION_SEARCH_ORDER aom_free(cpi->td.mb.rdcost); cpi->td.mb.rdcost = NULL; #endif av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0, cpi->sf.part_sf.partition_search_type); cpi->td.pc_root = NULL; for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]); cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL; } av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table); aom_free(cm->tpl_mvs); cm->tpl_mvs = NULL; aom_free(cpi->td.pixel_gradient_info); cpi->td.pixel_gradient_info = NULL; aom_free(cpi->td.src_var_info_of_4x4_sub_blocks); cpi->td.src_var_info_of_4x4_sub_blocks = NULL; aom_free(cpi->td.vt64x64); cpi->td.vt64x64 = NULL; av1_free_pmc(cpi->td.firstpass_ctx, num_planes); cpi->td.firstpass_ctx = NULL; const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth; // This call ensures that the buffers allocated by tf_alloc_and_reset_data() // in av1_temporal_filter() for single-threaded encode are freed in case an // error is encountered during temporal filtering (due to early termination // tf_dealloc_data() in av1_temporal_filter() would not be invoked). tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth); // This call ensures that tpl_tmp_buffers for single-threaded encode are freed // in case of an error during tpl. tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers); // This call ensures that the global motion (gm) data buffers for // single-threaded encode are freed in case of an error during gm. 
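// Note: dealloc_compressor_data() is also reached on error paths, so each of
// these *_dealloc_* helpers is assumed to tolerate buffers that were never
// allocated (freeing NULL pointers is expected to be a no-op here).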
gm_dealloc_data(&cpi->td.gm_data); // This call ensures that CDEF search context buffers are deallocated in case // of an error during cdef search. av1_cdef_dealloc_data(cpi->cdef_search_ctx); aom_free(cpi->cdef_search_ctx); cpi->cdef_search_ctx = NULL; av1_dealloc_mb_data(&cpi->td.mb, num_planes); av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); av1_free_txb_buf(cpi); av1_free_context_buffers(cm); aom_free_frame_buffer(&cpi->last_frame_uf); #if !CONFIG_REALTIME_ONLY av1_free_restoration_buffers(cm); av1_free_firstpass_data(&cpi->firstpass_data); #endif if (!is_stat_generation_stage(cpi)) { av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker, &cpi->mt_info.cdef_sync); } for (int plane = 0; plane < num_planes; plane++) { aom_free(cpi->pick_lr_ctxt.rusi[plane]); cpi->pick_lr_ctxt.rusi[plane] = NULL; } aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; aom_free_frame_buffer(&cpi->trial_frame_rst); aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); aom_free_frame_buffer(&cpi->orig_source); aom_free_frame_buffer(&cpi->svc.source_last_TL0); free_token_info(token_info); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); aom_free(cpi->td.mb.palette_buffer); release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer); aom_free(cpi->td.mb.tmp_conv_dst); for (int j = 0; j < 2; ++j) { aom_free(cpi->td.mb.tmp_pred_bufs[j]); } #if CONFIG_DENOISE && !CONFIG_REALTIME_ONLY if (cpi->denoise_and_model) { aom_denoise_and_model_free(cpi->denoise_and_model); cpi->denoise_and_model = NULL; } #endif #if !CONFIG_REALTIME_ONLY if (cpi->film_grain_table) { aom_film_grain_table_free(cpi->film_grain_table); aom_free(cpi->film_grain_table); cpi->film_grain_table = NULL; } #endif if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi); aom_free(cpi->svc.layer_context); cpi->svc.layer_context = NULL; aom_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; cpi->consec_zero_mv_alloc_size = 0; aom_free(cpi->src_sad_blk_64x64); cpi->src_sad_blk_64x64 = NULL; aom_free(cpi->mb_weber_stats); cpi->mb_weber_stats = NULL; if (cpi->oxcf.enable_rate_guide_deltaq) { aom_free(cpi->prep_rate_estimates); cpi->prep_rate_estimates = NULL; aom_free(cpi->ext_rate_distribution); cpi->ext_rate_distribution = NULL; } aom_free(cpi->mb_delta_q); cpi->mb_delta_q = NULL; } static inline void allocate_gradient_info_for_hog(AV1_COMP *cpi) { if (!is_gradient_caching_for_hog_enabled(cpi)) return; PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info; if (!pixel_gradient_info) { const AV1_COMMON *const cm = &cpi->common; const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; CHECK_MEM_ERROR( cm, pixel_gradient_info, aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE)); cpi->td.pixel_gradient_info = pixel_gradient_info; } cpi->td.mb.pixel_gradient_info = pixel_gradient_info; } static inline void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; Block4x4VarInfo *source_variance_info = cpi->td.src_var_info_of_4x4_sub_blocks; if (!source_variance_info) { const AV1_COMMON *const cm = &cpi->common; const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; CHECK_MEM_ERROR(cm, source_variance_info, aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb)); cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info; } cpi->td.mb.src_var_info_of_4x4_sub_blocks = 
source_variance_info; } static inline void variance_partition_alloc(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; if (cpi->td.vt64x64) { if (num_64x64_blocks != cpi->td.num_64x64_blocks) { aom_free(cpi->td.vt64x64); cpi->td.vt64x64 = NULL; } } if (!cpi->td.vt64x64) { CHECK_MEM_ERROR(cm, cpi->td.vt64x64, aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks)); cpi->td.num_64x64_blocks = num_64x64_blocks; } } static inline YV12_BUFFER_CONFIG *realloc_and_scale_source(AV1_COMP *cpi, int scaled_width, int scaled_height) { AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); if (scaled_width == cpi->unscaled_source->y_crop_width && scaled_height == cpi->unscaled_source->y_crop_height) { return cpi->unscaled_source; } if (aom_realloc_frame_buffer( &cpi->scaled_source, scaled_width, scaled_height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); assert(cpi->scaled_source.y_crop_width == scaled_width); assert(cpi->scaled_source.y_crop_height == scaled_height); if (!av1_resize_and_extend_frame_nonnormative( cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth, num_planes)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate buffers during resize"); return &cpi->scaled_source; } // Deallocate allocated thread_data. static inline void free_thread_data(AV1_PRIMARY *ppi) { PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; const int num_tf_workers = AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers); const int num_tpl_workers = AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers); const int is_highbitdepth = ppi->seq_params.use_highbitdepth; const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE; for (int t = 1; t < p_mt_info->num_workers; ++t) { EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t]; thread_data->td = thread_data->original_td; ThreadData *const td = thread_data->td; if (!td) continue; aom_free(td->tctx); aom_free(td->palette_buffer); aom_free(td->tmp_conv_dst); release_compound_type_rd_buffers(&td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { aom_free(td->tmp_pred_bufs[j]); } aom_free(td->pixel_gradient_info); aom_free(td->src_var_info_of_4x4_sub_blocks); release_obmc_buffers(&td->obmc_buffer); aom_free(td->vt64x64); for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { aom_free(td->hash_value_buffer[x][y]); td->hash_value_buffer[x][y] = NULL; } } aom_free(td->mv_costs_alloc); td->mv_costs_alloc = NULL; aom_free(td->dv_costs_alloc); td->dv_costs_alloc = NULL; aom_free(td->counts); av1_free_pmc(td->firstpass_ctx, num_planes); td->firstpass_ctx = NULL; av1_free_shared_coeff_buffer(&td->shared_coeff_buf); av1_free_sms_tree(td); // This call ensures that the buffers allocated by tf_alloc_and_reset_data() // in prepare_tf_workers() for MT encode are freed in case an error is // encountered during temporal filtering (due to early termination // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be // invoked). if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth); // This call ensures that tpl_tmp_buffers for MT encode are freed in case of // an error during tpl. 
if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); // This call ensures that the buffers in gm_data for MT encode are freed in // case of an error during gm. gm_dealloc_data(&td->gm_data); av1_dealloc_mb_data(&td->mb, num_planes); aom_free(td->mb.sb_stats_cache); td->mb.sb_stats_cache = NULL; aom_free(td->mb.sb_fp_stats); td->mb.sb_fp_stats = NULL; #if CONFIG_PARTITION_SEARCH_ORDER aom_free(td->mb.rdcost); td->mb.rdcost = NULL; #endif av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION); td->pc_root = NULL; av1_dealloc_mb_wiener_var_pred_buf(td); aom_free(td); thread_data->td = NULL; thread_data->original_td = NULL; } } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_ aom-3.12.1/av1/encoder/encoder_utils.c000066400000000000000000002026471477627663500175250ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aomcx.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/grain_test_vectors.h" #include "av1/encoder/mv_prec.h" #include "av1/encoder/rc_utils.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/superres_scale.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/var_based_part.h" #if CONFIG_TUNE_VMAF #include "av1/encoder/tune_vmaf.h" #endif #define MIN_BOOST_COMBINE_FACTOR 4.0 #define MAX_BOOST_COMBINE_FACTOR 12.0 const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = { { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 }, { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 }, { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 }, { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 }, { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 }, { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 }, { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 }, { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 }, { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 }, { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 
16, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 }, { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 }, { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 }, { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 }, { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 }, { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 }, { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 }, { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 }, { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 }, { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 }, { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 }, { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 }, { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 }, { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 }, { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 }, { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 }, { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 }, { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 }, { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 }, { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 
20, 83, 18, 78 }, { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 }, { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 }, { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 }, { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 }, { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 }, { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 }, { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 }, { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 }, { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 }, { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 }, { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 }, { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 }, { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 }, { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 }, { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 }, { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 }, { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 }, { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 }, { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 }, { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } }; const int 
default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28, 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30, 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 }, { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16, 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34, 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 } }; const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64, 64, 64, 64 }; // TODO(yunqing): the default probs can be trained later from better // performance. const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] [SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS] = { { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } }, { { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 }, { 512, 512, 512 } } }; static void configure_static_seg_features(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; struct segmentation *const seg = &cm->seg; double avg_q; #if CONFIG_FPMT_TEST avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) && (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)) ? 
cpi->ppi->p_rc.temp_avg_q : cpi->ppi->p_rc.avg_q; #else avg_q = cpi->ppi->p_rc.avg_q; #endif int high_q = (int)(avg_q > 48.0); int qi_delta; // Disable and clear down for KF if (cm->current_frame.frame_type == KEY_FRAME) { // Clear down the global segmentation map memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); seg->update_map = 0; seg->update_data = 0; // Disable segmentation av1_disable_segmentation(seg); // Clear down the segment features. av1_clearall_segfeatures(seg); } else if (cpi->refresh_frame.alt_ref_frame) { // If this is an alt ref frame // Clear down the global segmentation map memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); seg->update_map = 0; seg->update_data = 0; // Disable segmentation and individual segment features by default av1_disable_segmentation(seg); av1_clearall_segfeatures(seg); // If segmentation was enabled set those features needed for the // arf itself. if (seg->enabled) { seg->update_map = 1; seg->update_data = 1; qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875, cm->seq_params->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); } } else if (seg->enabled) { // All other frames if segmentation has been enabled // First normal frame in a valid gf or alt ref group if (rc->frames_since_golden == 0) { // Set up segment features for normal frames in an arf group // Disable segmentation and clear down features if alt ref // is not active for this group av1_disable_segmentation(seg); memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); seg->update_map = 0; seg->update_data = 0; av1_clearall_segfeatures(seg); } else if (rc->is_src_frame_alt_ref) { // Special case where we are coding over the top of a previous // alt ref frame. // Segment coding disabled for compred testing // Enable ref frame features for segment 0 as well av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); // All mbs should use ALTREF_FRAME av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); } // Enable data update seg->update_data = 1; } else { // All other frames. // No updates.. leave things as they are. seg->update_map = 0; seg->update_data = 0; } } } void av1_apply_active_map(AV1_COMP *cpi) { struct segmentation *const seg = &cpi->common.seg; unsigned char *const seg_map = cpi->enc_seg.map; const unsigned char *const active_map = cpi->active_map.map; assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); // Disable the active_maps on intra_only frames or if the // input map for the current frame has no inactive blocks. 
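// In that case 'enabled' is cleared but 'update' is still set, so the branch
// further below clears any segment features that a previously applied active
// map may have left enabled.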
if (frame_is_intra_only(&cpi->common) || cpi->rc.percent_blocks_inactive == 0) { cpi->active_map.enabled = 0; cpi->active_map.update = 1; } if (cpi->active_map.update) { if (cpi->active_map.enabled) { const int num_mis = cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; memcpy(seg_map, active_map, sizeof(active_map[0]) * num_mis); av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H, -MAX_LOOP_FILTER); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V, -MAX_LOOP_FILTER); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U, -MAX_LOOP_FILTER); av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, -MAX_LOOP_FILTER); } else { av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); if (seg->enabled) { seg->update_data = 1; seg->update_map = 1; } } cpi->active_map.update = 0; } } #if !CONFIG_REALTIME_ONLY static void process_tpl_stats_frame(AV1_COMP *cpi) { const GF_GROUP *const gf_group = &cpi->ppi->gf_group; AV1_COMMON *const cm = &cpi->common; assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size)); const int tpl_idx = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; if (tpl_frame->is_valid) { int tpl_stride = tpl_frame->stride; double intra_cost_base = 0; double mc_dep_cost_base = 0; double cbcmp_base = 1; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; const int row_step = step; const int col_step_sr = coded_to_superres_mi(step, cm->superres_scale_denominator); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) { for (int col = 0; col < mi_cols_sr; col += col_step_sr) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; double cbcmp = (double)(this_stats->srcrf_dist); int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); intra_cost_base += log(dist_scaled) * cbcmp; mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp; cbcmp_base += cbcmp; } } if (mc_dep_cost_base == 0) { tpl_frame->is_valid = 0; } else { cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base); if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { if (cpi->ppi->lap_enabled) { double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval); const int gfu_boost = get_gfu_boost_from_r0_lap( min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0, cpi->ppi->p_rc.num_stats_required_for_gfu_boost); // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, // gfu_boost); cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( min_boost_factor, MAX_BOOST_COMBINE_FACTOR, 
cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->ppi->p_rc.num_stats_used_for_gfu_boost); } else { // TPL may only look at a subset of frame in the gf group when the // speed feature 'reduce_num_frames' is on, which affects the r0 // calcuation. Thus, to compensate for TPL not using all frames a // factor to adjust r0 is used. const int gfu_boost = (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0); cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); } } } } } #endif // !CONFIG_REALTIME_ONLY void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, int *top_index) { AV1_COMMON *const cm = &cpi->common; // Setup variables that depend on the dimensions of the frame. av1_set_speed_features_framesize_dependent(cpi, cpi->speed); #if !CONFIG_REALTIME_ONLY GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cpi->oxcf.algo_cfg.enable_tpl_model && av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) { process_tpl_stats_frame(cpi); av1_tpl_rdmult_setup(cpi); } #endif // Decide q and q bounds. *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index, bottom_index, top_index); if (cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->rc.force_max_q) { *q = cpi->rc.worst_quality; cpi->rc.force_max_q = 0; } #if !CONFIG_REALTIME_ONLY if (cpi->oxcf.rc_cfg.mode == AOM_Q && cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid && !is_lossless_requested(&cpi->oxcf.rc_cfg)) { const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; const int tpl_q = av1_tpl_get_q_index( &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality, cm->seq_params->bit_depth); *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q); *top_index = *bottom_index = *q; if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE) cpi->ppi->p_rc.arf_q = *q; } if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) { if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { const double qratio_grad = cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3; const double qstep_ratio = 0.2 + (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad; *q = av1_get_q_index_from_qstep_ratio( cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth); *top_index = *bottom_index = *q; if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE || gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE) cpi->ppi->p_rc.arf_q = *q; } else if (gf_group->layer_depth[cpi->gf_frame_index] < gf_group->max_layer_depth) { int this_height = gf_group->layer_depth[cpi->gf_frame_index]; int arf_q = cpi->ppi->p_rc.arf_q; while (this_height > 1) { arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2; --this_height; } *top_index = *bottom_index = *q = arf_q; } } #endif // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. 
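// (Those two requirements correspond to the is_stat_consumption_stage_twopass()
// and sf.hl_sf.static_segmentation checks below.)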
if (is_stat_consumption_stage_twopass(cpi) && cpi->sf.hl_sf.static_segmentation) configure_static_seg_features(cpi); } #if !CONFIG_REALTIME_ONLY static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { pars->num_cr_points = 0; pars->cr_mult = 0; pars->cr_luma_mult = 0; memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr)); memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr)); pars->num_cb_points = 0; pars->cb_mult = 0; pars->cb_luma_mult = 0; pars->chroma_scaling_from_luma = 0; memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb)); memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); } void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf) { SequenceHeader *const seq_params = &ppi->seq_params; const TuneCfg *const tune_cfg = &oxcf->tune_cfg; if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename || tune_cfg->content == AOM_CONTENT_FILM) { seq_params->film_grain_params_present = 1; } else { #if CONFIG_DENOISE seq_params->film_grain_params_present = (oxcf->noise_level > 0); #else seq_params->film_grain_params_present = 0; #endif } } void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; const TuneCfg *const tune_cfg = &oxcf->tune_cfg; if (cpi->film_grain_table) { aom_film_grain_table_free(cpi->film_grain_table); aom_free(cpi->film_grain_table); cpi->film_grain_table = NULL; } if (tune_cfg->film_grain_test_vector) { if (cm->current_frame.frame_type == KEY_FRAME) { memcpy(&cm->film_grain_params, film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1, sizeof(cm->film_grain_params)); if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) { cm->film_grain_params.clip_to_restricted_range = 0; } } } else if (tune_cfg->film_grain_table_filename) { CHECK_MEM_ERROR(cm, cpi->film_grain_table, aom_calloc(1, sizeof(*cpi->film_grain_table))); aom_film_grain_table_read(cpi->film_grain_table, tune_cfg->film_grain_table_filename, cm->error); } else if (tune_cfg->content == AOM_CONTENT_FILM) { cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) cm->film_grain_params.clip_to_restricted_range = 0; } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } } #endif // !CONFIG_REALTIME_ONLY void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, const int phase, const int use_optimized_scaler) { AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { BufferPool *const pool = cm->buffer_pool; const YV12_BUFFER_CONFIG *const ref = get_ref_frame_yv12_buf(cm, ref_frame); if (ref == NULL) { cpi->scaled_ref_buf[ref_frame - 1] = NULL; continue; } // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the // motion search can be skipped for the references: last, golden, altref. // If so, we can skip scaling that reference. 
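// (The per-reference skip decisions are carried in svc.skip_mvsearch_last,
// svc.skip_mvsearch_gf and svc.skip_mvsearch_altref, which are consulted
// below.)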
if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref && cpi->ppi->rtc_ref.set_ref_frame_config) { if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue; if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue; if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref) continue; } // For RTC with superres on: golden reference only needs to be scaled // if it was refreshed in previous frame. if (is_one_pass_rt_params(cpi) && cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME && cpi->rc.frame_num_last_gf_refresh < (int)cm->current_frame.frame_number - 1) { continue; } if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { // Replace the reference buffer with a copy having a thicker border, // if the reference buffer is higher resolution than the current // frame, and the border is thin. if ((ref->y_crop_width > cm->width || ref->y_crop_height > cm->height) && ref->border < AOM_BORDER_IN_PIXELS) { RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); if (aom_yv12_realloc_with_new_border( &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, cpi->alloc_pyramid, num_planes) != 0) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } int force_scaling = 0; RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; if (new_fb == NULL) { const int new_fb_idx = get_free_fb(cm); if (new_fb_idx == INVALID_IDX) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } force_scaling = 1; new_fb = &pool->frame_bufs[new_fb_idx]; } if (force_scaling || new_fb->buf.y_crop_width != cm->width || new_fb->buf.y_crop_height != cm->height) { if (aom_realloc_frame_buffer( &new_fb->buf, cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. 
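// (get_free_fb() hands back a buffer with its reference count already
// incremented, so that reference is dropped before reporting the error to
// avoid leaking the frame buffer slot.)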
--new_fb->ref_count; } aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } bool has_optimized_scaler = av1_has_optimized_scaler( ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width, new_fb->buf.y_crop_height); if (num_planes > 1) { has_optimized_scaler = has_optimized_scaler && av1_has_optimized_scaler( ref->uv_crop_width, ref->uv_crop_height, new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height); } #if CONFIG_AV1_HIGHBITDEPTH if (use_optimized_scaler && has_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) { av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); } else if (!av1_resize_and_extend_frame_nonnormative( ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffer during resize"); } #else if (use_optimized_scaler && has_optimized_scaler) { av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); } else if (!av1_resize_and_extend_frame_nonnormative( ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffer during resize"); } #endif cpi->scaled_ref_buf[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } } else { RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); buf->buf.y_crop_width = ref->y_crop_width; buf->buf.y_crop_height = ref->y_crop_height; cpi->scaled_ref_buf[ref_frame - 1] = buf; ++buf->ref_count; } } else { if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; } } } BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, int height, int number_spatial_layers) { if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) { return BLOCK_64X64; } if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) { return BLOCK_128X128; } #if CONFIG_TFLITE if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64; #endif // Force 64x64 superblock size to increase resolution in perceptual // AQ and user rating based modes. if (oxcf->mode == ALLINTRA && (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI || oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) { return BLOCK_64X64; } // Variance Boost only supports 64x64 superblocks. if (oxcf->q_cfg.deltaq_mode == DELTA_Q_VARIANCE_BOOST) { return BLOCK_64X64; } assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); if (number_spatial_layers > 1 || oxcf->resize_cfg.resize_mode != RESIZE_NONE) { // Use the configured size (top resolution) for spatial layers or // on resize. return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720 ? BLOCK_128X128 : BLOCK_64X64; } else if (oxcf->mode == REALTIME) { if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { const TileConfig *const tile_cfg = &oxcf->tile_cfg; const int num_tiles = (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows); // For multi-thread encode: if the number of (128x128) superblocks // per tile is low use 64X64 superblock. if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 && oxcf->max_threads >= num_tiles && AOMMIN(width, height) >= 720 && (width * height) / (128 * 128 * num_tiles) < 40) return BLOCK_64X64; else return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64; } else { return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64; } } // TODO(any): Possibly could improve this with a heuristic. 
// When superres / resize is on, 'cm->width / height' can change between // calls, so we don't apply this heuristic there. // Things break if superblock size changes between the first pass and second // pass encoding, which is why this heuristic is not configured as a // speed-feature. if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE && oxcf->resize_cfg.resize_mode == RESIZE_NONE) { int is_480p_or_lesser = AOMMIN(width, height) <= 480; if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64; // For 1080p and lower resolutions, choose SB size adaptively based on // resolution and speed level for multi-thread encode. int is_1080p_or_lesser = AOMMIN(width, height) <= 1080; if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD && oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5) return BLOCK_64X64; // For allintra encode, since the maximum partition size is set to 32X32 for // speed>=6, superblock size is set to 64X64 instead of 128X128. This // improves the multithread performance due to reduction in top right delay // and thread sync wastage. Currently, this setting is selectively enabled // only for speed>=9 and resolutions less than 4k since cost update // frequency is set to INTERNAL_COST_UPD_OFF in these cases. const int is_4k_or_larger = AOMMIN(width, height) >= 2160; if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger) return BLOCK_64X64; } return BLOCK_128X128; } void av1_setup_frame(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; // Set up entropy context depending on frame type. The decoder mandates // the use of the default context, index 0, for keyframes and inter // frames where the error_resilient_mode or intra_only flag is set. For // other inter-frames the encoder currently uses only two contexts; // context 1 for ALTREF frames and context 0 for the others. 
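// Accordingly, the code below resets to the default context via
// av1_setup_past_independence() when no usable primary reference is
// available, and otherwise copies the frame context from the primary
// reference buffer.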
if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || cpi->ext_flags.use_primary_ref_none) { av1_setup_past_independence(cm); } if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) { if (!cpi->ppi->seq_params_locked) { set_sb_size(cm->seq_params, av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, cpi->ppi->number_spatial_layers)); } } else { const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); if (primary_ref_buf == NULL) { av1_setup_past_independence(cm); cm->seg.update_map = 1; cm->seg.update_data = 1; } else { *cm->fc = primary_ref_buf->frame_context; } } av1_zero(cm->cur_frame->interp_filter_selected); cm->prev_frame = get_primary_ref_frame_buf(cm); cpi->vaq_refresh = 0; } #if !CONFIG_REALTIME_ONLY static int get_interp_filter_selected(const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref, InterpFilter ifilter) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); if (buf == NULL) return 0; return buf->interp_filter_selected[ifilter]; } uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; int ref_total[REF_FRAMES] = { 0 }; uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK; if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame) return mask; for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; ++ifilter) { ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); } } int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; ++ifilter) { int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { int filter_score = get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; if (filter_score < ref_total_total) { DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter; reset_interp_filter_allowed_mask(&mask, filt_type); } } } return mask; } #define STRICT_PSNR_DIFF_THRESH 0.9 // Encode key frame with/without screen content tools to determine whether // screen content tools should be enabled for this key frame group or not. // The first encoding is without screen content tools. // The second encoding is with screen content tools. // We compare the psnr and frame size to make the decision. static void screen_content_tools_determination( AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision, const int allow_intrabc_orig_decision, const int use_screen_content_tools_orig_decision, const int is_screen_content_type_orig_decision, const int pass, int *projected_size_pass, PSNR_STATS *psnr) { AV1_COMMON *const cm = &cpi->common; FeatureFlags *const features = &cm->features; #if CONFIG_FPMT_TEST projected_size_pass[pass] = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) && (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)) ? 
cpi->ppi->p_rc.temp_projected_frame_size : cpi->rc.projected_frame_size; #else projected_size_pass[pass] = cpi->rc.projected_frame_size; #endif #if CONFIG_AV1_HIGHBITDEPTH const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass], bit_depth, in_bit_depth); #else aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]); #endif if (pass != 1) return; const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0]; // Calculate % of palette mode to be chosen in a frame from mode decision. const double palette_ratio = (double)cpi->palette_pixel_num / (double)(cm->height * cm->width); const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH); const int ratio_is_large = ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4)); const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large); if (is_sc_encoding_much_better) { // Use screen content tools, if we get coding gain. features->allow_screen_content_tools = 1; features->allow_intrabc = cpi->intrabc_used; cpi->use_screen_content_tools = 1; cpi->is_screen_content_type = 1; } else { // Use original screen content decision. features->allow_screen_content_tools = allow_screen_content_tools_orig_decision; features->allow_intrabc = allow_intrabc_orig_decision; cpi->use_screen_content_tools = use_screen_content_tools_orig_decision; cpi->is_screen_content_type = is_screen_content_type_orig_decision; } } // Set some encoding parameters to make the encoding process fast. // A fixed block partition size, and a large q is used. static void set_encoding_params_for_screen_content(AV1_COMP *cpi, const int pass) { AV1_COMMON *const cm = &cpi->common; if (pass == 0) { // In the first pass, encode without screen content tools. // Use a high q, and a fixed block size for fast encoding. cm->features.allow_screen_content_tools = 0; cm->features.allow_intrabc = 0; cpi->use_screen_content_tools = 0; cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; return; } assert(pass == 1); // In the second pass, encode with screen content tools. // Use a high q, and a fixed block size for fast encoding. cm->features.allow_screen_content_tools = 1; // TODO(chengchen): turn intrabc on could lead to data race issue. // cm->allow_intrabc = 1; cpi->use_screen_content_tools = 1; cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; } // Determines whether to use screen content tools for the key frame group. // This function modifies "cm->features.allow_screen_content_tools", // "cm->features.allow_intrabc" and "cpi->use_screen_content_tools". void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) { AV1_COMMON *const cm = &cpi->common; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const QuantizationCfg *const q_cfg = &oxcf->q_cfg; // Variables to help determine if we should allow screen content tools. 
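  // Trial-encode bookkeeping: index 0 of the arrays below holds the result of
  // the pass without screen content tools and index 1 the pass with them. The
  // frame's original decisions are saved so they can be restored if the trial
  // shows no clear benefit.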
int projected_size_pass[3] = { 0 }; PSNR_STATS psnr[3]; const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME; const int allow_screen_content_tools_orig_decision = cm->features.allow_screen_content_tools; const int allow_intrabc_orig_decision = cm->features.allow_intrabc; const int use_screen_content_tools_orig_decision = cpi->use_screen_content_tools; const int is_screen_content_type_orig_decision = cpi->is_screen_content_type; // Turn off the encoding trial for forward key frame and superres. if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled || cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME || use_screen_content_tools_orig_decision || !is_key_frame) { return; } // TODO(chengchen): multiple encoding for the lossless mode is time consuming. // Find a better way to determine whether screen content tools should be used // for lossless coding. // Use a high q and a fixed partition to do quick encoding. const int q_for_screen_content_quick_run = is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244); const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type; const BLOCK_SIZE fixed_partition_block_size_orig = cpi->sf.part_sf.fixed_partition_size; // Setup necessary params for encoding, including frame source, etc. cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); } av1_setup_frame(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); cm->seg.enabled = cm->prev_frame->seg.enabled; } else { av1_calculate_segdata(&cm->seg); } } else { memset(&cm->seg, 0, sizeof(cm->seg)); } segfeatures_copy(&cm->cur_frame->seg, &cm->seg); cm->cur_frame->seg.enabled = cm->seg.enabled; // The two encoding passes aim to help determine whether to use screen // content tools, with a high q and fixed partition. for (int pass = 0; pass < 2; ++pass) { set_encoding_params_for_screen_content(cpi, pass); av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_for_screen_content_quick_run, q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq, oxcf->mode == ALLINTRA, oxcf->tune_cfg.tuning); av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, 0); // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); // Screen content decision screen_content_tools_determination( cpi, allow_screen_content_tools_orig_decision, allow_intrabc_orig_decision, use_screen_content_tools_orig_decision, is_screen_content_type_orig_decision, pass, projected_size_pass, psnr); } // Set partition speed feature back. cpi->sf.part_sf.partition_search_type = partition_search_type_orig; cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig; // Free token related info if screen content coding tools are not enabled. 
if (!cm->features.allow_screen_content_tools) free_token_info(&cpi->token_info); } #endif  // CONFIG_REALTIME_ONLY static void fix_interp_filter(InterpFilter *const interp_filter, const FRAME_COUNTS *const counts) { if (*interp_filter == SWITCHABLE) { // Check to see if only one of the filters is actually used int count[SWITCHABLE_FILTERS] = { 0 }; int num_filters_used = 0; for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) count[i] += counts->switchable_interp[j][i]; num_filters_used += (count[i] > 0); } if (num_filters_used == 1) { // Only one filter is used. So set the filter at frame level for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { *interp_filter = i; break; } } } } } void av1_finalize_encoded_frame(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; if (!cm->seq_params->reduced_still_picture_hdr && encode_show_existing_frame(cm)) { RefCntBuffer *const frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; if (frame_to_show == NULL) { aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a reconstructed frame"); } assert(frame_to_show->ref_count > 0); assign_frame_buffer_p(&cm->cur_frame, frame_to_show); } if (!encode_show_existing_frame(cm) && cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { // Copy the current frame's film grain params to its corresponding // RefCntBuffer slot. cm->cur_frame->film_grain_params = cm->film_grain_params; // We must update the parameters if this is not an INTER_FRAME if (current_frame->frame_type != INTER_FRAME) cm->cur_frame->film_grain_params.update_parameters = 1; // Iterate the random seed for the next frame.
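    // The fixed offsets below simply advance the grain seed from frame to
    // frame; a result of 0 is replaced with a nonzero constant.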
cm->film_grain_params.random_seed += 3381; if (cm->film_grain_params.random_seed == 0) cm->film_grain_params.random_seed = 7391; } // Initialise all tiles' contexts from the global frame context for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { const int tile_idx = tile_row * cm->tiles.cols + tile_col; cpi->tile_data[tile_idx].tctx = *cm->fc; } } if (!frame_is_intra_only(cm)) fix_interp_filter(&cm->features.interp_filter, cpi->td.counts); } int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, const YV12_BUFFER_CONFIG *last_picture, ForceIntegerMVInfo *const force_intpel_info) { // check use hash ME int k; const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE; const double threshold_current = 0.8; const double threshold_average = 0.95; const int max_history_size = 32; int T = 0; // total block int C = 0; // match with collocated block int S = 0; // smooth region but not match with collocated block const int pic_width = cur_picture->y_width; const int pic_height = cur_picture->y_height; for (int i = 0; i + block_size <= pic_height; i += block_size) { for (int j = 0; j + block_size <= pic_width; j += block_size) { const int x_pos = j; const int y_pos = i; int match = 1; T++; // check whether collocated block match with current uint8_t *p_cur = cur_picture->y_buffer; uint8_t *p_ref = last_picture->y_buffer; int stride_cur = cur_picture->y_stride; int stride_ref = last_picture->y_stride; p_cur += (y_pos * stride_cur + x_pos); p_ref += (y_pos * stride_ref + x_pos); if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); for (int tmpY = 0; tmpY < block_size && match; tmpY++) { for (int tmpX = 0; tmpX < block_size && match; tmpX++) { if (p16_cur[tmpX] != p16_ref[tmpX]) { match = 0; } } p16_cur += stride_cur; p16_ref += stride_ref; } } else { for (int tmpY = 0; tmpY < block_size && match; tmpY++) { for (int tmpX = 0; tmpX < block_size && match; tmpX++) { if (p_cur[tmpX] != p_ref[tmpX]) { match = 0; } } p_cur += stride_cur; p_ref += stride_ref; } } if (match) { C++; continue; } if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, y_pos) || av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { S++; continue; } } } assert(T > 0); double cs_rate = ((double)(C + S)) / ((double)(T)); force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate; force_intpel_info->rate_index = (force_intpel_info->rate_index + 1) % max_history_size; force_intpel_info->rate_size++; force_intpel_info->rate_size = AOMMIN(force_intpel_info->rate_size, max_history_size); if (cs_rate < threshold_current) { return 0; } if (C == T) { return 1; } double cs_average = 0.0; for (k = 0; k < force_intpel_info->rate_size; k++) { cs_average += force_intpel_info->cs_rate_array[k]; } cs_average /= force_intpel_info->rate_size; if (cs_average < threshold_average) { return 0; } if ((T - C - S) < 0) { return 1; } if (cs_average > 1.01) { return 1; } return 0; } void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; uint8_t *y_buffer = cpi->source->y_buffer; const int y_stride = cpi->source->y_stride; const int block_size = BLOCK_16X16; const int num_mi_w = mi_size_wide[block_size]; const int num_mi_h = mi_size_high[block_size]; const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; const int 
num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; double log_sum = 0.0; // Loop through each 16x16 block. for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { double var = 0.0, num_of_var = 0.0; const int index = row * num_cols + col; // Loop through each 8x8 block. for (int mi_row = row * num_mi_h; mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; mi_row += 2) { for (int mi_col = col * num_mi_w; mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; mi_col += 2) { struct buf_2d buf; const int row_offset_y = mi_row << 2; const int col_offset_y = mi_col << 2; buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; buf.stride = y_stride; var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y); num_of_var += 1.0; } } var = var / num_of_var; // Curve fitting with an exponential model on all 16x16 blocks from the // midres dataset. var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; // As per the above computation, var will be in the range of // [17.492222, 84.527656], assuming the data type is of infinite // precision. The following assert conservatively checks if var is in the // range of [17.0, 85.0] to avoid any issues due to the precision of the // relevant data type. assert(var > 17.0 && var < 85.0); cpi->ssim_rdmult_scaling_factors[index] = var; log_sum += log(var); } } // As log_sum holds the geometric mean, it will be in the range // [17.492222, 84.527656]. Hence, in the below loop, the value of // cpi->ssim_rdmult_scaling_factors[index] would be in the range // [0.2069, 4.8323]. log_sum = exp(log_sum / (double)(num_rows * num_cols)); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; cpi->ssim_rdmult_scaling_factors[index] /= log_sum; } } } // Coding context that only needs to be saved when recode loop includes // filtering (deblocking, CDEF, superres post-encode upscale and/or loop // restoraton). static void save_extra_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; cc->lf = cm->lf; cc->cdef_info = cm->cdef_info; cc->rc = cpi->rc; cc->mv_stats = cpi->ppi->mv_stats; } void av1_save_all_coding_context(AV1_COMP *cpi) { save_extra_coding_context(cpi); if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); } #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. void av1_dump_filtered_recon_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; if (recon_buf == NULL) { printf("Frame %d is not ready.\n", current_frame->frame_number); return; } static const int flag_list[REF_FRAMES] = { 0, AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG, AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG }; printf( "\n***Frame=%d (frame_offset=%d, show_frame=%d, " "show_existing_frame=%d) " "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", current_frame->frame_number, current_frame->order_hint, cm->show_frame, cm->show_existing_frame); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; printf(" %d(%c)", ref_offset, (cpi->ref_frame_flags & flag_list[ref_frame]) ? 
'Y' : 'N'); } printf(" ]\n"); if (!cm->show_frame) { printf("Frame %d is a no show frame, so no image dump.\n", current_frame->frame_number); return; } int h; char file_name[256] = "/tmp/enc_filtered_recon.yuv"; FILE *f_recon = NULL; if (current_frame->frame_number == 0) { if ((f_recon = fopen(file_name, "wb")) == NULL) { printf("Unable to open file %s to write.\n", file_name); return; } } else { if ((f_recon = fopen(file_name, "ab")) == NULL) { printf("Unable to open file %s to append.\n", file_name); return; } } printf( "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " "refresh_alt_ref_frame=%d, " "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", current_frame->frame_number, cpi->gf_frame_index, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], current_frame->order_hint, cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 int ref_frame; printf("get_ref_frame_map_idx: ["); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); printf(" ]\n"); #endif // 0 // --- Y --- for (h = 0; h < cm->height; ++h) { fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, f_recon); } // --- U --- for (h = 0; h < (cm->height >> 1); ++h) { fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), f_recon); } // --- V --- for (h = 0; h < (cm->height >> 1); ++h) { fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), f_recon); } fclose(f_recon); } #endif // DUMP_RECON_FRAMES aom-3.12.1/av1/encoder/encoder_utils.h000066400000000000000000001334221477627663500175240ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_ #define AOM_AV1_ENCODER_ENCODER_UTILS_H_ #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #ifdef __cplusplus extern "C" { #endif #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 #define DUMP_RECON_FRAMES 0 extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL] [TX_TYPES]; extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; extern const int default_warped_probs[FRAME_UPDATE_TYPES]; extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] [SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; // Mark all inactive blocks as active. Other segmentation features may be set // so memset cannot be used, instead only inactive blocks should be reset. 
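// The reset below is only attempted when an active map is actually in use
// (enabled, or with a pending update).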
static inline void suppress_active_map(AV1_COMP *cpi) { unsigned char *const seg_map = cpi->enc_seg.map; int i; const int num_mis = cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; if (cpi->active_map.enabled || cpi->active_map.update) for (i = 0; i < num_mis; ++i) if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) seg_map[i] = AM_SEGMENT_ID_ACTIVE; } // Returns 'size' in the number of Mode Info (MI) units. 'size' is either the // width or height. static inline int size_in_mi(int size) { // Ensure that the decoded width and height are both multiples of // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if // subsampling is used). // This simplifies the implementation of various experiments, // eg. cdef, which operates on units of 8x8 luma pixels. const int aligned_size = ALIGN_POWER_OF_TWO(size, 3); return aligned_size >> MI_SIZE_LOG2; } static inline void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) { mi_params->mi_cols = size_in_mi(width); mi_params->mi_rows = size_in_mi(height); mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2); mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2); mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; mi_params->mi_alloc_stride = (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d; assert(mi_size_wide[mi_params->mi_alloc_bsize] == mi_size_high[mi_params->mi_alloc_bsize]); } static inline void enc_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; } static inline void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, int height, BLOCK_SIZE min_partition_size) { mi_params->mi_alloc_bsize = min_partition_size; set_mb_mi(mi_params, width, height); } static inline void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width, int height, BLOCK_SIZE min_partition_size) { (void)min_partition_size; mi_params->mi_alloc_bsize = BLOCK_16X16; set_mb_mi(mi_params, width, height); } static inline void enc_setup_mi(CommonModeInfoParams *mi_params) { const int mi_grid_size = mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); memset(mi_params->mi_alloc, 0, mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc)); memset(mi_params->mi_grid_base, 0, mi_grid_size * sizeof(*mi_params->mi_grid_base)); memset(mi_params->tx_type_map, 0, mi_grid_size * sizeof(*mi_params->tx_type_map)); } static inline void init_buffer_indices( ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) { int fb_idx; for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) remapped_ref_idx[fb_idx] = fb_idx; force_intpel_info->rate_index = 0; force_intpel_info->rate_size = 0; } #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF) \ ppi->fn_ptr[BT].sdf = SDF; \ ppi->fn_ptr[BT].sdaf = SDAF; \ ppi->fn_ptr[BT].vf = VF; \ ppi->fn_ptr[BT].svf = SVF; \ ppi->fn_ptr[BT].svaf = SVAF; \ ppi->fn_ptr[BT].sdx4df = SDX4DF; \ ppi->fn_ptr[BT].sdx3df = SDX3DF; #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_BFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \ aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \ aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \ 
aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \ aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \ aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD) #define HIGHBD_BFP_WRAPPER_NO_SAD_AVG(WIDTH, HEIGHT, BD) \ HIGHBD_BFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, /*SDAF=*/NULL, \ aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \ aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \ aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \ aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD) #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ int source_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \ } \ static unsigned int fnname##_bits10( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ } \ static unsigned int fnname##_bits12( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ } #define MAKE_BFP_SADAVG_WRAPPER(fnname) \ static unsigned int fnname##_bits8( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ } \ static unsigned int fnname##_bits10( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ 2; \ } \ static unsigned int fnname##_bits12( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ 4; \ } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ } \ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ int i; \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ } \ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ int i; \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ } #if CONFIG_AV1_HIGHBITDEPTH MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) 
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d) #if !CONFIG_REALTIME_ONLY MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d) MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d) #endif #endif // CONFIG_AV1_HIGHBITDEPTH #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ ppi->fn_ptr[BT].msdf = MCSDF; \ ppi->fn_ptr[BT].msvf = MCSVF; #define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \ aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT) #define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ 
static unsigned int fnname##_bits8( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ int m_stride, int invert_mask) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ second_pred_ptr, m, m_stride, invert_mask); \ } \ static unsigned int fnname##_bits10( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ int m_stride, int invert_mask) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ second_pred_ptr, m, m_stride, invert_mask) >> \ 2; \ } \ static unsigned int fnname##_bits12( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ int m_stride, int invert_mask) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ second_pred_ptr, m, m_stride, invert_mask) >> \ 4; \ } #if CONFIG_AV1_HIGHBITDEPTH MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) #if !CONFIG_REALTIME_ONLY MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) #endif #endif #define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \ ppi->fn_ptr[BT].sdsf = SDSF; \ ppi->fn_ptr[BT].sdsx4df = SDSX4DF; #define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD) #define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ return fnname(src, src_stride, ref, ref_stride); \ } \ static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ return fnname(src, src_stride, ref, ref_stride) >> 2; \ } \ static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride) { \ return fnname(src, src_stride, ref, ref_stride) >> 4; \ } #define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ } \ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ const 
uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ int i; \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ } \ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ unsigned int *sad_array) { \ int i; \ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ } #if CONFIG_AV1_HIGHBITDEPTH MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16) #if !CONFIG_REALTIME_ONLY MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16) MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32) #endif MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d) #if !CONFIG_REALTIME_ONLY MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d) MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d) #endif #endif #if !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH #define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \ aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \ aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ ppi->fn_ptr[BT].osdf = OSDF; \ ppi->fn_ptr[BT].ovf = OVF; \ ppi->fn_ptr[BT].osvf = OSVF; #define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \ aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \ aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) #define MAKE_OBFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ const int32_t *wsrc, \ const int32_t *msk) { \ return fnname(ref, ref_stride, wsrc, msk); \ } \ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ const int32_t *wsrc, \ const int32_t *msk) { \ return fnname(ref, ref_stride, wsrc, msk) >> 2; \ } \ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ const int32_t *wsrc, \ const int32_t *msk) { \ return fnname(ref, ref_stride, wsrc, msk) >> 4; \ } #endif // 
CONFIG_AV1_HIGHBITDEPTH #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH #if !CONFIG_REALTIME_ONLY MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) #endif static inline void highbd_set_var_fns(AV1_PRIMARY *const ppi) { SequenceHeader *const seq_params = &ppi->seq_params; if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { case AOM_BITS_8: #if !CONFIG_REALTIME_ONLY HIGHBD_BFP_WRAPPER(64, 16, 8) HIGHBD_BFP_WRAPPER(16, 64, 8) HIGHBD_BFP_WRAPPER(32, 8, 8) HIGHBD_BFP_WRAPPER(8, 32, 8) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(16, 4, 8) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 16, 8) #endif HIGHBD_BFP_WRAPPER(32, 16, 8) HIGHBD_BFP_WRAPPER(16, 32, 8) HIGHBD_BFP_WRAPPER(64, 32, 8) HIGHBD_BFP_WRAPPER(32, 64, 8) HIGHBD_BFP_WRAPPER(32, 32, 8) HIGHBD_BFP_WRAPPER(64, 64, 8) HIGHBD_BFP_WRAPPER(16, 16, 8) HIGHBD_BFP_WRAPPER(16, 8, 8) HIGHBD_BFP_WRAPPER(8, 16, 8) HIGHBD_BFP_WRAPPER(8, 8, 8) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(8, 4, 8) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 8, 8) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 4, 8) HIGHBD_BFP_WRAPPER(128, 128, 8) HIGHBD_BFP_WRAPPER(128, 64, 8) HIGHBD_BFP_WRAPPER(64, 128, 8) HIGHBD_MBFP_WRAPPER(128, 128, 8) HIGHBD_MBFP_WRAPPER(128, 64, 8) HIGHBD_MBFP_WRAPPER(64, 128, 8) HIGHBD_MBFP_WRAPPER(64, 64, 8) HIGHBD_MBFP_WRAPPER(64, 32, 8) HIGHBD_MBFP_WRAPPER(32, 64, 8) HIGHBD_MBFP_WRAPPER(32, 32, 8) HIGHBD_MBFP_WRAPPER(32, 16, 8) HIGHBD_MBFP_WRAPPER(16, 32, 8) HIGHBD_MBFP_WRAPPER(16, 16, 8) HIGHBD_MBFP_WRAPPER(8, 16, 8) HIGHBD_MBFP_WRAPPER(16, 8, 8) HIGHBD_MBFP_WRAPPER(8, 8, 8) HIGHBD_MBFP_WRAPPER(4, 8, 8) HIGHBD_MBFP_WRAPPER(8, 4, 8) HIGHBD_MBFP_WRAPPER(4, 4, 8) #if !CONFIG_REALTIME_ONLY HIGHBD_MBFP_WRAPPER(64, 16, 8) HIGHBD_MBFP_WRAPPER(16, 64, 8) HIGHBD_MBFP_WRAPPER(32, 8, 8) HIGHBD_MBFP_WRAPPER(8, 32, 8) HIGHBD_MBFP_WRAPPER(16, 4, 8) HIGHBD_MBFP_WRAPPER(4, 16, 8) #endif // OBMC excluded from realtime only build. 
#if !CONFIG_REALTIME_ONLY HIGHBD_OBFP_WRAPPER_8(128, 128) HIGHBD_OBFP_WRAPPER_8(128, 64) HIGHBD_OBFP_WRAPPER_8(64, 128) HIGHBD_OBFP_WRAPPER_8(64, 64) HIGHBD_OBFP_WRAPPER_8(64, 32) HIGHBD_OBFP_WRAPPER_8(32, 64) HIGHBD_OBFP_WRAPPER_8(32, 32) HIGHBD_OBFP_WRAPPER_8(32, 16) HIGHBD_OBFP_WRAPPER_8(16, 32) HIGHBD_OBFP_WRAPPER_8(16, 16) HIGHBD_OBFP_WRAPPER_8(8, 16) HIGHBD_OBFP_WRAPPER_8(16, 8) HIGHBD_OBFP_WRAPPER_8(8, 8) HIGHBD_OBFP_WRAPPER_8(4, 8) HIGHBD_OBFP_WRAPPER_8(8, 4) HIGHBD_OBFP_WRAPPER_8(4, 4) HIGHBD_OBFP_WRAPPER_8(64, 16) HIGHBD_OBFP_WRAPPER_8(16, 64) HIGHBD_OBFP_WRAPPER_8(32, 8) HIGHBD_OBFP_WRAPPER_8(8, 32) HIGHBD_OBFP_WRAPPER_8(16, 4) HIGHBD_OBFP_WRAPPER_8(4, 16) #endif HIGHBD_SDSFP_WRAPPER(128, 128, 8) HIGHBD_SDSFP_WRAPPER(128, 64, 8) HIGHBD_SDSFP_WRAPPER(64, 128, 8) HIGHBD_SDSFP_WRAPPER(64, 64, 8) HIGHBD_SDSFP_WRAPPER(64, 32, 8) HIGHBD_SDSFP_WRAPPER(32, 64, 8) HIGHBD_SDSFP_WRAPPER(32, 32, 8) HIGHBD_SDSFP_WRAPPER(32, 16, 8) HIGHBD_SDSFP_WRAPPER(16, 32, 8) HIGHBD_SDSFP_WRAPPER(16, 16, 8) HIGHBD_SDSFP_WRAPPER(8, 16, 8) #if !CONFIG_REALTIME_ONLY HIGHBD_SDSFP_WRAPPER(64, 16, 8) HIGHBD_SDSFP_WRAPPER(16, 64, 8) HIGHBD_SDSFP_WRAPPER(8, 32, 8) HIGHBD_SDSFP_WRAPPER(4, 16, 8) #endif break; case AOM_BITS_10: #if !CONFIG_REALTIME_ONLY HIGHBD_BFP_WRAPPER(64, 16, 10) HIGHBD_BFP_WRAPPER(16, 64, 10) HIGHBD_BFP_WRAPPER(32, 8, 10) HIGHBD_BFP_WRAPPER(8, 32, 10) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(16, 4, 10) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 16, 10) #endif HIGHBD_BFP_WRAPPER(32, 16, 10) HIGHBD_BFP_WRAPPER(16, 32, 10) HIGHBD_BFP_WRAPPER(64, 32, 10) HIGHBD_BFP_WRAPPER(32, 64, 10) HIGHBD_BFP_WRAPPER(32, 32, 10) HIGHBD_BFP_WRAPPER(64, 64, 10) HIGHBD_BFP_WRAPPER(16, 16, 10) HIGHBD_BFP_WRAPPER(16, 8, 10) HIGHBD_BFP_WRAPPER(8, 16, 10) HIGHBD_BFP_WRAPPER(8, 8, 10) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(8, 4, 10) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 8, 10) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 4, 10) HIGHBD_BFP_WRAPPER(128, 128, 10) HIGHBD_BFP_WRAPPER(128, 64, 10) HIGHBD_BFP_WRAPPER(64, 128, 10) HIGHBD_MBFP_WRAPPER(128, 128, 10) HIGHBD_MBFP_WRAPPER(128, 64, 10) HIGHBD_MBFP_WRAPPER(64, 128, 10) HIGHBD_MBFP_WRAPPER(64, 64, 10) HIGHBD_MBFP_WRAPPER(64, 32, 10) HIGHBD_MBFP_WRAPPER(32, 64, 10) HIGHBD_MBFP_WRAPPER(32, 32, 10) HIGHBD_MBFP_WRAPPER(32, 16, 10) HIGHBD_MBFP_WRAPPER(16, 32, 10) HIGHBD_MBFP_WRAPPER(16, 16, 10) HIGHBD_MBFP_WRAPPER(8, 16, 10) HIGHBD_MBFP_WRAPPER(16, 8, 10) HIGHBD_MBFP_WRAPPER(8, 8, 10) HIGHBD_MBFP_WRAPPER(4, 8, 10) HIGHBD_MBFP_WRAPPER(8, 4, 10) HIGHBD_MBFP_WRAPPER(4, 4, 10) #if !CONFIG_REALTIME_ONLY HIGHBD_MBFP_WRAPPER(64, 16, 10) HIGHBD_MBFP_WRAPPER(16, 64, 10) HIGHBD_MBFP_WRAPPER(32, 8, 10) HIGHBD_MBFP_WRAPPER(8, 32, 10) HIGHBD_MBFP_WRAPPER(16, 4, 10) HIGHBD_MBFP_WRAPPER(4, 16, 10) #endif // OBMC excluded from realtime only build. 
#if !CONFIG_REALTIME_ONLY HIGHBD_OBFP_WRAPPER(128, 128, 10) HIGHBD_OBFP_WRAPPER(128, 64, 10) HIGHBD_OBFP_WRAPPER(64, 128, 10) HIGHBD_OBFP_WRAPPER(64, 64, 10) HIGHBD_OBFP_WRAPPER(64, 32, 10) HIGHBD_OBFP_WRAPPER(32, 64, 10) HIGHBD_OBFP_WRAPPER(32, 32, 10) HIGHBD_OBFP_WRAPPER(32, 16, 10) HIGHBD_OBFP_WRAPPER(16, 32, 10) HIGHBD_OBFP_WRAPPER(16, 16, 10) HIGHBD_OBFP_WRAPPER(8, 16, 10) HIGHBD_OBFP_WRAPPER(16, 8, 10) HIGHBD_OBFP_WRAPPER(8, 8, 10) HIGHBD_OBFP_WRAPPER(4, 8, 10) HIGHBD_OBFP_WRAPPER(8, 4, 10) HIGHBD_OBFP_WRAPPER(4, 4, 10) HIGHBD_OBFP_WRAPPER(64, 16, 10) HIGHBD_OBFP_WRAPPER(16, 64, 10) HIGHBD_OBFP_WRAPPER(32, 8, 10) HIGHBD_OBFP_WRAPPER(8, 32, 10) HIGHBD_OBFP_WRAPPER(16, 4, 10) HIGHBD_OBFP_WRAPPER(4, 16, 10) #endif HIGHBD_SDSFP_WRAPPER(128, 128, 10) HIGHBD_SDSFP_WRAPPER(128, 64, 10) HIGHBD_SDSFP_WRAPPER(64, 128, 10) HIGHBD_SDSFP_WRAPPER(64, 64, 10) HIGHBD_SDSFP_WRAPPER(64, 32, 10) HIGHBD_SDSFP_WRAPPER(32, 64, 10) HIGHBD_SDSFP_WRAPPER(32, 32, 10) HIGHBD_SDSFP_WRAPPER(32, 16, 10) HIGHBD_SDSFP_WRAPPER(16, 32, 10) HIGHBD_SDSFP_WRAPPER(16, 16, 10) HIGHBD_SDSFP_WRAPPER(8, 16, 10) #if !CONFIG_REALTIME_ONLY HIGHBD_SDSFP_WRAPPER(64, 16, 10) HIGHBD_SDSFP_WRAPPER(16, 64, 10) HIGHBD_SDSFP_WRAPPER(8, 32, 10) HIGHBD_SDSFP_WRAPPER(4, 16, 10) #endif break; case AOM_BITS_12: #if !CONFIG_REALTIME_ONLY HIGHBD_BFP_WRAPPER(64, 16, 12) HIGHBD_BFP_WRAPPER(16, 64, 12) HIGHBD_BFP_WRAPPER(32, 8, 12) HIGHBD_BFP_WRAPPER(8, 32, 12) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(16, 4, 12) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 16, 12) #endif HIGHBD_BFP_WRAPPER(32, 16, 12) HIGHBD_BFP_WRAPPER(16, 32, 12) HIGHBD_BFP_WRAPPER(64, 32, 12) HIGHBD_BFP_WRAPPER(32, 64, 12) HIGHBD_BFP_WRAPPER(32, 32, 12) HIGHBD_BFP_WRAPPER(64, 64, 12) HIGHBD_BFP_WRAPPER(16, 16, 12) HIGHBD_BFP_WRAPPER(16, 8, 12) HIGHBD_BFP_WRAPPER(8, 16, 12) HIGHBD_BFP_WRAPPER(8, 8, 12) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(8, 4, 12) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 8, 12) HIGHBD_BFP_WRAPPER_NO_SAD_AVG(4, 4, 12) HIGHBD_BFP_WRAPPER(128, 128, 12) HIGHBD_BFP_WRAPPER(128, 64, 12) HIGHBD_BFP_WRAPPER(64, 128, 12) HIGHBD_MBFP_WRAPPER(128, 128, 12) HIGHBD_MBFP_WRAPPER(128, 64, 12) HIGHBD_MBFP_WRAPPER(64, 128, 12) HIGHBD_MBFP_WRAPPER(64, 64, 12) HIGHBD_MBFP_WRAPPER(64, 32, 12) HIGHBD_MBFP_WRAPPER(32, 64, 12) HIGHBD_MBFP_WRAPPER(32, 32, 12) HIGHBD_MBFP_WRAPPER(32, 16, 12) HIGHBD_MBFP_WRAPPER(16, 32, 12) HIGHBD_MBFP_WRAPPER(16, 16, 12) HIGHBD_MBFP_WRAPPER(8, 16, 12) HIGHBD_MBFP_WRAPPER(16, 8, 12) HIGHBD_MBFP_WRAPPER(8, 8, 12) HIGHBD_MBFP_WRAPPER(4, 8, 12) HIGHBD_MBFP_WRAPPER(8, 4, 12) HIGHBD_MBFP_WRAPPER(4, 4, 12) #if !CONFIG_REALTIME_ONLY HIGHBD_MBFP_WRAPPER(64, 16, 12) HIGHBD_MBFP_WRAPPER(16, 64, 12) HIGHBD_MBFP_WRAPPER(32, 8, 12) HIGHBD_MBFP_WRAPPER(8, 32, 12) HIGHBD_MBFP_WRAPPER(16, 4, 12) HIGHBD_MBFP_WRAPPER(4, 16, 12) #endif // OBMC excluded from realtime only build. 
#if !CONFIG_REALTIME_ONLY HIGHBD_OBFP_WRAPPER(128, 128, 12) HIGHBD_OBFP_WRAPPER(128, 64, 12) HIGHBD_OBFP_WRAPPER(64, 128, 12) HIGHBD_OBFP_WRAPPER(64, 64, 12) HIGHBD_OBFP_WRAPPER(64, 32, 12) HIGHBD_OBFP_WRAPPER(32, 64, 12) HIGHBD_OBFP_WRAPPER(32, 32, 12) HIGHBD_OBFP_WRAPPER(32, 16, 12) HIGHBD_OBFP_WRAPPER(16, 32, 12) HIGHBD_OBFP_WRAPPER(16, 16, 12) HIGHBD_OBFP_WRAPPER(8, 16, 12) HIGHBD_OBFP_WRAPPER(16, 8, 12) HIGHBD_OBFP_WRAPPER(8, 8, 12) HIGHBD_OBFP_WRAPPER(4, 8, 12) HIGHBD_OBFP_WRAPPER(8, 4, 12) HIGHBD_OBFP_WRAPPER(4, 4, 12) HIGHBD_OBFP_WRAPPER(64, 16, 12) HIGHBD_OBFP_WRAPPER(16, 64, 12) HIGHBD_OBFP_WRAPPER(32, 8, 12) HIGHBD_OBFP_WRAPPER(8, 32, 12) HIGHBD_OBFP_WRAPPER(16, 4, 12) HIGHBD_OBFP_WRAPPER(4, 16, 12) #endif HIGHBD_SDSFP_WRAPPER(128, 128, 12) HIGHBD_SDSFP_WRAPPER(128, 64, 12) HIGHBD_SDSFP_WRAPPER(64, 128, 12) HIGHBD_SDSFP_WRAPPER(64, 64, 12) HIGHBD_SDSFP_WRAPPER(64, 32, 12) HIGHBD_SDSFP_WRAPPER(32, 64, 12) HIGHBD_SDSFP_WRAPPER(32, 32, 12) HIGHBD_SDSFP_WRAPPER(32, 16, 12) HIGHBD_SDSFP_WRAPPER(16, 32, 12) HIGHBD_SDSFP_WRAPPER(16, 16, 12) HIGHBD_SDSFP_WRAPPER(8, 16, 12) #if !CONFIG_REALTIME_ONLY HIGHBD_SDSFP_WRAPPER(64, 16, 12) HIGHBD_SDSFP_WRAPPER(16, 64, 12) HIGHBD_SDSFP_WRAPPER(8, 32, 12) HIGHBD_SDSFP_WRAPPER(4, 16, 12) #endif break; default: assert(0 && "cm->seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); } } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void copy_frame_prob_info(AV1_COMP *cpi) { FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { av1_copy(frame_probs->obmc_probs, default_obmc_probs); } if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { av1_copy(frame_probs->warped_probs, default_warped_probs); } if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { av1_copy(frame_probs->switchable_interp_probs, default_switchable_interp_probs); } #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs); } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs); } if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { av1_copy(temp_frame_probs->warped_probs, default_warped_probs); } if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { av1_copy(temp_frame_probs->switchable_interp_probs, default_switchable_interp_probs); } FrameProbInfo *const temp_frame_probs_simulation = &cpi->ppi->temp_frame_probs_simulation; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { av1_copy(temp_frame_probs_simulation->tx_type_probs, default_tx_type_probs); } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs); } if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs); } if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { av1_copy(temp_frame_probs_simulation->switchable_interp_probs, default_switchable_interp_probs); } } #endif } static inline void restore_cdef_coding_context(CdefInfo 
*const dst, const CdefInfo *const src) { dst->cdef_bits = src->cdef_bits; dst->cdef_damping = src->cdef_damping; av1_copy(dst->cdef_strengths, src->cdef_strengths); av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths); dst->nb_cdef_strengths = src->nb_cdef_strengths; } // Coding context that only needs to be restored when recode loop includes // filtering (deblocking, CDEF, superres post-encode upscale and/or loop // restoraton). static inline void restore_extra_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; cm->lf = cc->lf; restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info); cpi->rc = cc->rc; cpi->ppi->mv_stats = cc->mv_stats; } static inline int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { return a->y_height == b->y_height && a->y_width == b->y_width && a->uv_height == b->uv_height && a->uv_width == b->uv_width && a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && a->border == b->border && (a->flags & YV12_FLAG_HIGHBITDEPTH) == (b->flags & YV12_FLAG_HIGHBITDEPTH); } static inline int update_entropy(bool *ext_refresh_frame_context, bool *ext_refresh_frame_context_pending, bool update) { *ext_refresh_frame_context = update; *ext_refresh_frame_context_pending = 1; return 0; } #if !CONFIG_REALTIME_ONLY static inline int combine_prior_with_tpl_boost(double min_factor, double max_factor, int prior_boost, int tpl_boost, int frames_to_key) { double factor = sqrt((double)frames_to_key); double range = max_factor - min_factor; factor = AOMMIN(factor, max_factor); factor = AOMMAX(factor, min_factor); factor -= min_factor; int boost = (int)((factor * prior_boost + (range - factor) * tpl_boost) / range); return boost; } #endif static inline void set_size_independent_vars(AV1_COMP *cpi) { int i; AV1_COMMON *const cm = &cpi->common; FeatureFlags *const features = &cm->features; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { cm->global_motion[i] = default_warp_params; } cpi->gm_info.search_done = 0; av1_set_speed_features_framesize_independent(cpi, cpi->speed); av1_set_rd_speed_thresholds(cpi); features->interp_filter = SWITCHABLE; features->switchable_motion_mode = is_switchable_motion_mode_allowed( features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc); } static inline void release_scaled_references(AV1_COMP *cpi) { // Scaled references should only need to be released under certain conditions: // if the reference will be updated, or if the scaled reference has same // resolution. For now only apply this to Golden for non-svc RTC mode. AV1_COMMON *const cm = &cpi->common; const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 
1 : 0; bool release_golden = true; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; const int golden_ref = (i == GOLDEN_FRAME - 1); if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc && buf != NULL) { const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME); const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width && buf->buf.y_crop_height == ref->buf.y_crop_height; release_golden = refresh_golden || same_resoln; } if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) { --buf->ref_count; cpi->scaled_ref_buf[i] = NULL; } } } static inline void restore_all_coding_context(AV1_COMP *cpi) { restore_extra_coding_context(cpi); if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); } static inline int reduce_num_ref_buffers(const AV1_COMP *cpi) { const SequenceHeader *const seq_params = cpi->common.seq_params; return is_one_pass_rt_params(cpi) && use_rtc_reference_structure_one_layer(cpi) && (seq_params->order_hint_info.enable_order_hint == 0) && cpi->rt_reduce_num_ref_buffers; } // Refresh reference frame buffers according to refresh_frame_flags. static inline void refresh_reference_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; // All buffers are refreshed for shown keyframes and S-frames. // In case of RT, golden frame refreshes the 6th slot and other reference // frames refresh slots 0 to 5. Slot 7 is not refreshed by any reference // frame. Thus, only 7 buffers are refreshed for keyframes and S-frames // instead of 8. int num_ref_buffers = REF_FRAMES; if (reduce_num_ref_buffers(cpi)) { const int refresh_all_bufs = (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET || frame_is_sframe(cm)); assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1, refresh_all_bufs)); (void)refresh_all_bufs; num_ref_buffers--; } for (int ref_frame = 0; ref_frame < num_ref_buffers; ref_frame++) { if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); } } } #if !CONFIG_REALTIME_ONLY void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf); void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf); #endif void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, const int phase, const int use_optimized_scaler); void av1_setup_frame(AV1_COMP *cpi); BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, int height, int number_spatial_layers); void av1_apply_active_map(AV1_COMP *cpi); #if !CONFIG_REALTIME_ONLY uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi); void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig); #endif void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, int *top_index); void av1_finalize_encoded_frame(AV1_COMP *const cpi); int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, const YV12_BUFFER_CONFIG *last_picture, ForceIntegerMVInfo *const force_intpel_info); void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi); void av1_save_all_coding_context(AV1_COMP *cpi); #if DUMP_RECON_FRAMES == 1 void av1_dump_filtered_recon_frames(AV1_COMP *cpi); #endif static inline int av1_get_enc_border_size(bool resize, bool all_intra, BLOCK_SIZE sb_size) { // For allintra encoding mode, inter-frame motion search is not applicable and // the intraBC motion vectors are restricted within the tile boundaries. 
Hence // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case. if (resize) { return AOM_BORDER_IN_PIXELS; } if (all_intra) { return AOM_ENC_ALLINTRA_BORDER; } return block_size_wide[sb_size] + 32; } static inline bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) { const ResizeCfg *resize_cfg = &oxcf->resize_cfg; const SuperResCfg *superres_cfg = &oxcf->superres_cfg; return resize_cfg->resize_mode || superres_cfg->superres_mode; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_ aom-3.12.1/av1/encoder/encodetxb.c000066400000000000000000001060021477627663500166250ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/encodetxb.h" #include #include "aom_ports/mem.h" #include "av1/common/blockd.h" #include "av1/common/idct.h" #include "av1/common/pred_common.h" #include "av1/common/scan.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/hash.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" void av1_alloc_txb_buf(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; const int num_sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); const int num_sb_cols = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); const int size = num_sb_rows * num_sb_cols; const int num_planes = av1_num_planes(cm); const int subsampling_x = cm->seq_params->subsampling_x; const int subsampling_y = cm->seq_params->subsampling_y; const int luma_max_sb_square = 1 << num_pels_log2_lookup[cm->seq_params->sb_size]; const int chroma_max_sb_square = luma_max_sb_square >> (subsampling_x + subsampling_y); const int total_max_sb_square = (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square); if ((size_t)size > SIZE_MAX / (size_t)total_max_sb_square) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "A multiplication would overflow size_t"); } const size_t num_tcoeffs = (size_t)size * (size_t)total_max_sb_square; const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN; av1_free_txb_buf(cpi); // TODO(jingning): This should be further reduced. 
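  // One CB_COEFF_BUFFER is allocated per superblock below. The shared pools
  // hold transform coefficients for every plane (chroma sized down by the
  // subsampling factors) plus one eob and one entropy-context entry per
  // TX_SIZE_W_MIN x TX_SIZE_H_MIN coefficient unit.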
CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, aom_malloc(sizeof(*cpi->coeff_buffer_base) * size)); if (sizeof(*coeff_buf_pool->tcoeff) > SIZE_MAX / num_tcoeffs) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "A multiplication would overflow size_t"); } CHECK_MEM_ERROR( cm, coeff_buf_pool->tcoeff, aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs)); if (sizeof(*coeff_buf_pool->eobs) > SIZE_MAX / num_tcoeffs) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "A multiplication would overflow size_t"); } CHECK_MEM_ERROR( cm, coeff_buf_pool->eobs, aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size)); if (sizeof(*coeff_buf_pool->entropy_ctx) > SIZE_MAX / num_tcoeffs) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "A multiplication would overflow size_t"); } CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx, aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) * num_tcoeffs / txb_unit_size)); tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff; uint16_t *eob_ptr = coeff_buf_pool->eobs; uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx; for (int i = 0; i < size; i++) { for (int plane = 0; plane < num_planes; plane++) { const int max_sb_square = (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square; cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr; cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr; cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr; tcoeff_ptr += max_sb_square; eob_ptr += max_sb_square / txb_unit_size; entropy_ctx_ptr += max_sb_square / txb_unit_size; } } } void av1_free_txb_buf(AV1_COMP *cpi) { CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; aom_free(cpi->coeff_buffer_base); cpi->coeff_buffer_base = NULL; aom_free(coeff_buf_pool->tcoeff); coeff_buf_pool->tcoeff = NULL; aom_free(coeff_buf_pool->eobs); coeff_buf_pool->eobs = NULL; aom_free(coeff_buf_pool->entropy_ctx); coeff_buf_pool->entropy_ctx = NULL; } static void write_golomb(aom_writer *w, int level) { int x = level + 1; int i = x; int length = 0; while (i) { i >>= 1; ++length; } assert(length > 0); for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0); for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); } static const int8_t eob_to_pos_small[33] = { 0, 1, 2, // 0-2 3, 3, // 3-4 4, 4, 4, 4, // 5-8 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 }; static const int8_t eob_to_pos_large[17] = { 6, // place holder 7, // 33-64 8, 8, // 65-128 9, 9, 9, 9, // 129-256 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 11 // 513- }; int av1_get_eob_pos_token(const int eob, int *const extra) { int t; if (eob < 33) { t = eob_to_pos_small[eob]; } else { const int e = AOMMIN((eob - 1) >> 5, 16); t = eob_to_pos_large[e]; } *extra = eob - av1_eob_group_start[t]; return t; } #if CONFIG_ENTROPY_STATS static void update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, TX_CLASS tx_class, PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, uint8_t allow_update_cdf) { #else static void update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, uint8_t allow_update_cdf) { #endif int eob_extra; const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const int eob_multi_size = txsize_log2_minus4[tx_size]; const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; switch (eob_multi_size) { case 0: #if CONFIG_ENTROPY_STATS ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); break; case 1: #if CONFIG_ENTROPY_STATS ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); break; case 2: #if CONFIG_ENTROPY_STATS ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); break; case 3: #if CONFIG_ENTROPY_STATS ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) { update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, 8); } break; case 4: #if CONFIG_ENTROPY_STATS ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) { update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, 9); } break; case 5: #if CONFIG_ENTROPY_STATS ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) { update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, 10); } break; case 6: default: #if CONFIG_ENTROPY_STATS ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; #endif if (allow_update_cdf) { update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, 11); } break; } if (av1_eob_offset_bits[eob_pt] > 0) { int eob_ctx = eob_pt - 3; int eob_shift = av1_eob_offset_bits[eob_pt] - 1; int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; #if CONFIG_ENTROPY_STATS counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; #endif // CONFIG_ENTROPY_STATS if (allow_update_cdf) update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); } } static inline int get_nz_map_ctx(const uint8_t *const levels, const int coeff_idx, const int bhl, const int width, const int scan_idx, const int is_eob, const TX_SIZE tx_size, const TX_CLASS tx_class) { if (is_eob) { if (scan_idx == 0) return 0; if (scan_idx <= (width << bhl) / 8) return 1; if (scan_idx <= (width << bhl) / 4) return 2; return 3; } const int stats = get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); } void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = height + TX_PAD_HOR; uint8_t *ls = levels; memset(levels + stride * width, 0, sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); for (int i = 0; i < width; i++) { for (int j = 0; j < height; j++) { *ls++ = (uint8_t)clamp(abs(coeff[i * height + j]), 0, INT8_MAX); } for (int j = 0; j < TX_PAD_HOR; j++) { *ls++ = 0; } } } void av1_get_nz_map_contexts_c(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts) { const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); for (int i = 0; i < eob; ++i) { const int pos = scan[i]; coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, i, i == eob - 1, tx_size, tx_class); } } void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, aom_writer *w, int blk_row, int blk_col, int plane, int block, TX_SIZE tx_size) { MACROBLOCKD *xd = &x->e_mbd; const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; const 
PLANE_TYPE plane_type = get_plane_type(plane); const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; const uint16_t eob = eob_txb[block]; const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2); if (eob == 0) return; const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); // Only y plane's tx_type is transmitted if (plane == 0) { av1_write_tx_type(cm, xd, tx_type, tx_size, w); } int eob_extra; const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); const int eob_multi_size = txsize_log2_minus4[tx_size]; const TX_CLASS tx_class = tx_type_to_class[tx_type]; const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; switch (eob_multi_size) { case 0: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); break; case 1: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); break; case 2: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); break; case 3: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); break; case 4: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); break; case 5: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); break; default: aom_write_symbol(w, eob_pt - 1, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); break; } const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; if (eob_offset_bits > 0) { const int eob_ctx = eob_pt - 3; int eob_shift = eob_offset_bits - 1; int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; aom_write_symbol(w, bit, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); for (int i = 1; i < eob_offset_bits; i++) { eob_shift = eob_offset_bits - 1 - i; bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; aom_write_bit(w, bit); } } const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); const tran_low_t *tcoeff_txb = cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block); av1_txb_init_levels(tcoeff, width, height, levels); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); const int16_t *const scan = scan_order->scan; DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); const int bhl = get_txb_bhl(tx_size); for (int c = eob - 1; c >= 0; --c) { const int pos = scan[c]; const int coeff_ctx = coeff_contexts[pos]; const tran_low_t v = tcoeff[pos]; const tran_low_t level = abs(v); if (c == eob - 1) { aom_write_symbol( w, AOMMIN(level, 3) - 1, ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); } else { aom_write_symbol(w, AOMMIN(level, 3), ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], 4); } if (level > NUM_BASE_LEVELS) { // level is above 1. 
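// Descriptive note: the magnitude above the base levels is coded in groups of
// up to (BR_CDF_SIZE - 1) extra levels; coding stops early when a group does
// not saturate, and any remainder beyond COEFF_BASE_RANGE is completed by
// write_golomb() in the sign loop further below.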
const int base_range = level - 1 - NUM_BASE_LEVELS; const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); aom_cdf_prob *cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); aom_write_symbol(w, k, cdf, BR_CDF_SIZE); if (k < BR_CDF_SIZE - 1) break; } } } // Loop to code all signs in the transform block, // starting with the sign of DC (if applicable) for (int c = 0; c < eob; ++c) { const tran_low_t v = tcoeff[scan[c]]; const tran_low_t level = abs(v); const int sign = (v < 0) ? 1 : 0; if (level) { if (c == 0) { const int dc_sign_ctx = (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK; aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], 2); } else { aom_write_bit(w, sign); } if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); } } } void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, aom_writer *w, BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); int block[MAX_MB_PLANE] = { 0 }; int row, col; assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, xd->plane[0].subsampling_y)); const int max_blocks_wide = max_block_wide(xd, bsize, 0); const int max_blocks_high = max_block_high(xd, bsize, 0); const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; int mu_blocks_wide = mi_size_wide[max_unit_bsize]; int mu_blocks_high = mi_size_high[max_unit_bsize]; mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); for (row = 0; row < max_blocks_high; row += mu_blocks_high) { for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; const TX_SIZE tx_size = av1_get_tx_size(plane, xd); const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; const int step = stepr * stepc; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int unit_height = ROUND_POWER_OF_TWO( AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); const int unit_width = ROUND_POWER_OF_TWO( AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; blk_row += stepr) { for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; blk_col += stepc) { av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block[plane], tx_size); block[plane] += step; } } } } } } uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob) { const int16_t *const scan = scan_order->scan; int cul_level = 0; int c; if (eob == 0) return 0; for (c = 0; c < eob; ++c) { cul_level += abs(qcoeff[scan[c]]); if (cul_level > COEFF_CONTEXT_MASK) break; } cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); set_dc_sign(&cul_level, qcoeff[0]); return (uint8_t)cul_level; } static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm, MACROBLOCKD *xd, int blk_row, int blk_col, int plane, TX_SIZE tx_size, FRAME_COUNTS *counts, uint8_t allow_update_cdf) { MB_MODE_INFO *mbmi = xd->mi[0]; int is_inter = is_inter_block(mbmi); const int reduced_tx_set_used = cm->features.reduced_tx_set_used; FRAME_CONTEXT *fc = xd->tile_ctx; #if !CONFIG_ENTROPY_STATS (void)counts; #endif // !CONFIG_ENTROPY_STATS // Only y plane's tx_type is updated 
if (plane > 0) return; const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col, tx_size, reduced_tx_set_used); if (is_inter) { if (cpi->oxcf.txfm_cfg.use_inter_dct_only) { assert(tx_type == DCT_DCT); } } else { if (cpi->oxcf.txfm_cfg.use_intra_dct_only) { assert(tx_type == DCT_DCT); } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) { const TX_TYPE default_type = get_default_tx_type( PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools); (void)default_type; // TODO(kyslov): We don't always respect use_intra_default_tx_only flag in // NonRD and REALTIME case. Specifically we ignore it in hybrid inta mode // search, when picking up intra mode in nonRD inter mode search and in RD // REALTIME mode when we limit TX type usage. // We need to fix txfm cfg for these cases. Meanwhile relieving the // assert. assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.mode == REALTIME); } } if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); if (eset > 0) { const TxSetType tx_set_type = av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used); if (is_inter) { if (allow_update_cdf) { update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], av1_ext_tx_ind[tx_set_type][tx_type], av1_num_ext_tx_set[tx_set_type]); } #if CONFIG_ENTROPY_STATS ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] [av1_ext_tx_ind[tx_set_type][tx_type]]; #endif // CONFIG_ENTROPY_STATS } else { PREDICTION_MODE intra_dir; if (mbmi->filter_intra_mode_info.use_filter_intra) intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info .filter_intra_mode]; else intra_dir = mbmi->mode; #if CONFIG_ENTROPY_STATS ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] [av1_ext_tx_ind[tx_set_type][tx_type]]; #endif // CONFIG_ENTROPY_STATS if (allow_update_cdf) { update_cdf( fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], av1_ext_tx_ind[tx_set_type][tx_type], av1_num_ext_tx_set[tx_set_type]); } } } } } void av1_update_and_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct tokenize_b_args *const args = arg; const AV1_COMP *cpi = args->cpi; const AV1_COMMON *cm = &cpi->common; ThreadData *const td = args->td; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int eob = p->eobs[block]; const int block_offset = BLOCK_OFFSET(block); tran_low_t *qcoeff = p->qcoeff + block_offset; const PLANE_TYPE plane_type = pd->plane_type; const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); tran_low_t *tcoeff; assert(args->dry_run != DRY_RUN_COSTCOEFFS); if (args->dry_run == OUTPUT_ENABLED) { MB_MODE_INFO *mbmi = xd->mi[0]; TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + blk_col, pd->left_entropy_context + blk_row, &txb_ctx); const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); const uint8_t allow_update_cdf = args->allow_update_cdf; const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; #if 
CONFIG_ENTROPY_STATS int cdf_idx = cm->coef_cdf_category; ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; #endif // CONFIG_ENTROPY_STATS if (allow_update_cdf) { update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0, 2); } CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; entropy_ctx[block] = txb_ctx.txb_skip_ctx; eob_txb[block] = eob; if (eob == 0) { av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row); return; } const int segment_id = mbmi->segment_id; const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); tran_low_t *tcoeff_txb = cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; tcoeff = tcoeff_txb + block_offset; memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); av1_txb_init_levels(tcoeff, width, height, levels); update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, td->counts, allow_update_cdf); const TX_CLASS tx_class = tx_type_to_class[tx_type]; const int16_t *const scan = scan_order->scan; // record tx type usage td->rd_counts.tx_type_used[tx_size][tx_type]++; #if CONFIG_ENTROPY_STATS update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, td->counts, allow_update_cdf); #else update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, allow_update_cdf); #endif DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); for (int c = eob - 1; c >= 0; --c) { const int pos = scan[c]; const int coeff_ctx = coeff_contexts[pos]; const tran_low_t v = qcoeff[pos]; const tran_low_t level = abs(v); /* abs_sum_level is needed to decide the job scheduling order of * pack bitstream multi-threading. This data is not needed if * multi-threading is disabled. 
*/ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; if (allow_update_cdf) { if (c == eob - 1) { assert(coeff_ctx < 4); update_cdf( ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], AOMMIN(level, 3) - 1, 3); } else { update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], AOMMIN(level, 3), 4); } } if (c == eob - 1) { assert(coeff_ctx < 4); #if CONFIG_ENTROPY_STATS ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] [coeff_ctx][AOMMIN(level, 3) - 1]; } else { ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] [coeff_ctx][AOMMIN(level, 3)]; #endif } if (level > NUM_BASE_LEVELS) { const int base_range = level - 1 - NUM_BASE_LEVELS; const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); if (allow_update_cdf) { update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] [plane_type][br_ctx], k, BR_CDF_SIZE); } for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { #if CONFIG_ENTROPY_STATS ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] [lps][br_ctx][lps == k]; #endif // CONFIG_ENTROPY_STATS if (lps == k) break; } #if CONFIG_ENTROPY_STATS ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] [plane_type][br_ctx][k]; #endif if (k < BR_CDF_SIZE - 1) break; } } } // Update the context needed to code the DC sign (if applicable) if (tcoeff[0] != 0) { const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; const int dc_sign_ctx = txb_ctx.dc_sign_ctx; #if CONFIG_ENTROPY_STATS ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; #endif // CONFIG_ENTROPY_STATS if (allow_update_cdf) update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; } } else { tcoeff = qcoeff; } const uint8_t cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, blk_row); } void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct tokenize_b_args *const args = arg; const AV1_COMP *cpi = args->cpi; const AV1_COMMON *cm = &cpi->common; ThreadData *const td = args->td; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int eob = p->eobs[block]; const int block_offset = BLOCK_OFFSET(block); tran_low_t *qcoeff = p->qcoeff + block_offset; const PLANE_TYPE plane_type = pd->plane_type; const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); tran_low_t *tcoeff; assert(args->dry_run != DRY_RUN_COSTCOEFFS); if (args->dry_run == OUTPUT_ENABLED) { MB_MODE_INFO *mbmi = xd->mi[0]; TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + blk_col, pd->left_entropy_context + blk_row, &txb_ctx); #if CONFIG_ENTROPY_STATS const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); int cdf_idx = cm->coef_cdf_category; ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; #endif // CONFIG_ENTROPY_STATS CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; const int txb_offset = 
x->mbmi_ext_frame->cb_offset[plane_type] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; entropy_ctx[block] = txb_ctx.txb_skip_ctx; eob_txb[block] = eob; if (eob == 0) { av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row); return; } const int segment_id = mbmi->segment_id; const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); tran_low_t *tcoeff_txb = cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; tcoeff = tcoeff_txb + block_offset; memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); #if CONFIG_ENTROPY_STATS uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); av1_txb_init_levels(tcoeff, width, height, levels); update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, td->counts, 0 /*allow_update_cdf*/); const TX_CLASS tx_class = tx_type_to_class[tx_type]; const bool do_coeff_scan = true; #else const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled; #endif const int16_t *const scan = scan_order->scan; // record tx type usage td->rd_counts.tx_type_used[tx_size][tx_type]++; #if CONFIG_ENTROPY_STATS FRAME_CONTEXT *ec_ctx = xd->tile_ctx; update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, td->counts, 0 /*allow_update_cdf*/); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); #endif for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) { const int pos = scan[c]; const tran_low_t v = qcoeff[pos]; const tran_low_t level = abs(v); /* abs_sum_level is needed to decide the job scheduling order of * pack bitstream multi-threading. This data is not needed if * multi-threading is disabled. */ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; #if CONFIG_ENTROPY_STATS const int coeff_ctx = coeff_contexts[pos]; if (c == eob - 1) { assert(coeff_ctx < 4); ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] [coeff_ctx][AOMMIN(level, 3) - 1]; } else { ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] [coeff_ctx][AOMMIN(level, 3)]; } if (level > NUM_BASE_LEVELS) { const int base_range = level - 1 - NUM_BASE_LEVELS; const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] [lps][br_ctx][lps == k]; if (lps == k) break; } ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] [plane_type][br_ctx][k]; if (k < BR_CDF_SIZE - 1) break; } } #endif } // Update the context needed to code the DC sign (if applicable) if (tcoeff[0] != 0) { const int dc_sign_ctx = txb_ctx.dc_sign_ctx; #if CONFIG_ENTROPY_STATS const int dc_sign = (tcoeff[0] < 0) ? 
1 : 0; ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; #endif // CONFIG_ENTROPY_STATS entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; } } else { tcoeff = qcoeff; } const uint8_t cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col, blk_row); } void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, uint8_t allow_update_cdf) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; if (mbmi->skip_txfm) { av1_reset_entropy_context(xd, bsize, num_planes); return; } const foreach_transformed_block_visitor visit = allow_update_cdf ? av1_update_and_record_txb_context : av1_record_txb_context; for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg); } } CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; const int mib_size_log2 = cm->seq_params->mib_size_log2; const int stride = CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); const int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); return cpi->coeff_buffer_base + offset; } aom-3.12.1/av1/encoder/encodetxb.h000066400000000000000000000255021477627663500166370ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ENCODETXB_H_ #define AOM_AV1_ENCODER_ENCODETXB_H_ #include "config/aom_config.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/txb_common.h" #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" #include "aom_dsp/bitwriter.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ #define TXB_SKIP_CTX_MASK 15 #define DC_SIGN_CTX_SHIFT 4 #define DC_SIGN_CTX_MASK 3 int av1_get_eob_pos_token(const int eob, int *const extra); /*!\endcond */ /*!\brief Allocate the memory resources for all the macro blocks in the current * coding frame. * \ingroup coefficient_coding * * Each macro block will need a \ref CB_COEFF_BUFFER to store information for * rate-distortion optimization and entropy coding of transform coefficients. * * \param[in] cpi Top-level encoder structure */ void av1_alloc_txb_buf(AV1_COMP *cpi); /*!\brief Free the memory resources for all the macro blocks in the current * coding frame. * \ingroup coefficient_coding * * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details. 
* * \param[in] cpi Top-level encoder structure */ void av1_free_txb_buf(AV1_COMP *cpi); /*!\brief Write quantized coefficients in a transform block into the bitstream using * entropy coding. * * \ingroup coefficient_coding * * This function will write the quantized coefficients in a transform block into * the bitstream using entropy coding. * * The coding steps are as follows. * * 1) Code the end of block position "eob", which is the scan index of the * last non-zero coefficient plus one. * * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS) * for each coefficient in reversed scan order. * * 3) Code the sign and higher magnitude level * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order. * * \param[in] cm Top-level structure shared by encoder and * decoder * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] w Entropy coding write pointer * \param[in] blk_row The row index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane * \param[in] blk_col The col index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane * \param[in] plane The index of the current plane * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the current transform block * \param[in] tx_size The given transform size */ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, aom_writer *w, int blk_row, int blk_col, int plane, int block, TX_SIZE tx_size); /*!\brief Write quantized coefficients of all transform blocks in an intra * macroblock into the bitstream using entropy coding. * * \ingroup coefficient_coding * * All transform blocks in the intra macroblock share the same transform size. * * This function uses \ref av1_write_coeffs_txb() to code each transform block in * raster order. * * \param[in] cm Top-level structure shared by encoder and * decoder * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] w Entropy coding write pointer * \param[in] bsize Block size of the current macroblock */ void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, aom_writer *w, BLOCK_SIZE bsize); /*!\brief Pack the context info of the current transform block into a uint8_t. * \ingroup coefficient_coding * * This context info will be collected and consolidated by its neighbor * transform blocks for coding transform block skip flag (tx_skip) and * the sign of DC coefficient (dc_sign). * * \param[in] qcoeff Buffer of quantized coefficients * \param[in] scan_order Coding order of coefficients in the transform * block * \param[in] eob The scan index of last non-zero coefficient plus * one */ uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff, const SCAN_ORDER *scan_order, int eob); /*!\brief Update the probability model (cdf) and the entropy context related to * coefficient coding for all transform blocks in the intra macroblock. * * \ingroup coefficient_coding * * This function will go through each transform block in the intra macroblock * and call \ref av1_update_and_record_txb_context to update the probability * model and entropy context properly. * * \param[in] cpi Top-level encoder structure * \param[in] td Top-level multithreading structure * \param[in] dry_run Whether this is a dry run.
* \param[in] bsize Block size of the current macroblock * \param[in] allow_update_cdf Allowed to update probability model (cdf) or * not. */ void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, uint8_t allow_update_cdf); /*!\brief Update the probability model (cdf) and the entropy context related to * coefficient coding for a transform block. * * \ingroup coefficient_coding * * There are a regular mode and a dry run for this function. * * Regular mode: * * The probability model (cdf) for each coding symbol in the * transform block will be updated. * * The entropy context of this transform block will be updated. * * Dry run: * * The probability model update will be skipped. * * The entropy context of this transform block will be updated. * * \param[in] plane The index of the current plane. * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the current transform block. * \param[in] blk_row The row index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane. * \param[in] blk_col The col index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane. * \param[in] plane_bsize Block size for this plane. When the video source * uses chroma subsampling, the block size of UV planes will be smaller than the * block size of Y plane. * \param[in] tx_size The given transform size. * \param[in] arg This parameter will be translated into * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run. */ void av1_update_and_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); /*!\brief Update the entropy context related to coefficient coding for a * transform block. * * \ingroup coefficient_coding * * There are a regular mode and a dry run for this function. * * Regular mode: * * The entropy context of this transform block will be updated. * * Dry run: * * The probability model update will be skipped. * * The entropy context of this transform block will be updated. * * \param[in] plane The index of the current plane. * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the current transform block. * \param[in] blk_row The row index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane. * \param[in] blk_col The col index of the current transform block * in the macroblock. Each unit has 4 pixels in y plane. * \param[in] plane_bsize Block size for this plane. When the video source * uses chroma subsampling, the block size of UV planes will be smaller than the * block size of Y plane. * \param[in] tx_size The given transform size. * \param[in] arg This parameter will be translated into * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run. */ void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); /*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block. * * \ingroup coefficient_coding * * The macroblock's location is described by mi_row and mi_col, row and column * mi indexes in the coding frame. * * Each mi unit is a 4x4 pixel block. * * \param[in] cpi Top-level encoder structure. * \param[in] mi_row Row mi index of the current transform block * in the frame.
* \param[in] mi_col Column mi index of the current transform * block in the frame. * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated * to this macroblock. */ CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, int mi_col); /*!\brief Returns the entropy cost associated with skipping the current * transform block. * * \ingroup coefficient_coding * * \param[in] coeff_costs Table of entropy cost for coefficient coding. * \param[in] txb_ctx Context info for entropy coding transform block * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[in] plane The index of the current plane * \param[in] tx_size The transform size */ static inline int av1_cost_skip_txb(const CoeffCosts *coeff_costs, const TXB_CTX *const txb_ctx, int plane, TX_SIZE tx_size) { const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); const LV_MAP_COEFF_COST *const coeff_costs_ = &coeff_costs->coeff_costs[txs_ctx][plane_type]; return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; } /*!\cond */ // These numbers are empirically obtained. static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 17, 13 }, { 16, 10 }, }; /*!\endcond */ #ifdef __cplusplus } #endif #endif // AOM_AV1_ENCODER_ENCODETXB_H_ aom-3.12.1/av1/encoder/ethread.c000066400000000000000000004034201477627663500162720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_util/aom_pthread.h" #include "av1/common/warped_motion.h" #include "av1/common/thread_common.h" #include "av1/encoder/allintra_vis.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/enc_enums.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/ethread.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/firstpass.h" #endif #include "av1/encoder/global_motion.h" #include "av1/encoder/global_motion_facade.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/rdopt.h" #include "aom_dsp/aom_dsp_common.h" #include "av1/encoder/temporal_filter.h" #include "av1/encoder/tpl_model.h" static inline void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td->rd_counts.compound_ref_used_flag |= td_t->rd_counts.compound_ref_used_flag; td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; for (int i = 0; i < TX_SIZES_ALL; i++) { for (int j = 0; j < TX_TYPES; j++) td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j]; } for (int i = 0; i < BLOCK_SIZES_ALL; i++) { for (int j = 0; j < 2; j++) { td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j]; } } for (int i = 0; i < 2; i++) { td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i]; } td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0]; td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1]; td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks; } static inline void update_delta_lf_for_row_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; const int mib_size = cm->seq_params->mib_size; const int frame_lf_count = av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int row = 0; row < cm->tiles.rows; row++) { for (int col = 0; col < cm->tiles.cols; col++) { TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; const TileInfo *const tile_info = &tile_data->tile_info; for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += mib_size) { if (mi_row == tile_info->mi_row_start) av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += mib_size) { const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; MB_MODE_INFO *mbmi = mi[0]; if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params->sb_size)) { for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; mbmi->delta_lf_from_base = xd->delta_lf_from_base; } else { if (cm->delta_q_info.delta_lf_multi) { for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } else { xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } } } } } } void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) { (void)row_mt_sync; (void)r; (void)c; } void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, int cols) { (void)row_mt_sync; (void)r; (void)c; (void)cols; } void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) { #if CONFIG_MULTITHREAD const int nsync = row_mt_sync->sync_range; if (r) { pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; pthread_mutex_lock(mutex); while (c > row_mt_sync->num_finished_cols[r - 1] - nsync - row_mt_sync->intrabc_extra_top_right_sb_delay) { pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); } pthread_mutex_unlock(mutex); } #else (void)row_mt_sync; (void)r; (void)c; #endif // CONFIG_MULTITHREAD } void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, int cols) { #if CONFIG_MULTITHREAD const int nsync = row_mt_sync->sync_range; int cur; // Only signal when there are enough encoded blocks for next row to run. int sig = 1; if (c < cols - 1) { cur = c; if (c % nsync) sig = 0; } else { cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay; } if (sig) { pthread_mutex_lock(&row_mt_sync->mutex_[r]); // When a thread encounters an error, num_finished_cols[r] is set to maximum // column number. In this case, the AOMMAX operation here ensures that // num_finished_cols[r] is not overwritten with a smaller value thus // preventing the infinite waiting of threads in the relevant sync_read() // function. 
row_mt_sync->num_finished_cols[r] = AOMMAX(row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&row_mt_sync->cond_[r]); pthread_mutex_unlock(&row_mt_sync->mutex_[r]); } #else (void)row_mt_sync; (void)r; (void)c; (void)cols; #endif // CONFIG_MULTITHREAD } // Allocate memory for row synchronization static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync, AV1_COMMON *cm, int rows) { #if CONFIG_MULTITHREAD int i; CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, aom_malloc(sizeof(*row_mt_sync->mutex_) * rows)); if (row_mt_sync->mutex_) { for (i = 0; i < rows; ++i) { pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); } } CHECK_MEM_ERROR(cm, row_mt_sync->cond_, aom_malloc(sizeof(*row_mt_sync->cond_) * rows)); if (row_mt_sync->cond_) { for (i = 0; i < rows; ++i) { pthread_cond_init(&row_mt_sync->cond_[i], NULL); } } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols, aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows)); row_mt_sync->rows = rows; // Set up nsync. row_mt_sync->sync_range = 1; } // Deallocate row based multi-threading synchronization related mutex and data void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) { if (row_mt_sync != NULL) { #if CONFIG_MULTITHREAD int i; if (row_mt_sync->mutex_ != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { pthread_mutex_destroy(&row_mt_sync->mutex_[i]); } aom_free(row_mt_sync->mutex_); } if (row_mt_sync->cond_ != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { pthread_cond_destroy(&row_mt_sync->cond_[i]); } aom_free(row_mt_sync->cond_); } #endif // CONFIG_MULTITHREAD aom_free(row_mt_sync->num_finished_cols); // clear the structure as the source of this call may be dynamic change // in tiles in which case this call will be followed by an _alloc() // which may fail. 
av1_zero(*row_mt_sync); } } static inline int get_sb_rows_in_frame(AV1_COMMON *cm) { return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); } static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols, int alloc_row_ctx) { struct AV1Common *cm = &cpi->common; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int tile_col, tile_row; av1_row_mt_mem_dealloc(cpi); // Allocate memory for row based multi-threading for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows); if (alloc_row_ctx) { assert(max_cols > 0); const int num_row_ctx = AOMMAX(1, (max_cols - 1)); CHECK_MEM_ERROR(cm, this_tile->row_ctx, (FRAME_CONTEXT *)aom_memalign( 16, num_row_ctx * sizeof(*this_tile->row_ctx))); } } } const int sb_rows = get_sb_rows_in_frame(cm); CHECK_MEM_ERROR( cm, enc_row_mt->num_tile_cols_done, aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows)); enc_row_mt->allocated_rows = max_rows; enc_row_mt->allocated_cols = max_cols - 1; enc_row_mt->allocated_sb_rows = sb_rows; } void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = enc_row_mt->allocated_tile_cols; const int tile_rows = enc_row_mt->allocated_tile_rows; int tile_col, tile_row; // Free row based multi-threading sync memory for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); if (cpi->oxcf.algo_cfg.cdf_update_mode) { aom_free(this_tile->row_ctx); this_tile->row_ctx = NULL; } } } aom_free(enc_row_mt->num_tile_cols_done); enc_row_mt->num_tile_cols_done = NULL; enc_row_mt->allocated_rows = 0; enc_row_mt->allocated_cols = 0; enc_row_mt->allocated_sb_rows = 0; } static inline void assign_tile_to_thread(int *thread_id_to_tile_id, int num_tiles, int num_workers) { int tile_id = 0; int i; for (i = 0; i < num_workers; i++) { thread_id_to_tile_id[i] = tile_id++; if (tile_id == num_tiles) tile_id = 0; } } static inline int get_next_job(TileDataEnc *const tile_data, int *current_mi_row, int mib_size) { AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; const int mi_row_end = tile_data->tile_info.mi_row_end; if (row_mt_sync->next_mi_row < mi_row_end) { *current_mi_row = row_mt_sync->next_mi_row; row_mt_sync->num_threads_working++; row_mt_sync->next_mi_row += mib_size; return 1; } return 0; } static inline void switch_tile_and_get_next_job( AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id, int *current_mi_row, int *end_of_frame, int is_firstpass, const BLOCK_SIZE fp_block_size) { const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int tile_id = -1; // Stores the tile ID with minimum proc done int max_mis_to_encode = 0; int min_num_threads_working = INT_MAX; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &tile_data[tile_index]; AV1EncRowMultiThreadSync *const row_mt_sync = 
&this_tile->row_mt_sync; #if CONFIG_REALTIME_ONLY int num_b_rows_in_tile = av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); int num_b_cols_in_tile = av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); #else int num_b_rows_in_tile = is_firstpass ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size) : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); int num_b_cols_in_tile = is_firstpass ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size) : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); #endif int theoretical_limit_on_threads = AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile); int num_threads_working = row_mt_sync->num_threads_working; if (num_threads_working < theoretical_limit_on_threads) { int num_mis_to_encode = this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row; // Tile to be processed by this thread is selected on the basis of // availability of jobs: // 1) If jobs are available, tile to be processed is chosen on the // basis of minimum number of threads working for that tile. If two or // more tiles have same number of threads working for them, then the // tile with maximum number of jobs available will be chosen. // 2) If no jobs are available, then end_of_frame is reached. if (num_mis_to_encode > 0) { if (num_threads_working < min_num_threads_working) { min_num_threads_working = num_threads_working; max_mis_to_encode = 0; } if (num_threads_working == min_num_threads_working && num_mis_to_encode > max_mis_to_encode) { tile_id = tile_index; max_mis_to_encode = num_mis_to_encode; } } } } } if (tile_id == -1) { *end_of_frame = 1; } else { // Update the current tile id to the tile id that will be processed next, // which will be the least processed tile. *cur_tile_id = tile_id; const int unit_height = mi_size_high[fp_block_size]; get_next_job(&tile_data[tile_id], current_mi_row, is_firstpass ? unit_height : cm->seq_params->mib_size); } } #if !CONFIG_REALTIME_ONLY static void set_firstpass_encode_done(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; const BLOCK_SIZE fp_block_size = cpi->fp_block_size; const int unit_height = mi_size_high[fp_block_size]; // In case of multithreading of firstpass encode, due to top-right // dependency, the worker on a firstpass row waits for the completion of the // firstpass processing of the top and top-right fp_blocks. Hence, in case a // thread (main/worker) encounters an error, update the firstpass processing // of every row in the frame to indicate that it is complete in order to avoid // dependent workers waiting indefinitely. 
for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; TileInfo *tile = &tile_data->tile_info; AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; const int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size); for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0; mi_row < tile->mi_row_end; mi_row += unit_height, unit_row_in_tile++) { enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_cols_in_tile - 1, unit_cols_in_tile); } } } } static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; int thread_id = thread_data->thread_id; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; #if CONFIG_MULTITHREAD pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; #endif (void)unused; struct aom_internal_error_info *const error_info = &thread_data->error_info; MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); enc_row_mt->firstpass_mt_exit = true; pthread_mutex_unlock(enc_row_mt_mutex_); #endif set_firstpass_encode_done(cpi); return 0; } error_info->setjmp = 1; AV1_COMMON *const cm = &cpi->common; int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; assert(cur_tile_id != -1); const BLOCK_SIZE fp_block_size = cpi->fp_block_size; const int unit_height = mi_size_high[fp_block_size]; int end_of_frame = 0; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit; if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id], ¤t_mi_row, unit_height)) { // No jobs are available for the current tile. Query for the status of // other tiles and get the next job if available switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, ¤t_mi_row, &end_of_frame, 1, fp_block_size); } #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); #endif // When firstpass_mt_exit is set to true, other workers need not pursue any // further jobs. 
if (firstpass_mt_exit || end_of_frame) break; TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; ThreadData *td = thread_data->td; assert(current_mi_row != -1 && current_mi_row < this_tile->tile_info.mi_row_end); const int unit_height_log2 = mi_size_high_log2[fp_block_size]; av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2, fp_block_size); #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); #endif } error_info->setjmp = 0; return 1; } #endif static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data, AV1EncRowMultiThreadInfo *enc_row_mt, int mib_size_log2) { AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync; const int sb_rows = get_sb_rows_in_frame(cm); AV1LfMTInfo *cur_job_info; bool row_mt_exit = false; (void)enc_row_mt; #if CONFIG_MULTITHREAD pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; #endif while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) { LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data; const int lpf_opt_level = cur_job_info->lpf_opt_level; (void)sb_rows; #if CONFIG_MULTITHREAD const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2; const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1); // Wait for current and next superblock row to finish encoding. pthread_mutex_lock(enc_row_mt_mutex_); while (!enc_row_mt->row_mt_exit && (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols || enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) { pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_); } row_mt_exit = enc_row_mt->row_mt_exit; pthread_mutex_unlock(enc_row_mt_mutex_); #endif if (row_mt_exit) return; av1_thread_loop_filter_rows( lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir, lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf, lf_data->tx_buf, mib_size_log2); } } static void set_encoding_done(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int mib_size = cm->seq_params->mib_size; // In case of row-multithreading, due to top-right dependency, the worker on // an SB row waits for the completion of the encode of the top and top-right // SBs. Hence, in case a thread (main/worker) encounters an error, update that // encoding of every SB row in the frame is complete in order to avoid the // dependent workers of every tile from waiting indefinitely. 
for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { TileDataEnc *const this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0; mi_row < tile_info->mi_row_end; mi_row += mib_size, sb_row_in_tile++) { enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1, sb_cols_in_tile); } } } } static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc, const int filter_level[2]) { return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]); } static int enc_row_mt_worker_hook(void *arg1, void *unused) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; int thread_id = thread_data->thread_id; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; #if CONFIG_MULTITHREAD pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; #endif (void)unused; struct aom_internal_error_info *const error_info = &thread_data->error_info; AV1LfSync *const lf_sync = thread_data->lf_sync; MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; xd->error_info = error_info; AV1_COMMON *volatile const cm = &cpi->common; volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled( cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level); // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); enc_row_mt->row_mt_exit = true; // Wake up all the workers waiting in launch_loop_filter_rows() to exit in // case of an error. pthread_cond_broadcast(enc_row_mt->cond_); pthread_mutex_unlock(enc_row_mt_mutex_); #endif set_encoding_done(cpi); if (do_pipelined_lpf_mt_with_enc) { #if CONFIG_MULTITHREAD pthread_mutex_lock(lf_sync->job_mutex); lf_sync->lf_mt_exit = true; pthread_mutex_unlock(lf_sync->job_mutex); #endif av1_set_vert_loop_filter_done(&cpi->common, lf_sync, cpi->common.seq_params->mib_size_log2); } return 0; } error_info->setjmp = 1; const int mib_size_log2 = cm->seq_params->mib_size_log2; int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; // Preallocate the pc_tree for realtime coding to reduce the cost of memory // allocation. if (cpi->sf.rt_sf.use_nonrd_pick_mode) { thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); if (!thread_data->td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); } else { thread_data->td->pc_root = NULL; } assert(cur_tile_id != -1); const BLOCK_SIZE fp_block_size = cpi->fp_block_size; int end_of_frame = 0; bool row_mt_exit = false; // When master thread does not have a valid job to process, xd->tile_ctx // is not set and it contains NULL pointer. This can result in NULL pointer // access violation if accessed beyond the encode stage. Hence, updating // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame // context to avoid NULL pointer access in subsequent stages. 
thread_data->td->mb.e_mbd.tile_ctx = cm->fc; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif row_mt_exit = enc_row_mt->row_mt_exit; // row_mt_exit check here can be avoided as it is checked after // sync_read_ptr() in encode_sb_row(). However, checking row_mt_exit here, // tries to return before calling the function get_next_job(). if (!row_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id], ¤t_mi_row, cm->seq_params->mib_size)) { // No jobs are available for the current tile. Query for the status of // other tiles and get the next job if available switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, ¤t_mi_row, &end_of_frame, 0, fp_block_size); } #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); #endif // When row_mt_exit is set to true, other workers need not pursue any // further jobs. if (row_mt_exit) { error_info->setjmp = 0; return 1; } if (end_of_frame) break; TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; const TileInfo *const tile_info = &this_tile->tile_info; const int tile_row = tile_info->tile_row; const int tile_col = tile_info->tile_col; ThreadData *td = thread_data->td; const int sb_row = current_mi_row >> mib_size_log2; assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end); td->mb.e_mbd.tile_ctx = td->tctx; td->mb.tile_pb_ctx = &this_tile->tctx; td->abs_sum_level = 0; if (this_tile->allow_update_cdf) { td->mb.row_ctx = this_tile->row_ctx; if (current_mi_row == tile_info->mi_row_start) memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); } else { memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); } av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, &td->mb.e_mbd); #if !CONFIG_REALTIME_ONLY cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); #endif if (td->mb.txfm_search_info.mb_rd_record != NULL) { av1_crc32c_calculator_init( &td->mb.txfm_search_info.mb_rd_record->crc_calculator); } av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row); #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif this_tile->abs_sum_level += td->abs_sum_level; row_mt_sync->num_threads_working--; enc_row_mt->num_tile_cols_done[sb_row]++; #if CONFIG_MULTITHREAD pthread_cond_broadcast(enc_row_mt->cond_); pthread_mutex_unlock(enc_row_mt_mutex_); #endif } if (do_pipelined_lpf_mt_with_enc) { // Loop-filter a superblock row if encoding of the current and next // superblock row is complete. // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving // encoding and loop filter stage. launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2); } av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); thread_data->td->pc_root = NULL; error_info->setjmp = 0; return 1; } static int enc_worker_hook(void *arg1, void *unused) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; struct aom_internal_error_info *const error_info = &thread_data->error_info; const AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int t; (void)unused; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). 
Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; return 0; } error_info->setjmp = 1; // Preallocate the pc_tree for realtime coding to reduce the cost of memory // allocation. if (cpi->sf.rt_sf.use_nonrd_pick_mode) { thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); if (!thread_data->td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); } else { thread_data->td->pc_root = NULL; } for (t = thread_data->start; t < tile_rows * tile_cols; t += cpi->mt_info.num_workers) { int tile_row = t / tile_cols; int tile_col = t % tile_cols; TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); } av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); thread_data->td->pc_root = NULL; error_info->setjmp = 0; return 1; } void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) { cpi->mt_info.workers = ppi->p_mt_info.workers; cpi->mt_info.num_workers = ppi->p_mt_info.num_workers; cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data; int i; for (i = MOD_FP; i < NUM_MT_MODULES; i++) { cpi->mt_info.num_mod_workers[i] = AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]); } } void av1_init_cdef_worker(AV1_COMP *cpi) { // The allocation is done only for level 0 parallel frames. No change // in config is supported in the middle of a parallel encode set, since the // rest of the MT modules also do not support dynamic change of config. if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return; PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF); av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker, &cpi->mt_info.cdef_sync, num_cdef_workers, 1); cpi->mt_info.cdef_worker = p_mt_info->cdef_worker; } #if !CONFIG_REALTIME_ONLY void av1_init_lr_mt_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync; if (lr_sync->sync_range) { if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return; int num_lr_workers = av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR); assert(num_lr_workers <= lr_sync->num_workers); lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf; lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs; } } #endif #if CONFIG_MULTITHREAD void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; if (setjmp(cm->error->jmp)) { cm->error->setjmp = 0; aom_internal_error_copy(&cpi->ppi->error, cm->error); } cm->error->setjmp = 1; // Initialize enc row MT object. 
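  // Note: the mutex/cond objects below are allocated lazily on first use and
  // reused across frames; only a missing object is created here.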
if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; if (enc_row_mt->mutex_ == NULL) { CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, aom_malloc(sizeof(*(enc_row_mt->mutex_)))); if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); } if (enc_row_mt->cond_ == NULL) { CHECK_MEM_ERROR(cm, enc_row_mt->cond_, aom_malloc(sizeof(*(enc_row_mt->cond_)))); if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL); } } if (!is_first_pass) { // Initialize global motion MT object. AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; if (gm_sync->mutex_ == NULL) { CHECK_MEM_ERROR(cm, gm_sync->mutex_, aom_malloc(sizeof(*(gm_sync->mutex_)))); if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); } #if !CONFIG_REALTIME_ONLY // Initialize temporal filtering MT object. AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; if (tf_sync->mutex_ == NULL) { CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_))); if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); } #endif // !CONFIG_REALTIME_ONLY // Initialize CDEF MT object. AV1CdefSync *cdef_sync = &mt_info->cdef_sync; if (cdef_sync->mutex_ == NULL) { CHECK_MEM_ERROR(cm, cdef_sync->mutex_, aom_malloc(sizeof(*(cdef_sync->mutex_)))); if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); } // Initialize loop filter MT object. AV1LfSync *lf_sync = &mt_info->lf_row_sync; // Number of superblock rows const int sb_rows = CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2); PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF); if (!lf_sync->sync_range || sb_rows != lf_sync->rows || num_lf_workers > lf_sync->num_workers) { av1_loop_filter_dealloc(lf_sync); av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers); } // Initialize tpl MT object. AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt; if (tpl_row_mt->mutex_ == NULL) { CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_, aom_malloc(sizeof(*(tpl_row_mt->mutex_)))); if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL); } #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { // Initialize loop restoration MT object. AV1LrSync *lr_sync = &mt_info->lr_row_sync; int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height); int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR); if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || num_lr_workers > lr_sync->num_workers || MAX_MB_PLANE > lr_sync->num_planes) { av1_loop_restoration_dealloc(lr_sync); av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr, MAX_MB_PLANE, cm->width); } } #endif // Initialization of pack bitstream MT object. AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync; if (pack_bs_sync->mutex_ == NULL) { CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, aom_malloc(sizeof(*pack_bs_sync->mutex_))); if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); } } cm->error->setjmp = 0; } #endif // CONFIG_MULTITHREAD // Computes the number of workers to be considered while allocating memory for a // multi-threaded module under FPMT. 
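// Illustrative example: if 16 workers are shared across two parallel frame
// contexts, per-module buffers are still sized for all 16 workers so that
// either context can be assigned any subset of the worker pool.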
int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, MULTI_THREADED_MODULES mod_name) { int num_mod_workers = p_mt_info->num_mod_workers[mod_name]; if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) { // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC]. // As frame parallel jobs will only perform multi-threading for the encode // stage, we can limit the allocations according to num_enc_workers per // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]). num_mod_workers = p_mt_info->num_workers; } return num_mod_workers; } void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) { PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; assert(p_mt_info->workers != NULL); assert(p_mt_info->tile_thr_data != NULL); int num_workers = p_mt_info->num_workers; int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC); assert(num_enc_workers <= num_workers); for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; if (i > 0) { // Allocate thread data. ThreadData *td; AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td))); av1_zero(*td); thread_data->original_td = thread_data->td = td; // Set up shared coeff buffers. av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf, &ppi->error); AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst, aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*td->tmp_conv_dst))); if (i < p_mt_info->num_mod_workers[MOD_FP]) { // Set up firstpass PICK_MODE_CONTEXT. td->firstpass_ctx = av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf); if (!td->firstpass_ctx) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } if (!is_first_pass && i < num_enc_workers) { // Set up sms_tree. if (av1_setup_sms_tree(ppi->cpi, td)) { aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate SMS tree"); } for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) AOM_CHECK_MEM_ERROR( &ppi->error, td->hash_value_buffer[x][y], (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * sizeof(*td->hash_value_buffer[0][0]))); // Allocate frame counters in thread data. AOM_CHECK_MEM_ERROR(&ppi->error, td->counts, aom_calloc(1, sizeof(*td->counts))); // Allocate buffers used by palette coding mode. AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer, aom_memalign(16, sizeof(*td->palette_buffer))); // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are // used in inter frames to store intermediate inter mode prediction // results and are not required for allintra encoding mode. Hence, the // memory allocations for these buffers are avoided for allintra // encoding mode. 
if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) { alloc_obmc_buffers(&td->obmc_buffer, &ppi->error); alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { AOM_CHECK_MEM_ERROR( &ppi->error, td->tmp_pred_bufs[j], aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*td->tmp_pred_bufs[j]))); } } if (is_gradient_caching_for_hog_enabled(ppi->cpi)) { const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome; AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info, aom_malloc(sizeof(*td->pixel_gradient_info) * plane_types * MAX_SB_SQUARE)); } if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) { const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size; const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; AOM_CHECK_MEM_ERROR( &ppi->error, td->src_var_info_of_4x4_sub_blocks, aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) * mi_count_in_sb)); } if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; AOM_CHECK_MEM_ERROR( &ppi->error, td->vt64x64, aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks)); } } } if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) { if (i == 0) { for (int j = 0; j < ppi->num_fp_contexts; j++) { AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx, (FRAME_CONTEXT *)aom_memalign( 16, sizeof(*ppi->parallel_cpi[j]->td.tctx))); } } else { AOM_CHECK_MEM_ERROR( &ppi->error, thread_data->td->tctx, (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx))); } } } // Record the number of workers in encode stage multi-threading for which // allocation is done. p_mt_info->prev_num_enc_workers = num_enc_workers; } void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) { PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); assert(p_mt_info->num_workers == 0); AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers, aom_malloc(num_workers * sizeof(*p_mt_info->workers))); AOM_CHECK_MEM_ERROR( &ppi->error, p_mt_info->tile_thr_data, aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data))); for (int i = 0; i < num_workers; ++i) { AVxWorker *const worker = &p_mt_info->workers[i]; EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; winterface->init(worker); worker->thread_name = "aom enc worker"; thread_data->thread_id = i; // Set the starting tile for each thread. thread_data->start = i; if (i > 0) { // Create threads if (!winterface->reset(worker)) aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } winterface->sync(worker); ++p_mt_info->num_workers; } } // This function will change the state and free the mutex of corresponding // workers and terminate the object. The object can not be re-used unless a call // to reset() is made. void av1_terminate_workers(AV1_PRIMARY *ppi) { PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; for (int t = 0; t < p_mt_info->num_workers; ++t) { AVxWorker *const worker = &p_mt_info->workers[t]; aom_get_worker_interface()->end(worker); } } // This function returns 1 if frame parallel encode is supported for // the current configuration. Returns 0 otherwise. static inline int is_fpmt_config(const AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf) { // FPMT is enabled for AOM_Q and AOM_VBR. // TODO(Tarun): Test and enable resize config. 
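  // The checks below restrict FPMT to two-pass GOOD-quality encodes without
  // SVC, large-scale tiles, resize, error resilience or decoder-model timing
  // info, and require at least two threads with frame-parallel MT enabled.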
if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) { return 0; } if (ppi->use_svc) { return 0; } if (oxcf->tile_cfg.enable_large_scale_tile) { return 0; } if (oxcf->dec_model_cfg.timing_info_present) { return 0; } if (oxcf->mode != GOOD) { return 0; } if (oxcf->tool_cfg.error_resilient_mode) { return 0; } if (oxcf->resize_cfg.resize_mode) { return 0; } if (oxcf->pass != AOM_RC_SECOND_PASS) { return 0; } if (oxcf->max_threads < 2) { return 0; } if (!oxcf->fp_mt) { return 0; } return 1; } int av1_check_fpmt_config(AV1_PRIMARY *const ppi, const AV1EncoderConfig *const oxcf) { if (is_fpmt_config(ppi, oxcf)) return 1; // Reset frame parallel configuration for unsupported config if (ppi->num_fp_contexts > 1) { for (int i = 1; i < ppi->num_fp_contexts; i++) { // Release the previously-used frame-buffer if (ppi->parallel_cpi[i]->common.cur_frame != NULL) { --ppi->parallel_cpi[i]->common.cur_frame->ref_count; ppi->parallel_cpi[i]->common.cur_frame = NULL; } } int cur_gf_index = ppi->cpi->gf_frame_index; int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index); av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index], reset_size); av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size); av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size); memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX, sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) * reset_size * REF_FRAMES); memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX, sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size); ppi->num_fp_contexts = 1; } return 0; } // A large value for threads used to compute the max num_enc_workers // possible for each resolution. #define MAX_THREADS 100 // Computes the max number of enc workers possible for each resolution. static inline int compute_max_num_enc_workers( CommonModeInfoParams *const mi_params, int mib_size_log2) { int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2); int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2); return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows); } // Computes the number of frame parallel(fp) contexts to be created // based on the number of max_enc_workers. int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0; if (!av1_check_fpmt_config(ppi, oxcf)) { return 1; } int max_num_enc_workers = compute_max_num_enc_workers( &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2); // Scaling factors and rounding factors used to tune worker_per_frame // computation. int rounding_factor[2] = { 2, 4 }; int scaling_factor[2] = { 4, 8 }; int is_480p_or_lesser = AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480; int is_sb_64 = 0; if (ppi->cpi != NULL) is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64; // A parallel frame encode has at least 1/4th the // theoretical limit of max enc workers in default case. For resolutions // larger than 480p, if SB size is 64x64, optimal performance is obtained with // limit of 1/8. int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0; int workers_per_frame = AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) / scaling_factor[index]); int max_threads = oxcf->max_threads; int num_fp_contexts = max_threads / workers_per_frame; // Based on empirical results, FPMT gains with multi-tile are significant when // more parallel frames are available. 
Use FPMT with multi-tile encode only // when sufficient threads are available for parallel encode of // MAX_PARALLEL_FRAMES frames. if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) { if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1; } num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES)); // Limit recalculated num_fp_contexts to ppi->num_fp_contexts. num_fp_contexts = (ppi->num_fp_contexts == 1) ? num_fp_contexts : AOMMIN(num_fp_contexts, ppi->num_fp_contexts); if (num_fp_contexts > 1) { ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads); } return num_fp_contexts; } // Computes the number of workers to process each of the parallel frames. static inline int compute_num_workers_per_frame( const int num_workers, const int parallel_frame_count) { // Number of level 2 workers per frame context (floor division). int workers_per_frame = (num_workers / parallel_frame_count); return workers_per_frame; } static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); // Prepare level 1 workers. This function is only called for // parallel_frame_count > 1. This function populates the mt_info structure of // frame level contexts appropriately by dividing the total number of available // workers amongst the frames as level 2 workers. It also populates the hook and // data members of level 1 workers. static inline void prepare_fpmt_workers(AV1_PRIMARY *ppi, AV1_COMP_DATA *first_cpi_data, AVxWorkerHook hook, int parallel_frame_count) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; int num_workers = p_mt_info->num_workers; volatile int frame_idx = 0; volatile int i = 0; while (i < num_workers) { // Assign level 1 worker AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] = &p_mt_info->workers[i]; AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; // This 'aom_internal_error_info' pointer is not derived from the local // pointer ('AV1_COMMON *const cm') to silence the compiler warning // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]". struct aom_internal_error_info *const error = cur_cpi->common.error; // The jmp_buf is valid only within the scope of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error->jmp)) { error->setjmp = 0; restore_workers_after_fpmt(ppi, parallel_frame_count, i); aom_internal_error_copy(&ppi->error, error); } error->setjmp = 1; AV1_COMMON *const cm = &cur_cpi->common; // Assign start of level 2 worker pool mt_info->workers = &p_mt_info->workers[i]; mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i]; // Assign number of workers for each frame in the parallel encode set. mt_info->num_workers = compute_num_workers_per_frame( num_workers - i, parallel_frame_count - frame_idx); for (int j = MOD_FP; j < NUM_MT_MODULES; j++) { mt_info->num_mod_workers[j] = AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]); } if (p_mt_info->cdef_worker != NULL) { mt_info->cdef_worker = &p_mt_info->cdef_worker[i]; // Back up the original cdef_worker pointers. 
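      // (The pointers saved in restore_state_buf are put back by
      // restore_workers_after_fpmt() once this parallel encode set completes
      // or errors out.)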
mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf; const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; plane++) mt_info->restore_state_buf.cdef_colbuf[plane] = mt_info->cdef_worker->colbuf[plane]; } #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { // Back up the original LR buffers before update. int idx = i + mt_info->num_workers - 1; assert(idx < mt_info->lr_row_sync.num_workers); mt_info->restore_state_buf.rst_tmpbuf = mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf; mt_info->restore_state_buf.rlbs = mt_info->lr_row_sync.lrworkerdata[idx].rlbs; // Update LR buffers. mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf; mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs; } #endif i += mt_info->num_workers; // At this stage, the thread specific CDEF buffers for the current frame's // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has // already been allocated across parallel frames. av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync, p_mt_info->num_workers, 0); frame_worker->hook = hook; frame_worker->data1 = cur_cpi; frame_worker->data2 = (frame_idx == 0) ? first_cpi_data : &ppi->parallel_frames_data[frame_idx - 1]; frame_idx++; error->setjmp = 0; } p_mt_info->p_num_workers = parallel_frame_count; } // Launch level 1 workers to perform frame parallel encode. static inline void launch_fpmt_workers(AV1_PRIMARY *ppi) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int num_workers = ppi->p_mt_info.p_num_workers; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; if (i == 0) winterface->execute(worker); else winterface->launch(worker); } } // Restore worker states after parallel encode. static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); (void)parallel_frame_count; PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; int frame_idx = 0; int i = 0; while (i < num_fpmt_workers_prepared) { AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; const AV1_COMMON *const cm = &cur_cpi->common; const int num_planes = av1_num_planes(cm); // Restore the original cdef_worker pointers. if (p_mt_info->cdef_worker != NULL) { mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf; for (int plane = 0; plane < num_planes; plane++) mt_info->cdef_worker->colbuf[plane] = mt_info->restore_state_buf.cdef_colbuf[plane]; } #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { // Restore the original LR buffers. int idx = i + mt_info->num_workers - 1; assert(idx < mt_info->lr_row_sync.num_workers); mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = mt_info->restore_state_buf.rst_tmpbuf; mt_info->lr_row_sync.lrworkerdata[idx].rlbs = mt_info->restore_state_buf.rlbs; } #endif frame_idx++; i += mt_info->num_workers; } } // Synchronize level 1 workers. static inline void sync_fpmt_workers(AV1_PRIMARY *ppi, int frames_in_parallel_set) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int num_workers = ppi->p_mt_info.p_num_workers; int had_error = 0; // Points to error in the earliest display order frame in the parallel set. const struct aom_internal_error_info *error = NULL; // Encoding ends. 
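  // Since the loop below runs from the last worker down to worker 0, the
  // final error assignment comes from the smallest failing index, i.e. the
  // earliest display-order frame in the set.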
for (int i = num_workers - 1; i >= 0; --i) { AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; if (!winterface->sync(worker)) { had_error = 1; error = ppi->parallel_cpi[i]->common.error; } } restore_workers_after_fpmt(ppi, frames_in_parallel_set, ppi->p_mt_info.num_workers); if (had_error) aom_internal_error_copy(&ppi->error, error); } static int get_compressed_data_hook(void *arg1, void *arg2) { AV1_COMP *cpi = (AV1_COMP *)arg1; AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2; int status = av1_get_compressed_data(cpi, cpi_data); // AOM_CODEC_OK(0) means no error. return !status; } // This function encodes the raw frame data for each frame in parallel encode // set, and outputs the frame bit stream to the designated buffers. void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, AV1_COMP_DATA *const first_cpi_data) { // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf // corresponding to frames in the current parallel encode set. int ref_buffers_used_map = 0; int frames_in_parallel_set = av1_init_parallel_frame_context( first_cpi_data, ppi, &ref_buffers_used_map); prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook, frames_in_parallel_set); launch_fpmt_workers(ppi); sync_fpmt_workers(ppi, frames_in_parallel_set); // Release cpi->scaled_ref_buf corresponding to frames in the current parallel // encode set. for (int i = 0; i < frames_in_parallel_set; ++i) { av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]); } av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool, ref_buffers_used_map); } static inline void launch_workers(MultiThreadInfo *const mt_info, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; worker->had_error = 0; if (i == 0) winterface->execute(worker); else winterface->launch(worker); } } static inline void sync_enc_workers(MultiThreadInfo *const mt_info, AV1_COMMON *const cm, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); const AVxWorker *const worker_main = &mt_info->workers[0]; int had_error = worker_main->had_error; struct aom_internal_error_info error_info; // Read the error_info of main thread. if (had_error) { error_info = ((EncWorkerData *)worker_main->data1)->error_info; } // Encoding ends. for (int i = num_workers - 1; i > 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; if (!winterface->sync(worker)) { had_error = 1; error_info = ((EncWorkerData *)worker->data1)->error_info; } } if (had_error) aom_internal_error_copy(cm->error, &error_info); // Restore xd->error_info of the main thread back to cm->error so that the // multithreaded code, when executed using a single thread, has a valid // xd->error_info. MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd; xd->error_info = cm->error; } static inline void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) { for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->mt_info.workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; cpi->intrabc_used |= thread_data->td->intrabc_used; cpi->deltaq_used |= thread_data->td->deltaq_used; // Accumulate rtc counters. 
if (!frame_is_intra_only(&cpi->common)) av1_accumulate_rtc_counters(cpi, &thread_data->td->mb); cpi->palette_pixel_num += thread_data->td->mb.palette_pixels; if (thread_data->td != &cpi->td) { // Keep these conditional expressions in sync with the corresponding ones // in prepare_enc_workers(). if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { aom_free(thread_data->td->mv_costs_alloc); thread_data->td->mv_costs_alloc = NULL; } if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { aom_free(thread_data->td->dv_costs_alloc); thread_data->td->dv_costs_alloc = NULL; } } av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common)); // Accumulate counters. if (i > 0) { av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); accumulate_rd_opt(&cpi->td, thread_data->td); cpi->td.mb.txfm_search_info.txb_split_count += thread_data->td->mb.txfm_search_info.txb_split_count; #if CONFIG_SPEED_STATS cpi->td.mb.txfm_search_info.tx_search_count += thread_data->td->mb.txfm_search_info.tx_search_count; #endif // CONFIG_SPEED_STATS } } } static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; AV1_COMMON *const cm = &cpi->common; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread. thread_data->start = i; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } thread_data->td->intrabc_used = 0; thread_data->td->deltaq_used = 0; thread_data->td->abs_sum_level = 0; thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0; thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0; // Before encoding a frame, copy the thread data from cpi. if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; thread_data->td->rd_counts = cpi->td.rd_counts; thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer; for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { memcpy(thread_data->td->hash_value_buffer[x][y], cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], AOM_BUFFER_SIZE_FOR_BLOCK_HASH * sizeof(*thread_data->td->hash_value_buffer[0][0])); thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] = thread_data->td->hash_value_buffer[x][y]; } } // Keep these conditional expressions in sync with the corresponding ones // in accumulate_counters_enc_workers(). if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { CHECK_MEM_ERROR( cm, thread_data->td->mv_costs_alloc, (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc))); thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc; memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { // Reset dv_costs to NULL for worker threads when dv cost update is // enabled so that only dv_cost_upd_level needs to be checked before the // aom_free() call for the same. 
thread_data->td->mb.dv_costs = NULL; if (av1_need_dv_costs(cpi)) { CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc, (IntraBCMVCosts *)aom_malloc( sizeof(*thread_data->td->dv_costs_alloc))); thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc; memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, sizeof(IntraBCMVCosts)); } } } av1_alloc_mb_data(cpi, &thread_data->td->mb); // Reset rtc counters. av1_init_rtc_counters(&thread_data->td->mb); thread_data->td->mb.palette_pixels = 0; if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } if (i > 0) { thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; for (int j = 0; j < 2; ++j) { thread_data->td->mb.tmp_pred_bufs[j] = thread_data->td->tmp_pred_bufs[j]; } thread_data->td->mb.pixel_gradient_info = thread_data->td->pixel_gradient_info; thread_data->td->mb.src_var_info_of_4x4_sub_blocks = thread_data->td->src_var_info_of_4x4_sub_blocks; thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; for (int j = 0; j < 2; ++j) { thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = thread_data->td->mb.tmp_pred_bufs[j]; } } } } #if !CONFIG_REALTIME_ONLY static inline void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread. thread_data->start = i; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; // Before encoding a frame, copy the thread data from cpi. 
thread_data->td->mb = cpi->td.mb; } av1_alloc_src_diff_buf(cm, &thread_data->td->mb); } } #endif // Computes the number of workers for row multi-threading of encoding stage static inline int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, int max_threads) { TileInfo tile_info; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int total_num_threads_row_mt = 0; for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { av1_tile_init(&tile_info, cm, row, col); const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info); const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info); total_num_threads_row_mt += AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); } } return AOMMIN(max_threads, total_num_threads_row_mt); } // Computes the number of workers for tile multi-threading of encoding stage static inline int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, int max_threads) { const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; return AOMMIN(max_threads, tile_cols * tile_rows); } // Find max worker of all MT stages int av1_get_max_num_workers(const AV1_COMP *cpi) { int max_num_workers = 0; for (int i = MOD_FP; i < NUM_MT_MODULES; i++) max_num_workers = AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers); assert(max_num_workers >= 1); return AOMMIN(max_num_workers, cpi->oxcf.max_threads); } // Computes the number of workers for encoding stage (row/tile multi-threading) static int compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { if (max_workers <= 1) return 1; if (cpi->oxcf.row_mt) return compute_num_enc_row_mt_workers(&cpi->common, max_workers); else return compute_num_enc_tile_mt_workers(&cpi->common, max_workers); } void av1_encode_tiles_mt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int num_workers = mt_info->num_mod_workers[MOD_ENC]; assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < tile_cols * tile_rows)); if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); av1_init_tile_data(cpi); num_workers = AOMMIN(num_workers, mt_info->num_workers); prepare_enc_workers(cpi, enc_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); accumulate_counters_enc_workers(cpi, num_workers); } // Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' // members, so we treat it as an array, and sum over the whole length. void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, const FRAME_COUNTS *counts) { unsigned int *const acc = (unsigned int *)acc_counts; const unsigned int *const cnt = (const unsigned int *)counts; const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; } // Computes the maximum number of sb rows and sb_cols across tiles which are // used to allocate memory for multi-threaded encoding with row-mt=1. 
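// Illustrative example: with 128x128 superblocks (mib_size_log2 = 5), a tile
// spanning mi rows [0, 70) contains CEIL_POWER_OF_TWO(70, 5) = 3 SB rows.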
static inline void compute_max_sb_rows_cols(const AV1_COMMON *cm, int *max_sb_rows_in_tile, int *max_sb_cols_in_tile) { const int tile_rows = cm->tiles.rows; const int mib_size_log2 = cm->seq_params->mib_size_log2; const int num_mi_rows = cm->mi_params.mi_rows; const int *const row_start_sb = cm->tiles.row_start_sb; for (int row = 0; row < tile_rows; row++) { const int mi_row_start = row_start_sb[row] << mib_size_log2; const int mi_row_end = AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); const int num_sb_rows_in_tile = CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2); *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile); } const int tile_cols = cm->tiles.cols; const int num_mi_cols = cm->mi_params.mi_cols; const int *const col_start_sb = cm->tiles.col_start_sb; for (int col = 0; col < tile_cols; col++) { const int mi_col_start = col_start_sb[col] << mib_size_log2; const int mi_col_end = AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols); const int num_sb_cols_in_tile = CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2); *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile); } } #if !CONFIG_REALTIME_ONLY // Computes the number of workers for firstpass stage (row/tile multi-threading) int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int total_num_threads_row_mt = 0; TileInfo tile_info; if (cpi->oxcf.max_threads <= 1) return 1; for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { av1_tile_init(&tile_info, cm, row, col); const int num_mb_rows_in_tile = av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size); const int num_mb_cols_in_tile = av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size); total_num_threads_row_mt += AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile); } } return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); } // Computes the maximum number of mb_rows for row multi-threading of firstpass // stage static inline int fp_compute_max_mb_rows(const AV1_COMMON *cm, BLOCK_SIZE fp_block_size) { const int tile_rows = cm->tiles.rows; const int unit_height_log2 = mi_size_high_log2[fp_block_size]; const int mib_size_log2 = cm->seq_params->mib_size_log2; const int num_mi_rows = cm->mi_params.mi_rows; const int *const row_start_sb = cm->tiles.row_start_sb; int max_mb_rows = 0; for (int row = 0; row < tile_rows; row++) { const int mi_row_start = row_start_sb[row] << mib_size_log2; const int mi_row_end = AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); const int num_mb_rows_in_tile = CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2); max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile); } return max_mb_rows; } #endif static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) { // Pipelining of loop-filtering after encoding is enabled when loop-filter // level is chosen based on quantizer and frame type. It is disabled in case // of 'LOOPFILTER_SELECTIVELY' as the stats collected during encoding stage // decides the filter level. Loop-filtering is disabled in case // of non-reference frames and for frames with intra block copy tool enabled. 
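  // In addition to the level-selection criteria above, the checks below also
  // require REALTIME mode at speed >= 5 and bail out when the loop filter
  // would be skipped for this frame altogether.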
AV1_COMMON *cm = &cpi->common; const int use_loopfilter = is_loopfilter_used(cm); const int use_superres = av1_superres_scaled(cm); const int use_cdef = is_cdef_used(cm); const int use_restoration = is_restoration_used(cm); MultiThreadInfo *const mt_info = &cpi->mt_info; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; const unsigned int skip_apply_postproc_filters = derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, use_superres, use_restoration); mt_info->pipeline_lpf_mt_with_enc = (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) && (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) && (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) && !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc && ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0); if (!mt_info->pipeline_lpf_mt_with_enc) return; set_postproc_filter_default_params(cm); if (!use_loopfilter) return; const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick; assert(method == LPF_PICK_FROM_Q); assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY); av1_pick_filter_level(cpi->source, cpi, method); struct loopfilter *lf = &cm->lf; const int plane_start = 0; const int plane_end = av1_num_planes(cm); int planes_to_lf[MAX_MB_PLANE]; if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, lf->filter_level)) { set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); int lpf_opt_level = get_lpf_opt_level(&cpi->sf); assert(lpf_opt_level == 2); const int start_mi_row = 0; const int end_mi_row = start_mi_row + cm->mi_params.mi_rows; av1_loop_filter_frame_init(cm, plane_start, plane_end); assert(mt_info->num_mod_workers[MOD_ENC] == mt_info->num_mod_workers[MOD_LPF]); loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf, mt_info->num_mod_workers[MOD_LPF], &mt_info->lf_row_sync, lpf_opt_level, cm->seq_params->mib_size_log2); for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; // Initialize loopfilter data thread_data->lf_sync = &mt_info->lf_row_sync; thread_data->lf_data = &thread_data->lf_sync->lfdata[i]; loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd); } } } void av1_encode_tiles_row_mt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; const int sb_rows_in_frame = get_sb_rows_in_frame(cm); int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0; int num_workers = mt_info->num_mod_workers[MOD_ENC]; compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile); const bool alloc_row_mt_mem = (enc_row_mt->allocated_tile_cols != tile_cols || enc_row_mt->allocated_tile_rows != tile_rows || enc_row_mt->allocated_rows != max_sb_rows_in_tile || enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) || enc_row_mt->allocated_sb_rows != sb_rows_in_frame); const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); if (alloc_tile_data) { av1_alloc_tile_data(cpi); } assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); if (alloc_row_mt_mem) { row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile, cpi->oxcf.algo_cfg.cdf_update_mode); } num_workers = AOMMIN(num_workers, mt_info->num_workers); lpf_pipeline_mt_init(cpi, num_workers); 
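  // Per-tile row synchronization state is reset below: num_finished_cols[r]
  // records the last SB column of row r whose encode has completed (-1 when
  // no SB in the row has finished), and next_mi_row marks the first
  // unclaimed row.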
av1_init_tile_data(cpi); memset(thread_id_to_tile_id, -1, sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); memset(enc_row_mt->num_tile_cols_done, 0, sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame); enc_row_mt->row_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; // Initialize num_finished_cols to -1 for all rows. memset(row_mt_sync->num_finished_cols, -1, sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile); row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; row_mt_sync->num_threads_working = 0; row_mt_sync->intrabc_extra_top_right_sb_delay = av1_get_intrabc_extra_top_right_sb_delay(cm); av1_inter_mode_data_init(this_tile); av1_zero_above_context(cm, &cpi->td.mb.e_mbd, this_tile->tile_info.mi_col_start, this_tile->tile_info.mi_col_end, tile_row); } } assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, num_workers); prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi); accumulate_counters_enc_workers(cpi, num_workers); } #if !CONFIG_REALTIME_ONLY static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) { for (int i = num_workers - 1; i >= 0; --i) { EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; if (thread_data->td != &cpi->td) av1_dealloc_src_diff_buf(&thread_data->td->mb, av1_num_planes(&cpi->common)); } } void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; int num_workers = 0; int max_mb_rows = 0; max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size); const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols || enc_row_mt->allocated_tile_rows != tile_rows || enc_row_mt->allocated_rows != max_mb_rows; const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); if (alloc_tile_data) { av1_alloc_tile_data(cpi); } assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); if (alloc_row_mt_mem) { row_mt_mem_alloc(cpi, max_mb_rows, -1, 0); } av1_init_tile_data(cpi); // For pass = 1, compute the no. of workers needed. For single-pass encode // (pass = 0), no. of workers are already computed. if (mt_info->num_mod_workers[MOD_FP] == 0) num_workers = av1_fp_compute_num_enc_workers(cpi); else num_workers = mt_info->num_mod_workers[MOD_FP]; memset(thread_id_to_tile_id, -1, sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); enc_row_mt->firstpass_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; // Initialize num_finished_cols to -1 for all rows. 
memset(row_mt_sync->num_finished_cols, -1, sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows); row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; row_mt_sync->num_threads_working = 0; // intraBC mode is not evaluated during first-pass encoding. Hence, no // additional top-right delay is required. row_mt_sync->intrabc_extra_top_right_sb_delay = 0; } } num_workers = AOMMIN(num_workers, mt_info->num_workers); assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, num_workers); fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); dealloc_thread_data_src_diff_buf(cpi, num_workers); } void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c) { (void)tpl_mt_sync; (void)r; (void)c; } void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, int cols) { (void)tpl_mt_sync; (void)r; (void)c; (void)cols; } void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, int c) { #if CONFIG_MULTITHREAD int nsync = tpl_row_mt_sync->sync_range; if (r) { pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1]; pthread_mutex_lock(mutex); while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync) pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex); pthread_mutex_unlock(mutex); } #else (void)tpl_row_mt_sync; (void)r; (void)c; #endif // CONFIG_MULTITHREAD } void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, int c, int cols) { #if CONFIG_MULTITHREAD int nsync = tpl_row_mt_sync->sync_range; int cur; // Only signal when there are enough encoded blocks for next row to run. int sig = 1; if (c < cols - 1) { cur = c; if (c % nsync) sig = 0; } else { cur = cols + nsync; } if (sig) { pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); // When a thread encounters an error, num_finished_cols[r] is set to maximum // column number. In this case, the AOMMAX operation here ensures that // num_finished_cols[r] is not overwritten with a smaller value thus // preventing the infinite waiting of threads in the relevant sync_read() // function. tpl_row_mt_sync->num_finished_cols[r] = AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&tpl_row_mt_sync->cond_[r]); pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]); } #else (void)tpl_row_mt_sync; (void)r; (void)c; (void)cols; #endif // CONFIG_MULTITHREAD } static inline void set_mode_estimation_done(AV1_COMP *cpi) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; TplParams *const tpl_data = &cpi->ppi->tpl_data; const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const int mi_height = mi_size_high[bsize]; AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; const int tplb_cols_in_tile = ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); // In case of tpl row-multithreading, due to top-right dependency, the worker // on an mb_row waits for the completion of the tpl processing of the top and // top-right blocks. Hence, in case a thread (main/worker) encounters an // error, update that the tpl processing of every mb_row in the frame is // complete in order to avoid dependent workers waiting indefinitely. 
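  // Mirroring set_encoding_done(), writing tplb_cols_in_tile - 1 for every
  // block row below unblocks any worker waiting in the tpl sync_read.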
for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height, tplb_row++) { (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, tplb_cols_in_tile - 1, tplb_cols_in_tile); } } // Each worker calls tpl_worker_hook() and computes the tpl data. static int tpl_worker_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *thread_data = (EncWorkerData *)arg1; AV1_COMP *cpi = thread_data->cpi; AV1_COMMON *cm = &cpi->common; MACROBLOCK *x = &thread_data->td->mb; MACROBLOCKD *xd = &x->e_mbd; TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers; CommonModeInfoParams *mi_params = &cm->mi_params; int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; struct aom_internal_error_info *const error_info = &thread_data->error_info; xd->error_info = error_info; AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; (void)tpl_row_mt; #if CONFIG_MULTITHREAD pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_; #endif // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(tpl_error_mutex_); tpl_row_mt->tpl_mt_exit = true; pthread_mutex_unlock(tpl_error_mutex_); #endif set_mode_estimation_done(cpi); return 0; } error_info->setjmp = 1; BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); TX_SIZE tx_size = max_txsize_lookup[bsize]; int mi_height = mi_size_high[bsize]; av1_init_tpl_txfm_stats(tpl_txfm_stats); for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; mi_row += num_active_workers * mi_height) { // Motion estimation row boundary av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, cpi->oxcf.border_in_pixels); xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, bsize, tx_size); } error_info->setjmp = 0; return 1; } // Deallocate tpl synchronization related mutex and data. void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) { assert(tpl_sync != NULL); #if CONFIG_MULTITHREAD if (tpl_sync->mutex_ != NULL) { for (int i = 0; i < tpl_sync->rows; ++i) pthread_mutex_destroy(&tpl_sync->mutex_[i]); aom_free(tpl_sync->mutex_); } if (tpl_sync->cond_ != NULL) { for (int i = 0; i < tpl_sync->rows; ++i) pthread_cond_destroy(&tpl_sync->cond_[i]); aom_free(tpl_sync->cond_); } #endif // CONFIG_MULTITHREAD aom_free(tpl_sync->num_finished_cols); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. av1_zero(*tpl_sync); } // Allocate memory for tpl row synchronization. 
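// One mutex/cond pair is created per MB row, and sync_range is fixed at 1:
// a row may process block column c once the row above has finished column
// c + 1 (the top-right dependency).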
static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, int mb_rows) { tpl_sync->rows = mb_rows; #if CONFIG_MULTITHREAD { CHECK_MEM_ERROR(cm, tpl_sync->mutex_, aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows)); if (tpl_sync->mutex_) { for (int i = 0; i < mb_rows; ++i) pthread_mutex_init(&tpl_sync->mutex_[i], NULL); } CHECK_MEM_ERROR(cm, tpl_sync->cond_, aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows)); if (tpl_sync->cond_) { for (int i = 0; i < mb_rows; ++i) pthread_cond_init(&tpl_sync->cond_[i], NULL); } } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols, aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows)); // Set up nsync. tpl_sync->sync_range = 1; } // Each worker is prepared by assigning the hook function and individual thread // data. static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread. thread_data->start = i; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } // Before encoding a frame, copy the thread data from cpi. if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; // OBMC buffers are used only to init MS params and remain unused when // called from tpl, hence set the buffers to defaults. av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers, cpi->ppi->tpl_data.tpl_bsize_1d)) { aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, "Error allocating tpl data"); } thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; } } } #if CONFIG_BITRATE_ACCURACY // Accumulate transform stats after tpl. static void tpl_accumulate_txfm_stats(ThreadData *main_td, const MultiThreadInfo *mt_info, int num_workers) { TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; ThreadData *td = thread_data->td; if (td != main_td) { const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); } } } #endif // CONFIG_BITRATE_ACCURACY // Implements multi-threading for tpl. void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *mi_params = &cm->mi_params; MultiThreadInfo *mt_info = &cpi->mt_info; TplParams *tpl_data = &cpi->ppi->tpl_data; AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; int mb_rows = mi_params->mb_rows; int num_workers = AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers); if (mb_rows != tpl_sync->rows) { av1_tpl_dealloc(tpl_sync); av1_tpl_alloc(tpl_sync, cm, mb_rows); } tpl_sync->num_threads_working = num_workers; mt_info->tpl_row_mt.tpl_mt_exit = false; // Initialize cur_mb_col to -1 for all MB rows. 
memset(tpl_sync->num_finished_cols, -1, sizeof(*tpl_sync->num_finished_cols) * mb_rows); prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); #if CONFIG_BITRATE_ACCURACY tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); #endif // CONFIG_BITRATE_ACCURACY for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; ThreadData *td = thread_data->td; if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); } } // Deallocate memory for temporal filter multi-thread synchronization. void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) { assert(tf_sync != NULL); #if CONFIG_MULTITHREAD if (tf_sync->mutex_ != NULL) { pthread_mutex_destroy(tf_sync->mutex_); aom_free(tf_sync->mutex_); } #endif // CONFIG_MULTITHREAD tf_sync->next_tf_row = 0; } // Checks if a job is available. If job is available, // populates next_tf_row and returns 1, else returns 0. static inline int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync, int *current_mb_row, int mb_rows) { int do_next_row = 0; #if CONFIG_MULTITHREAD pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_; pthread_mutex_lock(tf_mutex_); #endif if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) { *current_mb_row = tf_mt_sync->next_tf_row; tf_mt_sync->next_tf_row++; do_next_row = 1; } #if CONFIG_MULTITHREAD pthread_mutex_unlock(tf_mutex_); #endif return do_next_row; } // Hook function for each thread in temporal filter multi-threading. static int tf_worker_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *thread_data = (EncWorkerData *)arg1; AV1_COMP *cpi = thread_data->cpi; ThreadData *td = thread_data->td; TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync; const struct scale_factors *scale = &cpi->tf_ctx.sf; #if CONFIG_MULTITHREAD pthread_mutex_t *tf_mutex_ = tf_sync->mutex_; #endif MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; struct aom_internal_error_info *const error_info = &thread_data->error_info; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(tf_mutex_); tf_sync->tf_mt_exit = true; pthread_mutex_unlock(tf_mutex_); #endif return 0; } error_info->setjmp = 1; const int num_planes = av1_num_planes(&cpi->common); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); MACROBLOCKD *mbd = &td->mb.e_mbd; uint8_t *input_buffer[MAX_MB_PLANE]; MB_MODE_INFO **input_mb_mode_info; tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes); tf_setup_macroblockd(mbd, &td->tf_data, scale); int current_mb_row = -1; while (tf_get_next_job(tf_sync, ¤t_mb_row, tf_ctx->mb_rows)) av1_tf_do_filtering_row(cpi, td, current_mb_row); tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes); error_info->setjmp = 0; return 1; } // Assigns temporal filter hook function and thread data to each worker. 
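// Worker 0 reuses the main thread's ThreadData (cpi->td); only the remaining
// workers point at their own copies (original_td) and therefore need separate
// temporal filter scratch buffers, allocated below via
// tf_alloc_and_reset_data(). The same ownership convention is followed by the
// other prepare_*_workers() helpers in this file.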
static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers, int is_highbitdepth) { MultiThreadInfo *mt_info = &cpi->mt_info; mt_info->tf_sync.next_tf_row = 0; mt_info->tf_sync.tf_mt_exit = false; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread. thread_data->start = i; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } // Before encoding a frame, copy the thread data from cpi. if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; // OBMC buffers are used only to init MS params and remain unused when // called from tf, hence set the buffers to defaults. av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); if (!tf_alloc_and_reset_data(&thread_data->td->tf_data, cpi->tf_ctx.num_pels, is_highbitdepth)) { aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, "Error allocating temporal filter data"); } } } } // Deallocate thread specific data for temporal filter. static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers, int is_highbitdepth) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; ThreadData *td = thread_data->td; if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth); } } // Accumulate sse and sum after temporal filtering. static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) { FRAME_DIFF *total_diff = &cpi->td.tf_data.diff; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->mt_info.workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; ThreadData *td = thread_data->td; FRAME_DIFF *diff = &td->tf_data.diff; if (td != &cpi->td) { total_diff->sse += diff->sse; total_diff->sum += diff->sum; } } } // Implements multi-threading for temporal filter. void av1_tf_do_filtering_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MultiThreadInfo *mt_info = &cpi->mt_info; const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth; int num_workers = AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers); prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth); launch_workers(mt_info, num_workers); sync_enc_workers(mt_info, cm, num_workers); tf_accumulate_frame_diff(cpi, num_workers); tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth); } // Checks if a job is available in the current direction. If a job is available, // frame_idx will be populated and returns 1, else returns 0. static inline int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) { GlobalMotionInfo *gm_info = &cpi->gm_info; GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; int total_refs = gm_info->num_ref_frames[cur_dir]; int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) { *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame; job_info->next_frame_to_process[cur_dir] += 1; return 1; } return 0; } // Switches the current direction and calls the function get_next_gm_job() if // the speed feature 'prune_ref_frame_for_gm_search' is not set. 
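// Global motion jobs are split into two directions (past and future reference
// frames). A worker first drains the direction assigned to it in
// assign_thread_to_dir(); once that direction is empty it may take jobs from
// the other direction here, except when prune_ref_frame_for_gm_search is
// enabled. Presumably this is because that speed feature prunes the remaining
// references of a direction as soon as one of them yields a simple
// (<= TRANSLATION) model, which works best when a single worker walks each
// direction in order.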
static inline void switch_direction(AV1_COMP *cpi, int *frame_idx, int *cur_dir) { if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return; // Switch the direction and get next job *cur_dir = !(*cur_dir); get_next_gm_job(cpi, frame_idx, *(cur_dir)); } // Hook function for each thread in global motion multi-threading. static int gm_mt_worker_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *thread_data = (EncWorkerData *)arg1; AV1_COMP *cpi = thread_data->cpi; GlobalMotionInfo *gm_info = &cpi->gm_info; AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; GlobalMotionJobInfo *job_info = &gm_sync->job_info; int thread_id = thread_data->thread_id; GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; #if CONFIG_MULTITHREAD pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_; #endif MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; struct aom_internal_error_info *const error_info = &thread_data->error_info; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(gm_mt_mutex_); gm_sync->gm_mt_exit = true; pthread_mutex_unlock(gm_mt_mutex_); #endif return 0; } error_info->setjmp = 1; int cur_dir = job_info->thread_id_to_dir[thread_id]; bool gm_mt_exit = false; while (1) { int ref_buf_idx = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(gm_mt_mutex_); #endif gm_mt_exit = gm_sync->gm_mt_exit; // Populates ref_buf_idx(the reference frame type) for which global motion // estimation will be done. if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) { // No jobs are available for the current direction. Switch // to other direction and get the next job, if available. switch_direction(cpi, &ref_buf_idx, &cur_dir); } #if CONFIG_MULTITHREAD pthread_mutex_unlock(gm_mt_mutex_); #endif // When gm_mt_exit is set to true, other workers need not pursue any // further jobs. if (gm_mt_exit || ref_buf_idx == -1) break; // Compute global motion for the given ref_buf_idx. av1_compute_gm_for_valid_ref_frames( cpi, error_info, gm_info->ref_buf, ref_buf_idx, gm_thread_data->motion_models, gm_thread_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h); #if CONFIG_MULTITHREAD pthread_mutex_lock(gm_mt_mutex_); #endif // If global motion w.r.t. current ref frame is // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t // the remaining ref frames in that direction. if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION) job_info->early_exit[cur_dir] = 1; #if CONFIG_MULTITHREAD pthread_mutex_unlock(gm_mt_mutex_); #endif } error_info->setjmp = 0; return 1; } // Assigns global motion hook function and thread data to each worker. static inline void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; mt_info->gm_sync.gm_mt_exit = false; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread. 
thread_data->start = i; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } if (thread_data->td != &cpi->td) gm_alloc_data(cpi, &thread_data->td->gm_data); } } // Assigns available threads to past/future direction. static inline void assign_thread_to_dir(int8_t *thread_id_to_dir, int num_workers) { int8_t frame_dir_idx = 0; for (int i = 0; i < num_workers; i++) { thread_id_to_dir[i] = frame_dir_idx++; if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0; } } // Computes number of workers for global motion multi-threading. static inline int compute_gm_workers(const AV1_COMP *cpi) { int total_refs = cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1]; int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search ? AOMMIN(MAX_DIRECTIONS, total_refs) : total_refs; num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers); return (num_gm_workers); } // Frees the memory allocated for each worker in global motion multi-threading. static inline void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int j = 0; j < num_workers; j++) { EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; ThreadData *td = thread_data->td; if (td != &cpi->td) gm_dealloc_data(&td->gm_data); } } // Implements multi-threading for global motion. void av1_global_motion_estimation_mt(AV1_COMP *cpi) { GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info; av1_zero(*job_info); int num_workers = compute_gm_workers(cpi); assign_thread_to_dir(job_info->thread_id_to_dir, num_workers); prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers); gm_dealloc_thread_data(cpi, num_workers); } #endif // !CONFIG_REALTIME_ONLY static inline int get_next_job_allintra( AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end, int *current_mi_row, int mib_size) { if (row_mt_sync->next_mi_row < mi_row_end) { *current_mi_row = row_mt_sync->next_mi_row; row_mt_sync->num_threads_working++; row_mt_sync->next_mi_row += mib_size; return 1; } return 0; } static inline void prepare_wiener_var_workers(AV1_COMP *const cpi, AVxWorkerHook hook, const int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; worker->hook = hook; worker->data1 = thread_data; worker->data2 = NULL; thread_data->thread_id = i; // Set the starting tile for each thread, in this case the preprocessing // stage does not need tiles. So we set it to 0. 
thread_data->start = 0; thread_data->cpi = cpi; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } if (thread_data->td != &cpi->td) { thread_data->td->mb = cpi->td.mb; av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td); } } } static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) { const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; const BLOCK_SIZE bsize = cpi->weber_bsize; const int mb_step = mi_size_wide[bsize]; assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL); const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; const int mt_unit_cols = (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step; const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; // Update the wiener variance computation of every row in the frame to // indicate that it is complete in order to avoid dependent workers waiting // indefinitely. for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows; mi_row += mb_step, ++mt_thread_id) { intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, mt_unit_cols - 1, mt_unit_cols); } } static int cal_mb_wiener_var_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; MACROBLOCK *x = &thread_data->td->mb; MACROBLOCKD *xd = &x->e_mbd; const BLOCK_SIZE bsize = cpi->weber_bsize; const int mb_step = mi_size_wide[bsize]; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; (void)enc_row_mt; #if CONFIG_MULTITHREAD pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_; #endif struct aom_internal_error_info *const error_info = &thread_data->error_info; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex); enc_row_mt->mb_wiener_mt_exit = true; pthread_mutex_unlock(enc_row_mt_mutex); #endif set_mb_wiener_var_calc_done(cpi); return 0; } error_info->setjmp = 1; DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); double sum_rec_distortion = 0; double sum_est_rate = 0; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex); #endif int has_jobs = enc_row_mt->mb_wiener_mt_exit ? 0 : get_next_job_allintra(intra_row_mt_sync, cpi->common.mi_params.mi_rows, ¤t_mi_row, mb_step); #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex); #endif if (!has_jobs) break; // TODO(chengchen): properly accumulate the distortion and rate. 
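// Until that TODO is addressed, the sums below are only accumulated locally
// per worker and then discarded; av1_calc_mb_wiener_var_mt() likewise ignores
// its sum_rec_distortion / sum_est_rate arguments.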
av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff, qcoeff, dqcoeff, &sum_rec_distortion, &sum_est_rate, thread_data->td->wiener_tmp_pred_buf); #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex); #endif intra_row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex); #endif } error_info->setjmp = 0; return 1; } static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) { av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync); MultiThreadInfo *mt_info = &cpi->mt_info; for (int j = 0; j < num_workers; ++j) { EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; ThreadData *td = thread_data->td; if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td); } } // This function is the multi-threading version of computing the wiener // variance. // Note that the wiener variance is used for allintra mode (1 pass) and its // computation is before the frame encoding, so we don't need to consider // the number of tiles, instead we allocate all available threads to // the computation. void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, double *sum_rec_distortion, double *sum_est_rate) { (void)sum_rec_distortion; (void)sum_est_rate; AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; // TODO(chengchen): the memory usage could be improved. const int mi_rows = cm->mi_params.mi_rows; row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows); intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0; intra_row_mt_sync->num_threads_working = num_workers; intra_row_mt_sync->next_mi_row = 0; memset(intra_row_mt_sync->num_finished_cols, -1, sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows); mt_info->enc_row_mt.mb_wiener_mt_exit = false; prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers); launch_workers(mt_info, num_workers); sync_enc_workers(mt_info, cm, num_workers); dealloc_mb_wiener_var_mt_data(cpi, num_workers); } // Compare and order tiles based on absolute sum of tx coeffs. static int compare_tile_order(const void *a, const void *b) { const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; if (tile_a->abs_sum_level > tile_b->abs_sum_level) return -1; else if (tile_a->abs_sum_level == tile_b->abs_sum_level) return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); else return 1; } // Get next tile index to be processed for pack bitstream static inline int get_next_pack_bs_tile_idx( AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { assert(pack_bs_sync->next_job_idx <= num_tiles); if (pack_bs_sync->next_job_idx == num_tiles) return -1; return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] .tile_idx; } // Calculates bitstream chunk size based on total buffer size and tile or tile // group size. 
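// The chunk is prorated linearly by area in mi units:
//   this_chunk_size = max_buf_size * tg_or_tile_size / frame_or_tg_size
// For illustration (numbers hypothetical): with a 1,000,000 byte buffer and a
// tile group covering a quarter of the frame, the group gets ~250,000 bytes.
// The last chunk simply takes whatever space remains, so integer rounding
// never leaves part of the buffer unassigned.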
static inline size_t get_bs_chunk_size(int tg_or_tile_size, const int frame_or_tg_size, size_t *remain_buf_size, size_t max_buf_size, int is_last_chunk) { size_t this_chunk_size; assert(*remain_buf_size > 0); if (is_last_chunk) { this_chunk_size = *remain_buf_size; *remain_buf_size = 0; } else { const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; this_chunk_size = (size_t)(size_scale / frame_or_tg_size); *remain_buf_size -= this_chunk_size; assert(*remain_buf_size > 0); } assert(this_chunk_size > 0); return this_chunk_size; } // Initializes params required for pack bitstream tile. static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, PackBSParams *const pack_bs_params_arr, uint8_t obu_extn_header) { MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; const int num_tiles = tiles->cols * tiles->rows; // Fixed size tile groups for the moment const int num_tg_hdrs = cpi->num_tg; // Tile group size in terms of number of tiles. const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; uint8_t *tile_dst = dst; uint8_t *tile_data_curr = dst; // Max tile group count can not be more than MAX_TILES. int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units int tile_idx; int tg_idx = 0; int tile_count_in_tg = 0; int new_tg = 1; // Populate pack bitstream params of all tiles. for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; // Calculate tile size in mi units. const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * (tile_info->mi_row_end - tile_info->mi_row_start); int is_last_tile_in_tg = 0; tile_count_in_tg++; if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) is_last_tile_in_tg = 1; // Populate pack bitstream params of this tile. pack_bs_params->curr_tg_hdr_size = 0; pack_bs_params->obu_extn_header = obu_extn_header; pack_bs_params->saved_wb = saved_wb; pack_bs_params->obu_header_size = 0; pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; pack_bs_params->new_tg = new_tg; pack_bs_params->tile_col = tile_info->tile_col; pack_bs_params->tile_row = tile_info->tile_row; pack_bs_params->tile_size_mi = tile_size_mi; tg_size_mi[tg_idx] += tile_size_mi; if (new_tg) new_tg = 0; if (is_last_tile_in_tg) { tile_count_in_tg = 0; new_tg = 1; tg_idx++; } } assert(cpi->available_bs_size > 0); size_t tg_buf_size[MAX_TILES] = { 0 }; size_t max_buf_size = cpi->available_bs_size; size_t remain_buf_size = max_buf_size; const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; tile_idx = 0; // Prepare obu, tile group and frame header of each tile group. for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; int is_last_tg = tg_idx == cpi->num_tg - 1; // Prorate bitstream buffer size based on tile group size and available // buffer size. This buffer will be used to store headers and tile data. tg_buf_size[tg_idx] = get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, max_buf_size, is_last_tg); pack_bs_params->dst = tile_dst; pack_bs_params->tile_data_curr = tile_dst; // Write obu, tile group and frame header at first tile in the tile // group. 
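// av1_write_obu_tg_tile_headers() emits the OBU and tile group headers at the
// start of this group's chunk; the header size is then subtracted from
// tg_buf_size so that the per-tile proration in the following loop divides up
// only the space left for tile payload data.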
av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); tile_dst += tg_buf_size[tg_idx]; // Exclude headers from tile group buffer size. tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; tile_idx += tg_size_in_tiles; } tg_idx = 0; // Calculate bitstream buffer size of each tile in the tile group. for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; if (pack_bs_params->new_tg) { max_buf_size = tg_buf_size[tg_idx]; remain_buf_size = max_buf_size; } // Prorate bitstream buffer size of this tile based on tile size and // available buffer size. For this proration, header size is not accounted. const size_t tile_buf_size = get_bs_chunk_size( pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size, max_buf_size, pack_bs_params->is_last_tile_in_tg); pack_bs_params->tile_buf_size = tile_buf_size; // Update base address of bitstream buffer for tile and tile group. if (pack_bs_params->new_tg) { tile_dst = pack_bs_params->dst; tile_data_curr = pack_bs_params->tile_data_curr; // Account header size in first tile of a tile group. pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size; } else { pack_bs_params->dst = tile_dst; pack_bs_params->tile_data_curr = tile_data_curr; } if (pack_bs_params->is_last_tile_in_tg) tg_idx++; tile_dst += pack_bs_params->tile_buf_size; } } // Worker hook function of pack bitsteam multithreading. static int pack_bs_worker_hook(void *arg1, void *arg2) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; PackBSParams *const pack_bs_params = (PackBSParams *)arg2; AV1_COMP *const cpi = thread_data->cpi; AV1_COMMON *const cm = &cpi->common; AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync; const CommonTileParams *const tiles = &cm->tiles; const int num_tiles = tiles->cols * tiles->rows; #if CONFIG_MULTITHREAD pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_; #endif MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; struct aom_internal_error_info *const error_info = &thread_data->error_info; xd->error_info = error_info; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(pack_bs_mutex); pack_bs_sync->pack_bs_mt_exit = true; pthread_mutex_unlock(pack_bs_mutex); #endif return 0; } error_info->setjmp = 1; while (1) { #if CONFIG_MULTITHREAD pthread_mutex_lock(pack_bs_mutex); #endif const int tile_idx = pack_bs_sync->pack_bs_mt_exit ? -1 : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles); #if CONFIG_MULTITHREAD pthread_mutex_unlock(pack_bs_mutex); #endif // When pack_bs_mt_exit is set to true, other workers need not pursue any // further jobs. if (tile_idx == -1) break; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]); } error_info->setjmp = 0; return 1; } // Prepares thread data and workers of pack bitsteam multithreading. 
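// Tiles are queued in descending order of abs_sum_level (the absolute sum of
// transform coefficient levels gathered during encoding), which acts as a
// rough proxy for packing cost, so the heaviest tiles are likely to be handed
// to workers first.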
static void prepare_pack_bs_workers(AV1_COMP *const cpi, PackBSParams *const pack_bs_params, AVxWorkerHook hook, const int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; if (i == 0) { thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; } if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb; thread_data->cpi = cpi; thread_data->start = i; thread_data->thread_id = i; av1_reset_pack_bs_thread_data(thread_data->td); worker->hook = hook; worker->data1 = thread_data; worker->data2 = pack_bs_params; } AV1_COMMON *const cm = &cpi->common; AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; pack_bs_sync->next_job_idx = 0; pack_bs_sync->pack_bs_mt_exit = false; PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; // Reset tile order data of pack bitstream av1_zero_array(pack_bs_tile_order, num_tiles); // Populate pack bitstream tile order structure for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) { pack_bs_tile_order[tile_idx].abs_sum_level = cpi->tile_data[tile_idx].abs_sum_level; pack_bs_tile_order[tile_idx].tile_idx = tile_idx; } // Sort tiles in descending order based on tile area. qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order), compare_tile_order); } // Accumulates data after pack bitsteam processing. static void accumulate_pack_bs_data( AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr, uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start, const int num_workers) { const AV1_COMMON *const cm = &cpi->common; const CommonTileParams *const tiles = &cm->tiles; const int tile_count = tiles->cols * tiles->rows; // Fixed size tile groups for the moment size_t curr_tg_data_size = 0; int is_first_tg = 1; uint8_t *curr_tg_start = dst; size_t src_offset = 0; size_t dst_offset = 0; for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { // PackBSParams stores all parameters required to pack tile and header // info. const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; uint32_t tile_size = 0; if (pack_bs_params->new_tg) { curr_tg_start = dst + *total_size; curr_tg_data_size = pack_bs_params->curr_tg_hdr_size; *tile_data_start += pack_bs_params->curr_tg_hdr_size; *obu_header_size = pack_bs_params->obu_header_size; } curr_tg_data_size += pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 
0 : 4); if (pack_bs_params->buf.size > *max_tile_size) { *largest_tile_id = tile_idx; *max_tile_size = (unsigned int)pack_bs_params->buf.size; } tile_size += (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; // Pack all the chunks of tile bitstreams together if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); if (pack_bs_params->is_last_tile_in_tg) av1_write_last_tile_info( cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, curr_tg_start, &tile_size, tile_data_start, largest_tile_id, &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); src_offset += pack_bs_params->tile_buf_size; dst_offset += tile_size; *total_size += tile_size; } // Accumulate thread data MultiThreadInfo *const mt_info = &cpi->mt_info; for (int idx = num_workers - 1; idx >= 0; idx--) { ThreadData const *td = mt_info->tile_thr_data[idx].td; av1_accumulate_pack_bs_thread_data(cpi, td); } } void av1_write_tile_obu_mt( AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start, const int num_workers) { MultiThreadInfo *const mt_info = &cpi->mt_info; PackBSParams pack_bs_params[MAX_TILES]; uint32_t tile_size[MAX_TILES] = { 0 }; for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, num_workers); launch_workers(mt_info, num_workers); sync_enc_workers(mt_info, &cpi->common, num_workers); accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, largest_tile_id, max_tile_size, obu_header_size, tile_data_start, num_workers); } // Deallocate memory for CDEF search multi-thread synchronization. void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { (void)cdef_sync; assert(cdef_sync != NULL); #if CONFIG_MULTITHREAD if (cdef_sync->mutex_ != NULL) { pthread_mutex_destroy(cdef_sync->mutex_); aom_free(cdef_sync->mutex_); } #endif // CONFIG_MULTITHREAD } // Updates the row and column indices of the next job to be processed. // Also updates end_of_frame flag when the processing of all blocks is complete. static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { cdef_sync->fbc++; if (cdef_sync->fbc == nhfb) { cdef_sync->fbr++; if (cdef_sync->fbr == nvfb) { cdef_sync->end_of_frame = 1; } else { cdef_sync->fbc = 0; } } } // Initializes cdef_sync parameters. static inline void cdef_reset_job_info(AV1CdefSync *cdef_sync) { #if CONFIG_MULTITHREAD if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); #endif // CONFIG_MULTITHREAD cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; cdef_sync->cdef_mt_exit = false; } // Checks if a job is available. If job is available, // populates next job information and returns 1, else returns 0. static inline int cdef_get_next_job(AV1CdefSync *cdef_sync, CdefSearchCtx *cdef_search_ctx, volatile int *cur_fbr, volatile int *cur_fbc, volatile int *sb_count) { #if CONFIG_MULTITHREAD pthread_mutex_lock(cdef_sync->mutex_); #endif // CONFIG_MULTITHREAD int do_next_block = 0; const int nvfb = cdef_search_ctx->nvfb; const int nhfb = cdef_search_ctx->nhfb; // If a block is skip, do not process the block and // check the skip condition for the next block. 
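// Jobs are 64x64 filter-block units visited in raster order: fbr/fbc index
// the next block and nvfb/nhfb give the vertical/horizontal block counts.
// Blocks for which cdef_sb_skip() reports nothing to filter are stepped over
// here without being handed to a worker.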
while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame && cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr, cdef_sync->fbc)) { update_next_job_info(cdef_sync, nvfb, nhfb); } // Populates information needed for current job and update the row, // column indices of the next block to be processed. if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) { do_next_block = 1; *cur_fbr = cdef_sync->fbr; *cur_fbc = cdef_sync->fbc; *sb_count = cdef_search_ctx->sb_count; cdef_search_ctx->sb_count++; update_next_job_info(cdef_sync, nvfb, nhfb); } #if CONFIG_MULTITHREAD pthread_mutex_unlock(cdef_sync->mutex_); #endif // CONFIG_MULTITHREAD return do_next_block; } // Hook function for each thread in CDEF search multi-threading. static int cdef_filter_block_worker_hook(void *arg1, void *arg2) { EncWorkerData *thread_data = (EncWorkerData *)arg1; AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2; #if CONFIG_MULTITHREAD pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_; #endif struct aom_internal_error_info *const error_info = &thread_data->error_info; CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. if (setjmp(error_info->jmp)) { error_info->setjmp = 0; #if CONFIG_MULTITHREAD pthread_mutex_lock(cdef_mutex_); cdef_sync->cdef_mt_exit = true; pthread_mutex_unlock(cdef_mutex_); #endif return 0; } error_info->setjmp = 1; volatile int cur_fbr, cur_fbc, sb_count; while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc, &sb_count)) { av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc, sb_count); } error_info->setjmp = 0; return 1; } // Assigns CDEF search hook function and thread data to each worker. static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { MultiThreadInfo *mt_info = &cpi->mt_info; for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *worker = &mt_info->workers[i]; EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; thread_data->cpi = cpi; worker->hook = hook; worker->data1 = thread_data; worker->data2 = &mt_info->cdef_sync; } } // Implements multi-threading for CDEF search. void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) { MultiThreadInfo *mt_info = &cpi->mt_info; AV1CdefSync *cdef_sync = &mt_info->cdef_sync; const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH]; cdef_reset_job_info(cdef_sync); prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers); launch_workers(mt_info, num_workers); sync_enc_workers(mt_info, &cpi->common, num_workers); } // Computes num_workers for temporal filter multi-threading. static inline int compute_num_tf_workers(const AV1_COMP *cpi) { // For single-pass encode, using no. of workers as per tf block size was not // found to improve speed. Hence the thread assignment for single-pass encode // is kept based on compute_num_enc_workers(). if (cpi->oxcf.pass < AOM_RC_SECOND_PASS) return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); if (cpi->oxcf.max_threads <= 1) return 1; const int frame_height = cpi->common.height; const BLOCK_SIZE block_size = TF_BLOCK_SIZE; const int mb_height = block_size_high[block_size]; const int mb_rows = get_num_blocks(frame_height, mb_height); return AOMMIN(cpi->oxcf.max_threads, mb_rows); } // Computes num_workers for tpl multi-threading. 
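// Like the loop filter, CDEF and loop restoration helpers below, this simply
// falls back to the generic encoder worker count.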
static inline int compute_num_tpl_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop filter multi-threading. static inline int compute_num_lf_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for cdef multi-threading. static inline int compute_num_cdef_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for loop-restoration multi-threading. static inline int compute_num_lr_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } // Computes num_workers for pack bitstream multi-threading. static inline int compute_num_pack_bs_workers(AV1_COMP *cpi) { if (cpi->oxcf.max_threads <= 1) return 1; return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); } // Computes num_workers for all intra multi-threading. static inline int compute_num_ai_workers(AV1_COMP *cpi) { if (cpi->oxcf.max_threads <= 1) return 1; // The multi-threading implementation of deltaq-mode = 3 in allintra // mode is based on row multi threading. if (!cpi->oxcf.row_mt) return 1; cpi->weber_bsize = BLOCK_8X8; const BLOCK_SIZE bsize = cpi->weber_bsize; const int mb_step = mi_size_wide[bsize]; const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step; return AOMMIN(num_mb_rows, cpi->oxcf.max_threads); } static int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { int num_mod_workers = 0; switch (mod_name) { case MOD_FP: if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS) num_mod_workers = 0; else num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads); break; case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break; case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break; case MOD_GME: num_mod_workers = 1; break; case MOD_ENC: num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads); break; case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break; case MOD_CDEF_SEARCH: num_mod_workers = compute_num_cdef_workers(cpi); break; case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; case MOD_FRAME_ENC: num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC]; break; case MOD_AI: if (cpi->oxcf.pass == AOM_RC_ONE_PASS) { num_mod_workers = compute_num_ai_workers(cpi); } else { num_mod_workers = 0; } break; default: assert(0); break; } return (num_mod_workers); } // Computes the number of workers for each MT modules in the encoder void av1_compute_num_workers_for_mt(AV1_COMP *cpi) { for (int i = MOD_FP; i < NUM_MT_MODULES; i++) { cpi->ppi->p_mt_info.num_mod_workers[i] = compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i); } } aom-3.12.1/av1/encoder/ethread.h000066400000000000000000000105431477627663500162770ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_ETHREAD_H_ #define AOM_AV1_ENCODER_ETHREAD_H_ #ifdef __cplusplus extern "C" { #endif struct AV1_COMP; struct ThreadData; typedef struct EncWorkerData { struct AV1_COMP *cpi; struct ThreadData *td; struct ThreadData *original_td; struct aom_internal_error_info error_info; AV1LfSync *lf_sync; LFWorkerData *lf_data; int start; int thread_id; } EncWorkerData; void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c); void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, int cols); void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c); void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, int cols); void av1_encode_tiles_mt(struct AV1_COMP *cpi); void av1_encode_tiles_row_mt(struct AV1_COMP *cpi); #if !CONFIG_REALTIME_ONLY void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi); int av1_fp_compute_num_enc_workers(AV1_COMP *cpi); #endif void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, const struct FRAME_COUNTS *counts); void av1_row_mt_mem_dealloc(AV1_COMP *cpi); void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync); void av1_global_motion_estimation_mt(AV1_COMP *cpi); #if !CONFIG_REALTIME_ONLY void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c); void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, int cols); void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c); void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, int cols); void av1_mc_flow_dispenser_mt(AV1_COMP *cpi); void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync); #endif // !CONFIG_REALTIME_ONLY void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, double *sum_rec_distortion, double *sum_est_rate); void av1_tf_do_filtering_mt(AV1_COMP *cpi); void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync); void av1_compute_num_workers_for_mt(AV1_COMP *cpi); int av1_get_max_num_workers(const AV1_COMP *cpi); void av1_create_workers(AV1_PRIMARY *ppi, int num_workers); void av1_terminate_workers(AV1_PRIMARY *ppi); void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi); void av1_init_cdef_worker(AV1_COMP *cpi); #if !CONFIG_REALTIME_ONLY void av1_init_lr_mt_buffers(AV1_COMP *cpi); #endif #if CONFIG_MULTITHREAD void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass); #endif // CONFIG_MULTITHREAD int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, MULTI_THREADED_MODULES mod_name); void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass); void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi); void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync); void av1_write_tile_obu_mt( AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start, const int num_workers); int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf); int av1_check_fpmt_config(AV1_PRIMARY *const ppi, const AV1EncoderConfig *const oxcf); void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, AV1_COMP_DATA *const first_cpi_data); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ETHREAD_H_ aom-3.12.1/av1/encoder/extend.c000066400000000000000000000142121477627663500161420ustar00rootroot00000000000000/* * Copyright 
(c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/common.h" #include "av1/encoder/extend.h" static void copy_and_extend_plane(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch, int w, int h, int extend_top, int extend_left, int extend_bottom, int extend_right, int chroma_step) { int i, linesize; // copy the left and right most columns out const uint8_t *src_ptr1 = src; const uint8_t *src_ptr2 = src + (w - 1) * chroma_step; uint8_t *dst_ptr1 = dst - extend_left; uint8_t *dst_ptr2 = dst + w; for (i = 0; i < h; i++) { memset(dst_ptr1, src_ptr1[0], extend_left); if (chroma_step == 1) { memcpy(dst_ptr1 + extend_left, src_ptr1, w); } else { for (int j = 0; j < w; j++) { dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j]; } } memset(dst_ptr2, src_ptr2[0], extend_right); src_ptr1 += src_pitch; src_ptr2 += src_pitch; dst_ptr1 += dst_pitch; dst_ptr2 += dst_pitch; } // Now copy the top and bottom lines into each line of the respective // borders src_ptr1 = dst - extend_left; src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; dst_ptr2 = dst + dst_pitch * (h)-extend_left; linesize = extend_left + extend_right + w; assert(linesize <= dst_pitch); for (i = 0; i < extend_top; i++) { memcpy(dst_ptr1, src_ptr1, linesize); dst_ptr1 += dst_pitch; } for (i = 0; i < extend_bottom; i++) { memcpy(dst_ptr2, src_ptr2, linesize); dst_ptr2 += dst_pitch; } } static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, uint8_t *dst8, int dst_pitch, int w, int h, int extend_top, int extend_left, int extend_bottom, int extend_right) { int i, linesize; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); // copy the left and right most columns out const uint16_t *src_ptr1 = src; const uint16_t *src_ptr2 = src + w - 1; uint16_t *dst_ptr1 = dst - extend_left; uint16_t *dst_ptr2 = dst + w; for (i = 0; i < h; i++) { aom_memset16(dst_ptr1, src_ptr1[0], extend_left); memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); aom_memset16(dst_ptr2, src_ptr2[0], extend_right); src_ptr1 += src_pitch; src_ptr2 += src_pitch; dst_ptr1 += dst_pitch; dst_ptr2 += dst_pitch; } // Now copy the top and bottom lines into each line of the respective // borders src_ptr1 = dst - extend_left; src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; dst_ptr2 = dst + dst_pitch * (h)-extend_left; linesize = extend_left + extend_right + w; assert(linesize <= dst_pitch); for (i = 0; i < extend_top; i++) { memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); dst_ptr1 += dst_pitch; } for (i = 0; i < extend_bottom; i++) { memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); dst_ptr2 += dst_pitch; } } void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { // Extend src frame in buffer const int et_y = dst->border; const int el_y = dst->border; const int er_y 
= AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) - src->y_crop_width; const int eb_y = AOMMAX(src->y_height + dst->border, ALIGN_POWER_OF_TWO(src->y_height, 6)) - src->y_crop_height; const int uv_width_subsampling = src->subsampling_x; const int uv_height_subsampling = src->subsampling_y; const int et_uv = et_y >> uv_height_subsampling; const int el_uv = el_y >> uv_width_subsampling; const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; if (src->flags & YV12_FLAG_HIGHBITDEPTH) { highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, src->y_crop_height, et_y, el_y, eb_y, er_y); if (!src->monochrome) { highbd_copy_and_extend_plane( src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); highbd_copy_and_extend_plane( src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); } return; } copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, src->y_crop_height, et_y, el_y, eb_y, er_y, 1); if (!src->monochrome) { // detect nv12 format const int chroma_step = src->v_buffer ? 1 : 2; const uint8_t *src_v_buffer = src->v_buffer ? src->v_buffer : src->u_buffer + 1; copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, chroma_step); copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, chroma_step); } } aom-3.12.1/av1/encoder/extend.h000066400000000000000000000016111477627663500161460ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_EXTEND_H_ #define AOM_AV1_ENCODER_EXTEND_H_ #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_EXTEND_H_ aom-3.12.1/av1/encoder/external_partition.c000066400000000000000000000072721477627663500205760ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "av1/common/common.h" #include "av1/encoder/external_partition.h" #include "config/aom_config.h" aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, aom_ext_part_config_t config, ExtPartController *ext_part_controller) { if (ext_part_controller == NULL) { return AOM_CODEC_INVALID_PARAM; } ext_part_controller->funcs = funcs; ext_part_controller->config = config; const aom_ext_part_status_t status = ext_part_controller->funcs.create_model( ext_part_controller->funcs.priv, &ext_part_controller->config, &ext_part_controller->model); if (status == AOM_EXT_PART_ERROR) { return AOM_CODEC_ERROR; } else if (status == AOM_EXT_PART_TEST) { ext_part_controller->test_mode = 1; ext_part_controller->ready = 0; return AOM_CODEC_OK; } assert(status == AOM_EXT_PART_OK); ext_part_controller->ready = 1; return AOM_CODEC_OK; } static aom_codec_err_t ext_part_init(ExtPartController *ext_part_controller) { if (ext_part_controller == NULL) { return AOM_CODEC_INVALID_PARAM; } av1_zero(ext_part_controller); return AOM_CODEC_OK; } aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { if (ext_part_controller == NULL) { return AOM_CODEC_INVALID_PARAM; } if (ext_part_controller->ready) { const aom_ext_part_status_t status = ext_part_controller->funcs.delete_model(ext_part_controller->model); if (status != AOM_EXT_PART_OK) { return AOM_CODEC_ERROR; } } return ext_part_init(ext_part_controller); } bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, aom_partition_decision_t *decision) { assert(ext_part_controller != NULL); assert(ext_part_controller->ready); assert(decision != NULL); const aom_ext_part_status_t status = ext_part_controller->funcs.get_partition_decision( ext_part_controller->model, decision); if (status != AOM_EXT_PART_OK) return false; return true; } bool av1_ext_part_send_features(ExtPartController *ext_part_controller, const aom_partition_features_t *features) { assert(ext_part_controller != NULL); assert(ext_part_controller->ready); assert(features != NULL); const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( ext_part_controller->model, features); if (status != AOM_EXT_PART_OK) return false; return true; } #if CONFIG_PARTITION_SEARCH_ORDER bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, const aom_partition_stats_t *stats) { assert(ext_part_controller != NULL); assert(ext_part_controller->ready); assert(stats != NULL); const aom_ext_part_status_t status = ext_part_controller->funcs.send_partition_stats( ext_part_controller->model, stats); if (status != AOM_EXT_PART_OK) return false; return true; } aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( const ExtPartController *ext_part_controller) { return ext_part_controller->funcs.decision_mode; } #endif // CONFIG_PARTITION_SEARCH_ORDER aom-3.12.1/av1/encoder/external_partition.h000066400000000000000000000037461477627663500206050ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ #define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ #include #include "aom/aom_codec.h" #include "aom/aom_external_partition.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ typedef struct ExtPartController { int ready; int test_mode; aom_ext_part_config_t config; aom_ext_part_model_t model; aom_ext_part_funcs_t funcs; } ExtPartController; aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, aom_ext_part_config_t config, ExtPartController *ext_part_controller); aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, aom_partition_decision_t *decision); bool av1_ext_part_send_features(ExtPartController *ext_part_controller, const aom_partition_features_t *features); #if CONFIG_PARTITION_SEARCH_ORDER bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, const aom_partition_stats_t *stats); aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( const ExtPartController *ext_part_controller); #endif // CONFIG_PARTITION_SEARCH_ORDER /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ aom-3.12.1/av1/encoder/firstpass.c000066400000000000000000001752211477627663500167010ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/variance.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" // av1_setup_dst_planes() #include "av1/common/reconintra.h" #include "av1/common/txb_common.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/block.h" #include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rd.h" #include "av1/encoder/reconinter_enc.h" #define OUTPUT_FPF 0 #define FIRST_PASS_Q 10.0 #define INTRA_MODE_PENALTY 1024 #define NEW_MV_MODE_PENALTY 32 #define DARK_THRESH 64 #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 #define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1 static inline void output_stats(FIRSTPASS_STATS *stats, struct aom_codec_pkt_list *pktlist) { struct aom_codec_cx_pkt pkt; pkt.kind = AOM_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt); // TEMP debug code #if OUTPUT_FPF { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, stats->count, stats->duration); fclose(fpfile); } #endif } void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; section->intra_error = 0.0; section->frame_avg_wavelet_energy = 0.0; section->coded_error = 0.0; section->log_intra_error = 0.0; section->log_coded_error = 0.0; section->sr_coded_error = 0.0; section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; section->pcnt_neutral = 0.0; section->intra_skip_pct = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; section->mvc_abs = 0.0; section->MVrv = 0.0; section->MVcv = 0.0; section->mv_in_out_count = 0.0; section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; section->is_flash = 0; section->noise_var = 0; section->cor_coeff = 1.0; } void av1_accumulate_stats(FIRSTPASS_STATS *section, const FIRSTPASS_STATS *frame) { section->frame += frame->frame; section->weight += frame->weight; section->intra_error += frame->intra_error; section->log_intra_error += log1p(frame->intra_error); section->log_coded_error += log1p(frame->coded_error); section->frame_avg_wavelet_energy += 
frame->frame_avg_wavelet_energy; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; section->mvc_abs += frame->mvc_abs; section->MVrv += frame->MVrv; section->MVcv += frame->MVcv; section->mv_in_out_count += frame->mv_in_out_count; section->new_mv_count += frame->new_mv_count; section->count += frame->count; section->duration += frame->duration; } static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) { const int height_mi_log2 = mi_size_high_log2[fp_block_size]; const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; if (height_mi_log2 > mb_height_mi_log2) { return mb_rows >> (height_mi_log2 - mb_height_mi_log2); } return mb_rows << (mb_height_mi_log2 - height_mi_log2); } static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) { const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; if (width_mi_log2 > mb_width_mi_log2) { return mb_cols >> (width_mi_log2 - mb_width_mi_log2); } return mb_cols << (mb_width_mi_log2 - width_mi_log2); } // TODO(chengchen): can we simplify it even if resize has to be considered? static int get_num_mbs(const BLOCK_SIZE fp_block_size, const int num_mbs_16X16) { const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; const int height_mi_log2 = mi_size_high_log2[fp_block_size]; const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; // TODO(chengchen): Now this function assumes a square block is used. // It does not support rectangular block sizes. 
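  // For example, in mi (4x4) units a 16x16 MB has a width/height log2 of 2 and
  // a 32x32 block has a log2 of 3, so fp_block_size == BLOCK_32X32 yields
  // num_mbs_16X16 >> ((3 - 2) + (3 - 2)), i.e. one first pass unit per four
  // 16x16 MBs, while BLOCK_8X8 (log2 of 1) yields num_mbs_16X16 << 2, i.e.
  // four units per MB.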
assert(width_mi_log2 == height_mi_log2); if (width_mi_log2 > mb_width_mi_log2) { return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) + (height_mi_log2 - mb_height_mi_log2)); } return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) + (mb_height_mi_log2 - height_mi_log2)); } void av1_end_first_pass(AV1_COMP *cpi) { if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled) output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, cpi->ppi->output_pkt_list); } static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_8X8: return aom_mse8x8; case BLOCK_16X8: return aom_mse16x8; case BLOCK_8X16: return aom_mse8x16; default: return aom_mse16x16; } } static unsigned int get_prediction_error(BLOCK_SIZE bsize, const struct buf_2d *src, const struct buf_2d *ref) { unsigned int sse; const aom_variance_fn_t fn = get_block_variance_fn(bsize); fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } #if CONFIG_AV1_HIGHBITDEPTH static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, int bd) { switch (bd) { default: switch (bsize) { case BLOCK_8X8: return aom_highbd_8_mse8x8; case BLOCK_16X8: return aom_highbd_8_mse16x8; case BLOCK_8X16: return aom_highbd_8_mse8x16; default: return aom_highbd_8_mse16x16; } case 10: switch (bsize) { case BLOCK_8X8: return aom_highbd_10_mse8x8; case BLOCK_16X8: return aom_highbd_10_mse16x8; case BLOCK_8X16: return aom_highbd_10_mse8x16; default: return aom_highbd_10_mse16x16; } case 12: switch (bsize) { case BLOCK_8X8: return aom_highbd_12_mse8x8; case BLOCK_16X8: return aom_highbd_12_mse16x8; case BLOCK_8X16: return aom_highbd_12_mse8x16; default: return aom_highbd_12_mse16x16; } } } static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, const struct buf_2d *src, const struct buf_2d *ref, int bd) { unsigned int sse; const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } #endif // CONFIG_AV1_HIGHBITDEPTH // Refine the motion search range according to the frame dimension // for first pass test. static int get_search_range(int width, int height) { int sr = 0; const int dim = AOMMIN(width, height); while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; return sr; } static inline const search_site_config *av1_get_first_pass_search_site_config( const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { const int ref_stride = x->e_mbd.plane[0].pre[0].stride; // For AVIF applications, even the source frames can have changing resolution, // so we need to manually check for the strides :( // AV1_COMP::mv_search_params.search_site_config is a compressor level cache // that's shared by multiple threads. In most cases where all frames have the // same resolution, the cache contains the search site config that we need. const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) { return mv_search_params->search_site_cfg[SS_CFG_FPF]; } // If the cache does not contain the correct stride, then we will need to rely // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the // thread level config doesn't match, then we need to update it. 
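  // (search_method_lookup below maps each search method onto the entry of
  // search_site_cfg_buf that backs it; the assert relies on that mapping
  // being idempotent, i.e. a remapped method maps onto itself.)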
search_method = search_method_lookup[search_method]; assert(search_method_lookup[search_method] == search_method && "The search_method_lookup table should be idempotent."); if (ref_stride != x->search_site_cfg_buf[search_method].stride) { av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, ref_stride); } return x->search_site_cfg_buf; } static inline void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, FULLPEL_MV *best_mv, int *best_motion_err) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); int tmp_err; const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; const int sr = get_search_range(cm->width, cm->height); const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr; const search_site_config *first_pass_search_sites = av1_get_first_pass_search_site_config(cpi, x, NSTEP); const int fine_search_interval = cpi->is_screen_content_type && cm->features.allow_intrabc; FULLPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, start_mv, first_pass_search_sites, NSTEP, fine_search_interval); FULLPEL_MV this_best_mv; FULLPEL_MV_STATS best_mv_stats; tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL, &this_best_mv, &best_mv_stats, NULL); if (tmp_err < INT_MAX) { aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize]; const MSBuffers *ms_buffers = &ms_params.ms_buffers; tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv, &v_fn_ptr, ms_buffers->src, ms_buffers->ref) + new_mv_mode_penalty; } if (tmp_err < *best_motion_err) { *best_motion_err = tmp_err; *best_mv = this_best_mv; } } static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params, const BLOCK_SIZE fp_block_size, const int unit_row, const int unit_col) { const int unit_width = mi_size_wide[fp_block_size]; const int unit_height = mi_size_high[fp_block_size]; const int is_half_width = unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols; const int is_half_height = unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows; const int max_dimension = AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]); int square_block_size = 0; // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 switch (max_dimension) { case 4: square_block_size = 0; break; case 8: square_block_size = 1; break; case 16: square_block_size = 2; break; case 32: square_block_size = 3; break; case 64: square_block_size = 4; break; case 128: square_block_size = 5; break; default: assert(0 && "First pass block size is not supported!"); break; } if (is_half_width && is_half_height) { return subsize_lookup[PARTITION_SPLIT][square_block_size]; } else if (is_half_width) { return subsize_lookup[PARTITION_VERT][square_block_size]; } else if (is_half_height) { return subsize_lookup[PARTITION_HORZ][square_block_size]; } else { return fp_block_size; } } static int find_fp_qindex(aom_bit_depth_t bit_depth) { return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); } static double raw_motion_error_stdev(int *raw_motion_err_list, int raw_motion_err_counts) { int64_t sum_raw_err = 0; double raw_err_avg = 0; double raw_err_stdev = 0; if (raw_motion_err_counts == 0) return 0; int i; for (i = 0; i < raw_motion_err_counts; i++) { sum_raw_err += raw_motion_err_list[i]; } raw_err_avg = (double)sum_raw_err / raw_motion_err_counts; for (i = 0; i < raw_motion_err_counts; i++) { raw_err_stdev += 
(raw_motion_err_list[i] - raw_err_avg) * (raw_motion_err_list[i] - raw_err_avg); } // Calculate the standard deviation for the motion error of all the inter // blocks of the 0,0 motion using the last source // frame as the reference. raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); return raw_err_stdev; } static inline int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL; } typedef struct intra_pred_block_pass1_args { const SequenceHeader *seq_params; MACROBLOCK *x; } intra_pred_block_pass1_args; static inline void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height, int use_hbd) { #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride, CONVERT_TO_SHORTPTR(dst), dstride, width, height); } else { aom_convolve_copy(src, sstride, dst, dstride, width, height); } #else (void)use_hbd; aom_convolve_copy(src, sstride, dst, dstride, width, height); #endif } static void first_pass_intra_pred_and_calc_diff(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { (void)block; struct intra_pred_block_pass1_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; MACROBLOCK_PLANE *const p = &x->plane[plane]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; const MB_MODE_INFO *const mbmi = xd->mi[0]; const SequenceHeader *seq_params = args->seq_params; const int src_stride = p->src.stride; uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, blk_col, blk_row, plane); av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); } static void first_pass_predict_intra_block_for_luma_plane( const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); const MACROBLOCKD *const xd = &x->e_mbd; const int plane = AOM_PLANE_Y; const MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); const int dst_stride = pd->dst.stride; uint8_t *dst = pd->dst.buf; const MACROBLOCK_PLANE *const p = &x->plane[plane]; const int src_stride = p->src.stride; const uint8_t *src = p->src.buf; intra_pred_block_pass1_args args = { seq_params, x }; av1_foreach_transformed_block_in_plane( xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args); // copy source data to recon buffer, as the recon buffer will be used as a // reference frame subsequently. copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize], block_size_high[bsize], seq_params->use_highbitdepth); } #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 // Computes and returns the intra pred error of a block. // intra pred error: sum of squared error of the intra predicted residual. // Inputs: // cpi: the encoder setting. Only a few params in it will be used. // this_frame: the current frame buffer. // tile: tile information (not used in first pass, already init to zero) // unit_row: row index in the unit of first pass block size. // unit_col: column index in the unit of first pass block size. 
// y_offset: the offset of y frame buffer, indicating the starting point of // the current block. // uv_offset: the offset of u and v frame buffer, indicating the starting // point of the current block. // fp_block_size: first pass block size. // qindex: quantization step size to encode the frame. // stats: frame encoding stats. // Modifies: // stats->intra_skip_count // stats->image_data_start_row // stats->intra_factor // stats->brightness_factor // stats->intra_error // stats->frame_avg_wavelet_energy // Returns: // this_intra_error. static int firstpass_intra_prediction( AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame, const TileInfo *const tile, const int unit_row, const int unit_col, const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size, const int qindex, FRAME_STATS *const stats) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const SequenceHeader *const seq_params = cm->seq_params; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int unit_scale = mi_size_wide[fp_block_size]; const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = get_bsize(mi_params, fp_block_size, unit_row, unit_col); set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale); xd->plane[0].dst.buf = this_frame->y_buffer + y_offset; if (num_planes > 1) { xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset; xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset; } xd->left_available = (unit_col != 0); xd->mi[0]->bsize = bsize; xd->mi[0]->ref_frame[0] = INTRA_FRAME; set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize], unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows, mi_params->mi_cols); set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); xd->mi[0]->segment_id = 0; xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); xd->mi[0]->mode = DC_PRED; xd->mi[0]->tx_size = TX_4X4; if (cpi->sf.fp_sf.disable_recon) first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize); else av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: this_intra_error >>= 4; break; case AOM_BITS_12: this_intra_error >>= 8; break; default: assert(0 && "seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); return -1; } } if (this_intra_error < UL_INTRA_THRESH) { ++stats->intra_skip_count; } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) { stats->image_data_start_row = unit_row; } double log_intra = log1p(this_intra_error); if (log_intra < 10.0) { stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); } else { stats->intra_factor += 1.0; } int level_sample; if (seq_params->use_highbitdepth) { level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; } else { level_sample = x->plane[0].src.buf[0]; } if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: level_sample >>= 2; break; case AOM_BITS_12: level_sample >>= 4; break; default: assert(0 && "seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); return -1; } } if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); } else { stats->brightness_factor += 1.0; } // Intrapenalty below deals with situations 
where the intra and inter // error scores are very low (e.g. a plain black frame). // We do not have special cases in first pass for 0,0 and nearest etc so // all inter modes carry an overhead cost estimate for the mv. // When the error score is very low this causes us to pick all or lots of // INTRA modes and throw lots of key frames. // This penalty adds a cost matching that of a 0,0 mv to the intra case. this_intra_error += INTRA_MODE_PENALTY; // Accumulate the intra error. stats->intra_error += (int64_t)this_intra_error; // Stats based on wavelet energy is used in the following cases : // 1. ML model which predicts if a flat structure (golden-frame only structure // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in // constant quality mode under certain conditions. // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL. // Thus, wavelet energy calculation is enabled for the above cases. if (calc_wavelet_energy(&cpi->oxcf)) { const int hbd = is_cur_buf_hbd(xd); const int stride = x->plane[0].src.stride; const int num_8x8_rows = block_size_high[fp_block_size] / 8; const int num_8x8_cols = block_size_wide[fp_block_size] / 8; const uint8_t *buf = x->plane[0].src.buf; stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input( buf, stride, hbd, num_8x8_rows, num_8x8_cols); } else { stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; } return this_intra_error; } // Returns the sum of square error between source and reference blocks. static int get_prediction_error_bitdepth(const int is_high_bitdepth, const int bitdepth, const BLOCK_SIZE block_size, const struct buf_2d *src, const struct buf_2d *ref) { (void)is_high_bitdepth; (void)bitdepth; #if CONFIG_AV1_HIGHBITDEPTH if (is_high_bitdepth) { return highbd_get_prediction_error(block_size, src, ref, bitdepth); } #endif // CONFIG_AV1_HIGHBITDEPTH return get_prediction_error(block_size, src, ref); } // Accumulates motion vector stats. // Modifies member variables of "stats". static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, const int mb_row, const int mb_col, const int mb_rows, const int mb_cols, MV *last_non_zero_mv, FRAME_STATS *stats) { if (is_zero_mv(&best_mv)) return; ++stats->mv_count; // Non-zero vector, was it different from the last non zero vector? if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count; *last_non_zero_mv = best_mv; // Does the row vector point inwards or outwards? if (mb_row < mb_rows / 2) { if (mv.row > 0) { --stats->sum_in_vectors; } else if (mv.row < 0) { ++stats->sum_in_vectors; } } else if (mb_row > mb_rows / 2) { if (mv.row > 0) { ++stats->sum_in_vectors; } else if (mv.row < 0) { --stats->sum_in_vectors; } } // Does the col vector point inwards or outwards? if (mb_col < mb_cols / 2) { if (mv.col > 0) { --stats->sum_in_vectors; } else if (mv.col < 0) { ++stats->sum_in_vectors; } } else if (mb_col > mb_cols / 2) { if (mv.col > 0) { ++stats->sum_in_vectors; } else if (mv.col < 0) { --stats->sum_in_vectors; } } } // Computes and returns the inter prediction error from the last frame. // Computes inter prediction errors from the golden and alt ref frams and // Updates stats accordingly. // Inputs: // cpi: the encoder setting. Only a few params in it will be used. // last_frame: the frame buffer of the last frame. // golden_frame: the frame buffer of the golden frame. // unit_row: row index in the unit of first pass block size. // unit_col: column index in the unit of first pass block size. 
// recon_yoffset: the y offset of the reconstructed frame buffer, // indicating the starting point of the current block. // recont_uvoffset: the u/v offset of the reconstructed frame buffer, // indicating the starting point of the current block. // src_yoffset: the y offset of the source frame buffer. // fp_block_size: first pass block size. // this_intra_error: the intra prediction error of this block. // raw_motion_err_counts: the count of raw motion vectors. // raw_motion_err_list: the array that records the raw motion error. // ref_mv: the reference used to start the motion search // best_mv: the best mv found // last_non_zero_mv: the last non zero mv found in this tile row. // stats: frame encoding stats. // Modifies: // raw_motion_err_list // best_ref_mv // last_mv // stats: many member params in it. // Returns: // this_inter_error static int firstpass_inter_prediction( AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame, const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row, const int unit_col, const int recon_yoffset, const int recon_uvoffset, const int src_yoffset, const BLOCK_SIZE fp_block_size, const int this_intra_error, const int raw_motion_err_counts, int *raw_motion_err_list, const MV ref_mv, MV *best_mv, MV *last_non_zero_mv, FRAME_STATS *stats) { int this_inter_error = this_intra_error; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const current_frame = &cm->current_frame; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int is_high_bitdepth = is_cur_buf_hbd(xd); const int bitdepth = xd->bd; const int unit_scale = mi_size_wide[fp_block_size]; const BLOCK_SIZE bsize = get_bsize(mi_params, fp_block_size, unit_row, unit_col); const int fp_block_size_height = block_size_wide[fp_block_size]; const int unit_width = mi_size_wide[fp_block_size]; const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows); const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols); // Assume 0,0 motion with no mv overhead. FULLPEL_MV mv = kZeroFullMv; xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width, fp_block_size_height >> MI_SIZE_LOG2, cpi->oxcf.border_in_pixels); int motion_error = get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, &xd->plane[0].pre[0]); // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on // reconstructed frame if this error is small. // TODO(chiyotsai): The unscaled last source might be different dimension // as the current source. See BUG=aomedia:3413 struct buf_2d unscaled_last_source_buf_2d; unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer + src_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; const int raw_motion_error = get_prediction_error_bitdepth( is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf; if (raw_motion_error > fp_sf->skip_motion_search_threshold) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. 
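    // A further 0,0 anchored search follows when that reference mv is
    // non-zero (unless skip_zeromv_motion_search is set), and the golden
    // frame gets its own 0,0 anchored search below, so the accumulated stats
    // reflect the best of last-frame, golden-frame and intra prediction.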
first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error); // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) { FULLPEL_MV tmp_mv = kZeroFullMv; int tmp_err = INT_MAX; first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); if (tmp_err < motion_error) { motion_error = tmp_err; mv = tmp_mv; } } } // Motion search in 2nd reference frame. int gf_motion_error = motion_error; if ((current_frame->frame_number > 1) && golden_frame != NULL) { FULLPEL_MV tmp_mv = kZeroFullMv; // Assume 0,0 motion with no mv overhead. av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1); xd->plane[0].pre[0].buf += recon_yoffset; gf_motion_error = get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, &xd->plane[0].pre[0]); first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); } if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) { ++stats->second_ref_count; } // In accumulating a score for the 2nd reference frame take the // best of the motion predicted score and the intra coded error // (just as will be done for) accumulation of "coded_error" for // the last frame. if ((current_frame->frame_number > 1) && golden_frame != NULL) { stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error); } else { // TODO(chengchen): I believe logically this should also be changed to // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error). stats->sr_coded_error += motion_error; } // Reset to last frame as reference buffer. xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; if (av1_num_planes(&cpi->common) > 1) { xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset; xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset; } // Start by assuming that intra mode is best. *best_mv = kZeroMv; if (motion_error <= this_intra_error) { // Keep a count of cases where the inter and intra were very close // and very low. This helps with scene cut detection for example in // cropped clips with black bars at the sides or top and bottom. if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) && (this_intra_error < (2 * INTRA_MODE_PENALTY))) { stats->neutral_count += 1.0; // Also track cases where the intra is not much worse than the inter // and use this in limiting the GF/arf group length. } else if ((this_intra_error > NCOUNT_INTRA_THRESH) && (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) { stats->neutral_count += (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); } *best_mv = get_mv_from_fullmv(&mv); this_inter_error = motion_error; xd->mi[0]->mode = NEWMV; xd->mi[0]->mv[0].as_mv = *best_mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; xd->mi[0]->ref_frame[1] = NONE_FRAME; if (fp_sf->disable_recon == 0) { av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, unit_col * unit_scale, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); av1_encode_sby_pass1(cpi, x, bsize); } stats->sum_mvr += best_mv->row; stats->sum_mvr_abs += abs(best_mv->row); stats->sum_mvc += best_mv->col; stats->sum_mvc_abs += abs(best_mv->col); stats->sum_mvrs += best_mv->row * best_mv->row; stats->sum_mvcs += best_mv->col * best_mv->col; ++stats->inter_count; accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, last_non_zero_mv, stats); } return this_inter_error; } // Normalize the first pass stats. 
// Error / counters are normalized to each MB. // MVs are normalized to the width/height of the frame. static void normalize_firstpass_stats(FIRSTPASS_STATS *fps, double num_mbs_16x16, double f_w, double f_h) { fps->coded_error /= num_mbs_16x16; fps->sr_coded_error /= num_mbs_16x16; fps->intra_error /= num_mbs_16x16; fps->frame_avg_wavelet_energy /= num_mbs_16x16; fps->log_coded_error = log1p(fps->coded_error); fps->log_intra_error = log1p(fps->intra_error); fps->MVr /= f_h; fps->mvr_abs /= f_h; fps->MVc /= f_w; fps->mvc_abs /= f_w; fps->MVrv /= (f_h * f_h); fps->MVcv /= (f_w * f_w); fps->new_mv_count /= num_mbs_16x16; } // Updates the first pass stats of this frame. // Input: // cpi: the encoder setting. Only a few params in it will be used. // stats: stats accumulated for this frame. // raw_err_stdev: the statndard deviation for the motion error of all the // inter blocks of the (0,0) motion using the last source // frame as the reference. // frame_number: current frame number. // ts_duration: Duration of the frame / collection of frames. // Updates: // twopass->total_stats: the accumulated stats. // twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats, // update its value and its position // in the buffer. static void update_firstpass_stats(AV1_COMP *cpi, const FRAME_STATS *const stats, const double raw_err_stdev, const int frame_number, const int64_t ts_duration, const BLOCK_SIZE fp_block_size) { TWO_PASS *twopass = &cpi->ppi->twopass; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; FIRSTPASS_STATS fps; // The minimum error here insures some bit allocation to frames even // in static regions. The allocation per MB declines for larger formats // where the typical "real" energy per MB also falls. // Initial estimate here uses sqrt(mbs) to define the min_err, where the // number of mbs is proportional to the image area. const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : mi_params->MBs; // Number of actual units used in the first pass, it can be other square // block sizes than 16X16. const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); const double min_err = 200 * sqrt(num_mbs); fps.weight = stats->intra_factor * stats->brightness_factor; fps.frame = frame_number; fps.coded_error = (double)(stats->coded_error >> 8) + min_err; fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err; fps.intra_error = (double)(stats->intra_error >> 8) + min_err; fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy; fps.count = 1.0; fps.pcnt_inter = (double)stats->inter_count / num_mbs; fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs; fps.pcnt_neutral = (double)stats->neutral_count / num_mbs; fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs; fps.inactive_zone_rows = (double)stats->image_data_start_row; fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported. 
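  // For scale: with 16x16 first pass units a 1920x1080 input has roughly
  // 120 * 68 = 8160 MBs, so min_err above is about 200 * sqrt(8160) ~= 18000,
  // which keeps the error terms away from zero even for a perfectly static
  // frame.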
fps.raw_error_stdev = raw_err_stdev; fps.is_flash = 0; fps.noise_var = 0.0; fps.cor_coeff = 1.0; fps.log_coded_error = 0.0; fps.log_intra_error = 0.0; if (stats->mv_count > 0) { fps.MVr = (double)stats->sum_mvr / stats->mv_count; fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count; fps.MVc = (double)stats->sum_mvc / stats->mv_count; fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count; fps.MVrv = ((double)stats->sum_mvrs - ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) / stats->mv_count; fps.MVcv = ((double)stats->sum_mvcs - ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) / stats->mv_count; fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2); fps.new_mv_count = stats->new_mv_count; fps.pcnt_motion = (double)stats->mv_count / num_mbs; } else { fps.MVr = 0.0; fps.mvr_abs = 0.0; fps.MVc = 0.0; fps.mvc_abs = 0.0; fps.MVrv = 0.0; fps.MVcv = 0.0; fps.mv_in_out_count = 0.0; fps.new_mv_count = 0.0; fps.pcnt_motion = 0.0; } // TODO(paulwilkins): Handle the case when duration is set to 0, or // something less than the full time between subsequent values of // cpi->source_time_stamp. fps.duration = (double)ts_duration; normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height); // We will store the stats inside the persistent twopass struct (and NOT the // local variable 'fps'), and then cpi->output_pkt_list will point to it. *this_frame_stats = fps; if (!cpi->ppi->lap_enabled) { output_stats(this_frame_stats, cpi->ppi->output_pkt_list); } else { av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats); } if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) { av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps); } twopass->stats_buf_ctx->stats_in_end++; // When ducky encode is on, we always use linear buffer for stats_buf_ctx. if (cpi->use_ducky_encode == 0) { // TODO(angiebird): Figure out why first pass uses circular buffer. 
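  // In short: the wrap below only applies to pure first pass
  // (AOM_RC_FIRST_PASS) encodes, where every frame's stats have already been
  // handed to output_stats() above; LAP and ducky encode keep writing
  // linearly through the buffer.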
/* In the case of two pass, first pass uses it as a circular buffer, * when LAP is enabled it is used as a linear buffer*/ if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) && (twopass->stats_buf_ctx->stats_in_end >= twopass->stats_buf_ctx->stats_in_buf_end)) { twopass->stats_buf_ctx->stats_in_end = twopass->stats_buf_ctx->stats_in_start; } } } static void print_reconstruction_frame( const YV12_BUFFER_CONFIG *const last_frame, int frame_number, int do_print) { if (!do_print) return; char filename[512]; FILE *recon_file; snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number); if (frame_number == 0) { recon_file = fopen(filename, "wb"); } else { recon_file = fopen(filename, "ab"); } fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file); fclose(recon_file); } static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows, int mb_cols) { FRAME_STATS stats = { 0 }; int i, j; stats.image_data_start_row = INVALID_ROW; for (j = 0; j < mb_rows; j++) { for (i = 0; i < mb_cols; i++) { FRAME_STATS mb_stat = mb_stats[j * mb_cols + i]; stats.brightness_factor += mb_stat.brightness_factor; stats.coded_error += mb_stat.coded_error; stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy; if (stats.image_data_start_row == INVALID_ROW && mb_stat.image_data_start_row != INVALID_ROW) { stats.image_data_start_row = mb_stat.image_data_start_row; } stats.inter_count += mb_stat.inter_count; stats.intra_error += mb_stat.intra_error; stats.intra_factor += mb_stat.intra_factor; stats.intra_skip_count += mb_stat.intra_skip_count; stats.mv_count += mb_stat.mv_count; stats.neutral_count += mb_stat.neutral_count; stats.new_mv_count += mb_stat.new_mv_count; stats.second_ref_count += mb_stat.second_ref_count; stats.sr_coded_error += mb_stat.sr_coded_error; stats.sum_in_vectors += mb_stat.sum_in_vectors; stats.sum_mvc += mb_stat.sum_mvc; stats.sum_mvc_abs += mb_stat.sum_mvc_abs; stats.sum_mvcs += mb_stat.sum_mvcs; stats.sum_mvr += mb_stat.sum_mvr; stats.sum_mvr_abs += mb_stat.sum_mvr_abs; stats.sum_mvrs += mb_stat.sum_mvrs; } } return stats; } static void setup_firstpass_data(AV1_COMMON *const cm, FirstPassData *firstpass_data, const int unit_rows, const int unit_cols) { CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list, aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->raw_motion_err_list))); CHECK_MEM_ERROR( cm, firstpass_data->mb_stats, aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats))); for (int j = 0; j < unit_rows; j++) { for (int i = 0; i < unit_cols; i++) { firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row = INVALID_ROW; } } } void av1_free_firstpass_data(FirstPassData *firstpass_data) { aom_free(firstpass_data->raw_motion_err_list); firstpass_data->raw_motion_err_list = NULL; aom_free(firstpass_data->mb_stats); firstpass_data->mb_stats = NULL; } int av1_get_unit_rows_in_tile(const TileInfo *tile, const BLOCK_SIZE fp_block_size) { const int unit_height_log2 = mi_size_high_log2[fp_block_size]; const int mi_rows = tile->mi_row_end - tile->mi_row_start; const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2); return unit_rows; } int av1_get_unit_cols_in_tile(const TileInfo *tile, const BLOCK_SIZE fp_block_size) { const int unit_width_log2 = mi_size_wide_log2[fp_block_size]; const int mi_cols = tile->mi_col_end - tile->mi_col_start; const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2); return unit_cols; } #define FIRST_PASS_ALT_REF_DISTANCE 16 static void first_pass_tile(AV1_COMP *cpi, ThreadData *td, 
TileDataEnc *tile_data, const BLOCK_SIZE fp_block_size) { TileInfo *tile = &tile_data->tile_info; const int unit_height = mi_size_high[fp_block_size]; const int unit_height_log2 = mi_size_high_log2[fp_block_size]; for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += unit_height) { av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2, fp_block_size); } } static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; av1_alloc_src_diff_buf(cm, &cpi->td.mb); for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size); } } } void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, const int unit_row, const BLOCK_SIZE fp_block_size) { MACROBLOCK *const x = &td->mb; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo *tile = &tile_data->tile_info; const int qindex = find_fp_qindex(seq_params->bit_depth); const int fp_block_size_width = block_size_high[fp_block_size]; const int fp_block_size_height = block_size_wide[fp_block_size]; const int unit_width = mi_size_wide[fp_block_size]; const int unit_width_log2 = mi_size_wide_log2[fp_block_size]; const int unit_height_log2 = mi_size_high_log2[fp_block_size]; const int unit_cols = mi_params->mb_cols * 4 / unit_width; int raw_motion_err_counts = 0; int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2); int unit_col_start = tile->mi_col_start >> unit_width_log2; int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size); MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; const YV12_BUFFER_CONFIG *last_frame = av1_get_scaled_ref_frame(cpi, LAST_FRAME); if (!last_frame) { last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME); } const YV12_BUFFER_CONFIG *golden_frame = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); if (!golden_frame) { golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); } YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; PICK_MODE_CONTEXT *ctx = td->firstpass_ctx; FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start; int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list + unit_row * unit_cols + unit_col_start; MV *first_top_mv = &tile_data->firstpass_top_mv; for (int i = 0; i < num_planes; ++i) { x->plane[i].coeff = ctx->coeff[i]; x->plane[i].qcoeff = ctx->qcoeff[i]; x->plane[i].eobs = ctx->eobs[i]; x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; x->plane[i].dqcoeff = ctx->dqcoeff[i]; } const int src_y_stride = cpi->source->y_stride; const int recon_y_stride = this_frame->y_stride; const int recon_uv_stride = this_frame->uv_stride; const int uv_mb_height = fp_block_size_height >> (this_frame->y_height > this_frame->uv_height); MV best_ref_mv = kZeroMv; MV last_mv; // Reset above block coeffs. 
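  // (up_available simply records whether an above unit row exists inside this
  // tile; only the first unit row of the tile has no above neighbour.)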
xd->up_available = (unit_row_in_tile != 0); int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) + (unit_col_start * fp_block_size_width); int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) + (unit_col_start * fp_block_size_width); int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) + (unit_col_start * uv_mb_height); // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. av1_set_mv_row_limits( mi_params, &x->mv_limits, (unit_row << unit_height_log2), (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2, tile->mi_col_start, num_planes, fp_block_size); // Fix - zero the 16x16 block first. This ensures correct this_intra_error for // block sizes smaller than 16x16. av1_zero_array(x->plane[0].src_diff, 256); for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile; unit_col_in_tile++) { const int unit_col = unit_col_start + unit_col_in_tile; enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile); #if CONFIG_MULTITHREAD if (cpi->ppi->p_mt_info.num_workers > 1) { pthread_mutex_lock(enc_row_mt->mutex_); bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit; pthread_mutex_unlock(enc_row_mt->mutex_); // Exit in case any worker has encountered an error. if (firstpass_mt_exit) return; } #endif if (unit_col_in_tile == 0) { last_mv = *first_top_mv; } int this_intra_error = firstpass_intra_prediction( cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset, recon_uvoffset, fp_block_size, qindex, mb_stats); if (!frame_is_intra_only(cm)) { const int this_inter_error = firstpass_inter_prediction( cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset, recon_uvoffset, src_yoffset, fp_block_size, this_intra_error, raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv, &last_mv, mb_stats); if (unit_col_in_tile == 0) { *first_top_mv = last_mv; } mb_stats->coded_error += this_inter_error; ++raw_motion_err_counts; } else { mb_stats->sr_coded_error += this_intra_error; mb_stats->coded_error += this_intra_error; } // Adjust to the next column of MBs. 
x->plane[0].src.buf += fp_block_size_width; if (num_planes > 1) { x->plane[1].src.buf += uv_mb_height; x->plane[2].src.buf += uv_mb_height; } recon_yoffset += fp_block_size_width; src_yoffset += fp_block_size_width; recon_uvoffset += uv_mb_height; mb_stats++; enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile, unit_cols_in_tile); } } void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; const CommonModeInfoParams *const mi_params = &cm->mi_params; int max_mb_rows = mi_params->mb_rows; int max_mb_cols = mi_params->mb_cols; if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); } if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); } const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows); const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols); setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols); FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats; FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols); av1_free_firstpass_data(&cpi->firstpass_data); update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number, ts_duration, BLOCK_16X16); } void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const current_frame = &cm->current_frame; const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int qindex = find_fp_qindex(seq_params->bit_depth); const int ref_frame_flags_backup = cpi->ref_frame_flags; cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] | av1_ref_frame_flag_list[GOLDEN_FRAME]; // Detect if the key frame is screen content type. if (frame_is_intra_only(cm)) { FeatureFlags *const features = &cm->features; assert(cpi->source != NULL); xd->cur_buf = cpi->source; av1_set_screen_content_options(cpi, features); } // Prepare the speed features av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); // Unit size for the first pass encoding. const BLOCK_SIZE fp_block_size = get_fp_block_size(cpi->is_screen_content_type); int max_mb_rows = mi_params->mb_rows; int max_mb_cols = mi_params->mb_cols; if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); } if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); } // Number of rows in the unit size. // Note max_mb_rows and max_mb_cols are in the unit of 16x16. const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows); const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols); // Set fp_block_size, for the convenience of multi-thread usage. 
cpi->fp_block_size = fp_block_size; setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols); int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list; FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats; // multi threading info MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; if (cpi->allocated_tiles < tile_cols * tile_rows) { av1_alloc_tile_data(cpi); } av1_init_tile_data(cpi); const YV12_BUFFER_CONFIG *last_frame = NULL; const YV12_BUFFER_CONFIG *golden_frame = NULL; if (!frame_is_intra_only(cm)) { av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0); last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME)) ? av1_get_scaled_ref_frame(cpi, LAST_FRAME) : get_ref_frame_yv12_buf(cm, LAST_FRAME); golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME)) ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME) : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); } YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; // First pass code requires valid last and new frame buffers. assert(this_frame != NULL); assert(frame_is_intra_only(cm) || (last_frame != NULL)); av1_setup_frame_size(cpi); av1_set_mv_search_params(cpi); set_mi_offsets(mi_params, xd, 0, 0); xd->mi[0]->bsize = fp_block_size; // Do not use periodic key frames. cpi->rc.frames_to_key = INT_MAX; av1_set_quantizer( cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex, cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq, cpi->oxcf.mode == ALLINTRA, cpi->oxcf.tune_cfg.tuning); av1_setup_block_planes(xd, seq_params->subsampling_x, seq_params->subsampling_y, num_planes); av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size); av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0, num_planes); if (!frame_is_intra_only(cm)) { av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes); } set_mi_offsets(mi_params, xd, 0, 0); // Don't store luma on the fist pass since chroma is not computed xd->cfl.store_y = 0; av1_frame_init_quantizer(cpi); av1_default_coef_probs(cm); av1_init_mode_probs(cm->fc); av1_init_mv_probs(cm); av1_initialize_rd_consts(cpi); enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; if (mt_info->num_workers > 1) { enc_row_mt->sync_read_ptr = av1_row_mt_sync_read; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write; av1_fp_encode_tiles_row_mt(cpi); } else { first_pass_tiles(cpi, fp_block_size); } FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols); int total_raw_motion_err_count = frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols; const double raw_err_stdev = raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count); av1_free_firstpass_data(&cpi->firstpass_data); av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm)); // Clamp the image start to rows/2. This number of rows is discarded top // and bottom as dead data so rows / 2 means the frame is blank. 
if ((stats.image_data_start_row > unit_rows / 2) || (stats.image_data_start_row == INVALID_ROW)) { stats.image_data_start_row = unit_rows / 2; } // Exclude any image dead zone if (stats.image_data_start_row > 0) { stats.intra_skip_count = AOMMAX(0, stats.intra_skip_count - (stats.image_data_start_row * unit_cols * 2)); } TWO_PASS *twopass = &cpi->ppi->twopass; const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : mi_params->MBs; // Number of actual units used in the first pass, it can be other square // block sizes than 16X16. const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); stats.intra_factor = stats.intra_factor / (double)num_mbs; stats.brightness_factor = stats.brightness_factor / (double)num_mbs; FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; update_firstpass_stats(cpi, &stats, raw_err_stdev, current_frame->frame_number, ts_duration, fp_block_size); // Copy the previous Last Frame back into gf buffer if the prediction is good // enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((current_frame->frame_number > 0) && (this_frame_stats->pcnt_inter > 0.20) && ((this_frame_stats->intra_error / DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) { if (golden_frame != NULL) { assign_frame_buffer_p( &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); } twopass->sr_update_lag = 1; } else { ++twopass->sr_update_lag; } aom_extend_frame_borders(this_frame, num_planes); // The frame we just compressed now becomes the last frame. assign_frame_buffer_p( &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); // Special case for the first frame. Copy into the GF buffer as a second // reference. 
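  // (This gives subsequent first pass frames a valid GOLDEN reference right
  // away, before the quality-gated copy further up has had a chance to run.)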
if (current_frame->frame_number == 0 && get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { assign_frame_buffer_p( &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); } print_reconstruction_frame(last_frame, current_frame->frame_number, /*do_print=*/0); ++current_frame->frame_number; cpi->ref_frame_flags = ref_frame_flags_backup; if (!frame_is_intra_only(cm)) { release_scaled_references(cpi); } } aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info, FIRSTPASS_STATS *ext_stats_buf, int ext_stats_buf_size) { assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0)); if (ext_stats_buf == NULL) { firstpass_info->stats_buf = firstpass_info->static_stats_buf; firstpass_info->stats_buf_size = sizeof(firstpass_info->static_stats_buf) / sizeof(firstpass_info->static_stats_buf[0]); firstpass_info->start_index = 0; firstpass_info->cur_index = 0; firstpass_info->stats_count = 0; firstpass_info->future_stats_count = 0; firstpass_info->past_stats_count = 0; av1_zero(firstpass_info->total_stats); if (ext_stats_buf_size == 0) { return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } else { firstpass_info->stats_buf = ext_stats_buf; firstpass_info->stats_buf_size = ext_stats_buf_size; firstpass_info->start_index = 0; firstpass_info->cur_index = 0; firstpass_info->stats_count = firstpass_info->stats_buf_size; firstpass_info->future_stats_count = firstpass_info->stats_count; firstpass_info->past_stats_count = 0; av1_zero(firstpass_info->total_stats); for (int i = 0; i < firstpass_info->stats_count; ++i) { av1_accumulate_stats(&firstpass_info->total_stats, &firstpass_info->stats_buf[i]); } } return AOM_CODEC_OK; } aom_codec_err_t av1_firstpass_info_move_cur_index( FIRSTPASS_INFO *firstpass_info) { assert(firstpass_info->future_stats_count + firstpass_info->past_stats_count == firstpass_info->stats_count); if (firstpass_info->future_stats_count > 1) { firstpass_info->cur_index = (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size; --firstpass_info->future_stats_count; ++firstpass_info->past_stats_count; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) { if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) { const int next_start = (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size; firstpass_info->start_index = next_start; --firstpass_info->stats_count; --firstpass_info->past_stats_count; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop( FIRSTPASS_INFO *firstpass_info) { aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info); if (ret != AOM_CODEC_OK) return ret; ret = av1_firstpass_info_pop(firstpass_info); return ret; } aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info, const FIRSTPASS_STATS *input_stats) { if (firstpass_info->stats_count < firstpass_info->stats_buf_size) { const int next_index = (firstpass_info->start_index + firstpass_info->stats_count) % firstpass_info->stats_buf_size; firstpass_info->stats_buf[next_index] = *input_stats; ++firstpass_info->stats_count; ++firstpass_info->future_stats_count; av1_accumulate_stats(&firstpass_info->total_stats, input_stats); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; } } const FIRSTPASS_STATS *av1_firstpass_info_peek( const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) { if (offset_from_cur >= 
-firstpass_info->past_stats_count && offset_from_cur < firstpass_info->future_stats_count) { const int index = (firstpass_info->cur_index + offset_from_cur) % firstpass_info->stats_buf_size; return &firstpass_info->stats_buf[index]; } else { return NULL; } } int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) { if (offset_from_cur < firstpass_info->future_stats_count) { return firstpass_info->future_stats_count - offset_from_cur; } return 0; } aom-3.12.1/av1/encoder/firstpass.h000066400000000000000000000453761477627663500167150ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ #define AOM_AV1_ENCODER_FIRSTPASS_H_ #include #include "av1/common/av1_common_int.h" #include "av1/common/enums.h" #include "av1/encoder/lookahead.h" #include "av1/encoder/ratectrl.h" #ifdef __cplusplus extern "C" { #endif #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) #define MIN_ZERO_MOTION 0.95 #define MAX_SR_CODED_ERROR 40 #define MAX_RAW_ERR_VAR 2000 #define MIN_MV_IN_OUT 0.4 #define VLOW_MOTION_THRESHOLD 950 struct ThreadData; /*! * \brief The stucture of acummulated frame stats in the first pass. * * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to * the frame width and height. See function normalize_firstpass_stats. */ typedef struct FIRSTPASS_STATS { /*! * Frame number in display order, if stats are for a single frame. * No real meaning for a collection of frames. */ double frame; /*! * Weight assigned to this frame (or total weight for the collection of * frames) currently based on intra factor and brightness factor. This is used * to distribute bits betweeen easier and harder frames. */ double weight; /*! * Intra prediction error. */ double intra_error; /*! * Average wavelet energy computed using Discrete Wavelet Transform (DWT). */ double frame_avg_wavelet_energy; /*! * Best of intra pred error and inter pred error using last frame as ref. */ double coded_error; /*! * Best of intra pred error and inter pred error using golden frame as ref. */ double sr_coded_error; /*! * Percentage of blocks with inter pred error < intra pred error. */ double pcnt_inter; /*! * Percentage of blocks using (inter prediction and) non-zero motion vectors. */ double pcnt_motion; /*! * Percentage of blocks where golden frame was better than last or intra: * inter pred error using golden frame < inter pred error using last frame and * inter pred error using golden frame < intra pred error */ double pcnt_second_ref; /*! * Percentage of blocks where intra and inter prediction errors were very * close. Note that this is a 'weighted count', that is, the so blocks may be * weighted by how close the two errors were. */ double pcnt_neutral; /*! * Percentage of blocks that have almost no intra error residual * (i.e. are in effect completely flat and untextured in the intra * domain). 
In natural videos this is uncommon, but it is much more * common in animations, graphics and screen content, so may be used * as a signal to detect these types of content. */ double intra_skip_pct; /*! * Image mask rows top and bottom. */ double inactive_zone_rows; /*! * Image mask columns at left and right edges. */ double inactive_zone_cols; /*! * Average of row motion vectors. */ double MVr; /*! * Mean of absolute value of row motion vectors. */ double mvr_abs; /*! * Mean of column motion vectors. */ double MVc; /*! * Mean of absolute value of column motion vectors. */ double mvc_abs; /*! * Variance of row motion vectors. */ double MVrv; /*! * Variance of column motion vectors. */ double MVcv; /*! * Value in range [-1,1] indicating fraction of row and column motion vectors * that point inwards (negative MV value) or outwards (positive MV value). * For example, value of 1 indicates, all row/column MVs are inwards. */ double mv_in_out_count; /*! * Count of unique non-zero motion vectors. */ double new_mv_count; /*! * Duration of the frame / collection of frames. */ double duration; /*! * 1.0 if stats are for a single frame, OR * Number of frames in this collection for which the stats are accumulated. */ double count; /*! * standard deviation for (0, 0) motion prediction error */ double raw_error_stdev; /*! * Whether the frame contains a flash */ int64_t is_flash; /*! * Estimated noise variance */ double noise_var; /*! * Correlation coefficient with the previous frame */ double cor_coeff; /*! * log of intra_error */ double log_intra_error; /*! * log of coded_error */ double log_coded_error; } FIRSTPASS_STATS; // We want to keep one past stats for key frame detection // in test_candidate_kf() #define FIRSTPASS_INFO_STATS_PAST_MIN 1 // The size of static buffer used in FIRSTPASS_INFO. #define FIRSTPASS_INFO_STATIC_BUF_SIZE \ (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN) /*! * \brief Data structure used for managing first pass stats */ typedef struct { /*! * A static buffer that will be used when no ext_stats_buf is assigned. The * ext_stats_buf is assigned through av1_firstpass_info_init() when the user * already has a pre-existing firstpass stats that is stored in an external * buffer. The ext_stats_buf is usually used in two pass mode. When using one * pass mode, we generate "firstpass" stats and encode the video in the same * pass. In this scenario, the stats will be pushed and popped from * static_stats_buf. */ FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE]; /*! * A pointer to first pass stats. * Note that this buffer will be used as ring buffer. */ FIRSTPASS_STATS *stats_buf; /*! * size of stats_buf */ int stats_buf_size; /*! * start index of the available frame stats * Note that start_index doesn't always point to * current frame's stats because we need to * keep past stats as well. To access current * frame's stats, please use cur_index. */ int start_index; /*! * count available stats stored in stats_buf * the following condition should stay true * stats_count = future_stats_count + past_stats_count */ int stats_count; /*! * index of the current frame's stats */ int cur_index; /*! * count available future stats including current stats */ int future_stats_count; /*! * count available past stats EXCLUDING current stats */ int past_stats_count; /*! 
* Accumulation of the stats being pushed into firstpass_info */ FIRSTPASS_STATS total_stats; } FIRSTPASS_INFO; /*!\brief Init firstpass_info * * If using ext_stats_buf, the buffer needs to stay available during encoding * process. * * \ingroup rate_control * \param[out] firstpass_info struct of firstpass_info. * \param[in] ext_stats_buf external stats buffer. Pass in NULL if * choose to use internal static_stats_buf. * \param[in] ext_stats_buf_size external stats buffer size. Pass in 0 if * choose to use internal static_stats_buf. \return status */ aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info, FIRSTPASS_STATS *ext_stats_buf, int ext_stats_buf_size); /*!\brief Move cur_index by 1 * * \ingroup rate_control * \param[out] firstpass_info struct of firstpass_info. * \return status */ aom_codec_err_t av1_firstpass_info_move_cur_index( FIRSTPASS_INFO *firstpass_info); /*!\brief Pop a stats from firstpass_info * * \ingroup rate_control * \param[out] firstpass_info struct of firstpass_info. * \return status */ aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info); /*!\brief Move cur_index by 1 and pop a stats from firstpass_info * * \ingroup rate_control * \param[out] firstpass_info struct of firstpass_info. * \return status */ aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop( FIRSTPASS_INFO *firstpass_info); /*!\brief Push a stats into firstpass_info * * Note that the input stats will be copied into firstpass_info. * \ingroup rate_control * \param[out] firstpass_info struct of firstpass_info. * \param[in] input_stats input stats * \return status */ aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info, const FIRSTPASS_STATS *input_stats); /*!\brief Peek at a stats from firstpass_info * * The target index is as follows. * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size * * \ingroup rate_control * \param[in] firstpass_info struct of firstpass_info. * \param[in] offset_from_cur index offset from cur_index. * \return pointer to the stats. The pointer will be NULL if * stats_index_offset is invalid. */ const FIRSTPASS_STATS *av1_firstpass_info_peek( const FIRSTPASS_INFO *firstpass_info, int offset_from_cur); /*!\brief Count the future stats from the target in firstpass_info * Note that the target stats will be counted as well. * The target index is as follows. * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size * * \ingroup rate_control * \param[in] firstpass_info struct of firstpass_info. * \param[in] offset_from_cur target stats's inffset * from cur_index. * \return Number of stats in the future after the target stats * including itself. */ int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, int offset_from_cur); /*!\cond */ #define FC_ANIMATION_THRESH 0.15 enum { FC_NORMAL = 0, FC_GRAPHICS_ANIMATION = 1, FRAME_CONTENT_TYPES = 2 } UENUM1BYTE(FRAME_CONTENT_TYPE); /*!\endcond */ /*! * \brief Data related to the current GF/ARF group and the * individual frames within the group */ typedef struct GF_GROUP { /*!\cond */ // Frame update type, e.g. ARF/GF/LF/Overlay FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; // The number of frames displayed so far within the GOP at a given coding // frame. 
unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH]; int layer_depth[MAX_STATIC_GF_GROUP_LENGTH]; int arf_boost[MAX_STATIC_GF_GROUP_LENGTH]; int max_layer_depth; int max_layer_depth_allowed; // This is currently only populated for AOM_Q mode int q_val[MAX_STATIC_GF_GROUP_LENGTH]; int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH]; int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH]; // The frame coding type - inter/intra frame FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH]; // The reference frame buffer control - update or reset REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH]; int arf_index; // the index in the gf group of ARF, if no arf, then -1 int size; // The total length of a GOP // The offset into lookahead_ctx for choosing // source of frame parallel encodes. int src_offset[MAX_STATIC_GF_GROUP_LENGTH]; // Stores the display order hint of each frame in the current GF_GROUP. int display_idx[MAX_STATIC_GF_GROUP_LENGTH]; // The reference frame list maps the reference frame indexes to its // buffer index in the decoded buffer. A value of -1 means the // corresponding reference frame index doesn't point towards any // previously decoded frame. int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; // Update frame index int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH]; // The map_idx of primary reference int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH]; // Indicates the level of parallelism in frame parallel encodes. // 0 : frame is independently encoded (not part of parallel encodes). // 1 : frame is the first in encode order in a given parallel encode set. // 2 : frame occurs later in encode order in a given parallel encode set. int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH]; // Indicates whether a frame should act as non-reference frame. bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH]; // Indicates whether a frame is dropped. bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH]; // Stores the display order hint of the frames not to be // refreshed by the current frame. int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; // Stores the display order hint of the frame to be excluded during reference // assignment. int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH]; /*!\endcond */ } GF_GROUP; /*!\cond */ typedef struct { // Track if the last frame in a GOP has higher quality. int arf_gf_boost_lst; } GF_STATE; typedef struct { FIRSTPASS_STATS *stats_in_start; FIRSTPASS_STATS *stats_in_end; FIRSTPASS_STATS *stats_in_buf_end; FIRSTPASS_STATS *total_stats; FIRSTPASS_STATS *total_left_stats; } STATS_BUFFER_CTX; /*!\endcond */ /*! * \brief Two pass status and control data. */ typedef struct { /*!\cond */ unsigned int section_intra_rating; // Circular queue of first pass stats stored for most recent frames. // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored // here. FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1]; int frame_stats_next_idx; // Index to next unused element in frame_stats_arr. 
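// Context holding the pointers that delimit the buffered first pass stats; see STATS_BUFFER_CTX above.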
STATS_BUFFER_CTX *stats_buf_ctx; FIRSTPASS_INFO firstpass_info; // This is the first pass data structure // intended to replace stats_in int first_pass_done; int64_t bits_left; double modified_error_min; double modified_error_max; double modified_error_left; // Projected total bits available for a key frame group of frames int64_t kf_group_bits; // Error score of frames still to be coded in kf group double kf_group_error_left; // Over time correction for bits per macro block estimation double bpm_factor; // Record of target and actual bits spent in current ARF group int rolling_arf_group_target_bits; int rolling_arf_group_actual_bits; int sr_update_lag; int kf_zeromotion_pct; int last_kfgroup_zeromotion_pct; int extend_minq; int extend_maxq; /*!\endcond */ } TWO_PASS; /*! * \brief Frame level Two pass status and control data. */ typedef struct { /*!\cond */ const FIRSTPASS_STATS *stats_in; // Pointer to the stats of the current frame. const FIRSTPASS_STATS *this_frame; double mb_av_energy; // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; double frame_avg_haar_energy; /*!\endcond */ } TWO_PASS_FRAME; /*!\cond */ // This structure contains several key parameters to be accumulated for this // frame. typedef struct { // Intra prediction error. int64_t intra_error; // Average wavelet energy computed using Discrete Wavelet Transform (DWT). int64_t frame_avg_wavelet_energy; // Best of intra pred error and inter pred error using last frame as ref. int64_t coded_error; // Best of intra pred error and inter pred error using golden frame as ref. int64_t sr_coded_error; // Count of motion vector. int mv_count; // Count of blocks that pick inter prediction (inter pred error is smaller // than intra pred error). int inter_count; // Count of blocks that pick second ref (golden frame). int second_ref_count; // Count of blocks where the inter and intra are very close and very low. double neutral_count; // Count of blocks where intra error is very small. int intra_skip_count; // Start row. int image_data_start_row; // Count of unique non-zero motion vectors. int new_mv_count; // Sum of inward motion vectors. int sum_in_vectors; // Sum of motion vector row. int sum_mvr; // Sum of motion vector column. int sum_mvc; // Sum of absolute value of motion vector row. int sum_mvr_abs; // Sum of absolute value of motion vector column. int sum_mvc_abs; // Sum of the square of motion vector row. int64_t sum_mvrs; // Sum of the square of motion vector column. int64_t sum_mvcs; // A factor calculated using intra pred error. double intra_factor; // A factor that measures brightness. double brightness_factor; } FRAME_STATS; // This structure contains first pass data. typedef struct { // Buffer holding frame stats for all MACROBLOCKs. // mb_stats[i] stores the FRAME_STATS of the ith // MB in raster scan order. FRAME_STATS *mb_stats; // Buffer to store the prediction error of the (0,0) motion // vector using the last source frame as the reference. // raw_motion_err_list[i] stores the raw_motion_err of // the ith MB in raster scan order. int *raw_motion_err_list; } FirstPassData; struct AV1_COMP; struct EncodeFrameParams; struct AV1EncoderConfig; struct TileDataEnc; static inline int is_fp_wavelet_energy_invalid( const FIRSTPASS_STATS *fp_stats) { assert(fp_stats != NULL); return (fp_stats->frame_avg_wavelet_energy < 0); } static inline BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { return (is_screen_content_type ? 
BLOCK_8X8 : BLOCK_16X16); } int av1_get_unit_rows_in_tile(const TileInfo *tile, const BLOCK_SIZE fp_block_size); int av1_get_unit_cols_in_tile(const TileInfo *tile, const BLOCK_SIZE fp_block_size); void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td, struct TileDataEnc *tile_data, const int mb_row, const BLOCK_SIZE fp_block_size); void av1_end_first_pass(struct AV1_COMP *cpi); void av1_free_firstpass_data(FirstPassData *firstpass_data); void av1_twopass_zero_stats(FIRSTPASS_STATS *section); void av1_accumulate_stats(FIRSTPASS_STATS *section, const FIRSTPASS_STATS *frame); /*!\endcond */ /*!\brief AV1 first pass encoding. * * \ingroup rate_control * This function is the first encoding pass for the two pass encoding mode. * It encodes the whole video and collect essential information. * Two pass encoding is an encoding mode in the reference software (libaom) * of AV1 for high performance encoding. The first pass is a fast encoding * process to collect essential information to help the second pass make * encoding decisions and improve coding quality. The collected stats is used * in rate control, for example, to determine frame cut, the position of * alternative reference frame (ARF), etc. * * \param[in] cpi Top-level encoder structure * \param[in] ts_duration Duration of the frame / collection of frames * * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi" * is modified to store information computed in this function. */ void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration); void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_FIRSTPASS_H_ aom-3.12.1/av1/encoder/global_motion.c000066400000000000000000000510621477627663500175040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include #include "config/aom_dsp_rtcd.h" #include "av1/encoder/global_motion.h" #include "av1/common/convolve.h" #include "av1/common/warped_motion.h" #include "av1/encoder/segmentation.h" #define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) // Border over which to compute the global motion #define ERRORADV_BORDER 0 int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) { return best_erroradvantage < erroradv_tr && best_erroradvantage * params_cost < erroradv_prod_tr; } static void convert_to_params(const double *params, int32_t *model) { int i; model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * GM_TRANS_DECODE_FACTOR; model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * GM_TRANS_DECODE_FACTOR; for (i = 2; i < 6; ++i) { const int diag_value = ((i == 2 || i == 5) ? 
(1 << GM_ALPHA_PREC_BITS) : 0); model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); model[i] = (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; } } void av1_convert_model_to_params(const double *params, WarpedMotionParams *model) { convert_to_params(params, model->wmmat); model->wmtype = get_wmtype(model); model->invalid = 0; } // Adds some offset to a global motion parameter and handles // all of the necessary precision shifts, clamping, and // zero-centering. static int32_t add_param_offset(int param_index, int32_t param_value, int32_t offset) { const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF }; const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX }; // type of param: 0 - translation, 1 - affine const int param_type = (param_index < 2 ? 0 : 1); const int is_one_centered = (param_index == 2 || param_index == 5); // Make parameter zero-centered and offset the shift that was done to make // it compatible with the warped model param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> scale_vals[param_type]; // Add desired offset to the rescaled/zero-centered parameter param_value += offset; // Clamp the parameter so it does not overflow the number of bits allotted // to it in the bitstream param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], clamp_vals[param_type]); // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible // with the warped motion library param_value *= (1 << scale_vals[param_type]); // Undo the zero-centering step if necessary return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); } static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { switch (wmtype) { case IDENTITY: wm->wmmat[0] = 0; wm->wmmat[1] = 0; AOM_FALLTHROUGH_INTENDED; case TRANSLATION: wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; wm->wmmat[3] = 0; AOM_FALLTHROUGH_INTENDED; case ROTZOOM: wm->wmmat[4] = -wm->wmmat[3]; wm->wmmat[5] = wm->wmmat[2]; AOM_FALLTHROUGH_INTENDED; case AFFINE: break; default: assert(0); } wm->wmtype = wmtype; } #if CONFIG_AV1_HIGHBITDEPTH static inline int generic_sad_highbd(const uint16_t *const ref, int ref_stride, const uint16_t *const dst, int dst_stride, int p_width, int p_height) { // This function should only be called for patches smaller than // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. 
This keeps the number of pixels // small enough that we don't need a 64-bit accumulator assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); int sad = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); } } return sad; } #if WARP_ERROR_BLOCK != 32 #error "Need to change SAD call size in highbd_segmented_frame_error" #endif // WARP_ERROR_BLOCK != 32 static int64_t highbd_segmented_frame_error( const uint16_t *const ref, int ref_stride, const uint16_t *const dst, int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map, int segment_map_stride) { (void)bd; int patch_w, patch_h; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); int64_t sum_error = 0; for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { int seg_x = j >> WARP_ERROR_BLOCK_LOG; int seg_y = i >> WARP_ERROR_BLOCK_LOG; // Only compute the error if this block contains inliers from the motion // model if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; // avoid computing error into the frame padding patch_w = AOMMIN(error_bsize_w, p_width - j); patch_h = AOMMIN(error_bsize_h, p_height - i); if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { sum_error += aom_highbd_sad32x32( CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride, CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); } else { sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride, dst + j + i * dst_stride, dst_stride, patch_w, patch_h); } } } return sum_error; } #if WARP_ERROR_BLOCK != 32 #error "Need to change SAD call size in highbd_warp_error" #endif // WARP_ERROR_BLOCK != 32 static int64_t highbd_warp_error(WarpedMotionParams *wm, const uint16_t *const ref, int ref_width, int ref_height, int ref_stride, const uint16_t *const dst, int dst_stride, int p_col, int p_row, int p_width, int p_height, int subsampling_x, int subsampling_y, int bd, int64_t best_error, uint8_t *segment_map, int segment_map_stride) { int64_t gm_sumerr = 0; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); ConvolveParams conv_params = get_conv_params(0, 0, bd); conv_params.use_dist_wtd_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { int seg_x = j >> WARP_ERROR_BLOCK_LOG; int seg_y = i >> WARP_ERROR_BLOCK_LOG; // Only compute the error if this block contains inliers from the motion // model if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; // avoid warping extra 8x8 blocks in the padded region of the frame // when p_width and p_height are not multiples of WARP_ERROR_BLOCK const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd, &conv_params); if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { gm_sumerr += aom_highbd_sad32x32( CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK, CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); } else { gm_sumerr += generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, dst_stride, 
warp_w, warp_h); } if (gm_sumerr > best_error) return INT64_MAX; } } return gm_sumerr; } #endif static inline int generic_sad(const uint8_t *const ref, int ref_stride, const uint8_t *const dst, int dst_stride, int p_width, int p_height) { // This function should only be called for patches smaller than // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels // small enough that we don't need a 64-bit accumulator assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); int sad = 0; for (int i = 0; i < p_height; ++i) { for (int j = 0; j < p_width; ++j) { sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); } } return sad; } #if WARP_ERROR_BLOCK != 32 #error "Need to change SAD call size in segmented_warp_error" #endif // WARP_ERROR_BLOCK != 32 static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride, const uint8_t *const dst, int dst_stride, int p_width, int p_height, uint8_t *segment_map, int segment_map_stride) { int patch_w, patch_h; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); int64_t sum_error = 0; for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { int seg_x = j >> WARP_ERROR_BLOCK_LOG; int seg_y = i >> WARP_ERROR_BLOCK_LOG; // Only compute the error if this block contains inliers from the motion // model if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; // avoid computing error into the frame padding patch_w = AOMMIN(error_bsize_w, p_width - j); patch_h = AOMMIN(error_bsize_h, p_height - i); if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride, dst + j + i * dst_stride, dst_stride); } else { sum_error += generic_sad(ref + j + i * ref_stride, ref_stride, dst + j + i * dst_stride, dst_stride, patch_w, patch_h); } } } return sum_error; } #if WARP_ERROR_BLOCK != 32 #error "Need to change SAD call size in warp_error" #endif // WARP_ERROR_BLOCK != 32 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, int ref_width, int ref_height, int ref_stride, const uint8_t *const dst, int dst_stride, int p_col, int p_row, int p_width, int p_height, int subsampling_x, int subsampling_y, int64_t best_error, uint8_t *segment_map, int segment_map_stride) { int64_t gm_sumerr = 0; int warp_w, warp_h; const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); ConvolveParams conv_params = get_conv_params(0, 0, 8); conv_params.use_dist_wtd_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { int seg_x = j >> WARP_ERROR_BLOCK_LOG; int seg_y = i >> WARP_ERROR_BLOCK_LOG; // Only compute the error if this block contains inliers from the motion // model if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; // avoid warping extra 8x8 blocks in the padded region of the frame // when p_width and p_height are not multiples of WARP_ERROR_BLOCK warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { gm_sumerr += 
aom_sad32x32(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, dst_stride); } else { gm_sumerr += generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, dst_stride, warp_w, warp_h); } if (gm_sumerr > best_error) return INT64_MAX; } } return gm_sumerr; } int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int p_width, int p_height, uint8_t *segment_map, int segment_map_stride) { #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { return highbd_segmented_frame_error( CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst), dst_stride, p_width, p_height, bd, segment_map, segment_map_stride); } #endif (void)use_hbd; (void)bd; return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width, p_height, segment_map, segment_map_stride); } // Returns the error between the result of applying motion 'wm' to the frame // described by 'ref' and the frame described by 'dst'. static int64_t get_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, const uint8_t *ref, int ref_width, int ref_height, int ref_stride, uint8_t *dst, int dst_stride, int p_col, int p_row, int p_width, int p_height, int subsampling_x, int subsampling_y, int64_t best_error, uint8_t *segment_map, int segment_map_stride) { if (!av1_get_shear_params(wm)) return INT64_MAX; #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width, ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst), dst_stride, p_col, p_row, p_width, p_height, subsampling_x, subsampling_y, bd, best_error, segment_map, segment_map_stride); #endif (void)use_hbd; (void)bd; return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride, p_col, p_row, p_width, p_height, subsampling_x, subsampling_y, best_error, segment_map, segment_map_stride); } int64_t av1_refine_integerized_param( WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int d_stride, int n_refinements, int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) { static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; const int border = ERRORADV_BORDER; int i = 0, p; int n_params = max_trans_model_params[wmtype]; int32_t *param_mat = wm->wmmat; int64_t step_error, best_error; int32_t step; int32_t *param; int32_t curr_param; int32_t best_param; force_wmtype(wm, wmtype); wm->wmtype = get_wmtype(wm); if (n_refinements == 0) { // Compute the maximum error value that will be accepted, so that // get_warp_error can terminate early if it proves the model will not // be accepted. int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr); return get_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, dst + border * d_stride + border, d_stride, border, border, d_width - 2 * border, d_height - 2 * border, 0, 0, selection_threshold, segment_map, segment_map_stride); } // When refining, use a slightly higher threshold for the initial error // calculation - see comment above erroradv_early_tr for why. 
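// Illustrative example with assumed numbers: if ref_frame_error is 100000, the threshold below is lrint(100000 * 0.70) = 70000, slightly looser than the lrint(100000 * 0.65) = 65000 cutoff applied above when n_refinements == 0.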
int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_early_tr); best_error = get_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, dst + border * d_stride + border, d_stride, border, border, d_width - 2 * border, d_height - 2 * border, 0, 0, selection_threshold, segment_map, segment_map_stride); if (best_error > selection_threshold) { return INT64_MAX; } step = 1 << (n_refinements - 1); for (i = 0; i < n_refinements; i++, step >>= 1) { for (p = 0; p < n_params; ++p) { int step_dir = 0; param = param_mat + p; curr_param = *param; best_param = curr_param; // look to the left // Note: We have to use force_wmtype() to keep the proper symmetry for // ROTZOOM type models *param = add_param_offset(p, curr_param, -step); force_wmtype(wm, wmtype); step_error = get_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, dst + border * d_stride + border, d_stride, border, border, d_width - 2 * border, d_height - 2 * border, 0, 0, best_error, segment_map, segment_map_stride); if (step_error < best_error) { best_error = step_error; best_param = *param; step_dir = -1; } // look to the right *param = add_param_offset(p, curr_param, step); force_wmtype(wm, wmtype); step_error = get_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, dst + border * d_stride + border, d_stride, border, border, d_width - 2 * border, d_height - 2 * border, 0, 0, best_error, segment_map, segment_map_stride); if (step_error < best_error) { best_error = step_error; best_param = *param; step_dir = 1; } // look to the direction chosen above repeatedly until error increases // for the biggest step size while (step_dir) { *param = add_param_offset(p, best_param, step * step_dir); force_wmtype(wm, wmtype); step_error = get_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, dst + border * d_stride + border, d_stride, border, border, d_width - 2 * border, d_height - 2 * border, 0, 0, best_error, segment_map, segment_map_stride); if (step_error < best_error) { best_error = step_error; best_param = *param; } else { step_dir = 0; } } // Restore best parameter value so far *param = best_param; force_wmtype(wm, wmtype); } } wm->wmtype = get_wmtype(wm); // Recompute shear params for the refined model // This should never fail, because we only ever consider warp-able models if (!av1_get_shear_params(wm)) { assert(0); } return best_error; } #define FEAT_COUNT_TR 3 #define SEG_COUNT_TR 48 void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, int height, int *inliers, int num_inliers) { int seg_count = 0; memset(segment_map, 0, sizeof(*segment_map) * width * height); for (int i = 0; i < num_inliers; i++) { int x = inliers[i * 2]; int y = inliers[i * 2 + 1]; int seg_x = x >> WARP_ERROR_BLOCK_LOG; int seg_y = y >> WARP_ERROR_BLOCK_LOG; segment_map[seg_y * width + seg_x] += 1; } for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { uint8_t feat_count = segment_map[i * width + j]; segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR); seg_count += (segment_map[i * width + j]); } } // If this motion does not make up a large enough portion of the frame, // use the unsegmented version of the error metric if (seg_count < SEG_COUNT_TR) memset(segment_map, 1, width * height * sizeof(*segment_map)); } aom-3.12.1/av1/encoder/global_motion.h000066400000000000000000000105331477627663500175070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_ #define AOM_AV1_ENCODER_GLOBAL_MOTION_H_ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_util/aom_pthread.h" #include "av1/encoder/enc_enums.h" #ifdef __cplusplus extern "C" { #endif #define RANSAC_NUM_MOTIONS 1 #define GM_MAX_REFINEMENT_STEPS 5 #define MAX_DIRECTIONS 2 // The structure holds a valid reference frame type and its temporal distance // from the source frame. typedef struct { int distance; MV_REFERENCE_FRAME frame; } FrameDistPair; typedef struct { // Array of structure which holds the global motion parameters for a given // motion model. motion_models[i] holds the parameters for a given motion // model for the ith ransac motion. MotionModel motion_models[RANSAC_NUM_MOTIONS]; // Pointer to hold inliers from motion model. uint8_t *segment_map; } GlobalMotionData; typedef struct { // Holds the mapping of each thread to past/future direction. // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1) // assigned to the ith thread. int8_t thread_id_to_dir[MAX_NUM_THREADS]; // A flag which holds the early exit status based on the speed feature // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed // feature based early exit happens in the direction 'i'. int8_t early_exit[MAX_DIRECTIONS]; // Counter for the next reference frame to be processed. // next_frame_to_process[i] will hold the count of next reference frame to be // processed in the direction 'i'. int8_t next_frame_to_process[MAX_DIRECTIONS]; } GlobalMotionJobInfo; typedef struct { // Data related to assigning jobs for global motion multi-threading. GlobalMotionJobInfo job_info; #if CONFIG_MULTITHREAD // Mutex lock used while dispatching jobs. pthread_mutex_t *mutex_; #endif // Initialized to false, set to true by the worker thread that encounters an // error in order to abort the processing of other worker threads. 
bool gm_mt_exit; } AV1GlobalMotionSync; void av1_convert_model_to_params(const double *params, WarpedMotionParams *model); // Criteria for accepting a global motion model static const double erroradv_tr = 0.65; static const double erroradv_prod_tr = 20000; // Early exit threshold for global motion refinement // This is set slightly higher than erroradv_tr, as a compromise between // two factors: // // 1) By rejecting un-promising models early, we can reduce the encode time // spent trying to refine them // // 2) When we refine a model, its error may decrease to below the acceptance // threshold even if the model is initially above the threshold static const double erroradv_early_tr = 0.70; int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost); void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, int height, int *inliers, int num_inliers); int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int p_width, int p_height, uint8_t *segment_map, int segment_map_stride); // Returns the warp error between "dst" and the result of applying the // motion params that result from fine-tuning "wm" to "ref". Note that "wm" is // modified in place. int64_t av1_refine_integerized_param( WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, int d_width, int d_height, int d_stride, int n_refinements, int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_ aom-3.12.1/av1/encoder/global_motion_facade.c000066400000000000000000000451031477627663500207660ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/pyramid.h" #include "av1/common/warped_motion.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/global_motion_facade.h" // Range of model types to search #define FIRST_GLOBAL_TRANS_TYPE ROTZOOM #define LAST_GLOBAL_TRANS_TYPE ROTZOOM // Computes the cost for the warp parameters. 
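// Each active parameter is coded as a signed subexponential refinement of the corresponding parameter in ref_gm (typically the previous frame's global motion), so the cost grows with how far the new model drifts from that reference; the resulting bit count is scaled by AV1_PROB_COST_SHIFT to match the rate units used in RD decisions.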
static int gm_get_params_cost(const WarpedMotionParams *gm, const WarpedMotionParams *ref_gm, int allow_hp) { int params_cost = 0; int trans_bits, trans_prec_diff; switch (gm->wmtype) { case AFFINE: case ROTZOOM: params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); if (gm->wmtype >= AFFINE) { params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); params_cost += aom_count_signed_primitive_refsubexpfin( GM_ALPHA_MAX + 1, SUBEXPFIN_K, (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); } AOM_FALLTHROUGH_INTENDED; case TRANSLATION: trans_bits = (gm->wmtype == TRANSLATION) ? GM_ABS_TRANS_ONLY_BITS - !allow_hp : GM_ABS_TRANS_BITS; trans_prec_diff = (gm->wmtype == TRANSLATION) ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp : GM_TRANS_PREC_DIFF; params_cost += aom_count_signed_primitive_refsubexpfin( (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_gm->wmmat[0] >> trans_prec_diff), (gm->wmmat[0] >> trans_prec_diff)); params_cost += aom_count_signed_primitive_refsubexpfin( (1 << trans_bits) + 1, SUBEXPFIN_K, (ref_gm->wmmat[1] >> trans_prec_diff), (gm->wmmat[1] >> trans_prec_diff)); AOM_FALLTHROUGH_INTENDED; case IDENTITY: break; default: assert(0); } return (params_cost << AV1_PROB_COST_SHIFT); } // For the given reference frame, computes the global motion parameters for // different motion models and finds the best. static inline void compute_global_motion_for_ref_frame( AV1_COMP *cpi, struct aom_internal_error_info *error_info, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, const int segment_map_h, const WarpedMotionParams *ref_params) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int src_width = cpi->source->y_crop_width; int src_height = cpi->source->y_crop_height; int src_stride = cpi->source->y_stride; assert(ref_buf[frame] != NULL); int bit_depth = cpi->common.seq_params->bit_depth; GlobalMotionMethod global_motion_method = default_global_motion_method; int downsample_level = cpi->sf.gm_sf.downsample_level; int num_refinements = cpi->sf.gm_sf.num_refinement_steps; bool mem_alloc_failed = false; // Select the best model based on fractional error reduction. 
// By initializing this to erroradv_tr, the same logic which is used to // select the best model will automatically filter out any model which // doesn't meet the required quality threshold double best_erroradv = erroradv_tr; for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE; model <= LAST_GLOBAL_TRANS_TYPE; ++model) { if (!aom_compute_global_motion(model, cpi->source, ref_buf[frame], bit_depth, global_motion_method, downsample_level, motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { if (mem_alloc_failed) { aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate global motion buffers"); } continue; } for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) { if (motion_models[i].num_inliers == 0) continue; WarpedMotionParams tmp_wm_params; av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params); // Check that the generated model is warp-able if (!av1_get_shear_params(&tmp_wm_params)) continue; // Skip models that we won't use (IDENTITY or TRANSLATION) // // For IDENTITY type models, we don't need to evaluate anything because // all the following logic is effectively comparing the estimated model // to an identity model. // // For TRANSLATION type global motion models, gm_get_motion_vector() gives // the wrong motion vector (see comments in that function for details). // As translation-type models do not give much gain, we can avoid this bug // by never choosing a TRANSLATION type model if (tmp_wm_params.wmtype <= TRANSLATION) continue; av1_compute_feature_segmentation_map( segment_map, segment_map_w, segment_map_h, motion_models[i].inliers, motion_models[i].num_inliers); int64_t ref_frame_error = av1_segmented_frame_error( is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride, src_width, src_height, segment_map, segment_map_w); if (ref_frame_error == 0) continue; const int64_t warp_error = av1_refine_integerized_param( &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width, ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride, cpi->source->y_buffer, src_width, src_height, src_stride, num_refinements, ref_frame_error, segment_map, segment_map_w); // av1_refine_integerized_param() can return a simpler model type than // its input, so re-check model type here if (tmp_wm_params.wmtype <= TRANSLATION) continue; double erroradvantage = (double)warp_error / ref_frame_error; // Check that the model signaling cost is not too high if (!av1_is_enough_erroradvantage( erroradvantage, gm_get_params_cost(&tmp_wm_params, ref_params, cm->features.allow_high_precision_mv))) { continue; } if (erroradvantage < best_erroradv) { best_erroradv = erroradvantage; // Save the wm_params modified by // av1_refine_integerized_param() rather than motion index to // avoid rerunning refine() below. memcpy(&(cm->global_motion[frame]), &tmp_wm_params, sizeof(WarpedMotionParams)); } } } } // Computes global motion for the given reference frame. void av1_compute_gm_for_valid_ref_frames( AV1_COMP *cpi, struct aom_internal_error_info *error_info, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, int segment_map_h) { AV1_COMMON *const cm = &cpi->common; const WarpedMotionParams *ref_params = cm->prev_frame ? 
&cm->prev_frame->global_motion[frame] : &default_warp_params; compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame, motion_models, segment_map, segment_map_w, segment_map_h, ref_params); } // Loops over valid reference frames and computes global motion estimation. static inline void compute_global_motion_for_references( AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, const int segment_map_h) { AV1_COMMON *const cm = &cpi->common; struct aom_internal_error_info *const error_info = cpi->td.mb.e_mbd.error_info; // Compute global motion w.r.t. reference frames starting from the nearest ref // frame in a given direction. for (int frame = 0; frame < num_ref_frames; frame++) { int ref_frame = reference_frame[frame].frame; av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame, motion_models, segment_map, segment_map_w, segment_map_h); // If global motion w.r.t. current ref frame is // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t // the remaining ref frames in that direction. if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && cm->global_motion[ref_frame].wmtype <= TRANSLATION) break; } } // Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to // 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise. static int compare_distance(const void *a, const void *b) { const int diff = ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance; if (diff > 0) return 1; else if (diff < 0) return -1; return 0; } static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) { int is_gm_present = 1; // Check number of GM models only in GF groups with ARF frames. GM param // estimation is always done in the case of GF groups with no ARF frames (flat // gops) if (cpi->ppi->gf_group.arf_index > -1) { // valid_gm_model_found is initialized to INT32_MAX in the beginning of // every GF group. // Therefore, GM param estimation is always done for all frames until // at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and LF_UPDATE are // encoded in a GF group For subsequent frames, GM param estimation is // disabled, if no valid models have been found in all the three update // types. is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) || (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) || (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0); } return !is_gm_present; } // Prunes reference frames for global motion estimation based on the speed // feature 'gm_search_type'. static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { (void)frame; switch (sf->gm_sf.gm_search_type) { case GM_FULL_SEARCH: return 1; case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: return !(frame == LAST2_FRAME || frame == LAST3_FRAME); case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: return !(frame == LAST2_FRAME || frame == LAST3_FRAME || (frame == ALTREF2_FRAME)); case GM_SEARCH_CLOSEST_REFS_ONLY: return 1; case GM_DISABLE_SEARCH: return 0; default: assert(0); } return 1; } // Populates valid reference frames in past/future directions in // 'reference_frames' and their count in 'num_ref_frames'. 
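// num_ref_frames[0] receives the count of valid past references and num_ref_frames[1] the count of valid future references; references at the same display instant as the current frame, references pruned by the selective-ref speed features, and references from a higher pyramid level than the current frame are all left out.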
static inline void update_valid_ref_frames_for_gm( AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1], int *num_ref_frames) { AV1_COMMON *const cm = &cpi->common; int *num_past_ref_frames = &num_ref_frames[0]; int *num_future_ref_frames = &num_ref_frames[1]; const GF_GROUP *gf_group = &cpi->ppi->gf_group; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index); int cur_frame_gm_disabled = 0; int pyr_lvl = cm->cur_frame->pyramid_level; if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) { cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi); } for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; RefCntBuffer *buf = get_ref_frame_buf(cm, frame); const int ref_disabled = !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); ref_buf[frame] = NULL; cm->global_motion[frame] = default_warp_params; // Skip global motion estimation for invalid ref frames if (buf == NULL || (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) { continue; } else { ref_buf[frame] = &buf->buf; } int prune_ref_frames = ref_pruning_enabled && prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint); int ref_pyr_lvl = buf->pyramid_level; if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames && ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) { assert(ref_buf[frame] != NULL); const int relative_frame_dist = av1_encoder_get_relative_dist( buf->display_order_hint, cm->cur_frame->display_order_hint); // Populate past and future ref frames. // reference_frames[0][] indicates past direction and // reference_frames[1][] indicates future direction. if (relative_frame_dist == 0) { // Skip global motion estimation for frames at the same nominal instant. // This will generally be either a "real" frame coded against a // temporal filtered version, or a higher spatial layer coded against // a lower spatial layer. In either case, the optimal motion model will // be IDENTITY, so we don't need to search explicitly. } else if (relative_frame_dist < 0) { reference_frames[0][*num_past_ref_frames].distance = abs(relative_frame_dist); reference_frames[0][*num_past_ref_frames].frame = frame; (*num_past_ref_frames)++; } else { reference_frames[1][*num_future_ref_frames].distance = abs(relative_frame_dist); reference_frames[1][*num_future_ref_frames].frame = frame; (*num_future_ref_frames)++; } } } } // Initializes parameters used for computing global motion. static inline void setup_global_motion_info_params(AV1_COMP *cpi) { GlobalMotionInfo *const gm_info = &cpi->gm_info; YV12_BUFFER_CONFIG *source = cpi->source; gm_info->segment_map_w = (source->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; gm_info->segment_map_h = (source->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; memset(gm_info->reference_frames, -1, sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS * (REF_FRAMES - 1)); av1_zero(gm_info->num_ref_frames); // Populate ref_buf for valid ref frames in global motion update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf, gm_info->reference_frames, gm_info->num_ref_frames); // Sort the past and future ref frames in the ascending order of their // distance from the current frame. 
reference_frames[0] => past direction // and reference_frames[1] => future direction. qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0], sizeof(gm_info->reference_frames[0][0]), compare_distance); qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1], sizeof(gm_info->reference_frames[1][0]), compare_distance); if (cpi->sf.gm_sf.gm_search_type == GM_SEARCH_CLOSEST_REFS_ONLY) { // Filter down to the nearest two ref frames. // Prefer one past and one future ref over two past refs, even if // the second past ref is closer if (gm_info->num_ref_frames[1] > 0) { gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 1); gm_info->num_ref_frames[1] = AOMMIN(gm_info->num_ref_frames[1], 1); } else { gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 2); } } } // Computes global motion w.r.t. valid reference frames. static inline void global_motion_estimation(AV1_COMP *cpi) { GlobalMotionInfo *const gm_info = &cpi->gm_info; GlobalMotionData *gm_data = &cpi->td.gm_data; // Compute global motion w.r.t. past reference frames and future reference // frames for (int dir = 0; dir < MAX_DIRECTIONS; dir++) { if (gm_info->num_ref_frames[dir] > 0) compute_global_motion_for_references( cpi, gm_info->ref_buf, gm_info->reference_frames[dir], gm_info->num_ref_frames[dir], gm_data->motion_models, gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h); } } // Global motion estimation for the current frame is computed.This computation // happens once per frame and the winner motion model parameters are stored in // cm->cur_frame->global_motion. void av1_compute_global_motion_facade(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; GlobalMotionInfo *const gm_info = &cpi->gm_info; if (cpi->oxcf.tool_cfg.enable_global_motion) { if (cpi->gf_frame_index == 0) { for (int i = 0; i < FRAME_UPDATE_TYPES; i++) { cpi->ppi->valid_gm_model_found[i] = INT32_MAX; #if CONFIG_FPMT_TEST if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX; #endif } } } if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source && cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done && cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) { setup_global_motion_info_params(cpi); // Terminate early if the total number of reference frames is zero. if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) { gm_alloc_data(cpi, &cpi->td.gm_data); if (cpi->mt_info.num_workers > 1) av1_global_motion_estimation_mt(cpi); else global_motion_estimation(cpi); gm_dealloc_data(&cpi->td.gm_data); gm_info->search_done = 1; } } memcpy(cm->cur_frame->global_motion, cm->global_motion, sizeof(cm->cur_frame->global_motion)); } aom-3.12.1/av1/encoder/global_motion_facade.h000066400000000000000000000041751477627663500207770ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ #define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ #ifdef __cplusplus extern "C" { #endif struct yv12_buffer_config; struct AV1_COMP; // Allocates memory for members of GlobalMotionData. static inline void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { AV1_COMMON *cm = &cpi->common; GlobalMotionInfo *gm_info = &cpi->gm_info; CHECK_MEM_ERROR(cm, gm_data->segment_map, aom_malloc(sizeof(*gm_data->segment_map) * gm_info->segment_map_w * gm_info->segment_map_h)); av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS); for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers, aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 * MAX_CORNERS)); } } // Deallocates the memory allocated for members of GlobalMotionData. static inline void gm_dealloc_data(GlobalMotionData *gm_data) { aom_free(gm_data->segment_map); gm_data->segment_map = NULL; for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { aom_free(gm_data->motion_models[m].inliers); gm_data->motion_models[m].inliers = NULL; } } void av1_compute_gm_for_valid_ref_frames( AV1_COMP *cpi, struct aom_internal_error_info *error_info, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, int segment_map_h); void av1_compute_global_motion_facade(struct AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ aom-3.12.1/av1/encoder/gop_structure.c000066400000000000000000001145211477627663500175640ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/blockd.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/encoder.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/pass2_strategy.h" // This function sets gf_group->frame_parallel_level for LF_UPDATE frames based // on the value of parallel_frame_count. static void set_frame_parallel_level(int *frame_parallel_level, int *parallel_frame_count, int max_parallel_frames) { assert(*parallel_frame_count > 0); // parallel_frame_count > 1 indicates subsequent frame(s) in the current // parallel encode set. *frame_parallel_level = 1 + (*parallel_frame_count > 1); // Update the count of no. of parallel frames. (*parallel_frame_count)++; if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1; } // This function sets gf_group->src_offset based on frame_parallel_level. 
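// Worked example with assumed indices: if the first frame of a parallel encode set has cur_frame_idx 5 (so first_frame_index becomes 5), a later frame in the set with cur_frame_idx 6 and arf_src_offset 0 gets src_offset (6 + 0) - 5 = 1.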
// Outputs are gf_group->src_offset and first_frame_index static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, int cur_frame_idx, int frame_ind) { if (gf_group->frame_parallel_level[frame_ind] > 0) { if (gf_group->frame_parallel_level[frame_ind] == 1) { *first_frame_index = cur_frame_idx; } // Obtain the offset of the frame at frame_ind in the lookahead queue by // subtracting the display order hints of the current frame from the display // order hint of the first frame in parallel encoding set (at // first_frame_index). gf_group->src_offset[frame_ind] = (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) - *first_frame_index; } } // Sets the GF_GROUP params for LF_UPDATE frames. static inline void set_params_for_leaf_frames( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, int max_parallel_frames, int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index, int layer_depth, int start, int end) { gf_group->update_type[*frame_ind] = LF_UPDATE; gf_group->arf_src_offset[*frame_ind] = 0; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); gf_group->display_idx[*frame_ind] = (*cur_disp_index); gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start, end - start, 0, NULL, NULL, 0); ++(*cur_disp_index); // Set the level of parallelism for the LF_UPDATE frame. if (do_frame_parallel_encode) { set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], parallel_frame_count, max_parallel_frames); // Set LF_UPDATE frames as non-reference frames. gf_group->is_frame_non_ref[*frame_ind] = true; } set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); ++(*frame_ind); ++(*cur_frame_idx); } // Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames. static inline void set_params_for_intnl_overlay_frames( GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, int *first_frame_index, int *cur_disp_index, int layer_depth) { gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; gf_group->arf_src_offset[*frame_ind] = 0; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->display_idx[*frame_ind] = (*cur_disp_index); ++(*cur_disp_index); set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); ++(*frame_ind); ++(*cur_frame_idx); } // Sets the GF_GROUP params for INTNL_ARF_UPDATE frames. 
static inline void set_params_for_internal_arfs( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, int max_parallel_frames, int do_frame_parallel_encode, int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset, int f_frames, int b_frames) { gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; gf_group->arf_src_offset[*frame_ind] = arf_src_offset; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->display_idx[*frame_ind] = (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset, f_frames, b_frames, NULL, NULL, 0); if (do_frame_parallel_encode) { if (depth_thr != INT_MAX) { assert(depth_thr == 3 || depth_thr == 4); assert(IMPLIES(depth_thr == 3, layer_depth == 4)); assert(IMPLIES(depth_thr == 4, layer_depth == 5)); // Set frame_parallel_level of the first frame in the given layer to 1. if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { gf_group->frame_parallel_level[*frame_ind] = 1; } else { // Set frame_parallel_level of the consecutive frame in the same given // layer to 2. assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); gf_group->frame_parallel_level[*frame_ind] = 2; // Store the display order hints of the past 2 INTNL_ARF_UPDATE // frames which would not have been displayed at the time of the encode // of current frame. gf_group->skip_frame_refresh[*frame_ind][0] = gf_group->display_idx[(*frame_ind) - 1]; gf_group->skip_frame_refresh[*frame_ind][1] = gf_group->display_idx[(*frame_ind) - 2]; // Set the display_idx of frame_parallel_level 1 frame in // gf_group->skip_frame_as_ref. gf_group->skip_frame_as_ref[*frame_ind] = gf_group->display_idx[(*frame_ind) - 1]; } } // If max_parallel_frames is not exceeded and if the frame will not be // temporally filtered, encode the next internal ARF frame in parallel. if (*parallel_frame_count > 1 && *parallel_frame_count <= max_parallel_frames) { if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) gf_group->frame_parallel_level[*frame_ind] = 2; *parallel_frame_count = 1; } } set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); ++(*frame_ind); } // Set parameters for frames between 'start' and 'end' (excluding both). static void set_multi_layer_params_for_fp( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc, RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end, int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, int max_parallel_frames, int do_frame_parallel_encode, int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) { const int num_frames_to_process = end - start; // Either we are at the last level of the pyramid, or we don't have enough // frames between 'l' and 'r' to create one more level. if (layer_depth > gf_group->max_layer_depth_allowed || num_frames_to_process < 3) { // Leaf nodes. 
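// Each frame in [start, end) becomes an LF_UPDATE leaf frame here; no further pyramid levels are constructed below this point.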
while (start < end) { set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, cur_disp_idx, layer_depth, start, end); ++start; } } else { const int m = (start + end - 1) / 2; // Internal ARF. int arf_src_offset = m - start; set_params_for_internal_arfs( twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx, layer_depth, arf_src_offset, m, end - m, m - start); // If encode reordering is enabled, configure the multi-layers accordingly // and return. For e.g., the encode order for gf-interval 16 after // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10-> // 14-> 9-> 11-> 13-> 15. if (layer_depth >= depth_thr) { int m1 = (m + start - 1) / 2; int m2 = (m + 1 + end) / 2; int arf_src_offsets[2] = { m1 - start, m2 - start }; // Parameters to compute arf_boost. int offset[2] = { m1, m2 }; int f_frames[2] = { m - m1, end - m2 }; int b_frames[2] = { m1 - start, m2 - (m + 1) }; // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered. for (int i = 0; i < 2; i++) { set_params_for_internal_arfs( twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i], f_frames[i], b_frames[i]); } // Initialize the start and end indices to configure LF_UPDATE frames. int start_idx[4] = { start, m1 + 1, m + 1, end - 1 }; int end_idx[4] = { m1, m, m2, end }; int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth, layer_depth + 1, INVALID_IDX }; // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE // frames after reordering. for (int i = 0; i < 4; i++) { set_multi_layer_params_for_fp( twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start_idx[i], end_idx[i], cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2); if (layer_depth_for_intnl_overlay[i] != INVALID_IDX) set_params_for_intnl_overlay_frames( gf_group, cur_frame_idx, frame_ind, first_frame_index, cur_disp_idx, layer_depth_for_intnl_overlay[i]); } return; } // Frames displayed before this internal ARF. set_multi_layer_params_for_fp( twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, layer_depth + 1); // Overlay for internal ARF. set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind, first_frame_index, cur_disp_idx, layer_depth); // Frames displayed after this internal ARF. set_multi_layer_params_for_fp( twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, layer_depth + 1); } } // Structure for bookkeeping start, end and display indices to configure // INTNL_ARF_UPDATE frames. typedef struct { int start; int end; int display_index; } FRAME_REORDER_INFO; // Updates the stats required to configure the GF_GROUP. 
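// Each arf_frame_stats entry records the interval covered by an
// INTNL_ARF_UPDATE frame together with its display index, so that the next
// layer depth can look for further internal ARFs on either side of it.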
static inline void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, int arf_frame_index, int display_idx, int start, int end) { arf_frame_stats[arf_frame_index].start = start; arf_frame_stats[arf_frame_index].end = end; arf_frame_stats[arf_frame_index].display_index = display_idx; } // Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates // doh_gf_index_map and arf_frame_stats. static inline void set_params_for_internal_arfs_in_gf14( GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, int *count_arf_frames, int *doh_gf_index_map, int start, int end, int layer_depth, int layer_with_parallel_encodes) { int index = (start + end - 1) / 2; gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; gf_group->arf_src_offset[*frame_ind] = index - 1; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->display_idx[*frame_ind] = (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; // Update the display index of the current frame with its gf index. doh_gf_index_map[index] = *frame_ind; if (layer_with_parallel_encodes) { assert(layer_depth == 4); // Set frame_parallel_level of the first frame in the given layer depth // to 1. if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { gf_group->frame_parallel_level[*frame_ind] = 1; } else { // Set frame_parallel_level of the consecutive frame in the same given // layer depth to 2. assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); gf_group->frame_parallel_level[*frame_ind] = 2; // Set the display_idx of frame_parallel_level 1 frame in // gf_group->skip_frame_as_ref. gf_group->skip_frame_as_ref[*frame_ind] = gf_group->display_idx[(*frame_ind) - 1]; } } ++(*frame_ind); // Update arf_frame_stats. fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end); ++(*count_arf_frames); } // Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer // depth. static inline void set_params_for_cur_layer_frames( GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start, int node_end, int layer_depth) { assert(num_dir < 3); int start, end; // Iterate through the nodes in the previous layer depth. for (int i = node_start; i < node_end; i++) { // For each node, check if a frame can be coded as an INTNL_ARF_UPDATE frame // in either direction. for (int dir = 0; dir < num_dir; dir++) { // Checks for a frame to the left of the current node. if (dir == 0) { start = arf_frame_stats[i].start; end = arf_frame_stats[i].display_index; } else { // Checks for a frame to the right of the current node. start = arf_frame_stats[i].display_index + 1; end = arf_frame_stats[i].end; } const int num_frames_to_process = end - start; // Checks if a frame can be coded as an INTNL_ARF_UPDATE frame. If // num_frames_to_process is less than 3, then there are not enough frames // between 'start' and 'end' to create another level. if (num_frames_to_process >= 3) { // Flag to indicate the lower layer depths for which parallel encoding // is enabled. Currently enabled for layer 4 frames.
int layer_with_parallel_encodes = layer_depth == 4; set_params_for_internal_arfs_in_gf14( gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind, count_arf_frames, doh_gf_index_map, start, end, layer_depth, layer_with_parallel_encodes); } } } } // Configures multi-layers of the GF_GROUP when consecutive encode of frames in // the same layer depth is enbaled. static inline void set_multi_layer_params_for_gf14( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, int *cur_frame_idx, int *frame_ind, int *count_arf_frames, int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index, int *cur_disp_index, int gf_interval, int layer_depth, int max_parallel_frames) { assert(layer_depth == 2); assert(gf_group->max_layer_depth_allowed >= 4); int layer, node_start, node_end = 0; // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only // for gf-interval 14. const int max_layer_depth = 4; // Iterate through each layer depth starting from 2 till 'max_layer_depth'. for (layer = layer_depth; layer <= max_layer_depth; layer++) { // 'node_start' and 'node_end' indicate the number of nodes from the // previous layer depth to be considered. It also corresponds to the indices // of arf_frame_stats. node_start = node_end; node_end = (*count_arf_frames); // 'num_dir' indicates the number of directions to traverse w.r.t. a given // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would // have only one frame and hence needs to traverse only in the left // direction w.r.t the node in the previous layer. int num_dir = layer == 2 ? 1 : 2; set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx, cur_disp_index, frame_ind, count_arf_frames, doh_gf_index_map, num_dir, node_start, node_end, layer); } for (int i = 1; i < gf_interval; i++) { // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an // LF_UPDATE frame. if (doh_gf_index_map[i] == INVALID_IDX) { // LF_UPDATE frames. // TODO(Remya): Correct start and end parameters passed to // set_params_for_leaf_frames() once encode reordering for gf-interval 14 // is enbaled for parallel encode of lower layer frames. set_params_for_leaf_frames( twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, 1, first_frame_index, cur_disp_index, layer, 0, 0); } else { // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get // the gf index of corresponding INTNL_ARF_UPDATE frames. int intnl_arf_index = doh_gf_index_map[i]; int ld = gf_group->layer_depth[intnl_arf_index]; set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind, first_frame_index, cur_disp_index, ld); } } } // Set parameters for frames between 'start' and 'end' (excluding both). 
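// This is the default recursion used when encode reordering for
// frame-parallel encoding is not in effect: it codes the internal ARF for
// the midpoint m = (start + end - 1) / 2, recurses over the frames displayed
// before m, codes the corresponding INTNL_OVERLAY_UPDATE, and then recurses
// over the frames displayed after m.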
static void set_multi_layer_params( const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc, RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end, int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, int max_parallel_frames, int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_idx, int layer_depth) { const int num_frames_to_process = end - start; // Either we are at the last level of the pyramid, or we don't have enough // frames between 'l' and 'r' to create one more level. if (layer_depth > gf_group->max_layer_depth_allowed || num_frames_to_process < 3) { // Leaf nodes. while (start < end) { gf_group->update_type[*frame_ind] = LF_UPDATE; gf_group->arf_src_offset[*frame_ind] = 0; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->display_idx[*frame_ind] = *cur_disp_idx; gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start, end - start, 0, NULL, NULL, 0); gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); // Set the level of parallelism for the LF_UPDATE frame. if (do_frame_parallel_encode) { set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], parallel_frame_count, max_parallel_frames); // Set LF_UPDATE frames as non-reference frames. gf_group->is_frame_non_ref[*frame_ind] = true; } set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); ++(*frame_ind); ++(*cur_frame_idx); ++(*cur_disp_idx); ++start; } } else { const int m = (start + end - 1) / 2; // Internal ARF. gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; gf_group->arf_src_offset[*frame_ind] = m - start; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->display_idx[*frame_ind] = *cur_disp_idx + gf_group->arf_src_offset[*frame_ind]; gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; if (do_frame_parallel_encode) { // If max_parallel_frames is not exceeded and if the frame will not be // temporally filtered, encode the next internal ARF frame in parallel. if (*parallel_frame_count > 1 && *parallel_frame_count <= max_parallel_frames) { if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) gf_group->frame_parallel_level[*frame_ind] = 2; *parallel_frame_count = 1; } } set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); // Get the boost factor for intermediate ARF frames. gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m, m - start, NULL, NULL, 0); ++(*frame_ind); // Frames displayed before this internal ARF. set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, cur_disp_idx, layer_depth + 1); // Overlay for internal ARF. 
gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; gf_group->arf_src_offset[*frame_ind] = 0; gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->display_idx[*frame_ind] = *cur_disp_idx; gf_group->arf_boost[*frame_ind] = 0; gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); ++(*frame_ind); ++(*cur_frame_idx); ++(*cur_disp_idx); // Frames displayed after this internal ARF. set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end, cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, first_frame_index, cur_disp_idx, layer_depth + 1); } } static int construct_multi_layer_gf_structure( AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval, FRAME_UPDATE_TYPE first_frame_update_type) { PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // TODO(angiebird): Why do we need "-1" here? const int gf_interval = baseline_gf_interval - 1; int frame_index = 0; int cur_frame_index = 0; // Set the display order hint for the first frame in the GF_GROUP. int cur_disp_index = (first_frame_update_type == KF_UPDATE) ? 0 : cpi->common.current_frame.frame_number; // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref, // gf_group->src_offset and gf_group->is_frame_dropped with 0. memset(gf_group->frame_parallel_level, 0, sizeof(gf_group->frame_parallel_level)); memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref)); memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset)); memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped)); // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref // with INVALID_IDX. memset(gf_group->skip_frame_refresh, INVALID_IDX, sizeof(gf_group->skip_frame_refresh)); memset(gf_group->skip_frame_as_ref, INVALID_IDX, sizeof(gf_group->skip_frame_as_ref)); int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1; // This is a patch that fixes https://crbug.com/aomedia/3163 // enable_keyframe_filtering > 1 will introduce an extra overlay frame at // key frame location. However when // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't // afford to have an extra overlay frame. Otherwise, the gf_group->size will // become MAX_STATIC_GF_GROUP_LENGTH + 1, which causes memory error. // A cheap solution is to turn of kf_decomp here. // TODO(angiebird): Find a systematic way to solve this issue. if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) { kf_decomp = 0; } if (first_frame_update_type == KF_UPDATE) { gf_group->update_type[frame_index] = kf_decomp ? 
ARF_UPDATE : KF_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 0; gf_group->frame_type[frame_index] = KEY_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_RESET; gf_group->max_layer_depth = 0; gf_group->display_idx[frame_index] = cur_disp_index; if (!kf_decomp) cur_disp_index++; ++frame_index; if (kf_decomp) { gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 0; gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = 0; gf_group->display_idx[frame_index] = cur_disp_index; cur_disp_index++; ++frame_index; } cur_frame_index++; } if (first_frame_update_type == GF_UPDATE) { gf_group->update_type[frame_index] = GF_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 0; gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = 0; gf_group->display_idx[frame_index] = cur_disp_index; cur_disp_index++; ++frame_index; ++cur_frame_index; } // ALTREF. const int use_altref = gf_group->max_layer_depth_allowed > 0; int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval; if (use_altref) { gf_group->update_type[frame_index] = ARF_UPDATE; gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 1; gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost; gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = 1; gf_group->arf_index = frame_index; gf_group->display_idx[frame_index] = cur_disp_index + gf_group->arf_src_offset[frame_index]; ++frame_index; } else { gf_group->arf_index = -1; } // Flag to indicate if multi-layer configuration is complete. int is_multi_layer_configured = 0; // Running count of no. of frames that is part of a given parallel // encode set in a gf_group. Value of 1 indicates no parallel encode. int parallel_frame_count = 1; // Enable parallel encode of frames if gf_group has a multi-layer pyramid // structure with minimum 4 layers. int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref && gf_group->max_layer_depth_allowed >= 4); int first_frame_index = cur_frame_index; if (do_frame_parallel_encode) { // construct_multi_layer_gf_structure() takes the input parameter // 'gf_interval' as p_rc->baseline_gf_interval - 1 . Below code computes the // actual GF_GROUP length by compensating for this offset. int actual_gf_length = ((first_frame_update_type == KF_UPDATE) || (first_frame_update_type == GF_UPDATE)) ? gf_interval : gf_interval + 1; // In order to facilitate parallel encoding of frames in lower layer depths, // encode reordering is done. Currently encode reordering is enabled only // for gf-intervals 16 and 32. NOTE: Since the buffer holding the // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a // limitation on the number of hidden frames possible at any given point and // hence the reordering is enabled only for gf-intervals 16 and 32. 
// Disabling encode reordering for gf-interval 14 since some cross-frame // dependencies related to temporal filtering for FPMT is currently not // handled. int disable_gf14_reorder = 1; if (actual_gf_length == 14 && !disable_gf14_reorder) { // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot // corresponding to their display order hint. This is used while // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames. int doh_gf_index_map[FIXED_GF_INTERVAL]; // Initialize doh_gf_index_map with INVALID_IDX. memset(&doh_gf_index_map[0], INVALID_IDX, (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL)); FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1]; // Store the stats corresponding to layer 1 frame. fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1, actual_gf_length); int count_arf_frames = 1; // Sets multi-layer params for gf-interval 14 to consecutively encode // frames in the same layer depth, i.e., encode order would be 0-> 14-> // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13. // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames. set_multi_layer_params_for_gf14( twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group, arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames, doh_gf_index_map, ¶llel_frame_count, &first_frame_index, &cur_disp_index, actual_gf_length, use_altref + 1, cpi->ppi->num_fp_contexts); // Set gf_group->skip_frame_refresh. for (int i = 0; i < actual_gf_length; i++) { int count = 0; if (gf_group->update_type[i] == INTNL_ARF_UPDATE) { for (int j = 0; j < i; j++) { // Store the display order hint of the frames which would not // have been displayed at the encode call of frame 'i'. if ((gf_group->display_idx[j] < gf_group->display_idx[i]) && gf_group->update_type[j] == INTNL_ARF_UPDATE) { gf_group->skip_frame_refresh[i][count++] = gf_group->display_idx[j]; } } } } } else { // Set layer depth threshold for reordering as per the gf length. int depth_thr = (actual_gf_length == 16) ? 3 : (actual_gf_length == 32) ? 4 : INT_MAX; set_multi_layer_params_for_fp( twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info, cur_frame_index, gf_interval, &cur_frame_index, &frame_index, ¶llel_frame_count, cpi->ppi->num_fp_contexts, do_frame_parallel_encode, &first_frame_index, depth_thr, &cur_disp_index, use_altref + 1); } is_multi_layer_configured = 1; } // Rest of the frames. if (!is_multi_layer_configured) set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info, cur_frame_index, gf_interval, &cur_frame_index, &frame_index, ¶llel_frame_count, cpi->ppi->num_fp_contexts, do_frame_parallel_encode, &first_frame_index, &cur_disp_index, use_altref + 1); if (use_altref) { gf_group->update_type[frame_index] = OVERLAY_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; gf_group->arf_boost[frame_index] = NORMAL_BOOST; gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = is_fwd_kf ? 
REFBUF_RESET : REFBUF_UPDATE; gf_group->display_idx[frame_index] = cur_disp_index; ++frame_index; } else { for (; cur_frame_index <= gf_interval; ++cur_frame_index) { gf_group->update_type[frame_index] = LF_UPDATE; gf_group->arf_src_offset[frame_index] = 0; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS; gf_group->arf_boost[frame_index] = NORMAL_BOOST; gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2); set_src_offset(gf_group, &first_frame_index, cur_frame_index, frame_index); gf_group->display_idx[frame_index] = cur_disp_index; cur_disp_index++; ++frame_index; } } if (do_frame_parallel_encode) { // Iterate through the gf_group and reset frame_parallel_level to 0 in case // a frame is marked as frame_parallel_level 1 with no subsequent // frame_parallel_level 2 frame(s). int level1_frame_idx = INT_MAX; int level2_frame_count = 0; for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) { if (gf_group->frame_parallel_level[frame_idx] == 1) { // Set frame_parallel_level to 0 if only one frame is present in a // parallel encode set. if (level1_frame_idx != INT_MAX && !level2_frame_count) gf_group->frame_parallel_level[level1_frame_idx] = 0; // Book-keep frame_idx of frame_parallel_level 1 frame and reset the // count of frame_parallel_level 2 frames in the corresponding parallel // encode set. level1_frame_idx = frame_idx; level2_frame_count = 0; } if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++; } // If frame_parallel_level is set to 1 for the last LF_UPDATE // frame in the gf_group, reset it to zero since there are no subsequent // frames in the gf_group. 
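// Note: do_frame_parallel_encode implies use_altref, so the entry at
// frame_index - 1 is the trailing OVERLAY_UPDATE and frame_index - 2 is the
// last frame emitted by the multi-layer setup above.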
if (gf_group->frame_parallel_level[frame_index - 2] == 1) { assert(gf_group->update_type[frame_index - 2] == LF_UPDATE); gf_group->frame_parallel_level[frame_index - 2] = 0; } } for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH; ++gf_idx) { gf_group->update_type[gf_idx] = LF_UPDATE; gf_group->arf_src_offset[gf_idx] = 0; gf_group->cur_frame_idx[gf_idx] = gf_idx; gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS; gf_group->arf_boost[gf_idx] = NORMAL_BOOST; gf_group->frame_type[gf_idx] = INTER_FRAME; gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2); } return frame_index; } static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) { int log_gop_length = 0; while ((1 << log_gop_length) < gop_length) { ++log_gop_length; } for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) { int count = 0; // Find the trailing zeros for (; count < MAX_ARF_LAYERS; ++count) { if ((gf_index >> count) & 0x01) break; } gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0); } gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS); } void av1_gop_setup_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; GF_GROUP *const gf_group = &cpi->ppi->gf_group; TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; const int key_frame = rc->frames_since_key == 0; FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE; if (key_frame) { first_frame_update_type = KF_UPDATE; if (cpi->oxcf.kf_max_pyr_height != -1) { gf_group->max_layer_depth_allowed = AOMMIN( cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed); } } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) { first_frame_update_type = GF_UPDATE; } gf_group->size = construct_multi_layer_gf_structure( cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval, first_frame_update_type); if (gf_group->max_layer_depth_allowed == 0) set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval); } int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group, int gf_frame_index) { return gf_group->frame_type[gf_frame_index] == KEY_FRAME && gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE; } int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) { const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index]; // TODO(angiebird): when gf_group->size == 32, it's possble to // have "two" second arf. Check if this is acceptable. if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE && arf_src_offset >= TF_LOOKAHEAD_IDX_THR) { return 1; } return 0; } aom-3.12.1/av1/encoder/gop_structure.h000066400000000000000000000042361477627663500175720ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_ #define AOM_AV1_ENCODER_GOP_STRUCTURE_H_ #include "av1/common/av1_common_int.h" #include "av1/encoder/ratectrl.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ struct AV1_COMP; struct EncodeFrameParams; #define MIN_ARF_GF_BOOST 240 #define NORMAL_BOOST 100 /*!\endcond */ /*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP. * *\ingroup rate_control * * This function defines the Group-Of-Pictures structure for this GF_GROUP. * This involves deciding where to place the various FRAME_UPDATE_TYPEs in * the group. It does this primarily by updateing entries in * cpi->twopass.gf_group.update_type[]. * * \param[in] cpi Top - level encoder instance structure * * \remark No return value but this function updates group data structures. */ void av1_gop_setup_structure(struct AV1_COMP *cpi); /*!\brief Check whether a frame in the GOP is a forward key frame * *\ingroup rate_control * * \param[in] gf_group GF/ARF group data structure * \param[in] gf_frame_index GOP index * * \return Return 1 if it is a forward key frame, otherwise return 0 */ int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group, int gf_frame_index); /*!\brief Check whether a frame in the GOP is the second arf * *\ingroup rate_control * * \param[in] gf_group GF/ARF group data structure * \param[in] gf_frame_index GOP index * * \return Return 1 if it is the second arf */ int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_ aom-3.12.1/av1/encoder/grain_test_vectors.h000066400000000000000000000516311477627663500205720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ #define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ /* Test vectors for emulation of different film grain types. * Note that bit depth would be derived from the bitstream and * not signaled in film grain metadata. The parameters are valid * for any bit depth. 
*/ #if !CONFIG_REALTIME_ONLY static aom_film_grain_t film_grain_test_vectors[16] = { /* Test 1 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 16, 0 }, { 25, 136 }, { 33, 144 }, { 41, 160 }, { 48, 168 }, { 56, 136 }, { 67, 128 }, { 82, 144 }, { 97, 152 }, { 113, 144 }, { 128, 176 }, { 143, 168 }, { 158, 176 }, { 178, 184 } }, 14 /* num_points_y */, { { 16, 0 }, { 20, 64 }, { 28, 88 }, { 60, 104 }, { 90, 136 }, { 105, 160 }, { 134, 168 }, { 168, 208 } }, 8 /* num_cb_points */, { { 16, 0 }, { 28, 96 }, { 56, 80 }, { 66, 96 }, { 80, 104 }, { 108, 96 }, { 122, 112 }, { 137, 112 }, { 169, 176 } }, 9 /* num_cr_points */, 11 /* scaling_shift */, 2 /* ar_coeff_lag */, { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, 8 /* ar_coeff_shift */, 247 /* cb_mult */, 192 /* cb_luma_mult */, 18 /* cb_offset */, 229 /* cr_mult */, 192 /* cr_luma_mult */, 54 /* cr_offset */, 0 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /* chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 2 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 96 }, { 255, 96 } }, 2 /* num_points_y */, { { 0, 64 }, { 255, 64 } }, 2 /* num_cb_points */, { { 0, 64 }, { 255, 64 } }, 2 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 3 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 192 }, { 255, 192 } }, 2 /* num_points_y */, { { 0, 128 }, { 255, 128 } }, 2 /* num_cb_points */, { { 0, 128 }, { 255, 128 } }, 2 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, }, { 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 1 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 4 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 16, 0 }, { 24, 137 }, { 53, 146 }, { 63, 155 }, { 78, 155 }, { 107, 150 }, { 122, 147 }, { 136, 147 }, { 166, 153 }, }, 9 /* num_points_y */, { { 16, 0 }, { 20, 72 }, { 27, 82 }, { 33, 91 }, { 69, 121 }, { 95, 143 }, { 108, 154 }, { 134, 169 }, { 147, 177 }, }, 9 /* num_cb_points */, { { 16, 0 }, { 24, 95 }, { 54, 93 }, { 65, 94 }, { 79, 98 }, { 109, 107 }, { 124, 119 }, { 139, 136 }, { 169, 170 }, }, 9 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, }, { -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, 
-5, -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, }, { 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 5 */ { 1 /* apply_grain */, 0 /* update_parameters */, { { 0, 64 }, { 255, 64 } }, 2 /* num_points_y */, { { 0, 96 }, { 32, 90 }, { 64, 83 }, { 96, 76 }, { 128, 68 }, { 159, 59 }, { 191, 48 }, { 223, 34 }, { 255, 0 }, }, 9 /* num_cb_points */, { { 0, 0 }, { 32, 34 }, { 64, 48 }, { 96, 59 }, { 128, 68 }, { 159, 76 }, { 191, 83 }, { 223, 90 }, { 255, 96 }, }, 9 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, }, { -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 1063 /* random_seed */ }, /* Test 6 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 96 }, { 20, 92 }, { 39, 88 }, { 59, 84 }, { 78, 80 }, { 98, 75 }, { 118, 70 }, { 137, 65 }, { 157, 60 }, { 177, 53 }, { 196, 46 }, { 216, 38 }, { 235, 27 }, { 255, 0 }, }, 14 /* num_points_y */, { { 0, 0 } }, 0 /* num_cb_points */, { { 0, 0 } }, 0 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 2754 /* random_seed */ }, /* Test 7 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 0 }, { 20, 27 }, { 39, 38 }, { 59, 46 }, { 78, 53 }, { 98, 60 }, { 118, 65 }, { 137, 70 }, { 157, 75 }, { 177, 80 }, { 196, 84 }, { 216, 88 }, { 235, 92 }, { 255, 96 }, }, 14 /* num_points_y */, { { 0, 0 }, { 255, 0 } }, 2 /* num_cb_points */, { { 0, 0 }, { 255, 0 } }, 2 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 8 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 96 }, { 255, 96 } }, 2 /* num_points_y */, { { 0, 
62 }, { 255, 62 } }, 2 /* num_cb_points */, { { 0, 62 }, { 255, 62 } }, 2 /* num_cr_points */, 11 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, }, { 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 9 */ { 1 /* apply_grain */, 0 /* update_parameters */, { { 0, 48 }, { 255, 48 } }, 2 /* num_points_y */, { { 0, 32 }, { 255, 32 } }, 2 /* num_cb_points */, { { 0, 32 }, { 255, 32 } }, 2 /* num_cr_points */, 10 /* scaling_shift */, 2 /* ar_coeff_lag */, { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 10 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 48 }, { 255, 48 } }, 2 /* num_points_y */, { { 0, 32 }, { 255, 32 } }, 2 /* num_cb_points */, { { 0, 32 }, { 255, 32 } }, 2 /* num_cr_points */, 10 /* scaling_shift */, 2 /* ar_coeff_lag */, { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 11 */ { 1 /* apply_grain */, 0 /* update_parameters */, { { 0, 32 }, { 255, 32 } }, 2 /* num_points_y */, { { 0, 48 }, { 32, 45 }, { 64, 42 }, { 96, 38 }, { 128, 34 }, { 159, 29 }, { 191, 24 }, { 223, 17 }, { 255, 0 }, }, 9 /* num_cb_points */, { { 0, 0 }, { 32, 17 }, { 64, 24 }, { 96, 29 }, { 128, 34 }, { 159, 38 }, { 191, 42 }, { 223, 45 }, { 255, 48 }, }, 9 /* num_cr_points */, 10 /* scaling_shift */, 3 /* ar_coeff_lag */, { 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, }, { -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, }, { 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 1357 /* random_seed */ }, /* Test 12 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 16, 0 }, { 24, 49 }, { 39, 69 }, { 46, 84 }, { 53, 91 }, { 63, 100 }, { 78, 114 }, { 92, 134 }, { 164, 139 }, }, 9 /* num_points_y */, { { 16, 0 }, { 20, 31 }, { 26, 42 }, { 33, 54 }, { 40, 65 }, { 
47, 72 }, { 56, 85 }, { 84, 123 }, { 152, 157 }, }, 9 /* num_cb_points */, { { 16, 0 }, { 25, 14 }, { 39, 33 }, { 47, 40 }, { 54, 47 }, { 64, 62 }, { 79, 76 }, { 94, 83 }, { 167, 101 }, }, 9 /* num_cr_points */, 10 /* scaling_shift */, 2 /* ar_coeff_lag */, { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 0 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 13 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 48 }, { 20, 46 }, { 39, 44 }, { 59, 42 }, { 78, 40 }, { 98, 38 }, { 118, 35 }, { 137, 33 }, { 157, 30 }, { 177, 27 }, { 196, 23 }, { 216, 19 }, { 235, 13 }, { 255, 0 }, }, 14 /* num_points_y */, { { 0, 0 }, { 255, 0 } }, 0 /* num_cb_points */, { { 0, 0 }, { 255, 0 } }, 0 /* num_cr_points */, 10 /* scaling_shift */, 2 /* ar_coeff_lag */, { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 14 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 0 }, { 20, 13 }, { 39, 19 }, { 59, 23 }, { 78, 27 }, { 98, 30 }, { 118, 33 }, { 137, 35 }, { 157, 38 }, { 177, 40 }, { 196, 42 }, { 216, 44 }, { 235, 46 }, { 255, 48 }, }, 14 /* num_points_y */, { { 0, 0 }, { 255, 0 } }, 0 /* num_cb_points */, { { 0, 0 }, { 255, 0 } }, 0 /* num_cr_points */, 10 /* scaling_shift */, 2 /* ar_coeff_lag */, { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 8 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 1 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 15 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 0, 96 }, { 255, 96 } }, 1 /* num_points_y */, { { 0, 96 }, { 255, 96 } }, 0 /* num_cb_points */, { { 0, 96 }, { 255, 96 } }, 0 /* num_cr_points */, 11 /* scaling_shift */, 2 /* ar_coeff_lag */, { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 1 /*chroma_scaling_from_luma*/, 0 /* grain_scale_shift*/, 45231 /* random_seed */ }, /* Test 16 */ { 1 /* apply_grain */, 1 /* update_parameters */, { { 16, 0 }, { 58, 126 }, { 87, 120 }, { 97, 122 }, { 112, 125 }, { 126, 131 }, { 141, 139 }, { 199, 153 }, }, 8 /* num_points_y */, { { 16, 0 }, { 59, 68 }, { 66, 76 }, { 73, 82 }, { 79, 85 }, { 86, 86 }, { 151, 95 }, { 192, 101 }, }, 8 /* num_cb_points */, { { 16, 0 }, { 59, 64 }, { 89, 
80 }, { 99, 86 }, { 114, 90 }, { 129, 93 }, { 144, 97 }, { 203, 85 }, }, 8 /* num_cr_points */, 10 /* scaling_shift */, 3 /* ar_coeff_lag */, { 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, }, { 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, }, { 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, }, 7 /* ar_coeff_shift */, 128 /* cb_mult */, 192 /* cb_luma_mult */, 256 /* cb_offset */, 128 /* cr_mult */, 192 /* cr_luma_mult */, 256 /* cr_offset */, 1 /* overlap_flag */, 0 /* clip_to_restricted_range */, 8 /* bit_depth */, 0 /*chroma_scaling_from_luma*/, 2 /* grain_scale_shift*/, 45231 /* random_seed */ }, }; #endif // !CONFIG_REALTIME_ONLY #endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ aom-3.12.1/av1/encoder/hash.c000066400000000000000000000106501477627663500156000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/hash.h" #include "config/av1_rtcd.h" static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, uint8_t *pData, uint32_t dataLength) { for (uint32_t i = 0; i < dataLength; i++) { const uint8_t index = (uint8_t)((p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ pData[i]); p_crc_calculator->remainder <<= 8; p_crc_calculator->remainder ^= p_crc_calculator->table[index]; } } static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { p_crc_calculator->remainder = 0; } static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) { return p_crc_calculator->remainder & p_crc_calculator->final_result_mask; } static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) { const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1); const uint32_t byte_high_bit = 1 << (8 - 1); for (uint32_t value = 0; value < 256; value++) { uint32_t remainder = 0; for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) { if (value & mask) { remainder ^= high_bit; } if (remainder & high_bit) { remainder <<= 1; remainder ^= p_crc_calculator->trunc_poly; } else { remainder <<= 1; } } p_crc_calculator->table[value] = remainder; } } void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, uint32_t truncPoly) { p_crc_calculator->remainder = 0; p_crc_calculator->bits = bits; p_crc_calculator->trunc_poly = truncPoly; p_crc_calculator->final_result_mask = (1 << bits) - 1; crc_calculator_init_table(p_crc_calculator); } uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, int length) { crc_calculator_reset(p_crc_calculator); crc_calculator_process_data(p_crc_calculator, p, length); return crc_calculator_get_crc(p_crc_calculator); } /* CRC-32C (iSCSI) polynomial in reversed bit order. */ #define POLY 0x82f63b78 /* Construct table for software CRC-32C calculation. */ void av1_crc32c_calculator_init(CRC32C *p_crc32c) { uint32_t crc; for (int n = 0; n < 256; n++) { crc = n; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? 
(crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; p_crc32c->table[0][n] = crc; } for (int n = 0; n < 256; n++) { crc = p_crc32c->table[0][n]; for (int k = 1; k < 8; k++) { crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); p_crc32c->table[k][n] = crc; } } } /* Table-driven software version as a fall-back. This is about 15 times slower than using the hardware instructions. This assumes little-endian integers, as is the case on Intel processors that the assembler code here is for. */ uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) { const uint8_t *next = (const uint8_t *)(buf); uint64_t crc; CRC32C *p = (CRC32C *)c; crc = 0 ^ 0xffffffff; while (len && ((uintptr_t)next & 7) != 0) { crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); len--; } while (len >= 8) { crc ^= *(uint64_t *)next; crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; next += 8; len -= 8; } while (len) { crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); len--; } return (uint32_t)crc ^ 0xffffffff; } aom-3.12.1/av1/encoder/hash.h000066400000000000000000000030531477627663500156040ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_HASH_H_ #define AOM_AV1_ENCODER_HASH_H_ #include "config/aom_config.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif typedef struct _crc_calculator { uint32_t remainder; uint32_t trunc_poly; uint32_t bits; uint32_t table[256]; uint32_t final_result_mask; } CRC_CALCULATOR; // Initialize the crc calculator. It must be executed at least once before // calling av1_get_crc_value(). void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, uint32_t truncPoly); uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, int length); // CRC32C: POLY = 0x82f63b78; typedef struct _CRC32C { /* Table for a quadword-at-a-time software crc. */ uint32_t table[8][256]; } CRC32C; // init table for software version crc32c void av1_crc32c_calculator_init(CRC32C *p_crc32c); #define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_HASH_H_ aom-3.12.1/av1/encoder/hash_motion.c000066400000000000000000000416301477627663500171670ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include "config/av1_rtcd.h" #include "av1/encoder/block.h" #include "av1/encoder/hash.h" #include "av1/encoder/hash_motion.h" #define kSrcBits 16 #define kBlockSizeBits 3 #define kMaxAddr (1 << (kSrcBits + kBlockSizeBits)) // TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported? // If yes, fix this function static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src, int stride, uint8_t *p_pixels_in1D) { const uint8_t *p_pel = y_src; int index = 0; for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { p_pixels_in1D[index++] = p_pel[j]; } p_pel += stride; } } static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src, int stride, uint16_t *p_pixels_in1D) { const uint16_t *p_pel = y_src; int index = 0; for (int i = 0; i < 2; i++) { for (int j = 0; j < 2; j++) { p_pixels_in1D[index++] = p_pel[j]; } p_pel += stride; } } static int is_block_2x2_row_same_value(const uint8_t *p) { if (p[0] != p[1] || p[2] != p[3]) { return 0; } return 1; } static int is_block16_2x2_row_same_value(const uint16_t *p) { if (p[0] != p[1] || p[2] != p[3]) { return 0; } return 1; } static int is_block_2x2_col_same_value(const uint8_t *p) { if ((p[0] != p[2]) || (p[1] != p[3])) { return 0; } return 1; } static int is_block16_2x2_col_same_value(const uint16_t *p) { if ((p[0] != p[2]) || (p[1] != p[3])) { return 0; } return 1; } // The hash value (hash_value1) consists of two parts: the first 3 bits relate // to the block size and the remaining 16 bits are the crc values. This // function is used to get the first 3 bits. static int hash_block_size_to_index(int block_size) { switch (block_size) { case 4: return 0; case 8: return 1; case 16: return 2; case 32: return 3; case 64: return 4; case 128: return 5; default: return -1; } } void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) { if (!intrabc_hash_info->g_crc_initialized) { av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB); av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB); intrabc_hash_info->g_crc_initialized = 1; } intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL; } static void clear_all(hash_table *p_hash_table) { if (p_hash_table->p_lookup_table == NULL) { return; } for (int i = 0; i < kMaxAddr; i++) { if (p_hash_table->p_lookup_table[i] != NULL) { aom_vector_destroy(p_hash_table->p_lookup_table[i]); aom_free(p_hash_table->p_lookup_table[i]); p_hash_table->p_lookup_table[i] = NULL; } } } void av1_hash_table_destroy(hash_table *p_hash_table) { clear_all(p_hash_table); aom_free(p_hash_table->p_lookup_table); p_hash_table->p_lookup_table = NULL; } bool av1_hash_table_create(hash_table *p_hash_table) { if (p_hash_table->p_lookup_table != NULL) { clear_all(p_hash_table); return true; } p_hash_table->p_lookup_table = (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0])); if (!p_hash_table->p_lookup_table) return false; return true; } static bool hash_table_add_to_table(hash_table *p_hash_table, uint32_t hash_value, block_hash *curr_block_hash) { if (p_hash_table->p_lookup_table[hash_value] == NULL) { p_hash_table->p_lookup_table[hash_value] = aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0])); if (p_hash_table->p_lookup_table[hash_value] == NULL) { return false; } if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
sizeof(curr_block_hash[0])) == VECTOR_ERROR) return false; if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash) == VECTOR_ERROR) return false; } else { if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], curr_block_hash) == VECTOR_ERROR) return false; } return true; } int32_t av1_hash_table_count(const hash_table *p_hash_table, uint32_t hash_value) { if (p_hash_table->p_lookup_table[hash_value] == NULL) { return 0; } else { return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); } } Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, uint32_t hash_value) { assert(av1_hash_table_count(p_hash_table, hash_value) > 0); return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); } void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info, const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], int8_t *pic_block_same_info[3]) { const int width = 2; const int height = 2; const int x_end = picture->y_crop_width - width + 1; const int y_end = picture->y_crop_height - height + 1; CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; const int length = width * 2; if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t p[4]; int pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { get_pixels_in_1D_short_array_by_block_2x2( CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + x_pos, picture->y_stride, p); pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); pic_block_hash[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0])); pic_block_hash[1][pos] = av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0])); pos++; } pos += width - 1; } } else { uint8_t p[4]; int pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { get_pixels_in_1D_char_array_by_block_2x2( picture->y_buffer + y_pos * picture->y_stride + x_pos, picture->y_stride, p); pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); pic_block_hash[0][pos] = av1_get_crc_value(calc_1, p, length * sizeof(p[0])); pic_block_hash[1][pos] = av1_get_crc_value(calc_2, p, length * sizeof(p[0])); pos++; } pos += width - 1; } } } void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, const YV12_BUFFER_CONFIG *picture, int block_size, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], int8_t *dst_pic_block_same_info[3]) { CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; const int pic_width = picture->y_crop_width; const int x_end = picture->y_crop_width - block_size + 1; const int y_end = picture->y_crop_height - block_size + 1; const int src_size = block_size >> 1; const int quad_size = block_size >> 2; uint32_t p[4]; const int length = sizeof(p); int pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { p[0] = src_pic_block_hash[0][pos]; p[1] = src_pic_block_hash[0][pos + src_size]; p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; dst_pic_block_hash[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)p, length); p[0] = src_pic_block_hash[1][pos]; p[1] = 
src_pic_block_hash[1][pos + src_size]; p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; dst_pic_block_hash[1][pos] = av1_get_crc_value(calc_2, (uint8_t *)p, length); dst_pic_block_same_info[0][pos] = src_pic_block_same_info[0][pos] && src_pic_block_same_info[0][pos + quad_size] && src_pic_block_same_info[0][pos + src_size] && src_pic_block_same_info[0][pos + src_size * pic_width] && src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] && src_pic_block_same_info[0][pos + src_size * pic_width + src_size]; dst_pic_block_same_info[1][pos] = src_pic_block_same_info[1][pos] && src_pic_block_same_info[1][pos + src_size] && src_pic_block_same_info[1][pos + quad_size * pic_width] && src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] && src_pic_block_same_info[1][pos + src_size * pic_width] && src_pic_block_same_info[1][pos + src_size * pic_width + src_size]; pos++; } pos += block_size - 1; } if (block_size >= 4) { const int size_minus_1 = block_size - 1; pos = 0; for (int y_pos = 0; y_pos < y_end; y_pos++) { for (int x_pos = 0; x_pos < x_end; x_pos++) { dst_pic_block_same_info[2][pos] = (!dst_pic_block_same_info[0][pos] && !dst_pic_block_same_info[1][pos]) || (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); pos++; } pos += block_size - 1; } } } bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, uint32_t *pic_hash[2], int8_t *pic_is_same, int pic_width, int pic_height, int block_size) { const int x_end = pic_width - block_size + 1; const int y_end = pic_height - block_size + 1; const int8_t *src_is_added = pic_is_same; const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] }; int add_value = hash_block_size_to_index(block_size); assert(add_value >= 0); add_value <<= kSrcBits; const int crc_mask = (1 << kSrcBits) - 1; for (int x_pos = 0; x_pos < x_end; x_pos++) { for (int y_pos = 0; y_pos < y_end; y_pos++) { const int pos = y_pos * pic_width + x_pos; // valid data if (src_is_added[pos]) { block_hash curr_block_hash; curr_block_hash.x = x_pos; curr_block_hash.y = y_pos; const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value; curr_block_hash.hash_value2 = src_hash[1][pos]; if (!hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash)) { return false; } } } } return true; } int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start) { const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + x_start; if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); for (int i = 0; i < block_size; i++) { for (int j = 1; j < block_size; j++) { if (p16[j] != p16[0]) { return 0; } } p16 += stride; } } else { for (int i = 0; i < block_size; i++) { for (int j = 1; j < block_size; j++) { if (p[j] != p[0]) { return 0; } } p += stride; } } return 1; } int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start) { const int stride = picture->y_stride; const uint8_t *p = picture->y_buffer + y_start * stride + x_start; if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); for (int i = 0; i < block_size; i++) { for (int j = 1; j < block_size; j++) { if (p16[j * stride + i] != p16[i]) { return 0; } } } } else { for (int i = 0; i < block_size; i++) { for (int j = 1; j < block_size; j++) { if (p[j * stride + i] != p[i]) { return 0; } } } } 
return 1; } void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, const uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, int use_highbitdepth) { int add_value = hash_block_size_to_index(block_size); assert(add_value >= 0); add_value <<= kSrcBits; const int crc_mask = (1 << kSrcBits) - 1; CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0]; uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1]; // 2x2 subblock hash values in current CU int sub_block_in_width = (block_size >> 1); if (use_highbitdepth) { uint16_t pixel_to_hash[4]; uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); for (int y_pos = 0; y_pos < block_size; y_pos += 2) { for (int x_pos = 0; x_pos < block_size; x_pos += 2) { int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); get_pixels_in_1D_short_array_by_block_2x2( y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash, sizeof(pixel_to_hash)); } } } else { uint8_t pixel_to_hash[4]; for (int y_pos = 0; y_pos < block_size; y_pos += 2) { for (int x_pos = 0; x_pos < block_size; x_pos += 2) { int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, stride, pixel_to_hash); assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); buf_1[0][pos] = av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash)); buf_2[0][pos] = av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash)); } } } int src_sub_block_in_width = sub_block_in_width; sub_block_in_width >>= 1; int src_idx = 1; int dst_idx = 0; // 4x4 subblock hash values to current block hash values uint32_t to_hash[4]; for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) { src_idx = 1 - src_idx; dst_idx = 1 - dst_idx; int dst_pos = 0; for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); assert(srcPos + src_sub_block_in_width + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); to_hash[0] = buf_1[src_idx][srcPos]; to_hash[1] = buf_1[src_idx][srcPos + 1]; to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width]; to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1]; buf_1[dst_idx][dst_pos] = av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash)); to_hash[0] = buf_2[src_idx][srcPos]; to_hash[1] = buf_2[src_idx][srcPos + 1]; to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width]; to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1]; buf_2[dst_idx][dst_pos] = av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash)); dst_pos++; } } src_sub_block_in_width = sub_block_in_width; sub_block_in_width >>= 1; } *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value; *hash_value2 = buf_2[dst_idx][0]; } aom-3.12.1/av1/encoder/hash_motion.h000066400000000000000000000100051477627663500171640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ #define AOM_AV1_ENCODER_HASH_MOTION_H_ #include #include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_scale/yv12config.h" #include "av1/encoder/hash.h" #include "third_party/vector/vector.h" #ifdef __cplusplus extern "C" { #endif // Block size used for force_integer_mv decisions #define FORCE_INT_MV_DECISION_BLOCK_SIZE 8 // store a block's hash info. // x and y are the position from the top left of the picture // hash_value2 is used to store the second hash value typedef struct _block_hash { int16_t x; int16_t y; uint32_t hash_value2; } block_hash; typedef struct _hash_table { Vector **p_lookup_table; } hash_table; struct intrabc_hash_info; typedef struct intrabc_hash_info { // buffer for hash value calculation of a block // used only in av1_get_block_hash_value() // [first hash/second hash] // [two buffers used ping-pong] uint32_t *hash_value_buffer[2][2]; hash_table intrabc_hash_table; CRC_CALCULATOR crc_calculator1; CRC_CALCULATOR crc_calculator2; int g_crc_initialized; } IntraBCHashInfo; void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info); void av1_hash_table_destroy(hash_table *p_hash_table); bool av1_hash_table_create(hash_table *p_hash_table); int32_t av1_hash_table_count(const hash_table *p_hash_table, uint32_t hash_value); Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, uint32_t hash_value); void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info, const YV12_BUFFER_CONFIG *picture, uint32_t *pic_block_hash[2], int8_t *pic_block_same_info[3]); void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info, const YV12_BUFFER_CONFIG *picture, int block_size, uint32_t *src_pic_block_hash[2], uint32_t *dst_pic_block_hash[2], int8_t *src_pic_block_same_info[3], int8_t *dst_pic_block_same_info[3]); bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, uint32_t *pic_hash[2], int8_t *pic_is_same, int pic_width, int pic_height, int block_size); // check whether the block starts from (x_start, y_start) with the size of // block_size x block_size has the same color in all rows int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); // check whether the block starts from (x_start, y_start) with the size of // block_size x block_size has the same color in all columns int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, int block_size, int x_start, int y_start); void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, const uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, int use_highbitdepth); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_HASH_MOTION_H_ aom-3.12.1/av1/encoder/hybrid_fwd_txfm.c000066400000000000000000000322621477627663500200370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "config/aom_dsp_rtcd.h" #include "av1/common/idct.h" #include "av1/common/blockd.h" #include "av1/encoder/hybrid_fwd_txfm.h" /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. Shared for both high and low bit depth. */ void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { int i; tran_high_t a1, b1, c1, d1, e1; const int16_t *ip_pass0 = input; const tran_low_t *ip = NULL; tran_low_t *op = output; for (i = 0; i < 4; i++) { a1 = ip_pass0[0 * stride]; b1 = ip_pass0[1 * stride]; c1 = ip_pass0[2 * stride]; d1 = ip_pass0[3 * stride]; a1 += b1; d1 = d1 - c1; e1 = (a1 - d1) >> 1; b1 = e1 - b1; c1 = e1 - c1; a1 -= c1; d1 += b1; op[0] = (tran_low_t)a1; op[1] = (tran_low_t)c1; op[2] = (tran_low_t)d1; op[3] = (tran_low_t)b1; ip_pass0++; op += 4; } ip = output; op = output; for (i = 0; i < 4; i++) { a1 = ip[4 * 0]; b1 = ip[4 * 1]; c1 = ip[4 * 2]; d1 = ip[4 * 3]; a1 += b1; d1 -= c1; e1 = (a1 - d1) >> 1; b1 = e1 - b1; c1 = e1 - c1; a1 -= c1; d1 += b1; op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); ip++; op++; } } static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; if (txfm_param->lossless) { assert(tx_type == DCT_DCT); av1_fwht4x4(src_diff, coeff, diff_stride); return; } av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; 
av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } #if !CONFIG_REALTIME_ONLY static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } #endif static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { int32_t *dst_coeff = (int32_t *)coeff; const TX_TYPE tx_type = txfm_param->tx_type; const int bd = txfm_param->bd; av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); } static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; const int bd = txfm_param->bd; av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); } static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; const int bd = txfm_param->bd; av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, bd); } #if !CONFIG_REALTIME_ONLY static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; const int bd = txfm_param->bd; av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; const int bd = txfm_param->bd; av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } #endif static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(txfm_param->tx_type == DCT_DCT); int32_t *dst_coeff = (int32_t *)coeff; const int bd = txfm_param->bd; 
av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); } void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { if (txfm_param->bd == 8) av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); else av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); } void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); } void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { case TX_64X64: highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X64: highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); break; case TX_64X32: highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X32: highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X16: highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X8: highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_4X8: highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X4: highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X16: highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X8: highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X32: highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X16: highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_4X4: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); break; #if !CONFIG_REALTIME_ONLY case TX_4X16: highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X4: highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); break; case TX_8X32: highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); break; case TX_32X8: highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); break; case TX_16X64: highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); break; case TX_64X16: highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); break; #endif default: assert(0); break; } } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { switch (tx_size) { // As the output transform co-efficients of 4x4 Hadamard transform can be // represented using 15 bits (for 12-bit clip) use lowbd variant of // hadamard_4x4. 
case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break; case TX_16X16: aom_highbd_hadamard_16x16(src_diff, src_stride, coeff); break; case TX_32X32: aom_highbd_hadamard_32x32(src_diff, src_stride, coeff); break; default: assert(0); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { switch (tx_size) { case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break; case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break; case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break; default: assert(0); } } void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, const int16_t *src_diff, int src_stride, tran_low_t *coeff) { if (use_hadamard) { #if CONFIG_AV1_HIGHBITDEPTH if (bd_info.use_highbitdepth_buf) { highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); } else { wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); } #else wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); #endif // CONFIG_AV1_HIGHBITDEPTH } else { TxfmParam txfm_param; txfm_param.tx_type = DCT_DCT; txfm_param.tx_size = tx_size; txfm_param.lossless = 0; txfm_param.bd = bd_info.bit_depth; txfm_param.is_hbd = bd_info.use_highbitdepth_buf; txfm_param.tx_set_type = EXT_TX_SET_ALL16; av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param); } } aom-3.12.1/av1/encoder/hybrid_fwd_txfm.h000066400000000000000000000026351477627663500200450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ #define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); /*!\brief Apply Hadamard or DCT transform * * \callergraph * DCT and Hadamard transforms are commonly used for quick RD score estimation. * The coeff buffer's size should be equal to the number of pixels * corresponding to tx_size. */ void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, const int16_t *src_diff, int src_stride, tran_low_t *coeff); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ aom-3.12.1/av1/encoder/interp_search.c000066400000000000000000001037271477627663500175130ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/pred_common.h" #include "av1/encoder/interp_search.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/rdopt_utils.h" #include "av1/encoder/reconinter_enc.h" // return mv_diff static inline int is_interp_filter_good_match( const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, int skip_level) { const int is_comp = has_second_ref(mi); int i; for (i = 0; i < 1 + is_comp; ++i) { if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX; } if (skip_level == 1 && is_comp) { if (st->comp_type != mi->interinter_comp.type) return INT_MAX; if (st->compound_idx != mi->compound_idx) return INT_MAX; } int mv_diff = 0; for (i = 0; i < 1 + is_comp; ++i) { mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) + abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col); } return mv_diff; } static inline int save_interp_filter_search_stat( MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, INTERPOLATION_FILTER_STATS *interp_filter_stats, int interp_filter_stats_idx) { if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) { INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, { mbmi->mv[0], mbmi->mv[1] }, { mbmi->ref_frame[0], mbmi->ref_frame[1] }, mbmi->interinter_comp.type, mbmi->compound_idx, rd, pred_sse }; interp_filter_stats[interp_filter_stats_idx] = stat; interp_filter_stats_idx++; } return interp_filter_stats_idx; } static inline int find_interp_filter_in_stats( MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, int interp_filter_stats_idx, int skip_level) { // [skip_levels][single or comp] const int thr[2][2] = { { 0, 0 }, { 3, 7 } }; const int is_comp = has_second_ref(mbmi); // Find good enough match. // TODO(yunqing): Separate single-ref mode and comp mode stats for fast // search. int best = INT_MAX; int match = -1; for (int j = 0; j < interp_filter_stats_idx; ++j) { const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j]; const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level); // Exact match is found. 
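// Otherwise the closest stat may still be reused: the summed MV difference
// (|d_row| + |d_col| over all references, in 1/8-pel units) must stay within
// thr[skip_level - 1][is_comp], i.e. effectively exact matches only when
// use_interp_filter == 1, and a tolerance of 3 (single ref) / 7 (compound)
// when use_interp_filter == 2.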
if (mv_diff == 0) { match = j; break; } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) { best = mv_diff; match = j; } } if (match != -1) { mbmi->interp_filters = interp_filter_stats[match].filters; return match; } return -1; // no match result found } static int find_interp_filter_match( MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, const InterpFilter assign_filter, const int need_search, INTERPOLATION_FILTER_STATS *interp_filter_stats, int interp_filter_stats_idx) { int match_found_idx = -1; if (cpi->sf.interp_sf.use_interp_filter && need_search) match_found_idx = find_interp_filter_in_stats( mbmi, interp_filter_stats, interp_filter_stats_idx, cpi->sf.interp_sf.use_interp_filter); if (!need_search || match_found_idx == -1) set_default_interp_filters(mbmi, assign_filter); return match_found_idx; } static inline int get_switchable_rate(MACROBLOCK *const x, const int_interpfilters filters, const int ctx[2], int dual_filter) { const InterpFilter filter0 = filters.as_filters.y_filter; int inter_filter_cost = x->mode_costs.switchable_interp_costs[ctx[0]][filter0]; if (dual_filter) { const InterpFilter filter1 = filters.as_filters.x_filter; inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1]; } return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; } // Build inter predictor and calculate model rd // for a given plane. static inline void interp_model_rd_eval( MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int plane_from, int plane_to, RD_STATS *rd_stats, int is_skip_build_pred) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; RD_STATS tmp_rd_stats; av1_init_rd_stats(&tmp_rd_stats); // Skip inter predictor if the predictor is already available. if (!is_skip_build_pred) { const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, plane_from, plane_to); } model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? MODELRD_LEGACY : MODELRD_TYPE_INTERP_FILTER]( cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate, &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL, NULL, NULL); av1_merge_rd_stats(rd_stats, &tmp_rd_stats); } // calculate the rdcost of given interpolation_filter static inline int64_t interpolation_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate, const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], const int skip_pred) { const AV1_COMMON *cm = &cpi->common; const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; RD_STATS this_rd_stats_luma, this_rd_stats; // Initialize rd_stats structures to default values. 
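// Note: this_rd_stats is seeded with the caller's current best luma stats so
// that the chroma-only path (INTERP_SKIP_LUMA_EVAL_CHROMA) below can simply
// accumulate chroma cost on top of them.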
av1_init_rd_stats(&this_rd_stats_luma); this_rd_stats = *rd_stats_luma; const int_interpfilters last_best = mbmi->interp_filters; mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, cm->seq_params->enable_dual_filter); int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); if (min_rd > *rd) { mbmi->interp_filters = last_best; return 0; } (void)tile_data; assert(skip_pred != 2); assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0)); assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0)); assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0)); assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1)); assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1)); assert((skip_pred >= 0) && (skip_pred <= interp_search_flags->default_interp_skip_flags)); // When skip_txfm pred is equal to default_interp_skip_flags, // skip both luma and chroma MC. // For mono-chrome images: // num_planes = 1 and cpi->default_interp_skip_flags = 1, // skip_pred = 1: skip both luma and chroma // skip_pred = 0: Evaluate luma and as num_planes=1, // skip chroma evaluation int tmp_skip_pred = (skip_pred == interp_search_flags->default_interp_skip_flags) ? INTERP_SKIP_LUMA_SKIP_CHROMA : skip_pred; switch (tmp_skip_pred) { case INTERP_EVAL_LUMA_EVAL_CHROMA: // skip_pred = 0: Evaluate both luma and chroma. // Luma MC interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, &this_rd_stats_luma, 0); this_rd_stats = this_rd_stats_luma; #if CONFIG_COLLECT_RD_STATS == 3 RD_STATS rd_stats_y; av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 3 AOM_FALLTHROUGH_INTENDED; case INTERP_SKIP_LUMA_EVAL_CHROMA: // skip_pred = 1: skip luma evaluation (retain previous best luma stats) // and do chroma evaluation. 
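// Before evaluating each chroma plane, the cost accumulated so far is checked
// against *rd so the search can exit early (restoring the previous best
// filters) when this filter pair can no longer win.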
for (int plane = 1; plane < num_planes; ++plane) { int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); if (tmp_rd >= *rd) { mbmi->interp_filters = last_best; return 0; } interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane, &this_rd_stats, 0); } break; case INTERP_SKIP_LUMA_SKIP_CHROMA: // both luma and chroma evaluation is skipped this_rd_stats = *rd_stats; break; case INTERP_EVAL_INVALID: default: assert(0); return 0; } int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); if (tmp_rd < *rd) { *rd = tmp_rd; *switchable_rate = tmp_rs; if (skip_pred != interp_search_flags->default_interp_skip_flags) { if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) { // Overwrite the data as current filter is the best one *rd_stats_luma = this_rd_stats_luma; *rd_stats = this_rd_stats; // As luma MC data is computed, no need to recompute after the search x->recalc_luma_mc_data = 0; } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) { // As luma MC data is not computed, update of luma data can be skipped *rd_stats = this_rd_stats; // As luma MC data is not recomputed and current filter is the best, // indicate the possibility of recomputing MC data // If current buffer contains valid MC data, toggle to indicate that // luma MC data needs to be recomputed x->recalc_luma_mc_data ^= 1; } swap_dst_buf(xd, dst_bufs, num_planes); } return 1; } mbmi->interp_filters = last_best; return 0; } static inline INTERP_PRED_TYPE is_pred_filter_search_allowed( const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, int_interpfilters *af, int_interpfilters *lf) { const AV1_COMMON *cm = &cpi->common; const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; const int bsl = mi_size_wide_log2[bsize]; int is_horiz_eq = 0, is_vert_eq = 0; if (above_mbmi && is_inter_block(above_mbmi)) *af = above_mbmi->interp_filters; if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters; if (af->as_filters.x_filter != INTERP_INVALID) is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter; if (af->as_filters.y_filter != INTERP_INVALID) is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter; INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int pred_filter_enable = cpi->sf.interp_sf.cb_pred_filter_search ? 
(((mi_row + mi_col) >> bsl) + get_chessboard_index(cm->current_frame.frame_number)) & 0x1 : 0; pred_filter_enable &= is_horiz_eq || is_vert_eq; // pred_filter_search = 0: pred_filter is disabled // pred_filter_search = 1: pred_filter is enabled and only horz pred matching // pred_filter_search = 2: pred_filter is enabled and only vert pred matching // pred_filter_search = 3: pred_filter is enabled and // both vert, horz pred matching return pred_filter_enable * pred_filter_type; } static DUAL_FILTER_TYPE find_best_interp_rd_facade( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, RD_STATS *rd_stats, int *const switchable_rate, const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) { int tmp_skip_pred = skip_pred; DUAL_FILTER_TYPE best_filt_type = REG_REG; // If no filter are set to be evaluated, return from function if (allow_interp_mask == 0x0) return best_filt_type; // For block width or height is 4, skip the pred evaluation of SHARP_SHARP tmp_skip_pred = is_w4_or_h4 ? cpi->interp_search_flags.default_interp_skip_flags : skip_pred; // Loop over the all filter types and evaluate for only allowed filter types for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) { const int is_filter_allowed = get_interp_filter_allowed_mask(allow_interp_mask, filt_type); if (is_filter_allowed) if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, filt_type, switchable_ctx, tmp_skip_pred)) best_filt_type = filt_type; tmp_skip_pred = skip_pred; } return best_filt_type; } static inline void pred_dual_interp_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, RD_STATS *rd_stats, int *const switchable_rate, const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af, int_interpfilters *lf) { (void)lf; assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ); assert(pred_filt_type < INTERP_PRED_TYPE_ALL); uint16_t allowed_interp_mask = 0; if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) { // pred_filter_search = 1: Only horizontal filter is matching allowed_interp_mask = av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter]; } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) { // pred_filter_search = 2: Only vertical filter is matching allowed_interp_mask = av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter]; } else { // pred_filter_search = 3: Both horizontal and vertical filter are matching int filt_type = af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS; set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type); } // REG_REG is already been evaluated in the beginning reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, switchable_ctx, skip_pred, allowed_interp_mask, 0); } // Evaluate dual filter type // a) Using above, left block interp filter // b) Find the best horizontal filter and // then evaluate corresponding vertical filters. 
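// The dual-filter index used below is x_filter + y_filter * SWITCHABLE_FILTERS
// (see the filter_sets[] layout in interp_search.h). Step (b) first scans
// indices 1 .. SWITCHABLE_FILTERS - 1 (vertical filter fixed to
// EIGHTTAP_REGULAR) to find the best horizontal filter, then revisits that
// column at best_dual_mode + SWITCHABLE_FILTERS and
// best_dual_mode + 2 * SWITCHABLE_FILTERS to decide the vertical filter;
// e.g. if index 1 wins the horizontal pass, the vertical pass evaluates
// indices 4 and 7.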
static inline void fast_dual_interp_filter_rd( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, RD_STATS *rd_stats, int *const switchable_rate, const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], const int skip_hor, const int skip_ver) { const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); int_interpfilters lf = af; if (!have_newmv_in_inter_mode(mbmi->mode)) { pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); } if (pred_filter_type) { pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, switchable_ctx, (skip_hor & skip_ver), pred_filter_type, &af, &lf); } else { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; int best_dual_mode = 0; int skip_pred = bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor; // TODO(any): Make use of find_best_interp_rd_facade() // if speed impact is negligible for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, i, switchable_ctx, skip_pred)) { best_dual_mode = i; } skip_pred = skip_hor; } // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes skip_pred = bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver; for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2)); i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) { interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, i, switchable_ctx, skip_pred); skip_pred = skip_ver; } } } // Find the best interp filter if dual_interp_filter = 0 static inline void find_best_non_dual_interp_filter( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, RD_STATS *rd_stats, int *const switchable_rate, const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], const int skip_ver, const int skip_hor) { const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; int8_t i; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; uint16_t interp_filter_search_mask = interp_search_flags->interp_filter_search_mask; if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0); const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1); int use_actual_frame_probs = 1; const int *switchable_interp_p0; const int *switchable_interp_p1; #if CONFIG_FPMT_TEST use_actual_frame_probs = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
0 : 1; if (!use_actual_frame_probs) { switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs .switchable_interp_probs[update_type][ctx0]; switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs .switchable_interp_probs[update_type][ctx1]; } #endif if (use_actual_frame_probs) { switchable_interp_p0 = cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0]; switchable_interp_p1 = cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1]; } static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 }; const int thresh = thr[update_type]; for (i = 0; i < SWITCHABLE_FILTERS; i++) { // For non-dual case, the 2 dir's prob should be identical. assert(switchable_interp_p0[i] == switchable_interp_p1[i]); if (switchable_interp_p0[i] < thresh && switchable_interp_p1[i] < thresh) { DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i; reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type); } } } // Regular filter evaluation should have been done and hence the same should // be the winner assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int); if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) { INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); int_interpfilters lf = af; pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); if (pred_filter_type) { assert(af.as_filters.x_filter != INTERP_INVALID); int filter_idx = SWITCHABLE * af.as_filters.x_filter; // This assert tells that (filter_x == filter_y) for non-dual filter case assert(filter_sets[filter_idx].as_filters.x_filter == filter_sets[filter_idx].as_filters.y_filter); if (cpi->sf.interp_sf.adaptive_interp_filter_search && !(get_interp_filter_allowed_mask(interp_filter_search_mask, filter_idx))) { return; } if (filter_idx) { interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, filter_idx, switchable_ctx, (skip_hor & skip_ver)); } return; } } // Reuse regular filter's modeled rd data for sharp filter for following // cases // 1) When bsize is 4x4 // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical // direction is full-pel // 3) When block height is 4 (i.e. 
8x4/16x4 blocks) and MV in horizontal // direction is full-pel // TODO(any): Optimize cases 2 and 3 further if luma MV in relavant direction // alone is full-pel if ((bsize == BLOCK_4X4) || (block_size_wide[bsize] == 4 && skip_ver == interp_search_flags->default_interp_skip_flags) || (block_size_high[bsize] == 4 && skip_hor == interp_search_flags->default_interp_skip_flags)) { int skip_pred = skip_hor & skip_ver; uint16_t allowed_interp_mask = 0; // REG_REG filter type is evaluated beforehand, hence skip it set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP); set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH); if (cpi->sf.interp_sf.adaptive_interp_filter_search) allowed_interp_mask &= interp_filter_search_mask; find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, switchable_ctx, skip_pred, allowed_interp_mask, 1); } else { int skip_pred = (skip_hor & skip_ver); for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE; i += (SWITCHABLE_FILTERS + 1)) { // This assert tells that (filter_x == filter_y) for non-dual filter case assert(filter_sets[i].as_filters.x_filter == filter_sets[i].as_filters.y_filter); if (cpi->sf.interp_sf.adaptive_interp_filter_search && !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) { continue; } interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, rd_stats, switchable_rate, dst_bufs, i, switchable_ctx, skip_pred); // In first iteration, smooth filter is evaluated. If smooth filter // (which is less sharper) is the winner among regular and smooth filters, // sharp filter evaluation is skipped // TODO(any): Refine this gating based on modelled rd only (i.e., by not // accounting switchable filter rate) if (cpi->sf.interp_sf.skip_sharp_interp_filter_search && skip_pred != interp_search_flags->default_interp_skip_flags) { if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int) break; } } } } static inline void calc_interp_skip_pred_flag(MACROBLOCK *const x, const AV1_COMP *const cpi, int *skip_hor, int *skip_ver) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int num_planes = av1_num_planes(cm); const int is_compound = has_second_ref(mbmi); assert(is_intrabc_block(mbmi) == 0); for (int ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]); // TODO(any): Refine skip flag calculation considering scaling if (av1_is_scaled(sf)) { *skip_hor = 0; *skip_ver = 0; break; } const MV mv = mbmi->mv[ref].as_mv; int skip_hor_plane = 0; int skip_ver_plane = 0; for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1)); ++plane_idx) { struct macroblockd_plane *const pd = &xd->plane[plane_idx]; const int bw = pd->width; const int bh = pd->height; const MV mv_q4 = clamp_mv_to_umv_border_sb( xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; skip_hor_plane |= ((sub_x == 0) << plane_idx); skip_ver_plane |= ((sub_y == 0) << plane_idx); } *skip_hor &= skip_hor_plane; *skip_ver &= skip_ver_plane; // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" assert(*skip_hor != 2); assert(*skip_ver != 2); } // When compond prediction type is compound segment wedge, luma MC and chroma // MC need to go hand in hand as mask generated during luma MC is 
reuired for // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during // vertical filter decision may be incorrect as temporary MC evaluation // overwrites the mask. Make skip_ver as 0 for this case so that mask is // populated during luma MC if (is_compound && mbmi->compound_idx == 1 && mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { assert(mbmi->comp_group_idx == 1); if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0; } } /*!\brief AV1 interpolation filter search * * \ingroup inter_mode_search * * \param[in] cpi Top-level encoder structure. * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during * encoding. * \param[in] x Pointer to struc holding all the data for * the current macroblock. * \param[in] bsize Current block size. * \param[in] tmp_dst A temporary prediction buffer to hold a * computed prediction. * \param[in,out] orig_dst A prediction buffer to hold a computed * prediction. This will eventually hold the * final prediction, and the tmp_dst info will * be copied here. * \param[in,out] rd The RD cost associated with the selected * interpolation filter parameters. * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE * filter mode. * \param[in,out] skip_build_pred Indicates whether or not to build the inter * predictor. If this is 0, the inter predictor * has already been built and thus we can avoid * repeating computation. * \param[in] args HandleInterModeArgs struct holding * miscellaneous arguments for inter mode * search. See the documentation for this * struct for a description of each member. * \param[in] ref_best_rd Best RD found so far for this block. * It is used for early termination of this * search if the RD exceeds this value. * * \return Returns INT64_MAX if the filter parameters are invalid and the * current motion mode being tested should be skipped. It returns 0 if the * parameter search is a success. */ int64_t av1_interpolation_filter_search( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, int64_t *const rd, int *const switchable_rate, int *skip_build_pred, HandleInterModeArgs *args, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int need_search = av1_is_interp_needed(xd); const int ref_frame = xd->mi[0]->ref_frame[0]; RD_STATS rd_stats_luma, rd_stats; // Initialization of rd_stats structures with default values av1_init_rd_stats(&rd_stats_luma); av1_init_rd_stats(&rd_stats); int match_found_idx = -1; const InterpFilter assign_filter = cm->features.interp_filter; match_found_idx = find_interp_filter_match( mbmi, cpi, assign_filter, need_search, args->interp_filter_stats, args->interp_filter_stats_idx); if (match_found_idx != -1) { *rd = args->interp_filter_stats[match_found_idx].rd; x->pred_sse[ref_frame] = args->interp_filter_stats[match_found_idx].pred_sse; *skip_build_pred = 0; return 0; } int switchable_ctx[2]; switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0); switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, cm->seq_params->enable_dual_filter); // Do MC evaluation for default filter_type. 
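// At this point no stats match was found, so mbmi->interp_filters holds the
// defaults assigned via find_interp_filter_match(): the frame-level filter,
// or EIGHTTAP_REGULAR in both directions when the frame filter is SWITCHABLE.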
// Luma MC interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, &rd_stats_luma, *skip_build_pred); #if CONFIG_COLLECT_RD_STATS == 3 RD_STATS rd_stats_y; av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 3 // Chroma MC if (num_planes > 1) { interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V, &rd_stats, *skip_build_pred); } *skip_build_pred = 1; av1_merge_rd_stats(&rd_stats, &rd_stats_luma); assert(rd_stats.rate >= 0); *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist); x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); if (assign_filter != SWITCHABLE || match_found_idx != -1) { return 0; } if (!need_search) { int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); assert(mbmi->interp_filters.as_int == filters.as_int); (void)filters; return 0; } if (args->modelled_rd != NULL) { if (has_second_ref(mbmi)) { const int ref_mv_idx = mbmi->ref_mv_idx; MV_REFERENCE_FRAME *refs = mbmi->ref_frame; const int mode0 = compound_ref0_mode(mbmi->mode); const int mode1 = compound_ref1_mode(mbmi->mode); const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], args->modelled_rd[mode1][ref_mv_idx][refs[1]]); if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { return INT64_MAX; } } } x->recalc_luma_mc_data = 0; // skip_flag=xx (in binary form) // Setting 0th flag corresonds to skipping luma MC and setting 1st bt // corresponds to skipping chroma MC skip_flag=0 corresponds to "Don't skip // luma and chroma MC" Skip flag=1 corresponds to "Skip Luma MC only" // Skip_flag=2 is not a valid case // skip_flag=3 corresponds to "Skip both luma and chroma MC" int skip_hor = interp_search_flags->default_interp_skip_flags; int skip_ver = interp_search_flags->default_interp_skip_flags; calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver); // do interp_filter search restore_dst_buf(xd, *tmp_dst, num_planes); const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; // Evaluate dual interp filters if (cm->seq_params->enable_dual_filter) { if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) { fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, switchable_rate, dst_bufs, switchable_ctx, skip_hor, skip_ver); } else { // Use full interpolation filter search uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK; // REG_REG filter type is evaluated beforehand, so loop is repeated over // REG_SMOOTH to SHARP_SHARP for full interpolation filter search reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, switchable_rate, dst_bufs, switchable_ctx, (skip_hor & skip_ver), allowed_interp_mask, 0); } } else { // Evaluate non-dual interp filters find_best_non_dual_interp_filter( x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor); } swap_dst_buf(xd, dst_bufs, num_planes); // Recompute final MC data if required if (x->recalc_luma_mc_data == 1) { // Recomputing final luma MC data is required only if the same was skipped // in either of the directions Condition below is necessary, but not // sufficient assert((skip_hor == 1) || (skip_ver == 1)); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 
AOM_PLANE_Y, AOM_PLANE_Y); } x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); // save search results if (cpi->sf.interp_sf.use_interp_filter) { assert(match_found_idx == -1); args->interp_filter_stats_idx = save_interp_filter_search_stat( mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats, args->interp_filter_stats_idx); } return 0; } aom-3.12.1/av1/encoder/interp_search.h000066400000000000000000000147171477627663500175200ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ #define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" #include "av1/encoder/rdopt_utils.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ #define MAX_INTERP_FILTER_STATS 128 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) typedef struct { int_interpfilters filters; int_mv mv[2]; int8_t ref_frames[2]; COMPOUND_TYPE comp_type; int compound_idx; int64_t rd; unsigned int pred_sse; } INTERPOLATION_FILTER_STATS; /*!\endcond */ /*!\brief Miscellaneous arguments for inter mode search. */ typedef struct HandleInterModeArgs { /*! * Buffer for the above predictor in OBMC */ uint8_t *above_pred_buf[MAX_MB_PLANE]; /*! * Stride for the above predictor in OBMC */ int above_pred_stride[MAX_MB_PLANE]; /*! * Buffer for the left predictor in OBMC */ uint8_t *left_pred_buf[MAX_MB_PLANE]; /*! * Stride for the left predictor in OBMC */ int left_pred_stride[MAX_MB_PLANE]; /*! * Pointer to the first member in a 2D array which holds * single reference mode motion vectors to be used as a starting * point in the mv search for compound modes. Each array is length REF_FRAMES, * meaning there is a slot for a single reference motion vector for * each possible reference frame. The 2D array consists of N of these arrays, * where N is the length of the reference mv stack computed for the single * reference case for that particular reference frame. */ int_mv (*single_newmv)[REF_FRAMES]; /*! * Pointer to the first array of a 2D array with the same setup as * single_newmv array above. This is a 2D array to hold the rate * corresponding to each of the single reference mode motion vectors * held in single_newmv. */ int (*single_newmv_rate)[REF_FRAMES]; /*! * Pointer to the first array of a 2D array with the same setup as * single_newmv array above. This is a 2D array to hold a 0 or 1 * validity value corresponding to each of the single reference mode motion * vectors held in single_newmv. */ int (*single_newmv_valid)[REF_FRAMES]; /*! * Pointer to the first array in a 3D array of predicted rate-distortion. * The dimensions of this structure are: * (number of possible inter modes) X * (number of reference MVs) X * (number of reference frames). */ int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; /*! * Holds an estimated entropy cost for picking the current reference frame. * This is used to compute an rd estimate. */ int ref_frame_cost; /*! 
* Holds an estimated entropy cost for picking single or compound * reference. This is used to compute an rd estimate. */ int single_comp_cost; /*! * Pointer to the first element in a 3D array holding rd's of * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref * modes used to determine compound ref modes. The full structure is: * (number of inter modes) X (length of refmv list) X (number of ref frames) */ int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; /*! * An integer value 0 or 1 which indicates whether or not to skip the motion * mode search and default to SIMPLE_TRANSLATION as a speed feature. */ int skip_motion_mode; /*! * Initialized to false. If true, skips interpolation filter search and uses * the default EIGHTTAP_REGULAR. */ bool skip_ifs; /*! * A pointer to the first element in an array of INTERINTRA_MODE types. This * contains the best inter_intra mode for each reference frame. */ INTERINTRA_MODE *inter_intra_mode; /*! * Array of saved interpolation filter stats collected to avoid repeating * an interpolation filter search when the mv and ref_frame are the same * as a previous search. */ INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS]; /*! * Stack to store full pixel search start mv of NEWMV mode. */ FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2]; /*! * Stack to store ref_mv_idx of NEWMV mode. */ uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2]; /*! * Count of mvs in start mv stack. */ int start_mv_cnt; /*! * Index of the last set of saved stats in the interp_filter_stats array. */ int interp_filter_stats_idx; /*! * Estimated wedge index. */ int wedge_index; /*! * Estimated wedge sign. */ int wedge_sign; /*! * Estimated diff wtd index. */ int diffwtd_index; /*! * Estimated cmp mode. */ int cmp_mode[MODE_CTX_REF_FRAMES]; /*! * The best sse during single new_mv search. Note that the sse here comes from * single_motion_search, and not from interpolation_filter_search. This has * two implications: * 1. The mv used to calculate the sse here does not have to be the best sse * found in handle_inter_mode. * 2. Even if the mvs agree, the sse here can differ from the sse in \ref * MACROBLOCK::pred_sse due to different interpolation filter used. */ unsigned int best_single_sse_in_refs[REF_FRAMES]; /*! * Holds the sse of best mode so far in the mode evaluation process. This is * used in intermediate termination of NEWMV mode evaluation. */ unsigned int best_pred_sse; } HandleInterModeArgs; /*!\cond */ static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0 { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1 { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 }; int64_t av1_interpolation_filter_search( MACROBLOCK *const x, const AV1_COMP *const cpi, const TileDataEnc *tile_data, BLOCK_SIZE bsize, const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, int64_t *const rd, int *const switchable_rate, int *skip_build_pred, HandleInterModeArgs *args, int64_t ref_best_rd); /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ aom-3.12.1/av1/encoder/intra_mode_search.c000066400000000000000000002206461477627663500203330ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/reconintra.h" #include "av1/encoder/intra_mode_search.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/palette.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/tx_search.h" // Even though there are 7 delta angles, this macro is set to 9 to facilitate // the rd threshold check to prune -3 and 3 delta angles. #define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3) // The order for evaluating delta angles while processing the luma directional // intra modes. Currently, this order of evaluation is applicable only when // speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case, // even angles are evaluated first in order to facilitate the pruning of odd // delta angles based on the rd costs of the neighboring delta angles. static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = { -2, 2, -3, -1, 1, 3, }; /*!\cond */ static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, D67_PRED, D113_PRED, D45_PRED, }; static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, UV_D113_PRED, UV_D45_PRED, }; // The bitmask corresponds to the filter intra modes as defined in enums.h // FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to // disable the evaluation of corresponding filter intra mode. The table // av1_derived_filter_intra_mode_used_flag is used when speed feature // prune_filter_intra_level is 1. The evaluated filter intra modes are union // of the following: // 1) FILTER_DC_PRED // 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED, // D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED). static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = { 0x01, // DC_PRED: 0000 0001 0x03, // V_PRED: 0000 0011 0x05, // H_PRED: 0000 0101 0x01, // D45_PRED: 0000 0001 0x01, // D135_PRED: 0000 0001 0x01, // D113_PRED: 0000 0001 0x09, // D157_PRED: 0000 1001 0x01, // D203_PRED: 0000 0001 0x01, // D67_PRED: 0000 0001 0x01, // SMOOTH_PRED: 0000 0001 0x01, // SMOOTH_V_PRED: 0000 0001 0x01, // SMOOTH_H_PRED: 0000 0001 0x11 // PAETH_PRED: 0001 0001 }; // The bitmask corresponds to the chroma intra modes as defined in enums.h // UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to // disable the evaluation of corresponding chroma intra mode. The table // av1_derived_chroma_intra_mode_used_flag is used when speed feature // prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma // intra modes are union of the following: // 1) UV_DC_PRED // 2) UV_SMOOTH_PRED // 3) UV_CFL_PRED // 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma // intra mode winner is V_PRED). 
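// As an illustration of how the table below is consulted when the speed feature is // enabled (the check itself lives in av1_rd_pick_intra_sbuv_mode()): with a luma // winner of V_PRED the entry 0x2203 keeps only the bits for UV_DC_PRED, UV_V_PRED, // UV_SMOOTH_PRED and UV_CFL_PRED, and a chroma mode uv_mode is evaluated only when // (av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)) is non-zero.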
static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = { 0x2201, // DC_PRED: 0010 0010 0000 0001 0x2203, // V_PRED: 0010 0010 0000 0011 0x2205, // H_PRED: 0010 0010 0000 0101 0x2209, // D45_PRED: 0010 0010 0000 1001 0x2211, // D135_PRED: 0010 0010 0001 0001 0x2221, // D113_PRED: 0010 0010 0010 0001 0x2241, // D157_PRED: 0010 0010 0100 0001 0x2281, // D203_PRED: 0010 0010 1000 0001 0x2301, // D67_PRED: 0010 0011 0000 0001 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001 0x3201 // PAETH_PRED: 0011 0010 0000 0001 }; DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 }; DECLARE_ALIGNED(16, static const uint16_t, highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, const int stride, const int is_hbd) { unsigned int sse; if (is_hbd) return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse); else return vf(buf, stride, all_zeros, 0, &sse); } // Computes average of log(1 + variance) across 4x4 sub-blocks for source and // reconstructed blocks. static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bs, double *avg_log_src_variance, double *avg_log_recon_variance) { const MACROBLOCKD *const xd = &x->e_mbd; const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); const int right_overflow = (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; const int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow); const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow); const int is_hbd = is_cur_buf_hbd(xd); aom_variance_fn_t vf = cpi->ppi->fn_ptr[BLOCK_4X4].vf; for (int i = 0; i < bh; i += MI_SIZE) { const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2); for (int j = 0; j < bw; j += MI_SIZE) { const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2); const int mi_offset = r * mi_size_wide[sb_size] + c; Block4x4VarInfo *block_4x4_var_info = &x->src_var_info_of_4x4_sub_blocks[mi_offset]; int src_var = block_4x4_var_info->var; double log_src_var = block_4x4_var_info->log_var; // Compute average of log(1 + variance) for the source block from 4x4 // sub-block variance values. Calculate and store 4x4 sub-block variance // and log(1 + variance), if the values present in // src_var_of_4x4_sub_blocks are invalid. Reuse the same if it is readily // available with valid values. if (src_var < 0) { src_var = av1_calc_normalized_variance( vf, x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, is_hbd); block_4x4_var_info->var = src_var; log_src_var = log1p(src_var / 16.0); block_4x4_var_info->log_var = log_src_var; } else { // When source variance is already calculated and available for // retrieval, check if log(1 + variance) is also available. If it is // available, then retrieve from buffer. Else, calculate the same and // store to the buffer. 
if (log_src_var < 0) { log_src_var = log1p(src_var / 16.0); block_4x4_var_info->log_var = log_src_var; } } *avg_log_src_variance += log_src_var; const int recon_var = av1_calc_normalized_variance( vf, xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j, xd->plane[0].dst.stride, is_hbd); *avg_log_recon_variance += log1p(recon_var / 16.0); } } const int blocks = (bw * bh) / 16; *avg_log_src_variance /= (double)blocks; *avg_log_recon_variance /= (double)blocks; } // Returns a factor to be applied to the RD value based on how well the // reconstructed block variance matches the source variance. static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed); // For non-positive threshold values, the comparison of source and // reconstructed variances with threshold evaluates to false // (src_var < threshold/rec_var < threshold) as these metrics are greater // than 0. Hence further calculations are skipped. if (threshold <= 0) return 1.0; double variance_rd_factor = 1.0; double avg_log_src_variance = 0.0; double avg_log_recon_variance = 0.0; double var_diff = 0.0; compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance, &avg_log_recon_variance); // Don't allow 0, to prevent division by 0 below. avg_log_src_variance += 0.000001; avg_log_recon_variance += 0.000001; if (avg_log_src_variance >= avg_log_recon_variance) { var_diff = (avg_log_src_variance - avg_log_recon_variance); if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) { variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance); } } else { var_diff = (avg_log_recon_variance - avg_log_src_variance); if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) { variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance)); } } // Limit the adjustment. variance_rd_factor = AOMMIN(3.0, variance_rd_factor); return variance_rd_factor; } /*!\endcond */ /*!\brief Search for the best filter_intra mode when coding intra frame. * * \ingroup intra_mode_search * \callergraph * This function loops through all filter_intra modes to find the best one. * * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise. */ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, BLOCK_SIZE bsize, int mode_cost, PREDICTION_MODE best_mode_so_far, int64_t *best_rd, int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { // Skip the evaluation of filter intra modes. if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; int filter_intra_selected_flag = 0; FILTER_INTRA_MODE mode; TX_SIZE best_tx_size = TX_8X8; FILTER_INTRA_MODE_INFO filter_intra_mode_info; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->mode = DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have // filter-intra as winner.
if (x->use_mb_mode_cache && !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra) return 0; for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { int64_t this_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) && !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] & (1 << mode))) continue; // Skip the evaluation of modes that do not match with the winner mode in // x->mb_mode_cache. if (x->use_mb_mode_cache && mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode) continue; if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { continue; } av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; const int this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); // Visual quality adjustment based on recon vs source variance. if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) { this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize)); } // Collect mode stats for multiwinner mode processing const int txfm_search_done = 1; store_winner_mode_stats( &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); if (this_rd < *best_rd) { *best_rd = this_rd; best_tx_size = mbmi->tx_size; filter_intra_mode_info = mbmi->filter_intra_mode_info; av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); *rate = this_rate; *rate_tokenonly = tokenonly_rd_stats.rate; *distortion = tokenonly_rd_stats.dist; *skippable = tokenonly_rd_stats.skip_txfm; filter_intra_selected_flag = 1; } } if (filter_intra_selected_flag) { mbmi->mode = DC_PRED; mbmi->tx_size = best_tx_size; mbmi->filter_intra_mode_info = filter_intra_mode_info; av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); return 1; } else { return 0; } } void av1_count_colors(const uint8_t *src, int stride, int rows, int cols, int *val_count, int *num_colors) { const int max_pix_val = 1 << 8; memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { const int this_val = src[r * stride + c]; assert(this_val < max_pix_val); ++val_count[this_val]; } } int n = 0; for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } *num_colors = n; } void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, int bit_depth, int *val_count, int *bin_val_count, int *num_color_bins, int *num_colors) { assert(bit_depth <= 12); const int max_bin_val = 1 << 8; const int max_pix_val = 1 << bit_depth; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0])); if (val_count != NULL) memset(val_count, 0, max_pix_val * sizeof(val_count[0])); for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { /* * Down-convert the pixels to 8-bit domain before counting. * This provides consistency of behavior for palette search * between lbd and hbd encodes. This down-converted pixels * are only used for calculating the threshold (n). 
*/ const int this_val = ((src[r * stride + c]) >> (bit_depth - 8)); assert(this_val < max_bin_val); if (this_val >= max_bin_val) continue; ++bin_val_count[this_val]; if (val_count != NULL) ++val_count[(src[r * stride + c])]; } } int n = 0; // Count the colors based on 8-bit domain used to gate the palette path for (int i = 0; i < max_bin_val; ++i) { if (bin_val_count[i]) ++n; } *num_color_bins = n; // Count the actual hbd colors used to create top_colors n = 0; if (val_count != NULL) { for (int i = 0; i < max_pix_val; ++i) { if (val_count[i]) ++n; } *num_colors = n; } } void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, int reorder_delta_angle_eval) { if (mode_idx < INTRA_MODE_END) { mbmi->mode = intra_rd_search_mode_order[mode_idx]; mbmi->angle_delta[PLANE_TYPE_Y] = 0; } else { mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED; int delta_angle_eval_idx = (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2); if (reorder_delta_angle_eval) { mbmi->angle_delta[PLANE_TYPE_Y] = luma_delta_angles_order[delta_angle_eval_idx]; } else { mbmi->angle_delta[PLANE_TYPE_Y] = (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3) : (delta_angle_eval_idx - 2)); } } } static inline int get_model_rd_index_for_pruning( const MACROBLOCK *const x, const INTRA_MODE_SPEED_FEATURES *const intra_sf) { const int top_intra_model_count_allowed = intra_sf->top_intra_model_count_allowed; if (!intra_sf->adapt_top_model_rd_count_using_neighbors) return top_intra_model_count_allowed - 1; const MACROBLOCKD *const xd = &x->e_mbd; const PREDICTION_MODE mode = xd->mi[0]->mode; int model_rd_index_for_pruning = top_intra_model_count_allowed - 1; int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0; if (xd->left_available) is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode; if (xd->up_available) is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode; // The pruning of luma intra modes is made more aggressive at lower quantizers // and vice versa. The value for model_rd_index_for_pruning is derived as // follows. // qidx 0 to 127: Reduce the index of a candidate used for comparison only if // the current mode does not match either of the available neighboring modes. // qidx 128 to 255: Reduce the index of a candidate used for comparison only // if the current mode does not match both the available neighboring modes. if (x->qindex <= 127) { if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode) model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0); } else { if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode) model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0); } return model_rd_index_for_pruning; } /*! \brief prune luma intra mode based on the model rd. * \param[in] this_model_rd model rd for current mode. * \param[in] best_model_rd Best model RD seen for this block so * far. * \param[in] top_intra_model_rd Top intra model RD seen for this * block so far. * \param[in] max_model_cnt_allowed The maximum number of top intra * model RD allowed. * \param[in] model_rd_index_for_pruning Index of the candidate used for * pruning based on model rd. 
*/ static int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, int64_t top_intra_model_rd[], int max_model_cnt_allowed, int model_rd_index_for_pruning) { const double thresh_best = 1.50; const double thresh_top = 1.00; for (int i = 0; i < max_model_cnt_allowed; i++) { if (this_model_rd < top_intra_model_rd[i]) { for (int j = max_model_cnt_allowed - 1; j > i; j--) { top_intra_model_rd[j] = top_intra_model_rd[j - 1]; } top_intra_model_rd[i] = this_model_rd; break; } } if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX && this_model_rd > thresh_top * top_intra_model_rd[model_rd_index_for_pruning]) return 1; if (this_model_rd != INT64_MAX && this_model_rd > thresh_best * (*best_model_rd)) return 1; if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; return 0; } // Run RD calculation with given chroma intra prediction angle, and return // the RD cost. Update the best mode info if the RD cost is the best so far. static int64_t pick_intra_angle_routine_sbuv( const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats, int *best_angle_delta, int64_t *best_rd) { MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; assert(!is_inter_block(mbmi)); int this_rate; int64_t this_rd; RD_STATS tokenonly_rd_stats; if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) return INT64_MAX; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; *rate = this_rate; rd_stats->rate = tokenonly_rd_stats.rate; rd_stats->dist = tokenonly_rd_stats.dist; rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm; } return this_rd; } /*!\brief Search for the best angle delta for chroma prediction * * \ingroup intra_mode_search * \callergraph * Given a chroma directional intra prediction mode, this function will try to * estimate the best delta_angle. * * \returns Returns whether there is a new mode with smaller rdcost than best_rd. */ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int rate_overhead, int64_t best_rd, int *rate, RD_STATS *rd_stats) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); int i, angle_delta, best_angle_delta = 0; int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; rd_stats->rate = INT_MAX; rd_stats->skip_txfm = 0; rd_stats->dist = INT64_MAX; for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { for (i = 0; i < 2; ++i) { best_rd_in = (best_rd == INT64_MAX) ? INT64_MAX : (best_rd + (best_rd >> ((angle_delta == 0) ?
3 : 5))); mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd_in, rate, rd_stats, &best_angle_delta, &best_rd); rd_cost[2 * angle_delta + i] = this_rd; if (angle_delta == 0) { if (this_rd == INT64_MAX) return 0; rd_cost[1] = this_rd; break; } } } assert(best_rd != INT64_MAX); for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { int64_t rd_thresh; for (i = 0; i < 2; ++i) { int skip_search = 0; rd_thresh = best_rd + (best_rd >> 5); if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) skip_search = 1; if (!skip_search) { mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd, rate, rd_stats, &best_angle_delta, &best_rd); } } } mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta; return rd_stats->rate != INT_MAX; } #define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign, int *cfl_alpha) { int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO; if (cfl_linear_idx == 0) { *cfl_sign = CFL_SIGN_ZERO; *cfl_alpha = 0; } else { *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG; *cfl_alpha = abs(cfl_linear_idx) - 1; } } static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int cfl_idx, int fast_mode, RD_STATS *rd_stats) { assert(IMPLIES(fast_mode, rd_stats == NULL)); const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int cfl_plane = get_cfl_pred_type(plane); CFL_SIGN_TYPE cfl_sign; int cfl_alpha; cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha); // We conly build CFL for a given plane, the other plane's sign is dummy int dummy_sign = CFL_SIGN_NEG; const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs; const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx; mbmi->cfl_alpha_signs = PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign); mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha; int64_t cfl_cost; if (fast_mode) { cfl_cost = intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0); } else { av1_init_rd_stats(rd_stats); av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize, tx_size, FTXS_NONE, 0); av1_rd_cost_update(x->rdmult, rd_stats); cfl_cost = rd_stats->rdcost; } mbmi->cfl_alpha_signs = orig_cfl_alpha_signs; mbmi->cfl_alpha_idx = orig_cfl_alpha_idx; return cfl_cost; } static const int cfl_dir_ls[2] = { 1, -1 }; // If cfl_search_range is CFL_MAGS_SIZE, return zero. Otherwise return the index // of the best alpha found using intra_model_rd(). 
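// This helper is the fast first stage of a two-stage search; a rough sketch of how // cfl_rd_pick_alpha() below combines it with the full-RD second stage (illustrative // pseudo-caller only, est_u is a hypothetical local): est_u = cfl_pick_plane_parameter(cpi, x, /*plane=*/1, tx_size, cfl_search_range); // followed by cfl_pick_plane_rd(cpi, x, /*plane=*/1, tx_size, cfl_search_range, cfl_rd_arr_u, est_u). // The first stage walks outward from CFL_INDEX_ZERO in both directions using the // SATD-based model cost and stops in a direction as soon as the cost stops improving; // only indices within cfl_search_range of that estimate receive the full transform RD // evaluation in the second stage.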
static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, TX_SIZE tx_size, int cfl_search_range) { assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->uv_mode == UV_CFL_PRED); const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); int est_best_cfl_idx = CFL_INDEX_ZERO; int fast_mode = 1; int start_cfl_idx = CFL_INDEX_ZERO; int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode, NULL); for (int si = 0; si < 2; ++si) { const int dir = cfl_dir_ls[si]; for (int i = 1; i < CFL_MAGS_SIZE; ++i) { int cfl_idx = start_cfl_idx + dir * i; if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode, NULL); if (cfl_cost < best_cfl_cost) { best_cfl_cost = cfl_cost; est_best_cfl_idx = cfl_idx; } else { break; } } } return est_best_cfl_idx; } static inline void set_invalid_cfl_parameters(uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) { *best_cfl_alpha_idx = 0; *best_cfl_alpha_signs = 0; } static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int plane, TX_SIZE tx_size, int cfl_search_range, RD_STATS cfl_rd_arr[CFL_MAGS_SIZE], int est_best_cfl_idx) { assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->uv_mode == UV_CFL_PRED); const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) { av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]); } int fast_mode = 0; int start_cfl_idx = est_best_cfl_idx; cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode, &cfl_rd_arr[start_cfl_idx]); if (cfl_search_range == 1) return; for (int si = 0; si < 2; ++si) { const int dir = cfl_dir_ls[si]; for (int i = 1; i < cfl_search_range; ++i) { int cfl_idx = start_cfl_idx + dir * i; if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode, &cfl_rd_arr[cfl_idx]); } } } /*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component * * \ingroup intra_mode_search * \callergraph * * This function will use DCT_DCT followed by computing SATD (sum of absolute * transformed differences) to estimate the RD score and find the best possible * CFL parameter. * * Then the function will apply a full RD search near the best possible CFL * parameter to find the best actual CFL parameter. * * Side effect: * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD * search. * * \param[in] x Encoder prediction block structure. * \param[in] cpi Top-level encoder instance structure. * \param[in] tx_size Transform size. * \param[in] ref_best_rd Reference best RD. * \param[in] cfl_search_range The search range of full RD search near the * estimated best CFL parameter. 
* * \param[out] best_rd_stats RD stats of the best CFL parameter * \param[out] best_cfl_alpha_idx Best CFL alpha index * \param[out] best_cfl_alpha_signs Best CFL joint signs * */ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, TX_SIZE tx_size, int64_t ref_best_rd, int cfl_search_range, RD_STATS *best_rd_stats, uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) { assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); const ModeCosts *mode_costs = &x->mode_costs; RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE]; RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE]; MACROBLOCKD *const xd = &x->e_mbd; int est_best_cfl_idx_u, est_best_cfl_idx_v; av1_invalid_rd_stats(best_rd_stats); // As the dc pred data is same for different values of alpha, enable the // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before // returning to avoid the unintentional usage of cached dc pred data. xd->cfl.use_dc_pred_cache = true; // Evaluate alpha parameter of each chroma plane. est_best_cfl_idx_u = cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range); est_best_cfl_idx_v = cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range); if (cfl_search_range == 1) { // For cfl_search_range=1, further refinement of alpha is not enabled. Hence // CfL index=0 for both the chroma planes implies invalid CfL mode. if (est_best_cfl_idx_u == CFL_INDEX_ZERO && est_best_cfl_idx_v == CFL_INDEX_ZERO) { set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); clear_cfl_dc_pred_cache_flags(&xd->cfl); return 0; } int cfl_alpha_u, cfl_alpha_v; CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v; const MB_MODE_INFO *mbmi = xd->mi[0]; cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u); cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v); const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; // Compute alpha and mode signaling rate. const int rate_overhead = mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] + mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] + mode_costs ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED]; // Skip the CfL mode evaluation if the RD cost derived using the rate needed // to signal the CfL mode and alpha parameter exceeds the ref_best_rd. if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) { set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); clear_cfl_dc_pred_cache_flags(&xd->cfl); return 0; } } // Compute the rd cost of each chroma plane using the alpha parameters which // were already evaluated. 
cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u, est_best_cfl_idx_u); cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v, est_best_cfl_idx_v); clear_cfl_dc_pred_cache_flags(&xd->cfl); for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) { if (cfl_rd_arr_u[ui].rate == INT_MAX) continue; int cfl_alpha_u; CFL_SIGN_TYPE cfl_sign_u; cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u); for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) { if (cfl_rd_arr_v[vi].rate == INT_MAX) continue; int cfl_alpha_v; CFL_SIGN_TYPE cfl_sign_v; cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v); // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a // valid parameter for CFL if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue; int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; RD_STATS rd_stats = cfl_rd_arr_u[ui]; av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]); if (rd_stats.rate != INT_MAX) { rd_stats.rate += mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u]; rd_stats.rate += mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v]; } av1_rd_cost_update(x->rdmult, &rd_stats); if (rd_stats.rdcost < best_rd_stats->rdcost) { *best_rd_stats = rd_stats; *best_cfl_alpha_idx = (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v; *best_cfl_alpha_signs = joint_sign; } } } if (best_rd_stats->rdcost >= ref_best_rd) { av1_invalid_rd_stats(best_rd_stats); // Set invalid CFL parameters here since the rdcost is not better than // ref_best_rd. set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); return 0; } return 1; } static bool should_prune_chroma_smooth_pred_based_on_source_variance( const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) { if (!cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma) return false; // If the source variance of both chroma planes is less than 20 (empirically // derived), prune UV_SMOOTH_PRED. for (int i = AOM_PLANE_U; i < av1_num_planes(&cpi->common); i++) { const unsigned int variance = av1_get_perpixel_variance_facade( cpi, &x->e_mbd, &x->plane[i].src, bsize, i); if (variance >= 20) return false; } return true; } int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); MB_MODE_INFO best_mbmi = *mbmi; int64_t best_rd = INT64_MAX, this_rd; const ModeCosts *mode_costs = &x->mode_costs; const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; init_sbuv_mode(mbmi); // Return if the current block does not correspond to a chroma block. if (!xd->is_chroma_ref) { *rate = 0; *rate_tokenonly = 0; *distortion = 0; *skippable = 1; return INT64_MAX; } // Only store reconstructed luma when there's chroma RDO. When there's no // chroma RDO, the reconstructed luma will be stored in encode_superblock(). xd->cfl.store_y = store_cfl_required_rdo(cm, x); if (xd->cfl.store_y) { // Restore reconstructed luma values. // TODO(chiyotsai@google.com): right now we are re-computing the txfm in // this function everytime we search through uv modes. There is some // potential speed up here if we cache the result to avoid redundant // computation. 
av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y, DRY_RUN_NORMAL, cpi->optimize_seg_arr[mbmi->segment_id]); xd->cfl.store_y = 0; } IntraModeSearchState intra_search_state; init_intra_mode_search_state(&intra_search_state); const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); // Search through all non-palette modes. for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { int this_rate; RD_STATS tokenonly_rd_stats; UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx]; // Skip the current mode evaluation if the RD cost derived using the mode // signaling rate exceeds the best_rd so far. const int mode_rate = mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue; PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode); const int is_directional_mode = av1_is_directional_mode(intra_mode); if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra) continue; if (is_directional_mode && !cpi->oxcf.intra_mode_cfg.enable_directional_intra) continue; if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & (1 << uv_mode))) continue; if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED && uv_mode <= UV_SMOOTH_H_PRED) continue; if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED) continue; assert(mbmi->mode < INTRA_MODES); if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner && !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode))) continue; mbmi->uv_mode = uv_mode; // Init variables for cfl and angle delta const SPEED_FEATURES *sf = &cpi->sf; mbmi->angle_delta[PLANE_TYPE_UV] = 0; if (uv_mode == UV_CFL_PRED) { if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue; assert(!is_directional_mode); const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd, sf->intra_sf.cfl_search_range, &tokenonly_rd_stats, &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) { continue; } } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && intra_mode_cfg->enable_angle_delta) { if (sf->intra_sf.chroma_intra_pruning_with_hog && !intra_search_state.dir_mode_skip_mask_ready) { static const float thresh[2][4] = { { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe }; const int is_chroma = 1; const int is_intra_frame = frame_is_intra_only(cm); prune_intra_mode_with_hog( x, bsize, cm->seq_params->sb_size, thresh[is_intra_frame] [sf->intra_sf.chroma_intra_pruning_with_hog - 1], intra_search_state.directional_mode_skip_mask, is_chroma); intra_search_state.dir_mode_skip_mask_ready = 1; } if (intra_search_state.directional_mode_skip_mask[uv_mode]) { continue; } // Search through angle delta const int rate_overhead = mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, &this_rate, &tokenonly_rd_stats)) continue; } else { if (uv_mode == UV_SMOOTH_PRED && should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x, bsize)) continue; // Predict directly if we don't need to search for angle delta. 
if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { continue; } } const int mode_cost = mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { best_mbmi = *mbmi; best_rd = this_rd; *rate = this_rate; *rate_tokenonly = tokenonly_rd_stats.rate; *distortion = tokenonly_rd_stats.dist; *skippable = tokenonly_rd_stats.skip_txfm; } } // Search palette mode const int try_palette = cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cpi->common.features.allow_screen_content_tools, mbmi->bsize); if (try_palette) { uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; av1_rd_pick_palette_intra_sbuv( cpi, x, mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED], best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, distortion, skippable); } *mbmi = best_mbmi; // Make sure we actually chose a mode assert(best_rd < INT64_MAX); return best_rd; } // Searches palette mode for luma channel in inter frame. int av1_search_palette_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; int rate2 = 0; int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd; int skippable = 0; uint8_t *const best_palette_color_map = x->palette_buffer->best_palette_color_map; uint8_t *const color_map = xd->plane[0].color_index_map; MB_MODE_INFO best_mbmi_palette = *mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const ModeCosts *mode_costs = &x->mode_costs; const int *const intra_mode_cost = mode_costs->mbmode_cost[size_group_lookup[bsize]]; const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; av1_zero(pmi->palette_size); RD_STATS rd_stats_y; av1_invalid_rd_stats(&rd_stats_y); av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, best_palette_color_map, &best_rd_palette, &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, ctx, best_blk_skip, best_tx_type_map); if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { this_rd_cost->rdcost = INT64_MAX; return skippable; } memcpy(x->txfm_search_info.blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); memcpy(color_map, best_palette_color_map, rows * cols * sizeof(best_palette_color_map[0])); skippable = rd_stats_y.skip_txfm; distortion2 = rd_stats_y.dist; rate2 = rd_stats_y.rate + ref_frame_cost; if (num_planes > 1) { if (intra_search_state->rate_uv_intra == INT_MAX) { // We have not found any good uv mode yet, so we need to search for it. 
TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, &intra_search_state->skip_uvs, bsize, uv_tx); intra_search_state->mode_uv = mbmi->uv_mode; intra_search_state->pmi_uv = *pmi; intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; } // We have found at least one good uv mode before, so copy and paste it // over. mbmi->uv_mode = intra_search_state->mode_uv; pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; if (pmi->palette_size[1] > 0) { memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); } mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; skippable = skippable && intra_search_state->skip_uvs; distortion2 += intra_search_state->dist_uvs; rate2 += intra_search_state->rate_uv_intra; } if (skippable) { rate2 -= rd_stats_y.rate; if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly; rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; } else { rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; } this_rd = RDCOST(x->rdmult, rate2, distortion2); this_rd_cost->rate = rate2; this_rd_cost->dist = distortion2; this_rd_cost->rdcost = this_rd; return skippable; } void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, int64_t best_rd) { MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; MACROBLOCKD *const xd = &x->e_mbd; int64_t best_rd_palette = best_rd, this_rd; uint8_t *const best_palette_color_map = x->palette_buffer->best_palette_color_map; uint8_t *const color_map = xd->plane[0].color_index_map; MB_MODE_INFO best_mbmi_palette = *mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const ModeCosts *mode_costs = &x->mode_costs; const int *const intra_mode_cost = mode_costs->mbmode_cost[size_group_lookup[bsize]]; const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; av1_zero(pmi->palette_size); RD_STATS rd_stats_y; av1_invalid_rd_stats(&rd_stats_y); av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, best_palette_color_map, &best_rd_palette, &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, ctx, best_blk_skip, best_tx_type_map); if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { this_rd_cost->rdcost = INT64_MAX; return; } memcpy(x->txfm_search_info.blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); memcpy(color_map, best_palette_color_map, rows * cols * sizeof(best_palette_color_map[0])); rd_stats_y.rate += ref_frame_cost; if (rd_stats_y.skip_txfm) { rd_stats_y.rate = ref_frame_cost + mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; } else { rd_stats_y.rate += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; } this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist); this_rd_cost->rate = rd_stats_y.rate; this_rd_cost->dist = rd_stats_y.dist; this_rd_cost->rdcost = this_rd; 
this_rd_cost->skip_txfm = rd_stats_y.skip_txfm; } /*!\brief Get the intra prediction by searching through tx_type and tx_size. * * \ingroup intra_mode_search * \callergraph * Currently this function is only used in the intra frame code path for * winner-mode processing. * * \return Returns whether the current mode is an improvement over best_rd. */ static inline int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int *bmode_costs, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; RD_STATS rd_stats; // In order to improve txfm search, avoid rd based breakouts during winner // mode evaluation. Hence passing ref_best_rd as INT64_MAX by default when the // speed feature use_rd_based_breakout_for_intra_tx_search is disabled. int64_t ref_best_rd = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search ? *best_rd : INT64_MAX; av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, ref_best_rd); if (rd_stats.rate == INT_MAX) return 0; int this_rate_tokenonly = rd_stats.rate; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size // in the tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); } const int this_rate = rd_stats.rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0); const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); if (this_rd < *best_rd) { *best_mbmi = *mbmi; *best_rd = this_rd; *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = rd_stats.dist; *skippable = rd_stats.skip_txfm; av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip, ctx->num_4x4_blk); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); return 1; } return 0; } /*!\brief Search for the best filter_intra mode when coding inter frame. * * \ingroup intra_mode_search * \callergraph * This function loops through all filter_intra modes to find the best one. * * \remark Returns nothing, but updates the mbmi and rd_stats. 
*/ static inline void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, int mode_cost, int64_t best_rd, int64_t best_rd_so_far) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(&cpi->common, bsize)); RD_STATS rd_stats_y_fi; int filter_intra_selected_flag = 0; TX_SIZE best_tx_size = mbmi->tx_size; FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; memcpy(best_blk_skip, x->txfm_search_info.blk_skip, sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); mbmi->filter_intra_mode_info.use_filter_intra = 1; for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; ++fi_mode) { mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd); if (rd_stats_y_fi.rate == INT_MAX) continue; const int this_rate_tmp = rd_stats_y_fi.rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); const int64_t this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) { break; } if (this_rd_tmp < best_rd_so_far) { best_tx_size = mbmi->tx_size; av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); memcpy(best_blk_skip, x->txfm_search_info.blk_skip, sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); best_fi_mode = fi_mode; *rd_stats_y = rd_stats_y_fi; filter_intra_selected_flag = 1; best_rd_so_far = this_rd_tmp; } } mbmi->tx_size = best_tx_size; av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); memcpy(x->txfm_search_info.blk_skip, best_blk_skip, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); if (filter_intra_selected_flag) { mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; } else { mbmi->filter_intra_mode_info.use_filter_intra = 0; } } // Evaluate a given luma intra-mode in inter frames. 
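// Returns 1 when the mode survives pruning and the transform search, in which case // rd_stats_y, *mode_cost_y and *rd_y hold the luma rate/distortion results for this // mode; returns 0 otherwise (and may set intra_search_state->skip_intra_modes to stop // further intra evaluation).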
int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, int64_t best_rd, int *mode_cost_y, int64_t *rd_y, int64_t *best_model_rd, int64_t top_intra_model_rd[]) { const AV1_COMMON *cm = &cpi->common; const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->ref_frame[0] == INTRA_FRAME); const PREDICTION_MODE mode = mbmi->mode; const ModeCosts *mode_costs = &x->mode_costs; const int mode_cost = mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost; const int skip_ctx = av1_get_skip_txfm_context(xd); int known_rate = mode_cost; const int intra_cost_penalty = av1_get_intra_cost_penalty( cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, cm->seq_params->bit_depth); if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0], mode_costs->skip_txfm_cost[skip_ctx][1]); const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); if (known_rd > best_rd) { intra_search_state->skip_intra_modes = 1; return 0; } const int is_directional_mode = av1_is_directional_mode(mode); if (is_directional_mode && av1_use_angle_delta(bsize) && cpi->oxcf.intra_mode_cfg.enable_angle_delta) { if (intra_sf->intra_pruning_with_hog && !intra_search_state->dir_mode_skip_mask_ready) { const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f }; const int is_chroma = 0; prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size, thresh[intra_sf->intra_pruning_with_hog - 1], intra_search_state->directional_mode_skip_mask, is_chroma); intra_search_state->dir_mode_skip_mask_ready = 1; } if (intra_search_state->directional_mode_skip_mask[mode]) return 0; } const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int64_t this_model_rd = intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); const int model_rd_index_for_pruning = get_model_rd_index_for_pruning(x, intra_sf); if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd, intra_sf->top_intra_model_count_allowed, model_rd_index_for_pruning)) return 0; av1_init_rd_stats(rd_stats_y); av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); // Pick filter intra modes. if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { int try_filter_intra = 1; int64_t best_rd_so_far = INT64_MAX; if (rd_stats_y->rate != INT_MAX) { // best_rd_so_far is the rdcost of DC_PRED without using filter_intra. // Later, in filter intra search, best_rd_so_far is used for comparison. mbmi->filter_intra_mode_info.use_filter_intra = 0; const int tmp_rate = rd_stats_y->rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); try_filter_intra = (best_rd_so_far / 2) <= best_rd; } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) { // As rd cost of luma intra dc mode is more than best_rd (i.e., // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes. try_filter_intra = 0; } if (try_filter_intra) { handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost, best_rd, best_rd_so_far); } } if (rd_stats_y->rate == INT_MAX) return 0; *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); const int rate_y = rd_stats_y->skip_txfm ? 
mode_costs->skip_txfm_cost[skip_ctx][1] : rd_stats_y->rate; *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist); if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) { intra_search_state->skip_intra_modes = 1; return 0; } return 1; } int av1_search_intra_uv_modes_in_interframe( IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(mbmi->ref_frame[0] == INTRA_FRAME); // TODO(chiyotsai@google.com): Consolidate the chroma search code here with // the one in av1_search_palette_mode. PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int try_palette = cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize); assert(intra_search_state->rate_uv_intra == INT_MAX); if (intra_search_state->rate_uv_intra == INT_MAX) { // If no good uv-predictor had been found, search for it. const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, &intra_search_state->skip_uvs, bsize, uv_tx); intra_search_state->mode_uv = mbmi->uv_mode; if (try_palette) intra_search_state->pmi_uv = *pmi; intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; const int uv_rate = intra_search_state->rate_uv_tokenonly; const int64_t uv_dist = intra_search_state->dist_uvs; const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); if (uv_rd > best_rd) { // If there is no good intra uv-mode available, we can skip all intra // modes. intra_search_state->skip_intra_modes = 1; return 0; } } // If we are here, then the encoder has found at least one good intra uv // predictor, so we can directly copy its statistics over. // TODO(any): the stats here is not right if the best uv mode is CFL but the // best y mode is palette. rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly; rd_stats_uv->dist = intra_search_state->dist_uvs; rd_stats_uv->skip_txfm = intra_search_state->skip_uvs; rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm; mbmi->uv_mode = intra_search_state->mode_uv; if (try_palette) { pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); } mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; return 1; } // Checks if odd delta angles can be pruned based on rdcosts of even delta // angles of the corresponding directional mode. 
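// Since av1_rd_pick_intra_sby_mode() stores the rd cost of delta angle d at index // d + MAX_ANGLE_DELTA + 1, the two entries read in this helper correspond to the even // neighbouring delta angles d - 1 and d + 1 of the odd delta angle being considered.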
static inline int prune_luma_odd_delta_angles_using_rd_cost( const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost, int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) { const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; if (!prune_luma_odd_delta_angles_in_intra || !av1_is_directional_mode(mbmi->mode) || !(abs(luma_delta_angle) & 1) || best_rd == INT64_MAX) return 0; const int64_t rd_thresh = best_rd + (best_rd >> 3); // Neighbour rdcosts are considered for pruning of odd delta angles as // mentioned below: // Delta angle Delta angle rdcost // to be pruned to be considered // -3 -2 // -1 -2, 0 // 1 0, 2 // 3 2 return intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA] > rd_thresh && intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA + 2] > rd_thresh; } // Finds the best non-intrabc mode on an intra frame. int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, BLOCK_SIZE bsize, int64_t best_rd, PICK_MODE_CONTEXT *ctx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); int64_t best_model_rd = INT64_MAX; int is_directional_mode; uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 }; // Flag to check rd of any intra mode is better than best_rd passed to this // function int beat_best_rd = 0; const int *bmode_costs; const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int try_palette = cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cpi->common.features.allow_screen_content_tools, mbmi->bsize); uint8_t *best_palette_color_map = try_palette ? x->palette_buffer->best_palette_color_map : NULL; const MB_MODE_INFO *above_mi = xd->above_mbmi; const MB_MODE_INFO *left_mi = xd->left_mbmi; const PREDICTION_MODE A = av1_above_block_mode(above_mi); const PREDICTION_MODE L = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[A]; const int left_ctx = intra_mode_context[L]; bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; mbmi->angle_delta[PLANE_TYPE_Y] = 0; const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; if (intra_sf->intra_pruning_with_hog) { // Less aggressive thresholds are used here than those used in inter frame // encoding in av1_handle_intra_y_mode() because we want key frames/intra // frames to have higher quality. const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f }; const int is_chroma = 0; prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size, thresh[intra_sf->intra_pruning_with_hog - 1], directional_mode_skip_mask, is_chroma); } mbmi->filter_intra_mode_info.use_filter_intra = 0; pmi->palette_size[0] = 0; // Set params for mode evaluation set_mode_eval_params(cpi, x, MODE_EVAL); MB_MODE_INFO best_mbmi = *mbmi; const int max_winner_mode_count = winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type]; zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); x->winner_mode_count = 0; // Searches the intra-modes except for intrabc, palette, and filter_intra. int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { top_intra_model_rd[i] = INT64_MAX; } // Initialize the rdcost corresponding to all the directional and // non-directional intra modes. // 1. For directional modes, it stores the rdcost values for delta angles -4, // -3, ..., 3, 4. // 2. 
The rdcost value for luma_delta_angle is stored at index // luma_delta_angle + MAX_ANGLE_DELTA + 1. // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4 // (array indices 0 and 8) are always set to INT64_MAX (the initial value). int64_t intra_modes_rd_cost[INTRA_MODE_END] [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY]; for (int i = 0; i < INTRA_MODE_END; i++) { for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) { intra_modes_rd_cost[i][j] = INT64_MAX; } } for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; ++mode_idx) { set_y_mode_and_delta_angle(mode_idx, mbmi, intra_sf->prune_luma_odd_delta_angles_in_intra); RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int is_diagonal_mode; int64_t this_distortion, this_rd; const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode); if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue; if (av1_is_directional_mode(mbmi->mode) && !intra_mode_cfg->enable_directional_intra) continue; // The smooth prediction mode appears to be more frequently picked // than horizontal / vertical smooth prediction modes. Hence treat // them differently in speed features. if ((!intra_mode_cfg->enable_smooth_intra || intra_sf->disable_smooth_intra) && (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED)) continue; if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED) continue; // The functionality of filter intra modes and smooth prediction // overlap. Hence smooth prediction is pruned only if all the // filter intra modes are enabled. if (intra_sf->disable_smooth_intra && intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED) continue; if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; // Skip the evaluation of modes that do not match with the winner mode in // x->mb_mode_cache. if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue; is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; if (is_directional_mode && !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && luma_delta_angle != 0) continue; // Use intra_y_mode_mask speed feature to skip intra mode evaluation. if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mbmi->mode))) continue; if (prune_luma_odd_delta_angles_using_rd_cost( mbmi, intra_modes_rd_cost[mbmi->mode], best_rd, intra_sf->prune_luma_odd_delta_angles_in_intra)) continue; const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int64_t this_model_rd = intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); const int model_rd_index_for_pruning = get_model_rd_index_for_pruning(x, intra_sf); if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd, intra_sf->top_intra_model_count_allowed, model_rd_index_for_pruning)) continue; // Builds the actual prediction. The prediction from // model_intra_yrd_and_prune was just an estimation that did not take into // account the effect of txfm pipeline, so we need to redo it for real // here. 
av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); this_rate_tokenonly = this_rd_stats.rate; this_distortion = this_rd_stats.dist; s = this_rd_stats.skip_txfm; if (this_rate_tokenonly == INT_MAX) continue; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { // av1_pick_uniform_tx_size_type_yrd above includes the cost of the // tx_size in the tokenonly rate, but for intra blocks, tx_size is always // coded (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); } this_rate = this_rd_stats.rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0); this_rd = RDCOST(x->rdmult, this_rate, this_distortion); // Visual quality adjustment based on recon vs source variance. if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) { this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize)); } intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] = this_rd; // Collect mode stats for multiwinner mode processing const int txfm_search_done = 1; store_winner_mode_stats( &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); if (this_rd < best_rd) { best_mbmi = *mbmi; best_rd = this_rd; // Setting beat_best_rd flag because current mode rd is better than // best_rd passed to this function beat_best_rd = 1; *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } } // Searches palette if (try_palette) { av1_rd_pick_palette_intra_sby( cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map); } // Searches filter_intra if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, skippable, bsize, bmode_costs[DC_PRED], best_mbmi.mode, &best_rd, &best_model_rd, ctx)) { best_mbmi = *mbmi; } } // No mode is identified with less rd value than best_rd passed to this // function. In such cases winner mode processing is not necessary and return // best_rd as INT64_MAX to indicate best mode is not identified if (!beat_best_rd) return INT64_MAX; // In multi-winner mode processing, perform tx search for few best modes // identified during mode evaluation. Winner mode processing uses best tx // configuration for tx search. 
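// Depending on multi_winner_mode_type, either each stored winner mode is
// re-searched below (restoring its palette color map first), or only the
// single best mode found above is re-evaluated in the else branch.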
if (cpi->sf.winner_mode_sf.multi_winner_mode_type) { int best_mode_idx = 0; int block_width, block_height; uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map; av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, &block_height, NULL, NULL); for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) { *mbmi = x->winner_mode_stats[mode_idx].mbmi; if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) { // Restore color_map of palette mode before winner mode processing if (mbmi->palette_mode_info.palette_size[0] > 0) { uint8_t *color_map_src = x->winner_mode_stats[mode_idx].color_index_map; memcpy(color_map_dst, color_map_src, block_width * block_height * sizeof(*color_map_src)); } // Set params for winner mode evaluation set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); // Winner mode processing // If previous searches use only the default tx type/no R-D optimization // of quantized coeffs, do an extra search for the best tx type/better // R-D optimization of quantized coeffs if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly, distortion, skippable, &best_mbmi, ctx)) best_mode_idx = mode_idx; } } // Copy color_map of palette mode for final winner mode if (best_mbmi.palette_mode_info.palette_size[0] > 0) { uint8_t *color_map_src = x->winner_mode_stats[best_mode_idx].color_index_map; memcpy(color_map_dst, color_map_src, block_width * block_height * sizeof(*color_map_src)); } } else { // If previous searches use only the default tx type/no R-D optimization of // quantized coeffs, do an extra search for the best tx type/better R-D // optimization of quantized coeffs if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) { // Set params for winner mode evaluation set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); *mbmi = best_mbmi; intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly, distortion, skippable, &best_mbmi, ctx); } } *mbmi = best_mbmi; av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); return best_rd; } aom-3.12.1/av1/encoder/intra_mode_search.h000066400000000000000000000357501477627663500203400ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Declares high level functions to search through intra modes. */ #ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ #define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif /*! \brief Variables related to intra-mode search during inter frame coding. * * \ingroup intra_mode_search * This is a set of variables used during intra-mode search for inter frames. * This includes an histogram of gradient speed features and a cache of uv * prediction to avoid repeated search of chroma prediction. */ typedef struct IntraModeSearchState { /*! * \brief The best luma intra-mode found so far */ PREDICTION_MODE best_intra_mode; /** \name Speed feature variables * Variables to help with pruning some luma intra-modes during inter frame * coding process. */ /**@{*/ /*! 
* \brief Whether to terminate all intra mode search. */ int skip_intra_modes; /*! * \brief Whether a directional mode is pruned. */ uint8_t directional_mode_skip_mask[INTRA_MODES]; /*! * \brief Whether \ref directional_mode_skip_mask is valid for pruning. */ int dir_mode_skip_mask_ready; /**@}*/ /** \name Chroma mode search cache * A cache of the best chroma prediction mode to avoid having to search for * chroma predictions repeatedly in \ref * av1_search_intra_uv_modes_in_interframe() */ /**@{*/ int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */ int rate_uv_tokenonly; /*!< \brief Rate transmit txfm tokens */ int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */ uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */ UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */ PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */ int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv directional */ /**@}*/ } IntraModeSearchState; /*!\brief Evaluate a given luma intra-mode for inter frames. * * \ingroup intra_mode_search * \callgraph * \callergraph * This function handles an intra-mode luma prediction when the current frame * is an inter frame. This is the intra-mode counterpart of handle_inter_mode. * This function performs an intra luma prediction using the mode specified by * x->e_mbd.mi[0]->mode. This function does *not* support palette mode * prediction in the luma channel. * * \param[in,out] intra_search_state Structure to intra search state. * \param[in] cpi Top-level encoder structure. * \param[in,out] x Pointer to structure holding all the * data for the current macroblock. * \param[in] bsize Current partition block size. * \param[in] ref_frame_cost The entropy cost for signaling that the * current ref frame is an intra frame. * \param[in] ctx Structure to hold the number of 4x4 blks * to copy tx_type and txfm_skip arrays. * \param[out] rd_stats_y Struct to keep track of the current * intra-mode's rd_stats (luma only). * \param[in] best_rd Best RD seen for this block so far. * \param[out] mode_cost_y The cost needed to signal the current * intra mode. * \param[out] rd_y The rdcost of the chosen mode. * \param[in] best_model_rd Best model RD seen for this block so far * \param[in] top_intra_model_rd Top intra model RD seen for this * block so far. * * \return Returns 1 if a valid intra mode is found, 0 otherwise. * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and * rd_y are also updated. Moreover, in the first evaluation with directional * mode, a prune_mask computed with histogram of gradient is also stored in * intra_search_state. */ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, int64_t best_rd, int *mode_cost_y, int64_t *rd_y, int64_t *best_model_rd, int64_t top_intra_model_rd[]); /*!\brief Search through all chroma intra-modes for inter frames. * * \ingroup intra_mode_search * \callgraph * \callergraph * This function handles intra-mode chroma prediction when the current frame * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode * with some additional book-keeping. * * \param[in,out] intra_search_state Structure to intra search state. * \param[in] cpi Top-level encoder structure. * \param[in,out] x Pointer to structure holding all the * data for the current macroblock. * \param[in] bsize Current partition block size. 
* \param[out] rd_stats Struct to keep track of the current * intra-mode's rd_stats (all planes). * \param[out] rd_stats_y Struct to keep track of the current * intra-mode's rd_stats (luma only). * \param[out] rd_stats_uv Struct to keep track of the current * intra-mode's rd_stats (chroma only). * \param[in] best_rd Best RD seen for this block so far. * * \return Returns 1 if a valid intra mode is found, 0 otherwise. * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv) are also * updated. Moreover, in the first evocation of the function, the chroma intra * mode result is cached in intra_search_state to be used in subsequent calls. */ int av1_search_intra_uv_modes_in_interframe( IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd); /*!\brief Evaluate luma palette mode for inter frames. * * \ingroup intra_mode_search * \callergraph * \callgraph * This function handles luma palette mode when the current frame is an * inter frame. * * \param[in] intra_search_state Structure to hold the best luma intra mode * and cache chroma prediction for speed up. * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to structure holding all the data * for the current macroblock. * \param[in] bsize Current partition block size. * \param[in] ref_frame_cost The entropy cost for signaling that the * current ref frame is an intra frame. * \param[in] ctx Structure to hold the number of 4x4 blks to * copy the tx_type and txfm_skip arrays. * \param[in] this_rd_cost Struct to keep track of palette mode's * rd_stats. * \param[in] best_rd Best RD seen for this block so far. * * \return Returns whether luma palette mode can skip the txfm. The * corresponding mbmi, this_rd_costs, intra_search_state, and tx_type arrays in * ctx are also updated. */ int av1_search_palette_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, int64_t best_rd); /*!\brief Evaluate luma palette mode for inter frames. * * \ingroup intra_mode_search * \callergraph * \callgraph * This function handles luma palette mode when the current frame is an * inter frame. * * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to structure holding all the data * for the current macroblock. * \param[in] bsize Current partition block size. * \param[in] ref_frame_cost The entropy cost for signaling that the * current ref frame is an intra frame. * \param[in] ctx Structure to hold the number of 4x4 blks to * copy the tx_type and txfm_skip arrays. * \param[in] this_rd_cost Struct to keep track of palette mode's * rd_stats. * \param[in] best_rd Best RD seen for this block so far. */ void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, int64_t best_rd); /*!\brief Perform intra-mode search on luma channels for intra frames. * * \ingroup intra_mode_search * \callgraph * \callergraph * This function performs intra-mode search on the luma channel when the * current frame is intra-only. This function does not search intrabc mode, * but it does search palette and filter_intra. * * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to structure holding all the data * for the current macroblock. 
* \param[in] rate The total rate needed to predict the current * luma block. * \param[in] rate_tokenonly The rate without the cost of sending the * prediction modes. * \param[in] distortion The luma distortion of the best prediction * after the reconstruction. * \param[in] skippable Whether we can skip txfm process. * \param[in] bsize Current partition block size. * \param[in] best_rd Best RD seen for this block so far. * \param[in] ctx Structure to hold the number of 4x4 blks to * copy the tx_type and txfm_skip arrays. * * \return Returns the rd_cost if this function finds a mode better than * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate * and distortion, and the tx_type arrays in ctx. */ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, BLOCK_SIZE bsize, int64_t best_rd, PICK_MODE_CONTEXT *ctx); /*!\brief Perform intra-mode search on chroma channels. * * \ingroup intra_mode_search * \callergraph * \callgraph * This function performs intra-mode search on the chroma channels. Just like * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode * (filter_intra is not available on chroma planes). Unlike \ref * av1_rd_pick_intra_sby_mode(), this function is used by both inter and intra * frames. * * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to structure holding all the data * for the current macroblock. * \param[in] rate The total rate needed to predict the current * chroma block. * \param[in] rate_tokenonly The rate without the cost of sending the * prediction modes. * \param[in] distortion The chroma distortion of the best prediction * after the reconstruction. * \param[in] skippable Whether we can skip txfm process. * \param[in] bsize Current partition block size. * \param[in] max_tx_size The maximum tx_size available. * * \return Returns the rd_cost of the best uv mode found. This also updates the * mbmi, the rate, and the distortion. */ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, BLOCK_SIZE bsize, TX_SIZE max_tx_size); /*! \brief Return the number of colors in src. Used by palette mode. */ void av1_count_colors(const uint8_t *src, int stride, int rows, int cols, int *val_count, int *num_colors); /*! \brief See \ref av1_count_colors(), but for highbd. */ void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, int bit_depth, int *val_count, int *val_count_8bit, int *num_color_bins, int *num_colors); /*! \brief Initializes the \ref IntraModeSearchState struct. */ static inline void init_intra_mode_search_state( IntraModeSearchState *intra_search_state) { memset(intra_search_state, 0, sizeof(*intra_search_state)); intra_search_state->rate_uv_intra = INT_MAX; } /*! \brief Set the luma intra mode and delta angles for a given mode index. * The total number of luma intra modes is LUMA_MODE_COUNT = 61. * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional * modes. Each of the 8 main directional modes has 6 = MAX_ANGLE_DELTA * 2 * delta angles. * \param[in] mode_idx Mode index in the intra mode decision * process. * \param[in] mbmi Pointer to structure holding the mode * info for the current macroblock.
* \param[in] reorder_delta_angle_eval Indicates whether to reorder the * evaluation of delta angle modes. */ void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, int reorder_delta_angle_eval); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ aom-3.12.1/av1/encoder/intra_mode_search_utils.h000066400000000000000000000700001477627663500215430ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Defines utility functions used in intra mode search. * * This includes rdcost estimations, histogram based pruning, etc. */ #ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_ #define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_ #include "av1/common/enums.h" #include "av1/common/pred_common.h" #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/palette.h" #include "av1/encoder/hybrid_fwd_txfm.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ // Macro for computing the speed-preset dependent threshold which is used for // deciding whether to enable/disable variance calculations in // intra_rd_variance_factor(). #define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X))) #define BINS 32 static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = { 0.450578f, 0.695518f, -0.717944f, -0.639894f, -0.602019f, -0.453454f, 0.055857f, -0.465480f, }; static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = { -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f, -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f, -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f, 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f, -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f, -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f, -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f, -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f, -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f, -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f, -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f, -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f, -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f, -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f, 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f, 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f, -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f, 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f, 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f, -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f, -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f, -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, 
-2.721924f, -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f, 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f, -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f, -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f, -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f, 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f, -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f, -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f, -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f, -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f, -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f, -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f, -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f, -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f, 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f, 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f, -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f, -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f, -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f, 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f, -1.222860f, -1.502437f, -1.900969f, -3.206816f, }; static const NN_CONFIG av1_intra_hog_model_nnconfig = { BINS, // num_inputs DIRECTIONAL_MODES, // num_outputs 0, // num_hidden_layers { 0 }, { av1_intra_hog_model_weights, }, { av1_intra_hog_model_bias, }, }; #define FIX_PREC_BITS (16) static inline int get_hist_bin_idx(int dx, int dy) { const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx; // Find index by bisection static const int thresholds[BINS] = { -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303, -59392, -48579, -39272, -30982, -23445, -16400, -9715, -3194, 3227, 9748, 16433, 23478, 31015, 39305, 48611, 59425, 72336, 88392, 109364, 138593, 183191, 261638, 441831, INT32_MAX }; int lo_idx = 0, hi_idx = BINS - 1; // Divide into segments of size 8 gives better performance than binary search // here. if (ratio <= thresholds[7]) { lo_idx = 0; hi_idx = 7; } else if (ratio <= thresholds[15]) { lo_idx = 8; hi_idx = 15; } else if (ratio <= thresholds[23]) { lo_idx = 16; hi_idx = 23; } else { lo_idx = 24; hi_idx = 31; } for (int idx = lo_idx; idx <= hi_idx; idx++) { if (ratio <= thresholds[idx]) { return idx; } } assert(0 && "No valid histogram bin found!"); return BINS - 1; } #undef FIX_PREC_BITS // Normalizes the hog data. static inline void normalize_hog(float total, float *hist) { for (int i = 0; i < BINS; ++i) hist[i] /= total; } static inline void lowbd_generate_hog(const uint8_t *src, int stride, int rows, int cols, float *hist) { float total = 0.1f; src += stride; for (int r = 1; r < rows - 1; ++r) { for (int c = 1; c < cols - 1; ++c) { const uint8_t *above = &src[c - stride]; const uint8_t *below = &src[c + stride]; const uint8_t *left = &src[c - 1]; const uint8_t *right = &src[c + 1]; // Calculate gradient using Sobel filters. 
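// dx applies the horizontal Sobel kernel {-1, 0, 1; -2, 0, 2; -1, 0, 1} and
// dy the vertical kernel {-1, -2, -1; 0, 0, 0; 1, 2, 1}, both centered on
// src[c].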
const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - (above[-1] + 2 * above[0] + above[1]); if (dx == 0 && dy == 0) continue; const int temp = abs(dx) + abs(dy); if (!temp) continue; total += temp; if (dx == 0) { hist[0] += temp / 2; hist[BINS - 1] += temp / 2; } else { const int idx = get_hist_bin_idx(dx, dy); assert(idx >= 0 && idx < BINS); hist[idx] += temp; } } src += stride; } normalize_hog(total, hist); } // Computes and stores pixel level gradient information of a given superblock // for LBD encode. static inline void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, BLOCK_SIZE sb_size, PLANE_TYPE plane) { PixelLevelGradientInfo *const grad_info_sb = x->pixel_gradient_info + plane * MAX_SB_SQUARE; const uint8_t *src = x->plane[plane].src.buf; const int stride = x->plane[plane].src.stride; const int ss_x = x->e_mbd.plane[plane].subsampling_x; const int ss_y = x->e_mbd.plane[plane].subsampling_y; const int sb_height = block_size_high[sb_size] >> ss_y; const int sb_width = block_size_wide[sb_size] >> ss_x; src += stride; for (int r = 1; r < sb_height - 1; ++r) { for (int c = 1; c < sb_width - 1; ++c) { const uint8_t *above = &src[c - stride]; const uint8_t *below = &src[c + stride]; const uint8_t *left = &src[c - 1]; const uint8_t *right = &src[c + 1]; // Calculate gradient using Sobel filters. const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - (above[-1] + 2 * above[0] + above[1]); grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = (uint16_t)(abs(dx) + abs(dy)); grad_info_sb[r * sb_width + c].hist_bin_idx = (dx != 0) ? get_hist_bin_idx(dx, dy) : -1; } src += stride; } } #if CONFIG_AV1_HIGHBITDEPTH static inline void highbd_generate_hog(const uint8_t *src8, int stride, int rows, int cols, float *hist) { float total = 0.1f; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); src += stride; for (int r = 1; r < rows - 1; ++r) { for (int c = 1; c < cols - 1; ++c) { const uint16_t *above = &src[c - stride]; const uint16_t *below = &src[c + stride]; const uint16_t *left = &src[c - 1]; const uint16_t *right = &src[c + 1]; // Calculate gradient using Sobel filters. const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - (above[-1] + 2 * above[0] + above[1]); if (dx == 0 && dy == 0) continue; const int temp = abs(dx) + abs(dy); if (!temp) continue; total += temp; if (dx == 0) { hist[0] += temp / 2; hist[BINS - 1] += temp / 2; } else { const int idx = get_hist_bin_idx(dx, dy); assert(idx >= 0 && idx < BINS); hist[idx] += temp; } } src += stride; } normalize_hog(total, hist); } // Computes and stores pixel level gradient information of a given superblock // for HBD encode. 
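// Mirrors lowbd_compute_gradient_info_sb(), but reads the source as 16-bit
// samples via CONVERT_TO_SHORTPTR().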
static inline void highbd_compute_gradient_info_sb(MACROBLOCK *const x, BLOCK_SIZE sb_size, PLANE_TYPE plane) { PixelLevelGradientInfo *const grad_info_sb = x->pixel_gradient_info + plane * MAX_SB_SQUARE; const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); const int stride = x->plane[plane].src.stride; const int ss_x = x->e_mbd.plane[plane].subsampling_x; const int ss_y = x->e_mbd.plane[plane].subsampling_y; const int sb_height = block_size_high[sb_size] >> ss_y; const int sb_width = block_size_wide[sb_size] >> ss_x; src += stride; for (int r = 1; r < sb_height - 1; ++r) { for (int c = 1; c < sb_width - 1; ++c) { const uint16_t *above = &src[c - stride]; const uint16_t *below = &src[c + stride]; const uint16_t *left = &src[c - 1]; const uint16_t *right = &src[c + 1]; // Calculate gradient using Sobel filters. const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - (above[-1] + 2 * above[0] + above[1]); grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = (uint16_t)(abs(dx) + abs(dy)); grad_info_sb[r * sb_width + c].hist_bin_idx = (dx != 0) ? get_hist_bin_idx(dx, dy) : -1; } src += stride; } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void generate_hog(const uint8_t *src8, int stride, int rows, int cols, float *hist, int highbd) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd) { highbd_generate_hog(src8, stride, rows, cols, hist); return; } #else (void)highbd; #endif // CONFIG_AV1_HIGHBITDEPTH lowbd_generate_hog(src8, stride, rows, cols, hist); } static inline void compute_gradient_info_sb(MACROBLOCK *const x, BLOCK_SIZE sb_size, PLANE_TYPE plane) { #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(&x->e_mbd)) { highbd_compute_gradient_info_sb(x, sb_size, plane); return; } #endif // CONFIG_AV1_HIGHBITDEPTH lowbd_compute_gradient_info_sb(x, sb_size, plane); } // Gradient caching at superblock level is allowed only if all of the following // conditions are satisfied: // (1) The current frame is an intra only frame // (2) Non-RD mode decisions are not enabled // (3) The sf partition_search_type is set to SEARCH_PARTITION // (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled // // SB level caching of gradient data may not help in speedup for the following // cases: // (1) Inter frames (due to early intra gating) // (2) When partition_search_type is not SEARCH_PARTITION // Hence, gradient data is computed at block level in such cases. static inline bool is_gradient_caching_for_hog_enabled( const AV1_COMP *const cpi) { const SPEED_FEATURES *const sf = &cpi->sf; return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode && (sf->part_sf.partition_search_type == SEARCH_PARTITION) && (sf->intra_sf.intra_pruning_with_hog || sf->intra_sf.chroma_intra_pruning_with_hog); } // Function to generate pixel level gradient information for a given superblock. // Sets the flags 'is_sb_gradient_cached' for the specific plane-type if // gradient info is generated for the same. static inline void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE sb_size, int mi_row, int mi_col) { // Initialise flags related to hog data caching. 
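// The flags are reset for every superblock and are set below only for the
// plane types whose gradients are actually computed and cached.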
x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; if (!is_gradient_caching_for_hog_enabled(cpi)) return; const SPEED_FEATURES *sf = &cpi->sf; const int num_planes = av1_num_planes(&cpi->common); av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); if (sf->intra_sf.intra_pruning_with_hog) { compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y); x->is_sb_gradient_cached[PLANE_TYPE_Y] = true; } if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) { compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV); x->is_sb_gradient_cached[PLANE_TYPE_UV] = true; } } // Reuses the pixel level gradient data generated at superblock level for block // level histogram computation. static inline void generate_hog_using_gradient_cache(const MACROBLOCK *x, int rows, int cols, BLOCK_SIZE sb_size, PLANE_TYPE plane, float *hist) { float total = 0.1f; const int ss_x = x->e_mbd.plane[plane].subsampling_x; const int ss_y = x->e_mbd.plane[plane].subsampling_y; const int sb_width = block_size_wide[sb_size] >> ss_x; // Derive the offset from the starting of the superblock in order to locate // the block level gradient data in the cache. const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); const int block_offset_in_grad_cache = sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) + (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x)); const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info + plane * MAX_SB_SQUARE + block_offset_in_grad_cache; // Retrieve the cached gradient information and generate the histogram. for (int r = 1; r < rows - 1; ++r) { for (int c = 1; c < cols - 1; ++c) { const uint16_t abs_dx_abs_dy_sum = grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum; if (!abs_dx_abs_dy_sum) continue; total += abs_dx_abs_dy_sum; const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero; if (is_dx_zero) { hist[0] += abs_dx_abs_dy_sum >> 1; hist[BINS - 1] += abs_dx_abs_dy_sum >> 1; } else { const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx; assert(idx >= 0 && idx < BINS); hist[idx] += abs_dx_abs_dy_sum; } } } normalize_hog(total, hist); } static inline void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, int plane, float *hog) { const MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int bh = block_size_high[bsize]; const int bw = block_size_wide[bsize]; const int rows = ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >> ss_y; const int cols = ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >> ss_x; // If gradient data is already generated at SB level, reuse the cached data. // Otherwise, compute the data. if (x->is_sb_gradient_cached[plane]) { generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog); } else { const uint8_t *src = x->plane[plane].src.buf; const int src_stride = x->plane[plane].src.stride; generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd)); } // Scale the hog so the luma and chroma are on the same scale for (int b = 0; b < BINS; ++b) { hog[b] *= (1 + ss_x) * (1 + ss_y); } } static inline void prune_intra_mode_with_hog( const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, uint8_t *directional_mode_skip_mask, int is_chroma) { const int plane = is_chroma ? 
AOM_PLANE_U : AOM_PLANE_Y; float hist[BINS] = { 0.0f }; collect_hog_data(x, bsize, sb_size, plane, hist); // Make prediction for each of the mode float scores[DIRECTIONAL_MODES] = { 0.0f }; av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores); for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED; uv_mode++) { if (scores[uv_mode - UV_V_PRED] <= th) { directional_mode_skip_mask[uv_mode] = 1; } } } #undef BINS int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, const int stride, const int is_hbd); // Returns whether caching of source variance for 4x4 sub-blocks is allowed. static inline bool is_src_var_for_4x4_sub_blocks_caching_enabled( const AV1_COMP *const cpi) { const SPEED_FEATURES *const sf = &cpi->sf; if (cpi->oxcf.mode != ALLINTRA) return false; if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true; if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 || (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode)) return false; return true; } // Initialize the members of Block4x4VarInfo structure to -1 at the start // of every superblock. static inline void init_src_var_info_of_4x4_sub_blocks( const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks, const BLOCK_SIZE sb_size) { if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; for (int i = 0; i < mi_count_in_sb; i++) { src_var_info_of_4x4_sub_blocks[i].var = -1; src_var_info_of_4x4_sub_blocks[i].log_var = -1.0; } } // Returns the cost needed to send a uniformly distributed r.v. static inline int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); const int m = (1 << l) - n; if (l == 0) return 0; if (v < m) return av1_cost_literal(l - 1); else return av1_cost_literal(l); } /*!\endcond */ /*!\brief Returns the rate cost for luma prediction mode info of intra blocks. * * \callergraph */ static inline int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mode_cost, int discount_color_cost) { int total_rate = mode_cost; const ModeCosts *mode_costs = &x->mode_costs; const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; const int use_intrabc = mbmi->use_intrabc; // Can only activate one mode. 
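// Palette is signaled on top of DC_PRED, so at most one of a non-DC mode,
// palette, intrabc and filter intra may be active at a time.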
assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + use_filter_intra) <= 1); const int try_palette = av1_allow_palette( cpi->common.features.allow_screen_content_tools, mbmi->bsize); if (try_palette && mbmi->mode == DC_PRED) { const MACROBLOCKD *xd = &x->e_mbd; const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); const int mode_ctx = av1_get_palette_mode_ctx(xd); total_rate += mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; if (use_palette) { const uint8_t *const color_map = xd->plane[0].color_index_map; int block_width, block_height, rows, cols; av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); const int plt_size = mbmi->palette_mode_info.palette_size[0]; int palette_mode_cost = mode_costs ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + write_uniform_cost(plt_size, color_map[0]); uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); palette_mode_cost += av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, n_cache, cpi->common.seq_params->bit_depth); if (!discount_color_cost) palette_mode_cost += av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; } } if (av1_filter_intra_allowed(&cpi->common, mbmi)) { total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra]; if (use_filter_intra) { total_rate += mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info .filter_intra_mode]; } } if (av1_is_directional_mode(mbmi->mode)) { if (av1_use_angle_delta(bsize)) { total_rate += mode_costs->angle_delta_cost[mbmi->mode - V_PRED] [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]]; } } if (av1_allow_intrabc(&cpi->common)) total_rate += mode_costs->intrabc_cost[use_intrabc]; return total_rate; } /*!\brief Return the rate cost for chroma prediction mode info of intra blocks. * * \callergraph */ static inline int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mode_cost) { int total_rate = mode_cost; const ModeCosts *mode_costs = &x->mode_costs; const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; // Can only activate one mode. 
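// The same exclusivity holds for chroma: palette is tied to UV_DC_PRED.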
assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1); const int try_palette = av1_allow_palette( cpi->common.features.allow_screen_content_tools, mbmi->bsize); if (try_palette && uv_mode == UV_DC_PRED) { const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; total_rate += mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette]; if (use_palette) { const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); const int plt_size = pmi->palette_size[1]; const MACROBLOCKD *xd = &x->e_mbd; const uint8_t *const color_map = xd->plane[1].color_index_map; int palette_mode_cost = mode_costs ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + write_uniform_cost(plt_size, color_map[0]); uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); palette_mode_cost += av1_palette_color_cost_uv( pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth); palette_mode_cost += av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; } } const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); if (av1_is_directional_mode(intra_mode)) { if (av1_use_angle_delta(bsize)) { total_rate += mode_costs->angle_delta_cost[intra_mode - V_PRED] [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; } } return total_rate; } /*!\cond */ // Makes a quick intra prediction and estimate the rdcost with a model without // going through the whole txfm/quantize/itxfm process. static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int use_hadamard) { MACROBLOCKD *const xd = &x->e_mbd; const BitDepthInfo bd_info = get_bit_depth_info(xd); int row, col; assert(!is_inter_block(xd->mi[0])); const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; const int txbw = tx_size_wide[tx_size]; const int txbh = tx_size_high[tx_size]; const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); int64_t satd_cost = 0; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; // Prediction. for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); // Here we use p->src_diff and p->coeff as temporary buffers for // prediction residue and transform coefficients. The buffers are only // used in this for loop, therefore we don't need to properly add offset // to the buffers. av1_subtract_block( bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride, pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride); av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, block_size_wide[plane_bsize], p->coeff); satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]); } } return satd_cost; } /*!\endcond */ /*!\brief Estimate the luma rdcost of a given intra mode and try to prune it. * * \ingroup intra_mode_search * \callergraph * This function first makes a quick luma prediction and estimates the rdcost * with a model without going through the txfm, then try to prune the current * mode if the new estimate y_rd > 1.25 * best_model_rd. * * \return Returns 1 if the given mode is prune; 0 otherwise. 
*/ static inline int model_intra_yrd_and_prune(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *best_model_rd) { const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int plane = 0; const AV1_COMMON *cm = &cpi->common; const int64_t this_model_rd = intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 2)) { return 1; } else if (this_model_rd < *best_model_rd) { *best_model_rd = this_model_rd; } return 0; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_ aom-3.12.1/av1/encoder/k_means_template.h000066400000000000000000000116461477627663500201760ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include <stdint.h> #include <stdlib.h> #include <string.h> #include "av1/common/blockd.h" #include "av1/encoder/palette.h" #include "av1/encoder/random.h" #ifndef AV1_K_MEANS_DIM #error "This template requires AV1_K_MEANS_DIM to be defined" #endif #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y) #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM) #define K_MEANS_RENAME_C(x, y) x##_dim##y##_c #define RENAME_C_(x, y) K_MEANS_RENAME_C(x, y) #define RENAME_C(x) RENAME_C_(x, AV1_K_MEANS_DIM) // Though we want to compute the smallest L2 norm, in 1 dimension, // it is equivalent to finding the smallest L1 norm and then squaring it. // This is preferable for speed, especially on the SIMD side.
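// For example, with AV1_K_MEANS_DIM == 1, calc_dist() returns |p1[0] - p2[0]|
// and av1_calc_indices squares only the winning distance when accumulating the
// total distortion, which preserves the argmin while avoiding a multiply per
// candidate centroid.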
static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) { #if AV1_K_MEANS_DIM == 1 return abs(p1[0] - p2[0]); #else int dist = 0; for (int i = 0; i < AV1_K_MEANS_DIM; ++i) { const int diff = p1[i] - p2[i]; dist += diff * diff; } return dist; #endif } void RENAME_C(av1_calc_indices)(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *dist, int n, int k) { if (dist) { *dist = 0; } for (int i = 0; i < n; ++i) { int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); indices[i] = 0; for (int j = 1; j < k; ++j) { const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids + j * AV1_K_MEANS_DIM); if (this_dist < min_dist) { min_dist = this_dist; indices[i] = j; } } if (dist) { #if AV1_K_MEANS_DIM == 1 *dist += min_dist * min_dist; #else *dist += min_dist; #endif } } } static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids, const uint8_t *indices, int n, int k) { int i, j; int count[PALETTE_MAX_SIZE] = { 0 }; int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; unsigned int rand_state = (unsigned int)data[0]; assert(n <= 32768); memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM); for (i = 0; i < n; ++i) { const int index = indices[i]; assert(index < k); ++count[index]; for (j = 0; j < AV1_K_MEANS_DIM; ++j) { centroids_sum[index * AV1_K_MEANS_DIM + j] += data[i * AV1_K_MEANS_DIM + j]; } } for (i = 0; i < k; ++i) { if (count[i] == 0) { memcpy(centroids + i * AV1_K_MEANS_DIM, data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM, sizeof(centroids[0]) * AV1_K_MEANS_DIM); } else { for (j = 0; j < AV1_K_MEANS_DIM; ++j) { centroids[i * AV1_K_MEANS_DIM + j] = DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]); } } } } void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, int max_itr) { int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT]; int16_t *meta_centroids[2] = { centroids, centroids_tmp }; uint8_t *meta_indices[2] = { indices, indices_tmp }; int i, l = 0, prev_l, best_l = 0; int64_t this_dist; assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT); #if AV1_K_MEANS_DIM == 1 av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k); #else av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k); #endif for (i = 0; i < max_itr; ++i) { const int64_t prev_dist = this_dist; prev_l = l; l = (l == 1) ? 0 : 1; RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k); if (!memcmp(meta_centroids[l], meta_centroids[prev_l], sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) { break; } #if AV1_K_MEANS_DIM == 1 av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist, n, k); #else av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist, n, k); #endif if (this_dist > prev_dist) { best_l = prev_l; break; } } if (i == max_itr) best_l = l; if (best_l != 0) { memcpy(centroids, meta_centroids[1], sizeof(centroids[0]) * k * AV1_K_MEANS_DIM); memcpy(indices, meta_indices[1], sizeof(indices[0]) * n); } } #undef RENAME_ #undef RENAME #undef K_MEANS_RENAME_C #undef RENAME_C_ #undef RENAME_C aom-3.12.1/av1/encoder/level.c000066400000000000000000001437251477627663500157760ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/encoder.h" #include "av1/encoder/level.h" #define UNDEFINED_LEVEL \ { \ .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ } static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { { .level = SEQ_LEVEL_2_0, .max_picture_size = 147456, .max_h_size = 2048, .max_v_size = 1152, .max_display_rate = 4423680L, .max_decode_rate = 5529600L, .max_header_rate = 150, .main_mbps = 1.5, .high_mbps = 0, .main_cr = 2.0, .high_cr = 0, .max_tiles = 8, .max_tile_cols = 4 }, { .level = SEQ_LEVEL_2_1, .max_picture_size = 278784, .max_h_size = 2816, .max_v_size = 1584, .max_display_rate = 8363520L, .max_decode_rate = 10454400L, .max_header_rate = 150, .main_mbps = 3.0, .high_mbps = 0, .main_cr = 2.0, .high_cr = 0, .max_tiles = 8, .max_tile_cols = 4 }, UNDEFINED_LEVEL, UNDEFINED_LEVEL, { .level = SEQ_LEVEL_3_0, .max_picture_size = 665856, .max_h_size = 4352, .max_v_size = 2448, .max_display_rate = 19975680L, .max_decode_rate = 24969600L, .max_header_rate = 150, .main_mbps = 6.0, .high_mbps = 0, .main_cr = 2.0, .high_cr = 0, .max_tiles = 16, .max_tile_cols = 6 }, { .level = SEQ_LEVEL_3_1, .max_picture_size = 1065024, .max_h_size = 5504, .max_v_size = 3096, .max_display_rate = 31950720L, .max_decode_rate = 39938400L, .max_header_rate = 150, .main_mbps = 10.0, .high_mbps = 0, .main_cr = 2.0, .high_cr = 0, .max_tiles = 16, .max_tile_cols = 6 }, UNDEFINED_LEVEL, UNDEFINED_LEVEL, { .level = SEQ_LEVEL_4_0, .max_picture_size = 2359296, .max_h_size = 6144, .max_v_size = 3456, .max_display_rate = 70778880L, .max_decode_rate = 77856768L, .max_header_rate = 300, .main_mbps = 12.0, .high_mbps = 30.0, .main_cr = 4.0, .high_cr = 4.0, .max_tiles = 32, .max_tile_cols = 8 }, { .level = SEQ_LEVEL_4_1, .max_picture_size = 2359296, .max_h_size = 6144, .max_v_size = 3456, .max_display_rate = 141557760L, .max_decode_rate = 155713536L, .max_header_rate = 300, .main_mbps = 20.0, .high_mbps = 50.0, .main_cr = 4.0, .high_cr = 4.0, .max_tiles = 32, .max_tile_cols = 8 }, UNDEFINED_LEVEL, UNDEFINED_LEVEL, { .level = SEQ_LEVEL_5_0, .max_picture_size = 8912896, .max_h_size = 8192, .max_v_size = 4352, .max_display_rate = 267386880L, .max_decode_rate = 273715200L, .max_header_rate = 300, .main_mbps = 30.0, .high_mbps = 100.0, .main_cr = 6.0, .high_cr = 4.0, .max_tiles = 64, .max_tile_cols = 8 }, { .level = SEQ_LEVEL_5_1, .max_picture_size = 8912896, .max_h_size = 8192, .max_v_size = 4352, .max_display_rate = 534773760L, .max_decode_rate = 547430400L, .max_header_rate = 300, .main_mbps = 40.0, .high_mbps = 160.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 64, .max_tile_cols = 8 }, { .level = SEQ_LEVEL_5_2, .max_picture_size = 8912896, .max_h_size = 8192, .max_v_size = 4352, .max_display_rate = 1069547520L, .max_decode_rate = 1094860800L, .max_header_rate = 300, .main_mbps = 60.0, .high_mbps = 240.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 64, .max_tile_cols = 8 }, { .level = SEQ_LEVEL_5_3, .max_picture_size = 8912896, .max_h_size = 8192, .max_v_size = 4352, .max_display_rate = 1069547520L, 
.max_decode_rate = 1176502272L, .max_header_rate = 300, .main_mbps = 60.0, .high_mbps = 240.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 64, .max_tile_cols = 8 }, { .level = SEQ_LEVEL_6_0, .max_picture_size = 35651584, .max_h_size = 16384, .max_v_size = 8704, .max_display_rate = 1069547520L, .max_decode_rate = 1176502272L, .max_header_rate = 300, .main_mbps = 60.0, .high_mbps = 240.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 128, .max_tile_cols = 16 }, { .level = SEQ_LEVEL_6_1, .max_picture_size = 35651584, .max_h_size = 16384, .max_v_size = 8704, .max_display_rate = 2139095040L, .max_decode_rate = 2189721600L, .max_header_rate = 300, .main_mbps = 100.0, .high_mbps = 480.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 128, .max_tile_cols = 16 }, { .level = SEQ_LEVEL_6_2, .max_picture_size = 35651584, .max_h_size = 16384, .max_v_size = 8704, .max_display_rate = 4278190080L, .max_decode_rate = 4379443200L, .max_header_rate = 300, .main_mbps = 160.0, .high_mbps = 800.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 128, .max_tile_cols = 16 }, { .level = SEQ_LEVEL_6_3, .max_picture_size = 35651584, .max_h_size = 16384, .max_v_size = 8704, .max_display_rate = 4278190080L, .max_decode_rate = 4706009088L, .max_header_rate = 300, .main_mbps = 160.0, .high_mbps = 800.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 128, .max_tile_cols = 16 }, #if CONFIG_CWG_C013 { .level = SEQ_LEVEL_7_0, .max_picture_size = 142606336, .max_h_size = 32768, .max_v_size = 17408, .max_display_rate = 4278190080L, .max_decode_rate = 4706009088L, .max_header_rate = 300, .main_mbps = 160.0, .high_mbps = 800.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 256, .max_tile_cols = 32 }, { .level = SEQ_LEVEL_7_1, .max_picture_size = 142606336, .max_h_size = 32768, .max_v_size = 17408, .max_display_rate = 8556380160L, .max_decode_rate = 8758886400L, .max_header_rate = 300, .main_mbps = 200.0, .high_mbps = 960.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 256, .max_tile_cols = 32 }, { .level = SEQ_LEVEL_7_2, .max_picture_size = 142606336, .max_h_size = 32768, .max_v_size = 17408, .max_display_rate = 17112760320L, .max_decode_rate = 17517772800L, .max_header_rate = 300, .main_mbps = 320.0, .high_mbps = 1600.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 256, .max_tile_cols = 32 }, { .level = SEQ_LEVEL_7_3, .max_picture_size = 142606336, .max_h_size = 32768, .max_v_size = 17408, .max_display_rate = 17112760320L, .max_decode_rate = 18824036352L, .max_header_rate = 300, .main_mbps = 320.0, .high_mbps = 1600.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 256, .max_tile_cols = 32 }, { .level = SEQ_LEVEL_8_0, .max_picture_size = 530841600, .max_h_size = 65536, .max_v_size = 34816, .max_display_rate = 17112760320L, .max_decode_rate = 18824036352L, .max_header_rate = 300, .main_mbps = 320.0, .high_mbps = 1600.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 512, .max_tile_cols = 64 }, { .level = SEQ_LEVEL_8_1, .max_picture_size = 530841600, .max_h_size = 65536, .max_v_size = 34816, .max_display_rate = 34225520640L, .max_decode_rate = 34910031052L, .max_header_rate = 300, .main_mbps = 400.0, .high_mbps = 1920.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 512, .max_tile_cols = 64 }, { .level = SEQ_LEVEL_8_2, .max_picture_size = 530841600, .max_h_size = 65536, .max_v_size = 34816, .max_display_rate = 68451041280L, .max_decode_rate = 69820062105L, .max_header_rate = 300, .main_mbps = 640.0, .high_mbps = 3200.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 512, .max_tile_cols = 64 }, { .level = SEQ_LEVEL_8_3, .max_picture_size = 
530841600, .max_h_size = 65536, .max_v_size = 34816, .max_display_rate = 68451041280L, .max_decode_rate = 75296145408L, .max_header_rate = 300, .main_mbps = 640.0, .high_mbps = 3200.0, .main_cr = 8.0, .high_cr = 4.0, .max_tiles = 512, .max_tile_cols = 64 }, #else // !CONFIG_CWG_C013 UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, UNDEFINED_LEVEL, #endif // CONFIG_CWG_C013 }; typedef enum { LUMA_PIC_SIZE_TOO_LARGE, LUMA_PIC_H_SIZE_TOO_LARGE, LUMA_PIC_V_SIZE_TOO_LARGE, LUMA_PIC_H_SIZE_TOO_SMALL, LUMA_PIC_V_SIZE_TOO_SMALL, TOO_MANY_TILE_COLUMNS, TOO_MANY_TILES, TILE_RATE_TOO_HIGH, TILE_TOO_LARGE, SUPERRES_TILE_WIDTH_TOO_LARGE, CROPPED_TILE_WIDTH_TOO_SMALL, CROPPED_TILE_HEIGHT_TOO_SMALL, TILE_WIDTH_INVALID, FRAME_HEADER_RATE_TOO_HIGH, DISPLAY_RATE_TOO_HIGH, DECODE_RATE_TOO_HIGH, CR_TOO_SMALL, TILE_SIZE_HEADER_RATE_TOO_HIGH, BITRATE_TOO_HIGH, DECODER_MODEL_FAIL, TARGET_LEVEL_FAIL_IDS, TARGET_LEVEL_OK, } TARGET_LEVEL_FAIL_ID; static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "The picture size is too large.", "The picture width is too large.", "The picture height is too large.", "The picture width is too small.", "The picture height is too small.", "Too many tile columns are used.", "Too many tiles are used.", "The tile rate is too high.", "The tile size is too large.", "The superres tile width is too large.", "The cropped tile width is less than 8.", "The cropped tile height is less than 8.", "The tile width is invalid.", "The frame header rate is too high.", "The display luma sample rate is too high.", "The decoded luma sample rate is too high.", "The compression ratio is too small.", "The product of max tile size and header rate is too high.", "The bitrate is too high.", "The decoder model fails.", }; static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier, BITSTREAM_PROFILE profile) { if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; const double bitrate_basis = (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6; const double bitrate_profile_factor = profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0); return bitrate_basis * bitrate_profile_factor; } double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, BITSTREAM_PROFILE profile) { assert(is_valid_seq_level_idx(level_index)); return get_max_bitrate(&av1_level_defs[level_index], tier, profile); } void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, int *const max_tile_cols) { assert(is_valid_seq_level_idx(level_index)); const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; *max_tiles = level_spec->max_tiles; *max_tile_cols = level_spec->max_tile_cols; } // We assume time t to be valid if and only if t >= 0.0. // So INVALID_TIME can be defined as anything less than 0. #define INVALID_TIME (-1.0) // This corresponds to "free_buffer" in the spec. 
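// Releasing a buffer clears both reference counts and invalidates its display
// index and presentation time, which makes it available to get_free_buffer().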
static void release_buffer(DECODER_MODEL *const decoder_model, int idx) { assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE); FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; this_buffer->decoder_ref_count = 0; this_buffer->player_ref_count = 0; this_buffer->display_index = -1; this_buffer->presentation_time = INVALID_TIME; } static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) { for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { release_buffer(decoder_model, i); } for (int i = 0; i < REF_FRAMES; ++i) { decoder_model->vbi[i] = -1; } } static int get_free_buffer(DECODER_MODEL *const decoder_model) { for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { const FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; if (this_buffer->decoder_ref_count == 0 && this_buffer->player_ref_count == 0) return i; } return -1; } static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx, int refresh_frame_flags) { FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; for (int i = 0; i < REF_FRAMES; ++i) { if (refresh_frame_flags & (1 << i)) { const int pre_idx = decoder_model->vbi[i]; if (pre_idx != -1) { --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count; } decoder_model->vbi[i] = idx; ++this_buffer->decoder_ref_count; } } } // The time (in seconds) required to decode a frame. static double time_to_decode_frame(const AV1_COMMON *const cm, int64_t max_decode_rate) { if (cm->show_existing_frame) return 0.0; const FRAME_TYPE frame_type = cm->current_frame.frame_type; int luma_samples = 0; if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) { luma_samples = cm->superres_upscaled_width * cm->height; } else { const int spatial_layer_dimensions_present_flag = 0; if (spatial_layer_dimensions_present_flag) { assert(0 && "Spatial layer dimensions not supported yet."); } else { const SequenceHeader *const seq_params = cm->seq_params; const int max_frame_width = seq_params->max_frame_width; const int max_frame_height = seq_params->max_frame_height; luma_samples = max_frame_width * max_frame_height; } } return luma_samples / (double)max_decode_rate; } // Release frame buffers that are no longer needed for decode or display. // It corresponds to "start_decode_at_removal_time" in the spec. static void release_processed_frames(DECODER_MODEL *const decoder_model, double removal_time) { for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; if (this_buffer->player_ref_count > 0) { if (this_buffer->presentation_time >= 0.0 && this_buffer->presentation_time <= removal_time) { this_buffer->player_ref_count = 0; if (this_buffer->decoder_ref_count == 0) { release_buffer(decoder_model, i); } } } } } static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) { int frames_in_pool = 0; for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { const FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; if (this_buffer->decoder_ref_count > 0 || this_buffer->player_ref_count > 0) { ++frames_in_pool; } } return frames_in_pool; } static double get_presentation_time(const DECODER_MODEL *const decoder_model, int display_index) { if (decoder_model->mode == SCHEDULE_MODE) { assert(0 && "SCHEDULE_MODE NOT SUPPORTED"); return INVALID_TIME; } else { const double initial_presentation_delay = decoder_model->initial_presentation_delay; // Can't decide presentation time until the initial presentation delay is // known. 
if (initial_presentation_delay < 0.0) return INVALID_TIME; return initial_presentation_delay + display_index * decoder_model->num_ticks_per_picture * decoder_model->display_clock_tick; } } #define MAX_TIME 1e16 static double time_next_buffer_is_free(int num_decoded_frame, int decoder_buffer_delay, const FRAME_BUFFER *frame_buffer_pool, double current_time) { if (num_decoded_frame == 0) { return (double)decoder_buffer_delay / 90000.0; } double buf_free_time = MAX_TIME; for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i]; if (this_buffer->decoder_ref_count == 0) { if (this_buffer->player_ref_count == 0) { return current_time; } const double presentation_time = this_buffer->presentation_time; if (presentation_time >= 0.0 && presentation_time < buf_free_time) { buf_free_time = presentation_time; } } } return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME; } #undef MAX_TIME static double get_removal_time(int mode, int num_decoded_frame, int decoder_buffer_delay, const FRAME_BUFFER *frame_buffer_pool, double current_time) { if (mode == SCHEDULE_MODE) { assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET"); return INVALID_TIME; } else { return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay, frame_buffer_pool, current_time); } } #if 0 // Print the status of the decoder model (for debugging). void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) { printf( "\n status %d, num_frame %3d, num_decoded_frame %3d, " "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, " "presentation delay %6.2f, total interval %6.2f\n", decoder_model->status, decoder_model->num_frame, decoder_model->num_decoded_frame, decoder_model->num_shown_frame, decoder_model->current_time, frames_in_buffer_pool(decoder_model), decoder_model->initial_presentation_delay, decoder_model->dfg_interval_queue.total_interval); for (int i = 0; i < 10; ++i) { const FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; printf("buffer %d, decode count %d, display count %d, present time %6.4f\n", i, this_buffer->decoder_ref_count, this_buffer->player_ref_count, this_buffer->presentation_time); } } #endif // op_index is the operating point index. static void decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, int op_index, DECODER_MODEL *const decoder_model) { decoder_model->status = DECODER_MODEL_OK; decoder_model->level = level; const AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; decoder_model->bit_rate = get_max_bitrate( av1_level_defs + level, seq_params->tier[op_index], seq_params->profile); // TODO(huisu or anyone): implement SCHEDULE_MODE. 
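// Editorial note (illustrative example, not part of the original libaom
// source): the encoder/decoder buffer delays assigned just below are
// expressed on the 90 kHz clock, so a decoder_buffer_delay of 70000
// corresponds to 70000 / 90000 ~= 0.778 s; that is also the removal time
// time_next_buffer_is_free() returns for the very first decoded frame.
// Presentation times then follow
//   presentation_time = initial_presentation_delay
//       + display_index * num_ticks_per_picture * display_clock_tick,
// so when no timing info is signalled (num_ticks_per_picture = 1,
// display_clock_tick = 1 / framerate, as set below) shown frames are spaced
// one frame interval apart after the initial presentation delay.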
decoder_model->mode = RESOURCE_MODE; decoder_model->encoder_buffer_delay = 20000; decoder_model->decoder_buffer_delay = 70000; decoder_model->is_low_delay_mode = false; decoder_model->first_bit_arrival_time = 0.0; decoder_model->last_bit_arrival_time = 0.0; decoder_model->coded_bits = 0; decoder_model->removal_time = INVALID_TIME; decoder_model->presentation_time = INVALID_TIME; decoder_model->decode_samples = 0; decoder_model->display_samples = 0; decoder_model->max_decode_rate = 0.0; decoder_model->max_display_rate = 0.0; decoder_model->num_frame = -1; decoder_model->num_decoded_frame = -1; decoder_model->num_shown_frame = -1; decoder_model->current_time = 0.0; initialize_buffer_pool(decoder_model); DFG_INTERVAL_QUEUE *const dfg_interval_queue = &decoder_model->dfg_interval_queue; dfg_interval_queue->total_interval = 0.0; dfg_interval_queue->head = 0; dfg_interval_queue->size = 0; if (seq_params->timing_info_present) { decoder_model->num_ticks_per_picture = seq_params->timing_info.num_ticks_per_picture; decoder_model->display_clock_tick = seq_params->timing_info.num_units_in_display_tick / seq_params->timing_info.time_scale; } else { decoder_model->num_ticks_per_picture = 1; decoder_model->display_clock_tick = 1.0 / cpi->framerate; } decoder_model->initial_display_delay = seq_params->op_params[op_index].initial_display_delay; decoder_model->initial_presentation_delay = INVALID_TIME; decoder_model->decode_rate = av1_level_defs[level].max_decode_rate; } DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( const AV1_COMP *const cpi, size_t coded_bits, const DECODER_MODEL *const decoder_model) { DECODER_MODEL_STATUS status = DECODER_MODEL_OK; if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) { return status; } const AV1_COMMON *const cm = &cpi->common; const int show_existing_frame = cm->show_existing_frame; size_t cur_coded_bits = decoder_model->coded_bits + coded_bits; int num_decoded_frame = decoder_model->num_decoded_frame; if (!show_existing_frame) ++num_decoded_frame; if (show_existing_frame) { return status; } else { const double removal_time = get_removal_time( decoder_model->mode, num_decoded_frame, decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, decoder_model->current_time); if (removal_time < 0.0) { status = DECODE_FRAME_BUF_UNAVAILABLE; return status; } // A frame with show_existing_frame being false indicates the end of a DFG. // Update the bits arrival time of this DFG. const double buffer_delay = (decoder_model->encoder_buffer_delay + decoder_model->decoder_buffer_delay) / 90000.0; const double latest_arrival_time = removal_time - buffer_delay; const double first_bit_arrival_time = AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); const double last_bit_arrival_time = first_bit_arrival_time + (double)cur_coded_bits / decoder_model->bit_rate; // Smoothing buffer underflows if the last bit arrives after the removal // time. if (last_bit_arrival_time > removal_time && !decoder_model->is_low_delay_mode) { status = SMOOTHING_BUFFER_UNDERFLOW; return status; } // Check if the smoothing buffer overflows. const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { assert(0); } double total_interval = queue->total_interval; int qhead = queue->head; int qsize = queue->size; // Remove the DFGs with removal time earlier than last_bit_arrival_time. 
while (queue->buf[qhead].removal_time <= last_bit_arrival_time && qsize > 0) { if (queue->buf[qhead].removal_time - first_bit_arrival_time + total_interval > 1.0) { status = SMOOTHING_BUFFER_OVERFLOW; return status; } total_interval -= queue->buf[qhead].last_bit_arrival_time - queue->buf[qhead].first_bit_arrival_time; qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE; --qsize; } total_interval += last_bit_arrival_time - first_bit_arrival_time; // The smoothing buffer can hold at most "bit_rate" bits, which is // equivalent to 1 second of total interval. if (total_interval > 1.0) { status = SMOOTHING_BUFFER_OVERFLOW; return status; } return status; } } static void decoder_model_process_frame(const AV1_COMP *const cpi, size_t coded_bits, DECODER_MODEL *const decoder_model) { if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return; const AV1_COMMON *const cm = &cpi->common; const int luma_pic_size = cm->superres_upscaled_width * cm->height; const int show_existing_frame = cm->show_existing_frame; const int show_frame = cm->show_frame || show_existing_frame; ++decoder_model->num_frame; if (!show_existing_frame) ++decoder_model->num_decoded_frame; if (show_frame) ++decoder_model->num_shown_frame; decoder_model->coded_bits += coded_bits; int display_idx = -1; if (show_existing_frame) { display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show]; if (display_idx < 0) { decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY; return; } if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) { update_ref_buffers(decoder_model, display_idx, 0xFF); } } else { const double removal_time = get_removal_time( decoder_model->mode, decoder_model->num_decoded_frame, decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, decoder_model->current_time); if (removal_time < 0.0) { decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; return; } const int previous_decode_samples = decoder_model->decode_samples; const double previous_removal_time = decoder_model->removal_time; assert(previous_removal_time < removal_time); decoder_model->removal_time = removal_time; decoder_model->decode_samples = luma_pic_size; const double this_decode_rate = previous_decode_samples / (removal_time - previous_removal_time); decoder_model->max_decode_rate = AOMMAX(decoder_model->max_decode_rate, this_decode_rate); // A frame with show_existing_frame being false indicates the end of a DFG. // Update the bits arrival time of this DFG. const double buffer_delay = (decoder_model->encoder_buffer_delay + decoder_model->decoder_buffer_delay) / 90000.0; const double latest_arrival_time = removal_time - buffer_delay; decoder_model->first_bit_arrival_time = AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); decoder_model->last_bit_arrival_time = decoder_model->first_bit_arrival_time + (double)decoder_model->coded_bits / decoder_model->bit_rate; // Smoothing buffer underflows if the last bit arrives after the removal // time. if (decoder_model->last_bit_arrival_time > removal_time && !decoder_model->is_low_delay_mode) { decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW; return; } // Reset the coded bits for the next DFG. decoder_model->coded_bits = 0; // Check if the smoothing buffer overflows. 
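// Editorial note (illustrative example, not part of the original libaom
// source): a numeric picture of the two smoothing-buffer failure modes
// checked above and below. With the defaults from decoder_model_init()
// (encoder_buffer_delay = 20000, decoder_buffer_delay = 70000) the combined
// buffer_delay is 90000 / 90000 = 1.0 s. If bit_rate is 60 Mbps and a
// decodable frame group carries 12,000,000 coded bits, its transmission
// interval is 0.2 s; the last bit must still arrive no later than the
// frame's removal time (unless the stream is in low-delay mode), otherwise
// SMOOTHING_BUFFER_UNDERFLOW is reported. Conversely, the buffer holds
// exactly bit_rate bits, i.e. one second of data, so the summed transmission
// intervals of the DFGs still in flight may not exceed 1.0 s, otherwise
// SMOOTHING_BUFFER_OVERFLOW is reported.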
DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { assert(0); } const double first_bit_arrival_time = decoder_model->first_bit_arrival_time; const double last_bit_arrival_time = decoder_model->last_bit_arrival_time; // Remove the DFGs with removal time earlier than last_bit_arrival_time. while (queue->buf[queue->head].removal_time <= last_bit_arrival_time && queue->size > 0) { if (queue->buf[queue->head].removal_time - first_bit_arrival_time + queue->total_interval > 1.0) { decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; return; } queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time - queue->buf[queue->head].first_bit_arrival_time; queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE; --queue->size; } // Push current DFG into the queue. const int queue_index = (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE; queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time; queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time; queue->buf[queue_index].removal_time = removal_time; queue->total_interval += last_bit_arrival_time - first_bit_arrival_time; // The smoothing buffer can hold at most "bit_rate" bits, which is // equivalent to 1 second of total interval. if (queue->total_interval > 1.0) { decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; return; } release_processed_frames(decoder_model, removal_time); decoder_model->current_time = removal_time + time_to_decode_frame(cm, decoder_model->decode_rate); const int cfbi = get_free_buffer(decoder_model); if (cfbi < 0) { decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; return; } const CurrentFrame *const current_frame = &cm->current_frame; decoder_model->frame_buffer_pool[cfbi].frame_type = cm->current_frame.frame_type; display_idx = cfbi; update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags); if (decoder_model->initial_presentation_delay < 0.0) { // Display can begin after required number of frames have been buffered. if (frames_in_buffer_pool(decoder_model) >= decoder_model->initial_display_delay - 1) { decoder_model->initial_presentation_delay = decoder_model->current_time; // Update presentation time for each shown frame in the frame buffer. for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; if (this_buffer->player_ref_count == 0) continue; assert(this_buffer->display_index >= 0); this_buffer->presentation_time = get_presentation_time(decoder_model, this_buffer->display_index); } } } } // Display. 
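// Editorial note (illustrative example, not part of the original libaom
// source): the display rate tracked in the block below is luma samples per
// second between consecutive presentation times. For example, a 1920x1080
// stream shown at 30 fps yields 1920 * 1080 * 30 = 62,208,000 samples/s,
// well below the 1,069,547,520 samples/s ceiling of the SEQ_LEVEL_6_0 entry
// in av1_level_defs above.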
if (show_frame) { assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE); FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[display_idx]; ++this_buffer->player_ref_count; this_buffer->display_index = decoder_model->num_shown_frame; const double presentation_time = get_presentation_time(decoder_model, this_buffer->display_index); this_buffer->presentation_time = presentation_time; if (presentation_time >= 0.0 && decoder_model->current_time > presentation_time) { decoder_model->status = DISPLAY_FRAME_LATE; return; } const int previous_display_samples = decoder_model->display_samples; const double previous_presentation_time = decoder_model->presentation_time; decoder_model->display_samples = luma_pic_size; decoder_model->presentation_time = presentation_time; if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) { assert(previous_presentation_time < presentation_time); const double this_display_rate = previous_display_samples / (presentation_time - previous_presentation_time); decoder_model->max_display_rate = AOMMAX(decoder_model->max_display_rate, this_display_rate); } } } void av1_init_level_info(AV1_COMP *cpi) { for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) { AV1LevelInfo *const this_level_info = cpi->ppi->level_params.level_info[op_index]; if (!this_level_info) continue; memset(this_level_info, 0, sizeof(*this_level_info)); AV1LevelSpec *const level_spec = &this_level_info->level_spec; level_spec->level = SEQ_LEVEL_MAX; AV1LevelStats *const level_stats = &this_level_info->level_stats; level_stats->min_cropped_tile_width = INT_MAX; level_stats->min_cropped_tile_height = INT_MAX; level_stats->min_frame_width = INT_MAX; level_stats->min_frame_height = INT_MAX; level_stats->tile_width_is_valid = 1; level_stats->min_cr = 1e8; FrameWindowBuffer *const frame_window_buffer = &this_level_info->frame_window_buffer; frame_window_buffer->num = 0; frame_window_buffer->start = 0; const AV1_COMMON *const cm = &cpi->common; const int upscaled_width = cm->superres_upscaled_width; const int height = cm->height; const int pic_size = upscaled_width * height; for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { DECODER_MODEL *const this_model = &this_level_info->decoder_models[level]; const AV1LevelSpec *const spec = &av1_level_defs[level]; if (upscaled_width > spec->max_h_size || height > spec->max_v_size || pic_size > spec->max_picture_size) { // Turn off decoder model for this level as the frame size already // exceeds level constraints. this_model->status = DECODER_MODEL_DISABLED; } else { decoder_model_init(cpi, level, op_index, this_model); } } } } static double get_min_cr(const AV1LevelSpec *const level_spec, int tier, int is_still_picture, int64_t decoded_sample_rate) { if (is_still_picture) return 0.8; if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; const double min_cr_basis = tier ? 
level_spec->high_cr : level_spec->main_cr; const double speed_adj = (double)decoded_sample_rate / level_spec->max_display_rate; return AOMMAX(min_cr_basis * speed_adj, 0.8); } double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, int is_still_picture) { assert(is_valid_seq_level_idx(level_index)); const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; return get_min_cr(level_spec, tier, is_still_picture, level_spec->max_decode_rate); } static void get_temporal_parallel_params(int scalability_mode_idc, int *temporal_parallel_num, int *temporal_parallel_denom) { if (scalability_mode_idc < 0) { *temporal_parallel_num = 1; *temporal_parallel_denom = 1; return; } // TODO(huisu@): handle scalability cases. if (scalability_mode_idc == SCALABILITY_SS) { (void)scalability_mode_idc; } else { (void)scalability_mode_idc; } } #define MIN_CROPPED_TILE_WIDTH 8 #define MIN_CROPPED_TILE_HEIGHT 8 #define MIN_FRAME_WIDTH 16 #define MIN_FRAME_HEIGHT 16 #define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136 static TARGET_LEVEL_FAIL_ID check_level_constraints( const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier, int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) { const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level]; const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status; if (decoder_model_status != DECODER_MODEL_OK && decoder_model_status != DECODER_MODEL_DISABLED) { return DECODER_MODEL_FAIL; } const AV1LevelSpec *const level_spec = &level_info->level_spec; const AV1LevelSpec *const target_level_spec = &av1_level_defs[level]; const AV1LevelStats *const level_stats = &level_info->level_stats; TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK; do { if (level_spec->max_picture_size > target_level_spec->max_picture_size) { fail_id = LUMA_PIC_SIZE_TOO_LARGE; break; } if (level_spec->max_h_size > target_level_spec->max_h_size) { fail_id = LUMA_PIC_H_SIZE_TOO_LARGE; break; } if (level_spec->max_v_size > target_level_spec->max_v_size) { fail_id = LUMA_PIC_V_SIZE_TOO_LARGE; break; } if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) { fail_id = TOO_MANY_TILE_COLUMNS; break; } if (level_spec->max_tiles > target_level_spec->max_tiles) { fail_id = TOO_MANY_TILES; break; } if (level_spec->max_header_rate > target_level_spec->max_header_rate) { fail_id = FRAME_HEADER_RATE_TOO_HIGH; break; } if (decoder_model->max_display_rate > (double)target_level_spec->max_display_rate) { fail_id = DISPLAY_RATE_TOO_HIGH; break; } // TODO(huisu): we are not using max decode rate calculated by the decoder // model because the model in resource availability mode always returns // MaxDecodeRate(as in the level definitions) as the max decode rate. if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) { fail_id = DECODE_RATE_TOO_HIGH; break; } if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) { fail_id = TILE_RATE_TOO_HIGH; break; } #if CONFIG_CWG_C013 const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3) ? 
MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA; #else const int max_tile_size = MAX_TILE_AREA; #endif if (level_stats->max_tile_size > max_tile_size) { fail_id = TILE_TOO_LARGE; break; } if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) { fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE; break; } if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) { fail_id = CROPPED_TILE_WIDTH_TOO_SMALL; break; } if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) { fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL; break; } if (level_stats->min_frame_width < MIN_FRAME_WIDTH) { fail_id = LUMA_PIC_H_SIZE_TOO_SMALL; break; } if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) { fail_id = LUMA_PIC_V_SIZE_TOO_SMALL; break; } if (!level_stats->tile_width_is_valid) { fail_id = TILE_WIDTH_INVALID; break; } const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture, level_spec->max_decode_rate); if (level_stats->min_cr < min_cr) { fail_id = CR_TOO_SMALL; break; } if (check_bitrate) { // Check average bitrate instead of max_bitrate. const double bitrate_limit = get_max_bitrate(target_level_spec, tier, profile); const double avg_bitrate = level_stats->total_compressed_size * 8.0 / level_stats->total_time_encoded; if (avg_bitrate > bitrate_limit) { fail_id = BITRATE_TOO_HIGH; break; } } if (target_level_spec->level > SEQ_LEVEL_5_1) { int temporal_parallel_num; int temporal_parallel_denom; const int scalability_mode_idc = -1; get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num, &temporal_parallel_denom); const int val = level_stats->max_tile_size * level_spec->max_header_rate * temporal_parallel_denom / temporal_parallel_num; if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) { fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH; break; } } } while (0); return fail_id; } static void get_tile_stats(const AV1_COMMON *const cm, const TileDataEnc *const tile_data, int *max_tile_size, int *max_superres_tile_width, int *min_cropped_tile_width, int *min_cropped_tile_height, int *tile_width_valid) { const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; const int superres_scale_denominator = cm->superres_scale_denominator; *max_tile_size = 0; *max_superres_tile_width = 0; *min_cropped_tile_width = INT_MAX; *min_cropped_tile_height = INT_MAX; *tile_width_valid = 1; for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { const TileInfo *const tile_info = &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info; const int tile_width = (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE; const int tile_height = (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; const int tile_size = tile_width * tile_height; *max_tile_size = AOMMAX(*max_tile_size, tile_size); const int supperres_tile_width = tile_width * superres_scale_denominator / SCALE_NUMERATOR; *max_superres_tile_width = AOMMAX(*max_superres_tile_width, supperres_tile_width); const int cropped_tile_width = cm->width - tile_info->mi_col_start * MI_SIZE; const int cropped_tile_height = cm->height - tile_info->mi_row_start * MI_SIZE; *min_cropped_tile_width = AOMMIN(*min_cropped_tile_width, cropped_tile_width); *min_cropped_tile_height = AOMMIN(*min_cropped_tile_height, cropped_tile_height); const int is_right_most_tile = tile_info->mi_col_end == cm->mi_params.mi_cols; if (!is_right_most_tile) { if (av1_superres_scaled(cm)) *tile_width_valid &= tile_width >= 128; else *tile_width_valid &= tile_width >= 64; } } } } static int 
store_frame_record(int64_t ts_start, int64_t ts_end, size_t encoded_size, int pic_size, int frame_header_count, int tiles, int show_frame, int show_existing_frame, FrameWindowBuffer *const buffer) { if (buffer->num < FRAME_WINDOW_SIZE) { ++buffer->num; } else { buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE; } const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; FrameRecord *const record = &buffer->buf[new_idx]; record->ts_start = ts_start; record->ts_end = ts_end; record->encoded_size_in_bytes = encoded_size; record->pic_size = pic_size; record->frame_header_count = frame_header_count; record->tiles = tiles; record->show_frame = show_frame; record->show_existing_frame = show_existing_frame; return new_idx; } // Count the number of frames encoded in the last "duration" ticks, in display // time. static int count_frames(const FrameWindowBuffer *const buffer, int64_t duration) { const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; // Assume current frame is shown frame. assert(buffer->buf[current_idx].show_frame); const int64_t current_time = buffer->buf[current_idx].ts_end; const int64_t time_limit = AOMMAX(current_time - duration, 0); int num_frames = 1; int index = current_idx - 1; for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) { if (index < 0) index = FRAME_WINDOW_SIZE - 1; const FrameRecord *const record = &buffer->buf[index]; if (!record->show_frame) continue; const int64_t ts_start = record->ts_start; if (ts_start < time_limit) break; } return num_frames; } // Scan previously encoded frames and update level metrics accordingly. static void scan_past_frames(const FrameWindowBuffer *const buffer, int num_frames_to_scan, AV1LevelSpec *const level_spec, AV1LevelStats *const level_stats) { const int num_frames_in_buffer = buffer->num; int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE; int frame_headers = 0; int tiles = 0; int64_t display_samples = 0; int64_t decoded_samples = 0; size_t encoded_size_in_bytes = 0; for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) { const FrameRecord *const record = &buffer->buf[index]; if (!record->show_existing_frame) { frame_headers += record->frame_header_count; decoded_samples += record->pic_size; } if (record->show_frame) { display_samples += record->pic_size; } tiles += record->tiles; encoded_size_in_bytes += record->encoded_size_in_bytes; --index; if (index < 0) index = FRAME_WINDOW_SIZE - 1; } level_spec->max_header_rate = AOMMAX(level_spec->max_header_rate, frame_headers); // TODO(huisu): we can now compute max display rate with the decoder model, so // these couple of lines can be removed. Keep them here for a while for // debugging purpose. 
level_spec->max_display_rate = AOMMAX(level_spec->max_display_rate, display_samples); level_spec->max_decode_rate = AOMMAX(level_spec->max_decode_rate, decoded_samples); level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles); level_stats->max_bitrate = AOMMAX(level_stats->max_bitrate, (int)AOMMIN(encoded_size_in_bytes * 8, (size_t)INT_MAX)); } void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, int64_t ts_end) { AV1_COMMON *const cm = &cpi->common; const AV1LevelParams *const level_params = &cpi->ppi->level_params; const int upscaled_width = cm->superres_upscaled_width; const int width = cm->width; const int height = cm->height; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; const int tiles = tile_cols * tile_rows; const int luma_pic_size = upscaled_width * height; const int frame_header_count = cpi->frame_header_count; const int show_frame = cm->show_frame; const int show_existing_frame = cm->show_existing_frame; int max_tile_size; int min_cropped_tile_width; int min_cropped_tile_height; int max_superres_tile_width; int tile_width_is_valid; get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width, &min_cropped_tile_width, &min_cropped_tile_height, &tile_width_is_valid); const double compression_ratio = av1_get_compression_ratio(cm, size); const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; const SequenceHeader *const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int is_still_picture = seq_params->still_picture; // update level_stats // TODO(kyslov@) fix the implementation according to buffer model for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) { if (!is_in_operating_point(seq_params->operating_point_idc[i], temporal_layer_id, spatial_layer_id) || !((level_params->keep_level_stats >> i) & 1)) { continue; } AV1LevelInfo *const level_info = level_params->level_info[i]; assert(level_info != NULL); AV1LevelStats *const level_stats = &level_info->level_stats; level_stats->max_tile_size = AOMMAX(level_stats->max_tile_size, max_tile_size); level_stats->max_superres_tile_width = AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width); level_stats->min_cropped_tile_width = AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width); level_stats->min_cropped_tile_height = AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height); level_stats->tile_width_is_valid &= tile_width_is_valid; level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width); level_stats->min_frame_height = AOMMIN(level_stats->min_frame_height, height); level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio); level_stats->total_compressed_size += (double)size; // update level_spec // TODO(kyslov@) update all spec fields AV1LevelSpec *const level_spec = &level_info->level_spec; level_spec->max_picture_size = AOMMAX(level_spec->max_picture_size, luma_pic_size); level_spec->max_h_size = AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width); level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height); level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols); level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles); // Store info. of current frame into FrameWindowBuffer. 
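// Editorial note (illustrative example, not part of the original libaom
// source): the rolling window above works in display time. count_frames()
// walks the FrameWindowBuffer backwards from the current shown frame until
// it falls outside the last TICKS_PER_SEC worth of timestamps, and
// scan_past_frames() then sums header counts, tiles, samples and coded bytes
// over those frames. For example, if the last second holds 30 shown frames
// totalling 1,500,000 encoded bytes, the window bitrate recorded in
// level_stats->max_bitrate is 1,500,000 * 8 = 12,000,000 bps (12 Mbps).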
FrameWindowBuffer *const buffer = &level_info->frame_window_buffer; store_frame_record(ts_start, ts_end, size, luma_pic_size, frame_header_count, tiles, show_frame, show_existing_frame, buffer); if (show_frame) { // Count the number of frames encoded in the past 1 second. const int encoded_frames_in_last_second = show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0; scan_past_frames(buffer, encoded_frames_in_last_second, level_spec, level_stats); level_stats->total_time_encoded += (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) / (double)TICKS_PER_SEC; } DECODER_MODEL *const decoder_models = level_info->decoder_models; for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { decoder_model_process_frame(cpi, size << 3, &decoder_models[level]); } // Check whether target level is met. const AV1_LEVEL target_level = level_params->target_seq_level_idx[i]; if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) { assert(is_valid_seq_level_idx(target_level)); const int tier = seq_params->tier[i]; const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( level_info, target_level, tier, is_still_picture, profile, 0); if (fail_id != TARGET_LEVEL_OK) { const int target_level_major = 2 + (target_level >> 2); const int target_level_minor = target_level & 3; aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode to the target level %d_%d. %s", target_level_major, target_level_minor, level_fail_messages[fail_id]); } } } } aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, const AV1LevelParams *level_params, int *seq_level_idx) { const int is_still_picture = seq_params->still_picture; const BITSTREAM_PROFILE profile = seq_params->profile; for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { seq_level_idx[op] = (int)SEQ_LEVEL_MAX; if (!((level_params->keep_level_stats >> op) & 1)) continue; const int tier = seq_params->tier[op]; const AV1LevelInfo *const level_info = level_params->level_info[op]; assert(level_info != NULL); for (int level = 0; level < SEQ_LEVELS; ++level) { if (!is_valid_seq_level_idx(level)) continue; const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( level_info, level, tier, is_still_picture, profile, 1); if (fail_id == TARGET_LEVEL_OK) { seq_level_idx[op] = level; break; } } } return AOM_CODEC_OK; } aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, const AV1LevelParams *level_params, int *target_seq_level_idx) { for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX; if (!((level_params->keep_level_stats >> op) & 1)) continue; target_seq_level_idx[op] = level_params->target_seq_level_idx[op]; } return AOM_CODEC_OK; } aom-3.12.1/av1/encoder/level.h000066400000000000000000000155051477627663500157750ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_LEVEL_H_ #define AOM_AV1_ENCODER_LEVEL_H_ #include "av1/common/enums.h" struct AV1_COMP; // AV1 Level Specifications typedef struct { AV1_LEVEL level; int max_picture_size; int max_h_size; int max_v_size; int max_header_rate; int max_tile_rate; int max_tiles; int max_tile_cols; int64_t max_display_rate; int64_t max_decode_rate; double main_mbps; double high_mbps; double main_cr; double high_cr; } AV1LevelSpec; typedef struct { int64_t ts_start; int64_t ts_end; size_t encoded_size_in_bytes; int pic_size; int frame_header_count; int tiles; int show_frame; int show_existing_frame; } FrameRecord; // Record frame info. in a rolling window. #define FRAME_WINDOW_SIZE 256 typedef struct { FrameRecord buf[FRAME_WINDOW_SIZE]; int num; // Number of FrameRecord stored in the buffer. int start; // Buffer index of the first FrameRecord. } FrameWindowBuffer; typedef struct { int max_bitrate; // Max bitrate in any 1-second window, in bps. int max_tile_size; int max_superres_tile_width; int min_cropped_tile_width; int min_cropped_tile_height; int tile_width_is_valid; int min_frame_width; int min_frame_height; double total_compressed_size; // In bytes. double total_time_encoded; // In seconds. double min_cr; } AV1LevelStats; // The following data structures are for the decoder model. typedef struct { int decoder_ref_count; int player_ref_count; int display_index; FRAME_TYPE frame_type; double presentation_time; } FRAME_BUFFER; // Interval of bits transmission for a DFG(Decodable Frame Group). typedef struct { double first_bit_arrival_time; // Time when the first bit arrives. double last_bit_arrival_time; // Time when the last bit arrives. // Removal time means the time when the bits to be decoded are removed from // the smoothing buffer. Removal time is essentially the time when the // decoding of the frame starts. double removal_time; } DFG_INTERVAL; #define DFG_INTERVAL_QUEUE_SIZE 64 typedef struct { int head; int size; double total_interval; DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE]; } DFG_INTERVAL_QUEUE; enum { RESOURCE_MODE = 0, // Resource availability mode. SCHEDULE_MODE // Decoding schedule mode. } UENUM1BYTE(DECODER_MODEL_MODE); enum { DECODER_MODEL_OK = 0, DECODE_BUFFER_AVAILABLE_LATE, DECODE_FRAME_BUF_UNAVAILABLE, DECODE_EXISTING_FRAME_BUF_EMPTY, DISPLAY_FRAME_LATE, SMOOTHING_BUFFER_UNDERFLOW, SMOOTHING_BUFFER_OVERFLOW, DECODER_MODEL_DISABLED } UENUM1BYTE(DECODER_MODEL_STATUS); #define BUFFER_POOL_MAX_SIZE 10 typedef struct { DECODER_MODEL_STATUS status; DECODER_MODEL_MODE mode; bool is_low_delay_mode; AV1_LEVEL level; int encoder_buffer_delay; // In units of 1/90000 seconds. int decoder_buffer_delay; // In units of 1/90000 seconds. int num_ticks_per_picture; int initial_display_delay; // In units of frames. int64_t decode_rate; double display_clock_tick; // In units of seconds. double current_time; // In units of seconds. double initial_presentation_delay; // In units of seconds. double bit_rate; // Bits per second. int num_frame; int num_decoded_frame; int num_shown_frame; int vbi[REF_FRAMES]; // Virtual buffer index. FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE]; DFG_INTERVAL_QUEUE dfg_interval_queue; // Information for the DFG(Decodable Frame Group) being processed. double first_bit_arrival_time; double last_bit_arrival_time; size_t coded_bits; // Information for the frame being processed. 
double removal_time; double presentation_time; int decode_samples; int display_samples; double max_display_rate; double max_decode_rate; } DECODER_MODEL; typedef struct { AV1LevelStats level_stats; AV1LevelSpec level_spec; FrameWindowBuffer frame_window_buffer; DECODER_MODEL decoder_models[SEQ_LEVELS]; } AV1LevelInfo; typedef struct AV1LevelParams { // Specifies the level that the coded video sequence conforms to for each // operating point. AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; // Bit mask to indicate whether to keep level stats for corresponding // operating points. uint32_t keep_level_stats; // Level information for each operating point. AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; } AV1LevelParams; static inline int is_in_operating_point(int operating_point, int temporal_layer_id, int spatial_layer_id) { if (!operating_point) return 1; return ((operating_point >> temporal_layer_id) & 1) && ((operating_point >> (spatial_layer_id + 8)) & 1); } void av1_init_level_info(struct AV1_COMP *cpi); void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start, int64_t ts_end); // Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS]. aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, const AV1LevelParams *level_params, int *seq_level_idx); aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, const AV1LevelParams *level_params, int *target_seq_level_idx); // This function uses the decoder model to check whether there could be // SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not // update the content of decoder_model, and can be used to target certain // encoding level in the recode loop. DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( const struct AV1_COMP *const cpi, size_t coded_bits, const DECODER_MODEL *const decoder_model); // Return max bitrate(bps) for given level. double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, BITSTREAM_PROFILE profile); // Get max number of tiles and tile columns for given level. void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, int *const max_tile_cols); // Return minimum compression ratio for given level. double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, int is_still_picture); #endif // AOM_AV1_ENCODER_LEVEL_H_ aom-3.12.1/av1/encoder/lookahead.c000066400000000000000000000166431477627663500166140ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/aom_config.h" #include "aom_scale/yv12config.h" #include "av1/common/common.h" #include "av1/encoder/encoder.h" #include "av1/encoder/extend.h" #include "av1/encoder/lookahead.h" /* Return the buffer at the given absolute index and increment the index */ static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) { int index = *idx; struct lookahead_entry *buf = ctx->buf + index; assert(index < ctx->max_sz); if (++index >= ctx->max_sz) index -= ctx->max_sz; *idx = index; return buf; } void av1_lookahead_destroy(struct lookahead_ctx *ctx) { if (ctx) { if (ctx->buf) { int i; for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img); free(ctx->buf); } free(ctx); } } struct lookahead_ctx *av1_lookahead_init( unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, bool is_all_intra, bool alloc_pyramid) { int lag_in_frames = AOMMAX(1, depth); // For all-intra frame encoding, previous source frames are not required. // Hence max_pre_frames is set to 0 in this case. As previous source frames // are accessed using a negative index to av1_lookahead_peek(), setting // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a // negative index. const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES; // Add the lags to depth and clamp depth += num_lap_buffers; depth = clamp(depth, 1, MAX_TOTAL_BUFFERS); // Allocate memory to keep previous source frames available. depth += max_pre_frames; // Allocate the lookahead structures struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx)); if (ctx) { unsigned int i; ctx->max_sz = depth; ctx->push_frame_count = 0; ctx->max_pre_frames = max_pre_frames; ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames; ctx->read_ctxs[ENCODE_STAGE].valid = 1; if (num_lap_buffers) { ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames; ctx->read_ctxs[LAP_STAGE].valid = 1; } ctx->buf = calloc(depth, sizeof(*ctx->buf)); if (!ctx->buf) goto fail; for (i = 0; i < depth; i++) { if (aom_realloc_frame_buffer( &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, NULL, alloc_pyramid, 0)) { goto fail; } } } return ctx; fail: av1_lookahead_destroy(ctx); return NULL; } int av1_lookahead_full(const struct lookahead_ctx *ctx) { // TODO(angiebird): Test this function. 
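// Editorial note (illustrative example, not part of the original libaom
// source): a worked example of the buffer sizing in av1_lookahead_init()
// above, assuming depth = 35, num_lap_buffers = 17 and is_all_intra = false:
//   lag_in_frames       = AOMMAX(1, 35)         = 35
//   depth               = clamp(35 + 17, 1, 96) = 52  (MAX_TOTAL_BUFFERS = 96)
//   max_sz              = 52 + MAX_PRE_FRAMES   = 53
//   ENCODE_STAGE pop_sz = 53 - 1 = 52,  LAP_STAGE pop_sz = lag_in_frames = 35
// For all-intra encoding max_pre_frames is 0, so no slot is reserved for
// previously encoded source frames.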
return ctx->read_ctxs[ENCODE_STAGE].sz >= ctx->read_ctxs[ENCODE_STAGE].pop_sz; } int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, bool alloc_pyramid, aom_enc_frame_flags_t flags) { int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; int uv_height = src->uv_crop_height; int subsampling_x = src->subsampling_x; int subsampling_y = src->subsampling_y; int larger_dimensions, new_dimensions; assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1); if (ctx->read_ctxs[ENCODE_STAGE].sz + ctx->max_pre_frames > ctx->max_sz) return 1; ctx->read_ctxs[ENCODE_STAGE].sz++; if (ctx->read_ctxs[LAP_STAGE].valid) { ctx->read_ctxs[LAP_STAGE].sz++; } struct lookahead_entry *buf = pop(ctx, &ctx->write_idx); new_dimensions = width != buf->img.y_crop_width || height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != buf->img.uv_crop_height; larger_dimensions = width > buf->img.y_crop_width || height > buf->img.y_crop_height || uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); if (larger_dimensions) { YV12_BUFFER_CONFIG new_img; memset(&new_img, 0, sizeof(new_img)); if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, subsampling_y, use_highbitdepth, AOM_BORDER_IN_PIXELS, 0, alloc_pyramid, 0)) return 1; aom_free_frame_buffer(&buf->img); buf->img = new_img; } else if (new_dimensions) { buf->img.y_width = src->y_width; buf->img.y_height = src->y_height; buf->img.uv_width = src->uv_width; buf->img.uv_height = src->uv_height; buf->img.y_crop_width = src->y_crop_width; buf->img.y_crop_height = src->y_crop_height; buf->img.uv_crop_width = src->uv_crop_width; buf->img.uv_crop_height = src->uv_crop_height; buf->img.subsampling_x = src->subsampling_x; buf->img.subsampling_y = src->subsampling_y; } av1_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start; buf->ts_end = ts_end; buf->display_idx = ctx->push_frame_count; buf->flags = flags; ++ctx->push_frame_count; aom_remove_metadata_from_frame_buffer(&buf->img); if (src->metadata && aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) { return 1; } return 0; } struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, COMPRESSOR_STAGE stage) { struct lookahead_entry *buf = NULL; if (ctx) { struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; assert(read_ctx->valid == 1); if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) { buf = pop(ctx, &read_ctx->read_idx); read_ctx->sz--; } } return buf; } struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, COMPRESSOR_STAGE stage) { struct lookahead_entry *buf = NULL; if (ctx == NULL) { return buf; } struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; assert(read_ctx->valid == 1); if (index >= 0) { // Forward peek if (index < read_ctx->sz) { index += read_ctx->read_idx; if (index >= ctx->max_sz) index -= ctx->max_sz; buf = ctx->buf + index; } } else if (index < 0) { // Backward peek if (-index <= ctx->max_pre_frames) { index += (int)(read_ctx->read_idx); if (index < 0) index += (int)(ctx->max_sz); buf = ctx->buf + index; } } return buf; } unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) { assert(ctx != NULL); struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; assert(read_ctx->valid == 1); return read_ctx->sz; } int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) { 
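// Editorial note (illustrative sketch, not part of the original libaom
// source): hypothetical use of the peek interface above. Index 0 peeks at
// the next frame to be encoded; a negative index reaches back at most
// max_pre_frames frames.
#if 0
  struct lookahead_entry *next = av1_lookahead_peek(ctx, 0, ENCODE_STAGE);
  struct lookahead_entry *prev = av1_lookahead_peek(ctx, -1, ENCODE_STAGE);
  // 'prev' is NULL when the context was created with is_all_intra == true,
  // because max_pre_frames is 0 in that configuration.
#endif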
assert(ctx != NULL); struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; assert(read_ctx->valid == 1); return read_ctx->pop_sz; } aom-3.12.1/av1/encoder/lookahead.h000066400000000000000000000114311477627663500166070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Describes look ahead buffer operations. */ #ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_ #define AOM_AV1_ENCODER_LOOKAHEAD_H_ #include #include "aom_scale/yv12config.h" #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ #define MAX_LAG_BUFFERS 48 #define MAX_LAP_BUFFERS 48 #define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS) #define LAP_LAG_IN_FRAMES 17 struct lookahead_entry { YV12_BUFFER_CONFIG img; int64_t ts_start; int64_t ts_end; int display_idx; aom_enc_frame_flags_t flags; }; // The max of past frames we want to keep in the queue. #define MAX_PRE_FRAMES 1 enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE); struct read_ctx { int sz; /* Number of buffers currently in the queue */ int read_idx; /* Read index */ int pop_sz; /* Size to check for pop condition */ int valid; /* Is this ctx valid? */ }; struct lookahead_ctx { int max_sz; /* Absolute size of the queue */ int write_idx; /* Write index */ struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */ struct lookahead_entry *buf; /* Buffer list */ int push_frame_count; /* Number of frames that have been pushed in the queue*/ uint8_t max_pre_frames; /* Maximum number of past frames allowed in the queue */ }; /*!\endcond */ /**\brief Initializes the lookahead stage * * The lookahead stage is a queue of frame buffers on which some analysis * may be done when buffers are enqueued. */ struct lookahead_ctx *av1_lookahead_init( unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, bool is_all_intra, bool alloc_pyramid); /**\brief Destroys the lookahead stage */ void av1_lookahead_destroy(struct lookahead_ctx *ctx); /**\brief Check if lookahead buffer is full */ int av1_lookahead_full(const struct lookahead_ctx *ctx); /**\brief Enqueue a source buffer * * This function will copy the source image into a new framebuffer with * the expected stride/border. 
* * \param[in] ctx Pointer to the lookahead context * \param[in] src Pointer to the image to enqueue * \param[in] ts_start Timestamp for the start of this frame * \param[in] ts_end Timestamp for the end of this frame * \param[in] use_highbitdepth Tell if HBD is used * \param[in] alloc_pyramid Whether to allocate a downsampling pyramid * for each frame buffer * \param[in] flags Flags set on this frame */ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, bool alloc_pyramid, aom_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode * * \param[in] ctx Pointer to the lookahead context * \param[in] drain Flag indicating the buffer should be drained * (return a buffer regardless of the current queue depth) * \param[in] stage Encoder stage * * \retval Return NULL, if drain set and queue is empty, or if drain not set and * queue not of the configured depth. */ struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, COMPRESSOR_STAGE stage); /**\brief Get a future source buffer to encode * * \param[in] ctx Pointer to the lookahead context * \param[in] index Index of the frame to be returned, 0 == next frame * \param[in] stage Encoder stage * * \retval Return NULL, if no buffer exists at the specified index */ struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, COMPRESSOR_STAGE stage); /**\brief Get the number of frames currently in the lookahead queue */ unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage); /**\brief Get pop_sz value */ int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_LOOKAHEAD_H_ aom-3.12.1/av1/encoder/mcomp.c000066400000000000000000004707501477627663500160030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/common.h" #include "av1/common/filter.h" #include "av1/common/mvref_common.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" static inline void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params, const MvCosts *mv_costs, const MV *ref_mv, int errorperbit, int sadperbit) { mv_cost_params->ref_mv = ref_mv; mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv); mv_cost_params->mv_cost_type = MV_COST_ENTROPY; mv_cost_params->error_per_bit = errorperbit; mv_cost_params->sad_per_bit = sadperbit; // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the // population of mvjcost and mvcost are avoided. In case of IntraBC, these // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode(). 
if (mv_costs != NULL) { mv_cost_params->mvjcost = mv_costs->nmv_joint_cost; mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0]; mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1]; } } static inline void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) { ms_buffers->ref = &x->e_mbd.plane[0].pre[0]; ms_buffers->src = &x->plane[0].src; av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0); ms_buffers->wsrc = x->obmc_buffer.wsrc; ms_buffers->obmc_mask = x->obmc_buffer.mask; } void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) { obmc_buffer->wsrc = NULL; obmc_buffer->mask = NULL; obmc_buffer->above_pred = NULL; obmc_buffer->left_pred = NULL; } void av1_make_default_fullpel_ms_params( FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv, const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], SEARCH_METHODS search_method, int fine_search_interval) { const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; const int is_key_frame = cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE; // High level params ms_params->bsize = bsize; ms_params->vfp = &cpi->ppi->fn_ptr[bsize]; init_ms_buffers(&ms_params->ms_buffers, x); av1_set_mv_search_method(ms_params, search_sites, search_method); ms_params->mesh_patterns[0] = mv_sf->mesh_patterns; ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns; ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh; ms_params->prune_mesh_search = (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0; ms_params->mesh_search_mv_diff_threshold = 4; ms_params->run_mesh_search = 0; ms_params->fine_search_interval = fine_search_interval; ms_params->is_intra_mode = 0; ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level; ms_params->mv_limits = x->mv_limits; av1_set_mv_search_range(&ms_params->mv_limits, ref_mv); // Mvcost params init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, x->errorperbit, x->sadperbit); ms_params->sdf = ms_params->vfp->sdf; ms_params->sdx4df = ms_params->vfp->sdx4df; ms_params->sdx3df = ms_params->vfp->sdx3df; if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) { assert(ms_params->vfp->sdsf != NULL); ms_params->sdf = ms_params->vfp->sdsf; assert(ms_params->vfp->sdsx4df != NULL); ms_params->sdx4df = ms_params->vfp->sdsx4df; // Skip version of sadx3 is not available yet ms_params->sdx3df = ms_params->vfp->sdsx4df; } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 && !is_key_frame) { FULLPEL_MV start_mv_clamped = start_mv; // adjust start_mv to make sure it is within MV range clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits); const struct buf_2d *const ref = ms_params->ms_buffers.ref; const int ref_stride = ref->stride; const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped); const struct buf_2d *const src = ms_params->ms_buffers.src; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows; assert(ms_params->vfp->sdsf != NULL); start_mv_sad_even_rows = ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride); start_mv_sad_odd_rows = ms_params->vfp->sdsf(src_buf + src_stride, src_stride, best_address + ref_stride, ref_stride); // If the absolute SAD difference computed between the pred-to-src of even // and odd rows is small, skip every other row in sad computation. 
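// Editorial note (illustrative example, not part of the original libaom
// source): numeric example of the even/odd row test below. If the even-row
// SAD at the start MV is 10000 and the odd-row SAD is 9800, the difference
// is 200 and 200 * 4 = 800 < 10000, so the two halves of the block look
// alike and the search switches to the downsampled (skip every other row)
// SAD functions; a large mismatch keeps the full SAD functions instead.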
const int odd_to_even_diff_sad = abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); const int mult_thresh = 4; if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { ms_params->sdf = ms_params->vfp->sdsf; assert(ms_params->vfp->sdsx4df != NULL); ms_params->sdx4df = ms_params->vfp->sdsx4df; ms_params->sdx3df = ms_params->vfp->sdsx4df; } } } void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const IntraBCMVCosts *dv_costs) { ms_params->is_intra_mode = 1; MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; mv_cost_params->mvjcost = dv_costs->joint_mv; mv_cost_params->mvcost[0] = dv_costs->dv_costs[0]; mv_cost_params->mvcost[1] = dv_costs->dv_costs[1]; } void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, const int *cost_list) { const AV1_COMMON *cm = &cpi->common; // High level params ms_params->allow_hp = cm->features.allow_high_precision_mv; ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop; ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step; ms_params->cost_list = cond_cost_list_const(cpi, cost_list); av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv); // Mvcost params init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, x->errorperbit, x->sadperbit); // Subpel variance params ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize]; ms_params->var_params.subpel_search_type = cpi->sf.mv_sf.use_accurate_subpel_search; ms_params->var_params.w = block_size_wide[bsize]; ms_params->var_params.h = block_size_high[bsize]; // Ref and src buffers MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers; init_ms_buffers(ms_buffers, x); } void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) { // Calculate the outermost full-pixel MVs which are inside the limits set by // av1_set_subpel_mv_search_range(). // // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar // for mv->row. We can then divide by 8 to find the fullpel MV limits. But // we have to be careful about the rounding. We want these bounds to be // at least as tight as the subpel limits, which means that we must round // the minimum values up and the maximum values down when dividing. int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL; int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL; int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1); row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1); col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1); row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1); // Get intersection of UMV window and valid MV window to reduce # of checks // in diamond search. if (mv_limits->col_min < col_min) mv_limits->col_min = col_min; if (mv_limits->col_max > col_max) mv_limits->col_max = col_max; if (mv_limits->row_min < row_min) mv_limits->row_min = row_min; if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max); mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max); } int av1_init_search_range(int size) { int sr = 0; // Minimum search size no matter what the passed in value. 
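// Worked example (illustrative; assumes MAX_FULL_PEL_VAL == 1023 and
// MAX_MVSEARCH_STEPS == 11 from mcomp.h): size = 64 stays at AOMMAX(16, 64) =
// 64 and the while loop stops at sr = 4 because 64 << 4 = 1024 >= 1023;
// size = 16 gives sr = 6. The final AOMMIN caps the result at 9.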
size = AOMMAX(16, size); while ((size << sr) < MAX_FULL_PEL_VAL) sr++; sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2); return sr; } // ============================================================================ // Cost of motion vectors // ============================================================================ // TODO(any): Adaptively adjust the regularization strength based on image size // and motion activity instead of using hard-coded values. It seems like we // roughly half the lambda for each increase in resolution // These are multiplier used to perform regularization in motion compensation // when x->mv_cost_type is set to MV_COST_L1. // LOWRES #define SSE_LAMBDA_LOWRES 2 // Used by mv_cost_err_fn #define SAD_LAMBDA_LOWRES 32 // Used by mvsad_err_cost during full pixel search // MIDRES #define SSE_LAMBDA_MIDRES 0 // Used by mv_cost_err_fn #define SAD_LAMBDA_MIDRES 15 // Used by mvsad_err_cost during full pixel search // HDRES #define SSE_LAMBDA_HDRES 1 // Used by mv_cost_err_fn #define SAD_LAMBDA_HDRES 8 // Used by mvsad_err_cost during full pixel search // Returns the rate of encoding the current motion vector based on the // joint_cost and comp_cost. joint_costs covers the cost of transmitting // JOINT_MV, and comp_cost covers the cost of transmitting the actual motion // vector. static inline int mv_cost(const MV *mv, const int *joint_cost, const int *const comp_cost[2]) { return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } #define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr)) // Returns the cost of encoding the motion vector diff := *mv - *ref. The cost // is defined as the rate required to encode diff * weight, rounded to the // nearest 2 ** 7. // This is NOT used during motion compensation. int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, int *const mvcost[2], int weight) { const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; return ROUND_POWER_OF_TWO( mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); } // Returns the cost of using the current mv during the motion search. This is // used when var is used as the error metric. #define PIXEL_TRANSFORM_ERROR_SCALE 4 static inline int mv_err_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, const int *const mvcost[2], int error_per_bit, MV_COST_TYPE mv_cost_type) { const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; const MV abs_diff = { abs(diff.row), abs(diff.col) }; switch (mv_cost_type) { case MV_COST_ENTROPY: if (mvcost) { return (int)ROUND_POWER_OF_TWO_64( (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + PIXEL_TRANSFORM_ERROR_SCALE); } return 0; case MV_COST_L1_LOWRES: return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3; case MV_COST_L1_MIDRES: return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3; case MV_COST_L1_HDRES: return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3; case MV_COST_NONE: return 0; default: assert(0 && "Invalid rd_cost_type"); return 0; } } static inline int mv_err_cost_(const MV *mv, const MV_COST_PARAMS *mv_cost_params) { if (mv_cost_params->mv_cost_type == MV_COST_NONE) { return 0; } return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, mv_cost_params->mvcost, mv_cost_params->error_per_bit, mv_cost_params->mv_cost_type); } // Returns the cost of using the current mv during the motion search. 
This is // only used during full pixel motion search when sad is used as the error // metric static inline int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, const int *mvjcost, const int *const mvcost[2], int sad_per_bit, MV_COST_TYPE mv_cost_type) { const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), GET_MV_SUBPEL(mv->col - ref_mv->col) }; switch (mv_cost_type) { case MV_COST_ENTROPY: return ROUND_POWER_OF_TWO( (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * sad_per_bit, AV1_PROB_COST_SHIFT); case MV_COST_L1_LOWRES: return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3; case MV_COST_L1_MIDRES: return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3; case MV_COST_L1_HDRES: return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3; case MV_COST_NONE: return 0; default: assert(0 && "Invalid rd_cost_type"); return 0; } } static inline int mvsad_err_cost_(const FULLPEL_MV *mv, const MV_COST_PARAMS *mv_cost_params) { return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, mv_cost_params->mvjcost, mv_cost_params->mvcost, mv_cost_params->sad_per_bit, mv_cost_params->mv_cost_type); } // ============================================================================= // Fullpixel Motion Search: Translational // ============================================================================= #define MAX_PATTERN_SCALES 11 #define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods. // level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND. static void init_dsmotion_compensation(search_site_config *cfg, int stride, int level) { int num_search_steps = 0; int stage_index = MAX_MVSEARCH_STEPS - 1; cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; cfg->site[stage_index][0].offset = 0; cfg->stride = stride; // Choose the initial step size depending on level. const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP; for (int radius = first_step; radius > 0;) { int num_search_pts = 8; const FULLPEL_MV search_site_mvs[13] = { { 0, 0 }, { -radius, 0 }, { radius, 0 }, { 0, -radius }, { 0, radius }, { -radius, -radius }, { radius, radius }, { -radius, radius }, { radius, -radius }, }; int i; for (i = 0; i <= num_search_pts; ++i) { search_site *const site = &cfg->site[stage_index][i]; site->mv = search_site_mvs[i]; site->offset = get_offset_from_fullmv(&site->mv, stride); } cfg->searches_per_step[stage_index] = num_search_pts; cfg->radius[stage_index] = radius; // Update the search radius based on level. if (!level || ((stage_index < 9) && level)) radius /= 2; --stage_index; ++num_search_steps; } cfg->num_search_steps = num_search_steps; } void av1_init_motion_fpf(search_site_config *cfg, int stride) { int num_search_steps = 0; int stage_index = MAX_MVSEARCH_STEPS - 1; cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; cfg->site[stage_index][0].offset = 0; cfg->stride = stride; for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { // Generate offsets for 8 search sites per step. 
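// Note: although the comment above says "8 search sites", the table below
// holds 12 candidate offsets per step (the four axis-aligned points plus eight
// offsets built from tan_radius); num_search_pts only drops to 8 when
// radius == 1.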
int tan_radius = AOMMAX((int)(0.41 * radius), 1); int num_search_pts = 12; if (radius == 1) num_search_pts = 8; const FULLPEL_MV search_site_mvs[13] = { { 0, 0 }, { -radius, 0 }, { radius, 0 }, { 0, -radius }, { 0, radius }, { -radius, -tan_radius }, { radius, tan_radius }, { -tan_radius, radius }, { tan_radius, -radius }, { -radius, tan_radius }, { radius, -tan_radius }, { tan_radius, radius }, { -tan_radius, -radius }, }; int i; for (i = 0; i <= num_search_pts; ++i) { search_site *const site = &cfg->site[stage_index][i]; site->mv = search_site_mvs[i]; site->offset = get_offset_from_fullmv(&site->mv, stride); } cfg->searches_per_step[stage_index] = num_search_pts; cfg->radius[stage_index] = radius; --stage_index; ++num_search_steps; } cfg->num_search_steps = num_search_steps; } // Search site initialization for NSTEP / NSTEP_8PT search methods. // level = 0: NSTEP, level = 1: NSTEP_8PT. static void init_motion_compensation_nstep(search_site_config *cfg, int stride, int level) { int num_search_steps = 0; int stage_index = 0; cfg->stride = stride; int radius = 1; const int num_stages = (level > 0) ? 16 : 15; for (stage_index = 0; stage_index < num_stages; ++stage_index) { int tan_radius = AOMMAX((int)(0.41 * radius), 1); int num_search_pts = 12; if ((radius <= 5) || (level > 0)) { tan_radius = radius; num_search_pts = 8; } const FULLPEL_MV search_site_mvs[13] = { { 0, 0 }, { -radius, 0 }, { radius, 0 }, { 0, -radius }, { 0, radius }, { -radius, -tan_radius }, { radius, tan_radius }, { -tan_radius, radius }, { tan_radius, -radius }, { -radius, tan_radius }, { radius, -tan_radius }, { tan_radius, radius }, { -tan_radius, -radius }, }; for (int i = 0; i <= num_search_pts; ++i) { search_site *const site = &cfg->site[stage_index][i]; site->mv = search_site_mvs[i]; site->offset = get_offset_from_fullmv(&site->mv, stride); } cfg->searches_per_step[stage_index] = num_search_pts; cfg->radius[stage_index] = radius; ++num_search_steps; if (stage_index < 12) radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1); } cfg->num_search_steps = num_search_steps; } // Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND // search methods. static void init_motion_compensation_bigdia(search_site_config *cfg, int stride, int level) { (void)level; cfg->stride = stride; // First scale has 4-closest points, the rest have 8 points in diamond // shape at increasing scales static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, }; // BIGDIA search method candidates. 
// Note that the largest candidate step at each scale is 2^scale /* clang-format off */ static const FULLPEL_MV site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }, { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, { -1, 1 }, { -2, 0 } }, { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, { -2, 2 }, { -4, 0 } }, { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, { -4, 4 }, { -8, 0 } }, { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, { -8, 8 }, { -16, 0 } }, { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, { 0, 32 }, { -16, 16 }, { -32, 0 } }, { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, { 0, 64 }, { -32, 32 }, { -64, 0 } }, { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, { 0, 128 }, { -64, 64 }, { -128, 0 } }, { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } }, { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } }, { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, }; /* clang-format on */ int radius = 1; for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { cfg->searches_per_step[i] = bigdia_num_candidates[i]; cfg->radius[i] = radius; for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { search_site *const site = &cfg->site[i][j]; site->mv = site_candidates[i][j]; site->offset = get_offset_from_fullmv(&site->mv, stride); } radius *= 2; } cfg->num_search_steps = MAX_PATTERN_SCALES; } // Search site initialization for SQUARE search method. static void init_motion_compensation_square(search_site_config *cfg, int stride, int level) { (void)level; cfg->stride = stride; // All scales have 8 closest points in square shape. static const int square_num_candidates[MAX_PATTERN_SCALES] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, }; // Square search method candidates. // Note that the largest candidate step at each scale is 2^scale. 
/* clang-format off */ static const FULLPEL_MV square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, { -1, 0 } }, { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, { -2, 2 }, { -2, 0 } }, { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, { -4, 4 }, { -4, 0 } }, { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, { -8, 8 }, { -8, 0 } }, { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, { 0, 16 }, { -16, 16 }, { -16, 0 } }, { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, { 0, 32 }, { -32, 32 }, { -32, 0 } }, { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, { 0, 64 }, { -64, 64 }, { -64, 0 } }, { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } }, { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } }, { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } }, { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, }; /* clang-format on */ int radius = 1; for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { cfg->searches_per_step[i] = square_num_candidates[i]; cfg->radius[i] = radius; for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { search_site *const site = &cfg->site[i][j]; site->mv = square_candidates[i][j]; site->offset = get_offset_from_fullmv(&site->mv, stride); } radius *= 2; } cfg->num_search_steps = MAX_PATTERN_SCALES; } // Search site initialization for HEX / FAST_HEX search methods. static void init_motion_compensation_hex(search_site_config *cfg, int stride, int level) { (void)level; cfg->stride = stride; // First scale has 8-closest points, the rest have 6 points in hex shape // at increasing scales. static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }; // Note that the largest candidate step at each scale is 2^scale. 
/* clang-format off */ static const FULLPEL_MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, { -1, 0 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } }, { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, { -32, 0 } }, { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, { -64, 0 } }, { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 }, { -128, 0 } }, { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 }, { -256, 0 } }, { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 }, { -512, 0 } }, { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, { -512, 1024 }, { -1024, 0 } }, }; /* clang-format on */ int radius = 1; for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { cfg->searches_per_step[i] = hex_num_candidates[i]; cfg->radius[i] = radius; for (int j = 0; j < hex_num_candidates[i]; ++j) { search_site *const site = &cfg->site[i][j]; site->mv = hex_candidates[i][j]; site->offset = get_offset_from_fullmv(&site->mv, stride); } radius *= 2; } cfg->num_search_steps = MAX_PATTERN_SCALES; } const av1_init_search_site_config av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = { init_dsmotion_compensation, init_motion_compensation_nstep, init_motion_compensation_nstep, init_dsmotion_compensation, init_motion_compensation_hex, init_motion_compensation_bigdia, init_motion_compensation_square }; // Checks whether the mv is within range of the mv_limits static inline int check_bounds(const FullMvLimits *mv_limits, int row, int col, int range) { return ((row - range) >= mv_limits->row_min) & ((row + range) <= mv_limits->row_max) & ((col - range) >= mv_limits->col_min) & ((col + range) <= mv_limits->col_max); } static inline int get_mvpred_var_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, FULLPEL_MV_STATS *mv_stats) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; const MV sub_this_mv = get_mv_from_fullmv(this_mv); const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; const int ref_stride = ref->stride; int bestsme; bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), ref_stride, &mv_stats->sse); mv_stats->distortion = bestsme; mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); bestsme += mv_stats->err_cost; return bestsme; } static inline int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct buf_2d *const src, const uint8_t *const ref_address, const int ref_stride) { const uint8_t *src_buf = src->buf; const int src_stride = src->stride; return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); } static inline int get_mvpred_compound_var_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, FULLPEL_MV_STATS *mv_stats) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; const int ref_stride = ref->stride; const uint8_t *mask = 
ms_params->ms_buffers.mask; const uint8_t *second_pred = ms_params->ms_buffers.second_pred; const int mask_stride = ms_params->ms_buffers.mask_stride; const int invert_mask = ms_params->ms_buffers.inv_mask; int bestsme; if (mask) { bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, src_buf, src_stride, second_pred, mask, mask_stride, invert_mask, &mv_stats->sse); } else if (second_pred) { bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, src_buf, src_stride, &mv_stats->sse, second_pred); } else { bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), ref_stride, &mv_stats->sse); } mv_stats->distortion = bestsme; const MV sub_this_mv = get_mv_from_fullmv(this_mv); mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); bestsme += mv_stats->err_cost; return bestsme; } static inline int get_mvpred_compound_sad( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct buf_2d *const src, const uint8_t *const ref_address, const int ref_stride) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; const uint8_t *mask = ms_params->ms_buffers.mask; const uint8_t *second_pred = ms_params->ms_buffers.second_pred; const int mask_stride = ms_params->ms_buffers.mask_stride; const int invert_mask = ms_params->ms_buffers.inv_mask; if (mask) { return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred, mask, mask_stride, invert_mask); } else if (second_pred) { assert(vfp->sdaf != NULL); return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred); } else { return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); } } // Calculates and returns a sad+mvcost list around an integer best pel during // fullpixel motion search. The resulting list can be used to speed up subpel // motion search later. #define USE_SAD_COSTLIST 1 // calc_int_cost_list uses var to populate the costlist, which is more accurate // than sad but slightly slower. static AOM_FORCE_INLINE void calc_int_cost_list( const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, int *cost_list) { static const FULLPEL_MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; const int br = best_mv.row; const int bc = best_mv.col; FULLPEL_MV_STATS mv_stats; cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats); if (check_bounds(&ms_params->mv_limits, br, bc, 1)) { for (int i = 0; i < 4; i++) { const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col }; cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); } } else { for (int i = 0; i < 4; i++) { const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col }; if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) { cost_list[i + 1] = INT_MAX; } else { cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); } } } } // calc_int_sad_list uses sad to populate the costlist, which is less accurate // than var but faster. 
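// Note: the neighbors[] tables used by calc_int_cost_list() above and
// calc_int_sad_list() below are in { row, col } order, so the four entries
// visit left { 0, -1 }, bottom { 1, 0 }, right { 0, 1 } and top { -1, 0 } of
// the best full-pel MV. This matches the cost_list[1..4] layout documented at
// the end of pattern_search().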
static AOM_FORCE_INLINE void calc_int_sad_list( const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, int *cost_list, int costlist_has_sad) { static const FULLPEL_MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }; const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const int ref_stride = ref->stride; const int br = best_mv.row; const int bc = best_mv.col; assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv)); // Refresh the costlist it does not contain valid sad if (!costlist_has_sad) { cost_list[0] = get_mvpred_sad( ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride); if (check_bounds(&ms_params->mv_limits, br, bc, 1)) { for (int i = 0; i < 4; i++) { const FULLPEL_MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; cost_list[i + 1] = get_mvpred_sad( ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); } } else { for (int i = 0; i < 4; i++) { const FULLPEL_MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { cost_list[i + 1] = INT_MAX; } else { cost_list[i + 1] = get_mvpred_sad( ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); } } } } const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params); for (int idx = 0; idx < 4; idx++) { if (cost_list[idx + 1] != INT_MAX) { const FULLPEL_MV this_mv = { br + neighbors[idx].row, bc + neighbors[idx].col }; cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params); } } } // Computes motion vector cost and adds to the sad cost. // Then updates the best sad and motion vectors. // Inputs: // this_sad: the sad to be evaluated. // mv: the current motion vector. // mv_cost_params: a structure containing information to compute mv cost. // best_sad: the current best sad. // raw_best_sad (optional): the current best sad without calculating mv cost. // best_mv: the current best motion vector. // second_best_mv (optional): the second best motion vector up to now. // Modifies: // best_sad, raw_best_sad, best_mv, second_best_mv // If the current sad is lower than the current best sad. // Returns: // Whether the input sad (mv) is better than the current best. static inline int update_mvs_and_sad(const unsigned int this_sad, const FULLPEL_MV *mv, const MV_COST_PARAMS *mv_cost_params, unsigned int *best_sad, unsigned int *raw_best_sad, FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { if (this_sad >= *best_sad) return 0; // Add the motion vector cost. const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params); if (sad < *best_sad) { if (raw_best_sad) *raw_best_sad = this_sad; *best_sad = sad; if (second_best_mv) *second_best_mv = *best_mv; *best_mv = *mv; return 1; } return 0; } // Calculate sad4 and update the bestmv information // in FAST_DIAMOND search method. 
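// Note: calc_sad4_update_bestmv() below leans on the sdx4df function pointer,
// which evaluates the SADs of four candidate offsets in one (typically SIMD)
// call. When a cost_list is supplied, 'sads' is aliased to cost_list + 1 so
// the four raw SADs land directly in cost_list[1..4].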
static inline void calc_sad4_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, int *best_site, int cand_start, int *cost_list) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const search_site *site = ms_params->search_sites->site[search_step]; unsigned char const *block_offset[4]; unsigned int sads_buf[4]; unsigned int *sads; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; if (cost_list) { sads = (unsigned int *)(cost_list + 1); } else { sads = sads_buf; } // Loop over number of candidates. for (int j = 0; j < 4; j++) block_offset[j] = site[cand_start + j].offset + center_address; // 4-point sad calculation. ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads); for (int j = 0; j < 4; j++) { const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row, center_mv.col + site[cand_start + j].mv.col }; const int found_better_mv = update_mvs_and_sad( sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, /*second_best_mv=*/NULL); if (found_better_mv) *best_site = cand_start + j; } } static inline void calc_sad3_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, int *best_site, const int *chkpts_indices, int *cost_list) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const search_site *site = ms_params->search_sites->site[search_step]; unsigned char const *block_offset[4] = { center_address + site[chkpts_indices[0]].offset, center_address + site[chkpts_indices[1]].offset, center_address + site[chkpts_indices[2]].offset, center_address, }; unsigned int sads[4]; ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads); for (int j = 0; j < 3; j++) { const int index = chkpts_indices[j]; const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, center_mv.col + site[index].mv.col }; const int found_better_mv = update_mvs_and_sad( sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, /*second_best_mv=*/NULL); if (found_better_mv) *best_site = j; } if (cost_list) { for (int j = 0; j < 3; j++) { int index = chkpts_indices[j]; cost_list[index + 1] = sads[j]; } } } // Calculate sad and update the bestmv information // in FAST_DIAMOND search method. static inline void calc_sad_update_bestmv( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, int *best_site, const int num_candidates, int cand_start, int *cost_list) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const search_site *site = ms_params->search_sites->site[search_step]; // Loop over number of candidates. 
for (int i = cand_start; i < num_candidates; i++) { const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row, center_mv.col + site[i].mv.col }; if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue; int thissad = get_mvpred_sad(ms_params, src, center_address + site[i].offset, ref->stride); if (cost_list) { cost_list[i + 1] = thissad; } const int found_better_mv = update_mvs_and_sad( thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, /*second_best_mv=*/NULL); if (found_better_mv) *best_site = i; } } static inline void calc_sad_update_bestmv_with_indices( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, const FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, int *best_site, const int num_candidates, const int *chkpts_indices, int *cost_list) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const search_site *site = ms_params->search_sites->site[search_step]; // Loop over number of candidates. for (int i = 0; i < num_candidates; i++) { int index = chkpts_indices[i]; const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, center_mv.col + site[index].mv.col }; if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { if (cost_list) { cost_list[index + 1] = INT_MAX; } continue; } const int thissad = get_mvpred_sad( ms_params, src, center_address + site[index].offset, ref->stride); if (cost_list) { cost_list[index + 1] = thissad; } const int found_better_mv = update_mvs_and_sad( thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, /*second_best_mv=*/NULL); if (found_better_mv) *best_site = i; } } // Generic pattern search function that searches over multiple scales. 
// Each scale can have a different number of candidates and shape of // candidates as indicated in the num_candidates and candidates arrays // passed into this function static int pattern_search(FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { static const int search_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, }; int i, s, t; const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const search_site_config *search_sites = ms_params->search_sites; const int *num_candidates = search_sites->searches_per_step; const int ref_stride = ref->stride; const int last_is_4 = num_candidates[0] == 4; int br, bc; unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX; int k = -1; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1); assert(search_step >= 0); int best_init_s = search_steps[search_step]; // adjust ref_mv to make sure it is within MV range clamp_fullmv(&start_mv, &ms_params->mv_limits); br = start_mv.row; bc = start_mv.col; if (cost_list != NULL) { cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; } int costlist_has_sad = 0; // Work out the start point for the search raw_bestsad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv), ref_stride); bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params); // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv); if (do_init_search) { s = best_init_s; best_init_s = -1; for (t = 0; t <= s; ++t) { int best_site = -1; FULLPEL_MV center_mv = { br, bc }; if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) { // Call 4-point sad for multiples of 4 candidates. const int no_of_4_cand_loops = num_candidates[t] >> 2; for (i = 0; i < no_of_4_cand_loops; i++) { calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, t, &best_site, i * 4, /*cost_list=*/NULL); } // Rest of the candidates const int remaining_cand = num_candidates[t] % 4; calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, t, &best_site, remaining_cand, no_of_4_cand_loops * 4, NULL); } else { calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, t, &best_site, num_candidates[t], 0, NULL); } if (best_site == -1) { continue; } else { best_init_s = t; k = best_site; } } if (best_init_s != -1) { br += search_sites->site[best_init_s][k].mv.row; bc += search_sites->site[best_init_s][k].mv.col; center_address += search_sites->site[best_init_s][k].offset; } } // If the center point is still the best, just skip this and move to // the refinement step. if (best_init_s != -1) { const int last_s = (last_is_4 && cost_list != NULL); int best_site = -1; s = best_init_s; for (; s >= last_s; s--) { // No need to search all points the 1st time if initial search was used if (!do_init_search || s != best_init_s) { FULLPEL_MV center_mv = { br, bc }; if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { // Call 4-point sad for multiples of 4 candidates. 
const int no_of_4_cand_loops = num_candidates[s] >> 2; for (i = 0; i < no_of_4_cand_loops; i++) { calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, i * 4, /*cost_list=*/NULL); } // Rest of the candidates const int remaining_cand = num_candidates[s] % 4; calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, remaining_cand, no_of_4_cand_loops * 4, NULL); } else { calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, num_candidates[s], 0, NULL); } if (best_site == -1) { continue; } else { br += search_sites->site[s][best_site].mv.row; bc += search_sites->site[s][best_site].mv.col; center_address += search_sites->site[s][best_site].offset; k = best_site; } } do { int next_chkpts_indices[PATTERN_CANDIDATES_REF]; best_site = -1; next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; next_chkpts_indices[1] = k; next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; FULLPEL_MV center_mv = { br, bc }; if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, next_chkpts_indices, NULL); } else { calc_sad_update_bestmv_with_indices( ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, next_chkpts_indices, NULL); } if (best_site != -1) { k = next_chkpts_indices[best_site]; br += search_sites->site[s][k].mv.row; bc += search_sites->site[s][k].mv.col; center_address += search_sites->site[s][k].offset; } } while (best_site != -1); } // Note: If we enter the if below, then cost_list must be non-NULL. if (s == 0) { cost_list[0] = raw_bestsad; costlist_has_sad = 1; assert(num_candidates[s] == 4); if (!do_init_search || s != best_init_s) { FULLPEL_MV center_mv = { br, bc }; if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, 0, cost_list); } else { calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, /*num_candidates=*/4, /*cand_start=*/0, cost_list); } if (best_site != -1) { br += search_sites->site[s][best_site].mv.row; bc += search_sites->site[s][best_site].mv.col; center_address += search_sites->site[s][best_site].offset; k = best_site; } } while (best_site != -1) { int next_chkpts_indices[PATTERN_CANDIDATES_REF]; best_site = -1; next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; next_chkpts_indices[1] = k; next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1; cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; cost_list[((k + 2) % 4) + 1] = cost_list[0]; cost_list[0] = raw_bestsad; FULLPEL_MV center_mv = { br, bc }; if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { assert(PATTERN_CANDIDATES_REF == 3); calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, next_chkpts_indices, cost_list); } else { calc_sad_update_bestmv_with_indices( ms_params, mv_cost_params, best_mv, center_mv, center_address, &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, next_chkpts_indices, cost_list); } if (best_site != -1) { k = next_chkpts_indices[best_site]; br += search_sites->site[s][k].mv.row; bc += search_sites->site[s][k].mv.col; center_address += search_sites->site[s][k].offset; } } } } best_mv->row = br; best_mv->col = bc; assert(center_address == get_buf_from_fullmv(ref, best_mv) && "center address is out of sync with best_mv!\n"); // Returns the one-away integer pel cost/sad around the best as follows: // cost_list[0]: cost/sad at the best integer pel // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel if (cost_list) { if (USE_SAD_COSTLIST) { calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); } else { calc_int_cost_list(*best_mv, ms_params, cost_list); } } const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats); return var_cost; } // For the following foo_search, the input arguments are: // start_mv: where we are starting our motion search // ms_params: a collection of motion search parameters // search_step: how many steps to skip in our motion search. For example, // a value 3 suggests that 3 search steps have already taken place prior to // this function call, so we jump directly to step 4 of the search process // do_init_search: if on, do an initial search of all possible scales around the // start_mv, and then pick the best scale. // cond_list: used to hold the cost around the best full mv so we can use it to // speed up subpel search later. 
// best_mv: the best mv found in the motion search static int hex_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return pattern_search(start_mv, ms_params, search_step, do_init_search, cost_list, best_mv, best_mv_stats); } static int bigdia_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return pattern_search(start_mv, ms_params, search_step, do_init_search, cost_list, best_mv, best_mv_stats); } static int square_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return pattern_search(start_mv, ms_params, search_step, do_init_search, cost_list, best_mv, best_mv_stats); } static int fast_hex_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return hex_search(start_mv, ms_params, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search, cost_list, best_mv, best_mv_stats); } static int vfast_dia_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return bigdia_search(start_mv, ms_params, AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step), do_init_search, cost_list, best_mv, best_mv_stats); } static int fast_dia_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return bigdia_search(start_mv, ms_params, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search, cost_list, best_mv, best_mv_stats); } static int fast_bigdia_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, const int do_init_search, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats) { return bigdia_search(start_mv, ms_params, AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step), do_init_search, cost_list, best_mv, best_mv_stats); } static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int search_step, int *num00, FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { #define UPDATE_SEARCH_STEP \ do { \ if (best_site != 0) { \ tmp_second_best_mv = *best_mv; \ best_mv->row += site[best_site].mv.row; \ best_mv->col += site[best_site].mv.col; \ best_address += site[best_site].offset; \ is_off_center = 1; \ } \ \ if (is_off_center == 0) num_center_steps++; \ \ if (best_site == 0 && step > 2) { \ int next_step_size = cfg->radius[step - 1]; \ while (next_step_size == cfg->radius[step] && step > 2) { \ num_center_steps++; \ --step; \ next_step_size = cfg->radius[step - 1]; \ } \ } \ } while (0) const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const uint8_t *src_buf = src->buf; const int src_stride = src->stride; const int ref_stride = ref->stride; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const search_site_config 
*cfg = ms_params->search_sites; int is_off_center = 0; // Number of times that we have stayed in the middle. This is used to skip // search steps in the future if diamond_search_sad is called again. int num_center_steps = 0; // search_step determines the length of the initial step and hence the number // of iterations. const int tot_steps = cfg->num_search_steps - search_step; FULLPEL_MV tmp_second_best_mv; if (second_best_mv) { tmp_second_best_mv = *second_best_mv; } *best_mv = start_mv; // Check the starting position const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); unsigned int bestsad = start_mv_sad; // TODO(chiyotsai@google.com): Implement 4 points search for msdf&sdaf if (ms_params->ms_buffers.second_pred) { for (int step = tot_steps - 1; step >= 0; --step) { const search_site *site = cfg->site[step]; const int num_searches = cfg->searches_per_step[step]; int best_site = 0; for (int idx = 1; idx <= num_searches; idx++) { const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, best_mv->col + site[idx].mv.col }; if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { const uint8_t *const check_here = site[idx].offset + best_address; unsigned int thissad = get_mvpred_compound_sad(ms_params, src, check_here, ref_stride); if (thissad < bestsad) { thissad += mvsad_err_cost_(&this_mv, mv_cost_params); if (thissad < bestsad) { bestsad = thissad; best_site = idx; } } } } UPDATE_SEARCH_STEP; } } else { for (int step = tot_steps - 1; step >= 0; --step) { const search_site *site = cfg->site[step]; const int num_searches = cfg->searches_per_step[step]; int best_site = 0; int all_in = 1; // Trap illegal vectors all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min; all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max; all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min; all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max; if (all_in) { for (int idx = 1; idx <= num_searches; idx += 4) { unsigned char const *block_offset[4]; unsigned int sads[4]; for (int j = 0; j < 4; j++) block_offset[j] = site[idx + j].offset + best_address; ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads); for (int j = 0; j < 4; j++) { if (sads[j] < bestsad) { const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row, best_mv->col + site[idx + j].mv.col }; unsigned int thissad = sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params); if (thissad < bestsad) { bestsad = thissad; best_site = idx + j; } } } } } else { for (int idx = 1; idx <= num_searches; idx++) { const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, best_mv->col + site[idx].mv.col }; if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { const uint8_t *const check_here = site[idx].offset + best_address; unsigned int thissad = get_mvpred_sad(ms_params, src, check_here, ref_stride); if (thissad < bestsad) { thissad += mvsad_err_cost_(&this_mv, mv_cost_params); if (thissad < bestsad) { bestsad = thissad; best_site = idx; } } } } } UPDATE_SEARCH_STEP; } } *num00 = num_center_steps; if (second_best_mv) { *second_best_mv = tmp_second_best_mv; } return bestsad; #undef UPDATE_SEARCH_STEP } static inline unsigned int get_start_mvpred_sad_cost( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) { const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); unsigned int start_mv_sad = 
mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params); if (ms_params->ms_buffers.second_pred) start_mv_sad += get_mvpred_compound_sad(ms_params, src, best_address, ref->stride); else start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride); return start_mv_sad; } static int full_pixel_diamond(FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, FULLPEL_MV *second_best_mv) { const search_site_config *cfg = ms_params->search_sites; int thissme, n, num00 = 0; // Clamp start mv and calculate the cost clamp_fullmv(&start_mv, &ms_params->mv_limits); unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv); diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv, second_best_mv); int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats); // If there won't be more n-step search, check to see if refining search is // needed. const int further_steps = cfg->num_search_steps - 1 - step_param; while (n < further_steps) { ++n; // TODO(chiyotsai@google.com): There is another bug here where the second // best mv gets incorrectly overwritten. Fix it later. FULLPEL_MV tmp_best_mv; FULLPEL_MV_STATS tmp_best_mv_stats; diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n, &num00, &tmp_best_mv, second_best_mv); thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv, &tmp_best_mv_stats); if (thissme < bestsme) { bestsme = thissme; *best_mv = tmp_best_mv; *best_mv_stats = tmp_best_mv_stats; } if (num00) { // Advance the loop by num00 steps n += num00; num00 = 0; } } // Return cost list. if (cost_list) { if (USE_SAD_COSTLIST) { const int costlist_has_sad = 0; calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); } else { calc_int_cost_list(*best_mv, ms_params, cost_list); } } return bestsme; } // Exhaustive motion search around a given centre position with a given // step size. static int exhaustive_mesh_search(FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int range, const int step, FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const struct buf_2d *const src = ms_params->ms_buffers.src; const struct buf_2d *const ref = ms_params->ms_buffers.ref; const int ref_stride = ref->stride; unsigned int best_sad = INT_MAX; int r, c, i; int start_col, end_col, start_row, end_row; const int col_step = (step > 1) ? step : 4; assert(step >= 1); clamp_fullmv(&start_mv, &ms_params->mv_limits); *best_mv = start_mv; best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv), ref_stride); best_sad += mvsad_err_cost_(&start_mv, mv_cost_params); start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row); start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col); end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row); end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col); for (r = start_row; r <= end_row; r += step) { for (c = start_col; c <= end_col; c += col_step) { // Step > 1 means we are not checking every location in this pass. 
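// Note on the loop below: col_step equals 'step' when step > 1 (one SAD per
// sampled position), and 4 when step == 1, in which case four consecutive
// columns are evaluated per iteration through a single sdx4df call, with a
// scalar tail loop handling the final partial group of columns.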
if (step > 1) { const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c }; unsigned int sad = get_mvpred_sad( ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad, /*raw_best_sad=*/NULL, best_mv, second_best_mv); } else { // 4 sads in a single call if we are checking every location if (c + 3 <= end_col) { unsigned int sads[4]; const uint8_t *addrs[4]; for (i = 0; i < 4; ++i) { const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; addrs[i] = get_buf_from_fullmv(ref, &mv); } ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads); for (i = 0; i < 4; ++i) { if (sads[i] < best_sad) { const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad, /*raw_best_sad=*/NULL, best_mv, second_best_mv); } } } else { for (i = 0; i < end_col - c; ++i) { const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; unsigned int sad = get_mvpred_sad( ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad, /*raw_best_sad=*/NULL, best_mv, second_best_mv); } } } } } return best_sad; } // Runs an limited range exhaustive mesh search using a pattern set // according to the encode speed profile. static int full_pixel_exhaustive(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct MESH_PATTERN *const mesh_patterns, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *mv_stats, FULLPEL_MV *second_best_mv) { const int kMinRange = 7; const int kMaxRange = 256; const int kMinInterval = 1; int bestsme; int i; int interval = mesh_patterns[0].interval; int range = mesh_patterns[0].range; int baseline_interval_divisor; // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref // version of sad and variance function. We still need to check the // performance when compound ref exhaustive search is enabled. assert(!ms_params->ms_buffers.second_pred && "Mesh search does not support compound mode!"); *best_mv = start_mv; // Trap illegal values for interval and range for this function. if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) || (interval > range)) return INT_MAX; baseline_interval_divisor = range / interval; // Check size of proposed first range against magnitude of the centre // value used as a starting point. range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4); range = AOMMIN(range, kMaxRange); interval = AOMMAX(interval, range / baseline_interval_divisor); // Use a small search step/interval for certain kind of clips. // For example, screen content clips with a lot of texts. // Large interval could lead to a false matching position, and it can't find // the best global candidate in following iterations due to reduced search // range. The solution here is to use a small search iterval in the beginning // and thus reduces the chance of missing the best candidate. if (ms_params->fine_search_interval) { interval = AOMMIN(interval, 4); } // initial search bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval, best_mv, second_best_mv); if ((interval > kMinInterval) && (range > kMinRange)) { // Progressive searches with range and step size decreasing each time // till we reach a step size of 1. Then break out. 
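// Illustrative example (hypothetical pattern values; the real ones come from
// mv_sf->mesh_patterns or intrabc_mesh_patterns): with patterns
// { range 64, interval 4 } and { range 16, interval 1 }, the initial
// exhaustive_mesh_search() above samples a +/-64 window every 4 pels, and the
// loop below then re-searches a +/-16 window around the refined best MV at
// every pel before the interval == 1 break terminates it.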
for (i = 1; i < MAX_MESH_STEP; ++i) { // First pass with coarser step and longer range bestsme = exhaustive_mesh_search( *best_mv, ms_params, mesh_patterns[i].range, mesh_patterns[i].interval, best_mv, second_best_mv); if (mesh_patterns[i].interval == 1) break; } } if (bestsme < INT_MAX) { bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats); } // Return cost list. if (cost_list) { if (USE_SAD_COSTLIST) { const int costlist_has_sad = 0; calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); } else { calc_int_cost_list(*best_mv, ms_params, cost_list); } } return bestsme; } // This function is called when we do joint motion search in comp_inter_inter // mode, or when searching for one component of an ext-inter compound mode. int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) { static const search_neighbors neighbors[8] = { { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 }, { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } }; uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] = { 0 }; int grid_center = SEARCH_GRID_CENTER_8P; int grid_coord = grid_center; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const FullMvLimits *mv_limits = &ms_params->mv_limits; const MSBuffers *ms_buffers = &ms_params->ms_buffers; const struct buf_2d *src = ms_buffers->src; const struct buf_2d *ref = ms_buffers->ref; const int ref_stride = ref->stride; *best_mv = start_mv; clamp_fullmv(best_mv, mv_limits); unsigned int best_sad = get_mvpred_compound_sad( ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride); best_sad += mvsad_err_cost_(best_mv, mv_cost_params); do_refine_search_grid[grid_coord] = 1; for (int i = 0; i < SEARCH_RANGE_8P; ++i) { int best_site = -1; for (int j = 0; j < 8; ++j) { grid_coord = grid_center + neighbors[j].coord_offset; if (do_refine_search_grid[grid_coord] == 1) { continue; } const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row, best_mv->col + neighbors[j].coord.col }; do_refine_search_grid[grid_coord] = 1; if (av1_is_fullmv_in_range(mv_limits, mv)) { unsigned int sad; sad = get_mvpred_compound_sad( ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); if (sad < best_sad) { sad += mvsad_err_cost_(&mv, mv_cost_params); if (sad < best_sad) { best_sad = sad; best_site = j; } } } } if (best_site == -1) { break; } else { best_mv->row += neighbors[best_site].coord.row; best_mv->col += neighbors[best_site].coord.col; grid_center += neighbors[best_site].coord_offset; } } return best_sad; } int av1_full_pixel_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, FULLPEL_MV *second_best_mv) { const BLOCK_SIZE bsize = ms_params->bsize; const SEARCH_METHODS search_method = ms_params->search_method; const int is_intra_mode = ms_params->is_intra_mode; int run_mesh_search = ms_params->run_mesh_search; int var = 0; MARK_MV_INVALID(best_mv); if (second_best_mv) { MARK_MV_INVALID(second_best_mv); } if (cost_list) { cost_list[0] = INT_MAX; cost_list[1] = INT_MAX; cost_list[2] = INT_MAX; cost_list[3] = INT_MAX; cost_list[4] = INT_MAX; } 
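// Summary of the flow below: (1) run the full-pel search selected by
// search_method; (2) for NSTEP / NSTEP_8PT decide whether a follow-on
// exhaustive mesh search is warranted, based on the block-size-scaled
// force_mesh_thresh and (optionally) the start-to-best MV distance; (3) if the
// row-skipping SAD was in use and disagrees too much with the full SAD, redo
// the entire search with skipping disabled; (4) otherwise run the mesh search
// if it was enabled.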
assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride); switch (search_method) { case FAST_BIGDIA: var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list, best_mv, best_mv_stats); break; case VFAST_DIAMOND: var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list, best_mv, best_mv_stats); break; case FAST_DIAMOND: var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list, best_mv, best_mv_stats); break; case FAST_HEX: var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list, best_mv, best_mv_stats); break; case HEX: var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv, best_mv_stats); break; case SQUARE: var = square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv, best_mv_stats); break; case BIGDIA: var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv, best_mv_stats); break; case NSTEP: case NSTEP_8PT: case DIAMOND: case CLAMPED_DIAMOND: var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list, best_mv, best_mv_stats, second_best_mv); break; default: assert(0 && "Invalid search method."); } // Should we allow a follow on exhaustive search? if (!run_mesh_search && ((search_method == NSTEP) || (search_method == NSTEP_8PT)) && !ms_params->ms_buffers.second_pred) { int exhaustive_thr = ms_params->force_mesh_thresh; exhaustive_thr >>= 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); // Threshold variance for an exhaustive full search. if (var > exhaustive_thr) run_mesh_search = 1; } // TODO(yunqing): the following is used to reduce mesh search in temporal // filtering. Can extend it to intrabc. if (!is_intra_mode && ms_params->prune_mesh_search) { const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row), abs(start_mv.col - best_mv->col)); if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) { run_mesh_search = 0; } } if (ms_params->sdf != ms_params->vfp->sdf) { // If we are skipping rows when we perform the motion search, we need to // check the quality of skipping. If it's bad, then we run mesh search with // skip row features off. // TODO(chiyotsai@google.com): Handle the case where we have a vertical // offset of 1 before we hit this statement to avoid having to redo // motion search. const struct buf_2d *src = ms_params->ms_buffers.src; const struct buf_2d *ref = ms_params->ms_buffers.ref; const int src_stride = src->stride; const int ref_stride = ref->stride; const uint8_t *src_address = src->buf; const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv); const int sad = ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride); const int skip_sad = ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride); // We will keep the result of skipping rows if it's good enough. Here, good // enough means the error is less than 1 per pixel. const int kSADThresh = 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) { // There is a large discrepancy between skipping and not skipping, so we // need to redo the motion search. 
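      // Concretely, the skipped-row result is kept only when |skip_sad - sad|
      // is below roughly 90% of the full SAD (or the SAD is already under a
      // small block-size dependent threshold); otherwise a copy of ms_params
      // with the full sdf/sdx4df/sdx3df functions is used to rerun the search.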
FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params; new_ms_params.sdf = new_ms_params.vfp->sdf; new_ms_params.sdx4df = new_ms_params.vfp->sdx4df; new_ms_params.sdx3df = new_ms_params.vfp->sdx3df; return av1_full_pixel_search(start_mv, &new_ms_params, step_param, cost_list, best_mv, best_mv_stats, second_best_mv); } } if (run_mesh_search) { int var_ex; FULLPEL_MV tmp_mv_ex; FULLPEL_MV_STATS tmp_mv_stats; // Pick the mesh pattern for exhaustive search based on the toolset (intraBC // or non-intraBC) // TODO(chiyotsai@google.com): There is a bug here where the second best mv // gets overwritten without actually comparing the rdcost. const MESH_PATTERN *const mesh_patterns = ms_params->mesh_patterns[is_intra_mode]; // TODO(chiyotsai@google.com): the second best mv is not set correctly by // full_pixel_exhaustive, which can incorrectly override it. var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list, &tmp_mv_ex, &tmp_mv_stats, second_best_mv); if (var_ex < var) { var = var_ex; *best_mv_stats = tmp_mv_stats; *best_mv = tmp_mv_ex; } } return var; } int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, IntraBCHashInfo *intrabc_hash_info, FULLPEL_MV *best_mv) { if (!av1_use_hash_me(cpi)) return INT_MAX; const BLOCK_SIZE bsize = ms_params->bsize; const int block_width = block_size_wide[bsize]; const int block_height = block_size_high[bsize]; if (block_width != block_height) return INT_MAX; const FullMvLimits *mv_limits = &ms_params->mv_limits; const MSBuffers *ms_buffer = &ms_params->ms_buffers; const uint8_t *src = ms_buffer->src->buf; const int src_stride = ms_buffer->src->stride; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int x_pos = mi_col * MI_SIZE; const int y_pos = mi_row * MI_SIZE; uint32_t hash_value1, hash_value2; int best_hash_cost = INT_MAX; // for the hashMap hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table; av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width, &hash_value1, &hash_value2, is_cur_buf_hbd(xd)); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); if (count <= 1) { return INT_MAX; } Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) { block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator)); if (hash_value2 == ref_block_hash.hash_value2) { // Make sure the prediction is from valid area. 
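      // A hash match only guarantees identical block contents; the implied
      // displacement must still be usable for intra block copy, so
      // av1_is_dv_valid() checks that the whole prediction block lies inside
      // the area intraBC is allowed to reference, and the full-pel mv limits
      // are checked separately below.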
const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, cpi->common.seq_params->mib_size_log2)) continue; FULLPEL_MV hash_mv; hash_mv.col = ref_block_hash.x - x_pos; hash_mv.row = ref_block_hash.y - y_pos; if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue; FULLPEL_MV_STATS mv_stats; const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats); if (refCost < best_hash_cost) { best_hash_cost = refCost; *best_mv = hash_mv; } } } return best_hash_cost; } int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl, int search_size, int full_search, int *sad) { int best_sad = INT_MAX; int this_sad; int d; int center, offset = 0; int bw = search_size << 1; if (full_search) { for (d = 0; d <= bw; d++) { this_sad = aom_vector_var(&ref[d], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; offset = d; } } center = offset; *sad = best_sad; return (center - (bw >> 1)); } for (d = 0; d <= bw; d += 16) { this_sad = aom_vector_var(&ref[d], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; offset = d; } } center = offset; for (d = -8; d <= 8; d += 16) { int this_pos = offset + d; // check limit if (this_pos < 0 || this_pos > bw) continue; this_sad = aom_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; } } offset = center; for (d = -4; d <= 4; d += 8) { int this_pos = offset + d; // check limit if (this_pos < 0 || this_pos > bw) continue; this_sad = aom_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; } } offset = center; for (d = -2; d <= 2; d += 4) { int this_pos = offset + d; // check limit if (this_pos < 0 || this_pos > bw) continue; this_sad = aom_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; } } offset = center; for (d = -1; d <= 1; d += 2) { int this_pos = offset + d; // check limit if (this_pos < 0 || this_pos > bw) continue; this_sad = aom_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; } } *sad = best_sad; return (center - (bw >> 1)); } // A special fast version of motion search used in rt mode. // The search window along columns and row is given by: // +/- me_search_size_col/row. unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *ref_mv, unsigned int *y_sad_zero, int me_search_size_col, int me_search_size_row) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; int idx; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; const int full_search = is_screen; const bool screen_scroll_superblock = is_screen && bsize == cm->seq_params->sb_size; // Keep border a multiple of 16. const int border = (cpi->oxcf.border_in_pixels >> 4) << 4; int search_size_width = me_search_size_col; int search_size_height = me_search_size_row; // Adjust based on boundary. 
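  // If the requested window would reach more than `border` pixels outside the
  // frame on either side, fall back to the (16-pixel aligned) border as the
  // search size in that direction so the 1-D projections below stay within the
  // padded reference area.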
if (((mi_col << 2) - search_size_width < -border) || ((mi_col << 2) + search_size_width > cm->width + border)) search_size_width = border; if (((mi_row << 2) - search_size_height < -border) || ((mi_row << 2) + search_size_height > cm->height + border)) search_size_height = border; const int src_stride = x->plane[0].src.stride; const int ref_stride = xd->plane[0].pre[0].stride; uint8_t const *ref_buf, *src_buf; int_mv *best_int_mv = &xd->mi[0]->mv[0]; unsigned int best_sad, tmp_sad, this_sad[4]; int best_sad_col, best_sad_row; const int row_norm_factor = mi_size_high_log2[bsize] + 1; const int col_norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); static const MV search_pos[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, }; if (scaled_ref_frame) { int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, MAX_MB_PLANE); } if (xd->bd != 8) { best_int_mv->as_fullmv = kZeroFullMv; best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } return best_sad; } const int width_ref_buf = (search_size_width << 1) + bw; const int height_ref_buf = (search_size_height << 1) + bh; int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf)); int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf)); int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf)); int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf)); if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) { aom_free(hbuf); aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); } // Set up prediction 1-D reference set for rows. ref_buf = xd->plane[0].pre[0].buf - search_size_width; aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh, row_norm_factor); // Set up prediction 1-D reference set for cols ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride; aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf, col_norm_factor); // Set up src 1-D reference set src_buf = x->plane[0].src.buf; aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor); aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor); // Find the best match per 1-D search best_int_mv->as_fullmv.col = av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width, full_search, &best_sad_col); best_int_mv->as_fullmv.row = av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height, full_search, &best_sad_row); // For screen: select between horiz or vert motion. if (is_screen) { if (best_sad_col < best_sad_row) best_int_mv->as_fullmv.row = 0; else best_int_mv->as_fullmv.col = 0; } FULLPEL_MV this_mv = best_int_mv->as_fullmv; src_buf = x->plane[0].src.buf; ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); best_sad = cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); // Evaluate zero MV if found MV is non-zero. 
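  // The SAD of the (0, 0) full-pel mv is always reported back through
  // y_sad_zero: if the projection-based candidate is non-zero it is
  // re-measured here and takes over as the best mv when it is strictly
  // cheaper; otherwise the best SAD already is the zero-mv SAD.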
if (best_int_mv->as_int != 0) { tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); *y_sad_zero = tmp_sad; if (tmp_sad < best_sad) { best_int_mv->as_fullmv = kZeroFullMv; this_mv = best_int_mv->as_fullmv; ref_buf = xd->plane[0].pre[0].buf; best_sad = tmp_sad; } } else { *y_sad_zero = best_sad; } if (!screen_scroll_superblock) { const uint8_t *const pos[4] = { ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, }; cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); for (idx = 0; idx < 4; ++idx) { if (this_sad[idx] < best_sad) { best_sad = this_sad[idx]; best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row; best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col; } } if (this_sad[0] < this_sad[3]) this_mv.row -= 1; else this_mv.row += 1; if (this_sad[1] < this_sad[2]) this_mv.col -= 1; else this_mv.col += 1; ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); if (best_sad > tmp_sad) { best_int_mv->as_fullmv = this_mv; best_sad = tmp_sad; } } FullMvLimits mv_limits = x->mv_limits; av1_set_mv_search_range(&mv_limits, ref_mv); clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits); convert_fullmv_to_mv(best_int_mv); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } aom_free(hbuf); aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); return best_sad; } // ============================================================================= // Fullpixel Motion Search: OBMC // ============================================================================= static inline int get_obmc_mvpred_var( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { const aom_variance_fn_ptr_t *vfp = ms_params->vfp; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const MSBuffers *ms_buffers = &ms_params->ms_buffers; const int32_t *wsrc = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const struct buf_2d *ref_buf = ms_buffers->ref; const MV mv = get_mv_from_fullmv(this_mv); unsigned int unused; return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc, mask, &unused) + mv_err_cost_(&mv, mv_cost_params); } static int obmc_refining_search_sad( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) { const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const MSBuffers *ms_buffers = &ms_params->ms_buffers; const int32_t *wsrc = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const struct buf_2d *ref_buf = ms_buffers->ref; const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; const int kSearchRange = 8; unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv), ref_buf->stride, wsrc, mask) + mvsad_err_cost_(best_mv, mv_cost_params); for (int i = 0; i < kSearchRange; i++) { int best_site = -1; for (int j = 0; j < 4; j++) { const FULLPEL_MV mv = { best_mv->row + neighbors[j].row, best_mv->col + neighbors[j].col }; if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv), ref_buf->stride, wsrc, mask); if (sad < best_sad) { sad += mvsad_err_cost_(&mv, mv_cost_params); if (sad < best_sad) { best_sad = sad; best_site = j; } } } } if (best_site == -1) { break; } else { best_mv->row += 
neighbors[best_site].row; best_mv->col += neighbors[best_site].col; } } return best_sad; } static int obmc_diamond_search_sad( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv, FULLPEL_MV *best_mv, int search_step, int *num00) { const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; const search_site_config *cfg = ms_params->search_sites; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const MSBuffers *ms_buffers = &ms_params->ms_buffers; const int32_t *wsrc = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const struct buf_2d *const ref_buf = ms_buffers->ref; // search_step determines the length of the initial step and hence the number // of iterations. const int tot_steps = cfg->num_search_steps - search_step; const uint8_t *best_address, *init_ref; int best_sad = INT_MAX; int best_site = 0; clamp_fullmv(&start_mv, &ms_params->mv_limits); best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv); *num00 = 0; *best_mv = start_mv; // Check the starting position best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) + mvsad_err_cost_(best_mv, mv_cost_params); for (int step = tot_steps - 1; step >= 0; --step) { const search_site *const site = cfg->site[step]; best_site = 0; for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) { const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row, best_mv->col + site[idx].mv.col }; if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride, wsrc, mask); if (sad < best_sad) { sad += mvsad_err_cost_(&mv, mv_cost_params); if (sad < best_sad) { best_sad = sad; best_site = idx; } } } } if (best_site != 0) { best_mv->row += site[best_site].mv.row; best_mv->col += site[best_site].mv.col; best_address += site[best_site].offset; } else if (best_address == init_ref) { (*num00)++; } } return best_sad; } static int obmc_full_pixel_diamond( const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, int step_param, FULLPEL_MV *best_mv) { const search_site_config *cfg = ms_params->search_sites; FULLPEL_MV tmp_mv; int thissme, n, num00 = 0; int bestsme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n); if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv); *best_mv = tmp_mv; // If there won't be more n-step search, check to see if refining search is // needed. 
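  // num00, as reported by the previous call, roughly counts how many of the
  // early diamond steps never moved away from the search origin; that many
  // restarts with the corresponding step sizes are skipped below instead of
  // being searched again.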
const int further_steps = cfg->num_search_steps - 1 - step_param; while (n < further_steps) { ++n; if (num00) { num00--; } else { thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param + n, &num00); if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); if (thissme < bestsme) { bestsme = thissme; *best_mv = tmp_mv; } } } return bestsme; } int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, FULLPEL_MV *best_mv) { if (!ms_params->fast_obmc_search) { const int bestsme = obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv); return bestsme; } else { *best_mv = start_mv; clamp_fullmv(best_mv, &ms_params->mv_limits); int thissme = obmc_refining_search_sad(ms_params, best_mv); if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv); return thissme; } } // ============================================================================= // Subpixel Motion Search: Translational // ============================================================================= #define INIT_SUBPEL_STEP_SIZE (4) /* * To avoid the penalty for crossing cache-line read, preload the reference * area in a small buffer, which is aligned to make sure there won't be crossing * cache-line read while reading from this buffer. This reduced the cpu * cycles spent on reading ref data in sub-pixel filter functions. * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we * could reduce the area. */ // Returns the subpel offset used by various subpel variance functions [m]sv[a]f static inline int get_subpel_part(int x) { return x & 7; } // Gets the address of the ref buffer at subpel location (r, c), rounded to the // nearest fullpel precision toward - \infty static inline const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV mv) { const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); return &buf->buf[offset]; } // Estimates the variance of prediction residue using bilinear filter for fast // search. static inline int estimated_pref_error( const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; const MSBuffers *ms_buffers = &var_params->ms_buffers; const uint8_t *src = ms_buffers->src->buf; const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); const int src_stride = ms_buffers->src->stride; const int ref_stride = ms_buffers->ref->stride; const uint8_t *second_pred = ms_buffers->second_pred; const uint8_t *mask = ms_buffers->mask; const int mask_stride = ms_buffers->mask_stride; const int invert_mask = ms_buffers->inv_mask; const int subpel_x_q3 = get_subpel_part(this_mv->col); const int subpel_y_q3 = get_subpel_part(this_mv->row); if (second_pred == NULL) { return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, sse); } else if (mask) { return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, second_pred, mask, mask_stride, invert_mask, sse); } else { return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, sse, second_pred); } } // Calculates the variance of prediction residue. 
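// Unlike estimated_pref_error() above, this builds the real sub-pel prediction
// with aom_upsampled_pred() (or its high-bitdepth / compound variants) before
// measuring the error. In both cases the mv is split into a full-pel part and
// a 1/8-pel phase, e.g. for mv.col = 19 (in 1/8-pel units):
//   ref offset : 19 >> 3 == 2 full pixels
//   phase      : 19 &  7 == 3, i.e. 3/8 of a pixel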
static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; const MSBuffers *ms_buffers = &var_params->ms_buffers; const uint8_t *src = ms_buffers->src->buf; const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); const int src_stride = ms_buffers->src->stride; const int ref_stride = ms_buffers->ref->stride; const uint8_t *second_pred = ms_buffers->second_pred; const uint8_t *mask = ms_buffers->mask; const int mask_stride = ms_buffers->mask_stride; const int invert_mask = ms_buffers->inv_mask; const int w = var_params->w; const int h = var_params->h; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int subpel_x_q3 = get_subpel_part(this_mv->col); const int subpel_y_q3 = get_subpel_part(this_mv->row); unsigned int besterr; #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); if (second_pred != NULL) { if (mask) { aom_highbd_comp_mask_upsampled_pred( xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, invert_mask, xd->bd, subpel_search_type); } else { aom_highbd_comp_avg_upsampled_pred( xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, subpel_search_type); } } else { aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, subpel_search_type); } besterr = vfp->vf(pred8, w, src, src_stride, sse); } else { DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); if (second_pred != NULL) { if (mask) { aom_comp_mask_upsampled_pred( xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, invert_mask, subpel_search_type); } else { aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); } } else { aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); } besterr = vfp->vf(pred, w, src, src_stride, sse); } #else DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); if (second_pred != NULL) { if (mask) { aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, invert_mask, subpel_search_type); } else { aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); } } else { aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); } besterr = vfp->vf(pred, w, src, src_stride, sse); #endif return besterr; } // Estimates whether this_mv is better than best_mv. This function incorporates // both prediction error and residue into account. It is suffixed "fast" because // it uses bilinear filter to estimate the prediction. 
static inline unsigned int check_better_fast( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) { unsigned int cost; if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { unsigned int sse; int thismse; if (is_scaled) { thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); } else { thismse = estimated_pref_error(this_mv, var_params, &sse); } cost = mv_err_cost_(this_mv, mv_cost_params); cost += thismse; if (cost < *besterr) { *besterr = cost; *best_mv = *this_mv; *distortion = thismse; *sse1 = sse; *has_better_mv |= 1; } } else { cost = INT_MAX; } return cost; } // Checks whether this_mv is better than best_mv. This function takes both // prediction error and mv cost into account. static AOM_FORCE_INLINE unsigned int check_better( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int *is_better) { unsigned int cost; if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { unsigned int sse; int thismse; thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); cost = mv_err_cost_(this_mv, mv_cost_params); cost += thismse; if (cost < *besterr) { *besterr = cost; *best_mv = *this_mv; *distortion = thismse; *sse1 = sse; *is_better |= 1; } } else { cost = INT_MAX; } return cost; } static inline MV get_best_diag_step(int step_size, unsigned int left_cost, unsigned int right_cost, unsigned int up_cost, unsigned int down_cost) { const MV diag_step = { up_cost <= down_cost ? -step_size : step_size, left_cost <= right_cost ? -step_size : step_size }; return diag_step; } // Searches the four cardinal directions for a better mv, then follows up with a // search in the best quadrant. This uses a bilinear filter to speed up the // calculation.
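// For example, if the left and top candidates come back cheaper than the right
// and bottom ones, get_best_diag_step() returns { -hstep, -hstep } and the one
// extra point probed is the top-left neighbor of this_mv; best_mv is only
// moved to one of the probed points when its combined cost beats the current
// best.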
static AOM_FORCE_INLINE MV first_level_check_fast( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int is_scaled) { // Check the four cardinal directions const MV left_mv = { this_mv.row, this_mv.col - hstep }; int dummy = 0; const unsigned int left = check_better_fast( xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); const MV right_mv = { this_mv.row, this_mv.col + hstep }; const unsigned int right = check_better_fast( xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); const MV top_mv = { this_mv.row - hstep, this_mv.col }; const unsigned int up = check_better_fast( xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; const unsigned int down = check_better_fast( xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); const MV diag_step = get_best_diag_step(hstep, left, right, up, down); const MV diag_mv = { this_mv.row + diag_step.row, this_mv.col + diag_step.col }; // Check the diagonal direction with the best mv check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); return diag_step; } // Performs a following up search after first_level_check_fast is called. This // performs two extra chess pattern searches in the best quadrant. static AOM_FORCE_INLINE void second_level_check_fast( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int is_scaled) { assert(diag_step.row == hstep || diag_step.row == -hstep); assert(diag_step.col == hstep || diag_step.col == -hstep); const int tr = this_mv.row; const int tc = this_mv.col; const int br = best_mv->row; const int bc = best_mv->col; int dummy = 0; if (tr != br && tc != bc) { assert(diag_step.col == bc - tc); assert(diag_step.row == br - tr); const MV chess_mv_1 = { br, bc + diag_step.col }; const MV chess_mv_2 = { br + diag_step.row, bc }; check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); } else if (tr == br && tc != bc) { assert(diag_step.col == bc - tc); // Continue searching in the best direction const MV bottom_long_mv = { br + hstep, bc + diag_step.col }; const MV top_long_mv = { br - hstep, bc + diag_step.col }; check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); // Search in the direction opposite of the best quadrant const MV rev_mv = { br - diag_step.row, bc }; check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); } else if (tr != 
br && tc == bc) { assert(diag_step.row == br - tr); // Continue searching in the best direction const MV right_long_mv = { br + diag_step.row, bc + hstep }; const MV left_long_mv = { br + diag_step.row, bc - hstep }; check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); // Search in the direction opposite of the best quadrant const MV rev_mv = { br, bc - diag_step.col }; check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy, is_scaled); } } // Combines first level check and second level check when applicable. This first // searches the four cardinal directions, and perform several // diagonal/chess-pattern searches in the best quadrant. static AOM_FORCE_INLINE void two_level_checks_fast( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int iters, int is_scaled) { const MV diag_step = first_level_check_fast( xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, is_scaled); if (iters > 1) { second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, is_scaled); } } static AOM_FORCE_INLINE MV first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion) { int dummy = 0; const MV left_mv = { this_mv.row, this_mv.col - hstep }; const MV right_mv = { this_mv.row, this_mv.col + hstep }; const MV top_mv = { this_mv.row - hstep, this_mv.col }; const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; const unsigned int left = check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int right = check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int up = check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int down = check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const MV diag_step = get_best_diag_step(hstep, left, right, up, down); const MV diag_mv = { this_mv.row + diag_step.row, this_mv.col + diag_step.col }; // Check the diagonal direction with the best mv check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); return diag_step; } // A newer version of second level check that gives better quality. 
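// Starting from the point chosen by first_level_check(), it probes the
// row-biased and column-biased neighbors inside the winning quadrant (with
// the diagonal direction flipped when the first level only moved along one
// axis), and only spends a third check on the full diagonal point when one of
// those two probes finds an improvement.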
// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different // from av1_find_best_sub_pixel_tree static AOM_FORCE_INLINE void second_level_check_v2( MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int is_scaled) { assert(best_mv->row == this_mv.row + diag_step.row || best_mv->col == this_mv.col + diag_step.col); if (CHECK_MV_EQUAL(this_mv, *best_mv)) { return; } else if (this_mv.row == best_mv->row) { // Search away from diagonal step since diagonal search did not provide any // improvement diag_step.row *= -1; } else if (this_mv.col == best_mv->col) { diag_step.col *= -1; } const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; const MV diag_bias_mv = { best_mv->row + diag_step.row, best_mv->col + diag_step.col }; int has_better_mv = 0; if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); // Do an additional search if the second iteration gives a better mv if (has_better_mv) { check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); } } else { check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv, is_scaled); check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv, is_scaled); // Do an additional search if the second iteration gives a better mv if (has_better_mv) { check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv, is_scaled); } } } // Gets the error at the beginning when the mv has fullpel precision static unsigned int setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; const int w = var_params->w; const int h = var_params->h; const MSBuffers *ms_buffers = &var_params->ms_buffers; const uint8_t *src = ms_buffers->src->buf; const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv); const int src_stride = ms_buffers->src->stride; const int y_stride = ms_buffers->ref->stride; const uint8_t *second_pred = ms_buffers->second_pred; const uint8_t *mask = ms_buffers->mask; const int mask_stride = ms_buffers->mask_stride; const int invert_mask = ms_buffers->inv_mask; unsigned int besterr; if (second_pred != NULL) { #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); if (mask) { aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, mask_stride, invert_mask); } else { aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); } besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); if (mask) { aom_comp_mask_pred(comp_pred, second_pred, w, h, y, 
y_stride, mask, mask_stride, invert_mask); } else { aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); } besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } #else (void)xd; DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); if (mask) { aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, mask_stride, invert_mask); } else { aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); } besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); #endif } else { besterr = vfp->vf(y, y_stride, src, src_stride, sse1); } *distortion = besterr; besterr += mv_err_cost_(bestmv, mv_cost_params); return besterr; } // Gets the error at the beginning when the mv has fullpel precision static unsigned int upsampled_setup_center_error( MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1); *distortion = besterr; besterr += mv_err_cost_(bestmv, mv_cost_params); return besterr; } static inline int divide_and_round(int n, int d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } static inline int is_cost_list_wellbehaved(const int *cost_list) { return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; } // Returns surface minima estimate at given precision in 1/2^n bits. // Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C // For a given set of costs S0, S1, S2, S3, S4 at points // (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, // the solution for the location of the minima (x0, y0) is given by: // x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. static inline void get_cost_surf_min(const int *cost_list, int *ir, int *ic, int bits) { *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), (cost_list[1] - 2 * cost_list[0] + cost_list[3])); *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), (cost_list[4] - 2 * cost_list[0] + cost_list[2])); } // Checks the list of mvs searched in the last iteration and see if we are // repeating it. If so, return 1. Otherwise we update the last_mv_search_list // with current_mv and return 0. 
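// The subpel searches below call this once per refinement level with the mv
// they are about to refine; returning 1 (and hence INT_MAX from the search)
// signals the caller that the same refinement has already been run, so it can
// keep the earlier result rather than repeating the work.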
static inline int check_repeated_mv_and_update(int_mv *last_mv_search_list, const MV current_mv, int iter) { if (last_mv_search_list) { if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) { return 1; } last_mv_search_list[iter].as_mv = current_mv; } return 0; } static inline int setup_center_error_facade( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion, int is_scaled) { if (is_scaled) { return upsampled_setup_center_error(xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); } else { return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, distortion); } } int av1_find_best_sub_pixel_tree_pruned_more( MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { (void)cm; const int allow_hp = ms_params->allow_hp; const int forced_stop = ms_params->forced_stop; const int iters_per_step = ms_params->iters_per_step; const int *cost_list = ms_params->cost_list; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; // The iteration we are current searching for. Iter 0 corresponds to fullpel // mv, iter 1 to half pel, and so on int iter = 0; int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel unsigned int besterr = INT_MAX; *bestmv = start_mv; const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) ? &cm->sf_identity : xd->block_ref_scale_factors[0]; const int is_scaled = av1_is_scaled(sf); if (start_mv_stats != NULL && !is_scaled) { besterr = start_mv_stats->distortion + start_mv_stats->err_cost; *distortion = start_mv_stats->distortion; *sse1 = start_mv_stats->sse; } else { besterr = setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled); } // If forced_stop is FULL_PEL, return. 
if (forced_stop == FULL_PEL) return besterr; if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; get_cost_surf_min(cost_list, &ir, &ic, 1); if (ir != 0 || ic != 0) { const MV this_mv = { start_mv.row + ir * hstep, start_mv.col + ic * hstep }; int dummy = 0; check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); } } else { two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } // Each subsequent iteration checks at least one point in common with // the last iteration could be 2 ( if diag selected) 1/4 pel if (forced_stop < HALF_PEL) { if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; hstep >>= 1; start_mv = *bestmv; two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } if (allow_hp && forced_stop == EIGHTH_PEL) { if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; hstep >>= 1; start_mv = *bestmv; two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } return besterr; } int av1_find_best_sub_pixel_tree_pruned( MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { (void)cm; (void)start_mv_stats; const int allow_hp = ms_params->allow_hp; const int forced_stop = ms_params->forced_stop; const int iters_per_step = ms_params->iters_per_step; const int *cost_list = ms_params->cost_list; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; // The iteration we are current searching for. Iter 0 corresponds to fullpel // mv, iter 1 to half pel, and so on int iter = 0; int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel unsigned int besterr = INT_MAX; *bestmv = start_mv; const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) ? &cm->sf_identity : xd->block_ref_scale_factors[0]; const int is_scaled = av1_is_scaled(sf); if (start_mv_stats != NULL && !is_scaled) { besterr = start_mv_stats->distortion + start_mv_stats->err_cost; *distortion = start_mv_stats->distortion; *sse1 = start_mv_stats->sse; } else { besterr = setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion, is_scaled); } // If forced_stop is FULL_PEL, return. if (forced_stop == FULL_PEL) return besterr; if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX) { const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) + (cost_list[2] < cost_list[4] ? 
0 : 2); const MV left_mv = { start_mv.row, start_mv.col - hstep }; const MV right_mv = { start_mv.row, start_mv.col + hstep }; const MV bottom_mv = { start_mv.row + hstep, start_mv.col }; const MV top_mv = { start_mv.row - hstep, start_mv.col }; const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep }; const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep }; const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep }; const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep }; int dummy = 0; switch (whichdir) { case 0: // bottom left quadrant check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); break; case 1: // bottom right quadrant check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); break; case 2: // top left quadrant check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); break; case 3: // top right quadrant check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, &dummy, is_scaled); break; } } else { two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } // Each subsequent iteration checks at least one point in common with // the last iteration could be 2 ( if diag selected) 1/4 pel if (forced_stop < HALF_PEL) { if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; hstep >>= 1; start_mv = *bestmv; two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } if (allow_hp && forced_stop == EIGHTH_PEL) { if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { return INT_MAX; } iter++; hstep >>= 1; start_mv = *bestmv; two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, iters_per_step, is_scaled); } return besterr; } int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { 
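  // Each refinement iteration below halves hstep: the first iteration refines
  // the mv to 1/2 pel, the second to 1/4 pel and the third to 1/8 pel. For
  // example, with allow_hp == 0 at most two iterations are run (down to
  // quarter pel), and forced_stop == HALF_PEL limits it to a single one.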
(void)start_mv_stats; const int allow_hp = ms_params->allow_hp; const int forced_stop = ms_params->forced_stop; const int iters_per_step = ms_params->iters_per_step; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; const SUBPEL_SEARCH_TYPE subpel_search_type = ms_params->var_params.subpel_search_type; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; // How many steps to take. A round of 0 means fullpel search only, 1 means // half-pel, and so on. const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel unsigned int besterr = INT_MAX; *bestmv = start_mv; const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) ? &cm->sf_identity : xd->block_ref_scale_factors[0]; const int is_scaled = av1_is_scaled(sf); if (start_mv_stats != NULL && !is_scaled) { besterr = start_mv_stats->distortion + start_mv_stats->err_cost; *distortion = start_mv_stats->distortion; *sse1 = start_mv_stats->sse; } else { if (subpel_search_type != USE_2_TAPS_ORIG) { besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); } else { besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, distortion); } } // If forced_stop is FULL_PEL, return. if (!round) return besterr; for (int iter = 0; iter < round; ++iter) { MV iter_center_mv = *bestmv; if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv, iter)) { return INT_MAX; } MV diag_step; if (subpel_search_type != USE_2_TAPS_ORIG) { diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion); } else { diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, is_scaled); } // Check diagonal sub-pixel position if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion, is_scaled); } hstep >>= 1; } return besterr; } // Note(yunqingwang): The following 2 functions are only used in the motion // vector unit test, which return extreme motion vectors allowed by the MV // limits. // Returns the maximum MV. int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { (void)xd; (void)cm; (void)start_mv; (void)start_mv_stats; (void)distortion; (void)last_mv_search_list; const int allow_hp = ms_params->allow_hp; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; bestmv->row = mv_limits->row_max; bestmv->col = mv_limits->col_max; unsigned int besterr = 0; // In the sub-pel motion search, if hp is not used, then the last bit of mv // has to be 0. lower_mv_precision(bestmv, allow_hp, 0); *sse1 = besterr; return besterr; } // Returns the minimum MV. 
int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { (void)xd; (void)cm; (void)start_mv; (void)start_mv_stats; (void)distortion; (void)last_mv_search_list; const int allow_hp = ms_params->allow_hp; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; bestmv->row = mv_limits->row_min; bestmv->col = mv_limits->col_min; unsigned int besterr = 0; // In the sub-pel motion search, if hp is not used, then the last bit of mv // has to be 0. lower_mv_precision(bestmv, allow_hp, 0); *sse1 = besterr; return besterr; } #if !CONFIG_REALTIME_ONLY // Computes the cost of the current predictor by going through the whole // av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv // during motion_mode_rd. We are going through the whole // av1_enc_build_inter_predictor because we might have changed the interpolation // filter, etc before motion_mode_rd is called. static inline unsigned int compute_motion_cost( MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, const MV *this_mv) { unsigned int mse; unsigned int sse; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; const MSBuffers *ms_buffers = &var_params->ms_buffers; const uint8_t *const src = ms_buffers->src->buf; const int src_stride = ms_buffers->src->stride; const uint8_t *const dst = xd->plane[0].dst.buf; const int dst_stride = xd->plane[0].dst.stride; const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp; mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params); return mse; } // Refines MV in a small range // Macros to build bitmasks which help us avoid redundant computations // // To explain the idea here, imagine that on the first iteration of the // loop below, we step rightwards. Then, on the second iteration, the neighbors // to consider are: // . . . // 0 1 . // . . . // Where 0 is the initial search point, 1 is the best candidate found in the // first iteration, and the dots are the other neighbors of point 1. // // Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and // the seven points marked with dots), and compare them to see where to move // next. However, we already evaluated 5 of those 8 neighbors in the last // iteration, and decided that they are worse than point 1. So we don't need // to re-consider these points. We only really need to consider the three // points which are adjacent to point 1 but *not* to point 0. // // As the algorithm goes on, there are other ways that redundant evaluations // can happen, if the search path curls back around on itself. // // To avoid all possible redundancies, we'd have to build a set containing // every point we have already checked, and this would be quite expensive. // // So instead, we apply a 95%-effective solution with a much lower overhead: // we prune out the points which were considered during the previous // iteration, but we don't worry about any prior iteration. 
This can be done // as follows: // // We build a static table, called neighbor_mask, which answers the question // "if we moved in direction X last time, which neighbors are new, and which // were scanned last iteration?" // Then we can query this table to quickly determine which points we need to // evaluate, and which we can skip. // // To query the table, the logic is simply: // neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration, // do we need to scan neighbor j this iteration?" #define NEIGHBOR_MASK_DIA(left, down, right, up) \ (left | (down << 1) | (right << 2) | (up << 3)) #define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \ up_left, up_right) \ (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) | \ (down_right << 5) | (up_left << 6) | (up_right << 7)) static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = { // WARP_SEARCH_DIAMOND { .num_neighbors = 4, .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, .neighbor_mask = { // If we stepped left last time, consider all points except right NEIGHBOR_MASK_DIA(1, 1, 0, 1), // If we stepped down last time, consider all points except up NEIGHBOR_MASK_DIA(1, 1, 1, 0), // Stepped right last time NEIGHBOR_MASK_DIA(0, 1, 1, 1), // Stepped up last time NEIGHBOR_MASK_DIA(1, 0, 1, 1), }, }, // WARP_SEARCH_SQUARE { .num_neighbors = 8, .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } }, .neighbor_mask = { // If we stepped left last time, then we only need to consider 3 points: // left, down+left, up+left NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0), // If we stepped down last time, then we only need to consider 3 points: // down, down+left, down+right NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0), // Stepped right last time NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1), // Stepped up last time NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1), // If we stepped down+left last time, then we need to consider 5 points: // left, down, down+left, down+right, up+left NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0), // Stepped down+right last time NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1), // Stepped up+left last time NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1), // Stepped up+right last time NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1), }, }, }; unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, const int *pts0, const int *pts_inref0, int total_samples, WARP_SEARCH_METHOD search_method, int num_iterations) { MB_MODE_INFO *mbmi = xd->mi[0]; const MV *neighbors = warp_search_info[search_method].neighbors; const int num_neighbors = warp_search_info[search_method].num_neighbors; const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask; MV *best_mv = &mbmi->mv[0].as_mv; WarpedMotionParams best_wm_params = mbmi->wm_params; int best_num_proj_ref = mbmi->num_proj_ref; unsigned int bestmse; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; const int mv_shift = ms_params->allow_hp ? 
0 : 1; // Calculate the center position's error assert(av1_is_subpelmv_in_range(mv_limits, *best_mv)); bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv); // MV search int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // First step always scans all neighbors uint8_t valid_neighbors = UINT8_MAX; for (int ite = 0; ite < num_iterations; ++ite) { int best_idx = -1; for (int idx = 0; idx < num_neighbors; ++idx) { if ((valid_neighbors & (1 << idx)) == 0) { continue; } unsigned int thismse; MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift), best_mv->col + neighbors[idx].col * (1 << mv_shift) }; if (av1_is_subpelmv_in_range(mv_limits, this_mv)) { memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); if (total_samples > 1) { mbmi->num_proj_ref = av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); } if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, this_mv.row, this_mv.col, &mbmi->wm_params, mi_row, mi_col)) { thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv); if (thismse < bestmse) { best_idx = idx; best_wm_params = mbmi->wm_params; best_num_proj_ref = mbmi->num_proj_ref; bestmse = thismse; } } } } if (best_idx == -1) break; if (best_idx >= 0) { best_mv->row += neighbors[best_idx].row * (1 << mv_shift); best_mv->col += neighbors[best_idx].col * (1 << mv_shift); valid_neighbors = neighbor_mask[best_idx]; } } mbmi->wm_params = best_wm_params; mbmi->num_proj_ref = best_num_proj_ref; return bestmse; } #endif // !CONFIG_REALTIME_ONLY // ============================================================================= // Subpixel Motion Search: OBMC // ============================================================================= // Estimates the variance of prediction residue static inline int estimate_obmc_pref_error( const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; const MSBuffers *ms_buffers = &var_params->ms_buffers; const int32_t *src = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); const int ref_stride = ms_buffers->ref->stride; const int subpel_x_q3 = get_subpel_part(this_mv->col); const int subpel_y_q3 = get_subpel_part(this_mv->row); return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse); } // Calculates the variance of prediction residue static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, unsigned int *sse) { const aom_variance_fn_ptr_t *vfp = var_params->vfp; const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; const int w = var_params->w; const int h = var_params->h; const MSBuffers *ms_buffers = &var_params->ms_buffers; const int32_t *wsrc = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); const int ref_stride = ms_buffers->ref->stride; const int subpel_x_q3 = get_subpel_part(this_mv->col); const int subpel_y_q3 = get_subpel_part(this_mv->row); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; unsigned int besterr; DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); 
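// Note: pred[] is sized 2 * MAX_SB_SQUARE bytes so that, in the high
// bit-depth path, the same buffer can hold 16-bit samples. CONVERT_TO_BYTEPTR
// wraps it in the uint8_t* convention used by the aom_highbd_* helpers, which
// recover the uint16_t pointer internally before reading or writing samples.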
aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, subpel_search_type); besterr = vfp->ovf(pred8, w, wsrc, mask, sse); } else { aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); besterr = vfp->ovf(pred, w, wsrc, mask, sse); } #else aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search_type); besterr = vfp->ovf(pred, w, wsrc, mask, sse); #endif return besterr; } static unsigned int setup_obmc_center_error( const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { // TODO(chiyotsai@google.com): There might be a bug here where we didn't use // get_buf_from_mv(ref, *this_mv). const MSBuffers *ms_buffers = &var_params->ms_buffers; const int32_t *wsrc = ms_buffers->wsrc; const int32_t *mask = ms_buffers->obmc_mask; const uint8_t *ref = ms_buffers->ref->buf; const int ref_stride = ms_buffers->ref->stride; unsigned int besterr = var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1); *distortion = besterr; besterr += mv_err_cost_(this_mv, mv_cost_params); return besterr; } static unsigned int upsampled_setup_obmc_center_error( MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { unsigned int besterr = upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1); *distortion = besterr; besterr += mv_err_cost_(this_mv, mv_cost_params); return besterr; } // Estimates the variance of prediction residue // TODO(chiyotsai@google.com): the cost does does not match the cost in // mv_cost_. Investigate this later. static inline int estimate_obmc_mvcost(const MV *this_mv, const MV_COST_PARAMS *mv_cost_params) { const MV *ref_mv = mv_cost_params->ref_mv; const int *mvjcost = mv_cost_params->mvjcost; const int *const *mvcost = mv_cost_params->mvcost; const int error_per_bit = mv_cost_params->error_per_bit; const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type; const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row), GET_MV_SUBPEL(this_mv->col - ref_mv->col) }; switch (mv_cost_type) { case MV_COST_ENTROPY: return (unsigned)((mv_cost(&diff_mv, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * error_per_bit + 4096) >> 13); case MV_COST_NONE: return 0; default: assert(0 && "L1 norm is not tuned for estimated obmc mvcost"); return 0; } } // Estimates whether this_mv is better than best_mv. This function incorporates // both prediction error and residue into account. static inline unsigned int obmc_check_better_fast( const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int *has_better_mv) { unsigned int cost; if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { unsigned int sse; const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse); cost = estimate_obmc_mvcost(this_mv, mv_cost_params); cost += thismse; if (cost < *besterr) { *besterr = cost; *best_mv = *this_mv; *distortion = thismse; *sse1 = sse; *has_better_mv |= 1; } } else { cost = INT_MAX; } return cost; } // Estimates whether this_mv is better than best_mv. This function incorporates // both prediction error and residue into account. 
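// Unlike obmc_check_better_fast() above, this variant builds the actual
// upsampled prediction via upsampled_obmc_pref_error() and charges the exact
// mv_err_cost_() rate term; callers choose between the two based on
// var_params->subpel_search_type (see obmc_first_level_check() below).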
static inline unsigned int obmc_check_better( MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion, int *has_better_mv) { unsigned int cost; if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { unsigned int sse; const int thismse = upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse); cost = mv_err_cost_(this_mv, mv_cost_params); cost += thismse; if (cost < *besterr) { *besterr = cost; *best_mv = *this_mv; *distortion = thismse; *sse1 = sse; *has_better_mv |= 1; } } else { cost = INT_MAX; } return cost; } static AOM_FORCE_INLINE MV obmc_first_level_check( MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion) { int dummy = 0; const MV left_mv = { this_mv.row, this_mv.col - hstep }; const MV right_mv = { this_mv.row, this_mv.col + hstep }; const MV top_mv = { this_mv.row - hstep, this_mv.col }; const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { const unsigned int left = obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int right = obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int up = obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int down = obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const MV diag_step = get_best_diag_step(hstep, left, right, up, down); const MV diag_mv = { this_mv.row + diag_step.row, this_mv.col + diag_step.col }; // Check the diagonal direction with the best mv obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); return diag_step; } else { const unsigned int left = obmc_check_better_fast( &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int right = obmc_check_better_fast( &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int up = obmc_check_better_fast( &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const unsigned int down = obmc_check_better_fast( &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); const MV diag_step = get_best_diag_step(hstep, left, right, up, down); const MV diag_mv = { this_mv.row + diag_step.row, this_mv.col + diag_step.col }; // Check the diagonal direction with the best mv obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &dummy); return diag_step; } } // A newer version of second level check for obmc that gives better quality. 
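// Given the winner of the first-level check and the diagonal direction it
// implies, this probes the two points offset from the winner by the row and
// the column component of that diagonal (flipping the sign of a component
// when the first level only moved along the other axis), and only if one of
// those probes improves the cost does it also try the combined diagonal
// offset.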
static AOM_FORCE_INLINE void obmc_second_level_check_v2( MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, MV *best_mv, const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, unsigned int *sse1, int *distortion) { assert(best_mv->row == this_mv.row + diag_step.row || best_mv->col == this_mv.col + diag_step.col); if (CHECK_MV_EQUAL(this_mv, *best_mv)) { return; } else if (this_mv.row == best_mv->row) { // Search away from diagonal step since diagonal search did not provide any // improvement diag_step.row *= -1; } else if (this_mv.col == best_mv->col) { diag_step.col *= -1; } const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; const MV diag_bias_mv = { best_mv->row + diag_step.row, best_mv->col + diag_step.col }; int has_better_mv = 0; if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); // Do an additional search if the second iteration gives a better mv if (has_better_mv) { obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); } } else { obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); // Do an additional search if the second iteration gives a better mv if (has_better_mv) { obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, distortion, &has_better_mv); } } } int av1_find_best_obmc_sub_pixel_tree_up( MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { (void)last_mv_search_list; (void)start_mv_stats; const int allow_hp = ms_params->allow_hp; const int forced_stop = ms_params->forced_stop; const int iters_per_step = ms_params->iters_per_step; const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; const SUBPEL_SEARCH_TYPE subpel_search_type = ms_params->var_params.subpel_search_type; const SubpelMvLimits *mv_limits = &ms_params->mv_limits; int hstep = INIT_SUBPEL_STEP_SIZE; const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); unsigned int besterr = INT_MAX; *bestmv = start_mv; if (subpel_search_type != USE_2_TAPS_ORIG) besterr = upsampled_setup_obmc_center_error( xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); else besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1, distortion); for (int iter = 0; iter < round; ++iter) { MV iter_center_mv = *bestmv; MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion); if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, mv_limits, var_params, mv_cost_params, &besterr, sse1, distortion); } hstep 
>>= 1; } return besterr; } // ============================================================================= // Public cost function: mv_cost + pred error // ============================================================================= int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, const struct buf_2d *pre) { const MV mv = get_mv_from_fullmv(&best_mv); unsigned int sse, var; var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv), pre->stride, &sse); (void)var; return sse + mv_err_cost_(&mv, mv_cost_params); } aom-3.12.1/av1/encoder/mcomp.h000066400000000000000000000354601477627663500160030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MCOMP_H_ #define AOM_AV1_ENCODER_MCOMP_H_ #include "av1/common/mv.h" #include "av1/encoder/block.h" #include "av1/encoder/rd.h" #include "aom_dsp/variance.h" #ifdef __cplusplus extern "C" { #endif struct AV1_COMP; struct SPEED_FEATURES; // ============================================================================= // Cost functions // ============================================================================= enum { MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p) MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p) MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p) MV_COST_NONE // Use 0 as as cost irrespective of the current mv } UENUM1BYTE(MV_COST_TYPE); typedef struct { // The reference mv used to compute the mv cost const MV *ref_mv; FULLPEL_MV full_ref_mv; MV_COST_TYPE mv_cost_type; const int *mvjcost; const int *mvcost[2]; int error_per_bit; // A multiplier used to convert rate to sad cost int sad_per_bit; } MV_COST_PARAMS; int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, int *const mvcost[2], int weight); int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, const struct buf_2d *pre); // ============================================================================= // Motion Search // ============================================================================= typedef struct { // The reference buffer const struct buf_2d *ref; // The source and predictors/mask used by translational search const struct buf_2d *src; const uint8_t *second_pred; const uint8_t *mask; int mask_stride; int inv_mask; // The weighted source and mask used by OBMC const int32_t *wsrc; const int32_t *obmc_mask; } MSBuffers; static inline void av1_set_ms_compound_refs(MSBuffers *ms_buffers, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int invert_mask) { ms_buffers->second_pred = second_pred; ms_buffers->mask = mask; ms_buffers->mask_stride = mask_stride; ms_buffers->inv_mask = invert_mask; } // ============================================================================= // Fullpixel Motion Search // 
============================================================================= // This struct holds fullpixel motion search parameters that should be constant // during the search typedef struct { BLOCK_SIZE bsize; // A function pointer to the simd function for fast computation const aom_variance_fn_ptr_t *vfp; MSBuffers ms_buffers; // WARNING: search_method should be regarded as a private variable and should // not be modified directly so it is in sync with search_sites. To modify it, // use av1_set_mv_search_method. SEARCH_METHODS search_method; const search_site_config *search_sites; FullMvLimits mv_limits; int run_mesh_search; // Sets mesh search unless it got pruned by // prune_mesh_search. int prune_mesh_search; // Disables mesh search if the best_mv after a normal // search if close to the start_mv. int mesh_search_mv_diff_threshold; // mv diff threshold to enable // prune_mesh_search int force_mesh_thresh; // Forces mesh search if the residue variance is // higher than the threshold. const struct MESH_PATTERN *mesh_patterns[2]; // Use maximum search interval of 4 if true. This helps motion search to find // the best motion vector for screen content types. int fine_search_interval; int is_intra_mode; int fast_obmc_search; // For calculating mv cost MV_COST_PARAMS mv_cost_params; // Stores the function used to compute the sad. This can be different from the // sdf in vfp (e.g. downsampled sad and not sad) to allow speed up. aom_sad_fn_t sdf; aom_sad_multi_d_fn_t sdx4df; aom_sad_multi_d_fn_t sdx3df; } FULLPEL_MOTION_SEARCH_PARAMS; typedef struct { int err_cost; unsigned int distortion; unsigned int sse; } FULLPEL_MV_STATS; void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer); void av1_make_default_fullpel_ms_params( FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv, const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], SEARCH_METHODS search_method, int fine_search_interval); /*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */ void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const IntraBCMVCosts *dv_costs); // Sets up configs for firstpass motion search. void av1_init_motion_fpf(search_site_config *cfg, int stride); /*! Function pointer to search site config initialization of different search * method functions. */ typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride, int level); /*! Array of function pointers used to set the motion search config. */ extern const av1_init_search_site_config av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS]; // Array to inform which all search methods are having // same candidates and different in number of search steps. static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = { DIAMOND, // DIAMOND NSTEP, // NSTEP NSTEP_8PT, // NSTEP_8PT CLAMPED_DIAMOND, // CLAMPED_DIAMOND HEX, // HEX BIGDIA, // BIGDIA SQUARE, // SQUARE HEX, // FAST_HEX BIGDIA, // FAST_DIAMOND BIGDIA, // FAST_BIGDIA BIGDIA // VFAST_DIAMOND }; // Reinitialize the search site config. 
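// For example (illustrative, based on search_method_lookup above): a refresh
// requested for FAST_HEX re-initializes the HEX entry of ss_cfg_buf, since the
// FAST_* methods reuse their parent method's site configuration, while
// NSTEP_8PT and CLAMPED_DIAMOND keep their own entries but are initialized
// with level == 1. A hedged usage sketch (ref_stride is a placeholder):
//
//   search_site_config cfgs[NUM_DISTINCT_SEARCH_METHODS];
//   av1_refresh_search_site_config(cfgs, FAST_HEX, ref_stride);
//   // ...updates cfgs[HEX], because search_method_lookup[FAST_HEX] == HEX.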
static inline void av1_refresh_search_site_config( search_site_config *ss_cfg_buf, SEARCH_METHODS search_method, const int ref_stride) { const int level = search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND; search_method = search_method_lookup[search_method]; av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method], ref_stride, level); } // Mv beyond the range do not produce new/different prediction block. static inline void av1_set_mv_search_method( FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], SEARCH_METHODS search_method) { ms_params->search_method = search_method; ms_params->search_sites = &search_sites[search_method_lookup[ms_params->search_method]]; } // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. static inline void av1_set_mv_row_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_row, int mi_height, int border) { const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND); mv_limits->row_min = AOMMAX(min1, min2); const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE + border - 2 * AOM_INTERP_EXTEND; const int max2 = (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND; mv_limits->row_max = AOMMIN(max1, max2); } static inline void av1_set_mv_col_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_col, int mi_width, int border) { const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND); mv_limits->col_min = AOMMAX(min1, min2); const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border - 2 * AOM_INTERP_EXTEND; const int max2 = (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND; mv_limits->col_max = AOMMIN(max1, max2); } static inline void av1_set_mv_limits( const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, int mi_row, int mi_col, int mi_height, int mi_width, int border) { av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border); av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border); } void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv); int av1_init_search_range(int size); int av1_vector_match(const int16_t *ref, const int16_t *src, int bwl, int search_size, int full_search, int *sad); unsigned int av1_int_pro_motion_estimation( const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const MV *ref_mv, unsigned int *y_sad_zero, int me_search_size_col, int me_search_size_row); int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, FULLPEL_MV *best_mv); int av1_full_pixel_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, int *cost_list, FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, FULLPEL_MV *second_best_mv); int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, IntraBCHashInfo *intrabc_hash_info, FULLPEL_MV *best_mv); int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const int step_param, FULLPEL_MV *best_mv); static inline int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, FULLPEL_MV mv) { return (mv.col >= 
mv_limits->col_min) && (mv.col <= mv_limits->col_max) && (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); } // ============================================================================= // Subpixel Motion Search // ============================================================================= enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } UENUM1BYTE(SUBPEL_FORCE_STOP); typedef struct { const aom_variance_fn_ptr_t *vfp; SUBPEL_SEARCH_TYPE subpel_search_type; // Source and reference buffers MSBuffers ms_buffers; int w, h; } SUBPEL_SEARCH_VAR_PARAMS; // This struct holds subpixel motion search parameters that should be constant // during the search typedef struct { // High level motion search settings int allow_hp; const int *cost_list; SUBPEL_FORCE_STOP forced_stop; int iters_per_step; SubpelMvLimits mv_limits; // For calculating mv cost MV_COST_PARAMS mv_cost_params; // Distortion calculation params SUBPEL_SEARCH_VAR_PARAMS var_params; } SUBPEL_MOTION_SEARCH_PARAMS; void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, const int *cost_list); typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, unsigned int *sse1, int_mv *last_mv_search_list); extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up; unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, const int *pts0, const int *pts_inref0, int total_samples, WARP_SEARCH_METHOD search_method, int num_iterations); static inline void av1_set_fractional_mv(int_mv *fractional_best_mv) { for (int z = 0; z < 3; z++) { fractional_best_mv[z].as_int = INVALID_MV; } } static inline void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, const FullMvLimits *mv_limits, const MV *ref_mv) { const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); int minc = AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv); int maxc = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv); int minr = AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv); int maxr = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv); maxc = AOMMAX(minc, maxc); maxr = AOMMAX(minr, maxr); subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc); subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc); subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr); subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); } static inline int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, MV mv) { return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); } static inline int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) { return mv->row * stride + mv->col; } static inline const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf, const FULLPEL_MV *mv) { return &buf->buf[get_offset_from_fullmv(mv, buf->stride)]; } #ifdef __cplusplus } // extern "C" #endif #endif // 
AOM_AV1_ENCODER_MCOMP_H_ aom-3.12.1/av1/encoder/mcomp_structs.h000066400000000000000000000076541477627663500175760ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ #define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ #include "av1/common/mv.h" // The maximum number of steps in a step search given the largest // allowed initial step #define MAX_MVSEARCH_STEPS 11 // Max full pel mv specified in the unit of full pixel // Enable the use of motion vector in range [-1023, 1023]. #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) // Maximum number of neighbors to scan per iteration during // WARPED_CAUSAL refinement // Note: The elements of warp_search_config.neighbor_mask must be at least // MAX_WARP_SEARCH_NEIGHBORS many bits wide. So the type may need to be // widened if this value is increased. #define MAX_WARP_SEARCH_NEIGHBORS 8 #define SEARCH_RANGE_8P 3 #define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) #define SEARCH_GRID_CENTER_8P \ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) typedef struct { FULLPEL_MV coord; int coord_offset; } search_neighbors; // motion search site typedef struct search_site { FULLPEL_MV mv; int offset; } search_site; typedef struct search_site_config { search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1]; // Number of search steps. int num_search_steps; int searches_per_step[MAX_MVSEARCH_STEPS * 2]; int radius[MAX_MVSEARCH_STEPS * 2]; int stride; } search_site_config; enum { // Search 8-points in the radius grid around center, up to 11 search stages. DIAMOND = 0, // Search 12-points in the radius/tan_radius grid around center, // up to 15 search stages. NSTEP = 1, // Search 8-points in the radius grid around center, up to 16 search stages. NSTEP_8PT = 2, // Search 8-points in the radius grid around center, upto 11 search stages // with clamping of search radius. CLAMPED_DIAMOND = 3, // Search maximum 8-points in the radius grid around center, // up to 11 search stages. First stage consists of 8 search points // and the rest with 6 search points each in hex shape. HEX = 4, // Search maximum 8-points in the radius grid around center, // up to 11 search stages. First stage consists of 4 search // points and the rest with 8 search points each. BIGDIA = 5, // Search 8-points in the square grid around center, up to 11 search stages. SQUARE = 6, // HEX search with up to 2 stages. FAST_HEX = 7, // BIGDIA search with up to 2 stages. FAST_DIAMOND = 8, // BIGDIA search with up to 3 stages. FAST_BIGDIA = 9, // BIGDIA search with up to 1 stage. VFAST_DIAMOND = 10, // Total number of search methods. NUM_SEARCH_METHODS, // Number of distinct search methods. 
NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1, } UENUM1BYTE(SEARCH_METHODS); typedef struct warp_search_config { int num_neighbors; MV neighbors[MAX_WARP_SEARCH_NEIGHBORS]; // Bitmask which is used to prune the search neighbors at one iteration // based on which direction we chose in the previous iteration. // See comments in av1_refine_warped_mv for details. uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS]; } warp_search_config; // Methods for refining WARPED_CAUSAL motion vectors enum { // Search 4 adjacent points in a diamond shape at each iteration WARP_SEARCH_DIAMOND, // Search 8 adjacent points in a square at each iteration WARP_SEARCH_SQUARE, WARP_SEARCH_METHODS } UENUM1BYTE(WARP_SEARCH_METHOD); #endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ aom-3.12.1/av1/encoder/misc_model_weights.h000066400000000000000000001362701477627663500205360ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ #define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/ml.h" #define MV_PREC_FEATURE_SIZE 18 #define NUM_DNN_LAYERS 1 #define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE #define MV_PREC_LAYER_SIZE_0 32 #define NUM_LOGITS 1 static const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f, 141.6251917346238f, 0.36313633945679064f, 0.0028162791958822085f, 0.000484820537626698f, 0.002769969388939025f, 0.0f, 0.00031274626720947577f, 0.00020578555375160075f, 0.0007075246732697733f, 0.000539641029909925f, 0.0013939401375906984f, 4.985394760423499f, 4.985394760423499f, 4.9992148717283085f, 5.143739822380163f, 5.518483124004564f, 87.63597847427077f }; static const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f, 68.04472572607503f, 13.23247674430399f, 0.0029123438396921955f, 0.0015331406169374737f, 0.0029149813096313775f, 1.0f, 0.00047501102871357813f, 0.00030025962993117947f, 0.0009861163580391207f, 0.0012157593528004055f, 0.002004954948490521f, 6.539447500484038f, 6.539447500484038f, 6.396589058279465f, 3.4870155874262516f, 3.8911353973740535f, 112.07985259573601f }; static const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f, -0.1483527373474774f, 0.08112076098858864f, -0.9582568679627453f, -0.34794757171071206f, 0.6465225723304947f, 0.0f, 0.06754171885839604f, 0.27156803620541214f, 0.10635231245664407f, -0.031183926995968583f, 0.048122572260291f, -0.19498534230045128f, -0.2614116319273316f, -0.3223762845136331f, -1.2063368350609205f, -0.523333556911706f, 1.075632260890728f, 0.48989726814387946f, -0.34816466111070477f, 0.41668357610256473f, -1.0973562848791671f, 0.04183921854389494f, -0.9123815389260476f, 0.0f, 0.859965047744027f, 0.1962095804679813f, 0.2606564339077058f, 0.26695868715184895f, 0.5319308568326692f, -0.23717505799723165f, -0.43127224481782567f, -0.3214545776203726f, 0.5850852241402176f, -0.26705531612587813f, -0.5786016766610093f, 0.9360519909983003f, 0.20771329289016555f, -0.027614159544811823f, -1.175022807046164f, -0.07578967497693835f, 0.6890172485324256f, 0.0f, 
-0.008008338164988263f, -0.08064800010158935f, -0.22606910981666667f, 0.4541586669210879f, 0.07731527661370792f, -0.6744475941247964f, -0.2625842448396184f, 1.7018613444303785f, -0.08622229073162656f, 0.041858142814941275f, -0.24575964090386415f, -0.046626044730994964f, 0.7608713064175202f, -0.23330119070907146f, -0.10115510984500826f, 0.9722537349192069f, 0.11718554254290829f, 0.0f, 0.2075123446014759f, 0.09465167310768637f, 0.7609896851963016f, 0.4441038581385328f, 0.26064144727430955f, -0.14678625366485035f, -0.03597014452200524f, 0.3128680867196166f, 1.102496797385966f, 0.06642253233084111f, -1.2665494483407629f, 0.09049412632000911f, -1.1160621999565095f, 0.043420275255913035f, -0.8811412259978966f, 0.21076234632287777f, 0.16571534463543866f, 0.0f, -0.7324075176473275f, -0.3677622514459495f, 0.3273532243056415f, 0.22922161936797775f, 0.8204766691058087f, 0.02982161033720488f, 0.5266419954188112f, -1.0032154963302191f, 0.7007602969763729f, 0.37196355167990885f, -0.7608579453228548f, 0.08568111584781847f, 0.07011061059123677f, 0.3233263598082507f, -0.08249928295410253f, 0.08220165761319252f, 0.22148722752246794f, 0.0f, 0.6122392701743506f, -0.26429838296378333f, 0.31958081620005463f, -0.006027177397853826f, -0.3088310785887994f, -0.5436192046707807f, -0.011080356757423306f, 0.12632650770008413f, -0.45097913215234525f, 1.8008072867127298f, -0.7630029654575501f, -0.4054774329826579f, 0.40386074452544535f, -0.18541426257453025f, 0.2444879765079863f, -0.6216724756115081f, 0.27030299321302f, 0.0f, -0.6835848952967989f, -0.7914184320964815f, -0.6761595019582928f, -1.009565565604081f, -0.1904242439353305f, 0.4463417126318631f, 0.6025503823452971f, 0.5149990860115566f, 1.0242970663937634f, 0.037947306826401385f, 0.07039339786212848f, 0.14273796789711987f, 0.168103961425691f, 1.6596066376811978f, 0.19321092229384657f, -0.3710750388148514f, -0.01717015559410288f, 0.0f, 0.3005688477942597f, 0.23877080653829577f, 0.2718594552971173f, 0.3885402571589898f, 0.32999531945669247f, -0.6134460954213243f, -0.13972265462799183f, -0.07180089575716991f, -1.014572598188105f, 0.0717207322809836f, 0.34896157745155615f, -0.27127687591403f, -0.5058651212773623f, -1.5442435628306925f, -0.6399784724734707f, 0.6274301429074947f, -0.4645750072767051f, 0.0f, -0.2406726815244178f, -0.06321214115916597f, 0.312856714253404f, 0.16459514124116134f, 0.3993579604809623f, -0.15232044351561913f, -0.5613743948568469f, 0.7219801372223262f, 0.2936857469624009f, 0.7823466656034087f, -0.12416947814098349f, -0.36413756654028345f, -0.07992098796866462f, -0.7395722879842416f, 0.8639913543220514f, -0.311931773757945f, -1.7308240470400613f, 0.0f, 0.394499716712104f, 0.6511462819539963f, -0.0722425275974144f, 0.13490818194661386f, 0.055319135836378035f, 0.15389577508097013f, 0.28958598328870605f, -0.14608429470539772f, 0.09488817462478298f, -0.17231294096622088f, 0.6721115415911466f, -0.05664621150536103f, 0.03291799673669331f, 0.02845382711057482f, -0.9953563446999164f, -0.17994298220605923f, 0.6560824519337476f, 0.0f, -0.30990646375917935f, 0.17215517202874f, 0.2026816225170481f, 0.22011958747715601f, 0.3562520768889686f, -0.18436559057189175f, 0.1733377147302066f, 0.02818276995640877f, -0.29703005574859076f, -0.3310652639215064f, -1.6091173258529277f, 0.45461585790028003f, -0.5078643334592593f, -0.338997374732338f, 0.4688619590359733f, 0.627099126828289f, -0.5249801376494249f, 0.0f, 0.34465498218272883f, 0.009891680630908135f, -0.27244020967349f, 0.05404589867626979f, -0.06220329325739666f, -0.13365376464759104f, -0.13098573553512366f, 
0.11434198976289106f, 0.6740951247574676f, 1.3381727185724581f, -1.4865773213251936f, 0.05809898701966341f, 0.25380780261023456f, 1.2716367496512722f, 0.1768290070780598f, -0.07554828135356352f, 0.8180570085344856f, 0.0f, 1.0788448980077463f, 0.0651938742459459f, 0.3807672030015587f, 0.6144792680268445f, 0.011660612214908059f, -0.018306023765580288f, 0.44140813809926516f, -0.13411994195502386f, 0.15920368955127778f, -0.19382358417849888f, -0.08802147969690055f, -0.019731052733814477f, 0.1104744229169665f, -0.195834419735958f, -0.5005295046454347f, -0.17041241868229032f, -0.471942117351489f, 0.0f, -0.3599073304761372f, -0.2745532782968519f, -0.8323064841106417f, -0.88355885384943f, -0.02826466859020679f, 0.06977870308805256f, 0.11926112095374196f, 1.367382707959643f, -0.06119843162964051f, -0.5331395268889569f, -1.2155531584240624f, -0.01896651779524327f, 0.10591845408571081f, -0.010632842156504733f, 0.6150787968629282f, -0.4191690185896091f, -0.9961718918346271f, 0.0f, 0.23370364516013867f, 0.4156033072362998f, 0.1261005546633433f, 0.0812413884532226f, -0.008894337353937203f, 0.07984447025056046f, -0.1258098052766725f, -0.40245475467767916f, 1.78188906675019f, -1.1544387954232302f, -0.41768781481273387f, 0.6791211165341995f, -0.4175127856183446f, -0.07353219159767788f, -0.2888813577574072f, -0.7107767892597061f, -1.0450031091195449f, 0.0f, -0.9221599545079143f, -0.6747876356740621f, 0.30241454354872105f, 0.4924965303373908f, -0.14042722740054084f, 0.27744210409350445f, -0.14788270997426836f, -0.9081467469237995f, -0.04513115674995093f, -0.5254168669125793f, -0.6999012037974789f, 0.434661246306547f, -0.7193303957246092f, -0.9117952623409744f, -1.5097267865916142f, -0.20779888103770922f, 0.4935562480901218f, 0.0f, 0.18303393908923593f, 0.34753722677570037f, 0.29291001533177663f, 0.3832351878354224f, 0.3295194956120599f, -0.32398033003617527f, -0.31570906736433746f, 0.23657779050372962f, 0.9510794465234161f, -0.5122243902568278f, 0.08652112725315658f, 0.2246634353717998f, -0.9032595595582497f, -0.8936484034533545f, 0.6012969720865752f, -0.6454216646117924f, -1.1753786049658332f, 0.0f, -0.4360545677728656f, -0.6586237455328507f, -0.34347301697886656f, -0.8909724651992144f, -0.24378721818350263f, 0.6179733359297576f, 0.0661661181742234f, -0.14120142044993794f, -0.07732699885498932f, 1.0221355882357506f, 0.44514798994115284f, -0.7371569579959046f, -0.7212499572378936f, 0.7453626921081045f, 0.5478757761345768f, -0.39411232789985384f, 0.7200542656743857f, 0.0f, -0.11790869453118827f, -0.12317030713581928f, -0.4207902738133338f, 0.15895105878327986f, 0.304261777102111f, 0.11450744587017621f, -0.11470709991317944f, 0.5949222371739038f, 0.6549518619412444f, -0.24390606570422838f, -0.4212796009440803f, -0.6269666206320964f, -0.5421193969807078f, -0.12297772128652287f, 0.021517257619930424f, 0.25462855095544523f, -0.22107798187348246f, 0.0f, 0.5204516300095662f, 0.2837402841862462f, 0.11310823283285916f, 0.8944351685018025f, 0.17487203235834015f, -0.5271221928634433f, -0.19516594503423199f, 0.452456617580365f, 1.2456272242706414f, 0.24166615894862817f, 0.09411429305204502f, -0.2730072283327243f, -0.8129383770918172f, -0.24093254193486136f, 0.5696499174142177f, -0.11110805836073044f, -0.3968204166235694f, 0.0f, -0.04388165369378549f, -0.005631266017272595f, -0.02574211858479705f, 0.06230399626660669f, 0.17677671232932785f, 0.5172871274400965f, 0.4919150085620063f, -1.597656637582941f, 0.02415185715719143f, -0.17945446376668306f, -0.39340600199798886f, 0.25013205256886845f, 0.05972330340308685f, 
0.1359911505596489f, -0.02341033271820833f, 0.15726074644063684f, 0.47512625913020357f, 0.0f, 0.7327341664835779f, -0.3689092312320013f, 0.4571824787436036f, 0.6215465537945456f, 0.0944111296842023f, -0.12571956176607574f, -0.2507235674395462f, -0.09579602654351593f, 1.4463357293728496f, 0.749153535856049f, -0.5553955120807588f, -0.09622771929369946f, -0.2598697420394813f, -0.964691815299676f, -0.8289963178173902f, 0.7112949291983329f, -0.8667009730492162f, 0.0f, -0.48698304169042794f, -0.18786095669893707f, -0.11425249263203247f, -0.3693391011684809f, 0.09933145842585253f, 0.2568559685298844f, 0.7048512233651738f, 0.6056238412407038f, -0.4355558119826642f, 0.17318931883915484f, 0.6481333496429564f, -0.45728823054344486f, -0.006325004538589701f, 0.45609864075494927f, -0.6199385981116988f, 0.035105808783046165f, 0.1203147963894839f, 0.0f, 0.383402190836527f, 0.048429009055370106f, 0.5887186439275204f, -0.20538767641607814f, -0.031237879611002117f, 0.3140759860883231f, 0.24447070584999556f, 0.7271263905705878f, 0.8432799162434237f, -0.11530577554199217f, -0.7781023892314718f, 0.05359488822710336f, 0.5624870388700809f, 0.5134656523208906f, 0.18304041423438375f, -0.04237421156328257f, -0.20759809886942207f, 0.0f, -0.06249337454975615f, 0.10081284533873777f, 0.3894374350259183f, 1.518217777528342f, -0.9100037950171563f, 0.17796906121831477f, -0.2892167255357892f, 0.6117902467884032f, 0.13332120964959573f, -0.3487155932849374f, -0.32920583745734694f, 0.08242631209809854f, -0.24920225708110588f, 0.8401757259392635f, 0.11729108681358365f, 0.11222925752499184f, -0.027078490721459958f, 0.0f, 0.726132375517389f, 0.72220359881096f, 0.5721582611845177f, 0.15139162075524315f, 0.6676549461551197f, -0.321449586554697f, -0.10141104515219895f, -0.09711123988777906f, 0.9623356184776928f, -0.7941822373167173f, -0.9373923554119346f, 0.4573241832354059f, -0.42029139056126147f, 0.2675223459380999f, -0.5487300191551386f, 0.2236621891916084f, 0.11692039230044018f, 0.0f, 0.1758399202780961f, 0.676447587678781f, 0.5945412815881029f, 0.5669863357359594f, 0.8433565415303922f, -0.30300550790708036f, -0.43332881999693673f, -0.4996522695731392f, -0.2084930815451962f, 0.27765278702463786f, 1.0886848763946915f, -0.0739433655813831f, -0.4762801579229192f, -0.2490825339320731f, -1.8820479350439439f, -0.4251592225775914f, -0.3992922365484464f, 0.0f, 0.19598917760218867f, 0.4860238022746914f, 0.3364528828641281f, 0.3350950865226741f, 0.2773654548632006f, -0.30547262140782566f, 0.028649620490728344f, -0.11763407628280315f, 0.6237318502627169f, -0.3958952632477945f, 0.14797171297835243f, 0.45821729624747465f, -0.8687137170773626f, 0.06989667196937126f, -0.5752606929478727f, 0.16986945686358412f, 0.6925071596817824f, 0.0f, 0.4991250796183003f, 0.03424654896322111f, 0.6153698611882319f, 0.5070872444849457f, 0.43615747516328135f, -0.7870352838659244f, -0.6424101231965247f, -0.7005774876651399f, 0.79983115431488f, 0.15720357955596242f, -1.408372612176309f, -0.039294695217213765f, 0.6979415372962309f, 0.27403316751965656f, 1.2844596102619275f, -0.2781534150257364f, 0.3248437714908865f, 0.0f, 0.4364362371752831f, -0.2548580911485434f, -0.19578001373349452f, -0.04597194387828005f, -0.010035156855533233f, 0.0415941475251266f, 0.07929549739797387f, -0.060629652912508866f, 0.5977303008711333f, -1.4404008068066554f, 0.8555694790197376f, -0.03693438534401856f, 0.17761411164512408f, -0.11858304304109235f, -1.4241324353471327f, 0.1533849765389186f, 0.7650643783126995f, 0.0f, -0.0639949379280401f, 0.4288617817939563f, 0.4235508646885404f, 
0.3419843254383798f, -0.015992360660098768f, -0.773247697505441f, -0.4908452922015917f, 0.9868134897291486f, -0.5078689994742608f, 1.05632043744864f, -0.38867419409275117f, -0.0065547696858664194f, -0.3056003173415037f, -0.333762331930102f, 0.4459671174011671f, 0.08219092584580244f, -0.08099158579518179f, 0.0f, -0.1568180656346373f, -0.061962372393910135f, 0.14065868174859464f, -0.055925712798972765f, 0.05136117465820622f, 0.0907831030477633f, 0.19518110495319604f, -0.7470794578145956f, 1.5945999734733545f, -0.4351697502345834f, -0.33253649399571805f }; static const float av1_mv_prec_nn_bias_layer_0[] = { -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f, 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f, -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f, 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f, 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f, 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f, 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f, -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f, 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f, -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f, -0.41957330902404616f, -0.9658160514319954f }; static const float av1_mv_prec_nn_weights_layer_1[] = { 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f, -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f, 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f, -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f, -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f, -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f, 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f, -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f, 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f, 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f, -1.249673977776904f, -1.2890127265725608f }; static const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f }; static const NN_CONFIG av1_mv_prec_dnn_config = { NUM_DNN_FEATURES, NUM_LOGITS, NUM_DNN_LAYERS, { MV_PREC_LAYER_SIZE_0 }, { av1_mv_prec_nn_weights_layer_0, av1_mv_prec_nn_weights_layer_1, }, { av1_mv_prec_nn_bias_layer_0, av1_mv_prec_nn_bias_layer_1, }, }; #undef NUM_DNN_LAYERS #undef NUM_DNN_FEATURES #undef NUM_LAYER_0_UNITS #undef NUM_LOGITS #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ aom-3.12.1/av1/encoder/ml.c000066400000000000000000000144271477627663500152730ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/mathutils.h" #include "av1/encoder/ml.h" void av1_nn_output_prec_reduce(float *const output, int num_output) { const int prec_bits = 9; const int prec = 1 << prec_bits; const float inv_prec = (float)(1.0 / prec); for (int i = 0; i < num_output; i++) { output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec; } } // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. void av1_nn_predict_c(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output) { int num_input_nodes = nn_config->num_inputs; int buf_index = 0; float buf[2][NN_MAX_NODES_PER_LAYER]; // Propagate hidden layers. const int num_layers = nn_config->num_hidden_layers; assert(num_layers <= NN_MAX_HIDDEN_LAYERS); for (int layer = 0; layer < num_layers; ++layer) { const float *layer_weights = nn_config->weights[layer]; const float *layer_bias = nn_config->bias[layer]; float *output_nodes = buf[buf_index]; const int num_output_nodes = nn_config->num_hidden_nodes[layer]; assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); for (int node = 0; node < num_output_nodes; ++node) { float val = layer_bias[node]; for (int i = 0; i < num_input_nodes; ++i) val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; // ReLU as activation function. val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). output_nodes[node] = val; } num_input_nodes = num_output_nodes; input_nodes = output_nodes; buf_index = 1 - buf_index; } // Final output layer. const float *layer_weights = nn_config->weights[num_layers]; const float *layer_bias = nn_config->bias[num_layers]; for (int node = 0; node < nn_config->num_outputs; ++node) { float val = layer_bias[node]; for (int i = 0; i < num_input_nodes; ++i) val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; output[node] = val; } if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); } #if CONFIG_NN_V2 // Applies the ReLu activation to one fc layer // output[i] = Max(input[i],0.0f) static float *nn_relu(const float *input, FC_LAYER *layer) { for (int i = 0; i < layer->num_outputs; ++i) { layer->output[i] = AOMMAX(input[i], 0.0f); } return layer->output; } // Applies the Sigmoid activation to one fc layer // output[i] = 1/(1+exp(input[i])) static float *nn_sigmoid(const float *input, FC_LAYER *layer) { for (int i = 0; i < layer->num_outputs; ++i) { const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f); layer->output[i] = 1.0f / (1.0f + expf(-tmp)); } return layer->output; } // Forward prediction in one fc layer, used in function av1_nn_predict_V2 static float *nn_fc_forward(const float *input, FC_LAYER *layer) { const float *weights = layer->weights; const float *bias = layer->bias; assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER); // fc for (int node = 0; node < layer->num_outputs; ++node) { float val = bias[node]; for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i]; layer->output[node] = val; weights += layer->num_inputs; } // activation switch (layer->activation) { case NONE: return layer->output; case RELU: return nn_relu(layer->output, layer); case SIGMOID: return nn_sigmoid(layer->output, layer); case SOFTSIGN: assert(0 && "Softsign has not been supported in NN."); // TO DO return NULL; default: assert(0 && "Unknown activation"); // Unknown activation return NULL; } } void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 
*nn_config, int reduce_prec, float *output) { const float *input_nodes = feature; // Propagate the layers. const int num_layers = nn_config->num_hidden_layers; assert(num_layers <= NN_MAX_HIDDEN_LAYERS); for (int i = 0; i < num_layers; ++i) { input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i); assert(nn_config->layer[i + 1].num_inputs == nn_config->layer[i].num_outputs); } // Final layer input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers); assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits); // Copy the final layer output memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits); if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits); } #endif // CONFIG_NN_V2 void av1_nn_softmax(const float *input, float *output, int n) { // Softmax function is invariant to adding the same constant // to all input values, so we subtract the maximum input to avoid // possible overflow. float max_input = input[0]; for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]); float sum_out = 0.0f; for (int i = 0; i < n; i++) { // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); output[i] = expf(normalized_input); sum_out += output[i]; } for (int i = 0; i < n; i++) output[i] /= sum_out; } void av1_nn_fast_softmax_16_c(const float *input, float *output) { const int kNumClasses = 16; float max_input = input[0]; for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]); float sum_out = 0.0f; for (int i = 0; i < kNumClasses; i++) { // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); output[i] = approx_exp(normalized_input); sum_out += output[i]; } for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out; } aom-3.12.1/av1/encoder/ml.h000066400000000000000000000061021477627663500152670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_ML_H_ #define AOM_AV1_ENCODER_ML_H_ #ifdef __cplusplus extern "C" { #endif #include "config/av1_rtcd.h" #define NN_MAX_HIDDEN_LAYERS 10 #define NN_MAX_NODES_PER_LAYER 128 struct NN_CONFIG { int num_inputs; // Number of input nodes, i.e. features. int num_outputs; // Number of output nodes. int num_hidden_layers; // Number of hidden layers, maximum 10. // Number of nodes for each hidden layer. int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; // Weight parameters, indexed by layer. const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; // Bias parameters, indexed by layer. const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; }; // Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs #if CONFIG_NN_V2 // Fully-connectedly layer configuration struct FC_LAYER { const int num_inputs; // Number of input nodes, i.e. features. const int num_outputs; // Number of output nodes. float *weights; // Weight parameters. float *bias; // Bias parameters. const ACTIVATION activation; // Activation function. 
float *output; // The output array. float *dY; // Gradient of outputs float *dW; // Gradient of weights. float *db; // Gradient of bias }; // NN configure structure V2 struct NN_CONFIG_V2 { const int num_hidden_layers; // Number of hidden layers, max = 10. FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array const int num_logits; // Number of output nodes. float *logits; // Raw prediction (same as output of final layer) const LOSS loss; // Loss function }; // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config, int reduce_prec, float *output); #endif // CONFIG_NN_V2 // Applies the softmax normalization function to the input // to get a valid probability distribution in the output: // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k])) void av1_nn_softmax(const float *input, float *output, int n); // A faster but less accurate version of av1_nn_softmax(input, output, 16) void av1_nn_fast_softmax_16_c(const float *input, float *output); // Applies a precision reduction to output of av1_nn_predict to prevent // mismatches between C and SIMD implementations. void av1_nn_output_prec_reduce(float *const output, int num_output); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_ML_H_ aom-3.12.1/av1/encoder/mode_prune_model_weights.h000066400000000000000000000212431477627663500217310ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ #define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { #endif #define NUM_HIDDEN_LAYERS_12 1 #define NUM_FEATURES_12 6 #define NUM_LAYER_0_UNITS_12 24 #define NUM_LOGITS_12 2 static const float av1_intrap_hiddenlayer_0_kernel_12[] = { 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f, -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f, 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f, -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f, 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f, 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f, 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f, 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f, -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f, 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f, 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f, 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f, 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f, -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f, 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f, -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f, -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f, 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f, 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f, -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f, 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f, -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f, -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f, 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f, -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f, -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f, -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f, 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f, 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f }; static const float av1_intrap_hiddenlayer_0_bias_12[] = { -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f, -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f, 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f, -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f }; static const float av1_intrap_logits_kernel_12[] = { 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f, 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f, 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f, 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f, 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f, -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f, -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f, -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f, -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f, -0.30020145f, 0.066221856f, 0.37213042f }; static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f }; static const NN_CONFIG av1_intrap_nn_config = { NUM_FEATURES_12, NUM_LOGITS_12, NUM_HIDDEN_LAYERS_12, { NUM_LAYER_0_UNITS_12, }, { av1_intrap_hiddenlayer_0_kernel_12, av1_intrap_logits_kernel_12, }, { av1_intrap_hiddenlayer_0_bias_12, av1_intrap_logits_bias_12, }, }; #undef 
NUM_HIDDEN_LAYERS_12 #undef NUM_FEATURES_12 #undef NUM_LAYER_0_UNITS_12 #undef NUM_LOGITS_12 #define NUM_HIDDEN_LAYERS_15 1 #define NUM_FEATURES_15 6 #define NUM_LAYER_0_UNITS_15 24 #define NUM_LOGITS_15 2 static const float av1_intraph_hiddenlayer_0_kernel_15[] = { -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f, -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f, -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f, -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f, -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f, -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f, -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f, -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f, -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f, -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f, -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f, 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f, 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f, 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f, -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f, -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f, 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f, 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f, 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f, -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f, -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f, -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f, 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f, -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f, -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f, -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f, -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f, -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f, -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f }; static const float av1_intraph_hiddenlayer_0_bias_15[] = { 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f, 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f, 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f, 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f }; static const float av1_intraph_logits_kernel_15[] = { 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f, 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f, 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f, 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f, -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f, -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f, -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f, -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f, -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f, -0.18334487f, -0.42385718f, -0.08033409f }; static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f }; static const NN_CONFIG av1_intrap_hd_nn_config = { NUM_FEATURES_15, NUM_LOGITS_15, NUM_HIDDEN_LAYERS_15, { NUM_LAYER_0_UNITS_15, }, { av1_intraph_hiddenlayer_0_kernel_15, av1_intraph_logits_kernel_15, }, { av1_intraph_hiddenlayer_0_bias_15, av1_intraph_logits_bias_15, }, }; #undef NUM_HIDDEN_LAYERS_15 
#undef NUM_FEATURES_15 #undef NUM_LAYER_0_UNITS_15 #undef NUM_LOGITS_15 #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ aom-3.12.1/av1/encoder/model_rd.h000066400000000000000000000234241477627663500164520ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MODEL_RD_H_ #define AOM_AV1_ENCODER_MODEL_RD_H_ #include "aom/aom_integer.h" #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" #include "av1/encoder/pustats.h" #include "av1/encoder/rdopt_utils.h" #include "config/aom_dsp_rtcd.h" #ifdef __cplusplus extern "C" { #endif // 0: Legacy model // 1: Curve fit model // 2: Surface fit model // 3: DNN regression model // 4: Full rd model #define MODELRD_TYPE_INTERP_FILTER 1 #define MODELRD_TYPE_TX_SEARCH_PRUNE 1 #define MODELRD_TYPE_MASKED_COMPOUND 1 #define MODELRD_TYPE_INTERINTRA 1 #define MODELRD_TYPE_INTRA 1 #define MODELRD_TYPE_MOTION_MODE_RD 1 typedef void (*model_rd_for_sb_type)( const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, int64_t *plane_sse, int64_t *plane_dist); typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, int plane, int64_t sse, int num_samples, int *rate, int64_t *dist); static int64_t calculate_sse(MACROBLOCKD *const xd, const struct macroblock_plane *p, struct macroblockd_plane *pd, const int bw, const int bh) { int64_t sse = 0; const int shift = xd->bd - 8; #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } #else sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); #endif sse = ROUND_POWER_OF_TWO(sse, shift * 2); return sse; } static inline int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, int plane, const BLOCK_SIZE bsize) { struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); int bw, bh; const struct macroblock_plane *const p = &x->plane[plane]; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); int64_t sse = calculate_sse(xd, p, pd, bw, bh); return sse; } static inline void model_rd_from_sse(const AV1_COMP *const cpi, const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, int plane, int64_t sse, int num_samples, int *rate, int64_t *dist) { (void)num_samples; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; // Fast approximate the modelling function. 
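// When the simple_model_rd_from_var speed feature is enabled, rate and distortion are approximated as linear functions of the SSE and the effective quantizer; otherwise the Laplacian model av1_model_rd_from_var_lapndz() is used.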
if (cpi->sf.rd_sf.simple_model_rd_from_var) { const int64_t square_error = sse; int quantizer = p->dequant_QTX[1] >> dequant_shift; if (quantizer < 120) *rate = (int)AOMMIN( (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), INT_MAX); else *rate = 0; assert(*rate >= 0); *dist = (square_error * quantizer) >> 8; } else { av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], p->dequant_QTX[1] >> dequant_shift, rate, dist); } *dist <<= 4; } // Fits a curve for rate and distortion using as feature: // log2(sse_norm/qstep^2) static inline void model_rd_with_curvfit(const AV1_COMP *const cpi, const MACROBLOCK *const x, BLOCK_SIZE plane_bsize, int plane, int64_t sse, int num_samples, int *rate, int64_t *dist) { (void)cpi; (void)plane_bsize; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1); if (sse == 0) { if (rate) *rate = 0; if (dist) *dist = 0; return; } const double sse_norm = (double)sse / num_samples; const double qstepsqr = (double)qstep * qstep; const double xqr = log2(sse_norm / qstepsqr); double rate_f, dist_by_sse_norm_f; av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f, &dist_by_sse_norm_f); const double dist_f = dist_by_sse_norm_f * sse_norm; int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); // Check if skip is better if (rate_i == 0) { dist_i = sse << 4; } else if (RDCOST(x->rdmult, rate_i, dist_i) >= RDCOST(x->rdmult, 0, sse << 4)) { rate_i = 0; dist_i = sse << 4; } if (rate) *rate = rate_i; if (dist) *dist = dist_i; } static inline void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
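// (The divide-by-8 happens inside the per-plane model call via dequant_shift: p->dequant_QTX[1] is shifted right by 3 for 8-bit content, or by xd->bd - 5 for high bit depth.)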
int plane; const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; int64_t total_sse = 0; assert(bsize < BLOCK_SIZES_ALL); for (plane = plane_from; plane <= plane_to; ++plane) { if (plane && !xd->is_chroma_ref) break; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); assert(plane_bsize < BLOCK_SIZES_ALL); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; int64_t sse; int rate; int64_t dist; sse = calculate_sse(xd, p, pd, bw, bh); model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; rate_sum += rate; dist_sum += dist; if (plane_rate) plane_rate[plane] = rate; if (plane_sse) plane_sse[plane] = sse; if (plane_dist) plane_dist[plane] = dist; assert(rate_sum >= 0); } if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; if (skip_sse_sb) *skip_sse_sb = total_sse << 4; rate_sum = AOMMIN(rate_sum, INT_MAX); *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } static inline void model_rd_for_sb_with_curvfit( const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. const int ref = xd->mi[0]->ref_frame[0]; int64_t rate_sum = 0; int64_t dist_sum = 0; int64_t total_sse = 0; for (int plane = plane_from; plane <= plane_to; ++plane) { if (plane && !xd->is_chroma_ref) break; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); int64_t dist, sse; int rate; int bw, bh; const struct macroblock_plane *const p = &x->plane[plane]; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); sse = calculate_sse(xd, p, pd, bw, bh); model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); total_sse += sse; rate_sum += rate; dist_sum += dist; if (plane_rate) plane_rate[plane] = rate; if (plane_sse) plane_sse[plane] = sse; if (plane_dist) plane_dist[plane] = dist; } if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0; if (skip_sse_sb) *skip_sse_sb = total_sse << 4; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum; } enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType); static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { model_rd_for_sb, model_rd_for_sb_with_curvfit }; static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { model_rd_from_sse, model_rd_with_curvfit }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_MODEL_RD_H_ aom-3.12.1/av1/encoder/motion_search_facade.c000066400000000000000000001266721477627663500210060ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/reconinter.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" #include "av1/encoder/interp_search.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/tx_search.h" #define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3) typedef struct { int_mv fmv; int weight; } cand_mv_t; static int compare_weight(const void *a, const void *b) { const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight; if (diff < 0) return 1; else if (diff > 0) return -1; return 0; } // Allow more mesh searches for screen content type on the ARF. static int use_fine_search_interval(const AV1_COMP *const cpi) { return cpi->is_screen_content_type && cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE && cpi->oxcf.speed <= 2; } // Iterate through the tpl and collect the mvs to be used as candidates static inline void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int ref, cand_mv_t *cand, int *cand_count, int *total_cand_weight) { const SuperBlockEnc *sb_enc = &x->sb_enc; if (!sb_enc->tpl_data_count) { return; } const AV1_COMMON *cm = &cpi->common; const MACROBLOCKD *xd = &x->e_mbd; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const int tplw = mi_size_wide[tpl_bsize]; const int tplh = mi_size_high[tpl_bsize]; const int nw = mi_size_wide[bsize] / tplw; const int nh = mi_size_high[bsize] / tplh; if (nw >= 1 && nh >= 1) { const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size]; const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size]; const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw; int valid = 1; // Assign large weight to start_mv, so it is always tested. 
cand[0].weight = nw * nh; for (int k = 0; k < nh; k++) { for (int l = 0; l < nw; l++) { const int_mv mv = sb_enc ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME]; if (mv.as_int == INVALID_MV) { valid = 0; break; } const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row), GET_MV_RAWPEL(mv.as_mv.col) }; int unique = 1; for (int m = 0; m < *cand_count; m++) { if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) && RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) { unique = 0; cand[m].weight++; break; } } if (unique) { cand[*cand_count].fmv.as_fullmv = fmv; cand[*cand_count].weight = 1; (*cand_count)++; } } if (!valid) break; } if (valid) { *total_cand_weight = 2 * nh * nw; if (*cand_count > 2) qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight); } } } void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, int_mv *best_mv, struct HandleInterModeArgs *const args) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; const int num_planes = av1_num_planes(cm); MB_MODE_INFO *mbmi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; int bestsme = INT_MAX; const int ref = mbmi->ref_frame[ref_idx]; const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const MvCosts *mv_costs = x->mv_costs; if (scaled_ref_frame) { // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // full-pixel motion search code to be used without additional // modifications. for (int i = 0; i < num_planes; i++) { backup_yv12[i] = xd->plane[i].pre[ref_idx]; } av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, num_planes); } // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. int step_param; if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) { // Take the weighted average of the step_params based on the last frame's // max mv magnitude and that based on the best ref mvs of the current // block for the given reference. step_param = (av1_init_search_range(x->max_mv_context[ref]) + mv_search_params->mv_step_param) / 2; } else { step_param = mv_search_params->mv_step_param; } const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; FULLPEL_MV start_mv; if (mbmi->motion_mode != SIMPLE_TRANSLATION) start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv); else start_mv = get_fullmv_from_mv(&ref_mv); // cand stores start_mv and all possible MVs in a SB. cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1]; av1_zero(cand); cand[0].fmv.as_fullmv = start_mv; int cnt = 1; int total_weight = 0; if (!cpi->sf.mv_sf.full_pixel_search_level && mbmi->motion_mode == SIMPLE_TRANSLATION) { get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight); } const int cand_cnt = AOMMIN(2, cnt); // TODO(any): Test the speed feature for OBMC_CAUSAL mode. if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv && mbmi->motion_mode == SIMPLE_TRANSLATION) { const int stack_size = args->start_mv_cnt; for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) { int_mv *fmv_cand = &cand[cand_idx].fmv; int skip_cand_mv = 0; // Check difference between mvs in the stack and candidate mv. 
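// At skip_fullpel_search_using_startmv >= 2 a candidate is pruned when both the row and column differences are at most 1 full-pel unit; at level 1 it is pruned when their sum is at most 1.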
for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) { const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx]; const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx]; const int this_newmv_valid = args->single_newmv_valid[this_ref_mv_idx][ref]; const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row); const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col); if (!this_newmv_valid) continue; if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) { // Prunes the current start_mv candidate if the absolute mv // differences of both row and column are <= 1. if (row_diff <= 1 && col_diff <= 1) { skip_cand_mv = 1; break; } } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) { // Prunes the current start_mv candidate if the sum of the absolute // mv differences of row and column is <= 1. if (row_diff + col_diff <= 1) { skip_cand_mv = 1; break; } } } if (skip_cand_mv) { // Ensure at least one full-pel motion search is not pruned. assert(mbmi->ref_mv_idx != 0); // Mark the candidate mv as invalid so that motion search gets skipped. cand[cand_idx].fmv.as_int = INVALID_MV; } else { // Store start_mv candidate and corresponding ref_mv_idx of full-pel // search in the mv stack (except last ref_mv_idx). if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) { assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2); args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv; args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx; args->start_mv_cnt++; } } } } // Hot fix for asan complaints when resize mode is on. When resize mode is on, // the stride of the reference frame can be different from that indicated by // MotionVectorSearchParams::search_site_cfg. When this happens, we need to // readjust the stride. const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, mv_sf, bsize); const search_site_config *src_search_site_cfg = av1_get_search_site_config(cpi, x, search_method); // Further reduce the search range. if (search_range < INT_MAX) { const search_site_config *search_site_cfg = &src_search_site_cfg[search_method_lookup[search_method]]; // Max step_param is search_site_cfg->num_search_steps. if (search_range < 1) { step_param = search_site_cfg->num_search_steps; } else { while (search_site_cfg->radius[search_site_cfg->num_search_steps - step_param - 1] > (search_range << 1) && search_site_cfg->num_search_steps - step_param - 1 > 0) step_param++; } } int cost_list[5]; FULLPEL_MV_STATS best_mv_stats; int_mv second_best_mv; best_mv->as_int = second_best_mv.as_int = INVALID_MV; // Allow more mesh searches for screen content type on the ARF.
const int fine_search_interval = use_fine_search_interval(cpi); FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: { // Perform a search with the top 2 candidates int sum_weight = 0; for (int m = 0; m < cand_cnt; m++) { int_mv smv = cand[m].fmv; FULLPEL_MV this_best_mv, this_second_best_mv; FULLPEL_MV_STATS this_mv_stats; if (smv.as_int == INVALID_MV) continue; av1_make_default_fullpel_ms_params( &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv, src_search_site_cfg, search_method, fine_search_interval); const int thissme = av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &this_best_mv, &this_mv_stats, &this_second_best_mv); if (thissme < bestsme) { bestsme = thissme; best_mv->as_fullmv = this_best_mv; best_mv_stats = this_mv_stats; second_best_mv.as_fullmv = this_second_best_mv; } sum_weight += cand[m].weight; if (4 * sum_weight > 3 * total_weight) break; } } break; case OBMC_CAUSAL: av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, start_mv, src_search_site_cfg, search_method, fine_search_interval); bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params, step_param, &best_mv->as_fullmv); break; default: assert(0 && "Invalid motion mode!\n"); } if (best_mv->as_int == INVALID_MV) return; if (scaled_ref_frame) { // Swap back the original buffers for subpel motion search. for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[ref_idx] = backup_yv12[i]; } } // Terminate search with the current ref_idx based on fullpel mv, rate cost, // and other known costs. if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 && mbmi->motion_mode == SIMPLE_TRANSLATION && best_mv->as_int != INVALID_MV) { int_mv this_mv; this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); const int ref_mv_idx = mbmi->ref_mv_idx; const int this_mv_rate = av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int; mode_info[ref_mv_idx].full_mv_rate = this_mv_rate; mode_info[ref_mv_idx].full_mv_bestsme = bestsme; for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { // Check if the motion search result is the same as previous results if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) { // Compare the rate cost const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate + mode_info[prev_ref_idx].drl_cost; const int this_rate_cost = this_mv_rate + mode_info[ref_mv_idx].drl_cost; if (prev_rate_cost <= this_rate_cost) { // If the current rate_cost is worse than the previous rate_cost, then // we terminate the search. Since av1_single_motion_search is only // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the // best_mv to INVALID mv to signal that we wish to terminate search // for the current mode. best_mv->as_int = INVALID_MV; return; } } // Terminate the evaluation of current ref_mv_idx based on bestsme and // drl_cost. const int psme = mode_info[prev_ref_idx].full_mv_bestsme; if (psme == INT_MAX) continue; const int thr = cpi->sf.inter_sf.skip_newmv_in_drl == 3 ?
(psme + (psme >> 2)) : psme; if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 && mode_info[ref_mv_idx].full_mv_bestsme > thr && mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) { best_mv->as_int = INVALID_MV; return; } } } if (cpi->common.features.cur_frame_force_integer_mv) { convert_fullmv_to_mv(best_mv); } const int use_fractional_mv = bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; int best_mv_rate = 0; int mv_rate_calculated = 0; if (use_fractional_mv) { int_mv fractional_ms_list[3]; av1_set_fractional_mv(fractional_ms_list); int dis; /* TODO: use dis in distortion calculation later. */ SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, cost_list); MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); switch (mbmi->motion_mode) { case SIMPLE_TRANSLATION: if (mv_sf->use_accurate_subpel_search) { const int try_second = second_best_mv.as_int != INVALID_MV && second_best_mv.as_int != best_mv->as_int && (mv_sf->disable_second_mv <= 1); const int best_mv_var = mv_search_params->find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list); if (try_second) { struct macroblockd_plane *p = xd->plane; const BUFFER_SET orig_dst = { { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, }; int64_t rd = INT64_MAX; if (!mv_sf->disable_second_mv) { // Calculate actual rd cost. mbmi->mv[0].as_mv = best_mv->as_mv; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0, 0); av1_subtract_plane(x, bsize, 0); RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize, max_txsize_rect_lookup[bsize]); int this_mv_rate = av1_mv_bit_cost( &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate, this_rd_stats.dist); } MV this_best_mv; subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { unsigned int sse; const int this_var = mv_search_params->find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis, &sse, fractional_ms_list); if (!mv_sf->disable_second_mv) { // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost // to choose the better MV. mbmi->mv[0].as_mv = this_best_mv; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0, 0); av1_subtract_plane(x, bsize, 0); RD_STATS tmp_rd_stats; av1_init_rd_stats(&tmp_rd_stats); av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize, max_txsize_rect_lookup[bsize]); int tmp_mv_rate = av1_mv_bit_cost( &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); int64_t tmp_rd = RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate, tmp_rd_stats.dist); if (tmp_rd < rd) { best_mv->as_mv = this_best_mv; x->pred_sse[ref] = sse; } } else { // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the // best MV. 
if (this_var < best_mv_var) { best_mv->as_mv = this_best_mv; x->pred_sse[ref] = sse; } } } } } else { mv_search_params->find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL); } break; case OBMC_CAUSAL: av1_find_best_obmc_sub_pixel_tree_up( xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL); break; default: assert(0 && "Invalid motion mode!\n"); } // Terminate search with the current ref_idx based on subpel mv and rate // cost. if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL && mbmi->motion_mode == SIMPLE_TRANSLATION && best_mv->as_int != INVALID_MV) { const int ref_mv_idx = mbmi->ref_mv_idx; best_mv_rate = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); mv_rate_calculated = 1; for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { if (!args->single_newmv_valid[prev_ref_idx][ref]) continue; // Check if the motion vectors are the same. if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) { // Skip this evaluation if the previous one is skipped. if (mode_info[prev_ref_idx].skip) { mode_info[ref_mv_idx].skip = 1; break; } // Compare the rate cost that we currently know. const int prev_rate_cost = args->single_newmv_rate[prev_ref_idx][ref] + mode_info[prev_ref_idx].drl_cost; const int this_rate_cost = best_mv_rate + mode_info[ref_mv_idx].drl_cost; if (prev_rate_cost <= this_rate_cost) { // If the current rate_cost is worse than the previous rate_cost, // then we terminate the search for this ref_mv_idx. mode_info[ref_mv_idx].skip = 1; break; } } } } } if (mv_rate_calculated) { *rate_mv = best_mv_rate; } else { *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); } } int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, const uint8_t *mask, int mask_stride, int *rate_mv, int allow_second_mv, int joint_me_num_refine_iter) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; const int plane = 0; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; const MvCosts *mv_costs = x->mv_costs; int_mv ref_mv[2]; int ite, ref; // Get the prediction block from the 'other' reference frame. const int_interpfilters interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); InterPredParams inter_pred_params; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // Do joint motion search in compound mode to get more accurate mv. struct buf_2d backup_yv12[2][MAX_MB_PLANE]; int last_besterr[2] = { INT_MAX, INT_MAX }; const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { av1_get_scaled_ref_frame(cpi, refs[0]), av1_get_scaled_ref_frame(cpi, refs[1]) }; // Prediction buffer from second frame. DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); int_mv best_mv, second_best_mv; // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv.
for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) { struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int id = ite % 2; // Even iterations search in the first reference frame, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { if (cur_mv[id].as_int == init_mv[id].as_int) { break; } else { int_mv cur_int_mv, init_int_mv; cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; if (cur_int_mv.as_int == init_int_mv.as_int) { break; } } } for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = av1_get_ref_mv(x, ref); // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. if (scaled_ref_frame[ref]) { int i; for (i = 0; i < num_planes; i++) backup_yv12[ref][i] = xd->plane[i].pre[ref]; av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL, num_planes); } } assert(IMPLIES(scaled_ref_frame[0] != NULL, cm->width == scaled_ref_frame[0]->y_crop_width && cm->height == scaled_ref_frame[0]->y_crop_height)); assert(IMPLIES(scaled_ref_frame[1] != NULL, cm->width == scaled_ref_frame[1]->y_crop_width && cm->height == scaled_ref_frame[1]->y_crop_height)); // Initialize based on (possibly scaled) prediction buffers. ref_yv12[0] = xd->plane[plane].pre[0]; ref_yv12[1] = xd->plane[plane].pre[1]; av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE, mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, &cm->sf_identity, &ref_yv12[!id], interp_filters); inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); // Since we have scaled the reference frames to match the size of the // current frame we must use a unit scaling factor during mode selection. av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv, &inter_pred_params); // Do full-pixel compound motion search on the current reference frame. if (id) xd->plane[plane].pre[0] = ref_yv12[id]; // Make motion search params FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; FULLPEL_MV_STATS best_mv_stats; const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, mv_sf, bsize); const search_site_config *src_search_sites = av1_get_search_site_config(cpi, x, search_method); // Use the mv result from the single mode as mv predictor. const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv[id].as_mv, start_fullmv, src_search_sites, search_method, /*fine_search_interval=*/0); av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, mask_stride, id); // Small-range full-pixel motion search. if (!mv_sf->disable_extensive_joint_motion_search && mbmi->interinter_comp.type != COMPOUND_WEDGE) { bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, &best_mv.as_fullmv, &best_mv_stats, &second_best_mv.as_fullmv); } else { bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, &best_mv.as_fullmv); second_best_mv = best_mv; } const int try_second = second_best_mv.as_int != INVALID_MV && second_best_mv.as_int != best_mv.as_int && allow_second_mv; // Restore the pointer to the first (possibly scaled) prediction buffer. 
if (id) xd->plane[plane].pre[0] = ref_yv12[0]; for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Swap back the original buffers for subpel motion search. for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[ref] = backup_yv12[ref][i]; } // Re-initialize based on unscaled prediction buffers. ref_yv12[ref] = xd->plane[plane].pre[ref]; } } // Do sub-pixel compound motion search on the current reference frame. if (id) xd->plane[plane].pre[0] = ref_yv12[id]; if (cpi->common.features.cur_frame_force_integer_mv) { convert_fullmv_to_mv(&best_mv); } if (bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv[id].as_mv, NULL); av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, mask, mask_stride, id); ms_params.forced_stop = EIGHTH_PEL; MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); bestsme = cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL); if (try_second) { MV this_best_mv; MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { const int thissme = cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis, &sse, NULL); if (thissme < bestsme) { best_mv.as_mv = this_best_mv; bestsme = thissme; } } } } // Restore the pointer to the first prediction buffer. if (id) xd->plane[plane].pre[0] = ref_yv12[0]; if (bestsme < last_besterr[id]) { cur_mv[id] = best_mv; last_besterr[id] = bestsme; } else { break; } } *rate_mv = 0; for (ref = 0; ref < 2; ++ref) { const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); } return AOMMIN(last_besterr[0], last_besterr[1]); } // Search for the best mv for one component of a compound, // given that the other component is fixed. int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const int ref = mbmi->ref_frame[ref_idx]; const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); struct macroblockd_plane *const pd = &xd->plane[0]; const MvCosts *mv_costs = x->mv_costs; struct buf_2d backup_yv12[MAX_MB_PLANE]; const YV12_BUFFER_CONFIG *const scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); // Check that this is either an interinter or an interintra block assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); // Store the first prediction buffer. struct buf_2d orig_yv12; if (ref_idx) { orig_yv12 = pd->pre[0]; pd->pre[0] = pd->pre[ref_idx]; } if (scaled_ref_frame) { // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // full-pixel motion search code to be used without additional // modifications. 
for (int i = 0; i < num_planes; i++) { backup_yv12[i] = xd->plane[i].pre[ref_idx]; } const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // The index below needs to be 0 instead of ref_idx since we assume the // 0th slot to be used for subsequent searches. Note that the ref_idx // reference buffer has been copied to the 0th slot in the code above. // Now we need to swap the reference frame for the 0th slot. av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, num_planes); } int bestsme = INT_MAX; int_mv best_mv; // Make motion search params FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; FULLPEL_MV_STATS best_mv_stats; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); const search_site_config *src_search_sites = av1_get_search_site_config(cpi, x, search_method); // Use the mv result from the single mode as mv predictor. const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv.as_mv, start_fullmv, src_search_sites, search_method, /*fine_search_interval=*/0); av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, mask_stride, ref_idx); // Small-range full-pixel motion search. bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, &best_mv.as_fullmv, &best_mv_stats, NULL); if (scaled_ref_frame) { // Swap back the original buffers for subpel motion search for the 0th slot. for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = backup_yv12[i]; } } if (cpi->common.features.cur_frame_force_integer_mv) { convert_fullmv_to_mv(&best_mv); } const int use_fractional_mv = bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; if (use_fractional_mv) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, NULL); av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, mask, mask_stride, ref_idx); ms_params.forced_stop = EIGHTH_PEL; MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); bestsme = cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis, &sse, NULL); } // Restore the pointer to the first unscaled prediction buffer. 
if (ref_idx) pd->pre[0] = orig_yv12; if (bestsme < INT_MAX) *this_mv = best_mv.as_mv; *rate_mv = 0; *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, MV_COST_WEIGHT); return bestsme; } static inline void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const MV *other_mv, int ref_idx, uint8_t *second_pred) { const AV1_COMMON *const cm = &cpi->common; const int pw = block_size_wide[bsize]; const int ph = block_size_high[bsize]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x); const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y); // This function should only ever be called for compound modes assert(has_second_ref(mbmi)); const int plane = 0; struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; struct scale_factors sf; av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, cm->width, cm->height); InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, &sf, &ref_yv12, mbmi->interp_filters); inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); // Get the prediction block from the 'other' reference frame. av1_enc_build_one_inter_predictor(second_pred, pw, other_mv, &inter_pred_params); } // Wrapper for av1_compound_single_motion_search, for the common case // where the second prediction is also an inter mode. static int compound_single_motion_search_interinter( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { MACROBLOCKD *xd = &x->e_mbd; // This function should only ever be called for compound modes assert(has_second_ref(xd->mi[0])); // Prediction buffer from second frame. DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); uint8_t *second_pred; if (is_cur_buf_hbd(xd)) second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); else second_pred = (uint8_t *)second_pred_alloc_16; MV *this_mv = &cur_mv[ref_idx].as_mv; const MV *other_mv = &cur_mv[!ref_idx].as_mv; build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred); return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask, mask_stride, rate_mv, ref_idx); } static inline void do_masked_motion_search_indexed( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, int_mv *tmp_mv, int *rate_mv, int which) { // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; BLOCK_SIZE sb_type = mbmi->bsize; const uint8_t *mask; const int mask_stride = block_size_wide[bsize]; mask = av1_get_compound_type_mask(comp_data, sb_type); tmp_mv[0].as_int = cur_mv[0].as_int; tmp_mv[1].as_int = cur_mv[1].as_int; if (which == 0 || which == 1) { compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv, which); } else if (which == 2) { const int joint_me_num_refine_iter = cpi->sf.inter_sf.enable_fast_compound_mode_search == 2 ? 
REDUCED_JOINT_ME_REFINE_ITER : NUM_JOINT_ME_REFINE_ITER; av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv, !cpi->sf.mv_sf.disable_second_mv, joint_me_num_refine_iter); } } int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; int_mv tmp_mv[2]; int tmp_rate_mv = 0; // TODO(jingning): The average compound mode has proper SAD and variance // functions implemented, and is triggered by setting the mask pointer to // NULL. Need to further implement those for frame distance weighted mode. mbmi->interinter_comp.seg_mask = mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask; const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; if (this_mode == NEW_NEWMV) { do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, tmp_mv, &tmp_rate_mv, 2); mbmi->mv[0].as_int = tmp_mv[0].as_int; mbmi->mv[1].as_int = tmp_mv[1].as_int; } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) { // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV int which = (NEWMV == compound_ref1_mode(this_mode)); do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, tmp_mv, &tmp_rate_mv, which); mbmi->mv[which].as_int = tmp_mv[which].as_int; } return tmp_rate_mv; } int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, int ref, FULLPEL_MV start_mv, int num_planes, int use_subpixel, unsigned int *sse, unsigned int *var) { assert(num_planes == 1 && "Currently simple_motion_search only supports luma plane"); assert(!frame_is_intra_only(&cpi->common) && "Simple motion search only enabled for non-key frames"); AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->bsize = bsize; mbmi->ref_frame[0] = ref; mbmi->ref_frame[1] = NONE_FRAME; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref); const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref); struct buf_2d backup_yv12; // ref_mv is used to calculate the cost of the motion vector const MV ref_mv = kZeroMv; const int step_param = AOMMIN(cpi->mv_search_params.mv_step_param + cpi->sf.part_sf.simple_motion_search_reduce_search_steps, MAX_MVSEARCH_STEPS - 2); int cost_list[5]; const int ref_idx = 0; int bestsme; int_mv best_mv; FULLPEL_MV_STATS best_mv_stats; av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, get_ref_scale_factors(cm, ref), num_planes); set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); if (scaled_ref_frame) { backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, num_planes); } // Allow more mesh searches for screen content type on the ARF.
const int fine_search_interval = use_fine_search_interval(cpi); FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, mv_sf, bsize); const search_site_config *src_search_sites = av1_get_search_site_config(cpi, x, search_method); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, start_mv, src_search_sites, search_method, fine_search_interval); bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, &best_mv_stats, NULL); const int use_subpel_search = bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv && use_subpixel && (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL); if (scaled_ref_frame) { xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; } if (use_subpel_search) { int not_used = 0; SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, cost_list); // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params(). ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop; MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv, ¬_used, &x->pred_sse[ref], NULL); mbmi->mv[0] = best_mv; // Get a copy of the prediction output av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); *var = cpi->ppi->fn_ptr[bsize].vf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, sse); } else { // Manually convert from units of pixel to 1/8-pixels if we are not doing // subpel search convert_fullmv_to_mv(&best_mv); *var = best_mv_stats.distortion; *sse = best_mv_stats.sse; } return best_mv; } aom-3.12.1/av1/encoder/motion_search_facade.h000066400000000000000000000137301477627663500210010ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_ #define AOM_AV1_ENCODER_MOTION_SEARCH_H_ #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif #define NUM_JOINT_ME_REFINE_ITER 2 #define REDUCED_JOINT_ME_REFINE_ITER 1 // TODO(any): rename this struct to something else. There is already another // struct called inter_modes_info, which makes this terribly confusing. 
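// Per-ref_mv_idx record of the most recent NEWMV motion search (full-pel MV, its rate and error, the DRL cost, and a skip flag), used by av1_single_motion_search() to prune the search for later DRL indices.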
typedef struct { int drl_cost; int_mv full_search_mv; int full_mv_rate; int full_mv_bestsme; int skip; } inter_mode_info; struct HandleInterModeArgs; void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, int_mv *best_mv, struct HandleInterModeArgs *const args); int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, const uint8_t *mask, int mask_stride, int *rate_mv, int allow_second_mv, int joint_me_num_refine_iter); int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode); int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv, const uint8_t *second_pred, const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); // Performs a motion search in SIMPLE_TRANSLATION mode using reference frame // ref and calculates the sse and var of the residue. Note that this sets the // offset of mbmi, so we will need to reset it after calling this function. int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, int ref, const FULLPEL_MV start_mv, int num_planes, int use_subpixel, unsigned int *sse, unsigned int *var); static inline const search_site_config *av1_get_search_site_config( const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { const int ref_stride = x->e_mbd.plane[0].pre[0].stride; // AV1_COMP::mv_search_params.search_site_config is a compressor level cache // that's shared by multiple threads. In most cases where all frames have the // same resolution, the cache contains the search site config that we need. const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) { return mv_search_params->search_site_cfg[SS_CFG_SRC]; } else if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]->stride) { return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]; } // If the cache does not contain the correct stride, then we will need to rely // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the // thread level config doesn't match, then we need to update it. search_method = search_method_lookup[search_method]; assert(search_method_lookup[search_method] == search_method && "The search_method_lookup table should be idempotent."); if (ref_stride != x->search_site_cfg_buf[search_method].stride) { av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, ref_stride); } return x->search_site_cfg_buf; } static inline SEARCH_METHODS av1_get_faster_search_method( SEARCH_METHODS search_method) { // Note on search method's accuracy: // 1. NSTEP // 2. DIAMOND // 3. BIGDIA \approx SQUARE // 4. HEX. // 5. 
FAST_HEX \approx FAST_DIAMOND switch (search_method) { case NSTEP: return DIAMOND; case NSTEP_8PT: return DIAMOND; case DIAMOND: return BIGDIA; case CLAMPED_DIAMOND: return BIGDIA; case BIGDIA: return HEX; case SQUARE: return HEX; case HEX: return FAST_HEX; case FAST_HEX: return FAST_HEX; case FAST_DIAMOND: return VFAST_DIAMOND; case FAST_BIGDIA: return FAST_BIGDIA; case VFAST_DIAMOND: return VFAST_DIAMOND; default: assert(0 && "Invalid search method!"); return DIAMOND; } } static inline SEARCH_METHODS av1_get_default_mv_search_method( const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) { SEARCH_METHODS search_method = mv_sf->search_method; const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method; const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]); const int qband = x->qindex >> (QINDEX_BITS - 2); const bool use_faster_search_method = (sf_blk_search_method == 1 && min_dim >= 32) || (sf_blk_search_method >= 2 && min_dim >= 16 && x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3); if (use_faster_search_method) { search_method = av1_get_faster_search_method(search_method); } return search_method; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_ aom-3.12.1/av1/encoder/mv_prec.c000066400000000000000000000402231477627663500163070ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_config.h" #include "av1/encoder/encodemv.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/misc_model_weights.h" #endif // !CONFIG_REALTIME_ONLY #include "av1/encoder/mv_prec.h" #if !CONFIG_REALTIME_ONLY static inline int_mv get_ref_mv_for_mv_stats( const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, int ref_idx) { int ref_mv_idx = mbmi->ref_mv_idx; if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { assert(has_second_ref(mbmi)); ref_mv_idx += 1; } const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame; const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; if (ref_frames[1] > INTRA_FRAME) { assert(ref_idx == 0 || ref_idx == 1); return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv : curr_ref_mv_stack[ref_mv_idx].this_mv; } assert(ref_idx == 0); return ref_mv_idx < mbmi_ext_frame->ref_mv_count ? curr_ref_mv_stack[ref_mv_idx].this_mv : mbmi_ext_frame->global_mvs[ref_frame_type]; } static inline int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0; const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); return av1_cost_symbol(p15); } static inline int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, int comp_idx, const AV1_COMP *cpi, int *rates) { assert(comp_val != 0 && "mv component should not have zero value!"); const int sign = comp_val < 0; const int mag = sign ? 
-comp_val : comp_val; const int mag_minus_1 = mag - 1; int offset; const int mv_class = av1_get_mv_class(mag_minus_1, &offset); const int int_part = offset >> 3; // int mv data const int frac_part = (offset >> 1) & 3; // fractional mv data const int high_part = offset & 1; // high precision mv data const int use_hp = cpi->common.features.allow_high_precision_mv; int r_idx = 0; const MACROBLOCK *const x = &cpi->td.mb; const MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; nmv_context *nmvc = &ec_ctx->nmvc; nmv_component *mvcomp_ctx = nmvc->comps; nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx]; aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf; aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf; aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf; aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf; aom_cdf_prob *frac_part_cdf = mv_class ? (cur_mvcomp_ctx->fp_cdf) : (cur_mvcomp_ctx->class0_fp_cdf[int_part]); aom_cdf_prob *high_part_cdf = mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf); const int sign_rate = get_symbol_cost(sign_cdf, sign); rates[r_idx++] = sign_rate; update_cdf(sign_cdf, sign, 2); const int class_rate = get_symbol_cost(class_cdf, mv_class); rates[r_idx++] = class_rate; update_cdf(class_cdf, mv_class, MV_CLASSES); int int_bit_rate = 0; if (mv_class == MV_CLASS_0) { int_bit_rate = get_symbol_cost(class0_cdf, int_part); update_cdf(class0_cdf, int_part, CLASS0_SIZE); } else { const int n = mv_class + CLASS0_BITS - 1; // number of bits for (int i = 0; i < n; ++i) { int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1); update_cdf(bits_cdf[i], (int_part >> i) & 1, 2); } } rates[r_idx++] = int_bit_rate; const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part); rates[r_idx++] = frac_part_rate; update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE); const int high_part_rate = use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0; if (use_hp) { update_cdf(high_part_cdf, high_part, 2); } rates[r_idx++] = high_part_rate; mv_stats->last_bit_zero += !high_part; mv_stats->last_bit_nonzero += high_part; const int total_rate = (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate); return total_rate; } static inline void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, const MV *cur_mv, const AV1_COMP *cpi) { const MACROBLOCK *const x = &cpi->td.mb; const MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; nmv_context *nmvc = &ec_ctx->nmvc; aom_cdf_prob *joint_cdf = nmvc->joints_cdf; const int use_hp = cpi->common.features.allow_high_precision_mv; const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col }; const int mv_joint = av1_get_mv_joint(&diff); // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp const MV hp_diff = diff; const int hp_mv_joint = av1_get_mv_joint(&hp_diff); const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 }; const MV lp_diff = use_hp ? 
truncated_diff : diff; const int lp_mv_joint = av1_get_mv_joint(&lp_diff); const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint); const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint); const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint); update_cdf(joint_cdf, mv_joint, MV_JOINTS); mv_stats->total_mv_rate += mv_joint_rate; mv_stats->hp_total_mv_rate += hp_mv_joint_rate; mv_stats->lp_total_mv_rate += lp_mv_joint_rate; mv_stats->mv_joint_count[mv_joint]++; for (int comp_idx = 0; comp_idx < 2; comp_idx++) { const int comp_val = comp_idx ? diff.col : diff.row; const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row; const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row; int rates[5]; av1_zero_array(rates, 5); const int comp_rate = comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates) : 0; // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false const int hp_rate = hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0; const int lp_rate = lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0; mv_stats->total_mv_rate += comp_rate; mv_stats->hp_total_mv_rate += hp_rate; mv_stats->lp_total_mv_rate += lp_rate; } } static inline void collect_mv_stats_b(MV_STATS *mv_stats, const AV1_COMP *cpi, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) { return; } const MB_MODE_INFO *mbmi = mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col]; const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = cpi->mbmi_ext_info.frame_base + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, cpi->mbmi_ext_info.stride); if (!is_inter_block(mbmi)) { mv_stats->intra_count++; return; } mv_stats->inter_count++; const PREDICTION_MODE mode = mbmi->mode; const int is_compound = has_second_ref(mbmi); if (mode == NEWMV || mode == NEW_NEWMV) { // All mvs are new for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) { const MV ref_mv = get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; const MV cur_mv = mbmi->mv[ref_idx].as_mv; keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV || mode == NEW_NEARESTMV || mode == NEW_NEARMV) { // has exactly one new_mv mv_stats->default_mvs += 1; const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV); const MV ref_mv = get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; const MV cur_mv = mbmi->mv[ref_idx].as_mv; keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); } else { // No new_mv mv_stats->default_mvs += 1 + is_compound; } // Add texture information const BLOCK_SIZE bsize = mbmi->bsize; const int num_rows = block_size_high[bsize]; const int num_cols = block_size_wide[bsize]; const int y_stride = cpi->source->y_stride; const int px_row = 4 * mi_row, px_col = 4 * mi_col; const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; const int bd = cm->seq_params->bit_depth; if (buf_is_hbd) { uint16_t *source_buf = CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; for (int row = 0; row < num_rows - 1; row++) { for (int col = 0; col < num_cols - 1; col++) { const int offset = row * y_stride + col; const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8); const int vert_diff = abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8); mv_stats->horz_text += horz_diff; mv_stats->vert_text 
+= vert_diff; mv_stats->diag_text += horz_diff * vert_diff; } } } else { uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col; for (int row = 0; row < num_rows - 1; row++) { for (int col = 0; col < num_cols - 1; col++) { const int offset = row * y_stride + col; const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]); const int vert_diff = abs(source_buf[offset + y_stride] - source_buf[offset]); mv_stats->horz_text += horz_diff; mv_stats->vert_text += vert_diff; mv_stats->diag_text += horz_diff * vert_diff; } } } } // Split block static inline void collect_mv_stats_sb(MV_STATS *mv_stats, const AV1_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize) { assert(bsize < BLOCK_SIZES_ALL); const AV1_COMMON *cm = &cpi->common; if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) return; const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); const int hbs = mi_size_wide[bsize] / 2; const int qbs = mi_size_wide[bsize] / 4; switch (partition) { case PARTITION_NONE: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); break; case PARTITION_HORZ: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); break; case PARTITION_VERT: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); break; case PARTITION_SPLIT: collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize); collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize); collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize); collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize); break; case PARTITION_HORZ_A: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); break; case PARTITION_HORZ_B: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); break; case PARTITION_VERT_A: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); break; case PARTITION_VERT_B: collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); break; case PARTITION_HORZ_4: for (int i = 0; i < 4; ++i) { const int this_mi_row = mi_row + i * qbs; collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col); } break; case PARTITION_VERT_4: for (int i = 0; i < 4; ++i) { const int this_mi_col = mi_col + i * qbs; collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col); } break; default: assert(0); } } static inline void collect_mv_stats_tile(MV_STATS *mv_stats, const AV1_COMP *cpi, const TileInfo *tile_info) { const AV1_COMMON *cm = &cpi->common; const int mi_row_start = tile_info->mi_row_start; const int mi_row_end = tile_info->mi_row_end; const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; const int sb_size_mi = cm->seq_params->mib_size; BLOCK_SIZE sb_size = cm->seq_params->sb_size; for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); } } } void 
av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { MV_STATS *mv_stats = &cpi->mv_stats; const AV1_COMMON *cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { TileInfo tile_info; av1_tile_set_row(&tile_info, cm, tile_row); for (int tile_col = 0; tile_col < tile_cols; tile_col++) { const int tile_idx = tile_row * tile_cols + tile_col; av1_tile_set_col(&tile_info, cm, tile_col); cpi->tile_data[tile_idx].tctx = *cm->fc; cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx; collect_mv_stats_tile(mv_stats, cpi, &tile_info); } } mv_stats->q = current_q; mv_stats->order = cpi->common.current_frame.order_hint; mv_stats->valid = 1; } static inline int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, int current_q) { const AV1_COMMON *cm = &cpi->common; const int order_hint = cpi->common.current_frame.order_hint; const int order_diff = order_hint - mv_stats->order; const float area = (float)(cm->width * cm->height); float features[MV_PREC_FEATURE_SIZE] = { (float)current_q, (float)mv_stats->q, (float)order_diff, mv_stats->inter_count / area, mv_stats->intra_count / area, mv_stats->default_mvs / area, mv_stats->mv_joint_count[0] / area, mv_stats->mv_joint_count[1] / area, mv_stats->mv_joint_count[2] / area, mv_stats->mv_joint_count[3] / area, mv_stats->last_bit_zero / area, mv_stats->last_bit_nonzero / area, mv_stats->total_mv_rate / area, mv_stats->hp_total_mv_rate / area, mv_stats->lp_total_mv_rate / area, mv_stats->horz_text / area, mv_stats->vert_text / area, mv_stats->diag_text / area, }; for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) { features[f_idx] = (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx]; } float score = 0.0f; av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score); const int use_high_hp = score >= 0.0f; return use_high_hp; } #endif // !CONFIG_REALTIME_ONLY void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH; #if !CONFIG_REALTIME_ONLY MV_STATS *mv_stats = &cpi->mv_stats; #endif // !CONFIG_REALTIME_ONLY if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) { use_hp = 0; } #if !CONFIG_REALTIME_ONLY else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && av1_frame_allows_smart_mv(cpi) && mv_stats->valid) { use_hp = get_smart_mv_prec(cpi, mv_stats, qindex); } #endif // !CONFIG_REALTIME_ONLY av1_set_high_precision_mv(cpi, use_hp, cpi->common.features.cur_frame_force_integer_mv); } aom-3.12.1/av1/encoder/mv_prec.h000066400000000000000000000041631477627663500163170ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_MV_PREC_H_ #define AOM_AV1_ENCODER_MV_PREC_H_ #include "av1/encoder/encoder.h" #include "av1/encoder/speed_features.h" // Q threshold for high precision mv. 
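// Illustrative usage (a reading aid, not an additional rule): in
// av1_pick_and_set_high_precision_mv() the initial decision is simply
//   use_hp = (qindex < HIGH_PRECISION_MV_QTHRESH);
// so a frame at qindex 100 starts out with 1/8-pel motion vectors while one at
// qindex 160 does not, before the QTR_ONLY or LAST_MV_DATA speed features can
// override that choice.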
#define HIGH_PRECISION_MV_QTHRESH 128 #if !CONFIG_REALTIME_ONLY void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); static inline int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { const int gf_group_index = cpi->gf_frame_index; const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; return !frame_is_intra_only(&cpi->common) && !(gf_update_type == INTNL_OVERLAY_UPDATE || gf_update_type == OVERLAY_UPDATE); } #endif // !CONFIG_REALTIME_ONLY static inline void av1_set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv, int cur_frame_force_integer_mv) { MvCosts *const mv_costs = cpi->td.mb.mv_costs; // Avoid accessing 'mv_costs' when it is not allocated. if (mv_costs == NULL) return; const int copy_hp = cpi->common.features.allow_high_precision_mv = allow_high_precision_mv && !cur_frame_force_integer_mv; mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX]; mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX]; mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX]; mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX]; mv_costs->mv_cost_stack = copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost; } void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex); #endif // AOM_AV1_ENCODER_MV_PREC_H_ aom-3.12.1/av1/encoder/nonrd_opt.c000066400000000000000000001154601477627663500166640ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "av1/common/reconinter.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/nonrd_opt.h" #include "av1/encoder/rdopt.h" static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = { av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16 }; #define DECLARE_BLOCK_YRD_BUFFERS() \ DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \ DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \ DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \ uint16_t eob[1]; #define DECLARE_BLOCK_YRD_VARS() \ /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \ * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \ * as a non-const so we can reassign it to macroblock_plane::coeff. 
*/ \ int16_t *low_coeff = (int16_t *)coeff_buf; \ int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \ int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \ const int diff_stride = bw; #define DECLARE_LOOP_VARS_BLOCK_YRD() \ const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2]; static AOM_FORCE_INLINE void update_yrd_loop_vars( MACROBLOCK *x, int *skippable, int step, int ncoeffs, int16_t *const low_coeff, int16_t *const low_qcoeff, int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost, int tx_blk_id) { const int is_txfm_skip = (ncoeffs == 0); *skippable &= is_txfm_skip; x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip; *eob_cost += get_msb(ncoeffs + 1); if (ncoeffs == 1) this_rdc->rate += (int)abs(low_qcoeff[0]); else if (ncoeffs > 1) this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4); this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2; } static inline void aom_process_hadamard_lp_8x16(MACROBLOCK *x, int max_blocks_high, int max_blocks_wide, int num_4x4_w, int step, int block_step) { struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; const int bw = 4 * num_4x4_w; const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide); int block = 0; for (int r = 0; r < max_blocks_high; r += block_step) { for (int c = 0; c < num_4x4; c += 2 * block_step) { const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2]; int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block); aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff); block += 2 * step; } } } #if CONFIG_AV1_HIGHBITDEPTH #define DECLARE_BLOCK_YRD_HBD_VARS() \ tran_low_t *const coeff = coeff_buf; \ tran_low_t *const qcoeff = qcoeff_buf; \ tran_low_t *const dqcoeff = dqcoeff_buf; static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd( MACROBLOCK *x, int *skippable, int step, int ncoeffs, tran_low_t *const coeff, tran_low_t *const qcoeff, tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost, int tx_blk_id) { const MACROBLOCKD *xd = &x->e_mbd; const int is_txfm_skip = (ncoeffs == 0); *skippable &= is_txfm_skip; x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip; *eob_cost += get_msb(ncoeffs + 1); int64_t dummy; if (ncoeffs == 1) this_rdc->rate += (int)abs(qcoeff[0]); else if (ncoeffs > 1) this_rdc->rate += aom_satd(qcoeff, step << 4); this_rdc->dist += av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2; } #endif /*!\brief Calculates RD Cost using Hadamard transform. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Calculates RD Cost using Hadamard transform. For low bit depth this function * uses low-precision set of functions (16-bit) and 32 bit for high bit depth * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] this_rdc Pointer to calculated RD Cost * \param[in] skippable Pointer to a flag indicating possible tx skip * \param[in] bsize Current block size * \param[in] tx_size Transform size * \param[in] is_inter_mode Flag to indicate inter mode * * \remark Nothing is returned. Instead, calculated RD cost is placed to * \c this_rdc. 
\c skippable flag is set if there is no non-zero quantized * coefficients for Hadamard transform */ void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize, TX_SIZE tx_size) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y]; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; assert(bsize < BLOCK_SIZES_ALL); const int num_4x4_w = mi_size_wide[bsize]; const int num_4x4_h = mi_size_high[bsize]; const int step = 1 << (tx_size << 1); const int block_step = (1 << tx_size); const int row_step = step * num_4x4_w >> tx_size; int block = 0; const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); int eob_cost = 0; const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; const int use_hbd = is_cur_buf_hbd(xd); int num_blk_skip_w = num_4x4_w; #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) { aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } else { aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } #else aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); #endif // Keep the intermediate value on the stack here. Writing directly to // skippable causes speed regression due to load-and-store issues in // update_yrd_loop_vars. int temp_skippable = 1; this_rdc->dist = 0; this_rdc->rate = 0; // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks // can be done per function call. Hence the call of Hadamard txfm is // abstracted here for the specified cases. int is_tx_8x8_dual_applicable = (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 && block_size_high[bsize] >= 8); #if CONFIG_AV1_HIGHBITDEPTH // As of now, dual implementation of hadamard txfm is available for low // bitdepth. if (use_hbd) is_tx_8x8_dual_applicable = 0; #endif if (is_tx_8x8_dual_applicable) { aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w, step, block_step); } const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; DECLARE_BLOCK_YRD_BUFFERS() DECLARE_BLOCK_YRD_VARS() #if CONFIG_AV1_HIGHBITDEPTH DECLARE_BLOCK_YRD_HBD_VARS() #else (void)use_hbd; #endif // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. for (int r = 0; r < max_blocks_high; r += block_step) { for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { DECLARE_LOOP_VARS_BLOCK_YRD() switch (tx_size) { #if CONFIG_AV1_HIGHBITDEPTH case TX_16X16: if (use_hbd) { aom_hadamard_16x16(src_diff, diff_stride, coeff); av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob, // default_scan_fp_16x16_transpose and // av1_default_iscan_fp_16x16_transpose have to be // used together. default_scan_fp_16x16_transpose, av1_default_iscan_fp_16x16_transpose); } else { aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, // default_scan_lp_16x16_transpose and // av1_default_iscan_lp_16x16_transpose have to be // used together. 
default_scan_lp_16x16_transpose, av1_default_iscan_lp_16x16_transpose); } break; case TX_8X8: if (use_hbd) { aom_hadamard_8x8(src_diff, diff_stride, coeff); av1_quantize_fp( coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob, default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); } else { if (is_tx_8x8_dual_applicable) { // The coeffs are pre-computed for the whole block, so re-assign // low_coeff to the appropriate location. const int block_offset = BLOCK_OFFSET(block + s); low_coeff = (int16_t *)p->coeff + block_offset; } else { aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); } av1_quantize_lp( low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, // default_scan_8x8_transpose and // av1_default_iscan_8x8_transpose have to be used together. default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); } break; default: assert(tx_size == TX_4X4); // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate // normal coefficients order, so we don't need to change the scan // order here. if (use_hbd) { aom_fdct4x4(src_diff, coeff, diff_stride); av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob, scan_order->scan, scan_order->iscan); } else { aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, scan_order->scan, scan_order->iscan); } break; #else case TX_16X16: aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, default_scan_lp_16x16_transpose, av1_default_iscan_lp_16x16_transpose); break; case TX_8X8: if (is_tx_8x8_dual_applicable) { // The coeffs are pre-computed for the whole block, so re-assign // low_coeff to the appropriate location. const int block_offset = BLOCK_OFFSET(block + s); low_coeff = (int16_t *)p->coeff + block_offset; } else { aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); } av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); break; default: aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, scan_order->scan, scan_order->iscan); break; #endif } assert(*eob <= 1024); #if CONFIG_AV1_HIGHBITDEPTH if (use_hbd) update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff, dqcoeff, this_rdc, &eob_cost, r * num_blk_skip_w + c); else #endif update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, r * num_blk_skip_w + c); } block += row_step; } this_rdc->skip_txfm = *skippable = temp_skippable; if (this_rdc->sse < INT64_MAX) { this_rdc->sse = (this_rdc->sse << 6) >> 2; if (temp_skippable) { this_rdc->dist = 0; this_rdc->dist = this_rdc->sse; return; } } // If skippable is set, rate gets clobbered later. this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); } // Explicitly enumerate the cases so the compiler can generate SIMD for the // function. According to the disassembler, gcc generates SSE codes for each of // the possible block sizes. 
The hottest case is tx_width 16, which takes up // about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since // av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the // potential room of improvement for writing AVX2 optimization is only 3% * 8% = // 0.24% of total encoding time. static inline void scale_square_buf_vals(int16_t *dst, int tx_width, const int16_t *src, int src_stride) { #define DO_SCALING \ do { \ for (int idy = 0; idy < tx_width; ++idy) { \ for (int idx = 0; idx < tx_width; ++idx) { \ dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \ } \ } \ } while (0) if (tx_width == 4) { DO_SCALING; } else if (tx_width == 8) { DO_SCALING; } else if (tx_width == 16) { DO_SCALING; } else { assert(0); } #undef DO_SCALING } /*!\brief Calculates RD Cost when the block uses Identity transform. * Note that this function is only for low bit depth encoding, since it * is called in real-time mode for now, which sets high bit depth to 0: * -DCONFIG_AV1_HIGHBITDEPTH=0 * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Calculates RD Cost. For low bit depth this function * uses low-precision set of functions (16-bit) and 32 bit for high bit depth * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] pred_buf Pointer to the prediction buffer * \param[in] pred_stride Stride for the prediction buffer * \param[in] this_rdc Pointer to calculated RD Cost * \param[in] skippable Pointer to a flag indicating possible tx skip * \param[in] bsize Current block size * \param[in] tx_size Transform size * * \remark Nothing is returned. Instead, calculated RD cost is placed to * \c this_rdc. \c skippable flag is set if all coefficients are zero. */ void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf, int pred_stride, RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize, TX_SIZE tx_size) { MACROBLOCKD *xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; assert(bsize < BLOCK_SIZES_ALL); const int num_4x4_w = mi_size_wide[bsize]; const int num_4x4_h = mi_size_high[bsize]; const int step = 1 << (tx_size << 1); const int block_step = (1 << tx_size); const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); int eob_cost = 0; const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; const int num_blk_skip_w = num_4x4_w; // Keep the intermediate value on the stack here. Writing directly to // skippable causes speed regression due to load-and-store issues in // update_yrd_loop_vars. int temp_skippable = 1; int tx_wd = 0; const SCAN_ORDER *scan_order = NULL; switch (tx_size) { case TX_64X64: assert(0); // Not implemented break; case TX_32X32: assert(0); // Not used break; case TX_16X16: scan_order = &av1_fast_idtx_scan_order_16x16; tx_wd = 16; break; case TX_8X8: scan_order = &av1_fast_idtx_scan_order_8x8; tx_wd = 8; break; default: assert(tx_size == TX_4X4); scan_order = &av1_fast_idtx_scan_order_4x4; tx_wd = 4; break; } assert(scan_order != NULL); this_rdc->dist = 0; this_rdc->rate = 0; aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pred_buf, pred_stride); // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. 
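// A small worked example of the IDTX fast path below (illustrative only):
// scale_square_buf_vals() implements the identity "transform" as a plain x8
// scaling, so a residual sample of 3 becomes the coefficient 24 handed to
// av1_quantize_lp(). The x8 factor follows the convention noted in
// av1_model_rd_for_sb_uv(): transform coefficients are kept at 8 times an
// orthogonal transform.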
DECLARE_BLOCK_YRD_BUFFERS() DECLARE_BLOCK_YRD_VARS() for (int r = 0; r < max_blocks_high; r += block_step) { for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { DECLARE_LOOP_VARS_BLOCK_YRD() scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride); av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, scan_order->scan, scan_order->iscan); assert(*eob <= 1024); update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, r * num_blk_skip_w + c); } } this_rdc->skip_txfm = *skippable = temp_skippable; if (this_rdc->sse < INT64_MAX) { this_rdc->sse = (this_rdc->sse << 6) >> 2; if (temp_skippable) { this_rdc->dist = 0; this_rdc->dist = this_rdc->sse; return; } } // If skippable is set, rate gets clobbered later. this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); } int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, MACROBLOCK *x, MACROBLOCKD *xd, RD_STATS *this_rdc, int start_plane, int stop_plane) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. unsigned int sse; int rate; int64_t dist; int plane; int64_t tot_sse = 0; this_rdc->rate = 0; this_rdc->dist = 0; this_rdc->skip_txfm = 0; for (plane = start_plane; plane <= stop_plane; ++plane) { struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const uint32_t dc_quant = p->dequant_QTX[0]; const uint32_t ac_quant = p->dequant_QTX[1]; const BLOCK_SIZE bs = plane_bsize; unsigned int var; if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue; var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); assert(sse >= var); tot_sse += sse; av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], dc_quant >> 3, &rate, &dist); this_rdc->rate += rate >> 1; this_rdc->dist += dist << 3; av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3, &rate, &dist); this_rdc->rate += rate; this_rdc->dist += dist << 4; } if (this_rdc->rate == 0) { this_rdc->skip_txfm = 1; } if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >= RDCOST(x->rdmult, 0, tot_sse << 4)) { this_rdc->rate = 0; this_rdc->dist = tot_sse << 4; this_rdc->skip_txfm = 1; } return tot_sse; } static void compute_intra_yprediction(const AV1_COMMON *cm, PREDICTION_MODE mode, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd) { const SequenceHeader *seq_params = cm->seq_params; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; uint8_t *const src_buf_base = p->src.buf; uint8_t *const dst_buf_base = pd->dst.buf; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; int plane = 0; int row, col; // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. const TX_SIZE tx_size = max_txsize_lookup[bsize]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); // If mb_to_right_edge is < 0 we are in a situation in which // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. 
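// Rough numeric illustration (assuming the usual libaom convention that
// mb_to_right_edge is stored in 1/8-pel units): a block overhanging the right
// frame edge by 16 luma pixels has mb_to_right_edge == -128, and the clamping
// below, like the explicit ">> 5" used in av1_block_yrd(), trims 4 columns of
// 4x4 sub-blocks from the area that gets visited.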
const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); const int max_blocks_high = max_block_high(xd, plane_bsize, plane); // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { // Skip visiting the sub blocks that are wholly within the UMV. for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0, FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride, 0, 0, plane); } } p->src.buf = src_buf_base; pd->dst.buf = dst_buf_base; } // Checks whether Intra mode needs to be pruned based on // 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_blksad' // speed features. static inline bool is_prune_intra_mode( AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize, uint8_t segment_id, SOURCE_SAD source_sad_nonrd, uint8_t color_sensitivity[MAX_MB_PLANE - 1]) { const PREDICTION_MODE this_mode = intra_mode_list[mode_index]; if (mode_index > 2 || force_intra_check == 0) { if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize])) return true; if (this_mode == DC_PRED) return false; if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false; const bool has_color_sensitivity = color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] && color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]; if (has_color_sensitivity && (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad || cyclic_refresh_segment_id_boosted(segment_id) || source_sad_nonrd > kMedSad)) return false; return true; } return false; } /*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Calculates RD Cost for an intra mode for a single TX block using Hadamard * transform. * \param[in] plane Color plane * \param[in] block Index of a TX block in a prediction block * \param[in] row Row of a current TX block * \param[in] col Column of a current TX block * \param[in] plane_bsize Block size of a current prediction block * \param[in] tx_size Transform size * \param[in] arg Pointer to a structure that holds parameters * for intra mode search * * \remark Nothing is returned. 
Instead, best mode and RD Cost of the best mode * are set in \c args->rdc and \c args->mode */ void av1_estimate_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct estimate_block_intra_args *const args = arg; AV1_COMP *const cpi = args->cpi; AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; uint8_t *const src_buf_base = p->src.buf; uint8_t *const dst_buf_base = pd->dst.buf; const int64_t src_stride = p->src.stride; const int64_t dst_stride = pd->dst.stride; (void)block; av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); if (args->prune_mode_based_on_sad || args->prune_palette_sad) { unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf( p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); const unsigned int sad_threshold = args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4) : UINT_MAX; // Skip the evaluation of current mode if its SAD is more than a threshold. if (args->prune_mode_based_on_sad && this_sad > sad_threshold) { // For the current mode, set rate and distortion to maximum possible // values and return. // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip // the evaluation of the current mode. args->rdc->rate = INT_MAX; args->rdc->dist = INT64_MAX; return; } if (this_sad < args->best_sad) { args->best_sad = this_sad; } } RD_STATS this_rdc; av1_invalid_rd_stats(&this_rdc); p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; if (plane == 0) { av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx, AOMMIN(tx_size, TX_16X16)); } else { av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane); } p->src.buf = src_buf_base; pd->dst.buf = dst_buf_base; assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX); args->rdc->rate += this_rdc.rate; args->rdc->dist += this_rdc.dist; } /*!\brief Estimates best intra mode for inter mode search * * \ingroup nonrd_mode_search * \callgraph * \callergraph * * Using heuristics based on best inter mode, block size, and other decides * whether to check intra modes. If so, estimates and selects best intra mode * from the reduced set of intra modes (max 4 intra modes checked) * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the * data for the current macroblock * \param[in] bsize Current block size * \param[in] best_early_term Flag, indicating that TX for the * best inter mode was skipped * \param[in] ref_cost_intra Cost of signalling intra mode * \param[in] reuse_prediction Flag, indicating prediction re-use * \param[in] orig_dst Original destination buffer * \param[in] tmp_buffers Pointer to a temporary buffers for * prediction re-use * \param[out] this_mode_pred Pointer to store prediction buffer * for prediction re-use * \param[in] best_rdc Pointer to RD cost for the best * selected intra mode * \param[in] best_pickmode Pointer to a structure containing * best mode picked so far * \param[in] ctx Pointer to structure holding coding * contexts and modes for the block * * \remark Nothing is returned. 
Instead, calculated RD cost is placed to * \c best_rdc and best selected mode is placed to \c best_pickmode * */ void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int best_early_term, unsigned int ref_cost_intra, int reuse_prediction, struct buf_2d *orig_dst, PRED_BUFFER *tmp_buffers, PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc, BEST_PICKMODE *best_pickmode, PICK_MODE_CONTEXT *ctx, unsigned int *best_sad_norm) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; const unsigned char segment_id = mi->segment_id; const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize]; const bool is_screen_content = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; const CommonQuantParams *quant_params = &cm->quant_params; RD_STATS this_rdc; int intra_cost_penalty = av1_get_intra_cost_penalty( quant_params->base_qindex, quant_params->y_dc_delta_q, cm->seq_params->bit_depth); int64_t inter_mode_thresh = RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0); int perform_intra_pred = rt_sf->check_intra_pred_nonrd; int force_intra_check = 0; // For spatial enhancement layer: turn off intra prediction if the // previous spatial layer as golden ref is not chosen as best reference. // only do this for temporal enhancement layer and on non-key frames. if (cpi->svc.spatial_layer_id > 0 && best_pickmode->best_ref_frame != GOLDEN_FRAME && cpi->svc.temporal_layer_id > 0 && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) perform_intra_pred = 0; int do_early_exit_rdthresh = 1; uint32_t spatial_var_thresh = 50; int motion_thresh = 32; // Adjust thresholds to make intra mode likely tested if the other // references (golden, alt) are skipped/not checked. For now always // adjust for svc mode. if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 && rt_sf->nonrd_prune_ref_frame_search > 0)) { spatial_var_thresh = 150; motion_thresh = 0; } // Some adjustments to checking intra mode based on source variance. if (x->source_variance < spatial_var_thresh) { // If the best inter mode is large motion or non-LAST ref reduce intra cost // penalty, so intra mode is more likely tested. if (best_rdc->rdcost != INT64_MAX && (best_pickmode->best_ref_frame != LAST_FRAME || abs(mi->mv[0].as_mv.row) >= motion_thresh || abs(mi->mv[0].as_mv.col) >= motion_thresh)) { intra_cost_penalty = intra_cost_penalty >> 2; inter_mode_thresh = RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0); do_early_exit_rdthresh = 0; } if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) && x->content_state_sb.source_sad_nonrd >= kHighSad) || (is_screen_content && x->source_variance < 50 && ((bsize >= BLOCK_32X32 && x->content_state_sb.source_sad_nonrd != kZeroSad) || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))) force_intra_check = 1; // For big blocks worth checking intra (since only DC will be checked), // even if best_early_term is set. 
if (bsize >= BLOCK_32X32) best_early_term = 0; } else if (rt_sf->source_metrics_sb_nonrd && x->content_state_sb.source_sad_nonrd <= kLowSad) { perform_intra_pred = 0; } if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) { if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV) perform_intra_pred = 0; else if (rt_sf->skip_intra_pred == 2) perform_intra_pred = 0; } if (!(best_rdc->rdcost == INT64_MAX || force_intra_check || (perform_intra_pred && !best_early_term && bsize <= cpi->sf.part_sf.max_intra_bsize))) { return; } // Early exit based on RD cost calculated using known rate. When // is_screen_content is true, more bias is given to intra modes. Hence, // considered conservative threshold in early exit for the same. const int64_t known_rd = is_screen_content ? CALC_BIASED_RDCOST(inter_mode_thresh) : inter_mode_thresh; if (known_rd > best_rdc->rdcost) return; struct estimate_block_intra_args args; init_estimate_block_intra_args(&args, cpi, x); if (prune_palette_testing_inter(cpi, x->source_variance)) args.prune_palette_sad = true; TX_SIZE intra_tx_size = AOMMIN( AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]), TX_16X16); if (is_screen_content && cpi->rc.high_source_sad && x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16) intra_tx_size = TX_4X4; PRED_BUFFER *const best_pred = best_pickmode->best_pred; if (reuse_prediction && best_pred != NULL) { const int bh = block_size_high[bsize]; const int bw = block_size_wide[bsize]; if (best_pred->data == orig_dst->buf) { *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)]; aom_convolve_copy(best_pred->data, best_pred->stride, (*this_mode_pred)->data, (*this_mode_pred)->stride, bw, bh); best_pickmode->best_pred = *this_mode_pred; } } pd->dst = *orig_dst; for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) { const PREDICTION_MODE this_mode = intra_mode_list[midx]; const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; const int64_t mode_rd_thresh = rd_threshes[mode_index]; if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id, x->content_state_sb.source_sad_nonrd, x->color_sensitivity)) continue; if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { // For spatially flat blocks with zero motion only check // DC mode. if (x->content_state_sb.source_sad_nonrd == kZeroSad && x->source_variance == 0 && this_mode != DC_PRED) continue; // Only test Intra for big blocks if spatial_variance is small. else if (bsize > BLOCK_32X32 && x->source_variance > 50) continue; } if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh, rd_thresh_freq_fact[mode_index]) && (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) { continue; } const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); mi->mode = this_mode; mi->ref_frame[0] = INTRA_FRAME; mi->ref_frame[1] = NONE_FRAME; av1_invalid_rd_stats(&this_rdc); args.mode = this_mode; args.skippable = 1; args.rdc = &this_rdc; mi->tx_size = intra_tx_size; compute_intra_yprediction(cm, this_mode, bsize, x, xd); // Look into selecting tx_size here, based on prediction residual. 
av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size); // TODO(kyslov@) Need to account for skippable if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U, av1_estimate_block_intra, &args); } if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V, av1_estimate_block_intra, &args); } int mode_cost = 0; if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) { mode_cost += x->mode_costs.angle_delta_cost[this_mode - V_PRED] [MAX_ANGLE_DELTA + mi->angle_delta[PLANE_TYPE_Y]]; } if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { mode_cost += x->mode_costs.filter_intra_cost[bsize][0]; } this_rdc.rate += ref_cost_intra; this_rdc.rate += intra_cost_penalty; this_rdc.rate += mode_cost; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { // For blocks with low spatial variance and color sad, // favor the intra-modes, only on scene/slide change. if (cpi->rc.high_source_sad && x->source_variance < 800 && (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost); // Otherwise bias against intra for blocks with zero // motion and no color, on non-scene/slide changes. else if (!cpi->rc.high_source_sad && x->source_variance > 0 && x->content_state_sb.source_sad_nonrd == kZeroSad && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1; } if (this_rdc.rdcost < best_rdc->rdcost) { *best_rdc = this_rdc; best_pickmode->best_mode = this_mode; best_pickmode->best_tx_size = mi->tx_size; best_pickmode->best_ref_frame = INTRA_FRAME; best_pickmode->best_second_ref_frame = NONE; best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; if (!this_rdc.skip_txfm) memset(ctx->blk_skip, 0, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); } } if (best_pickmode->best_ref_frame == INTRA_FRAME) memset(ctx->blk_skip, 0, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); mi->tx_size = best_pickmode->best_tx_size; *best_sad_norm = args.best_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); } aom-3.12.1/av1/encoder/nonrd_opt.h000066400000000000000000000677601477627663500167020ustar00rootroot00000000000000/* * Copyright (c) 2022, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_NONRD_OPT_H_ #define AOM_AV1_ENCODER_NONRD_OPT_H_ #include "av1/encoder/context_tree.h" #include "av1/encoder/rdopt_utils.h" #include "av1/encoder/rdopt.h" #define RTC_INTER_MODES (4) #define RTC_INTRA_MODES (4) #define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES)) #define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3) #define NUM_COMP_INTER_MODES_RT (6) #define NUM_INTER_MODES 12 #define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false) #define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16) #define FILTER_SEARCH_SIZE 2 #if !CONFIG_REALTIME_ONLY #define MOTION_MODE_SEARCH_SIZE 2 #endif extern int g_pick_inter_mode_cnt; /*!\cond */ typedef struct { uint8_t *data; int stride; int in_use; } PRED_BUFFER; typedef struct { PRED_BUFFER *best_pred; PREDICTION_MODE best_mode; TX_SIZE best_tx_size; TX_TYPE tx_type; MV_REFERENCE_FRAME best_ref_frame; MV_REFERENCE_FRAME best_second_ref_frame; uint8_t best_mode_skip_txfm; uint8_t best_mode_initial_skip_flag; int_interpfilters best_pred_filter; MOTION_MODE best_motion_mode; WarpedMotionParams wm_params; int num_proj_ref; PALETTE_MODE_INFO pmi; int64_t best_sse; } BEST_PICKMODE; typedef struct { MV_REFERENCE_FRAME ref_frame; PREDICTION_MODE pred_mode; } REF_MODE; typedef struct { MV_REFERENCE_FRAME ref_frame[2]; PREDICTION_MODE pred_mode; } COMP_REF_MODE; struct estimate_block_intra_args { AV1_COMP *cpi; MACROBLOCK *x; PREDICTION_MODE mode; int skippable; RD_STATS *rdc; unsigned int best_sad; bool prune_mode_based_on_sad; bool prune_palette_sad; }; /*!\endcond */ /*!\brief Structure to store parameters and statistics used in non-rd inter mode * evaluation. */ typedef struct { //! Structure to hold best inter mode data BEST_PICKMODE best_pickmode; //! Structure to RD cost of current mode RD_STATS this_rdc; //! Pointer to the RD Cost for the best mode found so far RD_STATS best_rdc; //! Distortion of chroma planes for all modes and reference frames int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES]; //! Buffer to hold predicted block for all reference frames and planes struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; //! Array to hold variance of all modes and reference frames unsigned int vars[RTC_INTER_MODES][REF_FRAMES]; //! Array to hold ref cost of single reference mode for all ref frames unsigned int ref_costs_single[REF_FRAMES]; //! Array to hold motion vector for all modes and reference frames int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES]; //! Array to hold best mv for all modes and reference frames int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES]; //! Array to hold inter mode cost of single ref mode for all ref frames int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES]; //! Array to hold use reference frame mask for each reference frame int use_ref_frame_mask[REF_FRAMES]; //! Array to hold flags of evaluated modes for each reference frame uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES]; //! Array to hold flag indicating if scaled reference frame is used. 
bool use_scaled_ref_frame[REF_FRAMES]; } InterModeSearchStateNonrd; static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 }; static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5 }; static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, SMOOTH_PRED }; static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV, NEWMV }; static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = { { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH }, { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV }, { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 }, { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 }, { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG }, { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB }, { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 }, { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA }, }; // GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT // mode static const REF_MODE ref_mode_set[NUM_INTER_MODES] = { { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV }, { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV }, }; static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = { { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV }, { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV }, { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV }, { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV }, { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV }, { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV }, }; static const int_interpfilters filters_ref_set[9] = { [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH }, [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH }, [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR }, [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP }, [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP }, [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR }, [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP }, [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH } }; enum { // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV), INTER_NEAREST = (1 << NEARESTMV), INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV), INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV), INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV), }; // The original scan order (default_scan_8x8) is modified according to the extra // transpose in hadamard c implementation, i.e., aom_hadamard_lp_8x8_c and // aom_hadamard_8x8_c. DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = { 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 }; // The original scan order (av1_default_iscan_8x8) is modified to match // hadamard AVX2 implementation, i.e., aom_hadamard_lp_8x8_avx2 and // aom_hadamard_8x8_avx2. Since hadamard AVX2 implementation will modify the // order of coefficients, such that the normal scan order is no longer // guaranteed to scan low coefficients first, therefore we modify the scan order // accordingly. 
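// (Sanity-check example for the transposed scan tables in this file: scan and
// iscan are inverse permutations, i.e. iscan[scan[k]] == k for every k. For
// instance default_scan_8x8_transpose[1] == 8 and
// av1_default_iscan_8x8_transpose[8] == 1, so coefficient position 8 is the
// second one visited in scan order.)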
// Note that this one has to be used together with default_scan_8x8_transpose. DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8_transpose[64]) = { 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63 }; // The original scan order (default_scan_16x16) is modified according to the // extra transpose in hadamard c implementation in lp case, i.e., // aom_hadamard_lp_16x16_c. DECLARE_ALIGNED(16, static const int16_t, default_scan_lp_16x16_transpose[256]) = { 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32, 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50, 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1, 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100, 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152, 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27, 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194, 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49, 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204, 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53, 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214, 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115, 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143, 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252, 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217, 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221, 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247, 255 }; #if CONFIG_AV1_HIGHBITDEPTH // The original scan order (default_scan_16x16) is modified according to the // extra shift in hadamard c implementation in fp case, i.e., // aom_hadamard_16x16_c. Note that 16x16 lp and fp hadamard generate different // outputs, so we handle them separately. DECLARE_ALIGNED(16, static const int16_t, default_scan_fp_16x16_transpose[256]) = { 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32, 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50, 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1, 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104, 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148, 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23, 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194, 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49, 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204, 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57, 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218, 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115, 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143, 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252, 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213, 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221, 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251, 255 }; #endif // The original scan order (av1_default_iscan_16x16) is modified to match // hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2. 
// Since hadamard AVX2 implementation will modify the order of coefficients, // such that the normal scan order is no longer guaranteed to scan low // coefficients first, therefore we modify the scan order accordingly. Note that // this one has to be used together with default_scan_lp_16x16_transpose. DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_lp_16x16_transpose[256]) = { 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11, 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93, 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30, 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150, 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22, 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124, 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49, 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202, 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77, 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195, 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109, 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221, 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104, 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214, 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139, 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250, 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211, 255 }; #if CONFIG_AV1_HIGHBITDEPTH // The original scan order (av1_default_iscan_16x16) is modified to match // hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2. // Since hadamard AVX2 implementation will modify the order of coefficients, // such that the normal scan order is no longer guaranteed to scan low // coefficients first, therefore we modify the scan order accordingly. Note that // this one has to be used together with default_scan_fp_16x16_transpose. DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_fp_16x16_transpose[256]) = { 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11, 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93, 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30, 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152, 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35, 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145, 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59, 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200, 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56, 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182, 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99, 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221, 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104, 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214, 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139, 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250, 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211, 255 }; #endif // For entropy coding, IDTX shares the scan orders of the other 2D-transforms, // but the fastest way to calculate the IDTX transform (i.e. no transposes) // results in coefficients that are a transposition of the entropy coding // versions. 
These tables are used as substitute for the scan order for the // faster version of IDTX. // Must be used together with av1_fast_idtx_iscan_4x4 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; // Must be used together with av1_fast_idtx_scan_4x4 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = { av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4 }; // Must be used together with av1_fast_idtx_iscan_8x8 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; // Must be used together with av1_fast_idtx_scan_8x8 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = { 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 }; static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = { av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8 }; // Must be used together with av1_fast_idtx_iscan_16x16 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = { 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255 }; // Must be used together with av1_fast_idtx_scan_16x16 DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = { 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113, 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, 129, 142, 158, 171, 183, 196, 204, 217, 221, 
234, 55, 76, 80, 101, 109, 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, 255 }; // Indicates the blocks for which RD model should be based on special logic static inline int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const int large_block = bsize >= BLOCK_32X32; // Only enable for low bitdepth to mitigate issue: b/303023614. return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block && !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && cm->quant_params.base_qindex && !cpi->oxcf.use_highbitdepth; } /*!\brief Finds predicted motion vectors for a block. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Finds predicted motion vectors for a block from a certain reference frame. * First, it fills reference MV stack, then picks the test from the stack and * predicts the final MV for a block for each mode. * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the * data for the current macroblock * \param[in] ref_frame Reference frame for which to find * ref MVs * \param[out] frame_mv Predicted MVs for a block * \param[in] yv12_mb Buffer to hold predicted block * \param[in] bsize Current block size * \param[in] force_skip_low_temp_var Flag indicating possible mode search * prune for low temporal variance block * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference * frame is used. * * \remark Nothing is returned. Instead, predicted MVs are placed into * \c frame_mv array, and use_scaled_ref_frame is set. */ static inline void find_predictors( AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame); const bool ref_is_scaled = ref->y_crop_height != cm->height || ref->y_crop_width != cm->width; const YV12_BUFFER_CONFIG *scaled_ref = av1_get_scaled_ref_frame(cpi, ref_frame); const YV12_BUFFER_CONFIG *yv12 = ref_is_scaled && scaled_ref ? scaled_ref : ref; const int num_planes = av1_num_planes(cm); x->pred_mv_sad[ref_frame] = INT_MAX; x->pred_mv0_sad[ref_frame] = INT_MAX; x->pred_mv1_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; // TODO(kyslov) this needs various further optimizations. to be continued.. assert(yv12 != NULL); if (yv12 != NULL) { struct scale_factors *const sf = scaled_ref ? 
NULL : get_ref_scale_factors(cm, ref_frame); av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); av1_find_best_ref_mvs_from_stack( cm->features.allow_high_precision_mv, mbmi_ext, ref_frame, &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0); frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame]; // Early exit for non-LAST frame if force_skip_low_temp_var is set. if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv && !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) { av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, bsize); } } if (cm->features.switchable_motion_mode) { av1_count_overlappable_neighbors(cm, xd); } mbmi->num_proj_ref = 1; *use_scaled_ref_frame = ref_is_scaled && scaled_ref; } static inline void init_mbmi_nonrd(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode, MV_REFERENCE_FRAME ref_frame0, MV_REFERENCE_FRAME ref_frame1, const AV1_COMMON *cm) { PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; mbmi->ref_mv_idx = 0; mbmi->mode = pred_mode; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame0; mbmi->ref_frame[1] = ref_frame1; pmi->palette_size[PLANE_TYPE_Y] = 0; pmi->palette_size[PLANE_TYPE_UV] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->num_proj_ref = 1; mbmi->interintra_mode = 0; set_default_interp_filters(mbmi, cm->features.interp_filter); } static inline void init_estimate_block_intra_args( struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) { args->cpi = cpi; args->x = x; args->mode = DC_PRED; args->skippable = 1; args->rdc = 0; args->best_sad = UINT_MAX; args->prune_mode_based_on_sad = false; args->prune_palette_sad = false; } static inline int get_pred_buffer(PRED_BUFFER *p, int len) { for (int buf_idx = 0; buf_idx < len; buf_idx++) { if (!p[buf_idx].in_use) { p[buf_idx].in_use = 1; return buf_idx; } } return -1; } static inline bool prune_palette_testing_inter(AV1_COMP *cpi, unsigned int source_variance) { return (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && cpi->oxcf.speed >= 11 && cpi->rc.high_source_sad && cpi->sf.rt_sf.rc_compute_spatial_var_sc && cpi->rc.frame_spatial_variance < 1200 && cpi->rc.perc_spatial_flat_blocks < 5 && cpi->rc.percent_blocks_with_motion > 98 && source_variance < 4000); } static inline void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } #if CONFIG_INTERNAL_STATS static inline void store_coding_context_nonrd(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index) { #else static inline void store_coding_context_nonrd(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { #endif // CONFIG_INTERNAL_STATS MACROBLOCKD *const xd = &x->e_mbd; TxfmSearchInfo *txfm_info = &x->txfm_search_info; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->rd_stats.skip_txfm = txfm_info->skip_txfm; ctx->skippable = txfm_info->skip_txfm; #if CONFIG_INTERNAL_STATS ctx->best_mode_index = mode_index; #endif // CONFIG_INTERNAL_STATS ctx->mic = *xd->mi[0]; ctx->skippable = txfm_info->skip_txfm; av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, 
av1_ref_frame_type(xd->mi[0]->ref_frame)); } void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize, TX_SIZE tx_size); void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf, int pred_stride, RD_STATS *this_rdc, int *skippable, BLOCK_SIZE bsize, TX_SIZE tx_size); int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, MACROBLOCK *x, MACROBLOCKD *xd, RD_STATS *this_rdc, int start_plane, int stop_plane); void av1_estimate_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int best_early_term, unsigned int ref_cost_intra, int reuse_prediction, struct buf_2d *orig_dst, PRED_BUFFER *tmp_buffers, PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc, BEST_PICKMODE *best_pickmode, PICK_MODE_CONTEXT *ctx, unsigned int *best_sad_norm); #endif // AOM_AV1_ENCODER_NONRD_OPT_H_ aom-3.12.1/av1/encoder/nonrd_pickmode.c000066400000000000000000004574041477627663500176640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/intra_mode_search.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/nonrd_opt.h" #include "av1/encoder/palette.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/var_based_part.h" static inline int early_term_inter_search_with_sse(int early_term_idx, BLOCK_SIZE bsize, int64_t this_sse, int64_t best_sse, PREDICTION_MODE this_mode) { // Aggressiveness to terminate inter mode search early is adjusted based on // speed and block size. static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 }, { 0.6, 0.65, 0.85, 0.9 }, { 0.5, 0.5, 0.55, 0.6 }, { 0.6, 0.75, 0.85, 0.85 } }; static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3, 0.3 }; const int size_group = size_group_lookup[bsize]; assert(size_group < 4); assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES)); const double threshold = ((early_term_idx == EARLY_TERM_IDX_4) && (this_mode == NEWMV || this_mode == NEARESTMV)) ? early_term_thresh_newmv_nearestmv[size_group] : early_term_thresh[early_term_idx - 1][size_group]; // Terminate inter mode search early based on best sse so far. 
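// For example, with a size-group threshold of 0.65 the search stops once
// this_sse exceeds best_sse / 0.65, i.e. the candidate prediction error is
// already roughly 1.5x the best one seen so far.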
if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) { return 1; } return 0; } static inline void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_sse = INT64_MAX; bp->best_mode = NEARESTMV; bp->best_ref_frame = LAST_FRAME; bp->best_second_ref_frame = NONE_FRAME; bp->best_tx_size = TX_8X8; bp->tx_type = DCT_DCT; bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); bp->best_mode_skip_txfm = 0; bp->best_mode_initial_skip_flag = 0; bp->best_pred = NULL; bp->best_motion_mode = SIMPLE_TRANSLATION; bp->num_proj_ref = 0; av1_zero(bp->wm_params); av1_zero(bp->pmi); } // Copy best inter mode parameters to best_pickmode static inline void update_search_state_nonrd( InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi, TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx, PREDICTION_MODE this_best_mode, const int64_t sse_y) { BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; best_pickmode->best_sse = sse_y; best_pickmode->best_mode = this_best_mode; best_pickmode->best_motion_mode = mi->motion_mode; best_pickmode->wm_params = mi->wm_params; best_pickmode->num_proj_ref = mi->num_proj_ref; best_pickmode->best_pred_filter = mi->interp_filters; best_pickmode->best_tx_size = mi->tx_size; best_pickmode->best_ref_frame = mi->ref_frame[0]; best_pickmode->best_second_ref_frame = mi->ref_frame[1]; best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; best_pickmode->best_mode_initial_skip_flag = (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm); if (!best_pickmode->best_mode_skip_txfm) { memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); } } static inline int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *mv, MV ref_mv, FULLPEL_MV start_mv, bool fullpel_performed_well) { const int frame_lowmotion = cpi->rc.avg_frame_low_motion; const int reduce_mv_pel_precision_highmotion = cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion; // Reduce MV precision for higher int MV value & frame-level motion if (reduce_mv_pel_precision_highmotion >= 3) { int mv_thresh = 4; const int is_low_resoln = (cpi->common.width * cpi->common.height <= 320 * 240); mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6; if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12; mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh; if (abs(mv->as_fullmv.row) >= mv_thresh || abs(mv->as_fullmv.col) >= mv_thresh) return HALF_PEL; } else if (reduce_mv_pel_precision_highmotion >= 1) { int mv_thresh; const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } }; const int th_idx = reduce_mv_pel_precision_highmotion - 1; assert(th_idx >= 0 && th_idx < 2); if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12; else mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0] : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1] : th_vals[th_idx][2]; if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) || abs(mv->as_fullmv.col) >= (mv_thresh << 1)) return FULL_PEL; else if (abs(mv->as_fullmv.row) >= mv_thresh || abs(mv->as_fullmv.col) >= mv_thresh) return HALF_PEL; } // Reduce MV precision for relatively static (e.g. 
background), low-complex // large areas if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) { const int qband = x->qindex >> (QINDEX_BITS - 2); assert(qband < 4); if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad && bsize > BLOCK_16X16 && qband != 0) { if (x->source_variance < 500) return FULL_PEL; else if (x->source_variance < 5000) return HALF_PEL; } } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) { if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 && start_mv.row == 0 && start_mv.col == 0) return HALF_PEL; } return cpi->sf.mv_sf.subpel_force_stop; } static bool use_aggressive_subpel_search_method(MACROBLOCK *x, bool use_adaptive_subpel_search, bool fullpel_performed_well) { if (!use_adaptive_subpel_search) return false; const int qband = x->qindex >> (QINDEX_BITS - 2); assert(qband < 4); if ((qband > 0) && (fullpel_performed_well || (x->content_state_sb.source_sad_nonrd <= kLowSad) || (x->source_variance < 100))) return true; return false; } /*!\brief Runs Motion Estimation for a specific block and specific ref frame. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Finds the best Motion Vector by running Motion Estimation for a specific * block and a specific reference frame. Exits early if RDCost of Full Pel part * exceeds best RD Cost fund so far * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the * data for the current macroblock * \param[in] bsize Current block size * \param[in] tmp_mv Pointer to best found New MV * \param[in] rate_mv Pointer to Rate of the best new MV * \param[in] best_rd_sofar RD Cost of the best mode found so far * \param[in] use_base_mv Flag, indicating that tmp_mv holds * specific MV to start the search with * * \return Returns 0 if ME was terminated after Full Pel Search because too * high RD Cost. Otherwise returns 1. Best New MV is placed into \c tmp_mv. * Rate estimation for this vector is placed to \c rate_mv */ static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *tmp_mv, int *rate_mv, int64_t best_rd_sofar, int use_base_mv) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *sf = &cpi->sf; MB_MODE_INFO *mi = xd->mi[0]; int step_param = (sf->rt_sf.fullpel_search_step_param) ? sf->rt_sf.fullpel_search_step_param : cpi->mv_search_params.mv_step_param; FULLPEL_MV start_mv; const int ref = mi->ref_frame[0]; const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv; MV center_mv; int dis; int rv = 0; int cost_list[5]; int search_subpel = 1; start_mv = get_fullmv_from_mv(&ref_mv); if (!use_base_mv) center_mv = ref_mv; else center_mv = tmp_mv->as_mv; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); const search_site_config *src_search_sites = av1_get_search_site_config(cpi, x, search_method); FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; FULLPEL_MV_STATS best_mv_stats; av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, start_mv, src_search_sites, search_method, /*fine_search_interval=*/0); const unsigned int full_var_rd = av1_full_pixel_search( start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv, &best_mv_stats, NULL); // calculate the bit cost on motion vector MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv); *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); // TODO(kyslov) Account for Rate Mode! 
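// Full-pel search is abandoned when the rate cost of coding the new MV alone
// (with distortion taken as 0) already exceeds the best RD cost found so far;
// the subpel refinement below only runs when this gate passes.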
rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar); if (rv && search_subpel) { SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, cost_list); const bool fullpel_performed_well = (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) || (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) || (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127); if (sf->rt_sf.reduce_mv_pel_precision_highmotion || sf->rt_sf.reduce_mv_pel_precision_lowcomplex) ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv, start_mv, fullpel_performed_well); MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); // adaptively downgrade subpel search method based on block properties if (use_aggressive_subpel_search_method( x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well)) av1_find_best_sub_pixel_tree_pruned_more( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, &dis, &x->pred_sse[ref], NULL); else cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, &dis, &x->pred_sse[ref], NULL); *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); } // The final MV can not be equal to the reference MV as this will trigger an // assert later. This can happen if both NEAREST and NEAR modes were skipped. rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row); return rv; } /*!\brief Searches for the best New Motion Vector. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Finds the best Motion Vector by doing Motion Estimation. Uses reduced * complexity ME for non-LAST frames or calls \c combined_motion_search * for LAST reference frame * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the * data for the current macroblock * \param[in] frame_mv Array that holds MVs for all modes * and ref frames * \param[in] ref_frame Reference frame for which to find * the best New MVs * \param[in] gf_temporal_ref Flag, indicating temporal reference * for GOLDEN frame * \param[in] bsize Current block size * \param[in] mi_row Row index in 4x4 units * \param[in] mi_col Column index in 4x4 units * \param[in] rate_mv Pointer to Rate of the best new MV * \param[in] best_rdc Pointer to the RD Cost for the best * mode found so far * * \return Returns -1 if the search was not done, otherwise returns 0. 
* Best New MV is placed into \c frame_mv array, Rate estimation for this * vector is placed to \c rate_mv */ static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x, int_mv frame_mv[][REF_FRAMES], MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv, RD_STATS *best_rdc) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; AV1_COMMON *cm = &cpi->common; int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame]; unsigned int y_sad_zero; if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR && gf_temporal_ref) { int tmp_sad; int dis; if (bsize < BLOCK_16X16) return -1; int me_search_size_col = block_size_wide[bsize] >> 1; int me_search_size_row = block_size_high[bsize] >> 1; MV ref_mv = av1_get_ref_mv(x, 0).as_mv; tmp_sad = av1_int_pro_motion_estimation( cpi, x, bsize, mi_row, mi_col, &ref_mv, &y_sad_zero, me_search_size_col, me_search_size_row); if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; this_ref_frm_newmv->as_int = mi->mv[0].as_int; int_mv best_mv = mi->mv[0]; best_mv.as_mv.row >>= 3; best_mv.as_mv.col >>= 3; this_ref_frm_newmv->as_mv.row >>= 3; this_ref_frm_newmv->as_mv.col >>= 3; SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL); if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion || cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) { FULLPEL_MV start_mv = { .row = 0, .col = 0 }; ms_params.forced_stop = subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false); } MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &x->pred_sse[ref_frame], NULL); this_ref_frm_newmv->as_int = best_mv.as_int; // When NEWMV is same as ref_mv from the drl, it is preferred to code the // MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to // avoid an assert failure at a later stage. The scenario can occur if // NEARESTMV was not evaluated for ALTREF. 
if (this_ref_frm_newmv->as_mv.col == ref_mv.col && this_ref_frm_newmv->as_mv.row == ref_mv.row) return -1; *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame], rate_mv, best_rdc->rdcost, 0)) { return -1; } return 0; } static void estimate_single_ref_frame_costs(const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs, int segment_id, BLOCK_SIZE bsize, unsigned int *ref_costs_single) { int seg_ref_active = segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); } else { int intra_inter_ctx = av1_get_intra_inter_context(xd); ref_costs_single[INTRA_FRAME] = mode_costs->intra_inter_cost[intra_inter_ctx][0]; unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1]; if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && is_comp_ref_allowed(bsize)) { const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1]; } ref_costs_single[LAST_FRAME] = base_cost; ref_costs_single[GOLDEN_FRAME] = base_cost; ref_costs_single[ALTREF_FRAME] = base_cost; // add cost for last, golden, altref ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0]; ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1]; ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0]; ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1]; ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0]; } } static inline void set_force_skip_flag(const AV1_COMP *const cpi, MACROBLOCK *const x, unsigned int sse, int *force_skip) { if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT && cpi->sf.rt_sf.tx_size_level_based_on_qstep && cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) { const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5); const unsigned int qstep_sq = qstep * qstep; // If the sse is low for low source variance blocks, mark those as // transform skip. // Note: Though qstep_sq is based on ac qstep, the threshold is kept // low so that reliable early estimate of tx skip can be obtained // through its comparison with sse. if (sse < qstep_sq && x->source_variance < qstep_sq && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) *force_skip = 1; } } #define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false) #define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16) static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *const x, unsigned int var, unsigned int sse, int *force_skip) { MACROBLOCKD *const xd = &x->e_mbd; TX_SIZE tx_size; const TxfmSearchParams *txfm_params = &x->txfm_search_params; if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) { int multiplier = 8; unsigned int var_thresh = 0; unsigned int is_high_var = 1; // Use quantizer based thresholds to determine transform size. 
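// The qindex range is split into four bands; higher-qindex bands use a
// smaller multiplier so the larger transform size is chosen more readily,
// and var_thresh is derived from the squared ac quantizer step.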
if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) { const int qband = x->qindex >> (QINDEX_BITS - 2); const int mult[4] = { 8, 7, 6, 5 }; assert(qband < 4); multiplier = mult[qband]; const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5); const unsigned int qstep_sq = qstep * qstep; var_thresh = qstep_sq * 2; if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) { // If the sse is low for low source variance blocks, mark those as // transform skip. // Note: Though qstep_sq is based on ac qstep, the threshold is kept // low so that reliable early estimate of tx skip can be obtained // through its comparison with sse. if (sse < qstep_sq && x->source_variance < qstep_sq && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) *force_skip = 1; // Further lower transform size based on aq mode only if residual // variance is high. is_high_var = (var >= var_thresh); } } // Choose larger transform size for blocks where dc component is dominant or // the ac component is low. if (sse > ((var * multiplier) >> 2) || (var < var_thresh)) tx_size = AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); else tx_size = TX_8X8; if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var) tx_size = TX_8X8; else if (tx_size > TX_16X16) tx_size = TX_16X16; } else { tx_size = AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); } if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) tx_size = TX_SIZE_FOR_BSIZE_GT32; return AOMMIN(tx_size, TX_16X16); } static void block_variance(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, int block_size, uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { int k = 0; *sse = 0; *sum = 0; // This function is called for block sizes >= BLOCK_32x32. As per the design // the aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (in a 8x32) // per call. Hence the width and height of the block need to be at least 8 and // 32 samples respectively. assert(w >= 32); assert(h >= 8); for (int row = 0; row < h; row += block_size) { for (int col = 0; col < w; col += 32) { aom_get_var_sse_sum_8x8_quad(src + src_stride * row + col, src_stride, ref + ref_stride * row + col, ref_stride, &sse8x8[k], &sum8x8[k], sse, sum, &var8x8[k]); k += 4; } } } static void block_variance_16x16_dual(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, int block_size, uint32_t *sse16x16, uint32_t *var16x16) { int k = 0; *sse = 0; *sum = 0; // This function is called for block sizes >= BLOCK_32x32. As per the design // the aom_get_var_sse_sum_16x16_dual() processes four 16x16 blocks (in a // 16x32) per call. Hence the width and height of the block need to be at // least 16 and 32 samples respectively. 
assert(w >= 32); assert(h >= 16); for (int row = 0; row < h; row += block_size) { for (int col = 0; col < w; col += 32) { aom_get_var_sse_sum_16x16_dual(src + src_stride * row + col, src_stride, ref + ref_stride * row + col, ref_stride, &sse16x16[k], sse, sum, &var16x16[k]); k += 2; } } } static void calculate_variance(int bw, int bh, TX_SIZE tx_size, unsigned int *sse_i, int *sum_i, unsigned int *var_o, unsigned int *sse_o, int *sum_o) { const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); int row, col, k = 0; for (row = 0; row < nh; row += 2) { for (col = 0; col < nw; col += 2) { sse_o[k] = sse_i[row * nw + col] + sse_i[row * nw + col + 1] + sse_i[(row + 1) * nw + col] + sse_i[(row + 1) * nw + col + 1]; sum_o[k] = sum_i[row * nw + col] + sum_i[row * nw + col + 1] + sum_i[(row + 1) * nw + col] + sum_i[(row + 1) * nw + col + 1]; var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> (b_width_log2_lookup[unit_size] + b_height_log2_lookup[unit_size] + 6)); k++; } } } // Adjust the ac_thr according to speed, width, height and normalized sum static int ac_thr_factor(int speed, int width, int height, int norm_sum) { if (speed >= 8 && norm_sum < 5) { if (width <= 640 && height <= 480) return 4; else return 2; } return 1; } // Sets early_term flag based on chroma planes prediction static inline void set_early_term_based_on_uv_plane( AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row, int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx, const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) { AV1_COMMON *const cm = &cpi->common; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; const uint32_t dc_quant = p->dequant_QTX[0]; const uint32_t ac_quant = p->dequant_QTX[1]; int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; const int bw = b_width_log2_lookup[bsize]; const int bh = b_height_log2_lookup[bsize]; int ac_test = 1; int dc_test = 1; const int norm_sum = abs(sum) >> (bw + bh); #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->oxcf.speed > 5) ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, norm_sum, cpi->svc.temporal_layer_id); else ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); #else ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); #endif if (cpi->sf.rt_sf.increase_source_sad_thresh) { dc_thr = dc_thr << 1; ac_thr = ac_thr << 2; } for (int k = 0; k < num_blk; k++) { // Check if all ac coefficients can be quantized to zero. if (!(var_tx[k] < ac_thr || var == 0)) { ac_test = 0; break; } // Check if dc coefficient can be quantized to zero. if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { dc_test = 0; break; } } // Check if chroma can be skipped based on ac and dc test flags. if (ac_test && dc_test) { int skip_uv[2] = { 0 }; unsigned int var_uv[2]; unsigned int sse_uv[2]; // Transform skipping test in UV planes. for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) { int j = plane - 1; skip_uv[j] = 1; if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) { skip_uv[j] = 0; struct macroblock_plane *const puv = &x->plane[plane]; struct macroblockd_plane *const puvd = &xd->plane[plane]; const BLOCK_SIZE uv_bsize = get_plane_block_size( bsize, puvd->subsampling_x, puvd->subsampling_y); // Adjust these thresholds for UV. 
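// The chroma skip thresholds are the squared dc/ac dequant values scaled
// down by shift_dc/shift_ac; the larger shifts used with
// increase_source_sad_thresh make the chroma skip test stricter.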
const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3; const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3; const int64_t uv_dc_thr = (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc; const int64_t uv_ac_thr = (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, plane, plane); var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride, &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) skip_uv[j] = 1; else break; } } if (skip_uv[0] & skip_uv[1]) { *early_term = 1; } } } static inline void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, int calculate_rd, int *early_term, BLOCK_SIZE bsize, unsigned int sse) { if (calculate_rd) { if (!*early_term) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh, &rd_stats->rate, &rd_stats->dist); } if (*early_term) { rd_stats->rate = 0; rd_stats->dist = sse << 4; } } } static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, MACROBLOCK *x, MACROBLOCKD *xd, RD_STATS *rd_stats, int *early_term, int calculate_rd, int64_t best_sse, unsigned int *var_output, unsigned int var_prune_threshold) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. unsigned int sse; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; int test_skip = 1; unsigned int var; int sum; const int bw = b_width_log2_lookup[bsize]; const int bh = b_height_log2_lookup[bsize]; unsigned int sse16x16[64] = { 0 }; unsigned int var16x16[64] = { 0 }; assert(xd->mi[0]->tx_size == TX_16X16); assert(bsize > BLOCK_32X32); // Calculate variance for whole partition, and also save 16x16 blocks' // variance to be used in following transform skipping test. block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16, sse16x16, var16x16); var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); if (var_output) { *var_output = var; if (*var_output > var_prune_threshold) { return; } } rd_stats->sse = sse; // Skipping test *early_term = 0; set_force_skip_flag(cpi, x, sse, early_term); // The code below for setting skip flag assumes transform size of at least // 8x8, so force this lower limit on transform. MB_MODE_INFO *const mi = xd->mi[0]; if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && early_term_inter_search_with_sse( cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, mi->mode)) test_skip = 0; if (*early_term) test_skip = 0; // Evaluate if the partition block is a skippable block in Y plane. 
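// The 16x16 per-block sse/variance computed above are reused directly;
// (1 << (bw + bh - 2)) >> 2 is the number of 16x16 sub-blocks in the
// partition.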
if (test_skip) { const unsigned int *sse_tx = sse16x16; const unsigned int *var_tx = var16x16; const unsigned int num_block = (1 << (bw + bh - 2)) >> 2; set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, early_term, num_block, sse_tx, var_tx, sum, var, sse); } calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, sse); } static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, MACROBLOCK *x, MACROBLOCKD *xd, RD_STATS *rd_stats, int *early_term, int calculate_rd, int64_t best_sse, unsigned int *var_output, unsigned int var_prune_threshold) { if (x->force_zeromv_skip_for_blk) { *early_term = 1; rd_stats->rate = 0; rd_stats->dist = 0; rd_stats->sse = 0; return; } // For block sizes greater than 32x32, the transform size is always 16x16. // This function avoids calling calculate_variance() for tx_size 16x16 cases // by directly populating variance at tx_size level from // block_variance_16x16_dual() function. const TxfmSearchParams *txfm_params = &x->txfm_search_params; if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) { xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32; model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats, early_term, calculate_rd, best_sse, var_output, var_prune_threshold); return; } // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. unsigned int sse; struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; int test_skip = 1; unsigned int var; int sum; const int bw = b_width_log2_lookup[bsize]; const int bh = b_height_log2_lookup[bsize]; unsigned int sse8x8[256] = { 0 }; int sum8x8[256] = { 0 }; unsigned int var8x8[256] = { 0 }; TX_SIZE tx_size; // Calculate variance for whole partition, and also save 8x8 blocks' variance // to be used in following transform skipping test. block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8); var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); if (var_output) { *var_output = var; if (*var_output > var_prune_threshold) { return; } } rd_stats->sse = sse; // Skipping test *early_term = 0; tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term); assert(tx_size <= TX_16X16); // The code below for setting skip flag assumes transform size of at least // 8x8, so force this lower limit on transform. if (tx_size < TX_8X8) tx_size = TX_8X8; xd->mi[0]->tx_size = tx_size; MB_MODE_INFO *const mi = xd->mi[0]; if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && early_term_inter_search_with_sse( cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, mi->mode)) test_skip = 0; if (*early_term) test_skip = 0; // Evaluate if the partition block is a skippable block in Y plane. 
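// The skip test works at transform-block granularity: the 8x8 sse/variance
// values are aggregated into 16x16 units when the selected transform size
// is 16x16.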
if (test_skip) { unsigned int sse16x16[64] = { 0 }; int sum16x16[64] = { 0 }; unsigned int var16x16[64] = { 0 }; const unsigned int *sse_tx = sse8x8; const unsigned int *var_tx = var8x8; unsigned int num_blks = 1 << (bw + bh - 2); if (tx_size >= TX_16X16) { calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, sum16x16); sse_tx = sse16x16; var_tx = var16x16; num_blks = num_blks >> 2; } set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, early_term, num_blks, sse_tx, var_tx, sum, var, sse); } calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, sse); } static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, RD_STATS *rd_stats, unsigned int *var_out, int calculate_rd, int *early_term) { if (x->force_zeromv_skip_for_blk && early_term != NULL) { *early_term = 1; rd_stats->rate = 0; rd_stats->dist = 0; rd_stats->sse = 0; } // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. const int ref = xd->mi[0]->ref_frame[0]; assert(bsize < BLOCK_SIZES_ALL); struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; unsigned int sse; int rate; int64_t dist; unsigned int var = cpi->ppi->fn_ptr[bsize].vf( p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); int force_skip = 0; xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip); if (var_out) { *var_out = var; } if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) { const int bwide = block_size_wide[bsize]; const int bhigh = block_size_high[bsize]; model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate, &dist); } else { rate = INT_MAX; // this will be overwritten later with av1_block_yrd dist = INT_MAX; } rd_stats->sse = sse; x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); if (force_skip && ref > INTRA_FRAME) { rate = 0; dist = (int64_t)sse << 4; } assert(rate >= 0); rd_stats->skip_txfm = (rate == 0); rate = AOMMIN(rate, INT_MAX); rd_stats->rate = rate; rd_stats->dist = dist; } static inline int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx, const MB_MODE_INFO_EXT *mbmi_ext, const int (*const drl_mode_cost0)[2], int8_t ref_frame_type) { int cost = 0; if (this_mode == NEWMV || this_mode == NEW_NEWMV) { for (int idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx]; if (ref_mv_idx == idx) return cost; } } return cost; } if (have_nearmv_in_inter_mode(this_mode)) { for (int idx = 1; idx < 3; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)]; if (ref_mv_idx == (idx - 1)) return cost; } } return cost; } return cost; } static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, int16_t mode_context) { if (is_inter_compound_mode(mode)) { return mode_costs ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; } int mode_cost = 0; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; assert(is_inter_mode(mode)); if (mode == NEWMV) { mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; return mode_cost; } else { mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; 
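// Not NEWMV: pay the "not new" bit, then continue down the cascade of
// binary decisions (GLOBALMV vs. ref-mv modes, then NEARESTMV vs. NEARMV),
// each with its own context extracted from mode_context.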
mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; if (mode == GLOBALMV) { mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; return mode_cost; } else { mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; return mode_cost; } } } static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row, int mv_col, int speed, uint32_t spatial_variance, CONTENT_STATE_SB content_state_sb) { // Bias against MVs associated with NEWMV mode that are very different from // top/left neighbors. if (this_mode == NEWMV) { int al_mv_average_row; int al_mv_average_col; int row_diff, col_diff; int above_mv_valid = 0; int left_mv_valid = 0; int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL; int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL; if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad && spatial_variance < 300 && (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) { this_rdc->rdcost = this_rdc->rdcost << 2; return; } if (xd->above_mbmi) { above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV; above_row = xd->above_mbmi->mv[0].as_mv.row; above_col = xd->above_mbmi->mv[0].as_mv.col; } if (xd->left_mbmi) { left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV; left_row = xd->left_mbmi->mv[0].as_mv.row; left_col = xd->left_mbmi->mv[0].as_mv.col; } if (above_mv_valid && left_mv_valid) { al_mv_average_row = (above_row + left_row + 1) >> 1; al_mv_average_col = (above_col + left_col + 1) >> 1; } else if (above_mv_valid) { al_mv_average_row = above_row; al_mv_average_col = above_col; } else if (left_mv_valid) { al_mv_average_row = left_row; al_mv_average_col = left_col; } else { al_mv_average_row = al_mv_average_col = 0; } row_diff = al_mv_average_row - mv_row; col_diff = al_mv_average_col - mv_col; if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) { if (bsize >= BLOCK_32X32) this_rdc->rdcost = this_rdc->rdcost << 1; else this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; } } else { // Bias for speed >= 8 for low spatial variance. 
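// For non-NEWMV modes the penalty is milder: rdcost is inflated by 25%
// (5 * rdcost >> 2) when the MV is large on flat, low-variance content.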
if (speed >= 8 && spatial_variance < 150 && (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64)) this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; } } static inline void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4); const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128); for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) { int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx]; if (thr_mode_idx == best_mode_idx) { *freq_fact -= (*freq_fact >> 4); } else { *freq_fact = AOMMIN(*freq_fact + RD_THRESH_INC, cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); } } } #if CONFIG_AV1_TEMPORAL_DENOISING static void av1_pickmode_ctx_den_update( AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, unsigned int ref_frame_cost[REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred, BEST_PICKMODE *bp) { ctx_den->zero_last_cost_orig = zero_last_cost_orig; ctx_den->ref_frame_cost = ref_frame_cost; ctx_den->frame_mv = frame_mv; ctx_den->reuse_inter_pred = reuse_inter_pred; ctx_den->best_tx_size = bp->best_tx_size; ctx_den->best_mode = bp->best_mode; ctx_den->best_ref_frame = bp->best_ref_frame; ctx_den->best_pred_filter = bp->best_pred_filter; ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; } static void recheck_zeromv_after_denoising( AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd, AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den, struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc, BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) { // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on // denoised result. Only do this under noise conditions, and if rdcost of // ZEROMV on original source is not significantly higher than rdcost of best // mode. if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow && ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || (ctx_den->best_ref_frame == GOLDEN_FRAME && cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. AV1_COMMON *const cm = &cpi->common; RD_STATS this_rdc; const ModeCosts *mode_costs = &x->mode_costs; TxfmSearchInfo *txfm_info = &x->txfm_search_info; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; mi->mode = GLOBALMV; mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE_FRAME; set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME); mi->mv[0].as_int = 0; mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y]; av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); unsigned int var; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL); const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx); this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME]; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); txfm_info->skip_txfm = this_rdc.skip_txfm; // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source // is higher than best_ref mode (on original source). 
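// If the re-check loses, the previously best mode info (mode, ref frame,
// filter, MV, tx size) is restored from best_pickmode; otherwise
// GLOBALMV/LAST_FRAME on the denoised signal becomes the winner.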
if (this_rdc.rdcost > best_rdc->rdcost) { this_rdc = *best_rdc; mi->mode = best_pickmode->best_mode; mi->ref_frame[0] = best_pickmode->best_ref_frame; set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME); mi->interp_filters = best_pickmode->best_pred_filter; if (best_pickmode->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; } else { mi->mv[0].as_int = ctx_den ->frame_mv[best_pickmode->best_mode] [best_pickmode->best_ref_frame] .as_int; if (ctx_den->reuse_inter_pred) { xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y]; av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); } } mi->tx_size = best_pickmode->best_tx_size; txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm; } else { ctx_den->best_ref_frame = LAST_FRAME; *best_rdc = this_rdc; } } } #endif // CONFIG_AV1_TEMPORAL_DENOISING /*!\brief Searches for the best interpolation filter * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR, * EIGTHTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects * the one that gives lowest RD cost. RD cost is calculated using curvfit model. * Support for dual filters (different filters in the x & y directions) is * allowed if sf.interp_sf.disable_dual_filter = 0. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the * data for the current macroblock * \param[in] this_rdc Pointer to calculated RD Cost * \param[in] inter_pred_params_sr Pointer to structure holding parameters of inter prediction for single reference * \param[in] mi_row Row index in 4x4 units * \param[in] mi_col Column index in 4x4 units * \param[in] tmp_buffer Pointer to a temporary buffer for * prediction re-use * \param[in] bsize Current block size * \param[in] reuse_inter_pred Flag, indicating prediction re-use * \param[out] this_mode_pred Pointer to store prediction buffer * for prediction re-use * \param[out] this_early_term Flag, indicating that transform can be * skipped * \param[out] var The residue variance of the current * predictor. * \param[in] use_model_yrd_large Flag, indicating special logic to handle * large blocks * \param[in] best_sse Best sse so far. * \param[in] is_single_pred Flag, indicating single mode. * * \remark Nothing is returned. Instead, calculated RD cost is placed to * \c this_rdc and best filter is placed to \c mi->interp_filters. In case * \c reuse_inter_pred flag is set, this function also outputs * \c this_mode_pred. Also \c this_early_temp is set if transform can be * skipped */ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, InterPredParams *inter_pred_params_sr, int mi_row, int mi_col, PRED_BUFFER *tmp_buffer, BLOCK_SIZE bsize, int reuse_inter_pred, PRED_BUFFER **this_mode_pred, int *this_early_term, unsigned int *var, int use_model_yrd_large, int64_t best_sse, int is_single_pred) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; MB_MODE_INFO *const mi = xd->mi[0]; const int bw = block_size_wide[bsize]; int dim_factor = (cpi->sf.interp_sf.disable_dual_filter == 0) ? 
FILTER_SEARCH_SIZE : 1; RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 }; TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 }; PRED_BUFFER *current_pred = *this_mode_pred; int best_skip = 0; int best_early_term = 0; int64_t best_cost = INT64_MAX; int best_filter_index = -1; SubpelParams subpel_params; // Initialize inter prediction params at mode level for single reference // mode. if (is_single_pred) init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr, &subpel_params, xd->block_ref_scale_factors[0], pd->pre->width, pd->pre->height); for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; ++filter_idx) { int64_t cost; if (cpi->sf.interp_sf.disable_dual_filter && filters_ref_set[filter_idx].as_filters.x_filter != filters_ref_set[filter_idx].as_filters.y_filter) continue; mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int; if (is_single_pred) av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr, &subpel_params); else av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); unsigned int curr_var = UINT_MAX; if (use_model_yrd_large) model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rd_stats[filter_idx], this_early_term, 1, best_sse, &curr_var, UINT_MAX); else model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var, 1, NULL); pf_rd_stats[filter_idx].rate += av1_get_switchable_rate( x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter); cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate, pf_rd_stats[filter_idx].dist); pf_tx_size[filter_idx] = mi->tx_size; if (cost < best_cost) { *var = curr_var; best_filter_index = filter_idx; best_cost = cost; best_skip = pf_rd_stats[filter_idx].skip_txfm; best_early_term = *this_early_term; if (reuse_inter_pred) { if (*this_mode_pred != current_pred) { free_pred_buffer(*this_mode_pred); *this_mode_pred = current_pred; } current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; pd->dst.buf = current_pred->data; pd->dst.stride = bw; } } } assert(best_filter_index >= 0 && best_filter_index < dim_factor * FILTER_SEARCH_SIZE); if (reuse_inter_pred && *this_mode_pred != current_pred) free_pred_buffer(current_pred); mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int; mi->tx_size = pf_tx_size[best_filter_index]; this_rdc->rate = pf_rd_stats[best_filter_index].rate; this_rdc->dist = pf_rd_stats[best_filter_index].dist; this_rdc->sse = pf_rd_stats[best_filter_index].sse; this_rdc->skip_txfm = (best_skip || best_early_term); *this_early_term = best_early_term; if (reuse_inter_pred) { pd->dst.buf = (*this_mode_pred)->data; pd->dst.stride = (*this_mode_pred)->stride; } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) { if (is_single_pred) av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr, &subpel_params); else av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } } #if !CONFIG_REALTIME_ONLY static inline int is_warped_mode_allowed(const AV1_COMP *cpi, MACROBLOCK *const x, const MB_MODE_INFO *mbmi) { const FeatureFlags *const features = &cpi->common.features; const MACROBLOCKD *xd = &x->e_mbd; if (cpi->sf.inter_sf.extra_prune_warped) return 0; if (has_second_ref(mbmi)) return 0; MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; if (features->switchable_motion_mode) { // Determine which motion modes to search if more than SIMPLE_TRANSLATION // is allowed. 
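// Warped motion is considered here only for single-reference blocks and
// only when motion_mode_allowed() reports WARPED_CAUSAL for this block.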
last_motion_mode_allowed = motion_mode_allowed( xd->global_motion, xd, mbmi, features->allow_warped_motion); } if (last_motion_mode_allowed == WARPED_CAUSAL) { return 1; } return 0; } static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const FeatureFlags *const features = &cm->features; mi->num_proj_ref = 1; WARP_SAMPLE_INFO *const warp_sample_info = &x->warp_sample_info[mi->ref_frame[0]]; int *pts0 = warp_sample_info->pts; int *pts_inref0 = warp_sample_info->pts_inref; MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; if (features->switchable_motion_mode) { // Determine which motion modes to search if more than SIMPLE_TRANSLATION // is allowed. last_motion_mode_allowed = motion_mode_allowed( xd->global_motion, xd, mi, features->allow_warped_motion); } if (last_motion_mode_allowed == WARPED_CAUSAL) { if (warp_sample_info->num < 0) { warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0); } mi->num_proj_ref = warp_sample_info->num; } } static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, int mi_row, int mi_col, BLOCK_SIZE bsize, int *this_early_term, int use_model_yrd_large, int *rate_mv, int64_t best_sse) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const FeatureFlags *const features = &cm->features; MB_MODE_INFO *const mi = xd->mi[0]; RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 }; int best_skip = 0; int best_early_term = 0; int64_t best_cost = INT64_MAX; int best_mode_index = -1; const int interp_filter = features->interp_filter; const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = { SIMPLE_TRANSLATION, WARPED_CAUSAL }; int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1; WARP_SAMPLE_INFO *const warp_sample_info = &x->warp_sample_info[mi->ref_frame[0]]; int *pts0 = warp_sample_info->pts; int *pts_inref0 = warp_sample_info->pts_inref; const int total_samples = mi->num_proj_ref; if (total_samples == 0) { // Do not search WARPED_CAUSAL if there are no samples to use to determine // warped parameters. 
mode_search_size = 1; } const MB_MODE_INFO base_mbmi = *mi; MB_MODE_INFO best_mbmi; for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) { int64_t cost = INT64_MAX; MOTION_MODE motion_mode = motion_modes[mode_index]; *mi = base_mbmi; mi->motion_mode = motion_mode; if (motion_mode == SIMPLE_TRANSLATION) { mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); if (use_model_yrd_large) model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rd_stats[mode_index], this_early_term, 1, best_sse, NULL, UINT_MAX); else model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1, NULL); pf_rd_stats[mode_index].rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter); cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate, pf_rd_stats[mode_index].dist); } else if (motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; const ModeCosts *mode_costs = &x->mode_costs; mi->wm_params.wmtype = DEFAULT_WMTYPE; mi->interp_filters = av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); // Select the samples according to motion vector difference if (mi->num_proj_ref > 1) { mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref, mi->num_proj_ref, bsize); } // Compute the warped motion parameters with a least squares fit // using the collected samples if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize, mi->mv[0].as_mv.row, mi->mv[0].as_mv.col, &mi->wm_params, mi_row, mi_col)) { if (mi->mode == NEWMV) { const int_mv mv0 = mi->mv[0]; const WarpedMotionParams wm_params0 = mi->wm_params; const int num_proj_ref0 = mi->num_proj_ref; const int_mv ref_mv = av1_get_ref_mv(x, 0); SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, NULL); // Refine MV in a small range. av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, total_samples, cpi->sf.mv_sf.warp_search_method, cpi->sf.mv_sf.warp_search_iters); if (mi->mv[0].as_int == ref_mv.as_int) { continue; } if (mv0.as_int != mi->mv[0].as_int) { // Keep the refined MV and WM parameters. int tmp_rate_mv = av1_mv_bit_cost( &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); *rate_mv = tmp_rate_mv; } else { // Restore the old MV and WM parameters. 
mi->mv[0] = mv0; mi->wm_params = wm_params0; mi->num_proj_ref = num_proj_ref0; } } // Build the warped predictor av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, av1_num_planes(cm) - 1); if (use_model_yrd_large) model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rd_stats[mode_index], this_early_term, 1, best_sse, NULL, UINT_MAX); else model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1, NULL); pf_rd_stats[mode_index].rate += mode_costs->motion_mode_cost[bsize][mi->motion_mode]; cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate, pf_rd_stats[mode_index].dist); } else { cost = INT64_MAX; } } if (cost < best_cost) { best_mode_index = mode_index; best_cost = cost; best_skip = pf_rd_stats[mode_index].skip_txfm; best_early_term = *this_early_term; best_mbmi = *mi; } } assert(best_mode_index >= 0 && best_mode_index < FILTER_SEARCH_SIZE); *mi = best_mbmi; this_rdc->rate = pf_rd_stats[best_mode_index].rate; this_rdc->dist = pf_rd_stats[best_mode_index].dist; this_rdc->sse = pf_rd_stats[best_mode_index].sse; this_rdc->skip_txfm = (best_skip || best_early_term); *this_early_term = best_early_term; if (best_mode_index < FILTER_SEARCH_SIZE - 1) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } } #endif // !CONFIG_REALTIME_ONLY #define COLLECT_NON_SQR_STAT 0 #if COLLECT_NONRD_PICK_MODE_STAT static inline void print_stage_time(const char *stage_name, int64_t stage_time, int64_t total_time) { printf(" %s: %ld (%f%%)\n", stage_name, stage_time, 100 * stage_time / (float)total_time); } static void print_time(const mode_search_stat_nonrd *const ms_stat, BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row, int mi_col) { if ((mi_row + mi_size_high[bsize] >= mi_rows) && (mi_col + mi_size_wide[bsize] >= mi_cols)) { int64_t total_time = 0l; int32_t total_blocks = 0; for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) { total_time += ms_stat->total_block_times[bs]; total_blocks += ms_stat->num_blocks[bs]; } printf("\n"); for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) { if (ms_stat->num_blocks[bs] == 0) { continue; } if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) { continue; } printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n", block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs], ms_stat->total_block_times[bs], 100 * ms_stat->total_block_times[bs] / (float)total_time, (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]); for (int j = 0; j < MB_MODE_COUNT; j++) { if (ms_stat->nonskipped_search_times[bs][j] == 0) { continue; } int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j]; printf(" Mode %d, %d/%d tps %f\n", j, ms_stat->num_nonskipped_searches[bs][j], ms_stat->num_searches[bs][j], ms_stat->num_nonskipped_searches[bs][j] > 0 ? (float)ms_stat->nonskipped_search_times[bs][j] / ms_stat->num_nonskipped_searches[bs][j] : 0l); if (j >= INTER_MODE_START) { total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] + ms_stat->model_rd_time[bs][j] + ms_stat->txfm_time[bs][j]; print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j], total_time); print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j], total_time); print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j], total_time); print_stage_time("Tranfm Search Time", ms_stat->txfm_time[bs][j], total_time); } print_stage_time("Total Mode Time", total_mode_time, total_time); } printf("\n"); } printf("Total time = %ld. 
Total blocks = %d\n", total_time, total_blocks); } } #endif // COLLECT_NONRD_PICK_MODE_STAT static bool should_prune_intra_modes_using_neighbors( const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors, PREDICTION_MODE this_mode, PREDICTION_MODE above_mode, PREDICTION_MODE left_mode) { if (!enable_intra_mode_pruning_using_neighbors) return false; // Avoid pruning of DC_PRED as it is the most probable mode to win as per the // statistics generated for nonrd intra mode evaluations. if (this_mode == DC_PRED) return false; // Enable the pruning for current mode only if it is not the winner mode of // both the neighboring blocks (left/top). return xd->up_available && this_mode != above_mode && xd->left_available && this_mode != left_mode; } void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; RD_STATS this_rdc, best_rdc; struct estimate_block_intra_args args; init_estimate_block_intra_args(&args, cpi, x); const TxfmSearchParams *txfm_params = &x->txfm_search_params; mi->tx_size = AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4)); const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size]; // If the current block size is the same as the transform block size, enable // mode pruning based on the best SAD so far. if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize) args.prune_mode_based_on_sad = true; int *bmode_costs; PREDICTION_MODE best_mode = DC_PRED; const MB_MODE_INFO *above_mi = xd->above_mbmi; const MB_MODE_INFO *left_mi = xd->left_mbmi; const PREDICTION_MODE A = av1_above_block_mode(above_mi); const PREDICTION_MODE L = av1_left_block_mode(left_mi); const int above_ctx = intra_mode_context[A]; const int left_ctx = intra_mode_context[L]; const unsigned int source_variance = x->source_variance; bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_invalid_rd_stats(&best_rdc); av1_invalid_rd_stats(&this_rdc); init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm); mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV; // Change the limit of this loop to add other intra prediction // mode tests. for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) { PREDICTION_MODE this_mode = intra_mode_list[mode_index]; // Force DC for spatially flat block for large bsize, on top-left corner. // This removed potential artifact observed in gray scale image for high Q. if (x->source_variance == 0 && mi_col == 0 && mi_row == 0 && bsize >= BLOCK_32X32 && this_mode > 0) continue; // As per the statistics generated for intra mode evaluation in the nonrd // path, it is found that the probability of H_PRED mode being the winner is // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED). // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply // the presence of a vertically dominant pattern. Hence, H_PRED mode is not // evaluated. if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far && this_mode == H_PRED && best_mode == V_PRED) continue; if (should_prune_intra_modes_using_neighbors( xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors, this_mode, A, L)) { // Prune V_PRED and H_PRED if source variance of the block is less than // or equal to 50. 
The source variance threshold is obtained empirically. if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50) continue; // As per the statistics, probability of SMOOTH_PRED being the winner is // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and // H_PRED). Hence, SMOOTH_PRED mode is not evaluated. if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue; } this_rdc.dist = this_rdc.rate = 0; args.mode = this_mode; args.skippable = 1; args.rdc = &this_rdc; mi->mode = this_mode; av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y, av1_estimate_block_intra, &args); if (this_rdc.rate == INT_MAX) continue; const int skip_ctx = av1_get_skip_txfm_context(xd); if (args.skippable) { this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; } else { this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0]; } this_rdc.rate += bmode_costs[this_mode]; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; best_mode = this_mode; if (!this_rdc.skip_txfm) { memset(ctx->blk_skip, 0, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); } } } const unsigned int thresh_sad = cpi->sf.rt_sf.prune_palette_search_nonrd > 1 ? 100 : 20; const unsigned int best_sad_norm = args.best_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); // Try palette if it's enabled. bool try_palette = cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(cpi->common.features.allow_screen_content_tools, mi->bsize); if (cpi->sf.rt_sf.prune_palette_search_nonrd > 0) { bool prune = (!args.prune_mode_based_on_sad || best_sad_norm > thresh_sad) && bsize <= BLOCK_16X16 && x->source_variance > 200; try_palette &= prune; } if (try_palette) { const TxfmSearchInfo *txfm_info = &x->txfm_search_info; const unsigned int intra_ref_frame_cost = 0; x->color_palette_thresh = (best_sad_norm < 500) ? 32 : 64; // Search palette mode for Luma plane in intra frame. av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, &this_rdc, best_rdc.rdcost); // Update best mode data. if (this_rdc.rdcost < best_rdc.rdcost) { best_mode = DC_PRED; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; best_rdc.rate = this_rdc.rate; best_rdc.dist = this_rdc.dist; best_rdc.rdcost = this_rdc.rdcost; if (!this_rdc.skip_txfm) { memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); } if (xd->tx_type_map[0] != DCT_DCT) av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } else { av1_zero(mi->palette_mode_info); } } mi->mode = best_mode; // Keep DC for UV since mode test is based on Y channel only. mi->uv_mode = UV_DC_PRED; *rd_cost = best_rdc; // For lossless: always force the skip flags off. // Even though the blk_skip is set to 0 above in the rdcost comparison, // do it here again in case the above logic changes. 
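// Leaving skip_txfm on would mean no residual is coded, so the
// reconstruction could differ from the source and lossless coding would be
// violated.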
if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { x->txfm_search_info.skip_txfm = 0; memset(ctx->blk_skip, 0, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); } #if CONFIG_INTERNAL_STATS store_coding_context_nonrd(x, ctx, mi->mode); #else store_coding_context_nonrd(x, ctx); #endif // CONFIG_INTERNAL_STATS } static inline int is_same_gf_and_last_scale(AV1_COMMON *cm) { struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME); struct scale_factors *const sf_golden = get_ref_scale_factors(cm, GOLDEN_FRAME); return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) && (sf_last->y_scale_fp == sf_golden->y_scale_fp)); } static inline void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi, int mi_row, int mi_col, BLOCK_SIZE bsize, int gf_temporal_ref, int use_ref_frame[], int *force_skip_low_temp_var) { AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); // When the ref_frame_config is used to set the reference frame structure // then the usage of alt_ref is determined by the ref_frame_flags // (and not the speed feature use_nonrd_altref_frame). int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config || cpi->sf.rt_sf.use_nonrd_altref_frame; int use_golden_ref_frame = 1; int use_last_ref_frame = 1; // When the ref_frame_config is used to set the reference frame structure: // check if LAST is used as a reference. And only remove golden and altref // references below if last is used as a reference. if (cpi->ppi->rtc_ref.set_ref_frame_config) use_last_ref_frame = cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0; // frame_since_golden is not used when user sets the referene structure. if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame && cpi->rc.frames_since_golden == 0 && gf_temporal_ref) { use_golden_ref_frame = 0; } if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var && x->nonrd_prune_ref_frame_search) { if (is_small_sb) *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb( &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); else *force_skip_low_temp_var = av1_get_force_skip_low_temp_var( &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); // If force_skip_low_temp_var is set, skip golden reference. if (*force_skip_low_temp_var) { use_golden_ref_frame = 0; use_alt_ref_frame = 0; } } if (use_last_ref_frame && (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk || (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) { use_golden_ref_frame = 0; use_alt_ref_frame = 0; } if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { use_golden_ref_frame = 1; use_alt_ref_frame = 0; } // Skip golden/altref reference if color is set, on flat blocks with motion. // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set) // except when x->nonrd_prune_ref_frame_search = 0. This latter flag // may be set in the variance partition when golden is a much better // reference than last, in which case it may not be worth skipping // golden/altref completely. // Condition on use_last_ref to make sure there remains at least one // reference. 
if (use_last_ref_frame && ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && x->nonrd_prune_ref_frame_search != 0) || (x->source_variance < 200 && x->content_state_sb.source_sad_nonrd >= kLowSad))) { if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) use_golden_ref_frame = 0; if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) use_alt_ref_frame = 0; } // For non-screen: if golden and altref are not being selected as references // (use_golden_ref_frame/use_alt_ref_frame = 0) check to allow golden back // based on the sad of nearest/nearmv of LAST ref. If this block sad is large, // keep golden as reference. Only do this for the agrressive pruning mode and // avoid it when color is set for golden reference. if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame && !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX && x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) { int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150; int pred = x->pred_mv_sad[LAST_FRAME] >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); if (pred > thr) use_golden_ref_frame = 1; } use_alt_ref_frame = cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0; use_golden_ref_frame = cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0; // For spatial layers: enable golden ref if it is set by user and // corresponds to the lower spatial layer. if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && x->content_state_sb.source_sad_nonrd < kHighSad) { const int buffslot_golden = cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME]; if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] == cpi->svc.current_superframe) use_golden_ref_frame = 1; } use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame; use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame; use_ref_frame[LAST_FRAME] = use_last_ref_frame; // Keep this assert on, as only 3 references are used in nonrd_pickmode // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this // frame must be an intra-only frame and hence should never enter the // pickmode here for inter frames. 
assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame); } static inline int is_filter_search_enabled_blk(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) { const AV1_COMMON *const cm = &cpi->common; // filt search disabled if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0; // filt search purely based on mode properties if (!cb_pred_filter_search) return 1; MACROBLOCKD *const xd = &x->e_mbd; int enable_interp_search = 0; if (!(xd->left_mbmi && xd->above_mbmi)) { // neighbors info unavailable enable_interp_search = 2; } else if (!(is_inter_block(xd->left_mbmi) && is_inter_block(xd->above_mbmi))) { // neighbor is INTRA enable_interp_search = 2; } else if (xd->left_mbmi->interp_filters.as_int != xd->above_mbmi->interp_filters.as_int) { // filters are different enable_interp_search = 2; } else if ((cb_pred_filter_search == 1) && (xd->left_mbmi->interp_filters.as_filters.x_filter != EIGHTTAP_REGULAR)) { // not regular enable_interp_search = 2; } else { // enable prediction based on chessboard pattern if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH) *filt_select = EIGHTTAP_SMOOTH; const int bsl = mi_size_wide_log2[bsize]; enable_interp_search = (bool)((((mi_row + mi_col) >> bsl) + get_chessboard_index(cm->current_frame.frame_number)) & 0x1); if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1; } return enable_interp_search; } static inline int skip_mode_by_threshold(PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv, int frames_since_golden, const int *const rd_threshes, const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip, int extra_shift) { int skip_this_mode = 0; const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)]; int64_t mode_rd_thresh = best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1) : ((int64_t)rd_threshes[mode_index]) << extra_shift; // Increase mode_rd_thresh value for non-LAST for improved encoding // speed if (ref_frame != LAST_FRAME) { mode_rd_thresh = mode_rd_thresh << 1; if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4) mode_rd_thresh = mode_rd_thresh << (extra_shift + 1); } if (rd_less_than_thresh(best_cost, mode_rd_thresh, rd_thresh_freq_fact[mode_index])) if (mv.as_int != 0) skip_this_mode = 1; return skip_this_mode; } static inline int skip_mode_by_low_temp( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) { // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) { return 1; } if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 && force_skip_low_temp_var && mode == NEWMV) { return 1; } return 0; } static inline int skip_mode_by_bsize_and_ref_frame( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, int extra_prune, unsigned int sse_zeromv_norm, int more_prune, int skip_nearmv) { const unsigned int thresh_skip_golden = 500; if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden && mode == NEWMV) return 1; if ((bsize == BLOCK_128X128 && mode == NEWMV) || (skip_nearmv && mode == NEARMV)) return 1; // Skip testing non-LAST if this flag is set. 
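// At the call site extra_prune is x->nonrd_prune_ref_frame_search and
// more_prune is rt_sf->nonrd_aggressive_skip; larger extra_prune values
// prune more of the non-LAST modes below.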
if (extra_prune) { if (extra_prune > 1 && ref_frame != LAST_FRAME && (bsize > BLOCK_16X16 && mode == NEWMV)) return 1; if (ref_frame != LAST_FRAME && mode == NEARMV) return 1; if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1; } return 0; } static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, struct buf_2d *yv12_mb) { struct macroblock_plane *const p = &x->plane[0]; const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride, yv12_mb->buf, yv12_mb->stride); if (y_sad == 0) x->block_is_zero_sad = 1; } static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int y_sad, unsigned int source_variance, struct buf_2d yv12_mb[MAX_MB_PLANE]) { const int subsampling_x = cpi->common.seq_params->subsampling_x; const int subsampling_y = cpi->common.seq_params->subsampling_y; const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; const int high_res = cpi->common.width * cpi->common.height >= 640 * 360; if (bsize == cpi->common.seq_params->sb_size && !x->force_color_check_block_level) { // At superblock level color_sensitivity is already set to 0, 1, or 2. // 2 is middle/uncertain level. To avoid additional sad // computations when bsize = sb_size force level 2 to 1 (certain color) // for motion areas. Avoid this shortcut if x->force_color_check_block_level // is set. if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) { x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = source_sad_nonrd >= kMedSad ? 1 : 0; } if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) { x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = source_sad_nonrd >= kMedSad ? 1 : 0; } return; } // Divide factor for comparing uv_sad to y_sad. int shift = 3; // Threshold for the block spatial source variance. unsigned int source_var_thr = 50; // Thresholds for normalized uv_sad, the first one is used for // low source_varaince. int norm_uv_sad_thresh = 100; int norm_uv_sad_thresh2 = 40; if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res) shift = 4; if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { if (cpi->rc.high_source_sad) shift = 6; if (source_sad_nonrd > kMedSad) { source_var_thr = 1200; norm_uv_sad_thresh = 10; } if (cpi->rc.percent_blocks_with_motion > 90 && cpi->rc.frame_source_sad > 10000 && source_sad_nonrd > kLowSad) { // Aggressive setting for color_sensitivity for this content. shift = 10; norm_uv_sad_thresh = 0; norm_uv_sad_thresh2 = 0; } } NOISE_LEVEL noise_level = kLow; int norm_sad = y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000; // If the spatial source variance is high and the normalized y_sad // is low, then y-channel is likely good for mode estimation, so keep // color_sensitivity off. For low noise content for now, since there is // some bdrate regression for noisy color clip. if (cpi->noise_estimate.enabled) noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kLow && source_variance > thresh_spatial && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) { x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0; x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0; return; } const int num_planes = av1_num_planes(&cpi->common); for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) { // Always check if level = 2. If level = 0 check again for // motion areas for higher resolns, where color artifacts // are more noticeable. 
Always check if // x->force_color_check_block_level is set. if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 || x->force_color_check_block_level || (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 && source_sad_nonrd >= kMedSad && high_res)) { struct macroblock_plane *const p = &x->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, subsampling_x, subsampling_y); const int uv_sad = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride); const int norm_uv_sad = uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); x->color_sensitivity[COLOR_SENS_IDX(plane)] = uv_sad > (y_sad >> shift) && norm_uv_sad > norm_uv_sad_thresh2; if (source_variance < source_var_thr && norm_uv_sad > norm_uv_sad_thresh) x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1; } } } static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE], const int *use_ref_frame_mask, const MV_REFERENCE_FRAME *rf, int *ref_mv_idx) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; MV_REFERENCE_FRAME ref_frame_comp; if (!use_ref_frame_mask[rf[1]]) { // Need to setup pred_block, if it hasn't been done in find_predictors. const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]); const int num_planes = av1_num_planes(cm); if (yv12 != NULL) { const struct scale_factors *const sf = get_ref_scale_factors_const(cm, rf[1]); av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes); } } ref_frame_comp = av1_ref_frame_type(rf); mbmi_ext->mode_context[ref_frame_comp] = 0; mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX; av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp); *ref_mv_idx = mbmi->ref_mv_idx + 1; } static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], PREDICTION_MODE this_mode) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; mi->ref_frame[0] = ref_frame; mi->ref_frame[1] = ref_frame2; mi->compound_idx = 1; mi->comp_group_idx = 0; mi->interinter_comp.type = COMPOUND_AVERAGE; MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame); if (this_mode == GLOBAL_GLOBALMV) { frame_mv[this_mode][ref_frame].as_int = 0; frame_mv[this_mode][ref_frame2].as_int = 0; } else if (this_mode == NEAREST_NEARESTMV) { frame_mv[this_mode][ref_frame].as_int = xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int; frame_mv[this_mode][ref_frame2].as_int = xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int; } else if (this_mode == NEAR_NEARMV) { frame_mv[this_mode][ref_frame].as_int = xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int; frame_mv[this_mode][ref_frame2].as_int = xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int; } } // Prune compound mode if the single mode variance is lower than a fixed // percentage of the median value. 
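// The per-bsize thresholds below were tuned for 64x64 and 32x32 blocks; the
// 128x128 and 16x16 cases reuse scaled versions of those values (see the
// note inside the function).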
static bool skip_comp_based_on_var( const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) { unsigned int best_var = UINT_MAX; for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) { for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]); } } const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659); const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281); // Currently, the thresholds for 128 and 16 are not well-tuned. We are using // the results from 64 and 32 as a heuristic. switch (bsize) { case BLOCK_128X128: return best_var < 4 * thresh_64; case BLOCK_64X64: return best_var < thresh_64; case BLOCK_32X32: return best_var < thresh_32; case BLOCK_16X16: return best_var < thresh_32 / 4; default: return false; } } static AOM_FORCE_INLINE void fill_single_inter_mode_costs( int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes, const REF_MODE *reference_mode_set, const ModeCosts *mode_costs, const int16_t *mode_context) { bool ref_frame_used[REF_FRAMES] = { false }; for (int idx = 0; idx < num_inter_modes; idx++) { ref_frame_used[reference_mode_set[idx].ref_frame] = true; } for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES; this_ref_frame++) { if (!ref_frame_used[this_ref_frame]) { continue; } const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME }; const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf); for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV; this_mode++) { single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] = cost_mv_ref(mode_costs, this_mode, mode_ctx); } } } static inline bool is_globalmv_better( PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv, const ModeCosts *mode_costs, const int (*single_inter_mode_costs)[REF_FRAMES], const MB_MODE_INFO_EXT *mbmi_ext) { const int globalmv_mode_cost = single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame]; int this_mode_cost = rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame]; if (this_mode == NEWMV || this_mode == NEARMV) { const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME }; this_mode_cost += get_drl_cost( NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf)); } return this_mode_cost > globalmv_mode_cost; } // Set up the mv/ref_frames etc. based on the comp_index. Returns 1 if it // succeeds, 0 if it fails.
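// A 0 return means this compound candidate is dropped: the second reference
// is disabled (speed features, ref_frame_flags, or color-sensitivity based
// skip), only GLOBAL_GLOBALMV is allowed for this block, or both component
// MVs are zero for a non-GLOBAL_GLOBALMV mode.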
static inline int setup_compound_params_from_comp_idx( const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE], PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], const int *use_ref_frame_mask, int comp_index, bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame, BLOCK_SIZE bsize) { const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame; int skip_gf = 0; int skip_alt = 0; *this_mode = comp_ref_mode_set[comp_index].pred_mode; *ref_frame = rf[0]; *ref_frame2 = rf[1]; assert(*ref_frame == LAST_FRAME); assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV); if (x->source_variance < 50 && bsize > BLOCK_16X16) { if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) skip_gf = 1; if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) skip_alt = 1; } if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) { return 0; } if (*ref_frame2 == GOLDEN_FRAME && (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf || !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) { return 0; } else if (*ref_frame2 == LAST2_FRAME && (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 || !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) { return 0; } else if (*ref_frame2 == ALTREF_FRAME && (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt || !(cpi->ref_frame_flags & AOM_ALT_FLAG))) { return 0; } int ref_mv_idx = 0; if (*last_comp_ref_frame != rf[1]) { // Only needs to be done once per reference pair. setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf, &ref_mv_idx); *last_comp_ref_frame = rf[1]; } set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv, *this_mode); if (*this_mode != GLOBAL_GLOBALMV && frame_mv[*this_mode][*ref_frame].as_int == 0 && frame_mv[*this_mode][*ref_frame2].as_int == 0) { return 0; } return 1; } static inline bool previous_mode_performed_poorly( PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const unsigned int (*vars)[REF_FRAMES], const int64_t (*uv_dist)[REF_FRAMES]) { unsigned int best_var = UINT_MAX; int64_t best_uv_dist = INT64_MAX; for (int midx = 0; midx < RTC_INTER_MODES; midx++) { best_var = AOMMIN(best_var, vars[midx][ref_frame]); best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]); } assert(best_var != UINT_MAX && "Invalid variance data."); const float mult = 1.125f; bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame]; if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX && best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) { // If we have chroma info, then take it into account var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame]; } return var_bad; } static inline bool prune_compoundmode_with_singlemode_var( PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame, MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES], const uint8_t (*mode_checked)[REF_FRAMES], const unsigned int (*vars)[REF_FRAMES], const int64_t (*uv_dist)[REF_FRAMES]) { const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode); const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode); bool first_ref_valid = false, second_ref_valid = false; bool first_ref_bad = false, second_ref_bad = false; if (mode_checked[single_mode0][ref_frame] && frame_mv[single_mode0][ref_frame].as_int == 
frame_mv[compound_mode][ref_frame].as_int && vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) { first_ref_valid = true; first_ref_bad = previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist); } if (mode_checked[single_mode1][ref_frame2] && frame_mv[single_mode1][ref_frame2].as_int == frame_mv[compound_mode][ref_frame2].as_int && vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) { second_ref_valid = true; second_ref_bad = previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist); } if (first_ref_valid && second_ref_valid) { return first_ref_bad && second_ref_bad; } else if (first_ref_valid || second_ref_valid) { return first_ref_bad || second_ref_bad; } return false; } // Function to setup parameters used for inter mode evaluation in non-rd. static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode( AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col, int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize #if CONFIG_AV1_TEMPORAL_DENOISING , PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode #endif ) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TxfmSearchInfo *txfm_info = &x->txfm_search_info; MB_MODE_INFO *const mi = xd->mi[0]; const ModeCosts *mode_costs = &x->mode_costs; int skip_pred_mv = 0; // Initialize variance and distortion (chroma) for all modes and reference // frames for (int idx = 0; idx < RTC_INTER_MODES; idx++) { for (int ref = 0; ref < REF_FRAMES; ref++) { search_state->vars[idx][ref] = UINT_MAX; search_state->uv_dist[idx][ref] = INT64_MAX; } } // Initialize values of color sensitivity with sb level color sensitivity av1_copy(x->color_sensitivity, x->color_sensitivity_sb); init_best_pickmode(&search_state->best_pickmode); // Estimate cost for single reference frames estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize, search_state->ref_costs_single); // Reset flag to indicate modes evaluated av1_zero(search_state->mode_checked); txfm_info->skip_txfm = 0; // Initialize mode decisions av1_invalid_rd_stats(&search_state->best_rdc); av1_invalid_rd_stats(&search_state->this_rdc); av1_invalid_rd_stats(rd_cost); for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { x->warp_sample_info[ref_idx].num = -1; } mi->bsize = bsize; mi->ref_frame[0] = NONE_FRAME; mi->ref_frame[1] = NONE_FRAME; #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { // if (cpi->ppi->use_svc) denoise_svc_pickmode = // av1_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) av1_denoiser_reset_frame_stats(ctx); } #endif // Populate predicated motion vectors for LAST_FRAME if (cpi->ref_frame_flags & AOM_LAST_FLAG) { find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv, search_state->yv12_mb, bsize, *force_skip_low_temp_var, x->force_zeromv_skip_for_blk, &search_state->use_scaled_ref_frame[LAST_FRAME]); } // Update mask to use all reference frame get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref, search_state->use_ref_frame_mask, force_skip_low_temp_var); skip_pred_mv = x->force_zeromv_skip_for_blk || (x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2); // Populate predicated motion vectors for other single reference frame // Start at LAST_FRAME + 1. 
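// Predictors are set up only for the references enabled in
// use_ref_frame_mask; skip_pred_mv (computed above) is passed through to
// find_predictors().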
for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1; ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) { if (search_state->use_ref_frame_mask[ref_frame_iter]) { find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv, search_state->yv12_mb, bsize, *force_skip_low_temp_var, skip_pred_mv, &search_state->use_scaled_ref_frame[ref_frame_iter]); } } } // Function to check the inter mode can be skipped based on mode statistics and // speed features settings. static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred, PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame, MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx, int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm, int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize, bool comp_use_zero_zeromv_only, bool check_globalmv) { AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; const SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; // Skip compound mode based on reference frame mask and type of the mode and // for allowed compound modes, setup ref mv stack and reference frame. if (idx >= num_inter_modes) { const int comp_index = idx - num_inter_modes; if (!setup_compound_params_from_comp_idx( cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2, search_state->frame_mv, search_state->use_ref_frame_mask, comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame, bsize)) { return true; } *is_single_pred = 0; } else { *this_mode = ref_mode_set[idx].pred_mode; *ref_frame = ref_mode_set[idx].ref_frame; *ref_frame2 = NONE_FRAME; } if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) && (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) return true; // Skip the mode if use reference frame mask flag is not set. if (!search_state->use_ref_frame_mask[*ref_frame]) return true; // Skip mode for some modes and reference frames when // force_zeromv_skip_for_blk flag is true. if (x->force_zeromv_skip_for_blk && ((!(*this_mode == NEARESTMV && search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) && *this_mode != GLOBALMV) || *ref_frame != LAST_FRAME)) return true; if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they // have this sb MV. And don't skip NEWMV_LAST: this will be set to // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't // have it. if (*this_mode == NEARESTMV && search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int == x->sb_me_mv.as_int) { return false; } if (*this_mode == NEARMV && search_state->frame_mv[NEARMV][LAST_FRAME].as_int == x->sb_me_mv.as_int) { return false; } if (*this_mode == NEWMV) { return false; } } // Skip the single reference mode for which mode check flag is set. if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) { return true; } // Skip GLOBALMV mode if check_globalmv flag is not enabled. 
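// check_globalmv is cleared in handle_inter_mode_nonrd() once an evaluated
// mode ends up with a near-zero motion vector, after which an explicit
// GLOBALMV candidate adds little.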
if (!check_globalmv && *this_mode == GLOBALMV) { return true; } #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer1); x->ms_stat_nonrd.num_searches[bsize][*this_mode]++; #endif mi->mode = *this_mode; mi->ref_frame[0] = *ref_frame; mi->ref_frame[1] = *ref_frame2; // Skip compound mode based on variance of previously evaluated single // reference modes. if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred && prune_compoundmode_with_singlemode_var( *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv, search_state->mode_checked, search_state->vars, search_state->uv_dist)) { return true; } *force_mv_inter_layer = 0; if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) || (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) || (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) { // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row), // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row). // Skip newmv and filter search. *force_mv_inter_layer = 1; if (*this_mode == NEWMV) { search_state->frame_mv[*this_mode][*ref_frame] = svc_mv; } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int != svc_mv.as_int) { return true; } } // If the segment reference frame feature is enabled then do nothing if the // current ref frame is not allowed. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame)) return true; // For screen content: skip mode testing based on source_sad. if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !x->force_zeromv_skip_for_blk) { // If source_sad is computed: skip non-zero motion // check for stationary (super)blocks. Otherwise if superblock // has motion skip the modes with zero motion on last reference // for flat blocks, and color is not set. // For the latter condition: the same condition should apply // to newmv if (0, 0), so this latter condition is repeated // below after search_new_mv. if (rt_sf->source_metrics_sb_nonrd) { if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 && x->content_state_sb.source_sad_nonrd == kZeroSad) || (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 && x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME && ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) || cpi->rc.high_source_sad) && x->source_variance == 0)) return true; } // Skip NEWMV search for flat blocks. if (rt_sf->skip_newmv_flat_blocks_screen && *this_mode == NEWMV && x->source_variance < 100) return true; // Skip non-LAST for color on flat blocks. if (*ref_frame > LAST_FRAME && x->source_variance == 0 && (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)) return true; } // Skip mode based on block size, reference frame mode and other block // properties. if (skip_mode_by_bsize_and_ref_frame( *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search, sse_zeromv_norm, rt_sf->nonrd_aggressive_skip, rt_sf->increase_source_sad_thresh)) return true; // Skip mode based on low temporal variance and souce sad. if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb, search_state->frame_mv[*this_mode][*ref_frame], force_skip_low_temp_var)) return true; // Disable this drop out case if the ref frame segment level feature is // enabled for this segment. 
This is to prevent the possibility that we // end up unable to pick any mode. if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Check for skipping GOLDEN and ALTREF based pred_mv_sad. if (rt_sf->nonrd_prune_ref_frame_search > 0 && x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) { if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true; } } // Check for skipping NEARMV based on pred_mv_sad. if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX && x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1)) return true; // Skip single reference mode based on rd threshold. if (*is_single_pred) { if (skip_mode_by_threshold( *this_mode, *ref_frame, search_state->frame_mv[*this_mode][*ref_frame], cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize], x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost, search_state->best_pickmode.best_mode_skip_txfm, (rt_sf->nonrd_aggressive_skip ? 1 : 0))) return true; } return false; } // Function to perform inter mode evaluation for non-rd static AOM_FORCE_INLINE bool handle_inter_mode_nonrd( AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred, PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr, int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv, #if CONFIG_AV1_TEMPORAL_DENOISING int64_t *zero_last_cost_orig, int denoise_svc_pickmode, #endif int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref, int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize, PREDICTION_MODE this_mode, InterpFilter filt_select, int cb_pred_filter_search, int reuse_inter_pred, int *sb_me_has_been_tested) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; const int bw = block_size_wide[bsize]; const InterpFilter filter_ref = cm->features.interp_filter; const InterpFilter default_interp_filter = EIGHTTAP_REGULAR; TxfmSearchInfo *txfm_info = &x->txfm_search_info; const ModeCosts *mode_costs = &x->mode_costs; const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0]; MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1]; int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame]; unsigned int var = UINT_MAX; int this_early_term = 0; int rate_mv = 0; int is_skippable; int skip_this_mv = 0; unsigned int var_threshold = UINT_MAX; PREDICTION_MODE this_best_mode; RD_STATS nonskip_rdc; av1_invalid_rd_stats(&nonskip_rdc); if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) { // Set the NEWMV_LAST to the sb MV. search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int; } else if (this_mode == NEWMV && !force_mv_inter_layer) { #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer2); #endif // Find the best motion vector for single/compound mode. 
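// search_new_mv() returns true when NEWMV should not be evaluated for this
// block; the skip reasons are listed in the comment below.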
const bool skip_newmv = search_new_mv( cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize, mi_row, mi_col, &rate_mv, &search_state->best_rdc); #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); x->ms_stat_nonrd.ms_time[bsize][this_mode] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); #endif // Skip NEWMV mode, // (i). For bsize smaller than 16X16 // (ii). Based on sad of the predicted mv w.r.t LAST_FRAME // (iii). When motion vector is same as that of reference mv if (skip_newmv) { return true; } } // Check the current motion vector is same as that of previously evaluated // motion vectors. for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { if (inter_mv_mode == this_mode) continue; if (is_single_pred && search_state->mode_checked[inter_mv_mode][ref_frame] && this_mv->as_int == search_state->frame_mv[inter_mv_mode][ref_frame].as_int) { skip_this_mv = 1; break; } } // Skip single mode if current motion vector is same that of previously // evaluated motion vectors. if (skip_this_mv && is_single_pred) return true; // For screen: for spatially flat blocks with non-zero motion, // skip newmv if the motion vector is (0, 0)-LAST, and color is not set. if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) { if (this_mv->as_int == 0 && ref_frame == LAST_FRAME && x->block_is_zero_sad == 0 && ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) || cpi->rc.high_source_sad) && x->source_variance == 0) return true; } mi->mode = this_mode; mi->mv[0].as_int = this_mv->as_int; mi->mv[1].as_int = 0; if (!is_single_pred) mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int; // Set buffers to store predicted samples for reuse if (reuse_inter_pred) { if (!*this_mode_pred) { *this_mode_pred = &tmp_buffer[3]; } else { *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; pd->dst.buf = (*this_mode_pred)->data; pd->dst.stride = bw; } } mi->motion_mode = SIMPLE_TRANSLATION; #if !CONFIG_REALTIME_ONLY if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) { calc_num_proj_ref(cpi, x, mi); } #endif // set variance threshold for compound mode pruning if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred && use_model_yrd_large) { const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode); const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode); var_threshold = AOMMIN(var_threshold, search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]); var_threshold = AOMMIN(var_threshold, search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]); } // decide interpolation filter, build prediction signal, get sse const bool is_mv_subpel = (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07); const bool enable_filt_search_this_mode = (filter_search_enabled_blk == 2) ? 
true : (filter_search_enabled_blk && !force_mv_inter_layer && is_single_pred && (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)); if (is_mv_subpel && enable_filt_search_this_mode) { #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer2); #endif search_filter_ref( cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col, tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term, &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred); #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); x->ms_stat_nonrd.ifs_time[bsize][this_mode] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); #endif #if !CONFIG_REALTIME_ONLY } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion && this_mode == NEWMV) { // Find the best motion mode when current mode is NEWMV search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize, &this_early_term, use_model_yrd_large, &rate_mv, best_pickmode->best_sse); if (this_mode == NEWMV) { this_mv[0] = mi->mv[0]; } #endif } else { mi->interp_filters = (filter_ref == SWITCHABLE) ? av1_broadcast_interp_filter(default_interp_filter) : av1_broadcast_interp_filter(filter_ref); if (force_mv_inter_layer) mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); // If it is sub-pel motion and cb_pred_filter_search is enabled, select // the pre-decided filter if (is_mv_subpel && cb_pred_filter_search) mi->interp_filters = av1_broadcast_interp_filter(filt_select); #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer2); #endif if (is_single_pred) { SubpelParams subpel_params; // Initialize inter mode level params for single reference mode. init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr, &subpel_params, xd->block_ref_scale_factors[0], pd->pre->width, pd->pre->height); av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr, &subpel_params); } else { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); } if (use_model_yrd_large) { model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &search_state->this_rdc, &this_early_term, 0, best_pickmode->best_sse, &var, var_threshold); } else { model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0, &this_early_term); } #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); x->ms_stat_nonrd.model_rd_time[bsize][this_mode] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); #endif } // update variance for single mode if (is_single_pred) { search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var; if (this_mv->as_int == 0) { search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; } } // prune compound mode based on single mode var threshold if (!is_single_pred && var > var_threshold) { if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); return true; } if (ref_frame == LAST_FRAME && this_mv->as_int == 0) { *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); } // Perform early termination based on sse. 
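// early_term_inter_search_with_sse() compares this mode's sse against the
// best sse seen so far; clearly worse candidates skip the remaining
// transform and chroma work.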
if (rt_sf->sse_early_term_inter_search && early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search, bsize, search_state->this_rdc.sse, best_pickmode->best_sse, this_mode)) { if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); return true; } #if COLLECT_NONRD_PICK_MODE_STAT x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++; #endif const int skip_ctx = av1_get_skip_txfm_context(xd); const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1]; const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0]; const int64_t sse_y = search_state->this_rdc.sse; if (this_early_term) { search_state->this_rdc.skip_txfm = 1; search_state->this_rdc.rate = skip_txfm_cost; search_state->this_rdc.dist = search_state->this_rdc.sse << 4; } else { #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer2); #endif // Calculates RD Cost using Hadamard transform. av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize, mi->tx_size); if (search_state->this_rdc.skip_txfm || RDCOST(x->rdmult, search_state->this_rdc.rate, search_state->this_rdc.dist) >= RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) { if (!search_state->this_rdc.skip_txfm) { // Need to store "real" rdc for possible future use if UV rdc // disallows tx skip nonskip_rdc = search_state->this_rdc; nonskip_rdc.rate += no_skip_txfm_cost; } search_state->this_rdc.rate = skip_txfm_cost; search_state->this_rdc.skip_txfm = 1; search_state->this_rdc.dist = search_state->this_rdc.sse; } else { search_state->this_rdc.rate += no_skip_txfm_cost; } // Populate predicted sample for chroma planes based on color sensitivity. if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) { RD_STATS rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_U, AOM_PLANE_U); } if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_V, AOM_PLANE_V); } // Compute sse for chroma planes. const int64_t sse_uv = av1_model_rd_for_sb_uv( cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V); if (rdc_uv.dist < x->min_dist_inter_uv) x->min_dist_inter_uv = rdc_uv.dist; search_state->this_rdc.sse += sse_uv; // Restore Y rdc if UV rdc disallows txfm skip if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm && nonskip_rdc.rate != INT_MAX) search_state->this_rdc = nonskip_rdc; if (is_single_pred) { search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist; } search_state->this_rdc.rate += rdc_uv.rate; search_state->this_rdc.dist += rdc_uv.dist; search_state->this_rdc.skip_txfm = search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm; } #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); x->ms_stat_nonrd.txfm_time[bsize][this_mode] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); #endif } this_best_mode = this_mode; // TODO(kyslov) account for UV prediction cost search_state->this_rdc.rate += rate_mv; if (!is_single_pred) { const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx); } else { // If the current mode has zeromv but is not GLOBALMV, compare the rate // cost. If GLOBALMV is cheaper, use GLOBALMV instead. 
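// When the MVs coincide the two modes yield the same prediction, so only
// the signaling cost differs: is_globalmv_better() weighs the current
// mode's mode + mv/drl cost against the GLOBALMV mode cost.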
if (this_mode != GLOBALMV && this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) { if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs, search_state->single_inter_mode_costs, mbmi_ext)) { this_best_mode = GLOBALMV; } } search_state->this_rdc.rate += search_state ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame]; } if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) { search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; } search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame]; search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate, search_state->this_rdc.dist); if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) { newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize, search_state->frame_mv[this_best_mode][ref_frame].as_mv.row, search_state->frame_mv[this_best_mode][ref_frame].as_mv.col, cpi->speed, x->source_variance, x->content_state_sb); } #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); // Keep track of zero_last cost. if (ref_frame == LAST_FRAME && this_mv->as_int == 0) *zero_last_cost_orig = search_state->this_rdc.rdcost; } #else (void)(sse_y); #endif search_state->mode_checked[this_mode][ref_frame] = 1; search_state->mode_checked[this_best_mode][ref_frame] = 1; if (*check_globalmv) { int32_t abs_mv = abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) + abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col); // Early exit check: if the magnitude of this_best_mode's mv is small // enough, we skip GLOBALMV check in the next loop iteration. if (abs_mv < 2) { *check_globalmv = false; } } #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); #endif if (x->sb_me_block && ref_frame == LAST_FRAME && search_state->frame_mv[this_best_mode][ref_frame].as_int == x->sb_me_mv.as_int) *sb_me_has_been_tested = 1; // Copy best mode params to search state if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { search_state->best_rdc = search_state->this_rdc; *best_early_term = this_early_term; update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx, this_best_mode, sse_y); // This is needed for the compound modes. 
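// frame_mv[] may be overwritten as later modes and reference frames are
// searched, so the winning motion vectors are snapshotted into
// frame_mv_best[]; av1_nonrd_pick_inter_mode_sb() restores mi->mv from
// frame_mv_best[] once the mode loop finishes.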
search_state->frame_mv_best[this_best_mode][ref_frame].as_int = search_state->frame_mv[this_best_mode][ref_frame].as_int; if (ref_frame2 > NONE_FRAME) { search_state->frame_mv_best[this_best_mode][ref_frame2].as_int = search_state->frame_mv[this_best_mode][ref_frame2].as_int; } if (reuse_inter_pred) { free_pred_buffer(best_pickmode->best_pred); best_pickmode->best_pred = *this_mode_pred; } } else { if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); } if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) { txfm_info->skip_txfm = 1; if (!x->sb_me_block || *sb_me_has_been_tested) return false; } return true; } // Function to perform screen content mode evaluation for non-rd static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx, PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette, int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col, int mi_row) { AV1_COMMON *const cm = &cpi->common; const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; TxfmSearchInfo *txfm_info = &x->txfm_search_info; BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit // when issue 3359 is fixed. if (cm->seq_params->bit_depth == 8 && rt_sf->use_idtx_nonrd && !skip_idtx_palette && !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk && is_inter_mode(best_pickmode->best_mode) && best_pickmode->best_pred != NULL && (!rt_sf->prune_idtx_nonrd || (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 && best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) { RD_STATS idtx_rdc; av1_init_rd_stats(&idtx_rdc); int is_skippable; this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; pd->dst.buf = this_mode_pred->data; pd->dst.stride = bw; const PRED_BUFFER *const best_pred = best_pickmode->best_pred; av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc, &is_skippable, bsize, mi->tx_size); int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); int allow_idtx = 1; // Incorporate color into rd cost. 
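// If either chroma plane is flagged as color sensitive, build its inter
// predictor and add the modelled chroma rate/distortion to the IDTX cost.
// IDTX is additionally disallowed when the luma cost is zero but chroma
// still has distortion on low-variance blocks with significant motion.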
if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) { RD_STATS rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_U, AOM_PLANE_U); } if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_V, AOM_PLANE_V); } av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V); if (rdc_uv.dist < x->min_dist_inter_uv) x->min_dist_inter_uv = rdc_uv.dist; idtx_rdc.rate += rdc_uv.rate; idtx_rdc.dist += rdc_uv.dist; idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm; if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 && x->content_state_sb.source_sad_nonrd > kMedSad) allow_idtx = 0; } int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) { best_pickmode->tx_type = IDTX; search_state->best_rdc.rdcost = idx_rdcost; best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm; if (!idtx_rdc.skip_txfm) { memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); } xd->tx_type_map[0] = best_pickmode->tx_type; memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); } pd->dst = *orig_dst; } if (!try_palette) return; const unsigned int intra_ref_frame_cost = search_state->ref_costs_single[INTRA_FRAME]; if (!is_mode_intra(best_pickmode->best_mode)) { PRED_BUFFER *const best_pred = best_pickmode->best_pred; if (reuse_inter_pred && best_pred != NULL) { if (best_pred->data == orig_dst->buf) { this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; aom_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, bw, bh); best_pickmode->best_pred = this_mode_pred; } } pd->dst = *orig_dst; } // Search palette mode for Luma plane in inter frame. av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, &search_state->this_rdc, search_state->best_rdc.rdcost); // Update best mode data in search_state if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { best_pickmode->pmi = mi->palette_mode_info; best_pickmode->best_mode = DC_PRED; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; best_pickmode->best_ref_frame = INTRA_FRAME; best_pickmode->best_second_ref_frame = NONE; search_state->best_rdc.rate = search_state->this_rdc.rate; search_state->best_rdc.dist = search_state->this_rdc.dist; search_state->best_rdc.rdcost = search_state->this_rdc.rdcost; best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; // Keep the skip_txfm off if the color_sensitivity is set. 
if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) search_state->this_rdc.skip_txfm = 0; if (!search_state->this_rdc.skip_txfm) { memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); } if (xd->tx_type_map[0] != DCT_DCT) av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } } static inline bool enable_palette(AV1_COMP *cpi, bool is_mode_intra, BLOCK_SIZE bsize, unsigned int source_variance, int force_zeromv_skip, int skip_idtx_palette, int force_palette_test, unsigned int best_intra_sad_norm) { if (!cpi->oxcf.tool_cfg.enable_palette) return false; if (!av1_allow_palette(cpi->common.features.allow_screen_content_tools, bsize)) { return false; } if (skip_idtx_palette) return false; if (cpi->sf.rt_sf.prune_palette_search_nonrd > 1 && ((cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) || bsize > BLOCK_16X16)) { return false; } if (prune_palette_testing_inter(cpi, source_variance) && best_intra_sad_norm < 10) return false; if ((is_mode_intra || force_palette_test) && source_variance > 0 && !force_zeromv_skip && (cpi->rc.high_source_sad || source_variance > 300)) { return true; } else { return false; } } /*!\brief AV1 inter mode selection based on Non-RD optimized model. * * \ingroup nonrd_mode_search * \callgraph * Top level function for Non-RD optimized inter mode selection. * This function will loop over a subset of inter modes and select the best one * based on calculated modelled RD cost. While deciding which modes to * check, this function applies heuristics based on previously checked modes, * block residual variance, block size, and other factors to prune certain * modes and reference frames. Mostly single reference frame modes * are checked; compound modes are evaluated only for larger blocks when the * corresponding speed feature is enabled. Additional heuristics are applied * to decide if intra modes need to be checked. * * * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] rd_cost Struct to keep track of the RD information * \param[in] bsize Current block size * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. The rd_cost struct is also updated with the RD stats * corresponding to the best mode found.
*/ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { AV1_COMMON *const cm = &cpi->common; SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; MV_REFERENCE_FRAME ref_frame, ref_frame2; const unsigned char segment_id = mi->segment_id; int best_early_term = 0; int force_skip_low_temp_var = 0; unsigned int sse_zeromv_norm = UINT_MAX; const int num_inter_modes = NUM_INTER_MODES; const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; bool check_globalmv = rt_sf->check_globalmv_on_single_ref; PRED_BUFFER tmp_buffer[4]; DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]); PRED_BUFFER *this_mode_pred = NULL; const int reuse_inter_pred = rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8; InterModeSearchStateNonrd search_state; av1_zero(search_state.use_ref_frame_mask); av1_zero(search_state.use_scaled_ref_frame); BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode; (void)tile_data; const int bh = block_size_high[bsize]; const int bw = block_size_wide[bsize]; const int pixels_in_block = bh * bw; struct buf_2d orig_dst = pd->dst; const TxfmSearchParams *txfm_params = &x->txfm_search_params; TxfmSearchInfo *txfm_info = &x->txfm_search_info; #if COLLECT_NONRD_PICK_MODE_STAT // Mode statistics can be collected only when num_workers is 1 assert(cpi->mt_info.num_workers <= 1); aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer); #endif int64_t thresh_sad_pred = INT64_MAX; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int_mv svc_mv = { .as_int = 0 }; int force_mv_inter_layer = 0; bool comp_use_zero_zeromv_only = 0; int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT; #if CONFIG_AV1_TEMPORAL_DENOISING const int denoise_recheck_zeromv = 1; AV1_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; int denoise_svc_pickmode = 1; const int resize_pending = is_frame_resize_pending(cpi); #endif const ModeCosts *mode_costs = &x->mode_costs; struct scale_factors sf_no_scale; av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height, cm->width, cm->height); if (reuse_inter_pred) { for (int buf_idx = 0; buf_idx < 3; buf_idx++) { tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx]; tmp_buffer[buf_idx].stride = bw; tmp_buffer[buf_idx].in_use = 0; } tmp_buffer[3].data = pd->dst.buf; tmp_buffer[3].stride = pd->dst.stride; tmp_buffer[3].in_use = 0; } const int gf_temporal_ref = is_same_gf_and_last_scale(cm); // If the lower spatial layer uses an averaging filter for downsampling // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative // to source, so use subpel motion vector to compensate. The nonzero motion // is half pixel shifted to left and top, so (-4, -4). This has more effect // on higher resolutions, so condition it on that for now. // Exclude quality layers, which have the same resolution and hence no shift. if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && !svc->has_lower_quality_layer && svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && cm->width * cm->height > 640 * 480) { svc_mv.as_mv.row = -4; svc_mv.as_mv.col = -4; } // Setup parameters used for inter mode evaluation. 
set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost, &force_skip_low_temp_var, mi_row, mi_col, gf_temporal_ref, segment_id, bsize #if CONFIG_AV1_TEMPORAL_DENOISING , ctx, denoise_svc_pickmode #endif ); if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) { // Only search compound if bsize \gt BLOCK_16X16. if (bsize > BLOCK_16X16) { comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks; } else { tot_num_comp_modes = 0; } } else { tot_num_comp_modes = 0; } if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) { thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1; // Increase threshold for less aggressive pruning. if (rt_sf->nonrd_prune_ref_frame_search == 1) thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2); } const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize); // decide block-level interp filter search flags: // filter_search_enabled_blk: // 0: disabled // 1: filter search depends on mode properties // 2: filter search forced since prediction is unreliable // cb_pred_filter_search 0: disabled cb prediction InterpFilter filt_select = EIGHTTAP_REGULAR; const int cb_pred_filter_search = x->content_state_sb.source_sad_nonrd > kVeryLowSad ? cpi->sf.interp_sf.cb_pred_filter_search : 0; const int filter_search_enabled_blk = is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id, cb_pred_filter_search, &filt_select); #if COLLECT_NONRD_PICK_MODE_STAT x->ms_stat_nonrd.num_blocks[bsize]++; #endif init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm); mi->tx_size = AOMMIN( AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]), TX_16X16); fill_single_inter_mode_costs(search_state.single_inter_mode_costs, num_inter_modes, ref_mode_set, mode_costs, mbmi_ext->mode_context); MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME; // Initialize inter prediction params at block level for single reference // mode. InterPredParams inter_pred_params_sr; init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height, mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), /*is_intrabc=*/0); inter_pred_params_sr.conv_params = get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd); x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad || segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !x->force_zeromv_skip_for_blk && x->content_state_sb.source_sad_nonrd != kZeroSad && x->source_variance == 0 && bsize < cm->seq_params->sb_size && search_state.yv12_mb[LAST_FRAME][0].width == cm->width && search_state.yv12_mb[LAST_FRAME][0].height == cm->height) { set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]); } int sb_me_has_been_tested = 0; x->sb_me_block = x->sb_me_partition; // Only use this feature (force testing of superblock motion) if coding // block size is large. if (x->sb_me_block) { if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64) x->sb_me_block = 0; else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32) x->sb_me_block = 0; } x->min_dist_inter_uv = INT64_MAX; for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) { // If we are at the first compound mode, and the single modes already // perform well, then end the search. 
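// idx == num_inter_modes marks the switch from single-reference to compound
// modes; skip_comp_based_on_var() looks at the per-mode prediction variances
// gathered during the single-reference passes to decide whether compound
// modes are worth evaluating at this block size.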
if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes && skip_comp_based_on_var(search_state.vars, bsize)) { break; } int is_single_pred = 1; PREDICTION_MODE this_mode; if (idx == 0 && !x->force_zeromv_skip_for_blk) { // Set color sensitivity on first tested mode only. // Use y-sad already computed in find_predictors: take the sad with motion // vector closest to 0; the uv-sad computed below in set_color_sensitivity // is for zeromv. // For screen: first check if golden reference is being used, if so, // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1. // The check in set_color_sensitivity() will then follow and check for // setting the flag if the level is still 2 or 0. if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && search_state.use_ref_frame_mask[GOLDEN_FRAME]) { if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1) x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1; if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1; } if (search_state.use_ref_frame_mask[LAST_FRAME] && x->pred_mv0_sad[LAST_FRAME] != INT_MAX) { int y_sad = x->pred_mv0_sad[LAST_FRAME]; if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX && (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) + abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) < (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) + abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row))) y_sad = x->pred_mv1_sad[LAST_FRAME]; set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance, search_state.yv12_mb[LAST_FRAME]); } } // Check the inter mode can be skipped based on mode statistics and speed // features settings. if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred, &force_mv_inter_layer, &is_single_pred, &this_mode, &last_comp_ref_frame, &ref_frame, &ref_frame2, idx, svc_mv, force_skip_low_temp_var, sse_zeromv_norm, num_inter_modes, segment_id, bsize, comp_use_zero_zeromv_only, check_globalmv)) continue; // Select prediction reference frames. for (int plane = 0; plane < MAX_MB_PLANE; plane++) { xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane]; if (!is_single_pred) xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane]; } mi->ref_frame[0] = ref_frame; mi->ref_frame[1] = ref_frame2; set_ref_ptrs(cm, xd, ref_frame, ref_frame2); // Check if the scaled reference frame should be used. This is set in the // find_predictors() for each usable reference. If so, set the // block_ref_scale_factors[] to no reference scaling. 
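// sf_no_scale was set up earlier with identical source and destination
// dimensions, so pointing block_ref_scale_factors at it effectively disables
// reference scaling for this block's prediction.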
if (search_state.use_scaled_ref_frame[ref_frame]) { xd->block_ref_scale_factors[0] = &sf_no_scale; } if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) { xd->block_ref_scale_factors[1] = &sf_no_scale; } // Perform inter mode evaluation for non-rd if (!handle_inter_mode_nonrd( cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer, inter_pred_params_sr, &best_early_term, &sse_zeromv_norm, &check_globalmv, #if CONFIG_AV1_TEMPORAL_DENOISING &zero_last_cost_orig, denoise_svc_pickmode, #endif idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref, use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode, filt_select, cb_pred_filter_search, reuse_inter_pred, &sb_me_has_been_tested)) { break; } } // Restore mode data of best inter mode mi->mode = best_pickmode->best_mode; mi->motion_mode = best_pickmode->best_motion_mode; mi->wm_params = best_pickmode->wm_params; mi->num_proj_ref = best_pickmode->num_proj_ref; mi->interp_filters = best_pickmode->best_pred_filter; mi->tx_size = best_pickmode->best_tx_size; memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size)); mi->ref_frame[0] = best_pickmode->best_ref_frame; mi->mv[0].as_int = search_state .frame_mv_best[best_pickmode->best_mode] [best_pickmode->best_ref_frame] .as_int; mi->mv[1].as_int = 0; if (best_pickmode->best_second_ref_frame > INTRA_FRAME) { mi->ref_frame[1] = best_pickmode->best_second_ref_frame; mi->mv[1].as_int = search_state .frame_mv_best[best_pickmode->best_mode] [best_pickmode->best_second_ref_frame] .as_int; } // Perform intra prediction search, if the best SAD is above a certain // threshold. mi->angle_delta[PLANE_TYPE_Y] = 0; mi->angle_delta[PLANE_TYPE_UV] = 0; mi->filter_intra_mode_info.use_filter_intra = 0; #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_start(&x->ms_stat_nonrd.timer1); x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++; x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++; #endif int force_palette_test = 0; if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && x->content_state_sb.source_sad_nonrd != kZeroSad && bsize <= BLOCK_16X16) { unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000; unsigned int thresh_source_var = cpi->rc.high_source_sad ? 
50 : 200; unsigned int best_sse_inter_motion = (unsigned int)(search_state.best_rdc.sse >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); if (best_sse_inter_motion > thresh_sse && x->source_variance > thresh_source_var) force_palette_test = 1; } // Evaluate Intra modes in inter frame unsigned int best_intra_sad_norm = UINT_MAX; if (!x->force_zeromv_skip_for_blk) av1_estimate_intra_mode(cpi, x, bsize, best_early_term, search_state.ref_costs_single[INTRA_FRAME], reuse_inter_pred, &orig_dst, tmp_buffer, &this_mode_pred, &search_state.best_rdc, best_pickmode, ctx, &best_intra_sad_norm); int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) && x->content_state_sb.source_sad_nonrd != kZeroSad && !cpi->rc.high_source_sad && (cpi->rc.high_motion_content_screen_rtc || cpi->rc.frame_source_sad < 10000); bool try_palette = enable_palette( cpi, is_mode_intra(best_pickmode->best_mode), bsize, x->source_variance, x->force_zeromv_skip_for_blk, skip_idtx_palette, force_palette_test, best_intra_sad_norm); if (try_palette && prune_palette_testing_inter(cpi, x->source_variance)) x->color_palette_thresh = 32; // Perform screen content mode evaluation for non-rd handle_screen_content_mode_nonrd( cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst, skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row); #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); #endif pd->dst = orig_dst; // Best mode is finalized. Restore the mode data to mbmi if (try_palette) mi->palette_mode_info = best_pickmode->pmi; mi->mode = best_pickmode->best_mode; mi->ref_frame[0] = best_pickmode->best_ref_frame; mi->ref_frame[1] = best_pickmode->best_second_ref_frame; // For lossless: always force the skip flags off. if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { txfm_info->skip_txfm = 0; memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk); } else { txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm; } if (has_second_ref(mi)) { mi->comp_group_idx = 0; mi->compound_idx = 1; mi->interinter_comp.type = COMPOUND_AVERAGE; } if (!is_inter_block(mi)) { mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS); } else { // If inter mode is selected and ref_frame was one that uses the // scaled reference frame, then we can't use reuse_inter_pred. 
if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] || (has_second_ref(mi) && search_state .use_scaled_ref_frame[best_pickmode->best_second_ref_frame])) x->reuse_inter_pred = 0; } // Restore the predicted samples of best mode to final buffer if (reuse_inter_pred && best_pickmode->best_pred != NULL) { PRED_BUFFER *const best_pred = best_pickmode->best_pred; if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, bw, bh); } } #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { AV1_DENOISER_DECISION decision = COPY_BLOCK; ctx->sb_skip_denoising = 0; av1_pickmode_ctx_den_update( &ctx_den, zero_last_cost_orig, search_state.ref_costs_single, search_state.frame_mv, reuse_inter_pred, best_pickmode); av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, gf_temporal_ref); if (denoise_recheck_zeromv) recheck_zeromv_after_denoising( cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb, &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col); best_pickmode->best_ref_frame = ctx_den.best_ref_frame; } #endif // Update the factors used for RD thresholding for all modes. if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) { THR_MODES best_mode_idx = mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)]; if (best_pickmode->best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); for (int mode_index = 0; mode_index < intra_modes; mode_index++) { update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx, intra_mode_list[mode_index]); } } else { PREDICTION_MODE this_mode; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame, best_mode_idx, this_mode); } } } #if CONFIG_INTERNAL_STATS store_coding_context_nonrd(x, ctx, mi->mode); #else store_coding_context_nonrd(x, ctx); #endif // CONFIG_INTERNAL_STATS #if COLLECT_NONRD_PICK_MODE_STAT aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer); x->ms_stat_nonrd.total_block_times[bsize] += aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer); print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows, cm->mi_params.mi_cols, mi_row, mi_col); #endif // COLLECT_NONRD_PICK_MODE_STAT *rd_cost = search_state.best_rdc; // Reset the xd->block_ref_scale_factors[i], as they may have // been set to pointer &sf_no_scale, which becomes invalid afer // this function. set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); } aom-3.12.1/av1/encoder/optical_flow.c000066400000000000000000001265401477627663500173450ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <math.h> #include <stdlib.h> #include "config/aom_config.h" #include "aom_dsp/mathutils.h" #include "aom_mem/aom_mem.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/encoder.h" #include "av1/encoder/optical_flow.h" #include "av1/encoder/sparse_linear_solver.h" #include "av1/encoder/reconinter_enc.h" #if CONFIG_OPTICAL_FLOW_API void av1_init_opfl_params(OPFL_PARAMS *opfl_params) { opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS; opfl_params->warping_steps = OPFL_WARPING_STEPS; opfl_params->lk_params = NULL; } void av1_init_lk_params(LK_PARAMS *lk_params) { lk_params->window_size = OPFL_WINDOW_SIZE; } // Helper function to determine whether a frame is encoded with high bit-depth. static inline int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; } // Helper function to determine whether optical flow method is sparse. static inline int is_sparse(const OPFL_PARAMS *opfl_params) { return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0; } static void gradients_over_window(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref_frame, const double x_coord, const double y_coord, const int window_size, const int bit_depth, double *ix, double *iy, double *it, LOCALMV *mv); // coefficients for bilinear interpolation on unit square static int pixel_interp(const double x, const double y, const double b00, const double b01, const double b10, const double b11) { const int xint = (int)x; const int yint = (int)y; const double xdec = x - xint; const double ydec = y - yint; const double a = (1 - xdec) * (1 - ydec); const double b = xdec * (1 - ydec); const double c = (1 - xdec) * ydec; const double d = xdec * ydec; // if x, y are already integers, this reduces to b00 int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11); return interp; } // Scharr filter to compute spatial gradient static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord, const int y_coord, const int direction, double *derivative) { double *filter; // Scharr filters double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 }; double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 }; if (direction == 0) { // x direction filter = gx; } else { // y direction filter = gy; } int idx = 0; double d = 0; for (int yy = -1; yy <= 1; yy++) { for (int xx = -1; xx <= 1; xx++) { d += filter[idx] * frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)]; idx++; } } // normalization scaling factor for Scharr *derivative = d / 32.0; } // Determine the spatial gradient at subpixel locations // For example, when reducing images for pyramidal LK, // corners found in original image may be at subpixel locations.
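// gradient_interp() below bilinearly interpolates the four surrounding
// full-pel gradient samples; at the right/bottom image border it falls back
// to the nearest full-pel value.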
static void gradient_interp(double *fullpel_deriv, const double x_coord, const double y_coord, const int w, const int h, double *derivative) { const int xint = (int)x_coord; const int yint = (int)y_coord; double interp; if (xint + 1 > w - 1 || yint + 1 > h - 1) { interp = fullpel_deriv[yint * w + xint]; } else { interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint], fullpel_deriv[yint * w + (xint + 1)], fullpel_deriv[(yint + 1) * w + xint], fullpel_deriv[(yint + 1) * w + (xint + 1)]); } *derivative = interp; } static void temporal_gradient(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *frame2, const double x_coord, const double y_coord, const int bit_depth, double *derivative, LOCALMV *mv) { const int w = 2; const int h = 2; uint8_t pred1[4]; uint8_t pred2[4]; const int y = (int)y_coord; const int x = (int)x_coord; const double ydec = y_coord - y; const double xdec = x_coord - x; const int is_intrabc = 0; // Is intra-copied? const int is_high_bitdepth = is_frame_high_bitdepth(frame2); const int subsampling_x = 0, subsampling_y = 0; // for y-buffer const int_interpfilters interp_filters = av1_broadcast_interp_filter(MULTITAP_SHARP); const int plane = 0; // y-plane const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width, frame2->y_crop_height, frame2->y_stride }; struct scale_factors scale; av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width, frame->y_crop_height, frame->y_crop_width, frame->y_crop_height); InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, &scale, &ref_buf2, interp_filters); inter_pred_params.interp_filter_params[0] = &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; inter_pred_params.interp_filter_params[1] = &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8), .col = (int16_t)round((mv->col + ydec) * 8) }; av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params); const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width, frame->y_crop_height, frame->y_stride }; av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, &scale, &ref_buf1, interp_filters); inter_pred_params.interp_filter_params[0] = &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; inter_pred_params.interp_filter_params[1] = &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); MV zeroMV = { .row = (int16_t)round(xdec * 8), .col = (int16_t)round(ydec * 8) }; av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params); *derivative = pred2[0] - pred1[0]; } // Numerical differentiate over window_size x window_size surrounding (x,y) // location. 
Alters ix, iy, it to contain numerical partial derivatives static void gradients_over_window(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref_frame, const double x_coord, const double y_coord, const int window_size, const int bit_depth, double *ix, double *iy, double *it, LOCALMV *mv) { const double left = x_coord - window_size / 2.0; const double top = y_coord - window_size / 2.0; // gradient operators need pixel before and after (start at 1) const double x_start = AOMMAX(1, left); const double y_start = AOMMAX(1, top); const int frame_height = frame->y_crop_height; const int frame_width = frame->y_crop_width; double deriv_x; double deriv_y; double deriv_t; const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2); const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2); const int xs = (int)AOMMAX(1, x_start - 1); const int ys = (int)AOMMAX(1, y_start - 1); const int xe = (int)AOMMIN(x_end + 2, frame_width - 2); const int ye = (int)AOMMIN(y_end + 2, frame_height - 2); // with normalization, gradients may be double values double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x)); double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y)); if (!fullpel_dx || !fullpel_dy) { aom_free(fullpel_dx); aom_free(fullpel_dy); return; } // TODO(any): This could be more efficient in the case that x_coord // and y_coord are integers.. but it may look more messy. // calculate spatial gradients at full pixel locations for (int j = ys; j < ye; j++) { for (int i = xs; i < xe; i++) { spatial_gradient(frame, i, j, 0, &deriv_x); spatial_gradient(frame, i, j, 1, &deriv_y); int idx = (j - ys) * (xe - xs) + (i - xs); fullpel_dx[idx] = deriv_x; fullpel_dy[idx] = deriv_y; } } // compute numerical differentiation for every pixel in window // (this potentially includes subpixels) for (double j = y_start; j < y_end; j++) { for (double i = x_start; i < x_end; i++) { temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv); gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x); gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y); int idx = (int)(j - top) * window_size + (int)(i - left); ix[idx] = deriv_x; iy[idx] = deriv_y; it[idx] = deriv_t; } } // TODO(any): to avoid setting deriv arrays to zero for every iteration, // could instead pass these two values back through function call // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left); // int width = window_size - ((int)(x_start - left) + (int)(left + window_size // - x_end)); aom_free(fullpel_dx); aom_free(fullpel_dy); } // To compute eigenvalues of 2x2 matrix: Solve for lambda where // Determinant(matrix - lambda*identity) == 0 static void eigenvalues_2x2(const double *matrix, double *eig) { const double a = 1; const double b = -1 * matrix[0] - matrix[3]; const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3]; // quadratic formula const double discriminant = b * b - 4 * a * c; eig[0] = (-b - sqrt(discriminant)) / (2.0 * a); eig[1] = (-b + sqrt(discriminant)) / (2.0 * a); // double check that eigenvalues are ordered by magnitude if (fabs(eig[0]) > fabs(eig[1])) { double tmp = eig[0]; eig[0] = eig[1]; eig[1] = tmp; } } // Shi-Tomasi corner detection criteria static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter, const YV12_BUFFER_CONFIG *ref_frame, const int x, const int y, double *i_x, double *i_y, double *i_t, const int n, const int bit_depth) { double eig[2]; LOCALMV mv = { .row = 0, .col = 
0 }; // TODO(any): technically, ref_frame and i_t are not used by corner score // so these could be replaced by dummy variables, // or change this to spatial gradient function over window only gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x, i_y, i_t, &mv); double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 }; multiply_mat(i_x, i_x, Mres1, 1, n * n, 1); multiply_mat(i_x, i_y, Mres2, 1, n * n, 1); multiply_mat(i_y, i_y, Mres3, 1, n * n, 1); double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] }; eigenvalues_2x2(M, eig); return fabs(eig[0]); } // Finds corners in frame_to_filter // For less strict requirements (i.e. more corners), decrease threshold static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter, const YV12_BUFFER_CONFIG *ref_frame, const int maxcorners, int *ref_corners, const int bit_depth) { const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; // TODO(any): currently if maxcorners is decreased, then it only means // corners will be omited from bottom-right of image. if maxcorners // is actually used, then this algorithm would need to re-iterate // and choose threshold based on that assert(maxcorners == frame_height * frame_width); int countcorners = 0; const double threshold = 0.1; double score; const int n = 3; double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const int fromedge = n; double max_score = corner_score(frame_to_filter, ref_frame, fromedge, fromedge, i_x, i_y, i_t, n, bit_depth); // rough estimate of max corner score in image for (int x = fromedge; x < frame_width - fromedge; x += 1) { for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) { for (int i = 0; i < n * n; i++) { i_x[i] = 0; i_y[i] = 0; i_t[i] = 0; } score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n, bit_depth); if (score > max_score) { max_score = score; } } } // score all the points and choose corners over threshold for (int x = fromedge; x < frame_width - fromedge; x += 1) { for (int y = fromedge; (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) { for (int i = 0; i < n * n; i++) { i_x[i] = 0; i_y[i] = 0; i_t[i] = 0; } score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n, bit_depth); if (score > threshold * max_score) { ref_corners[countcorners * 2] = x; ref_corners[countcorners * 2 + 1] = y; countcorners++; } } } return countcorners; } // weights is an nxn matrix. weights is filled with a gaussian function, // with independent variable: distance from the center point. 
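// Each weight below is exp(-0.5 * (d / sigma)^2), where d is the Euclidean
// distance from the window centre (n/2, n/2). Note that the normalize branch
// only rescales the first n of the n * n entries; the only call in this file,
// from lucas_kanade(), passes normalize == 0, so that path appears unused.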
static void gaussian(const double sigma, const int n, const int normalize, double *weights) { double total_weight = 0; for (int j = 0; j < n; j++) { for (int i = 0; i < n; i++) { double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2)); double weight = exp(-0.5 * pow(distance / sigma, 2)); weights[j * n + i] = weight; total_weight += weight; } } if (normalize == 1) { for (int j = 0; j < n; j++) { weights[j] = weights[j] / total_weight; } } } static double convolve(const double *filter, const int *img, const int size) { double result = 0; for (int i = 0; i < size; i++) { result += filter[i] * img[i]; } return result; } // Applies a Gaussian low-pass smoothing filter to produce // a corresponding lower resolution image with halved dimensions static void reduce(uint8_t *img, int height, int width, int stride, uint8_t *reduced_img) { const int new_width = width / 2; const int window_size = 5; const double gaussian_filter[25] = { 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256 }; // filter is 5x5 so need prev and forward 2 pixels int img_section[25]; for (int y = 0; y < height - 1; y += 2) { for (int x = 0; x < width - 1; x += 2) { int i = 0; for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) { for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) { int yvalue = yy; int xvalue = xx; // copied pixels outside the boundary if (yvalue < 0) yvalue = 0; if (xvalue < 0) xvalue = 0; if (yvalue >= height) yvalue = height - 1; if (xvalue >= width) xvalue = width - 1; img_section[i++] = img[yvalue * stride + xvalue]; } } reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve( gaussian_filter, img_section, window_size * window_size); } } } static int cmpfunc(const void *a, const void *b) { return (*(int *)a - *(int *)b); } static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height, const int frame_width, LOCALMV *localmvs, MV *mvs) { const int n = 5; // window size // for smoothing filter const double gaussian_filter[25] = { 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. 
/ 256 }; // for median filter int mvrows[25]; int mvcols[25]; if (mv_filter != MV_FILTER_NONE) { for (int y = 0; y < frame_height; y++) { for (int x = 0; x < frame_width; x++) { int center_idx = y * frame_width + x; int i = 0; double filtered_row = 0; double filtered_col = 0; for (int yy = y - n / 2; yy <= y + n / 2; yy++) { for (int xx = x - n / 2; xx <= x + n / 2; xx++) { int yvalue = yy; int xvalue = xx; // copied pixels outside the boundary if (yvalue < 0) yvalue = 0; if (xvalue < 0) xvalue = 0; if (yvalue >= frame_height) yvalue = frame_height - 1; if (xvalue >= frame_width) xvalue = frame_width - 1; int index = yvalue * frame_width + xvalue; if (mv_filter == MV_FILTER_SMOOTH) { filtered_row += mvs[index].row * gaussian_filter[i]; filtered_col += mvs[index].col * gaussian_filter[i]; } else if (mv_filter == MV_FILTER_MEDIAN) { mvrows[i] = mvs[index].row; mvcols[i] = mvs[index].col; } i++; } } MV mv = mvs[center_idx]; if (mv_filter == MV_FILTER_SMOOTH) { mv.row = (int16_t)filtered_row; mv.col = (int16_t)filtered_col; } else if (mv_filter == MV_FILTER_MEDIAN) { qsort(mvrows, 25, sizeof(mv.row), cmpfunc); qsort(mvcols, 25, sizeof(mv.col), cmpfunc); mv.row = mvrows[25 / 2]; mv.col = mvcols[25 / 2]; } LOCALMV localmv = { .row = ((double)mv.row) / 8, .col = ((double)mv.row) / 8 }; localmvs[y * frame_width + x] = localmv; // if mvs array is immediately updated here, then the result may // propagate to other pixels. } } for (int i = 0; i < frame_height * frame_width; i++) { MV mv = { .row = (int16_t)round(8 * localmvs[i].row), .col = (int16_t)round(8 * localmvs[i].col) }; mvs[i] = mv; } } } // Computes optical flow at a single pyramid level, // using Lucas-Kanade algorithm. // Modifies mvs array. static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, const int level, const LK_PARAMS *lk_params, const int num_ref_corners, int *ref_corners, const int mv_stride, const int bit_depth, LOCALMV *mvs) { assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0); const int n = lk_params->window_size; // algorithm is sensitive to window size double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x)); double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y)); double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t)); double *weights = (double *)aom_malloc(n * n * sizeof(*weights)); if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf; const int expand_multiplier = (int)pow(2, level); double sigma = 0.2 * n; // normalizing doesn't really affect anything since it's applied // to every component of M and b gaussian(sigma, n, 0, weights); for (int i = 0; i < num_ref_corners; i++) { const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier; const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier; int highres_x = ref_corners[i * 2]; int highres_y = ref_corners[i * 2 + 1]; int mv_idx = highres_y * (mv_stride) + highres_x; LOCALMV mv_old = mvs[mv_idx]; mv_old.row = mv_old.row / expand_multiplier; mv_old.col = mv_old.col / expand_multiplier; // using this instead of memset, since it's not completely // clear if zero memset works on double arrays for (int j = 0; j < n * n; j++) { i_x[j] = 0; i_y[j] = 0; i_t[j] = 0; } gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth, i_x, i_y, i_t, &mv_old); double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 }; double bres1[1] = { 0 }, bres2[1] = { 0 }; for (int j = 0; j < n * n; j++) { Mres1[0] += weights[j] * i_x[j] * i_x[j]; Mres2[0] += weights[j] * i_x[j] * i_y[j]; 
Mres3[0] += weights[j] * i_y[j] * i_y[j]; bres1[0] += weights[j] * i_x[j] * i_t[j]; bres2[0] += weights[j] * i_y[j] * i_t[j]; } double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] }; double b[2] = { -1 * bres1[0], -1 * bres2[0] }; double eig[2] = { 1, 1 }; eigenvalues_2x2(M, eig); double threshold = 0.1; if (fabs(eig[0]) > threshold) { // if M is not invertible, then displacement // will default to zeros double u[2] = { 0, 0 }; linsolve(2, M, 2, b, u); int mult = 1; if (level != 0) mult = expand_multiplier; // mv doubles when resolution doubles LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)), .col = (mult * (u[1] + mv_old.col)) }; mvs[mv_idx] = mv; mvs[mv_idx] = mv; } } free_lk_buf: aom_free(weights); aom_free(i_t); aom_free(i_x); aom_free(i_y); } // Warp the src_frame to warper_frame according to mvs. // mvs point to src_frame static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame, const YV12_BUFFER_CONFIG *src_frame, const LOCALMV *mvs, int mv_stride) { int w, h; const int fw = src_frame->y_crop_width; const int fh = src_frame->y_crop_height; const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride; const uint8_t *src_buf = src_frame->y_buffer; uint8_t *warped_buf = warped_frame->y_buffer; double temp; for (h = 0; h < fh; h++) { for (w = 0; w < fw; w++) { double cord_x = (double)w + mvs[h * mv_stride + w].col; double cord_y = (double)h + mvs[h * mv_stride + w].row; cord_x = fclamp(cord_x, 0, (double)(fw - 1)); cord_y = fclamp(cord_y, 0, (double)(fh - 1)); const int floorx = (int)floor(cord_x); const int floory = (int)floor(cord_y); const double fracx = cord_x - (double)floorx; const double fracy = cord_y - (double)floory; temp = 0; for (int hh = 0; hh < 2; hh++) { const double weighth = hh ? (fracy) : (1 - fracy); for (int ww = 0; ww < 2; ww++) { const double weightw = ww ? (fracx) : (1 - fracx); int y = floory + hh; int x = floorx + ww; y = clamp(y, 0, fh - 1); x = clamp(x, 0, fw - 1); temp += (double)src_buf[y * src_fs + x] * weightw * weighth; } } warped_buf[h * warped_fs + w] = (uint8_t)round(temp); } } } // Same as warp_back_frame, but using a better interpolation filter. static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame, const YV12_BUFFER_CONFIG *src_frame, const LOCALMV *mvs, int mv_stride) { int w, h; const int fw = src_frame->y_crop_width; const int fh = src_frame->y_crop_height; const int warped_fs = warped_frame->y_stride; uint8_t *warped_buf = warped_frame->y_buffer; const int blk = 2; uint8_t temp_blk[4]; const int is_intrabc = 0; // Is intra-copied? 
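// Unlike warp_back_frame() above, which blends the four neighbouring pixels
// bilinearly, this variant runs the encoder's sub-pel inter predictor with
// the MULTITAP_SHARP2 filter on a 2x2 block anchored at each pixel and keeps
// only the top-left sample of the prediction.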
const int is_high_bitdepth = is_frame_high_bitdepth(src_frame); const int subsampling_x = 0, subsampling_y = 0; // for y-buffer const int_interpfilters interp_filters = av1_broadcast_interp_filter(MULTITAP_SHARP2); const int plane = 0; // y-plane const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer, src_frame->y_crop_width, src_frame->y_crop_height, src_frame->y_stride }; const int bit_depth = src_frame->bit_depth; struct scale_factors scale; av1_setup_scale_factors_for_frame( &scale, src_frame->y_crop_width, src_frame->y_crop_height, src_frame->y_crop_width, src_frame->y_crop_height); for (h = 0; h < fh; h++) { for (w = 0; w < fw; w++) { InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x, subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, &scale, &ref_buf2, interp_filters); inter_pred_params.interp_filter_params[0] = &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; inter_pred_params.interp_filter_params[1] = &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8), .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) }; av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv, &inter_pred_params); warped_buf[h * warped_fs + w] = temp_blk[0]; } } } #define DERIVATIVE_FILTER_LENGTH 7 double filter[DERIVATIVE_FILTER_LENGTH] = { -1.0 / 60, 9.0 / 60, -45.0 / 60, 0, 45.0 / 60, -9.0 / 60, 1.0 / 60 }; // Get gradient of the whole frame static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, double *ix, double *iy, double *it, int grad_stride) { int w, h, k, idx; const int fw = from_frame->y_crop_width; const int fh = from_frame->y_crop_height; const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride; const uint8_t *from_buf = from_frame->y_buffer; const uint8_t *to_buf = to_frame->y_buffer; const int lh = DERIVATIVE_FILTER_LENGTH; const int hleft = (lh - 1) / 2; for (h = 0; h < fh; h++) { for (w = 0; w < fw; w++) { // x ix[h * grad_stride + w] = 0; for (k = 0; k < lh; k++) { // if we want to make this block dependent, need to extend the // boundaries using other initializations. idx = w + k - hleft; idx = clamp(idx, 0, fw - 1); ix[h * grad_stride + w] += filter[k] * 0.5 * ((double)from_buf[h * from_fs + idx] + (double)to_buf[h * to_fs + idx]); } // y iy[h * grad_stride + w] = 0; for (k = 0; k < lh; k++) { // if we want to make this block dependent, need to extend the // boundaries using other initializations. 
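// Clamp the tap position to the frame so the 7-tap derivative filter reads
// valid rows near the top and bottom borders, mirroring the column clamping
// done for the horizontal gradient above.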
idx = h + k - hleft; idx = clamp(idx, 0, fh - 1); iy[h * grad_stride + w] += filter[k] * 0.5 * ((double)from_buf[idx * from_fs + w] + (double)to_buf[idx * to_fs + w]); } // t it[h * grad_stride + w] = (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w]; } } } // Solve for linear equations given by the H-S method static void solve_horn_schunck(const double *ix, const double *iy, const double *it, int grad_stride, int width, int height, const LOCALMV *init_mvs, int init_mv_stride, LOCALMV *mvs, int mv_stride) { // TODO(bohanli): May just need to allocate the buffers once per optical flow // calculation int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos)); int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos)); double *values = aom_calloc(width * height * 28, sizeof(*values)); double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec)); double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec)); double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b)); double *b = aom_calloc(width * height * 2, sizeof(*b)); if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b || !b) { goto free_hs_solver_buf; } // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 }; const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 }; int h, w, checkh, checkw, k, ret; const int offset = height * width; SPARSE_MTX A; int c = 0; const double lambda = 100; for (w = 0; w < width; w++) { for (h = 0; h < height; h++) { mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col; mv_init_vec[w * height + h + offset] = init_mvs[h * init_mv_stride + w].row; } } // get matrix A for (w = 0; w < width; w++) { for (h = 0; h < height; h++) { int center_num_direct = 4; const int center_idx = w * height + h; if (w == 0 || w == width - 1) center_num_direct--; if (h == 0 || h == height - 1) center_num_direct--; // diagonal entry for this row from the center pixel double cor_w = center_num_direct * center_num_direct + center_num_direct; row_pos[c] = center_idx; col_pos[c] = center_idx; values[c] = lambda * cor_w; c++; row_pos[c] = center_idx + offset; col_pos[c] = center_idx + offset; values[c] = lambda * cor_w; c++; // other entries from direct neighbors for (k = 0; k < 4; k++) { checkh = h + check_locs_y[k]; checkw = w + check_locs_x[k]; if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { continue; } int this_idx = checkw * height + checkh; int this_num_direct = 4; if (checkw == 0 || checkw == width - 1) this_num_direct--; if (checkh == 0 || checkh == height - 1) this_num_direct--; cor_w = -center_num_direct - this_num_direct; row_pos[c] = center_idx; col_pos[c] = this_idx; values[c] = lambda * cor_w; c++; row_pos[c] = center_idx + offset; col_pos[c] = this_idx + offset; values[c] = lambda * cor_w; c++; } // entries from neighbors on the diagonal corners for (k = 4; k < 8; k++) { checkh = h + check_locs_y[k]; checkw = w + check_locs_x[k]; if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { continue; } int this_idx = checkw * height + checkh; cor_w = 2; row_pos[c] = center_idx; col_pos[c] = this_idx; values[c] = lambda * cor_w; c++; row_pos[c] = center_idx + offset; col_pos[c] = this_idx + offset; values[c] = lambda * cor_w; c++; } // entries from neighbors with dist of 2 for (k = 8; k < 12; k++) { checkh = h + check_locs_y[k]; checkw = w + check_locs_x[k]; if (checkh < 0 || checkh >= height || checkw < 
0 || checkw >= width) { continue; } int this_idx = checkw * height + checkh; cor_w = 1; row_pos[c] = center_idx; col_pos[c] = this_idx; values[c] = lambda * cor_w; c++; row_pos[c] = center_idx + offset; col_pos[c] = this_idx + offset; values[c] = lambda * cor_w; c++; } } } ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, 2 * width * height, &A); if (ret < 0) goto free_hs_solver_buf; // subtract init mv part from b av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height); for (int i = 0; i < 2 * width * height; i++) { b[i] = -temp_b[i]; } av1_free_sparse_mtx_elems(&A); // add cross terms to A and modify b with ExEt / EyEt for (w = 0; w < width; w++) { for (h = 0; h < height; h++) { int curidx = w * height + h; // modify b b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w]; b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w]; // add cross terms to A row_pos[c] = curidx; col_pos[c] = curidx + offset; values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; c++; row_pos[c] = curidx + offset; col_pos[c] = curidx; values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; c++; } } // Add diagonal terms to A for (int i = 0; i < c; i++) { if (row_pos[i] == col_pos[i]) { if (row_pos[i] < offset) { w = row_pos[i] / height; h = row_pos[i] % height; values[i] += pow(ix[h * grad_stride + w], 2); } else { w = (row_pos[i] - offset) / height; h = (row_pos[i] - offset) % height; values[i] += pow(iy[h * grad_stride + w], 2); } } } ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, 2 * width * height, &A); if (ret < 0) goto free_hs_solver_buf; // solve for the mvs ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec); if (ret < 0) goto free_hs_solver_buf; // copy mvs for (w = 0; w < width; w++) { for (h = 0; h < height; h++) { mvs[h * mv_stride + w].col = mv_vec[w * height + h]; mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset]; } } free_hs_solver_buf: aom_free(row_pos); aom_free(col_pos); aom_free(values); aom_free(mv_vec); aom_free(mv_init_vec); aom_free(b); aom_free(temp_b); av1_free_sparse_mtx_elems(&A); } // Calculate optical flow from from_frame to to_frame using the H-S method. static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, const int level, const int mv_stride, const int mv_height, const int mv_width, const OPFL_PARAMS *opfl_params, LOCALMV *mvs) { // mvs are always on level 0, here we define two new mv arrays that is of size // of this level. 
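// At level 0 init_mvs aliases the caller's mvs array directly; at coarser
// levels it is a downscaled copy (by 'factor') using this level's width (fw)
// as stride. refine_mvs receives the Horn-Schunck increment computed at each
// warping step, and temp_frame holds the warped reference used for the
// gradient computation.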
const int fw = from_frame->y_crop_width; const int fh = from_frame->y_crop_height; const int factor = (int)pow(2, level); int w, h, k, init_mv_stride; LOCALMV *init_mvs = NULL, *refine_mvs = NULL; double *ix = NULL, *iy = NULL, *it = NULL; YV12_BUFFER_CONFIG temp_frame; temp_frame.y_buffer = NULL; if (level == 0) { init_mvs = mvs; init_mv_stride = mv_stride; } else { init_mvs = aom_calloc(fw * fh, sizeof(*mvs)); if (!init_mvs) goto free_hs_buf; init_mv_stride = fw; for (h = 0; h < fh; h++) { for (w = 0; w < fw; w++) { init_mvs[h * init_mv_stride + w].row = mvs[h * factor * mv_stride + w * factor].row / (double)factor; init_mvs[h * init_mv_stride + w].col = mvs[h * factor * mv_stride + w * factor].col / (double)factor; } } } refine_mvs = aom_calloc(fw * fh, sizeof(*mvs)); if (!refine_mvs) goto free_hs_buf; // temp frame for warping temp_frame.y_buffer = (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer)); if (!temp_frame.y_buffer) goto free_hs_buf; temp_frame.y_crop_height = fh; temp_frame.y_crop_width = fw; temp_frame.y_stride = fw; // gradient buffers ix = aom_calloc(fw * fh, sizeof(*ix)); iy = aom_calloc(fw * fh, sizeof(*iy)); it = aom_calloc(fw * fh, sizeof(*it)); if (!ix || !iy || !it) goto free_hs_buf; // For each warping step for (k = 0; k < opfl_params->warping_steps; k++) { // warp from_frame with init_mv if (level == 0) { warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride); } else { warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride); } // calculate frame gradients get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw); // form linear equations and solve mvs solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride, refine_mvs, fw); // update init_mvs for (h = 0; h < fh; h++) { for (w = 0; w < fw; w++) { init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col; init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row; } } } // copy back the mvs if needed if (level != 0) { for (h = 0; h < mv_height; h++) { for (w = 0; w < mv_width; w++) { mvs[h * mv_stride + w].row = init_mvs[h / factor * init_mv_stride + w / factor].row * (double)factor; mvs[h * mv_stride + w].col = init_mvs[h / factor * init_mv_stride + w / factor].col * (double)factor; } } } free_hs_buf: if (level != 0) aom_free(init_mvs); aom_free(refine_mvs); aom_free(temp_frame.y_buffer); aom_free(ix); aom_free(iy); aom_free(it); } // Apply optical flow iteratively at each pyramid level static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, const int bit_depth, const OPFL_PARAMS *opfl_params, const OPTFLOW_METHOD method, LOCALMV *mvs) { assert(opfl_params->pyramid_levels > 0 && opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS); int levels = opfl_params->pyramid_levels; const int frame_height = from_frame->y_crop_height; const int frame_width = from_frame->y_crop_width; if ((frame_height / pow(2.0, levels - 1) < 50 || frame_height / pow(2.0, levels - 1) < 50) && levels > 1) levels = levels - 1; uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL }; uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL }; int *ref_corners = NULL; images1[0] = from_frame->y_buffer; images2[0] = to_frame->y_buffer; YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1)); YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2)); if (!buffers1 || !buffers2) goto free_pyramid_buf; buffers1[0] = *from_frame; buffers2[0] = *to_frame; int fw = frame_width; int fh = frame_height; for (int i = 1; i < levels; i++) { 
// TODO(bohanli): may need to extend buffers for better interpolation SIMD images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i])); images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i])); if (!images1[i] || !images2[i]) goto free_pyramid_buf; int stride; if (i == 1) stride = from_frame->y_stride; else stride = fw; reduce(images1[i - 1], fh, fw, stride, images1[i]); reduce(images2[i - 1], fh, fw, stride, images2[i]); fh /= 2; fw /= 2; YV12_BUFFER_CONFIG a = { .y_buffer = images1[i], .y_crop_width = fw, .y_crop_height = fh, .y_stride = fw }; YV12_BUFFER_CONFIG b = { .y_buffer = images2[i], .y_crop_width = fw, .y_crop_height = fh, .y_stride = fw }; buffers1[i] = a; buffers2[i] = b; } // Compute corners for specific frame int num_ref_corners = 0; if (is_sparse(opfl_params)) { int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height; ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners)); if (!ref_corners) goto free_pyramid_buf; num_ref_corners = detect_corners(from_frame, to_frame, maxcorners, ref_corners, bit_depth); } const int stop_level = 0; for (int i = levels - 1; i >= stop_level; i--) { if (method == LUCAS_KANADE) { assert(is_sparse(opfl_params)); lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params, num_ref_corners, ref_corners, buffers1[0].y_crop_width, bit_depth, mvs); } else if (method == HORN_SCHUNCK) { assert(!is_sparse(opfl_params)); horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width, buffers1[0].y_crop_height, buffers1[0].y_crop_width, opfl_params, mvs); } } free_pyramid_buf: for (int i = 1; i < levels; i++) { aom_free(images1[i]); aom_free(images2[i]); } aom_free(ref_corners); aom_free(buffers1); aom_free(buffers2); } // Computes optical flow by applying algorithm at // multiple pyramid levels of images (lower-resolution, smoothed images) // This accounts for larger motions. // Inputs: // from_frame Frame buffer. // to_frame: Frame buffer. MVs point from_frame -> to_frame. // from_frame_idx: Index of from_frame. // to_frame_idx: Index of to_frame. Return all zero MVs when idx are equal. // bit_depth: // opfl_params: contains algorithm-specific parameters. // mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN. // method: LUCAS_KANADE, HORN_SCHUNCK // mvs: pointer to MVs. Contains initialization, and modified // based on optical flow. 
Must have // dimensions = from_frame->y_crop_width * from_frame->y_crop_height void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, const int from_frame_idx, const int to_frame_idx, const int bit_depth, const OPFL_PARAMS *opfl_params, const MV_FILTER_TYPE mv_filter, const OPTFLOW_METHOD method, MV *mvs) { const int frame_height = from_frame->y_crop_height; const int frame_width = from_frame->y_crop_width; // TODO(any): deal with the case where frames are not of the same dimensions assert(frame_height == to_frame->y_crop_height && frame_width == to_frame->y_crop_width); if (from_frame_idx == to_frame_idx) { // immediately return all zero mvs when frame indices are equal for (int yy = 0; yy < frame_height; yy++) { for (int xx = 0; xx < frame_width; xx++) { MV mv = { .row = 0, .col = 0 }; mvs[yy * frame_width + xx] = mv; } } return; } // Initialize double mvs based on input parameter mvs array LOCALMV *localmvs = aom_malloc(frame_height * frame_width * sizeof(*localmvs)); if (!localmvs) return; filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs); for (int i = 0; i < frame_width * frame_height; i++) { MV mv = mvs[i]; LOCALMV localmv = { .row = ((double)mv.row) / 8, .col = ((double)mv.col) / 8 }; localmvs[i] = localmv; } // Apply optical flow algorithm pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method, localmvs); // Update original mvs array for (int j = 0; j < frame_height; j++) { for (int i = 0; i < frame_width; i++) { int idx = j * frame_width + i; if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height || i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) { continue; } MV mv = { .row = (int16_t)round(8 * localmvs[idx].row), .col = (int16_t)round(8 * localmvs[idx].col) }; mvs[idx] = mv; } } filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs); aom_free(localmvs); } #endif aom-3.12.1/av1/encoder/optical_flow.h000066400000000000000000000040241477627663500173420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_ #define AOM_AV1_ENCODER_OPTICAL_FLOW_H_ #include "aom_scale/yv12config.h" #include "av1/common/mv.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif #if CONFIG_OPTICAL_FLOW_API typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD; typedef enum { MV_FILTER_NONE, MV_FILTER_SMOOTH, MV_FILTER_MEDIAN } MV_FILTER_TYPE; typedef struct LOCALMV { double row; double col; } LOCALMV; #define MAX_PYRAMID_LEVELS 5 // default options for optical flow #define OPFL_WINDOW_SIZE 15 #define OPFL_PYRAMID_LEVELS 3 // total levels #define OPFL_WARPING_STEPS 3 // parameters specific to Lucas-Kanade typedef struct lk_params { int window_size; } LK_PARAMS; // generic structure to contain parameters for all // optical flow algorithms typedef struct opfl_params { int pyramid_levels; int warping_steps; LK_PARAMS *lk_params; int flags; } OPFL_PARAMS; #define OPFL_FLAG_SPARSE 1 void av1_init_opfl_params(OPFL_PARAMS *opfl_params); void av1_init_lk_params(LK_PARAMS *lk_params); void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame, const YV12_BUFFER_CONFIG *to_frame, const int from_frame_idx, const int to_frame_idx, const int bit_depth, const OPFL_PARAMS *opfl_params, const MV_FILTER_TYPE mv_filter, const OPTFLOW_METHOD method, MV *mvs); #endif #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_ aom-3.12.1/av1/encoder/palette.c000066400000000000000000001244651477627663500163250ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "av1/common/pred_common.h" #include "av1/encoder/block.h" #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" #include "av1/encoder/intra_mode_search.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/palette.h" #include "av1/encoder/random.h" #include "av1/encoder/rdopt_utils.h" #include "av1/encoder/tx_search.h" #define AV1_K_MEANS_DIM 1 #include "av1/encoder/k_means_template.h" #undef AV1_K_MEANS_DIM #define AV1_K_MEANS_DIM 2 #include "av1/encoder/k_means_template.h" #undef AV1_K_MEANS_DIM static int int16_comparer(const void *a, const void *b) { return (*(int16_t *)a - *(int16_t *)b); } /*!\brief Removes duplicated centroid indices. * * \ingroup palette_mode_search * \param[in] centroids A list of centroids index. * \param[in] num_centroids Number of centroids. * * \return Returns the number of unique centroids and saves the unique centroids * in beginning of the centroids array. * * \attention The centroids should be rounded to integers before calling this * method. */ static int remove_duplicates(int16_t *centroids, int num_centroids) { int num_unique; // number of unique centroids int i; qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer); // Remove duplicates. 
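// For example, a sorted input of { 3, 3, 7, 7, 9 } is compacted in place to
// { 3, 7, 9, ... } and 3 is returned; only the first num_unique entries of
// centroids[] are meaningful afterwards.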
num_unique = 1; for (i = 1; i < num_centroids; ++i) { if (centroids[i] != centroids[i - 1]) { // found a new unique centroid centroids[num_unique++] = centroids[i]; } } return num_unique; } static int delta_encode_cost(const int *colors, int num, int bit_depth, int min_val) { if (num <= 0) return 0; int bits_cost = bit_depth; if (num == 1) return bits_cost; bits_cost += 2; int max_delta = 0; int deltas[PALETTE_MAX_SIZE]; const int min_bits = bit_depth - 3; for (int i = 1; i < num; ++i) { const int delta = colors[i] - colors[i - 1]; deltas[i - 1] = delta; assert(delta >= min_val); if (delta > max_delta) max_delta = delta; } int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); assert(bits_per_delta <= bit_depth); int range = (1 << bit_depth) - colors[0] - min_val; for (int i = 0; i < num - 1; ++i) { bits_cost += bits_per_delta; range -= deltas[i]; bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); } return bits_cost; } int av1_index_color_cache(const uint16_t *color_cache, int n_cache, const uint16_t *colors, int n_colors, uint8_t *cache_color_found, int *out_cache_colors) { if (n_cache <= 0) { for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; return n_colors; } memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); int n_in_cache = 0; int in_cache_flags[PALETTE_MAX_SIZE]; memset(in_cache_flags, 0, sizeof(in_cache_flags)); for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { for (int j = 0; j < n_colors; ++j) { if (colors[j] == color_cache[i]) { in_cache_flags[j] = 1; cache_color_found[i] = 1; ++n_in_cache; break; } } } int j = 0; for (int i = 0; i < n_colors; ++i) if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; assert(j == n_colors - n_in_cache); return j; } int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, int bit_depth, int *zero_count, int *min_bits) { const int n = pmi->palette_size[1]; const int max_val = 1 << bit_depth; int max_d = 0; *min_bits = bit_depth - 4; *zero_count = 0; for (int i = 1; i < n; ++i) { const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; const int v = abs(delta); const int d = AOMMIN(v, max_val - v); if (d > max_d) max_d = d; if (d == 0) ++(*zero_count); } return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); } int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, const uint16_t *color_cache, int n_cache, int bit_depth) { const int n = pmi->palette_size[0]; int out_cache_colors[PALETTE_MAX_SIZE]; uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; const int n_out_cache = av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, cache_color_found, out_cache_colors); const int total_bits = n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); return av1_cost_literal(total_bits); } int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, const uint16_t *color_cache, int n_cache, int bit_depth) { const int n = pmi->palette_size[1]; int total_bits = 0; // U channel palette color cost. int out_cache_colors[PALETTE_MAX_SIZE]; uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; const int n_out_cache = av1_index_color_cache( color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, cache_color_found, out_cache_colors); total_bits += n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); // V channel palette color cost. 
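// The V palette can be signalled either raw (bit_depth bits per color) or
// delta coded over the circular range [0, 1 << bit_depth): the constant 2
// below accounts for signalling the delta precision, bit_depth bits code the
// first color, and each of the remaining n - 1 colors costs a bits_v-bit
// magnitude plus a sign bit, with the sign bit skipped for zero deltas
// (hence the zero_count term). One extra bit selects between the two methods.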
int zero_count = 0, min_bits_v = 0; const int bits_v = av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); const int bits_using_delta = 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; const int bits_using_raw = bit_depth * n; total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); return av1_cost_literal(total_bits); } // Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x // new_height'. Extra rows and columns are filled in by copying last valid // row/column. static inline void extend_palette_color_map(uint8_t *const color_map, int orig_width, int orig_height, int new_width, int new_height) { int j; assert(new_width >= orig_width); assert(new_height >= orig_height); if (new_width == orig_width && new_height == orig_height) return; for (j = orig_height - 1; j >= 0; --j) { memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); // Copy last column to extra columns. memset(color_map + j * new_width + orig_width, color_map[j * new_width + orig_width - 1], new_width - orig_width); } // Copy last row to extra rows. for (j = orig_height; j < new_height; ++j) { memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, new_width); } } // Bias toward using colors in the cache. // TODO(huisu): Try other schemes to improve compression. static inline void optimize_palette_colors(uint16_t *color_cache, int n_cache, int n_colors, int stride, int16_t *centroids, int bit_depth) { if (n_cache <= 0) return; for (int i = 0; i < n_colors * stride; i += stride) { int min_diff = abs((int)centroids[i] - (int)color_cache[0]); int idx = 0; for (int j = 1; j < n_cache; ++j) { const int this_diff = abs((int)centroids[i] - (int)color_cache[j]); if (this_diff < min_diff) { min_diff = this_diff; idx = j; } } const int min_threshold = 4 << (bit_depth - 8); if (min_diff <= min_threshold) centroids[i] = color_cache[idx]; } } /*!\brief Calculate the luma palette cost from a given color palette * * \ingroup palette_mode_search * \callergraph * Given the base colors as specified in centroids[], calculate the RD cost * of palette mode. */ static inline void palette_rd_y( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids, int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, uint8_t *tx_type_map, int *beat_best_palette_rd, bool *do_header_rd_based_breakout, int discount_color_cost) { if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false; optimize_palette_colors(color_cache, n_cache, n, 1, centroids, cpi->common.seq_params->bit_depth); const int num_unique_colors = remove_duplicates(centroids, n); if (num_unique_colors < PALETTE_MIN_SIZE) { // Too few unique colors to create a palette. And DC_PRED will work // well for that case anyway. So skip. 
return; } PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; if (cpi->common.seq_params->use_highbitdepth) { for (int i = 0; i < num_unique_colors; ++i) { pmi->palette_colors[i] = clip_pixel_highbd( (int)centroids[i], cpi->common.seq_params->bit_depth); } } else { for (int i = 0; i < num_unique_colors; ++i) { pmi->palette_colors[i] = clip_pixel(centroids[i]); } } pmi->palette_size[0] = num_unique_colors; MACROBLOCKD *const xd = &x->e_mbd; uint8_t *const color_map = xd->plane[0].color_index_map; int block_width, block_height, rows, cols; av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors, 1); extend_palette_color_map(color_map, cols, rows, block_width, block_height); RD_STATS tokenonly_rd_stats; int this_rate; if (do_header_rd_based_gating) { assert(do_header_rd_based_breakout != NULL); const int palette_mode_rate = intra_mode_info_cost_y( cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost); const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0); // Less aggressive pruning when prune_luma_palette_size_search_level == 1. const int header_rd_shift = (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0; // Terminate further palette_size search, if the header cost corresponding // to lower palette_size is more than *best_rd << header_rd_shift. This // logic is implemented with a right shift in the LHS to prevent a possible // overflow with the left shift in RHS. if ((header_rd >> header_rd_shift) > *best_rd) { *do_header_rd_based_breakout = true; return; } av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) return; this_rate = tokenonly_rd_stats.rate + palette_mode_rate; } else { av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) return; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost); } int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size); } // Collect mode stats for multiwinner mode processing const int txfm_search_done = 1; store_winner_mode_stats( &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize, this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); if (this_rd < *best_rd) { *best_rd = this_rd; // Setting beat_best_rd flag because current mode rd is better than best_rd. // This flag need to be updated only for palette evaluation in key frames if (beat_best_rd) *beat_best_rd = 1; memcpy(best_palette_color_map, color_map, block_width * block_height * sizeof(color_map[0])); *best_mbmi = *mbmi; memcpy(blk_skip, x->txfm_search_info.blk_skip, sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); if (rate) *rate = this_rate; if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; if (distortion) *distortion = tokenonly_rd_stats.dist; if (skippable) *skippable = tokenonly_rd_stats.skip_txfm; if (beat_best_palette_rd) *beat_best_palette_rd = 1; } } static inline int is_iter_over(int curr_idx, int end_idx, int step_size) { assert(step_size != 0); return (step_size > 0) ? 
curr_idx >= end_idx : curr_idx <= end_idx; } // Performs count-based palette search with number of colors in interval // [start_n, end_n) with step size step_size. If step_size < 0, then end_n can // be less than start_n. Saves the last numbers searched in last_n_searched and // returns the best number of colors found. static inline int perform_top_color_palette_search( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *top_colors, int start_n, int end_n, int step_size, bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) { int16_t centroids[PALETTE_MAX_SIZE]; int n = start_n; int top_color_winner = end_n; /* clang-format off */ assert(IMPLIES(step_size < 0, start_n > end_n)); /* clang-format on */ assert(IMPLIES(step_size > 0, start_n < end_n)); while (!is_iter_over(n, end_n, step_size)) { int beat_best_palette_rd = 0; bool do_header_rd_based_breakout = false; memcpy(centroids, top_colors, n * sizeof(top_colors[0])); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, do_header_rd_based_gating, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd, &do_header_rd_based_breakout, discount_color_cost); *last_n_searched = n; if (do_header_rd_based_breakout) { // Terminate palette_size search by setting last_n_searched to end_n. *last_n_searched = end_n; break; } if (beat_best_palette_rd) { top_color_winner = n; } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) { // At search level 2, we return immediately if we don't see an improvement return top_color_winner; } n += step_size; } return top_color_winner; } // Performs k-means based palette search with number of colors in interval // [start_n, end_n) with step size step_size. If step_size < 0, then end_n can // be less than start_n. Saves the last numbers searched in last_n_searched and // returns the best number of colors found. 
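// For each candidate size n the centroids are seeded at the midpoints of n
// equal sub-intervals of [lower_bound, upper_bound] before running
// av1_k_means; for example, with lower_bound = 0, upper_bound = 240 and
// n = 4 the initial centroids are 30, 90, 150 and 210.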
static inline int perform_k_means_palette_search( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound, int upper_bound, int start_n, int end_n, int step_size, bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map, int data_points, int discount_color_cost) { int16_t centroids[PALETTE_MAX_SIZE]; const int max_itr = 50; int n = start_n; int top_color_winner = end_n; /* clang-format off */ assert(IMPLIES(step_size < 0, start_n > end_n)); /* clang-format on */ assert(IMPLIES(step_size > 0, start_n < end_n)); while (!is_iter_over(n, end_n, step_size)) { int beat_best_palette_rd = 0; bool do_header_rd_based_breakout = false; for (int i = 0; i < n; ++i) { centroids[i] = lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2; } av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, do_header_rd_based_gating, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd, &do_header_rd_based_breakout, discount_color_cost); *last_n_searched = n; if (do_header_rd_based_breakout) { // Terminate palette_size search by setting last_n_searched to end_n. *last_n_searched = end_n; break; } if (beat_best_palette_rd) { top_color_winner = n; } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) { // At search level 2, we return immediately if we don't see an improvement return top_color_winner; } n += step_size; } return top_color_winner; } // Sets the parameters to search the current number of colors +- 1 static inline void set_stage2_params(int *min_n, int *max_n, int *step_size, int winner, int end_n) { // Set min to winner - 1 unless we are already at the border, then we set it // to winner + 1 *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1) : AOMMAX(winner - 1, PALETTE_MIN_SIZE); // Set max to winner + 1 unless we are already at the border, then we set it // to winner - 1 *max_n = (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE); // Set the step size to max_n - min_n so we only search those two values. // If max_n == min_n, then set step_size to 1 to avoid infinite loop later. *step_size = AOMMAX(1, *max_n - *min_n); } static inline void fill_data_and_get_bounds(const uint8_t *src, const int src_stride, const int rows, const int cols, const int is_high_bitdepth, int16_t *data, int *lower_bound, int *upper_bound) { if (is_high_bitdepth) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); *lower_bound = *upper_bound = src_ptr[0]; for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { const int val = src_ptr[c]; data[c] = (int16_t)val; *lower_bound = AOMMIN(*lower_bound, val); *upper_bound = AOMMAX(*upper_bound, val); } src_ptr += src_stride; data += cols; } return; } // low bit depth *lower_bound = *upper_bound = src[0]; for (int r = 0; r < rows; ++r) { for (int c = 0; c < cols; ++c) { const int val = src[c]; data[c] = (int16_t)val; *lower_bound = AOMMIN(*lower_bound, val); *upper_bound = AOMMAX(*upper_bound, val); } src += src_stride; data += cols; } } /*! 
\brief Colors are sorted by their count: the higher the better. */ struct ColorCount { //! Color index in the histogram. int index; //! Histogram count. int count; }; static int color_count_comp(const void *c1, const void *c2) { const struct ColorCount *color_count1 = (const struct ColorCount *)c1; const struct ColorCount *color_count2 = (const struct ColorCount *)c2; if (color_count1->count > color_count2->count) return -1; if (color_count1->count < color_count2->count) return 1; if (color_count1->index < color_count2->index) return -1; return 1; } static void find_top_colors(const int *const count_buf, int bit_depth, int n_colors, int16_t *top_colors) { // Top color array, serving as a priority queue if more than n_colors are // found. struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } }; int n_color_count = 0; for (int i = 0; i < (1 << bit_depth); ++i) { if (count_buf[i] > 0) { if (n_color_count < n_colors) { // Keep adding to the top colors. top_color_counts[n_color_count].index = i; top_color_counts[n_color_count].count = count_buf[i]; ++n_color_count; if (n_color_count == n_colors) { qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]), color_count_comp); } } else { // Check the worst in the sorted top. if (count_buf[i] > top_color_counts[n_colors - 1].count) { int j = n_colors - 1; // Move up to the best one. while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j; memmove(top_color_counts + j + 1, top_color_counts + j, (n_colors - j - 1) * sizeof(top_color_counts[0])); top_color_counts[j].index = i; top_color_counts[j].count = count_buf[i]; } } } } assert(n_color_count == n_colors); for (int i = 0; i < n_colors; ++i) { top_colors[i] = top_color_counts[i].index; } } void av1_rd_pick_palette_intra_sby( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, bsize)); assert(PALETTE_MAX_SIZE == 8); assert(PALETTE_MIN_SIZE == 2); const int src_stride = x->plane[0].src.stride; const uint8_t *const src = x->plane[0].src.buf; int block_width, block_height, rows, cols; av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); const SequenceHeader *const seq_params = cpi->common.seq_params; const int is_hbd = seq_params->use_highbitdepth; const int bit_depth = seq_params->bit_depth; const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode; int unused; int count_buf[1 << 12]; // Maximum (1 << 12) color levels. int colors, colors_threshold = 0; if (is_hbd) { int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf, count_buf_8bit, &colors_threshold, &colors); } else { av1_count_colors(src, src_stride, rows, cols, count_buf, &colors); colors_threshold = colors; } uint8_t *const color_map = xd->plane[0].color_index_map; int color_thresh_palette = x->color_palette_thresh; // Allow for larger color_threshold for palette search, based on color, // scene_change, and block source variance. // Since palette is Y based, only allow larger threshold if block // color_dist is below threshold. 
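// Concretely: the +20 adjustment below is applied only in the non-RD pick
// path when the corresponding speed feature is on, a high source SAD (scene
// change) was detected, the block source variance exceeds 50, and the best
// inter UV distortion, normalized by the number of 4x4 units in the block
// (and halved when both chroma color-sensitivity flags are set), stays below
// 8000; if neither chroma plane is flagged, the distortion term is treated as
// zero and the threshold is still raised.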
if (cpi->sf.rt_sf.use_nonrd_pick_mode && cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad && x->source_variance > 50) { int64_t norm_color_dist = 0; if (x->color_sensitivity[0] || x->color_sensitivity[1]) { norm_color_dist = x->min_dist_inter_uv >> (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); if (x->color_sensitivity[0] && x->color_sensitivity[1]) norm_color_dist = norm_color_dist >> 1; } if (norm_color_dist < 8000) color_thresh_palette += 20; } if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) { int16_t *const data = x->palette_buffer->kmeans_data_buf; int16_t centroids[PALETTE_MAX_SIZE]; int lower_bound, upper_bound; fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data, &lower_bound, &upper_bound); mbmi->mode = DC_PRED; mbmi->filter_intra_mode_info.use_filter_intra = 0; uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 0, color_cache); // Find the dominant colors, stored in top_colors[]. int16_t top_colors[PALETTE_MAX_SIZE] = { 0 }; find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE), top_colors); // The following are the approaches used for header rdcost based gating // for early termination for different values of prune_palette_search_level. // 0: Pruning based on header rdcost for ascending order palette_size // search. // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size // search and for finer search do_header_rd_based_gating parameter is // explicitly passed as 'false'. // 2: Enabled only for ascending order palette_size search and for // descending order search do_header_rd_based_gating parameter is explicitly // passed as 'false'. const bool do_header_rd_based_gating = cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0; // TODO(huisu@google.com): Try to avoid duplicate computation in cases // where the dominant colors and the k-means results are similar. if ((cpi->sf.intra_sf.prune_palette_search_level == 1) && (colors > PALETTE_MIN_SIZE)) { // Start index and step size below are chosen to evaluate unique // candidates in neighbor search, in case a winner candidate is found in // coarse search. Example, // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8. // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2 // (3) and 8 (7). // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same // as for 8 colors) then step size should also be 2, to cover all // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3, // coarse search will evaluate 3 and 6. For the winner, unique neighbors // (3: 2,4 or 6: 5,7) would be evaluated. 
// Start index for coarse palette search for dominant colors and k-means const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, 3, 3, 2, 3, 3, 2 }; // Step size for coarse palette search for dominant colors and k-means const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, 3, 3, 3, 3, 3, 3 }; // Choose the start index and step size for coarse search based on number // of colors const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE); const int min_n = start_n_lookup_table[max_n]; const int step_size = step_size_lookup_table[max_n]; assert(min_n >= PALETTE_MIN_SIZE); // Perform top color coarse palette search to find the winner candidate const int top_color_winner = perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, discount_color_cost); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for dominant colors if (top_color_winner <= max_n) { int stage2_min_n, stage2_max_n, stage2_step_size; set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size, top_color_winner, max_n); // perform finer search for the winner candidate perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n, stage2_max_n + 1, stage2_step_size, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, discount_color_cost); } // K-means clustering. // Perform k-means coarse palette search to find the winner candidate const int k_means_winner = perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, rows * cols, discount_color_cost); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for k-means if (k_means_winner <= max_n) { int start_n_stage2, end_n_stage2, step_size_stage2; set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2, k_means_winner, max_n); // perform finer search for the winner candidate perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, start_n_stage2, end_n_stage2 + 1, step_size_stage2, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, rows * cols, discount_color_cost); } } else { const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE), min_n = PALETTE_MIN_SIZE; // Perform top color palette search in ascending order int last_n_searched = min_n; perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, discount_color_cost); if (last_n_searched < max_n) { // Search in descending order until we get to the previous 
best perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, discount_color_cost); } // K-means clustering. if (colors == PALETTE_MIN_SIZE) { // Special case: These colors automatically become the centroids. assert(colors == 2); centroids[0] = lower_bound; centroids[1] = upper_bound; palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, color_cache, n_cache, /*do_header_rd_based_gating=*/false, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, NULL, NULL, discount_color_cost); } else { // Perform k-means palette search in ascending order last_n_searched = min_n; perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, rows * cols, discount_color_cost); if (last_n_searched < max_n) { // Search in descending order until we get to the previous best perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, rows * cols, discount_color_cost); } } } } if (best_mbmi->palette_mode_info.palette_size[0] > 0) { memcpy(color_map, best_palette_color_map, block_width * block_height * sizeof(best_palette_color_map[0])); // Gather the stats to determine whether to use screen content tools in // function av1_determine_sc_tools_with_encoding(). x->palette_pixels += (block_width * block_height); } *mbmi = *best_mbmi; } void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, int dc_mode_cost, uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, mbmi->bsize)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->bsize; const SequenceHeader *const seq_params = cpi->common.seq_params; int this_rate; int64_t this_rd; int colors_u, colors_v; int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0; const int src_stride = x->plane[1].src.stride; const uint8_t *const src_u = x->plane[1].src.buf; const uint8_t *const src_v = x->plane[2].src.buf; uint8_t *const color_map = xd->plane[1].color_index_map; RD_STATS tokenonly_rd_stats; int plane_block_width, plane_block_height, rows, cols; av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, &plane_block_height, &rows, &cols); mbmi->uv_mode = UV_DC_PRED; if (seq_params->use_highbitdepth) { int count_buf[1 << 12]; // Maximum (1 << 12) color levels. int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. 
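// The U and V planes are counted independently using shared scratch
// histograms; the chroma palette search below is attempted only when the
// larger of the two color-threshold counts lies in (1, 64].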
av1_count_colors_highbd(src_u, src_stride, rows, cols, seq_params->bit_depth, count_buf, count_buf_8bit, &colors_threshold_u, &colors_u); av1_count_colors_highbd(src_v, src_stride, rows, cols, seq_params->bit_depth, count_buf, count_buf_8bit, &colors_threshold_v, &colors_v); } else { int count_buf[1 << 8]; av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u); av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v); colors_threshold_u = colors_u; colors_threshold_v = colors_v; } uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); colors_threshold = colors_threshold_u > colors_threshold_v ? colors_threshold_u : colors_threshold_v; if (colors_threshold > 1 && colors_threshold <= 64) { int r, c, n, i, j; const int max_itr = 50; int lb_u, ub_u, val_u; int lb_v, ub_v, val_v; int16_t *const data = x->palette_buffer->kmeans_data_buf; int16_t centroids[2 * PALETTE_MAX_SIZE]; uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); if (seq_params->use_highbitdepth) { lb_u = src_u16[0]; ub_u = src_u16[0]; lb_v = src_v16[0]; ub_v = src_v16[0]; } else { lb_u = src_u[0]; ub_u = src_u[0]; lb_v = src_v[0]; ub_v = src_v[0]; } for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { if (seq_params->use_highbitdepth) { val_u = src_u16[r * src_stride + c]; val_v = src_v16[r * src_stride + c]; data[(r * cols + c) * 2] = val_u; data[(r * cols + c) * 2 + 1] = val_v; } else { val_u = src_u[r * src_stride + c]; val_v = src_v[r * src_stride + c]; data[(r * cols + c) * 2] = val_u; data[(r * cols + c) * 2 + 1] = val_v; } if (val_u < lb_u) lb_u = val_u; else if (val_u > ub_u) ub_u = val_u; if (val_v < lb_v) lb_v = val_v; else if (val_v > ub_v) ub_v = val_v; } } const int colors = colors_u > colors_v ? colors_u : colors_v; const int max_colors = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) { for (i = 0; i < n; ++i) { centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; } av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); optimize_palette_colors(color_cache, n_cache, n, 2, centroids, cpi->common.seq_params->bit_depth); // Sort the U channel colors in ascending order. 
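// This is a selection sort over the (U, V) centroid pairs, keyed on the U
// component so each V color stays attached to its U partner; with at most
// PALETTE_MAX_SIZE pairs the quadratic cost is negligible.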
for (i = 0; i < 2 * (n - 1); i += 2) { int min_idx = i; int min_val = centroids[i]; for (j = i + 2; j < 2 * n; j += 2) if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; if (min_idx != i) { int temp_u = centroids[i], temp_v = centroids[i + 1]; centroids[i] = centroids[min_idx]; centroids[i + 1] = centroids[min_idx + 1]; centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; } } av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); extend_palette_color_map(color_map, cols, rows, plane_block_width, plane_block_height); pmi->palette_size[1] = n; for (i = 1; i < 3; ++i) { for (j = 0; j < n; ++j) { if (seq_params->use_highbitdepth) pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( (int)centroids[j * 2 + i - 1], seq_params->bit_depth); else pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel((int)centroids[j * 2 + i - 1]); } } if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) { const int palette_mode_rate = intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0); // Terminate further palette_size search, if header cost corresponding // to lower palette_size is more than the best_rd. if (header_rd >= *best_rd) break; av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + palette_mode_rate; } else { av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); if (tokenonly_rd_stats.rate == INT_MAX) continue; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); } this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < *best_rd) { *best_rd = this_rd; *best_mbmi = *mbmi; memcpy(best_palette_color_map, color_map, plane_block_width * plane_block_height * sizeof(best_palette_color_map[0])); *rate = this_rate; *distortion = tokenonly_rd_stats.dist; *rate_tokenonly = tokenonly_rd_stats.rate; *skippable = tokenonly_rd_stats.skip_txfm; } } } if (best_mbmi->palette_mode_info.palette_size[1] > 0) { memcpy(color_map, best_palette_color_map, plane_block_width * plane_block_height * sizeof(best_palette_color_map[0])); } } void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->bsize; int src_stride = x->plane[1].src.stride; const uint8_t *const src_u = x->plane[1].src.buf; const uint8_t *const src_v = x->plane[2].src.buf; int16_t *const data = x->palette_buffer->kmeans_data_buf; int16_t centroids[2 * PALETTE_MAX_SIZE]; uint8_t *const color_map = xd->plane[1].color_index_map; int r, c; const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); int plane_block_width, plane_block_height, rows, cols; av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, &plane_block_height, &rows, &cols); for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { if (cpi->common.seq_params->use_highbitdepth) { data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; } else { data[(r * cols + c) * 2] = src_u[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c]; } } } for (r = 1; r < 3; ++r) { for (c = 0; c < pmi->palette_size[1]; ++c) { centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c]; } } 
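// centroids[] now holds the stored palette interleaved as (U, V) pairs in the
// same layout as data[] above, so av1_calc_indices() can rebuild the chroma
// color index map directly without re-running k-means.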
av1_calc_indices(data, centroids, color_map, rows * cols, pmi->palette_size[1], 2); extend_palette_color_map(color_map, cols, rows, plane_block_width, plane_block_height); } aom-3.12.1/av1/encoder/palette.h000066400000000000000000000210221477627663500163130ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief Declares functions used in palette search. */ #ifndef AOM_AV1_ENCODER_PALETTE_H_ #define AOM_AV1_ENCODER_PALETTE_H_ #include "av1/common/blockd.h" #ifdef __cplusplus extern "C" { #endif struct AV1_COMP; struct PICK_MODE_CONTEXT; struct macroblock; /*!\cond */ #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, int max_itr); void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, int max_itr); /*!\endcond */ /*!\brief Calculates the cluster to which each data point belong. * * \ingroup palette_mode_search * \param[in] data The data points whose cluster indices are * to be computed. The data layout is * NUM_DATA_POINTS X DATA_DIM. * \param[in] centroids Pointer to the centroids. The data layout * is NUM_CENTROIDS X DATA_DIM. * \param[in] indices Pointer to store the computed indices. * \param[in] n Number of data points. * \param[in] k Number of clusters. * \param[in] dim Data dimension. * * \remark Returns nothing, but saves each data's cluster index in \a indices. */ static inline void av1_calc_indices(const int16_t *data, const int16_t *centroids, uint8_t *indices, int n, int k, int dim) { assert(n > 0); assert(k > 0); if (dim == 1) { av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k); } else if (dim == 2) { av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k); } else { assert(0 && "Untemplated k means dimension"); } } /*!\brief Performs k-means cluster on the data. * * \ingroup palette_mode_search * \param[in] data The data points to be clustered. The data * layout is NUM_DATA_POINTS X DATA_DIM. * \param[in] centroids Pointer to store the computed centroids. * The data layout is * NUM_CENTROIDS X DATA_DIM. * \param[in] indices Pointer to store the computed indices. For * each training data. * \param[in] n Number of data points. * \param[in] k Number of clusters. * \param[in] dim Data dimension. * \param[in] max_itr Maximum number of iterations to run. * * \remark Returns nothing, but saves each cluster's centroid in centroids and * each data's cluster index in \a indices. * * \attention The output centroids are rounded off to nearest integers. 
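 *
 * \par Example
 * A minimal illustrative sketch (the sample values and buffer names are made
 * up for the example) of clustering eight 1-D luma samples into a two-color
 * palette:
 * \code
 *   int16_t data[8] = { 10, 12, 11, 13, 200, 205, 198, 202 };
 *   int16_t centroids[2] = { 0, 255 };  // initial guesses, refined in place
 *   uint8_t indices[8];                 // per-sample cluster assignment
 *   av1_k_means(data, centroids, indices, 8, 2, 1, 50);
 * \endcode
 * Afterwards centroids[] holds the two cluster means rounded to the nearest
 * integers and indices[i] gives the centroid assigned to data[i].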
*/ static inline void av1_k_means(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, int dim, int max_itr) { assert(n > 0); assert(k > 0); if (dim == 1) { AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr); } else if (dim == 2) { AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr); } else { assert(0 && "Untemplated k means dimension"); } } /*!\brief Checks what colors are in the color cache. * * \ingroup palette_mode_search * \param[in] color_cache A cache of colors. * \param[in] n_cache Number of colors in the cache. * \param[in] colors New base colors. * \param[in] n_colors Number of new colors. * \param[in] cache_color_found Stores what cached colors are presented in * colors. * \param[in] out_cache_colors Stores what colors are not in the cache. * * \return Returns the number of colors that are not in cache. In addition, * records whether each cache color is presented in colors in cache_color_found, * and stores and stores the out of cache colors in out_cache_colors. */ int av1_index_color_cache(const uint16_t *color_cache, int n_cache, const uint16_t *colors, int n_colors, uint8_t *cache_color_found, int *out_cache_colors); /*!\brief Gets the rate cost for each delta-encoding v palette. * * \ingroup palette_mode_search * \param[in] pmi Struct that stores the palette mode info. * \param[in] bit_depth Pixel bitdepth of the sequence. * \param[in] zero_count Stores the number of zero deltas. * \param[in] min_bits Minimum bits for the deltas. Sets to * bit_depth - 4. * * \return Returns the number of bits used to transmit each v palette color * delta and assigns zero_count with the number of deltas being 0. */ int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, int bit_depth, int *zero_count, int *min_bits); /*!\brief Gets the rate cost for transmitting luma palette color values. * * \ingroup palette_mode_search * \param[in] pmi Struct that stores the palette mode info. * \param[in] color_cache Color cache presented at the decoder. * \param[in] n_cache Number of colors in the cache. * \param[in] bit_depth Pixel bitdepth of the sequence. * * \return Returns the rate needed to transmit the palette. Note that this does * not include the cost of transmitted the color map. */ int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, const uint16_t *color_cache, int n_cache, int bit_depth); /*!\brief Gets the rate cost for transmitting luma palette chroma values. * * \ingroup palette_mode_search * \param[in] pmi Struct that stores the palette mode info. * \param[in] color_cache Color cache presented at the decoder. * \param[in] n_cache Number of colors in the cache. * \param[in] bit_depth Pixel bitdepth of the sequence. * * \return Returns the rate needed to transmit the palette. Note that this does * not include the cost of transmitted the color map. */ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, const uint16_t *color_cache, int n_cache, int bit_depth); /*!\brief Search for the best palette in the luma plane. * * \ingroup palette_mode_search * \callergraph * This function is used in both inter and intra frame coding. 
*/ void av1_rd_pick_palette_intra_sby( const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map); /*!\brief Search for the best palette in the chroma plane. * * \ingroup palette_mode_search * \callergraph * This function is used in both inter and intra frame coding. */ void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi, struct macroblock *x, int dc_mode_cost, uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi, int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable); /*!\brief Resets palette color map for chroma channels. */ void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PALETTE_H_ aom-3.12.1/av1/encoder/partition_cnn_weights.h000066400000000000000000004111411477627663500212630ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ #define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/cnn.h" #include "av1/encoder/ml.h" #define CNN_BRANCH_0_OUT_CH 20 #define CNN_BRANCH_1_OUT_CH 4 #define CNN_BRANCH_2_OUT_CH 20 #define CNN_BRANCH_3_OUT_CH 20 #define CNN_TOT_OUT_CH \ (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \ (CNN_BRANCH_3_OUT_CH))) #define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH) #define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2) #define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4) #define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8) #define CNN_OUT_BUF_SIZE \ (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \ (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE))) #define NUM_DNN_BRANCHES 4 #define NUM_CNN_LAYERS 5 #define BRANCH_0_NUM_DNN_LAYERS 2 #define BRANCH_1_NUM_DNN_LAYERS 2 #define BRANCH_2_NUM_DNN_LAYERS 2 #define BRANCH_3_NUM_DNN_LAYERS 2 #define CNN_LAYER_0_HEIGHT 5 #define CNN_LAYER_0_WIDTH 5 #define CNN_LAYER_0_IN_CH 1 #define CNN_LAYER_0_OUT_CH 20 #define CNN_LAYER_0_HORZ_STRIDE 4 #define CNN_LAYER_0_VERT_STRIDE 4 #define CNN_LAYER_1_HEIGHT 2 #define CNN_LAYER_1_WIDTH 2 #define CNN_LAYER_1_IN_CH 20 #define CNN_LAYER_1_OUT_CH 20 #define CNN_LAYER_1_HORZ_STRIDE 2 #define CNN_LAYER_1_VERT_STRIDE 2 #define CNN_LAYER_2_HEIGHT 2 #define CNN_LAYER_2_WIDTH 2 #define CNN_LAYER_2_IN_CH 20 #define CNN_LAYER_2_OUT_CH 20 #define CNN_LAYER_2_HORZ_STRIDE 2 #define CNN_LAYER_2_VERT_STRIDE 2 #define CNN_LAYER_3_HEIGHT 2 #define CNN_LAYER_3_WIDTH 2 #define CNN_LAYER_3_IN_CH 20 #define CNN_LAYER_3_OUT_CH 4 #define CNN_LAYER_3_HORZ_STRIDE 2 #define CNN_LAYER_3_VERT_STRIDE 2 #define CNN_LAYER_4_HEIGHT 2 #define CNN_LAYER_4_WIDTH 2 #define CNN_LAYER_4_IN_CH 4 #define CNN_LAYER_4_OUT_CH 20 #define CNN_LAYER_4_HORZ_STRIDE 2 
#define CNN_LAYER_4_VERT_STRIDE 2 #define BRANCH_0_NUM_DNN_FEATURES 37 #define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16 #define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24 #define BRANCH_0_NUM_LOGITS 1 #define BRANCH_1_NUM_DNN_FEATURES 25 #define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16 #define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24 #define BRANCH_1_NUM_LOGITS 1 #define BRANCH_2_NUM_DNN_FEATURES 25 #define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16 #define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24 #define BRANCH_2_NUM_LOGITS 1 #define BRANCH_3_NUM_DNN_FEATURES 41 #define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16 #define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24 #define BRANCH_3_NUM_LOGITS 1 static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = { 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f, 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f, 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f, -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f, 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f, -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f, -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f, 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f, 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f, -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f, -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f, 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f, -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f, 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f, 0.30613f, -0.39369f, 0.622438f, -0.52877f, -0.334991f, 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f, -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f, 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f, 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f, -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f, 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f, -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f, 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f, 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f, 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f, 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f, -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f, -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f, 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f, -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f, -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f, -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f, 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f, -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f, -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f, -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f, -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f, -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f, 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f, 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f, 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f, -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f, 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f, 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f, 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f, -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f, -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f, 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f, 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f, 0.181083f, 
-0.169294f, 0.292645f, 0.140405f, 0.0572885f, -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f, 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f, -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f, 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f, -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f, 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f, -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f, -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f, 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f, 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f, 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f, -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f, 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f, -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f, 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f, -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f, 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f, -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f, 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f, 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f, -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f, 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f, 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f, 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f, -0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f, -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f, -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f, 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f, 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f, -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f, 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f, -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f, 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f, 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f, 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f, 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f, 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f, -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f, 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f, 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f, -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f, 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f, 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f, 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f, -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f, -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f, -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f, 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f, 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f, 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f }; static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = { 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f, -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f, 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f, 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f }; static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = { 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f, -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f, -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f, 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f, 
0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f, 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f, -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f, -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f, 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f, -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f, -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f, 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f, 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f, -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f, 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f, 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f, 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f, 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f, -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f, -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f, -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f, 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f, -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f, -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f, 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f, 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f, -1.15856f, -0.516443f, -0.322702f, 0.15668f, 0.0075841f, -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f, -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f, -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f, -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f, -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f, -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f, 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f, -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f, -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f, 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f, -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f, -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f, -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f, 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f, -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f, -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f, -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f, -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f, 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f, -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f, -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f, -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f, 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f, -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f, 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f, -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f, 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f, -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f, 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f, -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f, 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f, -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f, -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f, -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f, -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f, -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f, 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f, -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f, 0.437147f, 
0.263637f, 0.0325681f, 0.303581f, 0.353479f, -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f, 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f, -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f, 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f, -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f, -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f, 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f, -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f, -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f, -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f, 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f, -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f, -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f, 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f, 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f, -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f, 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f, -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f, -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f, 0.689968f, -0.688546f, 1.95033f, 0.420946f, 0.0282428f, 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f, -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f, -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f, -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f, 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f, -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f, -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f, -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f, 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f, 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f, -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f, 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f, 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f, -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f, 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f, -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f, -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f, -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f, -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f, 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f, 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f, 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f, -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f, -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f, -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f, 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f, -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f, 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f, 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f, 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f, -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f, -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f, 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f, -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f, -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f, -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f, -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f, -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f, 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f, 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f, -0.143833f, 
-1.07838f, -0.106896f, 0.16945f, 0.0170324f, 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f, -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f, 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f, -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f, 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f, 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f, 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f, 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f, 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f, 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f, 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f, 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f, -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f, 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f, -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f, 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f, -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f, 0.88488f, 0.306306f, -0.275613f, -0.476372f, 0.00678104f, 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f, 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f, -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f, 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f, 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f, -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f, 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f, -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f, 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f, 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f, 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f, -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f, -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f, 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f, -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f, -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f, -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f, 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f, -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f, -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f, 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f, 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f, 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f, -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f, -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f, 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f, -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f, -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f, 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f, 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f, -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f, -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f, -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f, -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f, 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f, 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f, 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f, 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f, -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f, -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f, 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f, -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f, 0.53583f, -0.640644f, 0.0280044f, 
0.00628834f, 0.143885f, 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f, -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f, 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f, 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f, -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f, -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f, -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f, 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f, -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f, -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f, 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f, 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f, -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f, -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f, 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f, 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f, 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f, 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f, -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f, 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f, -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f, 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f, 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f, -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f, 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f, 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f, -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f, 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f, -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f, 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f, 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f, -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f, 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f, -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f, 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f, -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f, 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f, 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f, 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f, -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f, -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f, 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f, 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f, 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f, -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f, 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f, -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f, 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f, -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f, -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f, -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f, -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f, -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f, -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f, 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f, 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f, 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f, 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f, 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f, -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f, 0.366015f, -0.00790003f, 
-0.458632f, -0.31157f, -0.182257f, -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f, 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f, 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f, 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f, -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f, 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f, -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f, -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f, -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f, 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f, -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f, -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f, 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f, 0.268578f, -0.431123f, -0.114419f, 0.0101172f, -0.195671f, 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f, 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f, -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f, 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f, -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f, -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f, -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f, 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f, -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f, 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f, 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f, -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f, -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f, 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f, -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f, 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f, 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f, -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f, -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f, 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f, 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f, -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f, 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f, 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f, -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f, -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f, 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f, 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f, -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f, -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f, 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f, 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f, -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f, 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f, 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f, 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f, 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f, -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f, 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f, 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f, -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f, -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f, 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f, 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f, 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f, -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f, 0.633026f, 0.30866f, 
0.0653073f, 0.316901f, 0.0721326f, 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f, -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f, 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f, -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f, 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f, -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f, 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f, -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f, 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f, 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f }; static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = { 0.00447951f, 0.0202534f, 0.00970833f, -0.00460874f, 0.0942288f, -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f, 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f, -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f }; static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = { 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f, -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f, 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f, 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f, 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f, 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f, 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f, 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f, -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f, 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f, -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f, -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f, 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f, -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f, 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f, 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f, -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f, -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f, -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f, -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f, -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f, -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f, -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f, 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f, -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f, -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f, -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f, 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f, -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f, 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f, 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f, -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f, -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f, -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f, -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f, 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f, 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f, -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f, 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f, 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f, -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f, -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f, 0.133312f, 0.0246256f, -0.0167416f, 
0.196118f, 0.109823f, 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f, -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f, -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f, -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f, 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f, 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f, -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f, -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f, 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f, 0.267087f, 0.0802681f, -0.184741f, -0.558267f, 0.0437066f, 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f, 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f, -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f, -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f, 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f, -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f, -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f, 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f, -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f, 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f, 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f, -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f, -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f, 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f, 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f, 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f, -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f, 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f, 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f, 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f, -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f, 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f, 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f, 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f, 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f, 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f, 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f, 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f, -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f, -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f, 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f, 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f, -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f, -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f, 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f, -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f, -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f, 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f, 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f, 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f, -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f, -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f, 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f, -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f, -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f, 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f, -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f, -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f, -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f, 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 
0.540366f, 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f, -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f, -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f, 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f, 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f, -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f, 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f, -0.586727f, -0.0664489f, -0.631436f, -0.245828f, -0.0647894f, -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f, -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f, -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f, 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f, 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f, 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f, -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f, -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f, 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f, -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f, -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f, 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f, 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f, -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f, -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f, 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f, 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f, 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f, 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f, -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f, 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f, 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f, 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f, -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f, -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f, -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f, 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f, 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f, -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f, 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f, -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f, -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f, 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f, -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f, -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f, -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f, 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f, 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f, -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f, -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f, 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f, 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f, -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f, -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f, 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f, 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f, 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f, -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f, 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f, 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f, -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f, 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 
0.128684f, 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f, 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f, -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f, 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f, 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f, -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 0.0067527f, -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f, -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f, -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f, 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f, -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f, 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f, 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f, -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f, -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f, 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f, -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f, -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f, -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f, -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f, 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f, -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f, -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f, -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f, 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f, -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f, 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f, 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f, -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f, -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f, -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f, -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f, 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f, 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f, -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f, 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f, 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f, -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f, -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f, -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f, 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f, -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f, -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f, -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f, 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f, 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f, -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f, -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f, 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f, 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f, 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f, 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f, -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f, -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f, -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f, -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f, -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f, 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f, -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f, 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 
0.0394389f, -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f, -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f, -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f, -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 0.219659f, -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f, 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f, -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f, 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f, -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f, 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f, -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f, 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f, 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f, 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f, 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f, -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f, -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f, 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f, -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f, -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f, 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f, 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f, 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f, 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f, 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f, -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f, -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f, -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f, 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f, 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f, -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f, -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f, 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f, -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f, -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f, 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f, -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f, -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f, -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f, -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f, 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f, -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f, -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f, 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f, 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f, -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f, 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f, -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f, -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f, -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f, -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f, -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f, 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f, 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f, -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f, -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f, 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f, -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f, -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f, 0.595666f, -0.00162201f, -0.212444f, 
-0.26342f, 0.0913656f, -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f, -0.188536f, -0.0377482f, -0.131909f, -0.116099f, -0.236827f, -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f, 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f, 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f, 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f, -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f, 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f, 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f, 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f, 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f, -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f, -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f, -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f, -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f, 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f, -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f, 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f, -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f, -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f, -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f, -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f, -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f, 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f, 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f, 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f, -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f, -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f, 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f, 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f, -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f, -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f, 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f, 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f, 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f, -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f, -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f }; static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = { 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f, 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f, 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f, 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f }; static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = { -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f, 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f, -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f, 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f, -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f, 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f, -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f, 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f, 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f, 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f, -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f, 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f, -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f, -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f, 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f, -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, 
-0.0653405f, 0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f, 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f, -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f, 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f, -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f, 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f, -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f, 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f, -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f, 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f, -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f, 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f, -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f, 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f, 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f, -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f, -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f, 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f, -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f, 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f, 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f, 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f, -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f, 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f, -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f, 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f, -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f, 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f, 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f, 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f, -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f, -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f, -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f, 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f, -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f, 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f, 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f, 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f, -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f, 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f, -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f, 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f, -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f, 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f, 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f, -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f, -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f, -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f }; static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = { 0.0106809f, 0.136699f, 0.285316f, 0.395746f }; static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = { -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f, 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f, 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f, -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f, 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f, -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f, 0.195492f, 0.218548f, -0.314895f, 0.0749444f, -0.191344f, 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f, -0.264495f, 
-0.432009f, -0.432686f, 0.155738f, -0.020197f, 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f, 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f, -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f, 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f, -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f, -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f, -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f, -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f, 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f, 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f, -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f, -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f, -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f, 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f, 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f, 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f, 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f, -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f, -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f, -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f, 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f, -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f, -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f, 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f, 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f, 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f, -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f, -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f, 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f, 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f, 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f, -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f, -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f, -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f, 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f, 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f, -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f, 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f, -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f, 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f, 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f, -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f, -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f, 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f, 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f, -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f, 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f, -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f, 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f, -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f, 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f, -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f, -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f, 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f, -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f }; static const float av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = { -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f, -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f, 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f, -1.01429f, -0.00315052f, -0.0273938f, 
-0.0263379f, -0.171702f }; static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = { NUM_CNN_LAYERS, // num_layers 0, // is_residue 0, // ext_width 0, // ext_height 0, // strict_bounds { { CNN_LAYER_0_IN_CH, // in_channels CNN_LAYER_0_WIDTH, // filter_width CNN_LAYER_0_WIDTH, // filter_height CNN_LAYER_0_OUT_CH, // out_channels CNN_LAYER_0_HORZ_STRIDE, // skip_width CNN_LAYER_0_VERT_STRIDE, // skip_height 0, // maxpool av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias PADDING_VALID, // pad RELU, // activation 0, // deconvolve 0, // branch BRANCH_NO_COPY, // branch_copy_type BRANCH_NOC, // branch_combine_type NO_BRANCH_CONFIG, // branch_config NO_BN_PARAMS, // bn_params -1, // output_num }, { CNN_LAYER_1_IN_CH, // in_channels CNN_LAYER_1_WIDTH, // filter_width CNN_LAYER_1_WIDTH, // filter_height CNN_LAYER_1_OUT_CH, // out_channels CNN_LAYER_1_HORZ_STRIDE, // skip_width CNN_LAYER_1_VERT_STRIDE, // skip_height 0, // maxpool av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias PADDING_VALID, // pad RELU, // activation 0, // deconvolve 0, // branch BRANCH_NO_COPY, // branch_copy_type BRANCH_NOC, // branch_combine_type NO_BRANCH_CONFIG, // branch_config NO_BN_PARAMS, // bn_params 3, // output_num }, { CNN_LAYER_2_IN_CH, // in_channels CNN_LAYER_2_WIDTH, // filter_width CNN_LAYER_2_WIDTH, // filter_height CNN_LAYER_2_OUT_CH, // out_channels CNN_LAYER_2_HORZ_STRIDE, // skip_width CNN_LAYER_2_VERT_STRIDE, // skip_height 0, // maxpool av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias PADDING_VALID, // pad RELU, // activation 0, // deconvolve 0, // branch BRANCH_NO_COPY, // branch_copy_type BRANCH_NOC, // branch_combine_type NO_BRANCH_CONFIG, // branch_config NO_BN_PARAMS, // bn_params 2, // output_num }, { CNN_LAYER_3_IN_CH, // in_channels CNN_LAYER_3_WIDTH, // filter_width CNN_LAYER_3_WIDTH, // filter_height CNN_LAYER_3_OUT_CH, // out_channels CNN_LAYER_3_HORZ_STRIDE, // skip_width CNN_LAYER_3_VERT_STRIDE, // skip_height 0, // maxpool av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias PADDING_VALID, // pad RELU, // activation 0, // deconvolve 0, // branch BRANCH_NO_COPY, // branch_copy_type BRANCH_NOC, // branch_combine_type NO_BRANCH_CONFIG, // branch_config NO_BN_PARAMS, // bn_params 1, // output_num }, { CNN_LAYER_4_IN_CH, // in_channels CNN_LAYER_4_WIDTH, // filter_width CNN_LAYER_4_WIDTH, // filter_height CNN_LAYER_4_OUT_CH, // out_channels CNN_LAYER_4_HORZ_STRIDE, // skip_width CNN_LAYER_4_VERT_STRIDE, // skip_height 0, // maxpool av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias PADDING_VALID, // pad RELU, // activation 0, // deconvolve 0, // branch BRANCH_NO_COPY, // branch_copy_type BRANCH_NOC, // branch_combine_type NO_BRANCH_CONFIG, // branch_config NO_BN_PARAMS, // bn_params 0, // output_num }, }, }; static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = { 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f, -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f, 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f, -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f, 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f, -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f, -0.0353473f, -0.0364045f, -0.113556f, 
-0.520842f, 0.231248f, 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f, 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f, 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f, -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f, -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f, -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f, -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f, 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f, 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f, -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f, 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f, -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f, -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f, 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f, 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f, -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f, 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f, -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f, -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f, 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f, -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f, 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f, -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f, -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f, -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f, -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f, -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f, -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f, -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f, 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f, 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f, -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f, -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f, -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f, 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f, -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f, 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f, -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f, 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f, -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f, -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f, 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f, 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f, -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f, -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f, 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f, -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f, -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f, 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f, -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f, -0.219245f, -0.0330375f, 0.367585f, -0.220391f, 0.308736f, 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f, -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f, -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f, 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f, 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f, 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f, -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f, 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f, -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f, 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f, 
-0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f, -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f, -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f, -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f, 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f, -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f, 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f, -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f, 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f, -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f, 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f, 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f, -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f, 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f, 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f, -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f, 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f, -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f, -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f, 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f, 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f, -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f, -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f, -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f, 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f, 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f, 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f, -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f, 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f, -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f, -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f, 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f, -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f, -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f, -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f, 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f, -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f, -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f, -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f, 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f, -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f, -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f, 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f, 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f, 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f, 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f, 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f, 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f, -0.228837f, 0.397216f, 0.581501f, 0.284376f, -0.130434f, 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f, 0.0146821f, 0.475679f }; static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = { 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f, -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f, -0.0162434f, -0.132858f, 0.543411f, -0.626599f }; static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = { 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f, 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f, -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f, 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f, 0.640631f, -1.59467f, -9.8166f, 
-0.22889f, -0.189912f, 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f, -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f, -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f, -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f, -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f, -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f, -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f, -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f, -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f, -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f, 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f, -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f, 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f, 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f, -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f, 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f, 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f, 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f, 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f, -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f, -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f, -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f, -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f, -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f, -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f, 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f, -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f, -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f, 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f, 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f, 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f, 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f, 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f, 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f, 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f, 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f, 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f, 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f, 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f, 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f, -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f, 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f, -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f, 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f, -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f, -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f, -0.200395f, -0.546839f, -0.104226f, -0.152727f, -0.56685f, -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f, -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f, -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f, -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f, -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f, 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f, -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f, -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f, -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f, -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f, 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f, -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f, 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f, 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f, 0.256174f, 0.493667f, 
0.681324f, 0.155613f, 0.773309f, -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f, 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f, -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f, -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f, -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f, 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f, 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f, -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f, -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f, 0.741187f, 0.42019f, 0.0676459f, -0.230008f }; static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = { -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f, 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f, -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f, 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f }; static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = { 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f, 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f, -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f, -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f }; static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = { 1.14463f }; static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = { 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f, -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f, 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f, -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f, 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f, 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f, -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f, -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f, -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f, -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f, -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f, 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f, 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f, -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f, 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f, 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f, -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f, -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f, -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f, -0.372897f, -0.75125f, 0.77698f, 1.1032f, -0.0764679f, 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f, -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f, 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f, -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f, -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f, -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f, 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f, -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f, -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f, -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f, 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f, 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f, 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f, 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f, -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f, -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 
0.838109f, -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f, -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f, -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f, -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f, -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f, 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f, -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f, 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f, 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f, -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f, 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f, 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f, -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f, 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f, 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f, -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f, -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f, 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f, 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f, 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f, -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f, -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f, -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f, -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f, 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f, 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f, 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f, 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f, 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f, 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f, 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f, 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f, 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f, 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f, -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f, -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f, 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f, 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f, -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f, -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f, 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f, 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f, 0.109949f, -0.0428502f, 0.343629f, -0.722978f, -0.375269f, -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f }; static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = { 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f, -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f, 0.502411f, 0.649282f, 0.15345f, -0.0109896f }; static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = { 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f, 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f, 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f, 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f, -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f, -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f, -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f, 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f, -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f, -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f, -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f, 
-0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f, -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f, -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f, -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f, 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f, 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f, -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f, 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f, -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f, 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f, 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f, -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f, -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f, 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f, 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f, -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f, 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f, -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f, -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f, -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f, 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f, -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f, -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f, -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f, 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f, -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f, 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f, -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f, -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f, 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f, 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f, -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f, -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f, -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f, 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f, 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f, 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f, 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f, 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f, 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f, -0.200846f, 0.784204f, 0.641112f, -0.509346f, 0.0805264f, -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f, 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f, -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f, 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f, -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f, 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f, -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f, -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f, 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f, -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f, -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f, -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f, -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f, 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f, 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f, -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f, 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f, -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f, -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f, -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f, -0.144093f, 1.66496f, 0.180206f, -1.04384f, 
-0.65883f, 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f, 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f, -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f, 0.395875f, -0.171812f, 0.253794f, 0.432799f }; static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = { -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f, 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f, 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f, 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f }; static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = { -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f, 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f, -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f, -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f }; static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = { 1.33207f }; static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = { 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f, -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f, -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f, 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f, 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f, -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f, 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f, 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f, 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f, 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f, -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f, 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f, 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f, -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f, 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f, 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f, 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f, 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f, -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f, -0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f, 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f, -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f, -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f, -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f, -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f, -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f, 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f, -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f, 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f, 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f, 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f, -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f, 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f, 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f, -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f, 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f, -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f, -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f, 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f, 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f, 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f, 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f, 
-0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f, 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f, 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f, 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f, -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f, -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f, 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f, 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f, 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f, -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f, -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f, -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f, -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f, 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f, -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f, 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f, -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f, -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f, -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f, 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f, 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f, 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f, 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f, -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f, -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f, 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f, -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f, 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f, 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f, -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f, 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f, 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f, 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f, 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f, -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f, -1.33102f, -0.482856f, -0.435731f, 0.300808f, 0.346503f, 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f, -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f }; static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = { -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f, 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f, 0.45592f, -0.160465f, -0.634243f, 0.0829737f }; static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = { 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f, 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f, -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f, -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f, 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f, -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f, 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f, -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f, -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f, -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f, -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f, -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f, 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f, 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f, -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f, 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f, -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f, -0.0676526f, 
-0.183926f, 0.164894f, 0.0877923f, 0.565943f, -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f, -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f, -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f, -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f, -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f, 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f, -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f, -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f, -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f, -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f, 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f, 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f, -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f, -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f, 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f, 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f, -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f, 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f, 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f, 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f, -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f, 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f, -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f, -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f, 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f, 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f, 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f, -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f, 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f, 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f, 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f, -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f, 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f, -1.44704f, 0.360623f, 0.390298f, -0.213968f, 0.169783f, -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f, -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f, -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f, -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f, -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f, -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f, -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f, -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f, -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f, -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f, 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f, 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f, -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f, 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f, 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f, 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f, 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f, -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f, -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f, -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f, 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f, -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f, 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f, -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f, -0.863307f, -0.452033f, -0.278151f, 1.86233f }; static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = { -0.103218f, -0.359587f, 0.619666f, 
-0.473497f, -0.649803f, 0.86992f, -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f, 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f, 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f }; static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = { -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f, -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f, 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f, 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f }; static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = { 0.953424f }; static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = { 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f, 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f, -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f, -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f, -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f, 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f, 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f, -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f, -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f, -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f, 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f, -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f, 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f, 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f, -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f, -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f, 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f, 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f, -0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f, 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f, 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f, 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f, -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f, 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f, 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f, 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f, 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f, -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f, 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f, 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f, 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f, -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f, 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f, -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f, -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f, 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f, 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f, 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f, -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f, 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f, 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f, 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f, -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f, 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f, -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f, -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f, -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f, 
0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f, -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f, -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f, 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f, -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f, 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f, 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f, 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f, 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f, -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f, -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f, -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f, -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f, -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f, 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f, -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f, 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f, -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f, -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f, -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f, -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f, -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f, -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f, -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f, 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f, 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f, 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f, 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f, 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f, 0.111338f, -0.227126f, 0.159296f, -0.0584685f, -0.108265f, -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f, -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f, -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f, 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f, -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f, -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f, 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f, 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f, -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f, -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f, 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f, 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f, 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f, -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f, 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f, 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f, -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f, 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f, 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f, 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f, 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f, 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f, 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f, -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f, -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f, -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f, -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f, 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f, -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f, 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f, -0.0453539f, -0.133096f, 0.511947f, 
-0.180657f, -0.0298605f, 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f, 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f, -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f, -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f, 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f, 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f, 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f, -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f, 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f, 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f, -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f, -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f, 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f, -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f, 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f, -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f, 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f, 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f, 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f, -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f, 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f, 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f, -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f, 0.511906f }; static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = { -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f, -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f, 0.184986f, -0.0719864f, 0.19748f, 0.404145f }; static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = { 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f, 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f, 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f, 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f, 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f, 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f, 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f, 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f, 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f, -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f, 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f, -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f, -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f, -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f, 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f, -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f, 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f, 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f, 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f, 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f, 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f, 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f, 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f, 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f, -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f, 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f, -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f, 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f, 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f, 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f, 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f, -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f, 1.68731f, 
0.764082f, 0.193555f, 0.322782f, 1.19801f, 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f, -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f, 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f, 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f, -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f, 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f, -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f, 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f, 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f, 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f, 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f, 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f, 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f, 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f, -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f, -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f, 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f, 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f, 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f, 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f, 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f, 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f, -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f, -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f, -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f, 0.795464f, 0.106099f, 1.83117f, 0.0883305f, 0.306844f, -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f, 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f, 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f, -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f, 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f, 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f, 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f, 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f, 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f, 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f, -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f, 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f, 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f, -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f, -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f, -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f, 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f, -1.16424f, 0.270405f, 0.219033f, -4.91105f }; static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = { -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f, -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f, 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f, 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f }; static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = { -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f, -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f, 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f, -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f }; static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = { -0.022787f }; static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = { BRANCH_0_NUM_DNN_FEATURES, BRANCH_0_NUM_LOGITS, BRANCH_0_NUM_DNN_LAYERS, { BRANCH_0_NUM_DNN_LAYER_0_UNITS, 
BRANCH_0_NUM_DNN_LAYER_1_UNITS, }, { av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel, av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel, av1_intra_mode_cnn_partition_branch_0_logits_kernel, }, { av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, av1_intra_mode_cnn_partition_branch_0_logits_bias, }, }; static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = { BRANCH_1_NUM_DNN_FEATURES, BRANCH_1_NUM_LOGITS, BRANCH_1_NUM_DNN_LAYERS, { BRANCH_1_NUM_DNN_LAYER_0_UNITS, BRANCH_1_NUM_DNN_LAYER_1_UNITS, }, { av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel, av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel, av1_intra_mode_cnn_partition_branch_1_logits_kernel, }, { av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias, av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias, av1_intra_mode_cnn_partition_branch_1_logits_bias, }, }; static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = { BRANCH_2_NUM_DNN_FEATURES, BRANCH_2_NUM_LOGITS, BRANCH_2_NUM_DNN_LAYERS, { BRANCH_2_NUM_DNN_LAYER_0_UNITS, BRANCH_2_NUM_DNN_LAYER_1_UNITS, }, { av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel, av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel, av1_intra_mode_cnn_partition_branch_2_logits_kernel, }, { av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias, av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias, av1_intra_mode_cnn_partition_branch_2_logits_bias, }, }; static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = { BRANCH_3_NUM_DNN_FEATURES, BRANCH_3_NUM_LOGITS, BRANCH_3_NUM_DNN_LAYERS, { BRANCH_3_NUM_DNN_LAYER_0_UNITS, BRANCH_3_NUM_DNN_LAYER_1_UNITS, }, { av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel, av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel, av1_intra_mode_cnn_partition_branch_3_logits_kernel, }, { av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias, av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias, av1_intra_mode_cnn_partition_branch_3_logits_bias, }, }; #undef NUM_DNN_BRANCHES #undef NUM_CNN_LAYERS #undef BRANCH_0_NUM_DNN_LAYERS #undef BRANCH_1_NUM_DNN_LAYERS #undef BRANCH_2_NUM_DNN_LAYERS #undef BRANCH_3_NUM_DNN_LAYERS #undef CNN_LAYER_0_HEIGHT #undef CNN_LAYER_0_WIDTH #undef CNN_LAYER_0_IN_CH #undef CNN_LAYER_0_OUT_CH #undef CNN_LAYER_0_HORZ_STRIDE #undef CNN_LAYER_0_VERT_STRIDE #undef CNN_LAYER_1_HEIGHT #undef CNN_LAYER_1_WIDTH #undef CNN_LAYER_1_IN_CH #undef CNN_LAYER_1_OUT_CH #undef CNN_LAYER_1_HORZ_STRIDE #undef CNN_LAYER_1_VERT_STRIDE #undef CNN_LAYER_2_HEIGHT #undef CNN_LAYER_2_WIDTH #undef CNN_LAYER_2_IN_CH #undef CNN_LAYER_2_OUT_CH #undef CNN_LAYER_2_HORZ_STRIDE #undef CNN_LAYER_2_VERT_STRIDE #undef CNN_LAYER_3_HEIGHT #undef CNN_LAYER_3_WIDTH #undef CNN_LAYER_3_IN_CH #undef CNN_LAYER_3_OUT_CH #undef CNN_LAYER_3_HORZ_STRIDE #undef CNN_LAYER_3_VERT_STRIDE #undef CNN_LAYER_4_HEIGHT #undef CNN_LAYER_4_WIDTH #undef CNN_LAYER_4_IN_CH #undef CNN_LAYER_4_OUT_CH #undef CNN_LAYER_4_HORZ_STRIDE #undef CNN_LAYER_4_VERT_STRIDE #undef BRANCH_0_NUM_DNN_FEATURES #undef BRANCH_0_NUM_DNN_LAYER_0_UNITS #undef BRANCH_0_NUM_DNN_LAYER_1_UNITS #undef BRANCH_0_NUM_LOGITS #undef BRANCH_1_NUM_DNN_FEATURES #undef BRANCH_1_NUM_DNN_LAYER_0_UNITS #undef BRANCH_1_NUM_DNN_LAYER_1_UNITS #undef BRANCH_1_NUM_LOGITS #undef BRANCH_2_NUM_DNN_FEATURES #undef BRANCH_2_NUM_DNN_LAYER_0_UNITS #undef BRANCH_2_NUM_DNN_LAYER_1_UNITS #undef BRANCH_2_NUM_LOGITS #undef BRANCH_3_NUM_DNN_FEATURES #undef BRANCH_3_NUM_DNN_LAYER_0_UNITS #undef 
BRANCH_3_NUM_DNN_LAYER_1_UNITS #undef BRANCH_3_NUM_LOGITS static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = { 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f, }; static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = { -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f, }; static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = { 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f, }; static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = { -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f, }; static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = { 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f, }; static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = { -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f, }; static const float av1_intra_mode_cnn_partition_mean[1] = { 1.191922f, }; static const float av1_intra_mode_cnn_partition_std[1] = { 1.730044f, }; static const int quad_to_linear_0[1] = { 0 }; static const int quad_to_linear_1[4] = { 0, 1, 2, 3 }; static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 }; static const int quad_to_linear_3[64] = { 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27, 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31, 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59, 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63 }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ aom-3.12.1/av1/encoder/partition_model_weights.h000066400000000000000000012164161477627663500216150ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ #define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/ml.h" // TODO(chiyotsai@google.com): The performance of these models is getting worse // due to the changes in the encoder. We should retrain the models here to get // better performance once we have the time. #define FEATURE_SIZE 10 #define LABEL_SIZE 16 // nn model for ab partition pruning, 128x128.
static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = { -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f, 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f, 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f, -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f, -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f, -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f, 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f, 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f, 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f, 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f, -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f, -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f, 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f, 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f, 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f, 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f, 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f, -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f, 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f, -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f, -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f, -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f, -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f, -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f, 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f, 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, -0.096210f, -0.146466f, 
-0.249618f, -0.195998f, 0.088758f, 0.023781f, -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f, 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, -0.568868f, -0.936062f, -0.418886f, 
-0.058735f, -0.511964f, -0.438596f, 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, 0.420104f, -0.072042f, -0.006404f, 0.171680f, }; static const float av1_ab_partition_nn_bias_128_layer0[64] = { 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, 0.853918f, 0.002504f, -0.190403f, 0.452050f, }; static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, 0.115074f, -0.412305f, -0.272304f, 0.048096f, 
-0.803811f, 0.275000f, 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, 
-0.160740f, -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f, 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 
0.250056f, 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, 0.823643f, -0.119781f, -0.098359f, 0.093119f, }; static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, 0.325655f, -0.107123f, 0.591049f, 0.358744f, }; static const NN_CONFIG av1_ab_partition_nnconfig_128 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 64, // num_hidden_nodes }, { av1_ab_partition_nn_weights_128_layer0, av1_ab_partition_nn_weights_128_layer1, }, { av1_ab_partition_nn_bias_128_layer0, av1_ab_partition_nn_bias_128_layer1, }, }; // nn model for ab partition pruning, 64x64. 
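// Editorial note, not part of the generated model data: a minimal sketch of
// how an NN_CONFIG such as av1_ab_partition_nnconfig_128 above is assumed to
// be evaluated. The tables encode a fully connected network with one hidden
// layer (ReLU activation) and a linear output layer; the weights for each
// node are assumed to be stored contiguously, i.e.
// weights[layer][node * num_in + i]. The helper name eval_nn_config is
// hypothetical and is shown only to document the layout of the weight/bias
// arrays; the encoder's own prediction routine should be used in practice.
//
//   static void eval_nn_config(const NN_CONFIG *cfg, const float *features,
//                              float *scores) {
//     float hidden[64];  // num_hidden_nodes[0] is 64 for all models here.
//     for (int n = 0; n < cfg->num_hidden_nodes[0]; ++n) {
//       float v = cfg->bias[0][n];
//       for (int i = 0; i < cfg->num_inputs; ++i)
//         v += cfg->weights[0][n * cfg->num_inputs + i] * features[i];
//       hidden[n] = v > 0.0f ? v : 0.0f;  // ReLU on the hidden layer.
//     }
//     for (int k = 0; k < cfg->num_outputs; ++k) {
//       float v = cfg->bias[1][k];
//       for (int n = 0; n < cfg->num_hidden_nodes[0]; ++n)
//         v += cfg->weights[1][k * cfg->num_hidden_nodes[0] + n] * hidden[n];
//       scores[k] = v;  // Raw (linear) scores, one per partition label.
//     }
//   }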
static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, -0.097070f, -0.371156f, 
-0.146323f, -0.015508f, -0.103873f, -0.087476f, -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, 
-1.811056f, -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, 0.230343f, -0.034318f, -0.022687f, -0.047090f, }; static const float av1_ab_partition_nn_bias_64_layer0[64] = { -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, -0.125366f, -0.063576f, -0.212169f, -0.047463f, }; static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, -0.364258f, -0.685252f, 
-0.266115f, -0.247345f, -0.155905f, 0.152283f, -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f, -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, -0.347397f, 
-0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f, -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, 0.065089f, 
-0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, -0.114126f, -0.193834f, -0.025759f, 0.263183f, }; static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, -0.872737f, 0.718723f, 0.703398f, 2.560015f, }; static const NN_CONFIG av1_ab_partition_nnconfig_64 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 64, // num_hidden_nodes }, { av1_ab_partition_nn_weights_64_layer0, av1_ab_partition_nn_weights_64_layer1, }, { av1_ab_partition_nn_bias_64_layer0, av1_ab_partition_nn_bias_64_layer1, }, }; // nn model for ab partition pruning, 32x32. 
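// Editorial note, not part of the generated model data: the LABEL_SIZE raw
// scores produced by these ab-partition models are assumed to be converted
// into probabilities with a numerically stable softmax before any pruning
// threshold is applied. A generic sketch of that step follows; softmax_scores
// is a hypothetical name, not an encoder helper, and expf requires <math.h>.
//
//   static void softmax_scores(const float *scores, float *prob, int n) {
//     float max_score = scores[0];
//     for (int i = 1; i < n; ++i)
//       if (scores[i] > max_score) max_score = scores[i];
//     float sum = 0.0f;
//     for (int i = 0; i < n; ++i) {
//       prob[i] = expf(scores[i] - max_score);  // Shift by max for stability.
//       sum += prob[i];
//     }
//     for (int i = 0; i < n; ++i) prob[i] /= sum;
//   }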
static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, -0.276088f, -0.455907f, 0.266021f, 
0.087348f, -0.146566f, 0.040492f, -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, -0.039424f, -0.063670f, 0.193808f, 
-0.303514f, -0.013376f, -0.057761f, 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, -0.827145f, -0.225277f, 0.275800f, 1.696635f, }; static const float av1_ab_partition_nn_bias_32_layer0[64] = { -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, -1.384604f, -0.201713f, -0.271948f, 0.372351f, }; static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, -0.280538f, 
-0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f, -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, 
-0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, 
-0.035603f, -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, -1.251640f, -0.353531f, -0.304009f, -0.153376f, }; static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, -0.010198f, 0.130597f, 1.276752f, 2.028188f, }; static const NN_CONFIG av1_ab_partition_nnconfig_32 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 64, // num_hidden_nodes }, { av1_ab_partition_nn_weights_32_layer0, av1_ab_partition_nn_weights_32_layer1, }, { av1_ab_partition_nn_bias_32_layer0, av1_ab_partition_nn_bias_32_layer1, }, }; // nn model for ab partition pruning, 16x16. 
static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f, 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, 
-0.290891f, 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 
0.211611f, -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, -0.021087f, 0.110220f, -0.128671f, 0.044219f, }; static const float av1_ab_partition_nn_bias_16_layer0[64] = { 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, 0.123809f, -0.109797f, 0.200510f, -0.147542f, }; static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f, 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, 0.753302f, -1.637869f, 0.126227f, 0.982129f, 
-0.121444f, -0.295570f, -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f, -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 
0.006928f, -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, 4.116680f, 
0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f, -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, 0.172790f, -0.172982f, 0.041258f, -0.299379f, }; static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, -0.658522f, 0.723479f, 0.544264f, 1.035225f, }; static const NN_CONFIG av1_ab_partition_nnconfig_16 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 64, // num_hidden_nodes }, { av1_ab_partition_nn_weights_16_layer0, av1_ab_partition_nn_weights_16_layer1, }, { av1_ab_partition_nn_bias_16_layer0, av1_ab_partition_nn_bias_16_layer1, }, }; #undef FEATURE_SIZE #undef LABEL_SIZE #define FEATURE_SIZE 18 #define LABEL_SIZE 4 static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, 
-0.347416f, -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f, 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, -0.242746f, 
0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, }; static const float av1_4_partition_nn_bias_16_layer0[24] = { 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, }; static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f, 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, }; static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { -0.462133f, 0.465060f, 0.062211f, 0.401786f, }; static const NN_CONFIG av1_4_partition_nnconfig_16 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_16_layer0, av1_4_partition_nn_weights_16_layer1, }, { av1_4_partition_nn_bias_16_layer0, av1_4_partition_nn_bias_16_layer1, }, }; static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 
0.122052f, -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, -0.229144f, -0.087744f, -0.346909f, 
0.172611f, 0.004377f, -0.009386f, -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f, 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, }; static const float av1_4_partition_nn_bias_32_layer0[32] = { 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, 0.109579f, -0.082685f, }; static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, 0.555345f, 0.030689f, 0.336357f, 
-0.357326f, -0.113137f, 0.272631f, 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, -0.800926f, -0.134132f, }; static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { -0.019518f, 0.198546f, 0.339015f, -0.261961f, }; static const NN_CONFIG av1_4_partition_nnconfig_32 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 32, // num_hidden_nodes }, { av1_4_partition_nn_weights_32_layer0, av1_4_partition_nn_weights_32_layer1, }, { av1_4_partition_nn_bias_32_layer0, av1_4_partition_nn_bias_32_layer1, }, }; static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, 0.428192f, 
-0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, }; static const float av1_4_partition_nn_bias_64_layer0[24] = { 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, }; static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { -0.851937f, 
-0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, }; static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { -0.478735f, 0.292948f, 0.293172f, 0.040013f, }; static const NN_CONFIG av1_4_partition_nnconfig_64 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { 24, // num_hidden_nodes }, { av1_4_partition_nn_weights_64_layer0, av1_4_partition_nn_weights_64_layer1, }, { av1_4_partition_nn_bias_64_layer0, av1_4_partition_nn_bias_64_layer1, }, }; #undef FEATURE_SIZE #undef LABEL_SIZE #define FEATURE_SIZE 4 static const float av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, -0.007193f, -0.257836f, }; static const float av1_partition_breakout_nn_bias_128_layer0[32] = { 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, -5.994947f, 
-0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, 0.429660f, -8.439470f, }; static const float av1_partition_breakout_nn_weights_128_layer1[32] = { -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, -0.039662f, 0.131499f, }; static const float av1_partition_breakout_nn_bias_128_layer1[1] = { 0.86678213f, }; static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { FEATURE_SIZE, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 32, // num_hidden_nodes }, { av1_partition_breakout_nn_weights_128_layer0, av1_partition_breakout_nn_weights_128_layer1, }, { av1_partition_breakout_nn_bias_128_layer0, av1_partition_breakout_nn_bias_128_layer1, }, }; static const float av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, -2.407131f, -0.062304f, 0.000874f, 0.108786f, }; static const float av1_partition_breakout_nn_bias_64_layer0[16] = { 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, -0.337413f, 4.492778f, 0.000000f, 17.043072f, }; static const float av1_partition_breakout_nn_weights_64_layer1[16] = { -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, -0.038572f, 0.307899f, -0.294283f, 0.118323f, }; static const float av1_partition_breakout_nn_bias_64_layer1[1] = { -1.33438122f, }; static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { FEATURE_SIZE, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, // num_hidden_nodes }, { av1_partition_breakout_nn_weights_64_layer0, av1_partition_breakout_nn_weights_64_layer1, }, { av1_partition_breakout_nn_bias_64_layer0, av1_partition_breakout_nn_bias_64_layer1, }, }; static const float av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, -0.520814f, 
-0.045386f, -0.443123f, -0.484209f, }; static const float av1_partition_breakout_nn_bias_32_layer0[16] = { 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, -0.423808f, 0.000000f, 6.352258f, -0.155787f, }; static const float av1_partition_breakout_nn_weights_32_layer1[16] = { 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, -0.004171f, 0.157694f, 0.117845f, 0.272115f, }; static const float av1_partition_breakout_nn_bias_32_layer1[1] = { 0.09049262f, }; static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { FEATURE_SIZE, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, // num_hidden_nodes }, { av1_partition_breakout_nn_weights_32_layer0, av1_partition_breakout_nn_weights_32_layer1, }, { av1_partition_breakout_nn_bias_32_layer0, av1_partition_breakout_nn_bias_32_layer1, }, }; static const float av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f, -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, -0.509287f, -0.048877f, -0.001512f, 0.077086f, }; static const float av1_partition_breakout_nn_bias_16_layer0[16] = { 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, 5.625762f, 0.615822f, 0.040057f, 16.668884f, }; static const float av1_partition_breakout_nn_weights_16_layer1[16] = { -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, 0.269773f, -0.021105f, -0.146698f, 0.188764f, }; static const float av1_partition_breakout_nn_bias_16_layer1[1] = { 1.60751927f, }; static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { FEATURE_SIZE, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, // num_hidden_nodes }, { av1_partition_breakout_nn_weights_16_layer0, av1_partition_breakout_nn_weights_16_layer1, }, { av1_partition_breakout_nn_bias_16_layer0, av1_partition_breakout_nn_bias_16_layer1, }, }; static const float av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, 
-0.596269f, 0.098494f, -0.005765f, 0.173652f, }; static const float av1_partition_breakout_nn_bias_8_layer0[16] = { 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, 2.336705f, -0.278834f, 0.231905f, 7.954366f, }; static const float av1_partition_breakout_nn_weights_8_layer1[16] = { -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, 0.055858f, 0.230970f, -0.056466f, 0.119780f, }; static const float av1_partition_breakout_nn_bias_8_layer1[1] = { 1.27784479f, }; static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { FEATURE_SIZE, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, // num_hidden_nodes }, { av1_partition_breakout_nn_weights_8_layer0, av1_partition_breakout_nn_weights_8_layer1, }, { av1_partition_breakout_nn_bias_8_layer0, av1_partition_breakout_nn_bias_8_layer1, }, }; #undef FEATURE_SIZE #define FEATURE_SIZE 9 // Input layer size #define NUM_NODES 32 // Hidden layer size #define LABEL_SIZE 3 // Output layer size static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * NUM_NODES] = { 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, -1.16173f, -0.58716f, 1.60438f, 0.23242f, 
0.91847f, 0.49041f, -0.16277f, -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, -1.08228f, }; static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f, -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, -0.22638f, 1.40940f, -0.09309f, 0.05828f, }; static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * LABEL_SIZE] = { 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, }; static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { 1.70665f, -0.77954f, -0.92709f, }; static const NN_CONFIG av1_rect_partition_nnconfig_8 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { NUM_NODES, }, // num_hidden_nodes { av1_rect_partition_nn_weights_8_layer0, av1_rect_partition_nn_weights_8_layer1 }, { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } }; static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * NUM_NODES] = { -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, 
0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f, 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, -0.05573f, }; static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, -0.12044f, 1.65478f, -0.75153f, 1.18441f, }; static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * LABEL_SIZE] = { -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, 
-0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, }; static const float av1_rect_partition_nn_bias_16_layer1[3] = { 2.68750f, -1.31894f, -1.36768f, }; static const NN_CONFIG av1_rect_partition_nnconfig_16 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { NUM_NODES, }, // num_hidden_nodes { av1_rect_partition_nn_weights_16_layer0, av1_rect_partition_nn_weights_16_layer1 }, { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } }; static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * NUM_NODES] = { -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, 0.01180f, 
-0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, 0.33984f, }; static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, -0.27602f, -1.98063f, 0.20816f, -0.01315f, }; static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * LABEL_SIZE] = { 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, }; static const float av1_rect_partition_nn_bias_32_layer1[3] = { 2.47332f, -1.65756f, -0.81573f, }; static const NN_CONFIG av1_rect_partition_nnconfig_32 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { NUM_NODES, }, // num_hidden_nodes { av1_rect_partition_nn_weights_32_layer0, av1_rect_partition_nn_weights_32_layer1 }, { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } }; static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * NUM_NODES] = { 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, 
-0.02488f, -0.00729f, -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, 0.09101f, }; static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, 0.59835f, -0.31269f, -0.30585f, -1.66212f, }; static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * LABEL_SIZE] = { 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, }; static const float av1_rect_partition_nn_bias_64_layer1[3] = { 0.32215f, -0.57522f, 0.25314f, }; static const NN_CONFIG av1_rect_partition_nnconfig_64 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { NUM_NODES, }, // num_hidden_nodes { 
av1_rect_partition_nn_weights_64_layer0, av1_rect_partition_nn_weights_64_layer1 }, { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } }; static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * NUM_NODES] = { -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, 2.02519f, }; static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, -4.97152f, -0.05425f, -3.84474f, -1.28006f, 
3.47490f, -0.08373f, 0.00225f, -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, 0.66120f, 0.61119f, -1.42293f, 0.32676f, }; static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES * LABEL_SIZE] = { 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f, 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f, -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f, 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f, 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f, 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f, 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f, 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f, -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f, -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f, 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f, 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f, 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f, 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f, }; static const float av1_rect_partition_nn_bias_128_layer1[3] = { 1.09014f, -0.53317f, -0.55668f, }; static const NN_CONFIG av1_rect_partition_nnconfig_128 = { FEATURE_SIZE, // num_inputs LABEL_SIZE, // num_outputs 1, // num_hidden_layers { NUM_NODES, }, // num_hidden_nodes { av1_rect_partition_nn_weights_128_layer0, av1_rect_partition_nn_weights_128_layer1 }, { av1_rect_partition_nn_bias_128_layer0, av1_rect_partition_nn_bias_128_layer1 } }; #undef FEATURE_SIZE #undef NUM_NODES #undef LABEL_SIZE // Below are the models used for simple_motion_search_based_split // Thresholds // The first index level is for aggresiveness, and the second is frame // resolution, third is bsize static const float av1_simple_motion_search_split_thresh[4][3][5] = { // Aggressiveness = 0 { // lowres { 1.40402595879f, // p = 0.8028197 4.72845183649f, // p = 0.99123732 1.86517797783f, // p = 0.86589934 1.58715223005f, // p = 0.83021506 7.22695596987f, // p = 0.9992738 }, // midres { 5.839480f, // p = 0.997098 1.877167f, // p = 0.867285 3.073499f, // p = 0.955783 1.405601f, // p = 0.803071 2.555636f, // p = 0.927951 }, // hdres { 5.839480f, // p = 0.997098 1.877167f, // p = 0.867285 3.073499f, // p = 0.955783 1.405601f, // p = 0.803071 2.555636f, // p = 0.927951 }, }, // Aggressiveness = 1 { // Lowres { 100.0000f, // p = 1.000000 4.952535f, // p = 0.992984 1.720880f, // p = 0.848242 1.426233f, // p = 0.806314 1.491905f, // p = 0.816364 }, // Midres { 100.0000f, // p = 100.0000 3.137263f, // p = 0.958404 2.703262f, // p = 0.937219 1.877166f, // p = 0.867285 2.221149f, // p = 0.902133 }, // Hdres { 4.417680f, // p = 0.988082 3.086898f, // p = 0.956349 3.966704f, // p = 0.981416 1.532565f, // p = 0.822381 3.449975f, // p = 0.969230 }, }, // Aggressiveness = 2 { // lowres { 100.000000f, // p = 0.998048 1.484020f, // p = 0.815179 1.866781f, // p = 0.866085 1.706711f, // p = 0.846409 2.080369f, // p = 0.888980 }, // midres { 100.000000f, // p = 0.0 3.265763f, // p = 0.963235428881 2.024598f, // p = 0.883355591569 1.846446f, // p = 0.863709256976 2.240962f, // p = 0.903868036126 }, // hdres { 3.133026f, // p = 0.958234684141 2.940954f, // p = 0.949834204693 2.484544f, // p = 0.923051170045 1.702972f, // p = 0.845922460525 1.655562f, // p = 0.839641385729 }, }, // Aggressiveness = 3 { // lowres { 
100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f, 0.762099214988f }, // midres { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f, 0.557298794638f }, // hdres { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f, 1.86572095242f }, }, }; static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { // Aggressiveness = 0 { // lowres { -100.0f, // p = 0.0 -100.0f, // p = 0.0 -100.0f, // p = 0.0 -100.0f, // p = 0.0 -100.0f, // p = 0.0 }, // midres { -3.38168078f, // p = 0.032872917 -4.08610739f, // p = 0.016526795 -1.78302370f, // p = 0.15270848 -100.000000f, // p = 0.0 -100.000000f, // p = 0.0 }, // hdres { -100.000000f, // p = 0.0 -100.000000f, // p = 0.0 -2.98718897f, // p = 0.048008 -100.000000f, // p = 0.0 -3.33229488f, // p = 0.03447975 }, }, // Aggressiveness = 1 { // Lowres { -100.0000f, // p = 0.0 -4.893793f, // p = 0.007437 -3.387766f, // p = 0.032680 -2.982806f, // p = 0.048209 -2.330372f, // p = 0.088639 }, // Midres { -100.0000f, // p = 0.000000 -6.131853f, // p = 0.002168 -2.346579f, // p = 0.087338 -2.712849f, // p = 0.062219 -3.195430f, // p = 0.039338 }, // Hdres { -3.491416f, // p = 0.029557 -2.192853f, // p = 0.100394 -3.620180f, // p = 0.026079 -2.030855f, // p = 0.116001 -2.797586f, // p = 0.057455 }, }, // Aggressiveness = 2 { // lowres { -100.0000f, // p = 0.0 -3.617350f, // p = 0.026151 -5.902503f, // p = 0.002725 -4.677840f, // p = 0.009213 -2.168378f, // p = 0.102626 }, // midres { -100.0000f, // p = 0.0 -3.204195f, // p = 0.0390081679555 -2.354128f, // p = 0.0867382128969 -2.523326f, // p = 0.0742390077132 -3.112328f, // p = 0.0426016085803 }, // hdres { -5.047760f, // p = 0.00638270448225 -3.414994f, // p = 0.0318301469487 -5.628090f, // p = 0.00358255438917 -2.122691f, // p = 0.10691083145 -1.972387f, // p = 0.122132728355 }, }, // Aggressiveness = 3 { // lowres { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f, -1.0830321897f }, // midres { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f, -0.228236297886f }, // hdres { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f, -1.36741555171f }, }, }; static const float av1_simple_motion_search_split_mean_128[17] = { 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f, 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f, 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f, }; static const float av1_simple_motion_search_split_std_128[17] = { 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f, 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f, 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f, }; static const float av1_simple_motion_search_split_mean_64[17] = { 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f, 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f, 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f, }; static const float av1_simple_motion_search_split_std_64[17] = { 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f, 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f, 1.144391f, 1.125088f, 0.261289f, 1.145059f, 1.131215f, }; static const float av1_simple_motion_search_split_mean_32[17] = { 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f, 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f, 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f, }; static const float av1_simple_motion_search_split_std_32[17] = { 1.879269f, 1.883531f, 
1.935828f, 1.935677f, 1.915823f, 1.914773f, 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f, 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f, }; static const float av1_simple_motion_search_split_mean_16[17] = { 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f, 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f, 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f, }; static const float av1_simple_motion_search_split_std_16[17] = { 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f, 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f, 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f, }; static const float av1_simple_motion_search_split_mean_8[17] = { 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f, 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f, 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f, }; static const float av1_simple_motion_search_split_std_8[17] = { 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f, 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f, 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f, }; static const float *const av1_simple_motion_search_split_mean[5] = { av1_simple_motion_search_split_mean_128, av1_simple_motion_search_split_mean_64, av1_simple_motion_search_split_mean_32, av1_simple_motion_search_split_mean_16, av1_simple_motion_search_split_mean_8, }; static const float *const av1_simple_motion_search_split_std[5] = { av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64, av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16, av1_simple_motion_search_split_std_8, }; #define NUM_HIDDEN_LAYERS_128 1 #define NUM_FEATURES_128 17 #define NUM_LAYER_0_UNITS_128 20 #define NUM_LOGITS_128 1 static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = { 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f, -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f, -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f, 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f, 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f, 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f, 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f, 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f, 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f, 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f, -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f, 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f, 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f, -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f, -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f, 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f, 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, -0.284482f, -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f, 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f, 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f, 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f, 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f, 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f, -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f, 
0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f, 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f, 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f, -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f, -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f, 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f, -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f, -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f, -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f, 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f, -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f, -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f, -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f, 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f, -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f, -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f, 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f, 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f, 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f, -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f, -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f, -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f, 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f, 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f, -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f, -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f, -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f, 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f, 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f, 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f, 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f, 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f, -0.745167f, 0.318558f, 0.316608f, 0.568678f }; static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = { 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f, 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f, -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f, 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f }; static const float av1_simple_motion_search_split_logits_kernel_128[] = { 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f, 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f, -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f, -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f }; static const float av1_simple_motion_search_split_logits_bias_128[] = { 1.819351f }; static const NN_CONFIG av1_simple_motion_search_split_nn_config_128 = { NUM_FEATURES_128, NUM_LOGITS_128, NUM_HIDDEN_LAYERS_128, { NUM_LAYER_0_UNITS_128, }, { av1_simple_motion_search_split_hiddenlayer_0_kernel_128, av1_simple_motion_search_split_logits_kernel_128, }, { av1_simple_motion_search_split_hiddenlayer_0_bias_128, av1_simple_motion_search_split_logits_bias_128, }, }; #undef NUM_HIDDEN_LAYERS_128 #undef NUM_FEATURES_128 #undef NUM_LAYER_0_UNITS_128 #undef NUM_LOGITS_128 #define NUM_HIDDEN_LAYERS_64 1 #define NUM_FEATURES_64 17 #define NUM_LAYER_0_UNITS_64 24 #define NUM_LOGITS_64 1 static const float 
av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = { -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f, 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f, 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f, 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f, 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f, -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f, 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f, 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f, 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f, -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f, 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f, 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f, -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f, 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f, 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f, 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f, 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f, -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f, -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f, 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f, -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f, -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f, -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f, -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f, -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f, 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f, -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f, -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f, 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f, -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f, -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f, -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f, 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f, -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f, -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f, 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f, 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f, -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f, -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f, 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f, -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f, 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f, -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f, -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f, 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f, -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f, 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f, -0.0436525f, -0.380004f, -0.27187f, 0.534779f, 0.717939f, 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f, 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f, -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f, 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f, 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f, 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f, -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f, 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f, 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f, 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f, 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f, -0.0877194f, -0.19133f, 0.379121f, -0.687751f, 
-1.64077f, -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f, -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f, -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f, 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f, -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f, -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f, 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f, 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f, -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f, -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f, -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f, -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f, 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f, -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f, 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f, -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f, 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f, -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f, -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f, -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f, -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f, 0.266912f, 0.435824f, -0.123322f }; static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = { -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f, -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f, 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f, -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f }; static const float av1_simple_motion_search_split_logits_kernel_64[] = { -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f, -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f, 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f, -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f }; static const float av1_simple_motion_search_split_logits_bias_64[] = { 0.699037f }; static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = { NUM_FEATURES_64, NUM_LOGITS_64, NUM_HIDDEN_LAYERS_64, { NUM_LAYER_0_UNITS_64, }, { av1_simple_motion_search_split_hiddenlayer_0_kernel_64, av1_simple_motion_search_split_logits_kernel_64, }, { av1_simple_motion_search_split_hiddenlayer_0_bias_64, av1_simple_motion_search_split_logits_bias_64, }, }; #undef NUM_HIDDEN_LAYERS_64 #undef NUM_FEATURES_64 #undef NUM_LAYER_0_UNITS_64 #undef NUM_LOGITS_64 #define NUM_HIDDEN_LAYERS_32 1 #define NUM_FEATURES_32 17 #define NUM_LAYER_0_UNITS_32 20 #define NUM_LOGITS_32 1 static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = { -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f, 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f, -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f, -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f, 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f, -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f, 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f, -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f, 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f, 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f, 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f, -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f, 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f, 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f, 0.268276f, 
1.13251f, -1.0695f, -0.225607f, -0.984355f, -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f, 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f, -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f, 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f, -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f, -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f, -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f, -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f, 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f, 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f, -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f, -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f, -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f, 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f, 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f, 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f, 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f, -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f, 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f, 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f, -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f, 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f, 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f, 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f, 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f, 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f, 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f, -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f, 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f, 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f, -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f, -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f, -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f, -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f, 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f, 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f, -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f, 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f, 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f, 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f, -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f, 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f, -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f, -0.904296f, 0.166478f, 0.063268f, -0.302842f, -0.27179f, -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f, 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f, 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f, 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f, -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f, -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f, -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f, 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f, 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f }; static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = { 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f, -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f, -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f, -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f }; static const float av1_simple_motion_search_split_logits_kernel_32[] = { -0.2745374f, 0.333548f, -0.2437388f, 
0.288009f, 0.55635f, 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f, -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f, -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f }; static const float av1_simple_motion_search_split_logits_bias_32[] = { 0.208473f }; static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = { NUM_FEATURES_32, NUM_LOGITS_32, NUM_HIDDEN_LAYERS_32, { NUM_LAYER_0_UNITS_32, }, { av1_simple_motion_search_split_hiddenlayer_0_kernel_32, av1_simple_motion_search_split_logits_kernel_32, }, { av1_simple_motion_search_split_hiddenlayer_0_bias_32, av1_simple_motion_search_split_logits_bias_32, }, }; #undef NUM_HIDDEN_LAYERS_32 #undef NUM_FEATURES_32 #undef NUM_LAYER_0_UNITS_32 #undef NUM_LOGITS_32 #define NUM_HIDDEN_LAYERS_16 1 #define NUM_FEATURES_16 17 #define NUM_LAYER_0_UNITS_16 20 #define NUM_LOGITS_16 1 static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = { 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f, 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f, -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f, 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f, -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f, -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f, -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f, -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f, 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f, 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f, 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f, 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f, 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f, 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f, 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f, 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f, 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f, 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f, -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f, 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f, 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f, 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f, 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f, -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f, 0.203731f, 0.328734f, 0.668104f, -0.586909f, -0.501335f, -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f, 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f, 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f, 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f, 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f, 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f, -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f, -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f, -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f, -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f, 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f, -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f, 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f, -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f, 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f, -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f, -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f, -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f, 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f, 0.548214f, 0.127955f, -1.24387f, 
0.0863676f, 0.175071f, 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f, -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f, 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f, -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f, -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f, -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f, 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f, -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f, 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f, 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f, 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f, -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f, -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f, -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f, 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f, 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f, 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f, -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f, 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f, 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f, -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f, 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f, 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f }; static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = { 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f, 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f, 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f, 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f }; static const float av1_simple_motion_search_split_logits_kernel_16[] = { 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f, -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f, -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f, -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f }; static const float av1_simple_motion_search_split_logits_bias_16[] = { -0.783658f }; static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = { NUM_FEATURES_16, NUM_LOGITS_16, NUM_HIDDEN_LAYERS_16, { NUM_LAYER_0_UNITS_16, }, { av1_simple_motion_search_split_hiddenlayer_0_kernel_16, av1_simple_motion_search_split_logits_kernel_16, }, { av1_simple_motion_search_split_hiddenlayer_0_bias_16, av1_simple_motion_search_split_logits_bias_16, }, }; #undef NUM_HIDDEN_LAYERS_16 #undef NUM_FEATURES_16 #undef NUM_LAYER_0_UNITS_16 #undef NUM_LOGITS_16 #define NUM_HIDDEN_LAYERS_8 1 #define NUM_FEATURES_8 17 #define NUM_LAYER_0_UNITS_8 20 #define NUM_LOGITS_8 1 static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = { 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f, 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f, -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f, 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f, 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f, 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f, -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f, 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f, -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f, 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f, 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f, -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f, 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f, -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f, 0.687084f, -0.109973f, 
0.106636f, 0.445895f, 0.163432f, 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f, -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f, -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f, 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f, 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f, -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f, 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f, 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f, -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f, -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f, 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f, 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f, -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f, -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f, 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f, 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f, 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f, 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f, 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f, -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f, 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f, -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f, -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f, -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f, 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f, -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f, -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f, 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f, 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f, 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f, -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f, 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f, 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f, 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f, 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f, 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f, -0.390838f, 0.855488f, -0.596525f, -0.249093f, 0.124262f, 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f, -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f, -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f, -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f, -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f, 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f, -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f, 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f, 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f, 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f, 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f, -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f, 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f, 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f, -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f, -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f }; static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = { 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f, -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f, -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f }; static const float av1_simple_motion_search_split_logits_kernel_8[] = { -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f, -0.421457f, -0.378265f, 
1.053049f, 0.32432879f, -0.49775575f, 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f, 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f }; static const float av1_simple_motion_search_split_logits_bias_8[] = { -1.739754f }; static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = { NUM_FEATURES_8, NUM_LOGITS_8, NUM_HIDDEN_LAYERS_8, { NUM_LAYER_0_UNITS_8, }, { av1_simple_motion_search_split_hiddenlayer_0_kernel_8, av1_simple_motion_search_split_logits_kernel_8, }, { av1_simple_motion_search_split_hiddenlayer_0_bias_8, av1_simple_motion_search_split_logits_bias_8, }, }; #undef NUM_HIDDEN_LAYERS_8 #undef NUM_FEATURES_8 #undef NUM_LAYER_0_UNITS_8 #undef NUM_LOGITS_8 static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = { &av1_simple_motion_search_split_nn_config_128, &av1_simple_motion_search_split_nn_config_64, &av1_simple_motion_search_split_nn_config_32, &av1_simple_motion_search_split_nn_config_16, &av1_simple_motion_search_split_nn_config_8, }; // Model based on simple_motion_search for pruning rect // Thresholds. The first idx level is aggresiveness, second is frame resolution, // third is bsize static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = { // Aggressivness = 0 { // Lowres { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, 0.000961189195907f, 0.0f }, // Midres { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, 0.000961189195907f, 0.0f }, // Hdres { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, 0.000961189195907f, 0.0f }, }, // Aggressivness = 1 { // Lowres { 0.000000f, 0.116076f, 0.049759f, 0.057747f, 0.006001f, }, // Midres { 0.000000f, 0.017380f, 0.026077f, 0.078111f, 0.064477f, }, // Hdres { 0.002994f, 0.103093f, 0.076408f, 0.010456f, 0.187211f, }, }, // Aggressiveness = 2 { // Lowres { 0.000000f, 0.003111f, 0.144294f, 0.144884f, 0.069924f, }, // Midres { 0.000000f, 0.013696f, 0.055203f, 0.152271f, 0.078886f, }, // Hdres { 0.030577f, 0.082486f, 0.040690f, 0.140924f, 0.067608f, }, }, // Aggressiveness = 3 { // Lowres { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f, 0.287219697095f }, // Midres { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f, 0.178833795641f }, // Hdres { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f, 0.21329309279f }, }, }; // Mean and std static const float av1_simple_motion_search_prune_rect_mean_128[25] = { 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f, 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f, 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f, 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f, 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f, }; static const float av1_simple_motion_search_prune_rect_std_128[25] = { 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f, 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f, 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f, 1.208679f, 0.353742f, 1.228122f, 1.211777f, }; static const float av1_simple_motion_search_prune_rect_mean_64[25] = { 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f, 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f, 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f, 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f, 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f, }; static const float av1_simple_motion_search_prune_rect_std_64[25] = { 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 
3.628949f, 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f, 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f, 1.081292f, 0.257521f, 1.112510f, 1.089404f, }; static const float av1_simple_motion_search_prune_rect_mean_32[25] = { 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f, 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f, 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f, 2.751266f, 0.963302f, 2.716584f, 2.709725f, }; static const float av1_simple_motion_search_prune_rect_std_32[25] = { 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f, 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f, 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f, 0.952221f, 0.188018f, 0.985295f, 0.946228f, }; static const float av1_simple_motion_search_prune_rect_mean_16[25] = { 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f, 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, 2.131698f, 0.981005f, 2.110868f, 2.106539f, }; static const float av1_simple_motion_search_prune_rect_std_16[25] = { 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, 0.829935f, 0.136507f, 0.828972f, 0.808563f, }; static const float av1_simple_motion_search_prune_rect_mean_8[25] = { 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, 1.531762f, 0.989606f, 1.496581f, 1.484139f, }; static const float av1_simple_motion_search_prune_rect_std_8[25] = { 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, 0.754040f, 0.101419f, 0.738239f, 0.729455f, }; static const float *const av1_simple_motion_search_prune_rect_mean[5] = { av1_simple_motion_search_prune_rect_mean_128, av1_simple_motion_search_prune_rect_mean_64, av1_simple_motion_search_prune_rect_mean_32, av1_simple_motion_search_prune_rect_mean_16, av1_simple_motion_search_prune_rect_mean_8, }; static const float *const av1_simple_motion_search_prune_rect_std[5] = { av1_simple_motion_search_prune_rect_std_128, av1_simple_motion_search_prune_rect_std_64, av1_simple_motion_search_prune_rect_std_32, av1_simple_motion_search_prune_rect_std_16, av1_simple_motion_search_prune_rect_std_8, }; #define NUM_HIDDEN_LAYERS_128 1 #define NUM_FEATURES_128 25 #define NUM_LAYER_0_UNITS_128 8 #define NUM_LOGITS_128 4 static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = { -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, 0.398452f, 0.696949f }; static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = { 1.22789f, -1.34527f, 
0.759048f, 0.315086f, 1.0834f, -1.58019f, -0.465158f, 1.20716f }; static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = { -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f, 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f }; static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = { 1.58571f, -4.6314f, -2.00273f, 0.543699f }; static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = { NUM_FEATURES_128, NUM_LOGITS_128, NUM_HIDDEN_LAYERS_128, { NUM_LAYER_0_UNITS_128, }, { av1_simple_motion_search_prune_rect_layer_0_kernel_128, av1_simple_motion_search_prune_rect_logits_kernel_128, }, { av1_simple_motion_search_prune_rect_layer_0_bias_128, av1_simple_motion_search_prune_rect_logits_bias_128, }, }; #undef NUM_HIDDEN_LAYERS_128 #undef NUM_FEATURES_128 #undef NUM_LAYER_0_UNITS_128 #undef NUM_LOGITS_128 #define NUM_HIDDEN_LAYERS_64 1 #define NUM_FEATURES_64 25 #define NUM_LAYER_0_UNITS_64 32 #define NUM_LOGITS_64 10 static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = { 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, 
0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f, 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, -0.359633f, 0.668108f }; static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = { 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, 
-0.683467f, 0.155765f, -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, 0.656818f, 0.0169274f }; static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = { -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f, 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f, 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f, -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, -0.260993f, -0.947743f, -1.0789f, 
-0.0391231f, 0.612407f, -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f, 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f, -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, 
-0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f, -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f }; static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = { 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f }; static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = { NUM_FEATURES_64, NUM_LOGITS_64, NUM_HIDDEN_LAYERS_64, { NUM_LAYER_0_UNITS_64, }, { av1_simple_motion_search_prune_rect_layer_0_kernel_64, av1_simple_motion_search_prune_rect_logits_kernel_64, }, { av1_simple_motion_search_prune_rect_layer_0_bias_64, av1_simple_motion_search_prune_rect_logits_bias_64, }, }; #undef NUM_HIDDEN_LAYERS_64 #undef NUM_FEATURES_64 #undef NUM_LAYER_0_UNITS_64 #undef NUM_LOGITS_64 #define NUM_HIDDEN_LAYERS_32 1 #define NUM_FEATURES_32 25 #define NUM_LAYER_0_UNITS_32 28 #define NUM_LOGITS_32 10 static const float 
av1_simple_motion_search_prune_rect_logits_kernel_32[] = { 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f, -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f }; static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = { 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f, 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, 0.0909503f, 0.710595f, 
0.032786f, 0.525891f, -1.0232f, 0.732557f, -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, 0.59681f, -0.472405f, 0.0969218f, -0.250624f }; static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = { 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f, 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f, -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, -0.00300509f, 
0.317105f, 0.216852f, 0.479718f, 0.0485808f, -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f, -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f, -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f, -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, 0.0340241f, -0.0844545f, 
0.61729f, -0.17596f, 0.241149f, -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f, -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f }; static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = { 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f }; static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = { NUM_FEATURES_32, NUM_LOGITS_32, NUM_HIDDEN_LAYERS_32, { NUM_LAYER_0_UNITS_32, }, { av1_simple_motion_search_prune_rect_layer_0_kernel_32, av1_simple_motion_search_prune_rect_logits_kernel_32, }, { av1_simple_motion_search_prune_rect_layer_0_bias_32, av1_simple_motion_search_prune_rect_logits_bias_32, }, }; #undef NUM_HIDDEN_LAYERS_32 #undef NUM_FEATURES_32 #undef NUM_LAYER_0_UNITS_32 #undef NUM_LOGITS_32 #define NUM_HIDDEN_LAYERS_16 1 #define NUM_FEATURES_16 25 #define NUM_LAYER_0_UNITS_16 32 #define NUM_LOGITS_16 10 static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = { -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f, 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f, -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, 0.238191f, 
-1.19083f, -0.30667f, -2.4324f, 0.235311f, 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f, -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f }; static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = { -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f, 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, 0.661496f, 0.95533f }; static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = { -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f, 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, 0.048868f, -0.0097675f, 0.0708324f, 
0.0456103f, 0.0149062f, -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f, 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f, 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, 0.310292f, 
-0.297675f, -0.359935f, 0.521021f, -0.10082f, -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f, -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f, -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f, 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, -0.109533f, 
-0.709307f, 0.386424f, 0.40201f, 0.262211f, -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f }; static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = { 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f }; static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = { NUM_FEATURES_16, NUM_LOGITS_16, NUM_HIDDEN_LAYERS_16, { NUM_LAYER_0_UNITS_16, }, { av1_simple_motion_search_prune_rect_layer_0_kernel_16, av1_simple_motion_search_prune_rect_logits_kernel_16, }, { av1_simple_motion_search_prune_rect_layer_0_bias_16, av1_simple_motion_search_prune_rect_logits_bias_16, }, }; #undef NUM_HIDDEN_LAYERS_16 #undef NUM_FEATURES_16 #undef NUM_LAYER_0_UNITS_16 #undef NUM_LOGITS_16 #define NUM_HIDDEN_LAYERS_8 1 #define NUM_FEATURES_8 25 #define NUM_LAYER_0_UNITS_8 32 #define NUM_LOGITS_8 4 static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = { -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f, -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, 0.223768f, -0.0710733f, -0.346679f, 
-0.0745909f, 0.171032f, 0.215701f, 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f, -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, -0.112242f, 0.295184f }; static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = { -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, -0.490783f, -0.415782f }; static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = { -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f, -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, -0.529777f, 0.682967f, -0.412052f, 0.611947f, 
-0.83676f, 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f, -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f, 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, 
0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f, 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f, -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f, -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f }; static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = { 1.63404f, -0.715866f, -1.0132f, -2.08745f }; static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = { NUM_FEATURES_8, NUM_LOGITS_8, NUM_HIDDEN_LAYERS_8, { NUM_LAYER_0_UNITS_8, }, { av1_simple_motion_search_prune_rect_layer_0_kernel_8, 
av1_simple_motion_search_prune_rect_logits_kernel_8, }, { av1_simple_motion_search_prune_rect_layer_0_bias_8, av1_simple_motion_search_prune_rect_logits_bias_8, }, }; #undef NUM_HIDDEN_LAYERS_8 #undef NUM_FEATURES_8 #undef NUM_LAYER_0_UNITS_8 #undef NUM_LOGITS_8 static const NN_CONFIG *const av1_simple_motion_search_prune_rect_nn_config[5] = { &av1_simple_motion_search_prune_rect_nn_config_128, &av1_simple_motion_search_prune_rect_nn_config_64, &av1_simple_motion_search_prune_rect_nn_config_32, &av1_simple_motion_search_prune_rect_nn_config_16, &av1_simple_motion_search_prune_rect_nn_config_8, }; // nn model for predicting max square partition level of a superblock #define NUM_HIDDEN_LAYERS 1 #define NUM_FEATURES 13 #define NUM_LAYER_0_UNITS 48 #define NUM_LOGITS 4 static const float av1_max_part_pred_logits_kernel[] = { -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f, 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, 0.208747f, 0.448697f }; static const float av1_max_part_pred_layer_0_bias[] = { -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f, -0.332426f, 
-0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f }; static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, 1.96217f, 0.728905f }; static const float av1_max_part_pred_layer_0_kernel[] = { 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f, -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f, -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, 0.0710224f, -0.16548f, -0.100993f, 0.931481f, 
-3.20738f, -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f, -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f, 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f, 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, -0.265823f, 1.15284f, 0.307927f, -0.695308f, 
0.13725f, -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, 1.36966f, 0.869475f, -0.0302774f, -0.0537556f }; static const NN_CONFIG av1_max_part_pred_nn_config = { NUM_FEATURES, NUM_LOGITS, NUM_HIDDEN_LAYERS, { NUM_LAYER_0_UNITS, }, { av1_max_part_pred_layer_0_kernel, av1_max_part_pred_logits_kernel, }, { av1_max_part_pred_layer_0_bias, av1_max_part_pred_logits_bias, }, }; #undef NUM_HIDDEN_LAYERS #undef NUM_FEATURES #undef NUM_LAYER_0_UNITS #undef NUM_LOGITS // Early termination in second pass static const float av1_simple_motion_search_term_none_mean_128[28] = { 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, 4.298179f, 8.514713f, 14.911736f, 19.825352f, }; static const float av1_simple_motion_search_term_none_std_128[28] = { 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, }; static const float av1_simple_motion_search_term_none_mean_64[28] = { 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f, 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, 3.573322f, 8.807137f, 13.348477f, 18.269117f, }; static const float av1_simple_motion_search_term_none_std_64[28] = { 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, }; static const float av1_simple_motion_search_term_none_mean_32[28] = { 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f, 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, }; static const float av1_simple_motion_search_term_none_std_32[28] = { 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, }; static const float av1_simple_motion_search_term_none_mean_16[28] = { 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, }; static const float 
av1_simple_motion_search_term_none_std_16[28] = { 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, }; static const float av1_simple_motion_search_term_none_model_128[] = { -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, -0.5493146094f, }; static const float av1_simple_motion_search_term_none_model_64[] = { -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, -0.4337360901f, }; static const float av1_simple_motion_search_term_none_model_32[] = { -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f, 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, -0.6609679881f, }; static const float av1_simple_motion_search_term_none_model_16[] = { -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, -0.5396254205f, }; #define FEATURES 31 #define HIDDEN_NODES 32 static const float av1_early_term_after_split_nn_weights_64_layer0[] = { -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f, -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f, -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f, -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f, 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f, 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f, 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f, 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f, -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f, 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f, -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f, -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f, -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f, -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f, -0.329927f, -0.077140f, 
0.001177f, -0.024415f, 0.134044f, -0.181587f, -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f, -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f, 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f, 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f, -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f, -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f, -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f, -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f, -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f, 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f, -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f, -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f, -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f, 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f, 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f, -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f, -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f, 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f, 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f, 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f, 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f, -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f, 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f, -0.227820f, 0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f, -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f, -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f, 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f, -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f, -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f, 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f, -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f, -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f, 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f, 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f, -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f, -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f, -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f, -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f, 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f, -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f, -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f, -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f, -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f, -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f, -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f, 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f, -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f, -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f, 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f, -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f, 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, 
-0.106435f, 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f, 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f, -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f, 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f, 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f, -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f, 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f, 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f, 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f, -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f, -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f, -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f, 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f, 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f, -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f, 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f, -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f, -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f, -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f, -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f, 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f, 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f, -0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f, -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f, 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f, -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f, 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f, 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f, 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f, 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f, -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f, 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f, -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f, 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f, -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f, -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f, -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f, 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f, -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f, -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f, -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f, -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f, 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f, -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f, -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f, -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f, 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f, 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f, -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f, -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f, 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f, 
0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f, -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f, 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f, -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f, -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f, -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f, 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f, -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f, 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f, -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f, 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f, 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f, 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f, -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f, 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f, 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f, 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f, 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f, -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f, -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f, -0.389184f, -0.256661f, 0.160107f, 0.037127f, -0.082573f, -0.095815f, -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f, -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f, 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f, -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f, 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f, 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f, -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f, -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f, 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f, -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f, 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f, -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f, -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f, -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f, -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f, -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f, 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f, 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f, -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f, -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f, 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f, 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f, 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f, -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f, -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f, -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f, 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f, -0.279423f, -0.270683f, }; static const float av1_early_term_after_split_nn_bias_64_layer0[] = { -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f, 0.567071f, 0.441402f, 0.277521f, 
0.314498f, -0.448199f, -0.065032f, 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f, 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f, 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f, -0.109614f, -0.164492f, }; static const float av1_early_term_after_split_nn_weights_64_layer1[] = { -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f, -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f, -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f, -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f, -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f, -0.230698f, -0.011437f, }; static const float av1_early_term_after_split_nn_bias_64_layer1[] = { -0.55403697f, }; static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = { FEATURES, 1, 1, { HIDDEN_NODES, }, { av1_early_term_after_split_nn_weights_64_layer0, av1_early_term_after_split_nn_weights_64_layer1, }, { av1_early_term_after_split_nn_bias_64_layer0, av1_early_term_after_split_nn_bias_64_layer1, }, }; static const float av1_early_term_after_split_nn_weights_32_layer0[] = { 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f, 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, -0.579545f, -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f, 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f, -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f, 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f, -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f, 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f, -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f, 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f, -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f, -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f, 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f, 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f, -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f, 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f, -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f, -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f, -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f, -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f, 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f, -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f, -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f, -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f, -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f, 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f, -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f, 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f, 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f, -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f, -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f, 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f, 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f, 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f, -0.918408f, -0.001903f, 
-0.052303f, -0.004816f, -0.446393f, -0.053038f, 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f, -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f, -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f, -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f, -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f, -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f, -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f, -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f, 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f, 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f, -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f, -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f, -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f, -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f, -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f, -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f, -0.293680f, -0.105049f, 0.315317f, 0.279569f, 0.220762f, 0.088161f, -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f, 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f, 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f, -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f, -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f, 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f, 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f, -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f, -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f, 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f, -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f, 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f, -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f, 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f, 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f, 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f, 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f, -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f, -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f, -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f, 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f, -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f, -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f, 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f, -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f, -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f, -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f, -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f, -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f, 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f, -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f, -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f, 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f, 0.259420f, -0.116278f, 
0.127287f, 0.236789f, -0.097438f, 0.118073f, 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f, 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f, 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f, -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f, -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f, 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f, -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f, -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f, -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f, 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f, -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f, 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f, -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f, -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f, -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f, -0.286703f, -0.014739f, -0.072621f, -0.311506f, -0.048639f, 0.081621f, 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f, 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f, 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f, 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f, 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f, -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f, -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f, 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f, 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f, -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f, -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f, -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f, 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f, -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f, 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f, 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f, -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f, -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f, -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f, 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f, 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f, 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f, -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f, 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f, -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f, 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f, 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f, -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f, 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f, -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f, -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f, 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f, 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f, 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f, -0.179275f, -0.054297f, -0.054948f, 0.029351f, 
0.190560f, 0.102476f, -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f, -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f, 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f, -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f, 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f, 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f, 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f, -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f, -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f, 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f, 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f, -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f, 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f, 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f, 0.105991f, -0.536757f, -0.424031f, -0.226034f, 0.213635f, -0.526754f, 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f, -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f, -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f, 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f, 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f, -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f, 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f, 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f, -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f, 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f, 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f, 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f, -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f, 0.129726f, -0.038396f, }; static const float av1_early_term_after_split_nn_bias_32_layer0[] = { -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f, 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f, 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f, -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f, -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f, -0.078358f, -0.007740f, }; static const float av1_early_term_after_split_nn_weights_32_layer1[] = { 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f, -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f, -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f, -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f, -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f, 0.260045f, -0.223243f, }; static const float av1_early_term_after_split_nn_bias_32_layer1[] = { -0.07571174f, }; static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = { FEATURES, 1, 1, { HIDDEN_NODES, }, { av1_early_term_after_split_nn_weights_32_layer0, av1_early_term_after_split_nn_weights_32_layer1, }, { av1_early_term_after_split_nn_bias_32_layer0, av1_early_term_after_split_nn_bias_32_layer1, }, }; static const float av1_early_term_after_split_nn_weights_16_layer0[] = { -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f, -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f, 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f, -0.309724f, 
0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f, 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f, -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f, 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f, -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f, -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f, -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f, 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f, 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f, 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f, -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f, -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f, -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, -0.172708f, 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f, 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f, 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f, -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f, -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f, -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f, 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f, -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f, -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f, 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f, 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f, -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f, 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f, -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f, -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f, -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f, 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f, 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f, -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f, 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f, 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f, -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f, 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f, -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f, -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f, -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f, -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f, -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f, 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f, -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f, -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f, 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f, -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f, 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f, 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f, 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f, 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f, -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f, 0.256545f, -0.262405f, 0.225951f, -0.287463f, 
-0.189203f, -0.055552f, -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f, -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f, -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f, -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f, 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f, 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f, -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f, -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f, -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f, 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f, 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 0.073113f, -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f, 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f, 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f, -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f, -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f, -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f, -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f, -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f, -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f, -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f, 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f, 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f, 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f, -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f, 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f, 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f, -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f, 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f, -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f, -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f, 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f, -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f, 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f, 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f, -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f, -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f, 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f, 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f, -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f, 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f, -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f, 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f, 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f, -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f, 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f, 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f, 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f, -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f, 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f, 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f, 
0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f, -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f, -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f, 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f, 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f, -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f, 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f, 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f, -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f, -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 0.145802f, -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f, -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f, -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f, -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f, -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f, -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f, -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f, 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f, -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f, 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f, 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f, -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f, -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f, 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f, -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f, 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f, -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f, 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f, -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f, -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f, -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f, 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f, -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f, -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f, -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f, 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f, -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f, 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f, -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f, -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f, -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f, -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f, -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f, -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f, -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f, 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f, -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f, -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f, 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f, 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f, 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f, 0.029792f, -0.025927f, 
0.060640f, 0.559864f, 0.422229f, 0.198459f, 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f, -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f, -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f, -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f, -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f, -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f, 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f, 0.154372f, -0.483994f, }; static const float av1_early_term_after_split_nn_bias_16_layer0[] = { -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f, 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f, -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f, -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f, 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f, -0.110874f, -0.111671f, }; static const float av1_early_term_after_split_nn_weights_16_layer1[] = { -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f, 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f, -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f, -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f, 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f, 0.313278f, -0.555802f, }; static const float av1_early_term_after_split_nn_bias_16_layer1[] = { 0.16553f, }; static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = { FEATURES, 1, 1, { HIDDEN_NODES, }, { av1_early_term_after_split_nn_weights_16_layer0, av1_early_term_after_split_nn_weights_16_layer1, }, { av1_early_term_after_split_nn_bias_16_layer0, av1_early_term_after_split_nn_bias_16_layer1, }, }; static const float av1_early_term_after_split_nn_weights_8_layer0[] = { -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f, 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f, -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f, -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f, -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f, 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f, 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f, 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f, -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f, -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f, -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f, -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f, -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f, 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f, -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f, 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f, -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f, 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f, 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f, 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f, 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f, -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f, 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f, -0.450261f, 5.608051f, 0.190491f, 0.110220f, 
-0.595360f, -0.446324f, 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f, 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f, -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f, -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f, -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f, -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f, 0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f, 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f, 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f, 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f, 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f, 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f, -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f, 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f, 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f, -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f, 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f, -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f, -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f, -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f, 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f, 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f, -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f, -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f, -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f, -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f, -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f, -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f, -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f, 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f, -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f, 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f, -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f, 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f, -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f, -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f, -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f, 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f, 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f, -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f, 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f, -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f, -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f, -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f, 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f, -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f, -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f, -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f, -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f, -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f, -0.070175f, -0.412141f, 0.153458f, -0.081280f, 
0.164669f, -0.356396f, -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f, -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f, -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f, 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f, 0.124249f, -0.550804f, -0.420397f, -0.123462f, 0.333292f, -0.240230f, -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f, 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f, 0.008714f, -0.064018f, -0.124873f, -0.334014f, }; static const float av1_early_term_after_split_nn_bias_8_layer0[] = { 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f, 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f, -0.225995f, 0.370877f, -0.214821f, -0.227752f, }; static const float av1_early_term_after_split_nn_weights_8_layer1[] = { 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f, -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f, 0.077994f, -0.269141f, 0.011180f, -0.019262f, }; static const float av1_early_term_after_split_nn_bias_8_layer1[] = { -1.29585564f, }; static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = { FEATURES, 1, 1, { 16, }, { av1_early_term_after_split_nn_weights_8_layer0, av1_early_term_after_split_nn_weights_8_layer1, }, { av1_early_term_after_split_nn_bias_8_layer0, av1_early_term_after_split_nn_bias_8_layer1, }, }; #undef FEATURES #undef HIDDEN_NODES #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ aom-3.12.1/av1/encoder/partition_search.c000066400000000000000000010064361477627663500202230ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/aom_config.h" #include "aom_dsp/txfm_common.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/enums.h" #include "av1/common/reconintra.h" #include "av1/encoder/aq_complexity.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/nonrd_opt.h" #include "av1/encoder/partition_search.h" #include "av1/encoder/partition_strategy.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/var_based_part.h" #include "av1/encoder/av1_ml_partition_models.h" #if CONFIG_TUNE_VMAF #include "av1/encoder/tune_vmaf.h" #endif #define COLLECT_MOTION_SEARCH_FEATURE_SB 0 #if CONFIG_PARTITION_SEARCH_ORDER void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->partition_search_type = SEARCH_PARTITION; part_sf->less_rectangular_check_level = 0; part_sf->use_square_partition_only_threshold = BLOCK_128X128; part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; part_sf->default_max_partition_size = BLOCK_LARGEST; part_sf->default_min_partition_size = BLOCK_4X4; part_sf->adjust_var_based_rd_partitioning = 0; part_sf->max_intra_bsize = BLOCK_LARGEST; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. part_sf->fixed_partition_size = BLOCK_16X16; // Recode loop tolerance %. part_sf->partition_search_breakout_dist_thr = 0; part_sf->partition_search_breakout_rate_thr = 0; part_sf->prune_ext_partition_types_search_level = 0; part_sf->prune_part4_search = 0; part_sf->ml_prune_partition = 0; part_sf->ml_early_term_after_part_split_level = 0; for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { part_sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled. } part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0; part_sf->simple_motion_search_split = 0; part_sf->simple_motion_search_prune_rect = 0; part_sf->simple_motion_search_early_term_none = 0; part_sf->simple_motion_search_reduce_search_steps = 0; part_sf->intra_cnn_based_part_prune_level = 0; part_sf->ext_partition_eval_thresh = BLOCK_8X8; part_sf->rect_partition_eval_thresh = BLOCK_128X128; part_sf->ext_part_eval_based_on_cur_best = 0; part_sf->prune_ext_part_using_split_info = 0; part_sf->prune_rectangular_split_based_on_qidx = 0; part_sf->early_term_after_none_split = 0; part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; part_sf->simple_motion_search_rect_split = 0; part_sf->reuse_prev_rd_results_for_part_ab = 0; part_sf->reuse_best_prediction_for_part_ab = 0; part_sf->use_best_rd_for_pruning = 0; part_sf->skip_non_sq_part_based_on_none = 0; } // Reset speed features that works for the baseline encoding, but // blocks the external partition search. void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) { cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0; } #endif // CONFIG_PARTITION_SEARCH_ORDER #if !CONFIG_REALTIME_ONLY // If input |features| is NULL, write tpl stats to file for each super block. // Otherwise, store tpl stats to |features|. // The tpl stats is computed in the unit of tpl_bsize_1d (16x16). // When writing to text file: // The first row contains super block position, super block size, // tpl unit length, number of units in the super block. 
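// (Illustrative example, not from the original source: with the default
// 16x16 tpl unit, a 64x64 superblock at mi_row 0 and mi_col 0 would be
// announced by a first row such as "0,0,12,16,16" -- assuming BLOCK_64X64
// maps to the integer 12 in this build -- since the block then holds
// 4x4 = 16 tpl units.)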
// The second row contains the intra prediction cost for each unit. // The third row contains the inter prediction cost for each unit. // The forth row contains the motion compensated dependency cost for each unit. static void collect_tpl_stats_sb(const AV1_COMP *const cpi, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, aom_partition_features_t *features) { const AV1_COMMON *const cm = &cpi->common; GF_GROUP *gf_group = &cpi->ppi->gf_group; if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE || gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { return; } TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; // If tpl stats is not established, early return if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) { if (features != NULL) features->sb_features.tpl_features.available = 0; return; } const int tpl_stride = tpl_frame->stride; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; const int mi_width = AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); const int mi_height = AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); const int col_steps = (mi_width / step) + ((mi_width % step) > 0); const int row_steps = (mi_height / step) + ((mi_height % step) > 0); const int num_blocks = col_steps * row_steps; if (features == NULL) { char filename[256]; snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d", cpi->oxcf.partition_info_path, cpi->sb_counter); FILE *pfile = fopen(filename, "w"); fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize, tpl_data->tpl_bsize_1d, num_blocks); int count = 0; for (int row = 0; row < mi_height; row += step) { for (int col = 0; col < mi_width; col += step) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; fprintf(pfile, "%.0f", (double)this_stats->intra_cost); if (count < num_blocks - 1) fprintf(pfile, ","); ++count; } } fprintf(pfile, "\n"); count = 0; for (int row = 0; row < mi_height; row += step) { for (int col = 0; col < mi_width; col += step) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; fprintf(pfile, "%.0f", (double)this_stats->inter_cost); if (count < num_blocks - 1) fprintf(pfile, ","); ++count; } } fprintf(pfile, "\n"); count = 0; for (int row = 0; row < mi_height; row += step) { for (int col = 0; col < mi_width; col += step) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; const int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); fprintf(pfile, "%.0f", (double)mc_dep_delta); if (count < num_blocks - 1) fprintf(pfile, ","); ++count; } } fclose(pfile); } else { features->sb_features.tpl_features.available = 1; features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d; features->sb_features.tpl_features.num_units = num_blocks; int count = 0; for (int row = 0; row < mi_height; row += step) { for (int col = 0; col < mi_width; col += step) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; const int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); features->sb_features.tpl_features.intra_cost[count] = 
this_stats->intra_cost; features->sb_features.tpl_features.inter_cost[count] = this_stats->inter_cost; features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta; ++count; } } } } #endif // !CONFIG_REALTIME_ONLY static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, int blk_row, int blk_col, uint8_t allow_update_cdf) { MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; const int max_blocks_high = max_block_high(xd, bsize, 0); const int max_blocks_wide = max_block_wide(xd, bsize, 0); int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, mbmi->bsize, tx_size); const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; assert(tx_size > TX_4X4); if (depth == MAX_VARTX_DEPTH) { // Don't add to counts in this case mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); return; } if (tx_size == plane_tx_size) { #if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][0]; #endif if (allow_update_cdf) update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2); mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; #if CONFIG_ENTROPY_STATS ++counts->txfm_partition[ctx][1]; #endif if (allow_update_cdf) update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2); ++x->txfm_search_info.txb_split_count; if (sub_txs == TX_4X4) { mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { int offsetr = row; int offsetc = col; update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, blk_col + offsetc, allow_update_cdf); } } } } static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE plane_bsize, FRAME_COUNTS *td_counts, uint8_t allow_update_cdf) { MACROBLOCKD *xd = &x->e_mbd; const int mi_width = mi_size_wide[plane_bsize]; const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); for (int idy = 0; idy < mi_height; idy += bh) { for (int idx = 0; idx < mi_width; idx += bw) { update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx, allow_update_cdf); } } } static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, int blk_col) { MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; const int max_blocks_high = max_block_high(xd, bsize, 0); const int max_blocks_wide = max_block_wide(xd, bsize, 0); const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; if (blk_row >= max_blocks_high || blk_col >= 
max_blocks_wide) return; if (tx_size == plane_tx_size) { mbmi->tx_size = tx_size; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, tx_size, tx_size); } else { if (tx_size == TX_8X8) { mbmi->inter_tx_size[txb_size_index] = TX_4X4; mbmi->tx_size = TX_4X4; txfm_partition_update(xd->above_txfm_context + blk_col, xd->left_txfm_context + blk_row, TX_4X4, tx_size); return; } const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); for (int row = 0; row < row_end; row += bsh) { const int offsetr = blk_row + row; for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; set_txfm_context(xd, sub_txs, offsetr, offsetc); } } } } static void tx_partition_set_contexts(const AV1_COMMON *const cm, MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) { const int mi_width = mi_size_wide[plane_bsize]; const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); for (int idy = 0; idy < mi_height; idy += bh) { for (int idx = 0; idx < mi_width; idx += bw) { set_txfm_context(xd, max_tx_size, idy, idx); } } } static void update_zeromv_cnt(const AV1_COMP *const cpi, const MB_MODE_INFO *const mi, int mi_row, int mi_col, BLOCK_SIZE bsize) { if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) || mi->segment_id > CR_SEGMENT_ID_BOOST2) { return; } const AV1_COMMON *const cm = &cpi->common; const MV mv = mi->mv[0].as_mv; const int bw = mi_size_wide[bsize] >> 1; const int bh = mi_size_high[bsize] >> 1; const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw); const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh); const int block_index = (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1); for (int y = 0; y < ymis; y++) { for (int x = 0; x < xmis; x++) { // consec_zero_mv is in the scale of 8x8 blocks const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x; if (abs(mv.row) < 10 && abs(mv.col) < 10) { if (cpi->consec_zero_mv[map_offset] < 255) cpi->consec_zero_mv[map_offset]++; } else { cpi->consec_zero_mv[map_offset] = 0; } } } } static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TokenExtra **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO **mi_4x4 = xd->mi; MB_MODE_INFO *mbmi = mi_4x4[0]; const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); const int mis = cm->mi_params.mi_stride; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; const int is_inter = is_inter_block(mbmi); // Initialize tx_mode and tx_size_search_method TxfmSearchParams *txfm_params = &x->txfm_search_params; set_tx_size_search_method( cm, &cpi->winner_mode_params, txfm_params, cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; if (!is_inter) { 
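    // Intra path: decide whether the luma reconstruction needs to be stored
    // for CfL prediction, encode each plane with the chosen intra mode, and
    // then tokenize (or, for DRY_RUN_COSTCOEFFS, rate-cost) any palette maps.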
xd->cfl.store_y = store_cfl_required(cm, xd); mbmi->skip_txfm = 1; for (int plane = 0; plane < num_planes; ++plane) { av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]); } // If there is at least one lossless segment, force the skip for intra // block to be 0, in order to avoid the segment_id to be changed by in // write_segment_id(). if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && cpi->enc_seg.has_lossless_segment) mbmi->skip_txfm = 0; xd->cfl.store_y = 0; if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { if (mbmi->palette_mode_info.palette_size[plane] > 0) { if (!dry_run) { av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, PALETTE_MAP, tile_data->allow_update_cdf, td->counts); } else if (dry_run == DRY_RUN_COSTCOEFFS) { *rate += av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); } } } } av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize, tile_data->allow_update_cdf); } else { int ref; const int is_compound = has_second_ref(mbmi); set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); for (ref = 0; ref < 1 + is_compound; ++ref) { const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, xd->block_ref_scale_factors[ref], num_planes); } // Predicted sample of inter mode (for Luma plane) cannot be reused if // nonrd_check_partition_split speed feature is enabled, Since in such cases // the buffer may not contain the predicted sample of best mode. const int start_plane = (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) && cm->seq_params->bit_depth == AOM_BITS_8) ? 
1 : 0; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, start_plane, av1_num_planes(cm) - 1); if (mbmi->motion_mode == OBMC_CAUSAL) { assert(cpi->oxcf.motion_mode_cfg.enable_obmc); av1_build_obmc_inter_predictors_sb(cm, xd); } #if CONFIG_MISMATCH_DEBUG if (dry_run == OUTPUT_ENABLED) { for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; int pixel_c, pixel_r; mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, pd->subsampling_y); if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, pd->subsampling_y)) continue; mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->current_frame.order_hint, plane, pixel_c, pixel_r, pd->width, pd->height, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); } } #else (void)num_planes; #endif av1_encode_sb(cpi, x, bsize, dry_run); av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate, tile_data->allow_update_cdf); } if (!dry_run) { if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1; if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 && !(is_inter && (mbmi->skip_txfm || seg_skip))) { if (is_inter) { tx_partition_count_update(cm, x, bsize, td->counts, tile_data->allow_update_cdf); } else { if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) ++x->txfm_search_info.txb_split_count; if (block_signals_txsize(bsize)) { const int tx_size_ctx = get_tx_size_context(xd); const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); const int depth = tx_size_to_depth(mbmi->tx_size, bsize); const int max_depths = bsize_to_max_depth(bsize); if (tile_data->allow_update_cdf) update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], depth, max_depths + 1); #if CONFIG_ENTROPY_STATS ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; #endif } } assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); } else { int i, j; TX_SIZE intra_tx_size; // The new intra coding scheme requires no change of transform size if (is_inter) { if (xd->lossless[mbmi->segment_id]) { intra_tx_size = TX_4X4; } else { intra_tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); } } else { intra_tx_size = mbmi->tx_size; } const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width); const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height); for (j = 0; j < rows; j++) { for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size; } if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txfm_search_info.txb_split_count; } } if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && block_signals_txsize(mbmi->bsize) && is_inter && !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) { if (dry_run) tx_partition_set_contexts(cm, xd, bsize); } else { TX_SIZE tx_size = mbmi->tx_size; // The new intra coding scheme requires no change of transform size if (is_inter) { if (xd->lossless[mbmi->segment_id]) { tx_size = TX_4X4; } else { tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); } } else { tx_size = (bsize > BLOCK_4X4) ? 
tx_size : TX_4X4; } mbmi->tx_size = tx_size; set_txfm_ctxs(tx_size, xd->width, xd->height, (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd); } #if !CONFIG_REALTIME_ONLY if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) { cfl_store_block(xd, mbmi->bsize, mbmi->tx_size); } #endif if (!dry_run) { if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 && cpi->sf.rt_sf.use_temporal_noise_estimate && (!cpi->ppi->use_svc || (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize); } } static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, AQ_MODE aq_mode, MB_MODE_INFO *mbmi) { x->rdmult = cpi->rd.RDMULT; if (aq_mode != NO_AQ) { assert(mbmi != NULL); if (aq_mode == VARIANCE_AQ) { if (cpi->vaq_refresh) { const int energy = bsize <= BLOCK_16X16 ? x->mb_energy : av1_log_block_var(cpi, x, bsize); mbmi->segment_id = energy; } x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == COMPLEXITY_AQ) { x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); } else if (aq_mode == CYCLIC_REFRESH_AQ) { // If segment is boosted, use rdmult for that segment. if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } } #if !CONFIG_REALTIME_ONLY if (cpi->common.delta_q_info.delta_q_present_flag && !cpi->sf.rt_sf.use_nonrd_pick_mode) { x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col); } #endif // !CONFIG_REALTIME_ONLY if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM || cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IQ) { av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col, &x->rdmult); } #if CONFIG_SALIENCY_MAP else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) { av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit, cpi->common.seq_params->sb_size, mi_row, mi_col, &x->rdmult); } #endif #if CONFIG_TUNE_VMAF else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN || cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); } #endif #if CONFIG_TUNE_BUTTERAUGLI else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); } #endif if (cpi->oxcf.mode == ALLINTRA) { x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7); } // Check to make sure that the adjustments above have not caused the // rd multiplier to be truncated to 0. x->rdmult = (x->rdmult > 0) ? x->rdmult : 1; } void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; assert(bsize < BLOCK_SIZES_ALL); const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col); set_entropy_context(xd, mi_row, mi_col, num_planes); xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); // Set up destination pointers. 
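  // These point into cm->cur_frame->buf, so reconstructed pixels written for
  // this block become the prediction reference for later blocks in the frame.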
av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height, mi_width, cpi->oxcf.border_in_pixels); set_plane_n4(xd, mi_width, mi_height, num_planes); // Set up distance of MB to edge of frame in 1/8th pel units. assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_params.mi_rows, cm->mi_params.mi_cols); // Set up source buffers. av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() xd->tile = *tile; } void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); // Setup segment ID. mbmi = xd->mi[0]; mbmi->segment_id = 0; if (seg->enabled) { if (seg->enabled && !cpi->vaq_refresh) { const uint8_t *const map = seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; mbmi->segment_id = map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0; } av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0); } #ifndef NDEBUG x->last_set_offsets_loc.mi_row = mi_row; x->last_set_offsets_loc.mi_col = mi_col; x->last_set_offsets_loc.bsize = bsize; #endif // NDEBUG } /*!\brief Hybrid intra mode search. * * \ingroup intra_mode_search * \callgraph * \callergraph * This is top level function for mode search for intra frames in non-RD * optimized case. Depending on speed feature and block size it calls * either non-RD or RD optimized intra mode search. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] rd_cost Struct to keep track of the RD information * \param[in] bsize Current block size * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. The rd_cost struct is also updated with the RD stats * corresponding to the best mode found. */ static inline void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { int use_rdopt = 0; const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode; // Use rd pick for intra mode search based on block size and variance. if (hybrid_intra_pickmode && bsize < BLOCK_16X16) { unsigned int var_thresh[3] = { 0, 101, 201 }; assert(hybrid_intra_pickmode <= 3); if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1]) use_rdopt = 1; } if (use_rdopt) av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); } // For real time/allintra row-mt enabled multi-threaded encoding with cost // update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated // at superblock level. Thus, it is not required for the encoding of top-right // superblock be complete for updating tile ctxt. 
However, when encoding a block // whose right edge is also the superblock edge, intra and inter mode evaluation // (ref mv list population) require the encoding of the top-right superblock to // be complete. So, here, we delay the waiting of threads until the need for the // data from the top-right superblock region. static inline void wait_for_top_right_sb(AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync, TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int sb_size_in_mi = mi_size_wide[sb_size]; const int bw_in_mi = mi_size_wide[bsize]; const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1); const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1); const int top_right_block_in_sb = (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi); // Don't wait if the block is the not the top-right block in the superblock. if (!top_right_block_in_sb) return; // Wait for the top-right superblock to finish encoding. const int sb_row_in_tile = (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2; const int sb_col_in_tile = (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2; enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile); } /*!\brief Interface for AV1 mode search for an individual coding block * * \ingroup partition_search * \callgraph * \callergraph * Searches prediction modes, transform, and coefficient coding modes for an * individual coding block. This function is the top-level interface that * directs the encoder to the proper mode search function, among these * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment. * * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during * encoding * \param[in] x Pointer to structure holding all the data for * the current macroblock * \param[in] mi_row Row coordinate of the block in a step size of * MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of * MI_SIZE * \param[in] rd_cost Pointer to structure holding rate and distortion * stats for the current block * \param[in] partition Partition mode of the parent block * \param[in] bsize Current block size * \param[in] ctx Pointer to structure holding coding contexts and * chosen modes for the current block * \param[in] best_rd Upper bound of rd cost of a valid partition * * \remark Nothing is returned. Instead, the chosen modes and contexts necessary * for reconstruction are stored in ctx, the rate-distortion stats are stored in * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be * signalled by an INT64_MAX rd_cost->rdcost. 
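 * Note: when the use_best_rd_for_pruning speed feature is on, a negative
 * best_rd.rdcost passed in is treated as an already-invalid bound and the
 * function returns immediately with invalid RD stats.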
*/ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, RD_STATS best_rd) { if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) { ctx->rd_stats.rdcost = INT64_MAX; ctx->rd_stats.skip_txfm = 0; av1_invalid_rd_stats(rd_cost); return; } av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab && ctx->rd_mode_is_ready) { assert(ctx->mic.bsize == bsize); assert(ctx->mic.partition == partition); rd_cost->rate = ctx->rd_stats.rate; rd_cost->dist = ctx->rd_stats.dist; rd_cost->rdcost = ctx->rd_stats.rdcost; return; } AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; // This is only needed for real time/allintra row-mt enabled multi-threaded // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, &tile_data->tile_info, cm->seq_params->sb_size, cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_pick_sb_modes_time); #endif mbmi = xd->mi[0]; mbmi->bsize = bsize; mbmi->partition = partition; #if CONFIG_RD_DEBUG mbmi->mi_row = mi_row; mbmi->mi_col = mi_col; #endif // Sets up the tx_type_map buffer in MACROBLOCKD. xd->tx_type_map = txfm_info->tx_type_map_; xd->tx_type_map_stride = mi_size_wide[bsize]; for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; p[i].dqcoeff = ctx->dqcoeff[i]; p[i].eobs = ctx->eobs[i]; p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; ctx->skippable = 0; // Set to zero to make sure we do not use the previous encoded frame stats mbmi->skip_txfm = 0; // Reset skip mode flag. mbmi->skip_mode = 0; x->source_variance = av1_get_perpixel_variance_facade( cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); // Set error per bit for current rdmult av1_set_error_per_bit(&x->errorperbit, x->rdmult); av1_rd_cost_update(x->rdmult, &best_rd); // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous // rdcost information for the following mode search. // Disabling the feature could get some coding gain, with encoder slowdown. 
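  // (av1_invalid_rd_stats() sets best_rd.rdcost to INT64_MAX, so the intra and
  // inter mode searches below then run with an unbounded RD budget.)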
if (!cpi->sf.part_sf.use_best_rd_for_pruning) { av1_invalid_rd_stats(&best_rd); } // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_rd_pick_intra_mode_sb_time); #endif av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_rd_pick_intra_mode_sb_time); #endif } else { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_rd_pick_inter_mode_sb_time); #endif if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd.rdcost); } else { av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd.rdcost); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_rd_pick_inter_mode_sb_time); #endif } // Examine the resulting rate and for AQ mode 2 make a segment choice. if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ && bsize >= BLOCK_16X16) { av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } x->rdmult = orig_rdmult; // TODO(jingning) The rate-distortion optimization flow needs to be // refactored to provide proper exit/return handle. if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; ctx->rd_stats.rate = rd_cost->rate; ctx->rd_stats.dist = rd_cost->dist; ctx->rd_stats.rdcost = rd_cost->rdcost; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rd_pick_sb_modes_time); #endif } static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { MACROBLOCK *x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const CurrentFrame *const current_frame = &cm->current_frame; const BLOCK_SIZE bsize = mbmi->bsize; FRAME_CONTEXT *fc = xd->tile_ctx; const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) { const int skip_mode_ctx = av1_get_skip_mode_context(xd); #if CONFIG_ENTROPY_STATS td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; #endif update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); } if (!mbmi->skip_mode && !seg_ref_active) { const int skip_ctx = av1_get_skip_txfm_context(xd); #if CONFIG_ENTROPY_STATS td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++; #endif update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2); } #if CONFIG_ENTROPY_STATS // delta quant applies to both intra and inter const int super_block_upper_left = ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { const int dq = (mbmi->current_qindex - xd->current_base_qindex) / delta_q_info->delta_q_res; const int absdq = abs(dq); for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { td->counts->delta_q[i][1]++; } if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; if (delta_q_info->delta_lf_present_flag) { if (delta_q_info->delta_lf_multi) { const int frame_lf_count = av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / delta_q_info->delta_lf_res; const int abs_delta_lf = abs(delta_lf); for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf_multi[lf_id][i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; } } else { const int delta_lf = (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / delta_q_info->delta_lf_res; const int abs_delta_lf = abs(delta_lf); for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { td->counts->delta_lf[i][1]++; } if (abs_delta_lf < DELTA_LF_SMALL) td->counts->delta_lf[abs_delta_lf][0]++; } } } #endif if (!is_inter_block(mbmi)) { av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, frame_is_intra_only(cm)); } if (av1_allow_intrabc(cm)) { const int is_intrabc = is_intrabc_block(mbmi); update_cdf(fc->intrabc_cdf, is_intrabc, 2); #if CONFIG_ENTROPY_STATS ++td->counts->intrabc[is_intrabc]; #endif // CONFIG_ENTROPY_STATS if (is_intrabc) { const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc, MV_SUBPEL_NONE); } } if (frame_is_intra_only(cm) || mbmi->skip_mode) return; FRAME_COUNTS *const counts = td->counts; const int inter_block = is_inter_block(mbmi); if (!seg_ref_active) { #if CONFIG_ENTROPY_STATS counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; #endif update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], inter_block, 2); // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. if (inter_block) { const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { if (is_comp_ref_allowed(bsize)) { #if CONFIG_ENTROPY_STATS counts->comp_inter[av1_get_reference_mode_context(xd)] [has_second_ref(mbmi)]++; #endif // CONFIG_ENTROPY_STATS update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2); } } if (has_second_ref(mbmi)) { const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) ? 
UNIDIR_COMP_REFERENCE : BIDIR_COMP_REFERENCE; update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, COMP_REFERENCE_TYPES); #if CONFIG_ENTROPY_STATS counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] [comp_ref_type]++; #endif // CONFIG_ENTROPY_STATS if (comp_ref_type == UNIDIR_COMP_REFERENCE) { const int bit = (ref0 == BWDREF_FRAME); update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); #if CONFIG_ENTROPY_STATS counts ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++; #endif // CONFIG_ENTROPY_STATS if (!bit) { const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); #if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] [bit1]++; #endif // CONFIG_ENTROPY_STATS if (bit1) { update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), ref1 == GOLDEN_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2] [ref1 == GOLDEN_FRAME]++; #endif // CONFIG_ENTROPY_STATS } } } else { const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); #if CONFIG_ENTROPY_STATS counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; #endif // CONFIG_ENTROPY_STATS if (!bit) { update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] [ref0 == LAST2_FRAME]++; #endif // CONFIG_ENTROPY_STATS } else { update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] [ref0 == GOLDEN_FRAME]++; #endif // CONFIG_ENTROPY_STATS } update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] [ref1 == ALTREF_FRAME]++; #endif // CONFIG_ENTROPY_STATS if (ref1 != ALTREF_FRAME) { update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), ref1 == ALTREF2_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] [ref1 == ALTREF2_FRAME]++; #endif // CONFIG_ENTROPY_STATS } } } else { const int bit = (ref0 >= BWDREF_FRAME); update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); #if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; #endif // CONFIG_ENTROPY_STATS if (bit) { assert(ref0 <= ALTREF_FRAME); update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] [ref0 == ALTREF_FRAME]++; #endif // CONFIG_ENTROPY_STATS if (ref0 != ALTREF_FRAME) { update_cdf(av1_get_pred_cdf_single_ref_p6(xd), ref0 == ALTREF2_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] [ref0 == ALTREF2_FRAME]++; #endif // CONFIG_ENTROPY_STATS } } else { const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); #if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; #endif // CONFIG_ENTROPY_STATS if (!bit1) { update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME, 2); #if CONFIG_ENTROPY_STATS counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] [ref0 != LAST_FRAME]++; #endif // CONFIG_ENTROPY_STATS } else { update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME, 2); #if CONFIG_ENTROPY_STATS 
counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] [ref0 != LAST3_FRAME]++; #endif // CONFIG_ENTROPY_STATS } } } if (cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { #if CONFIG_ENTROPY_STATS counts->interintra[bsize_group][1]++; #endif update_cdf(fc->interintra_cdf[bsize_group], 1, 2); #if CONFIG_ENTROPY_STATS counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; #endif update_cdf(fc->interintra_mode_cdf[bsize_group], mbmi->interintra_mode, INTERINTRA_MODES); if (av1_is_wedge_used(bsize)) { #if CONFIG_ENTROPY_STATS counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; #endif update_cdf(fc->wedge_interintra_cdf[bsize], mbmi->use_wedge_interintra, 2); if (mbmi->use_wedge_interintra) { #if CONFIG_ENTROPY_STATS counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; #endif update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index, 16); } } } else { #if CONFIG_ENTROPY_STATS counts->interintra[bsize_group][0]++; #endif update_cdf(fc->interintra_cdf[bsize_group], 0, 2); } } const MOTION_MODE motion_allowed = cm->features.switchable_motion_mode ? motion_mode_allowed(xd->global_motion, xd, mbmi, cm->features.allow_warped_motion) : SIMPLE_TRANSLATION; if (mbmi->ref_frame[1] != INTRA_FRAME) { if (motion_allowed == WARPED_CAUSAL) { #if CONFIG_ENTROPY_STATS counts->motion_mode[bsize][mbmi->motion_mode]++; #endif update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, MOTION_MODES); } else if (motion_allowed == OBMC_CAUSAL) { #if CONFIG_ENTROPY_STATS counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; #endif update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2); } } if (has_second_ref(mbmi)) { assert(current_frame->reference_mode != SINGLE_REFERENCE && is_inter_compound_mode(mbmi->mode) && mbmi->motion_mode == SIMPLE_TRANSLATION); const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int comp_group_idx_ctx = get_comp_group_idx_context(xd); #if CONFIG_ENTROPY_STATS ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; #endif update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], mbmi->comp_group_idx, 2); } if (mbmi->comp_group_idx == 0) { const int comp_index_ctx = get_comp_index_context(cm, xd); #if CONFIG_ENTROPY_STATS ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; #endif update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx, 2); } else { assert(masked_compound_used); if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { #if CONFIG_ENTROPY_STATS ++counts->compound_type[bsize][mbmi->interinter_comp.type - COMPOUND_WEDGE]; #endif update_cdf(fc->compound_type_cdf[bsize], mbmi->interinter_comp.type - COMPOUND_WEDGE, MASKED_COMPOUND_TYPES); } } } if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { #if CONFIG_ENTROPY_STATS counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; #endif update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interinter_comp.wedge_index, 16); } } } } if (inter_block && cm->features.interp_filter == SWITCHABLE && av1_is_interp_needed(xd)) { update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter); } if (inter_block && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const PREDICTION_MODE mode = mbmi->mode; const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, 
mbmi->ref_frame); if (has_second_ref(mbmi)) { #if CONFIG_ENTROPY_STATS ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; #endif update_cdf(fc->inter_compound_mode_cdf[mode_ctx], INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); } else { av1_update_inter_mode_stats(fc, counts, mode, mode_ctx); } const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; if (new_mv) { const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); for (int idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { const uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2); #if CONFIG_ENTROPY_STATS ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; #endif if (mbmi->ref_mv_idx == idx) break; } } } if (have_nearmv_in_inter_mode(mbmi->mode)) { const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); for (int idx = 1; idx < 3; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { const uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2); #if CONFIG_ENTROPY_STATS ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; #endif if (mbmi->ref_mv_idx == idx - 1) break; } } } if (have_newmv_in_inter_mode(mbmi->mode)) { const int allow_hp = cm->features.cur_frame_force_integer_mv ? MV_SUBPEL_NONE : cm->features.allow_high_precision_mv; if (new_mv) { for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { const int_mv ref_mv = av1_get_ref_mv(x, ref); av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, allow_hp); } } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) { const int ref = 1; const int_mv ref_mv = av1_get_ref_mv(x, ref); av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, allow_hp); } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) { const int ref = 0; const int_mv ref_mv = av1_get_ref_mv(x, ref); av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, allow_hp); } } } } /*!\brief Reconstructs an individual coding block * * \ingroup partition_search * Reconstructs an individual coding block by applying the chosen modes stored * in ctx, also updates mode counts and entropy models. * * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during encoding * \param[in] td Pointer to thread data * \param[in] tp Pointer to the starting token * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of * MI_SIZE * \param[in] dry_run A code indicating whether it is part of the final * pass for reconstructing the superblock * \param[in] bsize Current block size * \param[in] partition Partition mode of the parent block * \param[in] ctx Pointer to structure holding coding contexts and the * chosen modes for the current block * \param[in] rate Pointer to the total rate for the current block * * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0]. 
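 * For non-dry runs, when CDF updates are allowed for the tile, this also
 * updates the symbol counts and CDFs via update_stats(), and refreshes the
 * delta-q / delta-lf tracking state carried in the MACROBLOCKD.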
*/ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TokenExtra **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx, int *rate) { const AV1_COMMON *const cm = &cpi->common; TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; const int subsampling_x = cm->seq_params->subsampling_x; const int subsampling_y = cm->seq_params->subsampling_y; av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); const int origin_mult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->partition = partition; av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); if (!dry_run) { set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); assert(x->cb_offset[PLANE_TYPE_UV] < ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); if (!dry_run) { update_cb_offsets(x, bsize, subsampling_x, subsampling_y); if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 && cm->delta_q_info.delta_lf_present_flag) { const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; mbmi->delta_lf_from_base = xd->delta_lf_from_base; } if (has_second_ref(mbmi)) { if (mbmi->compound_idx == 0 || mbmi->interinter_comp.type == COMPOUND_AVERAGE) mbmi->comp_group_idx = 0; else mbmi->comp_group_idx = 1; } // delta quant applies to both intra and inter const int super_block_upper_left = ((mi_row & (cm->seq_params->mib_size - 1)) == 0) && ((mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { xd->current_base_qindex = mbmi->current_qindex; if (delta_q_info->delta_lf_present_flag) { if (delta_q_info->delta_lf_multi) { const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } } RD_COUNTS *rdc = &td->rd_counts; if (mbmi->skip_mode) { assert(!frame_is_intra_only(cm)); rdc->skip_mode_used_flag = 1; if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { assert(has_second_ref(mbmi)); rdc->compound_ref_used_flag = 1; } set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); } else { const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) { // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. 
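        // For inter blocks outside such a segment, refresh the neighbouring
        // reference-frame counts and flag compound-reference usage so that
        // the frame-level reference mode can be decided later.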
if (is_inter_block(mbmi)) { av1_collect_neighbors_ref_counts(xd); if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { if (has_second_ref(mbmi)) { // This flag is also updated for 4x4 blocks rdc->compound_ref_used_flag = 1; } } set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); } } } if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); // Gather obmc and warped motion count to update the probability. if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) || (cm->features.allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) { const int inter_block = is_inter_block(mbmi); const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active && inter_block) { const MOTION_MODE motion_allowed = cm->features.switchable_motion_mode ? motion_mode_allowed(xd->global_motion, xd, mbmi, cm->features.allow_warped_motion) : SIMPLE_TRANSLATION; if (mbmi->ref_frame[1] != INTRA_FRAME) { if (motion_allowed >= OBMC_CAUSAL) { td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; } if (motion_allowed == WARPED_CAUSAL) { td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++; } } } } } // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during // bitstream preparation. av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext, av1_ref_frame_type(xd->mi[0]->ref_frame)); x->rdmult = origin_mult; } /*!\brief Reconstructs a partition (may contain multiple coding blocks) * * \ingroup partition_search * Reconstructs a sub-partition of the superblock by applying the chosen modes * and partition trees stored in pc_tree. * * \param[in] cpi Top-level encoder structure * \param[in] td Pointer to thread data * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during encoding * \param[in] tp Pointer to the starting token * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of * MI_SIZE * \param[in] dry_run A code indicating whether it is part of the final * pass for reconstructing the superblock * \param[in] bsize Current block size * \param[in] pc_tree Pointer to the PC_TREE node storing the picked * partitions and mode info for the current block * \param[in] rate Pointer to the total rate for the current block * * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) * will be updated in the pixel buffers in td->mb.e_mbd. */ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) { assert(bsize < BLOCK_SIZES_ALL); const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; assert(bsize < BLOCK_SIZES_ALL); const int hbs = mi_size_wide[bsize] / 2; const int is_partition_root = bsize >= BLOCK_8X8; const int ctx = is_partition_root ? 
partition_plane_context(xd, mi_row, mi_col, bsize) : -1; const PARTITION_TYPE partition = pc_tree->partitioning; const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); #if !CONFIG_REALTIME_ONLY int quarter_step = mi_size_wide[bsize] / 4; int i; BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); #endif if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; if (subsize == BLOCK_INVALID) return; if (!dry_run && ctx >= 0) { const int has_rows = (mi_row + hbs) < mi_params->mi_rows; const int has_cols = (mi_col + hbs) < mi_params->mi_cols; if (has_rows && has_cols) { #if CONFIG_ENTROPY_STATS td->counts->partition[ctx][partition]++; #endif if (tile_data->allow_update_cdf) { FRAME_CONTEXT *fc = xd->tile_ctx; update_cdf(fc->partition_cdf[ctx], partition, partition_cdf_length(bsize)); } } } switch (partition) { case PARTITION_NONE: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, partition, pc_tree->none, rate); break; case PARTITION_VERT: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, partition, pc_tree->vertical[0], rate); if (mi_col + hbs < mi_params->mi_cols) { encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, partition, pc_tree->vertical[1], rate); } break; case PARTITION_HORZ: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, partition, pc_tree->horizontal[0], rate); if (mi_row + hbs < mi_params->mi_rows) { encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, partition, pc_tree->horizontal[1], rate); } break; case PARTITION_SPLIT: encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, pc_tree->split[0], rate); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, pc_tree->split[1], rate); encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, pc_tree->split[2], rate); encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run, subsize, pc_tree->split[3], rate); break; #if !CONFIG_REALTIME_ONLY case PARTITION_HORZ_A: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, partition, pc_tree->horizontala[0], rate); encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, pc_tree->horizontala[1], rate); encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, partition, pc_tree->horizontala[2], rate); break; case PARTITION_HORZ_B: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, partition, pc_tree->horizontalb[0], rate); encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, pc_tree->horizontalb[1], rate); encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, partition, pc_tree->horizontalb[2], rate); break; case PARTITION_VERT_A: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, partition, pc_tree->verticala[0], rate); encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, partition, pc_tree->verticala[1], rate); encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, partition, pc_tree->verticala[2], rate); break; case PARTITION_VERT_B: encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, partition, pc_tree->verticalb[0], rate); encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, partition, pc_tree->verticalb[1], rate); encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, partition, pc_tree->verticalb[2], rate); break; case PARTITION_HORZ_4: for (i = 0; i < 
SUB_PARTITIONS_PART4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= mi_params->mi_rows) break; encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, partition, pc_tree->horizontal4[i], rate); } break; case PARTITION_VERT_4: for (i = 0; i < SUB_PARTITIONS_PART4; ++i) { int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= mi_params->mi_cols) break; encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, partition, pc_tree->vertical4[i], rate); } break; #endif default: assert(0 && "Invalid partition type."); break; } update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } static inline int is_adjust_var_based_part_enabled( AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf, BLOCK_SIZE bsize) { if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0; if (part_sf->adjust_var_based_rd_partitioning == 0 || part_sf->adjust_var_based_rd_partitioning > 2) return 0; if (bsize <= BLOCK_32X32) return 1; if (part_sf->adjust_var_based_rd_partitioning == 2) { const int is_larger_qindex = cm->quant_params.base_qindex > 190; const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64; } return 0; } /*!\brief AV1 block partition search (partition estimation and partial search). * * \ingroup partition_search * Encode the block by applying pre-calculated partition patterns that are * represented by coding block sizes stored in the mbmi array. Minor partition * adjustments are tested and applied if they lead to lower rd costs. The * partition types are limited to a basic set: none, horz, vert, and split. * * \param[in] cpi Top-level encoder structure * \param[in] td Pointer to thread data * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] mib Array representing MB_MODE_INFO pointers for mi blocks starting from the first pixel of the current block * \param[in] tp Pointer to the starting token * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of MI_SIZE * \param[in] bsize Current block size * \param[in] rate Pointer to the final rate for encoding the current block * \param[in] dist Pointer to the final distortion of the current block * \param[in] do_recon Whether the reconstruction function needs to be run, either for finalizing a superblock or providing reference for future sub-partitions * \param[in] pc_tree Pointer to the PC_TREE node holding the picked partitions and mode info for the current block * * \remark Nothing is returned. The pc_tree struct is modified to store the * picked partition and modes. The rate and dist are also updated with those * corresponding to the best partition found. */ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const ModeCosts *mode_costs = &x->mode_costs; const int bs = mi_size_wide[bsize]; const int hbs = bs / 2; const int pl = (bsize >= BLOCK_8X8) ? 
partition_plane_context(xd, mi_row, mi_col, bsize) : 0; const PARTITION_TYPE partition = (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) : PARTITION_NONE; const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc; BLOCK_SIZE bs_type = mib[0]->bsize; int use_partition_none = 0; x->try_merge_partition = 0; if (pc_tree->none == NULL) { pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } PICK_MODE_CONTEXT *ctx_none = pc_tree->none; if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; assert(mi_size_wide[bsize] == mi_size_high[bsize]); // In rt mode, currently the min partition size is BLOCK_8X8. assert(bsize >= cpi->sf.part_sf.default_min_partition_size); av1_invalid_rd_stats(&last_part_rdc); av1_invalid_rd_stats(&none_rdc); av1_invalid_rd_stats(&chosen_rdc); av1_invalid_rd_stats(&invalid_rdc); pc_tree->partitioning = partition; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); x->mb_energy = av1_log_block_var(cpi, x, bsize); } // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); if (partition != PARTITION_NONE && is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) && (mi_row + hbs < mi_params->mi_rows && mi_col + hbs < mi_params->mi_cols)) { assert(bsize > cpi->sf.part_sf.default_min_partition_size); mib[0]->bsize = bsize; pc_tree->partitioning = PARTITION_NONE; x->try_merge_partition = 1; pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE, bsize, ctx_none, invalid_rdc); if (none_rdc.rate < INT_MAX) { none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); } // Try to skip split partition evaluation based on none partition // characteristics. 
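    // A valid PARTITION_NONE result whose transform was skipped suggests the
    // block is already well predicted as a whole, so the HORZ/VERT/SPLIT
    // evaluations below are bypassed (use_partition_none).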
if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) { use_partition_none = 1; } av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); mib[0]->bsize = bs_type; pc_tree->partitioning = partition; } for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); pc_tree->split[i]->index = i; } switch (partition) { case PARTITION_NONE: pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, PARTITION_NONE, bsize, ctx_none, invalid_rdc); break; case PARTITION_HORZ: if (use_partition_none) { av1_invalid_rd_stats(&last_part_rdc); break; } for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { pc_tree->horizontal[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->horizontal[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, PARTITION_HORZ, subsize, pc_tree->horizontal[0], invalid_rdc); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < mi_params->mi_rows) { RD_STATS tmp_rdc; const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0]; av1_init_rd_stats(&tmp_rdc); av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, PARTITION_HORZ, subsize, pc_tree->horizontal[1], invalid_rdc); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; } break; case PARTITION_VERT: if (use_partition_none) { av1_invalid_rd_stats(&last_part_rdc); break; } for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { pc_tree->vertical[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->vertical[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < mi_params->mi_cols) { RD_STATS tmp_rdc; const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0]; av1_init_rd_stats(&tmp_rdc); av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, PARTITION_VERT, subsize, pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; last_part_rdc.rdcost += tmp_rdc.rdcost; } break; case PARTITION_SPLIT: if (use_partition_none) { av1_invalid_rd_stats(&last_part_rdc); break; } last_part_rdc.rate = 0; last_part_rdc.dist = 0; last_part_rdc.rdcost = 0; for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; int jj = i >> 1, ii = i & 0x01; RD_STATS tmp_rdc; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; av1_init_rd_stats(&tmp_rdc); av1_rd_use_partition( cpi, td, tile_data, mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, mi_row + y_idx, mi_col + x_idx, 
subsize, &tmp_rdc.rate, &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); break; } last_part_rdc.rate += tmp_rdc.rate; last_part_rdc.dist += tmp_rdc.dist; } break; case PARTITION_VERT_A: case PARTITION_VERT_B: case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: case PARTITION_VERT_4: assert(0 && "Cannot handle extended partition types"); default: assert(0); break; } if (last_part_rdc.rate < INT_MAX) { last_part_rdc.rate += mode_costs->partition_cost[pl][partition]; last_part_rdc.rdcost = RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); } if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION && cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + bs < mi_params->mi_rows || mi_row + hbs == mi_params->mi_rows) && (mi_col + bs < mi_params->mi_cols || mi_col + hbs == mi_params->mi_cols)) { BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); chosen_rdc.rate = 0; chosen_rdc.dist = 0; av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->partitioning = PARTITION_SPLIT; // Split partition. for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; RD_STATS tmp_rdc; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->split[i]->partitioning = PARTITION_NONE; if (pc_tree->split[i]->none == NULL) pc_tree->split[i]->none = av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf); if (!pc_tree->split[i]->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none, invalid_rdc); av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&chosen_rdc); break; } chosen_rdc.rate += tmp_rdc.rate; chosen_rdc.dist += tmp_rdc.dist; if (i != SUB_PARTITIONS_SPLIT - 1) encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; } if (chosen_rdc.rate < INT_MAX) { chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); } } // If last_part is better set the partitioning to that. if (last_part_rdc.rdcost < chosen_rdc.rdcost) { mib[0]->bsize = bs_type; if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; chosen_rdc = last_part_rdc; } // If none was better set the partitioning to that. if (none_rdc.rdcost < INT64_MAX && none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) { mib[0]->bsize = bsize; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; chosen_rdc = none_rdc; } av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. 
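// For a full superblock, at least one of the candidates evaluated above (the
// stored partitioning, PARTITION_NONE, or the forced split) must have
// produced finite rate/distortion; the assert below checks this before the
// reconstruction pass.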
if (bsize == cm->seq_params->sb_size) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, encode_sb_time); #endif if (do_recon) { if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, // bsize, pc_tree, &rate_coeffs); set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, encode_sb_time); #endif *rate = chosen_rdc.rate; *dist = chosen_rdc.dist; x->rdmult = orig_rdmult; } static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TokenExtra **tp, int mi_row, int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx, int *rate) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing((AV1_COMP *)cpi, encode_b_nonrd_time); #endif const AV1_COMMON *const cm = &cpi->common; TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); const int origin_mult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); MB_MODE_INFO *mbmi = xd->mi[0]; mbmi->partition = partition; av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); const int subsampling_x = cpi->common.seq_params->subsampling_x; const int subsampling_y = cpi->common.seq_params->subsampling_y; if (!dry_run) { set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); assert(x->cb_offset[PLANE_TYPE_UV] < ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); if (!dry_run) { update_cb_offsets(x, bsize, subsampling_x, subsampling_y); if (has_second_ref(mbmi)) { if (mbmi->compound_idx == 0 || mbmi->interinter_comp.type == COMPOUND_AVERAGE) mbmi->comp_group_idx = 0; else mbmi->comp_group_idx = 1; mbmi->compound_idx = 1; } RD_COUNTS *const rdc = &td->rd_counts; if (mbmi->skip_mode) { assert(!frame_is_intra_only(cm)); rdc->skip_mode_used_flag = 1; if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && has_second_ref(mbmi)) { rdc->compound_ref_used_flag = 1; } set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); } else { const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) { // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. 
if (is_inter_block(mbmi)) { av1_collect_neighbors_ref_counts(xd); if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && has_second_ref(mbmi)) { // This flag is also updated for 4x4 blocks rdc->compound_ref_used_flag = 1; } set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); } } } if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY && (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) { int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize]; rdc->newmv_or_intra_blocks += blocks; } if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); } if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ || cpi->active_map.enabled) && mbmi->skip_txfm && !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run); // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during // bitstream preparation. av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext, av1_ref_frame_type(xd->mi[0]->ref_frame)); x->rdmult = origin_mult; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing((AV1_COMP *)cpi, encode_b_nonrd_time); #endif } static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) { // Force zero MV skip based on SB level decision if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb; // For blocks of size equal to superblock size, the decision would have been // already done at superblock level. Hence zeromv-skip decision is skipped. const AV1_COMMON *const cm = &cpi->common; if (bsize == cm->seq_params->sb_size) return 0; const int num_planes = av1_num_planes(cm); const MACROBLOCKD *const xd = &x->e_mbd; const unsigned int thresh_exit_part_y = cpi->zeromv_skip_thresh_exit_part[bsize]; const unsigned int thresh_exit_part_uv = CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y); const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y, thresh_exit_part_uv, thresh_exit_part_uv }; const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); const struct scale_factors *const sf = get_ref_scale_factors_const(cm, LAST_FRAME); struct buf_2d yv12_mb[MAX_MB_PLANE]; av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes); for (int plane = 0; plane < num_planes; ++plane) { const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride); assert(plane < MAX_MB_PLANE); if (plane_sad >= thresh_exit_part[plane]) return 0; } return 1; } /*!\brief Top level function to pick block mode for non-RD optimized case * * \ingroup partition_search * \callgraph * \callergraph * Searches prediction modes, transform, and coefficient coding modes for an * individual coding block. This function is the top-level function that is * used for non-RD optimized mode search (controlled by * \c cpi->sf.rt_sf.use_nonrd_pick_mode). 
Depending on frame type it calls * inter/skip/hybrid-intra mode search functions * * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during * encoding * \param[in] x Pointer to structure holding all the data for * the current macroblock * \param[in] mi_row Row coordinate of the block in a step size of * MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of * MI_SIZE * \param[in] rd_cost Pointer to structure holding rate and distortion * stats for the current block * \param[in] bsize Current block size * \param[in] ctx Pointer to structure holding coding contexts and * chosen modes for the current block * * \remark Nothing is returned. Instead, the chosen modes and contexts necessary * for reconstruction are stored in ctx, the rate-distortion stats are stored in * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be * signalled by an INT64_MAX rd_cost->rdcost. */ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { // For nonrd mode, av1_set_offsets is already called at the superblock level // in encode_nonrd_sb when we determine the partitioning. if (bsize != cpi->common.seq_params->sb_size || cpi->sf.rt_sf.nonrd_check_partition_split == 1) { av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); } assert(x->last_set_offsets_loc.mi_row == mi_row && x->last_set_offsets_loc.mi_col == mi_col && x->last_set_offsets_loc.bsize == bsize); AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); // This is only needed for real time/allintra row-mt enabled multi-threaded // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, &tile_data->tile_info, cm->seq_params->sb_size, cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, pick_sb_modes_nonrd_time); #endif // Sets up the tx_type_map buffer in MACROBLOCKD. xd->tx_type_map = txfm_info->tx_type_map_; xd->tx_type_map_stride = mi_size_wide[bsize]; for (i = 0; i < num_planes; ++i) { p[i].coeff = ctx->coeff[i]; p[i].qcoeff = ctx->qcoeff[i]; p[i].dqcoeff = ctx->dqcoeff[i]; p[i].eobs = ctx->eobs[i]; p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; if (!seg_skip) { x->force_zeromv_skip_for_blk = get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); // Source variance may be already compute at superblock level, so no need // to recompute, unless bsize < sb_size or source_variance is not yet set. if (!x->force_zeromv_skip_for_blk && (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) x->source_variance = av1_get_perpixel_variance_facade( cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); } // Save rdmult before it might be changed, so it can be restored later. 
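// setup_block_rdmult() may rescale the Lagrangian multiplier for this block
// (e.g. under the active adaptive-quantization mode), so the incoming value
// is cached in orig_rdmult and restored at the end of this function.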
const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); // Set error per bit for current rdmult av1_set_error_per_bit(&x->errorperbit, x->rdmult); // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, hybrid_intra_mode_search_time); #endif hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, hybrid_intra_mode_search_time); #endif } else { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif if (seg_skip) { x->force_zeromv_skip_for_blk = 1; // TODO(marpan): Consider adding a function for nonrd: // av1_nonrd_pick_inter_mode_sb_seg_skip(), instead of setting // x->force_zeromv_skip flag and entering av1_nonrd_pick_inter_mode_sb(). } av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif } if (cpi->sf.rt_sf.skip_cdef_sb) { // cdef_strength is initialized to 1 which means skip_cdef, and is updated // here. Check to see if skipping cdef is allowed. Never skip on slide/scene // change, near a key frame, or when color sensitivity is set. Always allow // cdef_skip for seg_skip = 1. const int allow_cdef_skipping = seg_skip || (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])); // Find the corresponding 64x64 block. It'll be the 128x128 block if that's // the block size. const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64; const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64; MB_MODE_INFO **mi_sb = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; unsigned int thresh_spatial_var = (cpi->oxcf.speed >= 11 && !is_720p_or_larger && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) ? 400 : UINT_MAX; // For skip_cdef_sb = 1: do not skip if allow_cdef_skipping is false or // intra or new mv is picked, with possible condition on spatial variance. // For skip_cdef_sb >= 2: more aggressive mode to always skip unless // allow_cdef_skipping is false and source_variance is non-zero. if (cpi->sf.rt_sf.skip_cdef_sb >= 2) { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && (allow_cdef_skipping || x->source_variance == 0); } else { mi_sb[0]->cdef_strength = mi_sb[0]->cdef_strength && allow_cdef_skipping && !(x->source_variance < thresh_spatial_var && (mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV)); } // Store in the pickmode context.
ctx->mic.cdef_strength = mi_sb[0]->cdef_strength; } x->rdmult = orig_rdmult; ctx->rd_stats.rate = rd_cost->rate; ctx->rd_stats.dist = rd_cost->dist; ctx->rd_stats.rdcost = rd_cost->rdcost; #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, pick_sb_modes_nonrd_time); #endif } static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td, TileDataEnc *const tile_data, TileInfo *const tile_info, TokenExtra **tp, MACROBLOCK *const x, MACROBLOCKD *const xd, const CommonModeInfoParams *const mi_params, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, const int pl, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; const ModeCosts *mode_costs = &x->mode_costs; const int hbs = mi_size_wide[bsize] / 2; if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows || mi_col + mi_size_wide[bsize] >= mi_params->mi_cols) return 0; if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0; if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0; // Do not try split partition when the source sad is small, or // the prediction residual is small. const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); const struct scale_factors *const sf = get_ref_scale_factors_const(cm, LAST_FRAME); const int num_planes = av1_num_planes(cm); av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes); int block_sad = 0; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); block_sad += plane_sad; } const int blk_pix = block_size_wide[bsize] * block_size_high[bsize]; const int block_avg_sad = block_sad / blk_pix; // TODO(chengchen): find a proper threshold. It might change according to // q as well. 
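// The SAD gate below normalizes the summed luma+chroma SAD by the number of
// luma pixels in the block. For example, a 32x32 block has blk_pix = 1024,
// so the accumulated SAD must reach 25 * 1024 = 25600 before a split
// partition is even tried.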
const int threshold = 25; if (block_avg_sad < threshold) return 0; RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_STATS split_rdc, none_rdc; av1_invalid_rd_stats(&split_rdc); av1_invalid_rd_stats(&none_rdc); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); // Calculate rdcost for none partition pc_tree->partitioning = PARTITION_NONE; av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (!pc_tree->none) { pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->none); } pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, pc_tree->none); none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); // Calculate rdcost for split partition pc_tree->partitioning = PARTITION_SPLIT; const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); av1_init_rd_stats(&split_rdc); split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; if (subsize >= BLOCK_8X8) { split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4); } for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { if (!pc_tree->split[i]) { pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); } pc_tree->split[i]->index = i; } for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { RD_STATS block_rdc; av1_invalid_rd_stats(&block_rdc); int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; xd->left_txfm_context = xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK); if (!pc_tree->split[i]->none) { pc_tree->split[i]->none = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->split[i]->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->split[i]->none); } pc_tree->split[i]->partitioning = PARTITION_NONE; pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &block_rdc, subsize, pc_tree->split[i]->none); split_rdc.rate += block_rdc.rate; split_rdc.dist += block_rdc.dist; av1_rd_cost_update(x->rdmult, &split_rdc); if (none_rdc.rdcost < split_rdc.rdcost) break; if (i != SUB_PARTITIONS_SPLIT - 1) encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1, subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL); } av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); const int split = split_rdc.rdcost < none_rdc.rdcost; return split; } // Returns if SPLIT partitions should be evaluated static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, const RD_STATS *none_rdc, const CommonModeInfoParams *mi_params, int mi_row, int mi_col, int hbs, BLOCK_SIZE bsize, PARTITION_TYPE partition) { const AV1_COMMON *const cm = &cpi->common; const int is_larger_qindex = cm->quant_params.base_qindex > 100; const MACROBLOCKD *const xd = 
&x->e_mbd; bool do_split = (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3) ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64)) : true; if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN || cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 || cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) || !none_rdc->skip_txfm) return do_split; const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize); // When model based skip is not used (i.e.,use_model_yrd_large = 0), skip_txfm // would have been populated based on Hadamard transform and skip_txfm flag is // more reliable. Hence SPLIT evaluation is disabled at all quantizers for 8x8 // and 16x16 blocks. // When model based skip is used (i.e.,use_model_yrd_large = 1), skip_txfm may // not be reliable. Hence SPLIT evaluation is disabled only at lower // quantizers for blocks >= 32x32. if ((!use_model_yrd_large) || (!is_larger_qindex)) return false; // Use residual statistics to decide if SPLIT partition should be evaluated // for 32x32 blocks. The pruning logic is avoided for larger block size to // avoid the visual artifacts if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); assert(subsize < BLOCK_SIZES_ALL); double min_per_pixel_error = DBL_MAX; double max_per_pixel_error = 0.; int i; for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) { const int x_idx = (i & 1) * hbs; const int y_idx = (i >> 1) * hbs; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) { break; } // Populate the appropriate buffer pointers. // Pass scale factors as NULL as the base pointer of the block would have // been calculated appropriately. struct buf_2d src_split_buf_2d, pred_split_buf_2d; const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src; setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf, src_none_buf_2d->width, src_none_buf_2d->height, src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst; setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf, pred_none_buf_2d->width, pred_none_buf_2d->height, pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); unsigned int curr_uint_mse; const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf( src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf, pred_split_buf_2d.stride, &curr_uint_mse); const double curr_per_pixel_error = sqrt((double)curr_uint_var / block_size_wide[subsize] / block_size_high[subsize]); if (curr_per_pixel_error < min_per_pixel_error) min_per_pixel_error = curr_per_pixel_error; if (curr_per_pixel_error > max_per_pixel_error) max_per_pixel_error = curr_per_pixel_error; } // Prune based on residual statistics only if all the sub-partitions are // valid. 
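// i equals SUB_PARTITIONS_SPLIT only when none of the four sub-blocks fell
// outside the frame. In that case, if the per-pixel error measures (square
// root of sub-block variance per pixel) of the four 16x16 sub-blocks lie
// within 1.5 of each other, the residual is treated as homogeneous and the
// SPLIT evaluation is pruned.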
if (i == SUB_PARTITIONS_SPLIT) { if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false; } } return do_split; } static void try_merge(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, TokenExtra **tp, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, PC_TREE *const pc_tree, const PARTITION_TYPE partition, const BLOCK_SIZE subsize, const int pl) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const ModeCosts *mode_costs = &x->mode_costs; const int num_planes = av1_num_planes(cm); // Only square blocks from 8x8 to 128x128 are supported assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); const int bs = mi_size_wide[bsize]; const int hbs = bs / 2; bool do_split = false; RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; RD_STATS split_rdc, none_rdc; av1_invalid_rd_stats(&split_rdc); av1_invalid_rd_stats(&none_rdc); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); pc_tree->partitioning = PARTITION_NONE; if (!pc_tree->none) { pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->none); } pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, pc_tree->none); none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 || none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) { do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row, mi_col, hbs, bsize, partition); if (do_split) { av1_init_rd_stats(&split_rdc); split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { RD_STATS block_rdc; av1_invalid_rd_stats(&block_rdc); int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; xd->left_txfm_context = xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK); if (!pc_tree->split[i]->none) { pc_tree->split[i]->none = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->split[i]->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->split[i]->none); } pc_tree->split[i]->partitioning = PARTITION_NONE; pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &block_rdc, subsize, pc_tree->split[i]->none); // TODO(yunqingwang): The rate here did not include the cost of // signaling PARTITION_NONE token in the sub-blocks. 
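// Accumulate the forced-NONE sub-block costs into split_rdc; the running
// total is refreshed with av1_rd_cost_update() after every sub-block so the
// loop can stop as soon as the split cost already exceeds the NONE cost.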
split_rdc.rate += block_rdc.rate; split_rdc.dist += block_rdc.dist; av1_rd_cost_update(x->rdmult, &split_rdc); if (none_rdc.rdcost < split_rdc.rdcost) { break; } if (i != SUB_PARTITIONS_SPLIT - 1) encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1, subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL); } av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); } } if (none_rdc.rdcost < split_rdc.rdcost) { /* Predicted samples can not be reused for PARTITION_NONE since same * buffer is being used to store the reconstructed samples of * PARTITION_SPLIT block. */ if (do_split) x->reuse_inter_pred = false; mib[0]->bsize = bsize; pc_tree->partitioning = PARTITION_NONE; encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition, pc_tree->none, NULL); } else { mib[0]->bsize = subsize; pc_tree->partitioning = PARTITION_SPLIT; /* Predicted samples can not be reused for PARTITION_SPLIT since same * buffer is being used to write the reconstructed samples. */ // TODO(Cherma): Store and reuse predicted samples generated by // encode_b_nonrd() in DRY_RUN_NORMAL mode. x->reuse_inter_pred = false; for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; // Note: We don't reset pc_tree->split[i]->none here because it // could contain results from the additional check. Instead, it is // reset before we enter the nonrd_check_partition_merge_mode // condition. if (!pc_tree->split[i]->none) { pc_tree->split[i]->none = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->split[i]->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0, subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL); } } } // Evaluate if the sub-partitions can be merged directly into a large partition // without calculating the RD cost. static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, int mi_row, int mi_col, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int bs = mi_size_wide[bsize]; const int hbs = bs / 2; const PARTITION_TYPE partition = (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) : PARTITION_NONE; BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); MB_MODE_INFO **b0 = mib; MB_MODE_INFO **b1 = mib + hbs; MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride; MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs; // Check if the following conditions are met. This can be updated // later with more support added. const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize || b2[0]->bsize < subsize || b3[0]->bsize < subsize; if (further_split) return; const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm || !b2[0]->skip_txfm || !b3[0]->skip_txfm; if (no_skip) return; const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] || b0[0]->ref_frame[1] != b2[0]->ref_frame[1] || b0[0]->ref_frame[1] != b3[0]->ref_frame[1] || b0[0]->ref_frame[1] > NONE_FRAME); if (compound) return; // Intra modes aren't considered here. 
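// Direct merging is attempted only when all four sub-blocks are skipped,
// single-reference inter blocks that agree on reference frame, prediction
// mode (NEARESTMV or GLOBALMV), motion vector, motion mode, interpolation
// filter and segment id; any mismatch in the checks below aborts the merge.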
const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] || b0[0]->ref_frame[0] != b2[0]->ref_frame[0] || b0[0]->ref_frame[0] != b3[0]->ref_frame[0] || b0[0]->ref_frame[0] <= INTRA_FRAME); if (different_ref) return; const int different_mode = (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode || b0[0]->mode != b3[0]->mode); if (different_mode) return; const int unsupported_mode = (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV); if (unsupported_mode) return; const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int || b0[0]->mv[0].as_int != b2[0]->mv[0].as_int || b0[0]->mv[0].as_int != b3[0]->mv[0].as_int); if (different_mv) return; const int unsupported_motion_mode = (b0[0]->motion_mode != b1[0]->motion_mode || b0[0]->motion_mode != b2[0]->motion_mode || b0[0]->motion_mode != b3[0]->motion_mode || b0[0]->motion_mode != SIMPLE_TRANSLATION); if (unsupported_motion_mode) return; const int diffent_filter = (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int || b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int || b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int); if (diffent_filter) return; const int different_seg = (b0[0]->segment_id != b1[0]->segment_id || b0[0]->segment_id != b2[0]->segment_id || b0[0]->segment_id != b3[0]->segment_id); if (different_seg) return; // Evaluate the ref_mv. MB_MODE_INFO **this_mi = mib; BLOCK_SIZE orig_bsize = this_mi[0]->bsize; const PARTITION_TYPE orig_partition = this_mi[0]->partition; this_mi[0]->bsize = bsize; this_mi[0]->partition = PARTITION_NONE; this_mi[0]->skip_txfm = 1; // TODO(yunqing): functions called below can be optimized by // removing unrelated operations. av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0]; int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES]; struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; int force_skip_low_temp_var = 0; int skip_pred_mv = 0; bool use_scaled_ref; for (int i = 0; i < MB_MODE_COUNT; ++i) { for (int j = 0; j < REF_FRAMES; ++j) { frame_mv[i][j].as_int = INVALID_MV; } } av1_copy(x->color_sensitivity, x->color_sensitivity_sb); skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 && x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2); find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize, force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); int continue_merging = 1; if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row || frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col) continue_merging = 0; if (!continue_merging) { this_mi[0]->bsize = orig_bsize; this_mi[0]->partition = orig_partition; // TODO(yunqing): Store the results and restore here instead of // calling find_predictors() again. 
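// If the merged block's NEARESTMV candidate does not reproduce the
// sub-blocks' motion vector, the merge is abandoned: the original block size
// and partition are restored, and the offsets and predictors are recomputed
// for that restored size.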
av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, mi_col, this_mi[0]->bsize); find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize, force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); } else { struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); const int is_scaled = av1_is_scaled(sf); const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) || (abs(this_mi[0]->mv[0].as_mv.col) % 8); const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) || (abs(this_mi[0]->mv[0].as_mv.col) % 16); if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) { const int num_planes = av1_num_planes(cm); set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]); const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col, xd->block_ref_scale_factors[0], num_planes); if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) { assert(is_uv_subpel_mv == 1); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1, num_planes - 1); } else { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, num_planes - 1); } } // Copy out mbmi_ext information. MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame; av1_copy_mbmi_ext_to_mbmi_ext_frame( mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame)); const BLOCK_SIZE this_subsize = get_partition_subsize(bsize, this_mi[0]->partition); // Update partition contexts. update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize, this_mi[0]->partition); const int num_planes = av1_num_planes(cm); av1_reset_entropy_context(xd, bsize, num_planes); // Note: use x->txfm_search_params.tx_mode_search_type instead of // cm->features.tx_mode here. TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type); if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4; this_mi[0]->tx_size = tx_size; memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size, sizeof(this_mi[0]->inter_tx_size)); // Update txfm contexts. xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height, this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd); // Update mi for this partition block. for (int y = 0; y < bs; y++) { for (int x_idx = 0; x_idx < bs; x_idx++) { this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0]; } } } } /*!\brief AV1 block partition application (minimal RD search). * * \ingroup partition_search * \callgraph * \callergraph * Encode the block by applying pre-calculated partition patterns that are * represented by coding block sizes stored in the mbmi array. The only * partition adjustment allowed is merging leaf split nodes if it leads to a * lower rd cost. The partition types are limited to a basic set: none, horz, * vert, and split. This function is only used in the real-time mode. 
* * \param[in] cpi Top-level encoder structure * \param[in] td Pointer to thread data * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] mib Array representing MB_MODE_INFO pointers for mi blocks starting from the first pixel of the current block * \param[in] tp Pointer to the starting token * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of MI_SIZE * \param[in] bsize Current block size * \param[in] pc_tree Pointer to the PC_TREE node holding the picked partitions and mode info for the current block * * \remark Nothing is returned. The pc_tree struct is modified to store the * picked partition and modes. */ void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const ModeCosts *mode_costs = &x->mode_costs; // Only square blocks from 8x8 to 128x128 are supported assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); const int bs = mi_size_wide[bsize]; const int hbs = bs / 2; PARTITION_TYPE partition = (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) : PARTITION_NONE; BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); assert(subsize <= BLOCK_LARGEST); const int pl = (bsize >= BLOCK_8X8) ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; RD_STATS dummy_cost; av1_invalid_rd_stats(&dummy_cost); if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; assert(mi_size_wide[bsize] == mi_size_high[bsize]); xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd; int change_none_to_split = 0; if (partition == PARTITION_NONE && cpi->sf.rt_sf.nonrd_check_partition_split == 1) { change_none_to_split = try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params, mi_row, mi_col, bsize, pl, pc_tree); if (change_none_to_split) { partition = PARTITION_SPLIT; subsize = get_partition_subsize(bsize, partition); assert(subsize <= BLOCK_LARGEST); } } pc_tree->partitioning = partition; switch (partition) { case PARTITION_NONE: if (!pc_tree->none) { pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->none); } pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize, pc_tree->none); encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition, pc_tree->none, NULL); break; case PARTITION_VERT: for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (!pc_tree->vertical[i]) { pc_tree->vertical[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->vertical[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->vertical[i]); } } pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, subsize, pc_tree->vertical[0]); 
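// Code the left half first; the right half is searched and coded only when
// it lies inside the frame and the block is larger than 8x8.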
encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, PARTITION_VERT, pc_tree->vertical[0], NULL); if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) { pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost, subsize, pc_tree->vertical[1]); encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize, PARTITION_VERT, pc_tree->vertical[1], NULL); } break; case PARTITION_HORZ: for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (!pc_tree->horizontal[i]) { pc_tree->horizontal[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!pc_tree->horizontal[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } else { av1_reset_pmc(pc_tree->horizontal[i]); } } pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, subsize, pc_tree->horizontal[0]); encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, PARTITION_HORZ, pc_tree->horizontal[0], NULL); if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) { pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost, subsize, pc_tree->horizontal[1]); encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize, PARTITION_HORZ, pc_tree->horizontal[1], NULL); } break; case PARTITION_SPLIT: for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { if (!pc_tree->split[i]) { pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); } pc_tree->split[i]->index = i; } if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode && av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) && !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree, partition, subsize, pl); } else { for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { int x_idx = (i & 1) * hbs; int y_idx = (i >> 1) * hbs; int jj = i >> 1, ii = i & 0x01; if ((mi_row + y_idx >= mi_params->mi_rows) || (mi_col + x_idx >= mi_params->mi_cols)) continue; av1_nonrd_use_partition( cpi, td, tile_data, mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]); } if (!change_none_to_split) { // Note: Palette, cfl are not supported. if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf && cpi->sf.rt_sf.partition_direct_merging && mode_costs->partition_cost[pl][PARTITION_NONE] < mode_costs->partition_cost[pl][PARTITION_SPLIT] && (mi_row + bs <= mi_params->mi_rows) && (mi_col + bs <= mi_params->mi_cols)) { direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col, bsize); } } } break; case PARTITION_VERT_A: case PARTITION_VERT_B: case PARTITION_HORZ_A: case PARTITION_HORZ_B: case PARTITION_HORZ_4: case PARTITION_VERT_4: assert(0 && "Cannot handle extended partition types"); default: assert(0); break; } } #if !CONFIG_REALTIME_ONLY // Try searching for an encoding for the given subblock. Returns zero if the // rdcost is already too high (to tell the caller not to bother searching for // encodings of further subblocks). 
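// The caller's best rd cost minus the cost already accumulated in sum_rdc is
// handed to pick_sb_modes() as the remaining rd budget, which lets the
// sub-block mode search terminate early.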
static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int is_last, int mi_row, int mi_col, BLOCK_SIZE subsize, RD_STATS best_rdcost, RD_STATS *sum_rdc, PARTITION_TYPE partition, PICK_MODE_CONTEXT *this_ctx) { MACROBLOCK *const x = &td->mb; const int orig_mult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL); av1_rd_cost_update(x->rdmult, &best_rdcost); RD_STATS rdcost_remaining; av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining); RD_STATS this_rdc; pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition, subsize, this_ctx, rdcost_remaining); if (this_rdc.rate == INT_MAX) { sum_rdc->rdcost = INT64_MAX; } else { sum_rdc->rate += this_rdc.rate; sum_rdc->dist += this_rdc.dist; av1_rd_cost_update(x->rdmult, sum_rdc); } if (sum_rdc->rdcost >= best_rdcost.rdcost) { x->rdmult = orig_mult; return 0; } if (!is_last) { av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); } x->rdmult = orig_mult; return 1; } // Tests an AB partition, and updates the encoder status, the pick mode // contexts, the best rdcost, and the best partition. static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, PC_TREE *pc_tree, RD_STATS *best_rdc, int64_t *this_rdcost, PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB], int mi_row, int mi_col, BLOCK_SIZE bsize, PARTITION_TYPE partition, const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], const int ab_mi_pos[SUB_PARTITIONS_AB][2], const MB_MODE_INFO **mode_cache) { MACROBLOCK *const x = &td->mb; const MACROBLOCKD *const xd = &x->e_mbd; const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); RD_STATS sum_rdc; av1_init_rd_stats(&sum_rdc); sum_rdc.rate = x->mode_costs.partition_cost[pl][partition]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); // Loop over sub-partitions in AB partition type. 
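// Each AB sub-block may reuse a cached winner mode (mode_cache) from an
// earlier partition search; the cache pointers are cleared again right after
// each sub-block is searched.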
for (int i = 0; i < SUB_PARTITIONS_AB; i++) { if (mode_cache && mode_cache[i]) { x->use_mb_mode_cache = 1; x->mb_mode_cache = mode_cache[i]; } const int mode_search_success = rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1, ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i], *best_rdc, &sum_rdc, partition, ctxs[i]); x->use_mb_mode_cache = 0; x->mb_mode_cache = NULL; if (!mode_search_success) { return false; } } av1_rd_cost_update(x->rdmult, &sum_rdc); *this_rdcost = sum_rdc.rdcost; if (sum_rdc.rdcost >= best_rdc->rdcost) return false; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); *this_rdcost = sum_rdc.rdcost; if (sum_rdc.rdcost >= best_rdc->rdcost) return false; *best_rdc = sum_rdc; pc_tree->partitioning = partition; return true; } #if CONFIG_COLLECT_PARTITION_STATS static void init_partition_block_timing_stats( PartitionTimingStats *part_timing_stats) { av1_zero(*part_timing_stats); } static inline void start_partition_block_timer( PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) { assert(!part_timing_stats->timer_is_on); part_timing_stats->partition_attempts[partition_type] += 1; aom_usec_timer_start(&part_timing_stats->timer); part_timing_stats->timer_is_on = 1; } static inline void end_partition_block_timer( PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type, int64_t rdcost) { if (part_timing_stats->timer_is_on) { aom_usec_timer_mark(&part_timing_stats->timer); const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer); part_timing_stats->partition_times[partition_type] += time; part_timing_stats->partition_rdcost[partition_type] = rdcost; part_timing_stats->timer_is_on = 0; } } static inline void print_partition_timing_stats_with_rdcost( const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col, BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number, const RD_STATS *best_rdc, const char *filename) { FILE *f = fopen(filename, "a"); fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number, frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist, best_rdc->rdcost); for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); } for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); } for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); } for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) { fprintf(f, "%d,", -1); } else { fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]); } } fprintf(f, "\n"); fclose(f); } static inline void print_partition_timing_stats( const PartitionTimingStats *part_timing_stats, int intra_only, int show_frame, const BLOCK_SIZE bsize, const char *filename) { FILE *f = fopen(filename, "a"); fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only); for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); } for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); } for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); } fprintf(f, "\n"); fclose(f); } static inline void accumulate_partition_timing_stats( FramePartitionTimingStats *fr_part_timing_stats, const 
PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) { const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize); int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx]; int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx]; int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx]; for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { agg_attempts[idx] += part_timing_stats->partition_attempts[idx]; agg_decisions[idx] += part_timing_stats->partition_decisions[idx]; agg_times[idx] += part_timing_stats->partition_times[idx]; } } #endif // CONFIG_COLLECT_PARTITION_STATS // Initialize state variables of partition search used in // av1_rd_pick_partition(). static void init_partition_search_state_params( MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state, int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams *blk_params = &part_search_state->part_blk_params; const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; // Initialization of block size related parameters. blk_params->mi_step = mi_size_wide[bsize] / 2; blk_params->mi_row = mi_row; blk_params->mi_col = mi_col; blk_params->mi_row_edge = mi_row + blk_params->mi_step; blk_params->mi_col_edge = mi_col + blk_params->mi_step; blk_params->width = block_size_wide[bsize]; blk_params->min_partition_size_1d = block_size_wide[x->sb_enc.min_partition_size]; blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT); blk_params->split_bsize2 = blk_params->subsize; blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8); blk_params->bsize = bsize; // Check if the partition corresponds to edge block. blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows); blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols); // Update intra partitioning related info. part_search_state->intra_part_info = &x->part_search_info; // Prepare for segmentation CNN-based partitioning for intra-frame. if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { part_search_state->intra_part_info->quad_tree_idx = 0; part_search_state->intra_part_info->cnn_output_valid = 0; } // Set partition plane context index. part_search_state->pl_ctx_idx = blk_params->bsize_at_least_8x8 ? partition_plane_context(xd, mi_row, mi_col, bsize) : 0; // Partition cost buffer update ModeCosts *mode_costs = &x->mode_costs; part_search_state->partition_cost = mode_costs->partition_cost[part_search_state->pl_ctx_idx]; // Initialize HORZ and VERT win flags as true for all split partitions. for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true; part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true; } // Initialize the rd cost. av1_init_rd_stats(&part_search_state->this_rdc); // Initialize RD costs for partition types to 0. part_search_state->none_rd = 0; av1_zero(part_search_state->split_rd); av1_zero(part_search_state->rect_part_rd); // Initialize SPLIT partition to be not ready. av1_zero(part_search_state->is_split_ctx_is_ready); // Initialize HORZ and VERT partitions to be not ready. av1_zero(part_search_state->is_rect_ctx_is_ready); // Chroma subsampling. part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x; part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y; // Initialize partition search flags to defaults. 
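// By default, square split is considered for blocks of at least 8x8, and
// rectangular partitions additionally require enable_rect_partitions in the
// encoder configuration; the per-direction allowed flags below also verify
// that the corresponding chroma block size remains valid under subsampling.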
part_search_state->terminate_partition_search = 0; part_search_state->do_square_split = blk_params->bsize_at_least_8x8; part_search_state->do_rectangular_split = cpi->oxcf.part_cfg.enable_rect_partitions && blk_params->bsize_at_least_8x8; av1_zero(part_search_state->prune_rect_part); // Initialize allowed partition types for the partition block. part_search_state->partition_none_allowed = av1_blk_has_rows_and_cols(blk_params); part_search_state->partition_rect_allowed[HORZ] = part_search_state->do_rectangular_split && blk_params->has_cols && get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; part_search_state->partition_rect_allowed[VERT] = part_search_state->do_rectangular_split && blk_params->has_rows && get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; // Reset the flag indicating whether a partition leading to a rdcost lower // than the bound best_rdc has been found. part_search_state->found_best_partition = false; #if CONFIG_COLLECT_PARTITION_STATS init_partition_block_timing_stats(&part_search_state->part_timing_stats); #endif // CONFIG_COLLECT_PARTITION_STATS } // Override partition cost buffer for the edge blocks. static void set_partition_cost_for_edge_blk( AV1_COMMON const *cm, PartitionSearchState *part_search_state) { PartitionBlkParams blk_params = part_search_state->part_blk_params; assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0); const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[part_search_state->pl_ctx_idx]; const int max_cost = av1_cost_symbol(0); for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i) part_search_state->tmp_partition_cost[i] = max_cost; if (blk_params.has_cols) { // At the bottom, the two possibilities are HORZ and SPLIT. aom_cdf_prob bot_cdf[2]; partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize); static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf, bot_inv_map); } else if (blk_params.has_rows) { // At the right, the two possibilities are VERT and SPLIT. aom_cdf_prob rhs_cdf[2]; partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize); static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf, rhs_inv_map); } else { // At the bottom right, we always split. part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0; } // Override the partition cost buffer. part_search_state->partition_cost = part_search_state->tmp_partition_cost; } // Reset the partition search state flags when // must_find_valid_partition is equal to 1. 
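// Re-opens enough of the search space that at least one legal partition
// remains: PARTITION_NONE, square split and the rectangular partitions are
// re-allowed subject only to the block fitting inside the frame, the minimum
// partition size and chroma validity, and terminate_partition_search is
// cleared.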
static inline void reset_part_limitations( AV1_COMP *const cpi, PartitionSearchState *part_search_state) { PartitionBlkParams blk_params = part_search_state->part_blk_params; const int is_rect_part_allowed = blk_params.bsize_at_least_8x8 && cpi->oxcf.part_cfg.enable_rect_partitions && (blk_params.width > blk_params.min_partition_size_1d); part_search_state->do_square_split = blk_params.bsize_at_least_8x8 && (blk_params.width > blk_params.min_partition_size_1d); part_search_state->partition_none_allowed = av1_blk_has_rows_and_cols(&blk_params) && (blk_params.width >= blk_params.min_partition_size_1d); part_search_state->partition_rect_allowed[HORZ] = blk_params.has_cols && is_rect_part_allowed && get_plane_block_size( get_partition_subsize(blk_params.bsize, PARTITION_HORZ), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; part_search_state->partition_rect_allowed[VERT] = blk_params.has_rows && is_rect_part_allowed && get_plane_block_size( get_partition_subsize(blk_params.bsize, PARTITION_VERT), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; part_search_state->terminate_partition_search = 0; } // Rectangular partitions evaluation at sub-block level. static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, PICK_MODE_CONTEXT *cur_partition_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, const int idx, int mi_row, int mi_col, BLOCK_SIZE bsize, PARTITION_TYPE partition_type) { // Obtain the remainder from the best rd cost // for further processing of partition. RD_STATS best_remain_rdcost; av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc, &best_remain_rdcost); // Obtain the best mode for the partition sub-block. pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc, partition_type, bsize, cur_partition_ctx, best_remain_rdcost); av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc); // Update the partition rd cost with the current sub-block rd. if (part_search_state->this_rdc.rate == INT_MAX) { part_search_state->sum_rdc.rdcost = INT64_MAX; } else { part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate; part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist; av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc); } const RECT_PART_TYPE rect_part = partition_type == PARTITION_HORZ ? HORZ : VERT; part_search_state->rect_part_rd[rect_part][idx] = part_search_state->this_rdc.rdcost; } typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step); // Checks if HORZ / VERT partition search is allowed. 
static inline int is_rect_part_allowed( const AV1_COMP *cpi, const PartitionSearchState *part_search_state, const active_edge_info *active_edge, RECT_PART_TYPE rect_part, const int mi_pos) { const PartitionBlkParams *blk_params = &part_search_state->part_blk_params; const int is_part_allowed = (!part_search_state->terminate_partition_search && part_search_state->partition_rect_allowed[rect_part] && !part_search_state->prune_rect_part[rect_part] && (part_search_state->do_rectangular_split || active_edge[rect_part](cpi, mi_pos, blk_params->mi_step))); return is_part_allowed; } static void rectangular_partition_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type, const RECT_PART_TYPE end_type) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS *sum_rdc = &part_search_state->sum_rdc; const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ, PARTITION_VERT }; // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row postion of // HORZ and VERT partition types. // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col postion of // HORZ and VERT partition types. const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = { { { blk_params.mi_row, blk_params.mi_col }, { blk_params.mi_row_edge, blk_params.mi_col } }, { { blk_params.mi_row, blk_params.mi_col }, { blk_params.mi_row, blk_params.mi_col_edge } } }; // Initialize active edge_type function pointer // for HOZR and VERT partition types. active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge, av1_active_v_edge }; // Indicates edge blocks for HORZ and VERT partition types. const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows, blk_params.has_cols }; // Initialize pc tree context for HORZ and VERT partition types. PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = { { &pc_tree->horizontal[0], &pc_tree->horizontal[1] }, { &pc_tree->vertical[0], &pc_tree->vertical[1] } }; // Loop over rectangular partition types. for (RECT_PART_TYPE i = start_type; i <= end_type; i++) { assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part_search_state->partition_rect_allowed[i])); // Check if the HORZ / VERT partition search is to be performed. if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i, mi_pos_rect[i][0][i])) continue; // Sub-partition idx. 
int sub_part_idx = 0; PARTITION_TYPE partition_type = rect_partition_type[i]; blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type); assert(blk_params.subsize <= BLOCK_LARGEST); av1_init_rd_stats(sum_rdc); for (int j = 0; j < SUB_PARTITIONS_RECT; j++) { if (cur_ctx[i][j][0] == NULL) { cur_ctx[i][j][0] = av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf); if (!cur_ctx[i][j][0]) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } } sum_rdc->rate = part_search_state->partition_cost[partition_type]; sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0); #if CONFIG_COLLECT_PARTITION_STATS PartitionTimingStats *part_timing_stats = &part_search_state->part_timing_stats; if (best_rdc->rdcost - sum_rdc->rdcost >= 0) { start_partition_block_timer(part_timing_stats, partition_type); } #endif // First sub-partition evaluation in HORZ / VERT partition type. rd_pick_rect_partition( cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, best_rdc, 0, mi_pos_rect[i][sub_part_idx][0], mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); // Start of second sub-partition evaluation. // Evaluate second sub-partition if the first sub-partition cost // is less than the best cost and if it is not an edge block. if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) { const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted. if (pmi->palette_size[PLANE_TYPE_Y] == 0 && pmi->palette_size[PLANE_TYPE_UV] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) part_search_state->is_rect_ctx_is_ready[i] = 1; } av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row, blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, blk_params.subsize, NULL); // Second sub-partition evaluation in HORZ / VERT partition type. sub_part_idx = 1; rd_pick_rect_partition( cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, best_rdc, 1, mi_pos_rect[i][sub_part_idx][0], mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); } // Update HORZ / VERT best partition. if (sum_rdc->rdcost < best_rdc->rdcost) { sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist); if (sum_rdc->rdcost < best_rdc->rdcost) { *best_rdc = *sum_rdc; part_search_state->found_best_partition = true; pc_tree->partitioning = partition_type; } } else { // Update HORZ / VERT win flag. if (rect_part_win_info != NULL) rect_part_win_info->rect_part_win[i] = false; } #if CONFIG_COLLECT_PARTITION_STATS if (part_timing_stats->timer_is_on) { end_partition_block_timer(part_timing_stats, partition_type, sum_rdc->rdcost); } #endif av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col, blk_params.bsize, av1_num_planes(cm)); } } // AB partition type evaluation. 
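// rd_test_partition3() evaluates the three sub-blocks of the given AB
// partition type, reusing any predictions passed in through mode_cache, and
// reports whether it improved on best_rdc; the macroblock context is
// restored afterwards so that later candidates start from an identical state.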
static void rd_pick_ab_part( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB], PartitionSearchState *part_search_state, RD_STATS *best_rdc, const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type, const MB_MODE_INFO **mode_cache) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; int64_t this_rdcost = 0; #if CONFIG_COLLECT_PARTITION_STATS PartitionTimingStats *part_timing_stats = &part_search_state->part_timing_stats; { RD_STATS tmp_sum_rdc; av1_init_rd_stats(&tmp_sum_rdc); tmp_sum_rdc.rate = part_search_state->partition_cost[part_type]; tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) { start_partition_block_timer(part_timing_stats, part_type); } } #endif // Test this partition and update the best partition. const bool find_best_ab_part = rd_test_partition3( cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row, mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache); part_search_state->found_best_partition |= find_best_ab_part; #if CONFIG_COLLECT_PARTITION_STATS if (part_timing_stats->timer_is_on) { if (!find_best_ab_part) this_rdcost = INT64_MAX; end_partition_block_timer(part_timing_stats, part_type, this_rdcost); } #endif av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } // Set mode search context. static inline void set_mode_search_ctx( PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2], PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) { mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0]; mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0]; if (is_ctx_ready[HORZ_A][0]) mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none; if (is_ctx_ready[VERT_A][0]) mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none; if (is_ctx_ready[HORZ_A][1]) mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none; } static inline void copy_partition_mode_from_mode_context( const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) { if (ctx && ctx->rd_stats.rate < INT_MAX) { *dst_mode = &ctx->mic; } else { *dst_mode = NULL; } } static inline void copy_partition_mode_from_pc_tree( const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) { if (pc_tree) { copy_partition_mode_from_mode_context(dst_mode, pc_tree->none); } else { *dst_mode = NULL; } } static inline void set_mode_cache_for_partition_ab( const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree, AB_PART_TYPE ab_part_type) { switch (ab_part_type) { case HORZ_A: copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); copy_partition_mode_from_mode_context(&mode_cache[2], pc_tree->horizontal[1]); break; case HORZ_B: copy_partition_mode_from_mode_context(&mode_cache[0], pc_tree->horizontal[0]); copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); break; case VERT_A: copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); copy_partition_mode_from_mode_context(&mode_cache[2], pc_tree->vertical[1]); break; case VERT_B: 
copy_partition_mode_from_mode_context(&mode_cache[0], pc_tree->vertical[0]); copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); break; default: assert(0 && "Invalid ab partition type!\n"); } } // AB Partitions type search. static void ab_partitions_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PC_TREE *pc_tree, PartitionSearchState *part_search_state, RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, int pb_source_variance, int ext_partition_allowed, const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) { PartitionBlkParams blk_params = part_search_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; if (part_search_state->terminate_partition_search) { return; } int ab_partitions_allowed[NUM_AB_PARTS]; // Prune AB partitions av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost, rect_part_win_info, ext_partition_allowed, part_search_state, ab_partitions_allowed); // Flags to indicate whether the mode search is done. const int is_ctx_ready[NUM_AB_PARTS][2] = { { part_search_state->is_split_ctx_is_ready[0], part_search_state->is_split_ctx_is_ready[1] }, { part_search_state->is_rect_ctx_is_ready[HORZ], 0 }, { part_search_state->is_split_ctx_is_ready[0], 0 }, { part_search_state->is_rect_ctx_is_ready[VERT], 0 } }; // Current partition context. PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala, pc_tree->horizontalb, pc_tree->verticala, pc_tree->verticalb }; // Context of already evaluted partition types. PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]; // Set context of already evaluted partition types. set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx); // Array of sub-partition size of AB partition types. const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = { { blk_params.split_bsize2, blk_params.split_bsize2, get_partition_subsize(bsize, PARTITION_HORZ_A) }, { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2, blk_params.split_bsize2 }, { blk_params.split_bsize2, blk_params.split_bsize2, get_partition_subsize(bsize, PARTITION_VERT_A) }, { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2, blk_params.split_bsize2 } }; // Array of mi_row, mi_col positions corresponds to each sub-partition in AB // partition types. const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = { { { mi_row, mi_col }, { mi_row, blk_params.mi_col_edge }, { blk_params.mi_row_edge, mi_col } }, { { mi_row, mi_col }, { blk_params.mi_row_edge, mi_col }, { blk_params.mi_row_edge, blk_params.mi_col_edge } }, { { mi_row, mi_col }, { blk_params.mi_row_edge, mi_col }, { mi_row, blk_params.mi_col_edge } }, { { mi_row, mi_col }, { mi_row, blk_params.mi_col_edge }, { blk_params.mi_row_edge, blk_params.mi_col_edge } } }; // Loop over AB partition types. for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type; ab_part_type++) { const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A; // Check if the AB partition search is to be performed. if (!ab_partitions_allowed[ab_part_type]) { continue; } blk_params.subsize = get_partition_subsize(bsize, part_type); for (int i = 0; i < SUB_PARTITIONS_AB; i++) { // Set AB partition context. 
cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc( cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf); if (!cur_part_ctxs[ab_part_type][i]) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); // Set mode as not ready. cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0; } if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) { // We can copy directly the mode search results if we have already // searched the current block and the contexts match. if (is_ctx_ready[ab_part_type][0]) { av1_copy_tree_context(cur_part_ctxs[ab_part_type][0], mode_srch_ctx[ab_part_type][0][0]); cur_part_ctxs[ab_part_type][0]->mic.partition = part_type; cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1; if (is_ctx_ready[ab_part_type][1]) { av1_copy_tree_context(cur_part_ctxs[ab_part_type][1], mode_srch_ctx[ab_part_type][1][0]); cur_part_ctxs[ab_part_type][1]->mic.partition = part_type; cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1; } } } // Even if the contexts don't match, we can still speed up by reusing the // previous prediction mode. const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL }; if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) { set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type); } // Evaluation of AB partition type. rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree, cur_part_ctxs[ab_part_type], part_search_state, best_rdc, ab_subsize[ab_part_type], ab_mi_pos[ab_part_type], part_type, mode_cache); } } // Set mi positions for HORZ4 / VERT4 sub-block partitions. static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES], int mi_pos[SUB_PARTITIONS_PART4][2], const int mi_row, const int mi_col) { for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) { mi_pos[i][0] = mi_row + i * inc_step[HORZ4]; mi_pos[i][1] = mi_col + i * inc_step[VERT4]; } } // Set context and RD cost for HORZ4 / VERT4 partition types. static void set_4_part_ctx_and_rdcost( MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], PartitionSearchState *part_search_state, PARTITION_TYPE partition_type, BLOCK_SIZE bsize) { // Initialize sum_rdc RD cost structure. av1_init_rd_stats(&part_search_state->sum_rdc); const int subsize = get_partition_subsize(bsize, partition_type); part_search_state->sum_rdc.rate = part_search_state->partition_cost[partition_type]; part_search_state->sum_rdc.rdcost = RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0); for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); if (!cur_part_ctx[i]) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } } // Partition search of HORZ4 / VERT4 partition types. static void rd_pick_4partition( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], PartitionSearchState *part_search_state, RD_STATS *best_rdc, const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; // mi positions needed for HORZ4 and VERT4 partition types. 
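// mi_pos_check holds the frame size in mi units; the evaluation loop below
// stops at the first sub-block whose start position lies outside the frame.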
int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows, cm->mi_params.mi_cols }; const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4); int mi_pos[SUB_PARTITIONS_PART4][2]; blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type); // Set partition context and RD cost. set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state, partition_type, blk_params.bsize); // Set mi positions for sub-block sizes. set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col); #if CONFIG_COLLECT_PARTITION_STATS PartitionTimingStats *part_timing_stats = &part_search_state->part_timing_stats; if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) { start_partition_block_timer(part_timing_stats, partition_type); } #endif // Loop over sub-block partitions. for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break; // Sub-block evaluation of Horz4 / Vert4 partition type. cur_part_ctx[i]->rd_mode_is_ready = 0; if (!rd_try_subblock( cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1), mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc, &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) { av1_invalid_rd_stats(&part_search_state->sum_rdc); break; } } // Calculate the total cost and update the best partition. av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc); if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) { *best_rdc = part_search_state->sum_rdc; part_search_state->found_best_partition = true; pc_tree->partitioning = partition_type; } #if CONFIG_COLLECT_PARTITION_STATS if (part_timing_stats->timer_is_on) { end_partition_block_timer(part_timing_stats, partition_type, part_search_state->sum_rdc.rdcost); } #endif av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col, blk_params.bsize, av1_num_planes(cm)); } // Do not evaluate extended partitions if NONE partition is skippable. static inline int prune_ext_part_none_skippable( PICK_MODE_CONTEXT *part_none, int must_find_valid_partition, int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) { if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) { if (part_none->skippable && !must_find_valid_partition && bsize >= BLOCK_16X16) { return 1; } } return 0; } // Allow ab partition search static int allow_ab_partition_search(PartitionSearchState *part_search_state, PARTITION_SPEED_FEATURES *part_sf, PARTITION_TYPE curr_best_part, int must_find_valid_partition, int prune_ext_part_state, int64_t best_rdcost) { const PartitionBlkParams blk_params = part_search_state->part_blk_params; const BLOCK_SIZE bsize = blk_params.bsize; // Do not prune if there is no valid partition if (best_rdcost == INT64_MAX) return 1; // Determine bsize threshold to evaluate ab partitions BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh; if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition && !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT)) ab_bsize_thresh = BLOCK_128X128; // ab partitions are only allowed for square block sizes BLOCK_16X16 or // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and // BLOCK_8X8. 
assert(ab_bsize_thresh >= BLOCK_8X8); int ab_partition_allowed = part_search_state->do_rectangular_split && bsize > ab_bsize_thresh && av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state; return ab_partition_allowed; } // Prune 4-way partitions based on the number of horz/vert wins // in the current block and sub-blocks in PARTITION_SPLIT. static void prune_4_partition_using_split_info( AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state, int part4_search_allowed[NUM_PART4_TYPES]) { PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 }; // Count of child blocks in which HORZ or VERT partition has won int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 }; // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of // split partiitons. // Conservative pruning for high quantizers. const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3); for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) { if (!(cpi->sf.part_sf.prune_ext_part_using_split_info && part4_search_allowed[cur_part[i]])) continue; // Loop over split partitions. // Get rectangular partitions winner info of split partitions. for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++) num_child_rect_win[i] += (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1 : 0; if (num_child_rect_win[i] < num_win_thresh) { part4_search_allowed[cur_part[i]] = 0; } } } // Prune 4-way partition search. static void prune_4_way_partition_search( AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, PartitionSearchState *part_search_state, RD_STATS *best_rdc, int pb_source_variance, int prune_ext_part_state, int part4_search_allowed[NUM_PART4_TYPES]) { const PartitionBlkParams blk_params = part_search_state->part_blk_params; const BLOCK_SIZE bsize = blk_params.bsize; // Do not prune if there is no valid partition if (best_rdc->rdcost == INT64_MAX) return; // Determine bsize threshold to evaluate 4-way partitions BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh; if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best && !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE) part4_bsize_thresh = BLOCK_128X128; // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude // BLOCK_4X4 and BLOCK_8X8. assert(part4_bsize_thresh >= BLOCK_8X8); bool partition4_allowed = part_search_state->do_rectangular_split && bsize > part4_bsize_thresh && av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state; // Disable 4-way partition search flags for width less than a multiple of the // minimum partition width. if (blk_params.width < (blk_params.min_partition_size_1d << cpi->sf.part_sf.prune_part4_search)) { part4_search_allowed[HORZ4] = 0; part4_search_allowed[VERT4] = 0; return; } PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4, PARTITION_VERT_4 }; const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg; // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or // PARTITION_VERT_4 for this block. This is almost the same as // partition4_allowed, except that we don't allow 128x32 or 32x128 // blocks, so we require that bsize is not BLOCK_128X128. 
partition4_allowed &= part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128; for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) { part4_search_allowed[i] = partition4_allowed && part_search_state->partition_rect_allowed[i] && get_plane_block_size(get_partition_subsize(bsize, cur_part[i]), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; } // Pruning: pruning out 4-way partitions based on the current best partition. if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) { part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ || pc_tree->partitioning == PARTITION_HORZ_A || pc_tree->partitioning == PARTITION_HORZ_B || pc_tree->partitioning == PARTITION_SPLIT || pc_tree->partitioning == PARTITION_NONE); part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT || pc_tree->partitioning == PARTITION_VERT_A || pc_tree->partitioning == PARTITION_VERT_B || pc_tree->partitioning == PARTITION_SPLIT || pc_tree->partitioning == PARTITION_NONE); } // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of // sub-blocks from basic partition types. if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed && part_search_state->partition_rect_allowed[HORZ] && part_search_state->partition_rect_allowed[VERT]) { av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost, part_search_state, part4_search_allowed, pb_source_variance); } // Pruning: pruning out 4-way partitions based on the number of horz/vert wins // in the current block and sub-blocks in PARTITION_SPLIT. prune_4_partition_using_split_info(cpi, x, part_search_state, part4_search_allowed); } // Set params needed for PARTITION_NONE search. static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td, MACROBLOCK *x, PC_TREE *pc_tree, PartitionSearchState *part_search_state, RD_STATS *best_remain_rdcost, RD_STATS *best_rdc, int *pt_cost) { PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS partition_rdcost; // Set PARTITION_NONE context. if (pc_tree->none == NULL) pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); // Set PARTITION_NONE type cost. if (part_search_state->partition_none_allowed) { if (blk_params.bsize_at_least_8x8) { *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX ? part_search_state->partition_cost[PARTITION_NONE] : 0; } // Initialize the RD stats structure. av1_init_rd_stats(&partition_rdcost); partition_rdcost.rate = *pt_cost; av1_rd_cost_update(x->rdmult, &partition_rdcost); av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost, best_remain_rdcost); } } // Skip other partitions based on PARTITION_NONE rd cost. 
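// Two breakout checks are applied when the PARTITION_NONE block is skippable
// (an optional ML breakout model and distortion/rate thresholds scaled by
// the block size); independently, a simple-motion-search based model may
// terminate the search at PARTITION_NONE for inter frames.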
static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, PICK_MODE_CONTEXT *ctx_none, PartitionSearchState *part_search_state, RD_STATS *best_rdc, unsigned int *pb_source_variance) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS *this_rdc = &part_search_state->this_rdc; const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); if (!frame_is_intra_only(cm) && (part_search_state->do_square_split || part_search_state->do_rectangular_split) && !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { const int use_ml_based_breakout = bsize <= cpi->sf.part_sf.use_square_partition_only_threshold && bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1; if (use_ml_based_breakout) { av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd, part_search_state); } // Adjust dist breakout threshold according to the partition size. const int64_t dist_breakout_thr = cpi->sf.part_sf.partition_search_breakout_dist_thr >> ((2 * (MAX_SB_SIZE_LOG2 - 2)) - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); const int rate_breakout_thr = cpi->sf.part_sf.partition_search_breakout_rate_thr * num_pels_log2_lookup[bsize]; // If all y, u, v transform blocks in this partition are skippable, // and the dist & rate are within the thresholds, the partition // search is terminated for current branch of the partition search // tree. The dist & rate thresholds are set to 0 at speed 0 to // disable the early termination at that speed. if (best_rdc->dist < dist_breakout_thr && best_rdc->rate < rate_breakout_thr) { part_search_state->do_square_split = 0; part_search_state->do_rectangular_split = 0; } } // Early termination: using simple_motion_search features and the // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a // decision on early terminating at PARTITION_NONE. if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame && !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 && av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX && this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX && this_rdc->rate >= 0 && (part_search_state->do_square_split || part_search_state->do_rectangular_split)) { av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc, part_search_state); } } // Decide early termination and rectangular partition pruning // based on PARTITION_NONE and PARTITION_SPLIT costs. static void prune_partitions_after_split( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, PartitionSearchState *part_search_state, RD_STATS *best_rdc, int64_t part_none_rd, int64_t part_split_rd) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); // Early termination: using the rd costs of PARTITION_NONE and subblocks // from PARTITION_SPLIT to determine an early breakout. 
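// When this speed feature is enabled, av1_ml_early_term_after_split() may
// set terminate_partition_search, in which case the rectangular and
// extended partition types are skipped for this block.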
if (cpi->sf.part_sf.ml_early_term_after_part_split_level && !frame_is_intra_only(cm) && !part_search_state->terminate_partition_search && part_search_state->do_rectangular_split && (part_search_state->partition_rect_allowed[HORZ] || part_search_state->partition_rect_allowed[VERT])) { av1_ml_early_term_after_split( cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd, part_search_state->split_rd, part_search_state); } // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT // to prune out rectangular partitions in some directions. if (!cpi->sf.part_sf.ml_early_term_after_part_split_level && cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) && (part_search_state->partition_rect_allowed[HORZ] || part_search_state->partition_rect_allowed[VERT]) && !(part_search_state->prune_rect_part[HORZ] || part_search_state->prune_rect_part[VERT]) && !part_search_state->terminate_partition_search) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm), bsize); av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost, part_search_state->none_rd, part_search_state->split_rd, part_search_state); } } // Returns true if either of the left and top neighbor blocks is larger than // the current block; false otherwise. static inline bool is_neighbor_blk_larger_than_cur_blk(const MACROBLOCKD *xd, BLOCK_SIZE bsize) { const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); if (xd->left_available) { const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; if (block_size_high[left_bsize] * block_size_wide[left_bsize] > cur_blk_area) return true; } if (xd->up_available) { const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize; if (block_size_high[above_bsize] * block_size_wide[above_bsize] > cur_blk_area) return true; } return false; } static inline void prune_rect_part_using_none_pred_mode( const MACROBLOCKD *xd, PartitionSearchState *part_state, PREDICTION_MODE mode, BLOCK_SIZE bsize) { if (mode == DC_PRED || mode == SMOOTH_PRED) { // If the prediction mode of NONE partition is either DC_PRED or // SMOOTH_PRED, it indicates that the current block has less variation. In // this case, HORZ and VERT partitions are pruned if at least one of left // and top neighbor blocks is larger than the current block. if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) { part_state->prune_rect_part[HORZ] = 1; part_state->prune_rect_part[VERT] = 1; } } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) { // If the prediction mode chosen by NONE partition is close to 90 degrees, // it implies a dominant vertical pattern, and the chance of choosing a // vertical rectangular partition is high. Hence, horizontal partition is // pruned in these cases. part_state->prune_rect_part[HORZ] = 1; } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) { // If the prediction mode chosen by NONE partition is close to 180 degrees, // it implies a dominant horizontal pattern, and the chance of choosing a // horizontal rectangular partition is high. Hence, vertical partition is // pruned in these cases. part_state->prune_rect_part[VERT] = 1; } } // PARTITION_NONE search. 
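// The whole block is evaluated as a single prediction unit: the NONE
// signalling cost is added for blocks of at least 8x8, the resulting rd cost
// is recorded in none_rd / part_none_rd for later pruning decisions, and, if
// it beats best_rdc, the best partition is updated and the post-NONE pruning
// above is applied.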
static void none_partition_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x, PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS *this_rdc = &part_search_state->this_rdc; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); if (part_search_state->terminate_partition_search || !part_search_state->partition_none_allowed) return; int pt_cost = 0; RD_STATS best_remain_rdcost; av1_invalid_rd_stats(&best_remain_rdcost); // Set PARTITION_NONE context and cost. set_none_partition_params(cpi, td, x, pc_tree, part_search_state, &best_remain_rdcost, best_rdc, &pt_cost); #if CONFIG_COLLECT_PARTITION_STATS // Timer start for partition None. PartitionTimingStats *part_timing_stats = &part_search_state->part_timing_stats; if (best_remain_rdcost.rdcost >= 0) { start_partition_block_timer(part_timing_stats, PARTITION_NONE); } #endif // PARTITION_NONE evaluation and cost update. pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE, bsize, pc_tree->none, best_remain_rdcost); av1_rd_cost_update(x->rdmult, this_rdc); #if CONFIG_COLLECT_PARTITION_STATS // Timer end for partition None. if (part_timing_stats->timer_is_on) { RD_STATS tmp_rdc; av1_init_rd_stats(&tmp_rdc); if (this_rdc->rate != INT_MAX) { tmp_rdc.rate = this_rdc->rate; tmp_rdc.dist = this_rdc->dist; tmp_rdc.rdcost = this_rdc->rdcost; if (blk_params.bsize_at_least_8x8) { tmp_rdc.rate += pt_cost; tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); } } end_partition_block_timer(part_timing_stats, PARTITION_NONE, tmp_rdc.rdcost); } #endif *pb_source_variance = x->source_variance; if (none_rd) *none_rd = this_rdc->rdcost; part_search_state->none_rd = this_rdc->rdcost; if (this_rdc->rate != INT_MAX) { // Record picked ref frame to prune ref frames for other partition types. if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) { const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame); av1_update_picked_ref_frames_mask( x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col); } // Calculate the total cost and update the best partition. if (blk_params.bsize_at_least_8x8) { this_rdc->rate += pt_cost; this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist); } *part_none_rd = this_rdc->rdcost; if (this_rdc->rdcost < best_rdc->rdcost) { *best_rdc = *this_rdc; part_search_state->found_best_partition = true; if (blk_params.bsize_at_least_8x8) { pc_tree->partitioning = PARTITION_NONE; } // Disable split and rectangular partition search // based on PARTITION_NONE cost. prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none, part_search_state, best_rdc, pb_source_variance); } if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode) prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state, pc_tree->none->mic.mode, bsize); } av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } // PARTITION_SPLIT search. 
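// The four quadrants are searched recursively through av1_rd_pick_partition(),
// accumulating their rate and distortion against the running best rd cost;
// quadrants that start outside the frame are skipped, and the split is only
// adopted when all four quadrants were reached and the total cost beats
// best_rdc.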
static void split_partition_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); RD_STATS sum_rdc = part_search_state->sum_rdc; const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); // Check if partition split is allowed. if (part_search_state->terminate_partition_search || !part_search_state->do_square_split) return; for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { if (pc_tree->split[i] == NULL) pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); pc_tree->split[i]->index = i; } // Initialization of this partition RD stats. av1_init_rd_stats(&sum_rdc); sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); int idx; #if CONFIG_COLLECT_PARTITION_STATS PartitionTimingStats *part_timing_stats = &part_search_state->part_timing_stats; if (best_rdc->rdcost - sum_rdc.rdcost >= 0) { start_partition_block_timer(part_timing_stats, PARTITION_SPLIT); } #endif // Recursive partition search on 4 sub-blocks. for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost; ++idx) { const int x_idx = (idx & 1) * blk_params.mi_step; const int y_idx = (idx >> 1) * blk_params.mi_step; if (mi_row + y_idx >= mi_params->mi_rows || mi_col + x_idx >= mi_params->mi_cols) continue; pc_tree->split[idx]->index = idx; int64_t *p_split_rd = &part_search_state->split_rd[idx]; RD_STATS best_remain_rdcost; av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc, &best_remain_rdcost); int curr_quad_tree_idx = 0; if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx; part_search_state->intra_part_info->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1; } // Split partition evaluation of corresponding idx. // If the RD cost exceeds the best cost then do not // evaluate other split sub-partitions. SIMPLE_MOTION_DATA_TREE *const sms_tree_split = (sms_tree == NULL) ? NULL : sms_tree->split[idx]; if (!av1_rd_pick_partition( cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, &part_search_state->this_rdc, best_remain_rdcost, pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode, &part_search_state->split_part_rect_win[idx])) { av1_invalid_rd_stats(&sum_rdc); break; } if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx; } sum_rdc.rate += part_search_state->this_rdc.rate; sum_rdc.dist += part_search_state->this_rdc.dist; av1_rd_cost_update(x->rdmult, &sum_rdc); // Set split ctx as ready for use. if (idx <= 1 && (bsize <= BLOCK_8X8 || pc_tree->split[idx]->partitioning == PARTITION_NONE)) { const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; // Neither palette mode nor cfl predicted. 
if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { if (mbmi->uv_mode != UV_CFL_PRED) part_search_state->is_split_ctx_is_ready[idx] = 1; } } } #if CONFIG_COLLECT_PARTITION_STATS if (part_timing_stats->timer_is_on) { end_partition_block_timer(part_timing_stats, PARTITION_SPLIT, sum_rdc.rdcost); } #endif const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT); // Calculate the total cost and update the best partition. *part_split_rd = sum_rdc.rdcost; if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) { sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); if (sum_rdc.rdcost < best_rdc->rdcost) { *best_rdc = sum_rdc; part_search_state->found_best_partition = true; pc_tree->partitioning = PARTITION_SPLIT; } } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) { // Skip rectangular partition test when partition type none gives better // rd than partition type split. if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) { const int partition_none_valid = part_search_state->none_rd > 0; const int partition_none_better = part_search_state->none_rd < sum_rdc.rdcost; part_search_state->do_rectangular_split &= !(partition_none_valid && partition_none_better); } } // Restore the context for the following cases: // 1) Current block size not more than maximum partition size as dry run // encode happens for these cases // 2) Current block size same as superblock size as the final encode // happens for this case if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size) av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } // The max number of nodes in the partition tree. // The number of leaf nodes is (128x128) / (4x4) = 1024. // The number of All possible parent nodes is 1 + 2 + ... + 512 = 1023. #define NUM_NODES 2048 static void write_partition_tree(AV1_COMP *const cpi, const PC_TREE *const pc_tree, const BLOCK_SIZE bsize, const int mi_row, const int mi_col) { (void)mi_row; (void)mi_col; const char *path = cpi->oxcf.partition_info_path; char filename[256]; snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, cpi->sb_counter, 0); FILE *pfile = fopen(filename, "w"); fprintf(pfile, "%d", bsize); // Write partition type with BFS order. const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; int q_idx = 0; int last_idx = 1; int num_nodes = 1; // First traversal to get number of leaf nodes. tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { const PC_TREE *node = tree_node_queue[q_idx]; if (node->partitioning == PARTITION_SPLIT) { for (int i = 0; i < 4; ++i) { tree_node_queue[last_idx] = node->split[i]; ++last_idx; } num_nodes += 4; } --num_nodes; ++q_idx; } const int num_leafs = last_idx; fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); // Write partitions for each node. 
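// Second breadth-first traversal over the same tree, this time emitting the
// PARTITION_TYPE of every node in visit order.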
q_idx = 0; last_idx = 1; num_nodes = 1; tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { const PC_TREE *node = tree_node_queue[q_idx]; fprintf(pfile, ",%d", node->partitioning); if (node->partitioning == PARTITION_SPLIT) { for (int i = 0; i < 4; ++i) { tree_node_queue[last_idx] = node->split[i]; ++last_idx; } num_nodes += 4; } --num_nodes; ++q_idx; } fprintf(pfile, "\n"); fclose(pfile); } #if CONFIG_PARTITION_SEARCH_ORDER static void verify_write_partition_tree(const AV1_COMP *const cpi, const PC_TREE *const pc_tree, const BLOCK_SIZE bsize, const int config_id, const int mi_row, const int mi_col) { (void)mi_row; (void)mi_col; const char *path = cpi->oxcf.partition_info_path; char filename[256]; snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d", path, cpi->sb_counter, config_id); FILE *pfile = fopen(filename, "w"); fprintf(pfile, "%d", bsize); // Write partition type with BFS order. const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; int q_idx = 0; int last_idx = 1; int num_nodes = 1; // First traversal to get number of leaf nodes. tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { const PC_TREE *node = tree_node_queue[q_idx]; if (node != NULL && node->partitioning == PARTITION_SPLIT) { for (int i = 0; i < 4; ++i) { tree_node_queue[last_idx] = node->split[i]; ++last_idx; } num_nodes += 4; } --num_nodes; ++q_idx; } const int num_leafs = last_idx; fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); // Write partitions for each node. q_idx = 0; last_idx = 1; num_nodes = 1; tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { const PC_TREE *node = tree_node_queue[q_idx]; if (node != NULL) { // suppress warning fprintf(pfile, ",%d", node->partitioning); if (node->partitioning == PARTITION_SPLIT) { for (int i = 0; i < 4; ++i) { tree_node_queue[last_idx] = node->split[i]; ++last_idx; } num_nodes += 4; } } --num_nodes; ++q_idx; } fprintf(pfile, "\n"); fclose(pfile); } static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree, struct aom_internal_error_info *error_info, const int config_id) { const AV1_COMMON *const cm = &cpi->common; const char *path = cpi->oxcf.partition_info_path; char filename[256]; snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, cpi->sb_counter, config_id); FILE *pfile = fopen(filename, "r"); if (pfile == NULL) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.", filename); } int read_bsize; int num_nodes; int num_configs; fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs); assert(read_bsize == cpi->common.seq_params->sb_size); BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize; assert(bsize == pc_tree->block_size); PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; int last_idx = 1; int q_idx = 0; tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { int partitioning; fscanf(pfile, ",%d", &partitioning); assert(partitioning >= PARTITION_NONE && partitioning < EXT_PARTITION_TYPES); PC_TREE *node = tree_node_queue[q_idx]; if (node != NULL) { node->partitioning = partitioning; bsize = node->block_size; } if (partitioning == PARTITION_SPLIT) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int i = 0; i < 4; ++i) { if (node != NULL) { // Suppress warning node->split[i] = av1_alloc_pc_tree_node(subsize); if (!node->split[i]) aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); node->split[i]->index = i; tree_node_queue[last_idx] = node->split[i]; ++last_idx; } } } --num_nodes; ++q_idx; } 
fclose(pfile); return num_configs; } static RD_STATS rd_search_for_fixed_partition( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, const BLOCK_SIZE bsize, PC_TREE *pc_tree) { const PARTITION_TYPE partition = pc_tree->partitioning; const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; TileInfo *const tile_info = &tile_data->tile_info; RD_STATS best_rdc; av1_invalid_rd_stats(&best_rdc); int sum_subblock_rate = 0; int64_t sum_subblock_dist = 0; PartitionSearchState part_search_state; init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, bsize); // Override partition costs at the edges of the frame in the same // way as in read_partition (see decodeframe.c). PartitionBlkParams blk_params = part_search_state.part_blk_params; if (!av1_blk_has_rows_and_cols(&blk_params)) set_partition_cost_for_edge_blk(cm, &part_search_state); av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); (void)orig_rdmult; // Set the context. RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); assert(bsize < BLOCK_SIZES_ALL); unsigned int pb_source_variance = UINT_MAX; int64_t part_none_rd = INT64_MAX; int64_t none_rd = INT64_MAX; int inc_step[NUM_PART4_TYPES] = { 0 }; if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4; if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4; switch (partition) { case PARTITION_NONE: none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, &part_search_state, &best_rdc, &pb_source_variance, &none_rd, &part_none_rd); break; case PARTITION_HORZ: rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, &part_search_state, &best_rdc, NULL, HORZ, HORZ); break; case PARTITION_VERT: rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, &part_search_state, &best_rdc, NULL, VERT, VERT); break; case PARTITION_HORZ_A: ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, NULL, pb_source_variance, 1, HORZ_A, HORZ_A); break; case PARTITION_HORZ_B: ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, NULL, pb_source_variance, 1, HORZ_B, HORZ_B); break; case PARTITION_VERT_A: ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, NULL, pb_source_variance, 1, VERT_A, VERT_A); break; case PARTITION_VERT_B: ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, NULL, pb_source_variance, 1, VERT_B, VERT_B); break; case PARTITION_HORZ_4: rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, pc_tree->horizontal4, &part_search_state, &best_rdc, inc_step, PARTITION_HORZ_4); break; case PARTITION_VERT_4: rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, pc_tree->vertical4, &part_search_state, &best_rdc, inc_step, PARTITION_VERT_4); break; case PARTITION_SPLIT: for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) { const BLOCK_SIZE subsize = 
get_partition_subsize(bsize, PARTITION_SPLIT); assert(subsize < BLOCK_SIZES_ALL); const int next_mi_row = idx < 2 ? mi_row : mi_row + mi_size_high[subsize]; const int next_mi_col = idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize]; if (next_mi_row >= cm->mi_params.mi_rows || next_mi_col >= cm->mi_params.mi_cols) { continue; } const RD_STATS subblock_rdc = rd_search_for_fixed_partition( cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row, next_mi_col, subsize, pc_tree->split[idx]); sum_subblock_rate += subblock_rdc.rate; sum_subblock_dist += subblock_rdc.dist; } best_rdc.rate = sum_subblock_rate; best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT]; best_rdc.dist = sum_subblock_dist; best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist); break; default: assert(0 && "invalid partition type."); aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type."); } // Note: it is necessary to restore context information. av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (bsize != cm->seq_params->sb_size) { encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } x->rdmult = orig_rdmult; return best_rdc; } static void prepare_sb_features_before_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) { av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col, bsize, features); collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features); } static void update_partition_stats(const RD_STATS *const this_rdcost, aom_partition_stats_t *stats) { stats->rate = this_rdcost->rate; stats->dist = this_rdcost->dist; stats->rdcost = this_rdcost->rdcost; } static void build_pc_tree_from_part_decision( const aom_partition_decision_t *partition_decision, const BLOCK_SIZE this_bsize, PC_TREE *pc_tree, struct aom_internal_error_info *error_info) { BLOCK_SIZE bsize = this_bsize; int num_nodes = partition_decision->num_nodes; PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; int last_idx = 1; int q_idx = 0; tree_node_queue[q_idx] = pc_tree; while (num_nodes > 0) { const int partitioning = partition_decision->partition_decision[q_idx]; assert(partitioning >= PARTITION_NONE && partitioning < EXT_PARTITION_TYPES); PC_TREE *node = tree_node_queue[q_idx]; if (node != NULL) { node->partitioning = partitioning; bsize = node->block_size; } if (partitioning == PARTITION_SPLIT) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int i = 0; i < 4; ++i) { if (node != NULL) { // Suppress warning node->split[i] = av1_alloc_pc_tree_node(subsize); if (!node->split[i]) aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); node->split[i]->index = i; tree_node_queue[last_idx] = node->split[i]; ++last_idx; } } } --num_nodes; ++q_idx; } } // The ML model needs to provide the whole decision tree for the superblock. 
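// Superblock features are sent to the external partition model, which
// returns a complete partition tree; the tree is evaluated with a dry-run rd
// search, the resulting stats are fed back to the model, and this repeats
// until the model marks its decision as final, after which the selected tree
// is encoded for real.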
static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col, const BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; struct aom_internal_error_info *error_info = x->e_mbd.error_info; aom_partition_features_t features; prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize, &features); features.mi_row = mi_row; features.mi_col = mi_col; features.frame_width = cpi->frame_info.frame_width; features.frame_height = cpi->frame_info.frame_height; features.block_size = bsize; av1_ext_part_send_features(ext_part_controller, &features); // rd mode search (dry run) for a valid partition decision from the ml model. aom_partition_decision_t partition_decision; do { const bool valid_decision = av1_ext_part_get_partition_decision( ext_part_controller, &partition_decision); if (!valid_decision) return false; // First, let's take the easy approach. // We require that the ml model has to provide partition decisions for the // whole superblock. td->pc_root = av1_alloc_pc_tree_node(bsize); if (!td->pc_root) aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root, error_info); const RD_STATS this_rdcost = rd_search_for_fixed_partition( cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root); aom_partition_stats_t stats; update_partition_stats(&this_rdcost, &stats); av1_ext_part_send_partition_stats(ext_part_controller, &stats); if (!partition_decision.is_final_decision) { av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; } } while (!partition_decision.is_final_decision); // Encode with the selected mode and partition. set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, td->pc_root, NULL); av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; return true; } // Use a bitmask to represent the valid partition types for the current // block. "1" represents the corresponding partition type is vaild. 
// The least significant bit represents "PARTITION_NONE", the // largest significant bit represents "PARTITION_VERT_4", follow // the enum order for PARTITION_TYPE in "enums.h" static int get_valid_partition_types( const AV1_COMP *const cpi, const PartitionSearchState *const part_search_state, const BLOCK_SIZE bsize) { const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg; const PartitionBlkParams blk_params = part_search_state->part_blk_params; int valid_types = 0; // PARTITION_NONE valid_types |= (part_search_state->partition_none_allowed << 0); // PARTITION_HORZ valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1); // PARTITION_VERT valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2); // PARTITION_SPLIT valid_types |= (part_search_state->do_square_split << 3); // PARTITION_HORZ_A const int ext_partition_allowed = part_search_state->do_rectangular_split && av1_blk_has_rows_and_cols(&blk_params); const int horzab_partition_allowed = ext_partition_allowed && part_cfg->enable_ab_partitions && part_search_state->partition_rect_allowed[HORZ]; valid_types |= (horzab_partition_allowed << 4); // PARTITION_HORZ_B valid_types |= (horzab_partition_allowed << 5); // PARTITION_VERT_A const int vertab_partition_allowed = ext_partition_allowed && part_cfg->enable_ab_partitions && part_search_state->partition_rect_allowed[VERT]; valid_types |= (vertab_partition_allowed << 6); // PARTITION_VERT_B valid_types |= (vertab_partition_allowed << 7); // PARTITION_HORZ_4 const int partition4_allowed = part_cfg->enable_1to4_partitions && ext_partition_allowed && bsize != BLOCK_128X128; const int horz4_allowed = partition4_allowed && part_search_state->partition_rect_allowed[HORZ] && get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; valid_types |= (horz4_allowed << 8); // PARTITION_VERT_4 const int vert4_allowed = partition4_allowed && part_search_state->partition_rect_allowed[HORZ] && get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; valid_types |= (vert4_allowed << 9); return valid_types; } static void prepare_tpl_stats_block(const AV1_COMP *const cpi, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int64_t *intra_cost, int64_t *inter_cost, int64_t *mc_dep_cost) { const AV1_COMMON *const cm = &cpi->common; GF_GROUP *gf_group = &cpi->ppi->gf_group; if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE || gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { return; } TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; // If tpl stats is not established, early return if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) { return; } const int tpl_stride = tpl_frame->stride; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; const int mi_width = AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); const int mi_height = AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); int64_t sum_intra_cost = 0; int64_t sum_inter_cost = 0; int64_t sum_mc_dep_cost = 0; for (int row = 0; row < mi_height; row += step) { for (int col = 0; col < mi_width; col += step) { TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; sum_intra_cost += this_stats->intra_cost; 
sum_inter_cost += this_stats->inter_cost; const int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); sum_mc_dep_cost += mc_dep_delta; } } *intra_cost = sum_intra_cost; *inter_cost = sum_inter_cost; *mc_dep_cost = sum_mc_dep_cost; } static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_root, PC_TREE *pc_tree, int mi_row, int mi_col, const BLOCK_SIZE bsize, RD_STATS *this_rdcost) { const AV1_COMMON *const cm = &cpi->common; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) { return false; } aom_partition_decision_t partition_decision; do { PartitionSearchState part_search_state; // Initialization of state variables used in partition search. // TODO(chengchen): check if there is hidden conditions that don't allow // all possible partition types. init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, bsize); // Override partition costs at the edges of the frame in the same // way as in read_partition (see decodeframe.c). PartitionBlkParams blk_params = part_search_state.part_blk_params; if (!av1_blk_has_rows_and_cols(&blk_params)) set_partition_cost_for_edge_blk(cm, &part_search_state); const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); const int valid_partition_types = get_valid_partition_types(cpi, &part_search_state, bsize); const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id, cm->quant_params.base_qindex); // RD multiplier const int rdmult = x->rdmult; // pyramid level const int pyramid_level = cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; x->rdmult = orig_rdmult; // Neighbor information const int has_above = !!xd->above_mbmi; const int has_left = !!xd->left_mbmi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : BLOCK_INVALID; const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : BLOCK_INVALID; const int above_block_width = above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize]; const int above_block_height = above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize]; const int left_block_width = left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize]; const int left_block_height = left_bsize == BLOCK_INVALID ? 
-1 : block_size_high[left_bsize]; // Prepare simple motion search stats as features unsigned int block_sse = -1; unsigned int block_var = -1; unsigned int sub_block_sse[4] = { -1, -1, -1, -1 }; unsigned int sub_block_var[4] = { -1, -1, -1, -1 }; unsigned int horz_block_sse[2] = { -1, -1 }; unsigned int horz_block_var[2] = { -1, -1 }; unsigned int vert_block_sse[2] = { -1, -1 }; unsigned int vert_block_var[2] = { -1, -1 }; av1_prepare_motion_search_features_block( cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types, &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse, horz_block_var, vert_block_sse, vert_block_var); // Prepare tpl stats for the current block as features int64_t tpl_intra_cost = -1; int64_t tpl_inter_cost = -1; int64_t tpl_mc_dep_cost = -1; prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost, &tpl_inter_cost, &tpl_mc_dep_cost); aom_partition_features_t features; features.mi_row = mi_row; features.mi_col = mi_col; features.frame_width = cpi->frame_info.frame_width; features.frame_height = cpi->frame_info.frame_height; features.block_size = bsize; features.valid_partition_types = valid_partition_types; features.update_type = update_type; features.qindex = qindex; features.rdmult = rdmult; features.pyramid_level = pyramid_level; features.has_above_block = has_above; features.above_block_width = above_block_width; features.above_block_height = above_block_height; features.has_left_block = has_left; features.left_block_width = left_block_width; features.left_block_height = left_block_height; features.block_sse = block_sse; features.block_var = block_var; for (int i = 0; i < 4; ++i) { features.sub_block_sse[i] = sub_block_sse[i]; features.sub_block_var[i] = sub_block_var[i]; } for (int i = 0; i < 2; ++i) { features.horz_block_sse[i] = horz_block_sse[i]; features.horz_block_var[i] = horz_block_var[i]; features.vert_block_sse[i] = vert_block_sse[i]; features.vert_block_var[i] = vert_block_var[i]; } features.tpl_intra_cost = tpl_intra_cost; features.tpl_inter_cost = tpl_inter_cost; features.tpl_mc_dep_cost = tpl_mc_dep_cost; av1_ext_part_send_features(ext_part_controller, &features); const bool valid_decision = av1_ext_part_get_partition_decision( ext_part_controller, &partition_decision); if (!valid_decision) return false; pc_tree->partitioning = partition_decision.current_decision; av1_init_rd_stats(this_rdcost); if (partition_decision.current_decision == PARTITION_SPLIT) { assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8); const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); RD_STATS split_rdc[SUB_PARTITIONS_SPLIT]; for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { av1_init_rd_stats(&split_rdc[i]); if (pc_tree->split[i] == NULL) pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); pc_tree->split[i]->index = i; } const int orig_rdmult_tmp = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); // TODO(chengchen): check boundary conditions // top-left recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0], mi_row, mi_col, subsize, &split_rdc[0]); // top-right recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1], mi_row, mi_col + mi_size_wide[subsize], subsize, &split_rdc[1]); // bottom-left recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2], mi_row + mi_size_high[subsize], mi_col, subsize, &split_rdc[2]); // 
bottom_right recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3], mi_row + mi_size_high[subsize], mi_col + mi_size_wide[subsize], subsize, &split_rdc[3]); this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT]; // problem is here, the rdmult is different from the rdmult in sub block. for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { this_rdcost->rate += split_rdc[i].rate; this_rdcost->dist += split_rdc[i].dist; av1_rd_cost_update(x->rdmult, this_rdcost); } x->rdmult = orig_rdmult_tmp; } else { *this_rdcost = rd_search_for_fixed_partition( cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree); } aom_partition_stats_t stats; update_partition_stats(this_rdcost, &stats); av1_ext_part_send_partition_stats(ext_part_controller, &stats); if (!partition_decision.is_final_decision) { if (partition_decision.current_decision == PARTITION_SPLIT) { for (int i = 0; i < 4; ++i) { if (pc_tree->split[i] != NULL) { av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); pc_tree->split[i] = NULL; } } } } } while (!partition_decision.is_final_decision); return true; } // The ML model only needs to make decisions for the current block each time. static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col, const BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; aom_partition_features_t features; prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize, &features); features.mi_row = mi_row; features.mi_col = mi_col; features.frame_width = cpi->frame_info.frame_width; features.frame_height = cpi->frame_info.frame_height; features.block_size = bsize; av1_ext_part_send_features(ext_part_controller, &features); td->pc_root = av1_alloc_pc_tree_node(bsize); if (!td->pc_root) aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); RD_STATS rdcost; const bool valid_partition = recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row, mi_col, bsize, &rdcost); if (!valid_partition) { return false; } // Encode with the selected mode and partition. 
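// By this point recursive_partition() has iterated until the external model
// reported is_final_decision, so td->pc_root holds the finalized partition
// tree; encode the superblock once with OUTPUT_ENABLED and then free the tree.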
set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, td->pc_root, NULL); av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; return true; } bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col, const BLOCK_SIZE bsize, RD_STATS *best_rd_cost) { AV1_COMMON *const cm = &cpi->common; if (cpi->ext_part_controller.ready) { bool valid_search = true; const aom_ext_part_decision_mode_t decision_mode = av1_get_ext_part_decision_mode(&cpi->ext_part_controller); if (decision_mode == AOM_EXT_PART_WHOLE_TREE) { valid_search = ml_partition_search_whole_tree( cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); } else if (decision_mode == AOM_EXT_PART_RECURSIVE) { valid_search = ml_partition_search_partial( cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); } else { assert(0 && "Unknown decision mode."); return false; } if (!valid_search) { aom_internal_error( cm->error, AOM_CODEC_ERROR, "Invalid search from ML model, partition search failed"); } return true; } MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; int best_idx = 0; int64_t min_rdcost = INT64_MAX; int num_configs; int i = 0; do { td->pc_root = av1_alloc_pc_tree_node(bsize); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i); if (num_configs <= 0) { av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs."); } verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col); if (i == 0) { AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost, aom_calloc(num_configs, sizeof(*x->rdcost))); } // Encode the block with the given partition tree. Get rdcost and encoding // time. x->rdcost[i] = rd_search_for_fixed_partition( cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root); if (x->rdcost[i].rdcost < min_rdcost) { min_rdcost = x->rdcost[i].rdcost; best_idx = i; *best_rd_cost = x->rdcost[i]; } av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; ++i; } while (i < num_configs); aom_free(x->rdcost); x->rdcost = NULL; // Encode with the partition configuration with the smallest rdcost. 
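// best_idx was tracked in the loop above as the partition configuration with
// the smallest rdcost; re-read that tree, redo the fixed-partition rd search,
// and emit the final superblock encoding.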
td->pc_root = av1_alloc_pc_tree_node(bsize); if (!td->pc_root) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx); rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root); set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, td->pc_root, NULL); av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, cpi->sf.part_sf.partition_search_type); td->pc_root = NULL; ++cpi->sb_counter; return true; } #endif // CONFIG_PARTITION_SEARCH_ORDER static inline bool should_do_dry_run_encode_for_current_block( BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index, BLOCK_SIZE bsize) { if (bsize > max_partition_size) return false; // Enable the reconstruction with dry-run for the 4th sub-block only if its // parent block's reconstruction with dry-run is skipped. If // max_partition_size is the same as immediate split of superblock, then avoid // reconstruction of the 4th sub-block, as this data is not consumed. if (curr_block_index != 3) return true; const BLOCK_SIZE sub_sb_size = get_partition_subsize(sb_size, PARTITION_SPLIT); return bsize == max_partition_size && sub_sb_size != max_partition_size; } static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, double *var_min, double *var_max) { // This functions returns a the minimum and maximum log variances for 4x4 // sub blocks in the current block. const MACROBLOCKD *const xd = &x->e_mbd; const int is_hbd = is_cur_buf_hbd(xd); const int right_overflow = (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; const int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; // Initialize minimum variance to a large value and maximum variance to 0. double min_var_4x4 = (double)INT_MAX; double max_var_4x4 = 0.0; aom_variance_fn_t vf = cpi->ppi->fn_ptr[BLOCK_4X4].vf; for (int i = 0; i < bh; i += MI_SIZE) { for (int j = 0; j < bw; j += MI_SIZE) { int var; // Calculate the 4x4 sub-block variance. var = av1_calc_normalized_variance( vf, x->plane[0].src.buf + (i * x->plane[0].src.stride) + j, x->plane[0].src.stride, is_hbd); // Record min and max for over-arching block min_var_4x4 = AOMMIN(min_var_4x4, var); max_var_4x4 = AOMMAX(max_var_4x4, var); } } *var_min = log1p(min_var_4x4 / 16.0); *var_max = log1p(max_var_4x4 / 16.0); } static inline void set_sms_tree_partitioning(SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) { if (sms_tree == NULL) return; sms_tree->partitioning = partition; } /*!\brief AV1 block partition search (full search). * * \ingroup partition_search * \callgraph * Searches for the best partition pattern for a block based on the * rate-distortion cost, and returns a bool value to indicate whether a valid * partition pattern is found. The partition can recursively go down to the * smallest block size. 
* * \param[in] cpi Top-level encoder structure * \param[in] td Pointer to thread data * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] tp Pointer to the starting token * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE * \param[in] mi_col Column coordinate of the block in a step size of MI_SIZE * \param[in] bsize Current block size * \param[in] rd_cost Pointer to the final rd cost of the block * \param[in] best_rdc Upper bound of rd cost of a valid partition * \param[in] pc_tree Pointer to the PC_TREE node storing the picked partitions and mode info for the current block * \param[in] sms_tree Pointer to struct holding simple motion search data for the current block * \param[in] none_rd Pointer to the rd cost in the case of not splitting the current block * \param[in] multi_pass_mode SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS * \param[in] rect_part_win_info Pointer to struct storing whether horz/vert partition outperforms previously tested partitions * * \return A bool value is returned indicating if a valid partition is found. * The pc_tree struct is modified to store the picked partition and modes. * The rd_cost struct is also updated with the RD stats corresponding to the * best partition found. */ bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, RD_STATS best_rdc, PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd, SB_MULTI_PASS_MODE multi_pass_mode, RD_RECT_PART_WIN_INFO *rect_part_win_info) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; const TokenExtra *const tp_orig = *tp; PartitionSearchState part_search_state; // Initialization of state variables used in partition search. init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, bsize); PartitionBlkParams blk_params = part_search_state.part_blk_params; set_sms_tree_partitioning(sms_tree, PARTITION_NONE); if (best_rdc.rdcost < 0) { av1_invalid_rd_stats(rd_cost); return part_search_state.found_best_partition; } if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks. if (none_rd) *none_rd = 0; (void)*tp_orig; #if CONFIG_COLLECT_PARTITION_STATS // Stats at the current quad tree PartitionTimingStats *part_timing_stats = &part_search_state.part_timing_stats; // Stats aggregated at frame level FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats; #endif // CONFIG_COLLECT_PARTITION_STATS // Override partition costs at the edges of the frame in the same // way as in read_partition (see decodeframe.c). if (!av1_blk_has_rows_and_cols(&blk_params)) set_partition_cost_for_edge_blk(cm, &part_search_state); // Disable rectangular partitions for inner blocks when the current block is // forced to only use square partitions. if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) { part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows; part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols; } #ifndef NDEBUG // Nothing should rely on the default value of this array (which is just // leftover from encoding the previous block. 
Setting it to fixed pattern // when debugging. // bit 0, 1, 2 are blk_skip of each plane // bit 4, 5, 6 are initialization checking of each plane memset(x->txfm_search_info.blk_skip, 0x77, sizeof(x->txfm_search_info.blk_skip)); #endif // NDEBUG assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Set buffers and offsets. av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); if (cpi->oxcf.mode == ALLINTRA) { if (bsize == cm->seq_params->sb_size) { double var_min, var_max; log_sub_block_var(cpi, x, bsize, &var_min, &var_max); x->intra_sb_rdmult_modifier = 128; if ((var_min < 2.0) && (var_max > 4.0)) { if ((var_max - var_min) > 8.0) { x->intra_sb_rdmult_modifier -= 48; } else { x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6); } } } } // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); // Apply simple motion search for the entire super block with fixed block // size, e.g., 16x16, to collect features and write to files for the // external ML model. // TODO(chengchen): reduce motion search. This function is similar to // av1_get_max_min_partition_features(). if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) && bsize == cm->seq_params->sb_size) { av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col, bsize, /*features=*/NULL); collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL); } // Update rd cost of the bound using the current multiplier. av1_rd_cost_update(x->rdmult, &best_rdc); if (bsize == BLOCK_16X16 && cpi->vaq_refresh) x->mb_energy = av1_log_block_var(cpi, x, bsize); // Set the context. xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_prune_partitions_time); #endif // Pruning: before searching any partition type, using source and simple // motion search results to prune out unlikely partitions. av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state); // Pruning: eliminating partition types leading to coding block sizes outside // the min and max bsize limitations set from the encoder. av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_prune_partitions_time); #endif // Partition search BEGIN_PARTITION_SEARCH: // If a valid partition is required, usually when the first round cannot find // a valid one under the cost limit after pruning, reset the limitations on // partition types and intra cnn output. if (x->must_find_valid_partition) { reset_part_limitations(cpi, &part_search_state); av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state); // Invalidate intra cnn output for key frames. if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { part_search_state.intra_part_info->quad_tree_idx = 0; part_search_state.intra_part_info->cnn_output_valid = 0; } } // Partition block source pixel variance. 
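// UINT_MAX marks pb_source_variance as not yet computed; it may be filled in
// during the PARTITION_NONE search below and is otherwise computed after the
// rectangular partition stage, only if still needed.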
unsigned int pb_source_variance = UINT_MAX; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, none_partition_search_time); #endif if (cpi->oxcf.mode == ALLINTRA) { const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16); const bool prune_rect_part_using_4x4_var_deviation = (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation && !x->must_find_valid_partition); if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) { double var_min, var_max; log_sub_block_var(cpi, x, bsize, &var_min, &var_max); // Further pruning or in some cases reverse pruning when allintra is set. // This code helps visual and in some cases metrics quality where the // current block comprises at least one very low variance sub-block and at // least one where the variance is much higher. // // The idea is that in such cases there is danger of ringing and other // visual artifacts from a high variance feature such as an edge into a // very low variance region. // // The approach taken is to force break down / split to a smaller block // size to try and separate out the low variance and well predicted blocks // from the more complex ones and to prevent propagation of ringing over a // large region. if (bsize_at_least_16x16 && (var_min < 0.272) && ((var_max - var_min) > 3.0)) { part_search_state.partition_none_allowed = 0; part_search_state.terminate_partition_search = 0; part_search_state.do_square_split = 1; } else if (prune_rect_part_using_4x4_var_deviation && (var_max - var_min < 3.0)) { // Prune rectangular partitions if the variance deviation of 4x4 // sub-blocks within the block is less than a threshold (derived // empirically). part_search_state.do_rectangular_split = 0; } } } // PARTITION_NONE search stage. int64_t part_none_rd = INT64_MAX; none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, &part_search_state, &best_rdc, &pb_source_variance, none_rd, &part_none_rd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, none_partition_search_time); #endif #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, split_partition_search_time); #endif // PARTITION_SPLIT search stage. int64_t part_split_rd = INT64_MAX; split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx, &part_search_state, &best_rdc, multi_pass_mode, &part_split_rd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, split_partition_search_time); #endif // Terminate partition search for child partition, // when NONE and SPLIT partition rd_costs are INT64_MAX. if (cpi->sf.part_sf.early_term_after_none_split && part_none_rd == INT64_MAX && part_split_rd == INT64_MAX && !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) { part_search_state.terminate_partition_search = 1; } // Do not evaluate non-square partitions if NONE partition did not choose a // newmv mode and is skippable. if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) && (pc_tree->none != NULL)) { if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) && !have_newmv_in_inter_mode(pc_tree->none->mic.mode) && pc_tree->none->skippable && !x->must_find_valid_partition && bsize >= BLOCK_16X16) part_search_state.do_rectangular_split = 0; } // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT. prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc, part_none_rd, part_split_rd); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rectangular_partition_search_time); #endif // Rectangular partitions search stage. 
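// The partition stages in this function run in a fixed order: PARTITION_NONE,
// PARTITION_SPLIT, the rectangular (HORZ/VERT) stage below, then the AB and
// 4-way partitions, with pruning applied between stages.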
rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, &part_search_state, &best_rdc, rect_part_win_info, HORZ, VERT); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rectangular_partition_search_time); #endif if (pb_source_variance == UINT_MAX) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); pb_source_variance = av1_get_perpixel_variance_facade( cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); } assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part_search_state.do_rectangular_split)); const int prune_ext_part_state = prune_ext_part_none_skippable( pc_tree->none, x->must_find_valid_partition, cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize); const int ab_partition_allowed = allow_ab_partition_search( &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning, x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, ab_partitions_search_time); #endif // AB partitions search stage. ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, rect_part_win_info, pb_source_variance, ab_partition_allowed, HORZ_A, VERT_B); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, ab_partitions_search_time); #endif // 4-way partitions search stage. int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 }; // Prune 4-way partition search. prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc, pb_source_variance, prune_ext_part_state, part4_search_allowed); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_pick_4partition_time); #endif // PARTITION_HORZ_4 assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part4_search_allowed[HORZ4])); if (!part_search_state.terminate_partition_search && part4_search_allowed[HORZ4]) { const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4, 0 }; // Evaluation of Horz4 partition type. rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, pc_tree->horizontal4, &part_search_state, &best_rdc, inc_step, PARTITION_HORZ_4); } // PARTITION_VERT_4 assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part4_search_allowed[VERT4])); if (!part_search_state.terminate_partition_search && part4_search_allowed[VERT4] && blk_params.has_cols) { const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] / 4 }; // Evaluation of Vert4 partition type. rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, pc_tree->vertical4, &part_search_state, &best_rdc, inc_step, PARTITION_VERT_4); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rd_pick_4partition_time); #endif if (bsize == cm->seq_params->sb_size && !part_search_state.found_best_partition) { // Did not find a valid partition, go back and search again, with less // constraint on which partition types to search. x->must_find_valid_partition = 1; #if CONFIG_COLLECT_PARTITION_STATS fr_part_timing_stats->partition_redo += 1; #endif // CONFIG_COLLECT_PARTITION_STATS goto BEGIN_PARTITION_SEARCH; } // Store the final rd cost *rd_cost = best_rdc; // Also record the best partition in simple motion data tree because it is // necessary for the related speed features. set_sms_tree_partitioning(sms_tree, pc_tree->partitioning); #if CONFIG_COLLECT_PARTITION_STATS if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { part_timing_stats->partition_decisions[pc_tree->partitioning] += 1; } // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each // prediction block. 
print_partition_timing_stats_with_rdcost( part_timing_stats, mi_row, mi_col, bsize, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], cm->current_frame.frame_number, &best_rdc, "part_timing.csv"); const bool print_timing_stats = false; if (print_timing_stats) { print_partition_timing_stats(part_timing_stats, cm->show_frame, frame_is_intra_only(cm), bsize, "part_timing_data.csv"); } // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for // the whole clip. So we need to pass the information upstream to the encoder. accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats, bsize); #endif // CONFIG_COLLECT_PARTITION_STATS // Reset the PC_TREE deallocation flag. int pc_tree_dealloc = 0; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, encode_sb_time); #endif if (part_search_state.found_best_partition) { if (bsize == cm->seq_params->sb_size) { // Encode the superblock. const int emit_output = multi_pass_mode != SB_DRY_PASS; const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL; // Write partition tree to file. Not used by default. if (COLLECT_MOTION_SEARCH_FEATURE_SB) { write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col); ++cpi->sb_counter; } set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, pc_tree, NULL); assert(pc_tree == td->pc_root); // Dealloc the whole PC_TREE after a superblock is done. av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0, cpi->sf.part_sf.partition_search_type); pc_tree = NULL; td->pc_root = NULL; pc_tree_dealloc = 1; } else if (should_do_dry_run_encode_for_current_block( cm->seq_params->sb_size, x->sb_enc.max_partition_size, pc_tree->index, bsize)) { // Encode the smaller blocks in DRY_RUN mode. encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, encode_sb_time); #endif // If the tree still exists (non-superblock), dealloc most nodes, only keep // nodes for the best partition and PARTITION_NONE. if (pc_tree_dealloc == 0) av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1, cpi->sf.part_sf.partition_search_type); if (bsize == cm->seq_params->sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { assert(tp_orig == *tp); } // Restore the rd multiplier. x->rdmult = orig_rdmult; return part_search_state.found_best_partition; } #endif // !CONFIG_REALTIME_ONLY #undef COLLECT_MOTION_SEARCH_FEATURE_SB #if CONFIG_RT_ML_PARTITIONING #define FEATURES 6 #define LABELS 2 static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; const float *means = NULL; const float *vars = NULL; switch (bsize) { case BLOCK_64X64: nn_config = &av1_var_part_nnconfig_64; means = av1_var_part_means_64; vars = av1_var_part_vars_64; break; case BLOCK_32X32: nn_config = &av1_var_part_nnconfig_32; means = av1_var_part_means_32; vars = av1_var_part_vars_32; break; case BLOCK_16X16: nn_config = &av1_var_part_nnconfig_16; means = av1_var_part_means_16; vars = av1_var_part_vars_16; break; case BLOCK_8X8: default: assert(0 && "Unexpected block size."); return -1; } if (!nn_config) return -1; { const float thresh = cpi->oxcf.speed <= 5 ? 
        1.25f : 0.0f;
    float features[FEATURES] = { 0.0f };
    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
                                      cm->seq_params->bit_depth);
    int feature_idx = 0;
    float score[LABELS];
    features[feature_idx] =
        (log1pf((float)(dc_q * dc_q) / 256.0f) - means[feature_idx]) /
        sqrtf(vars[feature_idx]);
    feature_idx++;
    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
    {
      const int bs = block_size_wide[bsize];
      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
      const int sb_offset_row = 4 * (mi_row & 15);
      const int sb_offset_col = 4 * (mi_col & 15);
      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
      const uint8_t *src = x->plane[0].src.buf;
      const int src_stride = x->plane[0].src.stride;
      const int pred_stride = 64;
      unsigned int sse;
      int i;
      // Variance of whole block.
      const unsigned int var =
          cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
      features[feature_idx] =
          (log1pf((float)var) - means[feature_idx]) / sqrtf(vars[feature_idx]);
      feature_idx++;
      for (i = 0; i < 4; ++i) {
        const int x_idx = (i & 1) * bs / 2;
        const int y_idx = (i >> 1) * bs / 2;
        const int src_offset = y_idx * src_stride + x_idx;
        const int pred_offset = y_idx * pred_stride + x_idx;
        // Variance of quarter block.
        const unsigned int sub_var =
            cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
                                         pred + pred_offset, pred_stride,
                                         &sse);
        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
        features[feature_idx] =
            (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
        feature_idx++;
      }
    }  // for (int i = 0; i < 4; ++i)
    av1_nn_predict(features, nn_config, 1, score);
    if (score[0] > thresh) return PARTITION_SPLIT;
    if (score[0] < -thresh) return PARTITION_NONE;
    return -1;
  }
}
#undef FEATURES
#undef LABELS

// Uncomment for collecting data for ML-based partitioning
// #define _COLLECT_GROUND_TRUTH_
#ifdef _COLLECT_GROUND_TRUTH_
static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                int mi_row, int mi_col, PARTITION_TYPE part) {
  AV1_COMMON *const cm = &cpi->common;
  char fname[128];
  switch (bsize) {
    case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
    case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
    case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
    case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
    default: assert(0 && "Unexpected block size."); return -1;
  }
  float features[6];  // DC_Q, VAR, VAR_RATIO-0..3
  FILE *f = fopen(fname, "a");
  {
    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
                                      cm->seq_params->bit_depth);
    int feature_idx = 0;
    features[feature_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
    {
      const int bs = block_size_wide[bsize];
      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
      const int sb_offset_row = 4 * (mi_row & 15);
      const int sb_offset_col = 4 * (mi_col & 15);
      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
      const uint8_t *src = x->plane[0].src.buf;
      const int src_stride = x->plane[0].src.stride;
      const int pred_stride = 64;
      unsigned int sse;
      int i;
      // Variance of whole block.
      /*
      if (bs == 8) {
        int r, c;
        printf("%d %d\n", mi_row, mi_col);
        for (r = 0; r < bs; ++r) {
          for (c = 0; c < bs; ++c) {
            printf("%3d ", src[r * src_stride + c] - pred[64 * r + c]);
          }
          printf("\n");
        }
        printf("\n");
      }
      */
      const unsigned int var =
          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
      const float factor = (var == 0) ?
1.0f : (1.0f / (float)var); features[feature_idx++] = log1pf((float)var); fprintf(f, "%f,%f,", features[0], features[1]); for (i = 0; i < 4; ++i) { const int x_idx = (i & 1) * bs / 2; const int y_idx = (i >> 1) * bs / 2; const int src_offset = y_idx * src_stride + x_idx; const int pred_offset = y_idx * pred_stride + x_idx; // Variance of quarter block. const unsigned int sub_var = cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, pred + pred_offset, pred_stride, &sse); const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; features[feature_idx++] = var_ratio; fprintf(f, "%f,", var_ratio); } fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1); } fclose(f); return -1; } } #endif static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize) { const int block_width = AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); const int block_height = AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); const int mi_stride = xd->mi_stride; MB_MODE_INFO *const src_mi = xd->mi[0]; int i, j; for (j = 0; j < block_height; ++j) for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi; } static inline void copy_mbmi_ext_frame_to_mbmi_ext( MB_MODE_INFO_EXT *const mbmi_ext, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) { memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, sizeof(mbmi_ext->global_mvs)); } static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int hbs = mi_size_wide[bsize] >> 1; PARTITION_TYPE partition = pc_tree->partitioning; BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); assert(bsize >= BLOCK_8X8); if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) return; switch (partition) { case PARTITION_NONE: set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col); *(xd->mi[0]) = pc_tree->none->mic; copy_mbmi_ext_frame_to_mbmi_ext( &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME); duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); break; case PARTITION_SPLIT: { fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]); fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize, pc_tree->split[1]); fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize, pc_tree->split[2]); fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize, pc_tree->split[3]); break; } default: break; } } void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int do_recon, int64_t best_rd, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int hbs = mi_size_wide[bsize] >> 1; TokenExtra *tp_orig = *tp; const ModeCosts *mode_costs = &x->mode_costs; RD_STATS this_rdc, best_rdc; RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; int do_split = bsize > BLOCK_8X8; // Override skipping 
rectangular partition operations for edge blocks const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows); const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols); int partition_none_allowed = !force_horz_split && !force_vert_split; assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far (void)*tp_orig; av1_invalid_rd_stats(&best_rdc); best_rdc.rdcost = best_rd; #ifndef _COLLECT_GROUND_TRUTH_ if (partition_none_allowed && do_split) { const int ml_predicted_partition = ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); if (ml_predicted_partition == PARTITION_NONE) do_split = 0; if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; } #endif xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); // PARTITION_NONE if (partition_none_allowed) { pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); if (!pc_tree->none) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); PICK_MODE_CONTEXT *ctx = pc_tree->none; // Flip for RDO based pick mode #if 0 RD_STATS dummy; av1_invalid_rd_stats(&dummy); pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE, bsize, ctx, dummy); #else pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx); #endif if (this_rdc.rate != INT_MAX) { const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; } } } // PARTITION_SPLIT if (do_split) { RD_STATS sum_rdc; const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); av1_init_rd_stats(&sum_rdc); for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); if (!pc_tree->split[i]) aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate PC_TREE"); pc_tree->split[i]->index = i; } int pl = partition_plane_context(xd, mi_row, mi_col, bsize); sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); for (int i = 0; i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) { const int x_idx = (i & 1) * hbs; const int y_idx = (i >> 1) * hbs; if (mi_row + y_idx >= cm->mi_params.mi_rows || mi_col + x_idx >= cm->mi_params.mi_cols) continue; av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rdc, i < 3, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); if (this_rdc.rate == INT_MAX) { av1_invalid_rd_stats(&sum_rdc); } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; } } if (sum_rdc.rdcost < best_rdc.rdcost) { best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_SPLIT; } } #ifdef _COLLECT_GROUND_TRUTH_ store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning); #endif *rd_cost = best_rdc; av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); if (best_rdc.rate == INT_MAX) { av1_invalid_rd_stats(rd_cost); return; } // update mode info array fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree); if 
(do_recon) { if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, // bsize, pc_tree, &rate_coeffs); set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, pc_tree, NULL); } else { encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, pc_tree, NULL); } } if (bsize == BLOCK_64X64 && do_recon) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { assert(tp_orig == *tp); } } #endif // CONFIG_RT_ML_PARTITIONING aom-3.12.1/av1/encoder/partition_search.h000066400000000000000000000077221477627663500202260ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_ #define AOM_AV1_ENCODER_PARTITION_SEARCH_H_ #include "config/aom_config.h" #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/tokenize.h" void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize); void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize); void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, PC_TREE *pc_tree); void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree); #if CONFIG_RT_ML_PARTITIONING void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, int do_recon, int64_t best_rd, PC_TREE *pc_tree); #endif #if CONFIG_PARTITION_SEARCH_ORDER void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf); void av1_reset_sf_for_ext_part(AV1_COMP *const cpi); bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *best_rd_cost); #endif bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, RD_STATS best_rdc, PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd, SB_MULTI_PASS_MODE multi_pass_mode, RD_RECT_PART_WIN_INFO *rect_part_win_info); static inline void set_cb_offsets(uint16_t *cb_offset, const uint16_t cb_offset_y, const uint16_t cb_offset_uv) { cb_offset[PLANE_TYPE_Y] = cb_offset_y; cb_offset[PLANE_TYPE_UV] = cb_offset_uv; } static inline void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, const int subsampling_x, const int subsampling_y) { x->cb_offset[PLANE_TYPE_Y] += 
block_size_wide[bsize] * block_size_high[bsize]; if (x->e_mbd.is_chroma_ref) { const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, subsampling_x, subsampling_y); assert(plane_bsize != BLOCK_INVALID); x->cb_offset[PLANE_TYPE_UV] += block_size_wide[plane_bsize] * block_size_high[plane_bsize]; } } #endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_ aom-3.12.1/av1/encoder/partition_strategy.c000066400000000000000000003124401477627663500206120ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/aom_config.h" #include "av1/encoder/encodeframe_utils.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif #include "config/aom_dsp_rtcd.h" #include "av1/common/enums.h" #include "av1/common/reconinter.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/cnn.h" #include "av1/encoder/partition_model_weights.h" #include "av1/encoder/partition_cnn_weights.h" #endif #include "av1/encoder/encoder.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" #include "av1/encoder/partition_search.h" #include "av1/encoder/rdopt.h" #if !CONFIG_REALTIME_ONLY static inline void simple_motion_search_prune_part_features( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); static bool ext_ml_model_decision_before_none( AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], int *partition_none_allowed, int *partition_horz_allowed, int *partition_vert_allowed, int *do_rectangular_split, int *do_square_split); static bool ext_ml_model_decision_before_none_part2( AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], int *prune_horz, int *prune_vert); static bool ext_ml_model_decision_after_none( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_after_none, int *do_square_split, int *do_rectangular_split); static bool ext_ml_model_decision_after_none_part2( AV1_COMP *const cpi, const float *const features_terminate, int *terminate_partition_search); static bool ext_ml_model_decision_after_split( AV1_COMP *const cpi, const float *const features_terminate, int *terminate_partition_search); static bool ext_ml_model_decision_after_split_part2( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_prune, int *prune_rect_part_horz, int *prune_rect_part_vert); static bool ext_ml_model_decision_after_rect( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_after_rect, int *horza_partition_allowed, int *horzb_partition_allowed, int *verta_partition_allowed, int *vertb_partition_allowed); static bool ext_ml_model_decision_after_part_ab( AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, 
unsigned int pb_source_variance, int mi_row, int mi_col); static inline int convert_bsize_to_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_128X128: return 0; case BLOCK_64X64: return 1; case BLOCK_32X32: return 2; case BLOCK_16X16: return 3; case BLOCK_8X8: return 4; default: assert(0 && "Invalid bsize"); return -1; } } static char *get_feature_file_name(int id) { static char *feature_file_names[] = { "feature_before_partition_none", "feature_before_partition_none_prune_rect", "feature_after_partition_none_prune", "feature_after_partition_none_terminate", "feature_after_partition_split_terminate", "feature_after_partition_split_prune_rect", "feature_after_partition_rect", "feature_after_partition_ab", }; return feature_file_names[id]; } static void write_features_to_file(const char *const path, const bool is_test_mode, const float *features, const int feature_size, const int id, const BLOCK_SIZE bsize, const int mi_row, const int mi_col) { if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return; char filename[256]; snprintf(filename, sizeof(filename), "%s/%s", path, get_feature_file_name(id)); FILE *pfile = fopen(filename, "a"); if (pfile == NULL) return; if (!is_test_mode) { fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col, feature_size); } for (int i = 0; i < feature_size; ++i) { fprintf(pfile, "%.6f", features[i]); if (i < feature_size - 1) fprintf(pfile, ","); } fprintf(pfile, "\n"); fclose(pfile); } // TODO(chiyotsai@google.com): This is very much a work in progress. We still // need to the following: // -- add support for hdres // -- add support for pruning rectangular partitions // -- use reconstructed pixels instead of source pixels for padding // -- use chroma pixels in addition to luma pixels static void intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, int quad_tree_idx, int intra_cnn_based_part_prune_level, PartitionSearchState *part_state) { assert(cm->seq_params->sb_size >= BLOCK_64X64 && "Invalid sb_size for intra_cnn!"); const PartitionBlkParams *blk_params = &part_state->part_blk_params; const BLOCK_SIZE bsize = blk_params->bsize; const int bsize_idx = convert_bsize_to_idx(bsize); if (bsize == BLOCK_128X128) { return; } PartitionSearchInfo *part_info = &x->part_search_info; // Precompute the CNN part and cache the result in MACROBLOCK if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) { const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config; // Prepare the output const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL }; const int num_outputs = 4; const int output_dims[4] = { 1, 2, 4, 8 }; const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH, CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH }; float *output_buffer[CNN_TOT_OUT_CH]; float **cur_output_buf = output_buffer; float *curr_buf_ptr = part_info->cnn_buffer; for (int output_idx = 0; output_idx < num_outputs; output_idx++) { const int num_chs = out_chs[output_idx]; const int ch_size = output_dims[output_idx] * output_dims[output_idx]; for (int ch = 0; ch < num_chs; ch++) { cur_output_buf[ch] = curr_buf_ptr; curr_buf_ptr += ch_size; } cur_output_buf += num_chs; } CNN_MULTI_OUT output = { .num_outputs = 4, .output_channels = out_chs, .output_strides = output_dims, .output_buffer = output_buffer, }; // Prepare the input const MACROBLOCKD *xd = &x->e_mbd; const int bit_depth = xd->bd; const int dc_q = av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8); part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f); part_info->log_q = 
(part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) / av1_intra_mode_cnn_partition_std[0]; const int width = 65, height = 65, stride = x->plane[AOM_PLANE_Y].src.stride; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *image[1] = { CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1 }; if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride, cnn_config, &thread_data, bit_depth, &output)) { aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Error allocating CNN data"); return; } } else { uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 }; if (!av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config, &thread_data, &output)) { aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Error allocating CNN data"); return; } } part_info->cnn_output_valid = 1; } if (!part_info->cnn_output_valid) { return; } const NN_CONFIG *dnn_configs[5] = { NULL, &av1_intra_mode_cnn_partition_branch_0_dnn_config, &av1_intra_mode_cnn_partition_branch_1_dnn_config, &av1_intra_mode_cnn_partition_branch_2_dnn_config, &av1_intra_mode_cnn_partition_branch_3_dnn_config, }; const NN_CONFIG *dnn_config = dnn_configs[bsize_idx]; float dnn_features[100]; float logits[4] = { 0.0f }; const float *branch_0 = part_info->cnn_buffer; const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE; const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE; const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE; if (bsize == BLOCK_64X64) { int f_idx = 0; for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_0[ch_idx]; } const int spa_stride = 2 * 2; for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) { for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride]; } } dnn_features[f_idx++] = part_info->log_q; } else if (bsize == BLOCK_32X32) { int f_idx = 0; for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) { dnn_features[f_idx++] = branch_0[idx]; } const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1]; const int spa_stride = 2 * 2; for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride]; } dnn_features[f_idx++] = part_info->log_q; } else if (bsize == BLOCK_16X16) { int f_idx = 0; const int prev_quad_idx = (quad_tree_idx - 1) / 4; const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1]; const int prev_spa_stride = 2 * 2; for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride]; } const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5]; const int spa_stride = 4 * 4; for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride]; } dnn_features[f_idx++] = part_info->log_q; } else if (bsize == BLOCK_8X8) { int f_idx = 0; const int prev_quad_idx = (quad_tree_idx - 1) / 4; const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5]; const int prev_spa_stride = 4 * 4; for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride]; } const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21]; const int spa_stride = 8 * 8; for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) { dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride]; } dnn_features[f_idx++] = part_info->log_q; } else { assert(0 && "Invalid bsize in 
intra_cnn partition"); } // Make decision av1_nn_predict(dnn_features, dnn_config, 1, logits); const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; float split_only_thresh = 100.0f, no_split_thresh = -100.0f; if (is_720p_or_larger) { split_only_thresh = av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx]; no_split_thresh = av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx]; } else if (is_480p_or_larger) { split_only_thresh = av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx]; no_split_thresh = av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx]; } else { split_only_thresh = av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx]; no_split_thresh = av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx]; } if (logits[0] > split_only_thresh) { // As screen contents tend to choose larger partitions, do not prune // PARTITION_NONE when intra_cnn_based_part_prune_level=1. if (intra_cnn_based_part_prune_level != 1) { part_state->partition_none_allowed = 0; } part_state->do_square_split = 1; av1_disable_rect_partitions(part_state); } if (logits[0] < no_split_thresh) { av1_disable_square_split_partition(part_state); } } static inline int get_simple_motion_search_prune_agg(int qindex, int prune_level, int is_rect_part) { assert(prune_level < TOTAL_AGG_LVLS); if (prune_level == NO_PRUNING) { return -1; } // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except // QIDX_BASED_AGG_LVL const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 }; if (prune_level < TOTAL_SIMPLE_AGG_LVLS) { return sms_prune_agg_levels[prune_level]; } // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value. // Aggressive pruning for lower quantizers in non-boosted frames to prune // rectangular partitions. const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0; const int sms_prune_agg_qindex_based[2] = { 1, 2 }; return sms_prune_agg_qindex_based[qband]; } // Performs a simple_motion_search with a single reference frame and extract // the variance of residues. 
Then use the features to determine whether we want // to go straight to splitting without trying PARTITION_NONE static void simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, PartitionSearchState *part_state) { const AV1_COMMON *const cm = &cpi->common; const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; const int bsize_idx = convert_bsize_to_idx(bsize); const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+ const int res_idx = is_480p_or_larger + is_720p_or_larger; assert(bsize_idx >= 0 && bsize_idx <= 4 && "Invalid bsize in simple_motion_search_based_split"); const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx]; const float *ml_std = av1_simple_motion_search_split_std[bsize_idx]; const NN_CONFIG *nn_config = av1_simple_motion_search_split_nn_config[bsize_idx]; const int agg = get_simple_motion_search_prune_agg( x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0); if (agg < 0) { return; } const float split_only_thresh = av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx]; const float no_split_thresh = av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx]; float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f }; simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_SPLIT_MODEL_FLAG); // Write features to file write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col); // Note: it is intended to not normalize the features here, to keep it // consistent for all features collected and passed to the external model. if (ext_ml_model_decision_before_none( cpi, features, &part_state->partition_none_allowed, &part_state->partition_rect_allowed[HORZ], &part_state->partition_rect_allowed[VERT], &part_state->do_rectangular_split, &part_state->do_square_split)) { return; } for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) { features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx]; } float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); if (score > split_only_thresh) { av1_set_square_split_only(part_state); } if (cpi->sf.part_sf.simple_motion_search_split >= 2 && score < no_split_thresh) { av1_disable_square_split_partition(part_state); } // If the score is very low, prune rectangular split since it is unlikely to // occur. if (cpi->sf.part_sf.simple_motion_search_rect_split) { const float scale = res_idx >= 2 ? 3.0f : 2.0f; const float rect_split_thresh = scale * av1_simple_motion_search_no_split_thresh [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx] [bsize_idx]; if (score < rect_split_thresh) { part_state->do_rectangular_split = 0; } } } // Given a list of ref frames in refs, performs simple_motion_search on each of // the refs and returns the ref with the smallest sse. Returns -1 if none of the // ref in the list is available. Also stores the best sse and var in best_sse, // best_var, respectively. If save_mv is 0, don't update mv_ref_fulls in // sms_tree. If save_mv is 1, update mv_ref_fulls under sms_tree and the // subtrees. 
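// Illustrative usage sketch for the helper defined below (hypothetical local
// variables, mirroring the call pattern used later in this file): a caller
// builds a one-entry reference list and reads back the winning reference
// together with its SSE and variance, e.g.
//   const int ref_list[] = { LAST_FRAME };
//   unsigned int best_sse = 0, best_var = 0;
//   const int best_ref = simple_motion_search_get_best_ref(
//       cpi, x, sms_tree, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1,
//       /*use_subpixel=*/1, /*save_mv=*/1, &best_sse, &best_var);
//   // best_ref == -1 means none of the listed references was available.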
static int simple_motion_search_get_best_ref( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse, unsigned int *best_var) { const AV1_COMMON *const cm = &cpi->common; int best_ref = -1; if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) { // If the whole block is outside of the image, set the var and sse to 0. *best_var = 0; *best_sse = 0; return best_ref; } // Otherwise loop through the reference frames and find the one with the // minimum SSE const int num_planes = 1; *best_sse = INT_MAX; for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) { const int ref = refs[ref_idx]; if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) { const FULLPEL_MV *start_mvs = sms_tree->start_mvs; unsigned int curr_sse = 0, curr_var = 0; const int_mv best_mv = av1_simple_motion_search_sse_var( cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes, use_subpixel, &curr_sse, &curr_var); if (curr_sse < *best_sse) { *best_sse = curr_sse; *best_var = curr_var; best_ref = ref; } if (save_mv) { sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8; sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8; if (bsize >= BLOCK_8X8) { for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) { // Propagate the new motion vectors to a lower level SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx]; sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref]; } } } } } return best_ref; } // Collects features using simple_motion_search and stores them in features. The // features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features // collected are the sse and var from the subblocks flagged by features_to_get. // Furthermore, if features is not NULL, then 7 more features are appended to // the end of features: // - log(1.0 + dc_q ** 2) // - whether an above macroblock exists // - width of above macroblock // - height of above macroblock // - whether a left macroblock exists // - width of left macroblock // - height of left macroblock static inline void simple_motion_search_prune_part_features( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) { const int w_mi = mi_size_wide[bsize]; const int h_mi = mi_size_high[bsize]; assert(mi_size_wide[bsize] == mi_size_high[bsize]); assert(bsize >= BLOCK_8X8); assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] || cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); // Setting up motion search const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ?
ALTREF_FRAME : LAST_FRAME }; const int num_refs = 1; const int use_subpixel = 1; // Doing whole block first to update the mv if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) { simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel, 1, &sms_tree->sms_none_feat[0], &sms_tree->sms_none_feat[1]); sms_tree->sms_none_valid = 1; } // Split subblocks if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) { const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx]; if (!sub_tree->sms_none_valid) { simple_motion_search_get_best_ref( cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0], &sub_tree->sms_none_feat[1]); sub_tree->sms_none_valid = 1; } } } // Rectangular subblocks if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) { // Horz subblock BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) { const int sub_mi_col = mi_col + 0; const int sub_mi_row = mi_row + r_idx * h_mi / 2; simple_motion_search_get_best_ref( cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx], &sms_tree->sms_rect_feat[2 * r_idx + 1]); } // Vert subblock subsize = get_partition_subsize(bsize, PARTITION_VERT); for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) { const int sub_mi_col = mi_col + r_idx * w_mi / 2; const int sub_mi_row = mi_row + 0; simple_motion_search_get_best_ref( cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx], &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]); } sms_tree->sms_rect_valid = 1; } if (!features) return; int f_idx = 0; if (features_to_get & FEATURE_SMS_NONE_FLAG) { for (int sub_idx = 0; sub_idx < 2; sub_idx++) { features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[sub_idx]); } } if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) { SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx]; features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[0]); features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[1]); } } if (features_to_get & FEATURE_SMS_RECT_FLAG) { for (int sub_idx = 0; sub_idx < 8; sub_idx++) { features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[sub_idx]); } } const MACROBLOCKD *xd = &x->e_mbd; set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); // Q_INDEX const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); features[f_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f); // Neighbor stuff const int has_above = !!xd->above_mbmi; const int has_left = !!xd->left_mbmi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize; const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mbmi->bsize : bsize; features[f_idx++] = (float)has_above; features[f_idx++] = (float)mi_size_wide_log2[above_bsize]; features[f_idx++] = (float)mi_size_high_log2[above_bsize]; features[f_idx++] = (float)has_left; features[f_idx++] = (float)mi_size_wide_log2[left_bsize]; features[f_idx++] = (float)mi_size_high_log2[left_bsize]; } // Performs a simple_motion_search with two reference frames and extract // the variance of residues. Then use the features to determine whether we want // to prune some partitions. static void simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, PartitionSearchState *part_state) { const AV1_COMMON *const cm = &cpi->common; const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; const int bsize_idx = convert_bsize_to_idx(bsize); const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+ const int res_idx = is_480p_or_larger + is_720p_or_larger; // Get model parameters const NN_CONFIG *nn_config = av1_simple_motion_search_prune_rect_nn_config[bsize_idx]; const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx], *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx]; const int agg = get_simple_motion_search_prune_agg( x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1); if (agg < 0) { return; } const float prune_thresh = av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx]; // If there is no valid threshold, return immediately. if (!nn_config || prune_thresh == 0.0f) { return; } // Get features float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f }; simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_PRUNE_PART_FLAG); // Note: it is intended to not normalize the features here, to keep it // consistent for all features collected and passed to the external model. if (cpi->sf.part_sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) && (part_state->partition_rect_allowed[HORZ] || part_state->partition_rect_allowed[VERT]) && bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { // Write features to file write_features_to_file( cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col); if (ext_ml_model_decision_before_none_part2( cpi, features, &part_state->prune_rect_part[HORZ], &part_state->prune_rect_part[VERT])) { return; } } for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; } // Get probabilities float scores[EXT_PARTITION_TYPES] = { 0.0f }, probs[EXT_PARTITION_TYPES] = { 0.0f }; const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8) ? PARTITION_TYPES : EXT_PARTITION_TYPES; av1_nn_predict(features, nn_config, 1, scores); av1_nn_softmax(scores, probs, num_classes); // Determine if we should prune rectangular partitions. if (probs[PARTITION_HORZ] <= prune_thresh) { part_state->prune_rect_part[HORZ] = 1; } if (probs[PARTITION_VERT] <= prune_thresh) { part_state->prune_rect_part[VERT] = 1; } } // Early terminates PARTITION_NONE using simple_motion_search features and the // rate, distortion, and rdcost of PARTITION_NONE.
This is only called when: // - The frame is a show frame // - The frame is not intra only // - The current bsize is > BLOCK_8X8 // - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols void av1_simple_motion_search_early_term_none( AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, const RD_STATS *none_rdc, PartitionSearchState *part_state) { const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f }; simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_PRUNE_PART_FLAG); int f_idx = FEATURE_SIZE_SMS_PRUNE_PART; features[f_idx++] = log1pf((float)none_rdc->rate); features[f_idx++] = log1pf((float)none_rdc->dist); features[f_idx++] = log1pf((float)none_rdc->rdcost); assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); const float *ml_mean = NULL; const float *ml_std = NULL; const float *ml_model = NULL; if (bsize == BLOCK_128X128) { ml_mean = av1_simple_motion_search_term_none_mean_128; ml_std = av1_simple_motion_search_term_none_std_128; ml_model = av1_simple_motion_search_term_none_model_128; } else if (bsize == BLOCK_64X64) { ml_mean = av1_simple_motion_search_term_none_mean_64; ml_std = av1_simple_motion_search_term_none_std_64; ml_model = av1_simple_motion_search_term_none_model_64; } else if (bsize == BLOCK_32X32) { ml_mean = av1_simple_motion_search_term_none_mean_32; ml_std = av1_simple_motion_search_term_none_std_32; ml_model = av1_simple_motion_search_term_none_model_32; } else if (bsize == BLOCK_16X16) { ml_mean = av1_simple_motion_search_term_none_mean_16; ml_std = av1_simple_motion_search_term_none_std_16; ml_model = av1_simple_motion_search_term_none_model_16; } else { assert(0 && "Unexpected block size in simple_motion_term_none"); } // Write features to file write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col); if (ext_ml_model_decision_after_none_part2( cpi, features, &part_state->terminate_partition_search)) { return; } if (ml_model) { float score = 0.0f; for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { score += ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; } score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; if (score >= 0.0f) { part_state->terminate_partition_search = 1; } } } void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col, float *features) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size. 
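// As a concrete reading of the aggregation below (derived from this function,
// not a normative spec): with a 128X128 superblock and 16X16 sub-blocks the
// loops visit mb_rows * mb_cols = 8 * 8 = 64 blocks, and each variance feature
// is computed from the running sums as E[x * x] - E[x] * E[x] over those 64
// samples.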
assert(sb_size == BLOCK_128X128); int f_idx = 0; const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f); // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb float sum_mv_row_sq = 0; float sum_mv_row = 0; float min_abs_mv_row = FLT_MAX; float max_abs_mv_row = 0; float sum_mv_col_sq = 0; float sum_mv_col = 0; float min_abs_mv_col = FLT_MAX; float max_abs_mv_col = 0; float sum_log_sse_sq = 0; float sum_log_sse = 0; float min_log_sse = FLT_MAX; float max_log_sse = 0; const BLOCK_SIZE mb_size = BLOCK_16X16; const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; for (int mb_row = 0; mb_row < mb_rows; mb_row++) for (int mb_col = 0; mb_col < mb_cols; mb_col++) { const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); unsigned int sse = 0; unsigned int var = 0; const FULLPEL_MV start_mv = kZeroFullMv; const MV_REFERENCE_FRAME ref = cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; const int_mv best_mv = av1_simple_motion_search_sse_var( cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse, &var); const float mv_row = (float)(best_mv.as_mv.row / 8); const float mv_col = (float)(best_mv.as_mv.col / 8); const float log_sse = log1pf((float)sse); const float abs_mv_row = fabsf(mv_row); const float abs_mv_col = fabsf(mv_col); sum_mv_row_sq += mv_row * mv_row; sum_mv_row += mv_row; sum_mv_col_sq += mv_col * mv_col; sum_mv_col += mv_col; if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; sum_log_sse_sq += log_sse * log_sse; sum_log_sse += log_sse; if (log_sse < min_log_sse) min_log_sse = log_sse; if (log_sse > max_log_sse) max_log_sse = log_sse; } const int blks = mb_rows * mb_cols; const float avg_mv_row = sum_mv_row / (float)blks; const float var_mv_row = sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row; const float avg_mv_col = sum_mv_col / (float)blks; const float var_mv_col = sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col; const float avg_log_sse = sum_log_sse / (float)blks; const float var_log_sse = sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse; features[f_idx++] = avg_log_sse; features[f_idx++] = avg_mv_col; features[f_idx++] = avg_mv_row; features[f_idx++] = log_q_sq; features[f_idx++] = max_abs_mv_col; features[f_idx++] = max_abs_mv_row; features[f_idx++] = max_log_sse; features[f_idx++] = min_abs_mv_col; features[f_idx++] = min_abs_mv_row; features[f_idx++] = min_log_sse; features[f_idx++] = var_log_sse; features[f_idx++] = var_mv_col; features[f_idx++] = var_mv_row; assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); } // Convert result index to block size. 
// result idx block size // 0 BLOCK_16X16 // 1 BLOCK_32X32 // 2 BLOCK_64X64 // 3 BLOCK_128X128 static BLOCK_SIZE get_block_size(int idx) { return (BLOCK_SIZE)((idx + 2) * 3); } BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, const MACROBLOCK *const x, const float *features) { float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE); av1_nn_predict(features, nn_config, 1, scores); int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) { result = 0; float max_score = scores[0]; for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { if (scores[i] > max_score) { max_score = scores[i]; result = i; } } return get_block_size(result); } float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == RELAXED_PRED) { for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; --result) { if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { probs[result] += probs[result + 1]; } if (probs[result] > 0.2) break; } } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) { const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; // TODO(debargha): x->source_variance is unavailable at this point, // so compute. The redundant recomputation later can be removed. const unsigned int source_variance = av1_get_perpixel_variance_facade( cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y); if (source_variance > 16) { const double thresh = source_variance < 128 ? 0.05 : 0.1; for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; --result) { if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { probs[result] += probs[result + 1]; } if (probs[result] > thresh) break; } } } return get_block_size(result); } // Get the minimum partition block width and height(in log scale) under a // SIMPLE_MOTION_DATA_TREE. static inline void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, int *min_bw, int *min_bh) { if (!sms_tree) return; const BLOCK_SIZE bsize = sms_tree->block_size; if (bsize == BLOCK_4X4) { *min_bw = 0; *min_bh = 0; return; } PARTITION_TYPE part_type = sms_tree->partitioning; if (part_type == PARTITION_INVALID) return; if (part_type == PARTITION_SPLIT) { for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { get_min_bsize(sms_tree->split[i], min_bw, min_bh); } } else { if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B || part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B) part_type = PARTITION_SPLIT; const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type); if (subsize != BLOCK_INVALID) { *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]); *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]); } } } static inline void add_rd_feature(int64_t rd, int64_t best_rd, float *features, int *feature_idx) { const int rd_valid = rd > 0 && rd < INT64_MAX; const float rd_ratio = rd_valid ? 
(float)rd / best_rd : 1.0f; features[(*feature_idx)++] = (float)rd_valid; features[(*feature_idx)++] = rd_ratio; } #define FEATURES 31 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, SIMPLE_MOTION_DATA_TREE *const sms_tree, int64_t best_rd, int64_t part_none_rd, int64_t part_split_rd, int64_t *split_block_rd, PartitionSearchState *part_state) { const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; if (best_rd <= 0 || best_rd == INT64_MAX || part_state->terminate_partition_search) return; const AV1_COMMON *const cm = &cpi->common; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const NN_CONFIG *nn_config = NULL; float thresh = -1e6; switch (bsize) { case BLOCK_128X128: break; case BLOCK_64X64: nn_config = &av1_early_term_after_split_nnconfig_64; thresh = is_480p_or_larger ? -2.0f : -1.2f; break; case BLOCK_32X32: nn_config = &av1_early_term_after_split_nnconfig_32; thresh = is_480p_or_larger ? -2.6f : -2.3f; break; case BLOCK_16X16: nn_config = &av1_early_term_after_split_nnconfig_16; thresh = is_480p_or_larger ? -2.0f : -2.4f; break; case BLOCK_8X8: nn_config = &av1_early_term_after_split_nnconfig_8; thresh = is_480p_or_larger ? -1.0f : -1.4f; break; case BLOCK_4X4: break; default: assert(0 && "Invalid block size in av1_ml_early_term_after_split()."); break; } if (!nn_config) return; // Use more conservative threshold for level 1. if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f; const MACROBLOCKD *const xd = &x->e_mbd; const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); const int bs = block_size_wide[bsize]; int f_idx = 0; float features[FEATURES] = { 0.0f }; features[f_idx++] = log1pf((float)dc_q / 4.0f); features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f); add_rd_feature(part_none_rd, best_rd, features, &f_idx); add_rd_feature(part_split_rd, best_rd, features, &f_idx); for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { add_rd_feature(split_block_rd[i], best_rd, features, &f_idx); int min_bw = MAX_SB_SIZE_LOG2; int min_bh = MAX_SB_SIZE_LOG2; get_min_bsize(sms_tree->split[i], &min_bw, &min_bh); features[f_idx++] = (float)min_bw; features[f_idx++] = (float)min_bh; } simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, NULL, FEATURE_SMS_PRUNE_PART_FLAG); features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]); features[f_idx++] = log1pf((float)sms_tree->split[0]->sms_none_feat[1]); features[f_idx++] = log1pf((float)sms_tree->split[1]->sms_none_feat[1]); features[f_idx++] = log1pf((float)sms_tree->split[2]->sms_none_feat[1]); features[f_idx++] = log1pf((float)sms_tree->split[3]->sms_none_feat[1]); features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[1]); features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[3]); features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[5]); features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[7]); assert(f_idx == FEATURES); // Write features to file write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURES, 4, bsize, mi_row, mi_col); if (ext_ml_model_decision_after_split( cpi, features, &part_state->terminate_partition_search)) { return; } float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); // Score is indicator of confidence that we should NOT terminate. 
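// In other words, the search is terminated only when the score falls below the
// block-size- and resolution-dependent threshold selected above.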
if (score < thresh) { part_state->terminate_partition_search = 1; } } #undef FEATURES void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, int64_t best_rd, int64_t none_rd, const int64_t *split_rd, PartitionSearchState *part_state) { const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; best_rd = AOMMAX(best_rd, 1); const NN_CONFIG *nn_config = NULL; const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; float cur_thresh = 0.0f; switch (bsize) { case BLOCK_8X8: nn_config = &av1_rect_partition_nnconfig_8; cur_thresh = prob_thresholds[0]; break; case BLOCK_16X16: nn_config = &av1_rect_partition_nnconfig_16; cur_thresh = prob_thresholds[1]; break; case BLOCK_32X32: nn_config = &av1_rect_partition_nnconfig_32; cur_thresh = prob_thresholds[2]; break; case BLOCK_64X64: nn_config = &av1_rect_partition_nnconfig_64; cur_thresh = prob_thresholds[3]; break; case BLOCK_128X128: nn_config = &av1_rect_partition_nnconfig_128; cur_thresh = prob_thresholds[4]; break; default: assert(0 && "Unexpected bsize."); } if (!nn_config) return; // 1. Compute input features float features[9]; // RD cost ratios for (int i = 0; i < 5; i++) features[i] = 1.0f; if (none_rd > 0 && none_rd < 1000000000) features[0] = (float)none_rd / (float)best_rd; for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { if (split_rd[i] > 0 && split_rd[i] < 1000000000) features[1 + i] = (float)split_rd[i] / (float)best_rd; } // Variance ratios const MACROBLOCKD *const xd = &x->e_mbd; int whole_block_variance; whole_block_variance = av1_get_perpixel_variance_facade( cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); whole_block_variance = AOMMAX(whole_block_variance, 1); int split_variance[SUB_PARTITIONS_SPLIT]; const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); struct buf_2d buf; buf.stride = x->plane[0].src.stride; const int bw = block_size_wide[bsize]; for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { const int x_idx = (i & 1) * bw / 2; const int y_idx = (i >> 1) * bw / 2; buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; split_variance[i] = av1_get_perpixel_variance_facade(cpi, xd, &buf, subsize, AOM_PLANE_Y); } for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; // Write features to file write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, /*feature_size=*/9, 5, bsize, mi_row, mi_col); if (ext_ml_model_decision_after_split_part2( &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), features, &part_state->prune_rect_part[HORZ], &part_state->prune_rect_part[VERT])) { return; } // 2. Do the prediction and prune 0-2 partitions based on their probabilities float raw_scores[3] = { 0.0f }; av1_nn_predict(features, nn_config, 1, raw_scores); float probs[3] = { 0.0f }; av1_nn_softmax(raw_scores, probs, 3); // probs[0] is the probability of the fact that both rectangular partitions // are worse than current best_rd if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1; if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1; } // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. 
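// Decoding sketch for the 16-class model output used below (derived from the
// code; the index value is only an example): each class index is a 4-bit mask
// over { HORZ_A, HORZ_B, VERT_A, VERT_B }, and every class whose integer score
// (100 * score) lies within the per-bsize margin of the best class turns its
// bits on, e.g. a near-best class index of 0x5 (0b0101) enables HORZ_A and
// VERT_A.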
static void ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx, int64_t best_rd, PartitionSearchState *part_state, int *ab_partitions_allowed) { const PartitionBlkParams blk_params = part_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; const NN_CONFIG *nn_config = NULL; switch (bsize) { case BLOCK_8X8: nn_config = NULL; break; case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; default: assert(0 && "Unexpected bsize."); } if (!nn_config) return; // Generate features. float features[10]; int feature_index = 0; features[feature_index++] = (float)part_ctx; features[feature_index++] = (float)var_ctx; const int rdcost = (int)AOMMIN(INT_MAX, best_rd); int sub_block_rdcost[8] = { 0 }; int rd_index = 0; for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { const int64_t *horz_rd = part_state->rect_part_rd[HORZ]; if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)horz_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { const int64_t *vert_rd = part_state->rect_part_rd[VERT]; if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)vert_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { const int64_t *split_rd = part_state->split_rd; if (split_rd[i] > 0 && split_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)split_rd[i]; ++rd_index; } for (int i = 0; i < 8; ++i) { // Ratio between the sub-block RD and the whole-block RD. float rd_ratio = 1.0f; if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; features[feature_index++] = rd_ratio; } assert(feature_index == 10); // Write features to file if (!frame_is_intra_only(&cpi->common)) { write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, /*feature_size=*/10, 6, bsize, mi_row, mi_col); } if (ext_ml_model_decision_after_rect( &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), features, &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A], &ab_partitions_allowed[VERT_B])) { return; } // Calculate scores using the NN model. float score[16] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); int int_score[16]; int max_score = -1000; for (int i = 0; i < 16; ++i) { int_score[i] = (int)(100 * score[i]); max_score = AOMMAX(int_score[i], max_score); } // Make decisions based on the model scores. int thresh = max_score; switch (bsize) { case BLOCK_16X16: thresh -= 150; break; case BLOCK_32X32: thresh -= 100; break; default: break; } av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS); for (int i = 0; i < 16; ++i) { if (int_score[i] >= thresh) { if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1; if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1; if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1; if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1; } } } #define FEATURES 18 #define LABELS 4 // Use a ML model to predict if horz4 and vert4 should be considered. 
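// Label decoding sketch for the 4-class model used below (derived from the
// code): each label is a 2-bit mask, bit 0 enabling HORZ4 and bit 1 enabling
// VERT4; labels whose integer score (100 * score) lies within the per-bsize
// margin of the best label (500 for BLOCK_16X16/BLOCK_32X32, 200 for
// BLOCK_64X64) switch their partitions on.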
void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x, int part_ctx, int64_t best_rd, PartitionSearchState *part_state, int *part4_allowed, unsigned int pb_source_variance) { const PartitionBlkParams blk_params = part_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; const BLOCK_SIZE bsize = blk_params.bsize; int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd; int64_t *split_rd = part_state->split_rd; if (ext_ml_model_decision_after_part_ab( cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance, mi_row, mi_col)) return; if (best_rd >= 1000000000) return; int64_t *horz_rd = rect_part_rd[HORZ4]; int64_t *vert_rd = rect_part_rd[VERT4]; const NN_CONFIG *nn_config = NULL; // 4-way partitions are only allowed for these three square block sizes. switch (bsize) { case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; default: assert(0 && "Unexpected bsize."); } if (!nn_config) return; // Generate features. float features[FEATURES]; int feature_index = 0; features[feature_index++] = (float)part_ctx; features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); const int rdcost = (int)AOMMIN(INT_MAX, best_rd); int sub_block_rdcost[8] = { 0 }; int rd_index = 0; for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)horz_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)vert_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { if (split_rd[i] > 0 && split_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)split_rd[i]; ++rd_index; } for (int i = 0; i < 8; ++i) { // Ratio between the sub-block RD and the whole-block RD. float rd_ratio = 1.0f; if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; features[feature_index++] = rd_ratio; } // Get variance of the 1:4 and 4:1 sub-blocks. unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; { BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); assert(horz_4_bs != BLOCK_INVALID); assert(vert_4_bs != BLOCK_INVALID); av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(&cpi->common), bsize); const int src_stride = x->plane[0].src.stride; uint8_t *src = x->plane[0].src.buf; const MACROBLOCKD *const xd = &x->e_mbd; struct buf_2d horz_4_src, vert_4_src; horz_4_src.stride = src_stride; vert_4_src.stride = src_stride; for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; horz_4_source_var[i] = av1_get_perpixel_variance_facade( cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y); vert_4_source_var[i] = av1_get_perpixel_variance_facade( cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y); } } const float denom = (float)(pb_source_variance + 1); const float low_b = 0.1f; const float high_b = 10.0f; for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { // Ratio between the 4:1 sub-block variance and the whole-block variance. 
float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; if (var_ratio < low_b) var_ratio = low_b; if (var_ratio > high_b) var_ratio = high_b; features[feature_index++] = var_ratio; } for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { // Ratio between the 1:4 sub-block RD and the whole-block RD. float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; if (var_ratio < low_b) var_ratio = low_b; if (var_ratio > high_b) var_ratio = high_b; features[feature_index++] = var_ratio; } assert(feature_index == FEATURES); // Write features to file if (!frame_is_intra_only(&cpi->common)) { write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURES, 7, bsize, mi_row, mi_col); } // Calculate scores using the NN model. float score[LABELS] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); int int_score[LABELS]; int max_score = -1000; for (int i = 0; i < LABELS; ++i) { int_score[i] = (int)(100 * score[i]); max_score = AOMMAX(int_score[i], max_score); } // Make decisions based on the model scores. int thresh = max_score; switch (bsize) { case BLOCK_16X16: thresh -= 500; break; case BLOCK_32X32: thresh -= 500; break; case BLOCK_64X64: thresh -= 200; break; default: break; } av1_zero_array(part4_allowed, NUM_PART4_TYPES); for (int i = 0; i < LABELS; ++i) { if (int_score[i] >= thresh) { if ((i >> 0) & 1) part4_allowed[HORZ4] = 1; if ((i >> 1) & 1) part4_allowed[VERT4] = 1; } } } #undef FEATURES #undef LABELS #define FEATURES 4 void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x, const RD_STATS *const rd_stats, unsigned int pb_source_variance, int bit_depth, PartitionSearchState *part_state) { const PartitionBlkParams *blk_params = &part_state->part_blk_params; const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; const BLOCK_SIZE bsize = blk_params->bsize; const NN_CONFIG *nn_config = NULL; int thresh = 0; switch (bsize) { case BLOCK_8X8: nn_config = &av1_partition_breakout_nnconfig_8; thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0]; break; case BLOCK_16X16: nn_config = &av1_partition_breakout_nnconfig_16; thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1]; break; case BLOCK_32X32: nn_config = &av1_partition_breakout_nnconfig_32; thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2]; break; case BLOCK_64X64: nn_config = &av1_partition_breakout_nnconfig_64; thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3]; break; case BLOCK_128X128: nn_config = &av1_partition_breakout_nnconfig_128; thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4]; break; default: assert(0 && "Unexpected bsize."); } if (!nn_config || thresh < 0) return; const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f }; thresh = (int)((float)thresh * ml_predict_breakout_thresh_scale [cpi->sf.part_sf.ml_predict_breakout_level - 1]); // Generate feature values. 
float features[FEATURES]; int feature_index = 0; const int num_pels_log2 = num_pels_log2_lookup[bsize]; float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * rate_f; features[feature_index++] = rate_f; const float dist_f = (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); features[feature_index++] = dist_f; features[feature_index++] = (float)pb_source_variance; const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8); features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; assert(feature_index == FEATURES); // Write features to file write_features_to_file(cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, features, FEATURES, 2, bsize, mi_row, mi_col); if (ext_ml_model_decision_after_none(&cpi->ext_part_controller, frame_is_intra_only(&cpi->common), features, &part_state->do_square_split, &part_state->do_rectangular_split)) { return; } // Calculate score using the NN model. float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); // Make decision. if ((int)(score * 100) >= thresh) { part_state->do_square_split = 0; part_state->do_rectangular_split = 0; } } #undef FEATURES void av1_prune_partitions_before_search(AV1_COMP *const cpi, MACROBLOCK *const x, SIMPLE_MOTION_DATA_TREE *const sms_tree, PartitionSearchState *part_state) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const PartitionBlkParams *blk_params = &part_state->part_blk_params; const BLOCK_SIZE bsize = blk_params->bsize; #if CONFIG_THREE_PASS if (cpi->third_pass_ctx) { int mi_row = blk_params->mi_row; int mi_col = blk_params->mi_col; double ratio_h, ratio_w; av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width, &ratio_h, &ratio_w); THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w); BLOCK_SIZE third_pass_bsize = av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w); // check the actual partition of this block in the second pass PARTITION_TYPE third_pass_part = av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi); int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) || (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols); if (!is_edge && block_size_wide[bsize] >= 16) { // If in second pass we used rectangular partition, then do not search for // rectangular partition in the different direction. if (third_pass_part != PARTITION_NONE) { if (third_pass_part == PARTITION_HORZ || third_pass_part == PARTITION_HORZ_4 || third_pass_part == PARTITION_HORZ_A || third_pass_part == PARTITION_HORZ_B) { part_state->partition_rect_allowed[VERT] = 0; } else if (third_pass_part == PARTITION_VERT || third_pass_part == PARTITION_VERT_4 || third_pass_part == PARTITION_VERT_A || third_pass_part == PARTITION_VERT_B) { part_state->partition_rect_allowed[HORZ] = 0; } } int minSize = AOMMIN(block_size_wide[third_pass_bsize], block_size_high[third_pass_bsize]); int maxSize = AOMMAX(block_size_wide[third_pass_bsize], block_size_high[third_pass_bsize]); if (block_size_wide[bsize] < minSize / 4) { // Current partition is too small, just terminate part_state->terminate_partition_search = 1; return; } else if (block_size_wide[bsize] < minSize / 2) { if (third_pass_part != PARTITION_NONE) { // Current partition is very small, and in second pass we used // rectangular partition. Terminate the search here then. 
part_state->terminate_partition_search = 1; return; } else { // Partition is small, but we still check this partition, only disable // further splits. // TODO(any): check why this is not covered by the termination for < // minSize/4. av1_disable_square_split_partition(part_state); av1_disable_rect_partitions(part_state); return; } } else if (block_size_wide[bsize] > maxSize) { // Partition is larger than in the second pass. Only allow split. av1_set_square_split_only(part_state); return; } else if (block_size_wide[bsize] >= minSize && block_size_wide[bsize] <= maxSize) { // Partition is within a range where it is very likely to find a good // choice, so do not prune anything. return; } } } #endif // CONFIG_THREE_PASS // Prune rectangular partitions for larger blocks. if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) { part_state->do_rectangular_split = 0; part_state->partition_rect_allowed[HORZ] = 0; part_state->partition_rect_allowed[VERT] = 0; } // Prune rectangular, AB and 4-way partition based on q index and block size if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) { if (bsize == BLOCK_8X8 && x->qindex < 35) av1_disable_rect_partitions(part_state); } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) { // Enumeration difference between two square partitions const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16; int max_bsize = BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step; max_bsize = AOMMAX(max_bsize, BLOCK_4X4); const BLOCK_SIZE max_prune_bsize = (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32); // Prune partition // qidx 0 to 85: prune bsize below BLOCK_32X32 // qidx 86 to 170: prune bsize below BLOCK_16X16 // qidx 171 to 255: prune bsize below BLOCK_8X8 if (bsize < max_prune_bsize) { av1_disable_rect_partitions(part_state); } } if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) { const MACROBLOCKD *const xd = &x->e_mbd; int prune_sub_8x8; if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) { prune_sub_8x8 = 1; } else { assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1); // Prune if both neighbors are available and either is > BLOCK_8X8 prune_sub_8x8 = xd->left_available && xd->up_available && (xd->left_mbmi->bsize > BLOCK_8X8 || xd->above_mbmi->bsize > BLOCK_8X8); } if (prune_sub_8x8) { av1_disable_all_splits(part_state); } } // A CNN-based speed feature pruning out either split or all non-split // partition in INTRA frame coding. const int try_intra_cnn_based_part_prune = frame_is_intra_only(cm) && cpi->sf.part_sf.intra_cnn_based_part_prune_level && cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && blk_params->bsize_at_least_8x8 && av1_is_whole_blk_in_frame(blk_params, mi_params); if (try_intra_cnn_based_part_prune) { intra_mode_cnn_partition(&cpi->common, x, x->part_search_info.quad_tree_idx, cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state); } // Use simple motion search to prune out split or non-split partitions. This // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a // smaller blocksize. const int try_split_only = cpi->sf.part_sf.simple_motion_search_split && part_state->do_square_split && blk_params->bsize_at_least_8x8 && av1_is_whole_blk_in_frame(blk_params, mi_params) && !frame_is_intra_only(cm) && !av1_superres_scaled(cm); if (try_split_only) { simple_motion_search_based_split(cpi, x, sms_tree, part_state); } // Use simple motion search to prune out rectangular partition in some // direction. 
The results are stored in prune_horz and prune_vert in order to // bypass future related pruning checks if a pruning decision has been made. // We want to search at least one partition mode, so don't prune if NONE and // SPLIT are disabled. const int non_rect_part_allowed = part_state->do_square_split || part_state->partition_none_allowed; // Only run the model if the partitions are not already pruned. const int rect_part_allowed = part_state->do_rectangular_split && ((part_state->partition_rect_allowed[HORZ] && !part_state->prune_rect_part[HORZ]) || (part_state->partition_rect_allowed[VERT] && !part_state->prune_rect_part[VERT])); const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) && non_rect_part_allowed && rect_part_allowed && !av1_superres_scaled(cm); if (try_prune_rect) { simple_motion_search_prune_rect(cpi, x, sms_tree, part_state); } } #ifndef NDEBUG static inline int is_bsize_square(BLOCK_SIZE bsize) { return block_size_wide[bsize] == block_size_high[bsize]; } #endif // NDEBUG void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc, PartitionSearchState *part_state) { assert(is_bsize_square(sb_enc->max_partition_size)); assert(is_bsize_square(sb_enc->min_partition_size)); assert(sb_enc->min_partition_size <= sb_enc->max_partition_size); const PartitionBlkParams *blk_params = &part_state->part_blk_params; const BLOCK_SIZE bsize = blk_params->bsize; assert(is_bsize_square(bsize)); const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size]; const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size]; const int bsize_1d = block_size_wide[bsize]; assert(min_partition_size_1d <= max_partition_size_1d); const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d; const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d; if (is_gt_max_sq_part) { // If current block size is larger than max, only allow split. av1_set_square_split_only(part_state); } else if (is_le_min_sq_part) { // If current block size is less or equal to min, only allow none if valid // block large enough; only allow split otherwise. av1_disable_rect_partitions(part_state); // only disable square split when current block is not at the picture // boundary. otherwise, inherit the square split flag from previous logic if (av1_blk_has_rows_and_cols(blk_params)) { part_state->do_square_split = 0; } part_state->partition_none_allowed = !(part_state->do_square_split); } } // Decide whether to evaluate the AB partition specified by part_type based on // split and HORZ/VERT info static int evaluate_ab_partition_based_on_split( const PC_TREE *pc_tree, PARTITION_TYPE rect_part, const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1, int split_idx2) { int num_win = 0; // Threshold for number of winners // Conservative pruning for high quantizers const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3); int sub_part_win = (rect_part_win_info == NULL) ? (pc_tree->partitioning == rect_part) : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ] : rect_part_win_info->rect_part_win[VERT]; num_win += (sub_part_win) ? 1 : 0; if (pc_tree->split[split_idx1]) { num_win += (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0; } else { num_win += 1; } if (pc_tree->split[split_idx2]) { num_win += (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 
1 : 0; } else { num_win += 1; } if (num_win < num_win_thresh) { return 0; } return 1; } void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, int pb_source_variance, int64_t best_rdcost, const RD_RECT_PART_WIN_INFO *rect_part_win_info, bool ext_partition_allowed, PartitionSearchState *part_state, int *ab_partitions_allowed) { int64_t *horz_rd = part_state->rect_part_rd[HORZ]; int64_t *vert_rd = part_state->rect_part_rd[VERT]; int64_t *split_rd = part_state->split_rd; const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg; // The standard AB partitions are allowed initially if ext-partition-types are // allowed. int horzab_partition_allowed = ext_partition_allowed && part_cfg->enable_ab_partitions && part_state->partition_rect_allowed[HORZ]; int vertab_partition_allowed = ext_partition_allowed && part_cfg->enable_ab_partitions && part_state->partition_rect_allowed[VERT]; // Pruning: pruning out AB partitions on one main direction based on the // current best partition and source variance. if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) { // TODO(debargha,huisu@google.com): may need to tune the threshold for // pb_source_variance. horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || (pc_tree->partitioning == PARTITION_NONE && pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || (pc_tree->partitioning == PARTITION_NONE && pb_source_variance < 32) || pc_tree->partitioning == PARTITION_SPLIT); } else { horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || pc_tree->partitioning == PARTITION_SPLIT); vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || pc_tree->partitioning == PARTITION_SPLIT); } horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0); } // Pruning: pruning out horz_a or horz_b if the combined rdcost of its // subblocks estimated from previous partitions is much higher than the best // rd so far. ab_partitions_allowed[HORZ_A] = horzab_partition_allowed; ab_partitions_allowed[HORZ_B] = horzab_partition_allowed; if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { case 1: ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost); ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost); break; case 2: default: ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost); ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost); break; } } // Pruning: pruning out vert_a or vert_b if the combined rdcost of its // subblocks estimated from previous partitions is much higher than the best // rd so far. 
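// As a worked example of the thresholds applied below (derived from the code):
// at prune_ext_partition_types_search_level 1 a candidate such as VERT_A is
// kept only if vert_a_rd / 16 * 14 (i.e. with roughly a 12.5% discount) is
// still below the best rd so far, while level 2 applies the milder 15 / 16
// (about 6%) discount.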
ab_partitions_allowed[VERT_A] = vertab_partition_allowed; ab_partitions_allowed[VERT_B] = vertab_partition_allowed; if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { case 1: ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost); ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost); break; case 2: default: ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost); ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost); break; } } // Pruning: pruning out some ab partitions using a DNN taking rd costs of // sub-blocks from previous basic partition types. if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed && part_state->partition_rect_allowed[HORZ] && part_state->partition_rect_allowed[VERT]) { // TODO(huisu@google.com): x->source_variance may not be the current // block's variance. The correct one to use is pb_source_variance. Need to // re-train the model to fix it. ml_prune_ab_partition(cpi, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdcost, part_state, ab_partitions_allowed); } // Pruning: pruning AB partitions based on the number of horz/vert wins // in the current block and sub-blocks in PARTITION_SPLIT. if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && ab_partitions_allowed[HORZ_A]) { ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split( pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1); } if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && ab_partitions_allowed[HORZ_B]) { ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split( pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3); } if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && ab_partitions_allowed[VERT_A]) { ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split( pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2); } if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && ab_partitions_allowed[VERT_B]) { ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split( pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3); } } // Prepare features for the external model. Specifically, features after // ab partition is searched. static void prepare_features_after_part_ab( const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance, int mi_row, int mi_col, aom_partition_features_t *const features) { int64_t *horz_rd = rect_part_rd[HORZ]; int64_t *vert_rd = rect_part_rd[VERT]; // Generate features. 
int feature_index = 0; features->after_part_ab.f[feature_index++] = (float)part_ctx; features->after_part_ab.f[feature_index++] = (float)get_unsigned_bits(pb_source_variance); const int rdcost = (int)AOMMIN(INT_MAX, best_rd); int sub_block_rdcost[8] = { 0 }; int rd_index = 0; for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)horz_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)vert_rd[i]; ++rd_index; } for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { if (split_rd[i] > 0 && split_rd[i] < 1000000000) sub_block_rdcost[rd_index] = (int)split_rd[i]; ++rd_index; } for (int i = 0; i < 8; ++i) { // Ratio between the sub-block RD and the whole-block RD. float rd_ratio = 1.0f; if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; features->after_part_ab.f[feature_index++] = rd_ratio; } // 4-way partitions are only allowed for these three square block sizes. assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64); // Get variance of the 1:4 and 4:1 sub-blocks. unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; { BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); assert(horz_4_bs != BLOCK_INVALID); assert(vert_4_bs != BLOCK_INVALID); av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(&cpi->common), bsize); const int src_stride = x->plane[0].src.stride; uint8_t *src = x->plane[0].src.buf; const MACROBLOCKD *const xd = &x->e_mbd; struct buf_2d horz_4_src, vert_4_src; horz_4_src.stride = src_stride; vert_4_src.stride = src_stride; for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; horz_4_source_var[i] = av1_get_perpixel_variance_facade( cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y); vert_4_source_var[i] = av1_get_perpixel_variance_facade( cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y); } } const float denom = (float)(pb_source_variance + 1); const float low_b = 0.1f; const float high_b = 10.0f; for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { // Ratio between the 4:1 sub-block variance and the whole-block variance. float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; if (var_ratio < low_b) var_ratio = low_b; if (var_ratio > high_b) var_ratio = high_b; features->after_part_ab.f[feature_index++] = var_ratio; } for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { // Ratio between the 1:4 sub-block RD and the whole-block RD. float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; if (var_ratio < low_b) var_ratio = low_b; if (var_ratio > high_b) var_ratio = high_b; features->after_part_ab.f[feature_index++] = var_ratio; } assert(feature_index == 18); } // If the external partition model is used, we let it determine partition // decisions before partition none. 
Specifically, these parameters: // partition_none_allowed // partition_horz_allowed // partition_vert_allowed // do_rectangular_split // do_square_split static bool ext_ml_model_decision_before_none( AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], int *partition_none_allowed, int *partition_horz_allowed, int *partition_vert_allowed, int *do_rectangular_split, int *do_square_split) { ExtPartController *const ext_part_controller = &cpi->ext_part_controller; if (!ext_part_controller->ready) return false; // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE; for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) { features.before_part_none.f[i] = features_from_motion[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *partition_none_allowed = decision.partition_none_allowed; *partition_horz_allowed = decision.partition_rect_allowed[HORZ]; *partition_vert_allowed = decision.partition_rect_allowed[VERT]; *do_rectangular_split = decision.do_rectangular_split; *do_square_split = decision.do_square_split; return true; } // If the external partition model is used, we let it determine partition // decisions before partition none. Specifically, these parameters: // prune_horz // prune_vert static bool ext_ml_model_decision_before_none_part2( AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], int *prune_horz, int *prune_vert) { ExtPartController *const ext_part_controller = &cpi->ext_part_controller; if (!ext_part_controller->ready) return false; // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2; for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) { features.before_part_none.f_part2[i] = features_from_motion[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *prune_horz = decision.prune_rect_part[HORZ]; *prune_vert = decision.prune_rect_part[VERT]; return true; } // If the external partition model is used, we let it determine partition // decisions after none partition. Specifically, these parameters: // do_square_split // do_rectangular_split bool ext_ml_model_decision_after_none( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_after_none, int *do_square_split, int *do_rectangular_split) { if (!ext_part_controller->ready || is_intra_frame) return false; // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_NONE; for (int i = 0; i < 4; ++i) { features.after_part_none.f[i] = features_after_none[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. 
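// All ext_ml_model_decision_* helpers in this file share one handshake:
// check ext_part_controller->ready, fill an aom_partition_features_t with
// the stage-specific feature id, send it with av1_ext_part_send_features(),
// then read back an aom_partition_decision_t with
// av1_ext_part_get_partition_decision(). The fragment below is an
// illustrative sketch of that pattern (not part of the original source and
// kept under #if 0 so it is never compiled); it only uses names that appear
// in this file.
#if 0
  aom_partition_features_t sketch_features = { 0 };
  sketch_features.id = AOM_EXT_PART_FEATURE_AFTER_NONE;  // Stage-specific id.
  av1_ext_part_send_features(ext_part_controller, &sketch_features);
  aom_partition_decision_t sketch_decision;
  if (!av1_ext_part_get_partition_decision(ext_part_controller,
                                           &sketch_decision)) {
    // No decision returned: fall back to the built-in heuristics.
  }
#endif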
aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *do_square_split = decision.do_square_split; *do_rectangular_split = decision.do_rectangular_split; return true; } // If the external partition model is used, we let it determine partition // decisions after none partition. Specifically, these parameters: // terminate_partition_search bool ext_ml_model_decision_after_none_part2( AV1_COMP *const cpi, const float *const features_terminate, int *terminate_partition_search) { AV1_COMMON *const cm = &cpi->common; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false; // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2; for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) { features.after_part_none.f_terminate[i] = features_terminate[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *terminate_partition_search = decision.terminate_partition_search; return true; } // If the external partition model is used, we let it determine partition // decisions after none partition. Specifically, these parameters: // terminate_partition_search bool ext_ml_model_decision_after_split(AV1_COMP *const cpi, const float *const features_terminate, int *terminate_partition_search) { const AV1_COMMON *const cm = &cpi->common; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) { return false; } // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT; for (int i = 0; i < 31; ++i) { features.after_part_split.f_terminate[i] = features_terminate[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *terminate_partition_search = decision.terminate_partition_search; return true; } // If the external partition model is used, we let it determine partition // decisions after none partition. Specifically, these parameters: // prune_rect_part[HORZ] // prune_rect_part[VERT] bool ext_ml_model_decision_after_split_part2( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_prune, int *prune_rect_part_horz, int *prune_rect_part_vert) { if (is_intra_frame || !ext_part_controller->ready) { return false; } // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2; for (int i = 0; i < 9; ++i) { features.after_part_split.f_prune_rect[i] = features_prune[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. 
aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *prune_rect_part_horz = decision.prune_rect_part[0]; *prune_rect_part_vert = decision.prune_rect_part[1]; return true; } // If the external partition model is used, we let it determine partition // decisions after rectangular partition. Specifically, these parameters: // horza_partition_allowed // horzb_partition_allowed // verta_partition_allowed // vertb_partition_allowed static bool ext_ml_model_decision_after_rect( ExtPartController *const ext_part_controller, const int is_intra_frame, const float *const features_after_rect, int *horza_partition_allowed, int *horzb_partition_allowed, int *verta_partition_allowed, int *vertb_partition_allowed) { if (is_intra_frame || !ext_part_controller->ready) return false; // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_RECT; for (int i = 0; i < 10; ++i) { features.after_part_rect.f[i] = features_after_rect[i]; } // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *horza_partition_allowed = decision.horza_partition_allowed; *horzb_partition_allowed = decision.horzb_partition_allowed; *verta_partition_allowed = decision.verta_partition_allowed; *vertb_partition_allowed = decision.vertb_partition_allowed; return true; } // If the external partition model is used, we let it determine partition // decisions after AB partition. Specifically, these parameters: // partition_vert4_allowed // partition_horz4_allowed static bool ext_ml_model_decision_after_part_ab( AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; ExtPartController *const ext_part_controller = &cpi->ext_part_controller; if (!frame_is_intra_only(cm) && ext_part_controller->ready) { // Setup features. aom_partition_features_t features; features.id = AOM_EXT_PART_FEATURE_AFTER_AB; prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, pb_source_variance, mi_row, mi_col, &features); // Send necessary features to the external model. av1_ext_part_send_features(ext_part_controller, &features); // Get partition decisions from the external model. aom_partition_decision_t decision; const bool valid_decision = av1_ext_part_get_partition_decision(ext_part_controller, &decision); if (!valid_decision) return false; // Populate decisions *partition_horz4_allowed = decision.partition_horz4_allowed; *partition_vert4_allowed = decision.partition_vert4_allowed; return true; } return false; } // This function resembles "av1_setup_sms_tree()" in context_tree.c // with function signature change. 
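// It initializes a pre-allocated array of SIMPLE_MOTION_DATA_TREE nodes:
// 256 leaf nodes (1024 when the superblock size is 128x128) are given the
// base block size square[0]; each higher level then groups four children
// via the split[] pointers until the root is reached, and the root node
// (the last element of the array) is returned to the caller. In the
// firstpass/LAP stage only a single node is set up.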
static SIMPLE_MOTION_DATA_TREE *setup_sms_tree( AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) { AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); int sms_tree_index = 0; SIMPLE_MOTION_DATA_TREE *this_sms; int square_index = 1; int nodes; this_sms = &sms_tree[0]; if (!stat_generation_stage) { const int leaf_factor = is_sb_size_128 ? 4 : 1; const int leaf_nodes = 256 * leaf_factor; // Sets up all the leaf nodes in the tree. for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) { SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; tree->block_size = square[0]; } // Each node has 4 leaf nodes, fill each block_size level of the tree // from leafs to the root. for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { for (int i = 0; i < nodes; ++i) { SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; tree->block_size = square[square_index]; for (int j = 0; j < 4; j++) tree->split[j] = this_sms++; ++sms_tree_index; } ++square_index; } } else { // Allocation for firstpass/LAP stage // TODO(Mufaddal): refactor square_index to use a common block_size macro // from firstpass.c SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; square_index = 2; tree->block_size = square[square_index]; } // Set up the root node for the largest superblock size return &sms_tree[tree_nodes - 1]; } static void write_motion_feature_to_file( const char *const path, const int sb_counter, const unsigned int *block_sse, const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize, const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) { char filename[256]; snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path, sb_counter); FILE *pfile = fopen(filename, "w"); fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize, block_size_wide[fixed_block_size], num_blocks); for (int i = 0; i < num_blocks; ++i) { fprintf(pfile, "%d", block_sse[i]); if (i < num_blocks - 1) fprintf(pfile, ","); } fprintf(pfile, "\n"); for (int i = 0; i < num_blocks; ++i) { fprintf(pfile, "%d", block_var[i]); if (i < num_blocks - 1) fprintf(pfile, ","); } fprintf(pfile, "\n"); fclose(pfile); } void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) { const AV1_COMMON *const cm = &cpi->common; if (frame_is_intra_only(cm)) return; MACROBLOCK *const x = &td->mb; const BLOCK_SIZE fixed_block_size = BLOCK_16X16; const int col_step = mi_size_wide[fixed_block_size]; const int row_step = mi_size_high[fixed_block_size]; SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; const int stat_generation_stage = is_stat_generation_stage(cpi); const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); TileInfo *const tile_info = &tile_data->tile_info; av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row, mi_col); av1_reset_simple_motion_tree_partition(sms_root, bsize); const int ref_list[] = { 
cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME }; const int mi_width = AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); const int mi_height = AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0); const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0); const int num_blocks = col_steps * row_steps; unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse)); unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var)); if (!(block_sse && block_var)) { aom_free(sms_tree); aom_free(block_sse); aom_free(block_var); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating block_sse & block_var"); } int idx = 0; for (int row = mi_row; row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows); row += row_step) { for (int col = mi_col; col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols); col += col_step) { simple_motion_search_get_best_ref( cpi, x, sms_root, row, col, fixed_block_size, ref_list, /*num_refs=*/1, /*use_subpixel=*/1, /*save_mv=*/1, &block_sse[idx], &block_var[idx]); ++idx; } } if (features == NULL) { write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter, block_sse, block_var, idx, bsize, fixed_block_size, mi_row, mi_col); } else { features->sb_features.motion_features.unit_length = block_size_wide[fixed_block_size]; features->sb_features.motion_features.num_units = idx; for (int i = 0; i < idx; ++i) { features->sb_features.motion_features.block_sse[i] = block_sse[i]; features->sb_features.motion_features.block_var[i] = block_var[i]; } } aom_free(block_sse); aom_free(block_var); aom_free(sms_tree); } #if CONFIG_PARTITION_SEARCH_ORDER void av1_prepare_motion_search_features_block( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, const int valid_partition_types, unsigned int *block_sse, unsigned int *block_var, unsigned int sub_block_sse[4], unsigned int sub_block_var[4], unsigned int horz_block_sse[2], unsigned int horz_block_var[2], unsigned int vert_block_sse[2], unsigned int vert_block_var[2]) { const AV1_COMMON *const cm = &cpi->common; if (frame_is_intra_only(cm)) return; MACROBLOCK *const x = &td->mb; SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; const int stat_generation_stage = is_stat_generation_stage(cpi); const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); TileInfo *const tile_info = &tile_data->tile_info; av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); av1_reset_simple_motion_tree_partition(sms_root, bsize); const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME }; const int sub_mi_width = mi_size_wide[bsize] / 2; const int sub_mi_height = sub_mi_width; simple_motion_search_get_best_ref( cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1, /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var); // Split to 4 sub blocks. 
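// valid_partition_types is a bitmask over PARTITION_* values; for example
// (1 << PARTITION_SPLIT) | (1 << PARTITION_HORZ) requests only the 4-way
// split and the horizontal split measurements. Each enabled partition type
// re-runs simple_motion_search_get_best_ref() on its sub-blocks to collect
// per-sub-block SSE and variance, starting with the 4-way split below.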
if (valid_partition_types & (1 << PARTITION_SPLIT)) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); for (int i = 0; i < 4; ++i) { const int row = mi_row + (i >> 1) * sub_mi_height; const int col = mi_col + (i & 1) * sub_mi_width; simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, ref_list, /*num_refs=*/1, /*use_subpixel=*/1, /*save_mv=*/1, &sub_block_sse[i], &sub_block_var[i]); } } // Horizontal split if (valid_partition_types & (1 << PARTITION_HORZ)) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); for (int i = 0; i < 2; ++i) { const int row = mi_row + (i & 1) * sub_mi_height; const int col = mi_col; simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, ref_list, /*num_refs=*/1, /*use_subpixel=*/1, /*save_mv=*/1, &horz_block_sse[i], &horz_block_var[i]); } } // Vertical split if (valid_partition_types & (1 << PARTITION_VERT)) { const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); for (int i = 0; i < 2; ++i) { const int row = mi_row; const int col = mi_col + (i & 1) * sub_mi_width; simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, ref_list, /*num_refs=*/1, /*use_subpixel=*/1, /*save_mv=*/1, &vert_block_sse[i], &vert_block_var[i]); } } aom_free(sms_tree); } #endif // CONFIG_PARTITION_SEARCH_ORDER #endif // !CONFIG_REALTIME_ONLY static inline void init_simple_motion_search_mvs( SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) { memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs)); av1_zero(sms_tree->sms_none_feat); av1_zero(sms_tree->sms_rect_feat); av1_zero(sms_tree->sms_none_valid); av1_zero(sms_tree->sms_rect_valid); if (sms_tree->block_size >= BLOCK_8X8) { init_simple_motion_search_mvs(sms_tree->split[0], start_mvs); init_simple_motion_search_mvs(sms_tree->split[1], start_mvs); init_simple_motion_search_mvs(sms_tree->split[2], start_mvs); init_simple_motion_search_mvs(sms_tree->split[3], start_mvs); } } void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, const TileInfo *tile_info, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col) { // Use the NEARESTMV of the sb as the start mv const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; FULLPEL_MV ref_mvs[REF_FRAMES]; const BLOCK_SIZE sb_size = cm->seq_params->sb_size; av1_zero(ref_mvs); // If tile_info is NULL, assume that the offsets have already been set. if (tile_info) { av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size); } MB_MODE_INFO_EXT mbmi_ext; const int ref_frame = cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs, mbmi_ext.mode_context); if (mbmi_ext.ref_mv_count[ref_frame] > 0) { ref_mvs[ref_frame] = get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv); } else { ref_mvs[ref_frame] = get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv); } init_simple_motion_search_mvs(sms_root, ref_mvs); } aom-3.12.1/av1/encoder/partition_strategy.h000066400000000000000000000271071477627663500206220ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ #include "config/aom_config.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encoder.h" #if !CONFIG_REALTIME_ONLY // Early terminates PARTITION_NONE using simple_motion_search features and the // rate, distortion, and rdcost of PARTITION_NONE. This is only called when: // - The frame is a show frame // - The frame is not intra only // - The current bsize is > BLOCK_8X8 // - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, const RD_STATS *none_rdc, PartitionSearchState *part_state); // Get the features for selecting the max and min partition size. Currently this // performs simple_motion_search on 16X16 subblocks of the current superblock, // and then extract the statistics of sse and motion vectors as features. void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col, float *features); // Predict the maximum BLOCK_SIZE to be used to encoder the current superblock. BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, const MACROBLOCK *const x, const float *features); // Attempts an early termination after PARTITION_SPLIT. void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, SIMPLE_MOTION_DATA_TREE *const sms_tree, int64_t best_rd, int64_t part_none_rd, int64_t part_split_rd, int64_t *split_block_rd, PartitionSearchState *part_state); // Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and // PARTITION_VERT. // TODO(chiyotsai@google.com): Currently this model does not use q value and has // no information about rectangular partitions. Preliminary experiments suggest // that we can get better performance by adding in q_index and rectangular // sse/var from SMS. We should retrain and tune this model later. void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, int64_t best_rd, int64_t none_rd, const int64_t *split_rd, PartitionSearchState *part_state); // Use a ML model to predict if horz4 and vert4 should be considered. void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x, int part_ctx, int64_t best_rd, PartitionSearchState *part_state, int *part4_allowed, unsigned int pb_source_variance); // ML-based partition search breakout after PARTITION_NONE. void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x, const RD_STATS *const rd_stats, unsigned int pb_source_variance, int bit_depth, PartitionSearchState *part_state); // The first round of partition pruning determined before any partition // has been tested. The decisions will be updated and passed back // to the partition search function. void av1_prune_partitions_before_search(AV1_COMP *const cpi, MACROBLOCK *const x, SIMPLE_MOTION_DATA_TREE *const sms_tree, PartitionSearchState *part_state); // Prune out partitions that lead to coding block sizes outside the min and max // bsizes set by the encoder. 
Max and min square partition levels are defined as // the partition nodes that the recursive function rd_pick_partition() can // reach. To implement this: only PARTITION_NONE is allowed if the current node // equals max_partition_size, only PARTITION_SPLIT is allowed if the current // node exceeds max_partition_size. void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc, PartitionSearchState *part_state); // Prune out AB partitions based on rd decisions made from testing the // basic partitions. void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, int pb_source_variance, int64_t best_rdcost, const RD_RECT_PART_WIN_INFO *rect_part_win_info, bool ext_partition_allowed, PartitionSearchState *part_state, int *ab_partitions_allowed); void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features); #if CONFIG_PARTITION_SEARCH_ORDER void av1_prepare_motion_search_features_block( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, const int mi_row, const int mi_col, const BLOCK_SIZE bsize, const int valid_partition_types, unsigned int *block_sse, unsigned int *block_var, unsigned int sub_block_sse[4], unsigned int sub_block_var[4], unsigned int horz_block_sse[2], unsigned int horz_block_var[2], unsigned int vert_block_sse[2], unsigned int vert_block_var[2]); #endif // CONFIG_PARTITION_SEARCH_ORDER #endif // !CONFIG_REALTIME_ONLY // A simplified version of set_offsets meant to be used for // simple_motion_search. static inline void set_offsets_for_motion_search(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col); // Set up destination pointers. av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); // Set up limit values for MV components. // Mv beyond the range do not produce new/different prediction block. av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height, mi_width, cpi->oxcf.border_in_pixels); set_plane_n4(xd, mi_width, mi_height, num_planes); xd->mi_row = mi_row; xd->mi_col = mi_col; // Set up distance of MB to edge of frame in 1/8th pel units. assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); xd->mb_to_right_edge = GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE); // Set up source buffers. 
av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); } void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, const TileInfo *tile_info, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, int mi_col); static inline int is_full_sb(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, BLOCK_SIZE sb_size) { const int sb_mi_wide = mi_size_wide[sb_size]; const int sb_mi_high = mi_size_high[sb_size]; return (mi_row + sb_mi_high) <= mi_params->mi_rows && (mi_col + sb_mi_wide) <= mi_params->mi_cols; } #if !CONFIG_REALTIME_ONLY // Do not use this criteria for screen content videos. // Since screen content videos could often find good predictors and the largest // block size is likely to be used. static inline int use_auto_max_partition(const AV1_COMP *const cpi, BLOCK_SIZE sb_size, int mi_row, int mi_col) { assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const AV1_COMMON *const cm = &cpi->common; return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools && cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE && sb_size == BLOCK_128X128 && is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) && cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != OVERLAY_UPDATE && cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE; } static BLOCK_SIZE dim_to_size(int dim) { switch (dim) { case 4: return BLOCK_4X4; case 8: return BLOCK_8X8; case 16: return BLOCK_16X16; case 32: return BLOCK_32X32; case 64: return BLOCK_64X64; case 128: return BLOCK_128X128; default: assert(0); return 0; } } static inline void set_max_min_partition_size(SuperBlockEnc *sb_enc, AV1_COMP *cpi, MACROBLOCK *x, const SPEED_FEATURES *sf, BLOCK_SIZE sb_size, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; sb_enc->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size, dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); sb_enc->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size, dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); sb_enc->max_partition_size = AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size); sb_enc->min_partition_size = AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size); if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); sb_enc->max_partition_size = AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), sb_enc->max_partition_size), sb_enc->min_partition_size); } } #endif // !CONFIG_REALTIME_ONLY #endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ aom-3.12.1/av1/encoder/pass2_strategy.c000066400000000000000000005370731477627663500176440ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\defgroup gf_group_algo Golden Frame Group * \ingroup high_level_algo * Algorithms regarding determining the length of GF groups and defining GF * group structures. * @{ */ /*! 
@} - end defgroup gf_group_algo */ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/encoder.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/pass2_strategy.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rc_utils.h" #include "av1/encoder/temporal_filter.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif #include "av1/encoder/tpl_model.h" #include "av1/encoder/encode_strategy.h" #define DEFAULT_KF_BOOST 2300 #define DEFAULT_GF_BOOST 2000 #define GROUP_ADAPTIVE_MAXQ 1 static void init_gf_stats(GF_GROUP_STATS *gf_stats); #if CONFIG_THREE_PASS static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params, int is_final_pass); #endif // Calculate an active area of the image that discounts formatting // bars and partially discounts other 0 energy areas. #define MIN_ACTIVE_AREA 0.5 #define MAX_ACTIVE_AREA 1.0 static double calculate_active_area(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame) { const double active_pct = 1.0 - ((this_frame->intra_skip_pct / 2) + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } // Calculate a modified Error used in distributing bits between easier and // harder frames. #define ACT_AREA_CORRECTION 0.5 static double calculate_modified_err_new(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *total_stats, const FIRSTPASS_STATS *this_stats, int vbrbias, double modified_error_min, double modified_error_max) { if (total_stats == NULL) { return 0; } const double av_weight = total_stats->weight / total_stats->count; const double av_err = (total_stats->coded_error * av_weight) / total_stats->count; double modified_error = av_err * pow(this_stats->coded_error * this_stats->weight / DOUBLE_DIVIDE_CHECK(av_err), vbrbias / 100.0); // Correction for active area. Frames with a reduced active area // (eg due to formatting bars) have a higher error per mb for the // remaining active MBs. The correction here assumes that coding // 0.5N blocks of complexity 2X is a little easier than coding N // blocks of complexity X. modified_error *= pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION); return fclamp(modified_error, modified_error_min, modified_error_max); } static double calculate_modified_err(const FRAME_INFO *frame_info, const TWO_PASS *twopass, const AV1EncoderConfig *oxcf, const FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; return calculate_modified_err_new( frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias, twopass->modified_error_min, twopass->modified_error_max); } // Resets the first pass file to the given position using a relative seek from // the current position. 
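// (The "seek" is purely an in-memory operation: the two-pass stats live in a
// buffer, and this helper simply repositions the stats_in pointer; no file
// I/O is involved.)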
static void reset_fpf_position(TWO_PASS_FRAME *p_frame, const FIRSTPASS_STATS *position) { p_frame->stats_in = position; } static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame, FIRSTPASS_STATS *fps) { if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; *fps = *p_frame->stats_in; ++p_frame->stats_in; return 1; } static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame, FIRSTPASS_STATS *fps) { if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; *fps = *p_frame->stats_in; /* Move old stats[0] out to accommodate for next frame stats */ memmove(p->frame_stats_arr[0], p->frame_stats_arr[1], (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) * sizeof(FIRSTPASS_STATS)); p->stats_buf_ctx->stats_in_end--; return 1; } // Read frame stats at an offset from the current position. static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, const TWO_PASS_FRAME *p_frame, int offset) { if ((offset >= 0 && p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) || (offset < 0 && p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) { return NULL; } return &p_frame->stats_in[offset]; } // This function returns the maximum target rate per frame. static int frame_max_bits(const RATE_CONTROL *rc, const AV1EncoderConfig *oxcf) { int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * (int64_t)oxcf->rc_cfg.vbrmax_section) / 100; if (max_bits < 0) max_bits = 0; else if (max_bits > rc->max_frame_bandwidth) max_bits = rc->max_frame_bandwidth; return (int)max_bits; } // Based on history adjust expectations of bits per macroblock. static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { TWO_PASS *const twopass = &cpi->ppi->twopass; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Based on recent history adjust expectations of bits per macroblock. double rate_err_factor = 1.0; const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0); const double min_fac = 1.0 - adj_limit; const double max_fac = 1.0 + adj_limit; #if CONFIG_THREE_PASS if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) { int64_t actual_bits = 0; int64_t target_bits = 0; double factor = 0.0; int count = 0; for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) { actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits; target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated; factor += cpi->third_pass_ctx->frame_info[i].bpm_factor; count++; } if (count == 0) { factor = 1.0; } else { factor /= (double)count; } factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits); if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) || (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) { twopass->bpm_factor = factor; twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); } } #endif // CONFIG_THREE_PASS int err_estimate = p_rc->rate_error_estimate; int64_t total_actual_bits = p_rc->total_actual_bits; double rolling_arf_group_actual_bits = (double)twopass->rolling_arf_group_actual_bits; double rolling_arf_group_target_bits = (double)twopass->rolling_arf_group_target_bits; #if CONFIG_FPMT_TEST const int is_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0; const int simulate_parallel_frame = cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? is_parallel_frame : 0; total_actual_bits = simulate_parallel_frame ? 
p_rc->temp_total_actual_bits : p_rc->total_actual_bits; rolling_arf_group_target_bits = (double)(simulate_parallel_frame ? p_rc->temp_rolling_arf_group_target_bits : twopass->rolling_arf_group_target_bits); rolling_arf_group_actual_bits = (double)(simulate_parallel_frame ? p_rc->temp_rolling_arf_group_actual_bits : twopass->rolling_arf_group_actual_bits); err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate : p_rc->rate_error_estimate; #endif if ((p_rc->bits_off_target && total_actual_bits > 0) && (rolling_arf_group_target_bits >= 1.0)) { if (rolling_arf_group_actual_bits > rolling_arf_group_target_bits) { double error_fraction = (rolling_arf_group_actual_bits - rolling_arf_group_target_bits) / rolling_arf_group_target_bits; error_fraction = (error_fraction > 1.0) ? 1.0 : error_fraction; rate_err_factor = 1.0 + error_fraction; } else { double error_fraction = (rolling_arf_group_target_bits - rolling_arf_group_actual_bits) / rolling_arf_group_target_bits; rate_err_factor = 1.0 - error_fraction; } rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); } // Is the rate control trending in the right direction. Only make // an adjustment if things are getting worse. if ((rate_err_factor < 1.0 && err_estimate >= 0) || (rate_err_factor > 1.0 && err_estimate <= 0)) { twopass->bpm_factor *= rate_err_factor; twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); } } static const double q_div_term[(QINDEX_RANGE >> 4) + 1] = { 18.0, 30.0, 38.0, 44.0, 47.0, 50.0, 52.0, 54.0, 56.0, 58.0, 60.0, 62.0, 64.0, 66.0, 68.0, 70.0, 72.0 }; #define EPMB_SCALER 1250000 static double calc_correction_factor(double err_per_mb, int q) { double power_term = 0.90; const int index = q >> 4; const double divisor = q_div_term[index] + (((q_div_term[index + 1] - q_div_term[index]) * (q % 16)) / 16.0); double error_term = EPMB_SCALER * pow(err_per_mb, power_term); return error_term / divisor; } // Similar to find_qindex_by_rate() function in ratectrl.c, but includes // calculation of a correction_factor. static int find_qindex_by_rate_with_correction(uint64_t desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, double group_weight_factor, int best_qindex, int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; const double q_factor = calc_correction_factor(error_per_mb, mid); const double q = av1_convert_qindex_to_q(mid, bit_depth); const uint64_t mid_bits_per_mb = (uint64_t)((q_factor * group_weight_factor) / q); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; } else { high = mid; } } return low; } /*!\brief Choose a target maximum Q for a group of frames * * \ingroup rate_control * * This function is used to estimate a suitable maximum Q for a * group of frames. Inititally it is called to get a crude estimate * for the whole clip. It is then called for each ARF/GF group to get * a revised estimate for that group. * * \param[in] cpi Top-level encoder structure * \param[in] av_frame_err The average per frame coded error score * for frames making up this section/group. * \param[in] inactive_zone Used to mask off /ignore part of the * frame. The most common use case is where * a wide format video (e.g. 16:9) is * letter-boxed into a more square format. * Here we want to ignore the bands at the * top and bottom. * \param[in] av_target_bandwidth The target bits per frame * * \return The maximum Q for frames in the group. 
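 *
 * \note The search itself is a bisection over [best_quality, worst_quality]
 *       (see find_qindex_by_rate_with_correction()): it returns the lowest
 *       Q index whose predicted bits per macroblock, derived from the
 *       average per-MB error via calc_correction_factor() and the group
 *       weight factor, does not exceed the per-MB bit target.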
*/ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, double inactive_zone, int av_target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; inactive_zone = fclamp(inactive_zone, 0.0, 0.9999); if (av_target_bandwidth <= 0) { return rc->worst_quality; // Highest value allowed } else { const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.mi_params.MBs; const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone); const uint64_t target_norm_bits_per_mb = ((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); const double size_factor = (active_mbs < 500) ? 0.925 : ((active_mbs > 3000) ? 1.05 : 1.0); const double speed_factor = AOMMIN(1.02, (0.975 + (0.005 * cpi->oxcf.speed))); // Update bpm correction factor based on previous GOP rate error. twopass_update_bpm_factor(cpi, rate_err_tol); // Try and pick a max Q that will be high enough to encode the // content at the given rate. int q = find_qindex_by_rate_with_correction( target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, av_err_per_mb, cpi->ppi->twopass.bpm_factor * speed_factor * size_factor, rc->best_quality, rc->worst_quality); // Restriction on active max q for constrained quality mode. if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); return q; } } #define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.01 #define NCOUNT_FRAME_II_THRESH 5.0 #define LOW_CODED_ERR_PER_MB 0.01 /* This function considers how the quality of prediction may be deteriorating * with distance. It comapres the coded error for the last frame and the * second reference frame (usually two frames old) and also applies a factor * based on the extent of INTRA coding. * * The decay factor is then used to reduce the contribution of frames further * from the alt-ref or golden frame, to the bitframe boost calculation for that * alt-ref or golden frame. */ static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) { double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; modified_pct_inter = frame->pcnt_inter; if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < (double)NCOUNT_FRAME_II_THRESH)) { modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; } modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); if ((sr_diff > LOW_SR_DIFF_TRHESH)) { double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error); sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. 
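// It returns the smaller of the second-reference decay rate
// (get_sr_decay_rate()) and the fraction of inter blocks with zero motion
// (pcnt_inter - pcnt_motion), so a frame only counts as "still" when both
// measures agree.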
static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) { const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; double sr_decay = get_sr_decay_rate(frame); return AOMMIN(sr_decay, zero_motion_pct); } #define DEFAULT_ZM_FACTOR 0.5 static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) { const double sr_decay_rate = get_sr_decay_rate(frame_stats); double zero_motion_factor = DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); // Clamp value to range 0.0 to 1.0 // This should happen anyway if input values are sensibly clamped but checked // here just in case. if (zero_motion_factor > 1.0) zero_motion_factor = 1.0; else if (zero_motion_factor < 0.0) zero_motion_factor = 0.0; return AOMMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); } // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info, int next_stats_index, const int min_gf_interval, const int frame_interval, const int still_interval, const double loop_decay_rate, const double last_decay_rate) { // Break clause to detect very still sections after motion // For example a static image after a fade or other transition // instead of a clean scene cut. if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 && last_decay_rate < 0.9) { int stats_left = av1_firstpass_info_future_count(firstpass_info, next_stats_index); if (stats_left >= still_interval) { int j; // Look ahead a few frames to see if static condition persists... for (j = 0; j < still_interval; ++j) { const FIRSTPASS_STATS *stats = av1_firstpass_info_peek(firstpass_info, next_stats_index + j); if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; } // Only if it does do we signal a transition to still. return j == still_interval; } } return 0; } // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should // reflect this. static int detect_flash(const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const int offset) { const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, twopass_frame, offset); // What we are looking for here is a situation where there is a // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref // compared to pcnt_inter. return next_frame != NULL && next_frame->pcnt_second_ref > next_frame->pcnt_inter && next_frame->pcnt_second_ref >= 0.5; } // Update the motion related elements to the GF arf boost calculation. static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, GF_GROUP_STATS *gf_stats, double f_w, double f_h) { const double pct = stats->pcnt_motion; // Accumulate Motion In/Out of frame stats. gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct; gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out; gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out); // Accumulate a measure of how uniform (or conversely how random) the motion // field is (a ratio of abs(mv) / mv). 
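// For illustration: mvr_ratio is the mean absolute row MV divided by the
// magnitude of the mean row MV (and likewise for columns). When every block
// moves by roughly the same vector the two agree and the ratio is close to
// 1.0; when motion directions largely cancel, the mean MV approaches zero
// and the ratio becomes large, so each contribution is capped at
// mvr_abs * f_h (resp. mvc_abs * f_w) before being weighted by the motion
// percentage.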
if (pct > 0.05) { const double mvr_ratio = fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); const double mvc_ratio = fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); gf_stats->mv_ratio_accumulator += pct * (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h); gf_stats->mv_ratio_accumulator += pct * (mvc_ratio < stats->mvc_abs * f_w ? mvc_ratio : stats->mvc_abs * f_w); } } static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, const double mod_frame_err, GF_GROUP_STATS *gf_stats) { gf_stats->gf_group_err += mod_frame_err; #if GROUP_ADAPTIVE_MAXQ gf_stats->gf_group_raw_error += stats->coded_error; #endif gf_stats->gf_group_skip_pct += stats->intra_skip_pct; gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows; } static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, const int flash_detected, const int frames_since_key, const int cur_idx, GF_GROUP_STATS *gf_stats, int f_w, int f_h) { accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h); // sum up the metric values of current gf group gf_stats->avg_sr_coded_error += stats->sr_coded_error; gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref; gf_stats->avg_new_mv_count += stats->new_mv_count; gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy; if (fabs(stats->raw_error_stdev) > 0.000001) { gf_stats->non_zero_stdev_count++; gf_stats->avg_raw_err_stdev += stats->raw_error_stdev; } // Accumulate the effect of prediction quality decay if (!flash_detected) { gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; gf_stats->loop_decay_rate = get_prediction_decay_rate(stats); gf_stats->decay_accumulator = gf_stats->decay_accumulator * gf_stats->loop_decay_rate; // Monitor for static sections. if ((frames_since_key + cur_idx - 1) > 1) { gf_stats->zero_motion_accumulator = AOMMIN( gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats)); } } } static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) { if (total_frame) { gf_stats->avg_sr_coded_error /= total_frame; gf_stats->avg_pcnt_second_ref /= total_frame; gf_stats->avg_new_mv_count /= total_frame; gf_stats->avg_wavelet_energy /= total_frame; } if (gf_stats->non_zero_stdev_count) gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count; } #define BOOST_FACTOR 12.5 static double baseline_err_per_mb(const FRAME_INFO *frame_info) { unsigned int screen_area = frame_info->frame_height * frame_info->frame_width; // Use a different error per mb factor for calculating boost for // different formats. if (screen_area <= 640 * 360) { return 500.0; } else { return 1000.0; } } static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc, const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, double this_frame_mv_in_out, double max_boost) { double frame_boost; const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME], frame_info->bit_depth); const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area, this_frame->intra_error * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; // Increase boost for frames where new data coming into frame (e.g. zoom out). // Slightly reduce boost if there is a net balance of motion out of the frame // (zoom in). 
The range for this_frame_mv_in_out is -1.0 to +1.0. if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); // In the extreme case the boost is halved. else frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); return AOMMIN(frame_boost, max_boost * boost_q_correction); } static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc, const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, double max_boost) { double frame_boost; const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME], frame_info->bit_depth); const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00); const double active_area = calculate_active_area(frame_info, this_frame); // Underlying boost factor is based on inter error ratio. frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area, this_frame->intra_error * active_area) / DOUBLE_DIVIDE_CHECK( (this_frame->coded_error + *sr_accumulator) * active_area); // Update the accumulator for second ref error difference. // This is intended to give an indication of how much the coded error is // increasing over time. *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); *sr_accumulator = AOMMAX(0.0, *sr_accumulator); // Q correction and scaling // The 40.0 value here is an experimentally derived baseline minimum. // This value is in line with the minimum per frame boost in the alt_ref // boost calculation. frame_boost = ((frame_boost + 40.0) * boost_q_correction); return AOMMIN(frame_boost, max_boost * boost_q_correction); } static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc, int gfu_boost, int frames_to_project, int num_stats_used_for_gfu_boost) { /* * If frames_to_project is equal to num_stats_used_for_gfu_boost, * it means that gfu_boost was calculated over frames_to_project to * begin with(ie; all stats required were available), hence return * the original boost. */ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost; double min_boost_factor = sqrt(p_rc->baseline_gf_interval); // Get the current tpl factor (number of frames = frames_to_project). double tpl_factor = av1_get_gfu_boost_projection_factor( min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project); // Get the tpl factor when number of frames = num_stats_used_for_prior_boost. double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor( min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost); int projected_gfu_boost = (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats); return projected_gfu_boost; } #define GF_MAX_BOOST 90.0 #define GF_MIN_BOOST 50 #define MIN_DECAY_FACTOR 0.01 int av1_calc_arf_boost(const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, int *num_fpstats_required, int project_gfu_boost) { int i; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); double boost_score = (double)NORMAL_BOOST; int arf_boost; int flash_detected = 0; if (num_fpstats_used) *num_fpstats_used = 0; // Search forward from the proposed arf/next gf position. for (i = 0; i < f_frames; ++i) { const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, twopass_frame, i + offset); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. 
accumulate_frame_motion_stats(this_frame, &gf_stats, frame_info->frame_width, frame_info->frame_height); // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash(twopass, twopass_frame, i + offset) || detect_flash(twopass, twopass_frame, i + offset + 1); // Accumulate the effect of prediction quality decay. if (!flash_detected) { gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : gf_stats.decay_accumulator; } boost_score += gf_stats.decay_accumulator * calc_frame_boost(p_rc, frame_info, this_frame, gf_stats.this_frame_mv_in_out, GF_MAX_BOOST); if (num_fpstats_used) (*num_fpstats_used)++; } arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; init_gf_stats(&gf_stats); // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, twopass_frame, i + offset); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. accumulate_frame_motion_stats(this_frame, &gf_stats, frame_info->frame_width, frame_info->frame_height); // We want to discount the the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash(twopass, twopass_frame, i + offset) || detect_flash(twopass, twopass_frame, i + offset + 1); // Cumulative effect of prediction quality decay. if (!flash_detected) { gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : gf_stats.decay_accumulator; } boost_score += gf_stats.decay_accumulator * calc_frame_boost(p_rc, frame_info, this_frame, gf_stats.this_frame_mv_in_out, GF_MAX_BOOST); if (num_fpstats_used) (*num_fpstats_used)++; } arf_boost += (int)boost_score; if (project_gfu_boost) { assert(num_fpstats_required != NULL); assert(num_fpstats_used != NULL); *num_fpstats_required = f_frames + b_frames; arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required, *num_fpstats_used); } if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST)) arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST); return arf_boost; } // Calculate a section intra ratio used in setting max loop filter. static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, const FIRSTPASS_STATS *end, int section_length) { const FIRSTPASS_STATS *s = begin; double intra_error = 0.0; double coded_error = 0.0; int i = 0; while (s < end && i < section_length) { intra_error += s->intra_error; coded_error += s->coded_error; ++s; ++i; } return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); } /*!\brief Calculates the bit target for this GF/ARF group * * \ingroup rate_control * * Calculates the total bits to allocate in this GF/ARF group. * * \param[in] cpi Top-level encoder structure * \param[in] gf_group_err Cumulative coded error score for the * frames making up this group. * * \return The target total number of bits for this GF/ARF group. 
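 *
 * \note The group receives a share of the remaining key-frame-group budget
 *       proportional to its share of the remaining error:
 *       kf_group_bits * (gf_group_err / kf_group_error_left). The result is
 *       clamped to [0, kf_group_bits] and further capped by the per-frame
 *       maximum (frame_max_bits()) times the baseline GF interval.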
*/ static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, double gf_group_err) { const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const TWO_PASS *const twopass = &cpi->ppi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; // Calculate the bits to be allocated to the group as a whole. if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { total_group_bits = (int64_t)(twopass->kf_group_bits * (gf_group_err / twopass->kf_group_error_left)); } else { total_group_bits = 0; } // Clamp odd edge cases. total_group_bits = (total_group_bits < 0) ? 0 : (total_group_bits > twopass->kf_group_bits) ? twopass->kf_group_bits : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval) total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval; return total_group_bits; } // Calculate the number of bits to assign to boosted frames in a group. static int calculate_boost_bits(int frame_count, int boost, int64_t total_group_bits) { int allocation_chunks; // return 0 for invalid inputs (could arise e.g. through rounding errors) if (!boost || (total_group_bits <= 0)) return 0; if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX)); allocation_chunks = (frame_count * 100) + boost; // Prevent overflow. if (boost > 1023) { int divisor = boost >> 10; boost /= divisor; allocation_chunks /= divisor; } // Calculate the number of extra bits for use in the boosted frame or frames. return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0); } // Calculate the boost factor based on the number of bits assigned, i.e. the // inverse of calculate_boost_bits(). static int calculate_boost_factor(int frame_count, int bits, int64_t total_group_bits) { return (int)(100.0 * frame_count * bits / (total_group_bits - bits)); } // Reduce the number of bits assigned to keyframe or arf if necessary, to // prevent bitrate spikes that may break level constraints. // frame_type: 0: keyframe; 1: arf. static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, RATE_CONTROL *const rc, int bits_assigned, int64_t group_bits, int frame_type) { const AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; ++index) { if (!is_in_operating_point(seq_params->operating_point_idc[index], temporal_layer_id, spatial_layer_id)) { continue; } const AV1_LEVEL target_level = cpi->ppi->level_params.target_seq_level_idx[index]; if (target_level >= SEQ_LEVELS) continue; assert(is_valid_seq_level_idx(target_level)); const double level_bitrate_limit = av1_get_max_bitrate_for_level( target_level, seq_params->tier[0], seq_params->profile); const int target_bits_per_frame = (int)(level_bitrate_limit / cpi->framerate); if (frame_type == 0) { // Maximum bits for keyframe is 8 times the target_bits_per_frame. 
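      // Illustrative numbers (assumed): a level limit of 8 Mbps at 30 fps
      // gives target_bits_per_frame ~= 266,667, so the keyframe cap below is
      // ~= 2,133,333 bits. If the initial allocation exceeds this, kf_boost
      // is recomputed with calculate_boost_factor() and the keyframe bits
      // with calculate_boost_bits() so the boost and allocation stay
      // consistent.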
const int level_enforced_max_kf_bits = target_bits_per_frame * 8; if (bits_assigned > level_enforced_max_kf_bits) { const int frames = rc->frames_to_key - 1; p_rc->kf_boost = calculate_boost_factor( frames, level_enforced_max_kf_bits, group_bits); bits_assigned = calculate_boost_bits(frames, p_rc->kf_boost, group_bits); } } else if (frame_type == 1) { // Maximum bits for arf is 4 times the target_bits_per_frame. const int level_enforced_max_arf_bits = target_bits_per_frame * 4; if (bits_assigned > level_enforced_max_arf_bits) { p_rc->gfu_boost = calculate_boost_factor(p_rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits); bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval, p_rc->gfu_boost, group_bits); } } else { assert(0); } } return bits_assigned; } // Allocate bits to each frame in a GF / ARF group static void allocate_gf_group_bits(GF_GROUP *gf_group, PRIMARY_RATE_CONTROL *const p_rc, RATE_CONTROL *const rc, int64_t gf_group_bits, int gf_arf_bits, int key_frame, int use_arf) { static const double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, 0.60, 1.0, 1.0 }; int64_t total_group_bits = gf_group_bits; int base_frame_bits; const int gf_group_size = gf_group->size; int layer_frames[MAX_ARF_LAYERS + 1] = { 0 }; // For key frames the frame target rate is already set and it // is also the golden frame. // === [frame_index == 0] === int frame_index = !!key_frame; // Subtract the extra bits set aside for ARF frames from the Group Total if (use_arf) total_group_bits -= gf_arf_bits; int num_frames = AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0)); base_frame_bits = (int)(total_group_bits / num_frames); // Check the number of frames in each layer in case we have a // non standard group length. int max_arf_layer = gf_group->max_layer_depth - 1; for (int idx = frame_index; idx < gf_group_size; ++idx) { if ((gf_group->update_type[idx] == ARF_UPDATE) || (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) { layer_frames[gf_group->layer_depth[idx]]++; } } // Allocate extra bits to each ARF layer int i; int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 }; assert(max_arf_layer <= MAX_ARF_LAYERS); for (i = 1; i <= max_arf_layer; ++i) { double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i]; layer_extra_bits[i] = (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i])); gf_arf_bits -= (int)(gf_arf_bits * fraction); } // Now combine ARF layer and baseline bits to give total bits for each frame. int arf_extra_bits; for (int idx = frame_index; idx < gf_group_size; ++idx) { switch (gf_group->update_type[idx]) { case ARF_UPDATE: case INTNL_ARF_UPDATE: arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]]; gf_group->bit_allocation[idx] = (base_frame_bits > INT_MAX - arf_extra_bits) ? INT_MAX : (base_frame_bits + arf_extra_bits); break; case INTNL_OVERLAY_UPDATE: case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break; default: gf_group->bit_allocation[idx] = base_frame_bits; break; } } // Set the frame following the current GOP to 0 bit allocation. For ARF // groups, this next frame will be overlay frame, which is the first frame // in the next GOP. For GF group, next GOP will overwrite the rate allocation. // Setting this frame to use 0 bit (of out the current GOP budget) will // simplify logics in reference frame management. if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH) gf_group->bit_allocation[gf_group_size] = 0; } // Returns true if KF group and GF group both are almost completely static. 
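// For example (thresholds taken from the function below, numbers assumed):
// with LAP enabled a group whose gf_zero_motion accumulator is 0.9995 counts
// as almost static regardless of kf_zero_motion, while without LAP both the
// GF zero-motion figure (>= 0.995) and the KF zero-motion percentage must be
// high.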
static inline int is_almost_static(double gf_zero_motion, int kf_zero_motion, int is_lap_enabled) { if (is_lap_enabled) { /* * when LAP enabled kf_zero_motion is not reliable, so use strict * constraint on gf_zero_motion. */ return (gf_zero_motion >= 0.999); } else { return (gf_zero_motion >= 0.995) && (kf_zero_motion >= STATIC_KF_GROUP_THRESH); } } #define ARF_ABS_ZOOM_THRESH 4.4 static inline int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, int flash_detected, int active_max_gf_interval, int active_min_gf_interval, GF_GROUP_STATS *gf_stats) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; AV1_COMMON *const cm = &cpi->common; // Motion breakout threshold for loop below depends on image size. const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0; if (!flash_detected) { // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. // TODO(angiebird): This is a temporary change, we will avoid using // twopass_frame.stats_in in the follow-up CL int index = (int)(cpi->twopass_frame.stats_in - twopass->stats_buf_ctx->stats_in_start); if (detect_transition_to_still(&twopass->firstpass_info, index, rc->min_gf_interval, frame_index - cur_start, 5, gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) { return 1; } } // Some conditions to breakout after min interval. if (frame_index - cur_start >= active_min_gf_interval && // If possible don't break very close to a kf (rc->frames_to_key - frame_index >= rc->min_gf_interval) && ((frame_index - cur_start) & 0x01) && !flash_detected && (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh || gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) { return 1; } // If almost totally static, we will not use the the max GF length later, // so we can continue for more frames. if (((frame_index - cur_start) >= active_max_gf_interval + 1) && !is_almost_static(gf_stats->zero_motion_accumulator, twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) { return 1; } return 0; } static int is_shorter_gf_interval_better( AV1_COMP *cpi, const EncodeFrameParams *frame_params) { const RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method; int shorten_gf_interval; av1_tpl_preload_rc_estimate(cpi, frame_params); if (gop_length_decision_method == 2) { // GF group length is decided based on GF boost and tpl stats of ARFs from // base layer, (base+1) layer. shorten_gf_interval = (p_rc->gfu_boost < p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) && !av1_tpl_setup_stats(cpi, 3, frame_params); } else { int do_complete_tpl = 1; GF_GROUP *const gf_group = &cpi->ppi->gf_group; int is_temporal_filter_enabled = (rc->frames_since_key > 0 && gf_group->arf_index > -1); if (gop_length_decision_method == 1) { // Check if tpl stats of ARFs from base layer, (base+1) layer, // (base+2) layer can decide the GF group length. int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params); if (gop_length_eval != 2) { do_complete_tpl = 0; shorten_gf_interval = !gop_length_eval; } } if (do_complete_tpl) { // Decide GF group length based on complete tpl stats. shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params); // Tpl stats is reused when the ARF is temporally filtered and GF // interval is not shortened. 
if (is_temporal_filter_enabled && !shorten_gf_interval) { cpi->skip_tpl_setup_stats = 1; #if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS assert(cpi->gf_frame_index == 0); av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data, gf_group, cpi->common.seq_params->bit_depth); #endif // CONFIG_BITRATE_ACCURACY } } } return shorten_gf_interval; } #define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking #define SMOOTH_FILT_LEN 7 #define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2) #define WINDOW_SIZE 7 #define HALF_WIN (WINDOW_SIZE / 2) // Smooth filter intra_error and coded_error in firstpass stats. // If stats[i].is_flash==1, the ith element should not be used in the filtering. static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx, int last_idx, double *filt_intra_err, double *filt_coded_err) { // A 7-tap gaussian smooth filter static const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383, 0.242, 0.061, 0.006 }; int i, j; for (i = start_idx; i <= last_idx; i++) { double total_wt = 0; for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); if (stats[idx].is_flash) continue; filt_intra_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error; total_wt += smooth_filt[j + HALF_FILT_LEN]; } if (total_wt > 0.01) { filt_intra_err[i] /= total_wt; } else { filt_intra_err[i] = stats[i].intra_error; } } for (i = start_idx; i <= last_idx; i++) { double total_wt = 0; for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); // Coded error involves idx and idx - 1. if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; filt_coded_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error; total_wt += smooth_filt[j + HALF_FILT_LEN]; } if (total_wt > 0.01) { filt_coded_err[i] /= total_wt; } else { filt_coded_err[i] = stats[i].coded_error; } } } // Calculate gradient static void get_gradient(const double *values, int start, int last, double *grad) { if (start == last) { grad[start] = 0; return; } for (int i = start; i <= last; i++) { int prev = AOMMAX(i - 1, start); int next = AOMMIN(i + 1, last); grad[i] = (values[next] - values[prev]) / (next - prev); } } static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, int first, int last) { // Identify unstable areas caused by scenecuts. // Find the max and 2nd max coded error, and the average of the rest frames. // If there is only one frame that yields a huge coded error, it is likely a // scenecut. 
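  // Illustrative sketch (assumed numbers): if coded_error / intra_error is
  // roughly 0.9 for frame i but no more than ~0.2 anywhere in the +/-HALF_WIN
  // neighborhood, and the second-reference error does not explain the jump
  // (max_sr_fr_ratio <= 1.2), frame i is reported as the next scenecut.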
double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded, max_next_coded; if (last - first == 0) return -1; for (int i = first; i <= last; i++) { if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) continue; double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01); this_ratio = stats_start[i].coded_error / temp_intra; // find the avg ratio in the preceding neighborhood max_prev_ratio = 0; max_prev_coded = 0; for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) { if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_prev_ratio) { max_prev_ratio = temp_ratio; } if (stats_start[j].coded_error > max_prev_coded) { max_prev_coded = stats_start[j].coded_error; } } // find the avg ratio in the following neighborhood max_next_ratio = 0; max_next_coded = 0; for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) { if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_next_ratio) { max_next_ratio = temp_ratio; } if (stats_start[j].coded_error > max_next_coded) { max_next_coded = stats_start[j].coded_error; } } if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) { // the ratios are very small, only check a small fixed threshold if (this_ratio < 0.02) continue; } else { // check if this frame has a larger ratio than the neighborhood double max_sr = stats_start[i].sr_coded_error; if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error); double max_sr_fr_ratio = max_sr / AOMMAX(stats_start[i].coded_error, 0.01); if (max_sr_fr_ratio > 1.2) continue; if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) && stats_start[i].coded_error < 2 * AOMMAX(max_prev_coded, max_next_coded)) { continue; } } return i; } return -1; } // Remove the region with index next_region. // parameter merge: 0: merge with previous; 1: merge with next; 2: // merge with both, take type from previous if possible // After removing, next_region will be the index of the next region. static void remove_region(int merge, REGIONS *regions, int *num_regions, int *next_region) { int k = *next_region; assert(k < *num_regions); if (*num_regions == 1) { *num_regions = 0; return; } if (k == 0) { merge = 1; } else if (k == *num_regions - 1) { merge = 0; } int num_merge = (merge == 2) ? 2 : 1; switch (merge) { case 0: regions[k - 1].last = regions[k].last; *next_region = k; break; case 1: regions[k + 1].start = regions[k].start; *next_region = k + 1; break; case 2: regions[k - 1].last = regions[k + 1].last; *next_region = k; break; default: assert(0); } *num_regions -= num_merge; for (k = *next_region - (merge == 1); k < *num_regions; k++) { regions[k] = regions[k + num_merge]; } } // Insert a region in the cur_region_idx. The start and last should both be in // the current region. After insertion, the cur_region_idx will point to the // last region that was splitted from the original region. 
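// For example (illustrative): inserting a BLENDING_REGION over [15, 20] into
// a single region covering [10, 30] produces [10, 14], [15, 20] and [21, 30];
// cur_region_idx is left pointing at the trailing [21, 30] piece.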
static void insert_region(int start, int last, REGION_TYPES type, REGIONS *regions, int *num_regions, int *cur_region_idx) { int k = *cur_region_idx; REGION_TYPES this_region_type = regions[k].type; int this_region_last = regions[k].last; int num_add = (start != regions[k].start) + (last != regions[k].last); // move the following regions further to the back for (int r = *num_regions - 1; r > k; r--) { regions[r + num_add] = regions[r]; } *num_regions += num_add; if (start > regions[k].start) { regions[k].last = start - 1; k++; regions[k].start = start; } regions[k].type = type; if (last < this_region_last) { regions[k].last = last; k++; regions[k].start = last + 1; regions[k].last = this_region_last; regions[k].type = this_region_type; } else { regions[k].last = this_region_last; } *cur_region_idx = k; } // Get the average of stats inside a region. static void analyze_region(const FIRSTPASS_STATS *stats, int k, REGIONS *regions) { int i; regions[k].avg_cor_coeff = 0; regions[k].avg_sr_fr_ratio = 0; regions[k].avg_intra_err = 0; regions[k].avg_coded_err = 0; int check_first_sr = (k != 0); for (i = regions[k].start; i <= regions[k].last; i++) { if (i > regions[k].start || check_first_sr) { double num_frames = (double)(regions[k].last - regions[k].start + check_first_sr); double max_coded_error = AOMMAX(stats[i].coded_error, stats[i - 1].coded_error); double this_ratio = stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001); regions[k].avg_sr_fr_ratio += this_ratio / num_frames; } regions[k].avg_intra_err += stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1); regions[k].avg_coded_err += stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1); regions[k].avg_cor_coeff += AOMMAX(stats[i].cor_coeff, 0.001) / (double)(regions[k].last - regions[k].start + 1); regions[k].avg_noise_var += AOMMAX(stats[i].noise_var, 0.001) / (double)(regions[k].last - regions[k].start + 1); } } // Calculate the regions stats of every region. static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions, int num_regions) { for (int k = 0; k < num_regions; k++) { analyze_region(stats, k, regions); } } // Find tentative stable regions static int find_stable_regions(const FIRSTPASS_STATS *stats, const double *grad_coded, int this_start, int this_last, REGIONS *regions) { int i, j, k = 0; regions[k].start = this_start; for (i = this_start; i <= this_last; i++) { // Check mean and variance of stats in a window double mean_intra = 0.001, var_intra = 0.001; double mean_coded = 0.001, var_coded = 0.001; int count = 0; for (j = -HALF_WIN; j <= HALF_WIN; j++) { int idx = AOMMIN(AOMMAX(i + j, this_start), this_last); if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; mean_intra += stats[idx].intra_error; var_intra += stats[idx].intra_error * stats[idx].intra_error; mean_coded += stats[idx].coded_error; var_coded += stats[idx].coded_error * stats[idx].coded_error; count++; } REGION_TYPES cur_type; if (count > 0) { mean_intra /= (double)count; var_intra /= (double)count; mean_coded /= (double)count; var_coded /= (double)count; int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03); int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 && fabs(grad_coded[i]) / mean_coded < 0.05) || mean_coded / mean_intra < 0.05; int is_coded_small = mean_coded < 0.5 * mean_intra; cur_type = (is_intra_stable && is_coded_stable && is_coded_small) ? 
STABLE_REGION : HIGH_VAR_REGION; } else { cur_type = HIGH_VAR_REGION; } // mark a new region if type changes if (i == regions[k].start) { // first frame in the region regions[k].type = cur_type; } else if (cur_type != regions[k].type) { // Append a new region regions[k].last = i - 1; regions[k + 1].start = i; regions[k + 1].type = cur_type; k++; } } regions[k].last = this_last; return k + 1; } // Clean up regions that should be removed or merged. static void cleanup_regions(REGIONS *regions, int *num_regions) { int k = 0; while (k < *num_regions) { if ((k > 0 && regions[k - 1].type == regions[k].type && regions[k].type != SCENECUT_REGION) || regions[k].last < regions[k].start) { remove_region(0, regions, num_regions, &k); } else { k++; } } } // Remove regions that are of type and shorter than length. // Merge it with its neighboring regions. static void remove_short_regions(REGIONS *regions, int *num_regions, REGION_TYPES type, int length) { int k = 0; while (k < *num_regions && (*num_regions) > 1) { if ((regions[k].last - regions[k].start + 1 < length && regions[k].type == type)) { // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); } else { k++; } } cleanup_regions(regions, num_regions); } static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, REGIONS *regions, int *num_regions) { int i, j, k; // Remove regions that are too short. Likely noise. remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); get_region_stats(stats, regions, *num_regions); // Adjust region boundaries. The thresholds are empirically obtained, but // overall the performance is not very sensitive to small changes to them. for (k = 0; k < *num_regions; k++) { if (regions[k].type == STABLE_REGION) continue; if (k > 0) { // Adjust previous boundary. // First find the average intra/coded error in the previous // neighborhood. double avg_intra_err = 0; const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, regions[k - 1].start + 1); const int lasti = regions[k - 1].last; int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); int count_coded = 0, count_grad = 0; for (j = lasti + 1; j <= regions[k].last; j++) { const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; const int coded_small = stats[j].coded_error / avg_intra_err < 0.1; const int coeff_close = stats[j].cor_coeff > 0.995; if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the previous stable region regions[k - 1].last = j; regions[k].start = j + 1; } else { break; } } } } // if k > 0 if (k < *num_regions - 1) { // Adjust next boundary. // First find the average intra/coded error in the next neighborhood. 
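      // (Rough reading of the thresholds used below: a frame whose
      // intra_error is within 10% of the neighborhood average, whose
      // coded_error is under 10% of that average and whose cor_coeff exceeds
      // 0.995 is pulled into the adjacent stable region.)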
double avg_intra_err = 0; const int starti = regions[k + 1].start; const int lasti = AOMMIN(regions[k + 1].last - 1, regions[k + 1].start + WINDOW_SIZE - 1); int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); // At the boundary, coded error is large, but still the frame is stable int count_coded = 1, count_grad = 1; for (j = starti - 1; j >= regions[k].start; j--) { const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; const int coded_small = stats[j + 1].coded_error / avg_intra_err < 0.1; const int coeff_close = stats[j].cor_coeff > 0.995; if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the next stable region regions[k + 1].start = j; regions[k].last = j - 1; } else { break; } } } } // if k < *num_regions - 1 } // end of loop over all regions cleanup_regions(regions, num_regions); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); get_region_stats(stats, regions, *num_regions); // If a stable regions has higher error than neighboring high var regions, // or if the stable region has a lower average correlation, // then it should be merged with them k = 0; while (k < *num_regions && (*num_regions) > 1) { if (regions[k].type == STABLE_REGION && (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 || regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) && (k < *num_regions - 1 && // next region (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 || regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) { // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); analyze_region(stats, k - 1, regions); } else if (regions[k].type == HIGH_VAR_REGION && (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions (regions[k].avg_coded_err < regions[k - 1].avg_coded_err * 0.99 || regions[k].avg_cor_coeff > regions[k - 1].avg_cor_coeff * 1.001)) && (k < *num_regions - 1 && // next region (regions[k].avg_coded_err < regions[k + 1].avg_coded_err * 0.99 || regions[k].avg_cor_coeff > regions[k + 1].avg_cor_coeff * 1.001)))) { // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); analyze_region(stats, k - 1, regions); } else { k++; } } remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); } // Identify blending regions. static void find_blending_regions(const FIRSTPASS_STATS *stats, REGIONS *regions, int *num_regions) { int i, k = 0; // Blending regions will have large content change, therefore will have a // large consistent change in intra error. int count_stable = 0; while (k < *num_regions) { if (regions[k].type == STABLE_REGION) { k++; count_stable++; continue; } int dir = 0; int start = 0, last; for (i = regions[k].start; i <= regions[k].last; i++) { // First mark the regions that has consistent large change of intra error. 
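      // ("Consistent large change" here means the frame-to-frame intra_error
      // delta exceeds ~5% of the current intra_error with the same sign over
      // a run of frames, the signature of a cross-fade / blend; e.g. a steady
      // rise of a few percent per frame over several frames would be marked.)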
if (k == 0 && i == regions[k].start) continue; if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue; double grad = stats[i].intra_error - stats[i - 1].intra_error; int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05; int this_dir = 0; if (large_change) { this_dir = (grad > 0) ? 1 : -1; } // the current trend continues if (dir == this_dir) continue; if (dir != 0) { // Mark the end of a new large change group and add it last = i - 1; insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); } dir = this_dir; if (k == 0 && i == regions[k].start + 1) { start = i - 1; } else { start = i; } } if (dir != 0) { last = regions[k].last; insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); } k++; } // If the blending region has very low correlation, mark it as high variance // since we probably cannot benefit from it anyways. get_region_stats(stats, regions, *num_regions); for (k = 0; k < *num_regions; k++) { if (regions[k].type != BLENDING_REGION) continue; if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 || count_stable == 0) regions[k].type = HIGH_VAR_REGION; } get_region_stats(stats, regions, *num_regions); // It is possible for blending to result in a "dip" in intra error (first // decrease then increase). Therefore we need to find the dip and combine the // two regions. k = 1; while (k < *num_regions) { if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) { // Check if this short high variance regions is actually in the middle of // a blending region. if (regions[k - 1].type == BLENDING_REGION && regions[k + 1].type == BLENDING_REGION && regions[k].last - regions[k].start < 3) { int prev_dir = (stats[regions[k - 1].last].intra_error - stats[regions[k - 1].last - 1].intra_error) > 0 ? 1 : -1; int next_dir = (stats[regions[k + 1].last].intra_error - stats[regions[k + 1].last - 1].intra_error) > 0 ? 1 : -1; if (prev_dir < 0 && next_dir > 0) { // This is possibly a mid region of blending. Check the ratios double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio, regions[k + 1].avg_sr_fr_ratio) * 0.95; if (regions[k].avg_sr_fr_ratio > ratio_thres) { regions[k].type = BLENDING_REGION; remove_region(2, regions, num_regions, &k); analyze_region(stats, k - 1, regions); continue; } } } } // Check if we have a pair of consecutive blending regions. if (regions[k - 1].type == BLENDING_REGION && regions[k].type == BLENDING_REGION) { int prev_dir = (stats[regions[k - 1].last].intra_error - stats[regions[k - 1].last - 1].intra_error) > 0 ? 1 : -1; int next_dir = (stats[regions[k].last].intra_error - stats[regions[k].last - 1].intra_error) > 0 ? 1 : -1; // if both are too short, no need to check int total_length = regions[k].last - regions[k - 1].start + 1; if (total_length < 4) { regions[k - 1].type = HIGH_VAR_REGION; k++; continue; } int to_merge = 0; if (prev_dir < 0 && next_dir > 0) { // In this case we check the last frame in the previous region. 
double prev_length = (double)(regions[k - 1].last - regions[k - 1].start + 1); double last_ratio, ratio_thres; if (prev_length < 2.01) { // if the previous region is very short double max_coded_error = AOMMAX(stats[regions[k - 1].last].coded_error, stats[regions[k - 1].last - 1].coded_error); last_ratio = stats[regions[k - 1].last].sr_coded_error / AOMMAX(max_coded_error, 0.001); ratio_thres = regions[k].avg_sr_fr_ratio * 0.95; } else { double max_coded_error = AOMMAX(stats[regions[k - 1].last].coded_error, stats[regions[k - 1].last - 1].coded_error); last_ratio = stats[regions[k - 1].last].sr_coded_error / AOMMAX(max_coded_error, 0.001); double prev_ratio = (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) / (prev_length - 1.0); ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95; } if (last_ratio > ratio_thres) { to_merge = 1; } } if (to_merge) { remove_region(0, regions, num_regions, &k); analyze_region(stats, k - 1, regions); continue; } else { // These are possibly two separate blending regions. Mark the boundary // frame as HIGH_VAR_REGION to separate the two. int prev_k = k - 1; insert_region(regions[prev_k].last, regions[prev_k].last, HIGH_VAR_REGION, regions, num_regions, &prev_k); analyze_region(stats, prev_k, regions); k = prev_k + 1; analyze_region(stats, k, regions); } } k++; } cleanup_regions(regions, num_regions); } // Clean up decision for blendings. Remove blending regions that are too short. // Also if a very short high var region is between a blending and a stable // region, just merge it with one of them. static void cleanup_blendings(REGIONS *regions, int *num_regions) { int k = 0; while (k<*num_regions && * num_regions> 1) { int is_short_blending = regions[k].type == BLENDING_REGION && regions[k].last - regions[k].start + 1 < 5; int is_short_hv = regions[k].type == HIGH_VAR_REGION && regions[k].last - regions[k].start + 1 < 5; int has_stable_neighbor = ((k > 0 && regions[k - 1].type == STABLE_REGION) || (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION)); int has_blend_neighbor = ((k > 0 && regions[k - 1].type == BLENDING_REGION) || (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION)); int total_neighbors = (k > 0) + (k < *num_regions - 1); if (is_short_blending || (is_short_hv && has_stable_neighbor + has_blend_neighbor >= total_neighbors)) { // Remove this region.Try to determine whether to combine it with the // previous or next region. int merge; double prev_diff = (k > 0) ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff) : 1; double next_diff = (k < *num_regions - 1) ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff) : 1; // merge == 0 means to merge with previous, 1 means to merge with next merge = prev_diff > next_diff; remove_region(merge, regions, num_regions, &k); } else { k++; } } cleanup_regions(regions, num_regions); } static void free_firstpass_stats_buffers(REGIONS *temp_regions, double *filt_intra_err, double *filt_coded_err, double *grad_coded) { aom_free(temp_regions); aom_free(filt_intra_err); aom_free(filt_coded_err); aom_free(grad_coded); } // Identify stable and unstable regions from first pass stats. // stats_start points to the first frame to analyze. // |offset| is the offset from the current frame to the frame stats_start is // pointing to. // Returns 0 on success, -1 on memory allocation failure. 
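// The analysis proceeds per scenecut-delimited segment: obvious scenecuts are
// found first, the intra/coded error stats are low-pass filtered, tentative
// STABLE / HIGH_VAR regions are marked and their bounds adjusted, blending
// (cross-fade) regions are identified, and short or inconsistent regions are
// merged away before the results are copied out with |offset| applied.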
static int identify_regions(const FIRSTPASS_STATS *const stats_start, int total_frames, int offset, REGIONS *regions, int *total_regions) { int k; if (total_frames <= 1) return 0; // store the initial decisions REGIONS *temp_regions = (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0])); // buffers for filtered stats double *filt_intra_err = (double *)aom_calloc(total_frames, sizeof(*filt_intra_err)); double *filt_coded_err = (double *)aom_calloc(total_frames, sizeof(*filt_coded_err)); double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded)); if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) { free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err, grad_coded); return -1; } av1_zero_array(temp_regions, total_frames); int cur_region = 0, this_start = 0, this_last; int next_scenecut = -1; do { // first get the obvious scenecuts next_scenecut = find_next_scenecut(stats_start, this_start, total_frames - 1); this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1; // low-pass filter the needed stats smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err, filt_coded_err); get_gradient(filt_coded_err, this_start, this_last, grad_coded); // find tentative stable regions and unstable regions int num_regions = find_stable_regions(stats_start, grad_coded, this_start, this_last, temp_regions); adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions); get_region_stats(stats_start, temp_regions, num_regions); // Try to identify blending regions in the unstable regions find_blending_regions(stats_start, temp_regions, &num_regions); cleanup_blendings(temp_regions, &num_regions); // The flash points should all be considered high variance points k = 0; while (k < num_regions) { if (temp_regions[k].type != STABLE_REGION) { k++; continue; } int start = temp_regions[k].start; int last = temp_regions[k].last; for (int i = start; i <= last; i++) { if (stats_start[i].is_flash) { insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k); } } k++; } cleanup_regions(temp_regions, &num_regions); // copy the regions in the scenecut group for (k = 0; k < num_regions; k++) { if (temp_regions[k].last < temp_regions[k].start && k == num_regions - 1) { num_regions--; break; } regions[k + cur_region] = temp_regions[k]; } cur_region += num_regions; // add the scenecut region if (next_scenecut > -1) { // add the scenecut region, and find the next scenecut regions[cur_region].type = SCENECUT_REGION; regions[cur_region].start = next_scenecut; regions[cur_region].last = next_scenecut; cur_region++; this_start = next_scenecut + 1; } } while (next_scenecut >= 0); *total_regions = cur_region; get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { // If scenecuts are very minor, mark them as high variance. 
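    // (Illustrative, assumed numbers: a scenecut region with avg_cor_coeff =
    // 0.9 and a noise to intra-error ratio of 0.05 scores 0.9 * 0.95 = 0.855
    // >= 0.8, so it is demoted to HIGH_VAR_REGION below.)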
if (regions[k].type != SCENECUT_REGION || regions[k].avg_cor_coeff * (1 - stats_start[regions[k].start].noise_var / regions[k].avg_intra_err) < 0.8) { continue; } regions[k].type = HIGH_VAR_REGION; } cleanup_regions(regions, total_regions); get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { regions[k].start += offset; regions[k].last += offset; } free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err, grad_coded); return 0; } static int find_regions_index(const REGIONS *regions, int num_regions, int frame_idx) { for (int k = 0; k < num_regions; k++) { if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) { return k; } } return -1; } /*!\brief Determine the length of future GF groups. * * \ingroup gf_group_algo * This function decides the gf group length of future frames in batch * * \param[in] cpi Top-level encoder structure * \param[in] max_gop_length Maximum length of the GF group * \param[in] max_intervals Maximum number of intervals to decide * * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is * changed to store the decided GF group lengths. */ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, int max_intervals) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0); const int f_w = cpi->common.width; const int f_h = cpi->common.height; int i; int flash_detected; av1_zero(next_frame); if (has_no_stats_stage(cpi)) { for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); } p_rc->cur_gf_index = 0; rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; return; } // TODO(urvang): Try logic to vary min and max interval based on q. const int active_min_gf_interval = rc->min_gf_interval; const int active_max_gf_interval = AOMMIN(rc->max_gf_interval, max_gop_length); const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval); i = (rc->frames_since_key == 0); max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals; int count_cuts = 1; // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF. int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last; int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 }; int cut_here; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); while (count_cuts < max_intervals + 1) { // reaches next key frame, break here if (i >= rc->frames_to_key) { cut_here = 2; } else if (i - cur_start >= rc->static_scene_max_gf_interval) { // reached maximum len, but nothing special yet (almost static) // let's look at the next interval cut_here = 1; } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) { // reaches last frame, break cut_here = 2; } else { // Test for the case where there is a brief flash but the prediction // quality back to an earlier frame is then restored. 
flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); // TODO(bohanli): remove redundant accumulations here, or unify // this and the ones in define_gf_group accumulate_next_frame_stats(&next_frame, flash_detected, rc->frames_since_key, i, &gf_stats, f_w, f_h); cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected, active_max_gf_interval, active_min_gf_interval, &gf_stats); } if (cut_here) { cur_last = i - 1; // the current last frame in the gf group int ori_last = cur_last; // The region frame idx does not start from the same frame as cur_start // and cur_last. Need to offset them. int offset = rc->frames_since_key - p_rc->regions_offset; REGIONS *regions = p_rc->regions; int num_regions = p_rc->num_regions; int scenecut_idx = -1; // only try shrinking if interval smaller than active_max_gf_interval if (cur_last - cur_start <= active_max_gf_interval && cur_last > cur_start) { // find the region indices of where the first and last frame belong. int k_start = find_regions_index(regions, num_regions, cur_start + offset); int k_last = find_regions_index(regions, num_regions, cur_last + offset); if (cur_start + offset == 0) k_start = 0; // See if we have a scenecut in between for (int r = k_start + 1; r <= k_last; r++) { if (regions[r].type == SCENECUT_REGION && regions[r].last - offset - cur_start > active_min_gf_interval) { scenecut_idx = r; break; } } // if the found scenecut is very close to the end, ignore it. if (regions[num_regions - 1].last - regions[scenecut_idx].last < 4) { scenecut_idx = -1; } if (scenecut_idx != -1) { // If we have a scenecut, then stop at it. // TODO(bohanli): add logic here to stop before the scenecut and for // the next gop start from the scenecut with GF int is_minor_sc = (regions[scenecut_idx].avg_cor_coeff * (1 - stats[regions[scenecut_idx].start - offset].noise_var / regions[scenecut_idx].avg_intra_err) > 0.6); cur_last = regions[scenecut_idx].last - offset - !is_minor_sc; } else { int is_last_analysed = (k_last == num_regions - 1) && (cur_last + offset == regions[k_last].last); int not_enough_regions = k_last - k_start <= 1 + (regions[k_start].type == SCENECUT_REGION); // if we are very close to the end, then do not shrink since it may // introduce intervals that are too short if (!(is_last_analysed && not_enough_regions)) { const double arf_length_factor = 0.1; double best_score = 0; int best_j = -1; const int first_frame = regions[0].start - offset; const int last_frame = regions[num_regions - 1].last - offset; // score of how much the arf helps the whole GOP double base_score = 0.0; // Accumulate base_score in for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) { if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; base_score = (base_score + 1.0) * stats[j].cor_coeff; } int met_blending = 0; // Whether we have met blending areas before int last_blending = 0; // Whether the previous frame if blending for (int j = cur_start + min_shrink_int; j <= cur_last; j++) { if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; base_score = (base_score + 1.0) * stats[j].cor_coeff; int this_reg = find_regions_index(regions, num_regions, j + offset); if (this_reg < 0) continue; // A GOP should include at most 1 blending region. if (regions[this_reg].type == BLENDING_REGION) { last_blending = 1; if (met_blending) { break; } else { base_score = 0; continue; } } else { if (last_blending) met_blending = 1; last_blending = 0; } // Add the factor of how good the neighborhood is for this // candidate arf. 
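              // (Sketch: this_score starts from the correlation-weighted
              // base_score and then adds, for the frames around j, the
              // accumulated cor_coeff times
              // sqrt(max(0.5, 1 - noise_var / intra_error)), so a candidate
              // ARF in a clean, well-correlated neighborhood scores higher
              // than one next to noisy or poorly predicted frames.)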
double this_score = arf_length_factor * base_score; double temp_accu_coeff = 1.0; // following frames int count_f = 0; for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) { if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break; temp_accu_coeff *= stats[n].cor_coeff; this_score += temp_accu_coeff * sqrt(AOMMAX(0.5, 1 - stats[n].noise_var / AOMMAX(stats[n].intra_error, 0.001))); count_f++; } // preceding frames temp_accu_coeff = 1.0; for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) { if (stats + n < twopass->stats_buf_ctx->stats_in_start) break; temp_accu_coeff *= stats[n].cor_coeff; this_score += temp_accu_coeff * sqrt(AOMMAX(0.5, 1 - stats[n].noise_var / AOMMAX(stats[n].intra_error, 0.001))); } if (this_score > best_score) { best_score = this_score; best_j = j; } } // For blending areas, move one more frame in case we missed the // first blending frame. int best_reg = find_regions_index(regions, num_regions, best_j + offset); if (best_reg < num_regions - 1 && best_reg > 0) { if (regions[best_reg - 1].type == BLENDING_REGION && regions[best_reg + 1].type == BLENDING_REGION) { if (best_j + offset == regions[best_reg].start && best_j + offset < regions[best_reg].last) { best_j += 1; } else if (best_j + offset == regions[best_reg].last && best_j + offset > regions[best_reg].start) { best_j -= 1; } } } if (cur_last - best_j < 2) best_j = cur_last; if (best_j > 0 && best_score > 0.1) cur_last = best_j; // if cannot find anything, just cut at the original place. } } } cut_pos[count_cuts] = cur_last; count_cuts++; // reset pointers to the shrunken location cpi->twopass_frame.stats_in = start_pos + cur_last; cur_start = cur_last; int cur_region_idx = find_regions_index(regions, num_regions, cur_start + 1 + offset); if (cur_region_idx >= 0) if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++; i = cur_last; if (cut_here > 1 && cur_last == ori_last) break; // reset accumulators init_gf_stats(&gf_stats); } ++i; } // save intervals rc->intervals_till_gf_calculate_due = count_cuts - 1; for (int n = 1; n < count_cuts; n++) { p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; } p_rc->cur_gf_index = 0; cpi->twopass_frame.stats_in = start_pos; } static void correct_frames_to_key(AV1_COMP *cpi) { int lookahead_size = (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); if (lookahead_size < av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) { assert( IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0, lookahead_size == cpi->ppi->frames_left)); cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); } else if (cpi->ppi->frames_left > 0) { // Correct frames to key based on limit cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left); } } /*!\brief Define a GF group in one pass mode when no look ahead stats are * available. * * \ingroup gf_group_algo * This function defines the structure of a GF group, along with various * parameters regarding bit-allocation and quality setup in the special * case of one pass encoding where no lookahead stats are avialable. * * \param[in] cpi Top-level encoder structure * * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed. 
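 *
 * Since no first-pass stats are available here, each frame in the group is
 * given a one-pass CBR or VBR per-frame target (see the allocation loop in
 * the function body) rather than a share of a two-pass group budget.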
*/ static void define_gf_group_pass0(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GFConfig *const gf_cfg = &oxcf->gf_cfg; int target; if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { av1_cyclic_refresh_set_golden_update(cpi); } else { p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index]; rc->intervals_till_gf_calculate_due--; p_rc->cur_gf_index++; } // correct frames_to_key when lookahead queue is flushing correct_frames_to_key(cpi); if (p_rc->baseline_gf_interval > rc->frames_to_key) p_rc->baseline_gf_interval = rc->frames_to_key; p_rc->gfu_boost = DEFAULT_GF_BOOST; p_rc->constrained_gf_group = (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height; // Rare case when the look-ahead is less than the target GOP length, can't // generate ARF frame. if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames || !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) || p_rc->baseline_gf_interval < rc->min_gf_interval) gf_group->max_layer_depth_allowed = 0; // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) av1_gop_setup_structure(cpi); // Allocate bits to each of the frames in the GF group. // TODO(sarahparker) Extend this to work with pyramid structure. for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) { const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index]; if (oxcf->rc_cfg.mode == AOM_CBR) { if (cur_update_type == KF_UPDATE) { target = av1_calc_iframe_target_size_one_pass_cbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type); } } else { if (cur_update_type == KF_UPDATE) { target = av1_calc_iframe_target_size_one_pass_vbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type); } } gf_group->bit_allocation[cur_index] = target; } } static inline void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc, int arf_position) { p_rc->baseline_gf_interval = arf_position; } // initialize GF_GROUP_STATS static void init_gf_stats(GF_GROUP_STATS *gf_stats) { gf_stats->gf_group_err = 0.0; gf_stats->gf_group_raw_error = 0.0; gf_stats->gf_group_skip_pct = 0.0; gf_stats->gf_group_inactive_zone_rows = 0.0; gf_stats->mv_ratio_accumulator = 0.0; gf_stats->decay_accumulator = 1.0; gf_stats->zero_motion_accumulator = 1.0; gf_stats->loop_decay_rate = 1.0; gf_stats->last_loop_decay_rate = 1.0; gf_stats->this_frame_mv_in_out = 0.0; gf_stats->mv_in_out_accumulator = 0.0; gf_stats->abs_mv_in_out_accumulator = 0.0; gf_stats->avg_sr_coded_error = 0.0; gf_stats->avg_pcnt_second_ref = 0.0; gf_stats->avg_new_mv_count = 0.0; gf_stats->avg_wavelet_energy = 0.0; gf_stats->avg_raw_err_stdev = 0.0; gf_stats->non_zero_stdev_count = 0; } static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w, int f_h, FIRSTPASS_STATS *next_frame, const FIRSTPASS_STATS *start_pos, GF_GROUP_STATS *gf_stats, int *idx) { int i, flash_detected; TWO_PASS *const twopass = &cpi->ppi->twopass; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; RATE_CONTROL *const rc = &cpi->rc; FRAME_INFO *frame_info = &cpi->frame_info; const AV1EncoderConfig *const oxcf = &cpi->oxcf; init_gf_stats(gf_stats); av1_zero(*next_frame); // If this is a key frame or the overlay from a previous arf then // the error score / cost of this frame has already been accounted for. 
i = is_intra_only; // get the determined gf group length from p_rc->gf_intervals while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; // Accumulate error score of frames in this gf group. double mod_frame_err = calculate_modified_err(frame_info, twopass, oxcf, next_frame); // accumulate stats for this frame accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats); ++i; } reset_fpf_position(&cpi->twopass_frame, start_pos); i = is_intra_only; input_stats(twopass, &cpi->twopass_frame, next_frame); while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; // Test for the case where there is a brief flash but the prediction // quality back to an earlier frame is then restored. flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); // accumulate stats for next frame accumulate_next_frame_stats(next_frame, flash_detected, rc->frames_since_key, i, gf_stats, f_w, f_h); ++i; } i = p_rc->gf_intervals[p_rc->cur_gf_index]; average_gf_stats(i, gf_stats); *idx = i; } static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc, int idx, int is_final_pass) { if (is_final_pass) { rc->intervals_till_gf_calculate_due--; p_rc->cur_gf_index++; } // Was the group length constrained by the requirement for a new KF? p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0; set_baseline_gf_interval(p_rc, idx); rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; } // #define FIXED_ARF_BITS #ifdef FIXED_ARF_BITS #define ARF_BITS_FRACTION 0.75 #endif /*!\brief Distributes bits to frames in a group * *\ingroup rate_control * * This function decides on the allocation of bits between the different * frames and types of frame in a GF/ARF group. * * \param[in] cpi Top - level encoder instance structure * \param[in] rc Rate control data * \param[in] gf_group GF/ARF group data structure * \param[in] is_key_frame Indicates if the first frame in the group is * also a key frame. * \param[in] use_arf Are ARF frames enabled or is this a GF only * uni-directional group. * \param[in] gf_group_bits Bits available to be allocated. * * \remark No return but updates the rate control and group data structures * to reflect the allocation of bits. */ static void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, GF_GROUP *gf_group, int is_key_frame, int use_arf, int64_t gf_group_bits) { PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Calculate the extra bits to be used for boosted frame(s) #ifdef FIXED_ARF_BITS int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits); #else int gf_arf_bits = calculate_boost_bits( p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost, gf_group_bits); #endif gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits, gf_group_bits, 1); // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits, is_key_frame, use_arf); } #undef ARF_BITS_FRACTION #define MAX_GF_BOOST 5400 #define REDUCE_GF_LENGTH_THRESH 4 #define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 #define REDUCE_GF_LENGTH_BY 1 static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only, int is_final_pass, int use_alt_ref, int alt_offset, const FIRSTPASS_STATS *start_pos, GF_GROUP_STATS *gf_stats) { // Should we use the alternate reference frame. 
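  // (Sketch of the boost computation that follows: with an ARF, gfu_boost is
  // estimated from up to ext_len forward frames and ext_len backward frames,
  // with the forward count limited so the search does not run past the next
  // key frame; without an ARF the estimate uses the same span with no
  // backward component and is additionally capped at MAX_GF_BOOST.)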
AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; TWO_PASS *const twopass = &cpi->ppi->twopass; GF_GROUP *gf_group = &cpi->ppi->gf_group; FRAME_INFO *frame_info = &cpi->frame_info; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; int ext_len = i - is_intra_only; if (use_alt_ref) { const int forward_frames = (rc->frames_to_key - i >= ext_len) ? ext_len : AOMMAX(0, rc->frames_to_key - i); // Calculate the boost for alt ref. p_rc->gfu_boost = av1_calc_arf_boost( twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost, &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled); } else { reset_fpf_position(&cpi->twopass_frame, start_pos); p_rc->gfu_boost = AOMMIN( MAX_GF_BOOST, av1_calc_arf_boost( twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len, 0, &p_rc->num_stats_used_for_gfu_boost, &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled)); } #define LAST_ALR_BOOST_FACTOR 0.2f p_rc->arf_boost_factor = 1.0; if (use_alt_ref && !is_lossless_requested(rc_cfg)) { // Reduce the boost of altref in the last gf group if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY || rc->frames_to_key - ext_len == 0) { p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; } } // Reset the file position. reset_fpf_position(&cpi->twopass_frame, start_pos); if (cpi->ppi->lap_enabled) { // Since we don't have enough stats to know the actual error of the // gf group, we assume error of each frame to be equal to 1 and set // the error of the group as baseline_gf_interval. gf_stats->gf_group_err = p_rc->baseline_gf_interval; } // Calculate the bits to be allocated to the gf/arf group as a whole p_rc->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err); #if GROUP_ADAPTIVE_MAXQ // Calculate an estimate of the maxq needed for the group. // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) && is_final_pass) { const int vbr_group_bits_per_frame = (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval); const double group_av_err = gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval; const double group_av_skip_pct = gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval; const double group_av_inactive_zone = ((gf_stats->gf_group_inactive_zone_rows * 2) / (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); int tmp_q; tmp_q = get_twopass_worst_quality( cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), vbr_group_bits_per_frame); rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1); } #endif // Adjust KF group bits and error remaining. if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err; // Reset the file position. reset_fpf_position(&cpi->twopass_frame, start_pos); // Calculate a section intra ratio used in setting max loop filter. if (rc->frames_since_key != 0) { twopass->section_intra_rating = calculate_section_intra_ratio( start_pos, twopass->stats_buf_ctx->stats_in_end, p_rc->baseline_gf_interval); } av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0, use_alt_ref, p_rc->gf_group_bits); // TODO(jingning): Generalize this condition. 
if (is_final_pass) { cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref; // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 1; twopass->rolling_arf_group_actual_bits = 1; } #if CONFIG_BITRATE_ACCURACY if (is_final_pass) { av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info, p_rc->baseline_gf_interval); } #endif } /*!\brief Define a GF group. * * \ingroup gf_group_algo * This function defines the structure of a GF group, along with various * parameters regarding bit-allocation and quality setup. * * \param[in] cpi Top-level encoder structure * \param[in] frame_params Structure with frame parameters * \param[in] is_final_pass Whether this is the final pass for the * GF group, or a trial (non-zero) * * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed. */ static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params, int is_final_pass) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; GF_GROUP *gf_group = &cpi->ppi->gf_group; const GFConfig *const gf_cfg = &oxcf->gf_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; const int f_w = cm->width; const int f_h = cm->height; int i; const int is_intra_only = rc->frames_since_key == 0; cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. if (!is_intra_only) { av1_zero(cpi->ppi->gf_group); cpi->gf_frame_index = 0; } if (has_no_stats_stage(cpi)) { define_gf_group_pass0(cpi); return; } #if CONFIG_THREE_PASS if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass); if (ret == 0) return; av1_free_thirdpass_ctx(cpi->third_pass_ctx); cpi->third_pass_ctx = NULL; } #endif // CONFIG_THREE_PASS // correct frames_to_key when lookahead queue is emptying if (cpi->ppi->lap_enabled) { correct_frames_to_key(cpi); } GF_GROUP_STATS gf_stats; accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, &gf_stats, &i); const int can_disable_arf = !gf_cfg->gf_min_pyr_height; // If this is a key frame or the overlay from a previous arf then // the error score / cost of this frame has already been accounted for. const int active_min_gf_interval = rc->min_gf_interval; // Disable internal ARFs for "still" gf groups. // zero_motion_accumulator: minimum percentage of (0,0) motion; // avg_sr_coded_error: average of the SSE per pixel of each frame; // avg_raw_err_stdev: average of the standard deviation of (0,0) // motion error per block of each frame. 
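  // (Illustrative: a group where nearly every block keeps (0,0) motion, the
  // per-pixel second-ref SSE stays small and the (0,0) error variance is low
  // is treated as "still", the idea being that extra internal ARFs buy little
  // for such content; the exact thresholds MIN_ZERO_MOTION,
  // MAX_SR_CODED_ERROR and MAX_RAW_ERR_VAR are defined elsewhere.)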
const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1; if (can_disable_internal_arfs && gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION && gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR && gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) { cpi->ppi->internal_altref_allowed = 0; } int use_alt_ref; if (can_disable_arf) { use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator, twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) && p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i >= MIN_GF_INTERVAL); } else { use_alt_ref = p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i > 2); } if (use_alt_ref) { gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height; } else { gf_group->max_layer_depth_allowed = 0; } int alt_offset = 0; // The length reduction strategy is tweaked for certain cases, and doesn't // work well for certain other cases. const int allow_gf_length_reduction = ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) || !cpi->ppi->internal_altref_allowed) && !is_lossless_requested(rc_cfg); if (allow_gf_length_reduction && use_alt_ref) { // adjust length of this gf group if one of the following condition met // 1: only one overlay frame left and this gf is too long // 2: next gf group is too short to have arf compared to the current gf // maximum length of next gf group const int next_gf_len = rc->frames_to_key - i; const int single_overlay_left = next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; // the next gf is probably going to have a ARF but it will be shorter than // this gf const int unbalanced_gf = i > REDUCE_GF_LENGTH_TO_KEY_THRESH && next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && next_gf_len + 1 >= rc->min_gf_interval; if (single_overlay_left || unbalanced_gf) { const int roll_back = REDUCE_GF_LENGTH_BY; // Reduce length only if active_min_gf_interval will be respected later. if (i - roll_back >= active_min_gf_interval + 1) { alt_offset = -roll_back; i -= roll_back; if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back; reset_fpf_position(&cpi->twopass_frame, start_pos); accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, &gf_stats, &i); } } } update_gop_length(rc, p_rc, i, is_final_pass); // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) av1_gop_setup_structure(cpi); set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, alt_offset, start_pos, &gf_stats); frame_params->frame_type = rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME; frame_params->show_frame = !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); } #if CONFIG_THREE_PASS /*!\brief Define a GF group for the third apss. * * \ingroup gf_group_algo * This function defines the structure of a GF group for the third pass, along * with various parameters regarding bit-allocation and quality setup based on * the two-pass bitstream. * Much of the function still uses the strategies used for the second pass and * relies on first pass statistics. It is expected that over time these portions * would be replaced with strategies specific to the third pass. 
* * \param[in] cpi Top-level encoder structure * \param[in] frame_params Structure with frame parameters * \param[in] is_final_pass Whether this is the final pass for the * GF group, or a trial (non-zero) * * \return 0: Success; * -1: There are conflicts between the bitstream and current config * The values in cpi->ppi->gf_group are also changed. */ static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params, int is_final_pass) { if (!cpi->third_pass_ctx) return -1; AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; GF_GROUP *gf_group = &cpi->ppi->gf_group; const GFConfig *const gf_cfg = &oxcf->gf_cfg; const int f_w = cm->width; const int f_h = cm->height; int i; const int is_intra_only = rc->frames_since_key == 0; cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. if (!is_intra_only) { av1_zero(cpi->ppi->gf_group); cpi->gf_frame_index = 0; } GF_GROUP_STATS gf_stats; accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, &gf_stats, &i); const int can_disable_arf = !gf_cfg->gf_min_pyr_height; // TODO(any): set cpi->ppi->internal_altref_allowed accordingly; int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx); if (use_alt_ref == 0 && !can_disable_arf) return -1; if (use_alt_ref) { gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height; } else { gf_group->max_layer_depth_allowed = 0; } update_gop_length(rc, p_rc, i, is_final_pass); // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) av1_gop_setup_structure(cpi); set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0, start_pos, &gf_stats); frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type; frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame; return 0; } #endif // CONFIG_THREE_PASS // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 // Minimum ratio between the % of intra coding and inter coding in the first // pass after discounting neutral blocks (discounting neutral blocks in this // way helps catch scene cuts in clips with very flat areas or letter box // format clips with image padding. #define INTRA_VS_INTER_THRESH 2.0 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. #define VERY_LOW_INTER_THRESH 0.05 // Maximum threshold for the relative ratio of intra error score vs best // inter error score. #define KF_II_ERR_THRESHOLD 1.9 // In real scene cuts there is almost always a sharp change in the intra // or inter error score. #define ERR_CHANGE_THRESHOLD 0.4 // For real scene cuts we expect an improvment in the intra inter error // ratio in the next frame. #define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 // Intra / Inter threshold very low #define VERY_LOW_II 1.5 // Clean slide transitions we expect a sharp single frame spike in error. #define ERROR_SPIKE 5.0 // Slide show transition detection. // Tests for case where there is very low error either side of the current frame // but much higher just for this frame. 
This can help detect key frames in // slide shows even where the slides are pictures of different sizes. // Also requires that intra and inter errors are very similar to help eliminate // harmful false positives. // It will not help if the transition is a fade or other multi-frame effect. static int slide_transition(const FIRSTPASS_STATS *this_frame, const FIRSTPASS_STATS *last_frame, const FIRSTPASS_STATS *next_frame) { return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); } // Threshold for use of the lagging second reference frame. High second ref // usage may point to a transient event like a flash or occlusion rather than // a real scene cut. // We adapt the threshold based on number of frames in this key-frame group so // far. static double get_second_ref_usage_thresh(int frame_count_so_far) { const int adapt_upto = 32; const double min_second_ref_usage_thresh = 0.085; const double second_ref_usage_thresh_max_delta = 0.035; if (frame_count_so_far >= adapt_upto) { return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta; } return min_second_ref_usage_thresh + ((double)frame_count_so_far / (adapt_upto - 1)) * second_ref_usage_thresh_max_delta; } static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info, int this_stats_index, int frame_count_so_far, enum aom_rc_mode rc_mode, int scenecut_mode, int num_mbs) { const FIRSTPASS_STATS *last_stats = av1_firstpass_info_peek(firstpass_info, this_stats_index - 1); const FIRSTPASS_STATS *this_stats = av1_firstpass_info_peek(firstpass_info, this_stats_index); const FIRSTPASS_STATS *next_stats = av1_firstpass_info_peek(firstpass_info, this_stats_index + 1); if (last_stats == NULL || this_stats == NULL || next_stats == NULL) { return 0; } int is_viable_kf = 0; double pcnt_intra = 1.0 - this_stats->pcnt_inter; double modified_pcnt_inter = this_stats->pcnt_inter - this_stats->pcnt_neutral; const double second_ref_usage_thresh = get_second_ref_usage_thresh(frame_count_so_far); int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL; int count_for_tolerable_prediction = 3; // We do "-1" because the candidate key is not counted. int stats_after_this_stats = av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1; if (scenecut_mode == ENABLE_SCENECUT_MODE_1) { if (stats_after_this_stats < 3) { return 0; } else { frames_to_test_after_candidate_key = 3; count_for_tolerable_prediction = 1; } } // Make sure we have enough stats after the candidate key. frames_to_test_after_candidate_key = AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats); // Does the frame satisfy the primary criteria of a key frame? // See above for an explanation of the test criteria. // If so, then examine how well it predicts subsequent frames. 
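// In rough terms the primary criteria are: low second reference usage in both
// this frame and the next, combined with either (a) almost no inter coding,
// (b) a detected slide transition, or (c) a high intra share together with a
// favourable intra/coded error ratio and a sharp error change versus the
// previous frame (or a large expected intra/inter improvement in the next
// frame). Under AOM_Q at least three frames must also have passed since the
// last key frame.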
if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) && (this_stats->pcnt_second_ref < second_ref_usage_thresh) && (next_stats->pcnt_second_ref < second_ref_usage_thresh) && ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) || slide_transition(this_stats, last_stats, next_stats) || ((pcnt_intra > MIN_INTRA_LEVEL) && (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && ((this_stats->intra_error / DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) < KF_II_ERR_THRESHOLD) && ((fabs(last_stats->coded_error - this_stats->coded_error) / DOUBLE_DIVIDE_CHECK(this_stats->coded_error) > ERR_CHANGE_THRESHOLD) || (fabs(last_stats->intra_error - this_stats->intra_error) / DOUBLE_DIVIDE_CHECK(this_stats->intra_error) > ERR_CHANGE_THRESHOLD) || ((next_stats->intra_error / DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) > II_IMPROVEMENT_THRESHOLD))))) { int i; double boost_score = 0.0; double old_boost_score = 0.0; double decay_accumulator = 1.0; // Examine how well the key frame predicts subsequent frames. for (i = 1; i <= frames_to_test_after_candidate_key; ++i) { // Get the next frame details const FIRSTPASS_STATS *local_next_frame = av1_firstpass_info_peek(firstpass_info, this_stats_index + i); double next_iiratio = (BOOST_FACTOR * local_next_frame->intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error)); if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; // Cumulative effect of decay in prediction quality. if (local_next_frame->pcnt_inter > 0.85) decay_accumulator *= local_next_frame->pcnt_inter; else decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0; // Keep a running total. boost_score += (decay_accumulator * next_iiratio); // Test various breakout clauses. // TODO(any): Test of intra error should be normalized to an MB. if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) || (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) < 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || (local_next_frame->intra_error < (200.0 / (double)num_mbs))) { break; } old_boost_score = boost_score; } // If there is tolerable prediction for at least the next 3 frames then // break out else discard this potential key frame and move on if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) { is_viable_kf = 1; } else { is_viable_kf = 0; } } return is_viable_kf; } #define FRAMES_TO_CHECK_DECAY 8 #define KF_MIN_FRAME_BOOST 80.0 #define KF_MAX_FRAME_BOOST 128.0 #define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval #define MAX_KF_BOOST 3200 #define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval static int detect_app_forced_key(AV1_COMP *cpi) { int num_frames_to_app_forced_key = is_forced_keyframe_pending( cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage); return num_frames_to_app_forced_key; } static int get_projected_kf_boost(AV1_COMP *cpi) { /* * If num_stats_used_for_kf_boost >= frames_to_key, then * all stats needed for prior boost calculation are available. * Hence projecting the prior boost is not needed in this cases. */ if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) return cpi->ppi->p_rc.kf_boost; // Get the current tpl factor (number of frames = frames_to_key). double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. 
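// The projection below simply rescales the boost computed over the available
// stats to the full key frame interval:
//   projected_kf_boost =
//       kf_boost * factor(frames_to_key) / factor(num_stats_used_for_kf_boost)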
double tpl_factor_num_stats = av1_get_kf_boost_projection_factor( cpi->ppi->p_rc.num_stats_used_for_kf_boost); int projected_kf_boost = (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats); return projected_kf_boost; } /*!\brief Determine the location of the next key frame * * \ingroup gf_group_algo * This function decides the placement of the next key frame when a * scenecut is detected or the maximum key frame distance is reached. * * \param[in] cpi Top-level encoder structure * \param[in] firstpass_info struct for firstpass info * \param[in] num_frames_to_detect_scenecut Maximum lookahead frames. * \param[in] search_start_idx the start index for searching key frame. * Set it to one if we already know the * current frame is key frame. Otherwise, * set it to zero. * * \return Number of frames to the next key including the current frame. */ static int define_kf_interval(AV1_COMP *cpi, const FIRSTPASS_INFO *firstpass_info, int num_frames_to_detect_scenecut, int search_start_idx) { const TWO_PASS *const twopass = &cpi->ppi->twopass; const RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double decay_accumulator = 1.0; int i = 0, j; int frames_to_key = search_start_idx; int frames_since_key = rc->frames_since_key + 1; int scenecut_detected = 0; int num_frames_to_next_key = detect_app_forced_key(cpi); if (num_frames_to_detect_scenecut == 0) { if (num_frames_to_next_key != -1) return num_frames_to_next_key; else return rc->frames_to_key; } if (num_frames_to_next_key != -1) num_frames_to_detect_scenecut = AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; i = 0; const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.mi_params.MBs; const int future_stats_count = av1_firstpass_info_future_count(firstpass_info, 0); while (frames_to_key < future_stats_count && frames_to_key < num_frames_to_detect_scenecut) { // Provided that we are not at the end of the file... if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && frames_to_key + 1 < future_stats_count) { double loop_decay_rate; // Check for a scene cut. if (frames_since_key >= kf_cfg->key_freq_min) { scenecut_detected = test_candidate_kf( &twopass->firstpass_info, frames_to_key, frames_since_key, oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection, num_mbs); if (scenecut_detected) { break; } } // How fast is the prediction quality decaying? const FIRSTPASS_STATS *next_stats = av1_firstpass_info_peek(firstpass_info, frames_to_key + 1); loop_decay_rate = get_prediction_decay_rate(next_stats); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction // quality since the last GF or KF. recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; decay_accumulator = 1.0; for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) decay_accumulator *= recent_loop_decay[j]; // Special check for transition or high motion followed by a // static scene. 
if (frames_since_key >= kf_cfg->key_freq_min) { scenecut_detected = detect_transition_to_still( firstpass_info, frames_to_key + 1, rc->min_gf_interval, i, kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator); if (scenecut_detected) { // In the case of transition followed by a static scene, the key frame // could be a good predictor for the following frames, therefore we // do not use an arf. p_rc->use_arf_in_this_kf_group = 0; break; } } // Step on to the next frame. ++frames_to_key; ++frames_since_key; // If we don't have a real key frame within the next two // key_freq_max intervals then break out of the loop. if (frames_to_key >= 2 * kf_cfg->key_freq_max) { break; } } else { ++frames_to_key; ++frames_since_key; } ++i; } if (cpi->ppi->lap_enabled && !scenecut_detected) frames_to_key = num_frames_to_next_key; return frames_to_key; } static double get_kf_group_avg_error(TWO_PASS *twopass, TWO_PASS_FRAME *twopass_frame, const FIRSTPASS_STATS *first_frame, const FIRSTPASS_STATS *start_position, int frames_to_key) { FIRSTPASS_STATS cur_frame = *first_frame; int num_frames, i; double kf_group_avg_error = 0.0; reset_fpf_position(twopass_frame, start_position); for (i = 0; i < frames_to_key; ++i) { kf_group_avg_error += cur_frame.coded_error; if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break; } num_frames = i + 1; num_frames = AOMMIN(num_frames, frames_to_key); kf_group_avg_error = kf_group_avg_error / num_frames; return (kf_group_avg_error); } static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, double kf_group_avg_error) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; int64_t kf_group_bits; if (cpi->ppi->lap_enabled) { kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth; if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) { double vbr_corpus_complexity_lap = cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0; /* Get the average corpus complexity of the frame */ kf_group_bits = (int64_t)(kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap)); } } else { kf_group_bits = (int64_t)(twopass->bits_left * (kf_group_err / twopass->modified_error_left)); } return kf_group_bits; } static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS cur_frame; av1_zero(cur_frame); int num_frames = 0; // Accumulate total stat using available number of stats. 
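// (Each accumulated field is divided by the number of frames below, giving a
// per-frame average that stands in for individual frame stats when only a
// limited number of look-ahead stats is available.)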
for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) { if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break; av1_accumulate_stats(avg_frame_stat, &cur_frame); } if (num_frames < 2) { return num_frames; } // Average the total stat avg_frame_stat->weight = avg_frame_stat->weight / num_frames; avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames; avg_frame_stat->frame_avg_wavelet_energy = avg_frame_stat->frame_avg_wavelet_energy / num_frames; avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames; avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames; avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames; avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames; avg_frame_stat->pcnt_second_ref = avg_frame_stat->pcnt_second_ref / num_frames; avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames; avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames; avg_frame_stat->inactive_zone_rows = avg_frame_stat->inactive_zone_rows / num_frames; avg_frame_stat->inactive_zone_cols = avg_frame_stat->inactive_zone_cols / num_frames; avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames; avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames; avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames; avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames; avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames; avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames; avg_frame_stat->mv_in_out_count = avg_frame_stat->mv_in_out_count / num_frames; avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames; avg_frame_stat->count = avg_frame_stat->count / num_frames; avg_frame_stat->duration = avg_frame_stat->duration / num_frames; return num_frames; } static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, double *zero_motion_accumulator, double *sr_accumulator, int use_avg_stat) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; FIRSTPASS_STATS frame_stat; av1_zero(frame_stat); int i = 0, num_stat_used = 0; double boost_score = 0.0; const double kf_max_boost = cpi->oxcf.rc_cfg.mode == AOM_Q ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), KF_MAX_FRAME_BOOST) : KF_MAX_FRAME_BOOST; // Calculate the average using available number of stats. if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat); for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) { if (!use_avg_stat && EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat)) break; // Monitor for static sections. // For the first frame in kf group, the second ref indicator is invalid. if (i > 0) { *zero_motion_accumulator = AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat)); } else { *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion; } // Not all frames in the group are necessarily used in calculating boost. if ((*sr_accumulator < (kf_raw_err * 1.50)) && (i <= rc->max_gf_interval * 2)) { double frame_boost; double zm_factor; // Factor 0.75-1.25 based on how much of frame is static. 
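// (A fully static section, accumulator 1.0, gives a factor of 1.25; a fully
// moving one, accumulator 0.0, gives 0.75.)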
zm_factor = (0.75 + (*zero_motion_accumulator / 2.0)); if (i < 2) *sr_accumulator = 0.0; frame_boost = calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat, sr_accumulator, kf_max_boost); boost_score += frame_boost * zm_factor; } } return boost_score; } /*!\brief Interval(in seconds) to clip key-frame distance to in LAP. */ #define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5 /*!\brief Determine the next key frame group * * \ingroup gf_group_algo * This function decides the placement of the next key frame, and * calculates the bit allocation of the KF group and the keyframe itself. * * \param[in] cpi Top-level encoder structure * \param[in] this_frame Pointer to first pass stats */ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; TWO_PASS *const twopass = &cpi->ppi->twopass; GF_GROUP *const gf_group = &cpi->ppi->gf_group; FRAME_INFO *const frame_info = &cpi->frame_info; AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; const FIRSTPASS_STATS first_frame = *this_frame; FIRSTPASS_STATS next_frame; const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info; av1_zero(next_frame); rc->frames_since_key = 0; // Use arfs if possible. p_rc->use_arf_in_this_kf_group = is_altref_enabled( oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf); // Reset the GF group data structures. av1_zero(*gf_group); cpi->gf_frame_index = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; if (has_no_stats_stage(cpi)) { int num_frames_to_app_forced_key = detect_app_forced_key(cpi); p_rc->this_key_frame_forced = current_frame->frame_number != 0 && rc->frames_to_key == 0; if (num_frames_to_app_forced_key != -1) rc->frames_to_key = num_frames_to_app_forced_key; else rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max); correct_frames_to_key(cpi); p_rc->kf_boost = DEFAULT_KF_BOOST; gf_group->update_type[0] = KF_UPDATE; return; } int i; const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in; int kf_bits = 0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_raw_err = 0.0; double kf_mod_err = 0.0; double sr_accumulator = 0.0; double kf_group_avg_error = 0.0; int frames_to_key, frames_to_key_clipped = INT_MAX; int64_t kf_group_bits_clipped = INT64_MAX; // Is this a forced key frame by interval. p_rc->this_key_frame_forced = p_rc->next_key_frame_forced; twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0; // Group modified error score. kf_raw_err = this_frame->intra_error; kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame); // We assume the current frame is a key frame and we are looking for the next // key frame. Therefore search_start_idx = 1 frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max, /*search_start_idx=*/1); if (frames_to_key != -1) { rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key); } else { rc->frames_to_key = kf_cfg->key_freq_max; } if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. // This code centers the extra kf if the actual natural interval // is between 1x and 2x. 
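// For example, if frames_to_key at this point were roughly 1.5x key_freq_max,
// halving it would place the forced key frame near the middle of that span
// rather than at the maximum allowed distance.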
if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) { FIRSTPASS_STATS tmp_frame = first_frame; rc->frames_to_key /= 2; // Reset to the start of the group. reset_fpf_position(&cpi->twopass_frame, start_position); // Rescan to get the correct error data for the forced kf group. for (i = 0; i < rc->frames_to_key; ++i) { if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break; } p_rc->next_key_frame_forced = 1; } else if ((cpi->twopass_frame.stats_in == twopass->stats_buf_ctx->stats_in_end && is_stat_consumption_stage_twopass(cpi)) || rc->frames_to_key >= kf_cfg->key_freq_max) { p_rc->next_key_frame_forced = 1; } else { p_rc->next_key_frame_forced = 0; } double kf_group_err = 0; for (i = 0; i < rc->frames_to_key; ++i) { const FIRSTPASS_STATS *this_stats = av1_firstpass_info_peek(&twopass->firstpass_info, i); if (this_stats != NULL) { // Accumulate kf group error. kf_group_err += calculate_modified_err_new( frame_info, &firstpass_info->total_stats, this_stats, oxcf->rc_cfg.vbrbias, twopass->modified_error_min, twopass->modified_error_max); ++p_rc->num_stats_used_for_kf_boost; } } // Calculate the number of bits that should be assigned to the kf group. if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) || (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, oxcf); // Maximum number of bits allocated to the key frame group. int64_t max_grp_bits; if (oxcf->rc_cfg.vbr_corpus_complexity_lap) { kf_group_avg_error = get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame, start_position, rc->frames_to_key); } // Default allocation based on bits left and relative // complexity of the section. twopass->kf_group_bits = get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; if (twopass->kf_group_bits > max_grp_bits) twopass->kf_group_bits = max_grp_bits; } else { twopass->kf_group_bits = 0; } twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); if (cpi->ppi->lap_enabled) { // In the case of single pass based on LAP, frames to key may have an // inaccurate value, and hence should be clipped to an appropriate // interval. frames_to_key_clipped = (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate); // This variable calculates the bits allocated to kf_group with a clipped // frames_to_key. if (rc->frames_to_key > frames_to_key_clipped) { kf_group_bits_clipped = (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped / rc->frames_to_key); } } // Reset the first pass file position. reset_fpf_position(&cpi->twopass_frame, start_position); // Scan through the kf group collating various stats used to determine // how many bits to spend on it. boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 0); reset_fpf_position(&cpi->twopass_frame, start_position); // Store the zero motion percentage twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); // Calculate a section intra ratio used in setting max loop filter. 
twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); p_rc->kf_boost = (int)boost_score; if (cpi->ppi->lap_enabled) { if (oxcf->rc_cfg.mode == AOM_Q) { p_rc->kf_boost = get_projected_kf_boost(cpi); } else { // TODO(any): Explore using average frame stats for AOM_Q as well. boost_score = get_kf_boost_score( cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1); reset_fpf_position(&cpi->twopass_frame, start_position); p_rc->kf_boost += (int)boost_score; } } // Special case for static / slide show content but don't apply // if the kf group is very short. if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && (rc->frames_to_key > 8)) { p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST); } else { // Apply various clamps for min and max boost p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3)); p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST); #ifdef STRICT_RC p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST); #endif } // Work out how many bits to allocate for the key frame itself. // In case of LAP enabled for VBR, if the frames_to_key value is // very high, we calculate the bits based on a clipped value of // frames_to_key. kf_bits = calculate_boost_bits( AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost, AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped)); // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", // p_rc->kf_boost, // kf_bits, twopass->kf_zeromotion_pct); kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits, twopass->kf_group_bits, 0); twopass->kf_group_bits -= kf_bits; // Save the bits to spend on the key frame. gf_group->bit_allocation[0] = kf_bits; gf_group->update_type[0] = KF_UPDATE; // Note the total error score of the kf group minus the key frame itself. if (cpi->ppi->lap_enabled) // As we don't have enough stats to know the actual error of the group, // we assume the complexity of each frame to be equal to 1, and set the // error as the number of frames in the group(minus the keyframe). twopass->kf_group_error_left = (double)(rc->frames_to_key - 1); else twopass->kf_group_error_left = kf_group_err - kf_mod_err; // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. twopass->modified_error_left -= kf_group_err; } #define ARF_STATS_OUTPUT 0 #if ARF_STATS_OUTPUT unsigned int arf_count = 0; #endif static int get_section_target_bandwidth(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->ppi->twopass; int64_t section_target_bandwidth; const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - current_frame->frame_number); if (cpi->ppi->lap_enabled) section_target_bandwidth = rc->avg_frame_bandwidth; else { section_target_bandwidth = twopass->bits_left / frames_left; section_target_bandwidth = AOMMIN(section_target_bandwidth, INT_MAX); } return (int)section_target_bandwidth; } static inline void set_twopass_params_based_on_fp_stats( AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { if (this_frame_ptr == NULL) return; TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. 
twopass_frame->mb_av_energy = log1p(this_frame_ptr->intra_error); const FIRSTPASS_STATS *const total_stats = cpi->ppi->twopass.stats_buf_ctx->total_stats; if (is_fp_wavelet_energy_invalid(total_stats) == 0) { twopass_frame->frame_avg_haar_energy = log1p(this_frame_ptr->frame_avg_wavelet_energy); } // Set the frame content type flag. if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH) twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION; else twopass_frame->fr_content_type = FC_NORMAL; } static void process_first_pass_stats(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 && cpi->gf_frame_index == 0 && total_stats && twopass->stats_buf_ctx->total_left_stats) { if (cpi->ppi->lap_enabled) { /* * Accumulate total_stats using available limited number of stats, * and assign it to total_left_stats. */ *twopass->stats_buf_ctx->total_left_stats = *total_stats; } // Special case code for first frame. const int section_target_bandwidth = get_section_target_bandwidth(cpi); const double section_length = twopass->stats_buf_ctx->total_left_stats->count; const double section_error = twopass->stats_buf_ctx->total_left_stats->coded_error / section_length; const double section_intra_skip = twopass->stats_buf_ctx->total_left_stats->intra_skip_pct / section_length; const double section_inactive_zone = (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) / ((double)cm->mi_params.mb_rows * section_length); const int tmp_q = get_twopass_worst_quality( cpi, section_error, section_intra_skip + section_inactive_zone, section_target_bandwidth); rc->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; p_rc->last_q[INTER_FRAME] = tmp_q; p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth); p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q; p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2; p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME]; } if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) { *this_frame = *cpi->twopass_frame.stats_in; ++cpi->twopass_frame.stats_in; } set_twopass_params_based_on_fp_stats(cpi, this_frame); } static void setup_target_rate(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; GF_GROUP *const gf_group = &cpi->ppi->gf_group; int target_rate = gf_group->bit_allocation[cpi->gf_frame_index]; if (has_no_stats_stage(cpi)) { av1_rc_set_frame_target(cpi, target_rate, cpi->common.width, cpi->common.height); } rc->base_frame_target = target_rate; } static void mark_flashes(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats) { FIRSTPASS_STATS *this_stats = first_stats, *next_stats; while (this_stats < last_stats - 1) { next_stats = this_stats + 1; if (next_stats->pcnt_second_ref > next_stats->pcnt_inter && next_stats->pcnt_second_ref >= 0.5) { this_stats->is_flash = 1; } else { this_stats->is_flash = 0; } this_stats = next_stats; } // We always treat the last one as none flash. if (last_stats - 1 >= first_stats) { (last_stats - 1)->is_flash = 0; } } // Smooth-out the noise variance so it is more stable // Returns 0 on success, -1 on memory allocation failure. 
// TODO(bohanli): Use a better low-pass filter than averaging static int smooth_filter_noise(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats) { int len = (int)(last_stats - first_stats); double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise)); if (!smooth_noise) return -1; for (int i = 0; i < len; i++) { double total_noise = 0; double total_wt = 0; for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, 0), len - 1); if (first_stats[idx].is_flash) continue; total_noise += first_stats[idx].noise_var; total_wt += 1.0; } if (total_wt > 0.01) { total_noise /= total_wt; } else { total_noise = first_stats[i].noise_var; } smooth_noise[i] = total_noise; } for (int i = 0; i < len; i++) { first_stats[i].noise_var = smooth_noise[i]; } aom_free(smooth_noise); return 0; } // Estimate the noise variance of each frame from the first pass stats static void estimate_noise(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats, struct aom_internal_error_info *error_info) { FIRSTPASS_STATS *this_stats, *next_stats; double C1, C2, C3, noise; for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { this_stats->noise_var = 0.0; // flashes tend to have high correlation of innovations, so ignore them. if (this_stats->is_flash || (this_stats - 1)->is_flash || (this_stats - 2)->is_flash) continue; C1 = (this_stats - 1)->intra_error * (this_stats->intra_error - this_stats->coded_error); C2 = (this_stats - 2)->intra_error * ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error); C3 = (this_stats - 2)->intra_error * (this_stats->intra_error - this_stats->sr_coded_error); if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; C1 = sqrt(C1); C2 = sqrt(C2); C3 = sqrt(C3); noise = (this_stats - 1)->intra_error - C1 * C2 / C3; noise = AOMMAX(noise, 0.01); this_stats->noise_var = noise; } // Copy noise from the neighbor if the noise value is not trustworthy for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { if (this_stats->is_flash || (this_stats - 1)->is_flash || (this_stats - 2)->is_flash) continue; if (this_stats->noise_var < 1.0) { int found = 0; // TODO(bohanli): consider expanding to two directions at the same time for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { if (next_stats->is_flash || (next_stats - 1)->is_flash || (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) continue; found = 1; this_stats->noise_var = next_stats->noise_var; break; } if (found) continue; for (next_stats = this_stats - 1; next_stats >= first_stats + 2; next_stats--) { if (next_stats->is_flash || (next_stats - 1)->is_flash || (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) continue; this_stats->noise_var = next_stats->noise_var; break; } } } // copy the noise if this is a flash for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { if (this_stats->is_flash || (this_stats - 1)->is_flash || (this_stats - 2)->is_flash) { int found = 0; for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { if (next_stats->is_flash || (next_stats - 1)->is_flash || (next_stats - 2)->is_flash) continue; found = 1; this_stats->noise_var = next_stats->noise_var; break; } if (found) continue; for (next_stats = this_stats - 1; next_stats >= first_stats + 2; next_stats--) { if (next_stats->is_flash || (next_stats - 1)->is_flash || (next_stats - 2)->is_flash) continue; this_stats->noise_var = next_stats->noise_var; break; } } } // if we are at the first 2 frames, copy the noise for (this_stats 
= first_stats; this_stats < first_stats + 2 && (first_stats + 2) < last_stats; this_stats++) { this_stats->noise_var = (first_stats + 2)->noise_var; } if (smooth_filter_noise(first_stats, last_stats) == -1) { aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Error allocating buffers in smooth_filter_noise()"); } } // Estimate correlation coefficient of each frame with its previous frame. static void estimate_coeff(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats) { FIRSTPASS_STATS *this_stats; for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) { const double C = sqrt(AOMMAX((this_stats - 1)->intra_error * (this_stats->intra_error - this_stats->coded_error), 0.001)); const double cor_coeff = C / AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001); this_stats->cor_coeff = cor_coeff * sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001) / AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001)); // clip correlation coefficient. this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1); } first_stats->cor_coeff = 1.0; } void av1_get_second_pass_params(AV1_COMP *cpi, EncodeFrameParams *const frame_params, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; TWO_PASS *const twopass = &cpi->ppi->twopass; GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; frame_params->show_frame = !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); if (cpi->gf_frame_index == 0) { av1_tf_info_reset(&cpi->ppi->tf_info); av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); } return; } const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; int update_total_stats = 0; if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return; // Check forced key frames. const int frames_to_next_forced_key = detect_app_forced_key(cpi); if (frames_to_next_forced_key == 0) { rc->frames_to_key = 0; frame_flags &= FRAMEFLAGS_KEY; } else if (frames_to_next_forced_key > 0 && frames_to_next_forced_key < rc->frames_to_key) { rc->frames_to_key = frames_to_next_forced_key; } assert(cpi->twopass_frame.stats_in != NULL); const int update_type = gf_group->update_type[cpi->gf_frame_index]; frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { assert(cpi->gf_frame_index < gf_group->size); setup_target_rate(cpi); // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. 
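// (For ARF frames the stats are instead peeked arf_src_offset frames ahead of
// the current position, without advancing the stats pointer.)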
if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) { const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats(twopass, &cpi->twopass_frame, gf_group->arf_src_offset[cpi->gf_frame_index]); set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); return; } } if (oxcf->rc_cfg.mode == AOM_Q) rc->active_worst_quality = oxcf->rc_cfg.cq_level; if (cpi->gf_frame_index == gf_group->size) { if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) { const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1; const int frames_to_key = define_kf_interval( cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut, /*search_start_idx=*/0); if (frames_to_key != -1) rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key); } } FIRSTPASS_STATS this_frame; av1_zero(this_frame); // call above fn if (is_stat_consumption_stage(cpi)) { if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) { process_first_pass_stats(cpi, &this_frame); update_total_stats = 1; } } else { rc->active_worst_quality = oxcf->rc_cfg.cq_level; } // Keyframe and section processing. FIRSTPASS_STATS this_frame_copy; this_frame_copy = this_frame; if (rc->frames_to_key <= 0) { assert(rc->frames_to_key == 0); // Define next KF group and assign bits to it. frame_params->frame_type = KEY_FRAME; find_next_key_frame(cpi, &this_frame); this_frame = this_frame_copy; } if (rc->frames_to_fwd_kf <= 0) rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist; // Define a new GF/ARF group. (Should always enter here for key frames). if (cpi->gf_frame_index == gf_group->size) { av1_tf_info_reset(&cpi->ppi->tf_info); #if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS vbr_rc_reset_gop_data(&cpi->vbr_rc_info); #endif // CONFIG_BITRATE_ACCURACY int max_gop_length = (oxcf->gf_cfg.lag_in_frames >= 32) ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames - oxcf->algo_cfg.arnr_max_frames / 2) : MAX_GF_LENGTH_LAP; // Handle forward key frame when enabled. if (oxcf->kf_cfg.fwd_kf_dist > 0) max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length); // Use the provided gop size in low delay setting if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval; // Limit the max gop length for the last gop in 1 pass setting. max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key); // Identify regions if needed. // TODO(bohanli): identify regions for all stats available. 
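// Regions are (re)computed at the start of a key frame group, or when the
// previously analysed window (frames_till_regions_update) would run out
// within the upcoming GOP, i.e. before the next key frame or the maximum GOP
// length is reached.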
if (rc->frames_since_key == 0 || rc->frames_since_key == 1 || (p_rc->frames_till_regions_update - rc->frames_since_key < rc->frames_to_key && p_rc->frames_till_regions_update - rc->frames_since_key < max_gop_length + 1)) { // how many frames we can analyze from this frame int rest_frames = AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES); rest_frames = AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end - cpi->twopass_frame.stats_in + (rc->frames_since_key == 0))); p_rc->frames_till_regions_update = rest_frames; int ret; if (cpi->ppi->lap_enabled) { mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); estimate_noise(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end, cpi->common.error); estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames, (rc->frames_since_key == 0), p_rc->regions, &p_rc->num_regions); } else { ret = identify_regions( cpi->twopass_frame.stats_in - (rc->frames_since_key == 0), rest_frames, 0, p_rc->regions, &p_rc->num_regions); } if (ret == -1) { aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, "Error allocating buffers in identify_regions"); } } int cur_region_idx = find_regions_index(p_rc->regions, p_rc->num_regions, rc->frames_since_key - p_rc->regions_offset); if ((cur_region_idx >= 0 && p_rc->regions[cur_region_idx].type == SCENECUT_REGION) || rc->frames_since_key == 0) { // If we start from a scenecut, then the last GOP's arf boost is not // needed for this GOP. cpi->ppi->gf_state.arf_gf_boost_lst = 0; } int need_gf_len = 1; #if CONFIG_THREE_PASS if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { // set up bitstream to read if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) { cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output; } av1_open_second_pass_log(cpi, 1); THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info; // Read in GOP information from the second pass file. av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info, cpi->common.error); #if CONFIG_BITRATE_ACCURACY TPL_INFO *tpl_info; AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info, aom_malloc(sizeof(*tpl_info))); av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream, cpi->common.error); aom_free(tpl_info); #if CONFIG_THREE_PASS // TODO(angiebird): Put this part into a func cpi->vbr_rc_info.cur_gop_idx++; #endif // CONFIG_THREE_PASS #endif // CONFIG_BITRATE_ACCURACY // Read in third_pass_info from the bitstream. av1_set_gop_third_pass(cpi->third_pass_ctx); // Read in per-frame info from second-pass encoding av1_read_second_pass_per_frame_info( cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info, gop_info->num_frames, cpi->common.error); p_rc->cur_gf_index = 0; p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length; need_gf_len = 0; } #endif // CONFIG_THREE_PASS if (need_gf_len) { // If we cannot obtain GF group length from second_pass_file // TODO(jingning): Resolve the redundant calls here. 
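// (Note the "|| 1" in the condition below: the GF length recalculation is
// effectively unconditional at present.)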
if (rc->intervals_till_gf_calculate_due == 0 || 1) { calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS); } if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model && oxcf->gf_cfg.lag_in_frames >= 32 && cpi->sf.tpl_sf.gop_length_decision_method != 3) { int this_idx = rc->frames_since_key + p_rc->gf_intervals[p_rc->cur_gf_index] - p_rc->regions_offset - 1; int this_region = find_regions_index(p_rc->regions, p_rc->num_regions, this_idx); int next_region = find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1); // TODO(angiebird): Figure out why this_region and next_region are -1 in // unit test like AltRefFramePresenceTestLarge (aomedia:3134) int is_last_scenecut = p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key || (this_region != -1 && p_rc->regions[this_region].type == SCENECUT_REGION) || (next_region != -1 && p_rc->regions[next_region].type == SCENECUT_REGION); int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index]; if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 && rc->min_gf_interval <= 16) { // The calculate_gf_length function is previously used with // max_gop_length = 32 with look-ahead gf intervals. define_gf_group(cpi, frame_params, 0); av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); this_frame = this_frame_copy; if (is_shorter_gf_interval_better(cpi, frame_params)) { // A shorter gf interval is better. // TODO(jingning): Remove redundant computations here. max_gop_length = 16; calculate_gf_length(cpi, max_gop_length, 1); if (is_last_scenecut && (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) { p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int; } } } } } define_gf_group(cpi, frame_params, 0); if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE && rc->frames_since_key > 0) process_first_pass_stats(cpi, &this_frame); define_gf_group(cpi, frame_params, 1); #if CONFIG_THREE_PASS // write gop info if needed for third pass. Per-frame info is written after // each frame is encoded. av1_write_second_pass_gop_info(cpi); #endif // CONFIG_THREE_PASS av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; assert(cpi->gf_frame_index == 0); #if ARF_STATS_OUTPUT { FILE *fpfile; fpfile = fopen("arf.stt", "a"); ++arf_count; fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cpi->common.current_frame.frame_number, rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count, p_rc->gfu_boost); fclose(fpfile); } #endif } assert(cpi->gf_frame_index < gf_group->size); if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { reset_fpf_position(&cpi->twopass_frame, start_pos); const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats(twopass, &cpi->twopass_frame, gf_group->arf_src_offset[cpi->gf_frame_index]); set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); } else { // Back up this frame's stats for updating total stats during post encode. cpi->twopass_frame.this_frame = update_total_stats ? 
start_pos : NULL; } frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; setup_target_rate(cpi); } void av1_init_second_pass(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; double frame_rate; FIRSTPASS_STATS *stats; if (!twopass->stats_buf_ctx->stats_in_end) return; mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); estimate_noise(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end, cpi->common.error); estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); stats = twopass->stats_buf_ctx->total_stats; *stats = *twopass->stats_buf_ctx->stats_in_end; *twopass->stats_buf_ctx->total_left_stats = *stats; frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However, the sum duration is not. // It is calculated based on the actual durations of all frames from the // first pass. av1_new_framerate(cpi, frame_rate); twopass->bits_left = (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0); #if CONFIG_BITRATE_ACCURACY av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left, (int)round(stats->count)); #endif #if CONFIG_RATECTRL_LOG rc_log_init(&cpi->rc_log); #endif // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; // Scan the first pass file and calculate a modified total error based upon // the bias/power function used to allocate bits. { const double avg_error = stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in; double modified_error_total = 0.0; twopass->modified_error_min = (avg_error * oxcf->rc_cfg.vbrmin_section) / 100; twopass->modified_error_max = (avg_error * oxcf->rc_cfg.vbrmax_section) / 100; while (s < twopass->stats_buf_ctx->stats_in_end) { modified_error_total += calculate_modified_err(frame_info, twopass, oxcf, s); ++s; } twopass->modified_error_left = modified_error_total; } // Reset the vbr bits off target counters cpi->ppi->p_rc.vbr_bits_off_target = 0; cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; cpi->ppi->p_rc.rate_error_estimate = 0; // Static sequence monitor variables. twopass->kf_zeromotion_pct = 100; twopass->last_kfgroup_zeromotion_pct = 100; // Initialize bits per macro_block estimate correction factor. twopass->bpm_factor = 1.0; // Initialize actual and target bits counters for ARF groups so that // at the start we have a neutral bpm adjustment. twopass->rolling_arf_group_target_bits = 1; twopass->rolling_arf_group_actual_bits = 1; } void av1_init_single_pass_lap(AV1_COMP *cpi) { TWO_PASS *const twopass = &cpi->ppi->twopass; if (!twopass->stats_buf_ctx->stats_in_end) return; // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; twopass->bits_left = 0; twopass->modified_error_min = 0.0; twopass->modified_error_max = 0.0; twopass->modified_error_left = 0.0; // Reset the vbr bits off target counters cpi->ppi->p_rc.vbr_bits_off_target = 0; cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; cpi->ppi->p_rc.rate_error_estimate = 0; // Static sequence monitor variables. 
twopass->kf_zeromotion_pct = 100; twopass->last_kfgroup_zeromotion_pct = 100; // Initialize bits per macro_block estimate correction factor. twopass->bpm_factor = 1.0; // Initialize actual and target bits counters for ARF groups so that // at the start we have a neutral bpm adjustment. twopass->rolling_arf_group_target_bits = 1; twopass->rolling_arf_group_actual_bits = 1; } #define MINQ_ADJ_LIMIT 48 #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 void av1_twopass_postencode_update(AV1_COMP *cpi) { TWO_PASS *const twopass = &cpi->ppi->twopass; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; // Increment the stats_in pointer. if (is_stat_consumption_stage(cpi) && !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) && (cpi->gf_frame_index < cpi->ppi->gf_group.size || rc->frames_to_key == 0)) { const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) { FIRSTPASS_STATS this_frame; assert(cpi->twopass_frame.stats_in > twopass->stats_buf_ctx->stats_in_start); --cpi->twopass_frame.stats_in; if (cpi->ppi->lap_enabled) { input_stats_lap(twopass, &cpi->twopass_frame, &this_frame); } else { input_stats(twopass, &cpi->twopass_frame, &this_frame); } } else if (cpi->ppi->lap_enabled) { cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start; } } // VBR correction is done through rc->vbr_bits_off_target. Based on the // sign of this value, a limited % adjustment is made to the target rate // of subsequent frames, to try and push it back towards 0. This method // is designed to prevent extreme behaviour at the end of a clip // or group of frames. p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0); if (cpi->do_update_vbr_bits_off_target_fast) { // Subtract current frame's fast_extra_bits. p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits; rc->frame_level_fast_extra_bits = 0; } // Target vs actual bits for this arf group. if (twopass->rolling_arf_group_target_bits > INT_MAX - rc->base_frame_target) { twopass->rolling_arf_group_target_bits = INT_MAX; } else { twopass->rolling_arf_group_target_bits += rc->base_frame_target; } twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; // Calculate the pct rc error. if (p_rc->total_actual_bits) { p_rc->rate_error_estimate = (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits); p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100); } else { p_rc->rate_error_estimate = 0; } #if CONFIG_FPMT_TEST /* The variables temp_vbr_bits_off_target, temp_bits_left, * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits * temp_rate_error_estimate are introduced for quality simulation purpose, * it retains the value previous to the parallel encode frames. The * variables are updated based on the update flag. * * If there exist show_existing_frames between parallel frames, then to * retain the temp state do not update it. 
*/ const int simulate_parallel_frame = cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int show_existing_between_parallel_frames = (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && simulate_parallel_frame) { cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target; cpi->ppi->p_rc.temp_bits_left = twopass->bits_left; cpi->ppi->p_rc.temp_rolling_arf_group_target_bits = twopass->rolling_arf_group_target_bits; cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits = twopass->rolling_arf_group_actual_bits; cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate; } #endif // Update the active best quality pyramid. if (!rc->is_src_frame_alt_ref) { const int pyramid_level = cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; int i; for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) { p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex; #if CONFIG_TUNE_VMAF if (cpi->vmaf_info.original_qindex != -1 && (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex; } #endif } } #if 0 { AV1_COMMON *cm = &cpi->common; FILE *fpfile; fpfile = fopen("details.stt", "a"); fprintf(fpfile, "%10d %10d %10d %10" PRId64 " %10" PRId64 " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n", cm->current_frame.frame_number, rc->base_frame_target, rc->projected_frame_size, rc->total_actual_bits, rc->vbr_bits_off_target, p_rc->rate_error_estimate, twopass->rolling_arf_group_target_bits, twopass->rolling_arf_group_actual_bits, (double)twopass->rolling_arf_group_actual_bits / (double)twopass->rolling_arf_group_target_bits, twopass->bpm_factor, av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex, cm->seq_params->bit_depth), av1_convert_qindex_to_q(rc->active_worst_quality, cm->seq_params->bit_depth)); fclose(fpfile); } #endif if (cpi->common.current_frame.frame_type != KEY_FRAME) { twopass->kf_group_bits -= rc->base_frame_target; twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; } twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); // If the rate control is drifting consider adjustment to min or maxq. if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref && (p_rc->rolling_target_bits > 0)) { int minq_adj_limit; int maxq_adj_limit; minq_adj_limit = (rc_cfg->mode == AOM_CQ ? 
MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); maxq_adj_limit = (rc->worst_quality - rc->active_worst_quality); // Undershoot if ((rc_cfg->under_shoot_pct < 100) && (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) { int pct_error = ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) / p_rc->rolling_target_bits; if ((pct_error >= rc_cfg->under_shoot_pct) && (p_rc->rate_error_estimate > 0)) { twopass->extend_minq += 1; twopass->extend_maxq -= 1; } // Overshoot } else if ((rc_cfg->over_shoot_pct < 100) && (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) { int pct_error = ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) / p_rc->rolling_target_bits; pct_error = clamp(pct_error, 0, 100); if ((pct_error >= rc_cfg->over_shoot_pct) && (p_rc->rate_error_estimate < 0)) { twopass->extend_maxq += 1; twopass->extend_minq -= 1; } } twopass->extend_minq = clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit); twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); // If there is a big and unexpected undershoot then feed the extra // bits back in quickly. One situation where this may happen is if a // frame is unexpectedly almost perfectly predicted by the ARF or GF // but not very well predicted by the previous frame. if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; if (rc->projected_frame_size < fast_extra_thresh) { p_rc->vbr_bits_off_target_fast += fast_extra_thresh - rc->projected_frame_size; p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast, (4 * (int64_t)rc->avg_frame_bandwidth)); } } #if CONFIG_FPMT_TEST if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && simulate_parallel_frame) { cpi->ppi->p_rc.temp_vbr_bits_off_target_fast = p_rc->vbr_bits_off_target_fast; cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq; cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq; } #endif } // Update the frame probabilities obtained from parallel encode frames FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; #if CONFIG_FPMT_TEST /* The variable temp_active_best_quality is introduced only for quality * simulation purpose, it retains the value previous to the parallel * encode frames. The variable is updated based on the update flag. * * If there exist show_existing_frames between parallel frames, then to * retain the temp state do not update it. */ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && simulate_parallel_frame) { int i; const int pyramid_level = cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; if (!rc->is_src_frame_alt_ref) { for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) cpi->ppi->p_rc.temp_active_best_quality[i] = p_rc->active_best_quality[i]; } } // Update the frame probabilities obtained from parallel encode frames FrameProbInfo *const temp_frame_probs_simulation = simulate_parallel_frame ? &cpi->ppi->temp_frame_probs_simulation : frame_probs; FrameProbInfo *const temp_frame_probs = simulate_parallel_frame ?
&cpi->ppi->temp_frame_probs : NULL; #endif int i, j, loop; // Sequentially do average on temp_frame_probs_simulation which holds // probabilities of last frame before parallel encode for (loop = 0; loop <= cpi->num_frame_recode; loop++) { // Sequentially update tx_type_probs if (cpi->do_update_frame_probs_txtype[loop] && (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < TX_SIZES_ALL; i++) { int left = 1024; for (j = TX_TYPES - 1; j >= 0; j--) { const int new_prob = cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j]; #if CONFIG_FPMT_TEST int prob = (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob; #else int prob = (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; frame_probs->tx_type_probs[update_type][i][j] = prob; #endif } } } // Sequentially update obmc_probs if (cpi->do_update_frame_probs_obmc[loop] && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < BLOCK_SIZES_ALL; i++) { const int new_prob = cpi->frame_new_probs[loop].obmc_probs[update_type][i]; #if CONFIG_FPMT_TEST temp_frame_probs_simulation->obmc_probs[update_type][i] = (temp_frame_probs_simulation->obmc_probs[update_type][i] + new_prob) >> 1; #else frame_probs->obmc_probs[update_type][i] = (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; #endif } } // Sequentially update warped_probs if (cpi->do_update_frame_probs_warp[loop] && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type]; #if CONFIG_FPMT_TEST temp_frame_probs_simulation->warped_probs[update_type] = (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >> 1; #else frame_probs->warped_probs[update_type] = (frame_probs->warped_probs[update_type] + new_prob) >> 1; #endif } // Sequentially update switchable_interp_probs if (cpi->do_update_frame_probs_interpfilter[loop] && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int left = 1536; for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { const int new_prob = cpi->frame_new_probs[loop] .switchable_interp_probs[update_type][i][j]; #if CONFIG_FPMT_TEST int prob = (temp_frame_probs_simulation ->switchable_interp_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; temp_frame_probs_simulation ->switchable_interp_probs[update_type][i][j] = prob; #else int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + new_prob) >> 1; left -= prob; if (j == 0) prob += left; frame_probs->switchable_interp_probs[update_type][i][j] = prob; #endif } } } } #if CONFIG_FPMT_TEST // Copying temp_frame_probs_simulation to temp_frame_probs based on // the flag if (cpi->do_frame_data_update && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && simulate_parallel_frame) { for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; 
update_type_idx++) { for (i = 0; i < BLOCK_SIZES_ALL; i++) { temp_frame_probs->obmc_probs[update_type_idx][i] = temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; } temp_frame_probs->warped_probs[update_type_idx] = temp_frame_probs_simulation->warped_probs[update_type_idx]; for (i = 0; i < TX_SIZES_ALL; i++) { for (j = 0; j < TX_TYPES; j++) { temp_frame_probs->tx_type_probs[update_type_idx][i][j] = temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j]; } } for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { for (j = 0; j < SWITCHABLE_FILTERS; j++) { temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = temp_frame_probs_simulation ->switchable_interp_probs[update_type_idx][i][j]; } } } } #endif // Update framerate obtained from parallel encode frames if (cpi->common.show_frame && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) cpi->framerate = cpi->new_framerate; #if CONFIG_FPMT_TEST // SIMULATION PURPOSE int show_existing_between_parallel_frames_cndn = (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn && cpi->do_frame_data_update && simulate_parallel_frame) cpi->temp_framerate = cpi->framerate; #endif } aom-3.12.1/av1/encoder/pass2_strategy.h000066400000000000000000000074061477627663500176410ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ #define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ #ifdef __cplusplus extern "C" { #endif struct AV1_COMP; struct EncodeFrameParams; #include "av1/encoder/encoder.h" /*! * \brief accumulated stats and features in a gf group */ typedef struct { /*!\cond */ double gf_group_err; double gf_group_raw_error; double gf_group_skip_pct; double gf_group_inactive_zone_rows; double mv_ratio_accumulator; double decay_accumulator; double zero_motion_accumulator; double loop_decay_rate; double last_loop_decay_rate; double this_frame_mv_in_out; double mv_in_out_accumulator; double abs_mv_in_out_accumulator; double avg_sr_coded_error; double avg_pcnt_second_ref; double avg_new_mv_count; double avg_wavelet_energy; double avg_raw_err_stdev; int non_zero_stdev_count; /*!\endcond */ } GF_GROUP_STATS; /*! * \brief accumulated stats and features for a frame */ typedef struct { /*!\cond */ double frame_err; double frame_coded_error; double frame_sr_coded_error; /*!\endcond */ } GF_FRAME_STATS; /*!\cond */ void av1_init_second_pass(struct AV1_COMP *cpi); void av1_init_single_pass_lap(AV1_COMP *cpi); /*!\endcond */ /*!\brief Main per frame entry point for second pass of two pass encode * *\ingroup rate_control * * This function is called for each frame in the second pass of a two pass * encode. It checks the frame type and if a new KF or GF/ARF is due. * When a KF is due it calls find_next_key_frame() to work out how long * this key frame group will be and assign bits to the key frame. 
* At the start of a new GF/ARF group it calls calculate_gf_length() * and define_gf_group() which are the main functions responsible for * defining the size and structure of the new GF/ARF group. * * \param[in] cpi Top - level encoder instance structure * \param[in] frame_params Per frame encoding parameters * \param[in] frame_flags Frame type and coding flags * * \remark No return but analyses first pass stats and assigns a target * number of bits to the current frame and a target Q range. */ void av1_get_second_pass_params(struct AV1_COMP *cpi, struct EncodeFrameParams *const frame_params, unsigned int frame_flags); /*!\brief Adjustments to two pass and rate control after each frame. * *\ingroup rate_control * * This function is called after each frame to make adjustments to * heuristics and data structures that relate to rate control. * * \param[in] cpi Top - level encoder instance structure * * \remark No return value but this function updates various rate control * related data structures that for example track overshoot and * undershoot. */ void av1_twopass_postencode_update(struct AV1_COMP *cpi); int av1_calc_arf_boost(const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, int *num_fpstats_required, int project_gfu_boost); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_ aom-3.12.1/av1/encoder/pickcdef.c000066400000000000000000001234431477627663500164320ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" #include "av1/encoder/pickcdef.h" #include "av1/encoder/mcomp.h" // Get primary and secondary filter strength for the given strength index and // search method static inline void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method, int *pri_strength, int *sec_strength, int strength_idx) { const int tot_sec_filter = (pick_method == CDEF_FAST_SEARCH_LVL5) ? REDUCED_SEC_STRENGTHS_LVL5 : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? 
REDUCED_SEC_STRENGTHS_LVL3 : CDEF_SEC_STRENGTHS); const int pri_idx = strength_idx / tot_sec_filter; const int sec_idx = strength_idx % tot_sec_filter; *pri_strength = pri_idx; *sec_strength = sec_idx; if (pick_method == CDEF_FULL_SEARCH) return; switch (pick_method) { case CDEF_FAST_SEARCH_LVL1: assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1); *pri_strength = priconv_lvl1[pri_idx]; break; case CDEF_FAST_SEARCH_LVL2: assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); *pri_strength = priconv_lvl2[pri_idx]; break; case CDEF_FAST_SEARCH_LVL3: assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); *pri_strength = priconv_lvl2[pri_idx]; *sec_strength = secconv_lvl3[sec_idx]; break; case CDEF_FAST_SEARCH_LVL4: assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); *pri_strength = priconv_lvl4[pri_idx]; *sec_strength = secconv_lvl3[sec_idx]; break; case CDEF_FAST_SEARCH_LVL5: assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5); *pri_strength = priconv_lvl5[pri_idx]; *sec_strength = secconv_lvl5[sec_idx]; break; default: assert(0 && "Invalid CDEF search method"); } } // Store CDEF filter strength calculated from strength index for given search // method #define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \ do { \ get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \ (strength_idx)); \ cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \ } while (0) /* Search for the best strength to add as an option, knowing we already selected nb_strengths options. */ static uint64_t search_one(int *lev, int nb_strengths, uint64_t mse[][TOTAL_STRENGTHS], int sb_count, CDEF_PICK_METHOD pick_method) { uint64_t tot_mse[TOTAL_STRENGTHS]; const int total_strengths = nb_cdef_strengths[pick_method]; int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id = 0; memset(tot_mse, 0, sizeof(tot_mse)); for (i = 0; i < sb_count; i++) { int gi; uint64_t best_mse = (uint64_t)1 << 63; /* Find best mse among already selected options. */ for (gi = 0; gi < nb_strengths; gi++) { if (mse[i][lev[gi]] < best_mse) { best_mse = mse[i][lev[gi]]; } } /* Find best mse when adding each possible new option. */ for (j = 0; j < total_strengths; j++) { uint64_t best = best_mse; if (mse[i][j] < best) best = mse[i][j]; tot_mse[j] += best; } } for (j = 0; j < total_strengths; j++) { if (tot_mse[j] < best_tot_mse) { best_tot_mse = tot_mse[j]; best_id = j; } } lev[nb_strengths] = best_id; return best_tot_mse; } /* Search for the best luma+chroma strength to add as an option, knowing we already selected nb_strengths options. */ static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, CDEF_PICK_METHOD pick_method) { uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; int i, j; uint64_t best_tot_mse = (uint64_t)1 << 63; int best_id0 = 0; int best_id1 = 0; const int total_strengths = nb_cdef_strengths[pick_method]; memset(tot_mse, 0, sizeof(tot_mse)); for (i = 0; i < sb_count; i++) { int gi; uint64_t best_mse = (uint64_t)1 << 63; /* Find best mse among already selected options. */ for (gi = 0; gi < nb_strengths; gi++) { uint64_t curr = mse[0][i][lev0[gi]]; curr += mse[1][i][lev1[gi]]; if (curr < best_mse) { best_mse = curr; } } /* Find best mse when adding each possible new option. 
*/ for (j = 0; j < total_strengths; j++) { int k; for (k = 0; k < total_strengths; k++) { uint64_t best = best_mse; uint64_t curr = mse[0][i][j]; curr += mse[1][i][k]; if (curr < best) best = curr; tot_mse[j][k] += best; } } } for (j = 0; j < total_strengths; j++) { int k; for (k = 0; k < total_strengths; k++) { if (tot_mse[j][k] < best_tot_mse) { best_tot_mse = tot_mse[j][k]; best_id0 = j; best_id1 = k; } } } lev0[nb_strengths] = best_id0; lev1[nb_strengths] = best_id1; return best_tot_mse; } /* Search for the set of strengths that minimizes mse. */ static uint64_t joint_strength_search(int *best_lev, int nb_strengths, uint64_t mse[][TOTAL_STRENGTHS], int sb_count, CDEF_PICK_METHOD pick_method) { uint64_t best_tot_mse; int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && pick_method <= CDEF_FAST_SEARCH_LVL5); int i; best_tot_mse = (uint64_t)1 << 63; /* Greedy search: add one strength options at a time. */ for (i = 0; i < nb_strengths; i++) { best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method); } /* Trying to refine the greedy search by reconsidering each already-selected option. */ if (!fast) { for (i = 0; i < 4 * nb_strengths; i++) { int j; for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; best_tot_mse = search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method); } } return best_tot_mse; } /* Search for the set of luma+chroma strengths that minimizes mse. */ static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, int nb_strengths, uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, CDEF_PICK_METHOD pick_method) { uint64_t best_tot_mse; int i; best_tot_mse = (uint64_t)1 << 63; /* Greedy search: add one strength options at a time. */ for (i = 0; i < nb_strengths; i++) { best_tot_mse = search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method); } /* Trying to refine the greedy search by reconsidering each already-selected option. */ for (i = 0; i < 4 * nb_strengths; i++) { int j; for (j = 0; j < nb_strengths - 1; j++) { best_lev0[j] = best_lev0[j + 1]; best_lev1[j] = best_lev1[j + 1]; } best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, sb_count, pick_method); } return best_tot_mse; } static inline void init_src_params(int *src_stride, int *width, int *height, int *width_log2, int *height_log2, BLOCK_SIZE bsize) { *src_stride = block_size_wide[bsize]; *width = block_size_wide[bsize]; *height = block_size_high[bsize]; *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize]; *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize]; } #if CONFIG_AV1_HIGHBITDEPTH /* Compute MSE only on the blocks we filtered. 
*/ static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize, int coeff_shift, int row, int col) { assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || bsize == BLOCK_8X8); uint64_t sum = 0; int bi, bx, by; uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst); uint16_t *dst_buff = &dst16[row * dstride + col]; int src_stride, width, height, width_log2, height_log2; init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, bsize); for (bi = 0; bi < cdef_count; bi++) { by = dlist[bi].by; bx = dlist[bi].bx; sum += aom_mse_wxh_16bit_highbd( &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, &src[bi << (height_log2 + width_log2)], src_stride, width, height); } return sum >> 2 * coeff_shift; } #endif // Checks dual and quad block processing is applicable for block widths 8 and 4 // respectively. static inline int is_dual_or_quad_applicable(cdef_list *dlist, int width, int cdef_count, int bi, int iter) { assert(width == 8 || width == 4); const int blk_offset = (width == 8) ? 1 : 3; if ((iter + blk_offset) >= cdef_count) return 0; if (dlist[bi].by == dlist[bi + blk_offset].by && dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx) return 1; return 0; } static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize, int coeff_shift, int row, int col) { assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || bsize == BLOCK_8X8); uint64_t sum = 0; int bi, bx, by; int iter = 0; int inc = 1; uint8_t *dst8 = (uint8_t *)dst; uint8_t *dst_buff = &dst8[row * dstride + col]; int src_stride, width, height, width_log2, height_log2; init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, bsize); const int num_blks = 16 / width; for (bi = 0; bi < cdef_count; bi += inc) { by = dlist[bi].by; bx = dlist[bi].bx; uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)]; uint8_t *dst_tmp = &dst_buff[(by << height_log2) * dstride + (bx << width_log2)]; if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) { sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height); iter += num_blks; inc = num_blks; } else { sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width, height); iter += 1; inc = 1; } } return sum >> 2 * coeff_shift; } // Fill the boundary regions of the block with CDEF_VERY_LARGE, only if the // region is outside frame boundary static inline void fill_borders_for_fbs_on_frame_boundary( uint16_t *inbuf, int hfilt_size, int vfilt_size, bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary, bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) { if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary && !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary) return; if (is_fb_on_frm_bottom_boundary) { // Fill bottom region of the block const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER; fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, CDEF_VERY_LARGE); } if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) { const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE; // Fill bottom-left region of the block fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) { const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + 
CDEF_HBORDER; // Fill bottom-right region of the block fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (is_fb_on_frm_top_boundary) { // Fill top region of the block fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, CDEF_VERY_LARGE); } if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) { // Fill top-left region of the block fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) { const int buf_offset = hfilt_size + CDEF_HBORDER; // Fill top-right region of the block fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } if (is_fb_on_frm_left_boundary) { const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; // Fill left region of the block fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE); } if (is_fb_on_frm_right_boundary) { const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; // Fill right region of the block fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE); } } // Calculate the number of 8x8/4x4 filter units for which SSE can be calculated // after CDEF filtering in single function call static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units( cdef_list *dlist, int cdef_count, int bi, int subsampling_x, int subsampling_y) { // TODO(Ranjit): Extend the optimization for 422 if (subsampling_x != subsampling_y) return 1; // Combining more blocks seems to increase encode time due to increase in // control code if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by && dlist[bi].bx + 3 == dlist[bi + 3].bx) { /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific * logic if y co-ordinates match and x co-ordinates are * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */ return 4; } if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by && dlist[bi].bx + 1 == dlist[bi + 1].bx) { /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific * logic if their y co-ordinates match and x co-ordinates are * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. */ return 2; } return 1; } // Returns the block error after CDEF filtering for a given strength static inline uint64_t get_filt_error( const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd, cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer, int ref_stride, int row, int col, int pri_strength, int sec_strength, int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) { uint64_t curr_sse = 0; const BLOCK_SIZE plane_bsize = get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y); const int bw_log2 = 3 - pd->subsampling_x; const int bh_log2 = 3 - pd->subsampling_y; // TODO(Ranjit): Extend this optimization for HBD if (!cdef_search_ctx->use_highbitdepth) { // If all 8x8/4x4 blocks in CDEF block need to be filtered, calculate the // error at CDEF block level const int tot_blk_count = (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >> (bw_log2 + bh_log2); if (cdef_count == tot_blk_count) { // Calculate the offset in the buffer based on block position const FULLPEL_MV this_mv = { row, col }; const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); if (pri_strength == 0 && sec_strength == 0) { // When CDEF strength is zero, filtering is not applied. 
Hence // error is calculated between source and unfiltered pixels curr_sse = aom_sse(&ref_buffer[buf_offset], ref_stride, get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, block_size_wide[plane_bsize], block_size_high[plane_bsize]); } else { DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, dlist, cdef_count, pri_strength, sec_strength + (sec_strength == 3), cdef_search_ctx->damping, coeff_shift); curr_sse = aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8, (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize], block_size_high[plane_bsize]); } } else { // If few 8x8/4x4 blocks in CDEF block need to be filtered, filtering // functions produce 8-bit output and the error is calculated in 8-bit // domain if (pri_strength == 0 && sec_strength == 0) { int num_error_calc_filt_units = 1; for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) { const uint8_t by = dlist[bi].by; const uint8_t bx = dlist[bi].bx; const int16_t by_pos = (by << bh_log2); const int16_t bx_pos = (bx << bw_log2); // Calculate the offset in the buffer based on block position const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); num_error_calc_filt_units = get_error_calc_width_in_filt_units( dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); curr_sse += aom_sse( &ref_buffer[buf_offset], ref_stride, get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); } } else { DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, dlist, cdef_count, pri_strength, sec_strength + (sec_strength == 3), cdef_search_ctx->damping, coeff_shift); int num_error_calc_filt_units = 1; for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) { const uint8_t by = dlist[bi].by; const uint8_t bx = dlist[bi].bx; const int16_t by_pos = (by << bh_log2); const int16_t bx_pos = (bx << bw_log2); // Calculate the offset in the buffer based on block position const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos }; const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); const int tmp_buf_offset = get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2)); num_error_calc_filt_units = get_error_calc_width_in_filt_units( dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); curr_sse += aom_sse( &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset], (1 << MAX_SB_SIZE_LOG2), num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); } } } } else { DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, dlist, cdef_count, pri_strength, sec_strength + (sec_strength == 3), cdef_search_ctx->damping, coeff_shift); curr_sse = cdef_search_ctx->compute_cdef_dist_fn( ref_buffer, ref_stride, tmp_dst, dlist, cdef_count, cdef_search_ctx->bsize[pli], coeff_shift, row, col); } return curr_sse; } // Calculates MSE at block level. // Inputs: // cdef_search_ctx: Pointer to the structure containing parameters related to // CDEF search context. 
// fbr: Row index in units of 64x64 block // fbc: Column index in units of 64x64 block // Returns: // Nothing will be returned. Contents of cdef_search_ctx will be modified. void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, struct aom_internal_error_info *error_info, int fbr, int fbc, int sb_count) { // TODO(aomedia:3276): Pass error_info to the low-level functions as required // in future to handle error propagation. (void)error_info; const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params; const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref; const int coeff_shift = cdef_search_ctx->coeff_shift; const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2; const int *mi_high_l2 = cdef_search_ctx->mi_high_l2; // Declare and initialize the temporary buffers. DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER; int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); int hb_step = 1, vb_step = 1; BLOCK_SIZE bs; const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc]; uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer, ref->v_buffer }; int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride, ref->uv_stride }; if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 || mbmi->bsize == BLOCK_64X128) { bs = mbmi->bsize; if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc); hb_step = 2; } if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr); vb_step = 2; } } else { bs = BLOCK_64X64; } // Get number of 8x8 blocks which are not skip. Cdef processing happens for // 8x8 blocks which are not skip. const int cdef_count = av1_cdef_compute_sb_list( mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs); const bool is_fb_on_frm_left_boundary = (fbc == 0); const bool is_fb_on_frm_right_boundary = (fbc + hb_step == cdef_search_ctx->nhfb); const bool is_fb_on_frm_top_boundary = (fbr == 0); const bool is_fb_on_frm_bottom_boundary = (fbr + vb_step == cdef_search_ctx->nvfb); const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary); const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary); int dirinit = 0; for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) { /* We avoid filtering the pixels for which some of the pixels to average are outside the frame. We could change the filter instead, but it would add special cases for any future vectorization. 
*/ const int hfilt_size = (nhb << mi_wide_l2[pli]); const int vfilt_size = (nvb << mi_high_l2[pli]); const int ysize = vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff; const int xsize = hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff; const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli]; const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; struct macroblockd_plane pd = cdef_search_ctx->plane[pli]; cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, pd.dst.buf, row - yoff, col - xoff, pd.dst.stride, ysize, xsize); fill_borders_for_fbs_on_frame_boundary( inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary, is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary, is_fb_on_frm_bottom_boundary); for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) { int pri_strength, sec_strength; get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength, &sec_strength, gi); const uint64_t curr_mse = get_filt_error( cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli], ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count, pli, coeff_shift, bs); if (pli < 2) cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse; else cdef_search_ctx->mse[1][sb_count][gi] += curr_mse; } } cdef_search_ctx->sb_index[sb_count] = MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc; } // MSE calculation at frame level. // Inputs: // cdef_search_ctx: Pointer to the structure containing parameters related to // CDEF search context. // Returns: // Nothing will be returned. Contents of cdef_search_ctx will be modified. static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx, struct aom_internal_error_info *error_info) { // Loop over each sb. for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) { for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) { // Checks if cdef processing can be skipped for particular sb. if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue; // Calculate mse for each sb and store the relevant sb index. av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc, cdef_search_ctx->sb_count); cdef_search_ctx->sb_count++; } } } // Allocates memory for members of CdefSearchCtx. // Inputs: // cdef_search_ctx: Pointer to the structure containing parameters // related to CDEF search context. // Returns: // Nothing will be returned. Contents of cdef_search_ctx will be modified. static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) { const int nvfb = cdef_search_ctx->nvfb; const int nhfb = cdef_search_ctx->nhfb; CHECK_MEM_ERROR( cm, cdef_search_ctx->sb_index, aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0]))); cdef_search_ctx->sb_count = 0; CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0], aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1], aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); } // Deallocates the memory allocated for members of CdefSearchCtx. // Inputs: // cdef_search_ctx: Pointer to the structure containing parameters // related to CDEF search context. // Returns: // Nothing will be returned. void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) { if (cdef_search_ctx) { aom_free(cdef_search_ctx->mse[0]); cdef_search_ctx->mse[0] = NULL; aom_free(cdef_search_ctx->mse[1]); cdef_search_ctx->mse[1] = NULL; aom_free(cdef_search_ctx->sb_index); cdef_search_ctx->sb_index = NULL; } } // Initialize the parameters related to CDEF search context. 
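// The search context caches per-plane subsampling factors and filter-unit
// block sizes, the damping factor derived from the base qindex, and the
// copy/distortion function pointers, so av1_cdef_mse_calc_block() does not
// have to re-derive them for every filter block.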
// Inputs: // frame: Pointer to compressed frame buffer // ref: Pointer to the frame buffer holding the source frame // cm: Pointer to top level common structure // xd: Pointer to common current coding block structure // cdef_search_ctx: Pointer to the structure containing parameters related to // CDEF search context. // pick_method: Search method used to select CDEF parameters // Returns: // Nothing will be returned. Contents of cdef_search_ctx will be modified. static inline void cdef_params_init(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm, MACROBLOCKD *xd, CdefSearchCtx *cdef_search_ctx, CDEF_PICK_METHOD pick_method) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int num_planes = av1_num_planes(cm); cdef_search_ctx->mi_params = &cm->mi_params; cdef_search_ctx->ref = ref; cdef_search_ctx->nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; cdef_search_ctx->nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6); cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method]; cdef_search_ctx->num_planes = num_planes; cdef_search_ctx->pick_method = pick_method; cdef_search_ctx->sb_count = 0; cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth; av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, num_planes); // Initialize plane wise information. for (int pli = 0; pli < num_planes; pli++) { cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x; cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y; cdef_search_ctx->bsize[pli] = cdef_search_ctx->ydec[pli] ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8); cdef_search_ctx->mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; cdef_search_ctx->mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; cdef_search_ctx->plane[pli] = xd->plane[pli]; } // Function pointer initialization. #if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params->use_highbitdepth) { cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd; cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd; } else { cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; } #else cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; #endif } void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, int is_screen_content) { const int bd = cm->seq_params->bit_depth; const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); CdefInfo *const cdef_info = &cm->cdef_info; // Check the speed feature to avoid extra signaling. 
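// When skip_cdef is set, two strengths are signaled (cdef_bits = 1) so that
// a zero strength can be assigned to the second entry further below and
// individual superblocks may effectively opt out of filtering; otherwise a
// single strength is used and no per-superblock bits are spent.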
if (skip_cdef) { cdef_info->cdef_bits = 1; cdef_info->nb_cdef_strengths = 2; } else { cdef_info->cdef_bits = 0; cdef_info->nb_cdef_strengths = 1; } cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); int predicted_y_f1 = 0; int predicted_y_f2 = 0; int predicted_uv_f1 = 0; int predicted_uv_f2 = 0; if (is_screen_content) { predicted_y_f1 = (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02); predicted_y_f2 = (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01); predicted_uv_f1 = (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01); predicted_uv_f2 = (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+0); predicted_y_f1 = clamp(predicted_y_f1, 0, 15); predicted_y_f2 = clamp(predicted_y_f2, 0, 3); predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15); predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3); } else { if (!frame_is_intra_only(cm)) { predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f + q * 0.0068615186f + 0.02709886f), 0, 15); predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f + q * 0.0013993345f + 0.03831067f), 0, 3); predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f + q * 0.0034628846f + 0.00887099f), 0, 15); predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f + q * 0.00028223585f + 0.05576307f), 0, 3); } else { predicted_y_f1 = clamp( (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f), 0, 15); predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f + q * 0.0027798624f + 0.0079405f), 0, 3); predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f + q * 0.012892405f - 0.00748388f), 0, 15); predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f + q * 0.00035520183f + 0.00228092f), 0, 3); } } cdef_info->cdef_strengths[0] = predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2; cdef_info->cdef_uv_strengths[0] = predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; // mbmi->cdef_strength is already set in the encoding stage. We don't need to // set it again here. if (skip_cdef) { cdef_info->cdef_strengths[1] = 0; cdef_info->cdef_uv_strengths[1] = 0; return; } const CommonModeInfoParams *const mi_params = &cm->mi_params; const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; MB_MODE_INFO **mbmi = mi_params->mi_grid_base; // mbmi is NULL when real-time rate control library is used. 
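// Otherwise reset the per-64x64 cdef_strength index to 0 so that the single
// frame-level strength computed above is applied uniformly.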
if (!mbmi) return; for (int r = 0; r < nvfb; ++r) { for (int c = 0; c < nhfb; ++c) { MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c]; current_mbmi->cdef_strength = 0; } mbmi += MI_SIZE_64X64 * mi_params->mi_stride; } } void av1_cdef_search(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control; assert(cdef_control != CDEF_NONE); // For CDEF_ADAPTIVE, turning off CDEF around qindex 32 was best for still // pictures if ((cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) || (cdef_control == CDEF_ADAPTIVE && cpi->oxcf.mode == ALLINTRA && (cpi->oxcf.rc_cfg.mode == AOM_Q || cpi->oxcf.rc_cfg.mode == AOM_CQ) && cpi->oxcf.rc_cfg.cq_level <= 32)) { CdefInfo *const cdef_info = &cm->cdef_info; cdef_info->nb_cdef_strengths = 1; cdef_info->cdef_bits = 0; cdef_info->cdef_strengths[0] = 0; cdef_info->cdef_uv_strengths[0] = 0; return; } // Indicate if external RC is used for testing const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl; if (rtc_ext_rc) { av1_pick_cdef_from_qp(cm, 0, 0); return; } CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method; if (pick_method == CDEF_PICK_FROM_Q) { const int use_screen_content_model = cm->quant_params.base_qindex > AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh, cpi->rc.best_quality + 5) && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb, use_screen_content_model); return; } const CommonModeInfoParams *const mi_params = &cm->mi_params; const int damping = 3 + (cm->quant_params.base_qindex >> 6); const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && pick_method <= CDEF_FAST_SEARCH_LVL5); const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &cpi->td.mb.e_mbd; if (!cpi->cdef_search_ctx) CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx, aom_malloc(sizeof(*cpi->cdef_search_ctx))); CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx; // Initialize parameters related to CDEF search context. cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx, pick_method); // Allocate CDEF search context buffers. cdef_alloc_data(cm, cdef_search_ctx); // Frame level mse calculation. if (cpi->mt_info.num_workers > 1) { av1_cdef_mse_calc_frame_mt(cpi); } else { cdef_mse_calc_frame(cdef_search_ctx, cm->error); } /* Search for different number of signaling bits. */ int nb_strength_bits = 0; uint64_t best_rd = UINT64_MAX; CdefInfo *const cdef_info = &cm->cdef_info; int sb_count = cdef_search_ctx->sb_count; uint64_t(*mse[2])[TOTAL_STRENGTHS]; mse[0] = cdef_search_ctx->mse[0]; mse[1] = cdef_search_ctx->mse[1]; /* Calculate the maximum number of bits required to signal CDEF strengths at * block level */ const int total_strengths = nb_cdef_strengths[pick_method]; const int joint_strengths = num_planes > 1 ? total_strengths * total_strengths : total_strengths; const int max_signaling_bits = joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1; int rdmult = cpi->td.mb.rdmult; for (int i = 0; i <= 3; i++) { if (i > max_signaling_bits) break; int best_lev0[CDEF_MAX_STRENGTHS] = { 0 }; int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; const int nb_strengths = 1 << i; uint64_t tot_mse; if (num_planes > 1) { tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, mse, sb_count, pick_method); } else { tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, pick_method); } const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * (num_planes > 1 ? 
2 : 1); const int rate_cost = av1_cost_literal(total_bits); const uint64_t dist = tot_mse * 16; const uint64_t rd = RDCOST(rdmult, rate_cost, dist); if (rd < best_rd) { best_rd = rd; nb_strength_bits = i; memcpy(cdef_info->cdef_strengths, best_lev0, nb_strengths * sizeof(best_lev0[0])); if (num_planes > 1) { memcpy(cdef_info->cdef_uv_strengths, best_lev1, nb_strengths * sizeof(best_lev1[0])); } } } cdef_info->cdef_bits = nb_strength_bits; cdef_info->nb_cdef_strengths = 1 << nb_strength_bits; for (int i = 0; i < sb_count; i++) { uint64_t best_mse = UINT64_MAX; int best_gi = 0; for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) { uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]]; if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]]; if (curr < best_mse) { best_gi = gi; best_mse = curr; } } mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength = best_gi; } if (fast) { for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) { const int luma_strength = cdef_info->cdef_strengths[j]; const int chroma_strength = cdef_info->cdef_uv_strengths[j]; int pri_strength, sec_strength; STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method, luma_strength); STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method, chroma_strength); } } // For CDEF_ADAPTIVE, set primary and secondary CDEF at reduced strength for // qindexes 33 through 220. // Note 1: for odd strengths, the 0.5 discarded by ">> 1" is a significant // part of the strength when the strength is small, and because there are // few strength levels, odd strengths are reduced significantly more than a // half. This is intended behavior for reduced strength. // For example: a pri strength of 3 becomes 1, and a sec strength of 1 // becomes 0. // Note 2: a (signaled) sec strength value of 3 is special as it results in an // actual sec strength of 4. We tried adding +1 to the sec strength 3 so it // maps to a reduced sec strength of 2. However, on Daala's subset1, the // resulting SSIMULACRA 2 scores were either exactly the same (at cpu-used 6), // or within noise level (at cpu-used 3). Given that there were no discernible // improvements, this special mapping was left out for reduced strength. if (cdef_control == CDEF_ADAPTIVE && cpi->oxcf.mode == ALLINTRA && (cpi->oxcf.rc_cfg.mode == AOM_Q || cpi->oxcf.rc_cfg.mode == AOM_CQ) && cpi->oxcf.rc_cfg.cq_level <= 220) { for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) { const int luma_strength = cdef_info->cdef_strengths[j]; const int chroma_strength = cdef_info->cdef_uv_strengths[j]; const int new_pri_luma_strength = (luma_strength / CDEF_SEC_STRENGTHS) >> 1; const int new_sec_luma_strength = (luma_strength % CDEF_SEC_STRENGTHS) >> 1; const int new_pri_chroma_strength = (chroma_strength / CDEF_SEC_STRENGTHS) >> 1; const int new_sec_chroma_strength = (chroma_strength % CDEF_SEC_STRENGTHS) >> 1; cdef_info->cdef_strengths[j] = new_pri_luma_strength * CDEF_SEC_STRENGTHS + new_sec_luma_strength; cdef_info->cdef_uv_strengths[j] = new_pri_chroma_strength * CDEF_SEC_STRENGTHS + new_sec_chroma_strength; } } cdef_info->cdef_damping = damping; // Deallocate CDEF search context buffers. av1_cdef_dealloc_data(cdef_search_ctx); } aom-3.12.1/av1/encoder/pickcdef.h000066400000000000000000000212241477627663500164310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PICKCDEF_H_ #define AOM_AV1_ENCODER_PICKCDEF_H_ #include "av1/common/cdef.h" #include "av1/encoder/speed_features.h" #ifdef __cplusplus extern "C" { #endif /*!\enum CDEF_CONTROL * \brief This enum controls to which frames CDEF is applied. */ typedef enum { CDEF_NONE = 0, /* Disable CDEF on all frames. */ CDEF_ALL = 1, /* Enable CDEF for all frames. */ CDEF_REFERENCE = 2, /* Disable CDEF on non reference frames. */ CDEF_ADAPTIVE = 3, /* Enable CDEF adaptively based on frame qindex */ } CDEF_CONTROL; /*!\cond */ struct MultiThreadInfo; #define REDUCED_PRI_STRENGTHS_LVL1 8 #define REDUCED_PRI_STRENGTHS_LVL2 5 #define REDUCED_SEC_STRENGTHS_LVL3 2 #define REDUCED_SEC_STRENGTHS_LVL5 1 #define REDUCED_PRI_STRENGTHS_LVL4 2 #define REDUCED_TOTAL_STRENGTHS_LVL1 \ (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS) #define REDUCED_TOTAL_STRENGTHS_LVL2 \ (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS) #define REDUCED_TOTAL_STRENGTHS_LVL3 \ (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3) #define REDUCED_TOTAL_STRENGTHS_LVL4 \ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3) #define REDUCED_TOTAL_STRENGTHS_LVL5 \ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5) #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS) static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3, 5, 7, 10, 13 }; static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 }; static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 }; static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 }; static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 }; static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 }; static const int nb_cdef_strengths[CDEF_PICK_METHODS] = { TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2, REDUCED_TOTAL_STRENGTHS_LVL3, REDUCED_TOTAL_STRENGTHS_LVL4, REDUCED_TOTAL_STRENGTHS_LVL5, TOTAL_STRENGTHS }; typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize); typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src, cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize, int coeff_shift, int row, int col); /*! \brief CDEF search context. */ typedef struct { /*! * Pointer to the frame buffer holding the source frame */ const YV12_BUFFER_CONFIG *ref; /*! * Pointer to params related to MB_MODE_INFO arrays and related info */ CommonModeInfoParams *mi_params; /*! * Info specific to each plane */ struct macroblockd_plane plane[MAX_MB_PLANE]; /*! * Function pointer of copy_fn */ copy_fn_t copy_fn; /*! * Function pointer of compute_cdef_dist_fn */ compute_cdef_dist_t compute_cdef_dist_fn; /*! * Number of strenghts evaluated in CDEF filter search */ int total_strengths; /*! * Bit-depth dependent shift */ int coeff_shift; /*! * CDEF damping factor */ int damping; /*! * Search method used to select CDEF parameters */ int pick_method; /*! * Number of planes */ int num_planes; /*! * Log2 of width of the MI unit in pixels. mi_wide_l2[i] * indicates the width of the MI unit in pixels for the ith plane */ int mi_wide_l2[MAX_MB_PLANE]; /*! 
* Log2 of height of the MI unit in pixels. mi_high_l2[i] * indicates the height of the MI unit in pixels for the ith plane */ int mi_high_l2[MAX_MB_PLANE]; /*! * Subsampling in x direction. xdec[i] indicates the subsampling * for the ith plane */ int xdec[MAX_MB_PLANE]; /*! * Subsampling in y direction. ydec[i] indicates the subsampling * for the ith plane */ int ydec[MAX_MB_PLANE]; /*! * bsize[i] indicates the block size of ith plane */ int bsize[MAX_MB_PLANE]; /*! * Number of 64x64 blocks in vertical direction of a frame */ int nvfb; /*! * Number of 64x64 blocks in horizontal direction of a frame */ int nhfb; /*! * Pointer to the mean squared error between the CDEF filtered block and the * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength * index */ uint64_t (*mse[2])[TOTAL_STRENGTHS]; /*! * Holds the position (in units of mi's) of the cdef filtered * block in raster scan order */ int *sb_index; /*! * Holds the count of cdef filtered blocks */ int sb_count; /*! * Indicates if 16bit frame buffers are to be used i.e., the content bit-depth * is > 8-bit */ bool use_highbitdepth; } CdefSearchCtx; static inline int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col) { const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); const int stride = mi_params->mi_stride; MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col; for (int r = 0; r < maxr; ++r, mbmi += stride) { for (int c = 0; c < maxc; ++c) { if (!mbmi[c]->skip_txfm) return 0; } } return 1; } // Checks if cdef processing can be skipped for particular sb. // Inputs: // cdef_search_ctx: Pointer to the structure containing parameters related to // CDEF search context. // fbr: Row index in units of 64x64 block // fbc: Column index in units of 64x64 block // Returns: // 1/0 will be returned to indicate skip/don't skip cdef processing of sb // respectively. static inline int cdef_sb_skip(const CommonModeInfoParams *const mi_params, int fbr, int fbc) { const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc]; // No filtering if the entire filter block is skipped. if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) return 1; // Skip odd numbered 64x64 block rows(cols) when bsize is BLOCK_128X128, // BLOCK_64X128(BLOCK_128X128, BLOCK_128X64) as for such blocks CDEF filtering // is done at the corresponding block sizes. if (((fbc & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || ((fbr & 1) && (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) return 1; return 0; } void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx); void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, struct aom_internal_error_info *error_info, int fbr, int fbc, int sb_count); /*!\endcond */ /*!\brief AV1 CDEF parameter search * * \ingroup in_loop_cdef * * Searches for optimal CDEF parameters for frame * * \param[in,out] cpi Top level encoder structure * * \remark Nothing is returned. Instead, optimal CDEF parameters are stored * in the \c cdef_info structure of type \ref CdefInfo inside \c cm: * \arg \c cdef_bits: Bits of strength parameters * \arg \c nb_cdef_strengths: Number of strength parameters * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters * for the luma plane. 
* \arg \c uv_cdef_strengths: list of \c nb_cdef_strengths strength parameters * for the chroma planes. * \arg \c damping_factor: CDEF damping factor. * */ void av1_cdef_search(struct AV1_COMP *cpi); /*!\brief AV1 CDEF level from QP * * \ingroup in_loop_cdef * * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode. * * \param[in,out] cm Pointer to top level common structure * \param[in] skip_cdef Flag to skip CDEF filtering * \param[in] is_screen_content Flag indicating screen content * */ void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, int is_screen_content); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PICKCDEF_H_ aom-3.12.1/av1/encoder/picklpf.c000066400000000000000000000342101477627663500163030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/psnr.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/quant_common.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" // AV1 loop filter applies to the whole frame according to mi_rows and mi_cols, // which are calculated based on aligned width and aligned height, // In addition, if super res is enabled, it copies the whole frame // according to the aligned width and height (av1_superres_upscale()). // So we need to copy the whole filtered region, instead of the cropped region. // For example, input image size is: 160x90. // Then src->y_crop_width = 160, src->y_crop_height = 90. // The aligned frame size is: src->y_width = 160, src->y_height = 96. // AV1 aligns frame size to a multiple of 8, if there is // chroma subsampling, it is able to ensure the chroma is also // an integer number of mi units. mi unit is 4x4, 8 = 4 * 2, and 2 luma mi // units correspond to 1 chroma mi unit if there is subsampling. // See: aom_realloc_frame_buffer() in yv12config.c. static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int plane) { switch (plane) { case 0: aom_yv12_copy_y(src_bc, dst_bc, 0); break; case 1: aom_yv12_copy_u(src_bc, dst_bc, 0); break; case 2: aom_yv12_copy_v(src_bc, dst_bc, 0); break; default: assert(plane >= 0 && plane <= 2); break; } } static int get_max_filter_level(const AV1_COMP *cpi) { if (is_stat_consumption_stage_twopass(cpi)) { return cpi->ppi->twopass.section_intra_rating > 8 ? 
MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } } static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, AV1_COMP *const cpi, int filt_level, int partial_frame, int plane, int dir) { MultiThreadInfo *const mt_info = &cpi->mt_info; int num_workers = mt_info->num_mod_workers[MOD_LPF]; AV1_COMMON *const cm = &cpi->common; int64_t filt_err; assert(plane >= 0 && plane <= 2); int filter_level[2] = { filt_level, filt_level }; if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; // set base filters for use of get_filter_level (av1_loopfilter.c) when in // DELTA_LF mode switch (plane) { case 0: cm->lf.filter_level[0] = filter_level[0]; cm->lf.filter_level[1] = filter_level[1]; break; case 1: cm->lf.filter_level_u = filter_level[0]; break; case 2: cm->lf.filter_level_v = filter_level[0]; break; } // lpf_opt_level = 1 : Enables dual/quad loop-filtering. int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf); av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, plane + 1, partial_frame, mt_info->workers, num_workers, &mt_info->lf_row_sync, lpf_opt_level); filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, cm->seq_params->use_highbitdepth); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); return filt_err; } static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int partial_frame, const int *last_frame_filter_level, int plane, int dir) { const AV1_COMMON *const cm = &cpi->common; const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); int filt_direction = 0; int64_t best_err; int filt_best; // Start the search at the previous frame filter level unless it is now out of // range. int lvl; switch (plane) { case 0: switch (dir) { case 2: lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> 1; break; case 0: case 1: lvl = last_frame_filter_level[dir]; break; default: assert(dir >= 0 && dir <= 2); return 0; } break; case 1: lvl = last_frame_filter_level[2]; break; case 2: lvl = last_frame_filter_level[3]; break; default: assert(plane >= 0 && plane <= 2); return 0; } int filt_mid = clamp(lvl, min_filter_level, max_filter_level); int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search; assert(use_coarse_search <= 1); static const int min_filter_step_lookup[2] = { 0, 2 }; // min_filter_step_thesh determines the stopping criteria for the search. // The search is terminated when filter_step equals min_filter_step_thesh. const int min_filter_step_thesh = min_filter_step_lookup[use_coarse_search]; // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); filt_best = filt_mid; ss_err[filt_mid] = best_err; while (filter_step > min_filter_step_thesh) { const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); // Bias against raising loop filter in favor of lowering it. 
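// For illustration (not part of the upstream source): with filt_mid = 16 the
// shift below is 15 - (16 / 8) = 13, so bias ~= (best_err / 8192) * filter_step.
// Larger filt_mid values shift by fewer bits, giving a larger bias and hence a
// stronger preference for keeping or lowering the filter level over raising it.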
int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; if ((is_stat_consumption_stage_twopass(cpi)) && (cpi->ppi->twopass.section_intra_rating < 20)) bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20; // yx, bias less for large block size if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score if (ss_err[filt_low] < 0) { ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); } // If value is close to the best so far then bias towards a lower loop // filter value. if (ss_err[filt_low] < (best_err + bias)) { // Was it actually better than the previous best? if (ss_err[filt_low] < best_err) { best_err = ss_err[filt_low]; } filt_best = filt_low; } } // Now look at filt_high if (filt_direction >= 0 && filt_high != filt_mid) { if (ss_err[filt_high] < 0) { ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); } // If value is significantly better than previous best, bias added against // raising filter value if (ss_err[filt_high] < (best_err - bias)) { best_err = ss_err[filt_high]; filt_best = filt_high; } } // Half the step distance if the best filter value was the same as last time if (filt_best == filt_mid) { filter_step /= 2; filt_direction = 0; } else { filt_direction = (filt_best < filt_mid) ? -1 : 1; filt_mid = filt_best; } } return filt_best; } void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, LPF_PICK_METHOD method) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); struct loopfilter *const lf = &cm->lf; int disable_filter_rt_screen = 0; (void)sd; // Enable loop filter sharpness only for allintra encoding mode, // as frames do not have to serve as references to others lf->sharpness_level = cpi->oxcf.mode == ALLINTRA ? cpi->oxcf.algo_cfg.sharpness : 0; if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->sf.rt_sf.skip_lf_screen) disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi); if (disable_filter_rt_screen || cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE || (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame)) { lf->filter_level[0] = 0; lf->filter_level[1] = 0; return; } if (method == LPF_PICK_MINIMAL_LPF) { lf->filter_level[0] = 0; lf->filter_level[1] = 0; } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, seq_params->bit_depth); // based on tests result for rtc test set // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point const int strength_boost_q_treshold = 0; int inter_frame_multiplier = (q > strength_boost_q_treshold || (cpi->sf.rt_sf.use_nonrd_pick_mode && cpi->common.width * cpi->common.height > 352 * 288)) ? 12034 : 6017; // Increase strength on base TL0 for temporal layers, for low-resoln, // based on frame source_sad. 
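// For illustration (assuming the default inter_frame_multiplier of 12034): the
// adjustments below give 12034 << 1 = 24068 when frame_source_sad > 100000 and
// 3 * (12034 >> 1) = 18051 when frame_source_sad > 50000, i.e. roughly 2x and
// 1.5x the base multiplier.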
if (cpi->svc.number_temporal_layers > 1 && cpi->svc.temporal_layer_id == 0 && cpi->common.width * cpi->common.height <= 352 * 288 && cpi->sf.rt_sf.use_nonrd_pick_mode) { if (cpi->rc.frame_source_sad > 100000) inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); } else if (cpi->sf.rt_sf.use_fast_fixed_part) { inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: // Keyframes: filt_guess = q * 0.06699 - 1.60817 // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 // // And high bit depth separately: // filt_guess = q * 0.316206 + 3.87252 int filt_guess; switch (seq_params->bit_depth) { case AOM_BITS_8: filt_guess = (cm->current_frame.frame_type == KEY_FRAME) ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); break; case AOM_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; case AOM_BITS_12: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 " "or AOM_BITS_12"); return; } if (seq_params->bit_depth != AOM_BITS_8 && cm->current_frame.frame_type == KEY_FRAME) filt_guess -= 4; // TODO(chengchen): retrain the model for Y, U, V filter levels lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY && !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) { if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { lf->filter_level[0] = 0; lf->filter_level[1] = 0; } else { const int num4x4 = (cm->width >> 2) * (cm->height >> 2); const int newmv_thresh = 7; const int distance_since_key_thresh = 5; if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) < newmv_thresh && cpi->rc.frames_since_key > distance_since_key_thresh) { lf->filter_level[0] = 0; lf->filter_level[1] = 0; } } } } else { int last_frame_filter_level[4] = { 0 }; if (!frame_is_intra_only(cm)) { last_frame_filter_level[0] = cpi->ppi->filter_level[0]; last_frame_filter_level[1] = cpi->ppi->filter_level[1]; last_frame_filter_level[2] = cpi->ppi->filter_level_u; last_frame_filter_level[3] = cpi->ppi->filter_level_v; } // The frame buffer last_frame_uf is used to store the non-loop filtered // reconstructed frame in search_filter_level(). 
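// This branch runs the actual filter-level search (LPF_PICK_FROM_SUBIMAGE or
// the LPF_PICK_FROM_FULL_IMAGE variants): search_filter_level() saves the
// unfiltered reconstruction into last_frame_uf and try_filter_frame() restores
// it after every trial filter level, hence the allocation below.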
if (aom_realloc_frame_buffer( &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); lf->filter_level[0] = lf->filter_level[1] = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, 0, 2); if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) { lf->filter_level[0] = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, 0, 0); lf->filter_level[1] = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, 0, 1); } if (num_planes > 1) { lf->filter_level_u = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, 1, 0); lf->filter_level_v = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, 2, 0); } } } aom-3.12.1/av1/encoder/picklpf.h000066400000000000000000000157711477627663500163230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PICKLPF_H_ #define AOM_AV1_ENCODER_PICKLPF_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/encoder.h" struct yv12_buffer_config; struct AV1_COMP; /*!\brief Algorithm for AV1 loop filter level selection. * * \ingroup in_loop_filter * This function determines proper filter levels used for in-loop filter * (deblock filter). * * \param[in] sd The pointer of frame buffer * \param[in] cpi Top-level encoder structure * \param[in] method The method used to select filter levels * * \par * method includes: * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values. * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search * with non-dual filter only. * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with * different values. * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last * frame * * \remark Nothing is returned. Instead, filter levels below are stored in the * "loopfilter" structure inside "cpi": * \arg \c filter_level[0]: the vertical filter level for Y plane * \arg \c filter_level[1]: the horizontal filter level for Y plane * \arg \c filter_level_u: the filter level for U plane * \arg \c filter_level_v: the filter level for V plane * * \n * \b Overview * \par * The workflow of deblock filter is shown in Fig.1. \n * Boundary pixels pass through a non-flatness check, followed by a step that * determines smoothness and selects proper types of filters * (4-, 6-, 8-, 14-tap filter). \n * If non-flatness criteria is not satisfied, the encoder will not apply * deblock filtering on these boundary pixels. * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70% * * \par * The non-flatness is determined by the boundary pixels and thresholds as shown * in Fig.2. 
\n * Filtering is applied when \n * \f$|p_0-p_1| 700 ? 0.04590 : 0.02295 \n * For 10 bit and 12 bit: \n * filt_guess = q * 0.316206 + 3.87252 \n * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v = * clamp(filt_guess, min_filter_level, max_filter_level) \n * Where min_filter_level = 0, max_filter_level = 64 \n * The equations were determined by linear fitting using filter levels * generated by "LPF_PICK_FROM_FULL_IMAGE" method. * */ void av1_pick_filter_level(const struct yv12_buffer_config *sd, struct AV1_COMP *cpi, LPF_PICK_METHOD method); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PICKLPF_H_ aom-3.12.1/av1/encoder/pickrst.c000066400000000000000000002557301477627663500163460ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "config/aom_scale_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" #include "aom_dsp/mathutils.h" #include "aom_dsp/psnr.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/quant_common.h" #include "av1/common/restoration.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/pickrst.h" // Number of Wiener iterations #define NUM_WIENER_ITERS 5 // Penalty factor for use of dual sgr #define DUAL_SGR_PENALTY_MULT 0.01 // Working precision for Wiener filter coefficients #define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16) #define SGRPROJ_EP_GRP1_START_IDX 0 #define SGRPROJ_EP_GRP1_END_IDX 9 #define SGRPROJ_EP_GRP1_SEARCH_COUNT 4 #define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2 static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6, 9 }; static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = { { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 }, { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 } }; #if DEBUG_LR_COSTING RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; #endif // DEBUG_LR_COSTING typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a, int hstart, int width, int vstart, int height); #if CONFIG_AV1_HIGHBITDEPTH #define NUM_EXTRACTORS (3 * (1 + 1)) #else #define NUM_EXTRACTORS 3 #endif static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { aom_get_y_sse_part, aom_get_u_sse_part, aom_get_v_sse_part, #if CONFIG_AV1_HIGHBITDEPTH aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part, #endif }; static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = { aom_get_y_var, aom_get_u_var, aom_get_v_var, #if CONFIG_AV1_HIGHBITDEPTH aom_highbd_get_y_var, aom_highbd_get_u_var, 
aom_highbd_get_v_var, #endif }; static int64_t sse_restoration_unit(const RestorationTileLimits *limits, const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, int plane, int highbd) { return sse_part_extractors[3 * highbd + plane]( src, dst, limits->h_start, limits->h_end - limits->h_start, limits->v_start, limits->v_end - limits->v_start); } static uint64_t var_restoration_unit(const RestorationTileLimits *limits, const YV12_BUFFER_CONFIG *src, int plane, int highbd) { return var_part_extractors[3 * highbd + plane]( src, limits->h_start, limits->h_end - limits->h_start, limits->v_start, limits->v_end - limits->v_start); } typedef struct { const YV12_BUFFER_CONFIG *src; YV12_BUFFER_CONFIG *dst; const AV1_COMMON *cm; const MACROBLOCK *x; int plane; int plane_w; int plane_h; RestUnitSearchInfo *rusi; // Speed features const LOOP_FILTER_SPEED_FEATURES *lpf_sf; uint8_t *dgd_buffer; int dgd_stride; const uint8_t *src_buffer; int src_stride; // SSE values for each restoration mode for the current RU // These are saved by each search function for use in search_switchable() int64_t sse[RESTORE_SWITCHABLE_TYPES]; // This flag will be set based on the speed feature // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning. uint8_t skip_sgr_eval; // Total rate and distortion so far for each restoration type // These are initialised by reset_rsc in search_rest_type int64_t total_sse[RESTORE_TYPES]; int64_t total_bits[RESTORE_TYPES]; // Reference parameters for delta-coding // // For each restoration type, we need to store the latest parameter set which // has been used, so that we can properly cost up the next parameter set. // Note that we have two sets of these - one for the single-restoration-mode // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ) // and one for the switchable mode. This is because these two cases can lead // to different sets of parameters being signaled, but we don't know which // we will pick for sure until the end of the search process. WienerInfo ref_wiener; SgrprojInfo ref_sgrproj; WienerInfo switchable_ref_wiener; SgrprojInfo switchable_ref_sgrproj; // Buffers used to hold dgd-avg and src-avg data respectively during SIMD // call of Wiener filter. 
int16_t *dgd_avg; int16_t *src_avg; } RestSearchCtxt; static inline void rsc_on_tile(void *priv) { RestSearchCtxt *rsc = (RestSearchCtxt *)priv; set_default_wiener(&rsc->ref_wiener); set_default_sgrproj(&rsc->ref_sgrproj); set_default_wiener(&rsc->switchable_ref_wiener); set_default_sgrproj(&rsc->switchable_ref_sgrproj); } static inline void reset_rsc(RestSearchCtxt *rsc) { memset(rsc->total_sse, 0, sizeof(rsc->total_sse)); memset(rsc->total_bits, 0, sizeof(rsc->total_bits)); } static inline void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm, const MACROBLOCK *x, const LOOP_FILTER_SPEED_FEATURES *lpf_sf, int plane, RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { rsc->src = src; rsc->dst = dst; rsc->cm = cm; rsc->x = x; rsc->plane = plane; rsc->rusi = rusi; rsc->lpf_sf = lpf_sf; const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; const int is_uv = plane != AOM_PLANE_Y; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); assert(plane_w == src->crop_widths[is_uv]); assert(plane_h == src->crop_heights[is_uv]); assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); rsc->plane_w = plane_w; rsc->plane_h = plane_h; rsc->src_buffer = src->buffers[plane]; rsc->src_stride = src->strides[is_uv]; rsc->dgd_buffer = dgd->buffers[plane]; rsc->dgd_stride = dgd->strides[is_uv]; } static int64_t try_restoration_unit(const RestSearchCtxt *rsc, const RestorationTileLimits *limits, const RestorationUnitInfo *rui) { const AV1_COMMON *const cm = rsc->cm; const int plane = rsc->plane; const int is_uv = plane > 0; const RestorationInfo *rsi = &cm->rst_info[plane]; RestorationLineBuffers rlbs; const int bit_depth = cm->seq_params->bit_depth; const int highbd = cm->seq_params->use_highbitdepth; const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be // also used in encoder. 
const int optimized_lr = 0; av1_loop_restoration_filter_unit( limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h, is_uv && cm->seq_params->subsampling_x, is_uv && cm->seq_params->subsampling_y, highbd, bit_depth, fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error); return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); } int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int i, j; const uint8_t *src = src8; const uint8_t *dat = dat8; int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); int32_t v = u << SGRPROJ_PRJ_BITS; v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; } } else if (params->r[0] > 0) { for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); int32_t v = u << SGRPROJ_PRJ_BITS; v += xq[0] * (flt0[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; } } else if (params->r[1] > 0) { for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); int32_t v = u << SGRPROJ_PRJ_BITS; v += xq[1] * (flt1[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt1 += flt1_stride; } } else { for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int32_t e = (int32_t)(dat[j]) - src[j]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } } return err; } #if CONFIG_AV1_HIGHBITDEPTH int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int i, j; int64_t err = 0; const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); if (params->r[0] > 0 && params->r[1] > 0) { int xq0 = xq[0]; int xq1 = xq[1]; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int32_t d = dat[j]; const int32_t s = src[j]; const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); int32_t v0 = flt0[j] - u; int32_t v1 = flt1[j] - u; int32_t v = half; v += xq0 * v0; v += xq1 * v1; const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; err += ((int64_t)e * e); } dat += dat_stride; flt0 += flt0_stride; flt1 += flt1_stride; src += src_stride; } } else if (params->r[0] > 0 || params->r[1] > 0) { int exq; int32_t *flt; int flt_stride; if (params->r[0] > 0) { exq = xq[0]; flt = flt0; flt_stride = flt0_stride; } else { 
exq = xq[1]; flt = flt1; flt_stride = flt1_stride; } for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int32_t d = dat[j]; const int32_t s = src[j]; const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); int32_t v = half; v += exq * (flt[j] - u); const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; err += ((int64_t)e * e); } dat += dat_stride; flt += flt_stride; src += src_stride; } } else { for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int32_t d = dat[j]; const int32_t s = src[j]; const int32_t e = d - s; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } } return err; } #endif // CONFIG_AV1_HIGHBITDEPTH static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int *xqd, const sgr_params_type *params) { int xq[2]; av1_decode_xq(xqd, xq, params); #if CONFIG_AV1_HIGHBITDEPTH if (use_highbitdepth) { return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, xq, params); } else { return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, xq, params); } #else (void)use_highbitdepth; return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, xq, params); #endif } #define USE_SGRPROJ_REFINEMENT_SEARCH 1 static int64_t finer_search_pixel_proj_error( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, const sgr_params_type *params) { int64_t err = get_pixel_proj_error( src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, flt0_stride, flt1, flt1_stride, xqd, params); (void)start_step; #if USE_SGRPROJ_REFINEMENT_SEARCH int64_t err2; int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; for (int s = start_step; s >= 1; s >>= 1) { for (int p = 0; p < 2; ++p) { if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { continue; } int skip = 0; do { if (xqd[p] - s >= tap_min[p]) { xqd[p] -= s; err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] += s; } else { err = err2; skip = 1; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); if (skip) break; do { if (xqd[p] + s <= tap_max[p]) { xqd[p] += s; err2 = get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, flt0_stride, flt1, flt1_stride, xqd, params); if (err2 > err) { xqd[p] -= s; } else { err = err2; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); } } #endif // USE_SGRPROJ_REFINEMENT_SEARCH return err; } static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { if (dividend < 0) return (dividend - divisor / 2) / divisor; else return (dividend + divisor / 2) / divisor; } static inline void calc_proj_params_r0_r1_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, 
int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; H[0][0] += (int64_t)f1 * f1; H[1][1] += (int64_t)f2 * f2; H[0][1] += (int64_t)f1 * f2; C[0] += (int64_t)f1 * s; C[1] += (int64_t)f2 * s; } } H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r0_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; H[0][0] += (int64_t)f1 * f1; H[1][1] += (int64_t)f2 * f2; H[0][1] += (int64_t)f1 * f2; C[0] += (int64_t)f1 * s; C[1] += (int64_t)f2 * s; } } H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r0_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; H[0][0] += (int64_t)f1 * f1; C[0] += (int64_t)f1 * s; } } H[0][0] /= size; C[0] /= size; } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r0_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; H[0][0] += (int64_t)f1 * f1; C[0] += (int64_t)f1 * s; } } H[0][0] /= size; C[0] /= size; } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r1_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * 
dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; H[1][1] += (int64_t)f2 * f2; C[1] += (int64_t)f2 * s; } } H[1][1] /= size; C[1] /= size; } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r1_high_bd_c( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); const int32_t s = (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; H[1][1] += (int64_t)f2 * f2; C[1] += (int64_t)f2 * s; } } H[1][1] /= size; C[1] /= size; } #endif // CONFIG_AV1_HIGHBITDEPTH // The function calls 3 subfunctions for the following cases : // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements // of C and H need to be computed. // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } #if CONFIG_AV1_HIGHBITDEPTH void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void get_proj_subspace(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int *xq, const sgr_params_type *params) { int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; int64_t C[2] = { 0, 0 }; // Default values to be returned if the problem becomes ill-posed xq[0] = 0; xq[1] = 0; if (!use_highbitdepth) { if ((width & 0x7) == 0) { av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C, params); } else { av1_calc_proj_params_c(src8, width, height, 
src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C, params); } } #if CONFIG_AV1_HIGHBITDEPTH else { // NOLINT if ((width & 0x7) == 0) { av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C, params); } else { av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C, params); } } #endif if (params->r[0] == 0) { // H matrix is now only the scalar H[1][1] // C vector is now only the scalar C[1] const int64_t Det = H[1][1]; if (Det == 0) return; // ill-posed, return default values xq[0] = 0; xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det); } else if (params->r[1] == 0) { // H matrix is now only the scalar H[0][0] // C vector is now only the scalar C[0] const int64_t Det = H[0][0]; if (Det == 0) return; // ill-posed, return default values xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det); xq[1] = 0; } else { const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0]; if (Det == 0) return; // ill-posed, return default values // If scaling up dividend would overflow, instead scale down the divisor const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1]; if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) || (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1)) xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS)); else xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det); const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0]; if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) || (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2)) xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS)); else xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det); } } static inline void encode_xq(int *xq, int *xqd, const sgr_params_type *params) { if (params->r[0] == 0) { xqd[0] = 0; xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); } else if (params->r[1] == 0) { xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); } else { xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); } } // Apply the self-guided filter across an entire restoration unit. 
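// For example, a 64x64 restoration unit filtered with 32x32 processing units
// (pu_width = pu_height = 32) results in four av1_selfguided_restoration()
// calls below, one per 32x32 block; partial blocks at the right and bottom
// edges are clipped with AOMMIN().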
static inline void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width, int height, int dat_stride, int use_highbd, int bit_depth, int pu_width, int pu_height, int32_t *flt0, int32_t *flt1, int flt_stride, struct aom_internal_error_info *error_info) { for (int i = 0; i < height; i += pu_height) { const int h = AOMMIN(pu_height, height - i); int32_t *flt0_row = flt0 + i * flt_stride; int32_t *flt1_row = flt1 + i * flt_stride; const uint8_t *dat8_row = dat8 + i * dat_stride; // Iterate over the stripe in blocks of width pu_width for (int j = 0; j < width; j += pu_width) { const int w = AOMMIN(pu_width, width - j); if (av1_selfguided_restoration( dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) { aom_internal_error( error_info, AOM_CODEC_MEM_ERROR, "Error allocating buffer in av1_selfguided_restoration"); } } } } static inline void compute_sgrproj_err( const uint8_t *dat8, const int width, const int height, const int dat_stride, const uint8_t *src8, const int src_stride, const int use_highbitdepth, const int bit_depth, const int pu_width, const int pu_height, const int ep, int32_t *flt0, int32_t *flt1, const int flt_stride, int *exqd, int64_t *err, struct aom_internal_error_info *error_info) { int exq[2]; apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, pu_width, pu_height, flt0, flt1, flt_stride, error_info); const sgr_params_type *const params = &av1_sgr_params[ep]; get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, params); encode_xq(exq, exqd, params); *err = finer_search_pixel_proj_error( src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, flt_stride, flt1, flt_stride, 2, exqd, params); } static inline void get_best_error(int64_t *besterr, const int64_t err, const int *exqd, int *bestxqd, int *bestep, const int ep) { if (*besterr == -1 || err < *besterr) { *bestep = ep; *besterr = err; bestxqd[0] = exqd[0]; bestxqd[1] = exqd[1]; } } static SgrprojInfo search_selfguided_restoration( const uint8_t *dat8, int width, int height, int dat_stride, const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning, struct aom_internal_error_info *error_info) { int32_t *flt0 = rstbuf; int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; int ep, idx, bestep = 0; int64_t besterr = -1; int exqd[2], bestxqd[2] = { 0, 0 }; int flt_stride = ((width + 7) & ~7) + 8; assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_width == RESTORATION_PROC_UNIT_SIZE); assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || pu_height == RESTORATION_PROC_UNIT_SIZE); if (!enable_sgr_ep_pruning) { for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { int64_t err; compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, use_highbitdepth, bit_depth, pu_width, pu_height, ep, flt0, flt1, flt_stride, exqd, &err, error_info); get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); } } else { // evaluate first four seed ep in first group for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) { ep = sgproj_ep_grp1_seed[idx]; int64_t err; compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, use_highbitdepth, bit_depth, pu_width, pu_height, ep, flt0, flt1, flt_stride, exqd, &err, error_info); get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); } // evaluate left and right ep of winner in seed ep int bestep_ref = 
bestep; for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) { if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX) continue; int64_t err; compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, use_highbitdepth, bit_depth, pu_width, pu_height, ep, flt0, flt1, flt_stride, exqd, &err, error_info); get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); } // evaluate last two group for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) { ep = sgproj_ep_grp2_3[idx][bestep]; int64_t err; compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, use_highbitdepth, bit_depth, pu_width, pu_height, ep, flt0, flt1, flt_stride, exqd, &err, error_info); get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); } } SgrprojInfo ret; ret.ep = bestep; ret.xqd[0] = bestxqd[0]; ret.xqd[1] = bestxqd[1]; return ret; } static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, SgrprojInfo *ref_sgrproj_info) { int bits = SGRPROJ_PARAMS_BITS; const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; if (params->r[0] > 0) bits += aom_count_primitive_refsubexpfin( SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); if (params->r[1] > 0) bits += aom_count_primitive_refsubexpfin( SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); return bits; } static inline void search_sgrproj(const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { (void)rlbs; RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; const MACROBLOCK *const x = rsc->x; const AV1_COMMON *const cm = rsc->cm; const int highbd = cm->seq_params->use_highbitdepth; const int bit_depth = cm->seq_params->bit_depth; const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0]; // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set if (rsc->skip_sgr_eval) { rsc->total_bits[RESTORE_SGRPROJ] += bits_none; rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE]; rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE; rsc->sse[RESTORE_SGRPROJ] = INT64_MAX; return; } uint8_t *dgd_start = rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; const uint8_t *src_start = rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; const int is_uv = rsc->plane > 0; const int ss_x = is_uv && cm->seq_params->subsampling_x; const int ss_y = is_uv && cm->seq_params->subsampling_y; const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; rusi->sgrproj = search_selfguided_restoration( dgd_start, limits->h_end - limits->h_start, limits->v_end - limits->v_start, rsc->dgd_stride, src_start, rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info); RestorationUnitInfo rui; rui.restoration_type = RESTORE_SGRPROJ; rui.sgrproj_info = rusi->sgrproj; rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui); const int64_t bits_sgr = x->mode_costs.sgrproj_restore_cost[1] + (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj) << AV1_PROB_COST_SHIFT); double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth); double cost_sgr = 
RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth); if (rusi->sgrproj.ep < 10) cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); RestorationType rtype = (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; #if DEBUG_LR_COSTING // Store ref params for later checking lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info = rsc->ref_sgrproj; #endif // DEBUG_LR_COSTING rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype]; rsc->total_bits[RESTORE_SGRPROJ] += (cost_sgr < cost_none) ? bits_sgr : bits_none; if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj; } static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src, int dgd_stride, int h_start, int h_end, uint8_t avg, const int wiener_halfwin, const int wiener_win2, int32_t *M_int32, int32_t *H_int32, int count) { int j, k, l; int16_t Y[WIENER_WIN2]; for (j = h_start; j < h_end; j++) { const int16_t X = (int16_t)src[j] - (int16_t)avg; int idx = 0; for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { Y[idx] = (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg; idx++; } } assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { M_int32[k] += (int32_t)Y[k] * X; for (l = k; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l]; } } } } void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { (void)dgd_avg; (void)src_avg; int i, k, l; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); int32_t M_row[WIENER_WIN2] = { 0 }; int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 }; int downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; memset(M, 0, sizeof(*M) * wiener_win2); memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); for (i = v_start; i < v_end; i = i + downsample_factor) { if (use_downsampled_wiener_stats && (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { downsample_factor = v_end - i; } memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2); memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2); acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end, avg, wiener_halfwin, wiener_win2, M_row, H_row, i); for (k = 0; k < wiener_win2; ++k) { // Scale M matrix based on the downsampling factor M[k] += ((int64_t)M_row[k] * downsample_factor); for (l = k; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. 
// Scale H Matrix based on the downsampling factor H[k * wiener_win2 + l] += ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor); } } } for (k = 0; k < wiener_win2; ++k) { for (l = k + 1; l < wiener_win2; ++l) { H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; } } } #if CONFIG_AV1_HIGHBITDEPTH void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { (void)dgd_avg; (void)src_avg; int i, j, k, l; int32_t Y[WIENER_WIN2]; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); uint16_t avg = find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); uint8_t bit_depth_divider = 1; if (bit_depth == AOM_BITS_12) bit_depth_divider = 16; else if (bit_depth == AOM_BITS_10) bit_depth_divider = 4; memset(M, 0, sizeof(*M) * wiener_win2); memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); for (i = v_start; i < v_end; i++) { for (j = h_start; j < h_end; j++) { const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg; int idx = 0; for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg; idx++; } } assert(idx == wiener_win2); for (k = 0; k < wiener_win2; ++k) { M[k] += (int64_t)Y[k] * X; for (l = k; l < wiener_win2; ++l) { // H is a symmetric matrix, so we only need to fill out the upper // triangle here. We can copy it down to the lower triangle outside // the (i, j) loops. H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l]; } } } } for (k = 0; k < wiener_win2; ++k) { M[k] /= bit_depth_divider; H[k * wiener_win2 + k] /= bit_depth_divider; for (l = k + 1; l < wiener_win2; ++l) { H[k * wiener_win2 + l] /= bit_depth_divider; H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; } } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline int wrap_index(int i, int wiener_win) { const int wiener_halfwin1 = (wiener_win >> 1) + 1; return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i); } // Splits each w[i] into smaller components w1[i] and w2[i] such that // w[i] = w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i]. static inline void split_wiener_filter_coefficients(int wiener_win, const int32_t *w, int32_t *w1, int32_t *w2) { for (int i = 0; i < wiener_win; i++) { w1[i] = w[i] / WIENER_TAP_SCALE_FACTOR; w2[i] = w[i] - w1[i] * WIENER_TAP_SCALE_FACTOR; assert(w[i] == w1[i] * WIENER_TAP_SCALE_FACTOR + w2[i]); } } // Calculates x * w / WIENER_TAP_SCALE_FACTOR, where // w = w1 * WIENER_TAP_SCALE_FACTOR + w2. // // The multiplication x * w may overflow, so we multiply x by the components of // w (w1 and w2) and combine the multiplication with the division. 
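// For illustration: with WIENER_TAP_SCALE_FACTOR = (1 << 16), a coefficient
// w = 0x12345 splits into w1 = 1 and w2 = 0x2345, so x * w / (1 << 16) is
// evaluated as x * 1 + (x * 0x2345) / (1 << 16), avoiding the direct product
// x * w that could overflow int64_t for large x.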
static inline int64_t multiply_and_scale(int64_t x, int32_t w1, int32_t w2) { // Let y = x * w / WIENER_TAP_SCALE_FACTOR // = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR; return y; } // Solve linear equations to find Wiener filter tap values // Taps are output scaled by WIENER_FILT_STEP static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, int64_t *x) { for (int k = 0; k < n - 1; k++) { // Partial pivoting: bring the row with the largest pivot to the top for (int i = n - 1; i > k; i--) { // If row i has a better (bigger) pivot than row (i-1), swap them if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) { for (int j = 0; j < n; j++) { const int64_t c = A[i * stride + j]; A[i * stride + j] = A[(i - 1) * stride + j]; A[(i - 1) * stride + j] = c; } const int64_t c = b[i]; b[i] = b[i - 1]; b[i - 1] = c; } } // b/278065963: The multiplies // c / 256 * A[k * stride + j] / cd * 256 // and // c / 256 * b[k] / cd * 256 // within Gaussian elimination can cause a signed integer overflow. Rework // the multiplies so that larger scaling is used without significantly // impacting the overall precision. // // Precision guidance: // scale_threshold: Pick as high as possible. // For max_abs_akj >= scale_threshold scenario: // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j]. // scaler_c: Pick as low as possible while maintaining scaler_c >= // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1]. int64_t max_abs_akj = 0; for (int j = 0; j < n; j++) { const int64_t abs_akj = llabs(A[k * stride + j]); if (abs_akj > max_abs_akj) max_abs_akj = abs_akj; } const int scale_threshold = 1 << 22; const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 6); const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7); const int scaler = scaler_c * scaler_A; // Forward elimination (convert A to row-echelon form) for (int i = k; i < n - 1; i++) { if (A[k * stride + k] == 0) return 0; const int64_t c = A[(i + 1) * stride + k] / scaler_c; const int64_t cd = A[k * stride + k]; for (int j = 0; j < n; j++) { A[(i + 1) * stride + j] -= A[k * stride + j] / scaler_A * c / cd * scaler; } b[i + 1] -= c * b[k] / cd * scaler_c; } } // Back-substitution for (int i = n - 1; i >= 0; i--) { if (A[i * stride + i] == 0) return 0; int64_t c = 0; for (int j = i + 1; j <= n - 1; j++) { c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR; } // Store filter taps x in scaled form. 
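// Each term accumulated into c above is already divided by
// WIENER_TAP_SCALE_FACTOR, so multiplying (b[i] - c) by the scale factor
// before dividing by A[i * stride + i] keeps x[i] in the same scaled domain
// as the other taps.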
x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]; } return 1; } // Fix vector b, update vector a static inline void update_a_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc, int32_t *a, const int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; int32_t b1[WIENER_WIN], b2[WIENER_WIN]; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin1 = (wiener_win >> 1) + 1; memset(A, 0, sizeof(A)); memset(B, 0, sizeof(B)); for (i = 0; i < wiener_win; i++) { for (j = 0; j < wiener_win; ++j) { const int jj = wrap_index(j, wiener_win); A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR; } } split_wiener_filter_coefficients(wiener_win, b, b1, b2); for (i = 0; i < wiener_win; i++) { for (j = 0; j < wiener_win; j++) { int k, l; for (k = 0; k < wiener_win; ++k) { const int kk = wrap_index(k, wiener_win); for (l = 0; l < wiener_win; ++l) { const int ll = wrap_index(l, wiener_win); // Calculate // B[ll * wiener_halfwin1 + kk] += // Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / // WIENER_TAP_SCALE_FACTOR * b[j] / WIENER_TAP_SCALE_FACTOR; // // The last multiplication may overflow, so we combine the last // multiplication with the last division. const int64_t x = Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / WIENER_TAP_SCALE_FACTOR; // b[j] = b1[j] * WIENER_TAP_SCALE_FACTOR + b2[j] B[ll * wiener_halfwin1 + kk] += multiply_and_scale(x, b1[j], b2[j]); } } } } // Normalization enforcement in the system of equations itself for (i = 0; i < wiener_halfwin1 - 1; ++i) { A[i] -= A[wiener_halfwin1 - 1] * 2 + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; } for (i = 0; i < wiener_halfwin1 - 1; ++i) { for (j = 0; j < wiener_halfwin1 - 1; ++j) { B[i * wiener_halfwin1 + j] -= 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]); } } if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; for (i = wiener_halfwin1; i < wiener_win; ++i) { S[i] = S[wiener_win - 1 - i]; S[wiener_halfwin1 - 1] -= 2 * S[i]; } for (i = 0; i < wiener_win; ++i) { a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), (1 << (WIENER_FILT_BITS - 1)) - 1); } } } // Fix vector a, update vector b static inline void update_b_sep_sym(int wiener_win, int64_t **Mc, int64_t **Hc, const int32_t *a, int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; int32_t a1[WIENER_WIN], a2[WIENER_WIN]; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin1 = (wiener_win >> 1) + 1; memset(A, 0, sizeof(A)); memset(B, 0, sizeof(B)); for (i = 0; i < wiener_win; i++) { const int ii = wrap_index(i, wiener_win); for (j = 0; j < wiener_win; j++) { A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR; } } split_wiener_filter_coefficients(wiener_win, a, a1, a2); for (i = 0; i < wiener_win; i++) { const int ii = wrap_index(i, wiener_win); for (j = 0; j < wiener_win; j++) { const int jj = wrap_index(j, wiener_win); int k, l; for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) { // Calculate // B[jj * wiener_halfwin1 + ii] += // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR; // // The last multiplication may overflow, so we combine the last // multiplication with 
the last division. const int64_t x = Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / WIENER_TAP_SCALE_FACTOR; // a[l] = a1[l] * WIENER_TAP_SCALE_FACTOR + a2[l] B[jj * wiener_halfwin1 + ii] += multiply_and_scale(x, a1[l], a2[l]); } } } } // Normalization enforcement in the system of equations itself for (i = 0; i < wiener_halfwin1 - 1; ++i) { A[i] -= A[wiener_halfwin1 - 1] * 2 + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; } for (i = 0; i < wiener_halfwin1 - 1; ++i) { for (j = 0; j < wiener_halfwin1 - 1; ++j) { B[i * wiener_halfwin1 + j] -= 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]); } } if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; for (i = wiener_halfwin1; i < wiener_win; ++i) { S[i] = S[wiener_win - 1 - i]; S[wiener_halfwin1 - 1] -= 2 * S[i]; } for (i = 0; i < wiener_win; ++i) { b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), (1 << (WIENER_FILT_BITS - 1)) - 1); } } } static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H, int32_t *a, int32_t *b) { static const int32_t init_filt[WIENER_WIN] = { WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP0_MIDV, }; int64_t *Hc[WIENER_WIN2]; int64_t *Mc[WIENER_WIN]; int i, j, iter; const int plane_off = (WIENER_WIN - wiener_win) >> 1; const int wiener_win2 = wiener_win * wiener_win; for (i = 0; i < wiener_win; i++) { a[i] = b[i] = WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off]; } for (i = 0; i < wiener_win; i++) { Mc[i] = M + i * wiener_win; for (j = 0; j < wiener_win; j++) { Hc[i * wiener_win + j] = H + i * wiener_win * wiener_win2 + j * wiener_win; } } iter = 1; while (iter < NUM_WIENER_ITERS) { update_a_sep_sym(wiener_win, Mc, Hc, a, b); update_b_sep_sym(wiener_win, Mc, Hc, a, b); iter++; } } // Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares // against identity filters; Final score is defined as the difference between // the function values static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, InterpKernel vfilt, InterpKernel hfilt) { int32_t ab[WIENER_WIN * WIENER_WIN]; int16_t a[WIENER_WIN], b[WIENER_WIN]; int64_t P = 0, Q = 0; int64_t iP = 0, iQ = 0; int64_t Score, iScore; int i, k, l; const int plane_off = (WIENER_WIN - wiener_win) >> 1; const int wiener_win2 = wiener_win * wiener_win; a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP; for (i = 0; i < WIENER_HALFWIN; ++i) { a[i] = a[WIENER_WIN - i - 1] = vfilt[i]; b[i] = b[WIENER_WIN - i - 1] = hfilt[i]; a[WIENER_HALFWIN] -= 2 * a[i]; b[WIENER_HALFWIN] -= 2 * b[i]; } memset(ab, 0, sizeof(ab)); for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off]; } for (k = 0; k < wiener_win2; ++k) { P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP; for (l = 0; l < wiener_win2; ++l) { Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP; } } Score = Q - 2 * P; iP = M[wiener_win2 >> 1]; iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)]; iScore = iQ - 2 * iP; return Score - iScore; } static inline void finalize_sym_filter(int wiener_win, int32_t *f, InterpKernel fi) 
{ int i; const int wiener_halfwin = (wiener_win >> 1); for (i = 0; i < wiener_halfwin; ++i) { const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP; const int64_t divisor = WIENER_TAP_SCALE_FACTOR; // Perform this division with proper rounding rather than truncation if (dividend < 0) { fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor); } else { fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor); } } // Specialize for 7-tap filter if (wiener_win == WIENER_WIN) { fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); } else { fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); fi[0] = 0; } // Satisfy filter constraints fi[WIENER_WIN - 1] = fi[0]; fi[WIENER_WIN - 2] = fi[1]; fi[WIENER_WIN - 3] = fi[2]; // The central element has an implicit +WIENER_FILT_STEP fi[3] = -2 * (fi[0] + fi[1] + fi[2]); } static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, WienerInfo *ref_wiener_info) { int bits = 0; if (wiener_win == WIENER_WIN) bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); if (wiener_win == WIENER_WIN) bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, WIENER_FILT_TAP0_SUBEXP_K, ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, WIENER_FILT_TAP1_SUBEXP_K, ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); bits += aom_count_primitive_refsubexpfin( WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, WIENER_FILT_TAP2_SUBEXP_K, ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); return bits; } static int64_t finer_search_wiener(const RestSearchCtxt *rsc, const RestorationTileLimits *limits, RestorationUnitInfo *rui, int wiener_win) { const int plane_off = (WIENER_WIN - wiener_win) >> 1; int64_t err = try_restoration_unit(rsc, limits, rui); if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err; // Refinement search around the wiener filter coefficients. 
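// For each outer tap p, try stepping the tap by s in either direction,
// always moving the mirrored tap by the same amount and the centre tap by
// twice that amount in the opposite direction, so that the filter stays
// symmetric and its taps keep the same overall sum. For example, with s = 4
// and p = 0, hfilter[0] and hfilter[WIENER_WIN - 1] each move by -4 while
// hfilter[WIENER_HALFWIN] moves by +8. The step size halves from 4 down to
// 1, a move is kept only if try_restoration_unit() reports a lower error,
// and at the largest step the search keeps walking in the same direction
// while the error keeps dropping.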
int64_t err2; int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP2_MINV }; int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, WIENER_FILT_TAP2_MAXV }; WienerInfo *plane_wiener = &rui->wiener_info; // printf("err pre = %"PRId64"\n", err); const int start_step = 4; for (int s = start_step; s >= 1; s >>= 1) { for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { if (plane_wiener->hfilter[p] - s >= tap_min[p]) { plane_wiener->hfilter[p] -= s; plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; err2 = try_restoration_unit(rsc, limits, rui); if (err2 > err) { plane_wiener->hfilter[p] += s; plane_wiener->hfilter[WIENER_WIN - p - 1] += s; plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); if (skip) break; do { if (plane_wiener->hfilter[p] + s <= tap_max[p]) { plane_wiener->hfilter[p] += s; plane_wiener->hfilter[WIENER_WIN - p - 1] += s; plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; err2 = try_restoration_unit(rsc, limits, rui); if (err2 > err) { plane_wiener->hfilter[p] -= s; plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); } for (int p = plane_off; p < WIENER_HALFWIN; ++p) { int skip = 0; do { if (plane_wiener->vfilter[p] - s >= tap_min[p]) { plane_wiener->vfilter[p] -= s; plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; err2 = try_restoration_unit(rsc, limits, rui); if (err2 > err) { plane_wiener->vfilter[p] += s; plane_wiener->vfilter[WIENER_WIN - p - 1] += s; plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; } else { err = err2; skip = 1; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); if (skip) break; do { if (plane_wiener->vfilter[p] + s <= tap_max[p]) { plane_wiener->vfilter[p] += s; plane_wiener->vfilter[WIENER_WIN - p - 1] += s; plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; err2 = try_restoration_unit(rsc, limits, rui); if (err2 > err) { plane_wiener->vfilter[p] -= s; plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; } else { err = err2; // At the highest step size continue moving in the same direction if (s == start_step) continue; } } break; } while (1); } } // printf("err post = %"PRId64"\n", err); return err; } static inline void search_wiener(const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { (void)tmpbuf; (void)rlbs; (void)error_info; RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; const MACROBLOCK *const x = rsc->x; const int64_t bits_none = x->mode_costs.wiener_restore_cost[0]; // Skip Wiener search for low variance contents if (rsc->lpf_sf->prune_wiener_based_on_src_var) { const int scale[3] = { 0, 1, 2 }; // Obtain the normalized Qscale const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, rsc->cm->seq_params->bit_depth) >> 3; // Derive threshold as sqr(normalized Qscale) * scale / 16, const uint64_t thresh = (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4; const int highbd = 
rsc->cm->seq_params->use_highbitdepth; const uint64_t src_var = var_restoration_unit(limits, rsc->src, rsc->plane, highbd); // Do not perform Wiener search if source variance is lower than threshold // or if the reconstruction error is zero int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0); if (prune_wiener) { rsc->total_bits[RESTORE_WIENER] += bits_none; rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE]; rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; rsc->sse[RESTORE_WIENER] = INT64_MAX; if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1; return; } } const int wiener_win = (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; int reduced_wiener_win = wiener_win; if (rsc->lpf_sf->reduce_wiener_window_size) { reduced_wiener_win = (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA; } int64_t M[WIENER_WIN2]; int64_t H[WIENER_WIN2 * WIENER_WIN2]; int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN]; #if CONFIG_AV1_HIGHBITDEPTH const AV1_COMMON *const cm = rsc->cm; if (cm->seq_params->use_highbitdepth) { // TODO(any) : Add support for use_downsampled_wiener_stats SF in HBD // functions. Optimize intrinsics of HBD design similar to LBD (i.e., // pre-calculate d and s buffers and avoid most of the C operations). av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H, rsc->lpf_sf->use_downsampled_wiener_stats); } #else av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H, rsc->lpf_sf->use_downsampled_wiener_stats); #endif wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter); RestorationUnitInfo rui; memset(&rui, 0, sizeof(rui)); rui.restoration_type = RESTORE_WIENER; finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter); finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter); // Filter score computes the value of the function x'*A*x - x'*b for the // learned filter and compares it against identity filer. 
If there is no // reduction in the function, the filter is reverted back to identity if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter, rui.wiener_info.hfilter) > 0) { rsc->total_bits[RESTORE_WIENER] += bits_none; rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE]; rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; rsc->sse[RESTORE_WIENER] = INT64_MAX; if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1; return; } rsc->sse[RESTORE_WIENER] = finer_search_wiener(rsc, limits, &rui, reduced_wiener_win); rusi->wiener = rui.wiener_info; if (reduced_wiener_win != WIENER_WIN) { assert(rui.wiener_info.vfilter[0] == 0 && rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); assert(rui.wiener_info.hfilter[0] == 0 && rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); } const int64_t bits_wiener = x->mode_costs.wiener_restore_cost[1] + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener) << AV1_PROB_COST_SHIFT); double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], rsc->cm->seq_params->bit_depth); double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER], rsc->cm->seq_params->bit_depth); RestorationType rtype = (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; rusi->best_rtype[RESTORE_WIENER - 1] = rtype; // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and // RESTORE_NONE or based on best_rtype if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) { rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none); } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) { rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE; } #if DEBUG_LR_COSTING // Store ref params for later checking lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info = rsc->ref_wiener; #endif // DEBUG_LR_COSTING rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype]; rsc->total_bits[RESTORE_WIENER] += (cost_wiener < cost_none) ? bits_wiener : bits_none; if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener; } static inline void search_norestore( const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { (void)rest_unit_idx; (void)tmpbuf; (void)rlbs; (void)error_info; RestSearchCtxt *rsc = (RestSearchCtxt *)priv; const int highbd = rsc->cm->seq_params->use_highbitdepth; rsc->sse[RESTORE_NONE] = sse_restoration_unit( limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE]; } static inline void search_switchable( const RestorationTileLimits *limits, int rest_unit_idx, void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, struct aom_internal_error_info *error_info) { (void)limits; (void)tmpbuf; (void)rlbs; (void)error_info; RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; const MACROBLOCK *const x = rsc->x; const int wiener_win = (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; double best_cost = 0; int64_t best_bits = 0; RestorationType best_rtype = RESTORE_NONE; for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { // If this restoration mode was skipped, or could not find a solution // that was better than RESTORE_NONE, then we can't select it here either. 
// // Note: It is possible for the restoration search functions to find a // filter which is better than RESTORE_NONE when looking purely at SSE, but // for it to be rejected overall due to its rate cost. In this case, there // is a chance that it may be have a lower rate cost when looking at // RESTORE_SWITCHABLE, and so it might be acceptable here. // // Therefore we prune based on SSE, rather than on whether or not the // previous search function selected this mode. if (r > RESTORE_NONE) { if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue; } const int64_t sse = rsc->sse[r]; int64_t coeff_pcost = 0; switch (r) { case RESTORE_NONE: coeff_pcost = 0; break; case RESTORE_WIENER: coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener, &rsc->switchable_ref_wiener); break; case RESTORE_SGRPROJ: coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj); break; default: assert(0); break; } const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits; double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth); if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); if (r == 0 || cost < best_cost) { best_cost = cost; best_bits = bits; best_rtype = r; } } rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; #if DEBUG_LR_COSTING // Store ref params for later checking lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info = rsc->switchable_ref_wiener; lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info = rsc->switchable_ref_sgrproj; #endif // DEBUG_LR_COSTING rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype]; rsc->total_bits[RESTORE_SWITCHABLE] += best_bits; if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener; if (best_rtype == RESTORE_SGRPROJ) rsc->switchable_ref_sgrproj = rusi->sgrproj; } static inline void copy_unit_info(RestorationType frame_rtype, const RestUnitSearchInfo *rusi, RestorationUnitInfo *rui) { assert(frame_rtype > 0); rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; if (rui->restoration_type == RESTORE_WIENER) rui->wiener_info = rusi->wiener; else rui->sgrproj_info = rusi->sgrproj; } static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc, bool *disable_lr_filter) { const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mib_size_log2 = cm->seq_params->mib_size_log2; const CommonTileParams *tiles = &cm->tiles; const int is_uv = plane > 0; const int ss_y = is_uv && cm->seq_params->subsampling_y; RestorationInfo *rsi = &cm->rst_info[plane]; const int ru_size = rsi->restoration_unit_size; const int ext_size = ru_size * 3 / 2; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); static const rest_unit_visitor_t funs[RESTORE_TYPES] = { search_norestore, search_wiener, search_sgrproj, search_switchable }; const int plane_num_units = rsi->num_rest_units; const RestorationType num_rtypes = (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; reset_rsc(rsc); // Iterate over restoration units in encoding order, so that each RU gets // the correct reference parameters when we cost it up. 
This is effectively // a nested iteration over: // * Each tile, order does not matter // * Each superblock within that tile, in raster order // * Each LR unit which is coded within that superblock, in raster order for (int tile_row = 0; tile_row < tiles->rows; tile_row++) { int sb_row_start = tiles->row_start_sb[tile_row]; int sb_row_end = tiles->row_start_sb[tile_row + 1]; for (int tile_col = 0; tile_col < tiles->cols; tile_col++) { int sb_col_start = tiles->col_start_sb[tile_col]; int sb_col_end = tiles->col_start_sb[tile_col + 1]; // Reset reference parameters for delta-coding at the start of each tile rsc_on_tile(rsc); for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) { int mi_row = sb_row << mib_size_log2; for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) { int mi_col = sb_col << mib_size_log2; int rcol0, rcol1, rrow0, rrow1; int has_lr_info = av1_loop_restoration_corners_in_sb( cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0, &rrow1); if (!has_lr_info) continue; RestorationTileLimits limits; for (int rrow = rrow0; rrow < rrow1; rrow++) { int y0 = rrow * ru_size; int remaining_h = plane_h - y0; int h = (remaining_h < ext_size) ? remaining_h : ru_size; limits.v_start = y0; limits.v_end = y0 + h; assert(limits.v_end <= plane_h); // Offset upwards to align with the restoration processing stripe const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; limits.v_start = AOMMAX(0, limits.v_start - voffset); if (limits.v_end < plane_h) limits.v_end -= voffset; for (int rcol = rcol0; rcol < rcol1; rcol++) { int x0 = rcol * ru_size; int remaining_w = plane_w - x0; int w = (remaining_w < ext_size) ? remaining_w : ru_size; limits.h_start = x0; limits.h_end = x0 + w; assert(limits.h_end <= plane_w); const int unit_idx = rrow * rsi->horz_units + rcol; rsc->skip_sgr_eval = 0; for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) { if (disable_lr_filter[r]) continue; funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL, cm->error); } } } } } } } } static inline void av1_derive_flags_for_lr_processing( const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) { const bool is_wiener_disabled = lpf_sf->disable_wiener_filter; const bool is_sgr_disabled = lpf_sf->disable_sgr_filter; // Enable None Loop restoration filter if either of Wiener or Self-guided is // enabled. disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled); disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled; disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled; // Enable Swicthable Loop restoration filter if both of the Wiener and // Self-guided are enabled. disable_lr_filter[RESTORE_SWITCHABLE] = (is_wiener_disabled || is_sgr_disabled); } #define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 // Allocate both decoder-side and encoder-side info structs for a single plane. // The unit size passed in should be the minimum size which we are going to // search; before each search, set_restoration_unit_size() must be called to // configure the actual size. static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm, RestorationInfo *rsi, int is_uv, int min_luma_unit_size) { #if COUPLED_CHROMA_FROM_LUMA_RESTORATION int sx = cm->seq_params.subsampling_x; int sy = cm->seq_params.subsampling_y; int s = (p > 0) ? 
AOMMIN(sx, sy) : 0; #else int s = 0; #endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION int min_unit_size = min_luma_unit_size >> s; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w); const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h); const int max_num_units = max_horz_units * max_vert_units; aom_free(rsi->unit_info); CHECK_MEM_ERROR(cm, rsi->unit_info, (RestorationUnitInfo *)aom_memalign( 16, sizeof(*rsi->unit_info) * max_num_units)); RestUnitSearchInfo *rusi; CHECK_MEM_ERROR( cm, rusi, (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units)); // If the restoration unit dimensions are not multiples of // rsi->restoration_unit_size then some elements of the rusi array may be // left uninitialised when we reach copy_unit_info(...). This is not a // problem, as these elements are ignored later, but in order to quiet // Valgrind's warnings we initialise the array below. memset(rusi, 0, sizeof(*rusi) * max_num_units); return rusi; } static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi, int is_uv, int luma_unit_size) { #if COUPLED_CHROMA_FROM_LUMA_RESTORATION int sx = cm->seq_params.subsampling_x; int sy = cm->seq_params.subsampling_y; int s = (p > 0) ? AOMMIN(sx, sy) : 0; #else int s = 0; #endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION int unit_size = luma_unit_size >> s; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); const int horz_units = av1_lr_count_units(unit_size, plane_w); const int vert_units = av1_lr_count_units(unit_size, plane_h); rsi->restoration_unit_size = unit_size; rsi->num_rest_units = horz_units * vert_units; rsi->horz_units = horz_units; rsi->vert_units = vert_units; } void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; const SequenceHeader *const seq_params = cm->seq_params; const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf; const int num_planes = av1_num_planes(cm); const int highbd = cm->seq_params->use_highbitdepth; assert(!cm->features.all_lossless); av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx); // Select unit size based on speed feature settings, and allocate // rui structs based on this size int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size; // The minimum allowed unit size at a syntax level is 1 superblock. // Apply this constraint here so that the speed features code which sets // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size min_lr_unit_size = AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]); for (int plane = 0; plane < num_planes; ++plane) { cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs( cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size); } x->rdmult = cpi->rd.RDMULT; // Allocate the frame buffer trial_frame_rst, which is used to temporarily // store the loop restored frame. if (aom_realloc_frame_buffer( &cpi->trial_frame_rst, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); RestSearchCtxt rsc; // The buffers 'src_avg' and 'dgd_avg' are used to compute H and M buffers. 
// These buffers are only required for the AVX2 and NEON implementations of // av1_compute_stats. The buffer size required is calculated based on maximum // width and height of the LRU (i.e., from foreach_rest_unit_in_plane() 1.5 // times the RESTORATION_UNITSIZE_MAX) allowed for Wiener filtering. The width // and height aligned to multiple of 16 is considered for intrinsic purpose. rsc.dgd_avg = NULL; rsc.src_avg = NULL; #if HAVE_AVX2 || HAVE_NEON || HAVE_SVE // The buffers allocated below are used during Wiener filter processing. // Hence, allocate the same when Wiener filter is enabled. Make sure to // allocate these buffers only for the SIMD extensions that make use of them // (i.e. AVX2 for low bitdepth and NEON and SVE for low and high bitdepth). #if HAVE_AVX2 bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; #elif HAVE_NEON || HAVE_SVE bool allocate_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; #endif if (allocate_buffers) { const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg, (int16_t *)aom_memalign(32, buf_size)); rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg; // When LRU width isn't multiple of 16, the 256 bits load instruction used // in AVX2 intrinsic can read data beyond valid LRU. Hence, in order to // silence Valgrind warning this buffer is initialized with zero. Overhead // due to this initialization is negligible since it is done at frame level. memset(rsc.dgd_avg, 0, buf_size); rsc.src_avg = rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; // Asserts the starting address of src_avg is always 32-bytes aligned. assert(!((intptr_t)rsc.src_avg % 32)); } #endif // Initialize all planes, so that any planes we skip searching will still have // valid data for (int plane = 0; plane < num_planes; plane++) { cm->rst_info[plane].frame_restoration_type = RESTORE_NONE; } // Decide which planes to search int plane_start, plane_end; if (lpf_sf->disable_loop_restoration_luma) { plane_start = AOM_PLANE_U; } else { plane_start = AOM_PLANE_Y; } if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) { plane_end = AOM_PLANE_Y; } else { plane_end = AOM_PLANE_V; } // Derive the flags to enable/disable Loop restoration filters based on the // speed features 'disable_wiener_filter' and 'disable_sgr_filter'. 
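// As a quick reference, av1_derive_flags_for_lr_processing() maps the two
// speed features onto these flags as follows:
//   disable_wiener  disable_sgr  ->  NONE  WIENER  SGRPROJ  SWITCHABLE
//         0              0           on    on      on       on
//         1              0           on    off     on       off
//         0              1           on    on      off      off
//         1              1           off   off     off      off
// i.e. RESTORE_NONE is searched unless both filters are disabled, and
// RESTORE_SWITCHABLE is searched only when both filter types are available.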
bool disable_lr_filter[RESTORE_TYPES] = { false }; av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter); for (int plane = plane_start; plane <= plane_end; plane++) { const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; const int is_uv = plane != AOM_PLANE_Y; int plane_w, plane_h; av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER, highbd); } double best_cost = DBL_MAX; int best_luma_unit_size = max_lr_unit_size; for (int luma_unit_size = max_lr_unit_size; luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) { int64_t bits_this_size = 0; int64_t sse_this_size = 0; RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE, RESTORE_NONE }; for (int plane = plane_start; plane <= plane_end; ++plane) { set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0, luma_unit_size); init_rsc(src, &cpi->common, x, lpf_sf, plane, cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc); restoration_search(cm, plane, &rsc, disable_lr_filter); const int plane_num_units = cm->rst_info[plane].num_rest_units; const RestorationType num_rtypes = (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; double best_cost_this_plane = DBL_MAX; for (RestorationType r = 0; r < num_rtypes; ++r) { // Disable Loop restoration filter based on the flags set using speed // feature 'disable_wiener_filter' and 'disable_sgr_filter'. if (disable_lr_filter[r]) continue; double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r], cm->seq_params->bit_depth); if (cost_this_plane < best_cost_this_plane) { best_cost_this_plane = cost_this_plane; best_rtype[plane] = r; } } bits_this_size += rsc.total_bits[best_rtype[plane]]; sse_this_size += rsc.total_sse[best_rtype[plane]]; } double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_this_size >> 4, sse_this_size, cm->seq_params->bit_depth); if (cost_this_size < best_cost) { best_cost = cost_this_size; best_luma_unit_size = luma_unit_size; // Copy parameters out of rusi struct, before we overwrite it at // the start of the next iteration bool all_none = true; for (int plane = plane_start; plane <= plane_end; ++plane) { cm->rst_info[plane].frame_restoration_type = best_rtype[plane]; if (best_rtype[plane] != RESTORE_NONE) { all_none = false; const int plane_num_units = cm->rst_info[plane].num_rest_units; for (int u = 0; u < plane_num_units; ++u) { copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u], &cm->rst_info[plane].unit_info[u]); } } } // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we // couldn't find any good filters at this size. 
So we likely won't find // any good filters at a smaller size either, so skip if (all_none) { break; } } else { // Heuristic: If this size is worse than the previous (larger) size, then // the next size down will likely be even worse, so skip break; } } // Final fixup to set the correct unit size // We set this for all planes, even ones we have skipped searching, // so that other code does not need to care which planes were and weren't // searched for (int plane = 0; plane < num_planes; ++plane) { set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0, best_luma_unit_size); } #if HAVE_AVX2 || HAVE_NEON || HAVE_SVE #if HAVE_AVX2 bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter && !highbd; #elif HAVE_NEON || HAVE_SVE bool free_buffers = !cpi->sf.lpf_sf.disable_wiener_filter; #endif if (free_buffers) { aom_free(cpi->pick_lr_ctxt.dgd_avg); cpi->pick_lr_ctxt.dgd_avg = NULL; } #endif for (int plane = 0; plane < num_planes; plane++) { aom_free(cpi->pick_lr_ctxt.rusi[plane]); cpi->pick_lr_ctxt.rusi[plane] = NULL; } } aom-3.12.1/av1/encoder/pickrst.h000066400000000000000000000114071477627663500163420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_PICKRST_H_ #define AOM_AV1_ENCODER_PICKRST_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/encoder.h" struct yv12_buffer_config; struct AV1_COMP; // Enable extra debugging for loop restoration costing? // // If this is set to 1, then we record not just the selected LR parameters, but // also the values which the search process thinks they should be delta-coded // against. Then, when writing out the bitstream, we verify this information, // to help ensure that the search code is costing things properly #define DEBUG_LR_COSTING 0 #if DEBUG_LR_COSTING #define MAX_LR_UNITS_W 64 #define MAX_LR_UNITS_H 64 // Storage for reference parameters. // // The storage size is determined by: // * This is always written and then checked within the same frame encode pass, // so we do not need to buffer multiple frames of data // * The parameters can be different per plane within one frame // * The relevant set of ref parameters can differ between the search where // we set the frame restoration mode to RESTORE_WIENER, and the search where // we set it to RESTORE_SWITCHABLE. 
// So we need to store at least two sets of Wiener params and two sets of // SGR params, and the easiest way to do this is to index by // frame_restoration_type extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; #endif // DEBUG_LR_COSTING static const uint8_t g_shuffle_stats_data[16] = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, }; static const uint8_t g_shuffle_stats_highbd_data[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, }; static inline uint8_t find_average(const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; for (int i = v_start; i < v_end; i++) { for (int j = h_start; j < h_end; j++) { sum += src[i * stride + j]; } } uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); return (uint8_t)avg; } #if CONFIG_AV1_HIGHBITDEPTH static inline uint16_t find_average_highbd(const uint16_t *src, int h_start, int h_end, int v_start, int v_end, int stride) { uint64_t sum = 0; for (int i = v_start; i < v_end; i++) { for (int j = h_start; j < h_end; j++) { sum += src[i * stride + j]; } } uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); return (uint16_t)avg; } #endif /*!\brief Algorithm for AV1 loop restoration search and estimation. * * \ingroup in_loop_restoration * This function determines proper restoration filter types and * associated parameters for each restoration unit in a frame. * * \param[in] sd Source frame buffer * \param[in,out] cpi Top-level encoder structure * * \remark Nothing is returned. Instead, chosen restoration filter * types and parameters are stored per plane in the \c rst_info structure * of type \ref RestorationInfo inside \c cpi->common: * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists * \par * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2 * are populated: * \arg \c rst_info[ \c p ].\c frame_restoration_type * \arg \c rst_info[ \c p ].\c unit_info[ \c u ], * for each \c u in 0, 1, ..., \c n( \c p ) - 1, * where \c n( \c p ) is the number of restoration units in plane \c p. * \par * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ], * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type * \ref RestorationUnitInfo are populated: * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR * neither, depending on * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type * */ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PICKRST_H_ aom-3.12.1/av1/encoder/pustats.h000066400000000000000000000215671477627663500163760ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_PUSTATS_H_ #define AOM_AV1_ENCODER_PUSTATS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/ml.h" #define NUM_FEATURES_PUSTATS 8 #define NUM_HIDDEN_LAYERS 2 #define HIDDEN_LAYERS_0_NODES 12 #define HIDDEN_LAYERS_1_NODES 10 #define LOGITS_NODES 1 static const float av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, }; static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, }; static const float av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, -2.7566f, }; static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f, }; static const float av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f, 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, }; static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { 4.5103f, }; static const NN_CONFIG av1_pustats_rate_nnconfig = { NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes { av1_pustats_rate_hiddenlayer_0_kernel, av1_pustats_rate_hiddenlayer_1_kernel, 
av1_pustats_rate_logits_kernel, }, { av1_pustats_rate_hiddenlayer_0_bias, av1_pustats_rate_hiddenlayer_1_bias, av1_pustats_rate_logits_bias, }, }; static const float av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * HIDDEN_LAYERS_0_NODES] = { -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, }; static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, }; static const float av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * HIDDEN_LAYERS_1_NODES] = { -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, -0.4164f, }; static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, }; static const float av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f, }; static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { 2.3371f, }; static const NN_CONFIG av1_pustats_dist_nnconfig = { NUM_FEATURES_PUSTATS, // num_inputs LOGITS_NODES, // num_outputs NUM_HIDDEN_LAYERS, // num_hidden_layers { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes { av1_pustats_dist_hiddenlayer_0_kernel, av1_pustats_dist_hiddenlayer_1_kernel, av1_pustats_dist_logits_kernel, }, { av1_pustats_dist_hiddenlayer_0_bias, av1_pustats_dist_hiddenlayer_1_bias, av1_pustats_dist_logits_bias, }, }; #undef NUM_HIDDEN_LAYERS #undef 
HIDDEN_LAYERS_0_NODES #undef HIDDEN_LAYERS_1_NODES #undef LOGITS_NODES #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_PUSTATS_H_ aom-3.12.1/av1/encoder/random.h000066400000000000000000000054451477627663500161500ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_RANDOM_H_ #define AOM_AV1_ENCODER_RANDOM_H_ #include #ifdef __cplusplus extern "C" { #endif // Advance the generator to its next state, and generate the next 32-bit output. // Note that the low bits of this output are comparatively low-quality, so users // of this function should ensure that the high bits factor through to their // outputs. static inline uint32_t lcg_next(uint32_t *state) { *state = (uint32_t)(*state * 1103515245ULL + 12345); return *state; } // Generate a random number in the range [0, 32768). static inline uint32_t lcg_rand16(uint32_t *state) { return (lcg_next(state) / 65536) % 32768; } // Generate a random number in the range [0, n) // This is implemented as (rand() * n) / rather than // rand() % n, for a few reasons: This implementation is faster and less biased, // and if is a power of 2, this uses the higher-quality top bits from the RNG // output rather than the lower-quality bottom bits. static inline uint32_t lcg_randint(uint32_t *state, uint32_t n) { uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32; return (uint32_t)v; } // Generate a random number in the range [lo, hi) static inline uint32_t lcg_randrange(uint32_t *state, uint32_t lo, uint32_t hi) { assert(lo < hi); return lo + lcg_randint(state, hi - lo); } // Pick k distinct numbers from the set {0, ..., n-1} // All possible sets of k numbers, and all possible orderings of those numbers, // are equally likely. // // Note: The algorithm used here uses resampling to avoid choosing repeated // values. This works well as long as n >> k, but can potentially lead to many // resampling attempts if n is equal to or only slightly larger than k. static inline void lcg_pick(int n, int k, int *out, unsigned int *seed) { assert(0 <= k && k <= n); for (int i = 0; i < k; i++) { int v; // Inner resampling loop // We have to use a goto here because C does not have a multi-level continue // statement resample: v = (int)lcg_randint(seed, n); for (int j = 0; j < i; j++) { if (v == out[j]) { // Repeated v, resample goto resample; } } // New v, accept out[i] = v; } } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RANDOM_H_ aom-3.12.1/av1/encoder/ratectrl.c000066400000000000000000005143051477627663500165030ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_ports/aom_once.h" #include "av1/common/alloccommon.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/common/common.h" #include "av1/common/entropymode.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "config/aom_dsp_rtcd.h" #define USE_UNRESTRICTED_Q_IN_CQ_MODE 0 // Max rate target for 1080P and below encodes under normal circumstances // (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB #define MAX_MB_RATE 250 #define MAXRATE_1080P 2025000 #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 #define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0 #define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2 #define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0 #define FRAME_OVERHEAD_BITS 200 #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ switch (bit_depth) { \ case AOM_BITS_8: name = name##_8; break; \ case AOM_BITS_10: name = name##_10; break; \ case AOM_BITS_12: name = name##_12; break; \ default: \ assert(0 && \ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \ " or AOM_BITS_12"); \ name = NULL; \ } \ } while (0) // Tables relating active max Q to active min Q static int kf_low_motion_minq_8[QINDEX_RANGE]; static int kf_high_motion_minq_8[QINDEX_RANGE]; static int arfgf_low_motion_minq_8[QINDEX_RANGE]; static int arfgf_high_motion_minq_8[QINDEX_RANGE]; static int inter_minq_8[QINDEX_RANGE]; static int rtc_minq_8[QINDEX_RANGE]; static int kf_low_motion_minq_10[QINDEX_RANGE]; static int kf_high_motion_minq_10[QINDEX_RANGE]; static int arfgf_low_motion_minq_10[QINDEX_RANGE]; static int arfgf_high_motion_minq_10[QINDEX_RANGE]; static int inter_minq_10[QINDEX_RANGE]; static int rtc_minq_10[QINDEX_RANGE]; static int kf_low_motion_minq_12[QINDEX_RANGE]; static int kf_high_motion_minq_12[QINDEX_RANGE]; static int arfgf_low_motion_minq_12[QINDEX_RANGE]; static int arfgf_high_motion_minq_12[QINDEX_RANGE]; static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; static int gf_high = 2400; static int gf_low = 300; #ifdef STRICT_RC static int kf_high = 3200; #else static int kf_high = 5000; #endif static int kf_low = 400; // How many times less pixels there are to encode given the current scaling. // Temporary replacement for rcf_mult and rate_thresh_mult. static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg, int width, int height) { return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height); } // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. 
// The formulae were derived from computing a 3rd order polynomial best // fit to the original data (after plotting real maxq vs minq (not q index)) static int get_minq_index(double maxq, double x3, double x2, double x1, aom_bit_depth_t bit_depth) { const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); // Special case handling to deal with the step from q2.0 // down to lossless mode represented by q 1.0. if (minqtarget <= 2.0) return 0; return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1); } static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, int *arfgf_high, int *inter, int *rtc, aom_bit_depth_t bit_depth) { int i; for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = av1_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); } } static void rc_init_minq_luts(void) { init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, inter_minq_8, rtc_minq_8, AOM_BITS_8); init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, inter_minq_10, rtc_minq_10, AOM_BITS_10); init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, inter_minq_12, rtc_minq_12, AOM_BITS_12); } void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); } // These functions use formulaic calculations to make playing with the // quantizer tables easier. If necessary they can be replaced by lookup // tables if and when things settle down in the experimental bitstream double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { // Convert the index to a real Q value (scaled down to match old Q values) switch (bit_depth) { case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0; case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0; case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1.0; } } int av1_convert_q_to_qindex(double q, aom_bit_depth_t bit_depth) { int qindex = MINQ; // Find the first qindex that matches or exceeds q. // Note: this operation can also be done with a binary search, as // av1_convert_qindex_to_q() is monotonically increasing with respect to // increasing qindex. while (qindex < MAXQ && av1_convert_qindex_to_q(qindex, bit_depth) < q) { qindex++; } return qindex; } // Gets the appropriate bpmb enumerator based on the frame and content type static int get_bpmb_enumerator(FRAME_TYPE frame_type, const int is_screen_content_type) { int enumerator; if (is_screen_content_type) { enumerator = (frame_type == KEY_FRAME) ? 1000000 : 750000; } else { enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000; } return enumerator; } static int get_init_ratio(double sse) { return (int)(300000 / sse); } // Adjustment based on spatial content and last encoded keyframe. // Allow for increase in enumerator to reduce overshoot. 
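// Keyframes on spatially busy content need many more bits per MB than the
// default model predicts, so the bits-per-MB enumerator may be scaled up by
// 2x, 4x or 8x based on the measured frame spatial variance, the share of
// flat blocks, and how badly the previous keyframe overshot its target.
// For example, a first keyframe with frame_spatial_variance > 1000 (and not
// a mostly flat frame) gets an 8x enumerator; a larger enumerator makes the
// rate model predict more bits at a given Q, so the rate control settles on
// a higher Q for the same target size and keyframe overshoot is reduced.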
static int adjust_rtc_keyframe(const RATE_CONTROL *rc, int enumerator) { // Don't adjust if most of the image is flat. if (rc->perc_spatial_flat_blocks > 70) return enumerator; if (rc->last_encoded_size_keyframe == 0 || rc->frames_since_scene_change < rc->frames_since_key) { // Very first frame, or if scene change happened after last keyframe. if (rc->frame_spatial_variance > 1000 || (rc->frame_spatial_variance > 500 && rc->perc_spatial_flat_blocks == 0)) return enumerator << 3; else if (rc->frame_spatial_variance > 500 && rc->perc_spatial_flat_blocks < 10) return enumerator << 2; else if (rc->frame_spatial_variance > 400) return enumerator << 1; } else if (rc->frames_since_scene_change >= rc->frames_since_key) { // There was no scene change before previous encoded keyframe, so // use the last_encoded/target_size_keyframe. if (rc->last_encoded_size_keyframe > 4 * rc->last_target_size_keyframe && rc->frame_spatial_variance > 500) return enumerator << 3; else if (rc->last_encoded_size_keyframe > 2 * rc->last_target_size_keyframe && rc->frame_spatial_variance > 200) return enumerator << 2; else if (rc->last_encoded_size_keyframe > rc->last_target_size_keyframe) return enumerator << 1; } return enumerator; } int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double correction_factor, int accurate_estimate) { const AV1_COMMON *const cm = &cpi->common; const int is_screen_content_type = cpi->is_screen_content_type; const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; const double q = av1_convert_qindex_to_q(qindex, bit_depth); int enumerator = get_bpmb_enumerator(frame_type, is_screen_content_type); assert(correction_factor <= MAX_BPB_FACTOR && correction_factor >= MIN_BPB_FACTOR); if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME && accurate_estimate && cpi->rec_sse != UINT64_MAX) { const int mbs = cm->mi_params.MBs; const double sse_sqrt = (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) / (double)mbs; const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt) : cpi->rc.bit_est_ratio; // Clamp the enumerator to lower the q fluctuations. enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000); } else if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type == KEY_FRAME && cpi->sf.rt_sf.rc_adjust_keyframe && bit_depth == 8 && cpi->oxcf.rc_cfg.max_intra_bitrate_pct > 0 && cpi->svc.spatial_layer_id == 0) { enumerator = adjust_rtc_keyframe(&cpi->rc, enumerator); } // q based adjustment to baseline enumerator return (int)(enumerator * correction_factor / q); } int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q, double correction_factor) { const AV1_COMMON *const cm = &cpi->common; const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int mbs = cm->mi_params.MBs; const int bpm = (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor, cpi->sf.hl_sf.accurate_bit_estimate)); return AOMMAX(FRAME_OVERHEAD_BITS, (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); } static int clamp_pframe_target_size(const AV1_COMP *const cpi, int64_t target, FRAME_UPDATE_TYPE frame_update_type) { const RATE_CONTROL *rc = &cpi->rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; const int min_frame_target = AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); // Clip the frame target to the minimum setup value. 
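// min_frame_target is the larger of the configured per-frame minimum and
// 1/32 of the average frame bandwidth; e.g. with avg_frame_bandwidth of
// 64000 bits (and a smaller configured minimum) the floor is 2000 bits.
// Overlay frames are pushed all the way down to this floor, since the ARF
// they overlay has typically already spent the bits for this position.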
  if (frame_update_type == OVERLAY_UPDATE ||
      frame_update_type == INTNL_OVERLAY_UPDATE) {
    // If there is an active ARF at this location use the minimum
    // bits on this frame even if it is a constructed arf.
    // The active maximum quantizer ensures that an appropriate
    // number of bits will be spent if needed for constructed ARFs.
    target = min_frame_target;
  } else if (target < min_frame_target) {
    target = min_frame_target;
  }

  // Clip the frame target to the maximum allowed value.
  if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
  if (rc_cfg->max_inter_bitrate_pct) {
    const int64_t max_rate =
        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
    target = AOMMIN(target, max_rate);
  }
  return (int)target;
}

static int clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
  const RATE_CONTROL *rc = &cpi->rc;
  const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
  if (rc_cfg->max_intra_bitrate_pct) {
    const int64_t max_rate =
        (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
    target = AOMMIN(target, max_rate);
  }
  if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
  return (int)target;
}

// Update the buffer level for the higher temporal layers, given the encoded
// size of the current temporal layer.
static void update_layer_buffer_level(SVC *svc, int encoded_frame_size,
                                      bool is_screen) {
  const int current_temporal_layer = svc->temporal_layer_id;
  for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
       ++i) {
    const int layer =
        LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
    LAYER_CONTEXT *lc = &svc->layer_context[layer];
    PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
    lp_rc->bits_off_target +=
        (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
    // Clip buffer level to maximum buffer size for the layer.
    lp_rc->bits_off_target =
        AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
    lp_rc->buffer_level = lp_rc->bits_off_target;
    // For screen-content mode: don't let the buffer level go below the
    // threshold, given here as -rc->maximum_buffer_size, to allow the buffer
    // to come back up sooner after a slide change with big overshoot.
    if (is_screen) {
      lp_rc->bits_off_target =
          AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size);
      lp_rc->buffer_level = lp_rc->bits_off_target;
    }
  }
}

// Update the buffer level: leaky bucket model.
static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
  const AV1_COMMON *const cm = &cpi->common;
  RATE_CONTROL *const rc = &cpi->rc;
  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;

  // Non-viewable frames are a special case and are treated as pure overhead.
  if (!cm->show_frame)
    p_rc->bits_off_target -= encoded_frame_size;
  else
    p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;

  // Clip the buffer level to the maximum specified buffer size.
  p_rc->bits_off_target =
      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
  // For screen-content mode: don't let the buffer level go below the
  // threshold, given here as -rc->maximum_buffer_size, to allow the buffer
  // to come back up sooner after a slide change with big overshoot.
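  // Illustrative numbers (assumed, not from a real stream): with
  // maximum_buffer_size = 1000000 bits, a slide change that overshoots so
  // badly that bits_off_target would otherwise reach -3000000 is clamped at
  // -1000000; if later frames then undershoot by ~30000 bits each, the
  // buffer recovers to zero in ~34 frames instead of ~100.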
if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) p_rc->bits_off_target = AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size); p_rc->buffer_level = p_rc->bits_off_target; if (cpi->ppi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size, cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); #if CONFIG_FPMT_TEST /* The variable temp_buffer_level is introduced for quality * simulation purpose, it retains the value previous to the parallel * encode frames. The variable is updated based on the update flag. * * If there exist show_existing_frames between parallel frames, then to * retain the temp state do not update it. */ int show_existing_between_parallel_frames = (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { p_rc->temp_buffer_level = p_rc->buffer_level; } #endif } int av1_rc_get_default_min_gf_interval(int width, int height, double framerate) { // Assume we do not need any constraint lower than 4K 20 fps static const double factor_safe = 3840 * 2160 * 20.0; const double factor = (double)width * height * framerate; const int default_interval = clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); if (factor <= factor_safe) return default_interval; else return AOMMAX(default_interval, (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); // Note this logic makes: // 4K24: 5 // 4K30: 6 // 4K60: 12 } // Note get_default_max_gf_interval() requires the min_gf_interval to // be passed in to ensure that the max_gf_interval returned is at least as big // as that. static int get_default_max_gf_interval(double framerate, int min_gf_interval) { int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); interval += (interval & 0x01); // Round to even value interval = AOMMAX(MAX_GF_INTERVAL, interval); return AOMMAX(interval, min_gf_interval); } void av1_primary_rc_init(const AV1EncoderConfig *oxcf, PRIMARY_RATE_CONTROL *p_rc) { const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; int worst_allowed_q = rc_cfg->worst_allowed_q; int min_gf_interval = oxcf->gf_cfg.min_gf_interval; int max_gf_interval = oxcf->gf_cfg.max_gf_interval; if (min_gf_interval == 0) min_gf_interval = av1_rc_get_default_min_gf_interval( oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, oxcf->input_cfg.init_framerate); if (max_gf_interval == 0) max_gf_interval = get_default_max_gf_interval( oxcf->input_cfg.init_framerate, min_gf_interval); p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2; p_rc->this_key_frame_forced = 0; p_rc->next_key_frame_forced = 0; p_rc->ni_frames = 0; p_rc->tot_q = 0.0; p_rc->total_actual_bits = 0; p_rc->total_target_bits = 0; p_rc->buffer_level = p_rc->starting_buffer_level; if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) { worst_allowed_q = 255; } if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) { p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q; p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q; } else { p_rc->avg_frame_qindex[KEY_FRAME] = (worst_allowed_q + rc_cfg->best_allowed_q) / 2; p_rc->avg_frame_qindex[INTER_FRAME] = (worst_allowed_q + rc_cfg->best_allowed_q) / 2; } p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q, oxcf->tool_cfg.bit_depth); p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q; p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q; for (int i = 0; i < 
RATE_FACTOR_LEVELS; ++i) { p_rc->rate_correction_factors[i] = 0.7; } p_rc->rate_correction_factors[KF_STD] = 1.0; p_rc->bits_off_target = p_rc->starting_buffer_level; p_rc->rolling_target_bits = AOMMAX( 1, (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate)); p_rc->rolling_actual_bits = AOMMAX( 1, (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate)); } void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) { const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; rc->frames_since_key = 8; // Sensible default for first frame. rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist; rc->frames_till_gf_update_due = 0; rc->ni_av_qi = rc_cfg->worst_allowed_q; rc->ni_tot_qi = 0; rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval; rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval; if (rc->min_gf_interval == 0) rc->min_gf_interval = av1_rc_get_default_min_gf_interval( oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, oxcf->input_cfg.init_framerate); if (rc->max_gf_interval == 0) rc->max_gf_interval = get_default_max_gf_interval( oxcf->input_cfg.init_framerate, rc->min_gf_interval); rc->avg_frame_low_motion = 0; rc->resize_state = ORIG; rc->resize_avg_qp = 0; rc->resize_buffer_underflow = 0; rc->resize_count = 0; rc->rtc_external_ratectrl = 0; rc->frame_level_fast_extra_bits = 0; rc->use_external_qp_one_pass = 0; rc->percent_blocks_inactive = 0; rc->force_max_q = 0; rc->postencode_drop = 0; rc->frames_since_scene_change = 0; } static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level, int drop_mark) { SVC *svc = &cpi->svc; if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 || cpi->svc.framedrop_mode == AOM_LAYER_DROP) { return (buffer_level <= drop_mark); } else { // For SVC in the AOM_FULL_SUPERFRAME_DROP): the condition on // buffer is checked on current and upper spatial layers. for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; PRIMARY_RATE_CONTROL *lrc = &lc->p_rc; // Exclude check for layer whose bitrate is 0. if (lc->target_bandwidth > 0) { const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark; const int drop_mark_layer = (int)(drop_thresh * lrc->optimal_buffer_level / 100); if (lrc->buffer_level <= drop_mark_layer) return true; } } return false; } } int av1_rc_drop_frame(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int64_t buffer_level = simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level; #else int64_t buffer_level = p_rc->buffer_level; #endif // Never drop on key frame, or for frame whose base layer is key. // If drop_count_consec hits or exceeds max_consec_drop then don't drop. if (cpi->common.current_frame.frame_type == KEY_FRAME || (cpi->ppi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || !oxcf->rc_cfg.drop_frames_water_mark || (rc->max_consec_drop > 0 && rc->drop_count_consec >= rc->max_consec_drop)) { return 0; } else { SVC *svc = &cpi->svc; // In the full_superframe framedrop mode for svc, if the previous spatial // layer was dropped, drop the current spatial layer. 
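    // In AOM_FULL_SUPERFRAME_DROP mode a superframe is either kept or
    // dropped as a whole, so once a lower spatial layer of this superframe
    // has been dropped the remaining layers cannot be sent on their own.
    // Further down, a buffer that has gone negative forces an unconditional
    // drop, while a buffer merely below drop_mark triggers the
    // every-other-frame decimation behavior.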
if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && svc->drop_spatial_layer[svc->spatial_layer_id - 1] && svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP) return 1; // -1 is passed here for drop_mark since we are checking if // buffer goes below 0 (<= -1). if (check_buffer_below_thresh(cpi, buffer_level, -1)) { // Always drop if buffer is below 0. rc->drop_count_consec++; return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * p_rc->optimal_buffer_level / 100); const bool buffer_below_thresh = check_buffer_below_thresh(cpi, buffer_level, drop_mark); if (!buffer_below_thresh && rc->decimation_factor > 0) { --rc->decimation_factor; } else if (buffer_below_thresh && rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { if (rc->decimation_count > 0) { --rc->decimation_count; rc->drop_count_consec++; return 1; } else { rc->decimation_count = rc->decimation_factor; return 0; } } else { rc->decimation_count = 0; return 0; } } } } static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality, int width, int height) { const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1_COMMON *const cm = &cpi->common; const SVC *const svc = &cpi->svc; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; // Flag to indicate previous frame has overshoot, and buffer level // for current frame is low (less than ~half of optimal). For such // (inter) frames, if the source_sad is non-zero, relax the max_delta_up // and clamp applied below. const bool overshoot_buffer_low = cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 && p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) && rc->frames_since_key > 4; int max_delta_down; int max_delta_up = overshoot_buffer_low ? 120 : 20; const int change_avg_frame_bandwidth = abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) > 0.1 * (rc->avg_frame_bandwidth); // Set the maximum adjustment down for Q for this frame. if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->apply_cyclic_refresh) { // For static screen type content limit the Q drop till the start of the // next refresh cycle. if (cpi->is_screen_content_type && (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) { max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32)); } else { max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8)); } if (!cpi->ppi->use_svc && cpi->is_screen_content_type) { // Link max_delta_up to max_delta_down and buffer status. if (p_rc->buffer_level > p_rc->optimal_buffer_level) { max_delta_up = AOMMAX(4, max_delta_down); } else if (!overshoot_buffer_low) { max_delta_up = AOMMAX(8, max_delta_down); } } } else { max_delta_down = (cpi->is_screen_content_type) ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16)) : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8)); } // For screen static content with stable buffer level: relax the // limit on max_delta_down and apply bias qp, based on buffer fullness. // Only for high speeds levels for now to avoid bdrate regression. 
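  // For reference: with the defaults above, max_delta_down is at most 16 (8
  // for screen content), e.g. q_1_frame = 128 allows a drop of 16 steps for
  // camera content, and max_delta_up is 20 (120 right after an overshoot
  // with a low buffer). The static-screen case below relaxes this further:
  // max_delta_down is raised to at least 32 (60 when the buffer is above
  // optimal), max_delta_up is capped at 4, and a one-time bias of 16 (or 32)
  // steps is applied when the previous frame undershot.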
if (cpi->sf.rt_sf.rc_faster_convergence_static == 1 && cpi->sf.rt_sf.check_scene_detection && rc->frame_source_sad == 0 && rc->static_since_last_scene_change && p_rc->buffer_level > (p_rc->optimal_buffer_level >> 1) && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->counter_encode_maxq_scene_change > 4) { int qp_delta = 32; int qp_bias = 16; if (p_rc->buffer_level > p_rc->optimal_buffer_level) { qp_delta = 60; qp_bias = 32; } if (cpi->rc.rc_1_frame == 1) q = q - qp_bias; max_delta_down = AOMMAX(max_delta_down, qp_delta); max_delta_up = AOMMIN(max_delta_up, 4); } // If resolution changes or avg_frame_bandwidth significantly changed, // then set this flag to indicate change in target bits per macroblock. const int change_target_bits_mb = cm->prev_frame && (width != cm->prev_frame->width || height != cm->prev_frame->height || change_avg_frame_bandwidth); // Apply some control/clamp to QP under certain conditions. // Delay the use of the clamping for svc until after num_temporal_layers, // to make they have been set for each temporal layer. // Check for rc->q_1/2_frame > 0 in case they have not been set due to // dropped frames. if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 && rc->q_1_frame > 0 && rc->q_2_frame > 0 && (!cpi->ppi->use_svc || svc->current_superframe > (unsigned int)svc->number_temporal_layers) && !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) { // If in the previous two frames we have seen both overshoot and undershoot // clamp Q between the two. if (rc->rc_1_frame * rc->rc_2_frame == -1 && rc->q_1_frame != rc->q_2_frame && !overshoot_buffer_low) { int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), AOMMAX(rc->q_1_frame, rc->q_2_frame)); // If the previous frame had overshoot and the current q needs to // increase above the clamped value, reduce the clamp for faster reaction // to overshoot. if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10) q = (q + qclamp) >> 1; else q = qclamp; } // Adjust Q base on source content change from scene detection. if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 && rc->frames_since_key > 10 && rc->frame_source_sad > 0 && !cpi->rc.rtc_external_ratectrl) { const int bit_depth = cm->seq_params->bit_depth; double delta = (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0; // Push Q downwards if content change is decreasing and buffer level // is stable (at least 1/4-optimal level), so not overshooting. Do so // only for high Q to avoid excess overshoot. // Else reduce decrease in Q from previous frame if content change is // increasing and buffer is below max (so not undershooting). if (delta < 0.0 && p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && q > (rc->worst_quality >> 1)) { double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta); double q_val = av1_convert_qindex_to_q(q, bit_depth); q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } else if (rc->q_1_frame - q > 0 && delta > 0.1 && p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size, p_rc->optimal_buffer_level << 1)) { q = (3 * q + rc->q_1_frame) >> 2; } } // Limit the decrease in Q from previous frame. if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down; // Limit the increase in Q from previous frame. else if (q - rc->q_1_frame > max_delta_up) q = rc->q_1_frame + max_delta_up; } // Adjustment for temporal layers. 
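  // For enhancement layers this keeps q from dropping more than 4 steps
  // below the last encoded TL0 q when the layer's bandwidth is lower than
  // TL0's; for the base layer it pushes q down by 4 (two temporal layers) or
  // 10 (more layers) when the buffer is above a quarter of optimal and the
  // source SAD is small.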
if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 && !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) { if (svc->temporal_layer_id > 0) { // Constrain enhancement relative to the previous base TL0. // Get base temporal layer TL0. const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the // last TL0 frame. const int last_qindex_tl0 = rc->frames_since_key < svc->number_temporal_layers ? lc->p_rc.last_q[KEY_FRAME] : lc->p_rc.last_q[INTER_FRAME]; if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth && q < last_qindex_tl0 - 4) q = last_qindex_tl0 - 4; } else if (cpi->svc.temporal_layer_id == 0 && !frame_is_intra_only(cm) && p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && rc->frame_source_sad < 100000) { // Push base TL0 Q down if buffer is stable and frame_source_sad // is below threshold. int delta = (svc->number_temporal_layers == 2) ? 4 : 10; q = q - delta; } } // For non-svc (single layer): if resolution has increased push q closer // to the active_worst to avoid excess overshoot. if (!cpi->ppi->use_svc && cm->prev_frame && (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height)) q = (q + active_worst_quality) >> 1; // For single layer RPS: Bias Q based on distance of closest reference. if (cpi->ppi->rtc_ref.bias_recovery_frame) { const int min_dist = av1_svc_get_min_ref_dist(cpi); q = q - AOMMIN(min_dist, 20); } return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); } static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { KF_STD, // KF_UPDATE INTER_NORMAL, // LF_UPDATE GF_ARF_STD, // GF_UPDATE GF_ARF_STD, // ARF_UPDATE INTER_NORMAL, // OVERLAY_UPDATE INTER_NORMAL, // INTNL_OVERLAY_UPDATE GF_ARF_LOW, // INTNL_ARF_UPDATE }; static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group, int gf_frame_index) { const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; assert(update_type < FRAME_UPDATE_TYPES); return rate_factor_levels[update_type]; } /*!\brief Gets a rate vs Q correction factor * * This function returns the current value of a correction factor used to * dynamically adjust the relationship between Q and the expected number * of bits for the frame. * * \ingroup rate_control * \param[in] cpi Top level encoder instance structure * \param[in] width Frame width * \param[in] height Frame height * * \return Returns a correction factor for the current frame */ static double get_rate_correction_factor(const AV1_COMP *cpi, int width, int height) { const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; double rcf; double rate_correction_factors_kfstd; double rate_correction_factors_gfarfstd; double rate_correction_factors_internormal; rate_correction_factors_kfstd = (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) ? rc->frame_level_rate_correction_factors[KF_STD] : p_rc->rate_correction_factors[KF_STD]; rate_correction_factors_gfarfstd = (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) ? rc->frame_level_rate_correction_factors[GF_ARF_STD] : p_rc->rate_correction_factors[GF_ARF_STD]; rate_correction_factors_internormal = (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) ? 
rc->frame_level_rate_correction_factors[INTER_NORMAL] : p_rc->rate_correction_factors[INTER_NORMAL]; if (cpi->common.current_frame.frame_type == KEY_FRAME) { rcf = rate_correction_factors_kfstd; } else if (is_stat_consumption_stage(cpi)) { const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); double rate_correction_factors_rflvl = (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) ? rc->frame_level_rate_correction_factors[rf_lvl] : p_rc->rate_correction_factors[rf_lvl]; rcf = rate_correction_factors_rflvl; } else { if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) && !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) rcf = rate_correction_factors_gfarfstd; else rcf = rate_correction_factors_internormal; } rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height); return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); } /*!\brief Sets a rate vs Q correction factor * * This function updates the current value of a correction factor used to * dynamically adjust the relationship between Q and the expected number * of bits for the frame. * * \ingroup rate_control * \param[in] cpi Top level encoder instance structure * \param[in] is_encode_stage Indicates if recode loop or post-encode * \param[in] factor New correction factor * \param[in] width Frame width * \param[in] height Frame height * * \remark Updates the rate correction factor for the * current frame type in cpi->rc. */ static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage, double factor, int width, int height) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; int update_default_rcf = 1; // Normalize RCF to account for the size-dependent scaling factor. factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height); factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); if (cpi->common.current_frame.frame_type == KEY_FRAME) { p_rc->rate_correction_factors[KF_STD] = factor; } else if (is_stat_consumption_stage(cpi)) { const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); if (is_encode_stage && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { rc->frame_level_rate_correction_factors[rf_lvl] = factor; update_default_rcf = 0; } if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor; } else { if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) && !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) { p_rc->rate_correction_factors[GF_ARF_STD] = factor; } else { if (is_encode_stage && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor; update_default_rcf = 0; } if (update_default_rcf) p_rc->rate_correction_factors[INTER_NORMAL] = factor; } } } void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage, int width, int height) { const AV1_COMMON *const cm = &cpi->common; double correction_factor = 1.0; double rate_correction_factor = get_rate_correction_factor(cpi, width, height); double adjustment_limit; int projected_size_based_on_q = 0; int cyclic_refresh_active = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled; // Do not update the rate factors for arf overlay frames. 
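  // (An overlay frame mostly reuses the already-coded ARF and typically
  // spends very few bits, so folding it into the Q-to-bits correction factor
  // would skew the estimate for normal frames.)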
if (cpi->rc.is_src_frame_alt_ref) return; // Don't update rate correction factors here on scene changes as // it is already reset in av1_encodedframe_overshoot_cbr(), // but reset variables related to previous frame q and size. // Note that the counter of frames since the last scene change // is only valid when cyclic refresh mode is enabled and that // this break out only applies to scene changes that are not // recorded as INTRA only key frames. // Note that av1_encodedframe_overshoot_cbr() is only entered // if cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ // and cpi->rc.high_source_sad = 1. if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) && (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ) && cpi->rc.high_source_sad && (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) && !frame_is_intra_only(cm) && !cpi->ppi->use_svc) { cpi->rc.q_2_frame = cm->quant_params.base_qindex; cpi->rc.q_1_frame = cm->quant_params.base_qindex; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; return; } // Clear down mmx registers to allow floating point in what follows // Work out how big we would have expected the frame to be at this Q given // the current correction factor. // Stay in double to avoid int overflow when values are large if (cyclic_refresh_active) { projected_size_based_on_q = av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); } else { projected_size_based_on_q = av1_estimate_bits_at_q( cpi, cm->quant_params.base_qindex, rate_correction_factor); } // Work out a size correction factor. if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) correction_factor = (double)cpi->rc.projected_frame_size / (double)projected_size_based_on_q; // Clamp correction factor to prevent anything too extreme correction_factor = AOMMAX(correction_factor, 0.25); cpi->rc.q_2_frame = cpi->rc.q_1_frame; cpi->rc.q_1_frame = cm->quant_params.base_qindex; cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; if (correction_factor > 1.1) cpi->rc.rc_1_frame = -1; else if (correction_factor < 0.9) cpi->rc.rc_1_frame = 1; else cpi->rc.rc_1_frame = 0; // Decide how heavily to dampen the adjustment if (correction_factor > 0.0) { if (cpi->is_screen_content_type) { adjustment_limit = 0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor))); } else { adjustment_limit = 0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor))); } } else { adjustment_limit = 0.75; } // Adjustment to delta Q and number of blocks updated in cyclic refresh // based on over or under shoot of target in current frame. 
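  // Illustrative numbers (assumed): if the frame used 1.5x the bits
  // predicted at its q, correction_factor = 1.5, log10(1.5) ~= 0.176, so for
  // camera content adjustment_limit ~= 0.25 + 0.75 * 0.176 ~= 0.38 and the
  // rate correction factor is scaled by about 1 + 0.5 * 0.38 ~= 1.19 rather
  // than the full 1.5. The same correction_factor drives the cyclic refresh
  // tweaks below: > 1.25 backs off the refresh, < 0.5 increases it.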
if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; if (correction_factor > 1.25) { cr->percent_refresh_adjustment = AOMMAX(cr->percent_refresh_adjustment - 1, -5); cr->rate_ratio_qdelta_adjustment = AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0); } else if (correction_factor < 0.5) { cr->percent_refresh_adjustment = AOMMIN(cr->percent_refresh_adjustment + 1, 5); cr->rate_ratio_qdelta_adjustment = AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25); } } if (correction_factor > 1.01) { // We are not already at the worst allowable quality correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit)); rate_correction_factor = rate_correction_factor * correction_factor; // Keep rate_correction_factor within limits if (rate_correction_factor > MAX_BPB_FACTOR) rate_correction_factor = MAX_BPB_FACTOR; } else if (correction_factor < 0.99) { // We are not already at the best allowable quality correction_factor = 1.0 / correction_factor; correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit)); correction_factor = 1.0 / correction_factor; rate_correction_factor = rate_correction_factor * correction_factor; // Keep rate_correction_factor within limits if (rate_correction_factor < MIN_BPB_FACTOR) rate_correction_factor = MIN_BPB_FACTOR; } set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor, width, height); } // Calculate rate for the given 'q'. static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, double correction_factor, int q) { const AV1_COMMON *const cm = &cpi->common; return use_cyclic_refresh ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q, correction_factor, cpi->sf.hl_sf.accurate_bit_estimate); } /*!\brief Searches for a Q index value predicted to give an average macro * block rate closest to the target value. * * Similar to find_qindex_by_rate() function, but returns a q index with a * rate just above or below the desired rate, depending on which of the two * rates is closer to the desired rate. * Also, respects the selected aq_mode when computing the rate. * * \ingroup rate_control * \param[in] desired_bits_per_mb Target bits per mb * \param[in] cpi Top level encoder instance structure * \param[in] correction_factor Current Q to rate correction factor * \param[in] best_qindex Min allowed Q value. * \param[in] worst_qindex Max allowed Q value. * * \return Returns a correction factor for the current frame */ static int find_closest_qindex_by_rate(int desired_bits_per_mb, const AV1_COMP *cpi, double correction_factor, int best_qindex, int worst_qindex) { const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->cyclic_refresh->apply_cyclic_refresh; // Find 'qindex' based on 'desired_bits_per_mb'. assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; const int mid_bits_per_mb = get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; } else { high = mid; } } assert(low == high); // Calculate rate difference of this q index from the desired rate. const int curr_q = low; const int curr_bits_per_mb = get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q); const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb) ? 
desired_bits_per_mb - curr_bits_per_mb : INT_MAX; assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) || curr_q == worst_qindex); // Calculate rate difference for previous q index too. const int prev_q = curr_q - 1; int prev_bit_diff; if (curr_bit_diff == INT_MAX || curr_q == best_qindex) { prev_bit_diff = INT_MAX; } else { const int prev_bits_per_mb = get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q); assert(prev_bits_per_mb > desired_bits_per_mb); prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb; } // Pick one of the two q indices, depending on which one has rate closer to // the desired rate. return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q; } int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality, int width, int height) { const int MBs = av1_get_MBs(width, height); const double correction_factor = get_rate_correction_factor(cpi, width, height); const int target_bits_per_mb = (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs); int q = find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, active_best_quality, active_worst_quality); if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi)) return adjust_q_cbr(cpi, q, active_worst_quality, width, height); return q; } static int get_active_quality(int q, int gfu_boost, int low, int high, int *low_motion_minq, int *high_motion_minq) { if (gfu_boost > high) { return low_motion_minq[q]; } else if (gfu_boost < low) { return high_motion_minq[q]; } else { const int gap = high - low; const int offset = high - gfu_boost; const int qdiff = high_motion_minq[q] - low_motion_minq[q]; const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; return low_motion_minq[q] + adjustment; } } static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, aom_bit_depth_t bit_depth) { int *kf_low_motion_minq; int *kf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, kf_high_motion_minq); } static int get_gf_active_quality_no_rc(int gfu_boost, int q, aom_bit_depth_t bit_depth) { int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, aom_bit_depth_t bit_depth) { return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth); } static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); return arfgf_high_motion_minq[q]; } static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const unsigned int curr_frame = cpi->common.current_frame.frame_number; int active_worst_quality; int last_q_key_frame; int last_q_inter_frame; #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; last_q_key_frame = simulate_parallel_frame ? 
p_rc->temp_last_q[KEY_FRAME] : p_rc->last_q[KEY_FRAME]; last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME] : p_rc->last_q[INTER_FRAME]; #else last_q_key_frame = p_rc->last_q[KEY_FRAME]; last_q_inter_frame = p_rc->last_q[INTER_FRAME]; #endif if (cpi->common.current_frame.frame_type == KEY_FRAME) { active_worst_quality = curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2; } else { if (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame || refresh_frame->alt_ref_frame)) { active_worst_quality = curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame; } else { active_worst_quality = curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2; } } return AOMMIN(active_worst_quality, rc->worst_quality); } // Adjust active_worst_quality level based on buffer level. static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { // Adjust active_worst_quality: If buffer is above the optimal/target level, // bring active_worst_quality down depending on fullness of buffer. // If buffer is below the optimal level, let the active_worst_quality go from // ambient Q (at buffer = optimal level) to worst_quality level // (at buffer = critical level). const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *rc = &cpi->rc; const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; const SVC *const svc = &cpi->svc; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; // Buffer level below which we push active_worst to worst_quality. int64_t critical_level = p_rc->optimal_buffer_level >> 3; int64_t buff_lvl_step = 0; int adjustment = 0; int active_worst_quality; int ambient_qp; if (frame_is_intra_only(cm)) return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. // So for first few frames following key, the qp of that key frame is weighted // into the active_worst_quality setting. For SVC the key frame should // correspond to layer (0, 0), so use that for layer context. int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME]; if (svc->number_temporal_layers > 1) { int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers); const LAYER_CONTEXT *lc = &svc->layer_context[layer]; const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; avg_qindex_key = AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]); } if (svc->temporal_layer_id > 0 && rc->frames_since_key < 2 * svc->number_temporal_layers) { ambient_qp = avg_qindex_key; } else { ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key) ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key) : p_rc->avg_frame_qindex[INTER_FRAME]; } ambient_qp = AOMMIN(rc->worst_quality, ambient_qp); if (p_rc->buffer_level > p_rc->optimal_buffer_level) { // Adjust down. 
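    // Illustrative numbers (assumed): if ambient_qp puts active_worst_quality
    // at 200 for camera content, max_adjustment_down is 200 / 3 = 66; with
    // optimal_buffer_level = 600000 and maximum_buffer_size = 1000000,
    // buff_lvl_step ~= 400000 / 66 ~= 6060, so a buffer level of 800000 maps
    // to an adjustment of ~33 and active_worst_quality ~= 167.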
int max_adjustment_down; // Maximum adjustment down for Q if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc && (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) { active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); max_adjustment_down = AOMMIN(4, active_worst_quality / 16); } else { active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); max_adjustment_down = active_worst_quality / 3; } if (max_adjustment_down) { buff_lvl_step = ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) / max_adjustment_down); if (buff_lvl_step) adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) / buff_lvl_step); active_worst_quality -= adjustment; } } else if (p_rc->buffer_level > critical_level) { // Adjust up from ambient Q. active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); if (critical_level) { buff_lvl_step = (p_rc->optimal_buffer_level - critical_level); if (buff_lvl_step) { adjustment = (int)((rc->worst_quality - ambient_qp) * (p_rc->optimal_buffer_level - p_rc->buffer_level) / buff_lvl_step); } active_worst_quality += adjustment; } } else { // Set to worst_quality if buffer is below critical level. active_worst_quality = rc->worst_quality; } return active_worst_quality; } // Calculate the active_best_quality level. static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, int active_worst_quality, int width, int height) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const CurrentFrame *const current_frame = &cm->current_frame; int *rtc_minq; const int bit_depth = cm->seq_params->bit_depth; int active_best_quality = rc->best_quality; ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); if (frame_is_intra_only(cm)) { // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. if (p_rc->this_key_frame_forced) { int qindex = p_rc->last_boosted_qindex; double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, (last_boosted_q * 0.75), bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else if (current_frame->frame_number > 0) { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; active_best_quality = get_kf_active_quality( p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { q_adj_factor -= 0.25; } // Convert the adjustment factor to a qindex delta // on active_best_quality. q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && cpi->oxcf.rc_cfg.gf_cbr_boost_pct && (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. 
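    // get_gf_active_quality() then interpolates between the low- and
    // high-motion minq tables based on gfu_boost: a boost above the high
    // threshold uses the low-motion table outright, a boost below the low
    // threshold uses the high-motion table, and e.g. a boost a quarter of
    // the way down from the high threshold gives low_motion_minq[q] plus a
    // quarter of the gap to high_motion_minq[q].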
int q = active_worst_quality; if (rc->frames_since_key > 1 && p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = p_rc->avg_frame_qindex[INTER_FRAME]; } active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. FRAME_TYPE frame_type = (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME; if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality) active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]]; else active_best_quality = rtc_minq[active_worst_quality]; } return active_best_quality; } #if RT_PASSIVE_STRATEGY static int get_q_passive_strategy(const AV1_COMP *const cpi, const int q_candidate, const int threshold) { const AV1_COMMON *const cm = &cpi->common; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; int sum = 0; int count = 0; int i = 1; while (i < MAX_Q_HISTORY) { int frame_id = current_frame->frame_number - i; if (frame_id <= 0) break; sum += p_rc->q_history[frame_id % MAX_Q_HISTORY]; ++count; ++i; } if (count > 0) { const int avg_q = sum / count; if (abs(avg_q - q_candidate) <= threshold) return avg_q; } return q_candidate; } #endif // RT_PASSIVE_STRATEGY /*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc. * * Handles the special case when using: * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are * NOT available. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[out] bottom_index Bottom bound for q index (best quality) * \param[out] top_index Top bound for q index (worst quality) * \return Returns selected q index to be used for encoding this frame. */ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; int q; int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); int active_best_quality = calc_active_best_quality_no_stats_cbr( cpi, active_worst_quality, width, height); assert(has_no_stats_stage(cpi)); assert(cpi->oxcf.rc_cfg.mode == AOM_CBR); // Clip the active best and worst quality values to limits active_best_quality = clamp(active_best_quality, rc->best_quality, rc->worst_quality); active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; // Limit Q range for the adaptive loop. 
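  // For a non-forced key frame the 2.0 rate factor below finds the q that
  // would spend roughly twice the bits per MB, which is a lower q, so the
  // resulting (negative) qdelta pulls top_index down and gives key frames a
  // tighter worst-quality bound.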
if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { int qdelta = 0; qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, active_worst_quality, 2.0); *top_index = active_worst_quality + qdelta; *top_index = AOMMAX(*top_index, *bottom_index); } q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality, width, height); #if RT_PASSIVE_STRATEGY if (current_frame->frame_type != KEY_FRAME && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { q = get_q_passive_strategy(cpi, q, 50); } #endif // RT_PASSIVE_STRATEGY if (q > *top_index) { // Special case when we are targeting the max allowed rate if (rc->this_frame_target >= rc->max_frame_bandwidth) *top_index = q; else q = *top_index; } assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); assert(q <= rc->worst_quality && q >= rc->best_quality); return q; } static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) { return gf_group->layer_depth[gf_index]; } static int get_active_cq_level(const RATE_CONTROL *rc, const PRIMARY_RATE_CONTROL *p_rc, const AV1EncoderConfig *const oxcf, int intra_only, aom_superres_mode superres_mode, int superres_denom) { const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; static const double cq_adjust_threshold = 0.1; int active_cq_level = rc_cfg->cq_level; if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) { // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); if ((superres_mode == AOM_SUPERRES_QTHRESH || superres_mode == AOM_SUPERRES_AUTO) && superres_denom != SCALE_NUMERATOR) { int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO; if (intra_only && rc->frames_to_key <= 1) { mult = 0; } else if (intra_only) { mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME; } else { mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME; } active_cq_level = AOMMAX( active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0); } } if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) { const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits; if (x < cq_adjust_threshold) { active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); } } return active_cq_level; } /*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc. * * Handles the special case when using: * - Any rate control other than constant bit-rate mode: * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are * NOT available. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[out] bottom_index Bottom bound for q index (best quality) * \param[out] top_index Top bound for q index (worst quality) * \return Returns selected q index to be used for encoding this frame. 
*/ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; assert(has_no_stats_stage(cpi)); assert(rc_mode == AOM_VBR || (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) || rc_mode == AOM_Q); const int cq_level = get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); const int bit_depth = cm->seq_params->bit_depth; int active_best_quality; int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi); int q; int *inter_minq; ASSIGN_MINQ_TABLE(bit_depth, inter_minq); if (frame_is_intra_only(cm)) { if (rc_mode == AOM_Q) { const int qindex = cq_level; const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else if (p_rc->this_key_frame_forced) { #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex : p_rc->last_boosted_qindex; #else int qindex = p_rc->last_boosted_qindex; #endif const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = av1_compute_qdelta( rc, last_boosted_q, last_boosted_q * 0.75, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; active_best_quality = get_kf_active_quality( p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { q_adj_factor -= 0.25; } // Convert the adjustment factor to a qindex delta on active_best_quality. { const double q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } } } else if (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. q = (rc->frames_since_key > 1 && p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) ? p_rc->avg_frame_qindex[INTER_FRAME] : p_rc->avg_frame_qindex[KEY_FRAME]; // For constrained quality don't allow Q less than the cq level if (rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; } else if (rc_mode == AOM_Q) { const int qindex = cq_level; const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = (refresh_frame->alt_ref_frame) ? 
av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } } else { if (rc_mode == AOM_Q) { const int qindex = cq_level; const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; const int delta_qindex = av1_compute_qdelta( rc, q_val, q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL], bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { // Use the lower of active_worst_quality and recent/average Q. active_best_quality = (current_frame->frame_number > 1) ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]] : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]]; // For the constrained quality mode we don't want // q to fall below the cq level. if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { active_best_quality = cq_level; } } } // Clip the active best and worst quality values to limits active_best_quality = clamp(active_best_quality, rc->best_quality, rc->worst_quality); active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; // Limit Q range for the adaptive loop. { int qdelta = 0; if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, active_worst_quality, 2.0); } else if (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, active_worst_quality, 1.75); } *top_index = active_worst_quality + qdelta; *top_index = AOMMAX(*top_index, *bottom_index); } if (rc_mode == AOM_Q) { q = active_best_quality; // Special case code to try and match quality with forced key frames } else if ((current_frame->frame_type == KEY_FRAME) && p_rc->this_key_frame_forced) { #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; q = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex : p_rc->last_boosted_qindex; #else q = p_rc->last_boosted_qindex; #endif } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality, width, height); if (q > *top_index) { // Special case when we are targeting the max allowed rate if (rc->this_frame_target >= rc->max_frame_bandwidth) *top_index = q; else q = *top_index; } } assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); assert(q <= rc->worst_quality && q >= rc->best_quality); return q; } static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75, 1.50, 1.25, 1.15, 1.0 }; static int frame_type_qdelta(const AV1_COMP *cpi, int q) { const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(gf_group, cpi->gf_frame_index); const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const double rate_factor = (rf_lvl == INTER_NORMAL) ? 
1.0 : arf_layer_deltas[arf_layer]; return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor); } // This unrestricted Q selection on CQ mode is useful when testing new features, // but may lead to Q being out of range on current RC restrictions #if USE_UNRESTRICTED_Q_IN_CQ_MODE static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width, int height, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); const int bit_depth = cm->seq_params->bit_depth; const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth); (void)width; (void)height; assert(has_no_stats_stage(cpi)); assert(cpi->oxcf.rc_cfg.mode == AOM_CQ); *top_index = q; *bottom_index = q; return q; } #endif // USE_UNRESTRICTED_Q_IN_CQ_MODE #define STATIC_MOTION_THRESH 95 static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, int *active_best, int *active_worst, int cq_level) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; int active_best_quality; int active_worst_quality = *active_worst; const int bit_depth = cm->seq_params->bit_depth; if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) { // If the next frame is also a key frame or the current frame is the // only frame in the sequence in AOM_Q mode, just use the cq_level // as q. active_best_quality = cq_level; active_worst_quality = cq_level; } else if (p_rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. double last_boosted_q; int delta_qindex; int qindex; #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int last_boosted_qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex : p_rc->last_boosted_qindex; #else int last_boosted_qindex = p_rc->last_boosted_qindex; #endif if (is_stat_consumption_stage_twopass(cpi) && cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex); active_best_quality = qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, last_boosted_q * 1.25, bit_depth); active_worst_quality = AOMMIN(qindex + delta_qindex, active_worst_quality); } else { qindex = last_boosted_qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, last_boosted_q * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } } else { // Not forced keyframe. double q_adj_factor = 1.0; double q_val; // Baseline value derived from active_worst_quality and kf boost. active_best_quality = get_kf_active_quality(p_rc, active_worst_quality, bit_depth); if (cpi->is_screen_content_type) { active_best_quality /= 2; } if (is_stat_consumption_stage_twopass(cpi) && cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { active_best_quality /= 3; } // Allow somewhat lower kf minq with small image formats. 
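    // 352 * 288 = 101376 pixels, i.e. CIF and smaller; for these sizes the
    // 0.25 reduction in q_adj_factor below translates, via
    // av1_compute_qdelta, into a lower active_best_quality (a better quality
    // floor).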
if ((width * height) <= (352 * 288)) { q_adj_factor -= 0.25; } // Make a further adjustment based on the kf zero motion measure. if (is_stat_consumption_stage_twopass(cpi)) q_adj_factor += 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct); // Convert the adjustment factor to a qindex delta // on active_best_quality. q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); // Tweak active_best_quality for AOM_Q mode when superres is on, as this // will be used directly as 'q' later. if (oxcf->rc_cfg.mode == AOM_Q && (cpi->superres_mode == AOM_SUPERRES_QTHRESH || cpi->superres_mode == AOM_SUPERRES_AUTO) && cm->superres_scale_denominator != SCALE_NUMERATOR) { active_best_quality = AOMMAX(active_best_quality - ((cm->superres_scale_denominator - SCALE_NUMERATOR) * SUPERRES_QADJ_PER_DENOM_KEYFRAME), 0); } } *active_best = active_best_quality; *active_worst = active_worst_quality; } static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, const int is_intrl_arf_boost, int *active_worst, int *active_best) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int active_best_quality = *active_best; int active_worst_quality = *active_worst; #if CONFIG_FPMT_TEST #endif // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. if (cpi->oxcf.rc_cfg.mode != AOM_Q) { #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; const int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq : cpi->ppi->twopass.extend_minq; const int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq : cpi->ppi->twopass.extend_maxq; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || is_intrl_arf_boost || refresh_frame->alt_ref_frame))) { active_best_quality -= extend_minq; active_worst_quality += (extend_maxq / 2); } else { active_best_quality -= extend_minq / 2; active_worst_quality += extend_maxq; } #else (void)is_intrl_arf_boost; active_best_quality -= cpi->ppi->twopass.extend_minq / 8; active_worst_quality += cpi->ppi->twopass.extend_maxq / 4; #endif } #ifndef STRICT_RC // Static forced key frames Q restrictions dealt with elsewhere. if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { const int qdelta = frame_type_qdelta(cpi, active_worst_quality); active_worst_quality = AOMMAX(active_worst_quality + qdelta, active_best_quality); } #endif // Modify active_best_quality for downscaled normal frames. 
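  // The 2.0 rate factor below computes the (negative) qdelta that would
  // roughly double the bits per MB at active_best_quality; for a frame coded
  // at reduced resolution this lowers the best-quality bound, which the
  // downscaled frame can typically afford since it has fewer macroblocks.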
if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, active_best_quality, 2.0); active_best_quality = AOMMAX(active_best_quality + qdelta, rc->best_quality); } active_best_quality = clamp(active_best_quality, rc->best_quality, rc->worst_quality); active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); *active_best = active_best_quality; *active_worst = active_worst_quality; } /*!\brief Gets a Q value to use for the current frame * * * Selects a Q value from a permitted range that we estimate * will result in approximately the target number of bits. * * \ingroup rate_control * \param[in] cpi Top level encoder instance structure * \param[in] width Width of frame * \param[in] height Height of frame * \param[in] active_worst_quality Max Q allowed * \param[in] active_best_quality Min Q allowed * * \return The suggested Q for this frame. */ static int get_q(const AV1_COMP *cpi, const int width, const int height, const int active_worst_quality, const int active_best_quality) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg; int last_boosted_qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex : p_rc->last_boosted_qindex; #else int last_boosted_qindex = p_rc->last_boosted_qindex; #endif if (cpi->oxcf.rc_cfg.mode == AOM_Q || (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced && cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && rc->frames_to_key > 1)) { q = active_best_quality; // Special case code to try and match quality with forced key frames. } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex); } else { q = AOMMIN(last_boosted_qindex, (active_best_quality + active_worst_quality) / 2); } q = clamp(q, active_best_quality, active_worst_quality); } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality, width, height); if (q > active_worst_quality) { // Special case when we are targeting the max allowed rate. if (rc->this_frame_target < rc->max_frame_bandwidth) { q = active_worst_quality; } } q = AOMMAX(q, active_best_quality); } return q; } // Returns |active_best_quality| for an inter frame. // The |active_best_quality| depends on different rate control modes: // VBR, Q, CQ, CBR. // The returning active_best_quality could further be adjusted in // adjust_active_best_and_worst_quality(). 
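// Rough behaviour sketch of the function below (informal summary only; the
// code is authoritative):
//   - AOM_Q, leaf or overlay frame: returns cq_level directly.
//   - AOM_CQ, leaf or overlay frame: returns inter_minq[active_worst_quality],
//     but never below cq_level.
//   - GF/ARF frames: starts from a GF/ARF quality derived from the recent
//     average Q (or active_worst_quality), adjusted by arf_boost_factor, and
//     for internal ARFs is pulled toward active_worst_quality once per
//     pyramid level.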
static int get_active_best_quality(const AV1_COMP *const cpi, const int active_worst_quality, const int cq_level, const int gf_index) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cm->seq_params->bit_depth; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const GF_GROUP *gf_group = &cpi->ppi->gf_group; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; int *inter_minq; ASSIGN_MINQ_TABLE(bit_depth, inter_minq); int active_best_quality = 0; const int is_intrl_arf_boost = gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; int is_leaf_frame = !(gf_group->update_type[gf_index] == ARF_UPDATE || gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost); // TODO(jingning): Consider to rework this hack that covers issues incurred // in lightfield setting. if (cm->tiles.large_scale) { is_leaf_frame = !(refresh_frame->golden_frame || refresh_frame->alt_ref_frame || is_intrl_arf_boost); } const int is_overlay_frame = rc->is_src_frame_alt_ref; if (is_leaf_frame || is_overlay_frame) { if (rc_mode == AOM_Q) return cq_level; active_best_quality = inter_minq[active_worst_quality]; // For the constrained quality mode we don't want // q to fall below the cq level. if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { active_best_quality = cq_level; } return active_best_quality; } // Determine active_best_quality for frames that are not leaf or overlay. int q = active_worst_quality; // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. if (rc->frames_since_key > 1 && p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = p_rc->avg_frame_qindex[INTER_FRAME]; } if (rc_mode == AOM_CQ && q < cq_level) q = cq_level; active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16; const int min_boost = get_gf_high_motion_quality(q, bit_depth); const int boost = min_boost - active_best_quality; active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor); if (!is_intrl_arf_boost) return active_best_quality; if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q; int this_height = gf_group_pyramid_level(gf_group, gf_index); while (this_height > 1) { active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; --this_height; } return active_best_quality; } static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, int height, int gf_index, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const int cq_level = get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); int active_best_quality = 0; int active_worst_quality = rc->active_worst_quality; int q; if (frame_is_intra_only(cm)) { get_intra_q_and_bounds(cpi, width, height, &active_best_quality, &active_worst_quality, cq_level); } else { // Active best quality limited by previous layer. 
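    // Illustrative numbers for the internal-ARF case handled inside
    // get_active_best_quality() (hypothetical values): with p_rc->arf_q = 100,
    // active_worst_quality = 200 and a pyramid level of 3, the per-level
    // averaging gives (100 + 200 + 1) / 2 = 150 and then
    // (150 + 200 + 1) / 2 = 175, so deeper pyramid frames land progressively
    // closer to active_worst_quality.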
active_best_quality = get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); } if (cq_level > 0) active_best_quality = AOMMAX(1, active_best_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; *top_index = AOMMAX(*top_index, rc->best_quality); *top_index = AOMMIN(*top_index, rc->worst_quality); *bottom_index = AOMMAX(*bottom_index, rc->best_quality); *bottom_index = AOMMIN(*bottom_index, rc->worst_quality); q = active_best_quality; q = AOMMAX(q, rc->best_quality); q = AOMMIN(q, rc->worst_quality); assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); assert(q <= rc->worst_quality && q >= rc->best_quality); return q; } /*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc. * * Handles the general cases not covered by * \ref rc_pick_q_and_bounds_no_stats_cbr() and * \ref rc_pick_q_and_bounds_no_stats() * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[in] gf_index Index of this frame in the golden frame group * \param[out] bottom_index Bottom bound for q index (best quality) * \param[out] top_index Top bound for q index (worst quality) * \return Returns selected q index to be used for encoding this frame. */ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, int gf_index, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const GF_GROUP *gf_group = &cpi->ppi->gf_group; assert(IMPLIES(has_no_stats_stage(cpi), cpi->oxcf.rc_cfg.mode == AOM_Q && gf_group->update_type[gf_index] != ARF_UPDATE)); const int cq_level = get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); if (oxcf->rc_cfg.mode == AOM_Q) { return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index, bottom_index, top_index); } int active_best_quality = 0; int active_worst_quality = rc->active_worst_quality; int q; const int is_intrl_arf_boost = gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; if (frame_is_intra_only(cm)) { get_intra_q_and_bounds(cpi, width, height, &active_best_quality, &active_worst_quality, cq_level); #ifdef STRICT_RC active_best_quality = 0; #endif } else { // Active best quality limited by previous layer. const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index); if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) { active_best_quality = get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); } else { #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int local_active_best_quality = simulate_parallel_frame ? 
p_rc->temp_active_best_quality[pyramid_level - 1] : p_rc->active_best_quality[pyramid_level - 1]; active_best_quality = local_active_best_quality + 1; #else active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1; #endif active_best_quality = AOMMIN(active_best_quality, active_worst_quality); #ifdef STRICT_RC active_best_quality += (active_worst_quality - active_best_quality) / 16; #else active_best_quality += (active_worst_quality - active_best_quality) / 2; #endif } // For alt_ref and GF frames (including internal arf frames) adjust the // worst allowed quality as well. This insures that even on hard // sections we don't clamp the Q at the same value for arf frames and // leaf (non arf) frames. This is important to the TPL model which assumes // Q drops with each arf level. if (!(rc->is_src_frame_alt_ref) && (refresh_frame->golden_frame || refresh_frame->alt_ref_frame || is_intrl_arf_boost)) { active_worst_quality = (active_best_quality + (3 * active_worst_quality) + 2) / 4; } } adjust_active_best_and_worst_quality( cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality); q = get_q(cpi, width, height, active_worst_quality, active_best_quality); // Special case when we are targeting the max allowed rate. if (rc->this_frame_target >= rc->max_frame_bandwidth && q > active_worst_quality) { active_worst_quality = q; } *top_index = active_worst_quality; *bottom_index = active_best_quality; assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); assert(q <= rc->worst_quality && q >= rc->best_quality); return q; } static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source; if (unscaled_src == NULL) return; const uint8_t *src_y = unscaled_src->y_buffer; const int src_ystride = unscaled_src->y_stride; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); const uint8_t *pre_y = yv12->buffers[0]; const int pre_ystride = yv12->strides[0]; // TODO(yunqing): support scaled reference frames. if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return; for (int i = 0; i < 2; ++i) { if (unscaled_src->widths[i] != yv12->widths[i] || unscaled_src->heights[i] != yv12->heights[i]) { return; } } const int num_mi_cols = cm->mi_params.mi_cols; const int num_mi_rows = cm->mi_params.mi_rows; const BLOCK_SIZE bsize = BLOCK_64X64; int num_samples = 0; // sse is computed on 64x64 blocks const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) ? (cm->seq_params->mib_size >> 1) : cm->seq_params->mib_size; const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; uint64_t fsse = 0; cpi->rec_sse = 0; for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { unsigned int sse; uint8_t src[64 * 64] = { 0 }; // Apply 4x4 block averaging/denoising on source frame. 
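      // Informal note on the nested loops below: each 4x4 sub-block of the
      // 64x64 source tile is replaced by its aom_avg_4x4() mean (a cheap
      // box-filter denoise) before the variance function compares it with the
      // co-located reference block, so sensor noise contributes less to the
      // accumulated rec_sse estimate.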
for (int i = 0; i < 64; i += 4) { for (int j = 0; j < 64; j += 4) { const unsigned int avg = aom_avg_4x4(src_y + i * src_ystride + j, src_ystride); for (int m = 0; m < 4; ++m) { for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg; } } } cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse); fsse += sse; num_samples++; src_y += 64; pre_y += 64; } src_y += (src_ystride << 6) - (sb_cols << 6); pre_y += (pre_ystride << 6) - (sb_cols << 6); } assert(num_samples > 0); // Ensure rec_sse > 0 if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1; } int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index, int *bottom_index, int *top_index) { PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; // TODO(sarahparker) merge no-stats vbr and altref q computation // with rc_pick_q_and_bounds(). const GF_GROUP *gf_group = &cpi->ppi->gf_group; if ((cpi->oxcf.rc_cfg.mode != AOM_Q || gf_group->update_type[gf_index] == ARF_UPDATE) && has_no_stats_stage(cpi)) { if (cpi->oxcf.rc_cfg.mode == AOM_CBR) { // TODO(yunqing): the results could be used for encoder optimization. cpi->rec_sse = UINT64_MAX; if (cpi->sf.hl_sf.accurate_bit_estimate && cpi->common.current_frame.frame_type != KEY_FRAME) rc_compute_variance_onepass_rt(cpi); q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index, top_index); // preserve copy of active worst quality selected. cpi->rc.active_worst_quality = *top_index; #if USE_UNRESTRICTED_Q_IN_CQ_MODE } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) { q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index, top_index); #endif // USE_UNRESTRICTED_Q_IN_CQ_MODE } else { q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index, top_index); } } else { q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index, top_index); } if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q; return q; } void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { if (cpi->oxcf.rc_cfg.mode == AOM_Q) { *frame_under_shoot_limit = 0; *frame_over_shoot_limit = INT_MAX; } else { // For very small rate targets where the fractional adjustment // may be tiny make sure there is at least a minimum range. assert(cpi->sf.hl_sf.recode_tolerance <= 100); const int tolerance = (int)AOMMAX( 100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100); *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0); *frame_over_shoot_limit = (int)AOMMIN((int64_t)frame_target + tolerance, cpi->rc.max_frame_bandwidth); } } void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; rc->this_frame_target = target; // Modify frame size target when down-scaled. if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) { rc->this_frame_target = saturate_cast_double_to_int( rc->this_frame_target * resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height)); } // Target rate per SB64 (including partial SB64s. 
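  // The shift below reflects 64 * 64 = 4096 = 2^12 pixels per SB64, so the
  // expression is (bits per pixel) * 4096. Hypothetical example: a frame
  // target of 120000 bits at 1280x720 gives
  // 120000 * 4096 / 921600 ~= 533 bits per 64x64 superblock.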
const int64_t sb64_target_rate = ((int64_t)rc->this_frame_target << 12) / (width * height); rc->sb64_target_rate = (int)AOMMIN(sb64_target_rate, INT_MAX); } static void update_alt_ref_frame_stats(AV1_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user RATE_CONTROL *const rc = &cpi->rc; rc->frames_since_golden = 0; } static void update_golden_frame_stats(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; // Update the Golden frame usage counts. if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) { rc->frames_since_golden = 0; } else if (cpi->common.show_frame) { rc->frames_since_golden++; } } void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const int is_intrnl_arf = gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int qindex = cm->quant_params.base_qindex; #if RT_PASSIVE_STRATEGY const int frame_number = current_frame->frame_number % MAX_Q_HISTORY; p_rc->q_history[frame_number] = qindex; #endif // RT_PASSIVE_STRATEGY // Update rate control heuristics rc->projected_frame_size = (int)(bytes_used << 3); // Post encode loop adjustment of Q prediction. av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height); // Update bit estimation ratio. if (cpi->oxcf.rc_cfg.mode == AOM_CBR && cm->current_frame.frame_type != KEY_FRAME && cpi->sf.hl_sf.accurate_bit_estimate) { const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, cm->seq_params->bit_depth); const int this_bit_est_ratio = (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse)); cpi->rc.bit_est_ratio = cpi->rc.bit_est_ratio == 0 ? this_bit_est_ratio : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8; } // Keep a record of last Q and ambient average Q. if (current_frame->frame_type == KEY_FRAME) { p_rc->last_q[KEY_FRAME] = qindex; p_rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->svc.spatial_layer_id == 0) { rc->last_encoded_size_keyframe = rc->projected_frame_size; rc->last_target_size_keyframe = rc->this_frame_target; } } else { if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || cpi->rc.rtc_external_ratectrl || (!rc->is_src_frame_alt_ref && !(refresh_frame->golden_frame || is_intrnl_arf || refresh_frame->alt_ref_frame))) { p_rc->last_q[INTER_FRAME] = qindex; p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO( 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); p_rc->ni_frames++; p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth); p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames; // Calculate the average Q for normal inter frames (not key or GFU // frames). rc->ni_tot_qi += qindex; rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames; } } // Keep record of last boosted (KF/GF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. // If all mbs in this group are skipped only update if the Q value is // better than that already stored. 
// This is used to help set quality in forced key frames to reduce popping if ((qindex < p_rc->last_boosted_qindex) || (current_frame->frame_type == KEY_FRAME) || (!p_rc->constrained_gf_group && (refresh_frame->alt_ref_frame || is_intrnl_arf || (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) { p_rc->last_boosted_qindex = qindex; } if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex; update_buffer_level(cpi, rc->projected_frame_size); rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth; // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. if (av1_frame_scaled(cm)) rc->this_frame_target = saturate_cast_double_to_int( rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg, cm->width, cm->height)); if (current_frame->frame_type != KEY_FRAME) { p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64( (int64_t)p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2); p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64( (int64_t)p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); } // Actual bits spent p_rc->total_actual_bits += rc->projected_frame_size; p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames, cpi->oxcf.gf_cfg.enable_auto_arf) && refresh_frame->alt_ref_frame && (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm))) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); else // Update the Golden frame stats as appropriate. update_golden_frame_stats(cpi); #if CONFIG_FPMT_TEST /*The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q, * temp_last_boosted_qindex are introduced only for quality simulation * purpose, it retains the value previous to the parallel encode frames. The * variables are updated based on the update flag. * * If there exist show_existing_frames between parallel frames, then to * retain the temp state do not update it. */ int show_existing_between_parallel_frames = (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE && cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { for (int i = 0; i < FRAME_TYPES; i++) { p_rc->temp_last_q[i] = p_rc->last_q[i]; } p_rc->temp_avg_q = p_rc->avg_q; p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex; p_rc->temp_total_actual_bits = p_rc->total_actual_bits; p_rc->temp_projected_frame_size = rc->projected_frame_size; for (int i = 0; i < RATE_FACTOR_LEVELS; i++) p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i]; } #endif if (current_frame->frame_type == KEY_FRAME) { rc->frames_since_key = 0; rc->frames_since_scene_change = 0; } if (cpi->refresh_frame.golden_frame) rc->frame_num_last_gf_refresh = current_frame->frame_number; rc->prev_coded_width = cm->width; rc->prev_coded_height = cm->height; rc->frame_number_encoded++; rc->prev_frame_is_dropped = 0; rc->drop_count_consec = 0; } void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { // Update buffer level with zero size, update frame counters, and return. 
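  // Informal note (behaviour of update_buffer_level() assumed from its use
  // elsewhere in this file): passing a projected size of 0 credits the leaky
  // bucket with roughly one frame's budget, so a dropped frame lets the
  // buffer refill and makes it more likely that the next frame can be coded
  // at an acceptable Q without also being dropped.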
update_buffer_level(cpi, 0); cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; cpi->rc.prev_coded_width = cpi->common.width; cpi->rc.prev_coded_height = cpi->common.height; cpi->rc.prev_frame_is_dropped = 1; // On a scene/slide change for dropped frame: reset the avg_source_sad to 0, // otherwise the avg_source_sad can get too large and subsequent frames // may miss the scene/slide detection. if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0; if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) { cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true; cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true; } } int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, int best_qindex, int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); if (mid_q < desired_q) { low = mid + 1; } else { high = mid; } } assert(low == high); assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || low == worst_qindex); return low; } int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, aom_bit_depth_t bit_depth) { const int start_index = av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); const int target_index = av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); return target_index - start_index; } // Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], // assuming 'correction_factor' is 1.0. // To be precise, 'q_index' is the smallest integer, for which the corresponding // bits per mb <= desired_bits_per_mb. // If no such q index is found, returns 'worst_qindex'. static int find_qindex_by_rate(const AV1_COMP *const cpi, int desired_bits_per_mb, FRAME_TYPE frame_type, int best_qindex, int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; const int mid_bits_per_mb = av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; } else { high = mid; } } assert(low == high); assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <= desired_bits_per_mb || low == worst_qindex); return low; } int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double rate_target_ratio) { const RATE_CONTROL *rc = &cpi->rc; // Look up the current projected bits per block for the base index const int base_bits_per_mb = av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0); // Find the target bits per mb based on the base value and given ratio. 
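  // Hypothetical example of the computation below: if qindex 100 projects to
  // 400 bits per MB and rate_target_ratio is 0.5, the search looks for the
  // smallest qindex whose projected rate is <= 200 bits per MB; if that is,
  // say, 140, the function returns 140 - 100 = +40, i.e. a positive delta
  // (higher qindex, fewer bits). Ratios above 1.0 give negative deltas.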
const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); const int target_index = find_qindex_by_rate( cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality); return target_index - qindex; } static void set_gf_interval_range(const AV1_COMP *const cpi, RATE_CONTROL *const rc) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; // Special case code for 1 pass fixed Q mode tests if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) { rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval; rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval; rc->static_scene_max_gf_interval = rc->min_gf_interval + 1; } else { // Set Maximum gf/arf interval rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval; rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval; if (rc->min_gf_interval == 0) rc->min_gf_interval = av1_rc_get_default_min_gf_interval( oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate); if (rc->max_gf_interval == 0) rc->max_gf_interval = get_default_max_gf_interval(cpi->framerate, rc->min_gf_interval); /* * Extended max interval for genuinely static scenes like slide shows. * The no.of.stats available in the case of LAP is limited, * hence setting to max_gf_interval. */ if (cpi->ppi->lap_enabled) rc->static_scene_max_gf_interval = rc->max_gf_interval + 1; else rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; // Clamp min to max rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval); } } void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; const int MBs = av1_get_MBs(width, height); rc->avg_frame_bandwidth = saturate_cast_double_to_int( round(oxcf->rc_cfg.target_bandwidth / cpi->framerate)); int64_t vbr_min_bits = (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100; vbr_min_bits = AOMMIN(vbr_min_bits, INT_MAX); rc->min_frame_bandwidth = AOMMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. // The baseline for this aligns with HW implementations that // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits // per 16x16 MB (averaged over a frame). However this limit is extended if // a very high rate is given on the command line or the rate cannot // be achieved because of a user specified max q (e.g. when the user // specifies lossless encode. int64_t vbr_max_bits = (int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section / 100; vbr_max_bits = AOMMIN(vbr_max_bits, INT_MAX); rc->max_frame_bandwidth = AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits); set_gf_interval_range(cpi, rc); } #define VBR_PCT_ADJUSTMENT_LIMIT 50 // For VBR...adjustment to the frame target based on error from previous frames static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; #if CONFIG_FPMT_TEST const int simulate_parallel_frame = cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; int64_t vbr_bits_off_target = simulate_parallel_frame ? 
cpi->ppi->p_rc.temp_vbr_bits_off_target : p_rc->vbr_bits_off_target; #else int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target; #endif int64_t frame_target = *this_frame_target; const double stats_count = cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL ? cpi->ppi->twopass.stats_buf_ctx->total_stats->count : 0.0; const int frame_window = (int)AOMMIN(16, stats_count - cpi->common.current_frame.frame_number); assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100); if (frame_window > 0) { const int64_t max_delta = AOMMIN(llabs((vbr_bits_off_target / frame_window)), (frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100); // vbr_bits_off_target > 0 means we have extra bits to spend // vbr_bits_off_target < 0 we are currently overshooting frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta; } #if CONFIG_FPMT_TEST int64_t vbr_bits_off_target_fast = simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast : p_rc->vbr_bits_off_target_fast; #endif // Fast redistribution of bits arising from massive local undershoot. // Don't do it for kf,arf,gf or overlay frames. if (!frame_is_kf_gf_arf(cpi) && #if CONFIG_FPMT_TEST vbr_bits_off_target_fast && #else p_rc->vbr_bits_off_target_fast && #endif !rc->is_src_frame_alt_ref) { int64_t one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, frame_target); int64_t fast_extra_bits; #if CONFIG_FPMT_TEST fast_extra_bits = AOMMIN(vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = AOMMIN(fast_extra_bits, AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8)); #else fast_extra_bits = AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits); fast_extra_bits = AOMMIN(fast_extra_bits, AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8)); #endif fast_extra_bits = AOMMIN(fast_extra_bits, INT_MAX); if (fast_extra_bits > 0) { // Update frame_target only if additional bits are available from // local undershoot. frame_target += fast_extra_bits; } // Store the fast_extra_bits of the frame and reduce it from // vbr_bits_off_target_fast during postencode stage. rc->frame_level_fast_extra_bits = (int)fast_extra_bits; // Retaining the condition to update during postencode stage since // fast_extra_bits are calculated based on vbr_bits_off_target_fast. cpi->do_update_vbr_bits_off_target_fast = 1; } // Clamp the target for the frame to the maximum allowed for one frame. *this_frame_target = (int)AOMMIN(frame_target, INT_MAX); } void av1_set_target_rate(AV1_COMP *cpi, int width, int height) { RATE_CONTROL *const rc = &cpi->rc; int target_rate = rc->base_frame_target; // Correction to rate target based on prior over or under shoot. 
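  // Illustrative example of vbr_rate_correction() (hypothetical numbers):
  // with vbr_bits_off_target = +800000 (earlier frames undershot) and a
  // remaining stats window of 16 frames, at most 800000 / 16 = 50000 extra
  // bits are added to this frame's target, further capped at
  // VBR_PCT_ADJUSTMENT_LIMIT (50%) of the target itself; overshoot works the
  // same way with the sign reversed.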
if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ) vbr_rate_correction(cpi, &target_rate); av1_rc_set_frame_target(cpi, target_rate, width, height); } int av1_calc_pframe_target_size_one_pass_vbr( const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { static const int af_ratio = 10; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int64_t target; #if USE_ALTREF_FOR_ONE_PASS if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || frame_update_type == ARF_UPDATE) { target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * af_ratio) / (p_rc->baseline_gf_interval + af_ratio - 1); } else { target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / (p_rc->baseline_gf_interval + af_ratio - 1); } #else target = rc->avg_frame_bandwidth; #endif return clamp_pframe_target_size(cpi, target, frame_update_type); } int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio; return clamp_iframe_target_size(cpi, target); } int av1_calc_pframe_target_size_one_pass_cbr( const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; const RateControlCfg *rc_cfg = &oxcf->rc_cfg; const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level; const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; int min_frame_target = AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); int64_t target; if (rc_cfg->gf_cbr_boost_pct) { const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * af_ratio_pct) / (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } else { target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } } else { target = rc->avg_frame_bandwidth; } if (cpi->ppi->use_svc) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; target = lc->avg_frame_size; min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); } if (diff > 0) { // Lower the target bandwidth for this frame. const int pct_low = (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct); target -= (target * pct_low) / 200; } else if (diff < 0) { // Increase the target bandwidth for this frame. 
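    // Hypothetical example: if the buffer sits 30% above the optimal level
    // (diff < 0) and over_shoot_pct is 25, then pct_high = min(30, 25) = 25
    // and the target grows by 25 / 200 = 12.5%. The symmetric branch above
    // shrinks the target when the buffer is below the optimal level.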
const int pct_high = (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct); target += (target * pct_high) / 200; } if (rc_cfg->max_inter_bitrate_pct) { const int64_t max_rate = (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; target = AOMMIN(target, max_rate); } if (target > INT_MAX) target = INT_MAX; return AOMMAX(min_frame_target, (int)target); } int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { const RATE_CONTROL *rc = &cpi->rc; const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; int64_t target; if (cpi->common.current_frame.frame_number == 0) { target = ((p_rc->starting_buffer_level / 2) > INT_MAX) ? INT_MAX : (int)(p_rc->starting_buffer_level / 2); if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) { target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1)); } } else { int kf_boost = 32; double framerate = cpi->framerate; kf_boost = AOMMAX(kf_boost, (int)round(2 * framerate - 16)); if (rc->frames_since_key < framerate / 2) { kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); } target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; } return clamp_iframe_target_size(cpi, target); } static void set_golden_update(AV1_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int divisor = 10; if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) divisor = cpi->cyclic_refresh->percent_refresh; // Set minimum gf_interval for GF update to a multiple of the refresh period, // with some max limit. Depending on past encoding stats, GF flag may be // reset and update may not occur until next baseline_gf_interval. const int gf_length_mult[2] = { 8, 4 }; if (divisor > 0) p_rc->baseline_gf_interval = AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor), MAX_GF_INTERVAL_RT); else p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 16; } static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; GF_GROUP *const gf_group = &cpi->ppi->gf_group; set_golden_update(cpi); if (p_rc->baseline_gf_interval > rc->frames_to_key && cpi->oxcf.kf_cfg.auto_key) p_rc->baseline_gf_interval = rc->frames_to_key; p_rc->gfu_boost = DEFAULT_GF_BOOST_RT; p_rc->constrained_gf_group = (p_rc->baseline_gf_interval >= rc->frames_to_key && cpi->oxcf.kf_cfg.auto_key) ? 1 : 0; rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; cpi->gf_frame_index = 0; // SVC does not use GF as periodic boost. // TODO(marpan): Find better way to disable this for SVC. if (cpi->ppi->use_svc) { SVC *const svc = &cpi->svc; p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; p_rc->gfu_boost = 1; p_rc->constrained_gf_group = 0; rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; for (int layer = 0; layer < svc->number_spatial_layers * svc->number_temporal_layers; ++layer) { LAYER_CONTEXT *const lc = &svc->layer_context[layer]; lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval; lc->p_rc.gfu_boost = p_rc->gfu_boost; lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group; lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; lc->group_index = 0; } } gf_group->size = p_rc->baseline_gf_interval; gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; gf_group->refbuf_state[cpi->gf_frame_index] = (frame_type == KEY_FRAME) ? 
REFBUF_RESET : REFBUF_UPDATE; } void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; const int resize_pending = is_frame_resize_pending(cpi); if (!resize_pending && !rc->high_source_sad) { // Check if we should disable GF refresh (if period is up), // or force a GF refresh update (if we are at least halfway through // period) based on QP. Look into add info on segment deltaq. PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME]; const int allow_gf_update = rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10); int gf_update_changed = 0; int thresh = 87; if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) < FIXED_GF_INTERVAL_RT && rc->frames_till_gf_update_due == 1 && cm->quant_params.base_qindex > avg_qp) { // Disable GF refresh since QP is above the running average QP. rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0; gf_update_changed = 1; cpi->refresh_frame.golden_frame = 0; } else if (allow_gf_update && ((cm->quant_params.base_qindex < thresh * avg_qp / 100) || (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) { // Force refresh since QP is well below average QP or this is a high // motion frame. rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1; gf_update_changed = 1; cpi->refresh_frame.golden_frame = 1; } if (gf_update_changed) { set_baseline_gf_interval(cpi, INTER_FRAME); int refresh_mask = 0; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int ref_frame_map_idx = rtc_ref->ref_idx[i]; refresh_mask |= rtc_ref->refresh[ref_frame_map_idx] << ref_frame_map_idx; } cm->current_frame.refresh_frame_flags = refresh_mask; } } } /*!\brief Setup the reference prediction structure for 1 pass real-time * * Set the reference prediction structure for 1 layer. * Current structure is to use 3 references (LAST, GOLDEN, ALTREF), * where ALT_REF always behind current by lag_alt frames, and GOLDEN is * either updated on LAST with period baseline_gf_interval (fixed slot) * or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7). * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] gf_update Flag to indicate if GF is updated * * \remark Nothing is returned. Instead the settings for the prediction * structure are set in \c cpi-ext_flags; and the buffer slot index * (for each of 7 references) and refresh flags (for each of the 8 slots) * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[]. */ void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) { AV1_COMMON *const cm = &cpi->common; ExternalFlags *const ext_flags = &cpi->ext_flags; RATE_CONTROL *const rc = &cpi->rc; ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = &ext_flags->refresh_frame; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark) ? 
rc->frame_number_encoded : cm->current_frame.frame_number; unsigned int lag_alt = 4; int last_idx = 0; int last_idx_refresh = 0; int gld_idx = 0; int alt_ref_idx = 0; int last2_idx = 0; ext_refresh_frame_flags->update_pending = 1; ext_flags->ref_frame_flags = 0; ext_refresh_frame_flags->last_frame = 1; ext_refresh_frame_flags->golden_frame = 0; ext_refresh_frame_flags->alt_ref_frame = 0; // Decide altref lag adaptively for rt if (cpi->sf.rt_sf.sad_based_adp_altref_lag) { lag_alt = 6; const uint64_t th_frame_sad[4][3] = { { 18000, 18000, 18000 }, // HDRES CPU 9 { 25000, 25000, 25000 }, // MIDRES CPU 9 { 40000, 30000, 20000 }, // HDRES CPU 10 { 30000, 25000, 20000 } // MIDRES CPU 10 }; int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1; assert(th_idx < 4); if (rc->avg_source_sad > th_frame_sad[th_idx][0]) lag_alt = 3; else if (rc->avg_source_sad > th_frame_sad[th_idx][1]) lag_alt = 4; else if (rc->avg_source_sad > th_frame_sad[th_idx][2]) lag_alt = 5; } // This defines the reference structure for 1 layer (non-svc) RTC encoding. // To avoid the internal/default reference structure for non-realtime // overwriting this behavior, we use the "svc" ref parameters from the // external control SET_SVC_REF_FRAME_CONFIG. // TODO(marpan): rename that control and the related internal parameters // to rtc_ref. for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7; for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0; // Set the reference frame flags. ext_flags->ref_frame_flags ^= AOM_LAST_FLAG; if (!cpi->sf.rt_sf.force_only_last_ref) { ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; } const int sh = 6; // Moving index slot for last: 0 - (sh - 1). if (frame_number > 1) last_idx = ((frame_number - 1) % sh); // Moving index for refresh of last: one ahead for next frame. last_idx_refresh = (frame_number % sh); gld_idx = 6; // Moving index for alt_ref, lag behind LAST by lag_alt frames. if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh); if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) { // Moving index for LAST2, lag behind LAST by 2 frames. if (frame_number > 2) last2_idx = ((frame_number - 2) % sh); } rtc_ref->ref_idx[0] = last_idx; // LAST rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last). if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) { rtc_ref->ref_idx[1] = last2_idx; // LAST2 rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last). } rtc_ref->ref_idx[3] = gld_idx; // GOLDEN rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF // Refresh this slot, which will become LAST on next frame. rtc_ref->refresh[last_idx_refresh] = 1; // Update GOLDEN on period for fixed slot case. if (gf_update && cm->current_frame.frame_type != KEY_FRAME) { ext_refresh_frame_flags->golden_frame = 1; rtc_ref->refresh[gld_idx] = 1; } rtc_ref->gld_idx_1layer = gld_idx; // Set the flag to reduce the number of reference frame buffers used. // This assumes that slot 7 is never used. 
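  // Why slot 7 is expected to stay free (informal check against the code
  // above): with sh = 6 the moving LAST/LAST2/ALT_REF indices are always in
  // the range 0..5 and GOLDEN is pinned to slot 6, so only buffers 0..6 are
  // ever referenced or refreshed for this 1-layer RTC structure. The checks
  // below simply verify that before enabling the reduced-buffer mode.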
cpi->rt_reduce_num_ref_buffers = 1; cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7); cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7); cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7); cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7); if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7); } // Returns whether the 64x64 block is active or inactive: used // by the scene detection, which is over 64x64 blocks. static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols, int mi_rows, int sbi_col, int sbi_row) { int num_4x4 = 16; int r = sbi_row << 4; int c = sbi_col << 4; const int row_max = AOMMIN(num_4x4, mi_rows - r); const int col_max = AOMMIN(num_4x4, mi_cols - c); // Active map is set for 16x16 blocks, so only need to // check over16x16, for (int x = 0; x < row_max; x += 4) { for (int y = 0; y < col_max; y += 4) { if (active_map_4x4[(r + x) * mi_cols + (c + y)] == AM_SEGMENT_ID_ACTIVE) return 1; } } return 0; } // Returns the best sad for column or row motion of the superblock. static unsigned int estimate_scroll_motion( const AV1_COMP *cpi, uint8_t *src_buf, uint8_t *last_src_buf, int src_stride, int ref_stride, BLOCK_SIZE bsize, int pos_col, int pos_row, int *best_intmv_col, int *best_intmv_row, int sw_col, int sw_row) { const AV1_COMMON *const cm = &cpi->common; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const int full_search = 1; // Keep border a multiple of 16. const int border = (cpi->oxcf.border_in_pixels >> 4) << 4; int search_size_width = sw_col; int search_size_height = sw_row; // Adjust based on boundary. if ((pos_col - search_size_width < -border) || (pos_col + search_size_width > cm->width + border)) search_size_width = border; if ((pos_row - search_size_height < -border) || (pos_row + search_size_height > cm->height + border)) search_size_height = border; const uint8_t *ref_buf; const int row_norm_factor = mi_size_high_log2[bsize] + 1; const int col_norm_factor = 3 + (bw >> 5); const int ref_buf_width = (search_size_width << 1) + bw; const int ref_buf_height = (search_size_height << 1) + bh; int16_t *hbuf = (int16_t *)aom_malloc(ref_buf_width * sizeof(*hbuf)); int16_t *vbuf = (int16_t *)aom_malloc(ref_buf_height * sizeof(*vbuf)); int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf)); int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf)); if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) { aom_free(hbuf); aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); } // Set up prediction 1-D reference set for rows. 
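  // Informal summary of the search strategy below: instead of a full 2-D
  // block search, the block and the reference strip are each projected onto
  // one row vector and one column vector (aom_int_pro_row/col), and
  // av1_vector_match() then runs two independent 1-D SAD searches. For a
  // (hypothetical) 128x128 block with sw_col = 160 the row projection covers
  // 2 * 160 + 128 = 448 columns, far cheaper than an exhaustive 2-D search
  // over the same range.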
ref_buf = last_src_buf - search_size_width; aom_int_pro_row(hbuf, ref_buf, ref_stride, ref_buf_width, bh, row_norm_factor); // Set up prediction 1-D reference set for cols ref_buf = last_src_buf - search_size_height * ref_stride; aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, ref_buf_height, col_norm_factor); // Set up src 1-D reference set aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor); aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor); unsigned int best_sad; int best_sad_col, best_sad_row; // Find the best match per 1-D search *best_intmv_col = av1_vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width, full_search, &best_sad_col); *best_intmv_row = av1_vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height, full_search, &best_sad_row); if (best_sad_col < best_sad_row) { *best_intmv_row = 0; best_sad = best_sad_col; } else { *best_intmv_col = 0; best_sad = best_sad_row; } aom_free(hbuf); aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); return best_sad; } /*!\brief Check for scene detection, for 1 pass real-time mode. * * Compute average source sad (temporal sad: between current source and * previous source) over a subset of superblocks. Use this is detect big changes * in content and set the \c cpi->rc.high_source_sad flag. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] frame_input Current and last input source frames * * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated. */ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, const EncodeFrameInput *frame_input) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source; YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source; uint8_t *src_y; int src_ystride; int src_width; int src_height; uint8_t *last_src_y; int last_src_ystride; int last_src_width; int last_src_height; int width = cm->width; int height = cm->height; if (cpi->svc.number_spatial_layers > 1) { width = cpi->oxcf.frm_dim_cfg.width; height = cpi->oxcf.frm_dim_cfg.height; } if (width != cm->render_width || height != cm->render_height || unscaled_src == NULL || unscaled_last_src == NULL) { aom_free(cpi->src_sad_blk_64x64); cpi->src_sad_blk_64x64 = NULL; } if (unscaled_src == NULL || unscaled_last_src == NULL) return; src_y = unscaled_src->y_buffer; src_ystride = unscaled_src->y_stride; src_width = unscaled_src->y_width; src_height = unscaled_src->y_height; last_src_y = unscaled_last_src->y_buffer; last_src_ystride = unscaled_last_src->y_stride; last_src_width = unscaled_last_src->y_width; last_src_height = unscaled_last_src->y_height; if (src_width != last_src_width || src_height != last_src_height) { aom_free(cpi->src_sad_blk_64x64); cpi->src_sad_blk_64x64 = NULL; return; } rc->high_source_sad = 0; rc->percent_blocks_with_motion = 0; rc->max_block_source_sad = 0; rc->prev_avg_source_sad = rc->avg_source_sad; int num_mi_cols = cm->mi_params.mi_cols; int num_mi_rows = cm->mi_params.mi_rows; if (cpi->svc.number_spatial_layers > 1) { num_mi_cols = cpi->svc.mi_cols_full_resoln; num_mi_rows = cpi->svc.mi_rows_full_resoln; } int num_zero_temp_sad = 0; uint32_t min_thresh = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 
8000 : 10000; if (cpi->sf.rt_sf.higher_thresh_scene_detection) { min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 50000 : 100000; } const BLOCK_SIZE bsize = BLOCK_64X64; // Loop over sub-sample of frame, compute average sad over 64x64 blocks. uint64_t avg_sad = 0; uint64_t tmp_sad = 0; int num_samples = 0; const int thresh = ((cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0) || (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) ? 5 : 6; // SAD is computed on 64x64 blocks const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) ? (cm->seq_params->mib_size >> 1) : cm->seq_params->mib_size; const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 int num_low_var_high_sumdiff = 0; int light_change = 0; // Flag to check light change or not. const int check_light_change = 0; // TODO(marpan): There seems some difference along the bottom border when // using the source_last_tl0 for last_source (used for temporal layers or // when previous frame is dropped). // Remove this border parameter when issue is resolved: difference is that // non-zero sad exists along bottom border even though source is static. const int border = rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1; // Store blkwise SAD for later use if (width == cm->render_width && height == cm->render_height) { if (cpi->src_sad_blk_64x64 == NULL) { CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64, (uint64_t *)aom_calloc(sb_cols * sb_rows, sizeof(*cpi->src_sad_blk_64x64))); } } const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; const int mi_cols = mi_params->mi_cols; const int mi_rows = mi_params->mi_rows; unsigned char *const active_map_4x4 = cpi->active_map.map; // Avoid bottom and right border. for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) { for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { int block_is_active = 1; if (cpi->active_map.enabled && rc->percent_blocks_inactive > 0) { block_is_active = set_block_is_active(active_map_4x4, mi_cols, mi_rows, sbi_col, sbi_row); } if (block_is_active) { tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); } else { tmp_sad = 0; } if (cpi->src_sad_blk_64x64 != NULL) cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad; if (check_light_change) { unsigned int sse, variance; variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, last_src_ystride, &sse); // Note: sse - variance = ((sum * sum) >> 12) // Detect large lighting change. if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) { num_low_var_high_sumdiff++; } } avg_sad += tmp_sad; num_samples++; if (tmp_sad == 0) num_zero_temp_sad++; if (tmp_sad > rc->max_block_source_sad) rc->max_block_source_sad = tmp_sad; src_y += 64; last_src_y += 64; } src_y += (src_ystride << 6) - (sb_cols << 6); last_src_y += (last_src_ystride << 6) - (sb_cols << 6); } if (check_light_change && num_samples > 0 && num_low_var_high_sumdiff > (num_samples >> 1)) light_change = 1; if (num_samples > 0) avg_sad = avg_sad / num_samples; // Set high_source_sad flag if we detect very high increase in avg_sad // between current and previous frame value(s). Use minimum threshold // for cases where there is small change from content that is completely // static. 
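  // Hypothetical example of the test below: with min_thresh = 10000, a
  // running rc->avg_source_sad of 1500 and thresh = 6, a scene change is
  // flagged only if the per-64x64 average SAD exceeds max(10000, 9000) =
  // 10000, the frame is far enough past the last key frame, and fewer than
  // 75% of the sampled blocks have zero temporal SAD. The running average is
  // then updated as (3 * avg_source_sad + avg_sad) >> 2.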
if (!light_change && avg_sad > AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) && rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && num_zero_temp_sad < 3 * (num_samples >> 2)) rc->high_source_sad = 1; else rc->high_source_sad = 0; rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2; rc->frame_source_sad = avg_sad; if (num_samples > 0) rc->percent_blocks_with_motion = ((num_samples - num_zero_temp_sad) * 100) / num_samples; if (rc->frame_source_sad > 0) rc->static_since_last_scene_change = 0; if (rc->high_source_sad) { cpi->rc.frames_since_scene_change = 0; rc->static_since_last_scene_change = 1; } // Update the high_motion_content_screen_rtc flag on TL0. Avoid the update // if too many consecutive frame drops occurred. const uint64_t thresh_high_motion = 9 * 64 * 64; if (cpi->svc.temporal_layer_id == 0 && rc->drop_count_consec < 3) { cpi->rc.high_motion_content_screen_rtc = 0; if (cpi->oxcf.speed >= 11 && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && rc->percent_blocks_with_motion > 40 && rc->prev_avg_source_sad > thresh_high_motion && rc->avg_source_sad > thresh_high_motion && rc->avg_frame_low_motion < 60 && unscaled_src->y_width >= 1280 && unscaled_src->y_height >= 720) { cpi->rc.high_motion_content_screen_rtc = 1; // Compute fast coarse/global motion for 128x128 superblock centered // at middle of frame, and one to the upper left and one to lower right. // to determine if motion is scroll. Only test 3 points (pts) for now. // TODO(marpan): Only allow for 8 bit-depth for now. if (cm->seq_params->bit_depth == 8) { const int sw_row = (cpi->rc.frame_source_sad > 20000) ? 512 : 192; const int sw_col = (cpi->rc.frame_source_sad > 20000) ? 512 : 160; const int num_pts = unscaled_src->y_width * unscaled_src->y_height >= 1920 * 1080 ? 3 : 1; for (int pts = 0; pts < num_pts; pts++) { // fac and shift are used to move the center block for the other // two points (pts). int fac = 1; int shift = 1; if (pts == 1) { fac = 1; shift = 2; } else if (pts == 2) { fac = 3; shift = 2; } int pos_col = (fac * unscaled_src->y_width >> shift) - 64; int pos_row = (fac * unscaled_src->y_height >> shift) - 64; pos_col = AOMMAX(sw_col, AOMMIN(unscaled_src->y_width - sw_col - 1, pos_col)); pos_row = AOMMAX( sw_row, AOMMIN(unscaled_src->y_height - sw_row - 1, pos_row)); if (pos_col >= 0 && pos_col < unscaled_src->y_width - 64 && pos_row >= 0 && pos_row < unscaled_src->y_height - 64) { src_y = unscaled_src->y_buffer + pos_row * src_ystride + pos_col; last_src_y = unscaled_last_src->y_buffer + pos_row * last_src_ystride + pos_col; int best_intmv_col = 0; int best_intmv_row = 0; unsigned int y_sad = estimate_scroll_motion( cpi, src_y, last_src_y, src_ystride, last_src_ystride, BLOCK_128X128, pos_col, pos_row, &best_intmv_col, &best_intmv_row, sw_col, sw_row); if (y_sad < 100 && (abs(best_intmv_col) > 16 || abs(best_intmv_row) > 16)) { cpi->rc.high_motion_content_screen_rtc = 0; break; } } } } } // Pass the flag value to all layer frames. if (cpi->svc.number_spatial_layers > 1 || cpi->svc.number_temporal_layers > 1) { SVC *svc = &cpi->svc; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 1; tl < svc->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; lrc->high_motion_content_screen_rtc = rc->high_motion_content_screen_rtc; } } } } // Scene detection is only on base SLO, and using full/original resolution. 
// Pass the state to the upper spatial layers. if (cpi->svc.number_spatial_layers > 1) { SVC *svc = &cpi->svc; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { int tl = svc->temporal_layer_id; const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; lrc->high_source_sad = rc->high_source_sad; lrc->frame_source_sad = rc->frame_source_sad; lrc->avg_source_sad = rc->avg_source_sad; lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion; lrc->max_block_source_sad = rc->max_block_source_sad; } } } // This is used as a reference when computing the source variance. static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; /*!\brief Compute spatial activity for frame, 1 pass real-time mode. * * Compute average spatial activity/variance for source frame over a * subset of superblocks. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] src_y Input source buffer for y channel. * \param[in] src_ystride Input source stride for y channel. * * \remark Nothing is returned. Instead the average spatial variance * computed is stored in flag \c cpi->rc.frame_spatial_variance. */ static void rc_spatial_act_onepass_rt(AV1_COMP *cpi, uint8_t *src_y, int src_ystride) { AV1_COMMON *const cm = &cpi->common; int num_mi_cols = cm->mi_params.mi_cols; int num_mi_rows = cm->mi_params.mi_rows; const BLOCK_SIZE bsize = BLOCK_64X64; // Loop over sub-sample of frame, compute average over 64x64 blocks. uint64_t avg_variance = 0; int num_samples = 0; int num_zero_var_blocks = 0; cpi->rc.perc_spatial_flat_blocks = 0; const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) ? (cm->seq_params->mib_size >> 1) : cm->seq_params->mib_size; const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { unsigned int sse; const unsigned int var = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, AV1_VAR_OFFS, 0, &sse); avg_variance += var; num_samples++; if (var == 0) num_zero_var_blocks++; src_y += 64; } src_y += (src_ystride << 6) - (sb_cols << 6); } if (num_samples > 0) { cpi->rc.perc_spatial_flat_blocks = 100 * num_zero_var_blocks / num_samples; avg_variance = avg_variance / num_samples; } cpi->rc.frame_spatial_variance = avg_variance >> 12; } /*!\brief Set the GF baseline interval for 1 pass real-time mode. * * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] frame_type frame type * * \return Return GF update flag, and update the \c cpi->rc with * the next GF interval settings. 
*/ static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, FRAME_TYPE frame_type) { RATE_CONTROL *const rc = &cpi->rc; int gf_update = 0; const int resize_pending = is_frame_resize_pending(cpi); // GF update based on frames_till_gf_update_due, also // force update on resize pending frame or for scene change. if ((resize_pending || rc->high_source_sad || rc->frames_till_gf_update_due == 0) && cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) { set_baseline_gf_interval(cpi, frame_type); gf_update = 1; } return gf_update; } static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, int prev_width, int prev_height) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SVC *const svc = &cpi->svc; int target_bits_per_frame; int active_worst_quality; int qindex; double tot_scale_change = (double)(resize_width * resize_height) / (double)(prev_width * prev_height); // Disable the skip mv search for svc on resize frame. svc->skip_mvsearch_last = 0; svc->skip_mvsearch_gf = 0; svc->skip_mvsearch_altref = 0; // Reset buffer level to optimal, update target size. p_rc->buffer_level = p_rc->optimal_buffer_level; p_rc->bits_off_target = p_rc->optimal_buffer_level; rc->this_frame_target = av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME); target_bits_per_frame = rc->this_frame_target; if (tot_scale_change > 4.0) p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; else if (tot_scale_change > 1.0) p_rc->avg_frame_qindex[INTER_FRAME] = (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1; active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality, active_worst_quality, resize_width, resize_height); // If resize is down, check if projected q index is close to worst_quality, // and if so, reduce the rate correction factor (since likely can afford // lower q for resized frame). if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100) p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85; // If resize is back up: check if projected q index is too much above the // previous index, and if so, reduce the rate correction factor // (since prefer to keep q for resized frame at least closet to previous q). // Also check if projected qindex is close to previous qindex, if so // increase correction factor (to push qindex higher and avoid overshoot). if (tot_scale_change >= 1.0) { if (tot_scale_change < 4.0 && qindex > 130 * p_rc->last_q[INTER_FRAME] / 100) p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8; if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100) p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5; } if (svc->number_temporal_layers > 1) { // Apply the same rate control reset to all temporal layers. for (int tl = 0; tl < svc->number_temporal_layers; tl++) { LAYER_CONTEXT *lc = NULL; lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + tl]; lc->rc.resize_state = rc->resize_state; lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level; lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level; lc->p_rc.rate_correction_factors[INTER_NORMAL] = p_rc->rate_correction_factors[INTER_NORMAL]; lc->p_rc.avg_frame_qindex[INTER_FRAME] = p_rc->avg_frame_qindex[INTER_FRAME]; } } } /*!\brief Check for resize based on Q, for 1 pass real-time mode. * * Check if we should resize, based on average QP from past x frames. 
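 * For example, 1280x720 steps down to 960x540 and then to 640x360, and back
 * up again through the same notches.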
* Only allow for resize at most 1/2 scale down for now, Scaling factor * for each step may be 3/4 or 1/2. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * * \remark Return resized width/height in \c cpi->resize_pending_params, * and update some resize counters in \c rc. */ static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; RESIZE_ACTION resize_action = NO_RESIZE; const int avg_qp_thr1 = 70; const int avg_qp_thr2 = 50; // Don't allow for resized frame to go below 160x90, resize in steps of 3/4. const int min_width = (160 * 4) / 3; const int min_height = (90 * 4) / 3; int down_size_on = 1; // Don't resize on key frame; reset the counters on key frame. if (cm->current_frame.frame_type == KEY_FRAME) { rc->resize_avg_qp = 0; rc->resize_count = 0; rc->resize_buffer_underflow = 0; return; } // No resizing down if frame size is below some limit. if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0; // Resize based on average buffer underflow and QP over some window. // Ignore samples close to key frame, since QP is usually high after key. if (cpi->rc.frames_since_key > cpi->framerate) { const int window = AOMMIN(30, (int)(2 * cpi->framerate)); rc->resize_avg_qp += p_rc->last_q[INTER_FRAME]; if (cpi->ppi->p_rc.buffer_level < (int)(30 * p_rc->optimal_buffer_level / 100)) ++rc->resize_buffer_underflow; ++rc->resize_count; // Check for resize action every "window" frames. if (rc->resize_count >= window) { int avg_qp = rc->resize_avg_qp / rc->resize_count; // Resize down if buffer level has underflowed sufficient amount in past // window, and we are at original or 3/4 of original resolution. // Resize back up if average QP is low, and we are currently in a resized // down state, i.e. 1/2 or 3/4 of original resolution. // Currently, use a flag to turn 3/4 resizing feature on/off. if (rc->resize_buffer_underflow > (rc->resize_count >> 2) && down_size_on) { if (rc->resize_state == THREE_QUARTER) { resize_action = DOWN_ONEHALF; rc->resize_state = ONE_HALF; } else if (rc->resize_state == ORIG) { resize_action = DOWN_THREEFOUR; rc->resize_state = THREE_QUARTER; } } else if (rc->resize_state != ORIG && avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) { if (rc->resize_state == THREE_QUARTER || avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) { resize_action = UP_ORIG; rc->resize_state = ORIG; } else if (rc->resize_state == ONE_HALF) { resize_action = UP_THREEFOUR; rc->resize_state = THREE_QUARTER; } } // Reset for next window measurement. 
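      // For example, at 30 fps the window is AOMMIN(30, 2 * 30) = 30 frames,
      // and a downscale requires buffer underflow in more than (30 >> 2) = 7
      // of them. The counters below restart the next measurement window.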
rc->resize_avg_qp = 0; rc->resize_count = 0; rc->resize_buffer_underflow = 0; } } // If decision is to resize, reset some quantities, and check is we should // reduce rate correction factor, if (resize_action != NO_RESIZE) { int resize_width = cpi->oxcf.frm_dim_cfg.width; int resize_height = cpi->oxcf.frm_dim_cfg.height; int resize_scale_num = 1; int resize_scale_den = 1; if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) { resize_scale_num = 3; resize_scale_den = 4; } else if (resize_action == DOWN_ONEHALF) { resize_scale_num = 1; resize_scale_den = 2; } resize_width = resize_width * resize_scale_num / resize_scale_den; resize_height = resize_height * resize_scale_num / resize_scale_den; resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height); } return; } static inline int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; AV1_COMMON *const cm = &cpi->common; SVC *const svc = &cpi->svc; // Very first frame has to be key frame. if (cm->current_frame.frame_number == 0) return 1; // Set key frame if forced by frame flags. if (frame_flags & FRAMEFLAGS_KEY) return 1; if (!cpi->ppi->use_svc) { // Non-SVC if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1; } else { // SVC if (svc->spatial_layer_id == 0 && (cpi->oxcf.kf_cfg.auto_key && (cpi->oxcf.kf_cfg.key_freq_max == 0 || svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0))) return 1; } return 0; } // Set to true if this frame is a recovery frame, for 1 layer RPS, // and whether we should apply some boost (QP, adjust speed features, etc). // Recovery frame here means frame whose closest reference suddenly // switched from previous frame to one much further away. // TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to // allow more control for applications. static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) { if (cpi->ppi->rtc_ref.set_ref_frame_config && cpi->svc.number_temporal_layers == 1 && cpi->svc.number_spatial_layers == 1 && cpi->ppi->rtc_ref.reference_was_previous_frame) { int min_dist = av1_svc_get_min_ref_dist(cpi); // Only consider boost for this frame if its closest reference is further // than x frames away, using x = 4 for now. if (min_dist != INT_MAX && min_dist > 4) return true; } return false; } void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, const EncodeFrameInput *frame_input, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; AV1_COMMON *const cm = &cpi->common; GF_GROUP *const gf_group = &cpi->ppi->gf_group; SVC *const svc = &cpi->svc; ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; int target; const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); if (cpi->oxcf.rc_cfg.max_consec_drop_ms > 0) { double framerate = cpi->framerate > 1 ? round(cpi->framerate) : cpi->framerate; rc->max_consec_drop = saturate_cast_double_to_int( ceil(cpi->oxcf.rc_cfg.max_consec_drop_ms * framerate / 1000)); } if (cpi->ppi->use_svc) { av1_update_temporal_layer_framerate(cpi); av1_restore_layer_context(cpi); } cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi); // Set frame type. 
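  // set_key_frame() forces a key frame on the very first frame, when
  // FRAMEFLAGS_KEY is set, or when auto keyframing is due (frames_to_key == 0
  // for non-SVC; current_superframe % key_freq_max == 0 on the base spatial
  // layer for SVC).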
if (set_key_frame(cpi, frame_flags)) { *frame_type = KEY_FRAME; p_rc->this_key_frame_forced = cm->current_frame.frame_number != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max; p_rc->kf_boost = DEFAULT_KF_BOOST_RT; gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE; gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME; gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET; if (cpi->ppi->use_svc) { if (cm->current_frame.frame_number > 0) av1_svc_reset_temporal_layers(cpi, 1); svc->layer_context[layer].is_key_frame = 1; } rc->frame_number_encoded = 0; cpi->ppi->rtc_ref.non_reference_frame = 0; rc->static_since_last_scene_change = 0; } else { *frame_type = INTER_FRAME; gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME; gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE; if (cpi->ppi->use_svc) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; lc->is_key_frame = svc->spatial_layer_id == 0 ? 0 : svc->layer_context[svc->temporal_layer_id].is_key_frame; } // If the user is setting the reference structure with // set_ref_frame_config and did not set any references, set the // frame type to Intra-only. if (cpi->ppi->rtc_ref.set_ref_frame_config) { int no_references_set = 1; for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { if (cpi->ppi->rtc_ref.reference[i]) { no_references_set = 0; break; } } // Set to intra_only_frame if no references are set. // The stream can start decoding on INTRA_ONLY_FRAME so long as the // layer with the intra_only_frame doesn't signal a reference to a slot // that hasn't been set yet. if (no_references_set) *frame_type = INTRA_ONLY_FRAME; } } if (cpi->active_map.enabled && cpi->rc.percent_blocks_inactive == 100) { rc->frame_source_sad = 0; rc->avg_source_sad = (3 * rc->avg_source_sad + rc->frame_source_sad) >> 2; rc->percent_blocks_with_motion = 0; rc->high_source_sad = 0; } else if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) { if (rc->prev_coded_width == cm->width && rc->prev_coded_height == cm->height) { rc_scene_detection_onepass_rt(cpi, frame_input); } else { aom_free(cpi->src_sad_blk_64x64); cpi->src_sad_blk_64x64 = NULL; } } if (((*frame_type == KEY_FRAME && cpi->sf.rt_sf.rc_adjust_keyframe) || (cpi->sf.rt_sf.rc_compute_spatial_var_sc && rc->high_source_sad)) && svc->spatial_layer_id == 0 && cm->seq_params->bit_depth == 8 && cpi->oxcf.rc_cfg.max_intra_bitrate_pct > 0) rc_spatial_act_onepass_rt(cpi, frame_input->source->y_buffer, frame_input->source->y_stride); // Check for dynamic resize, for single spatial layer for now. // For temporal layers only check on base temporal layer. if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) { if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0) dynamic_resize_one_pass_cbr(cpi); if (rc->resize_state == THREE_QUARTER) { resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2; resize_pending_params->height = (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2; } else if (rc->resize_state == ONE_HALF) { resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1; resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1; } else { resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width; resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height; } } else if (is_frame_resize_pending(cpi)) { resize_reset_rc(cpi, resize_pending_params->width, resize_pending_params->height, cm->width, cm->height); } // Set the GF interval and update flag. 
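  // This is skipped when rc->rtc_external_ratectrl is set (external rate
  // control).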
if (!rc->rtc_external_ratectrl) set_gf_interval_update_onepass_rt(cpi, *frame_type); // Set target size. if (cpi->oxcf.rc_cfg.mode == AOM_CBR) { if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) { target = av1_calc_iframe_target_size_one_pass_cbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_cbr( cpi, gf_group->update_type[cpi->gf_frame_index]); } } else { if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) { target = av1_calc_iframe_target_size_one_pass_vbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_vbr( cpi, gf_group->update_type[cpi->gf_frame_index]); } } if (cpi->oxcf.rc_cfg.mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level; av1_rc_set_frame_target(cpi, target, cm->width, cm->height); rc->base_frame_target = target; cm->current_frame.frame_type = *frame_type; // For fixed mode SVC: if KSVC is enabled remove inter layer // prediction on spatial enhancement layer frames for frames // whose base is not KEY frame. if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode && svc->number_spatial_layers > 1 && !svc->layer_context[layer].is_key_frame) { ExternalFlags *const ext_flags = &cpi->ext_flags; ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; } } #define CHECK_INTER_LAYER_PRED(ref_frame) \ ((cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) && \ (av1_check_ref_is_low_spatial_res_super_frame(cpi, ref_frame))) int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { AV1_COMMON *const cm = &cpi->common; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; double rate_correction_factor = cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL]; const int target_size = cpi->rc.avg_frame_bandwidth; double new_correction_factor; int target_bits_per_mb; double q2; int enumerator; int inter_layer_pred_on = 0; int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; if (cpi->svc.spatial_layer_id > 0) { // For spatial layers: check if inter-layer (spatial) prediction is used // (check if any reference is being used that is the lower spatial layer), inter_layer_pred_on = CHECK_INTER_LAYER_PRED(LAST_FRAME) || CHECK_INTER_LAYER_PRED(GOLDEN_FRAME) || CHECK_INTER_LAYER_PRED(ALTREF_FRAME); } // If inter-layer prediction is on: we expect to pull up the quality from // the lower spatial layer, so we can use a lower q. if (cpi->svc.spatial_layer_id > 0 && inter_layer_pred_on) { *q = (cpi->rc.worst_quality + *q) >> 1; } else { // For easy scene changes used lower QP, otherwise set max-q. // If rt_sf->compute_spatial_var_sc is enabled relax the max-q // condition based on frame spatial variance. if (cpi->sf.rt_sf.rc_compute_spatial_var_sc) { if (cpi->rc.frame_spatial_variance < 100) { *q = (cpi->rc.worst_quality + *q) >> 1; } else if (cpi->rc.frame_spatial_variance < 400 || (cpi->rc.frame_source_sad < 80000 && cpi->rc.frame_spatial_variance < 1000)) { *q = (3 * cpi->rc.worst_quality + *q) >> 2; } else { *q = cpi->rc.worst_quality; } } else { *q = (3 * cpi->rc.worst_quality + *q) >> 2; // For screen content use the max-q set by the user to allow for less // overshoot on slide changes. if (is_screen_content) *q = cpi->rc.worst_quality; } } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting // them may cause next frame to select low QP and overshoot again. 
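  // The new rate correction factor below inverts av1_rc_bits_per_mb(): from
  // bits_per_mb ~= enumerator * factor / q, the factor that would have
  // produced the target size at the chosen (near max) q is
  // target_bits_per_mb * q / enumerator.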
p_rc->avg_frame_qindex[INTER_FRAME] = *q; p_rc->buffer_level = p_rc->optimal_buffer_level; p_rc->bits_off_target = p_rc->optimal_buffer_level; // Reset rate under/over-shoot flags. cpi->rc.rc_1_frame = 0; cpi->rc.rc_2_frame = 0; // Adjust rate correction factor. target_bits_per_mb = (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs); // Reset rate correction factor: for now base it on target_bits_per_mb // and qp (==max_QP). This comes from the inverse computation of // av1_rc_bits_per_mb(). q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); enumerator = get_bpmb_enumerator(INTER_NORMAL, is_screen_content); new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; if (new_correction_factor > rate_correction_factor) { rate_correction_factor = (new_correction_factor + rate_correction_factor) / 2.0; if (rate_correction_factor > MAX_BPB_FACTOR) rate_correction_factor = MAX_BPB_FACTOR; cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } // For temporal layers: reset the rate control parameters across all // temporal layers. Only do it for spatial enhancement layers when // inter_layer_pred_on is not set (off). if (cpi->svc.number_temporal_layers > 1 && (cpi->svc.spatial_layer_id == 0 || inter_layer_pred_on == 0)) { SVC *svc = &cpi->svc; for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int sl = svc->spatial_layer_id; const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; lp_rc->avg_frame_qindex[INTER_FRAME] = *q; lp_rc->buffer_level = lp_rc->optimal_buffer_level; lp_rc->bits_off_target = lp_rc->optimal_buffer_level; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } } return 1; } int av1_postencode_drop_cbr(AV1_COMP *cpi, size_t *size) { PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; size_t frame_size = *size << 3; const int64_t new_buffer_level = p_rc->buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; // Drop if new buffer level (given the encoded frame size) goes below a // threshold and encoded frame size is much larger than per-frame-bandwidth. // If the frame is already labelled as scene change (high_source_sad = 1) // or the QP is close to max, then no need to drop. const int qp_thresh = 3 * (cpi->rc.worst_quality >> 2); const int64_t buffer_thresh = p_rc->optimal_buffer_level >> 2; if (!cpi->rc.high_source_sad && new_buffer_level < buffer_thresh && frame_size > 8 * (unsigned int)cpi->rc.avg_frame_bandwidth && cpi->common.quant_params.base_qindex < qp_thresh) { *size = 0; cpi->is_dropped_frame = true; restore_all_coding_context(cpi); av1_rc_postencode_update_drop_frame(cpi); // Force max_q on next fame. Reset some RC parameters. cpi->rc.force_max_q = 1; p_rc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; p_rc->buffer_level = p_rc->optimal_buffer_level; p_rc->bits_off_target = p_rc->optimal_buffer_level; cpi->rc.rc_1_frame = 0; cpi->rc.rc_2_frame = 0; if (cpi->svc.number_spatial_layers > 1 || cpi->svc.number_temporal_layers > 1) { SVC *svc = &cpi->svc; // Postencode drop is only checked on base spatial layer, // for now if max-q is set on base we force it on all layers. 
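      // The loop below mirrors the base-layer reset (force_max_q,
      // avg_frame_qindex, buffer levels and the rc_1_frame/rc_2_frame
      // history) into every spatial and temporal layer context.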
for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; // Force max_q on next fame. Reset some RC parameters. lrc->force_max_q = 1; lp_rc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; lp_rc->buffer_level = lp_rc->optimal_buffer_level; lp_rc->bits_off_target = lp_rc->optimal_buffer_level; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; } } } return 1; } return 0; } aom-3.12.1/av1/encoder/ratectrl.h000066400000000000000000000554051477627663500165110ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_RATECTRL_H_ #define AOM_AV1_ENCODER_RATECTRL_H_ #include "aom/aom_codec.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 // Use this macro to turn on/off use of alt-refs in one-pass mode. #define USE_ALTREF_FOR_ONE_PASS 1 // Threshold used to define if a KF group is static (e.g. a slide show). // Essentially, this means that no frame in the group has more than 1% of MBs // that are not marked as coded with 0,0 motion in the first pass. #define STATIC_KF_GROUP_THRESH 99 #define STATIC_KF_GROUP_FLOAT_THRESH 0.99 // The maximum duration of a GF group that is static (e.g. a slide show). #define MAX_STATIC_GF_GROUP_LENGTH 250 #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 32 #define FIXED_GF_INTERVAL 16 #define MAX_GF_LENGTH_LAP 16 #define FIXED_GF_INTERVAL_RT 80 #define MAX_GF_INTERVAL_RT 160 #define MAX_NUM_GF_INTERVALS 15 #define MAX_ARF_LAYERS 6 // #define STRICT_RC #define DEFAULT_KF_BOOST_RT 2300 #define DEFAULT_GF_BOOST_RT 2000 // A passive rate control strategy for screen content type in real-time mode. // When it is turned on, the compression performance is improved by // 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains // over 20% on metric. // The downside is that it does not guarantee frame size. // Since RT mode has a tight restriction on buffer overflow control, we // turn it off by default. #define RT_PASSIVE_STRATEGY 0 #define MAX_Q_HISTORY 1000 typedef struct { int resize_width; int resize_height; uint8_t superres_denom; } size_params_type; enum { INTER_NORMAL, GF_ARF_LOW, GF_ARF_STD, KF_STD, RATE_FACTOR_LEVELS } UENUM1BYTE(RATE_FACTOR_LEVEL); enum { KF_UPDATE, LF_UPDATE, GF_UPDATE, ARF_UPDATE, OVERLAY_UPDATE, INTNL_OVERLAY_UPDATE, // Internal Overlay Frame INTNL_ARF_UPDATE, // Internal Altref Frame FRAME_UPDATE_TYPES } UENUM1BYTE(FRAME_UPDATE_TYPE); enum { REFBUF_RESET, // Clear reference frame buffer REFBUF_UPDATE, // Refresh reference frame buffer REFBUF_STATES } UENUM1BYTE(REFBUF_STATE); typedef enum { NO_RESIZE = 0, DOWN_THREEFOUR = 1, // From orig to 3/4. 
DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2. UP_THREEFOUR = -1, // From 1/2 to 3/4. UP_ORIG = -2, // From 1/2 or 3/4 to orig. } RESIZE_ACTION; typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE; #define MAX_FIRSTPASS_ANALYSIS_FRAMES 150 typedef enum region_types { STABLE_REGION = 0, HIGH_VAR_REGION = 1, SCENECUT_REGION = 2, BLENDING_REGION = 3, } REGION_TYPES; typedef struct regions { int start; int last; double avg_noise_var; double avg_cor_coeff; double avg_sr_fr_ratio; double avg_intra_err; double avg_coded_err; REGION_TYPES type; } REGIONS; /*!\endcond */ /*! * \brief Rate Control parameters and status */ typedef struct { // Rate targetting variables /*! * Baseline target rate for frame before adjustment for previous under or * over shoot. */ int base_frame_target; /*! * Target rate for frame after adjustment for previous under or over shoot. */ int this_frame_target; // Actual frame target after rc adjustment. /*! * Projected size for current frame */ int projected_frame_size; /*! * Bit size of transform coefficient for current frame. */ int coefficient_size; /*! * Super block rate target used with some adaptive quantization strategies. */ int sb64_target_rate; /*! * Number of frames since the last ARF / GF. */ int frames_since_golden; /*! * Number of frames till the next ARF / GF is due. */ int frames_till_gf_update_due; /*! * Number of determined gf groups left */ int intervals_till_gf_calculate_due; /*!\cond */ int min_gf_interval; int max_gf_interval; int static_scene_max_gf_interval; /*!\endcond */ /*! * Frames before the next key frame */ int frames_to_key; /*!\cond */ int frames_since_key; int frames_to_fwd_kf; int is_src_frame_alt_ref; int sframe_due; int high_source_sad; int high_motion_content_screen_rtc; uint64_t avg_source_sad; uint64_t prev_avg_source_sad; uint64_t frame_source_sad; uint64_t frame_spatial_variance; int static_since_last_scene_change; int last_encoded_size_keyframe; int last_target_size_keyframe; int frames_since_scene_change; int perc_spatial_flat_blocks; int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame int max_frame_bandwidth; // Maximum burst rate allowed for a frame. int prev_avg_frame_bandwidth; int ni_av_qi; int ni_tot_qi; int decimation_factor; int decimation_count; int prev_frame_is_dropped; int drop_count_consec; int max_consec_drop; int force_max_q; int postencode_drop; /*! * Frame number for encoded frames (non-dropped). * Use for setting the rtc reference structure. */ unsigned int frame_number_encoded; /*!\endcond */ /*! * User specified maximum Q allowed for current frame */ int worst_quality; /*! * User specified minimum Q allowed for current frame */ int best_quality; /*!\cond */ // rate control history for last frame(1) and the frame before(2). // -1: overshoot // 1: undershoot // 0: not initialized. int rc_1_frame; int rc_2_frame; int q_1_frame; int q_2_frame; /*!\endcond */ /*! * Proposed maximum allowed Q for current frame */ int active_worst_quality; /*!\cond */ // Track amount of low motion in scene int avg_frame_low_motion; int cnt_zeromv; // signals if number of blocks with motion is high int percent_blocks_with_motion; // signals percentage of 16x16 blocks that are inactive, via active_maps int percent_blocks_inactive; // Maximum value of source sad across all blocks of frame. uint64_t max_block_source_sad; // For dynamic resize, 1 pass cbr. 
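  // resize_state tracks the current scaling notch (ORIG, THREE_QUARTER or
  // ONE_HALF); resize_avg_qp, resize_buffer_underflow and resize_count
  // accumulate per-window statistics for dynamic_resize_one_pass_cbr().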
RESIZE_STATE resize_state; int resize_avg_qp; int resize_buffer_underflow; int resize_count; // Flag to disable content related qp adjustment. int rtc_external_ratectrl; // Stores fast_extra_bits of the current frame. int frame_level_fast_extra_bits; double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS]; int frame_num_last_gf_refresh; int prev_coded_width; int prev_coded_height; // The ratio used for inter frames in bit estimation. // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_ // pct > THR), consider to add bit_est_ratio_g for golden frames. int bit_est_ratio; // Whether to use a fixed qp for the frame, bypassing internal rate control. // This flag will reset to 0 after every frame. int use_external_qp_one_pass; /*!\endcond */ } RATE_CONTROL; /*! * \brief Primary Rate Control parameters and status */ typedef struct { // Sub-gop level Rate targetting variables /*! * Target bit budget for the current GF / ARF group of frame. */ int64_t gf_group_bits; /*! * Boost factor used to calculate the extra bits allocated to the key frame */ int kf_boost; /*! * Boost factor used to calculate the extra bits allocated to ARFs and GFs */ int gfu_boost; /*! * Stores the determined gf group lengths for a set of gf groups */ int gf_intervals[MAX_NUM_GF_INTERVALS]; /*! * The current group's index into gf_intervals[] */ int cur_gf_index; /*!\cond */ int num_regions; REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; int regions_offset; // offset of regions from the last keyframe int frames_till_regions_update; int baseline_gf_interval; int constrained_gf_group; int this_key_frame_forced; int next_key_frame_forced; /*!\endcond */ /*! * Initial buffuer level in ms for CBR / low delay encoding */ int64_t starting_buffer_level; /*! * Optimum / target buffuer level in ms for CBR / low delay encoding */ int64_t optimal_buffer_level; /*! * Maximum target buffuer level in ms for CBR / low delay encoding */ int64_t maximum_buffer_size; /*! * Q index used for ALT frame */ int arf_q; /*!\cond */ float_t arf_boost_factor; int base_layer_qp; // Total number of stats used only for kf_boost calculation. int num_stats_used_for_kf_boost; // Total number of stats used only for gfu_boost calculation. int num_stats_used_for_gfu_boost; // Total number of stats required by gfu_boost calculation. int num_stats_required_for_gfu_boost; int enable_scenecut_detection; int use_arf_in_this_kf_group; int ni_frames; double tot_q; /*!\endcond */ /*! * Q used for last boosted (non leaf) frame */ int last_kf_qindex; /*! * Average of q index of previous encoded frames in a sequence. */ int avg_frame_qindex[FRAME_TYPES]; #if CONFIG_FPMT_TEST /*! * Temporary variable used in simulating the delayed update of * active_best_quality. */ int temp_active_best_quality[MAX_ARF_LAYERS + 1]; /*! * Temporary variable used in simulating the delayed update of * last_boosted_qindex. */ int temp_last_boosted_qindex; /*! * Temporary variable used in simulating the delayed update of * avg_q. */ double temp_avg_q; /*! * Temporary variable used in simulating the delayed update of * last_q. */ int temp_last_q[FRAME_TYPES]; /*! * Temporary variable used in simulating the delayed update of * projected_frame_size. */ int temp_projected_frame_size; /*! * Temporary variable used in simulating the delayed update of * total_actual_bits. */ int64_t temp_total_actual_bits; /*! * Temporary variable used in simulating the delayed update of * buffer_level. */ int64_t temp_buffer_level; /*! 
* Temporary variable used in simulating the delayed update of * vbr_bits_off_target. */ int64_t temp_vbr_bits_off_target; /*! * Temporary variable used in simulating the delayed update of * vbr_bits_off_target_fast. */ int64_t temp_vbr_bits_off_target_fast; /*! * Temporary variable used in simulating the delayed update of * rate_correction_factors. */ double temp_rate_correction_factors[RATE_FACTOR_LEVELS]; /*! * Temporary variable used in simulating the delayed update of * rate_error_estimate. */ int temp_rate_error_estimate; /*! * Temporary variable used in simulating the delayed update of * rolling_arf_group_target_bits. */ int temp_rolling_arf_group_target_bits; /*! * Temporary variable used in simulating the delayed update of * rolling_arf_group_actual_bits;. */ int temp_rolling_arf_group_actual_bits; /*! * Temporary variable used in simulating the delayed update of * bits_left;. */ int64_t temp_bits_left; /*! * Temporary variable used in simulating the delayed update of * extend_minq. */ int temp_extend_minq; /*! * Temporary variable used in simulating the delayed update of * extend_maxq. */ int temp_extend_maxq; #endif /*! * Proposed minimum allowed Q different layers in a coding pyramid */ int active_best_quality[MAX_ARF_LAYERS + 1]; /*! * Q used for last boosted (non leaf) frame (GF/KF/ARF) */ int last_boosted_qindex; /*! * Average Q value of previous inter frames */ double avg_q; /*! * Q used on last encoded frame of the given type. */ int last_q[FRAME_TYPES]; /*! * Correction factors used to adjust the q estimate for a given target rate * in the encode loop. */ double rate_correction_factors[RATE_FACTOR_LEVELS]; /*! * Current total consumed bits. */ int64_t total_actual_bits; /*! * Current total target bits. */ int64_t total_target_bits; /*! * Current buffer level. */ int64_t buffer_level; /*! * PCT rc error. */ int rate_error_estimate; /*! * Error bits available from previously encoded frames. */ int64_t vbr_bits_off_target; /*! * Error bits available from previously encoded frames undershoot. */ int64_t vbr_bits_off_target_fast; /*! * Total bits deviated from the average frame target, from previously * encoded frames. */ int64_t bits_off_target; /*! * Rolling monitor target bits updated based on current frame target size. */ int rolling_target_bits; /*! * Rolling monitor actual bits updated based on current frame final projected * size. */ int rolling_actual_bits; /*! * The history of qindex for each frame. * Only used when RT_PASSIVE_STRATEGY = 1. */ int q_history[MAX_Q_HISTORY]; } PRIMARY_RATE_CONTROL; /*!\cond */ struct AV1_COMP; struct AV1EncoderConfig; struct GF_GROUP; void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf, PRIMARY_RATE_CONTROL *p_rc); void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc); int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q, double correction_factor); double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth); // Converts a Q value to a qindex. int av1_convert_q_to_qindex(double q, aom_bit_depth_t bit_depth); void av1_rc_init_minq_luts(void); int av1_rc_get_default_min_gf_interval(int width, int height, double framerate); // Generally at the high level, the following flow is expected // to be enforced for rate control: // First call per frame, one of: // av1_get_one_pass_rt_params() // av1_get_second_pass_params() // depending on the usage to set the rate control encode parameters desired. // // Then, call encode_frame_to_data_rate() to perform the // actual encode. 
This function will in turn call encode_frame() // one or more times, followed by: // av1_rc_postencode_update_drop_frame() // // The majority of rate control parameters are only expected // to be set in the av1_get_..._params() functions and // updated during the av1_rc_postencode_update...() functions. // The only exceptions are av1_rc_drop_frame() and // av1_rc_update_rate_correction_factors() functions. // Functions to set parameters for encoding before the actual // encode_frame_to_data_rate() function. struct EncodeFrameInput; // Post encode update of the rate control parameters based // on bytes used void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used); // Post encode update of the rate control parameters for dropped frames void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi); /*!\endcond */ /*!\brief Updates the rate correction factor linking Q to output bits * * This function updates the Q rate correction factor after an encode * cycle depending on whether we overshot or undershot the target rate. * * \ingroup rate_control * \param[in] cpi Top level encoder instance structure * \param[in] is_encode_stage Indicates if recode loop or post-encode * \param[in] width Frame width * \param[in] height Frame height * * \remark Updates the relevant rate correction factor in cpi->rc */ void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int is_encode_stage, int width, int height); /*!\cond */ // Decide if we should drop this frame: For 1-pass CBR. // Changes only the decimation count in the rate control structure int av1_rc_drop_frame(struct AV1_COMP *cpi); // Computes frame size bounds. void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, int this_frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); /*!\endcond */ /*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[in] gf_index Index of this frame in the golden frame group * \param[out] bottom_index Bottom bound for q index (best quality) * \param[out] top_index Top bound for q index (worst quality) * \return Returns selected q index to be used for encoding this frame. * Also, updates \c rc->arf_q. */ int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height, int gf_index, int *bottom_index, int *top_index); /*!\brief Estimates q to achieve a target bits per frame * * \ingroup rate_control * \param[in] cpi Top level encoder instance structure * \param[in] target_bits_per_frame Frame rate target * \param[in] active_worst_quality Max Q allowed * \param[in] active_best_quality Min Q allowed * \param[in] width Frame width * \param[in] height Frame height * * \return Returns a q index value */ int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality, int width, int height); /*!\cond */ // Estimates bits per mb for a given qindex and correction factor. int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double correction_factor, int accurate_estimate); // Find q_index corresponding to desired_q, within [best_qindex, worst_qindex]. // To be precise, 'q_index' is the smallest integer, for which the corresponding // q >= desired_q. // If no such q index is found, returns 'worst_qindex'. 
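// For example, av1_find_qindex(30.0, AOM_BITS_8, 0, 255) returns the smallest
// qindex in [0, 255] whose real Q value is at least 30.0.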
int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, int best_qindex, int worst_qindex); // Computes a q delta (in "q index" terms) to get from a starting q value // to a target q value int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, aom_bit_depth_t bit_depth); // Computes a q delta (in "q index" terms) to get from a starting q value // to a value that should equate to the given rate ratio. int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, double rate_target_ratio); void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height); void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height); int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width, int height); void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi); void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi, int gf_update); /*!\endcond */ /*!\brief Calculates how many bits to use for a P frame in one pass vbr * * \ingroup rate_control * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] frame_update_type Type of frame * * \return Returns the target number of bits for this frame. */ int av1_calc_pframe_target_size_one_pass_vbr( const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type); /*!\brief Calculates how many bits to use for an i frame in one pass vbr * * \ingroup rate_control * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \return Returns the target number of bits for this frame. */ int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi); /*!\brief Calculates how many bits to use for a P frame in one pass cbr * * \ingroup rate_control * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] frame_update_type Type of frame * * \return Returns the target number of bits for this frame. */ int av1_calc_pframe_target_size_one_pass_cbr( const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type); /*!\brief Calculates how many bits to use for an i frame in one pass cbr * * \ingroup rate_control * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \return Returns the target number of bits for this frame. */ int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi); /*!\brief Setup the rate control parameters for 1 pass real-time mode. * * - Sets the frame type and target frame size. * - Sets the GF update. * - Checks for scene change. * - Sets the reference prediction structure for 1 layers (non-SVC). * - Resets and updates are done for SVC. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] frame_type Encoder frame type * \param[in] frame_input Current and last input source frames * \param[in] frame_flags Encoder frame flags * * \remark Nothing is returned. Instead the settings computed in this * function are set in: \c frame_params, \c cpi->common, \c cpi->rc, * \c cpi->svc. */ void av1_get_one_pass_rt_params(struct AV1_COMP *cpi, FRAME_TYPE *const frame_type, const struct EncodeFrameInput *frame_input, unsigned int frame_flags); /*!\brief Increase q on expected encoder overshoot, for CBR mode. 
* * Handles the case when encoder is expected to create a large frame: * - q is increased to value closer to \c cpi->rc.worst_quality * - avg_frame_qindex is reset * - buffer levels are reset * - rate correction factor is adjusted * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in] q Current q index * * \return q is returned, and updates are done to \c cpi->rc. */ int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q); /*!\brief Check if frame should be dropped, for RTC mode. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * \param[in,out] size Size of encoded frame * * \return 1 if frame is to be dropped, 0 otherwise (no drop). * Set cpi->rc.force_max_q if frame is to be dropped, and updates are * made to rate control parameters. *size is set to 0 when this * function returns 1 (frame is dropped). */ int av1_postencode_drop_cbr(struct AV1_COMP *cpi, size_t *size); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RATECTRL_H_ aom-3.12.1/av1/encoder/rc_utils.h000066400000000000000000000443051477627663500165120ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_RC_UTILS_H_ #define AOM_AV1_ENCODER_RC_UTILS_H_ #include "av1/encoder/encoder.h" #include "aom_dsp/psnr.h" #ifdef __cplusplus extern "C" { #endif static inline void check_reset_rc_flag(AV1_COMP *cpi) { RATE_CONTROL *rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; if (cpi->common.current_frame.frame_number > (unsigned int)cpi->svc.number_spatial_layers) { if (cpi->ppi->use_svc) { av1_svc_check_reset_layer_rc_flag(cpi); } else { if (rc->avg_frame_bandwidth / 3 > (rc->prev_avg_frame_bandwidth >> 1) || rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) { rc->rc_1_frame = 0; rc->rc_2_frame = 0; p_rc->bits_off_target = p_rc->optimal_buffer_level; p_rc->buffer_level = p_rc->optimal_buffer_level; } } } } static inline void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, AV1_PRIMARY *ppi) { PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; const int64_t bandwidth = rc_cfg->target_bandwidth; const int64_t starting = rc_cfg->starting_buffer_level_ms; const int64_t optimal = rc_cfg->optimal_buffer_level_ms; const int64_t maximum = rc_cfg->maximum_buffer_size_ms; p_rc->starting_buffer_level = starting * bandwidth / 1000; p_rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; p_rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. 
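  // For example, if maximum_buffer_size_ms drops from 1000 to 500 at 1 mbps,
  // maximum_buffer_size becomes 500000 bits and any larger stored
  // buffer_level or bits_off_target is clamped down to it below.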
p_rc->bits_off_target = AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); } static inline void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level, int tier) { AV1EncoderConfig *const oxcf = &cpi->oxcf; SequenceHeader *const seq_params = cpi->common.seq_params; TileConfig *const tile_cfg = &oxcf->tile_cfg; RateControlCfg *const rc_cfg = &oxcf->rc_cfg; // Adjust target bitrate to be no larger than 70% of level limit. const BITSTREAM_PROFILE profile = seq_params->profile; const double level_bitrate_limit = av1_get_max_bitrate_for_level(target_level, tier, profile); const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70); rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate); // Also need to update cpi->ppi->twopass.bits_left. TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats; if (stats != NULL) cpi->ppi->twopass.bits_left = (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0); // Adjust max over-shoot percentage. rc_cfg->over_shoot_pct = 0; // Adjust max quantizer. rc_cfg->worst_allowed_q = 255; // Adjust number of tiles and tile columns to be under level limit. int max_tiles, max_tile_cols; av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols); while (tile_cfg->tile_columns > 0 && (1 << tile_cfg->tile_columns) > max_tile_cols) { --tile_cfg->tile_columns; } const int tile_cols = (1 << tile_cfg->tile_columns); while (tile_cfg->tile_rows > 0 && tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) { --tile_cfg->tile_rows; } // Adjust min compression ratio. const int still_picture = seq_params->still_picture; const double min_cr = av1_get_min_cr_for_level(target_level, tier, still_picture); rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100)); } #if !CONFIG_REALTIME_ONLY /*!\brief Function to test for conditions that indicate we should loop * back and recode a frame. * * \ingroup rate_control * * \param[in] cpi Top-level encoder structure * \param[in] high_limit Upper rate threshold * \param[in] low_limit Lower rate threshold * \param[in] q Current q index * \param[in] maxq Maximum allowed q index * \param[in] minq Minimum allowed q index * * \return Indicates if a recode is required. * \retval 1 Recode Required * \retval 0 No Recode required */ static inline int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, int maxq, int minq) { const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) || (frame_is_kfgfarf && (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) { // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || (rc->projected_frame_size < low_limit && q > minq)) { force_recode = 1; } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) { // Deal with frame undershoot and whether or not we are // below the automatically set cq level. 
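      // i.e. force a recode when q ended up above the configured cq_level
      // while the frame used less than 7/8 of its target bits.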
if (q > oxcf->rc_cfg.cq_level && rc->projected_frame_size < (((int64_t)rc->this_frame_target * 7) >> 3)) { force_recode = 1; } } } return force_recode; } static inline double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor, int frame_count) { double factor = sqrt((double)frame_count); factor = AOMMIN(factor, max_factor); factor = AOMMAX(factor, min_factor); factor = (200.0 + 10.0 * factor); return factor; } static inline int get_gfu_boost_from_r0_lap(double min_factor, double max_factor, double r0, int frames_to_key) { double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor, frames_to_key); const int boost = (int)rint(factor / r0); return boost; } static inline double av1_get_kf_boost_projection_factor(int frame_count) { double factor = sqrt((double)frame_count); factor = AOMMIN(factor, 10.0); factor = AOMMAX(factor, 4.0); factor = (75.0 + 14.0 * factor); return factor; } static inline int get_regulated_q_overshoot(AV1_COMP *const cpi, int is_encode_stage, int q_low, int q_high, int top_index, int bottom_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, cm->height); int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, AOMMAX(q_high, top_index), cm->width, cm->height); int retries = 0; while (q_regulated < q_low && retries < 10) { av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, cm->height); q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, AOMMAX(q_high, top_index), cm->width, cm->height); retries++; } return q_regulated; } static inline int get_regulated_q_undershoot(AV1_COMP *const cpi, int is_encode_stage, int q_high, int top_index, int bottom_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, cm->height); int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index, cm->width, cm->height); int retries = 0; while (q_regulated > q_high && retries < 10) { av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, cm->height); q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index, cm->width, cm->height); retries++; } return q_regulated; } /*!\brief Called after encode_with_recode_loop() has just encoded a frame. * This function works out whether we undershot or overshot our bitrate * target and adjusts q as appropriate. It also decides whether or not * we need to recode the frame to get closer to the target rate. * * \ingroup rate_control * * \param[in] cpi Top-level encoder structure * \param[out] loop Should we go around the recode loop again * \param[in,out] q New q index value * \param[in,out] q_low Low q index limit for this loop itteration * \param[in,out] q_high High q index limit for this loop itteration * \param[in] top_index Max permited new value for q index * \param[in] bottom_index Min permited new value for q index * \param[in,out] undershoot_seen Have we seen undershoot on this frame * \param[in,out] overshoot_seen Have we seen overshoot on this frame * \param[in,out] low_cr_seen Have we previously trriggered recode * because the compression ration was less * than a given minimum threshold. * \param[in] loop_count Loop itterations so far. 
* */ static inline void recode_loop_update_q( AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low, int *const q_high, const int top_index, const int bottom_index, int *const undershoot_seen, int *const overshoot_seen, int *const low_cr_seen, const int loop_count) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; *loop = 0; // Special case for overlay frame. if (rc->is_src_frame_alt_ref && rc->projected_frame_size < rc->max_frame_bandwidth) return; const int min_cr = rc_cfg->min_cr; if (min_cr > 0) { const double compression_ratio = av1_get_compression_ratio(cm, rc->projected_frame_size >> 3); const double target_cr = min_cr / 100.0; if (compression_ratio < target_cr) { *low_cr_seen = 1; if (*q < rc->worst_quality) { const double cr_ratio = target_cr / compression_ratio; const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio)); *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality); *q_low = AOMMAX(*q, *q_low); *q_high = AOMMAX(*q, *q_high); *loop = 1; } } if (*low_cr_seen) return; } if (cpi->ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { // Initialize level info. at the beginning of each sequence. if (cm->current_frame.frame_type == KEY_FRAME && cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { av1_init_level_info(cpi); } const AV1LevelParams *const level_params = &cpi->ppi->level_params; // TODO(any): currently only checking operating point 0 const AV1LevelInfo *const level_info = level_params->level_info[0]; const DECODER_MODEL *const decoder_models = level_info->decoder_models; const AV1_LEVEL target_level = level_params->target_seq_level_idx[0]; if (target_level < SEQ_LEVELS && decoder_models[target_level].status == DECODER_MODEL_OK) { DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf( cpi, rc->projected_frame_size, &decoder_models[target_level]); if ((status == SMOOTHING_BUFFER_UNDERFLOW || status == SMOOTHING_BUFFER_OVERFLOW) && *q < rc->worst_quality) { *q = AOMMIN(*q + 10, rc->worst_quality); *q_low = AOMMAX(*q, *q_low); *q_high = AOMMAX(*q, *q_high); *loop = 1; return; } } } if (rc_cfg->mode == AOM_Q) return; const int last_q = *q; int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, &frame_under_shoot_limit, &frame_over_shoot_limit); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; if (cm->current_frame.frame_type == KEY_FRAME && p_rc->this_key_frame_forced && rc->projected_frame_size < rc->max_frame_bandwidth) { int64_t kf_err; const int64_t high_err_target = cpi->ambient_err; const int64_t low_err_target = cpi->ambient_err >> 1; #if CONFIG_AV1_HIGHBITDEPTH if (cm->seq_params->use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); } else { kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); } #else kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); #endif // Prevent possible divide by zero error below for perfect KF kf_err += !kf_err; // The key frame is not good enough or we can afford // to make it better without undue risk of popping. 
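    // kf_err is compared against cpi->ambient_err (high target) and half of
    // it (low target); q is then rescaled by target / kf_err, so e.g. a key
    // frame with twice the high target error has its q roughly halved before
    // clamping.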
if ((kf_err > high_err_target && rc->projected_frame_size <= frame_over_shoot_limit) || (kf_err > low_err_target && rc->projected_frame_size <= frame_under_shoot_limit)) { // Lower q_high *q_high = AOMMAX(*q - 1, *q_low); // Adjust Q *q = (int)((*q * high_err_target) / kf_err); *q = AOMMIN(*q, (*q_high + *q_low) >> 1); } else if (kf_err < low_err_target && rc->projected_frame_size >= frame_under_shoot_limit) { // The key frame is much better than the previous frame // Raise q_low *q_low = AOMMIN(*q + 1, *q_high); // Adjust Q *q = (int)((*q * low_err_target) / kf_err); *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); } // Clamp Q to upper and lower limits: *q = clamp(*q, *q_low, *q_high); *loop = (*q != last_q); return; } if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q, AOMMAX(*q_high, top_index), bottom_index)) { // Is the projected frame size out of range and are we allowed // to attempt to recode. // Frame size out of permitted range: // Update correction factor & compute new Q to try... // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. if (*q == *q_high && rc->projected_frame_size >= rc->max_frame_bandwidth) { const double q_val_high_current = av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth); const double q_val_high_new = q_val_high_current * ((double)rc->projected_frame_size / rc->max_frame_bandwidth); *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth, rc->best_quality, rc->worst_quality); } // Raise Qlow as to at least the current value *q_low = AOMMIN(*q + 1, *q_high); if (*undershoot_seen || loop_count > 2 || (loop_count == 2 && !frame_is_intra_only(cm))) { av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height); *q = (*q_high + *q_low + 1) / 2; } else if (loop_count == 2 && frame_is_intra_only(cm)) { const int q_mid = (*q_high + *q_low + 1) / 2; const int q_regulated = get_regulated_q_overshoot( cpi, 1, *q_low, *q_high, top_index, bottom_index); // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth // transition between loop_count < 2 and loop_count > 2. *q = (q_mid + q_regulated + 1) / 2; } else { *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index, bottom_index); } *overshoot_seen = 1; } else { // Frame is too small *q_high = AOMMAX(*q - 1, *q_low); if (*overshoot_seen || loop_count > 2 || (loop_count == 2 && !frame_is_intra_only(cm))) { av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height); *q = (*q_high + *q_low) / 2; } else if (loop_count == 2 && frame_is_intra_only(cm)) { const int q_mid = (*q_high + *q_low) / 2; const int q_regulated = get_regulated_q_undershoot( cpi, 1, *q_high, top_index, bottom_index); // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth // transition between loop_count < 2 and loop_count > 2. *q = (q_mid + q_regulated) / 2; // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) { *q_low = *q; } } else { *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index, bottom_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. 
if (rc_cfg->mode == AOM_CQ && *q < *q_low) { *q_low = *q; } } *undershoot_seen = 1; } // Clamp Q to upper and lower limits: *q = clamp(*q, *q_low, *q_high); } *loop = (*q != last_q); } #endif #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RC_UTILS_H_ aom-3.12.1/av1/encoder/rd.c /* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include <assert.h> #include <limits.h> #include <math.h> #include <stdio.h> #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" #include "aom_ports/mem.h" #include "aom_ports/aom_once.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/seg_common.h" #include "av1/encoder/cost.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" #include "av1/encoder/nonrd_opt.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "config/aom_config.h" #define RD_THRESH_POW 1.25 // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
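// For example, BLOCK_8X8 maps to 4 (x1.0, the 8x8 baseline), BLOCK_32X32 to 16 (x4) and BLOCK_128X128 to 64 (x16).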
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = { 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16 }; static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA] [EXT_TX_SIZES] = { { 1, 1, 1, 1 }, // unused { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, }; static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER] [EXT_TX_SIZES] = { { 1, 1, 1, 1 }, // unused { 1, 1, 0, 0 }, { 0, 0, 1, 0 }, { 0, 1, 1, 1 }, }; static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA, EXT_TX_SETS_INTER)] = { { // Intra EXT_TX_SET_DCTONLY, EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX, }, { // Inter EXT_TX_SET_DCTONLY, EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT, EXT_TX_SET_DCT_IDTX, }, }; void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs, FRAME_CONTEXT *fc) { int i, j; for (i = 0; i < PARTITION_CONTEXTS; ++i) av1_cost_tokens_from_cdf(mode_costs->partition_cost[i], fc->partition_cdf[i], NULL); if (cm->current_frame.skip_mode_info.skip_mode_flag) { for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i], fc->skip_mode_cdfs[i], NULL); } } for (i = 0; i < SKIP_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i], fc->skip_txfm_cdfs[i], NULL); } for (i = 0; i < KF_MODE_CONTEXTS; ++i) for (j = 0; j < KF_MODE_CONTEXTS; ++j) av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL); for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i], NULL); for (i = 0; i < CFL_ALLOWED_TYPES; ++i) for (j = 0; j < INTRA_MODES; ++j) av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j], fc->uv_mode_cdf[i][j], NULL); av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost, fc->filter_intra_mode_cdf, NULL); for (i = 0; i < BLOCK_SIZES_ALL; ++i) { if (av1_filter_intra_allowed_bsize(cm, i)) av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i], fc->filter_intra_cdfs[i], NULL); } for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i], fc->switchable_interp_cdf[i], NULL); for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) { av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i], fc->palette_y_size_cdf[i], NULL); av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i], fc->palette_uv_size_cdf[i], NULL); for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) { av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j], fc->palette_y_mode_cdf[i][j], NULL); } } for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i], fc->palette_uv_mode_cdf[i], NULL); } for (i = 0; i < PALETTE_SIZES; ++i) { for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j], fc->palette_y_color_index_cdf[i][j], NULL); av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j], fc->palette_uv_color_index_cdf[i][j], NULL); } } int sign_cost[CFL_JOINT_SIGNS]; av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U]; int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V]; if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); } else { const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); } if (CFL_SIGN_V(joint_sign) == 
CFL_SIGN_ZERO) { memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); } else { const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); } for (int u = 0; u < CFL_ALPHABET_SIZE; u++) cost_u[u] += sign_cost[joint_sign]; } for (i = 0; i < MAX_TX_CATS; ++i) for (j = 0; j < TX_SIZE_CONTEXTS; ++j) av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j], fc->tx_size_cdf[i][j], NULL); for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i], fc->txfm_partition_cdf[i], NULL); } for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { int s; for (s = 1; s < EXT_TX_SETS_INTER; ++s) { if (use_inter_ext_tx_for_txsize[s][i]) { av1_cost_tokens_from_cdf( mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i], av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]); } } for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { if (use_intra_ext_tx_for_txsize[s][i]) { for (j = 0; j < INTRA_MODES; ++j) { av1_cost_tokens_from_cdf( mode_costs->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j], av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]); } } } } for (i = 0; i < DIRECTIONAL_MODES; ++i) { av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i], fc->angle_delta_cdf[i], NULL); } av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL); for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) { av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i], fc->seg.spatial_pred_seg_cdf[i], NULL); } for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i], NULL); } if (!frame_is_intra_only(cm)) { for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i], fc->comp_inter_cdf[i], NULL); } for (i = 0; i < REF_CONTEXTS; ++i) { for (j = 0; j < SINGLE_REFS - 1; ++j) { av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j], fc->single_ref_cdf[i][j], NULL); } } for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i], fc->comp_ref_type_cdf[i], NULL); } for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j], fc->uni_comp_ref_cdf[i][j], NULL); } } for (i = 0; i < REF_CONTEXTS; ++i) { for (j = 0; j < FWD_REFS - 1; ++j) { av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j], NULL); } } for (i = 0; i < REF_CONTEXTS; ++i) { for (j = 0; j < BWD_REFS - 1; ++j) { av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j], fc->comp_bwdref_cdf[i][j], NULL); } } for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i], fc->intra_inter_cdf[i], NULL); } for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i], NULL); } for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL); } for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i], NULL); } for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i], NULL); } for (i = 0; i < INTER_MODE_CONTEXTS; ++i) av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i], fc->inter_compound_mode_cdf[i], NULL); for (i = 0; i < BLOCK_SIZES_ALL; ++i) av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i], 
fc->compound_type_cdf[i], NULL); for (i = 0; i < BLOCK_SIZES_ALL; ++i) { if (av1_is_wedge_used(i)) { av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i], fc->wedge_idx_cdf[i], NULL); } } for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i], fc->interintra_cdf[i], NULL); av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i], fc->interintra_mode_cdf[i], NULL); } for (i = 0; i < BLOCK_SIZES_ALL; ++i) { av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i], fc->wedge_interintra_cdf[i], NULL); } for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i], fc->motion_mode_cdf[i], NULL); } for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i], fc->obmc_cdf[i], NULL); } for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i], fc->compound_index_cdf[i], NULL); } for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i], fc->comp_group_idx_cdf[i], NULL); } } } #if !CONFIG_REALTIME_ONLY void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) { av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost, fc->switchable_restore_cdf, NULL); av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost, fc->wiener_restore_cdf, NULL); av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost, fc->sgrproj_restore_cdf, NULL); } #endif // !CONFIG_REALTIME_ONLY // Values are now correlated to quantizer. static int sad_per_bit_lut_8[QINDEX_RANGE]; static int sad_per_bit_lut_10[QINDEX_RANGE]; static int sad_per_bit_lut_12[QINDEX_RANGE]; static void init_me_luts_bd(int *bit16lut, int range, aom_bit_depth_t bit_depth) { int i; // Initialize the sad lut tables using a formulaic calculation for now. // This is to make it easier to resolve the impact of experimental changes // to the quantizer tables. for (i = 0; i < range; i++) { const double q = av1_convert_qindex_to_q(i, bit_depth); bit16lut[i] = (int)(0.0418 * q + 2.4107); } } static void init_me_luts(void) { init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8); init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10); init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12); } void av1_init_me_luts(void) { aom_once(init_me_luts); } static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; static const int rd_layer_depth_factor[7] = { 160, 160, 160, 160, 192, 208, 224 }; // Returns the default rd multiplier for inter frames for a given qindex. // The function here is a first pass estimate based on data from // a previous Vizer run static double def_inter_rd_multiplier(int qindex) { return 3.2 + (0.0015 * (double)qindex); } // Returns the default rd multiplier for ARF/Golden Frames for a given qindex. // The function here is a first pass estimate based on data from // a previous Vizer run static double def_arf_rd_multiplier(int qindex) { return 3.25 + (0.0015 * (double)qindex); } // Returns the default rd multiplier for key frames for a given qindex. 
// The function here is a first pass estimate based on data from // a previous Vizer run static double def_kf_rd_multiplier(int qindex) { return 3.3 + (0.0015 * (double)qindex); } int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, FRAME_UPDATE_TYPE update_type, int qindex, aom_tune_metric tuning) { const int q = av1_dc_quant_QTX(qindex, 0, bit_depth); int64_t rdmult = q * q; if (update_type == KF_UPDATE) { double def_rd_q_mult = def_kf_rd_multiplier(q); rdmult = (int64_t)((double)rdmult * def_rd_q_mult); } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) { double def_rd_q_mult = def_arf_rd_multiplier(q); rdmult = (int64_t)((double)rdmult * def_rd_q_mult); } else { double def_rd_q_mult = def_inter_rd_multiplier(q); rdmult = (int64_t)((double)rdmult * def_rd_q_mult); } if (tuning == AOM_TUNE_IQ) { // Further multiply rdmult (by up to 200/128 = 1.5625) to improve image // quality. The most noticeable effect is a mild bias towards choosing // larger transform sizes (e.g. one 16x16 transform instead of 4 8x8 // transforms). // For very high qindexes, start progressively reducing the weight towards // unity (128/128), as transforms are large enough and making them even // larger actually harms subjective quality and SSIMULACRA 2 scores. // This weight part of the equation was determined by iteratively increasing // weight on CID22 and Daala's subset1, and observing its effects on visual // quality and SSIMULACRA 2 scores along the usable (0-100) range. // The ramp-down part of the equation was determined by choosing a fixed // initial qindex point [qindex 159 = (255 - 159) * 3 / 4] where SSIMULACRA // 2 scores for encodes with qindexes greater than 159 scored at or above // their equivalents with no rdmult adjustment. const int weight = clamp(((255 - qindex) * 3) / 4, 0, 72) + 128; rdmult = (int64_t)((double)rdmult * weight / 128.0); } switch (bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1; } int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, const FRAME_UPDATE_TYPE update_type, const int layer_depth, const int boost_index, const FRAME_TYPE frame_type, const int use_fixed_qp_offsets, const int is_stat_consumption_stage, const aom_tune_metric tuning) { int64_t rdmult = av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex, tuning); if (is_stat_consumption_stage && !use_fixed_qp_offsets && (frame_type != KEY_FRAME)) { // Layer depth adjustment rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7; // ARF boost adjustment rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); } return rdmult > 0 ? 
(int)AOMMIN(rdmult, INT_MAX) : 1; } int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) { assert(beta > 0.0); int q = av1_dc_quant_QTX(qindex, 0, bit_depth); int newq = (int)rint(q / sqrt(beta)); int orig_qindex = qindex; if (newq == q) { return 0; } if (newq < q) { while (qindex > 0) { qindex--; q = av1_dc_quant_QTX(qindex, 0, bit_depth); if (newq >= q) { break; } } } else { while (qindex < MAXQ) { qindex++; q = av1_dc_quant_QTX(qindex, 0, bit_depth); if (newq <= q) { break; } } } return qindex - orig_qindex; } int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, int curr_qindex) { curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res); const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1; const int deltaq_deadzone = delta_q_res / 4; const int qmask = ~(delta_q_res - 1); int abs_deltaq_index = abs(curr_qindex - prev_qindex); abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask; int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index; adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1); return adjust_qindex; } #if !CONFIG_REALTIME_ONLY int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { assert(beta > 0.0); const AV1_COMMON *cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int qindex_rdmult = cm->quant_params.base_qindex; return (int)(av1_compute_rd_mult( qindex_rdmult, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning) / beta); } #endif // !CONFIG_REALTIME_ONLY static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { double q; switch (bit_depth) { case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break; case AOM_BITS_10: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0; break; case AOM_BITS_12: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0; break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } // TODO(debargha): Adjust the function below. return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); } void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) { switch (cpi->common.seq_params->bit_depth) { case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break; case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break; case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break; default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); } } static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd, int use_nonrd_pick_mode) { int i, bsize, segment_id; THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 }; int num_modes_count = use_nonrd_pick_mode ? 
0 : MAX_MODES; if (use_nonrd_pick_mode) { for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) { const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; if (ref != INTRA_FRAME) { for (i = 0; i < RTC_INTER_MODES; i++) mode_indices[num_modes_count++] = mode_idx[ref][mode_offset(inter_mode_list[i])]; } else { for (i = 0; i < RTC_INTRA_MODES; i++) mode_indices[num_modes_count++] = mode_idx[ref][mode_offset(intra_mode_list[i])]; } } } for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { const int qindex = clamp( av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + cm->quant_params.y_dc_delta_q, 0, MAXQ); const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth); for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { // Threshold here seems unnecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[]. const int t = q * rd_thresh_block_size_factor[bsize]; const int thresh_max = INT_MAX / t; for (i = 0; i < num_modes_count; ++i) { const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i; rd->threshes[segment_id][bsize][mode_index] = rd->thresh_mult[mode_index] < thresh_max ? rd->thresh_mult[mode_index] * t / 4 : INT_MAX; } } } } void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, const int num_planes) { const int nplanes = AOMMIN(num_planes, PLANE_TYPES); for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { for (int plane = 0; plane < nplanes; ++plane) { LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane]; for (int ctx = 0; ctx < 2; ++ctx) { aom_cdf_prob *pcdf; switch (eob_multi_size) { case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; case 6: default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; } av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); } } } for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { for (int plane = 0; plane < nplanes; ++plane) { LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane]; for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], fc->txb_skip_cdf[tx_size][ctx], NULL); for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], fc->coeff_base_eob_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->base_cost[ctx], fc->coeff_base_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { pcost->base_cost[ctx][4] = 0; pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + av1_cost_literal(1) - pcost->base_cost[ctx][0]; pcost->base_cost[ctx][6] = pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; pcost->base_cost[ctx][7] = pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; } for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], fc->eob_extra_cdf[tx_size][plane][ctx], NULL); for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], fc->dc_sign_cdf[plane][ctx], NULL); for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { int br_rate[BR_CDF_SIZE]; int prev_cost = 0; int i, j; av1_cost_tokens_from_cdf( br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx], NULL); // 
printf("br_rate: "); // for(j = 0; j < BR_CDF_SIZE; j++) // printf("%4d ", br_rate[j]); // printf("\n"); for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { for (j = 0; j < BR_CDF_SIZE - 1; j++) { pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; } prev_cost += br_rate[j]; } pcost->lps_cost[ctx][i] = prev_cost; // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); // for (i = 0; i <= COEFF_BASE_RANGE; i++) // printf("%5d ", pcost->lps_cost[ctx][i]); // printf("\n"); } for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = pcost->lps_cost[ctx][0]; for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; } } } } } void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, MvCosts *mv_costs) { // Avoid accessing 'mv_costs' when it is not allocated. if (mv_costs == NULL) return; mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX]; mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX]; mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX]; mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX]; if (integer_mv) { mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost; av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, nmvc, MV_SUBPEL_NONE); } else { mv_costs->mv_cost_stack = usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost; av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, nmvc, usehp); } } void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) { dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX]; dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX]; av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc, MV_SUBPEL_NONE); } // Populates speed features based on codec control settings (of type // COST_UPDATE_TYPE) and expected speed feature settings (of type // INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update. // The populated/updated speed features are used for cost updates in the // encoder. // WARNING: Population of unified cost update frequency needs to be taken care // accordingly, in case of any modifications/additions to the enum // COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE. static inline void populate_unified_cost_update_freq( const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) { INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; // Mapping of entropy cost update frequency from the encoder's codec control // settings of type COST_UPDATE_TYPE to speed features of type // INTERNAL_COST_UPDATE_TYPE. static const INTERNAL_COST_UPDATE_TYPE map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = { INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE, INTERNAL_COST_UPD_OFF }; inter_sf->mv_cost_upd_level = AOMMIN(inter_sf->mv_cost_upd_level, map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]); inter_sf->coeff_cost_upd_level = AOMMIN(inter_sf->coeff_cost_upd_level, map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]); inter_sf->mode_cost_upd_level = AOMMIN(inter_sf->mode_cost_upd_level, map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]); sf->intra_sf.dv_cost_upd_level = AOMMIN(sf->intra_sf.dv_cost_upd_level, map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]); } // Checks if entropy costs should be initialized/updated at frame level or not. 
static inline int is_frame_level_cost_upd_freq_set( const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level, const int use_nonrd_pick_mode, const int frames_since_key) { const int fill_costs = frame_is_intra_only(cm) || (use_nonrd_pick_mode ? frames_since_key < 2 : (cm->current_frame.frame_number & 0x07) == 1); return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) || cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs); } // Decide whether we want to update the mode entropy cost for the current frame. // The logit is currently inherited from selective_disable_cdf_rtc. static inline int should_force_mode_cost_update(const AV1_COMP *cpi) { const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; if (!rt_sf->frame_level_mode_cost_update) { return false; } if (cpi->oxcf.algo_cfg.cdf_update_mode == 2) { return cpi->frames_since_last_update == 1; } else if (cpi->oxcf.algo_cfg.cdf_update_mode == 1) { if (cpi->svc.number_spatial_layers == 1 && cpi->svc.number_temporal_layers == 1) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) || rc->high_source_sad || rc->frames_since_key < 10 || cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 || cm->current_frame.frame_number % 8 == 0; } else if (cpi->svc.number_temporal_layers > 1) { return cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1; } } return false; } void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; SPEED_FEATURES *const sf = &cpi->sf; RD_OPT *const rd = &cpi->rd; int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; int frames_since_key = cpi->rc.frames_since_key; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int qindex_rdmult = cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q; rd->RDMULT = av1_compute_rd_mult( qindex_rdmult, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); #if CONFIG_RD_COMMAND if (cpi->oxcf.pass == 2) { const RD_COMMAND *rd_command = &cpi->rd_command; if (rd_command->option_ls[rd_command->frame_index] == RD_OPTION_SET_Q_RDMULT) { rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index]; } } #endif // CONFIG_RD_COMMAND av1_set_error_per_bit(&x->errorperbit, rd->RDMULT); set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode); populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf); const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; // Frame level mv cost update if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level, use_nonrd_pick_mode, frames_since_key)) av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, cm->features.allow_high_precision_mv, x->mv_costs); // Frame level coefficient cost update if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level, use_nonrd_pick_mode, frames_since_key)) av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm)); // Frame level mode cost update if (should_force_mode_cost_update(cpi) || is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level, 
use_nonrd_pick_mode, frames_since_key)) av1_fill_mode_rates(cm, &x->mode_costs, cm->fc); // Frame level dv cost update if (av1_need_dv_costs(cpi)) { if (cpi->td.dv_costs_alloc == NULL) { CHECK_MEM_ERROR( cm, cpi->td.dv_costs_alloc, (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc))); cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc; } av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs); } } static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { // NOTE: The tables below must be of the same size. // The functions described below are sampled at the four most significant // bits of x^2 + 8 / 256. // Normalized rate: // This table models the rate for a Laplacian source with given variance // when quantized with a uniform quantizer with given stepsize. The // closed form expression is: // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), // and H(x) is the binary entropy function. static const int rate_tab_q10[] = { 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0, }; // Normalized distortion: // This table models the normalized distortion for a Laplacian source // with given variance when quantized with a uniform quantizer // with given stepsize. The closed form expression is: // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) // where x = qpstep / sqrt(variance). // Note the actual distortion is Dn * variance. 
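// The table is in Q10, so entries run from 0 (Dn = 0) up to 1024 (Dn = 1.0).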
static const int dist_tab_q10[] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, }; static const int xsq_iq_q10[] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 72, 80, 88, 96, 112, 128, 144, 160, 176, 192, 208, 224, 256, 288, 320, 352, 384, 416, 448, 480, 544, 608, 672, 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728, }; const int tmp = (xsq_q10 >> 2) + 8; const int k = get_msb(tmp) - 3; const int xq = (k << 3) + ((tmp >> k) & 0x7); const int one_q10 = 1 << 10; const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); const int b_q10 = one_q10 - a_q10; *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist) { // This function models the rate and distortion for a Laplacian // source with given variance when quantized with a uniform quantizer // with given stepsize. The closed form expressions are in: // Hang and Chen, "Source Model for transform video coder and its // application - Part I: Fundamental Theory", IEEE Trans. Circ. // Sys. for Video Tech., April 1997. 
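// xsq_q10 below is x^2 in Q10, with x = qstep / sqrt(var / n) and n = 2^n_log2 samples; it is clamped to MAX_XSQ_Q10 (just below the last xsq_iq_q10 entry) so the interpolation in model_rd_norm() stays within the tables.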
if (var == 0) { *rate = 0; *dist = 0; } else { int d_q10, r_q10; static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); model_rd_norm(xsq_q10, &r_q10, &d_q10); *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); *dist = (var * (int64_t)d_q10 + 512) >> 10; } } static double interp_cubic(const double *p, double x) { return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); } /* static double interp_bicubic(const double *p, int p_stride, double x, double y) { double q[4]; q[0] = interp_cubic(p, x); q[1] = interp_cubic(p + p_stride, x); q[2] = interp_cubic(p + 2 * p_stride, x); q[3] = interp_cubic(p + 3 * p_stride, x); return interp_cubic(q, y); } */ static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3 }; static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { return (sse_norm > 16.0); } static const double interp_rgrid_curv[4][65] = { { 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 118.257702, 120.210658, 121.434853, 122.100487, 122.377758, 122.436865, 72.290102, 96.974289, 101.652727, 126.830141, 140.417377, 157.644879, 184.315291, 215.823873, 262.300169, 335.919859, 420.624173, 519.185032, 619.854243, 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609, 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052, 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680, 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011, 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827, 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773, 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000, }, { 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 13.087244, 15.919735, 25.930313, 24.412411, 28.567417, 29.924194, 30.857010, 32.742979, 36.382570, 39.210386, 42.265690, 47.378572, 57.014850, 82.740067, 137.346562, 219.968084, 316.781856, 415.643773, 516.706538, 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528, 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641, 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309, 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824, 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694, 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660, 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000, }, { 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.656893, 5.123633, 5.594132, 6.162376, 6.918433, 7.768444, 8.739415, 10.105862, 11.477328, 13.236604, 15.421030, 19.093623, 25.801871, 46.724612, 98.841054, 181.113466, 272.586364, 359.499769, 445.546343, 525.944439, 605.188743, 681.793483, 756.668359, 838.486885, 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992, 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771, 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872, 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216, 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436, 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000, }, { 0.000000, 0.000000, 0.000000, 
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.337370, 0.391916, 0.468839, 0.566334, 0.762564, 1.069225, 1.384361, 1.787581, 2.293948, 3.251909, 4.412991, 8.050068, 11.606073, 27.668092, 65.227758, 128.463938, 202.097653, 262.715851, 312.464873, 355.601398, 400.609054, 447.201352, 495.761568, 552.871938, 619.067625, 691.984883, 773.753288, 860.628503, 946.262808, 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987, 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823, 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119, 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754, 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000, }, }; static const double interp_dgrid_curv[3][65] = { { 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387, 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, }, { 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, }, }; void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, double *rate_f, double *distbysse_f) { const double x_start = -15.5; const double x_end = 16.5; const double x_step = 0.5; const double epsilon = 1e-6; const int rcat = bsize_curvfit_model_cat_lookup[bsize]; const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); (void)x_end; xqr = AOMMAX(xqr, x_start + x_step + epsilon); xqr = AOMMIN(xqr, x_end - x_step - epsilon); const double x = (xqr - x_start) / x_step; const int xi = (int)floor(x); const double xo = x - xi; assert(xi > 0); const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; *rate_f = interp_cubic(prate, xo); const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; *distbysse_f = interp_cubic(pdist, xo); } static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { const int num_4x4_w = mi_size_wide[plane_bsize]; const int num_4x4_h = mi_size_high[plane_bsize]; const ENTROPY_CONTEXT *const above = pd->above_entropy_context; const ENTROPY_CONTEXT *const left = pd->left_entropy_context; memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); } void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], 
ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { assert(plane_bsize < BLOCK_SIZES_ALL); get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); } // Special clamping used in the encoder when calculating a prediction // // Logically, all pixel fetches used for prediction are clamped against the // edges of the frame. But doing this directly is slow, so instead we allocate // a finite border around the frame and fill it with copies of the outermost // pixels. // // Since this border is finite, we need to clamp the motion vector before // prediction in order to avoid out-of-bounds reads. At the same time, this // clamp must not change the prediction result. // // We can balance both of these concerns by calculating how far we would have // to go in each direction before the extended prediction region (the current // block + AOM_INTERP_EXTEND many pixels around the block) would be mapped // so that it touches the frame only at one row or column. This is a special // point because any more extreme MV will always lead to the same prediction. // So it is safe to clamp at that point. // // In the worst case, this requires a border of // max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels // around the frame edges. static inline void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, MV *mv) { int bw = xd->width << MI_SIZE_LOG2; int bh = xd->height << MI_SIZE_LOG2; int px_to_left_edge = xd->mi_col << MI_SIZE_LOG2; int px_to_right_edge = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2; int px_to_top_edge = xd->mi_row << MI_SIZE_LOG2; int px_to_bottom_edge = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2; const SubpelMvLimits mv_limits = { .col_min = -GET_MV_SUBPEL(px_to_left_edge + bw + AOM_INTERP_EXTEND), .col_max = GET_MV_SUBPEL(px_to_right_edge + AOM_INTERP_EXTEND), .row_min = -GET_MV_SUBPEL(px_to_top_edge + bh + AOM_INTERP_EXTEND), .row_max = GET_MV_SUBPEL(px_to_bottom_edge + AOM_INTERP_EXTEND) }; clamp_mv(mv, &mv_limits); } void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; const int_mv ref_mv = av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext); const int_mv ref_mv1 = av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext); MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; int num_mv_refs = 0; pred_mv[num_mv_refs++] = ref_mv.as_mv; if (ref_mv.as_int != ref_mv1.as_int) { pred_mv[num_mv_refs++] = ref_mv1.as_mv; } assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); const uint8_t *const src_y_ptr = x->plane[0].src.buf; int zero_seen = 0; int best_sad = INT_MAX; int max_mv = 0; // Get the sad for each candidate reference mv. for (int i = 0; i < num_mv_refs; ++i) { MV *this_mv = &pred_mv[i]; enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv); const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); if (fp_row == 0 && fp_col == 0 && zero_seen) continue; zero_seen |= (fp_row == 0 && fp_col == 0); const uint8_t *const ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col]; // Find sad for current vector. const int this_sad = cpi->ppi->fn_ptr[block_size].sdf( src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); // Note if it is the best so far. 
if (this_sad < best_sad) { best_sad = this_sad; } if (i == 0) x->pred_mv0_sad[ref_frame] = this_sad; else if (i == 1) x->pred_mv1_sad[ref_frame] = this_sad; } // Note the index of the mv that worked best in the reference list. x->max_mv_context[ref_frame] = max_mv; x->pred_mv_sad[ref_frame] = best_sad; } void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, const struct scale_factors *scale, const struct scale_factors *scale_uv, const int num_planes) { dst[0].buf = src->y_buffer; dst[0].stride = src->y_stride; dst[1].buf = src->u_buffer; dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; for (int i = 0; i < num_planes; ++i) { setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf, i ? src->uv_crop_width : src->y_crop_width, i ? src->uv_crop_height : src->y_crop_height, dst[i].stride, mi_row, mi_col, i ? scale_uv : scale, xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); } } YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, int ref_frame) { assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; const RefCntBuffer *const ref_buf = get_ref_frame_buf(&cpi->common, ref_frame); return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf : NULL; } int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, InterpFilter interp_filter, int dual_filter) { if (interp_filter == SWITCHABLE) { const MB_MODE_INFO *const mbmi = xd->mi[0]; int inter_filter_cost = 0; for (int dir = 0; dir < 2; ++dir) { if (dir && !dual_filter) break; const int ctx = av1_get_pred_context_switchable_interp(xd, dir); const InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter]; } return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; } else { return 0; } } void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { RD_OPT *const rd = &cpi->rd; // Set baseline threshold values. 
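// These baseline multipliers are later scaled by a per-qindex factor and rd_thresh_block_size_factor[] in set_block_thresholds() to produce rd->threshes[].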
av1_zero(rd->thresh_mult); rd->thresh_mult[THR_NEARESTMV] = 300; rd->thresh_mult[THR_NEARESTL2] = 300; rd->thresh_mult[THR_NEARESTL3] = 300; rd->thresh_mult[THR_NEARESTB] = 300; rd->thresh_mult[THR_NEARESTA2] = 300; rd->thresh_mult[THR_NEARESTA] = 300; rd->thresh_mult[THR_NEARESTG] = 300; rd->thresh_mult[THR_NEWMV] = 1000; rd->thresh_mult[THR_NEWL2] = 1000; rd->thresh_mult[THR_NEWL3] = 1000; rd->thresh_mult[THR_NEWB] = 1000; rd->thresh_mult[THR_NEWA2] = 1100; rd->thresh_mult[THR_NEWA] = 1000; rd->thresh_mult[THR_NEWG] = 1000; rd->thresh_mult[THR_NEARMV] = 1000; rd->thresh_mult[THR_NEARL2] = 1000; rd->thresh_mult[THR_NEARL3] = 1000; rd->thresh_mult[THR_NEARB] = 1000; rd->thresh_mult[THR_NEARA2] = 1000; rd->thresh_mult[THR_NEARA] = 1000; rd->thresh_mult[THR_NEARG] = 1000; rd->thresh_mult[THR_GLOBALMV] = 2200; rd->thresh_mult[THR_GLOBALL2] = 2000; rd->thresh_mult[THR_GLOBALL3] = 2000; rd->thresh_mult[THR_GLOBALB] = 2400; rd->thresh_mult[THR_GLOBALA2] = 2000; rd->thresh_mult[THR_GLOBALG] = 2000; rd->thresh_mult[THR_GLOBALA] = 2400; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000; rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530; rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870; rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750; rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870; rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000; rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320; rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040; rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250; rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360; rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250; 
rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870; rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800; rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500; rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500; rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750; rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000; rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640; rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200; rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200; rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800; rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200; rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200; rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200; rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760; rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400; rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000; rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760; rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640; rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200; rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000; rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200; rd->thresh_mult[THR_COMP_NEW_NEARBA] 
= 1980; rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200; rd->thresh_mult[THR_DC] = 1000; rd->thresh_mult[THR_PAETH] = 1000; rd->thresh_mult[THR_SMOOTH] = 2200; rd->thresh_mult[THR_SMOOTH_V] = 2000; rd->thresh_mult[THR_SMOOTH_H] = 2000; rd->thresh_mult[THR_H_PRED] = 2000; rd->thresh_mult[THR_V_PRED] = 1800; rd->thresh_mult[THR_D135_PRED] = 2500; rd->thresh_mult[THR_D203_PRED] = 2000; rd->thresh_mult[THR_D157_PRED] = 2500; rd->thresh_mult[THR_D67_PRED] = 2000; rd->thresh_mult[THR_D113_PRED] = 2500; rd->thresh_mult[THR_D45_PRED] = 2500; } static inline void update_thr_fact(int (*factor_buf)[MAX_MODES], THR_MODES best_mode_index, THR_MODES mode_start, THR_MODES mode_end, BLOCK_SIZE min_size, BLOCK_SIZE max_size, int max_rd_thresh_factor) { for (THR_MODES mode = mode_start; mode < mode_end; ++mode) { for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) { int *const fact = &factor_buf[bs][mode]; if (mode == best_mode_index) { *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR); } else { *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor); } } } } void av1_update_rd_thresh_fact( const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES], int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index, THR_MODES inter_mode_start, THR_MODES inter_mode_end, THR_MODES intra_mode_start, THR_MODES intra_mode_end) { assert(use_adaptive_rd_thresh > 0); const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size; BLOCK_SIZE min_size, max_size; if (bsize_is_1_to_4) { // This part handles block sizes with 1:4 and 4:1 aspect ratios // TODO(any): Experiment with threshold update for parent/child blocks min_size = bsize; max_size = bsize; } else { min_size = AOMMAX(bsize - 2, BLOCK_4X4); max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size); } update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end, min_size, max_size, max_rd_thresh_factor); update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end, min_size, max_size, max_rd_thresh_factor); } int av1_get_intra_cost_penalty(int qindex, int qdelta, aom_bit_depth_t bit_depth) { const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth); switch (bit_depth) { case AOM_BITS_8: return 20 * q; case AOM_BITS_10: return 5 * q; case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); default: assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); return -1; } } aom-3.12.1/av1/encoder/rd.h000066400000000000000000000333451477627663500152750ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_RD_H_ #define AOM_AV1_ENCODER_RD_H_ #include <limits.h> #include "aom/aomcx.h" #include "av1/common/blockd.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/cost.h" #include "av1/encoder/ratectrl.h" #include "config/aom_config.h" #ifdef __cplusplus extern "C" { #endif #define RDDIV_BITS 7 #define RD_EPB_SHIFT 6 #define RDCOST(RM, R, D) \ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \ ((D) * (1 << RDDIV_BITS))) #define RDCOST_NEG_R(RM, R, D) \ (((D) * (1 << RDDIV_BITS)) - \ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT)) #define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD) \ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS))) #define QIDX_SKIP_THRESH 115 #define MV_COST_WEIGHT 108 #define MV_COST_WEIGHT_SUB 120 // The fractional part of rd_thresh factor is stored with 5 bits. The maximum // factor that we allow is two, which is stored as 2 ** (5+1) = 64 #define RD_THRESH_FAC_FRAC_BITS (5) #define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS)) #define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1) #define RD_THRESH_LOG_DEC_FACTOR (4) #define RD_THRESH_INC (1) // Factor to weigh the rate for switchable interp filters. #define SWITCHABLE_INTERP_RATE_FACTOR 1 // Macros for common video resolutions: width x height // For example, 720p represents video resolution of 1280x720 pixels. #define RESOLUTION_288P 352 * 288 #define RESOLUTION_360P 640 * 360 #define RESOLUTION_480P 640 * 480 #define RESOLUTION_720P 1280 * 720 #define RESOLUTION_1080P 1920 * 1080 #define RESOLUTION_1440P 2560 * 1440 #define RESOLUTION_4K 3840 * 2160 #define RTC_REFS 4 static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = { { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, { INTRA_FRAME, NONE_FRAME } }; static inline int mode_offset(const PREDICTION_MODE mode) { if (mode >= NEARESTMV) { return INTER_OFFSET(mode); } else { switch (mode) { case DC_PRED: return 0; case V_PRED: return 1; case H_PRED: return 2; case SMOOTH_PRED: return 3; default: assert(0); return -1; } } } enum { // Default initialization when we are not using winner mode framework. e.g. // intrabc DEFAULT_EVAL = 0, // Initialization for selecting winner mode MODE_EVAL, // Initialization for winner mode evaluation WINNER_MODE_EVAL, // All mode evaluation types MODE_EVAL_TYPES, } UENUM1BYTE(MODE_EVAL_TYPE); typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number // is used in combination with the current block size, and thresh_freq_fact // to pick a threshold.
int thresh_mult[MAX_MODES]; int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; int RDMULT; double r0; } RD_OPT; static inline void av1_init_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; #endif rd_stats->rate = 0; rd_stats->dist = 0; rd_stats->rdcost = 0; rd_stats->sse = 0; rd_stats->skip_txfm = 1; rd_stats->zero_rate = 0; #if CONFIG_RD_DEBUG // This may run into problems when monochrome video is // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; } #endif } static inline void av1_invalid_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; #endif rd_stats->rate = INT_MAX; rd_stats->dist = INT64_MAX; rd_stats->rdcost = INT64_MAX; rd_stats->sse = INT64_MAX; rd_stats->skip_txfm = 0; rd_stats->zero_rate = 0; #if CONFIG_RD_DEBUG // This may run into problems when monochrome video is // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; } #endif } static inline void av1_merge_rd_stats(RD_STATS *rd_stats_dst, const RD_STATS *rd_stats_src) { if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { // If rd_stats_dst or rd_stats_src has invalid rate, we will make // rd_stats_dst invalid. av1_invalid_rd_stats(rd_stats_dst); return; } rd_stats_dst->rate = (int)AOMMIN( ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); if (!rd_stats_dst->zero_rate) rd_stats_dst->zero_rate = rd_stats_src->zero_rate; rd_stats_dst->dist += rd_stats_src->dist; if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) { rd_stats_dst->sse += rd_stats_src->sse; } rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm; #if CONFIG_RD_DEBUG // This may run into problems when monochrome video is // encoded, as there will only be 1 plane for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; } #endif } static inline void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, int rate, int skip_txfm, int64_t sse, int zero_rate) { assert(rd_stats->rate != INT_MAX && rate != INT_MAX); rd_stats->rate += rate; if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate; rd_stats->dist += dist; rd_stats->skip_txfm &= skip_txfm; rd_stats->sse += sse; } static inline int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { assert(mult >= 0); if (rate >= 0) { return RDCOST(mult, rate, dist); } return RDCOST_NEG_R(mult, -rate, dist); } static inline void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && rd_cost->rdcost < INT64_MAX) { rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); } else { av1_invalid_rd_stats(rd_cost); } } static inline void av1_rd_stats_subtraction(int mult, const RD_STATS *const left, const RD_STATS *const right, RD_STATS *result) { if (left->rate == INT_MAX || right->rate == INT_MAX || left->dist == INT64_MAX || right->dist == INT64_MAX || left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) { av1_invalid_rd_stats(result); } else { result->rate = left->rate - right->rate; result->dist = left->dist - right->dist; result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist); } } struct TileInfo; struct TileDataEnc; struct AV1_COMP; struct macroblock; /*!\brief Compute rdmult based on q index and frame update type * * \param[in] bit_depth bit depth * \param[in] update_type frame update type * \param[in] qindex q 
index * \param[in] tuning visual tuning metric * * \return rdmult */ int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, FRAME_UPDATE_TYPE update_type, int qindex, aom_tune_metric tuning); int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, const FRAME_UPDATE_TYPE update_type, const int layer_depth, const int boost_index, const FRAME_TYPE frame_type, const int use_fixed_qp_offsets, const int is_stat_consumption_stage, const aom_tune_metric tuning); void av1_initialize_rd_consts(struct AV1_COMP *cpi); // Sets the multiplier to convert mv cost to l1 error during motion search. void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit, int qindex); void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, double *rate_f, double *distbysse_f); int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, InterpFilter interp_filter, int dual_filter); YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, int ref_frame); void av1_init_me_luts(void); void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int (*fact)[MAX_MODES], int rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index, THR_MODES inter_mode_start, THR_MODES inter_mode_end, THR_MODES intra_mode_start, THR_MODES intra_mode_end); static inline void reset_thresh_freq_fact(MACROBLOCK *const x) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { for (int j = 0; j < MAX_MODES; ++j) { x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; } } } static inline int rd_less_than_thresh(int64_t best_rd, int64_t thresh, int thresh_fact) { return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX; } void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size); // Sets the multiplier to convert mv cost to l2 error during motion search. 
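/*
 * Roughly, errorperbit below is just rdmult >> RD_EPB_SHIFT clamped to at
 * least 1, so with an illustrative rdmult of 256 the motion search charges
 * 256 >> 6 = 4 units of squared error per bit of MV rate; av1_set_sad_per_bit()
 * declared above plays the same role for the SAD-based (L1) search stages.
 */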
static inline void av1_set_error_per_bit(int *errorperbit, int rdmult) { *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1); } // Get the threshold for R-D optimization of coefficients depending upon mode // decision/winner mode processing static inline void get_rd_opt_coeff_thresh( const uint32_t (*const coeff_opt_threshold)[2], TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt, int is_winner_mode) { if (!enable_winner_mode_for_coeff_opt) { // Default initialization of threshold txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0]; txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1]; return; } // TODO(any): Experiment with coeff_opt_dist_threshold values when // enable_winner_mode_for_coeff_opt is ON // TODO(any): Skip the winner mode processing for blocks with lower residual // energy as R-D optimization of coefficients would have been enabled during // mode decision // Use conservative threshold during mode decision and perform R-D // optimization of coeffs always for winner modes if (is_winner_mode) { txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[WINNER_MODE_EVAL][0]; txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[WINNER_MODE_EVAL][1]; } else { txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0]; txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1]; } } // Used to reset the state of mb rd hash information static inline void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) { if (!mb_rd_record) return; // Reset the state for use_mb_rd_hash mb_rd_record->num = mb_rd_record->index_start = 0; } void av1_setup_pred_block(const MACROBLOCKD *xd, struct buf_2d dst[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, const struct scale_factors *scale, const struct scale_factors *scale_uv, const int num_planes); int av1_get_intra_cost_penalty(int qindex, int qdelta, aom_bit_depth_t bit_depth); void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs, FRAME_CONTEXT *fc); #if !CONFIG_REALTIME_ONLY void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc); #endif void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, const int num_planes); void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, MvCosts *mv_costs); void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs); #if !CONFIG_REALTIME_ONLY int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); #endif int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta); /*!\brief Adjust current superblock's q_index based on delta q resolution * * \param[in] delta_q_res delta q resolution * \param[in] prev_qindex previous superblock's q index * \param[in] curr_qindex current superblock's q index * * \return the current superblock's adjusted q_index */ int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, int curr_qindex); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RD_H_ aom-3.12.1/av1/encoder/rdopt.c000066400000000000000000010154131477627663500160100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "av1/common/av1_common_int.h" #include "av1/common/cfl.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/common_data.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/idct.h" #include "av1/common/mvref_common.h" #include "av1/common/obmc.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" #include "av1/common/txb_common.h" #include "av1/common/warped_motion.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/cost.h" #include "av1/encoder/compound_type.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/interp_search.h" #include "av1/encoder/intra_mode_search.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/ml.h" #include "av1/encoder/mode_prune_model_weights.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/palette.h" #include "av1/encoder/pustats.h" #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/tx_search.h" #include "av1/encoder/var_based_part.h" #define LAST_NEW_MV_INDEX 6 // Mode_threshold multiplication factor table for prune_inter_modes_if_skippable // The values are kept in Q12 format and equation used to derive is // (2.5 - ((float)x->qindex / MAXQ) * 1.5) #define MODE_THRESH_QBITS 12 static const int mode_threshold_mul_factor[QINDEX_RANGE] = { 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999, 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734, 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469, 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204, 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939, 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674, 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409, 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144, 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879, 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614, 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349, 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084, 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819, 6795, 6770, 6746, 6722, 6698, 6674, 6650, 6626, 6602, 6578, 6554, 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289, 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024, 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758, 5734, 5710, 5686, 5662, 5638, 5614, 5590, 
5566, 5542, 5518, 5493, 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228, 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963, 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698, 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433, 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168, 4144, 4120, 4096 }; static const THR_MODES av1_default_mode_order[MAX_MODES] = { THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, THR_NEARESTG, THR_NEWMV, THR_NEWL2, THR_NEWL3, THR_NEWB, THR_NEWA2, THR_NEWA, THR_NEWG, THR_NEARMV, THR_NEARL2, THR_NEARL3, THR_NEARB, THR_NEARA2, THR_NEARA, THR_NEARG, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, THR_GLOBALG, THR_COMP_NEAREST_NEARESTLA, THR_COMP_NEAREST_NEARESTL2A, THR_COMP_NEAREST_NEARESTL3A, THR_COMP_NEAREST_NEARESTGA, THR_COMP_NEAREST_NEARESTLB, THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL3B, THR_COMP_NEAREST_NEARESTGB, THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTBA, THR_COMP_NEAR_NEARLB, THR_COMP_NEW_NEWLB, THR_COMP_NEW_NEARESTLB, THR_COMP_NEAREST_NEWLB, THR_COMP_NEW_NEARLB, THR_COMP_NEAR_NEWLB, THR_COMP_GLOBAL_GLOBALLB, THR_COMP_NEAR_NEARLA, THR_COMP_NEW_NEWLA, THR_COMP_NEW_NEARESTLA, THR_COMP_NEAREST_NEWLA, THR_COMP_NEW_NEARLA, THR_COMP_NEAR_NEWLA, THR_COMP_GLOBAL_GLOBALLA, THR_COMP_NEAR_NEARL2A, THR_COMP_NEW_NEWL2A, THR_COMP_NEW_NEARESTL2A, THR_COMP_NEAREST_NEWL2A, THR_COMP_NEW_NEARL2A, THR_COMP_NEAR_NEWL2A, THR_COMP_GLOBAL_GLOBALL2A, THR_COMP_NEAR_NEARL3A, THR_COMP_NEW_NEWL3A, THR_COMP_NEW_NEARESTL3A, THR_COMP_NEAREST_NEWL3A, THR_COMP_NEW_NEARL3A, THR_COMP_NEAR_NEWL3A, THR_COMP_GLOBAL_GLOBALL3A, THR_COMP_NEAR_NEARGA, THR_COMP_NEW_NEWGA, THR_COMP_NEW_NEARESTGA, THR_COMP_NEAREST_NEWGA, THR_COMP_NEW_NEARGA, THR_COMP_NEAR_NEWGA, THR_COMP_GLOBAL_GLOBALGA, THR_COMP_NEAR_NEARL2B, THR_COMP_NEW_NEWL2B, THR_COMP_NEW_NEARESTL2B, THR_COMP_NEAREST_NEWL2B, THR_COMP_NEW_NEARL2B, THR_COMP_NEAR_NEWL2B, THR_COMP_GLOBAL_GLOBALL2B, THR_COMP_NEAR_NEARL3B, THR_COMP_NEW_NEWL3B, THR_COMP_NEW_NEARESTL3B, THR_COMP_NEAREST_NEWL3B, THR_COMP_NEW_NEARL3B, THR_COMP_NEAR_NEWL3B, THR_COMP_GLOBAL_GLOBALL3B, THR_COMP_NEAR_NEARGB, THR_COMP_NEW_NEWGB, THR_COMP_NEW_NEARESTGB, THR_COMP_NEAREST_NEWGB, THR_COMP_NEW_NEARGB, THR_COMP_NEAR_NEWGB, THR_COMP_GLOBAL_GLOBALGB, THR_COMP_NEAR_NEARLA2, THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEARESTLA2, THR_COMP_NEAREST_NEWLA2, THR_COMP_NEW_NEARLA2, THR_COMP_NEAR_NEWLA2, THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_NEAR_NEARL2A2, THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEW_NEARL2A2, THR_COMP_NEAR_NEWL2A2, THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_NEAR_NEARL3A2, THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEW_NEARL3A2, THR_COMP_NEAR_NEWL3A2, THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_NEAR_NEARGA2, THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEARESTGA2, THR_COMP_NEAREST_NEWGA2, THR_COMP_NEW_NEARGA2, THR_COMP_NEAR_NEWGA2, THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_NEAR_NEARLL2, THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEAREST_NEWLL2, THR_COMP_NEW_NEARLL2, THR_COMP_NEAR_NEWLL2, THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_NEAR_NEARLL3, THR_COMP_NEW_NEWLL3, THR_COMP_NEW_NEARESTLL3, THR_COMP_NEAREST_NEWLL3, THR_COMP_NEW_NEARLL3, THR_COMP_NEAR_NEWLL3, THR_COMP_GLOBAL_GLOBALLL3, 
THR_COMP_NEAR_NEARLG, THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEARESTLG, THR_COMP_NEAREST_NEWLG, THR_COMP_NEW_NEARLG, THR_COMP_NEAR_NEWLG, THR_COMP_GLOBAL_GLOBALLG, THR_COMP_NEAR_NEARBA, THR_COMP_NEW_NEWBA, THR_COMP_NEW_NEARESTBA, THR_COMP_NEAREST_NEWBA, THR_COMP_NEW_NEARBA, THR_COMP_NEAR_NEWBA, THR_COMP_GLOBAL_GLOBALBA, THR_DC, THR_PAETH, THR_SMOOTH, THR_SMOOTH_V, THR_SMOOTH_H, THR_H_PRED, THR_V_PRED, THR_D135_PRED, THR_D203_PRED, THR_D157_PRED, THR_D67_PRED, THR_D113_PRED, THR_D45_PRED, }; /*!\cond */ typedef struct SingleInterModeState { int64_t rd; MV_REFERENCE_FRAME ref_frame; int valid; } SingleInterModeState; typedef struct InterModeSearchState { int64_t best_rd; int64_t best_skip_rd[2]; MB_MODE_INFO best_mbmode; int best_rate_y; int best_rate_uv; int best_mode_skippable; int best_skip2; THR_MODES best_mode_index; int num_available_refs; int64_t dist_refs[REF_FRAMES]; int dist_order_refs[REF_FRAMES]; int64_t mode_threshold[MAX_MODES]; int64_t best_intra_rd; unsigned int best_pred_sse; /*! * \brief Keep track of best intra rd for use in compound mode. */ int64_t best_pred_rd[REFERENCE_MODES]; // Save a set of single_newmv for each checked ref_mv. int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES]; int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES]; int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES]; int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; // The rd of simple translation in single inter modes int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; int64_t best_single_rd[REF_FRAMES]; PREDICTION_MODE best_single_mode[REF_FRAMES]; // Single search results by [directions][modes][reference frames] SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] [FWD_REFS]; int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; IntraModeSearchState intra_search_state; RD_STATS best_y_rdcost; } InterModeSearchState; /*!\endcond */ void av1_inter_mode_data_init(TileDataEnc *tile_data) { for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; md->ready = 0; md->num = 0; md->dist_sum = 0; md->ld_sum = 0; md->sse_sum = 0; md->sse_sse_sum = 0; md->sse_ld_sum = 0; } } static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, int64_t sse, int *est_residue_cost, int64_t *est_dist) { const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (md->ready) { if (sse < md->dist_mean) { *est_residue_cost = 0; *est_dist = sse; } else { *est_dist = (int64_t)round(md->dist_mean); const double est_ld = md->a * sse + md->b; // Clamp estimated rate cost by INT_MAX / 2. // TODO(angiebird@google.com): find better solution than clamping. 
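/*
 * Sketch of the estimate being inverted here (illustrative numbers): the
 * per-block-size model fitted in av1_inter_mode_data_fit() predicts
 * ld ~= a * sse + b, where ld is the distortion removed per unit of residue
 * rate observed on earlier blocks. Assuming dist_mean = 1000, sse = 5000 and
 * est_ld = 2.0, the code below would return est_dist = 1000 and
 * est_residue_cost ~= (5000 - 1000) / 2.0 = 2000.
 */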
if (fabs(est_ld) < 1e-2) { *est_residue_cost = INT_MAX / 2; } else { double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); if (est_residue_cost_dbl < 0) { *est_residue_cost = 0; } else { *est_residue_cost = (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); } } if (*est_residue_cost <= 0) { *est_residue_cost = 0; *est_dist = sse; } } return 1; } return 0; } void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { const int block_idx = inter_mode_data_block_idx(bsize); InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (block_idx == -1) continue; if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { continue; } else { if (md->ready == 0) { md->dist_mean = md->dist_sum / md->num; md->ld_mean = md->ld_sum / md->num; md->sse_mean = md->sse_sum / md->num; md->sse_sse_mean = md->sse_sse_sum / md->num; md->sse_ld_mean = md->sse_ld_sum / md->num; } else { const double factor = 3; md->dist_mean = (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); md->ld_mean = (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); md->sse_mean = (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); md->sse_sse_mean = (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / (factor + 1); md->sse_ld_mean = (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / (factor + 1); } const double my = md->ld_mean; const double mx = md->sse_mean; const double dx = sqrt(md->sse_sse_mean); const double dxy = md->sse_ld_mean; md->a = (dxy - mx * my) / (dx * dx - mx * mx); md->b = my - md->a * mx; md->ready = 1; md->num = 0; md->dist_sum = 0; md->ld_sum = 0; md->sse_sum = 0; md->sse_sse_sum = 0; md->sse_ld_sum = 0; } (void)rdmult; } } static inline void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, int64_t sse, int64_t dist, int residue_cost) { if (residue_cost == 0 || sse == dist) return; const int block_idx = inter_mode_data_block_idx(bsize); if (block_idx == -1) return; InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { const double ld = (sse - dist) * 1. / residue_cost; ++rd_model->num; rd_model->dist_sum += dist; rd_model->ld_sum += ld; rd_model->sse_sum += sse; rd_model->sse_sse_sum += (double)sse * (double)sse; rd_model->sse_ld_sum += sse * ld; } } static inline void inter_modes_info_push(InterModesInfo *inter_modes_info, int mode_rate, int64_t sse, int64_t rd, RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv, const MB_MODE_INFO *mbmi) { const int num = inter_modes_info->num; assert(num < MAX_INTER_MODES); inter_modes_info->mbmi_arr[num] = *mbmi; inter_modes_info->mode_rate_arr[num] = mode_rate; inter_modes_info->sse_arr[num] = sse; inter_modes_info->est_rd_arr[num] = rd; inter_modes_info->rd_cost_arr[num] = *rd_cost; inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; ++inter_modes_info->num; } static int compare_rd_idx_pair(const void *a, const void *b) { if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { // To avoid inconsistency in qsort() ordering when two elements are equal, // using idx as tie breaker. 
Refer aomedia:2928 if (((RdIdxPair *)a)->idx == ((RdIdxPair *)b)->idx) return 0; else if (((RdIdxPair *)a)->idx > ((RdIdxPair *)b)->idx) return 1; else return -1; } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { return 1; } else { return -1; } } static inline void inter_modes_info_sort(const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { if (inter_modes_info->num == 0) { return; } for (int i = 0; i < inter_modes_info->num; ++i) { rd_idx_pair_arr[i].idx = i; rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; } qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), compare_rd_idx_pair); } // Similar to get_horver_correlation, but also takes into account first // row/column, when computing horizontal/vertical correlation. void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, int width, int height, float *hcorr, float *vcorr) { // The following notation is used: // x - current pixel // y - left neighbor pixel // z - top neighbor pixel int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0; int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0; int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0; // First, process horizontal correlation on just the first row x_sum += diff[0]; x2_sum += diff[0] * diff[0]; x_firstrow += diff[0]; x2_firstrow += diff[0] * diff[0]; for (int j = 1; j < width; ++j) { const int16_t x = diff[j]; const int16_t y = diff[j - 1]; x_sum += x; x_firstrow += x; x2_sum += x * x; x2_firstrow += x * x; xy_sum += x * y; } // Process vertical correlation in the first column x_firstcol += diff[0]; x2_firstcol += diff[0] * diff[0]; for (int i = 1; i < height; ++i) { const int16_t x = diff[i * stride]; const int16_t z = diff[(i - 1) * stride]; x_sum += x; x_firstcol += x; x2_sum += x * x; x2_firstcol += x * x; xz_sum += x * z; } // Now process horiz and vert correlation through the rest unit for (int i = 1; i < height; ++i) { for (int j = 1; j < width; ++j) { const int16_t x = diff[i * stride + j]; const int16_t y = diff[i * stride + j - 1]; const int16_t z = diff[(i - 1) * stride + j]; x_sum += x; x2_sum += x * x; xy_sum += x * y; xz_sum += x * z; } } for (int j = 0; j < width; ++j) { x_finalrow += diff[(height - 1) * stride + j]; x2_finalrow += diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j]; } for (int i = 0; i < height; ++i) { x_finalcol += diff[i * stride + width - 1]; x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1]; } int64_t xhor_sum = x_sum - x_finalcol; int64_t xver_sum = x_sum - x_finalrow; int64_t y_sum = x_sum - x_firstcol; int64_t z_sum = x_sum - x_firstrow; int64_t x2hor_sum = x2_sum - x2_finalcol; int64_t x2ver_sum = x2_sum - x2_finalrow; int64_t y2_sum = x2_sum - x2_firstcol; int64_t z2_sum = x2_sum - x2_firstrow; const float num_hor = (float)(height * (width - 1)); const float num_ver = (float)((height - 1) * width); const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; if (xhor_var_n > 0 && y_var_n > 0) { *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); *hcorr = *hcorr < 0 ? 
0 : *hcorr; } else { *hcorr = 1.0; } if (xver_var_n > 0 && z_var_n > 0) { *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); *vcorr = *vcorr < 0 ? 0 : *vcorr; } else { *vcorr = 1.0; } } static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, int64_t *sse_y) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; int64_t total_sse = 0; for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); unsigned int sse; cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); total_sse += sse; if (!plane && sse_y) *sse_y = sse; } total_sse <<= 4; return total_sse; } int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; int64_t error = 0, sqcoeff = 0; for (i = 0; i < block_size; i++) { const int diff = coeff[i] - dqcoeff[i]; error += diff * diff; sqcoeff += coeff[i] * coeff[i]; } *ssz = sqcoeff; return error; } int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size) { int64_t error = 0; for (int i = 0; i < block_size; i++) { const int diff = coeff[i] - dqcoeff[i]; error += diff * diff; } return error; } #if CONFIG_AV1_HIGHBITDEPTH int64_t av1_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { int i; int64_t error = 0, sqcoeff = 0; int shift = 2 * (bd - 8); int rounding = (1 << shift) >> 1; for (i = 0; i < block_size; i++) { const int64_t diff = coeff[i] - dqcoeff[i]; error += diff * diff; sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; } error = (error + rounding) >> shift; sqcoeff = (sqcoeff + rounding) >> shift; *ssz = sqcoeff; return error; } #endif static int conditional_skipintra(PREDICTION_MODE mode, PREDICTION_MODE best_intra_mode) { if (mode == D113_PRED && best_intra_mode != V_PRED && best_intra_mode != D135_PRED) return 1; if (mode == D67_PRED && best_intra_mode != V_PRED && best_intra_mode != D45_PRED) return 1; if (mode == D203_PRED && best_intra_mode != H_PRED && best_intra_mode != D45_PRED) return 1; if (mode == D157_PRED && best_intra_mode != H_PRED && best_intra_mode != D135_PRED) return 1; return 0; } static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, int16_t mode_context) { if (is_inter_compound_mode(mode)) { return mode_costs ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; } int mode_cost = 0; int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; assert(is_inter_mode(mode)); if (mode == NEWMV) { mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; return mode_cost; } else { mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; if (mode == GLOBALMV) { mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; return mode_cost; } else { mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; return mode_cost; } } } static inline PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, int ref_idx) { return ref_idx ? 
compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode); } static inline void estimate_ref_frame_costs( const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs, int segment_id, unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES]) { int seg_ref_active = segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); int ref_frame; for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) memset(ref_costs_comp[ref_frame], 0, REF_FRAMES * sizeof((*ref_costs_comp)[0])); } else { int intra_inter_ctx = av1_get_intra_inter_context(xd); ref_costs_single[INTRA_FRAME] = mode_costs->intra_inter_cost[intra_inter_ctx][0]; unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1]; for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) ref_costs_single[i] = base_cost; const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); // Determine cost of a single ref frame, where frame types are represented // by a tree: // Level 0: add cost whether this ref is a forward or backward ref ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; ref_costs_single[ALTREF2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; // Level 1: if this ref is forward ref, // add cost whether it is last/last2 or last3/golden ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; // Level 1: if this ref is backward ref // then add cost whether this ref is altref or backward ref ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0]; ref_costs_single[ALTREF2_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0]; ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1]; // Level 2: further add cost whether this ref is last or last2 ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0]; ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1]; // Level 2: last3 or golden ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0]; ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1]; // Level 2: bwdref or altref2 ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0]; ref_costs_single[ALTREF2_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][1]; if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { // Similar to single ref, determine cost of compound ref frames. 
// cost_compound_refs = cost_first_ref + cost_second_ref const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1]; ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; ref_bicomp_costs[ALTREF_FRAME] = 0; // cost of first ref frame ref_bicomp_costs[LAST_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; ref_bicomp_costs[LAST2_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; ref_bicomp_costs[LAST3_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; ref_bicomp_costs[GOLDEN_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; ref_bicomp_costs[LAST_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0]; ref_bicomp_costs[LAST2_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1]; ref_bicomp_costs[LAST3_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0]; ref_bicomp_costs[GOLDEN_FRAME] += mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1]; // cost of second ref frame ref_bicomp_costs[BWDREF_FRAME] += mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; ref_bicomp_costs[ALTREF2_FRAME] += mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; ref_bicomp_costs[ALTREF_FRAME] += mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; ref_bicomp_costs[BWDREF_FRAME] += mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; ref_bicomp_costs[ALTREF2_FRAME] += mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; // cost: if one ref frame is forward ref, the other ref is backward ref int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { ref_costs_comp[ref0][ref1] = ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; } } // cost: if both ref frames are the same side. 
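/*
 * Note on the same-side (unidirectional compound) cases filled in below: AV1
 * only permits the pairs {LAST, LAST2}, {LAST, LAST3}, {LAST, GOLDEN} and
 * {BWDREF, ALTREF}, so only those four entries are populated, each as
 * base_cost plus the uni-compound branch of comp_ref_type_cost plus a short
 * prefix of uni_comp_ref_cost bits.
 */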
const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); ref_costs_comp[LAST_FRAME][LAST2_FRAME] = base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; ref_costs_comp[LAST_FRAME][LAST3_FRAME] = base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; } else { int ref0, ref1; for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) ref_costs_comp[ref0][ref1] = 512; } ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; } } } static inline void store_coding_context( #if CONFIG_INTERNAL_STATS MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, #else MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, #endif // CONFIG_INTERNAL_STATS int skippable) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm; ctx->skippable = skippable; #if CONFIG_INTERNAL_STATS ctx->best_mode_index = mode_index; #endif // CONFIG_INTERNAL_STATS ctx->mic = *xd->mi[0]; av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, av1_ref_frame_type(xd->mi[0]->ref_frame)); } static inline void setup_buffer_ref_mvs_inter( const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); const YV12_BUFFER_CONFIG *scaled_ref_frame = av1_get_scaled_ref_frame(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const struct scale_factors *const sf = get_ref_scale_factors_const(cm, ref_frame); const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); assert(yv12 != NULL); if (scaled_ref_frame) { // Setup pred block based on scaled reference, because av1_mv_pred() doesn't // support scaling. av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL, num_planes); } else { av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); } // Gets an initial list of candidate vectors from neighbours and orders them av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the center point for subsequent searches. // The current implementation doesn't support scaling. av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, ref_frame, block_size); // Go back to unscaled reference. if (scaled_ref_frame) { // We had temporarily setup pred block based on scaled reference above. Go // back to unscaled reference now, for subsequent use. av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); } } #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) // TODO(jingning): this mv clamping function should be block size dependent. static inline void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, xd->mb_to_top_edge - LEFT_TOP_MARGIN, xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN }; clamp_mv(mv, &mv_limits); } /* If the current mode shares the same mv with other modes with higher cost, * skip this mode. */ static int skip_repeated_mv(const AV1_COMMON *const cm, const MACROBLOCK *const x, PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frames[2], InterModeSearchState *search_state) { const int is_comp_pred = ref_frames[1] > INTRA_FRAME; const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; PREDICTION_MODE compare_mode = MB_MODE_COUNT; if (!is_comp_pred) { if (this_mode == NEARMV) { if (ref_mv_count == 0) { // NEARMV has the same motion vector as NEARESTMV compare_mode = NEARESTMV; } if (ref_mv_count == 1 && cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { // NEARMV has the same motion vector as GLOBALMV compare_mode = GLOBALMV; } } if (this_mode == GLOBALMV) { if (ref_mv_count == 0 && cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { // GLOBALMV has the same motion vector as NEARESTMV compare_mode = NEARESTMV; } if (ref_mv_count == 1) { // GLOBALMV has the same motion vector as NEARMV compare_mode = NEARMV; } } if (compare_mode != MB_MODE_COUNT) { // Use modelled_rd to check whether compare mode was searched if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != INT64_MAX) { const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); const int compare_cost = cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx); const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx); // Only skip if the mode cost is larger than compare mode cost if (this_cost > compare_cost) { search_state->modelled_rd[this_mode][0][ref_frames[0]] = search_state->modelled_rd[compare_mode][0][ref_frames[0]]; return 1; } } } } return 0; } static inline int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; *out_mv = in_mv; lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv, cm->features.cur_frame_force_integer_mv); clamp_mv2(&out_mv->as_mv, xd); return av1_is_fullmv_in_range(&x->mv_limits, get_fullmv_from_mv(&out_mv->as_mv)); } // To use single newmv directly for compound modes, need to clamp the mv to the // valid mv range. 
Without this, encoder would generate out of range mv, and // this is seen in 8k encoding. static inline void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, int ref_idx) { const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); SubpelMvLimits mv_limits; av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv); clamp_mv(&mv->as_mv, &mv_limits); } static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, int_mv *cur_mv, int *const rate_mv, HandleInterModeArgs *const args, inter_mode_info *mode_info) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; const int ref_mv_idx = mbmi->ref_mv_idx; if (is_comp_pred) { const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]]; const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]]; if (this_mode == NEW_NEWMV) { if (valid_mv0) { cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; clamp_mv_in_range(x, &cur_mv[0], 0); } if (valid_mv1) { cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; clamp_mv_in_range(x, &cur_mv[1], 1); } *rate_mv = 0; for (int i = 0; i < 2; ++i) { const int_mv ref_mv = av1_get_ref_mv(x, i); *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); } } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { if (valid_mv1) { cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; clamp_mv_in_range(x, &cur_mv[1], 1); } const int_mv ref_mv = av1_get_ref_mv(x, 1); *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); } else { assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); if (valid_mv0) { cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; clamp_mv_in_range(x, &cur_mv[0], 0); } const int_mv ref_mv = av1_get_ref_mv(x, 0); *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); } } else { // Single ref case. const int ref_idx = 0; int search_range = INT_MAX; if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) { const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; int min_mv_diff = INT_MAX; int best_match = -1; MV prev_ref_mv[2] = { { 0 } }; for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) { prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, idx, &x->mbmi_ext) .as_mv; const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row), abs(ref_mv.col - prev_ref_mv[idx].col)); if (min_mv_diff > ref_mv_diff) { min_mv_diff = ref_mv_diff; best_match = idx; } } if (min_mv_diff < (16 << 3)) { if (args->single_newmv_valid[best_match][refs[0]]) { search_range = min_mv_diff; search_range += AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row - prev_ref_mv[best_match].row), abs(args->single_newmv[best_match][refs[0]].as_mv.col - prev_ref_mv[best_match].col)); // Get full pixel search range. 
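/*
 * MV distances above are in 1/8-pel units, so the (x + 4) >> 3 below rounds
 * the accumulated range to the nearest full pel for the restricted search
 * (the earlier (16 << 3) check likewise means "within 16 full pels").
 */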
search_range = (search_range + 4) >> 3; } } } int_mv best_mv; av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range, mode_info, &best_mv, args); if (best_mv.as_int == INVALID_MV) return INT64_MAX; args->single_newmv[ref_mv_idx][refs[0]] = best_mv; args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; cur_mv[0].as_int = best_mv.as_int; // Return after single_newmv is set. if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX; } return 0; } static inline void update_mode_start_end_index( const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi, int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed, int interintra_allowed, int eval_motion_mode) { *mode_index_start = (int)SIMPLE_TRANSLATION; *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed; if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { if (!eval_motion_mode) { *mode_index_end = (int)SIMPLE_TRANSLATION; } else { // Set the start index appropriately to process motion modes other than // simple translation *mode_index_start = 1; } } if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16) *mode_index_end = SIMPLE_TRANSLATION; } /*!\brief AV1 motion mode search * * \ingroup inter_mode_search * Function to search over and determine the motion mode. It will update * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or * WARPED_CAUSAL and determine any necessary side information for the selected * motion mode. It will also perform the full transform search, unless the * input parameter do_tx_search indicates to do an estimation of the RD rather * than an RD corresponding to a full transform search. It will return the * RD for the final motion_mode. * Do the RD search for a given inter mode and compute all information relevant * to the input mode. It will compute the best MV, * compound parameters (if the mode is a compound mode) and interpolation filter * parameters. * * \param[in] cpi Top-level encoder structure. * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during * encoding. * \param[in] x Pointer to struct holding all the data for * the current macroblock. * \param[in] bsize Current block size. * \param[in,out] rd_stats Struct to keep track of the overall RD * information. * \param[in,out] rd_stats_y Struct to keep track of the RD information * for only the Y plane. * \param[in,out] rd_stats_uv Struct to keep track of the RD information * for only the UV planes. * \param[in] args HandleInterModeArgs struct holding * miscellaneous arguments for inter mode * search. See the documentation for this * struct for a description of each member. * \param[in] ref_best_rd Best RD found so far for this block. * It is used for early termination of this * search if the RD exceeds this value. * \param[in,out] ref_skip_rd A length 2 array, where skip_rd[0] is the * best total RD for a skip mode so far, and * skip_rd[1] is the best RD for a skip mode so * far in luma. This is used as a speed feature * to skip the transform search if the computed * skip RD for the current mode is not better * than the best skip_rd so far. * \param[in,out] rate_mv The rate associated with the motion vectors. * This will be modified if a motion search is * done in the motion mode search. * \param[in,out] orig_dst A prediction buffer to hold a computed * prediction. This will eventually hold the * final prediction, and the tmp_dst info will * be copied here. 
* \param[in,out] best_est_rd Estimated RD for motion mode search if * do_tx_search (see below) is 0. * \param[in] do_tx_search Parameter to indicate whether or not to do * a full transform search. This will compute * an estimated RD for the modes without the * transform search and later perform the full * transform search on the best candidates. * \param[in] inter_modes_info InterModesInfo struct to hold inter mode * information to perform a full transform * search only on winning candidates searched * with an estimate for transform coding RD. * \param[in] eval_motion_mode Boolean whether or not to evaluate motion * motion modes other than SIMPLE_TRANSLATION. * \param[out] yrd Stores the rdcost corresponding to encoding * the luma plane. * \return Returns INT64_MAX if the determined motion mode is invalid and the * current motion mode being tested should be skipped. It returns 0 if the * motion mode search is a success. */ static int64_t motion_mode_rd( const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info, int eval_motion_mode, int64_t *yrd) { const AV1_COMMON *const cm = &cpi->common; const FeatureFlags *const features = &cm->features; TxfmSearchInfo *txfm_info = &x->txfm_search_info; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; const int rate2_nocoeff = rd_stats->rate; int best_xskip_txfm = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const int rate_mv0 = *rate_mv; const int interintra_allowed = cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; WARP_SAMPLE_INFO *const warp_sample_info = &x->warp_sample_info[mbmi->ref_frame[0]]; int *pts0 = warp_sample_info->pts; int *pts_inref0 = warp_sample_info->pts_inref; assert(mbmi->ref_frame[1] != INTRA_FRAME); const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; av1_invalid_rd_stats(&best_rd_stats); mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; *yrd = INT64_MAX; if (features->switchable_motion_mode) { // Determine which motion modes to search if more than SIMPLE_TRANSLATION // is allowed. last_motion_mode_allowed = motion_mode_allowed( xd->global_motion, xd, mbmi, features->allow_warped_motion); } if (last_motion_mode_allowed == WARPED_CAUSAL) { // Collect projection samples used in least squares approximation of // the warped motion parameters if WARPED_CAUSAL is going to be searched. if (warp_sample_info->num < 0) { warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0); } mbmi->num_proj_ref = warp_sample_info->num; } const int total_samples = mbmi->num_proj_ref; if (total_samples == 0) { // Do not search WARPED_CAUSAL if there are no samples to use to determine // warped parameters. last_motion_mode_allowed = OBMC_CAUSAL; } const MB_MODE_INFO base_mbmi = *mbmi; MB_MODE_INFO best_mbmi; const int interp_filter = features->interp_filter; const int switchable_rate = av1_is_interp_needed(xd) ? 
av1_get_switchable_rate(x, xd, interp_filter, cm->seq_params->enable_dual_filter) : 0; int64_t best_rd = INT64_MAX; int best_rate_mv = rate_mv0; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int mode_index_start, mode_index_end; const int txfm_rd_gate_level = get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound, cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_MOTION_MODE, eval_motion_mode); // Modify the start and end index according to speed features. For example, // if SIMPLE_TRANSLATION has already been searched according to // the motion_mode_for_winner_cand speed feature, update the mode_index_start // to avoid searching it again. update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end, last_motion_mode_allowed, interintra_allowed, eval_motion_mode); // Main function loop. This loops over all of the possible motion modes and // computes RD to determine the best one. This process includes computing // any necessary side information for the motion mode and performing the // transform search. for (int mode_index = mode_index_start; mode_index <= mode_index_end; mode_index++) { if (args->skip_motion_mode && mode_index) continue; int tmp_rate2 = rate2_nocoeff; const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; int tmp_rate_mv = rate_mv0; *mbmi = base_mbmi; if (is_interintra_mode) { // Only use SIMPLE_TRANSLATION for interintra mbmi->motion_mode = SIMPLE_TRANSLATION; } else { mbmi->motion_mode = (MOTION_MODE)mode_index; assert(mbmi->ref_frame[1] != INTRA_FRAME); } // Do not search OBMC if the probability of selecting it is below a // predetermined threshold for this update_type and block size. const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int use_actual_frame_probs = 1; int prune_obmc; #if CONFIG_FPMT_TEST use_actual_frame_probs = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; if (!use_actual_frame_probs) { prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < cpi->sf.inter_sf.prune_obmc_prob_thresh; } #endif if (use_actual_frame_probs) { prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < cpi->sf.inter_sf.prune_obmc_prob_thresh; } if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) && mbmi->motion_mode == OBMC_CAUSAL) continue; if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { // SIMPLE_TRANSLATION mode: no need to recalculate. 
// The prediction is calculated before motion_mode_rd() is called in // handle_inter_mode() } else if (mbmi->motion_mode == OBMC_CAUSAL) { const uint32_t cur_mv = mbmi->mv[0].as_int; // OBMC_CAUSAL not allowed for compound prediction assert(!is_comp_pred); if (have_newmv_in_inter_mode(this_mode)) { av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, &mbmi->mv[0], NULL); tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { // Build the predictor according to the current motion vector if it has // not already been built av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0, av1_num_planes(cm) - 1); } // Build the inter predictor by blending the predictor corresponding to // this MV, and the neighboring blocks using the OBMC model av1_build_obmc_inter_prediction( cm, xd, args->above_pred_buf, args->above_pred_stride, args->left_pred_buf, args->left_pred_stride); #if !CONFIG_REALTIME_ONLY } else if (mbmi->motion_mode == WARPED_CAUSAL) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; mbmi->motion_mode = WARPED_CAUSAL; mbmi->wm_params.wmtype = DEFAULT_WMTYPE; mbmi->interp_filters = av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); // Select the samples according to motion vector difference if (mbmi->num_proj_ref > 1) { mbmi->num_proj_ref = av1_selectSamples( &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); } // Compute the warped motion parameters with a least squares fit // using the collected samples if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, &mbmi->wm_params, mi_row, mi_col)) { assert(!is_comp_pred); if (have_newmv_in_inter_mode(this_mode)) { // Refine MV for NEWMV mode const int_mv mv0 = mbmi->mv[0]; const WarpedMotionParams wm_params0 = mbmi->wm_params; const int num_proj_ref0 = mbmi->num_proj_ref; const int_mv ref_mv = av1_get_ref_mv(x, 0); SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, NULL); // Refine MV in a small range. av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, total_samples, cpi->sf.mv_sf.warp_search_method, cpi->sf.mv_sf.warp_search_iters); if (mv0.as_int != mbmi->mv[0].as_int) { // Keep the refined MV and WM parameters. tmp_rate_mv = av1_mv_bit_cost( &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } else { // Restore the old MV and WM parameters. 
mbmi->mv[0] = mv0; mbmi->wm_params = wm_params0; mbmi->num_proj_ref = num_proj_ref0; } } // Build the warped predictor av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); } else { continue; } #endif // !CONFIG_REALTIME_ONLY } else if (is_interintra_mode) { const int ret = av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd, &tmp_rate_mv, &tmp_rate2, orig_dst); if (ret < 0) continue; } // If we are searching newmv and the mv is the same as refmv, skip the // current mode if (!av1_check_newmv_joint_nonzero(cm, x)) continue; // Update rd_stats for the current motion mode txfm_info->skip_txfm = 0; rd_stats->dist = 0; rd_stats->sse = 0; rd_stats->skip_txfm = 1; rd_stats->rate = tmp_rate2; const ModeCosts *mode_costs = &x->mode_costs; if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; if (interintra_allowed) { rd_stats->rate += mode_costs->interintra_cost[size_group_lookup[bsize]] [mbmi->ref_frame[1] == INTRA_FRAME]; } if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && (mbmi->ref_frame[1] != INTRA_FRAME)) { if (last_motion_mode_allowed == WARPED_CAUSAL) { rd_stats->rate += mode_costs->motion_mode_cost[bsize][mbmi->motion_mode]; } else { rd_stats->rate += mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode]; } } int64_t this_yrd = INT64_MAX; if (!do_tx_search) { // Avoid doing a transform search here to speed up the overall mode // search. It will be done later in the mode search if the current // motion mode seems promising. int64_t curr_sse = -1; int64_t sse_y = -1; int est_residue_cost = 0; int64_t est_dist = 0; int64_t est_rd = 0; if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { curr_sse = get_sse(cpi, x, &sse_y); const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, &est_residue_cost, &est_dist); (void)has_est_rd; assert(has_est_rd); } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 || cpi->sf.rt_sf.use_nonrd_pick_mode) { model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist, NULL, &curr_sse, NULL, NULL, NULL); sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]]; } est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); if (est_rd * 0.80 > *best_est_rd) { mbmi->ref_frame[1] = ref_frame_1; continue; } const int mode_rate = rd_stats->rate; rd_stats->rate += est_residue_cost; rd_stats->dist = est_dist; rd_stats->rdcost = est_rd; if (rd_stats->rdcost < *best_est_rd) { *best_est_rd = rd_stats->rdcost; assert(sse_y >= 0); ref_skip_rd[1] = txfm_rd_gate_level ? 
RDCOST(x->rdmult, mode_rate, (sse_y << 4)) : INT64_MAX; } if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { if (!is_comp_pred) { assert(curr_sse >= 0); inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, rd_stats->rdcost, rd_stats, rd_stats_y, rd_stats_uv, mbmi); } } else { assert(curr_sse >= 0); inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, rd_stats->rdcost, rd_stats, rd_stats_y, rd_stats_uv, mbmi); } mbmi->skip_txfm = 0; } else { // Perform full transform search int64_t skip_rd = INT64_MAX; int64_t skip_rdy = INT64_MAX; if (txfm_rd_gate_level) { // Check if the mode is good enough based on skip RD int64_t sse_y = INT64_MAX; int64_t curr_sse = get_sse(cpi, x, &sse_y); skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse); skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4)); int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd, txfm_rd_gate_level, 0); if (!eval_txfm) continue; } // Do transform search const int mode_rate = rd_stats->rate; if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, rd_stats->rate, ref_best_rd)) { if (rd_stats_y->rate == INT_MAX && mode_index == 0) { return INT64_MAX; } continue; } const int skip_ctx = av1_get_skip_txfm_context(xd); const int y_rate = rd_stats->skip_txfm ? x->mode_costs.skip_txfm_cost[skip_ctx][1] : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]); this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist); const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (curr_rd < ref_best_rd) { ref_best_rd = curr_rd; ref_skip_rd[0] = skip_rd; ref_skip_rd[1] = skip_rdy; } if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { inter_mode_data_push( tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist, rd_stats_y->rate + rd_stats_uv->rate + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); } } if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { if (is_nontrans_global_motion(xd, xd->mi[0])) { mbmi->interp_filters = av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); } } const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (mode_index == 0) { args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; } if (mode_index == 0 || tmp_rd < best_rd) { // Update best_rd data if this is the best motion mode so far best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; best_rate_mv = tmp_rate_mv; *yrd = this_yrd; if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; memcpy(best_blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); best_xskip_txfm = mbmi->skip_txfm; } } // Update RD and mbmi stats for selected motion mode mbmi->ref_frame[1] = ref_frame_1; *rate_mv = best_rate_mv; if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) { av1_invalid_rd_stats(rd_stats); restore_dst_buf(xd, *orig_dst, num_planes); return INT64_MAX; } *mbmi = best_mbmi; *rd_stats = best_rd_stats; *rd_stats_y = best_rd_stats_y; if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; memcpy(txfm_info->blk_skip, best_blk_skip, sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); txfm_info->skip_txfm = best_xskip_txfm; restore_dst_buf(xd, *orig_dst, num_planes); return 0; } static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, MACROBLOCK *const x, 
BLOCK_SIZE bsize, const BUFFER_SET *const orig_dst, int64_t best_rd) { assert(bsize < BLOCK_SIZES_ALL); const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int64_t total_sse = 0; int64_t this_rd = INT64_MAX; const int skip_mode_ctx = av1_get_skip_mode_context(xd); rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1]; for (int plane = 0; plane < num_planes; ++plane) { // Call av1_enc_build_inter_predictor() for one plane at a time. av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, plane, plane); const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); av1_subtract_plane(x, plane_bsize, plane); int64_t sse = av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL); if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); sse <<= 4; total_sse += sse; // When current rd cost is more than the best rd, skip evaluation of // remaining planes. this_rd = RDCOST(x->rdmult, rd_stats->rate, total_sse); if (this_rd > best_rd) break; } rd_stats->dist = rd_stats->sse = total_sse; rd_stats->rdcost = this_rd; restore_dst_buf(xd, *orig_dst, num_planes); return 0; } // Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant // mode // Note(rachelbarker): This speed feature currently does not interact correctly // with global motion. The issue is that, when global motion is used, GLOBALMV // produces a different prediction to NEARESTMV/NEARMV even if the motion // vectors are the same. Thus GLOBALMV should not be pruned in this case. static inline int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, int ref_idx, const MV_REFERENCE_FRAME *ref_frame, PREDICTION_MODE single_mode) { const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; assert(single_mode != NEWMV); if (single_mode == NEARESTMV) { return 0; } else if (single_mode == NEARMV) { // when ref_mv_count = 0, NEARESTMV and NEARMV are same as GLOBALMV // when ref_mv_count = 1, NEARMV is same as GLOBALMV if (ref_mv_count < 2) return 1; } else if (single_mode == GLOBALMV) { // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV if (ref_mv_count == 0) return 1; // when ref_mv_count == 1, NEARMV is same as GLOBALMV else if (ref_mv_count == 1) return 0; int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count); // Check GLOBALMV is matching with any mv in ref_mv_stack for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) { int_mv this_mv; if (ref_idx == 0) this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; else this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int) return 1; } } return 0; } static inline int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, int ref_idx, int ref_mv_idx, int skip_repeated_ref_mv, const MV_REFERENCE_FRAME *ref_frame, const MB_MODE_INFO_EXT *mbmi_ext) { const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); assert(is_inter_singleref_mode(single_mode)); if (single_mode == NEWMV) { this_mv->as_int = INVALID_MV; } else if (single_mode == GLOBALMV) { if (skip_repeated_ref_mv && check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode)) return 0; *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; } else { 
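    // NEARESTMV or NEARMV: take the MV from the reference MV stack; fall back
    // to the global MV when the stack does not hold enough candidates.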
    assert(single_mode == NEARMV || single_mode == NEARESTMV);
    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
    const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
    if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
      assert(ref_mv_offset >= 0);
      if (ref_idx == 0) {
        *this_mv =
            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
      } else {
        *this_mv =
            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
      }
    } else {
      if (skip_repeated_ref_mv &&
          check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
        return 0;
      *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
    }
  }
  return 1;
}

// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv
// list population
static inline int skip_nearest_near_mv_using_refmv_weight(
    const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
    const int8_t ref_frame_type, PREDICTION_MODE best_mode) {
  if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
  // Do not skip the mode if the current block has not yet obtained a valid
  // inter mode.
  if (!is_inter_mode(best_mode)) return 0;

  const MACROBLOCKD *xd = &x->e_mbd;
  // Do not skip the mode if both the top and left neighboring blocks are not
  // available.
  if (!xd->left_available || !xd->up_available) return 0;
  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
  const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
  const int ref_mv_count =
      AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);

  if (ref_mv_count == 0) return 0;
  // If the ref mv list has at least one nearest candidate, do not prune
  // NEARESTMV.
  if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;

  // Count the number of ref mvs populated from nearest candidates.
  int nearest_refmv_count = 0;
  for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
    if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
  }

  // nearest_refmv_count indicates how close the block's motion
  // characteristics are to those of its spatial neighbors. A smaller value of
  // nearest_refmv_count w.r.t. ref_mv_count means less correlation with the
  // spatial neighbors, and hence a lower chance of NEARESTMV and NEARMV
  // becoming the best mode, since these modes work well for blocks that share
  // similar motion characteristics with their neighbors. Thus, NEARMV mode is
  // pruned when nearest_refmv_count is relatively smaller than ref_mv_count,
  // and NEARESTMV mode is pruned if none of the ref mvs are populated from
  // nearest candidates.
  const int prune_thresh = 1 + (ref_mv_count >= 2);
  if (nearest_refmv_count < prune_thresh) return 1;
  return 0;
}

// This function updates the non-new mv for the current prediction mode.
static inline int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
                               const AV1_COMMON *cm, const MACROBLOCK *x,
                               int skip_repeated_ref_mv) {
  const MACROBLOCKD *xd = &x->e_mbd;
  const MB_MODE_INFO *mbmi = xd->mi[0];
  const int is_comp_pred = has_second_ref(mbmi);

  int ret = 1;
  for (int i = 0; i < is_comp_pred + 1; ++i) {
    int_mv this_mv;
    this_mv.as_int = INVALID_MV;
    ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
                      skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
    if (!ret) return 0;
    const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
    if (single_mode == NEWMV) {
      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
      cur_mv[i] = (i == 0) ?
x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] .this_mv : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] .comp_mv; } else { ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); } } return ret; } static inline int get_drl_cost(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext, const int (*const drl_mode_cost0)[2], int8_t ref_frame_type) { int cost = 0; if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { for (int idx = 0; idx < 2; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; if (mbmi->ref_mv_idx == idx) return cost; } } return cost; } if (have_nearmv_in_inter_mode(mbmi->mode)) { for (int idx = 1; idx < 3; ++idx) { if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; if (mbmi->ref_mv_idx == (idx - 1)) return cost; } } return cost; } return cost; } static inline int is_single_newmv_valid(const HandleInterModeArgs *const args, const MB_MODE_INFO *const mbmi, PREDICTION_MODE this_mode) { for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; if (single_mode == NEWMV && args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { return 0; } } return 1; } static int get_drl_refmv_count(const MACROBLOCK *const x, const MV_REFERENCE_FRAME *ref_frame, PREDICTION_MODE mode) { const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); const int has_drl = (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); const int ref_set = has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1; return ref_set; } // Checks if particular ref_mv_idx should be pruned. static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes, const int qindex, const int ref_mv_idx) { if (reduce_inter_modes >= 3) return 1; // Q-index logic based pruning is enabled only for // reduce_inter_modes = 2. assert(reduce_inter_modes == 2); // When reduce_inter_modes=2, pruning happens as below based on q index. // For q index range between 0 and 85: prune if ref_mv_idx >= 1. // For q index range between 86 and 170: prune if ref_mv_idx == 2. // For q index range between 171 and 255: no pruning. const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1; return (ref_mv_idx >= min_prune_ref_mv_idx); } // Whether this reference motion vector can be skipped, based on initial // heuristics. 
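// Returns true when the given ref_mv_idx can be pruned without a full
// rate-distortion evaluation.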
static bool ref_mv_idx_early_breakout( const SPEED_FEATURES *const sf, const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x, const HandleInterModeArgs *const args, int64_t ref_best_rd, int ref_mv_idx) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); const int is_comp_pred = has_second_ref(mbmi); if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) { if (mbmi->ref_frame[0] == LAST2_FRAME || mbmi->ref_frame[0] == LAST3_FRAME || mbmi->ref_frame[1] == LAST2_FRAME || mbmi->ref_frame[1] == LAST3_FRAME) { const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < REF_CAT_LEVEL) { return true; } } // TODO(any): Experiment with reduce_inter_modes for compound prediction if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred && have_newmv_in_inter_mode(mbmi->mode)) { if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref && mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) { const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; const int do_prune = prune_ref_mv_idx_using_qindex( sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx); if (do_prune && (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < REF_CAT_LEVEL)) { return true; } } } } mbmi->ref_mv_idx = ref_mv_idx; if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) { return true; } size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost; const int drl_cost = get_drl_cost( mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type); est_rd_rate += drl_cost; if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd && mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { return true; } return false; } // Compute the estimated RD cost for the motion vector with simple translation. 
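// Only the luma predictor is built, and the rate/distortion is estimated with
// a model rather than a full transform search.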
static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, HandleInterModeArgs *args, int ref_mv_idx, int64_t ref_best_rd, BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); const AV1_COMMON *cm = &cpi->common; const int is_comp_pred = has_second_ref(mbmi); const ModeCosts *mode_costs = &x->mode_costs; struct macroblockd_plane *p = xd->plane; const BUFFER_SET orig_dst = { { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, }; av1_init_rd_stats(rd_stats); mbmi->interinter_comp.type = COMPOUND_AVERAGE; mbmi->comp_group_idx = 0; mbmi->compound_idx = 1; if (mbmi->ref_frame[1] == INTRA_FRAME) { mbmi->ref_frame[1] = NONE_FRAME; } int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = ref_mv_idx; rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; const int drl_cost = get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); rd_stats->rate += drl_cost; int_mv cur_mv[2]; if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { return INT64_MAX; } assert(have_nearmv_in_inter_mode(mbmi->mode)); for (int i = 0; i < is_comp_pred + 1; ++i) { mbmi->mv[i].as_int = cur_mv[i].as_int; } const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx); rd_stats->rate += ref_mv_cost; if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) { return INT64_MAX; } mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->num_proj_ref = 0; if (is_comp_pred) { // Only compound_average mbmi->interinter_comp.type = COMPOUND_AVERAGE; mbmi->comp_group_idx = 0; mbmi->compound_idx = 1; } set_default_interp_filters(mbmi, cm->features.interp_filter); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, AOM_PLANE_Y, AOM_PLANE_Y); int est_rate; int64_t est_dist; model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist, NULL, NULL, NULL, NULL, NULL); return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist); } // Represents a set of integers, from 0 to sizeof(int) * 8, as bits in // an integer. 0 for the i-th bit means that integer is excluded, 1 means // it is included. static inline void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } static inline bool mask_check_bit(int mask, int index) { return (mask >> index) & 0x1; } // Before performing the full MV search in handle_inter_mode, do a simple // translation search and see if we can eliminate any motion vectors. // Returns an integer where, if the i-th bit is set, it means that the i-th // motion vector should be searched. This is only set for NEAR_MV. static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, HandleInterModeArgs *const args, int64_t ref_best_rd, BLOCK_SIZE bsize, const int ref_set) { // If the number of ref mv count is equal to 1, do not prune the same. It // is better to evaluate the same than to prune it. if (ref_set == 1) return 1; AV1_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PREDICTION_MODE this_mode = mbmi->mode; // Only search indices if they have some chance of being good. 
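  // good_indices is a bit mask: bit i is set if ref_mv_idx i survives the
  // early breakout checks below.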
int good_indices = 0; for (int i = 0; i < ref_set; ++i) { if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args, ref_best_rd, i)) { continue; } mask_set_bit(&good_indices, i); } // Only prune in NEARMV mode, if the speed feature is set, and the block size // is large enough. If these conditions are not met, return all good indices // found so far. if (!cpi->sf.inter_sf.prune_mode_search_simple_translation) return good_indices; if (!have_nearmv_in_inter_mode(this_mode)) return good_indices; if (num_pels_log2_lookup[bsize] <= 6) return good_indices; // Do not prune when there is internal resizing. TODO(elliottk) fix this // so b/2384 can be resolved. if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) || (mbmi->ref_frame[1] > 0 && av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) { return good_indices; } // Calculate the RD cost for the motion vectors using simple translation. int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX }; for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { // If this index is bad, ignore it. if (!mask_check_bit(good_indices, ref_mv_idx)) { continue; } idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize); } // Find the index with the best RD cost. int best_idx = 0; for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) { if (idx_rdcost[i] < idx_rdcost[best_idx]) { best_idx = i; } } // Only include indices that are good and within a % of the best. const double dth = has_second_ref(mbmi) ? 1.05 : 1.001; // If the simple translation cost is not within this multiple of the // best RD, skip it. Note that the cutoff is derived experimentally. const double ref_dth = 5; int result = 0; for (int i = 0; i < ref_set; ++i) { if (mask_check_bit(good_indices, i) && (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth && (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) { mask_set_bit(&result, i); } } return result; } /*!\brief Motion mode information for inter mode search speedup. * * Used in a speed feature to search motion modes other than * SIMPLE_TRANSLATION only on winning candidates. */ typedef struct motion_mode_candidate { /*! * Mode info for the motion mode candidate. */ MB_MODE_INFO mbmi; /*! * Rate describing the cost of the motion vectors for this candidate. */ int rate_mv; /*! * Rate before motion mode search and transform coding is applied. */ int rate2_nocoeff; /*! * An integer value 0 or 1 which indicates whether or not to skip the motion * mode search and default to SIMPLE_TRANSLATION as a speed feature for this * candidate. */ int skip_motion_mode; /*! * Total RD cost for this candidate. 
*/ int64_t rd_cost; } motion_mode_candidate; /*!\cond */ typedef struct motion_mode_best_st_candidate { motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES]; int num_motion_mode_cand; } motion_mode_best_st_candidate; // Checks if the current reference frame matches with neighbouring block's // (top/left) reference frames static inline int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, MB_MODE_INFO *nb_mbmi) { MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], nb_mbmi->ref_frame[1] }; MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], cur_mbmi->ref_frame[1] }; const int is_cur_comp_pred = has_second_ref(cur_mbmi); int match_found = 0; for (int i = 0; i < (is_cur_comp_pred + 1); i++) { if ((cur_ref_frames[i] == nb_ref_frames[0]) || (cur_ref_frames[i] == nb_ref_frames[1])) match_found = 1; } return match_found; } static inline int find_ref_match_in_above_nbs(const int total_mi_cols, MACROBLOCKD *xd) { if (!xd->up_available) return 1; const int mi_col = xd->mi_col; MB_MODE_INFO **cur_mbmi = xd->mi; // prev_row_mi points into the mi array, starting at the beginning of the // previous row. MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols); uint8_t mi_step; for (int above_mi_col = mi_col; above_mi_col < end_col; above_mi_col += mi_step) { MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; mi_step = mi_size_wide[above_mi[0]->bsize]; int match_found = 0; if (is_inter_block(*above_mi)) match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi); if (match_found) return 1; } return 0; } static inline int find_ref_match_in_left_nbs(const int total_mi_rows, MACROBLOCKD *xd) { if (!xd->left_available) return 1; const int mi_row = xd->mi_row; MB_MODE_INFO **cur_mbmi = xd->mi; // prev_col_mi points into the mi array, starting at the top of the // previous column MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows); uint8_t mi_step; for (int left_mi_row = mi_row; left_mi_row < end_row; left_mi_row += mi_step) { MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; mi_step = mi_size_high[left_mi[0]->bsize]; int match_found = 0; if (is_inter_block(*left_mi)) match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi); if (match_found) return 1; } return 0; } /*!\endcond */ /*! \brief Struct used to hold TPL data to * narrow down parts of the inter mode search. */ typedef struct { /*! * The best inter cost out of all of the reference frames. */ int64_t best_inter_cost; /*! * The inter cost for each reference frame. 
*/ int64_t ref_inter_cost[INTER_REFS_PER_FRAME]; } PruneInfoFromTpl; #if !CONFIG_REALTIME_ONLY // TODO(Remya): Check if get_tpl_stats_b() can be reused static inline void get_block_level_tpl_stats( AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, PruneInfoFromTpl *inter_cost_info_from_tpl) { AV1_COMMON *const cm = &cpi->common; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int tpl_idx = cpi->gf_frame_index; TplParams *const tpl_data = &cpi->ppi->tpl_data; if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return; const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int tpl_stride = tpl_frame->stride; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_col_end_sr = coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int row_step = step; const int col_step_sr = coded_to_superres_mi(step, cm->superres_scale_denominator); for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows); row += row_step) { for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr); col += col_step_sr) { const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; // Sums up the inter cost of corresponding ref frames for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { inter_cost_info_from_tpl->ref_inter_cost[ref_idx] += this_stats->pred_error[ref_idx]; } } } // Computes the best inter cost (minimum inter_cost) int64_t best_inter_cost = INT64_MAX; for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { const int64_t cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[ref_idx]; // For invalid ref frames, cur_inter_cost = 0 and has to be handled while // calculating the minimum inter_cost if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) && valid_refs[ref_idx]) best_inter_cost = cur_inter_cost; } inter_cost_info_from_tpl->best_inter_cost = best_inter_cost; } #endif static inline int prune_modes_based_on_tpl_stats( PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, const PREDICTION_MODE this_mode, int prune_mode_level) { const int have_newmv = have_newmv_in_inter_mode(this_mode); if ((prune_mode_level < 2) && have_newmv) return 0; const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost; if (best_inter_cost == INT64_MAX) return 0; const int prune_level = prune_mode_level - 1; int64_t cur_inter_cost; const int is_globalmv = (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV); const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx; // Thresholds used for pruning: // Lower value indicates aggressive pruning and higher value indicates // conservative pruning which is set based on ref_mv_idx and speed feature. // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. 
prune_index // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = { { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 } }; const int is_comp_pred = (refs[1] > INTRA_FRAME); if (!is_comp_pred) { cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; } else { const int64_t inter_cost_ref0 = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; const int64_t inter_cost_ref1 = inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1]; // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for // more aggressive pruning cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1); } // Prune the mode if cur_inter_cost is greater than threshold times // best_inter_cost if (cur_inter_cost > ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] * best_inter_cost) >> 2)) return 1; return 0; } /*!\brief High level function to select parameters for compound mode. * * \ingroup inter_mode_search * The main search functionality is done in the call to av1_compound_type_rd(). * * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to struct holding all the data for * the current macroblock. * \param[in] args HandleInterModeArgs struct holding * miscellaneous arguments for inter mode * search. See the documentation for this * struct for a description of each member. * \param[in] ref_best_rd Best RD found so far for this block. * It is used for early termination of this * search if the RD exceeds this value. * \param[in,out] cur_mv Current motion vector. * \param[in] bsize Current block size. * \param[in,out] compmode_interinter_cost RD of the selected interinter compound mode. * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all * allocated buffers for the compound * predictors and masks in the compound type * search. * \param[in,out] orig_dst A prediction buffer to hold a computed * prediction. This will eventually hold the * final prediction, and the tmp_dst info will * be copied here. * \param[in] tmp_dst A temporary prediction buffer to hold a * computed prediction. * \param[in,out] rate_mv The rate associated with the motion vectors. * This will be modified if a motion search is * done in the motion mode search. * \param[in,out] rd_stats Struct to keep track of the overall RD * information. * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the * best total RD for a skip mode so far, and * skip_rd[1] is the best RD for a skip mode so * far in luma. This is used as a speed feature * to skip the transform search if the computed * skip RD for the current mode is not better * than the best skip_rd so far. * \param[in,out] skip_build_pred Indicates whether or not to build the inter * predictor. If this is 0, the inter predictor * has already been built and thus we can avoid * repeating computation. * \return Returns 1 if this mode is worse than one already seen and 0 if it is * a viable candidate. 
*/ static int process_compound_inter_mode( AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args, int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize, int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers, const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv, RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const AV1_COMMON *cm = &cpi->common; const int masked_compound_used = is_any_masked_compound_used(bsize) && cm->seq_params->enable_masked_compound; int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); const int num_planes = av1_num_planes(cm); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; int is_luma_interp_done = 0; set_default_interp_filters(mbmi, cm->features.interp_filter); int64_t best_rd_compound; int64_t rd_thresh; const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT; const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE; rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale); // Select compound type and any parameters related to that type // (for example, the mask parameters if it is a masked mode) and compute // the RD *compmode_interinter_cost = av1_compound_type_rd( cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used, orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh); if (ref_best_rd < INT64_MAX && (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale > ref_best_rd) { restore_dst_buf(xd, *orig_dst, num_planes); return 1; } // Build only uv predictor for COMPOUND_AVERAGE. // Note there is no need to call av1_enc_build_inter_predictor // for luma if COMPOUND_AVERAGE is selected because it is the first // candidate in av1_compound_type_rd, which means it used the dst_buf // rather than the tmp_buf. if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) { if (num_planes > 1) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, AOM_PLANE_U, num_planes - 1); } *skip_build_pred = 1; } return 0; } // Speed feature to prune out MVs that are similar to previous MVs if they // don't achieve the best RD advantage. static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, int_mv save_mv[MAX_REF_MV_SEARCH - 1][2], MB_MODE_INFO *mbmi, int pruning_factor) { int i; const int is_comp_pred = has_second_ref(mbmi); const int thr = (1 + is_comp_pred) << (pruning_factor + 1); // Skip the evaluation if an MV match is found. if (ref_mv_idx > 0) { for (int idx = 0; idx < ref_mv_idx; ++idx) { if (save_mv[idx][0].as_int == INVALID_MV) continue; int mv_diff = 0; for (i = 0; i < 1 + is_comp_pred; ++i) { mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) + abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col); } // If this mode is not the best one, and current MV is similar to // previous stored MV, terminate this ref_mv_idx evaluation. if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1; } } if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) { for (i = 0; i < is_comp_pred + 1; ++i) save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int; } return 0; } /*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE * * \ingroup inter_mode_search * * Compares the sse of zero mv and the best sse found in single new_mv. If the * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped. 
 * Else returns 0.
 *
 * Note that the sse here comes from single_motion_search. So it is
 * interpolated with the filter in motion search, not the actual interpolation
 * filter used in encoding.
 *
 * \param[in]     fn_ptr                  A table of function pointers to
 *                                        compute SSE.
 * \param[in]     x                       Pointer to struct holding all the
 *                                        data for the current macroblock.
 * \param[in]     bsize                   The current block_size.
 * \param[in]     args                    The args to handle_inter_mode, used
 *                                        to track the best SSE.
 * \param[in]     prune_zero_mv_with_sse  The value of the speed feature
 *                                        prune_zero_mv_with_sse.
 * \return Returns 1 if zero_mv is pruned, 0 otherwise.
 */
static inline int prune_zero_mv_with_sse(const aom_variance_fn_ptr_t *fn_ptr,
                                         const MACROBLOCK *x, BLOCK_SIZE bsize,
                                         const HandleInterModeArgs *args,
                                         int prune_zero_mv_with_sse) {
  const MACROBLOCKD *xd = &x->e_mbd;
  const MB_MODE_INFO *mbmi = xd->mi[0];

  const int is_comp_pred = has_second_ref(mbmi);
  const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;

  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
    if (xd->global_motion[refs[idx]].wmtype != IDENTITY) {
      // Pruning logic only works for IDENTITY type models
      // Note: In theory we could apply similar logic for TRANSLATION
      // type models, but we do not code these due to a spec bug
      // (see comments in gm_get_motion_vector() in av1/common/mv.h)
      assert(xd->global_motion[refs[idx]].wmtype != TRANSLATION);
      return 0;
    }

    // Don't prune if we have invalid data
    assert(mbmi->mv[idx].as_int == 0);
    if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
      return 0;
    }
  }

  // Sum up the sse of ZEROMV and best NEWMV
  unsigned int this_sse_sum = 0;
  unsigned int best_sse_sum = 0;
  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
    const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
    const struct macroblockd_plane *pd = xd->plane;
    const struct buf_2d *src_buf = &p->src;
    const struct buf_2d *ref_buf = &pd->pre[idx];
    const uint8_t *src = src_buf->buf;
    const uint8_t *ref = ref_buf->buf;
    const int src_stride = src_buf->stride;
    const int ref_stride = ref_buf->stride;
    unsigned int this_sse;
    fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
    this_sse_sum += this_sse;

    const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
    best_sse_sum += best_sse;
  }

  const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
  if ((double)this_sse_sum > (mul * (double)best_sse_sum)) {
    return 1;
  }
  return 0;
}

/*!\brief Searches for interpolation filter in realtime mode during winner eval
 *
 * \ingroup inter_mode_search
 *
 * Does a simple interpolation filter search during winner mode evaluation.
 * This is currently only used by realtime mode as \ref
 * av1_interpolation_filter_search is not called during realtime encoding.
 *
 * This function only searches over two possible filters. EIGHTTAP_REGULAR is
 * always searched. For lowres clips (<= 240p), MULTITAP_SHARP is also
 * searched. For higher res clips (> 240p), EIGHTTAP_SMOOTH is also searched.
 *
 * \param[in]     cpi          Pointer to the compressor. Used for feature
 *                             flags.
 * \param[in,out] x            Pointer to macroblock. This is primarily used to
 *                             access the buffers.
 * \param[in]     mi_row       The current row in mi unit (4X4 pixels).
 * \param[in]     mi_col       The current col in mi unit (4X4 pixels).
 * \param[in]     bsize        The current block_size.
 * \return Returns true if a predictor is built in xd->dst, false otherwise.
 */
static inline bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
                                      int mi_row, int mi_col,
                                      BLOCK_SIZE bsize) {
  static const InterpFilters filters_ref_set[3] = {
    { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
    { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
    { MULTITAP_SHARP, MULTITAP_SHARP }
  };

  const AV1_COMMON *const cm = &cpi->common;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mi = xd->mi[0];
  int64_t best_cost = INT64_MAX;
  int best_filter_index = -1;
  // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best.
  const int num_planes = av1_num_planes(cm);
  const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
  assert(is_inter_mode(mi->mode));
  assert(mi->motion_mode == SIMPLE_TRANSLATION);
  assert(!is_inter_compound_mode(mi->mode));

  if (!av1_is_interp_needed(xd)) {
    return false;
  }

  struct macroblockd_plane *pd = xd->plane;
  const BUFFER_SET orig_dst = {
    { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
    { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
  };
  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
  const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
                                 tmp_buf + 2 * MAX_SB_SQUARE },
                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
  const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };

  for (int i = 0; i < 3; ++i) {
    if (is_240p_or_lesser) {
      if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
        continue;
      }
    } else {
      if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
        continue;
      }
    }
    int64_t cost;
    RD_STATS tmp_rd = { 0 };

    mi->interp_filters.as_filters = filters_ref_set[i];
    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);

    model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
                       ? MODELRD_LEGACY
                       : MODELRD_TYPE_INTERP_FILTER](
        cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
        &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);

    tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
                                           cm->seq_params->enable_dual_filter);
    cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
    if (cost < best_cost) {
      best_filter_index = i;
      best_cost = cost;
      swap_dst_buf(xd, dst_bufs, num_planes);
    }
  }
  assert(best_filter_index >= 0);

  mi->interp_filters.as_filters = filters_ref_set[best_filter_index];

  const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];

  if (is_best_pred_in_orig) {
    swap_dst_buf(xd, dst_bufs, num_planes);
  } else {
    // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if
    // is_best_pred_in_orig is false, that means the current buffer is the
    // original one.
    assert(&orig_dst == dst_bufs[0]);
    assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]);
    const int width = block_size_wide[bsize];
    const int height = block_size_high[bsize];
#if CONFIG_AV1_HIGHBITDEPTH
    const bool is_hbd = is_cur_buf_hbd(xd);
    if (is_hbd) {
      aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]),
                               tmp_dst.stride[AOM_PLANE_Y],
                               CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]),
                               orig_dst.stride[AOM_PLANE_Y], width, height);
    } else {
      aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
                        orig_dst.plane[AOM_PLANE_Y],
                        orig_dst.stride[AOM_PLANE_Y], width, height);
    }
#else
    aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
                      orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y],
                      width, height);
#endif
  }

  // Build the YUV predictor.
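  // Luma has already been built with the selected filter above, so only the
  // chroma planes need to be generated here.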
if (num_planes > 1) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, AOM_PLANE_U, AOM_PLANE_V); } return true; } /*!\brief AV1 inter mode RD computation * * \ingroup inter_mode_search * Do the RD search for a given inter mode and compute all information relevant * to the input mode. It will compute the best MV, * compound parameters (if the mode is a compound mode) and interpolation filter * parameters. * * \param[in] cpi Top-level encoder structure. * \param[in] tile_data Pointer to struct holding adaptive * data/contexts/models for the tile during * encoding. * \param[in] x Pointer to structure holding all the data * for the current macroblock. * \param[in] bsize Current block size. * \param[in,out] rd_stats Struct to keep track of the overall RD * information. * \param[in,out] rd_stats_y Struct to keep track of the RD information * for only the Y plane. * \param[in,out] rd_stats_uv Struct to keep track of the RD information * for only the UV planes. * \param[in] args HandleInterModeArgs struct holding * miscellaneous arguments for inter mode * search. See the documentation for this * struct for a description of each member. * \param[in] ref_best_rd Best RD found so far for this block. * It is used for early termination of this * search if the RD exceeds this value. * \param[in] tmp_buf Temporary buffer used to hold predictors * built in this search. * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all * allocated buffers for the compound * predictors and masks in the compound type * search. * \param[in,out] best_est_rd Estimated RD for motion mode search if * do_tx_search (see below) is 0. * \param[in] do_tx_search Parameter to indicate whether or not to do * a full transform search. This will compute * an estimated RD for the modes without the * transform search and later perform the full * transform search on the best candidates. * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode * information to perform a full transform * search only on winning candidates searched * with an estimate for transform coding RD. * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store * motion mode information used in a speed * feature to search motion modes other than * SIMPLE_TRANSLATION only on winning * candidates. * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the * best total RD for a skip mode so far, and * skip_rd[1] is the best RD for a skip mode so * far in luma. This is used as a speed feature * to skip the transform search if the computed * skip RD for the current mode is not better * than the best skip_rd so far. * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to * narrow down the search based on data * collected in the TPL model. * \param[out] yrd Stores the rdcost corresponding to encoding * the luma plane. * * \return The RD cost for the mode being searched. 
*/ static int64_t handle_inter_mode( AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd, uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd, const int do_tx_search, InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand, int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl, int64_t *yrd) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; TxfmSearchInfo *txfm_info = &x->txfm_search_info; const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; #if CONFIG_REALTIME_ONLY const int prune_modes_based_on_tpl = 0; #else // CONFIG_REALTIME_ONLY const TplParams *const tpl_data = &cpi->ppi->tpl_data; const int prune_modes_based_on_tpl = cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index); #endif // CONFIG_REALTIME_ONLY int i; // Reference frames for this mode const int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int rate_mv = 0; int64_t rd = INT64_MAX; // Do first prediction into the destination buffer. Do the next // prediction into a temporary buffer. Then keep track of which one // of these currently holds the best predictor, and use the other // one for future predictions. In the end, copy from tmp_buf to // dst if necessary. struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, tmp_buf + 2 * MAX_SB_SQUARE }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; int64_t ret_val = INT64_MAX; const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; int64_t best_rd = INT64_MAX; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; int64_t best_yrd = INT64_MAX; MB_MODE_INFO best_mbmi = *mbmi; int best_xskip_txfm = 0; int64_t newmv_ret_val = INT64_MAX; inter_mode_info mode_info[MAX_REF_MV_SEARCH]; // Do not prune the mode based on inter cost from tpl if the current ref frame // is the winner ref in neighbouring blocks. int ref_match_found_in_above_nb = 0; int ref_match_found_in_left_nb = 0; if (prune_modes_based_on_tpl) { ref_match_found_in_above_nb = find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd); ref_match_found_in_left_nb = find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd); } // First, perform a simple translation search for each of the indices. If // an index performs well, it will be fully searched in the main loop // of this function. const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); // Save MV results from first 2 ref_mv_idx. 
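  // These saved MVs are later used by prune_ref_mv_idx_search() to skip
  // ref_mv_idx candidates whose MVs nearly duplicate ones already evaluated.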
int_mv save_mv[MAX_REF_MV_SEARCH - 1][2]; int best_ref_mv_idx = -1; const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set); const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); const ModeCosts *mode_costs = &x->mode_costs; const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx); const int base_rate = args->ref_frame_cost + args->single_comp_cost + ref_mv_cost; for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) { save_mv[i][0].as_int = INVALID_MV; save_mv[i][1].as_int = INVALID_MV; } args->start_mv_cnt = 0; // Main loop of this function. This will iterate over all of the ref mvs // in the dynamic reference list and do the following: // 1.) Get the current MV. Create newmv MV if necessary // 2.) Search compound type and parameters if applicable // 3.) Do interpolation filter search // 4.) Build the inter predictor // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL, // WARPED_CAUSAL) // 6.) Update stats if best so far for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { mbmi->ref_mv_idx = ref_mv_idx; mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX; const int drl_cost = get_drl_cost( mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); mode_info[ref_mv_idx].drl_cost = drl_cost; mode_info[ref_mv_idx].skip = 0; if (!mask_check_bit(idx_mask, ref_mv_idx)) { // MV did not perform well in simple translation search. Skip it. continue; } if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb && !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) { // Skip mode if TPL model indicates it will not be beneficial. if (prune_modes_based_on_tpl_stats( inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode, cpi->sf.inter_sf.prune_inter_modes_based_on_tpl)) continue; } av1_init_rd_stats(rd_stats); // Initialize compound mode data mbmi->interinter_comp.type = COMPOUND_AVERAGE; mbmi->comp_group_idx = 0; mbmi->compound_idx = 1; if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; // Compute cost for signalling this DRL index rd_stats->rate = base_rate; rd_stats->rate += drl_cost; int rs = 0; int compmode_interinter_cost = 0; int_mv cur_mv[2]; // TODO(Cherma): Extend this speed feature to support compound mode int skip_repeated_ref_mv = is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv; // Generate the current mv according to the prediction mode if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) { continue; } // The above call to build_cur_mv does not handle NEWMV modes. Build // the mv here if we have NEWMV for any predictors. 
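    // handle_newmv() runs the motion search for the NEWMV predictors and
    // reports the MV signaling cost in rate_mv.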
if (have_newmv_in_inter_mode(this_mode)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, handle_newmv_time); #endif newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, handle_newmv_time); #endif if (newmv_ret_val != 0) continue; if (is_inter_singleref_mode(this_mode) && cur_mv[0].as_int != INVALID_MV) { const MV_REFERENCE_FRAME ref = refs[0]; const unsigned int this_sse = x->pred_sse[ref]; if (this_sse < args->best_single_sse_in_refs[ref]) { args->best_single_sse_in_refs[ref] = this_sse; } if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) { const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1; const int pix_idx = num_pels_log2_lookup[bsize] - 4; const double scale_factor[3][11] = { { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 }, { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 }, { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 } }; assert(pix_idx >= 0); assert(th_idx <= 2); if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse) continue; } } rd_stats->rate += rate_mv; } // Copy the motion vector for this mode into mbmi struct for (i = 0; i < is_comp_pred + 1; ++i) { mbmi->mv[i].as_int = cur_mv[i].as_int; } if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { continue; } // Skip the rest of the search if prune_ref_mv_idx_search speed feature // is enabled, and the current MV is similar to a previous one. if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred && prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi, cpi->sf.inter_sf.prune_ref_mv_idx_search)) continue; if (cpi->sf.gm_sf.prune_zero_mv_with_sse && (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args, cpi->sf.gm_sf.prune_zero_mv_with_sse)) { continue; } } int skip_build_pred = 0; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // Handle a compound predictor, continue if it is determined this // cannot be the best compound mode if (is_comp_pred) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, compound_type_rd_time); #endif const int not_best_mode = process_compound_inter_mode( cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost, rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd, &skip_build_pred); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, compound_type_rd_time); #endif if (not_best_mode) continue; } if (!args->skip_ifs) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, interpolation_filter_search_time); #endif // Determine the interpolation filter for this mode ret_val = av1_interpolation_filter_search( x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs, &skip_build_pred, args, ref_best_rd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, interpolation_filter_search_time); #endif if (args->modelled_rd != NULL && !is_comp_pred) { args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; } if (ret_val != 0) { restore_dst_buf(xd, orig_dst, num_planes); continue; } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout && ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { restore_dst_buf(xd, orig_dst, num_planes); continue; } // Compute modelled RD if enabled if (args->modelled_rd != NULL) { if (is_comp_pred) { const int mode0 = compound_ref0_mode(this_mode); const int mode1 = compound_ref1_mode(this_mode); const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], 
args->modelled_rd[mode1][ref_mv_idx][refs[1]]); if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { restore_dst_buf(xd, orig_dst, num_planes); continue; } } } } rd_stats->rate += compmode_interinter_cost; if (skip_build_pred != 1) { // Build this inter predictor if it has not been previously built av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0, av1_num_planes(cm) - 1); } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, motion_mode_rd_time); #endif int rate2_nocoeff = rd_stats->rate; // Determine the motion mode. This will be one of SIMPLE_TRANSLATION, // OBMC_CAUSAL or WARPED_CAUSAL int64_t this_yrd; ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 0, &this_yrd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, motion_mode_rd_time); #endif assert( IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX)); if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); const THR_MODES mode_enum = get_prediction_mode_idx( mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Collect mode stats for multiwinner mode processing store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum, NULL, bsize, tmp_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); if (tmp_rd < best_rd) { best_yrd = this_yrd; // Update the best rd stats if we found the best mode so far best_rd_stats = *rd_stats; best_rd_stats_y = *rd_stats_y; best_rd_stats_uv = *rd_stats_uv; best_rd = tmp_rd; best_mbmi = *mbmi; best_xskip_txfm = txfm_info->skip_txfm; memcpy(best_blk_skip, txfm_info->blk_skip, sizeof(best_blk_skip[0]) * xd->height * xd->width); av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); motion_mode_cand->rate_mv = rate_mv; motion_mode_cand->rate2_nocoeff = rate2_nocoeff; } if (tmp_rd < ref_best_rd) { ref_best_rd = tmp_rd; best_ref_mv_idx = ref_mv_idx; } } restore_dst_buf(xd, orig_dst, num_planes); } if (best_rd == INT64_MAX) return INT64_MAX; // re-instate status of the best choice *rd_stats = best_rd_stats; *rd_stats_y = best_rd_stats_y; *rd_stats_uv = best_rd_stats_uv; *yrd = best_yrd; *mbmi = best_mbmi; txfm_info->skip_txfm = best_xskip_txfm; assert(IMPLIES(mbmi->comp_group_idx == 1, mbmi->interinter_comp.type != COMPOUND_AVERAGE)); memcpy(txfm_info->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * xd->height * xd->width); av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); return rd_stats->rdcost; } /*!\brief Search for the best intrabc predictor * * \ingroup intra_mode_search * \callergraph * This function performs a motion search to find the best intrabc predictor. * * \returns Returns the best overall rdcost (including the non-intrabc modes * search before this function). 
*/ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc || !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode) return INT64_MAX; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const TileInfo *tile = &xd->tile; MB_MODE_INFO *mbmi = xd->mi[0]; TxfmSearchInfo *txfm_info = &x->txfm_search_info; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int w = block_size_wide[bsize]; const int h = block_size_high[bsize]; const int sb_row = mi_row >> cm->seq_params->mib_size_log2; const int sb_col = mi_col >> cm->seq_params->mib_size_log2; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); int_mv nearestmv, nearmv; av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, 0); if (nearestmv.as_int == INVALID_MV) { nearestmv.as_int = 0; } if (nearmv.as_int == INVALID_MV) { nearmv.as_int = 0; } int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; if (dv_ref.as_int == 0) { av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row); } // Ref DV should not have sub-pel. assert((dv_ref.as_mv.col & 7) == 0); assert((dv_ref.as_mv.row & 7) == 0); mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; struct buf_2d yv12_mb[MAX_MB_PLANE]; av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes); for (int i = 0; i < num_planes; ++i) { xd->plane[i].pre[0] = yv12_mb[i]; } enum IntrabcMotionDirection { IBC_MOTION_ABOVE, IBC_MOTION_LEFT, IBC_MOTION_DIRECTIONS }; MB_MODE_INFO best_mbmi = *mbmi; RD_STATS best_rdstats = *rd_stats; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); FULLPEL_MOTION_SEARCH_PARAMS fullms_params; const SEARCH_METHODS search_method = av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); const search_site_config *lookahead_search_sites = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv); av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, &dv_ref.as_mv, start_mv, lookahead_search_sites, search_method, /*fine_search_interval=*/0); const IntraBCMVCosts *const dv_costs = x->dv_costs; av1_set_ms_to_intra_mode(&fullms_params, dv_costs); for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; dir < IBC_MOTION_DIRECTIONS; ++dir) { switch (dir) { case IBC_MOTION_ABOVE: fullms_params.mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; fullms_params.mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w; fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; fullms_params.mv_limits.row_max = (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h; break; case IBC_MOTION_LEFT: fullms_params.mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; fullms_params.mv_limits.col_max = (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w; // 
TODO(aconverse@google.com): Minimize the overlap between above and // left areas. fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; int bottom_coded_mi_edge = AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end); fullms_params.mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; break; default: assert(0); } assert(fullms_params.mv_limits.col_min >= fullms_params.mv_limits.col_min); assert(fullms_params.mv_limits.col_max <= fullms_params.mv_limits.col_max); assert(fullms_params.mv_limits.row_min >= fullms_params.mv_limits.row_min); assert(fullms_params.mv_limits.row_max <= fullms_params.mv_limits.row_max); av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv); if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min || fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) { continue; } const int step_param = cpi->mv_search_params.mv_step_param; IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info; int_mv best_mv, best_hash_mv; FULLPEL_MV_STATS best_mv_stats; int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL, &best_mv.as_fullmv, &best_mv_stats, NULL); const int hashsme = av1_intrabc_hash_search( cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv); if (hashsme < bestsme) { best_mv = best_hash_mv; bestsme = hashsme; } if (bestsme == INT_MAX) continue; const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv); if (!av1_is_fullmv_in_range(&fullms_params.mv_limits, get_fullmv_from_mv(&dv))) continue; if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, cm->seq_params->mib_size_log2)) continue; // DV should not have sub-pel. assert((dv.col & 7) == 0); assert((dv.row & 7) == 0); memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->use_intrabc = 1; mbmi->mode = DC_PRED; mbmi->uv_mode = UV_DC_PRED; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->mv[0].as_mv = dv; mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); mbmi->skip_txfm = 0; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); // TODO(aconverse@google.com): The full motion field defining discount // in MV_COST_WEIGHT is too large. Explore other values. const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, dv_costs->dv_costs, MV_COST_WEIGHT_SUB); const int rate_mode = x->mode_costs.intrabc_cost[1]; RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, &rd_stats_uv, rate_mode + rate_mv, INT64_MAX)) continue; rd_stats_yuv.rdcost = RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist); if (rd_stats_yuv.rdcost < best_rd) { best_rd = rd_stats_yuv.rdcost; best_mbmi = *mbmi; best_rdstats = rd_stats_yuv; memcpy(best_blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); } } *mbmi = best_mbmi; *rd_stats = best_rdstats; memcpy(txfm_info->blk_skip, best_blk_skip, sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); #if CONFIG_RD_DEBUG mbmi->rd_stats = *rd_stats; #endif return best_rd; } // TODO(chiyotsai@google.com): We are using struct $struct_name instead of their // typedef here because Doxygen doesn't know about the typedefs yet. 
So using // the typedef will prevent doxygen from finding this function and generating // the callgraph. Once documents for AV1_COMP and MACROBLOCK are added to // doxygen, we can revert back to using the typedefs. void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int num_planes = av1_num_planes(cm); TxfmSearchInfo *txfm_info = &x->txfm_search_info; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; uint8_t y_skip_txfm = 0, uv_skip_txfm = 0; int64_t dist_y = 0, dist_uv = 0; ctx->rd_stats.skip_txfm = 0; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE_FRAME; mbmi->use_intrabc = 0; mbmi->mv[0].as_int = 0; mbmi->skip_mode = 0; const int64_t intra_yrd = av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, &y_skip_txfm, bsize, best_rd, ctx); // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); if (intra_yrd < best_rd) { // Search intra modes for uv planes if needed if (num_planes > 1) { // Set up the tx variables for reproducing the y predictions in case we // need it for chroma-from-luma. if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) { memcpy(txfm_info->blk_skip, ctx->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); } const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip_txfm, bsize, max_uv_tx_size); } // Intra block is always coded as non-skip rd_cost->rate = rate_y + rate_uv + x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; rd_cost->dist = dist_y + dist_uv; rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); rd_cost->skip_txfm = 0; } else { rd_cost->rate = INT_MAX; } if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) best_rd = rd_cost->rdcost; if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) { ctx->rd_stats.skip_txfm = mbmi->skip_txfm; memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); assert(rd_cost->rate != INT_MAX); } if (rd_cost->rate == INT_MAX) return; ctx->mic = *xd->mi[0]; av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, av1_ref_frame_type(xd->mi[0]->ref_frame)); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } static inline void calc_target_weighted_pred( const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride); static inline void rd_pick_skip_mode( RD_STATS *rd_cost, InterModeSearchState *search_state, const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *const cm = &cpi->common; const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; x->compound_idx = 1; // COMPOUND_AVERAGE RD_STATS skip_mode_rd_stats; av1_invalid_rd_stats(&skip_mode_rd_stats); if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX || skip_mode_info->ref_frame_idx_1 == INVALID_IDX) { return; } const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + 
skip_mode_info->ref_frame_idx_0; const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + skip_mode_info->ref_frame_idx_1; const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; const THR_MODES mode_index = get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); if (mode_index == THR_INVALID) { return; } if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || cpi->sf.inter_sf.disable_onesided_comp) && cpi->all_one_sided_refs) { return; } mbmi->mode = this_mode; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; mbmi->ref_frame[1] = second_ref_frame; const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) { MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext; if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { return; } av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type); } assert(this_mode == NEAREST_NEARESTMV); if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) { return; } mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); mbmi->comp_group_idx = 0; mbmi->compound_idx = x->compound_idx; mbmi->interinter_comp.type = COMPOUND_AVERAGE; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = 0; mbmi->skip_mode = mbmi->skip_txfm = 1; mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; set_default_interp_filters(mbmi, cm->features.interp_filter); set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } BUFFER_SET orig_dst; for (int i = 0; i < num_planes; i++) { orig_dst.plane[i] = xd->plane[i].dst.buf; orig_dst.stride[i] = xd->plane[i].dst.stride; } // Compare the use of skip_mode with the best intra/inter mode obtained. const int skip_mode_ctx = av1_get_skip_mode_context(xd); int64_t best_intra_inter_mode_cost = INT64_MAX; if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) { const ModeCosts *mode_costs = &x->mode_costs; best_intra_inter_mode_cost = RDCOST( x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0], rd_cost->dist); // Account for non-skip mode rate in total rd stats rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0]; av1_rd_cost_update(x->rdmult, rd_cost); } // Obtain the rdcost for skip_mode. 
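  // In skip mode the block is predicted by compound averaging of the two
  // frame-level derived reference frames with NEAREST_NEARESTMV motion and no
  // coded residual, so the cost evaluated below is essentially the skip-mode
  // signaling rate plus the prediction distortion.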
skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst, best_intra_inter_mode_cost); if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost && (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) { assert(mode_index != THR_INVALID); search_state->best_mbmode.skip_mode = 1; search_state->best_mbmode = *mbmi; memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height, search_state->best_mbmode.skip_txfm && is_inter_block(mbmi), xd); search_state->best_mode_index = mode_index; // Update rd_cost rd_cost->rate = skip_mode_rd_stats.rate; rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; rd_cost->rdcost = skip_mode_rd_stats.rdcost; search_state->best_rd = rd_cost->rdcost; search_state->best_skip2 = 1; search_state->best_mode_skippable = 1; x->txfm_search_info.skip_txfm = 1; } } // Get winner mode stats of given mode index static inline MB_MODE_INFO *get_winner_mode_stats( MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type, int mode_idx) { MB_MODE_INFO *winner_mbmi; if (multi_winner_mode_type) { assert(mode_idx >= 0 && mode_idx < x->winner_mode_count); WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx]; winner_mbmi = &winner_mode_stat->mbmi; *winner_rd_cost = &winner_mode_stat->rd_cost; *winner_rate_y = winner_mode_stat->rate_y; *winner_rate_uv = winner_mode_stat->rate_uv; *winner_mode_index = winner_mode_stat->mode_index; } else { winner_mbmi = best_mbmode; *winner_rd_cost = best_rd_cost; *winner_rate_y = best_rate_y; *winner_rate_uv = best_rate_uv; *winner_mode_index = *best_mode_index; } return winner_mbmi; } // speed feature: fast intra/inter transform type search // Used for speed >= 2 // When this speed feature is on, in rd mode search, only DCT is used. // After the mode is determined, this function is called, to select // transform types and get accurate rdcost. 
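// The candidates come from store_winner_mode_stats(): either only the single
// best mode, or several winners when multi_winner_mode_type is enabled. For
// each candidate the prediction is rebuilt and the transform search is redone
// under the WINNER_MODE_EVAL parameters; the current best mode is replaced
// whenever a candidate achieves a lower rd cost.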
static inline void refine_winner_mode_tx( const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index, MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; TxfmSearchParams *txfm_params = &x->txfm_search_params; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int64_t best_rd; const int num_planes = av1_num_planes(cm); if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode, rd_cost->skip_txfm)) return; // Set params for winner mode evaluation set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); // No best mode identified so far if (*best_mode_index == THR_INVALID) return; best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) { RD_STATS *winner_rd_stats = NULL; int winner_rate_y = 0, winner_rate_uv = 0; THR_MODES winner_mode_index = 0; // TODO(any): Combine best mode and multi-winner mode processing paths // Get winner mode stats for current mode index MB_MODE_INFO *winner_mbmi = get_winner_mode_stats( x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index, &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index, cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx); if (xd->lossless[winner_mbmi->segment_id] == 0 && winner_mode_index != THR_INVALID && is_winner_mode_processing_enabled(cpi, x, winner_mbmi, rd_cost->skip_txfm)) { RD_STATS rd_stats = *winner_rd_stats; int skip_blk = 0; RD_STATS rd_stats_y, rd_stats_uv; const int skip_ctx = av1_get_skip_txfm_context(xd); *mbmi = *winner_mbmi; set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Select prediction reference frames. for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; if (has_second_ref(mbmi)) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } if (is_inter_mode(mbmi->mode)) { const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; bool is_predictor_built = false; const PREDICTION_MODE prediction_mode = mbmi->mode; // Do interpolation filter search for realtime mode if applicable. 
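        // This fast path applies only to REALTIME encoding with single
        // reference, non-compound, simple-translation modes; when
        // fast_interp_search() reports that it already built the predictor,
        // the generic av1_enc_build_inter_predictor() call below is skipped.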
if (cpi->sf.winner_mode_sf.winner_mode_ifs && cpi->oxcf.mode == REALTIME && cm->current_frame.reference_mode == SINGLE_REFERENCE && is_inter_mode(prediction_mode) && mbmi->motion_mode == SIMPLE_TRANSLATION && !is_inter_compound_mode(prediction_mode)) { is_predictor_built = fast_interp_search(cpi, x, mi_row, mi_col, bsize); } if (!is_predictor_built) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); } if (mbmi->motion_mode == OBMC_CAUSAL) av1_build_obmc_inter_predictors_sb(cm, xd); av1_subtract_plane(x, bsize, 0); if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); assert(rd_stats_y.rate != INT_MAX); } else { av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); for (int i = 0; i < xd->height * xd->width; ++i) set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm); } } else { av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); } if (num_planes > 1) { av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); } else { av1_init_rd_stats(&rd_stats_uv); } const ModeCosts *mode_costs = &x->mode_costs; if (is_inter_mode(mbmi->mode) && RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate, (rd_stats_y.dist + rd_stats_uv.dist)) > RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1], (rd_stats_y.sse + rd_stats_uv.sse))) { skip_blk = 1; rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1]; rd_stats_uv.rate = 0; rd_stats_y.dist = rd_stats_y.sse; rd_stats_uv.dist = rd_stats_uv.sse; } else { skip_blk = 0; rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; } int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate - winner_rate_y - winner_rate_uv; int64_t this_rd = RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist)); if (best_rd > this_rd) { *best_mbmode = *mbmi; *best_mode_index = winner_mode_index; av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); rd_cost->rate = this_rate; rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse; rd_cost->rdcost = this_rd; best_rd = this_rd; *best_skip2 = skip_blk; } } } } /*!\cond */ typedef struct { // Mask for each reference frame, specifying which prediction modes to NOT try // during search. uint32_t pred_modes[REF_FRAMES]; // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of // reference frames (i, j). // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 // (NONE_FRAME). bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; } mode_skip_mask_t; /*!\endcond */ // Update 'ref_combo' mask to disable given 'ref' in single and compound modes. static inline void disable_reference( MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { ref_combo[ref][ref2 + 1] = true; } } // Update 'ref_combo' mask to disable all inter references except ALTREF. 
static inline void disable_inter_references_except_altref( bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { disable_reference(LAST_FRAME, ref_combo); disable_reference(LAST2_FRAME, ref_combo); disable_reference(LAST3_FRAME, ref_combo); disable_reference(GOLDEN_FRAME, ref_combo); disable_reference(BWDREF_FRAME, ref_combo); disable_reference(ALTREF2_FRAME, ref_combo); } static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, }; typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; static inline void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) { if (ref_set == REF_SET_FULL) { // Everything available by default. memset(mask, 0, sizeof(*mask)); } else { // All modes available by default. memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); // All references disabled first. for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { mask->ref_combo[ref1][ref2 + 1] = true; } } const MV_REFERENCE_FRAME(*ref_set_combos)[2]; int num_ref_combos; // Then enable reduced set of references explicitly. switch (ref_set) { case REF_SET_REDUCED: ref_set_combos = reduced_ref_combos; num_ref_combos = (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); break; case REF_SET_REALTIME: ref_set_combos = real_time_ref_combos; num_ref_combos = (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); break; default: assert(0); num_ref_combos = 0; } for (int i = 0; i < num_ref_combos; ++i) { const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; } } } static inline void init_mode_skip_mask(mode_skip_mask_t *mask, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; unsigned char segment_id = mbmi->segment_id; const SPEED_FEATURES *const sf = &cpi->sf; const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; REF_SET ref_set = REF_SET_FULL; if (sf->rt_sf.use_real_time_ref_set) ref_set = REF_SET_REALTIME; else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set) ref_set = REF_SET_REDUCED; default_skip_mask(mask, ref_set); int min_pred_mv_sad = INT_MAX; MV_REFERENCE_FRAME ref_frame; if (ref_set == REF_SET_REALTIME) { // For real-time encoding, we only look at a subset of ref frames. So the // threshold for pruning should be computed from this subset as well. 
const int num_rt_refs = sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos); for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) { const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; if (ref != INTRA_FRAME) { min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]); } } } else { for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { // Skip checking missing reference in both single and compound reference // modes. disable_reference(ref_frame, mask->ref_combo); } else { // Skip fixed mv modes for poor references if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; } } if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { // Reference not used for the segment. disable_reference(ref_frame, mask->ref_combo); } } // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature // is disabled for this segment. This is to prevent the possibility that we // end up unable to pick any mode. if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) { disable_inter_references_except_altref(mask->ref_combo); mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; int_mv near_mv, nearest_mv, global_mv; get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); if (near_mv.as_int != global_mv.as_int) mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); if (nearest_mv.as_int != global_mv.as_int) mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); } } if (cpi->rc.is_src_frame_alt_ref) { if (inter_sf->alt_ref_search_fp && (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) { mask->pred_modes[ALTREF_FRAME] = 0; disable_inter_references_except_altref(mask->ref_combo); disable_reference(INTRA_FRAME, mask->ref_combo); } } if (inter_sf->alt_ref_search_fp) { if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) { int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3); // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if // those are past frames MV_REFERENCE_FRAME start_frame = inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME; for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < 0) { // Prune inter modes when relative dist of ALTREF2 and ALTREF is close // to the relative dist of LAST_FRAME. 
if (inter_sf->alt_ref_search_fp == 1 && (abs(cpi->ref_frame_dist_info .ref_relative_dist[ref_frame - LAST_FRAME]) > 1.5 * abs(cpi->ref_frame_dist_info .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) { continue; } if (x->pred_mv_sad[ref_frame] > sad_thresh) mask->pred_modes[ref_frame] |= INTER_ALL; } } } } if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { if (x->best_pred_mv_sad[0] < INT_MAX) { int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1); const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME }; // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references for (int ref_idx = 0; ref_idx < 2; ref_idx++) { ref_frame = prune_ref_list[ref_idx]; if (x->pred_mv_sad[ref_frame] > sad_thresh) mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; } } } if (bsize > sf->part_sf.max_intra_bsize) { disable_reference(INTRA_FRAME, mask->ref_combo); } if (!cpi->oxcf.tool_cfg.enable_global_motion) { for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { mask->pred_modes[ref_frame] |= (1 << GLOBALMV); mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV); } } mask->pred_modes[INTRA_FRAME] |= ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; // Prune reference frames which are not the closest to the current // frame and with large pred_mv_sad. if (inter_sf->prune_single_ref) { assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3); const double prune_threshes[2] = { 1.20, 1.05 }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefFrameDistanceInfo *const ref_frame_dist_info = &cpi->ref_frame_dist_info; const int is_closest_ref = (ref_frame == ref_frame_dist_info->nearest_past_ref) || (ref_frame == ref_frame_dist_info->nearest_future_ref); if (!is_closest_ref) { const int dir = (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0) ? 
0 : 1; if (x->best_pred_mv_sad[dir] < INT_MAX && x->pred_mv_sad[ref_frame] > prune_threshes[inter_sf->prune_single_ref - 1] * x->best_pred_mv_sad[dir]) mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL; } } } } static inline void init_neighbor_pred_buf(const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args, int is_hbd) { if (is_hbd) { const int len = sizeof(uint16_t); args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred); args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1) * len); args->above_pred_buf[2] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len); args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred); args->left_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len); args->left_pred_buf[2] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len); } else { args->above_pred_buf[0] = obmc_buffer->above_pred; args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1); args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE; args->left_pred_buf[0] = obmc_buffer->left_pred; args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1); args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE; } } static inline int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame) { const AV1_COMMON *const cm = &cpi->common; MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1; if (prune_ref_by_selective_ref_frame(cpi, x, rf, cm->cur_frame->ref_display_order_hint)) { return 1; } return 0; } static inline int is_ref_frame_used_by_compound_ref(int ref_frame, int skip_ref_frame_mask) { for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { if (!(skip_ref_frame_mask & (1 << r))) { const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; if (rf[0] == ref_frame || rf[1] == ref_frame) { return 1; } } } return 0; } static inline int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, const MB_MODE_INFO *mi_cache) { if (!mi_cache) { return 0; } if (ref_frame < REF_FRAMES) { return (ref_frame == mi_cache->ref_frame[0] || ref_frame == mi_cache->ref_frame[1]); } // if we are here, then the current mode is compound. MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame); return ref_frame == cached_ref_type; } // Please add/modify parameter setting in this function, making it consistent // and easy to read and maintain. 
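// Per-block setup performed before the inter mode loop: reference-frame cost
// estimates, candidate MV lists for the available single and compound
// references, OBMC neighbour predictions (when OBMC is allowed), the mode
// skip mask, and the default mode-evaluation parameters.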
static inline void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; unsigned char segment_id = mbmi->segment_id; init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd)); av1_collect_neighbors_ref_counts(xd); estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single, ref_costs_comp); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; x->best_pred_mv_sad[0] = INT_MAX; x->best_pred_mv_sad[1] = INT_MAX; for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; mbmi_ext->mode_context[ref_frame] = 0; mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { // Skip the ref frame if the mask says skip and the ref is not used by // compound ref. if (skip_ref_frame_mask & (1 << ref_frame) && !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) && !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb); } if (cpi->sf.inter_sf.alt_ref_search_fp || cpi->sf.inter_sf.prune_single_ref || cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { // Store the best pred_mv_sad across all past frames if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < 0) x->best_pred_mv_sad[0] = AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]); else // Store the best pred_mv_sad across all future frames x->best_pred_mv_sad[1] = AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]); } } if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) { // No second reference on RT ref set, so no need to initialize for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { mbmi_ext->mode_context[ref_frame] = 0; mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { continue; } if (skip_ref_frame_mask & (1 << ref_frame) && !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } // Ref mv list population is not required, when compound references are // pruned. if (prune_ref_frame(cpi, x, ref_frame)) continue; av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); } } av1_count_overlappable_neighbors(cm, xd); const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int use_actual_frame_probs = 1; int prune_obmc; #if CONFIG_FPMT_TEST use_actual_frame_probs = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
0 : 1; if (!use_actual_frame_probs) { prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < cpi->sf.inter_sf.prune_obmc_prob_thresh; } #endif if (use_actual_frame_probs) { prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < cpi->sf.inter_sf.prune_obmc_prob_thresh; } if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) { if (check_num_overlappable_neighbors(mbmi) && is_motion_variation_allowed_bsize(bsize)) { int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }; int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf, dst_width1, dst_height1, args->above_pred_stride); av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf, dst_width2, dst_height2, args->left_pred_stride); const int num_planes = av1_num_planes(cm); av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); calc_target_weighted_pred( cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0], args->left_pred_buf[0], args->left_pred_stride[0]); } } init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); // Set params for mode evaluation set_mode_eval_params(cpi, x, MODE_EVAL); x->comp_rd_stats_idx = 0; for (int idx = 0; idx < REF_FRAMES; idx++) { args->best_single_sse_in_refs[idx] = INT32_MAX; } } static inline void init_single_inter_mode_search_state( InterModeSearchState *search_state) { for (int dir = 0; dir < 2; ++dir) { for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { SingleInterModeState *state; state = &search_state->single_state[dir][mode][ref_frame]; state->ref_frame = NONE_FRAME; state->rd = INT64_MAX; state = &search_state->single_state_modelled[dir][mode][ref_frame]; state->ref_frame = NONE_FRAME; state->rd = INT64_MAX; search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; } } } for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { search_state->best_single_rd[ref_frame] = INT64_MAX; search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID; } av1_zero(search_state->single_state_cnt); av1_zero(search_state->single_state_modelled_cnt); } static inline void init_inter_mode_search_state( InterModeSearchState *search_state, const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { init_intra_mode_search_state(&search_state->intra_search_state); av1_invalid_rd_stats(&search_state->best_y_rdcost); search_state->best_rd = best_rd_so_far; search_state->best_skip_rd[0] = INT64_MAX; search_state->best_skip_rd[1] = INT64_MAX; av1_zero(search_state->best_mbmode); search_state->best_rate_y = INT_MAX; search_state->best_rate_uv = INT_MAX; search_state->best_mode_skippable = 0; search_state->best_skip2 = 0; search_state->best_mode_index = THR_INVALID; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const unsigned char segment_id = mbmi->segment_id; search_state->num_available_refs = 0; memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); memset(search_state->dist_order_refs, -1, sizeof(search_state->dist_order_refs)); for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) search_state->mode_threshold[i] = 0; const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; for (int i = LAST_NEW_MV_INDEX + 
1; i < SINGLE_REF_MODE_END; ++i) search_state->mode_threshold[i] = ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> RD_THRESH_FAC_FRAC_BITS; search_state->best_intra_rd = INT64_MAX; search_state->best_pred_sse = UINT_MAX; av1_zero(search_state->single_newmv); av1_zero(search_state->single_newmv_rate); av1_zero(search_state->single_newmv_valid); for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) { for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; search_state->simple_rd[i][j][ref_frame] = INT64_MAX; } } } for (int i = 0; i < REFERENCE_MODES; ++i) { search_state->best_pred_rd[i] = INT64_MAX; } if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) { for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i) search_state->mode_threshold[i] = ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> RD_THRESH_FAC_FRAC_BITS; for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) { for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; search_state->simple_rd[i][j][ref_frame] = INT64_MAX; } } } init_single_inter_mode_search_state(search_state); } } static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, const MV_REFERENCE_FRAME *ref_frame, const PREDICTION_MODE this_mode) { if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { return true; } return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; } static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, PREDICTION_MODE curr_mode, const MV_REFERENCE_FRAME *ref_frames) { const int comp_pred = ref_frames[1] > INTRA_FRAME; if (comp_pred) { if (!is_comp_ref_allowed(bsize)) return 1; if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) { return 1; } const AV1_COMMON *const cm = &cpi->common; if (frame_is_intra_only(cm)) return 1; const CurrentFrame *const current_frame = &cm->current_frame; if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; const struct segmentation *const seg = &cm->seg; const unsigned char segment_id = x->e_mbd.mi[0]->segment_id; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1; } if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) { // Mode must be compatible if (!is_interintra_allowed_bsize(bsize)) return 1; if (!is_interintra_allowed_mode(curr_mode)) return 1; } return 0; } static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x, BLOCK_SIZE bsize, int mib_size) { const int sb_size_mask = mib_size - 1; const MACROBLOCKD *const xd = &x->e_mbd; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int mi_row_in_sb = mi_row & sb_size_mask; const int mi_col_in_sb = mi_col & sb_size_mask; const int mi_w = mi_size_wide[bsize]; const int mi_h = mi_size_high[bsize]; int picked_ref_frames_mask = 0; for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) { for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) { picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j]; } } return picked_ref_frames_mask; } // Check if reference frame pair of the current block matches with the given // block. 
static inline int match_ref_frame_pair(const MB_MODE_INFO *mbmi, const MV_REFERENCE_FRAME *ref_frames) { return ((ref_frames[0] == mbmi->ref_frame[0]) && (ref_frames[1] == mbmi->ref_frame[1])); } // Case 1: return 0, means don't skip this mode // Case 2: return 1, means skip this mode completely // Case 3: return 2, means skip compound only, but still try single motion modes static int inter_mode_search_order_independent_skip( const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask, InterModeSearchState *search_state, int skip_ref_frame_mask, PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) { if (mask_says_skip(mode_skip_mask, ref_frame, mode)) { return 1; } const int ref_type = av1_ref_frame_type(ref_frame); if (!cpi->sf.rt_sf.use_real_time_ref_set) if (prune_ref_frame(cpi, x, ref_type)) return 1; // This is only used in motion vector unit test. if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME) return 1; const AV1_COMMON *const cm = &cpi->common; if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) { return 1; } // Reuse the prediction mode in cache if (x->use_mb_mode_cache) { const MB_MODE_INFO *cached_mi = x->mb_mode_cache; const PREDICTION_MODE cached_mode = cached_mi->mode; const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame; const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME; // If the cached mode is intra, then we just need to match the mode. if (is_mode_intra(cached_mode) && mode != cached_mode) { return 1; } // If the cached mode is single inter mode, then we match the mode and // reference frame. if (cached_mode_is_single) { if (mode != cached_mode || ref_frame[0] != cached_frame[0]) { return 1; } } else { // If the cached mode is compound, then we need to consider several cases. const int mode_is_single = ref_frame[1] <= INTRA_FRAME; if (mode_is_single) { // If the mode is single, we know the modes can't match. But we might // still want to search it if compound mode depends on the current mode. int skip_motion_mode_only = 0; if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) { skip_motion_mode_only = (ref_frame[0] == cached_frame[0]); } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) { skip_motion_mode_only = (ref_frame[0] == cached_frame[1]); } else if (cached_mode == NEW_NEWMV) { skip_motion_mode_only = (ref_frame[0] == cached_frame[0] || ref_frame[0] == cached_frame[1]); } return 1 + skip_motion_mode_only; } else { // If both modes are compound, then everything must match. if (mode != cached_mode || ref_frame[0] != cached_frame[0] || ref_frame[1] != cached_frame[1]) { return 1; } } } } const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; // If no valid mode has been found so far in PARTITION_NONE when finding a // valid partition is required, do not skip mode. 
if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE && x->must_find_valid_partition) return 0; const SPEED_FEATURES *const sf = &cpi->sf; // Prune NEARMV and NEAR_NEARMV based on q index and neighbor's reference // frames if (sf->inter_sf.prune_nearmv_using_neighbors && (mode == NEAR_NEARMV || mode == NEARMV)) { const MACROBLOCKD *const xd = &x->e_mbd; if (search_state->best_rd != INT64_MAX && xd->left_available && xd->up_available) { const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 }, { 1, 1, 0 }, { 2, 1, 0 } }; const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE; assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX && qindex_sub_range < 3); const int num_ref_frame_pair_match_thresh = thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1] [qindex_sub_range]; assert(num_ref_frame_pair_match_thresh <= 2 && num_ref_frame_pair_match_thresh >= 0); int num_ref_frame_pair_match = 0; num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame); num_ref_frame_pair_match += match_ref_frame_pair(xd->above_mbmi, ref_frame); // Pruning based on ref frame pair match with neighbors. if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1; } } int skip_motion_mode = 0; if (mbmi->partition != PARTITION_NONE) { int skip_ref = skip_ref_frame_mask & (1 << ref_type); if (ref_type <= ALTREF_FRAME && skip_ref) { // Since the compound ref modes depends on the motion estimation result of // two single ref modes (best mv of single ref modes as the start point), // if current single ref mode is marked skip, we need to check if it will // be used in compound ref modes. if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) { // Found a not skipped compound ref mode which contains current // single ref. So this single ref can't be skipped completely // Just skip its motion mode search, still try its simple // transition mode. skip_motion_mode = 1; skip_ref = 0; } } // If we are reusing the prediction from cache, and the current frame is // required by the cache, then we cannot prune it. if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) { skip_ref = 0; // If the cache only needs the current reference type for compound // prediction, then we can skip motion mode search. 
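      // That is, keep the single-reference evaluation (its best MV seeds the
      // compound search) but do not spend time on OBMC/warped motion for it.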
skip_motion_mode = (ref_type <= ALTREF_FRAME && x->mb_mode_cache->ref_frame[1] > INTRA_FRAME); } if (skip_ref) return 1; } if (ref_frame[0] == INTRA_FRAME) { if (mode != DC_PRED) { // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes const unsigned int skip_intra_var_thresh = 64; if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && x->source_variance < skip_intra_var_thresh) return 1; } } if (skip_motion_mode) return 2; return 0; } static inline void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, const MV_REFERENCE_FRAME *ref_frames, const AV1_COMMON *cm) { PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; mbmi->ref_mv_idx = 0; mbmi->mode = curr_mode; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frames[0]; mbmi->ref_frame[1] = ref_frames[1]; pmi->palette_size[0] = 0; pmi->palette_size[1] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); set_default_interp_filters(mbmi, cm->features.interp_filter); } static inline void collect_single_states(MACROBLOCK *x, InterModeSearchState *search_state, const MB_MODE_INFO *const mbmi) { int i, j; const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; const PREDICTION_MODE this_mode = mbmi->mode; const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; const int mode_offset = INTER_OFFSET(this_mode); const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); // Simple rd int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { const int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; if (rd < simple_rd) simple_rd = rd; } // Insertion sort of single_state const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; i = search_state->single_state_cnt[dir][mode_offset]; for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) state_s[j] = state_s[j - 1]; state_s[j] = this_state_s; search_state->single_state_cnt[dir][mode_offset]++; // Modelled rd int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { const int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; if (rd < modelled_rd) modelled_rd = rd; } // Insertion sort of single_state_modelled const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; SingleInterModeState *state_m = search_state->single_state_modelled[dir][mode_offset]; i = search_state->single_state_modelled_cnt[dir][mode_offset]; for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) state_m[j] = state_m[j - 1]; state_m[j] = this_state_m; search_state->single_state_modelled_cnt[dir][mode_offset]++; } static inline void analyze_single_states(const AV1_COMP *cpi, InterModeSearchState *search_state) { const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; assert(prune_level >= 1); int i, j, dir, mode; for (dir = 0; dir < 2; ++dir) { int64_t best_rd; SingleInterModeState(*state)[FWD_REFS]; const int prune_factor = prune_level >= 2 ? 
6 : 5; // Use the best rd of GLOBALMV or NEWMV to prune the unlikely // reference frames for all the modes (NEARESTMV and NEARMV may not // have same motion vectors). Always keep the best of each mode // because it might form the best possible combination with other mode. state = search_state->single_state[dir]; best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, state[INTER_OFFSET(GLOBALMV)][0].rd); for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { if (state[mode][i].rd != INT64_MAX && (state[mode][i].rd >> 3) * prune_factor > best_rd) { state[mode][i].valid = 0; } } } state = search_state->single_state_modelled[dir]; best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, state[INTER_OFFSET(GLOBALMV)][0].rd); for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) { if (state[mode][i].rd != INT64_MAX && (state[mode][i].rd >> 3) * prune_factor > best_rd) { state[mode][i].valid = 0; } } } } // Ordering by simple rd first, then by modelled rd for (dir = 0; dir < 2; ++dir) { for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { const int state_cnt_s = search_state->single_state_cnt[dir][mode]; const int state_cnt_m = search_state->single_state_modelled_cnt[dir][mode]; SingleInterModeState *state_s = search_state->single_state[dir][mode]; SingleInterModeState *state_m = search_state->single_state_modelled[dir][mode]; int count = 0; const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); for (i = 0; i < state_cnt_s; ++i) { if (state_s[i].rd == INT64_MAX) break; if (state_s[i].valid) { search_state->single_rd_order[dir][mode][count++] = state_s[i].ref_frame; } } if (count >= max_candidates) continue; for (i = 0; i < state_cnt_m && count < max_candidates; ++i) { if (state_m[i].rd == INT64_MAX) break; if (!state_m[i].valid) continue; const int ref_frame = state_m[i].ref_frame; int match = 0; // Check if existing already for (j = 0; j < count; ++j) { if (search_state->single_rd_order[dir][mode][j] == ref_frame) { match = 1; break; } } if (match) continue; // Check if this ref_frame is removed in simple rd int valid = 1; for (j = 0; j < state_cnt_s; ++j) { if (ref_frame == state_s[j].ref_frame) { valid = state_s[j].valid; break; } } if (valid) { search_state->single_rd_order[dir][mode][count++] = ref_frame; } } } } } static int compound_skip_get_candidates( const AV1_COMP *cpi, const InterModeSearchState *search_state, const int dir, const PREDICTION_MODE mode) { const int mode_offset = INTER_OFFSET(mode); const SingleInterModeState *state = search_state->single_state[dir][mode_offset]; const SingleInterModeState *state_modelled = search_state->single_state_modelled[dir][mode_offset]; int max_candidates = 0; for (int i = 0; i < FWD_REFS; ++i) { if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; max_candidates++; } int candidates = max_candidates; if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) { candidates = AOMMIN(2, max_candidates); } if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) { if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && state[0].ref_frame == state_modelled[0].ref_frame) candidates = 1; if (mode == NEARMV || mode == GLOBALMV) candidates = 1; } if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) { // Limit the number of candidates to 1 in each direction for compound // prediction candidates = AOMMIN(1, candidates); } return candidates; } static int 
compound_skip_by_single_states( const AV1_COMP *cpi, const InterModeSearchState *search_state, const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) { const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame }; const int mode[2] = { compound_ref0_mode(this_mode), compound_ref1_mode(this_mode) }; const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) }; const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1, refs[1] <= GOLDEN_FRAME ? 0 : 1 }; int ref_searched[2] = { 0, 0 }; int ref_mv_match[2] = { 1, 1 }; int i, j; for (i = 0; i < 2; ++i) { const SingleInterModeState *state = search_state->single_state[mode_dir[i]][mode_offset[i]]; const int state_cnt = search_state->single_state_cnt[mode_dir[i]][mode_offset[i]]; for (j = 0; j < state_cnt; ++j) { if (state[j].ref_frame == refs[i]) { ref_searched[i] = 1; break; } } } const int ref_set = get_drl_refmv_count(x, refs, this_mode); for (i = 0; i < 2; ++i) { if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) { continue; } const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME }; for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) { int_mv single_mv; int_mv comp_mv; get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs, &x->mbmi_ext); get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext); if (single_mv.as_int != comp_mv.as_int) { ref_mv_match[i] = 0; break; } } } for (i = 0; i < 2; ++i) { if (!ref_searched[i] || !ref_mv_match[i]) continue; const int candidates = compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]); const MV_REFERENCE_FRAME *ref_order = search_state->single_rd_order[mode_dir[i]][mode_offset[i]]; int match = 0; for (j = 0; j < candidates; ++j) { if (refs[i] == ref_order[j]) { match = 1; break; } } if (!match) return 1; } return 0; } // Check if ref frames of current block matches with given block. static inline void match_ref_frame(const MB_MODE_INFO *const mbmi, const MV_REFERENCE_FRAME *ref_frames, int *const is_ref_match) { if (is_inter_block(mbmi)) { is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0]; is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0]; if (has_second_ref(mbmi)) { is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1]; is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1]; } } } // Prune compound mode using ref frames of neighbor blocks. static inline int compound_skip_using_neighbor_refs( MACROBLOCKD *const xd, const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) { // Exclude non-extended compound modes from pruning if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) return 0; if (prune_ext_comp_using_neighbors >= 3) return 1; int is_ref_match[2] = { 0 }; // 0 - match for forward refs // 1 - match for backward refs // Check if ref frames of this block matches with left neighbor. if (xd->left_available) match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match); // Check if ref frames of this block matches with above neighbor. if (xd->up_available) match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match); // Combine ref frame match with neighbors in forward and backward refs. const int track_ref_match = is_ref_match[0] + is_ref_match[1]; // Pruning based on ref frame match with neighbors. 
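  // Keep the extended compound mode only when enough of its references are
  // also used by the left/above neighbours (at least
  // prune_ext_comp_using_neighbors of the forward/backward directions match);
  // otherwise prune it.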
if (track_ref_match >= prune_ext_comp_using_neighbors) return 0; return 1; } // Update best single mode for the given reference frame based on simple rd. static inline void update_best_single_mode(InterModeSearchState *search_state, const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, int64_t this_rd) { if (this_rd < search_state->best_single_rd[ref_frame]) { search_state->best_single_rd[ref_frame] = this_rd; search_state->best_single_mode[ref_frame] = this_mode; } } // Prune compound mode using best single mode for the same reference. static inline int skip_compound_using_best_single_mode_ref( const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames, const PREDICTION_MODE *best_single_mode, int prune_comp_using_best_single_mode_ref) { // Exclude non-extended compound modes from pruning if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) return 0; assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV); const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode); // Get ref frame direction corresponding to NEWMV // 0 - NEWMV corresponding to forward direction // 1 - NEWMV corresponding to backward direction const int newmv_dir = comp_mode_ref0 != NEWMV; // Avoid pruning the compound mode when ref frame corresponding to NEWMV // have NEWMV as single mode winner. // Example: For an extended-compound mode, // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}} // - Ref frame corresponding to NEWMV is ALTREF_FRAME // - Avoid pruning this mode, if best single mode corresponding to ref frame // ALTREF_FRAME is NEWMV const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]]; if (single_mode == NEWMV) return 0; // Avoid pruning the compound mode when best single mode is not available if (prune_comp_using_best_single_mode_ref == 1) if (single_mode == MB_MODE_COUNT) return 0; return 1; } static int compare_int64(const void *a, const void *b) { int64_t a64 = *((int64_t *)a); int64_t b64 = *((int64_t *)b); if (a64 < b64) { return -1; } else if (a64 == b64) { return 0; } else { return 1; } } static inline void update_search_state( InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst, PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats, const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv, THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) { const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; const int skip_ctx = av1_get_skip_txfm_context(xd); const int skip_txfm = mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode); const TxfmSearchInfo *txfm_info = &x->txfm_search_info; search_state->best_rd = new_best_rd_stats->rdcost; search_state->best_mode_index = new_best_mode; *best_rd_stats_dst = *new_best_rd_stats; search_state->best_mbmode = *mbmi; search_state->best_skip2 = skip_txfm; search_state->best_mode_skippable = new_best_rd_stats->skip_txfm; // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and // rate_uv because av1_txfm_search process is replaced by rd estimation. // Therefore, we should avoid updating best_rate_y and best_rate_uv here. // These two values will be updated when av1_txfm_search is called. 
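// Note: the skip_txfm signaling cost is folded into best_rate_y below.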
if (txfm_search_done) { search_state->best_rate_y = new_best_rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx] [new_best_rd_stats->skip_txfm || skip_txfm]; search_state->best_rate_uv = new_best_rd_stats_uv->rate; } search_state->best_y_rdcost = *new_best_rd_stats_y; memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } // Find the best RD for a reference frame (among single reference modes) // and store +10% of it in the 0-th element in ref_frame_rd. static inline void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { assert(ref_frame_rd[0] == INT64_MAX); int64_t ref_copy[REF_FRAMES - 1]; memcpy(ref_copy, ref_frame_rd + 1, sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1)); qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64); int64_t cutoff = ref_copy[0]; // The cut-off is within 10% of the best. if (cutoff != INT64_MAX) { assert(cutoff < INT64_MAX / 200); cutoff = (110 * cutoff) / 100; } ref_frame_rd[0] = cutoff; } // Check if either frame is within the cutoff. static inline bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], MV_REFERENCE_FRAME frame1, MV_REFERENCE_FRAME frame2) { assert(frame2 > 0); return ref_frame_rd[frame1] <= ref_frame_rd[0] || ref_frame_rd[frame2] <= ref_frame_rd[0]; } static inline void evaluate_motion_mode_for_winner_candidates( const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, HandleInterModeArgs *const args, TileDataEnc *const tile_data, PICK_MODE_CONTEXT *const ctx, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], const motion_mode_best_st_candidate *const best_motion_mode_cands, int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd, InterModeSearchState *const search_state, int64_t *yrd) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; InterModesInfo *const inter_modes_info = x->inter_modes_info; const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand; for (int cand = 0; cand < num_best_cand; cand++) { RD_STATS rd_stats; RD_STATS rd_stats_y; RD_STATS rd_stats_uv; av1_init_rd_stats(&rd_stats); av1_init_rd_stats(&rd_stats_y); av1_init_rd_stats(&rd_stats_uv); int rate_mv; rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv; args->skip_motion_mode = best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode; *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi; rd_stats.rate = best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff; // Continue if the best candidate is compound. if (!is_inter_singleref_mode(mbmi->mode)) continue; x->txfm_search_info.skip_txfm = 0; struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Initialize motion mode to simple translation // Calculation of switchable rate depends on it. 
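// (SIMPLE_TRANSLATION is the first entry of the MOTION_MODE enum, i.e. 0.)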
mbmi->motion_mode = 0; const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } int64_t skip_rd[2] = { search_state->best_skip_rd[0], search_state->best_skip_rd[1] }; int64_t this_yrd = INT64_MAX; int64_t ret_value = motion_mode_rd( cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args, search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1, &this_yrd); if (ret_value != INT64_MAX) { rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); const THR_MODES mode_enum = get_prediction_mode_idx( mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Collect mode stats for multiwinner mode processing store_winner_mode_stats( &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, NULL, bsize, rd_stats.rdcost, cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); if (rd_stats.rdcost < search_state->best_rd) { *yrd = this_yrd; update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, x, do_tx_search); if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0]; } } } } /*!\cond */ // Arguments for speed feature pruning of inter mode search typedef struct { int *skip_motion_mode; mode_skip_mask_t *mode_skip_mask; InterModeSearchState *search_state; int skip_ref_frame_mask; int reach_first_comp_mode; int mode_thresh_mul_fact; int num_single_modes_processed; int prune_cpd_using_sr_stats_ready; } InterModeSFArgs; /*!\endcond */ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, int64_t *ref_frame_rd, int midx, InterModeSFArgs *args, int is_low_temp_var) { const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; const PREDICTION_MODE this_mode = mode_def->mode; const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; const int comp_pred = second_ref_frame > INTRA_FRAME; if (ref_frame == INTRA_FRAME) return 1; const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE && comp_pred) { return 1; } // This is for real time encoding. if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME && this_mode != NEARESTMV) return 1; // Check if this mode should be skipped because it is incompatible with the // current frame if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames)) return 1; const int ret = inter_mode_search_order_independent_skip( cpi, x, args->mode_skip_mask, args->search_state, args->skip_ref_frame_mask, this_mode, mode_def->ref_frame); if (ret == 1) return 1; *(args->skip_motion_mode) = (ret == 2); // We've reached the first compound prediction mode, get stats from the // single reference predictors to help with pruning. // Disable this pruning logic if interpolation filter search was skipped for // single prediction modes as it can result in aggressive pruning of compound // prediction modes due to the absence of modelled_rd populated by // av1_interpolation_filter_search(). 
// TODO(Remya): Check the impact of the sf // 'prune_comp_search_by_single_result' if compound prediction modes are // enabled in future for REALTIME encode. if (!sf->interp_sf.skip_interp_filter_search && sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred && args->reach_first_comp_mode == 0) { analyze_single_states(cpi, args->search_state); args->reach_first_comp_mode = 1; } // Prune aggressively when best mode is skippable. int mul_fact = args->search_state->best_mode_skippable ? args->mode_thresh_mul_fact : (1 << MODE_THRESH_QBITS); int64_t mode_threshold = (args->search_state->mode_threshold[mode_enum] * mul_fact) >> MODE_THRESH_QBITS; if (args->search_state->best_rd < mode_threshold) return 1; // Skip this compound mode based on the RD results from the single prediction // modes if (!sf->interp_sf.skip_interp_filter_search && sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) { if (compound_skip_by_single_states(cpi, args->search_state, this_mode, ref_frame, second_ref_frame, x)) return 1; } if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) { // After we done with single reference modes, find the 2nd best RD // for a reference frame. Only search compound modes that have a reference // frame at least as good as the 2nd best. if (!args->prune_cpd_using_sr_stats_ready && args->num_single_modes_processed == NUM_SINGLE_REF_MODES) { find_top_ref(ref_frame_rd); args->prune_cpd_using_sr_stats_ready = 1; } if (args->prune_cpd_using_sr_stats_ready && !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) return 1; } // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes if (sf->inter_sf.skip_ext_comp_nearmv_mode && (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) { return 1; } if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) { if (compound_skip_using_neighbor_refs( xd, this_mode, ref_frames, sf->inter_sf.prune_ext_comp_using_neighbors)) return 1; } if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) { if (skip_compound_using_best_single_mode_ref( this_mode, ref_frames, args->search_state->best_single_mode, sf->inter_sf.prune_comp_using_best_single_mode_ref)) return 1; } if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) { const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); if (skip_nearest_near_mv_using_refmv_weight( x, this_mode, ref_frame_type, args->search_state->best_mbmode.mode)) { // Ensure the mode is pruned only when the current block has obtained a // valid inter mode. 
assert(is_inter_mode(args->search_state->best_mbmode.mode)); return 1; } } if (sf->rt_sf.prune_inter_modes_with_golden_ref && ref_frame == GOLDEN_FRAME && !comp_pred) { const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL); if (cpi->rc.frames_since_golden > (subgop_size >> 2) && args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) { if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV) return 1; } } return 0; } static void record_best_compound(REFERENCE_MODE reference_mode, RD_STATS *rd_stats, int comp_pred, int rdmult, InterModeSearchState *search_state, int compmode_cost) { int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; if (reference_mode == REFERENCE_MODE_SELECT) { single_rate = rd_stats->rate - compmode_cost; hybrid_rate = rd_stats->rate; } else { single_rate = rd_stats->rate; hybrid_rate = rd_stats->rate + compmode_cost; } single_rd = RDCOST(rdmult, single_rate, rd_stats->dist); hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist); if (!comp_pred) { if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE]) search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd; } else { if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE]) search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd; } if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT]) search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } // Does a transform search over a list of the best inter mode candidates. // This is called if the original mode search computed an RD estimate // for the transform search rather than doing a full search. static void tx_search_best_inter_candidates( AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int64_t best_rd_so_far, BLOCK_SIZE bsize, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col, InterModeSearchState *search_state, RD_STATS *rd_cost, PICK_MODE_CONTEXT *ctx, int64_t *yrd) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TxfmSearchInfo *txfm_info = &x->txfm_search_info; const ModeCosts *mode_costs = &x->mode_costs; const int num_planes = av1_num_planes(cm); const int skip_ctx = av1_get_skip_txfm_context(xd); MB_MODE_INFO *const mbmi = xd->mi[0]; InterModesInfo *inter_modes_info = x->inter_modes_info; inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); search_state->best_rd = best_rd_so_far; search_state->best_mode_index = THR_INVALID; // Initialize best mode stats for winner mode processing x->winner_mode_count = 0; store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, best_rd_so_far, cpi->sf.winner_mode_sf.multi_winner_mode_type, 0); inter_modes_info->num = inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search ? inter_modes_info->num : cpi->sf.rt_sf.num_inter_modes_for_tx_search; const int64_t top_est_rd = inter_modes_info->num > 0 ? inter_modes_info ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] : INT64_MAX; *yrd = INT64_MAX; int64_t best_rd_in_this_partition = INT64_MAX; int num_inter_mode_cands = inter_modes_info->num; int newmv_mode_evaled = 0; int max_allowed_cands = INT_MAX; if (cpi->sf.inter_sf.limit_inter_mode_cands) { // The bound on the no. of inter mode candidates, beyond which the // candidates are limited if a newmv mode got evaluated, is set as // max_allowed_cands + 1. 
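// num_allowed_cands[] is indexed by the limit_inter_mode_cands speed feature
// level (0..4); level 0 (INT_MAX) imposes no limit.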
const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 }; assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4); max_allowed_cands = num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands]; } int num_mode_thresh = INT_MAX; if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) { // Bound the no. of transform searches per prediction mode beyond a // threshold. const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 }; assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3); num_mode_thresh = num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode]; } int num_tx_cands = 0; int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 }; // Iterate over best inter mode candidates and perform tx search for (int j = 0; j < num_inter_mode_cands; ++j) { const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; *mbmi = inter_modes_info->mbmi_arr[data_idx]; const PREDICTION_MODE prediction_mode = mbmi->mode; int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; if (curr_est_rd * 0.80 > top_est_rd) break; if (num_tx_cands > num_mode_thresh) { if ((prediction_mode != NEARESTMV && num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) || (prediction_mode == NEARESTMV && num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2)) continue; } txfm_info->skip_txfm = 0; set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Select prediction reference frames. const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; for (int i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; } bool is_predictor_built = false; // Initialize RD stats RD_STATS rd_stats; RD_STATS rd_stats_y; RD_STATS rd_stats_uv; const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; int64_t skip_rd = INT64_MAX; const int txfm_rd_gate_level = get_txfm_rd_gate_level( cm->seq_params->enable_masked_compound, cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT, /*eval_motion_mode=*/0); if (txfm_rd_gate_level) { // Check if the mode is good enough based on skip RD int64_t curr_sse = inter_modes_info->sse_arr[data_idx]; skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse); int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0], skip_rd, txfm_rd_gate_level, 0); if (!eval_txfm) continue; } // Build the prediction for this mode if (!is_predictor_built) { av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); } if (mbmi->motion_mode == OBMC_CAUSAL) { av1_build_obmc_inter_predictors_sb(cm, xd); } num_tx_cands++; if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1; num_tx_search_modes[prediction_mode - INTER_MODE_START]++; int64_t this_yrd = INT64_MAX; // Do the transform search if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_rate, search_state->best_rd)) { continue; } else { const int y_rate = rd_stats.skip_txfm ? 
mode_costs->skip_txfm_cost[skip_ctx][1] : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]); this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist); if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { inter_mode_data_push( tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist, rd_stats_y.rate + rd_stats_uv.rate + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); } } rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); if (rd_stats.rdcost < best_rd_in_this_partition) { best_rd_in_this_partition = rd_stats.rdcost; *yrd = this_yrd; } const THR_MODES mode_enum = get_prediction_mode_idx( prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Collect mode stats for multiwinner mode processing const int txfm_search_done = 1; store_winner_mode_stats( &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, NULL, bsize, rd_stats.rdcost, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); if (rd_stats.rdcost < search_state->best_rd) { update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, x, txfm_search_done); search_state->best_skip_rd[0] = skip_rd; // Limit the total number of modes to be evaluated if the first is valid // and transform skip or compound if (cpi->sf.inter_sf.inter_mode_txfm_breakout) { if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) { // Evaluate more candidates at high quantizers where occurrence of // transform skip is high. const int max_cands_cap[5] = { 2, 3, 5, 7, 9 }; const int qindex_band = (5 * x->qindex) >> QINDEX_BITS; num_inter_mode_cands = AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num); } else if (!j && has_second_ref(&search_state->best_mbmode)) { const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1; // Evaluate more candidates at low quantizers where occurrence of // single reference mode is high. const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 }, { 10, 7, 5, 3 } }; const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS; num_inter_mode_cands = AOMMIN( max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num); } } } // If the number of candidates evaluated exceeds max_allowed_cands, break if // a newmv mode was evaluated already. if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break; } } // Indicates number of winner simple translation modes to be used static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 }; // Adds a motion mode to the candidate list for motion_mode_for_winner_cand // speed feature. This list consists of modes that have only searched // SIMPLE_TRANSLATION. The final list will be used to search other motion // modes after the initial RD search. 
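// The candidate list is kept sorted by rd_cost in ascending order: a new
// candidate is inserted at its sorted position, later entries are shifted
// down, and the last entry is dropped once the list is full.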
static void handle_winner_cand( MB_MODE_INFO *const mbmi, motion_mode_best_st_candidate *best_motion_mode_cands, int max_winner_motion_mode_cand, int64_t this_rd, motion_mode_candidate *motion_mode_cand, int skip_motion_mode) { // Number of current motion mode candidates in list const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand; int valid_motion_mode_cand_loc = num_motion_mode_cand; // find the best location to insert new motion mode candidate for (int j = 0; j < num_motion_mode_cand; j++) { if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) { valid_motion_mode_cand_loc = j; break; } } // Insert motion mode if location is found if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) { if (num_motion_mode_cand > 0 && valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1) memmove( &best_motion_mode_cands ->motion_mode_cand[valid_motion_mode_cand_loc + 1], &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc], (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) - valid_motion_mode_cand_loc) * sizeof(best_motion_mode_cands->motion_mode_cand[0])); motion_mode_cand->mbmi = *mbmi; motion_mode_cand->rd_cost = this_rd; motion_mode_cand->skip_motion_mode = skip_motion_mode; best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] = *motion_mode_cand; best_motion_mode_cands->num_motion_mode_cand = AOMMIN(max_winner_motion_mode_cand, best_motion_mode_cands->num_motion_mode_cand + 1); } } /*!\brief Search intra modes in interframes * * \ingroup intra_mode_search * * This function searches for the best intra mode when the current frame is an * interframe. This function however does *not* handle luma palette mode. * Palette mode is currently handled by \ref av1_search_palette_mode. * * This function will first iterate through the luma mode candidates to find the * best luma intra mode. Once the best luma mode it's found, it will then search * for the best chroma mode. Because palette mode is currently not handled by * here, a cache of uv mode is stored in * InterModeSearchState::intra_search_state so it can be reused later by \ref * av1_search_palette_mode. * * \param[in,out] search_state Struct keep track of the prediction mode * search state in interframe. * * \param[in] cpi Top-level encoder structure. * \param[in,out] x Pointer to struct holding all the data for * the current prediction block. * \param[out] rd_cost Stores the best rd_cost among all the * prediction modes searched. * \param[in] bsize Current block size. * \param[in,out] ctx Structure to hold the number of 4x4 blks to * copy the tx_type and txfm_skip arrays. * for only the Y plane. * \param[in] sf_args Stores the list of intra mode candidates * to be searched. * \param[in] intra_ref_frame_cost The entropy cost for signaling that the * current ref frame is an intra frame. * \param[in] yrd_threshold The rdcost threshold for luma intra mode to * terminate chroma intra mode search. * * \remark If a new best mode is found, search_state and rd_costs are updated * correspondingly. While x is also modified, it is only used as a temporary * buffer, and the final decisions are stored in search_state. 
*/ static inline void search_intra_modes_in_interframe( InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost, int64_t yrd_threshold) { const AV1_COMMON *const cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; IntraModeSearchState *intra_search_state = &search_state->intra_search_state; int is_best_y_mode_intra = 0; RD_STATS best_intra_rd_stats_y; int64_t best_rd_y = INT64_MAX; int best_mode_cost_y = -1; MB_MODE_INFO best_mbmi = *xd->mi[0]; THR_MODES best_mode_enum = THR_INVALID; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const int num_4x4 = bsize_to_num_blk(bsize); // Performs luma search int64_t best_model_rd = INT64_MAX; int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { top_intra_model_rd[i] = INT64_MAX; } for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) { if (sf->intra_sf.skip_intra_in_interframe && search_state->intra_search_state.skip_intra_modes) break; set_y_mode_and_delta_angle( mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra); assert(mbmi->mode < INTRA_MODE_END); // Use intra_y_mode_mask speed feature to skip intra mode evaluation. if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode)) continue; const THR_MODES mode_enum = get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME); if ((!intra_mode_cfg->enable_smooth_intra || cpi->sf.intra_sf.disable_smooth_intra) && (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED)) continue; if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; if (av1_is_directional_mode(mbmi->mode) && !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && mbmi->angle_delta[PLANE_TYPE_Y] != 0) continue; const PREDICTION_MODE this_mode = mbmi->mode; assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm); x->txfm_search_info.skip_txfm = 0; if (this_mode != DC_PRED) { // Only search the oblique modes if the best so far is // one of the neighboring directional modes if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { if (search_state->best_mode_index != THR_INVALID && search_state->best_mbmode.ref_frame[0] > INTRA_FRAME) continue; } if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { if (conditional_skipintra( this_mode, search_state->intra_search_state.best_intra_mode)) continue; } } RD_STATS intra_rd_stats_y; int mode_cost_y; int64_t intra_rd_y = INT64_MAX; const int is_luma_result_valid = av1_handle_intra_y_mode( intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y, &best_model_rd, top_intra_model_rd); if (is_luma_result_valid && intra_rd_y < yrd_threshold) { is_best_y_mode_intra = 1; if (intra_rd_y < best_rd_y) { best_intra_rd_stats_y = intra_rd_stats_y; best_mode_cost_y = mode_cost_y; best_rd_y = intra_rd_y; best_mbmi = *mbmi; best_mode_enum = mode_enum; memcpy(best_blk_skip, x->txfm_search_info.blk_skip, 
sizeof(best_blk_skip[0]) * num_4x4); av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4); } } } if (!is_best_y_mode_intra) { return; } assert(best_rd_y < INT64_MAX); // Restores the best luma mode *mbmi = best_mbmi; memcpy(x->txfm_search_info.blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * num_4x4); av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4); // Performs chroma search RD_STATS intra_rd_stats, intra_rd_stats_uv; av1_init_rd_stats(&intra_rd_stats); av1_init_rd_stats(&intra_rd_stats_uv); const int num_planes = av1_num_planes(cm); if (num_planes > 1) { const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe( intra_search_state, cpi, x, bsize, &intra_rd_stats, &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd); if (!intra_uv_mode_valid) { return; } } // Merge the luma and chroma rd stats assert(best_mode_cost_y >= 0); intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y; if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size // in the tokenonly rate, but for intra blocks, tx_size is always coded // (prediction granularity), so we account for it in the full rate, // not the tokenonly rate. best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size); } const ModeCosts *mode_costs = &x->mode_costs; const PREDICTION_MODE mode = mbmi->mode; if (num_planes > 1 && xd->is_chroma_ref) { const int uv_mode_cost = mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode]; intra_rd_stats.rate += intra_rd_stats_uv.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); } // Intra block is always coded as non-skip intra_rd_stats.skip_txfm = 0; intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist; // Add in the cost of the no skip flag. const int skip_ctx = av1_get_skip_txfm_context(xd); intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; // Calculate the final RD estimate for this mode. const int64_t this_rd = RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist); // Keep record of best intra rd if (this_rd < search_state->best_intra_rd) { search_state->best_intra_rd = this_rd; intra_search_state->best_intra_mode = mode; } for (int i = 0; i < REFERENCE_MODES; ++i) { search_state->best_pred_rd[i] = AOMMIN(search_state->best_pred_rd[i], this_rd); } intra_rd_stats.rdcost = this_rd; // Collect mode stats for multiwinner mode processing const int txfm_search_done = 1; store_winner_mode_stats( &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y, &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); if (intra_rd_stats.rdcost < search_state->best_rd) { update_search_state(search_state, rd_cost, ctx, &intra_rd_stats, &best_intra_rd_stats_y, &intra_rd_stats_uv, best_mode_enum, x, txfm_search_done); } } #if !CONFIG_REALTIME_ONLY // Prepare inter_cost and intra_cost from TPL stats, which are used as ML // features in intra mode pruning. static inline void calculate_cost_from_tpl_data(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int64_t *inter_cost, int64_t *intra_cost) { const AV1_COMMON *const cm = &cpi->common; // Only consider full SB. 
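// The TPL inter/intra costs accumulated below are averaged over the nw * nh
// TPL blocks covering the current block.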
const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d; const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * (block_size_high[sb_size] / tpl_bsize_1d); SuperBlockEnc *sb_enc = &x->sb_enc; if (sb_enc->tpl_data_count == len) { const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); const int tpl_stride = sb_enc->tpl_stride; const int tplw = mi_size_wide[tpl_bsize]; const int tplh = mi_size_high[tpl_bsize]; const int nw = mi_size_wide[bsize] / tplw; const int nh = mi_size_high[bsize] / tplh; if (nw >= 1 && nh >= 1) { const int of_h = mi_row % mi_size_high[sb_size]; const int of_w = mi_col % mi_size_wide[sb_size]; const int start = of_h / tplh * tpl_stride + of_w / tplw; for (int k = 0; k < nh; k++) { for (int l = 0; l < nw; l++) { *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; } } *inter_cost /= nw * nh; *intra_cost /= nw * nh; } } } #endif // !CONFIG_REALTIME_ONLY // When the speed feature skip_intra_in_interframe > 0, enable ML model to prune // intra mode search. static inline void skip_intra_modes_in_interframe( AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize, InterModeSearchState *search_state, const SPEED_FEATURES *const sf, int64_t inter_cost, int64_t intra_cost) { MACROBLOCKD *const xd = &x->e_mbd; const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME; if (sf->rt_sf.prune_intra_mode_based_on_mv_range && bsize > sf->part_sf.max_intra_bsize && !comp_pred) { const MV best_mv = search_state->best_mbmode.mv[0].as_mv; const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range; if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh && x->source_variance > 128) { search_state->intra_search_state.skip_intra_modes = 1; return; } } const unsigned int src_var_thresh_intra_skip = 1; const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe; if (!(skip_intra_in_interframe && (x->source_variance > src_var_thresh_intra_skip))) return; // Prune intra search based on best inter mode being transfrom skip. if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) { const int qindex_thresh[2] = { 200, MAXQ }; const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0; if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) && (x->qindex <= qindex_thresh[ind])) { search_state->intra_search_state.skip_intra_modes = 1; return; } else if ((skip_intra_in_interframe >= 4) && (inter_cost < 0 || intra_cost < 0)) { search_state->intra_search_state.skip_intra_modes = 1; return; } } // Use ML model to prune intra search. if (inter_cost >= 0 && intra_cost >= 0) { const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) ? &av1_intrap_nn_config : &av1_intrap_hd_nn_config; float nn_features[6]; float scores[2] = { 0.0f }; nn_features[0] = (float)search_state->best_mbmode.skip_txfm; nn_features[1] = (float)mi_size_wide_log2[bsize]; nn_features[2] = (float)mi_size_high_log2[bsize]; nn_features[3] = (float)intra_cost; nn_features[4] = (float)inter_cost; const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); nn_features[5] = (float)(ac_q_max / ac_q); av1_nn_predict(nn_features, nn_config, 1, scores); // For two parameters, the max prob returned from av1_nn_softmax equals // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the // calling of av1_nn_softmax. 
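// thresh[] is indexed by (skip_intra_in_interframe - 1); e.g. at level 2,
// intra modes are skipped only when scores[1] exceeds scores[0] by more
// than 1.4.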
const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f }; assert(skip_intra_in_interframe <= 5); if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) { search_state->intra_search_state.skip_intra_modes = 1; } } } static inline bool skip_interp_filter_search(const AV1_COMP *cpi, int is_single_pred) { const MODE encoding_mode = cpi->oxcf.mode; if (encoding_mode == REALTIME) { return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE && (cpi->sf.interp_sf.skip_interp_filter_search || cpi->sf.winner_mode_sf.winner_mode_ifs)); } else if (encoding_mode == GOOD) { // Skip interpolation filter search for single prediction modes. return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred); } return false; } static inline int get_block_temp_var(const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION || !sf->rt_sf.short_circuit_low_temp_var || !sf->rt_sf.prune_inter_modes_using_temp_var) { return 0; } const int mi_row = x->e_mbd.mi_row; const int mi_col = x->e_mbd.mi_col; int is_low_temp_var = 0; if (cm->seq_params->sb_size == BLOCK_64X64) is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb( &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); else is_low_temp_var = av1_get_force_skip_low_temp_var( &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); return is_low_temp_var; } // TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb. void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { AV1_COMMON *const cm = &cpi->common; const FeatureFlags *const features = &cm->features; const int num_planes = av1_num_planes(cm); const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; const ModeCosts *mode_costs = &x->mode_costs; const int *comp_inter_cost = mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)]; InterModeSearchState search_state; init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far); INTERINTRA_MODE interintra_modes[REF_FRAMES] = { INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES }; HandleInterModeArgs args = { { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, NULL, NULL, NULL, search_state.modelled_rd, INT_MAX, INT_MAX, search_state.simple_rd, 0, false, interintra_modes, { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } }, { { 0, 0 } }, { 0 }, 0, 0, -1, -1, -1, { 0 }, { 0 }, UINT_MAX }; // Currently, is_low_temp_var is used in real time encoding. const int is_low_temp_var = get_block_temp_var(cpi, x, bsize); for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1; // Indicates the appropriate number of simple translation winner modes for // exhaustive motion mode evaluation const int max_winner_motion_mode_cand = num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand]; assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES); motion_mode_candidate motion_mode_cand; motion_mode_best_st_candidate best_motion_mode_cands; // Initializing the number of motion mode candidates to zero. 
best_motion_mode_cands.num_motion_mode_cand = 0; for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i) best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX; for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; av1_invalid_rd_stats(rd_cost); for (i = 0; i < REF_FRAMES; ++i) { x->warp_sample_info[i].num = -1; } // Ref frames that are selected by square partition blocks. int picked_ref_frames_mask = 0; if (sf->inter_sf.prune_ref_frame_for_rect_partitions && mbmi->partition != PARTITION_NONE) { // prune_ref_frame_for_rect_partitions = 1 implies prune only extended // partition blocks. prune_ref_frame_for_rect_partitions >=2 // implies prune for vert, horiz and extended partition blocks. if ((mbmi->partition != PARTITION_VERT && mbmi->partition != PARTITION_HORZ) || sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) { picked_ref_frames_mask = fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size); } } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, set_params_rd_pick_inter_mode_time); #endif // Skip ref frames that never selected by square blocks. const int skip_ref_frame_mask = picked_ref_frames_mask ? ~picked_ref_frames_mask : 0; mode_skip_mask_t mode_skip_mask; unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; // init params, set frame modes, speed features set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask, skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, set_params_rd_pick_inter_mode_time); #endif int64_t best_est_rd = INT64_MAX; const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; // If do_tx_search is 0, only estimated RD should be computed. // If do_tx_search is 1, all modes have TX search performed. const int do_tx_search = !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) || (sf->inter_sf.inter_mode_rd_model_estimation == 2 && num_pels_log2_lookup[bsize] > 8)); InterModesInfo *inter_modes_info = x->inter_modes_info; inter_modes_info->num = 0; // Temporary buffers used by handle_inter_mode(). uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); // The best RD found for the reference frame, among single reference modes. // Note that the 0-th element will contain a cut-off that is later used // to determine if we should skip a compound mode. int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX }; // Prepared stats used later to check if we could skip intra mode eval. int64_t inter_cost = -1; int64_t intra_cost = -1; // Need to tweak the threshold for hdres speed 0 & 1. const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; // Obtain the relevant tpl stats for pruning inter modes PruneInfoFromTpl inter_cost_info_from_tpl; #if !CONFIG_REALTIME_ONLY if (sf->inter_sf.prune_inter_modes_based_on_tpl) { // x->tpl_keep_ref_frame[id] = 1 => no pruning in // prune_ref_by_selective_ref_frame() // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in // prune_ref_by_selective_ref_frame() // Populating valid_refs[idx] = 1 ensures that // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a // pruned ref frame. 
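// valid_refs[] holds one entry per inter reference frame; entry (frame - 1)
// corresponds to 'frame', starting from LAST_FRAME.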
int valid_refs[INTER_REFS_PER_FRAME]; for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) { const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME }; valid_refs[frame - 1] = x->tpl_keep_ref_frame[frame] || !prune_ref_by_selective_ref_frame( cpi, x, refs, cm->cur_frame->ref_display_order_hint); } av1_zero(inter_cost_info_from_tpl); get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs, &inter_cost_info_from_tpl); } const int do_pruning = (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; if (do_pruning && sf->intra_sf.skip_intra_in_interframe && cpi->oxcf.algo_cfg.enable_tpl_model) calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost, &intra_cost); #endif // !CONFIG_REALTIME_ONLY // Initialize best mode stats for winner mode processing. const int max_winner_mode_count = winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); x->winner_mode_count = 0; store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, best_rd_so_far, sf->winner_mode_sf.multi_winner_mode_type, 0); int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS); if (sf->inter_sf.prune_inter_modes_if_skippable) { // Higher multiplication factor values for lower quantizers. mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex]; } // Initialize arguments for mode loop speed features InterModeSFArgs sf_args = { &args.skip_motion_mode, &mode_skip_mask, &search_state, skip_ref_frame_mask, 0, mode_thresh_mul_fact, 0, 0 }; int64_t best_inter_yrd = INT64_MAX; // This is the main loop of this function. It loops over all possible inter // modes and calls handle_inter_mode() to compute the RD for each. // Here midx is just an iterator index that should not be used by itself // except to keep track of the number of modes searched. It should be used // with av1_default_mode_order to get the enum that defines the mode, which // can be used with av1_mode_defs to get the prediction mode and the ref // frames. // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings // good speedup for real time case. If we decide to use compound mode in real // time, maybe we can modify av1_default_mode_order table. 
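// When the frame header signals SINGLE_REFERENCE, restrict the mode loop
// below to the single-reference portion of av1_default_mode_order.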
THR_MODES mode_start = THR_INTER_MODE_START; THR_MODES mode_end = THR_INTER_MODE_END; const CurrentFrame *const current_frame = &cm->current_frame; if (current_frame->reference_mode == SINGLE_REFERENCE) { mode_start = SINGLE_REF_MODE_START; mode_end = SINGLE_REF_MODE_END; } for (THR_MODES midx = mode_start; midx < mode_end; ++midx) { // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; const PREDICTION_MODE this_mode = mode_def->mode; const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; const int is_single_pred = ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME; const int comp_pred = second_ref_frame > INTRA_FRAME; init_mbmi(mbmi, this_mode, ref_frames, cm); txfm_info->skip_txfm = 0; sf_args.num_single_modes_processed += is_single_pred; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, skip_inter_mode_time); #endif // Apply speed features to decide if this inter mode can be skipped const int is_skip_inter_mode = skip_inter_mode( cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, skip_inter_mode_time); #endif if (is_skip_inter_mode) continue; // Select prediction reference frames. for (i = 0; i < num_planes; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } mbmi->angle_delta[PLANE_TYPE_Y] = 0; mbmi->angle_delta[PLANE_TYPE_UV] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->ref_mv_idx = 0; const int64_t ref_best_rd = search_state.best_rd; RD_STATS rd_stats, rd_stats_y, rd_stats_uv; av1_init_rd_stats(&rd_stats); const int ref_frame_cost = comp_pred ? ref_costs_comp[ref_frame][second_ref_frame] : ref_costs_single[ref_frame]; const int compmode_cost = is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0; const int real_compmode_cost = cm->current_frame.reference_mode == REFERENCE_MODE_SELECT ? 
compmode_cost : 0; // Point to variables that are maintained between loop iterations args.single_newmv = search_state.single_newmv; args.single_newmv_rate = search_state.single_newmv_rate; args.single_newmv_valid = search_state.single_newmv_valid; args.single_comp_cost = real_compmode_cost; args.ref_frame_cost = ref_frame_cost; args.best_pred_sse = search_state.best_pred_sse; args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred); int64_t skip_rd[2] = { search_state.best_skip_rd[0], search_state.best_skip_rd[1] }; int64_t this_yrd = INT64_MAX; #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, handle_inter_mode_time); #endif int64_t this_rd = handle_inter_mode( cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl, &this_yrd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, handle_inter_mode_time); #endif if (current_frame->reference_mode != SINGLE_REFERENCE) { if (!args.skip_ifs && sf->inter_sf.prune_comp_search_by_single_result > 0 && is_inter_singleref_mode(this_mode)) { collect_single_states(x, &search_state, mbmi); } if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 && is_inter_singleref_mode(this_mode)) update_best_single_mode(&search_state, this_mode, ref_frame, this_rd); } if (this_rd == INT64_MAX) continue; if (mbmi->skip_txfm) { rd_stats_y.rate = 0; rd_stats_uv.rate = 0; } if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred && this_rd < ref_frame_rd[ref_frame]) { ref_frame_rd[ref_frame] = this_rd; } // Did this mode help, i.e., is it the new best mode if (this_rd < search_state.best_rd) { assert(IMPLIES(comp_pred, cm->current_frame.reference_mode != SINGLE_REFERENCE)); search_state.best_pred_sse = x->pred_sse[ref_frame]; best_inter_yrd = this_yrd; update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, x, do_tx_search); if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0]; // skip_rd[0] is the best total rd for a skip mode so far. // skip_rd[1] is the best total rd for a skip mode so far in luma. // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated. // When do_tx_search = 0, skip_rd[1] is updated. search_state.best_skip_rd[1] = skip_rd[1]; } if (sf->winner_mode_sf.motion_mode_for_winner_cand) { // Add this mode to motion mode candidate list for motion mode search // if using motion_mode_for_winner_cand speed feature handle_winner_cand(mbmi, &best_motion_mode_cands, max_winner_motion_mode_cand, this_rd, &motion_mode_cand, args.skip_motion_mode); } /* keep record of best compound/single-only prediction */ record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred, x->rdmult, &search_state, compmode_cost); } #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time); #endif if (sf->winner_mode_sf.motion_mode_for_winner_cand) { // For the single ref winner candidates, evaluate other motion modes (non // simple translation). 
evaluate_motion_mode_for_winner_candidates( cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb, &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd, &search_state, &best_inter_yrd); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time); #endif #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, do_tx_search_time); #endif if (do_tx_search != 1) { // A full tx search has not yet been done, do tx search for // top mode candidates tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize, yv12_mb, mi_row, mi_col, &search_state, rd_cost, ctx, &best_inter_yrd); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, do_tx_search_time); #endif #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, handle_intra_mode_time); #endif // Gate intra mode evaluation if best of inter is skip except when source // variance is extremely low and also based on max intra bsize. skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost, intra_cost); const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx, &sf_args, intra_ref_frame_cost, best_inter_yrd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, handle_intra_mode_time); #endif #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, refine_winner_mode_tx_time); #endif int winner_mode_count = sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1; // In effect only when fast tx search speed features are enabled. refine_winner_mode_tx( cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, refine_winner_mode_tx_time); #endif // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); // Only try palette mode when the best mode so far is an intra mode. 
const int try_palette = cpi->oxcf.tool_cfg.enable_palette && av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) && !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX; RD_STATS this_rd_cost; int this_skippable = 0; if (try_palette) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_search_palette_mode_time); #endif this_skippable = av1_search_palette_mode( &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, &this_rd_cost, search_state.best_rd); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_search_palette_mode_time); #endif if (this_rd_cost.rdcost < search_state.best_rd) { search_state.best_mode_index = THR_DC; mbmi->mv[0].as_int = 0; rd_cost->rate = this_rd_cost.rate; rd_cost->dist = this_rd_cost.dist; rd_cost->rdcost = this_rd_cost.rdcost; search_state.best_rd = rd_cost->rdcost; search_state.best_mbmode = *mbmi; search_state.best_skip2 = 0; search_state.best_mode_skippable = this_skippable; memcpy(ctx->blk_skip, txfm_info->blk_skip, sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); } } search_state.best_mbmode.skip_mode = 0; if (cm->current_frame.skip_mode_info.skip_mode_flag && is_comp_ref_allowed(bsize)) { const struct segmentation *const seg = &cm->seg; unsigned char segment_id = mbmi->segment_id; if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb); } } // Make sure that the ref_mv_idx is only nonzero when we're // using a mode which can support ref_mv_idx if (search_state.best_mbmode.ref_mv_idx != 0 && !(search_state.best_mbmode.mode == NEWMV || search_state.best_mbmode.mode == NEW_NEWMV || have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { search_state.best_mbmode.ref_mv_idx = 0; } if (search_state.best_mode_index == THR_INVALID || search_state.best_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; } const InterpFilter interp_filter = features->interp_filter; assert((interp_filter == SWITCHABLE) || (interp_filter == search_state.best_mbmode.interp_filters.as_filters.y_filter) || !is_inter_block(&search_state.best_mbmode)); assert((interp_filter == SWITCHABLE) || (interp_filter == search_state.best_mbmode.interp_filters.as_filters.x_filter) || !is_inter_block(&search_state.best_mbmode)); if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) { av1_update_rd_thresh_fact( cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize, search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES); } // macroblock modes *mbmi = search_state.best_mbmode; txfm_info->skip_txfm |= search_state.best_skip2; // Note: this section is needed since the mode may have been forced to // GLOBALMV by the all-zero mode handling of ref-mv. 
if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { // Correct the interp filters for GLOBALMV if (is_nontrans_global_motion(xd, xd->mi[0])) { int_interpfilters filters = av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); assert(mbmi->interp_filters.as_int == filters.as_int); (void)filters; } } txfm_info->skip_txfm |= search_state.best_mode_skippable; assert(search_state.best_mode_index != THR_INVALID); #if CONFIG_INTERNAL_STATS store_coding_context(x, ctx, search_state.best_mode_index, search_state.best_mode_skippable); #else store_coding_context(x, ctx, search_state.best_mode_skippable); #endif // CONFIG_INTERNAL_STATS if (mbmi->palette_mode_info.palette_size[1] > 0) { assert(try_palette); av1_restore_uv_color_map(cpi, x); } } void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { const AV1_COMMON *const cm = &cpi->common; const FeatureFlags *const features = &cm->features; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; unsigned char segment_id = mbmi->segment_id; const int comp_pred = 0; int i; unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; const ModeCosts *mode_costs = &x->mode_costs; const int *comp_inter_cost = mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)]; InterpFilter best_filter = SWITCHABLE; int64_t this_rd = INT64_MAX; int rate2 = 0; const int64_t distortion2 = 0; (void)mi_row; (void)mi_col; (void)tile_data; av1_collect_neighbors_ref_counts(xd); estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single, ref_costs_comp); for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; rd_cost->rate = INT_MAX; assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); mbmi->palette_mode_info.palette_size[0] = 0; mbmi->palette_mode_info.palette_size[1] = 0; mbmi->filter_intra_mode_info.use_filter_intra = 0; mbmi->mode = GLOBALMV; mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->uv_mode = UV_DC_PRED; if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); else mbmi->ref_frame[0] = LAST_FRAME; mbmi->ref_frame[1] = NONE_FRAME; mbmi->mv[0].as_int = gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], features->allow_high_precision_mv, bsize, mi_col, mi_row, features->cur_frame_force_integer_mv) .as_int; mbmi->tx_size = max_txsize_lookup[bsize]; x->txfm_search_info.skip_txfm = 1; mbmi->ref_mv_idx = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; av1_count_overlappable_neighbors(cm, xd); if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); // Select the samples according to motion vector difference if (mbmi->num_proj_ref > 1) { mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); } } const InterpFilter interp_filter = features->interp_filter; set_default_interp_filters(mbmi, interp_filter); if (interp_filter != SWITCHABLE) { best_filter = interp_filter; } else { best_filter = EIGHTTAP_REGULAR; if (av1_is_interp_needed(xd)) { int rs; int best_rs = INT_MAX; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { mbmi->interp_filters = av1_broadcast_interp_filter(i); rs = 
av1_get_switchable_rate(x, xd, interp_filter, cm->seq_params->enable_dual_filter); if (rs < best_rs) { best_rs = rs; best_filter = mbmi->interp_filters.as_filters.y_filter; } } } } // Set the appropriate filter mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); rate2 += av1_get_switchable_rate(x, xd, interp_filter, cm->seq_params->enable_dual_filter); if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) rate2 += comp_inter_cost[comp_pred]; // Estimate the reference frame signaling cost and add it // to the rolling cost variable. rate2 += ref_costs_single[LAST_FRAME]; this_rd = RDCOST(x->rdmult, rate2, distortion2); rd_cost->rate = rate2; rd_cost->dist = distortion2; rd_cost->rdcost = this_rd; if (this_rd >= best_rd_so_far) { rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; } assert((interp_filter == SWITCHABLE) || (interp_filter == mbmi->interp_filters.as_filters.y_filter)); if (cpi->sf.inter_sf.adaptive_rd_thresh) { av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, cpi->sf.inter_sf.adaptive_rd_thresh, bsize, THR_GLOBALMV, THR_INTER_MODE_START, THR_INTER_MODE_END, THR_DC, MAX_MODES); } #if CONFIG_INTERNAL_STATS store_coding_context(x, ctx, THR_GLOBALMV, 0); #else store_coding_context(x, ctx, 0); #endif // CONFIG_INTERNAL_STATS } /*!\cond */ struct calc_target_weighted_pred_ctxt { const OBMCBuffer *obmc_buffer; const uint8_t *tmp; int tmp_stride; int overlap; }; /*!\endcond */ static inline void calc_target_weighted_pred_above( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { (void)nb_mi; (void)num_planes; (void)rel_mi_row; (void)dir; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; const int bw = xd->width << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE); int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE); const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; const int is_hbd = is_cur_buf_hbd(xd); if (!is_hbd) { for (int row = 0; row < ctxt->overlap; ++row) { const uint8_t m0 = mask1d[row]; const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { wsrc[col] = m1 * tmp[col]; mask[col] = m0; } wsrc += bw; mask += bw; tmp += ctxt->tmp_stride; } } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); for (int row = 0; row < ctxt->overlap; ++row) { const uint8_t m0 = mask1d[row]; const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { wsrc[col] = m1 * tmp16[col]; mask[col] = m0; } wsrc += bw; mask += bw; tmp16 += ctxt->tmp_stride; } } } static inline void calc_target_weighted_pred_left( MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { (void)nb_mi; (void)num_planes; (void)rel_mi_col; (void)dir; struct calc_target_weighted_pred_ctxt *ctxt = (struct calc_target_weighted_pred_ctxt *)fun_ctxt; const int bw = xd->width << MI_SIZE_LOG2; const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw); int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw); const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); const int is_hbd = is_cur_buf_hbd(xd); if (!is_hbd) { for (int row = 0; row < op_mi_size * MI_SIZE; ++row) { for (int col = 
0; col < ctxt->overlap; ++col) { const uint8_t m0 = mask1d[col]; const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; } wsrc += bw; mask += bw; tmp += ctxt->tmp_stride; } } else { const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); for (int row = 0; row < op_mi_size * MI_SIZE; ++row) { for (int col = 0; col < ctxt->overlap; ++col) { const uint8_t m0 = mask1d[col]; const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1; mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; } wsrc += bw; mask += bw; tmp16 += ctxt->tmp_stride; } } } // This function has a structure similar to av1_build_obmc_inter_prediction // // The OBMC predictor is computed as: // // PObmc(x,y) = // AOM_BLEND_A64(Mh(x), // AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)), // PLeft(x, y)) // // Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate // rounding, this can be written as: // // AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) = // Mh(x) * Mv(y) * P(x,y) + // Mh(x) * Cv(y) * Pabove(x,y) + // AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) // // Where : // // Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y) // Ch(y) = AOM_BLEND_A64_MAX_ALPHA - Mh(y) // // This function computes 'wsrc' and 'mask' as: // // wsrc(x, y) = // AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) - // Mh(x) * Cv(y) * Pabove(x,y) + // AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) // // mask(x, y) = Mh(x) * Mv(y) // // These can then be used to efficiently approximate the error for any // predictor P in the context of the provided neighbouring predictors by // computing: // // error(x, y) = // wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2) // static inline void calc_target_weighted_pred( const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, const uint8_t *above, int above_stride, const uint8_t *left, int left_stride) { const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int bw = xd->width << MI_SIZE_LOG2; const int bh = xd->height << MI_SIZE_LOG2; const OBMCBuffer *obmc_buffer = &x->obmc_buffer; int32_t *mask_buf = obmc_buffer->mask; int32_t *wsrc_buf = obmc_buffer->wsrc; const int is_hbd = is_cur_buf_hbd(xd); const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; // plane 0 should not be sub-sampled assert(xd->plane[0].subsampling_x == 0); assert(xd->plane[0].subsampling_y == 0); av1_zero_array(wsrc_buf, bw * bh); for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA; // handle above row if (xd->up_available) { const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above, above_stride, overlap }; foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], calc_target_weighted_pred_above, &ctxt); } for (int i = 0; i < bw * bh; ++i) { wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; } // handle left column if (xd->left_available) { const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left, left_stride, overlap }; foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, max_neighbor_obmc[mi_size_high_log2[bsize]], calc_target_weighted_pred_left, &ctxt); } 
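// The loops below fold the source pixels into the accumulated neighbour
// terms, completing the expression given above:
//
//   wsrc(x, y) = AOM_BLEND_A64_MAX_ALPHA^2 * src(x, y)
//                  - (Mh(x) * Cv(y) * Pabove(x, y) +
//                     AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y))
//
// Worked example (a sketch, assuming AOM_BLEND_A64_MAX_ALPHA == 64 as defined
// in aom_dsp/blend.h): when neither an above nor a left neighbour is
// available, the accumulated neighbour term is 0 and mask stays at
// 64 * 64 = 4096, so wsrc(x, y) = 4096 * src(x, y) and the OBMC error above
// reduces to a plain (scaled) difference between source and predictor.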
if (!is_hbd) { const uint8_t *src = x->plane[0].src.buf; for (int row = 0; row < bh; ++row) { for (int col = 0; col < bw; ++col) { wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; } wsrc_buf += bw; src += x->plane[0].src.stride; } } else { const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); for (int row = 0; row < bh; ++row) { for (int col = 0; col < bw; ++col) { wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; } wsrc_buf += bw; src += x->plane[0].src.stride; } } } aom-3.12.1/av1/encoder/rdopt.h000066400000000000000000000333751477627663500160230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_RDOPT_H_ #define AOM_AV1_ENCODER_RDOPT_H_ #include #include "av1/common/blockd.h" #include "av1/common/txb_common.h" #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/rdopt_utils.h" #ifdef __cplusplus extern "C" { #endif #define COMP_TYPE_RD_THRESH_SCALE 11 #define COMP_TYPE_RD_THRESH_SHIFT 4 #define MAX_WINNER_MOTION_MODES 10 struct TileInfo; struct macroblock; struct RD_STATS; /*!\brief AV1 intra mode selection for intra frames. * * \ingroup intra_mode_search * \callgraph * Top level function for rd-based intra mode selection during intra frame * encoding. This function will first search for the best luma prediction by * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the * search with an evaluation for intrabc. * * \param[in] cpi Top-level encoder structure. * \param[in] x Pointer to structure holding all the data for the current macroblock. * \param[in] rd_cost Struct to keep track of the RD information. * \param[in] bsize Current block size. * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process. * \param[in] best_rd Best RD seen for this block so far. * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. The rd_cost struct is also updated with the RD stats * corresponding to the best mode found. */ void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); /*!\brief AV1 inter mode selection. * * \ingroup inter_mode_search * \callgraph * Top level function for inter mode selection. This function will loop over * all possible inter modes and select the best one for the current block by * computing the RD cost. The mode search and RD are computed in * handle_inter_mode(), which is called from this function within the main * loop. 
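 *
 * (Orientation, offered as a sketch rather than normative documentation: the
 * candidate mode/reference-frame pairs come from the av1_mode_defs table in
 * rdopt_utils.h, filtered by the active speed features; palette and skip_mode
 * candidates are evaluated separately after the main loop, near the end of
 * this function's body in rdopt.c.)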
* * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] rd_cost Struct to keep track of the RD information * \param[in] bsize Current block size * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process * \param[in] best_rd_so_far Best RD seen for this block so far * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. The rd_cost struct is also updated with the RD stats * corresponding to the best mode found. */ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); /*!\brief AV1 intra mode selection based on Non-RD optimized model. * * \ingroup nonrd_mode_search * \callgraph * \callergraph * Top level function for Non-RD optimized intra mode selection. * This finction will loop over subset of intra modes and select the best one * based on calculated modelled RD cost. Only 4 intra modes are checked as * specified in \c intra_mode_list. When calculating RD cost Hadamard transform * of residual is used to calculate rate. Estmation of RD cost is performed * in \c av1_estimate_block_intra which is called from this function * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] rd_cost Struct to keep track of the RD information * \param[in] bsize Current block size * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. The rd_cost struct is also updated with the RD stats * corresponding to the best mode found. */ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); /*!\brief AV1 inter mode selection based on Non-RD optimized model. * * \ingroup nonrd_mode_search * \callgraph * Top level function for Non-RD optimized inter mode selection. * This finction will loop over subset of inter modes and select the best one * based on calculated modelled RD cost. While making decisions which modes to * check, this function applies heuristics based on previously checked modes, * block residual variance, block size, and other factors to prune certain * modes and reference frames. Currently only single reference frame modes * are checked. Additional heuristics are applied to decide if intra modes * need to be checked. * * * \param[in] cpi Top-level encoder structure * \param[in] tile_data Pointer to struct holding adaptive data/contexts/models for the tile during encoding * \param[in] x Pointer to structure holding all the data for the current macroblock * \param[in] rd_cost Struct to keep track of the RD information * \param[in] bsize Current block size * \param[in] ctx Structure to hold snapshot of coding context during the mode picking process * * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x * is modified to store information about the best mode computed * in this function. 
The rd_cost struct is also updated with the RD stats * corresponding to the best mode found. */ void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); void av1_rd_pick_inter_mode_sb_seg_skip( const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); void av1_inter_mode_data_init(struct TileDataEnc *tile_data); void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); static inline int coded_to_superres_mi(int mi_col, int denom) { return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR; } static inline int av1_encoder_get_relative_dist(int a, int b) { assert(a >= 0 && b >= 0); return (a - b); } // This function will return number of mi's in a superblock. static inline int av1_get_sb_mi_size(const AV1_COMMON *const cm) { const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize]; int sb_mi_rows = (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) / mi_alloc_size_1d; assert(mi_size_wide[cm->seq_params->sb_size] == mi_size_high[cm->seq_params->sb_size]); int sb_mi_size = sb_mi_rows * sb_mi_rows; return sb_mi_size; } // This function prunes the mode if either of the reference frame falls in the // pruning list static inline int prune_ref(const MV_REFERENCE_FRAME *const ref_frame, const unsigned int *const ref_display_order_hint, const unsigned int frame_display_order_hint, const int *ref_frame_list) { for (int i = 0; i < 2; i++) { if (ref_frame_list[i] == NONE_FRAME) continue; if (ref_frame[0] == ref_frame_list[i] || ref_frame[1] == ref_frame_list[i]) { if (av1_encoder_get_relative_dist( ref_display_order_hint[ref_frame_list[i] - LAST_FRAME], frame_display_order_hint) < 0) return 1; } } return 0; } static inline int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame, int8_t closest_past_ref, int8_t closest_future_ref) { int has_closest_past_ref = (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref); int has_closest_future_ref = (ref_frame[0] == closest_future_ref) || (ref_frame[1] == closest_future_ref); return (has_closest_past_ref && has_closest_future_ref); } static inline int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame, const MACROBLOCK *const x) { int has_best_past_pred_mv_sad = 0; int has_best_future_pred_mv_sad = 0; if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) { has_best_past_pred_mv_sad = (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[0]) || (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]); has_best_future_pred_mv_sad = (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) || (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]); } return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad); } static inline int prune_ref_by_selective_ref_frame( const AV1_COMP *const cpi, const MACROBLOCK *const x, const MV_REFERENCE_FRAME *const ref_frame, const unsigned int *const ref_display_order_hint) { const SPEED_FEATURES *const sf = &cpi->sf; if (!sf->inter_sf.selective_ref_frame) return 0; const int comp_pred = ref_frame[1] > INTRA_FRAME; if (sf->inter_sf.selective_ref_frame >= 2 || (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) { int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME }; if (x != NULL) { // Disable pruning if either tpl suggests that we keep the frame or // the pred_mv gives us the 
best sad if (x->tpl_keep_ref_frame[LAST3_FRAME] || x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) { ref_frame_list[0] = NONE_FRAME; } if (x->tpl_keep_ref_frame[LAST2_FRAME] || x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) { ref_frame_list[1] = NONE_FRAME; } } if (prune_ref(ref_frame, ref_display_order_hint, ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME], ref_frame_list)) return 1; } if (sf->inter_sf.selective_ref_frame >= 3) { int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME }; if (x != NULL) { // Disable pruning if either tpl suggests that we keep the frame or // the pred_mv gives us the best sad if (x->tpl_keep_ref_frame[ALTREF2_FRAME] || x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) { ref_frame_list[0] = NONE_FRAME; } if (x->tpl_keep_ref_frame[BWDREF_FRAME] || x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) { ref_frame_list[1] = NONE_FRAME; } } if (prune_ref(ref_frame, ref_display_order_hint, ref_display_order_hint[LAST_FRAME - LAST_FRAME], ref_frame_list)) return 1; } if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) { int closest_ref_frames = has_closest_ref_frames( ref_frame, cpi->ref_frame_dist_info.nearest_past_ref, cpi->ref_frame_dist_info.nearest_future_ref); if (closest_ref_frames == 0) { // Prune reference frames which are not the closest to the current frame. if (sf->inter_sf.prune_comp_ref_frames >= 2) { return 1; } else if (sf->inter_sf.prune_comp_ref_frames == 1) { // Prune reference frames with non minimum pred_mv_sad. if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1; } } } return 0; } // This function will copy the best reference mode information from // MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. static inline void av1_copy_mbmi_ext_to_mbmi_ext_frame( MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type], sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type]; mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs, sizeof(mbmi_ext->global_mvs)); } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RDOPT_H_ aom-3.12.1/av1/encoder/rdopt_data_defs.h000066400000000000000000000317741477627663500200160ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ #define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ #ifdef __cplusplus extern "C" { #endif static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = { THR_DC, // DC_PRED, THR_V_PRED, // V_PRED, THR_H_PRED, // H_PRED, THR_D45_PRED, // D45_PRED, THR_D135_PRED, // D135_PRED, THR_D113_PRED, // D113_PRED, THR_D157_PRED, // D157_PRED, THR_D203_PRED, // D203_PRED, THR_D67_PRED, // D67_PRED, THR_SMOOTH, // SMOOTH_PRED, THR_SMOOTH_V, // SMOOTH_V_PRED, THR_SMOOTH_H, // SMOOTH_H_PRED, THR_PAETH, // PAETH_PRED, }; /* clang-format off */ static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] [REF_FRAMES] = { // NEARESTMV, { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, }, // NEARMV, { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3, THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, }, // GLOBALMV, { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, }, // NEWMV, { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3, THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, }, }; /* clang-format on */ /* clang-format off */ static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] [REF_FRAMES] = { // NEAREST_NEARESTMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB, THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEARESTL2B, THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEARESTL3B, THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEARESTGB, THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEARESTBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEAR_NEARMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3, THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB, THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEARL2B, THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEARL3B, THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEARGB, THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEARBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEAREST_NEWMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3, THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB, THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEWL2B, THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEWL3B, THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEWGB, THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAREST_NEWBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEW_NEARESTMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3, THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB, THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARESTL2B, THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARESTL3B, THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARESTGB, THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARESTBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEAR_NEWMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3, THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB, THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEWL2B, THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEWL3B, THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEWGB, THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEAR_NEWBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEW_NEARMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3, THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB, THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARL2B, THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, }, { 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARL3B, THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARGB, THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEARBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // GLOBAL_GLOBALMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3, THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB, THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B, THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B, THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_GLOBAL_GLOBALGB, THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, // NEW_NEWMV, { { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3, THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB, THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEWL2B, THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEWL3B, THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEWGB, THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_COMP_NEW_NEWBA, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, }, }, }; #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ aom-3.12.1/av1/encoder/rdopt_utils.h000066400000000000000000001021131477627663500172260ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_ #define AOM_AV1_ENCODER_RDOPT_UTILS_H_ #include "aom/aom_integer.h" #include "av1/encoder/block.h" #include "av1/common/cfl.h" #include "av1/common/pred_common.h" #include "av1/encoder/rdopt_data_defs.h" #ifdef __cplusplus extern "C" { #endif #define MAX_REF_MV_SEARCH 3 #define MAX_TX_RD_GATE_LEVEL 5 #define INTER_INTRA_RD_THRESH_SCALE 9 #define INTER_INTRA_RD_THRESH_SHIFT 4 typedef struct { PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; // This array defines the mapping from the enums in THR_MODES to the actual // prediction modes and refrence frames static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = { { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } }, { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, { NEWMV, { LAST_FRAME, NONE_FRAME } }, { NEWMV, { LAST2_FRAME, NONE_FRAME } }, { NEWMV, { LAST3_FRAME, NONE_FRAME } }, { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, { NEWMV, { ALTREF2_FRAME, NONE_FRAME } }, { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, { NEARMV, { LAST_FRAME, NONE_FRAME } }, { NEARMV, { LAST2_FRAME, NONE_FRAME } }, { NEARMV, { LAST3_FRAME, NONE_FRAME } }, { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, { NEARMV, { ALTREF2_FRAME, NONE_FRAME } }, { NEARMV, { ALTREF_FRAME, NONE_FRAME } }, { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, { GLOBALMV, { LAST_FRAME, NONE_FRAME } }, { GLOBALMV, { LAST2_FRAME, NONE_FRAME } }, { GLOBALMV, { LAST3_FRAME, NONE_FRAME } }, { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } }, { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } }, { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } }, { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } }, // TODO(zoeliu): May need to reconsider the order on the modes to check { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, 
ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { 
NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, // intra modes { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, { H_PRED, { INTRA_FRAME, NONE_FRAME } }, { V_PRED, { INTRA_FRAME, NONE_FRAME } }, { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, }; // Number of winner modes allowed for different values of the speed feature // multi_winner_mode_type. static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = { 1, // MULTI_WINNER_MODE_OFF 2, // MULTI_WINNER_MODE_FAST 3 // MULTI_WINNER_MODE_DEFAULT }; static inline void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, const int num_planes) { for (int i = 0; i < num_planes; i++) { xd->plane[i].dst.buf = dst.plane[i]; xd->plane[i].dst.stride = dst.stride[i]; } } static inline void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], int num_planes) { const BUFFER_SET *buf0 = dst_bufs[0]; dst_bufs[0] = dst_bufs[1]; dst_bufs[1] = buf0; restore_dst_buf(xd, *dst_bufs[0], num_planes); } /* clang-format on */ // Calculate rd threshold based on ref best rd and relevant scaling factors static inline int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, int mul_factor, int div_factor) { int64_t rd_thresh = ref_best_rd; if (div_factor != 0) { rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) ? 
((ref_best_rd / div_factor) * mul_factor) : INT64_MAX; } return rd_thresh; } static inline THR_MODES get_prediction_mode_idx( PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, MV_REFERENCE_FRAME second_ref_frame) { if (this_mode < INTRA_MODE_END) { assert(ref_frame == INTRA_FRAME); assert(second_ref_frame == NONE_FRAME); return intra_to_mode_idx[this_mode - INTRA_MODE_START]; } if (this_mode >= SINGLE_INTER_MODE_START && this_mode < SINGLE_INTER_MODE_END) { assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] [ref_frame]; } if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END && second_ref_frame != NONE_FRAME) { assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); assert((second_ref_frame > INTRA_FRAME) && (second_ref_frame <= ALTREF_FRAME)); return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] [second_ref_frame]; } assert(0); return THR_INVALID; } static inline int inter_mode_data_block_idx(BLOCK_SIZE bsize) { if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { return -1; } return 1; } // Get transform block visible dimensions cropped to the MI units. static inline void get_txb_dimensions(const MACROBLOCKD *xd, int plane, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, BLOCK_SIZE tx_bsize, int *width, int *height, int *visible_width, int *visible_height) { assert(tx_bsize <= plane_bsize); const int txb_height = block_size_high[tx_bsize]; const int txb_width = block_size_wide[tx_bsize]; const struct macroblockd_plane *const pd = &xd->plane[plane]; // TODO(aconverse@google.com): Investigate using crop_width/height here rather // than the MI size if (xd->mb_to_bottom_edge >= 0) { *visible_height = txb_height; } else { const int block_height = block_size_high[plane_bsize]; const int block_rows = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; *visible_height = clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height); } if (height) *height = txb_height; if (xd->mb_to_right_edge >= 0) { *visible_width = txb_width; } else { const int block_width = block_size_wide[plane_bsize]; const int block_cols = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; *visible_width = clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width); } if (width) *width = txb_width; } static inline int bsize_to_num_blk(BLOCK_SIZE bsize) { int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); return num_blk; } static inline int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, int64_t best_skip_rd, int64_t skip_rd, int level, int is_luma_only) { int eval_txfm = 1; // Derive aggressiveness factor for gating the transform search // Lower value indicates more aggressiveness. 
Be more conservative (high // value) for (i) low quantizers (ii) regions where prediction is poor const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 }; const int qslope = 2 * (!is_luma_only); const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0, 80, 100, 140 }; int aggr_factor = 4; assert(level <= MAX_TX_RD_GATE_LEVEL); const int pred_qindex_thresh = level_to_qindex_map[level]; if (!is_luma_only && level <= 2) { aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope, QINDEX_BITS)); } if ((best_skip_rd > (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) && (x->qindex >= pred_qindex_thresh)) aggr_factor *= scale[level]; // For level setting 1, be more conservative for non-luma-only case even when // prediction is good. else if ((level <= 1) && !is_luma_only) aggr_factor = (aggr_factor >> 2) * 6; // Be more conservative for luma only cases (called from compound type rd) // since best_skip_rd is computed after and skip_rd is computed (with 8-bit // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before // interpolation filter search const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 32, 29, 17, 17, 17 }; int mul_factor = is_luma_only ? luma_mul[level] : 16; int64_t rd_thresh = (best_skip_rd == INT64_MAX) ? best_skip_rd : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6); if (skip_rd > rd_thresh) eval_txfm = 0; return eval_txfm; } static TX_MODE select_tx_mode( const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) { if (cm->features.coded_lossless) return ONLY_4X4; if (tx_size_search_method == USE_LARGESTALL) { return TX_MODE_LARGEST; } else { assert(tx_size_search_method == USE_FULL_RD || tx_size_search_method == USE_FAST_RD); return TX_MODE_SELECT; } } // Checks the conditions to disable winner mode processing static inline int bypass_winner_mode_processing(const MACROBLOCK *const x, const SPEED_FEATURES *sf, int use_txfm_skip, int actual_txfm_skip, PREDICTION_MODE best_mode) { const int prune_winner_mode_eval_level = sf->winner_mode_sf.prune_winner_mode_eval_level; // Disable winner mode processing for blocks with low source variance. // The aggressiveness of this pruning logic reduces as qindex increases. // The threshold decreases linearly from 64 as qindex varies from 0 to 255. if (prune_winner_mode_eval_level == 1) { const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1); if (x->source_variance < src_var_thresh) return 1; } else if (prune_winner_mode_eval_level == 2) { // Skip winner mode processing of blocks for which transform turns out to be // skip due to nature of eob alone except NEWMV mode. if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1; } else if (prune_winner_mode_eval_level == 3) { // Skip winner mode processing of blocks for which transform turns out to be // skip except NEWMV mode and considered based on the quantizer. // At high quantizers: Take conservative approach by considering transform // skip based on eob alone. // At low quantizers: Consider transform skip based on eob nature or RD cost // evaluation. const int is_txfm_skip = x->qindex > 127 ? actual_txfm_skip : actual_txfm_skip || use_txfm_skip; if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1; } else if (prune_winner_mode_eval_level >= 4) { // Do not skip winner mode evaluation at low quantizers if normal mode's // transform search was too aggressive. 
if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0; if (use_txfm_skip || actual_txfm_skip) return 1; } return 0; } // Checks the conditions to enable winner mode processing static inline int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, const MACROBLOCK *const x, MB_MODE_INFO *const mbmi, int actual_txfm_skip) { const SPEED_FEATURES *sf = &cpi->sf; const PREDICTION_MODE best_mode = mbmi->mode; if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip, best_mode)) return 0; // TODO(any): Move block independent condition checks to frame level if (is_inter_block(mbmi)) { if (is_inter_mode(best_mode) && (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) && !cpi->oxcf.txfm_cfg.use_inter_dct_only) return 1; } else { if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search && !cpi->oxcf.txfm_cfg.use_intra_default_tx_only && !cpi->oxcf.txfm_cfg.use_intra_dct_only) return 1; } // Check speed feature related to winner mode processing if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt && cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT && cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT) return 1; if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1; return 0; } static inline void set_tx_size_search_method( const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch, int is_winner_mode) { // Populate transform size search method/transform mode appropriately txfm_params->tx_size_search_method = winner_mode_params->tx_size_search_methods[DEFAULT_EVAL]; if (enable_winner_mode_for_tx_size_srch) { if (is_winner_mode) txfm_params->tx_size_search_method = winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL]; else txfm_params->tx_size_search_method = winner_mode_params->tx_size_search_methods[MODE_EVAL]; } txfm_params->tx_mode_search_type = select_tx_mode(cm, txfm_params->tx_size_search_method); } static inline void set_tx_type_prune(const SPEED_FEATURES *sf, TxfmSearchParams *txfm_params, int winner_mode_tx_type_pruning, int is_winner_mode) { // Populate prune transform mode appropriately txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode; if (!winner_mode_tx_type_pruning) return; const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 }, { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 }, { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } }; txfm_params->prune_2d_txfm_mode = prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; } static inline void set_tx_domain_dist_params( const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { if (txfm_params->use_qm_dist_metric) { // QM-weighted PSNR is computed in transform space, so we need to forcibly // enable the use of tx domain distortion. 
txfm_params->use_transform_domain_distortion = 1; txfm_params->tx_domain_dist_threshold = 0; return; } if (!enable_winner_mode_for_tx_domain_dist) { txfm_params->use_transform_domain_distortion = winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL]; txfm_params->tx_domain_dist_threshold = winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL]; return; } if (is_winner_mode) { txfm_params->use_transform_domain_distortion = winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL]; txfm_params->tx_domain_dist_threshold = winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL]; } else { txfm_params->use_transform_domain_distortion = winner_mode_params->use_transform_domain_distortion[MODE_EVAL]; txfm_params->tx_domain_dist_threshold = winner_mode_params->tx_domain_dist_threshold[MODE_EVAL]; } } // This function sets mode parameters for different mode evaluation stages static inline void set_mode_eval_params(const struct AV1_COMP *cpi, MACROBLOCK *x, MODE_EVAL_TYPE mode_eval_type) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *sf = &cpi->sf; const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params; TxfmSearchParams *txfm_params = &x->txfm_search_params; txfm_params->use_qm_dist_metric = cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR; switch (mode_eval_type) { case DEFAULT_EVAL: txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; txfm_params->use_default_intra_tx_type = 0; txfm_params->skip_txfm_level = winner_mode_params->skip_txfm_level[DEFAULT_EVAL]; txfm_params->predict_dc_level = winner_mode_params->predict_dc_level[DEFAULT_EVAL]; // Set default transform domain distortion type set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0); // Get default threshold for R-D optimization of coefficients get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds, txfm_params, 0, 0); // Set default transform size search method set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0); // Set default transform type prune set_tx_type_prune(sf, txfm_params, 0, 0); break; case MODE_EVAL: txfm_params->use_default_intra_tx_type = (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search || cpi->oxcf.txfm_cfg.use_intra_default_tx_only); txfm_params->default_inter_tx_type_prob_thresh = cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh; txfm_params->skip_txfm_level = winner_mode_params->skip_txfm_level[MODE_EVAL]; txfm_params->predict_dc_level = winner_mode_params->predict_dc_level[MODE_EVAL]; // Set transform domain distortion type for mode evaluation set_tx_domain_dist_params( winner_mode_params, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0); // Get threshold for R-D optimization of coefficients during mode // evaluation get_rd_opt_coeff_thresh( winner_mode_params->coeff_opt_thresholds, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0); // Set the transform size search method for mode evaluation set_tx_size_search_method( cm, winner_mode_params, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0); // Set transform type prune for mode evaluation set_tx_type_prune(sf, txfm_params, sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, 0); break; case WINNER_MODE_EVAL: txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; txfm_params->use_default_intra_tx_type = 0; txfm_params->skip_txfm_level = winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL]; txfm_params->predict_dc_level = 
winner_mode_params->predict_dc_level[WINNER_MODE_EVAL]; // Set transform domain distortion type for winner mode evaluation set_tx_domain_dist_params( winner_mode_params, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1); // Get threshold for R-D optimization of coefficients for winner mode // evaluation get_rd_opt_coeff_thresh( winner_mode_params->coeff_opt_thresholds, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1); // Set the transform size search method for winner mode evaluation set_tx_size_search_method( cm, winner_mode_params, txfm_params, sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); // Set default transform type prune mode for winner mode evaluation set_tx_type_prune(sf, txfm_params, sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, 1); break; default: assert(0); } // Rd record collected at a specific mode evaluation stage can not be used // across other evaluation stages as the transform parameters are different. // Hence, reset mb rd record whenever mode evaluation stage type changes. if (txfm_params->mode_eval_type != mode_eval_type) reset_mb_rd_record(x->txfm_search_info.mb_rd_record); txfm_params->mode_eval_type = mode_eval_type; } // Similar to store_cfl_required(), but for use during the RDO process, // where we haven't yet determined whether this block uses CfL. static inline CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, // in case the corresponding chroma-reference block uses CfL. // Note that this can only happen for block sizes which are <8 on // their shortest side, as otherwise they would be chroma reference // blocks. return CFL_ALLOWED; } // For chroma reference blocks, we should store data in the encoder iff we're // allowed to try out CfL. return is_cfl_allowed(xd); } static inline void init_sbuv_mode(MB_MODE_INFO *const mbmi) { mbmi->uv_mode = UV_DC_PRED; mbmi->palette_mode_info.palette_size[1] = 0; } // Store best mode stats for winner mode processing static inline void store_winner_mode_stats( const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi, RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv, THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd, int multi_winner_mode_type, int txfm_search_done) { WinnerModeStats *winner_mode_stats = x->winner_mode_stats; int mode_idx = 0; int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0; // Mode stat is not required when multiwinner mode processing is disabled if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return; // Ignore mode with maximum rd if (this_rd == INT64_MAX) return; // TODO(any): Winner mode processing is currently not applicable for palette // mode in Inter frames. 
Clean-up the following code, once support is added if (!frame_is_intra_only(cm) && is_palette_mode) return; int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type]; assert(x->winner_mode_count >= 0 && x->winner_mode_count <= max_winner_mode_count); if (x->winner_mode_count) { // Find the mode which has higher rd cost than this_rd for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) if (winner_mode_stats[mode_idx].rd > this_rd) break; if (mode_idx == max_winner_mode_count) { // No mode has higher rd cost than this_rd return; } else if (mode_idx < max_winner_mode_count - 1) { // Create a slot for current mode and move others to the next slot memmove( &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx], (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats)); } } // Add a mode stat for winner mode processing winner_mode_stats[mode_idx].mbmi = *mbmi; winner_mode_stats[mode_idx].rd = this_rd; winner_mode_stats[mode_idx].mode_index = mode_index; // Update rd stats required for inter frame if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) { const MACROBLOCKD *xd = &x->e_mbd; const int skip_ctx = av1_get_skip_txfm_context(xd); const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END; const int skip_txfm = mbmi->skip_txfm && !is_intra_mode; winner_mode_stats[mode_idx].rd_cost = *rd_cost; if (txfm_search_done) { winner_mode_stats[mode_idx].rate_y = rd_cost_y->rate + x->mode_costs .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm]; winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate; } } if (color_map) { // Store color_index_map for palette mode const MACROBLOCKD *const xd = &x->e_mbd; int block_width, block_height; av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, &block_height, NULL, NULL); memcpy(winner_mode_stats[mode_idx].color_index_map, color_map, block_width * block_height * sizeof(color_map[0])); } x->winner_mode_count = AOMMIN(x->winner_mode_count + 1, max_winner_mode_count); } unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, const MACROBLOCKD *xd, const struct buf_2d *ref, BLOCK_SIZE bsize, int plane, int use_hbd); unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi, const MACROBLOCKD *xd, const struct buf_2d *ref, BLOCK_SIZE bsize, int plane); static inline int is_mode_intra(PREDICTION_MODE mode) { return mode < INTRA_MODE_END; } // This function will copy usable ref_mv_stack[ref_frame][4] and // weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and // weight[ref_frame][8]. static inline void av1_copy_usable_ref_mv_stack_and_weight( const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext, MV_REFERENCE_FRAME ref_frame) { memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame], USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0])); memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame], USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0])); } // Get transform rd gate level for the given transform search case. static inline int get_txfm_rd_gate_level( const int is_masked_compound_enabled, const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize, TX_SEARCH_CASE tx_search_case, int eval_motion_mode) { assert(tx_search_case < TX_SEARCH_CASES); if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode && num_pels_log2_lookup[bsize] > 8) return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE]; // Enable aggressive gating of transform search only when masked compound type // is enabled. 
else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE && is_masked_compound_enabled) return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE]; return txfm_rd_gate_level[TX_SEARCH_DEFAULT]; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_ aom-3.12.1/av1/encoder/reconinter_enc.c000066400000000000000000000706441477627663500176630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" #include "av1/common/obmc.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/encoder/reconinter_enc.h" static inline void enc_calc_subpel_params( const MV *const src_mv, InterPredParams *const inter_pred_params, uint8_t **pre, SubpelParams *subpel_params, int *src_stride) { struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width, pre_buf->height); *pre = pre_buf->buf0 + (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + (subpel_params->pos_x >> SCALE_SUBPEL_BITS); *src_stride = pre_buf->stride; } #define IS_DEC 0 #include "av1/common/reconinter_template.inc" #undef IS_DEC void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, const MV *src_mv, InterPredParams *inter_pred_params) { build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params); } static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, int bw, int bh, int mi_x, int mi_y) { build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x, mi_y); } void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) { const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; InterPredParams inter_pred_params; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf; const MV mv = xd->mi[0]->mv[0].as_mv; const struct scale_factors *const sf = xd->block_ref_scale_factors[0]; av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), false, sf, pd->pre, xd->mi[0]->interp_filters); inter_pred_params.conv_params = get_conv_params_no_round( 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd); inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params); } void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, InterPredParams *inter_pred_params, const SubpelParams *subpel_params) { struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; const MB_MODE_INFO *mbmi = xd->mi[0]; struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *pre_buf = &pd->pre[0]; const 
uint8_t *src = pre_buf->buf0 + (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + (subpel_params->pos_x >> SCALE_SUBPEL_BITS); uint8_t *const dst = dst_buf->buf; int src_stride = pre_buf->stride; int dst_stride = dst_buf->stride; inter_pred_params->ref_frame_buf = *pre_buf; // Initialize interp filter for single reference mode. init_interp_filter_params(inter_pred_params->interp_filter_params, &mbmi->interp_filters.as_filters, pd->width, pd->height, /*is_intrabc=*/0); av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params, subpel_params); } void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, const BUFFER_SET *ctx, BLOCK_SIZE bsize, int plane_from, int plane_to) { for (int plane = plane_from; plane <= plane_to; ++plane) { if (plane && !xd->is_chroma_ref) break; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width, xd->plane[plane].height, mi_x, mi_y); if (is_interintra_pred(xd->mi[0])) { BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, { xd->plane[0].dst.stride, xd->plane[1].dst.stride, xd->plane[2].dst.stride } }; if (!ctx) { ctx = &default_ctx; } av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, ctx, plane, bsize); } } } static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, int mi_col_offset, MB_MODE_INFO *ref_mbmi, struct build_prediction_ctxt *ctxt, const int num_planes) { const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize); const int ref_mi_row = xd->mi_row + mi_row_offset; const int ref_mi_col = xd->mi_col + mi_col_offset; for (int plane = 0; plane < num_planes; ++plane) { struct macroblockd_plane *const pd = &xd->plane[plane]; setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], ctxt->tmp_width[plane], ctxt->tmp_height[plane], ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, NULL, pd->subsampling_x, pd->subsampling_y); } const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); const struct scale_factors *const sf = get_ref_scale_factors_const(ctxt->cm, frame); xd->block_ref_scale_factors[0] = sf; if (!av1_is_valid_scale(sf)) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, num_planes); } static inline void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt, num_planes); const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2; const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2; const BLOCK_SIZE bsize = xd->mi[0]->bsize; InterPredParams inter_pred_params; for (int j = 0; j < num_planes; ++j) { const struct macroblockd_plane *pd = &xd->plane[j]; int bw = 0, bh = 0; if (dir) { // prepare left reference block size bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; } else { // prepare above reference block size bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; bh = clamp(block_size_high[bsize] >> 
(pd->subsampling_y + 1), 4, block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); } if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue; const struct buf_2d *const pre_buf = &pd->pre[0]; const MV mv = above_mbmi->mv[0].as_mv; av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, xd->block_ref_scale_factors[0], pre_buf, above_mbmi->interp_filters); inter_pred_params.conv_params = get_conv_params(0, j, xd->bd); av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv, &inter_pred_params); } } void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { if (!xd->up_available) return; struct build_prediction_ctxt ctxt = { cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL }; BLOCK_SIZE bsize = xd->mi[0]->bsize; foreach_overlappable_nb_above(cm, xd, max_neighbor_obmc[mi_size_wide_log2[bsize]], build_obmc_prediction, &ctxt); } void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { if (!xd->left_available) return; struct build_prediction_ctxt ctxt = { cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL }; BLOCK_SIZE bsize = xd->mi[0]->bsize; foreach_overlappable_nb_left(cm, xd, max_neighbor_obmc[mi_size_high_log2[bsize]], build_obmc_prediction, &ctxt); } void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) { const int num_planes = av1_num_planes(cm); uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, dst_stride1); av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, dst_stride2); av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, num_planes); av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, dst_stride2); } void av1_build_inter_predictors_for_planes_single_buf( MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, uint8_t *ext_dst[], int ext_dst_stride[]) { assert(bsize < BLOCK_SIZES_ALL); const MB_MODE_INFO *mi = xd->mi[0]; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; WarpTypesAllowed warp_types; const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; for (int plane = plane_from; plane <= plane_to; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, 
pd->subsampling_x, pd->subsampling_y); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, xd->block_ref_scale_factors[ref], &pd->pre[ref], mi->interp_filters); inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]); const MV mv = mi->mv[ref].as_mv; av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv, &inter_pred_params); } } static void build_masked_compound( uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, int w) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. const int subh = (2 << mi_size_high_log2[sb_type]) == h; const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, block_size_wide[sb_type], w, h, subw, subh); } #if CONFIG_AV1_HIGHBITDEPTH static void build_masked_compound_highbd( uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, const uint8_t *src1_8, int src1_stride, const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, int w, int bd) { // Derive subsampling from h and w passed in. May be refactored to // pass in subsampling factors directly. const int subh = (2 << mi_size_high_log2[sb_type]) == h; const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); // const uint8_t *mask = // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask, block_size_wide[sb_type], w, h, subw, subh, bd); } #endif static void build_wedge_inter_predictor_from_buf( MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_compound = has_second_ref(mbmi); MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; mbmi->interinter_comp.seg_mask = xd->seg_mask; const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; const int is_hbd = is_cur_buf_hbd(xd); if (is_compound && is_masked_compound_type(comp_data->type)) { if (!plane && comp_data->type == COMPOUND_DIFFWTD) { #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { av1_build_compound_diffwtd_mask_highbd( comp_data->seg_mask, comp_data->mask_type, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); } else { av1_build_compound_diffwtd_mask( comp_data->seg_mask, comp_data->mask_type, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); } #else (void)is_hbd; av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); #endif // CONFIG_AV1_HIGHBITDEPTH } #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { build_masked_compound_highbd( dst, dst_buf->stride, 
CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize, h, w, xd->bd); } else { build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h, w); } #else build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h, w); #endif } else { #if CONFIG_AV1_HIGHBITDEPTH if (is_hbd) { aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0, CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h); } else { aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); } #else aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); #endif } } void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, uint8_t *ext_dst0[], int ext_dst_stride0[], uint8_t *ext_dst1[], int ext_dst_stride1[]) { int plane; assert(bsize < BLOCK_SIZES_ALL); for (plane = plane_from; plane <= plane_to; ++plane) { const BLOCK_SIZE plane_bsize = get_plane_block_size( bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; build_wedge_inter_predictor_from_buf( xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], ext_dst1[plane], ext_dst_stride1[plane]); } } // Get pred block from up-sampled reference. void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter = av1_get_filter(subpel_search); if (!subpel_x_q3 && !subpel_y_q3) { for (int i = 0; i < height; i++) { memcpy(comp_pred, ref, width * sizeof(*comp_pred)); comp_pred += width; ref += ref_stride; } } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, width, height); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, width, height); } else { DECLARE_ALIGNED(16, uint8_t, temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height); aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } } void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { int i, j; aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); } comp_pred += width; pred += width; } } void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int subpel_search) { if (subpel_x_q3 | subpel_y_q3) { aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); ref = comp_pred; ref_stride = width; } aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, mask_stride, invert_mask); } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int 
is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred8, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter = av1_get_filter(subpel_search); if (!subpel_x_q3 && !subpel_y_q3) { const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (int i = 0; i < height; i++) { memcpy(comp_pred, ref, width * sizeof(*comp_pred)); comp_pred += width; ref += ref_stride; } } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel, 16, NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1, kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert_c( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, bd); } } void aom_highbd_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { int i, j; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd, subpel_search); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); } comp_pred += width; pred += width; } } void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t 
*mask, int mask_stride, int invert_mask, int bd, int subpel_search) { aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd, subpel_search); aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, mask, mask_stride, invert_mask); } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/encoder/reconinter_enc.h000066400000000000000000000110061477627663500176530ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ #define AOM_AV1_ENCODER_RECONINTER_ENC_H_ #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" #include "av1/common/reconinter.h" #include "av1/common/warped_motion.h" #ifdef __cplusplus extern "C" { #endif void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int subpel_search); void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd, int subpel_search); // Build single or compound reference inter predictors for all planes. // Can build inter-intra predictors, masked predictors etc as well. void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col, const BUFFER_SET *ctx, BLOCK_SIZE bsize, int plane_from, int plane_to); void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col); void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, InterPredParams *inter_pred_params, const SubpelParams *subpel_params); // Build one inter predictor. It is called for building predictor for single // reference case, or just the 1st or 2nd reference in compound reference case. // Can build both regular and masked predictors. void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, const MV *src_mv, InterPredParams *inter_pred_params); void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]); void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]); void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd); // |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive. 
void av1_build_inter_predictors_for_planes_single_buf( MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, uint8_t *ext_dst[], int ext_dst_stride[]); // |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive. void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, uint8_t *ext_dst0[], int ext_dst_stride0[], uint8_t *ext_dst1[], int ext_dst_stride1[]); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_ aom-3.12.1/av1/encoder/saliency_map.c000066400000000000000000001334521477627663500173270ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/saliency_map.h" // The Gabor filter is generated by setting the parameters as: // ksize = 9 // sigma = 1 // theta = y*np.pi/4, where y /in {0, 1, 2, 3}, i.e., 0, 45, 90, 135 degree // lambda1 = 1 // gamma=0.8 // phi =0 static const double kGaborFilter[4][9][9] = { // [angle: 0, 45, 90, 135 // degree][ksize][ksize] { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03, 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 }, { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02, 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 }, { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01, 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 }, { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01, 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 }, { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00, 6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 }, { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01, 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 }, { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01, 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 }, { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02, 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 }, { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03, 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } }, { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04, 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05, 3.5712848e-05 }, { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03, 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03, -8.1631159e-05 }, { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02, -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03, -9.9486928e-04 }, { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01, -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02, 1.3962291e-03 }, { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, 
-1.7670043e-01, 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02, 6.6981313e-04 }, { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01, -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03, -4.4602581e-04 }, { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02, -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03, 3.0079011e-06 }, { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02, 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06, 3.8760313e-06 }, { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03, 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06, -6.2165498e-08 } }, { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04, 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 }, { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02, 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 }, { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01, 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 }, { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01, 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 }, { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00, 7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 }, { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01, 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 }, { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01, 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 }, { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02, 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 }, { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04, 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } }, { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03, 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06, -6.2165498e-08 }, { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02, 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06, 3.8760313e-06 }, { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02, -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03, 3.0079011e-06 }, { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01, -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03, -4.4602581e-04 }, { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01, 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02, 6.6981313e-04 }, { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01, -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02, 1.3962291e-03 }, { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02, -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03, -9.9486928e-04 }, { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03, 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03, -8.1631159e-05 }, { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04, 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05, 3.5712848e-05 } } }; // This function is to extract red/green/blue channels, and calculate intensity // = (r+g+b)/3. Note that it only handles 8bits case now. // TODO(linzhen): add high bitdepth support. 
static void get_color_intensity(const YV12_BUFFER_CONFIG *src, int subsampling_x, int subsampling_y, double *cr, double *cg, double *cb, double *intensity) { const uint8_t *y = src->buffers[0]; const uint8_t *u = src->buffers[1]; const uint8_t *v = src->buffers[2]; const int y_height = src->crop_heights[0]; const int y_width = src->crop_widths[0]; const int y_stride = src->strides[0]; const int c_stride = src->strides[1]; for (int i = 0; i < y_height; ++i) { for (int j = 0; j < y_width; ++j) { cr[i * y_width + j] = fclamp((double)y[i * y_stride + j] + 1.370 * (double)(v[(i >> subsampling_y) * c_stride + (j >> subsampling_x)] - 128), 0, 255); cg[i * y_width + j] = fclamp((double)y[i * y_stride + j] - 0.698 * (double)(u[(i >> subsampling_y) * c_stride + (j >> subsampling_x)] - 128) - 0.337 * (double)(v[(i >> subsampling_y) * c_stride + (j >> subsampling_x)] - 128), 0, 255); cb[i * y_width + j] = fclamp((double)y[i * y_stride + j] + 1.732 * (double)(u[(i >> subsampling_y) * c_stride + (j >> subsampling_x)] - 128), 0, 255); intensity[i * y_width + j] = (cr[i * y_width + j] + cg[i * y_width + j] + cb[i * y_width + j]) / 3.0; assert(intensity[i * y_width + j] >= 0 && intensity[i * y_width + j] <= 255); intensity[i * y_width + j] /= 256; cr[i * y_width + j] /= 256; cg[i * y_width + j] /= 256; cb[i * y_width + j] /= 256; } } } static inline double convolve_map(const double *filter, const double *map, const int size) { double result = 0; for (int i = 0; i < size; ++i) { result += filter[i] * map[i]; // symmetric filter is used } return result; } // This function is to decimate the map by half, and apply Gaussian filter on // top of the downsampled map. static inline void decimate_map(const double *map, int height, int width, int stride, double *downsampled_map) { const int new_width = width / 2; const int window_size = 5; const double gaussian_filter[25] = { 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256 }; double map_region[25]; for (int y = 0; y < height - 1; y += 2) { for (int x = 0; x < width - 1; x += 2) { int i = 0; for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { int yvalue = clamp(yy, 0, height - 1); int xvalue = clamp(xx, 0, width - 1); map_region[i++] = map[yvalue * stride + xvalue]; } } downsampled_map[(y / 2) * new_width + (x / 2)] = convolve_map(gaussian_filter, map_region, window_size * window_size); } } } // This function is to upscale the map from in_level size to out_level size. // Note that the map at "level-1" will upscale the map at "level" by x2. static inline int upscale_map(const double *input, int in_level, int out_level, int height[9], int width[9], double *output) { for (int level = in_level; level > out_level; level--) { const int cur_width = width[level]; const int cur_height = height[level]; const int cur_stride = width[level]; double *original = (level == in_level) ? 
(double *)input : output; assert(level > 0); const int h_upscale = height[level - 1]; const int w_upscale = width[level - 1]; const int s_upscale = width[level - 1]; double *upscale = aom_malloc(h_upscale * w_upscale * sizeof(*upscale)); if (!upscale) { return 0; } for (int i = 0; i < h_upscale; ++i) { for (int j = 0; j < w_upscale; ++j) { const int ii = clamp((i >> 1), 0, cur_height - 1); const int jj = clamp((j >> 1), 0, cur_width - 1); upscale[j + i * s_upscale] = (double)original[jj + ii * cur_stride]; } } memcpy(output, upscale, h_upscale * w_upscale * sizeof(double)); aom_free(upscale); } return 1; } // This function calculates the differences between a fine scale c and a // coarser scale s yielding the feature maps. c \in {2, 3, 4}, and s = c + // delta, where delta \in {3, 4}. static int center_surround_diff(const double *input[9], int height[9], int width[9], saliency_feature_map *output[6]) { int j = 0; for (int k = 2; k < 5; ++k) { int cur_height = height[k]; int cur_width = width[k]; if (upscale_map(input[k + 3], k + 3, k, height, width, output[j]->buf) == 0) { return 0; } for (int r = 0; r < cur_height; ++r) { for (int c = 0; c < cur_width; ++c) { output[j]->buf[r * cur_width + c] = fabs((double)(input[k][r * cur_width + c] - output[j]->buf[r * cur_width + c])); } } if (upscale_map(input[k + 4], k + 4, k, height, width, output[j + 1]->buf) == 0) { return 0; } for (int r = 0; r < cur_height; ++r) { for (int c = 0; c < cur_width; ++c) { output[j + 1]->buf[r * cur_width + c] = fabs(input[k][r * cur_width + c] - output[j + 1]->buf[r * cur_width + c]); } } j += 2; } return 1; } // For color channels, the differences is calculated based on "color // double-opponency". For example, the RG feature map is constructed between a // fine scale c of R-G component and a coarser scale s of G-R component. static int center_surround_diff_rgb(const double *input_1[9], const double *input_2[9], int height[9], int width[9], saliency_feature_map *output[6]) { int j = 0; for (int k = 2; k < 5; ++k) { int cur_height = height[k]; int cur_width = width[k]; if (upscale_map(input_2[k + 3], k + 3, k, height, width, output[j]->buf) == 0) { return 0; } for (int r = 0; r < cur_height; ++r) { for (int c = 0; c < cur_width; ++c) { output[j]->buf[r * cur_width + c] = fabs((double)(input_1[k][r * cur_width + c] - output[j]->buf[r * cur_width + c])); } } if (upscale_map(input_2[k + 4], k + 4, k, height, width, output[j + 1]->buf) == 0) { return 0; } for (int r = 0; r < cur_height; ++r) { for (int c = 0; c < cur_width; ++c) { output[j + 1]->buf[r * cur_width + c] = fabs(input_1[k][r * cur_width + c] - output[j + 1]->buf[r * cur_width + c]); } } j += 2; } return 1; } // This function is to generate Gaussian pyramid images with indexes from 0 to // 8, and construct the feature maps from calculating the center-surround // differences. 
static int gaussian_pyramid(const double *src, int width[9], int height[9], saliency_feature_map *dst[6]) { double *gaussian_map[9]; // scale = 9 gaussian_map[0] = (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); if (!gaussian_map[0]) { return 0; } memcpy(gaussian_map[0], src, width[0] * height[0] * sizeof(double)); for (int i = 1; i < 9; ++i) { int stride = width[i - 1]; int new_width = width[i]; int new_height = height[i]; gaussian_map[i] = (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); if (!gaussian_map[i]) { for (int l = 0; l < i; ++l) { aom_free(gaussian_map[l]); } return 0; } memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, gaussian_map[i]); } if (center_surround_diff((const double **)gaussian_map, height, width, dst) == 0) { for (int l = 0; l < 9; ++l) { aom_free(gaussian_map[l]); } return 0; } for (int i = 0; i < 9; ++i) { aom_free(gaussian_map[i]); } return 1; } static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9], int height[9], saliency_feature_map *dst[6]) { double *gaussian_map[2][9]; // scale = 9 double *src[2]; src[0] = src_1; src[1] = src_2; for (int k = 0; k < 2; ++k) { gaussian_map[k][0] = (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[k][0])); if (!gaussian_map[k][0]) { for (int l = 0; l < k; ++l) { aom_free(gaussian_map[l][0]); } return 0; } memcpy(gaussian_map[k][0], src[k], width[0] * height[0] * sizeof(double)); for (int i = 1; i < 9; ++i) { int stride = width[i - 1]; int new_width = width[i]; int new_height = height[i]; gaussian_map[k][i] = (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[k][i])); if (!gaussian_map[k][i]) { for (int l = 0; l < k; ++l) { aom_free(gaussian_map[l][i]); } return 0; } memset(gaussian_map[k][i], 0, new_width * new_height * sizeof(double)); decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1], stride, gaussian_map[k][i]); } } if (center_surround_diff_rgb((const double **)gaussian_map[0], (const double **)gaussian_map[1], height, width, dst) == 0) { for (int l = 0; l < 2; ++l) { for (int i = 0; i < 9; ++i) { aom_free(gaussian_map[l][i]); } } return 0; } for (int l = 0; l < 2; ++l) { for (int i = 0; i < 9; ++i) { aom_free(gaussian_map[l][i]); } } return 1; } static int get_feature_map_intensity(double *intensity, int width[9], int height[9], saliency_feature_map *i_map[6]) { if (gaussian_pyramid(intensity, width, height, i_map) == 0) { return 0; } return 1; } static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9], int height[9], saliency_feature_map *rg_map[6], saliency_feature_map *by_map[6]) { double *rg_mat = aom_malloc(height[0] * width[0] * sizeof(*rg_mat)); double *by_mat = aom_malloc(height[0] * width[0] * sizeof(*by_mat)); double *gr_mat = aom_malloc(height[0] * width[0] * sizeof(*gr_mat)); double *yb_mat = aom_malloc(height[0] * width[0] * sizeof(*yb_mat)); if (!rg_mat || !by_mat || !gr_mat || !yb_mat) { aom_free(rg_mat); aom_free(by_mat); aom_free(gr_mat); aom_free(yb_mat); return 0; } double r, g, b, y; for (int i = 0; i < height[0]; ++i) { for (int j = 0; j < width[0]; ++j) { r = AOMMAX(0, cr[i * width[0] + j] - (cg[i * width[0] + j] + cb[i * width[0] + j]) / 2); g = AOMMAX(0, cg[i * width[0] + j] - (cr[i * width[0] + j] + cb[i * width[0] + j]) / 2); b = AOMMAX(0, cb[i * width[0] + j] - (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2); y = AOMMAX(0, (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2 
- fabs(cr[i * width[0] + j] - cg[i * width[0] + j]) / 2 - cb[i * width[0] + j]); rg_mat[i * width[0] + j] = r - g; by_mat[i * width[0] + j] = b - y; gr_mat[i * width[0] + j] = g - r; yb_mat[i * width[0] + j] = y - b; } } if (gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) == 0 || gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) == 0) { aom_free(rg_mat); aom_free(by_mat); aom_free(gr_mat); aom_free(yb_mat); return 0; } aom_free(rg_mat); aom_free(by_mat); aom_free(gr_mat); aom_free(yb_mat); return 1; } static inline void filter2d(const double *input, const double kernel[9][9], int width, int height, double *output) { const int window_size = 9; double map_section[81]; for (int y = 0; y <= height - 1; ++y) { for (int x = 0; x <= width - 1; ++x) { int i = 0; for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { int yvalue = clamp(yy, 0, height - 1); int xvalue = clamp(xx, 0, width - 1); map_section[i++] = input[yvalue * width + xvalue]; } } output[y * width + x] = 0; for (int k = 0; k < window_size; ++k) { for (int l = 0; l < window_size; ++l) { output[y * width + x] += kernel[k][l] * map_section[k * window_size + l]; } } } } } static int get_feature_map_orientation(const double *intensity, int width[9], int height[9], saliency_feature_map *dst[24]) { double *gaussian_map[9]; gaussian_map[0] = (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); if (!gaussian_map[0]) { return 0; } memcpy(gaussian_map[0], intensity, width[0] * height[0] * sizeof(double)); for (int i = 1; i < 9; ++i) { int stride = width[i - 1]; int new_width = width[i]; int new_height = height[i]; gaussian_map[i] = (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); if (!gaussian_map[i]) { for (int l = 0; l < i; ++l) { aom_free(gaussian_map[l]); } return 0; } memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, gaussian_map[i]); } double *tempGaborOutput[4][9]; //[angle: 0, 45, 90, 135 degree][filter_size] for (int i = 2; i < 9; ++i) { const int cur_height = height[i]; const int cur_width = width[i]; for (int j = 0; j < 4; ++j) { tempGaborOutput[j][i] = (double *)aom_malloc( cur_height * cur_width * sizeof(*tempGaborOutput[j][i])); if (!tempGaborOutput[j][i]) { for (int l = 0; l < 9; ++l) { aom_free(gaussian_map[l]); } for (int h = 0; h < 4; ++h) { for (int g = 2; g < 9; ++g) { aom_free(tempGaborOutput[h][g]); } } return 0; } filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height, tempGaborOutput[j][i]); } } for (int i = 0; i < 9; ++i) { aom_free(gaussian_map[i]); } saliency_feature_map *tmp[4][6]; //[angle: 0, 45, 90, 135 degree][filter_size] for (int i = 0; i < 6; ++i) { for (int j = 0; j < 4; ++j) { tmp[j][i] = dst[j * 6 + i]; } } for (int j = 0; j < 4; ++j) { if (center_surround_diff((const double **)tempGaborOutput[j], height, width, tmp[j]) == 0) { for (int h = 0; h < 4; ++h) { for (int g = 2; g < 9; ++g) { aom_free(tempGaborOutput[h][g]); } } return 0; } } for (int i = 2; i < 9; ++i) { for (int j = 0; j < 4; ++j) { aom_free(tempGaborOutput[j][i]); } } return 1; } static inline void find_min_max(const saliency_feature_map *input, double *max_value, double *min_value) { assert(input && input->buf); *min_value = DBL_MAX; *max_value = 0.0; for (int i = 0; i < input->height; ++i) { for (int j = 0; j < input->width; ++j) { assert(input->buf[i * input->width + j] >= 0.0); *min_value = 
fmin(input->buf[i * input->width + j], *min_value); *max_value = fmax(input->buf[i * input->width + j], *max_value); } } } static inline double average_local_max(const saliency_feature_map *input, int stepsize) { int numlocal = 0; double lmaxmean = 0, lmax = 0, dummy = 0; saliency_feature_map local_map; local_map.height = stepsize; local_map.width = stepsize; local_map.buf = (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf)); if (!local_map.buf) { return -1; } for (int y = 0; y < input->height - stepsize; y += stepsize) { for (int x = 0; x < input->width - stepsize; x += stepsize) { for (int i = 0; i < stepsize; ++i) { for (int j = 0; j < stepsize; ++j) { local_map.buf[i * stepsize + j] = input->buf[(y + i) * input->width + x + j]; } } find_min_max(&local_map, &lmax, &dummy); lmaxmean += lmax; numlocal++; } } aom_free(local_map.buf); return lmaxmean / numlocal; } // Linear normalization the values in the map to [0,1]. static void minmax_normalize(saliency_feature_map *input) { double max_value, min_value; find_min_max(input, &max_value, &min_value); for (int i = 0; i < input->height; ++i) { for (int j = 0; j < input->width; ++j) { if (max_value != min_value) { input->buf[i * input->width + j] = input->buf[i * input->width + j] / (max_value - min_value) + min_value / (min_value - max_value); } else { input->buf[i * input->width + j] -= min_value; } } } } // This function is to promote meaningful “activation spots” in the map and // ignores homogeneous areas. static int nomalization_operator(saliency_feature_map *input, int stepsize) { minmax_normalize(input); double lmaxmean = average_local_max(input, stepsize); if (lmaxmean < 0) { return 0; } double normCoeff = (1 - lmaxmean) * (1 - lmaxmean); for (int i = 0; i < input->height; ++i) { for (int j = 0; j < input->width; ++j) { input->buf[i * input->width + j] *= normCoeff; } } return 1; } // Normalize the values in feature maps to [0,1], and then upscale all maps to // the original frame size. static int normalize_fm(saliency_feature_map *input[6], int width[9], int height[9], int num_fm, saliency_feature_map *output[6]) { // Feature maps (FM) are generated by function "center_surround_diff()". The // difference is between a fine scale c and a coarser scale s, where c \in {2, // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3: // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8. for (int i = 0; i < num_fm; ++i) { if (nomalization_operator(input[i], 8) == 0) { return 0; } // Upscale FM to original frame size if (upscale_map(input[i]->buf, (i / 2) + 2, 0, height, width, output[i]->buf) == 0) { return 0; } } return 1; } // Combine feature maps with the same category (intensity, color, or // orientation) into one conspicuity map. 
static int normalized_map(saliency_feature_map *input[6], int width[9], int height[9], saliency_feature_map *output) { int num_fm = 6; saliency_feature_map *n_input[6]; for (int i = 0; i < 6; ++i) { n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i])); if (!n_input[i]) { return 0; } n_input[i]->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*n_input[i]->buf)); if (!n_input[i]->buf) { aom_free(n_input[i]); return 0; } n_input[i]->height = height[0]; n_input[i]->width = width[0]; } if (normalize_fm(input, width, height, num_fm, n_input) == 0) { for (int i = 0; i < num_fm; ++i) { aom_free(n_input[i]->buf); aom_free(n_input[i]); } return 0; } // Add up all normalized feature maps with the same category into one map. for (int i = 0; i < num_fm; ++i) { for (int r = 0; r < height[0]; ++r) { for (int c = 0; c < width[0]; ++c) { output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c]; } } } for (int i = 0; i < num_fm; ++i) { aom_free(n_input[i]->buf); aom_free(n_input[i]); } nomalization_operator(output, 8); return 1; } static int normalized_map_rgb(saliency_feature_map *rg_map[6], saliency_feature_map *by_map[6], int width[9], int height[9], saliency_feature_map *output) { saliency_feature_map *color_cm[2]; // 0: color_cm_rg, 1: color_cm_by for (int i = 0; i < 2; ++i) { color_cm[i] = aom_malloc(sizeof(*color_cm[i])); if (!color_cm[i]) { return 0; } color_cm[i]->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*color_cm[i]->buf)); if (!color_cm[i]->buf) { for (int l = 0; l < i; ++l) { aom_free(color_cm[l]->buf); } aom_free(color_cm[i]); return 0; } color_cm[i]->width = width[0]; color_cm[i]->height = height[0]; memset(color_cm[i]->buf, 0, width[0] * height[0] * sizeof(*color_cm[i]->buf)); } if (normalized_map(rg_map, width, height, color_cm[0]) == 0 || normalized_map(by_map, width, height, color_cm[1]) == 0) { for (int i = 0; i < 2; ++i) { aom_free(color_cm[i]->buf); aom_free(color_cm[i]); } return 0; } for (int r = 0; r < height[0]; ++r) { for (int c = 0; c < width[0]; ++c) { output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] + color_cm[1]->buf[r * width[0] + c]; } } for (int i = 0; i < 2; ++i) { aom_free(color_cm[i]->buf); aom_free(color_cm[i]); } nomalization_operator(output, 8); return 1; } static int normalized_map_orientation(saliency_feature_map *orientation_map[24], int width[9], int height[9], saliency_feature_map *output) { int num_fms_per_angle = 6; saliency_feature_map *ofm[4][6]; for (int i = 0; i < num_fms_per_angle; ++i) { for (int j = 0; j < 4; ++j) { ofm[j][i] = orientation_map[j * num_fms_per_angle + i]; } } // extract conspicuity map for each angle saliency_feature_map *nofm = aom_malloc(sizeof(*nofm)); if (!nofm) { return 0; } nofm->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*nofm->buf)); if (!nofm->buf) { aom_free(nofm); return 0; } nofm->height = height[0]; nofm->width = width[0]; for (int i = 0; i < 4; ++i) { memset(nofm->buf, 0, width[0] * height[0] * sizeof(*nofm->buf)); if (normalized_map(ofm[i], width, height, nofm) == 0) { aom_free(nofm->buf); aom_free(nofm); return 0; } for (int r = 0; r < height[0]; ++r) { for (int c = 0; c < width[0]; ++c) { output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c]; } } } aom_free(nofm->buf); aom_free(nofm); nomalization_operator(output, 8); return 1; } // Set pixel level saliency mask based on Itti-Koch algorithm int av1_set_saliency_map(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int frm_width = cm->width; int frm_height = cm->height; int 
pyr_height[9]; int pyr_width[9]; pyr_height[0] = frm_height; pyr_width[0] = frm_width; for (int i = 1; i < 9; ++i) { pyr_width[i] = pyr_width[i - 1] / 2; pyr_height[i] = pyr_height[i - 1] / 2; } double *cr = aom_malloc(frm_width * frm_height * sizeof(*cr)); double *cg = aom_malloc(frm_width * frm_height * sizeof(*cg)); double *cb = aom_malloc(frm_width * frm_height * sizeof(*cb)); double *intensity = aom_malloc(frm_width * frm_height * sizeof(*intensity)); if (!cr || !cg || !cb || !intensity) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); return 0; } // Extract red / green / blue channels and intensity component get_color_intensity(cpi->source, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cr, cg, cb, intensity); // Feature Map Extraction // intensity map saliency_feature_map *i_map[6]; for (int i = 0; i < 6; ++i) { int cur_height = pyr_height[(i / 2) + 2]; int cur_width = pyr_width[(i / 2) + 2]; i_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*i_map[i])); if (!i_map[i]) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < i; ++l) { aom_free(i_map[l]); } return 0; } i_map[i]->buf = (double *)aom_malloc(cur_height * cur_width * sizeof(*i_map[i]->buf)); if (!i_map[i]->buf) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < i; ++l) { aom_free(i_map[l]->buf); aom_free(i_map[l]); } return 0; } i_map[i]->height = cur_height; i_map[i]->width = cur_width; } if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(i_map[l]); } return 0; } // RGB map saliency_feature_map *rg_map[6], *by_map[6]; for (int i = 0; i < 6; ++i) { int cur_height = pyr_height[(i / 2) + 2]; int cur_width = pyr_width[(i / 2) + 2]; rg_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*rg_map[i])); by_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*by_map[i])); if (!rg_map[i] || !by_map[i]) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } return 0; } rg_map[i]->buf = (double *)aom_malloc(cur_height * cur_width * sizeof(*rg_map[i]->buf)); by_map[i]->buf = (double *)aom_malloc(cur_height * cur_width * sizeof(*by_map[i]->buf)); if (!by_map[i]->buf || !rg_map[i]->buf) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(i_map[l]); } for (int l = 0; l < i; ++l) { aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(rg_map[l]); aom_free(by_map[l]); } return 0; } rg_map[i]->height = cur_height; rg_map[i]->width = cur_width; by_map[i]->height = cur_height; by_map[i]->width = cur_width; } if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) == 0) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } return 0; } // Orientation map saliency_feature_map *orientation_map[24]; for (int i = 0; i < 24; ++i) { int cur_height = pyr_height[((i % 6) / 2) + 2]; int cur_width = pyr_width[((i % 6) / 2) + 2]; orientation_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*orientation_map[i])); if (!orientation_map[i]) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); 
for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } for (int h = 0; h < i; ++h) { aom_free(orientation_map[h]->buf); aom_free(orientation_map[h]); } return 0; } orientation_map[i]->buf = (double *)aom_malloc( cur_height * cur_width * sizeof(*orientation_map[i]->buf)); if (!orientation_map[i]->buf) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity);
for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } for (int h = 0; h < i; ++h) { aom_free(orientation_map[h]->buf); aom_free(orientation_map[h]); } return 0; } orientation_map[i]->height = cur_height; orientation_map[i]->width = cur_width; }
if (get_feature_map_orientation(intensity, pyr_width, pyr_height, orientation_map) == 0) { aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } for (int h = 0; h < 24; ++h) { aom_free(orientation_map[h]->buf); aom_free(orientation_map[h]); } return 0; }
aom_free(cr); aom_free(cg); aom_free(cb); aom_free(intensity); saliency_feature_map *normalized_maps[3]; // 0: intensity, 1: color, 2: orientation
for (int i = 0; i < 3; ++i) { normalized_maps[i] = aom_malloc(sizeof(*normalized_maps[i])); if (!normalized_maps[i]) { for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } for (int h = 0; h < 24; ++h) { aom_free(orientation_map[h]->buf); aom_free(orientation_map[h]); } for (int l = 0; l < i; ++l) { aom_free(normalized_maps[l]); } return 0; } normalized_maps[i]->buf = (double *)aom_malloc( frm_width * frm_height * sizeof(*normalized_maps[i]->buf)); if (!normalized_maps[i]->buf) { for (int l = 0; l < 6; ++l) { aom_free(i_map[l]->buf); aom_free(rg_map[l]->buf); aom_free(by_map[l]->buf); aom_free(i_map[l]); aom_free(rg_map[l]); aom_free(by_map[l]); } for (int h = 0; h < 24; ++h) { aom_free(orientation_map[h]->buf); aom_free(orientation_map[h]); } for (int l = 0; l < i; ++l) { aom_free(normalized_maps[l]->buf); aom_free(normalized_maps[l]); } return 0; } normalized_maps[i]->width = frm_width; normalized_maps[i]->height = frm_height; memset(normalized_maps[i]->buf, 0, frm_width * frm_height * sizeof(*normalized_maps[i]->buf)); }
// Conspicuity map generation
if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 || normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height, normalized_maps[1]) == 0 || normalized_map_orientation(orientation_map, pyr_width, pyr_height, normalized_maps[2]) == 0) { for (int i = 0; i < 6; ++i) { aom_free(i_map[i]->buf); aom_free(rg_map[i]->buf); aom_free(by_map[i]->buf); aom_free(i_map[i]); aom_free(rg_map[i]); aom_free(by_map[i]); } for (int i = 0; i < 24; ++i) { aom_free(orientation_map[i]->buf); aom_free(orientation_map[i]); } for (int i = 0; i < 3; ++i) { aom_free(normalized_maps[i]->buf); aom_free(normalized_maps[i]); } return 0; }
for (int i = 0; i < 6; ++i) { aom_free(i_map[i]->buf); aom_free(rg_map[i]->buf); aom_free(by_map[i]->buf); aom_free(i_map[i]); aom_free(rg_map[i]); aom_free(by_map[i]); } for (int i = 0; i < 24; ++i) { aom_free(orientation_map[i]->buf); aom_free(orientation_map[i]); } //
Pixel level saliency map saliency_feature_map *combined_saliency_map = aom_malloc(sizeof(*combined_saliency_map)); if (!combined_saliency_map) { for (int i = 0; i < 3; ++i) { aom_free(normalized_maps[i]->buf); aom_free(normalized_maps[i]); } return 0; } combined_saliency_map->buf = (double *)aom_malloc( frm_width * frm_height * sizeof(*combined_saliency_map->buf)); if (!combined_saliency_map->buf) { for (int i = 0; i < 3; ++i) { aom_free(normalized_maps[i]->buf); aom_free(normalized_maps[i]); } aom_free(combined_saliency_map); return 0; } combined_saliency_map->height = frm_height; combined_saliency_map->width = frm_width; double w_intensity, w_color, w_orient; w_intensity = w_color = w_orient = (double)1 / 3; for (int r = 0; r < frm_height; ++r) { for (int c = 0; c < frm_width; ++c) { combined_saliency_map->buf[r * frm_width + c] = (w_intensity * normalized_maps[0]->buf[r * frm_width + c] + w_color * normalized_maps[1]->buf[r * frm_width + c] + w_orient * normalized_maps[2]->buf[r * frm_width + c]); } } for (int r = 0; r < frm_height; ++r) { for (int c = 0; c < frm_width; ++c) { int index = r * frm_width + c; cpi->saliency_map[index] = (uint8_t)(combined_saliency_map->buf[index] * 255); } } for (int i = 0; i < 3; ++i) { aom_free(normalized_maps[i]->buf); aom_free(normalized_maps[i]); } aom_free(combined_saliency_map->buf); aom_free(combined_saliency_map); return 1; } // Set superblock level saliency mask for rdmult scaling int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) { AV1_COMMON *cm = &cpi->common; saliency_feature_map *sb_saliency_map = aom_malloc(sizeof(saliency_feature_map)); if (sb_saliency_map == NULL) { return 0; } const BLOCK_SIZE bsize = cm->seq_params->sb_size; const int num_mi_w = mi_size_wide[bsize]; const int num_mi_h = mi_size_high[bsize]; const int block_width = block_size_wide[bsize]; const int block_height = block_size_high[bsize]; const int num_sb_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; const int num_sb_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; sb_saliency_map->height = num_sb_rows; sb_saliency_map->width = num_sb_cols; sb_saliency_map->buf = (double *)aom_malloc(num_sb_rows * num_sb_cols * sizeof(*sb_saliency_map->buf)); if (sb_saliency_map->buf == NULL) { aom_free(sb_saliency_map); return 0; } for (int row = 0; row < num_sb_rows; ++row) { for (int col = 0; col < num_sb_cols; ++col) { const int index = row * num_sb_cols + col; double total_pixel = 0; double total_weight = 0; for (int i = 0; i < block_height; i++) { for (int j = 0; j < block_width; j++) { if ((row * block_height + i) >= cpi->common.height || (col * block_width + j) >= cpi->common.width) continue; total_pixel++; total_weight += cpi->saliency_map[(row * block_height + i) * cpi->common.width + col * block_width + j]; } } assert(total_pixel > 0); // Calculate the superblock level saliency map from pixel level saliency // map sb_saliency_map->buf[index] = total_weight / total_pixel; // Further lower the superblock saliency score for boundary superblocks. if (row < 1 || row > num_sb_rows - 2 || col < 1 || col > num_sb_cols - 2) { sb_saliency_map->buf[index] /= 5; } } } // superblock level saliency map finalization minmax_normalize(sb_saliency_map); double log_sum = 0.0; double sum = 0.0; int block_count = 0; // Calculate the average superblock sm_scaling_factor for a frame, to be used // for clamping later. 
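  // Illustrative summary (added for exposition, not part of the upstream
  // comments): with s_i denoting the normalized superblock saliency computed
  // above, the loops below effectively compute, for each superblock i,
  //   sm_scaling_factor[i] ~= max(1 - s_i, 0.8 * mean(1 - s))
  //                           / geomean(max(1 - s, 0.001)) * motion_ratio,
  // i.e. the raw factor 1 - s_i is clamped from below at 80% of the frame
  // average, normalized by the geometric mean of the pre-clamp factors, and
  // finally modulated by the frame-level motion ratio, so more salient
  // superblocks end up with smaller scaling factors.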
for (int row = 0; row < num_sb_rows; ++row) { for (int col = 0; col < num_sb_cols; ++col) { const int index = row * num_sb_cols + col; const double saliency = sb_saliency_map->buf[index]; cpi->sm_scaling_factor[index] = 1 - saliency; sum += cpi->sm_scaling_factor[index]; block_count++; } } assert(block_count > 0); sum /= block_count; // Calculate the geometric mean of superblock sm_scaling_factor for a frame, // to be used for normalization. for (int row = 0; row < num_sb_rows; ++row) { for (int col = 0; col < num_sb_cols; ++col) { const int index = row * num_sb_cols + col; log_sum += log(fmax(cpi->sm_scaling_factor[index], 0.001)); cpi->sm_scaling_factor[index] = fmax(cpi->sm_scaling_factor[index], 0.8 * sum); } } log_sum = exp(log_sum / block_count); // Normalize the sm_scaling_factor by geometric mean. for (int row = 0; row < num_sb_rows; ++row) { for (int col = 0; col < num_sb_cols; ++col) { const int index = row * num_sb_cols + col; assert(log_sum > 0); cpi->sm_scaling_factor[index] /= log_sum; // Modulate the sm_scaling_factor by frame basis motion factor cpi->sm_scaling_factor[index] = cpi->sm_scaling_factor[index] * motion_ratio; } } aom_free(sb_saliency_map->buf); aom_free(sb_saliency_map); return 1; } // av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0, // because the computations need to access the first pass stats which are // only available when CONFIG_REALTIME_ONLY is equal to 0. #if !CONFIG_REALTIME_ONLY // Set motion_ratio that reflects the motion quantities between two consecutive // frames. Motion_ratio will be used to set up saliency_map based rdmult scaling // factor, i.e., the less the motion quantities are, the more bits will be spent // on this frame, and vice versa. double av1_setup_motion_ratio(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; int frames_since_key = cm->current_frame.display_order_hint - cpi->rc.frames_since_key; const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek( &cpi->ppi->twopass.firstpass_info, frames_since_key); assert(cur_stats != NULL); assert(cpi->ppi->twopass.firstpass_info.total_stats.count > 0); const double avg_intra_error = exp(cpi->ppi->twopass.firstpass_info.total_stats.log_intra_error / cpi->ppi->twopass.firstpass_info.total_stats.count); const double avg_inter_error = exp(cpi->ppi->twopass.firstpass_info.total_stats.log_coded_error / cpi->ppi->twopass.firstpass_info.total_stats.count); double inter_error = cur_stats->coded_error; double error_stdev = 0; const double avg_error = cpi->ppi->twopass.firstpass_info.total_stats.intra_error / cpi->ppi->twopass.firstpass_info.total_stats.count; for (int i = 0; i < cpi->ppi->twopass.firstpass_info.total_stats.count; i++) { const FIRSTPASS_STATS *stats = &cpi->ppi->twopass.firstpass_info.stats_buf[i]; error_stdev += (stats->intra_error - avg_error) * (stats->intra_error - avg_error); } error_stdev = sqrt(error_stdev / cpi->ppi->twopass.firstpass_info.total_stats.count); double motion_ratio = 1; if (error_stdev / fmax(avg_intra_error, 1) > 0.1) { motion_ratio = inter_error / fmax(1, avg_inter_error); motion_ratio = AOMMIN(motion_ratio, 1.5); motion_ratio = AOMMAX(motion_ratio, 0.8); } return motion_ratio; } #endif // !CONFIG_REALTIME_ONLY aom-3.12.1/av1/encoder/saliency_map.h000066400000000000000000000020071477627663500173230ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_ #define AOM_AV1_ENCODER_SALIENCY_MAP_H_ #include "av1/encoder/encoder.h" typedef struct saliency_feature_map { double *buf; // stores values of the map in 1D array int height; int width; } saliency_feature_map; int av1_set_saliency_map(AV1_COMP *cpi); #if !CONFIG_REALTIME_ONLY double av1_setup_motion_ratio(AV1_COMP *cpi); #endif int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio); #endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_ aom-3.12.1/av1/encoder/segmentation.c000066400000000000000000000031411477627663500173470ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom_mem/aom_mem.h" #include "av1/common/pred_common.h" #include "av1/common/tile_common.h" #include "av1/encoder/cost.h" #include "av1/encoder/segmentation.h" void av1_enable_segmentation(struct segmentation *seg) { seg->enabled = 1; seg->update_map = 1; seg->update_data = 1; seg->temporal_update = 0; } void av1_disable_segmentation(struct segmentation *seg) { seg->enabled = 0; seg->update_map = 0; seg->update_data = 0; seg->temporal_update = 0; } void av1_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_mask[segment_id] &= ~(1u << feature_id); } void av1_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { seg->feature_data[segment_id][feature_id] = 0; } void av1_reset_segment_features(AV1_COMMON *cm) { struct segmentation *seg = &cm->seg; // Set up default state for MB feature flags seg->enabled = 0; seg->update_map = 0; seg->update_data = 0; av1_clearall_segfeatures(seg); } aom-3.12.1/av1/encoder/segmentation.h000066400000000000000000000024071477627663500173600ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ #define AOM_AV1_ENCODER_SEGMENTATION_H_ #include "av1/common/blockd.h" #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif void av1_enable_segmentation(struct segmentation *seg); void av1_disable_segmentation(struct segmentation *seg); void av1_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); void av1_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); void av1_reset_segment_features(AV1_COMMON *cm); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_SEGMENTATION_H_ aom-3.12.1/av1/encoder/sorting_network.h000066400000000000000000000066001477627663500201200ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*! \file * This file contains several utility functions used to sort small arrays with * sorting networks. * * Sorting network is a (potentially branch-less) way to quickly sort small * arrays with known size. For more details, consult * (https://en.wikipedia.org/wiki/Sorting_network). */ #ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_ #define AOM_AV1_ENCODER_SORTING_NETWORK_H_ #include "aom/aom_integer.h" #define SWAP(i, j) \ do { \ const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \ const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \ const int maxi = (k[i] >= k[j]) ? v[i] : v[j]; \ const int mini = (k[i] >= k[j]) ? v[j] : v[i]; \ k[i] = maxf; \ k[j] = minf; \ v[i] = maxi; \ v[j] = mini; \ } while (0) /*!\brief Sorts two size-16 arrays of keys and values in descending order of * keys. * * \param[in,out] k An length-16 array of float serves as the keys. * \param[in,out] v An length-16 array of int32 serves as the * value. */ static inline void av1_sort_fi32_16(float k[], int32_t v[]) { SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); SWAP(6, 7); SWAP(8, 9); SWAP(10, 11); SWAP(12, 13); SWAP(14, 15); SWAP(0, 2); SWAP(1, 3); SWAP(4, 6); SWAP(5, 7); SWAP(8, 10); SWAP(9, 11); SWAP(12, 14); SWAP(13, 15); SWAP(1, 2); SWAP(5, 6); SWAP(0, 4); SWAP(3, 7); SWAP(9, 10); SWAP(13, 14); SWAP(8, 12); SWAP(11, 15); SWAP(1, 5); SWAP(2, 6); SWAP(9, 13); SWAP(10, 14); SWAP(0, 8); SWAP(7, 15); SWAP(1, 4); SWAP(3, 6); SWAP(9, 12); SWAP(11, 14); SWAP(2, 4); SWAP(3, 5); SWAP(10, 12); SWAP(11, 13); SWAP(1, 9); SWAP(6, 14); SWAP(3, 4); SWAP(11, 12); SWAP(1, 8); SWAP(2, 10); SWAP(5, 13); SWAP(7, 14); SWAP(3, 11); SWAP(2, 8); SWAP(4, 12); SWAP(7, 13); SWAP(3, 10); SWAP(5, 12); SWAP(3, 9); SWAP(6, 12); SWAP(3, 8); SWAP(7, 12); SWAP(5, 9); SWAP(6, 10); SWAP(4, 8); SWAP(7, 11); SWAP(5, 8); SWAP(7, 10); SWAP(6, 8); SWAP(7, 9); SWAP(7, 8); } /*!\brief Sorts two size-8 arrays of keys and values in descending order of * keys. * * \param[in,out] k An length-8 array of float serves as the keys. * \param[in,out] v An length-8 array of int32 serves as the values. 
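 *
 * Illustrative usage sketch (example values chosen here for exposition, not
 * taken from the library):
 *
 *   float cost[8] = { 3.f, 9.f, 1.f, 7.f, 5.f, 2.f, 8.f, 4.f };
 *   int32_t mode[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
 *   av1_sort_fi32_8(cost, mode);
 *   // cost becomes { 9, 8, 7, 5, 4, 3, 2, 1 } (descending) and mode follows
 *   // its keys: { 1, 6, 3, 4, 7, 0, 5, 2 }.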
*/ static inline void av1_sort_fi32_8(float k[], int32_t v[]) { SWAP(0, 1); SWAP(2, 3); SWAP(4, 5); SWAP(6, 7); SWAP(0, 2); SWAP(1, 3); SWAP(4, 6); SWAP(5, 7); SWAP(1, 2); SWAP(5, 6); SWAP(0, 4); SWAP(3, 7); SWAP(1, 5); SWAP(2, 6); SWAP(1, 4); SWAP(3, 6); SWAP(2, 4); SWAP(3, 5); SWAP(3, 4); } #undef SWAP #endif // AOM_AV1_ENCODER_SORTING_NETWORK_H_ aom-3.12.1/av1/encoder/sparse_linear_solver.c000066400000000000000000000304571477627663500211050ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/av1_common_int.h" #include "av1/encoder/sparse_linear_solver.h" #include "config/aom_config.h" #include "aom_mem/aom_mem.h" #include "av1/common/alloccommon.h" #if CONFIG_OPTICAL_FLOW_API /* * Input: * rows: array of row positions * cols: array of column positions * values: array of element values * num_elem: total number of elements in the matrix * num_rows: number of rows in the matrix * num_cols: number of columns in the matrix * * Output: * sm: pointer to the sparse matrix to be initialized * * Return: 0 - success * -1 - failed */ int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values, int num_elem, int num_rows, int num_cols, SPARSE_MTX *sm) { sm->n_elem = num_elem; sm->n_rows = num_rows; sm->n_cols = num_cols; if (num_elem == 0) { sm->row_pos = NULL; sm->col_pos = NULL; sm->value = NULL; return 0; } sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos)); sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos)); sm->value = aom_calloc(num_elem, sizeof(*sm->value)); if (!sm->row_pos || !sm->col_pos || !sm->value) { av1_free_sparse_mtx_elems(sm); return -1; } memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos)); memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos)); memcpy(sm->value, values, num_elem * sizeof(*sm->value)); return 0; } /* * Combines two sparse matrices (allocating new space). 
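 *
 * Illustrative example (added for exposition): to stack sm1 on top of sm2 as
 * [ sm1 ; sm2 ], pass row_offset1 = 0, col_offset1 = 0,
 * row_offset2 = sm1->n_rows, col_offset2 = 0,
 * new_n_rows = sm1->n_rows + sm2->n_rows and new_n_cols equal to the common
 * column count. Every element keeps its value and has its row/column index
 * shifted by the offsets of the matrix it came from.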
* * Input: * sm1, sm2: matrices to be combined * row_offset1, row_offset2: row offset of each matrix in the new matrix * col_offset1, col_offset2: column offset of each matrix in the new matrix * new_n_rows, new_n_cols: number of rows and columns in the new matrix * * Output: * sm: the combined matrix * * Return: 0 - success * -1 - failed */ int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2, SPARSE_MTX *sm, int row_offset1, int col_offset1, int row_offset2, int col_offset2, int new_n_rows, int new_n_cols) { sm->n_elem = sm1->n_elem + sm2->n_elem; sm->n_cols = new_n_cols; sm->n_rows = new_n_rows; if (sm->n_elem == 0) { sm->row_pos = NULL; sm->col_pos = NULL; sm->value = NULL; return 0; } sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos)); sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos)); sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value)); if (!sm->row_pos || !sm->col_pos || !sm->value) { av1_free_sparse_mtx_elems(sm); return -1; } for (int i = 0; i < sm1->n_elem; i++) { sm->row_pos[i] = sm1->row_pos[i] + row_offset1; sm->col_pos[i] = sm1->col_pos[i] + col_offset1; } memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value)); int n_elem1 = sm1->n_elem; for (int i = 0; i < sm2->n_elem; i++) { sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2; sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2; } memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value)); return 0; } void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) { sm->n_cols = 0; sm->n_rows = 0; if (sm->n_elem != 0) { aom_free(sm->row_pos); aom_free(sm->col_pos); aom_free(sm->value); } sm->n_elem = 0; } /* * Calculate matrix and vector multiplication: A*b * * Input: * sm: matrix A * srcv: the vector b to be multiplied to * dstl: the length of vectors * * Output: * dstv: pointer to the resulting vector */ void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv, double *dstv, int dstl) { memset(dstv, 0, sizeof(*dstv) * dstl); for (int i = 0; i < sm->n_elem; i++) { dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i]; } } /* * Calculate matrix and vector multiplication: b*A * * Input: * sm: matrix A * srcv: the vector b to be multiplied to * dstl: the length of vectors * * Output: * dstv: pointer to the resulting vector */ void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv, double *dstv, int dstl) { memset(dstv, 0, sizeof(*dstv) * dstl); for (int i = 0; i < sm->n_elem; i++) { dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i]; } } /* * Calculate inner product of two vectors * * Input: * src1, scr2: the vectors to be multiplied * src1l: length of the vectors * * Output: * the inner product */ double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) { double result = 0; for (int i = 0; i < src1l; i++) { result += src1[i] * src2[i]; } return result; } /* * Multiply each element in the matrix sm with a constant c */ void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) { for (int i = 0; i < sm->n_elem; i++) { sm->value[i] *= c; } } static inline void free_solver_local_buf(double *buf1, double *buf2, double *buf3, double *buf4, double *buf5, double *buf6, double *buf7) { aom_free(buf1); aom_free(buf2); aom_free(buf3); aom_free(buf4); aom_free(buf5); aom_free(buf6); aom_free(buf7); } /* * Solve for Ax = b * no requirement on A * * Input: * A: the sparse matrix * b: the vector b * bl: length of b * x: the vector x * * Output: * x: pointer to the solution vector * * Return: 0 - success * 
-1 - failed */ int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) { double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL, *p_hatA = NULL, *x_hat = NULL; double alpha, beta, rtr, r_norm_2; double denormtemp; // initialize r = aom_calloc(bl, sizeof(*r)); r_hat = aom_calloc(bl, sizeof(*r_hat)); p = aom_calloc(bl, sizeof(*p)); p_hat = aom_calloc(bl, sizeof(*p_hat)); Ap = aom_calloc(bl, sizeof(*Ap)); p_hatA = aom_calloc(bl, sizeof(*p_hatA)); x_hat = aom_calloc(bl, sizeof(*x_hat)); if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) { free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat); return -1; } int i; for (i = 0; i < bl; i++) { r[i] = b[i]; r_hat[i] = b[i]; p[i] = r[i]; p_hat[i] = r_hat[i]; x[i] = 0; x_hat[i] = 0; } r_norm_2 = av1_vect_vect_multi(r_hat, bl, r); for (int k = 0; k < MAX_CG_SP_ITER; k++) { rtr = r_norm_2; av1_mtx_vect_multi_right(A, p, Ap, bl); av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl); denormtemp = av1_vect_vect_multi(p_hat, bl, Ap); if (denormtemp < 1e-10) break; alpha = rtr / denormtemp; r_norm_2 = 0; for (i = 0; i < bl; i++) { x[i] += alpha * p[i]; x_hat[i] += alpha * p_hat[i]; r[i] -= alpha * Ap[i]; r_hat[i] -= alpha * p_hatA[i]; r_norm_2 += r_hat[i] * r[i]; } if (sqrt(r_norm_2) < 1e-2) { break; } if (rtr < 1e-10) break; beta = r_norm_2 / rtr; for (i = 0; i < bl; i++) { p[i] = r[i] + beta * p[i]; p_hat[i] = r_hat[i] + beta * p_hat[i]; } } // free free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat); return 0; } /* * Solve for Ax = b when A is symmetric and positive definite * * Input: * A: the sparse matrix * b: the vector b * bl: length of b * x: the vector x * * Output: * x: pointer to the solution vector * * Return: 0 - success * -1 - failed */ int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) { double *r = NULL, *p = NULL, *Ap = NULL; double alpha, beta, rtr, r_norm_2; double denormtemp; // initialize r = aom_calloc(bl, sizeof(*r)); p = aom_calloc(bl, sizeof(*p)); Ap = aom_calloc(bl, sizeof(*Ap)); if (!r || !p || !Ap) { free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL); return -1; } int i; for (i = 0; i < bl; i++) { r[i] = b[i]; p[i] = r[i]; x[i] = 0; } r_norm_2 = av1_vect_vect_multi(r, bl, r); int k; for (k = 0; k < MAX_CG_SP_ITER; k++) { rtr = r_norm_2; av1_mtx_vect_multi_right(A, p, Ap, bl); denormtemp = av1_vect_vect_multi(p, bl, Ap); if (denormtemp < 1e-10) break; alpha = rtr / denormtemp; r_norm_2 = 0; for (i = 0; i < bl; i++) { x[i] += alpha * p[i]; r[i] -= alpha * Ap[i]; r_norm_2 += r[i] * r[i]; } if (r_norm_2 < 1e-8 * bl) break; if (rtr < 1e-10) break; beta = r_norm_2 / rtr; for (i = 0; i < bl; i++) { p[i] = r[i] + beta * p[i]; } } // free free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL); return 0; } /* * Solve for Ax = b using Jacobi method * * Input: * A: the sparse matrix * b: the vector b * bl: length of b * x: the vector x * * Output: * x: pointer to the solution vector * * Return: 0 - success * -1 - failed */ int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) { double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL, *tempx = NULL; double resi2; diags = aom_calloc(bl, sizeof(*diags)); Rx = aom_calloc(bl, sizeof(*Rx)); x_last = aom_calloc(bl, sizeof(*x_last)); x_cur = aom_calloc(bl, sizeof(*x_cur)); if (!diags || !Rx || !x_last || !x_cur) { free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL); return -1; } int i; memset(x_last, 0, sizeof(*x_last) * bl); // get the 
diagonals of A memset(diags, 0, sizeof(*diags) * bl); for (int c = 0; c < A->n_elem; c++) { if (A->row_pos[c] != A->col_pos[c]) continue; diags[A->row_pos[c]] = A->value[c]; } int k; for (k = 0; k < MAX_CG_SP_ITER; k++) { // R = A - diag(diags) // get R*x_last memset(Rx, 0, sizeof(*Rx) * bl); for (int c = 0; c < A->n_elem; c++) { if (A->row_pos[c] == A->col_pos[c]) continue; Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c]; } resi2 = 0; for (i = 0; i < bl; i++) { x_cur[i] = (b[i] - Rx[i]) / diags[i]; resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]); } if (resi2 <= 1e-10 * bl) break; // swap last & cur buffer ptrs tempx = x_last; x_last = x_cur; x_cur = tempx; } printf("\n numiter: %d\n", k); for (i = 0; i < bl; i++) { x[i] = x_cur[i]; } free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL); return 0; } /* * Solve for Ax = b using Steepest descent method * * Input: * A: the sparse matrix * b: the vector b * bl: length of b * x: the vector x * * Output: * x: pointer to the solution vector * * Return: 0 - success * -1 - failed */ int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) { double *d = NULL, *Ad = NULL, *Ax = NULL; double resi2, resi2_last, dAd, temp; d = aom_calloc(bl, sizeof(*d)); Ax = aom_calloc(bl, sizeof(*Ax)); Ad = aom_calloc(bl, sizeof(*Ad)); if (!d || !Ax || !Ad) { free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL); return -1; } int i; // initialize with 0s resi2 = 0; for (i = 0; i < bl; i++) { x[i] = 0; d[i] = b[i]; resi2 += d[i] * d[i] / bl; } int k; for (k = 0; k < MAX_CG_SP_ITER; k++) { // get A*x_last av1_mtx_vect_multi_right(A, d, Ad, bl); dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad); for (i = 0; i < bl; i++) { temp = dAd * d[i]; x[i] = x[i] + temp; } av1_mtx_vect_multi_right(A, x, Ax, bl); resi2_last = resi2; resi2 = 0; for (i = 0; i < bl; i++) { d[i] = b[i] - Ax[i]; resi2 += d[i] * d[i] / bl; } if (resi2 <= 1e-8) break; if (resi2_last - resi2 < 1e-8) { break; } } free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL); return 0; } #endif // CONFIG_OPTICAL_FLOW_API aom-3.12.1/av1/encoder/sparse_linear_solver.h000066400000000000000000000050331477627663500211020ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ #define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ #ifdef __cplusplus extern "C" { #endif #include "config/aom_config.h" #if CONFIG_OPTICAL_FLOW_API // Number of iterations for solving linear equations. #define MAX_CG_SP_ITER 100 typedef struct { int n_elem; // number of non-zero elements int n_rows; int n_cols; // using arrays to represent non-zero elements. 
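  // Illustrative example (added for exposition, not part of the original
  // comment): the 2x3 matrix
  //   [ 2 0 1 ]
  //   [ 0 3 0 ]
  // can be stored as n_elem = 3, n_rows = 2, n_cols = 3,
  // row_pos = {0, 0, 1}, col_pos = {0, 2, 1}, value = {2.0, 1.0, 3.0}
  // (a coordinate-list / COO style layout with 0-based indices).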
int *col_pos; int *row_pos; // starts with 0 double *value; } SPARSE_MTX; int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values, int num_elem, int num_rows, int num_cols, SPARSE_MTX *sm); int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2, SPARSE_MTX *sm, int row_offset1, int col_offset1, int row_offset2, int col_offset2, int new_n_rows, int new_n_cols); void av1_free_sparse_mtx_elems(SPARSE_MTX *sm); void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv, double *dstv, int dstl); void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv, double *dstv, int dstl); double av1_vect_vect_multi(const double *src1, int src1l, const double *src2); void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c); int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x); int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x); int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x); int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x); #endif // CONFIG_OPTICAL_FLOW_API #ifdef __cplusplus } // extern "C" #endif #endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */ aom-3.12.1/av1/encoder/speed_features.c000066400000000000000000003361201477627663500176560ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/rdopt.h" #include "aom_dsp/aom_dsp_common.h" #define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method // Max speed setting for tx domain evaluation #define MAX_TX_DOMAIN_EVAL_SPEED 5 static const MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; // TODO(huisu@google.com): These settings are pretty relaxed, tune them for // each speed setting static const MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, }; // Threshold values to be used for pruning the txfm_domain_distortion // based on block MSE // Index 0: Default mode evaluation, Winner mode processing is not // applicable (Eg : IntraBc). Index 1: Mode evaluation. // Index 2: Winner mode evaluation. 
Index 1 and 2 are applicable when // enable_winner_mode_for_use_tx_domain_dist speed feature is ON // TODO(any): Experiment the threshold logic based on variance metric static const unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = { { UINT_MAX, UINT_MAX, UINT_MAX }, { 22026, 22026, 22026 }, { 1377, 1377, 1377 }, { 0, 0, 0 } }; // Number of different levels of aggressiveness in using transform domain // distortion during the R-D evaluation based on the speed feature // tx_domain_dist_level. #define TX_DOMAIN_DIST_LEVELS 4 // Transform domain distortion type to be used for default, mode and winner mode // evaluation Index 0: Default mode evaluation, Winner mode processing is not // applicable (Eg : IntraBc). Index 1: Mode evaluation. Index 2: Winner mode // evaluation. Index 1 and 2 are applicable when // enable_winner_mode_for_use_tx_domain_dist speed feature is ON static const unsigned int tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = { { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 } }; // Threshold values to be used for disabling coeff RD-optimization // based on block MSE / qstep^2. // TODO(any): Experiment the threshold logic based on variance metric. // Table has satd and dist threshold value index 0 : dist,index 1: satd // For each row, the indices are as follows. // Index 0: Default mode evaluation, Winner mode processing is not applicable // (Eg : IntraBc) // Index 1: Mode evaluation. // Index 2: Winner mode evaluation. // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed // feature is ON // There are 7 levels with increasing speed, mapping to vertical indices. static const unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = { { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } }, { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } }, { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } }, { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } }, { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } }, { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } }, { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } }, { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } }, { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } } }; // Transform size to be used for default, mode and winner mode evaluation // Index 0: Default mode evaluation, Winner mode processing is not applicable // (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation. // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed // feature is ON static const TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = { { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD }, { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD }, { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }, { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL } }; // Predict transform skip levels to be used for default, mode and winner mode // evaluation. Index 0: Default mode evaluation, Winner mode processing is not // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation // Values indicate the aggressiveness of skip flag prediction. // 0 : no early skip prediction // 1 : conservative early skip prediction using DCT_DCT // 2 : early skip prediction based on SSE static const unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, { 1, 1, 1 }, { 1, 2, 1 } }; // Predict skip or DC block level used during transform type search. 
It is // indexed using the following: // First index : Speed feature 'dc_blk_pred_level' (0 to 3) // Second index : Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and // WINNER_MODE_EVAL). // // The values of predict_dc_levels[][] indicate the aggressiveness of predicting // a block as transform skip or DC only. // Type 0 : No skip block or DC only block prediction // Type 1 : Prediction of skip block based on residual mean and variance // Type 2 : Prediction of skip block or DC only block based on residual mean and // variance static const unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = { { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 } }; #if !CONFIG_FPMT_TEST // This table holds the maximum number of reference frames for global motion. // The table is indexed as per the speed feature 'gm_search_type'. // 0 : All reference frames are allowed. // 1 : All reference frames except L2 and L3 are allowed. // 2 : All reference frames except L2, L3 and ARF2 are allowed. // 3 : No reference frame is allowed. static const int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = { INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0 }; #endif // Qindex threshold levels used for selecting full-pel motion search. // ms_qthresh[i][j][k] indicates the qindex boundary value for 'k'th qindex band // for resolution index 'j' for aggressiveness level 'i'. // Aggressiveness increases from i = 0 to 2. // j = 0: lower than 720p resolution, j = 1: 720p or larger resolution. // Currently invoked only for speed 0, 1 and 2. static const int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } }, { { 170, 50 }, { MAXQ, 200 } }, { { 170, 40 }, { 200, 40 } } }; // Full-pel search methods for aggressive search based on qindex. // Index 0 is for resolutions lower than 720p, index 1 for 720p or larger // resolutions. Currently invoked only for speed 1 and 2. static const SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND }; // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const AV1_COMP *cpi) { return frame_is_kf_gf_arf(cpi); } // Set transform rd gate level for all transform search cases. 
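// Illustrative note (added for exposition): the helper below simply
// broadcasts one gating level to every entry of the array, e.g.
//   int levels[TX_SEARCH_CASES];
//   set_txfm_rd_gate_level(levels, 2);  // every entry of levels[] is now 2
// The level passed in must not exceed MAX_TX_RD_GATE_LEVEL (asserted inside).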
static inline void set_txfm_rd_gate_level( int txfm_rd_gate_level[TX_SEARCH_CASES], int level) { assert(level <= MAX_TX_RD_GATE_LEVEL); for (int idx = 0; idx < TX_SEARCH_CASES; idx++) txfm_rd_gate_level[idx] = level; } static void set_allintra_speed_feature_framesize_dependent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; const bool use_hbd = cpi->oxcf.use_highbitdepth; if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; if (is_720p_or_larger) sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; else sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; } if (is_4k_or_larger) { sf->part_sf.default_min_partition_size = BLOCK_8X8; } // TODO(huisu@google.com): train models for 720P and above. if (!is_720p_or_larger) { sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 sf->part_sf.ml_early_term_after_part_split_level = 1; } if (is_720p_or_larger) { // TODO(chiyotsai@google.com): make this speed feature adaptive based on // current block's vertical texture instead of hardcoded with resolution sf->mv_sf.use_downsampled_sad = 2; } if (speed >= 1) { if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; } else if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } if (!is_720p_or_larger) { sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 } sf->part_sf.ml_early_term_after_part_split_level = 2; } if (speed >= 2) { if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; } else if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); sf->part_sf.partition_search_breakout_rate_thr = 120; } else { sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); sf->part_sf.partition_search_breakout_rate_thr = 100; } if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; } else { if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; } } if (speed >= 3) { sf->part_sf.ml_early_term_after_part_split_level = 0; if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 
25); sf->part_sf.partition_search_breakout_rate_thr = 200; } else { sf->part_sf.max_intra_bsize = BLOCK_32X32; sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); sf->part_sf.partition_search_breakout_rate_thr = 120; } if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; } if (speed >= 4) { if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); } else { sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); } if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; } } if (speed >= 6) { if (is_720p_or_larger) { sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; } else if (is_480p_or_larger) { sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; } if (is_1080p_or_larger) { sf->part_sf.default_min_partition_size = BLOCK_8X8; } sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; } if (speed >= 7) { // TODO(kyslov): add more speed features to control speed/quality } if (speed >= 8) { if (!is_480p_or_larger) { sf->rt_sf.nonrd_check_partition_merge_mode = 2; } if (is_720p_or_larger) { sf->rt_sf.force_large_partition_blocks_intra = 1; } } if (speed >= 9) { // TODO(kyslov): add more speed features to control speed/quality if (!is_4k_or_larger) { // In av1_select_sb_size(), superblock size is set to 64x64 only for // resolutions less than 4k in speed>=9, to improve the multithread // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF // for resolutions >= 4k, the SB size setting can be modified for these // resolutions as well. sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF; } } } static void set_allintra_speed_features_framesize_independent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; const int allow_screen_content_tools = cm->features.allow_screen_content_tools; const int use_hbd = cpi->oxcf.use_highbitdepth; sf->part_sf.less_rectangular_check_level = 1; sf->part_sf.ml_prune_partition = 1; sf->part_sf.prune_ext_partition_types_search_level = 1; sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; sf->part_sf.use_best_rd_for_pruning = 1; sf->intra_sf.intra_pruning_with_hog = 1; sf->intra_sf.prune_luma_palette_size_search_level = 1; sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->intra_sf.early_term_chroma_palette_size_search = 1; sf->tx_sf.adaptive_txb_search_level = 1; sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; sf->tx_sf.model_based_prune_tx_search_level = 1; sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { sf->mv_sf.exhaustive_searches_thresh = (1 << 25); } sf->rd_sf.perform_coeff_opt = 1; sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; if (speed >= 1) { sf->part_sf.intra_cnn_based_part_prune_level = allow_screen_content_tools ? 0 : 2; sf->part_sf.simple_motion_search_early_term_none = 1; // TODO(Venkat): Clean-up frame type dependency for // simple_motion_search_split in partition search function and set the // speed feature accordingly sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 
1 : 2; sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3; sf->part_sf.reuse_best_prediction_for_part_ab = 1; sf->mv_sf.exhaustive_searches_thresh <<= 1; sf->intra_sf.prune_palette_search_level = 1; sf->intra_sf.prune_luma_palette_size_search_level = 2; sf->intra_sf.top_intra_model_count_allowed = 3; sf->tx_sf.adaptive_txb_search_level = 2; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; sf->tx_sf.tx_type_search.skip_tx_search = 1; sf->rd_sf.perform_coeff_opt = 2; sf->rd_sf.tx_domain_dist_level = 1; sf->rd_sf.tx_domain_dist_thres_level = 1; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; sf->lpf_sf.dual_sgr_penalty_level = 1; sf->lpf_sf.enable_sgr_ep_pruning = 1; } if (speed >= 2) { sf->mv_sf.auto_mv_step_size = 1; sf->intra_sf.disable_smooth_intra = 1; sf->intra_sf.intra_pruning_with_hog = 2; sf->intra_sf.prune_filter_intra_level = 1; sf->rd_sf.perform_coeff_opt = 3; sf->lpf_sf.prune_wiener_based_on_src_var = 1; sf->lpf_sf.prune_sgr_based_on_wiener = 1; } if (speed >= 3) { sf->hl_sf.high_precision_mv_usage = CURRENT_Q; sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; sf->part_sf.less_rectangular_check_level = 2; sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1; sf->part_sf.prune_ext_part_using_split_info = 1; sf->mv_sf.full_pixel_search_level = 1; sf->mv_sf.search_method = DIAMOND; // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are // inherited directly from luma hog with some minor tweaking. Eventually we // should run this with a bayesian optimizer to find the Pareto frontier. sf->intra_sf.chroma_intra_pruning_with_hog = 2; sf->intra_sf.intra_pruning_with_hog = 3; sf->intra_sf.prune_palette_search_level = 2; sf->tx_sf.adaptive_txb_search_level = 2; sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true; // TODO(any): evaluate if these lpf features can be moved to speed 2. // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality // loss. sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 
1 : 2; sf->lpf_sf.disable_loop_restoration_chroma = 0; sf->lpf_sf.reduce_wiener_window_size = 1; sf->lpf_sf.prune_wiener_based_on_src_var = 2; } if (speed >= 4) { sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2; sf->part_sf.simple_motion_search_reduce_search_steps = 4; sf->part_sf.prune_ext_part_using_split_info = 2; sf->part_sf.early_term_after_none_split = 1; sf->part_sf.ml_predict_breakout_level = 3; sf->intra_sf.prune_chroma_modes_using_luma_winner = 1; sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; sf->tpl_sf.prune_starting_mv = 2; sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; sf->rd_sf.perform_coeff_opt = 5; sf->rd_sf.tx_domain_dist_thres_level = 3; sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3; sf->mv_sf.reduce_search_range = 1; sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT; sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; } if (speed >= 5) { sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3; sf->part_sf.ext_partition_eval_thresh = allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; sf->part_sf.intra_cnn_based_part_prune_level = allow_screen_content_tools ? 1 : 2; sf->intra_sf.chroma_intra_pruning_with_hog = 3; sf->lpf_sf.use_coarse_filter_level_search = 0; // Disable Wiener and Self-guided Loop restoration filters. sf->lpf_sf.disable_wiener_filter = true; sf->lpf_sf.disable_sgr_filter = true; sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST; } if (speed >= 6) { sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1; sf->intra_sf.prune_filter_intra_level = 2; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; sf->intra_sf.cfl_search_range = 1; sf->intra_sf.top_intra_model_count_allowed = 2; sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1; sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1; sf->part_sf.prune_rectangular_split_based_on_qidx = allow_screen_content_tools ? 0 : 2; sf->part_sf.prune_rect_part_using_4x4_var_deviation = true; sf->part_sf.prune_rect_part_using_none_pred_mode = true; sf->part_sf.prune_sub_8x8_partition_level = allow_screen_content_tools ? 0 : 1; sf->part_sf.prune_part4_search = 3; // TODO(jingning): This might not be a good trade off if the // target image quality is very low. 
sf->part_sf.default_max_partition_size = BLOCK_32X32; sf->mv_sf.use_bsize_dependent_search_method = 1; sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; sf->tx_sf.prune_intra_tx_depths_using_nn = true; sf->rd_sf.perform_coeff_opt = 6; sf->rd_sf.tx_domain_dist_level = 3; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; sf->winner_mode_sf.prune_winner_mode_eval_level = 1; sf->winner_mode_sf.dc_blk_pred_level = 1; } // The following should make all-intra mode speed 7 approximately equal // to real-time speed 6, // all-intra speed 8 close to real-time speed 7, and all-intra speed 9 // close to real-time speed 8 if (speed >= 7) { sf->part_sf.default_min_partition_size = BLOCK_8X8; sf->part_sf.partition_search_type = VAR_BASED_PARTITION; sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; sf->rt_sf.var_part_split_threshold_shift = 7; } if (speed >= 8) { sf->rt_sf.hybrid_intra_pickmode = 1; sf->rt_sf.use_nonrd_pick_mode = 1; sf->rt_sf.nonrd_check_partition_merge_mode = 1; sf->rt_sf.var_part_split_threshold_shift = 8; // Set mask for intra modes. for (int i = 0; i < BLOCK_SIZES; ++i) if (i >= BLOCK_32X32) sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; else // Use DC, H, V intra mode for block sizes < 32X32. sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; } if (speed >= 9) { sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->rt_sf.nonrd_check_partition_merge_mode = 0; sf->rt_sf.hybrid_intra_pickmode = 0; sf->rt_sf.var_part_split_threshold_shift = 9; sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true; sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; } // As the speed feature prune_chroma_modes_using_luma_winner already // constrains the number of chroma directional mode evaluations to a maximum // of 1, the HOG computation and the associated pruning logic does not seem to // help speed-up the chroma mode evaluations. Hence disable the speed feature // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is // enabled. if (sf->intra_sf.prune_chroma_modes_using_luma_winner) sf->intra_sf.chroma_intra_pruning_with_hog = 0; } static void set_good_speed_feature_framesize_dependent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; const bool use_hbd = cpi->oxcf.use_highbitdepth; // Speed features applicable for temporal filtering and tpl modules may be // changed based on frame type at places where the sf is applied (Example : // use_downsampled_sad). This is because temporal filtering and tpl modules // are called before this function (except for the first key frame). // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal // filtering and tpl modules, modify the sf initialization appropriately // before calling the modules. 
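  // Illustrative note (added for exposition): the frame-type flags computed
  // below drive many of the per-speed settings in this function. For example,
  // a frame with update type INTNL_ARF_UPDATE gets is_boosted_arf2_bwd_type
  // == 1 even when frame_is_boosted() returns 0, and only LF_UPDATE frames
  // receive a non-zero limit_inter_mode_cands here.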
const int boosted = frame_is_boosted(cpi); const int is_boosted_arf2_bwd_type = boosted || cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int is_lf_frame = cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE; const int allow_screen_content_tools = cm->features.allow_screen_content_tools; if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; if (is_720p_or_larger) sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; else sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; } if (is_4k_or_larger) { sf->part_sf.default_min_partition_size = BLOCK_8X8; } // TODO(huisu@google.com): train models for 720P and above. if (!is_720p_or_larger) { sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 sf->part_sf.ml_early_term_after_part_split_level = 1; } if (is_720p_or_larger) { // TODO(chiyotsai@google.com): make this speed feature adaptive based on // current block's vertical texture instead of hardcoded with resolution sf->mv_sf.use_downsampled_sad = 2; } if (!is_720p_or_larger) { const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; const int rate_tolerance = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2); } if (speed >= 1) { if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1; if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; } else if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } if (!is_720p_or_larger) { sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 } sf->part_sf.ml_early_term_after_part_split_level = 2; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; } if (speed >= 2) { if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; } else if (is_480p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; } if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); sf->part_sf.partition_search_breakout_rate_thr = 120; } else { sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); sf->part_sf.partition_search_breakout_rate_thr = 100; } if (is_720p_or_larger) { sf->inter_sf.prune_obmc_prob_thresh = 16; } else { sf->inter_sf.prune_obmc_prob_thresh = 8; } if (is_480p_or_larger) { sf->inter_sf.disable_interintra_wedge_var_thresh = 100; } else { sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; } if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1; if (is_720p_or_larger) { 
sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0; } else { sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0; } if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; } else { if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1; sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1; } if (!is_720p_or_larger) { sf->mv_sf.disable_second_mv = 1; sf->mv_sf.auto_mv_step_size = 2; } else { sf->mv_sf.disable_second_mv = boosted ? 0 : 2; sf->mv_sf.auto_mv_step_size = 1; } if (!is_720p_or_larger) { sf->hl_sf.recode_tolerance = 50; sf->inter_sf.disable_interinter_wedge_newmv_search = is_boosted_arf2_bwd_type ? 0 : 1; sf->inter_sf.enable_fast_wedge_mask_search = 1; } } if (speed >= 3) { sf->inter_sf.enable_fast_wedge_mask_search = 1; sf->inter_sf.skip_newmv_in_drl = 2; sf->inter_sf.skip_ext_comp_nearmv_mode = 1; sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0; sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = frame_is_intra_only(&cpi->common) ? 0 : 1; sf->part_sf.ml_early_term_after_part_split_level = 0; if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); sf->part_sf.partition_search_breakout_rate_thr = 200; sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0; } else { sf->part_sf.max_intra_bsize = BLOCK_32X32; sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); sf->part_sf.partition_search_breakout_rate_thr = 120; sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0; } if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; if (is_480p_or_larger) { sf->part_sf.early_term_after_none_split = 1; } else { sf->part_sf.early_term_after_none_split = 0; } if (is_720p_or_larger) { sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2; } else { sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3; } if (is_720p_or_larger) { sf->inter_sf.disable_interinter_wedge_var_thresh = 100; sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1; } else { sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2; } sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; } if (speed >= 4) { sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; if (is_720p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); } else { sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); } sf->part_sf.early_term_after_none_split = 1; if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; } else { sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1; } sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3; if (is_720p_or_larger) { sf->inter_sf.prune_comp_ref_frames = 1; } else if (is_480p_or_larger) { sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 
0 : 1; } if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 32; else sf->hl_sf.recode_tolerance = 55; sf->intra_sf.skip_intra_in_interframe = 4; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3; } if (speed >= 5) { if (is_720p_or_larger) { sf->inter_sf.prune_warped_prob_thresh = 16; } else if (is_480p_or_larger) { sf->inter_sf.prune_warped_prob_thresh = 8; } if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40; sf->inter_sf.skip_newmv_in_drl = 4; sf->inter_sf.prune_comp_ref_frames = 1; sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1; if (!is_720p_or_larger) { sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET; sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = (boosted || allow_screen_content_tools) ? 0 : 1; sf->mv_sf.use_downsampled_sad = 1; } if (!is_480p_or_larger) { sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); } if (is_480p_or_lesser) { sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1; } else { sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2; } if (is_720p_or_larger) sf->part_sf.ext_part_eval_based_on_cur_best = (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1; if (is_480p_or_larger) { sf->tpl_sf.reduce_num_frames = 1; } } if (speed >= 6) { sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3; sf->inter_sf.prune_comp_ref_frames = 2; sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = (boosted || allow_screen_content_tools) ? 0 : 1; sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2; if (is_720p_or_larger) { sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; } else if (is_480p_or_larger) { sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; } if (is_480p_or_larger) { sf->hl_sf.allow_sub_blk_me_in_tf = 1; } if (is_1080p_or_larger) { sf->part_sf.default_min_partition_size = BLOCK_8X8; } if (is_720p_or_larger) { sf->inter_sf.disable_masked_comp = 1; } if (!is_720p_or_larger) { sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; } if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; sf->part_sf.partition_search_breakout_dist_thr = (1 << 28); } else { sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); } if (is_720p_or_larger) { sf->inter_sf.prune_ref_mv_idx_search = 2; } else { sf->inter_sf.prune_ref_mv_idx_search = 1; } if (!is_720p_or_larger) { sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = is_boosted_arf2_bwd_type ? 450 : 150; } sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; sf->hl_sf.recode_tolerance = 55; } } static void set_good_speed_features_framesize_independent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boosted = frame_is_boosted(cpi); const int is_boosted_arf2_bwd_type = boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int is_inter_frame = gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME; const int allow_screen_content_tools = cm->features.allow_screen_content_tools; const int use_hbd = cpi->oxcf.use_highbitdepth; if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) { sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA; } // Speed 0 for all speed features that give neutral coding performance change. 
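  // These defaults apply at every "good" speed; the speed >= N blocks below
  // progressively override them with more aggressive settings.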
sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 : GM_SEARCH_CLOSEST_REFS_ONLY; sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1; sf->gm_sf.disable_gm_search_based_on_stats = 1; sf->part_sf.less_rectangular_check_level = 1; sf->part_sf.ml_prune_partition = 1; sf->part_sf.prune_ext_partition_types_search_level = 1; sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; sf->part_sf.use_best_rd_for_pruning = 1; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0; // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_sf.inter_mode_rd_model_estimation = 1; sf->inter_sf.model_based_post_interp_filter_breakout = 1; sf->inter_sf.prune_compound_using_single_ref = 1; sf->inter_sf.prune_mode_search_simple_translation = 1; sf->inter_sf.prune_ref_frame_for_rect_partitions = (boosted || (allow_screen_content_tools)) ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2); sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2; sf->inter_sf.selective_ref_frame = 1; sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; sf->interp_sf.use_fast_interpolation_filter_search = 1; sf->intra_sf.intra_pruning_with_hog = 1; sf->tx_sf.adaptive_txb_search_level = 1; sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; sf->tx_sf.model_based_prune_tx_search_level = 1; sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; sf->tpl_sf.search_method = NSTEP_8PT; sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { sf->mv_sf.exhaustive_searches_thresh = (1 << 25); } sf->rd_sf.perform_coeff_opt = 1; sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; if (speed >= 1) { sf->hl_sf.adjust_num_frames_for_arf_filtering = allow_screen_content_tools ? 0 : 1; sf->part_sf.intra_cnn_based_part_prune_level = allow_screen_content_tools ? 0 : 2; sf->part_sf.simple_motion_search_early_term_none = 1; // TODO(Venkat): Clean-up frame type dependency for // simple_motion_search_split in partition search function and set the // speed feature accordingly sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3; sf->mv_sf.exhaustive_searches_thresh <<= 1; sf->mv_sf.obmc_full_pixel_search_level = 1; sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; sf->mv_sf.disable_extensive_joint_motion_search = 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; sf->inter_sf.prune_comp_type_by_comp_avg = 1; sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; sf->inter_sf.prune_ref_frame_for_rect_partitions = (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools)) ? 0 : (boosted ? 1 : 2); sf->inter_sf.reduce_inter_modes = boosted ? 
1 : 3; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.selective_ref_frame = 2; sf->inter_sf.skip_arf_compound = 1; sf->interp_sf.use_interp_filter = 1; sf->intra_sf.prune_palette_search_level = 1; sf->tx_sf.adaptive_txb_search_level = 2; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; sf->tx_sf.tx_type_search.skip_tx_search = 1; sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3; sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; sf->rd_sf.tx_domain_dist_thres_level = 1; sf->lpf_sf.dual_sgr_penalty_level = 1; sf->lpf_sf.enable_sgr_ep_pruning = 1; // TODO(any, yunqing): move this feature to speed 0. sf->tpl_sf.skip_alike_starting_mv = 1; } if (speed >= 2) { sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; sf->fp_sf.skip_motion_search_threshold = 25; sf->gm_sf.num_refinement_steps = 2; sf->part_sf.reuse_best_prediction_for_part_ab = !frame_is_intra_only(&cpi->common); sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; sf->mv_sf.subpel_iters_per_step = 1; sf->mv_sf.reduce_search_range = 1; // TODO(chiyotsai@google.com): We can get 10% speed up if we move // adaptive_rd_thresh to speed 1. But currently it performs poorly on some // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a // bit more closely to figure out why. sf->inter_sf.adaptive_rd_thresh = 1; sf->inter_sf.disable_interinter_wedge_var_thresh = 100; sf->inter_sf.fast_interintra_wedge_search = 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; sf->inter_sf.prune_ext_comp_using_neighbors = 1; sf->inter_sf.prune_comp_using_best_single_mode_ref = 2; sf->inter_sf.prune_comp_type_by_comp_avg = 2; sf->inter_sf.selective_ref_frame = 3; sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; sf->inter_sf.enable_fast_compound_mode_search = 1; sf->inter_sf.reuse_mask_search_results = 1; set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1); sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1; sf->inter_sf.alt_ref_search_fp = 1; sf->interp_sf.adaptive_interp_filter_search = 1; sf->interp_sf.disable_dual_filter = 1; sf->intra_sf.disable_smooth_intra = !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1); sf->intra_sf.intra_pruning_with_hog = 2; sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1; sf->intra_sf.skip_filter_intra_in_inter_frames = 1; sf->tpl_sf.prune_starting_mv = 1; sf->tpl_sf.search_method = DIAMOND; sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4; sf->rd_sf.use_mb_rd_hash = 1; sf->lpf_sf.prune_wiener_based_on_src_var = 1; sf->lpf_sf.prune_sgr_based_on_wiener = 1; sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1; sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1; // TODO(any): Re-evaluate this feature set to 1 in speed 2. sf->tpl_sf.allow_compound_pred = 0; sf->tpl_sf.prune_ref_frames_in_tpl = 1; } if (speed >= 3) { sf->hl_sf.high_precision_mv_usage = CURRENT_Q; sf->gm_sf.prune_ref_frame_for_gm_search = 1; sf->gm_sf.prune_zero_mv_with_sse = 1; sf->gm_sf.num_refinement_steps = 0; sf->part_sf.less_rectangular_check_level = 2; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : (boosted ? 
SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1); sf->part_sf.prune_ext_part_using_split_info = 1; sf->part_sf.simple_motion_search_rect_split = 1; sf->mv_sf.full_pixel_search_level = 1; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; sf->mv_sf.search_method = DIAMOND; sf->mv_sf.disable_second_mv = 2; sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1; sf->mv_sf.use_intrabc = 0; sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.disable_onesided_comp = 1; sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 // and clean-up the speed feature sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1; sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; sf->inter_sf.selective_ref_frame = 5; sf->inter_sf.reuse_compound_type_decision = 1; set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2)); sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2; sf->interp_sf.adaptive_interp_filter_search = 2; // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are // inherited directly from luma hog with some minor tweaking. Eventually we // should run this with a bayesian optimizer to find the Pareto frontier. sf->intra_sf.chroma_intra_pruning_with_hog = 2; sf->intra_sf.intra_pruning_with_hog = 3; sf->intra_sf.prune_palette_search_level = 2; sf->intra_sf.top_intra_model_count_allowed = 2; sf->tpl_sf.prune_starting_mv = 2; sf->tpl_sf.skip_alike_starting_mv = 2; sf->tpl_sf.prune_intra_modes = 1; sf->tpl_sf.reduce_first_step_size = 6; sf->tpl_sf.subpel_force_stop = QUARTER_PEL; sf->tpl_sf.gop_length_decision_method = 1; sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; // TODO(any): Refactor the code related to following winner mode speed // features sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; sf->winner_mode_sf.motion_mode_for_winner_cand = boosted ? 0 : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1 : 2; sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4; // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality // loss. sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2; sf->lpf_sf.prune_wiener_based_on_src_var = 2; sf->lpf_sf.use_coarse_filter_level_search = frame_is_intra_only(&cpi->common) ? 0 : 1; sf->lpf_sf.use_downsampled_wiener_stats = 1; } if (speed >= 4) { sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->gm_sf.prune_zero_mv_with_sse = 2; sf->gm_sf.downsample_level = 1; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2; sf->part_sf.simple_motion_search_reduce_search_steps = 4; sf->part_sf.prune_ext_part_using_split_info = 2; sf->part_sf.ml_predict_breakout_level = 3; sf->part_sf.prune_rectangular_split_based_on_qidx = (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0 : 1; sf->inter_sf.alt_ref_search_fp = 2; sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3; sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5; sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 
0 : 3; sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2; sf->inter_sf.prune_ext_comp_using_neighbors = 2; sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; sf->interp_sf.cb_pred_filter_search = 1; sf->interp_sf.skip_sharp_interp_filter_search = 1; sf->interp_sf.use_interp_filter = 2; sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4. // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; sf->intra_sf.skip_intra_in_interframe = 4; sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2; sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; sf->tpl_sf.use_sad_for_mode_decision = 1; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7; // TODO(any): Extend multi-winner mode processing support for inter frames sf->winner_mode_sf.multi_winner_mode_type = frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT : MULTI_WINNER_MODE_OFF; sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2; sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; } if (speed >= 5) { sf->hl_sf.weight_calc_level_in_tf = 1; sf->hl_sf.adjust_num_frames_for_arf_filtering = allow_screen_content_tools ? 0 : 2; sf->fp_sf.reduce_mv_step_param = 4; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3; sf->part_sf.ext_partition_eval_thresh = allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; sf->part_sf.prune_sub_8x8_partition_level = allow_screen_content_tools ? 1 : 2; sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND; sf->inter_sf.prune_inter_modes_if_skippable = 1; sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1; sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4; sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5; sf->inter_sf.enable_fast_compound_mode_search = 2; sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1; sf->intra_sf.chroma_intra_pruning_with_hog = 3; // TODO(any): Extend multi-winner mode processing support for inter frames sf->winner_mode_sf.multi_winner_mode_type = frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST : MULTI_WINNER_MODE_OFF; // Disable Self-guided Loop restoration filter. sf->lpf_sf.disable_sgr_filter = true; sf->lpf_sf.disable_wiener_coeff_refine_search = true; sf->tpl_sf.prune_starting_mv = 3; sf->tpl_sf.use_y_only_rate_distortion = 1; sf->tpl_sf.subpel_force_stop = FULL_PEL; sf->tpl_sf.gop_length_decision_method = 2; sf->tpl_sf.use_sad_for_mode_decision = 2; sf->winner_mode_sf.dc_blk_pred_level = 2; sf->fp_sf.disable_recon = 1; } if (speed >= 6) { sf->hl_sf.disable_extra_sc_testing = 1; sf->hl_sf.second_alt_ref_filtering = 0; sf->gm_sf.downsample_level = 2; sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; sf->inter_sf.selective_ref_frame = 6; sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 
0 : 2; sf->inter_sf.prune_ext_comp_using_neighbors = 3; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC; sf->intra_sf.early_term_chroma_palette_size_search = 1; sf->part_sf.prune_rectangular_split_based_on_qidx = boosted || allow_screen_content_tools ? 0 : 2; sf->part_sf.prune_part4_search = 3; sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL; sf->mv_sf.use_bsize_dependent_search_method = 1; sf->tpl_sf.gop_length_decision_method = 3; sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8; sf->winner_mode_sf.dc_blk_pred_level = 3; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; sf->fp_sf.skip_zeromv_motion_search = 1; } } static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; if (!is_360p_or_larger) { sf->rt_sf.prune_intra_mode_based_on_mv_range = 1; sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; if (speed >= 6) sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2; if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2; if (speed >= 7) { sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; sf->rt_sf.use_rtc_tf = 2; } if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1; if (speed >= 8) { sf->rt_sf.use_nonrd_filter_search = 1; sf->rt_sf.tx_size_level_based_on_qstep = 1; } if (speed >= 9) { sf->rt_sf.use_comp_ref_nonrd = 0; sf->rt_sf.nonrd_aggressive_skip = 1; sf->rt_sf.skip_intra_pred = 1; // Only turn on enable_ref_short_signaling for low resolution when only // LAST and GOLDEN ref frames are used. sf->rt_sf.enable_ref_short_signaling = (!sf->rt_sf.use_nonrd_altref_frame && (!sf->rt_sf.use_comp_ref_nonrd || (!sf->rt_sf.ref_frame_comp_nonrd[1] && !sf->rt_sf.ref_frame_comp_nonrd[2]))); // TODO(kyslov) Re-enable when AV1 models are trained #if 0 #if CONFIG_RT_ML_PARTITIONING if (!frame_is_intra_only(cm)) { sf->part_sf.partition_search_type = ML_BASED_PARTITION; sf->rt_sf.reuse_inter_pred_nonrd = 0; } #endif #endif sf->rt_sf.use_adaptive_subpel_search = false; } if (speed >= 10) { // TODO(yunqingwang@google.com): To be conservative, disable // sf->rt_sf.estimate_motion_for_var_based_partition = 3 for speed 10/qvga // for now. May enable it in the future. sf->rt_sf.estimate_motion_for_var_based_partition = 0; sf->rt_sf.skip_intra_pred = 2; sf->rt_sf.hybrid_intra_pickmode = 3; sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; sf->rt_sf.use_nonrd_filter_search = 0; } } else { sf->rt_sf.prune_intra_mode_based_on_mv_range = 2; sf->intra_sf.skip_filter_intra_in_inter_frames = 1; if (speed <= 5) { sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = boosted ? INT_MAX : 350; sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 
0 : 2; } if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1; if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2; if (speed == 7) { sf->rt_sf.prefer_large_partition_blocks = 1; // Enable this feature for [360p, 720p] resolution range initially. // Only enable for low bitdepth to mitigate issue: b/303023614. if (!cpi->rc.rtc_external_ratectrl && AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth) sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ; } if (speed >= 7) { sf->rt_sf.use_rtc_tf = 1; } if (speed == 8 && !cpi->ppi->use_svc) { sf->rt_sf.short_circuit_low_temp_var = 0; sf->rt_sf.use_nonrd_altref_frame = 1; } if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2; if (speed >= 9) { sf->rt_sf.gf_length_lvl = 1; sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.sad_based_adp_altref_lag = 2; sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; sf->rt_sf.use_adaptive_subpel_search = true; sf->interp_sf.cb_pred_filter_search = 1; } if (speed >= 10) { sf->rt_sf.hybrid_intra_pickmode = 2; sf->rt_sf.sad_based_adp_altref_lag = 4; sf->rt_sf.tx_size_level_based_on_qstep = 0; sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; sf->rt_sf.use_adaptive_subpel_search = false; sf->interp_sf.cb_pred_filter_search = 2; } } if (!is_480p_or_larger) { if (speed == 7) { sf->rt_sf.nonrd_check_partition_merge_mode = 2; } } if (!is_720p_or_larger) { if (speed >= 9) { sf->rt_sf.force_large_partition_blocks_intra = 1; } } else { if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3; if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0; if (speed >= 7) { sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2; sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; } if (speed >= 9) { sf->rt_sf.sad_based_adp_altref_lag = 1; sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0; sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; } if (speed >= 10) { sf->rt_sf.sad_based_adp_altref_lag = 3; sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; } } // TODO(Any): Check/Tune settings of other sfs for 1080p. if (is_1080p_or_larger) { if (speed >= 7) { sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; sf->rt_sf.use_adaptive_subpel_search = 0; } if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0; } else { if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1; } // TODO(marpan): Tune settings for speed 11 video mode, if (speed >= 11 && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.selective_cdf_update = 1; sf->rt_sf.use_nonrd_filter_search = 0; if (is_360p_or_larger) { sf->part_sf.fixed_partition_size = BLOCK_32X32; sf->rt_sf.use_fast_fixed_part = 1; sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2; } sf->rt_sf.increase_source_sad_thresh = 1; sf->rt_sf.part_early_exit_zeromv = 2; sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; for (int i = 0; i < BLOCK_SIZES; ++i) { sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; } sf->rt_sf.hybrid_intra_pickmode = 0; } // Setting for SVC, or when the ref_frame_config control is // used to set the reference structure. if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) { const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; // For SVC: for greater than 2 temporal layers, use better mv search on // base temporal layers, and only on base spatial layer if highest // resolution is above 640x360. 
if (cpi->svc.number_temporal_layers >= 2 && cpi->svc.temporal_layer_id == 0 && (cpi->svc.spatial_layer_id == 0 || cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <= 640 * 360)) { sf->mv_sf.search_method = NSTEP; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; sf->rt_sf.fullpel_search_step_param = 10; sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; if (cm->width * cm->height <= 352 * 288) sf->rt_sf.nonrd_prune_ref_frame_search = 2; sf->rt_sf.force_large_partition_blocks_intra = 0; } if (speed >= 8) { if (cpi->svc.number_temporal_layers > 2) sf->rt_sf.disable_cdf_update_non_reference_frame = true; sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; if (rtc_ref->non_reference_frame) { sf->rt_sf.nonrd_aggressive_skip = 1; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; } } if (speed <= 9 && cpi->svc.number_temporal_layers > 2 && cpi->svc.temporal_layer_id == 0) sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false; else sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; sf->rt_sf.frame_level_mode_cost_update = false; // Compound mode enabling. if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] || rtc_ref->ref_frame_comp[2]) { sf->rt_sf.use_comp_ref_nonrd = 1; sf->rt_sf.ref_frame_comp_nonrd[0] = rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1]; sf->rt_sf.ref_frame_comp_nonrd[1] = rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1]; sf->rt_sf.ref_frame_comp_nonrd[2] = rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1]; } else { sf->rt_sf.use_comp_ref_nonrd = 0; } if (cpi->svc.number_spatial_layers > 1 || cpi->svc.number_temporal_layers > 1) sf->hl_sf.accurate_bit_estimate = 0; sf->rt_sf.estimate_motion_for_var_based_partition = 1; // For single layers RPS: bias/adjustment for recovery frame. if (cpi->ppi->rtc_ref.bias_recovery_frame) { sf->mv_sf.search_method = NSTEP; sf->mv_sf.subpel_search_method = SUBPEL_TREE; sf->rt_sf.fullpel_search_step_param = 8; sf->rt_sf.nonrd_aggressive_skip = 0; } } // Screen settings. if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { // TODO(marpan): Check settings for speed 7 and 8. 
if (speed >= 7) { sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; sf->mv_sf.use_bsize_dependent_search_method = 0; sf->rt_sf.skip_cdef_sb = 1; sf->rt_sf.increase_color_thresh_palette = 1; if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1; } if (speed >= 8) { sf->rt_sf.nonrd_check_partition_merge_mode = 3; sf->rt_sf.nonrd_prune_ref_frame_search = 1; sf->rt_sf.use_nonrd_filter_search = 0; sf->rt_sf.prune_hv_pred_modes_using_src_sad = false; } if (speed >= 9) { sf->rt_sf.prune_idtx_nonrd = 1; sf->rt_sf.part_early_exit_zeromv = 2; sf->rt_sf.skip_lf_screen = 1; sf->rt_sf.nonrd_prune_ref_frame_search = 3; sf->rt_sf.var_part_split_threshold_shift = 10; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; sf->rt_sf.nonrd_check_partition_merge_mode = 0; sf->interp_sf.cb_pred_filter_search = 0; } if (speed >= 10) { if (cm->width * cm->height > 1920 * 1080) sf->part_sf.disable_8x8_part_based_on_qidx = 1; sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80; sf->rt_sf.part_early_exit_zeromv = 1; sf->rt_sf.nonrd_aggressive_skip = 1; sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; sf->rt_sf.hybrid_intra_pickmode = 0; sf->rt_sf.dct_only_palette_nonrd = 1; sf->rt_sf.prune_palette_search_nonrd = 1; sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; sf->rt_sf.rc_faster_convergence_static = 1; sf->rt_sf.rc_compute_spatial_var_sc = 1; } if (speed >= 11) { sf->rt_sf.skip_lf_screen = 2; sf->rt_sf.skip_cdef_sb = 2; sf->rt_sf.prune_palette_search_nonrd = 2; sf->rt_sf.increase_color_thresh_palette = 0; sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; } sf->rt_sf.skip_encoding_non_reference_slide_change = cpi->oxcf.rc_cfg.drop_frames_water_mark > 0 ? 1 : 0; sf->rt_sf.skip_newmv_flat_blocks_screen = 1; sf->rt_sf.use_idtx_nonrd = 1; sf->rt_sf.higher_thresh_scene_detection = 0; sf->rt_sf.use_nonrd_altref_frame = 0; sf->rt_sf.use_rtc_tf = 0; sf->rt_sf.use_comp_ref_nonrd = 0; sf->rt_sf.source_metrics_sb_nonrd = 1; if (cpi->rc.high_source_sad == 1) { sf->rt_sf.prefer_large_partition_blocks = 0; sf->part_sf.max_intra_bsize = BLOCK_128X128; for (int i = 0; i < BLOCK_SIZES; ++i) { if (i > BLOCK_32X32) sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; else sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; } } if (speed >= 11 && cpi->rc.high_motion_content_screen_rtc) { sf->rt_sf.higher_thresh_scene_detection = 1; sf->rt_sf.force_only_last_ref = 1; sf->rt_sf.use_nonrd_filter_search = 0; sf->part_sf.fixed_partition_size = BLOCK_32X32; sf->rt_sf.use_fast_fixed_part = 1; sf->rt_sf.increase_source_sad_thresh = 1; sf->rt_sf.selective_cdf_update = 1; sf->mv_sf.search_method = FAST_DIAMOND; } else if (cpi->rc.max_block_source_sad > 20000 && cpi->rc.frame_source_sad > 100 && speed >= 6 && (cpi->rc.percent_blocks_with_motion > 1 || cpi->svc.last_layer_dropped[0])) { sf->mv_sf.search_method = NSTEP; sf->rt_sf.fullpel_search_step_param = 2; } if (cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { sf->rt_sf.use_idtx_nonrd = 0; sf->rt_sf.prefer_large_partition_blocks = 1; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->rt_sf.fullpel_search_step_param = 10; } sf->rt_sf.partition_direct_merging = 0; sf->hl_sf.accurate_bit_estimate = 0; // This feature is for nonrd_pickmode. 
if (sf->rt_sf.use_nonrd_pick_mode) sf->rt_sf.estimate_motion_for_var_based_partition = 1; else sf->rt_sf.estimate_motion_for_var_based_partition = 0; } if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { sf->rt_sf.use_rtc_tf = 0; // TODO(aomedia:3412): The setting accurate_bit_estimate = 0 // can be removed once it's fixed for lossless mode. sf->hl_sf.accurate_bit_estimate = 0; } if (cpi->oxcf.use_highbitdepth) { // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614. sf->rt_sf.estimate_motion_for_var_based_partition = 0; } if (cpi->oxcf.superres_cfg.enable_superres) { sf->rt_sf.use_rtc_tf = 0; sf->rt_sf.nonrd_prune_ref_frame_search = 1; } // rtc_tf feature allocates new source because of possible // temporal filtering which may change the input source during encoding: // this causes an issue on resized frames when psnr is calculated, // so disable it here for frames that are resized (encoding width/height // different from configured width/height). if (is_psnr_calc_enabled(cpi) && (cpi->oxcf.frm_dim_cfg.width != cm->width || cpi->oxcf.frm_dim_cfg.height != cm->height)) sf->rt_sf.use_rtc_tf = 0; } static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, SPEED_FEATURES *sf, int speed) { AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); // Currently, rt speed 0, 1, 2, 3, 4, 5 are the same. // Following set of speed features are not impacting encoder's decisions as // the relevant tools are disabled by default. sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.prune_compound_using_single_ref = 0; sf->inter_sf.prune_comp_search_by_single_result = 2; sf->inter_sf.prune_comp_type_by_comp_avg = 2; sf->inter_sf.fast_wedge_sign_estimate = 1; sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.disable_interinter_wedge_var_thresh = 100; sf->interp_sf.cb_pred_filter_search = 0; sf->interp_sf.skip_interp_filter_search = 1; sf->part_sf.ml_prune_partition = 1; sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; sf->part_sf.prune_ext_partition_types_search_level = 2; sf->part_sf.less_rectangular_check_level = 2; sf->mv_sf.obmc_full_pixel_search_level = 1; sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->tx_sf.model_based_prune_tx_search_level = 0; sf->lpf_sf.dual_sgr_penalty_level = 1; // Disable Wiener and Self-guided Loop restoration filters. sf->lpf_sf.disable_wiener_filter = true; sf->lpf_sf.disable_sgr_filter = true; sf->intra_sf.prune_palette_search_level = 2; sf->intra_sf.prune_luma_palette_size_search_level = 2; sf->intra_sf.early_term_chroma_palette_size_search = 1; // End of set // TODO(any, yunqing): tune these features for real-time use cases. sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO; sf->hl_sf.frame_parameter_update = 0; sf->inter_sf.model_based_post_interp_filter_breakout = 1; // TODO(any): As per the experiments, this speed feature is doing redundant // computation since the model rd based pruning logic is similar to model rd // based gating when inter_mode_rd_model_estimation = 2. Enable this SF if // either of the condition becomes true. 
  // (1) inter_mode_rd_model_estimation != 2
  // (2) skip_interp_filter_search == 0
  // (3) Motion mode or compound mode is enabled
  sf->inter_sf.prune_mode_search_simple_translation = 0;
  sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
  sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
  sf->inter_sf.selective_ref_frame = 4;
  sf->inter_sf.alt_ref_search_fp = 2;
  set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4);
  sf->inter_sf.limit_txfm_eval_per_mode = 3;
  sf->inter_sf.adaptive_rd_thresh = 4;
  sf->inter_sf.inter_mode_rd_model_estimation = 2;
  sf->inter_sf.prune_inter_modes_if_skippable = 1;
  sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
  sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
  sf->inter_sf.skip_newmv_in_drl = 4;
  sf->interp_sf.use_fast_interpolation_filter_search = 1;
  sf->interp_sf.use_interp_filter = 1;
  sf->interp_sf.adaptive_interp_filter_search = 1;
  sf->interp_sf.disable_dual_filter = 1;
  sf->part_sf.default_max_partition_size = BLOCK_128X128;
  sf->part_sf.default_min_partition_size = BLOCK_8X8;
  sf->part_sf.use_best_rd_for_pruning = 1;
  sf->part_sf.early_term_after_none_split = 1;
  sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
  sf->part_sf.max_intra_bsize = BLOCK_16X16;
  sf->part_sf.partition_search_breakout_rate_thr = 500;
  sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
  sf->part_sf.adjust_var_based_rd_partitioning = 2;
  sf->mv_sf.full_pixel_search_level = 1;
  sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
  sf->mv_sf.auto_mv_step_size = 1;
  sf->mv_sf.subpel_iters_per_step = 1;
  sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
  sf->mv_sf.search_method = FAST_DIAMOND;
  sf->mv_sf.subpel_force_stop = EIGHTH_PEL;
  sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
  for (int i = 0; i < TX_SIZES; ++i) {
    sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
    sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
  }
  sf->intra_sf.skip_intra_in_interframe = 5;
  sf->intra_sf.disable_smooth_intra = 1;
  sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
  sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
  sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
  sf->tx_sf.adaptive_txb_search_level = 2;
  sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
  sf->tx_sf.tx_size_search_lgr_block = 1;
  sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
  sf->tx_sf.tx_type_search.skip_tx_search = 1;
  sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
  sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
  sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
  sf->tx_sf.refine_fast_tx_search_results = 0;
  sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
  sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
  sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
  sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
  sf->rd_sf.simple_model_rd_from_var = 1;
  sf->rd_sf.tx_domain_dist_level = 2;
  sf->rd_sf.tx_domain_dist_thres_level = 2;
  sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
  sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
  sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ?
0 : 3; sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; sf->winner_mode_sf.tx_size_search_level = 1; sf->winner_mode_sf.winner_mode_ifs = 1; sf->rt_sf.check_intra_pred_nonrd = 1; sf->rt_sf.estimate_motion_for_var_based_partition = 2; sf->rt_sf.hybrid_intra_pickmode = 1; sf->rt_sf.use_comp_ref_nonrd = 0; sf->rt_sf.ref_frame_comp_nonrd[0] = 0; sf->rt_sf.ref_frame_comp_nonrd[1] = 0; sf->rt_sf.ref_frame_comp_nonrd[2] = 0; sf->rt_sf.use_nonrd_filter_search = 1; sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; sf->rt_sf.num_inter_modes_for_tx_search = 5; sf->rt_sf.prune_inter_modes_using_temp_var = 1; sf->rt_sf.use_real_time_ref_set = 1; sf->rt_sf.use_simple_rd_model = 1; sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1; // TODO(any): This sf could be removed. sf->rt_sf.short_circuit_low_temp_var = 1; sf->rt_sf.check_scene_detection = 1; if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0; if (cm->current_frame.frame_type != KEY_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR) sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ; // Enable noise estimation only for high resolutions for now. // // Since use_temporal_noise_estimate has no effect for all-intra frame // encoding, it is disabled for this case. if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480) sf->rt_sf.use_temporal_noise_estimate = 1; sf->rt_sf.skip_tx_no_split_var_based_partition = 1; sf->rt_sf.skip_newmv_mode_based_on_sse = 1; sf->rt_sf.mode_search_skip_flags = (cm->current_frame.frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->rt_sf.var_part_split_threshold_shift = 5; if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1; sf->rt_sf.use_fast_fixed_part = 0; sf->rt_sf.increase_source_sad_thresh = 0; if (speed >= 6) { sf->mv_sf.use_fullpel_costlist = 1; sf->rd_sf.tx_domain_dist_thres_level = 3; sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0; sf->inter_sf.limit_inter_mode_cands = 4; sf->inter_sf.prune_warped_prob_thresh = 8; sf->inter_sf.extra_prune_warped = 1; sf->rt_sf.gf_refresh_based_on_qp = 1; sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; sf->rt_sf.var_part_split_threshold_shift = 7; if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 2; sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3; } if (speed >= 7) { sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1; sf->rt_sf.use_comp_ref_nonrd = 1; sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2; sf->part_sf.partition_search_type = VAR_BASED_PARTITION; sf->part_sf.max_intra_bsize = BLOCK_32X32; sf->mv_sf.search_method = FAST_DIAMOND; sf->mv_sf.subpel_force_stop = QUARTER_PEL; sf->inter_sf.inter_mode_rd_model_estimation = 2; // This sf is not applicable in non-rd path. sf->inter_sf.skip_newmv_in_drl = 0; sf->interp_sf.skip_interp_filter_search = 0; // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't // good. May need more study. for (int i = 0; i < TX_SIZES; ++i) { sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL; } sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5; sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; sf->rt_sf.nonrd_prune_ref_frame_search = 1; // This is for rd path only. 
sf->rt_sf.prune_inter_modes_using_temp_var = 0; sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0; sf->rt_sf.prune_intra_mode_based_on_mv_range = 0; #if !CONFIG_REALTIME_ONLY sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0); #else sf->rt_sf.reuse_inter_pred_nonrd = 1; #endif #if CONFIG_AV1_TEMPORAL_DENOISING sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0); #endif sf->rt_sf.short_circuit_low_temp_var = 0; // For spatial layers, only LAST and GOLDEN are currently used in the SVC // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the // get_ref_frame_flags() for some patterns, so disable it here for // spatial layers. sf->rt_sf.use_nonrd_altref_frame = (cpi->svc.number_spatial_layers > 1) ? 0 : 1; sf->rt_sf.use_nonrd_pick_mode = 1; sf->rt_sf.nonrd_check_partition_merge_mode = 3; sf->rt_sf.skip_intra_pred = 1; sf->rt_sf.source_metrics_sb_nonrd = 1; // Set mask for intra modes. for (int i = 0; i < BLOCK_SIZES; ++i) if (i >= BLOCK_32X32) sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; else // Use DC, H, V intra mode for block sizes < 32X32. sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; sf->winner_mode_sf.dc_blk_pred_level = 0; sf->rt_sf.var_part_based_on_qidx = 3; sf->rt_sf.prune_compoundmode_with_singlecompound_var = true; sf->rt_sf.prune_compoundmode_with_singlemode_var = true; sf->rt_sf.skip_compound_based_on_var = true; sf->rt_sf.use_adaptive_subpel_search = true; } if (speed >= 8) { sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2; sf->intra_sf.intra_pruning_with_hog = 1; sf->rt_sf.short_circuit_low_temp_var = 1; sf->rt_sf.use_nonrd_altref_frame = 0; sf->rt_sf.nonrd_prune_ref_frame_search = 2; sf->rt_sf.nonrd_check_partition_merge_mode = 0; sf->rt_sf.var_part_split_threshold_shift = 8; sf->rt_sf.var_part_based_on_qidx = 4; sf->rt_sf.partition_direct_merging = 1; sf->rt_sf.prune_compoundmode_with_singlemode_var = false; sf->mv_sf.use_bsize_dependent_search_method = 2; sf->rt_sf.prune_hv_pred_modes_using_src_sad = true; } if (speed >= 9) { sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3; sf->rt_sf.estimate_motion_for_var_based_partition = 3; sf->rt_sf.prefer_large_partition_blocks = 3; sf->rt_sf.skip_intra_pred = 2; sf->rt_sf.var_part_split_threshold_shift = 9; for (int i = 0; i < BLOCK_SIZES; ++i) sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; sf->rt_sf.var_part_based_on_qidx = 0; sf->rt_sf.frame_level_mode_cost_update = true; sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; sf->rt_sf.use_adaptive_subpel_search = true; sf->mv_sf.use_bsize_dependent_search_method = 0; } if (speed >= 10) { sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4; sf->rt_sf.nonrd_prune_ref_frame_search = 3; sf->rt_sf.var_part_split_threshold_shift = 10; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; } if (speed >= 11 && !frame_is_intra_only(cm) && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { sf->winner_mode_sf.dc_blk_pred_level = 3; } } static inline void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { // best quality defaults hl_sf->frame_parameter_update = 1; hl_sf->recode_loop = ALLOW_RECODE; // Recode loop tolerance %. 
hl_sf->recode_tolerance = 25; hl_sf->high_precision_mv_usage = CURRENT_Q; hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL; hl_sf->disable_extra_sc_testing = 0; hl_sf->second_alt_ref_filtering = 1; hl_sf->adjust_num_frames_for_arf_filtering = 0; hl_sf->accurate_bit_estimate = 0; hl_sf->weight_calc_level_in_tf = 0; hl_sf->allow_sub_blk_me_in_tf = 0; } static inline void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { fp_sf->reduce_mv_step_param = 3; fp_sf->skip_motion_search_threshold = 0; fp_sf->disable_recon = 0; fp_sf->skip_zeromv_motion_search = 0; } static inline void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { tpl_sf->gop_length_decision_method = 0; tpl_sf->prune_intra_modes = 0; tpl_sf->prune_starting_mv = 0; tpl_sf->reduce_first_step_size = 0; tpl_sf->skip_alike_starting_mv = 0; tpl_sf->subpel_force_stop = EIGHTH_PEL; tpl_sf->search_method = NSTEP; tpl_sf->prune_ref_frames_in_tpl = 0; tpl_sf->allow_compound_pred = 1; tpl_sf->use_y_only_rate_distortion = 0; tpl_sf->use_sad_for_mode_decision = 0; tpl_sf->reduce_num_frames = 0; } static inline void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { gm_sf->gm_search_type = GM_FULL_SEARCH; gm_sf->prune_ref_frame_for_gm_search = 0; gm_sf->prune_zero_mv_with_sse = 0; gm_sf->disable_gm_search_based_on_stats = 0; gm_sf->downsample_level = 0; gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; } static inline void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->partition_search_type = SEARCH_PARTITION; part_sf->less_rectangular_check_level = 0; part_sf->use_square_partition_only_threshold = BLOCK_128X128; part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; part_sf->default_max_partition_size = BLOCK_LARGEST; part_sf->default_min_partition_size = BLOCK_4X4; part_sf->adjust_var_based_rd_partitioning = 0; part_sf->max_intra_bsize = BLOCK_LARGEST; // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. part_sf->fixed_partition_size = BLOCK_16X16; // Recode loop tolerance %. part_sf->partition_search_breakout_dist_thr = 0; part_sf->partition_search_breakout_rate_thr = 0; part_sf->prune_ext_partition_types_search_level = 0; part_sf->prune_part4_search = 0; part_sf->ml_prune_partition = 0; part_sf->ml_early_term_after_part_split_level = 0; for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { part_sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled. 
} part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0; part_sf->simple_motion_search_split = 0; part_sf->simple_motion_search_prune_rect = 0; part_sf->simple_motion_search_early_term_none = 0; part_sf->simple_motion_search_reduce_search_steps = 0; part_sf->intra_cnn_based_part_prune_level = 0; part_sf->ext_partition_eval_thresh = BLOCK_8X8; part_sf->rect_partition_eval_thresh = BLOCK_128X128; part_sf->ext_part_eval_based_on_cur_best = 0; part_sf->prune_ext_part_using_split_info = 0; part_sf->prune_rectangular_split_based_on_qidx = 0; part_sf->prune_rect_part_using_4x4_var_deviation = false; part_sf->prune_rect_part_using_none_pred_mode = false; part_sf->early_term_after_none_split = 0; part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; part_sf->simple_motion_search_rect_split = 0; part_sf->reuse_prev_rd_results_for_part_ab = 0; part_sf->reuse_best_prediction_for_part_ab = 0; part_sf->use_best_rd_for_pruning = 0; part_sf->skip_non_sq_part_based_on_none = 0; part_sf->disable_8x8_part_based_on_qidx = 0; } static inline void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { mv_sf->full_pixel_search_level = 0; mv_sf->auto_mv_step_size = 0; mv_sf->exhaustive_searches_thresh = 0; mv_sf->obmc_full_pixel_search_level = 0; mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED; mv_sf->reduce_search_range = 0; mv_sf->search_method = NSTEP; mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL; mv_sf->subpel_force_stop = EIGHTH_PEL; mv_sf->subpel_iters_per_step = 2; mv_sf->subpel_search_method = SUBPEL_TREE; mv_sf->use_accurate_subpel_search = USE_8_TAPS; mv_sf->use_bsize_dependent_search_method = 0; mv_sf->use_fullpel_costlist = 0; mv_sf->use_downsampled_sad = 0; mv_sf->disable_extensive_joint_motion_search = 0; mv_sf->disable_second_mv = 0; mv_sf->skip_fullpel_search_using_startmv = 0; mv_sf->warp_search_method = WARP_SEARCH_SQUARE; mv_sf->warp_search_iters = 8; mv_sf->use_intrabc = 1; } static inline void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->adaptive_rd_thresh = 0; inter_sf->model_based_post_interp_filter_breakout = 0; inter_sf->reduce_inter_modes = 0; inter_sf->alt_ref_search_fp = 0; inter_sf->prune_single_ref = 0; inter_sf->prune_comp_ref_frames = 0; inter_sf->selective_ref_frame = 0; inter_sf->prune_ref_frame_for_rect_partitions = 0; inter_sf->fast_wedge_sign_estimate = 0; inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; inter_sf->reuse_inter_intra_mode = 0; inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB; inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB; inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB; inter_sf->prune_inter_modes_based_on_tpl = 0; inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF; inter_sf->prune_comp_search_by_single_result = 0; inter_sf->skip_repeated_ref_mv = 0; inter_sf->skip_newmv_in_drl = 0; inter_sf->inter_mode_rd_model_estimation = 0; inter_sf->prune_compound_using_single_ref = 0; inter_sf->prune_ext_comp_using_neighbors = 0; inter_sf->skip_ext_comp_nearmv_mode = 0; inter_sf->prune_comp_using_best_single_mode_ref = 0; inter_sf->prune_nearest_near_mv_using_refmv_weight = 0; inter_sf->disable_onesided_comp = 0; inter_sf->prune_mode_search_simple_translation = 0; inter_sf->prune_comp_type_by_comp_avg = 0; inter_sf->disable_interinter_wedge_newmv_search = 0; inter_sf->fast_interintra_wedge_search = 0; inter_sf->prune_comp_type_by_model_rd = 0; inter_sf->perform_best_rd_based_gating_for_chroma = 0; inter_sf->prune_obmc_prob_thresh = 0; inter_sf->disable_interinter_wedge_var_thresh = 0; 
inter_sf->disable_interintra_wedge_var_thresh = 0; inter_sf->prune_ref_mv_idx_search = 0; inter_sf->prune_warped_prob_thresh = 0; inter_sf->reuse_compound_type_decision = 0; inter_sf->prune_inter_modes_if_skippable = 0; inter_sf->disable_masked_comp = 0; inter_sf->enable_fast_compound_mode_search = 0; inter_sf->reuse_mask_search_results = 0; inter_sf->enable_fast_wedge_mask_search = 0; inter_sf->inter_mode_txfm_breakout = 0; inter_sf->limit_inter_mode_cands = 0; inter_sf->limit_txfm_eval_per_mode = 0; inter_sf->skip_arf_compound = 0; set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0); } static inline void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { interp_sf->adaptive_interp_filter_search = 0; interp_sf->cb_pred_filter_search = 0; interp_sf->disable_dual_filter = 0; interp_sf->skip_sharp_interp_filter_search = 0; interp_sf->use_fast_interpolation_filter_search = 0; interp_sf->use_interp_filter = 0; interp_sf->skip_interp_filter_search = 0; } static inline void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; intra_sf->skip_intra_in_interframe = 1; intra_sf->intra_pruning_with_hog = 0; intra_sf->chroma_intra_pruning_with_hog = 0; intra_sf->prune_palette_search_level = 0; intra_sf->prune_luma_palette_size_search_level = 0; for (int i = 0; i < TX_SIZES; i++) { intra_sf->intra_y_mode_mask[i] = INTRA_ALL; intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; } intra_sf->disable_smooth_intra = 0; intra_sf->prune_smooth_intra_mode_for_chroma = 0; intra_sf->prune_filter_intra_level = 0; intra_sf->prune_chroma_modes_using_luma_winner = 0; intra_sf->cfl_search_range = 3; intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT; intra_sf->adapt_top_model_rd_count_using_neighbors = 0; intra_sf->early_term_chroma_palette_size_search = 0; intra_sf->skip_filter_intra_in_inter_frames = 0; intra_sf->prune_luma_odd_delta_angles_in_intra = 0; } static inline void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { tx_sf->inter_tx_size_search_init_depth_sqr = 0; tx_sf->inter_tx_size_search_init_depth_rect = 0; tx_sf->intra_tx_size_search_init_depth_rect = 0; tx_sf->intra_tx_size_search_init_depth_sqr = 0; tx_sf->tx_size_search_lgr_block = 0; tx_sf->model_based_prune_tx_search_level = 0; tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1; tx_sf->tx_type_search.ml_tx_split_thresh = 8500; tx_sf->tx_type_search.use_skip_flag_prediction = 1; tx_sf->tx_type_search.use_reduced_intra_txset = 0; tx_sf->tx_type_search.fast_intra_tx_type_search = 0; tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX; tx_sf->tx_type_search.skip_tx_search = 0; tx_sf->tx_type_search.prune_tx_type_using_stats = 0; tx_sf->tx_type_search.prune_tx_type_est_rd = 0; tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0; tx_sf->txb_split_cap = 1; tx_sf->adaptive_txb_search_level = 0; tx_sf->refine_fast_tx_search_results = 1; tx_sf->prune_tx_size_level = 0; tx_sf->prune_intra_tx_depths_using_nn = false; tx_sf->use_rd_based_breakout_for_intra_tx_search = false; } static inline void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, const AV1EncoderConfig *oxcf) { const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant; if (disable_trellis_quant == 3) { rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) ? NO_ESTIMATE_YRD_TRELLIS_OPT : NO_TRELLIS_OPT; } else if (disable_trellis_quant == 2) { rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) ? 
FINAL_PASS_TRELLIS_OPT : NO_TRELLIS_OPT; } else if (disable_trellis_quant == 0) { if (is_lossless_requested(&oxcf->rc_cfg)) { rd_sf->optimize_coefficients = NO_TRELLIS_OPT; } else { rd_sf->optimize_coefficients = FULL_TRELLIS_OPT; } } else if (disable_trellis_quant == 1) { rd_sf->optimize_coefficients = NO_TRELLIS_OPT; } else { assert(0 && "Invalid disable_trellis_quant value"); } rd_sf->use_mb_rd_hash = 0; rd_sf->simple_model_rd_from_var = 0; rd_sf->tx_domain_dist_level = 0; rd_sf->tx_domain_dist_thres_level = 0; rd_sf->perform_coeff_opt = 0; } static inline void init_winner_mode_sf( WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { winner_mode_sf->motion_mode_for_winner_cand = 0; // Set this at the appropriate speed levels winner_mode_sf->tx_size_search_level = 0; winner_mode_sf->enable_winner_mode_for_coeff_opt = 0; winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0; winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0; winner_mode_sf->multi_winner_mode_type = 0; winner_mode_sf->dc_blk_pred_level = 0; winner_mode_sf->winner_mode_ifs = 0; winner_mode_sf->prune_winner_mode_eval_level = 0; } static inline void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { lpf_sf->disable_loop_restoration_chroma = 0; lpf_sf->disable_loop_restoration_luma = 0; lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX; lpf_sf->prune_wiener_based_on_src_var = 0; lpf_sf->prune_sgr_based_on_wiener = 0; lpf_sf->enable_sgr_ep_pruning = 0; lpf_sf->reduce_wiener_window_size = 0; lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; lpf_sf->use_coarse_filter_level_search = 0; lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH; // Set decoder side speed feature to use less dual sgr modes lpf_sf->dual_sgr_penalty_level = 0; // Enable Wiener and Self-guided Loop restoration filters by default. 
lpf_sf->disable_wiener_filter = false; lpf_sf->disable_sgr_filter = false; lpf_sf->disable_wiener_coeff_refine_search = false; lpf_sf->use_downsampled_wiener_stats = 0; } static inline void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { rt_sf->check_intra_pred_nonrd = 0; rt_sf->skip_intra_pred = 0; rt_sf->estimate_motion_for_var_based_partition = 0; rt_sf->nonrd_check_partition_merge_mode = 0; rt_sf->nonrd_check_partition_split = 0; rt_sf->mode_search_skip_flags = 0; rt_sf->nonrd_prune_ref_frame_search = 0; rt_sf->use_nonrd_pick_mode = 0; rt_sf->use_nonrd_altref_frame = 0; rt_sf->use_comp_ref_nonrd = 0; rt_sf->use_real_time_ref_set = 0; rt_sf->short_circuit_low_temp_var = 0; rt_sf->reuse_inter_pred_nonrd = 0; rt_sf->num_inter_modes_for_tx_search = INT_MAX; rt_sf->use_nonrd_filter_search = 0; rt_sf->use_simple_rd_model = 0; rt_sf->hybrid_intra_pickmode = 0; rt_sf->prune_palette_search_nonrd = 0; rt_sf->source_metrics_sb_nonrd = 0; rt_sf->overshoot_detection_cbr = NO_DETECTION; rt_sf->check_scene_detection = 0; rt_sf->rc_adjust_keyframe = 0; rt_sf->rc_compute_spatial_var_sc = 0; rt_sf->prefer_large_partition_blocks = 0; rt_sf->use_temporal_noise_estimate = 0; rt_sf->fullpel_search_step_param = 0; for (int i = 0; i < BLOCK_SIZES; ++i) rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL; rt_sf->prune_hv_pred_modes_using_src_sad = false; rt_sf->nonrd_aggressive_skip = 0; rt_sf->skip_cdef_sb = 0; rt_sf->force_large_partition_blocks_intra = 0; rt_sf->skip_tx_no_split_var_based_partition = 0; rt_sf->skip_newmv_mode_based_on_sse = 0; rt_sf->gf_length_lvl = 0; rt_sf->prune_inter_modes_with_golden_ref = 0; rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0; rt_sf->prune_inter_modes_using_temp_var = 0; rt_sf->reduce_mv_pel_precision_highmotion = 0; rt_sf->reduce_mv_pel_precision_lowcomplex = 0; rt_sf->prune_intra_mode_based_on_mv_range = 0; rt_sf->var_part_split_threshold_shift = 7; rt_sf->gf_refresh_based_on_qp = 0; rt_sf->use_rtc_tf = 0; rt_sf->use_idtx_nonrd = 0; rt_sf->prune_idtx_nonrd = 0; rt_sf->dct_only_palette_nonrd = 0; rt_sf->part_early_exit_zeromv = 0; rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; rt_sf->skip_lf_screen = 0; rt_sf->thresh_active_maps_skip_lf_cdef = 100; rt_sf->sad_based_adp_altref_lag = 0; rt_sf->partition_direct_merging = 0; rt_sf->var_part_based_on_qidx = 0; rt_sf->tx_size_level_based_on_qstep = 0; rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false; rt_sf->prune_compoundmode_with_singlecompound_var = false; rt_sf->frame_level_mode_cost_update = false; rt_sf->prune_h_pred_using_best_mode_so_far = false; rt_sf->enable_intra_mode_pruning_using_neighbors = false; rt_sf->prune_intra_mode_using_best_sad_so_far = false; rt_sf->check_only_zero_zeromv_on_large_blocks = false; rt_sf->disable_cdf_update_non_reference_frame = false; rt_sf->prune_compoundmode_with_singlemode_var = false; rt_sf->skip_compound_based_on_var = false; rt_sf->set_zeromv_skip_based_on_source_sad = 1; rt_sf->use_adaptive_subpel_search = false; rt_sf->screen_content_cdef_filter_qindex_thresh = 0; rt_sf->enable_ref_short_signaling = false; rt_sf->check_globalmv_on_single_ref = true; rt_sf->increase_color_thresh_palette = false; rt_sf->selective_cdf_update = 0; rt_sf->force_only_last_ref = 0; rt_sf->higher_thresh_scene_detection = 1; rt_sf->skip_newmv_flat_blocks_screen = 0; rt_sf->skip_encoding_non_reference_slide_change = 0; rt_sf->rc_faster_convergence_static = 0; } static fractional_mv_step_fp *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = { av1_find_best_sub_pixel_tree, // 
SUBPEL_TREE = 0 av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1 av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2 }; // Populate appropriate sub-pel search method based on speed feature and user // specified settings static void set_subpel_search_method( MotionVectorSearchParams *mv_search_params, unsigned int motion_vector_unit_test, SUBPEL_SEARCH_METHOD subpel_search_method) { assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE); mv_search_params->find_fractional_mv_step = fractional_mv_search[subpel_search_method]; // This is only used in motion vector unit test. if (motion_vector_unit_test == 1) mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv; else if (motion_vector_unit_test == 2) mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; const AV1EncoderConfig *const oxcf = &cpi->oxcf; switch (oxcf->mode) { case GOOD: set_good_speed_feature_framesize_dependent(cpi, sf, speed); break; case ALLINTRA: set_allintra_speed_feature_framesize_dependent(cpi, sf, speed); break; case REALTIME: set_rt_speed_feature_framesize_dependent(cpi, sf, speed); break; } if (!cpi->ppi->seq_params_locked) { cpi->common.seq_params->enable_masked_compound &= !sf->inter_sf.disable_masked_comp; cpi->common.seq_params->enable_interintra_compound &= (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } set_subpel_search_method(&cpi->mv_search_params, cpi->oxcf.unit_test_cfg.motion_vector_unit_test, sf->mv_sf.subpel_search_method); // For multi-thread use case with row_mt enabled, cost update for a set of // SB rows is not desirable. Hence, the sf mv_cost_upd_level is set to // INTERNAL_COST_UPD_SBROW in such cases. if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) { if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) { // Set mv_cost_upd_level to use row level update. sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; } } } void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; const AV1EncoderConfig *const oxcf = &cpi->oxcf; int i; init_hl_sf(&sf->hl_sf); init_fp_sf(&sf->fp_sf); init_tpl_sf(&sf->tpl_sf); init_gm_sf(&sf->gm_sf); init_part_sf(&sf->part_sf); init_mv_sf(&sf->mv_sf); init_inter_sf(&sf->inter_sf); init_interp_sf(&sf->interp_sf); init_intra_sf(&sf->intra_sf); init_tx_sf(&sf->tx_sf); init_rd_sf(&sf->rd_sf, oxcf); init_winner_mode_sf(&sf->winner_mode_sf); init_lpf_sf(&sf->lpf_sf); init_rt_sf(&sf->rt_sf); switch (oxcf->mode) { case GOOD: set_good_speed_features_framesize_independent(cpi, sf, speed); break; case ALLINTRA: set_allintra_speed_features_framesize_independent(cpi, sf, speed); break; case REALTIME: set_rt_speed_features_framesize_independent(cpi, sf, speed); break; } // Note: when use_nonrd_pick_mode is true, the transform size is the // minimum of 16x16 and the largest possible size of the current block, // which conflicts with the speed feature "enable_tx_size_search". if (!oxcf->txfm_cfg.enable_tx_size_search && sf->rt_sf.use_nonrd_pick_mode == 0) { sf->winner_mode_sf.tx_size_search_level = 3; } if (cpi->mt_info.num_workers > 1) { // Loop restoration stage is conditionally disabled for speed 5, 6 when // num_workers > 1. 
Since av1_pick_filter_restoration() is not // multi-threaded, enabling the Loop restoration stage will cause an // increase in encode time (3% to 7% increase, depending on frame // resolution). // TODO(aomedia:3446): Implement multi-threading of // av1_pick_filter_restoration() and enable Wiener filter for speed 5, 6 // similar to single thread encoding path. if (speed >= 5) { sf->lpf_sf.disable_sgr_filter = true; sf->lpf_sf.disable_wiener_filter = true; } } if (!cpi->ppi->seq_params_locked) { cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &= (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); cpi->common.seq_params->enable_dual_filter &= !sf->interp_sf.disable_dual_filter; // Set the flag 'enable_restoration' if one of the Loop restoration filters // (i.e., Wiener or Self-guided) is enabled. cpi->common.seq_params->enable_restoration &= (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter); cpi->common.seq_params->enable_interintra_compound &= (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); for (i = 0; i < MAX_MESH_STEP; ++i) { sf->mv_sf.mesh_patterns[i].range = good_quality_mesh_patterns[mesh_speed][i].range; sf->mv_sf.mesh_patterns[i].interval = good_quality_mesh_patterns[mesh_speed][i].interval; } // Update the mesh pattern of exhaustive motion search for intraBC. // Though the intraBC mesh pattern is populated for all frame types, it is // used only for intra frames of screen content. for (i = 0; i < MAX_MESH_STEP; ++i) { sf->mv_sf.intrabc_mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range; sf->mv_sf.intrabc_mesh_patterns[i].interval = intrabc_mesh_patterns[mesh_speed][i].interval; } // Slow quant, dct and trellis are not worthwhile for the first pass, // so make sure they are always turned off. if (is_stat_generation_stage(cpi)) sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; // No recode for 1 pass.
if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi)) sf->hl_sf.recode_loop = DISALLOW_RECODE; set_subpel_search_method(&cpi->mv_search_params, cpi->oxcf.unit_test_cfg.motion_vector_unit_test, sf->mv_sf.subpel_search_method); // assert ensures that tx_domain_dist_level is accessed correctly assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 && cpi->sf.rd_sf.tx_domain_dist_thres_level < 4); memcpy(winner_mode_params->tx_domain_dist_threshold, tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level], sizeof(winner_mode_params->tx_domain_dist_threshold)); assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 && cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS); memcpy(winner_mode_params->use_transform_domain_distortion, tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level], sizeof(winner_mode_params->use_transform_domain_distortion)); // assert ensures that coeff_opt_thresholds is accessed correctly assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 && cpi->sf.rd_sf.perform_coeff_opt < 9); memcpy(winner_mode_params->coeff_opt_thresholds, &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt], sizeof(winner_mode_params->coeff_opt_thresholds)); // assert ensures that predict_skip_levels is accessed correctly assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 && cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3); memcpy(winner_mode_params->skip_txfm_level, predict_skip_levels[cpi->sf.tx_sf.tx_type_search .use_skip_flag_prediction], sizeof(winner_mode_params->skip_txfm_level)); // assert ensures that tx_size_search_level is accessed correctly assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 && cpi->sf.winner_mode_sf.tx_size_search_level <= 3); memcpy(winner_mode_params->tx_size_search_methods, tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level], sizeof(winner_mode_params->tx_size_search_methods)); memcpy(winner_mode_params->predict_dc_level, predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level], sizeof(winner_mode_params->predict_dc_level)); if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) { if (sf->inter_sf.inter_mode_rd_model_estimation == 1) { // Revert to type 2 sf->inter_sf.inter_mode_rd_model_estimation = 2; } #if !CONFIG_FPMT_TEST // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve // better parallelism when the number of available threads is greater than // or equal to the maximum number of reference frames allowed for global // motion. if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH && (cpi->mt_info.num_workers >= gm_available_reference_frames[sf->gm_sf.gm_search_type])) sf->gm_sf.prune_ref_frame_for_gm_search = 0; #endif } // This only applies to the real time mode. Adaptive gf refresh is disabled // if the gf_cbr_boost_pct set by the user is larger than 0.
if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0) sf->rt_sf.gf_refresh_based_on_qp = 0; } // Override some speed features based on qindex void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; const int boosted = frame_is_boosted(cpi); const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440; const int is_arf2_bwd_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; if (cpi->oxcf.mode == REALTIME) { if (speed >= 6) { const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150); sf->part_sf.adjust_var_based_rd_partitioning = frame_is_intra_only(cm) ? 0 : cm->quant_params.base_qindex > qindex_thresh; } return; } if (speed == 0) { // qindex_thresh for resolution < 720p const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140); if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) { sf->part_sf.simple_motion_search_split = cm->features.allow_screen_content_tools ? 1 : 2; sf->part_sf.simple_motion_search_early_term_none = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; } if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) { sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger; memcpy(winner_mode_params->coeff_opt_thresholds, &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt], sizeof(winner_mode_params->coeff_opt_thresholds)); sf->part_sf.simple_motion_search_split = cm->features.allow_screen_content_tools ? 1 : 2; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) { sf->inter_sf.selective_ref_frame = 2; sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; sf->rd_sf.tx_domain_dist_thres_level = 1; sf->part_sf.simple_motion_search_early_term_none = 1; sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; sf->interp_sf.cb_pred_filter_search = 0; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; sf->tx_sf.tx_type_search.skip_tx_search = 1; } } } if (speed >= 2) { // Disable extended partitions for lower quantizers const int aggr = AOMMIN(4, speed - 2); const int qindex_thresh1[4] = { 50, 50, 80, 100 }; const int qindex_thresh2[4] = { 80, 100, 120, 160 }; int qindex_thresh; if (aggr <= 1) { const int qthresh2 = (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr]; qindex_thresh = cm->features.allow_screen_content_tools ? qindex_thresh1[aggr] : qthresh2; if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } else if (aggr <= 2) { qindex_thresh = boosted ? 
qindex_thresh1[aggr] : qindex_thresh2[aggr]; if (cm->quant_params.base_qindex <= qindex_thresh && !frame_is_intra_only(cm)) sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } else if (aggr <= 3) { if (!is_480p_or_larger) { sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } else if (!is_720p_or_larger && !frame_is_intra_only(cm) && !cm->features.allow_screen_content_tools) { sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } else { qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr]; if (cm->quant_params.base_qindex <= qindex_thresh && !frame_is_intra_only(cm)) sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } } else { sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } } if (speed >= 4) { // Disable rectangular partitions for lower quantizers const int aggr = AOMMIN(1, speed - 4); const int qindex_thresh[2] = { 65, 80 }; int disable_rect_part; disable_rect_part = !boosted; if (cm->quant_params.base_qindex <= qindex_thresh[aggr] && disable_rect_part && is_480p_or_larger) { sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8; } } if (speed <= 2) { if (!is_stat_generation_stage(cpi)) { // Use faster full-pel motion search for high quantizers. // Also use reduced total search range for low resolutions at high // quantizers. const int aggr = speed; const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0]; const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1]; const SEARCH_METHODS search_method = motion_search_method[is_720p_or_larger]; if (cm->quant_params.base_qindex > qindex_thresh1) { sf->mv_sf.search_method = search_method; sf->tpl_sf.search_method = search_method; } else if (cm->quant_params.base_qindex > qindex_thresh2) { sf->mv_sf.search_method = NSTEP_8PT; } } } if (speed >= 4) { // Disable LR search at low and high quantizers and enable only for // mid-quantizer range. if (!boosted && !is_arf2_bwd_type) { const int qindex_low[2] = { 100, 60 }; const int qindex_high[2] = { 180, 160 }; if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] || cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) { sf->lpf_sf.disable_loop_restoration_luma = 1; } } } if (speed == 1) { // Reuse interinter wedge mask search from first search for non-boosted // non-internal-arf frames, except at very high quantizers. if (cm->quant_params.base_qindex <= 200) { if (!boosted && !is_arf2_bwd_type) sf->inter_sf.reuse_mask_search_results = 1; } } if (speed == 5) { if (!(frame_is_intra_only(&cpi->common) || cm->features.allow_screen_content_tools)) { const int qindex[2] = { 256, 128 }; // Set the sf value as 3 for low resolution and // for higher resolutions with low quantizers. if (cm->quant_params.base_qindex < qindex[is_480p_or_larger]) sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; } } if (speed >= 5) { // Disable the sf for low quantizers in case of low resolution screen // contents. 
if (cm->features.allow_screen_content_tools && cm->quant_params.base_qindex < 128 && is_480p_or_lesser) { sf->part_sf.prune_sub_8x8_partition_level = 0; } } // Loop restoration size search // At speed 0, always search all available sizes for the maximum possible gain sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; if (speed >= 1) { // For large frames, small restoration units are almost never useful, // so prune them away if (is_1440p_or_larger) { sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; } else if (is_720p_or_larger) { sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; } } if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) { // At this speed, a full search is too expensive. Instead, pick a single // size based on size and qindex. Note that, in general, higher quantizers // (== lower quality) and larger frames generally want to use larger // restoration units. int qindex_thresh = 96; if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) { sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; } else { sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; } } set_subpel_search_method(&cpi->mv_search_params, cpi->oxcf.unit_test_cfg.motion_vector_unit_test, sf->mv_sf.subpel_search_method); } aom-3.12.1/av1/encoder/speed_features.h000066400000000000000000002355511477627663500176710ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ #define AOM_AV1_ENCODER_SPEED_FEATURES_H_ #include "av1/common/enums.h" #include "av1/encoder/enc_enums.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/encodemb.h" #ifdef __cplusplus extern "C" { #endif /*! 
@file */ /*!\cond */ #define MAX_MESH_STEP 4 typedef struct MESH_PATTERN { int range; int interval; } MESH_PATTERN; enum { GM_FULL_SEARCH, GM_REDUCED_REF_SEARCH_SKIP_L2_L3, GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 but with extra filtering // to keep at most two ref frames GM_SEARCH_CLOSEST_REFS_ONLY, GM_DISABLE_SEARCH } UENUM1BYTE(GM_SEARCH_TYPE); enum { DIST_WTD_COMP_ENABLED, DIST_WTD_COMP_SKIP_MV_SEARCH, DIST_WTD_COMP_DISABLED, } UENUM1BYTE(DIST_WTD_COMP_FLAG); enum { INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), UV_INTRA_ALL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC = (1 << UV_DC_PRED), UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), UV_INTRA_DC_PAETH_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | (1 << UV_CFL_PRED), UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | (1 << UV_CFL_PRED), INTRA_DC = (1 << DC_PRED), INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), INTRA_DC_H_V_SMOOTH = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED), INTRA_DC_PAETH_H_V = (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) }; enum { INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV), INTER_SINGLE_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV), }; enum { DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | (1 << THR_ALTR) | (1 << THR_GOLD) }; enum { TXFM_CODING_SF = 1, INTER_PRED_SF = 2, INTRA_PRED_SF = 4, PARTITION_SF = 8, LOOP_FILTER_SF = 16, RD_SKIP_SF = 32, RESERVE_2_SF = 64, RESERVE_3_SF = 128, } UENUM1BYTE(DEV_SPEED_FEATURES); /* This enumeration defines when the rate control recode loop will be * enabled. */ enum { /* * No recodes allowed */ DISALLOW_RECODE = 0, /* * Allow recode only for KF/ARF/GF frames */ ALLOW_RECODE_KFARFGF = 1, /* * Allow recode for all frame types based on bitrate constraints. 
*/ ALLOW_RECODE = 2, } UENUM1BYTE(RECODE_LOOP_TYPE); enum { SUBPEL_TREE = 0, SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively SUBPEL_SEARCH_METHODS } UENUM1BYTE(SUBPEL_SEARCH_METHOD); enum { // Try the full image with different values. LPF_PICK_FROM_FULL_IMAGE, // Try the full image filter search with non-dual filter only. LPF_PICK_FROM_FULL_IMAGE_NON_DUAL, // Try a small portion of the image with different values. LPF_PICK_FROM_SUBIMAGE, // Estimate the level based on quantizer and frame type LPF_PICK_FROM_Q, // Pick 0 to disable LPF if LPF was enabled last frame LPF_PICK_MINIMAL_LPF } UENUM1BYTE(LPF_PICK_METHOD); /*!\endcond */ /*!\enum CDEF_PICK_METHOD * \brief This enumeration defines a variety of CDEF pick methods */ typedef enum { CDEF_FULL_SEARCH, /**< Full search */ CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */ CDEF_FAST_SEARCH_LVL2, /**< Search reduced subset of filters than Level 1. */ CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than Level 2. */ CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */ CDEF_FAST_SEARCH_LVL5, /**< Search reduced subset of filters than Level 4. */ CDEF_PICK_FROM_Q, /**< Estimate filter strength based on quantizer. */ CDEF_PICK_METHODS } CDEF_PICK_METHOD; /*!\cond */ enum { // Terminate search early based on distortion so far compared to // qp step, distortion in the neighborhood of the frame, etc. FLAG_EARLY_TERMINATE = 1 << 0, // Skips comp inter modes if the best so far is an intra mode. FLAG_SKIP_COMP_BESTINTRA = 1 << 1, // Skips oblique intra modes if the best so far is an inter mode. FLAG_SKIP_INTRA_BESTINTER = 1 << 3, // Skips oblique intra modes at angles 27, 63, 117, 153 if the best // intra so far is not one of the neighboring directions. FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, // Skips intra modes other than DC_PRED if the source variance is small FLAG_SKIP_INTRA_LOWVAR = 1 << 5, } UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC); enum { // No tx type pruning TX_TYPE_PRUNE_0 = 0, // adaptively prunes the least perspective tx types out of all 16 // (tuned to provide negligible quality loss) TX_TYPE_PRUNE_1 = 1, // similar, but applies much more aggressive pruning to get better speed-up TX_TYPE_PRUNE_2 = 2, TX_TYPE_PRUNE_3 = 3, // More aggressive pruning based on tx type score and allowed tx count TX_TYPE_PRUNE_4 = 4, TX_TYPE_PRUNE_5 = 5, } UENUM1BYTE(TX_TYPE_PRUNE_MODE); enum { // No reaction to rate control on a detected slide/scene change. NO_DETECTION = 0, // Set to larger Q based only on the detected slide/scene change and // current/past Q. FAST_DETECTION_MAXQ = 1, } UENUM1BYTE(OVERSHOOT_DETECTION_CBR); enum { // Turns off multi-winner mode. So we will do txfm search on either all modes // if winner mode is off, or we will only on txfm search on a single winner // mode. MULTI_WINNER_MODE_OFF = 0, // Limits the number of winner modes to at most 2 MULTI_WINNER_MODE_FAST = 1, // Uses the default number of winner modes, which is 3 for intra mode, and 1 // for inter mode. MULTI_WINNER_MODE_DEFAULT = 2, // Maximum number of winner modes allowed. 
MULTI_WINNER_MODE_LEVELS, } UENUM1BYTE(MULTI_WINNER_MODE_TYPE); enum { PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85) PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170) PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170) PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3, } UENUM1BYTE(PRUNE_NEARMV_LEVEL); enum { // Default transform search used in evaluation of best inter candidates // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL // stage). TX_SEARCH_DEFAULT = 0, // Transform search in motion mode rd during MODE_EVAL stage. TX_SEARCH_MOTION_MODE, // Transform search in compound type mode rd during MODE_EVAL stage. TX_SEARCH_COMP_TYPE_MODE, // All transform search cases TX_SEARCH_CASES } UENUM1BYTE(TX_SEARCH_CASE); typedef struct { TX_TYPE_PRUNE_MODE prune_2d_txfm_mode; int fast_intra_tx_type_search; // INT_MAX: Disable fast search. // 1 - 1024: Probability threshold used for conditionally forcing tx type, // during mode search. // 0: Force tx type to be DCT_DCT unconditionally, during // mode search. int fast_inter_tx_type_prob_thresh; // Prune less likely chosen transforms for each intra mode. The speed // feature ranges from 0 to 2, for different speed / compression trade offs. int use_reduced_intra_txset; // Use a skip flag prediction model to detect blocks with skip = 1 early // and avoid doing full TX type search for such blocks. int use_skip_flag_prediction; // Threshold used by the ML based method to predict TX block split decisions. int ml_tx_split_thresh; // skip remaining transform type search when we found the rdcost of skip is // better than applying transform int skip_tx_search; // Prune tx type search using previous frame stats. int prune_tx_type_using_stats; // Prune tx type search using estimated RDcost int prune_tx_type_est_rd; // Flag used to control the winner mode processing for tx type pruning for // inter blocks. It enables further tx type mode pruning based on ML model for // mode evaluation and disables tx type mode pruning for winner mode // processing. int winner_mode_tx_type_pruning; } TX_TYPE_SEARCH; enum { // Search partitions using RD criterion SEARCH_PARTITION, // Always use a fixed size partition FIXED_PARTITION, // Partition using source variance VAR_BASED_PARTITION, #if CONFIG_RT_ML_PARTITIONING // Partition using ML model ML_BASED_PARTITION #endif } UENUM1BYTE(PARTITION_SEARCH_TYPE); enum { NOT_IN_USE, DIRECT_PRED, RELAXED_PRED, ADAPT_PRED } UENUM1BYTE(MAX_PART_PRED_MODE); enum { LAST_MV_DATA, CURRENT_Q, QTR_ONLY, } UENUM1BYTE(MV_PREC_LOGIC); enum { SUPERRES_AUTO_ALL, // Tries all possible superres ratios SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio } UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE); /*!\endcond */ /*!\enum INTERNAL_COST_UPDATE_TYPE * \brief This enum decides internally how often to update the entropy costs * * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly * more flexibility in update frequency. This enum is separate from \ref * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its * values are public so it cannot be modified without breaking public API. 
* Due to the use of AOMMIN() in populate_unified_cost_update_freq() to * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in * the order of increasing frequencies. * * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE, * update the enum INTERNAL_COST_UPDATE_TYPE as well. */ typedef enum { INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */ INTERNAL_COST_UPD_TILE, /*!< Update every tile. */ INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */ INTERNAL_COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ INTERNAL_COST_UPD_SB, /*!< Update every sb. */ } INTERNAL_COST_UPDATE_TYPE; /*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL * \brief This enumeration defines a variety of simple motion search based * partition prune levels */ typedef enum { NO_PRUNING = -1, SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */ SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */ SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */ SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */ QIDX_BASED_AGG_LVL1, /*!< Qindex based prune aggressiveness level, aggressive level maps to simple agg level 1 or 2 based on qindex. */ TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune aggressiveness levels. */ TOTAL_QINDEX_BASED_AGG_LVLS = QIDX_BASED_AGG_LVL1 - SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune aggressiveness levels. */ TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS + TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */ } SIMPLE_MOTION_SEARCH_PRUNE_LEVEL; /*!\enum PRUNE_MESH_SEARCH_LEVEL * \brief This enumeration defines a variety of mesh search prune levels. */ typedef enum { PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */ PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */ PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */ } PRUNE_MESH_SEARCH_LEVEL; /*!\enum INTER_SEARCH_EARLY_TERM_IDX * \brief This enumeration defines inter search early termination index in * non-rd path based on sse value. */ typedef enum { EARLY_TERM_DISABLED = 0, /*!< Early terminate inter mode search based on sse disabled. */ EARLY_TERM_IDX_1 = 1, /*!< Early terminate inter mode search based on sse, index 1. */ EARLY_TERM_IDX_2 = 2, /*!< Early terminate inter mode search based on sse, index 2. */ EARLY_TERM_IDX_3 = 3, /*!< Early terminate inter mode search based on sse, index 3. */ EARLY_TERM_IDX_4 = 4, /*!< Early terminate inter mode search based on sse, index 4. */ EARLY_TERM_INDICES, /*!< Total number of early terminate indices */ } INTER_SEARCH_EARLY_TERM_IDX; /*! * \brief Sequence/frame level speed vs quality features */ typedef struct HIGH_LEVEL_SPEED_FEATURES { /*! Frame level coding parameter update. */ int frame_parameter_update; /*! * Cases and frame types for which the recode loop is enabled. */ RECODE_LOOP_TYPE recode_loop; /*! * Controls the tolerance vs target rate used in deciding whether to * recode a frame. It has no meaning if recode is disabled. */ int recode_tolerance; /*! * Determine how motion vector precision is chosen. The possibilities are: * LAST_MV_DATA: use the mv data from the last coded frame * CURRENT_Q: use the current q as a threshold * QTR_ONLY: use quarter pel precision only. */ MV_PREC_LOGIC high_precision_mv_usage; /*! * Always set to 0. 
If on it enables 0 cost background transmission * (except for the initial transmission of the segmentation). The feature is * disabled because the addition of very large block sizes makes the * backgrounds very cheap to encode, and the segmentation we have * adds overhead. */ int static_segmentation; /*! * Superres-auto mode search type: */ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type; /*! * Enable/disable extra screen content test by encoding key frame twice. */ int disable_extra_sc_testing; /*! * Enable/disable second_alt_ref temporal filtering. */ int second_alt_ref_filtering; /*! * The number of frames to be used during temporal filtering of an ARF frame * is adjusted based on the noise level of the current frame. The sf has three * levels to decide the number of frames to be considered for filtering: * 0 : Use default number of frames * 1 and 2 : Reduce the number of frames based on noise level with varied * aggressiveness */ int adjust_num_frames_for_arf_filtering; /*! * Decide the bit estimation approach used in qindex decision. * 0: estimate bits based on a constant value; * 1: estimate bits more accurately based on the frame complexity. */ int accurate_bit_estimate; /*! * Decide the approach for weight calculation during temporal filtering. * 0: Calculate weight using exp() * 1: Calculate weight using a lookup table that approximates exp(). */ int weight_calc_level_in_tf; /*! * Decide whether to perform motion estimation at split block (i.e. 16x16) * level or not. * 0: Always allow motion estimation. * 1: Conditionally allow motion estimation based on 4x4 sub-block variance. */ int allow_sub_blk_me_in_tf; } HIGH_LEVEL_SPEED_FEATURES; /*! * Speed features for the first pass. */ typedef struct FIRST_PASS_SPEED_FEATURES { /*! * \brief Reduces the mv search window. * By default, the initial search window is around * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023). * Each step reduction decreases the window size by about a factor of 2. */ int reduce_mv_step_param; /*! * \brief Skips the motion search when the zero mv has small sse. */ int skip_motion_search_threshold; /*! * \brief Skips reconstruction by using source buffers for prediction */ int disable_recon; /*! * \brief Skips the motion search centered on 0,0 mv. */ int skip_zeromv_motion_search; } FIRST_PASS_SPEED_FEATURES; /*!\cond */ typedef struct TPL_SPEED_FEATURES { // GOP length adaptive decision. // If set to 0, tpl model decides whether a shorter gf interval is better. // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and // (base+2) layer decide whether a shorter gf interval is better. // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost // decide whether a shorter gf interval is better. // If set to 3, gop length adaptive decision is disabled. int gop_length_decision_method; // Prune the intra mode search by tpl. // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED. // If set to 1, we only search DC_PRED, V_PRED, and H_PRED. int prune_intra_modes; // This parameter controls which step in the n-step process we start at. int reduce_first_step_size; // Skip motion estimation based on the precision of center MVs and the // difference between center MVs. // If set to 0, motion estimation is skipped for duplicate center MVs // (default). If set to 1, motion estimation is skipped for duplicate // full-pixel center MVs. If set to 2, motion estimation is skipped if the // difference between center MVs is less than the threshold.
int skip_alike_starting_mv; // When to stop subpel search. SUBPEL_FORCE_STOP subpel_force_stop; // Which search method to use. SEARCH_METHODS search_method; // Prune starting mvs in TPL based on sad scores. int prune_starting_mv; // Prune reference frames in TPL. int prune_ref_frames_in_tpl; // Support compound predictions. int allow_compound_pred; // Calculate rate and distortion based on Y plane only. int use_y_only_rate_distortion; // Use SAD instead of SATD during intra/inter mode search. // If set to 0, use SATD always. // If set to 1, use SAD during intra/inter mode search for frames in the // higher temporal layers of the hierarchical prediction structure. // If set to 2, use SAD during intra/inter mode search for all frames. // This sf is disabled for the first GF group of the key-frame interval, // i.e., SATD is used during intra/inter mode search of the first GF group. int use_sad_for_mode_decision; // Skip tpl processing for frames of type LF_UPDATE. // This sf is disabled for the first GF group of the key-frame interval. int reduce_num_frames; } TPL_SPEED_FEATURES; typedef struct GLOBAL_MOTION_SPEED_FEATURES { GM_SEARCH_TYPE gm_search_type; // During global motion estimation, prune remaining reference frames in a // given direction(past/future), if the evaluated ref_frame in that direction // yields gm_type as INVALID/TRANSLATION/IDENTITY int prune_ref_frame_for_gm_search; // When the current GM type is set to ZEROMV, prune ZEROMV if its performance // is worse than NEWMV under SSE metric. // 0 : no pruning // 1 : conservative pruning // 2 : aggressive pruning int prune_zero_mv_with_sse; // Disable global motion estimation based on stats of previous frames in the // GF group int disable_gm_search_based_on_stats; // Downsampling pyramid level to use for global motion estimation int downsample_level; // Number of refinement steps to apply after initial model generation int num_refinement_steps; } GLOBAL_MOTION_SPEED_FEATURES; typedef struct PARTITION_SPEED_FEATURES { PARTITION_SEARCH_TYPE partition_search_type; // Used if partition_search_type = FIXED_PARTITION BLOCK_SIZE fixed_partition_size; // Prune extended partition types search based on the current best partition // and the combined rdcost of the subblocks estimated from previous // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 // increasing aggressiveness of pruning in order. int prune_ext_partition_types_search_level; // Prune part4 based on block size int prune_part4_search; // Use a ML model to prune rectangular, ab and 4-way horz // and vert partitions int ml_prune_partition; // Use a ML model to adaptively terminate partition search after trying // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and // 1 - 2 increasing aggressiveness in order. int ml_early_term_after_part_split_level; // Skip rectangular partition test when partition type none gives better // rd than partition type split. Can take values 0 - 2, 0 referring to no // skipping, and 1 - 2 increasing aggressiveness of skipping in order. int less_rectangular_check_level; // Use square partition only beyond this block size. 
BLOCK_SIZE use_square_partition_only_threshold; // Sets max square partition levels for this superblock based on // motion vector and prediction error distribution produced from 16x16 // simple motion search MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion; // Min and max square partition size we enable (block_size) as per auto // min max, but also used by adjust partitioning, and pick_partitioning. BLOCK_SIZE default_min_partition_size; BLOCK_SIZE default_max_partition_size; // Sets level of adjustment of variance-based partitioning during // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions // for small blocks and high QP, 2 - try to merge partitions, 3 - try to merge // and split leaf partitions and 0 - 3 decreasing aggressiveness in order. int adjust_var_based_rd_partitioning; // Partition search early breakout thresholds. int64_t partition_search_breakout_dist_thr; int partition_search_breakout_rate_thr; // Thresholds for ML based partition search breakout. int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES]; // Aggressiveness levels for pruning split and rectangular partitions based on // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to // qindex based and simple motion search based pruning. int simple_motion_search_prune_agg; // Perform simple_motion_search on each possible subblock and use it to prune // PARTITION_HORZ and PARTITION_VERT. int simple_motion_search_prune_rect; // Perform simple motion search before none_partition to decide if we // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this // model is disabled. If set to 1, the model attempts to perform // PARTITION_SPLIT only. If set to 2, the model also attempts to prune // PARTITION_SPLIT. int simple_motion_search_split; // Use features from simple_motion_search to terminate prediction block // partition after PARTITION_NONE int simple_motion_search_early_term_none; // Controls whether to reduce the number of motion search steps. If this is 0, // then simple_motion_search has the same number of steps as // single_motion_search (assuming no other speed features). Otherwise, reduce // the number of steps by the value contained in this variable. int simple_motion_search_reduce_search_steps; // This variable controls the maximum block size where intra blocks can be // used in inter frames. // TODO(aconverse): Fold this into one of the other many mode skips BLOCK_SIZE max_intra_bsize; // Use CNN with luma pixels on source frame on each of the 64x64 subblock to // perform partition pruning in intra frames. // 0: No Pruning // 1: Prune split and rectangular partitions only // 2: Prune none, split and rectangular partitions int intra_cnn_based_part_prune_level; // Disable extended partition search if the current bsize is greater than the // threshold. Must be a square block size BLOCK_8X8 or higher. BLOCK_SIZE ext_partition_eval_thresh; // Use best partition decision so far to tune 'ext_partition_eval_thresh' int ext_part_eval_based_on_cur_best; // Disable rectangular partitions for larger block sizes. int rect_partition_eval_thresh; // Prune extended partition search based on whether the split/rect partitions // provided an improvement in the previous search. 
// 0 : no pruning // 1 : prune 1:4 partition search using winner info from split partitions // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info int prune_ext_part_using_split_info; // Prune rectangular, AB and 4-way partitions based on q index and block size // 0 : no pruning // 1 : prune sub_8x8 at very low quantizers // 2 : prune all block sizes based on qindex int prune_rectangular_split_based_on_qidx; // Prune rectangular partitions based on 4x4 sub-block variance // false : no pruning // true : prune rectangular partitions based on 4x4 sub-block variance // deviation // // For allintra encode, this speed feature reduces instruction count by 6.4% // for speed=6 with coding performance change less than 0.24%. For AVIF image // encode, this speed feature reduces encode time by 8.14% for speed 6 on a // typical image dataset with coding performance change less than 0.16%. This // speed feature is not applicable to speed >= 7. bool prune_rect_part_using_4x4_var_deviation; // Prune rectangular partitions based on prediction mode chosen by NONE // partition. // false : no pruning // true : prunes rectangular partitions as described below // If prediction mode chosen by NONE partition is // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if // at least one of the left and top neighbor blocks is larger than the // current block. // Directional Mode: Prunes either of the horizontal and vertical partitions // based on center angle of the prediction mode chosen by NONE partition. For // example, vertical partition is pruned if center angle of the prediction // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal // direction) and vice versa. // For allintra encode, this speed feature reduces instruction count by 5.1% // for speed=6 with coding performance change less than 0.22%. For AVIF image // encode, this speed feature reduces encode time by 4.44% for speed 6 on a // typical image dataset with coding performance change less than 0.15%. // For speed >= 7, variance-based logic is used to determine the partition // structure instead of recursive partition search. Therefore, this speed // feature is not applicable in such cases. bool prune_rect_part_using_none_pred_mode; // Terminate partition search for child partition, // when NONE and SPLIT partition rd_costs are INT64_MAX. int early_term_after_none_split; // Level used to adjust threshold for av1_ml_predict_breakout(). At lower // levels, more conservative threshold is used, and value of 0 indicates // av1_ml_predict_breakout() is disabled. Value of 3 corresponds to default // case with no adjustment to lbd thresholds. int ml_predict_breakout_level; // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions. // 0 : no pruning // 1 : pruning based on neighbour block information // 2 : prune always int prune_sub_8x8_partition_level; // Prune rectangular split based on simple motion search split/no_split score. // 0: disable pruning, 1: enable pruning int simple_motion_search_rect_split; // The current encoder adopts a DFS search for block partitions. // Therefore the mode selection and associated rdcost is ready for smaller // blocks before the mode selection for some partition types. // AB partition could use previous rd information and skip mode search.
// An example is: // // current block // +---+---+ // | | // + + // | | // +-------+ // // SPLIT partition has been searched first before trying HORZ_A // +---+---+ // | R | R | // +---+---+ // | R | R | // +---+---+ // // HORZ_A // +---+---+ // | | | // +---+---+ // | | // +-------+ // // With this speed feature, the top two sub blocks can directly use rdcost // searched in split partition, and the mode info is also copied from // saved info. Similarly, the bottom rectangular block can also use // the available information from previous rectangular search. int reuse_prev_rd_results_for_part_ab; // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT // when encoding PARTITION_AB. int reuse_best_prediction_for_part_ab; // The current partition search records the best rdcost so far and uses it // in mode search and transform search to early skip when some criteria is // met. For example, when the current rdcost is larger than the best rdcost, // or the model rdcost is larger than the best rdcost times some thresholds. // By default, this feature is turned on to speed up the encoder partition // search. // If disabling it, at speed 0, 30 frames, we could get // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown. int use_best_rd_for_pruning; // Skip evaluation of non-square partitions based on the corresponding NONE // partition. // 0: no pruning // 1: prune extended partitions if NONE is skippable // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv // mode and skippable int skip_non_sq_part_based_on_none; // Disables 8x8 and below partitions for low quantizers. int disable_8x8_part_based_on_qidx; } PARTITION_SPEED_FEATURES; typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; // Enable the use of faster, less accurate mv search method // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into // account. int use_bsize_dependent_search_method; // If this is set to 1, we limit the motion search range to 2 times the // largest motion vector found in the last frame. int auto_mv_step_size; // Subpel_search_method can only be subpel_tree which does a subpixel // logarithmic search that keeps stepping at 1/2 pixel units until // you stop getting a gain, and then goes on to 1/4 and repeats // the same process. Along the way it skips many diagonals. SUBPEL_SEARCH_METHOD subpel_search_method; // Maximum number of steps in logarithmic subpel search before giving up. int subpel_iters_per_step; // When to stop subpel search. SUBPEL_FORCE_STOP subpel_force_stop; // When to stop subpel search in simple motion search. SUBPEL_FORCE_STOP simple_motion_subpel_force_stop; // If true, sub-pixel search uses the exact convolve function used for final // encoding and decoding; otherwise, it uses bilinear interpolation. SUBPEL_SEARCH_TYPE use_accurate_subpel_search; // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; // Pattern to be used for any exhaustive mesh searches (except intraBC ME). MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; // Pattern to be used for exhaustive mesh searches of intraBC ME. MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP]; // Reduce single motion search range based on MV result of prior ref_mv_idx. int reduce_search_range; // Prune mesh search. 
PRUNE_MESH_SEARCH_LEVEL prune_mesh_search; // Use the rd cost around the best FULLPEL_MV to speed up subpel search int use_fullpel_costlist; // Set the full pixel search level of obmc // 0: obmc_full_pixel_diamond // 1: obmc_refining_search_sad (faster) int obmc_full_pixel_search_level; // Accurate full pixel motion search based on TPL stats. int full_pixel_search_level; // Allow intrabc motion search int use_intrabc; // Whether to downsample the rows in sad calculation during motion search. // This is only active when there are at least 16 rows. When this sf is // active, if there is a large discrepancy in the SAD values for the final // motion vector between skipping vs not skipping, motion search is redone // with skip row features off. // 0: Disabled (do not downsample rows) // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and // odd rows for the starting MV is small. Redo motion search with sf off // when SAD deviation is high for the final motion vector. // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the // start MV and tested only for the final MV. int use_downsampled_sad; // Enable/disable extensive joint motion search. int disable_extensive_joint_motion_search; // Enable second best mv check in joint mv search. // 0: allow second MV (use rd cost as the metric) // 1: use var as the metric // 2: disable second MV int disable_second_mv; // Skips full pixel search based on start mv of prior ref_mv_idx. // 0: Disabled // 1: Skips the full pixel search upto 4 neighbor full-pel MV positions. // 2: Skips the full pixel search upto 8 neighbor full-pel MV positions. int skip_fullpel_search_using_startmv; // Method to use for refining WARPED_CAUSAL motion vectors // TODO(rachelbarker): Can this be unified with OBMC in some way? WARP_SEARCH_METHOD warp_search_method; // Maximum number of iterations in WARPED_CAUSAL refinement search int warp_search_iters; } MV_SPEED_FEATURES; typedef struct INTER_MODE_SPEED_FEATURES { // 2-pass inter mode model estimation where the preliminary pass skips // transform search and uses a model to estimate rd, while the final pass // computes the full transform search. Two types of models are supported: // 0: not used // 1: used with online dynamic rd model // 2: used with static rd model int inter_mode_rd_model_estimation; // Bypass transform search based on skip rd at following stages // i. Compound type mode search // ii. Motion mode search (mode evaluation and winner motion mode stage) // iii. Transform search for best inter candidates int txfm_rd_gate_level[TX_SEARCH_CASES]; // Limit the inter mode tested in the RD loop int reduce_inter_modes; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. int adaptive_rd_thresh; // Aggressively prune inter modes when best mode is skippable. int prune_inter_modes_if_skippable; // Drop less likely to be picked reference frames in the RD search. // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6 where higher levels prune // more aggressively than lower ones. (0 means no pruning). int selective_ref_frame; // Prune reference frames for rectangular partitions. 
// 0 implies no pruning // 1 implies prune for extended partition // 2 implies prune horiz, vert and extended partition int prune_ref_frame_for_rect_partitions; // Prune inter modes w.r.t past reference frames // 0 no pruning // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames int alt_ref_search_fp; // Prune reference frames for single prediction modes based on temporal // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is // disabled for 0. An increasing value indicates more aggressive pruning // threshold. int prune_single_ref; // Prune compound reference frames // 0 no pruning // 1 prune compound references which do not satisfy the two conditions: // a) The references are at a nearest distance from the current frame in // both past and future direction. // b) The references have minimum pred_mv_sad in both past and future // direction. // 2 prune compound references except the one with nearest distance from the // current frame in both past and future direction. int prune_comp_ref_frames; // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc. // This speed feature equaling 0 means no skipping. // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode // if we have already encountered ref_mv in the drl such that: // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search // process as the current mv. // 2. The rate needed to encode the current mv is larger than that for the // other ref_mv. // The speed feature equaling 1 means using subpel mv in the comparison. // The speed feature equaling 2 means using fullpel mv in the comparison. // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on // known full_mv bestsme and drl cost. int skip_newmv_in_drl; // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found // TODO(any): Instead of skipping repeated ref mv, use the recalculated // rd-cost based on mode rate and skip the mode evaluation int skip_repeated_ref_mv; // Flag used to control the ref_best_rd based gating for chroma int perform_best_rd_based_gating_for_chroma; // Reuse the inter_intra_mode search result from NEARESTMV mode to other // single ref modes int reuse_inter_intra_mode; // prune wedge and compound segment approximate rd evaluation based on // compound average modeled rd int prune_comp_type_by_model_rd; // prune wedge and compound segment approximate rd evaluation based on // compound average rd/ref_best_rd int prune_comp_type_by_comp_avg; // Skip some ref frames in compound motion search by single motion search // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 // increasing aggressiveness of skipping in order. // Note: The search order might affect the result. It assumes that the single // reference modes are searched before compound modes. It is better to search // same single inter mode as a group. int prune_comp_search_by_single_result; // Instead of performing a full MV search, do a simple translation first // and only perform a full MV search on the motion vectors that performed // well. int prune_mode_search_simple_translation; // Only search compound modes with at least one "good" reference frame. // A reference frame is good if, after looking at its performance among // the single reference modes, it is one of the two best performers. 
int prune_compound_using_single_ref; // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, // NEW_NEARMV) using ref frames of above and left neighbor // blocks. // 0 : no pruning // 1 : prune ext compound modes using neighbor blocks (less aggressiveness) // 2 : prune ext compound modes using neighbor blocks (high aggressiveness) // 3 : prune ext compound modes unconditionally (highest aggressiveness) int prune_ext_comp_using_neighbors; // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes int skip_ext_comp_nearmv_mode; // Skip extended compound mode when ref frame corresponding to NEWMV does not // have NEWMV as single mode winner. // 0 : no pruning // 1 : prune extended compound mode (less aggressiveness) // 2 : prune extended compound mode (high aggressiveness) int prune_comp_using_best_single_mode_ref; // Skip NEARESTMV and NEARMV using weight computed in ref mv list population // // Pruning is enabled only when both the top and left neighbor blocks are // available and when the current block already has a valid inter prediction. int prune_nearest_near_mv_using_refmv_weight; // Based on previous ref_mv_idx search result, prune the following search. int prune_ref_mv_idx_search; // Disable one sided compound modes. int disable_onesided_comp; // Prune obmc search using previous frame stats. // INT_MAX : disable obmc search int prune_obmc_prob_thresh; // Prune warped motion search using previous frame stats. int prune_warped_prob_thresh; // Variance threshold to enable/disable Interintra wedge search unsigned int disable_interintra_wedge_var_thresh; // Variance threshold to enable/disable Interinter wedge search unsigned int disable_interinter_wedge_var_thresh; // De-couple wedge and mode search during interintra RDO. int fast_interintra_wedge_search; // Whether fast wedge sign estimate is used int fast_wedge_sign_estimate; // Enable/disable ME for interinter wedge search. int disable_interinter_wedge_newmv_search; // Decide when and how to use joint_comp. DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; // Clip the frequency of updating the mv cost. INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level; // Clip the frequency of updating the coeff cost. INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level; // Clip the frequency of updating the mode cost. INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level; // Prune inter modes based on tpl stats // 0 : no pruning // 1 - 3 indicate increasing aggressiveness in order. int prune_inter_modes_based_on_tpl; // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left // neighbor blocks and qindex. PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors; // Model based breakout after interpolation filter search // 0: no breakout // 1: use model based rd breakout int model_based_post_interp_filter_breakout; // Reuse compound type rd decision when exact match is found // 0: No reuse // 1: Reuse the compound type decision int reuse_compound_type_decision; // Enable/disable masked compound. int disable_masked_comp; // Enable/disable MV refinement for compound modes corresponds to compound // types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound type // is disabled for speeds >= 2 using the sf 'use_dist_wtd_comp_flag') and // COMPOUND_DIFFWTD based on the availability. Levels 0 to 3 indicate // increasing order of aggressiveness to disable MV refinement. // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of // refinement in av1_joint_motion_search(). 
// 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for // COMPOUND_AVERAGE & COMPOUND_DISTWTD. // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for // NEW_NEWMV mode with one iteration of refinement in // av1_joint_motion_search() and MV Refinement is disabled for other compound // type modes. // 3: MV Refinement is disabled. int enable_fast_compound_mode_search; // Reuse masked compound type search results int reuse_mask_search_results; // Enable/disable fast search for wedge masks int enable_fast_wedge_mask_search; // Early breakout from transform search of inter modes int inter_mode_txfm_breakout; // Limit number of inter modes for txfm search if a newmv mode gets // evaluated among the top modes. // 0: no pruning // 1 to 3 indicate increasing order of aggressiveness int limit_inter_mode_cands; // Cap the no. of txfm searches for a given prediction mode. // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches. int limit_txfm_eval_per_mode; // Prune warped motion search based on block size. int extra_prune_warped; // Do not search compound modes for ARF. // The intuition is that ARF is predicted by frames far away from it, // whose temporal correlations with the ARF are likely low. // It is therefore likely that compound modes do not work as well for ARF // as other inter frames. // Speed/quality impact: // Speed 1: 12% faster, 0.1% psnr loss. // Speed 2: 2% faster, 0.05% psnr loss. // No change for speed 3 and up, because |disable_onesided_comp| is true. int skip_arf_compound; } INTER_MODE_SPEED_FEATURES; typedef struct INTERP_FILTER_SPEED_FEATURES { // Do limited interpolation filter search for dual filters, since best choice // usually includes EIGHTTAP_REGULAR. int use_fast_interpolation_filter_search; // Disable dual filter int disable_dual_filter; // Save results of av1_interpolation_filter_search for a block // Check mv and ref_frames before search, if they are very close with previous // saved results, filter search can be skipped. int use_interp_filter; // skip sharp_filter evaluation based on regular and smooth filter rd for // dual_filter=0 case int skip_sharp_interp_filter_search; // skip interpolation filter search for a block in chessboard pattern int cb_pred_filter_search; // adaptive interp_filter search to allow skip of certain filter types. int adaptive_interp_filter_search; // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation // filter search. int skip_interp_filter_search; } INTERP_FILTER_SPEED_FEATURES; typedef struct INTRA_MODE_SPEED_FEATURES { // These bit masks allow you to enable or disable intra modes for each // transform size separately. int intra_y_mode_mask[TX_SIZES]; int intra_uv_mode_mask[TX_SIZES]; // flag to allow skipping intra mode for inter frame prediction int skip_intra_in_interframe; // Prune intra mode candidates based on source block histogram of gradient. // Applies to luma plane only. // Feasible values are 0..4. The feature is disabled for 0. An increasing // value indicates more aggressive pruning threshold. int intra_pruning_with_hog; // Prune intra mode candidates based on source block histogram of gradient. // Applies to chroma plane only. // Feasible values are 0..4. The feature is disabled for 0. An increasing // value indicates more aggressive pruning threshold. int chroma_intra_pruning_with_hog; // Enable/disable smooth intra modes. int disable_smooth_intra; // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance. 
// false : No pruning // true : Prune UV_SMOOTH_PRED mode based on chroma source variance // // For allintra encode, this speed feature reduces instruction count // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance // change less than 0.04%. For AVIF image encode, this speed feature reduces // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical // image dataset with coding performance change less than 0.05%. bool prune_smooth_intra_mode_for_chroma; // Prune filter intra modes in intra frames. // 0 : No pruning // 1 : Evaluate applicable filter intra modes based on best intra mode so far // 2 : Do not evaluate filter intra modes int prune_filter_intra_level; // prune palette search // 0: No pruning // 1: Perform coarse search to prune the palette colors. For winner colors, // neighbors are also evaluated using a finer search. // 2: Perform 2 way palette search from max colors to min colors (and min // colors to remaining colors) and terminate the search if current number of // palette colors is not the winner. int prune_palette_search_level; // Terminate early in luma palette_size search. Speed feature values indicate // increasing level of pruning. // 0: No early termination // 1: Terminate early for higher luma palette_size, if header rd cost of lower // palette_size is more than 2 * best_rd. This level of pruning is more // conservative when compared to sf level 2 as the cases which will get pruned // with sf level 1 is a subset of the cases which will get pruned with sf // level 2. // 2: Terminate early for higher luma palette_size, if header rd cost of lower // palette_size is more than best_rd. // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%, // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4, // 5, 6, 7 and 8 on screen content set with coding performance change less // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%, // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6, // 7 and 8 on a typical image dataset with coding performance change less than // 0.01%. int prune_luma_palette_size_search_level; // Prune chroma intra modes based on luma intra mode winner. // 0: No pruning // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED, // UV_CFL_PRED and the mode that corresponds to luma intra mode winner. int prune_chroma_modes_using_luma_winner; // Clip the frequency of updating the mv cost for intrabc. INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level; // We use DCT_DCT transform followed by computing SATD (Sum of Absolute // Transformed Differences) as an estimation of RD score to quickly find the // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search // near the best possible parameter. The search range is set here. // The range of cfl_searh_range should be [1, 33], and the following are the // recommended values. // 1: Fastest mode. // 3: Default mode that provides good speedup without losing compression // performance at speed 0. // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only // be used for debugging purpose. int cfl_search_range; // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in // intra mode decision. Here, add a speed feature to reduce this number for // higher speeds. 
int top_intra_model_count_allowed; // Adapt top_intra_model_count_allowed locally to prune luma intra modes using // neighbor block and quantizer information. int adapt_top_model_rd_count_using_neighbors; // Prune the evaluation of odd delta angles of directional luma intra modes by // using the rdcosts of neighbouring delta angles. // For allintra encode, this speed feature reduces instruction count // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video // dataset with coding performance change less than 0.26%. For AVIF image // encode, this speed feature reduces encode time by 2.849%, 2.471%, // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding // performance change less than 0.27%. int prune_luma_odd_delta_angles_in_intra; // Terminate early in chroma palette_size search. // 0: No early termination // 1: Terminate early for higher palette_size, if header rd cost of lower // palette_size is more than best_rd. // For allintra encode, this sf reduces instruction count by 0.45%, // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen // content set with coding performance change less than 0.01%. // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%, // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image // dataset with no quality drop. int early_term_chroma_palette_size_search; // Skips the evaluation of filter intra modes in inter frames if rd evaluation // of luma intra dc mode results in invalid rd stats. int skip_filter_intra_in_inter_frames; } INTRA_MODE_SPEED_FEATURES; typedef struct TX_SPEED_FEATURES { // Init search depth for square and rectangular transform partitions. // Values: // 0 - search full tree, 1: search 1 level, 2: search the highest level only int inter_tx_size_search_init_depth_sqr; int inter_tx_size_search_init_depth_rect; int intra_tx_size_search_init_depth_sqr; int intra_tx_size_search_init_depth_rect; // If any dimension of a coding block size above 64, always search the // largest transform only, since the largest transform block size is 64x64. int tx_size_search_lgr_block; TX_TYPE_SEARCH tx_type_search; // Skip split transform block partition when the collocated bigger block // is selected as all zero coefficients. int txb_split_cap; // Shortcut the transform block partition and type search when the target // rdcost is relatively lower. // Values are 0 (not used) , or 1 - 2 with progressively increasing // aggressiveness int adaptive_txb_search_level; // Prune level for tx_size_type search for inter based on rd model // 0: no pruning // 1-2: progressively increasing aggressiveness of pruning int model_based_prune_tx_search_level; // Refine TX type after fast TX search. int refine_fast_tx_search_results; // Prune transform split/no_split eval based on residual properties. A value // of 0 indicates no pruning, and the aggressiveness of pruning progressively // increases from levels 1 to 3. int prune_tx_size_level; // Prune the evaluation of transform depths as decided by the NN model. // false: No pruning. // true : Avoid the evaluation of specific transform depths using NN model. // // For allintra encode, this speed feature reduces instruction count // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance // change less than 0.32%. For AVIF image encode, this speed feature reduces // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical // image dataset with coding performance change less than 0.19%. 
bool prune_intra_tx_depths_using_nn; // Enable/disable early breakout during transform search of intra modes, by // using the minimum rd cost possible. By using this approach, the rd // evaluation of applicable transform blocks (in the current block) can be // avoided as // 1) best_rd evolves during the search in choose_tx_size_type_from_rd() // 2) appropriate ref_best_rd is passed in intra_block_yrd() // // For allintra encode, this speed feature reduces instruction count // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding // performance change less than 0.02%. For AVIF image encode, this speed // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73% // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding // performance change less than 0.004%. bool use_rd_based_breakout_for_intra_tx_search; } TX_SPEED_FEATURES; typedef struct RD_CALC_SPEED_FEATURES { // Fast approximation of av1_model_rd_from_var_lapndz int simple_model_rd_from_var; // Perform faster distortion computation during the R-D evaluation by trying // to approximate the prediction error with transform coefficients (faster but // less accurate) rather than computing distortion in the pixel domain (slower // but more accurate). The following methods are used for distortion // computation: // Method 0: Always compute distortion in the pixel domain // Method 1: Based on block error, try using transform domain distortion for // tx_type search and compute distortion in pixel domain for final RD_STATS // Method 2: Based on block error, try to compute distortion in transform // domain // Methods 1 and 2 may fallback to computing distortion in the pixel domain in // case the block error is less than the threshold, which is controlled by the // speed feature tx_domain_dist_thres_level. // // The speed feature tx_domain_dist_level decides which of the above methods // needs to be used across different mode evaluation stages as described // below: // Eval type: Default Mode Winner // Level 0 : Method 0 Method 2 Method 0 // Level 1 : Method 1 Method 2 Method 0 // Level 2 : Method 2 Method 2 Method 0 // Level 3 : Method 2 Method 2 Method 2 int tx_domain_dist_level; // Transform domain distortion threshold level int tx_domain_dist_thres_level; // Trellis (dynamic programming) optimization of quantized values TRELLIS_OPT_TYPE optimize_coefficients; // Use hash table to store macroblock RD search results // to avoid repeated search on the same residue signal. 
int use_mb_rd_hash; // Flag used to control the extent of coeff R-D optimization int perform_coeff_opt; } RD_CALC_SPEED_FEATURES; typedef struct WINNER_MODE_SPEED_FEATURES { // Flag used to control the winner mode processing for better R-D optimization // of quantized coeffs int enable_winner_mode_for_coeff_opt; // Flag used to control the winner mode processing for transform size // search method int enable_winner_mode_for_tx_size_srch; // Control transform size search level // Eval type: Default Mode Winner // Level 0 : FULL RD LARGEST ALL FULL RD // Level 1 : FAST RD LARGEST ALL FULL RD // Level 2 : LARGEST ALL LARGEST ALL FULL RD // Level 3 : LARGEST ALL LARGEST ALL LARGEST ALL int tx_size_search_level; // Flag used to control the winner mode processing for use transform // domain distortion int enable_winner_mode_for_use_tx_domain_dist; // Flag used to enable processing of multiple winner modes MULTI_WINNER_MODE_TYPE multi_winner_mode_type; // Motion mode for winner candidates: // 0: speed feature OFF // 1 / 2 : Use configured number of winner candidates int motion_mode_for_winner_cand; // Controls the prediction of transform skip block or DC only block. // // Different speed feature values (0 to 3) decide the aggressiveness of // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used // during different mode evaluation stages. int dc_blk_pred_level; // If on, disables interpolation filter search in handle_inter_mode loop, and // performs it during winner mode processing by \ref // tx_search_best_inter_candidates. int winner_mode_ifs; // Controls the disabling of winner mode processing. Speed feature levels // are ordered in increasing aggressiveness of pruning. The method considered // for disabling, depends on the sf level value and it is described as below. // 0: Do not disable // 1: Disable for blocks with low source variance. // 2: Disable for blocks which turn out to be transform skip (skipped based on // eob) during MODE_EVAL stage except NEWMV mode. // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL // stage except NEWMV mode. For high quantizers, prune conservatively based on // transform skip (skipped based on eob) except for NEWMV mode. // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL // stage. int prune_winner_mode_eval_level; } WINNER_MODE_SPEED_FEATURES; typedef struct LOOP_FILTER_SPEED_FEATURES { // This feature controls how the loop filter level is determined. LPF_PICK_METHOD lpf_pick; // Skip some final iterations in the determination of the best loop filter // level. int use_coarse_filter_level_search; // Control how the CDEF strength is determined. CDEF_PICK_METHOD cdef_pick_method; // Decoder side speed feature to add penalty for use of dual-sgr filters. // Takes values 0 - 10, 0 indicating no penalty and each additional level // adding a penalty of 1% int dual_sgr_penalty_level; // prune sgr ep using binary search like mechanism int enable_sgr_ep_pruning; // Disable loop restoration for Chroma plane int disable_loop_restoration_chroma; // Disable loop restoration for luma plane int disable_loop_restoration_luma; // Range of loop restoration unit sizes to search // The minimum size is clamped against the superblock size in // av1_pick_filter_restoration, so that the code which sets this value does // not need to know the superblock size ahead of time. 
int min_lr_unit_size; int max_lr_unit_size; // Prune RESTORE_WIENER evaluation based on source variance // 0 : no pruning // 1 : conservative pruning // 2 : aggressive pruning int prune_wiener_based_on_src_var; // Prune self-guided loop restoration based on wiener search results // 0 : no pruning // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE // 2 : pruning based on winner restoration type among RESTORE_WIENER and // RESTORE_NONE int prune_sgr_based_on_wiener; // Reduce the wiener filter win size for luma int reduce_wiener_window_size; // Flag to disable Wiener Loop restoration filter. bool disable_wiener_filter; // Flag to disable Self-guided Loop restoration filter. bool disable_sgr_filter; // Disable the refinement search around the wiener filter coefficients. bool disable_wiener_coeff_refine_search; // Whether to downsample the rows in computation of wiener stats. int use_downsampled_wiener_stats; } LOOP_FILTER_SPEED_FEATURES; typedef struct REAL_TIME_SPEED_FEATURES { // check intra prediction for non-RD mode. int check_intra_pred_nonrd; // Skip checking intra prediction. // 0 - don't skip // 1 - skip if TX is skipped and best mode is not NEWMV // 2 - skip if TX is skipped // Skipping aggressiveness increases from level 1 to 2. int skip_intra_pred; // Estimate motion before calculating variance in variance-based partition // 0 - Only use zero MV // 1 - perform coarse ME // 2 - perform coarse ME, and also use neighbours' MVs // 3 - use neighbours' MVs without performing coarse ME int estimate_motion_for_var_based_partition; // For nonrd_use_partition: mode of extra check of leaf partition // 0 - don't check merge // 1 - always check merge // 2 - check merge and prune checking final split // 3 - check merge and prune checking final split based on bsize and qindex int nonrd_check_partition_merge_mode; // For nonrd_use_partition: check of leaf partition extra split int nonrd_check_partition_split; // Implements various heuristics to skip searching modes // The heuristics selected are based on flags // defined in the MODE_SEARCH_SKIP_HEURISTICS enum unsigned int mode_search_skip_flags; // For nonrd: Reduces ref frame search. // 0 - low level of search prune in non last frames // 1 - pruned search in non last frames // 2 - more pruned search in non last frames int nonrd_prune_ref_frame_search; // This flag controls the use of non-RD mode decision. int use_nonrd_pick_mode; // Use ALTREF frame in non-RD mode decision. int use_nonrd_altref_frame; // Use compound reference for non-RD mode. int use_comp_ref_nonrd; // Reference frames for compound prediction for nonrd pickmode: // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2). int ref_frame_comp_nonrd[3]; // use reduced ref set for real-time mode int use_real_time_ref_set; // Skip a number of expensive mode evaluations for blocks with very low // temporal variance. int short_circuit_low_temp_var; // Reuse inter prediction in fast non-rd mode. int reuse_inter_pred_nonrd; // Number of best inter modes to search transform. INT_MAX - search all. int num_inter_modes_for_tx_search; // Use interpolation filter search in non-RD mode decision. int use_nonrd_filter_search; // Use simplified RD model for interpolation search and Intra int use_simple_rd_model; // For nonrd mode: use hybrid intra mode search for intra only frames based on // block properties. 
// 0 : use nonrd pick intra for all blocks // 1 : use rd for bsize < 16x16, nonrd otherwise // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise int hybrid_intra_pickmode; // Filter blocks by certain criteria such as SAD, source variance, such that // fewer blocks will go through the palette search. // For nonrd encoding path, enabling this feature reduces encoding time when // palette mode is used. Disabling it leads to better compression efficiency. // 0: off // 1: less aggressive pruning mode // 2: more aggressive pruning mode int prune_palette_search_nonrd; // Compute variance/sse on source difference, prior to encoding superblock. int source_metrics_sb_nonrd; // Flag to indicate process for handling overshoot on slide/scene change, // for real-time CBR mode. OVERSHOOT_DETECTION_CBR overshoot_detection_cbr; // Check for scene/content change detection on every frame before encoding. int check_scene_detection; // For keyframes in rtc: adjust the rc_bits_per_mb, to reduce overshoot. int rc_adjust_keyframe; // On scene change: compute spatial variance. int rc_compute_spatial_var_sc; // For nonrd mode: Prefer larger partition blks in variance based partitioning // 0: disabled, 1-3: increasing aggressiveness int prefer_large_partition_blocks; // uses results of temporal noise estimate int use_temporal_noise_estimate; // Parameter indicating initial search window to be used in full-pixel search // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value // indicates larger window. If set to 0, step_param is set based on internal // logic in set_mv_search_params(). int fullpel_search_step_param; // Bit mask to enable or disable intra modes for each prediction block size // separately, for nonrd_pickmode. Currently, the sf is not respected when // 'force_intra_check' is true in 'av1_estimate_intra_mode()' function. Also, // H and V pred modes allowed through this sf can be further pruned when //'prune_hv_pred_modes_using_src_sad' sf is true. int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES]; // Prune H and V intra prediction modes evaluation in inter frame. // The sf does not have any impact in the following cases: // i. when frame_source_sad is 1.1 times greater than avg_source_sad // ii. when cyclic_refresh_segment_id_boosted is enabled // iii. when SB level source sad is greater than kMedSad // iv. when color sensitivity is non zero for both the chroma channels bool prune_hv_pred_modes_using_src_sad; // Skips mode checks more aggressively in nonRD mode int nonrd_aggressive_skip; // Skip cdef on 64x64 blocks. // 0: disabled // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off. // When color sensitivity is on for a superblock, all 64x64 blocks within // will not skip. // 2: more aggressive mode where skip is done for all frames where // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off. int skip_cdef_sb; // Force selective cdf update. int selective_cdf_update; // Force only single reference (LAST) for prediction. int force_only_last_ref; // Forces larger partition blocks in variance based partitioning for intra // frames int force_large_partition_blocks_intra; // Use fixed partition for superblocks based on source_sad. // 0: disabled // 1: enabled int use_fast_fixed_part; // Increase source_sad thresholds in nonrd pickmode.
int increase_source_sad_thresh; // Skip evaluation of no split in tx size selection for merge partition int skip_tx_no_split_var_based_partition; // Intermediate termination of newMV mode evaluation based on so far best mode // sse int skip_newmv_mode_based_on_sse; // Define gf length multiplier. // Level 0: use large multiplier, level 1: use medium multiplier. int gf_length_lvl; // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes int prune_inter_modes_with_golden_ref; // Prune inter modes w.r.t golden or alt-ref frame based on sad int prune_inter_modes_wrt_gf_arf_based_on_sad; // Prune inter mode search in rd path based on current block's temporal // variance wrt LAST reference. int prune_inter_modes_using_temp_var; // Reduce MV precision to halfpel for higher int MV value & frame-level motion // 0: disabled // 1-2: Reduce precision to halfpel, fullpel based on conservative // thresholds, aggressiveness increases with increase in level // 3: Reduce precision to halfpel using more aggressive thresholds int reduce_mv_pel_precision_highmotion; // Reduce MV precision for low complexity blocks // 0: disabled // 1: Reduce the mv resolution for zero mv if the variance is low // 2: Switch to halfpel, fullpel based on low block spatial-temporal // complexity. int reduce_mv_pel_precision_lowcomplex; // Prune intra mode evaluation in inter frames based on mv range. BLOCK_SIZE prune_intra_mode_based_on_mv_range; // The number of times to left shift the splitting thresholds in variance // based partitioning. The minimum values should be 7 to avoid left shifting // by a negative number. int var_part_split_threshold_shift; // Qindex based variance partition threshold index, which determines // the aggressiveness of partition pruning // 0: disabled for speeds 9,10 // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb) // 3,4: (non-rd path) uses pre-tuned qindex thresholds int var_part_based_on_qidx; // Enable GF refresh based on Q value. int gf_refresh_based_on_qp; // Temporal filtering // The value can be 1 or 2, which indicates the threshold to use. // Must be off for lossless mode. int use_rtc_tf; // Use of the identity transform in nonrd_pickmode, int use_idtx_nonrd; // Prune the use of the identity transform in nonrd_pickmode: // only for smaller blocks and higher spatial variance, and when skip_txfm // is not already set. int prune_idtx_nonrd; // Force to only use dct for palette search in nonrd pickmode. int dct_only_palette_nonrd; // Skip loopfilter, for static content after slide change // or key frame, once quality has ramped up. // 0: disabled // 1: skip only after quality is ramped up. // 2: aggrssive mode, where skip is done for all frames that // where rc->high_source_sad = 0 (no slide-changes). int skip_lf_screen; // Threshold on the active/inactive region percent to disable // the loopfilter and cdef. Setting to 100 disables this feature. int thresh_active_maps_skip_lf_cdef; // For nonrd: early exit out of variance partition that sets the // block size to superblock size, and sets mode to zeromv-last skip. // 0: disabled // 1: zeromv-skip is enabled at SB level only // 2: zeromv-skip is enabled at SB level and coding block level int part_early_exit_zeromv; // Early terminate inter mode search based on sse in non-rd path. INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search; // SAD based adaptive altref selection int sad_based_adp_altref_lag; // Enable/disable partition direct merging. 
int partition_direct_merging; // Level of aggressiveness for obtaining tx size based on qstep int tx_size_level_based_on_qstep; // Avoid the partitioning of a 16x16 block in variance based partitioning // (VBP) by making use of minimum and maximum sub-block variances. // For allintra encode, this speed feature reduces instruction count by 5.39% // for speed 9 on a typical video dataset with coding performance gain // of 1.44%. // For AVIF image encode, this speed feature reduces encode time // by 8.44% for speed 9 on a typical image dataset with coding performance // gain of 0.78%. bool vbp_prune_16x16_split_using_min_max_sub_blk_var; // A qindex threshold that determines whether to use qindex based CDEF filter // strength estimation for screen content types. The strength estimation model // used for screen contents prefers to allow cdef filtering for more frames. // This sf is used to limit the frames which go through cdef filtering and // following explains the setting of the same. // MAXQ (255): This disables the usage of this sf. Here, frame does not use a // screen content model thus reduces the number of frames that go through cdef // filtering. // MINQ (0): Frames always use screen content model thus increasing the number // of frames that go through cdef filtering. // This speed feature has a substantial gain on coding metrics, with moderate // increase encoding time. Select threshold based on speed vs quality // trade-off. int screen_content_cdef_filter_qindex_thresh; // Prune compound mode if its variance is higher than the variance of single // modes. bool prune_compoundmode_with_singlecompound_var; // Allow mode cost update at frame level every couple frames. This // overrides the command line setting --mode-cost-upd-freq=3 (never update // except on key frame and first delta). bool frame_level_mode_cost_update; // Prune H_PRED during intra mode evaluation in the nonrd path based on best // mode so far. // // For allintra encode, this speed feature reduces instruction count by 1.10% // for speed 9 with coding performance change less than 0.04%. // For AVIF image encode, this speed feature reduces encode time by 1.03% for // speed 9 on a typical image dataset with coding performance change less than // 0.08%. bool prune_h_pred_using_best_mode_so_far; // Enable pruning of intra mode evaluations in nonrd path based on source // variance and best mode so far. The pruning logic is enabled only if the // mode is not a winner mode of both the neighboring blocks (left/top). // // For allintra encode, this speed feature reduces instruction count by 3.96% // for speed 9 with coding performance change less than 0.38%. // For AVIF image encode, this speed feature reduces encode time by 3.46% for // speed 9 on a typical image dataset with coding performance change less than // -0.06%. bool enable_intra_mode_pruning_using_neighbors; // Prune intra mode evaluations in nonrd path based on best sad so far. // // For allintra encode, this speed feature reduces instruction count by 3.05% // for speed 9 with coding performance change less than 0.24%. // For AVIF image encode, this speed feature reduces encode time by 1.87% for // speed 9 on a typical image dataset with coding performance change less than // 0.16%. bool prune_intra_mode_using_best_sad_so_far; // If compound is enabled, and the current block size is \geq BLOCK_16X16, // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the // base layer of svc. 
bool check_only_zero_zeromv_on_large_blocks; // Allow for disabling cdf update for non reference frames in svc mode. bool disable_cdf_update_non_reference_frame; // Prune compound modes if the single modes' variances do not perform well. bool prune_compoundmode_with_singlemode_var; // Skip searching all compound modes if the variance of single_mode residue is // sufficiently low. bool skip_compound_based_on_var; // Sets force_zeromv_skip based on the source sad available. Aggressiveness // increases with increase in the level set for speed feature. // 0: No setting // 1: If source sad is kZeroSad // 2: If source sad <= kVeryLowSad int set_zeromv_skip_based_on_source_sad; // Downgrades the block-level subpel motion search to // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel // search performed well, zeromv has low sad or low source_var bool use_adaptive_subpel_search; // A flag used in RTC case to control frame_refs_short_signaling. Note that // the final decision is made in check_frame_refs_short_signaling(). The flag // can only be turned on when res < 360p and speed >= 9, in which case only // LAST and GOLDEN ref frames are used now. bool enable_ref_short_signaling; // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame // case. bool check_globalmv_on_single_ref; // Allows for increasing the color_threshold for palette prediction. // This generally leads to better coding efficiency but with some speed loss. // Only used for screen content and for nonrd_pickmode. bool increase_color_thresh_palette; // Flag to indicate selecting of higher threshold for scene change detection. int higher_thresh_scene_detection; // Flag to indicate skip testing of NEWMV for flat blocks. int skip_newmv_flat_blocks_screen; // Flag to force skip encoding for non_reference_frame on slide/scene changes. int skip_encoding_non_reference_slide_change; // Flag to indicate more aggressive QP downward adjustment for screen static // content, to make convergence to min_qp faster. int rc_faster_convergence_static; } REAL_TIME_SPEED_FEATURES; /*!\endcond */ /*! * \brief Top level speed vs quality trade off data structure. */ typedef struct SPEED_FEATURES { /*! * Sequence/frame level speed features: */ HIGH_LEVEL_SPEED_FEATURES hl_sf; /*! * Speed features for the first pass. */ FIRST_PASS_SPEED_FEATURES fp_sf; /*! * Speed features related to how tpl's searches are done. */ TPL_SPEED_FEATURES tpl_sf; /*! * Global motion speed features: */ GLOBAL_MOTION_SPEED_FEATURES gm_sf; /*! * Partition search speed features: */ PARTITION_SPEED_FEATURES part_sf; /*! * Motion search speed features: */ MV_SPEED_FEATURES mv_sf; /*! * Inter mode search speed features: */ INTER_MODE_SPEED_FEATURES inter_sf; /*! * Interpolation filter search speed features: */ INTERP_FILTER_SPEED_FEATURES interp_sf; /*! * Intra mode search speed features: */ INTRA_MODE_SPEED_FEATURES intra_sf; /*! * Transform size/type search speed features: */ TX_SPEED_FEATURES tx_sf; /*! * RD calculation speed features: */ RD_CALC_SPEED_FEATURES rd_sf; /*! * Two-pass mode evaluation features: */ WINNER_MODE_SPEED_FEATURES winner_mode_sf; /*! * In-loop filter speed features: */ LOOP_FILTER_SPEED_FEATURES lpf_sf; /*!
* Real-time mode speed features: */ REAL_TIME_SPEED_FEATURES rt_sf; } SPEED_FEATURES; /*!\cond */ struct AV1_COMP; /*!\endcond */ /*!\brief Frame size independent speed vs quality trade off flags * *\ingroup speed_features * * \param[in] cpi Top - level encoder instance structure * \param[in] speed Speed setting passed in from the command line * * \remark No return value but configures the various speed trade off flags * based on the passed in speed setting. (Higher speed gives lower * quality) */ void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi, int speed); /*!\brief Frame size dependent speed vs quality trade off flags * *\ingroup speed_features * * \param[in] cpi Top - level encoder instance structure * \param[in] speed Speed setting passed in from the command line * * \remark No return value but configures the various speed trade off flags * based on the passed in speed setting and frame size. (Higher speed * corresponds to lower quality) */ void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi, int speed); /*!\brief Q index dependent speed vs quality trade off flags * *\ingroup speed_features * * \param[in] cpi Top - level encoder instance structure * \param[in] speed Speed setting passed in from the command line * * \remark No return value but configures the various speed trade off flags * based on the passed in speed setting and current frame's Q index. * (Higher speed corresponds to lower quality) */ void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ aom-3.12.1/av1/encoder/superres_scale.c000066400000000000000000000415211477627663500176750ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/encoder_alloc.h" #include "av1/encoder/superres_scale.h" #include "av1/encoder/random.h" // Compute the horizontal frequency components' energy in a frame // by calculating the 16x4 Horizontal DCT. This is to be used to // decide the superresolution parameters.
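// Illustrative walk-through of analyze_hor_freq() below: each 16x4 luma tile
// is transformed with a horizontal DCT, so coeff[k], coeff[k + 16],
// coeff[k + 32] and coeff[k + 48] hold the k-th horizontal frequency of the
// tile's four rows. Their squared sum, accumulated over all tiles and
// averaged, gives energy[k]; the final backwards accumulation turns it into
// the cumulative energy at horizontal frequencies >= k, which is what the
// superres denominator selection further down thresholds against.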
static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { uint64_t freq_energy[16] = { 0 }; const YV12_BUFFER_CONFIG *buf = cpi->source; const int bd = cpi->td.mb.e_mbd.bd; const int width = buf->y_crop_width; const int height = buf->y_crop_height; DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]); int n = 0; memset(freq_energy, 0, sizeof(freq_energy)); if (buf->flags & YV12_FLAG_HIGHBITDEPTH) { const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer); for (int i = 0; i < height - 4; i += 4) { for (int j = 0; j < width - 16; j += 16) { av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride, H_DCT, bd); for (int k = 1; k < 16; ++k) { const uint64_t this_energy = ((int64_t)coeff[k] * coeff[k]) + ((int64_t)coeff[k + 16] * coeff[k + 16]) + ((int64_t)coeff[k + 32] * coeff[k + 32]) + ((int64_t)coeff[k + 48] * coeff[k + 48]); freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8)); } n++; } } } else { assert(bd == 8); DECLARE_ALIGNED(16, int16_t, src16[16 * 4]); for (int i = 0; i < height - 4; i += 4) { for (int j = 0; j < width - 16; j += 16) { for (int ii = 0; ii < 4; ++ii) for (int jj = 0; jj < 16; ++jj) src16[ii * 16 + jj] = buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)]; av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd); for (int k = 1; k < 16; ++k) { const uint64_t this_energy = ((int64_t)coeff[k] * coeff[k]) + ((int64_t)coeff[k + 16] * coeff[k + 16]) + ((int64_t)coeff[k + 32] * coeff[k + 32]) + ((int64_t)coeff[k + 48] * coeff[k + 48]); freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2); } n++; } } } if (n) { for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n; // Convert to cumulative energy for (int k = 14; k > 0; --k) energy[k] += energy[k + 1]; } else { for (int k = 1; k < 16; ++k) energy[k] = 1e+20; } } static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { // Choose an arbitrary random number static unsigned int seed = 56789; const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg; if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR; switch (resize_cfg->resize_mode) { case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; case RESIZE_FIXED: if (cpi->common.current_frame.frame_type == KEY_FRAME) new_denom = resize_cfg->resize_kf_scale_denominator; else new_denom = resize_cfg->resize_scale_denominator; break; case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; default: assert(0); } return new_denom; } int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; // Empirically found to not be beneficial for image coding. return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO && cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO && cpi->rc.frames_to_key > 1; } #define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012 #define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008 #define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008 #define SUPERRES_ENERGY_BY_AC_THRESH 0.2 static double get_energy_by_q2_thresh(const GF_GROUP *gf_group, const RATE_CONTROL *rc, int gf_frame_index) { // TODO(now): Return keyframe thresh * factor based on frame type / pyramid // level. 
if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) { return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) { if (rc->frames_to_key <= 1) return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; else return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME; } else { assert(0); } return 0; } static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, double threshq, double threshp) { const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); const double tq = threshq * q * q; const double tp = threshp * energy[1]; const double thresh = AOMMIN(tq, tp); int k; for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) { if (energy[k - 1] > thresh) break; } return 3 * SCALE_NUMERATOR - k; } static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, int sr_kf, int sr_arf) { // Use superres for Key-frames and Alt-ref frames only. const GF_GROUP *gf_group = &cpi->ppi->gf_group; if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) { return SCALE_NUMERATOR; } if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) { return SCALE_NUMERATOR; } if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) { return SCALE_NUMERATOR; } double energy[16]; analyze_hor_freq(cpi, energy); const double energy_by_q2_thresh = get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index); int denom = get_superres_denom_from_qindex_energy( qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); /* printf("\nenergy = ["); for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); printf("]\n"); printf("boost = %d\n", (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE) ? cpi->ppi->p_rc.kf_boost : cpi->rc.gfu_boost); printf("denom = %d\n", denom); */ if (av1_superres_in_recode_allowed(cpi)) { assert(cpi->superres_mode != AOM_SUPERRES_NONE); // Force superres to be tried in the recode loop, as full-res is also going // to be tried anyway. denom = AOMMAX(denom, SCALE_NUMERATOR + 1); } return denom; } static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Choose an arbitrary random number static unsigned int seed = 34567; const AV1EncoderConfig *oxcf = &cpi->oxcf; const SuperResCfg *const superres_cfg = &oxcf->superres_cfg; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; // Make sure that superres mode of the frame is consistent with the // sequence-level flag. assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE, cpi->common.seq_params->enable_superres)); assert(IMPLIES(!cpi->common.seq_params->enable_superres, superres_cfg->superres_mode == AOM_SUPERRES_NONE)); // Make sure that superres mode for current encoding is consistent with user // provided superres mode. assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO, cpi->superres_mode == superres_cfg->superres_mode)); // Note: we must look at the current superres_mode to be tried in 'cpi' here, // not the user given mode in 'oxcf'. 
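// Worked example for get_superres_denom_from_qindex_energy() above (assuming
// SCALE_NUMERATOR == 8): the loop scans k from 16 down to 9 and stops at the
// first k whose cumulative energy energy[k - 1] exceeds the threshold. A
// break at k == 12 yields a denominator of 3 * 8 - 12 = 12, i.e. the frame
// is coded at 8/12 = 2/3 of its width and upscaled after decoding. If even
// energy[15] is above the threshold (strong high-frequency content), the
// result is 8 (no scaling); if nothing exceeds it, the result is 16 (half
// width).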
switch (cpi->superres_mode) { case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; case AOM_SUPERRES_FIXED: if (cpi->common.current_frame.frame_type == KEY_FRAME) new_denom = superres_cfg->superres_kf_scale_denominator; else new_denom = superres_cfg->superres_scale_denominator; break; case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; case AOM_SUPERRES_QTHRESH: { // Do not use superres when screen content tools are used. if (cpi->common.features.allow_screen_content_tools) break; if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); // Now decide the use of superres based on 'q'. int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, &bottom_index, &top_index); const int qthresh = (frame_is_intra_only(&cpi->common)) ? superres_cfg->superres_kf_qthresh : superres_cfg->superres_qthresh; if (q <= qthresh) { new_denom = SCALE_NUMERATOR; } else { new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); } break; } case AOM_SUPERRES_AUTO: { if (cpi->common.features.allow_screen_content_tools) break; if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); // Now decide the use of superres based on 'q'. int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, &bottom_index, &top_index); const SUPERRES_AUTO_SEARCH_TYPE sr_search_type = cpi->sf.hl_sf.superres_auto_search_type; const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0; if (q <= qthresh) { new_denom = SCALE_NUMERATOR; // Don't use superres. } else { if (sr_search_type == SUPERRES_AUTO_ALL) { if (cpi->common.current_frame.frame_type == KEY_FRAME) new_denom = superres_cfg->superres_kf_scale_denominator; else new_denom = superres_cfg->superres_scale_denominator; } else { new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); } } break; } default: assert(0); } return new_denom; } static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); } static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { // Only need to check the width, as scaling is horizontal only. (void)oheight; return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); } static int validate_size_scales(RESIZE_MODE resize_mode, aom_superres_mode superres_mode, int owidth, int oheight, size_params_type *rsz) { if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do. return 1; } // Calculate current resize scale. int resize_denom = AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width), DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height)); if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) { // Alter superres scale as needed to enforce conformity. rsz->superres_denom = (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom; if (!dimensions_are_ok(owidth, oheight, rsz)) { if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom; } } else if (resize_mode == RESIZE_RANDOM && superres_mode != AOM_SUPERRES_RANDOM) { // Alter resize scale as needed to enforce conformity. 
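// dimension_is_ok() requires resize_width * SCALE_NUMERATOR >=
// owidth * superres_denom / 2, i.e. the combined resize + superres downscale
// may not exceed a factor of two in width, or equivalently
// resize_denom * superres_denom <= 2 * SCALE_NUMERATOR * SCALE_NUMERATOR.
// The division below picks the largest resize denominator satisfying that
// bound; as an illustration (with SCALE_NUMERATOR == 8), a superres
// denominator of 12 allows a resize denominator of at most 128 / 12 = 10,
// i.e. a resize to 8/10 of the original width.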
resize_denom = (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom; rsz->resize_width = owidth; rsz->resize_height = oheight; av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, resize_denom); if (!dimensions_are_ok(owidth, oheight, rsz)) { if (resize_denom > SCALE_NUMERATOR) { --resize_denom; rsz->resize_width = owidth; rsz->resize_height = oheight; av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, resize_denom); } } } else if (resize_mode == RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) { // Alter both resize and superres scales as needed to enforce conformity. do { if (resize_denom > rsz->superres_denom) --resize_denom; else --rsz->superres_denom; rsz->resize_width = owidth; rsz->resize_height = oheight; av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, resize_denom); } while (!dimensions_are_ok(owidth, oheight, rsz) && (resize_denom > SCALE_NUMERATOR || rsz->superres_denom > SCALE_NUMERATOR)); } else { // We are allowed to alter neither resize scale nor superres // scale. return 0; } return dimensions_are_ok(owidth, oheight, rsz); } // Calculates resize and superres params for next frame static size_params_type calculate_next_size_params(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height, SCALE_NUMERATOR }; int resize_denom = SCALE_NUMERATOR; if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && (cpi->common.width != cpi->oxcf.frm_dim_cfg.width || cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) { rsz.resize_width = cpi->common.width; rsz.resize_height = cpi->common.height; return rsz; } if (is_stat_generation_stage(cpi)) return rsz; if (resize_pending_params->width && resize_pending_params->height) { rsz.resize_width = resize_pending_params->width; rsz.resize_height = resize_pending_params->height; resize_pending_params->width = resize_pending_params->height = 0; if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz; } else { resize_denom = calculate_next_resize_scale(cpi); rsz.resize_width = frm_dim_cfg->width; rsz.resize_height = frm_dim_cfg->height; av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, resize_denom); } rsz.superres_denom = calculate_next_superres_scale(cpi); if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode, frm_dim_cfg->width, frm_dim_cfg->height, &rsz)) assert(0 && "Invalid scale parameters"); return rsz; } static void setup_frame_size_from_params(AV1_COMP *cpi, const size_params_type *rsz) { int encode_width = rsz->resize_width; int encode_height = rsz->resize_height; AV1_COMMON *cm = &cpi->common; cm->superres_upscaled_width = encode_width; cm->superres_upscaled_height = encode_height; cm->superres_scale_denominator = rsz->superres_denom; av1_calculate_scaled_superres_size(&encode_width, &encode_height, rsz->superres_denom); av1_set_frame_size(cpi, encode_width, encode_height); } void av1_setup_frame_size(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; // Reset superres params from previous frame. 
cm->superres_scale_denominator = SCALE_NUMERATOR; const size_params_type rsz = calculate_next_size_params(cpi); setup_frame_size_from_params(cpi, &rsz); assert(av1_is_min_tile_width_satisfied(cm)); } void av1_superres_post_encode(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; assert(cpi->oxcf.superres_cfg.enable_superres); assert(!is_lossless_requested(&cpi->oxcf.rc_cfg)); assert(!cm->features.all_lossless); av1_superres_upscale(cm, NULL, cpi->alloc_pyramid); // If regular resizing is occurring the source will need to be downscaled to // match the upscaled superres resolution. Otherwise the original source is // used. if (!av1_resize_scaled(cm)) { cpi->source = cpi->unscaled_source; if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; } else { assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); // Do downscale. cm->(width|height) has been updated by // av1_superres_upscale cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width, cm->superres_upscaled_height); } } aom-3.12.1/av1/encoder/superres_scale.h000066400000000000000000000015711477627663500177030ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_ #define AOM_AV1_ENCODER_SUPERRES_SCALE_H_ #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif int av1_superres_in_recode_allowed(const AV1_COMP *const cpi); void av1_superres_post_encode(AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_ aom-3.12.1/av1/encoder/svc_layercontext.c000066400000000000000000000743731477627663500202650ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include <math.h> #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" static void swap_ptr(void *a, void *b) { void **a_p = (void **)a; void **b_p = (void **)b; void *c = *a_p; *a_p = *b_p; *b_p = c; } void av1_init_layer_context(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; const AV1EncoderConfig *const oxcf = &cpi->oxcf; SVC *const svc = &cpi->svc; int mi_rows = cpi->common.mi_params.mi_rows; int mi_cols = cpi->common.mi_params.mi_cols; svc->base_framerate = 30.0; svc->current_superframe = 0; svc->force_zero_mode_spatial_ref = 1; svc->num_encoded_top_layer = 0; svc->use_flexible_mode = 0; svc->has_lower_quality_layer = 0; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q; lp_rc->total_actual_bits = 0; lrc->ni_tot_qi = 0; lp_rc->tot_q = 0.0; lp_rc->avg_q = 0.0; lp_rc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); lrc->rtc_external_ratectrl = 0; for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) { lp_rc->rate_correction_factors[i] = 1.0; } lc->target_bandwidth = lc->layer_target_bitrate; lp_rc->last_q[INTER_FRAME] = lrc->worst_quality; lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality; lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality; lp_rc->buffer_level = oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000; lp_rc->bits_off_target = lp_rc->buffer_level; // Initialize the cyclic refresh parameters. If spatial layers are used // (i.e., ss_number_layers > 1), these need to be updated per spatial // layer. Cyclic refresh is only applied on base temporal layer. if (svc->number_spatial_layers > 1 && tl == 0) { lc->sb_index = 0; lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; aom_free(lc->map); CHECK_MEM_ERROR(cm, lc->map, aom_calloc(mi_rows * mi_cols, sizeof(*lc->map))); } } svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; svc->last_layer_dropped[sl] = false; svc->drop_spatial_layer[sl] = false; } if (svc->number_spatial_layers == 3) { svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH; } } bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) { SVC *const svc = &cpi->svc; if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) { assert(num_layers > 1); aom_free(svc->layer_context); svc->num_allocated_layers = 0; svc->layer_context = (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context)); if (svc->layer_context == NULL) return false; svc->num_allocated_layers = num_layers; } return true; } // Update the layer context from a change_config() call.
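// Note on indexing: layer contexts live in a flat array and
// LAYER_IDS_TO_IDX(sl, tl, number_temporal_layers) evaluates to
// sl * number_temporal_layers + tl. As an illustration, with 3 spatial and
// 3 temporal layers there are 9 contexts and (sl = 1, tl = 2) maps to
// index 5, which is also how get_layer_context() locates the context of the
// layer currently being encoded.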
void av1_update_layer_context_change_config(AV1_COMP *const cpi, const int64_t target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; AV1_COMMON *const cm = &cpi->common; SVC *const svc = &cpi->svc; int layer = 0; int64_t spatial_layer_target = 0; float bitrate_alloc = 1.0; const int mi_rows = cm->mi_params.mi_rows; const int mi_cols = cm->mi_params.mi_cols; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate; } spatial_layer_target = svc->layer_context[layer].target_bandwidth; for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { LAYER_CONTEXT *const lc = &svc->layer_context[sl * svc->number_temporal_layers + tl]; RATE_CONTROL *const lrc = &lc->rc; PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; if (target_bandwidth != 0) { bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; } lp_rc->starting_buffer_level = (int64_t)(p_rc->starting_buffer_level * bitrate_alloc); lp_rc->optimal_buffer_level = (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc); lp_rc->maximum_buffer_size = (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc); lp_rc->bits_off_target = AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size); lp_rc->buffer_level = AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size); lc->framerate = cpi->framerate / lc->framerate_factor; lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl; lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); if (rc->use_external_qp_one_pass) { lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; } // Reset the cyclic refresh parameters, if needed (map is NULL), // or number of spatial layers has changed. // Cyclic refresh is only applied on base temporal layer. if (svc->number_spatial_layers > 1 && tl == 0 && (lc->map == NULL || svc->prev_number_spatial_layers != svc->number_spatial_layers)) { lc->sb_index = 0; lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; aom_free(lc->map); CHECK_MEM_ERROR(cm, lc->map, aom_calloc(mi_rows * mi_cols, sizeof(*lc->map))); } } } } /*!\brief Return layer context for current layer. * * \ingroup rate_control * \param[in] cpi Top level encoder structure * * \return LAYER_CONTEXT for current layer. */ static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) { return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; } void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; LAYER_CONTEXT *const lc = get_layer_context(cpi); RATE_CONTROL *const lrc = &lc->rc; const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / lc->framerate_factor; lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). 
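// Worked example (hypothetical numbers, for illustration only): with three
// temporal layers at 30 fps, TL0 runs at 7.5 fps and TL1 at 15 fps. If TL1's
// cumulative target is 600 kbps and TL0's is 400 kbps, the per-frame size
// attributed to TL1 alone is
//   (600000 - 400000) / (15 - 7.5) ~= 26667 bits per frame,
// i.e. the bitrate delta divided by the framerate delta, as computed below.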
if (tl == 0) { lc->avg_frame_size = lrc->avg_frame_bandwidth; } else { int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers + svc->temporal_layer_id - 1; LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer]; const double prev_layer_framerate = cpi->framerate / lcprev->framerate_factor; const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; if (lc->framerate > prev_layer_framerate) { lc->avg_frame_size = (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / (lc->framerate - prev_layer_framerate)); } else { lc->avg_frame_size = (int)round(lc->target_bandwidth / lc->framerate); } } } bool av1_check_ref_is_low_spatial_res_super_frame(AV1_COMP *const cpi, int ref_frame) { SVC *svc = &cpi->svc; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1]; return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe && rtc_ref->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1; } void av1_restore_layer_context(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; const int frames_since_scene_change = cpi->rc.frames_since_scene_change; const int last_encoded_size_keyframe = cpi->rc.last_encoded_size_keyframe; const int last_target_size_keyframe = cpi->rc.last_target_size_keyframe; const int max_consec_drop = cpi->rc.max_consec_drop; const int postencode_drop = cpi->rc.postencode_drop; const int static_since_last_scene_change = cpi->rc.static_since_last_scene_change; // Restore layer rate control. cpi->rc = lc->rc; cpi->ppi->p_rc = lc->p_rc; cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth; cpi->gf_frame_index = 0; cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude; if (cpi->mv_search_params.max_mv_magnitude == 0) cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height); // Reset the following parameters to their values before // the layer restore. Keep these defined for the stream (not layer). cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; cpi->rc.frames_since_scene_change = frames_since_scene_change; cpi->rc.last_encoded_size_keyframe = last_encoded_size_keyframe; cpi->rc.last_target_size_keyframe = last_target_size_keyframe; cpi->rc.max_consec_drop = max_consec_drop; cpi->rc.postencode_drop = postencode_drop; cpi->rc.static_since_last_scene_change = static_since_last_scene_change; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; swap_ptr(&cr->map, &lc->map); cr->sb_index = lc->sb_index; cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; } svc->skip_mvsearch_last = 0; svc->skip_mvsearch_gf = 0; svc->skip_mvsearch_altref = 0; // For each reference (LAST/GOLDEN) set the skip_mvsearch_last/gf frame flags. 
// This is to skip searching mv for that reference if it was last // refreshed (i.e., buffer slot holding that reference was refreshed) on the // previous spatial layer(s) at the same time (current_superframe). if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref && cpi->sf.rt_sf.use_nonrd_pick_mode) { if (av1_check_ref_is_low_spatial_res_super_frame(cpi, LAST_FRAME)) { svc->skip_mvsearch_last = 1; } if (av1_check_ref_is_low_spatial_res_super_frame(cpi, GOLDEN_FRAME)) { svc->skip_mvsearch_gf = 1; } if (av1_check_ref_is_low_spatial_res_super_frame(cpi, ALTREF_FRAME)) { svc->skip_mvsearch_altref = 1; } } } void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; const unsigned int current_frame = cpi->ppi->use_svc ? svc->current_superframe : cpi->common.current_frame.frame_number; // For any buffer slot that is refreshed, update it with // the spatial_layer_id and the current_superframe. if (cpi->common.current_frame.frame_type == KEY_FRAME) { // All slots are refreshed on KEY. for (unsigned int i = 0; i < REF_FRAMES; i++) { rtc_ref->buffer_time_index[i] = current_frame; rtc_ref->buffer_spatial_layer[i] = svc->spatial_layer_id; } } else if (rtc_ref->set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { const int ref_frame_map_idx = rtc_ref->ref_idx[i]; if (rtc_ref->refresh[ref_frame_map_idx]) { rtc_ref->buffer_time_index[ref_frame_map_idx] = current_frame; rtc_ref->buffer_spatial_layer[ref_frame_map_idx] = svc->spatial_layer_id; } } } } void av1_save_layer_context(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = get_layer_context(cpi); lc->rc = cpi->rc; lc->p_rc = cpi->ppi->p_rc; lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth; lc->group_index = cpi->gf_frame_index; lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; signed char *temp = lc->map; lc->map = cr->map; cr->map = temp; lc->sb_index = cr->sb_index; lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; } if (!cpi->is_dropped_frame) { av1_svc_update_buffer_slot_refreshed(cpi); for (unsigned int i = 0; i < REF_FRAMES; i++) { if (frame_is_intra_only(cm) || cm->current_frame.refresh_frame_flags & (1 << i)) { svc->spatial_layer_fb[i] = svc->spatial_layer_id; svc->temporal_layer_fb[i] = svc->temporal_layer_id; } } } if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { svc->current_superframe++; // Reset drop flag to false for next superframe. 
for (int sl = 0; sl < svc->number_spatial_layers; sl++) svc->drop_spatial_layer[sl] = false; } } int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) { const SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; int fb_idx = -1; int primary_ref_frame = PRIMARY_REF_NONE; if (cpi->svc.number_spatial_layers > 1 || cpi->svc.number_temporal_layers > 1) { // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST // was last updated on a lower temporal layer (or base TL0) and for the // same spatial layer. For RTC patterns this allows for continued decoding // when set of enhancement layers are dropped (continued decoding starting // at next base TL0), so error_resilience can be off/0 for all layers. fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME); if (cpi->ppi->rtc_ref.reference[0] == 1 && svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id && (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id || svc->temporal_layer_fb[fb_idx] == 0)) { primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME } } else if (cpi->ppi->rtc_ref.set_ref_frame_config) { const ExternalFlags *const ext_flags = &cpi->ext_flags; int flags = ext_flags->ref_frame_flags; if (flags & AOM_LAST_FLAG) { primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME } else if (flags & AOM_GOLD_FLAG) { primary_ref_frame = GOLDEN_FRAME - LAST_FRAME; } else if (flags & AOM_ALT_FLAG) { primary_ref_frame = ALTREF_FRAME - LAST_FRAME; } } return primary_ref_frame; } void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; aom_free(lc->map); lc->map = NULL; } } } void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) { SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; if (is_key) lc->frames_from_key_frame = 0; } } av1_update_temporal_layer_framerate(cpi); av1_restore_layer_context(cpi); } void av1_get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; if (den == 1 && num == 1) { *width_out = width_org; *height_out = height_org; return; } w = width_org * num / den; h = height_org * num / den; // Make height and width even. w += w % 2; h += h % 2; *width_out = w; *height_out = h; } void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = NULL; int width = 0, height = 0; lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + svc->temporal_layer_id]; // Set the lower quality layer flag. 
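// Illustrative note (not from the upstream comments): a "lower quality
// layer" here means the preceding spatial layer is coded at the same
// resolution (scaling_factor_num == scaling_factor_den == 1) but at a lower
// bitrate, so it can act as a same-resolution reference. For example, a
// two-spatial-layer configuration with both layers at 1280x720 would set the
// flag for spatial_layer_id == 1, while a 2x spatial configuration
// (640x360 -> 1280x720) would not.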
svc->has_lower_quality_layer = 0; if (cpi->svc.spatial_layer_id > 0) { const LAYER_CONTEXT *lc_prev = &svc->layer_context[(svc->spatial_layer_id - 1) * svc->number_temporal_layers + svc->temporal_layer_id]; if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1) svc->has_lower_quality_layer = 1; } av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); // Use Eightap_smooth for low resolutions. if (width * height <= 320 * 240) svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; cm->width = width; cm->height = height; alloc_mb_mode_info_buffers(cpi); av1_update_frame_size(cpi); if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { svc->mi_cols_full_resoln = cm->mi_params.mi_cols; svc->mi_rows_full_resoln = cm->mi_params.mi_rows; } } enum { SVC_LAST_FRAME = 0, SVC_LAST2_FRAME, SVC_LAST3_FRAME, SVC_GOLDEN_FRAME, SVC_BWDREF_FRAME, SVC_ALTREF2_FRAME, SVC_ALTREF_FRAME }; // For fixed svc mode: fixed pattern is set based on the number of // spatial and temporal layers, and the ksvc_fixed_mode. void av1_set_svc_fixed_mode(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; int i; assert(svc->use_flexible_mode == 0); // Fixed SVC mode only supports at most 3 spatial or temporal layers. assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 && svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3); rtc_ref->set_ref_frame_config = 1; int superframe_cnt = svc->current_superframe; // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). for (i = 0; i < INTER_REFS_PER_FRAME; i++) { rtc_ref->reference[i] = 0; rtc_ref->ref_idx[i] = i; } for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0; // Always reference LAST, and reference GOLDEN on SL > 0. // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later // when frame_type is set. rtc_ref->reference[SVC_LAST_FRAME] = 1; if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1; if (svc->temporal_layer_id == 0) { // Base temporal layer. if (svc->spatial_layer_id == 0) { // Set all buffer_idx to 0. Update slot 0 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; rtc_ref->refresh[0] = 1; } else if (svc->spatial_layer_id == 1) { // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to // slot 0. Update slot 1 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; rtc_ref->refresh[1] = 1; } else if (svc->spatial_layer_id == 2) { // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to // slot 1. Update slot 2 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1; rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; rtc_ref->refresh[2] = 1; } } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) { // First top temporal enhancement layer. if (svc->spatial_layer_id == 0) { // Reference LAST (slot 0). // Set GOLDEN to slot 3 and update slot 3. // Set all other buffer_idx to slot 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; rtc_ref->refresh[3] = 1; } } else if (svc->spatial_layer_id == 1) { // Reference LAST and GOLDEN. 
Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 3. // Set LAST2 to slot 4 and Update slot 4. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3; rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; rtc_ref->refresh[4] = 1; } } else if (svc->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, // GOLDEN (and all other refs) to slot 4. // No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4; rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; } } else if (svc->temporal_layer_id == 1) { // Middle temporal enhancement layer. if (svc->spatial_layer_id == 0) { // Reference LAST. // Set all buffer_idx to 0. // Set GOLDEN to slot 5 and update slot 5. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5; rtc_ref->refresh[5] = 1; } } else if (svc->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 5. // Set LAST3 to slot 6 and update slot 6. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5; rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6; rtc_ref->refresh[6] = 1; } } else if (svc->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, // GOLDEN (and all other refs) to slot 6. // Set LAST3 to slot 7 and update slot 7. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6; rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7; rtc_ref->refresh[7] = 1; } } } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) { // Second top temporal enhancement layer. if (svc->spatial_layer_id == 0) { // Set LAST to slot 5 and reference LAST. // Set GOLDEN to slot 3 and update slot 3. // Set all other buffer_idx to 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; rtc_ref->ref_idx[SVC_LAST_FRAME] = 5; if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; rtc_ref->refresh[3] = 1; } } else if (svc->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; rtc_ref->ref_idx[SVC_LAST_FRAME] = 6; rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; rtc_ref->refresh[4] = 1; } } else if (svc->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, // GOLDEN to slot 4. No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; rtc_ref->ref_idx[SVC_LAST_FRAME] = 7; rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4; } } } void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { // Check for reset based on avg_frame_bandwidth for spatial layer sl. 
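// (Illustrative summary of the reset condition used further below:
//  avg_frame_bandwidth / 3 > prev / 2  or  avg_frame_bandwidth < prev / 2,
//  i.e. the per-frame bandwidth grew by more than ~50% or dropped to less
//  than half of its previous value.)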
// If avg_frame_bandwidth for top temporal layer is not set // (because enhancement layer was inactive), use the base TL0 int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; int avg_frame_bandwidth = lrc->avg_frame_bandwidth; int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth; if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) { // Use base TL0. layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers); lc = &svc->layer_context[layer]; lrc = &lc->rc; avg_frame_bandwidth = lrc->avg_frame_bandwidth; prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth; } if (avg_frame_bandwidth / 3 > (prev_avg_frame_bandwidth >> 1) || avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) { // Reset for all temporal layers with spatial layer sl. for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); LAYER_CONTEXT *lc2 = &svc->layer_context[layer2]; RATE_CONTROL *lrc2 = &lc2->rc; PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc; PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc; lrc2->rc_1_frame = 0; lrc2->rc_2_frame = 0; lp_rc2->bits_off_target = lp_rc->optimal_buffer_level; lp_rc2->buffer_level = lp_rc->optimal_buffer_level; } } } } void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input, YV12_BUFFER_CONFIG *prev_source) { frame_input->last_source = prev_source != NULL ? prev_source : NULL; if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped && cpi->rc.frame_number_encoded > 0) { frame_input->last_source = &cpi->svc.source_last_TL0; } else { RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; if (cpi->svc.spatial_layer_id == 0) { // For base spatial layer: if the LAST reference (index 0) is not // the previous (super)frame set the last_source to the source // corresponding to the last TL0, otherwise keep it at prev_source. // Always use source_last_TL0 if previous base TL0 was dropped. if (cpi->svc.current_superframe > 0) { const int buffslot_last = rtc_ref->ref_idx[0]; // Check if previous frame was dropped on base TL0 layer. const int layer = LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; if (lrc->prev_frame_is_dropped || rtc_ref->buffer_time_index[buffslot_last] < cpi->svc.current_superframe - 1) { frame_input->last_source = &cpi->svc.source_last_TL0; } } } else if (cpi->svc.spatial_layer_id > 0) { // For spatial enhancement layers: the previous source (prev_source) // corresponds to the lower spatial layer (which is the same source so // we can't use that), so always set the last_source to the source of the // last TL0. if (cpi->svc.current_superframe > 0) frame_input->last_source = &cpi->svc.source_last_TL0; else frame_input->last_source = NULL; } } } int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) { RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; int min_dist = INT_MAX; const unsigned int current_frame_num = cpi->ppi->use_svc ? 
cpi->svc.current_superframe : cpi->common.current_frame.frame_number; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { if (rtc_ref->reference[i]) { const int ref_frame_map_idx = rtc_ref->ref_idx[i]; const int dist = current_frame_num - rtc_ref->buffer_time_index[ref_frame_map_idx]; if (dist < min_dist) min_dist = dist; } } return min_dist; } void av1_svc_set_reference_was_previous(AV1_COMP *cpi) { RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; // Check if the encoded frame had some reference that was the // previous frame. const unsigned int current_frame = cpi->ppi->use_svc ? cpi->svc.current_superframe : cpi->common.current_frame.frame_number; rtc_ref->reference_was_previous_frame = true; if (current_frame > 0) { rtc_ref->reference_was_previous_frame = false; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { if (rtc_ref->reference[i]) { const int ref_frame_map_idx = rtc_ref->ref_idx[i]; if (rtc_ref->buffer_time_index[ref_frame_map_idx] == current_frame - 1) rtc_ref->reference_was_previous_frame = true; } } } } aom-3.12.1/av1/encoder/svc_layercontext.h000066400000000000000000000223021477627663500202530ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ #define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ #include "aom_scale/yv12config.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ratectrl.h" #ifdef __cplusplus extern "C" { #endif /*! * \brief The stucture of quantities related to each spatial and temporal layer. * \ingroup SVC */ typedef struct { /*!\cond */ RATE_CONTROL rc; PRIMARY_RATE_CONTROL p_rc; int framerate_factor; int64_t layer_target_bitrate; // In bits per second. int scaling_factor_num; int scaling_factor_den; int64_t target_bandwidth; int64_t spatial_layer_target_bandwidth; double framerate; int avg_frame_size; int max_q; int min_q; int frames_from_key_frame; /*!\endcond */ /*! * Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. */ int sb_index; /*! * Segmentation map */ int8_t *map; /*! * Number of blocks on segment 1 */ int actual_num_seg1_blocks; /*! * Number of blocks on segment 2 */ int actual_num_seg2_blocks; /*! * Counter used to detect scene change. */ int counter_encode_maxq_scene_change; /*! * Speed settings for each layer. */ uint8_t speed; /*! * GF group index. */ unsigned char group_index; /*! * If current layer is key frame. */ int is_key_frame; /*! * Maximum motion magnitude of previous encoded layer. */ int max_mv_magnitude; } LAYER_CONTEXT; /*! * \brief The stucture of SVC. 
* \ingroup SVC */ typedef struct SVC { /*!\cond */ int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; int number_temporal_layers; int prev_number_spatial_layers; int use_flexible_mode; int ksvc_fixed_mode; /*!\endcond */ /*!\cond */ double base_framerate; unsigned int current_superframe; int skip_mvsearch_last; int skip_mvsearch_gf; int skip_mvsearch_altref; int spatial_layer_fb[REF_FRAMES]; int temporal_layer_fb[REF_FRAMES]; int num_encoded_top_layer; int first_layer_denoise; YV12_BUFFER_CONFIG source_last_TL0; int mi_cols_full_resoln; int mi_rows_full_resoln; /*!\endcond */ /*! * Layer context used for rate control in CBR mode. * An array. The index for spatial layer `sl` and temporal layer `tl` is * sl * number_temporal_layers + tl. */ LAYER_CONTEXT *layer_context; /*! * Number of layers allocated for layer_context. If nonzero, must be greater * than or equal to number_spatial_layers * number_temporal_layers. */ int num_allocated_layers; /*! * EIGHTTAP_SMOOTH or BILINEAR */ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS]; /*! * Downsample_filter_phase: = 0 will do sub-sampling (no weighted average), * = 8 will center the target pixel and get a symmetric averaging filter. */ int downsample_filter_phase[AOM_MAX_SS_LAYERS]; /*! * Force zero-mv in mode search for the spatial/inter-layer reference. */ int force_zero_mode_spatial_ref; /*! * Flag to indicate that current spatial layer has a lower quality layer * (at the same timestamp) that can be used as a reference. * Lower quality layer refers to the same resolution but encoded at * different/lower bitrate. */ int has_lower_quality_layer; /*! * Flag to indicate the frame drop mode for SVC: one of the two settings: * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP. */ AOM_SVC_FRAME_DROP_MODE framedrop_mode; /*! * Flag to indicate if frame was dropped for a given spatial_layer_id on * previous superframe. */ bool last_layer_dropped[AOM_MAX_SS_LAYERS]; /*! * Flag to indicate if a previous spatial was dropped for the same superframe. */ bool drop_spatial_layer[AOM_MAX_SS_LAYERS]; } SVC; struct AV1_COMP; struct EncodeFrameInput; /*!\brief Initialize layer context data from init_config(). * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Nothing returned. Set cpi->svc. */ void av1_init_layer_context(struct AV1_COMP *const cpi); /*!\brief Allocate layer context data. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] num_layers Number of layers to be allocated * * \remark Allocates memory for cpi->svc.layer_context. * \return True on success, false on allocation failure. */ bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers); /*!\brief Update the layer context from a change_config() call. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] target_bandwidth Total target bandwidth * * \remark Nothing returned. Buffer level for each layer is set. */ void av1_update_layer_context_change_config(struct AV1_COMP *const cpi, const int64_t target_bandwidth); /*!\brief Prior to encoding the frame, update framerate-related quantities for the current temporal layer. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Nothing returned. Frame related quantities for current temporal layer are updated. 
*/ void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); /*!\brief Prior to check if reference is lower spatial layer at the same * timestamp/superframe. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] ref_frame Reference frame * * \return True if the ref_frame if lower spatial layer, otherwise false. */ bool av1_check_ref_is_low_spatial_res_super_frame(struct AV1_COMP *const cpi, int ref_frame); /*!\brief Prior to encoding the frame, set the layer context, for the current layer to be encoded, to the cpi struct. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \remark Nothing returned. Layer context for current layer is set. */ void av1_restore_layer_context(struct AV1_COMP *const cpi); /*!\brief Save the layer context after encoding the frame. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure */ void av1_save_layer_context(struct AV1_COMP *const cpi); /*!\brief Free the memory used for cyclic refresh in layer context. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure */ void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi); /*!\brief Reset on key frame: reset counters, references and buffer updates. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * \param[in] is_key Whether current layer is key frame */ void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key); /*!\brief Before encoding, set resolutions and allocate compressor data. * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure */ void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi); /*!\brief Get primary reference frame for current layer * * \ingroup SVC * \callgraph * \callergraph * * \param[in] cpi Top level encoder structure * * \return The primary reference frame for current layer. */ int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi); /*!\brief Get resolution for current layer. * * \ingroup SVC * \param[in] width_org Original width, unscaled * \param[in] height_org Original height, unscaled * \param[in] num Numerator for the scale ratio * \param[in] den Denominator for the scale ratio * \param[in] width_out Output width, scaled for current layer * \param[in] height_out Output height, scaled for current layer * * \remark Nothing is returned. Instead the scaled width and height are set. */ void av1_get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out); void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi); void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi); void av1_svc_set_last_source(struct AV1_COMP *const cpi, struct EncodeFrameInput *frame_input, YV12_BUFFER_CONFIG *prev_source); void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi); int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi); void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ aom-3.12.1/av1/encoder/temporal_filter.c000066400000000000000000002067201477627663500200520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/mathutils.h" #include "aom_dsp/odintrin.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/pass2_strategy.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/temporal_filter.h" /*!\cond */ // NOTE: All `tf` in this file means `temporal filtering`. // Forward Declaration. static void tf_determine_block_partition(const MV block_mv, const int block_mse, MV *subblock_mvs, int *subblock_mses); // This function returns the minimum and maximum log variances for 4x4 sub // blocks in the current block. static inline void get_log_var_4x4sub_blk( AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row, int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min, double *blk_4x4_var_max, int is_hbd) { const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; int var_min = INT_MAX; int var_max = 0; // Derive the source buffer. const int src_stride = frame_to_filter->y_stride; const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width; const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset; aom_variance_fn_t vf = cpi->ppi->fn_ptr[BLOCK_4X4].vf; for (int i = 0; i < mb_height; i += MI_SIZE) { for (int j = 0; j < mb_width; j += MI_SIZE) { // Calculate the 4x4 sub-block variance. const int var = av1_calc_normalized_variance( vf, src_buf + (i * src_stride) + j, src_stride, is_hbd); // Record min and max for over-arching block var_min = AOMMIN(var_min, var); var_max = AOMMAX(var_max, var); } } *blk_4x4_var_min = log1p(var_min / 16.0); *blk_4x4_var_max = log1p(var_max / 16.0); } // Helper function to get `q` used for encoding. static int get_q(const AV1_COMP *cpi) { const GF_GROUP *gf_group = &cpi->ppi->gf_group; const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; const int q = (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type], cpi->common.seq_params->bit_depth); return q; } /*!\endcond */ /*!\brief Does motion search for blocks in temporal filtering. This is * the first step for temporal filtering. More specifically, given a frame to * be filtered and another frame as reference, this function searches the * reference frame to find out the most similar block as that from the frame * to be filtered. This found block will be further used for weighted * averaging. 
* * NOTE: Besides doing motion search for the entire block, this function will * also do motion search for each 1/4 sub-block to get more precise * predictions. Then, this function will determines whether to use 4 * sub-blocks to replace the entire block. If we do need to split the * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to * the searched motion vector and search error (MSE) w.r.t. each sub-block * respectively. Otherwise, the 4 elements will be the same, all of which * are assigned as the searched motion vector and search error (MSE) for * the entire block. * * \ingroup src_frame_proc * \param[in] cpi Top level encoder instance structure * \param[in] mb Pointer to macroblock * \param[in] frame_to_filter Pointer to the frame to be filtered * \param[in] ref_frame Pointer to the reference frame * \param[in] block_size Block size used for motion search * \param[in] mb_row Row index of the block in the frame * \param[in] mb_col Column index of the block in the frame * \param[in] ref_mv Reference motion vector, which is commonly * inherited from the motion search result of * previous frame. * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at * 16x16 sub-block level is needed or not. * \param[out] subblock_mvs Pointer to the motion vectors for * 4 sub-blocks * \param[out] subblock_mses Pointer to the search errors (MSE) for * 4 sub-blocks * \param[out] is_dc_diff_large Pointer to the value that tells if the DC * difference is large for the block * * \remark Nothing will be returned. Results are saved in subblock_mvs and * subblock_mses */ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb, const YV12_BUFFER_CONFIG *frame_to_filter, const YV12_BUFFER_CONFIG *ref_frame, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, MV *ref_mv, bool allow_me_for_sub_blks, MV *subblock_mvs, int *subblock_mses, int *is_dc_diff_large) { // Frame information const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height); // Block information (ONLY Y-plane is used for motion search). const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int mb_pels = mb_height * mb_width; const int y_stride = frame_to_filter->y_stride; const int src_width = frame_to_filter->y_width; const int ref_width = ref_frame->y_width; assert(y_stride == ref_frame->y_stride); assert(src_width == ref_width); const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; // Save input state. MACROBLOCKD *const mbd = &mb->e_mbd; const struct buf_2d ori_src_buf = mb->plane[0].src; const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; // Parameters used for motion search. FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; SUBPEL_MOTION_SEARCH_PARAMS ms_params; const int step_param = av1_init_search_range( AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height)); const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS; const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; const MV_COST_TYPE mv_cost_type = min_frame_size >= 720 ? MV_COST_L1_HDRES : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES); // Starting position for motion search. FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); // Baseline position for motion search (used for rate distortion comparison). const MV baseline_mv = kZeroMv; // Setup. 
mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset; mb->plane[0].src.stride = y_stride; mb->plane[0].src.width = src_width; mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset; mbd->plane[0].pre[0].stride = y_stride; mbd->plane[0].pre[0].width = ref_width; *is_dc_diff_large = 0; const SEARCH_METHODS search_method = NSTEP; const search_site_config *search_site_cfg = av1_get_search_site_config(cpi, mb, search_method); // Unused intermediate results for motion search. unsigned int sse, error; int distortion; int cost_list[5]; // Do motion search. int_mv best_mv; // Searched motion vector. FULLPEL_MV_STATS best_mv_stats; int block_mse = INT_MAX; MV block_mv = kZeroMv; const int q = get_q(cpi); av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, &baseline_mv, start_mv, search_site_cfg, search_method, /*fine_search_interval=*/0); full_ms_params.run_mesh_search = 1; full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; full_ms_params.mesh_search_mv_diff_threshold = 2; } av1_full_pixel_search(start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, &best_mv_stats, NULL); if (force_integer_mv == 1) { // Only do full search on the entire block. const int mv_row = best_mv.as_mv.row; const int mv_col = best_mv.as_mv.col; best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); const int mv_offset = mv_row * y_stride + mv_col; error = cpi->ppi->fn_ptr[block_size].vf( ref_frame->y_buffer + y_offset + mv_offset, y_stride, frame_to_filter->y_buffer + y_offset, y_stride, &sse); block_mse = DIVIDE_AND_ROUND(error, mb_pels); block_mv = best_mv.as_mv; } else { // Do fractional search on the entire block and all sub-blocks. av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size, &baseline_mv, cost_list); ms_params.forced_stop = EIGHTH_PEL; ms_params.var_params.subpel_search_type = subpel_search_type; // Since we are merely refining the result from full pixel search, we don't // need regularization for subpel search ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; best_mv_stats.err_cost = 0; MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); error = cpi->mv_search_params.find_fractional_mv_step( &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL); block_mse = DIVIDE_AND_ROUND(error, mb_pels); block_mv = best_mv.as_mv; *ref_mv = best_mv.as_mv; *is_dc_diff_large = 50 * error < sse; if (allow_me_for_sub_blks) { // On 4 sub-blocks. 
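      // (Illustrative note, not from the original comments: the sub-block
      // size is the filtering block halved in each dimension, e.g. a 32x32
      // block is refined with four 16x16 motion searches, each seeded with
      // the full-block result held in `start_mv`.)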
const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1]; const int subblock_height = block_size_high[subblock_size]; const int subblock_width = block_size_wide[subblock_size]; const int subblock_pels = subblock_height * subblock_width; start_mv = get_fullmv_from_mv(ref_mv); int subblock_idx = 0; for (int i = 0; i < mb_height; i += subblock_height) { for (int j = 0; j < mb_width; j += subblock_width) { const int offset = i * y_stride + j; mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; av1_make_default_fullpel_ms_params( &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv, search_site_cfg, search_method, /*fine_search_interval=*/0); full_ms_params.run_mesh_search = 1; full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; full_ms_params.mesh_search_mv_diff_threshold = 2; } av1_full_pixel_search(start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, &best_mv_stats, NULL); av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, &baseline_mv, cost_list); ms_params.forced_stop = EIGHTH_PEL; ms_params.var_params.subpel_search_type = subpel_search_type; // Since we are merely refining the result from full pixel search, we // don't need regularization for subpel search ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; best_mv_stats.err_cost = 0; subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); assert( av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); error = cpi->mv_search_params.find_fractional_mv_step( &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL); subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); subblock_mvs[subblock_idx] = best_mv.as_mv; ++subblock_idx; } } } } // Restore input state. mb->plane[0].src = ori_src_buf; mbd->plane[0].pre[0] = ori_pre_buf; // Make partition decision. if (allow_me_for_sub_blks) { tf_determine_block_partition(block_mv, block_mse, subblock_mvs, subblock_mses); } else { // Copy 32X32 block mv and mse values to sub blocks for (int i = 0; i < 4; ++i) { subblock_mvs[i] = block_mv; subblock_mses[i] = block_mse; } } // Do not pass down the reference motion vector if error is too large. const int thresh = (min_frame_size >= 720) ? 12 : 3; if (block_mse > (thresh << (mbd->bd - 8))) { *ref_mv = kZeroMv; } } /*!\cond */ // Determines whether to split the entire block to 4 sub-blocks for filtering. // In particular, this decision is made based on the comparison between the // motion search error of the entire block and the errors of all sub-blocks. // Inputs: // block_mv: Motion vector for the entire block (ONLY as reference). // block_mse: Motion search error (MSE) for the entire block (ONLY as // reference). // subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be // modified based on the partition decision). // subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will // be modified based on the partition decision). // Returns: // Nothing will be returned. Results are saved in `subblock_mvs` and // `subblock_mses`. 
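// Illustrative example (hypothetical numbers): if block_mse = 40 and the
// sub-block MSEs are {36, 38, 40, 42}, then sum_subblock_mse = 156 and
// 40 * 15 = 600 < 156 * 4 = 624 with a spread of 6 < 48, so the split is
// rejected and all four sub-blocks keep the whole-block motion vector. If one
// sub-block instead had a much larger MSE (say a moving object in one
// quadrant), neither condition holds and the per-sub-block vectors are kept.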
static void tf_determine_block_partition(const MV block_mv, const int block_mse, MV *subblock_mvs, int *subblock_mses) { int min_subblock_mse = INT_MAX; int max_subblock_mse = INT_MIN; int64_t sum_subblock_mse = 0; for (int i = 0; i < 4; ++i) { sum_subblock_mse += subblock_mses[i]; min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]); max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]); } // TODO(any): The following magic numbers may be tuned to improve the // performance OR find a way to get rid of these magic numbers. if (((block_mse * 15 < sum_subblock_mse * 4) && max_subblock_mse - min_subblock_mse < 48) || ((block_mse * 14 < sum_subblock_mse * 4) && max_subblock_mse - min_subblock_mse < 24)) { // No split. for (int i = 0; i < 4; ++i) { subblock_mvs[i] = block_mv; subblock_mses[i] = block_mse; } } } // Helper function to determine whether a frame is encoded with high bit-depth. static inline int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; } /*!\endcond */ /*!\brief Builds predictor for blocks in temporal filtering. This is the * second step for temporal filtering, which is to construct predictions from * all reference frames INCLUDING the frame to be filtered itself. These * predictors are built based on the motion search results (motion vector is * set as 0 for the frame to be filtered), and will be futher used for * weighted averaging. * * \ingroup src_frame_proc * \param[in] ref_frame Pointer to the reference frame (or the frame * to be filtered) * \param[in] mbd Pointer to the block for filtering. Besides * containing the subsampling information of all * planes, this field also gives the searched * motion vector for the entire block, i.e., * `mbd->mi[0]->mv[0]`. This vector should be 0 * if the `ref_frame` itself is the frame to be * filtered. * \param[in] block_size Size of the block * \param[in] mb_row Row index of the block in the frame * \param[in] mb_col Column index of the block in the frame * \param[in] num_planes Number of planes in the frame * \param[in] scale Scaling factor * \param[in] subblock_mvs The motion vectors for each sub-block (row-major * order) * \param[out] pred Pointer to the predictor to be built * * \remark Nothing returned, But the contents of `pred` will be modified */ static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const struct scale_factors *scale, const MV *subblock_mvs, uint8_t *pred) { // Information of the entire block. const int mb_height = block_size_high[block_size]; // Height. const int mb_width = block_size_wide[block_size]; // Width. const int mb_y = mb_height * mb_row; // Y-coord (Top-left). const int mb_x = mb_width * mb_col; // X-coord (Top-left). const int bit_depth = mbd->bd; // Bit depth. const int is_intrabc = 0; // Is intra-copied? const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame); // Default interpolation filters. const int_interpfilters interp_filters = av1_broadcast_interp_filter(MULTITAP_SHARP2); // Handle Y-plane, U-plane and V-plane (if needed) in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const int subsampling_y = mbd->plane[plane].subsampling_y; const int subsampling_x = mbd->plane[plane].subsampling_x; // Information of each sub-block in current plane. const int plane_h = mb_height >> subsampling_y; // Plane height. 
const int plane_w = mb_width >> subsampling_x; // Plane width. const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left). const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left). const int h = plane_h >> 1; // Sub-block height. const int w = plane_w >> 1; // Sub-block width. const int is_y_plane = (plane == 0); // Is Y-plane? const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane], ref_frame->widths[is_y_plane ? 0 : 1], ref_frame->heights[is_y_plane ? 0 : 1], ref_frame->strides[is_y_plane ? 0 : 1] }; // Handle each subblock. int subblock_idx = 0; for (int i = 0; i < plane_h; i += h) { for (int j = 0; j < plane_w; j += w) { // Choose proper motion vector. const MV mv = subblock_mvs[subblock_idx++]; assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX && mv.col >= INT16_MIN && mv.col <= INT16_MAX); const int y = plane_y + i; const int x = plane_x + j; // Build predictior for each sub-block on current plane. InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, scale, &ref_buf, interp_filters); inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j], plane_w, &mv, &inter_pred_params); } } plane_offset += plane_h * plane_w; } } /*!\cond */ // Computes temporal filter weights and accumulators for the frame to be // filtered. More concretely, the filter weights for all pixels are the same. // Inputs: // mbd: Pointer to the block for filtering, which is ONLY used to get // subsampling information of all planes as well as the bit-depth. // block_size: Size of the block. // num_planes: Number of planes in the frame. // pred: Pointer to the well-built predictors. // accum: Pointer to the pixel-wise accumulator for filtering. // count: Pointer to the pixel-wise counter fot filtering. // Returns: // Nothing will be returned. But the content to which `accum` and `pred` // point will be modified. static void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, uint32_t *accum, uint16_t *count) { // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int is_high_bitdepth = is_cur_buf_hbd(mbd); int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const int subsampling_y = mbd->plane[plane].subsampling_y; const int subsampling_x = mbd->plane[plane].subsampling_x; const int h = mb_height >> subsampling_y; // Plane height. const int w = mb_width >> subsampling_x; // Plane width. const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1]; const uint8_t *buf8 = ref_frame->buffers[plane]; const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8); const int frame_offset = mb_row * h * frame_stride + mb_col * w; int pred_idx = 0; int pixel_idx = 0; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { const int idx = plane_offset + pred_idx; // Index with plane shift. const int pred_value = is_high_bitdepth ? buf16[frame_offset + pixel_idx] : buf8[frame_offset + pixel_idx]; accum[idx] += TF_WEIGHT_SCALE * pred_value; count[idx] += TF_WEIGHT_SCALE; ++pred_idx; ++pixel_idx; } pixel_idx += (frame_stride - w); } plane_offset += h * w; } } // Function to compute pixel-wise squared difference between two buffers. // Inputs: // ref: Pointer to reference buffer. 
// ref_offset: Start position of reference buffer for computation. // ref_stride: Stride for reference buffer. // tgt: Pointer to target buffer. // tgt_offset: Start position of target buffer for computation. // tgt_stride: Stride for target buffer. // height: Height of block for computation. // width: Width of block for computation. // is_high_bitdepth: Whether the two buffers point to high bit-depth frames. // square_diff: Pointer to save the squared differces. // Returns: // Nothing will be returned. But the content to which `square_diff` points // will be modified. static inline void compute_square_diff(const uint8_t *ref, const int ref_offset, const int ref_stride, const uint8_t *tgt, const int tgt_offset, const int tgt_stride, const int height, const int width, const int is_high_bitdepth, uint32_t *square_diff) { const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt); int ref_idx = 0; int tgt_idx = 0; int idx = 0; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; ++j) { const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx] : ref[ref_offset + ref_idx]; const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx] : tgt[tgt_offset + tgt_idx]; const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value) : (tgt_value - ref_value); square_diff[idx] = diff * diff; ++ref_idx; ++tgt_idx; ++idx; } ref_idx += (ref_stride - width); tgt_idx += (tgt_stride - width); } } // Function to accumulate pixel-wise squared difference between two luma buffers // to be consumed while filtering the chroma planes. // Inputs: // square_diff: Pointer to squared differences from luma plane. // luma_sse_sum: Pointer to save the sum of luma squared differences. // block_height: Height of block for computation. // block_width: Width of block for computation. // ss_x_shift: Chroma subsampling shift in 'X' direction // ss_y_shift: Chroma subsampling shift in 'Y' direction // Returns: // Nothing will be returned. But the content to which `luma_sse_sum` points // will be modified. static void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum, int block_height, int block_width, int ss_x_shift, int ss_y_shift) { for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. const int ww = block_width << ss_x_shift; // Width of Y-plane. luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx]; } } } } } /*!\endcond */ /*!\brief Applies temporal filtering. NOTE that there are various optimised * versions of this function called where the appropriate instruction set is * supported. * * \ingroup src_frame_proc * \param[in] frame_to_filter Pointer to the frame to be filtered, which is * used as reference to compute squared * difference from the predictor. 
* \param[in] mbd Pointer to the block for filtering, ONLY used * to get subsampling information for the planes * \param[in] block_size Size of the block * \param[in] mb_row Row index of the block in the frame * \param[in] mb_col Column index of the block in the frame * \param[in] num_planes Number of planes in the frame * \param[in] noise_levels Estimated noise levels for each plane * in the frame (Y,U,V) * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks * \param[in] subblock_mses Pointer to the search errors (MSE) for 4 * sub-blocks * \param[in] q_factor Quantization factor. This is actually the `q` * defined in libaom, converted from `qindex` * \param[in] filter_strength Filtering strength. This value lies in range * [0, 6] where 6 is the maximum strength. * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during * temporal filtering * \param[out] pred Pointer to the well-built predictors * \param[out] accum Pointer to the pixel-wise accumulator for * filtering * \param[out] count Pointer to the pixel-wise counter for * filtering * * \remark Nothing returned, But the contents of `accum`, `pred` and 'count' * will be modified */ void av1_apply_temporal_filter_c( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int mb_pels = mb_height * mb_width; const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter); const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred); // Frame information. const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Decay factors for non-local mean approach. double decay_factor[MAX_MB_PLANE] = { 0 }; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); for (int plane = 0; plane < num_planes; plane++) { // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); decay_factor[plane] = 1 / (n_decay * q_decay * s_decay); } double d_factor[4] = { 0 }; for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. 
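    // (Illustrative numbers, hypothetical: for a 1280x720 frame,
    // min_frame_size = 720 and the distance threshold below is
    // 720 * TF_SEARCH_DISTANCE_THRESHOLD. A sub-block whose motion vector is
    // shorter than that threshold gets d_factor = 1 (no penalty); longer
    // vectors scale the error up linearly, shrinking the filter weight.)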
const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Allocate memory for pixel-wise squared differences. They, // regardless of the subsampling, are assigned with memory of size `mb_pels`. uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t)); if (!square_diff) { aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR, "Error allocating temporal filter data"); } memset(square_diff, 0, mb_pels * sizeof(square_diff[0])); // Allocate memory for accumulated luma squared error. This value will be // consumed while filtering the chroma planes. uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t)); if (!luma_sse_sum) { aom_free(square_diff); aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR, "Error allocating temporal filter data"); } memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0])); // Get window size for pixel-wise filtering. assert(TF_WINDOW_LENGTH % 2 == 1); const int half_window = TF_WINDOW_LENGTH >> 1; // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { // Locate pixel on reference frame. const int subsampling_y = mbd->plane[plane].subsampling_y; const int subsampling_x = mbd->plane[plane].subsampling_x; const int h = mb_height >> subsampling_y; // Plane height. const int w = mb_width >> subsampling_x; // Plane width. const int frame_stride = frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; const int frame_offset = mb_row * h * frame_stride + mb_col * w; const uint8_t *ref = frame_to_filter->buffers[plane]; const int ss_y_shift = subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int ss_x_shift = subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane will // be more accurate. The luma sse sum is reused in both chroma planes. if (plane == AOM_PLANE_U) compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift, ss_y_shift); compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w, h, w, is_high_bitdepth, square_diff); // Perform filtering. int pred_idx = 0; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { // non-local mean approach uint64_t sum_square_diff = 0; for (int wi = -half_window; wi <= half_window; ++wi) { for (int wj = -half_window; wj <= half_window; ++wj) { const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane. const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane. sum_square_diff += square_diff[y * w + x]; } } sum_square_diff += luma_sse_sum[i * w + j]; // Scale down the difference for high bit depth input. if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2); // Combine window error and block error, and normalize it. 
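        // Illustrative walkthrough (hypothetical values): with
        // TF_WINDOW_BLOCK_BALANCE_WEIGHT = 5 and TF_SEARCH_ERROR_NORM_WEIGHT = 20,
        // inv_factor = 1 / ((5 + 1) * 20) = 1 / 120 and weight_factor = 5 / 120,
        // so window_error = 120 with a sub-block MSE of 240 gives
        // combined_error = (5 * 120 + 240) / 120 = 7.0, i.e. the per-pixel window
        // error is weighted 5x against the normalized block error.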
const double window_error = sum_square_diff * inv_num_ref_pixels; const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2); const double block_error = (double)subblock_mses[subblock_idx]; const double combined_error = weight_factor * window_error + block_error * inv_factor; // Compute filter weight. double scaled_error = combined_error * d_factor[subblock_idx] * decay_factor[plane]; scaled_error = AOMMIN(scaled_error, 7); int weight; if (tf_wgt_calc_lvl == 0) { weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); } else { const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; weight = iroundpf(fweight); } const int idx = plane_offset + pred_idx; // Index with plane shift. const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx]; accum[idx] += weight * pred_value; count[idx] += weight; ++pred_idx; } } plane_offset += h * w; } aom_free(square_diff); aom_free(luma_sse_sum); } #if CONFIG_AV1_HIGHBITDEPTH // Calls High bit-depth temporal filter void av1_highbd_apply_temporal_filter_c( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength, tf_wgt_calc_lvl, pred, accum, count); } #endif // CONFIG_AV1_HIGHBITDEPTH /*!\brief Normalizes the accumulated filtering result to produce the filtered * frame * * \ingroup src_frame_proc * \param[in] mbd Pointer to the block for filtering, which is * ONLY used to get subsampling information for * all the planes * \param[in] block_size Size of the block * \param[in] mb_row Row index of the block in the frame * \param[in] mb_col Column index of the block in the frame * \param[in] num_planes Number of planes in the frame * \param[in] accum Pointer to the pre-computed accumulator * \param[in] count Pointer to the pre-computed count * \param[out] result_buffer Pointer to result buffer * * \remark Nothing returned, but the content to which `result_buffer` pointer * will be modified */ static void tf_normalize_filtered_frame( const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const uint32_t *accum, const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) { // Block information. const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer); int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const int plane_h = mb_height >> mbd->plane[plane].subsampling_y; const int plane_w = mb_width >> mbd->plane[plane].subsampling_x; const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; uint8_t *const buf = result_buffer->buffers[plane]; uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf); int plane_idx = 0; // Pixel index on current plane (block-base). int frame_idx = frame_offset; // Pixel index on the entire frame. 
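    // Illustrative example (hypothetical values): if two reference frames
    // contributed weights 600 and 400 for pixel values 100 and 104, then
    // accum = 600 * 100 + 400 * 104 = 101600 and count = 1000, so the loop
    // below writes OD_DIVU(101600 + 500, 1000) = 102, i.e. the weighted
    // average rounded to the nearest integer.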
for (int i = 0; i < plane_h; ++i) { for (int j = 0; j < plane_w; ++j) { const int idx = plane_idx + plane_offset; const uint16_t rounding = count[idx] >> 1; if (is_high_bitdepth) { buf16[frame_idx] = (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]); } else { buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]); } ++plane_idx; ++frame_idx; } frame_idx += (frame_stride - plane_w); } plane_offset += plane_h * plane_w; } } void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) { TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; YV12_BUFFER_CONFIG **frames = tf_ctx->frames; const int num_frames = tf_ctx->num_frames; const int filter_frame_idx = tf_ctx->filter_frame_idx; const int compute_frame_diff = tf_ctx->compute_frame_diff; const struct scale_factors *scale = &tf_ctx->sf; const double *noise_levels = tf_ctx->noise_levels; const int num_pels = tf_ctx->num_pels; const int q_factor = tf_ctx->q_factor; const BLOCK_SIZE block_size = TF_BLOCK_SIZE; const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx]; MACROBLOCK *const mb = &td->mb; MACROBLOCKD *const mbd = &mb->e_mbd; TemporalFilterData *const tf_data = &td->tf_data; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int mi_h = mi_size_high_log2[block_size]; const int mi_w = mi_size_wide_log2[block_size]; const int num_planes = av1_num_planes(&cpi->common); const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf; uint32_t *accum = tf_data->accum; uint16_t *count = tf_data->count; uint8_t *pred = tf_data->pred; // Factor to control the filering strength. int filter_strength = cpi->oxcf.algo_cfg.arnr_strength; const GF_GROUP *gf_group = &cpi->ppi->gf_group; const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; // Do filtering. FRAME_DIFF *diff = &td->tf_data.diff; av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) { av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits, (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); memset(accum, 0, num_pels * sizeof(accum[0])); memset(count, 0, num_pels * sizeof(count[0])); MV ref_mv = kZeroMv; // Reference motion vector passed down along frames. // Perform temporal filtering frame by frame. // Decide whether to perform motion search at 16x16 sub-block level or not // based on 4x4 sub-blocks source variance. Allow motion search for split // partition only if the difference between max and min source variance of // 4x4 blocks is greater than a threshold (which is derived empirically). bool allow_me_for_sub_blks = true; if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) { const int is_hbd = is_frame_high_bitdepth(frame_to_filter); // Initialize minimum variance to a large value and maximum variance to 0. double blk_4x4_var_min = DBL_MAX; double blk_4x4_var_max = 0; get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col, TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max, is_hbd); // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the // threshold for high bit depth. if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0) allow_me_for_sub_blks = false; } for (int frame = 0; frame < num_frames; frame++) { if (frames[frame] == NULL) continue; // Motion search. 
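      // Note on the loop below (explanatory; example values are hypothetical):
      // each pass handles one frame of the filtering window. ref_mv is passed
      // down from frame to frame as a starting point for the next full-block
      // search; when the loop reaches the frame being filtered itself, its sign
      // is flipped because the remaining frames lie on the opposite temporal
      // side, e.g. a running MV of (4, -8) becomes the starting guess (-4, 8).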
MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv }; int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; int is_dc_diff_large = 0; if (frame == filter_frame_idx) { // Frame to be filtered. // Change ref_mv sign for following frames. ref_mv.row *= -1; ref_mv.col *= -1; } else { // Other reference frames. tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size, mb_row, mb_col, &ref_mv, allow_me_for_sub_blks, subblock_mvs, subblock_mses, &is_dc_diff_large); } if (cpi->oxcf.kf_cfg.enable_keyframe_filtering == 1 && frame_type == KEY_FRAME && is_dc_diff_large) filter_strength = AOMMIN(filter_strength, 1); // Perform weighted averaging. if (frame == filter_frame_idx) { // Frame to be filtered. tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row, mb_col, num_planes, accum, count); } else { // Other reference frames. tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col, num_planes, scale, subblock_mvs, pred); // All variants of av1_apply_temporal_filter() contain floating point // operations. Hence, clear the system state. // TODO(any): avx2/sse2 version should be changed to align with C // function before using. In particular, current avx2/sse2 function // only supports 32x32 block size and 5x5 filtering window. if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth #if CONFIG_AV1_HIGHBITDEPTH if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { av1_highbd_apply_temporal_filter( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength, weight_calc_level_in_tf, pred, accum, count); } else { #endif // CONFIG_AV1_HIGHBITDEPTH av1_apply_temporal_filter_c( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength, weight_calc_level_in_tf, pred, accum, count); #if CONFIG_AV1_HIGHBITDEPTH } #endif // CONFIG_AV1_HIGHBITDEPTH } else { // for 8-bit if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { av1_apply_temporal_filter( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength, weight_calc_level_in_tf, pred, accum, count); } else { av1_apply_temporal_filter_c( frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength, weight_calc_level_in_tf, pred, accum, count); } } } } tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes, accum, count, tf_ctx->output_frame); if (compute_frame_diff) { const int y_height = mb_height >> mbd->plane[0].subsampling_y; const int y_width = mb_width >> mbd->plane[0].subsampling_x; const int source_y_stride = frame_to_filter->y_stride; const int filter_y_stride = tf_ctx->output_frame->y_stride; const int source_offset = mb_row * y_height * source_y_stride + mb_col * y_width; const int filter_offset = mb_row * y_height * filter_y_stride + mb_col * y_width; unsigned int sse = 0; cpi->ppi->fn_ptr[block_size].vf( frame_to_filter->y_buffer + source_offset, source_y_stride, tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride, &sse); diff->sum += sse; diff->sse += sse * (int64_t)sse; } } } /*!\brief Does temporal filter for a given frame. * * \ingroup src_frame_proc * \param[in] cpi Top level encoder instance structure * * \remark Nothing will be returned, but the contents of td->diff will be modified. */ static void tf_do_filtering(AV1_COMP *cpi) { // Basic information. 
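  // Overview (for orientation; this single-threaded path mirrors the
  // multi-threaded one in av1_tf_do_filtering_mt()): save the original
  // macroblockd state, point it at the temporal-filter buffers, filter every
  // block row of the frame, then restore the saved state so that subsequent
  // encoding is unaffected.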
ThreadData *td = &cpi->td; TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; const struct scale_factors *scale = &tf_ctx->sf; const int num_planes = av1_num_planes(&cpi->common); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); MACROBLOCKD *mbd = &td->mb.e_mbd; uint8_t *input_buffer[MAX_MB_PLANE]; MB_MODE_INFO **input_mb_mode_info; tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes); tf_setup_macroblockd(mbd, &td->tf_data, scale); // Perform temporal filtering for each row. for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++) av1_tf_do_filtering_row(cpi, td, mb_row); tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes); } /*!\brief Setups the frame buffer for temporal filtering. This fuction * determines how many frames will be used for temporal filtering and then * groups them into a buffer. This function will also estimate the noise level * of the to-filter frame. * * \ingroup src_frame_proc * \param[in] cpi Top level encoder instance structure * \param[in] filter_frame_lookahead_idx The index of the to-filter frame * in the lookahead buffer cpi->lookahead * \param[in] gf_frame_index GOP index * * \remark Nothing will be returned. But the fields `frames`, `num_frames`, * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx. */ static void tf_setup_filtering_buffer(AV1_COMP *cpi, int filter_frame_lookahead_idx, int gf_frame_index) { const GF_GROUP *gf_group = &cpi->ppi->gf_group; const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index]; const int is_forward_keyframe = av1_gop_check_forward_keyframe(gf_group, gf_frame_index); TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; YV12_BUFFER_CONFIG **frames = tf_ctx->frames; // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable // temporal filtering. int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1); int num_before = 0; // Number of filtering frames before the to-filter frame. int num_after = 0; // Number of filtering frames after the to-filer frame. const int lookahead_depth = av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); // Temporal filtering should not go beyond key frames const int key_to_curframe = AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0); const int curframe_to_key = AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0); // Number of buffered frames before the to-filter frame. int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); // Number of buffered frames after the to-filter frame. int max_after = AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key); // Estimate noises for each plane. const struct lookahead_entry *to_filter_buf = av1_lookahead_peek( cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage); assert(to_filter_buf != NULL); const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img; const int num_planes = av1_num_planes(&cpi->common); double *noise_levels = tf_ctx->noise_levels; av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y, num_planes - 1, cpi->common.seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD); // Get quantization factor. 
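  // (Explanatory note: `q` below is the real quantizer derived from the
  //  frame's qindex. It is used when deciding how many extra frames to
  //  filter; e.g. for a key frame encoded near-losslessly (q <= 10),
  //  adjust_num is forced to 0 so the number of filtered frames is not
  //  increased, to avoid a visual quality drop.)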
const int q = get_q(cpi); // Get correlation estimates from first-pass; const FIRSTPASS_STATS *stats = cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0); double accu_coeff0 = 1.0, accu_coeff1 = 1.0; for (int i = 1; i <= max_after; i++) { if (stats + filter_frame_lookahead_idx + i >= cpi->ppi->twopass.stats_buf_ctx->stats_in_end) { max_after = i - 1; break; } accu_coeff1 *= AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001); } if (max_after >= 1) { accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after); } for (int i = 1; i <= max_before; i++) { if (stats + filter_frame_lookahead_idx - i + 1 <= cpi->ppi->twopass.stats_buf_ctx->stats_in_start) { max_before = i - 1; break; } accu_coeff0 *= AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001); } if (max_before >= 1) { accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before); } // Adjust number of filtering frames based on quantization factor. When the // quantization factor is small enough (lossless compression), we will not // change the number of frames for key frame filtering, which is to avoid // visual quality drop. int adjust_num = 6; const int adjust_num_frames_for_arf_filtering = cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering; if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering. adjust_num = 0; } else if ((update_type == KF_UPDATE) && q <= 10) { adjust_num = 0; } else if (adjust_num_frames_for_arf_filtering > 0 && update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) { // Since screen content detection happens after temporal filtering, // 'frames_since_key' check is added to ensure the sf is disabled for the // first alt-ref frame. // Adjust number of frames to be considered for filtering based on noise // level of the current frame. For low-noise frame, use more frames to // filter such that the filtered frame can provide better predictions for // subsequent frames and vice versa. const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 }, { 4, 2, 0 } }; const uint8_t *adjust_num_frames = av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1]; if (noise_levels[AOM_PLANE_Y] < 0.5) adjust_num = adjust_num_frames[0]; else if (noise_levels[AOM_PLANE_Y] < 1.0) adjust_num = adjust_num_frames[1]; else adjust_num = adjust_num_frames[2]; } num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth); if (frame_type == KEY_FRAME) { num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before); num_after = AOMMIN(num_frames - 1, max_after); } else { int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame, &cpi->ppi->p_rc, &cpi->frame_info, filter_frame_lookahead_idx, max_before, max_after, NULL, NULL, 0); num_frames = AOMMIN(num_frames, gfu_boost / 150); num_frames += !(num_frames & 1); // Make the number odd. // Only use 2 neighbours for the second ARF. 
if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3); if (AOMMIN(max_after, max_before) >= num_frames / 2) { // just use half half num_before = num_frames / 2; num_after = num_frames / 2; } else { if (max_after < num_frames / 2) { num_after = max_after; num_before = AOMMIN(num_frames - 1 - num_after, max_before); } else { num_before = max_before; num_after = AOMMIN(num_frames - 1 - num_before, max_after); } // Adjust insymmetry based on frame-level correlation if (max_after > 0 && max_before > 0) { if (num_after < num_before) { const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01)); num_before = AOMMIN(num_before, num_after + insym); } else { const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01)); num_after = AOMMIN(num_after, num_before + insym); } } } } num_frames = num_before + 1 + num_after; // Setup the frame buffer. for (int frame = 0; frame < num_frames; ++frame) { const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx; struct lookahead_entry *buf = av1_lookahead_peek( cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage); assert(buf != NULL); frames[frame] = &buf->img; } tf_ctx->num_frames = num_frames; tf_ctx->filter_frame_idx = num_before; assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame); av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes, cpi->common.seq_params->sb_size); av1_setup_block_planes(&cpi->td.mb.e_mbd, cpi->common.seq_params->subsampling_x, cpi->common.seq_params->subsampling_y, num_planes); } /*!\cond */ double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, int width, int stride, int edge_thresh) { int64_t accum = 0; int count = 0; for (int i = 1; i < height - 1; ++i) { for (int j = 1; j < width - 1; ++j) { // Setup a small 3x3 matrix. const int center_idx = i * stride + j; int mat[3][3]; for (int ii = -1; ii <= 1; ++ii) { for (int jj = -1; jj <= 1; ++jj) { const int idx = center_idx + ii * stride + jj; mat[ii + 1][jj + 1] = src[idx]; } } // Compute sobel gradients. const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + 2 * (mat[1][0] - mat[1][2]); const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + 2 * (mat[0][1] - mat[2][1]); const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0); // Accumulate Laplacian. if (Ga < edge_thresh) { // Only count smooth pixels. const int v = 4 * mat[1][1] - 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); accum += ROUND_POWER_OF_TWO(abs(v), 0); ++count; } } } // Return -1.0 (unreliable estimation) if there are too few smooth pixels. return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; } #if CONFIG_AV1_HIGHBITDEPTH double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16, int height, int width, const int stride, int bit_depth, int edge_thresh) { int64_t accum = 0; int count = 0; for (int i = 1; i < height - 1; ++i) { for (int j = 1; j < width - 1; ++j) { // Setup a small 3x3 matrix. const int center_idx = i * stride + j; int mat[3][3]; for (int ii = -1; ii <= 1; ++ii) { for (int jj = -1; jj <= 1; ++jj) { const int idx = center_idx + ii * stride + jj; mat[ii + 1][jj + 1] = src16[idx]; } } // Compute sobel gradients. const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + 2 * (mat[1][0] - mat[1][2]); const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + 2 * (mat[0][1] - mat[2][1]); const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8); // Accumulate Laplacian. 
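      // (Illustrative: as in the 8-bit version above, the estimate returned
      //  below is sigma ~= SQRT_PI_BY_2 * accum / (6 * count), i.e. the mean
      //  absolute Laplacian response over smooth pixels scaled by sqrt(pi/2)/6.
      //  For example, accum = 1200 over count = 100 smooth pixels yields a
      //  noise level of about 1.2533 * 2 = 2.5. Numbers are hypothetical.)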
if (Ga < edge_thresh) { // Only count smooth pixels. const int v = 4 * mat[1][1] - 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8); ++count; } } } // Return -1.0 (unreliable estimation) if there are too few smooth pixels. return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; } #endif void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame, double *noise_level, int plane_from, int plane_to, int bit_depth, int edge_thresh) { for (int plane = plane_from; plane <= plane_to; plane++) { const bool is_uv_plane = (plane != AOM_PLANE_Y); const int height = frame->crop_heights[is_uv_plane]; const int width = frame->crop_widths[is_uv_plane]; const int stride = frame->strides[is_uv_plane]; const uint8_t *src = frame->buffers[plane]; #if CONFIG_AV1_HIGHBITDEPTH const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const int is_high_bitdepth = is_frame_high_bitdepth(frame); if (is_high_bitdepth) { noise_level[plane] = av1_highbd_estimate_noise_from_single_plane( src16, height, width, stride, bit_depth, edge_thresh); } else { noise_level[plane] = av1_estimate_noise_from_single_plane( src, height, width, stride, edge_thresh); } #else (void)bit_depth; noise_level[plane] = av1_estimate_noise_from_single_plane( src, height, width, stride, edge_thresh); #endif } } // Initializes the members of TemporalFilterCtx // Inputs: // cpi: Top level encoder instance structure // check_show_existing: If 1, check whether the filtered frame is similar // to the original frame. // filter_frame_lookahead_idx: The index of the frame to be filtered in the // lookahead buffer cpi->lookahead. // Returns: // Nothing will be returned. But the contents of cpi->tf_ctx will be modified. static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx, int gf_frame_index, int compute_frame_diff, YV12_BUFFER_CONFIG *output_frame) { TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; // Setup frame buffer for filtering. YV12_BUFFER_CONFIG **frames = tf_ctx->frames; tf_ctx->num_frames = 0; tf_ctx->filter_frame_idx = -1; tf_ctx->output_frame = output_frame; tf_ctx->compute_frame_diff = compute_frame_diff; tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index); assert(tf_ctx->num_frames > 0); assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames); // Setup scaling factors. Scaling on each of the arnr frames is not // supported. // ARF is produced at the native frame size and resized when coded. struct scale_factors *sf = &tf_ctx->sf; av1_setup_scale_factors_for_frame( sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); // Initialize temporal filter parameters. 
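  // Illustrative example (assuming 4:2:0 subsampling): with
  // TF_BLOCK_SIZE = BLOCK_32X32, mb_pels = 1024, so the plane loop below
  // accumulates num_pels = 1024 + 256 + 256 = 1536 accumulator/counter
  // entries per filtering block.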
MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; const int filter_frame_idx = tf_ctx->filter_frame_idx; const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx]; const BLOCK_SIZE block_size = TF_BLOCK_SIZE; const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int mb_width = block_size_wide[block_size]; const int mb_height = block_size_high[block_size]; const int mb_rows = get_num_blocks(frame_height, mb_height); const int mb_cols = get_num_blocks(frame_width, mb_width); const int mb_pels = mb_width * mb_height; const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter); const int num_planes = av1_num_planes(&cpi->common); int num_pels = 0; for (int i = 0; i < num_planes; i++) { const int subsampling_x = mbd->plane[i].subsampling_x; const int subsampling_y = mbd->plane[i].subsampling_y; num_pels += mb_pels >> (subsampling_x + subsampling_y); } tf_ctx->num_pels = num_pels; tf_ctx->mb_rows = mb_rows; tf_ctx->mb_cols = mb_cols; tf_ctx->is_highbitdepth = is_highbitdepth; tf_ctx->q_factor = get_q(cpi); } int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, const FRAME_DIFF *frame_diff, int q_index, aom_bit_depth_t bit_depth) { const int frame_height = frame->y_crop_height; const int frame_width = frame->y_crop_width; const int block_height = block_size_high[TF_BLOCK_SIZE]; const int block_width = block_size_wide[TF_BLOCK_SIZE]; const int mb_rows = get_num_blocks(frame_height, block_height); const int mb_cols = get_num_blocks(frame_width, block_width); const int num_mbs = AOMMAX(1, mb_rows * mb_cols); const float mean = (float)frame_diff->sum / num_mbs; const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean); const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth); const float threshold = 0.7f * ac_q_step * ac_q_step; if (mean < threshold && std < mean * 1.2) { return 1; } return 0; } void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, int gf_frame_index, FRAME_DIFF *frame_diff, YV12_BUFFER_CONFIG *output_frame) { MultiThreadInfo *const mt_info = &cpi->mt_info; // Basic informaton of the current frame. TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; TemporalFilterData *tf_data = &cpi->td.tf_data; const int compute_frame_diff = frame_diff != NULL; // TODO(anyone): Currently, we enforce the filtering strength on internal // ARFs except the second ARF to be zero. We should investigate in which case // it is more beneficial to use non-zero strength filtering. // Only parallel level 0 frames go through temporal filtering. assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0); // Initialize temporal filter context structure. init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index, compute_frame_diff, output_frame); // Allocate and reset temporal filter buffers. const int is_highbitdepth = tf_ctx->is_highbitdepth; if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) { aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, "Error allocating temporal filter data"); } // Perform temporal filtering process. if (mt_info->num_workers > 1) av1_tf_do_filtering_mt(cpi); else tf_do_filtering(cpi); if (compute_frame_diff) { *frame_diff = tf_data->diff; } // Deallocate temporal filter buffers. 
tf_dealloc_data(tf_data, is_highbitdepth); } int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) { return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; } bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf); if (tf_info->is_temporal_filter_on == 0) return true; const AV1_COMMON *cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { if (aom_realloc_frame_buffer( &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) { return false; } } return true; } void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) { if (tf_info->is_temporal_filter_on == 0) return; for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { aom_free_frame_buffer(&tf_info->tf_buf[i]); } aom_free_frame_buffer(&tf_info->tf_buf_second_arf); } void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) { av1_zero(tf_info->tf_buf_valid); av1_zero(tf_info->tf_buf_gf_index); av1_zero(tf_info->tf_buf_display_index_offset); } void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi, const GF_GROUP *gf_group) { if (tf_info->is_temporal_filter_on == 0) return; const AV1_COMMON *const cm = &cpi->common; for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) { int update_type = gf_group->update_type[gf_index]; if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME; int lookahead_idx = gf_group->arf_src_offset[gf_index] + gf_group->cur_frame_idx[gf_index]; // This function is designed to be called multiple times after // av1_tf_info_reset(). It will only generate the filtered frame that does // not exist yet. if (tf_info->tf_buf_valid[buf_idx] == 0 || tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) { YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx]; av1_temporal_filter(cpi, lookahead_idx, gf_index, &tf_info->frame_diff[buf_idx], out_buf); aom_extend_frame_borders(out_buf, av1_num_planes(cm)); tf_info->tf_buf_gf_index[buf_idx] = gf_index; tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx; tf_info->tf_buf_valid[buf_idx] = 1; } } } } YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info, int gf_index, FRAME_DIFF *frame_diff) { if (tf_info->is_temporal_filter_on == 0) return NULL; YV12_BUFFER_CONFIG *out_buf = NULL; for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) { out_buf = &tf_info->tf_buf[i]; *frame_diff = tf_info->frame_diff[i]; } } return out_buf; } /*!\endcond */ aom-3.12.1/av1/encoder/temporal_filter.h000066400000000000000000000422211477627663500200510ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ #define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ #include <stdbool.h> /* assumed; needed for the bool usage below */ #include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { #endif /*!\cond */ struct AV1_COMP; struct AV1EncoderConfig; struct ThreadData; // TODO(wtc): These two variables are only used in avx2, sse2, neon // implementations, where the block size is still hard coded to TF_BLOCK_SIZE. // This should be fixed to align with the C implementation. #define BH 32 #define BW 32 // Block size used in temporal filtering. #define TF_BLOCK_SIZE BLOCK_32X32 // Window size for temporal filtering. #define TF_WINDOW_LENGTH 5 // A constant number, sqrt(pi / 2), used for noise estimation. static const double SQRT_PI_BY_2 = 1.25331413732; // Hyper-parameters used to compute filtering weight. These hyper-parameters can // be tuned for better performance. // 0. A scale factor used in temporal filtering to raise the filter weight from // `double` with range [0, 1] to `int` with range [0, 1000]. #define TF_WEIGHT_SCALE 1000 // 1. Weight factor used to balance the weighted-average between window error // and block error. The weight is for window error while the weight for block // error is always set to 1. #define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5 // 2. Threshold for using q to adjust the filtering weight. Concretely, when // using a small q (high bitrate), we would like to reduce the filtering // strength such that more detailed information can be preserved. Hence, when // q is smaller than this threshold, we will adjust the filtering weight // based on the q-value. #define TF_Q_DECAY_THRESHOLD 20 // 3. Normalization factor used to normalize the motion search error. Since the // motion search error can be large and uncontrollable, we will simply // normalize it before using it to compute the filtering weight. #define TF_SEARCH_ERROR_NORM_WEIGHT 20 // 4. Threshold for using `arnr_strength` to adjust the filtering strength. // Concretely, users can use the `arnr_strength` argument to control the // strength of temporal filtering. When `arnr_strength` is small enough // (i.e., smaller than this threshold), we will adjust the filtering weight // based on the strength value. #define TF_STRENGTH_THRESHOLD 4 // 5. Threshold for using motion search distance to adjust the filtering weight. // Concretely, a larger motion search vector leads to a higher probability of // an unreliable search. Hence, we would like to reduce the filtering strength // when the distance is large enough. Considering that the distance actually // relies on the frame size, this threshold is also a resolution-based // threshold. Taking 720p videos as an instance, if this field equals 0.1, // then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold // for 360p videos will be 360 * 0.1 = 36. #define TF_SEARCH_DISTANCE_THRESHOLD 0.1 // 6. Threshold to identify if the q is in a relatively high range. // Above this cutoff q, stronger filtering is applied. // For a high q, the quantization throws away more information, and thus // stronger filtering is less likely to distort the encoded quality, while // stronger filtering could reduce bit rates. // For a low q, more details are expected to be retained. Filtering is thus // more conservative. #define TF_QINDEX_CUTOFF 128 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50 // Sum and SSE source vs filtered frame difference returned by // temporal filter. typedef struct { int64_t sum; int64_t sse; } FRAME_DIFF; /*!\endcond */ /*! * \brief Parameters related to temporal filtering.
*/ typedef struct { /*! * Frame buffers used for temporal filtering. */ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; /*! * Number of frames in the frame buffer. */ int num_frames; /*! * Output filtered frame. */ YV12_BUFFER_CONFIG *output_frame; /*! * Index of the frame to be filtered. */ int filter_frame_idx; /*! * Whether to accumulate diff for show existing condition check. */ int compute_frame_diff; /*! * Frame scaling factor. */ struct scale_factors sf; /*! * Estimated noise levels for each plane in the frame. */ double noise_levels[MAX_MB_PLANE]; /*! * Number of pixels in the temporal filtering block across all planes. */ int num_pels; /*! * Number of temporal filtering block rows. */ int mb_rows; /*! * Number of temporal filtering block columns. */ int mb_cols; /*! * Whether the frame is high-bitdepth or not. */ int is_highbitdepth; /*! * Quantization factor used in temporal filtering. */ int q_factor; } TemporalFilterCtx; /*! * Buffer count in TEMPORAL_FILTER_INFO. * Currently we only apply filtering on KEY and ARF after * define_gf_group(). Hence, the count is two. */ #define TF_INFO_BUF_COUNT 2 /*! * \brief Temporal filter info for a gop */ typedef struct TEMPORAL_FILTER_INFO { /*! * A flag indicating whether the temporal filter should be applied. * This flag stores the result of * av1_is_temporal_filter_on(). */ int is_temporal_filter_on; /*! * Buffers used for temporal filtering in a GOP: * index 0 for the key frame and index 1 for the ARF. */ YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT]; /*! * Buffer used for temporal filtering for * INTNL_ARF_UPDATE. * Check av1_gop_is_second_arf() for the * detailed definition of second_arf. */ YV12_BUFFER_CONFIG tf_buf_second_arf; /*! * Sum and SSE of the source vs filtered frame difference, * used to decide whether the buffer can be shown directly. */ FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT]; /*! * The corresponding gf_index for the buffer. */ int tf_buf_gf_index[TF_INFO_BUF_COUNT]; /*! * The display_index offset between the next show frame and the frames in the GOP. */ int tf_buf_display_index_offset[TF_INFO_BUF_COUNT]; /*! * Whether the buffer is valid or not. */ int tf_buf_valid[TF_INFO_BUF_COUNT]; } TEMPORAL_FILTER_INFO; /*!\brief Check whether we should apply the temporal filter at all. * \param[in] oxcf AV1 encoder config * * \return 1: temporal filter is on; 0: temporal filter is off */ int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf); /*!\brief Allocate buffers for TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop * \param[in,out] cpi Top level encoder instance structure * * \return True on success, false on memory allocation failure.
*/ bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const struct AV1_COMP *cpi); /*!\brief Free buffers for TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop */ void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info); /*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop */ void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info); /*!\brief Apply temporal filter for key frame and ARF in a gop * \param[in,out] tf_info Temporal filter info for a gop * \param[in,out] cpi Top level encoder instance structure * \param[in] gf_group GF/ARF group data structure */ void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi, const GF_GROUP *gf_group); /*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop * \param[in] gf_index gf_index for the target buffer * \param[out] frame_diff Sum and SSE of the source vs filtered frame * difference for the target buffer, used to decide * whether it can be shown directly * * \return Pointer to the filtered buffer, or NULL if no valid filtered buffer * exists for gf_index */ YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info, int gf_index, FRAME_DIFF *frame_diff); /*!\cond */ // Data related to temporal filtering. typedef struct { // Source vs filtered frame error. FRAME_DIFF diff; // Pointer to temporary block info used to store state in temporal filtering // process. MB_MODE_INFO *tmp_mbmi; // Pointer to accumulator buffer used in temporal filtering process. uint32_t *accum; // Pointer to count buffer used in temporal filtering process. uint16_t *count; // Pointer to predictor used in temporal filtering process. uint8_t *pred; } TemporalFilterData; // Data related to temporal filter multi-thread synchronization. typedef struct { #if CONFIG_MULTITHREAD // Mutex lock used for dispatching jobs. pthread_mutex_t *mutex_; #endif // CONFIG_MULTITHREAD // Next temporal filter block row to be filtered. int next_tf_row; // Initialized to false, set to true by the worker thread that encounters an // error in order to abort the processing of other worker threads. bool tf_mt_exit; } AV1TemporalFilterSync; // Estimates noise level from a given frame using a single plane (Y, U, or V). // This is an adaptation of the method in the following paper: // Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise // estimation using Laplacian operator and adaptive edge detection", // Proc. 3rd International Symposium on Communications, Control and // Signal Processing, 2008, St Julians, Malta. // Inputs: // frame: Pointer to the frame to estimate noise level from. // noise_level: Pointer to store the estimated noise. // plane_from: Index of the starting plane used for noise estimation. // Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane. // plane_to: Index of the end plane used for noise estimation. // bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame. // edge_thresh: Edge threshold. void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame, double *noise_level, int plane_from, int plane_to, int bit_depth, int edge_thresh); /*!\endcond */ /*!\brief Does temporal filtering for a given macroblock row. * * \ingroup src_frame_proc * \param[in] cpi Top level encoder instance structure * \param[in] td Pointer to thread data * \param[in] mb_row Macroblock row to be filtered * * \remark Nothing will be returned, but the contents of td->diff will be modified. */ void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td, int mb_row); /*!\brief Performs temporal filtering if needed on a source frame.
* For example, to create a filtered alternate reference frame (ARF). * * In this function, the lookahead index is different from the 0-based * real index. For example, if we want to filter the first frame in the * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the * second frame in the pre-fetched buffer. Another example: if we want to filter * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16. * Furthermore, a negative number is used for a key frame in one-pass mode, where * the key frame is filtered with the frames before it instead of after it. For * example, -15 means to filter the 17-th frame, which is a key frame in one-pass * mode. * * \ingroup src_frame_proc * \param[in] cpi Top level encoder instance * structure * \param[in] filter_frame_lookahead_idx The index of the * to-filter frame in the lookahead * buffer cpi->lookahead. * \param[in] gf_frame_index Index of GOP * \param[in,out] frame_diff Structure of sse and sum of the * filtered frame. * \param[out] output_frame Output filtered frame. */ void av1_temporal_filter(struct AV1_COMP *cpi, const int filter_frame_lookahead_idx, int gf_frame_index, FRAME_DIFF *frame_diff, YV12_BUFFER_CONFIG *output_frame); /*!\brief Check whether a filtered frame can be shown directly * * This function uses the filtered frame's sse and the current q index * to make the decision. * * \ingroup src_frame_proc * \param[in] frame Filtered frame's buffer * \param[in] frame_diff Structure of sse and sum of the * filtered frame. * \param[in] q_index q_index used for this frame * \param[in] bit_depth Bit depth * \return 1 if this frame can be shown directly, otherwise 0 */ int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, const FRAME_DIFF *frame_diff, int q_index, aom_bit_depth_t bit_depth); /*!\cond */ // Allocates memory for members of TemporalFilterData. // Inputs: // tf_data: Pointer to the structure containing temporal filter related data. // num_pels: Number of pixels in the block across all planes. // is_high_bitdepth: Whether the frame is high-bitdepth or not. // Returns: // True if allocation is successful and false otherwise. static inline bool tf_alloc_and_reset_data(TemporalFilterData *tf_data, int num_pels, int is_high_bitdepth) { tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi)); tf_data->accum = (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum)); tf_data->count = (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count)); if (is_high_bitdepth) tf_data->pred = CONVERT_TO_BYTEPTR( aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred))); else tf_data->pred = (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred)); // In case of an allocation failure, other successfully allocated buffers will // be freed by the tf_dealloc_data() call in encoder_destroy(). if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred)) return false; memset(&tf_data->diff, 0, sizeof(tf_data->diff)); return true; } // Setup macroblockd params for temporal filtering process. // Inputs: // mbd: Pointer to the block for filtering. // tf_data: Pointer to the structure containing temporal filter related data. // scale: Scaling factor. // Returns: // Nothing will be returned. Contents of mbd will be modified.
static inline void tf_setup_macroblockd(MACROBLOCKD *mbd, TemporalFilterData *tf_data, const struct scale_factors *scale) { mbd->block_ref_scale_factors[0] = scale; mbd->block_ref_scale_factors[1] = scale; mbd->mi = &tf_data->tmp_mbmi; mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; } // Deallocates the memory allocated for members of TemporalFilterData. // Inputs: // tf_data: Pointer to the structure containing temporal filter related data. // is_high_bitdepth: Whether the frame is high-bitdepth or not. // Returns: // Nothing will be returned. static inline void tf_dealloc_data(TemporalFilterData *tf_data, int is_high_bitdepth) { if (is_high_bitdepth) tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred); aom_free(tf_data->tmp_mbmi); tf_data->tmp_mbmi = NULL; aom_free(tf_data->accum); tf_data->accum = NULL; aom_free(tf_data->count); tf_data->count = NULL; aom_free(tf_data->pred); tf_data->pred = NULL; } // Saves the state prior to temporal filter process. // Inputs: // mbd: Pointer to the block for filtering. // input_mbmi: Backup block info to save input state. // input_buffer: Backup buffer pointer to save input state. // num_planes: Number of planes. // Returns: // Nothing will be returned. Contents of input_mbmi and input_buffer will be // modified. static inline void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, uint8_t **input_buffer, int num_planes) { for (int i = 0; i < num_planes; i++) { input_buffer[i] = mbd->plane[i].pre[0].buf; } *input_mbmi = mbd->mi; } // Restores the initial state after temporal filter process. // Inputs: // mbd: Pointer to the block for filtering. // input_mbmi: Backup block info from where input state is restored. // input_buffer: Backup buffer pointer from where input state is restored. // num_planes: Number of planes. // Returns: // Nothing will be returned. Contents of mbd will be modified. static inline void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi, uint8_t **input_buffer, int num_planes) { for (int i = 0; i < num_planes; i++) { mbd->plane[i].pre[0].buf = input_buffer[i]; } mbd->mi = input_mbmi; } /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ aom-3.12.1/av1/encoder/thirdpass.c000066400000000000000000000746721477627663500166740ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "av1/encoder/thirdpass.h" #if CONFIG_THREE_PASS && CONFIG_AV1_DECODER #include "aom/aom_codec.h" #include "aom/aomdx.h" #include "aom_dsp/psnr.h" #include "aom_mem/aom_mem.h" #include "av1/av1_iface_common.h" #include "av1/encoder/encoder.h" #include "av1/encoder/firstpass.h" #include "av1/common/blockd.h" #include "common/ivfdec.h" static void setup_two_pass_stream_input( struct AvxInputContext **input_ctx_ptr, const char *input_file_name, struct aom_internal_error_info *err_info) { FILE *infile; infile = fopen(input_file_name, "rb"); if (!infile) { aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, "Failed to open input file '%s'.", input_file_name); } struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx)); if (!aom_input_ctx) { fclose(infile); aom_internal_error(err_info, AOM_CODEC_MEM_ERROR, "Failed to allocate memory for third-pass context."); } memset(aom_input_ctx, 0, sizeof(*aom_input_ctx)); aom_input_ctx->filename = input_file_name; aom_input_ctx->file = infile; if (file_is_ivf(aom_input_ctx)) { aom_input_ctx->file_type = FILE_TYPE_IVF; } else { fclose(infile); aom_free(aom_input_ctx); aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, "Unrecognized input file type."); } *input_ctx_ptr = aom_input_ctx; } static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) { if (!ctx->input_ctx) { if (ctx->input_file_name == NULL) { aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM, "No third pass input specified."); } setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name, ctx->err_info); } if (!ctx->decoder.iface) { aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo; if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to initialize decoder."); } } } // Return 0: success // 1: cannot read because this is end of file // -1: failure to read the frame static int read_frame(THIRD_PASS_DEC_CTX *ctx) { if (!ctx->input_ctx || !ctx->decoder.iface) { init_third_pass(ctx); } if (!ctx->have_frame) { if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer, &ctx->buffer_size, NULL) != 0) { if (feof(ctx->input_ctx->file)) { return 1; } else { return -1; } } ctx->frame = ctx->buf; ctx->end_frame = ctx->frame + ctx->bytes_in_buffer; ctx->have_frame = 1; } Av1DecodeReturn adr; if (aom_codec_decode(&ctx->decoder, ctx->frame, (unsigned int)ctx->bytes_in_buffer, &adr) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to decode frame for third pass."); } ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3; ctx->frame = adr.buf; ctx->bytes_in_buffer = ctx->end_frame - ctx->frame; if (ctx->frame == ctx->end_frame) ctx->have_frame = 0; return 0; } static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) { if (!frame_info) return; aom_free(frame_info->mi_info); frame_info->mi_info = NULL; } // This function gets the information needed from the recently decoded frame, // via various decoder APIs, and saves the info into ctx->frame_info. 
// Return 0: success // 1: cannot read because this is end of file // -1: failure to read the frame static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) { int ret = read_frame(ctx); if (ret != 0) return ret; int cur = ctx->frame_info_count; ctx->frame_info[cur].actual_bits = ctx->this_frame_bits; if (cur >= MAX_THIRD_PASS_BUF) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Third pass frame info ran out of available slots."); } aom_codec_frame_flags_t frame_type_flags = 0; if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS, &frame_type_flags) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read frame flags."); } if (frame_type_flags & AOM_FRAME_IS_KEY) { ctx->frame_info[cur].frame_type = KEY_FRAME; } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) { ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME; } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) { ctx->frame_info[cur].frame_type = S_FRAME; } else { ctx->frame_info[cur].frame_type = INTER_FRAME; } // Get frame width and height int frame_size[2]; if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read frame size."); } // Check if we need to re-alloc the mi fields. const int mi_cols = (frame_size[0] + 3) >> 2; const int mi_rows = (frame_size[1] + 3) >> 2; ctx->frame_info[cur].mi_stride = mi_cols; ctx->frame_info[cur].mi_rows = mi_rows; ctx->frame_info[cur].mi_cols = mi_cols; if (ctx->frame_info[cur].width != frame_size[0] || ctx->frame_info[cur].height != frame_size[1] || !ctx->frame_info[cur].mi_info) { free_frame_info(&ctx->frame_info[cur]); ctx->frame_info[cur].mi_info = aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info)); if (!ctx->frame_info[cur].mi_info) { aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR, "Failed to allocate mi buffer for the third pass."); } } ctx->frame_info[cur].width = frame_size[0]; ctx->frame_info[cur].height = frame_size[1]; // Get frame base q idx if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX, &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read base q index."); } // Get show existing frame flag if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG, &ctx->frame_info[cur].is_show_existing_frame) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read show existing frame flag."); } // Get show frame flag if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG, &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read show frame flag."); } // Get order hint if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT, &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read order hint."); } // Clear MI info for (int mi_row = 0; mi_row < mi_rows; mi_row++) { for (int mi_col = 0; mi_col < mi_cols; mi_col++) { ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize = BLOCK_INVALID; } } // Get relevant information regarding each 4x4 MI MB_MODE_INFO cur_mi_info; THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info; for (int mi_row = 0; mi_row < mi_rows; mi_row++) { for (int mi_col = 0; mi_col < mi_cols; mi_col++) { const int offset = mi_row * mi_cols + mi_col; if (this_mi[offset].bsize != BLOCK_INVALID) { continue; } // Get info of this MI if (aom_codec_control(&ctx->decoder, 
AV1D_GET_MI_INFO, mi_row, mi_col, &cur_mi_info) != AOM_CODEC_OK) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read mi info."); } const int blk_mi_rows = mi_size_high[cur_mi_info.bsize]; const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize]; for (int h = 0; h < blk_mi_rows; h++) { for (int w = 0; w < blk_mi_cols; w++) { if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) { continue; } const int this_offset = offset + h * mi_cols + w; this_mi[this_offset].bsize = cur_mi_info.bsize; this_mi[this_offset].partition = cur_mi_info.partition; this_mi[this_offset].mi_row_start = mi_row; this_mi[this_offset].mi_col_start = mi_col; this_mi[this_offset].mv[0] = cur_mi_info.mv[0]; this_mi[this_offset].mv[1] = cur_mi_info.mv[1]; this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0]; this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1]; this_mi[this_offset].pred_mode = cur_mi_info.mode; } } } } ctx->frame_info_count++; return 0; } #define USE_SECOND_PASS_FILE 1 #if !USE_SECOND_PASS_FILE // Parse the frames in the gop and determine the last frame of the current GOP. // Decode more frames if necessary. The variable max_num is the maximum static // GOP length if we detect an IPPP structure, and it is expected that max_mum >= // MAX_GF_INTERVAL. static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num, int *last_idx) { assert(max_num >= MAX_GF_INTERVAL); *last_idx = 0; int cur_idx = 0; int arf_order_hint = -1; int num_show_frames = 0; while (num_show_frames < max_num) { assert(cur_idx < MAX_THIRD_PASS_BUF); // Read in from bitstream if needed. if (cur_idx >= ctx->frame_info_count) { int ret = get_frame_info(ctx); if (ret == 1) { // At the end of the file, GOP ends in the prev frame. if (arf_order_hint >= 0) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to derive GOP length."); } *last_idx = cur_idx - 1; return; } if (ret < 0) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read frame for third pass."); } } // TODO(bohanli): verify that fwd_kf works here. if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME && ctx->frame_info[cur_idx].is_show_frame) { if (cur_idx != 0) { // If this is a key frame and is not the first kf in this kf group, we // have reached the next key frame. Stop here. *last_idx = cur_idx - 1; return; } } else if (!ctx->frame_info[cur_idx].is_show_frame && arf_order_hint == -1) { // If this is an arf (the first no show) if (num_show_frames <= 1) { // This is an arf and we should end the GOP with its overlay. arf_order_hint = ctx->frame_info[cur_idx].order_hint; } else { // There are multiple show frames before the this arf, so we treat the // frames previous to this arf as a GOP. *last_idx = cur_idx - 1; return; } } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint == (unsigned int)arf_order_hint) { // If this is the overlay/show existing of the arf assert(ctx->frame_info[cur_idx].is_show_frame); *last_idx = cur_idx; return; } else { // This frame is part of the GOP. if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++; } cur_idx++; } // This is a long IPPP GOP and we will use a length of max_num here. assert(arf_order_hint < 0); *last_idx = max_num - 1; return; } #endif static inline void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) { int cur_idx = 0; while (cur_idx < ctx->gop_info.num_frames) { assert(cur_idx < MAX_THIRD_PASS_BUF); // Read in from bitstream if needed. 
if (cur_idx >= ctx->frame_info_count) { int ret = get_frame_info(ctx); if (ret != 0) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Failed to read frame for third pass."); } } cur_idx++; } return; } void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { // Read in future frames in the current GOP. read_gop_frames(ctx); int gf_len = 0; // Check the GOP length against the value read from second_pass_file for (int i = 0; i < ctx->gop_info.num_frames; i++) { if (ctx->frame_info[i].is_show_frame) gf_len++; } if (gf_len != ctx->gop_info.gf_length) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Mismatch in third pass GOP length!"); } } void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { if (ctx->frame_info_count == 0) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "No available frame info for third pass."); } ctx->frame_info_count--; free_frame_info(&ctx->frame_info[0]); for (int i = 0; i < ctx->frame_info_count; i++) { ctx->frame_info[i] = ctx->frame_info[i + 1]; } ctx->frame_info[ctx->frame_info_count].mi_info = NULL; } void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, const char *file) { av1_free_thirdpass_ctx(*ctx); CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx))); THIRD_PASS_DEC_CTX *ctx_ptr = *ctx; ctx_ptr->input_file_name = file; ctx_ptr->prev_gop_end = -1; ctx_ptr->err_info = cm->error; } void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { if (ctx == NULL) return; if (ctx->decoder.iface) { aom_codec_destroy(&ctx->decoder); } if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file); aom_free(ctx->input_ctx); if (ctx->buf) free(ctx->buf); for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) { free_frame_info(&ctx->frame_info[i]); } aom_free(ctx); } void av1_write_second_pass_gop_info(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { // Write the GOP length to a log file. 
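// The log is a raw binary stream: one THIRD_PASS_GOP_INFO record per GOP
// (written here), followed by one per-frame record for each frame in the
// GOP (written by av1_write_second_pass_per_frame_info()). The third pass
// reads it back in the same order via av1_read_second_pass_gop_info() and
// av1_read_second_pass_per_frame_info().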
av1_open_second_pass_log(cpi, 0); THIRD_PASS_GOP_INFO gop_info; gop_info.num_frames = gf_group->size; gop_info.use_arf = (gf_group->arf_index >= 0); gop_info.gf_length = p_rc->baseline_gf_interval; size_t count = fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream); if (count < 1) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not write to second pass log file!"); } } } void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { // write target bitrate int bits = gf_group->bit_allocation[gf_index]; size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream); if (count < 1) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not write to second pass log file!"); } // write sse uint64_t sse = 0; int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1; if (pkt_idx >= 0 && cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) { sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0]; #if CONFIG_INTERNAL_STATS } else if (cpi->ppi->b_calculate_psnr) { sse = cpi->ppi->total_sq_error[0]; #endif } else { const YV12_BUFFER_CONFIG *orig = cpi->source; const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; PSNR_STATS psnr; #if CONFIG_AV1_HIGHBITDEPTH const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); #else aom_calc_psnr(orig, recon, &psnr); #endif sse = psnr.sse[0]; } count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream); if (count < 1) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not write to second pass log file!"); } // write bpm_factor double factor = cpi->ppi->twopass.bpm_factor; count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream); if (count < 1) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not write to second pass log file!"); } } } void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; if (oxcf->second_pass_log == NULL) { aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM, "No second pass log file specified for the third pass!"); } // Read the GOP length from a file. 
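// The log stream is opened lazily and kept open across calls; is_read
// selects between reading ("rb", used by the third pass) and writing
// ("wb", used by the second pass).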
if (!cpi->second_pass_log_stream) { if (is_read) { cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb"); } else { cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb"); } if (!cpi->second_pass_log_stream) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not open second pass log file!"); } } } void av1_close_second_pass_log(AV1_COMP *cpi) { if (cpi->second_pass_log_stream) { int ret = fclose(cpi->second_pass_log_stream); if (ret != 0) { aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Could not close second pass log file!"); } cpi->second_pass_log_stream = 0; } } void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, THIRD_PASS_GOP_INFO *gop_info, struct aom_internal_error_info *error) { size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream); if (count < 1) { aom_internal_error(error, AOM_CODEC_ERROR, "Could not read from second pass log file!"); } } void av1_read_second_pass_per_frame_info( FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, int frame_info_count, struct aom_internal_error_info *error) { for (int i = 0; i < frame_info_count; i++) { // read target bits int bits = 0; size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream); if (count < 1) { aom_internal_error(error, AOM_CODEC_ERROR, "Could not read from second pass log file!"); } frame_info_arr[i].bits_allocated = bits; // read distortion uint64_t sse; count = fread(&sse, sizeof(sse), 1, second_pass_log_stream); if (count < 1) { aom_internal_error(error, AOM_CODEC_ERROR, "Could not read from second pass log file!"); } frame_info_arr[i].sse = sse; // read bpm factor double factor; count = fread(&factor, sizeof(factor), 1, second_pass_log_stream); if (count < 1) { aom_internal_error(error, AOM_CODEC_ERROR, "Could not read from second pass log file!"); } frame_info_arr[i].bpm_factor = factor; } } int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { if (ctx == NULL) return -1; int use_arf = 0; for (int i = 0; i < ctx->gop_info.gf_length; i++) { if (ctx->frame_info[i].order_hint != 0 && ctx->frame_info[i].is_show_frame == 0) { use_arf = 1; } } if (use_arf != ctx->gop_info.use_arf) { aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, "Mismatch in third pass GOP length!"); } return use_arf; } void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, int fwidth, double *ratio_h, double *ratio_w) { assert(ctx); assert(fidx < ctx->frame_info_count); const int fheight_second_pass = ctx->frame_info[fidx].height; const int fwidth_second_pass = ctx->frame_info[fidx].width; assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth); *ratio_h = (double)fheight / fheight_second_pass; *ratio_w = (double)fwidth / fwidth_second_pass; } THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, int mi_row, int mi_col, double ratio_h, double ratio_w) { assert(ctx); assert(fidx < ctx->frame_info_count); const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows; const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols; const int mi_row_second_pass = clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1); const int mi_col_second_pass = clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1); const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride; THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info + mi_row_second_pass * mi_stride_second_pass + mi_col_second_pass; return this_mi; } void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO 
*third_pass_mi, double ratio_h, double ratio_w, int *mi_row, int *mi_col) { *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h); *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w); } int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w, MV_REFERENCE_FRAME frame) { assert(this_mi != NULL); int_mv cur_mv; cur_mv.as_int = INVALID_MV; if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv; for (int r = 0; r < 2; r++) { if (this_mi->ref_frame[r] == frame) { cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h); cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w); } } return cur_mv; } BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w) { assert(this_mi != NULL); BLOCK_SIZE bsize = BLOCK_INVALID; const BLOCK_SIZE bsize_second_pass = this_mi->bsize; assert(bsize_second_pass != BLOCK_INVALID); const int w_second_pass = block_size_wide[bsize_second_pass]; const int h_second_pass = block_size_high[bsize_second_pass]; int part_type; if (w_second_pass == h_second_pass) { part_type = PARTITION_NONE; } else if (w_second_pass / h_second_pass == 2) { part_type = PARTITION_HORZ; } else if (w_second_pass / h_second_pass == 4) { part_type = PARTITION_HORZ_4; } else if (h_second_pass / w_second_pass == 2) { part_type = PARTITION_VERT; } else if (h_second_pass / w_second_pass == 4) { part_type = PARTITION_VERT_4; } else { part_type = PARTITION_INVALID; } assert(part_type != PARTITION_INVALID); const int w = (int)(round(w_second_pass * ratio_w)); const int h = (int)(round(h_second_pass * ratio_h)); for (int i = 0; i < SQR_BLOCK_SIZES; i++) { const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i]; if (this_bsize == BLOCK_INVALID) continue; const int this_w = block_size_wide[this_bsize]; const int this_h = block_size_high[this_bsize]; if (this_w >= w && this_h >= h) { // find the smallest block size that contains the mapped block bsize = this_bsize; break; } } if (bsize == BLOCK_INVALID) { // could not find a proper one, just use the largest then. 
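// (This can happen, e.g., when the scaled size exceeds the largest block of
// the matching shape, such as a 128-wide block scaled up further, or a
// 4:1 / 1:4 block whose scaled size is larger than the biggest available
// 4:1 block size.)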
bsize = BLOCK_128X128; } return bsize; } PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, THIRD_PASS_MI_INFO *this_mi) { int mi_stride = ctx->frame_info[0].mi_stride; int mi_row = this_mi->mi_row_start; int mi_col = this_mi->mi_col_start; THIRD_PASS_MI_INFO *corner_mi = &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col]; return corner_mi->partition; } #else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER) void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, const char *file) { (void)ctx; (void)file; aom_internal_error(cm->error, AOM_CODEC_ERROR, "To utilize three-pass encoding, libaom must be built " "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1."); } void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) { (void)cpi; (void)is_read; } void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; } void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; } void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) { (void)cpi; (void)gf_index; } void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, THIRD_PASS_GOP_INFO *gop_info, struct aom_internal_error_info *error) { (void)second_pass_log_stream; (void)gop_info; (void)error; } void av1_read_second_pass_per_frame_info( FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, int frame_info_count, struct aom_internal_error_info *error) { (void)second_pass_log_stream; (void)frame_info_arr; (void)frame_info_count; (void)error; } int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; return 1; } void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, int fwidth, double *ratio_h, double *ratio_w) { (void)ctx; (void)fidx; (void)fheight; (void)fwidth; (void)ratio_h; (void)ratio_w; } THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, int mi_row, int mi_col, double ratio_h, double ratio_w) { (void)ctx; (void)fidx; (void)mi_row; (void)mi_col; (void)ratio_h; (void)ratio_w; return NULL; } int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w, MV_REFERENCE_FRAME frame) { (void)this_mi; (void)ratio_h; (void)ratio_w; (void)frame; int_mv mv; mv.as_int = INVALID_MV; return mv; } BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w) { (void)this_mi; (void)ratio_h; (void)ratio_w; return BLOCK_INVALID; } void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, double ratio_h, double ratio_w, int *mi_row, int *mi_col) { (void)third_pass_mi; (void)ratio_h; (void)ratio_w; (void)mi_row; (void)mi_col; } PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, THIRD_PASS_MI_INFO *this_mi) { (void)ctx; (void)this_mi; return PARTITION_INVALID; } #endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER #if CONFIG_BITRATE_ACCURACY static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb, FILE *stream, struct aom_internal_error_info *error) { size_t count = fwrite(ptr, size, nmemb, stream); if (count < nmemb) { aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n"); } } static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream, struct aom_internal_error_info *error) { size_t count = fread(ptr, size, nmemb, stream); if (count < nmemb) 
{ aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n"); } } void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, const TplParams *tpl_data) { tpl_info->tpl_ready = tpl_data->ready; if (tpl_info->tpl_ready) { tpl_info->gf_length = gf_group->size; for (int i = 0; i < tpl_info->gf_length; ++i) { tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i]; tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i); tpl_info->update_type_list[i] = gf_group->update_type[i]; } } } void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, struct aom_internal_error_info *error) { fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, log_stream, error); if (tpl_info->tpl_ready) { fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, log_stream, error); assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); fwrite_and_check(&tpl_info->txfm_stats_list, sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, log_stream, error); fwrite_and_check(&tpl_info->qstep_ratio_ls, sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, log_stream, error); fwrite_and_check(&tpl_info->update_type_list, sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, log_stream, error); } } void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, struct aom_internal_error_info *error) { av1_zero(*tpl_info); fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, log_stream, error); if (tpl_info->tpl_ready) { fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, log_stream, error); assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); fread_and_check(&tpl_info->txfm_stats_list, sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, log_stream, error); fread_and_check(&tpl_info->qstep_ratio_ls, sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, log_stream, error); fread_and_check(&tpl_info->update_type_list, sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, log_stream, error); } } #endif // CONFIG_BITRATE_ACCURACY aom-3.12.1/av1/encoder/thirdpass.h000066400000000000000000000160361477627663500166670ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_THIRDPASS_H_ #define AOM_AV1_ENCODER_THIRDPASS_H_ #include "av1/common/enums.h" #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/firstpass.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/tpl_model.h" struct AV1_COMP; // TODO(bohanli): optimize this number #define MAX_THIRD_PASS_BUF \ (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH)) // Struct to store useful information related to a GOP, in addition to what is // available in the bitstream typedef struct { int gf_length; int num_frames; int use_arf; } THIRD_PASS_GOP_INFO; #if CONFIG_BITRATE_ACCURACY typedef struct TPL_INFO { int gf_length; int tpl_ready; TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS]; double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS]; FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS]; } TPL_INFO; #endif // CONFIG_BITRATE_ACCURACY typedef struct { BLOCK_SIZE bsize; PARTITION_TYPE partition; int mi_row_start; int mi_col_start; int_mv mv[2]; MV_REFERENCE_FRAME ref_frame[2]; PREDICTION_MODE pred_mode; } THIRD_PASS_MI_INFO; // Struct to store useful information about a frame for the third pass. // The members are extracted from the decoder by function get_frame_info. typedef struct { int width; int height; int mi_stride; int mi_rows; int mi_cols; int base_q_idx; int is_show_existing_frame; int is_show_frame; int bits_allocated; int actual_bits; uint64_t sse; double bpm_factor; FRAME_TYPE frame_type; unsigned int order_hint; THIRD_PASS_MI_INFO *mi_info; } THIRD_PASS_FRAME_INFO; typedef struct { /* --- Input and decoding related members --- */ // the input file const char *input_file_name; #if CONFIG_THREE_PASS // input context struct AvxInputContext *input_ctx; #endif // decoder codec context aom_codec_ctx_t decoder; // start of the frame in buf const unsigned char *frame; // end of the frame(s) in buf const unsigned char *end_frame; // whether we still have following frames in buf int have_frame; // pointer to buffer for the read frames uint8_t *buf; // size of data in buffer size_t bytes_in_buffer; // current buffer size size_t buffer_size; // error info pointer struct aom_internal_error_info *err_info; int this_frame_bits; /* --- Members for third pass encoding --- */ // Array to store info about each frame. // frame_info[0] should point to the current frame. THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; // number of frames available in frame_info int frame_info_count; // the end of the previous GOP (order hint) int prev_gop_end; THIRD_PASS_GOP_INFO gop_info; } THIRD_PASS_DEC_CTX; void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, const char *file); void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx); // Set the GOP structure from the twopass bitstream. // TODO(bohanli): this is currently a skeleton and we only return the gop // length. This function also saves all frame information in the array // ctx->frame_info for this GOP. void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx); // Pop one frame out of the array ctx->frame_info. This function is used to make // sure that frame_info[0] always corresponds to the current frame. void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx); void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read); void av1_close_second_pass_log(struct AV1_COMP *cpi); // Write the current GOP information into the second pass log file. void av1_write_second_pass_gop_info(struct AV1_COMP *cpi); // Write the information of the frames in this GOP into the second pass log // file. 
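// For each frame in the GOP this records the allocated bits, the encoded
// frame's sse and the two-pass bpm_factor, in the same order they are read
// back by av1_read_second_pass_per_frame_info().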
void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index); // Read the next GOP information from the second pass log file. void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, THIRD_PASS_GOP_INFO *gop_info, struct aom_internal_error_info *error); // read the information of the frames in next GOP from the second pass log file. void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, int frame_info_count, struct aom_internal_error_info *error); int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx); // Calculate the ratio of third pass frame dimensions over second pass frame // dimensions. Return them in ratio_h and ratio_w. void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, int fwidth, double *ratio_h, double *ratio_w); // Get the pointer to a second pass mi info, where mi_row and mi_col are the mi // location in the thirdpass frame. THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, int mi_row, int mi_col, double ratio_h, double ratio_w); // Get the adjusted MVs of this_mi, associated with the reference frame. If no // MV is found with the reference frame, INVALID_MV is returned. int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w, MV_REFERENCE_FRAME frame); // Get the adjusted block size of this_mi. BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, double ratio_h, double ratio_w); // Get the adjusted mi position in the third pass frame, of a given // third_pass_mi. Location is returned in mi_row and mi_col. void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, double ratio_h, double ratio_w, int *mi_row, int *mi_col); PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, THIRD_PASS_MI_INFO *this_mi); #if CONFIG_BITRATE_ACCURACY void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, const TplParams *tpl_data); void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, struct aom_internal_error_info *error); void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, struct aom_internal_error_info *error); #endif // CONFIG_BITRATE_ACCURACY #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_THIRDPASS_H_ aom-3.12.1/av1/encoder/tokenize.c000066400000000000000000000355711477627663500165160ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include #include "aom_mem/aom_mem.h" #include "av1/common/entropy.h" #include "av1/common/pred_common.h" #include "av1/common/scan.h" #include "av1/common/seg_common.h" #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" static inline int av1_fast_palette_color_index_context_on_edge( const uint8_t *color_map, int stride, int r, int c, int *color_idx) { const bool has_left = (c - 1 >= 0); const bool has_above = (r - 1 >= 0); assert(r > 0 || c > 0); assert(has_above ^ has_left); assert(color_idx); (void)has_left; const uint8_t color_neighbor = has_above ? color_map[(r - 1) * stride + (c - 0)] : color_map[(r - 0) * stride + (c - 1)]; // If the neighbor color has higher index than current color index, then we // move up by 1. const uint8_t current_color = *color_idx = color_map[r * stride + c]; if (color_neighbor > current_color) { (*color_idx)++; } else if (color_neighbor == current_color) { *color_idx = 0; } // Get hash value of context. // The non-diagonal neighbors get a weight of 2. const uint8_t color_score = 2; const uint8_t hash_multiplier = 1; const uint8_t color_index_ctx_hash = color_score * hash_multiplier; // Lookup context from hash. const int color_index_ctx = av1_palette_color_index_context_lookup[color_index_ctx_hash]; assert(color_index_ctx == 0); (void)color_index_ctx; return 0; } #define SWAP(i, j) \ do { \ const uint8_t tmp_score = score_rank[i]; \ const uint8_t tmp_color = color_rank[i]; \ score_rank[i] = score_rank[j]; \ color_rank[i] = color_rank[j]; \ score_rank[j] = tmp_score; \ color_rank[j] = tmp_color; \ } while (0) #define INVALID_COLOR_IDX (UINT8_MAX) // A faster version of av1_get_palette_color_index_context used by the encoder // exploiting the fact that the encoder does not need to maintain a color order. static inline int av1_fast_palette_color_index_context(const uint8_t *color_map, int stride, int r, int c, int *color_idx) { assert(r > 0 || c > 0); const bool has_above = (r - 1 >= 0); const bool has_left = (c - 1 >= 0); assert(has_above || has_left); if (has_above ^ has_left) { return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c, color_idx); } // This goes in the order of left, top, and top-left. This has the advantage // that unless anything here are not distinct or invalid, this will already // be in sorted order. Furthermore, if either of the first two is // invalid, we know the last one is also invalid. uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS]; color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)]; color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)]; color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)]; // Aggregate duplicated values. 
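// For example, if the left and above neighbors share one color that differs
// from the above-left neighbor, the left entry's score becomes 2 + 2 = 4,
// the above entry is marked invalid and the above-left entry keeps score 1.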
// Since our array is so small, using a couple if statements is faster uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 }; uint8_t num_invalid_colors = 0; if (color_neighbors[0] == color_neighbors[1]) { scores[0] += scores[1]; color_neighbors[1] = INVALID_COLOR_IDX; num_invalid_colors += 1; if (color_neighbors[0] == color_neighbors[2]) { scores[0] += scores[2]; num_invalid_colors += 1; } } else if (color_neighbors[0] == color_neighbors[2]) { scores[0] += scores[2]; num_invalid_colors += 1; } else if (color_neighbors[1] == color_neighbors[2]) { scores[1] += scores[2]; num_invalid_colors += 1; } const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors; uint8_t *color_rank = color_neighbors; uint8_t *score_rank = scores; // Sort everything if (num_valid_colors > 1) { if (color_neighbors[1] == INVALID_COLOR_IDX) { scores[1] = scores[2]; color_neighbors[1] = color_neighbors[2]; } // We need to swap the first two elements if they have the same score but // the color indices are not in the right order if (score_rank[0] < score_rank[1] || (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) { SWAP(0, 1); } if (num_valid_colors > 2) { if (score_rank[0] < score_rank[2]) { SWAP(0, 2); } if (score_rank[1] < score_rank[2]) { SWAP(1, 2); } } } // If any of the neighbor colors has higher index than current color index, // then we move up by 1 unless the current color is the same as one of the // neighbors. const uint8_t current_color = *color_idx = color_map[r * stride + c]; for (int idx = 0; idx < num_valid_colors; idx++) { if (color_rank[idx] > current_color) { (*color_idx)++; } else if (color_rank[idx] == current_color) { *color_idx = idx; break; } } // Get hash value of context. uint8_t color_index_ctx_hash = 0; static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; for (int idx = 0; idx < num_valid_colors; ++idx) { color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx]; } assert(color_index_ctx_hash > 0); assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); // Lookup context from hash. 
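// With the weights above, the reachable hash values are 5 (one distinct
// neighbor color), 6 or 7 (two distinct colors) and 8 (all three distinct),
// which map to contexts 4, 3, 2 and 1 respectively, i.e. 9 - hash; the
// assert below checks this against av1_palette_color_index_context_lookup.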
const int color_index_ctx = 9 - color_index_ctx_hash; assert(color_index_ctx == av1_palette_color_index_context_lookup[color_index_ctx_hash]); assert(color_index_ctx >= 0); assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); return color_index_ctx; } #undef INVALID_COLOR_IDX #undef SWAP static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t, int plane, int calc_rate, int allow_update_cdf, FRAME_COUNTS *counts) { const uint8_t *const color_map = param->color_map; MapCdf map_cdf = param->map_cdf; ColorCost color_cost = param->color_cost; const int plane_block_width = param->plane_width; const int rows = param->rows; const int cols = param->cols; const int n = param->n_colors; const int palette_size_idx = n - PALETTE_MIN_SIZE; int this_rate = 0; (void)plane; (void)counts; for (int k = 1; k < rows + cols - 1; ++k) { for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { int i = k - j; int color_new_idx; const int color_ctx = av1_fast_palette_color_index_context( color_map, plane_block_width, i, j, &color_new_idx); assert(color_new_idx >= 0 && color_new_idx < n); if (calc_rate) { this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx]; } else { (*t)->token = color_new_idx; (*t)->color_ctx = color_ctx; ++(*t); if (allow_update_cdf) update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n); #if CONFIG_ENTROPY_STATS if (plane) { ++counts->palette_uv_color_index[palette_size_idx][color_ctx] [color_new_idx]; } else { ++counts->palette_y_color_index[palette_size_idx][color_ctx] [color_new_idx]; } #endif } } } if (calc_rate) return this_rate; return 0; } static void get_palette_params(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, Av1ColorMapParam *params) { const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; params->color_map = xd->plane[plane].color_index_map; params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf : xd->tile_ctx->palette_y_color_index_cdf; params->color_cost = plane ? x->mode_costs.palette_uv_color_cost : x->mode_costs.palette_y_color_cost; params->n_colors = pmi->palette_size[plane]; av1_get_block_dimensions(bsize, plane, xd, ¶ms->plane_width, NULL, ¶ms->rows, ¶ms->cols); } // TODO(any): Remove this function static void get_color_map_params(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type, Av1ColorMapParam *params) { (void)tx_size; memset(params, 0, sizeof(*params)); switch (type) { case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break; default: assert(0 && "Invalid color map type"); return; } } int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type) { assert(plane == 0 || plane == 1); Av1ColorMapParam color_map_params; get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL); } void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type, int allow_update_cdf, FRAME_COUNTS *counts) { assert(plane == 0 || plane == 1); Av1ColorMapParam color_map_params; get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); // The first color index does not use context or entropy. 
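// It is stored with color_ctx = -1, which marks this token as not
// context-coded when the palette tokens are later packed into the
// bitstream.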
(*t)->token = color_map_params.color_map[0]; (*t)->color_ctx = -1; ++(*t); cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf, counts); } static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, int plane, void *arg) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int max_blocks_high = max_block_high(xd, plane_bsize, plane); const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; const TX_SIZE plane_tx_size = plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, pd->subsampling_y) : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, blk_col)]; if (tx_size == plane_tx_size || plane) { plane_bsize = get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); struct tokenize_b_args *args = arg; if (args->allow_update_cdf) av1_update_and_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); else av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); } else { // Half the block size in transform block unit. const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsw * bsh; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); for (int row = 0; row < row_end; row += bsh) { const int offsetr = blk_row + row; for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane, arg); block += step; } } } } void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, uint8_t allow_update_cdf) { assert(bsize < BLOCK_SIZES_ALL); const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) return; const int num_planes = av1_num_planes(cm); MB_MODE_INFO *const mbmi = xd->mi[0]; struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; if (mbmi->skip_txfm) { av1_reset_entropy_context(xd, bsize, num_planes); return; } for (int plane = 0; plane < num_planes; ++plane) { if (plane && !xd->is_chroma_ref) break; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); assert(plane_bsize < BLOCK_SIZES_ALL); const int mi_width = mi_size_wide[plane_bsize]; const int mi_height = mi_size_high[plane_bsize]; const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; const int bw = mi_size_wide[txb_size]; const int bh = mi_size_high[txb_size]; int block = 0; const int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; const BLOCK_SIZE max_unit_bsize = get_plane_block_size(BLOCK_64X64, ss_x, ss_y); int mu_blocks_wide = mi_size_wide[max_unit_bsize]; int mu_blocks_high = mi_size_high[max_unit_bsize]; mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); 
mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col, block, plane, &arg); block += step; } } } } } if (rate) *rate += arg.this_rate; } aom-3.12.1/av1/encoder/tokenize.h000066400000000000000000000135171477627663500165170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_TOKENIZE_H_ #define AOM_AV1_ENCODER_TOKENIZE_H_ #include "av1/common/entropy.h" #include "av1/encoder/block.h" #include "aom_dsp/bitwriter.h" #ifdef __cplusplus extern "C" { #endif // The token and color_ctx members of the TokenExtra structure are used // to store the indices of color and color context of each pixel in // case of palette mode. // 1) token can take values in the range of [0, 7] as maximum number of possible // colors is 8 (PALETTE_COLORS). Hence token requires 3 bits (unsigned). // 2) The reserved field (1-bit) is positioned such that color_ctx occupies the // most significant bits and token occupies the least significant bits of the // byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is // defined as: // typedef struct { // int8_t color_ctx : 4; // uint8_t token : 3; // } TokenExtra; // then read of color_ctx requires an extra left shift to facilitate sign // extension and write of token requires an extra masking. // 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e., // from 0 to 4. As per the current implementation it can take values in the // range of [-1, 4]. Here -1 corresponds to invalid color index context and is // used for default initialization. Hence color_ctx requires 4 bits (signed). typedef struct { uint8_t token : 3; uint8_t reserved : 1; int8_t color_ctx : 4; } TokenExtra; typedef struct { TokenExtra *start; unsigned int count; } TokenList; typedef struct { // Number of tile tokens for which memory is allocated. unsigned int tokens_allocated; // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith // tile row, jth tile column. TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of // palette tokens for the kth superblock row of the ith tile row, jth tile // column. 
TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; } TokenInfo; struct AV1_COMP; struct ThreadData; struct FRAME_COUNTS; enum { OUTPUT_ENABLED = 0, DRY_RUN_NORMAL, DRY_RUN_COSTCOEFFS, } UENUM1BYTE(RUN_TYPE); struct tokenize_b_args { const struct AV1_COMP *cpi; struct ThreadData *td; int this_rate; uint8_t allow_update_cdf; RUN_TYPE dry_run; }; // Note in all the tokenize functions rate if non NULL is incremented // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS, // otherwise rate is not incremented. void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, uint8_t allow_update_cdf); int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type); void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, COLOR_MAP_TYPE type, int allow_update_cdf, struct FRAME_COUNTS *counts); static inline int av1_get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { const int eob_max = av1_get_max_eob(tx_size); return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } // Token buffer is only used for palette tokens. static inline unsigned int get_token_alloc(int mb_rows, int mb_cols, int sb_size_log2, const int num_planes) { // Calculate the maximum number of max superblocks in the image. const int shift = sb_size_log2 - 4; const int sb_size = 1 << sb_size_log2; const int sb_size_square = sb_size * sb_size; const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift); const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift); // One palette token for each pixel. There can be palettes on two planes. const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square; return sb_rows * sb_cols * sb_palette_toks; } // Allocate memory for token related info. static inline void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, unsigned int tokens_required) { int sb_rows = CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); token_info->tokens_allocated = tokens_required; CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0], (TokenExtra *)aom_calloc( tokens_required, sizeof(*token_info->tile_tok[0][0]))); CHECK_MEM_ERROR( cm, token_info->tplist[0][0], (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, sizeof(*token_info->tplist[0][0]))); } // Check if memory allocation has been done for token related info. static inline bool is_token_info_allocated(const TokenInfo *token_info) { return ((token_info->tile_tok[0][0] != NULL) && (token_info->tplist[0][0] != NULL)); } // Free memory from token related variables. static inline void free_token_info(TokenInfo *token_info) { aom_free(token_info->tile_tok[0][0]); token_info->tile_tok[0][0] = NULL; aom_free(token_info->tplist[0][0]); token_info->tplist[0][0] = NULL; token_info->tokens_allocated = 0; } #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_TOKENIZE_H_ aom-3.12.1/av1/encoder/tpl_model.c000066400000000000000000003061161477627663500166410ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/aom_config.h" #if CONFIG_THREE_PASS #include "av1/encoder/thirdpass.h" #endif #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" #include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/enums.h" #include "av1/common/idct.h" #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encode_strategy.h" #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tpl_model.h" static inline double exp_bounded(double v) { // When v > 700 or <-700, the exp function will be close to overflow // For details, see the "Notes" in the following link. // https://en.cppreference.com/w/c/numeric/math/exp if (v > 700) { return DBL_MAX; } else if (v < -700) { return 0; } return exp(v); } void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) { tpl_txfm_stats->ready = 0; tpl_txfm_stats->coeff_num = 256; tpl_txfm_stats->txfm_block_count = 0; memset(tpl_txfm_stats->abs_coeff_sum, 0, sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num); memset(tpl_txfm_stats->abs_coeff_mean, 0, sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num); } #if CONFIG_BITRATE_ACCURACY void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, TplTxfmStats *accumulated_stats) { accumulated_stats->txfm_block_count += sub_stats->txfm_block_count; for (int i = 0; i < accumulated_stats->coeff_num; ++i) { accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i]; } } void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, const tran_low_t *coeff) { // For transform larger than 16x16, the scale of coeff need to be adjusted. // It's not LOSSLESS_Q_STEP. assert(tpl_txfm_stats->coeff_num <= 256); for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) { tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; } ++tpl_txfm_stats->txfm_block_count; } void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) { if (txfm_stats->txfm_block_count > 0) { for (int j = 0; j < txfm_stats->coeff_num; j++) { txfm_stats->abs_coeff_mean[j] = txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count; } txfm_stats->ready = 1; } else { txfm_stats->ready = 0; } } static inline void av1_tpl_store_txfm_stats(TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, const int frame_index) { tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; } #endif // CONFIG_BITRATE_ACCURACY static inline void get_quantize_error(const MACROBLOCK *x, int plane, const tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, TX_SIZE tx_size, uint16_t *eob, int64_t *recon_error, int64_t *sse) { const struct macroblock_plane *const p = &x->plane[plane]; const MACROBLOCKD *xd = &x->e_mbd; const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; const int shift = tx_size == TX_32X32 ? 
0 : 2; QUANT_PARAM quant_param; av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, &quant_param); *recon_error = av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift; } else { av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, &quant_param); *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; } #else (void)xd; av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, &quant_param); *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; #endif // CONFIG_AV1_HIGHBITDEPTH *recon_error = AOMMAX(*recon_error, 1); *sse = (*sse) >> shift; *sse = AOMMAX(*sse, 1); } static inline void set_tpl_stats_block_size(uint8_t *block_mis_log2, uint8_t *tpl_bsize_1d) { // tpl stats bsize: 2 means 16x16 *block_mis_log2 = 2; // Block size used in tpl motion estimation *tpl_bsize_1d = 16; // MIN_TPL_BSIZE_1D = 16; assert(*tpl_bsize_1d >= 16); } void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, CommonModeInfoParams *const mi_params, int width, int height, int byte_alignment, int lag_in_frames) { SequenceHeader *const seq_params = &ppi->seq_params; TplParams *const tpl_data = &ppi->tpl_data; set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, &tpl_data->tpl_bsize_1d); const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; tpl_data->border_in_pixels = ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5); const int alloc_y_plane_only = ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0; for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) { const int mi_cols = ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2); const int mi_rows = ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2); TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame]; tpl_frame->is_valid = 0; tpl_frame->width = mi_cols >> block_mis_log2; tpl_frame->height = mi_rows >> block_mis_log2; tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width; tpl_frame->mi_rows = mi_params->mi_rows; tpl_frame->mi_cols = mi_params->mi_cols; } tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1]; // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory // allocations are avoided for buffers in tpl_data. 
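// Otherwise, allocate per lookahead frame a tpl stats array (one entry per
// 16x16 block) and a reconstruction frame buffer (luma only when
// use_y_only_rate_distortion is set), plus one txfm stats record per
// possible tpl frame.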
if (lag_in_frames <= 1) return; AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list, aom_calloc(MAX_LENGTH_TPL_FRAME_STATS, sizeof(*tpl_data->txfm_stats_list))); for (int frame = 0; frame < lag_in_frames; ++frame) { AOM_CHECK_MEM_ERROR( &ppi->error, tpl_data->tpl_stats_pool[frame], aom_calloc(tpl_data->tpl_stats_buffer[frame].width * tpl_data->tpl_stats_buffer[frame].height, sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr))); if (aom_alloc_frame_buffer( &tpl_data->tpl_rec_pool[frame], width, height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, tpl_data->border_in_pixels, byte_alignment, false, alloc_y_plane_only)) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } static inline int32_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff, int diff_stride, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, tran_low_t *coeff, int bw, int bh, TX_SIZE tx_size) { const int pix_num = bw * bh; av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, dst, dst_stride); av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); return aom_satd(coeff, pix_num); } static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); int rate_cost = 1; for (int idx = 0; idx < eob; ++idx) { unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); } return (rate_cost << AV1_PROB_COST_SHIFT); } static inline void txfm_quant_rdcost( const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) { const MACROBLOCKD *xd = &x->e_mbd; const BitDepthInfo bd_info = get_bit_depth_info(xd); uint16_t eob; av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, dst, dst_stride); av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error, sse); *rate_cost = rate_estimator(qcoeff, eob, tx_size); if (do_recon) av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, dst_stride, eob, 0); } static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, uint8_t *cur_frame_buf, uint8_t *ref_frame_buf, int stride, int ref_stride, int width, int ref_width, BLOCK_SIZE bsize, MV center_mv, int_mv *best_mv) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; int step_param; uint32_t bestsme = UINT_MAX; FULLPEL_MV_STATS best_mv_stats; int distortion; uint32_t sse; int cost_list[5]; FULLPEL_MV start_mv = get_fullmv_from_mv(¢er_mv); // Setup frame pointers x->plane[0].src.buf = cur_frame_buf; x->plane[0].src.stride = stride; x->plane[0].src.width = width; xd->plane[0].pre[0].buf = ref_frame_buf; xd->plane[0].pre[0].stride = ref_stride; xd->plane[0].pre[0].width = ref_width; step_param = tpl_sf->reduce_first_step_size; step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); const search_site_config *search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_SRC]; if (search_site_cfg->stride != ref_stride) search_site_cfg = 
cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; assert(search_site_cfg->stride == ref_stride); FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, start_mv, search_site_cfg, tpl_sf->search_method, /*fine_search_interval=*/0); bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), &best_mv->as_fullmv, &best_mv_stats, NULL); // When sub-pel motion search is skipped, populate sub-pel precision MV and // return. if (tpl_sf->subpel_force_stop == FULL_PEL) { best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); return bestsme; } SUBPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, ¢er_mv, cost_list); ms_params.forced_stop = tpl_sf->subpel_force_stop; ms_params.var_params.subpel_search_type = USE_2_TAPS; ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; best_mv_stats.err_cost = 0; MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); bestsme = cpi->mv_search_params.find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv, &distortion, &sse, NULL); return bestsme; } typedef struct { int_mv mv; int sad; } center_mv_t; static int compare_sad(const void *a, const void *b) { const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad; if (diff < 0) return -1; else if (diff > 0) return 1; return 0; } static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs, int center_mvs_count, int skip_alike_starting_mv) { // MV difference threshold is in 1/8 precision. const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) }; int thr = mv_diff_thr[skip_alike_starting_mv]; int i; for (i = 0; i < center_mvs_count; i++) { if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr && abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr) return 1; } return 0; } static void get_rate_distortion( int *rate_cost, int64_t *recon_error, int64_t *pred_error, int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode, int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon, TplTxfmStats *tpl_txfm_stats) { const SequenceHeader *seq_params = cm->seq_params; *rate_cost = 0; *recon_error = 1; *pred_error = 1; (void)tpl_txfm_stats; MACROBLOCKD *xd = &x->e_mbd; int is_compound = (best_mode == NEW_NEWMV); int num_planes = use_y_only_rate_distortion ? 
1 : MAX_MB_PLANE; uint8_t *src_buffer_pool[MAX_MB_PLANE] = { xd->cur_buf->y_buffer, xd->cur_buf->u_buffer, xd->cur_buf->v_buffer, }; const int src_stride_pool[MAX_MB_PLANE] = { xd->cur_buf->y_stride, xd->cur_buf->uv_stride, xd->cur_buf->uv_stride, }; const int_interpfilters kernel = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); for (int plane = 0; plane < num_planes; ++plane) { struct macroblockd_plane *pd = &xd->plane[plane]; BLOCK_SIZE bsize_plane = av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x] [pd->subsampling_y]; int dst_buffer_stride = rec_stride_pool[plane]; int dst_mb_offset = ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) + ((mi_col * MI_SIZE) >> pd->subsampling_x); uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset; for (int ref = 0; ref < 1 + is_compound; ++ref) { if (!is_inter_mode(best_mode)) { av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, block_size_wide[bsize_plane], block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, plane); } else { int_mv best_mv = xd->mi[0]->mv[ref]; uint8_t *ref_buffer_pool[MAX_MB_PLANE] = { ref_frame_ptr[ref]->y_buffer, ref_frame_ptr[ref]->u_buffer, ref_frame_ptr[ref]->v_buffer, }; InterPredParams inter_pred_params; struct buf_2d ref_buf = { NULL, ref_buffer_pool[plane], plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width, plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height, plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride }; av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane], block_size_high[bsize_plane], (mi_row * MI_SIZE) >> pd->subsampling_y, (mi_col * MI_SIZE) >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, xd->block_ref_scale_factors[0], &ref_buf, kernel); if (is_compound) av1_init_comp_mode(&inter_pred_params); inter_pred_params.conv_params = get_conv_params_no_round( ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride, &best_mv.as_mv, &inter_pred_params); } } int src_stride = src_stride_pool[plane]; int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) + ((mi_col * MI_SIZE) >> pd->subsampling_x); int this_rate = 1; int64_t this_recon_error = 1; int64_t sse; txfm_quant_rdcost( x, src_diff, block_size_wide[bsize_plane], src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer, dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane], block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], do_recon, &this_rate, &this_recon_error, &sse); #if CONFIG_BITRATE_ACCURACY if (plane == 0 && tpl_txfm_stats) { // We only collect Y plane's transform coefficient av1_record_tpl_txfm_block(tpl_txfm_stats, coeff); } #endif // CONFIG_BITRATE_ACCURACY *recon_error += this_recon_error; *pred_error += sse; *rate_cost += this_rate; } } static inline int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, const uint8_t *src_mb_buffer, int src_stride, TplBuffers *tpl_tmp_buffers, BLOCK_SIZE bsize, TX_SIZE tx_size, int mi_row, int mi_col, int rf_idx, MV *rfidx_mv, int use_pred_sad) { const BitDepthInfo bd_info = get_bit_depth_info(xd); TplParams *tpl_data = &cpi->ppi->tpl_data; const YV12_BUFFER_CONFIG *const ref_frame_ptr = tpl_data->src_ref_frame[rf_idx]; int16_t *src_diff = tpl_tmp_buffers->src_diff; tran_low_t *coeff = 
tpl_tmp_buffers->coeff; const int bw = 4 << mi_size_wide_log2[bsize]; const int bh = 4 << mi_size_high_log2[bsize]; int32_t inter_cost; if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) { const int_interpfilters kernel = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); uint8_t *predictor8 = tpl_tmp_buffers->predictor8; uint8_t *predictor = is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, ref_frame_ptr->y_width, ref_frame_ptr->y_height, ref_frame_ptr->y_stride }; InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, &tpl_data->sf, &ref_buf, kernel); inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv, &inter_pred_params); if (use_pred_sad) { inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride, predictor, bw); } else { inter_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, predictor, bw, coeff, bw, bh, tx_size); } } else { int ref_mb_offset = mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; int ref_stride = ref_frame_ptr->y_stride; const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv); // Since sub-pel motion search is not performed, use the prediction pixels // directly from the reference block ref_mb if (use_pred_sad) { inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf( src_mb_buffer, src_stride, &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride); } else { inter_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride, coeff, bw, bh, tx_size); } } return inter_cost; } static inline void mode_estimation(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, TplDepStats *tpl_stats) { AV1_COMMON *cm = &cpi->common; const GF_GROUP *gf_group = &cpi->ppi->gf_group; TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; (void)gf_group; MACROBLOCKD *xd = &x->e_mbd; const BitDepthInfo bd_info = get_bit_depth_info(xd); TplParams *tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; const int bw = 4 << mi_size_wide_log2[bsize]; const int bh = 4 << mi_size_high_log2[bsize]; int32_t best_intra_cost = INT32_MAX; int32_t intra_cost; PREDICTION_MODE best_mode = DC_PRED; const int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset; const int src_stride = xd->cur_buf->y_stride; const int src_width = xd->cur_buf->y_width; int dst_mb_offset = mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE; uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset; int dst_buffer_stride = tpl_frame->rec_picture->y_stride; int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion; uint8_t *rec_buffer_pool[3] = { tpl_frame->rec_picture->y_buffer, tpl_frame->rec_picture->u_buffer, tpl_frame->rec_picture->v_buffer, }; const int rec_stride_pool[3] = { tpl_frame->rec_picture->y_stride, tpl_frame->rec_picture->uv_stride, tpl_frame->rec_picture->uv_stride, }; for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { struct macroblockd_plane *pd = &xd->plane[plane]; 
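// Copy the source buffer's chroma subsampling so that the per-plane block
// size lookups in get_rate_distortion() below use the correct chroma
// dimensions.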
pd->subsampling_x = xd->cur_buf->subsampling_x; pd->subsampling_y = xd->cur_buf->subsampling_y; } uint8_t *predictor8 = tpl_tmp_buffers->predictor8; int16_t *src_diff = tpl_tmp_buffers->src_diff; tran_low_t *coeff = tpl_tmp_buffers->coeff; tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff; tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff; uint8_t *predictor = is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; int64_t recon_error = 1; int64_t pred_error = 1; memset(tpl_stats, 0, sizeof(*tpl_stats)); tpl_stats->ref_frame_index[0] = -1; tpl_stats->ref_frame_index[1] = -1; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, mi_col); set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, cm->mi_params.mi_rows, cm->mi_params.mi_cols); set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], av1_num_planes(cm)); xd->mi[0]->bsize = bsize; xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; // Intra prediction search xd->mi[0]->ref_frame[0] = INTRA_FRAME; // Pre-load the bottom left line. if (xd->left_available && mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) { if (is_cur_buf_hbd(xd)) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer); for (int i = 0; i < bw; ++i) dst[(bw + i) * dst_buffer_stride - 1] = dst[(bw - 1) * dst_buffer_stride - 1]; } else { for (int i = 0; i < bw; ++i) dst_buffer[(bw + i) * dst_buffer_stride - 1] = dst_buffer[(bw - 1) * dst_buffer_stride - 1]; } } // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED, // H_PRED, and V_PRED const PREDICTION_MODE last_intra_mode = tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END; const SequenceHeader *seq_params = cm->seq_params; for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; ++mode) { av1_predict_intra_block(xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0, 0, 0); if (tpl_frame->use_pred_sad) { intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf( src_mb_buffer, src_stride, predictor, bw); } else { intra_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, predictor, bw, coeff, bw, bh, tx_size); } if (intra_cost < best_intra_cost) { best_intra_cost = intra_cost; best_mode = mode; } } // Calculate SATD of the best intra mode if SAD was used for mode decision // as best_intra_cost is used in ML model to skip intra mode evaluation. 
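  // Keeping both the intra and inter winners in SATD terms also keeps the
  // later comparison between best_intra_cost and best_inter_cost, and the
  // costs stored in tpl_stats, on a consistent scale.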
if (tpl_frame->use_pred_sad) { av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0, 0, 0); best_intra_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, predictor, bw, coeff, bw, bh, tx_size); } int rate_cost = 1; if (cpi->use_ducky_encode) { get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion, 1 /*do_recon*/, NULL); tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->intra_rate = rate_cost; } #if CONFIG_THREE_PASS const int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index; if (cpi->third_pass_ctx && frame_offset < cpi->third_pass_ctx->frame_info_count && tpl_data->frame_idx < gf_group->size) { double ratio_h, ratio_w; av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, cm->width, &ratio_h, &ratio_w); THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); PREDICTION_MODE third_pass_mode = this_mi->pred_mode; if (third_pass_mode >= last_intra_mode && third_pass_mode < INTRA_MODE_END) { av1_predict_intra_block( xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, block_size_wide[bsize], block_size_high[bsize], tx_size, third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0, 0, 0); intra_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, predictor, bw, coeff, bw, bh, tx_size); if (intra_cost < best_intra_cost) { best_intra_cost = intra_cost; best_mode = third_pass_mode; } } } #endif // CONFIG_THREE_PASS // Motion compensated prediction xd->mi[0]->ref_frame[0] = INTRA_FRAME; xd->mi[0]->ref_frame[1] = NONE_FRAME; xd->mi[0]->compound_idx = 1; int best_rf_idx = -1; int_mv best_mv[2]; int32_t inter_cost; int32_t best_inter_cost = INT32_MAX; int rf_idx; int_mv single_mv[INTER_REFS_PER_FRAME]; best_mv[0].as_int = INVALID_MV; best_mv[1].as_int = INVALID_MV; for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) { single_mv[rf_idx].as_int = INVALID_MV; if (tpl_data->ref_frame[rf_idx] == NULL || tpl_data->src_ref_frame[rf_idx] == NULL) { tpl_stats->mv[rf_idx].as_int = INVALID_MV; continue; } const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx]; const int ref_mb_offset = mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; const int ref_stride = ref_frame_ptr->y_stride; const int ref_width = ref_frame_ptr->y_width; int_mv best_rfidx_mv = { 0 }; uint32_t bestsme = UINT32_MAX; center_mv_t center_mvs[4] = { { { 0 }, INT_MAX }, { { 0 }, INT_MAX }, { { 0 }, INT_MAX }, { { 0 }, INT_MAX } }; int refmv_count = 1; int idx; if (xd->up_available) { TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)]; if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, tpl_sf->skip_alike_starting_mv)) { center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; ++refmv_count; } } if (xd->left_available) { TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)]; 
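      // As with the above-neighbour candidate, add the left neighbour's best
      // MV for this reference as an extra full-pel search start, unless it
      // (nearly) duplicates a candidate that has already been collected.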
if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, tpl_sf->skip_alike_starting_mv)) { center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; ++refmv_count; } } if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( mi_row - mi_height, mi_col + mi_width, tpl_frame->stride, block_mis_log2)]; if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, tpl_sf->skip_alike_starting_mv)) { center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; ++refmv_count; } } #if CONFIG_THREE_PASS if (cpi->third_pass_ctx && frame_offset < cpi->third_pass_ctx->frame_info_count && tpl_data->frame_idx < gf_group->size) { double ratio_h, ratio_w; av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, cm->width, &ratio_h, &ratio_w); THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w, rf_idx + LAST_FRAME); if (tp_mv.as_int != INVALID_MV && !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1, tpl_sf->skip_alike_starting_mv)) { center_mvs[0].mv = tp_mv; } } #endif // CONFIG_THREE_PASS // Prune starting mvs if (tpl_sf->prune_starting_mv && refmv_count > 1) { // Get each center mv's sad. for (idx = 0; idx < refmv_count; ++idx) { FULLPEL_MV mv = get_fullmv_from_mv(¢er_mvs[idx].mv.as_mv); clamp_fullmv(&mv, &x->mv_limits); center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf( src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col], ref_stride); } // Rank center_mv using sad. qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad); refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count); // Further reduce number of refmv based on sad difference. if (refmv_count > 1) { int last_sad = center_mvs[refmv_count - 1].sad; int second_to_last_sad = center_mvs[refmv_count - 2].sad; if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad) refmv_count--; } } for (idx = 0; idx < refmv_count; ++idx) { int_mv this_mv; uint32_t thissme = motion_estimation( cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width, ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv); if (thissme < bestsme) { bestsme = thissme; best_rfidx_mv = this_mv; } } tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int; single_mv[rf_idx] = best_rfidx_mv; inter_cost = get_inter_cost( cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad); // Store inter cost for each ref frame. This is used to prune inter modes. tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); if (inter_cost < best_inter_cost) { best_rf_idx = rf_idx; best_inter_cost = inter_cost; best_mv[0].as_int = best_rfidx_mv.as_int; } } // Calculate SATD of the best inter mode if SAD was used for mode decision // as best_inter_cost is used in ML model to skip intra mode evaluation. if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) { assert(best_rf_idx != -1); best_inter_cost = get_inter_cost( cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */); } if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) { best_mode = NEWMV; xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME; xd->mi[0]->mv[0].as_int = best_mv[0].as_int; } // Start compound predition search. 
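  // The candidate pairs below are offsets from LAST_FRAME (note the
  // "+ LAST_FRAME" adjustments further down); with the standard AV1 reference
  // ordering, { 0, 4 }, { 0, 6 } and { 3, 6 } correspond to the
  // {LAST, BWDREF}, {LAST, ALTREF} and {GOLDEN, ALTREF} pairs.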
int comp_ref_frames[3][2] = { { 0, 4 }, { 0, 6 }, { 3, 6 }, }; int start_rf = 0; int end_rf = 3; if (!tpl_sf->allow_compound_pred) end_rf = 0; #if CONFIG_THREE_PASS if (cpi->third_pass_ctx && frame_offset < cpi->third_pass_ctx->frame_info_count && tpl_data->frame_idx < gf_group->size) { double ratio_h, ratio_w; av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, cm->width, &ratio_h, &ratio_w); THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); if (this_mi->ref_frame[0] >= LAST_FRAME && this_mi->ref_frame[1] >= LAST_FRAME) { int found = 0; for (int i = 0; i < 3; i++) { if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] && comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) { found = 1; break; } } if (!found || !tpl_sf->allow_compound_pred) { comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME; comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME; if (!tpl_sf->allow_compound_pred) { start_rf = 2; end_rf = 3; } } } } #endif // CONFIG_THREE_PASS xd->mi_row = mi_row; xd->mi_col = mi_col; int best_cmp_rf_idx = -1; const int_interpfilters kernel = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) { int rf_idx0 = comp_ref_frames[cmp_rf_idx][0]; int rf_idx1 = comp_ref_frames[cmp_rf_idx][1]; if (tpl_data->ref_frame[rf_idx0] == NULL || tpl_data->src_ref_frame[rf_idx0] == NULL || tpl_data->ref_frame[rf_idx1] == NULL || tpl_data->src_ref_frame[rf_idx1] == NULL) { continue; } const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { tpl_data->src_ref_frame[rf_idx0], tpl_data->src_ref_frame[rf_idx1], }; xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME; xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME; xd->mi[0]->mode = NEW_NEWMV; const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame); // Set up ref_mv for av1_joint_motion_search(). 
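      // The best single-reference MVs found in the loop above seed the joint
      // refinement for this reference pair.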
CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type]; this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0]; this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1]; struct buf_2d yv12_mb[2][MAX_MB_PLANE]; for (int i = 0; i < 2; ++i) { av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i], xd->block_ref_scale_factors[i], xd->block_ref_scale_factors[i], MAX_MB_PLANE); for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { xd->plane[plane].pre[i] = yv12_mb[i][plane]; } } int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] }; int rate_mv; av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv, !cpi->sf.mv_sf.disable_second_mv, NUM_JOINT_ME_REFINE_ITER); for (int ref = 0; ref < 2; ++ref) { struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer, ref_frame_ptr[ref]->y_width, ref_frame_ptr[ref]->y_height, ref_frame_ptr[ref]->y_stride }; InterPredParams inter_pred_params; av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, &tpl_data->sf, &ref_buf, kernel); av1_init_comp_mode(&inter_pred_params); inter_pred_params.conv_params = get_conv_params_no_round( ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd); av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv, &inter_pred_params); } inter_cost = tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, predictor, bw, coeff, bw, bh, tx_size); if (inter_cost < best_inter_cost) { best_cmp_rf_idx = cmp_rf_idx; best_inter_cost = inter_cost; best_mv[0] = tmp_mv[0]; best_mv[1] = tmp_mv[1]; } } if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) { best_mode = NEW_NEWMV; const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0]; const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1]; xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME; xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME; } if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) { xd->mi[0]->mv[0].as_int = best_mv[0].as_int; xd->mi[0]->mv[1].as_int = best_mv[1].as_int; const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { best_cmp_rf_idx >= 0 ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] : tpl_data->src_ref_frame[best_rf_idx], best_cmp_rf_idx >= 0 ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] : NULL, }; rate_cost = 1; get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion, 0 /*do_recon*/, NULL); tpl_stats->srcrf_rate = rate_cost; } best_intra_cost = AOMMAX(best_intra_cost, 1); best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); tpl_stats->inter_cost = best_inter_cost; tpl_stats->intra_cost = best_intra_cost; tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; // Final encode rate_cost = 0; const YV12_BUFFER_CONFIG *ref_frame_ptr[2]; ref_frame_ptr[0] = best_mode == NEW_NEWMV ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx] : NULL; ref_frame_ptr[1] = best_mode == NEW_NEWMV ? 
tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] : NULL; get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion, 1 /*do_recon*/, tpl_txfm_stats); tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->recrf_rate = rate_cost; if (!is_inter_mode(best_mode)) { tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->srcrf_rate = rate_cost; tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; } tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate); if (best_mode == NEW_NEWMV) { ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion, 1 /*do_recon*/, NULL); tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->cmp_recrf_rate[0] = rate_cost; tpl_stats->cmp_recrf_dist[0] = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]); tpl_stats->cmp_recrf_rate[0] = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]); tpl_stats->cmp_recrf_dist[0] = AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]); tpl_stats->cmp_recrf_rate[0] = AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]); rate_cost = 0; ref_frame_ptr[0] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion, 1 /*do_recon*/, NULL); tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2; tpl_stats->cmp_recrf_rate[1] = rate_cost; tpl_stats->cmp_recrf_dist[1] = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]); tpl_stats->cmp_recrf_rate[1] = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]); tpl_stats->cmp_recrf_dist[1] = AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]); tpl_stats->cmp_recrf_rate[1] = AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]); } if (best_mode == NEWMV) { tpl_stats->mv[best_rf_idx] = best_mv[0]; tpl_stats->ref_frame_index[0] = best_rf_idx; tpl_stats->ref_frame_index[1] = NONE_FRAME; } else if (best_mode == NEW_NEWMV) { tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0]; tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1]; tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0]; tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1]; } for (int idy = 0; idy < mi_height; ++idy) { for (int idx = 0; idx < mi_width; ++idx) { if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) { xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0]; } } } } static int round_floor(int ref_pos, int bsize_pix) { int round; if (ref_pos < 0) round = -(1 + (-ref_pos - 1) / bsize_pix); else round = ref_pos / bsize_pix; return round; } int 
av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, int height) { int min_row = AOMMAX(row_a, row_b); int max_row = AOMMIN(row_a + height, row_b + height); int min_col = AOMMAX(col_a, col_b); int max_col = AOMMIN(col_a + width, col_b + width); if (min_row < max_row && min_col < max_col) { return (max_row - min_row) * (max_col - min_col); } return 0; } int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { return (mi_row >> right_shift) * stride + (mi_col >> right_shift); } int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, int64_t srcrf_dist, int pix_num) { double beta = (double)srcrf_dist / recrf_dist; int64_t rate_cost = delta_rate; if (srcrf_dist <= 128) return rate_cost; double dr = (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) / pix_num; double log_den = log(beta) / log(2.0) + 2.0 * dr; if (log_den > log(10.0) / log(2.0)) { rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0); rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); return rate_cost; } double num = pow(2.0, log_den); double den = num * beta + (1 - beta) * beta; rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0); rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); return rate_cost; } static inline void tpl_model_update_b(TplParams *const tpl_data, int mi_row, int mi_col, const BLOCK_SIZE bsize, int frame_idx, int ref) { TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; TplDepFrame *tpl_frame = tpl_data->tpl_frame; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos( mi_row, mi_col, tpl_frame->stride, block_mis_log2)]; int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0; if (tpl_stats_ptr->ref_frame_index[ref] < 0) return; const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref]; TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]]; TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr; if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return; const FULLPEL_MV full_mv = get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv); const int ref_pos_row = mi_row * MI_SIZE + full_mv.row; const int ref_pos_col = mi_col * MI_SIZE + full_mv.col; const int bw = 4 << mi_size_wide_log2[bsize]; const int bh = 4 << mi_size_high_log2[bsize]; const int mi_height = mi_size_high[bsize]; const int mi_width = mi_size_wide[bsize]; const int pix_num = bw * bh; // top-left on grid block location in pixel int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; int block; int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref] : tpl_stats_ptr->srcrf_dist; int64_t srcrf_rate = is_compound ? 
(tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2) : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist; int64_t mc_dep_dist = (int64_t)(tpl_stats_ptr->mc_dep_dist * ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) / tpl_stats_ptr->recrf_dist)); int64_t delta_rate = (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate; int64_t mc_dep_rate = av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, srcrf_dist, pix_num); for (block = 0; block < 4; ++block) { int grid_pos_row = grid_pos_row_base + bh * (block >> 1); int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, bw, bh); int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; assert((1 << block_mis_log2) == mi_height); assert((1 << block_mis_log2) == mi_width); TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos( ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)]; des_stats->mc_dep_dist += ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num; des_stats->mc_dep_rate += ((delta_rate + mc_dep_rate) * overlap_area) / pix_num; } } } static inline void tpl_model_update(TplParams *const tpl_data, int mi_row, int mi_col, int frame_idx) { const BLOCK_SIZE tpl_stats_block_size = convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, 0); tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, 1); } static inline void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, int mi_col, int stride, const TplDepStats *src_stats, uint8_t block_mis_log2) { int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2); TplDepStats *tpl_ptr = &tpl_stats_ptr[index]; *tpl_ptr = *src_stats; tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost); tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost); tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist); tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse); tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist); tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate); tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate); tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]); tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]); tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]); tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]); } // Reset the ref and source frame pointers of tpl_data. static inline void tpl_reset_src_ref_frames(TplParams *tpl_data) { for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { tpl_data->ref_frame[i] = NULL; tpl_data->src_ref_frame[i] = NULL; } } static inline int get_gop_length(const GF_GROUP *gf_group) { int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1); return gop_length; } // Initialize the mc_flow parameters used in computing tpl data. 
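// For every TPL frame this sets up the reference frame lists (including
// reference pruning), scale factors, quantizer and rd multiplier state that
// mode_estimation() depends on while the dispenser walks the frame.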
static inline void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, int pframe_qindex) { TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME]; uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME]; const GF_GROUP *gf_group = &cpi->ppi->gf_group; TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( gf_group, cpi->sf.inter_sf.selective_ref_frame, tpl_sf->prune_ref_frames_in_tpl, frame_idx); int gop_length = get_gop_length(gf_group); int ref_frame_flags; AV1_COMMON *cm = &cpi->common; int rdmult, idx; ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; tpl_data->frame_idx = frame_idx; tpl_reset_src_ref_frames(tpl_data); av1_tile_init(&xd->tile, cm, 0, 0); const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; // Setup scaling factor av1_setup_scale_factors_for_frame( &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height, this_frame->y_crop_width, this_frame->y_crop_height); xd->cur_buf = this_frame; for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { TplDepFrame *tpl_ref_frame = &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]]; tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture; tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture; ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index; } // Store the reference frames based on priority order for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { ref_frames_ordered[i] = tpl_data->ref_frame[ref_frame_priority_order[i] - 1]; } // Work out which reference frame slots may be used. ref_frame_flags = get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frames_ordered, cpi->ext_flags.ref_frame_flags); enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices, tpl_frame->frame_display_index); // Prune reference frames for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { if ((ref_frame_flags & (1 << idx)) == 0) { tpl_data->ref_frame[idx] = NULL; } } // Skip motion estimation w.r.t. reference frames which are not // considered in RD search, using "selective_ref_frame" speed feature. // The reference frame pruning is not enabled for frames beyond the gop // length, as there are fewer reference frames and the reference frames // differ from the frames considered during RD search. if (ref_pruning_enabled && (frame_idx < gop_length)) { for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME }; if (prune_ref_by_selective_ref_frame(cpi, NULL, refs, ref_frame_display_indices)) { tpl_data->ref_frame[idx] = NULL; } } } // Make a temporary mbmi for tpl model MB_MODE_INFO mbmi; memset(&mbmi, 0, sizeof(mbmi)); MB_MODE_INFO *mbmi_ptr = &mbmi; xd->mi = &mbmi_ptr; xd->block_ref_scale_factors[0] = &tpl_data->sf; xd->block_ref_scale_factors[1] = &tpl_data->sf; const int base_qindex = cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex; // The TPL model is only meant to be run in inter mode, so ensure that we are // not running in all intra mode, which implies we are not tuning for image // quality (IQ). 
assert(cpi->oxcf.tune_cfg.tuning != AOM_TUNE_IQ && cpi->oxcf.mode != ALLINTRA); // Get rd multiplier set up. rdmult = av1_compute_rd_mult( base_qindex, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); if (rdmult < 1) rdmult = 1; av1_set_error_per_bit(&x->errorperbit, rdmult); av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex); tpl_frame->is_valid = 1; cm->quant_params.base_qindex = base_qindex; av1_frame_init_quantizer(cpi); const BitDepthInfo bd_info = get_bit_depth_info(xd); const FRAME_UPDATE_TYPE update_type = gf_group->update_type[cpi->gf_frame_index]; tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex( bd_info.bit_depth, update_type, base_qindex, cpi->oxcf.tune_cfg.tuning) / 6; if (cpi->use_ducky_encode) tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx]; av1_init_tpl_txfm_stats(tpl_txfm_stats); // Initialize x->mbmi_ext when compound predictions are enabled. if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext); // Set the pointer to null since mbmi is only allocated inside this function. assert(xd->mi == &mbmi_ptr); xd->mi = NULL; // Tpl module is called before the setting of speed features at frame level. // Thus, turning off this speed feature for key frame is done here and not // integrated into the speed feature setting itself. const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0; tpl_frame->use_pred_sad = tpl_sf->use_sad_for_mode_decision && gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && gf_group->layer_depth[frame_idx] >= layer_depth_th; } // This function stores the motion estimation dependencies of all the blocks in // a row void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_width = mi_size_wide[bsize]; TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; MACROBLOCKD *xd = &x->e_mbd; const int tplb_cols_in_tile = ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]); assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols; mi_col += mi_width, tplb_col_in_tile++) { (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row, tplb_col_in_tile); #if CONFIG_MULTITHREAD if (mt_info->num_workers > 1) { pthread_mutex_lock(tpl_row_mt->mutex_); const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit; pthread_mutex_unlock(tpl_row_mt->mutex_); // Exit in case any worker has encountered an error. 
if (tpl_mt_exit) return; } #endif TplDepStats tpl_stats; // Motion estimation column boundary av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width, tpl_data->border_in_pixels); xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); xd->mb_to_right_edge = GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col, bsize, tx_size, &tpl_stats); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride, &tpl_stats, tpl_data->tpl_stats_block_mis_log2); (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, tplb_col_in_tile, tplb_cols_in_tile); } } static inline void mc_flow_dispenser(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = mi_size_high[bsize]; for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { // Motion estimation row boundary av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, cpi->ppi->tpl_data.border_in_pixels); xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x, mi_row, bsize, tx_size); } } static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, int mi_cols) { if (!frame_idx) { return; } const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); const int mi_height = mi_size_high[bsize]; const int mi_width = mi_size_wide[bsize]; assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2)); assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2)); for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) { for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) { tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); } } } static inline void init_gop_frames_for_tpl( AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params, GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) { AV1_COMMON *cm = &cpi->common; assert(cpi->gf_frame_index == 0); *pframe_qindex = 0; RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; init_ref_map_pair(cpi, ref_frame_map_pairs); int remapped_ref_idx[REF_FRAMES]; EncodeFrameParams frame_params = *init_frame_params; TplParams *const tpl_data = &cpi->ppi->tpl_data; int ref_picture_map[REF_FRAMES]; for (int i = 0; i < REF_FRAMES; ++i) { if (frame_params.frame_type == KEY_FRAME) { tpl_data->tpl_frame[-i - 1].gf_picture = NULL; tpl_data->tpl_frame[-i - 1].rec_picture = NULL; tpl_data->tpl_frame[-i - 1].frame_display_index = 0; } else { tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf; tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf; tpl_data->tpl_frame[-i - 1].frame_display_index = cm->ref_frame_map[i]->display_order_hint; } ref_picture_map[i] = -i - 1; } *tpl_group_frames = 0; int gf_index; int process_frame_count = 0; const int gop_length = get_gop_length(gf_group); for (gf_index = 0; gf_index < gop_length; ++gf_index) { TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index]; int lookahead_index = gf_group->cur_frame_idx[gf_index] + 
gf_group->arf_src_offset[gf_index]; frame_params.show_frame = frame_update_type != ARF_UPDATE && frame_update_type != INTNL_ARF_UPDATE; frame_params.show_existing_frame = frame_update_type == INTNL_OVERLAY_UPDATE || frame_update_type == OVERLAY_UPDATE; frame_params.frame_type = gf_group->frame_type[gf_index]; if (frame_update_type == LF_UPDATE) *pframe_qindex = gf_group->q_val[gf_index]; const struct lookahead_entry *buf = av1_lookahead_peek( cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); if (buf == NULL) break; tpl_frame->gf_picture = &buf->img; // Use filtered frame buffer if available. This will make tpl stats more // precise. FRAME_DIFF frame_diff; const YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff); if (tf_buf != NULL) { tpl_frame->gf_picture = tf_buf; } // 'cm->current_frame.frame_number' is the display number // of the current frame. // 'lookahead_index' is frame offset within the gf group. // 'lookahead_index + cm->current_frame.frame_number' // is the display index of the frame. tpl_frame->frame_display_index = lookahead_index + cm->current_frame.frame_number; assert(buf->display_idx == cpi->frame_index_set.show_frame_count + lookahead_index); if (frame_update_type != OVERLAY_UPDATE && frame_update_type != INTNL_OVERLAY_UPDATE) { tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; ++process_frame_count; } const int true_disp = (int)(tpl_frame->frame_display_index); av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, remapped_ref_idx); int refresh_mask = av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, gf_index, true_disp, ref_frame_map_pairs); // Make the frames marked as is_frame_non_ref to non-reference frames. 
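    // i.e. clear the refresh mask so such a frame never enters the reference
    // map seen by later frames in the group.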
if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0; int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); if (refresh_frame_map_index < REF_FRAMES && refresh_frame_map_index != INVALID_IDX) { ref_frame_map_pairs[refresh_frame_map_index].disp_order = AOMMAX(0, true_disp); ref_frame_map_pairs[refresh_frame_map_index].pyr_level = get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, cpi->ppi->gf_group.max_layer_depth); } for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; ++*tpl_group_frames; } const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL; int extend_frame_count = 0; int extend_frame_length = AOMMIN( tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval); int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] + gf_group->arf_src_offset[gop_length - 1] + 1; for (; gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length; ++gf_index) { TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE; frame_params.show_frame = frame_update_type != ARF_UPDATE && frame_update_type != INTNL_ARF_UPDATE; frame_params.show_existing_frame = frame_update_type == INTNL_OVERLAY_UPDATE; frame_params.frame_type = INTER_FRAME; int lookahead_index = frame_display_index; struct lookahead_entry *buf = av1_lookahead_peek( cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); if (buf == NULL) break; tpl_frame->gf_picture = &buf->img; tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; // 'cm->current_frame.frame_number' is the display number // of the current frame. // 'frame_display_index' is frame offset within the gf group. // 'frame_display_index + cm->current_frame.frame_number' // is the display index of the frame. tpl_frame->frame_display_index = frame_display_index + cm->current_frame.frame_number; ++process_frame_count; gf_group->update_type[gf_index] = LF_UPDATE; #if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) { if (cpi->oxcf.rc_cfg.mode == AOM_Q) { *pframe_qindex = cpi->oxcf.rc_cfg.cq_level; } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) { // TODO(angiebird): Find a more adaptive method to decide pframe_qindex // override the pframe_qindex in the second pass when bitrate accuracy // is on. We found that setting this pframe_qindex make the tpl stats // more stable. 
*pframe_qindex = 128; } } #endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS gf_group->q_val[gf_index] = *pframe_qindex; const int true_disp = (int)(tpl_frame->frame_display_index); av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, remapped_ref_idx); int refresh_mask = av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, gf_index, true_disp, ref_frame_map_pairs); int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); if (refresh_frame_map_index < REF_FRAMES && refresh_frame_map_index != INVALID_IDX) { ref_frame_map_pairs[refresh_frame_map_index].disp_order = AOMMAX(0, true_disp); ref_frame_map_pairs[refresh_frame_map_index].pyr_level = get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, cpi->ppi->gf_group.max_layer_depth); } for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1; tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1; tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1; tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1; if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; ++*tpl_group_frames; ++extend_frame_count; ++frame_display_index; } } void av1_init_tpl_stats(TplParams *const tpl_data) { tpl_data->ready = 0; set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, &tpl_data->tpl_bsize_1d); for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) { TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; tpl_frame->is_valid = 0; } for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue; memset(tpl_data->tpl_stats_pool[frame_idx], 0, tpl_frame->height * tpl_frame->width * sizeof(*tpl_frame->tpl_stats_ptr)); } } int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) { if (tpl_data->ready == 0) { return 0; } if (gf_frame_index >= MAX_TPL_FRAME_IDX) { // The sub-GOP length exceeds the TPL buffer capacity. // Hence the TPL related functions are disabled hereafter. return 0; } return tpl_data->tpl_frame[gf_frame_index].is_valid; } static inline int eval_gop_length(double *beta, int gop_eval) { switch (gop_eval) { case 1: // Allow larger GOP size if the base layer ARF has higher dependency // factor than the intermediate ARF and both ARFs have reasonably high // dependency factors. return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0; case 2: if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) return 1; // Don't shorten the gf interval else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) return 0; // Shorten the gf interval else return 2; // Cannot decide the gf interval, so redo the // tpl stats calculation. case 3: return beta[0] > 1.1; default: return 2; } } // TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down // the scope of input arguments. 
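// Pre-computes gf_group->q_val[] for every remaining frame in the GF group via
// av1_rc_pick_q_and_bounds(); the TPL setup above reads these values (e.g. as
// pframe_qindex for LF_UPDATE frames) when it models the group.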
void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, const EncodeFrameParams *const frame_params) { AV1_COMMON *cm = &cpi->common; GF_GROUP *gf_group = &cpi->ppi->gf_group; int bottom_index, top_index; if (cpi->use_ducky_encode) return; cm->current_frame.frame_type = frame_params->frame_type; for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; ++gf_index) { cm->current_frame.frame_type = gf_group->frame_type[gf_index]; cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds( cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index); } } static inline int skip_tpl_for_frame(const GF_GROUP *gf_group, int frame_idx, int gop_eval, int approx_gop_eval, int reduce_num_frames) { // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3, // tpl stats calculation is limited to ARFs from base layer and (base+1) // layer. const int num_arf_layers = (gop_eval == 2) ? 3 : 2; const int gop_length = get_gop_length(gf_group); if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || gf_group->update_type[frame_idx] == OVERLAY_UPDATE) return 1; // When approx_gop_eval = 1, skip tpl stats calculation for higher layer // frames and for frames beyond gop length. if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || frame_idx >= gop_length)) return 1; if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE && frame_idx < gop_length) return 1; return 0; } /*!\brief Compute the frame importance from TPL stats * * \param[in] tpl_data TPL struct * \param[in] gf_frame_index current frame index in the GOP * * \return frame_importance */ static double get_frame_importance(const TplParams *tpl_data, int gf_frame_index) { const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index]; const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; const int tpl_stride = tpl_frame->stride; double intra_cost_base = 0; double mc_dep_cost_base = 0; double cbcmp_base = 1; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; for (int row = 0; row < tpl_frame->mi_rows; row += step) { for (int col = 0; col < tpl_frame->mi_cols; col += step) { const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; double cbcmp = (double)this_stats->srcrf_dist; const int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); dist_scaled = AOMMAX(dist_scaled, 1); intra_cost_base += log(dist_scaled) * cbcmp; mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp; cbcmp_base += cbcmp; } } return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base); } int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, const EncodeFrameParams *const frame_params) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_tpl_setup_stats_time); #endif assert(cpi->gf_frame_index == 0); AV1_COMMON *cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; GF_GROUP *gf_group = &cpi->ppi->gf_group; EncodeFrameParams this_frame_params = *frame_params; TplParams *const tpl_data = &cpi->ppi->tpl_data; int approx_gop_eval = (gop_eval > 1); if (cpi->superres_mode != AOM_SUPERRES_NONE) { assert(cpi->superres_mode != AOM_SUPERRES_AUTO); 
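    // TPL stats are not collected when superres is active: reset the stats
    // buffers and return early so that stale data from a previous group is
    // never consumed.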
av1_init_tpl_stats(tpl_data); return 0; } cm->current_frame.frame_type = frame_params->frame_type; for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; ++gf_index) { cm->current_frame.frame_type = gf_group->frame_type[gf_index]; av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, gf_group->update_type[gf_index], gf_group->refbuf_state[gf_index], 0); memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame, sizeof(cpi->refresh_frame)); } int pframe_qindex; int tpl_gf_group_frames; init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames, &pframe_qindex); cpi->ppi->p_rc.base_layer_qp = pframe_qindex; av1_init_tpl_stats(tpl_data); TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers; if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) { aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, "Error allocating tpl data"); } tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy; tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy; av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, cm->width, cm->height); if (frame_params->frame_type == KEY_FRAME) { av1_init_mv_probs(cm); } av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs); const int num_planes = cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm); // As tpl module is called before the setting of speed features at frame // level, turning off this speed feature for the first GF group of the // key-frame interval is done here. int reduce_num_frames = cpi->sf.tpl_sf.reduce_num_frames && gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && gf_group->max_layer_depth > 2; // TPL processing is skipped for frames of type LF_UPDATE when // 'reduce_num_frames' is 1, which affects the r0 calcuation. Thus, a factor // to adjust r0 is used. The value of 1.6 corresponds to using ~60% of the // frames in the gf group on an average. tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0; // Backward propagation from tpl_group_frames to 1. 
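  // The first loop below runs the forward per-frame motion/mode search
  // (mc_flow_dispenser); the backward propagation of mc_dep_dist/mc_dep_rate
  // is then done by mc_flow_synthesizer() in the second, descending loop.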
for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames; ++frame_idx) { if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, reduce_num_frames)) continue; init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex); if (mt_info->num_workers > 1) { tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read; tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write; av1_mc_flow_dispenser_mt(cpi); } else { mc_flow_dispenser(cpi); } #if CONFIG_BITRATE_ACCURACY av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats); av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx); #endif // CONFIG_BITRATE_ACCURACY #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { int frame_coding_idx = av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx); rc_log_frame_stats(&cpi->rc_log, frame_coding_idx, &cpi->td.tpl_txfm_stats); } #endif // CONFIG_RATECTRL_LOG aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, num_planes); } for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= cpi->gf_frame_index; --frame_idx) { if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, reduce_num_frames)) continue; mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows, cm->mi_params.mi_cols); } av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, gf_group->update_type[cpi->gf_frame_index], gf_group->update_type[cpi->gf_frame_index], 0); cm->current_frame.frame_type = frame_params->frame_type; cm->show_frame = frame_params->show_frame; #if CONFIG_COLLECT_COMPONENT_TIMING // Record the time if the function returns. if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 || !gop_eval) end_timing(cpi, av1_tpl_setup_stats_time); #endif tpl_dealloc_temp_buffers(tpl_tmp_buffers); if (!approx_gop_eval) { tpl_data->ready = 1; } if (cpi->common.tiles.large_scale) return 0; if (gf_group->max_layer_depth_allowed == 0) return 1; if (!gop_eval) return 0; assert(gf_group->arf_index >= 0); double beta[2] = { 0.0 }; const int frame_idx_0 = gf_group->arf_index; const int frame_idx_1 = AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1); beta[0] = get_frame_importance(tpl_data, frame_idx_0); beta[1] = get_frame_importance(tpl_data, frame_idx_1); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_tpl_setup_stats_time); #endif return eval_gop_length(beta, gop_eval); } void av1_tpl_rdmult_setup(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; const int tpl_idx = cpi->gf_frame_index; assert( IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size)); TplParams *const tpl_data = &cpi->ppi->tpl_data; const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; if (!tpl_frame->is_valid) return; const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr; const int tpl_stride = tpl_frame->stride; const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int block_size = BLOCK_16X16; const int num_mi_w = mi_size_wide[block_size]; const int num_mi_h = mi_size_high[block_size]; const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; const double c = 1.2; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; // Loop through each 'block_size' X 'block_size' block. for (int row = 0; row < num_rows; row++) { for (int col = 0; col < num_cols; col++) { double intra_cost = 0.0, mc_dep_cost = 0.0; // Loop through each mi block. 
for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h; mi_row += step) { for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w; mi_col += step) { if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue; const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; int64_t mc_dep_delta = RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, this_stats->mc_dep_dist); intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS); mc_dep_cost += (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; } } const double rk = intra_cost / mc_dep_cost; const int index = row * num_cols + col; cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; } } } void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; GF_GROUP *gf_group = &cpi->ppi->gf_group; assert(IMPLIES(cpi->ppi->gf_group.size > 0, cpi->gf_frame_index < cpi->ppi->gf_group.size)); const int tpl_idx = cpi->gf_frame_index; const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const FRAME_TYPE frame_type = cm->current_frame.frame_type; if (tpl_idx >= MAX_TPL_FRAME_IDX) return; TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; if (!tpl_frame->is_valid) return; if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; const int mi_col_sr = coded_to_superres_mi(mi_col, cm->superres_scale_denominator); const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); const int sb_mi_width_sr = coded_to_superres_mi( mi_size_wide[sb_size], cm->superres_scale_denominator); const int bsize_base = BLOCK_16X16; const int num_mi_w = mi_size_wide[bsize_base]; const int num_mi_h = mi_size_high[bsize_base]; const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w; const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h; int row, col; double base_block_count = 0.0; double log_sum = 0.0; for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); base_block_count += 1.0; } } const CommonQuantParams *quant_params = &cm->quant_params; const int orig_qindex_rdmult = quant_params->base_qindex + quant_params->y_dc_delta_q; const int orig_rdmult = av1_compute_rd_mult( orig_qindex_rdmult, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); const int new_qindex_rdmult = quant_params->base_qindex + x->rdmult_delta_qindex + quant_params->y_dc_delta_q; const int new_rdmult = av1_compute_rd_mult( new_qindex_rdmult, cm->seq_params->bit_depth, cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi), cpi->oxcf.tune_cfg.tuning); const double scaling_factor = (double)new_rdmult / (double)orig_rdmult; double scale_adj = log(scaling_factor) - log_sum / 
base_block_count; scale_adj = exp_bounded(scale_adj); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; cpi->ppi->tpl_sb_rdmult_scaling_factors[index] = scale_adj * cpi->tpl_rdmult_scaling_factors[index]; } } } double av1_exponential_entropy(double q_step, double b) { b = AOMMAX(b, TPL_EPSILON); double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); return -log2(1 - z) - z * log2(z) / (1 - z); } double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { // zero bin's size is zero_bin_ratio * q_step // non-zero bin's size is q_step b = AOMMAX(b, TPL_EPSILON); double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); double h = av1_exponential_entropy(q_step, b); double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1); return r; } #if CONFIG_BITRATE_ACCURACY double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num) { double zero_bin_ratio = 2; double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; double est_rate = 0; // dc coeff est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio); // ac coeff for (int i = 1; i < coeff_num; ++i) { est_rate += av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio); } est_rate *= block_count; return est_rate; } #endif // CONFIG_BITRATE_ACCURACY double av1_estimate_coeff_entropy(double q_step, double b, double zero_bin_ratio, int qcoeff) { b = AOMMAX(b, TPL_EPSILON); int abs_qcoeff = abs(qcoeff); double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); if (abs_qcoeff == 0) { double r = -log2(1 - z0); return r; } else { double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z); return r; } } #if CONFIG_RD_COMMAND void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) { FILE *fptr = fopen(filepath, "r"); fscanf(fptr, "%d", &rd_command->frame_count); rd_command->frame_index = 0; for (int i = 0; i < rd_command->frame_count; ++i) { int option; fscanf(fptr, "%d", &option); rd_command->option_ls[i] = (RD_OPTION)option; if (option == RD_OPTION_SET_Q) { fscanf(fptr, "%d", &rd_command->q_index_ls[i]); } else if (option == RD_OPTION_SET_Q_RDMULT) { fscanf(fptr, "%d", &rd_command->q_index_ls[i]); fscanf(fptr, "%d", &rd_command->rdmult_ls[i]); } } fclose(fptr); } #endif // CONFIG_RD_COMMAND double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) { if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) { return 1; } const double frame_importance = get_frame_importance(tpl_data, gf_frame_index); return sqrt(1 / frame_importance); } int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio, aom_bit_depth_t bit_depth) { const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth); const double target_qstep = leaf_qstep * qstep_ratio; int qindex = leaf_qindex; if (qstep_ratio < 1.0) { for (qindex = leaf_qindex; qindex > 0; --qindex) { const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth); if (qstep <= target_qstep) break; } } else { for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) { const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth); if (qstep >= target_qstep) break; } } return qindex; } int av1_tpl_get_q_index(const TplParams 
*tpl_data, int gf_frame_index, int leaf_qindex, aom_bit_depth_t bit_depth) { const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index); return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth); } #if CONFIG_BITRATE_ACCURACY void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget, int show_frame_count) { av1_zero(*vbr_rc_info); vbr_rc_info->ready = 0; vbr_rc_info->total_bit_budget = total_bit_budget; vbr_rc_info->show_frame_count = show_frame_count; const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1, 0.94559, 1, 1, 0.94559 }; // TODO(angiebird): Based on the previous code, only the scale factor 0.94559 // will be used in most of the cases with --limi=17. Figure out if the // following scale factors works better. // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1, // 1.10199, 1, 1, // 0.16393 }; const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 }; memcpy(vbr_rc_info->scale_factors, scale_factors, sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES); memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors, sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES); vbr_rc_reset_gop_data(vbr_rc_info); #if CONFIG_THREE_PASS // TODO(angiebird): Explain why we use -1 here vbr_rc_info->cur_gop_idx = -1; vbr_rc_info->gop_count = 0; vbr_rc_info->total_frame_count = 0; #endif // CONFIG_THREE_PASS } #if CONFIG_THREE_PASS int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info, int gf_frame_index) { int gop_idx = vbr_rc_info->cur_gop_idx; int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx]; return gop_start_idx + gf_frame_index; } void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info, const TPL_INFO *tpl_info) { int gop_start_idx = vbr_rc_info->total_frame_count; vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx; vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length; assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES); for (int i = 0; i < tpl_info->gf_length; ++i) { vbr_rc_info->txfm_stats_list[gop_start_idx + i] = tpl_info->txfm_stats_list[i]; vbr_rc_info->qstep_ratio_list[gop_start_idx + i] = tpl_info->qstep_ratio_ls[i]; vbr_rc_info->update_type_list[gop_start_idx + i] = tpl_info->update_type_list[i]; } vbr_rc_info->total_frame_count += tpl_info->gf_length; vbr_rc_info->gop_count++; } #endif // CONFIG_THREE_PASS void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info, int gop_showframe_count) { vbr_rc_info->gop_showframe_count = gop_showframe_count; vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget * gop_showframe_count / vbr_rc_info->show_frame_count; } void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count, const double *qstep_ratio_list, aom_bit_depth_t bit_depth, int *q_index_list) { for (int i = 0; i < frame_count; ++i) { q_index_list[i] = av1_get_q_index_from_qstep_ratio( base_q_index, qstep_ratio_list[i], bit_depth); } } double av1_vbr_rc_info_estimate_gop_bitrate( int base_q_index, aom_bit_depth_t bit_depth, const double *update_type_scale_factors, int frame_count, const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, const TplTxfmStats *stats_list, int *q_index_list, double *estimated_bitrate_byframe) { av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list, bit_depth, q_index_list); double estimated_gop_bitrate = 0; for (int frame_index = 0; frame_index < frame_count; frame_index++) { const TplTxfmStats *frame_stats = 
&stats_list[frame_index]; double frame_bitrate = 0; if (frame_stats->ready) { int q_index = q_index_list[frame_index]; frame_bitrate = av1_laplace_estimate_frame_rate( q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean, frame_stats->coeff_num); } FRAME_UPDATE_TYPE update_type = update_type_list[frame_index]; estimated_gop_bitrate += frame_bitrate * update_type_scale_factors[update_type]; if (estimated_bitrate_byframe != NULL) { estimated_bitrate_byframe[frame_index] = frame_bitrate; } } return estimated_gop_bitrate; } int av1_vbr_rc_info_estimate_base_q( double bit_budget, aom_bit_depth_t bit_depth, const double *update_type_scale_factors, int frame_count, const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, const TplTxfmStats *stats_list, int *q_index_list, double *estimated_bitrate_byframe) { int q_max = 255; // Maximum q value. int q_min = 0; // Minimum q value. int q = (q_max + q_min) / 2; double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate( q_max, bit_depth, update_type_scale_factors, frame_count, update_type_list, qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate( q_min, bit_depth, update_type_scale_factors, frame_count, update_type_list, qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); while (q_min + 1 < q_max) { double estimate = av1_vbr_rc_info_estimate_gop_bitrate( q, bit_depth, update_type_scale_factors, frame_count, update_type_list, qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); if (estimate > bit_budget) { q_min = q; q_min_estimate = estimate; } else { q_max = q; q_max_estimate = estimate; } q = (q_max + q_min) / 2; } // Pick the estimate that lands closest to the budget. if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) { q = q_max; } else { q = q_min; } // Update q_index_list and vbr_rc_info. av1_vbr_rc_info_estimate_gop_bitrate( q, bit_depth, update_type_scale_factors, frame_count, update_type_list, qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); return q; } void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info, const TplParams *tpl_data, const GF_GROUP *gf_group, aom_bit_depth_t bit_depth) { vbr_rc_info->q_index_list_ready = 1; double gop_bit_budget = vbr_rc_info->gop_bit_budget; for (int i = 0; i < gf_group->size; i++) { vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i); } double mv_bits = 0; for (int i = 0; i < gf_group->size; i++) { double frame_mv_bits = 0; if (av1_tpl_stats_ready(tpl_data, i)) { TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i]; frame_mv_bits = av1_tpl_compute_frame_mv_entropy( tpl_frame, tpl_data->tpl_stats_block_mis_log2); FRAME_UPDATE_TYPE updae_type = gf_group->update_type[i]; mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[updae_type]; } } mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget); gop_bit_budget -= mv_bits; vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q( gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size, gf_group->update_type, vbr_rc_info->qstep_ratio_list, tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL); } #endif // CONFIG_BITRATE_ACCURACY // Use upper and left neighbor block as the reference MVs. // Compute the minimum difference between current MV and reference MV. 
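// As an illustrative example (hypothetical values): if the current block's MV
// is (6, -2) (L1 magnitude 8), the up neighbor's MV is (5, -2) and the left
// neighbor's MV is (-4, 3), then the up difference is (1, 0) with error 1 and
// the left difference is (10, -5) with error 15. Since 1 is smaller than both
// 15 and the current magnitude 8, the function below returns the up
// difference (1, 0). If neither neighbor difference beats the current MV's
// own magnitude, the current MV is returned unchanged.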
int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col, int step, int tpl_stride, int right_shift) { const TplDepStats *tpl_stats = &tpl_frame ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)]; int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]]; int current_mv_magnitude = abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col); // Retrieve the up and left neighbors. int up_error = INT_MAX; int_mv up_mv_diff; if (row - step >= 0) { tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( row - step, col, tpl_stride, right_shift)]; up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]]; up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row; up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col; up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col); } int left_error = INT_MAX; int_mv left_mv_diff; if (col - step >= 0) { tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( row, col - step, tpl_stride, right_shift)]; left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]]; left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row; left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col; left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col); } // Return the MV with the minimum distance from current. if (up_error < left_error && up_error < current_mv_magnitude) { return up_mv_diff; } else if (left_error < up_error && left_error < current_mv_magnitude) { return left_mv_diff; } return current_mv; } /* Compute the entropy of motion vectors for a single frame. */ double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame, uint8_t right_shift) { if (!tpl_frame->is_valid) { return 0; } int count_row[500] = { 0 }; int count_col[500] = { 0 }; int n = 0; // number of MVs to process const int tpl_stride = tpl_frame->stride; const int step = 1 << right_shift; for (int row = 0; row < tpl_frame->mi_rows; row += step) { for (int col = 0; col < tpl_frame->mi_cols; col += step) { int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step, tpl_stride, right_shift); count_row[clamp(mv.as_mv.row, 0, 499)] += 1; count_col[clamp(mv.as_mv.row, 0, 499)] += 1; n += 1; } } // Estimate the bits used using the entropy formula. double rate_row = 0; double rate_col = 0; for (int i = 0; i < 500; i++) { if (count_row[i] != 0) { double p = count_row[i] / (double)n; rate_row += count_row[i] * -log2(p); } if (count_col[i] != 0) { double p = count_col[i] / (double)n; rate_col += count_col[i] * -log2(p); } } return rate_row + rate_col; } aom-3.12.1/av1/encoder/tpl_model.h000066400000000000000000000654531477627663500166540ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ #define AOM_AV1_ENCODER_TPL_MODEL_H_ #ifdef __cplusplus extern "C" { #endif /*!\cond */ struct AV1_PRIMARY; struct AV1_COMP; struct AV1_SEQ_CODING_TOOLS; struct EncodeFrameParams; struct EncodeFrameInput; struct GF_GROUP; struct ThreadData; struct TPL_INFO; #include "config/aom_config.h" #include "aom_scale/yv12config.h" #include "aom_util/aom_pthread.h" #include "av1/common/mv.h" #include "av1/common/scale.h" #include "av1/encoder/block.h" #include "av1/encoder/lookahead.h" #include "av1/encoder/ratectrl.h" static inline BLOCK_SIZE convert_length_to_bsize(int length) { switch (length) { case 64: return BLOCK_64X64; case 32: return BLOCK_32X32; case 16: return BLOCK_16X16; case 8: return BLOCK_8X8; case 4: return BLOCK_4X4; default: assert(0 && "Invalid block size for tpl model"); return BLOCK_16X16; } } typedef struct AV1TplRowMultiThreadSync { #if CONFIG_MULTITHREAD // Synchronization objects for top-right dependency. pthread_mutex_t *mutex_; pthread_cond_t *cond_; #endif // Buffer to store the macroblock whose encoding is complete. // num_finished_cols[i] stores the number of macroblocks which finished // encoding in the ith macroblock row. int *num_finished_cols; // Number of extra macroblocks of the top row to be complete for encoding // of the current macroblock to start. A value of 1 indicates top-right // dependency. int sync_range; // Number of macroblock rows. int rows; // Number of threads processing the current tile. int num_threads_working; } AV1TplRowMultiThreadSync; typedef struct AV1TplRowMultiThreadInfo { // Initialized to false, set to true by the worker thread that encounters an // error in order to abort the processing of other worker threads. bool tpl_mt_exit; #if CONFIG_MULTITHREAD // Mutex lock object used for error handling. pthread_mutex_t *mutex_; #endif // Row synchronization related function pointers. void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c); void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, int cols); } AV1TplRowMultiThreadInfo; // TODO(jingning): This needs to be cleaned up next. // TPL stats buffers are prepared for every frame in the GOP, // including (internal) overlays and (internal) arfs. // In addition, frames in the lookahead that are outside of the GOP // are also used. // Thus it should use // (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) = // MAX_LAG_BUFFERS + (# overlays) // 2 * MAX_LAG_BUFFERS is therefore a safe estimate. // TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER #define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS) // The first REF_FRAMES + 1 buffers are reserved. 
// tpl_data->tpl_frame starts after REF_FRAMES + 1 #define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1) #define TPL_DEP_COST_SCALE_LOG2 4 #define TPL_EPSILON 0.0000001 typedef struct TplTxfmStats { int ready; // Whether abs_coeff_mean is ready double abs_coeff_sum[256]; // Assume we are using 16x16 transform block double abs_coeff_mean[256]; int txfm_block_count; int coeff_num; } TplTxfmStats; typedef struct { uint8_t *predictor8; int16_t *src_diff; tran_low_t *coeff; tran_low_t *qcoeff; tran_low_t *dqcoeff; } TplBuffers; typedef struct TplDepStats { int64_t srcrf_sse; int64_t srcrf_dist; int64_t recrf_sse; int64_t recrf_dist; int64_t intra_sse; int64_t intra_dist; int64_t cmp_recrf_dist[2]; int64_t mc_dep_rate; int64_t mc_dep_dist; int64_t pred_error[INTER_REFS_PER_FRAME]; int32_t intra_cost; int32_t inter_cost; int32_t srcrf_rate; int32_t recrf_rate; int32_t intra_rate; int32_t cmp_recrf_rate[2]; int_mv mv[INTER_REFS_PER_FRAME]; int8_t ref_frame_index[2]; } TplDepStats; typedef struct TplDepFrame { uint8_t is_valid; TplDepStats *tpl_stats_ptr; const YV12_BUFFER_CONFIG *gf_picture; YV12_BUFFER_CONFIG *rec_picture; int ref_map_index[REF_FRAMES]; int stride; int width; int height; int mi_rows; int mi_cols; int base_rdmult; uint32_t frame_display_index; // When set, SAD metric is used for intra and inter mode decision. int use_pred_sad; } TplDepFrame; /*!\endcond */ /*! * \brief Params related to temporal dependency model. */ typedef struct TplParams { /*! * Whether the tpl stats is ready. */ int ready; /*! * Block granularity of tpl score storage. */ uint8_t tpl_stats_block_mis_log2; /*! * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16. */ uint8_t tpl_bsize_1d; /*! * Buffer to store the frame level tpl information for each frame in a gf * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf * group */ TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS]; /*! * Buffer to store tpl stats at block granularity. * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf * group. */ TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS]; /*! * Pointer to the buffer which stores tpl transform stats per frame. * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group. * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when * tpl is enabled. */ TplTxfmStats *txfm_stats_list; /*! * Buffer to store tpl reconstructed frame. * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group. */ YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS]; /*! * Pointer to tpl_stats_buffer. */ TplDepFrame *tpl_frame; /*! * Scale factors for the current frame. */ struct scale_factors sf; /*! * GF group index of the current frame. */ int frame_idx; /*! * Array of pointers to the frame buffers holding the source frame. * src_ref_frame[i] stores the pointer to the source frame of the ith * reference frame type. */ const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME]; /*! * Array of pointers to the frame buffers holding the tpl reconstructed frame. * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith * reference frame type. */ const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME]; /*! * Parameters related to synchronization for top-right dependency in row based * multi-threading of tpl */ AV1TplRowMultiThreadSync tpl_mt_sync; /*! * Frame border for tpl frame. */ int border_in_pixels; /*! * Factor to adjust r0 if TPL uses a subset of frames in the gf group. 
*/ double r0_adjust_factor; } TplParams; #if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG #define VBR_RC_INFO_MAX_FRAMES 500 #endif // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG #if CONFIG_BITRATE_ACCURACY /*! * \brief This structure stores information needed for bitrate accuracy * experiment. */ typedef struct { int ready; double total_bit_budget; // The total bit budget of the entire video int show_frame_count; // Number of show frames in the entire video int gop_showframe_count; // The number of show frames in the current gop double gop_bit_budget; // The bitbudget for the current gop double scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve the // budget estimation double mv_scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve // MV entropy estimation // === Below this line are GOP related data that will be updated per GOP === int base_q_index; // Stores the base q index. int q_index_list_ready; int q_index_list[VBR_RC_INFO_MAX_FRAMES]; // q indices for the current // GOP // Array to store qstep_ratio for each frame in a GOP double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES]; #if CONFIG_THREE_PASS TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES]; FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES]; int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES]; int gop_length_list[VBR_RC_INFO_MAX_FRAMES]; int cur_gop_idx; int total_frame_count; int gop_count; #endif // CONFIG_THREE_PASS } VBR_RATECTRL_INFO; static inline void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) { vbr_rc_info->q_index_list_ready = 0; av1_zero(vbr_rc_info->q_index_list); } void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget, int show_frame_count); int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info, int gf_frame_index); void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info, const struct TPL_INFO *tpl_info); void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info, int gop_showframe_count); void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count, const double *qstep_ratio_list, aom_bit_depth_t bit_depth, int *q_index_list); /*!\brief Update q_index_list in vbr_rc_info based on tpl stats * * \param[out] vbr_rc_info Rate control info for BITRATE_ACCURACY * experiment * \param[in] tpl_data TPL struct * \param[in] gf_group GOP struct * \param[in] bit_depth bit depth */ void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info, const TplParams *tpl_data, const struct GF_GROUP *gf_group, aom_bit_depth_t bit_depth); /* *!\brief Compute the number of bits needed to encode a GOP * * \param[in] base_q_index base layer q_index * \param[in] bit_depth bit depth * \param[in] update_type_scale_factors array of scale factors for each * update_type * \param[in] frame_count size of update_type_list, * qstep_ratio_list stats_list, * q_index_list and * estimated_bitrate_byframe * \param[in] update_type_list array of update_type, one per frame * \param[in] qstep_ratio_list array of qstep_ratio, one per frame * \param[in] stats_list array of transform stats, one per * frame * \param[out] q_index_list array of q_index, one per frame * \param[out] estimated_bitrate_byframe array to keep track of frame * bitrate * * \return The estimated GOP bitrate. 
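 *
 * A minimal usage sketch (illustrative only; the array sizes and literal
 * values below are hypothetical, not taken from the encoder):
 *
 *   int q_index_list[2];
 *   const double qstep_ratio_list[2] = { 1.0, 1.3 };
 *   double gop_bits = av1_vbr_rc_info_estimate_gop_bitrate(
 *       base_q_index, AOM_BITS_8, vbr_rc_info->scale_factors, 2,
 *       update_type_list, qstep_ratio_list, stats_list, q_index_list, NULL);
 *
 * Passing NULL as the last argument skips the per-frame bitrate breakdown;
 * q_index_list is always filled with the q index derived from base_q_index
 * and each frame's qstep_ratio.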
* */ double av1_vbr_rc_info_estimate_gop_bitrate( int base_q_index, aom_bit_depth_t bit_depth, const double *update_type_scale_factors, int frame_count, const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, const TplTxfmStats *stats_list, int *q_index_list, double *estimated_bitrate_byframe); /*!\brief Estimate the optimal base q index for a GOP. * * This function uses a binary search to find base layer q index to * achieve the specified bit budget. * * \param[in] bit_budget target bit budget * \param[in] bit_depth bit depth * \param[in] update_type_scale_factors array of scale factors for each * update_type * \param[in] frame_count size of update_type_list, qstep_ratio_list * stats_list, q_index_list and * estimated_bitrate_byframe * \param[in] update_type_list array of update_type, one per frame * \param[in] qstep_ratio_list array of qstep_ratio, one per frame * \param[in] stats_list array of transform stats, one per frame * \param[out] q_index_list array of q_index, one per frame * \param[out] estimated_bitrate_byframe Array to keep track of frame * bitrate * * \return Returns the optimal base q index to use. */ int av1_vbr_rc_info_estimate_base_q( double bit_budget, aom_bit_depth_t bit_depth, const double *update_type_scale_factors, int frame_count, const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, const TplTxfmStats *stats_list, int *q_index_list, double *estimated_bitrate_byframe); #endif // CONFIG_BITRATE_ACCURACY #if CONFIG_RD_COMMAND typedef enum { RD_OPTION_NONE, RD_OPTION_SET_Q, RD_OPTION_SET_Q_RDMULT } RD_OPTION; typedef struct RD_COMMAND { RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS]; int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS]; int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS]; int frame_count; int frame_index; } RD_COMMAND; void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command); #endif // CONFIG_RD_COMMAND /*!\brief Allocate buffers used by tpl model * * \param[in] Top-level encode/decode structure * \param[in] lag_in_frames number of lookahead frames * * \param[out] tpl_data tpl data structure */ void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi, CommonModeInfoParams *const mi_params, int width, int height, int byte_alignment, int lag_in_frames); static inline void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) { aom_free(tpl_tmp_buffers->predictor8); tpl_tmp_buffers->predictor8 = NULL; aom_free(tpl_tmp_buffers->src_diff); tpl_tmp_buffers->src_diff = NULL; aom_free(tpl_tmp_buffers->coeff); tpl_tmp_buffers->coeff = NULL; aom_free(tpl_tmp_buffers->qcoeff); tpl_tmp_buffers->qcoeff = NULL; aom_free(tpl_tmp_buffers->dqcoeff); tpl_tmp_buffers->dqcoeff = NULL; } static inline bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers, uint8_t tpl_bsize_1d) { // Number of pixels in a tpl block const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d; // Allocate temporary buffers used in mode estimation. 
tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign( 32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8)); tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign( 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff)); tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign( 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff)); tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign( 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff)); tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign( 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff)); if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff && tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff && tpl_tmp_buffers->dqcoeff)) { tpl_dealloc_temp_buffers(tpl_tmp_buffers); return false; } return true; } /*!\brief Implements temporal dependency modelling for a GOP (GF/ARF * group) and selects between 16 and 32 frame GOP structure. * *\ingroup tpl_modelling * * \param[in] cpi Top - level encoder instance structure * \param[in] gop_eval Flag if it is in the GOP length decision stage * \param[in] frame_params Per frame encoding parameters * * \return Indicates whether or not we should use a longer GOP length. */ int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval, const struct EncodeFrameParams *const frame_params); /*!\cond */ void av1_tpl_preload_rc_estimate( struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); void av1_init_tpl_stats(TplParams *const tpl_data); int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index); void av1_tpl_rdmult_setup(struct AV1_COMP *cpi); void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col); void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size); /*!\brief Compute the entropy of an exponential probability distribution * function (pdf) subjected to uniform quantization. * * pdf(x) = b*exp(-b*x) * *\ingroup tpl_modelling * * \param[in] q_step quantizer step size * \param[in] b parameter of exponential distribution * * \return entropy cost */ double av1_exponential_entropy(double q_step, double b); /*!\brief Compute the entropy of a Laplace probability distribution * function (pdf) subjected to non-uniform quantization. * * pdf(x) = 0.5*b*exp(-0.5*b*|x|) * *\ingroup tpl_modelling * * \param[in] q_step quantizer step size for non-zero bins * \param[in] b parameter of Laplace distribution * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step * * \return entropy cost */ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); #if CONFIG_BITRATE_ACCURACY /*!\brief Compute the frame rate using transform block stats * * Assume each position i in the transform block is of Laplace distribution * with mean absolute deviation abs_coeff_mean[i] * * Then we can use av1_laplace_entropy() to compute the expected frame * rate. 
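 *
 * Concretely (mirroring the implementation in tpl_model.c), with
 * zero_bin_ratio = 2, dc_q = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4 and
 * ac_q = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4, the estimate is
 *
 *   rate = block_count * ( av1_laplace_entropy(dc_q, abs_coeff_mean[0], 2)
 *            + sum_{i = 1 .. coeff_num - 1}
 *                av1_laplace_entropy(ac_q, abs_coeff_mean[i], 2) )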
* *\ingroup tpl_modelling * * \param[in] q_index quantizer index * \param[in] block_count number of transform blocks * \param[in] abs_coeff_mean array of mean absolute deviation * \param[in] coeff_num number of coefficients per transform block * * \return expected frame rate */ double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num); #endif // CONFIG_BITRATE_ACCURACY /* *!\brief Init TplTxfmStats * * \param[in] tpl_txfm_stats a structure for storing transform stats * */ void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats); #if CONFIG_BITRATE_ACCURACY /* *!\brief Accumulate TplTxfmStats * * \param[in] sub_stats a structure for storing sub transform stats * \param[out] accumulated_stats a structure for storing accumulated *transform stats * */ void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, TplTxfmStats *accumulated_stats); /* *!\brief Record a transform block into TplTxfmStats * * \param[in] tpl_txfm_stats A structure for storing transform stats * \param[out] coeff An array of transform coefficients. Its size * should equal to tpl_txfm_stats.coeff_num. * */ void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, const tran_low_t *coeff); /* *!\brief Update abs_coeff_mean and ready of txfm_stats * If txfm_block_count > 0, this function will use abs_coeff_sum and * txfm_block_count to compute abs_coeff_mean. Moreover, reday flag * will be set to one. * * \param[in] txfm_stats A structure for storing transform stats */ void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats); #endif // CONFIG_BITRATE_ACCURACY /*!\brief Estimate coefficient entropy using Laplace dsitribution * *\ingroup tpl_modelling * * This function is equivalent to -log2(laplace_prob()), where laplace_prob() *is defined in tpl_model_test.cc * * \param[in] q_step quantizer step size without any scaling * \param[in] b mean absolute deviation of Laplace *distribution \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio ** q_step \param[in] qcoeff quantized coefficient * * \return estimated coefficient entropy * */ double av1_estimate_coeff_entropy(double q_step, double b, double zero_bin_ratio, int qcoeff); // TODO(angiebird): Add doxygen description here. int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, int64_t srcrf_dist, int pix_num); /*!\brief Compute the overlap area between two blocks with the same size * *\ingroup tpl_modelling * * If there is no overlap, this function should return zero. 
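 * For example (hypothetical coordinates): two 16x16 blocks at
 * (row_a, col_a) = (0, 0) and (row_b, col_b) = (8, 12) overlap in an 8x4
 * region, so the expected result is 32; moving the second block to (16, 0) or
 * beyond leaves no overlap and the result is 0.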
* * \param[in] row_a row position of the first block * \param[in] col_a column position of the first block * \param[in] row_b row position of the second block * \param[in] col_b column position of the second block * \param[in] width width shared by the two blocks * \param[in] height height shared by the two blocks * * \return overlap area of the two blocks */ int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, int height); /*!\brief Get current frame's q_index from tpl stats and leaf_qindex * * \param[in] tpl_data TPL struct * \param[in] gf_frame_index current frame index in the GOP * \param[in] leaf_qindex q index of leaf frame * \param[in] bit_depth bit depth * * \return q_index */ int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index, int leaf_qindex, aom_bit_depth_t bit_depth); /*!\brief Compute the ratio between arf q step and the leaf q step based on * TPL stats * * \param[in] tpl_data TPL struct * \param[in] gf_frame_index current frame index in the GOP * \param[in] leaf_qindex q index of leaf frame * \param[in] bit_depth bit depth * * \return qstep_ratio */ double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index); /*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep * * \param[in] leaf_qindex q index of leaf frame * \param[in] qstep_ratio step ratio between target q index and * leaf q index \param[in] bit_depth bit depth * * \return q_index */ int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio, aom_bit_depth_t bit_depth); /*!\brief Improve the motion vector estimation by taking neighbors into * account. * * Use the upper and left neighbor block as the reference MVs. * Compute the minimum difference between current MV and reference MV. * * \param[in] tpl_frame Tpl frame struct * \param[in] row Current row * \param[in] col Current column * \param[in] step Step parameter for av1_tpl_ptr_pos * \param[in] tpl_stride Stride parameter for av1_tpl_ptr_pos * \param[in] right_shift Right shift parameter for * av1_tpl_ptr_pos */ int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col, int step, int tpl_stride, int right_shift); /*!\brief Compute the entropy of motion vectors for a single frame. * * \param[in] tpl_frame TPL frame struct * \param[in] right_shift right shift value for step * * \return Bits used by the motion vectors for one frame. 
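 *
 * The returned value is an empirical entropy estimate: the per-block MV
 * differences (see av1_compute_mv_difference) are histogrammed per component
 * into 500 bins, and the bit count is accumulated as
 * count[i] * -log2(count[i] / n) over the occupied bins of the row and column
 * histograms, where n is the number of blocks visited at the given step size.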
*/ double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame, uint8_t right_shift); #if CONFIG_RATECTRL_LOG typedef struct { int coding_frame_count; int base_q_index; // Encode decision int q_index_list[VBR_RC_INFO_MAX_FRAMES]; double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES]; FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES]; // Frame stats TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES]; // Estimated encode results double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; // Actual encode results double act_rate_list[VBR_RC_INFO_MAX_FRAMES]; double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; } RATECTRL_LOG; static inline void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); } static inline void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index, const TplTxfmStats *txfm_stats) { rc_log->txfm_stats_list[coding_index] = *txfm_stats; } #if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY static inline void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, int coding_index, double qstep_ratio, int q_index, FRAME_UPDATE_TYPE update_type) { rc_log->qstep_ratio_list[coding_index] = qstep_ratio; rc_log->q_index_list[coding_index] = q_index; rc_log->update_type_list[coding_index] = update_type; const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index]; rc_log->est_coeff_rate_list[coding_index] = 0; if (txfm_stats->ready) { rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate( q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean, txfm_stats->coeff_num); } } #endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY static inline void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index, double act_rate, double act_coeff_rate) { rc_log->act_rate_list[coding_index] = act_rate; rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate; } static inline void rc_log_record_chunk_info(RATECTRL_LOG *rc_log, int base_q_index, int coding_frame_count) { rc_log->base_q_index = base_q_index; rc_log->coding_frame_count = coding_frame_count; } static inline void rc_log_show(const RATECTRL_LOG *rc_log) { printf("= chunk 1\n"); printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count, rc_log->base_q_index); printf("= frame %d\n", rc_log->coding_frame_count); for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count; coding_idx++) { printf( "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f " "act_coeff_rate %f act_rate %f\n", coding_idx, rc_log->update_type_list[coding_idx], rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx], rc_log->est_coeff_rate_list[coding_idx], rc_log->act_coeff_rate_list[coding_idx], rc_log->act_rate_list[coding_idx]); } } #endif // CONFIG_RATECTRL_LOG /*!\endcond */ #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_TPL_MODEL_H_ aom-3.12.1/av1/encoder/tune_butteraugli.c000066400000000000000000000305611477627663500202420ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "av1/encoder/tune_butteraugli.h" #include "aom_dsp/butteraugli.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/extend.h" #include "av1/encoder/var_based_part.h" static const int resize_factor = 2; static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *recon, const double K) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; const CommonModeInfoParams *const mi_params = &cm->mi_params; const aom_color_range_t color_range = seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_crop_width; const int height = source->y_crop_height; const int ss_x = source->subsampling_x; const int ss_y = source->subsampling_y; float *diffmap; CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap))); if (!aom_calc_butteraugli(source, recon, bit_depth, seq_params->matrix_coefficients, color_range, diffmap)) { aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to calculate Butteraugli distances."); } const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor; const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor; const int num_cols = (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w; const int num_rows = (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h; const int block_w = num_mi_w << 2; const int block_h = num_mi_h << 2; double log_sum = 0.0; double blk_count = 0.0; // Loop through each block. for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; const int y_start = row * block_h; const int x_start = col * block_w; float dbutteraugli = 0.0f; float dmse = 0.0f; float px_count = 0.0f; // Loop through each pixel. for (int y = y_start; y < y_start + block_h && y < height; y++) { for (int x = x_start; x < x_start + block_w && x < width; x++) { dbutteraugli += powf(diffmap[y * width + x], 12.0f); float px_diff = source->y_buffer[y * source->y_stride + x] - recon->y_buffer[y * recon->y_stride + x]; dmse += px_diff * px_diff; px_count += 1.0f; } } const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y), (height + ss_y) >> ss_y); for (int y = y_start >> ss_y; y < y_end; y++) { const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x), (width + ss_x) >> ss_x); for (int x = x_start >> ss_x; x < x_end; x++) { const int src_px_index = y * source->uv_stride + x; const int recon_px_index = y * recon->uv_stride + x; const float px_diff_u = (float)(source->u_buffer[src_px_index] - recon->u_buffer[recon_px_index]); const float px_diff_v = (float)(source->v_buffer[src_px_index] - recon->v_buffer[recon_px_index]); dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v; px_count += 2.0f; } } dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f); dmse = dmse / px_count; const float eps = 0.01f; double weight; if (dbutteraugli < eps || dmse < eps) { weight = -1.0; } else { blk_count += 1.0; weight = dmse / dbutteraugli; weight = AOMMIN(weight, 5.0); weight += K; log_sum += log(weight); } cpi->butteraugli_info.rdmult_scaling_factors[index] = weight; } } // Geometric average of the weights. 
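// log_sum currently holds sum(log(weight)) over the blk_count blocks that
// received a valid weight, so exp(log_sum / blk_count) is their geometric
// mean. Each valid weight is then normalized by that mean and clamped to
// [0.4, 2.5]; blocks that were flagged with a non-positive weight fall back
// to 1.0.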
log_sum = exp(log_sum / blk_count); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index]; if (*weight <= 0.0) { *weight = 1.0; } else { *weight /= log_sum; } *weight = AOMMIN(*weight, 2.5); *weight = AOMMAX(*weight, 0.4); } } aom_free(diffmap); } void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult) { assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI); if (!cpi->butteraugli_info.recon_set) { return; } const AV1_COMMON *const cm = &cpi->common; const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize]; const int num_mi_h = mi_size_high[butteraugli_rdo_bsize]; const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; double num_of_mi = 0.0; double geom_mean_of_scale = 0.0; for (int row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (int col = mi_col / num_mi_h; col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; geom_mean_of_scale += log(cpi->butteraugli_info.rdmult_scaling_factors[index]); num_of_mi += 1.0; } } geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(&x->errorperbit, *rdmult); } static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h) { for (int row = 0; row < h; row++) { memcpy(dst, src, w); src += src_stride; dst += dst_stride; } } static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int width, int height) { copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width, height); const int width_uv = (width + src->subsampling_x) >> src->subsampling_x; const int height_uv = (height + src->subsampling_y) >> src->subsampling_y; copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, width_uv, height_uv); copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, width_uv, height_uv); } static void zero_plane(uint8_t *dst, int dst_stride, int h) { for (int row = 0; row < h; row++) { memset(dst, 0, dst_stride); dst += dst_stride; } } static void zero_img(YV12_BUFFER_CONFIG *dst) { zero_plane(dst->y_buffer, dst->y_stride, dst->y_height); zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height); zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height); } void av1_setup_butteraugli_source(AV1_COMP *cpi) { YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source; AV1_COMMON *const cm = &cpi->common; const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; const int bit_depth = cpi->td.mb.e_mbd.bd; const int ss_x = cpi->source->subsampling_x; const int ss_y = cpi->source->subsampling_y; if (dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); } av1_copy_and_extend_frame(cpi->source, dst); YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source; if (resized_dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( resized_dst, width / 
resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); } if (!av1_resize_and_extend_frame_nonnormative( cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating buffers during resize"); } zero_img(cpi->source); copy_img(resized_dst, cpi->source, width / resize_factor, height / resize_factor); } void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) { av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source); AV1_COMMON *const cm = &cpi->common; const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; const int ss_x = cpi->source->subsampling_x; const int ss_y = cpi->source->subsampling_y; YV12_BUFFER_CONFIG resized_recon; memset(&resized_recon, 0, sizeof(resized_recon)); aom_alloc_frame_buffer( &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, height / resize_factor); set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source, &resized_recon, K); cpi->butteraugli_info.recon_set = true; aom_free_frame_buffer(&resized_recon); } void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const QuantizationCfg *const q_cfg = &oxcf->q_cfg; const int q_index = 96; // Setup necessary params for encoding, including frame source, etc. if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); av1_set_frame_size(cpi, cm->superres_upscaled_width, cm->superres_upscaled_height); cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); } av1_setup_butteraugli_source(cpi); av1_setup_frame(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); cm->seg.enabled = cm->prev_frame->seg.enabled; } else { av1_calculate_segdata(&cm->seg); } } else { memset(&cm->seg, 0, sizeof(cm->seg)); } segfeatures_copy(&cm->cur_frame->seg, &cm->seg); cm->cur_frame->seg.enabled = cm->seg.enabled; const PARTITION_SEARCH_TYPE partition_search_type = cpi->sf.part_sf.partition_search_type; const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size; // Enable a quicker pass by uncommenting the following lines: // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index, q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq, oxcf->mode == ALLINTRA, oxcf->tune_cfg.tuning); av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q_index, 0); av1_encode_frame(cpi); av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3); cpi->sf.part_sf.partition_search_type = 
partition_search_type; cpi->sf.part_sf.fixed_partition_size = fixed_partition_size; } aom-3.12.1/av1/encoder/tune_butteraugli.h000066400000000000000000000033151477627663500202440ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ #define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ #include "aom_scale/yv12config.h" #include "av1/common/enums.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/block.h" typedef struct { // Stores the scaling factors for rdmult when tuning for Butteraugli. // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for // 4x4 block at (row, col). double *rdmult_scaling_factors; YV12_BUFFER_CONFIG source, resized_source; bool recon_set; } TuneButteraugliInfo; struct AV1_COMP; static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16; void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult); void av1_setup_butteraugli_source(struct AV1_COMP *cpi); // 'K' is used to balance the rate-distortion distribution between PSNR // and Butteraugli. void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi, double K); void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi); #endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ aom-3.12.1/av1/encoder/tune_vmaf.c000066400000000000000000001347041477627663500166500ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/tune_vmaf.h" #include "aom_dsp/psnr.h" #include "av1/encoder/extend.h" #include "av1/encoder/rdopt.h" #include "config/aom_scale_rtcd.h" static const double kBaselineVmaf = 97.42773; static double get_layer_value(const double *array, int layer) { while (array[layer] < 0.0 && layer > 0) layer--; return AOMMAX(array[layer], 0.0); } static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *ref, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, FULLPEL_MV *ref_mv) { // Block information (ONLY Y-plane is used for motion search). const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int y_stride = src->y_stride; assert(y_stride == ref->y_stride); const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; // Save input state. MACROBLOCK *const mb = &cpi->td.mb; MACROBLOCKD *const mbd = &mb->e_mbd; const struct buf_2d ori_src_buf = mb->plane[0].src; const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; // Parameters used for motion search. 
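// The search below is full-pixel only: it is seeded with *ref_mv, uses the
// NSTEP method with a step_param derived from the larger of the frame
// dimensions, and costs MVs against the zero-MV baseline. The best full-pel
// MV found is written back into *ref_mv for the caller to reuse.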
FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; FULLPEL_MV_STATS best_mv_stats; const SEARCH_METHODS search_method = NSTEP; const search_site_config *search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_FPF]; const int step_param = av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height)); // Baseline position for motion search (used for rate distortion comparison). const MV baseline_mv = kZeroMv; // Setup. mb->plane[0].src.buf = src->y_buffer + y_offset; mb->plane[0].src.stride = y_stride; mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset; mbd->plane[0].pre[0].stride = y_stride; // Unused intermediate results for motion search. int cost_list[5]; // Do motion search. // Only do full search on the entire block. av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, &baseline_mv, *ref_mv, search_site_cfg, search_method, /*fine_search_interval=*/0); av1_full_pixel_search(*ref_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats, NULL); // Restore input state. mb->plane[0].src = ori_src_buf; mbd->plane[0].pre[0] = ori_pre_buf; } static unsigned int residual_variance(const AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *ref, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, FULLPEL_MV ref_mv, unsigned int *sse) { const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int y_stride = src->y_stride; assert(y_stride == ref->y_stride); const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; const int mv_offset = ref_mv.row * y_stride + ref_mv.col; const unsigned int var = cpi->ppi->fn_ptr[block_size].vf( ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset, y_stride, sse); return var; } static double frame_average_variance(const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const frame) { const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const uint8_t *const y_buffer = frame->y_buffer; const int y_stride = frame->y_stride; const BLOCK_SIZE block_size = BLOCK_64X64; const int block_w = mi_size_wide[block_size] * 4; const int block_h = mi_size_high[block_size] * 4; int row, col; double var = 0.0, var_count = 0.0; const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH; // Loop through each block. for (row = 0; row < frame->y_height / block_h; ++row) { for (col = 0; col < frame->y_width / block_w; ++col) { struct buf_2d buf; const int row_offset_y = row * block_h; const int col_offset_y = col * block_w; buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; buf.stride = y_stride; var += av1_get_perpixel_variance(cpi, xd, &buf, block_size, AOM_PLANE_Y, use_hbd); var_count += 1.0; } } var /= var_count; return var; } static double residual_frame_average_variance(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *ref, FULLPEL_MV *mvs) { if (ref == NULL) return frame_average_variance(cpi, src); const BLOCK_SIZE block_size = BLOCK_16X16; const int frame_height = src->y_height; const int frame_width = src->y_width; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int mb_rows = (frame_height + mb_height - 1) / mb_height; const int mb_cols = (frame_width + mb_width - 1) / mb_width; const int num_planes = av1_num_planes(&cpi->common); const int mi_h = mi_size_high_log2[block_size]; const int mi_w = mi_size_wide_log2[block_size]; assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); // Save input state. 
MACROBLOCK *const mb = &cpi->td.mb; MACROBLOCKD *const mbd = &mb->e_mbd; uint8_t *input_buffer[MAX_MB_PLANE]; for (int i = 0; i < num_planes; i++) { input_buffer[i] = mbd->plane[i].pre[0].buf; } MB_MODE_INFO **input_mb_mode_info = mbd->mi; bool do_motion_search = false; if (mvs == NULL) { do_motion_search = true; CHECK_MEM_ERROR(&cpi->common, mvs, (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs))); } unsigned int variance = 0; // Perform temporal filtering block by block. for (int mb_row = 0; mb_row < mb_rows; mb_row++) { av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); for (int mb_col = 0; mb_col < mb_cols; mb_col++) { av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits, (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols]; if (do_motion_search) { motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv); } unsigned int mv_sse; const unsigned int blk_var = residual_variance( cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse); variance += blk_var; } } // Restore input state for (int i = 0; i < num_planes; i++) { mbd->plane[i].pre[0].buf = input_buffer[i]; } mbd->mi = input_mb_mode_info; return (double)variance / (double)(mb_rows * mb_cols); } // TODO(sdeng): Add the SIMD implementation. static inline void highbd_unsharp_rect(const uint16_t *source, int source_stride, const uint16_t *blurred, int blurred_stride, uint16_t *dst, int dst_stride, int w, int h, double amount, int bit_depth) { const int max_value = (1 << bit_depth) - 1; for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { const double val = (double)source[j] + amount * ((double)source[j] - (double)blurred[j]); dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value); } source += source_stride; blurred += blurred_stride; dst += dst_stride; } } static inline void unsharp_rect(const uint8_t *source, int source_stride, const uint8_t *blurred, int blurred_stride, uint8_t *dst, int dst_stride, int w, int h, double amount) { for (int i = 0; i < h; ++i) { for (int j = 0; j < w; ++j) { const double val = (double)source[j] + amount * ((double)source[j] - (double)blurred[j]); dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255); } source += source_stride; blurred += blurred_stride; dst += dst_stride; } } static inline void unsharp(const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *blurred, const YV12_BUFFER_CONFIG *dst, double amount) { const int bit_depth = cpi->td.mb.e_mbd.bd; if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH); assert(dst->flags & YV12_FLAG_HIGHBITDEPTH); highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride, CONVERT_TO_SHORTPTR(blurred->y_buffer), blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer), dst->y_stride, source->y_width, source->y_height, amount, bit_depth); } else { unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer, blurred->y_stride, dst->y_buffer, dst->y_stride, source->y_width, source->y_height, amount); } } // 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, // all co-efficients must be even. // The array is of size 9 to allow passing gauss_filter + 1 to // _mm_loadu_si128() in prepare_coeffs_6t(). 
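// The non-zero taps below sum to 128 = 1 << FILTER_BITS, so a constant region
// passes through the blur unchanged after the convolution's rounding shift.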
DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static inline void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dst) { const int block_size = BLOCK_128X128; const int block_w = mi_size_wide[block_size] * 4; const int block_h = mi_size_high[block_size] * 4; const int num_cols = (source->y_width + block_w - 1) / block_w; const int num_rows = (source->y_height + block_h - 1) / block_h; int row, col; ConvolveParams conv_params = get_conv_params(0, 0, bit_depth); InterpFilterParams filter = { .filter_ptr = gauss_filter, .taps = 8, .interp_filter = EIGHTTAP_REGULAR }; for (row = 0; row < num_rows; ++row) { for (col = 0; col < num_cols; ++col) { const int row_offset_y = row * block_h; const int col_offset_y = col * block_w; uint8_t *src_buf = source->y_buffer + row_offset_y * source->y_stride + col_offset_y; uint8_t *dst_buf = dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y; if (source->flags & YV12_FLAG_HIGHBITDEPTH) { av1_highbd_convolve_2d_sr( CONVERT_TO_SHORTPTR(src_buf), source->y_stride, CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h, &filter, &filter, 0, 0, &conv_params, bit_depth); } else { av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride, block_w, block_h, &filter, &filter, 0, 0, &conv_params); } } } } static inline double cal_approx_vmaf( const AV1_COMP *const cpi, double source_variance, const YV12_BUFFER_CONFIG *const source, const YV12_BUFFER_CONFIG *const sharpened) { const int bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; double new_vmaf; aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth, cal_vmaf_neg, &new_vmaf); const double sharpened_var = frame_average_variance(cpi, sharpened); return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); } static double find_best_frame_unsharp_amount_loop( const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, const YV12_BUFFER_CONFIG *const blurred, const YV12_BUFFER_CONFIG *const sharpened, double best_vmaf, const double baseline_variance, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_vmaf = best_vmaf; double unsharp_amount = unsharp_amount_start; do { best_vmaf = approx_vmaf; unsharp_amount += step_size; if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; unsharp(cpi, source, blurred, sharpened, unsharp_amount); approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); loop_count++; } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); unsharp_amount = approx_vmaf > best_vmaf ? 
unsharp_amount : unsharp_amount - step_size; return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); } static double find_best_frame_unsharp_amount( const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, const YV12_BUFFER_CONFIG *const blurred, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_filter_amount) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; YV12_BUFFER_CONFIG sharpened; memset(&sharpened, 0, sizeof(sharpened)); aom_alloc_frame_buffer( &sharpened, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); const double baseline_variance = frame_average_variance(cpi, source); double unsharp_amount; if (unsharp_amount_start <= step_size) { unsharp_amount = find_best_frame_unsharp_amount_loop( cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, step_size, max_loop_count, max_filter_amount); } else { double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; double v0, v1; unsharp(cpi, source, blurred, &sharpened, a0); v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); unsharp(cpi, source, blurred, &sharpened, a1); v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); if (fabs(v0 - v1) < 0.01) { unsharp_amount = a0; } else if (v0 > v1) { unsharp_amount = find_best_frame_unsharp_amount_loop( cpi, source, blurred, &sharpened, v0, baseline_variance, a0, -step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop( cpi, source, blurred, &sharpened, v1, baseline_variance, a1, step_size, max_loop_count, max_filter_amount); } } aom_free_frame_buffer(&sharpened); return unsharp_amount; } void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; const int height = source->y_height; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double best_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); if (best_frame_unsharp_amount <= 0.0) return; YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, source, &blurred); unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); aom_free_frame_buffer(&blurred); } void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; const int height = source->y_height; YV12_BUFFER_CONFIG source_extended, blurred; memset(&source_extended, 0, sizeof(source_extended)); memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( &source_extended, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, 
cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = best_frame_unsharp_amount; unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); aom_free_frame_buffer(&blurred); } void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; const int ss_x = source->subsampling_x; const int ss_y = source->subsampling_y; YV12_BUFFER_CONFIG source_extended, blurred; memset(&blurred, 0, sizeof(blurred)); memset(&source_extended, 0, sizeof(source_extended)); aom_alloc_frame_buffer( &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = best_frame_unsharp_amount; const int block_size = BLOCK_64X64; const int block_w = mi_size_wide[block_size] * 4; const int block_h = mi_size_high[block_size] * 4; const int num_cols = (source->y_width + block_w - 1) / block_w; const int num_rows = (source->y_height + block_h - 1) / block_h; double *best_unsharp_amounts = aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts)); if (!best_unsharp_amounts) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating vmaf data"); } YV12_BUFFER_CONFIG source_block, blurred_block; memset(&source_block, 0, sizeof(source_block)); memset(&blurred_block, 0, sizeof(blurred_block)); aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int row_offset_y = row * block_h; const int col_offset_y = col * block_w; const int block_width = AOMMIN(width - col_offset_y, block_w); const int block_height = AOMMIN(height - row_offset_y, block_h); const int index = col + 
row * num_cols; if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + row_offset_y * source->y_stride + col_offset_y; uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + row_offset_y * blurred.y_stride + col_offset_y; uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer); uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer); // Copy block from source frame. for (int i = 0; i < block_h; ++i) { for (int j = 0; j < block_w; ++j) { if (i >= block_height || j >= block_width) { src_dst[j] = 0; blurred_dst[j] = 0; } else { src_dst[j] = frame_src_buf[j]; blurred_dst[j] = frame_blurred_buf[j]; } } frame_src_buf += source->y_stride; frame_blurred_buf += blurred.y_stride; src_dst += source_block.y_stride; blurred_dst += blurred_block.y_stride; } } else { uint8_t *frame_src_buf = source->y_buffer + row_offset_y * source->y_stride + col_offset_y; uint8_t *frame_blurred_buf = blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; uint8_t *blurred_dst = blurred_block.y_buffer; uint8_t *src_dst = source_block.y_buffer; // Copy block from source frame. for (int i = 0; i < block_h; ++i) { for (int j = 0; j < block_w; ++j) { if (i >= block_height || j >= block_width) { src_dst[j] = 0; blurred_dst[j] = 0; } else { src_dst[j] = frame_src_buf[j]; blurred_dst[j] = frame_blurred_buf[j]; } } frame_src_buf += source->y_stride; frame_blurred_buf += blurred.y_stride; src_dst += source_block.y_stride; blurred_dst += blurred_block.y_stride; } } best_unsharp_amounts[index] = find_best_frame_unsharp_amount( cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3, 1.5); } } // Apply best blur amounts for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int row_offset_y = row * block_h; const int col_offset_y = col * block_w; const int block_width = AOMMIN(source->y_width - col_offset_y, block_w); const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); const int index = col + row * num_cols; if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + row_offset_y * source->y_stride + col_offset_y; uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + row_offset_y * blurred.y_stride + col_offset_y; highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, src_buf, source->y_stride, block_width, block_height, best_unsharp_amounts[index], bit_depth); } else { uint8_t *src_buf = source->y_buffer + row_offset_y * source->y_stride + col_offset_y; uint8_t *blurred_buf = blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, src_buf, source->y_stride, block_width, block_height, best_unsharp_amounts[index]); } } } aom_free_frame_buffer(&source_block); aom_free_frame_buffer(&blurred_block); aom_free_frame_buffer(&blurred); aom_free(best_unsharp_amounts); } void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const int y_width = cpi->source->y_width; const int y_height = cpi->source->y_height; const int resized_block_size = BLOCK_32X32; const int resize_factor = 2; const int bit_depth = cpi->td.mb.e_mbd.bd; const int ss_x = cpi->source->subsampling_x; const int ss_y = 
cpi->source->subsampling_y; YV12_BUFFER_CONFIG resized_source; memset(&resized_source, 0, sizeof(resized_source)); aom_alloc_frame_buffer( &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); if (!av1_resize_and_extend_frame_nonnormative( cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating buffers during resize"); } const int resized_y_width = resized_source.y_width; const int resized_y_height = resized_source.y_height; const int resized_block_w = mi_size_wide[resized_block_size] * 4; const int resized_block_h = mi_size_high[resized_block_size] * 4; const int num_cols = (resized_y_width + resized_block_w - 1) / resized_block_w; const int num_rows = (resized_y_height + resized_block_h - 1) / resized_block_h; YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, &resized_source, &blurred); YV12_BUFFER_CONFIG recon; memset(&recon, 0, sizeof(recon)); aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_yv12_copy_frame(&resized_source, &recon, 1); VmafContext *vmaf_context; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg); unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses)); if (!sses) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating vmaf data"); } // Loop through each 'block_size' block. 
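// For each block: measure the SSE between the resized source and its blurred
// version, then temporarily copy the blurred block into the recon frame and
// feed the frame pair to the VMAF context, so the per-block VMAF drop can be
// read back afterwards with aom_calc_vmaf_at_index().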
for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; const int row_offset_y = row * resized_block_h; const int col_offset_y = col * resized_block_w; uint8_t *const orig_buf = resized_source.y_buffer + row_offset_y * resized_source.y_stride + col_offset_y; uint8_t *const blurred_buf = blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, blurred_buf, blurred.y_stride, &sses[index]); uint8_t *const recon_buf = recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y; // Set recon buf if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w, resized_block_h, 0.0, bit_depth); } else { unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf, blurred.y_stride, recon_buf, recon.y_stride, resized_block_w, resized_block_h, 0.0); } aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth, index); // Restore recon buf if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect( CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w, resized_block_h, 0.0, bit_depth); } else { unsharp_rect(orig_buf, resized_source.y_stride, orig_buf, resized_source.y_stride, recon_buf, recon.y_stride, resized_block_w, resized_block_h, 0.0); } } } aom_flush_vmaf_context(vmaf_context); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { const int index = row * num_cols + col; const double vmaf = aom_calc_vmaf_at_index( vmaf_context, cpi->vmaf_info.vmaf_model, index); const double dvmaf = kBaselineVmaf - vmaf; const double mse = (double)sses[index] / (double)(resized_y_width * resized_y_height); double weight; const double eps = 0.01 / (num_rows * num_cols); if (dvmaf < eps || mse < eps) { weight = 1.0; } else { weight = mse / dvmaf; } // Normalize it with a data fitted model. 
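// The fitted curve below maps mse / dvmaf into roughly [0.8, 6.8): a ratio of
// 0 gives 0.8, a ratio of 20 gives about 4.6, and large ratios saturate near
// 6.8.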
weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8; cpi->vmaf_info.rdmult_scaling_factors[index] = weight; } } aom_free_frame_buffer(&resized_source); aom_free_frame_buffer(&blurred); aom_close_vmaf_context(vmaf_context); aom_free(sses); } void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int *const rdmult) { const AV1_COMMON *const cm = &cpi->common; const int bsize_base = BLOCK_64X64; const int num_mi_w = mi_size_wide[bsize_base]; const int num_mi_h = mi_size_high[bsize_base]; const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; int row, col; double num_of_mi = 0.0; double geom_mean_of_scale = 0.0; for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col / num_mi_h; col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]); num_of_mi += 1.0; } } geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(&x->errorperbit, *rdmult); } // TODO(sdeng): replace them with the SIMD versions. static inline double highbd_image_sad_c(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, int w, int h) { double accum = 0.0; int i, j; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { double img1px = src[i * src_stride + j]; double img2px = ref[i * ref_stride + j]; accum += fabs(img1px - img2px); } } return accum / (double)(h * w); } static inline double image_sad_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h) { double accum = 0.0; int i, j; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { double img1px = src[i * src_stride + j]; double img2px = ref[i * ref_stride + j]; accum += fabs(img1px - img2px); } } return accum / (double)(h * w); } static double calc_vmaf_motion_score(const AV1_COMP *const cpi, const AV1_COMMON *const cm, const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last, const YV12_BUFFER_CONFIG *const next) { const int y_width = cur->y_width; const int y_height = cur->y_height; YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; const int bit_depth = cpi->td.mb.e_mbd.bd; const int ss_x = cur->subsampling_x; const int ss_y = cur->subsampling_y; memset(&blurred_cur, 0, sizeof(blurred_cur)); memset(&blurred_last, 0, sizeof(blurred_last)); memset(&blurred_next, 0, sizeof(blurred_next)); aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, cur, &blurred_cur); gaussian_blur(bit_depth, last, &blurred_last); if (next) gaussian_blur(bit_depth, next, &blurred_next); double motion1, motion2 = 65536.0; if (cm->seq_params->use_highbitdepth) { 
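// For high bit depth input, the SADs computed below are scaled by
// 1 / 2^(bit_depth - 8) so the motion score stays on an 8-bit scale.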
assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH); const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), blurred_cur.y_stride, CONVERT_TO_SHORTPTR(blurred_last.y_buffer), blurred_last.y_stride, y_width, y_height) * scale_factor; if (next) { assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH); motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), blurred_cur.y_stride, CONVERT_TO_SHORTPTR(blurred_next.y_buffer), blurred_next.y_stride, y_width, y_height) * scale_factor; } } else { motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, blurred_last.y_buffer, blurred_last.y_stride, y_width, y_height); if (next) { motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, blurred_next.y_buffer, blurred_next.y_stride, y_width, y_height); } } aom_free_frame_buffer(&blurred_cur); aom_free_frame_buffer(&blurred_last); aom_free_frame_buffer(&blurred_next); return AOMMIN(motion1, motion2); } static inline void get_neighbor_frames(const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG **last, const YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *gf_group = &cpi->ppi->gf_group; const int src_index = cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *last_entry = av1_lookahead_peek( cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); struct lookahead_entry *next_entry = av1_lookahead_peek( cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage); *next = &next_entry->img; *last = cm->show_frame ? cpi->last_source : &last_entry->img; } // Calculates the new qindex from the VMAF motion score. This is based on the // observation: when the motion score becomes higher, the VMAF score of the // same source and distorted frames would become higher. int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { const AV1_COMMON *const cm = &cpi->common; if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) { return current_qindex; } const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_ysse = get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth); const double last_frame_vmaf = get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth); const int bit_depth = cpi->td.mb.e_mbd.bd; const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8))); const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf; const double sse_threshold = 0.01 * cpi->source->y_width * cpi->source->y_height; const double vmaf_threshold = 0.01; if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { return current_qindex; } const YV12_BUFFER_CONFIG *cur_buf = cpi->source; if (cm->show_frame == 0) { const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *cur_entry = av1_lookahead_peek( cpi->ppi->lookahead, src_index, cpi->compressor_stage); cur_buf = &cur_entry->img; } assert(cur_buf); const YV12_BUFFER_CONFIG *next_buf, *last_buf; get_neighbor_frames(cpi, &last_buf, &next_buf); assert(last_buf); const double motion = calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf); // Get dVMAF through a data fitted model. 
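// The fitted curve saturates near 26.11 for large motion scores; e.g. a motion
// score of 10 gives dvmaf ~= 26.11 * (1 - exp(-0.6)) ~= 11.8. A larger
// predicted dvmaf shrinks beta (floored at 0.5) and therefore tends to raise
// the resulting qindex.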
const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion)); const double dsse = dvmaf * approx_sse / approx_dvmaf; // Clamping beta to address VQ issue (aomedia:3170). const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5); const int offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta); int qindex = current_qindex + offset; qindex = AOMMIN(qindex, MAXQ); qindex = AOMMAX(qindex, MINQ); return qindex; } static inline double cal_approx_score( AV1_COMP *const cpi, double src_variance, double new_variance, double src_score, const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon_sharpened) { double score; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth, cal_vmaf_neg, &score); return src_variance / new_variance * (score - src_score); } static double find_best_frame_unsharp_amount_loop_neg( AV1_COMP *const cpi, double src_variance, double base_score, const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon, const YV12_BUFFER_CONFIG *const ref, const YV12_BUFFER_CONFIG *const src_blurred, const YV12_BUFFER_CONFIG *const recon_blurred, const YV12_BUFFER_CONFIG *const src_sharpened, const YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, double best_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_score = best_score; double unsharp_amount = unsharp_amount_start; do { best_score = approx_score; unsharp_amount += step_size; if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount); unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount); const double new_variance = residual_frame_average_variance(cpi, src_sharpened, ref, mvs); approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score, src, recon_sharpened); loop_count++; } while (approx_score > best_score && loop_count < max_loop_count); unsharp_amount = approx_score > best_score ? 
unsharp_amount : unsharp_amount - step_size; return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); } static double find_best_frame_unsharp_amount_neg( AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon, const YV12_BUFFER_CONFIG *const ref, double base_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_filter_amount) { FULLPEL_MV *mvs = NULL; const double src_variance = residual_frame_average_variance(cpi, src, ref, mvs); const AV1_COMMON *const cm = &cpi->common; const int width = recon->y_width; const int height = recon->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; const int ss_x = recon->subsampling_x; const int ss_y = recon->subsampling_y; YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened; memset(&recon_sharpened, 0, sizeof(recon_sharpened)); memset(&src_sharpened, 0, sizeof(src_sharpened)); memset(&recon_blurred, 0, sizeof(recon_blurred)); memset(&src_blurred, 0, sizeof(src_blurred)); aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, recon, &recon_blurred); gaussian_blur(bit_depth, src, &src_blurred); unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start); unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start); const double variance_start = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); const double score_start = cal_approx_score( cpi, src_variance, variance_start, base_score, src, &recon_sharpened); const double unsharp_amount_next = unsharp_amount_start + step_size; unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next); unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next); const double variance_next = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); const double score_next = cal_approx_score(cpi, src_variance, variance_next, base_score, src, &recon_sharpened); double unsharp_amount; if (score_next > score_start) { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( cpi, src_variance, base_score, src, recon, ref, &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next, unsharp_amount_next, step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( cpi, src_variance, base_score, src, recon, ref, &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start, unsharp_amount_start, -step_size, max_loop_count, max_filter_amount); } aom_free_frame_buffer(&recon_sharpened); aom_free_frame_buffer(&src_sharpened); aom_free_frame_buffer(&recon_blurred); aom_free_frame_buffer(&src_blurred); aom_free(mvs); return unsharp_amount; } void av1_update_vmaf_curve(AV1_COMP *cpi) { const YV12_BUFFER_CONFIG *source = cpi->source; const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; const int bit_depth = 
cpi->td.mb.e_mbd.bd; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); double base_score; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth, cal_vmaf_neg, &base_score); cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score; if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(recon->flags & YV12_FLAG_HIGHBITDEPTH); cpi->vmaf_info.last_frame_ysse[layer_depth] = (double)aom_highbd_get_y_sse(source, recon); } else { cpi->vmaf_info.last_frame_ysse[layer_depth] = (double)aom_get_y_sse(source, recon); } if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { const YV12_BUFFER_CONFIG *last, *next; get_neighbor_frames(cpi, &last, &next); double best_unsharp_amount_start = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); const int max_loop_count = 5; cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score, best_unsharp_amount_start, 0.025, max_loop_count, 1.01); } } aom-3.12.1/av1/encoder/tune_vmaf.h000066400000000000000000000042151477627663500166460ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_ #define AOM_AV1_ENCODER_TUNE_VMAF_H_ #include "aom_dsp/vmaf.h" #include "aom_scale/yv12config.h" #include "av1/common/enums.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/block.h" typedef struct { // Stores the scaling factors for rdmult when tuning for VMAF. // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for // 64x64 block at (row, col). double *rdmult_scaling_factors; // Stores the luma sse of the last frame. double last_frame_ysse[MAX_ARF_LAYERS]; // Stores the VMAF of the last frame. double last_frame_vmaf[MAX_ARF_LAYERS]; // Stores the filter strength of the last frame. double last_frame_unsharp_amount[MAX_ARF_LAYERS]; // Stores the origial qindex before scaling. int original_qindex; // VMAF model used in VMAF caculations. VmafModel *vmaf_model; } TuneVMAFInfo; struct AV1_COMP; void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi, const YV12_BUFFER_CONFIG *source); void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi, const YV12_BUFFER_CONFIG *source); void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi, const YV12_BUFFER_CONFIG *source); void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi); void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult); int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex); void av1_update_vmaf_curve(struct AV1_COMP *cpi); #endif // AOM_AV1_ENCODER_TUNE_VMAF_H_ aom-3.12.1/av1/encoder/tx_prune_model_weights.h000066400000000000000000004714531477627663500214540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*! \file * Contains the details of the ML models used for pruning transform size. This * file is only included by av1/encoder/tx_search.c. */ #ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ #define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ #ifdef __cplusplus extern "C" { #endif #include "av1/encoder/ml.h" /***************************CONFIG_NN_V2 (New)********************************/ #if CONFIG_NN_V2 // Tx type model for 4x4 block. static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = { -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, 1.35792f, 0.27733f, 0.88660f, -0.68304f, }; static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = { 1.38742f, 0.59540f, -1.37622f, 1.92114f, 0.00000f, -0.38998f, -0.32726f, -0.15650f, }; static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = { 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, -0.26782f, -0.65416f, -0.10648f, 0.05568f, }; static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = { 4.07177f, 3.26961f, 0.58083f, 1.21199f, }; static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 }; static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_4x4_hor_layer0_weights, // weights av1_tx_type_nn_4x4_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x4_hor_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x4_hor_layer1_weights, av1_tx_type_nn_4x4_hor_layer1_bias, NONE, av1_tx_type_nn_4x4_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = { -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, -0.06589f, -0.28142f, -0.33118f, 1.72227f, }; static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = { -0.33685f, 0.22025f, 0.28140f, 0.56138f, 0.93489f, -1.77048f, 1.34989f, -0.93747f, }; static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = { -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, -0.99165f, -1.91366f, 0.16785f, 
0.34776f, 0.58154f, -0.18217f, -0.29257f, -0.86315f, -0.53336f, 0.30320f, -1.32331f, }; static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = { -1.31519f, -3.26321f, 1.71794f, -1.90778f, }; static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 }; static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_4x4_ver_layer0_weights, // weights av1_tx_type_nn_4x4_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x4_ver_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x4_ver_layer1_weights, av1_tx_type_nn_4x4_ver_layer1_bias, NONE, av1_tx_type_nn_4x4_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 4x8 block. static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = { 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, -1.35896f, -1.17121f, 1.68866f, 0.10357f, }; static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = { 2.93391f, 0.66831f, -0.21419f, 0.00000f, -0.72878f, 0.15127f, -1.46755f, 0.16658f, }; static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = { -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, -0.50191f, 0.18219f, 1.83664f, -0.75276f, }; static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = { -1.17455f, -2.26089f, -1.79863f, -2.26333f, }; static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 }; static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_4x8_hor_layer0_weights, // weights av1_tx_type_nn_4x8_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x8_hor_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x8_hor_layer1_weights, av1_tx_type_nn_4x8_hor_layer1_bias, NONE, av1_tx_type_nn_4x8_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = { -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, 0.46272f, 1.59751f, 0.95234f, 0.35086f, 
0.85624f, 0.73149f, 1.67779f, -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, -0.21958f, 0.05970f, }; static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = { 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, 0.08288f, 0.18195f, -0.79890f, 0.10047f, }; static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = { -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, -1.01848f, }; static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = { -1.45955f, -2.08949f, -1.24813f, -1.55368f, }; static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 }; static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_4x8_ver_layer0_weights, // weights av1_tx_type_nn_4x8_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x8_ver_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x8_ver_layer1_weights, av1_tx_type_nn_4x8_ver_layer1_bias, NONE, av1_tx_type_nn_4x8_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 8x4 block. 
static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = { -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, -1.85523f, 0.92532f, }; static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = { 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, -0.28958f, -0.32869f, -0.01704f, 0.68171f, }; static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = { -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, -1.10654f, }; static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = { -0.92861f, -1.45151f, -1.33588f, -4.33853f, }; static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 }; static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_8x4_hor_layer0_weights, // weights av1_tx_type_nn_8x4_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x4_hor_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x4_hor_layer1_weights, av1_tx_type_nn_8x4_hor_layer1_bias, NONE, av1_tx_type_nn_8x4_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = { -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, -0.22807f, -0.67376f, 
-0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, 1.66212f, 1.70826f, 1.55182f, 0.12230f, }; static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = { 0.10943f, 2.09789f, 2.16578f, 0.15766f, -0.42461f, 0.00000f, 1.22090f, -1.28717f, }; static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = { 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, -1.15005f, -0.39311f, 1.51236f, -1.68973f, }; static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = { 1.81013f, 1.10517f, 2.90059f, 0.95391f, }; static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 }; static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_8x4_ver_layer0_weights, // weights av1_tx_type_nn_8x4_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x4_ver_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x4_ver_layer1_weights, av1_tx_type_nn_8x4_ver_layer1_bias, NONE, av1_tx_type_nn_8x4_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 8x8 block. static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = { -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, -0.99892f, 1.09823f, }; static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = { -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, -0.26319f, 2.65579f, -1.30137f, -0.01487f, }; static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = { -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, 0.60958f, -1.30523f, 
0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, 0.06161f, }; static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = { 1.70385f, 1.82373f, 1.78496f, 1.80826f, }; static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 }; static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_8x8_hor_layer0_weights, // weights av1_tx_type_nn_8x8_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x8_hor_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x8_hor_layer1_weights, av1_tx_type_nn_8x8_hor_layer1_bias, NONE, av1_tx_type_nn_8x8_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x8_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = { -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, -1.29848f, 0.39308f, }; static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = { -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, 0.83015f, 0.06024f, 1.17180f, 0.65122f, }; static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = { -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 
1.00240f, 0.07548f, -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, -0.41305f, }; static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = { 2.14067f, 2.76699f, 2.04233f, 1.34803f, }; static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 }; static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_8x8_ver_layer0_weights, // weights av1_tx_type_nn_8x8_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x8_ver_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x8_ver_layer1_weights, av1_tx_type_nn_8x8_ver_layer1_bias, NONE, av1_tx_type_nn_8x8_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x8_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 8x16 block. static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = { -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, -0.28136f, 0.42556f, }; static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = { 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, 1.81560f, -1.02643f, -0.81690f, 0.08302f, }; static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = { 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, -1.31243f, }; static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = { 0.83359f, 1.06875f, 1.77645f, 1.49570f, }; static float 
av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 }; static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_8x16_hor_layer0_weights, // weights av1_tx_type_nn_8x16_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x16_hor_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x16_hor_layer1_weights, av1_tx_type_nn_8x16_hor_layer1_bias, NONE, av1_tx_type_nn_8x16_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x16_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = { 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, -0.12236f, 0.16075f, }; static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = { -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, 0.57598f, 0.99819f, 0.75175f, 0.17044f, }; static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = { -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, 2.20547f, }; static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = { -0.44080f, -1.67455f, -1.46332f, -6.13206f, }; static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 }; static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs 
av1_tx_type_nn_8x16_ver_layer0_weights, // weights av1_tx_type_nn_8x16_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_8x16_ver_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_8x16_ver_layer1_weights, av1_tx_type_nn_8x16_ver_layer1_bias, NONE, av1_tx_type_nn_8x16_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_8x16_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 16x8 block. static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = { 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, -0.36570f, -0.50757f, }; static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = { -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, -0.12329f, 0.08986f, 1.08117f, -0.00220f, }; static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = { 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, -0.23347f, }; static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = { 3.57175f, 2.42612f, 3.31259f, 2.08287f, }; static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 }; static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_16x8_hor_layer0_weights, // weights av1_tx_type_nn_16x8_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_16x8_hor_layer0_out, // output NULL, NULL, NULL, }, { 16, 
// num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_16x8_hor_layer1_weights, av1_tx_type_nn_16x8_hor_layer1_bias, NONE, av1_tx_type_nn_16x8_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_16x8_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = { 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, -0.81945f, -0.41647f, }; static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = { 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, -0.04510f, 0.48000f, -0.09354f, -0.42422f, }; static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = { 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, -0.00873f, }; static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = { 3.34981f, 3.74710f, 1.38339f, 0.45176f, }; static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 }; static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_16x8_ver_layer0_weights, // weights av1_tx_type_nn_16x8_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_16x8_ver_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_16x8_ver_layer1_weights, av1_tx_type_nn_16x8_ver_layer1_bias, NONE, av1_tx_type_nn_16x8_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_16x8_ver_layer1_out, // logits (!!same as last layer output) 
SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 16x16 block. static float av1_tx_type_nn_16x16_layer0_weights[128] = { 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, 0.50355f, 0.08592f, }; static float av1_tx_type_nn_16x16_layer0_bias[16] = { -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, -0.14062f, -0.42120f, 0.94573f, -0.09287f, }; static float av1_tx_type_nn_16x16_layer1_weights[64] = { -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, 1.08829f, }; static float av1_tx_type_nn_16x16_layer1_bias[4] = { 0.81986f, 1.26865f, 0.11118f, 2.48404f, }; static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 }; static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_16x16_layer0_weights, // weights av1_tx_type_nn_16x16_layer0_bias, // bias RELU, // activation av1_tx_type_nn_16x16_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_16x16_layer1_weights, av1_tx_type_nn_16x16_layer1_bias, NONE, av1_tx_type_nn_16x16_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_16x16_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 4x16 block. 
static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = { 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, -1.74563f, -0.88830f, -1.77603f, 2.15935f, }; static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = { -0.36435f, -2.22731f, -0.00837f, -1.34546f, 0.62806f, -0.20675f, 4.91940f, -0.56079f, }; static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = { -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, 1.28413f, -0.30326f, 2.45329f, -0.83335f, }; static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = { 2.33198f, 3.36245f, 1.62603f, 2.91056f, }; static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 }; static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_4x16_hor_layer0_weights, // weights av1_tx_type_nn_4x16_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x16_hor_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x16_hor_layer1_weights, av1_tx_type_nn_4x16_hor_layer1_bias, NONE, av1_tx_type_nn_4x16_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x16_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = { 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, -0.27975f, -0.01149f, }; static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = { -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, -0.32530f, 0.73483f, 0.08322f, -0.23890f, }; static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = { 0.27194f, 0.50607f, 0.49229f, 
-0.48192f, 0.15667f, -1.38891f, 0.38102f, -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, -0.56513f, }; static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = { 4.60896f, 4.53551f, 4.53124f, 4.27435f, }; static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 }; static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_4x16_ver_layer0_weights, // weights av1_tx_type_nn_4x16_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_4x16_ver_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_4x16_ver_layer1_weights, av1_tx_type_nn_4x16_ver_layer1_bias, NONE, av1_tx_type_nn_4x16_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_4x16_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Tx type model for 16x4 block. static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = { 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, 0.19055f, -1.56413f, }; static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = { -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, 1.14048f, 0.33308f, -1.10886f, 0.41184f, }; static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = { -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, 
-0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, -0.43819f, }; static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = { 2.32575f, 2.75703f, 1.12304f, 2.15567f, }; static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 }; static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { 1, // num_hidden_layers { // fc layer setting { // layer 0 8, // num_inputs 16, // num_outputs av1_tx_type_nn_16x4_hor_layer0_weights, // weights av1_tx_type_nn_16x4_hor_layer0_bias, // bias RELU, // activation av1_tx_type_nn_16x4_hor_layer0_out, // output NULL, NULL, NULL, }, { 16, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_16x4_hor_layer1_weights, av1_tx_type_nn_16x4_hor_layer1_bias, NONE, av1_tx_type_nn_16x4_hor_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_16x4_hor_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = { 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, -0.17967f, -0.96622f, 0.42635f, -1.04784f, }; static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = { -0.52088f, 0.52844f, -1.03655f, -0.30974f, 2.59952f, -1.93604f, 0.00000f, 2.51787f, }; static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = { 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, 1.26814f, -1.93873f, -0.00768f, 1.58309f, }; static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = { 2.34713f, 1.68667f, 1.25488f, 1.69812f, }; static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 }; static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 }; static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { 1, // num_hidden_layers { // fc layer setting { // layer 0 4, // num_inputs 8, // num_outputs av1_tx_type_nn_16x4_ver_layer0_weights, // weights av1_tx_type_nn_16x4_ver_layer0_bias, // bias RELU, // activation av1_tx_type_nn_16x4_ver_layer0_out, // output NULL, NULL, NULL, }, { 8, // num_inputs (!!same as num_outputs of last layer) 4, av1_tx_type_nn_16x4_ver_layer1_weights, av1_tx_type_nn_16x4_ver_layer1_bias, NONE, av1_tx_type_nn_16x4_ver_layer1_out, NULL, NULL, NULL, }, }, 4, // num_outputs av1_tx_type_nn_16x4_ver_layer1_out, // logits (!!same as last layer output) SOFTMAX_CROSS_ENTROPY, }; /******************************************************************************/ // Map tx_size to its corresponding neural net model for tx type prediction. 
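/*
 * Illustrative sketch only (not part of libaom): before the per-size lookup
 * tables below, this shows how one of the two-layer models above could be
 * evaluated end to end.  The helper names (fc_layer_sketch, softmax_sketch,
 * tx_type_scores_8x16_hor_sketch) are hypothetical, and the row-major
 * "one row of num_inputs weights per output node" layout is an assumption
 * consistent with the 8*16 = 128 and 16*4 = 64 table sizes; the library's
 * own prediction routines may differ.
 */
#include <math.h>

/* Fully connected layer: out[j] = bias[j] + dot(j-th weight row, in),
 * optionally clamped at zero (the RELU activation declared for layer 0). */
static void fc_layer_sketch(const float *in, int num_in, const float *weights,
                            const float *bias, int num_out, int use_relu,
                            float *out) {
  for (int j = 0; j < num_out; ++j) {
    float v = bias[j];
    for (int i = 0; i < num_in; ++i) v += weights[j * num_in + i] * in[i];
    out[j] = (use_relu && v < 0.0f) ? 0.0f : v;
  }
}

/* Softmax over the logits; the configs declare SOFTMAX_CROSS_ENTROPY, so
 * class probabilities come from a softmax of the last layer's raw outputs. */
static void softmax_sketch(const float *logits, int n, float *prob) {
  float max_v = logits[0];
  float sum = 0.0f;
  for (int i = 1; i < n; ++i) max_v = (logits[i] > max_v) ? logits[i] : max_v;
  for (int i = 0; i < n; ++i) {
    prob[i] = expf(logits[i] - max_v);
    sum += prob[i];
  }
  for (int i = 0; i < n; ++i) prob[i] /= sum;
}

/* Example: score the four candidate tx types for an 8x16 block with the
 * horizontal model above (8 features -> 16 ReLU hidden units -> 4 logits). */
static void tx_type_scores_8x16_hor_sketch(const float features[8],
                                           float prob[4]) {
  float hidden[16], logits[4];
  fc_layer_sketch(features, 8, av1_tx_type_nn_8x16_hor_layer0_weights,
                  av1_tx_type_nn_8x16_hor_layer0_bias, 16, 1, hidden);
  fc_layer_sketch(hidden, 16, av1_tx_type_nn_8x16_hor_layer1_weights,
                  av1_tx_type_nn_8x16_hor_layer1_bias, 4, 0, logits);
  softmax_sketch(logits, 4, prob);
}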
static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform &av1_tx_type_nnconfig_16x16, // 16x16 transform NULL, // 32x32 transform NULL, // 64x64 transform &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform NULL, // 16x32 transform NULL, // 32x16 transform NULL, // 32x64 transform NULL, // 64x32 transform &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform NULL, // 8x32 transform NULL, // 32x8 transform NULL, // 16x64 transform NULL, // 64x16 transform }; static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform &av1_tx_type_nnconfig_16x16, // 16x16 transform NULL, // 32x32 transform NULL, // 64x64 transform &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform NULL, // 16x32 transform NULL, // 32x16 transform NULL, // 32x64 transform NULL, // 64x32 transform &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform NULL, // 8x32 transform NULL, // 32x8 transform NULL, // 16x64 transform NULL, // 64x16 transform }; #else /******************************CONFIG_NN***************************************/ // Tx type model for 4x4 block. static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, 1.35792f, 0.27733f, 0.88660f, -0.68304f, }; static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { 1.38742f, 0.59540f, -1.37622f, 1.92114f, 0.00000f, -0.38998f, -0.32726f, -0.15650f, }; static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, -0.26782f, -0.65416f, -0.10648f, 0.05568f, }; static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { 4.07177f, 3.26961f, 0.58083f, 1.21199f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x4_hor_layer0, av1_tx_type_nn_weights_4x4_hor_layer1 }, { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } }; static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, -0.06589f, -0.28142f, -0.33118f, 1.72227f, }; static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { -0.33685f, 0.22025f, 0.28140f, 0.56138f, 0.93489f, -1.77048f, 1.34989f, -0.93747f, }; static const float 
av1_tx_type_nn_weights_4x4_ver_layer1[32] = { -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, -0.86315f, -0.53336f, 0.30320f, -1.32331f, }; static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { -1.31519f, -3.26321f, 1.71794f, -1.90778f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x4_ver_layer0, av1_tx_type_nn_weights_4x4_ver_layer1 }, { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } }; /******************************************************************************/ // Tx type model for 4x8 block. static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, -1.35896f, -1.17121f, 1.68866f, 0.10357f, }; static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { 2.93391f, 0.66831f, -0.21419f, 0.00000f, -0.72878f, 0.15127f, -1.46755f, 0.16658f, }; static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, -0.50191f, 0.18219f, 1.83664f, -0.75276f, }; static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { -1.17455f, -2.26089f, -1.79863f, -2.26333f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x8_hor_layer0, av1_tx_type_nn_weights_4x8_hor_layer1 }, { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, 0.90795f, 1.08296f, 0.58519f, 
1.74765f, 0.63971f, 1.35951f, 0.07803f, -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, -0.21958f, 0.05970f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, 0.08288f, 0.18195f, -0.79890f, 0.10047f, }; static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, -1.01848f, }; static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { -1.45955f, -2.08949f, -1.24813f, -1.55368f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x8_ver_layer0, av1_tx_type_nn_weights_4x8_ver_layer1 }, { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x4 block. static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, -1.85523f, 0.92532f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, -0.28958f, -0.32869f, -0.01704f, 0.68171f, }; static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, 0.17083f, 1.44850f, -0.20582f, -0.04906f, 
0.42990f, -0.61939f, -1.09692f, -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, -1.10654f, }; static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { -0.92861f, -1.45151f, -1.33588f, -4.33853f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x4_hor_layer0, av1_tx_type_nn_weights_8x4_hor_layer1 }, { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, 1.66212f, 1.70826f, 1.55182f, 0.12230f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { 0.10943f, 2.09789f, 2.16578f, 0.15766f, -0.42461f, 0.00000f, 1.22090f, -1.28717f, }; static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, -1.15005f, -0.39311f, 1.51236f, -1.68973f, }; static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { 1.81013f, 1.10517f, 2.90059f, 0.95391f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x4_ver_layer0, av1_tx_type_nn_weights_8x4_ver_layer1 }, { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x8 block. 
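/*
 * Illustrative aside (not part of libaom; the 8x8 tables continue below):
 * the NN_CONFIG initializers in this branch encode the net shape as
 * { num_inputs, num_outputs, num_hidden_layers, { hidden nodes }, ... },
 * so each layer-0 weight table holds num_inputs * hidden values and each
 * layer-1 table holds hidden * num_outputs values.  The typedef names below
 * are hypothetical compile-time size checks for the 8x4 vertical model just
 * above (4 inputs -> 8 hidden nodes -> 4 outputs).
 */
typedef char check_8x4_ver_l0_weights
    [sizeof(av1_tx_type_nn_weights_8x4_ver_layer0) == 4 * 8 * sizeof(float)
         ? 1
         : -1];
typedef char check_8x4_ver_l0_bias
    [sizeof(av1_tx_type_nn_bias_8x4_ver_layer0) == 8 * sizeof(float) ? 1 : -1];
typedef char check_8x4_ver_l1_weights
    [sizeof(av1_tx_type_nn_weights_8x4_ver_layer1) == 8 * 4 * sizeof(float)
         ? 1
         : -1];
typedef char check_8x4_ver_l1_bias
    [sizeof(av1_tx_type_nn_bias_8x4_ver_layer1) == 4 * sizeof(float) ? 1 : -1];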
static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, -0.99892f, 1.09823f, }; static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, -0.26319f, 2.65579f, -1.30137f, -0.01487f, }; static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, 0.06161f, }; static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { 1.70385f, 1.82373f, 1.78496f, 1.80826f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x8_hor_layer0, av1_tx_type_nn_weights_8x8_hor_layer1 }, { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, 0.25886f, 2.22095f, -0.17926f, 
0.57161f, 0.39546f, 0.47846f, 0.40452f, 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, -1.29848f, 0.39308f, }; static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, 0.83015f, 0.06024f, 1.17180f, 0.65122f, }; static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, -0.41305f, }; static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { 2.14067f, 2.76699f, 2.04233f, 1.34803f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x8_ver_layer0, av1_tx_type_nn_weights_8x8_ver_layer1 }, { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 8x16 block. 
static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, -0.28136f, 0.42556f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, 1.81560f, -1.02643f, -0.81690f, 0.08302f, }; static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, -1.31243f, }; static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { 0.83359f, 1.06875f, 1.77645f, 1.49570f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x16_hor_layer0, av1_tx_type_nn_weights_8x16_hor_layer1 }, { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } }; static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, -0.31176f, -0.05203f, 0.07247f, -0.26756f, 
0.22019f, 0.03412f, 0.33773f, 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, -0.12236f, 0.16075f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, 0.57598f, 0.99819f, 0.75175f, 0.17044f, }; static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, 2.20547f, }; static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { -0.44080f, -1.67455f, -1.46332f, -6.13206f, }; static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_8x16_ver_layer0, av1_tx_type_nn_weights_8x16_ver_layer1 }, { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } }; /******************************************************************************/ // Tx type model for 16x8 block. 
static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, -0.36570f, -0.50757f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, -0.12329f, 0.08986f, 1.08117f, -0.00220f, }; static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, -0.23347f, }; static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { 3.57175f, 2.42612f, 3.31259f, 2.08287f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_16x8_hor_layer0, av1_tx_type_nn_weights_16x8_hor_layer1 }, { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } }; static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, 1.83507f, 0.92570f, 
0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, -0.81945f, -0.41647f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, -0.04510f, 0.48000f, -0.09354f, -0.42422f, }; static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, -0.00873f, }; static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { 3.34981f, 3.74710f, 1.38339f, 0.45176f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_16x8_ver_layer0, av1_tx_type_nn_weights_16x8_ver_layer1 }, { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } }; /******************************************************************************/ // Tx type model for 16x16 block. 
static const float av1_tx_type_nn_weights_16x16_layer0[128] = { 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, 0.50355f, 0.08592f, }; static const float av1_tx_type_nn_bias_16x16_layer0[16] = { -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, -0.14062f, -0.42120f, 0.94573f, -0.09287f, }; static const float av1_tx_type_nn_weights_16x16_layer1[64] = { -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, 1.08829f, }; static const float av1_tx_type_nn_bias_16x16_layer1[4] = { 0.81986f, 1.26865f, 0.11118f, 2.48404f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_16x16_layer0, av1_tx_type_nn_weights_16x16_layer1, }, { av1_tx_type_nn_bias_16x16_layer0, av1_tx_type_nn_bias_16x16_layer1, }, }; /******************************************************************************/ // Tx type model for 4x16 block. 
static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, -1.74563f, -0.88830f, -1.77603f, 2.15935f, }; static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { -0.36435f, -2.22731f, -0.00837f, -1.34546f, 0.62806f, -0.20675f, 4.91940f, -0.56079f, }; static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, 1.28413f, -0.30326f, 2.45329f, -0.83335f, }; static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { 2.33198f, 3.36245f, 1.62603f, 2.91056f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x16_hor_layer0, av1_tx_type_nn_weights_4x16_hor_layer1 }, { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } }; static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, -0.27975f, -0.01149f, }; static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, -0.32530f, 0.73483f, 0.08322f, -0.23890f, }; static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, 0.41206f, 0.32373f, 
0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, -0.56513f, }; static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { 4.60896f, 4.53551f, 4.53124f, 4.27435f, }; static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_4x16_ver_layer0, av1_tx_type_nn_weights_4x16_ver_layer1 }, { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } }; /******************************************************************************/ // Tx type model for 16x4 block. static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, 0.19055f, -1.56413f, }; static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, 1.14048f, 0.33308f, -1.10886f, 0.41184f, }; static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, -0.43819f, }; static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { 2.32575f, 2.75703f, 1.12304f, 2.15567f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = { 8, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_type_nn_weights_16x4_hor_layer0, av1_tx_type_nn_weights_16x4_hor_layer1 }, { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } }; static const float 
av1_tx_type_nn_weights_16x4_ver_layer0[32] = { 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, -0.17967f, -0.96622f, 0.42635f, -1.04784f, }; static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { -0.52088f, 0.52844f, -1.03655f, -0.30974f, 2.59952f, -1.93604f, 0.00000f, 2.51787f, }; static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, 1.26814f, -1.93873f, -0.00768f, 1.58309f, }; static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { 2.34713f, 1.68667f, 1.25488f, 1.69812f, }; static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { 4, // num_inputs 4, // num_outputs 1, // num_hidden_layers { 8, }, // num_hidden_nodes { av1_tx_type_nn_weights_16x4_ver_layer0, av1_tx_type_nn_weights_16x4_ver_layer1 }, { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } }; /******************************************************************************/ // Map tx_size to its corresponding neural net model for tx type prediction. static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = { &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform &av1_tx_type_nnconfig_16x16, // 16x16 transform NULL, // 32x32 transform NULL, // 64x64 transform &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform NULL, // 16x32 transform NULL, // 32x16 transform NULL, // 32x64 transform NULL, // 64x32 transform &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform NULL, // 8x32 transform NULL, // 32x8 transform NULL, // 16x64 transform NULL, // 64x16 transform }; static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = { &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform &av1_tx_type_nnconfig_16x16, // 16x16 transform NULL, // 32x32 transform NULL, // 64x64 transform &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform NULL, // 16x32 transform NULL, // 32x16 transform NULL, // 32x64 transform NULL, // 64x32 transform &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform NULL, // 8x32 transform NULL, // 32x8 transform NULL, // 16x64 transform NULL, // 64x16 transform }; #endif // CONFIG_NN_V2 // Tx split model for 4x8 block. 
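/*
 * Editorial sketch (not part of the library): each NN_CONFIG table above and
 * below describes a small fully-connected network with a single hidden layer
 * (num_inputs, num_outputs, num_hidden_layers, num_hidden_nodes, then the
 * per-layer weight and bias arrays).  The function below shows how such a
 * table would typically be evaluated -- hidden layer with ReLU, linear output
 * layer, weights stored per output node -- under the usual row-major
 * convention.  The encoder's own inference routine may differ in details such
 * as precision handling, so treat this as an illustration of the data layout
 * rather than a reference implementation.
 */
static inline void nn_forward_sketch(const float *input, int num_inputs,
                                     const float *weights0, const float *bias0,
                                     int num_hidden, const float *weights1,
                                     const float *bias1, int num_outputs,
                                     float *hidden, float *output) {
  // Hidden layer: affine transform followed by ReLU.
  for (int h = 0; h < num_hidden; ++h) {
    float acc = bias0[h];
    for (int i = 0; i < num_inputs; ++i)
      acc += weights0[h * num_inputs + i] * input[i];
    hidden[h] = acc > 0.0f ? acc : 0.0f;
  }
  // Output layer: affine transform, no activation (raw scores).
  for (int o = 0; o < num_outputs; ++o) {
    float acc = bias1[o];
    for (int h = 0; h < num_hidden; ++h)
      acc += weights1[o * num_hidden + h] * hidden[h];
    output[o] = acc;
  }
}
/*
 * Hypothetical usage with the 4x16 horizontal tx-type model above (4 inputs,
 * 8 hidden nodes, 4 outputs).  A NULL entry in the nnconfig maps above means
 * no model exists for that transform size and the pruning step is skipped.
 *
 *   float hidden[8], scores[4];
 *   nn_forward_sketch(features, 4, av1_tx_type_nn_weights_4x16_hor_layer0,
 *                     av1_tx_type_nn_bias_4x16_hor_layer0, 8,
 *                     av1_tx_type_nn_weights_4x16_hor_layer1,
 *                     av1_tx_type_nn_bias_4x16_hor_layer1, 4, hidden, scores);
 */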
static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, -0.792429f, -0.385862f, }; static const float av1_tx_split_nn_bias_4x8_layer0[16] = { 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, 0.262171f, -1.598153f, -1.427340f, -1.602306f, }; static const float av1_tx_split_nn_weights_4x8_layer1[16] = { -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, 0.085082f, 0.614986f, 0.847904f, 0.637578f, }; static const float av1_tx_split_nn_bias_4x8_layer1[1] = { 0.20586078f, }; static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_split_nn_weights_4x8_layer0, av1_tx_split_nn_weights_4x8_layer1, }, { av1_tx_split_nn_bias_4x8_layer0, av1_tx_split_nn_bias_4x8_layer1, }, }; /******************************************************************************/ // Tx split model for 8x8 block. 
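/*
 * Editorial note (worked example, not library code): for every model in this
 * file the layer-0 weight count is num_inputs * num_hidden_nodes and the
 * layer-1 count is num_hidden_nodes * num_outputs.  For the 4x8 split model
 * above that gives 8 * 16 = 128 and 16 * 1 = 16 weights, which is exactly
 * what the array dimensions spell out.  A compile-time check of that
 * invariant could be sketched as follows (illustrative only):
 */
typedef char check_tx_split_4x8_layer0_size
    [sizeof(av1_tx_split_nn_weights_4x8_layer0) == 8 * 16 * sizeof(float) ? 1
                                                                          : -1];
typedef char check_tx_split_4x8_layer1_size
    [sizeof(av1_tx_split_nn_weights_4x8_layer1) == 16 * 1 * sizeof(float) ? 1
                                                                          : -1];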
static const float av1_tx_split_nn_weights_8x8_layer0[144] = { 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, }; static const float av1_tx_split_nn_bias_8x8_layer0[12] = { 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, }; static const float av1_tx_split_nn_weights_8x8_layer1[12] = { 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, }; static const float av1_tx_split_nn_bias_8x8_layer1[1] = { -0.156294f, }; static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { 12, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 12, }, // num_hidden_nodes { av1_tx_split_nn_weights_8x8_layer0, av1_tx_split_nn_weights_8x8_layer1, }, { av1_tx_split_nn_bias_8x8_layer0, av1_tx_split_nn_bias_8x8_layer1, }, }; /******************************************************************************/ // Tx split model for 8x16 block. 
static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f, 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, 0.175490f, -0.147805f, 
0.089169f, -0.045457f, -0.330788f, 0.099791f, -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, -0.408768f, 0.184693f, }; static const float av1_tx_split_nn_bias_8x16_layer0[64] = { -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, -0.255844f, -0.078400f, 0.476752f, 0.643001f, }; static const float av1_tx_split_nn_weights_8x16_layer1[64] = { -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f, 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, -0.108545f, -0.261181f, 1.435606f, -0.176621f, 
-1.158548f, 2.035680f, 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, -0.256734f, 0.177370f, 0.213522f, -0.530158f, }; static const float av1_tx_split_nn_bias_8x16_layer1[1] = { 0.14910713f, }; static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 64, }, // num_hidden_nodes { av1_tx_split_nn_weights_8x16_layer0, av1_tx_split_nn_weights_8x16_layer1, }, { av1_tx_split_nn_bias_8x16_layer0, av1_tx_split_nn_bias_8x16_layer1, }, }; /******************************************************************************/ // Tx split model for 16x16 block. static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f, 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, 0.352777f, 
-0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, }; static const float av1_tx_split_nn_bias_16x16_layer0[24] = { -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, }; static const float av1_tx_split_nn_weights_16x16_layer1[24] = { -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, }; static const float av1_tx_split_nn_bias_16x16_layer1[1] = { 0.184803f, }; static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { 12, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 24, }, // num_hidden_nodes { av1_tx_split_nn_weights_16x16_layer0, av1_tx_split_nn_weights_16x16_layer1, }, { av1_tx_split_nn_bias_16x16_layer0, av1_tx_split_nn_bias_16x16_layer1, }, }; /******************************************************************************/ // Tx split model for 32x32 block. 
static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f, 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, -0.405335f, 
0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, }; static const float av1_tx_split_nn_bias_32x32_layer0[32] = { 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, 0.254942f, -0.017796f, }; static const float av1_tx_split_nn_weights_32x32_layer1[32] = { -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, -0.068547f, -0.154148f, }; static const float av1_tx_split_nn_bias_32x32_layer1[1] = { 0.316622f, }; static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { 12, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 32, }, // num_hidden_nodes { av1_tx_split_nn_weights_32x32_layer0, av1_tx_split_nn_weights_32x32_layer1, }, { av1_tx_split_nn_bias_32x32_layer0, av1_tx_split_nn_bias_32x32_layer1, }, }; /******************************************************************************/ // Tx split model for 64x64 block. 
static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, 0.380967f, 0.009177f, 0.103075f, 
0.116417f, 0.226273f, -0.327746f, 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, }; static const float av1_tx_split_nn_bias_64x64_layer0[32] = { 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, -0.365437f, 0.229255f, }; static const float av1_tx_split_nn_weights_64x64_layer1[32] = { 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, 0.207812f, 0.513560f, }; static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { 12, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 32, }, // num_hidden_nodes { av1_tx_split_nn_weights_64x64_layer0, av1_tx_split_nn_weights_64x64_layer1, }, { av1_tx_split_nn_bias_64x64_layer0, av1_tx_split_nn_bias_64x64_layer1, }, }; /******************************************************************************/ // Tx split model for 4x16 block. 
static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, -1.416451f, -0.166467f, }; static const float av1_tx_split_nn_bias_4x16_layer0[16] = { 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, -1.191704f, -3.800073f, 4.121552f, -1.399397f, }; static const float av1_tx_split_nn_weights_4x16_layer1[16] = { -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, 0.462109f, 0.343315f, 1.092593f, 0.483152f, }; static const float av1_tx_split_nn_bias_4x16_layer1[1] = { 0.8205083f, }; static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_split_nn_weights_4x16_layer0, av1_tx_split_nn_weights_4x16_layer1, }, { av1_tx_split_nn_bias_4x16_layer0, av1_tx_split_nn_bias_4x16_layer1, }, }; /******************************************************************************/ // Tx split model for 16x32 block. 
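/*
 * Editorial sketch (an assumption about how the scores are consumed, not
 * library code): every tx-split model in this file has num_outputs == 1,
 * i.e. it emits a single raw score.  If that score is read as a logit, a
 * split probability follows from the logistic function, and comparing the
 * raw score against log(t / (1 - t)) is equivalent to thresholding the
 * probability at t.  Note that the +/-0.405465f thresholds for the intra 8x8
 * model later in this file are exactly +/-log(1.5), i.e. probability cuts of
 * 0.6 and 0.4 under this reading.
 */
#include <math.h>  // expf(), used only by the illustrative helper below
static inline float tx_split_prob_from_score_sketch(float score) {
  // Logistic function: maps a raw logit-style score to a probability in (0, 1).
  return 1.0f / (1.0f + expf(-score));
}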
static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, -0.129147f, 0.045916f, -0.606865f, -0.101378f, }; static const float av1_tx_split_nn_bias_16x32_layer0[32] = { 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, -0.294389f, 1.456413f, }; static const float av1_tx_split_nn_weights_16x32_layer1[32] = { 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, 
-0.580927f, -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, 0.418904f, 1.792187f, }; static const float av1_tx_split_nn_bias_16x32_layer1[1] = { -0.29233751f, }; static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 32, }, // num_hidden_nodes { av1_tx_split_nn_weights_16x32_layer0, av1_tx_split_nn_weights_16x32_layer1, }, { av1_tx_split_nn_bias_16x32_layer0, av1_tx_split_nn_bias_16x32_layer1, }, }; /******************************************************************************/ // Tx split model for 32x64 block. static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f, 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, 0.005844f, -0.005010f, -1.857514f, -0.282426f, 
0.010177f, -0.214727f, -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, 0.440626f, -0.158048f, -0.461031f, -0.146280f, }; static const float av1_tx_split_nn_bias_32x64_layer0[32] = { 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, 0.552712f, 0.299410f, }; static const float av1_tx_split_nn_weights_32x64_layer1[32] = { 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, 0.352981f, 0.111265f, }; static const float av1_tx_split_nn_bias_32x64_layer1[1] = { -0.18160765f, }; static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 32, }, // num_hidden_nodes { av1_tx_split_nn_weights_32x64_layer0, av1_tx_split_nn_weights_32x64_layer1, }, { av1_tx_split_nn_bias_32x64_layer0, av1_tx_split_nn_bias_32x64_layer1, }, }; /******************************************************************************/ // Tx split model for 8x32 block. static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, 0.259713f, 
0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, }; static const float av1_tx_split_nn_bias_8x32_layer0[24] = { -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, }; static const float av1_tx_split_nn_weights_8x32_layer1[24] = { 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, }; static const float av1_tx_split_nn_bias_8x32_layer1[1] = { 0.13435879f, }; static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 24, }, // num_hidden_nodes { av1_tx_split_nn_weights_8x32_layer0, av1_tx_split_nn_weights_8x32_layer1, }, { av1_tx_split_nn_bias_8x32_layer0, av1_tx_split_nn_bias_8x32_layer1, }, }; /******************************************************************************/ // Tx split model for 16x32 block. static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = { -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f, -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f, -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f, 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f, -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f, -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f, -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f, 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f, -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f, -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f, -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f, 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f, -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f, -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f, 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f, 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f, -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f, -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f, -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f, -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f, 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f, 0.101996f, 0.120878f, }; static const float av1_tx_split_nn_bias_16x64_layer0[16] = { 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f, -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f, -0.299866f, -0.103079f, -0.190352f, -0.048121f, }; static const float av1_tx_split_nn_weights_16x64_layer1[16] = { -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f, 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f, 0.348337f, -0.205082f, 0.347129f, -0.322277f, }; 
static const float av1_tx_split_nn_bias_16x64_layer1[1] = { 0.04230947f, }; static const NN_CONFIG av1_tx_split_nnconfig_16x64 = { 8, // num_inputs 1, // num_outputs 1, // num_hidden_layers { 16, }, // num_hidden_nodes { av1_tx_split_nn_weights_16x64_layer0, av1_tx_split_nn_weights_16x64_layer1, }, { av1_tx_split_nn_bias_16x64_layer0, av1_tx_split_nn_bias_16x64_layer1, }, }; /******************************************************************************/ // Map block size to its corresponding neural net model for tx split prediction. static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { NULL, // TX_4X4, &av1_tx_split_nnconfig_8x8, // TX_8X8, &av1_tx_split_nnconfig_16x16, // TX_16X16, &av1_tx_split_nnconfig_32x32, // TX_32X32, &av1_tx_split_nnconfig_64x64, // TX_64X64, &av1_tx_split_nnconfig_4x8, // TX_4X8, &av1_tx_split_nnconfig_4x8, // TX_8X4, &av1_tx_split_nnconfig_8x16, // TX_8X16, &av1_tx_split_nnconfig_8x16, // TX_16X8, &av1_tx_split_nnconfig_16x32, // TX_16X32, &av1_tx_split_nnconfig_16x32, // TX_32X16, &av1_tx_split_nnconfig_32x64, // TX_32X64, &av1_tx_split_nnconfig_32x64, // TX_64X32, &av1_tx_split_nnconfig_4x16, // TX_4X16, &av1_tx_split_nnconfig_4x16, // TX_16X4, &av1_tx_split_nnconfig_8x32, // TX_8X32, &av1_tx_split_nnconfig_8x32, // TX_32X8, &av1_tx_split_nnconfig_16x64, // TX_16X64, &av1_tx_split_nnconfig_16x64, // TX_64X16, }; #if !CONFIG_REALTIME_ONLY #define NUM_INTRA_TX_SPLIT_FEATURES 14 #define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1 #define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16 // Model to prune intra transform depth for intra 8x8 block. static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f, 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f, 9.702971f, 14.300809f, 6.018646f, 3.682534f, }; static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = { 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f, 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f, 8.625048f, 10.456774f, 1.185447f, 1.810423f, }; static const float av1_intra_tx_split_nn_weights_8x8_layer0 [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f, 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f, 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f, 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f, -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f, -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f, -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f, -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f, -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f, -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f, 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f, -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f, 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f, -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f, 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f, -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f, 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f, -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f, 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f, 0.086606f, 0.853527f, 
-0.180971f, -1.255744f, -0.152979f, -1.022198f, -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f, -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f, -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f, 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f, -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f, -0.246309f, -0.355575f, -0.048809f, 0.217113f, 0.078385f, 0.720341f, 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f, -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f, 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f, 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f, -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f, 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f, 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f, -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f, -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f, 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f, -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f, -0.054142f, -0.102266f, }; static const float av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f, -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f, 0.434197f, -0.746518f, 0.123085f, -0.549836f, }; static const float av1_intra_tx_split_nn_weights_8x8_layer1 [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f, -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f, -0.491619f, -1.482014f, 0.524625f, -0.533590f, }; static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = { -0.488888f, }; static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = { NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs 1, // num_outputs NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers { NUM_INTRA_TX_SPLIT_HIDDEN_NODES, }, // num_hidden_nodes { av1_intra_tx_split_nn_weights_8x8_layer0, av1_intra_tx_split_nn_weights_8x8_layer1, }, { av1_intra_tx_split_nn_bias_8x8_layer0, av1_intra_tx_split_nn_bias_8x8_layer1, }, }; static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f, 0.405465f }; #endif // !CONFIG_REALTIME_ONLY #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ aom-3.12.1/av1/encoder/tx_search.c000066400000000000000000004560311477627663500166440ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "av1/common/cfl.h" #include "av1/common/reconintra.h" #include "av1/encoder/block.h" #include "av1/encoder/hybrid_fwd_txfm.h" #include "av1/common/idct.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/random.h" #include "av1/encoder/rdopt_utils.h" #include "av1/encoder/sorting_network.h" #include "av1/encoder/tx_prune_model_weights.h" #include "av1/encoder/tx_search.h" #include "av1/encoder/txb_rdopt.h" #define PROB_THRESH_OFFSET_TX_TYPE 100 struct rdcost_block_args { const AV1_COMP *cpi; MACROBLOCK *x; ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; RD_STATS rd_stats; int64_t current_rd; int64_t best_rd; int exit_early; int incomplete_exit; FAST_TX_SEARCH_MODE ftxs_mode; int skip_trellis; }; typedef struct { int64_t rd; int txb_entropy_ctx; TX_TYPE tx_type; } TxCandidateInfo; // origin_threshold * 128 / 100 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { { 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, }, { 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, }, { 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, }, }; // lookup table for predict_skip_txfm // int max_tx_size = max_txsize_rect_lookup[bsize]; // if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) // max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, TX_8X8, TX_8X8, TX_16X16, TX_16X16, }; // look-up table for sqrt of number of pixels in a transform block // rounded up to the nearest integer. static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6, 12, 12, 23, 23, 32, 32, 8, 8, 16, 16, 23, 23 }; static inline uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { const int rows = block_size_high[bsize]; const int cols = block_size_wide[bsize]; const int16_t *diff = x->plane[0].src_diff; const uint32_t hash = av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator, (uint8_t *)diff, 2 * rows * cols); return (hash << 5) + bsize; } static inline int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, const int64_t ref_best_rd, const uint32_t hash) { int32_t match_index = -1; if (ref_best_rd != INT64_MAX) { for (int i = 0; i < mb_rd_record->num; ++i) { const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; // If there is a match in the mb_rd_record, fetch the RD decision and // terminate early. 
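// Only the CRC of the residual combined with bsize (see
// get_block_residue_hash()) is compared here, so an incorrect match can
// only occur on a hash collision, which is assumed to be rare.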
if (mb_rd_record->mb_rd_info[index].hash_value == hash) { match_index = index; break; } } } return match_index; } static inline void fetch_mb_rd_info(int n4, const MB_RD_INFO *const mb_rd_info, RD_STATS *const rd_stats, MACROBLOCK *const x) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = mb_rd_info->tx_size; memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip, sizeof(mb_rd_info->blk_skip[0]) * n4); av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size); av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4); *rd_stats = mb_rd_info->rd_stats; } int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, unsigned int *block_mse_q8) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, NULL, &visible_cols, &visible_rows); const int diff_stride = block_size_wide[plane_bsize]; const int16_t *diff = x->plane[plane].src_diff; diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); uint64_t sse = aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); if (block_mse_q8 != NULL) { if (visible_cols > 0 && visible_rows > 0) *block_mse_q8 = (unsigned int)((256 * sse) / (visible_cols * visible_rows)); else *block_mse_q8 = UINT_MAX; } return sse; } // Computes the residual block's SSE and mean on all visible 4x4s in the // transform block static inline int64_t pixel_diff_stats( MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, NULL, &visible_cols, &visible_rows); const int diff_stride = block_size_wide[plane_bsize]; const int16_t *diff = x->plane[plane].src_diff; diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); uint64_t sse = 0; int sum = 0; sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum); if (visible_cols > 0 && visible_rows > 0) { double norm_factor = 1.0 / (visible_cols * visible_rows); int sign_sum = sum > 0 ? 1 : -1; // Conversion to transform domain *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7; *per_px_mean = sign_sum * (*per_px_mean); *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse)); *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum)); } else { *block_mse_q8 = UINT_MAX; } return sse; } // Uses simple features on top of DCT coefficients to quickly predict // whether optimal RD decision is to skip encoding the residual. // The sse value is stored in dist. static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, int reduced_tx_set) { const TxfmSearchParams *txfm_params = &x->txfm_search_params; const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; const MACROBLOCKD *xd = &x->e_mbd; const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); const int64_t mse = *dist / bw / bh; // Normalized quantizer takes the transform upscaling factor (8 for tx size // smaller than 32) into account. 
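// Illustration (values chosen for arithmetic only): with dc_q = 1024 in QTX
// units, normalized_dc_q = 1024 >> 3 = 128 and mse_thresh = 128 * 128 / 8 =
// 2048, so the block remains a skip candidate only when pred_err <= 2048.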
const int16_t normalized_dc_q = dc_q >> 3; const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; // For faster early skip decision, use dist to compare against threshold so // that quality risk is less for the skip=1 decision. Otherwise, use mse // since the fwd_txfm coeff checks will take care of quality // TODO(any): Use dist to return 0 when skip_txfm_level is 1 int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse; // Predict not to skip when error is larger than threshold. if (pred_err > mse_thresh) return 0; // Return as skip otherwise for aggressive early skip else if (txfm_params->skip_txfm_level >= 2) return 1; const int max_tx_size = max_predict_sf_tx_size[bsize]; const int tx_h = tx_size_high[max_tx_size]; const int tx_w = tx_size_wide[max_tx_size]; DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); TxfmParam param; param.tx_type = DCT_DCT; param.tx_size = max_tx_size; param.bd = xd->bd; param.is_hbd = is_cur_buf_hbd(xd); param.lossless = 0; param.tx_set_type = av1_get_ext_tx_set_type( param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; const int16_t *src_diff = x->plane[0].src_diff; const int n_coeff = tx_w * tx_h; const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); const uint32_t dc_thresh = max_qcoef_thresh * dc_q; const uint32_t ac_thresh = max_qcoef_thresh * ac_q; for (int row = 0; row < bh; row += tx_h) { for (int col = 0; col < bw; col += tx_w) { av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); // Operating on TX domain, not pixels; we want the QTX quantizers const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); if (dc_coef >= dc_thresh) return 0; for (int i = 1; i < n_coeff; ++i) { const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); if (ac_coef >= ac_thresh) return 0; } } src_diff += tx_h * bw; } return 1; } // Used to set proper context for early termination with skip = 1. static inline void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t dist) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int n4 = bsize_to_num_blk(bsize); const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4); memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); mbmi->tx_size = tx_size; for (int i = 0; i < n4; ++i) set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1); rd_stats->skip_txfm = 1; if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); rd_stats->dist = rd_stats->sse = (dist << 4); // Though decision is to make the block as skip based on luma stats, // it is possible that block becomes non skip after chroma rd. In addition // intermediate non skip costs calculated by caller function will be // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not // accounted). Hence intermediate rate is populated to code the luma tx blks // as skip, the caller function based on final rd decision (i.e., skip vs // non-skip) sets the final rate accordingly. Here the rate populated // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx // size possible) in the current block. 
Eg: For 128*128 block, rate would be // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx // block as 'all zeros' ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); ENTROPY_CONTEXT *ta = ctxa; ENTROPY_CONTEXT *tl = ctxl; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); TXB_CTX txb_ctx; get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx); const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->rate = zero_blk_rate * (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) * (block_size_high[bsize] >> tx_size_high_log2[tx_size]); } static inline void save_mb_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, const RD_STATS *const rd_stats, MB_RD_RECORD *mb_rd_record) { int index; if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) { index = (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN; ++mb_rd_record->num; } else { index = mb_rd_record->index_start; mb_rd_record->index_start = (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; } MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index]; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; mb_rd_info->hash_value = hash; mb_rd_info->tx_size = mbmi->tx_size; memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip, sizeof(mb_rd_info->blk_skip[0]) * n4); av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size); av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4); mb_rd_info->rd_stats = *rd_stats; } static int get_search_init_depth(int mi_width, int mi_height, int is_inter, const SPEED_FEATURES *sf, int tx_size_search_method) { if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH; if (sf->tx_sf.tx_size_search_lgr_block) { if (mi_width > mi_size_wide[BLOCK_64X64] || mi_height > mi_size_high[BLOCK_64X64]) return MAX_VARTX_DEPTH; } if (is_inter) { return (mi_height != mi_width) ? sf->tx_sf.inter_tx_size_search_init_depth_rect : sf->tx_sf.inter_tx_size_search_init_depth_sqr; } else { return (mi_height != mi_width) ? sf->tx_sf.intra_tx_size_search_init_depth_rect : sf->tx_sf.intra_tx_size_search_init_depth_sqr; } } static inline void select_tx_block( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode); // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values // 0: Do not collect any RD stats // 1: Collect RD stats for transform units // 2: Collect RD stats for partition units #if CONFIG_COLLECT_RD_STATS static inline void get_energy_distribution_fine( const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int need_4th, double *hordist, double *verdist) { const int bw = block_size_wide[bsize]; const int bh = block_size_high[bsize]; unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { // Special cases: calculate 'esq' values manually, as we don't have 'vf' // functions for the 16 (very small) sub-blocks of this block. const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 
2 : 3; assert(bw <= 32); assert(bh <= 32); assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); if (cpi->common.seq_params->use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (int i = 0; i < bh; ++i) for (int j = 0; j < bw; ++j) { const int index = (j >> w_shift) + ((i >> h_shift) << 2); esq[index] += (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * (src16[j + i * src_stride] - dst16[j + i * dst_stride]); } } else { for (int i = 0; i < bh; ++i) for (int j = 0; j < bw; ++j) { const int index = (j >> w_shift) + ((i >> h_shift) << 2); esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * (src[j + i * src_stride] - dst[j + i * dst_stride]); } } } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. const int f_index = (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16; assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); assert(block_size_high[bsize] == 4 * block_size_high[subsize]); cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[1]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[2]); cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[3]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[5]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[6]); cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[7]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[9]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[10]); cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[11]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, &esq[13]); cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, &esq[14]); cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, dst_stride, &esq[15]); } double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + esq[12] + esq[13] + esq[14] + esq[15]; if (total > 0) { const double e_recip = 1.0 / total; hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; if (need_4th) { hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; } verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; if (need_4th) { verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * 
e_recip; } } else { hordist[0] = verdist[0] = 0.25; hordist[1] = verdist[1] = 0.25; hordist[2] = verdist[2] = 0.25; if (need_4th) { hordist[3] = verdist[3] = 0.25; } } } static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { const int err = diff[j * stride + i]; sum += err * err; } } assert(w > 0 && h > 0); return sum / (w * h); } static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { sum += abs(diff[j * stride + i]); } } assert(w > 0 && h > 0); return sum / (w * h); } static inline void get_2x2_normalized_sses_and_sads( const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, int src_stride, const uint8_t *const dst, int dst_stride, const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, double *const sad_norm_arr) { const BLOCK_SIZE tx_bsize_half = get_partition_subsize(tx_bsize, PARTITION_SPLIT); if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats const int half_width = block_size_wide[tx_bsize] / 2; const int half_height = block_size_high[tx_bsize] / 2; for (int row = 0; row < 2; ++row) { for (int col = 0; col < 2; ++col) { const int16_t *const this_src_diff = src_diff + row * half_height * diff_stride + col * half_width; if (sse_norm_arr) { sse_norm_arr[row * 2 + col] = get_sse_norm(this_src_diff, diff_stride, half_width, half_height); } if (sad_norm_arr) { sad_norm_arr[row * 2 + col] = get_sad_norm(this_src_diff, diff_stride, half_width, half_height); } } } } else { // use function pointers to calculate stats const int half_width = block_size_wide[tx_bsize_half]; const int half_height = block_size_high[tx_bsize_half]; const int num_samples_half = half_width * half_height; for (int row = 0; row < 2; ++row) { for (int col = 0; col < 2; ++col) { const uint8_t *const this_src = src + row * half_height * src_stride + col * half_width; const uint8_t *const this_dst = dst + row * half_height * dst_stride + col * half_width; if (sse_norm_arr) { unsigned int this_sse; cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, dst_stride, &this_sse); sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; } if (sad_norm_arr) { const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf( this_src, src_stride, this_dst, dst_stride); sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; } } } } } #if CONFIG_COLLECT_RD_STATS == 1 static double get_mean(const int16_t *diff, int stride, int w, int h) { double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { sum += diff[j * stride + i]; } } assert(w > 0 && h > 0); return sum / (w * h); } static inline void PrintTransformUnitStats( const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, TX_TYPE tx_type, int64_t rd) { if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; // Generate small sample to restrict output size. 
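// Roughly 1 in 256 transform units pass the lcg_rand16() check below and
// get appended to tu_stats.txt.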
static unsigned int seed = 21743; if (lcg_rand16(&seed) % 256 > 0) return; const char output_file[] = "tu_stats.txt"; FILE *fout = fopen(output_file, "a"); if (!fout) return; const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const MACROBLOCKD *const xd = &x->e_mbd; const int plane = 0; struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int txw = tx_size_wide[tx_size]; const int txh = tx_size_high[tx_size]; const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; const int q_step = p->dequant_QTX[1] >> dequant_shift; const int num_samples = txw * txh; const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; fprintf(fout, "%g %g", rate_norm, dist_norm); const int src_stride = p->src.stride; const uint8_t *const src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; const int dst_stride = pd->dst.stride; const uint8_t *const dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; unsigned int sse; cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); const double sse_norm = (double)sse / num_samples; const unsigned int sad = cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / num_samples; fprintf(fout, " %g %g", sse_norm, sad_norm); const int diff_stride = block_size_wide[plane_bsize]; const int16_t *const src_diff = &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; double sse_norm_arr[4], sad_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, sse_norm_arr, sad_norm_arr); for (int i = 0; i < 4; ++i) { fprintf(fout, " %g", sse_norm_arr[i]); } for (int i = 0; i < 4; ++i) { fprintf(fout, " %g", sad_norm_arr[i]); } const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); int model_rate; int64_t model_dist; model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, &model_rate, &model_dist); const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); const double mean = get_mean(src_diff, diff_stride, txw, txh); float hor_corr, vert_corr; av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr, &vert_corr); fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); double hdist[4] = { 0 }, vdist[4] = { 0 }; get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, 1, hdist, vdist); fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); fprintf(fout, " %d %" PRId64, x->rdmult, rd); fprintf(fout, "\n"); fclose(fout); } #endif // CONFIG_COLLECT_RD_STATS == 1 #if CONFIG_COLLECT_RD_STATS >= 2 static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); const MACROBLOCKD *xd = &x->e_mbd; const MB_MODE_INFO *mbmi = xd->mi[0]; int64_t total_sse = 0; for (int plane = 0; plane < num_planes; ++plane) { const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(mbmi->bsize, 
pd->subsampling_x, pd->subsampling_y); unsigned int sse; if (plane) continue; cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); total_sse += sse; } total_sse <<= 4; return total_sse; } static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, int64_t sse, int *est_residue_cost, int64_t *est_dist) { const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (md->ready) { if (sse < md->dist_mean) { *est_residue_cost = 0; *est_dist = sse; } else { *est_dist = (int64_t)round(md->dist_mean); const double est_ld = md->a * sse + md->b; // Clamp estimated rate cost by INT_MAX / 2. // TODO(angiebird@google.com): find better solution than clamping. if (fabs(est_ld) < 1e-2) { *est_residue_cost = INT_MAX / 2; } else { double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); if (est_residue_cost_dbl < 0) { *est_residue_cost = 0; } else { *est_residue_cost = (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); } } if (*est_residue_cost <= 0) { *est_residue_cost = 0; *est_dist = sse; } } return 1; } return 0; } static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, const uint8_t *dst8, int dst_stride, int w, int h) { const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; sum += diff; } } assert(w > 0 && h > 0); return sum / (w * h); } static double get_diff_mean(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h) { double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; sum += diff; } } assert(w > 0 && h > 0); return sum / (w * h); } static inline void PrintPredictionUnitStats(const AV1_COMP *const cpi, const TileDataEnc *tile_data, MACROBLOCK *x, const RD_STATS *const rd_stats, BLOCK_SIZE plane_bsize) { if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && (tile_data == NULL || !tile_data->inter_mode_rd_models[plane_bsize].ready)) return; (void)tile_data; // Generate small sample to restrict output size. static unsigned int seed = 95014; if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != 1) return; const char output_file[] = "pu_stats.txt"; FILE *fout = fopen(output_file, "a"); if (!fout) return; MACROBLOCKD *const xd = &x->e_mbd; const int plane = 0; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const int diff_stride = block_size_wide[plane_bsize]; int bw, bh; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); const int num_samples = bw * bh; const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; const int q_step = p->dequant_QTX[1] >> dequant_shift; const int shift = (xd->bd - 8); const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; const double rdcost_norm = (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); const int src_stride = p->src.stride; const uint8_t *const src = p->src.buf; const int dst_stride = pd->dst.stride; const uint8_t *const dst = pd->dst.buf; const int16_t *const src_diff = p->src_diff; int64_t sse = calculate_sse(xd, p, pd, bw, bh); const double sse_norm = (double)sse / num_samples; const unsigned int sad = cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); fprintf(fout, " %g %g", sse_norm, sad_norm); double sse_norm_arr[4], sad_norm_arr[4]; get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, sse_norm_arr, sad_norm_arr); if (shift) { for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); } for (int i = 0; i < 4; ++i) { fprintf(fout, " %g", sse_norm_arr[i]); } for (int i = 0; i < 4; ++i) { fprintf(fout, " %g", sad_norm_arr[i]); } fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); int model_rate; int64_t model_dist; model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, &model_rate, &model_dist); const double model_rdcost_norm = (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; const double model_rate_norm = (double)model_rate / num_samples; const double model_dist_norm = (double)model_dist / num_samples; fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, model_rdcost_norm); double mean; if (is_cur_buf_hbd(xd)) { mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } mean /= (1 << shift); float hor_corr, vert_corr; av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr); fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); double hdist[4] = { 0 }, vdist[4] = { 0 }; get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, dst_stride, 1, hdist, vdist); fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { assert(tile_data->inter_mode_rd_models[plane_bsize].ready); const int64_t overall_sse = get_sse(cpi, x); int est_residue_cost = 0; int64_t est_dist = 0; get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, &est_dist); const double est_residue_cost_norm = (double)est_residue_cost / num_samples; const double est_dist_norm = (double)est_dist / num_samples; const double est_rdcost_norm = (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, est_rdcost_norm); } fprintf(fout, "\n"); fclose(fout); } #endif // CONFIG_COLLECT_RD_STATS >= 2 #endif // CONFIG_COLLECT_RD_STATS static inline void inverse_transform_block_facade(MACROBLOCK *const x, int plane, int block, int blk_row, int blk_col, int eob, int reduced_tx_set) { if (!eob) return; struct macroblock_plane *const p = &x->plane[plane]; MACROBLOCKD *const xd = &x->e_mbd; 
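// Locate this block's dequantized coefficients and its pixel position in the
// destination buffer; the inverse transform below adds the decoded residual
// on top of the prediction already stored there.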
tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); const PLANE_TYPE plane_type = get_plane_type(plane); const TX_SIZE tx_size = av1_get_tx_size(plane, xd); const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, reduced_tx_set); struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, dst_stride, eob, reduced_tx_set); } static inline void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx, int skip_trellis, TX_TYPE best_tx_type, int do_quant, int *rate_cost, uint16_t best_eob) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); if (!is_inter && best_eob && (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { // if the quantized coefficients are stored in the dqcoeff buffer, we don't // need to do transform and quantization again. if (do_quant) { TxfmParam txfm_param_intra; QUANT_PARAM quant_param_intra; av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra); av1_setup_quant(tx_size, !skip_trellis, skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP) : AV1_XFORM_QUANT_FP, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra); av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type, &quant_param_intra); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param_intra, &quant_param_intra); if (quant_param_intra.use_optimize_b) { av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, rate_cost); } } inverse_transform_block_facade(x, plane, block, blk_row, blk_col, x->plane[plane].eobs[block], cm->features.reduced_tx_set_used); // This may happen because of hash collision. The eob stored in the hash // table is non-zero, but the real eob is zero. We need to make sure tx_type // is DCT_DCT in this case. if (plane == 0 && x->plane[plane].eobs[block] == 0 && best_tx_type != DCT_DCT) { update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); } } } static unsigned pixel_dist_visible_only( const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, const int src_stride, const uint8_t *dst, const int dst_stride, const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, int visible_cols) { unsigned sse; if (txb_rows == visible_rows && txb_cols == visible_cols) { cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; } #if CONFIG_AV1_HIGHBITDEPTH const MACROBLOCKD *xd = &x->e_mbd; if (is_cur_buf_hbd(xd)) { uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, visible_rows); return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); } #else (void)x; #endif sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, visible_rows); return sse; } // Compute the pixel domain distortion from src and dst on all visible 4x4s in // the // transform block. 
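// Rows/columns that fall outside the visible frame area are excluded via
// get_txb_dimensions(), so partially visible blocks are not penalized for
// padding pixels.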
static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, int plane, const uint8_t *src, const int src_stride, const uint8_t *dst, const int dst_stride, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize) { int txb_rows, txb_cols, visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, &txb_cols, &txb_rows, &visible_cols, &visible_rows); assert(visible_rows > 0); assert(visible_cols > 0); unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride, tx_bsize, txb_rows, txb_cols, visible_rows, visible_cols); return sse; } static inline int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const uint16_t eob = p->eobs[block]; const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const int bsw = block_size_wide[tx_bsize]; const int bsh = block_size_high[tx_bsize]; const int src_stride = x->plane[plane].src.stride; const int dst_stride = xd->plane[plane].dst.stride; // Scale the transform block index to pixel unit. const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2; const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2; const uint8_t *src = &x->plane[plane].src.buf[src_idx]; const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); assert(cpi != NULL); assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); uint8_t *recon; DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { recon = CONVERT_TO_BYTEPTR(recon16); aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh); } else { recon = (uint8_t *)recon16; aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); } #else recon = (uint8_t *)recon16; aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); #endif const PLANE_TYPE plane_type = get_plane_type(plane); TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, cpi->common.features.reduced_tx_set_used); av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, MAX_TX_SIZE, eob, cpi->common.features.reduced_tx_set_used); return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, blk_row, blk_col, plane_bsize, tx_bsize); } // pruning thresholds for prune_txk_type and prune_txk_type_separ static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 // R-D costs are sorted in ascending order. 
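// Simple insertion sort that keeps the rds[] values and their matching txk[]
// indices in lockstep; len never exceeds TX_TYPES (16), so the quadratic cost
// is negligible.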
static inline void sort_rd(int64_t rds[], int txk[], int len) { int i, j, k; for (i = 1; i <= len - 1; ++i) { for (j = 0; j < i; ++j) { if (rds[j] > rds[i]) { int64_t temprd; int tempi; temprd = rds[i]; tempi = txk[i]; for (k = i; k > j; k--) { rds[k] = rds[k - 1]; txk[k] = txk[k - 1]; } rds[j] = temprd; txk[j] = tempi; break; } } } } static inline int64_t av1_block_error_qm( const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, const qm_val_t *qmatrix, const int16_t *scan, int64_t *ssz, int bd) { int i; int64_t error = 0, sqcoeff = 0; int shift = 2 * (bd - 8); int rounding = (1 << shift) >> 1; for (i = 0; i < block_size; i++) { int64_t weight = qmatrix[scan[i]]; int64_t dd = coeff[i] - dqcoeff[i]; dd *= weight; int64_t cc = coeff[i]; cc *= weight; // The ranges of coeff and dqcoeff are // bd8 : 18 bits (including sign) // bd10: 20 bits (including sign) // bd12: 22 bits (including sign) // As AOM_QM_BITS is 5, the intermediate quantities in the calculation // below should fit in 54 bits, thus no overflow should happen. error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); } error = (error + rounding) >> shift; sqcoeff = (sqcoeff + rounding) >> shift; *ssz = sqcoeff; return error; } static inline void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, const qm_val_t *qmatrix, const int16_t *scan, int64_t *out_dist, int64_t *out_sse) { const struct macroblock_plane *const p = &x->plane[plane]; // Transform domain distortion computation is more efficient as it does // not involve an inverse transform, but it is less accurate. const int buffer_length = av1_get_max_eob(tx_size); int64_t this_sse; // TX-domain results need to shift down to Q2/D10 to match pixel // domain distortion values which are in Q2^2 int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; const int block_offset = BLOCK_OFFSET(block); tran_low_t *const coeff = p->coeff + block_offset; tran_low_t *const dqcoeff = p->dqcoeff + block_offset; #if CONFIG_AV1_HIGHBITDEPTH MACROBLOCKD *const xd = &x->e_mbd; if (is_cur_buf_hbd(xd)) { if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) { *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, xd->bd); } else { *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix, scan, &this_sse, xd->bd); } } else { #endif if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) { *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); } else { *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix, scan, &this_sse, 8); } #if CONFIG_AV1_HIGHBITDEPTH } #endif *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); } static uint16_t prune_txk_type_separ( const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, int16_t allowed_tx_mask, int prune_factor, const TXB_CTX *const txb_ctx, int reduced_tx_set_used, int64_t ref_best_rd, int num_sel) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int idx; int64_t rds_v[4]; int64_t rds_h[4]; int idx_v[4] = { 0, 1, 2, 3 }; int idx_h[4] = { 0, 1, 2, 3 }; int skip_v[4] = { 0 }; int skip_h[4] = { 0 }; const int idx_map[16] = { DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, H_DCT, H_ADST, H_FLIPADST, IDTX 
}; const int sel_pattern_v[16] = { 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3 }; const int sel_pattern_h[16] = { 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3 }; QUANT_PARAM quant_param; TxfmParam txfm_param; av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); int tx_type; // to ensure we can try ones even outside of ext_tx_set of current block // this function should only be called for size < 16 assert(txsize_sqr_up_map[tx_size] <= TX_16X16); txfm_param.tx_set_type = EXT_TX_SET_ALL16; int rate_cost = 0; int64_t dist = 0, sse = 0; // evaluate horizontal with vertical DCT for (idx = 0; idx < 4; ++idx) { tx_type = idx_map[idx]; txfm_param.tx_type = tx_type; av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, scan_order->scan, &dist, &sse); rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, txb_ctx, reduced_tx_set_used, 0); rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist); if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) { skip_h[idx] = 1; } } sort_rd(rds_h, idx_h, 4); for (idx = 1; idx < 4; idx++) { if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1; } if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF; // evaluate vertical with the best horizontal chosen rds_v[0] = rds_h[0]; int start_v = 1, end_v = 4; const int *idx_map_v = idx_map + idx_h[0]; for (idx = start_v; idx < end_v; ++idx) { tx_type = idx_map_v[idx_v[idx] * 4]; txfm_param.tx_type = tx_type; av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, scan_order->scan, &dist, &sse); rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, txb_ctx, reduced_tx_set_used, 0); rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist); if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) { skip_v[idx] = 1; } } sort_rd(rds_v, idx_v, 4); for (idx = 1; idx < 4; idx++) { if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1; } // combine rd_h and rd_v to prune tx candidates int i_v, i_h; int64_t rds[16]; int num_cand = 0, last = TX_TYPES - 1; for (int i = 0; i < 16; i++) { i_v = sel_pattern_v[i]; i_h = sel_pattern_h[i]; tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]]; if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] || skip_v[idx_v[i_v]]) { txk_map[last] = tx_type; last--; } else { txk_map[num_cand] = tx_type; rds[num_cand] = rds_v[i_v] + rds_h[i_h]; if (rds[num_cand] == 0) rds[num_cand] = 1; num_cand++; } } sort_rd(rds, txk_map, num_cand); uint16_t prune = (uint16_t)(~(1 << txk_map[0])); num_sel = AOMMIN(num_sel, num_cand); for (int i = 1; i < num_sel; i++) { int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]); if (factor < (int64_t)prune_factor) prune &= ~(1 << txk_map[i]); else break; } return prune; } static uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, uint16_t allowed_tx_mask, int prune_factor, const TXB_CTX *const txb_ctx, int 
reduced_tx_set_used) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int tx_type; int64_t rds[TX_TYPES]; int num_cand = 0; int last = TX_TYPES - 1; TxfmParam txfm_param; QUANT_PARAM quant_param; av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); for (int idx = 0; idx < TX_TYPES; idx++) { tx_type = idx; int rate_cost = 0; int64_t dist = 0, sse = 0; if (!(allowed_tx_mask & (1 << tx_type))) { txk_map[last] = tx_type; last--; continue; } txfm_param.tx_type = tx_type; av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); // do txfm and quantization av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, &quant_param); // estimate rate cost rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, txb_ctx, reduced_tx_set_used, 0); // tx domain dist const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, scan_order->scan, &dist, &sse); txk_map[num_cand] = tx_type; rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist); if (rds[num_cand] == 0) rds[num_cand] = 1; num_cand++; } if (num_cand == 0) return (uint16_t)0xFFFF; sort_rd(rds, txk_map, num_cand); uint16_t prune = (uint16_t)(~(1 << txk_map[0])); // 0 < prune_factor <= 1000 controls aggressiveness int64_t factor = 0; for (int idx = 1; idx < num_cand; idx++) { factor = 1000 * (rds[idx] - rds[0]) / rds[0]; if (factor < (int64_t)prune_factor) prune &= ~(1 << txk_map[idx]); else break; } return prune; } // These thresholds were calibrated to provide a certain number of TX types // pruned by the model on average, i.e. selecting a threshold with index i // will lead to pruning i+1 TX types on average static const float *prune_2D_adaptive_thresholds[] = { // TX_4X4 (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, 0.09778f, 0.11780f }, // TX_8X8 (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, 0.10803f, 0.14124f }, // TX_16X16 (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, 0.06897f, 0.07629f, 0.08875f, 0.11169f }, // TX_32X32 NULL, // TX_64X64 NULL, // TX_4X8 (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, 0.10168f, 0.12585f }, // TX_8X4 (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, 0.10583f, 0.13123f }, // TX_8X16 (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, 0.10730f, 0.14221f }, // TX_16X8 (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, 0.10339f, 0.13464f }, // TX_16X32 NULL, // TX_32X16 NULL, // TX_32X64 NULL, // TX_64X32 NULL, // TX_4X16 (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, 0.10242f, 0.12878f }, // TX_16X4 (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, 0.10217f, 0.12610f }, // TX_8X32 NULL, // TX_32X8 NULL, // TX_16X64 NULL, // TX_64X16 NULL, }; static inline float 
get_adaptive_thresholds( TX_SIZE tx_size, TxSetType tx_set_type, TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) { const int prune_aggr_table[5][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 } }; int pruning_aggressiveness = 0; if (tx_set_type == EXT_TX_SET_ALL16) pruning_aggressiveness = prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0]; else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) pruning_aggressiveness = prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1]; return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; } static inline void get_energy_distribution_finer(const int16_t *diff, int stride, int bw, int bh, float *hordist, float *verdist) { // First compute downscaled block energy values (esq); downscale factors // are defined by w_shift and h_shift. unsigned int esq[256]; const int w_shift = bw <= 8 ? 0 : 1; const int h_shift = bh <= 8 ? 0 : 1; const int esq_w = bw >> w_shift; const int esq_h = bh >> h_shift; const int esq_sz = esq_w * esq_h; int i, j; memset(esq, 0, esq_sz * sizeof(esq[0])); if (w_shift) { for (i = 0; i < bh; i++) { unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; const int16_t *cur_diff_row = diff + i * stride; for (j = 0; j < bw; j += 2) { cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + cur_diff_row[j + 1] * cur_diff_row[j + 1]); } } } else { for (i = 0; i < bh; i++) { unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; const int16_t *cur_diff_row = diff + i * stride; for (j = 0; j < bw; j++) { cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; } } } uint64_t total = 0; for (i = 0; i < esq_sz; i++) total += esq[i]; // Output hordist and verdist arrays are normalized 1D projections of esq if (total == 0) { float hor_val = 1.0f / esq_w; for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; float ver_val = 1.0f / esq_h; for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; return; } const float e_recip = 1.0f / (float)total; memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); const unsigned int *cur_esq_row; for (i = 0; i < esq_h - 1; i++) { cur_esq_row = esq + i * esq_w; for (j = 0; j < esq_w - 1; j++) { hordist[j] += (float)cur_esq_row[j]; verdist[i] += (float)cur_esq_row[j]; } verdist[i] += (float)cur_esq_row[j]; } cur_esq_row = esq + i * esq_w; for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; } static inline bool check_bit_mask(uint16_t mask, int val) { return mask & (1 << val); } static inline void set_bit_mask(uint16_t *mask, int val) { *mask |= (1 << val); } static inline void unset_bit_mask(uint16_t *mask, int val) { *mask &= ~(1 << val); } static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, int blk_row, int blk_col, TxSetType tx_set_type, TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map, uint16_t *allowed_tx_mask) { // This table is used because the search order is different from the enum // order. 
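// Each row fixes the vertical 1D transform (DCT, ADST, FLIPADST, IDTX in that
// order) and each column the horizontal one, mirroring the
// vscores[i] * hscores[j] products computed below.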
static const int tx_type_table_2D[16] = { DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, H_DCT, H_ADST, H_FLIPADST, IDTX }; if (tx_set_type != EXT_TX_SET_ALL16 && tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) return; #if CONFIG_NN_V2 NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; #else const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; #endif if (!nn_config_hor || !nn_config_ver) return; // Model not established yet. float hfeatures[16], vfeatures[16]; float hscores[4], vscores[4]; float scores_2D_raw[16]; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; const int hfeatures_num = bw <= 8 ? bw : bw / 2; const int vfeatures_num = bh <= 8 ? bh : bh / 2; assert(hfeatures_num <= 16); assert(vfeatures_num <= 16); const struct macroblock_plane *const p = &x->plane[0]; const int diff_stride = block_size_wide[bsize]; const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, vfeatures); av1_get_horver_correlation_full(diff, diff_stride, bw, bh, &hfeatures[hfeatures_num - 1], &vfeatures[vfeatures_num - 1]); #if CONFIG_NN_V2 av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores); av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores); #else av1_nn_predict(hfeatures, nn_config_hor, 1, hscores); av1_nn_predict(vfeatures, nn_config_ver, 1, vscores); #endif for (int i = 0; i < 4; i++) { float *cur_scores_2D = scores_2D_raw + i * 4; cur_scores_2D[0] = vscores[i] * hscores[0]; cur_scores_2D[1] = vscores[i] * hscores[1]; cur_scores_2D[2] = vscores[i] * hscores[2]; cur_scores_2D[3] = vscores[i] * hscores[3]; } assert(TX_TYPES == 16); // This version of the function only works when there are at most 16 classes. // So we will need to change the optimization or use av1_nn_softmax instead if // this ever gets changed. av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw); const float score_thresh = get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode); // Always keep the TX type with the highest score, prune all others with // score below score_thresh. 
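// At this point scores_2D_raw holds softmax probabilities over all 16 2D
// types; only types that are both allowed and above score_thresh contribute
// to sum_score and to the candidate list that is sorted below.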
int max_score_i = 0; float max_score = 0.0f; uint16_t allow_bitmask = 0; float sum_score = 0.0; // Calculate sum of allowed tx type score and Populate allow bit mask based // on score_thresh and allowed_tx_mask int allow_count = 0; int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID }; float scores_2D[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, }; for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { const int allow_tx_type = check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]); if (!allow_tx_type) { continue; } if (scores_2D_raw[tx_idx] > max_score) { max_score = scores_2D_raw[tx_idx]; max_score_i = tx_idx; } if (scores_2D_raw[tx_idx] >= score_thresh) { // Set allow mask based on score_thresh set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]); // Accumulate score of allowed tx type sum_score += scores_2D_raw[tx_idx]; scores_2D[allow_count] = scores_2D_raw[tx_idx]; tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx]; allow_count += 1; } } if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) { // If even the tx_type with max score is pruned, this means that no other // tx_type is feasible. When this happens, we force enable max_score_i and // end the search. set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]); memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D)); *allowed_tx_mask = allow_bitmask; return; } // Sort tx type probability of all types if (allow_count <= 8) { av1_sort_fi32_8(scores_2D, tx_type_allowed); } else { av1_sort_fi32_16(scores_2D, tx_type_allowed); } // Enable more pruning based on tx type probability and number of allowed tx // types if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) { float temp_score = 0.0; float score_ratio = 0.0; int tx_idx, tx_count = 0; const float inv_sum_score = 100 / sum_score; // Get allowed tx types based on sorted probability score and tx count for (tx_idx = 0; tx_idx < allow_count; tx_idx++) { // Skip the tx type which has more than 30% of cumulative // probability and allowed tx type count is more than 2 if (score_ratio > 30.0 && tx_count >= 2) break; assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx])); // Calculate cumulative probability temp_score += scores_2D[tx_idx]; // Calculate percentage of cumulative probability of allowed tx type score_ratio = temp_score * inv_sum_score; tx_count++; } // Set remaining tx types as pruned for (; tx_idx < allow_count; tx_idx++) unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]); } memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D)); *allowed_tx_mask = allow_bitmask; } static float get_dev(float mean, double x2_sum, int num) { const float e_x2 = (float)(x2_sum / num); const float diff = e_x2 - mean * mean; const float dev = (diff > 0) ? sqrtf(diff) : 0; return dev; } // Writes the features required by the ML model to predict tx split based on // mean and standard deviation values of the block and sub-blocks. // Returns the number of elements written to the output array which is at most // 12 currently. Hence 'features' buffer should be able to accommodate at least // 12 elements. static inline int get_mean_dev_features(const int16_t *data, int stride, int bw, int bh, float *features) { const int16_t *const data_ptr = &data[0]; const int subh = (bh >= bw) ? 
(bh >> 1) : bh; const int subw = (bw >= bh) ? (bw >> 1) : bw; const int num = bw * bh; const int sub_num = subw * subh; int feature_idx = 2; int total_x_sum = 0; int64_t total_x2_sum = 0; int num_sub_blks = 0; double mean2_sum = 0.0f; float dev_sum = 0.0f; for (int row = 0; row < bh; row += subh) { for (int col = 0; col < bw; col += subw) { int x_sum; int64_t x2_sum; // TODO(any): Write a SIMD version. Clear registers. aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, &x_sum, &x2_sum); total_x_sum += x_sum; total_x2_sum += x2_sum; const float mean = (float)x_sum / sub_num; const float dev = get_dev(mean, (double)x2_sum, sub_num); features[feature_idx++] = mean; features[feature_idx++] = dev; mean2_sum += (double)(mean * mean); dev_sum += dev; num_sub_blks++; } } const float lvl0_mean = (float)total_x_sum / num; features[0] = lvl0_mean; features[1] = get_dev(lvl0_mean, (double)total_x2_sum, num); // Deviation of means. features[feature_idx++] = get_dev(lvl0_mean, mean2_sum, num_sub_blks); // Mean of deviations. features[feature_idx++] = dev_sum / num_sub_blks; return feature_idx; } static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, int blk_col, TX_SIZE tx_size) { const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; if (!nn_config) return -1; const int diff_stride = block_size_wide[bsize]; const int16_t *diff = x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; float features[64] = { 0.0f }; get_mean_dev_features(diff, diff_stride, bw, bh, features); float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); int int_score = (int)(score * 10000); return clamp(int_score, -80000, 80000); } static inline uint16_t get_tx_mask( const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; const int is_inter = is_inter_block(mbmi); const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed < // TX_TYPES, only that specific tx type is allowed. TX_TYPE txk_allowed = TX_TYPES; const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int use_actual_frame_probs = 1; const int *tx_type_probs; #if CONFIG_FPMT_TEST use_actual_frame_probs = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
0 : 1; if (!use_actual_frame_probs) { tx_type_probs = (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size]; } #endif if (use_actual_frame_probs) { tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size]; } if ((!is_inter && txfm_params->use_default_intra_tx_type) || (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) { txk_allowed = get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools); } else if (is_inter && txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) { if (tx_type_probs[DEFAULT_INTER_TX_TYPE] > txfm_params->default_inter_tx_type_prob_thresh) { txk_allowed = DEFAULT_INTER_TX_TYPE; } else { int force_tx_type = 0; int max_prob = 0; const int tx_type_prob_threshold = txfm_params->default_inter_tx_type_prob_thresh + PROB_THRESH_OFFSET_TX_TYPE; for (int i = 1; i < TX_TYPES; i++) { // find maximum probability. if (tx_type_probs[i] > max_prob) { max_prob = tx_type_probs[i]; force_tx_type = i; } } if (max_prob > tx_type_prob_threshold) // force tx type with max prob. txk_allowed = force_tx_type; else if (x->rd_model == LOW_TXFM_RD) { if (plane == 0) txk_allowed = DCT_DCT; } } } else if (x->rd_model == LOW_TXFM_RD) { if (plane == 0) txk_allowed = DCT_DCT; } const TxSetType tx_set_type = av1_get_ext_tx_set_type( tx_size, is_inter, cm->features.reduced_tx_set_used); TX_TYPE uv_tx_type = DCT_DCT; if (plane) { // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y uv_tx_type = txk_allowed = av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, cm->features.reduced_tx_set_used); } PREDICTION_MODE intra_dir = mbmi->filter_intra_mode_info.use_filter_intra ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode] : mbmi->mode; uint16_t ext_tx_used_flag = cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 && tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT ? av1_reduced_intra_tx_used_flag[intra_dir] : av1_ext_tx_used_flag[tx_set_type]; if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2) ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir]; if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || ext_tx_used_flag == 0x0001 || (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) || (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) { txk_allowed = DCT_DCT; } if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK; uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. 
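// The mask below uses one bit per TX_TYPE: bit i is set iff transform type i
// may be evaluated in the RD loop. For example, the fast-search value 0x0c01
// assigned further down sets bits 0, 10 and 11, i.e. DCT_DCT, V_DCT and H_DCT.
// Minimal sketch of the convention (illustrative only; the encoder uses its
// own check_bit_mask()/set_bit_mask() helpers, as in prune_tx_2D() above, and
// the names below are hypothetical):
#if 0
static inline int mask_has(uint16_t mask, int tx_type) {
  return (mask >> tx_type) & 1;
}
static inline void mask_add(uint16_t *mask, int tx_type) {
  *mask |= (uint16_t)(1 << tx_type);
}
#endif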
if (txk_allowed < TX_TYPES) { allowed_tx_mask = 1 << txk_allowed; allowed_tx_mask &= ext_tx_used_flag; } else if (fast_tx_search) { allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT allowed_tx_mask &= ext_tx_used_flag; } else { assert(plane == 0); allowed_tx_mask = ext_tx_used_flag; int num_allowed = 0; int i; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 }, { 10, 17, 17, 10, 17, 17, 17 } }; const int thresh = thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1] [update_type]; uint16_t prune = 0; int max_prob = -1; int max_idx = 0; for (i = 0; i < TX_TYPES; i++) { if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) { max_prob = tx_type_probs[i]; max_idx = i; } if (tx_type_probs[i] < thresh) prune |= (1 << i); } if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx); allowed_tx_mask &= (~prune); } for (i = 0; i < TX_TYPES; i++) { if (allowed_tx_mask & (1 << i)) num_allowed++; } assert(num_allowed > 0); if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) { int pf = prune_factors[txfm_params->prune_2d_txfm_mode]; int mf = mul_factors[txfm_params->prune_2d_txfm_mode]; if (num_allowed <= 7) { const uint16_t prune = prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, cm->features.reduced_tx_set_used); allowed_tx_mask &= (~prune); } else { const int num_sel = (num_allowed * mf + 50) / 100; const uint16_t prune = prune_txk_type_separ( cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, cm->features.reduced_tx_set_used, ref_best_rd, num_sel); allowed_tx_mask &= (~prune); } } else { assert(num_allowed > 0); int allowed_tx_count = (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5; // !fast_tx_search && txk_end != txk_start && plane == 0 if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter && num_allowed > allowed_tx_count) { prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type, txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask); } } } // Need to have at least one transform type allowed. if (allowed_tx_mask == 0) { txk_allowed = (plane ? 
uv_tx_type : DCT_DCT); allowed_tx_mask = (1 << txk_allowed); } assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed)); *allowed_txk_types = txk_allowed; return allowed_tx_mask; } #if CONFIG_RD_DEBUG static inline void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, int txb_coeff_cost) { rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; } #endif static inline int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { #if TXCOEFF_COST_TIMER struct aom_usec_timer timer; aom_usec_timer_start(&timer); #endif const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type, txb_ctx, reduced_tx_set_used); #if TXCOEFF_COST_TIMER AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; aom_usec_timer_mark(&timer); const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); tmp_cm->txcoeff_cost_timer += elapsed_time; ++tmp_cm->txcoeff_cost_count; #endif return cost; } static int skip_trellis_opt_based_on_satd(MACROBLOCK *x, QUANT_PARAM *quant_param, int plane, int block, TX_SIZE tx_size, int quant_b_adapt, int qstep, unsigned int coeff_opt_satd_threshold, int skip_trellis, int dc_only_blk) { if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX)) return skip_trellis; const struct macroblock_plane *const p = &x->plane[plane]; const int block_offset = BLOCK_OFFSET(block); tran_low_t *const coeff_ptr = p->coeff + block_offset; const int n_coeffs = av1_get_max_eob(tx_size); const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)); int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs); satd = RIGHT_SIGNED_SHIFT(satd, shift); satd >>= (x->e_mbd.bd - 8); const int skip_block_trellis = ((uint64_t)satd > (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]); av1_setup_quant( tx_size, !skip_block_trellis, skip_block_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP) : AV1_XFORM_QUANT_FP, quant_b_adapt, quant_param); return skip_block_trellis; } // Predict DC only blocks if the residual variance is below a qstep based // threshold. For such blocks, transform type search is bypassed. static inline void predict_dc_only_block( MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, int blk_row, int blk_col, RD_STATS *best_rd_stats, int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean, int *dc_only_blk) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const int dequant_shift = (is_cur_buf_hbd(xd)) ?
xd->bd - 5 : 3; const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift; uint64_t block_var = UINT64_MAX; const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3; *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size], block_mse_q8, per_px_mean, &block_var); assert((*block_mse_q8) != UINT_MAX); uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep); if (is_cur_buf_hbd(xd)) block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2); if (block_var >= var_threshold) return; const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level; assert(predict_dc_level != 0); // Prediction of skip block if residual mean and variance are less // than qstep based threshold if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) { // If the normalized mean of residual block is less than the dc qstep and // the normalized block variance is less than ac qstep, then the block is // assumed to be a skip block and its rdcost is updated accordingly. best_rd_stats->skip_txfm = 1; x->plane[plane].eobs[block] = 0; if (is_cur_buf_hbd(xd)) *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2); best_rd_stats->dist = (*block_sse) << 4; best_rd_stats->sse = best_rd_stats->dist; ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl); ENTROPY_CONTEXT *ta = ctxa; ENTROPY_CONTEXT *tl = ctxl; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); TXB_CTX txb_ctx_tmp; const PLANE_TYPE plane_type = get_plane_type(plane); get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp); const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type] .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1]; best_rd_stats->rate = zero_blk_rate; best_rd_stats->rdcost = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse); x->plane[plane].txb_entropy_ctx[block] = 0; } else if (predict_dc_level > 1) { // Predict DC only blocks based on residual variance. // For chroma plane, this prediction is disabled for intra blocks. if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1; } } // Search for the best transform type for a given transform block. // This function can be used for both inter and intra, both luma and chroma. static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis, int64_t ref_best_rd, RD_STATS *best_rd_stats) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; int64_t best_rd = INT64_MAX; uint16_t best_eob = 0; TX_TYPE best_tx_type = DCT_DCT; int rate_cost = 0; struct macroblock_plane *const p = &x->plane[plane]; tran_low_t *orig_dqcoeff = p->dqcoeff; tran_low_t *best_dqcoeff = x->dqcoeff_buf; const int tx_type_map_idx = plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col; av1_invalid_rd_stats(best_rd_stats); skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id], DRY_RUN_NORMAL); uint8_t best_txb_ctx = 0; // txk_allowed = TX_TYPES: >1 tx types are allowed // txk_allowed < TX_TYPES: only that specific tx type is allowed. TX_TYPE txk_allowed = TX_TYPES; int txk_map[TX_TYPES] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift; const uint8_t txw = tx_size_wide[tx_size]; const uint8_t txh = tx_size_high[tx_size]; int64_t block_sse; unsigned int block_mse_q8; int dc_only_blk = 0; const bool predict_dc_block = txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64; int64_t per_px_mean = INT64_MAX; if (predict_dc_block) { predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row, blk_col, best_rd_stats, &block_sse, &block_mse_q8, &per_px_mean, &dc_only_blk); if (best_rd_stats->skip_txfm == 1) { const TX_TYPE tx_type = DCT_DCT; if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type; return; } } else { block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size], &block_mse_q8); assert(block_mse_q8 != UINT_MAX); } // Bit mask to indicate which transform types are allowed in the RD search. uint16_t tx_mask; // Use DCT_DCT transform for DC only block. if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1) tx_mask = 1 << DCT_DCT; else tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map); const uint16_t allowed_tx_mask = tx_mask; if (is_cur_buf_hbd(xd)) { block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2); } block_sse *= 16; // Use mse / qstep^2 based threshold logic to take decision of R-D // optimization of coeffs. For smaller residuals, coeff optimization // would be helpful. For larger residuals, R-D optimization may not be // effective. // TODO(any): Experiment with variance and mean based thresholds const int perform_block_coeff_opt = ((uint64_t)block_mse_q8 <= (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep); skip_trellis |= !perform_block_coeff_opt; // Flag to indicate if distortion should be calculated in transform domain or // not during iterating through transform type candidates. // Transform domain distortion is accurate for higher residuals. // TODO(any): Experiment with variance and mean based thresholds int use_transform_domain_distortion = (txfm_params->use_transform_domain_distortion > 0) && (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) && // Any 64-pt transforms only preserves half the coefficients. // Therefore transform domain distortion is not valid for these // transform sizes. (txsize_sqr_up_map[tx_size] != TX_64X64) && // Use pixel domain distortion for DC only blocks !dc_only_blk; // Flag to indicate if an extra calculation of distortion in the pixel domain // should be performed at the end, after the best transform type has been // decided. int calc_pixel_domain_distortion_final = txfm_params->use_transform_domain_distortion == 1 && use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD; if (calc_pixel_domain_distortion_final && (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001)) calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; const uint16_t *eobs_ptr = x->plane[plane].eobs; TxfmParam txfm_param; QUANT_PARAM quant_param; int skip_trellis_based_on_satd[TX_TYPES] = { 0 }; av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); av1_setup_quant(tx_size, !skip_trellis, skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP) : AV1_XFORM_QUANT_FP, cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); // Iterate through all transform type candidates. 
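// Per candidate, the loop below runs forward transform -> quantization ->
// coefficient-rate estimation -> distortion estimation, and keeps the type
// with the smallest RDCOST(rate, dist), swapping dqcoeff buffers so the best
// candidate's coefficients survive the search. A minimal sketch of that
// keep-best pattern (illustrative; rate_of()/dist_of()/candidate_mask are
// hypothetical stand-ins, not encoder functions):
#if 0
int64_t best_cost = INT64_MAX;
TX_TYPE best_type = DCT_DCT;
for (int t = 0; t < TX_TYPES; ++t) {
  if (!((candidate_mask >> t) & 1)) continue;
  const int64_t cost = RDCOST(x->rdmult, rate_of(t), dist_of(t));
  if (cost < best_cost) {
    best_cost = cost;
    best_type = (TX_TYPE)t;
  }
}
#endif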
for (int idx = 0; idx < TX_TYPES; ++idx) { const TX_TYPE tx_type = (TX_TYPE)txk_map[idx]; if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type)) continue; txfm_param.tx_type = tx_type; if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) { av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, &quant_param); } if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type; RD_STATS this_rd_stats; av1_invalid_rd_stats(&this_rd_stats); if (!dc_only_blk) av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param); else av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean); skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd( x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt, qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk); av1_quant(x, plane, block, &txfm_param, &quant_param); // Calculate rate cost of quantized coefficients. if (quant_param.use_optimize_b) { // TODO(aomedia:3209): update Trellis quantization to take into account // quantization matrices. av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, &rate_cost); } else { rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx, cm->features.reduced_tx_set_used); } // If rd cost based on coeff rate alone is already more than best_rd, // terminate early. if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue; // Calculate distortion. if (eobs_ptr[block] == 0) { // When eob is 0, pixel domain distortion is more efficient and accurate. this_rd_stats.dist = this_rd_stats.sse = block_sse; } else if (dc_only_blk) { this_rd_stats.sse = block_sse; this_rd_stats.dist = dist_block_px_domain( cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); } else if (use_transform_domain_distortion) { const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, scan_order->scan, &this_rd_stats.dist, &this_rd_stats.sse); } else { int64_t sse_diff = INT64_MAX; // high_energy threshold assumes that every pixel within a txfm block // has a residue energy of at least 25% of the maximum, i.e. 128 * 128 // for 8 bit. const int64_t high_energy_thresh = ((int64_t)128 * 128 * tx_size_2d[tx_size]); const int is_high_energy = (block_sse >= high_energy_thresh); if (tx_size == TX_64X64 || is_high_energy) { // Because 3 out 4 quadrants of transform coefficients are forced to // zero, the inverse transform has a tendency to overflow. sse_diff // is effectively the energy of those 3 quadrants, here we use it // to decide if we should do pixel domain distortion. If the energy // is mostly in first quadrant, then it is unlikely that we have // overflow issue in inverse transform. const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, scan_order->scan, &this_rd_stats.dist, &this_rd_stats.sse); sse_diff = block_sse - this_rd_stats.sse; } if (tx_size != TX_64X64 || !is_high_energy || (sse_diff * 2) < this_rd_stats.sse) { const int64_t tx_domain_dist = this_rd_stats.dist; this_rd_stats.dist = dist_block_px_domain( cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); // For high energy blocks, occasionally, the pixel domain distortion // can be artificially low due to clamping at reconstruction stage // even when inverse transform output is hugely different from the // actual residue. 
if (is_high_energy && this_rd_stats.dist < tx_domain_dist) this_rd_stats.dist = tx_domain_dist; } else { assert(sse_diff < INT64_MAX); this_rd_stats.dist += sse_diff; } this_rd_stats.sse = block_sse; } this_rd_stats.rate = rate_cost; const int64_t rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); if (rd < best_rd) { best_rd = rd; *best_rd_stats = this_rd_stats; best_tx_type = tx_type; best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; best_eob = x->plane[plane].eobs[block]; // Swap dqcoeff buffers tran_low_t *const tmp_dqcoeff = best_dqcoeff; best_dqcoeff = p->dqcoeff; p->dqcoeff = tmp_dqcoeff; } #if CONFIG_COLLECT_RD_STATS == 1 if (plane == 0) { PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, plane_bsize, tx_size, tx_type, rd); } #endif // CONFIG_COLLECT_RD_STATS == 1 #if COLLECT_TX_SIZE_DATA // Generate small sample to restrict output size. static unsigned int seed = 21743; if (lcg_rand16(&seed) % 200 == 0) { FILE *fp = NULL; if (within_border) { fp = fopen(av1_tx_size_data_output_file, "a"); } if (fp) { // Transform info and RD const int txb_w = tx_size_wide[tx_size]; const int txb_h = tx_size_high[tx_size]; // Residue signal. const int diff_stride = block_size_wide[plane_bsize]; struct macroblock_plane *const p = &x->plane[plane]; const int16_t *src_diff = &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; for (int r = 0; r < txb_h; ++r) { for (int c = 0; c < txb_w; ++c) { fprintf(fp, "%d,", src_diff[c]); } src_diff += diff_stride; } fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd); fprintf(fp, "\n"); fclose(fp); } } #endif // COLLECT_TX_SIZE_DATA // If the current best RD cost is much worse than the reference RD cost, // terminate early. if (cpi->sf.tx_sf.adaptive_txb_search_level) { if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > ref_best_rd) { break; } } // Terminate transform type search if the block has been quantized to // all zero. if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break; } assert(best_rd != INT64_MAX); best_rd_stats->skip_txfm = best_eob == 0; if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type); x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; x->plane[plane].eobs[block] = best_eob; skip_trellis = skip_trellis_based_on_satd[best_tx_type]; // Point dqcoeff to the quantized coefficients corresponding to the best // transform type, then we can skip transform and quantization, e.g. in the // final pixel domain distortion calculation and recon_intra(). p->dqcoeff = best_dqcoeff; if (calc_pixel_domain_distortion_final && best_eob) { best_rd_stats->dist = dist_block_px_domain( cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); best_rd_stats->sse = block_sse; } // Intra mode needs decoded pixels such that the next transform block // can use them for prediction. recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob); p->dqcoeff = orig_dqcoeff; } // Pick transform type for a luma transform block of tx_size. Note this function // is used only for inter-predicted blocks. 
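// It forwards the per-block search to search_tx_type() above and accumulates
// the result into the caller's running stats, conceptually:
//   RD_STATS blk;                        // stats for this transform block
//   search_tx_type(..., &blk);           // pick the best tx type
//   av1_merge_rd_stats(rd_stats, &blk);  // fold into the running totals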
static inline void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, int blk_row, int blk_col, int block, int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost) { assert(is_inter_block(x->e_mbd.mi[0])); RD_STATS this_rd_stats; const int skip_trellis = 0; search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size, txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats); av1_merge_rd_stats(rd_stats, &this_rd_stats); } static inline void try_tx_block_no_split( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblock_plane *const p = &x->plane[0]; const int bw = mi_size_wide[plane_bsize]; const ENTROPY_CONTEXT *const pta = ta + blk_col; const ENTROPY_CONTEXT *const ptl = tl + blk_row; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); mbmi->inter_tx_size[index] = tx_size; tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, rd_stats, ftxs_mode, ref_best_rd); assert(rd_stats->rate < INT_MAX); const int pick_skip_txfm = !xd->lossless[mbmi->segment_id] && (rd_stats->skip_txfm == 1 || RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); if (pick_skip_txfm) { #if CONFIG_RD_DEBUG update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate); #endif // CONFIG_RD_DEBUG rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; p->eobs[block] = 0; update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); } rd_stats->skip_txfm = pick_skip_txfm; set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, pick_skip_txfm); if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0]; no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; no_split->tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; } static inline void try_tx_block_split( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) { assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); const int txb_width = tx_size_wide_unit[tx_size]; const int txb_height = tx_size_high_unit[tx_size]; // Transform size after splitting current block. 
const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int sub_txb_width = tx_size_wide_unit[sub_txs]; const int sub_txb_height = tx_size_high_unit[sub_txs]; const int sub_step = sub_txb_width * sub_txb_height; const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width); assert(nblks > 0); av1_init_rd_stats(split_rd_stats); split_rd_stats->rate = x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1]; for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { const int offsetr = blk_row + r; if (offsetr >= max_blocks_high) break; for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { assert(blk_idx < 4); const int offsetc = blk_col + c; if (offsetc >= max_blocks_wide) continue; RD_STATS this_rd_stats; int this_cost_valid = 1; select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode); if (!this_cost_valid) { split_rd_stats->rdcost = INT64_MAX; return; } av1_merge_rd_stats(split_rd_stats, &this_rd_stats); split_rd_stats->rdcost = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); if (split_rd_stats->rdcost > ref_best_rd) { split_rd_stats->rdcost = INT64_MAX; return; } block += sub_step; } } } static float get_var(float mean, double x2_sum, int num) { const float e_x2 = (float)(x2_sum / num); const float diff = e_x2 - mean * mean; return diff; } static inline void get_blk_var_dev(const int16_t *data, int stride, int bw, int bh, float *dev_of_mean, float *var_of_vars) { const int16_t *const data_ptr = &data[0]; const int subh = (bh >= bw) ? (bh >> 1) : bh; const int subw = (bw >= bh) ? (bw >> 1) : bw; const int num = bw * bh; const int sub_num = subw * subh; int total_x_sum = 0; int64_t total_x2_sum = 0; int blk_idx = 0; float var_sum = 0.0f; float mean_sum = 0.0f; double var2_sum = 0.0f; double mean2_sum = 0.0f; for (int row = 0; row < bh; row += subh) { for (int col = 0; col < bw; col += subw) { int x_sum; int64_t x2_sum; aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, &x_sum, &x2_sum); total_x_sum += x_sum; total_x2_sum += x2_sum; const float mean = (float)x_sum / sub_num; const float var = get_var(mean, (double)x2_sum, sub_num); mean_sum += mean; mean2_sum += (double)(mean * mean); var_sum += var; var2_sum += var * var; blk_idx++; } } const float lvl0_mean = (float)total_x_sum / num; const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num); mean_sum += lvl0_mean; mean2_sum += (double)(lvl0_mean * lvl0_mean); var_sum += block_var; var2_sum += block_var * block_var; const float av_mean = mean_sum / 5; if (blk_idx > 1) { // Deviation of means. *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1)); // Variance of variances. const float mean_var = var_sum / (blk_idx + 1); *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1)); } } static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, int blk_col, TX_SIZE tx_size, int *try_no_split, int *try_split, int pruning_level) { const int diff_stride = block_size_wide[bsize]; const int16_t *diff = x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; float dev_of_means = 0.0f; float var_of_vars = 0.0f; // This function calculates the deviation of means, and the variance of pixel // variances of the block as well as it's sub-blocks. 
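// Concretely, with m = E[x] and s2 = E[x^2] taken over a (sub)block, the
// helpers above compute
//   var = s2 - m * m            (get_var(), biased population variance)
//   dev = sqrt(max(var, 0))     (get_dev(), standard deviation)
// and the pruning below compares the deviation of the sub-block means and the
// variance of the sub-block variances against thresholds derived from the DC
// and AC quantizer step sizes.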
get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars); const int dc_q = x->plane[0].dequant_QTX[0] >> 3; const int ac_q = x->plane[0].dequant_QTX[1] >> 3; const int no_split_thresh_scales[4] = { 0, 24, 8, 8 }; const int no_split_thresh_scale = no_split_thresh_scales[pruning_level]; const int split_thresh_scales[4] = { 0, 24, 10, 8 }; const int split_thresh_scale = split_thresh_scales[pruning_level]; if ((dev_of_means <= dc_q) && (split_thresh_scale * var_of_vars <= ac_q * ac_q)) { *try_split = 0; } if ((dev_of_means > no_split_thresh_scale * dc_q) && (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) { *try_no_split = 0; } } // Search for the best transform partition(recursive)/type for a given // inter-predicted luma block. The obtained transform selection will be saved // in xd->mi[0], the corresponding RD stats will be saved in rd_stats. static inline void select_tx_block( const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) { assert(tx_size < TX_SIZES_ALL); av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) { *is_cost_valid = 0; return; } MACROBLOCKD *const xd = &x->e_mbd; assert(blk_row < max_block_high(xd, plane_bsize, 0) && blk_col < max_block_wide(xd, plane_bsize, 0)); MB_MODE_INFO *const mbmi = xd->mi[0]; const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, mbmi->bsize, tx_size); struct macroblock_plane *const p = &x->plane[0]; int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64) && (cpi->oxcf.txfm_cfg.enable_rect_tx || tx_size_wide[tx_size] == tx_size_high[tx_size]); int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; // Prune tx_split and no-split based on sub-block properties. if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 && cpi->sf.tx_sf.prune_tx_size_level > 0) { prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size, &try_no_split, &try_split, cpi->sf.tx_sf.prune_tx_size_level); } if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) { if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0; } // Try using current block as a single transform block without split. if (try_no_split) { try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, ftxs_mode, &no_split); // Speed features for early termination. const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; if (search_level) { if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { *is_cost_valid = 0; return; } if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) { try_split = 0; } } if (cpi->sf.tx_sf.txb_split_cap) { if (p->eobs[block] == 0) try_split = 0; } } // ML based speed feature to skip searching for split transform blocks. 
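// The NN score from ml_predict_tx_split() is scaled by 10000 and clamped to
// [-80000, 80000]; a sufficiently negative score prunes the split branch.
// Worked example (the threshold value is hypothetical): a raw network output
// of -0.5 becomes a split_score of -5000, so with ml_tx_split_thresh == 4000
// the test (split_score < -threshold) holds and try_split is cleared.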
if (x->e_mbd.bd == 8 && try_split && !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; if (threshold >= 0) { const int split_score = ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); if (split_score < -threshold) try_split = 0; } } RD_STATS split_rd_stats; split_rd_stats.rdcost = INT64_MAX; // Try splitting current block into smaller transform blocks. if (try_split) { try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, &split_rd_stats); } if (no_split.rd < split_rd_stats.rdcost) { ENTROPY_CONTEXT *pta = ta + blk_col; ENTROPY_CONTEXT *ptl = tl + blk_row; p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; av1_set_txb_context(x, 0, block, tx_size, pta, ptl); txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { const int index = av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); mbmi->inter_tx_size[index] = tx_size; } } mbmi->tx_size = tx_size; update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type); const int bw = mi_size_wide[plane_bsize]; set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, rd_stats->skip_txfm); } else { *rd_stats = split_rd_stats; if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0; } } static inline void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type); // If tx64 is not enabled, we need to go down to the next available size if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) { static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform TX_32X32, // 32x32 transform TX_32X32, // 64x64 transform TX_4X8, // 4x8 transform TX_8X4, // 8x4 transform TX_8X16, // 8x16 transform TX_16X8, // 16x8 transform TX_16X32, // 16x32 transform TX_32X16, // 32x16 transform TX_32X32, // 32x64 transform TX_32X32, // 64x32 transform TX_4X16, // 4x16 transform TX_16X4, // 16x4 transform TX_8X32, // 8x32 transform TX_32X8, // 32x8 transform TX_16X32, // 16x64 transform TX_32X16, // 64x16 transform }; mbmi->tx_size = tx_size_max_32[mbmi->tx_size]; } else if (cpi->oxcf.txfm_cfg.enable_tx64 && !cpi->oxcf.txfm_cfg.enable_rect_tx) { static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform TX_32X32, // 32x32 transform TX_64X64, // 64x64 transform TX_4X4, // 4x8 transform TX_4X4, // 8x4 transform TX_8X8, // 8x16 transform TX_8X8, // 16x8 transform TX_16X16, // 16x32 transform TX_16X16, // 32x16 transform TX_32X32, // 32x64 transform TX_32X32, // 64x32 transform TX_4X4, // 4x16 transform TX_4X4, // 16x4 transform TX_8X8, // 8x32 transform TX_8X8, // 32x8 transform TX_16X16, // 16x64 transform TX_16X16, // 64x16 transform }; mbmi->tx_size = tx_size_max_square[mbmi->tx_size]; } else if (!cpi->oxcf.txfm_cfg.enable_tx64 && !cpi->oxcf.txfm_cfg.enable_rect_tx) { static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = { TX_4X4, // 4x4 transform TX_8X8, 
// 8x8 transform TX_16X16, // 16x16 transform TX_32X32, // 32x32 transform TX_32X32, // 64x64 transform TX_4X4, // 4x8 transform TX_4X4, // 8x4 transform TX_8X8, // 8x16 transform TX_8X8, // 16x8 transform TX_16X16, // 16x32 transform TX_16X16, // 32x16 transform TX_32X32, // 32x64 transform TX_32X32, // 64x32 transform TX_4X4, // 4x16 transform TX_4X4, // 16x4 transform TX_8X8, // 8x32 transform TX_8X8, // 32x8 transform TX_16X16, // 16x64 transform TX_16X16, // 64x16 transform }; mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size]; } const int skip_ctx = av1_get_skip_txfm_context(xd); const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0]; const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; // Skip RDcost is used only for Inter blocks const int64_t skip_txfm_rd = is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX; const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0); const int skip_trellis = 0; av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs, mbmi->tx_size, FTXS_NONE, skip_trellis); } static inline void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; mbmi->tx_size = TX_4X4; // TODO(any) : Pass this_rd based on skip/non-skip cost const int skip_trellis = 0; av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size, FTXS_NONE, skip_trellis); } #if !CONFIG_REALTIME_ONLY static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row, int blk_col, BLOCK_SIZE bsize, TX_SIZE tx_size) { const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; // Disable the pruning logic using NN model for the following cases: // 1) Lossless coding as only 4x4 transform is evaluated in this case // 2) When transform and current block sizes do not match as the features are // obtained over the current block // 3) When operating bit-depth is not 8-bit as the input features are not // scaled according to bit-depth. if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize || xd->bd != 8) return; // Currently NN model based pruning is supported only when largest transform // size is 8x8 if (tx_size != TX_8X8) return; // Neural network model is a sequential neural net and was trained using SGD // optimizer. The model can be further improved in terms of speed/quality by // considering the following experiments: // 1) Generate ML model by training with balanced data for different learning // rates and optimizers. // 2) Experiment with ML model by adding features related to the statistics of // top and left pixels to capture the accuracy of reconstructed neighbouring // pixels for 4x4 blocks numbered 1, 2, 3 in 8x8 block, source variance of 4x4 // sub-blocks, etc. // 3) Generate ML models for transform blocks other than 8x8. 
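// Before inference each feature is z-score normalised with the stored
// per-feature statistics, i.e. f[i] = (f[i] - mean[i]) / std[i], and the
// single network output is then thresholded both ways:
//   score <= intra_tx_prune_thresh[0]  -> prune the split depths
//   score >  intra_tx_prune_thresh[1]  -> prune the largest-transform depth
//   otherwise                          -> no depth pruning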
const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8; const float *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8; float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f }; const int diff_stride = block_size_wide[bsize]; const int16_t *diff = x->plane[0].src_diff + MI_SIZE * blk_row * diff_stride + MI_SIZE * blk_col; const int bw = tx_size_wide[tx_size]; const int bh = tx_size_high[tx_size]; int feature_idx = get_mean_dev_features(diff, diff_stride, bw, bh, features); features[feature_idx++] = log1pf((float)x->source_variance); const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); const float log_dc_q_square = log1pf((float)(dc_q * dc_q) / 256.0f); features[feature_idx++] = log_dc_q_square; assert(feature_idx == NUM_INTRA_TX_SPLIT_FEATURES); for (int i = 0; i < NUM_INTRA_TX_SPLIT_FEATURES; i++) { features[i] = (features[i] - av1_intra_tx_split_8x8_mean[i]) / av1_intra_tx_split_8x8_std[i]; } float score; av1_nn_predict(features, nn_config, 1, &score); TxfmSearchParams *const txfm_params = &x->txfm_search_params; if (score <= intra_tx_prune_thresh[0]) txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT; else if (score > intra_tx_prune_thresh[1]) txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST; } #endif // !CONFIG_REALTIME_ONLY /*!\brief Transform type search for luma macroblock with fixed transform size. * * \ingroup transform_search * Search for the best transform type and return the transform coefficients RD * cost of current luma macroblock with the given uniform transform size. * * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] cpi Top-level encoder structure * \param[in] rd_stats Pointer to struct to keep track of the RD stats * \param[in] ref_best_rd Best RD cost seen for this block so far * \param[in] bs Size of the current macroblock * \param[in] tx_size The given transform size * \param[in] ftxs_mode Transform search mode specifying desired speed and quality tradeoff * \param[in] skip_trellis Binary flag indicating if trellis optimization should be skipped * \return An int64_t value that is the best RD cost found. */ static int64_t uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; const ModeCosts *mode_costs = &x->mode_costs; const int is_inter = is_inter_block(mbmi); const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT && block_signals_txsize(mbmi->bsize); int tx_size_rate = 0; if (tx_select) { const int ctx = txfm_partition_context( xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size); tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0] : tx_size_cost(x, bs, tx_size); } const int skip_ctx = av1_get_skip_txfm_context(xd); const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0]; const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1]; const int64_t skip_txfm_rd = is_inter ? 
RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX; const int64_t no_this_rd = RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0); mbmi->tx_size = tx_size; av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs, tx_size, ftxs_mode, skip_trellis); if (rd_stats->rate == INT_MAX) return INT64_MAX; int64_t rd; // rdstats->rate should include all the rate except skip/non-skip cost as the // same is accounted in the caller functions after rd evaluation of all // planes. However the decisions should be done after considering the // skip/non-skip header cost if (rd_stats->skip_txfm && is_inter) { rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); } else { // Intra blocks are always signalled as non-skip rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate, rd_stats->dist); rd_stats->rate += tx_size_rate; } // Check if forcing the block to skip transform leads to smaller RD cost. if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) { int64_t temp_skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); if (temp_skip_txfm_rd <= rd) { rd = temp_skip_txfm_rd; rd_stats->rate = 0; rd_stats->dist = rd_stats->sse; rd_stats->skip_txfm = 1; } } return rd; } // Search for the best uniform transform size and type for current coding block. static inline void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { av1_invalid_rd_stats(rd_stats); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; TxfmSearchParams *const txfm_params = &x->txfm_search_params; const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT; int start_tx; // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls // how many times of splitting is allowed during the RD search. int init_depth; if (tx_select) { start_tx = max_rect_tx_size; init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], is_inter_block(mbmi), &cpi->sf, txfm_params->tx_size_search_method); if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 && txsize_sqr_up_map[start_tx] == TX_64X64) { start_tx = sub_tx_size_map[start_tx]; } } else { const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type); start_tx = chosen_tx_size; init_depth = MAX_TX_DEPTH; } const int skip_trellis = 0; uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; TX_SIZE best_tx_size = max_rect_tx_size; int64_t best_rd = INT64_MAX; const int num_blks = bsize_to_num_blk(bs); x->rd_model = FULL_TXFM_RD; int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX }; TxfmSearchInfo *txfm_info = &x->txfm_search_info; for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH; depth++, tx_size = sub_tx_size_map[tx_size]) { if ((!cpi->oxcf.txfm_cfg.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) || (!cpi->oxcf.txfm_cfg.enable_rect_tx && tx_size_wide[tx_size] != tx_size_high[tx_size])) { continue; } #if !CONFIG_REALTIME_ONLY if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break; // Set the flag to enable the evaluation of NN classifier to prune transform // depths. As the features are based on intra residual information of // largest transform, the evaluation of NN model is enabled only for this // case. 
txfm_params->enable_nn_prune_intra_tx_depths = (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx); #endif RD_STATS this_rd_stats; // When the speed feature use_rd_based_breakout_for_intra_tx_search is // enabled, use the known minimum best_rd for early termination. const int64_t rd_thresh = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search ? AOMMIN(ref_best_rd, best_rd) : ref_best_rd; rd[depth] = uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs, tx_size, FTXS_NONE, skip_trellis); if (rd[depth] < best_rd) { av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks); av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks); best_tx_size = tx_size; best_rd = rd[depth]; *rd_stats = this_rd_stats; } if (tx_size == TX_4X4) break; // If we are searching three depths, prune the smallest size depending // on rd results for the first two depths for low contrast blocks. if (depth > init_depth && depth != MAX_TX_DEPTH && x->source_variance < 256) { if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break; } } if (rd_stats->rate != INT_MAX) { mbmi->tx_size = best_tx_size; av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks); av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks); } #if !CONFIG_REALTIME_ONLY // Reset the flags to avoid any unintentional evaluation of NN model and // consumption of prune depths. txfm_params->enable_nn_prune_intra_tx_depths = false; txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE; #endif } // Search for the best transform type for the given transform block in the // given plane/channel, and calculate the corresponding RD cost. static inline void block_rd_txfm(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; if (args->exit_early) { args->incomplete_exit = 1; return; } MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; const int is_inter = is_inter_block(xd->mi[0]); const AV1_COMP *cpi = args->cpi; ENTROPY_CONTEXT *a = args->t_above + blk_col; ENTROPY_CONTEXT *l = args->t_left + blk_row; const AV1_COMMON *cm = &cpi->common; RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); if (!is_inter) { av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); #if !CONFIG_REALTIME_ONLY const TxfmSearchParams *const txfm_params = &x->txfm_search_params; if (txfm_params->enable_nn_prune_intra_tx_depths) { ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize, tx_size); if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) { av1_invalid_rd_stats(&args->rd_stats); args->exit_early = 1; return; } } #endif } TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, &txb_ctx, args->ftxs_mode, args->skip_trellis, args->best_rd - args->current_rd, &this_rd_stats); #if !CONFIG_REALTIME_ONLY if (plane == AOM_PLANE_Y && xd->cfl.store_y) { assert(!is_inter || plane_bsize < BLOCK_8X8); cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } #endif #if CONFIG_RD_DEBUG update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate); #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); const int blk_idx = blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col; TxfmSearchInfo *txfm_info = &x->txfm_search_info; if (plane == 0) set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 
x->plane[plane].eobs[block] == 0); else set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0); int64_t rd; if (is_inter) { const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd); this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block]; } else { // Signal non-skip_txfm for Intra blocks rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); this_rd_stats.skip_txfm = 0; } av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); args->current_rd += rd; if (args->current_rd > args->best_rd) args->exit_early = 1; } int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const TxfmSearchParams *txfm_params = &x->txfm_search_params; const ModeCosts *mode_costs = &x->mode_costs; const int is_inter = is_inter_block(mbmi); const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT && block_signals_txsize(mbmi->bsize); int tx_size_rate = 0; if (tx_select) { const int ctx = txfm_partition_context( xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size); tx_size_rate = mode_costs->txfm_partition_cost[ctx][0]; } const int skip_ctx = av1_get_skip_txfm_context(xd); const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0]; const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1]; const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0); const int64_t no_this_rd = RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0); mbmi->tx_size = tx_size; const uint8_t txw_unit = tx_size_wide_unit[tx_size]; const uint8_t txh_unit = tx_size_high_unit[tx_size]; const int step = txw_unit * txh_unit; const int max_blocks_wide = max_block_wide(xd, bs, 0); const int max_blocks_high = max_block_high(xd, bs, 0); struct rdcost_block_args args; av1_zero(args); args.x = x; args.cpi = cpi; args.best_rd = ref_best_rd; args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd); av1_init_rd_stats(&args.rd_stats); av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left); int i = 0; for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit; blk_row += txh_unit) { for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) { RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); if (args.exit_early) { args.incomplete_exit = 1; break; } ENTROPY_CONTEXT *a = args.t_above + blk_col; ENTROPY_CONTEXT *l = args.t_left + blk_row; TXB_CTX txb_ctx; get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx); TxfmParam txfm_param; QUANT_PARAM quant_param; av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param); av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param); av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param); av1_quant(x, 0, i, &txfm_param, &quant_param); this_rd_stats.rate = cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0); const SCAN_ORDER *const scan_order = get_scan(txfm_param.tx_size, txfm_param.tx_type); dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix, scan_order->scan, &this_rd_stats.dist, &this_rd_stats.sse); const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); this_rd_stats.skip_txfm &= !x->plane[0].eobs[i]; av1_merge_rd_stats(&args.rd_stats, &this_rd_stats); args.current_rd += 
AOMMIN(no_skip_txfm_rd, skip_rd); if (args.current_rd > ref_best_rd) { args.exit_early = 1; break; } av1_set_txb_context(x, 0, i, tx_size, a, l); i += step; } } if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats); *rd_stats = args.rd_stats; if (rd_stats->rate == INT_MAX) return INT64_MAX; int64_t rd; // rdstats->rate should include all the rate except skip/non-skip cost as the // same is accounted in the caller functions after rd evaluation of all // planes. However the decisions should be done after considering the // skip/non-skip header cost if (rd_stats->skip_txfm && is_inter) { rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); } else { // Intra blocks are always signalled as non-skip rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate, rd_stats->dist); rd_stats->rate += tx_size_rate; } // Check if forcing the block to skip transform leads to smaller RD cost. if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) { int64_t temp_skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); if (temp_skip_txfm_rd <= rd) { rd = temp_skip_txfm_rd; rd_stats->rate = 0; rd_stats->dist = rd_stats->sse; rd_stats->skip_txfm = 1; } } return rd; } // Search for the best transform type for a luma inter-predicted block, given // the transform block partitions. // This function is used only when some speed features are enabled. static inline void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(is_inter_block(mbmi)); const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( plane_bsize, blk_row, blk_col)]; const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, mbmi->bsize, tx_size); av1_init_rd_stats(rd_stats); if (tx_size == plane_tx_size) { ENTROPY_CONTEXT *ta = above_ctx + blk_col; ENTROPY_CONTEXT *tl = left_ctx + blk_row; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)] .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, rd_stats, ftxs_mode, ref_best_rd); const int mi_width = mi_size_wide[plane_bsize]; TxfmSearchInfo *txfm_info = &x->txfm_search_info; if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || rd_stats->skip_txfm == 1) { rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; rd_stats->skip_txfm = 1; set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1); x->plane[0].eobs[block] = 0; x->plane[0].txb_entropy_ctx[block] = 0; update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); } else { rd_stats->skip_txfm = 0; set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0); } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0]; 
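// Note: txfm_partition_cost[ctx][0] is the rate of signalling "no further
// split" at this depth; the split path in the else-branch below adds
// txfm_partition_cost[ctx][1] instead.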
av1_set_txb_context(x, 0, block, tx_size, ta, tl); txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, tx_size); } else { const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int txb_width = tx_size_wide_unit[sub_txs]; const int txb_height = tx_size_high_unit[sub_txs]; const int step = txb_height * txb_width; const int row_end = AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); const int col_end = AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); RD_STATS pn_rd_stats; int64_t this_rd = 0; assert(txb_width > 0 && txb_height > 0); for (int row = 0; row < row_end; row += txb_height) { const int offsetr = blk_row + row; for (int col = 0; col < col_end; col += txb_width) { const int offsetc = blk_col + col; av1_init_rd_stats(&pn_rd_stats); tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, depth + 1, above_ctx, left_ctx, tx_above, tx_left, ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); if (pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist); block += step; } } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1]; } } // Search for tx type with tx sizes already decided for an inter-predicted luma // partition block. It's used only when some speed features are enabled. // Return value 0: early termination triggered, no valid rd cost available; // 1: rd cost values are valid. static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { if (ref_best_rd < 0) { av1_invalid_rd_stats(rd_stats); return 0; } av1_init_rd_stats(rd_stats); MACROBLOCKD *const xd = &x->e_mbd; const TxfmSearchParams *txfm_params = &x->txfm_search_params; const struct macroblockd_plane *const pd = &xd->plane[0]; const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); const int bh = tx_size_high_unit[max_tx_size]; const int bw = tx_size_wide_unit[max_tx_size]; const int step = bw * bh; const int init_depth = get_search_init_depth( mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method); ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); int64_t this_rd = 0; for (int idy = 0, block = 0; idy < mi_height; idy += bh) { for (int idx = 0; idx < mi_width; idx += bw) { RD_STATS pn_rd_stats; av1_init_rd_stats(&pn_rd_stats); tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth, ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); if (pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return 0; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); this_rd += AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); block += step; } } const int skip_ctx = av1_get_skip_txfm_context(xd); const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0]; const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate,
rd_stats->sse);
  this_rd =
      RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
  if (skip_txfm_rd < this_rd) {
    this_rd = skip_txfm_rd;
    rd_stats->rate = 0;
    rd_stats->dist = rd_stats->sse;
    rd_stats->skip_txfm = 1;
  }

  const int is_cost_valid = this_rd > ref_best_rd ? 0 : 1;
  if (!is_cost_valid) {
    // reset cost value
    av1_invalid_rd_stats(rd_stats);
  }
  return is_cost_valid;
}

// Search for the best transform size and type for current inter-predicted
// luma block with recursive transform block partitioning. The obtained
// transform selection will be saved in xd->mi[0], the corresponding RD stats
// will be saved in rd_stats. The returned value is the corresponding RD cost.
static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                       RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                       int64_t ref_best_rd) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const TxfmSearchParams *txfm_params = &x->txfm_search_params;
  assert(is_inter_block(xd->mi[0]));
  assert(bsize < BLOCK_SIZES_ALL);
  const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
  int64_t rd_thresh = ref_best_rd;
  if (rd_thresh == 0) {
    av1_invalid_rd_stats(rd_stats);
    return INT64_MAX;
  }
  if (fast_tx_search && rd_thresh < INT64_MAX) {
    if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
  }
  assert(rd_thresh > 0);
  const FAST_TX_SEARCH_MODE ftxs_mode =
      fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
  const struct macroblockd_plane *const pd = &xd->plane[0];
  assert(bsize < BLOCK_SIZES_ALL);
  const int mi_width = mi_size_wide[bsize];
  const int mi_height = mi_size_high[bsize];
  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
  const int init_depth = get_search_init_depth(
      mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
  const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
  const int bh = tx_size_high_unit[max_tx_size];
  const int bw = tx_size_wide_unit[max_tx_size];
  const int step = bw * bh;
  const int skip_ctx = av1_get_skip_txfm_context(xd);
  const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
  const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
  int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
  int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
  int block = 0;
  av1_init_rd_stats(rd_stats);
  for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
    for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
      const int64_t best_rd_sofar =
          (rd_thresh == INT64_MAX)
              ? INT64_MAX
              : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
      int is_cost_valid = 1;
      RD_STATS pn_rd_stats;
      // Search for the best transform block size and type for the sub-block.
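      // The RD budget handed to the sub-block search (best_rd_sofar) is
      // rd_thresh minus the cheaper of the running skip/no-skip costs
      // accumulated so far, so each successive sub-block is searched under a
      // progressively tighter bound and can terminate early once the budget
      // is exhausted.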
select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize, ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX, best_rd_sofar, &is_cost_valid, ftxs_mode); if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { av1_invalid_rd_stats(rd_stats); return INT64_MAX; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse); no_skip_txfm_rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist); block += step; } } if (rd_stats->rate == INT_MAX) return INT64_MAX; rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd); // If fast_tx_search is true, only DCT and 1D DCT were tested in // select_inter_block_yrd() above. Do a better search for tx type with // tx sizes already decided. if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) return INT64_MAX; } int64_t final_rd; if (rd_stats->skip_txfm) { final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse); } else { final_rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist); if (!xd->lossless[xd->mi[0]->segment_id]) { final_rd = AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse)); } } return final_rd; } // Return 1 to terminate transform search early. The decision is made based on // the comparison with the reference RD cost and the model-estimated RD cost. static inline int model_based_tx_search_prune(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t ref_best_rd) { const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; assert(level >= 0 && level <= 2); int model_rate; int64_t model_dist; uint8_t model_skip; MACROBLOCKD *const xd = &x->e_mbd; model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL, NULL, NULL, NULL); if (model_skip) return 0; const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); // TODO(debargha, urvang): Improve the model and make the check below // tighter. static const int prune_factor_by8[] = { 3, 5 }; const int factor = prune_factor_by8[level - 1]; return ((model_rd * factor) >> 3) > ref_best_rd; } void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; const TxfmSearchParams *txfm_params = &x->txfm_search_params; assert(is_inter_block(xd->mi[0])); av1_invalid_rd_stats(rd_stats); // If modeled RD cost is a lot worse than the best so far, terminate early. if (cpi->sf.tx_sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) { if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return; } // Hashing based speed feature. If the hash of the prediction residue block is // found in the hash table, use previous search results and terminate early. 
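  // The hash key is computed over the prediction residue of the whole block,
  // so two blocks with identical residue (and the same block size) map to the
  // same entry and can reuse the stored rd_stats together with the associated
  // transform-size/type decisions instead of repeating the recursive search.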
uint32_t hash = 0; MB_RD_RECORD *mb_rd_record = NULL; const int mi_row = x->e_mbd.mi_row; const int mi_col = x->e_mbd.mi_col; const int within_border = mi_row >= xd->tile.mi_row_start && (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && mi_col >= xd->tile.mi_col_start && (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); const int is_mb_rd_hash_enabled = (within_border && cpi->sf.rd_sf.use_mb_rd_hash); const int n4 = bsize_to_num_blk(bsize); if (is_mb_rd_hash_enabled) { hash = get_block_residue_hash(x, bsize); mb_rd_record = x->txfm_search_info.mb_rd_record; const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); if (match_index != -1) { MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index]; fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x); return; } } // If we predict that skip is the optimal RD decision - set the respective // context and terminate early. int64_t dist; if (txfm_params->skip_txfm_level && predict_skip_txfm(x, bsize, &dist, cpi->common.features.reduced_tx_set_used)) { set_skip_txfm(x, rd_stats, bsize, dist); // Save the RD search results into mb_rd_record. if (is_mb_rd_hash_enabled) save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record); return; } #if CONFIG_SPEED_STATS ++x->txfm_search_info.tx_search_count; #endif // CONFIG_SPEED_STATS const int64_t rd = select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd); if (rd == INT64_MAX) { // We should always find at least one candidate unless ref_best_rd is less // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type // might have failed to find something better) assert(ref_best_rd != INT64_MAX); av1_invalid_rd_stats(rd_stats); return; } // Save the RD search results into mb_rd_record. if (is_mb_rd_hash_enabled) { assert(mb_rd_record != NULL); save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record); } } void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bs, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const TxfmSearchParams *tx_params = &x->txfm_search_params; assert(bs == mbmi->bsize); const int is_inter = is_inter_block(mbmi); const int mi_row = xd->mi_row; const int mi_col = xd->mi_col; av1_init_rd_stats(rd_stats); // Hashing based speed feature for inter blocks. If the hash of the residue // block is found in the table, use previously saved search results and // terminate early. uint32_t hash = 0; MB_RD_RECORD *mb_rd_record = NULL; const int num_blks = bsize_to_num_blk(bs); if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) { const int within_border = mi_row >= xd->tile.mi_row_start && (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) && mi_col >= xd->tile.mi_col_start && (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); if (within_border) { hash = get_block_residue_hash(x, bs); mb_rd_record = x->txfm_search_info.mb_rd_record; const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); if (match_index != -1) { MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index]; fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x); return; } } } // If we predict that skip is the optimal RD decision - set the respective // context and terminate early. int64_t dist; if (tx_params->skip_txfm_level && is_inter && !xd->lossless[mbmi->segment_id] && predict_skip_txfm(x, bs, &dist, cpi->common.features.reduced_tx_set_used)) { // Populate rdstats as per skip decision set_skip_txfm(x, rd_stats, bs, dist); // Save the RD search results into mb_rd_record. 
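    // The model-predicted skip result is cached like a normal search result:
    // a later block with identical residue reaches the same decision, so the
    // hash lookup above can short-circuit even the prediction step.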
if (mb_rd_record) { save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); } return; } if (xd->lossless[mbmi->segment_id]) { // Lossless mode can only pick the smallest (4x4) transform size. choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); } else if (tx_params->tx_size_search_method == USE_LARGESTALL) { choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); } else { choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); } // Save the RD search results into mb_rd_record for possible reuse in future. if (mb_rd_record) { save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); } } int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd) { av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) return 0; if (!x->e_mbd.is_chroma_ref) return 1; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; const int is_inter = is_inter_block(mbmi); int64_t this_rd = 0, skip_txfm_rd = 0; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); if (is_inter) { for (int plane = 1; plane < MAX_MB_PLANE; ++plane) av1_subtract_plane(x, plane_bsize, plane); } const int skip_trellis = 0; const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); int is_cost_valid = 1; for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { RD_STATS this_rd_stats; int64_t chroma_ref_best_rd = ref_best_rd; // For inter blocks, refined ref_best_rd is used for early exit // For intra blocks, even though current rd crosses ref_best_rd, early // exit is not recommended as current rd is used for gating subsequent // modes as well (say, for angular modes) // TODO(any): Extend the early exit mechanism for intra modes as well if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter && chroma_ref_best_rd != INT64_MAX) chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd); av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane, plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis); if (this_rd_stats.rate == INT_MAX) { is_cost_valid = 0; break; } av1_merge_rd_stats(rd_stats, &this_rd_stats); this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse); if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) { is_cost_valid = 0; break; } } if (!is_cost_valid) { // reset cost value av1_invalid_rd_stats(rd_stats); } return is_cost_valid; } void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size)); if (!cpi->oxcf.txfm_cfg.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { av1_invalid_rd_stats(rd_stats); return; } if (current_rd > ref_best_rd) { av1_invalid_rd_stats(rd_stats); return; } MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; av1_zero(args); args.x = x; args.cpi = cpi; args.best_rd = ref_best_rd; args.current_rd = current_rd; args.ftxs_mode = ftxs_mode; args.skip_trellis = skip_trellis; av1_init_rd_stats(&args.rd_stats); av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left); av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm, &args); MB_MODE_INFO *const mbmi = xd->mi[0]; const int is_inter = 
is_inter_block(mbmi); const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early; if (invalid_rd) { av1_invalid_rd_stats(rd_stats); } else { *rd_stats = args.rd_stats; } } int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; TxfmSearchParams *txfm_params = &x->txfm_search_params; const int skip_ctx = av1_get_skip_txfm_context(xd); const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0], x->mode_costs.skip_txfm_cost[skip_ctx][1] }; const int64_t min_header_rate = mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]); // Account for minimum skip and non_skip rd. // Eventually either one of them will be added to mode_rate const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); if (min_header_rd_possible > ref_best_rd) { av1_invalid_rd_stats(rd_stats_y); return 0; } const AV1_COMMON *cm = &cpi->common; MB_MODE_INFO *const mbmi = xd->mi[0]; const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); const int64_t rd_thresh = ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; av1_init_rd_stats(rd_stats); av1_init_rd_stats(rd_stats_y); rd_stats->rate = mode_rate; // cost and distortion av1_subtract_plane(x, bsize, 0); if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); #if CONFIG_COLLECT_RD_STATS == 2 PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 2 } else { av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); for (int i = 0; i < xd->height * xd->width; ++i) set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm); } if (rd_stats_y->rate == INT_MAX) return 0; av1_merge_rd_stats(rd_stats, rd_stats_y); const int64_t non_skip_txfm_rdcosty = RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist); const int64_t skip_txfm_rdcosty = RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse); const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty); if (min_rdcosty > ref_best_rd) return 0; av1_init_rd_stats(rd_stats_uv); const int num_planes = av1_num_planes(cm); if (num_planes > 1) { int64_t ref_best_chroma_rd = ref_best_rd; // Calculate best rd cost possible for chroma if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && (ref_best_chroma_rd != INT64_MAX)) { ref_best_chroma_rd = (ref_best_chroma_rd - AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty)); } const int is_cost_valid_uv = av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd); if (!is_cost_valid_uv) return 0; av1_merge_rd_stats(rd_stats, rd_stats_uv); } int choose_skip_txfm = rd_stats->skip_txfm; if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) { const int64_t rdcost_no_skip_txfm = RDCOST( x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0], rd_stats->dist); const int64_t rdcost_skip_txfm = RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse); if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1; } if (choose_skip_txfm) { rd_stats_y->rate = 0; rd_stats_uv->rate = 0; rd_stats->rate = mode_rate + skip_txfm_cost[1]; rd_stats->dist = rd_stats->sse; rd_stats_y->dist = rd_stats_y->sse; rd_stats_uv->dist = rd_stats_uv->sse; mbmi->skip_txfm = 1; if 
(rd_stats->skip_txfm) { const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (tmprd > ref_best_rd) return 0; } } else { rd_stats->rate += skip_txfm_cost[0]; mbmi->skip_txfm = 0; } return 1; } aom-3.12.1/av1/encoder/tx_search.h000066400000000000000000000225771477627663500166550ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ #define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ #include "av1/common/pred_common.h" #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif // Set this macro as 1 to collect data about tx size selection. #define COLLECT_TX_SIZE_DATA 0 #if COLLECT_TX_SIZE_DATA static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; #endif enum { FTXS_NONE = 0, FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, FTXS_DISABLE_TRELLIS_OPT = 1 << 1, FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 } UENUM1BYTE(FAST_TX_SEARCH_MODE); static inline int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, TX_SIZE tx_size) { assert(bsize == x->e_mbd.mi[0]->bsize); if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize)) return 0; const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); const int depth = tx_size_to_depth(tx_size, bsize); const MACROBLOCKD *const xd = &x->e_mbd; const int tx_size_ctx = get_tx_size_context(xd); return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth]; } /*!\brief Compute the pixel domain distortion. * * \ingroup transform_search * Compute the pixel domain distortion from diff on all visible 4x4s in the * transform block. * * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] plane Plane index * \param[in] blk_row Block row index * \param[in] blk_col Block col index * \param[in] plane_bsize Current plane block size * \param[in] tx_bsize Transform size * \param[in] block_mse_q8 Block mse * \return An int64_t value that is the block sse. */ int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, unsigned int *block_mse_q8); int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, TX_SIZE tx_size); /*!\brief Recursive transform size and type search. * * \ingroup transform_search * Search for best transform size and type for luma inter blocks. The transform * block partitioning can be recursive resulting in non-uniform transform sizes. * The best transform size and type, if found, will be saved in the MB_MODE_INFO * structure, and the corresponding RD stats will be saved in rd_stats. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] rd_stats Pointer to struct to keep track of the RD stats * \param[in] bsize Current macroblock size * \param[in] ref_best_rd Best RD cost seen for this block so far * \remark Nothing is returned. 
The selected transform size and type will be saved in the MB_MODE_INFO structure */ void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd); /*!\brief Uniform transform size and type search. * * \ingroup transform_search * Search for the best transform size and type for current macroblock block, * with the assumption that all the transform blocks have a uniform size * (VP9 style). The selected transform size and type will be saved in the * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats. * This function may be used for both intra and inter predicted blocks. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] rd_stats Pointer to struct to keep track of the RD stats * \param[in] bs Current macroblock size * \param[in] ref_best_rd Best RD cost seen for this block so far * \remark Nothing is returned. The selected transform size and type will be saved in the MB_MODE_INFO structure */ void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bs, int64_t ref_best_rd); /*!\brief Chroma block transform search. * * \ingroup transform_search * Calculate the transform coefficient RD cost for the given chroma macroblock * If the current mode is intra, then this function will compute the predictor. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] rd_stats Pointer to struct to keep track of the RD stats * \param[in] bsize Current macroblock size * \param[in] ref_best_rd Best RD cost seen for this block so far * \return An integer value is returned. 0: early termination triggered, no valid rd cost available; 1: rd cost values are valid. */ int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd); /*!\brief Transform type search with fixed transform size. * * \ingroup transform_search * Search for the best transform type and calculate the transform coefficients * RD cost of the current transform block with the specified (uniform) transform * size and plane. The RD results will be saved in rd_stats. * * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] cpi Top-level encoder structure * \param[in] rd_stats Pointer to struct to keep track of the RD stats * \param[in] ref_best_rd Best RD cost seen for this block so far * \param[in] current_rd Current RD cost for this block so far * \param[in] plane Plane index * \param[in] plane_bsize Size of the current macroblock considering sup-sampling * \param[in] tx_size The given transform size * \param[in] ftxs_mode Transform search mode specifying desired speed and quality tradeoff * \param[in] skip_trellis Binary flag indicating if trellis optimization should be skipped * * \remark Nothing is returned. The RD results will be saved in rd_stats. */ void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); /*!\brief Recursive transform size and type search. 
* * \ingroup transform_search * This function combines y and uv planes' transform search processes together * for inter-predicted blocks (including IntraBC), when the prediction is * already generated. It first does subtraction to obtain the prediction error. * Then it calls * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and * av1_txfm_uvrd sequentially and handles possible early terminations. * The RD metrics are calculated and stored in rd_stats/_y/_uv. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] bsize Current macroblock size * \param[in] rd_stats Pointer to struct to keep track of the overal RD stats * \param[in] rd_stats_y Pointer to struct to keep track of the RD stats for the luma plane * \param[in] rd_stats_uv Pointer to struct to keep track of the RD stats for the chroma planes * \param[in] mode_rate Rate cost to encode the prediction mode info. of the current macroblock * \param[in] ref_best_rd Best RD cost seen for this block so far * * \return An integer value is returned indicating if a valid transform candidate is found (1) or not (0). */ int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ aom-3.12.1/av1/encoder/txb_rdopt.c000066400000000000000000000645541477627663500166760ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/txb_rdopt.h" #include "av1/encoder/txb_rdopt_utils.h" #include "aom_ports/mem.h" #include "av1/common/idct.h" static inline void update_coeff_general( int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift, int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { const int dqv = get_dqv(dequant, scan[si], iqmatrix); const int ci = scan[si]; const tran_low_t qc = qcoeff[ci]; const int is_last = si == (eob - 1); const int coeff_ctx = get_lower_levels_ctx_general( is_last, si, bhl, width, levels, ci, tx_size, tx_class); if (qc == 0) { *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { const int sign = (qc < 0) ? 
1 : 0; const tran_low_t abs_qc = abs(qc); const tran_low_t tqc = tcoeff[ci]; const tran_low_t dqc = dqcoeff[ci]; const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci); const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); const int rate = get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, txb_costs, bhl, tx_class, levels); const int64_t rd = RDCOST(rdmult, rate, dist); tran_low_t qc_low, dqc_low; tran_low_t abs_qc_low; int64_t dist_low, rd_low; int rate_low; if (abs_qc == 1) { abs_qc_low = qc_low = dqc_low = 0; dist_low = dist0; rate_low = txb_costs->base_cost[coeff_ctx][0]; } else { get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); abs_qc_low = abs_qc - 1; dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci); rate_low = get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx, txb_costs, bhl, tx_class, levels); } rd_low = RDCOST(rdmult, rate_low, dist_low); if (rd_low < rd) { qcoeff[ci] = qc_low; dqcoeff[ci] = dqc_low; levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); *accu_rate += rate_low; *accu_dist += dist_low - dist0; } else { *accu_rate += rate; *accu_dist += dist - dist0; } } } static AOM_FORCE_INLINE void update_coeff_simple( int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int64_t rdmult, int shift, const int16_t *dequant, const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { const int dqv = get_dqv(dequant, scan[si], iqmatrix); (void)eob; // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) // and not the last (scan_idx != eob - 1) assert(si != eob - 1); assert(si > 0); const int ci = scan[si]; const tran_low_t qc = qcoeff[ci]; const int coeff_ctx = get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); if (qc == 0) { *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { const tran_low_t abs_qc = abs(qc); const tran_low_t abs_tqc = abs(tcoeff[ci]); const tran_low_t abs_dqc = abs(dqcoeff[ci]); int rate_low = 0; const int rate = get_two_coeff_cost_simple( ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low); if (abs_dqc < abs_tqc) { *accu_rate += rate; return; } const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci); const int64_t rd = RDCOST(rdmult, rate, dist); const tran_low_t abs_qc_low = abs_qc - 1; const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci); const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); if (rd_low < rd) { const int sign = (qc < 0) ? 
1 : 0; qcoeff[ci] = (-sign ^ abs_qc_low) + sign; dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); *accu_rate += rate_low; } else { *accu_rate += rate; } } } static AOM_FORCE_INLINE void update_coeff_eob( int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width, int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { const int dqv = get_dqv(dequant, scan[si], iqmatrix); assert(si != *eob - 1); const int ci = scan[si]; const tran_low_t qc = qcoeff[ci]; const int coeff_ctx = get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); if (qc == 0) { *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { int lower_level = 0; const tran_low_t abs_qc = abs(qc); const tran_low_t tqc = tcoeff[ci]; const tran_low_t dqc = dqcoeff[ci]; const int sign = (qc < 0) ? 1 : 0; const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0; int rate = get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, txb_costs, bhl, tx_class, levels); int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); tran_low_t qc_low, dqc_low; tran_low_t abs_qc_low; int64_t dist_low, rd_low; int rate_low; if (abs_qc == 1) { abs_qc_low = 0; dqc_low = qc_low = 0; dist_low = 0; rate_low = txb_costs->base_cost[coeff_ctx][0]; rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); } else { get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); abs_qc_low = abs_qc - 1; dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0; rate_low = get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx, txb_costs, bhl, tx_class, levels); rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); } int lower_level_new_eob = 0; const int new_eob = si + 1; const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si); const int new_eob_cost = get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); int rate_coeff_eob = new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, dc_sign_ctx, txb_costs, bhl, tx_class); int64_t dist_new_eob = dist; int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); if (abs_qc_low > 0) { const int rate_coeff_eob_low = new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, coeff_ctx_new_eob, dc_sign_ctx, txb_costs, bhl, tx_class); const int64_t dist_new_eob_low = dist_low; const int64_t rd_new_eob_low = RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); if (rd_new_eob_low < rd_new_eob) { lower_level_new_eob = 1; rd_new_eob = rd_new_eob_low; rate_coeff_eob = rate_coeff_eob_low; dist_new_eob = dist_new_eob_low; } } if (sharpness == 0 || abs_qc > 1) { if (rd_low < rd) { lower_level = 1; rd = rd_low; rate = rate_low; dist = dist_low; } } if (sharpness == 0 && rd_new_eob < rd) { for (int ni = 0; ni < *nz_num; ++ni) { int last_ci = nz_ci[ni]; levels[get_padded_idx(last_ci, bhl)] = 0; qcoeff[last_ci] = 0; dqcoeff[last_ci] = 0; } *eob = new_eob; *nz_num = 0; *accu_rate = rate_coeff_eob; *accu_dist = dist_new_eob; lower_level = lower_level_new_eob; } else { *accu_rate += rate; *accu_dist += dist; } if (lower_level) { qcoeff[ci] = qc_low; dqcoeff[ci] = dqc_low; 
levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); } if (qcoeff[ci]) { nz_ci[*nz_num] = ci; ++*nz_num; } } } static inline void update_skip(int *accu_rate, int64_t accu_dist, int *eob, int nz_num, int *nz_ci, int64_t rdmult, int skip_cost, int non_skip_cost, tran_low_t *qcoeff, tran_low_t *dqcoeff) { const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); if (rd_new_eob < rd) { for (int i = 0; i < nz_num; ++i) { const int ci = nz_ci[i]; qcoeff[ci] = 0; dqcoeff[ci] = 0; // no need to set up levels because this is the last step // levels[get_padded_idx(ci, bhl)] = 0; } *accu_rate = 0; *eob = 0; } } // TODO(angiebird): use this function whenever it's possible static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, TX_TYPE tx_type, int reduced_tx_set_used) { if (plane > 0) return 0; const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; const MB_MODE_INFO *mbmi = xd->mi[0]; const int is_inter = is_inter_block(mbmi); if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && !xd->lossless[xd->mi[0]->segment_id]) { const int ext_tx_set = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); if (is_inter) { if (ext_tx_set > 0) return x->mode_costs .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; } else { if (ext_tx_set > 0) { PREDICTION_MODE intra_dir; if (mbmi->filter_intra_mode_info.use_filter_intra) intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info .filter_intra_mode]; else intra_dir = mbmi->mode; return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size] [intra_dir][tx_type]; } } } return 0; } int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost, int sharpness) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblock_plane *p = &x->plane[plane]; const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); const int16_t *scan = scan_order->scan; const int shift = av1_get_tx_scale(tx_size); int eob = p->eobs[block]; const int16_t *dequant = p->dequant_QTX; const qm_val_t *iqmatrix = av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); const qm_val_t *qmatrix = cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type) : NULL; const int block_offset = BLOCK_OFFSET(block); tran_low_t *qcoeff = p->qcoeff + block_offset; tran_low_t *dqcoeff = p->dqcoeff + block_offset; const tran_low_t *tcoeff = p->coeff + block_offset; const CoeffCosts *coeff_costs = &x->coeff_costs; // This function is not called if eob = 0. assert(eob > 0); const AV1_COMMON *cm = &cpi->common; const PLANE_TYPE plane_type = get_plane_type(plane); const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const TX_CLASS tx_class = tx_type_to_class[tx_type]; const MB_MODE_INFO *mbmi = xd->mi[0]; const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); assert(height == (1 << bhl)); const int is_inter = is_inter_block(mbmi); const LV_MAP_COEFF_COST *txb_costs = &coeff_costs->coeff_costs[txs_ctx][plane_type]; const int eob_multi_size = txsize_log2_minus4[tx_size]; const LV_MAP_EOB_COST *txb_eob_costs = &coeff_costs->eob_costs[eob_multi_size][plane_type]; // For the IQ tune, increase rshift from 2 to 4. 
// This biases trellis quantization towards keeping more coefficients, and // together with the IQ rdmult adjustment in // av1_compute_rd_mult_based_on_qindex(), this helps preserve image // features (like repeating patterns and camera noise/film grain), which // improves SSIMULACRA 2 scores. const int rshift = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_IQ ? 4 : 2; const int64_t rdmult = ROUND_POWER_OF_TWO( (int64_t)x->rdmult * (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))), rshift); uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels); // TODO(angirbird): check iqmatrix const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); int accu_rate = eob_cost; int64_t accu_dist = 0; int si = eob - 1; const int ci = scan[si]; const tran_low_t qc = qcoeff[ci]; const tran_low_t abs_qc = abs(qc); const int sign = qc < 0; const int max_nz_num = 2; int nz_num = 1; int nz_ci[3] = { ci, 0, 0 }; if (abs_qc >= 2) { update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx, dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, levels, iqmatrix, qmatrix); --si; } else { assert(abs_qc == 1); const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si); accu_rate += get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx, txb_costs, bhl, tx_class); const tran_low_t tqc = tcoeff[ci]; const tran_low_t dqc = dqcoeff[ci]; const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci); const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); accu_dist += dist - dist0; --si; } #define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ case tx_class_literal: \ for (; si >= 0 && nz_num <= max_nz_num; --si) { \ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ tx_size, tx_class_literal, bhl, width, \ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ levels, sharpness, iqmatrix, qmatrix); \ } \ break switch (tx_class) { UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); #undef UPDATE_COEFF_EOB_CASE default: assert(false); } if (si == -1 && nz_num <= max_nz_num && sharpness == 0) { update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, non_skip_cost, qcoeff, dqcoeff); } #define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ case tx_class_literal: \ for (; si >= 1; --si) { \ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \ rdmult, shift, dequant, scan, txb_costs, tcoeff, \ qcoeff, dqcoeff, levels, iqmatrix, qmatrix); \ } \ break switch (tx_class) { UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); #undef UPDATE_COEFF_SIMPLE_CASE default: assert(false); } // DC position if (si == 0) { // no need to update accu_dist because it's not used after this point int64_t dummy_dist = 0; update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx, dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, levels, iqmatrix, qmatrix); } const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type, cm->features.reduced_tx_set_used); if (eob == 0) accu_rate += skip_cost; else 
accu_rate += non_skip_cost + tx_type_cost; p->eobs[block] = eob; p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); *rate_cost = accu_rate; return eob; } static AOM_FORCE_INLINE int warehouse_efficients_txb( const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const struct macroblock_plane *p, const int eob, const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, int reduced_tx_set_used) { const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); const int txb_skip_ctx = txb_ctx->txb_skip_ctx; const int bhl = get_txb_bhl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); const int16_t *const scan = scan_order->scan; uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, height); DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); const int eob_multi_size = txsize_log2_minus4[tx_size]; const LV_MAP_EOB_COST *const eob_costs = &x->coeff_costs.eob_costs[eob_multi_size][plane_type]; int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; av1_txb_init_levels(qcoeff, width, height, levels); cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost; int c = eob - 1; { const int pos = scan[c]; const tran_low_t v = qcoeff[pos]; const int sign = AOMSIGN(v); const int level = (v ^ sign) - sign; const int coeff_ctx = coeff_contexts[pos]; cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; if (v) { // sign bit cost if (level > NUM_BASE_LEVELS) { const int ctx = get_br_ctx_eob(pos, bhl, tx_class); cost += get_br_cost(level, lps_cost[ctx]); } if (c) { cost += av1_cost_literal(1); } else { const int sign01 = (sign ^ sign) - sign; const int dc_sign_ctx = txb_ctx->dc_sign_ctx; cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; return cost; } } } const int(*base_cost)[8] = coeff_costs->base_cost; for (c = eob - 2; c >= 1; --c) { const int pos = scan[c]; const int coeff_ctx = coeff_contexts[pos]; const tran_low_t v = qcoeff[pos]; const int level = abs(v); cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; if (v) { // sign bit cost cost += av1_cost_literal(1); if (level > NUM_BASE_LEVELS) { const int ctx = get_br_ctx(levels, pos, bhl, tx_class); cost += get_br_cost(level, lps_cost[ctx]); } } } // c == 0 after previous loop { const int pos = scan[c]; const tran_low_t v = qcoeff[pos]; const int coeff_ctx = coeff_contexts[pos]; const int sign = AOMSIGN(v); const int level = (v ^ sign) - sign; cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; if (v) { // sign bit cost const int sign01 = (sign ^ sign) - sign; const int dc_sign_ctx = txb_ctx->dc_sign_ctx; cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; if (level > NUM_BASE_LEVELS) { const int ctx = get_br_ctx(levels, pos, bhl, tx_class); cost += get_br_cost(level, lps_cost[ctx]); } } } return cost; } /*!\brief Estimate the entropy cost of transform coefficients using Laplacian * distribution. 
* * \ingroup coefficient_coding * * This function assumes each transform coefficient is of its own Laplacian * distribution and the coefficient is the only observation of the Laplacian * distribution. * * Based on that, each coefficient's coding cost can be estimated by computing * the entropy of the corresponding Laplacian distribution. * * This function then return the sum of the estimated entropy cost for all * coefficients in the transform block. * * Note that the entropy cost of end of block (eob) and transform type (tx_type) * are not included. * * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] plane The index of the current plane * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the currernt transform block * \param[in] tx_size The transform size * \param[in] tx_type The transform type * \return int Estimated entropy cost of coefficients in the * transform block. */ static int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TX_TYPE tx_type) { assert(plane == 0); int cost = 0; const struct macroblock_plane *p = &x->plane[plane]; const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); const int16_t *scan = scan_order->scan; tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); int eob = p->eobs[block]; // coeffs int c = eob - 1; // eob { const int pos = scan[c]; const tran_low_t v = abs(qcoeff[pos]) - 1; cost += (v << (AV1_PROB_COST_SHIFT + 2)); } // other coeffs for (c = eob - 2; c >= 0; c--) { const int pos = scan[c]; const tran_low_t v = abs(qcoeff[pos]); const int idx = AOMMIN(v, 14); cost += costLUT[idx]; } // const_term does not contain DC, and log(e) does not contain eob, so both // (eob-1) cost += (const_term + loge_par) * (eob - 1); return cost; } static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian( const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob, const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, int reduced_tx_set_used) { const int txb_skip_ctx = txb_ctx->txb_skip_ctx; const int eob_multi_size = txsize_log2_minus4[tx_size]; const LV_MAP_EOB_COST *const eob_costs = &x->coeff_costs.eob_costs[eob_multi_size][plane_type]; int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type); return cost; } int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { const struct macroblock_plane *p = &x->plane[plane]; const int eob = p->eobs[block]; const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); const LV_MAP_COEFF_COST *const coeff_costs = &x->coeff_costs.coeff_costs[txs_ctx][plane_type]; if (eob == 0) { return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; } const MACROBLOCKD *const xd = &x->e_mbd; const TX_CLASS tx_class = tx_type_to_class[tx_type]; return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob, plane_type, coeff_costs, xd, tx_type, tx_class, reduced_tx_set_used); } int 
av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, const int reduced_tx_set_used, const int adjust_eob) { const struct macroblock_plane *p = &x->plane[plane]; int eob = p->eobs[block]; if (adjust_eob) { const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); const int16_t *scan = scan_order->scan; tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block); tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan, tcoeff, qcoeff, dqcoeff); p->eobs[block] = eob; } const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); const PLANE_TYPE plane_type = get_plane_type(plane); const LV_MAP_COEFF_COST *const coeff_costs = &x->coeff_costs.coeff_costs[txs_ctx][plane_type]; if (eob == 0) { return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; } const MACROBLOCKD *const xd = &x->e_mbd; const TX_CLASS tx_class = tx_type_to_class[tx_type]; return warehouse_efficients_txb_laplacian( x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd, tx_type, tx_class, reduced_tx_set_used); } aom-3.12.1/av1/encoder/txb_rdopt.h000066400000000000000000000127341477627663500166740ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_ #define AOM_AV1_ENCODER_TXB_RDOPT_H_ #include "av1/common/blockd.h" #include "av1/common/txb_common.h" #include "av1/encoder/encoder.h" #ifdef __cplusplus extern "C" { #endif /*!\brief Adjust the magnitude of quantized coefficients to achieve better * rate-distortion (RD) trade-off. * * \ingroup coefficient_coding * * This function goes through each coefficient and greedily choose to lower * the coefficient magnitude by 1 or not based on the RD score. * * The coefficients are processing in reversed scan order. * * Note that, the end of block position (eob) may change if the original last * coefficient is lowered to zero. * * \param[in] cpi Top-level encoder structure * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] plane The index of the current plane * \param[in] block The index of the current transform block in the * \param[in] tx_size The transform size * \param[in] tx_type The transform type * \param[in] txb_ctx Context info for entropy coding transform block * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[out] rate_cost The entropy cost of coding the transform block * after adjustment of coefficients. * \param[in] sharpness When sharpness > 0, the function will be less * aggressive towards lowering the magnitude of coefficients. * In this way, the transform block will contain more high-frequency * coefficients and therefore will preserve the sharpness of the reconstructed * block. 
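 * In the current implementation, a non-zero sharpness leaves the end-of-block
 * position and the block-level skip decision untouched, and near the end of
 * block it avoids lowering coefficients of magnitude 1 to zero, so only
 * magnitude reductions of larger coefficients are considered there.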
*/ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost, int sharpness); /*!\brief Compute the entropy cost of coding coefficients in a transform block. * * \ingroup coefficient_coding * * \param[in] x Pointer to structure holding the data for the current encoding macroblock. * \param[in] plane The index of the current plane. * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the currernt transform block. * \param[in] tx_size The transform size. * \param[in] tx_type The transform type. * \param[in] txb_ctx Context info for entropy coding transform block * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[in] reduced_tx_set_used Whether the transform type is chosen from * a reduced set. */ int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int reduced_tx_set_used); /*!\brief Estimate the entropy cost of coding a transform block using Laplacian * distribution. * * \ingroup coefficient_coding * * This function compute the entropy costs of the end of block position (eob) * and the transform type (tx_type) precisely. * * Then using av1_cost_coeffs_txb_estimate() (see av1/encoder/txb_rdopt.c) to * estimate the entropy costs of coefficients in the transform block. * * In the end, the function returns the sum of entropy costs of end of block * position (eob), transform type (tx_type) and coefficients. * * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less * accurate. * * \param[in] x Pointer to structure holding the data for the current encoding macroblock * \param[in] plane The index of the current plane * \param[in] block The index of the current transform block in the * macroblock. It's defined by number of 4x4 units that have been coded before * the currernt transform block * \param[in] tx_size The transform size * \param[in] tx_type The transform type * \param[in] txb_ctx Context info for entropy coding transform block * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[in] reduced_tx_set_used Whether the transform type is chosen from * a reduced set. * \param[in] adjust_eob Whether to adjust the end of block position (eob) * or not. * \return int Estimated entropy cost of coding the transform block. */ int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, const int block, const TX_SIZE tx_size, const TX_TYPE tx_type, const TXB_CTX *const txb_ctx, const int reduced_tx_set_used, const int adjust_eob); #ifdef __cplusplus } #endif #endif // AOM_AV1_ENCODER_TXB_RDOPT_H_ aom-3.12.1/av1/encoder/txb_rdopt_utils.h000066400000000000000000000215071477627663500201120ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_ #define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_ #include "av1/encoder/encodetxb.h" static const int golomb_bits_cost[32] = { 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9 }; static const int golomb_cost_diff[32] = { 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; // Look up table of individual cost of coefficient by its quantization level. // determined based on Laplacian distribution conditioned on estimated context static const int costLUT[15] = { -1143, 53, 545, 825, 1031, 1209, 1393, 1577, 1763, 1947, 2132, 2317, 2501, 2686, 2871 }; static const int const_term = (1 << AV1_PROB_COST_SHIFT); static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000; static inline int get_dqv(const int16_t *dequant, int coeff_idx, const qm_val_t *iqmatrix) { int dqv = dequant[!!coeff_idx]; if (iqmatrix != NULL) dqv = ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; return dqv; } static inline int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, int shift, const qm_val_t *qmatrix, int coeff_idx) { int64_t diff = (tcoeff - dqcoeff) * (1 << shift); if (qmatrix == NULL) { return diff * diff; } // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion // computation done in av1_block_error_qm, improving visual quality. // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22 // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The // multiplication `diff * diff` then does not risk overflowing. diff *= qmatrix[coeff_idx]; const int64_t error = (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); return error; } static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { int eob_extra; const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); int eob_cost = 0; const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; if (av1_eob_offset_bits[eob_pt] > 0) { const int eob_ctx = eob_pt - 3; const int eob_shift = av1_eob_offset_bits[eob_pt] - 1; const int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; const int offset_bits = av1_eob_offset_bits[eob_pt]; if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); } return eob_cost; } static inline int get_golomb_cost(int abs_qc) { if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; const int length = get_msb(r) + 1; return av1_cost_literal(2 * length - 1); } return 0; } static inline int get_br_cost(tran_low_t level, const int *coeff_lps) { const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); return coeff_lps[base_range] + get_golomb_cost(level); } static inline int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, int *diff) { const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); int golomb_bits = 0; if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; if (r < 32) { golomb_bits = golomb_bits_cost[r]; *diff += golomb_cost_diff[r]; } else { golomb_bits = get_golomb_cost(level); *diff += (r & (r - 1)) == 0 ? 1024 : 0; } } return coeff_lps[base_range] + golomb_bits; } static AOM_FORCE_INLINE int get_two_coeff_cost_simple( int ci, tran_low_t abs_qc, int coeff_ctx, const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class, const uint8_t *levels, int *cost_low) { // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) // and not the last (scan_idx != eob - 1) assert(ci > 0); int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; int diff = 0; if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4]; if (abs_qc) { cost += av1_cost_literal(1); if (abs_qc > NUM_BASE_LEVELS) { const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class); int brcost_diff = 0; cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], &brcost_diff); diff += brcost_diff; } } *cost_low = cost - diff; return cost; } static inline int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, int coeff_ctx, int dc_sign_ctx, const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class) { int cost = 0; cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; if (abs_qc != 0) { if (ci == 0) { cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; } else { cost += av1_cost_literal(1); } if (abs_qc > NUM_BASE_LEVELS) { int br_ctx; br_ctx = get_br_ctx_eob(ci, bhl, tx_class); cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); } } return cost; } static inline int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, int sign, int coeff_ctx, int dc_sign_ctx, const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class, const uint8_t *levels) { int cost = 0; if (is_last) { cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; } else { cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; } if (abs_qc != 0) { if (ci == 0) { cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; } else { cost += av1_cost_literal(1); } if (abs_qc > NUM_BASE_LEVELS) { int br_ctx; if (is_last) br_ctx = get_br_ctx_eob(ci, bhl, tx_class); else br_ctx = get_br_ctx(levels, ci, bhl, tx_class); cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); } } return cost; } static inline void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, int shift, tran_low_t *qc_low, tran_low_t *dqc_low) { tran_low_t abs_qc_low = abs_qc - 1; *qc_low = (-sign ^ abs_qc_low) + sign; 
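  // With sign being 0 or 1, (-sign ^ v) + sign is a branchless conditional
  // negation: sign == 0 leaves v unchanged, while sign == 1 gives (~v) + 1,
  // i.e. -v in two's complement. The asserts below verify this identity.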
assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; *dqc_low = (-sign ^ abs_dqc_low) + sign; assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low); }
static inline void update_coeff_eob_fast(int *eob, int shift, const int16_t *dequant_ptr, const int16_t *scan, const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) { // TODO(sarahparker) make this work for aomqm
int eob_out = *eob; int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7), dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) }; for (int i = *eob - 1; i >= 0; i--) { const int rc = scan[i]; const int qcoeff = qcoeff_ptr[rc]; const int coeff = coeff_ptr[rc]; const int coeff_sign = AOMSIGN(coeff); int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) { eob_out--; qcoeff_ptr[rc] = 0; dqcoeff_ptr[rc] = 0; } else { break; } } *eob = eob_out; }
#endif // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
aom-3.12.1/av1/encoder/var_based_part.c000066400000000000000000002406661477627663500176430ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */
#include <limits.h> #include <math.h> #include <stdio.h> #include <string.h> #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_writer.h" #include "aom_ports/mem.h" #include "aom_ports/aom_timer.h" #include "av1/common/reconinter.h" #include "av1/common/blockd.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/var_based_part.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/rdopt_utils.h"
// Possible values for the force_split variable while evaluating variance based // partitioning.
enum { // Evaluate all partition types PART_EVAL_ALL = 0, // Force PARTITION_SPLIT PART_EVAL_ONLY_SPLIT = 1, // Force PARTITION_NONE PART_EVAL_ONLY_NONE = 2 } UENUM1BYTE(PART_EVAL_STATUS); typedef struct { VPVariance *part_variances; VPartVar *split[4]; } variance_node; static inline void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->part_variances = NULL; switch (bsize) { case BLOCK_128X128: { VP128x128 *vt = (VP128x128 *)data; node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx].part_variances.none; break; } case BLOCK_64X64: { VP64x64 *vt = (VP64x64 *)data; node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx].part_variances.none; break; } case BLOCK_32X32: { VP32x32 *vt = (VP32x32 *)data; node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx].part_variances.none; break; } case BLOCK_16X16: { VP16x16 *vt = (VP16x16 *)data; node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx].part_variances.none; break; } case BLOCK_8X8: { VP8x8 *vt = (VP8x8 *)data; node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx].part_variances.none; break; } default: { VP4x4 *vt = (VP4x4 *)data; assert(bsize == BLOCK_4X4); node->part_variances = &vt->part_variances; for (int split_idx = 0; split_idx < 4; split_idx++) node->split[split_idx] = &vt->split[split_idx]; break; } } } // Set variance values given sum square error, sum error, count. 
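/* The helpers below only accumulate raw terms (sum_square_error, sum_error,
 * log2_count); get_variance() later converts them into a population variance
 * scaled by 256. With n = 2^log2_count, and ignoring integer truncation:
 *   variance = 256 * (sum_square_error - sum_error * sum_error / n) / n
 * For example, sum_square_error = 1000, sum_error = 40, log2_count = 4 gives
 * 256 * (1000 - 1600 / 16) / 16 = 14400. */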
static inline void fill_variance(uint32_t s2, int32_t s, int c, VPartVar *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } static inline void get_variance(VPartVar *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count); } static inline void sum_2_variances(const VPartVar *a, const VPartVar *b, VPartVar *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); } static inline void fill_variance_tree(void *data, BLOCK_SIZE bsize) { variance_node node; memset(&node, 0, sizeof(node)); tree_to_node(data, bsize, &node); sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], &node.part_variances->none); } static inline void set_block_size(AV1_COMP *const cpi, int mi_row, int mi_col, BLOCK_SIZE bsize) { if (cpi->common.mi_params.mi_cols > mi_col && cpi->common.mi_params.mi_rows > mi_row) { CommonModeInfoParams *mi_params = &cpi->common.mi_params; const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; mi->bsize = bsize; } } static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile, void *data, BLOCK_SIZE bsize, int mi_row, int mi_col, int64_t threshold, BLOCK_SIZE bsize_min, PART_EVAL_STATUS force_split) { AV1_COMMON *const cm = &cpi->common; variance_node vt; const int block_width = mi_size_wide[bsize]; const int block_height = mi_size_high[bsize]; int bs_width_check = block_width; int bs_height_check = block_height; int bs_width_vert_check = block_width >> 1; int bs_height_horiz_check = block_height >> 1; // On the right and bottom boundary we only need to check // if half the bsize fits, because boundary is extended // up to 64. So do this check only for sb_size = 64X64. if (cm->seq_params->sb_size == BLOCK_64X64) { if (tile->mi_col_end == cm->mi_params.mi_cols) { bs_width_check = (block_width >> 1) + 1; bs_width_vert_check = (block_width >> 2) + 1; } if (tile->mi_row_end == cm->mi_params.mi_rows) { bs_height_check = (block_height >> 1) + 1; bs_height_horiz_check = (block_height >> 2) + 1; } } assert(block_height == block_width); tree_to_node(data, bsize, &vt); if (mi_col + bs_width_check <= tile->mi_col_end && mi_row + bs_height_check <= tile->mi_row_end && force_split == PART_EVAL_ONLY_NONE) { set_block_size(cpi, mi_row, mi_col, bsize); return 1; } if (force_split == PART_EVAL_ONLY_SPLIT) return 0; // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if // variance is below threshold, otherwise split will be selected. // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_min) { // Variance already computed to set the force_split. 
if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); if (mi_col + bs_width_check <= tile->mi_col_end && mi_row + bs_height_check <= tile->mi_row_end && vt.part_variances->none.variance < threshold) { set_block_size(cpi, mi_row, mi_col, bsize); return 1; } return 0; } else if (bsize > bsize_min) { // Variance already computed to set the force_split. if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); // For key frame: take split for bsize above 32X32 or very high variance. if (frame_is_intra_only(cm) && (bsize > BLOCK_32X32 || vt.part_variances->none.variance > (threshold << 4))) { return 0; } // If variance is low, take the bsize (no split). if (mi_col + bs_width_check <= tile->mi_col_end && mi_row + bs_height_check <= tile->mi_row_end && vt.part_variances->none.variance < threshold) { set_block_size(cpi, mi_row, mi_col, bsize); return 1; } // Check vertical split. if (mi_row + bs_height_check <= tile->mi_row_end && mi_col + bs_width_vert_check <= tile->mi_col_end) { BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); BLOCK_SIZE plane_bsize = get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); get_variance(&vt.part_variances->vert[0]); get_variance(&vt.part_variances->vert[1]); if (vt.part_variances->vert[0].variance < threshold && vt.part_variances->vert[1].variance < threshold && plane_bsize < BLOCK_INVALID) { set_block_size(cpi, mi_row, mi_col, subsize); set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); return 1; } } // Check horizontal split. if (mi_col + bs_width_check <= tile->mi_col_end && mi_row + bs_height_horiz_check <= tile->mi_row_end) { BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); BLOCK_SIZE plane_bsize = get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, xd->plane[AOM_PLANE_U].subsampling_y); get_variance(&vt.part_variances->horz[0]); get_variance(&vt.part_variances->horz[1]); if (vt.part_variances->horz[0].variance < threshold && vt.part_variances->horz[1].variance < threshold && plane_bsize < BLOCK_INVALID) { set_block_size(cpi, mi_row, mi_col, subsize); set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); return 1; } } return 0; } return 0; } static inline int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, int pixels_high) { int all_inside = 1; for (int idx = 0; idx < 4; idx++) { all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide); all_inside &= ((y16_idx + GET_BLK_IDX_Y(idx, 3)) < pixels_high); } return all_inside; } #if CONFIG_AV1_HIGHBITDEPTH // TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd static inline void fill_variance_8x8avg_highbd( const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, int pixels_high) { for (int idx = 0; idx < 4; idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); unsigned int sse = 0; int sum = 0; if (x8_idx < pixels_wide && y8_idx < pixels_high) { int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride); int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride); sum = src_avg - dst_avg; sse = sum * sum; } fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none); } } #endif static inline void fill_variance_8x8avg_lowbd( const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, 
VP16x16 *vst, int pixels_wide, int pixels_high) { unsigned int sse[4] = { 0 }; int sum[4] = { 0 }; if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) { int src_avg[4]; int dst_avg[4]; aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg); aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg); for (int idx = 0; idx < 4; idx++) { sum[idx] = src_avg[idx] - dst_avg[idx]; sse[idx] = sum[idx] * sum[idx]; } } else { for (int idx = 0; idx < 4; idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); if (x8_idx < pixels_wide && y8_idx < pixels_high) { int src_avg = aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride); int dst_avg = aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride); sum[idx] = src_avg - dst_avg; sse[idx] = sum[idx] * sum[idx]; } } } for (int idx = 0; idx < 4; idx++) { fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none); } } // Obtain parameters required to calculate variance (such as sum, sse, etc,.) // at 8x8 sub-block level for a given 16x16 block. // The function can be called only when is_key_frame is false since sum is // computed between source and reference frames. static inline void fill_variance_8x8avg(const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag, int pixels_wide, int pixels_high) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd_flag) { fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx, y16_idx, vst, pixels_wide, pixels_high); return; } #else (void)highbd_flag; #endif // CONFIG_AV1_HIGHBITDEPTH fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx, y16_idx, vst, pixels_wide, pixels_high); } static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, int x16_idx, int y16_idx, #if CONFIG_AV1_HIGHBITDEPTH int highbd_flag, #endif int pixels_wide, int pixels_high) { int minmax_max = 0; int minmax_min = 255; // Loop over the 4 8x8 subblocks. for (int idx = 0; idx < 4; idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); int min = 0; int max = 0; if (x8_idx < pixels_wide && y8_idx < pixels_high) { #if CONFIG_AV1_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { aom_highbd_minmax_8x8( src_buf + y8_idx * src_stride + x8_idx, src_stride, dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max); } else { aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max); } #else aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max); #endif if ((max - min) > minmax_max) minmax_max = (max - min); if ((max - min) < minmax_min) minmax_min = (max - min); } } return (minmax_max - minmax_min); } // Function to compute average and variance of 4x4 sub-block. // The function can be called only when is_key_frame is true since sum is // computed using source frame only. 
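/* For key frames there is no predictor to difference against, so the helper
 * below uses a flat dst_avg of 128; the accumulated sums therefore measure how
 * far each 4x4 source average deviates from mid-gray rather than from a
 * reference block. */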
static inline void fill_variance_4x4avg(const uint8_t *src_buf, int src_stride, int x8_idx, int y8_idx, VP8x8 *vst, #if CONFIG_AV1_HIGHBITDEPTH int highbd_flag, #endif int pixels_wide, int pixels_high, int border_offset_4x4) { for (int idx = 0; idx < 4; idx++) { const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2); const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2); unsigned int sse = 0; int sum = 0; if (x4_idx < pixels_wide - border_offset_4x4 && y4_idx < pixels_high - border_offset_4x4) { int src_avg; int dst_avg = 128; #if CONFIG_AV1_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { src_avg = aom_highbd_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); } else { src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); } #else src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); #endif sum = src_avg - dst_avg; sse = sum * sum; } fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none); } } static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, int non_reference_frame, int is_static) { int64_t threshold = threshold_base; if (non_reference_frame && !is_static) threshold = (3 * threshold) >> 1; if (speed >= 8) { return (5 * threshold) >> 2; } return threshold; } // Tune thresholds less or more aggressively to prefer larger partitions static inline void tune_thresh_based_on_qindex( AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex, int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd, int lighting_change) { double weight; if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) { const int win = 20; if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) weight = 1.0; else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) weight = 0.0; else weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win); if (num_pixels > RESOLUTION_480P) { for (int i = 0; i < 4; i++) { thresholds[i] <<= 1; } } if (num_pixels <= RESOLUTION_288P) { thresholds[3] = INT64_MAX; if (is_segment_id_boosted == false) { thresholds[1] <<= 2; thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4; } else { thresholds[1] <<= 1; thresholds[2] <<= 3; } // Allow for split to 8x8 for superblocks where part of it has // moving boundary. So allow for sb with source_sad above threshold, // and avoid very large source_sad or high source content, to avoid // too many 8x8 within superblock. uint64_t avg_source_sad_thresh = 25000; uint64_t block_sad_low = 25000; uint64_t block_sad_high = 50000; if (cpi->svc.temporal_layer_id == 0 && cpi->svc.number_temporal_layers > 1) { // Increase the sad thresholds for base TL0, as reference/LAST is // 2/4 frames behind (for 2/3 #TL). avg_source_sad_thresh = 40000; block_sad_high = 70000; } if (is_segment_id_boosted == false && cpi->rc.avg_source_sad < avg_source_sad_thresh && block_sad > block_sad_low && block_sad < block_sad_high && !lighting_change) { thresholds[2] = (3 * thresholds[2]) >> 2; thresholds[3] = thresholds[2] << 3; } // Condition the increase of partition thresholds on the segment // and the content. Avoid the increase for superblocks which have // high source sad, unless the whole frame has very high motion // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks // have high source sad). 
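/* The branches below use the weight computed above to blend each threshold
 * between its current value (toward weight 1.0 at low qindex) and a
 * left-shifted copy (toward weight 0.0 at high qindex). The net effect is that
 * for qindex well above QINDEX_LARGE_BLOCK_THR the split thresholds move
 * toward their doubled (or quadrupled) values, which biases the partitioning
 * toward larger blocks. */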
} else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false && (source_sad_nonrd != kHighSad || cpi->rc.avg_source_sad > 50000)) { thresholds[0] = (3 * thresholds[0]) >> 1; thresholds[3] = INT64_MAX; if (current_qindex > QINDEX_LARGE_BLOCK_THR) { thresholds[1] = (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); thresholds[2] = (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); } } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && is_segment_id_boosted == false && (source_sad_nonrd != kHighSad || cpi->rc.avg_source_sad > 50000)) { thresholds[1] = (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]); thresholds[2] = (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]); thresholds[3] = INT64_MAX; } } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) { thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0; thresholds[2] = (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2]; } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) { const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1; if (current_qindex < QINDEX_LARGE_BLOCK_THR - 45) weight = 1.0; else if (current_qindex > QINDEX_LARGE_BLOCK_THR + 45) weight = 0.0; else weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + 45) / (2 * 45); thresholds[1] = (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); thresholds[2] = (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); thresholds[3] = (int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]); } if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128)) thresholds[3] = INT64_MAX; } static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base, int threshold_left_shift, int num_pixels) { if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { const int shift_steps = threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8); assert(shift_steps >= 0); threshold_base <<= shift_steps; } thresholds[0] = threshold_base; thresholds[1] = threshold_base; if (num_pixels < RESOLUTION_720P) { thresholds[2] = threshold_base / 3; thresholds[3] = threshold_base >> 1; } else { int shift_val = 2; if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { shift_val = 0; } thresholds[2] = threshold_base >> shift_val; thresholds[3] = threshold_base >> shift_val; } thresholds[4] = threshold_base << 2; } static inline void tune_thresh_based_on_resolution( AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base, int current_qindex, int source_sad_rd, int num_pixels) { if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1; if (num_pixels <= RESOLUTION_288P) { const int qindex_thr[5][2] = { { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 }, }; int th_idx = 0; if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1) th_idx = (source_sad_rd <= kLowSad) ? 
cpi->sf.rt_sf.var_part_based_on_qidx : 0; if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3) th_idx = cpi->sf.rt_sf.var_part_based_on_qidx; const int qindex_low_thr = qindex_thr[th_idx][0]; const int qindex_high_thr = qindex_thr[th_idx][1]; if (current_qindex >= qindex_high_thr) { threshold_base = (5 * threshold_base) >> 1; thresholds[1] = threshold_base >> 3; thresholds[2] = threshold_base << 2; thresholds[3] = threshold_base << 5; } else if (current_qindex < qindex_low_thr) { thresholds[1] = threshold_base >> 3; thresholds[2] = threshold_base >> 1; thresholds[3] = threshold_base << 3; } else { int64_t qi_diff_low = current_qindex - qindex_low_thr; int64_t qi_diff_high = qindex_high_thr - current_qindex; int64_t threshold_diff = qindex_high_thr - qindex_low_thr; int64_t threshold_base_high = (5 * threshold_base) >> 1; threshold_diff = threshold_diff > 0 ? threshold_diff : 1; threshold_base = (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) / threshold_diff; thresholds[1] = threshold_base >> 3; thresholds[2] = ((qi_diff_low * threshold_base) + qi_diff_high * (threshold_base >> 1)) / threshold_diff; thresholds[3] = ((qi_diff_low * (threshold_base << 5)) + qi_diff_high * (threshold_base << 3)) / threshold_diff; } } else if (num_pixels < RESOLUTION_720P) { thresholds[2] = (5 * threshold_base) >> 2; } else if (num_pixels < RESOLUTION_1080P) { thresholds[2] = threshold_base << 1; } else { // num_pixels >= RESOLUTION_1080P if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { if (num_pixels < RESOLUTION_1440P) { thresholds[2] = (5 * threshold_base) >> 1; } else { thresholds[2] = (7 * threshold_base) >> 1; } } else { if (cpi->oxcf.speed > 7) { thresholds[2] = 6 * threshold_base; } else { thresholds[2] = 3 * threshold_base; } } } } // Increase the base partition threshold, based on content and noise level. static inline int64_t tune_base_thresh_content(AV1_COMP *cpi, int64_t threshold_base, int content_lowsumdiff, int source_sad_nonrd, int num_pixels) { AV1_COMMON *const cm = &cpi->common; int64_t updated_thresh_base = threshold_base; if (cpi->noise_estimate.enabled && content_lowsumdiff && num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60) { NOISE_LEVEL noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kHigh) updated_thresh_base = (5 * updated_thresh_base) >> 1; else if (noise_level == kMedium && !cpi->sf.rt_sf.prefer_large_partition_blocks) updated_thresh_base = (5 * updated_thresh_base) >> 2; } updated_thresh_base = scale_part_thresh_content( updated_thresh_base, cpi->oxcf.speed, cpi->ppi->rtc_ref.non_reference_frame, cpi->rc.frame_source_sad == 0); if (cpi->oxcf.speed >= 11 && source_sad_nonrd > kLowSad && cpi->rc.high_motion_content_screen_rtc) updated_thresh_base = updated_thresh_base << 4; return updated_thresh_base; } static inline void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex, int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd, bool is_segment_id_boosted, int lighting_change) { AV1_COMMON *const cm = &cpi->common; const int is_key_frame = frame_is_intra_only(cm); const int threshold_multiplier = is_key_frame ? 
120 : 1; const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth); int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q); const int current_qindex = cm->quant_params.base_qindex; const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift; const int num_pixels = cm->width * cm->height; if (is_key_frame) { set_vbp_thresholds_key_frame(cpi, thresholds, threshold_base, threshold_left_shift, num_pixels); return; } threshold_base = tune_base_thresh_content( cpi, threshold_base, content_lowsumdiff, source_sad_nonrd, num_pixels); thresholds[0] = threshold_base >> 1; thresholds[1] = threshold_base; thresholds[3] = threshold_base << threshold_left_shift; tune_thresh_based_on_resolution(cpi, thresholds, threshold_base, current_qindex, source_sad_rd, num_pixels); tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, current_qindex, num_pixels, is_segment_id_boosted, source_sad_nonrd, lighting_change); } // Set temporal variance low flag for superblock 64x64. // Only first 25 in the array are used in this case. static inline void set_low_temp_var_flag_64x64(CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) { if (xd->mi[0]->bsize == BLOCK_64X64) { if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) part_info->variance_low[0] = 1; } else if (xd->mi[0]->bsize == BLOCK_64X32) { for (int part_idx = 0; part_idx < 2; part_idx++) { if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) part_info->variance_low[part_idx + 1] = 1; } } else if (xd->mi[0]->bsize == BLOCK_32X64) { for (int part_idx = 0; part_idx < 2; part_idx++) { if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) part_info->variance_low[part_idx + 3] = 1; } } else { static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int idx_str = mi_params->mi_stride * (mi_row + idx[lvl1_idx][0]) + mi_col + idx[lvl1_idx][1]; MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str; if (mi_params->mi_cols <= mi_col + idx[lvl1_idx][1] || mi_params->mi_rows <= mi_row + idx[lvl1_idx][0]) continue; if (*this_mi == NULL) continue; if ((*this_mi)->bsize == BLOCK_32X32) { int64_t threshold_32x32 = (5 * thresholds[1]) >> 3; if (vt->split[lvl1_idx].part_variances.none.variance < threshold_32x32) part_info->variance_low[lvl1_idx + 5] = 1; } else { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. 
if ((*this_mi)->bsize == BLOCK_16X16 || (*this_mi)->bsize == BLOCK_32X16 || (*this_mi)->bsize == BLOCK_16X32) { for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { if (vt->split[lvl1_idx] .split[lvl2_idx] .part_variances.none.variance < (thresholds[2] >> 8)) part_info->variance_low[(lvl1_idx << 2) + lvl2_idx + 9] = 1; } } } } } } static inline void set_low_temp_var_flag_128x128( CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) { if (xd->mi[0]->bsize == BLOCK_128X128) { if (vt->part_variances.none.variance < (thresholds[0] >> 1)) part_info->variance_low[0] = 1; } else if (xd->mi[0]->bsize == BLOCK_128X64) { for (int part_idx = 0; part_idx < 2; part_idx++) { if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) part_info->variance_low[part_idx + 1] = 1; } } else if (xd->mi[0]->bsize == BLOCK_64X128) { for (int part_idx = 0; part_idx < 2; part_idx++) { if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) part_info->variance_low[part_idx + 3] = 1; } } else { static const int idx64[4][2] = { { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } }; static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int idx_str = mi_params->mi_stride * (mi_row + idx64[lvl1_idx][0]) + mi_col + idx64[lvl1_idx][1]; MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str; if (*mi_64 == NULL) continue; if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] || mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0]) continue; const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3; if ((*mi_64)->bsize == BLOCK_64X64) { if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64) part_info->variance_low[5 + lvl1_idx] = 1; } else if ((*mi_64)->bsize == BLOCK_64X32) { for (int part_idx = 0; part_idx < 2; part_idx++) if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance < (threshold_64x64 >> 1)) part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1; } else if ((*mi_64)->bsize == BLOCK_32X64) { for (int part_idx = 0; part_idx < 2; part_idx++) if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance < (threshold_64x64 >> 1)) part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1; } else { for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { const int idx_str1 = mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1]; MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1; if (*mi_32 == NULL) continue; if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] + idx32[lvl2_idx][1] || mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0] + idx32[lvl2_idx][0]) continue; const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3; if ((*mi_32)->bsize == BLOCK_32X32) { if (vt->split[lvl1_idx] .split[lvl2_idx] .part_variances.none.variance < threshold_32x32) part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1; } else { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. 
if ((*mi_32)->bsize == BLOCK_16X16 || (*mi_32)->bsize == BLOCK_32X16 || (*mi_32)->bsize == BLOCK_16X32) { for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { VPartVar *none_var = &vt->split[lvl1_idx] .split[lvl2_idx] .split[lvl3_idx] .part_variances.none; if (none_var->variance < (thresholds[3] >> 8)) part_info->variance_low[41 + (lvl1_idx << 4) + (lvl2_idx << 2) + lvl3_idx] = 1; } } } } } } } } static inline void set_low_temp_var_flag( AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd, VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col, int mi_row, const bool is_small_sb) { AV1_COMMON *const cm = &cpi->common; // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected. // If the temporal variance is small set the flag // variance_low for the block. The variance threshold can be adjusted, the // higher the more aggressive. if (ref_frame_partition == LAST_FRAME) { if (is_small_sb) set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd, &(vt->split[0]), thresholds, mi_col, mi_row); else set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt, thresholds, mi_col, mi_row); } } static const int pos_shift_16x16[4][4] = { { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } }; int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low, int mi_row, int mi_col, BLOCK_SIZE bsize) { // Relative indices of MB inside the superblock. const int mi_x = mi_row & 0xF; const int mi_y = mi_col & 0xF; // Relative indices of 16x16 block inside the superblock. const int i = mi_x >> 2; const int j = mi_y >> 2; int force_skip_low_temp_var = 0; // Set force_skip_low_temp_var based on the block size and block offset. switch (bsize) { case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break; case BLOCK_64X32: if (!mi_y && !mi_x) { force_skip_low_temp_var = variance_low[1]; } else if (!mi_y && mi_x) { force_skip_low_temp_var = variance_low[2]; } break; case BLOCK_32X64: if (!mi_y && !mi_x) { force_skip_low_temp_var = variance_low[3]; } else if (mi_y && !mi_x) { force_skip_low_temp_var = variance_low[4]; } break; case BLOCK_32X32: if (!mi_y && !mi_x) { force_skip_low_temp_var = variance_low[5]; } else if (mi_y && !mi_x) { force_skip_low_temp_var = variance_low[6]; } else if (!mi_y && mi_x) { force_skip_low_temp_var = variance_low[7]; } else if (mi_y && mi_x) { force_skip_low_temp_var = variance_low[8]; } break; case BLOCK_32X16: case BLOCK_16X32: case BLOCK_16X16: force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]]; break; default: break; } return force_skip_low_temp_var; } int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row, int mi_col, BLOCK_SIZE bsize) { int force_skip_low_temp_var = 0; int x, y; x = (mi_col & 0x1F) >> 4; // y = (mi_row & 0x1F) >> 4; // const int idx64 = (y << 1) + x; y = (mi_row & 0x17) >> 3; const int idx64 = y + x; x = (mi_col & 0xF) >> 3; // y = (mi_row & 0xF) >> 3; // const int idx32 = (y << 1) + x; y = (mi_row & 0xB) >> 2; const int idx32 = y + x; x = (mi_col & 0x7) >> 2; // y = (mi_row & 0x7) >> 2; // const int idx16 = (y << 1) + x; y = (mi_row & 0x5) >> 1; const int idx16 = y + x; // Set force_skip_low_temp_var based on the block size and block offset. 
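/* Index layout of variance_low[] for 128x128 superblocks, as written by
 * set_low_temp_var_flag_128x128() above:
 *   [0]       128x128
 *   [1..2]    128x64 halves        [3..4]    64x128 halves
 *   [5..8]    64x64 blocks         [9..16]   64x32 halves
 *   [17..24]  32x64 halves         [25..40]  32x32 blocks
 *   [41..104] 16x16 blocks
 * The switch below maps (bsize, mi_row, mi_col) back into this table. */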
switch (bsize) { case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break; case BLOCK_128X64: assert((mi_col & 0x1F) == 0); force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)]; break; case BLOCK_64X128: assert((mi_row & 0x1F) == 0); force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)]; break; case BLOCK_64X64: // Location of this 64x64 block inside the 128x128 superblock force_skip_low_temp_var = variance_low[5 + idx64]; break; case BLOCK_64X32: x = (mi_col & 0x1F) >> 4; y = (mi_row & 0x1F) >> 3; /* .---------------.---------------. | x=0,y=0,idx=0 | x=0,y=0,idx=2 | :---------------+---------------: | x=0,y=1,idx=1 | x=1,y=1,idx=3 | :---------------+---------------: | x=0,y=2,idx=4 | x=1,y=2,idx=6 | :---------------+---------------: | x=0,y=3,idx=5 | x=1,y=3,idx=7 | '---------------'---------------' */ const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2); force_skip_low_temp_var = variance_low[9 + idx64x32]; break; case BLOCK_32X64: x = (mi_col & 0x1F) >> 3; y = (mi_row & 0x1F) >> 4; const int idx32x64 = (y << 2) + x; force_skip_low_temp_var = variance_low[17 + idx32x64]; break; case BLOCK_32X32: force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32]; break; case BLOCK_32X16: case BLOCK_16X32: case BLOCK_16X16: force_skip_low_temp_var = variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16]; break; default: break; } return force_skip_low_temp_var; } void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex, int content_lowsumdiff) { SPEED_FEATURES *const sf = &cpi->sf; if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { return; } else { set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex, content_lowsumdiff, 0, 0, 0, 0); // The threshold below is not changed locally. cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3); } } static inline void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int y_sad, unsigned int y_sad_g, unsigned int y_sad_alt, bool is_key_frame, bool zero_motion, unsigned int *uv_sad) { MACROBLOCKD *xd = &x->e_mbd; const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; int shift_upper_limit = 1; int shift_lower_limit = 3; int fac_uv = 6; if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return; // Use lower threshold (more conservative in setting color flag) for // higher resolutions non-screen, which tend to have more camera noise. // Since this may be used to skip compound mode in nonrd pickmode, which // is generally more effective for higher resolutions, better to be more // conservative. if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P) fac_uv = 3; else fac_uv = 5; } if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && cpi->rc.high_source_sad) { shift_lower_limit = 7; } else if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && cpi->rc.percent_blocks_with_motion > 90 && cpi->rc.frame_source_sad > 10000 && source_sad_nonrd > kLowSad) { shift_lower_limit = 8; shift_upper_limit = 3; } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 && cpi->common.width * cpi->common.height >= 640 * 360) { shift_upper_limit = 2; shift_lower_limit = source_sad_nonrd > kMedSad ? 
5 : 4; } MB_MODE_INFO *mi = xd->mi[0]; const AV1_COMMON *const cm = &cpi->common; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); const struct scale_factors *const sf = get_ref_scale_factors_const(cm, LAST_FRAME); struct buf_2d dst; unsigned int uv_sad_g = 0; unsigned int uv_sad_alt = 0; for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) { struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); if (bs != BLOCK_INVALID) { // For last: if (zero_motion) { if (mi->ref_frame[0] == LAST_FRAME) { uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); } else { uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer; setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width, yv12->uv_crop_height, yv12->uv_stride, xd->mi_row, xd->mi_col, sf, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, dst.buf, dst.stride); } } else { uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } // For golden: if (y_sad_g != UINT_MAX) { uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer; setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width, yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row, xd->mi_col, sf, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf, dst.stride); } // For altref: if (y_sad_alt != UINT_MAX) { uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer; setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width, yv12_alt->uv_crop_height, yv12_alt->uv_stride, xd->mi_row, xd->mi_col, sf, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf, dst.stride); } } if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit)) x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1; else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit)) x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0; // Borderline case: to be refined at coding block level in nonrd_pickmode, // for coding block size < sb_size. else x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2; x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] = uv_sad_g > y_sad_g / fac_uv; x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] = uv_sad_alt > y_sad_alt / fac_uv; } } static void fill_variance_tree_leaves( AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split, int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4], int64_t *thresholds, const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, int dst_stride, bool is_key_frame, const bool is_small_sb) { MACROBLOCKD *xd = &x->e_mbd; const int num_64x64_blocks = is_small_sb ? 1 : 4; // TODO(kyslov) Bring back compute_minmax_variance with content type detection const int compute_minmax_variance = 0; const int segment_id = xd->mi[0]->segment_id; int pixels_wide = 128, pixels_high = 128; int border_offset_4x4 = 0; int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf; // dst_buf pointer is not used for is_key_frame, so it should be NULL. 
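// (Key frames take the source-only 4x4 path below, which never reads dst_buf,
// while inter frames take the 8x8 path that differences the source against
// dst_buf; hence the assert that dst_buf is NULL for key frames.)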
assert(IMPLIES(is_key_frame, dst_buf == NULL)); if (is_small_sb) { pixels_wide = 64; pixels_high = 64; } if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); #if CONFIG_AV1_TEMPORAL_DENOISING temporal_denoising |= cpi->oxcf.noise_sensitivity; #endif // For temporal filtering or temporal denoiser enabled: since the source // is modified we need to avoid 4x4 avg along superblock boundary, since // simd code will load 8 pixels for 4x4 avg and so can access source // data outside superblock (while its being modified by temporal filter). // Temporal filtering is never done on key frames. if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4; for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) { const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6); const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6); const int blk64_scale_idx = blk64_idx << 2; force_split[blk64_idx + 1] = PART_EVAL_ALL; for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5); const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5); const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL; avg_16x16[blk64_idx][lvl1_idx] = 0; maxvar_16x16[blk64_idx][lvl1_idx] = 0; minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX; for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4); const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4); const int split_index = 21 + lvl1_scale_idx + lvl2_idx; VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; force_split[split_index] = PART_EVAL_ALL; if (is_key_frame) { // Go down to 4x4 down-sampling for variance. for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3); VP8x8 *vst2 = &vst->split[lvl3_idx]; fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2, #if CONFIG_AV1_HIGHBITDEPTH xd->cur_buf->flags, #endif pixels_wide, pixels_high, border_offset_4x4); } } else { fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride, x16_idx, y16_idx, vst, is_cur_buf_hbd(xd), pixels_wide, pixels_high); fill_variance_tree(vst, BLOCK_16X16); VPartVar *none_var = &vt->split[blk64_idx] .split[lvl1_idx] .split[lvl2_idx] .part_variances.none; get_variance(none_var); const int val_none_var = none_var->variance; avg_16x16[blk64_idx][lvl1_idx] += val_none_var; minvar_16x16[blk64_idx][lvl1_idx] = AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var); maxvar_16x16[blk64_idx][lvl1_idx] = AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var); if (val_none_var > thresholds[3]) { // 16X16 variance is above threshold for split, so force split to // 8x8 for this 16x16 block (this also forces splits for upper // levels). force_split[split_index] = PART_EVAL_ONLY_SPLIT; force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } else if (!cyclic_refresh_segment_id_boosted(segment_id) && compute_minmax_variance && val_none_var > thresholds[2]) { // We have some nominal amount of 16x16 variance (based on average), // compute the minmax over the 8x8 sub-blocks, and if above // threshold, force split to 8x8 block for this 16x16 block. 
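/* compute_minmax_8x8() returns the spread between the largest and smallest
 * per-8x8 (max - min) value reported by aom_minmax_8x8() inside this 16x16
 * block, and threshold_minmax is 15 + (qindex >> 3) (set in
 * av1_set_variance_partition_thresholds). Illustration: per-8x8 ranges of
 * {10, 12, 60, 11} give 60 - 10 = 50, which at qindex 120
 * (threshold 15 + 15 = 30) would force the 8x8 split. Note this check is
 * gated by compute_minmax_variance, which is currently hard-coded to 0
 * above. */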
int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf, dst_stride, x16_idx, y16_idx, #if CONFIG_AV1_HIGHBITDEPTH xd->cur_buf->flags, #endif pixels_wide, pixels_high); const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax; if (minmax > thresh_minmax) { force_split[split_index] = PART_EVAL_ONLY_SPLIT; force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } } } } } } } static inline void set_ref_frame_for_partition( AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi, unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt, int mi_row, int mi_col, int num_planes) { AV1_COMMON *const cm = &cpi->common; const bool is_set_golden_ref_frame = *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt; const bool is_set_altref_ref_frame = *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g; if (is_set_golden_ref_frame) { av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); mi->ref_frame[0] = GOLDEN_FRAME; mi->mv[0].as_int = 0; *y_sad = *y_sad_g; *ref_frame_partition = GOLDEN_FRAME; x->nonrd_prune_ref_frame_search = 0; x->sb_me_partition = 0; } else if (is_set_altref_ref_frame) { av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col, get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); mi->ref_frame[0] = ALTREF_FRAME; mi->mv[0].as_int = 0; *y_sad = *y_sad_alt; *ref_frame_partition = ALTREF_FRAME; x->nonrd_prune_ref_frame_search = 0; x->sb_me_partition = 0; } else { *ref_frame_partition = LAST_FRAME; x->nonrd_prune_ref_frame_search = cpi->sf.rt_sf.nonrd_prune_ref_frame_search; } } static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0, const FULLPEL_MV *mv1) { return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col); } static inline void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, bool is_small_sb, int est_motion) { const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; // TODO(yunqingwang@google.com): test if this condition works with other // speeds. if (est_motion > 2 && source_sad_nonrd > kMedSad) return; MACROBLOCKD *xd = &x->e_mbd; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; MB_MODE_INFO *mi = xd->mi[0]; unsigned int above_y_sad = UINT_MAX; unsigned int left_y_sad = UINT_MAX; FULLPEL_MV above_mv = kZeroFullMv; FULLPEL_MV left_mv = kZeroFullMv; SubpelMvLimits subpel_mv_limits; const MV dummy_mv = { 0, 0 }; av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, &dummy_mv); // Current best MV FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv); const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 
7 : 8; if (xd->up_available) { const MB_MODE_INFO *above_mbmi = xd->above_mbmi; if (above_mbmi->mode >= INTRA_MODE_END && above_mbmi->ref_frame[0] == LAST_FRAME) { MV temp = above_mbmi->mv[0].as_mv; clamp_mv(&temp, &subpel_mv_limits); above_mv = get_fullmv_from_mv(&temp); if (mv_distance(&best_mv, &above_mv) > 0) { uint8_t const *ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv); above_y_sad = cpi->ppi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, xd->plane[0].pre[0].stride); } } } if (xd->left_available) { const MB_MODE_INFO *left_mbmi = xd->left_mbmi; if (left_mbmi->mode >= INTRA_MODE_END && left_mbmi->ref_frame[0] == LAST_FRAME) { MV temp = left_mbmi->mv[0].as_mv; clamp_mv(&temp, &subpel_mv_limits); left_mv = get_fullmv_from_mv(&temp); if (mv_distance(&best_mv, &left_mv) > 0 && mv_distance(&above_mv, &left_mv) > 0) { uint8_t const *ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv); left_y_sad = cpi->ppi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, xd->plane[0].pre[0].stride); } } } if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) { *y_sad = above_y_sad; mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv); clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); } if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) { *y_sad = left_y_sad; mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv); clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); } } static void do_int_pro_motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, int mi_row, int mi_col, int source_sad_nonrd) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mi = xd->mi[0]; const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; const int increase_col_sw = source_sad_nonrd > kMedSad && !cpi->rc.high_motion_content_screen_rtc; int me_search_size_col = is_screen ? increase_col_sw ? 512 : 96 : block_size_wide[cm->seq_params->sb_size] >> 1; // For screen use larger search size row motion to capture // vertical scroll, which can be larger motion. int me_search_size_row = is_screen ? source_sad_nonrd > kMedSad ? 512 : 192 : block_size_high[cm->seq_params->sb_size] >> 1; unsigned int y_sad_zero; *y_sad = av1_int_pro_motion_estimation( cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, &y_sad_zero, me_search_size_col, me_search_size_row); // The logic below selects whether the motion estimated in the // int_pro_motion() will be used in nonrd_pickmode. Only do this // for screen for now. if (is_screen) { unsigned int thresh_sad = (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000; if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) { x->sb_me_partition = 1; x->sb_me_mv.as_int = mi->mv[0].as_int; } else { x->sb_me_partition = 0; // Fall back to using zero motion. *y_sad = y_sad_zero; mi->mv[0].as_int = 0; } } } static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, unsigned int *y_sad_last, MV_REFERENCE_FRAME *ref_frame_partition, struct scale_factors *sf_no_scale, int mi_row, int mi_col, bool is_small_sb, bool scaled_ref_last) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); bool scaled_ref_golden = false; bool scaled_ref_alt = false; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; MB_MODE_INFO *mi = xd->mi[0]; const YV12_BUFFER_CONFIG *yv12 = scaled_ref_last ? 
av1_get_scaled_ref_frame(cpi, LAST_FRAME) : get_ref_frame_yv12_buf(cm, LAST_FRAME); assert(yv12 != NULL); const YV12_BUFFER_CONFIG *yv12_g = NULL; const YV12_BUFFER_CONFIG *yv12_alt = NULL; // Check if LAST is a reference. For spatial layers always use it as // reference scaling. int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) || cpi->svc.number_spatial_layers > 1; int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG; int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config || cpi->sf.rt_sf.use_nonrd_altref_frame || (cpi->sf.rt_sf.use_comp_ref_nonrd && cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1); // For 1 spatial layer: GOLDEN is another temporal reference. // Check if it should be used as reference for partitioning. if (cpi->svc.number_spatial_layers == 1 && use_golden_ref && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); if (yv12_g && (yv12_g->y_crop_height != cm->height || yv12_g->y_crop_width != cm->width)) { yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); scaled_ref_golden = true; } if (yv12_g && yv12_g != yv12) { av1_setup_pre_planes( xd, 0, yv12_g, mi_row, mi_col, scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, xd->plane[AOM_PLANE_Y].pre[0].stride); } } // For 1 spatial layer: ALTREF is another temporal reference. // Check if it should be used as reference for partitioning. if (cpi->svc.number_spatial_layers == 1 && use_alt_ref && (cpi->ref_frame_flags & AOM_ALT_FLAG) && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); if (yv12_alt && (yv12_alt->y_crop_height != cm->height || yv12_alt->y_crop_width != cm->width)) { yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME); scaled_ref_alt = true; } if (yv12_alt && yv12_alt != yv12) { av1_setup_pre_planes( xd, 0, yv12_alt, mi_row, mi_col, scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, xd->plane[AOM_PLANE_Y].pre[0].stride); } } if (use_last_ref) { const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; av1_setup_pre_planes( xd, 0, yv12, mi_row, mi_col, scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME), num_planes); mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE_FRAME; mi->bsize = cm->seq_params->sb_size; mi->mv[0].as_int = 0; mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition; // TODO(b/290596301): Look into adjusting this condition. // There is regression on color content when // estimate_motion_for_var_based_partition = 3 and high motion, // so for now force it to 2 based on superblock sad. 
if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2; if ((est_motion == 1 || est_motion == 2) && xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0 && x->source_variance > 100 && source_sad_nonrd > kLowSad) { do_int_pro_motion_estimation(cpi, x, y_sad, mi_row, mi_col, source_sad_nonrd); } if (*y_sad == UINT_MAX) { *y_sad = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, xd->plane[AOM_PLANE_Y].pre[0].stride); } // Evaluate if neighbours' MVs give better predictions. Zero MV is tested // already, so only non-zero MVs are tested here. Here the neighbour blocks // are the first block above or left to this superblock. if (est_motion >= 2 && (xd->up_available || xd->left_available)) evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion); *y_sad_last = *y_sad; } // Pick the ref frame for partitioning, use golden or altref frame only if // its lower sad, bias to LAST with factor 0.9. set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad, y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row, mi_col, num_planes); // Only calculate the predictor for non-zero MV. if (mi->mv[0].as_int != 0) { if (!scaled_ref_last) { set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); } else { xd->block_ref_scale_factors[0] = sf_no_scale; xd->block_ref_scale_factors[1] = sf_no_scale; } av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, cm->seq_params->sb_size, AOM_PLANE_Y, num_planes - 1); } } // Decides whether to split or merge a 16x16 partition block in variance based // partitioning based on the 8x8 sub-block variances. static inline PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var( VP16x16 *var_16x16_info, int64_t threshold16) { int max_8x8_var = 0, min_8x8_var = INT_MAX; for (int split_idx = 0; split_idx < 4; split_idx++) { get_variance(&var_16x16_info->split[split_idx].part_variances.none); int this_8x8_var = var_16x16_info->split[split_idx].part_variances.none.variance; max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var); min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var); } // If the difference between maximum and minimum sub-block variances is high, // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate // only PARTITION_NONE. The shift factor for threshold16 has been derived // empirically. return ((max_8x8_var - min_8x8_var) > (threshold16 << 2)) ? PART_EVAL_ONLY_SPLIT : PART_EVAL_ONLY_NONE; } static inline bool is_set_force_zeromv_skip_based_on_src_sad( int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) { if (set_zeromv_skip_based_on_source_sad == 0) return false; if (set_zeromv_skip_based_on_source_sad >= 3) return source_sad_nonrd <= kLowSad; else if (set_zeromv_skip_based_on_source_sad >= 2) return source_sad_nonrd <= kVeryLowSad; else if (set_zeromv_skip_based_on_source_sad >= 1) return source_sad_nonrd == kZeroSad; return false; } static inline bool set_force_zeromv_skip_for_sb( AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt, unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; if (!is_set_force_zeromv_skip_based_on_src_sad( cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad, x->content_state_sb.source_sad_nonrd)) return false; int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 
1 : 0; const int block_width = mi_size_wide[cm->seq_params->sb_size]; const int block_height = mi_size_high[cm->seq_params->sb_size]; const unsigned int thresh_exit_part_y = cpi->zeromv_skip_thresh_exit_part[bsize] << shift; unsigned int thresh_exit_part_uv = CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift; // Be more aggressive in UV threshold if source_sad >= VeryLowSad // to suppreess visual artifact caused by the speed feature: // set_zeromv_skip_based_on_source_sad = 2. For now only for // part_early_exit_zeromv = 1. if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad && cpi->sf.rt_sf.part_early_exit_zeromv == 1) thresh_exit_part_uv = thresh_exit_part_uv >> 3; if (mi_col + block_width <= tile->mi_col_end && mi_row + block_height <= tile->mi_row_end && y_sad < thresh_exit_part_y && uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) { set_block_size(cpi, mi_row, mi_col, bsize); x->force_zeromv_skip_for_sb = 1; aom_free(vt); // Partition shape is set here at SB level. // Exit needs to happen from av1_choose_var_based_partitioning(). return true; } else if (x->content_state_sb.source_sad_nonrd == kZeroSad && cpi->sf.rt_sf.part_early_exit_zeromv >= 2) x->force_zeromv_skip_for_sb = 2; return false; } int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td, MACROBLOCK *x, int mi_row, int mi_col) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, choose_var_based_partitioning_time); #endif AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; PART_EVAL_STATUS force_split[85]; int avg_64x64; int max_var_32x32[4]; int min_var_32x32[4]; int var_32x32; int var_64x64; int min_var_64x64 = INT_MAX; int max_var_64x64 = 0; int avg_16x16[4][4]; int maxvar_16x16[4][4]; int minvar_16x16[4][4]; const uint8_t *src_buf; const uint8_t *dst_buf; int dst_stride; unsigned int uv_sad[MAX_MB_PLANE - 1]; NOISE_LEVEL noise_level = kLow; bool is_zero_motion = true; bool scaled_ref_last = false; struct scale_factors sf_no_scale; av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height, cm->width, cm->height); bool is_key_frame = (frame_is_intra_only(cm) || (cpi->ppi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); assert(cm->seq_params->sb_size == BLOCK_64X64 || cm->seq_params->sb_size == BLOCK_128X128); const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); const int num_64x64_blocks = is_small_sb ? 1 : 4; unsigned int y_sad = UINT_MAX; unsigned int y_sad_g = UINT_MAX; unsigned int y_sad_alt = UINT_MAX; unsigned int y_sad_last = UINT_MAX; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; // Force skip encoding for all superblocks on slide change for // non_reference_frames. if (cpi->sf.rt_sf.skip_encoding_non_reference_slide_change && cpi->rc.high_source_sad && cpi->ppi->rtc_ref.non_reference_frame) { MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); av1_set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); x->force_zeromv_skip_for_sb = 1; return 0; } // Ref frame used in partitioning. 
  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
  int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
                            vbp_thresholds[2], vbp_thresholds[3],
                            vbp_thresholds[4] };
  const int segment_id = xd->mi[0]->segment_id;
  uint64_t blk_sad = 0;
  if (cpi->src_sad_blk_64x64 != NULL &&
      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
                                  ? (cm->seq_params->mib_size >> 1)
                                  : cm->seq_params->mib_size;
    const int sb_cols =
        (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
    const int sbi_col = mi_col / sb_size_by_mb;
    const int sbi_row = mi_row / sb_size_by_mb;
    blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
  }
  const bool is_segment_id_boosted =
      cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
      cyclic_refresh_segment_id_boosted(segment_id);
  const int qindex =
      is_segment_id_boosted
          ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex)
          : cm->quant_params.base_qindex;
  set_vbp_thresholds(
      cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff,
      x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd,
      is_segment_id_boosted, x->content_state_sb.lighting_change);
  src_buf = x->plane[AOM_PLANE_Y].src.buf;
  int src_stride = x->plane[AOM_PLANE_Y].src.stride;
  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
  // 5-20 for the 16x16 blocks.
  force_split[0] = PART_EVAL_ALL;
  memset(x->part_search_info.variance_low, 0,
         sizeof(x->part_search_info.variance_low));
  // Check if LAST frame is NULL, and if so, treat this frame
  // as a key frame, for the purpose of the superblock partitioning.
  // LAST == NULL can happen in cases where enhancement spatial layers are
  // enabled dynamically and the only reference is the spatial (GOLDEN).
  // If LAST frame has a different resolution: set the scaled_ref_last flag
  // and check if ref_scaled is NULL.
  if (!frame_is_intra_only(cm)) {
    const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME);
    if (ref == NULL) {
      is_key_frame = true;
    } else if (ref->y_crop_height != cm->height ||
               ref->y_crop_width != cm->width) {
      scaled_ref_last = true;
      const YV12_BUFFER_CONFIG *ref_scaled =
          av1_get_scaled_ref_frame(cpi, LAST_FRAME);
      if (ref_scaled == NULL) is_key_frame = true;
    }
  }
  x->source_variance = UINT_MAX;
  // For nonrd_pickmode: compute source_variance, only for superblocks with
  // some motion for now. This input can then be used to bias the partitioning
  // or the chroma_check.
  if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
      x->content_state_sb.source_sad_nonrd > kLowSad)
    x->source_variance = av1_get_perpixel_variance_facade(
        cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
  if (!is_key_frame) {
    setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
                 &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
                 is_small_sb, scaled_ref_last);
    MB_MODE_INFO *mi = xd->mi[0];
    // Use reference SB directly for zero mv.
    if (mi->mv[0].as_int != 0) {
      dst_buf = xd->plane[AOM_PLANE_Y].dst.buf;
      dst_stride = xd->plane[AOM_PLANE_Y].dst.stride;
      is_zero_motion = false;
    } else {
      dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf;
      dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride;
    }
  } else {
    dst_buf = NULL;
    dst_stride = 0;
  }
  // Check and set the color sensitivity of the SB.
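  // Illustrative note (editorial, not part of the original libaom source):
  // the uv_sad[] values filled in by chroma_check() below also feed the
  // early zero-mv exit in set_force_zeromv_skip_for_sb() above, where the
  // chroma threshold is CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(), i.e. 3/4 of
  // the luma exit threshold. As a sketch of that arithmetic, with invented
  // numbers (assuming shift == 0):
  //   thresh_exit_part_y  = 800;             // hypothetical luma threshold
  //   thresh_exit_part_uv = (3 * 800) >> 2;  // = 600 per chroma plane
  // so, among the other conditions, the early exit requires y_sad < 800 and
  // both uv_sad[0] and uv_sad[1] below 600.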
av1_zero(uv_sad); chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame, is_zero_motion, uv_sad); x->force_zeromv_skip_for_sb = 0; VP128x128 *vt; AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); vt->split = td->vt64x64; // If the superblock is completely static (zero source sad) and // the y_sad (relative to LAST ref) is very small, take the sb_size partition // and exit, and force zeromv_last skip mode for nonrd_pickmode. // Only do this on the base segment (so the QP-boosted segment, if applied, // can still continue cleaning/ramping up the quality). // Condition on color uv_sad is also added. if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv && cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE && ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) { // Exit here, if zero mv skip flag is set at SB level. if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col, y_sad, bsize)) return 0; } if (cpi->noise_estimate.enabled) noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames) // variances for splits. fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16, minvar_16x16, thresholds, src_buf, src_stride, dst_buf, dst_stride, is_key_frame, is_small_sb); avg_64x64 = 0; for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { max_var_32x32[blk64_idx] = 0; min_var_32x32[blk64_idx] = INT_MAX; const int blk64_scale_idx = blk64_idx << 2; for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { if (!is_key_frame) continue; VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8); fill_variance_tree(vtemp, BLOCK_16X16); // If variance of this 16x16 block is above the threshold, force block // to split. This also forces a split on the upper levels. get_variance(&vtemp->part_variances.none); if (vtemp->part_variances.none.variance > thresholds[3]) { const int split_index = 21 + lvl1_scale_idx + lvl2_idx; force_split[split_index] = cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3]) : PART_EVAL_ONLY_SPLIT; force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } } fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32); // If variance of this 32x32 block is above the threshold, or if its above // (some threshold of) the average variance over the sub-16x16 blocks, // then force this block to split. This also forces a split on the upper // (64x64) level. 
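    // Illustrative note (editorial, not part of the original libaom source):
    // a hypothetical instance of the rule described above, with invented
    // numbers. Suppose thresholds[2] = 8000 and
    // avg_16x16[blk64_idx][lvl1_idx] = 12000. Then:
    //   var_32x32 = 9000: 9000 > 8000, so the split is forced;
    //   var_32x32 = 7000: on an inter frame, 7000 > (8000 >> 1) and
    //                     7000 > (12000 >> 1), so the split is still forced;
    //   var_32x32 = 5000: 5000 > 4000 but 5000 <= 6000, so this clause does
    //                     not force a split.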
uint64_t frame_sad_thresh = 20000; const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P; if (cpi->svc.number_temporal_layers > 2 && cpi->svc.temporal_layer_id == 0) frame_sad_thresh = frame_sad_thresh << 1; if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) { get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none); var_32x32 = vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance; max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]); min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]); const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] - minvar_16x16[blk64_idx][lvl1_idx]); if (var_32x32 > thresholds[2] || (!is_key_frame && var_32x32 > (thresholds[2] >> 1) && var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) { force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } else if (!is_key_frame && is_360p_or_smaller && ((max_min_var_16X16_diff > (thresholds[2] >> 1) && maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) || (cpi->sf.rt_sf.prefer_large_partition_blocks && x->content_state_sb.source_sad_nonrd > kLowSad && cpi->rc.frame_source_sad < frame_sad_thresh && maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) && maxvar_16x16[blk64_idx][lvl1_idx] > (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) { force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } } } if (force_split[1 + blk64_idx] == PART_EVAL_ALL) { fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64); get_variance(&vt->split[blk64_idx].part_variances.none); var_64x64 = vt->split[blk64_idx].part_variances.none.variance; max_var_64x64 = AOMMAX(var_64x64, max_var_64x64); min_var_64x64 = AOMMIN(var_64x64, min_var_64x64); // If the difference of the max-min variances of sub-blocks or max // variance of a sub-block is above some threshold of then force this // block to split. Only checking this for noise level >= medium, if // encoder is in SVC or if we already forced large blocks. 
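      // Illustrative note (editorial, not part of the original libaom
      // source): with the hypothetical value thresholds[1] = 16000, the check
      // below uses set_threshold = 3 * (16000 >> 3) = 6000 and requires
      // max_var_32x32[blk64_idx] > (16000 >> 1) = 8000. So, for example,
      // max_var_32x32 = 15000 and min_var_32x32 = 7000 (difference 8000 >
      // 6000) force the 64x64 block to split on an inter frame when the
      // noise/SVC/large-partition condition also holds.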
const int max_min_var_32x32_diff = max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx]; const int check_max_var = max_var_32x32[blk64_idx] > thresholds[1] >> 1; const bool check_noise_lvl = noise_level >= kMedium || cpi->ppi->use_svc || cpi->sf.rt_sf.prefer_large_partition_blocks; const int64_t set_threshold = 3 * (thresholds[1] >> 3); if (!is_key_frame && max_min_var_32x32_diff > set_threshold && check_max_var && check_noise_lvl) { force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT; force_split[0] = PART_EVAL_ONLY_SPLIT; } avg_64x64 += var_64x64; } if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT; } if (force_split[0] == PART_EVAL_ALL) { fill_variance_tree(vt, BLOCK_128X128); get_variance(&vt->part_variances.none); const int set_avg_64x64 = (9 * avg_64x64) >> 5; if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64) force_split[0] = PART_EVAL_ONLY_SPLIT; if (!is_key_frame && (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && max_var_64x64 > thresholds[0] >> 1) force_split[0] = PART_EVAL_ONLY_SPLIT; } if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end || !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, thresholds[0], BLOCK_16X16, force_split[0])) { for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4); const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4); const int blk64_scale_idx = blk64_idx << 2; // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold. if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64, mi_row + y64_idx, mi_col + x64_idx, thresholds[1], BLOCK_16X16, force_split[1 + blk64_idx])) continue; for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) { const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3); const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3); const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; if (set_vt_partitioning( cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32, (mi_row + y64_idx + y32_idx), (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16, force_split[5 + blk64_scale_idx + lvl1_idx])) continue; for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) { const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2); const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2); const int split_index = 21 + lvl1_scale_idx + lvl2_idx; VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16, mi_row + y64_idx + y32_idx + y16_idx, mi_col + x64_idx + x32_idx + x16_idx, thresholds[3], BLOCK_8X8, force_split[split_index])) continue; for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) { const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1); const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1); set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), BLOCK_8X8); } } } } } if (cpi->sf.rt_sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds, ref_frame_partition, mi_col, mi_row, is_small_sb); } aom_free(vt); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, choose_var_based_partitioning_time); #endif return 0; } aom-3.12.1/av1/encoder/var_based_part.h000066400000000000000000000102641477627663500176370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
#define AOM_AV1_ENCODER_VAR_BASED_PART_H_

#include

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "av1/encoder/encoder.h"

// Calculate block index x and y from split level and index
#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))

#ifdef __cplusplus
extern "C" {
#endif

#define QINDEX_LARGE_BLOCK_THR \
  100  // Use increased thresholds for midres for speed 9 when qindex is above
       // this threshold

#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
  ((3 * (thresh_exit_part)) >> 2)

/*!\brief Set the thresholds for variance based partition.
 *
 * Set the variance split thresholds for the following block sizes:
 * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
 * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition)
 * is currently only used on key frames. The thresholds are based on Q,
 * resolution, noise level, and content state.
 *
 * \ingroup variance_partition
 * \callgraph
 * \callergraph
 *
 * \param[in] cpi Top level encoder structure
 * \param[in] q q index
 * \param[in] content_lowsumdiff Low sumdiff flag for superblock
 *
 * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
 */
void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
                                           int content_lowsumdiff);

/*!\brief Variance based partition selection.
 *
 * Select the partitioning based on the variance of the residual signal,
 * where the residual is generated as the difference between the source and
 * the prediction. The prediction is the reconstructed LAST or reconstructed
 * GOLDEN, whichever has the lower y sad. For LAST, an option exists (speed
 * feature) to use motion compensation based on superblock motion via
 * int_pro_motion_estimation. For key frames the reference is a fixed 128
 * level, so the variance is the source variance. The variance is computed
 * for downsampled inputs (8x8 or 4x4 downsampled), and selection is done
 * top-down via a set of partition thresholds, defined for each block level
 * and set based on Q, resolution, noise level, and content state.
 *
 * \ingroup variance_partition
 * \callgraph
 * \callergraph
 *
 * \param[in] cpi Top level encoder structure
 * \param[in] tile Pointer to TileInfo
 * \param[in] td Pointer to ThreadData
 * \param[in] x Pointer to MACROBLOCK
 * \param[in] mi_row Row coordinate of the superblock in a step size of
 *                   MI_SIZE
 * \param[in] mi_col Column coordinate of the superblock in a step size of
 *                   MI_SIZE
 *
 * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
 * temporal variance flag and the color sensitivity flag (both used in
 * nonrd_pickmode).
 */
int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
                                      ThreadData *td, MACROBLOCK *x, int mi_row,
                                      int mi_col);

// Read out the block's temporal variance for 64x64 SB case.
int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
                                             int mi_row, int mi_col,
                                             BLOCK_SIZE bsize);

// Read out the block's temporal variance for 128x128 SB case.
int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row, int mi_col, BLOCK_SIZE bsize); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_ aom-3.12.1/av1/encoder/wedge_utils.c000066400000000000000000000100061477627663500171630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_dsp_common.h" #include "av1/common/reconinter.h" #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) /** * Computes SSE of a compound predictor constructed from 2 fundamental * predictors p0 and p1 using blending with mask. * * r1: Residuals of p1. * (source - p1) * d: Difference of p1 and p0. * (p1 - p0) * m: The blending mask * N: Number of pixels * * 'r1', 'd', and 'm' are contiguous. * * Computes: * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to: * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2), * where r0 is (source - p0), and r1 is (source - p1), which is in turn * is equivalent to: * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2), * which is the SSE of the residuals of the compound predictor scaled up by * MAX_MASK_VALUE**2. * * Note that we clamp the partial term in the loop to 16 bits signed. This is * to facilitate equivalent SIMD implementation. It should have no effect if * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always * holds for 8 bit input, and on real input, it should hold practically always, * as residuals are expected to be small. */ uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { uint64_t csse = 0; int i; for (i = 0; i < N; i++) { int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]; t = clamp(t, INT16_MIN, INT16_MAX); csse += t * t; } return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); } /** * Choose the mask sign for a compound predictor. * * ds: Difference of the squares of the residuals. * r0**2 - r1**2 * m: The blending mask * N: Number of pixels * limit: Pre-computed threshold value. * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) * * 'ds' and 'm' are contiguous. * * Returns true if the negated mask has lower SSE compared to the positive * mask. Computation is based on: * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) * > * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) * * which can be simplified to: * * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) * * The right hand side does not depend on the mask, and needs to be passed as * the 'limit' parameter. * * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left * hand side is simply a scalar product between an int16_t and uint8_t vector. * * Note that for efficiency, ds is stored on 16 bits. Real input residuals * being small, this should not cause a noticeable issue. 
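 *
 * Illustrative example (editorial, hypothetical numbers, not from the
 * original source): with WEDGE_WEIGHT_BITS = 6 (MAX_MASK_VALUE = 64), N = 2,
 * residuals r0 = {10, 0}, r1 = {0, 10} and mask m = {64, 0}:
 *   ds    = {10*10 - 0*0, 0*0 - 10*10}       = {100, -100}
 *   acc   = 64*100 + 0*(-100)                = 6400
 *   limit = (64/2) * ((100 + 0) - (0 + 100)) = 0
 * Since acc > limit the function returns 1: the flipped mask, which selects
 * p1 where p1 already matches the source and p0 where p0 does, gives the
 * lower SSE.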
*/ int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc = 0; do { acc += *ds++ * *m++; } while (--N); return acc > limit; } /** * Compute the element-wise difference of the squares of 2 arrays. * * d: Difference of the squares of the inputs: a**2 - b**2 * a: First input array * b: Second input array * N: Number of elements * * 'd', 'a', and 'b' are contiguous. * * The result is saturated to signed 16 bits. */ void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, const int16_t *b, int N) { int i; for (i = 0; i < N; i++) d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); } aom-3.12.1/av1/encoder/x86/000077500000000000000000000000001477627663500151345ustar00rootroot00000000000000aom-3.12.1/av1/encoder/x86/av1_fwd_txfm1d_sse4.c000066400000000000000000001460101477627663500210520ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/encoder/x86/av1_txfm1d_sse4.h" void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, const int stride) { __m128i buf0[32]; __m128i buf1[32]; const int32_t *cospi; int startidx = 0 * stride; int endidx = 31 * stride; // stage 0 // stage 1 buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[12] = 
_mm_add_epi32(input[startidx], input[endidx]); buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += stride; endidx -= stride; buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); // stage 2 cospi = cospi_arr(cos_bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); buf0[7] = _mm_add_epi32(buf1[7], buf1[8]); buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], buf0[27], cos_bit); btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], buf0[25], cos_bit); btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], buf0[24], cos_bit); buf0[28] = buf1[28]; buf0[29] = buf1[29]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 3 cospi = cospi_arr(cos_bit); buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); buf1[8] = buf0[8]; buf1[9] = buf0[9]; btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], buf1[13], cos_bit); btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], buf1[12], cos_bit); buf1[14] = buf0[14]; buf1[15] = buf0[15]; buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); // stage 4 cospi = cospi_arr(cos_bit); buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); buf0[4] = buf1[4]; btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], 
cos_bit); buf0[7] = buf1[7]; buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); buf0[16] = buf1[16]; buf0[17] = buf1[17]; btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], buf0[29], cos_bit); btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], buf0[28], cos_bit); btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], buf0[27], cos_bit); btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); buf0[22] = buf1[22]; buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 5 cospi = cospi_arr(cos_bit); btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], cos_bit); btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], cos_bit); buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); buf1[8] = buf0[8]; btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], cos_bit); btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], buf1[13], cos_bit); buf1[11] = buf0[11]; buf1[12] = buf0[12]; buf1[15] = buf0[15]; buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); // stage 6 cospi = cospi_arr(cos_bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], cos_bit); btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], cos_bit); buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); buf0[16] = buf1[16]; btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30], cos_bit); btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], buf0[29], cos_bit); buf0[19] = buf1[19]; buf0[20] = buf1[20]; btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], buf0[25], cos_bit); buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[27] = buf1[27]; buf0[28] = buf1[28]; buf0[31] = buf1[31]; // stage 7 cospi = cospi_arr(cos_bit); 
buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; buf1[4] = buf0[4]; buf1[5] = buf0[5]; buf1[6] = buf0[6]; buf1[7] = buf0[7]; btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], cos_bit); btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14], cos_bit); btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], buf1[13], cos_bit); btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], buf1[12], cos_bit); buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); buf1[23] = _mm_add_epi32(buf0[23], buf0[22]); buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); // stage 8 cospi = cospi_arr(cos_bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31], cos_bit); btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], buf0[30], cos_bit); btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], buf0[29], cos_bit); btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], buf0[28], cos_bit); btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], buf0[27], cos_bit); btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], buf0[25], cos_bit); btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24], cos_bit); startidx = 0 * stride; endidx = 31 * stride; // stage 9 output[startidx] = buf0[0]; output[endidx] = buf0[31]; startidx += stride; endidx -= stride; output[startidx] = buf0[16]; output[endidx] = buf0[15]; startidx += stride; endidx -= stride; output[startidx] = buf0[8]; output[endidx] = buf0[23]; startidx += stride; endidx -= stride; output[startidx] = buf0[24]; output[endidx] = buf0[7]; startidx += stride; endidx -= stride; output[startidx] = buf0[4]; output[endidx] = buf0[27]; startidx += stride; endidx -= stride; output[startidx] = buf0[20]; output[endidx] = buf0[11]; startidx += stride; endidx -= stride; output[startidx] = buf0[12]; output[endidx] = buf0[19]; startidx += stride; endidx -= stride; output[startidx] = buf0[28]; output[endidx] = buf0[3]; startidx += stride; endidx -= stride; output[startidx] = buf0[2]; output[endidx] = buf0[29]; startidx += stride; endidx -= stride; output[startidx] = buf0[18]; output[endidx] = buf0[13]; startidx += stride; endidx -= stride; output[startidx] = buf0[10]; output[endidx] = buf0[21]; startidx += stride; endidx -= stride; output[startidx] = buf0[26]; output[endidx] = buf0[5]; startidx += stride; endidx -= stride; output[startidx] = buf0[6]; 
output[endidx] = buf0[25]; startidx += stride; endidx -= stride; output[startidx] = buf0[22]; output[endidx] = buf0[9]; startidx += stride; endidx -= stride; output[startidx] = buf0[14]; output[endidx] = buf0[17]; startidx += stride; endidx -= stride; output[startidx] = buf0[30]; output[endidx] = buf0[1]; } void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, const int instride, const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); __m128i 
cospi_p27 = _mm_set1_epi32(cospi[27]); __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); int startidx = 0 * instride; int endidx = 63 * instride; // stage 1 __m128i x1[64]; x1[0] = _mm_add_epi32(input[startidx], input[endidx]); x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[1] = _mm_add_epi32(input[startidx], input[endidx]); x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[2] = _mm_add_epi32(input[startidx], input[endidx]); x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[3] = _mm_add_epi32(input[startidx], input[endidx]); x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[4] = _mm_add_epi32(input[startidx], input[endidx]); x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[5] = _mm_add_epi32(input[startidx], input[endidx]); x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[6] = _mm_add_epi32(input[startidx], input[endidx]); x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[7] = _mm_add_epi32(input[startidx], input[endidx]); x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[8] = _mm_add_epi32(input[startidx], input[endidx]); x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[9] = _mm_add_epi32(input[startidx], input[endidx]); x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[10] = _mm_add_epi32(input[startidx], input[endidx]); x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[11] = _mm_add_epi32(input[startidx], input[endidx]); x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[12] = _mm_add_epi32(input[startidx], input[endidx]); x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[13] = _mm_add_epi32(input[startidx], input[endidx]); x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[14] = _mm_add_epi32(input[startidx], input[endidx]); x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[15] = _mm_add_epi32(input[startidx], input[endidx]); x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[16] = _mm_add_epi32(input[startidx], input[endidx]); x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[17] = _mm_add_epi32(input[startidx], input[endidx]); x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[18] = _mm_add_epi32(input[startidx], input[endidx]); 
x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[19] = _mm_add_epi32(input[startidx], input[endidx]); x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[20] = _mm_add_epi32(input[startidx], input[endidx]); x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[21] = _mm_add_epi32(input[startidx], input[endidx]); x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[22] = _mm_add_epi32(input[startidx], input[endidx]); x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[23] = _mm_add_epi32(input[startidx], input[endidx]); x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[24] = _mm_add_epi32(input[startidx], input[endidx]); x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[25] = _mm_add_epi32(input[startidx], input[endidx]); x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[26] = _mm_add_epi32(input[startidx], input[endidx]); x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[27] = _mm_add_epi32(input[startidx], input[endidx]); x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[28] = _mm_add_epi32(input[startidx], input[endidx]); x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[29] = _mm_add_epi32(input[startidx], input[endidx]); x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[30] = _mm_add_epi32(input[startidx], input[endidx]); x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[31] = _mm_add_epi32(input[startidx], input[endidx]); x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); // stage 2 __m128i x2[64]; x2[0] = _mm_add_epi32(x1[0], x1[31]); x2[31] = _mm_sub_epi32(x1[0], x1[31]); x2[1] = _mm_add_epi32(x1[1], x1[30]); x2[30] = _mm_sub_epi32(x1[1], x1[30]); x2[2] = _mm_add_epi32(x1[2], x1[29]); x2[29] = _mm_sub_epi32(x1[2], x1[29]); x2[3] = _mm_add_epi32(x1[3], x1[28]); x2[28] = _mm_sub_epi32(x1[3], x1[28]); x2[4] = _mm_add_epi32(x1[4], x1[27]); x2[27] = _mm_sub_epi32(x1[4], x1[27]); x2[5] = _mm_add_epi32(x1[5], x1[26]); x2[26] = _mm_sub_epi32(x1[5], x1[26]); x2[6] = _mm_add_epi32(x1[6], x1[25]); x2[25] = _mm_sub_epi32(x1[6], x1[25]); x2[7] = _mm_add_epi32(x1[7], x1[24]); x2[24] = _mm_sub_epi32(x1[7], x1[24]); x2[8] = _mm_add_epi32(x1[8], x1[23]); x2[23] = _mm_sub_epi32(x1[8], x1[23]); x2[9] = _mm_add_epi32(x1[9], x1[22]); x2[22] = _mm_sub_epi32(x1[9], x1[22]); x2[10] = _mm_add_epi32(x1[10], x1[21]); x2[21] = _mm_sub_epi32(x1[10], x1[21]); x2[11] = _mm_add_epi32(x1[11], x1[20]); x2[20] = _mm_sub_epi32(x1[11], x1[20]); x2[12] = _mm_add_epi32(x1[12], x1[19]); x2[19] = _mm_sub_epi32(x1[12], x1[19]); x2[13] = _mm_add_epi32(x1[13], x1[18]); x2[18] = _mm_sub_epi32(x1[13], x1[18]); x2[14] = _mm_add_epi32(x1[14], x1[17]); x2[17] = _mm_sub_epi32(x1[14], x1[17]); x2[15] = _mm_add_epi32(x1[15], x1[16]); x2[16] = _mm_sub_epi32(x1[15], x1[16]); x2[32] = x1[32]; x2[33] = x1[33]; x2[34] = x1[34]; x2[35] = x1[35]; x2[36] = x1[36]; x2[37] = x1[37]; x2[38] = x1[38]; x2[39] = x1[39]; btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], 
__rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], __rounding, cos_bit); x2[56] = x1[56]; x2[57] = x1[57]; x2[58] = x1[58]; x2[59] = x1[59]; x2[60] = x1[60]; x2[61] = x1[61]; x2[62] = x1[62]; x2[63] = x1[63]; // stage 3 __m128i x3[64]; x3[0] = _mm_add_epi32(x2[0], x2[15]); x3[15] = _mm_sub_epi32(x2[0], x2[15]); x3[1] = _mm_add_epi32(x2[1], x2[14]); x3[14] = _mm_sub_epi32(x2[1], x2[14]); x3[2] = _mm_add_epi32(x2[2], x2[13]); x3[13] = _mm_sub_epi32(x2[2], x2[13]); x3[3] = _mm_add_epi32(x2[3], x2[12]); x3[12] = _mm_sub_epi32(x2[3], x2[12]); x3[4] = _mm_add_epi32(x2[4], x2[11]); x3[11] = _mm_sub_epi32(x2[4], x2[11]); x3[5] = _mm_add_epi32(x2[5], x2[10]); x3[10] = _mm_sub_epi32(x2[5], x2[10]); x3[6] = _mm_add_epi32(x2[6], x2[9]); x3[9] = _mm_sub_epi32(x2[6], x2[9]); x3[7] = _mm_add_epi32(x2[7], x2[8]); x3[8] = _mm_sub_epi32(x2[7], x2[8]); x3[16] = x2[16]; x3[17] = x2[17]; x3[18] = x2[18]; x3[19] = x2[19]; btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], __rounding, cos_bit); x3[28] = x2[28]; x3[29] = x2[29]; x3[30] = x2[30]; x3[31] = x2[31]; x3[32] = _mm_add_epi32(x2[32], x2[47]); x3[47] = _mm_sub_epi32(x2[32], x2[47]); x3[33] = _mm_add_epi32(x2[33], x2[46]); x3[46] = _mm_sub_epi32(x2[33], x2[46]); x3[34] = _mm_add_epi32(x2[34], x2[45]); x3[45] = _mm_sub_epi32(x2[34], x2[45]); x3[35] = _mm_add_epi32(x2[35], x2[44]); x3[44] = _mm_sub_epi32(x2[35], x2[44]); x3[36] = _mm_add_epi32(x2[36], x2[43]); x3[43] = _mm_sub_epi32(x2[36], x2[43]); x3[37] = _mm_add_epi32(x2[37], x2[42]); x3[42] = _mm_sub_epi32(x2[37], x2[42]); x3[38] = _mm_add_epi32(x2[38], x2[41]); x3[41] = _mm_sub_epi32(x2[38], x2[41]); x3[39] = _mm_add_epi32(x2[39], x2[40]); x3[40] = _mm_sub_epi32(x2[39], x2[40]); x3[48] = _mm_sub_epi32(x2[63], x2[48]); x3[63] = _mm_add_epi32(x2[63], x2[48]); x3[49] = _mm_sub_epi32(x2[62], x2[49]); x3[62] = _mm_add_epi32(x2[62], x2[49]); x3[50] = _mm_sub_epi32(x2[61], x2[50]); x3[61] = _mm_add_epi32(x2[61], x2[50]); x3[51] = _mm_sub_epi32(x2[60], x2[51]); x3[60] = _mm_add_epi32(x2[60], x2[51]); x3[52] = _mm_sub_epi32(x2[59], x2[52]); x3[59] = _mm_add_epi32(x2[59], x2[52]); x3[53] = _mm_sub_epi32(x2[58], x2[53]); x3[58] = _mm_add_epi32(x2[58], x2[53]); x3[54] = _mm_sub_epi32(x2[57], x2[54]); x3[57] = _mm_add_epi32(x2[57], x2[54]); x3[55] = _mm_sub_epi32(x2[56], x2[55]); x3[56] = _mm_add_epi32(x2[56], x2[55]); // stage 4 __m128i x4[64]; x4[0] = _mm_add_epi32(x3[0], x3[7]); x4[7] = _mm_sub_epi32(x3[0], x3[7]); x4[1] = _mm_add_epi32(x3[1], x3[6]); x4[6] = _mm_sub_epi32(x3[1], x3[6]); x4[2] = _mm_add_epi32(x3[2], x3[5]); x4[5] = _mm_sub_epi32(x3[2], 
x3[5]); x4[3] = _mm_add_epi32(x3[3], x3[4]); x4[4] = _mm_sub_epi32(x3[3], x3[4]); x4[8] = x3[8]; x4[9] = x3[9]; btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], __rounding, cos_bit); x4[14] = x3[14]; x4[15] = x3[15]; x4[16] = _mm_add_epi32(x3[16], x3[23]); x4[23] = _mm_sub_epi32(x3[16], x3[23]); x4[17] = _mm_add_epi32(x3[17], x3[22]); x4[22] = _mm_sub_epi32(x3[17], x3[22]); x4[18] = _mm_add_epi32(x3[18], x3[21]); x4[21] = _mm_sub_epi32(x3[18], x3[21]); x4[19] = _mm_add_epi32(x3[19], x3[20]); x4[20] = _mm_sub_epi32(x3[19], x3[20]); x4[24] = _mm_sub_epi32(x3[31], x3[24]); x4[31] = _mm_add_epi32(x3[31], x3[24]); x4[25] = _mm_sub_epi32(x3[30], x3[25]); x4[30] = _mm_add_epi32(x3[30], x3[25]); x4[26] = _mm_sub_epi32(x3[29], x3[26]); x4[29] = _mm_add_epi32(x3[29], x3[26]); x4[27] = _mm_sub_epi32(x3[28], x3[27]); x4[28] = _mm_add_epi32(x3[28], x3[27]); x4[32] = x3[32]; x4[33] = x3[33]; x4[34] = x3[34]; x4[35] = x3[35]; btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], __rounding, cos_bit); x4[44] = x3[44]; x4[45] = x3[45]; x4[46] = x3[46]; x4[47] = x3[47]; x4[48] = x3[48]; x4[49] = x3[49]; x4[50] = x3[50]; x4[51] = x3[51]; x4[60] = x3[60]; x4[61] = x3[61]; x4[62] = x3[62]; x4[63] = x3[63]; // stage 5 __m128i x5[64]; x5[0] = _mm_add_epi32(x4[0], x4[3]); x5[3] = _mm_sub_epi32(x4[0], x4[3]); x5[1] = _mm_add_epi32(x4[1], x4[2]); x5[2] = _mm_sub_epi32(x4[1], x4[2]); x5[4] = x4[4]; btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], __rounding, cos_bit); x5[7] = x4[7]; x5[8] = _mm_add_epi32(x4[8], x4[11]); x5[11] = _mm_sub_epi32(x4[8], x4[11]); x5[9] = _mm_add_epi32(x4[9], x4[10]); x5[10] = _mm_sub_epi32(x4[9], x4[10]); x5[12] = _mm_sub_epi32(x4[15], x4[12]); x5[15] = _mm_add_epi32(x4[15], x4[12]); x5[13] = _mm_sub_epi32(x4[14], x4[13]); x5[14] = _mm_add_epi32(x4[14], x4[13]); x5[16] = x4[16]; x5[17] = x4[17]; btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], __rounding, cos_bit); x5[22] = x4[22]; x5[23] = x4[23]; x5[24] = x4[24]; x5[25] = x4[25]; x5[30] = x4[30]; x5[31] = x4[31]; x5[32] = _mm_add_epi32(x4[32], x4[39]); x5[39] = _mm_sub_epi32(x4[32], x4[39]); x5[33] = _mm_add_epi32(x4[33], x4[38]); x5[38] = _mm_sub_epi32(x4[33], x4[38]); x5[34] = _mm_add_epi32(x4[34], x4[37]); x5[37] = _mm_sub_epi32(x4[34], x4[37]); x5[35] = _mm_add_epi32(x4[35], x4[36]); x5[36] = _mm_sub_epi32(x4[35], x4[36]); x5[40] = 
_mm_sub_epi32(x4[47], x4[40]); x5[47] = _mm_add_epi32(x4[47], x4[40]); x5[41] = _mm_sub_epi32(x4[46], x4[41]); x5[46] = _mm_add_epi32(x4[46], x4[41]); x5[42] = _mm_sub_epi32(x4[45], x4[42]); x5[45] = _mm_add_epi32(x4[45], x4[42]); x5[43] = _mm_sub_epi32(x4[44], x4[43]); x5[44] = _mm_add_epi32(x4[44], x4[43]); x5[48] = _mm_add_epi32(x4[48], x4[55]); x5[55] = _mm_sub_epi32(x4[48], x4[55]); x5[49] = _mm_add_epi32(x4[49], x4[54]); x5[54] = _mm_sub_epi32(x4[49], x4[54]); x5[50] = _mm_add_epi32(x4[50], x4[53]); x5[53] = _mm_sub_epi32(x4[50], x4[53]); x5[51] = _mm_add_epi32(x4[51], x4[52]); x5[52] = _mm_sub_epi32(x4[51], x4[52]); x5[56] = _mm_sub_epi32(x4[63], x4[56]); x5[63] = _mm_add_epi32(x4[63], x4[56]); x5[57] = _mm_sub_epi32(x4[62], x4[57]); x5[62] = _mm_add_epi32(x4[62], x4[57]); x5[58] = _mm_sub_epi32(x4[61], x4[58]); x5[61] = _mm_add_epi32(x4[61], x4[58]); x5[59] = _mm_sub_epi32(x4[60], x4[59]); x5[60] = _mm_add_epi32(x4[60], x4[59]); // stage 6 __m128i x6[64]; btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], __rounding, cos_bit); x6[4] = _mm_add_epi32(x5[4], x5[5]); x6[5] = _mm_sub_epi32(x5[4], x5[5]); x6[6] = _mm_sub_epi32(x5[7], x5[6]); x6[7] = _mm_add_epi32(x5[7], x5[6]); x6[8] = x5[8]; btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], __rounding, cos_bit); x6[11] = x5[11]; x6[12] = x5[12]; x6[15] = x5[15]; x6[16] = _mm_add_epi32(x5[16], x5[19]); x6[19] = _mm_sub_epi32(x5[16], x5[19]); x6[17] = _mm_add_epi32(x5[17], x5[18]); x6[18] = _mm_sub_epi32(x5[17], x5[18]); x6[20] = _mm_sub_epi32(x5[23], x5[20]); x6[23] = _mm_add_epi32(x5[23], x5[20]); x6[21] = _mm_sub_epi32(x5[22], x5[21]); x6[22] = _mm_add_epi32(x5[22], x5[21]); x6[24] = _mm_add_epi32(x5[24], x5[27]); x6[27] = _mm_sub_epi32(x5[24], x5[27]); x6[25] = _mm_add_epi32(x5[25], x5[26]); x6[26] = _mm_sub_epi32(x5[25], x5[26]); x6[28] = _mm_sub_epi32(x5[31], x5[28]); x6[31] = _mm_add_epi32(x5[31], x5[28]); x6[29] = _mm_sub_epi32(x5[30], x5[29]); x6[30] = _mm_add_epi32(x5[30], x5[29]); x6[32] = x5[32]; x6[33] = x5[33]; btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], __rounding, cos_bit); x6[38] = x5[38]; x6[39] = x5[39]; x6[40] = x5[40]; x6[41] = x5[41]; btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], __rounding, cos_bit); x6[46] = x5[46]; x6[47] = x5[47]; x6[48] = x5[48]; x6[49] = x5[49]; x6[54] = x5[54]; x6[55] = x5[55]; x6[56] = x5[56]; x6[57] = x5[57]; x6[62] = x5[62]; x6[63] = x5[63]; // stage 7 __m128i x7[64]; x7[0] = x6[0]; x7[1] = x6[1]; x7[2] = x6[2]; x7[3] = x6[3]; btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], 
__rounding, cos_bit); x7[8] = _mm_add_epi32(x6[8], x6[9]); x7[9] = _mm_sub_epi32(x6[8], x6[9]); x7[10] = _mm_sub_epi32(x6[11], x6[10]); x7[11] = _mm_add_epi32(x6[11], x6[10]); x7[12] = _mm_add_epi32(x6[12], x6[13]); x7[13] = _mm_sub_epi32(x6[12], x6[13]); x7[14] = _mm_sub_epi32(x6[15], x6[14]); x7[15] = _mm_add_epi32(x6[15], x6[14]); x7[16] = x6[16]; btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], __rounding, cos_bit); x7[19] = x6[19]; x7[20] = x6[20]; btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], __rounding, cos_bit); x7[23] = x6[23]; x7[24] = x6[24]; x7[27] = x6[27]; x7[28] = x6[28]; x7[31] = x6[31]; x7[32] = _mm_add_epi32(x6[32], x6[35]); x7[35] = _mm_sub_epi32(x6[32], x6[35]); x7[33] = _mm_add_epi32(x6[33], x6[34]); x7[34] = _mm_sub_epi32(x6[33], x6[34]); x7[36] = _mm_sub_epi32(x6[39], x6[36]); x7[39] = _mm_add_epi32(x6[39], x6[36]); x7[37] = _mm_sub_epi32(x6[38], x6[37]); x7[38] = _mm_add_epi32(x6[38], x6[37]); x7[40] = _mm_add_epi32(x6[40], x6[43]); x7[43] = _mm_sub_epi32(x6[40], x6[43]); x7[41] = _mm_add_epi32(x6[41], x6[42]); x7[42] = _mm_sub_epi32(x6[41], x6[42]); x7[44] = _mm_sub_epi32(x6[47], x6[44]); x7[47] = _mm_add_epi32(x6[47], x6[44]); x7[45] = _mm_sub_epi32(x6[46], x6[45]); x7[46] = _mm_add_epi32(x6[46], x6[45]); x7[48] = _mm_add_epi32(x6[48], x6[51]); x7[51] = _mm_sub_epi32(x6[48], x6[51]); x7[49] = _mm_add_epi32(x6[49], x6[50]); x7[50] = _mm_sub_epi32(x6[49], x6[50]); x7[52] = _mm_sub_epi32(x6[55], x6[52]); x7[55] = _mm_add_epi32(x6[55], x6[52]); x7[53] = _mm_sub_epi32(x6[54], x6[53]); x7[54] = _mm_add_epi32(x6[54], x6[53]); x7[56] = _mm_add_epi32(x6[56], x6[59]); x7[59] = _mm_sub_epi32(x6[56], x6[59]); x7[57] = _mm_add_epi32(x6[57], x6[58]); x7[58] = _mm_sub_epi32(x6[57], x6[58]); x7[60] = _mm_sub_epi32(x6[63], x6[60]); x7[63] = _mm_add_epi32(x6[63], x6[60]); x7[61] = _mm_sub_epi32(x6[62], x6[61]); x7[62] = _mm_add_epi32(x6[62], x6[61]); // stage 8 __m128i x8[64]; x8[0] = x7[0]; x8[1] = x7[1]; x8[2] = x7[2]; x8[3] = x7[3]; x8[4] = x7[4]; x8[5] = x7[5]; x8[6] = x7[6]; x8[7] = x7[7]; btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], __rounding, cos_bit); x8[16] = _mm_add_epi32(x7[16], x7[17]); x8[17] = _mm_sub_epi32(x7[16], x7[17]); x8[18] = _mm_sub_epi32(x7[19], x7[18]); x8[19] = _mm_add_epi32(x7[19], x7[18]); x8[20] = _mm_add_epi32(x7[20], x7[21]); x8[21] = _mm_sub_epi32(x7[20], x7[21]); x8[22] = _mm_sub_epi32(x7[23], x7[22]); x8[23] = _mm_add_epi32(x7[23], x7[22]); x8[24] = _mm_add_epi32(x7[24], x7[25]); x8[25] = _mm_sub_epi32(x7[24], x7[25]); x8[26] = _mm_sub_epi32(x7[27], x7[26]); x8[27] = _mm_add_epi32(x7[27], x7[26]); x8[28] = _mm_add_epi32(x7[28], x7[29]); x8[29] = _mm_sub_epi32(x7[28], x7[29]); x8[30] = _mm_sub_epi32(x7[31], x7[30]); x8[31] = _mm_add_epi32(x7[31], x7[30]); x8[32] = x7[32]; btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], __rounding, cos_bit); 
x8[35] = x7[35]; x8[36] = x7[36]; btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], __rounding, cos_bit); x8[39] = x7[39]; x8[40] = x7[40]; btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], __rounding, cos_bit); x8[43] = x7[43]; x8[44] = x7[44]; btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], __rounding, cos_bit); btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], __rounding, cos_bit); x8[47] = x7[47]; x8[48] = x7[48]; x8[51] = x7[51]; x8[52] = x7[52]; x8[55] = x7[55]; x8[56] = x7[56]; x8[59] = x7[59]; x8[60] = x7[60]; x8[63] = x7[63]; // stage 9 __m128i x9[64]; x9[0] = x8[0]; x9[1] = x8[1]; x9[2] = x8[2]; x9[3] = x8[3]; x9[4] = x8[4]; x9[5] = x8[5]; x9[6] = x8[6]; x9[7] = x8[7]; x9[8] = x8[8]; x9[9] = x8[9]; x9[10] = x8[10]; x9[11] = x8[11]; x9[12] = x8[12]; x9[13] = x8[13]; x9[14] = x8[14]; x9[15] = x8[15]; btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], __rounding, cos_bit); x9[32] = _mm_add_epi32(x8[32], x8[33]); x9[33] = _mm_sub_epi32(x8[32], x8[33]); x9[34] = _mm_sub_epi32(x8[35], x8[34]); x9[35] = _mm_add_epi32(x8[35], x8[34]); x9[36] = _mm_add_epi32(x8[36], x8[37]); x9[37] = _mm_sub_epi32(x8[36], x8[37]); x9[38] = _mm_sub_epi32(x8[39], x8[38]); x9[39] = _mm_add_epi32(x8[39], x8[38]); x9[40] = _mm_add_epi32(x8[40], x8[41]); x9[41] = _mm_sub_epi32(x8[40], x8[41]); x9[42] = _mm_sub_epi32(x8[43], x8[42]); x9[43] = _mm_add_epi32(x8[43], x8[42]); x9[44] = _mm_add_epi32(x8[44], x8[45]); x9[45] = _mm_sub_epi32(x8[44], x8[45]); x9[46] = _mm_sub_epi32(x8[47], x8[46]); x9[47] = _mm_add_epi32(x8[47], x8[46]); x9[48] = _mm_add_epi32(x8[48], x8[49]); x9[49] = _mm_sub_epi32(x8[48], x8[49]); x9[50] = _mm_sub_epi32(x8[51], x8[50]); x9[51] = _mm_add_epi32(x8[51], x8[50]); x9[52] = _mm_add_epi32(x8[52], x8[53]); x9[53] = _mm_sub_epi32(x8[52], x8[53]); x9[54] = _mm_sub_epi32(x8[55], x8[54]); x9[55] = _mm_add_epi32(x8[55], x8[54]); x9[56] = _mm_add_epi32(x8[56], x8[57]); x9[57] = _mm_sub_epi32(x8[56], x8[57]); x9[58] = _mm_sub_epi32(x8[59], x8[58]); x9[59] = _mm_add_epi32(x8[59], x8[58]); x9[60] = _mm_add_epi32(x8[60], x8[61]); x9[61] = _mm_sub_epi32(x8[60], x8[61]); x9[62] = _mm_sub_epi32(x8[63], x8[62]); x9[63] = _mm_add_epi32(x8[63], x8[62]); // stage 10 __m128i x10[64]; x10[0] = x9[0]; x10[1] = x9[1]; x10[2] = x9[2]; x10[3] = x9[3]; x10[4] = x9[4]; x10[5] = x9[5]; x10[6] = x9[6]; x10[7] = x9[7]; x10[8] = x9[8]; x10[9] = x9[9]; x10[10] = x9[10]; x10[11] = x9[11]; x10[12] = x9[12]; x10[13] = x9[13]; x10[14] = x9[14]; x10[15] = x9[15]; x10[16] = x9[16]; x10[17] = x9[17]; x10[18] 
= x9[18]; x10[19] = x9[19]; x10[20] = x9[20]; x10[21] = x9[21]; x10[22] = x9[22]; x10[23] = x9[23]; x10[24] = x9[24]; x10[25] = x9[25]; x10[26] = x9[26]; x10[27] = x9[27]; x10[28] = x9[28]; x10[29] = x9[29]; x10[30] = x9[30]; x10[31] = x9[31]; btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], x10[63], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], x10[62], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], x10[61], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], x10[60], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], x10[59], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], x10[58], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], x10[57], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], x10[56], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], x10[55], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], x10[54], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], x10[53], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], x10[52], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], x10[51], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], x10[50], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], x10[49], __rounding, cos_bit); btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], x10[48], __rounding, cos_bit); startidx = 0 * outstride; endidx = 63 * outstride; // stage 11 output[startidx] = x10[0]; output[endidx] = x10[63]; startidx += outstride; endidx -= outstride; output[startidx] = x10[32]; output[endidx] = x10[31]; startidx += outstride; endidx -= outstride; output[startidx] = x10[16]; output[endidx] = x10[47]; startidx += outstride; endidx -= outstride; output[startidx] = x10[48]; output[endidx] = x10[15]; startidx += outstride; endidx -= outstride; output[startidx] = x10[8]; output[endidx] = x10[55]; startidx += outstride; endidx -= outstride; output[startidx] = x10[40]; output[endidx] = x10[23]; startidx += outstride; endidx -= outstride; output[startidx] = x10[24]; output[endidx] = x10[39]; startidx += outstride; endidx -= outstride; output[startidx] = x10[56]; output[endidx] = x10[7]; startidx += outstride; endidx -= outstride; output[startidx] = x10[4]; output[endidx] = x10[59]; startidx += outstride; endidx -= outstride; output[startidx] = x10[36]; output[endidx] = x10[27]; startidx += outstride; endidx -= outstride; output[startidx] = x10[20]; output[endidx] = x10[43]; startidx += outstride; endidx -= outstride; output[startidx] = x10[52]; output[endidx] = x10[11]; startidx += outstride; endidx -= outstride; output[startidx] = x10[12]; output[endidx] = x10[51]; startidx += outstride; endidx -= outstride; output[startidx] = x10[44]; output[endidx] = x10[19]; startidx += outstride; endidx -= outstride; output[startidx] = x10[28]; output[endidx] = x10[35]; startidx += outstride; endidx -= outstride; output[startidx] = x10[60]; output[endidx] = x10[3]; startidx += outstride; endidx -= outstride; output[startidx] = x10[2]; output[endidx] = x10[61]; 
startidx += outstride; endidx -= outstride; output[startidx] = x10[34]; output[endidx] = x10[29]; startidx += outstride; endidx -= outstride; output[startidx] = x10[18]; output[endidx] = x10[45]; startidx += outstride; endidx -= outstride; output[startidx] = x10[50]; output[endidx] = x10[13]; startidx += outstride; endidx -= outstride; output[startidx] = x10[10]; output[endidx] = x10[53]; startidx += outstride; endidx -= outstride; output[startidx] = x10[42]; output[endidx] = x10[21]; startidx += outstride; endidx -= outstride; output[startidx] = x10[26]; output[endidx] = x10[37]; startidx += outstride; endidx -= outstride; output[startidx] = x10[58]; output[endidx] = x10[5]; startidx += outstride; endidx -= outstride; output[startidx] = x10[6]; output[endidx] = x10[57]; startidx += outstride; endidx -= outstride; output[startidx] = x10[38]; output[endidx] = x10[25]; startidx += outstride; endidx -= outstride; output[startidx] = x10[22]; output[endidx] = x10[41]; startidx += outstride; endidx -= outstride; output[startidx] = x10[54]; output[endidx] = x10[9]; startidx += outstride; endidx -= outstride; output[startidx] = x10[14]; output[endidx] = x10[49]; startidx += outstride; endidx -= outstride; output[startidx] = x10[46]; output[endidx] = x10[17]; startidx += outstride; endidx -= outstride; output[startidx] = x10[30]; output[endidx] = x10[33]; startidx += outstride; endidx -= outstride; output[startidx] = x10[62]; output[endidx] = x10[1]; } void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, const int col_num) { (void)cos_bit; for (int i = 0; i < 32; i++) { output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); } } aom-3.12.1/av1/encoder/x86/av1_fwd_txfm2d_avx2.c000066400000000000000000004052241477627663500210620ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "config/av1_rtcd.h" #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/encoder/x86/av1_fwd_txfm_avx2.h" #include "av1/common/x86/av1_txfm_sse2.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" #include "aom_dsp/x86/txfm_common_avx2.h" static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); // stage 1 __m256i x1[16]; btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); // stage 2 btf_16_adds_subs_avx2(&x1[0], &x1[7]); btf_16_adds_subs_avx2(&x1[1], &x1[6]); btf_16_adds_subs_avx2(&x1[2], &x1[5]); btf_16_adds_subs_avx2(&x1[3], &x1[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); // stage 3 btf_16_adds_subs_avx2(&x1[0], &x1[3]); btf_16_adds_subs_avx2(&x1[1], &x1[2]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[11]); btf_16_adds_subs_avx2(&x1[9], &x1[10]); btf_16_adds_subs_avx2(&x1[15], &x1[12]); btf_16_adds_subs_avx2(&x1[14], &x1[13]); // stage 4 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); // stage 5 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[9]); 
btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); // stage 6 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); // stage 7 output[0] = x1[0]; output[1] = x1[8]; output[2] = x1[4]; output[3] = x1[12]; output[4] = x1[2]; output[5] = x1[10]; output[6] = x1[6]; output[7] = x1[14]; output[8] = x1[1]; output[9] = x1[9]; output[10] = x1[5]; output[11] = x1[13]; output[12] = x1[3]; output[13] = x1[11]; output[14] = x1[7]; output[15] = x1[15]; } static inline void fdct16x32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); // stage 1 __m256i x1[32]; btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); btf_16_adds_subs_out_avx2(&x1[2], &x1[29], 
input[2], input[29]); btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); // stage 2 btf_16_adds_subs_avx2(&x1[0], &x1[15]); btf_16_adds_subs_avx2(&x1[1], &x1[14]); btf_16_adds_subs_avx2(&x1[2], &x1[13]); btf_16_adds_subs_avx2(&x1[3], &x1[12]); btf_16_adds_subs_avx2(&x1[4], &x1[11]); btf_16_adds_subs_avx2(&x1[5], &x1[10]); btf_16_adds_subs_avx2(&x1[6], &x1[9]); btf_16_adds_subs_avx2(&x1[7], &x1[8]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); // stage 3 btf_16_adds_subs_avx2(&x1[0], &x1[7]); btf_16_adds_subs_avx2(&x1[1], &x1[6]); btf_16_adds_subs_avx2(&x1[2], &x1[5]); btf_16_adds_subs_avx2(&x1[3], &x1[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[23]); btf_16_adds_subs_avx2(&x1[17], &x1[22]); btf_16_adds_subs_avx2(&x1[18], &x1[21]); btf_16_adds_subs_avx2(&x1[19], &x1[20]); btf_16_adds_subs_avx2(&x1[31], &x1[24]); btf_16_adds_subs_avx2(&x1[30], &x1[25]); btf_16_adds_subs_avx2(&x1[29], &x1[26]); btf_16_adds_subs_avx2(&x1[28], &x1[27]); // stage 4 btf_16_adds_subs_avx2(&x1[0], &x1[3]); btf_16_adds_subs_avx2(&x1[1], &x1[2]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[11]); btf_16_adds_subs_avx2(&x1[9], &x1[10]); btf_16_adds_subs_avx2(&x1[15], &x1[12]); btf_16_adds_subs_avx2(&x1[14], &x1[13]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); // stage 5 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[19]); btf_16_adds_subs_avx2(&x1[17], &x1[18]); btf_16_adds_subs_avx2(&x1[23], &x1[20]); btf_16_adds_subs_avx2(&x1[22], &x1[21]); btf_16_adds_subs_avx2(&x1[24], &x1[27]); btf_16_adds_subs_avx2(&x1[25], &x1[26]); btf_16_adds_subs_avx2(&x1[31], &x1[28]); btf_16_adds_subs_avx2(&x1[30], &x1[29]); // stage 6 
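// Stage 6 below: cosine-pair rotations on (x1[4],x1[7]) and (x1[5],x1[6]),
// add/sub butterflies on the adjacent pairs within x1[8..15], and the first
// rotations of the odd half at (x1[17],x1[30]), (x1[18],x1[29]),
// (x1[21],x1[26]) and (x1[22],x1[25]).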
btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[9]); btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); // stage 7 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[17]); btf_16_adds_subs_avx2(&x1[19], &x1[18]); btf_16_adds_subs_avx2(&x1[20], &x1[21]); btf_16_adds_subs_avx2(&x1[23], &x1[22]); btf_16_adds_subs_avx2(&x1[24], &x1[25]); btf_16_adds_subs_avx2(&x1[27], &x1[26]); btf_16_adds_subs_avx2(&x1[28], &x1[29]); btf_16_adds_subs_avx2(&x1[31], &x1[30]); // stage 8 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); // stage 9 output[0] = x1[0]; output[1] = x1[16]; output[2] = x1[8]; output[3] = x1[24]; output[4] = x1[4]; output[5] = x1[20]; output[6] = x1[12]; output[7] = x1[28]; output[8] = x1[2]; output[9] = x1[18]; output[10] = x1[10]; output[11] = x1[26]; output[12] = x1[6]; output[13] = x1[22]; output[14] = x1[14]; output[15] = x1[30]; output[16] = x1[1]; output[17] = x1[17]; output[18] = x1[9]; output[19] = x1[25]; output[20] = x1[5]; output[21] = x1[21]; output[22] = x1[13]; output[23] = x1[29]; output[24] = x1[3]; output[25] = x1[19]; output[26] = x1[11]; output[27] = x1[27]; output[28] = x1[7]; output[29] = x1[23]; output[30] = x1[15]; output[31] = x1[31]; } static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); __m256i cospi_m24_m40 = 
pair_set_w16_epi16(-cospi[24], -cospi[40]); __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], 
cospi[13]); __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); // stage 1 __m256i x1[64]; btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); // stage 2 btf_16_adds_subs_avx2(&x1[0], &x1[31]); btf_16_adds_subs_avx2(&x1[1], &x1[30]); btf_16_adds_subs_avx2(&x1[2], &x1[29]); btf_16_adds_subs_avx2(&x1[3], &x1[28]); btf_16_adds_subs_avx2(&x1[4], &x1[27]); btf_16_adds_subs_avx2(&x1[5], &x1[26]); btf_16_adds_subs_avx2(&x1[6], &x1[25]); btf_16_adds_subs_avx2(&x1[7], &x1[24]); btf_16_adds_subs_avx2(&x1[8], &x1[23]); btf_16_adds_subs_avx2(&x1[9], &x1[22]); btf_16_adds_subs_avx2(&x1[10], &x1[21]); btf_16_adds_subs_avx2(&x1[11], &x1[20]); btf_16_adds_subs_avx2(&x1[12], &x1[19]); btf_16_adds_subs_avx2(&x1[13], &x1[18]); btf_16_adds_subs_avx2(&x1[14], &x1[17]); btf_16_adds_subs_avx2(&x1[15], &x1[16]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, 
cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); // stage 3 btf_16_adds_subs_avx2(&x1[0], &x1[15]); btf_16_adds_subs_avx2(&x1[1], &x1[14]); btf_16_adds_subs_avx2(&x1[2], &x1[13]); btf_16_adds_subs_avx2(&x1[3], &x1[12]); btf_16_adds_subs_avx2(&x1[4], &x1[11]); btf_16_adds_subs_avx2(&x1[5], &x1[10]); btf_16_adds_subs_avx2(&x1[6], &x1[9]); btf_16_adds_subs_avx2(&x1[7], &x1[8]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); btf_16_adds_subs_avx2(&x1[32], &x1[47]); btf_16_adds_subs_avx2(&x1[33], &x1[46]); btf_16_adds_subs_avx2(&x1[34], &x1[45]); btf_16_adds_subs_avx2(&x1[35], &x1[44]); btf_16_adds_subs_avx2(&x1[36], &x1[43]); btf_16_adds_subs_avx2(&x1[37], &x1[42]); btf_16_adds_subs_avx2(&x1[38], &x1[41]); btf_16_adds_subs_avx2(&x1[39], &x1[40]); btf_16_adds_subs_avx2(&x1[63], &x1[48]); btf_16_adds_subs_avx2(&x1[62], &x1[49]); btf_16_adds_subs_avx2(&x1[61], &x1[50]); btf_16_adds_subs_avx2(&x1[60], &x1[51]); btf_16_adds_subs_avx2(&x1[59], &x1[52]); btf_16_adds_subs_avx2(&x1[58], &x1[53]); btf_16_adds_subs_avx2(&x1[57], &x1[54]); btf_16_adds_subs_avx2(&x1[56], &x1[55]); // stage 4 btf_16_adds_subs_avx2(&x1[0], &x1[7]); btf_16_adds_subs_avx2(&x1[1], &x1[6]); btf_16_adds_subs_avx2(&x1[2], &x1[5]); btf_16_adds_subs_avx2(&x1[3], &x1[4]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[23]); btf_16_adds_subs_avx2(&x1[17], &x1[22]); btf_16_adds_subs_avx2(&x1[18], &x1[21]); btf_16_adds_subs_avx2(&x1[19], &x1[20]); btf_16_adds_subs_avx2(&x1[31], &x1[24]); btf_16_adds_subs_avx2(&x1[30], &x1[25]); btf_16_adds_subs_avx2(&x1[29], &x1[26]); btf_16_adds_subs_avx2(&x1[28], &x1[27]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); // stage 5 btf_16_adds_subs_avx2(&x1[0], &x1[3]); btf_16_adds_subs_avx2(&x1[1], &x1[2]); btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[11]); btf_16_adds_subs_avx2(&x1[9], &x1[10]); btf_16_adds_subs_avx2(&x1[15], &x1[12]); btf_16_adds_subs_avx2(&x1[14], &x1[13]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], 
&x1[26], _r, cos_bit); btf_16_adds_subs_avx2(&x1[32], &x1[39]); btf_16_adds_subs_avx2(&x1[33], &x1[38]); btf_16_adds_subs_avx2(&x1[34], &x1[37]); btf_16_adds_subs_avx2(&x1[35], &x1[36]); btf_16_adds_subs_avx2(&x1[47], &x1[40]); btf_16_adds_subs_avx2(&x1[46], &x1[41]); btf_16_adds_subs_avx2(&x1[45], &x1[42]); btf_16_adds_subs_avx2(&x1[44], &x1[43]); btf_16_adds_subs_avx2(&x1[48], &x1[55]); btf_16_adds_subs_avx2(&x1[49], &x1[54]); btf_16_adds_subs_avx2(&x1[50], &x1[53]); btf_16_adds_subs_avx2(&x1[51], &x1[52]); btf_16_adds_subs_avx2(&x1[63], &x1[56]); btf_16_adds_subs_avx2(&x1[62], &x1[57]); btf_16_adds_subs_avx2(&x1[61], &x1[58]); btf_16_adds_subs_avx2(&x1[60], &x1[59]); // stage 6 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); btf_16_adds_subs_avx2(&x1[4], &x1[5]); btf_16_adds_subs_avx2(&x1[7], &x1[6]); btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[19]); btf_16_adds_subs_avx2(&x1[17], &x1[18]); btf_16_adds_subs_avx2(&x1[23], &x1[20]); btf_16_adds_subs_avx2(&x1[22], &x1[21]); btf_16_adds_subs_avx2(&x1[24], &x1[27]); btf_16_adds_subs_avx2(&x1[25], &x1[26]); btf_16_adds_subs_avx2(&x1[31], &x1[28]); btf_16_adds_subs_avx2(&x1[30], &x1[29]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); // stage 7 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); btf_16_adds_subs_avx2(&x1[8], &x1[9]); btf_16_adds_subs_avx2(&x1[11], &x1[10]); btf_16_adds_subs_avx2(&x1[12], &x1[13]); btf_16_adds_subs_avx2(&x1[15], &x1[14]); btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); btf_16_adds_subs_avx2(&x1[32], &x1[35]); btf_16_adds_subs_avx2(&x1[33], &x1[34]); btf_16_adds_subs_avx2(&x1[39], &x1[36]); btf_16_adds_subs_avx2(&x1[38], &x1[37]); btf_16_adds_subs_avx2(&x1[40], &x1[43]); btf_16_adds_subs_avx2(&x1[41], &x1[42]); btf_16_adds_subs_avx2(&x1[47], &x1[44]); btf_16_adds_subs_avx2(&x1[46], &x1[45]); btf_16_adds_subs_avx2(&x1[48], &x1[51]); btf_16_adds_subs_avx2(&x1[49], &x1[50]); btf_16_adds_subs_avx2(&x1[55], &x1[52]); btf_16_adds_subs_avx2(&x1[54], &x1[53]); btf_16_adds_subs_avx2(&x1[56], &x1[59]); btf_16_adds_subs_avx2(&x1[57], &x1[58]); btf_16_adds_subs_avx2(&x1[63], &x1[60]); btf_16_adds_subs_avx2(&x1[62], &x1[61]); // stage 8 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, 
cos_bit); btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); btf_16_adds_subs_avx2(&x1[16], &x1[17]); btf_16_adds_subs_avx2(&x1[19], &x1[18]); btf_16_adds_subs_avx2(&x1[20], &x1[21]); btf_16_adds_subs_avx2(&x1[23], &x1[22]); btf_16_adds_subs_avx2(&x1[24], &x1[25]); btf_16_adds_subs_avx2(&x1[27], &x1[26]); btf_16_adds_subs_avx2(&x1[28], &x1[29]); btf_16_adds_subs_avx2(&x1[31], &x1[30]); btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); // stage 9 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); btf_16_adds_subs_avx2(&x1[32], &x1[33]); btf_16_adds_subs_avx2(&x1[35], &x1[34]); btf_16_adds_subs_avx2(&x1[36], &x1[37]); btf_16_adds_subs_avx2(&x1[39], &x1[38]); btf_16_adds_subs_avx2(&x1[40], &x1[41]); btf_16_adds_subs_avx2(&x1[43], &x1[42]); btf_16_adds_subs_avx2(&x1[44], &x1[45]); btf_16_adds_subs_avx2(&x1[47], &x1[46]); btf_16_adds_subs_avx2(&x1[48], &x1[49]); btf_16_adds_subs_avx2(&x1[51], &x1[50]); btf_16_adds_subs_avx2(&x1[52], &x1[53]); btf_16_adds_subs_avx2(&x1[55], &x1[54]); btf_16_adds_subs_avx2(&x1[56], &x1[57]); btf_16_adds_subs_avx2(&x1[59], &x1[58]); btf_16_adds_subs_avx2(&x1[60], &x1[61]); btf_16_adds_subs_avx2(&x1[63], &x1[62]); // stage 10 btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); btf_16_w16_avx2(cospi_p03_p61, 
cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); // stage 11 output[0] = x1[0]; output[1] = x1[32]; output[2] = x1[16]; output[3] = x1[48]; output[4] = x1[8]; output[5] = x1[40]; output[6] = x1[24]; output[7] = x1[56]; output[8] = x1[4]; output[9] = x1[36]; output[10] = x1[20]; output[11] = x1[52]; output[12] = x1[12]; output[13] = x1[44]; output[14] = x1[28]; output[15] = x1[60]; output[16] = x1[2]; output[17] = x1[34]; output[18] = x1[18]; output[19] = x1[50]; output[20] = x1[10]; output[21] = x1[42]; output[22] = x1[26]; output[23] = x1[58]; output[24] = x1[6]; output[25] = x1[38]; output[26] = x1[22]; output[27] = x1[54]; output[28] = x1[14]; output[29] = x1[46]; output[30] = x1[30]; output[31] = x1[62]; output[32] = x1[1]; output[33] = x1[33]; output[34] = x1[17]; output[35] = x1[49]; output[36] = x1[9]; output[37] = x1[41]; output[38] = x1[25]; output[39] = x1[57]; output[40] = x1[5]; output[41] = x1[37]; output[42] = x1[21]; output[43] = x1[53]; output[44] = x1[13]; output[45] = x1[45]; output[46] = x1[29]; output[47] = x1[61]; output[48] = x1[3]; output[49] = x1[35]; output[50] = x1[19]; output[51] = x1[51]; output[52] = x1[11]; output[53] = x1[43]; output[54] = x1[27]; output[55] = x1[59]; output[56] = x1[7]; output[57] = x1[39]; output[58] = x1[23]; output[59] = x1[55]; output[60] = x1[15]; output[61] = x1[47]; output[62] = x1[31]; output[63] = x1[63]; } static inline void fdct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { __m256i x1[32]; const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); // stage 0 // stage 1 btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); // stage 2 btf_32_add_sub_avx2(&x1[0], &x1[15]); btf_32_add_sub_avx2(&x1[1], &x1[14]); btf_32_add_sub_avx2(&x1[2], &x1[13]); btf_32_add_sub_avx2(&x1[3], &x1[12]); btf_32_add_sub_avx2(&x1[4], &x1[11]); btf_32_add_sub_avx2(&x1[5], &x1[10]); btf_32_add_sub_avx2(&x1[6], &x1[9]); btf_32_add_sub_avx2(&x1[7], &x1[8]); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); // stage 3 btf_32_add_sub_avx2(&x1[0], &x1[7]); btf_32_add_sub_avx2(&x1[1], &x1[6]); btf_32_add_sub_avx2(&x1[2], &x1[5]); btf_32_add_sub_avx2(&x1[3], &x1[4]); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); 
btf_32_add_sub_avx2(&x1[16], &x1[23]); btf_32_add_sub_avx2(&x1[17], &x1[22]); btf_32_add_sub_avx2(&x1[18], &x1[21]); btf_32_add_sub_avx2(&x1[19], &x1[20]); btf_32_add_sub_avx2(&x1[31], &x1[24]); btf_32_add_sub_avx2(&x1[30], &x1[25]); btf_32_add_sub_avx2(&x1[29], &x1[26]); btf_32_add_sub_avx2(&x1[28], &x1[27]); // stage 4 btf_32_add_sub_avx2(&x1[0], &x1[3]); btf_32_add_sub_avx2(&x1[1], &x1[2]); btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); btf_32_add_sub_avx2(&x1[8], &x1[11]); btf_32_add_sub_avx2(&x1[9], &x1[10]); btf_32_add_sub_avx2(&x1[15], &x1[12]); btf_32_add_sub_avx2(&x1[14], &x1[13]); btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); // stage 5 btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); btf_32_add_sub_avx2(&x1[4], &x1[5]); btf_32_add_sub_avx2(&x1[7], &x1[6]); btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); btf_32_add_sub_avx2(&x1[16], &x1[19]); btf_32_add_sub_avx2(&x1[17], &x1[18]); btf_32_add_sub_avx2(&x1[23], &x1[20]); btf_32_add_sub_avx2(&x1[22], &x1[21]); btf_32_add_sub_avx2(&x1[24], &x1[27]); btf_32_add_sub_avx2(&x1[25], &x1[26]); btf_32_add_sub_avx2(&x1[31], &x1[28]); btf_32_add_sub_avx2(&x1[30], &x1[29]); // stage 6 btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); btf_32_add_sub_avx2(&x1[8], &x1[9]); btf_32_add_sub_avx2(&x1[11], &x1[10]); btf_32_add_sub_avx2(&x1[12], &x1[13]); btf_32_add_sub_avx2(&x1[15], &x1[14]); btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); // stage 7 btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); btf_32_add_sub_avx2(&x1[16], &x1[17]); btf_32_add_sub_avx2(&x1[19], &x1[18]); btf_32_add_sub_avx2(&x1[20], &x1[21]); btf_32_add_sub_avx2(&x1[23], &x1[22]); btf_32_add_sub_avx2(&x1[24], &x1[25]); btf_32_add_sub_avx2(&x1[27], &x1[26]); btf_32_add_sub_avx2(&x1[28], &x1[29]); btf_32_add_sub_avx2(&x1[31], &x1[30]); // stage 8 btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); // stage 9 output[0] = x1[0]; output[1] = x1[16]; output[2] = x1[8]; output[3] = x1[24]; output[4] = x1[4]; output[5] = x1[20]; output[6] = x1[12]; output[7] = 
x1[28]; output[8] = x1[2]; output[9] = x1[18]; output[10] = x1[10]; output[11] = x1[26]; output[12] = x1[6]; output[13] = x1[22]; output[14] = x1[14]; output[15] = x1[30]; output[16] = x1[1]; output[17] = x1[17]; output[18] = x1[9]; output[19] = x1[25]; output[20] = x1[5]; output[21] = x1[21]; output[22] = x1[13]; output[23] = x1[29]; output[24] = x1[3]; output[25] = x1[19]; output[26] = x1[11]; output[27] = x1[27]; output[28] = x1[7]; output[29] = x1[23]; output[30] = x1[15]; output[31] = x1[31]; } static inline void fdct64_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); 
__m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); // stage 1 __m256i x1[64]; btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); // stage 2 btf_32_add_sub_avx2(&x1[0], &x1[31]); btf_32_add_sub_avx2(&x1[1], &x1[30]); btf_32_add_sub_avx2(&x1[2], &x1[29]); btf_32_add_sub_avx2(&x1[3], &x1[28]); btf_32_add_sub_avx2(&x1[4], &x1[27]); btf_32_add_sub_avx2(&x1[5], &x1[26]); btf_32_add_sub_avx2(&x1[6], &x1[25]); btf_32_add_sub_avx2(&x1[7], &x1[24]); btf_32_add_sub_avx2(&x1[8], &x1[23]); btf_32_add_sub_avx2(&x1[9], &x1[22]); btf_32_add_sub_avx2(&x1[10], 
&x1[21]); btf_32_add_sub_avx2(&x1[11], &x1[20]); btf_32_add_sub_avx2(&x1[12], &x1[19]); btf_32_add_sub_avx2(&x1[13], &x1[18]); btf_32_add_sub_avx2(&x1[14], &x1[17]); btf_32_add_sub_avx2(&x1[15], &x1[16]); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); // stage 3 btf_32_add_sub_avx2(&x1[0], &x1[15]); btf_32_add_sub_avx2(&x1[1], &x1[14]); btf_32_add_sub_avx2(&x1[2], &x1[13]); btf_32_add_sub_avx2(&x1[3], &x1[12]); btf_32_add_sub_avx2(&x1[4], &x1[11]); btf_32_add_sub_avx2(&x1[5], &x1[10]); btf_32_add_sub_avx2(&x1[6], &x1[9]); btf_32_add_sub_avx2(&x1[7], &x1[8]); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); btf_32_add_sub_avx2(&x1[32], &x1[47]); btf_32_add_sub_avx2(&x1[33], &x1[46]); btf_32_add_sub_avx2(&x1[34], &x1[45]); btf_32_add_sub_avx2(&x1[35], &x1[44]); btf_32_add_sub_avx2(&x1[36], &x1[43]); btf_32_add_sub_avx2(&x1[37], &x1[42]); btf_32_add_sub_avx2(&x1[38], &x1[41]); btf_32_add_sub_avx2(&x1[39], &x1[40]); btf_32_add_sub_avx2(&x1[63], &x1[48]); btf_32_add_sub_avx2(&x1[62], &x1[49]); btf_32_add_sub_avx2(&x1[61], &x1[50]); btf_32_add_sub_avx2(&x1[60], &x1[51]); btf_32_add_sub_avx2(&x1[59], &x1[52]); btf_32_add_sub_avx2(&x1[58], &x1[53]); btf_32_add_sub_avx2(&x1[57], &x1[54]); btf_32_add_sub_avx2(&x1[56], &x1[55]); // stage 4 btf_32_add_sub_avx2(&x1[0], &x1[7]); btf_32_add_sub_avx2(&x1[1], &x1[6]); btf_32_add_sub_avx2(&x1[2], &x1[5]); btf_32_add_sub_avx2(&x1[3], &x1[4]); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); btf_32_add_sub_avx2(&x1[16], &x1[23]); btf_32_add_sub_avx2(&x1[17], &x1[22]); btf_32_add_sub_avx2(&x1[18], &x1[21]); btf_32_add_sub_avx2(&x1[19], &x1[20]); btf_32_add_sub_avx2(&x1[31], &x1[24]); btf_32_add_sub_avx2(&x1[30], &x1[25]); btf_32_add_sub_avx2(&x1[29], &x1[26]); btf_32_add_sub_avx2(&x1[28], &x1[27]); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); // stage 5 btf_32_add_sub_avx2(&x1[0], &x1[3]); btf_32_add_sub_avx2(&x1[1], &x1[2]); btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); btf_32_add_sub_avx2(&x1[8], &x1[11]); btf_32_add_sub_avx2(&x1[9], &x1[10]); btf_32_add_sub_avx2(&x1[15], 
&x1[12]); btf_32_add_sub_avx2(&x1[14], &x1[13]); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); btf_32_add_sub_avx2(&x1[32], &x1[39]); btf_32_add_sub_avx2(&x1[33], &x1[38]); btf_32_add_sub_avx2(&x1[34], &x1[37]); btf_32_add_sub_avx2(&x1[35], &x1[36]); btf_32_add_sub_avx2(&x1[47], &x1[40]); btf_32_add_sub_avx2(&x1[46], &x1[41]); btf_32_add_sub_avx2(&x1[45], &x1[42]); btf_32_add_sub_avx2(&x1[44], &x1[43]); btf_32_add_sub_avx2(&x1[48], &x1[55]); btf_32_add_sub_avx2(&x1[49], &x1[54]); btf_32_add_sub_avx2(&x1[50], &x1[53]); btf_32_add_sub_avx2(&x1[51], &x1[52]); btf_32_add_sub_avx2(&x1[63], &x1[56]); btf_32_add_sub_avx2(&x1[62], &x1[57]); btf_32_add_sub_avx2(&x1[61], &x1[58]); btf_32_add_sub_avx2(&x1[60], &x1[59]); // stage 6 btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); btf_32_add_sub_avx2(&x1[4], &x1[5]); btf_32_add_sub_avx2(&x1[7], &x1[6]); btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); btf_32_add_sub_avx2(&x1[16], &x1[19]); btf_32_add_sub_avx2(&x1[17], &x1[18]); btf_32_add_sub_avx2(&x1[23], &x1[20]); btf_32_add_sub_avx2(&x1[22], &x1[21]); btf_32_add_sub_avx2(&x1[24], &x1[27]); btf_32_add_sub_avx2(&x1[25], &x1[26]); btf_32_add_sub_avx2(&x1[31], &x1[28]); btf_32_add_sub_avx2(&x1[30], &x1[29]); btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); // stage 7 btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); btf_32_add_sub_avx2(&x1[8], &x1[9]); btf_32_add_sub_avx2(&x1[11], &x1[10]); btf_32_add_sub_avx2(&x1[12], &x1[13]); btf_32_add_sub_avx2(&x1[15], &x1[14]); btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); btf_32_add_sub_avx2(&x1[32], &x1[35]); btf_32_add_sub_avx2(&x1[33], &x1[34]); btf_32_add_sub_avx2(&x1[39], &x1[36]); btf_32_add_sub_avx2(&x1[38], &x1[37]); btf_32_add_sub_avx2(&x1[40], &x1[43]); btf_32_add_sub_avx2(&x1[41], &x1[42]); btf_32_add_sub_avx2(&x1[47], &x1[44]); btf_32_add_sub_avx2(&x1[46], &x1[45]); btf_32_add_sub_avx2(&x1[48], &x1[51]); btf_32_add_sub_avx2(&x1[49], &x1[50]); btf_32_add_sub_avx2(&x1[55], &x1[52]); btf_32_add_sub_avx2(&x1[54], &x1[53]); btf_32_add_sub_avx2(&x1[56], &x1[59]); btf_32_add_sub_avx2(&x1[57], &x1[58]); btf_32_add_sub_avx2(&x1[63], &x1[60]); btf_32_add_sub_avx2(&x1[62], &x1[61]); // stage 8 
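// Stages 8-10 apply the remaining cospi rotations: x1[8..15] take their final
// values in stage 8, x1[16..31] in stage 9, and x1[32..63] in stage 10. Each
// btf_32_avx2_type{0,1}_new(w0, w1, &a, &b, _r, cos_bit) is a 2x2 butterfly of
// (a, b) with weights w0/w1, rounded by _r and shifted right by cos_bit.
// Stage 11 then emits the 64 results in bit-reversed index order.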
btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); btf_32_add_sub_avx2(&x1[16], &x1[17]); btf_32_add_sub_avx2(&x1[19], &x1[18]); btf_32_add_sub_avx2(&x1[20], &x1[21]); btf_32_add_sub_avx2(&x1[23], &x1[22]); btf_32_add_sub_avx2(&x1[24], &x1[25]); btf_32_add_sub_avx2(&x1[27], &x1[26]); btf_32_add_sub_avx2(&x1[28], &x1[29]); btf_32_add_sub_avx2(&x1[31], &x1[30]); btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); // stage 9 btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); btf_32_add_sub_avx2(&x1[32], &x1[33]); btf_32_add_sub_avx2(&x1[35], &x1[34]); btf_32_add_sub_avx2(&x1[36], &x1[37]); btf_32_add_sub_avx2(&x1[39], &x1[38]); btf_32_add_sub_avx2(&x1[40], &x1[41]); btf_32_add_sub_avx2(&x1[43], &x1[42]); btf_32_add_sub_avx2(&x1[44], &x1[45]); btf_32_add_sub_avx2(&x1[47], &x1[46]); btf_32_add_sub_avx2(&x1[48], &x1[49]); btf_32_add_sub_avx2(&x1[51], &x1[50]); btf_32_add_sub_avx2(&x1[52], &x1[53]); btf_32_add_sub_avx2(&x1[55], &x1[54]); btf_32_add_sub_avx2(&x1[56], &x1[57]); btf_32_add_sub_avx2(&x1[59], &x1[58]); btf_32_add_sub_avx2(&x1[60], &x1[61]); btf_32_add_sub_avx2(&x1[63], &x1[62]); // stage 10 btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); 
btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); // stage 11 output[0] = x1[0]; output[1] = x1[32]; output[2] = x1[16]; output[3] = x1[48]; output[4] = x1[8]; output[5] = x1[40]; output[6] = x1[24]; output[7] = x1[56]; output[8] = x1[4]; output[9] = x1[36]; output[10] = x1[20]; output[11] = x1[52]; output[12] = x1[12]; output[13] = x1[44]; output[14] = x1[28]; output[15] = x1[60]; output[16] = x1[2]; output[17] = x1[34]; output[18] = x1[18]; output[19] = x1[50]; output[20] = x1[10]; output[21] = x1[42]; output[22] = x1[26]; output[23] = x1[58]; output[24] = x1[6]; output[25] = x1[38]; output[26] = x1[22]; output[27] = x1[54]; output[28] = x1[14]; output[29] = x1[46]; output[30] = x1[30]; output[31] = x1[62]; output[32] = x1[1]; output[33] = x1[33]; output[34] = x1[17]; output[35] = x1[49]; output[36] = x1[9]; output[37] = x1[41]; output[38] = x1[25]; output[39] = x1[57]; output[40] = x1[5]; output[41] = x1[37]; output[42] = x1[21]; output[43] = x1[53]; output[44] = x1[13]; output[45] = x1[45]; output[46] = x1[29]; output[47] = x1[61]; output[48] = x1[3]; output[49] = x1[35]; output[50] = x1[19]; output[51] = x1[51]; output[52] = x1[11]; output[53] = x1[43]; output[54] = x1[27]; output[55] = x1[59]; output[56] = x1[7]; output[57] = x1[39]; output[58] = x1[23]; output[59] = x1[55]; output[60] = x1[15]; output[61] = x1[47]; output[62] = x1[31]; output[63] = x1[63]; } static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); // stage 1 __m256i x1[16]; 
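// Stage 1 of the 16-point ADST permutes the input rows and negates half of
// them; the negations are computed as (0 - x) with saturating 16-bit
// subtraction.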
x1[0] = input[0]; x1[1] = _mm256_subs_epi16(__zero, input[15]); x1[2] = _mm256_subs_epi16(__zero, input[7]); x1[3] = input[8]; x1[4] = _mm256_subs_epi16(__zero, input[3]); x1[5] = input[12]; x1[6] = input[4]; x1[7] = _mm256_subs_epi16(__zero, input[11]); x1[8] = _mm256_subs_epi16(__zero, input[1]); x1[9] = input[14]; x1[10] = input[6]; x1[11] = _mm256_subs_epi16(__zero, input[9]); x1[12] = input[2]; x1[13] = _mm256_subs_epi16(__zero, input[13]); x1[14] = _mm256_subs_epi16(__zero, input[5]); x1[15] = input[10]; // stage 2 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); // stage 3 btf_16_adds_subs_avx2(&x1[0], &x1[2]); btf_16_adds_subs_avx2(&x1[1], &x1[3]); btf_16_adds_subs_avx2(&x1[4], &x1[6]); btf_16_adds_subs_avx2(&x1[5], &x1[7]); btf_16_adds_subs_avx2(&x1[8], &x1[10]); btf_16_adds_subs_avx2(&x1[9], &x1[11]); btf_16_adds_subs_avx2(&x1[12], &x1[14]); btf_16_adds_subs_avx2(&x1[13], &x1[15]); // stage 4 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); // stage 5 btf_16_adds_subs_avx2(&x1[0], &x1[4]); btf_16_adds_subs_avx2(&x1[1], &x1[5]); btf_16_adds_subs_avx2(&x1[2], &x1[6]); btf_16_adds_subs_avx2(&x1[3], &x1[7]); btf_16_adds_subs_avx2(&x1[8], &x1[12]); btf_16_adds_subs_avx2(&x1[9], &x1[13]); btf_16_adds_subs_avx2(&x1[10], &x1[14]); btf_16_adds_subs_avx2(&x1[11], &x1[15]); // stage 6 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); // stage 7 btf_16_adds_subs_avx2(&x1[0], &x1[8]); btf_16_adds_subs_avx2(&x1[1], &x1[9]); btf_16_adds_subs_avx2(&x1[2], &x1[10]); btf_16_adds_subs_avx2(&x1[3], &x1[11]); btf_16_adds_subs_avx2(&x1[4], &x1[12]); btf_16_adds_subs_avx2(&x1[5], &x1[13]); btf_16_adds_subs_avx2(&x1[6], &x1[14]); btf_16_adds_subs_avx2(&x1[7], &x1[15]); // stage 8 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); // stage 9 output[0] = x1[1]; output[1] = x1[14]; output[2] = x1[3]; output[3] = x1[12]; output[4] = x1[5]; output[5] = x1[10]; output[6] = x1[7]; output[7] = x1[8]; output[8] = x1[9]; output[9] = x1[6]; output[10] = x1[11]; output[11] = x1[4]; output[12] = x1[13]; output[13] = x1[2]; output[14] = x1[15]; output[15] = x1[0]; } static inline void fidentity16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; const __m256i one = 
_mm256_set1_epi16(1); for (int i = 0; i < 16; ++i) { const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); output[i] = _mm256_packs_epi32(b_lo, b_hi); } } static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 32; ++i) { output[i] = _mm256_slli_epi16(input[i], 2); } } static inline void store_output_32bit_w16(int32_t *const out, const __m256i *const in1, const __m256i *const in2, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm256_store_si256((__m256i *)(out + stride * i), in1[i]); _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]); } } // Store 8 16 bit values. Sign extend the values. static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, int32_t *out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm256_store_si256((__m256i *)(out), _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); _mm256_store_si256( (__m256i *)(out + 8), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); out += stride; } } static inline void store_rect_16bit_to_32bit_avx2(const __m256i a, int32_t *const b) { const __m256i one = _mm256_set1_epi16(1); const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); _mm256_store_si256((__m256i *)b, b_lo); _mm256_store_si256((__m256i *)(b + 8), b_hi); } static inline void store_rect_buffer_16bit_to_32bit_w16_avx2( const __m256i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); } } typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, int8_t cos_bit); static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { fdct16x32_avx2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity16x32_avx2, // IDTX fdct16x32_avx2, // V_DCT fidentity16x32_avx2, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { fdct16x32_avx2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity16x32_avx2, // IDTX fidentity16x32_avx2, // V_DCT fdct16x32_avx2, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { fdct16x16_new_avx2, // DCT_DCT fadst16x16_new_avx2, // ADST_DCT fdct16x16_new_avx2, // DCT_ADST fadst16x16_new_avx2, // ADST_ADST fadst16x16_new_avx2, // FLIPADST_DCT fdct16x16_new_avx2, // DCT_FLIPADST fadst16x16_new_avx2, // FLIPADST_FLIPADST fadst16x16_new_avx2, // ADST_FLIPADST fadst16x16_new_avx2, // FLIPADST_ADST fidentity16x16_new_avx2, // IDTX fdct16x16_new_avx2, // V_DCT fidentity16x16_new_avx2, // H_DCT fadst16x16_new_avx2, // V_ADST fidentity16x16_new_avx2, // H_ADST fadst16x16_new_avx2, // V_FLIPADST 
fidentity16x16_new_avx2 // H_FLIPADST }; static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { fdct16x16_new_avx2, // DCT_DCT fdct16x16_new_avx2, // ADST_DCT fadst16x16_new_avx2, // DCT_ADST fadst16x16_new_avx2, // ADST_ADST fdct16x16_new_avx2, // FLIPADST_DCT fadst16x16_new_avx2, // DCT_FLIPADST fadst16x16_new_avx2, // FLIPADST_FLIPADST fadst16x16_new_avx2, // ADST_FLIPADST fadst16x16_new_avx2, // FLIPADST_ADST fidentity16x16_new_avx2, // IDTX fidentity16x16_new_avx2, // V_DCT fdct16x16_new_avx2, // H_DCT fidentity16x16_new_avx2, // V_ADST fadst16x16_new_avx2, // H_ADST fidentity16x16_new_avx2, // V_FLIPADST fadst16x16_new_avx2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { fdct8x8_new_sse2, // DCT_DCT fadst8x8_new_sse2, // ADST_DCT fdct8x8_new_sse2, // DCT_ADST fadst8x8_new_sse2, // ADST_ADST fadst8x8_new_sse2, // FLIPADST_DCT fdct8x8_new_sse2, // DCT_FLIPADST fadst8x8_new_sse2, // FLIPADST_FLIPADST fadst8x8_new_sse2, // ADST_FLIPADST fadst8x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX fdct8x8_new_sse2, // V_DCT fidentity8x8_new_sse2, // H_DCT fadst8x8_new_sse2, // V_ADST fidentity8x8_new_sse2, // H_ADST fadst8x8_new_sse2, // V_FLIPADST fidentity8x8_new_sse2, // H_FLIPADST }; static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { fdct8x8_new_sse2, // DCT_DCT fdct8x8_new_sse2, // ADST_DCT fadst8x8_new_sse2, // DCT_ADST fadst8x8_new_sse2, // ADST_ADST fdct8x8_new_sse2, // FLIPADST_DCT fadst8x8_new_sse2, // DCT_FLIPADST fadst8x8_new_sse2, // FLIPADST_FLIPADST fadst8x8_new_sse2, // ADST_FLIPADST fadst8x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX fidentity8x8_new_sse2, // V_DCT fdct8x8_new_sse2, // H_DCT fidentity8x8_new_sse2, // V_ADST fadst8x8_new_sse2, // H_ADST fidentity8x8_new_sse2, // V_FLIPADST fadst8x8_new_sse2 // H_FLIPADST }; static inline void load_buffer_and_round_shift(const int16_t *in, int stride, __m128i *out, int bit) { out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride)); out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride)); out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride)); out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride)); out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride)); out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride)); out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride)); out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride)); out[0] = _mm_slli_epi16(out[0], bit); out[1] = _mm_slli_epi16(out[1], bit); out[2] = _mm_slli_epi16(out[2], bit); out[3] = _mm_slli_epi16(out[3], bit); out[4] = _mm_slli_epi16(out[4], bit); out[5] = _mm_slli_epi16(out[5], bit); out[6] = _mm_slli_epi16(out[6], bit); out[7] = _mm_slli_epi16(out[7], bit); } static inline void load_buffer_and_flip_round_shift(const int16_t *in, int stride, __m128i *out, int bit) { out[7] = load_16bit_to_16bit(in + 0 * stride); out[6] = load_16bit_to_16bit(in + 1 * stride); out[5] = load_16bit_to_16bit(in + 2 * stride); out[4] = load_16bit_to_16bit(in + 3 * stride); out[3] = load_16bit_to_16bit(in + 4 * stride); out[2] = load_16bit_to_16bit(in + 5 * stride); out[1] = load_16bit_to_16bit(in + 6 * stride); out[0] = load_16bit_to_16bit(in + 7 * stride); out[7] = _mm_slli_epi16(out[7], bit); out[6] = _mm_slli_epi16(out[6], bit); out[5] = _mm_slli_epi16(out[5], bit); out[4] = _mm_slli_epi16(out[4], bit); out[3] = _mm_slli_epi16(out[3], bit); out[2] = _mm_slli_epi16(out[2], bit); out[1] = _mm_slli_epi16(out[1], bit); out[0] = _mm_slli_epi16(out[0], bit); } #define 
TRANSPOSE_8X8_AVX2() \ { \ /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \ /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \ /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \ /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \ const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \ const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \ const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \ const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \ /* Unpack 32 bit elements resulting in: */ \ /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \ /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \ /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \ /* bb2: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/ \ const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \ const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \ const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \ const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \ /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/ \ /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/ \ /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/ \ /* bb2: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/ \ c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \ c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \ c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \ c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \ } static inline void transpose_round_shift_flip_8x8(__m128i *const in, __m128i *const out, int bit) { __m256i c0, c1, c2, c3; bit = -bit; const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1)); const __m256i s04 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1); const __m256i s15 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1); const __m256i s26 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1); const __m256i s37 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1); const __m256i a0 = _mm256_adds_epi16(s04, rounding); const __m256i a1 = _mm256_adds_epi16(s15, rounding); const __m256i a2 = _mm256_adds_epi16(s26, rounding); const __m256i a3 = _mm256_adds_epi16(s37, rounding); // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47 // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57 // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67 // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77 const __m256i b0 = _mm256_srai_epi16(a0, bit); const __m256i b1 = _mm256_srai_epi16(a1, bit); const __m256i b2 = _mm256_srai_epi16(a2, bit); const __m256i b3 = _mm256_srai_epi16(a3, bit); TRANSPOSE_8X8_AVX2() // Unpack 64 bit elements resulting in: // out[7]: 00 10 20 30 40 50 60 70 // out[6]: 01 11 21 31 41 51 61 71 // out[5]: 02 12 22 32 42 52 62 72 // out[4]: 03 13 23 33 43 53 63 73 // out[3]: 04 14 24 34 44 54 64 74 // out[2]: 05 15 25 35 45 55 65 75 // out[1]: 06 16 26 36 46 56 66 76 // out[0]: 07 17 27 37 47 57 67 77 out[7] = _mm256_castsi256_si128(c0); out[6] = _mm256_extractf128_si256(c0, 1); out[5] = _mm256_castsi256_si128(c1); out[4] = _mm256_extractf128_si256(c1, 1); out[3] = _mm256_castsi256_si128(c2); out[2] = _mm256_extractf128_si256(c2, 1); out[1] = _mm256_castsi256_si128(c3); out[0] = _mm256_extractf128_si256(c3, 1); } static inline void transpose_round_shift_8x8(__m128i *const in, __m128i *const out, int bit) { __m256i c0, c1, c2, c3; bit = -bit; const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1)); const __m256i s04 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1); const __m256i s15 
= _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1); const __m256i s26 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1); const __m256i s37 = _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1); const __m256i a0 = _mm256_adds_epi16(s04, rounding); const __m256i a1 = _mm256_adds_epi16(s15, rounding); const __m256i a2 = _mm256_adds_epi16(s26, rounding); const __m256i a3 = _mm256_adds_epi16(s37, rounding); // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47 // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57 // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67 // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77 const __m256i b0 = _mm256_srai_epi16(a0, bit); const __m256i b1 = _mm256_srai_epi16(a1, bit); const __m256i b2 = _mm256_srai_epi16(a2, bit); const __m256i b3 = _mm256_srai_epi16(a3, bit); TRANSPOSE_8X8_AVX2() // Unpack 64 bit elements resulting in: // out[7]: 00 10 20 30 40 50 60 70 // out[6]: 01 11 21 31 41 51 61 71 // out[5]: 02 12 22 32 42 52 62 72 // out[4]: 03 13 23 33 43 53 63 73 // out[3]: 04 14 24 34 44 54 64 74 // out[2]: 05 15 25 35 45 55 65 75 // out[1]: 06 16 26 36 46 56 66 76 // out[0]: 07 17 27 37 47 57 67 77 out[0] = _mm256_castsi256_si128(c0); out[1] = _mm256_extractf128_si256(c0, 1); out[2] = _mm256_castsi256_si128(c1); out[3] = _mm256_extractf128_si256(c1, 1); out[4] = _mm256_castsi256_si128(c2); out[5] = _mm256_extractf128_si256(c2, 1); out[6] = _mm256_castsi256_si128(c3); out[7] = _mm256_extractf128_si256(c3, 1); } static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in, int32_t *const out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm256_store_si256((__m256i *)(out + i * stride), _mm256_cvtepi16_epi32(in[i])); } } static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[8], buf1[8], *buf; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; const int txw_idx = get_txw_idx(TX_8X8); const int txh_idx = get_txh_idx(TX_8X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // Condition to check shift bit is avoided while round shifting, by assuming // that shift[0] will always be positive. assert(shift[0] > 0); if (ud_flip) load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]); else load_buffer_and_round_shift(input, stride, buf0, shift[0]); col_txfm(buf0, buf0, cos_bit_col); // Condition to check shift bit is avoided while round shifting, by assuming // that shift[1] will always be negative. assert(shift[1] < 0); if (lr_flip) { transpose_round_shift_flip_8x8(buf0, buf1, shift[1]); } else { transpose_round_shift_8x8(buf0, buf1, shift[1]); } buf = buf1; row_txfm(buf, buf, cos_bit_row); // Round and shift operation is avoided here as the shift bit is assumed to be // zero always. 
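// Because the final shift is zero, the 16-bit row outputs are simply
// sign-extended to 32 bits and stored below, with no further rounding pass.
// (Assumption: the TX_8X8 shift list is { 2, -1, 0 }, i.e. a positive column
// pre-shift, a negative mid shift and a zero final shift, which is what the
// asserts in this function encode.)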
assert(shift[2] == 0); store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8); } static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const TX_SIZE tx_size = TX_16X16; __m256i buf0[16], buf1[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); const int32_t i = 0; if (ud_flip) { load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); } round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); __m256i *buf; if (lr_flip) { buf = buf0; flip_buf_avx2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); } static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const TX_SIZE tx_size = TX_32X32; __m256i buf0[32], buf1[128]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); } round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); } for (int i = 0; i < 2; i++) { __m256i *buf; if (lr_flip) { buf = buf0; flip_buf_avx2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); } } static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_64X64; __m256i buf0[64], buf1[256]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const 
int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; const int width_div16 = (width >> 4); const int height_div16 = (height >> 4); for (int i = 0; i < width_div16; i++) { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(2, height_div16); ++j) { transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); } } for (int i = 0; i < AOMMIN(2, height_div16); i++) { __m256i bufA[64]; __m256i bufB[64]; __m128i *buf = (__m128i *)(buf1 + width * i); for (int j = 0; j < width; ++j) { bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); } fdct64_new_avx2(bufA, bufA, cos_bit_row); fdct64_new_avx2(bufB, bufB, cos_bit_row); round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const TX_SIZE tx_size = TX_16X32; __m256i buf0[32], buf1[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); } round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); transpose_16bit_16x16_avx2(buf0, buf1); transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); for (int i = 0; i < 2; i++) { __m256i *buf; if (lr_flip) { buf = buf0; flip_buf_avx2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); } } static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m256i buf0[32], buf1[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; const int txw_idx = get_txw_idx(TX_32X16); const int txh_idx = get_txh_idx(TX_32X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 32; const int height = 16; const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); } round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * 
i); } __m256i *buf; if (lr_flip) { buf = buf0; flip_buf_avx2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width); } static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const TX_SIZE tx_size = TX_64X32; __m256i buf0[64], buf1[256]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; const int width_div16 = (width >> 4); const int height_div16 = (height >> 4); for (int i = 0; i < width_div16; i++) { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(4, height_div16); ++j) { transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); } } assert(tx_type == DCT_DCT); for (int i = 0; i < AOMMIN(2, height_div16); i++) { __m256i bufA[64]; __m256i bufB[64]; __m128i *buf = (__m128i *)(buf1 + width * i); for (int j = 0; j < width; ++j) { bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); } fdct64_new_avx2(bufA, bufA, cos_bit_row); fdct64_new_avx2(bufB, bufB, cos_bit_row); round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_32X64; __m256i buf0[64], buf1[256]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; const int width_div16 = (width >> 4); const int height_div16 = (height >> 4); for (int i = 0; i < width_div16; i++) { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(2, height_div16); ++j) { transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); } } for (int i = 0; i < AOMMIN(2, height_div16); i++) { __m256i bufA[32]; __m256i bufB[32]; __m128i *buf = (__m128i *)(buf1 + width * i); for (int j = 0; j < width; ++j) { bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); } fdct32_avx2(bufA, bufA, cos_bit_row); fdct32_avx2(bufB, bufB, cos_bit_row); round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); } } static void 
lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_16X64; __m256i buf0[64], buf1[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; const int width_div16 = (width >> 4); const int height_div16 = (height >> 4); for (int i = 0; i < width_div16; i++) { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); for (int j = 0; j < height_div16; ++j) { transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); } } for (int i = 0; i < AOMMIN(2, height_div16); i++) { __m256i *buf = buf1 + width * i; row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width); } } static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_64X16; __m256i buf0[64], buf1[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; const int width_div16 = (width >> 4); const int height_div16 = (height >> 4); for (int i = 0; i < width_div16; i++) { load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); round_shift_16bit_w16_avx2(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit_w16_avx2(buf0, height, shift[1]); for (int j = 0; j < height_div16; ++j) { transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); } } for (int i = 0; i < height_div16; i++) { __m256i *buf = buf1 + width * i; row_txfm(buf, buf, cos_bit_row); round_shift_16bit_w16_avx2(buf, width, shift[2]); store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32); } // Zero out the bottom 16x32 area. 
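// Only the 32 lowest-frequency outputs of each 64-point row transform are
// stored above (AV1 keeps at most 32x32 coefficients for 64-point transforms),
// so the unused half of the 64x16 output buffer is cleared here.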
memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); } static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, __m256i *in1, __m128i *out0, __m128i *out1, __m128i *out2, __m128i *out3, const __m256i *__rounding, int8_t *cos_bit) { __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); __m256i u0 = _mm256_madd_epi16(t0, *w0); __m256i u1 = _mm256_madd_epi16(t1, *w0); __m256i v0 = _mm256_madd_epi16(t0, *w1); __m256i v1 = _mm256_madd_epi16(t1, *w1); __m256i a0 = _mm256_add_epi32(u0, *__rounding); __m256i a1 = _mm256_add_epi32(u1, *__rounding); __m256i b0 = _mm256_add_epi32(v0, *__rounding); __m256i b1 = _mm256_add_epi32(v1, *__rounding); __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); __m256i temp0 = _mm256_packs_epi32(c0, c1); __m256i temp1 = _mm256_packs_epi32(d0, d1); *out0 = _mm256_castsi256_si128(temp0); *out1 = _mm256_castsi256_si128(temp1); *out2 = _mm256_extracti128_si256(temp0, 0x01); *out3 = _mm256_extracti128_si256(temp1, 0x01); } static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); // stage 1 __m256i x1[8]; x1[0] = _mm256_adds_epi16(input[0], input[7]); x1[7] = _mm256_subs_epi16(input[0], input[7]); x1[1] = _mm256_adds_epi16(input[1], input[6]); x1[6] = _mm256_subs_epi16(input[1], input[6]); x1[2] = _mm256_adds_epi16(input[2], input[5]); x1[5] = _mm256_subs_epi16(input[2], input[5]); x1[3] = _mm256_adds_epi16(input[3], input[4]); x1[4] = _mm256_subs_epi16(input[3], input[4]); // stage 2 __m256i x2[8]; x2[0] = _mm256_adds_epi16(x1[0], x1[3]); x2[3] = _mm256_subs_epi16(x1[0], x1[3]); x2[1] = _mm256_adds_epi16(x1[1], x1[2]); x2[2] = _mm256_subs_epi16(x1[1], x1[2]); x2[4] = x1[4]; btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, cos_bit); x2[5] = x1[5]; x2[6] = x1[6]; x2[7] = x1[7]; // stage 3 __m256i x3[8]; btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, cos_bit); x3[0] = x2[0]; x3[1] = x2[1]; btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, cos_bit); x3[2] = x2[2]; x3[3] = x2[3]; x3[4] = _mm256_adds_epi16(x2[4], x2[5]); x3[5] = _mm256_subs_epi16(x2[4], x2[5]); x3[6] = _mm256_subs_epi16(x2[7], x2[6]); x3[7] = _mm256_adds_epi16(x2[7], x2[6]); // stage 4 __m256i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, cos_bit); x4[4] = x3[4]; x4[7] = x3[7]; btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding, cos_bit); x4[5] = x3[5]; x4[6] = x3[6]; // stage 5 output[0] = x4[0]; output[1] = x4[4]; output[2] = x4[2]; output[3] = x4[6]; output[4] = x4[1]; output[5] = x4[5]; output[6] = 
x4[3]; output[7] = x4[7]; } static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); // stage 1 __m256i x1[8]; x1[0] = input[0]; x1[1] = _mm256_subs_epi16(__zero, input[7]); x1[2] = _mm256_subs_epi16(__zero, input[3]); x1[3] = input[4]; x1[4] = _mm256_subs_epi16(__zero, input[1]); x1[5] = input[6]; x1[6] = input[2]; x1[7] = _mm256_subs_epi16(__zero, input[5]); // stage 2 __m256i x2[8]; x2[0] = x1[0]; x2[1] = x1[1]; btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, cos_bit); x2[2] = x1[2]; x2[3] = x1[3]; x2[4] = x1[4]; x2[5] = x1[5]; btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding, cos_bit); x2[6] = x1[6]; x2[7] = x1[7]; // stage 3 __m256i x3[8]; x3[0] = _mm256_adds_epi16(x2[0], x2[2]); x3[2] = _mm256_subs_epi16(x2[0], x2[2]); x3[1] = _mm256_adds_epi16(x2[1], x2[3]); x3[3] = _mm256_subs_epi16(x2[1], x2[3]); x3[4] = _mm256_adds_epi16(x2[4], x2[6]); x3[6] = _mm256_subs_epi16(x2[4], x2[6]); x3[5] = _mm256_adds_epi16(x2[5], x2[7]); x3[7] = _mm256_subs_epi16(x2[5], x2[7]); // stage 4 __m256i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, cos_bit); x4[4] = x3[4]; x4[5] = x3[5]; btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, cos_bit); x4[6] = x3[6]; x4[7] = x3[7]; // stage 5 __m256i x5[8]; x5[0] = _mm256_adds_epi16(x4[0], x4[4]); x5[4] = _mm256_subs_epi16(x4[0], x4[4]); x5[1] = _mm256_adds_epi16(x4[1], x4[5]); x5[5] = _mm256_subs_epi16(x4[1], x4[5]); x5[2] = _mm256_adds_epi16(x4[2], x4[6]); x5[6] = _mm256_subs_epi16(x4[2], x4[6]); x5[3] = _mm256_adds_epi16(x4[3], x4[7]); x5[7] = _mm256_subs_epi16(x4[3], x4[7]); // stage 6 __m256i x6[8]; btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, cos_bit); x6[0] = x5[0]; x6[1] = x5[1]; btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, cos_bit); x6[2] = x5[2]; x6[3] = x5[3]; btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, cos_bit); x6[4] = x5[4]; x6[5] = x5[5]; btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, cos_bit); x6[6] = x5[6]; x6[7] = x5[7]; // stage 7 output[0] = x6[1]; output[1] = x6[6]; output[2] = x6[3]; output[3] = x6[4]; output[4] = x6[5]; output[5] = x6[2]; output[6] = x6[7]; output[7] = x6[0]; } static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; output[0] = _mm256_adds_epi16(input[0], 
input[0]); output[1] = _mm256_adds_epi16(input[1], input[1]); output[2] = _mm256_adds_epi16(input[2], input[2]); output[3] = _mm256_adds_epi16(input[3], input[3]); output[4] = _mm256_adds_epi16(input[4], input[4]); output[5] = _mm256_adds_epi16(input[5], input[5]); output[6] = _mm256_adds_epi16(input[6], input[6]); output[7] = _mm256_adds_epi16(input[7], input[7]); } static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i temp0, temp1, temp2, temp3; __m256i in0, in1; __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); __m256i cospi_arr[12]; cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), cospi_m32_p32, 0x1); cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), cospi_p32_p32, 0x1); cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), cospi_p48_p16, 0x1); cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), cospi_m16_p48, 0x1); cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), cospi_m48_m16, 0x1); cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), cospi_m16_p48, 0x1); cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), cospi_p24_p40, 0x1); cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), cospi_m40_p24, 0x1); cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), cospi_p28_p36, 0x1); cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), cospi_m36_p28, 0x1); cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), cospi_p12_p52, 0x1); cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), cospi_m52_p12, 0x1); __m256i x[8]; x[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], 0x1); x[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], 0x1); x[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], 0x1); x[6] = 
_mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); x[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); // stage 1 __m256i x1[8]; x1[0] = _mm256_adds_epi16(x[0], x[1]); x1[7] = _mm256_subs_epi16(x[0], x[1]); x1[1] = _mm256_adds_epi16(x[2], x[3]); x1[6] = _mm256_subs_epi16(x[2], x[3]); x1[2] = _mm256_adds_epi16(x[4], x[5]); x1[5] = _mm256_subs_epi16(x[4], x[5]); x1[3] = _mm256_adds_epi16(x[6], x[7]); x1[4] = _mm256_subs_epi16(x[6], x[7]); // stage 2 __m256i x2[8]; x2[0] = _mm256_adds_epi16(x1[0], x1[3]); x2[7] = _mm256_subs_epi16(x1[0], x1[3]); x2[1] = _mm256_adds_epi16(x1[1], x1[2]); x2[6] = _mm256_subs_epi16(x1[1], x1[2]); x2[2] = x1[4]; x2[3] = x1[7]; btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); // stage 3 __m256i x3[8]; x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); x3[0] = _mm256_adds_epi16(x2[0], x2[1]); x3[1] = _mm256_subs_epi16(x2[0], x2[1]); x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); x3[3] = _mm256_adds_epi16(x2[2], x2[4]); x3[4] = _mm256_subs_epi16(x2[2], x2[4]); x3[5] = _mm256_adds_epi16(x2[3], x2[5]); x3[6] = _mm256_subs_epi16(x2[3], x2[5]); // stage 4 __m256i x4[8]; x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); x4[2] = _mm256_adds_epi16(x3[2], x3[7]); x4[3] = _mm256_subs_epi16(x3[2], x3[7]); x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); // stage 5 __m256i x5[4]; in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], &output[10], &output[6], &__rounding_256, &cos_bit); x5[0] = _mm256_adds_epi16(x4[4], x4[6]); x5[1] = _mm256_subs_epi16(x4[4], x4[6]); x5[2] = _mm256_adds_epi16(x4[5], x4[7]); x5[3] = _mm256_subs_epi16(x4[5], x4[7]); // stage 6 in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], &output[9], &output[7], &__rounding_256, &cos_bit); in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); } static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __zero = _mm256_setzero_si256(); const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); 
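// The 8x16 ADST packs two 8-lane butterflies into each 256-bit operation:
// every cospi_arr[] entry built below carries a (w0, w1) weight pair in each
// 128-bit half, so a single btf_16_avx2() call applies two (possibly
// different) rotations at once. __rounding_256 = 1 << (cos_bit - 1) is the
// round-to-nearest offset added before the arithmetic shift by cos_bit.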
__m256i in0, in1; __m128i temp0, temp1, temp2, temp3; __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); __m256i cospi_arr[20]; cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), cospi_p32_p32, 0x1); cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), cospi_p32_m32, 0x1); cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), cospi_p32_p32, 0x1); cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), cospi_p32_m32, 0x1); cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), cospi_m48_p16, 0x1); cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), cospi_p16_p48, 0x1); cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), cospi_m48_p16, 0x1); cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), cospi_p16_p48, 0x1); cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), cospi_p40_p24, 0x1); cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), cospi_p24_m40, 0x1); cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), cospi_m24_p40, 0x1); cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), cospi_p40_p24, 0x1); cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), cospi_p10_p54, 0x1); cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), cospi_p54_m10, 0x1); cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), cospi_p26_p38, 0x1); cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), cospi_p38_m26, 0x1); cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), cospi_p42_p22, 0x1); cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), cospi_p22_m42, 0x1); 
cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), cospi_p58_p06, 0x1); cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), cospi_p06_m58, 0x1); __m256i x[8]; x[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); x[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], 0x1); x[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); x[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); x[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); // stage 1 __m256i x1[8]; x1[0] = x[0]; x1[1] = _mm256_subs_epi16(__zero, x[7]); x1[2] = x[2]; x1[3] = _mm256_subs_epi16(__zero, x[5]); x1[4] = _mm256_subs_epi16(__zero, x[4]); x1[5] = x[3]; x1[6] = _mm256_subs_epi16(__zero, x[6]); x1[7] = x[1]; // stage 2 __m256i x2[8]; x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); // stage 3 __m256i x3[8]; x3[0] = _mm256_adds_epi16(x2[0], x2[1]); x3[1] = _mm256_subs_epi16(x2[0], x2[1]); x3[2] = _mm256_adds_epi16(x2[3], x2[2]); x3[3] = _mm256_subs_epi16(x2[3], x2[2]); x3[4] = _mm256_adds_epi16(x2[4], x2[5]); x3[5] = _mm256_subs_epi16(x2[4], x2[5]); x3[6] = _mm256_adds_epi16(x2[7], x2[6]); x3[7] = _mm256_subs_epi16(x2[7], x2[6]); // stage 4 __m256i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[4] = x3[4]; x4[5] = x3[5]; in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); // stage 5 __m256i x5[8]; x5[0] = _mm256_adds_epi16(x4[0], x4[2]); x5[1] = _mm256_subs_epi16(x4[0], x4[2]); x5[2] = _mm256_adds_epi16(x4[1], x4[3]); x5[3] = _mm256_subs_epi16(x4[1], x4[3]); x5[4] = _mm256_adds_epi16(x4[4], x4[6]); x5[5] = _mm256_subs_epi16(x4[4], x4[6]); x5[6] = _mm256_adds_epi16(x4[5], 
x4[7]); x5[7] = _mm256_subs_epi16(x4[5], x4[7]); // stage 6 __m256i x6[8]; x6[0] = x5[0]; x6[1] = x5[2]; x6[2] = x5[1]; x6[3] = x5[3]; in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20); in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, &temp2, &temp3, &__rounding_256, &cos_bit); x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); // stage 7 __m256i x7[8]; x7[0] = _mm256_adds_epi16(x6[0], x6[4]); x7[1] = _mm256_subs_epi16(x6[0], x6[4]); x7[2] = _mm256_adds_epi16(x6[1], x6[5]); x7[3] = _mm256_subs_epi16(x6[1], x6[5]); x7[4] = _mm256_adds_epi16(x6[2], x6[6]); x7[5] = _mm256_subs_epi16(x6[2], x6[6]); x7[6] = _mm256_adds_epi16(x6[3], x6[7]); x7[7] = _mm256_subs_epi16(x6[3], x6[7]); // stage 8 in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31); btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); } static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const __m256i one = _mm256_set1_epi16(1); __m256i temp; for (int i = 0; i < 16; i += 2) { temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), input[i + 1], 0x1); const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); temp = _mm256_packs_epi32(b_lo, b_hi); output[i] = _mm256_castsi256_si128(temp); output[i + 1] = _mm256_extractf128_si256(temp, 0x1); } } static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { fdct8x8_new_avx2, // DCT_DCT fdct8x8_new_avx2, // ADST_DCT fadst8x8_new_avx2, // DCT_ADST fadst8x8_new_avx2, // ADST_ADST fdct8x8_new_avx2, // FLIPADST_DCT fadst8x8_new_avx2, // DCT_FLIPADST fadst8x8_new_avx2, // FLIPADST_FLIPADST fadst8x8_new_avx2, // ADST_FLIPADST fadst8x8_new_avx2, // FLIPADST_ADST fidentity8x8_new_avx2, // IDTX fidentity8x8_new_avx2, // V_DCT fdct8x8_new_avx2, // H_DCT fidentity8x8_new_avx2, // V_ADST fadst8x8_new_avx2, // H_ADST fidentity8x8_new_avx2, // V_FLIPADST fadst8x8_new_avx2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { fdct8x16_new_avx2, // DCT_DCT 
fadst8x16_new_avx2, // ADST_DCT fdct8x16_new_avx2, // DCT_ADST fadst8x16_new_avx2, // ADST_ADST fadst8x16_new_avx2, // FLIPADST_DCT fdct8x16_new_avx2, // DCT_FLIPADST fadst8x16_new_avx2, // FLIPADST_FLIPADST fadst8x16_new_avx2, // ADST_FLIPADST fadst8x16_new_avx2, // FLIPADST_ADST fidentity8x16_new_avx2, // IDTX fdct8x16_new_avx2, // V_DCT fidentity8x16_new_avx2, // H_DCT fadst8x16_new_avx2, // V_ADST fidentity8x16_new_avx2, // H_ADST fadst8x16_new_avx2, // V_FLIPADST fidentity8x16_new_avx2 // H_FLIPADST }; static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { fdct8x8_new_avx2, // DCT_DCT fadst8x8_new_avx2, // ADST_DCT fdct8x8_new_avx2, // DCT_ADST fadst8x8_new_avx2, // ADST_ADST fadst8x8_new_avx2, // FLIPADST_DCT fdct8x8_new_avx2, // DCT_FLIPADST fadst8x8_new_avx2, // FLIPADST_FLIPADST fadst8x8_new_avx2, // ADST_FLIPADST fadst8x8_new_avx2, // FLIPADST_ADST fidentity8x8_new_avx2, // IDTX fdct8x8_new_avx2, // V_DCT fidentity8x8_new_avx2, // H_DCT fadst8x8_new_avx2, // V_ADST fidentity8x8_new_avx2, // H_ADST fadst8x8_new_avx2, // V_FLIPADST fidentity8x8_new_avx2, // H_FLIPADST }; static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { fdct8x16_new_avx2, // DCT_DCT fdct8x16_new_avx2, // ADST_DCT fadst8x16_new_avx2, // DCT_ADST fadst8x16_new_avx2, // ADST_ADST fdct8x16_new_avx2, // FLIPADST_DCT fadst8x16_new_avx2, // DCT_FLIPADST fadst8x16_new_avx2, // FLIPADST_FLIPADST fadst8x16_new_avx2, // ADST_FLIPADST fadst8x16_new_avx2, // FLIPADST_ADST fidentity8x16_new_avx2, // IDTX fidentity8x16_new_avx2, // V_DCT fdct8x16_new_avx2, // H_DCT fidentity8x16_new_avx2, // V_ADST fadst8x16_new_avx2, // H_ADST fidentity8x16_new_avx2, // V_FLIPADST fadst8x16_new_avx2 // H_FLIPADST }; static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; __m256i buf2[8]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; const int txw_idx = get_txw_idx(TX_8X16); const int txh_idx = get_txh_idx(TX_8X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 8; const int height = 16; const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1); transpose_16bit_8x8(buf0 + 8, buf1 + 8); __m128i *bufl, *bufu; if (lr_flip) { bufl = buf0; bufu = buf0 + 8; flip_buf_sse2(buf1 + width * 0, bufl, width); flip_buf_sse2(buf1 + width * 1, bufu, width); } else { bufl = buf1 + width * 0; bufu = buf1 + width * 1; } pack_reg(bufl, bufu, buf2); row_txfm(buf2, buf2, cos_bit_row); round_shift_16bit_w16_avx2(buf2, width, shift[2]); store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width); } static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; __m256i buf2[8]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; const int txw_idx = get_txw_idx(TX_16X8); const int txh_idx = get_txh_idx(TX_16X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = 
av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 16; const int height = 8; const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; __m128i *buf; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height); } else { load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); } pack_reg(buf0, &buf0[8], buf2); round_shift_16bit_w16_avx2(buf2, height, shift[0]); col_txfm(buf2, buf2, cos_bit_col); round_shift_16bit_w16_avx2(buf2, height, shift[1]); transpose_16bit_16x8_avx2(buf2, buf2); extract_reg(buf2, buf1); if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); } static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform }; void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); } else { fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } } aom-3.12.1/av1/encoder/x86/av1_fwd_txfm2d_sse4.c000066400000000000000000000333521477627663500210570ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "config/av1_rtcd.h" #include "av1/common/enums.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse2.h" #include "av1/common/x86/highbd_txfm_utility_sse4.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_txfm1d_sse4.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" static inline void int16_array_with_stride_to_int32_array_without_stride( const int16_t *input, int stride, int32_t *output, int txfm1d_size) { int r, c; for (r = 0; r < txfm1d_size; r++) { for (c = 0; c < txfm1d_size; c++) { output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; } } } static inline void store_output_32bit_w8(int32_t *const out, const __m128i *const in1, const __m128i *const in2, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm_store_si128((__m128i *)(out + stride * i), in1[i]); _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]); } } typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range); static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range) { const int txfm_size = 32; const int num_per_128 = 4; int col_num = txfm_size / num_per_128; int col; (void)stage_range; for (col = 0; col < col_num; col++) { av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); } } static void fdct64_new_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range) { const int txfm_size = 64; const int num_per_128 = 4; int col_num = txfm_size / num_per_128; (void)stage_range; for (int col = 0; col < col_num; col++) { av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); } } static void idtx32x32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, const int8_t *stage_range) { (void)stage_range; for (int i = 0; i < 8; i++) { av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); } } static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT32: return fdct32_sse4_1; case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; default: assert(0); } return NULL; } static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { // TODO(sarahparker) This does not currently support rectangular transforms // and will break without splitting txfm_size out into row and col size. // Rectangular transforms use c code only, so it should be ok for now. // It will be corrected when there are sse implementations for rectangular // transforms. 
assert(cfg->tx_size < TX_SIZES); const int txfm_size = tx_size_wide[cfg->tx_size]; const int8_t *shift = cfg->shift; const int8_t *stage_range_col = cfg->stage_range_col; const int8_t *stage_range_row = cfg->stage_range_row; const int8_t cos_bit_col = cfg->cos_bit_col; const int8_t cos_bit_row = cfg->cos_bit_row; const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); __m128i *buf_128 = (__m128i *)txfm_buf; __m128i *out_128 = (__m128i *)output; int num_per_128 = 4; int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, txfm_size); av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); transpose_32(txfm_size, out_128, buf_128); txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); } static inline void fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_FLIP_CFG *cfg, int32_t *txfm_buf) { assert(cfg->tx_size < TX_SIZES); const int txfm_size = tx_size_wide[cfg->tx_size]; const int8_t *shift = cfg->shift; const int8_t *stage_range_col = cfg->stage_range_col; const int8_t cos_bit_col = cfg->cos_bit_col; const int8_t cos_bit_row = cfg->cos_bit_row; const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); __m128i *buf_128 = (__m128i *)txfm_buf; __m128i *out_128 = (__m128i *)output; const int num_per_128 = 4; int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; int col_num = txfm_size / num_per_128; int16_array_with_stride_to_int32_array_without_stride(input, stride, output, txfm_size); /*col wise transform*/ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); transpose_32(txfm_size, out_128, buf_128); /*row wise transform*/ for (int col = 0; col < (col_num >> 1); col++) { av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, (col_num >> 1)); } txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); } void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); (void)bd; fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); } void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); TXFM_2D_FLIP_CFG cfg; av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); (void)bd; fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); } static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_64X64; __m128i buf0[64], buf1[512]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; 
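  // AV1 codes only the top-left 32x32 quadrant of a 64x64 transform, so the
  // transpose below keeps just the first 32 rows of column outputs and the
  // row pass stores a 32x32 block of coefficients.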
const int height = tx_size_high[tx_size]; const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; const int width_div8 = (width >> 3); const int height_div8 = (height >> 3); for (int i = 0; i < width_div8; i++) { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(4, height_div8); ++j) { transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); } } for (int i = 0; i < AOMMIN(4, height_div8); i++) { __m128i bufA[64]; __m128i bufB[64]; __m128i *buf = buf1 + width * i; for (int j = 0; j < width; ++j) { bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; const TX_SIZE tx_size = TX_64X32; __m128i buf0[64], buf1[256]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; const int width_div8 = (width >> 3); const int height_div8 = (height >> 3); for (int i = 0; i < width_div8; i++) { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(4, height_div8); ++j) { transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); } } assert(tx_type == DCT_DCT); for (int i = 0; i < AOMMIN(4, height_div8); i++) { __m128i bufA[64]; __m128i bufB[64]; __m128i *buf = buf1 + width * i; for (int j = 0; j < width; ++j) { bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); } } static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_32X64; __m128i buf0[64], buf1[256]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; const int width_div8 = (width >> 3); const int height_div8 = (height >> 3); for (int i = 0; i < width_div8; i++) { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); 
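    // Column pass: a 64-point DCT over this group of 8 columns, computed at
    // 16-bit precision; only the lowest 32 of the 64 outputs are transposed
    // and carried into the 32-point row pass, matching AV1's truncation of
    // 64-point transforms.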
col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); for (int j = 0; j < AOMMIN(4, height_div8); ++j) { transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); } } for (int i = 0; i < AOMMIN(4, height_div8); i++) { __m128i bufA[32]; __m128i bufB[32]; __m128i *buf = buf1 + width * i; for (int j = 0; j < width; ++j) { bufA[j] = _mm_cvtepi16_epi32(buf[j]); bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); } av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); } } static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform }; void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); } else { fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } } aom-3.12.1/av1/encoder/x86/av1_fwd_txfm_avx2.h000066400000000000000000000101451477627663500206330ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ #define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ #include // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 static inline void btf_32_avx2_type0(const int32_t w0, const int32_t w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; __m256i _in1 = *in1; const __m256i ww0 = _mm256_set1_epi32(w0); const __m256i ww1 = _mm256_set1_epi32(w1); const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); temp0 = _mm256_add_epi32(temp0, _r); *in0 = _mm256_srai_epi32(temp0, cos_bit); const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); temp1 = _mm256_add_epi32(temp1, _r); *in1 = _mm256_srai_epi32(temp1, cos_bit); } static inline void btf_32_avx2_type1(const int32_t w0, const int32_t w1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; __m256i _in1 = *in1; const __m256i ww0 = _mm256_set1_epi32(w0); const __m256i ww1 = _mm256_set1_epi32(w1); const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); temp0 = _mm256_add_epi32(temp0, _r); *in0 = _mm256_srai_epi32(temp0, cos_bit); const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); temp1 = _mm256_add_epi32(temp1, _r); *in1 = _mm256_srai_epi32(temp1, cos_bit); } // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 static inline void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; __m256i _in1 = *in1; const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); temp0 = _mm256_add_epi32(temp0, _r); *in0 = _mm256_srai_epi32(temp0, cos_bit); const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); temp1 = _mm256_add_epi32(temp1, _r); *in1 = _mm256_srai_epi32(temp1, cos_bit); } // out0 = in0*w0 + in1*w1 // out1 = in1*w0 - in0*w1 static inline void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, __m256i *in0, __m256i *in1, const __m256i _r, const int32_t cos_bit) { __m256i _in0 = *in0; __m256i _in1 = *in1; const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); temp0 = _mm256_add_epi32(temp0, _r); *in0 = _mm256_srai_epi32(temp0, cos_bit); const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); temp1 = _mm256_add_epi32(temp1, _r); *in1 = _mm256_srai_epi32(temp1, cos_bit); } #endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ aom-3.12.1/av1/encoder/x86/av1_fwd_txfm_sse2.c000066400000000000000000003116221477627663500206260ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/common/x86/av1_txfm_sse2.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "av1/encoder/x86/av1_fwd_txfm_sse2.h" // TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i u[4], v[4]; u[0] = _mm_unpacklo_epi16(input[0], input[1]); u[1] = _mm_unpacklo_epi16(input[3], input[2]); v[0] = _mm_add_epi16(u[0], u[1]); v[1] = _mm_sub_epi16(u[0], u[1]); u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 v[0] = _mm_add_epi32(u[0], __rounding); v[1] = _mm_add_epi32(u[1], __rounding); v[2] = _mm_add_epi32(u[2], __rounding); v[3] = _mm_add_epi32(u[3], __rounding); u[0] = _mm_srai_epi32(v[0], cos_bit); u[1] = _mm_srai_epi32(v[1], cos_bit); u[2] = _mm_srai_epi32(v[2], cos_bit); u[3] = _mm_srai_epi32(v[3], cos_bit); output[0] = _mm_packs_epi32(u[0], u[1]); output[1] = _mm_packs_epi32(u[2], u[3]); output[2] = _mm_srli_si128(output[0], 8); output[3] = _mm_srli_si128(output[1], 8); } static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); // stage 1 __m128i x1[4]; x1[0] = _mm_adds_epi16(input[0], input[3]); x1[3] = _mm_subs_epi16(input[0], input[3]); x1[1] = _mm_adds_epi16(input[1], input[2]); x1[2] = _mm_subs_epi16(input[1], input[2]); // stage 2 __m128i x2[4]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); // stage 3 output[0] = x2[0]; output[1] = x2[2]; output[2] = x2[1]; output[3] = x2[3]; } static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); // stage 1 __m128i x1[8]; x1[0] = 
_mm_adds_epi16(input[0], input[7]); x1[7] = _mm_subs_epi16(input[0], input[7]); x1[1] = _mm_adds_epi16(input[1], input[6]); x1[6] = _mm_subs_epi16(input[1], input[6]); x1[2] = _mm_adds_epi16(input[2], input[5]); x1[5] = _mm_subs_epi16(input[2], input[5]); x1[3] = _mm_adds_epi16(input[3], input[4]); x1[4] = _mm_subs_epi16(input[3], input[4]); // stage 2 __m128i x2[8]; x2[0] = _mm_adds_epi16(x1[0], x1[3]); x2[3] = _mm_subs_epi16(x1[0], x1[3]); x2[1] = _mm_adds_epi16(x1[1], x1[2]); x2[2] = _mm_subs_epi16(x1[1], x1[2]); x2[4] = x1[4]; btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], &x1[6], &x2[5], &x2[6]); x2[7] = x1[7]; // stage 3 __m128i x3[8]; btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], &x2[1], &x3[0], &x3[1]); btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], &x2[3], &x3[2], &x3[3]); x3[4] = _mm_adds_epi16(x2[4], x2[5]); x3[5] = _mm_subs_epi16(x2[4], x2[5]); x3[6] = _mm_subs_epi16(x2[7], x2[6]); x3[7] = _mm_adds_epi16(x2[7], x2[6]); // stage 4 __m128i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], &x3[7], &x4[4], &x4[7]); btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], &x3[6], &x4[5], &x4[6]); // stage 5 output[0] = x4[0]; output[1] = x4[4]; output[2] = x4[2]; output[3] = x4[6]; output[4] = x4[1]; output[5] = x4[5]; output[6] = x4[3]; output[7] = x4[7]; } static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); // stage 1 __m128i x1[16]; x1[0] = _mm_adds_epi16(input[0], input[15]); x1[15] = _mm_subs_epi16(input[0], input[15]); x1[1] = _mm_adds_epi16(input[1], input[14]); x1[14] = _mm_subs_epi16(input[1], input[14]); x1[2] = _mm_adds_epi16(input[2], input[13]); x1[13] = _mm_subs_epi16(input[2], input[13]); x1[3] = _mm_adds_epi16(input[3], input[12]); x1[12] = _mm_subs_epi16(input[3], input[12]); x1[4] = _mm_adds_epi16(input[4], input[11]); x1[11] = _mm_subs_epi16(input[4], input[11]); x1[5] = _mm_adds_epi16(input[5], input[10]); x1[10] = _mm_subs_epi16(input[5], input[10]); x1[6] = _mm_adds_epi16(input[6], input[9]); x1[9] = _mm_subs_epi16(input[6], input[9]); x1[7] = _mm_adds_epi16(input[7], input[8]); x1[8] = _mm_subs_epi16(input[7], input[8]); // 
stage 2 __m128i x2[16]; x2[0] = _mm_adds_epi16(x1[0], x1[7]); x2[7] = _mm_subs_epi16(x1[0], x1[7]); x2[1] = _mm_adds_epi16(x1[1], x1[6]); x2[6] = _mm_subs_epi16(x1[1], x1[6]); x2[2] = _mm_adds_epi16(x1[2], x1[5]); x2[5] = _mm_subs_epi16(x1[2], x1[5]); x2[3] = _mm_adds_epi16(x1[3], x1[4]); x2[4] = _mm_subs_epi16(x1[3], x1[4]); x2[8] = x1[8]; x2[9] = x1[9]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); x2[14] = x1[14]; x2[15] = x1[15]; // stage 3 __m128i x3[16]; x3[0] = _mm_adds_epi16(x2[0], x2[3]); x3[3] = _mm_subs_epi16(x2[0], x2[3]); x3[1] = _mm_adds_epi16(x2[1], x2[2]); x3[2] = _mm_subs_epi16(x2[1], x2[2]); x3[4] = x2[4]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); x3[7] = x2[7]; x3[8] = _mm_adds_epi16(x2[8], x2[11]); x3[11] = _mm_subs_epi16(x2[8], x2[11]); x3[9] = _mm_adds_epi16(x2[9], x2[10]); x3[10] = _mm_subs_epi16(x2[9], x2[10]); x3[12] = _mm_subs_epi16(x2[15], x2[12]); x3[15] = _mm_adds_epi16(x2[15], x2[12]); x3[13] = _mm_subs_epi16(x2[14], x2[13]); x3[14] = _mm_adds_epi16(x2[14], x2[13]); // stage 4 __m128i x4[16]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); x4[4] = _mm_adds_epi16(x3[4], x3[5]); x4[5] = _mm_subs_epi16(x3[4], x3[5]); x4[6] = _mm_subs_epi16(x3[7], x3[6]); x4[7] = _mm_adds_epi16(x3[7], x3[6]); x4[8] = x3[8]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); x4[11] = x3[11]; x4[12] = x3[12]; x4[15] = x3[15]; // stage 5 __m128i x5[16]; x5[0] = x4[0]; x5[1] = x4[1]; x5[2] = x4[2]; x5[3] = x4[3]; btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); x5[8] = _mm_adds_epi16(x4[8], x4[9]); x5[9] = _mm_subs_epi16(x4[8], x4[9]); x5[10] = _mm_subs_epi16(x4[11], x4[10]); x5[11] = _mm_adds_epi16(x4[11], x4[10]); x5[12] = _mm_adds_epi16(x4[12], x4[13]); x5[13] = _mm_subs_epi16(x4[12], x4[13]); x5[14] = _mm_subs_epi16(x4[15], x4[14]); x5[15] = _mm_adds_epi16(x4[15], x4[14]); // stage 6 __m128i x6[16]; x6[0] = x5[0]; x6[1] = x5[1]; x6[2] = x5[2]; x6[3] = x5[3]; x6[4] = x5[4]; x6[5] = x5[5]; x6[6] = x5[6]; x6[7] = x5[7]; btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); // stage 7 output[0] = x6[0]; output[1] = x6[8]; output[2] = x6[4]; output[3] = x6[12]; output[4] = x6[2]; output[5] = x6[10]; output[6] = x6[6]; output[7] = x6[14]; output[8] = x6[1]; output[9] = x6[9]; output[10] = x6[5]; output[11] = x6[13]; output[12] = x6[3]; output[13] = x6[11]; output[14] = x6[7]; output[15] = x6[15]; } void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); // stage 1 __m128i x1[32]; x1[0] = _mm_adds_epi16(input[0], input[31]); x1[31] = _mm_subs_epi16(input[0], input[31]); x1[1] = _mm_adds_epi16(input[1], input[30]); x1[30] = _mm_subs_epi16(input[1], input[30]); x1[2] = _mm_adds_epi16(input[2], input[29]); x1[29] = _mm_subs_epi16(input[2], input[29]); x1[3] = _mm_adds_epi16(input[3], input[28]); x1[28] = _mm_subs_epi16(input[3], input[28]); x1[4] = _mm_adds_epi16(input[4], input[27]); x1[27] = _mm_subs_epi16(input[4], input[27]); x1[5] = _mm_adds_epi16(input[5], input[26]); x1[26] = _mm_subs_epi16(input[5], input[26]); x1[6] = _mm_adds_epi16(input[6], input[25]); x1[25] = _mm_subs_epi16(input[6], input[25]); x1[7] = _mm_adds_epi16(input[7], input[24]); x1[24] = _mm_subs_epi16(input[7], input[24]); x1[8] = _mm_adds_epi16(input[8], input[23]); x1[23] = _mm_subs_epi16(input[8], input[23]); x1[9] = _mm_adds_epi16(input[9], input[22]); x1[22] = _mm_subs_epi16(input[9], input[22]); x1[10] = _mm_adds_epi16(input[10], input[21]); x1[21] = _mm_subs_epi16(input[10], input[21]); x1[11] = _mm_adds_epi16(input[11], input[20]); x1[20] = _mm_subs_epi16(input[11], input[20]); x1[12] = _mm_adds_epi16(input[12], input[19]); x1[19] = _mm_subs_epi16(input[12], input[19]); x1[13] = _mm_adds_epi16(input[13], input[18]); x1[18] = _mm_subs_epi16(input[13], input[18]); x1[14] = _mm_adds_epi16(input[14], input[17]); x1[17] = _mm_subs_epi16(input[14], input[17]); x1[15] = _mm_adds_epi16(input[15], input[16]); x1[16] = _mm_subs_epi16(input[15], input[16]); // stage 2 __m128i x2[32]; x2[0] = _mm_adds_epi16(x1[0], x1[15]); x2[15] = _mm_subs_epi16(x1[0], x1[15]); x2[1] = _mm_adds_epi16(x1[1], x1[14]); 
x2[14] = _mm_subs_epi16(x1[1], x1[14]); x2[2] = _mm_adds_epi16(x1[2], x1[13]); x2[13] = _mm_subs_epi16(x1[2], x1[13]); x2[3] = _mm_adds_epi16(x1[3], x1[12]); x2[12] = _mm_subs_epi16(x1[3], x1[12]); x2[4] = _mm_adds_epi16(x1[4], x1[11]); x2[11] = _mm_subs_epi16(x1[4], x1[11]); x2[5] = _mm_adds_epi16(x1[5], x1[10]); x2[10] = _mm_subs_epi16(x1[5], x1[10]); x2[6] = _mm_adds_epi16(x1[6], x1[9]); x2[9] = _mm_subs_epi16(x1[6], x1[9]); x2[7] = _mm_adds_epi16(x1[7], x1[8]); x2[8] = _mm_subs_epi16(x1[7], x1[8]); x2[16] = x1[16]; x2[17] = x1[17]; x2[18] = x1[18]; x2[19] = x1[19]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); x2[28] = x1[28]; x2[29] = x1[29]; x2[30] = x1[30]; x2[31] = x1[31]; // stage 3 __m128i x3[32]; x3[0] = _mm_adds_epi16(x2[0], x2[7]); x3[7] = _mm_subs_epi16(x2[0], x2[7]); x3[1] = _mm_adds_epi16(x2[1], x2[6]); x3[6] = _mm_subs_epi16(x2[1], x2[6]); x3[2] = _mm_adds_epi16(x2[2], x2[5]); x3[5] = _mm_subs_epi16(x2[2], x2[5]); x3[3] = _mm_adds_epi16(x2[3], x2[4]); x3[4] = _mm_subs_epi16(x2[3], x2[4]); x3[8] = x2[8]; x3[9] = x2[9]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); x3[14] = x2[14]; x3[15] = x2[15]; x3[16] = _mm_adds_epi16(x2[16], x2[23]); x3[23] = _mm_subs_epi16(x2[16], x2[23]); x3[17] = _mm_adds_epi16(x2[17], x2[22]); x3[22] = _mm_subs_epi16(x2[17], x2[22]); x3[18] = _mm_adds_epi16(x2[18], x2[21]); x3[21] = _mm_subs_epi16(x2[18], x2[21]); x3[19] = _mm_adds_epi16(x2[19], x2[20]); x3[20] = _mm_subs_epi16(x2[19], x2[20]); x3[24] = _mm_subs_epi16(x2[31], x2[24]); x3[31] = _mm_adds_epi16(x2[31], x2[24]); x3[25] = _mm_subs_epi16(x2[30], x2[25]); x3[30] = _mm_adds_epi16(x2[30], x2[25]); x3[26] = _mm_subs_epi16(x2[29], x2[26]); x3[29] = _mm_adds_epi16(x2[29], x2[26]); x3[27] = _mm_subs_epi16(x2[28], x2[27]); x3[28] = _mm_adds_epi16(x2[28], x2[27]); // stage 4 __m128i x4[32]; x4[0] = _mm_adds_epi16(x3[0], x3[3]); x4[3] = _mm_subs_epi16(x3[0], x3[3]); x4[1] = _mm_adds_epi16(x3[1], x3[2]); x4[2] = _mm_subs_epi16(x3[1], x3[2]); x4[4] = x3[4]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); x4[7] = x3[7]; x4[8] = _mm_adds_epi16(x3[8], x3[11]); x4[11] = _mm_subs_epi16(x3[8], x3[11]); x4[9] = _mm_adds_epi16(x3[9], x3[10]); x4[10] = _mm_subs_epi16(x3[9], x3[10]); x4[12] = _mm_subs_epi16(x3[15], x3[12]); x4[15] = _mm_adds_epi16(x3[15], x3[12]); x4[13] = _mm_subs_epi16(x3[14], x3[13]); x4[14] = _mm_adds_epi16(x3[14], x3[13]); x4[16] = x3[16]; x4[17] = x3[17]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); x4[22] = x3[22]; x4[23] = x3[23]; x4[24] = x3[24]; x4[25] = x3[25]; x4[30] = x3[30]; x4[31] = x3[31]; // stage 5 __m128i x5[32]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); x5[4] = _mm_adds_epi16(x4[4], x4[5]); x5[5] = _mm_subs_epi16(x4[4], x4[5]); x5[6] = _mm_subs_epi16(x4[7], x4[6]); x5[7] = _mm_adds_epi16(x4[7], x4[6]); x5[8] = x4[8]; btf_16_sse2(cospi_m16_p48, 
cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); x5[11] = x4[11]; x5[12] = x4[12]; x5[15] = x4[15]; x5[16] = _mm_adds_epi16(x4[16], x4[19]); x5[19] = _mm_subs_epi16(x4[16], x4[19]); x5[17] = _mm_adds_epi16(x4[17], x4[18]); x5[18] = _mm_subs_epi16(x4[17], x4[18]); x5[20] = _mm_subs_epi16(x4[23], x4[20]); x5[23] = _mm_adds_epi16(x4[23], x4[20]); x5[21] = _mm_subs_epi16(x4[22], x4[21]); x5[22] = _mm_adds_epi16(x4[22], x4[21]); x5[24] = _mm_adds_epi16(x4[24], x4[27]); x5[27] = _mm_subs_epi16(x4[24], x4[27]); x5[25] = _mm_adds_epi16(x4[25], x4[26]); x5[26] = _mm_subs_epi16(x4[25], x4[26]); x5[28] = _mm_subs_epi16(x4[31], x4[28]); x5[31] = _mm_adds_epi16(x4[31], x4[28]); x5[29] = _mm_subs_epi16(x4[30], x4[29]); x5[30] = _mm_adds_epi16(x4[30], x4[29]); // stage 6 __m128i x6[32]; x6[0] = x5[0]; x6[1] = x5[1]; x6[2] = x5[2]; x6[3] = x5[3]; btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); x6[8] = _mm_adds_epi16(x5[8], x5[9]); x6[9] = _mm_subs_epi16(x5[8], x5[9]); x6[10] = _mm_subs_epi16(x5[11], x5[10]); x6[11] = _mm_adds_epi16(x5[11], x5[10]); x6[12] = _mm_adds_epi16(x5[12], x5[13]); x6[13] = _mm_subs_epi16(x5[12], x5[13]); x6[14] = _mm_subs_epi16(x5[15], x5[14]); x6[15] = _mm_adds_epi16(x5[15], x5[14]); x6[16] = x5[16]; btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); x6[19] = x5[19]; x6[20] = x5[20]; btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); x6[23] = x5[23]; x6[24] = x5[24]; x6[27] = x5[27]; x6[28] = x5[28]; x6[31] = x5[31]; // stage 7 __m128i x7[32]; x7[0] = x6[0]; x7[1] = x6[1]; x7[2] = x6[2]; x7[3] = x6[3]; x7[4] = x6[4]; x7[5] = x6[5]; x7[6] = x6[6]; x7[7] = x6[7]; btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); x7[16] = _mm_adds_epi16(x6[16], x6[17]); x7[17] = _mm_subs_epi16(x6[16], x6[17]); x7[18] = _mm_subs_epi16(x6[19], x6[18]); x7[19] = _mm_adds_epi16(x6[19], x6[18]); x7[20] = _mm_adds_epi16(x6[20], x6[21]); x7[21] = _mm_subs_epi16(x6[20], x6[21]); x7[22] = _mm_subs_epi16(x6[23], x6[22]); x7[23] = _mm_adds_epi16(x6[23], x6[22]); x7[24] = _mm_adds_epi16(x6[24], x6[25]); x7[25] = _mm_subs_epi16(x6[24], x6[25]); x7[26] = _mm_subs_epi16(x6[27], x6[26]); x7[27] = _mm_adds_epi16(x6[27], x6[26]); x7[28] = _mm_adds_epi16(x6[28], x6[29]); x7[29] = _mm_subs_epi16(x6[28], x6[29]); x7[30] = _mm_subs_epi16(x6[31], x6[30]); x7[31] = _mm_adds_epi16(x6[31], x6[30]); // stage 8 __m128i x8[32]; x8[0] = x7[0]; x8[1] = x7[1]; x8[2] = x7[2]; x8[3] = x7[3]; x8[4] = x7[4]; x8[5] = x7[5]; x8[6] = x7[6]; x8[7] = x7[7]; x8[8] = x7[8]; x8[9] = x7[9]; x8[10] = x7[10]; x8[11] = x7[11]; x8[12] = x7[12]; x8[13] = x7[13]; x8[14] = x7[14]; x8[15] = x7[15]; btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], 
x8[20], x8[27]); btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); // stage 9 output[0] = x8[0]; output[1] = x8[16]; output[2] = x8[8]; output[3] = x8[24]; output[4] = x8[4]; output[5] = x8[20]; output[6] = x8[12]; output[7] = x8[28]; output[8] = x8[2]; output[9] = x8[18]; output[10] = x8[10]; output[11] = x8[26]; output[12] = x8[6]; output[13] = x8[22]; output[14] = x8[14]; output[15] = x8[30]; output[16] = x8[1]; output[17] = x8[17]; output[18] = x8[9]; output[19] = x8[25]; output[20] = x8[5]; output[21] = x8[21]; output[22] = x8[13]; output[23] = x8[29]; output[24] = x8[3]; output[25] = x8[19]; output[26] = x8[11]; output[27] = x8[27]; output[28] = x8[7]; output[29] = x8[23]; output[30] = x8[15]; output[31] = x8[31]; } void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); __m128i 
cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); // stage 1 __m128i x1[64]; x1[0] = _mm_adds_epi16(input[0], input[63]); x1[63] = _mm_subs_epi16(input[0], input[63]); x1[1] = _mm_adds_epi16(input[1], input[62]); x1[62] = _mm_subs_epi16(input[1], input[62]); x1[2] = _mm_adds_epi16(input[2], input[61]); x1[61] = _mm_subs_epi16(input[2], input[61]); x1[3] = _mm_adds_epi16(input[3], input[60]); x1[60] = _mm_subs_epi16(input[3], input[60]); x1[4] = _mm_adds_epi16(input[4], input[59]); x1[59] = _mm_subs_epi16(input[4], input[59]); x1[5] = _mm_adds_epi16(input[5], input[58]); x1[58] = _mm_subs_epi16(input[5], input[58]); x1[6] = _mm_adds_epi16(input[6], input[57]); x1[57] = _mm_subs_epi16(input[6], input[57]); x1[7] = _mm_adds_epi16(input[7], input[56]); x1[56] = _mm_subs_epi16(input[7], input[56]); x1[8] = _mm_adds_epi16(input[8], input[55]); x1[55] = _mm_subs_epi16(input[8], input[55]); x1[9] = _mm_adds_epi16(input[9], input[54]); x1[54] = _mm_subs_epi16(input[9], input[54]); x1[10] = _mm_adds_epi16(input[10], input[53]); x1[53] = _mm_subs_epi16(input[10], input[53]); x1[11] = _mm_adds_epi16(input[11], input[52]); x1[52] = _mm_subs_epi16(input[11], input[52]); x1[12] = _mm_adds_epi16(input[12], input[51]); x1[51] = _mm_subs_epi16(input[12], input[51]); x1[13] = _mm_adds_epi16(input[13], input[50]); x1[50] = _mm_subs_epi16(input[13], input[50]); x1[14] = _mm_adds_epi16(input[14], input[49]); x1[49] = _mm_subs_epi16(input[14], input[49]); x1[15] = _mm_adds_epi16(input[15], input[48]); x1[48] = _mm_subs_epi16(input[15], input[48]); x1[16] = 
_mm_adds_epi16(input[16], input[47]); x1[47] = _mm_subs_epi16(input[16], input[47]); x1[17] = _mm_adds_epi16(input[17], input[46]); x1[46] = _mm_subs_epi16(input[17], input[46]); x1[18] = _mm_adds_epi16(input[18], input[45]); x1[45] = _mm_subs_epi16(input[18], input[45]); x1[19] = _mm_adds_epi16(input[19], input[44]); x1[44] = _mm_subs_epi16(input[19], input[44]); x1[20] = _mm_adds_epi16(input[20], input[43]); x1[43] = _mm_subs_epi16(input[20], input[43]); x1[21] = _mm_adds_epi16(input[21], input[42]); x1[42] = _mm_subs_epi16(input[21], input[42]); x1[22] = _mm_adds_epi16(input[22], input[41]); x1[41] = _mm_subs_epi16(input[22], input[41]); x1[23] = _mm_adds_epi16(input[23], input[40]); x1[40] = _mm_subs_epi16(input[23], input[40]); x1[24] = _mm_adds_epi16(input[24], input[39]); x1[39] = _mm_subs_epi16(input[24], input[39]); x1[25] = _mm_adds_epi16(input[25], input[38]); x1[38] = _mm_subs_epi16(input[25], input[38]); x1[26] = _mm_adds_epi16(input[26], input[37]); x1[37] = _mm_subs_epi16(input[26], input[37]); x1[27] = _mm_adds_epi16(input[27], input[36]); x1[36] = _mm_subs_epi16(input[27], input[36]); x1[28] = _mm_adds_epi16(input[28], input[35]); x1[35] = _mm_subs_epi16(input[28], input[35]); x1[29] = _mm_adds_epi16(input[29], input[34]); x1[34] = _mm_subs_epi16(input[29], input[34]); x1[30] = _mm_adds_epi16(input[30], input[33]); x1[33] = _mm_subs_epi16(input[30], input[33]); x1[31] = _mm_adds_epi16(input[31], input[32]); x1[32] = _mm_subs_epi16(input[31], input[32]); // stage 2 __m128i x2[64]; x2[0] = _mm_adds_epi16(x1[0], x1[31]); x2[31] = _mm_subs_epi16(x1[0], x1[31]); x2[1] = _mm_adds_epi16(x1[1], x1[30]); x2[30] = _mm_subs_epi16(x1[1], x1[30]); x2[2] = _mm_adds_epi16(x1[2], x1[29]); x2[29] = _mm_subs_epi16(x1[2], x1[29]); x2[3] = _mm_adds_epi16(x1[3], x1[28]); x2[28] = _mm_subs_epi16(x1[3], x1[28]); x2[4] = _mm_adds_epi16(x1[4], x1[27]); x2[27] = _mm_subs_epi16(x1[4], x1[27]); x2[5] = _mm_adds_epi16(x1[5], x1[26]); x2[26] = _mm_subs_epi16(x1[5], x1[26]); x2[6] = _mm_adds_epi16(x1[6], x1[25]); x2[25] = _mm_subs_epi16(x1[6], x1[25]); x2[7] = _mm_adds_epi16(x1[7], x1[24]); x2[24] = _mm_subs_epi16(x1[7], x1[24]); x2[8] = _mm_adds_epi16(x1[8], x1[23]); x2[23] = _mm_subs_epi16(x1[8], x1[23]); x2[9] = _mm_adds_epi16(x1[9], x1[22]); x2[22] = _mm_subs_epi16(x1[9], x1[22]); x2[10] = _mm_adds_epi16(x1[10], x1[21]); x2[21] = _mm_subs_epi16(x1[10], x1[21]); x2[11] = _mm_adds_epi16(x1[11], x1[20]); x2[20] = _mm_subs_epi16(x1[11], x1[20]); x2[12] = _mm_adds_epi16(x1[12], x1[19]); x2[19] = _mm_subs_epi16(x1[12], x1[19]); x2[13] = _mm_adds_epi16(x1[13], x1[18]); x2[18] = _mm_subs_epi16(x1[13], x1[18]); x2[14] = _mm_adds_epi16(x1[14], x1[17]); x2[17] = _mm_subs_epi16(x1[14], x1[17]); x2[15] = _mm_adds_epi16(x1[15], x1[16]); x2[16] = _mm_subs_epi16(x1[15], x1[16]); x2[32] = x1[32]; x2[33] = x1[33]; x2[34] = x1[34]; x2[35] = x1[35]; x2[36] = x1[36]; x2[37] = x1[37]; x2[38] = x1[38]; x2[39] = x1[39]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); x2[56] = 
x1[56]; x2[57] = x1[57]; x2[58] = x1[58]; x2[59] = x1[59]; x2[60] = x1[60]; x2[61] = x1[61]; x2[62] = x1[62]; x2[63] = x1[63]; // stage 3 __m128i x3[64]; x3[0] = _mm_adds_epi16(x2[0], x2[15]); x3[15] = _mm_subs_epi16(x2[0], x2[15]); x3[1] = _mm_adds_epi16(x2[1], x2[14]); x3[14] = _mm_subs_epi16(x2[1], x2[14]); x3[2] = _mm_adds_epi16(x2[2], x2[13]); x3[13] = _mm_subs_epi16(x2[2], x2[13]); x3[3] = _mm_adds_epi16(x2[3], x2[12]); x3[12] = _mm_subs_epi16(x2[3], x2[12]); x3[4] = _mm_adds_epi16(x2[4], x2[11]); x3[11] = _mm_subs_epi16(x2[4], x2[11]); x3[5] = _mm_adds_epi16(x2[5], x2[10]); x3[10] = _mm_subs_epi16(x2[5], x2[10]); x3[6] = _mm_adds_epi16(x2[6], x2[9]); x3[9] = _mm_subs_epi16(x2[6], x2[9]); x3[7] = _mm_adds_epi16(x2[7], x2[8]); x3[8] = _mm_subs_epi16(x2[7], x2[8]); x3[16] = x2[16]; x3[17] = x2[17]; x3[18] = x2[18]; x3[19] = x2[19]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); x3[28] = x2[28]; x3[29] = x2[29]; x3[30] = x2[30]; x3[31] = x2[31]; x3[32] = _mm_adds_epi16(x2[32], x2[47]); x3[47] = _mm_subs_epi16(x2[32], x2[47]); x3[33] = _mm_adds_epi16(x2[33], x2[46]); x3[46] = _mm_subs_epi16(x2[33], x2[46]); x3[34] = _mm_adds_epi16(x2[34], x2[45]); x3[45] = _mm_subs_epi16(x2[34], x2[45]); x3[35] = _mm_adds_epi16(x2[35], x2[44]); x3[44] = _mm_subs_epi16(x2[35], x2[44]); x3[36] = _mm_adds_epi16(x2[36], x2[43]); x3[43] = _mm_subs_epi16(x2[36], x2[43]); x3[37] = _mm_adds_epi16(x2[37], x2[42]); x3[42] = _mm_subs_epi16(x2[37], x2[42]); x3[38] = _mm_adds_epi16(x2[38], x2[41]); x3[41] = _mm_subs_epi16(x2[38], x2[41]); x3[39] = _mm_adds_epi16(x2[39], x2[40]); x3[40] = _mm_subs_epi16(x2[39], x2[40]); x3[48] = _mm_subs_epi16(x2[63], x2[48]); x3[63] = _mm_adds_epi16(x2[63], x2[48]); x3[49] = _mm_subs_epi16(x2[62], x2[49]); x3[62] = _mm_adds_epi16(x2[62], x2[49]); x3[50] = _mm_subs_epi16(x2[61], x2[50]); x3[61] = _mm_adds_epi16(x2[61], x2[50]); x3[51] = _mm_subs_epi16(x2[60], x2[51]); x3[60] = _mm_adds_epi16(x2[60], x2[51]); x3[52] = _mm_subs_epi16(x2[59], x2[52]); x3[59] = _mm_adds_epi16(x2[59], x2[52]); x3[53] = _mm_subs_epi16(x2[58], x2[53]); x3[58] = _mm_adds_epi16(x2[58], x2[53]); x3[54] = _mm_subs_epi16(x2[57], x2[54]); x3[57] = _mm_adds_epi16(x2[57], x2[54]); x3[55] = _mm_subs_epi16(x2[56], x2[55]); x3[56] = _mm_adds_epi16(x2[56], x2[55]); // stage 4 __m128i x4[64]; x4[0] = _mm_adds_epi16(x3[0], x3[7]); x4[7] = _mm_subs_epi16(x3[0], x3[7]); x4[1] = _mm_adds_epi16(x3[1], x3[6]); x4[6] = _mm_subs_epi16(x3[1], x3[6]); x4[2] = _mm_adds_epi16(x3[2], x3[5]); x4[5] = _mm_subs_epi16(x3[2], x3[5]); x4[3] = _mm_adds_epi16(x3[3], x3[4]); x4[4] = _mm_subs_epi16(x3[3], x3[4]); x4[8] = x3[8]; x4[9] = x3[9]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); x4[14] = x3[14]; x4[15] = x3[15]; x4[16] = _mm_adds_epi16(x3[16], x3[23]); x4[23] = _mm_subs_epi16(x3[16], x3[23]); x4[17] = _mm_adds_epi16(x3[17], x3[22]); x4[22] = _mm_subs_epi16(x3[17], x3[22]); x4[18] = _mm_adds_epi16(x3[18], x3[21]); x4[21] = _mm_subs_epi16(x3[18], x3[21]); x4[19] = _mm_adds_epi16(x3[19], x3[20]); x4[20] = _mm_subs_epi16(x3[19], x3[20]); x4[24] = _mm_subs_epi16(x3[31], x3[24]); x4[31] = _mm_adds_epi16(x3[31], x3[24]); x4[25] = _mm_subs_epi16(x3[30], x3[25]); x4[30] = 
_mm_adds_epi16(x3[30], x3[25]); x4[26] = _mm_subs_epi16(x3[29], x3[26]); x4[29] = _mm_adds_epi16(x3[29], x3[26]); x4[27] = _mm_subs_epi16(x3[28], x3[27]); x4[28] = _mm_adds_epi16(x3[28], x3[27]); x4[32] = x3[32]; x4[33] = x3[33]; x4[34] = x3[34]; x4[35] = x3[35]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); x4[44] = x3[44]; x4[45] = x3[45]; x4[46] = x3[46]; x4[47] = x3[47]; x4[48] = x3[48]; x4[49] = x3[49]; x4[50] = x3[50]; x4[51] = x3[51]; x4[60] = x3[60]; x4[61] = x3[61]; x4[62] = x3[62]; x4[63] = x3[63]; // stage 5 __m128i x5[64]; x5[0] = _mm_adds_epi16(x4[0], x4[3]); x5[3] = _mm_subs_epi16(x4[0], x4[3]); x5[1] = _mm_adds_epi16(x4[1], x4[2]); x5[2] = _mm_subs_epi16(x4[1], x4[2]); x5[4] = x4[4]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); x5[7] = x4[7]; x5[8] = _mm_adds_epi16(x4[8], x4[11]); x5[11] = _mm_subs_epi16(x4[8], x4[11]); x5[9] = _mm_adds_epi16(x4[9], x4[10]); x5[10] = _mm_subs_epi16(x4[9], x4[10]); x5[12] = _mm_subs_epi16(x4[15], x4[12]); x5[15] = _mm_adds_epi16(x4[15], x4[12]); x5[13] = _mm_subs_epi16(x4[14], x4[13]); x5[14] = _mm_adds_epi16(x4[14], x4[13]); x5[16] = x4[16]; x5[17] = x4[17]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); x5[22] = x4[22]; x5[23] = x4[23]; x5[24] = x4[24]; x5[25] = x4[25]; x5[30] = x4[30]; x5[31] = x4[31]; x5[32] = _mm_adds_epi16(x4[32], x4[39]); x5[39] = _mm_subs_epi16(x4[32], x4[39]); x5[33] = _mm_adds_epi16(x4[33], x4[38]); x5[38] = _mm_subs_epi16(x4[33], x4[38]); x5[34] = _mm_adds_epi16(x4[34], x4[37]); x5[37] = _mm_subs_epi16(x4[34], x4[37]); x5[35] = _mm_adds_epi16(x4[35], x4[36]); x5[36] = _mm_subs_epi16(x4[35], x4[36]); x5[40] = _mm_subs_epi16(x4[47], x4[40]); x5[47] = _mm_adds_epi16(x4[47], x4[40]); x5[41] = _mm_subs_epi16(x4[46], x4[41]); x5[46] = _mm_adds_epi16(x4[46], x4[41]); x5[42] = _mm_subs_epi16(x4[45], x4[42]); x5[45] = _mm_adds_epi16(x4[45], x4[42]); x5[43] = _mm_subs_epi16(x4[44], x4[43]); x5[44] = _mm_adds_epi16(x4[44], x4[43]); x5[48] = _mm_adds_epi16(x4[48], x4[55]); x5[55] = _mm_subs_epi16(x4[48], x4[55]); x5[49] = _mm_adds_epi16(x4[49], x4[54]); x5[54] = _mm_subs_epi16(x4[49], x4[54]); x5[50] = _mm_adds_epi16(x4[50], x4[53]); x5[53] = _mm_subs_epi16(x4[50], x4[53]); x5[51] = _mm_adds_epi16(x4[51], x4[52]); x5[52] = _mm_subs_epi16(x4[51], x4[52]); x5[56] = _mm_subs_epi16(x4[63], x4[56]); x5[63] = _mm_adds_epi16(x4[63], x4[56]); x5[57] = _mm_subs_epi16(x4[62], x4[57]); x5[62] = _mm_adds_epi16(x4[62], x4[57]); x5[58] = _mm_subs_epi16(x4[61], x4[58]); x5[61] = _mm_adds_epi16(x4[61], x4[58]); x5[59] = _mm_subs_epi16(x4[60], x4[59]); x5[60] = _mm_adds_epi16(x4[60], x4[59]); // stage 6 __m128i x6[64]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); btf_16_sse2(cospi_p48_p16, 
cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); x6[4] = _mm_adds_epi16(x5[4], x5[5]); x6[5] = _mm_subs_epi16(x5[4], x5[5]); x6[6] = _mm_subs_epi16(x5[7], x5[6]); x6[7] = _mm_adds_epi16(x5[7], x5[6]); x6[8] = x5[8]; btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); x6[11] = x5[11]; x6[12] = x5[12]; x6[15] = x5[15]; x6[16] = _mm_adds_epi16(x5[16], x5[19]); x6[19] = _mm_subs_epi16(x5[16], x5[19]); x6[17] = _mm_adds_epi16(x5[17], x5[18]); x6[18] = _mm_subs_epi16(x5[17], x5[18]); x6[20] = _mm_subs_epi16(x5[23], x5[20]); x6[23] = _mm_adds_epi16(x5[23], x5[20]); x6[21] = _mm_subs_epi16(x5[22], x5[21]); x6[22] = _mm_adds_epi16(x5[22], x5[21]); x6[24] = _mm_adds_epi16(x5[24], x5[27]); x6[27] = _mm_subs_epi16(x5[24], x5[27]); x6[25] = _mm_adds_epi16(x5[25], x5[26]); x6[26] = _mm_subs_epi16(x5[25], x5[26]); x6[28] = _mm_subs_epi16(x5[31], x5[28]); x6[31] = _mm_adds_epi16(x5[31], x5[28]); x6[29] = _mm_subs_epi16(x5[30], x5[29]); x6[30] = _mm_adds_epi16(x5[30], x5[29]); x6[32] = x5[32]; x6[33] = x5[33]; btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); x6[38] = x5[38]; x6[39] = x5[39]; x6[40] = x5[40]; x6[41] = x5[41]; btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); x6[46] = x5[46]; x6[47] = x5[47]; x6[48] = x5[48]; x6[49] = x5[49]; x6[54] = x5[54]; x6[55] = x5[55]; x6[56] = x5[56]; x6[57] = x5[57]; x6[62] = x5[62]; x6[63] = x5[63]; // stage 7 __m128i x7[64]; x7[0] = x6[0]; x7[1] = x6[1]; x7[2] = x6[2]; x7[3] = x6[3]; btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); x7[8] = _mm_adds_epi16(x6[8], x6[9]); x7[9] = _mm_subs_epi16(x6[8], x6[9]); x7[10] = _mm_subs_epi16(x6[11], x6[10]); x7[11] = _mm_adds_epi16(x6[11], x6[10]); x7[12] = _mm_adds_epi16(x6[12], x6[13]); x7[13] = _mm_subs_epi16(x6[12], x6[13]); x7[14] = _mm_subs_epi16(x6[15], x6[14]); x7[15] = _mm_adds_epi16(x6[15], x6[14]); x7[16] = x6[16]; btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); x7[19] = x6[19]; x7[20] = x6[20]; btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); x7[23] = x6[23]; x7[24] = x6[24]; x7[27] = x6[27]; x7[28] = x6[28]; x7[31] = x6[31]; x7[32] = _mm_adds_epi16(x6[32], x6[35]); x7[35] = _mm_subs_epi16(x6[32], x6[35]); x7[33] = _mm_adds_epi16(x6[33], x6[34]); x7[34] = _mm_subs_epi16(x6[33], x6[34]); x7[36] = _mm_subs_epi16(x6[39], x6[36]); x7[39] = _mm_adds_epi16(x6[39], x6[36]); x7[37] = _mm_subs_epi16(x6[38], x6[37]); x7[38] = _mm_adds_epi16(x6[38], x6[37]); x7[40] = _mm_adds_epi16(x6[40], x6[43]); x7[43] = _mm_subs_epi16(x6[40], x6[43]); x7[41] = _mm_adds_epi16(x6[41], x6[42]); x7[42] = _mm_subs_epi16(x6[41], x6[42]); x7[44] = _mm_subs_epi16(x6[47], x6[44]); x7[47] = _mm_adds_epi16(x6[47], x6[44]); x7[45] = _mm_subs_epi16(x6[46], x6[45]); x7[46] = 
_mm_adds_epi16(x6[46], x6[45]); x7[48] = _mm_adds_epi16(x6[48], x6[51]); x7[51] = _mm_subs_epi16(x6[48], x6[51]); x7[49] = _mm_adds_epi16(x6[49], x6[50]); x7[50] = _mm_subs_epi16(x6[49], x6[50]); x7[52] = _mm_subs_epi16(x6[55], x6[52]); x7[55] = _mm_adds_epi16(x6[55], x6[52]); x7[53] = _mm_subs_epi16(x6[54], x6[53]); x7[54] = _mm_adds_epi16(x6[54], x6[53]); x7[56] = _mm_adds_epi16(x6[56], x6[59]); x7[59] = _mm_subs_epi16(x6[56], x6[59]); x7[57] = _mm_adds_epi16(x6[57], x6[58]); x7[58] = _mm_subs_epi16(x6[57], x6[58]); x7[60] = _mm_subs_epi16(x6[63], x6[60]); x7[63] = _mm_adds_epi16(x6[63], x6[60]); x7[61] = _mm_subs_epi16(x6[62], x6[61]); x7[62] = _mm_adds_epi16(x6[62], x6[61]); // stage 8 __m128i x8[64]; x8[0] = x7[0]; x8[1] = x7[1]; x8[2] = x7[2]; x8[3] = x7[3]; x8[4] = x7[4]; x8[5] = x7[5]; x8[6] = x7[6]; x8[7] = x7[7]; btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); x8[16] = _mm_adds_epi16(x7[16], x7[17]); x8[17] = _mm_subs_epi16(x7[16], x7[17]); x8[18] = _mm_subs_epi16(x7[19], x7[18]); x8[19] = _mm_adds_epi16(x7[19], x7[18]); x8[20] = _mm_adds_epi16(x7[20], x7[21]); x8[21] = _mm_subs_epi16(x7[20], x7[21]); x8[22] = _mm_subs_epi16(x7[23], x7[22]); x8[23] = _mm_adds_epi16(x7[23], x7[22]); x8[24] = _mm_adds_epi16(x7[24], x7[25]); x8[25] = _mm_subs_epi16(x7[24], x7[25]); x8[26] = _mm_subs_epi16(x7[27], x7[26]); x8[27] = _mm_adds_epi16(x7[27], x7[26]); x8[28] = _mm_adds_epi16(x7[28], x7[29]); x8[29] = _mm_subs_epi16(x7[28], x7[29]); x8[30] = _mm_subs_epi16(x7[31], x7[30]); x8[31] = _mm_adds_epi16(x7[31], x7[30]); x8[32] = x7[32]; btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); x8[35] = x7[35]; x8[36] = x7[36]; btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); x8[39] = x7[39]; x8[40] = x7[40]; btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); x8[43] = x7[43]; x8[44] = x7[44]; btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); x8[47] = x7[47]; x8[48] = x7[48]; x8[51] = x7[51]; x8[52] = x7[52]; x8[55] = x7[55]; x8[56] = x7[56]; x8[59] = x7[59]; x8[60] = x7[60]; x8[63] = x7[63]; // stage 9 __m128i x9[64]; x9[0] = x8[0]; x9[1] = x8[1]; x9[2] = x8[2]; x9[3] = x8[3]; x9[4] = x8[4]; x9[5] = x8[5]; x9[6] = x8[6]; x9[7] = x8[7]; x9[8] = x8[8]; x9[9] = x8[9]; x9[10] = x8[10]; x9[11] = x8[11]; x9[12] = x8[12]; x9[13] = x8[13]; x9[14] = x8[14]; x9[15] = x8[15]; btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]); btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); x9[32] = 
_mm_adds_epi16(x8[32], x8[33]); x9[33] = _mm_subs_epi16(x8[32], x8[33]); x9[34] = _mm_subs_epi16(x8[35], x8[34]); x9[35] = _mm_adds_epi16(x8[35], x8[34]); x9[36] = _mm_adds_epi16(x8[36], x8[37]); x9[37] = _mm_subs_epi16(x8[36], x8[37]); x9[38] = _mm_subs_epi16(x8[39], x8[38]); x9[39] = _mm_adds_epi16(x8[39], x8[38]); x9[40] = _mm_adds_epi16(x8[40], x8[41]); x9[41] = _mm_subs_epi16(x8[40], x8[41]); x9[42] = _mm_subs_epi16(x8[43], x8[42]); x9[43] = _mm_adds_epi16(x8[43], x8[42]); x9[44] = _mm_adds_epi16(x8[44], x8[45]); x9[45] = _mm_subs_epi16(x8[44], x8[45]); x9[46] = _mm_subs_epi16(x8[47], x8[46]); x9[47] = _mm_adds_epi16(x8[47], x8[46]); x9[48] = _mm_adds_epi16(x8[48], x8[49]); x9[49] = _mm_subs_epi16(x8[48], x8[49]); x9[50] = _mm_subs_epi16(x8[51], x8[50]); x9[51] = _mm_adds_epi16(x8[51], x8[50]); x9[52] = _mm_adds_epi16(x8[52], x8[53]); x9[53] = _mm_subs_epi16(x8[52], x8[53]); x9[54] = _mm_subs_epi16(x8[55], x8[54]); x9[55] = _mm_adds_epi16(x8[55], x8[54]); x9[56] = _mm_adds_epi16(x8[56], x8[57]); x9[57] = _mm_subs_epi16(x8[56], x8[57]); x9[58] = _mm_subs_epi16(x8[59], x8[58]); x9[59] = _mm_adds_epi16(x8[59], x8[58]); x9[60] = _mm_adds_epi16(x8[60], x8[61]); x9[61] = _mm_subs_epi16(x8[60], x8[61]); x9[62] = _mm_subs_epi16(x8[63], x8[62]); x9[63] = _mm_adds_epi16(x8[63], x8[62]); // stage 10 __m128i x10[64]; x10[0] = x9[0]; x10[1] = x9[1]; x10[2] = x9[2]; x10[3] = x9[3]; x10[4] = x9[4]; x10[5] = x9[5]; x10[6] = x9[6]; x10[7] = x9[7]; x10[8] = x9[8]; x10[9] = x9[9]; x10[10] = x9[10]; x10[11] = x9[11]; x10[12] = x9[12]; x10[13] = x9[13]; x10[14] = x9[14]; x10[15] = x9[15]; x10[16] = x9[16]; x10[17] = x9[17]; x10[18] = x9[18]; x10[19] = x9[19]; x10[20] = x9[20]; x10[21] = x9[21]; x10[22] = x9[22]; x10[23] = x9[23]; x10[24] = x9[24]; x10[25] = x9[25]; x10[26] = x9[26]; x10[27] = x9[27]; x10[28] = x9[28]; x10[29] = x9[29]; x10[30] = x9[30]; x10[31] = x9[31]; btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); // stage 11 output[0] = x10[0]; output[1] = x10[32]; output[2] = x10[16]; output[3] = x10[48]; output[4] = x10[8]; output[5] = x10[40]; output[6] = x10[24]; output[7] = x10[56]; output[8] = x10[4]; output[9] = x10[36]; output[10] = x10[20]; output[11] = x10[52]; output[12] = x10[12]; output[13] = x10[44]; output[14] = x10[28]; output[15] = x10[60]; output[16] = x10[2]; output[17] = x10[34]; output[18] = x10[18]; output[19] = 
x10[50]; output[20] = x10[10]; output[21] = x10[42]; output[22] = x10[26]; output[23] = x10[58]; output[24] = x10[6]; output[25] = x10[38]; output[26] = x10[22]; output[27] = x10[54]; output[28] = x10[14]; output[29] = x10[46]; output[30] = x10[30]; output[31] = x10[62]; output[32] = x10[1]; output[33] = x10[33]; output[34] = x10[17]; output[35] = x10[49]; output[36] = x10[9]; output[37] = x10[41]; output[38] = x10[25]; output[39] = x10[57]; output[40] = x10[5]; output[41] = x10[37]; output[42] = x10[21]; output[43] = x10[53]; output[44] = x10[13]; output[45] = x10[45]; output[46] = x10[29]; output[47] = x10[61]; output[48] = x10[3]; output[49] = x10[35]; output[50] = x10[19]; output[51] = x10[51]; output[52] = x10[11]; output[53] = x10[43]; output[54] = x10[27]; output[55] = x10[59]; output[56] = x10[7]; output[57] = x10[39]; output[58] = x10[23]; output[59] = x10[55]; output[60] = x10[15]; output[61] = x10[47]; output[62] = x10[31]; output[63] = x10[63]; } static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *sinpi = sinpi_arr(cos_bit); const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); const __m128i in7 = _mm_add_epi16(input[0], input[1]); __m128i u[8], v[8]; u[0] = _mm_unpacklo_epi16(input[0], input[1]); u[1] = _mm_unpacklo_epi16(input[2], input[3]); u[2] = _mm_unpacklo_epi16(in7, __zero); u[3] = _mm_unpacklo_epi16(input[2], __zero); u[4] = _mm_unpacklo_epi16(input[3], __zero); v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); u[0] = _mm_add_epi32(v[0], v[1]); u[1] = _mm_sub_epi32(v[2], v[6]); u[2] = _mm_add_epi32(v[3], v[4]); u[3] = _mm_sub_epi32(u[2], u[0]); u[4] = _mm_slli_epi32(v[5], 2); u[5] = _mm_sub_epi32(u[4], v[5]); u[6] = _mm_add_epi32(u[3], u[5]); v[0] = _mm_add_epi32(u[0], __rounding); v[1] = _mm_add_epi32(u[1], __rounding); v[2] = _mm_add_epi32(u[2], __rounding); v[3] = _mm_add_epi32(u[6], __rounding); u[0] = _mm_srai_epi32(v[0], cos_bit); u[1] = _mm_srai_epi32(v[1], cos_bit); u[2] = _mm_srai_epi32(v[2], cos_bit); u[3] = _mm_srai_epi32(v[3], cos_bit); output[0] = _mm_packs_epi32(u[0], u[2]); output[1] = _mm_packs_epi32(u[1], u[3]); output[2] = _mm_srli_si128(output[0], 8); output[3] = _mm_srli_si128(output[1], 8); } static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); __m128i cospi_p60_m04 
= pair_set_epi16(cospi[60], -cospi[4]); __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); // stage 1 __m128i x1[8]; x1[0] = input[0]; x1[1] = _mm_subs_epi16(__zero, input[7]); x1[2] = _mm_subs_epi16(__zero, input[3]); x1[3] = input[4]; x1[4] = _mm_subs_epi16(__zero, input[1]); x1[5] = input[6]; x1[6] = input[2]; x1[7] = _mm_subs_epi16(__zero, input[5]); // stage 2 __m128i x2[8]; x2[0] = x1[0]; x2[1] = x1[1]; btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], &x1[3], &x2[2], &x2[3]); x2[4] = x1[4]; x2[5] = x1[5]; btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], &x1[7], &x2[6], &x2[7]); // stage 3 __m128i x3[8]; x3[0] = _mm_adds_epi16(x2[0], x2[2]); x3[2] = _mm_subs_epi16(x2[0], x2[2]); x3[1] = _mm_adds_epi16(x2[1], x2[3]); x3[3] = _mm_subs_epi16(x2[1], x2[3]); x3[4] = _mm_adds_epi16(x2[4], x2[6]); x3[6] = _mm_subs_epi16(x2[4], x2[6]); x3[5] = _mm_adds_epi16(x2[5], x2[7]); x3[7] = _mm_subs_epi16(x2[5], x2[7]); // stage 4 __m128i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], &x3[5], &x4[4], &x4[5]); btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], &x3[7], &x4[6], &x4[7]); // stage 5 __m128i x5[8]; x5[0] = _mm_adds_epi16(x4[0], x4[4]); x5[4] = _mm_subs_epi16(x4[0], x4[4]); x5[1] = _mm_adds_epi16(x4[1], x4[5]); x5[5] = _mm_subs_epi16(x4[1], x4[5]); x5[2] = _mm_adds_epi16(x4[2], x4[6]); x5[6] = _mm_subs_epi16(x4[2], x4[6]); x5[3] = _mm_adds_epi16(x4[3], x4[7]); x5[7] = _mm_subs_epi16(x4[3], x4[7]); // stage 6 __m128i x6[8]; btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], &x5[1], &x6[0], &x6[1]); btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], &x5[3], &x6[2], &x6[3]); btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], &x5[5], &x6[4], &x6[5]); btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], &x5[7], &x6[6], &x6[7]); // stage 7 output[0] = x6[1]; output[1] = x6[6]; output[2] = x6[3]; output[3] = x6[4]; output[4] = x6[5]; output[5] = x6[2]; output[6] = x6[7]; output[7] = x6[0]; } static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *sinpi = sinpi_arr(cos_bit); const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); const __m128i in7 = _mm_add_epi16(input[0], input[1]); __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); u_lo[2] = _mm_unpacklo_epi16(in7, __zero); u_hi[2] = _mm_unpackhi_epi16(in7, __zero); u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); u_hi[3] = _mm_unpackhi_epi16(input[2], 
__zero); u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); u_lo[4] = _mm_slli_epi32(v_lo[5], 2); u_hi[4] = _mm_slli_epi32(v_hi[5], 2); u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]); output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); } static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); __m128i cospi_p54_m10 = 
pair_set_epi16(cospi[54], -cospi[10]); __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); // stage 1 __m128i x1[16]; x1[0] = input[0]; x1[1] = _mm_subs_epi16(__zero, input[15]); x1[2] = _mm_subs_epi16(__zero, input[7]); x1[3] = input[8]; x1[4] = _mm_subs_epi16(__zero, input[3]); x1[5] = input[12]; x1[6] = input[4]; x1[7] = _mm_subs_epi16(__zero, input[11]); x1[8] = _mm_subs_epi16(__zero, input[1]); x1[9] = input[14]; x1[10] = input[6]; x1[11] = _mm_subs_epi16(__zero, input[9]); x1[12] = input[2]; x1[13] = _mm_subs_epi16(__zero, input[13]); x1[14] = _mm_subs_epi16(__zero, input[5]); x1[15] = input[10]; // stage 2 __m128i x2[16]; x2[0] = x1[0]; x2[1] = x1[1]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); x2[4] = x1[4]; x2[5] = x1[5]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); x2[8] = x1[8]; x2[9] = x1[9]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); x2[12] = x1[12]; x2[13] = x1[13]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); // stage 3 __m128i x3[16]; x3[0] = _mm_adds_epi16(x2[0], x2[2]); x3[2] = _mm_subs_epi16(x2[0], x2[2]); x3[1] = _mm_adds_epi16(x2[1], x2[3]); x3[3] = _mm_subs_epi16(x2[1], x2[3]); x3[4] = _mm_adds_epi16(x2[4], x2[6]); x3[6] = _mm_subs_epi16(x2[4], x2[6]); x3[5] = _mm_adds_epi16(x2[5], x2[7]); x3[7] = _mm_subs_epi16(x2[5], x2[7]); x3[8] = _mm_adds_epi16(x2[8], x2[10]); x3[10] = _mm_subs_epi16(x2[8], x2[10]); x3[9] = _mm_adds_epi16(x2[9], x2[11]); x3[11] = _mm_subs_epi16(x2[9], x2[11]); x3[12] = _mm_adds_epi16(x2[12], x2[14]); x3[14] = _mm_subs_epi16(x2[12], x2[14]); x3[13] = _mm_adds_epi16(x2[13], x2[15]); x3[15] = _mm_subs_epi16(x2[13], x2[15]); // stage 4 __m128i x4[16]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); x4[8] = x3[8]; x4[9] = x3[9]; x4[10] = x3[10]; x4[11] = x3[11]; btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); // stage 5 __m128i x5[16]; x5[0] = _mm_adds_epi16(x4[0], x4[4]); x5[4] = _mm_subs_epi16(x4[0], x4[4]); x5[1] = _mm_adds_epi16(x4[1], x4[5]); x5[5] = _mm_subs_epi16(x4[1], x4[5]); x5[2] = _mm_adds_epi16(x4[2], x4[6]); x5[6] = _mm_subs_epi16(x4[2], x4[6]); x5[3] = _mm_adds_epi16(x4[3], x4[7]); x5[7] = _mm_subs_epi16(x4[3], x4[7]); x5[8] = _mm_adds_epi16(x4[8], x4[12]); x5[12] = _mm_subs_epi16(x4[8], x4[12]); x5[9] = _mm_adds_epi16(x4[9], x4[13]); x5[13] = _mm_subs_epi16(x4[9], x4[13]); x5[10] = _mm_adds_epi16(x4[10], x4[14]); x5[14] = _mm_subs_epi16(x4[10], x4[14]); x5[11] = _mm_adds_epi16(x4[11], x4[15]); x5[15] = _mm_subs_epi16(x4[11], x4[15]); // stage 6 __m128i x6[16]; x6[0] = x5[0]; x6[1] = x5[1]; 
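// Stage 6: x6[0..7] are pass-through copies; x6[8..15] below are rotated by butterflies built from the (cospi[8], cospi[56]) and (cospi[40], cospi[24]) pairs.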
x6[2] = x5[2]; x6[3] = x5[3]; x6[4] = x5[4]; x6[5] = x5[5]; x6[6] = x5[6]; x6[7] = x5[7]; btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); // stage 7 __m128i x7[16]; x7[0] = _mm_adds_epi16(x6[0], x6[8]); x7[8] = _mm_subs_epi16(x6[0], x6[8]); x7[1] = _mm_adds_epi16(x6[1], x6[9]); x7[9] = _mm_subs_epi16(x6[1], x6[9]); x7[2] = _mm_adds_epi16(x6[2], x6[10]); x7[10] = _mm_subs_epi16(x6[2], x6[10]); x7[3] = _mm_adds_epi16(x6[3], x6[11]); x7[11] = _mm_subs_epi16(x6[3], x6[11]); x7[4] = _mm_adds_epi16(x6[4], x6[12]); x7[12] = _mm_subs_epi16(x6[4], x6[12]); x7[5] = _mm_adds_epi16(x6[5], x6[13]); x7[13] = _mm_subs_epi16(x6[5], x6[13]); x7[6] = _mm_adds_epi16(x6[6], x6[14]); x7[14] = _mm_subs_epi16(x6[6], x6[14]); x7[7] = _mm_adds_epi16(x6[7], x6[15]); x7[15] = _mm_subs_epi16(x6[7], x6[15]); // stage 8 __m128i x8[16]; btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); // stage 9 output[0] = x8[1]; output[1] = x8[14]; output[2] = x8[3]; output[3] = x8[12]; output[4] = x8[5]; output[5] = x8[10]; output[6] = x8[7]; output[7] = x8[8]; output[8] = x8[9]; output[9] = x8[6]; output[10] = x8[11]; output[11] = x8[4]; output[12] = x8[13]; output[13] = x8[2]; output[14] = x8[15]; output[15] = x8[0]; } static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { fdct4x4_new_sse2, // DCT_DCT fadst4x4_new_sse2, // ADST_DCT fdct4x4_new_sse2, // DCT_ADST fadst4x4_new_sse2, // ADST_ADST fadst4x4_new_sse2, // FLIPADST_DCT fdct4x4_new_sse2, // DCT_FLIPADST fadst4x4_new_sse2, // FLIPADST_FLIPADST fadst4x4_new_sse2, // ADST_FLIPADST fadst4x4_new_sse2, // FLIPADST_ADST fidentity4x4_new_sse2, // IDTX fdct4x4_new_sse2, // V_DCT fidentity4x4_new_sse2, // H_DCT fadst4x4_new_sse2, // V_ADST fidentity4x4_new_sse2, // H_ADST fadst4x4_new_sse2, // V_FLIPADST fidentity4x4_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { fdct4x4_new_sse2, // DCT_DCT fdct4x4_new_sse2, // ADST_DCT fadst4x4_new_sse2, // DCT_ADST fadst4x4_new_sse2, // ADST_ADST fdct4x4_new_sse2, // FLIPADST_DCT fadst4x4_new_sse2, // DCT_FLIPADST fadst4x4_new_sse2, // FLIPADST_FLIPADST fadst4x4_new_sse2, // ADST_FLIPADST fadst4x4_new_sse2, // FLIPADST_ADST fidentity4x4_new_sse2, // IDTX fidentity4x4_new_sse2, // V_DCT fdct4x4_new_sse2, // H_DCT fidentity4x4_new_sse2, // V_ADST fadst4x4_new_sse2, // H_ADST fidentity4x4_new_sse2, // V_FLIPADST fadst4x4_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { fdct4x8_new_sse2, // DCT_DCT fadst4x8_new_sse2, // ADST_DCT fdct4x8_new_sse2, // DCT_ADST fadst4x8_new_sse2, // ADST_ADST fadst4x8_new_sse2, // FLIPADST_DCT fdct4x8_new_sse2, // DCT_FLIPADST fadst4x8_new_sse2, // FLIPADST_FLIPADST fadst4x8_new_sse2, // ADST_FLIPADST fadst4x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX 
fdct4x8_new_sse2, // V_DCT fidentity8x8_new_sse2, // H_DCT fadst4x8_new_sse2, // V_ADST fidentity8x8_new_sse2, // H_ADST fadst4x8_new_sse2, // V_FLIPADST fidentity8x8_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { fdct8x4_new_sse2, // DCT_DCT fdct8x4_new_sse2, // ADST_DCT fadst8x4_new_sse2, // DCT_ADST fadst8x4_new_sse2, // ADST_ADST fdct8x4_new_sse2, // FLIPADST_DCT fadst8x4_new_sse2, // DCT_FLIPADST fadst8x4_new_sse2, // FLIPADST_FLIPADST fadst8x4_new_sse2, // ADST_FLIPADST fadst8x4_new_sse2, // FLIPADST_ADST fidentity8x4_new_sse2, // IDTX fidentity8x4_new_sse2, // V_DCT fdct8x4_new_sse2, // H_DCT fidentity8x4_new_sse2, // V_ADST fadst8x4_new_sse2, // H_ADST fidentity8x4_new_sse2, // V_FLIPADST fadst8x4_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { fdct8x4_new_sse2, // DCT_DCT fadst8x4_new_sse2, // ADST_DCT fdct8x4_new_sse2, // DCT_ADST fadst8x4_new_sse2, // ADST_ADST fadst8x4_new_sse2, // FLIPADST_DCT fdct8x4_new_sse2, // DCT_FLIPADST fadst8x4_new_sse2, // FLIPADST_FLIPADST fadst8x4_new_sse2, // ADST_FLIPADST fadst8x4_new_sse2, // FLIPADST_ADST fidentity8x4_new_sse2, // IDTX fdct8x4_new_sse2, // V_DCT fidentity8x4_new_sse2, // H_DCT fadst8x4_new_sse2, // V_ADST fidentity8x4_new_sse2, // H_ADST fadst8x4_new_sse2, // V_FLIPADST fidentity8x4_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { fdct4x8_new_sse2, // DCT_DCT fdct4x8_new_sse2, // ADST_DCT fadst4x8_new_sse2, // DCT_ADST fadst4x8_new_sse2, // ADST_ADST fdct4x8_new_sse2, // FLIPADST_DCT fadst4x8_new_sse2, // DCT_FLIPADST fadst4x8_new_sse2, // FLIPADST_FLIPADST fadst4x8_new_sse2, // ADST_FLIPADST fadst4x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX fidentity8x8_new_sse2, // V_DCT fdct4x8_new_sse2, // H_DCT fidentity8x8_new_sse2, // V_ADST fadst4x8_new_sse2, // H_ADST fidentity8x8_new_sse2, // V_FLIPADST fadst4x8_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { fdct8x8_new_sse2, // DCT_DCT fadst8x8_new_sse2, // ADST_DCT fdct8x8_new_sse2, // DCT_ADST fadst8x8_new_sse2, // ADST_ADST fadst8x8_new_sse2, // FLIPADST_DCT fdct8x8_new_sse2, // DCT_FLIPADST fadst8x8_new_sse2, // FLIPADST_FLIPADST fadst8x8_new_sse2, // ADST_FLIPADST fadst8x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX fdct8x8_new_sse2, // V_DCT fidentity8x8_new_sse2, // H_DCT fadst8x8_new_sse2, // V_ADST fidentity8x8_new_sse2, // H_ADST fadst8x8_new_sse2, // V_FLIPADST fidentity8x8_new_sse2, // H_FLIPADST }; static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { fdct8x8_new_sse2, // DCT_DCT fdct8x8_new_sse2, // ADST_DCT fadst8x8_new_sse2, // DCT_ADST fadst8x8_new_sse2, // ADST_ADST fdct8x8_new_sse2, // FLIPADST_DCT fadst8x8_new_sse2, // DCT_FLIPADST fadst8x8_new_sse2, // FLIPADST_FLIPADST fadst8x8_new_sse2, // ADST_FLIPADST fadst8x8_new_sse2, // FLIPADST_ADST fidentity8x8_new_sse2, // IDTX fidentity8x8_new_sse2, // V_DCT fdct8x8_new_sse2, // H_DCT fidentity8x8_new_sse2, // V_ADST fadst8x8_new_sse2, // H_ADST fidentity8x8_new_sse2, // V_FLIPADST fadst8x8_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { fdct8x16_new_sse2, // DCT_DCT fadst8x16_new_sse2, // ADST_DCT fdct8x16_new_sse2, // DCT_ADST fadst8x16_new_sse2, // ADST_ADST fadst8x16_new_sse2, // FLIPADST_DCT fdct8x16_new_sse2, // DCT_FLIPADST fadst8x16_new_sse2, // FLIPADST_FLIPADST fadst8x16_new_sse2, // ADST_FLIPADST fadst8x16_new_sse2, // FLIPADST_ADST fidentity8x16_new_sse2, // IDTX 
fdct8x16_new_sse2, // V_DCT fidentity8x16_new_sse2, // H_DCT fadst8x16_new_sse2, // V_ADST fidentity8x16_new_sse2, // H_ADST fadst8x16_new_sse2, // V_FLIPADST fidentity8x16_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { fdct8x16_new_sse2, // DCT_DCT fdct8x16_new_sse2, // ADST_DCT fadst8x16_new_sse2, // DCT_ADST fadst8x16_new_sse2, // ADST_ADST fdct8x16_new_sse2, // FLIPADST_DCT fadst8x16_new_sse2, // DCT_FLIPADST fadst8x16_new_sse2, // FLIPADST_FLIPADST fadst8x16_new_sse2, // ADST_FLIPADST fadst8x16_new_sse2, // FLIPADST_ADST fidentity8x16_new_sse2, // IDTX fidentity8x16_new_sse2, // V_DCT fdct8x16_new_sse2, // H_DCT fidentity8x16_new_sse2, // V_ADST fadst8x16_new_sse2, // H_ADST fidentity8x16_new_sse2, // V_FLIPADST fadst8x16_new_sse2 // H_FLIPADST }; static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { av1_fdct8x32_new_sse2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity8x32_new_sse2, // IDTX fidentity8x32_new_sse2, // V_DCT av1_fdct8x32_new_sse2, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[4], buf1[4], *buf; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; const int txw_idx = get_txw_idx(TX_4X4); const int txh_idx = get_txh_idx(TX_4X4); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 4; const int height = 4; const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_4x4(buf0, buf1); if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w4(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)stride; (void)bd; __m128i buf0[8], buf1[8], *buf; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; const int txw_idx = get_txw_idx(TX_4X8); const int txh_idx = get_txh_idx(TX_4X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 4; const int height = 8; const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_4x8(buf0, buf1); if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); 
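// The rect store helper below also folds in the extra sqrt(2) rounding used for 1:2 rectangular transform sizes while widening the 16-bit results to 32-bit coefficients.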
store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; const int txw_idx = get_txw_idx(TX_4X16); const int txh_idx = get_txh_idx(TX_4X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 4; const int height = 16; const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_4x8(buf0, buf1); transpose_16bit_4x8(buf0 + 8, buf1 + 8); for (int i = 0; i < 2; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + 8 * i, buf, width); } else { buf = buf1 + 8 * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[8], buf1[8], *buf; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; const int txw_idx = get_txw_idx(TX_8X4); const int txh_idx = get_txh_idx(TX_8X4); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 8; const int height = 4; const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); else load_buffer_16bit_to_16bit(input, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1); if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[8], buf1[8], *buf; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; const int txw_idx = get_txw_idx(TX_8X8); const int txh_idx = get_txh_idx(TX_8X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 8; const int height = 8; const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); else load_buffer_16bit_to_16bit(input, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1); if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, 
cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; const int txw_idx = get_txw_idx(TX_8X16); const int txh_idx = get_txh_idx(TX_8X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 8; const int height = 16; const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1); transpose_16bit_8x8(buf0 + 8, buf1 + 8); for (int i = 0; i < 2; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[32], buf1[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; const int txw_idx = get_txw_idx(TX_8X32); const int txh_idx = get_txh_idx(TX_8X32); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 8; const int height = 32; const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); if (ud_flip) { load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1); transpose_16bit_8x8(buf0 + 8, buf1 + 8); transpose_16bit_8x8(buf0 + 16, buf1 + 16); transpose_16bit_8x8(buf0 + 24, buf1 + 24); for (int i = 0; i < 4; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; const int txw_idx = get_txw_idx(TX_16X4); const int txh_idx = get_txh_idx(TX_16X4); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 16; const int height = 4; const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; __m128i *buf; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); 
} else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x4(buf0, buf1 + 8 * i); } if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w4(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; const int txw_idx = get_txw_idx(TX_16X8); const int txh_idx = get_txh_idx(TX_16X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 16; const int height = 8; const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; __m128i *buf; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1 + 8 * i); } if (lr_flip) { buf = buf0; flip_buf_sse2(buf1, buf, width); } else { buf = buf1; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); } void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[16], buf1[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; const int txw_idx = get_txw_idx(TX_16X16); const int txh_idx = get_txh_idx(TX_16X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 16; const int height = 16; const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); } for (int i = 0; i < 2; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[32], buf1[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; const int txw_idx = get_txw_idx(TX_16X32); const int txh_idx = get_txh_idx(TX_16X32); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 16; const int height = 32; const 
transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); } for (int i = 0; i < 4; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } else { av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); } } void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[32], buf1[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; const int txw_idx = get_txw_idx(TX_32X8); const int txh_idx = get_txh_idx(TX_32X8); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 32; const int height = 8; const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 4; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); } for (int i = 0; i < 1; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } else { av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); } } void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[32], buf1[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; const int txw_idx = get_txw_idx(TX_32X16); const int txh_idx = get_txh_idx(TX_32X16); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 32; const int height = 16; const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 4; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); 
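// shift[1] applies the intermediate rounding between the column and row passes, keeping the 16-bit lanes in range before the transpose and the row transform.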
round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); } for (int i = 0; i < 2; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } else { av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); } } void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m128i buf0[32], buf1[128]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32]; const int txw_idx = get_txw_idx(TX_32X32); const int txh_idx = get_txh_idx(TX_32X32); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = 32; const int height = 32; const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; if (col_txfm != NULL && row_txfm != NULL) { int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 4; i++) { if (ud_flip) { load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); } else { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); } round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); } for (int i = 0; i < 4; i++) { __m128i *buf; if (lr_flip) { buf = buf0; flip_buf_sse2(buf1 + width * i, buf, width); } else { buf = buf1 + width * i; } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); } } else { av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); } } void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_64X16; __m128i buf0[64], buf1[128]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2; const int width_div8 = (width >> 3); const int height_div8 = (height >> 3); for (int i = 0; i < width_div8; i++) { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); for (int j = 0; j < height_div8; ++j) { transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); } } for (int i = 0; i < height_div8; i++) { __m128i *buf = buf1 + width * i; row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32); } // Zero out the bottom 16x32 area. 
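// Only 16x32 coefficients are produced for this 64x16 block (the 64-point row transform keeps just its 32 lowest-frequency outputs), so the second half of the 1024-entry coefficient buffer is cleared below.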
memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); } void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_16X64; __m128i buf0[64], buf1[128]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; const int width_div8 = (width >> 3); const int height_div8 = (height >> 3); for (int i = 0; i < width_div8; i++) { load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); round_shift_16bit(buf0, height, shift[0]); col_txfm(buf0, buf0, cos_bit_col); round_shift_16bit(buf0, height, shift[1]); for (int j = 0; j < height_div8; ++j) { transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); } } for (int i = 0; i < AOMMIN(4, height_div8); i++) { __m128i *buf = buf1 + width * i; row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, width, shift[2]); store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16); } } // Include top-level function only for 32-bit x86, to support Valgrind. // For normal use, we require SSE4.1, so av1_lowbd_fwd_txfm_sse4_1 will be used // instead of this function. However, 32-bit Valgrind does not support SSE4.1, // so we include a fallback to SSE2 to improve performance #if AOM_ARCH_X86 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform NULL, // 64x64 transform av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform NULL, // 32x64 transform NULL, // 64x32 transform av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform }; void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param) { FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; if ((fwd_txfm2d_func == NULL) || (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); else fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } #endif // AOM_ARCH_X86 aom-3.12.1/av1/encoder/x86/av1_fwd_txfm_sse2.h000066400000000000000000000224731477627663500206360ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ #define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ #include #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/transpose_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h" #ifdef __cplusplus extern "C" { #endif void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit); static inline void fidentity4x4_new_sse2(const __m128i *const input, __m128i *const output, const int8_t cos_bit) { (void)cos_bit; const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 4; ++i) { const __m128i a = _mm_unpacklo_epi16(input[i], one); const __m128i b = scale_round_sse2(a, NewSqrt2); output[i] = _mm_packs_epi32(b, b); } } static inline void fidentity8x4_new_sse2(const __m128i *const input, __m128i *const output, const int8_t cos_bit) { (void)cos_bit; const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 4; ++i) { const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); output[i] = _mm_packs_epi32(b_lo, b_hi); } } static inline void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; output[0] = _mm_adds_epi16(input[0], input[0]); output[1] = _mm_adds_epi16(input[1], input[1]); output[2] = _mm_adds_epi16(input[2], input[2]); output[3] = _mm_adds_epi16(input[3], input[3]); output[4] = _mm_adds_epi16(input[4], input[4]); output[5] = _mm_adds_epi16(input[5], input[5]); output[6] = _mm_adds_epi16(input[6], input[6]); output[7] = _mm_adds_epi16(input[7], input[7]); } static inline void fdct8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); // stage 1 __m128i x1[8]; x1[0] = _mm_adds_epi16(input[0], input[7]); x1[7] = _mm_subs_epi16(input[0], input[7]); x1[1] = _mm_adds_epi16(input[1], input[6]); x1[6] = _mm_subs_epi16(input[1], input[6]); x1[2] = _mm_adds_epi16(input[2], input[5]); x1[5] = _mm_subs_epi16(input[2], input[5]); x1[3] = _mm_adds_epi16(input[3], input[4]); x1[4] = _mm_subs_epi16(input[3], input[4]); // stage 2 __m128i x2[8]; x2[0] = _mm_adds_epi16(x1[0], x1[3]); x2[3] = _mm_subs_epi16(x1[0], x1[3]); x2[1] = _mm_adds_epi16(x1[1], x1[2]); x2[2] = _mm_subs_epi16(x1[1], x1[2]); x2[4] = x1[4]; btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); x2[7] = x1[7]; // stage 3 __m128i x3[8]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); 
btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); x3[4] = _mm_adds_epi16(x2[4], x2[5]); x3[5] = _mm_subs_epi16(x2[4], x2[5]); x3[6] = _mm_subs_epi16(x2[7], x2[6]); x3[7] = _mm_adds_epi16(x2[7], x2[6]); // stage 4 and 5 output[0] = x3[0]; output[4] = x3[1]; output[2] = x3[2]; output[6] = x3[3]; btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]); btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]); } static inline void fadst8x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { const int32_t *cospi = cospi_arr(cos_bit); const __m128i __zero = _mm_setzero_si128(); const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); // stage 1 __m128i x1[8]; x1[0] = input[0]; x1[1] = _mm_subs_epi16(__zero, input[7]); x1[2] = _mm_subs_epi16(__zero, input[3]); x1[3] = input[4]; x1[4] = _mm_subs_epi16(__zero, input[1]); x1[5] = input[6]; x1[6] = input[2]; x1[7] = _mm_subs_epi16(__zero, input[5]); // stage 2 __m128i x2[8]; x2[0] = x1[0]; x2[1] = x1[1]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); x2[4] = x1[4]; x2[5] = x1[5]; btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); // stage 3 __m128i x3[8]; x3[0] = _mm_adds_epi16(x2[0], x2[2]); x3[2] = _mm_subs_epi16(x2[0], x2[2]); x3[1] = _mm_adds_epi16(x2[1], x2[3]); x3[3] = _mm_subs_epi16(x2[1], x2[3]); x3[4] = _mm_adds_epi16(x2[4], x2[6]); x3[6] = _mm_subs_epi16(x2[4], x2[6]); x3[5] = _mm_adds_epi16(x2[5], x2[7]); x3[7] = _mm_subs_epi16(x2[5], x2[7]); // stage 4 __m128i x4[8]; x4[0] = x3[0]; x4[1] = x3[1]; x4[2] = x3[2]; x4[3] = x3[3]; btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); // stage 5, 6 and 7 output[7] = _mm_adds_epi16(x4[0], x4[4]); output[3] = _mm_subs_epi16(x4[0], x4[4]); output[0] = _mm_adds_epi16(x4[1], x4[5]); output[4] = _mm_subs_epi16(x4[1], x4[5]); output[5] = _mm_adds_epi16(x4[2], x4[6]); output[1] = _mm_subs_epi16(x4[2], x4[6]); output[2] = _mm_adds_epi16(x4[3], x4[7]); output[6] = _mm_subs_epi16(x4[3], x4[7]); btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7], output[0]); btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5], output[2]); btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3], output[4]); btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1], output[6]); } static inline void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 16; ++i) { 
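// The 16-point identity transform scales each input by 2*sqrt(2); the interleave with 1 below pairs each sample with a unit lane so scale_round_sse2 can apply the fixed-point factor 2*NewSqrt2 with rounding before the result is packed back to 16 bits.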
const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); output[i] = _mm_packs_epi32(b_lo, b_hi); } } static inline void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { (void)cos_bit; for (int i = 0; i < 32; ++i) { output[i] = _mm_slli_epi16(input[i], 2); } } static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { av1_fdct8x32_new_sse2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST fidentity8x32_new_sse2, // IDTX av1_fdct8x32_new_sse2, // V_DCT fidentity8x32_new_sse2, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; #ifdef __cplusplus } #endif #endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ aom-3.12.1/av1/encoder/x86/av1_highbd_quantize_avx2.c000066400000000000000000000117251477627663500221620ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i zero = _mm_setzero_si128(); const __m128i dc = _mm_unpacklo_epi16(*p, zero); const __m128i ac = _mm_unpackhi_epi16(*p, zero); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); } static inline void update_qp(__m256i *qp) { qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); } static inline void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); if (log_scale) { const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale)); round = _mm_mulhrs_epi16(round, round_scale); } const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); init_one_qp(&round, &qp[0]); init_one_qp(&quant, &qp[1]); init_one_qp(&dequant, &qp[2]); } static inline void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); q_hi = _mm256_mul_epi32(q_hi, qp_hi); q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); q = _mm256_andnot_si256(mask, q); __m256i dq = 
_mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); const __m128i zr = _mm_setzero_si128(); const __m128i lo = _mm_unpacklo_epi16(isc, zr); const __m128i hi = _mm_unpackhi_epi16(isc, zr); const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); const __m256i zero = _mm256_setzero_si256(); const __m256i zc = _mm256_cmpeq_epi32(dq, zero); const __m256i nz = _mm256_cmpeq_epi32(zc, zero); __m256i cur_eob = _mm256_sub_epi32(iscan, nz); cur_eob = _mm256_and_si256(cur_eob, nz); *eob = _mm256_max_epi32(cur_eob, *eob); } void av1_highbd_quantize_fp_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { (void)scan; (void)zbin_ptr; (void)quant_shift_ptr; const unsigned int step = 8; __m256i qp[3], coeff; init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); __m256i eob = _mm256_setzero_si256(); quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; update_qp(qp); while (n_coeffs > 0) { coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan += step; n_coeffs -= step; } { __m256i eob_s; eob_s = _mm256_shuffle_epi32(eob, 0xe); eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 0xe); eob = _mm256_max_epi16(eob, eob_s); eob_s = _mm256_shufflelo_epi16(eob, 1); eob = _mm256_max_epi16(eob, eob_s); const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), _mm256_extractf128_si256(eob, 1)); *eob_ptr = _mm_extract_epi16(final_eob, 0); } } aom-3.12.1/av1/encoder/x86/av1_highbd_quantize_sse4.c000066400000000000000000000167421477627663500221640ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/synonyms.h" // Coefficient quantization phase 1 // param[0-2] : rounding/quan/dequan constants static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, const int shift, const int scale, __m128i *qcoeff, __m128i *dquan, __m128i *sign) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi32(1); *sign = _mm_cmplt_epi32(*coeff, zero); *sign = _mm_or_si128(*sign, one); *coeff = _mm_abs_epi32(*coeff); qcoeff[0] = _mm_add_epi32(*coeff, param[0]); qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero); qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero); qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); dquan[0] = _mm_srli_epi64(dquan[0], scale); const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); } // Coefficient quantization phase 2 static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, const __m128i *sign, const __m128i *param, const int shift, const int scale, tran_low_t *qAddr, tran_low_t *dqAddr) { __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); dquan[1] = _mm_srli_epi64(dquan[1], scale); // combine L&H qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H); qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L); dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8); dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d); dquan[0] = _mm_and_si128(dquan[0], mask0H); dquan[1] = _mm_and_si128(dquan[1], mask0L); qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); dquan[0] = _mm_or_si128(dquan[0], dquan[1]); qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); dquan[0] = _mm_sign_epi32(dquan[0], *sign); qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]); dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); } static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, __m128i *eob) { const __m128i zero = _mm_setzero_si128(); __m128i mask, iscanIdx; const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); mask = _mm_packs_epi32(nz_flag0, nz_flag1); iscanIdx = _mm_loadu_si128((__m128i const *)iscan); iscanIdx = _mm_sub_epi16(iscanIdx, mask); iscanIdx = _mm_and_si128(iscanIdx, mask); *eob = _mm_max_epi16(*eob, iscanIdx); } static inline uint16_t get_accumulated_eob(__m128i *eob) { __m128i eob_shuffled; uint16_t eobValue; eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); *eob = _mm_max_epi16(*eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); *eob = _mm_max_epi16(*eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); *eob = _mm_max_epi16(*eob, eob_shuffled); eobValue = _mm_extract_epi16(*eob, 0); return eobValue; } void av1_highbd_quantize_fp_sse4_1( const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t 
*quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale) { __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; __m128i eob = _mm_setzero_si128(); const tran_low_t *src = coeff_ptr; tran_low_t *quanAddr = qcoeff_ptr; tran_low_t *dquanAddr = dqcoeff_ptr; const int shift = 16 - log_scale; const int coeff_stride = 4; const int quan_stride = coeff_stride; (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; memset(quanAddr, 0, count * sizeof(quanAddr[0])); memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); coeff[0] = _mm_loadu_si128((__m128i const *)src); const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); qparam[0] = _mm_set_epi32(round1, round1, round1, round0); qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]); qparam[2] = _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]); qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], dequant_ptr[0]); // DC and first 3 AC quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); // update round/quan/dquan for AC qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]); qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]); qparam[3] = _mm_set1_epi32(dequant_ptr[1]); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); // next 4 AC coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr + quan_stride, dquanAddr + quan_stride); find_eob(quanAddr, iscan, &eob); count -= 8; // loop for the rest of AC while (count > 0) { src += coeff_stride << 1; quanAddr += quan_stride << 1; dquanAddr += quan_stride << 1; iscan += quan_stride << 1; coeff[0] = _mm_loadu_si128((__m128i const *)src); coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr, dquanAddr); quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, &coeff_sign); quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, quanAddr + quan_stride, dquanAddr + quan_stride); find_eob(quanAddr, iscan, &eob); count -= 8; } *eob_ptr = get_accumulated_eob(&eob); } aom-3.12.1/av1/encoder/x86/av1_k_means_avx2.c000066400000000000000000000120571477627663500204310ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include // AVX2 #include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_avx2(__m256i a) { const __m128i low = _mm256_castsi256_si128(a); const __m128i high = _mm256_extracti128_si256(a, 1); const __m128i sum = _mm_add_epi64(low, high); const __m128i sum_high = _mm_unpackhi_epi64(sum, sum); int64_t res; _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high)); return res; } void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { const __m256i v_zero = _mm256_setzero_si256(); __m256i sum = _mm256_setzero_si256(); __m256i cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { cents[j] = _mm256_set1_epi16(centroids[j]); } for (int i = 0; i < n; i += 16) { const __m256i in = _mm256_loadu_si256((__m256i *)data); __m256i ind = _mm256_setzero_si256(); // Compute the distance to the first centroid. __m256i d1 = _mm256_sub_epi16(in, cents[0]); __m256i dist_min = _mm256_abs_epi16(d1); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. d1 = _mm256_sub_epi16(in, cents[j]); const __m256i dist = _mm256_abs_epi16(d1); // Compare to the minimal one. const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist); dist_min = _mm256_min_epi16(dist_min, dist); const __m256i ind1 = _mm256_set1_epi16(j); ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind), _mm256_and_si256(cmp, ind1)); } const __m256i p1 = _mm256_packus_epi16(ind, v_zero); const __m256i px = _mm256_permute4x64_epi64(p1, 0x58); const __m128i d2 = _mm256_extracti128_si256(px, 0); _mm_storeu_si128((__m128i *)indices, d2); if (total_dist) { // Square, convert to 32 bit and add together. dist_min = _mm256_madd_epi16(dist_min, dist_min); // Convert to 64 bit and add to sum. const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); sum = _mm256_add_epi64(sum, dist1); sum = _mm256_add_epi64(sum, dist2); } indices += 16; data += 16; } if (total_dist) { *total_dist = k_means_horizontal_sum_avx2(sum); } } void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { const __m256i v_zero = _mm256_setzero_si256(); const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); __m256i sum = _mm256_setzero_si256(); __m256i ind[2]; __m256i cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx); } for (int i = 0; i < n; i += 16) { for (int l = 0; l < 2; ++l) { const __m256i in = _mm256_loadu_si256((__m256i *)data); ind[l] = _mm256_setzero_si256(); // Compute the distance to the first centroid. __m256i d1 = _mm256_sub_epi16(in, cents[0]); __m256i dist_min = _mm256_madd_epi16(d1, d1); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. d1 = _mm256_sub_epi16(in, cents[j]); const __m256i dist = _mm256_madd_epi16(d1, d1); // Compare to the minimal one. const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist); dist_min = _mm256_min_epi32(dist_min, dist); const __m256i ind1 = _mm256_set1_epi32(j); ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]), _mm256_and_si256(cmp, ind1)); } if (total_dist) { // Convert to 64 bit and add to sum. 
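// Accumulating in 64-bit lanes keeps the running distortion total from overflowing when many 32-bit squared distances are summed.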
const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); sum = _mm256_add_epi64(sum, dist1); sum = _mm256_add_epi64(sum, dist2); } data += 16; } // Cast to 8 bit and store. const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]); const __m256i d3 = _mm256_packus_epi16(d2, v_zero); const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute); const __m128i d5 = _mm256_extracti128_si256(d4, 0); _mm_storeu_si128((__m128i *)indices, d5); indices += 16; } if (total_dist) { *total_dist = k_means_horizontal_sum_avx2(sum); } } aom-3.12.1/av1/encoder/x86/av1_k_means_sse2.c000066400000000000000000000111341477627663500204200ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_sse2(__m128i a) { const __m128i sum1 = _mm_unpackhi_epi64(a, a); const __m128i sum2 = _mm_add_epi64(a, sum1); int64_t res; _mm_storel_epi64((__m128i *)&res, sum2); return res; } void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { const __m128i v_zero = _mm_setzero_si128(); __m128i sum = _mm_setzero_si128(); __m128i cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { cents[j] = _mm_set1_epi16(centroids[j]); } for (int i = 0; i < n; i += 8) { const __m128i in = _mm_loadu_si128((__m128i *)data); __m128i ind = _mm_setzero_si128(); // Compute the distance to the first centroid. __m128i d1 = _mm_sub_epi16(in, cents[0]); __m128i d2 = _mm_sub_epi16(cents[0], in); __m128i dist_min = _mm_max_epi16(d1, d2); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. d1 = _mm_sub_epi16(in, cents[j]); d2 = _mm_sub_epi16(cents[j], in); const __m128i dist = _mm_max_epi16(d1, d2); // Compare to the minimal one. const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist); dist_min = _mm_min_epi16(dist_min, dist); const __m128i ind1 = _mm_set1_epi16(j); ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1)); } if (total_dist) { // Square, convert to 32 bit and add together. dist_min = _mm_madd_epi16(dist_min, dist_min); // Convert to 64 bit and add to sum. 
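// (The madd above squared the 16-bit differences and summed adjacent pairs into 32-bit lanes; widening to 64 bits here protects the running total from overflow.)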
const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero); const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero); sum = _mm_add_epi64(sum, dist1); sum = _mm_add_epi64(sum, dist2); } __m128i p2 = _mm_packus_epi16(ind, v_zero); _mm_storel_epi64((__m128i *)indices, p2); indices += 8; data += 8; } if (total_dist) { *total_dist = k_means_horizontal_sum_sse2(sum); } } void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, uint8_t *indices, int64_t *total_dist, int n, int k) { const __m128i v_zero = _mm_setzero_si128(); __m128i sum = _mm_setzero_si128(); __m128i ind[2]; __m128i cents[PALETTE_MAX_SIZE]; for (int j = 0; j < k; ++j) { const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx); } for (int i = 0; i < n; i += 8) { for (int l = 0; l < 2; ++l) { const __m128i in = _mm_loadu_si128((__m128i *)data); ind[l] = _mm_setzero_si128(); // Compute the distance to the first centroid. __m128i d1 = _mm_sub_epi16(in, cents[0]); __m128i dist_min = _mm_madd_epi16(d1, d1); for (int j = 1; j < k; ++j) { // Compute the distance to the centroid. d1 = _mm_sub_epi16(in, cents[j]); const __m128i dist = _mm_madd_epi16(d1, d1); // Compare to the minimal one. const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist); const __m128i dist1 = _mm_andnot_si128(cmp, dist_min); const __m128i dist2 = _mm_and_si128(cmp, dist); dist_min = _mm_or_si128(dist1, dist2); const __m128i ind1 = _mm_set1_epi32(j); ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), _mm_and_si128(cmp, ind1)); } if (total_dist) { // Convert to 64 bit and add to sum. const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero); const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero); sum = _mm_add_epi64(sum, dist1); sum = _mm_add_epi64(sum, dist2); } data += 8; } // Cast to 8 bit and store. const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]); const __m128i d3 = _mm_packus_epi16(d2, v_zero); _mm_storel_epi64((__m128i *)indices, d3); indices += 8; } if (total_dist) { *total_dist = k_means_horizontal_sum_sse2(sum); } } aom-3.12.1/av1/encoder/x86/av1_quantize_avx2.c000066400000000000000000000371601477627663500206560ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" static inline void write_zero(tran_low_t *qcoeff) { const __m256i zero = _mm256_setzero_si256(); _mm256_storeu_si256((__m256i *)qcoeff, zero); _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); } static inline void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i ac = _mm_unpackhi_epi64(*p, *p); *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1); } static inline void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *dequant_ptr, int log_scale, __m256i *thr, __m256i *qp) { __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); if (log_scale > 0) { const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1)); round = _mm_add_epi16(round, rnd); round = _mm_srai_epi16(round, log_scale); } init_one_qp(&round, &qp[0]); init_one_qp(&quant, &qp[1]); if (log_scale == 1) { qp[1] = _mm256_slli_epi16(qp[1], log_scale); } init_one_qp(&dequant, &qp[2]); *thr = _mm256_srai_epi16(qp[2], 1 + log_scale); // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when // calculating the zbin mask. *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1)); } static inline void update_qp(__m256i *thr, __m256i *qp) { qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11); } static inline __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); return _mm256_packs_epi32(coeff1, coeff2); } static inline void store_coefficients_avx2(__m256i coeff_vals, tran_low_t *coeff_ptr) { __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); } static inline uint16_t quant_gather_eob(__m256i eob) { const __m128i eob_lo = _mm256_castsi256_si128(eob); const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); eob_s = _mm_minpos_epu16(eob_s); return INT16_MAX - _mm_extract_epi16(eob_s, 0); } static inline int16_t accumulate_eob256(__m256i eob256) { const __m128i eob_lo = _mm256_castsi256_si128(eob256); const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); __m128i eob = _mm_max_epi16(eob_lo, eob_hi); __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } static AOM_FORCE_INLINE void quantize_lp_16_first( const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256, __m256i *dequant256, __m256i *eob) { const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi16(coeff); const __m256i tmp_rnd = 
_mm256_adds_epi16(abs_coeff, *round256); const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); const __m256i nz_mask = _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff); _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff); const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); *eob = _mm256_max_epi16(*eob, nz_iscan); } static AOM_FORCE_INLINE void quantize_lp_16( const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256, __m256i *dequant256, __m256i *eob) { const __m256i coeff = _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); const __m256i abs_coeff = _mm256_abs_epi16(coeff); const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); const __m256i nz_mask = _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff); _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff); const __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs)); const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); *eob = _mm256_max_epi16(*eob, nz_iscan); } void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; __m256i eob256 = _mm256_setzero_si256(); // Setup global values. __m256i round256 = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); __m256i quant256 = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); __m256i dequant256 = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); // Populate upper AC values. round256 = _mm256_permute4x64_epi64(round256, 0x54); quant256 = _mm256_permute4x64_epi64(quant256, 0x54); dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); // Process DC and the first 15 AC coeffs. quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, &quant256, &dequant256, &eob256); if (n_coeffs > 16) { // Overwrite the DC constants with AC constants dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); round256 = _mm256_permute2x128_si256(round256, round256, 0x31); // AC only loop. 
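// Only the first group of 16 uses the DC entry of round/quant/dequant; the permutes above broadcast the AC entries so every remaining lane is quantized with the AC constants.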
for (int idx = 16; idx < n_coeffs; idx += 16) { quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, &quant256, &dequant256, &eob256); } } *eob_ptr = accumulate_eob256(eob256); } static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } static AOM_FORCE_INLINE void quantize_fp_16( const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { const __m256i coeff = load_coefficients_avx2(coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi16(coeff); const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]); const __m256i q = _mm256_sign_epi16(abs_q, coeff); const __m256i dq = _mm256_mullo_epi16(q, qp[2]); const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); store_coefficients_avx2(q, qcoeff_ptr); store_coefficients_avx2(dq, dqcoeff_ptr); *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); } else { write_zero(qcoeff_ptr); write_zero(dqcoeff_ptr); } } void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; const int log_scale = 0; const int step = 16; __m256i qp[3], thr; __m256i eob = _mm256_setzero_si256(); init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; update_qp(&thr, qp); while (n_coeffs > 0) { quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; } *eob_ptr = quant_gather_eob(eob); } static AOM_FORCE_INLINE void quantize_fp_32x32( const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { const __m256i coeff = load_coefficients_avx2(coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi16(coeff); const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]); const __m256i q = _mm256_sign_epi16(abs_q, coeff); const __m256i abs_dq = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1); const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); store_coefficients_avx2(q, qcoeff_ptr); store_coefficients_avx2(dq, dqcoeff_ptr); *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); } else { write_zero(qcoeff_ptr); write_zero(dqcoeff_ptr); } } void 
av1_quantize_fp_32x32_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; const int log_scale = 1; const unsigned int step = 16; __m256i qp[3], thr; __m256i eob = _mm256_setzero_si256(); init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; update_qp(&thr, qp); while (n_coeffs > 0) { quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; } *eob_ptr = quant_gather_eob(eob); } static inline void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { const __m256i coeff = load_coefficients_avx2(coeff_ptr); const __m256i abs_coeff = _mm256_abs_epi16(coeff); const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); const int nzflag = _mm256_movemask_epi8(mask); if (nzflag) { const __m256i tmp_rnd = _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask); const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2); const __m256i ql = _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14); const __m256i abs_q = _mm256_or_si256(qh, ql); const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14); const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2); const __m256i abs_dq = _mm256_or_si256(dqh, dql); const __m256i q = _mm256_sign_epi16(abs_q, coeff); const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); // Check the signed q/dq value here instead of the absolute value. When // dequant equals 4, the dequant threshold (*thr) becomes 0 after being // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the // abs_coeff is 0, the nzflag will be set. As a result, the eob will be // incorrectly calculated. The psign instruction corrects the error by // zeroing out q/dq if coeff is zero. 
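// The eob mask below is therefore derived from the signed dq values rather than from abs_q.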
const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256()); const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256()); store_coefficients_avx2(q, qcoeff_ptr); store_coefficients_avx2(dq, dqcoeff_ptr); *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); } else { write_zero(qcoeff_ptr); write_zero(dqcoeff_ptr); } } void av1_quantize_fp_64x64_avx2( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; const int log_scale = 2; const unsigned int step = 16; __m256i qp[3], thr; __m256i eob = _mm256_setzero_si256(); init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; update_qp(&thr, qp); while (n_coeffs > 0) { quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; qcoeff_ptr += step; dqcoeff_ptr += step; iscan_ptr += step; n_coeffs -= step; } *eob_ptr = quant_gather_eob(eob); } aom-3.12.1/av1/encoder/x86/av1_quantize_sse2.c000066400000000000000000000271361477627663500206540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" static inline void read_coeff(const tran_low_t *coeff, intptr_t offset, __m128i *c0, __m128i *c1) { const tran_low_t *addr = coeff + offset; if (sizeof(tran_low_t) == 4) { const __m128i x0 = _mm_load_si128((const __m128i *)addr); const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); *c0 = _mm_packs_epi32(x0, x1); *c1 = _mm_packs_epi32(x2, x3); } else { *c0 = _mm_load_si128((const __m128i *)addr); *c1 = _mm_load_si128((const __m128i *)addr + 1); } } static inline void write_qcoeff(const __m128i *qc0, const __m128i *qc1, tran_low_t *qcoeff, intptr_t offset) { tran_low_t *addr = qcoeff + offset; if (sizeof(tran_low_t) == 4) { const __m128i zero = _mm_setzero_si128(); __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); _mm_store_si128((__m128i *)addr, y0); _mm_store_si128((__m128i *)addr + 1, y1); sign_bits = _mm_cmplt_epi16(*qc1, zero); y0 = _mm_unpacklo_epi16(*qc1, sign_bits); y1 = _mm_unpackhi_epi16(*qc1, sign_bits); _mm_store_si128((__m128i *)addr + 2, y0); _mm_store_si128((__m128i *)addr + 3, y1); } else { _mm_store_si128((__m128i *)addr, *qc0); _mm_store_si128((__m128i *)addr + 1, *qc1); } } static inline void write_zero(tran_low_t *qcoeff, intptr_t offset) { const __m128i zero = _mm_setzero_si128(); tran_low_t *addr = qcoeff + offset; if (sizeof(tran_low_t) == 4) { _mm_store_si128((__m128i *)addr, zero); _mm_store_si128((__m128i *)addr + 1, zero); _mm_store_si128((__m128i *)addr + 2, zero); _mm_store_si128((__m128i *)addr + 3, zero); } else { _mm_store_si128((__m128i *)addr, zero); _mm_store_si128((__m128i *)addr + 1, zero); } } static inline void quantize(const int16_t *iscan_ptr, const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const __m128i *round0, const __m128i *round1, const __m128i *quant0, const __m128i *quant1, const __m128i *dequant0, const __m128i *dequant1, const __m128i *thr0, const __m128i *thr1, __m128i *eob) { __m128i coeff0, coeff1; // Do DC and first 15 AC read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); // Poor man's sign extract const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), _mm_cmpeq_epi16(qcoeff0, *thr0)); const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), _mm_cmpeq_epi16(qcoeff1, *thr1)); const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); // Reinsert signs qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); 
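// coeff0/coeff1 now hold the dequantized values (qcoeff * dequant), reusing the coefficient registers before they are stored to the dqcoeff buffer below.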
write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); const __m128i zero = _mm_setzero_si128(); // Scan for eob const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); const __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); const __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); const __m128i eob2 = _mm_max_epi16(eob0, eob1); *eob = _mm_max_epi16(*eob, eob2); } else { write_zero(qcoeff_ptr, n_coeffs); write_zero(dqcoeff_ptr, n_coeffs); } } void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { (void)scan_ptr; (void)zbin_ptr; (void)quant_shift_ptr; coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); const __m128i round1 = _mm_unpackhi_epi64(round0, round0); const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); const __m128i thr0 = _mm_srai_epi16(dequant0, 1); const __m128i thr1 = _mm_srai_epi16(dequant1, 1); __m128i eob = _mm_setzero_si128(); quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); n_coeffs += 8 * 2; // AC only loop while (n_coeffs < 0) { quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, &eob); n_coeffs += 8 * 2; } // Accumulate EOB { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); eob = _mm_max_epi16(eob, eob_shuffled); eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); eob = _mm_max_epi16(eob, eob_shuffled); *eob_ptr = _mm_extract_epi16(eob, 1); } } static inline void quantize_lp(const int16_t *iscan_ptr, const int16_t *coeff_ptr, intptr_t n_coeffs, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const __m128i *round0, const __m128i *round1, const __m128i *quant0, const __m128i *quant1, const __m128i *dequant0, const __m128i *dequant1, __m128i *eob) { const int16_t *read = coeff_ptr + n_coeffs; __m128i coeff0 = _mm_load_si128((const __m128i *)read); __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1); // Poor man's sign extract const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); qcoeff0 = 
_mm_adds_epi16(qcoeff0, *round0); qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); // Reinsert signs qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); int16_t *addr = qcoeff_ptr + n_coeffs; _mm_store_si128((__m128i *)addr, qcoeff0); _mm_store_si128((__m128i *)addr + 1, qcoeff1); coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); addr = dqcoeff_ptr + n_coeffs; _mm_store_si128((__m128i *)addr, coeff0); _mm_store_si128((__m128i *)addr + 1, coeff1); const __m128i zero = _mm_setzero_si128(); // Scan for eob const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); const __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); const __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); const __m128i eob2 = _mm_max_epi16(eob0, eob1); *eob = _mm_max_epi16(*eob, eob2); } void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { (void)scan; coeff_ptr += n_coeffs; iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; // Setup global values const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); const __m128i round1 = _mm_unpackhi_epi64(round0, round0); const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); __m128i eob = _mm_setzero_si128(); // DC and first 15 AC quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, &round1, &quant0, &quant1, &dequant0, &dequant1, &eob); n_coeffs += 8 * 2; // AC only loop while (n_coeffs < 0) { quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, &round1, &quant1, &quant1, &dequant1, &dequant1, &eob); n_coeffs += 8 * 2; } // Accumulate EOB *eob_ptr = accumulate_eob(eob); } aom-3.12.1/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm000066400000000000000000000206371477627663500224130ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
; ; %define private_prefix av1 %include "third_party/x86inc/x86inc.asm" SECTION_RODATA pw_1: times 8 dw 1 SECTION .text %macro QUANTIZE_FP 2 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan cmp dword skipm, 0 jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant %ifidn %1, fp_32x32 pcmpeqw m5, m5 psrlw m5, 15 paddw m1, m5 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [r2q] ; m3 = dequant mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, fp_32x32 psllw m2, 1 %endif pxor m5, m5 ; m5 = dedicated zero lea coeffq, [ coeffq+ncoeffq*2] lea r5q, [ r5q+ncoeffq*2] lea r3q, [ r3q+ncoeffq*2] lea r4q, [r4q+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpeqw m7, m7 paddsw m6, m1 ; m6 += round punpckhqdq m1, m1 paddsw m11, m1 ; m11 += round pmulhw m8, m6, m2 ; m8 = m6*q>>16 punpckhqdq m2, m2 pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign mova [r3q+ncoeffq*2+ 0], m8 mova [r3q+ncoeffq*2+16], m13 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 %endif pmullw m8, m3 ; r4[i] = r3[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; r4[i] = r3[i] * q %ifidn %1, fp_32x32 psrlw m8, 1 psrlw m13, 1 psignw m8, m9 psignw m13, m10 psrlw m0, m3, 2 %else psrlw m0, m3, 1 %endif mova [r4q+ncoeffq*2+ 0], m8 mova [r4q+ncoeffq*2+16], m13 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m7 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 add ncoeffq, mmsize jz .accumulate_eob .ac_only_loop: mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 pcmpgtw m12, m11, m0 pmovmskb r6d, m7 pmovmskb r2d, m12 or r6, r2 jz .skip_iter pcmpeqw m7, m7 paddsw m6, m1 ; m6 += round paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign mova [r3q+ncoeffq*2+ 0], m14 mova [r3q+ncoeffq*2+16], m13 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 %endif pmullw m14, m3 ; r4[i] = r3[i] * q pmullw m13, m3 ; r4[i] = r3[i] * q %ifidn %1, fp_32x32 psrlw m14, 1 psrlw m13, 1 psignw m14, m9 psignw m13, m10 %endif mova [r4q+ncoeffq*2+ 0], m14 mova [r4q+ncoeffq*2+16], m13 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] psubw m6, m7 ; m6 = scan[i] + 1 psubw m11, m7 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop jmp .accumulate_eob .skip_iter: mova [r3q+ncoeffq*2+ 0], m5 mova [r3q+ncoeffq*2+16], m5 mova [r4q+ncoeffq*2+ 0], m5 mova [r4q+ncoeffq*2+16], m5 add ncoeffq, mmsize jl .ac_only_loop .accumulate_eob: ; horizontally accumulate/max eobs and write into [eob] memory pointer mov r2, eobmp pshufd m7, m8, 
0xe pmaxsw m8, m7 pshuflw m7, m8, 0xe pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 mov [r2], r6 RET ; skip-block, i.e. just write all zeroes .blank: mov r0, dqcoeffmp movifnidn ncoeffq, ncoeffmp mov r2, qcoeffmp mov r3, eobmp lea r0q, [r0q+ncoeffq*2] lea r2q, [r2q+ncoeffq*2] neg ncoeffq pxor m7, m7 .blank_loop: mova [r0q+ncoeffq*2+ 0], m7 mova [r0q+ncoeffq*2+16], m7 mova [r2q+ncoeffq*2+ 0], m7 mova [r2q+ncoeffq*2+16], m7 add ncoeffq, mmsize jl .blank_loop mov word [r3q], 0 RET %endmacro INIT_XMM ssse3 QUANTIZE_FP fp, 7 QUANTIZE_FP fp_32x32, 7 aom-3.12.1/av1/encoder/x86/av1_ssim_opt_x86_64.asm000066400000000000000000000143751477627663500212720ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "aom_ports/x86_abi_support.asm" ; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr %macro TABULATE_SSIM 0 paddusw xmm15, xmm3 ; sum_s paddusw xmm14, xmm4 ; sum_r movdqa xmm1, xmm3 pmaddwd xmm1, xmm1 paddd xmm13, xmm1 ; sum_sq_s movdqa xmm2, xmm4 pmaddwd xmm2, xmm2 paddd xmm12, xmm2 ; sum_sq_r pmaddwd xmm3, xmm4 paddd xmm11, xmm3 ; sum_sxr %endmacro ; Sum across the register %1 starting with q words %macro SUM_ACROSS_Q 1 movdqa xmm2,%1 punpckldq %1,xmm0 punpckhdq xmm2,xmm0 paddq %1,xmm2 movdqa xmm2,%1 punpcklqdq %1,xmm0 punpckhqdq xmm2,xmm0 paddq %1,xmm2 %endmacro ; Sum across the register %1 starting with q words %macro SUM_ACROSS_W 1 movdqa xmm1, %1 punpcklwd %1,xmm0 punpckhwd xmm1,xmm0 paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro SECTION .text ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, ; int rp ; unsigned long *sum_s, ; unsigned long *sum_r, ; unsigned long *sum_sq_s, ; unsigned long *sum_sq_r, ; unsigned long *sum_sxr); ; ; TODO: Use parm passing through structure, probably don't need the pxors ; ( calling app will initialize to 0 ) could easily fit everything in sse2 ; without too much hastle, and can probably do better estimates with psadw ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. 
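; Accumulators: xmm15=sum_s, xmm14=sum_r, xmm13=sum_sq_s, xmm12=sum_sq_r,
; xmm11=sum_sxr; they are reduced with SUM_ACROSS_W/SUM_ACROSS_Q and written
; back through the pointer arguments arg(4)..arg(8).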
globalsym(av1_ssim_parms_16x16_sse2) sym(av1_ssim_parms_16x16_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 SAVE_XMM 15 push rsi push rdi ; end prolog mov rsi, arg(0) ;s mov rcx, arg(1) ;sp mov rdi, arg(2) ;r mov rax, arg(3) ;rp pxor xmm0, xmm0 pxor xmm15,xmm15 ;sum_s pxor xmm14,xmm14 ;sum_r pxor xmm13,xmm13 ;sum_sq_s pxor xmm12,xmm12 ;sum_sq_r pxor xmm11,xmm11 ;sum_sxr mov rdx, 16 ;row counter .NextRow: ;grab source and reference pixels movdqu xmm5, [rsi] movdqu xmm6, [rdi] movdqa xmm3, xmm5 movdqa xmm4, xmm6 punpckhbw xmm3, xmm0 ; high_s punpckhbw xmm4, xmm0 ; high_r TABULATE_SSIM movdqa xmm3, xmm5 movdqa xmm4, xmm6 punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r TABULATE_SSIM add rsi, rcx ; next s row add rdi, rax ; next r row dec rdx ; counter jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 SUM_ACROSS_Q xmm13 SUM_ACROSS_Q xmm12 SUM_ACROSS_Q xmm11 mov rdi,arg(4) movd [rdi], xmm15; mov rdi,arg(5) movd [rdi], xmm14; mov rdi,arg(6) movd [rdi], xmm13; mov rdi,arg(7) movd [rdi], xmm12; mov rdi,arg(8) movd [rdi], xmm11; ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, ; int rp ; unsigned long *sum_s, ; unsigned long *sum_r, ; unsigned long *sum_sq_s, ; unsigned long *sum_sq_r, ; unsigned long *sum_sxr); ; ; TODO: Use parm passing through structure, probably don't need the pxors ; ( calling app will initialize to 0 ) could easily fit everything in sse2 ; without too much hastle, and can probably do better estimates with psadw ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. globalsym(av1_ssim_parms_8x8_sse2) sym(av1_ssim_parms_8x8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 SAVE_XMM 15 push rsi push rdi ; end prolog mov rsi, arg(0) ;s mov rcx, arg(1) ;sp mov rdi, arg(2) ;r mov rax, arg(3) ;rp pxor xmm0, xmm0 pxor xmm15,xmm15 ;sum_s pxor xmm14,xmm14 ;sum_r pxor xmm13,xmm13 ;sum_sq_s pxor xmm12,xmm12 ;sum_sq_r pxor xmm11,xmm11 ;sum_sxr mov rdx, 8 ;row counter .NextRow: ;grab source and reference pixels movq xmm3, [rsi] movq xmm4, [rdi] punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r TABULATE_SSIM add rsi, rcx ; next s row add rdi, rax ; next r row dec rdx ; counter jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 SUM_ACROSS_Q xmm13 SUM_ACROSS_Q xmm12 SUM_ACROSS_Q xmm11 mov rdi,arg(4) movd [rdi], xmm15; mov rdi,arg(5) movd [rdi], xmm14; mov rdi,arg(6) movd [rdi], xmm13; mov rdi,arg(7) movd [rdi], xmm12; mov rdi,arg(8) movd [rdi], xmm11; ; begin epilog pop rdi pop rsi RESTORE_XMM UNSHADOW_ARGS pop rbp ret aom-3.12.1/av1/encoder/x86/av1_temporal_denoiser_sse2.c000066400000000000000000000333121477627663500225200ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // SSE2 #include "aom/aom_integer.h" #include "aom_dsp/x86/mem_sse2.h" #include "av1/common/reconinter.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/av1_temporal_denoiser.h" // Compute the sum of all pixel differences of this MB. static inline int sum_diff_16x1(__m128i acc_diff) { const __m128i k_1 = _mm_set1_epi16(1); const __m128i acc_diff_lo = _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); const __m128i acc_diff_hi = _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); return _mm_cvtsi128_si32(hgfedcba); } // Denoise a 16x1 vector. static inline __m128i av1_denoiser_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, const __m128i *k_16, const __m128i *l3, const __m128i *l32, const __m128i *l21, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); // Clamp absolute difference to 16 to be used to get mask. Doing this // allows us to use _mm_cmpgt_epi8, which operates on signed byte. const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); // Get masks for l2 l1 and l0 adjustments. const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); // Get adjustments for l2, l1, and l0. __m128i adj2 = _mm_and_si128(mask2, *l32); const __m128i adj1 = _mm_and_si128(mask1, *l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; // Combine the adjustments and get absolute adjustments. adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(*l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); // Restore the sign and get positive and negative adjustments. padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Adjustments <=7, and each element in acc_diff can fit in signed // char. acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); return acc_diff; } // Denoise a 16x1 vector with a weaker filter. static inline __m128i av1_denoiser_adj_16x1_sse2( const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); // Calculate differences. 
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); // Clamp absolute difference to delta to get the adjustment. const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); // Restore the sign and get positive and negative adjustments. __m128i padj, nadj; padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); // Calculate filtered value. v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); // Accumulate the adjustments. acc_diff = _mm_subs_epi8(acc_diff, padj); acc_diff = _mm_adds_epi8(acc_diff, nadj); return acc_diff; } // Denoise 8x8 and 8x16 blocks. static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) { int sum_diff_thresh, r, sum_diff = 0; const int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); // Modify each level's adjustment according to motion_magnitude. const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); // Difference between level 3 and level 2 is 2. const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. const __m128i l21 = _mm_set1_epi8(1); const int b_height = block_size_high[bs] >> 1; for (r = 0; r < b_height; ++r) { memcpy(sig_buffer[r], sig, width); memcpy(sig_buffer[r] + width, sig + sig_stride, width); memcpy(mc_running_buffer[r], mc_running_avg_y, width); memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, width); memcpy(running_buffer[r], running_avg_y, width); memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], running_buffer[r], &k_0, &k_4, &k_8, &k_16, &l3, &l32, &l21, acc_diff); memcpy(running_avg_y, running_buffer[r], width); memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); // Update pointers for next iteration. sig += (sig_stride << 1); mc_running_avg_y += (mc_avg_y_stride << 1); running_avg_y += (avg_y_stride << 1); } { sum_diff = sum_diff_16x1(acc_diff); sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. The adjustment is capped by a maximum delta, and // chosen such that in most cases the resulting sum_diff will be // within the acceptable range given by sum_diff_thresh. 
// The delta is set by the excess of absolute pixel diff over the // threshold. const int delta = ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); running_avg_y -= avg_y_stride * (b_height << 1); for (r = 0; r < b_height; ++r) { acc_diff = av1_denoiser_adj_16x1_sse2( sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, k_delta, acc_diff); memcpy(running_avg_y, running_buffer[r], width); memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); // Update pointers for next iteration. running_avg_y += (avg_y_stride << 1); } sum_diff = sum_diff_16x1(acc_diff); if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } } return FILTER_BLOCK; } // Denoise 16x16 to 128x128 blocks. static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { int sum_diff_thresh, r, c, sum_diff = 0; const int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; __m128i acc_diff[8][8]; const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); // Modify each level's adjustment according to motion_magnitude. const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); // Difference between level 3 and level 2 is 2. const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. const __m128i l21 = _mm_set1_epi8(1); const int b_width = block_size_wide[bs]; const int b_height = block_size_high[bs]; const int b_width_shift4 = b_width >> 4; for (r = 0; r < 8; ++r) { for (c = 0; c < b_width_shift4; ++c) { acc_diff[c][r] = _mm_setzero_si128(); } } for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2( sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r >> 4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); } } // Update pointers for next iteration. sig = sig - b_width + sig_stride; mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; running_avg_y = running_avg_y - b_width + avg_y_stride; } { sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { const int delta = ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); sig -= sig_stride * b_height; mc_running_avg_y -= mc_avg_y_stride * b_height; running_avg_y -= avg_y_stride * b_height; sum_diff = 0; for (r = 0; r < b_height; ++r) { for (c = 0; c < b_width_shift4; ++c) { acc_diff[c][r >> 4] = av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, k_0, k_delta, acc_diff[c][r >> 4]); // Update pointers for next iteration. 
sig += 16; mc_running_avg_y += 16; running_avg_y += 16; } if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { for (c = 0; c < b_width_shift4; ++c) { sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); } } sig = sig - b_width + sig_stride; mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; running_avg_y = running_avg_y - b_width + avg_y_stride; } if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; } } else { return COPY_BLOCK; } } } return FILTER_BLOCK; } int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude) { // Rank by frequency of the block type to have an early termination. if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 || bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || bs == BLOCK_32X64 || bs == BLOCK_64X32) { return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, avg, avg_stride, increase_denoising, bs, motion_magnitude); } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride, avg, avg_stride, increase_denoising, bs, motion_magnitude, 8); } else { return COPY_BLOCK; } } aom-3.12.1/av1/encoder/x86/av1_txfm1d_sse4.h000066400000000000000000000114611477627663500202200ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ #define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ #include #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse4.h" #ifdef __cplusplus extern "C" { #endif void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, const int stride); void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, const int instride, const int outstride); void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, const int col_num); static inline void transpose_32_4x4(int stride, const __m128i *input, __m128i *output) { __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); } // the entire input block can be represent by a grid of 4x4 blocks // each 4x4 blocks can be represent by 4 vertical __m128i // we first transpose each 4x4 block internally // then transpose the grid static inline void transpose_32(int txfm_size, const __m128i *input, __m128i *output) { const int num_per_128 = 4; const int row_size = txfm_size; const int col_size = txfm_size / num_per_128; int r, c; // transpose each 4x4 block internally for (r = 0; r < row_size; r += 4) { for (c = 0; c < col_size; c++) { transpose_32_4x4(col_size, &input[r * col_size + c], &output[c * 4 * col_size + r / 4]); } } } // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ do { \ const __m128i ww0 = _mm_set1_epi32(w0); \ const __m128i ww1 = _mm_set1_epi32(w1); \ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ out0 = _mm_add_epi32(in0_w0, in1_w1); \ out0 = av1_round_shift_32_sse4_1(out0, bit); \ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ out1 = _mm_sub_epi32(in0_w1, in1_w0); \ out1 = av1_round_shift_32_sse4_1(out1, bit); \ } while (0) // out0 = in0*w0 + in1*w1 // out1 = in1*w0 - in0*w1 #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ do { \ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ } while (0) // out0 = in0*w0 + in1*w1 // out1 = -in1*w0 + in0*w1 #define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ do { \ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ out0 = _mm_add_epi32(in0_w0, in1_w1); \ out0 = _mm_add_epi32(out0, r); \ out0 = _mm_srai_epi32(out0, bit); \ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ out1 = _mm_sub_epi32(in0_w1, in1_w0); \ out1 = _mm_add_epi32(out1, r); \ out1 = _mm_srai_epi32(out1, bit); \ } while (0) // out0 = in0*w0 + in1*w1 // out1 = in1*w0 - in0*w1 #define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ do { \ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ } while (0) #ifdef __cplusplus } #endif #endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ aom-3.12.1/av1/encoder/x86/cnn_avx2.c000066400000000000000000000620451477627663500170250ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom_dsp/aom_dsp_common.h" #include "av1/common/av1_common_int.h" #include "av1/encoder/cnn.h" // This mask rearranges source pixels in the order shown below. // shuffle_src_layer0[0][8]: applied on source pixels 0 to 7. // shuffle_src_layer0[1][8]: applied on source pixels 7 to 14. // This shuffling is needed to process 3 5x5 blocks which need // source pixels in the following order. // 1st 5x5 block: source pixels needed are 0 to 4, // 2nd 5x5 block: source pixels needed are 4 to 8, // 3rd 5x5 block: source pixels needed are 8 to 12. // Source pixels are loaded like mentioned below. // load_src0 : 0, 1, 2, 3, 4, 5, 6, 7 // load_src1 : 7, 8, 9, 10, 11, 12, 13, 14 // After applying masks, source bytes will be in the order: // load_src0 : 0, 1, 2, 3, 4, 4, 5, 6 // consists 5 pixels needed for 1st 5x5 block and // first 3 pixels needed for 2nd 5x5 block. // load_src1 : 7, 8, 8, 9, 10, 11, 12, x // consists last 2 pixels needed for 2nd 5x5 block and // 5 pixels needed for 3rd 5x5 block. DECLARE_ALIGNED(32, static const uint32_t, shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 }, { 0, 1, 1, 2, 3, 4, 5, 0 } }; // This mask rearrange the weights to match shuffled source pixels order. DECLARE_ALIGNED(32, static const uint32_t, shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 }, { 3, 4, 0, 1, 2, 3, 4, 0 } }; // Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2. // For layer 1 and layer 2, convolution happens at 2x2 as filter_width and // filter_height are equal to 2. So rearranging the weights in the // order shown below to match source pixels. Basically this mask replicates // the weights across the width of 2. DECLARE_ALIGNED(32, static const uint32_t, shuffle_weight_layer_1_and_2[2][8]) = { { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 } }; // After the stages of multiplication and accumulation, the output values // in the register will be jumbled. In order to store register into // output buffer in a proper way, the following mask is applied on output // register. DECLARE_ALIGNED(32, static const uint32_t, shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 }; // Load weights needed for layer 0 (for 5x5 block processing), // and fill the registers appropriately to match source pixel mapping. 
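// Note: after the permutes, shuffle_weight[0..4] hold [w0 w1 w2 w3 w4 w0 w1 w2]
// for each filter row (pairs with the block0_1-shuffled source), while
// shuffle_weight[5..9] hold [w3 w4 w0 w1 w2 w3 w4 w0] (pairs with the
// block1_2-shuffled source); the last lane of the second register is unused.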
static inline void prepare_weights_for_5x5_convolve( const float *layer_config_weights, int off, float weight[5][8], const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0, const __m256i weight_mask_1) { for (int row = 0; row < 5; ++row) { for (int col = 0; col < 5; ++col) { weight[row][col] = layer_config_weights[off]; off += cstep; } } shuffle_weight[0] = _mm256_loadu_ps(weight[0]); shuffle_weight[1] = _mm256_loadu_ps(weight[1]); shuffle_weight[2] = _mm256_loadu_ps(weight[2]); shuffle_weight[3] = _mm256_loadu_ps(weight[3]); shuffle_weight[4] = _mm256_loadu_ps(weight[4]); shuffle_weight[0] = _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0); shuffle_weight[1] = _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0); shuffle_weight[2] = _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0); shuffle_weight[3] = _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0); shuffle_weight[4] = _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0); shuffle_weight[5] = _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1); shuffle_weight[6] = _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1); shuffle_weight[7] = _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1); shuffle_weight[8] = _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1); shuffle_weight[9] = _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1); } // For each row, loads source pixels 0 to 7(load_src_0), 7 to 14(load_src_1) and // arranges them appropriately to process 3 blocks. #define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \ do { \ for (int row = 0; row < 5; row++) { \ load_src_0 = _mm256_loadu_ps(input_ptr); \ load_src_1 = _mm256_loadu_ps(input_ptr + 7); \ load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \ load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \ load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \ load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \ accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \ accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \ input_ptr += in_stride; \ } \ } while (0) // Load masks needed for shuffling of output and weights. static inline void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, __m256i *weight_mask) { // Load shuffle buffer needed to sort the output. *output_mask = _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2); // Load shuffle buffers needed for weight. weight_mask[0] = _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]); weight_mask[1] = _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]); } // Load weights needed for layer 1 and 2 (for 2x2 block processing), // and fill the registers appropriately to match source pixel mapping. static inline void prepare_weights_for_2x2_convolve( const float *layer_config_weights, int off, const int cstep, __m256 *shuffle_weight, __m256i *weight_mask) { // Weights needed for 2x2 block. float weight[4] = { 0 }; for (int i = 0; i < 4; ++i) { weight[i] = layer_config_weights[off]; off += cstep; } const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight)); shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]); shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]); } // Do convolution of one 5x5 block. 
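// The macro below multiplies the 4 left columns of each row with SSE and
// accumulates the 5th column separately in the scalar last_column_sum.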
#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \ do { \ __m128 load_src[5]; \ load_src[0] = _mm_loadu_ps(input_ptr); \ last_column_sum += input_ptr[4] * weight[0][4]; \ input_ptr += in_stride; \ load_src[1] = _mm_loadu_ps(input_ptr); \ last_column_sum += input_ptr[4] * weight[1][4]; \ input_ptr += in_stride; \ load_src[2] = _mm_loadu_ps(input_ptr); \ last_column_sum += input_ptr[4] * weight[2][4]; \ input_ptr += in_stride; \ load_src[3] = _mm_loadu_ps(input_ptr); \ last_column_sum += input_ptr[4] * weight[3][4]; \ input_ptr += in_stride; \ load_src[4] = _mm_loadu_ps(input_ptr); \ last_column_sum += input_ptr[4] * weight[4][4]; \ \ load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \ load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \ load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \ load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \ load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \ \ accum0 = _mm_add_ps(load_src[0], accum0); \ load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \ load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \ load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \ accum0 = _mm_add_ps(accum0, load_src[1]); \ } while (0) // Do convolution on 8 horizontal 2x2 blocks. static inline void perform_convolve_for_8h_2x2_blocks( const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, __m256i shuffle_output_mask) { __m256 load_src[4]; // Load input into source registers. load_src[0] = _mm256_loadu_ps(input_ptr); load_src[1] = _mm256_loadu_ps(input_ptr + 8); load_src[2] = _mm256_loadu_ps(input_ptr + in_stride); load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8); // Multiply the loaded input with corresponding weights. load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); load_src[1] = _mm256_mul_ps(load_src[1], weight[0]); load_src[2] = _mm256_mul_ps(load_src[2], weight[1]); load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); // Accumulate across 2x2 blocks. load_src[0] = _mm256_add_ps(load_src[0], load_src[2]); load_src[1] = _mm256_add_ps(load_src[1], load_src[3]); load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]); // Sort the output in order to store into output buffer. load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); *out_accum = _mm256_add_ps(*out_accum, load_src[0]); } // Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks. static inline void perform_convolve_for_4hx2v_2x2_blocks( const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, __m256i shuffle_output_mask) { __m256 load_src[4]; // Load input into source registers. load_src[0] = _mm256_loadu_ps(input_ptr); load_src[1] = _mm256_loadu_ps(input_ptr + in_stride); load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2)); load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3)); // Multiply the loaded input with corresponding weights. load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); load_src[1] = _mm256_mul_ps(load_src[1], weight[1]); load_src[2] = _mm256_mul_ps(load_src[2], weight[0]); load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); // Accumulate across 2x2 blocks. load_src[0] = _mm256_add_ps(load_src[0], load_src[1]); load_src[2] = _mm256_add_ps(load_src[2], load_src[3]); load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]); // Sort the output in order to store into output buffer. 
load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); *out_accum = _mm256_add_ps(*out_accum, load_src[0]); } // AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when // filter_width and filter_height are equal to 5. // CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config. // Based on the configuration set for each layer, the current encoder // always chooses the case of no_maxpool_padding_valid. // And also for layer 0 convolution happens at 5x5 level as the // filter_width and filter_height are set as 5. static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { const int kFilterWidth = 5; const int kFilterHeight = 5; const int kSkipWidth = 4; const int kSkipHeight = 4; assert(layer_config->filter_width == kFilterWidth && layer_config->filter_height == kFilterHeight); assert(layer_config->skip_width == kSkipWidth && layer_config->skip_height == kSkipHeight); // Load shuffle buffers needed for source. const __m256i block0_1 = _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]); const __m256i block1_2 = _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]); // Load shuffle buffers needed for weight. const __m256i weight_mask_0 = _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]); const __m256i weight_mask_1 = _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]); // Width needs to be moved to go to next iteration of processing 3 5x5 blocks. const int kSkipWidthForNextIter = kSkipWidth * 3; // Minimum width required to process 3 5x5 blocks at a time. // min width (for processing 3 5x5 block) = 2*skip_width + filter_width // Here, skip_width specifies how much width we should move while processing // next block convolution and filter_width specifies for how many pixels // filter needs to be applied. const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { const float out_ch_bias = layer_config->bias[i]; for (int k = 0; k < layer_config->in_channels; ++k) { __m256 shuffle_weight[10]; // Weights needed are 5x5, for SIMD purpose made this array as 5x8. float weight[5][8] = { { 0 } }; int off = k * layer_config->out_channels + i; // In layer 0, the convolution process happens at 5x5. // The weights needed for 5x5 block are same across the in-channels, // which is why the load of weights happens once for each in-channel. prepare_weights_for_5x5_convolve(layer_config->weights, off, weight, cstep, shuffle_weight, weight_mask_0, weight_mask_1); for (int h = 0, u = 0; h < in_height - kFilterHeight + 1; h += kSkipHeight, ++u) { const int out_h = u * out_stride; int v = 0; int w = 0; int rem_width = in_width; // Processing 3 5x5 blocks at a time, if sufficient width is present. while (rem_width >= kMinWidthFor3_5x5Blocks) { __m256 load_src_0, load_src_1; __m256 accum_src_0 = _mm256_setzero_ps(); __m256 accum_src_1 = _mm256_setzero_ps(); const float *input_ptr = &input[k][h * in_stride + w]; PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS(); // Accumulate across column. 
__m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1); __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1); __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1); __m128 accum_l = _mm256_castps256_ps128(accum); __m128 accum_h = _mm256_extractf128_ps(accum, 1); __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0); __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h); __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h); // 1st 5x5 block output. output[i][out_h + v] = out_ch_bias + _mm_cvtss_f32(tmp_reg_2) + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1)); // 2nd 5x5 block output. output[i][out_h + v + 1] = out_ch_bias + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2)); // 3rd 5x5 block output. output[i][out_h + v + 2] = out_ch_bias + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3)); v += 3; w += kSkipWidthForNextIter; rem_width -= kSkipWidthForNextIter; } // Process remaining blocks as single 5x5 block at a time. while (rem_width >= kFilterWidth) { float last_column_sum = 0; __m128 accum = _mm_setzero_ps(); const float *input_ptr = &input[k][h * in_stride + w]; PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride); // Accumulate across column. accum = _mm_hadd_ps(accum, accum); output[i][out_h + v] = out_ch_bias + last_column_sum + _mm_cvtss_f32(accum) + _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1)); v += 1; w += kSkipWidth; rem_width -= kSkipWidth; } } } } } // AVX2 implementation for layer 1. static inline void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( const float **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { __m256i weight_mask[2]; __m256i shuffle_output_mask; load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); const int kInHeight = 16; const int kFilterHeight = 2; const int kSkipHeight = 2; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); // out_accum registers are used to store the 2x2 convolve outputs // (calculated over input block size), which are accumulated across the // in_channels. As per the design, each iteration of for loop processes 8 // (horizontal) 2x2 blocks and stores in corresponding out_accum register // (as input size is 16x16, a total of 64 2x2 blocks are present and 8 // out_accum registers are enough to store the outputs). // Hence for loops corresponding to 'j' and 'h', below, run over the number // of out_accum registers. __m256 out_accum[8]; for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg; for (int k = 0; k < layer_config->in_channels; ++k) { __m256 shuffle_weight[2]; int off = k * layer_config->out_channels + i; // In layer 1, the convolution process happens at 2x2. // The weights needed for 2x2 block are same across the in-channels, // which is why the load of weights happens once for each in-channel. prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, shuffle_weight, weight_mask); for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; h += kSkipHeight, ++u) { const float *input_ptr = &input[k][h * in_stride]; perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight, &out_accum[u], shuffle_output_mask); } } // Store output of layer 1. for (int j = 0; j < 8; ++j) { _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]); } } } // AVX2 implementation for layer 2. 
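// Layer 2 takes the 8x8 output of layer 1 and produces a 4x4 output; its 16
// 2x2 blocks are covered by two out_accum registers, each holding 8 blocks
// arranged as 4 horizontal x 2 vertical.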
static inline void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( const float **input, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { __m256i weight_mask[2]; __m256i shuffle_output_mask; load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); const int kInHeight = 8; const int kFilterHeight = 2; const int kSkipHeight = 2; for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); // out_accum registers are used to store the 2x2 convolve outputs // (calculated over input block size), which are accumulated across the // in_channels. As per the design, each iteration of for loop processes 8 // (4 horizontal x 2 vertical) 2x2 blocks and stores in corresponding // out_accum register (as input size is 8x8, a total of 16 2x2 blocks are // present and 2 out_accum registers are enough to store the outputs). // Hence for loops corresponding to 'j' and 'h', below, run over the number // of out_accum registers. __m256 out_accum[2]; // Height needs to be moved to go to next iteration of processing // while processing 2 2x2 blocks vertically. const int kSkipHeightForNextIter = kSkipHeight * 2; for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg; for (int k = 0; k < layer_config->in_channels; ++k) { __m256 shuffle_weight[2]; int off = k * layer_config->out_channels + i; // In layer 2, the convolution process happens at 2x2. // The weights needed for 2x2 block are same across the in-channels, // which is why the load of weights happens once for each in-channel. prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, shuffle_weight, weight_mask); for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; h += kSkipHeightForNextIter, ++u) { const float *input_ptr = &input[k][h * in_stride]; perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride, shuffle_weight, &out_accum[u], shuffle_output_mask); } } // Store output of layer 2. for (int j = 0; j < 2; ++j) { _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]); } } } // AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when // filter_width and filter_height are equal to 2. // As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, // the filter_width and filter_height are equal to 2 for layer >= 1. So // convolution happens at 2x2 for layer >= 1. static void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, int start_idx, const int cstep, const int channel_step) { assert(layer_config->filter_width == 2 && layer_config->filter_height == 2); assert(layer_config->skip_width == 2 && layer_config->skip_height == 2); if (in_width == 16 && in_height == 16) { // This case of in_width and in_height equal to 16 corresponds to layer 1. // The output size of this layer is 8x8. cnn_convolve_no_maxpool_padding_valid_layer1_avx2( input, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } else if (in_width == 8 && in_height == 8) { // This case of in_width and in_height equal to 8 corresponds to layer 2. // The output size of this layer is 4x4. 
cnn_convolve_no_maxpool_padding_valid_layer2_avx2( input, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } else { // For layer equal to 3 and 4, the input is of size 4x4 and 2x2 // respectively. Implementing SIMD for these cases might not be optimal, // which is why we call C path for layer >= 3. av1_cnn_convolve_no_maxpool_padding_valid_c( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } } // AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). // As per the current encoder, av1_cnn_convolve function gets called for // block size equal to 64x64. av1_cnn_convolve() uses layer config values // set by av1_intra_mode_cnn_partition_cnn_config. The following are a few // details related to each layer's config parameters. // Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht // 0 64x64 16x16 5 5 4 4 // 1 16x16 8x8 2 2 2 2 // 2 8x8 4x4 2 2 2 2 // 3 4x4 2x2 2 2 2 2 // 4 2x2 1x1 2 2 2 2 // Here, // filter_wd = filter_width and filter_ht = filter_height, // skip_wd = skip_width and skip_ht = skip_height. void av1_cnn_convolve_no_maxpool_padding_valid_avx2( const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, int start_idx, int cstep, int channel_step) { if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && layer_config->skip_width == 4 && layer_config->skip_height == 4) { cnn_convolve_no_maxpool_padding_valid_5x5_avx2( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } else if (layer_config->filter_width == 2 && layer_config->filter_height == 2 && layer_config->skip_width == 2 && layer_config->skip_height == 2) { cnn_convolve_no_maxpool_padding_valid_2x2_avx2( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } else { av1_cnn_convolve_no_maxpool_padding_valid_c( input, in_width, in_height, in_stride, layer_config, output, out_stride, start_idx, cstep, channel_step); } } aom-3.12.1/av1/encoder/x86/dct_sse2.asm000066400000000000000000000051771477627663500173560ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
; %define private_prefix av1 %include "third_party/x86inc/x86inc.asm" SECTION .text %macro TRANSFORM_COLS 0 paddw m0, m1 movq m4, m0 psubw m3, m2 psubw m4, m3 psraw m4, 1 movq m5, m4 psubw m5, m1 ;b1 psubw m4, m2 ;c1 psubw m0, m4 paddw m3, m5 ; m0 a0 SWAP 1, 4 ; m1 c1 SWAP 2, 3 ; m2 d1 SWAP 3, 5 ; m3 b1 %endmacro %macro TRANSPOSE_4X4 0 ; 00 01 02 03 ; 10 11 12 13 ; 20 21 22 23 ; 30 31 32 33 punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 mova m1, m0 punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 %endmacro INIT_XMM sse2 cglobal fwht4x4, 3, 4, 8, input, output, stride lea r3q, [inputq + strideq*4] movq m0, [inputq] ;a1 movq m1, [inputq + strideq*2] ;b1 movq m2, [r3q] ;c1 movq m3, [r3q + strideq*2] ;d1 TRANSFORM_COLS TRANSPOSE_4X4 SWAP 1, 2 psrldq m1, m0, 8 psrldq m3, m2, 8 TRANSFORM_COLS TRANSPOSE_4X4 psllw m0, 2 psllw m1, 2 ; sign extension mova m2, m0 mova m3, m1 punpcklwd m0, m0 punpcklwd m1, m1 punpckhwd m2, m2 punpckhwd m3, m3 psrad m0, 16 psrad m1, 16 psrad m2, 16 psrad m3, 16 mova [outputq], m0 mova [outputq + 16], m2 mova [outputq + 32], m1 mova [outputq + 48], m3 RET aom-3.12.1/av1/encoder/x86/encodetxb_avx2.c000066400000000000000000000116141477627663500202160ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // SSE2 #include /* SSE4.1 */ #include /* AVX2 */ #include "aom/aom_integer.h" #include "aom_dsp/x86/mem_sse2.h" #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = height + TX_PAD_HOR; const __m256i y_zeros = _mm256_setzero_si256(); const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride; uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); do { yy_storeu_256(bottom_buf, y_zeros); bottom_buf += 32; } while (bottom_buf < bottom_buf_end); int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (height == 4) { do { const __m256i c0 = yy_loadu_256(cf); const __m256i c1 = yy_loadu_256(cf + 8); const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); yy_storeu_256(ls, res); ls += 32; cf += 16; i += 4; } while (i < width); } else if (height == 8) { do { const __m256i coeffA = yy_loadu_256(cf); const __m256i coeffB = yy_loadu_256(cf + 8); const __m256i coeffC = yy_loadu_256(cf + 16); const __m256i coeffD = yy_loadu_256(cf + 24); const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); const __m256i absAB = _mm256_abs_epi16(coeffAB); const __m256i absCD = _mm256_abs_epi16(coeffCD); const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); const __m128i res0 = _mm256_castsi256_si128(res); const __m128i res1 = _mm256_extracti128_si256(res, 1); xx_storel_64(ls, res0); *(int32_t *)(ls + height) = 0; xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); *(int32_t *)(ls + height + stride) = 0; xx_storel_64(ls + stride * 2, res1); *(int32_t *)(ls + height + stride * 2) = 0; xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); *(int32_t *)(ls + height + stride * 3) = 0; cf += 32; ls += stride << 2; i += 4; } while (i < width); } else if (height == 16) { do { const __m256i coeffA = yy_loadu_256(cf); const __m256i coeffB = yy_loadu_256(cf + 8); const __m256i coeffC = yy_loadu_256(cf + 16); const __m256i coeffD = yy_loadu_256(cf + 24); const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); const __m256i absAB = _mm256_abs_epi16(coeffAB); const __m256i absCD = _mm256_abs_epi16(coeffCD); const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); xx_storeu_128(ls, _mm256_castsi256_si128(res)); xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); cf += 32; *(int32_t *)(ls + height) = 0; *(int32_t *)(ls + stride + height) = 0; ls += stride << 1; i += 2; } while (i < width); } else { do { const __m256i coeffA = yy_loadu_256(cf); const __m256i coeffB = yy_loadu_256(cf + 8); const __m256i coeffC = yy_loadu_256(cf + 16); const __m256i coeffD = yy_loadu_256(cf + 24); const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); const __m256i absAB = 
_mm256_abs_epi16(coeffAB); const __m256i absCD = _mm256_abs_epi16(coeffCD); const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); yy_storeu_256(ls, res); cf += 32; *(int32_t *)(ls + height) = 0; ls += stride; i += 1; } while (i < width); } } aom-3.12.1/av1/encoder/x86/encodetxb_sse2.c000066400000000000000000000501011477627663500202040ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include // SSE2 #include "aom/aom_integer.h" #include "aom_dsp/x86/mem_sse2.h" #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" static inline void load_levels_4x4x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride); level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride); level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride); level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride); level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); } static inline void load_levels_8x2x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride); level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride); level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride); level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride); level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); } static inline void load_levels_16x1x5_sse2(const uint8_t *const src, const int stride, const ptrdiff_t *const offsets, __m128i *const level) { level[0] = _mm_loadu_si128((__m128i *)(src + 1)); level[1] = _mm_loadu_si128((__m128i *)(src + stride)); level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0])); level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1])); level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); } static inline __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { const __m128i const_3 = _mm_set1_epi8(3); const __m128i const_4 = _mm_set1_epi8(4); __m128i count; count = _mm_min_epu8(level[0], const_3); level[1] = _mm_min_epu8(level[1], const_3); level[2] = _mm_min_epu8(level[2], const_3); level[3] = _mm_min_epu8(level[3], const_3); level[4] = _mm_min_epu8(level[4], const_3); count = _mm_add_epi8(count, level[1]); count = _mm_add_epi8(count, level[2]); count = _mm_add_epi8(count, level[3]); count = _mm_add_epi8(count, level[4]); count = _mm_avg_epu8(count, _mm_setzero_si128()); count = _mm_min_epu8(count, const_4); return count; } static inline void get_4_nz_map_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *const coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const __m128i pos_to_offset_large = _mm_set1_epi8(21); __m128i pos_to_offset = (width == 4) ? 
_mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) : _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21); __m128i count; __m128i level[5]; int8_t *cc = coeff_contexts; int col = width; assert(!(width % 4)); do { load_levels_4x4x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)cc, count); pos_to_offset = pos_to_offset_large; levels += 4 * stride; cc += 16; col -= 4; } while (col); coeff_contexts[0] = 0; } static inline void get_4_nz_map_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); __m128i count; __m128i level[5]; int col = width; assert(!(width % 4)); do { load_levels_4x4x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)coeff_contexts, count); levels += 4 * stride; coeff_contexts += 16; col -= 4; } while (col); } static inline void get_4_nz_map_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = 4 + TX_PAD_HOR; const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); __m128i count; __m128i level[5]; int col = width; assert(!(width % 4)); do { load_levels_4x4x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 4 * stride; coeff_contexts += 16; col -= 4; } while (col); } static inline void get_8_coeff_contexts_2d(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; int8_t *cc = coeff_contexts; int col = width; __m128i count; __m128i level[5]; __m128i pos_to_offset[3]; assert(!(width % 2)); if (width == 8) { pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21); } else if (width < 8) { pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21); pos_to_offset[1] = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21); } else { pos_to_offset[0] = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 
21, 21, 21, 21, 21); } pos_to_offset[2] = _mm_set1_epi8(21); do { load_levels_8x2x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset[0]); _mm_store_si128((__m128i *)cc, count); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; levels += 2 * stride; cc += 16; col -= 2; } while (col); coeff_contexts[0] = 0; } static inline void get_8_coeff_contexts_ver(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); int col = width; __m128i count; __m128i level[5]; assert(!(width % 2)); do { load_levels_8x2x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)coeff_contexts, count); levels += 2 * stride; coeff_contexts += 16; col -= 2; } while (col); } static inline void get_8_coeff_contexts_hor(const uint8_t *levels, const int width, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = 8 + TX_PAD_HOR; const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); int col = width; __m128i count; __m128i level[5]; assert(!(width % 2)); do { load_levels_8x2x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 2 * stride; coeff_contexts += 16; col -= 2; } while (col); } static inline void get_16n_coeff_contexts_2d(const uint8_t *levels, const int real_width, const int real_height, const int width, const int height, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; int8_t *cc = coeff_contexts; int col = width; __m128i pos_to_offset[5]; __m128i pos_to_offset_large[3]; __m128i count; __m128i level[5]; assert(!(height % 16)); pos_to_offset_large[2] = _mm_set1_epi8(21); if (real_width == real_height) { pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; } else if (real_width < real_height) { pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; } else { // real_width > real_height pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); pos_to_offset[4] = pos_to_offset_large[2]; pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(16); } do { int h = height; do { load_levels_16x1x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset[0]); _mm_store_si128((__m128i *)cc, count); levels += 16; cc += 16; h -= 16; pos_to_offset[0] = pos_to_offset_large[0]; } while (h); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; pos_to_offset[2] = pos_to_offset[3]; pos_to_offset[3] = pos_to_offset[4]; pos_to_offset_large[0] = pos_to_offset_large[1]; pos_to_offset_large[1] = pos_to_offset_large[2]; levels += TX_PAD_HOR; } while (--col); coeff_contexts[0] = 0; } static inline void get_16n_coeff_contexts_ver(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; const __m128i pos_to_offset_large = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); __m128i count; __m128i level[5]; int col = width; assert(!(height % 16)); do { __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); int h = height; do { load_levels_16x1x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset); _mm_store_si128((__m128i *)coeff_contexts, count); pos_to_offset = pos_to_offset_large; levels += 16; coeff_contexts += 16; h -= 16; } while (h); levels += TX_PAD_HOR; } while (--col); } static inline void get_16n_coeff_contexts_hor(const uint8_t *levels, const int width, const int height, const ptrdiff_t *const offsets, int8_t *coeff_contexts) { const int stride = height + TX_PAD_HOR; __m128i pos_to_offset[3]; __m128i count; __m128i level[5]; int col = width; assert(!(height % 16)); pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); do { int h = height; do { 
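// (Added commentary; not part of the original source.) The three table setups
// above pick the base-context pattern for square, tall (real_width <
// real_height) and wide blocks. The inner loop below covers one line of
// `height` levels, 16 bytes at a time; after the first 16 bytes the offset
// vector is replaced by the "large" value, and after each line the
// pos_to_offset[] entries rotate so only the first few lines keep
// position-specific offsets.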
load_levels_16x1x5_sse2(levels, stride, offsets, level); count = get_coeff_contexts_kernel_sse2(level); count = _mm_add_epi8(count, pos_to_offset[0]); _mm_store_si128((__m128i *)coeff_contexts, count); levels += 16; coeff_contexts += 16; h -= 16; } while (h); pos_to_offset[0] = pos_to_offset[1]; pos_to_offset[1] = pos_to_offset[2]; levels += TX_PAD_HOR; } while (--col); } // Note: levels[] must be in the range [0, 127], inclusive. void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts) { const int last_idx = eob - 1; if (!last_idx) { coeff_contexts[0] = 0; return; } const int real_width = tx_size_wide[tx_size]; const int real_height = tx_size_high[tx_size]; const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); const int stride = height + TX_PAD_HOR; ptrdiff_t offsets[3]; /* coeff_contexts must be 16 byte aligned. */ assert(!((intptr_t)coeff_contexts & 0xf)); if (tx_class == TX_CLASS_2D) { offsets[0] = 0 * stride + 2; offsets[1] = 1 * stride + 1; offsets[2] = 2 * stride + 0; if (height == 4) { get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts); } else if (height == 8) { get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts); } else if (height == 16) { get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, offsets, coeff_contexts); } else { get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, offsets, coeff_contexts); } } else if (tx_class == TX_CLASS_HORIZ) { offsets[0] = 2 * stride; offsets[1] = 3 * stride; offsets[2] = 4 * stride; if (height == 4) { get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts); } else if (height == 8) { get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts); } else { get_16n_coeff_contexts_hor(levels, width, height, offsets, coeff_contexts); } } else { // TX_CLASS_VERT offsets[0] = 2; offsets[1] = 3; offsets[2] = 4; if (height == 4) { get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts); } else if (height == 8) { get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts); } else { get_16n_coeff_contexts_ver(levels, width, height, offsets, coeff_contexts); } } const int bhl = get_txb_bhl(tx_size); const int pos = scan[last_idx]; if (last_idx <= (width << bhl) / 8) coeff_contexts[pos] = 1; else if (last_idx <= (width << bhl) / 4) coeff_contexts[pos] = 2; else coeff_contexts[pos] = 3; } aom-3.12.1/av1/encoder/x86/encodetxb_sse4.c000066400000000000000000000057451477627663500202240ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
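// (Added, illustrative only -- a rough scalar sketch of what the SSE2 kernels
// above compute; the reference lives in av1/common/txb_common.h.)
//   for each coefficient position p covered by the SIMD loops:
//     mag = 0;
//     for each of the five neighbour offsets used above
//         (+1, +stride, +offsets[0], +offsets[1], +offsets[2]):
//       mag += AOMMIN(levels[p + offset], 3);
//     coeff_contexts[p] = AOMMIN((mag + 1) >> 1, 4) + base_offset(p, tx_class);
//   The DC position is then forced to 0 for TX_CLASS_2D, and the position of
//   the last nonzero coefficient is overwritten with 1, 2 or 3 depending on
//   how far into the block it falls.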
*/ #include #include // SSE2 #include /* SSE4.1 */ #include "aom/aom_integer.h" #include "av1/common/av1_common_int.h" #include "av1/common/txb_common.h" #include "aom_dsp/x86/synonyms.h" void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = height + TX_PAD_HOR; const __m128i zeros = _mm_setzero_si128(); const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); uint8_t *bottom_buf = levels + stride * width; uint8_t *bottom_buf_end = bottom_buf + bottom_len; do { _mm_storeu_si128((__m128i *)(bottom_buf), zeros); bottom_buf += 16; } while (bottom_buf < bottom_buf_end); int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (height == 4) { do { const __m128i coeffA = xx_loadu_128(cf); const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); xx_storeu_128(ls, lsAB); ls += (stride << 1); cf += (height << 1); i += 2; } while (i < width); } else if (height == 8) { do { const __m128i coeffA = xx_loadu_128(cf); const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); xx_storeu_128(ls, absAB8); ls += stride; cf += height; i += 1; } while (i < width); } else { do { int j = 0; do { const __m128i coeffA = xx_loadu_128(cf); const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffC = xx_loadu_128(cf + 8); const __m128i coeffD = xx_loadu_128(cf + 12); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absCD = _mm_abs_epi16(coeffCD); const __m128i absABCD = _mm_packs_epi16(absAB, absCD); xx_storeu_128(ls + j, absABCD); j += 16; cf += 16; } while (j < height); *(int32_t *)(ls + height) = 0; ls += stride; i += 1; } while (i < width); } } aom-3.12.1/av1/encoder/x86/error_intrin_avx2.c000066400000000000000000000224061477627663500207600ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include // AVX2 #include "config/av1_rtcd.h" #include "aom/aom_integer.h" static inline void read_coeff(const tran_low_t *coeff, intptr_t offset, __m256i *c) { const tran_low_t *addr = coeff + offset; if (sizeof(tran_low_t) == 4) { const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr); const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1); const __m256i y = _mm256_packs_epi32(x0, x1); *c = _mm256_permute4x64_epi64(y, 0xD8); } else { *c = _mm256_loadu_si256((const __m256i *)addr); } } static inline void av1_block_error_block_size16_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256) { const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff); // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); // r0 r1 r2 r3 r4 r5 r6 r7 const __m256i error = _mm256_madd_epi16(diff, diff); // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7 const __m256i error_hi = _mm256_hadd_epi32(error, error); // r0+r1 | r2+r3 | r4+r5 | r6+r7 *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256()); } static inline void av1_block_error_block_size32_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256) { const __m256i zero = _mm256_setzero_si256(); const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16)); const __m256i _dqcoeff_1 = _mm256_loadu_si256((const __m256i *)(dqcoeff + 16)); // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0); const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1); // r0 r1 r2 r3 r4 r5 r6 r7 const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0); const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1); const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1); // For extreme input values, the accumulation needs to happen in 64 bit // precision to avoid any overflow. const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero); const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero); const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo); *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0); } static inline void av1_block_error_block_size64_avx2(const int16_t *coeff, const int16_t *dqcoeff, __m256i *sse_256, intptr_t block_size) { const __m256i zero = _mm256_setzero_si256(); for (int i = 0; i < block_size; i += 64) { // Load 64 elements for coeff and dqcoeff. 
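// (Added note.) Each iteration consumes 64 int16 values from both arrays via
// four 256-bit loads apiece; _mm256_madd_epi16 sums pairs of squared
// differences into 32-bit lanes, which are then widened to 64 bits before
// accumulation so large blocks cannot overflow.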
const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16)); const __m256i _dqcoeff_1 = _mm256_loadu_si256((const __m256i *)(dqcoeff + 16)); const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32)); const __m256i _dqcoeff_2 = _mm256_loadu_si256((const __m256i *)(dqcoeff + 32)); const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48)); const __m256i _dqcoeff_3 = _mm256_loadu_si256((const __m256i *)(dqcoeff + 48)); // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0); const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1); const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2); const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3); // r0 r1 r2 r3 r4 r5 r6 r7 const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0); const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1); const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2); const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3); // r00 r01 r02 r03 r04 r05 r06 r07 const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1); // r10 r11 r12 r13 r14 r15 r16 r17 const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3); // For extreme input values, the accumulation needs to happen in 64 bit // precision to avoid any overflow. r00 r01 r04 r05 const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero); // r02 r03 r06 r07 const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero); // r10 r11 r14 r15 const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero); // r12 r13 r16 r17 const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero); const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo); const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo); const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0); *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp); coeff += 64; dqcoeff += 64; } } int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size) { assert(block_size % 16 == 0); __m256i sse_256 = _mm256_setzero_si256(); int64_t sse; if (block_size == 16) av1_block_error_block_size16_avx2(coeff, dqcoeff, &sse_256); else if (block_size == 32) av1_block_error_block_size32_avx2(coeff, dqcoeff, &sse_256); else av1_block_error_block_size64_avx2(coeff, dqcoeff, &sse_256, block_size); // Save the higher 64 bit of each 128 bit lane. const __m256i sse_hi = _mm256_srli_si256(sse_256, 8); // Add the higher 64 bit to the low 64 bit. sse_256 = _mm256_add_epi64(sse_256, sse_hi); // Accumulate the sse_256 register to get final sse const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), _mm256_extractf128_si256(sse_256, 1)); // Store the results. 
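// (Added, illustrative only.) Scalar equivalent of the sum reduced above:
//   int64_t sse = 0;
//   for (int k = 0; k < block_size; ++k) {
//     const int d = dqcoeff[k] - coeff[k];
//     sse += (int64_t)d * d;
//   }
// The horizontal reduction collapses the 64-bit lanes of sse_256 into the
// single scalar written out below.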
_mm_storel_epi64((__m128i *)&sse, sse_128); return sse; } int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; __m256i sse_reg_64hi, ssz_reg_64hi; __m128i sse_reg128, ssz_reg128; int64_t sse; int i; const __m256i zero_reg = _mm256_setzero_si256(); // init sse and ssz registerd to zero sse_reg = _mm256_setzero_si256(); ssz_reg = _mm256_setzero_si256(); for (i = 0; i < block_size; i += 16) { // load 32 bytes from coeff and dqcoeff read_coeff(coeff, i, &coeff_reg); read_coeff(dqcoeff, i, &dqcoeff_reg); // dqcoeff - coeff dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); // madd (dqcoeff - coeff) dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); // madd coeff coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); // expand each double word of madd (dqcoeff - coeff) to quad word exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); // expand each double word of madd (coeff) to quad word exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); // add each quad word of madd (dqcoeff - coeff) and madd (coeff) sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); } // save the higher 64 bit of each 128 bit lane sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); // add the higher 64 bit to the low 64 bit sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); // add each 64 bit from each of the 128 bit lane of the 256 bit sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), _mm256_extractf128_si256(ssz_reg, 1)); // store the results _mm_storel_epi64((__m128i *)(&sse), sse_reg128); _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); _mm256_zeroupper(); return sse; } aom-3.12.1/av1/encoder/x86/error_intrin_sse2.c000066400000000000000000000050571477627663500207570ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // SSE2 #include "config/av1_rtcd.h" #include "aom/aom_integer.h" static inline __m128i reduce_sum_epi64(__m128i reg) { __m128i reg_hi = _mm_srli_si128(reg, 8); reg = _mm_add_epi64(reg, reg_hi); return reg; } int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size) { assert(block_size % 16 == 0); assert(block_size >= 16); const __m128i zero = _mm_setzero_si128(); __m128i accum_0 = zero; __m128i accum_1 = zero; for (int i = 0; i < block_size; i += 16) { // Load 8 elements for coeff and dqcoeff. 
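// (Added note.) Each loop iteration consumes 16 int16 values from each array
// -- two 128-bit loads of 8 apiece -- and keeps two independent 64-bit
// accumulators that are merged and reduced once the loop finishes.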
const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff); const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8)); const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff); const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8)); // Compute the diff const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0); const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1); // Compute the error const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0); const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1); const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero); const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero); const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero); const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero); // Accumulate accum_0 = _mm_add_epi64(accum_0, error_lo_0); accum_1 = _mm_add_epi64(accum_1, error_lo_1); accum_0 = _mm_add_epi64(accum_0, error_hi_0); accum_1 = _mm_add_epi64(accum_1, error_hi_1); // Advance coeff += 16; dqcoeff += 16; } __m128i accum = _mm_add_epi64(accum_0, accum_1); // Reduce sum the register accum = reduce_sum_epi64(accum); // Store the results. #if AOM_ARCH_X86_64 return _mm_cvtsi128_si64(accum); #else int64_t result; _mm_storel_epi64((__m128i *)&result, accum); return result; #endif // AOM_ARCH_X86_64 } aom-3.12.1/av1/encoder/x86/error_sse2.asm000066400000000000000000000047111477627663500177260ustar00rootroot00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved. ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; ; Increment %1 by sizeof() tran_low_t * %2. %macro INCREMENT_ELEMENTS_TRAN_LOW 2 lea %1, [%1 + %2 * 4] %endmacro ; Load %2 + %3 into m%1. ; %3 is the offset in elements, not bytes. ; If tran_low_t is 16 bits (low bit depth configuration) then load the value ; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack ; the values down to 16 bits. %macro LOAD_TRAN_LOW 3 mova m%1, [%2 + (%3) * 4] packssdw m%1, [%2 + (%3) * 4 + 16] %endmacro %define private_prefix av1 %include "third_party/x86inc/x86inc.asm" SECTION .text ; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, ; int64_t *ssz) INIT_XMM sse2 cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pxor m4, m4 ; sse accumulator pxor m6, m6 ; ssz accumulator pxor m5, m5 ; dedicated zero register .loop: LOAD_TRAN_LOW 2, uqcq, 0 LOAD_TRAN_LOW 0, dqcq, 0 LOAD_TRAN_LOW 3, uqcq, 8 LOAD_TRAN_LOW 1, dqcq, 8 INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 
15bit+sign, so squares are 30bit, and ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 ; the sum of 2 31bit integers will fit in a 32bit unsigned integer paddd m0, m1 paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 punpckldq m7, m2, m5 paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 paddq m6, m2 jg .loop ; accumulate horizontally and store in return value movhlps m5, m4 movhlps m7, m6 paddq m4, m5 paddq m6, m7 %if AOM_ARCH_X86_64 movq rax, m4 movq [sszq], m6 %else mov eax, sszm pshufd m5, m4, 0x1 movq [eax], m6 movd eax, m4 movd edx, m5 %endif RET aom-3.12.1/av1/encoder/x86/hash_sse42.c000066400000000000000000000032741477627663500172510ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" // Byte-boundary alignment issues #define ALIGN_SIZE 8 #define ALIGN_MASK (ALIGN_SIZE - 1) #define CALC_CRC(op, crc, type, buf, len) \ while ((len) >= sizeof(type)) { \ (crc) = op((crc), *(type *)(buf)); \ (len) -= sizeof(type); \ buf += sizeof(type); \ } /** * Calculates 32-bit CRC for the input buffer * polynomial is 0x11EDC6F41 * @return A 32-bit unsigned integer representing the CRC */ uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p, size_t len) { (void)crc_calculator; const uint8_t *buf = p; uint32_t crc = 0xFFFFFFFF; // Align the input to the word boundary for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) { crc = _mm_crc32_u8(crc, *buf); } #ifdef __x86_64__ uint64_t crc64 = crc; CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len) crc = (uint32_t)crc64; #endif CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len) CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len) CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len) return (crc ^ 0xFFFFFFFF); } aom-3.12.1/av1/encoder/x86/highbd_block_error_intrin_avx2.c000066400000000000000000000054051477627663500234370ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
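// (Added note on the function below.) The squared-error and squared-coefficient
// sums are scaled back by shift = 2 * (bps - 8) bits with rounding, so the
// distortion returned for high-bit-depth input is expressed on the 8-bit
// scale, matching the scalar reference implementation.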
*/ #include #include #include "aom/aom_integer.h" #include "av1/common/common.h" #include "config/av1_rtcd.h" int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bps) { int i; int64_t temp1[8]; int64_t error = 0, sqcoeff = 0; const int shift = 2 * (bps - 8); const int rounding = (1 << shift) >> 1; for (i = 0; i < block_size; i += 16) { __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i)); __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8)); __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i)); __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8)); __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff); __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2); __m256i diff1h = _mm256_srli_epi64(diff1, 32); __m256i diff2h = _mm256_srli_epi64(diff2, 32); __m256i res = _mm256_mul_epi32(diff1, diff1); __m256i res1 = _mm256_mul_epi32(diff1h, diff1h); __m256i res2 = _mm256_mul_epi32(diff2, diff2); __m256i res3 = _mm256_mul_epi32(diff2h, diff2h); __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1), _mm256_add_epi64(res2, res3)); __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32); __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32); res = _mm256_mul_epi32(mm256_coeff, mm256_coeff); res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh); res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2); res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2); __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1), _mm256_add_epi64(res2, res3)); _mm256_storeu_si256((__m256i *)temp1, res_diff); _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff); error += temp1[0] + temp1[1] + temp1[2] + temp1[3]; sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7]; } error = (error + rounding) >> shift; sqcoeff = (sqcoeff + rounding) >> shift; *ssz = sqcoeff; return error; } aom-3.12.1/av1/encoder/x86/highbd_block_error_intrin_sse2.c000066400000000000000000000057341477627663500234400ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
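// (Added note on the function below.) Each batch of eight coefficients is
// range-checked against the signed 15-bit interval [-0x4000, 0x3fff]; only
// when every value fits can the pairs be packed to 16 bits and squared with
// _mm_madd_epi16 without overflow, otherwise that batch falls back to the
// scalar 64-bit loop.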
*/ #include #include #include "av1/common/common.h" #include "config/av1_rtcd.h" int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bps) { int i, j, test; uint32_t temp[4]; __m128i max, min, cmp0, cmp1, cmp2, cmp3; int64_t error = 0, sqcoeff = 0; const int shift = 2 * (bps - 8); const int rounding = (1 << shift) >> 1; for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bit max = _mm_set1_epi32(0x3fff); min = _mm_set1_epi32((int)0xffffc000); cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), _mm_cmplt_epi32(mm_coeff, min)); cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), _mm_cmplt_epi32(mm_coeff2, min)); cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), _mm_cmplt_epi32(mm_dqcoeff, min)); cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), _mm_cmplt_epi32(mm_dqcoeff2, min)); test = _mm_movemask_epi8( _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); if (!test) { __m128i mm_diff, error_sse2, sqcoeff_sse2; mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); _mm_storeu_si128((__m128i *)temp, error_sse2); error = error + temp[0] + temp[1] + temp[2] + temp[3]; _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; } else { for (j = 0; j < 8; j++) { const int64_t diff = coeff[i + j] - dqcoeff[i + j]; error += diff * diff; sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; } } } error = (error + rounding) >> shift; sqcoeff = (sqcoeff + rounding) >> shift; *ssz = sqcoeff; return error; } aom-3.12.1/av1/encoder/x86/highbd_fwd_txfm_avx2.c000066400000000000000000003715061477627663500213770ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
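// (Added overview; not part of the original file.) The 2D forward transforms
// in this file follow the usual libaom pattern: load the residual block with a
// pre-shift (flipping vertically/horizontally for the FLIPADST variants), run
// the column transform, round by the stage shift, transpose, run the row
// transform and store -- see av1_fwd_txfm2d_8x8_avx2() further down.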
*/ #include #include /*AVX2*/ #include "config/aom_config.h" #include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" #include "av1/encoder/av1_fwd_txfm1d_cfg.h" #include "aom_dsp/txfm_common.h" #include "aom_ports/mem.h" #include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_dsp/x86/txfm_common_avx2.h" static inline void load_buffer_8x8_avx2(const int16_t *input, __m256i *out, int stride, int flipud, int fliplr, int shift) { __m128i out1[8]; if (!flipud) { out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); } else { out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); } if (!fliplr) { out[0] = _mm256_cvtepi16_epi32(out1[0]); out[1] = _mm256_cvtepi16_epi32(out1[1]); out[2] = _mm256_cvtepi16_epi32(out1[2]); out[3] = _mm256_cvtepi16_epi32(out1[3]); out[4] = _mm256_cvtepi16_epi32(out1[4]); out[5] = _mm256_cvtepi16_epi32(out1[5]); out[6] = _mm256_cvtepi16_epi32(out1[6]); out[7] = _mm256_cvtepi16_epi32(out1[7]); } else { out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0])); out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1])); out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2])); out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3])); out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4])); out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5])); out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6])); out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7])); } out[0] = _mm256_slli_epi32(out[0], shift); out[1] = _mm256_slli_epi32(out[1], shift); out[2] = _mm256_slli_epi32(out[2], shift); out[3] = _mm256_slli_epi32(out[3], shift); out[4] = _mm256_slli_epi32(out[4], shift); out[5] = _mm256_slli_epi32(out[5], shift); out[6] = _mm256_slli_epi32(out[6], shift); out[7] = _mm256_slli_epi32(out[7], shift); } static inline void col_txfm_8x8_rounding(__m256i *in, int shift) { const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); in[0] = _mm256_add_epi32(in[0], rounding); in[1] = _mm256_add_epi32(in[1], rounding); in[2] = _mm256_add_epi32(in[2], rounding); in[3] = _mm256_add_epi32(in[3], rounding); in[4] = _mm256_add_epi32(in[4], rounding); in[5] = _mm256_add_epi32(in[5], rounding); in[6] = _mm256_add_epi32(in[6], rounding); in[7] = _mm256_add_epi32(in[7], rounding); in[0] = _mm256_srai_epi32(in[0], shift); in[1] = _mm256_srai_epi32(in[1], shift); in[2] = _mm256_srai_epi32(in[2], shift); in[3] = _mm256_srai_epi32(in[3], shift); in[4] = _mm256_srai_epi32(in[4], shift); in[5] = _mm256_srai_epi32(in[5], shift); in[6] = _mm256_srai_epi32(in[6], shift); in[7] = _mm256_srai_epi32(in[7], shift); } static inline void load_buffer_8x16_avx2(const int16_t 
*input, __m256i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; const int16_t *botL = input + 8 * stride; const int16_t *tmp; if (flipud) { tmp = topL; topL = botL; botL = tmp; } load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); } static inline void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, int stride, int height, int outstride, int flipud, int fliplr) { __m256i out1[64]; if (!flipud) { for (int i = 0; i < height; i++) { out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); } } else { for (int i = 0; i < height; i++) { out1[(height - 1) - i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); } } if (!fliplr) { for (int i = 0; i < height; i++) { out[i * outstride] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); out[i * outstride + 1] = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); } } else { for (int i = 0; i < height; i++) { out[i * outstride + 1] = _mm256_cvtepi16_epi32( mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); out[i * outstride + 0] = _mm256_cvtepi16_epi32( mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); } } } static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, const int instride, const int outstride) { __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i x0, x1; u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]); u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); x0 = _mm256_unpacklo_epi64(u0, u2); x1 = _mm256_unpacklo_epi64(u4, u6); out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u0, u2); x1 = _mm256_unpackhi_epi64(u4, u6); out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpacklo_epi64(u1, u3); x1 = _mm256_unpacklo_epi64(u5, u7); out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); x0 = _mm256_unpackhi_epi64(u1, u3); x1 = _mm256_unpackhi_epi64(u5, u7); out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); } static inline void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, int stride) { if (bit < 0) { bit = -bit; __m256i round = _mm256_set1_epi32(1 << (bit - 1)); for (int i = 0; i < size; ++i) { in[stride * i] = _mm256_add_epi32(in[stride * i], round); in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); } } else if (bit > 0) { for (int i = 0; i < size; ++i) { in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); } } } static inline void store_buffer_avx2(const __m256i *const in, int32_t *out, const int stride, const int out_size) { for (int i = 0; i < out_size; ++i) { _mm256_store_si256((__m256i *)(out), in[i]); out += stride; } } static inline void fwd_txfm_transpose_16x16_avx2(const __m256i *in, __m256i *out) { fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); 
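// (Added note.) The 16x16 transpose is assembled from four 8x8 sub-transposes:
// the diagonal blocks stay in place while the two off-diagonal blocks swap,
// and the element stride of 2 reflects that each 16-wide row occupies two
// __m256i registers.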
fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); } static inline __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, const __m256i *w1, const __m256i *n1, const __m256i *rounding, int bit) { __m256i x, y; x = _mm256_mullo_epi32(*w0, *n0); y = _mm256_mullo_epi32(*w1, *n1); x = _mm256_add_epi32(x, y); x = _mm256_add_epi32(x, *rounding); x = _mm256_srai_epi32(x, bit); return x; } #define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ do { \ const __m256i ww0 = _mm256_set1_epi32(w0); \ const __m256i ww1 = _mm256_set1_epi32(w1); \ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ out0 = _mm256_add_epi32(in0_w0, in1_w1); \ round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ } while (0) #define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ do { \ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ out0 = _mm256_add_epi32(in0_w0, in1_w1); \ out0 = _mm256_add_epi32(out0, r); \ out0 = _mm256_srai_epi32(out0, bit); \ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ out1 = _mm256_add_epi32(out1, r); \ out1 = _mm256_srai_epi32(out1, bit); \ } while (0) typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, const int8_t cos_bit, int instride, int outstride); static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, const int col_num, const int outstride) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); __m256i u[8], v[8]; for (int col = 0; col < col_num; ++col) { u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); v[0] = _mm256_add_epi32(u[0], u[3]); v[3] = _mm256_sub_epi32(u[0], u[3]); v[1] = _mm256_add_epi32(u[1], u[2]); v[2] = _mm256_sub_epi32(u[1], u[2]); v[5] = _mm256_mullo_epi32(u[5], cospim32); v[6] = _mm256_mullo_epi32(u[6], cospi32); v[5] = _mm256_add_epi32(v[5], v[6]); v[5] = _mm256_add_epi32(v[5], rnding); v[5] = _mm256_srai_epi32(v[5], bit); u[0] = _mm256_mullo_epi32(u[5], cospi32); v[6] = _mm256_mullo_epi32(u[6], cospim32); v[6] = _mm256_sub_epi32(u[0], v[6]); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); // stage 3 // type 0 v[0] = _mm256_mullo_epi32(v[0], 
cospi32); v[1] = _mm256_mullo_epi32(v[1], cospi32); u[0] = _mm256_add_epi32(v[0], v[1]); u[0] = _mm256_add_epi32(u[0], rnding); u[0] = _mm256_srai_epi32(u[0], bit); u[1] = _mm256_sub_epi32(v[0], v[1]); u[1] = _mm256_add_epi32(u[1], rnding); u[1] = _mm256_srai_epi32(u[1], bit); // type 1 v[0] = _mm256_mullo_epi32(v[2], cospi48); v[1] = _mm256_mullo_epi32(v[3], cospi16); u[2] = _mm256_add_epi32(v[0], v[1]); u[2] = _mm256_add_epi32(u[2], rnding); u[2] = _mm256_srai_epi32(u[2], bit); v[0] = _mm256_mullo_epi32(v[2], cospi16); v[1] = _mm256_mullo_epi32(v[3], cospi48); u[3] = _mm256_sub_epi32(v[1], v[0]); u[3] = _mm256_add_epi32(u[3], rnding); u[3] = _mm256_srai_epi32(u[3], bit); u[4] = _mm256_add_epi32(v[4], v[5]); u[5] = _mm256_sub_epi32(v[4], v[5]); u[6] = _mm256_sub_epi32(v[7], v[6]); u[7] = _mm256_add_epi32(v[7], v[6]); // stage 4 // stage 5 v[0] = _mm256_mullo_epi32(u[4], cospi56); v[1] = _mm256_mullo_epi32(u[7], cospi8); v[0] = _mm256_add_epi32(v[0], v[1]); v[0] = _mm256_add_epi32(v[0], rnding); out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4] v[0] = _mm256_mullo_epi32(u[4], cospi8); v[1] = _mm256_mullo_epi32(u[7], cospi56); v[0] = _mm256_sub_epi32(v[1], v[0]); v[0] = _mm256_add_epi32(v[0], rnding); out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7] v[0] = _mm256_mullo_epi32(u[5], cospi24); v[1] = _mm256_mullo_epi32(u[6], cospi40); v[0] = _mm256_add_epi32(v[0], v[1]); v[0] = _mm256_add_epi32(v[0], rnding); out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5] v[0] = _mm256_mullo_epi32(u[5], cospi40); v[1] = _mm256_mullo_epi32(u[6], cospi24); v[0] = _mm256_sub_epi32(v[1], v[0]); v[0] = _mm256_add_epi32(v[0], rnding); out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6] out[0 * outstride + col] = u[0]; // buf0[0] out[4 * outstride + col] = u[1]; // buf0[1] out[2 * outstride + col] = u[2]; // buf0[2] out[6 * outstride + col] = u[3]; // buf0[3] } } static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit, const int col_num, const int outstirde) { (void)col_num; const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const __m256i zero = _mm256_setzero_si256(); __m256i u0, u1, u2, u3, u4, u5, u6, u7; __m256i v0, v1, v2, v3, v4, v5, v6, v7; __m256i x, y; for (int col = 0; col < col_num; ++col) { u0 = in[0 * col_num + col]; u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); u3 = in[4 * col_num + col]; u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); u5 = in[6 * col_num + col]; u6 = in[2 * col_num + col]; u7 = 
_mm256_sub_epi32(zero, in[5 * col_num + col]); // stage 2 v0 = u0; v1 = u1; x = _mm256_mullo_epi32(u2, cospi32); y = _mm256_mullo_epi32(u3, cospi32); v2 = _mm256_add_epi32(x, y); v2 = _mm256_add_epi32(v2, rnding); v2 = _mm256_srai_epi32(v2, bit); v3 = _mm256_sub_epi32(x, y); v3 = _mm256_add_epi32(v3, rnding); v3 = _mm256_srai_epi32(v3, bit); v4 = u4; v5 = u5; x = _mm256_mullo_epi32(u6, cospi32); y = _mm256_mullo_epi32(u7, cospi32); v6 = _mm256_add_epi32(x, y); v6 = _mm256_add_epi32(v6, rnding); v6 = _mm256_srai_epi32(v6, bit); v7 = _mm256_sub_epi32(x, y); v7 = _mm256_add_epi32(v7, rnding); v7 = _mm256_srai_epi32(v7, bit); // stage 3 u0 = _mm256_add_epi32(v0, v2); u1 = _mm256_add_epi32(v1, v3); u2 = _mm256_sub_epi32(v0, v2); u3 = _mm256_sub_epi32(v1, v3); u4 = _mm256_add_epi32(v4, v6); u5 = _mm256_add_epi32(v5, v7); u6 = _mm256_sub_epi32(v4, v6); u7 = _mm256_sub_epi32(v5, v7); // stage 4 v0 = u0; v1 = u1; v2 = u2; v3 = u3; x = _mm256_mullo_epi32(u4, cospi16); y = _mm256_mullo_epi32(u5, cospi48); v4 = _mm256_add_epi32(x, y); v4 = _mm256_add_epi32(v4, rnding); v4 = _mm256_srai_epi32(v4, bit); x = _mm256_mullo_epi32(u4, cospi48); y = _mm256_mullo_epi32(u5, cospim16); v5 = _mm256_add_epi32(x, y); v5 = _mm256_add_epi32(v5, rnding); v5 = _mm256_srai_epi32(v5, bit); x = _mm256_mullo_epi32(u6, cospim48); y = _mm256_mullo_epi32(u7, cospi16); v6 = _mm256_add_epi32(x, y); v6 = _mm256_add_epi32(v6, rnding); v6 = _mm256_srai_epi32(v6, bit); x = _mm256_mullo_epi32(u6, cospi16); y = _mm256_mullo_epi32(u7, cospi48); v7 = _mm256_add_epi32(x, y); v7 = _mm256_add_epi32(v7, rnding); v7 = _mm256_srai_epi32(v7, bit); // stage 5 u0 = _mm256_add_epi32(v0, v4); u1 = _mm256_add_epi32(v1, v5); u2 = _mm256_add_epi32(v2, v6); u3 = _mm256_add_epi32(v3, v7); u4 = _mm256_sub_epi32(v0, v4); u5 = _mm256_sub_epi32(v1, v5); u6 = _mm256_sub_epi32(v2, v6); u7 = _mm256_sub_epi32(v3, v7); // stage 6 x = _mm256_mullo_epi32(u0, cospi4); y = _mm256_mullo_epi32(u1, cospi60); v0 = _mm256_add_epi32(x, y); v0 = _mm256_add_epi32(v0, rnding); v0 = _mm256_srai_epi32(v0, bit); x = _mm256_mullo_epi32(u0, cospi60); y = _mm256_mullo_epi32(u1, cospim4); v1 = _mm256_add_epi32(x, y); v1 = _mm256_add_epi32(v1, rnding); v1 = _mm256_srai_epi32(v1, bit); x = _mm256_mullo_epi32(u2, cospi20); y = _mm256_mullo_epi32(u3, cospi44); v2 = _mm256_add_epi32(x, y); v2 = _mm256_add_epi32(v2, rnding); v2 = _mm256_srai_epi32(v2, bit); x = _mm256_mullo_epi32(u2, cospi44); y = _mm256_mullo_epi32(u3, cospim20); v3 = _mm256_add_epi32(x, y); v3 = _mm256_add_epi32(v3, rnding); v3 = _mm256_srai_epi32(v3, bit); x = _mm256_mullo_epi32(u4, cospi36); y = _mm256_mullo_epi32(u5, cospi28); v4 = _mm256_add_epi32(x, y); v4 = _mm256_add_epi32(v4, rnding); v4 = _mm256_srai_epi32(v4, bit); x = _mm256_mullo_epi32(u4, cospi28); y = _mm256_mullo_epi32(u5, cospim36); v5 = _mm256_add_epi32(x, y); v5 = _mm256_add_epi32(v5, rnding); v5 = _mm256_srai_epi32(v5, bit); x = _mm256_mullo_epi32(u6, cospi52); y = _mm256_mullo_epi32(u7, cospi12); v6 = _mm256_add_epi32(x, y); v6 = _mm256_add_epi32(v6, rnding); v6 = _mm256_srai_epi32(v6, bit); x = _mm256_mullo_epi32(u6, cospi12); y = _mm256_mullo_epi32(u7, cospim52); v7 = _mm256_add_epi32(x, y); v7 = _mm256_add_epi32(v7, rnding); v7 = _mm256_srai_epi32(v7, bit); // stage 7 out[0 * outstirde + col] = v1; out[1 * outstirde + col] = v6; out[2 * outstirde + col] = v3; out[3 * outstirde + col] = v4; out[4 * outstirde + col] = v5; out[5 * outstirde + col] = v2; out[6 * outstirde + col] = v7; out[7 * outstirde + col] = v0; } } static void 
idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num, int outstride) { (void)bit; (void)outstride; int num_iters = 8 * col_num; for (int i = 0; i < num_iters; i += 8) { out[i] = _mm256_add_epi32(in[i], in[i]); out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]); out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]); out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]); out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]); out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]); out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]); out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]); } } void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m256i in[8], out[8]; const TX_SIZE tx_size = TX_8X8; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int width = tx_size_wide[tx_size]; const int width_div8 = (width >> 3); switch (tx_type) { case DCT_DCT: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case ADST_DCT: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case DCT_ADST: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case ADST_ADST: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case FLIPADST_DCT: load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case DCT_FLIPADST: load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case FLIPADST_FLIPADST: load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); 
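// (Added note.) Every case of this switch follows the same skeleton: load the
// 8x8 block (flipping vertically and/or horizontally for the *FLIPADST types),
// run the column transform, round by shift[1], transpose, run the row
// transform and store; only the pair of 1-D transforms and the flip flags
// differ between tx_types.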
fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case ADST_FLIPADST: load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case FLIPADST_ADST: load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case IDTX: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case V_DCT: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case H_DCT: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case V_ADST: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case H_ADST: load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case V_FLIPADST: load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; case H_FLIPADST: load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); col_txfm_8x8_rounding(out, -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); fadst8_avx2(in, 
out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 8); break; default: assert(0); } (void)bd; } static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, const int col_num, const int outstride) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); __m256i u[16], v[16], x; int col; // Calculate the column 0, 1, 2, 3 for (col = 0; col < col_num; ++col) { // stage 0 // stage 1 u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); // stage 2 v[0] = _mm256_add_epi32(u[0], u[7]); v[7] = _mm256_sub_epi32(u[0], u[7]); v[1] = _mm256_add_epi32(u[1], u[6]); v[6] = _mm256_sub_epi32(u[1], u[6]); v[2] = _mm256_add_epi32(u[2], u[5]); v[5] = _mm256_sub_epi32(u[2], u[5]); v[3] = _mm256_add_epi32(u[3], u[4]); v[4] = _mm256_sub_epi32(u[3], u[4]); v[8] = u[8]; v[9] = u[9]; v[10] = _mm256_mullo_epi32(u[10], cospim32); x = _mm256_mullo_epi32(u[13], cospi32); v[10] = _mm256_add_epi32(v[10], x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[13] = _mm256_mullo_epi32(u[10], cospi32); x = _mm256_mullo_epi32(u[13], cospim32); v[13] = _mm256_sub_epi32(v[13], x); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[11] = _mm256_mullo_epi32(u[11], cospim32); x = _mm256_mullo_epi32(u[12], cospi32); v[11] = _mm256_add_epi32(v[11], x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] 
= _mm256_mullo_epi32(u[11], cospi32); x = _mm256_mullo_epi32(u[12], cospim32); v[12] = _mm256_sub_epi32(v[12], x); v[12] = _mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); v[14] = u[14]; v[15] = u[15]; // stage 3 u[0] = _mm256_add_epi32(v[0], v[3]); u[3] = _mm256_sub_epi32(v[0], v[3]); u[1] = _mm256_add_epi32(v[1], v[2]); u[2] = _mm256_sub_epi32(v[1], v[2]); u[4] = v[4]; u[5] = _mm256_mullo_epi32(v[5], cospim32); x = _mm256_mullo_epi32(v[6], cospi32); u[5] = _mm256_add_epi32(u[5], x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_mullo_epi32(v[5], cospi32); x = _mm256_mullo_epi32(v[6], cospim32); u[6] = _mm256_sub_epi32(u[6], x); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[7] = v[7]; u[8] = _mm256_add_epi32(v[8], v[11]); u[11] = _mm256_sub_epi32(v[8], v[11]); u[9] = _mm256_add_epi32(v[9], v[10]); u[10] = _mm256_sub_epi32(v[9], v[10]); u[12] = _mm256_sub_epi32(v[15], v[12]); u[15] = _mm256_add_epi32(v[15], v[12]); u[13] = _mm256_sub_epi32(v[14], v[13]); u[14] = _mm256_add_epi32(v[14], v[13]); // stage 4 u[0] = _mm256_mullo_epi32(u[0], cospi32); u[1] = _mm256_mullo_epi32(u[1], cospi32); v[0] = _mm256_add_epi32(u[0], u[1]); v[0] = _mm256_add_epi32(v[0], rnding); v[0] = _mm256_srai_epi32(v[0], bit); v[1] = _mm256_sub_epi32(u[0], u[1]); v[1] = _mm256_add_epi32(v[1], rnding); v[1] = _mm256_srai_epi32(v[1], bit); v[2] = _mm256_mullo_epi32(u[2], cospi48); x = _mm256_mullo_epi32(u[3], cospi16); v[2] = _mm256_add_epi32(v[2], x); v[2] = _mm256_add_epi32(v[2], rnding); v[2] = _mm256_srai_epi32(v[2], bit); v[3] = _mm256_mullo_epi32(u[2], cospi16); x = _mm256_mullo_epi32(u[3], cospi48); v[3] = _mm256_sub_epi32(x, v[3]); v[3] = _mm256_add_epi32(v[3], rnding); v[3] = _mm256_srai_epi32(v[3], bit); v[4] = _mm256_add_epi32(u[4], u[5]); v[5] = _mm256_sub_epi32(u[4], u[5]); v[6] = _mm256_sub_epi32(u[7], u[6]); v[7] = _mm256_add_epi32(u[7], u[6]); v[8] = u[8]; v[9] = _mm256_mullo_epi32(u[9], cospim16); x = _mm256_mullo_epi32(u[14], cospi48); v[9] = _mm256_add_epi32(v[9], x); v[9] = _mm256_add_epi32(v[9], rnding); v[9] = _mm256_srai_epi32(v[9], bit); v[14] = _mm256_mullo_epi32(u[9], cospi48); x = _mm256_mullo_epi32(u[14], cospim16); v[14] = _mm256_sub_epi32(v[14], x); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[10] = _mm256_mullo_epi32(u[10], cospim48); x = _mm256_mullo_epi32(u[13], cospim16); v[10] = _mm256_add_epi32(v[10], x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[13] = _mm256_mullo_epi32(u[10], cospim16); x = _mm256_mullo_epi32(u[13], cospim48); v[13] = _mm256_sub_epi32(v[13], x); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[11] = u[11]; v[12] = u[12]; v[15] = u[15]; // stage 5 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm256_mullo_epi32(v[4], cospi56); x = _mm256_mullo_epi32(v[7], cospi8); u[4] = _mm256_add_epi32(u[4], x); u[4] = _mm256_add_epi32(u[4], rnding); u[4] = _mm256_srai_epi32(u[4], bit); u[7] = _mm256_mullo_epi32(v[4], cospi8); x = _mm256_mullo_epi32(v[7], cospi56); u[7] = _mm256_sub_epi32(x, u[7]); u[7] = _mm256_add_epi32(u[7], rnding); u[7] = _mm256_srai_epi32(u[7], bit); u[5] = _mm256_mullo_epi32(v[5], cospi24); x = _mm256_mullo_epi32(v[6], cospi40); u[5] = _mm256_add_epi32(u[5], x); u[5] = _mm256_add_epi32(u[5], rnding); u[5] = _mm256_srai_epi32(u[5], bit); u[6] = _mm256_mullo_epi32(v[5], cospi40); x = _mm256_mullo_epi32(v[6], cospi24); u[6] = 
_mm256_sub_epi32(x, u[6]); u[6] = _mm256_add_epi32(u[6], rnding); u[6] = _mm256_srai_epi32(u[6], bit); u[8] = _mm256_add_epi32(v[8], v[9]); u[9] = _mm256_sub_epi32(v[8], v[9]); u[10] = _mm256_sub_epi32(v[11], v[10]); u[11] = _mm256_add_epi32(v[11], v[10]); u[12] = _mm256_add_epi32(v[12], v[13]); u[13] = _mm256_sub_epi32(v[12], v[13]); u[14] = _mm256_sub_epi32(v[15], v[14]); u[15] = _mm256_add_epi32(v[15], v[14]); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = _mm256_mullo_epi32(u[8], cospi60); x = _mm256_mullo_epi32(u[15], cospi4); v[8] = _mm256_add_epi32(v[8], x); v[8] = _mm256_add_epi32(v[8], rnding); v[8] = _mm256_srai_epi32(v[8], bit); v[15] = _mm256_mullo_epi32(u[8], cospi4); x = _mm256_mullo_epi32(u[15], cospi60); v[15] = _mm256_sub_epi32(x, v[15]); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); v[9] = _mm256_mullo_epi32(u[9], cospi28); x = _mm256_mullo_epi32(u[14], cospi36); v[9] = _mm256_add_epi32(v[9], x); v[9] = _mm256_add_epi32(v[9], rnding); v[9] = _mm256_srai_epi32(v[9], bit); v[14] = _mm256_mullo_epi32(u[9], cospi36); x = _mm256_mullo_epi32(u[14], cospi28); v[14] = _mm256_sub_epi32(x, v[14]); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[10] = _mm256_mullo_epi32(u[10], cospi44); x = _mm256_mullo_epi32(u[13], cospi20); v[10] = _mm256_add_epi32(v[10], x); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[13] = _mm256_mullo_epi32(u[10], cospi20); x = _mm256_mullo_epi32(u[13], cospi44); v[13] = _mm256_sub_epi32(x, v[13]); v[13] = _mm256_add_epi32(v[13], rnding); v[13] = _mm256_srai_epi32(v[13], bit); v[11] = _mm256_mullo_epi32(u[11], cospi12); x = _mm256_mullo_epi32(u[12], cospi52); v[11] = _mm256_add_epi32(v[11], x); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = _mm256_mullo_epi32(u[11], cospi52); x = _mm256_mullo_epi32(u[12], cospi12); v[12] = _mm256_sub_epi32(x, v[12]); v[12] = _mm256_add_epi32(v[12], rnding); v[12] = _mm256_srai_epi32(v[12], bit); out[0 * outstride + col] = v[0]; out[1 * outstride + col] = v[8]; out[2 * outstride + col] = v[4]; out[3 * outstride + col] = v[12]; out[4 * outstride + col] = v[2]; out[5 * outstride + col] = v[10]; out[6 * outstride + col] = v[6]; out[7 * outstride + col] = v[14]; out[8 * outstride + col] = v[1]; out[9 * outstride + col] = v[9]; out[10 * outstride + col] = v[5]; out[11 * outstride + col] = v[13]; out[12 * outstride + col] = v[3]; out[13 * outstride + col] = v[11]; out[14 * outstride + col] = v[7]; out[15 * outstride + col] = v[15]; } } static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, const int num_cols, const int outstride) { const int32_t *cospi = cospi_arr(bit); const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); const __m256i cospi2 = 
_mm256_set1_epi32(cospi[2]); const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); const __m256i zero = _mm256_setzero_si256(); __m256i u[16], v[16], x, y; int col; for (col = 0; col < num_cols; ++col) { // stage 0 // stage 1 u[0] = in[0 * num_cols + col]; u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]); u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); u[3] = in[8 * num_cols + col]; u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); u[5] = in[12 * num_cols + col]; u[6] = in[4 * num_cols + col]; u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); u[9] = in[14 * num_cols + col]; u[10] = in[6 * num_cols + col]; u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); u[12] = in[2 * num_cols + col]; u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); u[15] = in[10 * num_cols + col]; // stage 2 v[0] = u[0]; v[1] = u[1]; x = _mm256_mullo_epi32(u[2], cospi32); y = _mm256_mullo_epi32(u[3], cospi32); v[2] = _mm256_add_epi32(x, y); v[2] = _mm256_add_epi32(v[2], rnding); v[2] = _mm256_srai_epi32(v[2], bit); v[3] = _mm256_sub_epi32(x, y); v[3] = _mm256_add_epi32(v[3], rnding); v[3] = _mm256_srai_epi32(v[3], bit); v[4] = u[4]; v[5] = u[5]; x = _mm256_mullo_epi32(u[6], cospi32); y = _mm256_mullo_epi32(u[7], cospi32); v[6] = _mm256_add_epi32(x, y); v[6] = _mm256_add_epi32(v[6], rnding); v[6] = _mm256_srai_epi32(v[6], bit); v[7] = _mm256_sub_epi32(x, y); v[7] = _mm256_add_epi32(v[7], rnding); v[7] = _mm256_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; x = _mm256_mullo_epi32(u[10], cospi32); y = _mm256_mullo_epi32(u[11], cospi32); v[10] = _mm256_add_epi32(x, y); v[10] = _mm256_add_epi32(v[10], rnding); v[10] = _mm256_srai_epi32(v[10], bit); v[11] = _mm256_sub_epi32(x, y); v[11] = _mm256_add_epi32(v[11], rnding); v[11] = _mm256_srai_epi32(v[11], bit); v[12] = u[12]; v[13] = u[13]; x = _mm256_mullo_epi32(u[14], cospi32); y = _mm256_mullo_epi32(u[15], cospi32); v[14] = _mm256_add_epi32(x, y); v[14] = _mm256_add_epi32(v[14], rnding); v[14] = _mm256_srai_epi32(v[14], bit); v[15] = _mm256_sub_epi32(x, y); v[15] = _mm256_add_epi32(v[15], rnding); v[15] = _mm256_srai_epi32(v[15], bit); // stage 3 u[0] = _mm256_add_epi32(v[0], v[2]); u[1] = 
_mm256_add_epi32(v[1], v[3]); u[2] = _mm256_sub_epi32(v[0], v[2]); u[3] = _mm256_sub_epi32(v[1], v[3]); u[4] = _mm256_add_epi32(v[4], v[6]); u[5] = _mm256_add_epi32(v[5], v[7]); u[6] = _mm256_sub_epi32(v[4], v[6]); u[7] = _mm256_sub_epi32(v[5], v[7]); u[8] = _mm256_add_epi32(v[8], v[10]); u[9] = _mm256_add_epi32(v[9], v[11]); u[10] = _mm256_sub_epi32(v[8], v[10]); u[11] = _mm256_sub_epi32(v[9], v[11]); u[12] = _mm256_add_epi32(v[12], v[14]); u[13] = _mm256_add_epi32(v[13], v[15]); u[14] = _mm256_sub_epi32(v[12], v[14]); u[15] = _mm256_sub_epi32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); v[13] = av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); v[14] = av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // stage 5 u[0] = _mm256_add_epi32(v[0], v[4]); u[1] = _mm256_add_epi32(v[1], v[5]); u[2] = _mm256_add_epi32(v[2], v[6]); u[3] = _mm256_add_epi32(v[3], v[7]); u[4] = _mm256_sub_epi32(v[0], v[4]); u[5] = _mm256_sub_epi32(v[1], v[5]); u[6] = _mm256_sub_epi32(v[2], v[6]); u[7] = _mm256_sub_epi32(v[3], v[7]); u[8] = _mm256_add_epi32(v[8], v[12]); u[9] = _mm256_add_epi32(v[9], v[13]); u[10] = _mm256_add_epi32(v[10], v[14]); u[11] = _mm256_add_epi32(v[11], v[15]); u[12] = _mm256_sub_epi32(v[8], v[12]); u[13] = _mm256_sub_epi32(v[9], v[13]); u[14] = _mm256_sub_epi32(v[10], v[14]); u[15] = _mm256_sub_epi32(v[11], v[15]); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); v[11] = av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); v[14] = av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); // stage 7 u[0] = _mm256_add_epi32(v[0], v[8]); u[1] = _mm256_add_epi32(v[1], v[9]); u[2] = _mm256_add_epi32(v[2], v[10]); u[3] = _mm256_add_epi32(v[3], v[11]); u[4] = _mm256_add_epi32(v[4], v[12]); u[5] = _mm256_add_epi32(v[5], v[13]); u[6] = _mm256_add_epi32(v[6], v[14]); u[7] = _mm256_add_epi32(v[7], v[15]); u[8] = _mm256_sub_epi32(v[0], v[8]); u[9] = _mm256_sub_epi32(v[1], v[9]); u[10] = _mm256_sub_epi32(v[2], v[10]); u[11] = _mm256_sub_epi32(v[3], v[11]); u[12] = _mm256_sub_epi32(v[4], v[12]); u[13] = _mm256_sub_epi32(v[5], v[13]); u[14] = _mm256_sub_epi32(v[6], v[14]); u[15] = _mm256_sub_epi32(v[7], v[15]); // stage 8 v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], 
&rnding, bit); v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); v[11] = av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); v[13] = av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 out[0 * outstride + col] = v[1]; out[1 * outstride + col] = v[14]; out[2 * outstride + col] = v[3]; out[3 * outstride + col] = v[12]; out[4 * outstride + col] = v[5]; out[5 * outstride + col] = v[10]; out[6 * outstride + col] = v[7]; out[7 * outstride + col] = v[8]; out[8 * outstride + col] = v[9]; out[9 * outstride + col] = v[6]; out[10 * outstride + col] = v[11]; out[11 * outstride + col] = v[4]; out[12 * outstride + col] = v[13]; out[13 * outstride + col] = v[2]; out[14 * outstride + col] = v[15]; out[15 * outstride + col] = v[0]; } } static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num, const int outstride) { (void)bit; (void)outstride; __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); __m256i a_low; int num_iters = 16 * col_num; for (int i = 0; i < num_iters; i++) { a_low = _mm256_mullo_epi32(in[i], fact); a_low = _mm256_add_epi32(a_low, offset); out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); } } static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { fdct16_avx2, // DCT_DCT fadst16_avx2, // ADST_DCT fdct16_avx2, // DCT_ADST fadst16_avx2, // ADST_ADST fadst16_avx2, // FLIPADST_DCT fdct16_avx2, // DCT_FLIPADST fadst16_avx2, // FLIPADST_FLIPADST fadst16_avx2, // ADST_FLIPADST fadst16_avx2, // FLIPADST_ADST idtx16_avx2, // IDTX fdct16_avx2, // V_DCT idtx16_avx2, // H_DCT fadst16_avx2, // V_ADST idtx16_avx2, // H_ADST fadst16_avx2, // V_FLIPADST idtx16_avx2 // H_FLIPADST }; static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { fdct8_avx2, // DCT_DCT fdct8_avx2, // ADST_DCT fadst8_avx2, // DCT_ADST fadst8_avx2, // ADST_ADST fdct8_avx2, // FLIPADST_DCT fadst8_avx2, // DCT_FLIPADST fadst8_avx2, // FLIPADST_FLIPADST fadst8_avx2, // ADST_FLIPADST fadst8_avx2, // FLIPADST_ADST idtx8_avx2, // IDTX idtx8_avx2, // V_DCT fdct8_avx2, // H_DCT idtx8_avx2, // V_ADST fadst8_avx2, // H_ADST idtx8_avx2, // V_FLIPADST fadst8_avx2 // H_FLIPADST }; void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m256i in[16], out[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; const int txw_idx = get_txw_idx(TX_8X16); const int txh_idx = get_txh_idx(TX_8X16); const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type]; const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); col_txfm(in, out, bit, 1, 1); 
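// Round the column output with -shift[1], transpose it as two 8x8 halves,
// apply the 8-point row transform, and rescale with the NewSqrt2 correction
// used for rectangular transform sizes before storing the coefficients.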
col_txfm_8x8_rounding(out, -shift[1]); col_txfm_8x8_rounding(&out[8], -shift[1]); fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); row_txfm(in, out, bit, 2, 2); round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2); store_buffer_avx2(in, coeff, 8, 16); (void)bd; } static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { fdct8_avx2, // DCT_DCT fadst8_avx2, // ADST_DCT fdct8_avx2, // DCT_ADST fadst8_avx2, // ADST_ADST fadst8_avx2, // FLIPADST_DCT fdct8_avx2, // DCT_FLIPADST fadst8_avx2, // FLIPADST_FLIPADST fadst8_avx2, // ADST_FLIPADST fadst8_avx2, // FLIPADST_ADST idtx8_avx2, // IDTX fdct8_avx2, // V_DCT idtx8_avx2, // H_DCT fadst8_avx2, // V_ADST idtx8_avx2, // H_ADST fadst8_avx2, // V_FLIPADST idtx8_avx2 // H_FLIPADST }; static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { fdct16_avx2, // DCT_DCT fdct16_avx2, // ADST_DCT fadst16_avx2, // DCT_ADST fadst16_avx2, // ADST_ADST fdct16_avx2, // FLIPADST_DCT fadst16_avx2, // DCT_FLIPADST fadst16_avx2, // FLIPADST_FLIPADST fadst16_avx2, // ADST_FLIPADST fadst16_avx2, // FLIPADST_ADST idtx16_avx2, // IDTX idtx16_avx2, // V_DCT fdct16_avx2, // H_DCT idtx16_avx2, // V_ADST fadst16_avx2, // H_ADST idtx16_avx2, // V_FLIPADST fadst16_avx2 // H_FLIPADST }; void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m256i in[16], out[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; const int txw_idx = get_txw_idx(TX_16X8); const int txh_idx = get_txh_idx(TX_16X8); const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); round_shift_32_8xn_avx2(in, 16, shift[0], 1); col_txfm(in, out, bit, 2, 2); round_shift_32_8xn_avx2(out, 16, shift[1], 1); fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); row_txfm(in, out, bit, 1, 1); round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2); store_buffer_avx2(out, coeff, 8, 16); (void)bd; } void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m256i in[32], out[32]; const TX_SIZE tx_size = TX_16X16; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const int width_div8 = (width >> 3); const int width_div16 = (width >> 4); const int size = (height << 1); switch (tx_type) { case DCT_DCT: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case ADST_DCT: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); 
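// ADST_DCT row pass: after the column fADST and the 16x16 transpose, the rows
// go through the 16-point DCT and the coefficients are stored.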
fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case DCT_ADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case ADST_ADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case FLIPADST_DCT: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case DCT_FLIPADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case FLIPADST_FLIPADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case ADST_FLIPADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case FLIPADST_ADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case IDTX: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 
width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case V_DCT: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case H_DCT: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case V_ADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case H_ADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case V_FLIPADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; case H_FLIPADST: load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); round_shift_32_8xn_avx2(in, size, shift[0], width_div16); idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, width_div8); round_shift_32_8xn_avx2(out, size, shift[1], width_div16); fwd_txfm_transpose_16x16_avx2(out, in); fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, width_div8); store_buffer_avx2(out, coeff, 8, 32); break; default: assert(0); } (void)bd; } static inline void fdct32_avx2(__m256i *input, __m256i *output, const int8_t cos_bit, const int instride, const int outstride) { __m256i buf0[32]; __m256i buf1[32]; const int32_t *cospi; int startidx = 0 * instride; int endidx = 31 * instride; // stage 0 // stage 1 buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[30] = _mm256_sub_epi32(input[startidx], 
input[endidx]); startidx += instride; endidx -= instride; buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); // stage 2 cospi = cospi_arr(cos_bit); buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]); buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], buf0[27], cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], buf0[25], cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], buf0[24], cos_bit); buf0[28] 
= buf1[28]; buf0[29] = buf1[29]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 3 cospi = cospi_arr(cos_bit); buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]); buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); buf1[8] = buf0[8]; buf1[9] = buf0[9]; btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], buf1[13], cos_bit); btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], buf1[12], cos_bit); buf1[14] = buf0[14]; buf1[15] = buf0[15]; buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); // stage 4 cospi = cospi_arr(cos_bit); buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]); buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); buf0[4] = buf1[4]; btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], cos_bit); buf0[7] = buf1[7]; buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); buf0[16] = buf1[16]; buf0[17] = buf1[17]; btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], buf0[29], cos_bit); btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], buf0[28], cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], buf0[27], cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); buf0[22] = buf1[22]; buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; // stage 5 cospi = cospi_arr(cos_bit); btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], cos_bit); btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], cos_bit); buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); buf1[8] = buf0[8]; btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], cos_bit); btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], buf1[13], cos_bit); buf1[11] = buf0[11]; buf1[12] = buf0[12]; buf1[15] = buf0[15]; buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]); buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); 
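// Stage 5 (continued): add/sub butterflies on terms 16..31; the cosine
// rotations for this half follow in stages 6 and 8.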
buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]); buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); // stage 6 cospi = cospi_arr(cos_bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], cos_bit); btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], cos_bit); buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); buf0[16] = buf1[16]; btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30], cos_bit); btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], buf0[29], cos_bit); buf0[19] = buf1[19]; buf0[20] = buf1[20]; btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], buf0[26], cos_bit); btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], buf0[25], cos_bit); buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[27] = buf1[27]; buf0[28] = buf1[28]; buf0[31] = buf1[31]; // stage 7 cospi = cospi_arr(cos_bit); buf1[0] = buf0[0]; buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; buf1[4] = buf0[4]; buf1[5] = buf0[5]; buf1[6] = buf0[6]; buf1[7] = buf0[7]; btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], cos_bit); btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], cos_bit); btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], buf1[13], cos_bit); btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], buf1[12], cos_bit); buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); // stage 8 cospi = cospi_arr(cos_bit); buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[13] = buf1[13]; buf0[14] = buf1[14]; 
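// Stage 8: terms 0..15 pass through unchanged while terms 16..31 receive
// their final cosine rotations; stage 9 then writes the coefficients out in
// bit-reversed index order.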
buf0[15] = buf1[15]; btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], cos_bit); btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17], buf0[30], cos_bit); btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], buf0[29], cos_bit); btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], buf0[28], cos_bit); btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], buf0[27], cos_bit); btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], buf0[26], cos_bit); btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], buf0[25], cos_bit); btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], cos_bit); startidx = 0 * outstride; endidx = 31 * outstride; // stage 9 output[startidx] = buf0[0]; output[endidx] = buf0[31]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[16]; output[endidx] = buf0[15]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[8]; output[endidx] = buf0[23]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[24]; output[endidx] = buf0[7]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[4]; output[endidx] = buf0[27]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[20]; output[endidx] = buf0[11]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[12]; output[endidx] = buf0[19]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[28]; output[endidx] = buf0[3]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[2]; output[endidx] = buf0[29]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[18]; output[endidx] = buf0[13]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[10]; output[endidx] = buf0[21]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[26]; output[endidx] = buf0[5]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[6]; output[endidx] = buf0[25]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[22]; output[endidx] = buf0[9]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[14]; output[endidx] = buf0[17]; startidx += outstride; endidx -= outstride; output[startidx] = buf0[30]; output[endidx] = buf0[1]; } static inline void idtx32x32_avx2(__m256i *input, __m256i *output, const int8_t cos_bit, int instride, int outstride) { (void)cos_bit; for (int i = 0; i < 32; i += 8) { output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); output[(i + 1) * outstride] = _mm256_slli_epi32(input[(i + 1) * instride], 2); output[(i + 2) * outstride] = _mm256_slli_epi32(input[(i + 2) * instride], 2); output[(i + 3) * outstride] = _mm256_slli_epi32(input[(i + 3) * instride], 2); output[(i + 4) * outstride] = _mm256_slli_epi32(input[(i + 4) * instride], 2); output[(i + 5) * outstride] = _mm256_slli_epi32(input[(i + 5) * instride], 2); output[(i + 6) * outstride] = _mm256_slli_epi32(input[(i + 6) * instride], 2); output[(i + 7) * outstride] = _mm256_slli_epi32(input[(i + 7) * instride], 2); } } static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = { fdct32_avx2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST idtx32x32_avx2, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const 
transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { fdct32_avx2, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST idtx32x32_avx2, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; __m256i buf0[128], buf1[128]; const int tx_size = TX_32X32; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; int r, c; const int width_div16 = (width >> 4); const int width_div8 = (width >> 3); for (int i = 0; i < width_div16; i++) { load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, width_div8); col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, width_div8); round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); } for (r = 0; r < height; r += 8) { for (c = 0; c < width_div8; c++) { fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], &buf1[c * 8 * width_div8 + (r >> 3)], width_div8, width_div8); } } for (int i = 0; i < width_div16; i++) { row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, width_div8); row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8, width_div8); round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8); } store_buffer_avx2(buf1, output, 8, 128); } static inline void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, __m256i *cospi_m32, __m256i *cospi_p32, const __m256i *__rounding, int8_t cos_bit) { x2[0] = _mm256_add_epi32(x1[0], x1[31]); x2[31] = _mm256_sub_epi32(x1[0], x1[31]); x2[1] = _mm256_add_epi32(x1[1], x1[30]); x2[30] = _mm256_sub_epi32(x1[1], x1[30]); x2[2] = _mm256_add_epi32(x1[2], x1[29]); x2[29] = _mm256_sub_epi32(x1[2], x1[29]); x2[3] = _mm256_add_epi32(x1[3], x1[28]); x2[28] = _mm256_sub_epi32(x1[3], x1[28]); x2[4] = _mm256_add_epi32(x1[4], x1[27]); x2[27] = _mm256_sub_epi32(x1[4], x1[27]); x2[5] = _mm256_add_epi32(x1[5], x1[26]); x2[26] = _mm256_sub_epi32(x1[5], x1[26]); x2[6] = _mm256_add_epi32(x1[6], x1[25]); x2[25] = _mm256_sub_epi32(x1[6], x1[25]); x2[7] = _mm256_add_epi32(x1[7], x1[24]); x2[24] = _mm256_sub_epi32(x1[7], x1[24]); x2[8] = _mm256_add_epi32(x1[8], x1[23]); x2[23] = _mm256_sub_epi32(x1[8], x1[23]); x2[9] = _mm256_add_epi32(x1[9], x1[22]); x2[22] = _mm256_sub_epi32(x1[9], x1[22]); x2[10] = _mm256_add_epi32(x1[10], x1[21]); x2[21] = _mm256_sub_epi32(x1[10], x1[21]); x2[11] = _mm256_add_epi32(x1[11], x1[20]); x2[20] = _mm256_sub_epi32(x1[11], x1[20]); x2[12] = _mm256_add_epi32(x1[12], x1[19]); x2[19] = _mm256_sub_epi32(x1[12], x1[19]); x2[13] = 
_mm256_add_epi32(x1[13], x1[18]); x2[18] = _mm256_sub_epi32(x1[13], x1[18]); x2[14] = _mm256_add_epi32(x1[14], x1[17]); x2[17] = _mm256_sub_epi32(x1[14], x1[17]); x2[15] = _mm256_add_epi32(x1[15], x1[16]); x2[16] = _mm256_sub_epi32(x1[15], x1[16]); x2[32] = x1[32]; x2[33] = x1[33]; x2[34] = x1[34]; x2[35] = x1[35]; x2[36] = x1[36]; x2[37] = x1[37]; x2[38] = x1[38]; x2[39] = x1[39]; btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], *__rounding, cos_bit); x2[56] = x1[56]; x2[57] = x1[57]; x2[58] = x1[58]; x2[59] = x1[59]; x2[60] = x1[60]; x2[61] = x1[61]; x2[62] = x1[62]; x2[63] = x1[63]; } static inline void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, __m256i *cospi_m32, __m256i *cospi_p32, const __m256i *__rounding, int8_t cos_bit) { x3[0] = _mm256_add_epi32(x2[0], x2[15]); x3[15] = _mm256_sub_epi32(x2[0], x2[15]); x3[1] = _mm256_add_epi32(x2[1], x2[14]); x3[14] = _mm256_sub_epi32(x2[1], x2[14]); x3[2] = _mm256_add_epi32(x2[2], x2[13]); x3[13] = _mm256_sub_epi32(x2[2], x2[13]); x3[3] = _mm256_add_epi32(x2[3], x2[12]); x3[12] = _mm256_sub_epi32(x2[3], x2[12]); x3[4] = _mm256_add_epi32(x2[4], x2[11]); x3[11] = _mm256_sub_epi32(x2[4], x2[11]); x3[5] = _mm256_add_epi32(x2[5], x2[10]); x3[10] = _mm256_sub_epi32(x2[5], x2[10]); x3[6] = _mm256_add_epi32(x2[6], x2[9]); x3[9] = _mm256_sub_epi32(x2[6], x2[9]); x3[7] = _mm256_add_epi32(x2[7], x2[8]); x3[8] = _mm256_sub_epi32(x2[7], x2[8]); x3[16] = x2[16]; x3[17] = x2[17]; x3[18] = x2[18]; x3[19] = x2[19]; btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], *__rounding, cos_bit); x3[28] = x2[28]; x3[29] = x2[29]; x3[30] = x2[30]; x3[31] = x2[31]; x3[32] = _mm256_add_epi32(x2[32], x2[47]); x3[47] = _mm256_sub_epi32(x2[32], x2[47]); x3[33] = _mm256_add_epi32(x2[33], x2[46]); x3[46] = _mm256_sub_epi32(x2[33], x2[46]); x3[34] = _mm256_add_epi32(x2[34], x2[45]); x3[45] = _mm256_sub_epi32(x2[34], x2[45]); x3[35] = _mm256_add_epi32(x2[35], x2[44]); x3[44] = _mm256_sub_epi32(x2[35], x2[44]); x3[36] = _mm256_add_epi32(x2[36], x2[43]); x3[43] = _mm256_sub_epi32(x2[36], x2[43]); x3[37] = _mm256_add_epi32(x2[37], x2[42]); x3[42] = _mm256_sub_epi32(x2[37], x2[42]); x3[38] = _mm256_add_epi32(x2[38], x2[41]); x3[41] = _mm256_sub_epi32(x2[38], x2[41]); x3[39] = _mm256_add_epi32(x2[39], x2[40]); x3[40] = _mm256_sub_epi32(x2[39], x2[40]); x3[48] = _mm256_sub_epi32(x2[63], x2[48]); x3[63] = _mm256_add_epi32(x2[63], x2[48]); x3[49] = _mm256_sub_epi32(x2[62], x2[49]); x3[62] = _mm256_add_epi32(x2[62], x2[49]); x3[50] = 
_mm256_sub_epi32(x2[61], x2[50]); x3[61] = _mm256_add_epi32(x2[61], x2[50]); x3[51] = _mm256_sub_epi32(x2[60], x2[51]); x3[60] = _mm256_add_epi32(x2[60], x2[51]); x3[52] = _mm256_sub_epi32(x2[59], x2[52]); x3[59] = _mm256_add_epi32(x2[59], x2[52]); x3[53] = _mm256_sub_epi32(x2[58], x2[53]); x3[58] = _mm256_add_epi32(x2[58], x2[53]); x3[54] = _mm256_sub_epi32(x2[57], x2[54]); x3[57] = _mm256_add_epi32(x2[57], x2[54]); x3[55] = _mm256_sub_epi32(x2[56], x2[55]); x3[56] = _mm256_add_epi32(x2[56], x2[55]); } static inline void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, const __m256i *__rounding, int8_t cos_bit) { x4[0] = _mm256_add_epi32(x3[0], x3[7]); x4[7] = _mm256_sub_epi32(x3[0], x3[7]); x4[1] = _mm256_add_epi32(x3[1], x3[6]); x4[6] = _mm256_sub_epi32(x3[1], x3[6]); x4[2] = _mm256_add_epi32(x3[2], x3[5]); x4[5] = _mm256_sub_epi32(x3[2], x3[5]); x4[3] = _mm256_add_epi32(x3[3], x3[4]); x4[4] = _mm256_sub_epi32(x3[3], x3[4]); x4[8] = x3[8]; x4[9] = x3[9]; btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], *__rounding, cos_bit); x4[14] = x3[14]; x4[15] = x3[15]; x4[16] = _mm256_add_epi32(x3[16], x3[23]); x4[23] = _mm256_sub_epi32(x3[16], x3[23]); x4[17] = _mm256_add_epi32(x3[17], x3[22]); x4[22] = _mm256_sub_epi32(x3[17], x3[22]); x4[18] = _mm256_add_epi32(x3[18], x3[21]); x4[21] = _mm256_sub_epi32(x3[18], x3[21]); x4[19] = _mm256_add_epi32(x3[19], x3[20]); x4[20] = _mm256_sub_epi32(x3[19], x3[20]); x4[24] = _mm256_sub_epi32(x3[31], x3[24]); x4[31] = _mm256_add_epi32(x3[31], x3[24]); x4[25] = _mm256_sub_epi32(x3[30], x3[25]); x4[30] = _mm256_add_epi32(x3[30], x3[25]); x4[26] = _mm256_sub_epi32(x3[29], x3[26]); x4[29] = _mm256_add_epi32(x3[29], x3[26]); x4[27] = _mm256_sub_epi32(x3[28], x3[27]); x4[28] = _mm256_add_epi32(x3[28], x3[27]); x4[32] = x3[32]; x4[33] = x3[33]; x4[34] = x3[34]; x4[35] = x3[35]; btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], *__rounding, cos_bit); x4[44] = x3[44]; x4[45] = x3[45]; x4[46] = x3[46]; x4[47] = x3[47]; x4[48] = x3[48]; x4[49] = x3[49]; x4[50] = x3[50]; x4[51] = x3[51]; x4[60] = x3[60]; x4[61] = x3[61]; x4[62] = x3[62]; x4[63] = x3[63]; } static inline void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, const __m256i *__rounding, int8_t cos_bit) { x5[0] = _mm256_add_epi32(x4[0], x4[3]); x5[3] = _mm256_sub_epi32(x4[0], x4[3]); x5[1] = _mm256_add_epi32(x4[1], x4[2]); x5[2] = _mm256_sub_epi32(x4[1], x4[2]); x5[4] = x4[4]; btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], *__rounding, cos_bit); 
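// fdct64 stage 5 (continued), mirroring the fdct32 structure: add/sub
// butterflies on the 8..15 and 32..63 groups and rotations by
// +-cospi[16]/+-cospi[48] inside the 16..31 group.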
x5[7] = x4[7]; x5[8] = _mm256_add_epi32(x4[8], x4[11]); x5[11] = _mm256_sub_epi32(x4[8], x4[11]); x5[9] = _mm256_add_epi32(x4[9], x4[10]); x5[10] = _mm256_sub_epi32(x4[9], x4[10]); x5[12] = _mm256_sub_epi32(x4[15], x4[12]); x5[15] = _mm256_add_epi32(x4[15], x4[12]); x5[13] = _mm256_sub_epi32(x4[14], x4[13]); x5[14] = _mm256_add_epi32(x4[14], x4[13]); x5[16] = x4[16]; x5[17] = x4[17]; btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], *__rounding, cos_bit); x5[22] = x4[22]; x5[23] = x4[23]; x5[24] = x4[24]; x5[25] = x4[25]; x5[30] = x4[30]; x5[31] = x4[31]; x5[32] = _mm256_add_epi32(x4[32], x4[39]); x5[39] = _mm256_sub_epi32(x4[32], x4[39]); x5[33] = _mm256_add_epi32(x4[33], x4[38]); x5[38] = _mm256_sub_epi32(x4[33], x4[38]); x5[34] = _mm256_add_epi32(x4[34], x4[37]); x5[37] = _mm256_sub_epi32(x4[34], x4[37]); x5[35] = _mm256_add_epi32(x4[35], x4[36]); x5[36] = _mm256_sub_epi32(x4[35], x4[36]); x5[40] = _mm256_sub_epi32(x4[47], x4[40]); x5[47] = _mm256_add_epi32(x4[47], x4[40]); x5[41] = _mm256_sub_epi32(x4[46], x4[41]); x5[46] = _mm256_add_epi32(x4[46], x4[41]); x5[42] = _mm256_sub_epi32(x4[45], x4[42]); x5[45] = _mm256_add_epi32(x4[45], x4[42]); x5[43] = _mm256_sub_epi32(x4[44], x4[43]); x5[44] = _mm256_add_epi32(x4[44], x4[43]); x5[48] = _mm256_add_epi32(x4[48], x4[55]); x5[55] = _mm256_sub_epi32(x4[48], x4[55]); x5[49] = _mm256_add_epi32(x4[49], x4[54]); x5[54] = _mm256_sub_epi32(x4[49], x4[54]); x5[50] = _mm256_add_epi32(x4[50], x4[53]); x5[53] = _mm256_sub_epi32(x4[50], x4[53]); x5[51] = _mm256_add_epi32(x4[51], x4[52]); x5[52] = _mm256_sub_epi32(x4[51], x4[52]); x5[56] = _mm256_sub_epi32(x4[63], x4[56]); x5[63] = _mm256_add_epi32(x4[63], x4[56]); x5[57] = _mm256_sub_epi32(x4[62], x4[57]); x5[62] = _mm256_add_epi32(x4[62], x4[57]); x5[58] = _mm256_sub_epi32(x4[61], x4[58]); x5[61] = _mm256_add_epi32(x4[61], x4[58]); x5[59] = _mm256_sub_epi32(x4[60], x4[59]); x5[60] = _mm256_add_epi32(x4[60], x4[59]); } static inline void fdct64_stage6_avx2( __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, const __m256i *__rounding, int8_t cos_bit) { btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], *__rounding, cos_bit); x6[4] = _mm256_add_epi32(x5[4], x5[5]); x6[5] = _mm256_sub_epi32(x5[4], x5[5]); x6[6] = _mm256_sub_epi32(x5[7], x5[6]); x6[7] = _mm256_add_epi32(x5[7], x5[6]); x6[8] = x5[8]; btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], *__rounding, cos_bit); x6[11] = x5[11]; x6[12] = x5[12]; x6[15] = x5[15]; x6[16] = _mm256_add_epi32(x5[16], x5[19]); x6[19] = _mm256_sub_epi32(x5[16], x5[19]); x6[17] = _mm256_add_epi32(x5[17], x5[18]); x6[18] = _mm256_sub_epi32(x5[17], x5[18]); x6[20] = _mm256_sub_epi32(x5[23], x5[20]); x6[23] = _mm256_add_epi32(x5[23], x5[20]); x6[21] = _mm256_sub_epi32(x5[22], x5[21]); x6[22] = 
_mm256_add_epi32(x5[22], x5[21]); x6[24] = _mm256_add_epi32(x5[24], x5[27]); x6[27] = _mm256_sub_epi32(x5[24], x5[27]); x6[25] = _mm256_add_epi32(x5[25], x5[26]); x6[26] = _mm256_sub_epi32(x5[25], x5[26]); x6[28] = _mm256_sub_epi32(x5[31], x5[28]); x6[31] = _mm256_add_epi32(x5[31], x5[28]); x6[29] = _mm256_sub_epi32(x5[30], x5[29]); x6[30] = _mm256_add_epi32(x5[30], x5[29]); x6[32] = x5[32]; x6[33] = x5[33]; btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], *__rounding, cos_bit); x6[38] = x5[38]; x6[39] = x5[39]; x6[40] = x5[40]; x6[41] = x5[41]; btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], *__rounding, cos_bit); x6[46] = x5[46]; x6[47] = x5[47]; x6[48] = x5[48]; x6[49] = x5[49]; x6[54] = x5[54]; x6[55] = x5[55]; x6[56] = x5[56]; x6[57] = x5[57]; x6[62] = x5[62]; x6[63] = x5[63]; } static inline void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56, __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08, __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_m24, const __m256i *__rounding, int8_t cos_bit) { x7[0] = x6[0]; x7[1] = x6[1]; x7[2] = x6[2]; x7[3] = x6[3]; btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], *__rounding, cos_bit); x7[8] = _mm256_add_epi32(x6[8], x6[9]); x7[9] = _mm256_sub_epi32(x6[8], x6[9]); x7[10] = _mm256_sub_epi32(x6[11], x6[10]); x7[11] = _mm256_add_epi32(x6[11], x6[10]); x7[12] = _mm256_add_epi32(x6[12], x6[13]); x7[13] = _mm256_sub_epi32(x6[12], x6[13]); x7[14] = _mm256_sub_epi32(x6[15], x6[14]); x7[15] = _mm256_add_epi32(x6[15], x6[14]); x7[16] = x6[16]; btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], *__rounding, cos_bit); x7[19] = x6[19]; x7[20] = x6[20]; btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], *__rounding, cos_bit); btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], *__rounding, cos_bit); x7[23] = x6[23]; x7[24] = x6[24]; x7[27] = x6[27]; x7[28] = x6[28]; x7[31] = x6[31]; x7[32] = _mm256_add_epi32(x6[32], x6[35]); x7[35] = _mm256_sub_epi32(x6[32], x6[35]); x7[33] = _mm256_add_epi32(x6[33], x6[34]); x7[34] = _mm256_sub_epi32(x6[33], x6[34]); x7[36] = _mm256_sub_epi32(x6[39], x6[36]); x7[39] = _mm256_add_epi32(x6[39], x6[36]); x7[37] = _mm256_sub_epi32(x6[38], x6[37]); x7[38] = _mm256_add_epi32(x6[38], x6[37]); x7[40] = _mm256_add_epi32(x6[40], x6[43]); x7[43] = _mm256_sub_epi32(x6[40], x6[43]); x7[41] = _mm256_add_epi32(x6[41], x6[42]); x7[42] = _mm256_sub_epi32(x6[41], x6[42]); x7[44] = _mm256_sub_epi32(x6[47], x6[44]); x7[47] = _mm256_add_epi32(x6[47], x6[44]); x7[45] = _mm256_sub_epi32(x6[46], x6[45]); x7[46] = _mm256_add_epi32(x6[46], x6[45]); 
x7[48] = _mm256_add_epi32(x6[48], x6[51]); x7[51] = _mm256_sub_epi32(x6[48], x6[51]); x7[49] = _mm256_add_epi32(x6[49], x6[50]); x7[50] = _mm256_sub_epi32(x6[49], x6[50]); x7[52] = _mm256_sub_epi32(x6[55], x6[52]); x7[55] = _mm256_add_epi32(x6[55], x6[52]); x7[53] = _mm256_sub_epi32(x6[54], x6[53]); x7[54] = _mm256_add_epi32(x6[54], x6[53]); x7[56] = _mm256_add_epi32(x6[56], x6[59]); x7[59] = _mm256_sub_epi32(x6[56], x6[59]); x7[57] = _mm256_add_epi32(x6[57], x6[58]); x7[58] = _mm256_sub_epi32(x6[57], x6[58]); x7[60] = _mm256_sub_epi32(x6[63], x6[60]); x7[63] = _mm256_add_epi32(x6[63], x6[60]); x7[61] = _mm256_sub_epi32(x6[62], x6[61]); x7[62] = _mm256_add_epi32(x6[62], x6[61]); } static inline void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); x8[0] = x7[0]; x8[1] = x7[1]; x8[2] = x7[2]; x8[3] = x7[3]; x8[4] = x7[4]; x8[5] = x7[5]; x8[6] = x7[6]; x8[7] = x7[7]; btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], *__rounding, cos_bit); x8[16] = _mm256_add_epi32(x7[16], x7[17]); x8[17] = _mm256_sub_epi32(x7[16], x7[17]); x8[18] = _mm256_sub_epi32(x7[19], x7[18]); x8[19] = _mm256_add_epi32(x7[19], x7[18]); x8[20] = _mm256_add_epi32(x7[20], x7[21]); x8[21] = _mm256_sub_epi32(x7[20], x7[21]); x8[22] = _mm256_sub_epi32(x7[23], x7[22]); x8[23] = _mm256_add_epi32(x7[23], x7[22]); x8[24] = _mm256_add_epi32(x7[24], x7[25]); x8[25] = _mm256_sub_epi32(x7[24], x7[25]); x8[26] = _mm256_sub_epi32(x7[27], x7[26]); x8[27] = _mm256_add_epi32(x7[27], x7[26]); x8[28] = _mm256_add_epi32(x7[28], x7[29]); x8[29] = _mm256_sub_epi32(x7[28], x7[29]); x8[30] = _mm256_sub_epi32(x7[31], x7[30]); x8[31] = _mm256_add_epi32(x7[31], x7[30]); x8[32] = x7[32]; btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], *__rounding, cos_bit); x8[35] = x7[35]; x8[36] = x7[36]; btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], *__rounding, cos_bit); x8[39] = x7[39]; x8[40] = x7[40]; btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], *__rounding, cos_bit); x8[43] = x7[43]; x8[44] = x7[44]; 
btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], *__rounding, cos_bit); x8[47] = x7[47]; x8[48] = x7[48]; x8[51] = x7[51]; x8[52] = x7[52]; x8[55] = x7[55]; x8[56] = x7[56]; x8[59] = x7[59]; x8[60] = x7[60]; x8[63] = x7[63]; } static inline void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); x9[0] = x8[0]; x9[1] = x8[1]; x9[2] = x8[2]; x9[3] = x8[3]; x9[4] = x8[4]; x9[5] = x8[5]; x9[6] = x8[6]; x9[7] = x8[7]; x9[8] = x8[8]; x9[9] = x8[9]; x9[10] = x8[10]; x9[11] = x8[11]; x9[12] = x8[12]; x9[13] = x8[13]; x9[14] = x8[14]; x9[15] = x8[15]; btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], *__rounding, cos_bit); x9[32] = _mm256_add_epi32(x8[32], x8[33]); x9[33] = _mm256_sub_epi32(x8[32], x8[33]); x9[34] = _mm256_sub_epi32(x8[35], x8[34]); x9[35] = _mm256_add_epi32(x8[35], x8[34]); x9[36] = _mm256_add_epi32(x8[36], x8[37]); x9[37] = _mm256_sub_epi32(x8[36], x8[37]); x9[38] = _mm256_sub_epi32(x8[39], x8[38]); x9[39] = _mm256_add_epi32(x8[39], x8[38]); x9[40] = _mm256_add_epi32(x8[40], x8[41]); x9[41] = _mm256_sub_epi32(x8[40], x8[41]); x9[42] = _mm256_sub_epi32(x8[43], x8[42]); x9[43] = _mm256_add_epi32(x8[43], x8[42]); x9[44] = _mm256_add_epi32(x8[44], x8[45]); x9[45] = _mm256_sub_epi32(x8[44], x8[45]); x9[46] = _mm256_sub_epi32(x8[47], x8[46]); x9[47] = _mm256_add_epi32(x8[47], x8[46]); x9[48] = _mm256_add_epi32(x8[48], x8[49]); x9[49] = _mm256_sub_epi32(x8[48], x8[49]); x9[50] = _mm256_sub_epi32(x8[51], x8[50]); x9[51] = _mm256_add_epi32(x8[51], x8[50]); x9[52] = _mm256_add_epi32(x8[52], x8[53]); x9[53] = _mm256_sub_epi32(x8[52], x8[53]); x9[54] = _mm256_sub_epi32(x8[55], x8[54]); x9[55] = _mm256_add_epi32(x8[55], x8[54]); x9[56] = _mm256_add_epi32(x8[56], x8[57]); x9[57] = _mm256_sub_epi32(x8[56], x8[57]); x9[58] = _mm256_sub_epi32(x8[59], x8[58]); x9[59] = _mm256_add_epi32(x8[59], x8[58]); x9[60] = 
_mm256_add_epi32(x8[60], x8[61]); x9[61] = _mm256_sub_epi32(x8[60], x8[61]); x9[62] = _mm256_sub_epi32(x8[63], x8[62]); x9[63] = _mm256_add_epi32(x8[63], x8[62]); } static inline void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, const int32_t *cospi, const __m256i *__rounding, int8_t cos_bit) { __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); x10[0] = x9[0]; x10[1] = x9[1]; x10[2] = x9[2]; x10[3] = x9[3]; x10[4] = x9[4]; x10[5] = x9[5]; x10[6] = x9[6]; x10[7] = x9[7]; x10[8] = x9[8]; x10[9] = x9[9]; x10[10] = x9[10]; x10[11] = x9[11]; x10[12] = x9[12]; x10[13] = x9[13]; x10[14] = x9[14]; x10[15] = x9[15]; x10[16] = x9[16]; x10[17] = x9[17]; x10[18] = x9[18]; x10[19] = x9[19]; x10[20] = x9[20]; x10[21] = x9[21]; x10[22] = x9[22]; x10[23] = x9[23]; x10[24] = x9[24]; x10[25] = x9[25]; x10[26] = x9[26]; x10[27] = x9[27]; x10[28] = x9[28]; x10[29] = x9[29]; x10[30] = x9[30]; x10[31] = x9[31]; btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], *__rounding, 
cos_bit); btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], *__rounding, cos_bit); btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48], *__rounding, cos_bit); } static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, const int instride, const int outstride) { const int32_t *cospi = cospi_arr(cos_bit); const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); int startidx = 0 * instride; int endidx = 63 * instride; // stage 1 __m256i x1[64]; x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); x1[51] = _mm256_sub_epi32(input[startidx], 
input[endidx]); startidx += instride; endidx -= instride; x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); startidx += instride; endidx -= instride; x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); // stage 2 __m256i x2[64]; fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); // stage 3 fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); // stage 4 fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, &cospi_m48, &__rounding, cos_bit); // stage 5 fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, &cospi_m48, &__rounding, cos_bit); // stage 6 fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, 
&cospi_m16, &cospi_p48, &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40, &cospi_p24, &cospi_m24, &__rounding, cos_bit); // stage 7 fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, &__rounding, cos_bit); // stage 8 fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); // stage 9 fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); // stage 10 fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); startidx = 0 * outstride; endidx = 63 * outstride; // stage 11 output[startidx] = x2[0]; output[endidx] = x2[63]; startidx += outstride; endidx -= outstride; output[startidx] = x2[32]; output[endidx] = x2[31]; startidx += outstride; endidx -= outstride; output[startidx] = x2[16]; output[endidx] = x2[47]; startidx += outstride; endidx -= outstride; output[startidx] = x2[48]; output[endidx] = x2[15]; startidx += outstride; endidx -= outstride; output[startidx] = x2[8]; output[endidx] = x2[55]; startidx += outstride; endidx -= outstride; output[startidx] = x2[40]; output[endidx] = x2[23]; startidx += outstride; endidx -= outstride; output[startidx] = x2[24]; output[endidx] = x2[39]; startidx += outstride; endidx -= outstride; output[startidx] = x2[56]; output[endidx] = x2[7]; startidx += outstride; endidx -= outstride; output[startidx] = x2[4]; output[endidx] = x2[59]; startidx += outstride; endidx -= outstride; output[startidx] = x2[36]; output[endidx] = x2[27]; startidx += outstride; endidx -= outstride; output[startidx] = x2[20]; output[endidx] = x2[43]; startidx += outstride; endidx -= outstride; output[startidx] = x2[52]; output[endidx] = x2[11]; startidx += outstride; endidx -= outstride; output[startidx] = x2[12]; output[endidx] = x2[51]; startidx += outstride; endidx -= outstride; output[startidx] = x2[44]; output[endidx] = x2[19]; startidx += outstride; endidx -= outstride; output[startidx] = x2[28]; output[endidx] = x2[35]; startidx += outstride; endidx -= outstride; output[startidx] = x2[60]; output[endidx] = x2[3]; startidx += outstride; endidx -= outstride; output[startidx] = x2[2]; output[endidx] = x2[61]; startidx += outstride; endidx -= outstride; output[startidx] = x2[34]; output[endidx] = x2[29]; startidx += outstride; endidx -= outstride; output[startidx] = x2[18]; output[endidx] = x2[45]; startidx += outstride; endidx -= outstride; output[startidx] = x2[50]; output[endidx] = x2[13]; startidx += outstride; endidx -= outstride; output[startidx] = x2[10]; output[endidx] = x2[53]; startidx += outstride; endidx -= outstride; output[startidx] = x2[42]; output[endidx] = x2[21]; startidx += outstride; endidx -= outstride; output[startidx] = x2[26]; output[endidx] = x2[37]; startidx += outstride; endidx -= outstride; output[startidx] = x2[58]; output[endidx] = x2[5]; startidx += outstride; endidx -= outstride; output[startidx] = x2[6]; output[endidx] = x2[57]; startidx += outstride; endidx -= outstride; output[startidx] = x2[38]; output[endidx] = x2[25]; startidx += outstride; endidx -= outstride; output[startidx] = x2[22]; output[endidx] = x2[41]; startidx += outstride; endidx -= outstride; output[startidx] = x2[54]; output[endidx] = x2[9]; startidx += outstride; endidx -= outstride; output[startidx] = x2[14]; output[endidx] = x2[49]; startidx += outstride; endidx -= outstride; output[startidx] = x2[46]; output[endidx] = x2[17]; startidx += outstride; endidx -= outstride; output[startidx] = x2[30]; output[endidx] = x2[33]; startidx += outstride; endidx -= outstride; output[startidx] = x2[62]; 
output[endidx] = x2[1]; } void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; (void)tx_type; assert(tx_type == DCT_DCT); const TX_SIZE tx_size = TX_64X64; __m256i buf0[512], buf1[512]; const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; const int txw_idx = get_txw_idx(tx_size); const int txh_idx = get_txh_idx(tx_size); const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const transform_1d_avx2 col_txfm = fdct64_avx2; const transform_1d_avx2 row_txfm = fdct64_avx2; const int width_div16 = (width >> 4); const int width_div8 = (width >> 3); int r, c; for (int i = 0; i < width_div16; i++) { load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, width_div8, 0, 0); round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, width_div8); round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); } for (r = 0; r < height; r += 8) { for (c = 0; c < width_div8; c++) { fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], &buf1[c * 8 * width_div8 + (r >> 3)], width_div8, width_div8); } } for (int i = 0; i < 2; i++) { row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, width_div16); row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, width_div16); round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], width_div16); round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], width_div16); } store_buffer_avx2(buf0, output, 8, 128); } aom-3.12.1/av1/encoder/x86/highbd_fwd_txfm_sse4.c000066400000000000000000002733711477627663500213760ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/

#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

static inline void store_output_w4(int32_t *const out, const __m128i *const in,
                                   const int stride, const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
  }
}

void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in[4];
  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  // Convert to int32_t.
  __m128i op[4];
  op[0] = _mm_cvtepi16_epi32(in[0]);
  op[1] = _mm_cvtepi16_epi32(in[1]);
  op[2] = _mm_cvtepi16_epi32(in[2]);
  op[3] = _mm_cvtepi16_epi32(in[3]);

  for (int i = 0; i < 2; ++i) {
    __m128i a1 = op[0];
    __m128i b1 = op[1];
    __m128i c1 = op[2];
    __m128i d1 = op[3];
    __m128i e1;

    a1 = _mm_add_epi32(a1, b1);  // a1 += b1
    d1 = _mm_sub_epi32(d1, c1);  // d1 = d1 - c1
    e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
    e1 = _mm_srai_epi32(e1, 1);
    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
    a1 = _mm_sub_epi32(a1, c1);  // a1 -= c1
    d1 = _mm_add_epi32(d1, b1);  // d1 += b1

    op[0] = a1;
    op[1] = c1;
    op[2] = d1;
    op[3] = b1;

    if (i == 0) {
      transpose_32bit_4x4(op, op);
    }
  }

  op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
  op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
  op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
  op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);

  _mm_storeu_si128((__m128i *)(output + 0), op[0]);
  _mm_storeu_si128((__m128i *)(output + 4), op[1]);
  _mm_storeu_si128((__m128i *)(output + 8), op[2]);
  _mm_storeu_si128((__m128i *)(output + 12), op[3]);
}

static inline void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  if (!flipud) {
    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  } else {
    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  }

  if (fliplr) {
    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
  }

  in[0] = _mm_cvtepi16_epi32(in[0]);
  in[1] = _mm_cvtepi16_epi32(in[1]);
  in[2] = _mm_cvtepi16_epi32(in[2]);
  in[3] = _mm_cvtepi16_epi32(in[3]);

  in[0] = _mm_slli_epi32(in[0], shift);
  in[1] = _mm_slli_epi32(in[1], shift);
  in[2] = _mm_slli_epi32(in[2], shift);
  in[3] = _mm_slli_epi32(in[3], shift);
}

// We only use stage-2 bit;
// shift[0] is used in load_buffer_4x4()
// shift[1] is used in txfm_func_col()
// shift[2] is used in txfm_func_row()
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int num_col) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 =
_mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i s0, s1, s2, s3; __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; int endidx = 3 * num_col; s0 = _mm_add_epi32(in[0], in[endidx]); s3 = _mm_sub_epi32(in[0], in[endidx]); endidx -= num_col; s1 = _mm_add_epi32(in[num_col], in[endidx]); s2 = _mm_sub_epi32(in[num_col], in[endidx]); // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); u0 = _mm_mullo_epi32(s0, cospi32); u1 = _mm_mullo_epi32(s1, cospi32); u2 = _mm_add_epi32(u0, u1); v0 = _mm_sub_epi32(u0, u1); u3 = _mm_add_epi32(u2, rnding); v1 = _mm_add_epi32(v0, rnding); u0 = _mm_srai_epi32(u3, bit); u2 = _mm_srai_epi32(v1, bit); // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); v0 = _mm_mullo_epi32(s2, cospi48); v1 = _mm_mullo_epi32(s3, cospi16); v2 = _mm_add_epi32(v0, v1); v3 = _mm_add_epi32(v2, rnding); u1 = _mm_srai_epi32(v3, bit); v0 = _mm_mullo_epi32(s2, cospi16); v1 = _mm_mullo_epi32(s3, cospi48); v2 = _mm_sub_epi32(v1, v0); v3 = _mm_add_epi32(v2, rnding); u3 = _mm_srai_epi32(v3, bit); // Note: shift[1] and shift[2] are zeros out[0] = u0; out[1] = u1; out[2] = u2; out[3] = u3; } static inline void write_buffer_4x4(__m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); } static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, const int num_col) { const int32_t *sinpi = sinpi_arr(bit); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); __m128i t; __m128i s0, s1, s2, s3, s4, s5, s6, s7; __m128i x0, x1, x2, x3; __m128i u0, u1, u2, u3; int idx = 0 * num_col; s0 = _mm_mullo_epi32(in[idx], sinpi1); s1 = _mm_mullo_epi32(in[idx], sinpi4); t = _mm_add_epi32(in[idx], in[idx + num_col]); idx += num_col; s2 = _mm_mullo_epi32(in[idx], sinpi2); s3 = _mm_mullo_epi32(in[idx], sinpi1); idx += num_col; s4 = _mm_mullo_epi32(in[idx], sinpi3); idx += num_col; s5 = _mm_mullo_epi32(in[idx], sinpi4); s6 = _mm_mullo_epi32(in[idx], sinpi2); s7 = _mm_sub_epi32(t, in[idx]); t = _mm_add_epi32(s0, s2); x0 = _mm_add_epi32(t, s5); x1 = _mm_mullo_epi32(s7, sinpi3); t = _mm_sub_epi32(s1, s3); x2 = _mm_add_epi32(t, s6); x3 = s4; s0 = _mm_add_epi32(x0, x3); s1 = x1; s2 = _mm_sub_epi32(x2, x3); t = _mm_sub_epi32(x2, x0); s3 = _mm_add_epi32(t, x3); u0 = _mm_add_epi32(s0, rnding); u0 = _mm_srai_epi32(u0, bit); u1 = _mm_add_epi32(s1, rnding); u1 = _mm_srai_epi32(u1, bit); u2 = _mm_add_epi32(s2, rnding); u2 = _mm_srai_epi32(u2, bit); u3 = _mm_add_epi32(s3, rnding); u3 = _mm_srai_epi32(u3, bit); out[0] = u0; out[1] = u1; out[2] = u2; out[3] = u3; } static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { (void)bit; __m128i fact = _mm_set1_epi32(NewSqrt2); __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); __m128i a_low; for (int i = 0; i < 4; i++) { a_low = _mm_mullo_epi32(in[i * col_num], fact); a_low = _mm_add_epi32(a_low, offset); out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); } } void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, int input_stride, TX_TYPE tx_type, int bd) { __m128i in[4]; const int8_t *shift = 
av1_fwd_txfm_shift_ls[TX_4X4]; const int txw_idx = get_txw_idx(TX_4X4); const int txh_idx = get_txh_idx(TX_4X4); switch (tx_type) { case DCT_DCT: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case ADST_DCT: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case DCT_ADST: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case ADST_ADST: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case FLIPADST_DCT: load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case DCT_FLIPADST: load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case FLIPADST_FLIPADST: load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case ADST_FLIPADST: load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case FLIPADST_ADST: load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case IDTX: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case V_DCT: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case H_DCT: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case V_ADST: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); 
transpose_32bit_4x4(in, in); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case H_ADST: load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case V_FLIPADST: load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; case H_FLIPADST: load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); transpose_32bit_4x4(in, in); fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); write_buffer_4x4(in, coeff); break; default: assert(0); } (void)bd; } static inline void load_buffer_8x8(const int16_t *input, __m128i *in, int stride, int flipud, int fliplr, int shift) { __m128i u; if (!flipud) { in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); } else { in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); } if (fliplr) { in[0] = mm_reverse_epi16(in[0]); in[1] = mm_reverse_epi16(in[1]); in[2] = mm_reverse_epi16(in[2]); in[3] = mm_reverse_epi16(in[3]); in[4] = mm_reverse_epi16(in[4]); in[5] = mm_reverse_epi16(in[5]); in[6] = mm_reverse_epi16(in[6]); in[7] = mm_reverse_epi16(in[7]); } u = _mm_unpackhi_epi64(in[4], in[4]); in[8] = _mm_cvtepi16_epi32(in[4]); in[9] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[5], in[5]); in[10] = _mm_cvtepi16_epi32(in[5]); in[11] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[6], in[6]); in[12] = _mm_cvtepi16_epi32(in[6]); in[13] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[7], in[7]); in[14] = _mm_cvtepi16_epi32(in[7]); in[15] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[3], in[3]); in[6] = _mm_cvtepi16_epi32(in[3]); in[7] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[2], in[2]); in[4] = _mm_cvtepi16_epi32(in[2]); in[5] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[1], in[1]); in[2] = _mm_cvtepi16_epi32(in[1]); in[3] = _mm_cvtepi16_epi32(u); u = _mm_unpackhi_epi64(in[0], in[0]); in[0] = _mm_cvtepi16_epi32(in[0]); in[1] = _mm_cvtepi16_epi32(u); in[0] = _mm_slli_epi32(in[0], shift); in[1] = _mm_slli_epi32(in[1], shift); in[2] = _mm_slli_epi32(in[2], shift); in[3] = _mm_slli_epi32(in[3], shift); in[4] = _mm_slli_epi32(in[4], shift); in[5] = _mm_slli_epi32(in[5], shift); in[6] = _mm_slli_epi32(in[6], shift); in[7] = 
_mm_slli_epi32(in[7], shift); in[8] = _mm_slli_epi32(in[8], shift); in[9] = _mm_slli_epi32(in[9], shift); in[10] = _mm_slli_epi32(in[10], shift); in[11] = _mm_slli_epi32(in[11], shift); in[12] = _mm_slli_epi32(in[12], shift); in[13] = _mm_slli_epi32(in[13], shift); in[14] = _mm_slli_epi32(in[14], shift); in[15] = _mm_slli_epi32(in[15], shift); } static inline void col_txfm_8x8_rounding(__m128i *in, int shift) { const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rounding); in[1] = _mm_add_epi32(in[1], rounding); in[2] = _mm_add_epi32(in[2], rounding); in[3] = _mm_add_epi32(in[3], rounding); in[4] = _mm_add_epi32(in[4], rounding); in[5] = _mm_add_epi32(in[5], rounding); in[6] = _mm_add_epi32(in[6], rounding); in[7] = _mm_add_epi32(in[7], rounding); in[8] = _mm_add_epi32(in[8], rounding); in[9] = _mm_add_epi32(in[9], rounding); in[10] = _mm_add_epi32(in[10], rounding); in[11] = _mm_add_epi32(in[11], rounding); in[12] = _mm_add_epi32(in[12], rounding); in[13] = _mm_add_epi32(in[13], rounding); in[14] = _mm_add_epi32(in[14], rounding); in[15] = _mm_add_epi32(in[15], rounding); in[0] = _mm_srai_epi32(in[0], shift); in[1] = _mm_srai_epi32(in[1], shift); in[2] = _mm_srai_epi32(in[2], shift); in[3] = _mm_srai_epi32(in[3], shift); in[4] = _mm_srai_epi32(in[4], shift); in[5] = _mm_srai_epi32(in[5], shift); in[6] = _mm_srai_epi32(in[6], shift); in[7] = _mm_srai_epi32(in[7], shift); in[8] = _mm_srai_epi32(in[8], shift); in[9] = _mm_srai_epi32(in[9], shift); in[10] = _mm_srai_epi32(in[10], shift); in[11] = _mm_srai_epi32(in[11], shift); in[12] = _mm_srai_epi32(in[12], shift); in[13] = _mm_srai_epi32(in[13], shift); in[14] = _mm_srai_epi32(in[14], shift); in[15] = _mm_srai_epi32(in[15], shift); } static inline void col_txfm_4x8_rounding(__m128i *in, int shift) { const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); in[0] = _mm_add_epi32(in[0], rounding); in[1] = _mm_add_epi32(in[1], rounding); in[2] = _mm_add_epi32(in[2], rounding); in[3] = _mm_add_epi32(in[3], rounding); in[4] = _mm_add_epi32(in[4], rounding); in[5] = _mm_add_epi32(in[5], rounding); in[6] = _mm_add_epi32(in[6], rounding); in[7] = _mm_add_epi32(in[7], rounding); in[0] = _mm_srai_epi32(in[0], shift); in[1] = _mm_srai_epi32(in[1], shift); in[2] = _mm_srai_epi32(in[2], shift); in[3] = _mm_srai_epi32(in[3], shift); in[4] = _mm_srai_epi32(in[4], shift); in[5] = _mm_srai_epi32(in[5], shift); in[6] = _mm_srai_epi32(in[6], shift); in[7] = _mm_srai_epi32(in[7], shift); } static inline void write_buffer_8x8(const __m128i *res, int32_t *output) { _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); } static inline void write_buffer_16x8(const __m128i *res, int32_t *output, const int stride) { 
_mm_storeu_si128((__m128i *)(output), res[0]); _mm_storeu_si128((__m128i *)(output + 4), res[1]); _mm_storeu_si128((__m128i *)(output + stride), res[2]); _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); } static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, const int col_num) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i u[8], v[8]; int startidx = 0 * col_num; int endidx = 7 * col_num; // Even 8 points 0, 2, ..., 14 // stage 0 // stage 1 u[0] = _mm_add_epi32(in[startidx], in[endidx]); v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] startidx += col_num; endidx -= col_num; u[1] = _mm_add_epi32(in[startidx], in[endidx]); u[6] = _mm_sub_epi32(in[startidx], in[endidx]); startidx += col_num; endidx -= col_num; u[2] = _mm_add_epi32(in[startidx], in[endidx]); u[5] = _mm_sub_epi32(in[startidx], in[endidx]); startidx += col_num; endidx -= col_num; u[3] = _mm_add_epi32(in[startidx], in[endidx]); v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] // stage 2 v[0] = _mm_add_epi32(u[0], u[3]); v[3] = _mm_sub_epi32(u[0], u[3]); v[1] = _mm_add_epi32(u[1], u[2]); v[2] = _mm_sub_epi32(u[1], u[2]); v[5] = _mm_mullo_epi32(u[5], cospim32); v[6] = _mm_mullo_epi32(u[6], cospi32); v[5] = _mm_add_epi32(v[5], v[6]); v[5] = _mm_add_epi32(v[5], rnding); v[5] = _mm_srai_epi32(v[5], bit); u[0] = _mm_mullo_epi32(u[5], cospi32); v[6] = _mm_mullo_epi32(u[6], cospim32); v[6] = _mm_sub_epi32(u[0], v[6]); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); // stage 3 // type 0 v[0] = _mm_mullo_epi32(v[0], cospi32); v[1] = _mm_mullo_epi32(v[1], cospi32); u[0] = _mm_add_epi32(v[0], v[1]); u[0] = _mm_add_epi32(u[0], rnding); u[0] = _mm_srai_epi32(u[0], bit); u[1] = _mm_sub_epi32(v[0], v[1]); u[1] = _mm_add_epi32(u[1], rnding); u[1] = _mm_srai_epi32(u[1], bit); // type 1 v[0] = _mm_mullo_epi32(v[2], cospi48); v[1] = _mm_mullo_epi32(v[3], cospi16); u[2] = _mm_add_epi32(v[0], v[1]); u[2] = _mm_add_epi32(u[2], rnding); u[2] = _mm_srai_epi32(u[2], bit); v[0] = _mm_mullo_epi32(v[2], cospi16); v[1] = _mm_mullo_epi32(v[3], cospi48); u[3] = _mm_sub_epi32(v[1], v[0]); u[3] = _mm_add_epi32(u[3], rnding); u[3] = _mm_srai_epi32(u[3], bit); u[4] = _mm_add_epi32(v[4], v[5]); u[5] = _mm_sub_epi32(v[4], v[5]); u[6] = _mm_sub_epi32(v[7], v[6]); u[7] = _mm_add_epi32(v[7], v[6]); 
// stage 4 // stage 5 v[0] = _mm_mullo_epi32(u[4], cospi56); v[1] = _mm_mullo_epi32(u[7], cospi8); v[0] = _mm_add_epi32(v[0], v[1]); v[0] = _mm_add_epi32(v[0], rnding); out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4] v[0] = _mm_mullo_epi32(u[4], cospi8); v[1] = _mm_mullo_epi32(u[7], cospi56); v[0] = _mm_sub_epi32(v[1], v[0]); v[0] = _mm_add_epi32(v[0], rnding); out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] v[0] = _mm_mullo_epi32(u[5], cospi24); v[1] = _mm_mullo_epi32(u[6], cospi40); v[0] = _mm_add_epi32(v[0], v[1]); v[0] = _mm_add_epi32(v[0], rnding); out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] v[0] = _mm_mullo_epi32(u[5], cospi40); v[1] = _mm_mullo_epi32(u[6], cospi24); v[0] = _mm_sub_epi32(v[1], v[0]); v[0] = _mm_add_epi32(v[0], rnding); out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] out[0 * col_num] = u[0]; // buf0[0] out[4 * col_num] = u[1]; // buf0[1] out[2 * col_num] = u[2]; // buf0[2] out[6 * col_num] = u[3]; // buf0[3] } static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, const int col_num) { fdct4x8_sse4_1(in, out, bit, col_num); fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); } static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, const int col_num) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i zero = _mm_setzero_si128(); __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x, y; int col; // Note: // Even column: 0, 2, ..., 14 // Odd column: 1, 3, ..., 15 // one even column plus one odd column constructs one row (8 coeffs) // total we have 8 rows (8x8). 
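// For example, with col_num == 2 (the 8x8 case), in[2 * i + c] holds four
// adjacent 32-bit samples of row i of the block: c == 0 covers columns 0..3
// and c == 1 covers columns 4..7, matching the layout produced by
// load_buffer_8x8() for the column pass (the row pass sees the transposed
// buffer in the same register layout).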
for (col = 0; col < col_num; ++col) { // stage 0 // stage 1 u0 = in[col_num * 0 + col]; u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); u3 = in[col_num * 4 + col]; u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); u5 = in[col_num * 6 + col]; u6 = in[col_num * 2 + col]; u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); // stage 2 v0 = u0; v1 = u1; x = _mm_mullo_epi32(u2, cospi32); y = _mm_mullo_epi32(u3, cospi32); v2 = _mm_add_epi32(x, y); v2 = _mm_add_epi32(v2, rnding); v2 = _mm_srai_epi32(v2, bit); v3 = _mm_sub_epi32(x, y); v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); v4 = u4; v5 = u5; x = _mm_mullo_epi32(u6, cospi32); y = _mm_mullo_epi32(u7, cospi32); v6 = _mm_add_epi32(x, y); v6 = _mm_add_epi32(v6, rnding); v6 = _mm_srai_epi32(v6, bit); v7 = _mm_sub_epi32(x, y); v7 = _mm_add_epi32(v7, rnding); v7 = _mm_srai_epi32(v7, bit); // stage 3 u0 = _mm_add_epi32(v0, v2); u1 = _mm_add_epi32(v1, v3); u2 = _mm_sub_epi32(v0, v2); u3 = _mm_sub_epi32(v1, v3); u4 = _mm_add_epi32(v4, v6); u5 = _mm_add_epi32(v5, v7); u6 = _mm_sub_epi32(v4, v6); u7 = _mm_sub_epi32(v5, v7); // stage 4 v0 = u0; v1 = u1; v2 = u2; v3 = u3; x = _mm_mullo_epi32(u4, cospi16); y = _mm_mullo_epi32(u5, cospi48); v4 = _mm_add_epi32(x, y); v4 = _mm_add_epi32(v4, rnding); v4 = _mm_srai_epi32(v4, bit); x = _mm_mullo_epi32(u4, cospi48); y = _mm_mullo_epi32(u5, cospim16); v5 = _mm_add_epi32(x, y); v5 = _mm_add_epi32(v5, rnding); v5 = _mm_srai_epi32(v5, bit); x = _mm_mullo_epi32(u6, cospim48); y = _mm_mullo_epi32(u7, cospi16); v6 = _mm_add_epi32(x, y); v6 = _mm_add_epi32(v6, rnding); v6 = _mm_srai_epi32(v6, bit); x = _mm_mullo_epi32(u6, cospi16); y = _mm_mullo_epi32(u7, cospi48); v7 = _mm_add_epi32(x, y); v7 = _mm_add_epi32(v7, rnding); v7 = _mm_srai_epi32(v7, bit); // stage 5 u0 = _mm_add_epi32(v0, v4); u1 = _mm_add_epi32(v1, v5); u2 = _mm_add_epi32(v2, v6); u3 = _mm_add_epi32(v3, v7); u4 = _mm_sub_epi32(v0, v4); u5 = _mm_sub_epi32(v1, v5); u6 = _mm_sub_epi32(v2, v6); u7 = _mm_sub_epi32(v3, v7); // stage 6 x = _mm_mullo_epi32(u0, cospi4); y = _mm_mullo_epi32(u1, cospi60); v0 = _mm_add_epi32(x, y); v0 = _mm_add_epi32(v0, rnding); v0 = _mm_srai_epi32(v0, bit); x = _mm_mullo_epi32(u0, cospi60); y = _mm_mullo_epi32(u1, cospim4); v1 = _mm_add_epi32(x, y); v1 = _mm_add_epi32(v1, rnding); v1 = _mm_srai_epi32(v1, bit); x = _mm_mullo_epi32(u2, cospi20); y = _mm_mullo_epi32(u3, cospi44); v2 = _mm_add_epi32(x, y); v2 = _mm_add_epi32(v2, rnding); v2 = _mm_srai_epi32(v2, bit); x = _mm_mullo_epi32(u2, cospi44); y = _mm_mullo_epi32(u3, cospim20); v3 = _mm_add_epi32(x, y); v3 = _mm_add_epi32(v3, rnding); v3 = _mm_srai_epi32(v3, bit); x = _mm_mullo_epi32(u4, cospi36); y = _mm_mullo_epi32(u5, cospi28); v4 = _mm_add_epi32(x, y); v4 = _mm_add_epi32(v4, rnding); v4 = _mm_srai_epi32(v4, bit); x = _mm_mullo_epi32(u4, cospi28); y = _mm_mullo_epi32(u5, cospim36); v5 = _mm_add_epi32(x, y); v5 = _mm_add_epi32(v5, rnding); v5 = _mm_srai_epi32(v5, bit); x = _mm_mullo_epi32(u6, cospi52); y = _mm_mullo_epi32(u7, cospi12); v6 = _mm_add_epi32(x, y); v6 = _mm_add_epi32(v6, rnding); v6 = _mm_srai_epi32(v6, bit); x = _mm_mullo_epi32(u6, cospi12); y = _mm_mullo_epi32(u7, cospim52); v7 = _mm_add_epi32(x, y); v7 = _mm_add_epi32(v7, rnding); v7 = _mm_srai_epi32(v7, bit); // stage 7 out[col_num * 0 + col] = v1; out[col_num * 1 + col] = v6; out[col_num * 2 + col] = v3; out[col_num * 3 + col] = v4; out[col_num * 4 + col] = v5; out[col_num * 5 + col] = v2; out[col_num * 6 + col] = v7; out[col_num * 7 + col] = v0; } } 
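// The fdct/fadst kernels in this file all follow the same fixed-point
// rounding convention: a weighted sum of two inputs (weights taken from
// cospi_arr(bit)) is rounded by adding 1 << (bit - 1) and then
// arithmetic-right-shifted by 'bit'. A scalar sketch of that rounding step,
// for illustration only (the SIMD code performs it with _mm_mullo_epi32,
// _mm_add_epi32 and _mm_srai_epi32 on four lanes at a time):
//   static int32_t round_shift_scalar(int32_t x, int bit) {
//     return (x + (1 << (bit - 1))) >> bit;
//   }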
static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { (void)bit; for (int i = 0; i < col_num; i += 1) { out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); } } #if !CONFIG_REALTIME_ONLY static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { (void)bit; (void)col_num; for (int j = 0; j < 2; j++) { out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); } } #endif void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16], out[16]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; const int txw_idx = get_txw_idx(TX_8X8); const int txh_idx = get_txh_idx(TX_8X8); switch (tx_type) { case DCT_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case ADST_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case DCT_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case ADST_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case FLIPADST_DCT: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case DCT_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case FLIPADST_FLIPADST: load_buffer_8x8(input, in, stride, 1, 1, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); 
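/*
 * Every case in this switch follows the same 2-D pipeline: load the residual
 * block (optionally flipped up/down or left/right), apply the 1-D column
 * transform, round by -shift[1], transpose the 8x8 tile, apply the 1-D row
 * transform, and write the 32-bit coefficients. Only the pair of 1-D kernels
 * and the flip flags passed to load_buffer_8x8() differ between cases.
 */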
col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case ADST_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case FLIPADST_ADST: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case IDTX: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case V_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case H_DCT: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case V_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case H_ADST: load_buffer_8x8(input, in, stride, 0, 0, shift[0]); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case V_FLIPADST: load_buffer_8x8(input, in, stride, 1, 0, shift[0]); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; case H_FLIPADST: load_buffer_8x8(input, in, stride, 0, 1, shift[0]); idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); col_txfm_8x8_rounding(out, -shift[1]); transpose_8x8(out, in); fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); write_buffer_8x8(out, coeff); break; default: assert(0); } (void)bd; } // Hybrid Transform 16x16 static inline void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { int row_index = 0; int dst_index = 0; int src_index = 0; // row 0, 1, .., 7 do { out[dst_index] = in[src_index]; out[dst_index + 1] = in[src_index + 1]; out[dst_index + 2] = in[src_index + 16]; out[dst_index + 3] = in[src_index + 17]; dst_index += 4; src_index += 2; row_index += 1; } while (row_index < 8); // row 8, 9, ..., 15 src_index += 16; do { out[dst_index] = in[src_index]; out[dst_index + 1] = in[src_index + 1]; out[dst_index + 2] = in[src_index + 16]; out[dst_index + 3] = in[src_index + 17]; dst_index += 4; src_index += 
2; row_index += 1; } while (row_index < 16); } static inline void load_buffer_16x16(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { __m128i in[64]; // Load 4 8x8 blocks const int16_t *topL = input; const int16_t *topR = input + 8; const int16_t *botL = input + 8 * stride; const int16_t *botR = input + 8 * stride + 8; const int16_t *tmp; if (flipud) { // Swap left columns tmp = topL; topL = botL; botL = tmp; // Swap right columns tmp = topR; topR = botR; botR = tmp; } if (fliplr) { // Swap top rows tmp = topL; topL = topR; topR = tmp; // Swap bottom rows tmp = botL; botL = botR; botR = tmp; } // load first 8 columns load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); // load second 8 columns load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); convert_8x8_to_16x16(in, out); } static inline void load_buffer_8x16(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; const int16_t *botL = input + 8 * stride; const int16_t *tmp; if (flipud) { tmp = topL; topL = botL; botL = tmp; } load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); } static inline void load_buffer_8x4(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; const int16_t *topR = input + 4; const int16_t *tmp; if (fliplr) { tmp = topL; topL = topR; topR = tmp; } load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); } static inline void load_buffer_16x4(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; const int16_t *topR = input + 8; const int16_t *tmp; if (fliplr) { tmp = topL; topL = topR; topR = tmp; } load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); } static inline void load_buffer_4x8(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift) { const int16_t *topL = input; const int16_t *botL = input + 4 * stride; const int16_t *tmp; if (flipud) { tmp = topL; topL = botL; botL = tmp; } load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); } #if !CONFIG_REALTIME_ONLY static inline void load_buffer_4x16(const int16_t *input, __m128i *out, const int stride, const int flipud, const int fliplr, const int shift) { const int16_t *topL = input; const int16_t *botL = input + 8 * stride; const int16_t *tmp; if (flipud) { tmp = topL; topL = botL; botL = tmp; } load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); } #endif static inline void load_buffer_32x8n(const int16_t *input, __m128i *out, int stride, int flipud, int fliplr, int shift, const int height) { const int16_t *in = input; __m128i *output = out; for (int col = 0; col < height; col++) { in = input + col * stride; output = out + col * 8; load_buffer_4x4(in, output, 4, flipud, fliplr, shift); load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); } } static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, const int col_num) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i 
cospim32 = _mm_set1_epi32(-cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi60 = _mm_set1_epi32(cospi[60]); const __m128i cospi4 = _mm_set1_epi32(cospi[4]); const __m128i cospi28 = _mm_set1_epi32(cospi[28]); const __m128i cospi36 = _mm_set1_epi32(cospi[36]); const __m128i cospi44 = _mm_set1_epi32(cospi[44]); const __m128i cospi20 = _mm_set1_epi32(cospi[20]); const __m128i cospi12 = _mm_set1_epi32(cospi[12]); const __m128i cospi52 = _mm_set1_epi32(cospi[52]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); __m128i u[16], v[16], x; int col; // Calculate the column 0, 1, 2, 3 for (col = 0; col < col_num; ++col) { // stage 0 // stage 1 u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); // stage 2 v[0] = _mm_add_epi32(u[0], u[7]); v[7] = _mm_sub_epi32(u[0], u[7]); v[1] = _mm_add_epi32(u[1], u[6]); v[6] = _mm_sub_epi32(u[1], u[6]); v[2] = _mm_add_epi32(u[2], u[5]); v[5] = _mm_sub_epi32(u[2], u[5]); v[3] = _mm_add_epi32(u[3], u[4]); v[4] = _mm_sub_epi32(u[3], u[4]); v[8] = u[8]; v[9] = u[9]; v[10] = _mm_mullo_epi32(u[10], cospim32); x = _mm_mullo_epi32(u[13], cospi32); v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[13] = _mm_mullo_epi32(u[10], cospi32); x = _mm_mullo_epi32(u[13], cospim32); v[13] = _mm_sub_epi32(v[13], x); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[11] = _mm_mullo_epi32(u[11], cospim32); x = _mm_mullo_epi32(u[12], cospi32); v[11] = _mm_add_epi32(v[11], x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = _mm_mullo_epi32(u[11], cospi32); x = _mm_mullo_epi32(u[12], cospim32); v[12] = _mm_sub_epi32(v[12], x); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); v[14] = u[14]; v[15] = u[15]; // stage 3 u[0] = _mm_add_epi32(v[0], v[3]); u[3] = _mm_sub_epi32(v[0], v[3]); u[1] = _mm_add_epi32(v[1], v[2]); u[2] = _mm_sub_epi32(v[1], v[2]); u[4] = v[4]; u[5] = _mm_mullo_epi32(v[5], cospim32); x = _mm_mullo_epi32(v[6], cospi32); u[5] = _mm_add_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = 
_mm_srai_epi32(u[5], bit); u[6] = _mm_mullo_epi32(v[5], cospi32); x = _mm_mullo_epi32(v[6], cospim32); u[6] = _mm_sub_epi32(u[6], x); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[7] = v[7]; u[8] = _mm_add_epi32(v[8], v[11]); u[11] = _mm_sub_epi32(v[8], v[11]); u[9] = _mm_add_epi32(v[9], v[10]); u[10] = _mm_sub_epi32(v[9], v[10]); u[12] = _mm_sub_epi32(v[15], v[12]); u[15] = _mm_add_epi32(v[15], v[12]); u[13] = _mm_sub_epi32(v[14], v[13]); u[14] = _mm_add_epi32(v[14], v[13]); // stage 4 u[0] = _mm_mullo_epi32(u[0], cospi32); u[1] = _mm_mullo_epi32(u[1], cospi32); v[0] = _mm_add_epi32(u[0], u[1]); v[0] = _mm_add_epi32(v[0], rnding); v[0] = _mm_srai_epi32(v[0], bit); v[1] = _mm_sub_epi32(u[0], u[1]); v[1] = _mm_add_epi32(v[1], rnding); v[1] = _mm_srai_epi32(v[1], bit); v[2] = _mm_mullo_epi32(u[2], cospi48); x = _mm_mullo_epi32(u[3], cospi16); v[2] = _mm_add_epi32(v[2], x); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); v[3] = _mm_mullo_epi32(u[2], cospi16); x = _mm_mullo_epi32(u[3], cospi48); v[3] = _mm_sub_epi32(x, v[3]); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); v[4] = _mm_add_epi32(u[4], u[5]); v[5] = _mm_sub_epi32(u[4], u[5]); v[6] = _mm_sub_epi32(u[7], u[6]); v[7] = _mm_add_epi32(u[7], u[6]); v[8] = u[8]; v[9] = _mm_mullo_epi32(u[9], cospim16); x = _mm_mullo_epi32(u[14], cospi48); v[9] = _mm_add_epi32(v[9], x); v[9] = _mm_add_epi32(v[9], rnding); v[9] = _mm_srai_epi32(v[9], bit); v[14] = _mm_mullo_epi32(u[9], cospi48); x = _mm_mullo_epi32(u[14], cospim16); v[14] = _mm_sub_epi32(v[14], x); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[10] = _mm_mullo_epi32(u[10], cospim48); x = _mm_mullo_epi32(u[13], cospim16); v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[13] = _mm_mullo_epi32(u[10], cospim16); x = _mm_mullo_epi32(u[13], cospim48); v[13] = _mm_sub_epi32(v[13], x); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[11] = u[11]; v[12] = u[12]; v[15] = u[15]; // stage 5 u[0] = v[0]; u[1] = v[1]; u[2] = v[2]; u[3] = v[3]; u[4] = _mm_mullo_epi32(v[4], cospi56); x = _mm_mullo_epi32(v[7], cospi8); u[4] = _mm_add_epi32(u[4], x); u[4] = _mm_add_epi32(u[4], rnding); u[4] = _mm_srai_epi32(u[4], bit); u[7] = _mm_mullo_epi32(v[4], cospi8); x = _mm_mullo_epi32(v[7], cospi56); u[7] = _mm_sub_epi32(x, u[7]); u[7] = _mm_add_epi32(u[7], rnding); u[7] = _mm_srai_epi32(u[7], bit); u[5] = _mm_mullo_epi32(v[5], cospi24); x = _mm_mullo_epi32(v[6], cospi40); u[5] = _mm_add_epi32(u[5], x); u[5] = _mm_add_epi32(u[5], rnding); u[5] = _mm_srai_epi32(u[5], bit); u[6] = _mm_mullo_epi32(v[5], cospi40); x = _mm_mullo_epi32(v[6], cospi24); u[6] = _mm_sub_epi32(x, u[6]); u[6] = _mm_add_epi32(u[6], rnding); u[6] = _mm_srai_epi32(u[6], bit); u[8] = _mm_add_epi32(v[8], v[9]); u[9] = _mm_sub_epi32(v[8], v[9]); u[10] = _mm_sub_epi32(v[11], v[10]); u[11] = _mm_add_epi32(v[11], v[10]); u[12] = _mm_add_epi32(v[12], v[13]); u[13] = _mm_sub_epi32(v[12], v[13]); u[14] = _mm_sub_epi32(v[15], v[14]); u[15] = _mm_add_epi32(v[15], v[14]); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = _mm_mullo_epi32(u[8], cospi60); x = _mm_mullo_epi32(u[15], cospi4); v[8] = _mm_add_epi32(v[8], x); v[8] = _mm_add_epi32(v[8], rnding); v[8] = _mm_srai_epi32(v[8], bit); v[15] = _mm_mullo_epi32(u[8], cospi4); x = _mm_mullo_epi32(u[15], cospi60); v[15] = _mm_sub_epi32(x, v[15]); v[15] = 
_mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); v[9] = _mm_mullo_epi32(u[9], cospi28); x = _mm_mullo_epi32(u[14], cospi36); v[9] = _mm_add_epi32(v[9], x); v[9] = _mm_add_epi32(v[9], rnding); v[9] = _mm_srai_epi32(v[9], bit); v[14] = _mm_mullo_epi32(u[9], cospi36); x = _mm_mullo_epi32(u[14], cospi28); v[14] = _mm_sub_epi32(x, v[14]); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[10] = _mm_mullo_epi32(u[10], cospi44); x = _mm_mullo_epi32(u[13], cospi20); v[10] = _mm_add_epi32(v[10], x); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[13] = _mm_mullo_epi32(u[10], cospi20); x = _mm_mullo_epi32(u[13], cospi44); v[13] = _mm_sub_epi32(x, v[13]); v[13] = _mm_add_epi32(v[13], rnding); v[13] = _mm_srai_epi32(v[13], bit); v[11] = _mm_mullo_epi32(u[11], cospi12); x = _mm_mullo_epi32(u[12], cospi52); v[11] = _mm_add_epi32(v[11], x); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = _mm_mullo_epi32(u[11], cospi52); x = _mm_mullo_epi32(u[12], cospi12); v[12] = _mm_sub_epi32(x, v[12]); v[12] = _mm_add_epi32(v[12], rnding); v[12] = _mm_srai_epi32(v[12], bit); out[0 * col_num + col] = v[0]; out[1 * col_num + col] = v[8]; out[2 * col_num + col] = v[4]; out[3 * col_num + col] = v[12]; out[4 * col_num + col] = v[2]; out[5 * col_num + col] = v[10]; out[6 * col_num + col] = v[6]; out[7 * col_num + col] = v[14]; out[8 * col_num + col] = v[1]; out[9 * col_num + col] = v[9]; out[10 * col_num + col] = v[5]; out[11 * col_num + col] = v[13]; out[12 * col_num + col] = v[3]; out[13 * col_num + col] = v[11]; out[14 * col_num + col] = v[7]; out[15 * col_num + col] = v[15]; } } static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, const int num_cols) { const int32_t *cospi = cospi_arr(bit); const __m128i cospi32 = _mm_set1_epi32(cospi[32]); const __m128i cospi48 = _mm_set1_epi32(cospi[48]); const __m128i cospi16 = _mm_set1_epi32(cospi[16]); const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); const __m128i cospi8 = _mm_set1_epi32(cospi[8]); const __m128i cospi56 = _mm_set1_epi32(cospi[56]); const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); const __m128i cospi24 = _mm_set1_epi32(cospi[24]); const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); const __m128i cospi40 = _mm_set1_epi32(cospi[40]); const __m128i cospi2 = _mm_set1_epi32(cospi[2]); const __m128i cospi62 = _mm_set1_epi32(cospi[62]); const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); const __m128i cospi10 = _mm_set1_epi32(cospi[10]); const __m128i cospi54 = _mm_set1_epi32(cospi[54]); const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); const __m128i cospi18 = _mm_set1_epi32(cospi[18]); const __m128i cospi46 = _mm_set1_epi32(cospi[46]); const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); const __m128i cospi26 = _mm_set1_epi32(cospi[26]); const __m128i cospi38 = _mm_set1_epi32(cospi[38]); const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); const __m128i cospi34 = _mm_set1_epi32(cospi[34]); const __m128i cospi30 = _mm_set1_epi32(cospi[30]); const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); const __m128i cospi42 = _mm_set1_epi32(cospi[42]); const __m128i cospi22 = _mm_set1_epi32(cospi[22]); const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); const __m128i cospi50 = _mm_set1_epi32(cospi[50]); const __m128i cospi14 = _mm_set1_epi32(cospi[14]); const __m128i cospim50 = 
_mm_set1_epi32(-cospi[50]); const __m128i cospi58 = _mm_set1_epi32(cospi[58]); const __m128i cospi6 = _mm_set1_epi32(cospi[6]); const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); const __m128i zero = _mm_setzero_si128(); __m128i u[16], v[16], x, y; int col; for (col = 0; col < num_cols; ++col) { // stage 0 // stage 1 u[0] = in[0 * num_cols + col]; u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); u[3] = in[8 * num_cols + col]; u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); u[5] = in[12 * num_cols + col]; u[6] = in[4 * num_cols + col]; u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); u[9] = in[14 * num_cols + col]; u[10] = in[6 * num_cols + col]; u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); u[12] = in[2 * num_cols + col]; u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); u[15] = in[10 * num_cols + col]; // stage 2 v[0] = u[0]; v[1] = u[1]; x = _mm_mullo_epi32(u[2], cospi32); y = _mm_mullo_epi32(u[3], cospi32); v[2] = _mm_add_epi32(x, y); v[2] = _mm_add_epi32(v[2], rnding); v[2] = _mm_srai_epi32(v[2], bit); v[3] = _mm_sub_epi32(x, y); v[3] = _mm_add_epi32(v[3], rnding); v[3] = _mm_srai_epi32(v[3], bit); v[4] = u[4]; v[5] = u[5]; x = _mm_mullo_epi32(u[6], cospi32); y = _mm_mullo_epi32(u[7], cospi32); v[6] = _mm_add_epi32(x, y); v[6] = _mm_add_epi32(v[6], rnding); v[6] = _mm_srai_epi32(v[6], bit); v[7] = _mm_sub_epi32(x, y); v[7] = _mm_add_epi32(v[7], rnding); v[7] = _mm_srai_epi32(v[7], bit); v[8] = u[8]; v[9] = u[9]; x = _mm_mullo_epi32(u[10], cospi32); y = _mm_mullo_epi32(u[11], cospi32); v[10] = _mm_add_epi32(x, y); v[10] = _mm_add_epi32(v[10], rnding); v[10] = _mm_srai_epi32(v[10], bit); v[11] = _mm_sub_epi32(x, y); v[11] = _mm_add_epi32(v[11], rnding); v[11] = _mm_srai_epi32(v[11], bit); v[12] = u[12]; v[13] = u[13]; x = _mm_mullo_epi32(u[14], cospi32); y = _mm_mullo_epi32(u[15], cospi32); v[14] = _mm_add_epi32(x, y); v[14] = _mm_add_epi32(v[14], rnding); v[14] = _mm_srai_epi32(v[14], bit); v[15] = _mm_sub_epi32(x, y); v[15] = _mm_add_epi32(v[15], rnding); v[15] = _mm_srai_epi32(v[15], bit); // stage 3 u[0] = _mm_add_epi32(v[0], v[2]); u[1] = _mm_add_epi32(v[1], v[3]); u[2] = _mm_sub_epi32(v[0], v[2]); u[3] = _mm_sub_epi32(v[1], v[3]); u[4] = _mm_add_epi32(v[4], v[6]); u[5] = _mm_add_epi32(v[5], v[7]); u[6] = _mm_sub_epi32(v[4], v[6]); u[7] = _mm_sub_epi32(v[5], v[7]); u[8] = _mm_add_epi32(v[8], v[10]); u[9] = _mm_add_epi32(v[9], v[11]); u[10] = _mm_sub_epi32(v[8], v[10]); u[11] = _mm_sub_epi32(v[9], v[11]); u[12] = _mm_add_epi32(v[12], v[14]); u[13] = _mm_add_epi32(v[13], v[15]); u[14] = _mm_sub_epi32(v[12], v[14]); u[15] = _mm_sub_epi32(v[13], v[15]); // stage 4 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); v[8] = u[8]; v[9] = u[9]; v[10] = u[10]; v[11] = u[11]; v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); // 
stage 5 u[0] = _mm_add_epi32(v[0], v[4]); u[1] = _mm_add_epi32(v[1], v[5]); u[2] = _mm_add_epi32(v[2], v[6]); u[3] = _mm_add_epi32(v[3], v[7]); u[4] = _mm_sub_epi32(v[0], v[4]); u[5] = _mm_sub_epi32(v[1], v[5]); u[6] = _mm_sub_epi32(v[2], v[6]); u[7] = _mm_sub_epi32(v[3], v[7]); u[8] = _mm_add_epi32(v[8], v[12]); u[9] = _mm_add_epi32(v[9], v[13]); u[10] = _mm_add_epi32(v[10], v[14]); u[11] = _mm_add_epi32(v[11], v[15]); u[12] = _mm_sub_epi32(v[8], v[12]); u[13] = _mm_sub_epi32(v[9], v[13]); u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); // stage 6 v[0] = u[0]; v[1] = u[1]; v[2] = u[2]; v[3] = u[3]; v[4] = u[4]; v[5] = u[5]; v[6] = u[6]; v[7] = u[7]; v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); // stage 7 u[0] = _mm_add_epi32(v[0], v[8]); u[1] = _mm_add_epi32(v[1], v[9]); u[2] = _mm_add_epi32(v[2], v[10]); u[3] = _mm_add_epi32(v[3], v[11]); u[4] = _mm_add_epi32(v[4], v[12]); u[5] = _mm_add_epi32(v[5], v[13]); u[6] = _mm_add_epi32(v[6], v[14]); u[7] = _mm_add_epi32(v[7], v[15]); u[8] = _mm_sub_epi32(v[0], v[8]); u[9] = _mm_sub_epi32(v[1], v[9]); u[10] = _mm_sub_epi32(v[2], v[10]); u[11] = _mm_sub_epi32(v[3], v[11]); u[12] = _mm_sub_epi32(v[4], v[12]); u[13] = _mm_sub_epi32(v[5], v[13]); u[14] = _mm_sub_epi32(v[6], v[14]); u[15] = _mm_sub_epi32(v[7], v[15]); // stage 8 v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); // stage 9 out[0 * num_cols + col] = v[1]; out[1 * num_cols + col] = v[14]; out[2 * num_cols + col] = v[3]; out[3 * num_cols + col] = v[12]; out[4 * num_cols + col] = v[5]; out[5 * num_cols + col] = v[10]; out[6 * num_cols + col] = v[7]; out[7 * num_cols + col] = v[8]; out[8 * num_cols + col] = v[9]; out[9 * num_cols + col] = v[6]; out[10 * num_cols + col] = v[11]; out[11 * num_cols + col] = v[4]; out[12 * num_cols + col] = v[13]; out[13 * num_cols + col] = v[2]; out[14 * num_cols + col] = v[15]; out[15 * 
num_cols + col] = v[0]; } } static void col_txfm_16x16_rounding(__m128i *in, int shift) { // Note: // We split 16x16 rounding into 4 sections of 8x8 rounding, // instead of 4 columns col_txfm_8x8_rounding(&in[0], shift); col_txfm_8x8_rounding(&in[16], shift); col_txfm_8x8_rounding(&in[32], shift); col_txfm_8x8_rounding(&in[48], shift); } static void col_txfm_8x16_rounding(__m128i *in, int shift) { col_txfm_8x8_rounding(&in[0], shift); col_txfm_8x8_rounding(&in[16], shift); } static void write_buffer_16x16(const __m128i *in, int32_t *output) { const int size_8x8 = 16 * 4; write_buffer_8x8(&in[0], output); output += size_8x8; write_buffer_8x8(&in[16], output); output += size_8x8; write_buffer_8x8(&in[32], output); output += size_8x8; write_buffer_8x8(&in[48], output); } static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { (void)bit; __m128i fact = _mm_set1_epi32(2 * NewSqrt2); __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); __m128i a_low; int num_iters = 16 * col_num; for (int i = 0; i < num_iters; i++) { a_low = _mm_mullo_epi32(in[i], fact); a_low = _mm_add_epi32(a_low, offset); out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); } } void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64], out[64]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; const int txw_idx = get_txw_idx(TX_16X16); const int txh_idx = get_txh_idx(TX_16X16); const int col_num = 4; switch (tx_type) { case DCT_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case ADST_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case DCT_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case ADST_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case FLIPADST_DCT: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case DCT_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case FLIPADST_FLIPADST: load_buffer_16x16(input, in, stride, 1, 1, shift[0]); 
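/*
 * The (flipud, fliplr) = (1, 1) arguments mirror the input block vertically
 * and horizontally before the ADST pair is applied; this is how the FLIPADST
 * variants are realised from the ordinary ADST kernels rather than with
 * dedicated flipped transform tables.
 */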
fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case ADST_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case FLIPADST_ADST: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case IDTX: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case V_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case H_DCT: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case V_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case H_ADST: load_buffer_16x16(input, in, stride, 0, 0, shift[0]); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case V_FLIPADST: load_buffer_16x16(input, in, stride, 1, 0, shift[0]); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; case H_FLIPADST: load_buffer_16x16(input, in, stride, 0, 1, shift[0]); idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); col_txfm_16x16_rounding(out, -shift[1]); transpose_16x16(out, in); fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); write_buffer_16x16(out, coeff); break; default: assert(0); } (void)bd; } static inline void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; for (int i = 1; i < size; i += 2) in[size - i] = out[i]; } static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { fdct8x8_sse4_1, // DCT_DCT fadst8x8_sse4_1, // ADST_DCT 
fdct8x8_sse4_1, // DCT_ADST fadst8x8_sse4_1, // ADST_ADST fadst8x8_sse4_1, // FLIPADST_DCT fdct8x8_sse4_1, // DCT_FLIPADST fadst8x8_sse4_1, // FLIPADST_FLIPADST fadst8x8_sse4_1, // ADST_FLIPADST fadst8x8_sse4_1, // FLIPADST_ADST idtx8x8_sse4_1, // IDTX fdct8x8_sse4_1, // V_DCT idtx8x8_sse4_1, // H_DCT fadst8x8_sse4_1, // V_ADST idtx8x8_sse4_1, // H_ADST fadst8x8_sse4_1, // V_FLIPADST idtx8x8_sse4_1 // H_FLIPADST }; #if !CONFIG_REALTIME_ONLY static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { fdct8x8_sse4_1, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST-ADST idtx32x8_sse4_1, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL, // H_FLIPADST }; #endif static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { fdct4x8_sse4_1, // DCT_DCT fadst8x8_sse4_1, // ADST_DCT fdct4x8_sse4_1, // DCT_ADST fadst8x8_sse4_1, // ADST_ADST fadst8x8_sse4_1, // FLIPADST_DCT fdct4x8_sse4_1, // DCT_FLIPADST fadst8x8_sse4_1, // FLIPADST_FLIPADST fadst8x8_sse4_1, // ADST_FLIPADST fadst8x8_sse4_1, // FLIPADST_ADST idtx8x8_sse4_1, // IDTX fdct4x8_sse4_1, // V_DCT idtx8x8_sse4_1, // H_DCT fadst8x8_sse4_1, // V_ADST idtx8x8_sse4_1, // H_ADST fadst8x8_sse4_1, // V_FLIPADST idtx8x8_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { fdct16x16_sse4_1, // DCT_DCT fdct16x16_sse4_1, // ADST_DCT fadst16x16_sse4_1, // DCT_ADST fadst16x16_sse4_1, // ADST_ADST fdct16x16_sse4_1, // FLIPADST_DCT fadst16x16_sse4_1, // DCT_FLIPADST fadst16x16_sse4_1, // FLIPADST_FLIPADST fadst16x16_sse4_1, // ADST_FLIPADST fadst16x16_sse4_1, // FLIPADST_ADST idtx16x16_sse4_1, // IDTX idtx16x16_sse4_1, // V_DCT fdct16x16_sse4_1, // H_DCT idtx16x16_sse4_1, // V_ADST fadst16x16_sse4_1, // H_ADST idtx16x16_sse4_1, // V_FLIPADST fadst16x16_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { fdct16x16_sse4_1, // DCT_DCT fadst16x16_sse4_1, // ADST_DCT fdct16x16_sse4_1, // DCT_ADST fadst16x16_sse4_1, // ADST_ADST fadst16x16_sse4_1, // FLIPADST_DCT fdct16x16_sse4_1, // DCT_FLIPADST fadst16x16_sse4_1, // FLIPADST_FLIPADST fadst16x16_sse4_1, // ADST_FLIPADST fadst16x16_sse4_1, // FLIPADST_ADST idtx16x16_sse4_1, // IDTX fdct16x16_sse4_1, // V_DCT idtx16x16_sse4_1, // H_DCT fadst16x16_sse4_1, // V_ADST idtx16x16_sse4_1, // H_ADST fadst16x16_sse4_1, // V_FLIPADST idtx16x16_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { fdct8x8_sse4_1, // DCT_DCT fdct8x8_sse4_1, // ADST_DCT fadst8x8_sse4_1, // DCT_ADST fadst8x8_sse4_1, // ADST_ADST fdct8x8_sse4_1, // FLIPADST_DCT fadst8x8_sse4_1, // DCT_FLIPADST fadst8x8_sse4_1, // FLIPADST_FLIPADST fadst8x8_sse4_1, // ADST_FLIPADST fadst8x8_sse4_1, // FLIPADST_ADST idtx8x8_sse4_1, // IDTX idtx8x8_sse4_1, // V_DCT fdct8x8_sse4_1, // H_DCT idtx8x8_sse4_1, // V_ADST fadst8x8_sse4_1, // H_ADST idtx8x8_sse4_1, // V_FLIPADST fadst8x8_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { fdct4x8_sse4_1, // DCT_DCT fdct4x8_sse4_1, // ADST_DCT fadst8x8_sse4_1, // DCT_ADST fadst8x8_sse4_1, // ADST_ADST fdct4x8_sse4_1, // FLIPADST_DCT fadst8x8_sse4_1, // DCT_FLIPADST fadst8x8_sse4_1, // FLIPADST_FLIPADST fadst8x8_sse4_1, // ADST_FLIPADST fadst8x8_sse4_1, // FLIPADST_ADST idtx8x8_sse4_1, // IDTX idtx8x8_sse4_1, // V_DCT fdct4x8_sse4_1, // H_DCT 
idtx8x8_sse4_1, // V_ADST fadst8x8_sse4_1, // H_ADST idtx8x8_sse4_1, // V_FLIPADST fadst8x8_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { fdct4x4_sse4_1, // DCT_DCT fdct4x4_sse4_1, // ADST_DCT fadst4x4_sse4_1, // DCT_ADST fadst4x4_sse4_1, // ADST_ADST fdct4x4_sse4_1, // FLIPADST_DCT fadst4x4_sse4_1, // DCT_FLIPADST fadst4x4_sse4_1, // FLIPADST_FLIPADST fadst4x4_sse4_1, // ADST_FLIPADST fadst4x4_sse4_1, // FLIPADST_ADST idtx4x4_sse4_1, // IDTX idtx4x4_sse4_1, // V_DCT fdct4x4_sse4_1, // H_DCT idtx4x4_sse4_1, // V_ADST fadst4x4_sse4_1, // H_ADST idtx4x4_sse4_1, // V_FLIPADST fadst4x4_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { fdct4x4_sse4_1, // DCT_DCT fadst4x4_sse4_1, // ADST_DCT fdct4x4_sse4_1, // DCT_ADST fadst4x4_sse4_1, // ADST_ADST fadst4x4_sse4_1, // FLIPADST_DCT fdct4x4_sse4_1, // DCT_FLIPADST fadst4x4_sse4_1, // FLIPADST_FLIPADST fadst4x4_sse4_1, // ADST_FLIPADST fadst4x4_sse4_1, // FLIPADST_ADST idtx4x4_sse4_1, // IDTX fdct4x4_sse4_1, // V_DCT idtx4x4_sse4_1, // H_DCT fadst4x4_sse4_1, // V_ADST idtx4x4_sse4_1, // H_ADST fadst4x4_sse4_1, // V_FLIPADST idtx4x4_sse4_1 // H_FLIPADST }; static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { av1_fdct32_sse4_1, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST av1_idtx32_sse4_1, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { fdct16x16_sse4_1, // DCT_DCT NULL, // ADST_DCT NULL, // DCT_ADST NULL, // ADST_ADST NULL, // FLIPADST_DCT NULL, // DCT_FLIPADST NULL, // FLIPADST_FLIPADST NULL, // ADST_FLIPADST NULL, // FLIPADST_ADST idtx16x16_sse4_1, // IDTX NULL, // V_DCT NULL, // H_DCT NULL, // V_ADST NULL, // H_ADST NULL, // V_FLIPADST NULL // H_FLIPADST }; void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[32], out[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; const int txw_idx = get_txw_idx(TX_16X8); const int txh_idx = get_txh_idx(TX_16X8); const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); for (int i = 0; i < 2; i++) { load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); col_txfm(in, in, bit, 2); col_txfm_8x8_rounding(in, -shift[1]); transpose_8x8(in, out + i * 16); } if (lr_flip) { flip_buf_sse4_1(in, out, 32); row_txfm(in, out, bit, 2); } else { row_txfm(out, out, bit, 2); } for (int i = 0; i < 2; i++) { av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2], NewSqrt2); write_buffer_8x8(in, coeff + i * 64); } (void)bd; } void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[32], out[32]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; const int txw_idx = get_txw_idx(TX_8X16); const int txh_idx = get_txh_idx(TX_8X16); const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, 
&ud_flip, &lr_flip); load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); col_txfm(in, in, bit, 2); col_txfm_8x16_rounding(in, -shift[1]); transpose_8x8(in, out); transpose_8x8(in + 16, out + 16); for (int i = 0; i < 2; i++) { row_txfm(out + i * 16, out, bit, 2); av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2); write_buffer_16x8(out, coeff + i * 8, 16); } (void)bd; } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16]; __m128i *outcoeff128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; const int txw_idx = get_txw_idx(TX_4X16); const int txh_idx = get_txh_idx(TX_4X16); const int txfm_size_col = tx_size_wide[TX_4X16]; const int txfm_size_row = tx_size_high[TX_4X16]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // col transform load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); col_txfm(in, outcoeff128, bitcol, 1); col_txfm_8x8_rounding(outcoeff128, -shift[1]); transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); // row transform for (int i = 0; i < 4; i++) { __m128i tmp[4]; row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2); store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col); } (void)bd; } #endif void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[16]; __m128i *outcoeff128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; const int txw_idx = get_txw_idx(TX_16X4); const int txh_idx = get_txh_idx(TX_16X4); const int txfm_size_col = tx_size_wide[TX_16X4]; const int txfm_size_row = tx_size_high[TX_16X4]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // col transform load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); for (int i = 0; i < (txfm_size_col >> 2); i++) { __m128i *cur_in = &in[i * txfm_size_row]; col_txfm(cur_in, cur_in, bitcol, 1); transpose_32bit_4x4(cur_in, cur_in); } col_txfm_8x8_rounding(in, -shift[1]); // row transform row_txfm(in, outcoeff128, bitrow, 1); (void)bd; } void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[128]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; const int txw_idx = get_txw_idx(TX_16X32); const int txh_idx = get_txh_idx(TX_16X32); const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; // column transform load_buffer_16x16(input, in, stride, 0, 0, shift[0]); load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); for (int i = 0; i < 4; i++) { col_txfm((in + i), (in + i), bitcol, 4); } col_txfm_16x16_rounding(&in[0], -shift[1]); col_txfm_16x16_rounding(&in[64], -shift[1]); transpose_8nx8n(in, 
outcoef128, 16, 32); // row transform row_txfm(outcoef128, in, bitrow, 8); av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], NewSqrt2); (void)bd; } void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)tx_type; __m128i in[512]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; const int txw_idx = get_txw_idx(TX_32X64); const int txh_idx = get_txh_idx(TX_32X64); const int txfm_size_col = tx_size_wide[TX_32X64]; const int txfm_size_row = tx_size_high[TX_32X64]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int num_row = txfm_size_row >> 2; const int num_col = txfm_size_col >> 2; // column transform load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); for (int i = 0; i < num_col; i++) { av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); } for (int i = 0; i < num_col; i++) { col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); } transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); // row transform for (int i = 0; i < num_row; i++) { av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); } for (int i = 0; i < txfm_size_col; i++) { av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8, -shift[2], NewSqrt2); } (void)bd; } void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { (void)tx_type; __m128i in[512]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; const int txw_idx = get_txw_idx(TX_64X32); const int txh_idx = get_txh_idx(TX_64X32); const int txfm_size_col = tx_size_wide[TX_64X32]; const int txfm_size_row = tx_size_high[TX_64X32]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int num_row = txfm_size_row >> 2; const int num_col = txfm_size_col >> 2; // column transform for (int i = 0; i < 32; i++) { load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, shift[0]); load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, shift[0]); load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, shift[0]); } for (int i = 0; i < num_col; i++) { av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); } for (int i = 0; i < num_row; i++) { col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); } transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); // row transform for (int i = 0; i < num_row; i++) { av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); } av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2], NewSqrt2); (void)bd; } void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[128]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; const int txw_idx = get_txw_idx(TX_32X16); const int txh_idx = get_txh_idx(TX_32X16); const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; // column transform load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); col_txfm(in, in, bitcol, 8); col_txfm_16x16_rounding(&in[0], 
-shift[1]); col_txfm_16x16_rounding(&in[64], -shift[1]); transpose_8nx8n(in, outcoef128, 32, 16); // row transform for (int i = 0; i < 4; i++) { row_txfm((outcoef128 + i), (in + i), bitrow, 4); } av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], NewSqrt2); (void)bd; } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; const int txw_idx = get_txw_idx(TX_8X32); const int txh_idx = get_txh_idx(TX_8X32); const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[TX_8X32]; const int txfm_size_row = tx_size_high[TX_8X32]; const int num_col = txfm_size_col >> 2; // column transform load_buffer_8x16(input, in, stride, 0, 0, shift[0]); load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, stride, 0, 0, shift[0]); for (int i = 0; i < num_col; i++) { col_txfm((in + i), (in + i), bitcol, num_col); } col_txfm_16x16_rounding(in, -shift[1]); transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); // row transform for (int i = 0; i < txfm_size_col; i += 2) { row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col); } (void)bd; } void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[64]; __m128i *outcoef128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; const int txw_idx = get_txw_idx(TX_32X8); const int txh_idx = get_txh_idx(TX_32X8); const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const int txfm_size_col = tx_size_wide[TX_32X8]; const int txfm_size_row = tx_size_high[TX_32X8]; const int num_col = txfm_size_row >> 2; // column transform load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); for (int i = 0; i < txfm_size_row; i += 2) { col_txfm((in + i), (in + i), bitcol, txfm_size_row); } col_txfm_16x16_rounding(&in[0], -shift[1]); transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); // row transform for (int i = 0; i < num_col; i++) { row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col); } (void)bd; } #endif void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[8]; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; const int txw_idx = get_txw_idx(TX_4X8); const int txh_idx = get_txh_idx(TX_4X8); const int txfm_size_col = tx_size_wide[TX_4X8]; const int txfm_size_row = tx_size_high[TX_4X8]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); col_txfm(in, in, bitcol, 1); col_txfm_4x8_rounding(in, -shift[1]); for (int i = 0; i < 2; i++) { __m128i *cur_in = &in[i * 4]; transpose_32bit_4x4(cur_in, cur_in); row_txfm(cur_in, 
cur_in, bitrow, 1); av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col, -shift[2], NewSqrt2); store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4); } (void)bd; } void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[8]; __m128i *outcoeff128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; const int txw_idx = get_txw_idx(TX_8X4); const int txh_idx = get_txh_idx(TX_8X4); const int txfm_size_col = tx_size_wide[TX_8X4]; const int txfm_size_row = tx_size_high[TX_8X4]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // col tranform load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]); for (int i = 0; i < 2; i++) { __m128i *cur_in = &in[i * txfm_size_row]; col_txfm(cur_in, cur_in, bitcol, 1); transpose_32bit_4x4(cur_in, cur_in); } col_txfm_4x8_rounding(in, -shift[1]); // row tranform row_txfm(in, outcoeff128, bitrow, 1); av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col, -shift[2], NewSqrt2); (void)bd; } #if !CONFIG_REALTIME_ONLY void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[256]; __m128i *outcoeff128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64]; const int txw_idx = get_txw_idx(TX_16X64); const int txh_idx = get_txh_idx(TX_16X64); const int txfm_size_col = tx_size_wide[TX_16X64]; const int txfm_size_row = tx_size_high[TX_16X64]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); const int num_col = txfm_size_col >> 2; // col tranform for (int i = 0; i < txfm_size_row; i += num_col) { load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col, ud_flip, lr_flip, shift[0]); } for (int i = 0; i < num_col; i++) { av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col); } col_txfm_16x16_rounding(outcoeff128, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); transpose_8nx8n(outcoeff128, in, txfm_size_col, 32); fdct16x16_sse4_1(in, outcoeff128, bitrow, 8); (void)bd; } void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff, int stride, TX_TYPE tx_type, int bd) { __m128i in[256]; __m128i *outcoeff128 = (__m128i *)coeff; const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16]; const int txw_idx = get_txw_idx(TX_64X16); const int txh_idx = get_txh_idx(TX_64X16); const int txfm_size_col = tx_size_wide[TX_64X16]; const int txfm_size_row = tx_size_high[TX_64X16]; int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; int ud_flip, lr_flip; get_flip_cfg(tx_type, &ud_flip, &lr_flip); // col tranform for (int i = 0; i < txfm_size_row; i++) { load_buffer_4x4(input + 0 + i * 
stride, in + 0 + i * txfm_size_row, 4, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4, ud_flip, lr_flip, shift[0]); load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4, ud_flip, lr_flip, shift[0]); } fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row); col_txfm_16x16_rounding(outcoeff128, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); for (int i = 0; i < 4; i++) { av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4); } memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff)); (void)bd; } #endif 
aom-3.12.1/av1/encoder/x86/highbd_temporal_filter_avx2.c000066400000000000000000000444401477627663500227430ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ 
#include <assert.h> #include <immintrin.h> #include "config/av1_rtcd.h" #include "aom_dsp/mathutils.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" 
#define SSE_STRIDE (BW + 4) DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }; 
static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint32_t *frame_sse, const unsigned int sse_stride) { (void)block_width; const uint16_t *src1 = frame1; const uint16_t *src2 = frame2; uint32_t *dst = frame_sse + 2; for (int i = 0; i < block_height; i++) { __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1); __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2); __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2); __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff); __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); __m256i diff_lo = _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); __m256i diff_hi = _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); _mm256_storeu_si256((__m256i *)dst, diff_lo); dst += 8; _mm256_storeu_si256((__m256i *)dst, diff_hi); src1 += stride, src2 += stride2; dst += sse_stride - 8; } } 
static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint32_t *frame_sse, const unsigned int sse_stride) { (void)block_width; const uint16_t *src1 = frame1; const uint16_t
*src2 = frame2; uint32_t *dst = frame_sse + 2; for (int i = 0; i < block_height; i++) { __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1); __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2); __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2); __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff); __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); __m256i diff_lo = _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); __m256i diff_hi = _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); _mm256_storeu_si256((__m256i *)dst, diff_lo); _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi); v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16)); v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16)); v_diff = _mm256_sub_epi16(v_src1, v_src2); v_mullo = _mm256_mullo_epi16(v_diff, v_diff); v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); diff_lo = _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); diff_hi = _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo); _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi); src1 += stride; src2 += stride2; dst += sse_stride; } } static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src, __m256i *v256tmp) { *v256tmp = _mm256_loadu_si256((__m256i *)src); // For the first column, replicate the first element twice to the left __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA); *v256tmp = _mm256_inserti128_si256(*v256tmp, _mm256_extracti128_si256(v256tmp1, 0), 0); } static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src, __m256i *v256tmp) { *v256tmp = _mm256_loadu_si256((__m256i *)src); // For the last column, replicate the last element twice to the right __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54); *v256tmp = _mm256_inserti128_si256(*v256tmp, _mm256_extracti128_si256(v256tmp1, 1), 1); } static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { // Mask the required 5 values inside the vector __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); __m128i v128a, v128b; // Extract 256b as two 128b registers A and B v128a = _mm256_castsi256_si128(vtmp); v128b = _mm256_extracti128_si256(vtmp, 1); // A = [A0+B0, A1+B1, A2+B2, A3+B3] v128a = _mm_add_epi32(v128a, v128b); // B = [A2+B2, A3+B3, 0, 0] v128b = _mm_srli_si128(v128a, 8); // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] v128a = _mm_add_epi32(v128a, v128b); // B = [A1+B1+A3+B3, 0, 0, 0] v128b = _mm_srli_si128(v128a, 4); // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] v128a = _mm_add_epi32(v128a, v128b); return _mm_extract_epi32(v128a, 0); } static void highbd_apply_temporal_filter( const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, const unsigned int stride2, const int block_width, const int block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t acc_5x5_sse[BH][BW]; if (block_width == 32) { get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, 
SSE_STRIDE); } else { get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, SSE_STRIDE); } __m256i vsrc[5]; // Traverse 4 columns at a time // First and last columns will require padding int col; uint32_t *src = frame_sse; for (int i = 2; i < 5; i++) { xx_load_and_pad_left(src, &vsrc[i]); src += SSE_STRIDE; } // Copy first row to first 2 vectors vsrc[0] = vsrc[2]; vsrc[1] = vsrc[2]; for (int row = 0; row < block_height - 3; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } xx_load_and_pad_left(src, &vsrc[4]); src += SSE_STRIDE; acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); } for (int row = block_height - 3; row < block_height; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); } for (col = 4; col < block_width - 4; col += 4) { src = frame_sse + col; // Load and pad(for first and last col) 3 rows from the top for (int i = 2; i < 5; i++) { vsrc[i] = _mm256_loadu_si256((__m256i *)src); src += SSE_STRIDE; } // Copy first row to first 2 vectors vsrc[0] = vsrc[2]; vsrc[1] = vsrc[2]; for (int row = 0; row < block_height - 3; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } vsrc[4] = _mm256_loadu_si256((__m256i *)src); src += SSE_STRIDE; acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); } for (int row = block_height - 3; row < block_height; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); } } src = frame_sse + col; // Load and pad(for first and last col) 3 rows from the top for (int i = 2; i < 5; i++) { xx_load_and_pad_right(src, &vsrc[i]); src += SSE_STRIDE; } // Copy first row to first 2 vectors vsrc[0] = vsrc[2]; vsrc[1] = vsrc[2]; for (int row = 0; row < block_height - 3; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } xx_load_and_pad_right(src, &vsrc[4]); src += SSE_STRIDE; 
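/*
 * Note on acc_5x5_sse (applies to every column pass in this function):
 * vsrc[0..4] hold five consecutive rows of the padded squared-error
 * buffer (rows above/below the block and the two columns outside each
 * side are synthesized by replication via the row copies and
 * xx_load_and_pad_left/right), and xx_mask_and_hadd() keeps five of the
 * eight 32-bit lanes of the vertical sum and reduces them horizontally.
 * The net effect is a 5x5-window sum of squared errors per pixel. A rough
 * scalar equivalent, for illustration only (clamp() is shorthand for
 * limiting the index to the valid row/column range, not a helper defined
 * in this file):
 *
 *   for (int r = 0; r < block_height; r++)
 *     for (int c = 0; c < block_width; c++) {
 *       uint32_t s = 0;
 *       for (int dy = -2; dy <= 2; dy++)
 *         for (int dx = -2; dx <= 2; dx++)
 *           s += frame_sse[clamp(r + dy) * SSE_STRIDE + clamp(c + dx) + 2];
 *       acc_5x5_sse[r][c] = s;
 *     }
 */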
acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); } for (int row = block_height - 3; row < block_height; row++) { __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); } double subblock_mses_scaled[4]; double d_factor_decayed[4]; for (int idx = 0; idx < 4; idx++) { subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; d_factor_decayed[idx] = d_factor[idx] * decay_factor; } if (tf_wgt_calc_lvl == 0) { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; // Scale down the difference for high bit depth input. diff_sse >>= ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); count[k] += weight; accumulator[k] += weight * pixel_value; } } } else { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; // Scale down the difference for high bit depth input. 
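/*
 * The shift below rescales the error for bit depth; after that, the
 * per-pixel weight computed by the rest of this loop is, in summary:
 *
 *   window_error   = diff_sse / num_ref_pixels
 *   combined_error = weight_factor * window_error
 *                    + subblock_mses_scaled[subblock_idx]
 *   scaled_error   = min(combined_error * d_factor_decayed[subblock_idx], 7)
 *   weight         = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE)
 *
 * The weight is added to count[] and weight * pixel_value to
 * accumulator[], i.e. a weighted average in the spirit of non-local means.
 */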
diff_sse >>= ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); count[k] += weight; accumulator[k] += weight * pixel_value; } } } } void av1_highbd_apply_temporal_filter_avx2( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint16_t *ref = CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0, k = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++, k++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; } } } } } highbd_apply_temporal_filter( ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } aom-3.12.1/av1/encoder/x86/highbd_temporal_filter_sse2.c000066400000000000000000000340171477627663500227360ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/mathutils.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" // For the squared error buffer, keep a padding for 4 samples #define SSE_STRIDE (BW + 4) DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; static void get_squared_error(const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint32_t *frame_sse, const unsigned int dst_stride) { const uint16_t *src1 = frame1; const uint16_t *src2 = frame2; uint32_t *dst = frame_sse; for (int i = 0; i < block_height; i++) { for (int j = 0; j < block_width; j += 8) { __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2); __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff); __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff); __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh); __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh); _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); _mm_storeu_si128((__m128i *)(dst + j + 6), vres2); } src1 += stride; src2 += stride2; dst += dst_stride; } } static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col, int block_width) { __m128i vtmp1 = _mm_loadu_si128((__m128i *)src); __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4)); // For the first column, replicate the first element twice to the left dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); // For the last column, replicate the last element twice to the right dstvec[1] = (col < block_width - 4) ? 
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); } static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { __m128i veca, vecb; // Mask and obtain the required 5 values inside the vector veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); // A = [A0+B0, A1+B1, A2+B2, A3+B3] veca = _mm_add_epi32(veca, vecb); // B = [A2+B2, A3+B3, 0, 0] vecb = _mm_srli_si128(veca, 8); // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] veca = _mm_add_epi32(veca, vecb); // B = [A1+B1+A3+B3, 0, 0, 0] vecb = _mm_srli_si128(veca, 4); // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] veca = _mm_add_epi32(veca, vecb); return _mm_cvtsi128_si32(veca); } static void highbd_apply_temporal_filter( const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, const unsigned int stride2, const int block_width, const int block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t acc_5x5_sse[BH][BW]; get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, SSE_STRIDE); __m128i vsrc[5][2]; // Traverse 4 columns at a time // First and last columns will require padding for (int col = 0; col < block_width; col += 4) { uint32_t *src = frame_sse + col; // Load and pad(for first and last col) 3 rows from the top for (int i = 2; i < 5; i++) { xx_load_and_pad(src, vsrc[i], col, block_width); src += SSE_STRIDE; } // Padding for top 2 rows vsrc[0][0] = vsrc[2][0]; vsrc[0][1] = vsrc[2][1]; vsrc[1][0] = vsrc[2][0]; vsrc[1][1] = vsrc[2][1]; for (int row = 0; row < block_height - 3; row++) { __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); vsrc[0][0] = vsrc[1][0]; vsrc[0][1] = vsrc[1][1]; vsrc[1][0] = vsrc[2][0]; vsrc[1][1] = vsrc[2][1]; vsrc[2][0] = vsrc[3][0]; vsrc[2][1] = vsrc[3][1]; vsrc[3][0] = vsrc[4][0]; vsrc[3][1] = vsrc[4][1]; // Load next row xx_load_and_pad(src, vsrc[4], col, block_width); src += SSE_STRIDE; acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); } for (int row = block_height - 3; row < block_height; row++) { __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); vsrc[0][0] = vsrc[1][0]; vsrc[0][1] = vsrc[1][1]; vsrc[1][0] = vsrc[2][0]; vsrc[1][1] = vsrc[2][1]; vsrc[2][0] = vsrc[3][0]; vsrc[2][1] = vsrc[3][1]; vsrc[3][0] = vsrc[4][0]; vsrc[3][1] = 
vsrc[4][1]; acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); } } double subblock_mses_scaled[4]; double d_factor_decayed[4]; for (int idx = 0; idx < 4; idx++) { subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; d_factor_decayed[idx] = d_factor[idx] * decay_factor; } if (tf_wgt_calc_lvl == 0) { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; // Scale down the difference for high bit depth input. diff_sse >>= ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); count[k] += weight; accumulator[k] += weight * pixel_value; } } } else { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; // Scale down the difference for high bit depth input. diff_sse >>= ((bd - 8) * 2); const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); count[k] += weight; accumulator[k] += weight * pixel_value; } } } } void av1_highbd_apply_temporal_filter_sse2( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. 
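/*
 * Worked example of the q_decay values computed below (numbers taken from
 * the code and its comments, assuming q_factor = 255 and
 * q_factor >= TF_QINDEX_CUTOFF): q_decay = 0.5 * (255 / 64)^2 ~= 7.9,
 * consistent with the stated upper bound of 8. For q_factor below the
 * cutoff, q_decay = (q_factor / TF_Q_DECAY_THRESHOLD)^2, clipped to
 * [1e-5, 1].
 */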
// Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint16_t *ref = CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0, k = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++, k++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; } } } } } highbd_apply_temporal_filter( ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } aom-3.12.1/av1/encoder/x86/ml_avx2.c000066400000000000000000000251551477627663500166600ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "config/av1_rtcd.h" #include "av1/encoder/ml.h" #include "av1/encoder/x86/ml_sse3.h" #define CALC_OUTPUT_FOR_2ROWS \ const int index = weight_idx + (2 * i * tot_num_inputs); \ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \ hadd[i] = _mm256_hadd_ps(mul0, mul1); static inline void nn_propagate_8to1( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { // Process one output row at a time. for (int out = 0; out < num_outputs; out++) { __m256 in_result = _mm256_setzero_ps(); float bias_val = bias[out]; for (int in = 0; in < num_inputs_to_process; in += 8) { const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); const int weight_idx = in + (out * tot_num_inputs); const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]); const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); in_result = _mm256_add_ps(in_result, mul0); } const __m128 low_128 = _mm256_castps256_ps128(in_result); const __m128 high_128 = _mm256_extractf128_ps(in_result, 1); const __m128 sum_par_0 = _mm_add_ps(low_128, high_128); const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0); const __m128 sum_tot = _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1); bias_val += _mm_cvtss_f32(sum_tot); if (is_clip_required) bias_val = AOMMAX(bias_val, 0); output_nodes[out] = bias_val; } } static inline void nn_propagate_8to4( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { __m256 hadd[2]; for (int out = 0; out < num_outputs; out += 4) { __m128 bias_reg = _mm_loadu_ps(&bias[out]); __m128 in_result = _mm_setzero_ps(); for (int in = 0; in < num_inputs_to_process; in += 8) { const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); const int weight_idx = in + (out * tot_num_inputs); // Process two output row at a time. 
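/*
 * CALC_OUTPUT_FOR_2ROWS (defined at the top of this file) loads the weight
 * rows for outputs 2*i and 2*i+1, multiplies each by the same eight
 * inputs, and folds the two products together with _mm256_hadd_ps into
 * hadd[i]. The extra hadd and the low/high 128-bit add below then collapse
 * hadd[0..1] into one partial dot product per output for this group of
 * eight inputs, accumulated into in_result.
 */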
for (int i = 0; i < 2; i++) { CALC_OUTPUT_FOR_2ROWS } const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]); const __m128 low_128 = _mm256_castps256_ps128(sum_par); const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1); const __m128 result = _mm_add_ps(low_128, high_128); in_result = _mm_add_ps(in_result, result); } in_result = _mm_add_ps(in_result, bias_reg); if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps()); _mm_storeu_ps(&output_nodes[out], in_result); } } static inline void nn_propagate_8to8( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, int num_outputs, float *const output_nodes, int is_clip_required) { __m256 hadd[4]; for (int out = 0; out < num_outputs; out += 8) { __m256 bias_reg = _mm256_loadu_ps(&bias[out]); __m256 in_result = _mm256_setzero_ps(); for (int in = 0; in < num_inputs_to_process; in += 8) { const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); const int weight_idx = in + (out * tot_num_inputs); // Process two output rows at a time. for (int i = 0; i < 4; i++) { CALC_OUTPUT_FOR_2ROWS } const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]); const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]); __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20); __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31); __m256 result = _mm256_add_ps(ht_0, ht_1); in_result = _mm256_add_ps(in_result, result); } in_result = _mm256_add_ps(in_result, bias_reg); if (is_clip_required) in_result = _mm256_max_ps(in_result, _mm256_setzero_ps()); _mm256_storeu_ps(&output_nodes[out], in_result); } } static inline void nn_propagate_input_multiple_of_8( const float *const inputs, const float *const weights, const float *const bias, int num_inputs_to_process, int tot_num_inputs, bool is_output_layer, int num_outputs, float *const output_nodes) { // The saturation of output is considered for hidden layer which is not equal // to final hidden layer. const int is_clip_required = !is_output_layer && num_inputs_to_process == tot_num_inputs; if (num_outputs % 8 == 0) { nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process, tot_num_inputs, num_outputs, output_nodes, is_clip_required); } else if (num_outputs % 4 == 0) { nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process, tot_num_inputs, num_outputs, output_nodes, is_clip_required); } else { nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process, tot_num_inputs, num_outputs, output_nodes, is_clip_required); } } void av1_nn_predict_avx2(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output) { float buf[2][NN_MAX_NODES_PER_LAYER]; int buf_index = 0; int num_inputs = nn_config->num_inputs; assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER); for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { const float *layer_weights = nn_config->weights[layer]; const float *layer_bias = nn_config->bias[layer]; bool is_output_layer = layer == nn_config->num_hidden_layers; float *const output_nodes = is_output_layer ? output : &buf[buf_index][0]; const int num_outputs = is_output_layer ? nn_config->num_outputs : nn_config->num_hidden_nodes[layer]; assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER); // Process input multiple of 8 using AVX2 intrinsic. 
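/*
 * Dispatch summary for this layer (describing the branches below): when
 * num_inputs is a multiple of 8 the whole layer stays on the AVX2 kernels;
 * otherwise the first (num_inputs / 8) * 8 inputs go through the AVX2
 * kernels (which also add the bias), and the remaining 1..7 inputs are
 * finished with the SSE3 helpers or the scalar-in-SSE fallback, selected
 * by how the remainder and num_outputs divide by 4 and 8.
 */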
if (num_inputs % 8 == 0) { nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias, num_inputs, num_inputs, is_output_layer, num_outputs, output_nodes); } else { // When number of inputs is not multiple of 8, use hybrid approach of AVX2 // and SSE3 based on the need. const int in_mul_8 = num_inputs / 8; const int num_inputs_to_process = in_mul_8 * 8; int bias_is_considered = 0; if (in_mul_8) { nn_propagate_input_multiple_of_8( input_nodes, layer_weights, layer_bias, num_inputs_to_process, num_inputs, is_output_layer, num_outputs, output_nodes); bias_is_considered = 1; } const float *out_temp = bias_is_considered ? output_nodes : layer_bias; const int input_remaining = num_inputs % 8; if (input_remaining % 4 == 0 && num_outputs % 8 == 0) { for (int out = 0; out < num_outputs; out += 8) { __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]); __m128 out_l = _mm_loadu_ps(&out_temp[out]); for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { av1_nn_propagate_4to8_sse3(&input_nodes[in], &layer_weights[out * num_inputs + in], &out_h, &out_l, num_inputs); } if (!is_output_layer) { const __m128 zero = _mm_setzero_ps(); out_h = _mm_max_ps(out_h, zero); out_l = _mm_max_ps(out_l, zero); } _mm_storeu_ps(&output_nodes[out + 4], out_h); _mm_storeu_ps(&output_nodes[out], out_l); } } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) { __m128 outputs = _mm_loadu_ps(&out_temp[out]); for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { av1_nn_propagate_4to4_sse3(&input_nodes[in], &layer_weights[out * num_inputs + in], &outputs, num_inputs); } if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); _mm_storeu_ps(&output_nodes[out], outputs); } } else if (input_remaining % 4 == 0) { for (int out = 0; out < num_outputs; out++) { __m128 outputs = _mm_load1_ps(&out_temp[out]); for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { av1_nn_propagate_4to1_sse3(&input_nodes[in], &layer_weights[out * num_inputs + in], &outputs); } if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); output_nodes[out] = _mm_cvtss_f32(outputs); } } else { // Use SSE instructions for scalar operations to avoid the latency // of swapping between SIMD and FPU modes. for (int out = 0; out < num_outputs; out++) { __m128 outputs = _mm_load1_ps(&out_temp[out]); for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) { __m128 input = _mm_load1_ps(&input_nodes[in_node]); __m128 weight = _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight)); } if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); output_nodes[out] = _mm_cvtss_f32(outputs); } } } // Before processing the next layer, treat the output of current layer as // input to next layer. input_nodes = output_nodes; num_inputs = num_outputs; buf_index = 1 - buf_index; } if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); } aom-3.12.1/av1/encoder/x86/ml_sse3.c000066400000000000000000000307411477627663500166520ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "config/av1_rtcd.h" #include "av1/encoder/ml.h" #include "av1/encoder/x86/ml_sse3.h" // In order to avoid the high-latency of swapping between FPU and SIMD // operations, we keep the result in a 128-bit register even though we only // care about a single value. static void nn_propagate_8to1(const float *const inputs, const float *const weights, __m128 *const output) { const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); const __m128 inputs_l = _mm_loadu_ps(inputs); const __m128 weights_h = _mm_loadu_ps(&weights[4]); const __m128 weights_l = _mm_loadu_ps(weights); const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); // [7 6 5 4] [3 2 1 0] (weight and input indices) const __m128 vadd = _mm_add_ps(mul_l, mul_h); // [7+3 6+2 5+1 4+0] const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] *output = _mm_add_ps(*output, hadd2); } void av1_nn_propagate_4to1_sse3(const float *const inputs, const float *const weights, __m128 *const output) { const __m128 inputs128 = _mm_loadu_ps(inputs); const __m128 weights128 = _mm_loadu_ps(weights); const __m128 mul = _mm_mul_ps(inputs128, weights128); // [3 2 1 0] (weight and input indices) const __m128 hadd1 = _mm_hadd_ps(mul, mul); // [3+2 1+0 3+2 1+0] const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] *output = _mm_add_ps(*output, hadd2); } void av1_nn_propagate_4to4_sse3(const float *const inputs, const float *const weights, __m128 *const outputs, const int num_inputs) { const __m128 inputs128 = _mm_loadu_ps(inputs); __m128 hadd[2]; for (int i = 0; i < 2; i++) { // For each pair of outputs const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); const __m128 mul0 = _mm_mul_ps(weight0, inputs128); const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); const __m128 mul1 = _mm_mul_ps(weight1, inputs128); hadd[i] = _mm_hadd_ps(mul0, mul1); } // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) // hadd[1] = [15+14 13+12 11+10 9+8] const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] *outputs = _mm_add_ps(*outputs, hh); } void av1_nn_propagate_4to8_sse3(const float *const inputs, const float *const weights, __m128 *const out_h, __m128 *const out_l, const int num_inputs) { const __m128 inputs128 = _mm_loadu_ps(inputs); __m128 hadd[4]; for (int i = 0; i < 4; i++) { // For each pair of outputs const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); const __m128 mul0 = _mm_mul_ps(inputs128, weight0); const __m128 mul1 = _mm_mul_ps(inputs128, weight1); hadd[i] = _mm_hadd_ps(mul0, mul1); } // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) // hadd[1] = [15+14 13+12 11+10 9+8] // hadd[2] = [23+22 21+20 19+18 17+16] // hadd[3] = [31+30 29+28 27+26 25+24] const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] *out_h = _mm_add_ps(*out_h, hh1); *out_l = _mm_add_ps(*out_l, hh0); } static void nn_propagate_8to4(const float *const inputs, const float *const weights, __m128 
*const outputs, const int num_inputs) { const __m128 inputs_h = _mm_loadu_ps(inputs + 4); const __m128 inputs_l = _mm_loadu_ps(inputs); // [7 6 5 4] [3 2 1 0] (input indices) __m128 add[4]; for (int i = 0; i < 4; i++) { // For each output: const __m128 weight_h = _mm_loadu_ps(&weights[i * num_inputs + 4]); const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); add[i] = _mm_add_ps(mul_l, mul_h); } // add[0] = [7+3 6+2 5+1 4+0] // add[1] = [15+11 14+10 13+9 12+8] // add[2] = [23+19 22+18 21+17 20+16] // add[3] = [31+27 30+26 29+25 28+24] const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] *outputs = _mm_add_ps(*outputs, haddhadd); } static void nn_activate8(__m128 *out_h, __m128 *out_l) { const __m128 zero = _mm_setzero_ps(); *out_h = _mm_max_ps(*out_h, zero); *out_l = _mm_max_ps(*out_l, zero); } static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } // Calculate prediction based on the given input features and neural net config. // Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden // layer. void av1_nn_predict_sse3(const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output) { float buf[2][NN_MAX_NODES_PER_LAYER]; int buf_index = 0; int num_inputs = nn_config->num_inputs; // Hidden layers, except the final iteration is the output layer. for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { const float *layer_weights = nn_config->weights[layer]; const float *layer_bias = nn_config->bias[layer]; bool output_layer = (layer == nn_config->num_hidden_layers); float *const output_nodes = output_layer ? output : &buf[buf_index][0]; const int num_outputs = output_layer ? 
nn_config->num_outputs : nn_config->num_hidden_nodes[layer]; if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { for (int out = 0; out < num_outputs; out += 8) { __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); __m128 out_l = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) { av1_nn_propagate_4to8_sse3(&input_nodes[in], &layer_weights[out * num_inputs + in], &out_h, &out_l, num_inputs); } if (!output_layer) nn_activate8(&out_h, &out_l); _mm_storeu_ps(&output_nodes[out + 4], out_h); _mm_storeu_ps(&output_nodes[out], out_l); } } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) { __m128 outputs = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 8) { nn_propagate_8to4(&input_nodes[in], &layer_weights[out * num_inputs + in], &outputs, num_inputs); } if (!output_layer) nn_activate4(&outputs); _mm_storeu_ps(&output_nodes[out], outputs); } } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { for (int out = 0; out < num_outputs; out += 4) { __m128 outputs = _mm_loadu_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) { av1_nn_propagate_4to4_sse3(&input_nodes[in], &layer_weights[out * num_inputs + in], &outputs, num_inputs); } if (!output_layer) nn_activate4(&outputs); _mm_storeu_ps(&output_nodes[out], outputs); } } else if (num_inputs % 8 == 0) { for (int out = 0; out < num_outputs; out++) { __m128 total = _mm_load1_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 8) { nn_propagate_8to1(&input_nodes[in], &layer_weights[out * num_inputs + in], &total); } if (!output_layer) nn_activate4(&total); output_nodes[out] = _mm_cvtss_f32(total); } } else if (num_inputs % 4 == 0) { for (int out = 0; out < num_outputs; out++) { __m128 total = _mm_load1_ps(&layer_bias[out]); for (int in = 0; in < num_inputs; in += 4) { av1_nn_propagate_4to1_sse3( &input_nodes[in], &layer_weights[out * num_inputs + in], &total); } if (!output_layer) nn_activate4(&total); output_nodes[out] = _mm_cvtss_f32(total); } } else { // Use SSE instructions for scalar operations to avoid the latency of // swapping between SIMD and FPU modes. for (int out = 0; out < num_outputs; out++) { __m128 total = _mm_load1_ps(&layer_bias[out]); for (int in_node = 0; in_node < num_inputs; in_node++) { __m128 input = _mm_load1_ps(&input_nodes[in_node]); __m128 weight = _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); total = _mm_add_ps(total, _mm_mul_ps(input, weight)); } if (!output_layer) nn_activate4(&total); output_nodes[out] = _mm_cvtss_f32(total); } } input_nodes = output_nodes; num_inputs = num_outputs; buf_index = 1 - buf_index; } if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); } // Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential // Function. Neural Computation, 11(4):853–862, 1999. static inline __m128 approx_exp(__m128 y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. 
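/*
 * How the approximation works (Schraudolph, cited above): for an IEEE-754
 * single-precision float, forming the integer round(A * y) + B * 2^23 - C
 * and reinterpreting its bits as a float yields approximately e^y. A =
 * 2^23 / ln(2) rescales y into units of the exponent field, B = 127 is the
 * exponent bias, and C (below) is a correction that reduces the average
 * rounding error. approx_exp() follows this recipe, using _mm_cvtps_epi32
 * to round and _mm_castsi128_ps to reinterpret the bits.
 */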
#define C 60801 // Magic number controls the accuracy of approximation const __m128 multiplier = _mm_set1_ps(A); const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C); y = _mm_mul_ps(y, multiplier); y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset)); return y; #undef A #undef B #undef C } static inline __m128 reduce_max(__m128 reg) { __m128 tmp_reg; tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 reg = _mm_max_ps(reg, tmp_reg); tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 reg = _mm_max_ps(reg, tmp_reg); return reg; } static inline __m128 reduce_sum(__m128 reg) { __m128 tmp_reg; tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 reg = _mm_add_ps(reg, tmp_reg); tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 reg = _mm_add_ps(reg, tmp_reg); return reg; } void av1_nn_fast_softmax_16_sse3(const float *input, float *output) { // Clips at -10 to avoid underflowing const __m128 clipper = _mm_set1_ps(-10.0f); // Load in 16 values __m128 in_0 = _mm_loadu_ps(&input[0]); __m128 in_1 = _mm_loadu_ps(&input[4]); __m128 in_2 = _mm_loadu_ps(&input[8]); __m128 in_3 = _mm_loadu_ps(&input[12]); // Get the max __m128 max_0 = _mm_max_ps(in_0, in_1); __m128 max_1 = _mm_max_ps(in_2, in_3); max_0 = _mm_max_ps(max_0, max_1); max_0 = reduce_max(max_0); // Subtract the max off and clip in_0 = _mm_sub_ps(in_0, max_0); in_1 = _mm_sub_ps(in_1, max_0); in_2 = _mm_sub_ps(in_2, max_0); in_3 = _mm_sub_ps(in_3, max_0); in_0 = _mm_max_ps(in_0, clipper); in_1 = _mm_max_ps(in_1, clipper); in_2 = _mm_max_ps(in_2, clipper); in_3 = _mm_max_ps(in_3, clipper); // Exponentiate and compute the denominator __m128 sum = in_0 = approx_exp(in_0); in_1 = approx_exp(in_1); sum = _mm_add_ps(sum, in_1); in_2 = approx_exp(in_2); sum = _mm_add_ps(sum, in_2); in_3 = approx_exp(in_3); sum = _mm_add_ps(sum, in_3); sum = reduce_sum(sum); // Divide to get the probability in_0 = _mm_div_ps(in_0, sum); in_1 = _mm_div_ps(in_1, sum); in_2 = _mm_div_ps(in_2, sum); in_3 = _mm_div_ps(in_3, sum); _mm_storeu_ps(&output[0], in_0); _mm_storeu_ps(&output[4], in_1); _mm_storeu_ps(&output[8], in_2); _mm_storeu_ps(&output[12], in_3); } aom-3.12.1/av1/encoder/x86/ml_sse3.h000066400000000000000000000023621477627663500166550ustar00rootroot00000000000000/* * Copyright (c) 2023, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_ #define AOM_AV1_ENCODER_X86_ML_SSE3_H_ #include void av1_nn_propagate_4to1_sse3(const float *const inputs, const float *const weights, __m128 *const output); void av1_nn_propagate_4to4_sse3(const float *const inputs, const float *const weights, __m128 *const outputs, const int num_inputs); void av1_nn_propagate_4to8_sse3(const float *const inputs, const float *const weights, __m128 *const out_h, __m128 *const out_l, const int num_inputs); #endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_ aom-3.12.1/av1/encoder/x86/pickrst_avx2.c000066400000000000000000003162301477627663500177240ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // AVX2 #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom_dsp/x86/transpose_sse2.h" #include "config/av1_rtcd.h" #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" #if CONFIG_AV1_HIGHBITDEPTH static inline void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, const __m256i *shuffle, const __m256i *dgd_ijkl) { // Load two 128-bit chunks from dgd const __m256i s0 = _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)), _mm_loadu_si128((__m128i *)(dgd + 4)), 1); // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices) // The weird order is so the shuffle stays within 128-bit lanes // Shuffle 16x u16 values within lanes according to the mask: // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4] // (Actually we shuffle u8 values as there's no 16-bit shuffle) const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle); // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices) // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit // integers then horizontally add pairs of these integers resulting in 8x // 32-bit integers const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1); // d0 = [a b c d] [e f g h] as u32 // Take the lower-half of d0, extend to u64, add it on to dst (H) const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0)); // d0l = [a b] [c d] as u64 const __m256i dst0 = yy_load_256(dst); yy_store_256(dst, _mm256_add_epi64(d0l, dst0)); // Take the upper-half of d0, extend to u64, add it on to dst (H) const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1)); // d0h = [e f] [g h] as u64 const __m256i dst1 = yy_load_256(dst + 4); yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); } static inline void acc_stat_highbd_win7_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { int j, k, l; const int wiener_win = WIENER_WIN; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. 
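/*
 * Summary of the statistics gathered here (the scalar reference is
 * av1_compute_stats_highbd_c): sumX accumulates the source samples and
 * sumY[k][l] the degraded samples at each window offset, M_int[k][l]
 * accumulates source-times-degraded cross terms, and H_int accumulates
 * degraded-times-degraded terms between window taps. The caller later
 * removes the mean contribution and applies a bit-depth divider to form
 * the M vector and H matrix used when solving for the Wiener restoration
 * filter.
 */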
assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1]; *sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; // Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &dgd_ijkl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint16_t X1 = src[j]; *sumX += X1; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_highbd_avx2` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. 
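/*
 * Broadcasting (int)D1 puts D1 in the low 16 bits and 0 in the high 16
 * bits of every 32-bit lane, so the _mm256_madd_epi16 inside
 * acc_stat_highbd_avx2 sees the pixel pair (D1, 0) and the second pixel
 * of each pair contributes nothing, as described above.
 */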
const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &dgd_ijkl); } } } } static inline void compute_stats_highbd_win7_opt_avx2( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const uint16_t avg = find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; int32_t sumX = 0; const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i++) { acc_stat_highbd_win7_one_line_avx2( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); } } uint8_t bit_depth_divider = 1; if (bit_depth == AOM_BITS_12) bit_depth_divider = 16; else if (bit_depth == AOM_BITS_10) bit_depth_divider = 4; const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = (M_int[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / bit_depth_divider; int64_t *H_ = H + idx0 * wiener_win2; int64_t *H_int_ = &H_int[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = (H_int_[n * 8 + m] + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / bit_depth_divider; } } } } } static inline void acc_stat_highbd_win5_one_line_avx2( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m256i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { int j, k, l; const int wiener_win = WIENER_WIN_CHROMA; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. 
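/*
 * This is the 5x5 (chroma window) counterpart of
 * acc_stat_highbd_win7_one_line_avx2 above: the structure is identical,
 * with WIENER_WIN_CHROMA taps per row and five acc_stat_highbd_avx2 calls
 * per tap instead of seven.
 */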
assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1]; *sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; // Load two u16 values from dgd_ijkl combined as a u32, // then broadcast to 8x u32 slots of a 256 const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint16_t X1 = src[j]; *sumX += X1; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_highbd_avx2` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. 
const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); } } } } static inline void compute_stats_highbd_win5_opt_avx2( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN_CHROMA; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const uint16_t avg = find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; DECLARE_ALIGNED( 32, int64_t, H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int32_t sumX = 0; const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i++) { acc_stat_highbd_win5_one_line_avx2( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64); } } uint8_t bit_depth_divider = 1; if (bit_depth == AOM_BITS_12) bit_depth_divider = 16; else if (bit_depth == AOM_BITS_10) bit_depth_divider = 4; const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = (M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / bit_depth_divider; int64_t *H_ = H + idx0 * wiener_win2; int64_t *H_int_ = &H_int64[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = (H_int_[n * 8 + m] + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / bit_depth_divider; } } } } } void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { (void)dgd_avg; (void)src_avg; compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { (void)dgd_avg; (void)src_avg; compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) { *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd)); } static inline __m256i convert_and_add_avx2(__m256i src) { const __m256i s0 = 
_mm256_cvtepi32_epi64(_mm256_castsi256_si128(src)); const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1)); return _mm256_add_epi64(s0, s1); } static inline __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, __m256i *src2, __m256i *src3) { // 00 01 10 11 02 03 12 13 const __m256i s_0 = _mm256_hadd_epi32(src0, src1); // 20 21 30 31 22 23 32 33 const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3); // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33 const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1); return convert_and_add_avx2(s_2); } static inline __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { // 00 10 02 12 const __m256i t0 = _mm256_unpacklo_epi64(src0, src1); // 01 11 03 13 const __m256i t1 = _mm256_unpackhi_epi64(src0, src1); // 00+01 10+11 02+03 12+13 const __m256i sum = _mm256_add_epi64(t0, t1); // 00+01 10+11 const __m128i sum0 = _mm256_castsi256_si128(sum); // 02+03 12+13 const __m128i sum1 = _mm256_extracti128_si256(sum, 1); // 00+01+02+03 10+11+12+13 return _mm_add_epi64(sum0, sum1); } static inline __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { // 00 01 02 03 const __m256i s0 = convert_and_add_avx2(src0); // 10 11 12 13 const __m256i s1 = convert_and_add_avx2(src1); return add_64bit_lvl_avx2(s0, s1); } static inline int32_t calc_sum_of_register(__m256i src) { const __m128i src_l = _mm256_castsi256_si128(src); const __m128i src_h = _mm256_extracti128_si256(src, 1); const __m128i sum = _mm_add_epi32(src_l, src_h); const __m128i dst0 = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); const __m128i dst1 = _mm_add_epi32(dst0, _mm_srli_si128(dst0, 4)); return _mm_cvtsi128_si32(dst1); } static inline void transpose_64bit_4x4_avx2(const __m256i *const src, __m256i *const dst) { // Unpack 64 bit elements. Goes from: // src[0]: 00 01 02 03 // src[1]: 10 11 12 13 // src[2]: 20 21 22 23 // src[3]: 30 31 32 33 // to: // reg0: 00 10 02 12 // reg1: 20 30 22 32 // reg2: 01 11 03 13 // reg3: 21 31 23 33 const __m256i reg0 = _mm256_unpacklo_epi64(src[0], src[1]); const __m256i reg1 = _mm256_unpacklo_epi64(src[2], src[3]); const __m256i reg2 = _mm256_unpackhi_epi64(src[0], src[1]); const __m256i reg3 = _mm256_unpackhi_epi64(src[2], src[3]); // Unpack 64 bit elements resulting in: // dst[0]: 00 10 20 30 // dst[1]: 01 11 21 31 // dst[2]: 02 12 22 32 // dst[3]: 03 13 23 33 dst[0] = _mm256_inserti128_si256(reg0, _mm256_castsi256_si128(reg1), 1); dst[1] = _mm256_inserti128_si256(reg2, _mm256_castsi256_si128(reg3), 1); dst[2] = _mm256_inserti128_si256(reg1, _mm256_extracti128_si256(reg0, 1), 0); dst[3] = _mm256_inserti128_si256(reg3, _mm256_extracti128_si256(reg2, 1), 0); } // When we load 32 values of int8_t type and need less than 32 values for // processing, the below mask is used to make the extra values zero. static const int8_t mask_8bit[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes }; // When we load 16 values of int16_t type and need less than 16 values for // processing, the below mask is used to make the extra values zero. 
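// [Illustrative note, not part of the upstream source] The table below holds
// 16 entries of -1 (all bits set) followed by 16 entries of 0. Loading 16
// values starting at &mask_16bit[16 - n] therefore yields n leading -1 lanes
// and (16 - n) trailing 0 lanes, so ANDing a data vector with that load keeps
// the first n pixels and zeroes the rest. For example, if width = 40 then
// n = width & 15 = 8 and the tail load keeps exactly 8 valid pixels.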
static const int16_t mask_16bit[32] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes }; static inline uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, int32_t h_end, int32_t v_start, int32_t v_end, int32_t stride) { const uint8_t *src_temp = src + v_start * stride + h_start; const __m256i zero = _mm256_setzero_si256(); const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const int32_t wd_beyond_mul32 = width & 31; const int32_t wd_mul32 = width - wd_beyond_mul32; __m128i mask_low, mask_high; __m256i ss = zero; // When width is not multiple of 32, it still loads 32 and to make the data // which is extra (beyond required) as zero using the below mask. if (wd_beyond_mul32 >= 16) { mask_low = _mm_set1_epi8(-1); mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32])); } else { mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32])); mask_high = _mm_setzero_si128(); } const __m256i mask = _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1); int32_t proc_ht = 0; do { // Process width in multiple of 32. int32_t proc_wd = 0; while (proc_wd < wd_mul32) { const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); const __m256i sad_0 = _mm256_sad_epu8(s_0, zero); ss = _mm256_add_epi32(ss, sad_0); proc_wd += 32; } // Process the remaining width. if (wd_beyond_mul32) { const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); const __m256i s_m_0 = _mm256_and_si256(s_0, mask); const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero); ss = _mm256_add_epi32(ss, sad_0); } src_temp += stride; proc_ht++; } while (proc_ht < height); const uint32_t sum = calc_sum_of_register(ss); const uint8_t avg = sum / (width * height); return avg; } // Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not // 0, it writes (16 - n) more data than required. static inline void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, uint8_t avg, int32_t width, int32_t height, int16_t *dst, int32_t dst_stride, int use_downsampled_wiener_stats) { const __m256i avg_reg = _mm256_set1_epi16(avg); int32_t proc_ht = 0; do { int ds_factor = use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; if (use_downsampled_wiener_stats && (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) { ds_factor = height - proc_ht; } int32_t proc_wd = 0; while (proc_wd < width) { const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd)); const __m256i ss = _mm256_cvtepu8_epi16(s); const __m256i d = _mm256_sub_epi16(ss, avg_reg); _mm256_storeu_si256((__m256i *)(dst + proc_wd), d); proc_wd += 16; } src += ds_factor * src_stride; dst += ds_factor * dst_stride; proc_ht += ds_factor; } while (proc_ht < height); } // Fills lower-triangular elements of H buffer from upper triangular elements of // the same static inline void fill_lower_triag_elements_avx2(const int32_t wiener_win2, int64_t *const H) { for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { __m256i in[4], out[4]; in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1)); in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1)); in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1)); in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1)); transpose_64bit_4x4_avx2(in, out); _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i), _mm256_castsi256_si128(out[0])); _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i), _mm256_castsi256_si128(out[1])); _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]); _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]); for (int32_t j = i + 5; j < wiener_win2; j += 4) { in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j)); in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j)); in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j)); in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j)); transpose_64bit_4x4_avx2(in, out); _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i), out[0]); _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i), out[1]); _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i), out[2]); _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i), out[3]); } } } // Fill H buffer based on loop_count. #define INIT_H_VALUES(d, loop_count) \ for (int g = 0; g < (loop_count); g++) { \ const __m256i dgd0 = \ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \ } // Fill M & H buffer. #define INIT_MH_VALUES(d) \ for (int g = 0; g < wiener_win; g++) { \ const __m256i dgds_0 = \ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \ madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \ } // Update the dgd pointers appropriately. #define INITIALIZATION(wiener_window_sz) \ j = i / (wiener_window_sz); \ const int16_t *d_window = d + j; \ const int16_t *d_current_row = \ d + j + ((i % (wiener_window_sz)) * d_stride); \ int proc_ht = v_start; \ downsample_factor = \ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ __m256i sum_h[wiener_window_sz]; \ memset(sum_h, 0, sizeof(sum_h)); // Update the downsample factor appropriately. 
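// [Illustrative sketch, not part of the upstream source] When
// use_downsampled_wiener_stats is set, only every
// WIENER_STATS_DOWNSAMPLE_FACTOR'th row is visited and its contribution is
// weighted by the factor to stand in for the skipped rows; near the bottom of
// the region the factor is clamped to the rows actually left. Roughly, in
// scalar form (stats and contribution_of() are hypothetical placeholders):
//   int row = v_start;
//   while (row < v_end) {
//     const int df = use_downsampled_wiener_stats
//                        ? AOMMIN(WIENER_STATS_DOWNSAMPLE_FACTOR, v_end - row)
//                        : 1;
//     stats += df * contribution_of(row);
//     row += df;
//   }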
#define UPDATE_DOWNSAMPLE_FACTOR \ int proc_wd = 0; \ if (use_downsampled_wiener_stats && \ ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \ downsample_factor = v_end - proc_ht; \ } \ const __m256i df_reg = _mm256_set1_epi16(downsample_factor); #define CALCULATE_REMAINING_H_WIN5 \ while (j < wiener_win) { \ d_window = d; \ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ const __m256i zero = _mm256_setzero_si256(); \ sum_h[0] = zero; \ sum_h[1] = zero; \ sum_h[2] = zero; \ sum_h[3] = zero; \ sum_h[4] = zero; \ \ proc_ht = v_start; \ downsample_factor = \ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ do { \ UPDATE_DOWNSAMPLE_FACTOR; \ \ /* Process the amount of width multiple of 16.*/ \ while (proc_wd < wd_mul16) { \ const __m256i dgd = \ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ INIT_H_VALUES(d_window + j + proc_wd, 5) \ \ proc_wd += 16; \ }; \ \ /* Process the remaining width here. */ \ if (wd_beyond_mul16) { \ const __m256i dgd = \ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ INIT_H_VALUES(d_window + j + proc_wd, 5) \ } \ proc_ht += downsample_factor; \ d_window += downsample_factor * d_stride; \ d_current_row += downsample_factor * d_stride; \ } while (proc_ht < v_end); \ const __m256i s_h0 = \ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ s_h0); \ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \ _mm_storel_epi64( \ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \ j++; \ } #define CALCULATE_REMAINING_H_WIN7 \ while (j < wiener_win) { \ d_window = d; \ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ const __m256i zero = _mm256_setzero_si256(); \ sum_h[0] = zero; \ sum_h[1] = zero; \ sum_h[2] = zero; \ sum_h[3] = zero; \ sum_h[4] = zero; \ sum_h[5] = zero; \ sum_h[6] = zero; \ \ proc_ht = v_start; \ downsample_factor = \ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ do { \ UPDATE_DOWNSAMPLE_FACTOR; \ \ /* Process the amount of width multiple of 16.*/ \ while (proc_wd < wd_mul16) { \ const __m256i dgd = \ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ INIT_H_VALUES(d_window + j + proc_wd, 7) \ \ proc_wd += 16; \ }; \ \ /* Process the remaining width here. 
*/ \ if (wd_beyond_mul16) { \ const __m256i dgd = \ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ INIT_H_VALUES(d_window + j + proc_wd, 7) \ } \ proc_ht += downsample_factor; \ d_window += downsample_factor * d_stride; \ d_current_row += downsample_factor * d_stride; \ } while (proc_ht < v_end); \ const __m256i s_h1 = \ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ s_h1); \ const __m256i s_h2 = \ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \ _mm256_storeu_si256( \ (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \ j++; \ } // The buffers H(auto-covariance) and M(cross-correlation) are used to estimate // the filter tap values required for wiener filtering. Here, the buffer H is of // size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size // (wiener_window_size*wiener_window_size). H is a symmetric matrix where the // value above the diagonal (upper triangle) are equal to the values below the // diagonal (lower triangle). The calculation of elements/stats of H(upper // triangle) and M is done in steps as described below where each step fills // specific values of H and M. // Once the upper triangular elements of H matrix are derived, the same will be // copied to lower triangular using the function // fill_lower_triag_elements_avx2(). // Example: Wiener window size = // WIENER_WIN_CHROMA (5) M buffer = [M0 M1 M2 ---- M23 M24] H buffer = Hxy // (x-row, y-column) [H00 H01 H02 ---- H023 H024] [H10 H11 H12 ---- H123 H124] // [H30 H31 H32 ---- H323 H324] // [H40 H41 H42 ---- H423 H424] // [H50 H51 H52 ---- H523 H524] // [H60 H61 H62 ---- H623 H624] // || // || // [H230 H231 H232 ---- H2323 H2324] // [H240 H241 H242 ---- H2423 H2424] // In Step 1, whole M buffers (i.e., M0 to M24) and the first row of H (i.e., // H00 to H024) is filled. The remaining rows of H buffer are filled through // steps 2 to 6. static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride, const int16_t *const s, int32_t s_stride, int32_t width, int v_start, int v_end, int64_t *const M, int64_t *const H, int use_downsampled_wiener_stats) { const int32_t wiener_win = WIENER_WIN_CHROMA; const int32_t wiener_win2 = wiener_win * wiener_win; // Amount of width which is beyond multiple of 16. This case is handled // appropriately to process only the required width towards the end. const int32_t wd_mul16 = width & ~15; const int32_t wd_beyond_mul16 = width - wd_mul16; const __m256i mask = _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); int downsample_factor; // Step 1: Full M (i.e., M0 to M24) and first row H (i.e., H00 to H024) // values are filled here. Here, the loop over 'j' is executed for values 0 // to 4 (wiener_win-1). When the loop executed for a specific 'j', 5 values of // M and H are filled as shown below. // j=0: M0-M4 and H00-H04, j=1: M5-M9 and H05-H09 are filled etc,. int j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int proc_ht = v_start; do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. 
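// [Illustrative sketch, not part of the upstream source] Ignoring the 16-wide
// vectorization, the loop below accumulates, for the current column offset 'j'
// and each window row g in [0, wiener_win), over every column x of the row:
//   sum_m[g] += downsample_factor * s_t[x] * d_t[x + j + g * d_stride]
//   sum_h[g] += downsample_factor * d_t[x] * d_t[x + j + g * d_stride]
// which matches the set of products formed by INIT_MH_VALUES(). The five
// totals are then written to M[wiener_win * j + g] and H[wiener_win * j + g].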
while (proc_wd < wd_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_MH_VALUES(d_t + j + proc_wd) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mask = _mm256_and_si256(src, mask); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_MH_VALUES(d_t + j + proc_wd) } proc_ht += downsample_factor; s_t += downsample_factor * s_stride; d_t += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_m = hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]); _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m); _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h); _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h); } while (++j < wiener_win); // The below steps are designed to fill remaining rows of H buffer. Here, aim // is to fill only upper triangle elements correspond to each row and lower // triangle elements are copied from upper-triangle elements. Also, as // mentioned in Step 1, the core function is designed to fill 5 // elements/stats/values of H buffer. // // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6,etc, // are need not be filled. As the core function process 5 values, in first // iteration of 'j' only 4 values to be filled i.e., H11-H14 from row1,H66-H69 // from row6, etc. for (int i = 1; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN_CHROMA) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN5 } // Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from // row7, etc, are need not be filled. 
As the core function process 5 values, // in first iteration of 'j' only 3 values to be filled i.e., H22-H24 from // row2, H77-H79 from row7, etc. for (int i = 2; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN_CHROMA) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN5 } // Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from // row8, etc, are need not be filled. As the core function process 5 values, // in first iteration of 'j' only 2 values to be filled i.e., H33-H34 from // row3, H88-89 from row8, etc. for (int i = 3; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN_CHROMA) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN5 } // Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from // row9, etc, are need not be filled. As the core function process 5 values, // in first iteration of 'j' only 1 values to be filled i.e., H44 from row4, // H99 from row9, etc. for (int i = 4; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN_CHROMA) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. 
while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN5 } // Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill only // upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 from // row10,etc, are need not be filled. The first iteration of 'j' fills H55-H59 // from row5 and H1010-H1014 from row10, etc. for (int i = 5; i < wiener_win2; i += wiener_win) { // Derive j'th iteration from where the H buffer filling needs to be // started. j = i / wiener_win; int shift = 0; do { // Update the dgd pointers appropriately. int proc_ht = v_start; const int16_t *d_window = d + (i / wiener_win); const int16_t *d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + shift + proc_wd, 5) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + shift + proc_wd, 5) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), s_h); const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); _mm_storel_epi64( (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); shift++; } while (++j < wiener_win); } fill_lower_triag_elements_avx2(wiener_win2, H); } // The buffers H(auto-covariance) and M(cross-correlation) are used to estimate // the filter tap values required for wiener filtering. Here, the buffer H is of // size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size // (wiener_window_size*wiener_window_size). H is a symmetric matrix where the // value above the diagonal (upper triangle) are equal to the values below the // diagonal (lower triangle). The calculation of elements/stats of H(upper // triangle) and M is done in steps as described below where each step fills // specific values of H and M. 
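// [Illustrative note, not part of the upstream source] In scalar terms, with
// d = (dgd - avg) and s = (src - avg) as produced by sub_avg_block_avx2(), the
// statistics being assembled are
//   M[p]    = sum over pixels of d_p * s       (cross-correlation)
//   H[p][q] = sum over pixels of d_p * d_q     (auto-covariance)
// where d_p and d_q are the window samples at tap positions p and q around the
// current pixel, with p, q in [0, wiener_win2). The symmetry H[p][q] ==
// H[q][p] is what allows only the upper triangle to be computed directly.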
// Example: // Wiener window size = WIENER_WIN (7) // M buffer = [M0 M1 M2 ---- M47 M48] // H buffer = Hxy (x-row, y-column) // [H00 H01 H02 ---- H047 H048] // [H10 H11 H12 ---- H147 H148] // [H30 H31 H32 ---- H347 H348] // [H40 H41 H42 ---- H447 H448] // [H50 H51 H52 ---- H547 H548] // [H60 H61 H62 ---- H647 H648] // || // || // [H470 H471 H472 ---- H4747 H4748] // [H480 H481 H482 ---- H4847 H4848] // In Step 1, whole M buffers (i.e., M0 to M48) and the first row of H (i.e., // H00 to H048) is filled. The remaining rows of H buffer are filled through // steps 2 to 8. static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride, const int16_t *const s, int32_t s_stride, int32_t width, int v_start, int v_end, int64_t *const M, int64_t *const H, int use_downsampled_wiener_stats) { const int32_t wiener_win = WIENER_WIN; const int32_t wiener_win2 = wiener_win * wiener_win; // Amount of width which is beyond multiple of 16. This case is handled // appropriately to process only the required width towards the end. const int32_t wd_mul16 = width & ~15; const int32_t wd_beyond_mul16 = width - wd_mul16; const __m256i mask = _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); int downsample_factor; // Step 1: Full M (i.e., M0 to M48) and first row H (i.e., H00 to H048) // values are filled here. Here, the loop over 'j' is executed for values 0 // to 6. When the loop executed for a specific 'j', 7 values of M and H are // filled as shown below. // j=0: M0-M6 and H00-H06, j=1: M7-M13 and H07-H013 are filled etc,. int j = 0; do { const int16_t *s_t = s; const int16_t *d_t = d; __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() }; __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() }; downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int proc_ht = v_start; do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. 
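// [Illustrative note, not part of the upstream source] This Step 1 loop is the
// WIENER_WIN (7-tap) analogue of Step 1 in compute_stats_win5_avx2() above,
// with seven accumulators per 'j'. Because 7 is not a multiple of 4, the seven
// 64-bit results per 'j' are written back after the row loop as one 256-bit
// store (4 values), one 128-bit store (2 values) and one 64-bit store
// (1 value).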
while (proc_wd < wd_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_MH_VALUES(d_t + j + proc_wd) proc_wd += 16; } if (wd_beyond_mul16) { const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); const __m256i src_mask = _mm256_and_si256(src, mask); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_MH_VALUES(d_t + j + proc_wd) } proc_ht += downsample_factor; s_t += downsample_factor * s_stride; d_t += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_m0 = hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); const __m256i s_m1 = hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]); _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0); _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4), _mm256_castsi256_si128(s_m1)); _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6], _mm256_extracti128_si256(s_m1, 1)); const __m256i sh_0 = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); const __m256i sh_1 = hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0); _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4), _mm256_castsi256_si128(sh_1)); _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6], _mm256_extracti128_si256(sh_1, 1)); } while (++j < wiener_win); // The below steps are designed to fill remaining rows of H buffer. Here, aim // is to fill only upper triangle elements correspond to each row and lower // triangle elements are copied from upper-triangle elements. Also, as // mentioned in Step 1, the core function is designed to fill 7 // elements/stats/values of H buffer. // // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from // row8, etc. are need not be filled. As the core function process 7 values, // in first iteration of 'j' only 6 values to be filled i.e., H11-H16 from // row1 and H88-H813 from row8, etc. for (int i = 1; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) proc_wd += 16; } // Process the remaining width here. 
if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]); _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and // H97-H98 from row9, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 5 values to be filled // i.e., H22-H26 from row2 and H99-H913 from row9, etc. for (int i = 2; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we need // to fill only upper-triangle elements, H30-H32 from row3, H100-H106 and // H107-H109 from row10, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 4 values to be filled // i.e., H33-H36 from row3 and H1010-H1013 from row10, etc. for (int i = 3; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4) proc_wd += 16; } // Process the remaining width here. 
if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As we need // to fill only upper-triangle elements, H40-H43 from row4, H110-H116 and // H117-H1110 from row10, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 3 values to be filled // i.e., H44-H46 from row4 and H1111-H1113 from row11, etc. for (int i = 4; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As we need // to fill only upper-triangle elements, H50-H54 from row5, H120-H126 and // H127-H1211 from row12, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 2 values to be filled // i.e., H55-H56 from row5 and H1212-H1213 from row12, etc. for (int i = 5; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2) proc_wd += 16; } // Process the remaining width here. 
if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As we need // to fill only upper-triangle elements, H60-H65 from row6, H130-H136 and // H137-H1312 from row13, etc. are need not be filled. As the core function // process 7 values, in first iteration of 'j' only 1 value to be filled // i.e., H66 from row6 and H1313 from row13, etc. for (int i = 6; i < wiener_win2; i += wiener_win) { // Update the dgd pointers appropriately and also derive the 'j'th iteration // from where the H buffer filling needs to be started. INITIALIZATION(WIENER_WIN) do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1) proc_wd += 16; } // Process the remaining width here. if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i s_h = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h)); // process the remaining 'j' iterations. j++; CALCULATE_REMAINING_H_WIN7 } // Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As we need // to fill only upper-triangle elements, H70-H75 from row7, H140-H146 and // H147-H1413 from row14, etc. are need not be filled. The first iteration of // 'j' fills H77-H713 from row7 and H1414-H1420 from row14, etc. for (int i = 7; i < wiener_win2; i += wiener_win) { // Derive j'th iteration from where the H buffer filling needs to be // started. j = i / wiener_win; int shift = 0; do { // Update the dgd pointers appropriately. int proc_ht = v_start; const int16_t *d_window = d + (i / WIENER_WIN); const int16_t *d_current_row = d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride); downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() }; do { UPDATE_DOWNSAMPLE_FACTOR // Process the amount of width multiple of 16. while (proc_wd < wd_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); INIT_H_VALUES(d_window + shift + proc_wd, 7) proc_wd += 16; } // Process the remaining width here. 
if (wd_beyond_mul16) { const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); const __m256i dgd_mask = _mm256_and_si256(dgd, mask); const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); INIT_H_VALUES(d_window + shift + proc_wd, 7) } proc_ht += downsample_factor; d_window += downsample_factor * d_stride; d_current_row += downsample_factor * d_stride; } while (proc_ht < v_end); const __m256i sh_0 = hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); const __m256i sh_1 = hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), sh_0); _mm_storeu_si128( (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), _mm256_castsi256_si128(sh_1)); _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6], _mm256_extracti128_si256(sh_1, 1)); shift++; } while (++j < wiener_win); } fill_lower_triag_elements_avx2(wiener_win2, H); } void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) { // Currently, libaom supports Wiener filter processing with window sizes as // WIENER_WIN_CHROMA(5) and WIENER_WIN(7). For any other window size, SIMD // support is not facilitated. Hence, invoke C function for the same. av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, use_downsampled_wiener_stats); return; } const int32_t wiener_halfwin = wiener_win >> 1; const uint8_t avg = calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride); const int32_t width = h_end - h_start; const int32_t height = v_end - v_start; const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; const int32_t s_stride = (width + 15) & ~15; // Based on the sf 'use_downsampled_wiener_stats', process either once for // UPDATE_DOWNSAMPLE_FACTOR or for each row. sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg, width, height, src_avg, s_stride, use_downsampled_wiener_stats); // Compute (dgd-avg) buffer here which is used to fill H buffer. 
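// [Illustrative note, not part of the upstream source] The dgd_avg buffer
// covers the full restoration window, so it is (width + 2 * wiener_halfwin)
// wide and (height + 2 * wiener_halfwin) tall, with its stride rounded up to a
// multiple of 16 for the vector loads. For example, assuming width = 64 and a
// 7-tap window (wiener_halfwin = 3): d_stride = (64 + 6 + 15) & ~15 = 80.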
sub_avg_block_avx2( dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, dgd_avg, d_stride, 0); if (wiener_win == WIENER_WIN) { compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width, v_start, v_end, M, H, use_downsampled_wiener_stats); } else if (wiener_win == WIENER_WIN_CHROMA) { compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width, v_start, v_end, M, H, use_downsampled_wiener_stats); } } static inline __m256i pair_set_epi16(int a, int b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } int64_t av1_lowbd_pixel_proj_error_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int i, j, k; const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); __m256i sum64 = _mm256_setzero_si256(); const uint8_t *src = src8; const uint8_t *dat = dat8; int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); for (i = 0; i < height; ++i) { __m256i sum32 = _mm256_setzero_si256(); for (j = 0; j <= width - 16; j += 16) { const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); const __m256i flt0_16b = _mm256_permute4x64_epi64( _mm256_packs_epi32(yy_loadu_256(flt0 + j), yy_loadu_256(flt0 + j + 8)), 0xd8); const __m256i flt1_16b = _mm256_permute4x64_epi64( _mm256_packs_epi32(yy_loadu_256(flt1 + j), yy_loadu_256(flt1 + j + 8)), 0xd8); const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); const __m256i v0 = _mm256_madd_epi16( xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); const __m256i v1 = _mm256_madd_epi16( xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); const __m256i vr0 = _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); const __m256i vr1 = _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); const __m256i e0 = _mm256_sub_epi16( _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); const __m256i err0 = _mm256_madd_epi16(e0, e0); sum32 = _mm256_add_epi32(sum32, err0); } for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; const __m256i sum64_0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); const __m256i sum64_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64, sum64_0); sum64 = _mm256_add_epi64(sum64, sum64_1); } } else if (params->r[0] > 0 || params->r[1] > 0) { const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; const __m256i xq_coeff = pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; for (i = 0; i < height; ++i) { __m256i sum32 = _mm256_setzero_si256(); for (j = 0; j <= width - 16; j += 16) { const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); const __m256i flt_16b = _mm256_permute4x64_epi64( _mm256_packs_epi32(yy_loadu_256(flt + j), yy_loadu_256(flt + j + 8)), 0xd8); const __m256i v0 = _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0)); const __m256i v1 = _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0)); const __m256i vr0 = _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); const __m256i vr1 = _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); const __m256i e0 = _mm256_sub_epi16( _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); const __m256i err0 = _mm256_madd_epi16(e0, e0); sum32 = _mm256_add_epi32(sum32, err0); } for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_active * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt += flt_stride; const __m256i sum64_0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); const __m256i sum64_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64, sum64_0); sum64 = _mm256_add_epi64(sum64, sum64_1); } } else { __m256i sum32 = _mm256_setzero_si256(); for (i = 0; i < height; ++i) { for (j = 0; j <= width - 16; j += 16) { const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); const __m256i diff0 = _mm256_sub_epi16(d0, s0); const __m256i err0 = _mm256_madd_epi16(diff0, diff0); sum32 = _mm256_add_epi32(sum32, err0); } for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } const __m256i sum64_0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); const __m256i sum64_1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64_0, sum64_1); } int64_t sum[4]; yy_storeu_256(sum, sum64); err += sum[0] + sum[1] + sum[2] + sum[3]; return err; } // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of // C and H need to be computed. 
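// [Illustrative sketch, not part of the upstream source] Per pixel, with
// u = dat << SGRPROJ_RST_BITS and s = (src << SGRPROJ_RST_BITS) - u, the
// accumulation below amounts to
//   H[0][0] += (flt0 - u) * (flt0 - u);  H[0][1] += (flt0 - u) * (flt1 - u);
//   H[1][1] += (flt1 - u) * (flt1 - u);
//   C[0]    += (flt0 - u) * s;           C[1]    += (flt1 - u) * s;
// with every entry finally divided by size = width * height and H[1][0]
// copied from H[0][1] by symmetry.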
static inline void calc_proj_params_r0_r1_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m256i h00, h01, h11, c0, c1; const __m256i zero = _mm256_setzero_si256(); h01 = h11 = c0 = c1 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f1 = _mm256_sub_epi32(f1, d); f2 = _mm256_sub_epi32(f2, d); const __m256i h00_even = _mm256_mul_epi32(f1, f1); const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f1, 32)); h00 = _mm256_add_epi64(h00, h00_even); h00 = _mm256_add_epi64(h00, h00_odd); const __m256i h01_even = _mm256_mul_epi32(f1, f2); const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f2, 32)); h01 = _mm256_add_epi64(h01, h01_even); h01 = _mm256_add_epi64(h01, h01_odd); const __m256i h11_even = _mm256_mul_epi32(f2, f2); const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(f2, 32)); h11 = _mm256_add_epi64(h11, h11_even); h11 = _mm256_add_epi64(h11, h11_odd); const __m256i c0_even = _mm256_mul_epi32(f1, s); const __m256i c0_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); c0 = _mm256_add_epi64(c0, c0_even); c0 = _mm256_add_epi64(c0, c0_odd); const __m256i c1_even = _mm256_mul_epi32(f2, s); const __m256i c1_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); c1 = _mm256_add_epi64(c1, c1_even); c1 = _mm256_add_epi64(c1, c1_odd); } } __m256i c_low = _mm256_unpacklo_epi64(c0, c1); const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); c_low = _mm256_add_epi64(c_low, c_high); const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), _mm256_castsi256_si128(c_low)); __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); h0x_low = _mm256_add_epi64(h0x_low, h0x_high); const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), _mm256_castsi256_si128(h0x_low)); // Using the symmetric properties of H, calculations of H[1][0] are not // needed. __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); h1x_low = _mm256_add_epi64(h1x_low, h1x_high); const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), _mm256_castsi256_si128(h1x_low)); xx_storeu_128(C, c_128bit); xx_storeu_128(H[0], h0x_128bit); xx_storeu_128(H[1], h1x_128bit); H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; // Since H is a symmetric matrix H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. 
static inline void calc_proj_params_r0_avx2(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m256i h00, c0; const __m256i zero = _mm256_setzero_si256(); c0 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f1 = _mm256_sub_epi32(f1, d); const __m256i h00_even = _mm256_mul_epi32(f1, f1); const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f1, 32)); h00 = _mm256_add_epi64(h00, h00_even); h00 = _mm256_add_epi64(h00, h00_odd); const __m256i c0_even = _mm256_mul_epi32(f1, s); const __m256i c0_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); c0 = _mm256_add_epi64(c0, c0_even); c0 = _mm256_add_epi64(c0, c0_odd); } } const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), _mm256_castsi256_si128(h00)); const __m128i h00_val = _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), _mm256_castsi256_si128(c0)); const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); xx_storeu_128(C, c); xx_storeu_128(H[0], h0x); H[0][0] /= size; C[0] /= size; } // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. 
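// Mirror image of calc_proj_params_r0_avx2: only the flt1 terms are tracked,
// accumulating H[1][1] += f2 * f2 and C[1] += f2 * s per pixel before the
// final division by the pixel count.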
static inline void calc_proj_params_r1_avx2(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m256i h11, c1; const __m256i zero = _mm256_setzero_si256(); c1 = h11 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu8_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f2 = _mm256_sub_epi32(f2, d); const __m256i h11_even = _mm256_mul_epi32(f2, f2); const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(f2, 32)); h11 = _mm256_add_epi64(h11, h11_even); h11 = _mm256_add_epi64(h11, h11_odd); const __m256i c1_even = _mm256_mul_epi32(f2, s); const __m256i c1_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); c1 = _mm256_add_epi64(c1, c1_even); c1 = _mm256_add_epi64(c1, c1_odd); } } const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), _mm256_castsi256_si128(h11)); const __m128i h11_val = _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), _mm256_castsi256_si128(c1)); const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); xx_storeu_128(C, c); xx_storeu_128(H[1], h1x); H[1][1] /= size; C[1] /= size; } // AVX2 variant of av1_calc_proj_params_c. 
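// Dispatches to one of the three specialised kernels above depending on which
// of the two self-guided filter radii in params->r[] is non-zero; if both
// radii are zero, H and C are left unmodified.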
void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r0_r1_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m256i h00, h01, h11, c0, c1; const __m256i zero = _mm256_setzero_si256(); h01 = h11 = c0 = c1 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(src + i * src_stride + j))); __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f1 = _mm256_sub_epi32(f1, d); f2 = _mm256_sub_epi32(f2, d); const __m256i h00_even = _mm256_mul_epi32(f1, f1); const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f1, 32)); h00 = _mm256_add_epi64(h00, h00_even); h00 = _mm256_add_epi64(h00, h00_odd); const __m256i h01_even = _mm256_mul_epi32(f1, f2); const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f2, 32)); h01 = _mm256_add_epi64(h01, h01_even); h01 = _mm256_add_epi64(h01, h01_odd); const __m256i h11_even = _mm256_mul_epi32(f2, f2); const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(f2, 32)); h11 = _mm256_add_epi64(h11, h11_even); h11 = _mm256_add_epi64(h11, h11_odd); const __m256i c0_even = _mm256_mul_epi32(f1, s); const __m256i c0_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); c0 = _mm256_add_epi64(c0, c0_even); c0 = _mm256_add_epi64(c0, c0_odd); const __m256i c1_even = _mm256_mul_epi32(f2, s); const __m256i c1_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); c1 = _mm256_add_epi64(c1, c1_even); c1 = _mm256_add_epi64(c1, c1_odd); } } __m256i c_low = _mm256_unpacklo_epi64(c0, c1); const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); c_low = _mm256_add_epi64(c_low, c_high); const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), _mm256_castsi256_si128(c_low)); __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); h0x_low = _mm256_add_epi64(h0x_low, h0x_high); const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), _mm256_castsi256_si128(h0x_low)); // Using the symmetric properties of H, calculations of H[1][0] are not // needed. 
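// The zero register stands in for the unused H[1][0] accumulator so that the
// H[1] row can still be reduced and stored with a single 128-bit write; the
// real H[1][0] value is copied from H[0][1] after the divisions below.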
__m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); h1x_low = _mm256_add_epi64(h1x_low, h1x_high); const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), _mm256_castsi256_si128(h1x_low)); xx_storeu_128(C, c_128bit); xx_storeu_128(H[0], h0x_128bit); xx_storeu_128(H[1], h1x_128bit); H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; // Since H is a symmetric matrix H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } static inline void calc_proj_params_r0_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m256i h00, c0; const __m256i zero = _mm256_setzero_si256(); c0 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(src + i * src_stride + j))); __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f1 = _mm256_sub_epi32(f1, d); const __m256i h00_even = _mm256_mul_epi32(f1, f1); const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(f1, 32)); h00 = _mm256_add_epi64(h00, h00_even); h00 = _mm256_add_epi64(h00, h00_odd); const __m256i c0_even = _mm256_mul_epi32(f1, s); const __m256i c0_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); c0 = _mm256_add_epi64(c0, c0_even); c0 = _mm256_add_epi64(c0, c0_odd); } } const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), _mm256_castsi256_si128(h00)); const __m128i h00_val = _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), _mm256_castsi256_si128(c0)); const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); xx_storeu_128(C, c); xx_storeu_128(H[0], h0x); H[0][0] /= size; C[0] /= size; } static inline void calc_proj_params_r1_high_bd_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m256i h11, c1; const __m256i zero = _mm256_setzero_si256(); c1 = h11 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 8) { const __m256i u_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); const __m256i s_load = _mm256_cvtepu16_epi32( _mm_load_si128((__m128i *)(src + i * src_stride + j))); __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm256_sub_epi32(s, d); f2 = _mm256_sub_epi32(f2, d); const __m256i h11_even = _mm256_mul_epi32(f2, f2); const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 
32), _mm256_srli_epi64(f2, 32)); h11 = _mm256_add_epi64(h11, h11_even); h11 = _mm256_add_epi64(h11, h11_odd); const __m256i c1_even = _mm256_mul_epi32(f2, s); const __m256i c1_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); c1 = _mm256_add_epi64(c1, c1_even); c1 = _mm256_add_epi64(c1, c1_odd); } } const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), _mm256_castsi256_si128(h11)); const __m128i h11_val = _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), _mm256_castsi256_si128(c1)); const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); xx_storeu_128(C, c); xx_storeu_128(H[1], h1x); H[1][1] /= size; C[1] /= size; } // AVX2 variant of av1_calc_proj_params_high_bd_c. void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } int64_t av1_highbd_pixel_proj_error_avx2( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int i, j, k; const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); __m256i sum64 = _mm256_setzero_si256(); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled const __m256i xq0 = _mm256_set1_epi32(xq[0]); const __m256i xq1 = _mm256_set1_epi32(xq[1]); for (i = 0; i < height; ++i) { __m256i sum32 = _mm256_setzero_si256(); for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time // Load 16 pixels each from source image and corrupted image const __m256i s0 = yy_loadu_256(src + j); const __m256i d0 = yy_loadu_256(dat + j); // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices) // Shift-up each pixel to match filtered image scaling const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); // Split u0 into two halves and pad each from u16 to i32 const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0)); const __m256i u0h = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1)); // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 // Load 16 pixels from each filtered image const __m256i flt0l = yy_loadu_256(flt0 + j); const __m256i flt0h = yy_loadu_256(flt0 + j + 8); const __m256i flt1l = yy_loadu_256(flt1 + j); const __m256i flt1h = yy_loadu_256(flt1 + j + 8); // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 // Subtract shifted corrupt image from each filtered image const __m256i flt0l_subu = 
_mm256_sub_epi32(flt0l, u0l); const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h); const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l); const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h); // Multiply basis vectors by appropriate coefficients const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0); const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0); const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1); const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1); // Add together the contributions from the two basis vectors const __m256i vl = _mm256_add_epi32(v0l, v1l); const __m256i vh = _mm256_add_epi32(v0h, v1h); // Right-shift v with appropriate rounding const __m256i vrl = _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); const __m256i vrh = _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] // Saturate each i32 to an i16 then combine both halves // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes const __m256i vr = _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] // Add twin-subspace-sgr-filter to corrupt image then subtract source const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); // Calculate squared error and add adjacent values const __m256i err0 = _mm256_madd_epi16(e0, e0); sum32 = _mm256_add_epi32(sum32, err0); } const __m256i sum32l = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); sum64 = _mm256_add_epi64(sum64, sum32l); const __m256i sum32h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64, sum32h); // Process remaining pixels in this row (modulo 16) for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; } } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; const __m256i xq_active = _mm256_set1_epi32(xq_on); const __m256i xq_inactive = _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; for (i = 0; i < height; ++i) { __m256i sum32 = _mm256_setzero_si256(); for (j = 0; j <= width - 16; j += 16) { // Load 16 pixels from source image const __m256i s0 = yy_loadu_256(src + j); // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 // Load 16 pixels from corrupted image and pad each u16 to i32 const __m256i d0 = yy_loadu_256(dat + j); const __m256i d0h = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1)); const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0)); // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 // Load 16 pixels from the filtered image const __m256i flth = yy_loadu_256(flt + j + 8); const __m256i fltl = yy_loadu_256(flt + j); // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active); const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active); const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive); const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive); const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq); const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq); // Shift this down with appropriate rounding const __m256i vrh = _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); const __m256i vrl = _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 // Saturate each i32 to an i16 then combine both halves // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes const __m256i vr = _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16 // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 // Subtract twin-subspace-sgr filtered from source image to get error const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); // Calculate squared error and add adjacent values const __m256i err0 = _mm256_madd_epi16(e0, e0); sum32 = _mm256_add_epi32(sum32, err0); } const __m256i sum32l = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); sum64 = _mm256_add_epi64(sum64, sum32l); const __m256i sum32h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64, sum32h); // Process remaining pixels in this row (modulo 16) for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_on * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt += flt_stride; } } else { // Neither filter is enabled for (i = 0; i < height; ++i) { __m256i sum32 = _mm256_setzero_si256(); for (j = 0; j <= width - 32; j += 32) { // Load 2x16 u16 from source image const __m256i s0l = yy_loadu_256(src + j); const __m256i s0h = yy_loadu_256(src + j + 16); // Load 2x16 u16 from corrupted image const __m256i d0l = yy_loadu_256(dat + j); const __m256i d0h = yy_loadu_256(dat + j + 16); // Subtract corrupted image from source image const __m256i diffl = _mm256_sub_epi16(d0l, s0l); const __m256i diffh = _mm256_sub_epi16(d0h, s0h); // Square error and add adjacent values const __m256i err0l = _mm256_madd_epi16(diffl, diffl); const __m256i err0h = _mm256_madd_epi16(diffh, diffh); sum32 = _mm256_add_epi32(sum32, err0l); sum32 = _mm256_add_epi32(sum32, err0h); } const __m256i sum32l = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); sum64 = _mm256_add_epi64(sum64, sum32l); 
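// The upper 128 bits of the row accumulator are widened and folded into the
// running 64-bit total below, mirroring the low-half fold just above, so the
// 32-bit partial sums never span more than one row.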
const __m256i sum32h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); sum64 = _mm256_add_epi64(sum64, sum32h); // Process remaining pixels (modulu 16) for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } } // Sum 4 values from sum64l and sum64h into err int64_t sum[4]; yy_storeu_256(sum, sum64); err += sum[0] + sum[1] + sum[2] + sum[3]; return err; } #endif // CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/encoder/x86/pickrst_sse4.c000066400000000000000000001734561477627663500177350ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "config/av1_rtcd.h" #include "av1/common/restoration.h" #include "av1/encoder/pickrst.h" static inline void acc_stat_sse41(int32_t *dst, const uint8_t *src, const __m128i *shuffle, const __m128i *kl) { const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s)); const __m128i d1 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8))); const __m128i dst0 = xx_loadu_128(dst); const __m128i dst1 = xx_loadu_128(dst + 4); const __m128i r0 = _mm_add_epi32(dst0, d0); const __m128i r1 = _mm_add_epi32(dst1, d1); xx_storeu_128(dst, r0); xx_storeu_128(dst + 4, r1); } static inline void acc_stat_win7_one_line_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { const int wiener_win = 7; int j, k, l; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. 
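// h_end_even below is h_end rounded down to a multiple of two: the main loop
// consumes pixel pairs up to that point, and any leftover rightmost column is
// folded in by the has_odd_pixel block at the end of this function.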
assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint8_t *dgd_ij = dgd + j; const uint8_t X1 = src[j]; const uint8_t X2 = src[j + 1]; *sumX += X1 + X2; for (k = 0; k < wiener_win; k++) { const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int32_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint8_t D1 = dgd_ijk[l]; const uint8_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint8_t *dgd_ij = dgd + j; const uint8_t X1 = src[j]; *sumX += X1; for (k = 0; k < wiener_win; k++) { const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int32_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint8_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_sse41` function wants its input to have interleaved // copies of two pixels, but we only have one. However, the pixels // are (effectively) used as inputs to a multiply-accumulate. // So if we set the extra pixel slot to 0, then it is effectively // ignored. const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); } } } } static inline void compute_stats_win7_opt_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; int32_t sumX = 0; const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; int downsample_factor = use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int32_t sumX_row = 0; int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i = i + downsample_factor) { if (use_downsampled_wiener_stats && (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { downsample_factor = vert_end - i; } sumX_row = 0; memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8)); acc_stat_win7_one_line_sse4_1( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); sumX += sumX_row * downsample_factor; // Scale M matrix based on the downsampling factor for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) { sumY[k][l] += (sumY_row[k][l] * downsample_factor); M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); } } // Scale H matrix based on the downsampling factor for (k = 0; k < WIENER_WIN2; ++k) { for (l = 0; l < WIENER_WIN * 8; ++l) { H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); } } } for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) { M_int64[k][l] += M_int32[k][l]; M_int32[k][l] = 0; } } for (k = 0; k < WIENER_WIN2; ++k) { for (l = 0; l < WIENER_WIN * 8; ++l) { H_int64[k][l] += H_int32[k][l]; H_int32[k][l] = 0; } } } const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); int64_t *H_ = H + idx0 * wiener_win2; int64_t *H_int_ = &H_int64[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]); } } } } } #if CONFIG_AV1_HIGHBITDEPTH static inline void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, const __m128i *shuffle, const __m128i *dgd_ijkl) { // Load 256 bits from dgd in two chunks const __m128i s0l = xx_loadu_128(dgd); const __m128i s0h = xx_loadu_128(dgd + 4); // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices) // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices) // (Slightly strange order so we can apply the same shuffle to both halves) // Shuffle the u16 values in each half (actually using 8-bit shuffle mask) const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle); const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle); // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices) // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices) // Multiply s1 by dgd_ijkl resulting in 8x u32 values // Horizontally add pairs of u32 resulting in 4x u32 const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l); const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h); // dl = [d c b a] as u32 values // dh = [h g f e] as u32 values // Add these 8x u32 results on to dst in four parts const __m128i dll = _mm_cvtepu32_epi64(dl); const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8)); const __m128i dhl = _mm_cvtepu32_epi64(dh); const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8)); // dll = [b a] as u64 values, etc. 
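// Accumulate the four 64-bit pairs into dst with read-modify-write 128-bit
// loads and stores; dst holds running int64_t sums that later feed the H
// matrix.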
const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll); xx_storeu_128(dst, rll); const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh); xx_storeu_128(dst + 2, rlh); const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl); xx_storeu_128(dst + 4, rhl); const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh); xx_storeu_128(dst + 6, rhh); } static inline void acc_stat_highbd_win7_one_line_sse4_1( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { int j, k, l; const int wiener_win = WIENER_WIN; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1]; *sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; // Load two u16 values from dgd as a single u32 // Then broadcast to 4x u32 slots of a 128 const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [y x y x y x y x] as u16 acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &dgd_ijkl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint16_t X1 = src[j]; *sumX += X1; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_highbd_sse41` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. 
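// _mm_set1_epi32((int)D1) broadcasts the pair [D1, 0] into every 32-bit lane:
// the upper 16 bits of each lane are zero, so the second operand of every
// madd pair contributes nothing to the accumulated sums.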
const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &dgd_ijkl); } } } } static inline void compute_stats_highbd_win7_opt_sse4_1( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const uint16_t avg = find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; int32_t sumX = 0; const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; // Load just half of the 256-bit shuffle control used for the AVX2 version const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i++) { acc_stat_highbd_win7_one_line_sse4_1( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); } } uint8_t bit_depth_divider = 1; if (bit_depth == AOM_BITS_12) bit_depth_divider = 16; else if (bit_depth == AOM_BITS_10) bit_depth_divider = 4; const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = (M_int[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / bit_depth_divider; int64_t *H_ = H + idx0 * wiener_win2; int64_t *H_int_ = &H_int[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = (H_int_[n * 8 + m] + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / bit_depth_divider; } } } } } static inline void acc_stat_highbd_win5_one_line_sse4_1( const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { int j, k, l; const int wiener_win = WIENER_WIN_CHROMA; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. 
assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint16_t X1 = src[j]; const uint16_t X2 = src[j + 1]; *sumX += X1 + X2; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; const uint16_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; // Load two u16 values from dgd as a single u32 // then broadcast to 4x u32 slots of a 128 const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); // dgd_ijkl = [y x y x y x y x] as u16 acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint16_t X1 = src[j]; *sumX += X1; const uint16_t *dgd_ij = dgd + j; for (k = 0; k < wiener_win; k++) { const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int64_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint16_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_highbd_sse41` function wants its input to have // interleaved copies of two pixels, but we only have one. However, the // pixels are (effectively) used as inputs to a multiply-accumulate. So // if we set the extra pixel slot to 0, then it is effectively ignored. 
const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &dgd_ijkl); acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &dgd_ijkl); } } } } static inline void compute_stats_highbd_win5_opt_sse4_1( const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN_CHROMA; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); const uint16_t avg = find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int32_t sumX = 0; const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; // Load just half of the 256-bit shuffle control used for the AVX2 version const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i++) { acc_stat_highbd_win5_one_line_sse4_1( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); } } uint8_t bit_depth_divider = 1; if (bit_depth == AOM_BITS_12) bit_depth_divider = 16; else if (bit_depth == AOM_BITS_10) bit_depth_divider = 4; const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = (M_int[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / bit_depth_divider; int64_t *H_ = H + idx0 * wiener_win2; int64_t *H_int_ = &H_int[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = (H_int_[n * 8 + m] + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / bit_depth_divider; } } } } } void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth) { if (wiener_win == WIENER_WIN) { (void)dgd_avg; (void)src_avg; compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else if (wiener_win == WIENER_WIN_CHROMA) { (void)dgd_avg; (void)src_avg; compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } else { av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH static inline void acc_stat_win5_one_line_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int dgd_stride, const __m128i *shuffle, int32_t *sumX, int32_t 
sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { const int wiener_win = WIENER_WIN_CHROMA; int j, k, l; // Main loop handles two pixels at a time // We can assume that h_start is even, since it will always be aligned to // a tile edge + some number of restoration units, and both of those will // be 64-pixel aligned. // However, at the edge of the image, h_end may be odd, so we need to handle // that case correctly. assert(h_start % 2 == 0); const int h_end_even = h_end & ~1; const int has_odd_pixel = h_end & 1; for (j = h_start; j < h_end_even; j += 2) { const uint8_t *dgd_ij = dgd + j; const uint8_t X1 = src[j]; const uint8_t X2 = src[j + 1]; *sumX += X1 + X2; for (k = 0; k < wiener_win; k++) { const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int32_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint8_t D1 = dgd_ijk[l]; const uint8_t D2 = dgd_ijk[l + 1]; sumY[k][l] += D1 + D2; M_int[k][l] += D1 * X1 + D2 * X2; const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); } } } // If the width is odd, add in the final pixel if (has_odd_pixel) { const uint8_t *dgd_ij = dgd + j; const uint8_t X1 = src[j]; *sumX += X1; for (k = 0; k < wiener_win; k++) { const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; for (l = 0; l < wiener_win; l++) { int32_t *H_ = &H_int[(l * wiener_win + k)][0]; const uint8_t D1 = dgd_ijk[l]; sumY[k][l] += D1; M_int[k][l] += D1 * X1; // The `acc_stat_sse41` function wants its input to have interleaved // copies of two pixels, but we only have one. However, the pixels // are (effectively) used as inputs to a multiply-accumulate. // So if we set the extra pixel slot to 0, then it is effectively // ignored. 
const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); } } } } static inline void compute_stats_win5_opt_sse4_1( const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { int i, j, k, l, m, n; const int wiener_win = WIENER_WIN_CHROMA; const int pixel_count = (h_end - h_start) * (v_end - v_start); const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = (wiener_win >> 1); const uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; int32_t sumX = 0; const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; int downsample_factor = use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; int32_t sumX_row = 0; int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); for (j = v_start; j < v_end; j += 64) { const int vert_end = AOMMIN(64, v_end - j) + j; for (i = j; i < vert_end; i = i + downsample_factor) { if (use_downsampled_wiener_stats && (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { downsample_factor = vert_end - i; } sumX_row = 0; memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8)); acc_stat_win5_one_line_sse4_1( dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); sumX += sumX_row * downsample_factor; // Scale M matrix based on the downsampling factor for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) { sumY[k][l] += (sumY_row[k][l] * downsample_factor); M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); } } // Scale H matrix based on the downsampling factor for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); } } } for (k = 0; k < wiener_win; ++k) { for (l = 0; l < wiener_win; ++l) { M_int64[k][l] += M_int32[k][l]; M_int32[k][l] = 0; } } for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { H_int64[k][l] += H_int32[k][l]; H_int32[k][l] = 0; } } } const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; for (k = 0; k < wiener_win; k++) { for (l = 0; l < wiener_win; l++) { const int32_t idx0 = l * wiener_win + k; M[idx0] = M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); int64_t *H_ = H + idx0 * wiener_win2; 
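// Each H_int64[idx0] row stores its wiener_win x wiener_win block transposed
// with a row stride of 8; the H_int_[n * 8 + m] indexing below untransposes
// it while adding the mean-removal correction built from avg and sumY.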
int64_t *H_int_ = &H_int64[idx0][0]; for (m = 0; m < wiener_win; m++) { for (n = 0; n < wiener_win; n++) { H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]); } } } } } void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { if (wiener_win == WIENER_WIN) { compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, use_downsampled_wiener_stats); } else if (wiener_win == WIENER_WIN_CHROMA) { compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, use_downsampled_wiener_stats); } else { av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end, dgd_stride, src_stride, M, H, use_downsampled_wiener_stats); } } static inline __m128i pair_set_epi16(int a, int b) { return _mm_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16))); } int64_t av1_lowbd_pixel_proj_error_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int i, j, k; const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); __m128i sum64 = _mm_setzero_si128(); const uint8_t *src = src8; const uint8_t *dat = dat8; int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); for (i = 0; i < height; ++i) { __m128i sum32 = _mm_setzero_si128(); for (j = 0; j <= width - 8; j += 8) { const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); const __m128i flt0_16b = _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); const __m128i flt1_16b = _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); const __m128i v0 = _mm_madd_epi16( xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); const __m128i v1 = _mm_madd_epi16( xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); const __m128i err0 = _mm_madd_epi16(e0, e0); sum32 = _mm_add_epi32(sum32, err0); } for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64, sum64_0); sum64 = _mm_add_epi64(sum64, sum64_1); } } else if (params->r[0] > 0 || params->r[1] > 0) { const int xq_active = (params->r[0] > 0) ? 
xq[0] : xq[1]; const __m128i xq_coeff = pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; for (i = 0; i < height; ++i) { __m128i sum32 = _mm_setzero_si128(); for (j = 0; j <= width - 8; j += 8) { const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); const __m128i flt_16b = _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4)); const __m128i v0 = _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0)); const __m128i v1 = _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0)); const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); const __m128i err0 = _mm_madd_epi16(e0, e0); sum32 = _mm_add_epi32(sum32, err0); } for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_active * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt += flt_stride; const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64, sum64_0); sum64 = _mm_add_epi64(sum64, sum64_1); } } else { __m128i sum32 = _mm_setzero_si128(); for (i = 0; i < height; ++i) { for (j = 0; j <= width - 16; j += 16) { const __m128i d = xx_loadu_128(dat + j); const __m128i s = xx_loadu_128(src + j); const __m128i d0 = _mm_cvtepu8_epi16(d); const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); const __m128i s0 = _mm_cvtepu8_epi16(s); const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); const __m128i diff0 = _mm_sub_epi16(d0, s0); const __m128i diff1 = _mm_sub_epi16(d1, s1); const __m128i err0 = _mm_madd_epi16(diff0, diff0); const __m128i err1 = _mm_madd_epi16(diff1, diff1); sum32 = _mm_add_epi32(sum32, err0); sum32 = _mm_add_epi32(sum32, err1); } for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64_0, sum64_1); } int64_t sum[2]; xx_storeu_128(sum, sum64); err += sum[0] + sum[1]; return err; } // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of // C and H need to be computed. 
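// This SSE4.1 kernel performs the same H / C accumulation as
// calc_proj_params_r0_r1_avx2, but over four pixels (one 128-bit register)
// per inner-loop iteration instead of eight.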
static inline void calc_proj_params_r0_r1_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m128i h00, h01, h11, c0, c1; const __m128i zero = _mm_setzero_si128(); h01 = h11 = c0 = c1 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); const __m128i s_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f1 = _mm_sub_epi32(f1, d); f2 = _mm_sub_epi32(f2, d); const __m128i h00_even = _mm_mul_epi32(f1, f1); const __m128i h00_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); h00 = _mm_add_epi64(h00, h00_even); h00 = _mm_add_epi64(h00, h00_odd); const __m128i h01_even = _mm_mul_epi32(f1, f2); const __m128i h01_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); h01 = _mm_add_epi64(h01, h01_even); h01 = _mm_add_epi64(h01, h01_odd); const __m128i h11_even = _mm_mul_epi32(f2, f2); const __m128i h11_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); h11 = _mm_add_epi64(h11, h11_even); h11 = _mm_add_epi64(h11, h11_odd); const __m128i c0_even = _mm_mul_epi32(f1, s); const __m128i c0_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); c0 = _mm_add_epi64(c0, c0_even); c0 = _mm_add_epi64(c0, c0_odd); const __m128i c1_even = _mm_mul_epi32(f2, s); const __m128i c1_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); c1 = _mm_add_epi64(c1, c1_even); c1 = _mm_add_epi64(c1, c1_odd); } } __m128i c_low = _mm_unpacklo_epi64(c0, c1); const __m128i c_high = _mm_unpackhi_epi64(c0, c1); c_low = _mm_add_epi64(c_low, c_high); __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); h0x_low = _mm_add_epi64(h0x_low, h0x_high); // Using the symmetric properties of H, calculations of H[1][0] are not // needed. __m128i h1x_low = _mm_unpacklo_epi64(zero, h11); const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); h1x_low = _mm_add_epi64(h1x_low, h1x_high); xx_storeu_128(C, c_low); xx_storeu_128(H[0], h0x_low); xx_storeu_128(H[1], h1x_low); H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; // Since H is a symmetric matrix H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. 
static inline void calc_proj_params_r0_sse4_1(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m128i h00, c0; const __m128i zero = _mm_setzero_si128(); c0 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); const __m128i s_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f1 = _mm_sub_epi32(f1, d); const __m128i h00_even = _mm_mul_epi32(f1, f1); const __m128i h00_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); h00 = _mm_add_epi64(h00, h00_even); h00 = _mm_add_epi64(h00, h00_odd); const __m128i c0_even = _mm_mul_epi32(f1, s); const __m128i c0_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); c0 = _mm_add_epi64(c0, c0_even); c0 = _mm_add_epi64(c0, c0_odd); } } const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); const __m128i c = _mm_unpacklo_epi64(c0_val, zero); const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); xx_storeu_128(C, c); xx_storeu_128(H[0], h0x); H[0][0] /= size; C[0] /= size; } // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. static inline void calc_proj_params_r1_sse4_1(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint8_t *src = src8; const uint8_t *dat = dat8; __m128i h11, c1; const __m128i zero = _mm_setzero_si128(); c1 = h11 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); const __m128i s_load = _mm_cvtepu8_epi32( _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f2 = _mm_sub_epi32(f2, d); const __m128i h11_even = _mm_mul_epi32(f2, f2); const __m128i h11_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); h11 = _mm_add_epi64(h11, h11_even); h11 = _mm_add_epi64(h11, h11_odd); const __m128i c1_even = _mm_mul_epi32(f2, s); const __m128i c1_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); c1 = _mm_add_epi64(c1, c1_even); c1 = _mm_add_epi64(c1, c1_odd); } } const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); const __m128i c = _mm_unpacklo_epi64(zero, c1_val); const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); xx_storeu_128(C, c); xx_storeu_128(H[1], h1x); H[1][1] /= size; C[1] /= size; } // SSE4.1 variant of av1_calc_proj_params_c. 
void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } #if CONFIG_AV1_HIGHBITDEPTH static inline void calc_proj_params_r0_r1_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m128i h00, h01, h11, c0, c1; const __m128i zero = _mm_setzero_si128(); h01 = h11 = c0 = c1 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m128i s_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f1 = _mm_sub_epi32(f1, d); f2 = _mm_sub_epi32(f2, d); const __m128i h00_even = _mm_mul_epi32(f1, f1); const __m128i h00_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); h00 = _mm_add_epi64(h00, h00_even); h00 = _mm_add_epi64(h00, h00_odd); const __m128i h01_even = _mm_mul_epi32(f1, f2); const __m128i h01_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); h01 = _mm_add_epi64(h01, h01_even); h01 = _mm_add_epi64(h01, h01_odd); const __m128i h11_even = _mm_mul_epi32(f2, f2); const __m128i h11_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); h11 = _mm_add_epi64(h11, h11_even); h11 = _mm_add_epi64(h11, h11_odd); const __m128i c0_even = _mm_mul_epi32(f1, s); const __m128i c0_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); c0 = _mm_add_epi64(c0, c0_even); c0 = _mm_add_epi64(c0, c0_odd); const __m128i c1_even = _mm_mul_epi32(f2, s); const __m128i c1_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); c1 = _mm_add_epi64(c1, c1_even); c1 = _mm_add_epi64(c1, c1_odd); } } __m128i c_low = _mm_unpacklo_epi64(c0, c1); const __m128i c_high = _mm_unpackhi_epi64(c0, c1); c_low = _mm_add_epi64(c_low, c_high); __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); h0x_low = _mm_add_epi64(h0x_low, h0x_high); // Using the symmetric properties of H, calculations of H[1][0] are not // needed. 
__m128i h1x_low = _mm_unpacklo_epi64(zero, h11); const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); h1x_low = _mm_add_epi64(h1x_low, h1x_high); xx_storeu_128(C, c_low); xx_storeu_128(H[0], h0x_low); xx_storeu_128(H[1], h1x_low); H[0][0] /= size; H[0][1] /= size; H[1][1] /= size; // Since H is a symmetric matrix H[1][0] = H[0][1]; C[0] /= size; C[1] /= size; } // When only params->r[0] > 0. In this case only H[0][0] and C[0] are // non-zero and need to be computed. static inline void calc_proj_params_r0_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m128i h00, c0; const __m128i zero = _mm_setzero_si128(); c0 = h00 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m128i s_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f1 = _mm_sub_epi32(f1, d); const __m128i h00_even = _mm_mul_epi32(f1, f1); const __m128i h00_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); h00 = _mm_add_epi64(h00, h00_even); h00 = _mm_add_epi64(h00, h00_odd); const __m128i c0_even = _mm_mul_epi32(f1, s); const __m128i c0_odd = _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); c0 = _mm_add_epi64(c0, c0_even); c0 = _mm_add_epi64(c0, c0_odd); } } const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); const __m128i c = _mm_unpacklo_epi64(c0_val, zero); const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); xx_storeu_128(C, c); xx_storeu_128(H[0], h0x); H[0][0] /= size; C[0] /= size; } // When only params->r[1] > 0. In this case only H[1][1] and C[1] are // non-zero and need to be computed. 
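//
// Downstream, the accumulated H and C feed a small linear solve that yields
// the two projection coefficients xq[0] and xq[1] (in SGRPROJ_PRJ_BITS
// fixed-point precision).  The helper below is an illustrative sketch only --
// its name is invented here, and the encoder's actual solver works in integer
// arithmetic with additional guards -- but it shows the underlying 2x2
// Cramer's-rule solution of H * x = C.
static inline void solve_proj_params_sketch(const int64_t H[2][2],
                                            const int64_t C[2], int xq[2]) {
  const double det =
      (double)H[0][0] * (double)H[1][1] - (double)H[0][1] * (double)H[1][0];
  if (det == 0) {
    xq[0] = xq[1] = 0;
    return;
  }
  const double x0 =
      ((double)C[0] * (double)H[1][1] - (double)C[1] * (double)H[0][1]) / det;
  const double x1 =
      ((double)C[1] * (double)H[0][0] - (double)C[0] * (double)H[1][0]) / det;
  // Scale to the fixed-point domain used by xq[], rounding away from zero.
  xq[0] = (int)(x0 * (1 << SGRPROJ_PRJ_BITS) + (x0 < 0 ? -0.5 : 0.5));
  xq[1] = (int)(x1 * (1 << SGRPROJ_PRJ_BITS) + (x1 < 0 ? -0.5 : 0.5));
}
// The high-bit-depth kernel for the params->r[1]-only case follows.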
static inline void calc_proj_params_r1_high_bd_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { const int size = width * height; const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); __m128i h11, c1; const __m128i zero = _mm_setzero_si128(); c1 = h11 = zero; for (int i = 0; i < height; ++i) { for (int j = 0; j < width; j += 4) { const __m128i u_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); const __m128i s_load = _mm_cvtepu16_epi32( _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); s = _mm_sub_epi32(s, d); f2 = _mm_sub_epi32(f2, d); const __m128i h11_even = _mm_mul_epi32(f2, f2); const __m128i h11_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); h11 = _mm_add_epi64(h11, h11_even); h11 = _mm_add_epi64(h11, h11_odd); const __m128i c1_even = _mm_mul_epi32(f2, s); const __m128i c1_odd = _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); c1 = _mm_add_epi64(c1, c1_even); c1 = _mm_add_epi64(c1, c1_odd); } } const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); const __m128i c = _mm_unpacklo_epi64(zero, c1_val); const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); xx_storeu_128(C, c); xx_storeu_128(H[1], h1x); H[1][1] /= size; C[1] /= size; } // SSE4.1 variant of av1_calc_proj_params_high_bd_c. void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params) { if ((params->r[0] > 0) && (params->r[1] > 0)) { calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, flt1, flt1_stride, H, C); } else if (params->r[0] > 0) { calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt0, flt0_stride, H, C); } else if (params->r[1] > 0) { calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, dat_stride, flt1, flt1_stride, H, C); } } int64_t av1_highbd_pixel_proj_error_sse4_1( const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { int i, j, k; const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); __m128i sum64 = _mm_setzero_si128(); const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled const __m128i xq0 = _mm_set1_epi32(xq[0]); const __m128i xq1 = _mm_set1_epi32(xq[1]); for (i = 0; i < height; ++i) { __m128i sum32 = _mm_setzero_si128(); for (j = 0; j <= width - 8; j += 8) { // Load 8x pixels from source image const __m128i s0 = xx_loadu_128(src + j); // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[]) // Load 8x pixels from corrupted image const __m128i d0 = xx_loadu_128(dat + j); // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[]) // Shift each pixel value up by SGRPROJ_RST_BITS 
const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); // Split u0 into two halves and pad each from u16 to i32 const __m128i u0l = _mm_cvtepu16_epi32(u0); const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8)); // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices // Load 8 pixels from first and second filtered images const __m128i flt0l = xx_loadu_128(flt0 + j); const __m128i flt0h = xx_loadu_128(flt0 + j + 4); const __m128i flt1l = xx_loadu_128(flt1 + j); const __m128i flt1h = xx_loadu_128(flt1 + j + 4); // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j) // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j) // Subtract shifted corrupt image from each filtered image // This gives our two basis vectors for the projection const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l); const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h); const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l); const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h); // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32 // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32 // Multiply each basis vector by the corresponding coefficient const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0); const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0); const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1); const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1); // Add together the contribution from each scaled basis vector const __m128i vl = _mm_add_epi32(v0l, v1l); const __m128i vh = _mm_add_epi32(v0h, v1h); // Right-shift v with appropriate rounding const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); // Saturate each i32 value to i16 and combine lower and upper halves const __m128i vr = _mm_packs_epi32(vrl, vrh); // Add twin-subspace-sgr-filter to corrupt image then subtract source const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); // Calculate squared error and add adjacent values const __m128i err0 = _mm_madd_epi16(e0, e0); sum32 = _mm_add_epi32(sum32, err0); } const __m128i sum32l = _mm_cvtepu32_epi64(sum32); sum64 = _mm_add_epi64(sum64, sum32l); const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64, sum32h); // Process remaining pixels in this row (modulo 8) for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; } } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; const __m128i xq_active = _mm_set1_epi32(xq_on); const __m128i xq_inactive = _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; for (i = 0; i < height; ++i) { __m128i sum32 = _mm_setzero_si128(); for (j = 0; j <= width - 8; j += 8) { // Load 8x pixels from source image const __m128i s0 = xx_loadu_128(src + j); // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[]) // Load 8x pixels from corrupted image and pad each u16 to i32 const __m128i d0 = xx_loadu_128(dat + j); const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8)); const __m128i d0l = _mm_cvtepu16_epi32(d0); // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[]) // Load 8 pixels from the filtered image const __m128i flth = xx_loadu_128(flt + j + 4); const __m128i fltl = xx_loadu_128(flt + j); // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j) const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active); const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active); const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive); const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive); const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq); const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq); // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ] // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ] // Shift this down with appropriate rounding const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); // Saturate vr0 and vr1 from i32 to i16 then pack together const __m128i vr = _mm_packs_epi32(vrl, vrh); // Subtract twin-subspace-sgr filtered from source image to get error const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); // Calculate squared error and add adjacent values const __m128i err0 = _mm_madd_epi16(e0, e0); sum32 = _mm_add_epi32(sum32, err0); } const __m128i sum32l = _mm_cvtepu32_epi64(sum32); sum64 = _mm_add_epi64(sum64, sum32l); const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64, sum32h); // Process remaining pixels in this row (modulo 8) for (k = j; k < width; ++k) { const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_on * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; flt += flt_stride; } } else { // Neither filter is enabled for (i = 0; i < height; ++i) { __m128i sum32 = _mm_setzero_si128(); for (j = 0; j <= width - 16; j += 16) { // Load 2x8 u16 from source image const __m128i s0 = xx_loadu_128(src + j); const __m128i s1 = xx_loadu_128(src + j + 8); // Load 2x8 u16 from corrupted image const __m128i d0 = xx_loadu_128(dat + j); const __m128i d1 = xx_loadu_128(dat + j + 8); // Subtract corrupted image from source image const __m128i diff0 = _mm_sub_epi16(d0, s0); const __m128i diff1 = _mm_sub_epi16(d1, s1); // Square error and add adjacent values const __m128i err0 = _mm_madd_epi16(diff0, diff0); const __m128i err1 = _mm_madd_epi16(diff1, diff1); sum32 = _mm_add_epi32(sum32, err0); sum32 = _mm_add_epi32(sum32, err1); } const __m128i sum32l = _mm_cvtepu32_epi64(sum32); sum64 = _mm_add_epi64(sum64, sum32l); const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); sum64 = _mm_add_epi64(sum64, sum32h); // Process remaining pixels (modulu 8) for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; } } // Sum 4 values from sum64l and sum64h into err int64_t sum[2]; xx_storeu_128(sum, sum64); err += sum[0] + sum[1]; return err; } #endif // 
CONFIG_AV1_HIGHBITDEPTH aom-3.12.1/av1/encoder/x86/rdopt_avx2.c000066400000000000000000000230071477627663500173720ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "config/av1_rtcd.h" #include "av1/encoder/rdopt.h" // Process horizontal and vertical correlations in a 4x4 block of pixels. // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. static inline void horver_correlation_4x4(const int16_t *diff, int stride, __m256i *xy_sum_32, __m256i *xz_sum_32, __m256i *x_sum_32, __m256i *x2_sum_32) { // Pixels in this 4x4 [ a b c d ] // are referred to as: [ e f g h ] // [ i j k l ] // [ m n o p ] const __m256i pixels = _mm256_set_epi64x( loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]), loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride])); // pixels = [d c b a h g f e] [l k j i p o n m] as i16 const __m256i slli = _mm256_slli_epi64(pixels, 16); // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16 const __m256i madd_xy = _mm256_madd_epi16(pixels, slli); // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32 *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy); // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90 const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90); // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16 const __m256i madd_xz = _mm256_madd_epi16(slli, perm); // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32 *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz); // Sum every element in slli (and then also their squares) const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1)); // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32 *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli); const __m256i madd_slli = _mm256_madd_epi16(slli, slli); // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32 *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli); } void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride, int width, int height, float *hcorr, float *vcorr) { // The following notation is used: // x - current pixel // y - right neighbour pixel // z - below neighbour pixel // w - down-right neighbour pixel int64_t xy_sum = 0, xz_sum = 0; int64_t x_sum = 0, x2_sum = 0; // Process horizontal and vertical correlations through the body in 4x4 // blocks. 
This excludes the final row and column and possibly one extra // column depending how 3 divides into width and height int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 }; __m256i xy_sum_32 = _mm256_setzero_si256(); __m256i xz_sum_32 = _mm256_setzero_si256(); __m256i x_sum_32 = _mm256_setzero_si256(); __m256i x2_sum_32 = _mm256_setzero_si256(); for (int i = 0; i <= height - 4; i += 3) { for (int j = 0; j <= width - 4; j += 3) { horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, &xz_sum_32, &x_sum_32, &x2_sum_32); } const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32); // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh] // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32 yy_storeu_256(xy_xz_tmp, hadd_xy_xz); xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1]; xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3]; const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32); // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g] // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32 yy_storeu_256(x_x2_tmp, hadd_x_x2); x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1]; x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3]; xy_sum_32 = _mm256_setzero_si256(); xz_sum_32 = _mm256_setzero_si256(); x_sum_32 = _mm256_setzero_si256(); x2_sum_32 = _mm256_setzero_si256(); } // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; // Do we have 2 rows remaining or just the one? Note that width and height // are powers of 2, so each modulo 3 must be 1 or 2. if (height % 3 == 1) { // Just horiz corrs on the final row const int16_t x0 = diff[(height - 1) * stride]; x_sum += x0; x_finalrow += x0; x2_sum += x0 * x0; x2_finalrow += x0 * x0; for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 1) * stride + j]; const int16_t y = diff[(height - 1) * stride + j + 1]; xy_sum += x * y; x_sum += y; x2_sum += y * y; x_finalrow += y; x2_finalrow += y * y; } } else { // Two rows remaining to do const int16_t x0 = diff[(height - 2) * stride]; const int16_t z0 = diff[(height - 1) * stride]; x_sum += x0 + z0; x2_sum += x0 * x0 + z0 * z0; x_finalrow += z0; x2_finalrow += z0 * z0; for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 2) * stride + j]; const int16_t y = diff[(height - 2) * stride + j + 1]; const int16_t z = diff[(height - 1) * stride + j]; const int16_t w = diff[(height - 1) * stride + j + 1]; // Horizontal and vertical correlations for the penultimate row: xy_sum += x * y; xz_sum += x * z; // Now just horizontal correlations for the final row: xy_sum += z * w; x_sum += y + w; x2_sum += y * y + w * w; x_finalrow += w; x2_finalrow += w * w; } } // Do we have 2 columns remaining or just the one? if (width % 3 == 1) { // Just vert corrs on the final col const int16_t x0 = diff[width - 1]; x_sum += x0; x_finalcol += x0; x2_sum += x0 * x0; x2_finalcol += x0 * x0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 1]; xz_sum += x * z; x_finalcol += z; x2_finalcol += z * z; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 
2 : 3)) { x_sum += z; x2_sum += z * z; } } } else { // Two cols remaining const int16_t x0 = diff[width - 2]; const int16_t y0 = diff[width - 1]; x_sum += x0 + y0; x2_sum += x0 * x0 + y0 * y0; x_finalcol += y0; x2_finalcol += y0 * y0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 2]; const int16_t y = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 2]; const int16_t w = diff[(i + 1) * stride + width - 1]; // Horizontal and vertical correlations for the penultimate col: // Skip these on the last iteration of this loop if we also had two // rows remaining, otherwise the final horizontal and vertical correlation // get erroneously processed twice if (i < height - 2 || height % 3 == 1) { xy_sum += x * y; xz_sum += x * z; } x_finalcol += w; x2_finalcol += w * w; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 2 : 3)) { x_sum += z + w; x2_sum += z * z + w * w; } // Now just vertical correlations for the final column: xz_sum += y * w; } } // Calculate the simple sums and squared-sums int64_t x_firstrow = 0, x_firstcol = 0; int64_t x2_firstrow = 0, x2_firstcol = 0; for (int j = 0; j < width; ++j) { x_firstrow += diff[j]; x2_firstrow += diff[j] * diff[j]; } for (int i = 0; i < height; ++i) { x_firstcol += diff[i * stride]; x2_firstcol += diff[i * stride] * diff[i * stride]; } int64_t xhor_sum = x_sum - x_finalcol; int64_t xver_sum = x_sum - x_finalrow; int64_t y_sum = x_sum - x_firstcol; int64_t z_sum = x_sum - x_firstrow; int64_t x2hor_sum = x2_sum - x2_finalcol; int64_t x2ver_sum = x2_sum - x2_finalrow; int64_t y2_sum = x2_sum - x2_firstcol; int64_t z2_sum = x2_sum - x2_firstrow; const float num_hor = (float)(height * (width - 1)); const float num_ver = (float)((height - 1) * width); const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; if (xhor_var_n > 0 && y_var_n > 0) { *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); *hcorr = *hcorr < 0 ? 0 : *hcorr; } else { *hcorr = 1.0; } if (xver_var_n > 0 && z_var_n > 0) { *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); *vcorr = *vcorr < 0 ? 0 : *vcorr; } else { *vcorr = 1.0; } } aom-3.12.1/av1/encoder/x86/rdopt_sse4.c000066400000000000000000000237321477627663500173750ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/x86/synonyms.h" #include "config/av1_rtcd.h" #include "av1/encoder/rdopt.h" // Process horizontal and vertical correlations in a 4x4 block of pixels. // We actually use the 4x4 pixels to calculate correlations corresponding to // the top-left 3x3 pixels, so this function must be called with 1x1 overlap, // moving the window along/down by 3 pixels at a time. 
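//
// As a point of reference (illustrative only, not part of the library): the
// quantity that this file and rdopt_avx2.c compute is the first-order Pearson
// correlation of the residual with its right neighbour (hcorr) and with its
// neighbour below (vcorr).  A direct scalar evaluation of the same statistic,
// using the hypothetical helper below, makes the bookkeeping in the SIMD
// version easier to follow.
static inline void horver_correlation_scalar_sketch(const int16_t *diff,
                                                    int stride, int width,
                                                    int height, float *hcorr,
                                                    float *vcorr) {
  double xy = 0, xz = 0, x_h = 0, y_sum = 0, x_v = 0, z_sum = 0;
  double x2_h = 0, y2 = 0, x2_v = 0, z2 = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const double x = diff[i * stride + j];
      if (j + 1 < width) {  // Pair (x, right neighbour y).
        const double y = diff[i * stride + j + 1];
        xy += x * y;
        x_h += x;
        y_sum += y;
        x2_h += x * x;
        y2 += y * y;
      }
      if (i + 1 < height) {  // Pair (x, below neighbour z).
        const double z = diff[(i + 1) * stride + j];
        xz += x * z;
        x_v += x;
        z_sum += z;
        x2_v += x * x;
        z2 += z * z;
      }
    }
  }
  const double n_h = (double)height * (width - 1);
  const double n_v = (double)(height - 1) * width;
  const double xh_var = x2_h - x_h * x_h / n_h;
  const double y_var = y2 - y_sum * y_sum / n_h;
  const double xv_var = x2_v - x_v * x_v / n_v;
  const double z_var = z2 - z_sum * z_sum / n_v;
  if (xh_var > 0 && y_var > 0) {
    const double c = (xy - x_h * y_sum / n_h) / sqrt(xh_var * y_var);
    *hcorr = (float)(c < 0 ? 0 : c);
  } else {
    *hcorr = 1.0f;
  }
  if (xv_var > 0 && z_var > 0) {
    const double c = (xz - x_v * z_sum / n_v) / sqrt(xv_var * z_var);
    *vcorr = (float)(c < 0 ? 0 : c);
  } else {
    *vcorr = 1.0f;
  }
}
// The SSE4.1 kernel below gathers the same sums for a 4x4 window, advancing
// three pixels at a time as described above.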
static inline void horver_correlation_4x4(const int16_t *diff, int stride, __m128i *xy_sum_32, __m128i *xz_sum_32, __m128i *x_sum_32, __m128i *x2_sum_32) { // Pixels in this 4x4 [ a b c d ] // are referred to as: [ e f g h ] // [ i j k l ] // [ m n o p ] const __m128i pixelsa = xx_loadu_2x64(&diff[0 * stride], &diff[2 * stride]); const __m128i pixelsb = xx_loadu_2x64(&diff[1 * stride], &diff[3 * stride]); // pixelsa = [d c b a l k j i] as i16 // pixelsb = [h g f e p o n m] as i16 const __m128i slli_a = _mm_slli_epi64(pixelsa, 16); const __m128i slli_b = _mm_slli_epi64(pixelsb, 16); // slli_a = [c b a 0 k j i 0] as i16 // slli_b = [g f e 0 o n m 0] as i16 const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a); const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b); // xy_madd_a = [bc+cd ab jk+kl ij] as i32 // xy_madd_b = [fg+gh ef no+op mn] as i32 const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a); // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32 *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32); const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b); // xz_madd_a = [bf+cg ae jn+ko im] i32 const __m128i swap_b = _mm_srli_si128(slli_b, 8); // swap_b = [0 0 0 0 g f e 0] as i16 const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b); // xz_madd_b = [0 0 gk+fj ei] i32 const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a); // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32 *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32); // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k // (sum up every element in slli_a and swap_b) const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a); const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a); // sum_slli_a32 = [c+b a k+j i] as i32 const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b); // swap_b32 = [g f e 0] as i32 *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32); *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32); // sum = [c+b+g a+f k+j+e i] as i32 // Also sum their squares const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a); const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b); // slli_a_2 = [c2+b2 a2 k2+j2 i2] // swap_b_2 = [0 0 g2+f2 e2] const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2); // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2] *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2); } void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride, int width, int height, float *hcorr, float *vcorr) { // The following notation is used: // x - current pixel // y - right neighbour pixel // z - below neighbour pixel // w - down-right neighbour pixel int64_t xy_sum = 0, xz_sum = 0; int64_t x_sum = 0, x2_sum = 0; // Process horizontal and vertical correlations through the body in 4x4 // blocks. 
This excludes the final row and column and possibly one extra // column depending how 3 divides into width and height int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 }; int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 }; __m128i xy_sum_32 = _mm_setzero_si128(); __m128i xz_sum_32 = _mm_setzero_si128(); __m128i x_sum_32 = _mm_setzero_si128(); __m128i x2_sum_32 = _mm_setzero_si128(); for (int i = 0; i <= height - 4; i += 3) { for (int j = 0; j <= width - 4; j += 3) { horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, &xz_sum_32, &x_sum_32, &x2_sum_32); } xx_storeu_128(xy_tmp, xy_sum_32); xx_storeu_128(xz_tmp, xz_sum_32); xx_storeu_128(x_tmp, x_sum_32); xx_storeu_128(x2_tmp, x2_sum_32); xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1]; xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0]; x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0]; x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0]; xy_sum_32 = _mm_setzero_si128(); xz_sum_32 = _mm_setzero_si128(); x_sum_32 = _mm_setzero_si128(); x2_sum_32 = _mm_setzero_si128(); } // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; // Do we have 2 rows remaining or just the one? Note that width and height // are powers of 2, so each modulo 3 must be 1 or 2. if (height % 3 == 1) { // Just horiz corrs on the final row const int16_t x0 = diff[(height - 1) * stride]; x_sum += x0; x_finalrow += x0; x2_sum += x0 * x0; x2_finalrow += x0 * x0; for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 1) * stride + j]; const int16_t y = diff[(height - 1) * stride + j + 1]; xy_sum += x * y; x_sum += y; x2_sum += y * y; x_finalrow += y; x2_finalrow += y * y; } } else { // Two rows remaining to do const int16_t x0 = diff[(height - 2) * stride]; const int16_t z0 = diff[(height - 1) * stride]; x_sum += x0 + z0; x2_sum += x0 * x0 + z0 * z0; x_finalrow += z0; x2_finalrow += z0 * z0; for (int j = 0; j < width - 1; ++j) { const int16_t x = diff[(height - 2) * stride + j]; const int16_t y = diff[(height - 2) * stride + j + 1]; const int16_t z = diff[(height - 1) * stride + j]; const int16_t w = diff[(height - 1) * stride + j + 1]; // Horizontal and vertical correlations for the penultimate row: xy_sum += x * y; xz_sum += x * z; // Now just horizontal correlations for the final row: xy_sum += z * w; x_sum += y + w; x2_sum += y * y + w * w; x_finalrow += w; x2_finalrow += w * w; } } // Do we have 2 columns remaining or just the one? if (width % 3 == 1) { // Just vert corrs on the final col const int16_t x0 = diff[width - 1]; x_sum += x0; x_finalcol += x0; x2_sum += x0 * x0; x2_finalcol += x0 * x0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 1]; xz_sum += x * z; x_finalcol += z; x2_finalcol += z * z; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 
2 : 3)) { x_sum += z; x2_sum += z * z; } } } else { // Two cols remaining const int16_t x0 = diff[width - 2]; const int16_t y0 = diff[width - 1]; x_sum += x0 + y0; x2_sum += x0 * x0 + y0 * y0; x_finalcol += y0; x2_finalcol += y0 * y0; for (int i = 0; i < height - 1; ++i) { const int16_t x = diff[i * stride + width - 2]; const int16_t y = diff[i * stride + width - 1]; const int16_t z = diff[(i + 1) * stride + width - 2]; const int16_t w = diff[(i + 1) * stride + width - 1]; // Horizontal and vertical correlations for the penultimate col: // Skip these on the last iteration of this loop if we also had two // rows remaining, otherwise the final horizontal and vertical correlation // get erroneously processed twice if (i < height - 2 || height % 3 == 1) { xy_sum += x * y; xz_sum += x * z; } x_finalcol += w; x2_finalcol += w * w; // So the bottom-right elements don't get counted twice: if (i < height - (height % 3 == 1 ? 2 : 3)) { x_sum += z + w; x2_sum += z * z + w * w; } // Now just vertical correlations for the final column: xz_sum += y * w; } } // Calculate the simple sums and squared-sums int64_t x_firstrow = 0, x_firstcol = 0; int64_t x2_firstrow = 0, x2_firstcol = 0; for (int j = 0; j < width; ++j) { x_firstrow += diff[j]; x2_firstrow += diff[j] * diff[j]; } for (int i = 0; i < height; ++i) { x_firstcol += diff[i * stride]; x2_firstcol += diff[i * stride] * diff[i * stride]; } int64_t xhor_sum = x_sum - x_finalcol; int64_t xver_sum = x_sum - x_finalrow; int64_t y_sum = x_sum - x_firstcol; int64_t z_sum = x_sum - x_firstrow; int64_t x2hor_sum = x2_sum - x2_finalcol; int64_t x2ver_sum = x2_sum - x2_finalrow; int64_t y2_sum = x2_sum - x2_firstcol; int64_t z2_sum = x2_sum - x2_firstrow; const float num_hor = (float)(height * (width - 1)); const float num_ver = (float)((height - 1) * width); const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; if (xhor_var_n > 0 && y_var_n > 0) { *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); *hcorr = *hcorr < 0 ? 0 : *hcorr; } else { *hcorr = 1.0; } if (xver_var_n > 0 && z_var_n > 0) { *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); *vcorr = *vcorr < 0 ? 0 : *vcorr; } else { *vcorr = 1.0; } } aom-3.12.1/av1/encoder/x86/reconinter_enc_sse2.c000066400000000000000000000306531477627663500212400ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include // SSE2 #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "aom_dsp/x86/mem_sse2.h" #include "aom_dsp/x86/synonyms.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/mvref_common.h" #include "av1/common/obmc.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/encoder/reconinter_enc.h" void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter = av1_get_filter(subpel_search); // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for // 2-tap yet. int filter_taps = (subpel_search <= USE_4_TAPS) ? 
4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { int i; assert(!(width & 15)); /*Read 16 pixels one row at a time.*/ for (i = 0; i < height; i++) { int j; for (j = 0; j < width; j += 16) { xx_storeu_128(comp_pred, xx_loadu_128(ref)); comp_pred += 16; ref += 16; } ref += ref_stride - width; } } else if (width >= 8) { int i; assert(!(width & 7)); assert(!(height & 1)); /*Read 8 pixels two rows at a time.*/ for (i = 0; i < height; i += 2) { __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); comp_pred += 16; ref += 2 * ref_stride; } } else { int i; assert(!(width & 3)); assert(!(height & 3)); /*Read 4 pixels four rows at a time.*/ for (i = 0; i < height; i++) { const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), _mm_unpacklo_epi32(row2, row3)); xx_storeu_128(comp_pred, reg); comp_pred += 16; ref += 4 * ref_stride; } } } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, width, height); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, width, height); } else { DECLARE_ALIGNED(16, uint8_t, temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height); aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } } #if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { int plane = 0; const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref_num]; InterPredParams inter_pred_params; inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); av1_init_inter_params( &inter_pred_params, width, height, mi_y >> pd->subsampling_y, mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); av1_enc_build_one_inter_predictor(comp_pred8, width, mv, &inter_pred_params); return; } } const InterpFilterParams *filter = av1_get_filter(subpel_search); int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { int i; assert(!(width & 7)); /*Read 8 pixels one row at a time.*/ for (i = 0; i < height; i++) { int j; for (j = 0; j < width; j += 8) { __m128i s0 = _mm_loadu_si128((const __m128i *)ref); _mm_storeu_si128((__m128i *)comp_pred, s0); comp_pred += 8; ref += 8; } ref += ref_stride - width; } } else { int i; assert(!(width & 3)); /*Read 4 pixels two rows at a time.*/ for (i = 0; i < height; i += 2) { __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); __m128i t0 = _mm_unpacklo_epi64(s0, s1); _mm_storeu_si128((__m128i *)comp_pred, t0); comp_pred += 8; ref += 2 * ref_stride; } } } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); aom_highbd_convolve8_horiz( ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, bd); } } void aom_highbd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search) { aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd, subpel_search); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); int n = width * height >> 3; for (int i = 0; i < n; i++) { __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); __m128i p0 = _mm_loadu_si128((const __m128i *)pred); _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); comp_pred16 += 8; pred += 8; } } #endif // CONFIG_AV1_HIGHBITDEPTH void aom_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; for (i = 0; i < n; i++) { __m128i s0 = xx_loadu_128(comp_pred); __m128i p0 = xx_loadu_128(pred); xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); comp_pred += 16; pred += 16; } } aom-3.12.1/av1/encoder/x86/temporal_filter_avx2.c000066400000000000000000000661361477627663500214440ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" #define SSE_STRIDE (BW + 2) DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }; DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = { { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 } }; #define CALC_X_GRADIENT(AC, GI, DF, out) \ out = _mm256_abs_epi16( \ _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1))); #define CALC_Y_GRADIENT(AC, GI, BH, out) \ out = _mm256_abs_epi16( \ _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1))); double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height, int width, int stride, int edge_thresh) { int count = 0; int64_t accum = 0; // w32 stores width multiple of 32. const int w32 = (width - 1) & ~0x1f; const __m256i zero = _mm256_setzero_si256(); const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh); __m256i num_accumulator = zero; __m256i sum_accumulator = zero; // A | B | C // D | E | F // G | H | I // g_x = (A - C) + (G - I) + 2*(D - F) // g_y = (A + C) - (G + I) + 2*(B - H) // v = 4*E - 2*(D+F+B+H) + (A+C+G+I) // Process the width multiple of 32 here. for (int w = 1; w < w32; w += 32) { int h = 1; const int start_idx = h * stride + w; const int stride_0 = start_idx - stride; __m256i num_accum_row_lvl = zero; const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1])); const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1])); const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1])); const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1])); __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0])); __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx])); const __m256i A_lo = _mm256_unpacklo_epi8(A, zero); const __m256i A_hi = _mm256_unpackhi_epi8(A, zero); const __m256i C_lo = _mm256_unpacklo_epi8(C, zero); const __m256i C_hi = _mm256_unpackhi_epi8(C, zero); const __m256i D_lo = _mm256_unpacklo_epi8(D, zero); const __m256i D_hi = _mm256_unpackhi_epi8(D, zero); const __m256i F_lo = _mm256_unpacklo_epi8(F, zero); const __m256i F_hi = _mm256_unpackhi_epi8(F, zero); __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo); __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi); __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo); __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi); __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo); __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi); __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo); __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi); for (; h < height - 1; h++) { __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi, gy_hi; const int k = h * stride + w; const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1])); const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride])); const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1])); const __m256i B_lo = _mm256_unpacklo_epi8(B, zero); const __m256i B_hi = _mm256_unpackhi_epi8(B, zero); const __m256i G_lo = _mm256_unpacklo_epi8(G, zero); const __m256i G_hi = _mm256_unpackhi_epi8(G, zero); const __m256i I_lo = 
_mm256_unpacklo_epi8(I, zero); const __m256i I_hi = _mm256_unpackhi_epi8(I, zero); const __m256i H_lo = _mm256_unpacklo_epi8(H, zero); const __m256i H_hi = _mm256_unpackhi_epi8(H, zero); sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo); sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi); sum_GI_lo = _mm256_add_epi16(G_lo, I_lo); sum_GI_hi = _mm256_add_epi16(G_hi, I_hi); const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo); const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi); CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo) CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo) const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo); CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi) CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi) const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi); __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo); __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi); const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi); // v = 4*E -2*(D+F+B+H) + (A+C+G+I) if (_mm256_movemask_epi8(comp_reg) != 0) { const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo); const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi); // 2*(D+F+B+H) const __m256i sum_DFBH_lo = _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1); // (A+C+G+I) const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo); const __m256i sum_DFBH_hi = _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1); const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi); // Convert E register values from 8bit to 16bit const __m256i E_lo = _mm256_unpacklo_epi8(E, zero); const __m256i E_hi = _mm256_unpackhi_epi8(E, zero); // 4*E - 2*(D+F+B+H)+ (A+C+G+I) const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16( _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo), sum_ACGI_lo)); const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16( _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi), sum_ACGI_hi)); cmp_lo = _mm256_srli_epi16(cmp_lo, 15); cmp_hi = _mm256_srli_epi16(cmp_hi, 15); const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo); const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi); num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo); num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi); sum_accumulator = _mm256_add_epi32(sum_accumulator, _mm256_unpacklo_epi16(var_lo, zero)); sum_accumulator = _mm256_add_epi32(sum_accumulator, _mm256_unpackhi_epi16(var_lo, zero)); sum_accumulator = _mm256_add_epi32(sum_accumulator, _mm256_unpacklo_epi16(var_hi, zero)); sum_accumulator = _mm256_add_epi32(sum_accumulator, _mm256_unpackhi_epi16(var_hi, zero)); } sub_AC_lo = sub_DF_lo; sub_AC_hi = sub_DF_hi; sub_DF_lo = sub_GI_lo; sub_DF_hi = sub_GI_hi; sum_AC_lo = sum_DF_lo; sum_AC_hi = sum_DF_hi; sum_DF_lo = sum_GI_lo; sum_DF_hi = sum_GI_hi; B = E; E = H; } const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero); const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero); num_accumulator = _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1)); } // Process the remaining width here. 
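  // The scalar tail below doubles as the reference formula for the vector
  // path above: at each interior pixel a 3x3 Sobel gradient magnitude
  // |g_x| + |g_y| is compared against edge_thresh so that edge pixels are
  // skipped, and the absolute response of the Laplacian-like kernel
  //     [  1 -2  1 ]
  //     [ -2  4 -2 ]
  //     [  1 -2  1 ]
  // is accumulated over the remaining smooth pixels.  For zero-mean Gaussian
  // noise of standard deviation sigma the expected absolute response of this
  // kernel is 6 * sigma * sqrt(2 / pi) (its coefficients' squares sum to 36),
  // which is why the function returns accum / (6 * count) * SQRT_PI_BY_2 as
  // the noise estimate.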
for (int h = 1; h < height - 1; ++h) { for (int w = w32 + 1; w < width - 1; ++w) { const int k = h * stride + w; // Compute sobel gradients const int g_x = (src[k - stride - 1] - src[k - stride + 1]) + (src[k + stride - 1] - src[k + stride + 1]) + 2 * (src[k - 1] - src[k + 1]); const int g_y = (src[k - stride - 1] - src[k + stride - 1]) + (src[k - stride + 1] - src[k + stride + 1]) + 2 * (src[k - stride] - src[k + stride]); const int ga = abs(g_x) + abs(g_y); if (ga < edge_thresh) { // Find Laplacian const int v = 4 * src[k] - 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) + (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] + src[k + stride + 1]); accum += abs(v); ++count; } } } // s0 s1 n0 n1 s2 s3 n2 n3 __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator); __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx); __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1); // s0+s2 s1+s3 n0+n2 n1+n3 __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi); // s0+s2+s1+s3 n0+n2+n1+n3 __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1); accum += _mm_cvtsi128_si32(result); count += _mm_extract_epi32(result, 2); // If very few smooth pels, return -1 since the estimate is unreliable. return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; } static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint16_t *frame_sse, const unsigned int sse_stride) { (void)block_width; const uint8_t *src1 = frame1; const uint8_t *src2 = frame2; uint16_t *dst = frame_sse; for (int i = 0; i < block_height; i++) { __m128i vf1_128, vf2_128; __m256i vf1, vf2, vdiff1, vsqdiff1; vf1_128 = _mm_loadu_si128((__m128i *)(src1)); vf2_128 = _mm_loadu_si128((__m128i *)(src2)); vf1 = _mm256_cvtepu8_epi16(vf1_128); vf2 = _mm256_cvtepu8_epi16(vf2_128); vdiff1 = _mm256_sub_epi16(vf1, vf2); vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1); _mm256_storeu_si256((__m256i *)(dst), vsqdiff1); // Set zero to uninitialized memory to avoid uninitialized loads later *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128()); src1 += stride, src2 += stride2; dst += sse_stride; } } static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint16_t *frame_sse, const unsigned int sse_stride) { (void)block_width; const uint8_t *src1 = frame1; const uint8_t *src2 = frame2; uint16_t *dst = frame_sse; for (int i = 0; i < block_height; i++) { __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2; vsrc1 = _mm256_loadu_si256((__m256i *)src1); vsrc2 = _mm256_loadu_si256((__m256i *)src2); vmax = _mm256_max_epu8(vsrc1, vsrc2); vmin = _mm256_min_epu8(vsrc1, vsrc2); vdiff = _mm256_subs_epu8(vmax, vmin); __m128i vtmp1 = _mm256_castsi256_si128(vdiff); __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1); vdiff1 = _mm256_cvtepu8_epi16(vtmp1); vdiff2 = _mm256_cvtepu8_epi16(vtmp2); vres1 = _mm256_mullo_epi16(vdiff1, vdiff1); vres2 = _mm256_mullo_epi16(vdiff2, vdiff2); _mm256_storeu_si256((__m256i *)(dst), vres1); _mm256_storeu_si256((__m256i *)(dst + 16), vres2); // Set zero to uninitialized memory to avoid uninitialized loads later *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128()); src1 += stride; src2 += stride2; dst += sse_stride; } } static AOM_FORCE_INLINE 
__m256i xx_load_and_pad(uint16_t *src, int col, int block_width) { __m128i v128tmp = _mm_loadu_si128((__m128i *)(src)); if (col == 0) { // For the first column, replicate the first element twice to the left v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]); } if (col == block_width - 4) { // For the last column, replicate the last element twice to the right v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]); } return _mm256_cvtepu16_epi32(v128tmp); } static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { // Mask the required 5 values inside the vector __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); __m128i v128a, v128b; // Extract 256b as two 128b registers A and B v128a = _mm256_castsi256_si128(vtmp); v128b = _mm256_extracti128_si256(vtmp, 1); // A = [A0+B0, A1+B1, A2+B2, A3+B3] v128a = _mm_add_epi32(v128a, v128b); // B = [A2+B2, A3+B3, 0, 0] v128b = _mm_srli_si128(v128a, 8); // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] v128a = _mm_add_epi32(v128a, v128b); // B = [A1+B1+A3+B3, 0, 0, 0] v128b = _mm_srli_si128(v128a, 4); // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] v128a = _mm_add_epi32(v128a, v128b); return _mm_extract_epi32(v128a, 0); } // AVX2 implementation of approx_exp() static inline __m256 approx_exp_avx2(__m256 y) { #define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) #define B \ 127 // Offset for the exponent according to IEEE floating point standard. #define C 60801 // Magic number controls the accuracy of approximation const __m256 multiplier = _mm256_set1_ps(A); const __m256i offset = _mm256_set1_epi32(B * (1 << 23) - C); y = _mm256_mul_ps(y, multiplier); y = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_cvttps_epi32(y), offset)); return y; #undef A #undef B #undef C } static void apply_temporal_filter( const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, const unsigned int stride2, const int block_width, const int block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, uint16_t *frame_sse, uint32_t *luma_sse_sum, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t acc_5x5_sse[BH][BW]; if (block_width == 32) { get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, SSE_STRIDE); } else { get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, SSE_STRIDE); } __m256i vsrc[5]; // Traverse 4 columns at a time // First and last columns will require padding for (int col = 0; col < block_width; col += 4) { uint16_t *src = (col) ? 
frame_sse + col - 2 : frame_sse; // Load and pad(for first and last col) 3 rows from the top for (int i = 2; i < 5; i++) { vsrc[i] = xx_load_and_pad(src, col, block_width); src += SSE_STRIDE; } // Copy first row to first 2 vectors vsrc[0] = vsrc[2]; vsrc[1] = vsrc[2]; for (int row = 0; row < block_height; row++) { __m256i vsum = _mm256_setzero_si256(); // Add 5 consecutive rows for (int i = 0; i < 5; i++) { vsum = _mm256_add_epi32(vsum, vsrc[i]); } // Push all elements by one element to the top for (int i = 0; i < 4; i++) { vsrc[i] = vsrc[i + 1]; } // Load next row to the last element if (row <= block_height - 4) { vsrc[4] = xx_load_and_pad(src, col, block_width); src += SSE_STRIDE; } else { vsrc[4] = vsrc[3]; } // Accumulate the sum horizontally for (int i = 0; i < 4; i++) { acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i); } } } double subblock_mses_scaled[4]; double d_factor_decayed[4]; for (int idx = 0; idx < 4; idx++) { subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; d_factor_decayed[idx] = d_factor[idx] * decay_factor; } if (tf_wgt_calc_lvl == 0) { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); count[k] += weight; accumulator[k] += weight * pixel_value; } } } else { __m256d subblock_mses_reg[4]; __m256d d_factor_mul_n_decay_qr_invs[4]; const __m256 zero = _mm256_set1_ps(0.0f); const __m256 point_five = _mm256_set1_ps(0.5f); const __m256 seven = _mm256_set1_ps(7.0f); const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels); const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor); const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE); // Maintain registers to hold mse and d_factor at subblock level. 
subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]); subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]); subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]); subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]); d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]); d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]); d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]); d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]); for (int i = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW; for (int j = 0; j < block_width; j += 8) { const __m256i acc_sse = _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j)); const __m256i luma_sse = _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j))); // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse); const __m256d diff_sse_pd_1 = _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse)); const __m256d diff_sse_pd_2 = _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1)); // const double window_error = diff_sse * inv_num_ref_pixels; const __m256d window_error_1 = _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit); const __m256d window_error_2 = _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit); // const int subblock_idx = y_blk_raster_offset + (j >= block_width / // 2); const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const __m256d blk_error = subblock_mses_reg[subblock_idx]; // const double combined_error = // weight_factor *window_error + subblock_mses_scaled[subblock_idx]; const __m256d combined_error_1 = _mm256_add_pd( _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error); const __m256d combined_error_2 = _mm256_add_pd( _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error); // d_factor_decayed[subblock_idx] const __m256d d_fact_mul_n_decay = d_factor_mul_n_decay_qr_invs[subblock_idx]; // double scaled_error = combined_error * // d_factor_decayed[subblock_idx]; const __m256d scaled_error_1 = _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay); const __m256d scaled_error_2 = _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay); const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1); const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2); const __m256 scaled_error_ps = _mm256_insertf128_ps( _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1); // scaled_error = AOMMIN(scaled_error, 7); const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven); const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps); // const int weight = //(int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f); const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps); const __m256 scale_weight_exp_result = _mm256_mul_ps(exp_result, tf_weight_scale); const __m256 round_result = _mm256_add_ps(scale_weight_exp_result, point_five); __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result); __m128i weights_in_16bit = _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit), _mm256_extractf128_si256(weights_in_32bit, 0x1)); // count[k] += weight; // accumulator[k] += weight * pixel_value; const int stride_idx = i * stride2 + j; const __m128i count_array = _mm_loadu_si128((__m128i *)(count + stride_idx)); _mm_storeu_si128((__m128i *)(count + stride_idx), _mm_add_epi16(count_array, 
weights_in_16bit)); const __m256i accumulator_array = _mm256_loadu_si256((__m256i *)(accumulator + stride_idx)); const __m128i pred_values = _mm_loadl_epi64((__m128i *)(frame2 + stride_idx)); const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values); const __m256i mull_frame2_weight_u32 = _mm256_mullo_epi32(pred_values_u32, weights_in_32bit); _mm256_storeu_si256( (__m256i *)(accumulator + stride_idx), _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32)); } } } } void av1_apply_temporal_filter_avx2( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!"); assert(!is_high_bitdepth && "Only support low bit-depth with avx2!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0, k = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++, k++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx]; } } } } } apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } aom-3.12.1/av1/encoder/x86/temporal_filter_sse2.c000066400000000000000000000321331477627663500214260ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include "config/av1_rtcd.h" #include "aom_dsp/mathutils.h" #include "av1/encoder/encoder.h" #include "av1/encoder/temporal_filter.h" // For the squared error buffer, keep a padding for 4 samples #define SSE_STRIDE (BW + 4) DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } }; static void get_squared_error(const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, const unsigned int stride2, const int block_width, const int block_height, uint16_t *frame_sse, const unsigned int dst_stride) { const uint8_t *src1 = frame1; const uint8_t *src2 = frame2; uint16_t *dst = frame_sse; for (int i = 0; i < block_height; i++) { for (int j = 0; j < block_width; j += 16) { // Set zero to uninitialized memory to avoid uninitialized loads later *(int *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128()); __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); __m128i vmax = _mm_max_epu8(vsrc1, vsrc2); __m128i vmin = _mm_min_epu8(vsrc1, vsrc2); __m128i vdiff = _mm_subs_epu8(vmax, vmin); __m128i vzero = _mm_setzero_si128(); __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero); __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero); __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1); __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2); _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); _mm_storeu_si128((__m128i *)(dst + j + 10), vres2); } // Set zero to uninitialized memory to avoid uninitialized loads later *(int *)(dst + block_width + 2) = _mm_cvtsi128_si32(_mm_setzero_si128()); src1 += stride; src2 += stride2; dst += dst_stride; } } static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col, int block_width) { __m128i vtmp = _mm_loadu_si128((__m128i *)src); __m128i vzero = _mm_setzero_si128(); __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero); __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero); // For the first column, replicate the first element twice to the left dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); // For the last column, replicate the last element twice to the right dstvec[1] = (col < block_width - 4) ? 
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); } static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { __m128i veca, vecb; // Mask and obtain the required 5 values inside the vector veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); // A = [A0+B0, A1+B1, A2+B2, A3+B3] veca = _mm_add_epi32(veca, vecb); // B = [A2+B2, A3+B3, 0, 0] vecb = _mm_srli_si128(veca, 8); // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] veca = _mm_add_epi32(veca, vecb); // B = [A1+B1+A3+B3, 0, 0, 0] vecb = _mm_srli_si128(veca, 4); // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] veca = _mm_add_epi32(veca, vecb); return _mm_cvtsi128_si32(veca); } static void apply_temporal_filter( const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, const unsigned int stride2, const int block_width, const int block_height, const int *subblock_mses, unsigned int *accumulator, uint16_t *count, uint16_t *frame_sse, uint32_t *luma_sse_sum, const double inv_num_ref_pixels, const double decay_factor, const double inv_factor, const double weight_factor, double *d_factor, int tf_wgt_calc_lvl) { assert(((block_width == 16) || (block_width == 32)) && ((block_height == 16) || (block_height == 32))); uint32_t acc_5x5_sse[BH][BW]; get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, frame_sse, SSE_STRIDE); __m128i vsrc[5][2]; // Traverse 4 columns at a time // First and last columns will require padding for (int col = 0; col < block_width; col += 4) { uint16_t *src = frame_sse + col; // Load and pad(for first and last col) 3 rows from the top for (int i = 2; i < 5; i++) { xx_load_and_pad(src, vsrc[i], col, block_width); src += SSE_STRIDE; } // Padding for top 2 rows vsrc[0][0] = vsrc[2][0]; vsrc[0][1] = vsrc[2][1]; vsrc[1][0] = vsrc[2][0]; vsrc[1][1] = vsrc[2][1]; for (int row = 0; row < block_height; row++) { __m128i vsum1 = _mm_setzero_si128(); __m128i vsum2 = _mm_setzero_si128(); // Add 5 consecutive rows for (int i = 0; i < 5; i++) { vsum1 = _mm_add_epi32(vsrc[i][0], vsum1); vsum2 = _mm_add_epi32(vsrc[i][1], vsum2); } // Push all elements by one element to the top for (int i = 0; i < 4; i++) { vsrc[i][0] = vsrc[i + 1][0]; vsrc[i][1] = vsrc[i + 1][1]; } if (row <= block_height - 4) { // Load next row xx_load_and_pad(src, vsrc[4], col, block_width); src += SSE_STRIDE; } else { // Padding for bottom 2 rows vsrc[4][0] = vsrc[3][0]; vsrc[4][1] = vsrc[3][1]; } // Accumulate the sum horizontally for (int i = 0; i < 4; i++) { acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i); } } } double subblock_mses_scaled[4]; double d_factor_decayed[4]; for (int idx = 0; idx < 4; idx++) { subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; d_factor_decayed[idx] = d_factor[idx] * decay_factor; } if (tf_wgt_calc_lvl == 0) { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); count[k] += weight; accumulator[k] += weight * 
pixel_value; } } } else { for (int i = 0, k = 0; i < block_height; i++) { const int y_blk_raster_offset = (i >= block_height / 2) * 2; for (int j = 0; j < block_width; j++, k++) { const int pixel_value = frame2[i * stride2 + j]; uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; const double window_error = diff_sse * inv_num_ref_pixels; const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); const double combined_error = weight_factor * window_error + subblock_mses_scaled[subblock_idx]; double scaled_error = combined_error * d_factor_decayed[subblock_idx]; scaled_error = AOMMIN(scaled_error, 7); const float fweight = approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; const int weight = iroundpf(fweight); count[k] += weight; accumulator[k] += weight * pixel_value; } } } } void av1_apply_temporal_filter_sse2( const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const MV *subblock_mvs, const int *subblock_mses, const int q_factor, const int filter_strength, int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, uint16_t *count) { const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); assert(!is_high_bitdepth && "Only support low bit-depth with sse2!"); assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); (void)is_high_bitdepth; const int mb_height = block_size_high[block_size]; const int mb_width = block_size_wide[block_size]; const int frame_height = frame_to_filter->y_crop_height; const int frame_width = frame_to_filter->y_crop_width; const int min_frame_size = AOMMIN(frame_height, frame_width); // Variables to simplify combined error calculation. const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Adjust filtering based on q. // Larger q -> stronger filtering -> larger weight. // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); if (q_factor >= TF_QINDEX_CUTOFF) { // Max q_factor is 255, therefore the upper bound of q_decay is 8. // We do not need a clip here. q_decay = 0.5 * pow((double)q_factor / 64, 2); } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); double d_factor[4] = { 0 }; uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; uint32_t luma_sse_sum[BW * BH] = { 0 }; for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { // Larger motion vector -> smaller filtering weight. const MV mv = subblock_mvs[subblock_idx]; const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; distance_threshold = AOMMAX(distance_threshold, 1); d_factor[subblock_idx] = distance / distance_threshold; d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); } // Handle planes in sequence. 
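// Rough summary of how the decay factors fit together (q_decay, s_decay and
// d_factor from above, n_decay computed per plane below); for each pixel,
// apply_temporal_filter() ends up using, approximately:
//   n_decay      = 0.5 + log(2 * noise_level + 5.0)
//   decay_factor = 1 / (n_decay * q_decay * s_decay)
//   scaled_error = min(combined_error * d_factor[subblock] * decay_factor, 7)
//   weight       = exp(-scaled_error) * TF_WEIGHT_SCALE
// where combined_error blends the 5x5 window SSE with the subblock MSE.
// Larger q, filter strength or noise therefore increases the filtering
// weight, while larger subblock motion (d_factor) decreases it.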
int plane_offset = 0; for (int plane = 0; plane < num_planes; ++plane) { const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; const int ss_x_shift = mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; const int ss_y_shift = mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. This is because motion // search is only done on Y-plane, so the information from Y-plane // will be more accurate. The luma sse sum is reused in both chroma // planes. if (plane == AOM_PLANE_U) { for (unsigned int i = 0, k = 0; i < plane_h; i++) { for (unsigned int j = 0; j < plane_w; j++, k++) { for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; } } } } } apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, weight_factor, d_factor, tf_wgt_calc_lvl); plane_offset += plane_h * plane_w; } } aom-3.12.1/av1/encoder/x86/wedge_utils_avx2.c000066400000000000000000000176661477627663500205730ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" #include "aom/aom_integer.h" #include "av1/common/reconinter.h" #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) /** * See av1_wedge_sse_from_residuals_c */ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { int n = -N; uint64_t csse; const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); const __m256i v_zext_q = _mm256_set1_epi64x(~0u); __m256i v_acc0_q = _mm256_setzero_si256(); assert(N % 64 == 0); r1 += N; d += N; m += N; do { const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n)); const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n)); const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n)); const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w); const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w); const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b); const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w); const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w); const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w); const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w); const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d); const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w); const __m256i v_sum0_q = _mm256_add_epi64( _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32)); v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q); n += 16; } while (n); v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8)); __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q); __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1); v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); #if AOM_ARCH_X86_64 csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); #else xx_storel_64(&csse, v_acc_q_0); #endif return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); } /** * See av1_wedge_sign_from_residuals_c */ int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc; __m256i v_acc0_d = _mm256_setzero_si256(); // Input size limited to 8192 by the use of 32 bit accumulators and m // being between [0, 64]. Overflow might happen at larger sizes, // though it is practically impossible on real video input. 
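// Rough worst-case arithmetic behind that limit: |ds| <= 32767 (ds is
// int16_t) and m <= MAX_MASK_VALUE = 64, so each 32-bit madd lane is at most
// 2 * 32767 * 64 (< 2^22), and each 64-sample iteration adds four such terms
// (< 2^24) to an accumulator lane. With N approaching 8192 (128 iterations)
// the accumulator can approach INT32_MAX, hence the cap asserted below.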
assert(N < 8192); assert(N % 64 == 0); do { const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); const __m256i v_m0_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); const __m256i v_m1_w = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); const __m256i v_m2_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); const __m256i v_m3_w = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); ds += 64; m += 64; N -= 64; } while (N); __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); #if AOM_ARCH_X86_64 acc = _mm_extract_epi64(v_acc_q_0, 0); #else xx_storel_64(&acc, v_acc_q_0); #endif return acc > limit; } /** * av1_wedge_compute_delta_squares_c */ void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, const int16_t *b, int N) { const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001); assert(N % 64 == 0); do { const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); // Negate top word of pairs const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, 
v_neg_w); const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); _mm256_store_si256((__m256i *)(d), v_r0_w); _mm256_store_si256((__m256i *)(d + 16), v_r1_w); _mm256_store_si256((__m256i *)(d + 32), v_r2_w); _mm256_store_si256((__m256i *)(d + 48), v_r3_w); a += 64; b += 64; d += 64; N -= 64; } while (N); } aom-3.12.1/av1/encoder/x86/wedge_utils_sse2.c000066400000000000000000000226361477627663500205600ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom_dsp/x86/synonyms.h" #include "aom/aom_integer.h" #include "av1/common/reconinter.h" #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) /** * See av1_wedge_sse_from_residuals_c */ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { int n = -N; int n8 = n + 8; uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); const __m128i v_zext_q = _mm_set1_epi64x(~0u); __m128i v_acc0_q = _mm_setzero_si128(); assert(N % 64 == 0); r1 += N; d += N; m += N; do { const __m128i v_r0_w = xx_load_128(r1 + n); const __m128i v_r1_w = xx_load_128(r1 + n8); const __m128i v_d0_w = xx_load_128(d + n); const __m128i v_d1_w = xx_load_128(d + n8); const __m128i v_m01_b = xx_load_128(m + n); const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); const 
__m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), _mm_srli_epi64(v_sq0_d, 32)); const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), _mm_srli_epi64(v_sq1_d, 32)); v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q); v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); n8 += 16; n += 16; } while (n); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); #if AOM_ARCH_X86_64 csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); #else xx_storel_64(&csse, v_acc0_q); #endif return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); } /** * See av1_wedge_sign_from_residuals_c */ int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc; __m128i v_sign_d; __m128i v_acc0_d = _mm_setzero_si128(); __m128i v_acc1_d = _mm_setzero_si128(); __m128i v_acc_q; // Input size limited to 8192 by the use of 32 bit accumulators and m // being between [0, 64]. Overflow might happen at larger sizes, // though it is practically impossible on real video input. assert(N < 8192); assert(N % 64 == 0); do { const __m128i v_m01_b = xx_load_128(m); const __m128i v_m23_b = xx_load_128(m + 16); const __m128i v_m45_b = xx_load_128(m + 32); const __m128i v_m67_b = xx_load_128(m + 48); const __m128i v_d0_w = xx_load_128(ds); const __m128i v_d1_w = xx_load_128(ds + 8); const __m128i v_d2_w = xx_load_128(ds + 16); const __m128i v_d3_w = xx_load_128(ds + 24); const __m128i v_d4_w = xx_load_128(ds + 32); const __m128i v_d5_w = xx_load_128(ds + 40); const __m128i v_d6_w = xx_load_128(ds + 48); const __m128i v_d7_w = xx_load_128(ds + 56); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); ds += 64; m += 64; N -= 64; } while (N); v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if AOM_ARCH_X86_64 acc = 
_mm_cvtsi128_si64(v_acc_q); #else xx_storel_64(&acc, v_acc_q); #endif return acc > limit; } // Negate under mask static inline __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); } /** * av1_wedge_compute_delta_squares_c */ void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, const int16_t *b, int N) { const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, (short)0xffff, 0, (short)0xffff, 0); assert(N % 64 == 0); do { const __m128i v_a0_w = xx_load_128(a); const __m128i v_b0_w = xx_load_128(b); const __m128i v_a1_w = xx_load_128(a + 8); const __m128i v_b1_w = xx_load_128(b + 8); const __m128i v_a2_w = xx_load_128(a + 16); const __m128i v_b2_w = xx_load_128(b + 16); const __m128i v_a3_w = xx_load_128(a + 24); const __m128i v_b3_w = xx_load_128(b + 24); const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); // Negate top word of pairs const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); xx_store_128(d, v_r0_w); xx_store_128(d + 8, v_r1_w); xx_store_128(d + 16, v_r2_w); xx_store_128(d + 24, v_r3_w); a += 32; b += 32; d += 32; N -= 32; } while (N); } aom-3.12.1/av1/exports_com000066400000000000000000000000421477627663500153510ustar00rootroot00000000000000text aom_read_obu_header_and_size aom-3.12.1/av1/exports_dec000066400000000000000000000000611477627663500153270ustar00rootroot00000000000000data aom_codec_av1_dx_algo text aom_codec_av1_dx aom-3.12.1/av1/exports_enc000066400000000000000000000000611477627663500153410ustar00rootroot00000000000000data aom_codec_av1_cx_algo text aom_codec_av1_cx aom-3.12.1/av1/exports_ident000066400000000000000000000000371477627663500157020ustar00rootroot00000000000000text ifd_init text ifd_inspect aom-3.12.1/av1/ratectrl_rtc.cc000066400000000000000000000445371477627663500161040ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "av1/ratectrl_rtc.h" #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "av1/common/common.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_utils.h" #include "av1/encoder/pickcdef.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/rc_utils.h" #include "av1/encoder/svc_layercontext.h" namespace { void AomAV1RateControlRtcConfigInitDefault(AomAV1RateControlRtcConfig *config) { if (config == nullptr) return; config->width = 1280; config->height = 720; config->is_screen = false; config->max_quantizer = 63; config->min_quantizer = 2; config->target_bandwidth = 1000; config->buf_initial_sz = 600; config->buf_optimal_sz = 600; config->buf_sz = 1000; config->undershoot_pct = 50; config->overshoot_pct = 50; config->max_intra_bitrate_pct = 50; config->max_inter_bitrate_pct = 0; config->frame_drop_thresh = 0; config->max_consec_drop_ms = 0; config->framerate = 30.0; av1_zero(config->layer_target_bitrate); config->layer_target_bitrate[0] = static_cast(config->target_bandwidth); av1_zero(config->ts_rate_decimator); config->ts_rate_decimator[0] = 1; config->aq_mode = 0; config->ss_number_layers = 1; config->ts_number_layers = 1; av1_zero(config->max_quantizers); av1_zero(config->min_quantizers); av1_zero(config->scaling_factor_num); av1_zero(config->scaling_factor_den); config->max_quantizers[0] = config->max_quantizer; config->min_quantizers[0] = config->min_quantizer; config->scaling_factor_num[0] = 1; config->scaling_factor_den[0] = 1; } } // namespace AomAV1RateControlRtcConfig::AomAV1RateControlRtcConfig() { AomAV1RateControlRtcConfigInitDefault(this); } namespace aom { std::unique_ptr AV1RateControlRTC::Create( const AV1RateControlRtcConfig &cfg) { std::unique_ptr rc_api(new (std::nothrow) AV1RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast(aom_memalign(32, sizeof(*cpi_))); if (!rc_api->cpi_) return nullptr; av1_zero(*rc_api->cpi_); rc_api->cpi_->ppi = static_cast(aom_memalign(32, sizeof(AV1_PRIMARY))); if (!rc_api->cpi_->ppi) return nullptr; av1_zero(*rc_api->cpi_->ppi); rc_api->cpi_->common.seq_params = &rc_api->cpi_->ppi->seq_params; av1_zero(*rc_api->cpi_->common.seq_params); if (!rc_api->InitRateControl(cfg)) return nullptr; if (cfg.aq_mode) { AV1_COMP *const cpi = rc_api->cpi_; cpi->enc_seg.map = static_cast(aom_calloc( cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols, sizeof(*cpi->enc_seg.map))); if (!cpi->enc_seg.map) return nullptr; cpi->cyclic_refresh = av1_cyclic_refresh_alloc( cpi->common.mi_params.mi_rows, cpi->common.mi_params.mi_cols); if (!cpi->cyclic_refresh) return nullptr; } return rc_api; } AV1RateControlRTC::~AV1RateControlRTC() { if (cpi_) { if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) { for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers); LAYER_CONTEXT *const 
lc = &cpi_->svc.layer_context[layer]; aom_free(lc->map); } } } aom_free(cpi_->svc.layer_context); cpi_->svc.layer_context = nullptr; if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { aom_free(cpi_->enc_seg.map); cpi_->enc_seg.map = nullptr; av1_cyclic_refresh_free(cpi_->cyclic_refresh); } aom_free(cpi_->ppi); aom_free(cpi_); } } bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) { AV1_COMMON *cm = &cpi_->common; AV1EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; cm->seq_params->profile = PROFILE_0; cm->seq_params->bit_depth = AOM_BITS_8; cm->show_frame = 1; oxcf->profile = cm->seq_params->profile; oxcf->mode = REALTIME; oxcf->rc_cfg.mode = AOM_CBR; oxcf->pass = AOM_RC_ONE_PASS; oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; if (rc_cfg.max_consec_drop_ms > 0) { rc->max_consec_drop = saturate_cast_double_to_int( ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000)); } cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; oxcf->tool_cfg.bit_depth = AOM_BITS_8; oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC; oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL; cm->current_frame.frame_number = 0; cpi_->ppi->p_rc.kf_boost = DEFAULT_KF_BOOST_RT; for (auto &lvl_idx : oxcf->target_seq_level_idx) lvl_idx = SEQ_LEVEL_MAX; memcpy(cpi_->ppi->level_params.target_seq_level_idx, oxcf->target_seq_level_idx, sizeof(oxcf->target_seq_level_idx)); if (!UpdateRateControl(rc_cfg)) return false; set_sb_size(cm->seq_params, av1_select_sb_size(oxcf, cm->width, cm->height, cpi_->svc.number_spatial_layers)); cpi_->ppi->use_svc = cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1; av1_primary_rc_init(oxcf, &cpi_->ppi->p_rc); rc->rc_1_frame = 0; rc->rc_2_frame = 0; av1_rc_init_minq_luts(); av1_rc_init(oxcf, rc); // Enable external rate control. cpi_->rc.rtc_external_ratectrl = 1; cpi_->sf.rt_sf.use_nonrd_pick_mode = 1; return true; } bool AV1RateControlRTC::UpdateRateControl( const AV1RateControlRtcConfig &rc_cfg) { if (rc_cfg.ss_number_layers < 1 || rc_cfg.ss_number_layers > AOM_MAX_SS_LAYERS || rc_cfg.ts_number_layers < 1 || rc_cfg.ts_number_layers > AOM_MAX_TS_LAYERS) { return false; } const int num_layers = rc_cfg.ss_number_layers * rc_cfg.ts_number_layers; if (num_layers > 1 && !av1_alloc_layer_context(cpi_, num_layers)) { return false; } AV1_COMMON *cm = &cpi_->common; AV1EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; initial_width_ = rc_cfg.width; initial_height_ = rc_cfg.height; cm->width = rc_cfg.width; cm->height = rc_cfg.height; oxcf->frm_dim_cfg.width = rc_cfg.width; oxcf->frm_dim_cfg.height = rc_cfg.height; oxcf->rc_cfg.worst_allowed_q = av1_quantizer_to_qindex(rc_cfg.max_quantizer); oxcf->rc_cfg.best_allowed_q = av1_quantizer_to_qindex(rc_cfg.min_quantizer); rc->worst_quality = oxcf->rc_cfg.worst_allowed_q; rc->best_quality = oxcf->rc_cfg.best_allowed_q; oxcf->input_cfg.init_framerate = rc_cfg.framerate; oxcf->rc_cfg.target_bandwidth = rc_cfg.target_bandwidth > INT64_MAX / 1000 ? 
INT64_MAX : 1000 * rc_cfg.target_bandwidth; oxcf->rc_cfg.starting_buffer_level_ms = rc_cfg.buf_initial_sz; oxcf->rc_cfg.optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz; oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct; oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; if (rc_cfg.max_consec_drop_ms > 0) { rc->max_consec_drop = saturate_cast_double_to_int( ceil(cpi_->framerate * rc_cfg.max_consec_drop_ms / 1000)); } oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; cpi_->framerate = rc_cfg.framerate; if (rc_cfg.is_screen) { cpi_->oxcf.tune_cfg.content = AOM_CONTENT_SCREEN; cpi_->is_screen_content_type = 1; } cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; set_primary_rc_buffer_sizes(oxcf, cpi_->ppi); enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8); av1_new_framerate(cpi_, cpi_->framerate); if (cpi_->svc.number_temporal_layers > 1 || cpi_->svc.number_spatial_layers > 1) { int64_t target_bandwidth_svc = 0; for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers); LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; lc->layer_target_bitrate = 1000 * rc_cfg.layer_target_bitrate[layer]; lc->max_q = rc_cfg.max_quantizers[layer]; lc->min_q = rc_cfg.min_quantizers[layer]; lrc->worst_quality = av1_quantizer_to_qindex(rc_cfg.max_quantizers[layer]); lrc->best_quality = av1_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; lc->framerate_factor = rc_cfg.ts_rate_decimator[tl]; if (tl == cpi_->svc.number_temporal_layers - 1) target_bandwidth_svc += lc->layer_target_bitrate; } } if (cm->current_frame.frame_number == 0) av1_init_layer_context(cpi_); // This is needed to initialize external RC flag in layer context structure. cpi_->rc.rtc_external_ratectrl = 1; av1_update_layer_context_change_config(cpi_, target_bandwidth_svc); } check_reset_rc_flag(cpi_); return true; } FrameDropDecision AV1RateControlRTC::ComputeQP( const AV1FrameParamsRTC &frame_params) { AV1_COMMON *const cm = &cpi_->common; int width, height; GF_GROUP *const gf_group = &cpi_->ppi->gf_group; cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; if (cpi_->svc.number_spatial_layers > 1) { const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, cpi_->svc.temporal_layer_id, cpi_->svc.number_temporal_layers); LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; av1_get_layer_resolution(initial_width_, initial_height_, lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); cm->width = width; cm->height = height; } enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8); cm->current_frame.frame_type = frame_params.frame_type; cpi_->refresh_frame.golden_frame = (cm->current_frame.frame_type == KEY_FRAME) ? 
1 : 0; cpi_->sf.rt_sf.use_nonrd_pick_mode = 1; if (frame_params.frame_type == kKeyFrame) { gf_group->update_type[cpi_->gf_frame_index] = KF_UPDATE; gf_group->frame_type[cpi_->gf_frame_index] = KEY_FRAME; gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_RESET; if (cpi_->ppi->use_svc) { const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, cpi_->svc.temporal_layer_id, cpi_->svc.number_temporal_layers); if (cm->current_frame.frame_number > 0) av1_svc_reset_temporal_layers(cpi_, 1); cpi_->svc.layer_context[layer].is_key_frame = 1; } } else { gf_group->update_type[cpi_->gf_frame_index] = LF_UPDATE; gf_group->frame_type[cpi_->gf_frame_index] = INTER_FRAME; gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_UPDATE; if (cpi_->ppi->use_svc) { const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, cpi_->svc.temporal_layer_id, cpi_->svc.number_temporal_layers); cpi_->svc.layer_context[layer].is_key_frame = 0; } } if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) { av1_update_temporal_layer_framerate(cpi_); av1_restore_layer_context(cpi_); } int target = 0; if (cpi_->oxcf.rc_cfg.mode == AOM_CBR) { if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_update_parameters(cpi_); if (frame_is_intra_only(cm)) { target = av1_calc_iframe_target_size_one_pass_cbr(cpi_); cpi_->common.current_frame.frame_number = 0; } else { target = av1_calc_pframe_target_size_one_pass_cbr( cpi_, gf_group->update_type[cpi_->gf_frame_index]); } } av1_rc_set_frame_target(cpi_, target, cm->width, cm->height); // Always drop for spatial enhancement layer if layer bandwidth is 0. // Otherwise check for frame-dropping based on buffer level in // av1_rc_drop_frame(). if ((cpi_->svc.spatial_layer_id > 0 && cpi_->oxcf.rc_cfg.target_bandwidth == 0) || av1_rc_drop_frame(cpi_)) { cpi_->is_dropped_frame = true; av1_rc_postencode_update_drop_frame(cpi_); if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) cpi_->rc.frames_since_key++; if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) { av1_save_layer_context(cpi_); } cpi_->frame_index_set.show_frame_count++; cpi_->common.current_frame.frame_number++; return kFrameDropDecisionDrop; } int bottom_index = 0, top_index = 0; cpi_->common.quant_params.base_qindex = av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height, cpi_->gf_frame_index, &bottom_index, &top_index); if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_setup(cpi_); return kFrameDropDecisionOk; } int AV1RateControlRTC::GetQP() const { return cpi_->common.quant_params.base_qindex; } AV1LoopfilterLevel AV1RateControlRTC::GetLoopfilterLevel() const { av1_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q); AV1LoopfilterLevel lpf_level; lpf_level.filter_level[0] = cpi_->common.lf.filter_level[0]; lpf_level.filter_level[1] = cpi_->common.lf.filter_level[1]; lpf_level.filter_level_u = cpi_->common.lf.filter_level_u; lpf_level.filter_level_v = cpi_->common.lf.filter_level_v; return lpf_level; } AV1CdefInfo AV1RateControlRTC::GetCdefInfo() const { av1_pick_cdef_from_qp(&cpi_->common, 0, 0); AV1CdefInfo cdef_level; cdef_level.cdef_strength_y = cpi_->common.cdef_info.cdef_strengths[0]; cdef_level.cdef_strength_uv = cpi_->common.cdef_info.cdef_uv_strengths[0]; cdef_level.damping = cpi_->common.cdef_info.cdef_damping; return cdef_level; } bool AV1RateControlRTC::GetSegmentationData( AV1SegmentationData *segmentation_data) const { if (cpi_->oxcf.q_cfg.aq_mode == 0) { return false; } // Don't update the 
segmentation map if cyclic refresh is not enabled. if (!cpi_->cyclic_refresh->apply_cyclic_refresh) { return false; } segmentation_data->segmentation_map = cpi_->enc_seg.map; segmentation_data->segmentation_map_size = cpi_->common.mi_params.mi_rows * cpi_->common.mi_params.mi_cols; segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; segmentation_data->delta_q_size = 3u; return true; } void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { cpi_->common.current_frame.frame_number++; av1_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) { cpi_->svc.prev_number_spatial_layers = cpi_->svc.number_spatial_layers; cpi_->rc.frames_since_key++; } if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) av1_save_layer_context(cpi_); } } // namespace aom extern "C" { AomAV1RateControlRTC *av1_ratecontrol_rtc_create( const AomAV1RateControlRtcConfig *rc_cfg) { if (rc_cfg == nullptr) return nullptr; return reinterpret_cast( aom::AV1RateControlRTC::Create(*rc_cfg).release()); } void av1_ratecontrol_rtc_destroy(AomAV1RateControlRTC *controller) { delete reinterpret_cast(controller); } bool av1_ratecontrol_rtc_update( AomAV1RateControlRTC *controller, const struct AomAV1RateControlRtcConfig *rc_cfg) { if (controller == nullptr || rc_cfg == nullptr) return false; return reinterpret_cast(controller) ->UpdateRateControl(*rc_cfg); } int av1_ratecontrol_rtc_get_qp(const AomAV1RateControlRTC *controller) { if (controller == nullptr) return 0; return reinterpret_cast(controller)->GetQP(); } AomAV1LoopfilterLevel av1_ratecontrol_rtc_get_loop_filter_level( const AomAV1RateControlRTC *controller) { if (controller == nullptr) { return { { 0, 0 }, 0, 0 }; } return reinterpret_cast(controller) ->GetLoopfilterLevel(); } AomFrameDropDecision av1_ratecontrol_rtc_compute_qp( AomAV1RateControlRTC *controller, const AomAV1FrameParamsRTC *frame_params) { if (controller == nullptr || frame_params == nullptr) return kAomFrameDropDecisionOk; return reinterpret_cast(controller) ->ComputeQP(*frame_params); } void av1_ratecontrol_rtc_post_encode_update(AomAV1RateControlRTC *controller, uint64_t encoded_frame_size) { if (controller == nullptr) return; reinterpret_cast(controller) ->PostEncodeUpdate(encoded_frame_size); } bool av1_ratecontrol_rtc_get_segmentation( const AomAV1RateControlRTC *controller, AomAV1SegmentationData *segmentation_data) { if (controller == nullptr || segmentation_data == nullptr) return false; return reinterpret_cast(controller) ->GetSegmentationData(segmentation_data); } AomAV1CdefInfo av1_ratecontrol_rtc_get_cdef_info( const AomAV1RateControlRTC *controller) { if (controller == nullptr) { return { 0, 0, 0 }; } return reinterpret_cast(controller) ->GetCdefInfo(); } void av1_ratecontrol_rtc_init_ratecontrol_config( AomAV1RateControlRtcConfig *config) { AomAV1RateControlRtcConfigInitDefault(config); } } // extern "C" aom-3.12.1/av1/ratectrl_rtc.h000066400000000000000000000134661477627663500157430ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_AV1_RATECTRL_RTC_H_ #define AOM_AV1_RATECTRL_RTC_H_ #ifdef __cplusplus #include #include #include #else #include #include #include #endif // __cplusplus struct AV1_COMP; typedef struct AomAV1LoopfilterLevel { int filter_level[2]; int filter_level_u; int filter_level_v; } AomAV1LoopfilterLevel; typedef struct AomAV1CdefInfo { int cdef_strength_y; int cdef_strength_uv; int damping; } AomAV1CdefInfo; typedef struct AomAV1SegmentationData { const uint8_t *segmentation_map; size_t segmentation_map_size; const int *delta_q; size_t delta_q_size; } AomAV1SegmentationData; typedef enum AomFrameType { kAomKeyFrame, kAomInterFrame } AomFrameType; typedef struct AomAV1FrameParamsRTC { AomFrameType frame_type; int spatial_layer_id; int temporal_layer_id; } AomAV1FrameParamsRTC; typedef enum AomFrameDropDecision { kAomFrameDropDecisionOk, // Frame is encoded. kAomFrameDropDecisionDrop, // Frame is dropped. } AomFrameDropDecision; // These constants come from AV1 spec. enum { kAomAV1MaxLayers = 32, kAomAV1MaxTemporalLayers = 8, kAomAV1MaxSpatialLayers = 4, }; typedef struct AomAV1RateControlRtcConfig { #ifdef __cplusplus AomAV1RateControlRtcConfig(); #endif int width; int height; // Flag indicating if the content is screen or not. bool is_screen; // 0-63 int max_quantizer; int min_quantizer; int64_t target_bandwidth; int64_t buf_initial_sz; int64_t buf_optimal_sz; int64_t buf_sz; int undershoot_pct; int overshoot_pct; int max_intra_bitrate_pct; int max_inter_bitrate_pct; int frame_drop_thresh; int max_consec_drop_ms; double framerate; int layer_target_bitrate[kAomAV1MaxLayers]; int ts_rate_decimator[kAomAV1MaxTemporalLayers]; int aq_mode; // Number of spatial layers int ss_number_layers; // Number of temporal layers int ts_number_layers; int max_quantizers[kAomAV1MaxLayers]; int min_quantizers[kAomAV1MaxLayers]; int scaling_factor_num[kAomAV1MaxSpatialLayers]; int scaling_factor_den[kAomAV1MaxSpatialLayers]; } AomAV1RateControlRtcConfig; struct AomAV1RateControlRTC; typedef struct AomAV1RateControlRTC AomAV1RateControlRTC; #ifdef __cplusplus namespace aom { using AV1LoopfilterLevel = AomAV1LoopfilterLevel; using AV1CdefInfo = AomAV1CdefInfo; using AV1SegmentationData = AomAV1SegmentationData; using AV1FrameParamsRTC = AomAV1FrameParamsRTC; using AV1RateControlRtcConfig = AomAV1RateControlRtcConfig; using FrameType = AomFrameType; constexpr FrameType kKeyFrame = kAomKeyFrame; constexpr FrameType kInterFrame = kAomInterFrame; using FrameDropDecision = AomFrameDropDecision; constexpr FrameDropDecision kFrameDropDecisionOk = kAomFrameDropDecisionOk; constexpr FrameDropDecision kFrameDropDecisionDrop = kAomFrameDropDecisionDrop; class AV1RateControlRTC { public: static std::unique_ptr Create( const AV1RateControlRtcConfig &cfg); ~AV1RateControlRTC(); bool UpdateRateControl(const AV1RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; // GetLoopfilterLevel() needs to be called after ComputeQP() AV1LoopfilterLevel GetLoopfilterLevel() const; // GetCdefInfo() needs to be called after ComputeQP() AV1CdefInfo GetCdefInfo() const; // Returns the segmentation map used for cyclic refresh, based on 4x4 blocks. 
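  // Illustrative sketch only (not part of the upstream documentation): one
  // possible per-frame call sequence, assuming `rc` came from
  // AV1RateControlRTC::Create() and `encoded_frame_size` is the caller's own
  // byte count for the packet it just produced.
  //
  //   aom::AV1FrameParamsRTC frame_params = { aom::kKeyFrame, 0, 0 };
  //   if (rc->ComputeQP(frame_params) == aom::kFrameDropDecisionOk) {
  //     const int qp = rc->GetQP();  // Valid only after ComputeQP().
  //     aom::AV1SegmentationData seg;
  //     const bool have_seg = rc->GetSegmentationData(&seg);
  //     // ... encode the frame with qp (and seg when have_seg is true) ...
  //     rc->PostEncodeUpdate(encoded_frame_size);
  //   }  // On kFrameDropDecisionDrop, skip GetQP()/PostEncodeUpdate().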
bool GetSegmentationData(AV1SegmentationData *segmentation_data) const; // ComputeQP returns the QP if the frame is not dropped (kOk return), // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate // are not to be called (av1_rc_postencode_update_drop_frame is already // called via ComputeQP if drop is decided). FrameDropDecision ComputeQP(const AV1FrameParamsRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); private: AV1RateControlRTC() = default; bool InitRateControl(const AV1RateControlRtcConfig &cfg); AV1_COMP *cpi_; int initial_width_; int initial_height_; }; } // namespace aom #endif // __cplusplus #ifdef __cplusplus extern "C" { #endif AomAV1RateControlRTC *av1_ratecontrol_rtc_create( const AomAV1RateControlRtcConfig *rc_cfg); void av1_ratecontrol_rtc_destroy(AomAV1RateControlRTC *controller); bool av1_ratecontrol_rtc_update(AomAV1RateControlRTC *controller, const AomAV1RateControlRtcConfig *rc_cfg); int av1_ratecontrol_rtc_get_qp(const AomAV1RateControlRTC *controller); AomAV1LoopfilterLevel av1_ratecontrol_rtc_get_loop_filter_level( const AomAV1RateControlRTC *controller); AomFrameDropDecision av1_ratecontrol_rtc_compute_qp( AomAV1RateControlRTC *controller, const AomAV1FrameParamsRTC *frame_params); void av1_ratecontrol_rtc_post_encode_update(AomAV1RateControlRTC *controller, uint64_t encoded_frame_size); bool av1_ratecontrol_rtc_get_segmentation( const AomAV1RateControlRTC *controller, AomAV1SegmentationData *segmentation_data); AomAV1CdefInfo av1_ratecontrol_rtc_get_cdef_info( const AomAV1RateControlRTC *controller); void av1_ratecontrol_rtc_init_ratecontrol_config( AomAV1RateControlRtcConfig *config); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_AV1_RATECTRL_RTC_H_ aom-3.12.1/build/000077500000000000000000000000001477627663500135005ustar00rootroot00000000000000aom-3.12.1/build/.gitattributes000066400000000000000000000000511477627663500163670ustar00rootroot00000000000000*-vs8/*.rules -crlf *-msvs/*.rules -crlf aom-3.12.1/build/.gitignore000066400000000000000000000000171477627663500154660ustar00rootroot00000000000000x86*-win32-vs* aom-3.12.1/build/cmake/000077500000000000000000000000001477627663500145605ustar00rootroot00000000000000aom-3.12.1/build/cmake/aom_config.c.template000066400000000000000000000012341477627663500206370ustar00rootroot00000000000000/* * Copyright (c) @year@, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "aom/aom_codec.h" static const char* const cfg = "${AOM_CMAKE_CONFIG}"; const char *aom_codec_build_config(void) {return cfg;} aom-3.12.1/build/cmake/aom_config_defaults.cmake000066400000000000000000000277651477627663500215730ustar00rootroot00000000000000# # Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. include("${AOM_ROOT}/build/cmake/util.cmake") # This file sets default values for libaom configuration variables. All libaom # config variables are added to the CMake variable cache via the macros provided # in util.cmake. # # The variables in this section of the file are detected at configuration time, # but can be overridden via the use of CONFIG_* and ENABLE_* values also defined # in this file. # # CPUs. set_aom_detect_var(AOM_ARCH_AARCH64 0 "Enables AArch64 architecture.") set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.") set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.") set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.") set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.") set_aom_detect_var(AOM_ARCH_RISCV 0 "Enables RISC-V architecture.") # Arm/AArch64 feature flags. set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.") set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.") set_aom_detect_var(HAVE_NEON_DOTPROD 0 "Enables Armv8.2-A Neon dotprod intrinsics optimizations.") set_aom_detect_var(HAVE_NEON_I8MM 0 "Enables Armv8.2-A Neon i8mm intrinsics optimizations.") set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.") set_aom_detect_var(HAVE_SVE2 0 "Enables Armv9-A SVE2 intrinsics optimizations.") # PPC feature flags. set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") # x86/x86_64 feature flags. set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. ") set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.") set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.") set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.") set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.") set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.") set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.") set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.") set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.") # RISC-V64 feature flags. set_aom_detect_var(HAVE_RVV 0 "Enables RVV optimizations.") # Flags describing the build environment. set_aom_detect_var(HAVE_FEXCEPT 0 "Internal flag, GNU fenv.h present for target.") set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.") set_aom_detect_var(HAVE_UNISTD_H 0 "Internal flag, unistd.h present for target.") set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.") # # Variables in this section can be set from the CMake command line or from # within the CMake GUI. The variables control libaom features. # # Build configuration flags. set_aom_config_var(AOM_RTCD_FLAGS "" "Arguments to pass to rtcd.pl. Separate with ';'") set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.") set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.") set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.") set_aom_config_var(CONFIG_FPMT_TEST 0 "Enable FPMT testing.") set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).") set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.") set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.") set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.") # Set CONFIG_SVT_AV1 to 0 to avoid the BSD 3-Clause Clear License used by the # code in third_party/SVT-AV1/. 
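# An illustrative configure line for doing so (hypothetical source path, not a
# required invocation) would be:
#   cmake path/to/aom -DCONFIG_SVT_AV1=0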
set_aom_config_var(CONFIG_SVT_AV1 1 "Enables SVT-AV1 AVX2 convolution support.") set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1 "Build with high bitdepth support.") set_aom_config_var(CONFIG_AV1_TEMPORAL_DENOISING 0 "Build with temporal denoising support.") set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") set_aom_config_var(CONFIG_QUANT_MATRIX 1 "Build with quantization matrices for AV1 encoder." "AV1 decoder is always built with quantization matrices.") set_aom_config_var(CONFIG_REALTIME_ONLY 0 "Build for RTC-only. See aomcx.h for all disabled features.") set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.") set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.") # Debugging flags. set_aom_config_var(CONFIG_DEBUG 0 "Enable debug-only code.") set_aom_config_var(CONFIG_EXCLUDE_SIMD_MISMATCH 0 "Exclude mismatch in SIMD functions for testing/debugging.") set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.") # AV1 feature flags. set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.") set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.") set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 "Coefficient range check.") set_aom_config_var(CONFIG_DENOISE 1 "Denoise/noise modeling support in encoder.") set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.") set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.") set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0 "Force high bitdepth decoding pipeline on 8-bit input.") mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING) set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 "Max profile to support decoding.") set_aom_config_var( CONFIG_NORMAL_TILE_MODE 0 "Only enables general decoding (disables large scale tile decoding).") set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.") set_aom_config_var(CONFIG_TUNE_BUTTERAUGLI 0 "Enable encoding tuning for Butteraugli.") set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.") set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.") set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.") set_aom_config_var(STATIC_LINK_JXL 0 "Statically link the JPEG-XL library.") # AV1 experiment flags. set_aom_config_var(CONFIG_BITRATE_ACCURACY 0 "AV1 experiment: Improve bitrate accuracy.") set_aom_config_var( CONFIG_BITRATE_ACCURACY_BL 0 "AV1 experiment: Baseline of improve bitrate accuracy experiment.") set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "AV1 experiment: Bitstream debugging.") set_aom_config_var( CONFIG_COLLECT_COMPONENT_TIMING 0 "AV1 experiment: Collect encoding component timing information.") set_aom_config_var( CONFIG_COLLECT_PARTITION_STATS 0 "AV1 experiment: Collect partition timing stats. 
Can be 1 or 2.") set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment.") set_aom_config_var( CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 "AV1 experiment: Disable full_pixel_motion_search_based_split on BLOCK_8X8.") set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment.") set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment.") set_aom_config_var(CONFIG_NN_V2 0 "AV1 experiment: Fully-connected neural nets ver.2.") set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0 "AV1 experiment: for optical flow API.") set_aom_config_var(CONFIG_PARTITION_SEARCH_ORDER 0 "AV1 experiment: Use alternative partition search order.") set_aom_config_var(CONFIG_RATECTRL_LOG 0 "AV1 experiment: Log rate control decision.") set_aom_config_var(CONFIG_RD_COMMAND 0 "AV1 experiment: Use external rdmult and q_index.") set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment.") set_aom_config_var( CONFIG_RT_ML_PARTITIONING 0 "AV1 experiment: Build with ML-based partitioning for Real Time.") set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment.") set_aom_config_var(CONFIG_TFLITE 0 "AV1 experiment: Enable tensorflow lite library.") set_aom_config_var(CONFIG_THREE_PASS 0 "AV1 experiment: Enable three-pass encoding.") set_aom_config_var(CONFIG_OUTPUT_FRAME_SIZE 0 "AV1 experiment: Output frame size information.") set_aom_config_var( CONFIG_SALIENCY_MAP 0 "AV1 experiment: Enable saliency map based encoding tuning for VMAF.") set_aom_config_var(CONFIG_CWG_C013 0 "AV1 experiment: Support for 7.x and 8.x levels.") set_aom_config_var(CONFIG_CWG_E050 0 "AV1 experiment: Support for multilayer metadata OBU.") # Add this change to make aomenc reported PSNR consistent with libvmaf result. set_aom_config_var(CONFIG_LIBVMAF_PSNR_PEAK 1 "Use libvmaf PSNR peak for 10- and 12-bit") # # Variables in this section control optional features of the build system. # set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF) set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" OFF) set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF) set_aom_option_var(ENABLE_DOCS "Enable documentation generation (doxygen required)." ON) set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" OFF) set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON) set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF) set_aom_option_var( ENABLE_IDE_TEST_HOSTING "Enables running tests within IDEs like Visual Studio and Xcode." OFF) set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF) set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets." ON) set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON) set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory." ON) set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time." OFF) # Arm/AArch64 assembly/intrinsics flags. set_aom_option_var(ENABLE_NEON "Enables Neon optimizations on Arm/AArch64 targets." ON) set_aom_option_var(ENABLE_ARM_CRC32 "Enables Arm CRC32 optimizations." ON) set_aom_option_var( ENABLE_NEON_DOTPROD "Enables Armv8.2-A Neon dotprod optimizations on AArch64 targets." ON) set_aom_option_var( ENABLE_NEON_I8MM "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON) set_aom_option_var(ENABLE_SVE "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON) set_aom_option_var(ENABLE_SVE2 "Enables Armv9-A SVE2 optimizations on AArch64 targets." ON) # VSX intrinsics flags. 
set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." ON) # x86/x86_64 assembly/intrinsics flags. set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSE2 "Enables SSE2 optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON) set_aom_option_var(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON) # RVV intrinsics flags. set_aom_option_var(ENABLE_RVV "Enables RVV optimizations on RISC-V targets." ON) aom-3.12.1/build/cmake/aom_configure.cmake000066400000000000000000000457441477627663500204150ustar00rootroot00000000000000# # Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_) return() endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1) include(FindThreads) include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake") include("${AOM_ROOT}/build/cmake/aom_optimization.cmake") include("${AOM_ROOT}/build/cmake/compiler_flags.cmake") include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") include("${AOM_ROOT}/build/cmake/util.cmake") if(DEFINED CONFIG_LOWBITDEPTH) message(WARNING "CONFIG_LOWBITDEPTH has been removed. \ Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \ and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.") if(NOT CONFIG_LOWBITDEPTH) set(FORCE_HIGHBITDEPTH_DECODING 1 CACHE STRING "${cmake_cmdline_helpstring}" FORCE) endif() endif() if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH) change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1 "FORCE_HIGHBITDEPTH_DECODING") endif() if(CONFIG_THREE_PASS AND NOT CONFIG_AV1_DECODER) change_config_and_warn(CONFIG_THREE_PASS 0 "CONFIG_AV1_DECODER=0") endif() # Generate the user config settings. list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS}) foreach(cache_var ${aom_build_vars}) get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING) if(cache_var_helpstring STREQUAL cmake_cmdline_helpstring) set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}") endif() endforeach() string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) # Detect target CPU. 
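# (Detection can be bypassed by setting AOM_TARGET_CPU explicitly; the error
# messages below suggest, for example, passing -DAOM_TARGET_CPU=generic on the
# cmake command line to build without optimizations.)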
if(NOT AOM_TARGET_CPU) string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) if(cpu_lowercase STREQUAL "amd64" OR cpu_lowercase STREQUAL "x86_64") if(CMAKE_SIZEOF_VOID_P EQUAL 4) set(AOM_TARGET_CPU "x86") elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) set(AOM_TARGET_CPU "x86_64") else() message( FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n" " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n" " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n" " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n") endif() elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86") set(AOM_TARGET_CPU "x86") elseif(cpu_lowercase MATCHES "^arm") set(AOM_TARGET_CPU "${cpu_lowercase}") elseif(cpu_lowercase MATCHES "aarch64") set(AOM_TARGET_CPU "arm64") elseif(cpu_lowercase MATCHES "^ppc") set(AOM_TARGET_CPU "ppc") elseif(cpu_lowercase MATCHES "^riscv") set(AOM_TARGET_CPU "riscv") else() message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not " "supported, falling back to the generic target") set(AOM_TARGET_CPU "generic") endif() endif() if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string. if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}") file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}" "${CMAKE_TOOLCHAIN_FILE}") else() set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}") endif() set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"") set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}") else() # Add detected CPU to the config string. set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}") endif() set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}") file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}") set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}") string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}") set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME}) string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) if(build_type_lowercase STREQUAL "debug") set(CONFIG_DEBUG 1) endif() if(BUILD_SHARED_LIBS) set(CONFIG_PIC 1) set(CONFIG_SHARED 1) elseif(NOT CONFIG_PIC) # Update the variable only when it does not carry the CMake assigned help # string for variables specified via the command line. This allows the user to # force CONFIG_PIC=0. unset(cache_helpstring) get_property(cache_helpstring CACHE CONFIG_PIC PROPERTY HELPSTRING) if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}") aom_check_c_compiles("pie_check" " #if !(__pie__ || __PIE__) #error Neither __pie__ or __PIE__ are set #endif extern void unused(void); void unused(void) {}" HAVE_PIE) if(HAVE_PIE) # If -fpie or -fPIE are used ensure the assembly code has PIC enabled to # avoid DT_TEXTRELs: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE set(CONFIG_PIC 1) message( "CONFIG_PIC enabled for position independent executable (PIE) build") endif() endif() unset(cache_helpstring) endif() if(NOT MSVC) if(CONFIG_PIC) # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to # work. 
set(CMAKE_POSITION_INDEPENDENT_CODE ON) if(AOM_TARGET_SYSTEM STREQUAL "Linux" AND AOM_TARGET_CPU MATCHES "^armv[78]") set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1) else() set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC) endif() endif() endif() if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64") find_program(CMAKE_ASM_NASM_COMPILER yasm $ENV{YASM_PATH}) if(NOT CMAKE_ASM_NASM_COMPILER OR ENABLE_NASM) unset(CMAKE_ASM_NASM_COMPILER CACHE) find_program(CMAKE_ASM_NASM_COMPILER nasm $ENV{NASM_PATH}) endif() include(CheckLanguage) check_language(ASM_NASM) if(CMAKE_ASM_NASM_COMPILER) get_asm_obj_format("objformat") unset(CMAKE_ASM_NASM_OBJECT_FORMAT) set(CMAKE_ASM_NASM_OBJECT_FORMAT ${objformat}) enable_language(ASM_NASM) if(CMAKE_ASM_NASM_COMPILER_ID STREQUAL "NASM") test_nasm() endif() # Xcode requires building the objects manually, so pass the object format # flag. if(XCODE) set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS}) endif() else() message( FATAL_ERROR "Unable to find assembler. Install 'yasm' or 'nasm.' " "To build without optimizations, add -DAOM_TARGET_CPU=generic to " "your cmake command line.") endif() string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) elseif(AOM_TARGET_CPU MATCHES "arm") if(AOM_TARGET_SYSTEM STREQUAL "Darwin") if(NOT CMAKE_ASM_COMPILER) set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) endif() set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) elseif(AOM_TARGET_SYSTEM STREQUAL "Windows") if(NOT CMAKE_ASM_COMPILER) set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} "-c -mimplicit-it=always") endif() else() if(NOT CMAKE_ASM_COMPILER) set(CMAKE_ASM_COMPILER as) endif() endif() include(CheckLanguage) check_language(ASM) if(NOT CMAKE_ASM_COMPILER) message( FATAL_ERROR "Unable to find assembler and optimizations are enabled." "Searched for ${CMAKE_ASM_COMPILER}. Install it, add it to your path," "or set the assembler directly by adding " "-DCMAKE_ASM_COMPILER= to your CMake command line." "To build without optimizations, add -DAOM_TARGET_CPU=generic to your " "cmake command line.") endif() enable_language(ASM) string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) endif() if(CONFIG_ANALYZER) find_package(wxWidgets REQUIRED adv base core) include(${wxWidgets_USE_FILE}) endif() if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang") set(CONFIG_GCC 1) endif() if(CONFIG_GCOV) message("--- Testing for CONFIG_GCOV support.") require_linker_flag("-fprofile-arcs -ftest-coverage") require_compiler_flag("-fprofile-arcs -ftest-coverage" YES) endif() if(CONFIG_GPROF) message("--- Testing for CONFIG_GPROF support.") require_compiler_flag("-pg" YES) endif() if(AOM_TARGET_SYSTEM MATCHES "Darwin\|Linux\|Windows\|Android") set(CONFIG_OS_SUPPORT 1) endif() if(AOM_TARGET_SYSTEM STREQUAL "Windows") # The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set # it to 0x0601 (Windows 7). add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601") # Quiet warnings related to fopen, printf, etc. add_compiler_flag_if_supported("-D_CRT_SECURE_NO_WARNINGS") endif() # # Fix CONFIG_* dependencies. This must be done before including cpu.cmake to # ensure RTCD_CONFIG_* are properly set. fix_experiment_configs() # Don't just check for pthread.h, but use the result of the full pthreads # including a linking check in FindThreads above. 
set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT}) aom_check_source_compiles("unistd_check" "#include " HAVE_UNISTD_H) if(NOT WIN32) aom_push_var(CMAKE_REQUIRED_LIBRARIES "m") aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE #include void unused(void) { (void)unused; (void)feenableexcept(FE_DIVBYZERO | FE_INVALID); }" HAVE_FEXCEPT) aom_pop_var(CMAKE_REQUIRED_LIBRARIES) endif() include("${AOM_ROOT}/build/cmake/cpu.cmake") if(ENABLE_CCACHE) set_compiler_launcher(ENABLE_CCACHE ccache) endif() if(ENABLE_DISTCC) set_compiler_launcher(ENABLE_DISTCC distcc) endif() if(ENABLE_GOMA) set_compiler_launcher(ENABLE_GOMA gomacc) endif() if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER) message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.") endif() if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT) change_config_and_warn(CONFIG_SIZE_LIMIT 1 "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT") endif() if(CONFIG_SIZE_LIMIT) if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT) message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT " "and DECODE_WIDTH_LIMIT must be set.") endif() endif() # Test compiler flags. if(MSVC) # It isn't possible to specify C99 conformance for MSVC. MSVC doesn't support # C++ standards modes earlier than C++14. add_cxx_flag_if_supported("/std:c++14") add_compiler_flag_if_supported("/W3") # Disable MSVC warnings that suggest making code non-portable. add_compiler_flag_if_supported("/wd4996") if(ENABLE_WERROR) add_compiler_flag_if_supported("/WX") endif() # Compile source files in parallel add_compiler_flag_if_supported("/MP") else() require_c_flag("-std=c99" YES) if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU" AND CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC") # Microsoft's C++ Standard Library requires C++14 as it's MSVC's default and # minimum supported C++ version. If Clang is using this Standard Library # implementation, it cannot target C++11. require_cxx_flag_nomsvc("-std=c++14" YES) elseif(CYGWIN AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # The GNU C++ compiler in Cygwin needs the -std=gnu++11 flag to make the # POSIX function declarations visible in the Standard C Library headers. require_cxx_flag_nomsvc("-std=gnu++11" YES) else() require_cxx_flag_nomsvc("-std=c++11" YES) endif() add_compiler_flag_if_supported("-Wall") add_compiler_flag_if_supported("-Wdisabled-optimization") add_compiler_flag_if_supported("-Wextra") # Prior to version 3.19.0 cmake would fail to parse the warning emitted by gcc # with this flag. Note the order of this check and -Wextra-semi-stmt is # important due to is_flag_present() matching substrings with string(FIND # ...). 
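# (Concretely: -Wextra-semi must be handled before -Wextra-semi-stmt, otherwise
# the substring match in is_flag_present() would treat "-Wextra-semi" as
# already present once "-Wextra-semi-stmt" had been cached.)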
if(CMAKE_VERSION VERSION_LESS "3.19" AND CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 10) add_cxx_flag_if_supported("-Wextra-semi") else() add_compiler_flag_if_supported("-Wextra-semi") endif() add_compiler_flag_if_supported("-Wextra-semi-stmt") add_compiler_flag_if_supported("-Wfloat-conversion") add_compiler_flag_if_supported("-Wformat=2") add_c_flag_if_supported("-Wimplicit-function-declaration") add_compiler_flag_if_supported("-Wlogical-op") add_compiler_flag_if_supported("-Wmissing-declarations") if(CMAKE_C_COMPILER_ID MATCHES "Clang") add_compiler_flag_if_supported("-Wmissing-prototypes") else() add_c_flag_if_supported("-Wmissing-prototypes") endif() add_compiler_flag_if_supported("-Wpointer-arith") add_compiler_flag_if_supported("-Wshadow") add_compiler_flag_if_supported("-Wshorten-64-to-32") add_compiler_flag_if_supported("-Wsign-compare") add_compiler_flag_if_supported("-Wstring-conversion") add_compiler_flag_if_supported("-Wtype-limits") add_compiler_flag_if_supported("-Wundef") add_compiler_flag_if_supported("-Wuninitialized") add_compiler_flag_if_supported("-Wunreachable-code-aggressive") add_compiler_flag_if_supported("-Wunused") add_compiler_flag_if_supported("-Wvla") add_cxx_flag_if_supported("-Wc++14-extensions") add_cxx_flag_if_supported("-Wc++17-extensions") add_cxx_flag_if_supported("-Wc++20-extensions") if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address|undefined") # This combination has more stack overhead, so we account for it by # providing higher stack limit than usual. add_c_flag_if_supported("-Wstack-usage=285000") add_cxx_flag_if_supported("-Wstack-usage=270000") elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected. add_c_flag_if_supported("-Wstack-usage=135000") add_cxx_flag_if_supported("-Wstack-usage=240000") else() add_c_flag_if_supported("-Wstack-usage=100000") add_cxx_flag_if_supported("-Wstack-usage=240000") endif() if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address") # Disable no optimization warning when compiling with sanitizers add_compiler_flag_if_supported("-Wno-disabled-optimization") endif() # Quiet gcc 6 vs 7 abi warnings: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 if(AOM_TARGET_CPU MATCHES "arm") add_cxx_flag_if_supported("-Wno-psabi") endif() if(ENABLE_WERROR) add_compiler_flag_if_supported("-Werror") endif() if(build_type_lowercase MATCHES "rel") add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0") endif() add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") # Do not allow implicit vector type conversions on Clang builds (this is # already the default on GCC builds). if(CMAKE_C_COMPILER_ID MATCHES "Clang") # Clang 8.0.1 (in Cygwin) doesn't support -flax-vector-conversions=none. add_compiler_flag_if_supported("-flax-vector-conversions=none") endif() endif() # Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set, # android.toolchain.cmake would set normal (non-cache) versions of variables # like CMAKE_C_FLAGS_RELEASE which would mask the ones added to the cache # variable in add_compiler_flag_if_supported(), etc. As a workaround we add # everything accumulated in AOM_C/CXX_FLAGS to the normal versions. This could # also be addressed by reworking the flag tests and adding the results directly # to target_compile_options() as in e.g., libgav1, but that's a larger task. 
# https://github.com/android/ndk/wiki/Changelog-r23#changes if(ANDROID AND ("${ANDROID_NDK_MAJOR}" LESS 23 OR ANDROID_USE_LEGACY_TOOLCHAIN_FILE)) foreach(lang C;CXX) string(STRIP "${AOM_${lang}_FLAGS}" AOM_${lang}_FLAGS) if(AOM_${lang}_FLAGS) foreach(config ${AOM_${lang}_CONFIGS}) set(${config} "${${config}} ${AOM_${lang}_FLAGS}") endforeach() endif() endforeach() endif() set(AOM_LIB_LINK_TYPE PUBLIC) if(EMSCRIPTEN) # Avoid CMake generation time errors resulting from collisions with the form # of target_link_libraries() used by Emscripten.cmake. unset(AOM_LIB_LINK_TYPE) endif() # Generate aom_config templates. set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") execute_process( COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake") # Generate aom_config.{asm,h}. configure_file("${aom_config_asm_template}" "${AOM_CONFIG_DIR}/config/aom_config.asm") configure_file("${aom_config_h_template}" "${AOM_CONFIG_DIR}/config/aom_config.h") # Read the current git hash. find_package(Git) if(NOT GIT_FOUND) message("--- Git missing, version will be read from CHANGELOG.") endif() string(TIMESTAMP year "%Y") configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template" "${AOM_CONFIG_DIR}/config/aom_config.c") # Find Perl and generate the RTCD sources. find_package(Perl) if(NOT PERL_FOUND) message(FATAL_ERROR "Perl is required to build libaom.") endif() set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl") set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" "${AOM_CONFIG_DIR}/config/av1_rtcd.h") set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "${AOM_ROOT}/av1/common/av1_rtcd.c") set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd) list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT) math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1") foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE) list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE) list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL) execute_process( COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl" --arch=${AOM_TARGET_CPU} --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE} OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) endforeach() # Generate aom_version.h. execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -DGIT_EXECUTABLE=${GIT_EXECUTABLE} -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P "${AOM_ROOT}/build/cmake/version.cmake") aom-3.12.1/build/cmake/aom_experiment_deps.cmake000066400000000000000000000016161477627663500216150ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_) return() endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1) # Adjusts CONFIG_* CMake variables to address conflicts between active AV1 # experiments. macro(fix_experiment_configs) if(CONFIG_ANALYZER) change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER) endif() endmacro() aom-3.12.1/build/cmake/aom_install.cmake000066400000000000000000000075001477627663500200660ustar00rootroot00000000000000# # Copyright (c) 2018, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h" "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h") if(CONFIG_AV1_DECODER) list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h" "${AOM_ROOT}/aom/aomdx.h") endif() if(CONFIG_AV1_ENCODER) list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h" "${AOM_ROOT}/aom/aom_encoder.h" "${AOM_ROOT}/aom/aom_external_partition.h") endif() # Generate aom.pc and setup dependencies to ensure it is created when necessary. # Note: aom.pc generation uses GNUInstallDirs: # https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html macro(setup_aom_install_targets) if(NOT XCODE) include("GNUInstallDirs") set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc") # Create a library target for creating aom.pc. create_no_op_source_file(aom_pc c AOM_PKG_CONFIG_SOURCES) add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES}) # Setup a rule to generate aom.pc. add_custom_command( OUTPUT "${AOM_PKG_CONFIG_FILE}" COMMAND ${CMAKE_COMMAND} ARGS -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR} -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR} -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME} -DCMAKE_THREAD_LIBS_INIT=${CMAKE_THREAD_LIBS_INIT} -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD} -DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF} -DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI} -DCONFIG_SALIENCY_MAP=${CONFIG_SALIENCY_MAP} -DCONFIG_TFLITE=${CONFIG_TFLITE} -P "${AOM_ROOT}/build/cmake/pkg_config.cmake" COMMENT "Writing aom.pc" VERBATIM) # Explicitly add a dependency on the pkg-config file to ensure it's built. get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES) set_source_files_properties(${aom_pc_sources} OBJECT_DEPENDS "${AOM_PKG_CONFIG_FILE}") # Our pkg-config file carries version information: add a dependency on the # version rule. add_dependencies(aom_pc aom_version) if(CONFIG_AV1_DECODER) if(ENABLE_EXAMPLES) list(APPEND AOM_INSTALL_BINS aomdec) endif() endif() if(CONFIG_AV1_ENCODER) if(ENABLE_EXAMPLES) list(APPEND AOM_INSTALL_BINS aomenc) endif() endif() if(BUILD_SHARED_LIBS) set(AOM_INSTALL_LIBS aom aom_static) else() set(AOM_INSTALL_LIBS aom) endif() # Setup the install rules. 
install() will automatically prepend # CMAKE_INSTALL_PREFIX to relative paths install(FILES ${AOM_INSTALL_INCS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/aom") install(FILES "${AOM_PKG_CONFIG_FILE}" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") install(TARGETS ${AOM_INSTALL_LIBS};${AOM_INSTALL_BINS} RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") endif() endmacro() aom-3.12.1/build/cmake/aom_optimization.cmake000066400000000000000000000256121477627663500211520ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_) return() endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1) include("${AOM_ROOT}/build/cmake/util.cmake") # Translate $flag to one which MSVC understands, and write the new flag to the # variable named by $translated_flag (or unset it, when MSVC needs no flag). function(get_msvc_intrinsic_flag flag translated_flag) if("${flag}" STREQUAL "-mavx") set(${translated_flag} "/arch:AVX" PARENT_SCOPE) elseif("${flag}" STREQUAL "-mavx2") set(${translated_flag} "/arch:AVX2" PARENT_SCOPE) else() # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. unset(${translated_flag} PARENT_SCOPE) endif() endfunction() # Adds an object library target. Terminates generation if $flag is not supported # by the current compiler. $flag is the intrinsics flag required by the current # compiler, and is added to the compile flags for all sources in $sources. # $opt_name is used to name the target. $target_to_update is made dependent upon # the created target. # # Note: this function always updates the aom, and aom_static targets because # OBJECT libraries have rules that disallow the direct addition of .o files to # them as dependencies. Static and shared libraries do not have this limitation. function(add_intrinsics_object_library flag opt_name target_to_update sources) if("${${sources}}" STREQUAL "") return() endif() set(target_name ${target_to_update}_${opt_name}_intrinsics) add_library(${target_name} OBJECT ${${sources}}) set_property(TARGET ${target_name} PROPERTY FOLDER ${AOM_TARGET_CPU}) # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. # However, for clang-cl, the default is SSE2, and the MSVC frontend does not # provide any flags to enable SSE3 up to SSE4.1. So we need to restrict the # usage of MSVC-style flags to only the real MSVC. 
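  # As an illustration of the translation above: under real MSVC, -mavx becomes
  # /arch:AVX and -mavx2 becomes /arch:AVX2, while flags for the other flavors
  # (e.g. SSE2 through SSE4.2) are simply dropped because MSVC needs no flag
  # for them.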
if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") get_msvc_intrinsic_flag("${flag}" "flag") endif() if("${flag}" STREQUAL "-mavx2") unset(FLAG_SUPPORTED) check_c_compiler_flag("-mno-avx256-split-unaligned-load" FLAG_SUPPORTED) if(${FLAG_SUPPORTED}) set(flag "${flag} -mno-avx256-split-unaligned-load") endif() unset(FLAG_SUPPORTED) check_c_compiler_flag("-mno-avx256-split-unaligned-store" FLAG_SUPPORTED) if(${FLAG_SUPPORTED}) set(flag "${flag} -mno-avx256-split-unaligned-store") endif() endif() if(flag) separate_arguments(flag) target_compile_options(${target_name} PUBLIC ${flag}) endif() target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() # Add the new lib target to the global list of aom library targets. list(APPEND AOM_LIB_TARGETS ${target_name}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction() # Adds sources in list named by $sources to $target and adds $flag to the # compile flags for each source file. function(add_intrinsics_source_to_target flag target sources) target_sources(${target} PRIVATE ${${sources}}) if(MSVC) get_msvc_intrinsic_flag("${flag}" "flag") endif() if(flag) foreach(source ${${sources}}) set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag}) endforeach() endif() endfunction() # Writes object format for the current target to the var named by $out_format, # or terminates the build when the object format for the current target is # unknown. function(get_asm_obj_format out_format) if("${AOM_TARGET_CPU}" STREQUAL "x86_64") if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") set(objformat "macho64") elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN" OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") set(objformat "win64") else() set(objformat "elf64") endif() elseif("${AOM_TARGET_CPU}" STREQUAL "x86") if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") set(objformat "macho32") elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN" OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") set(objformat "win32") else() set(objformat "elf32") endif() else() message( FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}") endif() set(${out_format} ${objformat} PARENT_SCOPE) endfunction() # Adds library target named $lib_name for ASM files in variable named by # $asm_sources. Builds an output directory path from $lib_name. Links $lib_name # into the aom library target(s). Generates a C file with an unused no-op # function to ensure that all cmake generators can determine the linker # language, and that build tools don't complain that an object exposes no # symbols. # # In Xcode-based builds every step described above happens twice, and # directory/target/object names are updated to include _shared and _static # suffixes. function(add_asm_library lib_name asm_sources) if("${${asm_sources}}" STREQUAL "") return() endif() if(XCODE) # CMake's generator does not output a build rule for Nasm files. Moreover, # it makes Xcode believe Nasm files are of type "sourcecode" instead of # "sourcecode.nasm", which prevents even the default rule from applying. # This default rule is broken, though, because it doesn't apply any of the # flags specified for ASM_NASM. 
See https://discourse.cmake.org/t/building- # nasm-files-with-xcode/7934 list(APPEND asm_configs "static") if(BUILD_SHARED_LIBS) list(APPEND asm_configs "shared") endif() set(as_executable "${CMAKE_ASM_NASM_COMPILER}") if(NOT as_executable) set(as_executable "${CMAKE_ASM_COMPILER}") endif() foreach(asm_config ${asm_configs}) set(asm_lib_name ${lib_name}_${asm_config}) set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${asm_lib_name}") if(NOT EXISTS "${asm_lib_obj_dir}") file(MAKE_DIRECTORY "${asm_lib_obj_dir}") endif() foreach(asm_source ${${asm_sources}}) get_filename_component(asm_source_name "${asm_source}" NAME) set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o") add_custom_command(OUTPUT "${asm_object}" COMMAND ${as_executable} ARGS ${AOM_AS_FLAGS} -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o "${asm_object}" "${asm_source}" DEPENDS "${asm_source}" COMMENT "Building ASM object ${asm_object}" WORKING_DIRECTORY "${AOM_CONFIG_DIR}" VERBATIM) if(BUILD_SHARED_LIBS AND "${asm_config}" STREQUAL "static") target_sources(aom_static PRIVATE "${asm_object}") else() target_sources(aom PRIVATE "${asm_object}") endif() endforeach() endforeach() else() # For non-Xcode generators, CMake does not need extra help. The language # support takes care of it. set(asm_lib_name ${lib_name}) add_library(${asm_lib_name} OBJECT ${${asm_sources}}) target_include_directories(${asm_lib_name} PRIVATE ${AOM_ROOT} ${AOM_CONFIG_DIR}) target_compile_options(${asm_lib_name} PRIVATE ${AOM_AS_FLAGS}) set_property(TARGET ${asm_lib_name} PROPERTY FOLDER ${AOM_TARGET_CPU}) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE "$") endif() target_sources(aom PRIVATE "$") # Add the new lib target to the global list of aom library targets. list(APPEND AOM_LIB_TARGETS ${asm_lib_name}) endif() set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction() # Terminates generation if nasm found in PATH does not meet requirements. # Currently checks only for presence of required object formats and support for # the -Ox argument (multipass optimization). function(test_nasm) execute_process(COMMAND ${CMAKE_ASM_NASM_COMPILER} -hf OUTPUT_VARIABLE nasm_helptext) if(NOT "${nasm_helptext}" MATCHES "-Ox") message( FATAL_ERROR "Unsupported nasm: multipass optimization not supported.") endif() if("${AOM_TARGET_CPU}" STREQUAL "x86") if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") if(NOT "${nasm_helptext}" MATCHES "macho32") message( FATAL_ERROR "Unsupported nasm: macho32 object format not supported.") endif() elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") if(NOT "${nasm_helptext}" MATCHES "win32") message( FATAL_ERROR "Unsupported nasm: win32 object format not supported.") endif() else() if(NOT "${nasm_helptext}" MATCHES "elf32") message( FATAL_ERROR "Unsupported nasm: elf32 object format not supported.") endif() endif() else() if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") if(NOT "${nasm_helptext}" MATCHES "macho64") message( FATAL_ERROR "Unsupported nasm: macho64 object format not supported.") endif() elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") if(NOT "${nasm_helptext}" MATCHES "win64") message( FATAL_ERROR "Unsupported nasm: win64 object format not supported.") endif() else() if(NOT "${nasm_helptext}" MATCHES "elf64") message( FATAL_ERROR "Unsupported nasm: elf64 object format not supported.") endif() endif() endif() endfunction() # Adds build command for generation of rtcd C source files using # build/cmake/rtcd.pl. 
$config is the input perl file, $output is the output C # include file, $source is the C source file, and $symbol is used for the symbol # argument passed to rtcd.pl. function(add_rtcd_build_step config output source symbol) add_custom_command( OUTPUT ${output} COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl" --arch=${AOM_TARGET_CPU} --sym=${symbol} ${AOM_RTCD_FLAGS} --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output} DEPENDS "${AOM_ROOT}/build/cmake/rtcd.pl" ${config} COMMENT "Generating ${output}" WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM) set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) set_property(SOURCE ${output} PROPERTY GENERATED TRUE) endfunction() aom-3.12.1/build/cmake/compiler_flags.cmake000066400000000000000000000300001477627663500205410ustar00rootroot00000000000000# # Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_) return() endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") # Strings used to cache flags. set(AOM_C_FLAGS) set(AOM_CXX_FLAGS) set(AOM_EXE_LINKER_FLAGS) set(AOM_FAILED_C_FLAGS) set(AOM_FAILED_CXX_FLAGS) # Sets variable named by $out_is_present to YES in the caller's scope when $flag # is found in the string variable named by $flag_cache. Sets the var to NO # otherwise. function(is_flag_present flag_cache flag out_is_present) string(FIND "${${flag_cache}}" "${flag}" flag_pos) if(${flag_pos} EQUAL -1) set(${out_is_present} NO PARENT_SCOPE) else() set(${out_is_present} YES PARENT_SCOPE) endif() endfunction() # Appends $flag to $flags. Ignores scope via use of FORCE with set() call. function(append_flag flags flag) string(FIND "${${flags}}" "${flag}" found) if(${found} EQUAL -1) set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE) endif() endfunction() # Checks C compiler for support of $c_flag. Adds $c_flag to all # $CMAKE_C_FLAGS_s stored in AOM_C_CONFIGS when the compile test passes. # Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test # outcome. function(add_c_flag_if_supported c_flag) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed) if(${flag_ok} OR ${flag_failed}) return() endif() # Between 3.17.0 and 3.18.2 check_c_compiler_flag() sets a normal variable at # parent scope while check_cxx_source_compiles() continues to set an internal # cache variable, so we unset both to avoid the failure / success state # persisting between checks. See # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. 
unset(C_FLAG_SUPPORTED) unset(C_FLAG_SUPPORTED CACHE) message("Checking C compiler flag support for: " ${c_flag}) check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED) if(${C_FLAG_SUPPORTED}) append_flag(AOM_C_FLAGS "${c_flag}") foreach(config ${AOM_C_CONFIGS}) unset(C_FLAG_FOUND) append_flag("${config}" "${c_flag}") endforeach() else() append_flag(AOM_FAILED_C_FLAGS "${c_flag}") endif() endfunction() # Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all # $CMAKE_CXX_FLAGS_s stored in AOM_CXX_CONFIGS when the compile test # passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending # on test outcome. function(add_cxx_flag_if_supported cxx_flag) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed) if(${flag_ok} OR ${flag_failed}) return() endif() # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal variable # at parent scope while check_cxx_source_compiles() continues to set an # internal cache variable, so we unset both to avoid the failure / success # state persisting between checks. See # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. unset(CXX_FLAG_SUPPORTED) unset(CXX_FLAG_SUPPORTED CACHE) message("Checking C++ compiler flag support for: " ${cxx_flag}) check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED) if(${CXX_FLAG_SUPPORTED}) append_flag(AOM_CXX_FLAGS "${cxx_flag}") foreach(config ${AOM_CXX_CONFIGS}) unset(CXX_FLAG_FOUND) append_flag("${config}" "${cxx_flag}") endforeach() else() append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}") endif() endfunction() # Convenience method for adding a flag to both the C and C++ compiler command # lines. function(add_compiler_flag_if_supported flag) add_c_flag_if_supported(${flag}) add_cxx_flag_if_supported(${flag}) endfunction() # Checks C compiler for support of $c_flag and terminates generation when # support is not present. function(require_c_flag c_flag update_c_flags) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) if(${flag_ok}) return() endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") endif() unset(HAVE_C_FLAG CACHE) message("Checking C compiler flag support for: " ${c_flag}) check_c_compiler_flag("${c_flag}" HAVE_C_FLAG) if(NOT HAVE_C_FLAG) message( FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.") endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") aom_pop_var(CMAKE_EXE_LINKER_FLAGS) endif() append_flag(AOM_C_FLAGS "${c_flag}") if(update_c_flags) foreach(config ${AOM_C_CONFIGS}) set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE) endforeach() endif() endfunction() # Checks CXX compiler for support of $cxx_flag and terminates generation when # support is not present. 
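# An illustrative caller: require_cxx_flag_nomsvc("-std=c++11" YES) in
# aom_configure.cmake ends up here on non-MSVC toolchains, aborting the
# configuration if the compiler cannot accept the flag.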
function(require_cxx_flag cxx_flag update_cxx_flags) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) if(${flag_ok}) return() endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") endif() unset(HAVE_CXX_FLAG CACHE) message("Checking C++ compiler flag support for: " ${cxx_flag}) check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) if(NOT HAVE_CXX_FLAG) message( FATAL_ERROR "${PROJECT_NAME} requires support for C++ flag: ${cxx_flag}.") endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") aom_pop_var(CMAKE_EXE_LINKER_FLAGS) endif() append_flag(AOM_CXX_FLAGS "${cxx_flag}") if(update_cxx_flags) foreach(config ${AOM_CXX_CONFIGS}) set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE) endforeach() endif() endfunction() # Checks for support of $flag by both the C and CXX compilers. Terminates # generation when support is not present in both compilers. function(require_compiler_flag flag update_cmake_flags) require_c_flag(${flag} ${update_cmake_flags}) require_cxx_flag(${flag} ${update_cmake_flags}) endfunction() # Checks only non-MSVC targets for support of $c_flag and terminates generation # when support is not present. function(require_c_flag_nomsvc c_flag update_c_flags) if(NOT MSVC) require_c_flag(${c_flag} ${update_c_flags}) endif() endfunction() # Checks only non-MSVC targets for support of $cxx_flag and terminates # generation when support is not present. function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags) if(NOT MSVC) require_cxx_flag(${cxx_flag} ${update_cxx_flags}) endif() endfunction() # Checks only non-MSVC targets for support of $flag by both the C and CXX # compilers. Terminates generation when support is not present in both # compilers. function(require_compiler_flag_nomsvc flag update_cmake_flags) require_c_flag_nomsvc(${flag} ${update_cmake_flags}) require_cxx_flag_nomsvc(${flag} ${update_cmake_flags}) endfunction() # Adds $preproc_def to C compiler command line (as -D$preproc_def) if not # already present. function(add_c_preproc_definition preproc_def) set(preproc_def "-D${preproc_def}") is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached) if(${flag_cached}) return() endif() foreach(config ${AOM_C_CONFIGS}) set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) endforeach() endfunction() # Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not # already present. function(add_cxx_preproc_definition preproc_def) set(preproc_def "-D${preproc_def}") is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached) if(${flag_cached}) return() endif() foreach(config ${AOM_CXX_CONFIGS}) set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) endforeach() endfunction() # Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if # not already present. function(add_preproc_definition preproc_def) add_c_preproc_definition(${preproc_def}) add_cxx_preproc_definition(${preproc_def}) endfunction() # Adds $flag to assembler command line. function(append_as_flag flag) is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached) if(${flag_cached}) return() endif() append_flag(AOM_AS_FLAGS "${flag}") endfunction() # Adds $flag to the C compiler command line. function(append_c_flag flag) is_flag_present(AOM_C_FLAGS "${flag}" flag_cached) if(${flag_cached}) return() endif() foreach(config ${AOM_C_CONFIGS}) append_flag(${config} "${flag}") endforeach() endfunction() # Adds $flag to the CXX compiler command line. 
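# (A hypothetical call, purely for illustration: append_cxx_flag("-Wno-psabi")
# would add the flag to every CMAKE_CXX_FLAGS_<config> string listed in
# AOM_CXX_CONFIGS without first probing compiler support.)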
function(append_cxx_flag flag) is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached) if(${flag_cached}) return() endif() foreach(config ${AOM_CXX_CONFIGS}) append_flag(${config} "${flag}") endforeach() endfunction() # Adds $flag to the C and CXX compiler command lines. function(append_compiler_flag flag) append_c_flag(${flag}) append_cxx_flag(${flag}) endfunction() # Adds $flag to the executable linker command line when not present. function(append_exe_linker_flag flag) is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached) if(${flag_cached}) return() endif() append_flag(AOM_EXE_LINKER_FLAGS "${flag}") foreach(config ${AOM_EXE_LINKER_CONFIGS}) append_flag(${config} "${flag}") endforeach() endfunction() # Adds $flag to the link flags for $target. function(append_link_flag_to_target target flag) unset(target_link_flags) get_target_property(target_link_flags ${target} LINK_FLAGS) if(target_link_flags) is_flag_present(target_link_flags "${flag}" flag_found) if(${flag_found}) return() endif() set(target_link_flags "${target_link_flags} ${flag}") else() set(target_link_flags "${flag}") endif() set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags}) endfunction() # Adds $flag to executable linker flags, and makes sure C/CXX builds still work. function(require_linker_flag flag) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() append_exe_linker_flag(${flag}) unset(c_passed) aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed) unset(cxx_passed) aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed) if(NOT c_passed OR NOT cxx_passed) message(FATAL_ERROR "Linker flag test for ${flag} failed.") endif() endfunction() # Appends flags in $AOM_EXTRA__FLAGS variables to the flags used at build # time. function(set_user_flags) # Linker flags are handled first because some C/CXX flags require that a # linker flag is present at link time. if(AOM_EXTRA_EXE_LINKER_FLAGS) is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}" extra_present) if(NOT ${extra_present}) require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}") endif() endif() if(AOM_EXTRA_AS_FLAGS) # TODO(tomfinegan): assembler flag testing would be a good thing to have. is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present) if(NOT ${extra_present}) append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}") endif() endif() if(AOM_EXTRA_C_FLAGS) is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present) if(NOT ${extra_present}) require_c_flag("${AOM_EXTRA_C_FLAGS}" YES) endif() endif() if(AOM_EXTRA_CXX_FLAGS) is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present) if(NOT ${extra_present}) require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES) endif() endif() endfunction() aom-3.12.1/build/cmake/compiler_tests.cmake000066400000000000000000000143031477627663500206170ustar00rootroot00000000000000# # Copyright (c) 2016, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_) return() endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1) include(CheckCSourceCompiles) include(CheckCXXSourceCompiles) # CMake passes command line flags like this: # # * $compiler $lang_flags $lang_flags_config ... # # To ensure the flags tested here and elsewhere are obeyed a list of active # build configuration types is built, and flags are applied to the flag strings # for each configuration currently active for C and CXX builds as determined by # reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When # $CMAKE_CONFIGURATION_TYPES is non-empty a multi-configuration generator is in # use: currently this includes MSVC and Xcode. For other generators # $CMAKE_BUILD_TYPE is used. For both cases AOM_<LANG>_CONFIGS is populated with # CMake string variable names that contain flags for the currently available # configuration(s). unset(AOM_C_CONFIGS) unset(AOM_CXX_CONFIGS) list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs) if(${num_configs} GREATER 0) foreach(config ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER ${config} config) list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") endforeach() else() string(TOUPPER ${CMAKE_BUILD_TYPE} config) set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") endif() # The basic main() function used in all compile tests. set(AOM_C_MAIN "\nint main(void) { return 0; }") set(AOM_CXX_MAIN "\nint main() { return 0; }") # Strings containing the names of passed and failed tests. set(AOM_C_PASSED_TESTS) set(AOM_C_FAILED_TESTS) set(AOM_CXX_PASSED_TESTS) set(AOM_CXX_FAILED_TESTS) function(aom_push_var var new_value) set(SAVED_${var} ${${var}} PARENT_SCOPE) set(${var} "${${var}} ${new_value}" PARENT_SCOPE) endfunction() function(aom_pop_var var) set(var ${SAVED_${var}} PARENT_SCOPE) unset(SAVED_${var} PARENT_SCOPE) endfunction() # Confirms $test_source compiles and stores $test_name in one of # $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on outcome. When the # test passes $result_var is set to 1. When it fails $result_var is unset. The # test is not run if the test name is found in either of the passed or failed # test variables. function(aom_check_c_compiles test_name test_source result_var) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() unset(C_TEST_PASSED CACHE) unset(C_TEST_FAILED CACHE) string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED) string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED) if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1) unset(C_TEST_COMPILED CACHE) message("Running C compiler test: ${test_name}") check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED) set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE) if(C_TEST_COMPILED) set(AOM_C_PASSED_TESTS "${AOM_C_PASSED_TESTS} ${test_name}" CACHE STRING "" FORCE) else() set(AOM_C_FAILED_TESTS "${AOM_C_FAILED_TESTS} ${test_name}" CACHE STRING "" FORCE) message("C Compiler test ${test_name} failed.") endif() elseif(NOT ${C_TEST_PASSED} EQUAL -1) set(${result_var} 1 PARENT_SCOPE) else() # ${C_TEST_FAILED} NOT EQUAL -1 unset(${result_var} PARENT_SCOPE) endif() endfunction() # Confirms $test_source compiles and stores $test_name in one of # $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on outcome.
When the # test passes $result_var is set to 1. When it fails $result_var is unset. The # test is not run if the test name is found in either of the passed or failed # test variables. function(aom_check_cxx_compiles test_name test_source result_var) if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) return() endif() unset(CXX_TEST_PASSED CACHE) unset(CXX_TEST_FAILED CACHE) string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED) string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED) if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1) unset(CXX_TEST_COMPILED CACHE) message("Running CXX compiler test: ${test_name}") check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}" CXX_TEST_COMPILED) set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE) if(CXX_TEST_COMPILED) set(AOM_CXX_PASSED_TESTS "${AOM_CXX_PASSED_TESTS} ${test_name}" CACHE STRING "" FORCE) else() set(AOM_CXX_FAILED_TESTS "${AOM_CXX_FAILED_TESTS} ${test_name}" CACHE STRING "" FORCE) message("CXX Compiler test ${test_name} failed.") endif() elseif(NOT ${CXX_TEST_PASSED} EQUAL -1) set(${result_var} 1 PARENT_SCOPE) else() # ${CXX_TEST_FAILED} NOT EQUAL -1 unset(${result_var} PARENT_SCOPE) endif() endfunction() # Convenience function that confirms $test_source compiles as C and C++. # $result_var is set to 1 when both tests are successful, and 0 when one or both # tests fail. Note: This function is intended to be used to write to result # variables that are expanded via configure_file(). $result_var is set to 1 or 0 # to allow direct usage of the value in generated source files. function(aom_check_source_compiles test_name test_source result_var) unset(C_PASSED) unset(CXX_PASSED) aom_check_c_compiles(${test_name} ${test_source} C_PASSED) aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED) if(C_PASSED AND CXX_PASSED) set(${result_var} 1 PARENT_SCOPE) else() set(${result_var} 0 PARENT_SCOPE) endif() endfunction() aom-3.12.1/build/cmake/cpu.cmake000066400000000000000000000121461477627663500163550ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if("${AOM_TARGET_CPU}" STREQUAL "arm64") set(AOM_ARCH_ARM 1) set(AOM_ARCH_AARCH64 1) set(RTCD_ARCH_ARM "yes") set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE;SVE2") set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc") set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod") set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm") set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") set(AOM_SVE2_DEFAULT_FLAG "-march=armv9-a+i8mm+sve2") # SVE2 is a v9-only # feature # Check that the compiler flag to enable each flavor is supported by the # compiler. This may not be the case for new architecture features on old # compiler versions. 
foreach(flavor ${ARM64_FLAVORS}) if(ENABLE_${flavor} AND NOT DEFINED AOM_${flavor}_FLAG) set(AOM_${flavor}_FLAG "${AOM_${flavor}_DEFAULT_FLAG}") string(TOLOWER "${flavor}" flavor_lower) # Do not use check_c_compiler_flag here since the regex used to match # against stderr does not recognise the "invalid feature modifier" error # produced by certain versions of GCC, leading to the feature being # incorrectly marked as available. set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_${flavor}_FLAG}") unset(FLAG_SUPPORTED) aom_check_source_compiles("arm_feature_flag_${flavor_lower}_available" "static void function(void) {}" FLAG_SUPPORTED) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) if(NOT ${FLAG_SUPPORTED}) set(ENABLE_${flavor} 0) endif() endif() endforeach() # SVE and SVE2 require that the Neon-SVE bridge header is also available. if(ENABLE_SVE OR ENABLE_SVE2) set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(OLD_CMAKE_TRY_COMPILE_TARGET_TYPE ${CMAKE_TRY_COMPILE_TARGET_TYPE}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) aom_check_source_compiles("arm_neon_sve_bridge_available" " #ifndef __ARM_NEON_SVE_BRIDGE #error 1 #endif #include <arm_sve.h> #include <arm_neon_sve_bridge.h>" HAVE_SVE_HEADERS) # Check whether the compiler can compile SVE functions that require # backup/restore of SVE registers according to AAPCS. Clang for Windows used # to fail this, see https://github.com/llvm/llvm-project/issues/80009. aom_check_source_compiles("arm_sve_preserve" " #include <arm_sve.h> void other(void)\; svfloat32_t func(svfloat32_t a) { other()\; return a\; }" CAN_COMPILE_SVE) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) set(CMAKE_TRY_COMPILE_TARGET_TYPE ${OLD_CMAKE_TRY_COMPILE_TARGET_TYPE}) if(HAVE_SVE_HEADERS EQUAL 0 OR CAN_COMPILE_SVE EQUAL 0) set(ENABLE_SVE 0) set(ENABLE_SVE2 0) endif() endif() foreach(flavor ${ARM64_FLAVORS}) if(ENABLE_${flavor}) set(HAVE_${flavor} 1) set(RTCD_HAVE_${flavor} "yes") else() set(HAVE_${flavor} 0) string(TOLOWER ${flavor} flavor) set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) endif() endforeach() elseif("${AOM_TARGET_CPU}" MATCHES "^arm") set(AOM_ARCH_ARM 1) set(RTCD_ARCH_ARM "yes") if(ENABLE_NEON) set(HAVE_NEON 1) set(RTCD_HAVE_NEON "yes") else() set(HAVE_NEON 0) set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon) endif() elseif("${AOM_TARGET_CPU}" MATCHES "ppc") set(AOM_ARCH_PPC 1) set(RTCD_ARCH_PPC "yes") if(ENABLE_VSX) set(HAVE_VSX 1) set(RTCD_HAVE_VSX "yes") else() set(HAVE_VSX 0) set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx) endif() elseif("${AOM_TARGET_CPU}" MATCHES "^x86") if("${AOM_TARGET_CPU}" STREQUAL "x86") set(AOM_ARCH_X86 1) set(RTCD_ARCH_X86 "yes") elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64") set(AOM_ARCH_X86_64 1) set(RTCD_ARCH_X86_64 "yes") endif() set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2") foreach(flavor ${X86_FLAVORS}) if(ENABLE_${flavor} AND NOT disable_remaining_flavors) set(HAVE_${flavor} 1) set(RTCD_HAVE_${flavor} "yes") else() set(disable_remaining_flavors 1) set(HAVE_${flavor} 0) string(TOLOWER ${flavor} flavor) set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) endif() endforeach() elseif("${AOM_TARGET_CPU}" MATCHES "riscv") set(AOM_ARCH_RISCV64 1) set(RTCD_ARCH_RISCV64 "yes") if(ENABLE_RVV) set(HAVE_RVV 1) set(RTCD_HAVE_RVV "yes") else() set(HAVE_RVV 0) set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-rvv) endif() endif()
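# A minimal illustrative sketch (not part of the upstream cpu.cmake): the
# per-architecture blocks above leave behind HAVE_<flavor> values and the
# AOM_RTCD_FLAGS list that is later passed to rtcd.pl. Assuming this file has
# already been processed, the detection results could be summarized at
# configure time as shown here; the flavor list below is an assumption chosen
# for illustration, whereas upstream keeps separate per-architecture lists.
if(DEFINED AOM_TARGET_CPU)
  message(STATUS "AOM_TARGET_CPU: ${AOM_TARGET_CPU}")
  foreach(flavor NEON NEON_DOTPROD NEON_I8MM SVE SVE2 VSX AVX2 RVV)
    if(DEFINED HAVE_${flavor})
      message(STATUS "  HAVE_${flavor} = ${HAVE_${flavor}}")
    endif()
  endforeach()
  if(DEFINED AOM_RTCD_FLAGS)
    message(STATUS "  rtcd.pl flags: ${AOM_RTCD_FLAGS}")
  endif()
endif()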
aom-3.12.1/build/cmake/dist.cmake000066400000000000000000000042701477627663500165300ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # cmake_minimum_required(VERSION 3.16) # Converts spaces in $in_string to semicolons and writes the output to # $out_string. In CMake's eyes this converts the input string to a list. function(listify_string in_string out_string) string(REPLACE " " ";" ${out_string} ${in_string}) set(${out_string} "${${out_string}}" PARENT_SCOPE) endfunction() set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES" "AOM_DIST_LIBS" "ENABLE_DOCS") foreach(arg ${REQUIRED_ARGS}) if("${${arg}}" STREQUAL "") message(FATAL_ERROR "${arg} must not be empty.") endif() endforeach() if(ENABLE_DOCS) file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}") endif() if(AOM_DIST_EXAMPLES) listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES") foreach(example ${AOM_DIST_EXAMPLES}) if(NOT "${example}" MATCHES "aomdec\|aomenc") file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples") endif() endforeach() endif() if(AOM_DIST_TOOLS) listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS") foreach(tool ${AOM_DIST_TOOLS}) file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools") endforeach() endif() if(AOM_DIST_APPS) listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS") foreach(app ${AOM_DIST_APPS}) file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin") endforeach() endif() listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES") foreach(inc ${AOM_DIST_INCLUDES}) file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom") endforeach() listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS") foreach(lib ${AOM_DIST_LIBS}) file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib") endforeach() aom-3.12.1/build/cmake/exports.cmake000066400000000000000000000053141477627663500172710ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_) return() endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_ set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1) include("${AOM_ROOT}/build/cmake/exports_sources.cmake") # Creates the custom target which handles generation of the symbol export lists. 
function(setup_exports_target) if(APPLE) set(symbol_file_ext "syms") elseif(WIN32) set(symbol_file_ext "def") else() set(symbol_file_ext "ver") endif() set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}") add_custom_target( generate_exports COMMAND ${CMAKE_COMMAND} -DAOM_ROOT="${AOM_ROOT}" -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM} -DAOM_SYM_FILE="${aom_sym_file}" -DAOM_MSVC=${MSVC} -DAOM_XCODE=${XCODE} -DCMAKE_SHARED_LIBRARY_PREFIX="${CMAKE_SHARED_LIBRARY_PREFIX}" -DCONFIG_NAME=$ -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER} -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER} -DCONFIG_INSPECTION=${CONFIG_INSPECTION} -DENABLE_TESTS=${ENABLE_TESTS} -P "${AOM_ROOT}/build/cmake/generate_exports.cmake" SOURCES ${AOM_EXPORTS_SOURCES} DEPENDS ${AOM_EXPORTS_SOURCES} BYPRODUCTS ${aom_sym_file}) # Make libaom depend on the exports file, and set flags to pick it up when # creating the dylib. add_dependencies(aom generate_exports) if(APPLE) set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}") elseif(WIN32) if(MSVC) set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}") else() # For MinGW and MSYS compilers, you can use either version scripts or # module definition files. If the latter, it must be supplied as an # "object". set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS "${aom_sym_file}") endif() else() set_property(TARGET aom APPEND_STRING PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}") endif() endfunction() aom-3.12.1/build/cmake/exports_sources.cmake000066400000000000000000000022071477627663500210320ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_) return() endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1) list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com" "${AOM_ROOT}/av1/exports_com") if(CONFIG_AV1_DECODER) list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec" "${AOM_ROOT}/av1/exports_dec") if(CONFIG_INSPECTION) list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/av1/exports_ident") endif() endif() if(CONFIG_AV1_ENCODER) list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc" "${AOM_ROOT}/av1/exports_enc") endif() aom-3.12.1/build/cmake/generate_aom_config_templates.cmake000066400000000000000000000074721477627663500236250ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. 
# cmake_minimum_required(VERSION 3.16) string(TIMESTAMP year "%Y") set(asm_file_header_block "\; \; Copyright (c) ${year}, Alliance for Open Media. All rights reserved. \; \; This source code is subject to the terms of the BSD 2 Clause License and \; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License \; was not distributed with this source code in the LICENSE file, you can \; obtain it at www.aomedia.org/license/software. If the Alliance for Open \; Media Patent License 1.0 was not distributed with this source code in the \; PATENTS file, you can obtain it at www.aomedia.org/license/patent. \; ") set(h_file_header_block "/* * Copyright (c) ${year}, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ \#ifndef AOM_CONFIG_H_ \#define AOM_CONFIG_H_ ") set(cmake_file_header_block "## ## Copyright (c) ${year}, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## ") # Terminates cmake execution when $var_name is an empty string, or the variable # name it contains does not expand to an existing directory. function(check_directory_var var_name) if("${var_name}" STREQUAL "") message(FATAL_ERROR "The CMake variable ${var_name} must be defined.") endif() if(NOT EXISTS "${${var_name}}") message(FATAL_ERROR "${${var_name}} (${var_name}) missing.") endif() endfunction() check_directory_var(AOM_CONFIG_DIR) check_directory_var(AOM_ROOT) set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") if(NOT EXISTS "${AOM_DEFAULTS}") message( FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.") endif() include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS}) list(SORT aom_build_vars) set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") file(WRITE "${aom_config_h_template}" ${h_file_header_block}) foreach(aom_var ${aom_build_vars}) if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") file(APPEND "${aom_config_h_template}" "\#define ${aom_var} \${${aom_var}}\n") endif() endforeach() file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_") set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") file(WRITE "${aom_asm_config_template}" ${asm_file_header_block}) foreach(aom_var ${aom_build_vars}) if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n") endif() endforeach() aom-3.12.1/build/cmake/generate_exports.cmake000066400000000000000000000047261477627663500211510ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. 
# # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # cmake_minimum_required(VERSION 3.16) # CMAKE_SHARED_LIBRARY_PREFIX can be empty set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE" "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER") foreach(arg ${REQUIRED_ARGS}) if("${${arg}}" STREQUAL "") message(FATAL_ERROR "${arg} must not be empty.") endif() endforeach() include("${AOM_ROOT}/build/cmake/exports_sources.cmake") if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") set(symbol_prefix "_") elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") file(WRITE "${AOM_SYM_FILE}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n" "EXPORTS\n") else() set(symbol_suffix ";") endif() set(aom_sym_file "${AOM_SYM_FILE}") if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") file(REMOVE "${aom_sym_file}") elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") file(WRITE "${aom_sym_file}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n" "EXPORTS\n") else() file(WRITE "${aom_sym_file}" "{\nglobal:\n") endif() foreach(export_file ${AOM_EXPORTS_SOURCES}) file(STRINGS "${export_file}" exported_file_data) set(exported_symbols "${exported_symbols} ${exported_file_data};") string(STRIP "${exported_symbols}" exported_symbols) endforeach() foreach(exported_symbol ${exported_symbols}) string(STRIP "${exported_symbol}" exported_symbol) if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") string(SUBSTRING ${exported_symbol} 0 4 export_type) string(COMPARE EQUAL "${export_type}" "data" is_data) if(is_data) set(symbol_suffix " DATA") else() set(symbol_suffix "") endif() endif() string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}") set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}") file(APPEND "${aom_sym_file}" "${exported_symbol}\n") endforeach() if("${aom_sym_file}" MATCHES "ver$") file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};") endif() aom-3.12.1/build/cmake/pkg_config.cmake000066400000000000000000000054351477627663500176770ustar00rootroot00000000000000# # Copyright (c) 2017, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # cmake_minimum_required(VERSION 3.16) set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX" "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR" "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME" "CONFIG_MULTITHREAD") foreach(arg ${REQUIRED_ARGS}) if("${${arg}}" STREQUAL "") message(FATAL_ERROR "${arg} must not be empty.") endif() endforeach() include("${AOM_ROOT}/build/cmake/util.cmake") extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version) # Create a version string suitable for comparison using the RPM version compare # algorithm: strip out everything after the number. 
string(FIND "${aom_version}" "-" dash_pos) if(${dash_pos} EQUAL -1) set(package_version "${aom_version}") else() string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version) endif() # Write pkg-config info. set(prefix "${CMAKE_INSTALL_PREFIX}") set(bindir "${CMAKE_INSTALL_BINDIR}") set(includedir "${CMAKE_INSTALL_INCLUDEDIR}") set(libdir "${CMAKE_INSTALL_LIBDIR}") set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc") string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name) file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n") file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n") file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}\n") file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/${includedir}\n") file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n") file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n") file( APPEND "${pkgconfig_file}" "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n") file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n") file(APPEND "${pkgconfig_file}" "Requires:") if(CONFIG_TUNE_VMAF) file(APPEND "${pkgconfig_file}" " libvmaf") endif() if(CONFIG_TUNE_BUTTERAUGLI) file(APPEND "${pkgconfig_file}" " libjxl") endif() file(APPEND "${pkgconfig_file}" "\nConflicts:\n") file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n") file(APPEND "${pkgconfig_file}" "Libs.private:") if(NOT WIN32 AND NOT APPLE) file(APPEND "${pkgconfig_file}" " -lm") endif() if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT) file(APPEND "${pkgconfig_file}" " ${CMAKE_THREAD_LIBS_INIT}") endif() file(APPEND "${pkgconfig_file}" "\nCflags: -I\${includedir}\n") aom-3.12.1/build/cmake/rtcd.pl000077500000000000000000000241301477627663500160540ustar00rootroot00000000000000#!/usr/bin/env perl ## ## Copyright (c) 2017, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## no strict 'refs'; use warnings; use Getopt::Long; Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32; my %ALL_FUNCS = (); my @ALL_ARCHS; my @ALL_FORWARD_DECLS; my @REQUIRES; my %opts = (); my %disabled = (); my %required = (); my @argv; foreach (@ARGV) { $disabled{$1} = 1, next if /--disable-(.*)/; $required{$1} = 1, next if /--require-(.*)/; push @argv, $_; } # NB: use GetOptions() instead of GetOptionsFromArray() for compatibility. @ARGV = @argv; GetOptions( \%opts, 'arch=s', 'sym=s', 'config=s', ); foreach my $opt (qw/arch config/) { if (!defined($opts{$opt})) { warn "--$opt is required!\n"; Getopt::Long::HelpMessage('-exit' => 1); } } foreach my $defs_file (@ARGV) { if (!-f $defs_file) { warn "$defs_file: $!\n"; Getopt::Long::HelpMessage('-exit' => 1); } } open CONFIG_FILE, $opts{config} or die "Error opening config file '$opts{config}': $!\n"; my %config = (); while () { # TODO(aomedia:349428506,349436249,349450845,349455146): remove AOM_ARCH_ # after armv7 SIGBUS issues are fixed. 
next if !/^#define\s+(?:AOM_ARCH_|CONFIG_|HAVE_)/; chomp; my @line_components = split /\s/; scalar @line_components > 2 or die "Invalid input passed to rtcd.pl via $opts{config}."; # $line_components[0] = #define # $line_components[1] = flag name ({AOM_ARCH,CONFIG,HAVE}_SOMETHING) # $line_components[2] = flag value (0 or 1) $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : ""; } close CONFIG_FILE; # # Routines for the RTCD DSL to call # sub aom_config($) { return (defined $config{$_[0]}) ? $config{$_[0]} : ""; } sub specialize { if (@_ <= 1) { die "'specialize' must be called with a function name and at least one ", "architecture ('C' is implied): \n@_\n"; } my $fn=$_[0]; shift; foreach my $opt (@_) { eval "\$${fn}_${opt}=${fn}_${opt}"; } } sub add_proto { my $fn = splice(@_, -2, 1); my @proto = @_; foreach (@proto) { tr/\t/ / } $ALL_FUNCS{$fn} = \@proto; specialize $fn, "c"; } sub require { foreach my $fn (keys %ALL_FUNCS) { foreach my $opt (@_) { my $ofn = eval "\$${fn}_${opt}"; next if !$ofn; # if we already have a default, then we can disable it, as we know # we can do better. my $best = eval "\$${fn}_default"; if ($best) { my $best_ofn = eval "\$${best}"; if ($best_ofn && "$best_ofn" ne "$ofn") { eval "\$${best}_link = 'false'"; } } eval "\$${fn}_default=${fn}_${opt}"; eval "\$${fn}_${opt}_link='true'"; } } } sub forward_decls { push @ALL_FORWARD_DECLS, @_; } # # Include the user's directives # foreach my $f (@ARGV) { open FILE, "<", $f or die "cannot open $f: $!\n"; my $contents = join('', ); close FILE; eval $contents or warn "eval failed: $@\n"; } # # Process the directives according to the command line # sub process_forward_decls() { foreach (@ALL_FORWARD_DECLS) { $_->(); } } sub determine_indirection { aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS); foreach my $fn (keys %ALL_FUNCS) { my $n = ""; my @val = @{$ALL_FUNCS{$fn}}; my $args = pop @val; my $rtyp = "@val"; my $dfn = eval "\$${fn}_default"; $dfn = eval "\$${dfn}"; foreach my $opt (@_) { my $ofn = eval "\$${fn}_${opt}"; next if !$ofn; my $link = eval "\$${fn}_${opt}_link"; next if $link && $link eq "false"; $n .= "x"; } if ($n eq "x") { eval "\$${fn}_indirect = 'false'"; } else { eval "\$${fn}_indirect = 'true'"; } } } sub declare_function_pointers { foreach my $fn (sort keys %ALL_FUNCS) { my @val = @{$ALL_FUNCS{$fn}}; my $args = pop @val; my $rtyp = "@val"; my $dfn = eval "\$${fn}_default"; $dfn = eval "\$${dfn}"; foreach my $opt (@_) { my $ofn = eval "\$${fn}_${opt}"; next if !$ofn; print "$rtyp ${ofn}($args);\n"; } if (eval "\$${fn}_indirect" eq "false") { print "#define ${fn} ${dfn}\n"; } else { print "RTCD_EXTERN $rtyp (*${fn})($args);\n"; } print "\n"; } } sub set_function_pointers { foreach my $fn (sort keys %ALL_FUNCS) { my @val = @{$ALL_FUNCS{$fn}}; my $args = pop @val; my $rtyp = "@val"; my $dfn = eval "\$${fn}_default"; $dfn = eval "\$${dfn}"; if (eval "\$${fn}_indirect" eq "true") { print " $fn = $dfn;\n"; foreach my $opt (@_) { my $ofn = eval "\$${fn}_${opt}"; next if !$ofn; next if "$ofn" eq "$dfn"; my $link = eval "\$${fn}_${opt}_link"; next if $link && $link eq "false"; my $cond = eval "\$have_${opt}"; print " if (${cond}) $fn = $ofn;\n" } } } } sub filter { my @filtered; foreach (@_) { push @filtered, $_ unless $disabled{$_}; } return @filtered; } # # Helper functions for generating the arch specific RTCD files # sub common_top() { my $include_guard = uc($opts{sym})."_H_"; my @time = localtime; my $year = $time[5] + 1900; print < \$version_data, 
'version_filename=s' => \$version_filename) or die("Invalid arg(s): $!"); if (!defined $version_data || length($version_data) == 0 || !defined $version_filename || length($version_filename) == 0) { die("--version_data and --version_filename are required."); } # Determine if $version_data is a filename or a git tag/description. my $version_string; chomp($version_data); if (-r $version_data) { # $version_data is the path to the CHANGELOG. Parse the most recent version. my $changelog_filename = $version_data; open(my $changelog_file, '<', $changelog_filename) or die("Unable to open CHANGELOG @ $changelog_filename: $!."); while (my $line = <$changelog_file>) { my @split_line = split(" ", $line, 3); next if @split_line < 2; $version_string = $split_line[1]; last if substr($version_string, 0, 1) eq "v"; } close($changelog_file); } else { # $version_data is either a tag name or a full git description, one of: # tagName OR tagName-commitsSinceTag-shortCommitHash # In either case we want the first element of the array returned by split. $version_string = (split("-", $version_data))[0]; $git_desc = $version_data; } if (substr($version_string, 0, 1) eq "v") { $version_string = substr($version_string, 1); } my @version_components = split('\.', $version_string, 4); my $version_major = $version_components[0]; my $version_minor = $version_components[1]; my $version_patch = $version_components[2]; my $version_extra = ""; if (length($git_desc) > 0) { my @git_desc_components = split('-', $git_desc, 2); if (@git_desc_components > 1) { $version_extra = $git_desc_components[1]; } } open(my $version_file, '>', $version_filename) or die("Cannot open $version_filename: $!"); my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))"; my $year = (localtime)[5] + 1900; my $lic_block = << "EOF"; /* * Copyright (c) $year, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ EOF select $version_file; if (length($git_desc)) { print << "EOF"; $lic_block #ifndef AOM_VERSION_H_ #define AOM_VERSION_H_ #define VERSION_MAJOR $version_major #define VERSION_MINOR $version_minor #define VERSION_PATCH $version_patch #define VERSION_EXTRA \"$version_extra\" #define VERSION_PACKED \\ $version_packed #define VERSION_STRING_NOSP \"$git_desc\" #define VERSION_STRING \" $git_desc\" #endif // AOM_VERSION_H_ EOF } else { print << "EOF"; $lic_block #ifndef AOM_VERSION_H_ #define AOM_VERSION_H_ #define VERSION_MAJOR $version_major #define VERSION_MINOR $version_minor #define VERSION_PATCH $version_patch #define VERSION_EXTRA \"$version_extra\" #define VERSION_PACKED \\ $version_packed #define VERSION_STRING_NOSP \"v$version_string\" #define VERSION_STRING \" v$version_string\" #endif // AOM_VERSION_H_ EOF } close($version_file); aom-3.12.1/codereview.settings000066400000000000000000000002511477627663500163150ustar00rootroot00000000000000# This file is used by git cl to get repository specific information. 
GERRIT_HOST: True CODE_REVIEW_SERVER: aomedia-review.googlesource.com GERRIT_SQUASH_UPLOADS: False aom-3.12.1/common/000077500000000000000000000000001477627663500136715ustar00rootroot00000000000000aom-3.12.1/common/args.c000066400000000000000000000163521477627663500150000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "common/args.h" #include #include #include #include #include "aom/aom_integer.h" #include "aom/aom_codec.h" #include "common/tools_common.h" static const char kSbSizeWarningString[] = "super_block_size has to be 64 or 128."; static const char kMinpartWarningString[] = "min_partition_size has to be smaller or equal to max_partition_size."; static const char kMaxpartWarningString[] = "max_partition_size has to be smaller or equal to super_block_size."; static char *ignore_front_spaces(const char *str) { while (str[0] == ' ' || str[0] == '\t') ++str; return (char *)str; } static void ignore_end_spaces(char *str) { char *end = str + strlen(str); while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' || end[0] == '\r' || end[0] == '\0')) --end; if (end >= str) end[1] = '\0'; } int parse_cfg(const char *file, cfg_options_t *config) { char line[1024 * 10]; FILE *f = fopen(file, "r"); if (!f) return 1; #define GET_PARAMS(field) \ if (strcmp(left, #field) == 0) { \ config->field = atoi(right); \ continue; \ } while (fgets(line, sizeof(line) - 1, f)) { char *actual_line = ignore_front_spaces(line); char *left, *right, *comment; size_t length = strlen(actual_line); if (length == 0 || actual_line[0] == '#') continue; right = strchr(actual_line, '='); if (right == NULL) continue; right[0] = '\0'; left = ignore_front_spaces(actual_line); right = ignore_front_spaces(right + 1); comment = strchr(right, '#'); if (comment != NULL) comment[0] = '\0'; ignore_end_spaces(left); ignore_end_spaces(right); GET_PARAMS(super_block_size) GET_PARAMS(max_partition_size) GET_PARAMS(min_partition_size) GET_PARAMS(disable_ab_partition_type) GET_PARAMS(disable_rect_partition_type) GET_PARAMS(disable_1to4_partition_type) GET_PARAMS(disable_flip_idtx) GET_PARAMS(disable_cdef) GET_PARAMS(disable_lr) GET_PARAMS(disable_obmc) GET_PARAMS(disable_warp_motion) GET_PARAMS(disable_global_motion) GET_PARAMS(disable_dist_wtd_comp) GET_PARAMS(disable_diff_wtd_comp) GET_PARAMS(disable_inter_intra_comp) GET_PARAMS(disable_masked_comp) GET_PARAMS(disable_one_sided_comp) GET_PARAMS(disable_palette) GET_PARAMS(disable_intrabc) GET_PARAMS(disable_cfl) GET_PARAMS(disable_smooth_intra) GET_PARAMS(disable_filter_intra) GET_PARAMS(disable_dual_filter) GET_PARAMS(disable_intra_angle_delta) GET_PARAMS(disable_intra_edge_filter) GET_PARAMS(disable_tx_64x64) GET_PARAMS(disable_smooth_inter_intra) GET_PARAMS(disable_inter_inter_wedge) GET_PARAMS(disable_inter_intra_wedge) GET_PARAMS(disable_paeth_intra) GET_PARAMS(disable_trellis_quant) GET_PARAMS(disable_ref_frame_mv) GET_PARAMS(reduced_reference_set) GET_PARAMS(reduced_tx_type_set) fprintf(stderr, "\nInvalid parameter: %s", left); exit(-1); } if 
(config->super_block_size != 128 && config->super_block_size != 64) { fprintf(stderr, "\n%s", kSbSizeWarningString); exit(-1); } if (config->min_partition_size > config->max_partition_size) { fprintf(stderr, "\n%s", kMinpartWarningString); exit(-1); } if (config->max_partition_size > config->super_block_size) { fprintf(stderr, "\n%s", kMaxpartWarningString); exit(-1); } fclose(f); config->init_by_cfg_file = 1; return 0; } int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { char err_msg[ARG_ERR_MSG_MAX_LEN]; int ret = arg_match_helper(arg_, def, argv, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } const char *arg_next(struct arg *arg) { if (arg->argv[0]) arg->argv += arg->argv_step; return *arg->argv; } char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); if (!new_argv) return NULL; memcpy(new_argv, argv, argc * sizeof(*argv)); new_argv[argc] = NULL; return new_argv; } void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { for (; *defs; defs++) { const struct arg_def *def = *defs; char *short_val = def->has_val ? " " : ""; char *long_val = def->has_val ? "=" : ""; int n = 0; // Short options are indented with two spaces. Long options are indented // with 12 spaces. if (def->short_name && def->long_name) { char *comma = def->has_val ? "," : ", "; n = fprintf(fp, " -%s%s%s --%s%s", def->short_name, short_val, comma, def->long_name, long_val); } else if (def->short_name) n = fprintf(fp, " -%s%s", def->short_name, short_val); else if (def->long_name) n = fprintf(fp, " --%s%s", def->long_name, long_val); // Descriptions are indented with 40 spaces. If an option is 40 characters // or longer, its description starts on the next line. if (n < 40) for (int i = 0; i < 40 - n; i++) fputc(' ', fp); else fputs("\n ", fp); fprintf(fp, "%s\n", def->desc); if (def->enums) { const struct arg_enum_list *listptr; fprintf(fp, " %-37s\t ", ""); for (listptr = def->enums; listptr->name; listptr++) fprintf(fp, "%s%s", listptr->name, listptr[1].name ? 
", " : "\n"); } } } unsigned int arg_parse_uint(const struct arg *arg) { char err_msg[ARG_ERR_MSG_MAX_LEN]; unsigned int ret = arg_parse_uint_helper(arg, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } int arg_parse_int(const struct arg *arg) { char err_msg[ARG_ERR_MSG_MAX_LEN]; int ret = arg_parse_int_helper(arg, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } struct aom_rational arg_parse_rational(const struct arg *arg) { char err_msg[ARG_ERR_MSG_MAX_LEN]; struct aom_rational ret = arg_parse_rational_helper(arg, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } int arg_parse_enum(const struct arg *arg) { char err_msg[ARG_ERR_MSG_MAX_LEN]; int ret = arg_parse_enum_helper(arg, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } int arg_parse_enum_or_int(const struct arg *arg) { char err_msg[ARG_ERR_MSG_MAX_LEN]; int ret = arg_parse_enum_or_int_helper(arg, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } // parse a comma separated list of at most n integers // return the number of elements in the list int arg_parse_list(const struct arg *arg, int *list, int n) { char err_msg[ARG_ERR_MSG_MAX_LEN]; int ret = arg_parse_list_helper(arg, list, n, err_msg); if (err_msg[0] != '\0') { die("%s", err_msg); } return ret; } aom-3.12.1/common/args.h000066400000000000000000000025701477627663500150020ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_ARGS_H_ #define AOM_COMMON_ARGS_H_ #include #include "aom/aom_codec.h" #include "aom/aom_encoder.h" #include "common/args_helper.h" #ifdef __cplusplus extern "C" { #endif int arg_match(struct arg *arg_, const struct arg_def *def, char **argv); int parse_cfg(const char *file, cfg_options_t *config); const char *arg_next(struct arg *arg); void arg_show_usage(FILE *fp, const struct arg_def *const *defs); char **argv_dup(int argc, const char **argv); unsigned int arg_parse_uint(const struct arg *arg); int arg_parse_int(const struct arg *arg); struct aom_rational arg_parse_rational(const struct arg *arg); int arg_parse_enum(const struct arg *arg); int arg_parse_enum_or_int(const struct arg *arg); int arg_parse_list(const struct arg *arg, int *list, int n); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_ARGS_H_ aom-3.12.1/common/args_helper.c000066400000000000000000000145061477627663500163360ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "common/args_helper.h" #include #include #include #include #include #define SET_ERR_STRING(...) \ if (err_msg) snprintf(err_msg, ARG_ERR_MSG_MAX_LEN, __VA_ARGS__) static struct arg arg_init(char **argv) { struct arg a; a.argv = argv; a.argv_step = 1; a.name = NULL; a.val = NULL; a.def = NULL; return a; } int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv, char *err_msg) { struct arg arg; if (err_msg) err_msg[0] = '\0'; assert(def->has_val == 0 || def->has_val == 1 || def->has_val == -1); if (!argv[0] || argv[0][0] != '-') return 0; arg = arg_init(argv); if (def->short_name && !strcmp(arg.argv[0] + 1, def->short_name)) { arg.name = arg.argv[0] + 1; arg.val = def->has_val ? arg.argv[1] : NULL; arg.argv_step = def->has_val ? 2 : 1; } else if (def->long_name) { const size_t name_len = strlen(def->long_name); if (arg.argv[0][1] == '-' && !strncmp(arg.argv[0] + 2, def->long_name, name_len) && (arg.argv[0][name_len + 2] == '=' || arg.argv[0][name_len + 2] == '\0')) { arg.name = arg.argv[0] + 2; arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; arg.argv_step = 1; } } if (arg.name) { if (def->has_val == -1) { arg.def = def; *arg_ = arg; return 1; } if (!arg.val && def->has_val) { SET_ERR_STRING("Error: option %s requires argument.\n", arg.name); return 0; } if (arg.val && !def->has_val) { SET_ERR_STRING("Error: option %s requires no argument.\n", arg.name); return 0; } arg.def = def; *arg_ = arg; return 1; } return 0; } unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg) { char *endptr; const unsigned long rawval = strtoul(arg->val, &endptr, 10); // NOLINT if (err_msg) err_msg[0] = '\0'; if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval <= UINT_MAX) return (unsigned int)rawval; SET_ERR_STRING("Option %s: Value %lu out of range for unsigned int\n", arg->name, rawval); return 0; } SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } int arg_parse_int_helper(const struct arg *arg, char *err_msg) { char *endptr; const long rawval = strtol(arg->val, &endptr, 10); // NOLINT if (err_msg) err_msg[0] = '\0'; if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", arg->name, rawval); return 0; } SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } struct aom_rational arg_parse_rational_helper(const struct arg *arg, char *err_msg) { long rawval; // NOLINT char *endptr; struct aom_rational rat = { 0, 1 }; if (err_msg) err_msg[0] = '\0'; /* parse numerator */ rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '/') { if (rawval >= INT_MIN && rawval <= INT_MAX) { rat.num = (int)rawval; } else { SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", arg->name, rawval); return rat; } } else { SET_ERR_STRING("Option %s: Expected / at '%c'\n", arg->name, *endptr); return rat; } /* parse denominator */ rawval = strtol(endptr + 1, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) { rat.den = (int)rawval; } else { SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", arg->name, rawval); return rat; } } else { SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); return rat; } return rat; } int arg_parse_enum_helper(const struct arg *arg, char *err_msg) { const struct arg_enum_list *listptr; long rawval; // 
NOLINT char *endptr; if (err_msg) err_msg[0] = '\0'; /* First see if the value can be parsed as a raw value */ rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { /* Got a raw value, make sure it's valid */ for (listptr = arg->def->enums; listptr->name; listptr++) if (listptr->val == rawval) return (int)rawval; } /* Next see if it can be parsed as a string */ for (listptr = arg->def->enums; listptr->name; listptr++) if (!strcmp(arg->val, listptr->name)) return listptr->val; SET_ERR_STRING("Option %s: Invalid value '%s'\n", arg->name, arg->val); return 0; } int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg) { if (arg->def->enums) return arg_parse_enum_helper(arg, err_msg); return arg_parse_int_helper(arg, err_msg); } // parse a comma separated list of at most n integers // return the number of elements in the list int arg_parse_list_helper(const struct arg *arg, int *list, int n, char *err_msg) { const char *ptr = arg->val; char *endptr; int i = 0; if (err_msg) err_msg[0] = '\0'; while (ptr[0] != '\0') { long rawval = strtol(ptr, &endptr, 10); // NOLINT if (rawval < INT_MIN || rawval > INT_MAX) { SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", arg->name, rawval); return 0; } else if (i >= n) { SET_ERR_STRING("Option %s: List has more than %d entries\n", arg->name, n); return 0; } else if (*endptr == ',') { endptr++; } else if (*endptr != '\0') { SET_ERR_STRING("Option %s: Bad list separator '%c'\n", arg->name, *endptr); return 0; } list[i++] = (int)rawval; ptr = endptr; } return i; } aom-3.12.1/common/args_helper.h000066400000000000000000000050161477627663500163370ustar00rootroot00000000000000/* * Copyright (c) 2020, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_ARGS_HELPER_H_ #define AOM_COMMON_ARGS_HELPER_H_ #include "aom/aom_encoder.h" #ifdef __cplusplus extern "C" { #endif // Maximum length of the error messages for the helper functions. #define ARG_ERR_MSG_MAX_LEN 200 struct arg { char **argv; const char *name; const char *val; unsigned int argv_step; const struct arg_def *def; }; struct arg_enum_list { const char *name; int val; }; #define ARG_ENUM_LIST_END \ { 0 } typedef struct arg_def { const char *short_name; const char *long_name; int has_val; // 0: The argument must not have a value. // 1: The argument must have a value. // -1: The argument may or may not have a value. const char *desc; const struct arg_enum_list *enums; } arg_def_t; #define ARG_DEF(s, l, v, d) \ { s, l, v, d, NULL } #define ARG_DEF_ENUM(s, l, v, d, e) \ { s, l, v, d, e } #define ARG_DEF_LIST_END \ { 0 } /* * The helper functions below all take an optional parameter err_msg for * error reporting. When err_msg is not NULL (must point to a buffer * which is at least ARG_ERR_MSG_MAX_LEN bytes long), a related error message is * stored in it if an error occurs. It will be set to an empty string if no * error occurs. 
*/ int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv, char *err_msg); // Note: arg_match_helper() must be called before invoking these functions. unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg); int arg_parse_int_helper(const struct arg *arg, char *err_msg); struct aom_rational arg_parse_rational_helper(const struct arg *arg, char *err_msg); int arg_parse_enum_helper(const struct arg *arg, char *err_msg); int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg); int arg_parse_list_helper(const struct arg *arg, int *list, int n, char *err_msg); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_ARGS_HELPER_H_ aom-3.12.1/common/av1_config.c000066400000000000000000000442331477627663500160570ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom_dsp/bitreader_buffer.h" #include "av1/common/obu_util.h" #include "common/av1_config.h" #include "config/aom_config.h" // Helper macros to reduce verbosity required to check for read errors. // // Note that when using these macros, even single line if statements should use // curly braces to avoid unexpected behavior because all but the // AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements. #define AV1C_READ_BIT_OR_RETURN_ERROR(field) \ int field = 0; \ do { \ field = aom_rb_read_bit(reader); \ if (result == -1) { \ fprintf(stderr, \ "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \ field, result); \ return -1; \ } \ } while (0) #define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \ int field = 0; \ do { \ field = aom_rb_read_literal(reader, (length)); \ if (result == -1) { \ fprintf(stderr, \ "av1c: Could not read bits for " #field \ ", value=%d result=%d.\n", \ field, result); \ return -1; \ } \ } while (0) // Helper macros for setting/restoring the error handler data in // aom_read_bit_buffer. 
#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \ void *original_error_handler_data = NULL; \ do { \ original_error_handler_data = reader->error_handler_data; \ reader->error_handler_data = &new_data; \ } while (0) #define AV1C_POP_ERROR_HANDLER_DATA() \ do { \ reader->error_handler_data = original_error_handler_data; \ } while (0) static const size_t kAv1cSize = 4; static void bitreader_error_handler(void *data) { int *error_val = (int *)data; *error_val = -1; } // Parse the AV1 timing_info() structure: // timing_info( ) { // num_units_in_display_tick f(32) // time_scale f(32) // equal_picture_interval f(1) // if (equal_picture_interval) // num_ticks_per_picture_minus_1 uvlc() // } static int parse_timing_info(struct aom_read_bit_buffer *reader) { int result = 0; AV1C_PUSH_ERROR_HANDLER_DATA(result); AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32); AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32); AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval); if (equal_picture_interval) { uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader); if (result == -1) { fprintf(stderr, "av1c: Could not read bits for " "num_ticks_per_picture_minus_1, value=%u.\n", num_ticks_per_picture_minus_1); return result; } } AV1C_POP_ERROR_HANDLER_DATA(); return result; } // Parse the AV1 decoder_model_info() structure: // decoder_model_info( ) { // buffer_delay_length_minus_1 f(5) // num_units_in_decoding_tick f(32) // buffer_removal_time_length_minus_1 f(5) // frame_presentation_time_length_minus_1 f(5) // } // // Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1. static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) { int result = 0; AV1C_PUSH_ERROR_HANDLER_DATA(result); AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5); AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32); AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5); AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5); AV1C_POP_ERROR_HANDLER_DATA(); return buffer_delay_length_minus_1 + 1; } // Parse the AV1 operating_parameters_info() structure: // operating_parameters_info( op ) { // n = buffer_delay_length_minus_1 + 1 // decoder_buffer_delay[ op ] f(n) // encoder_buffer_delay[ op ] f(n) // low_delay_mode_flag[ op ] f(1) // } static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader, int buffer_delay_length_minus_1) { int result = 0; AV1C_PUSH_ERROR_HANDLER_DATA(result); const int buffer_delay_length = buffer_delay_length_minus_1 + 1; AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length); AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length); AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag); AV1C_POP_ERROR_HANDLER_DATA(); return result; } // Parse the AV1 color_config() structure..See: // https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44 static int parse_color_config(struct aom_read_bit_buffer *reader, Av1Config *config) { int result = 0; AV1C_PUSH_ERROR_HANDLER_DATA(result); AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); config->high_bitdepth = high_bitdepth; int bit_depth = 0; if (config->seq_profile == 2 && config->high_bitdepth) { AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); config->twelve_bit = twelve_bit; bit_depth = config->twelve_bit ? 12 : 10; } else { bit_depth = config->high_bitdepth ? 
10 : 8; } if (config->seq_profile != 1) { AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome); config->monochrome = mono_chrome; } int color_primaries = AOM_CICP_CP_UNSPECIFIED; int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag); if (color_description_present_flag) { AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8); color_primaries = color_primaries_val; AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8); transfer_characteristics = transfer_characteristics_val; AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8); matrix_coefficients = matrix_coefficients_val; } if (config->monochrome) { AV1C_READ_BIT_OR_RETURN_ERROR(color_range); config->chroma_subsampling_x = 1; config->chroma_subsampling_y = 1; } else if (color_primaries == AOM_CICP_CP_BT_709 && transfer_characteristics == AOM_CICP_TC_SRGB && matrix_coefficients == AOM_CICP_MC_IDENTITY) { config->chroma_subsampling_x = 0; config->chroma_subsampling_y = 0; } else { AV1C_READ_BIT_OR_RETURN_ERROR(color_range); if (config->seq_profile == 0) { config->chroma_subsampling_x = 1; config->chroma_subsampling_y = 1; } else if (config->seq_profile == 1) { config->chroma_subsampling_x = 0; config->chroma_subsampling_y = 0; } else { if (bit_depth == 12) { AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x); config->chroma_subsampling_x = subsampling_x; if (subsampling_x) { AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y); config->chroma_subsampling_y = subsampling_y; } else { config->chroma_subsampling_y = 0; } } else { config->chroma_subsampling_x = 1; config->chroma_subsampling_y = 0; } } if (config->chroma_subsampling_x && config->chroma_subsampling_y) { AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); config->chroma_sample_position = chroma_sample_position; } } if (!config->monochrome) { AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q); } AV1C_POP_ERROR_HANDLER_DATA(); return result; } // Parse AV1 Sequence Header OBU. See: // https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41 static int parse_sequence_header(const uint8_t *const buffer, size_t length, Av1Config *config) { int result = 0; // The reader instance is local to this function, but a pointer to the // reader instance is used within this function and throughout this file to // allow use of the helper macros that reduce parse error checking verbosity. 
struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0, &result, bitreader_error_handler }; struct aom_read_bit_buffer *reader = &reader_instance; AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); config->seq_profile = seq_profile; AV1C_READ_BIT_OR_RETURN_ERROR(still_picture); AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header); if (reduced_still_picture_header) { config->initial_presentation_delay_present = 0; AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); config->seq_level_idx_0 = seq_level_idx_0; config->seq_tier_0 = 0; } else { int has_decoder_model = 0; int buffer_delay_length = 0; AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag); if (timing_info_present_flag) { if (parse_timing_info(reader) != 0) return -1; AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag); if (decoder_model_info_present_flag && (buffer_delay_length = parse_decoder_model_info(reader)) == -1) { return -1; } has_decoder_model = 1; } AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); config->initial_presentation_delay_present = initial_presentation_delay_present; AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5); const int num_operating_points = operating_points_cnt_minus_1 + 1; for (int op_index = 0; op_index < num_operating_points; ++op_index) { AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12); AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5); int seq_tier = 0; if (seq_level_idx > 7) { AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op); seq_tier = seq_tier_this_op; } if (has_decoder_model) { AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op); if (decoder_model_present_for_op) { if (parse_operating_parameters_info(reader, buffer_delay_length) == -1) { return -1; } } } if (config->initial_presentation_delay_present) { // Skip the initial presentation delay bits if present since this // function has no access to the data required to properly set the // field. AV1C_READ_BIT_OR_RETURN_ERROR( initial_presentation_delay_present_for_this_op); if (initial_presentation_delay_present_for_this_op) { AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4); } } if (op_index == 0) { // Av1Config needs only the values from the first operating point. 
config->seq_level_idx_0 = seq_level_idx; config->seq_tier_0 = seq_tier; config->initial_presentation_delay_present = 0; config->initial_presentation_delay_minus_one = 0; } } } AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4); AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4); AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1, frame_width_bits_minus_1 + 1); AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1, frame_height_bits_minus_1 + 1); uint8_t frame_id_numbers_present = 0; if (!reduced_still_picture_header) { AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag); frame_id_numbers_present = frame_id_numbers_present_flag; } if (frame_id_numbers_present) { AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4); AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3); } AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock); AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra); AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter); if (!reduced_still_picture_header) { AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound); AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound); AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion); AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter); AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint); if (enable_order_hint) { AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp); AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs); } const int SELECT_SCREEN_CONTENT_TOOLS = 2; int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS; AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools); if (!seq_choose_screen_content_tools) { AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val); seq_force_screen_content_tools = seq_force_screen_content_tools_val; } if (seq_force_screen_content_tools > 0) { AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv); if (!seq_choose_integer_mv) { AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv); } } if (enable_order_hint) { AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3); } } AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres); AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef); AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration); if (parse_color_config(reader, config) != 0) { fprintf(stderr, "av1c: color_config() parse failed.\n"); return -1; } AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present); return 0; } int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, Av1Config *config) { if (!buffer || length == 0 || !config) { return -1; } ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); size_t sequence_header_length = 0; size_t obu_header_length = 0; if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header, &sequence_header_length, &obu_header_length) != AOM_CODEC_OK || obu_header.type != OBU_SEQUENCE_HEADER || sequence_header_length + obu_header_length > length) { return -1; } memset(config, 0, sizeof(*config)); config->marker = 1; config->version = 1; return parse_sequence_header(buffer + obu_header_length, sequence_header_length, config); } int read_av1config(const uint8_t *buffer, size_t buffer_length, size_t *bytes_read, Av1Config *config) { if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1; *bytes_read = 0; int result = 0; struct aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length, 0, &result, bitreader_error_handler }; struct aom_read_bit_buffer *reader = &reader_instance; memset(config, 0, sizeof(*config)); 
AV1C_READ_BIT_OR_RETURN_ERROR(marker); config->marker = marker; AV1C_READ_BITS_OR_RETURN_ERROR(version, 7); config->version = version; AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); config->seq_profile = seq_profile; AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); config->seq_level_idx_0 = seq_level_idx_0; AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0); config->seq_tier_0 = seq_tier_0; AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); config->high_bitdepth = high_bitdepth; AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); config->twelve_bit = twelve_bit; AV1C_READ_BIT_OR_RETURN_ERROR(monochrome); config->monochrome = monochrome; AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x); config->chroma_subsampling_x = chroma_subsampling_x; AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y); config->chroma_subsampling_y = chroma_subsampling_y; AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); config->chroma_sample_position = chroma_sample_position; AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3); AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); config->initial_presentation_delay_present = initial_presentation_delay_present; AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4); config->initial_presentation_delay_minus_one = initial_presentation_delay_minus_one; *bytes_read = aom_rb_bytes_read(reader); return 0; } int write_av1config(const Av1Config *config, size_t capacity, size_t *bytes_written, uint8_t *buffer) { if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1; buffer[0] = (config->marker << 7) | config->version; buffer[1] = (config->seq_profile << 5) | config->seq_level_idx_0; buffer[2] = (config->seq_tier_0 << 7) | (config->high_bitdepth << 6) | (config->twelve_bit << 5) | (config->monochrome << 4) | (config->chroma_subsampling_x << 3) | (config->chroma_subsampling_y << 2) | config->chroma_sample_position; buffer[3] = config->initial_presentation_delay_present << 4; if (config->initial_presentation_delay_present) { buffer[3] |= config->initial_presentation_delay_minus_one; } *bytes_written = kAv1cSize; return 0; } #undef AV1C_READ_BIT_OR_RETURN_ERROR #undef AV1C_READ_BITS_OR_RETURN_ERROR #undef AV1C_PUSH_ERROR_HANDLER_DATA #undef AV1C_POP_ERROR_HANDLER_DATA aom-3.12.1/common/av1_config.h000066400000000000000000000060221477627663500160560ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_AV1_CONFIG_H_ #define AOM_COMMON_AV1_CONFIG_H_ #include "aom/aom_integer.h" #ifdef __cplusplus extern "C" { #endif // Struct representing ISOBMFF/Matroska AV1 config. 
See: // https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax // // The AV1 config has the following format: // // unsigned int (1) marker = 1; // unsigned int (7) version = 1; // unsigned int (3) seq_profile; // unsigned int (5) seq_level_idx_0; // unsigned int (1) seq_tier_0; // unsigned int (1) high_bitdepth; // unsigned int (1) twelve_bit; // unsigned int (1) monochrome; // unsigned int (1) chroma_subsampling_x; // unsigned int (1) chroma_subsampling_y; // unsigned int (2) chroma_sample_position; // unsigned int (3) reserved = 0; // // unsigned int (1) initial_presentation_delay_present; // if (initial_presentation_delay_present) { // unsigned int (4) initial_presentation_delay_minus_one; // } else { // unsigned int (4) reserved = 0; // } // // unsigned int (8)[] configOBUs; // // Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so // the field is omitted. typedef struct _Av1Config { uint8_t marker; uint8_t version; uint8_t seq_profile; uint8_t seq_level_idx_0; uint8_t seq_tier_0; uint8_t high_bitdepth; uint8_t twelve_bit; uint8_t monochrome; uint8_t chroma_subsampling_x; uint8_t chroma_subsampling_y; uint8_t chroma_sample_position; uint8_t initial_presentation_delay_present; uint8_t initial_presentation_delay_minus_one; } Av1Config; // Attempts to parse a Sequence Header OBU and set the parameters of 'config'. // Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple // OBUs, but the Sequence Header OBU must be the first OBU within the buffer. int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, Av1Config *config); // Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success. // Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or // when parsing of 'buffer' fails. int read_av1config(const uint8_t *buffer, size_t buffer_length, size_t *bytes_read, Av1Config *config); // Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'. // Returns -1 when passed NULL pointers or when 'capacity' is insufficient. int write_av1config(const Av1Config *config, size_t capacity, size_t *bytes_written, uint8_t *buffer); #ifdef __cplusplus } /* extern "C" */ #endif #endif // AOM_COMMON_AV1_CONFIG_H_ aom-3.12.1/common/ivf_dec.cmake000066400000000000000000000022341477627663500162730ustar00rootroot00000000000000# # Copyright (c) 2021, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and the # Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was # not distributed with this source code in the LICENSE file, you can obtain it # at www.aomedia.org/license/software. If the Alliance for Open Media Patent # License 1.0 was not distributed with this source code in the PATENTS file, you # can obtain it at www.aomedia.org/license/patent. # if(AOM_COMMON_IVF_DEC_CMAKE_) return() endif() # AOM_COMMON_IVF_DEC_CMAKE_ set(AOM_COMMON_IVF_DEC_CMAKE_ 1) list(APPEND IVF_DEC_SOURCES "${AOM_ROOT}/common/ivfdec.c" "${AOM_ROOT}/common/ivfdec.h") # Creates the ivf_dec build target and makes libaom depend on it. The libaom # target must exist before this function is called.
function(setup_ivf_dec_targets) add_library(ivf_dec OBJECT ${IVF_DEC_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} ivf_dec PARENT_SCOPE) target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) endif() endfunction() aom-3.12.1/common/ivfdec.c000066400000000000000000000065361477627663500153070ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "common/ivfdec.h" #include #include #include #include "aom_ports/mem_ops.h" #include "aom_ports/sanitizer.h" #include "tools_common.h" static const char *IVF_SIGNATURE = "DKIF"; static void fix_framerate(int *num, int *den) { if (*den <= 0 || *den >= 1000000000 || *num <= 0 || *num >= 1000) { // framerate seems to be invalid, just default to 30fps. *num = 30; *den = 1; } } int file_is_ivf(struct AvxInputContext *input_ctx) { unsigned char raw_hdr[32]; int is_ivf = 0; if (buffer_input(input_ctx, 32, raw_hdr, /*buffered=*/true) == 32) { if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) { is_ivf = 1; if (mem_get_le16(raw_hdr + 4) != 0) { fprintf(stderr, "Error: Unrecognized IVF version! This file may not" " decode properly.\n"); } input_ctx->fourcc = mem_get_le32(raw_hdr + 8); input_ctx->width = mem_get_le16(raw_hdr + 12); input_ctx->height = mem_get_le16(raw_hdr + 14); input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16); input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20); fix_framerate(&input_ctx->framerate.numerator, &input_ctx->framerate.denominator); } } if (!is_ivf) { rewind_detect(input_ctx); } return is_ivf; } int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size, aom_codec_pts_t *pts) { unsigned char raw_header[IVF_FRAME_HDR_SZ] = { 0 }; size_t frame_size = 0; if (read_from_input(input_ctx, IVF_FRAME_HDR_SZ, raw_header) != IVF_FRAME_HDR_SZ) { if (!input_eof(input_ctx)) fprintf(stderr, "Warning: Failed to read frame size\n"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { fprintf(stderr, "Warning: Read invalid frame size (%u)\n", (unsigned int)frame_size); frame_size = 0; } if (frame_size > *buffer_size) { uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size); if (new_buffer) { *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { fprintf(stderr, "Warning: Failed to allocate compressed data buffer\n"); frame_size = 0; } } if (pts) { *pts = mem_get_le32(&raw_header[4]); *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32); } } if (!input_eof(input_ctx)) { ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size); if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) { fprintf(stderr, "Warning: Failed to read full frame\n"); return 1; } ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size); *bytes_read = frame_size; return 0; } return 1; } aom-3.12.1/common/ivfdec.h000066400000000000000000000017371477627663500153120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_IVFDEC_H_ #define AOM_COMMON_IVFDEC_H_ #include "aom/aom_codec.h" #include "common/tools_common.h" #ifdef __cplusplus extern "C" { #endif int file_is_ivf(struct AvxInputContext *input); int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size, aom_codec_pts_t *pts); #ifdef __cplusplus } /* extern "C" */ #endif #endif // AOM_COMMON_IVFDEC_H_ aom-3.12.1/common/ivfenc.c000066400000000000000000000035301477627663500153100ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "common/ivfenc.h" #include "aom/aom_encoder.h" #include "aom_ports/mem_ops.h" void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, unsigned int fourcc, int frame_cnt) { char header[32]; header[0] = 'D'; header[1] = 'K'; header[2] = 'I'; header[3] = 'F'; mem_put_le16(header + 4, 0); // version mem_put_le16(header + 6, 32); // header size mem_put_le32(header + 8, fourcc); // fourcc mem_put_le16(header + 12, cfg->g_w); // width mem_put_le16(header + 14, cfg->g_h); // height mem_put_le32(header + 16, cfg->g_timebase.den); // rate mem_put_le32(header + 20, cfg->g_timebase.num); // scale mem_put_le32(header + 24, frame_cnt); // length mem_put_le32(header + 28, 0); // unused fwrite(header, 1, 32, outfile); } void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { char header[12]; mem_put_le32(header, (int)frame_size); mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); mem_put_le32(header + 8, (int)(pts >> 32)); fwrite(header, 1, 12, outfile); } void ivf_write_frame_size(FILE *outfile, size_t frame_size) { char header[4]; mem_put_le32(header, (int)frame_size); fwrite(header, 1, 4, outfile); } aom-3.12.1/common/ivfenc.h000066400000000000000000000020601477627663500153120ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_COMMON_IVFENC_H_ #define AOM_COMMON_IVFENC_H_ #include "common/tools_common.h" struct aom_codec_enc_cfg; struct aom_codec_cx_pkt; #ifdef __cplusplus extern "C" { #endif void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, uint32_t fourcc, int frame_cnt); void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size); void ivf_write_frame_size(FILE *outfile, size_t frame_size); #ifdef __cplusplus } /* extern "C" */ #endif #endif // AOM_COMMON_IVFENC_H_ aom-3.12.1/common/md5_utils.c000066400000000000000000000201111477627663500157350ustar00rootroot00000000000000/* * This code implements the MD5 message-digest algorithm. * The algorithm is due to Ron Rivest. This code was * written by Colin Plumb in 1993, no copyright is claimed. * This code is in the public domain; do with it what you wish. * * Equivalent code is available from RSA Data Security, Inc. * This code has been tested against that, and is equivalent, * except that you don't need to include two pages of legalese * with every copy. * * To compute the message digest of a chunk of bytes, declare an * MD5Context structure, pass it to MD5Init, call MD5Update as * needed on buffers full of bytes, and then call MD5Final, which * will fill a supplied 16-byte array with the digest. * * Changed so as no longer to depend on Colin Plumb's `usual.h' header * definitions * - Ian Jackson . * Still in the public domain. */ #include /* for memcpy() */ #include "common/md5_utils.h" static void byteSwap(UWORD32 *buf, unsigned words) { md5byte *p; /* Only swap bytes for big endian machines */ int i = 1; if (*(char *)&i == 1) return; p = (md5byte *)buf; do { *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | ((unsigned)p[1] << 8 | p[0]); p += 4; } while (--words); } /* * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious * initialization constants. */ void MD5Init(struct MD5Context *ctx) { ctx->buf[0] = 0x67452301; ctx->buf[1] = 0xefcdab89; ctx->buf[2] = 0x98badcfe; ctx->buf[3] = 0x10325476; ctx->bytes[0] = 0; ctx->bytes[1] = 0; } /* * Update context to reflect the concatenation of another buffer full * of bytes. */ void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { UWORD32 t; /* Update byte count */ t = ctx->bytes[0]; if ((ctx->bytes[0] = t + len) < t) ctx->bytes[1]++; /* Carry from low to high */ t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ if (t > len) { memcpy((md5byte *)ctx->in + 64 - t, buf, len); return; } /* First chunk is an odd size */ memcpy((md5byte *)ctx->in + 64 - t, buf, t); byteSwap(ctx->in, 16); MD5Transform(ctx->buf, ctx->in); buf += t; len -= t; /* Process data in 64-byte chunks */ while (len >= 64) { memcpy(ctx->in, buf, 64); byteSwap(ctx->in, 16); MD5Transform(ctx->buf, ctx->in); buf += 64; len -= 64; } /* Handle any remaining bytes of data. */ memcpy(ctx->in, buf, len); } /* * Final wrapup - pad to 64-byte boundary with the bit pattern * 1 0* (64-bit count of bits processed, MSB-first) */ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ md5byte *p = (md5byte *)ctx->in + count; /* Set the first char of padding to 0x80. There is always room. 
*/ *p++ = 0x80; /* Bytes of padding needed to make 56 bytes (-8..55) */ count = 56 - 1 - count; if (count < 0) { /* Padding forces an extra block */ memset(p, 0, count + 8); byteSwap(ctx->in, 16); MD5Transform(ctx->buf, ctx->in); p = (md5byte *)ctx->in; count = 56; } memset(p, 0, count); byteSwap(ctx->in, 14); /* Append length in bits and transform */ ctx->in[14] = ctx->bytes[0] << 3; ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; MD5Transform(ctx->buf, ctx->in); byteSwap(ctx->buf, 4); memcpy(digest, ctx->buf, 16); memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ } #ifndef ASM_MD5 /* The four core functions - F1 is optimized somewhat */ /* #define F1(x, y, z) (x & y | ~x & z) */ #define F1(x, y, z) (z ^ (x & (y ^ z))) #define F2(x, y, z) F1(z, x, y) #define F3(x, y, z) (x ^ y ^ z) #define F4(x, y, z) (y ^ (x | ~z)) /* This is the central step in the MD5 algorithm. */ #define MD5STEP(f, w, x, y, z, in, s) \ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) #if defined(__clang__) && defined(__has_attribute) #if __has_attribute(no_sanitize) #define AOM_NO_UNSIGNED_OVERFLOW_CHECK \ __attribute__((no_sanitize("unsigned-integer-overflow"))) #endif #if __clang_major__ >= 12 #define VPX_NO_UNSIGNED_SHIFT_CHECK \ __attribute__((no_sanitize("unsigned-shift-base"))) #endif // __clang__ >= 12 #endif // __clang__ #ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK #define AOM_NO_UNSIGNED_OVERFLOW_CHECK #endif #ifndef AOM_NO_UNSIGNED_SHIFT_CHECK #define AOM_NO_UNSIGNED_SHIFT_CHECK #endif /* * The core of the MD5 algorithm, this alters an existing MD5 hash to * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. */ AOM_NO_UNSIGNED_OVERFLOW_CHECK AOM_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( UWORD32 buf[4], UWORD32 const in[16]) { register UWORD32 a, b, c, d; a = buf[0]; b = buf[1]; c = buf[2]; d = buf[3]; MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); MD5STEP(F3, d, a, b, c, 
in[8] + 0x8771f681, 11); MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); buf[0] += a; buf[1] += b; buf[2] += c; buf[3] += d; } #undef AOM_NO_UNSIGNED_OVERFLOW_CHECK #undef AOM_NO_UNSIGNED_SHIFT_CHECK #endif aom-3.12.1/common/md5_utils.h000066400000000000000000000027211477627663500157510ustar00rootroot00000000000000/* * This is the header file for the MD5 message-digest algorithm. * The algorithm is due to Ron Rivest. This code was * written by Colin Plumb in 1993, no copyright is claimed. * This code is in the public domain; do with it what you wish. * * Equivalent code is available from RSA Data Security, Inc. * This code has been tested against that, and is equivalent, * except that you don't need to include two pages of legalese * with every copy. * * To compute the message digest of a chunk of bytes, declare an * MD5Context structure, pass it to MD5Init, call MD5Update as * needed on buffers full of bytes, and then call MD5Final, which * will fill a supplied 16-byte array with the digest. * * Changed so as no longer to depend on Colin Plumb's `usual.h' * header definitions * - Ian Jackson . * Still in the public domain. */ #ifndef AOM_COMMON_MD5_UTILS_H_ #define AOM_COMMON_MD5_UTILS_H_ #ifdef __cplusplus extern "C" { #endif #define md5byte unsigned char #define UWORD32 unsigned int typedef struct MD5Context MD5Context; struct MD5Context { UWORD32 buf[4]; UWORD32 bytes[2]; UWORD32 in[16]; }; void MD5Init(struct MD5Context *context); void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len); void MD5Final(unsigned char digest[16], struct MD5Context *context); void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_MD5_UTILS_H_ aom-3.12.1/common/obudec.c000066400000000000000000000414121477627663500153000ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "common/obudec.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem_ops.h" #include "av1/common/common.h" #include "av1/common/obu_util.h" #include "tools_common.h" #define OBU_BUFFER_SIZE (500 * 1024) #define OBU_HEADER_SIZE 1 #define OBU_EXTENSION_SIZE 1 #define OBU_MAX_LENGTH_FIELD_SIZE 8 #define OBU_MAX_HEADER_SIZE \ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 2 * OBU_MAX_LENGTH_FIELD_SIZE) #define OBU_DETECTION_SIZE \ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 4 * OBU_MAX_LENGTH_FIELD_SIZE) // Reads unsigned LEB128 integer and returns 0 upon successful read and decode. // Stores raw bytes in 'value_buffer', length of the number in 'value_length', // and decoded value in 'value'. If 'buffered' is true, it is buffered in the // detect buffer first. static int obudec_read_leb128(struct AvxInputContext *input_ctx, uint8_t *value_buffer, size_t *value_length, uint64_t *value, bool buffered) { if (!input_ctx || !value_buffer || !value_length || !value) return -1; size_t len; for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) { const size_t num_read = buffer_input(input_ctx, 1, &value_buffer[len], buffered); if (num_read == 0) { if (len == 0 && input_eof(input_ctx)) { *value_length = 0; return 0; } // Ran out of data before completing read of value. return -1; } if ((value_buffer[len] >> 7) == 0) { ++len; *value_length = len; break; } } return aom_uleb_decode(value_buffer, len, value, NULL); } // Reads OBU header from 'input_ctx'. The 'buffer_capacity' passed in must be // large enough to store an OBU header with extension (2 bytes). Raw OBU data is // written to 'obu_data', parsed OBU header values are written to 'obu_header', // and total bytes read from file are written to 'bytes_read'. Returns 0 for // success, and non-zero on failure. When end of file is reached, the return // value is 0 and the 'bytes_read' value is set to 0. If 'buffered' is true, it // is buffered in the detect buffer first. static int obudec_read_obu_header(struct AvxInputContext *input_ctx, size_t buffer_capacity, int is_annexb, uint8_t *obu_data, ObuHeader *obu_header, size_t *bytes_read, bool buffered) { if (!input_ctx || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) || !obu_data || !obu_header || !bytes_read) { return -1; } *bytes_read = buffer_input(input_ctx, 1, obu_data, buffered); if (input_eof(input_ctx) && *bytes_read == 0) { return 0; } else if (*bytes_read != 1) { fprintf(stderr, "obudec: Failure reading OBU header.\n"); return -1; } const int has_extension = (obu_data[0] >> 2) & 0x1; if (has_extension) { if (buffer_input(input_ctx, 1, &obu_data[1], buffered) != 1) { fprintf(stderr, "obudec: Failure reading OBU extension."); return -1; } ++*bytes_read; } size_t obu_bytes_parsed = 0; const aom_codec_err_t parse_result = aom_read_obu_header( obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb); if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) { fprintf(stderr, "obudec: Error parsing OBU header.\n"); return -1; } return 0; } // Reads OBU payload from 'input_ctx' and returns 0 for success when all payload // bytes are read from the file. 
Payload data is written to 'obu_data', and // actual bytes read added to 'bytes_read'. If 'buffered' is true, it is // buffered in the detect buffer first. static int obudec_read_obu_payload(struct AvxInputContext *input_ctx, size_t payload_length, uint8_t *obu_data, size_t *bytes_read, bool buffered) { if (!input_ctx || payload_length == 0 || !obu_data || !bytes_read) return -1; if (buffer_input(input_ctx, payload_length, obu_data, buffered) != payload_length) { fprintf(stderr, "obudec: Failure reading OBU payload.\n"); return -1; } *bytes_read += payload_length; return 0; } static int obudec_read_obu_header_and_size( struct AvxInputContext *input_ctx, size_t buffer_capacity, int is_annexb, uint8_t *buffer, size_t *bytes_read, size_t *payload_length, ObuHeader *obu_header, bool buffered) { const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE; if (!input_ctx || !buffer || !bytes_read || !payload_length || !obu_header || buffer_capacity < kMinimumBufferSize) { return -1; } size_t leb128_length_obu = 0; size_t leb128_length_payload = 0; uint64_t obu_size = 0; if (is_annexb) { if (obudec_read_leb128(input_ctx, &buffer[0], &leb128_length_obu, &obu_size, buffered) != 0) { fprintf(stderr, "obudec: Failure reading OBU size length.\n"); return -1; } else if (leb128_length_obu == 0) { *payload_length = 0; return 0; } if (obu_size > UINT32_MAX) { fprintf(stderr, "obudec: OBU payload length too large.\n"); return -1; } } size_t header_size = 0; if (obudec_read_obu_header(input_ctx, buffer_capacity - leb128_length_obu, is_annexb, buffer + leb128_length_obu, obu_header, &header_size, buffered) != 0) { return -1; } else if (header_size == 0) { *payload_length = 0; return 0; } if (!obu_header->has_size_field) { assert(is_annexb); if (obu_size < header_size) { fprintf(stderr, "obudec: OBU size is too small.\n"); return -1; } *payload_length = (size_t)obu_size - header_size; } else { uint64_t u64_payload_length = 0; if (obudec_read_leb128(input_ctx, &buffer[leb128_length_obu + header_size], &leb128_length_payload, &u64_payload_length, buffered) != 0) { fprintf(stderr, "obudec: Failure reading OBU payload length.\n"); return -1; } if (u64_payload_length > UINT32_MAX) { fprintf(stderr, "obudec: OBU payload length too large.\n"); return -1; } *payload_length = (size_t)u64_payload_length; } *bytes_read = leb128_length_obu + header_size + leb128_length_payload; return 0; } static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer, size_t *obu_buffer_capacity) { if (!*obu_buffer || !obu_buffer_capacity || growth_amount == 0) { return -1; } const size_t capacity = *obu_buffer_capacity; if (SIZE_MAX - growth_amount < capacity) { fprintf(stderr, "obudec: cannot grow buffer, capacity will roll over.\n"); return -1; } const size_t new_capacity = capacity + growth_amount; #if defined AOM_MAX_ALLOCABLE_MEMORY if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) { fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n"); return -1; } #endif uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity); if (!new_buffer) { fprintf(stderr, "obudec: Failed to allocate compressed data buffer.\n"); return -1; } *obu_buffer = new_buffer; *obu_buffer_capacity = new_capacity; return 0; } static int obudec_read_one_obu(struct AvxInputContext *input_ctx, uint8_t **obu_buffer, size_t obu_bytes_buffered, size_t *obu_buffer_capacity, size_t *obu_length, ObuHeader *obu_header, int is_annexb, bool buffered) { if (!input_ctx || !(*obu_buffer) || !obu_buffer_capacity || !obu_length || !obu_header) { return -1; } size_t 
bytes_read = 0; size_t obu_payload_length = 0; size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered; if (available_buffer_capacity < OBU_MAX_HEADER_SIZE) { if (obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE), obu_buffer, obu_buffer_capacity) != 0) { *obu_length = bytes_read; return -1; } available_buffer_capacity += AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE); } const int status = obudec_read_obu_header_and_size( input_ctx, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered, &bytes_read, &obu_payload_length, obu_header, buffered); if (status < 0) return status; if (obu_payload_length > SIZE_MAX - bytes_read) return -1; if (obu_payload_length > 256 * 1024 * 1024) { fprintf(stderr, "obudec: Read invalid OBU size (%u)\n", (unsigned int)obu_payload_length); *obu_length = bytes_read + obu_payload_length; return -1; } if (bytes_read + obu_payload_length > available_buffer_capacity && obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, obu_payload_length), obu_buffer, obu_buffer_capacity) != 0) { *obu_length = bytes_read + obu_payload_length; return -1; } if (obu_payload_length > 0 && obudec_read_obu_payload(input_ctx, obu_payload_length, *obu_buffer + obu_bytes_buffered + bytes_read, &bytes_read, buffered) != 0) { return -1; } *obu_length = bytes_read; return 0; } int file_is_obu(struct ObuDecInputContext *obu_ctx) { if (!obu_ctx || !obu_ctx->avx_ctx) return 0; struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx; uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 }; const int is_annexb = obu_ctx->is_annexb; size_t payload_length = 0; ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); size_t length_of_unit_size = 0; size_t annexb_header_length = 0; uint64_t unit_size = 0; if (is_annexb) { // read the size of first temporal unit if (obudec_read_leb128(avx_ctx, &detect_buf[0], &length_of_unit_size, &unit_size, /*buffered=*/true) != 0) { fprintf(stderr, "obudec: Failure reading temporal unit header\n"); rewind_detect(avx_ctx); return 0; } // read the size of first frame unit if (obudec_read_leb128(avx_ctx, &detect_buf[length_of_unit_size], &annexb_header_length, &unit_size, /*buffered=*/true) != 0) { fprintf(stderr, "obudec: Failure reading frame unit header\n"); rewind_detect(avx_ctx); return 0; } annexb_header_length += length_of_unit_size; } size_t bytes_read = 0; if (obudec_read_obu_header_and_size( avx_ctx, OBU_DETECTION_SIZE - annexb_header_length, is_annexb, &detect_buf[annexb_header_length], &bytes_read, &payload_length, &obu_header, /*buffered=*/true) != 0) { fprintf(stderr, "obudec: Failure reading first OBU.\n"); rewind_detect(avx_ctx); return 0; } if (is_annexb) { bytes_read += annexb_header_length; } if (obu_header.type != OBU_TEMPORAL_DELIMITER && obu_header.type != OBU_SEQUENCE_HEADER) { rewind_detect(avx_ctx); return 0; } if (obu_header.has_size_field) { if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) { fprintf( stderr, "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero)."); rewind_detect(avx_ctx); return 0; } } else if (!is_annexb) { fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n"); rewind_detect(avx_ctx); return 0; } // Appears that input is valid Section 5 AV1 stream. 
obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE); if (!obu_ctx->buffer) { fprintf(stderr, "Out of memory.\n"); rewind_detect(avx_ctx); return 0; } obu_ctx->buffer_capacity = OBU_BUFFER_SIZE; memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read); obu_ctx->bytes_buffered = bytes_read; // If the first OBU is a SEQUENCE_HEADER, then it will have a payload. // We need to read this in so that our buffer only contains complete OBUs. if (payload_length > 0) { if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) { fprintf(stderr, "obudec: First OBU's payload is too large\n"); rewind_detect(avx_ctx); obudec_free(obu_ctx); return 0; } size_t payload_bytes = 0; const int status = obudec_read_obu_payload( avx_ctx, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes, /*buffered=*/false); if (status < 0) { rewind_detect(avx_ctx); obudec_free(obu_ctx); return 0; } obu_ctx->bytes_buffered += payload_bytes; } return 1; } int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size) { FILE *f = obu_ctx->avx_ctx->file; if (!f) return -1; *buffer_size = 0; *bytes_read = 0; if (input_eof(obu_ctx->avx_ctx)) { return 1; } size_t tu_size; size_t obu_size = 0; size_t length_of_temporal_unit_size = 0; uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 }; if (obu_ctx->is_annexb) { uint64_t size = 0; if (obu_ctx->bytes_buffered == 0) { if (obudec_read_leb128(obu_ctx->avx_ctx, &tuheader[0], &length_of_temporal_unit_size, &size, /*buffered=*/false) != 0) { fprintf(stderr, "obudec: Failure reading temporal unit header\n"); return -1; } if (size == 0 && input_eof(obu_ctx->avx_ctx)) { return 1; } } else { // temporal unit size was already stored in buffer if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size, &length_of_temporal_unit_size) != 0) { fprintf(stderr, "obudec: Failure reading temporal unit header\n"); return -1; } } if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) { fprintf(stderr, "obudec: TU too large.\n"); return -1; } size += length_of_temporal_unit_size; tu_size = (size_t)size; } else { while (1) { ObuHeader obu_header; memset(&obu_header, 0, sizeof(obu_header)); if (obudec_read_one_obu(obu_ctx->avx_ctx, &obu_ctx->buffer, obu_ctx->bytes_buffered, &obu_ctx->buffer_capacity, &obu_size, &obu_header, 0, /*buffered=*/false) != 0) { fprintf(stderr, "obudec: read_one_obu failed in TU loop\n"); return -1; } if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) { tu_size = obu_ctx->bytes_buffered; break; } else { obu_ctx->bytes_buffered += obu_size; } } } #if defined AOM_MAX_ALLOCABLE_MEMORY if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) { fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n"); return -1; } #endif if (tu_size > 0) { uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size); if (!new_buffer) { free(*buffer); fprintf(stderr, "obudec: Out of memory.\n"); return -1; } *buffer = new_buffer; } *bytes_read = tu_size; *buffer_size = tu_size; if (!obu_ctx->is_annexb) { memcpy(*buffer, obu_ctx->buffer, tu_size); // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size) // points to the end of the buffer. 
memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered, obu_size); obu_ctx->bytes_buffered = obu_size; } else { if (!input_eof(obu_ctx->avx_ctx)) { size_t data_size; size_t offset; if (!obu_ctx->bytes_buffered) { data_size = tu_size - length_of_temporal_unit_size; memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size); offset = length_of_temporal_unit_size; } else { const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size); memcpy(*buffer, obu_ctx->buffer, copy_size); offset = copy_size; data_size = tu_size - copy_size; obu_ctx->bytes_buffered -= copy_size; } if (read_from_input(obu_ctx->avx_ctx, data_size, *buffer + offset) != data_size) { fprintf(stderr, "obudec: Failed to read full temporal unit\n"); return -1; } } } return 0; } void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); obu_ctx->buffer = NULL; obu_ctx->buffer_capacity = 0; obu_ctx->bytes_buffered = 0; } aom-3.12.1/common/obudec.h000066400000000000000000000032741477627663500153110ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_OBUDEC_H_ #define AOM_COMMON_OBUDEC_H_ #include "common/tools_common.h" #ifdef __cplusplus extern "C" { #endif struct ObuDecInputContext { struct AvxInputContext *avx_ctx; uint8_t *buffer; size_t buffer_capacity; size_t bytes_buffered; int is_annexb; }; // Returns 1 when file data starts (if Annex B stream, after reading the // size of the OBU) with what appears to be a Temporal Delimiter // OBU as defined by Section 5 of the AV1 bitstream specification. int file_is_obu(struct ObuDecInputContext *obu_ctx); // Reads one Temporal Unit from the input file. Returns 0 when a TU is // successfully read, 1 when end of file is reached, and less than 0 when an // error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size, // returns buffer capacity via 'buffer_size', and returns size of buffered data // via 'bytes_read'. int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size); void obudec_free(struct ObuDecInputContext *obu_ctx); #ifdef __cplusplus } /* extern "C" */ #endif #endif // AOM_COMMON_OBUDEC_H_ aom-3.12.1/common/rawenc.c000066400000000000000000000076131477627663500153230ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "common/rawenc.h" // Number of bytes to write per batch in write_greyscale. #define BATCH_SIZE 8 // Interface to writing to either a file or MD5Context. 
Takes a pointer to // either the file or MD5Context, the buffer, the size of each element, and // number of elements to write. Note that size and nmemb (last two args) must // be unsigned int, as the interface to MD5Update requires that. typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int); static void write_file(void *fp, const uint8_t *buffer, unsigned int size, unsigned int nmemb) { fwrite(buffer, size, nmemb, (FILE *)fp); } static void write_md5(void *md5, const uint8_t *buffer, unsigned int size, unsigned int nmemb) { MD5Update((MD5Context *)md5, buffer, size * nmemb); } // Writes out n neutral chroma samples (for greyscale). static void write_greyscale(const aom_image_t *img, int n, WRITER writer_func, void *file_or_md5) { // Batch 8 writes for low bit-depth, 4 writes for high bit-depth. int bytes_per_sample; union { uint8_t u8[BATCH_SIZE]; uint16_t u16[BATCH_SIZE / 2]; } batched; if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { bytes_per_sample = 2; for (int i = 0; i < BATCH_SIZE / 2; ++i) { batched.u16[i] = 1 << (img->bit_depth - 1); } } else { bytes_per_sample = 1; for (int i = 0; i < BATCH_SIZE; ++i) { batched.u8[i] = 0x80; } } const int samples_per_batch = BATCH_SIZE / bytes_per_sample; const int num_batched_writes = n / samples_per_batch; for (int i = 0; i < num_batched_writes; ++i) { writer_func(file_or_md5, batched.u8, sizeof(uint8_t), BATCH_SIZE); } const int remaining = n % samples_per_batch; for (int i = 0; i < remaining; ++i) { writer_func(file_or_md5, batched.u8, sizeof(uint8_t), bytes_per_sample); } } // Encapsulates the logic for writing raw data to either an image file or // to an MD5 context. static void raw_write_image_file_or_md5(const aom_image_t *img, const int *planes, const int num_planes, void *file_or_md5, WRITER writer_func) { const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH; const int bytes_per_sample = high_bitdepth ? 2 : 1; for (int i = 0; i < num_planes; ++i) { const int plane = planes[i]; const int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); // If we're on a color plane and the output is monochrome, write a greyscale // value. Since there are only YUV planes, compare against Y. if (img->monochrome && plane != AOM_PLANE_Y) { write_greyscale(img, w * h, writer_func, file_or_md5); continue; } const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; for (int y = 0; y < h; ++y) { writer_func(file_or_md5, buf, bytes_per_sample, w); buf += stride; } } } void raw_write_image_file(const aom_image_t *img, const int *planes, const int num_planes, FILE *file) { raw_write_image_file_or_md5(img, planes, num_planes, file, write_file); } void raw_update_image_md5(const aom_image_t *img, const int *planes, const int num_planes, MD5Context *md5) { raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5); } aom-3.12.1/common/rawenc.h000066400000000000000000000020511477627663500153170ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_COMMON_RAWENC_H_ #define AOM_COMMON_RAWENC_H_ #include "aom/aom_decoder.h" #include "common/md5_utils.h" #include "common/tools_common.h" #ifdef __cplusplus extern "C" { #endif void raw_write_image_file(const aom_image_t *img, const int *planes, const int num_planes, FILE *file); void raw_update_image_md5(const aom_image_t *img, const int *planes, const int num_planes, MD5Context *md5); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_RAWENC_H_ aom-3.12.1/common/tools_common.c000066400000000000000000000473461477627663500165630ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include #include #include "common/tools_common.h" #if CONFIG_AV1_ENCODER #include "aom/aomcx.h" #endif #if CONFIG_AV1_DECODER #include "aom/aomdx.h" #endif #if defined(_WIN32) #include #include #endif #define LOG_ERROR(label) \ do { \ const char *l = label; \ va_list ap; \ va_start(ap, fmt); \ if (l) fprintf(stderr, "%s: ", l); \ vfprintf(stderr, fmt, ap); \ fprintf(stderr, "\n"); \ va_end(ap); \ } while (0) FILE *set_binary_mode(FILE *stream) { (void)stream; #if defined(_WIN32) _setmode(_fileno(stream), _O_BINARY); #endif return stream; } void die(const char *fmt, ...) { LOG_ERROR(NULL); usage_exit(); } void fatal(const char *fmt, ...) { LOG_ERROR("Fatal"); exit(EXIT_FAILURE); } void aom_tools_warn(const char *fmt, ...) { LOG_ERROR("Warning"); } void die_codec(aom_codec_ctx_t *ctx, const char *s) { const char *detail = aom_codec_error_detail(ctx); fprintf(stderr, "%s: %s\n", s, aom_codec_error(ctx)); if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } const char *image_format_to_string(aom_img_fmt_t fmt) { switch (fmt) { case AOM_IMG_FMT_I420: return "I420"; case AOM_IMG_FMT_I422: return "I422"; case AOM_IMG_FMT_I444: return "I444"; case AOM_IMG_FMT_YV12: return "YV12"; case AOM_IMG_FMT_NV12: return "NV12"; case AOM_IMG_FMT_YV1216: return "YV1216"; case AOM_IMG_FMT_I42016: return "I42016"; case AOM_IMG_FMT_I42216: return "I42216"; case AOM_IMG_FMT_I44416: return "I44416"; default: return "Other"; } } int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) { FILE *f = input_ctx->file; struct FileTypeDetectionBuffer *detect = &input_ctx->detect; int plane = 0; int shortread = 0; const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { uint8_t *ptr; int w = aom_img_plane_width(yuv_frame, plane); const int h = aom_img_plane_height(yuv_frame, plane); int r; // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; /* Determine the correct plane based on the image format. The for-loop * always counts in Y,U,V order, but this may not match the order of * the data on disk. */ switch (plane) { case 1: ptr = yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? 
AOM_PLANE_V : AOM_PLANE_U]; break; case 2: ptr = yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U : AOM_PLANE_V]; break; default: ptr = yuv_frame->planes[plane]; } for (r = 0; r < h; ++r) { size_t needed = w * bytespp; size_t buf_position = 0; const size_t left = detect->buf_read - detect->position; if (left > 0) { const size_t more = (left < needed) ? left : needed; memcpy(ptr, detect->buf + detect->position, more); buf_position = more; needed -= more; detect->position += more; } if (needed > 0) { shortread |= (fread(ptr + buf_position, 1, needed, f) < needed); } ptr += yuv_frame->stride[plane]; } } return shortread; } struct CodecInfo { // Pointer to a function of zero arguments that returns an aom_codec_iface_t. aom_codec_iface_t *(*interface)(void); const char *short_name; uint32_t fourcc; }; #if CONFIG_AV1_ENCODER static const struct CodecInfo aom_encoders[] = { { &aom_codec_av1_cx, "av1", AV1_FOURCC }, }; int get_aom_encoder_count(void) { return sizeof(aom_encoders) / sizeof(aom_encoders[0]); } aom_codec_iface_t *get_aom_encoder_by_index(int i) { assert(i >= 0 && i < get_aom_encoder_count()); return aom_encoders[i].interface(); } aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name) { for (int i = 0; i < get_aom_encoder_count(); ++i) { const struct CodecInfo *info = &aom_encoders[i]; if (strcmp(info->short_name, name) == 0) return info->interface(); } return NULL; } uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface) { for (int i = 0; i < get_aom_encoder_count(); ++i) { const struct CodecInfo *info = &aom_encoders[i]; if (info->interface() == iface) { return info->fourcc; } } return 0; } const char *get_short_name_by_aom_encoder(aom_codec_iface_t *iface) { for (int i = 0; i < get_aom_encoder_count(); ++i) { const struct CodecInfo *info = &aom_encoders[i]; if (info->interface() == iface) { return info->short_name; } } return NULL; } #endif // CONFIG_AV1_ENCODER #if CONFIG_AV1_DECODER static const struct CodecInfo aom_decoders[] = { { &aom_codec_av1_dx, "av1", AV1_FOURCC }, }; int get_aom_decoder_count(void) { return sizeof(aom_decoders) / sizeof(aom_decoders[0]); } aom_codec_iface_t *get_aom_decoder_by_index(int i) { assert(i >= 0 && i < get_aom_decoder_count()); return aom_decoders[i].interface(); } aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name) { for (int i = 0; i < get_aom_decoder_count(); ++i) { const struct CodecInfo *info = &aom_decoders[i]; if (strcmp(info->short_name, name) == 0) return info->interface(); } return NULL; } aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc) { for (int i = 0; i < get_aom_decoder_count(); ++i) { const struct CodecInfo *info = &aom_decoders[i]; if (info->fourcc == fourcc) return info->interface(); } return NULL; } const char *get_short_name_by_aom_decoder(aom_codec_iface_t *iface) { for (int i = 0; i < get_aom_decoder_count(); ++i) { const struct CodecInfo *info = &aom_decoders[i]; if (info->interface() == iface) { return info->short_name; } } return NULL; } uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) { for (int i = 0; i < get_aom_decoder_count(); ++i) { const struct CodecInfo *info = &aom_decoders[i]; if (info->interface() == iface) { return info->fourcc; } } return 0; } #endif // CONFIG_AV1_DECODER void aom_img_write(const aom_image_t *img, FILE *file) { int plane; const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; // Assuming that for nv12 we write all chroma data at once if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; for (y = 0; y < h; ++y) { fwrite(buf, bytespp, w, file); buf += stride; } } } bool aom_img_read(aom_image_t *img, FILE *file) { int plane; const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; // Assuming that for nv12 we read all chroma data at once if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; for (y = 0; y < h; ++y) { if (fread(buf, bytespp, w, file) != (size_t)w) return false; buf += stride; } } return true; } // TODO(dkovalev) change sse_to_psnr signature: double -> int64_t double sse_to_psnr(double samples, double peak, double sse) { static const double kMaxPSNR = 100.0; if (sse > 0.0) { const double psnr = 10.0 * log10(samples * peak * peak / sse); return psnr > kMaxPSNR ? kMaxPSNR : psnr; } else { return kMaxPSNR; } } // TODO(debargha): Consolidate the functions below into a separate file. static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift) { // Note the offset is 1 less than half. const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case AOM_IMG_FMT_I42016: case AOM_IMG_FMT_I42216: case AOM_IMG_FMT_I44416: break; default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; int h = src->d_h; int x, y; if (plane) { w = (w + src->x_chroma_shift) >> src->x_chroma_shift; h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { const uint16_t *p_src = (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; } } } static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift) { // Note the offset is 1 less than half. const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_I420: case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I444: break; default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; int h = src->d_h; int x, y; if (plane) { w = (w + src->x_chroma_shift) >> src->x_chroma_shift; h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { const uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) { *p_dst++ = (*p_src++ << input_shift) + offset; } } } } void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift) { if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { highbd_img_upshift(dst, src, input_shift); } else { lowbd_img_upshift(dst, src, input_shift); } } void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) { int plane; if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift) { fatal("Unsupported image conversion"); } switch (dst->fmt) { case AOM_IMG_FMT_I420: case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I444: break; default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; int h = src->d_h; int x, y; if (plane) { w = (w + src->x_chroma_shift) >> src->x_chroma_shift; h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { const uint16_t *p_src = (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; for (x = 0; x < w; x++) { *p_dst++ = (uint8_t)(*p_src++); } } } } static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift) { int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || down_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case AOM_IMG_FMT_I42016: case AOM_IMG_FMT_I42216: case AOM_IMG_FMT_I44416: break; default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; int h = src->d_h; int x, y; if (plane) { w = (w + src->x_chroma_shift) >> src->x_chroma_shift; h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { const uint16_t *p_src = (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; } } } static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift) { int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || down_shift < 0) { fatal("Unsupported image conversion"); } switch (dst->fmt) { case AOM_IMG_FMT_I420: case AOM_IMG_FMT_I422: case AOM_IMG_FMT_I444: break; default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) 
{ int w = src->d_w; int h = src->d_h; int x, y; if (plane) { w = (w + src->x_chroma_shift) >> src->x_chroma_shift; h = (h + src->y_chroma_shift) >> src->y_chroma_shift; } for (y = 0; y < h; y++) { const uint16_t *p_src = (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; for (x = 0; x < w; x++) { *p_dst++ = *p_src++ >> down_shift; } } } } void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift) { if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { highbd_img_downshift(dst, src, down_shift); } else { lowbd_img_downshift(dst, src, down_shift); } } static int img_shifted_realloc_required(const aom_image_t *img, const aom_image_t *shifted, aom_img_fmt_t required_fmt) { return img->d_w != shifted->d_w || img->d_h != shifted->d_h || required_fmt != shifted->fmt; } bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, aom_image_t **img_shifted_ptr) { aom_image_t *img = *img_ptr; aom_image_t *img_shifted = *img_shifted_ptr; const aom_img_fmt_t shifted_fmt = output_bit_depth == 8 ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) { if (img_shifted && img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { aom_img_free(img_shifted); img_shifted = NULL; } if (img_shifted) { img_shifted->monochrome = img->monochrome; } if (!img_shifted) { img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); if (!img_shifted) { *img_shifted_ptr = NULL; return false; } img_shifted->bit_depth = output_bit_depth; img_shifted->monochrome = img->monochrome; img_shifted->csp = img->csp; } if (output_bit_depth > img->bit_depth) { aom_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); } else { aom_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); } *img_shifted_ptr = img_shifted; *img_ptr = img_shifted; } return true; } // Related to I420, NV12 format has one luma "luminance" plane Y and one plane // with U and V values interleaved. void aom_img_write_nv12(const aom_image_t *img, FILE *file) { // Y plane const unsigned char *buf = img->planes[0]; int stride = img->stride[0]; int w = aom_img_plane_width(img, 0) * ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); int h = aom_img_plane_height(img, 0); int x, y; for (y = 0; y < h; ++y) { fwrite(buf, 1, w, file); buf += stride; } // Interleaved U and V plane const unsigned char *ubuf = img->planes[1]; const unsigned char *vbuf = img->planes[2]; const size_t size = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; stride = img->stride[1]; w = aom_img_plane_width(img, 1); h = aom_img_plane_height(img, 1); for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) { fwrite(ubuf, size, 1, file); fwrite(vbuf, size, 1, file); ubuf += size; vbuf += size; } ubuf += (stride - w * size); vbuf += (stride - w * size); } } size_t read_from_input(struct AvxInputContext *input_ctx, size_t n, unsigned char *buf) { const size_t buffered_bytes = input_ctx->detect.buf_read - input_ctx->detect.position; size_t read_n; if (buffered_bytes == 0) { read_n = fread(buf, 1, n, input_ctx->file); } else if (n <= buffered_bytes) { memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position, n); input_ctx->detect.position += n; read_n = n; } else { memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position, buffered_bytes); input_ctx->detect.position += buffered_bytes; read_n = buffered_bytes; read_n += fread(buf + buffered_bytes, 1, n - buffered_bytes, input_ctx->file); } return read_n; } size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n) { if (n + input_ctx->detect.position > DETECT_BUF_SZ) { die("Failed to store in the detect buffer, maximum size exceeded."); } const size_t buffered_bytes = input_ctx->detect.buf_read - input_ctx->detect.position; size_t read_n; if (buffered_bytes == 0) { read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1, n, input_ctx->file); input_ctx->detect.buf_read += read_n; } else if (n <= buffered_bytes) { // In this case, don't need to do anything as the data is already in // the detect buffer read_n = n; } else { read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1, n - buffered_bytes, input_ctx->file); input_ctx->detect.buf_read += read_n; read_n += buffered_bytes; } return read_n; } // Read from detect buffer to a buffer. If not enough, read from input and also // buffer them first. size_t buffer_input(struct AvxInputContext *input_ctx, size_t n, unsigned char *buf, bool buffered) { if (!buffered) { return read_from_input(input_ctx, n, buf); } const size_t buf_n = input_to_detect_buf(input_ctx, n); if (buf_n < n) { return buf_n; } return read_from_input(input_ctx, n, buf); } void rewind_detect(struct AvxInputContext *input_ctx) { input_ctx->detect.position = 0; } bool input_eof(struct AvxInputContext *input_ctx) { return feof(input_ctx->file) && input_ctx->detect.position == input_ctx->detect.buf_read; } aom-3.12.1/common/tools_common.h000066400000000000000000000152401477627663500165540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_TOOLS_COMMON_H_ #define AOM_COMMON_TOOLS_COMMON_H_ #include #include #include "config/aom_config.h" #include "aom/aom_codec.h" #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #if CONFIG_AV1_ENCODER #include "common/y4minput.h" #endif #if defined(_MSC_VER) /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 typedef int64_t FileOffset; #elif defined(_WIN32) #include /* NOLINT*/ /* MinGW uses f{seek,tell}o64 for large files. 
*/ #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; #elif CONFIG_OS_SUPPORT && \ !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) /* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before * Android API level 24. See * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ #include /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM * executables (.axf) with RVCT. */ #else #define fseeko fseek #define ftello ftell typedef long FileOffset; /* NOLINT */ #endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) #include /* NOLINT */ #define isatty _isatty #define fileno _fileno #else #include /* NOLINT */ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX #define PATH_MAX 512 #endif #define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ #define IVF_FILE_HDR_SZ 32 #define RAW_FRAME_HDR_SZ sizeof(uint32_t) #define OBU_DETECTION_SZ 34 // See common/obudec.c #define DETECT_BUF_SZ 34 // Max of the above header sizes #define AV1_FOURCC 0x31305641 enum VideoFileType { FILE_TYPE_OBU, FILE_TYPE_RAW, FILE_TYPE_IVF, FILE_TYPE_Y4M, FILE_TYPE_WEBM }; // The fourcc for large_scale_tile encoding is "LSTC". #define LST_FOURCC 0x4354534c struct FileTypeDetectionBuffer { char buf[DETECT_BUF_SZ]; size_t buf_read; size_t position; }; struct AvxRational { int numerator; int denominator; }; struct AvxInputContext { const char *filename; FILE *file; int64_t length; struct FileTypeDetectionBuffer detect; enum VideoFileType file_type; uint32_t width; uint32_t height; struct AvxRational pixel_aspect_ratio; aom_img_fmt_t fmt; aom_bit_depth_t bit_depth; int only_i420; uint32_t fourcc; struct AvxRational framerate; #if CONFIG_AV1_ENCODER y4m_input y4m; #endif aom_color_range_t color_range; }; #ifdef __cplusplus extern "C" { #endif #if defined(__GNUC__) #define AOM_NO_RETURN __attribute__((noreturn)) #elif defined(_MSC_VER) #define AOM_NO_RETURN __declspec(noreturn) #else #define AOM_NO_RETURN #endif // Tells the compiler to perform `printf` format string checking if the // compiler supports it; see the 'format' attribute in // . #define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) #if defined(__has_attribute) #if __has_attribute(format) #undef AOM_TOOLS_FORMAT_PRINTF #define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ __attribute__((__format__(__printf__, string_index, first_to_check))) #endif #endif /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); AOM_NO_RETURN void die(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2); AOM_NO_RETURN void fatal(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2); void aom_tools_warn(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2); AOM_NO_RETURN void die_codec(aom_codec_ctx_t *ctx, const char *s); /* The tool including this file must define usage_exit() */ AOM_NO_RETURN void usage_exit(void); #undef AOM_NO_RETURN // The AOM library can support different encoders / decoders. These // functions provide different ways to lookup / iterate through them. // The return result may be NULL to indicate no codec was found. int get_aom_encoder_count(void); aom_codec_iface_t *get_aom_encoder_by_index(int i); aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name); // If the interface is unknown, returns NULL. 
const char *get_short_name_by_aom_encoder(aom_codec_iface_t *encoder); // If the interface is unknown, returns 0. uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface); int get_aom_decoder_count(void); aom_codec_iface_t *get_aom_decoder_by_index(int i); aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name); aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc); const char *get_short_name_by_aom_decoder(aom_codec_iface_t *decoder); // If the interface is unknown, returns 0. uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface); const char *image_format_to_string(aom_img_fmt_t fmt); int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame); void aom_img_write(const aom_image_t *img, FILE *file); // Returns true on success, false on failure. bool aom_img_read(aom_image_t *img, FILE *file); double sse_to_psnr(double samples, double peak, double mse); void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift); void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, int down_shift); // Returns true on success, false on failure. bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, aom_image_t **img_shifted_ptr); void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src); // Output in NV12 format. void aom_img_write_nv12(const aom_image_t *img, FILE *file); size_t read_from_input(struct AvxInputContext *input_ctx, size_t n, unsigned char *buf); size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n); size_t buffer_input(struct AvxInputContext *input_ctx, size_t n, unsigned char *buf, bool buffered); void rewind_detect(struct AvxInputContext *input_ctx); bool input_eof(struct AvxInputContext *input_ctx); #ifdef __cplusplus } /* extern "C" */ #endif #endif // AOM_COMMON_TOOLS_COMMON_H_ aom-3.12.1/common/video_common.h000066400000000000000000000014761477627663500165300ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_VIDEO_COMMON_H_ #define AOM_COMMON_VIDEO_COMMON_H_ #include "common/tools_common.h" typedef struct { uint32_t codec_fourcc; int frame_width; int frame_height; struct AvxRational time_base; unsigned int is_annexb; } AvxVideoInfo; #endif // AOM_COMMON_VIDEO_COMMON_H_ aom-3.12.1/common/video_reader.c000066400000000000000000000106411477627663500164670ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include #include #include #include "aom_ports/mem_ops.h" #include "common/ivfdec.h" #include "common/obudec.h" #include "common/tools_common.h" #include "common/video_reader.h" #include "common/webmdec.h" struct AvxVideoReaderStruct { AvxVideoInfo info; struct AvxInputContext input_ctx; struct ObuDecInputContext obu_ctx; struct WebmInputContext webm_ctx; uint8_t *buffer; size_t buffer_size; size_t frame_size; aom_codec_pts_t pts; }; AvxVideoReader *aom_video_reader_open(const char *filename) { AvxVideoReader *reader = NULL; const bool using_file = strcmp(filename, "-") != 0; FILE *const file = using_file ? fopen(filename, "rb") : set_binary_mode(stdin); if (!file) return NULL; // Can't open file reader = (AvxVideoReader *)calloc(1, sizeof(*reader)); if (!reader) { fclose(file); return NULL; // Can't allocate AvxVideoReader } reader->input_ctx.filename = filename; reader->input_ctx.file = file; reader->obu_ctx.avx_ctx = &reader->input_ctx; reader->obu_ctx.is_annexb = 1; // TODO(https://crbug.com/aomedia/1706): webm type does not support reading // from stdin yet, and file_is_webm is not using the detect buffer when // determining the type. Therefore it should only be checked when using a file // and needs to be checked prior to other types. if (false) { #if CONFIG_WEBM_IO } else if (using_file && file_is_webm(&reader->webm_ctx, &reader->input_ctx)) { reader->input_ctx.file_type = FILE_TYPE_WEBM; reader->info.codec_fourcc = reader->input_ctx.fourcc; reader->info.frame_width = reader->input_ctx.width; reader->info.frame_height = reader->input_ctx.height; #endif } else if (file_is_ivf(&reader->input_ctx)) { reader->input_ctx.file_type = FILE_TYPE_IVF; reader->info.codec_fourcc = reader->input_ctx.fourcc; reader->info.frame_width = reader->input_ctx.width; reader->info.frame_height = reader->input_ctx.height; } else if (file_is_obu(&reader->obu_ctx)) { reader->input_ctx.file_type = FILE_TYPE_OBU; // assume AV1 reader->info.codec_fourcc = AV1_FOURCC; reader->info.is_annexb = reader->obu_ctx.is_annexb; } else { fclose(file); free(reader); return NULL; // Unknown file type } return reader; } void aom_video_reader_close(AvxVideoReader *reader) { if (reader) { fclose(reader->input_ctx.file); if (reader->input_ctx.file_type == FILE_TYPE_OBU) { obudec_free(&reader->obu_ctx); } free(reader->buffer); free(reader); } } int aom_video_reader_read_frame(AvxVideoReader *reader) { if (reader->input_ctx.file_type == FILE_TYPE_IVF) { return !ivf_read_frame(&reader->input_ctx, &reader->buffer, &reader->frame_size, &reader->buffer_size, &reader->pts); } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) { return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer, &reader->frame_size, &reader->buffer_size); #if CONFIG_WEBM_IO } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) { return !webm_read_frame(&reader->webm_ctx, &reader->buffer, &reader->frame_size, &reader->buffer_size); #endif } else { assert(0); return 0; } } const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size) { if (size) *size = reader->frame_size; return reader->buffer; } int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) { return (int64_t)reader->pts; } FILE *aom_video_reader_get_file(AvxVideoReader *reader) { return reader->input_ctx.file; } const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { return &reader->info; } void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) { reader->info.codec_fourcc = fourcc; } 
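/*
 * Illustrative usage sketch (not part of the libaom sources): a minimal read
 * loop over the AvxVideoReader API declared in common/video_reader.h below.
 * The function name dump_frame_sizes() and the printf output are hypothetical;
 * real callers, such as the example programs, pass each frame on to the
 * decoder instead of printing it.
 */
#include <stdio.h>

#include "common/video_reader.h"

static void dump_frame_sizes(const char *filename) {
  AvxVideoReader *reader = aom_video_reader_open(filename);
  if (!reader) return;  // Unreadable input or unsupported container.
  const AvxVideoInfo *info = aom_video_reader_get_info(reader);
  printf("input: %dx%d\n", info->frame_width, info->frame_height);
  // aom_video_reader_read_frame() returns nonzero while frames remain.
  while (aom_video_reader_read_frame(reader)) {
    size_t frame_size = 0;
    const uint8_t *frame = aom_video_reader_get_frame(reader, &frame_size);
    printf("frame: %zu bytes at %p\n", frame_size, (const void *)frame);
  }
  aom_video_reader_close(reader);
}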
aom-3.12.1/common/video_reader.h000066400000000000000000000044171477627663500165000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_VIDEO_READER_H_ #define AOM_COMMON_VIDEO_READER_H_ #include "common/video_common.h" // The following code is work in progress. It is going to support transparent // reading of input files. Right now only IVF format is supported for // simplicity. The main goal the API is to be simple and easy to use in example // code and in aomenc/aomdec later. All low-level details like memory // buffer management are hidden from API users. struct AvxVideoReaderStruct; typedef struct AvxVideoReaderStruct AvxVideoReader; #ifdef __cplusplus extern "C" { #endif // Opens the input file for reading and inspects it to determine file type. // Returns an opaque AvxVideoReader* upon success, or NULL upon failure. // Right now only IVF format is supported. AvxVideoReader *aom_video_reader_open(const char *filename); // Frees all resources associated with AvxVideoReader* returned from // aom_video_reader_open() call. void aom_video_reader_close(AvxVideoReader *reader); // Reads frame from the file and stores it in internal buffer. int aom_video_reader_read_frame(AvxVideoReader *reader); // Returns the pointer to memory buffer with frame data read by last call to // aom_video_reader_read_frame(). const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size); // Returns the pts of the frame. int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader); // Return the reader file. FILE *aom_video_reader_get_file(AvxVideoReader *reader); // Fills AvxVideoInfo with information from opened video file. const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader); // Set fourcc. void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_VIDEO_READER_H_ aom-3.12.1/common/video_writer.c000066400000000000000000000045261477627663500165460ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "common/video_writer.h" #include #include "aom/aom_encoder.h" #include "common/ivfenc.h" struct AvxVideoWriterStruct { AvxVideoInfo info; FILE *file; int frame_count; }; static void write_header(FILE *file, const AvxVideoInfo *info, int frame_count) { struct aom_codec_enc_cfg cfg; cfg.g_w = info->frame_width; cfg.g_h = info->frame_height; cfg.g_timebase.num = info->time_base.numerator; cfg.g_timebase.den = info->time_base.denominator; ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count); } AvxVideoWriter *aom_video_writer_open(const char *filename, AvxContainer container, const AvxVideoInfo *info) { if (container == kContainerIVF) { AvxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); if (!file) return NULL; writer = malloc(sizeof(*writer)); if (!writer) { fclose(file); return NULL; } writer->frame_count = 0; writer->info = *info; writer->file = file; write_header(writer->file, info, 0); return writer; } return NULL; } void aom_video_writer_close(AvxVideoWriter *writer) { if (writer) { // Rewriting frame header with real frame count rewind(writer->file); write_header(writer->file, &writer->info, writer->frame_count); fclose(writer->file); free(writer); } } int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, size_t size, int64_t pts) { ivf_write_frame_header(writer->file, pts, size); if (fwrite(buffer, 1, size, writer->file) != size) return 0; ++writer->frame_count; return 1; } void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) { writer->info.codec_fourcc = fourcc; } aom-3.12.1/common/video_writer.h000066400000000000000000000032121477627663500165420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_VIDEO_WRITER_H_ #define AOM_COMMON_VIDEO_WRITER_H_ #include "common/video_common.h" enum { kContainerIVF } UENUM1BYTE(AvxContainer); struct AvxVideoWriterStruct; typedef struct AvxVideoWriterStruct AvxVideoWriter; #ifdef __cplusplus extern "C" { #endif // Finds and opens writer for specified container format. // Returns an opaque AvxVideoWriter* upon success, or NULL upon failure. // Right now only IVF format is supported. AvxVideoWriter *aom_video_writer_open(const char *filename, AvxContainer container, const AvxVideoInfo *info); // Frees all resources associated with AvxVideoWriter* returned from // aom_video_writer_open() call. void aom_video_writer_close(AvxVideoWriter *writer); // Writes frame bytes to the file. int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, size_t size, int64_t pts); // Set fourcc. void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_VIDEO_WRITER_H_ aom-3.12.1/common/warnings.c000066400000000000000000000060071477627663500156700ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "common/warnings.h" #include #include #include #include #include "aom/aom_encoder.h" #include "apps/aomenc.h" #include "common/tools_common.h" static const char quantizer_warning_string[] = "Bad quantizer values. Quantizer values should not be equal, and should " "differ by at least 8."; struct WarningListNode { const char *warning_string; struct WarningListNode *next_warning; }; struct WarningList { struct WarningListNode *warning_node; }; static void add_warning(const char *warning_string, struct WarningList *warning_list) { struct WarningListNode **node = &warning_list->warning_node; struct WarningListNode *new_node = malloc(sizeof(*new_node)); if (new_node == NULL) { fatal("Unable to allocate warning node."); } new_node->warning_string = warning_string; new_node->next_warning = NULL; while (*node != NULL) node = &(*node)->next_warning; *node = new_node; } static void free_warning_list(struct WarningList *warning_list) { while (warning_list->warning_node != NULL) { struct WarningListNode *const node = warning_list->warning_node; warning_list->warning_node = node->next_warning; free(node); } } static int continue_prompt(int num_warnings) { int c; fprintf(stderr, "%d encoder configuration warning(s). Continue? (y to continue) ", num_warnings); c = getchar(); return c == 'y'; } static void check_quantizer(int min_q, int max_q, struct WarningList *warning_list) { const int lossless = min_q == 0 && max_q == 0; if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8)) add_warning(quantizer_warning_string, warning_list); } void check_encoder_config(int disable_prompt, const struct AvxEncoderConfig *global_config, const struct aom_codec_enc_cfg *stream_config) { int num_warnings = 0; struct WarningListNode *warning = NULL; struct WarningList warning_list = { 0 }; (void)global_config; check_quantizer(stream_config->rc_min_quantizer, stream_config->rc_max_quantizer, &warning_list); /* Count and print warnings. */ for (warning = warning_list.warning_node; warning != NULL; warning = warning->next_warning, ++num_warnings) { aom_tools_warn("%s", warning->warning_string); } free_warning_list(&warning_list); if (num_warnings) { if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); } } aom-3.12.1/common/warnings.h000066400000000000000000000021701477627663500156720ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_WARNINGS_H_ #define AOM_COMMON_WARNINGS_H_ #ifdef __cplusplus extern "C" { #endif struct aom_codec_enc_cfg; struct AvxEncoderConfig; /* * Checks config for improperly used settings. Warns user upon encountering * settings that will lead to poor output quality. Prompts user to continue * when warnings are issued. 
*/ void check_encoder_config(int disable_prompt, const struct AvxEncoderConfig *global_config, const struct aom_codec_enc_cfg *stream_config); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_WARNINGS_H_ aom-3.12.1/common/webmdec.cc000066400000000000000000000172001477627663500156060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "common/webmdec.h" #include #include #include #include "third_party/libwebm/mkvparser/mkvparser.h" #include "third_party/libwebm/mkvparser/mkvreader.h" namespace { void reset(struct WebmInputContext *const webm_ctx) { if (webm_ctx->reader != NULL) { mkvparser::MkvReader *const reader = reinterpret_cast(webm_ctx->reader); delete reader; } if (webm_ctx->segment != NULL) { mkvparser::Segment *const segment = reinterpret_cast(webm_ctx->segment); delete segment; } if (webm_ctx->buffer != NULL) { delete[] webm_ctx->buffer; } webm_ctx->reader = NULL; webm_ctx->segment = NULL; webm_ctx->buffer = NULL; webm_ctx->cluster = NULL; webm_ctx->block_entry = NULL; webm_ctx->block = NULL; webm_ctx->block_frame_index = 0; webm_ctx->video_track_index = 0; webm_ctx->timestamp_ns = 0; webm_ctx->is_key_frame = false; } void get_first_cluster(struct WebmInputContext *const webm_ctx) { mkvparser::Segment *const segment = reinterpret_cast(webm_ctx->segment); const mkvparser::Cluster *const cluster = segment->GetFirst(); webm_ctx->cluster = cluster; } void rewind_and_reset(struct WebmInputContext *const webm_ctx, struct AvxInputContext *const aom_ctx) { rewind(aom_ctx->file); reset(webm_ctx); } } // namespace int file_is_webm(struct WebmInputContext *webm_ctx, struct AvxInputContext *aom_ctx) { mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file); webm_ctx->reader = reader; webm_ctx->reached_eos = 0; mkvparser::EBMLHeader header; long long pos = 0; if (header.Parse(reader, pos) < 0) { rewind_and_reset(webm_ctx, aom_ctx); return 0; } mkvparser::Segment *segment; if (mkvparser::Segment::CreateInstance(reader, pos, segment)) { rewind_and_reset(webm_ctx, aom_ctx); return 0; } webm_ctx->segment = segment; if (segment->Load() < 0) { rewind_and_reset(webm_ctx, aom_ctx); return 0; } const mkvparser::Tracks *const tracks = segment->GetTracks(); const mkvparser::VideoTrack *video_track = NULL; for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) { const mkvparser::Track *const track = tracks->GetTrackByIndex(i); if (track->GetType() == mkvparser::Track::kVideo) { video_track = static_cast(track); webm_ctx->video_track_index = static_cast(track->GetNumber()); break; } } if (video_track == NULL || video_track->GetCodecId() == NULL) { rewind_and_reset(webm_ctx, aom_ctx); return 0; } if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) { aom_ctx->fourcc = AV1_FOURCC; } else { rewind_and_reset(webm_ctx, aom_ctx); return 0; } aom_ctx->framerate.denominator = 0; aom_ctx->framerate.numerator = 0; aom_ctx->width = static_cast(video_track->GetWidth()); aom_ctx->height = static_cast(video_track->GetHeight()); get_first_cluster(webm_ctx); return 1; } int 
webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size) { assert(webm_ctx->buffer == *buffer); // This check is needed for frame parallel decoding, in which case this // function could be called even after it has reached end of input stream. if (webm_ctx->reached_eos) { return 1; } mkvparser::Segment *const segment = reinterpret_cast(webm_ctx->segment); const mkvparser::Cluster *cluster = reinterpret_cast(webm_ctx->cluster); const mkvparser::Block *block = reinterpret_cast(webm_ctx->block); const mkvparser::BlockEntry *block_entry = reinterpret_cast(webm_ctx->block_entry); bool block_entry_eos = false; do { long status = 0; bool get_new_block = false; if (block_entry == NULL && !block_entry_eos) { status = cluster->GetFirst(block_entry); get_new_block = true; } else if (block_entry_eos || block_entry->EOS()) { cluster = segment->GetNext(cluster); if (cluster == NULL || cluster->EOS()) { *bytes_read = 0; webm_ctx->reached_eos = 1; return 1; } status = cluster->GetFirst(block_entry); block_entry_eos = false; get_new_block = true; } else if (block == NULL || webm_ctx->block_frame_index == block->GetFrameCount() || block->GetTrackNumber() != webm_ctx->video_track_index) { status = cluster->GetNext(block_entry, block_entry); if (block_entry == NULL || block_entry->EOS()) { block_entry_eos = true; continue; } get_new_block = true; } if (status || block_entry == NULL) { return -1; } if (get_new_block) { block = block_entry->GetBlock(); if (block == NULL) return -1; webm_ctx->block_frame_index = 0; } } while (block_entry_eos || block->GetTrackNumber() != webm_ctx->video_track_index); webm_ctx->cluster = cluster; webm_ctx->block_entry = block_entry; webm_ctx->block = block; const mkvparser::Block::Frame &frame = block->GetFrame(webm_ctx->block_frame_index); ++webm_ctx->block_frame_index; if (frame.len > static_cast(*buffer_size)) { delete[] * buffer; *buffer = new uint8_t[frame.len]; webm_ctx->buffer = *buffer; if (*buffer == NULL) { return -1; } *buffer_size = frame.len; } *bytes_read = frame.len; webm_ctx->timestamp_ns = block->GetTime(cluster); webm_ctx->is_key_frame = block->IsKey(); mkvparser::MkvReader *const reader = reinterpret_cast(webm_ctx->reader); return frame.Read(reader, *buffer) ? -1 : 0; } // Calculate the greatest common divisor between two numbers. static int gcd(int a, int b) { int remainder; while (b > 0) { remainder = a % b; a = b; b = remainder; } return a; } int webm_guess_framerate(struct WebmInputContext *webm_ctx, struct AvxInputContext *aom_ctx) { uint32_t i = 0; uint8_t *buffer = NULL; size_t buffer_size = 0; size_t bytes_read = 0; assert(webm_ctx->buffer == NULL); while (webm_ctx->timestamp_ns < 1000000000 && i < 50) { if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) { break; } ++i; } aom_ctx->framerate.numerator = (i - 1) * 1000000; aom_ctx->framerate.denominator = static_cast(webm_ctx->timestamp_ns / 1000); // Fraction might be represented in large numbers, like 49000000/980000 // for 50fps. Simplify as much as possible. 
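// For instance, gcd(49000000, 980000) == 980000, so that example collapses to
// 50/1, while a rate such as 30000/1001 is already in lowest terms.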
int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator); if (g != 0) { aom_ctx->framerate.numerator /= g; aom_ctx->framerate.denominator /= g; } delete[] buffer; webm_ctx->buffer = NULL; get_first_cluster(webm_ctx); webm_ctx->block = NULL; webm_ctx->block_entry = NULL; webm_ctx->block_frame_index = 0; webm_ctx->timestamp_ns = 0; webm_ctx->reached_eos = 0; return 0; } void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); } aom-3.12.1/common/webmdec.h000066400000000000000000000046101477627663500154510ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_WEBMDEC_H_ #define AOM_COMMON_WEBMDEC_H_ #include "common/tools_common.h" #ifdef __cplusplus extern "C" { #endif struct AvxInputContext; struct WebmInputContext { void *reader; void *segment; uint8_t *buffer; const void *cluster; const void *block_entry; const void *block; int block_frame_index; int video_track_index; int64_t timestamp_ns; int is_key_frame; int reached_eos; }; // Checks if the input is a WebM file. If so, initializes WebMInputContext so // that webm_read_frame can be called to retrieve a video frame. // Returns 1 on success and 0 on failure or input is not WebM file. // TODO(vigneshv): Refactor this function into two smaller functions specific // to their task. int file_is_webm(struct WebmInputContext *webm_ctx, struct AvxInputContext *aom_ctx); // Reads a WebM Video Frame. Memory for the buffer is created, owned and managed // by this function. For the first call, |buffer| should be NULL and // |*buffer_size| should be 0. Once all the frames are read and used, // webm_free() should be called, otherwise there will be a leak. // Parameters: // webm_ctx - WebmInputContext object // buffer - pointer where the frame data will be filled. // bytes_read - pointer to bytes read. // buffer_size - pointer to buffer size. // Return values: // 0 - Success // 1 - End of Stream // -1 - Error int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, size_t *bytes_read, size_t *buffer_size); // Guesses the frame rate of the input file based on the container timestamps. int webm_guess_framerate(struct WebmInputContext *webm_ctx, struct AvxInputContext *aom_ctx); // Resets the WebMInputContext. void webm_free(struct WebmInputContext *webm_ctx); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_WEBMDEC_H_ aom-3.12.1/common/webmenc.cc000066400000000000000000000171771477627663500156350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "common/webmenc.h" #include #include #include #include #include #include "common/av1_config.h" #include "third_party/libwebm/mkvmuxer/mkvmuxer.h" #include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h" #include "third_party/libwebm/mkvmuxer/mkvwriter.h" namespace { const uint64_t kDebugTrackUid = 0xDEADBEEF; const int kVideoTrackNumber = 1; // Simplistic mechanism to detect if an argv parameter refers to // an input or output file. Returns the total number of arguments that // should be skipped. int skip_input_output_arg(const char *arg, const char *input_fname) { if (strcmp(arg, input_fname) == 0) { return 1; } if (strcmp(arg, "-o") == 0 || strcmp(arg, "--output") == 0) { return 2; } if (strncmp(arg, "--output=", strlen("--output=")) == 0) { return 1; } return 0; } } // namespace char *extract_encoder_settings(const char *version, const char **argv, int argc, const char *input_fname) { // + 9 for "version:" prefix and for null terminator. size_t total_size = strlen(version) + 9; int i = 1; while (i < argc) { int num_skip = skip_input_output_arg(argv[i], input_fname); i += num_skip; if (num_skip == 0) { total_size += strlen(argv[i]) + 1; // + 1 is for space separator. ++i; } } char *result = static_cast(malloc(total_size)); if (result == nullptr) { return nullptr; } char *cur = result; cur += snprintf(cur, total_size, "version:%s", version); i = 1; while (i < argc) { int num_skip = skip_input_output_arg(argv[i], input_fname); i += num_skip; if (num_skip == 0) { cur += snprintf(cur, total_size, " %s", argv[i]); ++i; } } *cur = '\0'; return result; } int write_webm_file_header(struct WebmOutputContext *webm_ctx, aom_codec_ctx_t *encoder_ctx, const aom_codec_enc_cfg_t *cfg, stereo_format_t stereo_fmt, unsigned int fourcc, const struct AvxRational *par, const char *encoder_settings) { std::unique_ptr writer( new (std::nothrow) mkvmuxer::MkvWriter(webm_ctx->stream)); std::unique_ptr segment(new (std::nothrow) mkvmuxer::Segment()); if (writer == nullptr || segment == nullptr) { fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n"); return -1; } bool ok = segment->Init(writer.get()); if (!ok) { fprintf(stderr, "webmenc> mkvmuxer Init failed.\n"); return -1; } segment->set_mode(mkvmuxer::Segment::kFile); segment->OutputCues(true); mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo(); if (!info) { fprintf(stderr, "webmenc> Cannot retrieve Segment Info.\n"); return -1; } const uint64_t kTimecodeScale = 1000000; info->set_timecode_scale(kTimecodeScale); std::string version = "aomenc"; if (!webm_ctx->debug) { version.append(std::string(" ") + aom_codec_version_str()); } info->set_writing_app(version.c_str()); const uint64_t video_track_id = segment->AddVideoTrack(static_cast(cfg->g_w), static_cast(cfg->g_h), kVideoTrackNumber); mkvmuxer::VideoTrack *const video_track = static_cast( segment->GetTrackByNumber(video_track_id)); if (!video_track) { fprintf(stderr, "webmenc> Video track creation failed.\n"); return -1; } ok = false; aom_fixed_buf_t *obu_sequence_header = aom_codec_get_global_headers(encoder_ctx); if (obu_sequence_header) { Av1Config av1_config; if (get_av1config_from_obu( reinterpret_cast(obu_sequence_header->buf), obu_sequence_header->sz, false, &av1_config) == 0) { uint8_t av1_config_buffer[4] = { 0 }; size_t bytes_written = 0; if (write_av1config(&av1_config, sizeof(av1_config_buffer), &bytes_written, av1_config_buffer) == 0) { ok = video_track->SetCodecPrivate(av1_config_buffer, sizeof(av1_config_buffer)); } } free(obu_sequence_header->buf); 
free(obu_sequence_header); } if (!ok) { fprintf(stderr, "webmenc> Unable to set AV1 config.\n"); return -1; } ok = video_track->SetStereoMode(stereo_fmt); if (!ok) { fprintf(stderr, "webmenc> Unable to set stereo mode.\n"); return -1; } if (fourcc != AV1_FOURCC) { fprintf(stderr, "webmenc> Unsupported codec (unknown 4 CC).\n"); return -1; } video_track->set_codec_id("V_AV1"); if (par->numerator > 1 || par->denominator > 1) { // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type // to WebM format. const uint64_t display_width = static_cast( ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5); video_track->set_display_width(display_width); video_track->set_display_height(cfg->g_h); } if (encoder_settings != nullptr) { mkvmuxer::Tag *tag = segment->AddTag(); if (tag == nullptr) { fprintf(stderr, "webmenc> Unable to allocate memory for encoder settings tag.\n"); return -1; } ok = tag->add_simple_tag("ENCODER_SETTINGS", encoder_settings); if (!ok) { fprintf(stderr, "webmenc> Unable to allocate memory for encoder settings tag.\n"); return -1; } } if (webm_ctx->debug) { video_track->set_uid(kDebugTrackUid); } webm_ctx->writer = writer.release(); webm_ctx->segment = segment.release(); return 0; } int write_webm_block(struct WebmOutputContext *webm_ctx, const aom_codec_enc_cfg_t *cfg, const aom_codec_cx_pkt_t *pkt) { if (!webm_ctx->segment) { fprintf(stderr, "webmenc> segment is NULL.\n"); return -1; } mkvmuxer::Segment *const segment = reinterpret_cast(webm_ctx->segment); int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num / cfg->g_timebase.den; if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000; webm_ctx->last_pts_ns = pts_ns; if (!segment->AddFrame(static_cast(pkt->data.frame.buf), pkt->data.frame.sz, kVideoTrackNumber, pts_ns, pkt->data.frame.flags & AOM_FRAME_IS_KEY)) { fprintf(stderr, "webmenc> AddFrame failed.\n"); return -1; } return 0; } int write_webm_file_footer(struct WebmOutputContext *webm_ctx) { if (!webm_ctx->writer || !webm_ctx->segment) { fprintf(stderr, "webmenc> segment or writer NULL.\n"); return -1; } mkvmuxer::MkvWriter *const writer = reinterpret_cast(webm_ctx->writer); mkvmuxer::Segment *const segment = reinterpret_cast(webm_ctx->segment); const bool ok = segment->Finalize(); delete segment; delete writer; webm_ctx->writer = NULL; webm_ctx->segment = NULL; if (!ok) { fprintf(stderr, "webmenc> Segment::Finalize failed.\n"); return -1; } return 0; } aom-3.12.1/common/webmenc.h000066400000000000000000000046561477627663500154750ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #ifndef AOM_COMMON_WEBMENC_H_ #define AOM_COMMON_WEBMENC_H_ #include #include #include "tools_common.h" #include "aom/aom_encoder.h" #ifdef __cplusplus extern "C" { #endif struct WebmOutputContext { int debug; FILE *stream; int64_t last_pts_ns; void *writer; void *segment; }; /* Stereo 3D packed frame format */ enum { STEREO_FORMAT_MONO = 0, STEREO_FORMAT_LEFT_RIGHT = 1, STEREO_FORMAT_BOTTOM_TOP = 2, STEREO_FORMAT_TOP_BOTTOM = 3, STEREO_FORMAT_RIGHT_LEFT = 11 } UENUM1BYTE(stereo_format_t); // Simplistic mechanism to extract encoder settings, without having // to re-invoke the entire flag-parsing logic. It lists the codec version // and then copies the arguments as-is from argv, but skips the binary name, // any arguments that match the input filename, and the output flags "-o" // and "--output" (and the following argument for those flags). The caller // is responsible for free-ing the returned string. If there is insufficient // memory, it returns nullptr. char *extract_encoder_settings(const char *version, const char **argv, int argc, const char *input_fname); // The following functions wrap libwebm's mkvmuxer. All functions return 0 upon // success, or -1 upon failure. int write_webm_file_header(struct WebmOutputContext *webm_ctx, aom_codec_ctx_t *encoder_ctx, const aom_codec_enc_cfg_t *cfg, stereo_format_t stereo_fmt, unsigned int fourcc, const struct AvxRational *par, const char *encoder_settings); int write_webm_block(struct WebmOutputContext *webm_ctx, const aom_codec_enc_cfg_t *cfg, const aom_codec_cx_pkt_t *pkt); int write_webm_file_footer(struct WebmOutputContext *webm_ctx); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_WEBMENC_H_ aom-3.12.1/common/y4menc.c000066400000000000000000000104011477627663500152300ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include "common/rawenc.h" #include "common/y4menc.h" // Returns the Y4M name associated with the monochrome colorspace. static const char *monochrome_colorspace(unsigned int bit_depth) { switch (bit_depth) { case 8: return "Cmono"; case 9: return "Cmono9"; case 10: return "Cmono10"; case 12: return "Cmono12"; case 16: return "Cmono16"; default: assert(0); return NULL; } } // Return the Y4M name of the 8-bit colorspace, given the chroma position and // image format. static const char *colorspace8(aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) { switch (fmt) { case AOM_IMG_FMT_I444: return "C444"; case AOM_IMG_FMT_I422: return "C422"; default: if (csp == AOM_CSP_VERTICAL) { return "C420mpeg2 XYSCSS=420MPEG2"; } else if (csp == AOM_CSP_COLOCATED) { // Note that Y4M does not have a dedicated header for colocated chroma, // and that FFMPEG interprets C420 as C420jpeg. return "C420"; } else { return "C420jpeg"; } } } // Return the Y4M name of the colorspace, given the bit depth and image format. static const char *colorspace(unsigned int bit_depth, aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) { switch (bit_depth) { case 8: return colorspace8(csp, fmt); case 9: return fmt == AOM_IMG_FMT_I44416 ? 
"C444p9 XYSCSS=444P9" : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9" : "C420p9 XYSCSS=420P9"; case 10: return fmt == AOM_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10" : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10" : "C420p10 XYSCSS=420P10"; case 12: return fmt == AOM_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12" : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12" : "C420p12 XYSCSS=420P12"; case 14: return fmt == AOM_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14" : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14" : "C420p14 XYSCSS=420P14"; case 16: return fmt == AOM_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16" : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16" : "C420p16 XYSCSS=420P16"; default: assert(0); return NULL; } } int y4m_write_file_header(char *buf, size_t len, int width, int height, const struct AvxRational *framerate, int monochrome, aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, unsigned int bit_depth, aom_color_range_t range) { const char *color = monochrome ? monochrome_colorspace(bit_depth) : colorspace(bit_depth, csp, fmt); const char *color_range = ""; // Default assumption is studio range. if (range == AOM_CR_FULL_RANGE) { color_range = " XCOLORRANGE=FULL"; } return snprintf(buf, len, "YUV4MPEG2 W%d H%d F%d:%d Ip %s%s\n", width, height, framerate->numerator, framerate->denominator, color, color_range); } int y4m_write_frame_header(char *buf, size_t len) { return snprintf(buf, len, "FRAME\n"); } void y4m_write_image_file(const aom_image_t *img, const int *planes, FILE *file) { int num_planes = img->monochrome ? 1 : 3; raw_write_image_file(img, planes, num_planes, file); } void y4m_update_image_md5(const aom_image_t *img, const int *planes, MD5Context *md5) { int num_planes = img->monochrome ? 1 : 3; raw_update_image_md5(img, planes, num_planes, md5); } aom-3.12.1/common/y4menc.h000066400000000000000000000025771477627663500152540ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #ifndef AOM_COMMON_Y4MENC_H_ #define AOM_COMMON_Y4MENC_H_ #include "aom/aom_decoder.h" #include "common/md5_utils.h" #include "common/tools_common.h" #ifdef __cplusplus extern "C" { #endif #define Y4M_BUFFER_SIZE 256 int y4m_write_file_header(char *buf, size_t len, int width, int height, const struct AvxRational *framerate, int monochrome, aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, unsigned int bit_depth, aom_color_range_t range); int y4m_write_frame_header(char *buf, size_t len); void y4m_write_image_file(const aom_image_t *img, const int *planes, FILE *file); void y4m_update_image_md5(const aom_image_t *img, const int *planes, MD5Context *md5); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_Y4MENC_H_ aom-3.12.1/common/y4minput.c000066400000000000000000001242261477627663500156350ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. * * Based on code from the OggTheora software codec source code, * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. */ #include #include #include #include #include "aom/aom_integer.h" #include "y4minput.h" // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. // Returns true on success. static int file_read(void *buf, size_t size, FILE *file) { const int kMaxTries = 5; int try_count = 0; int file_error = 0; size_t len = 0; while (!feof(file) && len < size && try_count < kMaxTries) { const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); ++try_count; len += n; file_error = ferror(file); if (file_error) { if (errno == EINTR || errno == EAGAIN) { clearerr(file); continue; } else { fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n", (uint32_t)len, (uint32_t)size, errno, strerror(errno)); return 0; } } } if (!feof(file) && len != size) { fprintf(stderr, "Error reading file: %u of %u bytes read," " error: %d, tries: %d, %d: %s\n", (uint32_t)len, (uint32_t)size, file_error, try_count, errno, strerror(errno)); } return len == size; } // Stores the color range in 'y4m_ctx', returning 1 if successfully parsed, // 0 otherwise. static int parse_color_range(y4m_input *y4m_ctx, const char *buf) { // Note that default is studio range. if (strcmp(buf, "LIMITED") == 0) { return 1; } if (strcmp(buf, "FULL") == 0) { y4m_ctx->color_range = AOM_CR_FULL_RANGE; return 1; } fprintf(stderr, "Unknown color range value: %s\n", buf); return 0; } static int parse_metadata(y4m_input *y4m_ctx, const char *buf) { if (strncmp(buf, "COLORRANGE=", 11) == 0) { return parse_color_range(y4m_ctx, buf + 11); } return 1; // No support for other metadata, just ignore them. } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { char *p; char *q; for (p = _tags;; p = q) { /*Skip any leading spaces.*/ while (*p == ' ') p++; /*If that's all we have, stop.*/ if (p[0] == '\0') break; /*Find the end of this tag.*/ for (q = p + 1; *q != '\0' && *q != ' '; q++) { } /*Process the tag.*/ switch (p[0]) { case 'W': { if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; } break; case 'H': { if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; } break; case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } } break; case 'I': { _y4m->interlace = p[1]; } break; case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } } break; case 'C': { if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; } break; case 'X': { if (!parse_metadata(_y4m, p + 1)) return -1; } break; default: break; /*Ignore unknown tags.*/ } } return 0; } // Copy a single tag into the buffer, along with a null character. // Returns 0 if any file IO errors occur. static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { size_t i; assert(buf_len >= 1); // Skip leading space characters. do { if (!file_read(buf, 1, file)) { return 0; } } while (buf[0] == ' '); // If we hit the newline, treat this as the "empty" tag. if (buf[0] == '\n') { buf[0] = '\0'; *end_tag = '\n'; return 1; } // Copy over characters until a space is hit, or the buffer is exhausted. 
for (i = 1; i < buf_len; ++i) { if (!file_read(buf + i, 1, file)) { return 0; } if (buf[i] == ' ' || buf[i] == '\n') { break; } } if (i == buf_len) { fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", (unsigned long)i); return 0; } *end_tag = buf[i]; buf[i] = '\0'; return 1; } // Returns 1 if tags were parsed successfully, 0 otherwise. static int parse_tags(y4m_input *y4m_ctx, FILE *file) { char tag[256]; char end; // Character denoting the end of the tag, ' ' or '\n'. // Set Y4M tags to defaults, updating them as processing occurs. Mandatory // fields are marked with -1 and will be checked after the tags are parsed. y4m_ctx->pic_w = -1; y4m_ctx->pic_h = -1; y4m_ctx->fps_n = -1; // Also serves as marker for fps_d y4m_ctx->par_n = 0; y4m_ctx->par_d = 0; y4m_ctx->interlace = '?'; y4m_ctx->color_range = AOM_CR_STUDIO_RANGE; snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); // Find one tag at a time. do { if (!copy_tag(tag, sizeof(tag), &end, file)) { return 0; } // y4m_parse_tags returns 0 on success. if (y4m_parse_tags(y4m_ctx, tag)) { return 0; } } while (end != '\n'); // Check the mandatory fields. if (y4m_ctx->pic_w == -1) { fprintf(stderr, "Width field missing\n"); return 0; } if (y4m_ctx->pic_h == -1) { fprintf(stderr, "Height field missing\n"); return 0; } if (y4m_ctx->fps_n == -1) { fprintf(stderr, "FPS field missing\n"); return 0; } return 1; } /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): sinc(\pi*t)*sinc(\pi*t/3), |t|<3 (sinc(t)==sin(t)/t) 0, |t|>=3 The 4-tap Mitchell window (for up-sampling): 7|t|^3-12|t|^2+16/3, |t|<1 -(7/3)|x|^3+12|x|^2-20|x|+32/3, |t|<2 0, |t|>=2 The number of taps is intentionally kept small to reduce computational overhead and limit ringing. The taps from these filters are scaled so that their sum is 1, and the result is scaled by 128 and rounded to integers to create a filter whose intermediate values fit inside 16 bits. Coefficients are rounded in such a way as to ensure their sum is still 128, which is usually equivalent to normal rounding. Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | 420mpeg2 chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | BR | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | BR | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | We use a resampling filter to shift the site locations one quarter pixel (at the chroma plane's resolution) to the right. 
The 4:2:2 modes look exactly the same, except there are twice as many chroma lines, and they are vertically co-sited with the luma samples in both the mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, const unsigned char *_src, int _c_w, int _c_h) { int y; int x; for (y = 0; y < _c_h; y++) { /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { _dst[x] = (unsigned char)OC_CLAMPI( 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> 7, 255); } for (; x < _c_w - 3; x++) { _dst[x] = (unsigned char)OC_CLAMPI( 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> 7, 255); } for (; x < _c_w; x++) { _dst[x] = (unsigned char)OC_CLAMPI( 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> 7, 255); } _dst += _c_w; _src += _c_w; } } /*This format is only used for interlaced content, but is included for completeness. 420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | 420paldv chroma samples are sited like: YR------Y-------YR------Y------- | | | | | | | | | | | | YB------Y-------YB------Y------- | | | | | | | | | | | | YR------Y-------YR------Y------- | | | | | | | | | | | | YB------Y-------YB------Y------- | | | | | | | | | | | | We use a resampling filter to shift the site locations one quarter pixel (at the chroma plane's resolution) to the right. Then we use another filter to move the C_r location down one quarter pixel, and the C_b location up one quarter pixel.*/ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; int c_w; int c_h; int c_sz; int pli; int y; int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ c_w = (_y4m->pic_w + 1) / 2; c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; c_sz = c_w * c_h; tmp = _aux + 2 * c_sz; for (pli = 1; pli < 3; pli++) { /*First do the horizontal re-sampling. This is the same as the mpeg2 case, except that after the horizontal case, we need to apply a second vertical filter.*/ y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); _aux += c_sz; switch (pli) { case 1: { /*Slide C_b up a quarter-pel. 
This is the same filter used above, but in the other order.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> 7, 255); } for (; y < c_h - 2; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> 7, 255); } for (; y < c_h; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + 4 * tmp[(c_h - 1) * c_w] + 64) >> 7, 255); } _dst++; tmp++; } _dst += c_sz - c_w; tmp -= c_w; } break; case 2: { /*Slide C_r down a quarter-pel. This is the same as the horizontal filter.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 2); y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> 7, 255); } for (; y < c_h - 3; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> 7, 255); } for (; y < c_h; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + 64) >> 7, 255); } _dst++; tmp++; } } break; } /*For actual interlaced material, this would have to be done separately on each field, and the shift amounts would be different. C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8, C_b up 1/8 in the bottom field. The corresponding filters would be: Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128 Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/ } } /*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. 
This is used as a helper by several conversion routines.*/ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, const unsigned char *_src, int _c_w, int _c_h) { int y; int x; /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { _dst[(y >> 1) * _c_w] = OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> 7, 255); } for (; y < _c_h - 3; y += 2) { _dst[(y >> 1) * _c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> 7, 255); } for (; y < _c_h; y += 2) { _dst[(y >> 1) * _c_w] = OC_CLAMPI( 0, (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + 64) >> 7, 255); } _src++; _dst++; } } /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | 422jpeg chroma samples are sited like: Y---BR--Y-------Y---BR--Y------- | | | | | | | | | | | | Y---BR--Y-------Y---BR--Y------- | | | | | | | | | | | | Y---BR--Y-------Y---BR--Y------- | | | | | | | | | | | | Y---BR--Y-------Y---BR--Y------- | | | | | | | | | | | | We use a resampling filter to decimate the chroma planes by two in the vertical direction.*/ static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { int c_w; int c_h; int c_sz; int dst_c_w; int dst_c_h; int dst_c_sz; int pli; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; c_h = _y4m->pic_h; dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; c_sz = c_w * c_h; dst_c_sz = dst_c_w * dst_c_h; for (pli = 1; pli < 3; pli++) { y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h); _aux += c_sz; _dst += dst_c_sz; } } /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | 422 chroma samples are sited like: YBR-----Y-------YBR-----Y------- | | | | | | | | | | | | YBR-----Y-------YBR-----Y------- | | | | | | | | | | | | YBR-----Y-------YBR-----Y------- | | | | | | | | | | | | YBR-----Y-------YBR-----Y------- | | | | | | | | | | | | We use a resampling filter to shift the original site locations one quarter pixel (at the original chroma resolution) to the right. 
Then we use a second resampling filter to decimate the chroma planes by two in the vertical direction.*/ static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; int c_w; int c_h; int c_sz; int dst_c_h; int dst_c_sz; int pli; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; c_h = _y4m->pic_h; dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; c_sz = c_w * c_h; dst_c_sz = c_w * dst_c_h; tmp = _aux + 2 * c_sz; for (pli = 1; pli < 3; pli++) { /*In reality, the horizontal and vertical steps could be pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ /*First do horizontal filtering (convert to 422jpeg)*/ y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); /*Now do the vertical filtering.*/ y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h); _aux += c_sz; _dst += dst_c_sz; } } /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | Y-------Y-------Y-------Y------- | | | | | BR | | BR | | | | | Y-------Y-------Y-------Y------- | | | | | | | | | | | | 411 chroma samples are sited like: YBR-----Y-------Y-------Y------- | | | | | | | | | | | | YBR-----Y-------Y-------Y------- | | | | | | | | | | | | YBR-----Y-------Y-------Y------- | | | | | | | | | | | | YBR-----Y-------Y-------Y------- | | | | | | | | | | | | We use a filter to resample at site locations one eighth pixel (at the source chroma plane's horizontal resolution) and five eighths of a pixel to the right. Then we use another filter to decimate the planes by 2 in the vertical direction.*/ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; int c_w; int c_h; int c_sz; int dst_c_w; int dst_c_h; int dst_c_sz; int tmp_sz; int pli; int y; int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; c_h = _y4m->pic_h; dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; c_sz = c_w * c_h; dst_c_sz = dst_c_w * dst_c_h; tmp_sz = dst_c_w * c_h; tmp = _aux + 2 * c_sz; for (pli = 1; pli < 3; pli++) { /*In reality, the horizontal and vertical steps could be pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ /*First do horizontal filtering (convert to 422jpeg)*/ for (y = 0; y < c_h; y++) { /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); } for (; x < c_w - 2; x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - 5 * _aux[x + 2] + 64) >> 7, 255); } for (; x < c_w; x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( 0, (_aux[x - 1] + 110 * 
_aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> 7, 255); if ((x << 1 | 1) < dst_c_w) { tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> 7, 255); } } tmp += dst_c_w; _aux += c_w; } tmp -= tmp_sz; /*Now do the vertical filtering.*/ y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); _dst += dst_c_sz; } } /*Convert 444 to 420jpeg.*/ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; int c_w; int c_h; int c_sz; int dst_c_w; int dst_c_h; int dst_c_sz; int tmp_sz; int pli; int y; int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; c_h = _y4m->pic_h; dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; c_sz = c_w * c_h; dst_c_sz = dst_c_w * dst_c_h; tmp_sz = dst_c_w * c_h; tmp = _aux + 2 * c_sz; for (pli = 1; pli < 3; pli++) { /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { tmp[x >> 1] = OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - 17 * _aux[OC_MINI(2, c_w - 1)] + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> 7, 255); } for (; x < c_w - 3; x += 2) { tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - 17 * (_aux[x - 1] + _aux[x + 2]) + 78 * (_aux[x] + _aux[x + 1]) + 64) >> 7, 255); } for (; x < c_w; x += 2) { tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> 7, 255); } tmp += dst_c_w; _aux += c_w; } tmp -= tmp_sz; /*Now do the vertical filtering.*/ y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); _dst += dst_c_sz; } } /*The image is padded with empty chroma components at 4:2:0.*/ static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { int c_sz; (void)_aux; _dst += _y4m->pic_w * _y4m->pic_h; c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); memset(_dst, 128, c_sz * 2); } /*No conversion function needed.*/ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { (void)_y4m; (void)_dst; (void)_aux; } static const char TAG[] = "YUV4MPEG2"; int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, aom_chroma_sample_position_t csp, int only_420) { // File must start with |TAG|. char tag_buffer[9]; // 9 == strlen(TAG) // Read as much as possible from |skip_buffer|, which were characters // that were previously read from the file to do input-type detection. assert(num_skip >= 0 && num_skip <= 8); if (num_skip > 0) { memcpy(tag_buffer, skip_buffer, num_skip); } // Start reading from the file now that the |skip_buffer| is depleted. if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { return -1; } if (memcmp(TAG, tag_buffer, 9) != 0) { fprintf(stderr, "Error parsing header: must start with %s\n", TAG); return -1; } // Next character must be a space. 
if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); return -1; } if (!parse_tags(y4m_ctx, file)) { fprintf(stderr, "Error parsing %s header.\n", TAG); return -1; } if (y4m_ctx->interlace == '?') { fprintf(stderr, "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); } else if (y4m_ctx->interlace != 'p') { fprintf(stderr, "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } /* Only support vertical chroma sample position if the input format is * already 420mpeg2. Colocated is not supported in Y4M. */ if (csp == AOM_CSP_VERTICAL && strcmp(y4m_ctx->chroma_type, "420mpeg2") != 0) { fprintf(stderr, "Vertical chroma sample position only supported " "for 420mpeg2 input\n"); return -1; } if (csp == AOM_CSP_COLOCATED) { // TODO(any): check the right way to handle this in y4m fprintf(stderr, "Ignoring colocated chroma sample position for reading in Y4M\n"); } y4m_ctx->aom_fmt = AOM_IMG_FMT_I420; y4m_ctx->bps = 12; y4m_ctx->bit_depth = 8; y4m_ctx->aux_buf = NULL; y4m_ctx->dst_buf = NULL; if (strcmp(y4m_ctx->chroma_type, "420") == 0 || strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); /* Natively supported: no conversion required. */ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { y4m_ctx->src_c_dec_h = 2; y4m_ctx->dst_c_dec_h = 2; y4m_ctx->src_c_dec_v = 2; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; y4m_ctx->bit_depth = 10; y4m_ctx->bps = 15; y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { y4m_ctx->src_c_dec_h = 2; y4m_ctx->dst_c_dec_h = 2; y4m_ctx->src_c_dec_v = 2; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; y4m_ctx->bit_depth = 12; y4m_ctx->bps = 18; y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. 
We need to make two filter passes, so we need some extra space in the aux buffer.*/ y4m_ctx->aux_buf_sz = 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; y4m_ctx->src_c_dec_v = 1; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { y4m_ctx->src_c_dec_h = 2; y4m_ctx->src_c_dec_v = 1; if (only_420) { y4m_ctx->dst_c_dec_h = 2; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; y4m_ctx->convert = y4m_convert_422_420jpeg; } else { y4m_ctx->aom_fmt = AOM_IMG_FMT_I422; y4m_ctx->bps = 16; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; } } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { y4m_ctx->src_c_dec_h = 2; y4m_ctx->src_c_dec_v = 1; y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216; y4m_ctx->bps = 20; y4m_ctx->bit_depth = 10; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { y4m_ctx->src_c_dec_h = 2; y4m_ctx->src_c_dec_v = 1; y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216; y4m_ctx->bps = 24; y4m_ctx->bit_depth = 12; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { y4m_ctx->src_c_dec_h = 4; y4m_ctx->dst_c_dec_h = 2; y4m_ctx->src_c_dec_v = 1; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. 
We need to make two filter passes, so we need some extra space in the aux buffer.*/ y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; y4m_ctx->convert = y4m_convert_411_420jpeg; } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { y4m_ctx->src_c_dec_h = 1; y4m_ctx->src_c_dec_v = 1; if (only_420) { y4m_ctx->dst_c_dec_h = 2; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; y4m_ctx->convert = y4m_convert_444_420jpeg; } else { y4m_ctx->aom_fmt = AOM_IMG_FMT_I444; y4m_ctx->bps = 24; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; } } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { y4m_ctx->src_c_dec_h = 1; y4m_ctx->src_c_dec_v = 1; y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416; y4m_ctx->bps = 30; y4m_ctx->bit_depth = 10; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { y4m_ctx->src_c_dec_h = 1; y4m_ctx->src_c_dec_v = 1; y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416; y4m_ctx->bps = 36; y4m_ctx->bit_depth = 12; y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "444alpha") == 0) { y4m_ctx->src_c_dec_h = 1; y4m_ctx->src_c_dec_v = 1; if (only_420) { y4m_ctx->dst_c_dec_h = 2; y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer. The extra plane also gets read into the aux buf. 
It will be discarded.*/ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; y4m_ctx->convert = y4m_convert_444_420jpeg; } else { fprintf(stderr, "Unsupported format: 444A\n"); return -1; } } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*No extra space required, but we need to clear the chroma planes.*/ y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; y4m_ctx->convert = y4m_convert_mono_420jpeg; } else { fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); return -1; } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ y4m_ctx->dst_buf_sz = y4m_ctx->pic_w * y4m_ctx->pic_h + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); if (y4m_ctx->bit_depth == 8) y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); if (!y4m_ctx->dst_buf) return -1; if (y4m_ctx->aux_buf_sz > 0) { y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); if (!y4m_ctx->aux_buf) { free(y4m_ctx->dst_buf); return -1; } } return 0; } void y4m_input_close(y4m_input *_y4m) { free(_y4m->dst_buf); free(_y4m->aux_buf); } int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) { char frame[6]; int pic_sz; int c_w; int c_h; int c_sz; int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; /*Read and skip the frame header.*/ if (!file_read(frame, 6, _fin)) return 0; if (memcmp(frame, "FRAME", 5)) { fprintf(stderr, "Loss of framing in Y4M input data\n"); return -1; } if (frame[5] != '\n') { char c; int j; for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { } if (j == 79) { fprintf(stderr, "Error parsing Y4M frame header\n"); return -1; } } /*Read the frame data that needs no conversion.*/ if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) { fprintf(stderr, "Error reading Y4M frame data.\n"); return -1; } /*Read the frame data that does need conversion.*/ if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) { fprintf(stderr, "Error reading Y4M frame data.\n"); return -1; } /*Now convert the just read frame.*/ (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf); /*Fill in the frame buffer pointers. 
We don't use aom_img_wrap() because it forces padding for odd picture sizes, which would require a separate fread call for every row.*/ memset(_img, 0, sizeof(*_img)); /*Y4M has the planes in Y'CbCr order, which libaom calls Y, U, and V.*/ _img->fmt = _y4m->aom_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps; /*Set up the buffer pointers.*/ pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample; c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; c_w *= bytes_per_sample; c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; c_sz = c_w * c_h; _img->stride[AOM_PLANE_Y] = _y4m->pic_w * bytes_per_sample; _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = c_w; _img->planes[AOM_PLANE_Y] = _y4m->dst_buf; _img->planes[AOM_PLANE_U] = _y4m->dst_buf + pic_sz; _img->planes[AOM_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz; return 1; } aom-3.12.1/common/y4minput.h000066400000000000000000000050751477627663500156420ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. * * Based on code from the OggTheora software codec source code, * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. */ #ifndef AOM_COMMON_Y4MINPUT_H_ #define AOM_COMMON_Y4MINPUT_H_ #include #include "aom/aom_image.h" #ifdef __cplusplus extern "C" { #endif typedef struct y4m_input y4m_input; /*The function used to perform chroma conversion.*/ typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst, unsigned char *_src); struct y4m_input { int pic_w; int pic_h; int fps_n; int fps_d; int par_n; int par_d; char interlace; int src_c_dec_h; int src_c_dec_v; int dst_c_dec_h; int dst_c_dec_v; char chroma_type[16]; /*The size of each converted frame buffer.*/ size_t dst_buf_sz; /*The amount to read directly into the converted frame buffer.*/ size_t dst_buf_read_sz; /*The size of the auxilliary buffer.*/ size_t aux_buf_sz; /*The amount to read into the auxilliary buffer.*/ size_t aux_buf_read_sz; y4m_convert_func convert; unsigned char *dst_buf; unsigned char *aux_buf; enum aom_img_fmt aom_fmt; int bps; unsigned int bit_depth; aom_color_range_t color_range; }; /** * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after * reading it. Note that |csp| should only be set for 420 input, and the input * chroma is shifted if necessary. The code does not support the conversion * from co-located to vertical. The |skip_buffer| indicates bytes that were * previously read from |file|, to do input-type detection; this buffer will * be read before the |file| is read. It is of size |num_skip|, which *must* * be 8 or less. * * Returns 0 on success, -1 on failure. 
*/ int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, int num_skip, aom_chroma_sample_position_t csp, int only_420); void y4m_input_close(y4m_input *_y4m); int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img); #ifdef __cplusplus } // extern "C" #endif #endif // AOM_COMMON_Y4MINPUT_H_ aom-3.12.1/doc/000077500000000000000000000000001477627663500131465ustar00rootroot00000000000000aom-3.12.1/doc/AlgorithmDescription.md000066400000000000000000001162401477627663500176260ustar00rootroot00000000000000
Algorithm Description
# Abstract This document describes technical aspects of coding tools included in the associated codec. This document is not a specification of the associated codec. Instead, it summarizes the highlighted features of coding tools for new developers. This document should be updated when significant new normative changes have been integrated into the associated codec. # Table of Contents [Abbreviations](#Abbreviations) [Algorithm description](#Algorithm-Description) - [Block Partitioning](#Block-Partitioning) - [Coding block partition](#Coding-block-partition) - [Transform block partition](#Transform-block-partition) - [Intra Prediction](#Intra-Prediction) - [Directional intra prediction modes](#Directional-intra-prediction-modes) - [Non-directional intra prediction modes](#Non-directional-intra-prediction-modes) - [Recursive filtering modes](#Recursive-filtering-modes) - [Chroma from Luma mode](#Chroma-from-Luma-mode) - [Inter Prediction](#Inter-Prediction) - [Motion vector prediction](#Motion-vector-prediction) - [Motion vector coding](#Motion-vector-coding) - [Interpolation filter for motion compensation](#Interpolation-filter-for-motion-compensation) - [Warped motion compensation](#Warped-motion-compensation) - [Overlapped block motion compensation](#Overlapped-block-motion-compensation) - [Reference frames](#Reference-frames) - [Compound Prediction](#Compound-Prediction) - [Transform](#Transform) - [Quantization](#Quantization) - [Entropy Coding](#Entropy-Coding) - [Loop filtering and post-processing](#Loop-filtering-and-post-processing) - [Deblocking](#Deblocking) - [Constrained directional enhancement](#Constrained-directional-enhancement) - [Loop Restoration filter](#Loop-Restoration-filter) - [Frame super-resolution](#Frame-super-resolution) - [Film grain synthesis](#Film-grain-synthesis) - [Screen content coding](#Screen-content-coding) - [Intra block copy](#Intra-block-copy) - [Palette mode](#Palette-mode) [References](#References) # Abbreviations CfL: Chroma from Luma\ IntraBC: Intra block copy\ LCU: Largest coding unit\ OBMC: Overlapped Block Motion Compensation\ CDEF: Constrained Directional Enhancement Filter # Algorithm Description ## Block Partitioning ### Coding block partition The largest coding block unit (LCU) applied in this codec is 128×128. In addition to no split mode `PARTITION_NONE`, the partition tree supports 9 different partitioning patterns, as shown in below figure.
Partition
Figure 1: Supported coding block partitions
According to the number of sub-partitions, the 9 partition modes are summarized as follows:

1. Four partitions: `PARTITION_SPLIT`, `PARTITION_VERT_4`, `PARTITION_HORZ_4`
2. Three partitions (T-Shape): `PARTITION_HORZ_A`, `PARTITION_HORZ_B`, `PARTITION_VERT_A`, `PARTITION_VERT_B`
3. Two partitions: `PARTITION_HORZ`, `PARTITION_VERT`

Among all the 9 partitioning patterns, only the `PARTITION_SPLIT` mode supports recursive partitioning, i.e., its sub-partitions can be further split; the other partitioning modes cannot be split further. In particular, for 8x8 and 128x128 blocks, `PARTITION_VERT_4` and `PARTITION_HORZ_4` are not used, and for 8x8 blocks, T-Shape partitions are not used either.

### Transform block partition

For both intra and inter coded blocks, the coding block can be further partitioned into multiple transform units with a partitioning depth of up to 2 levels. The mapping from the transform size of the current depth to the transform size of the next depth is shown in the following Table 1.
Table 1: Transform partition size setting
Partition
Furthermore, for intra coded blocks, the transform partition is done in such a way that all the transform blocks have the same size, and the transform blocks are coded in raster scan order. An example of the transform block partitioning for an intra coded block is shown in Figure 2.
Partition
Figure 2: Example of transform partitioning for intra coded block
For inter coded blocks, the transform unit partitioning can be done in a recursive manner with a partitioning depth of up to 2 levels. The transform partitioning supports 1:1 (square), 1:2/2:1, and 1:4/4:1 transform unit sizes ranging from 4×4 to 64×64. If the coding block is smaller than or equal to 64x64, the transform block partitioning can only be applied to the luma component; for chroma blocks, the transform block size is identical to the coding block size. Otherwise, if the coding block width or height is greater than 64, then both the luma and chroma coding blocks are implicitly split into multiples of min(W, 64) x min(H, 64) and min(W, 32) x min(H, 32) transform blocks, respectively.
Partition
Figure 3: Example of transform partitioning for inter coded block
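The implicit split rule for oversized coding blocks can be summarized with a small sketch (illustrative only, not libaom code); the block dimensions used below are example values.

```c
// Illustrative sketch: implicit transform block sizes for coding blocks wider
// or taller than 64 samples, following the rule described above.
// Luma splits into min(W, 64) x min(H, 64); chroma into min(W, 32) x min(H, 32).
#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }

int main(void) {
  const int block_w = 128, block_h = 64;  // example coding block size
  printf("luma transform blocks:   %dx%d\n", imin(block_w, 64), imin(block_h, 64));
  printf("chroma transform blocks: %dx%d\n", imin(block_w, 32), imin(block_h, 32));
  return 0;
}
```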
## Intra Prediction

### Directional intra prediction modes

Directional intra prediction modes are applied in intra prediction, which models local textures using a given direction pattern. Directional intra prediction modes are represented by nominal modes and an angle delta. The nominal modes are a similar set of intra prediction angles to those used in VP9, which includes 8 angles. The angle delta index ranges from -3 to +3, and a zero angle delta indicates a nominal mode. The prediction angle is represented by a nominal intra angle plus an angle delta. In total, there are 56 directional intra prediction modes, as shown in the following figure, where solid arrows indicate directional intra prediction modes and dotted arrows represent non-zero angle deltas.
Directional intra
Figure 4: Directional intra prediction modes
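The relationship between nominal modes and angle deltas can be illustrated with a short sketch (not libaom code); the listed nominal angles and the 3-degree step per delta unit are assumptions used for illustration.

```c
// Illustrative sketch: derive a directional prediction angle from a nominal
// angle plus an angle delta. Assumed: 8 nominal angles and a 3-degree step.
#include <stdio.h>

int main(void) {
  const int nominal_angle[8] = { 45, 67, 90, 113, 135, 157, 180, 203 };
  const int angle_step = 3;  // assumed degrees per angle-delta unit
  for (int mode = 0; mode < 8; mode++) {
    for (int delta = -3; delta <= 3; delta++) {
      const int angle = nominal_angle[mode] + angle_step * delta;
      printf("nominal %3d, delta %+d -> %3d degrees\n",
             nominal_angle[mode], delta, angle);
    }
  }
  return 0;  // 8 nominal modes x 7 deltas = 56 directional modes
}
```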
The nominal mode index and the angle delta index are signalled separately, with the nominal mode index signalled before the associated angle delta index. Note that for small block sizes, where the coding gain from extending intra prediction angles may saturate, only the nominal modes are used and the angle delta index is not signalled.

### Non-directional intra prediction modes

In addition to directional intra prediction modes, four non-directional intra modes which simulate smooth textures are also included: `SMOOTH_V`, `SMOOTH_H`, `SMOOTH` and the `PAETH predictor`. In the `SMOOTH_V`, `SMOOTH_H` and `SMOOTH` modes, the prediction values are generated using quadratic interpolation along the vertical direction, the horizontal direction, or the average thereof. The samples used in the quadratic interpolation include reconstructed samples from the top and left neighboring blocks, as well as samples from the right and bottom boundaries, which are approximated by the top and left reconstructed samples. In `PAETH predictor` mode, the prediction for each sample is assigned the value of the top (T), left (L) or top-left (TL) reference sample that is closest to the Paeth predictor value, i.e., T + L - TL. The samples used in the `PAETH predictor` are illustrated in the figure below.
Directional
intra
Figure 5: Paeth predictor
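The selection rule of the Paeth predictor described above can be sketched as follows (illustrative C, not the libaom implementation); the tie-breaking order shown is an assumption.

```c
// Illustrative sketch of the Paeth predictor: each sample takes the value of
// the top (T), left (L) or top-left (TL) reference sample that is closest to
// the base value T + L - TL.
#include <stdio.h>
#include <stdlib.h>

static int paeth_predict(int top, int left, int top_left) {
  const int base = top + left - top_left;
  const int d_top = abs(base - top);
  const int d_left = abs(base - left);
  const int d_top_left = abs(base - top_left);
  // Return the reference sample closest to base (assumed tie order: L, T, TL).
  if (d_left <= d_top && d_left <= d_top_left) return left;
  if (d_top <= d_top_left) return top;
  return top_left;
}

int main(void) {
  // e.g., T = 130, L = 120, TL = 125 -> base = 125, so TL is selected.
  printf("%d\n", paeth_predict(130, 120, 125));
  return 0;
}
```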
### Recursive filtering modes

Five filtering intra modes are defined, and each mode specifies a set of eight 7-tap filters. Given the selected filtering mode index (0~4), the current block is divided into 4x2 sub-blocks. For each 4x2 sub-block, every sample is predicted by 7-tap interpolation using the 7 top and left neighboring samples as inputs. Different filters are applied to samples located at different coordinates within a 4x2 sub-block. The prediction process is performed recursively in units of 4x2 sub-blocks, which means that prediction samples generated for one 4x2 sub-block can be used to predict another 4x2 sub-block.
Directional intra
Figure 6: Recursive filtering modes
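A minimal sketch of one recursive-filtering step is given below (not libaom code); the neighbor layout (5 samples above, 2 to the left) and the weight table are placeholders rather than the normative filters.

```c
// Illustrative sketch: predict one 4x2 sub-block, where each of the eight
// output samples uses its own 7-tap filter applied to the 7 top/left
// neighboring samples. Weights and precision below are placeholder assumptions.
#include <stdint.h>

static void filter_intra_4x2(const int16_t neighbors[7],
                             const int16_t weights[8][7],  // eight 7-tap filters
                             int16_t out[2][4]) {
  for (int r = 0; r < 2; r++) {
    for (int c = 0; c < 4; c++) {
      const int idx = r * 4 + c;  // which of the eight 7-tap filters to use
      int32_t sum = 0;
      for (int k = 0; k < 7; k++) sum += weights[idx][k] * neighbors[k];
      out[r][c] = (int16_t)((sum + 64) >> 7);  // assumed 7-bit filter precision
    }
  }
}
```

The outputs of one 4x2 sub-block can then serve as the top/left neighbors of the next sub-block, which is what makes the prediction recursive.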
### Chroma from Luma mode

Chroma from Luma (CfL) is a chroma intra prediction mode which models chroma samples as a linear function of co-located reconstructed luma samples. To align the resolution between luma and chroma samples for different chroma sampling formats, e.g., 4:2:0 and 4:2:2, reconstructed luma pixels may need to be sub-sampled before being used in CfL mode. In addition, the DC component is removed to form the AC contribution. In CfL mode, the model parameters which specify the linear function between the two color components are optimized by the encoder and signalled in the bitstream.
Directional
intra
Figure 7: CfL prediction
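The CfL model described above can be sketched as follows (illustrative only, not libaom code); the floating-point alpha and the simple rounding are assumptions, as the normative computation uses fixed-point arithmetic.

```c
// Illustrative sketch of CfL: chroma prediction = chroma DC prediction +
// alpha * (subsampled reconstructed luma with its DC component removed).
static void cfl_predict(const int *luma, int n, double alpha, int chroma_dc,
                        int *chroma_pred) {
  // Remove the DC component of the (already subsampled) luma block.
  long sum = 0;
  for (int i = 0; i < n; i++) sum += luma[i];
  const double luma_avg = (double)sum / n;
  // Model chroma as a linear function of the luma AC contribution.
  for (int i = 0; i < n; i++)
    chroma_pred[i] = chroma_dc + (int)(alpha * ((double)luma[i] - luma_avg));
}
```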
## Inter Prediction

### Motion vector prediction

Motion vectors are predicted from neighboring blocks, which can be either spatial neighboring blocks or temporal neighboring blocks located in a reference frame. A set of MV predictors is identified by checking all these blocks and is utilized to encode the motion vector information.

**Spatial motion vector prediction**

There are two sets of spatial neighboring blocks that can be utilized for finding spatial MV predictors: the adjacent spatial neighbors, which are the direct top and left neighbors of the current block, and the outer spatial neighbors, which are close but not directly adjacent to the current block. The two sets of spatial neighboring blocks are illustrated in the example shown in Figure 8.
Directional intra
Figure 8: Adjacent and non-adjacent spatial neighboring blocks used for motion vector prediction
For each set of spatial neighbors, the top row is checked from left to right and then the left column is checked from top to bottom. For the adjacent spatial neighbors, an additional top-right block is also checked after the left column neighboring blocks. For the non-adjacent spatial neighbors, the top-left block located at the (-1, -1) position is checked first, then the top row and left column in a similar manner as the adjacent neighbors. The adjacent neighbors are checked first, then the temporal MV predictor described in the next subsection is checked second, and after that the non-adjacent spatial neighboring blocks are checked. For compound prediction, which utilizes a pair of reference frames, the non-adjacent spatial neighbors are not used for deriving the MV predictor.

**Temporal motion vector prediction**

In addition to spatial neighboring blocks, an MV predictor can also be derived using co-located blocks of reference pictures, namely the temporal MV predictor. To generate the temporal MV predictor, the MVs of reference frames are first stored together with the reference indices associated with the reference frame. Then, for each 8x8 block of the current frame, the MVs of a reference frame which pass through the 8x8 block are identified and stored together with the reference frame index in a temporal MV buffer. In the example shown in Figure 9, the MV of reference frame 1 (R1) pointing from R1 to a reference frame of R1 is identified, i.e., MVref, which passes through an 8x8 block (shaded in blue dots) of the current frame. MVref is then stored in the temporal MV buffer associated with this 8x8 block.
Directional
intra
Figure 9: Motion field estimation by linear projection
Finally, given a set of pre-defined block coordinates, the associated MVs stored in the temporal MV buffer are identified and projected accordingly to derive a temporal MV predictor which points from the current block to its reference frame, e.g., MV0 in Figure 9. Figure 10 shows the pre-defined block positions for deriving temporal MV predictors of a 16x16 block; up to 7 blocks are checked to find valid temporal MV predictors.
Directional intra
Figure 10: Block positions for deriving temporal MV predictors
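The projection of a stored reference MV onto the current block can be sketched as follows (illustrative only, not libaom code); the scaling by the ratio of frame distances is an assumption consistent with the linear projection shown in Figure 9, and the normative version uses fixed-point arithmetic with clamping.

```c
// Illustrative sketch: linearly project a motion vector stored in the temporal
// MV buffer so that it points from the current block to its reference frame.
typedef struct { int row, col; } Mv;

static Mv project_mv(Mv mv_ref, int cur_to_target_dist, int ref_to_its_ref_dist) {
  Mv out = { 0, 0 };
  if (ref_to_its_ref_dist == 0) return out;  // guard against invalid distance
  // Scale by the ratio of frame distances (assumed simplification).
  out.row = mv_ref.row * cur_to_target_dist / ref_to_its_ref_dist;
  out.col = mv_ref.col * cur_to_target_dist / ref_to_its_ref_dist;
  return out;
}
```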
The temporal MV predictors are checked after the nearest spatial MV predictors but before the non-adjacent spatial MV predictors. All the spatial and temporal MV candidates are put together in a pool, with each predictor associated with a weighting determined during the scanning of the spatial and temporal neighboring blocks. Based on the associated weightings, the candidates are sorted and ranked, and up to four candidates are used to form the MV predictor list.

### Motion vector coding

[Ed.: to be added]

### Interpolation filter for motion compensation

[Ed.: to be added]

### Warped motion compensation

**Global warped motion**

The global motion information is signalled at each inter frame, wherein the global motion type and motion parameters are included. The global motion types and the number of the associated parameters are listed in the following table.

| Global motion type | Number of parameters |
|:------------------:|:--------------------:|
| Identity (zero motion) | 0 |
| Translation | 2 |
| Rotzoom | 4 |
| General affine | 6 |

For an inter coded block, after the reference frame index is transmitted, if the motion of the current block is indicated as global motion, the global motion type and the associated parameters of the given reference will be used for the current block.

**Local warped motion**

For an inter coded block, local warped motion is allowed when the following conditions are all satisfied:

* Current block uses single prediction
* Width or height is greater than or equal to 8 samples
* At least one of the immediate neighbors uses the same reference frame as the current block

If local warped motion is used for the current block, instead of signalling the affine parameters, they are estimated by mean square minimization of the distance between the reference projection and the modeled projection based on the motion vectors of the current block and its immediate neighbors. To estimate the parameters of local warped motion, the projection sample pair of the center pixel in a neighboring block and its corresponding pixel in the reference frame is collected if the neighboring block uses the same reference frame as the current block. After that, 3 extra samples are created by shifting the center position by a quarter sample in one or two dimensions, and these samples are also considered as projection sample pairs to ensure the stability of the model parameter estimation process.

### Overlapped block motion compensation

For an inter-coded block, overlapped block motion compensation (OBMC) is allowed when the following conditions are all satisfied:

* Current block uses single prediction
* Width or height is greater than or equal to 8 samples
* At least one of the neighboring blocks is an inter-coded block

When OBMC is applied to the current block, the initial inter prediction samples are first generated using the assigned motion vector of the current block; then the inter predicted samples for the current block and the inter predicted samples based on motion vectors from the above and left blocks are blended to generate the final prediction samples. The maximum number of neighboring motion vectors is limited based on the size of the current block, and up to 4 motion vectors from each of the upper and left blocks can be involved in the OBMC process of the current block. One example of the processing order of neighboring blocks is shown in the following picture, wherein the values marked in each block indicate the processing order of the motion vectors of the current block and its neighboring blocks.
To be specific, the motion vector of the current block is first applied to generate the inter prediction samples p0(x,y). Then the motion vector of block 1 is applied to generate the prediction samples p1(x,y). After that, the prediction samples in the overlapping area between block 0 and block 1 are a weighted average of p0(x,y) and p1(x,y). The overlapping area of block 1 and block 0 is marked in grey in the following picture. The motion vectors of blocks 2, 3 and 4 are further applied and blended in the same way.
Directional
intra
Figure 11: Neighboring blocks for the OBMC process
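The blending of the overlap region can be sketched as follows (illustrative only, not libaom code); the linear ramp used for the weights is a placeholder for the normative blending mask.

```c
// Illustrative sketch of OBMC blending for the overlap with an above neighbor:
// p0 holds the prediction from the current block's MV, p1 the prediction from
// the neighbor's MV; the two are combined by a weighted average row by row.
#include <stdint.h>

static void obmc_blend_above(uint8_t *p0, const uint8_t *p1, int width,
                             int overlap_rows, int stride) {
  for (int r = 0; r < overlap_rows; r++) {
    // Placeholder linear ramp: more weight on the neighbor near the shared edge.
    const int w1 = 64 * (overlap_rows - r) / (2 * overlap_rows);  // <= 32
    const int w0 = 64 - w1;
    for (int c = 0; c < width; c++)
      p0[r * stride + c] = (uint8_t)(
          (w0 * p0[r * stride + c] + w1 * p1[r * stride + c] + 32) >> 6);
  }
}
```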
### Reference frames [Ed.: to be added] ### Compound Prediction [Ed.: to be added] **Compound wedge prediction** [Ed.: to be added] **Difference-modulated masked prediction** [Ed.: to be added] **Frame distance-based compound prediction** [Ed.: to be added] **Compound inter-intra prediction** [Ed.: to be added] ## Transform The separable 2D transform process is applied on prediction residuals. For the forward transform, a 1-D vertical transform is performed first on each column of the input residual block, then a horizontal transform is performed on each row of the vertical transform output. For the backward transform, a 1-D horizontal transform is performed first on each row of the input de-quantized coefficient block, then a vertical transform is performed on each column of the horizontal transform output. The primary 1-D transforms include four different types of transform: a) 4-point, 8-point, 16-point, 32-point, 64-point DCT-2; b) 4-point, 8-point, 16-point asymmetric DST’s (DST-4, DST-7) and c) their flipped versions; d) 4-point, 8-point, 16-point, 32-point identity transforms. When transform size is 4-point, ADST refers to DST-7, otherwise, when transform size is greater than 4-point, ADST refers to DST-4.
Table 2: Transform basis functions (DCT-2, DST-4 and DST-7) for N-point input.
Partition
For the luma component, each transform block can select one pair of horizontal and vertical transforms from a pre-defined set of transform type candidates, and the selection is explicitly signalled in the bitstream. However, the selection is not signalled when max(width, height) is 64. When the maximum of the transform block width and height is greater than or equal to 32, the set of transform type candidates depends on the prediction mode, as described in Table 3. Otherwise, when the maximum of the transform block width and height is smaller than 32, the set of transform type candidates depends on the prediction mode, as described in Table 4.
Table 3: Transform type candidates for luma component when max(width, height) is greater than or equal to 32.
Partition
Table 4: Transform type candidates for luma component when max(width, height) is smaller than 32.
Partition
The set of transform type candidates (namely transform set) is defined in Table 5.
Table 5: Definition of transform set.
Partition
For the chroma component, the transform type selection is done in an implicit way. For intra prediction residuals, the transform type is selected according to the intra prediction mode, as specified in Table 6. For inter prediction residuals, the transform type is selected according to the transform type selection of the co-located luma block. Therefore, for the chroma component, there is no transform type signalling in the bitstream.
Table 6: Transform type selection for chroma component intra prediction residuals.
Partition
The computational cost of large size (e.g., 64-point) transforms is further reduced by zeroing out all the coefficients except in the following two cases:

1. The top-left 32×32 quadrant for 64×64/64×32/32×64 DCT_DCT hybrid transforms
2. The left 32×16 area for 64×16 and the top 16×32 area for 16×64 DCT_DCT hybrid transforms

Both the DCT-2 and the ADST (DST-4, DST-7) are implemented using a butterfly structure [1], which includes multiple stages of butterfly operations. The butterfly operations within a stage can be calculated in parallel, and different stages are cascaded in sequential order.

## Quantization

Quantization of transform coefficients may apply different quantization step sizes for DC and AC transform coefficients, and different quantization step sizes for luma and chroma transform coefficients. To specify the quantization step size, a _**base_q_idx**_ syntax element is first signalled in the frame header, which is an 8-bit fixed length code specifying the quantization index for luma AC coefficients. The valid range of _**base_q_idx**_ is [0, 255]. After that, the delta value relative to base_q_idx for luma DC coefficients, indicated as DeltaQYDc, is further signalled. Furthermore, if there is more than one color plane, then a flag _**diff_uv_delta**_ is signalled to indicate whether the Cb and Cr color components apply different quantization index values. If _**diff_uv_delta**_ is signalled as 0, then only the delta values relative to base_q_idx for chroma DC coefficients (indicated as DeltaQUDc) and AC coefficients (indicated as DeltaQUAc) are signalled. Otherwise, the delta values relative to base_q_idx for both the Cb and Cr DC coefficients (indicated as DeltaQUDc and DeltaQVDc) and AC coefficients (indicated as DeltaQUAc and DeltaQVAc) are signalled. The decoded DeltaQYDc, DeltaQUAc, DeltaQUDc, DeltaQVAc and DeltaQVDc are added to _base_q_idx_ to derive the quantization indices. These quantization indices are then mapped to quantization step sizes according to two tables. For DC coefficients, the mapping from quantization index to quantization step size for 8-bit, 10-bit and 12-bit internal bit depth is specified by a lookup table Dc_Qlookup[3][256], and for AC coefficients the corresponding mapping is specified by a lookup table Ac_Qlookup[3][256].
quant_dc
Figure 11: Quantization step size of DC coefficients for different internal bit-depth
quant_ac
Figure 12: Quantization step size of AC coefficients for different internal bit-depth
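The derivation of the per-plane quantization indices described above can be sketched as follows (illustrative only, not libaom code); clamping the sums to the valid range [0, 255] is an assumption.

```c
// Illustrative sketch: derive the DC/AC quantization index of each plane from
// base_q_idx plus the signalled deltas, before mapping them to step sizes
// through the DC/AC lookup tables.
static int clamp_q_idx(int idx) { return idx < 0 ? 0 : (idx > 255 ? 255 : idx); }

static void derive_q_indices(int base_q_idx, int delta_q_y_dc, int delta_q_u_dc,
                             int delta_q_u_ac, int delta_q_v_dc, int delta_q_v_ac,
                             int q_idx[3][2] /* [plane][0 = DC, 1 = AC] */) {
  q_idx[0][0] = clamp_q_idx(base_q_idx + delta_q_y_dc);  // luma DC
  q_idx[0][1] = clamp_q_idx(base_q_idx);                 // luma AC
  q_idx[1][0] = clamp_q_idx(base_q_idx + delta_q_u_dc);  // Cb DC
  q_idx[1][1] = clamp_q_idx(base_q_idx + delta_q_u_ac);  // Cb AC
  q_idx[2][0] = clamp_q_idx(base_q_idx + delta_q_v_dc);  // Cr DC
  q_idx[2][1] = clamp_q_idx(base_q_idx + delta_q_v_ac);  // Cr AC
}
```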
Given the quantization step size, indicated as _Qstep_, the input quantized coefficients are further de-quantized using the following formula:

_F_ = sign * ( (_f_ * _Qstep_) % 0xFFFFFF ) / _deNorm_

where _f_ is the input quantized coefficient, _F_ is the output dequantized coefficient, and _deNorm_ is a constant value derived from the transform block area size, as indicated by the following table:

| _deNorm_ | Tx block area size |
|----------|:--------------------------|
| 1 | Less than 512 samples |
| 2 | 512 or 1024 samples |
| 4 | Greater than 1024 samples |

When the quantization index is 0, the quantization is performed using a quantization step size equal to 1, which corresponds to lossless coding.

## Entropy Coding

**Entropy coding engine**

[Ed.: to be added]

**Coefficient coding**

For each transform unit, the coefficient coding starts with coding a skip sign, which is followed by the signaling of the primary transform kernel type and the end-of-block (EOB) position in case the transform coding is not skipped. After that, the coefficient values are coded in a multiple level map manner plus sign values. The level maps are coded as three level planes, namely the lower-level, middle-level and higher-level planes, and the sign is coded as another separate plane. The lower-level, middle-level and higher-level planes correspond to different ranges of coefficient magnitudes: the lower-level plane corresponds to the range of 0–2, the middle-level plane covers the range of 3–14, and the higher-level plane covers the range of 15 and above. The three level planes are coded as follows. After the EOB position is coded, the lower-level and middle-level planes are coded together in backward scan order, where the scan order refers to a zig-zag scan applied on the entire transform unit. Then the sign plane and higher-level plane are coded together in forward scan order. After that, the remainder (coefficient level minus 14) is entropy coded using an Exp-Golomb code. The context model applied to the lower-level plane depends on the primary transform direction (bi-directional, horizontal, or vertical) as well as the transform size, and up to five neighboring (in the frequency domain) coefficients are used to derive the context. The middle-level plane uses a similar context model, but the number of context neighbor coefficients is reduced from 5 to 2. The higher-level plane is coded by an Exp-Golomb code without using a context model. For the sign plane, except for the DC sign, which is coded using the DC signs from its neighboring transform units, the sign values of other coefficients are coded directly without using a context model.

## Loop filtering and post-processing

### Deblocking

There are four methods for picking the deblocking filter level, which are listed below:

* LPF_PICK_FROM_FULL_IMAGE: search the full image with different values
* LPF_PICK_FROM_Q: estimate the filter level based on quantizer and frame type
* LPF_PICK_FROM_SUBIMAGE: estimate the level from a portion of the image
* LPF_PICK_MINIMAL_LPF: set the filter level to 0 and disable the deblocking

When estimating the filter level from the full image or a sub-image, the search starts from the previous frame's filter level and ends when the filter step is less than or equal to zero. In addition to the filter level, there are some other parameters which control the deblocking filter, such as the sharpness level, mode deltas, and reference deltas. Deblocking is performed at the 128x128 super block level, and the vertical and horizontal edges are filtered respectively.
For a 128x128 super block, the vertical/horizontal edges aligned with each 8x8 block are filtered first. If the 4x4 transform is used, the internal edges aligned with each 4x4 block are further filtered. The filter length is switchable among 4-tap, 6-tap, 8-tap, 14-tap, and 0-tap (no filtering). The locations of the filter taps are identified based on the number of filter taps in order to compute the filter mask. When finally performing the filtering, outer taps are added if there is high edge variance.

### Constrained directional enhancement filter

**Edge Direction Estimation**\
In CDEF, the edge direction search is performed at the 8x8 block level. There are eight edge directions in total, as illustrated in Figure 13.
Edge direction
Figure 13: Line number k for pixels following direction d=0:7 in an 8x8 block.
The optimal edge direction d_opt is found by maximizing the following term [3]:
d_opt = argmax_d sum_k ( sum_{p in P_{d,k}} x_p )^2 / N_{d,k}
where x_p is the value of pixel p, P_{d,k} is the set of pixels in line k following direction d, and N_{d,k} is the cardinality of P_{d,k}.

**Directional filter**\
CDEF consists of two sets of filter taps: the primary taps and the secondary taps. The primary taps work along the edge direction (as shown in Figure 14), while the secondary taps are oriented 45 degrees off the edge direction (as shown in Figure 15).
Primary tap
Figure 14: Primary filter taps following edge direction. For even strengths a = 2 and b = 4, for odd strengths a = 3 and b = 3. The filtered pixel is shown in the highlighted center.
Secondary tap
Figure 15: Secondary filter taps. The filtered pixel is shown in the highlighted center.
CDEF can be described by the following equation:
y(i,j) = x(i,j) + round( sum_{m,n} w^p_d(m,n) * f( x(m,n) - x(i,j), S^p, D ) + sum_{m,n} w^s_d(m,n) * f( x(m,n) - x(i,j), S^s, D ) )
where x(i,j) and y(i,j) are the input and output reconstructed values of CDEF, p denotes the primary taps, s denotes the secondary taps, and w is the weight between the primary and secondary taps. f(d,S,D) is a non-linear filtering function, where S denotes the filter strength and D is a damping parameter. For 8-bit content, S^p ranges from 0 to 15, and S^s can be 0, 1, 2, or 4. D ranges from 3 to 6 for luma, and 2 to 4 for chroma.

**Non linear filter**\
CDEF uses a non-linear filtering function to prevent excessive blurring when applied across an edge. This is achieved by ignoring pixels that are too different from the current pixel to be filtered. When the difference d between the current pixel and its neighboring pixel is within a threshold, f(d,S,D) = d, otherwise f(d,S,D) = 0. Specifically, the strength S determines the maximum difference allowed, and the damping D determines the point beyond which the filter tap is ignored.

### Loop Restoration filter

**Separable symmetric wiener filter**

Let F be the w x w 2D filter around the pixel to be filtered, denoted as a w^2 x 1 column vector. Compared with a traditional Wiener filter, the separable symmetric Wiener filter has the following three constraints in order to save signaling bits and reduce complexity [4]:

1) The w x w filter window is separated into horizontal and vertical w-tap convolutions.

2) The horizontal and vertical filters are constrained to be symmetric.

3) It is assumed that the summation of the horizontal/vertical filter coefficients is 1.

As a result, F can be written as F = column_vectorize[ab^T], subject to a(i) = a(w - 1 - i), b(i) = b(w - 1 - i), for i = [0, r - 1], and sum(a(i)) = sum(b(i)) = 1, where a is the vertical filter and b is the horizontal filter. The derivation of the filters a and b starts from an initial guess of the horizontal and vertical filters, optimizing one of the two while holding the other fixed. In the implementation w = 7, thus 3 taps need to be sent for filters a and b, respectively. When signaling the filter coefficients, 4, 5 and 6 bits are used for the first three filter taps, and the remaining ones are obtained from the normalization and symmetry constraints. 30 bits in total are transmitted for both the vertical and horizontal filters.

**Dual self-guided filter**

The dual self-guided filter is designed to first obtain two coarse restorations X1 and X2 of the degraded frame X; the final restoration Xr is then obtained as a combination of the degraded samples and the differences between the degraded samples and the coarse restorations [4]:
Xr = X + alpha * (X1 - X) + beta * (X2 - X)
At the encoder side, alpha and beta are computed using:
(alpha, beta) = (A^T A)^(-1) A^T b
where A = {X1 - X, X2 - X}, b = Y - X, and Y is the original source. X1 and X2 are obtained using guided filtering, and the filtering is controlled by a radius r and a noise parameter e, where a higher r implies a higher spatial variance and a higher e implies a higher range variance [4]. X1 and X2 can be described by {r1, e1} and {r2, e2}, respectively. The encoder sends a 6-tuple {r1, e1, r2, e2, alpha, beta} to the decoder. In the implementation, {r1, e1, r2, e2} uses a 3-bit codebook, and {alpha, beta} uses 7 bits each due to the much higher precision required, resulting in a total of 17 bits. r is always less than or equal to 3 [4]. Guided filtering can be described by a local linear model:
y = F * x + G
where x and y are the input and output samples, and F and G are determined by the statistics in the neighborhood of the pixel to be filtered. It is called self-guided filtering when the guidance image is the same as the degraded image [4]. The following are the three steps for deriving F and G in self-guided filtering:

1) Compute the mean u and variance d of the pixels in a (2r + 1) x (2r + 1) window around the pixel to be filtered.

2) For each pixel, compute f = d / (d + e); g = (1 - f)u.

3) Compute F and G for each pixel as averages of the f and g values in a 3 x 3 window around the pixel for use in step 2.

### Frame super-resolution

In order to improve the perceptual quality of decoded pictures, a super-resolution process is applied at low bit-rates [5]. First, at the encoder side, the source video is downscaled as a non-normative procedure. Second, the downscaled video is encoded, followed by the deblocking and CDEF processes. Third, a linear upscaling process is applied as a normative procedure to bring the encoded video back to its original spatial resolution. Lastly, loop restoration is applied to recover part of the lost high-frequency detail. The last two steps together are called the super-resolving process [5]. Similarly, the decoding, deblocking and CDEF processes are applied at the lower spatial resolution at the decoder side. Then, the frames go through the super-resolving process. In order to reduce overheads in line buffers with respect to hardware implementation, the upscaling and downscaling processes are applied to the horizontal dimension only.

### Film grain synthesis

At the encoder side, film grain is removed from the input video as a denoising process. Then, the structure and intensity of the input video are analyzed by a Canny edge detector, and smooth areas are used to estimate the strength of the film grain. Once the strength is estimated, the denoised video and the film grain parameters are sent to the decoder side. Those parameters are used to synthesize the grain and add it back to the decoded video, producing the final output video. In order to reconstruct the film grain, the following parameters are sent to the decoder side: lag value, autoregressive coefficients, values for the precomputed look-up table index of chroma components, and a set of points for a piece-wise linear scaling function [6]. Those parameters are signaled as quantized integers, including 64 bytes for the scaling function and 74 bytes for the autoregressive coefficients. Once the parameters are received, an autoregressive process is applied in a raster scan order to generate one 64x64 luma and two 32x32 chroma film grain templates [6]. Those templates are used to generate the grain for the remaining part of a picture.

## Screen content coding

To improve the coding performance of screen content coding, AV1 incorporates several coding tools; for example, intra block copy (IntraBC) is employed to handle the repeated patterns in a screen picture, and palette mode is used to handle screen blocks with a limited number of different colors.

### Intra block copy

Intra Block Copy (IntraBC) [2] is a coding tool similar to inter-picture prediction. The main difference is that in IntraBC, a predictor block is formed from the reconstructed samples (before application of in-loop filtering) of the current picture. Therefore, IntraBC can be considered as "motion compensation" within the current picture. A block vector (BV) is coded to specify the location of the predictor block. The BV precision is integer.
The BV is signalled in the bitstream since the decoder needs it to locate the predictor. For the current block, a flag use IntraBC, indicating whether the current block is coded in IntraBC mode, is first transmitted in the bitstream. Then, if the current block is in IntraBC mode, the BV difference diff is obtained by subtracting the reference BV from the current BV, and diff is classified into four types according to the diff values of its horizontal and vertical components. The type information is transmitted in the bitstream, after which the diff values of the two components may be signalled based on the type information.

IntraBC is very effective for screen content coding, but it also brings a lot of difficulties to hardware design. To facilitate the hardware design, the following modifications are adopted.

1) When IntraBC is allowed, the loop filters are disabled. These are the de-blocking filter, the CDEF (Constrained Directional Enhancement Filter), and Loop Restoration. By doing this, the picture buffer of reconstructed samples can be shared between IntraBC and inter prediction.

2) To facilitate parallel decoding, the prediction cannot exceed the restricted areas. For one super block, if the coordinate of its top-left position is (x0, y0), the prediction at position (x, y) can be accessed by IntraBC if y < y0 and x < x0 + 2 * (y0 - y).

3) To allow for the hardware write-back delay, the immediately reconstructed areas cannot be accessed by IntraBC prediction. The restricted immediately reconstructed area can be 1 ∼ n super blocks. So on top of modification 2, if the coordinate of one super block's top-left position is (x0, y0), the prediction at position (x, y) can be accessed by IntraBC if y < y0 and x < x0 + 2 * (y0 - y) - D, where D denotes the restricted immediately reconstructed area. When D is one super block, the prediction area is shown in the figure below (a small illustrative sketch of this availability check follows the figure).
Intra block copy
Figure 16: The prediction area for IntraBC mode in one super block prediction
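To make the rule above concrete, the following is a minimal illustrative sketch of the availability check, assuming integer pixel coordinates; the function name and signature are hypothetical and not taken from the libaom sources:

```c
// Illustrative only: returns 1 if the sample at (x, y) may be used as an
// IntraBC predictor for a super block whose top-left corner is (x0, y0),
// given a restricted immediately reconstructed area of width D pixels.
static int intrabc_pred_sample_allowed(int x, int y, int x0, int y0, int D) {
  return (y < y0) && (x < x0 + 2 * (y0 - y) - D);
}
```

Setting D to 0 gives the basic parallel-decoding constraint of modification 2, while larger D values model the additional write-back delay of modification 3.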
### Palette mode # References [1] J. Han, Y. Xu and D. Mukherjee, "A butterfly structured design of the hybrid transform coding scheme," 2013 Picture Coding Symposium (PCS), San Jose, CA, 2013, pp. 17-20.\ [2] J. Li, H. Su, A. Converse, B. Li, R. Zhou, B. Lin, J. Xu, Y. Lu, and R. Xiong, "Intra Block Copy for Screen Content in the Emerging AV1 Video Codec," 2018 Data Compression Conference, Snowbird, Utah, USA.\ [3] S. Midtskogen and J.M. Valin. "The AV1 constrained directional enhancement filter (CDEF)." In 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1193-1197. IEEE, 2018.\ [4] D. Mukherjee, S. Li, Y. Chen, A. Anis, S. Parker, and J. Bankoski. "A switchable loop-restoration with side-information framework for the emerging AV1 video codec." In 2017 IEEE International Conference on Image Processing (ICIP), pp. 265-269. IEEE, 2017.\ [5] Y. Chen, D. Murherjee, J. Han, A. Grange, Y. Xu, Z. Liu,... & C.H.Chiang, (2018, June). "An overview of core coding tools in the AV1 video codec."" In 2018 Picture Coding Symposium (PCS) (pp. 41-45). IEEE.\ [6] A. Norkin, & N. Birkbeck, (2018, March). "Film grain synthesis for AV1 video codec." In 2018 Data Compression Conference (pp. 3-12). IEEE. aom-3.12.1/doc/dev_guide/000077500000000000000000000000001477627663500151015ustar00rootroot00000000000000aom-3.12.1/doc/dev_guide/av1_decoder.dox000066400000000000000000000003031477627663500177650ustar00rootroot00000000000000/*!\page decoder_guide AV1 DECODER GUIDE Describe AV1 decoding techniques here. \cond \if av1_md_support [AV1 Algorithm Description](\ref LALGORITHMDESCRIPTION) \endif \endcond */ aom-3.12.1/doc/dev_guide/av1_encoder.dox000066400000000000000000002235441477627663500200150ustar00rootroot00000000000000/*!\page encoder_guide AV1 ENCODER GUIDE \tableofcontents \section architecture_introduction Introduction This document provides an architectural overview of the libaom AV1 encoder. It is intended as a high level starting point for anyone wishing to contribute to the project, that will help them to more quickly understand the structure of the encoder and find their way around the codebase. It stands above and will where necessary link to more detailed function level documents. \subsection architecture_gencodecs Generic Block Transform Based Codecs Most modern video encoders including VP8, H.264, VP9, HEVC and AV1 (in increasing order of complexity) share a common basic paradigm. This comprises separating a stream of raw video frames into a series of discrete blocks (of one or more sizes), then computing a prediction signal and a quantized, transform coded, residual error signal. The prediction and residual error signal, along with any side information needed by the decoder, are then entropy coded and packed to form the encoded bitstream. See Figure 1: below, where the blue blocks are, to all intents and purposes, the lossless parts of the encoder and the red block is the lossy part. This is of course a gross oversimplification, even in regard to the simplest of the above codecs. For example, all of them allow for block based prediction at multiple different scales (i.e. different block sizes) and may use previously coded pixels in the current frame for prediction or pixels from one or more previously encoded frames. Further, they may support multiple different transforms and transform sizes and quality optimization tools like loop filtering. 
\image html genericcodecflow.png "" width=70% \subsection architecture_av1_structure AV1 Structure and Complexity As previously stated, AV1 adopts the same underlying paradigm as other block transform based codecs. However, it is much more complicated than previous generation codecs and supports many more block partitioning, prediction and transform options. AV1 supports block partitions of various sizes from 128x128 pixels down to 4x4 pixels using a multi-layer recursive tree structure as illustrated in figure 2 below. \image html av1partitions.png "" width=70% AV1 also provides 71 basic intra prediction modes, 56 single frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion compensation)), 12768 compound inter prediction modes (that combine inter predictors from two reference frames) and 36708 compound inter / intra prediction modes. Furthermore, in addition to simple inter motion estimation, AV1 also supports warped motion prediction using affine transforms. In terms of transform coding, it has 16 separable 2-D transform kernels \f$(DCT, ADST, fADST, IDTX)^2\f$ that can be applied at up to 19 different scales from 64x64 down to 4x4 pixels. When combined together, this means that for any one 8x8 pixel block in a source frame, there are approximately 45,000,000 different ways that it can be encoded. Consequently, AV1 requires complex control processes. While not necessarily a normative part of the bitstream, these are the algorithms that turn a set of compression tools and a bitstream format specification, into a coherent and useful codec implementation. These may include but are not limited to things like :- - Rate distortion optimization (The process of trying to choose the most efficient combination of block size, prediction mode, transform type etc.) - Rate control (regulation of the output bitrate) - Encoder speed vs quality trade offs. - Features such as two pass encoding or optimization for low delay encoding. For a more detailed overview of AV1's encoding tools and a discussion of some of the design considerations and hardware constraints that had to be accommodated, please refer to A Technical Overview of AV1. Figure 3 provides a slightly expanded but still simplistic view of the AV1 encoder architecture with blocks that relate to some of the subsequent sections of this document. In this diagram, the raw uncompressed frame buffers are shown in dark green and the reconstructed frame buffers used for prediction in light green. Red indicates those parts of the codec that are (or may be) lossy, where fidelity can be traded off against compression efficiency, whilst light blue shows algorithms or coding tools that are lossless. The yellow blocks represent non-bitstream normative configuration and control algorithms. \image html av1encoderflow.png "" width=70% \section architecture_command_line The Libaom Command Line Interface Add details or links here: TODO ? 
elliotk@ \section architecture_enc_data_structures Main Encoder Data Structures The following are the main high level data structures used by the libaom AV1 encoder and referenced elsewhere in this overview document: - \ref AV1_PRIMARY - \ref AV1_PRIMARY.gf_group (\ref GF_GROUP) - \ref AV1_PRIMARY.lap_enabled - \ref AV1_PRIMARY.twopass (\ref TWO_PASS) - \ref AV1_PRIMARY.p_rc (\ref PRIMARY_RATE_CONTROL) - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO) - \ref AV1_COMP - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) - \ref AV1_COMP.rc (\ref RATE_CONTROL) - \ref AV1_COMP.speed - \ref AV1_COMP.sf (\ref SPEED_FEATURES) - \ref AV1EncoderConfig (Encoder configuration parameters) - \ref AV1EncoderConfig.pass - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg) - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg) - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg) - \ref AlgoCfg (Algorithm related configuration parameters) - \ref AlgoCfg.arnr_max_frames - \ref AlgoCfg.arnr_strength - \ref KeyFrameCfg (Keyframe coding configuration parameters) - \ref KeyFrameCfg.enable_keyframe_filtering - \ref RateControlCfg (Rate control configuration) - \ref RateControlCfg.mode - \ref RateControlCfg.target_bandwidth - \ref RateControlCfg.best_allowed_q - \ref RateControlCfg.worst_allowed_q - \ref RateControlCfg.cq_level - \ref RateControlCfg.under_shoot_pct - \ref RateControlCfg.over_shoot_pct - \ref RateControlCfg.maximum_buffer_size_ms - \ref RateControlCfg.starting_buffer_level_ms - \ref RateControlCfg.optimal_buffer_level_ms - \ref RateControlCfg.vbrbias - \ref RateControlCfg.vbrmin_section - \ref RateControlCfg.vbrmax_section - \ref PRIMARY_RATE_CONTROL (Primary Rate control status) - \ref PRIMARY_RATE_CONTROL.gf_intervals[] - \ref PRIMARY_RATE_CONTROL.cur_gf_index - \ref RATE_CONTROL (Rate control status) - \ref RATE_CONTROL.intervals_till_gf_calculate_due - \ref RATE_CONTROL.frames_till_gf_update_due - \ref RATE_CONTROL.frames_to_key - \ref TWO_PASS (Two pass status and control data) - \ref GF_GROUP (Data related to the current GF/ARF group) - \ref FIRSTPASS_STATS (Defines entries in the first pass stats buffer) - \ref FIRSTPASS_STATS.coded_error - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters) - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES) - \ref HIGH_LEVEL_SPEED_FEATURES - \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop - \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance - \ref TplParams \section architecture_enc_use_cases Encoder Use Cases The libaom AV1 encoder is configurable to support a number of different use cases and rate control strategies. The principle use cases for which it is optimised are as follows: - Video on Demand / Streaming - Low Delay or Live Streaming - Video Conferencing / Real Time Coding (RTC) - Fixed Quality / Testing Other examples of use cases for which the encoder could be configured but for which there is less by way of specific optimizations include: - Download and Play - Disk Playback> - Storage - Editing - Broadcast video Specific use cases may have particular requirements or constraints. For example: Video Conferencing: In a video conference we need to encode the video in real time and to avoid any coding tools that could increase latency, such as frame look ahead. Live Streams: In cases such as live streaming of games or events, it may be possible to allow some limited buffering of the video and use of lookahead coding tools to improve encoding quality. 
However, whilst a lag of a second or two may be fine given the one way nature of this type of video, it is clearly not possible to use tools such as two pass coding. Broadcast: Broadcast video (e.g. digital TV over satellite) may have specific requirements such as frequent and regular key frames (e.g. once per second or more) as these are important as entry points to users when switching channels. There may also be strict upper limits on bandwidth over a short window of time. Download and Play: Download and play applications may have less strict requirements in terms of local frame by frame rate control but there may be a requirement to accurately hit a file size target for the video clip as a whole. Similar considerations may apply to playback from mass storage devices such as DVD or disk drives. Editing: In certain special use cases such as offline editing, it may be desirable to have very high quality and data rate but also very frequent key frames or indeed to encode the video exclusively as key frames. Lossless video encoding may also be required in this use case. VOD / Streaming: One of the most important and common use cases for AV1 is video on demand or streaming, for services such as YouTube and Netflix. In this use case it is possible to do two or even multi-pass encoding to improve compression efficiency. Streaming services will often store many encoded copies of a video at different resolutions and data rates to support users with different types of playback device and bandwidth limitations. Furthermore, these services support dynamic switching between multiple streams, so that they can respond to changing network conditions. Exact rate control when encoding for a specific format (e.g 360P or 1080P on YouTube) may not be critical, provided that the video bandwidth remains within allowed limits. Whilst a format may have a nominal target data rate, this can be considered more as the desired average egress rate over the video corpus rather than a strict requirement for any individual clip. Indeed, in order to maintain optimal quality of experience for the end user, it may be desirable to encode some easier videos or sections of video at a lower data rate and harder videos or sections at a higher rate. VOD / streaming does not usually require very frequent key frames (as in the broadcast case) but key frames are important in trick play (scanning back and forth to different points in a video) and for adaptive stream switching. As such, in a use case like YouTube, there is normally an upper limit on the maximum time between key frames of a few seconds, but within certain limits the encoder can try to align key frames with real scene cuts. Whilst encoder speed may not seem to be as critical in this use case, for services such as YouTube, where millions of new videos have to be encoded every day, encoder speed is still important, so libaom allows command line control of the encode speed vs quality trade off. Fixed Quality / Testing Mode: Libaom also has a fixed quality encoder pathway designed for testing under highly constrained conditions. \section architecture_enc_speed_quality Speed vs Quality Trade Off In any modern video encoder there are trade offs that can be made in regard to the amount of time spent encoding a video or video frame vs the quality of the final encode. 
These trade offs typically limit the scope of the search for an optimal prediction / transform combination with faster encode modes doing fewer partition, reference frame, prediction mode and transform searches at the cost of some reduction in coding efficiency.

The pruning of the size of the search tree is typically based on assumptions about the likelihood of different search modes being selected based on what has gone before and features such as the dimensions of the video frames and the Q value selected for encoding the frame. For example certain intra modes are less likely to be chosen at high Q but may be more likely if similar modes were used for the previously coded blocks above and to the left of the current block.

The speed settings depend both on the use case (e.g. Real Time encoding) and an explicit speed control passed in on the command line as --cpu-used and stored in the \ref AV1_COMP.speed field of the main compressor instance data structure (cpi).

The control flags for the speed trade off are stored in the \ref AV1_COMP.sf field of the compressor instance and are set in the following functions:-

- \ref av1_set_speed_features_framesize_independent()
- \ref av1_set_speed_features_framesize_dependent()
- \ref av1_set_speed_features_qindex_dependent()

A second factor impacting the speed of encode is rate distortion optimisation (rd vs non-rd encoding). When rate distortion optimization is enabled each candidate combination of a prediction mode and transform coding strategy is fully encoded and the resulting error (or distortion) as compared to the original source and the number of bits used, are passed to a rate distortion function. This function converts the distortion and cost in bits to a single RD value (where lower is better). This RD value is used to decide between different encoding strategies for the current block where, for example, one strategy may result in a lower distortion but a larger number of bits.

The calculation of this RD value is broadly speaking as follows:

\f[
RD = (λ * Rate) + Distortion
\f]

This assumes a linear relationship between the number of bits used and distortion (represented by the rate multiplier value λ) which is not actually valid across a broad range of rate and distortion values. Typically, where distortion is high, expending a small number of extra bits will result in a large change in distortion. However, at lower values of distortion the cost in bits of each incremental improvement is large. To deal with this we scale the value of λ based on the quantizer value chosen for the frame. This is assumed to be a proxy for our approximate position on the true rate distortion curve and it is further assumed that over a limited range of distortion values, a linear relationship between distortion and rate is a valid approximation. (A small illustrative sketch of this RD comparison is given at the end of this section.)

Doing a rate distortion test on each candidate prediction / transform combination is expensive in terms of cpu cycles. Hence, for cases where encode speed is critical, libaom implements a non-rd pathway where the RD value is estimated based on the prediction error and quantizer setting.
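As a rough illustration of the RD comparison described above, consider the sketch below. It is illustrative only and uses hypothetical names and floating point arithmetic; the real encoder works in fixed point (see for example the RDCOST macro in av1/encoder/rd.h).

\code{.c}
#include <stdint.h>

// Illustrative only: compare two candidate encodings of a block using the
// RD = (lambda * Rate) + Distortion formulation described above.
typedef struct {
  int rate_bits;       // estimated bits needed to code the block this way
  int64_t distortion;  // e.g. sum of squared error against the source
} ExampleCandidate;

static const ExampleCandidate *pick_lower_rd_cost(const ExampleCandidate *a,
                                                  const ExampleCandidate *b,
                                                  double lambda) {
  const double rd_a = lambda * a->rate_bits + (double)a->distortion;
  const double rd_b = lambda * b->rate_bits + (double)b->distortion;
  return (rd_a <= rd_b) ? a : b;  // the lower RD cost wins
}
\endcode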
\section architecture_enc_src_proc Source Frame Processing \subsection architecture_enc_frame_proc_data Main Data Structures The following are the main data structures referenced in this section (see also \ref architecture_enc_data_structures): - \ref AV1_PRIMARY ppi (the primary compressor instance data structure) - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO) - \ref AV1_COMP cpi (the main compressor instance data structure) - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) - \ref AV1EncoderConfig (Encoder configuration parameters) - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg) - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg) - \ref AlgoCfg (Algorithm related configuration parameters) - \ref AlgoCfg.arnr_max_frames - \ref AlgoCfg.arnr_strength - \ref KeyFrameCfg (Keyframe coding configuration parameters) - \ref KeyFrameCfg.enable_keyframe_filtering \subsection architecture_enc_frame_proc_ingest Frame Ingest / Coding Pipeline To encode a frame, first call \ref av1_receive_raw_frame() to obtain the raw frame data. Then call \ref av1_get_compressed_data() to encode raw frame data into compressed frame data. The main body of \ref av1_get_compressed_data() is \ref av1_encode_strategy(), which determines high-level encode strategy (frame type, frame placement, etc.) and then encodes the frame by calling \ref av1_encode(). In \ref av1_encode(), \ref av1_first_pass() will execute the first_pass of two-pass encoding, while \ref encode_frame_to_data_rate() will perform the final pass for either one-pass or two-pass encoding. The main body of \ref encode_frame_to_data_rate() is \ref encode_with_recode_loop_and_filter(), which handles encoding before in-loop filters (with recode loops \ref encode_with_recode_loop(), or without any recode loop \ref encode_without_recode()), followed by in-loop filters (deblocking filters \ref loopfilter_frame(), CDEF filters and restoration filters \ref cdef_restoration_frame()). Except for rate/quality control, both \ref encode_with_recode_loop() and \ref encode_without_recode() call \ref av1_encode_frame() to manage the reference frame buffers and \ref encode_frame_internal() to perform the rest of encoding that does not require access to external frames. \ref encode_frame_internal() is the starting point for the partition search (see \ref architecture_enc_partitions). \subsection architecture_enc_frame_proc_tf Temporal Filtering \subsubsection architecture_enc_frame_proc_tf_overview Overview Video codecs exploit the spatial and temporal correlations in video signals to achieve compression efficiency. The noise factor in the source signal attenuates such correlation and impedes the codec performance. Denoising the video signal is potentially a promising solution. One strategy for denoising a source is motion compensated temporal filtering. Unlike image denoising, where only the spatial information is available, video denoising can leverage a combination of the spatial and temporal information. Specifically, in the temporal domain, similar pixels can often be tracked along the motion trajectory of moving objects. Motion estimation is applied to neighboring frames to find similar patches or blocks of pixels that can be combined to create a temporally filtered output. AV1, in common with VP8 and VP9, uses an in-loop motion compensated temporal filter to generate what are referred to as alternate reference frames (or ARF frames). 
These can be encoded in the bitstream and stored as frame buffers for use in the prediction of subsequent frames, but are not usually directly displayed (hence they are sometimes referred to as non-display frames). The following command line parameters set the strength of the filter, the number of frames used and determine whether filtering is allowed for key frames. - --arnr-strength (\ref AlgoCfg.arnr_strength) - --arnr-maxframes (\ref AlgoCfg.arnr_max_frames) - --enable-keyframe-filtering (\ref KeyFrameCfg.enable_keyframe_filtering) Note that in AV1, the temporal filtering scheme is designed around the hierarchical ARF based pyramid coding structure. We typically apply denoising only on key frame and ARF frames at the highest (and sometimes the second highest) layer in the hierarchical coding structure. \subsubsection architecture_enc_frame_proc_tf_algo Temporal Filtering Algorithm Our method divides the current frame into "MxM" blocks. For each block, a motion search is applied on frames before and after the current frame. Only the best matching patch with the smallest mean square error (MSE) is kept as a candidate patch for a neighbour frame. The current block is also a candidate patch. A total of N candidate patches are combined to generate the filtered output. Let f(i) represent the filtered sample value and \f$p_{j}(i)\f$ the sample value of the j-th patch. The filtering process is: \f[ f(i) = \frac{p_{0}(i) + \sum_{j=1}^{N} ω_{j}(i).p_{j}(i)} {1 + \sum_{j=1}^{N} ω_{j}(i)} \f] where \f$ ω_{j}(i) \f$ is the weight of the j-th patch from a total of N patches. The weight is determined by the patch difference as: \f[ ω_{j}(i) = exp(-\frac{D_{j}(i)}{h^2}) \f] where \f$ D_{j}(i) \f$ is the sum of squared difference between the current block and the j-th candidate patch: \f[ D_{j}(i) = \sum_{k\inΩ_{i}}||p_{0}(k) - p_{j}(k)||_{2} \f] where: - \f$p_{0}\f$ refers to the current frame. - \f$Ω_{i}\f$ is the patch window, an "LxL" pixel square. - h is a critical parameter that controls the decay of the weights measured by the Euclidean distance. It is derived from an estimate of noise amplitude in the source. This allows the filter coefficients to adapt for videos with different noise characteristics. - Usually, M = 32, N = 7, and L = 5, but they can be adjusted. It is recommended that the reader refers to the code for more details. \subsubsection architecture_enc_frame_proc_tf_funcs Temporal Filter Functions The main entry point for temporal filtering is \ref av1_temporal_filter(). This function returns 1 if temporal filtering is successful, otherwise 0. When temporal filtering is applied, the filtered frame will be held in the output_frame, which is the frame to be encoded in the following encoding process. Almost all temporal filter related code is in av1/encoder/temporal_filter.c and av1/encoder/temporal_filter.h. Inside \ref av1_temporal_filter(), the reader's attention is directed to \ref tf_setup_filtering_buffer() and \ref tf_do_filtering(). - \ref tf_setup_filtering_buffer(): sets up the frame buffer for temporal filtering, determines the number of frames to be used, and calculates the noise level of each frame. - \ref tf_do_filtering(): the main function for the temporal filtering algorithm. It breaks each frame into "MxM" blocks. For each block a motion search \ref tf_motion_search() is applied to find the motion vector from one neighboring frame. 
tf_build_predictor() is then called to build the matching patch and \ref av1_apply_temporal_filter_c() (see also optimised SIMD versions) to apply temporal filtering. The weighted average over each pixel is accumulated and finally normalized in \ref tf_normalize_filtered_frame() to generate the final filtered frame. - \ref av1_apply_temporal_filter_c(): the core function of our temporal filtering algorithm (see also optimised SIMD versions). \subsection architecture_enc_frame_proc_film Film Grain Modelling Add details here. \section architecture_enc_rate_ctrl Rate Control \subsection architecture_enc_rate_ctrl_data Main Data Structures The following are the main data structures referenced in this section (see also \ref architecture_enc_data_structures): - \ref AV1_PRIMARY ppi (the primary compressor instance data structure) - \ref AV1_PRIMARY.twopass (\ref TWO_PASS) - \ref AV1_COMP cpi (the main compressor instance data structure) - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) - \ref AV1_COMP.rc (\ref RATE_CONTROL) - \ref AV1_COMP.sf (\ref SPEED_FEATURES) - \ref AV1EncoderConfig (Encoder configuration parameters) - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg) - \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass stats) - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters) - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES) \subsection architecture_enc_rate_ctrl_options Supported Rate Control Options Different use cases (\ref architecture_enc_use_cases) may have different requirements in terms of data rate control. The broad rate control strategy is selected using the --end-usage parameter on the command line, which maps onto the field \ref aom_codec_enc_cfg_t.rc_end_usage in \ref aom_encoder.h. The four supported options are:- - VBR (Variable Bitrate) - CBR (Constant Bitrate) - CQ (Constrained Quality mode ; A constrained variant of VBR) - Fixed Q (Constant quality of Q mode) The value of \ref aom_codec_enc_cfg_t.rc_end_usage is in turn copied over into the encoder rate control configuration data structure as \ref RateControlCfg.mode. In regards to the most important use cases above, Video on demand uses either VBR or CQ mode. CBR is the preferred rate control model for RTC and Live streaming and Fixed Q is only used in testing. The behaviour of each of these modes is regulated by a series of secondary command line rate control options but also depends somewhat on the selected use case, whether 2-pass coding is enabled and the selected encode speed vs quality trade offs (\ref AV1_COMP.speed and \ref AV1_COMP.sf). The list below gives the names of the main rate control command line options together with the names of the corresponding fields in the rate control configuration data structures. 
- --target-bitrate (\ref RateControlCfg.target_bandwidth)
- --min-q (\ref RateControlCfg.best_allowed_q)
- --max-q (\ref RateControlCfg.worst_allowed_q)
- --cq-level (\ref RateControlCfg.cq_level)
- --undershoot-pct (\ref RateControlCfg.under_shoot_pct)
- --overshoot-pct (\ref RateControlCfg.over_shoot_pct)

The following control aspects of VBR encoding:

- --bias-pct (\ref RateControlCfg.vbrbias)
- --minsection-pct (\ref RateControlCfg.vbrmin_section)
- --maxsection-pct (\ref RateControlCfg.vbrmax_section)

The following relate to buffer and delay management in one pass low delay and real time coding:

- --buf-sz (\ref RateControlCfg.maximum_buffer_size_ms)
- --buf-initial-sz (\ref RateControlCfg.starting_buffer_level_ms)
- --buf-optimal-sz (\ref RateControlCfg.optimal_buffer_level_ms)

\subsection architecture_enc_vbr Variable Bitrate (VBR) Encoding

For streamed VOD content the most common rate control strategy is Variable Bitrate (VBR) encoding. The CQ mode mentioned above is a variant of this where additional quantizer and quality constraints are applied. VBR encoding may in theory be used in conjunction with either 1-pass or 2-pass encoding.

VBR encoding varies the number of bits given to each frame or group of frames according to the difficulty of that frame or group of frames, such that easier frames are allocated fewer bits and harder frames are allocated more bits. The intent here is to even out the quality between frames. This contrasts with Constant Bitrate (CBR) encoding where each frame is allocated the same number of bits. Whilst for any given frame or group of frames the data rate may vary, the VBR algorithm attempts to deliver a given average bitrate over a wider time interval. In standard VBR encoding, the time interval over which the data rate is averaged is usually the duration of the video clip. An alternative approach is to target an average VBR bitrate over the entire video corpus for a particular video format (corpus VBR).

\subsubsection architecture_enc_1pass_vbr 1 Pass VBR Encoding

The command line for libaom does allow 1 Pass VBR, but this has not been properly optimised and behaves much like 1 pass CBR in most regards, with bits allocated to frames by the following functions:

- \ref av1_calc_iframe_target_size_one_pass_vbr( const struct AV1_COMP *const cpi) "av1_calc_iframe_target_size_one_pass_vbr()"
- \ref av1_calc_pframe_target_size_one_pass_vbr( const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) "av1_calc_pframe_target_size_one_pass_vbr()"

\subsubsection architecture_enc_2pass_vbr 2 Pass VBR Encoding

The main focus here will be on 2-pass VBR encoding (and the related CQ mode) as these are the modes most commonly used for VOD content. 2-pass encoding is selected on the command line by setting --passes=2 (or -p 2).

Generally speaking, in 2-pass encoding, an encoder will first encode a video using a default set of parameters and assumptions. Depending on the outcome of that first encode, the baseline assumptions and parameters will be adjusted to optimize the output during the second pass. In essence the first pass is a fact finding mission to establish the complexity and variability of the video, in order to allow a better allocation of bits in the second pass.

The libaom 2-pass algorithm is unusual in that the first pass is not a full encode of the video. Rather it uses a limited set of prediction and transform options and a fixed quantizer, to generate statistics about each frame.
No output bitstream is created and the per frame first pass statistics are stored entirely in volatile memory. This has some disadvantages when compared to a full first pass encode, but avoids the need for file I/O and improves speed. For two pass encoding, the function \ref av1_encode() will first be called for each frame in the video with the value \ref AV1EncoderConfig.pass = 1. This will result in calls to \ref av1_first_pass(). Statistics for each frame are stored in \ref FIRSTPASS_STATS frame_stats_buf. After completion of the first pass, \ref av1_encode() will be called again for each frame with \ref AV1EncoderConfig.pass = 2. The frames are then encoded in accordance with the statistics gathered during the first pass by calls to \ref encode_frame_to_data_rate() which in turn calls \ref av1_get_second_pass_params(). In summary the second pass code :- - Searches for scene cuts (if auto key frame detection is enabled). - Defines the length of and hierarchical structure to be used in each ARF/GF group. - Allocates bits based on the relative complexity of each frame, the quality of frame to frame prediction and the type of frame (e.g. key frame, ARF frame, golden frame or normal leaf frame). - Suggests a maximum Q (quantizer value) for each ARF/GF group, based on estimated complexity and recent rate control compliance (\ref RATE_CONTROL.active_worst_quality) - Tracks adherence to the overall rate control objectives and adjusts heuristics. The main two pass functions in regard to the above include:- - \ref find_next_key_frame() - \ref define_gf_group() - \ref calculate_total_gf_group_bits() - \ref get_twopass_worst_quality() - \ref av1_gop_setup_structure() - \ref av1_gop_bit_allocation() - \ref av1_twopass_postencode_update() For each frame, the two pass algorithm defines a target number of bits \ref RATE_CONTROL.base_frame_target, which is then adjusted if necessary to reflect any undershoot or overshoot on previous frames to give \ref RATE_CONTROL.this_frame_target. As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also maintains a record of the actual Q value used to encode previous frames at each level in the current pyramid hierarchy (\ref PRIMARY_RATE_CONTROL.active_best_quality). The function \ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range for each frame. \subsubsection architecture_enc_1pass_lagged 1 Pass Lagged VBR Encoding 1 pass lagged encode falls between simple 1 pass encoding and full two pass encoding and is used for cases where it is not possible to do a full first pass through the entire video clip, but where some delay is permissible. For example near live streaming where there is a delay of up to a few seconds. In this case the first pass and second pass are in effect combined such that the first pass starts encoding the clip and the second pass lags behind it by a few frames. When using this method, full sequence level statistics are not available, but it is possible to collect and use frame or group of frame level data to help in the allocation of bits and in defining ARF/GF coding hierarchies. The reader is referred to the \ref AV1_PRIMARY.lap_enabled field in the main compressor instance (where lap stands for look ahead processing). This encoding mode for the most part uses the same rate control pathways as two pass VBR encoding. 
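As a very rough illustration of the relationship between \ref RATE_CONTROL.base_frame_target and \ref RATE_CONTROL.this_frame_target described above, the sketch below nudges a baseline frame target to claw back part of any accumulated rate error. It is not the actual libaom logic; the function name, gain and clamping values are invented for illustration.

\code{.c}
#include <stdint.h>

// Illustrative only: adjust the baseline per frame bit target to correct for
// accumulated overshoot or undershoot on previously coded frames.
// rate_error_bits is positive when previous frames overshot their targets.
static int example_this_frame_target(int base_frame_target,
                                     int64_t rate_error_bits) {
  int correction = (int)(rate_error_bits / 8);  // recover part of the error
  const int max_correction = base_frame_target / 2;  // hypothetical clamp
  if (correction > max_correction) correction = max_correction;
  if (correction < -max_correction) correction = -max_correction;
  return base_frame_target - correction;
}
\endcode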
\subsection architecture_enc_rc_loop The Main Rate Control Loop Having established a target rate for a given frame and an allowed range of Q values, the encoder then tries to encode the frame at a rate that is as close as possible to the target value, given the Q range constraints. There are two main mechanisms by which this is achieved. The first selects a frame level Q, using an adaptive estimate of the number of bits that will be generated when the frame is encoded at any given Q. Fundamentally this mechanism is common to VBR, CBR and to use cases such as RTC with small adjustments. As the Q value mainly adjusts the precision of the residual signal, it is not actually a reliable basis for accurately predicting the number of bits that will be generated across all clips. A well predicted clip, for example, may have a much smaller error residual after prediction. The algorithm copes with this by adapting its predictions on the fly using a feedback loop based on how well it did the previous time around. The main functions responsible for the prediction of Q and the adaptation over time, for the two pass encoding pipeline are: - \ref rc_pick_q_and_bounds() - \ref get_q() - \ref av1_rc_regulate_q( const struct AV1_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality, int width, int height) "av1_rc_regulate_q()" - \ref get_rate_correction_factor() - \ref set_rate_correction_factor() - \ref find_closest_qindex_by_rate() - \ref av1_twopass_postencode_update() - \ref av1_rc_update_rate_correction_factors() A second mechanism for control comes into play if there is a large rate miss for the current frame (much too big or too small). This is a recode mechanism which allows the current frame to be re-encoded one or more times with a revised Q value. This obviously has significant implications for encode speed and in the case of RTC latency (hence it is not used for the RTC pathway). Whether or not a recode is allowed for a given frame depends on the selected encode speed vs quality trade off. This is set on the command line using the --cpu-used parameter which maps onto the \ref AV1_COMP.speed field in the main compressor instance data structure. The value of \ref AV1_COMP.speed, combined with the use case, is used to populate the speed features data structure AV1_COMP.sf. In particular \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop determines the types of frames that may be recoded and \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance is a rate error trigger threshold. For more information the reader is directed to the following functions: - \ref encode_with_recode_loop() - \ref encode_without_recode() - \ref recode_loop_update_q() - \ref recode_loop_test() - \ref av1_set_speed_features_framesize_independent() - \ref av1_set_speed_features_framesize_dependent() \subsection architecture_enc_fixed_q Fixed Q Mode There are two main fixed Q cases: -# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level in a given video, but these offsets are adaptive based on video content. -# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for each pyramid level. 
The reader is also referred to the following functions:

- \ref av1_rc_pick_q_and_bounds()
- \ref rc_pick_q_and_bounds_no_stats_cbr()
- \ref rc_pick_q_and_bounds_no_stats()
- \ref rc_pick_q_and_bounds()

\section architecture_enc_frame_groups GF/ ARF Frame Groups & Hierarchical Coding

\subsection architecture_enc_frame_groups_data Main Data Structures

The following are the main data structures referenced in this section (see also \ref architecture_enc_data_structures):

- \ref AV1_COMP cpi (the main compressor instance data structure)
- \ref AV1_COMP.rc (\ref RATE_CONTROL)
- \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass stats)

\subsection architecture_enc_frame_groups_groups Frame Groups

To process a sequence/stream of video frames, the encoder divides the frames into groups and encodes them sequentially (possibly dependent on previous groups). In AV1 such a group is usually referred to as a golden frame group (GF group) or sometimes an Alt-Ref (ARF) group or a group of pictures (GOP). A GF group determines and stores the coding structure of the frames (for example, frame type, usage of the hierarchical structure, usage of overlay frames, etc.) and can be considered as the base unit to process the frames, therefore playing an important role in the encoder.

The length of a specific GF group is arguably the most important aspect when determining a GF group. This is because most GF group level decisions are based on the frame characteristics, if not on the length itself directly. Note that the GF group is always a group of consecutive frames, which means the start and end of the group (so again, the length of it) determines which frames are included in it and hence determines the characteristics of the GF group. Therefore, in this document we will first discuss the GF group length decision in Libaom, followed by frame structure decisions when defining a GF group with a certain length.

\subsection architecture_enc_gf_length GF / ARF Group Length Determination

The basic intuition of determining the GF group length is that it is usually desirable to group together frames that are similar. Hence, we may choose longer groups when consecutive frames are very alike and shorter ones when they are very different. The determination of the GF group length is done in function \ref calculate_gf_length(). The following encoder use cases are supported:
  • Single pass with look-ahead disabled(\ref has_no_stats_stage()): in this case there is no information available on the following stream of frames, therefore the function will set the GF group length for the current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS groups) to be the maximum value allowed.
  • Single pass with look-ahead enabled (\ref AV1_PRIMARY.lap_enabled): look-ahead processing is enabled for single pass, therefore there is a limited amount of information available regarding future frames. In this case the function will determine the length based on \ref FIRSTPASS_STATS (which is generated when processing the look-ahead buffer) for only the current GF group.
  • Two pass: the first pass in two-pass encoding collects the stats and will not call the function. In the second pass, the function tries to determine the GF group length of the current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS groups) based on the first-pass statistics. Note that as we will be discussing later, such decisions may not be accurate and can be changed later.
Except for the first trivial case where there is no prior knowledge of the following frames, the function \ref calculate_gf_length() tries to determine the GF group length based on the first pass statistics. The determination is divided into two parts:
  1. Baseline decision based on accumulated statistics: this part of the function iterates through the firstpass statistics of the following frames and accumulates the statistics with function accumulate_next_frame_stats. The accumulated statistics are then used to determine whether the correlation in the GF group has dropped too much in function detect_gf_cut. If detect_gf_cut returns non-zero, or if we've reached the end of first-pass statistics, the baseline decision is set at the current point.
  2. If we are not at the end of the first-pass statistics, the next part will try to refine the baseline decision. This algorithm is based on the analysis of firstpass stats. It tries to cut the groups in stable regions or relatively stable points. Also it tries to avoid cutting in a blending region.
As mentioned, for two-pass encoding, the function \ref calculate_gf_length() tries to determine the length of as many as MAX_NUM_GF_INTERVALS groups. The decisions are stored in \ref PRIMARY_RATE_CONTROL.gf_intervals[]. The variables \ref RATE_CONTROL.intervals_till_gf_calculate_due and \ref PRIMARY_RATE_CONTROL.gf_intervals[] help with managing and updating the stored decisions. In the function \ref define_gf_group(), the corresponding stored length decision will be used to define the current GF group.

When the maximum GF group length is larger than or equal to 32, the encoder will enforce an extra layer to determine whether to use a maximum GF length of 32 or 16 for every GF group. In such a case, \ref calculate_gf_length() is first called with the original maximum length (>=32). Afterwards, \ref av1_tpl_setup_stats() is called to analyze the determined GF group and compare the reference to the last frame and the middle frame. If it is decided that we should use a maximum GF length of 16, the function \ref calculate_gf_length() is called again with the updated maximum length, and it only sets the length for a single GF group (\ref RATE_CONTROL.intervals_till_gf_calculate_due is set to 1). This process is shown below.

\image html tplgfgroupdiagram.png "" width=40%

Before encoding each frame, the encoder checks \ref RATE_CONTROL.frames_till_gf_update_due. If it is zero, indicating processing of the current GF group is done, the encoder will check whether \ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as discussed above, \ref calculate_gf_length() is called with the original maximum length. If it is not zero, then the GF group length value stored in \ref PRIMARY_RATE_CONTROL.gf_intervals[\ref PRIMARY_RATE_CONTROL.cur_gf_index] is used (subject to change as discussed above).

\subsection architecture_enc_gf_structure Defining a GF Group's Structure

The function \ref define_gf_group() defines the frame structure as well as other GF group level parameters (e.g. bit allocation) once the length of the current GF group is determined.

The function first iterates through the first pass statistics in the GF group to accumulate various stats, using accumulate_this_frame_stats() and accumulate_next_frame_stats(). The accumulated statistics are then used to determine the use of an ALTREF frame along with other properties of the GF group. The values of \ref PRIMARY_RATE_CONTROL.cur_gf_index, \ref RATE_CONTROL.intervals_till_gf_calculate_due and \ref RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.

The function \ref av1_gop_setup_structure() is called at the end to determine the frame layers and reference maps in the GF group, where the construct_multi_layer_gf_structure() function sets the frame update types for each frame and the group structure.

- If ALTREF frames are allowed for the GF group: the first frame is set to KF_UPDATE, GF_UPDATE or ARF_UPDATE. The last frame of the GF group is set to OVERLAY_UPDATE. Then in set_multi_layer_params(), frame update types are determined recursively in a binary tree fashion, and assigned to give the final IBBB structure for the group.
  - If the current branch has more than 2 frames and we have not reached the maximum layer depth, then the middle frame is set as INTNL_ARF_UPDATE, and the left and right branches are processed recursively.
  - If the current branch has fewer than 3 frames, or we have reached the maximum layer depth, then every frame in the branch is set to LF_UPDATE.
- If ALTREF frames are not allowed for the GF group: the frames are set as LF_UPDATE. This basically forms an IPPP GF group structure.

As mentioned, the encoder may use temporal dependency modelling (TPL - see \ref architecture_enc_tpl) to determine whether we should use a maximum length of 32 or 16 for the current GF group. This requires calls to \ref define_gf_group() but should not change other settings (since it is in essence a trial). This special case is indicated by setting the parameter is_final_pass to zero.

For single pass encodes where look-ahead processing is disabled (\ref AV1_PRIMARY.lap_enabled = 0), \ref define_gf_group_pass0() is used instead of \ref define_gf_group().

\subsection architecture_enc_kf_groups Key Frame Groups

A special constraint for GF group length is the location of the next keyframe (KF). The frames between two KFs are referred to as a KF group. Each KF group can be encoded and decoded independently. Because of this, a GF group cannot span beyond a KF and the location of the next KF is set as a hard boundary for GF group length.
  • For two-pass encoding \ref RATE_CONTROL.frames_to_key controls when to encode a key frame. When it is zero, the current frame is a keyframe and the function \ref find_next_key_frame() is called. This in turn calls \ref define_kf_interval() to work out where the next key frame should be placed.
  • For single-pass with look-ahead enabled, \ref define_kf_interval() is called whenever a GF group update is needed (when \ref RATE_CONTROL.frames_till_gf_update_due is zero). This is because generally KFs are more widely spaced and the look-ahead buffer is usually not long enough.
  • For single-pass with look-ahead disabled, the KFs are placed according to the command line parameter --kf-max-dist (The above two cases are also subject to this constraint).
The function \ref define_kf_interval() tries to detect a scenecut. If a scenecut within kf-max-dist is detected, then it is set as the next keyframe. Otherwise the given maximum value is used.

\section architecture_enc_tpl Temporal Dependency Modelling

The temporal dependency model runs at the beginning of each GOP. It builds the motion trajectory within the GOP in units of 16x16 blocks. The temporal dependency of a 16x16 block is evaluated as the predictive coding gains it contributes to its trailing motion trajectory. This temporal dependency model reflects how important a coding block is for the coding efficiency of the overall GOP. It is hence used to scale the Lagrangian multiplier used in the rate-distortion optimization framework.

\subsection architecture_enc_tpl_config Configurations

The temporal dependency model and its applications are by default turned on in the libaom encoder for the VoD use case. To disable it, use --tpl-model=0 in the aomenc configuration.

\subsection architecture_enc_tpl_algoritms Algorithms

The scheme works in the reverse frame processing order over the source frames, propagating information from future frames back to the current frame. For each frame, a propagation step is run for each MB. It operates as follows:
  • Estimate the intra prediction cost in terms of the sum of absolute Hadamard transform differences (SATD), denoted intra_cost. The model also loads the motion information available from the first-pass encode and estimates the inter prediction cost as inter_cost. Due to the use of hybrid inter/intra prediction modes, the inter_cost value is further upper bounded by intra_cost. A propagation cost variable is used to collect all the information flowing back from future processed frames. It is initialized to 0 for all the blocks in the last processed frame in a group of pictures (GOP).
  • The fraction of information from the current block to be propagated towards its reference block is estimated as: \f[ propagation\_fraction = (1 - inter\_cost/intra\_cost) \f] It reflects, as a percentage, how much the motion compensated reference reduces the prediction error.
  • The total amount of information the current block contributes to the GOP is estimated as intra_cost + propagation_cost. The information that it propagates towards its reference block is captured by: \f[ propagation\_amount = (intra\_cost + propagation\_cost) * propagation\_fraction \f]
  • Note that the reference block may not necessarily sit on the grid of 16x16 blocks. The propagation amount is hence dispensed to all the blocks that overlap with the reference block. The corresponding block in the reference frame accumulates its own propagation cost as it receives back propagation. \f[ propagation\_cost = propagation\_cost + (\frac{overlap\_area}{(16*16)} * propagation\_amount) \f]
  • In the final encoding stage, the distortion propagation factor of a block is evaluated as \f$(1 + \frac{propagation\_cost}{intra\_cost})\f$, where the second term captures its impact on later frames in a GOP.
  • The Lagrangian multiplier is adapted at the 64x64 block level. For every 64x64 block in a frame, we have a distortion propagation factor: \f[ dist\_prop[i] = 1 + \frac{propagation\_cost[i]}{intra\_cost[i]} \f] where i denotes the block index in the frame. We also have the frame level distortion propagation factor: \f[ dist\_prop = 1 + \frac{\sum_{i}propagation\_cost[i]}{\sum_{i}intra\_cost[i]} \f] which is used to normalize the propagation factor at the 64x64 block level. The Lagrangian multiplier is hence adapted as: \f[ \lambda[i] = \lambda[0] * \frac{dist\_prop}{dist\_prop[i]} \f] where \f$\lambda[0]\f$ is the multiplier associated with the frame level QP. The 64x64 block level QP is scaled according to the Lagrangian multiplier. A small numerical sketch of these propagation and scaling updates is given below.
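The propagation arithmetic above can be made concrete with a tiny numerical sketch. All of the numbers below are invented, and the code simply transcribes the formulas in this list; it is not the actual TPL implementation behind \ref av1_tpl_setup_stats().

~~~~~~~~~~~~~~~{.c}
#include <stdio.h>

// Toy illustration of the TPL propagation arithmetic described above.
// All values are invented; the real model works on per-block statistics.
int main(void) {
  // Costs for one 16x16 block in the current frame (invented numbers).
  double intra_cost = 1000.0;
  double inter_cost = 400.0;        // already upper bounded by intra_cost
  double propagation_cost = 250.0;  // accumulated from future frames

  // Fraction of this block's information that flows to its reference block.
  double propagation_fraction = 1.0 - inter_cost / intra_cost;

  // Amount of information propagated back towards the reference block.
  double propagation_amount =
      (intra_cost + propagation_cost) * propagation_fraction;

  // The reference block may straddle several 16x16 blocks: each one
  // receives a share proportional to the overlapping area.
  double overlap_area = 128.0;  // pixels overlapping one particular block
  double ref_propagation_cost = 0.0;
  ref_propagation_cost += overlap_area / (16.0 * 16.0) * propagation_amount;

  // Distortion propagation factor used to scale the Lagrangian multiplier.
  double dist_prop = 1.0 + propagation_cost / intra_cost;

  printf("fraction=%.3f amount=%.1f ref_cost=%.1f dist_prop=%.3f\n",
         propagation_fraction, propagation_amount, ref_propagation_cost,
         dist_prop);
  return 0;
}
~~~~~~~~~~~~~~~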
\subsection architecture_enc_tpl_keyfun Key Functions and data structures

The reader is also referred to the following functions and data structures:

- \ref TplParams
- \ref av1_tpl_setup_stats() builds the TPL model.
- \ref setup_delta_q() assigns a different quantization parameter to each superblock based on its TPL weight.

\section architecture_enc_partitions Block Partition Search

A frame is first split into tiles in \ref encode_tiles(), with each tile compressed by av1_encode_tile(). Then a tile is processed in superblock rows via \ref av1_encode_sb_row() and then \ref encode_sb_row().

The partition search processes superblocks sequentially in \ref encode_sb_row(). Two search modes are supported, depending upon the encoding configuration: \ref encode_nonrd_sb() is used for 1-pass and real-time modes, while \ref encode_rd_sb() performs a more exhaustive rate-distortion based search.

Partition search over the recursive quad-tree space is implemented by recursive calls to \ref av1_nonrd_use_partition(), \ref av1_rd_use_partition(), or av1_rd_pick_partition(), returning the best options for sub-trees to their parent partitions.

In libaom, the partition search sits on top of the mode search (predictor, transform, etc.), rather than being a separate module. The interface of the mode search is \ref pick_sb_modes(), which connects the partition search with \ref architecture_enc_inter_modes and \ref architecture_enc_intra_modes. To make good decisions, reconstruction is also required in order to build references and contexts. This is implemented by \ref encode_sb() at the sub-tree level and \ref encode_b() at the coding block level.

See also \ref partition_search.

\section architecture_enc_intra_modes Intra Mode Search

AV1 provides 71 different intra prediction modes, i.e. modes that predict only based upon information in the current frame with no dependency on previous or future frames. For key frames, where this independence from any other frame is a defining requirement, and for other cases where intra only frames are required, the encoder need only consider these modes in the rate distortion loop.

Even so, in most use cases, searching all possible intra prediction modes for every block and partition size is not practical and some pruning of the search tree is necessary.

For the rate-distortion optimized case, the main top level function responsible for selecting the intra prediction mode for a given block is \ref av1_rd_pick_intra_mode_sb(). The reader's attention is also drawn to the functions \ref hybrid_intra_mode_search() and \ref av1_nonrd_pick_intra_mode(), which may be used where encode speed is critical. The choice between the rd path and the non rd or hybrid paths depends on the encoder use case and the \ref AV1_COMP.speed parameter. Further fine control of the speed vs quality trade-off is provided by means of fields in \ref AV1_COMP.sf (which has type \ref SPEED_FEATURES).

Note that some intra modes are only considered for specific use cases or types of video. For example, the palette based prediction modes are often valuable for graphics or screen share content but not for natural video. (See \ref av1_search_palette_mode())

See also \ref intra_mode_search for more details.
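To give a flavour of what such a search does, here is a deliberately tiny, self-contained sketch: it builds three predictors for a 4x4 block (DC, vertical, horizontal) from the neighbouring row and column and keeps the one with the lowest SAD. The mode set, the cost metric and the sample data are all invented for illustration; the real search in \ref av1_rd_pick_intra_mode_sb() evaluates many more modes, larger blocks and full rate-distortion costs.

~~~~~~~~~~~~~~~{.c}
#include <stdio.h>
#include <stdlib.h>

// Toy "intra mode search" over one 4x4 block. Mode names are invented.
enum { TOY_DC = 0, TOY_V, TOY_H, TOY_NUM_MODES };

static int toy_sad4x4(const unsigned char *a, const unsigned char *b) {
  int sad = 0;
  for (int i = 0; i < 16; ++i) sad += abs((int)a[i] - (int)b[i]);
  return sad;
}

// Build a 4x4 predictor from the row above and the column to the left.
static void toy_predict(int mode, const unsigned char above[4],
                        const unsigned char left[4], unsigned char pred[16]) {
  int dc = 0;
  for (int i = 0; i < 4; ++i) dc += above[i] + left[i];
  dc = (dc + 4) / 8;
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      if (mode == TOY_DC) pred[r * 4 + c] = (unsigned char)dc;
      else if (mode == TOY_V) pred[r * 4 + c] = above[c];
      else pred[r * 4 + c] = left[r];
    }
  }
}

int main(void) {
  const unsigned char above[4] = { 60, 62, 64, 66 };
  const unsigned char left[4] = { 60, 80, 100, 120 };
  const unsigned char src[16] = { 61, 61, 63, 65, 79, 81, 82, 84,
                                  99, 100, 101, 103, 119, 121, 122, 124 };
  int best_mode = TOY_DC, best_sad = 1 << 30;
  for (int mode = 0; mode < TOY_NUM_MODES; ++mode) {
    unsigned char pred[16];
    toy_predict(mode, above, left, pred);
    int sad = toy_sad4x4(src, pred);
    if (sad < best_sad) { best_sad = sad; best_mode = mode; }
  }
  printf("best toy mode = %d (0=DC, 1=V, 2=H), sad = %d\n", best_mode, best_sad);
  return 0;
}
~~~~~~~~~~~~~~~

In the real encoder, the SAD above is replaced by a full rate-distortion cost and the candidate list is pruned under the control of the speed features in \ref AV1_COMP.sf.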
\section architecture_enc_inter_modes Inter Prediction Mode Search

For inter frames, where we also allow prediction using one or more previously coded frames (which may, chronologically speaking, be past or future frames or non-display reference buffers such as ARF frames), the size of the search tree that needs to be traversed to select a prediction mode is considerably larger.

In addition to the 71 possible intra modes we also need to consider 56 single frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion compensation)), 12768 compound inter prediction modes (these are modes that combine inter predictors from two reference frames) and 36708 compound inter / intra prediction modes.

As with the intra mode search, libaom supports an RD based pathway and a non rd pathway for speed critical use cases. The entry points for these two cases are \ref av1_rd_pick_inter_mode() and \ref av1_nonrd_pick_inter_mode_sb() respectively.

Various heuristics and predictive strategies are used to prune the search tree, with fine control provided through the speed features parameter in the main compressor instance data structure \ref AV1_COMP.sf.

It is worth noting that some prediction modes incur a much larger rate cost than others (ignoring for now the cost of coding the error residual). For example, a compound mode that requires the encoder to specify two reference frames and two new motion vectors will almost inevitably have a higher rate cost than a simple inter prediction mode that uses a predicted or 0,0 motion vector. As such, if we have already found a mode for the current block that has a low RD cost, we can skip a large number of the possible modes on the basis that even if the error residual is 0 the inherent rate cost of the mode itself will guarantee that it is not chosen.

See also \ref inter_mode_search for more details.

\section architecture_enc_tx_search Transform Search

AV1 implements the transform stage using 4 separable 1-d transforms (DCT, ADST, FLIPADST and IDTX, where FLIPADST is the reversed version of ADST and IDTX is the identity transform) which can be combined to give 16 2-d combinations. These combinations can be applied at 19 different scales from 64x64 pixels down to 4x4 pixels. This gives rise to a large number of possible candidate transform options for coding the residual error after prediction. An exhaustive rate-distortion based evaluation of all candidates would not be practical from a speed perspective in a production encoder implementation. Hence libaom adopts a number of strategies to prune the selection of both the transform size and transform type.

There are a number of strategies that have been tested and implemented in libaom, including:

- A statistics based approach that looks at the frequency with which certain combinations are used in a given context and prunes out very unlikely candidates. It is worth noting here that some size candidates can be pruned out immediately based on the size of the prediction partition. For example, it does not make sense to use a transform size that is larger than the prediction partition size, but also a very large prediction partition size is unlikely to be optimally paired with small transforms.
- A machine learning based model.
- A method that initially tests candidates using a fast algorithm that skips entropy encoding and uses an estimated cost model to choose a reduced subset for full RD analysis (a toy sketch of this estimate-then-refine approach is given below).
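As an illustration of the last strategy in the list above, the sketch below scores a set of candidate transform types with a cheap estimate and only runs a stand-in "full" rate-distortion evaluation on the best few. Everything here - the constants, the cost numbers and the full-RD model - is invented; the real encoder derives its estimates from coefficient statistics (the relevant cost functions are described in the Transform Coefficient Coding section below).

~~~~~~~~~~~~~~~{.c}
#include <stdio.h>
#include <stdlib.h>

// Toy estimate-then-refine pruning: rank candidates by a cheap estimated
// cost and fully evaluate only the best few. All numbers are invented.
#define NUM_TX_CANDIDATES 16 /* e.g. 4 horizontal x 4 vertical 1-d transforms */
#define NUM_FULL_RD 3        /* candidates kept for the expensive evaluation */

typedef struct {
  int tx_type;
  double est_cost;
} TxCandidate;

static int cmp_est(const void *a, const void *b) {
  const TxCandidate *ca = (const TxCandidate *)a;
  const TxCandidate *cb = (const TxCandidate *)b;
  return (ca->est_cost > cb->est_cost) - (ca->est_cost < cb->est_cost);
}

int main(void) {
  TxCandidate cand[NUM_TX_CANDIDATES];
  for (int i = 0; i < NUM_TX_CANDIDATES; ++i) {
    cand[i].tx_type = i;
    // Invented "fast" cost estimate for each candidate.
    cand[i].est_cost = 100.0 + 37.0 * ((i * 7) % NUM_TX_CANDIDATES);
  }

  // Cheap estimate pass: sort by estimated cost, keep only the best few.
  qsort(cand, NUM_TX_CANDIDATES, sizeof(cand[0]), cmp_est);

  int best_type = -1;
  double best_rd = 1e30;
  for (int i = 0; i < NUM_FULL_RD; ++i) {
    // Stand-in for the full rate-distortion evaluation of one candidate.
    double full_rd = cand[i].est_cost * 1.05 + (cand[i].tx_type % 3);
    if (full_rd < best_rd) {
      best_rd = full_rd;
      best_type = cand[i].tx_type;
    }
  }
  printf("chosen tx_type = %d, rd = %.1f\n", best_type, best_rd);
  return 0;
}
~~~~~~~~~~~~~~~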
This subject is covered more fully in a paper authored by Bohan Li, Jingning Han, and Yaowu Xu titled "Fast Transform Type Selection Using Conditional Laplace Distribution Based Rate Estimation". TODO Add link to paper when available.

See also \ref transform_search for more details.

\section architecture_post_enc_filt Post Encode Loop Filtering

AV1 supports three types of post encode in loop filtering to improve the quality of the reconstructed video.

- Deblocking Filter: The first of these is a fairly traditional boundary deblocking filter that attempts to smooth discontinuities that may occur at the boundaries between blocks. See also \ref in_loop_filter.
- CDEF Filter: The constrained directional enhancement filter (CDEF) allows the codec to apply a non-linear deringing filter along certain (potentially oblique) directions. A primary filter is applied along the selected direction, whilst a secondary filter is applied at 45 degrees to the primary direction. (See also \ref in_loop_cdef and A Technical Overview of AV1.)
- Loop Restoration Filter: The loop restoration filter is applied after any prior post filtering stages. It acts on units of either 64 x 64, 128 x 128, or 256 x 256 pixel blocks, referred to as loop restoration units. Each unit can independently select either to bypass filtering, use a Wiener filter, or use a self-guided filter. (See also \ref in_loop_restoration and A Technical Overview of AV1.)

\section architecture_entropy Entropy Coding

\subsection architecture_entropy_aritmetic Arithmetic Coder

VP9 used a binary arithmetic coder to encode symbols, where the probability of a 1 or 0 at each decision node was based on a context model that took into account recently coded values (for example previously coded coefficients in the current block). A mechanism existed to update the context model each frame, either explicitly in the bitstream, or implicitly at both the encoder and decoder based on the observed frequency of different outcomes in the previous frame. VP9 also supported separate context models for different types of frame (e.g. inter coded frames and key frames).

In contrast, AV1 uses an M-ary symbol arithmetic coder to compress the syntax elements, where integer \f$M\in[2, 14]\f$. This approach is based upon the entropy coding strategy used in the Daala video codec and allows for some bit-level parallelism in its implementation. AV1 also has an extended context model and allows for updates to the probabilities on a per symbol basis, as opposed to the per frame strategy in VP9 (a toy illustration of per-symbol probability adaptation is given at the end of this subsection).

To improve the performance / throughput of the arithmetic encoder, especially in hardware implementations, the probability model is updated and maintained at 15-bit precision, but the arithmetic encoder only uses the most significant 9 bits when encoding a symbol. A more detailed discussion of the algorithm and design constraints can be found in A Technical Overview of AV1.

TODO add references to key functions / files.

As with VP9, a mechanism exists in AV1 to encode some elements into the bitstream as uncompressed bits or literal values, without using the arithmetic coder. This is used, for example, for some frame and sequence header values where it is beneficial to be able to read the values directly.

TODO add references to key functions / files.
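To make the idea of per-symbol probability adaptation concrete, here is a toy sketch for a 4-ary symbol. A cumulative distribution (CDF) is held at 15-bit precision and nudged towards each observed symbol after it is coded. The alphabet, the starting distribution and the update rule (a simple move of 1/32 of the distance to the target) are all invented for illustration; this is not the CDF representation or the exact adaptation rule used by AV1.

~~~~~~~~~~~~~~~{.c}
#include <stdio.h>

// Toy per-symbol CDF adaptation for a 4-ary alphabet at 15-bit precision.
#define TOY_CDF_TOP (1 << 15)
#define TOY_M 4

static void toy_update_cdf(unsigned short cdf[TOY_M], int symbol) {
  // Nudge every interior CDF entry towards the CDF of a distribution that
  // always produces `symbol` (0 below the symbol, TOY_CDF_TOP at and above).
  for (int i = 0; i < TOY_M - 1; ++i) {
    int target = (i >= symbol) ? TOY_CDF_TOP : 0;
    cdf[i] = (unsigned short)(cdf[i] + (target - cdf[i]) / 32);
  }
  cdf[TOY_M - 1] = TOY_CDF_TOP;
}

int main(void) {
  // Start from a uniform distribution over the 4 symbols.
  unsigned short cdf[TOY_M] = { 8192, 16384, 24576, 32768 };
  const int coded[] = { 2, 2, 1, 2, 2, 2, 0, 2 };
  for (int i = 0; i < 8; ++i) toy_update_cdf(cdf, coded[i]);
  // After a run dominated by symbol 2, the probability mass between
  // cdf[1] and cdf[2] has grown.
  printf("adapted cdf: %d %d %d %d\n", cdf[0], cdf[1], cdf[2], cdf[3]);
  return 0;
}
~~~~~~~~~~~~~~~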
\subsection architecture_entropy_coef Transform Coefficient Coding and Optimization

\image html coeff_coding.png "" width=70%

\subsubsection architecture_entropy_coef_what Transform coefficient coding

Transform coefficient coding is where the encoder compresses a quantized version of the prediction residue into the bitstream.

\paragraph architecture_entropy_coef_prepare Preparation - transform and quantize

Before the entropy coding stage, the encoder decouples the pixel-to-pixel correlation of the prediction residue by transforming the residue from the spatial domain to the frequency domain. Then the encoder quantizes the transform coefficients to make the coefficients ready for entropy coding.

\paragraph architecture_entropy_coef_coding The coding process

The encoder uses \ref av1_write_coeffs_txb() to write the coefficients of a transform block into the bitstream. The coding process has three stages.
1. First, the encoder codes the transform block skip flag (txb_skip). If the skip flag is off, the encoder then codes the end of block position (eob), which is the scan index of the last non-zero coefficient plus one.
2. Second, the encoder codes the lower magnitude levels of each coefficient in reverse scan order.
3. Finally, the encoder codes the sign and higher magnitude levels for each coefficient if they are available.

Related functions:
- \ref av1_write_coeffs_txb()
- write_inter_txb_coeff()
- \ref av1_write_intra_coeffs_mb()

\paragraph architecture_entropy_coef_context Context information

To improve the compression efficiency, the encoder uses several context models tailored for transform coefficients to capture the correlations between coding symbols. Most of the context models are built to capture the correlations between the coefficients within the same transform block. However, the transform block skip flag (txb_skip) and the sign of the dc coefficient (dc_sign) require context info from neighboring transform blocks.

Here is how context info spreads between transform blocks. Before coding a transform block, the encoder uses get_txb_ctx() to collect the context information from neighboring transform blocks. Then the context information is used for coding the transform block skip flag (txb_skip) and the sign of the dc coefficient (dc_sign). After the transform block is coded, the encoder extracts the context info from the current block using \ref av1_get_txb_entropy_context(). Then the encoder stores the context info into a byte (uint8_t) using av1_set_entropy_contexts(). The encoder uses this context info to code other transform blocks.

Related functions:
- \ref av1_get_txb_entropy_context()
- av1_set_entropy_contexts()
- get_txb_ctx()
- \ref av1_update_intra_mb_txb_context()

\subsubsection architecture_entropy_coef_rd RD optimization

Besides the actual entropy coding, the encoder uses several utility functions to make optimal RD decisions.

\paragraph architecture_entropy_coef_cost Entropy cost

The encoder uses \ref av1_cost_coeffs_txb() or \ref av1_cost_coeffs_txb_laplacian() to estimate the entropy cost of a transform block. Note that \ref av1_cost_coeffs_txb() is slower but accurate, whereas \ref av1_cost_coeffs_txb_laplacian() is faster but less accurate.
Related functions:
- \ref av1_cost_coeffs_txb()
- \ref av1_cost_coeffs_txb_laplacian()
- av1_cost_coeffs_txb_estimate() (see av1/encoder/txb_rdopt.c)

\paragraph architecture_entropy_coef_opt Quantized level optimization

Besides computing the entropy cost, the encoder also uses \ref av1_optimize_txb() to adjust the coefficients' quantized levels to achieve an optimal RD trade-off. In \ref av1_optimize_txb(), the encoder goes through each quantized coefficient and lowers the quantized coefficient level by one if the action yields a better RD score.

Related functions:
- \ref av1_optimize_txb()

All the related functions are listed in \ref coefficient_coding.

\section architecture_simd SIMD usage

In order to efficiently encode video on modern platforms, it is necessary to implement optimized versions of many core encoding and decoding functions using architecture-specific SIMD instructions.

Functions which have optimized implementations will have multiple variants in the code, each suffixed with the name of the appropriate instruction set. There will additionally be an `_c` version, which acts as a reference implementation which the SIMD variants can be tested against.

As different machines with the same nominal architecture may support different subsets of SIMD instructions, we have dynamic CPU detection logic which chooses the appropriate functions to use at run time. This process is handled by `build/cmake/rtcd.pl`, with function definitions in the files `*_rtcd_defs.pl` elsewhere in the codebase.

Currently SIMD is supported on the following platforms:

- x86: Requires SSE4.1 or above
- Arm: Requires Neon (Armv7-A and above)

We aim to provide implementations of all performance-critical functions which are compatible with the instruction sets listed above. Additional SIMD extensions (e.g. AVX on x86, SVE on Arm) are also used to provide even greater performance where available.
*/

/*!\defgroup encoder_algo Encoder Algorithm
 *
 * The encoder algorithm describes how a sequence is encoded, including high
 * level decisions as well as the algorithms used at every encoding stage.
 */

/*!\defgroup high_level_algo High-level Algorithm
 * \ingroup encoder_algo
 * This module describes the sequence level/frame level algorithms in AV1.
 * More details will be added.
 * @{
 */

/*!\defgroup speed_features Speed vs Quality Trade Off
 * \ingroup high_level_algo
 * This module describes the encode speed vs quality trade-off.
 * @{
 */
/*! @} - end defgroup speed_features */

/*!\defgroup src_frame_proc Source Frame Processing
 * \ingroup high_level_algo
 * This module describes algorithms in AV1 associated with the
 * pre-processing of source frames. See also \ref architecture_enc_src_proc
 *
 * @{
 */
/*! @} - end defgroup src_frame_proc */

/*!\defgroup rate_control Rate Control
 * \ingroup high_level_algo
 * This module describes the rate control algorithm in AV1.
 * See also \ref architecture_enc_rate_ctrl
 * @{
 */
/*! @} - end defgroup rate_control */

/*!\defgroup tpl_modelling Temporal Dependency Modelling
 * \ingroup high_level_algo
 * This module includes algorithms to implement temporal dependency modelling.
 * See also \ref architecture_enc_tpl
 * @{
 */
/*! @} - end defgroup tpl_modelling */

/*!\defgroup two_pass_algo Two Pass Mode
   \ingroup high_level_algo

   In two pass mode, the input file is passed into the encoder for a quick first pass, where statistics are gathered. These statistics and the input file are then passed back into the encoder for a second pass.
   The statistics help the encoder reach the desired bitrate without as much overshooting or undershooting.

   During the first pass, the codec will return "stats" packets that contain information useful for the second pass. The caller should concatenate these packets as they are received. In the second pass, the concatenated packets are passed in, along with the frames to encode. During the second pass, "frame" packets are returned that represent the compressed video.

   A complete example can be found in `examples/twopass_encoder.c`. Pseudocode is provided below to illustrate the core parts.

   During the first pass, the uncompressed frames are passed in and stats information is appended to a byte array.

~~~~~~~~~~~~~~~{.c}
// For simplicity, assume that there is enough memory in the stats buffer.
// Actual code will want to use a resizable array. stats_len represents
// the length of data already present in the buffer.
void get_stats_data(aom_codec_ctx_t *encoder, char *stats,
                    size_t *stats_len, bool *got_data) {
  const aom_codec_cx_pkt_t *pkt;
  aom_codec_iter_t iter = NULL;
  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
    *got_data = true;
    if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
    memcpy(stats + *stats_len, pkt->data.twopass_stats.buf,
           pkt->data.twopass_stats.sz);
    *stats_len += pkt->data.twopass_stats.sz;
  }
}

void first_pass(char *stats, size_t *stats_len) {
  struct aom_codec_enc_cfg first_pass_cfg;
  ... // Initialize the config as needed.
  first_pass_cfg.g_pass = AOM_RC_FIRST_PASS;
  aom_codec_ctx_t first_pass_encoder;
  ... // Initialize the encoder.

  while (frame_available) {
    // Read in the uncompressed frame, update frame_available.
    aom_image_t *frame_to_encode = ...;
    aom_codec_encode(&first_pass_encoder, frame_to_encode, pts, duration,
                     flags);
    bool got_data = false;
    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
  }

  // After all frames have been processed, call aom_codec_encode with
  // a NULL ptr repeatedly, until no more data is returned. The NULL
  // ptr tells the encoder that no more frames are available.
  bool got_data;
  do {
    got_data = false;
    aom_codec_encode(&first_pass_encoder, NULL, pts, duration, flags);
    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
  } while (got_data);

  aom_codec_destroy(&first_pass_encoder);
}
~~~~~~~~~~~~~~~

   During the second pass, the uncompressed frames and the stats are passed into the encoder.

~~~~~~~~~~~~~~~{.c}
// Write out each encoded frame to the file.
void get_cx_data(aom_codec_ctx_t *encoder, FILE *file, bool *got_data) {
  const aom_codec_cx_pkt_t *pkt;
  aom_codec_iter_t iter = NULL;
  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
    *got_data = true;
    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
    fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, file);
  }
}

void second_pass(char *stats, size_t stats_len) {
  struct aom_codec_enc_cfg second_pass_cfg;
  ... // Initialize the config as needed.
  second_pass_cfg.g_pass = AOM_RC_LAST_PASS;
  second_pass_cfg.rc_twopass_stats_in.buf = stats;
  second_pass_cfg.rc_twopass_stats_in.sz = stats_len;
  aom_codec_ctx_t second_pass_encoder;
  ... // Initialize the encoder from the config.

  FILE *output = fopen("output.obu", "wb");
  while (frame_available) {
    // Read in the uncompressed frame, update frame_available.
    aom_image_t *frame_to_encode = ...;
    aom_codec_encode(&second_pass_encoder, frame_to_encode, pts, duration,
                     flags);
    bool got_data = false;
    get_cx_data(&second_pass_encoder, output, &got_data);
  }

  // Pass in NULL to flush the encoder.
  bool got_data;
  do {
    got_data = false;
    aom_codec_encode(&second_pass_encoder, NULL, pts, duration, flags);
    get_cx_data(&second_pass_encoder, output, &got_data);
  } while (got_data);

  fclose(output);
  aom_codec_destroy(&second_pass_encoder);
}
~~~~~~~~~~~~~~~
 */

/*!\defgroup look_ahead_buffer The Look-Ahead Buffer
   \ingroup high_level_algo

   A program should call \ref aom_codec_encode() for each frame that needs processing. These frames are internally copied and stored in a fixed-size circular buffer, known as the look-ahead buffer. Other parts of the code will use future frame information to inform current frame decisions; examples include the first-pass algorithm, TPL model, and temporal filter. Note that this buffer also keeps a reference to the last source frame.

   The look-ahead buffer is defined in \ref av1/encoder/lookahead.h. It acts as an opaque structure, with an interface to create and free memory associated with it. It supports pushing and popping frames onto the structure in a FIFO fashion. It also allows look-ahead when using the \ref av1_lookahead_peek() function with a non-negative number, and look-behind when -1 is passed in (for the last source frame; e.g., firstpass will use this for motion estimation). The \ref av1_lookahead_depth() function returns the current number of frames stored in it. Note that \ref av1_lookahead_pop() is a bit of a misnomer - it only pops if either the "flush" variable is set, or the buffer is at maximum capacity.

   The buffer is stored in the \ref AV1_PRIMARY::lookahead field. It is initialized in the first call to \ref aom_codec_encode(), in the \ref av1_receive_raw_frame() sub-routine. The buffer size is defined by the g_lag_in_frames parameter (\ref aom_codec_enc_cfg_t::g_lag_in_frames). This can be modified manually but should only be set once. On the command line, the flag "--lag-in-frames" controls it. The default size is 19 for non-realtime usage and 1 for realtime. Note that a maximum value of 35 is enforced.

   A frame will stay in the buffer as long as possible. As mentioned above, \ref av1_lookahead_pop() only removes a frame when either flush is set, or the buffer is full. Note that each call to \ref aom_codec_encode() inserts another frame into the buffer, and pop is called by the sub-function \ref av1_encode_strategy(). The buffer is told to flush when \ref aom_codec_encode() is passed a NULL image pointer. Note that the caller must repeatedly call \ref aom_codec_encode() with a NULL image pointer, until no more packets are available, in order to fully flush the buffer.
 */

/*! @} - end defgroup high_level_algo */

/*!\defgroup partition_search Partition Search
 * \ingroup encoder_algo
 * For an overview of the partition search see \ref architecture_enc_partitions
 * @{
 */
/*! @} - end defgroup partition_search */

/*!\defgroup intra_mode_search Intra Mode Search
 * \ingroup encoder_algo
 * This module describes the intra mode search algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup intra_mode_search */

/*!\defgroup inter_mode_search Inter Mode Search
 * \ingroup encoder_algo
 * This module describes the inter mode search algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup inter_mode_search */

/*!\defgroup palette_mode_search Palette Mode Search
 * \ingroup intra_mode_search
 * This module describes the palette mode search algorithm in AV1.
 * More details will be added.
 * @{
 */
/*!
@} - end defgroup palette_mode_search */

/*!\defgroup transform_search Transform Search
 * \ingroup encoder_algo
 * This module describes the transform search algorithm in AV1.
 * @{
 */
/*! @} - end defgroup transform_search */

/*!\defgroup coefficient_coding Transform Coefficient Coding and Optimization
 * \ingroup encoder_algo
 * This module describes the algorithms of transform coefficient coding and optimization in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup coefficient_coding */

/*!\defgroup in_loop_filter In-loop Filter
 * \ingroup encoder_algo
 * This module describes the in-loop filter algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup in_loop_filter */

/*!\defgroup in_loop_cdef CDEF
 * \ingroup encoder_algo
 * This module describes the CDEF parameter search algorithm
 * in AV1. More details will be added.
 * @{
 */
/*! @} - end defgroup in_loop_cdef */

/*!\defgroup in_loop_restoration Loop Restoration
 * \ingroup encoder_algo
 * This module describes the loop restoration search
 * and estimation algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup in_loop_restoration */

/*!\defgroup cyclic_refresh Cyclic Refresh
 * \ingroup encoder_algo
 * This module describes the cyclic refresh (aq-mode=3) in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup cyclic_refresh */

/*!\defgroup SVC Scalable Video Coding
 * \ingroup encoder_algo
 * This module describes the scalable video coding algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup SVC */

/*!\defgroup variance_partition Variance Partition
 * \ingroup encoder_algo
 * This module describes the variance partition algorithm in AV1.
 * More details will be added.
 * @{
 */
/*! @} - end defgroup variance_partition */

/*!\defgroup nonrd_mode_search NonRD Optimized Mode Search
 * \ingroup encoder_algo
 * This module describes the NonRD Optimized Mode Search used in Real-Time mode.
 * More details will be added.
 * @{
 */
/*!
@} - end defgroup nonrd_mode_search */

[Binary image files doc/dev_guide/av1encoderflow.png and doc/dev_guide/av1partitions.png omitted.]
bUmʈIy5jP/j>퍨 Q/[٧Uea~ +W?-i׵ʾe˖| 2Zxٱ ied/]ۿ3 A-OP6lg,>gˑ+}ah\n)Qr֞4vgۖvOfg+lEn{~6o~[-ZawCO }[آsmނeۻa}1z]~{ gZ۝W7=a \cDCͲ3SMu96}Rb貂V^6uv*[3ݟ+TfMōsc*qʔ)IUg2eY E]LUQf ,)AzUӜ.]ԾsVc5_N*?=GUř^'Qsݻr )8UUo,Y˜9 QUXhQWx۴i:*kj4@zmϭU@vs|veΜvvF[;gložV9j'-%C^ y ykؓm&6_zX'n9KZ*Ml~(ԼBi|uq+[lt[)|]+2Ğ]q~6cv۵kPJr[Mk?i͙ zqEA+gۺ}-U2-Q^{q0U ZTioUCi/435;hذ{Lk chh9TNZ*?-ѷz|A+U ,rj4Qzua믿9=/5o̡YsD}#GtK54})V]ͥ:j)Wͧ:uXvܜ*V=|W#jn7`FAfN˧_: ZY۵E=3vA;c7~6mZry,kk-/WۖZģV㥆읪ESvvۼz-u-`[c)[n|?S⑮նa 5VVfػFչY'XTEUDnzM&W%w~ VzIs 3u}G ꦛnrk[B=x`:u۳NmzHy{ќyRꜫ 5w K/dO> 4ZKiiRKwUѧ=(5/ͧهVԜR 0NჭlگF ιkmCŬpvÓUۻ{*v59-k+-oV8W6˞]}vTC8~v/ccZ?ijW*ef/nWUx^'唝=}5=V̚rks]i9_k=5v4;u>~ߺ8]e.-tUm3l9hb>vʾ7Tv6kM{'{q+yo z脝UCjyj+takQUۯO >k @T=sm[j?y޼ϖ Ui /Sgرmj)Yr||o^kca[uKvZok&5rg}iI+G6zI{ c9}|Z5ن6n,`k3ImLz\βgcW?i #>oڝ/ "?h֒pFɦM2ɇz0ꊪ%v5k~ˈTܪ"SHڇO '4UUjw}kҡ *UR>C4hS՛*@LY׈'͑V 5Vi=uV]{*L5g ԅGu}pY j+=!@tFߨQ޹f}չXi+Qw͋g{3gO sY"}klöctAUzu6~ڈ~ ĎE6KV)%vؿ/cozi~gةl63vgVF1;};nl*gKs^cw7-fM ^j uN3Y Sϟo_~k& Zjhի][F Yb7Pdu@Y[ҫN\s M떩s{թɇQyPzZ{C͍Q5PgĈo ܲg58oUAJQ5Qe+b͛7w!Ak)MU *,, @P~Ohk~jW®-X؊{k𕍙6njw=kff҅YbD,mV[:6:>{ɚŲgj^./pUhMYj_xA۽jZjreQlz&ۺmصv`?ooU.]o;~XN:hۖOY2XˉBj7pa+o{`[jUl<h^y(4.R矻*H5P:k?U)R߳>K.6vX5D*VЧ/jN<vګOMbj׮횛jT˜uMRu]g~=ϻRsalڴiBR| p}C}bZ6چa3_he-l%e/m >ېmʯ+l_Sl}toNV^yrBmW{Lf+jR7~c--TNZZF{*W\QQQ&S~zmճʕ+`TzdUUk75CզM+=4*CUT.ꜫ`S_WQ͍jCq*?jSpڢE W=kOKtA>6dL_>}q~+]tyEΧ6d:[h3knsms/3]ge 涬WUب?[>w UـO߲ZXKY$,U ` o)cn}jt#'#lT'\ٲ[n. WȊeMiO 3B.:GMK4+V/(ӒLS\S&d;ЪT娮t˟?wa M:p۶m.,SŠ\_WׅWKOn:ej= +|(Ԝ((XscQ{/nn~zgj# Gm]}?zM]5vb:[7o }ݧ9 ofcgԱu,5I[kѤ5mɺn#-ێڙSG l166xT[a{䄝:ٶDNFWڊͻ-?{ݶejç[vp6mPuF_A#g{mvlf[7[ޫuiܚ~̚fFdVnmN@hKAՄ \𧪵n[ڇMUn;vtUl W9RSEZ]Хϯ}4 Br){\ś*t]?#Z0VyA>>Oˑ_*;GkժnBw]we%Kt׵XA:Oա j?>-%j{(@.9y6]k6m]z}3hBWZ>*Ծ||ƫtҮMKz*"Uqַo_Cs!ݪSH &O>?Sz:ОZ«*b-ݭP[kΡXM`ԉWyUB@ B UdiI3SjڰtR43=***4i*TѦ뢋.r󡥬 _A9ܥ:2fWK_*ٳ5jjԨᚓS /ts TSb_}{ U *@|H@ڃMU^pݡjղC(-0U8j;~jܡЪSNIѢE]' .oOjܡ8g1*Ӳc}V-9VU*{9~jȡϬOT-V=6qD[rÐ}$^Ho-P\R5**ƌkvmiԮ]զnZ,}SȩJcǺ\\ w]jQR%WK5?Z|rW!~G^J^W9t^OU}^7bUq#O<of+[[ >5P?skֲeKݻM4-g5zj'Ks  @7ꔺv:[r5GȎUzZ۱cwPVi^hiAPI@AIɒY2EsιADAEbt:3;߭絿cY@'Vk}TsOnK1던kΝm[|]weׯ bpyeþ F M8(֭kԨA.1h3 ԣuMf%'Ol;>;6̜`ۋ;{E-_p$ ^kذمO>1׎RzW=L=K?0$I%~xm߱͞3-XX+Lknno_-ti&7rHkʓ'aÆ 4}LcpQ:1uT׵kWWV-W@) :|psb&MHJIhLƝHn*Ul^Qw6'#-8p/^9~@B`LjyăP\Q8NL J:ƉDW_}Ξ;OvN해ltիz3$I%P 2uֽ[rVWܤ)CM9Ӎ7:XqnsހFCB:Zp '#xM-)ؤ~̶g[V h4on}x1@[p1qUT|7Οػi}w]>}<.;s{mשKw׭G/0I>uN:8ܳEKݐawA1t'`{{Foب73I$IP ͞=׍0n³1Zd[4!@_vA-1 5k# ")>z2p̦Wh27#FO?mQX@r3")q~>Sׇ3@\{b.3G-#FpK,1J $9}gkîC.M :v[zޣ;rhk,qN֭ܡk-צnn˖1Vq'MqcM$I$( ~'OWz3lU.a,=h.ݻ׍35nؕ,YҜa#J/!"fYw}̣[b5jm;d\r%;߈֨Q\&L0(yfz\ A\ZhŁ9o<v]t1xɾ`.gΜ'OĔ)dah[l=ݻ 6}JQI;vzKzIK+ wu Nz_"kCܼ,8by?~€ڲ2>ezC$I%@@X(Ph6m p1vW]c`*<-=jU _k׶}Gb]ٲe͗o||10,#J?3iα-:SNAڣmp~V۷لZ Y|R,@ lC Ç?wC rpnUI󯇋/3fkֹ+W{;c(I$E^"^ "JsڵVz3n3D(h;… [t5Ҿ[seo~+d2PFTve`$`͸88qKLL4%.ѣGvY27^bL  Fe˖ (aK׺ukg@" Ev$R@PKP$ID$@)(eF]s("6Yܹ}g-~%BpSiXzA! 
mYzͿGG1W/70GM۶m6\?h N`1^ #^9}DqQ+V̶n9, t>]}1ǠA X["׸.]~^Rfyo@PKP$ID$@)(e&u}g-J |RR2 )]s^6l1ۼysWjU+e c,^"kُ} "Iu!$IMxIRP0?1nj9"EO?4bn?q\YժU=c6W`n?nݺ[w_GM ˗/[g y} PpX͟?J${`>G Z$IR4%% JA)#@n( S  0E *:ypz6\DƎZl^xs>\JEqc[x $jmPI\>}lN?(/nD@ЭZj9~;vm+?o`I>w#rXhşS~F)yHW3ŋ7_Wk⥜wc]Nפ$eE F`%(I$ES"^brsX)V^t#KyZ=Mx̯ڀЍ1qfB|М~ 8`3s߀!^ݻ<8 (aV] $c}DR&BԩSl@瑛O"&$IMxIRPEn/\nܹs[ wa:uyYonD~1f17@F-3^{5mf;4`{wѣ ,o@􎾔h fpf-cef!6^# qqqV:GDF`%(I$ES"^bz >"̰eWP!G'u8G٘j ũGq1x^~D}۴icDz>l=y}<}ǏUV@5lнKV7o^3gNs2{g,6n߾ݶ"pr K(`kխ[לs?@.r׽x~Gc0$ExFs`%(I$ES"^ 0 '܀ LK̗Š*U6=s%qĥK1cƸv`Lws(x +՘6mۺup> ~[fƍszrM6b\x.OX̗]BDĉ䓤+~ %IhJKR4/o ccȀo4wƟw}۷o9pҀKz,fQAC?oP #@?rŒ b+V6`?/8Im!۷o_s>2Ȯ$eN Fs`%(I$ES"^"}ԗ f͜9}qySN+ C$7!b#汁qq;n„ ވx}ʟ}!یy}L|ꩧl>!EY ſ1GZj9|c[y~pݫC2#9BP0$I)/IP J|pƣdy{.\ .WB׽{w ,f᢫W=ֳ>o?Oe M4Du)q8%իWS֭[c&!=D)᱙ׇ{hˬ>`ƍyp={\EEy%)I0#C-@I$).p ==s9s( x|v`c>čg!E7j"N?σH秜`V P2 7+]=6Bd^ѣGW>IR Fs`%(I$ES"^B 0f1W,stD}|MkÏ,\btiѣ9ۿEnq1S-8 dQ`ߩS,v <ܴi;w9 X|y!G|B8cN0G1ۃ$IRZ#9BP0$I)/IP Jsb.X-[`%vp 6~+W̝7etąX{˔)c= 1[\@C/Y|z_JG~8}>x]BB=&ۥȮ$I!(j J$Iє$(`w8~ٳ{ɗ[ъ;|ps|鲸&[Wom[~<`.?ⷸꋓY{]} p?Ōq "=}-qƹ+Vݻw'NX0;#6*+IR`>G Z$IR4%% JAf ܁ch+X#K̗ tmƎk@?8?s(`^ժUYmԨQ~O/3(u#G+x׬}nȐ!*4dÆ p+2@$I!(j J$Iє$(8pm޼/7nlY8G9κÇ,@@~8vԩcNG Z$IR4%% JAFP0oƍcǎ.o޼ycN@#G\ǥB_Μ9-_"ǯ<&%>8w⻸rI-C~Gޏ2B@0#22q$IMxIRP0ҥK۴iSsἣLY}Dx=|z@@~?@n>~c¡CYf5kָ['Oo5OB ʌ("b~)EB+8$67kwk*+@I$)},={vv>{=G}b@A_bZx6l:u\m۶%&&FF$)־?1ƻ~(;wW療F0)#@[=͛v._4  %IhJKK-w;n\ f;h ri|8N )g_ܳ>k :`s F J$I1ڵ{gݔ?7"ݟgCGyĢO?=N%\ҥ-[Bs&dk]m[UTIuUZ5UZ^)LmiNjԨUfM[ުU)WڵZ\uIu[.桥uޜrխ[^z?[jSw}zz8)єI&\ vҺ7o~{l]YS-[pj* \-fp\ڶmkrNq6_$4P#..}[]={ރha]tmٲ 8}Uzw@! .{3_| R}Tt@Z…n9o$I$( Ja͚ry͛ۏ 7!a. fP.7,gZ1p,JVҳh[fw7(X dj13Kʑ#GXs8ӳps1&fVc^Z3&ԨYӵiލ9M6S+ knmiiéd\:tHu ~vk|!vx>f=.O2ڽ}q?Q[Glq=掲uKu׉׉~%Ds&&qq:Ÿkn7rX7g|w칛n?$I%@@:wk]|3M q"u]S)B¢`WJ TYPʺY8o0 x|3Y7pdw{ҳx]>-Y@,\,Y4eQ4tNw~Ÿ%Giy {&@xеi5Z_Gv-Zrm)_s^7z/IF?Ⱥx,k9S#~ɮ}n87"+Zϕ[+Zb.bꤿ_'UV&kVs͝ =_$I$(aEC\-כdW`4x*ZqVM-.MK70֛Z70B֕Z<90sjQbׁܩž}<^JII<VTiDs% ݺJXfߏ\cƺtZj&Mf̜ϘEg40)9Mk 7yW.֤1&&sM&$'&Mƍk;5_k7fטqFsof@I$I0K>}+lPZ7"܄0ܯcǺuֹǏ3g5KѾ?lH~<(%KŋZ#)9t9|E~{e/'@"wby~Q۟o&yos6m2c3 &MmT"(I$ J5+̙l?1:ˍFj1cnKw Qi &$$ج"n~J %Ieؽ{H"8')O!Iɓ' -ߧ"=YiY:eqٳ@? !CX̑#G˗OPP$Ik@f,]6o6l`Nq d&nnB+Б +q>SNu[lq.]`4foW=gEmڵĉ?^$IMxIRPS@! lj@C)sz] {R@C~ϿyF@A~003?#7}tfsN6%&&ZQ,@I-k{ׯۺuT0#2*`6e.&`/(%x= W$IR4%% JA)%D8︹c(9%8(@W}G'=Z \L\80cPдiS7x`xbw(i;|&h$I.~Ȉkˤ?8J8`3#JP$ID$@)(P`]vYDKfZh |f; !KUiAh1 Om42;kF5C1o+{`6}3gضlDXNK0$I)/IP J@(\ ɓdžϚ5-]b@=`W"E6pW` ǬR˟?D_c1XbŊrʮ\rhѢxl#=3y̺u:#G\Gk*?]3%I4爌 `x%(I$ES"^sqjժx˙3{޽{m>n@=O9pq2?GHڵ]&MlԩcTR8<`H$TEvab@@!D`"x#ܹsggv˗/w~ Z7 Jn!$IMxIRPb&+\խ{9ϻQFYt^xnf Dq>c'3g<4Ā?@Ey3fpÆ sZb` nC/A/ JIf)Sƍm:%#_}An}K$)TC0r#C-@I$)p{_I4Λ7ժU\xy+n8|}?8/nN:Y&7ȴlc>C( <]e˖Ǐ#pB7h +a!F8, #$l[- *x@NZS)(Ǡ$I!(j J$Iє$(@ q={YPcߪUd;t ,-P '^9 }G;` N?܃ $nÐ=vKLLt4޴i̹G30ߓO>ioϵcǎm7l`NX?Ŏq Xl/583֭[[1`˗%衠$I7!(j J$Iє$(@^|M[QGzp FQkf>@ѣݒ%Kܜ9s\߾}1h-\A7"| ܹsh'^h=?ElWٲe-ܦM~<qcݻ5xnnȐ!6'yY 2W;m1)!Ȅ8yÇsv2W&@ (((I`d?G Z$IR4%% JA)=ЋԨQZxqm8p8p#Ƌ[87p@qu3G`GBx<x1c8g@`a͚5]%.Đy8#FHvt#|(P2n8׽{w -Ody\)6>;M##G3gi$eM Fs`%(I$ES"^n"@/`[r2{oHS  H<1_>` F[<*%Hi|`/{Vc0qq4i7o~6'O0`G!Gy`ʙ/;?P#~\'IR`d?G Z$IR4%% JAf b%}17RB@28sPF ?"`m{ʕ3_:v֭[g: 7qDפI1udq&ҥ ȶqsLNq>H _+]AG?L?Ny浶c3oG3%)I0#C-@I$) ,u! 
]J=;XGqQ`bl`aDltDޘ H+W~2{qq.RRB/_\Hwmξ"E0@"xrC+,ɶ@3h(ۙ;wn5=g`V4nժL2 iQs%IP$I%@@)b1Xb~`0 9\ݺu`J,ڵkf͚e݃y-[4XH/b\u?J<jpsSn3N;nD@=܁l3K,S9q+I&e˖[SN>%)C0r#C-@I$)jG Z$IR4%% JA)\jT+ {4y6>$c0/mbsŊR@0ϵ}v4)Rb<>0ٹsgsL \(b> <l۶9@"LA@+ e`Y.AIJkR0r#C-@I$)N2g884P#nQA[o.]j׃R<qsv+P~X: #fgsHTFߔ";̛7{7 ^Q"8\j1bY>p>%FXdFJ8(IO_!(j J$Iє$(p@?ݻ[ytR $ȝ>}`3ʗ/oE<>k;fq}(`<ϟoM/$.KtA")gG@<$y+:1c=/d8t| P2ǐ}]x5%v Jcӑ}"LcAA Pt׮]m4}R fb%KzY2G Khf֡Ò$$IMxIRP 7#_|)1fFEČqSF 7l/^.,#Ĉy,f#KHqM@8&-x48qEyč>+WڴicEu.0|1f sA`)s@S"B5jXks.]+V6/a0P2#9BP0$I)/IP J^8í?zpsCyǠA cr}Ǎ\sZQc[q8߀[3hGy:-XG㱇 fNJ^iAfks 2֫W/s;JA8.ئM->kUT%E`%mpp +gO>0mfW$@$ǟ)0(e6 Fs`%(I$ES"^" 9n@,@]ڵe,? +WfQhí|rNO옸1q]Zui Rc6ێK+mD!x=t9yf'[nmٳ氋ԍЎÏy c5pݸ69\8,oo4Q2#9BP0$I)/IP JQ 2j(s%:uԐ@@@ӧ[-4\hʟ&&&y{ހ[;w4g @GK f!qu֙/ToF45`FS/;\vwq6g-|Aܑĭ\y|8?7" $3'MuyWxq;o\z NL:G61Pg$FϒC0r#C-@I$)i?j#ٶm[!- O8 iZ9h<5mM6p-%pE4҈ d{Lbf[(a<.sHv\Ll8.DY)QA0 ַSY *e܌7ov }:NS$\A"l/wyGVZvAuDl@ p@#"g#hAH]]DpHt H/C)QL@1ێ҃AW_u͛7w{6 %-4?@f0ȵ נm~!ץ[O{^W +n˖w$I$IS hgږ%pK-8D@7ځ88ր;+ `(!7@ "@1 ]ll!@”PpL z[nM~~prCy8 :ǻ_4adƹHRce;9&vvc̘1 mf}WPBf͚cǎ/_5ű J߹FMsfu6l Zc7i4ײu;GY.(((I$ J1]e҇mnZaZC'aСQ9߾H-wQOOr޹`lٲ͝1ˏ f1cfo8pNmCh-\yp5xfa ]a(lҤ3gEŔ1&<~xעE s ͛׮8sq?[CYA@)=of$}&N3w1%I$IPLCU׺5kkiL: If8h`,Z73SCB @ ͸ntξ9r#"s,W\c;{SsYe,jKq1ܻޥ6p@Uo~65E l+dD(PƁsنcǎ{ĦC8[w%vy۶m; fV`ʕ >9!q>oK6'hve9K~㚺tt@°-+W e0pࠡSonX܈^lCܥ$I$( ҘS…Za\gϝwW= ?ZjPa#]jլ|PKX2`ĉ79p)7tŊ3ȉEGn+G"v;e(JqÇ7w M@X '@zȕ*Uʚy?8/JNʵ|G܎^z | cĶcqmč)U!>M6ojΫ G$INIIo[fΚ_ ܬ9%Kܮ]uI$I$# ;wnq1@N8 afqR"@!b9J&b4 pܹӍ5ծ]vDO6 pҥСC6 d:$ie .fpKYy)pv~p nذ&g38!Cwĉ)8G2gPAI}Z1oR+c~#I$ JJ `gX"0ɗyPA3q9lv_}P< 3aF<z"LEΝ9Jтgl+Q_J4r4don98nD6m20K0,pXA\@`שS'b@@LA\DqC2SP@I$I$IB%/IPʔ 'O6n2eʸѣG \Dĥ;un <\s@ *ˎ[AC:  :a6,[Ŷn)ppr(!2L23Z&Ӳ@`+Wݷ}vgSCK5W  \3 3)f͚5ѣիrJ$I$I.xIRM={x#/\>"8`pu ~ꩧ hQRzus E;/8y͙cf]] >@ƍL:J*тIl32e".JU5k^z%/_>;fz# -`@ahwQ IʯR;pJ Qw2NI09pXbI$I$IʺL/id%bF68j$ wԢ!9wly袼7Tej+@#L̖)` 7_b;]Bݻ87iI,k2mՈbmh5 e rvZ b  ?a?i@D}sr=*$I$I!/IP$1snر/#dt$ t oeϞ &T0y9bC(-(Ȗ-EN .lFcY@ ȱ-(-`^ܝT_8o\Ң x7ηsRˏ(7Qo00}8;q ˜Cf*$I$I$)4,!?̙3uGTR...Π\\#xsY4HPJz88p^ QSR~%^J\pұ$N6ܳgm'%(ĪK['3icѠ 03>iAp 5g 0'[H0JcRR%;nďqO8NCV$I$IJKh:u27`X4|mtBXpbbAf+V\yZjYQ` W@~x̙+TVŖgϞm,ӳ~_| j޼yvGFɒ% OĞ@O 57K.u;w40sn}\Sv2#wږ0)pGs1pzᵢ$I$Iq$% JYN@J7ڵkg GhS@\@+wЂKd/`C'-V:܉@4`ܹs0c)>(!*l@Zt4 9/q>{`=}H8Q'# (11JȖ-[hӧOƕ˳%I$I$)JK"ň<>6x˖-LІH˟?m323BQH8xǜBr2f14 XecH;2qg1}dFdV̝ 12z6 c54' \88&̉ )1qzpJ א$I$I$IKK L=9}Ys8ȋ&"l7իW*>s1b8LJ Rʌ9f{ R8<# L[bE9u|ȶ^z!cn޻kn8fѪKA\D]ŋ.nh|w^8H" 91:8q-ĬsJ.5jd kI$I$I"//IPʲ;@D XD AцYD)LQo{-I[oݒ%Klf][S!(qXh.]\ l{JfFi_<9؅#yk֬1ڽ{w_(rQB,@,Z\̐123(5s7oق(y s=(P @ .J:E#/P-H 1,I$I$IRx$% JY^8M! l *%Au x0?ܶ, #'1!-Ǻ[.eGĉ]׮]_WW\9?3}őI cвeK7b|ܹ/N<}ᇮN:gw r QDC`5k׶_v]I$I$I~$(I)%fל[@u`U,<@J7p1w-{܀OA`&NH @ yM6u/m!4*L<7s߈yfyo> a!t̙Hl? _H1n8ܦf=,-;fNA^{{uׯ7ۯe s0>FLNoͧ5<~xk!Ol^!YÒ$I$IRV$(IG+,Dql2n-jI ~Sp8qw:u*6pDRn#C#.y&A"O=Yݻ[ " F\Aܑ\۶mpXD9Ab8x8N={<=UHǗH$I$IQ"^$]Cf"VL_|f bK(JXt6f1*.Jmd1Y+YydpC w b\qpQD]7:)F8D}Xifψ{6WAL~OtQ;*U$C3@㟙Yr/8p q=g/] 3G1i"Y9xw瘞={rc X`@A^rDz-sӺM9V GgڴyVW@X$I$@IP=f'| Li՘*PBб$Om޼!iݺu^qqUP~a4c!@]f^ 7 cA930J\?Kg x1a%eVˈtݺu6a/Z5LDy2[(]v6Ӓ k}x7ŏF5Ǝ fU$I$I$e)ȀfhpCbVp-|Q1h52#G9s:5jX 3@bzf͙l `<^VfxlfB|W]޼y) &1nBuj 271G?`ໍ%}E+С%I$@IPbF;L@[bPdyk|w݌3E+WhW6m8#8/DaZr˗7'@'#EYnκK߻wP` q9ss^jUW3gl<\w)c#&K̗m,i\\\֭5kZLFk39/ͽą,G_8p#CaNx?[ (I$E\"^$qH1`zn:sL*]`_8( f zJ!.X9`olQFlٲXpZ2[u^RA\;bSĵe]ݺu- o]PyӦMC]ly\A 6+WLK *TRvff hqir=sni#"(8 J$Iђ$(IA}\98uGԯxn2Z;dD) ȬCb8?c( .$cǎv>Fbnd!}{pV %KX2@NjǍ m"qd2{98oć/^lMcڇ)u9`c|6/@Qga?Gg⠸<գi 0xݺu&ZbEk?{٤I8aqA`#onm˓&M2Wn];y5x} /Y56j1m۶+o@)YP$ID$@I M$mÆ rPJ2eR2K~za[ҥA4PիW,D;^~p.qbc? 
*d5܎ŋw76ۚ5kbva4ov}H!= xlٲc0(9̕lٲ_\NLܘjB0R8$I-xIIѣ]bo~'O68n|c08Z~me &i(`,sNQUםwiƜ޽{\;7(_D|9\_Gq[Y3Sy_9sZLk1; a^R8$I-xIIco[v`Qf:^e_ !94#e$h6qp2+PtAs…]ݪU f׊/:9s[r0`]Eov+q6 03+Ƚg J8$IR$% JR˗/=#.K1Hfzya+ @\rrʹ]ZtnV[ &jСW_x0@nA|Mfo1(_\sq}53+/H4 4p}uw_|]WfFKP q$I%/IP"bqYb|L:ݻP.(rP dp YfYXxfv!|dwߑ#Gƍ- Tm֬^y} I߳g]u`$(,(I$EK"^$EҥK' f 9Dcqn|.J!r;w΢&4}Y C{sH{nub}b,c׆o^~hx\\Wo]ժU]Ne+w(ך`dߧpg@I$)Z%)J78he& {1s3 }v2䘟GF]QTY믿n9+p2:3|0e _}K vp-ZfNbq3Lm۶u ,pOBpm Fq%IhIK(`2o<gRfsp`=sVހ(┌g*b̀]MqB %J-ZE #70\MuԱ2fr AP(B9'|b ݱ,(I$EK"^$EfP1o<+x衇>0gv 6x&s1@@UP\}zc~ qtF<4x[~5eig@I$)Z%)n.?b \J#G4RV\%Gr6p9r0giln+#\j$ܹs]6mlQ>_b?nqQfe85Df֣G+ ]z9'qwedG1c,J$>NHcvZ$%V=0x9ĉV(2l0479wuW!H0uHH<1Ho{2[^.gQq0]~h5jp8GOq$I%/IPbƀGZp׿vr岒bYM<9bq+ ' 0/Tqh[oو.\oN^t::9ϙ3ǽ6sv/~ıYxqW&mZur,˓(##w={^z{8G %>އ$8?nR˖-3P0rYP$ID$@IN0QF7jO? (rm57nl˙f9"\i^{` plB k P ̙3͍Id@eA\˗/2&uk.ڜOf2sׄ/=~಄{O߷o_{O5s9L@G}`}ٲe]5\֭-nNw3=5Sm&< F{$I-xI7f̘Vj7sv={4g3ò1ư8,S86@8Yf>ʍ<jҥ;m֬Ĺ.pIR>}ۺu\Xsp.RXte& 0I&hcøTq p.K&F0rpPa|<@p6aРAoqrVV͠`k8S +zˮ b8Gybٳg5csX>'((I$ J$E&*Uv`$ 1u&Lio79tk7&(?++V̮ kqE,pК8$qaJ B o^x@iL g`]\\+DqB\<`aÓ/?x!\#+W&^ݺu?0pk3_p \?SgP3^l !3x$I%IPbF!.!n"=! aV9Ǎsq^z%bPoiǍ(Cg!!0qTǀ?Ssn89-86q_>[bœ_WwK -X֭A@7pBA;'?^?}qv;fK,UAD-l x 4 x$I-xI7 @,bנAsd4 ŒUV#~g `M?08!(y䱢J 8̗kڴAQDViu  `sw7o<צMր/p )ȝ8qA\ Fzk7[1O>aÆxxb7܄Dˉ9-_ܠ3O #@ 8P`ljk#=]P$ID$@I7 @e̗"B#G (/({v&("DP9sQ9#p}.\թSлAIԫW@υ#tB3 08>x0`^+YEqpfPrMլYFp)Q™{e`X0r$IR$% JRiS<`80@燎Z ' ؘ0~faqFQ!q"7qرc2ș39)b4Uĥ$rhC@܀7)SU & nx]~Fp <~x{ e?K68 d*#8GS8 3Ct~G%F}@P$ID$@I@7EGy\ľTwg lj n_ykcL8^fl K>R&!Jp"z]Ne$qb\\)AqlٲY|Y8q(V\↻  H$^lOyM8lb !0\r6W5zPkkwQ]_߿ww-(ZE )P8][HI QHM&@>/Lfum&wνs> ,( t0v<៶s U*Jeq)R)Tl pz#(`Ip߾}qW\8s,7&c2:vޜgϞ-aL9&&ҥ_ c^̶b:0_cl0ױcGߘ6mt]tF@P3>?bkF9Ē~|B,%(dk|Pv Mk?L;r9_5T*JJJz⏍;&,q͞=;Zl)\Z yӱ$>NdI03[gϞ~\س&(ey0A 3ye,~vd]|uLUő#G°az GOi287afSyk~gߕ*,KTV0x5F[1lh?sFk;8A)TT*@@J~=3~P, $ӧBտNjYpj#|a4ǭpRJ]lY,!j\ 43p`t֬Yٳ'~W $A K YBLHE]? ,JzQ2sY\^k_=:M@r~2# @l?vK#F {{{))nݶ#Wp6v+ѽg9獛X9o眧LPRT U U*Cƒk b& 399YD;3fR1jȐ!]rAk.43u,9eRSaMN0|ٳG2o{` =f. _wF=?~,E1cH)Q!ȬhfG9+Ws? |fАaX9y-VT*@@JHl@O/R1=u~~:l|lAT**6KPP~Ȳ.,v?>v\C0j(,8t)K_Le4[nIˣ5k&ٰ0`tR>MfZlUedk- .M菿D@@M\#*JM)R)T bPfm7~ܲ̚etU.R.nܸq-l&ձ񌓓&L jY:`~?~x Ƈʱu*4l~ep(<PRT)%^**UZ1AO>|8?.To?.!3''O,0DRJ}f-\Pr3)fwHB#A 3Y}:jԨ!F!c2+.>-S=y=`TR!R z GGG* @H#H7 Uqn#$$8ByOSWZCPP mڴf ۷ Da"6m€P|yS6 ZZرC`a|Z C/?ǃ̖45a)5 tY@] *|#O{_`g k֮{,EK`r;_jL9Vڅ[a6nX{0scj=.3rmb=85o5, n߾'N0'^Φ}@@*l۶m[/3gCicO:Ϝ9͚5lҪU+^G͛75${܃F͌nݺf͚R3gNiRlY?g̘WWWx{{Nf*T@U{H̞3_ֻƊy N .A]0wB,*=lt~۹L:fƦ[*v6nݺ`=w>.ò>8#Fi3{Κ3ի$.y/~Ya1vp 1&o~!1wBL@%^**U<+S $8dxxxX4pkpJ_E1-qH"ˇMb2Wx\` ,xJ ].ozpڰ;Z,z;w c3!=bY{GꛘxOo ޣ%Fb-X$bp޶]q}v^Q@JJ/w.u0lbx1he K/e˖zZ 0K1c$31٢E lF+"\ti)YgWkmg@=/(T{8y {: XAỠ߷}ǮιP+Wy+;kn `I!6lرcy KkIEA1h ^Z|  PUh{RE)sV2~~,O)4xTLA:H XBww,^X .,lo-3/oܸ!W{F{zzbժUE&+,fypRУGرcH=-ZN m m)TK ?N ?Lf; )%^**U<- LO>LT.]$Jslףy f!YncY0wݣ YNSN 4eiul+0I$QgϞػw6w -Uh1c'!-X(TK ?N ?LN ,%^**U<,̴LY"]tS{|>.Ni3O)kM(d_VV ӧO&-[3h"2vՔ)S"wܨX5kHve\Th9)h˖-+]8I:qD8;;ٳh -=Uh`|ϝF@U<zf/@JC[nH,kZkhhte_"E )_0uT)wF!owMC]Zt)Zj"# "LJ'px l%3Pe $ !"B*JIɬ]SQJT U*Ջ ! 3I رف*k$r_"uvRªeFfM2ˏ2[h|?.؄o߶&, -'7a\+,'&p@/@J ! ҟf͚[L4yᎿ?6m$nK@W^cƌ,=򂫫.֯__ʭ9,u/^8֭CJ&&A# /] @U|9 :ЌRRRg~~@mۊg͛7u8 Y&,lҺukd[ijr Znc .̓'OFNPF d0˯k׮->Rl}oZ9;0.;a T Uz:МRRR^Hb]VH/5fKxAe7X9ѣGKcܹs#sҭr-1H` cj dcf:88s(W (ٳ#}(S|ѢE䞏mrRh#suV*^KIIKJJzBYSǏ_l׮4 QY0G3VhQ|/2 Cل5+Kf:u GӦM5kVߤI(8* WRqRa2'tVhV)R)TTo]0YfÆ /:vTVlbUq:0 c0w\$bf^2M?LEyU \p! 
L0tfΑ#d qȪUZWh9) EdQPdVThF)R)TT\1"E'`:uk.ܺuKɂׂYk:V+VDL$[vÇۥǗA' S FzI5{Bp@z3Kp۶mR"l `rRh;piWRqRa<ЬRRRkr`1't...Rܵ`I'Ob޼yhժJ.-ph޼4 9z;X۪8|_0x`4i˗̄e vfͤ;3,!94ouZF m 2cD*>KI2KKPR}|8|0  gnnn``s E;ټy_#4/_,6 yoо}{%Ki,Ŋ8NXx9ɌeyL -''O sV2~~ ]RRR>hB퍱c"YdׯoubI %x>}:Vɓ#I$ȓ'd_^7d t]`x˗Op3f̈RJ $d)6cc*Wh9)x2L!]#~9v7a@sJJJ K}}}mVR6bFGEħ݂FƍK,ǚ e,5Ȧ!̐߿?֭['x {]3.;dr;/(f8m&|؇1}~Ç`r;[) _m ̄Eܹ)#Fĉ1k,,]T6۱cTI_.Tl\.1v=njKW1&{I5{@**c-ǏǠAd֭ڀ" opp06m$BصM,[WV-̿v 梅%\P0㏁sϞ=LpիWG=xbٳG|h/Ǒr fΚ5da9ͲxeQqNئ]'c6-5f@**S, $|7;C׆A& .\eЦM_4h3gBv1]ǛΝ;( f20gΜpY #lܽ{7|||߼?q?I=vxHiiTԓ8잽cfgݻ〱߂#b*)s>f; vf' -[V4WZ%XRQѻsdZq;}aVm[9g"qpQgfrC u:-ogIy|mtjcڻ_Cf$@cp\'XAhngʔI޻_r0~x#H;}d 9M@T`' ~'.z005 a[rrM3bvGGJWfN:2vܸqRJO^~RTY&Kn `7Μ9#su|~t%z똱D|XdW19c=E# .,;oϘInJ^Z6Mǜ]*T)TT62}AhLR7J5(2+Uv ;|f3hfY6&c,?ay0]b6W4FRTq/bv?,}oY3, fi~/  W0McF^fXi +@\a\ggg':uxn42돕%KD$UȌSJ|׮]' *T)TTf܍a5K g.Aן)#,,LɊE˷ֱcG)M5cY*;1dK) `,v2Xg'>̒%tOok׮ɿ3-`T*Jeb8z dTRAy٤ y2^aCٳg3LSѽkGWSBSR~NNNҔVZGp\x0eX$7~Q矿'gL@**J?`;,`: ԩV cϞ=Ơ;d5Uo:|g84fYҩSqwc cǎQT*Ymݻw?-+\s̑,>f1dB2eʠk׮9r屣w"77:$Я~N[ݺu n`3{'bGL?cǎYf/|'-`M7] U U*@^fWV\)*,㦗#36 3հ|_~rUh`7ͺ \/:=,~jѣO1QWT*;n-Նlϙ'[\< c 6'f!XKyfdC2,w۷/.^(u`޼yehB:gggǸ xMA͹>@_?`fF Q*T)TT |(pז>#FrJ5d`.N4f039eV'eŋlׅY#.R(.$ل;ZRTݖ-[СC]l@[ n|\9ǚ@dv=7l'8|[dpYce$JH6Ha6"&ĉ: 2%t/%^**bb€N/^Ѥ~1p1=f6l؀Aj] 2MJ4fKYܠA˗O`;MK*%]kk]U*|%޽{aSJ^4 cCvJBW;N ̝+g,?(0)})R)TTGV;v,ʕ+'`͚5䪬:|X2Tf/^\٭~=lZF"\(h2`@ά@~مm۶Y r3[f>|EJR>]Ϝ9#vfe[z-m 2՗4׾4#GW0F]@ۗ/@JeqxqOOO$,"hذ6-PεdN_sIY0;Ҳ g{F!~~~R2ׅ fPlv*Bx]XLo)6ae˖ %m]U*JgիW+KYY.]:iU4|sL VM:ƜäKPR8 … _` B]OKɌ+WJ66QT׋ٳ&\&7Ug-0i.)8)0)})R)TT$|}}1ydi&A? 菣6C\谜 C A50\_^2b Jn;k ̼ubfJ]ر#f̘)VRT}W/]$O>'Ɍ?nh.]ӧO[RAIIKJJu8lR`ggІ!z6`fnvPX1cY:u0o<#`2׈q$cvcdcϟӧL͛cpvvF>!**?]& z߲ə5'^G S m_JT U*U" .H90ъ+ĿLeۋ%t>H,b7D2ˌd&arFcz.H߾}{ɓG2yd͚U< 6ŋy QRuYw,r_Z\0sqݗm}!W7wL<8f<ױnVvo{իЩsw^£:v qxN& F->2˖#ظ9Lԩ/= ܴY>5pZ+TRRo4dF w)\m1caoo/eIb fOԮ]cle]$%l-. !nٲeѦM8:: 8`Je$qpp@&M7o^ʍ]p+nACa8'O;?so}}1vDڽoA[ ;Uڷ9s` f 47F_c#q-Yz^gc t0 :tA}ddyxzšCGpQ>c}Xr5ƍΝ⏕PDI?Wݺĉd28:mNs?&>,,LC|}}%ӏr93ø4PRRX 2=|rۭ5T޻gV,$n`4iR\I$.q`Y$%^Xoځ9֫)j/rLGy/_ ?ר&ab7/ 2n#&L:T))gX T-{bBwq0;8:2gyJԪRzu} l00ӈ忥J2bŊac[{VSvE.}L@A-+*\\ߑCXvE=),}fuT*UlÇGʕe/l &ȑ# 2> XʵS>&L(3ڵg' 7yN dehɟ? 8qb~7i9&d̨*K*+ݪ@/ė w◎$IS"MoPR ÖOg!:FdW _=I8I L *5CI ģH<7n =<~>~:!JB} ^ڨ{q; @FpGx^Aua$1yFw70D/U_3,~'#pNلMkWaXr'o(D=}T<'q^;nȻtx9_\cc8sq;oŖݧwo^%C8}V-_嫜y [#9 GD\:6mqB/)9<'%| ^gp'qVb5_ npŏV! 3L]GKO@TFDž'0b,\k9bMz|׬1n$%5vfSƌŷ \(Дԑ1\؜={+VaLcF~Mh׾3:v&GУg0ÆqGYEOKAu띰aVar5,Z;Ul'!`Tzl""giÆ _.C2M8L]ewر ̈c5aYl$[iÆ |<X-f={^~7K͘1bCI SRR^ RyS>{}`~oP&G |T яvB EPrn>{'̛.=;<㡏+v#Ujh0&Px^̖YAbUPDLp{{o)f@k@PNgv Ma3\=DXN M?i8bVJU@Dj-,C\ّ՘d́J\d\0Z aPpYTk1{bͨVH  hpxJ}Jsq@ ƷG*E/g6cBΎ\Kl=^ƀN E7a 4{A~V{ڍ1c'Jv5/ovކ^}H˸sƍֽwfmZss܀C Pc-3 }tUUYV$b7UFY4ۻw/Y8۷ohvJPe<=2~SbzG8w9-jJ Mh>," ʮ&%R婌.q]0rJ,e89;#Zό$R#};9s:WBi*[IYVBڔQᘲüqa(L2CVce sZU@)H= >gz.%>2Rg̈Ti3#M(C KAuhܺ_dFQ@,<wnΡXб,&L? zL$J)'̘q3WM|Y#76w4>dʅEA㺨R4Εy*C72/GleӦM{ec9%e&M!,ŒqsAcov4dS?.rʅ} H ܻoHٳOv|pyyL6 'OzX>\|q>3CBBѺmGxyŭ[m8u4F#+6qǎevMO*ˈ1cu놲e|AHKŋ̙31Ws#̛;w.:v(Ьpi`aAYlϖ۷OXLhY|y)MfvD1e?!7yY~L~~JPLĩTfuԨZF4鑭Om6cLQxۗa7b&֝wIBҌylMsEGt)>e]?xE!{"(V;FtW%T 2VK=q(\'_%3 EQL߄bɄhZN_})VzS0}x,TEG,%PvWv1yP7F,G<߶8R$φ"`~_ ώ=1,*+ëBѬِlC4 bɨ&hX.?d)ҭDX$"FefqAN5kZ*cG)7b6(˹cC@sx6nL@k=.Ӧhc OĀAClӶ#._g9f7 ld 뇵Z}q]dIF|"|x.#F1ai/3G9sk̲435!ayǥ<͛o/ZėwfP~Xd|7oR)T?_&Lyr Ș&&J$"K_kɘ|Ϟ>7K!ek9$IY OȐ*'I ]{渜Źx|GUFѬ*Ϩi6zxa7l~ȉJ^{l fw'Cx~gY5p AϑMOBfӻxP:t)S{xX{JJ@v<9Rd*6G]4Hy$\vCO^ˉnvs%OCZ-Ѣny͒&DŒ%PLl=w A8*>{ԩrKDYCm0h(?Q͌)AnT jZEA1+ =œȇw`||pv (4P~;vH=K&ِ'EB|8RgȂy h:i<8@#cU8N^DTN:El !G9 î]$Is:ᇴɐ&Iv. Q#bpf}~H 6$Ӊ[e9fsW; .[A/כFLI"uԒ)ѩS' e *Tz| }+@0at2oOScӧO @cF;%{Y"˃%0xyb4մg"3rjڱ1qϑ#@LKnƱqT*@U 9Ku"o~V0r&Ky+Q.ppO"vq7ֶ͋\i"Aگhа! 
ժrfh1`>웂G Sv)nq(Llʢ֠UXHx1e9| 3~OczsQLH $HIdG"֫#Ϣq&,kY2eEFMѺP $R-$9kۢt$H0|Sw7ԮQ U+@>,Xg3'He[8x))|1 %vSbX.hc$,wDZS'|*)R v$>Mp[<s†qy\;S~j尻`,,A|M5?H0L?,n!.`eXd%V/ƜЭvq|#%コ?0(Dx(552ȇl򠀱XC/L߈(E/S S<(\z/ ͎p\>B\{Z,Ky㓍A ,3fŋfWh]eڰl˖- 31{֭[eAhkGXQx|aFwhJz&-Ѫ(X7eF$p3Ov{uGMѴU7WvTEi 1G+sa,B]~]ClF>/Vd'fq#0a}vfōSNIYd,ׯrʉ%3X+SN.yk9JP0Ę> UGm]Kb(dOي cus Ƀzt%V%DtP$LrAWԢ&C1y7v:bwj0n:gLpqtS4. /aŶ8xNuPvG ۇh&gYB0N݁ڠqȔ k`LEPwy|g8<ӫwk#m&cعq?Or0B/dr+*ږȀ\<(q\.DIu\=]KfB <+vq%C1SsԫnWx*dj Bi6?ˁgΜ)9 @F!bc+WJVǙ@ظqc]@ -8M<8< eCtUԮSB¥Qr$ amÛSDzĄ?ˣP,PY&U*rE#˷΄F"ڂ41?P 0@ȦLR,4iiӦرc/|b DrҰ{ XKwIcݻc޼yGs'4wߟ Çc'n5h@`*8wr|L/EzˏY̴ee3-98OY{!~ϟ 1ϼ>roXf- >c`S'?m{}0Z `(gQxryȑ*5R䫎6}ܸzۦvsᛲcUx>@<6~$ )3R$Jtkb .>B,2ب#wjZe PZ s+.܆Qݸ(*٢ڵk&?A4PR7 [wok1lJNiEڢgkp֔H0`8SKڠn|kߖ?zĀIWY#mh: .]ߍy tDH\2X.K[BAPj\2@4k,>pt2(uty Mu9g"pźԂ4.kJ}Zj%o؆Qxy 3XU g|u9l)1ܵkd1~,?T_֣G (vlYCI=0jx8oݎ#Vѵ[/_b&*&L غm;\\݌c' /Ŵ3`bl#cE3w73&T!\? û};ǵ'xy*yyw^W8ov]q.!"nœ8f<?{`51޽3a vƅ/M?8jmض$N㧑xp>'vuR,6'NǜeNذ4bh<{o]ظbfOf`5ظCAZ} &Du r-wq[ hPePy<ETv)1_k[?3Q~"?{!=<5u1~Sɱ?|wo 2} .!FD>Sn]ŕKF`zAo#W1ϞDQ`Kq}Qϓc`|֧~h{v .BLjz1O<﨨WPL##50ƹ?ww_ĭxlgJTV`N5 V1cFa9WM mޟ;m۶I f=yo jL\?yьA 3.A 5jHˀޟvI&?)ܼys-[V2\[E2oƟ'd#ܞ\6sC|PvkD\eQ +<}r SЭ ͋orG-Bdei pv :tg23sA~7-g/3,;vPGbŤ|۶m{.c'3ͮi5i$3fq̸*nr~c7_z I̊{q+x}d1gKc^cϾ}Fl# *T)TTD P3(6ˆ3s>6)}}'c`#:tô<6 -ta,zi,0ko fJWOcmCK4|!O4χ,zaԖh_9/g)C<|fgwGB=)We\ &U33]ze@$X0 #WF cSi|ر](IJ^vy/YdM[l)ـ...jˊCV0& G9k&>,vpqX>oԌK"bt /j PHD_]A$  )佋'ħqR$ 4h2h2L'_41b_>}zun޼>p}{G&&D/YN%K&2eR75{t FX|7fR͠?1͛6<ɥ#y\}iUfy$c顲rrWNˡU#ſ^^I4ZtM_{ Zj?hLGIÆ fDu.ٲeK)U|A:׮][w9 ad׉rY> ͸>W$_H`oVՠAԥHzTH 44L?ǽH"$äc͚5w`0#BNGD???-ǻᆱǎA&10=<_;>y7;AQF$B\#kUecj^s{am^_bSYq䬜rytԳԫJO{`V}K"+X@MM&>d=Z5u m9zT~Hǎ.| v;ɚ5=gJ7`TDNZW9rN:Ipp}֮@xnR>~xߴ&q=ܧF$I8onzNɱCSr?c襳r19f:).]:79t>}k.ȥs'q9uh:D}GťFIffQ[Nٍ3y)Bj OzP&qqyz~;G\`XnCt3QH7ajRFTԠe=|VqPI33Wˏ~ 4h2h2La&ҙ1c:z7Ƚ bDp~CKmD֣t]Zx&k֕)SgtA . X_5JT ,͚5S7@u5Ӟ9n"y@ ɘ1$LzȬY* F,J` ҭ`L]g}zv (5oĉq4?-։.1_>-ZT%>ƬCD~P!U|@ L&SMBO0A|8 }|ٹsv"K <غM{d2Y0F$AseĨҤYKϿZbr/kv#  vtݻ6 Hӂ\Pȗ/Bbjr"2hj7N ..Z9Z,~{(y} Џt"D lM?|9Hm`7"! ~lC={cSϟ?_Gys 4hd2E`BBĈt0)iѢC"5h@ ࡁw`b'N _/:+8|"7s.0)#ֳgOG<3F!t\`N#kra@w}sw<:Gv*ӦMSp&" Fpv0N]4vQNFD7q07}PB p]>W!P|:{g>|Ysxި@MM&jRȟ LWZ5YFp0^z:lQ:,_4؈q3Zv6LPy1nժUe~ 4hdd2y̷tz_x1>}*U;  oX|$I ssTK:utl`Xf q4<6') F8g͚%իWsFg4N@o }7>2 7Ty /((Hu릍Bh BI? RO8o޼y%LMAo[X 44L{81:x2a?uĽL: Ш+D qP(?N@jN¥aLds\L=gh2Gѭj7`ĭ&= =E3 pbGJ5tp9K|It{._q^vm- Ͻٵ $uH_bʀL\~c{xeۀ8pH-?g/z.hm{Н4h2h2L>ο3d mp{hָֿg.\P"EdyIdo߾ `K8䨱7qD}B8qOQw8PL1O}6a" n}?>w}%QE H~hѢ yiӦihs~ŋٳkpt84z;b=%K/4e$ҝJҥԩSp=YK\;pǎuq=O}@Cb3gHZ]1iW7d"ƍ{MO\7hdd2bx0)!fGnݺgi&=hǿ] m5f@@q 2I%y!vc) .@Q8ph$ I@ԕoEqO9g)<  CTt@cjÍ#G@e'N. d}Ə/]vպ@ ģ&zq+Zao ,n9jzSB_ȹr^eҦm}?43ЪK-q϶|Au^wg <~%B߹(  N1c˗{l߱c2h0>c@&&tO>hteSpaf\)˔m4xGw(LH> Hgj+ap̞=`a׋I/889"ds{B~f`xu K" WX17n' m4 |tu谛.]:p9(Ǿr[]<nj֓:}|'IJ_~vuu,2wo7?0c P/p}r83cιBV_9w 樂GbG_r4h2h2L)>DQ`Mr$[ln\R~Chۘ:9N1  pο|l0@oN@2,t&%pMU2%Ʉu$6h[qqSınLt+qK4!`}Ȑ3gNK/F 6x_ NW| 9 8f7pq9s/}|ۤj'۫c/Hp5B?n.< gVZIǎqz[,,exҿ"u, 3 [&N@@d\"8ČZj#3h# P.١<&ի4o\cj0QOVX?pLiOk]^֮]0Ѝkk &:kcjY5V\nŊ+Vz| c mޣcrKt,왰ನQna1.\P4th|/4[1,Y' :,!äk: ,Bf[CsKணAlj'ns{.-J,YR 4nT.讋{Y"/(8 #24{qOC,;00P]ܣV֜dǎ68uqqR˽/1ǜKDUy\X|mBy-W9g}mbP ;f̜-#G6h2h2L1ɬYŰ耛Ⱥbι\s;E^LwI5! 
6Bk'M5=;`ҵIS7E}gcr&Hu|" &dpLps0>uw#PGLLԵ_'aGzs6b/ #t&P9돎7uDzp Qw@\jU^Z󮮟{_4i\f\| _hH"N?L#M߄ s!g: [;89c_@\rOK`w4{6 W a) C3 -[s--\HڵG8Ǎ9Z&&3uEu n+۶hdݺ6"aL-[˒%sq4A,S&L^'NTQA4k3zƼkc<Ø{m1{7ơ@'ǂZ|°#_ɇ$ au}L{K"n2~իȡCc/_ع3Clr{S "ciAp_ N&ܣ=\Upuyph{#hۏ:\2 ñ7n8~4dɢ.!thxР;6 $84_o@S*D@:[lNpG'0 ԅs~qG }/w21F3@9xc=!NS*z=z9B=K9%O UXN]=tZs- tX>%vusNyd2Y%RתA rmDYڴjRWhc /tQ+WKCM | q̵HXnrd}uN޼Hr?2Q9^&$1u .cƎk@>HcD4@<Yn4iv=e P*cp)q J@\^>p'R_‘>/0/("p;%kp9/86@=z09gGrp*U$ 4PL qr.8a \'ԤqhF>9Grwܩ8}tSP"\r/+\cl+&AW*;h6#,FyMM&)m;x>PRi#3gbO]V<kM4 HԊ-9\ľtWPA ,Lƙ*TH'ȭz&dgn&ٲ:qVGyP =L MZ qnyuGE?p M@8qRnpc~ӐDž7OrЌF ; JisY@; ߏ?'u˨QFSq'n3#5wC| p#嚕q|p׀lC/Z9d$Bg%.=u߀4"tA=P(W)Q\=TԻ\8F8Ոׁ#7=Ϯ C8d*'In`{\r;V3hdd2<lӮ5Llnb`d_/?p0xM1If2|޼y2c M2Eƌ{kxmRVʖ(bŋK*/Ise˟r)40\;U&v'g=V2YV:''/F1NWqtQ{ $ s=oBt&-YX *Ws ͝;W4N@^kE_O?)$qNr(zWĽ`j\XvdSqxuJ@h[J,^qr|Wo.tٳDgv1{M\ž[xܯܣXEs~w\>"+sPLt0dP\4=kw`qrmq 2{0v] L&@ 0Թ +kg׮ݷrni#)FZɐ5y靏@R@>(`59}14Jv7-2|#|;sI]hsKH]7@C@ @ >W wrlHnCF=-:s[I   fLmFyQ;@B?VWrE%K&DF&׵;;}{[ L%3ܣ;M.zM/j֬+R`P kะٿ< ce cvs9aYO foN9pQϏ:uf͚]]`5]̘mqu]n&>ϹGlx Ȧz6@MKkIMj 7nlv[ 44L& 4G`OTI:&'՗&dO"Y/F)Xآx6y5Cnyn'Q2Gd$ \{Zt@qQ` 9# wN =G<Źs T!RK &JM5!bW .0` (/1DD+W^ Ib*ݣ" v]C hA|<-ZTDs&  jxw]#ڍcuQe޸> l^# - b;8o;n[%2q+V5q޸sW"NI"P'"@gsݪy7kvPu L&@ 0OsϿ#o('ժWߪV2e쯾,Rj8.'_H%{kYm~<㙥BL@q"" !r vdMF# ~ EAj&3װad̙ lp!|DI&n7" uV|y 0ա]\n@4 y%}*Il:IJZ<;l\OKeL 6T0S8pj>surєh4w"<()N>jHp qmu8 i\?F5 *t$[(bV4pf.|'=A1 *(.uhBCD𔚉܀sn71]xĨY4h=OwC ww_z'0Õ1QG8jyA ǟ8@lٲ h0LaD9S)Qbܞ^ul߾BVJAmyAl/+ Y&M8@9tPk < p*p w7B45 d"J\ա,ˤ9`&@Y9Zt0X040..>⥀6-GWy74#m$>'IO'Z̀z=W^~1HG[^ N9jHq86<έ 5J>g\=֋-lĹq^|1]sqqr'krM]XnDL8)?q=QU<J~StjZΜsQqMLU嵀4~IM9 ׷~QW\wDqǹ47Ի}-փ/5js- Z4p 1p|s 5ˆ.< 44L& 4NՋdR$ӯH $Y9}9jw+'a2Ƈ&飒D7C^ȱ"J]8"4m0(|t tN2 k$.IFG_Q{dirrPPE Ĺf.@W&M9G-A H-HvRg@ c{.Kl9p1\t< <"+bYx>C@  9.b&vLڋ82="Nb@=\z8߯1[@9& 44L& 4xmRAj%E 5([A6-SWpcxU.*6՚Iݱ{>\"\9@E{'L.?lӍ|d.r@ PcԊ\h)wUVc  M,bG<5oj۱Q^$Iם.N5~4Y. 5] \ j}>״/`fDL-Az<4yIC\t9v\SǏ_DZ [C @ L&@@c>aw q?\g+k:wcr#A p~7mXA zDrUu죙HppFRwEkcu@:Ĭ ]񯯧`&Ly]tޅ7X'w뢻@|@M M6pc ȣ6 L"QM1p9G|ٹpѡǾq d@d2EnAvx&v۶ȖlD*k׭im~ʲe+䯿IgX E<5jx,n< ŵ@-vԷY n=sc?wuK$7 ˥<.\XG̶F Hdȑp-]nq*ǽztv0Šj׺B3fɌ1=V*\Y?O L&SڴWڬs<2mD˜25PWAæ2g<9yt ufs}i^A}; 7s#rK \zse{w.:` Z"G7|S2g|9F}ѫ|`rmD+ ^,mvy7$d{q'=`["mg̔A /sވd2E"4GN k&KA# sMoÍwxs ]C>]A@rg.y.4èTH3FM&y8G_A{t!Pue]7axx?{Kvڥ?}a{p]E5t2i޲4iRwnM r;timظt9+HP0h24L@ ݻw9z39n#1$Aܤ]N .]l'@cs,lfuŽcxJ"EFjԕ׺uk7tP?~ Y`:\C jar)}mܫk{+VUeU$\7VEr{>SR~f| L&S@k%ekb=?o`\}o֮][ 2D cL&SDc'r&#^&&×@߸?&H4)S) t 9N a` 鬛+W.m $ۨQ#}aÆi']n\|۵k?&F2_k2e2h2Li0|ZCls BBB#.]{{9w qatI(HԩSk\_VHޒ%KJ*UI&ڑwĈM6:bBL&Ɉd2E@vl0|ZCl4XltMA>oǟg}V~my.]:KŇictN*O^_(1cY_|.]Z;iSYp xs0a``l4N\^=iӦ4nXz@>DoVkף{0B^W"zKɓG>SꫯpժUh:2|p:u,ZHlAe2LNFLM&)2e`ob+t:iײeKɗ/Cɑ#dʔI6@?`#O ?%J y}g"` fL<__kڤ~ұcG8vX]ҥKe"ܾ}v2ӧQd2ݡx L&Sd0j'×@߸?x}:VZ X# D:t[# A.@Ν;ǁm޼Y!I_~Z/1b!CG  3'u ]!ˠA E g#p17HN: 0 jҵژ(Hsý@d244L`N /Q+q+=7 ./R! p-o޼ f̘>\~p @t8`;#_pV ` @cҤd޼yZui֬T\Yac$k~?an]x qc\/B4ihb"<68 !0cƌ3Dz8!]P:M0AבeVd2u24L`N /Q+qK[D`Νua2s*T(/_vL2Ev*?dΜYpf CH!ɓ{nٰaqѣ5ؾ}{iذ>Bu' W".@q4S ԩS+31;b?!A n 6<G#j Wds- ԺOls @GQٳgk'c[l})e L&&&EYllV6o٪P6">NxƁ=ұs7ٴip e 9V~ 5V*2g|Yn2sliШL6]-[̺vSf̜-g=ED +Y-\p/\n# ˍ.1Y ;U#X(+[.oڴi =׿yŽ|DY"vx.NFݻwW!N={ァm 8X&.C?b>\DV4dy,{ Pt1d LLkזVZ{1`#l@'QIucA3 6ע/ )-;LMM&)%Kn2t5WlD>K&LH6cʘ${BX} c2p/-\,ը' 'NJ-v+ƏѮ}']iAݥά_붲zZ9Uo/"_v4 ģ>:PYw "VZ70$zKLm۶ ] 0H#ND⻀=#*:/q i&cuf}ɣuNjnZwzc8!]^ǻQ 5)aR믿J&M4r 9> V0~H۸i߰gc.,`244Lh*jf꠮Ā)2iT4&Lٯer1;&+W>HM{1ztSZi@_ݺ~mzͺqfD2$dlٲM1}޼yNهy9矑z^qaZ8w8h/lF Ȉ#@y@@@H. okܘD42IlAnt^ߜ9sdɺ;vZjIRPBⱞ6 q6.VFW.-ZЈ2`wăy BLd]8HNƸ!A8~xOŊWNr$G1~ 4Tpt3ݻK5+c{. 
L&SdDDa# r?qq$#GdC-uL wISe/zd`1+/\s+`\N‚GȢvP`LnhP6n; cٸ\df@H"ȾvXu8i&m@MPwHPhѢG)'MTĊzFK.-UV뫣k@H\8-WREc $5  0P?W!ND}Q&msLqGb4WczoѲN/_/Udz.w` ޣ?|a7P&LD@d2E\xEQ0L [sd/? YW);L0gYyP&Z ly@]^@-\o@+0G`Z|I-0 h B۩x7۪.\<|[rA` GWb `F 8s@o7 nG 1_\l߸q#;.?:  g}qa@$3[l$v6!ŌYj_.7 "&L3f{sҥfuy8qBa›/og Ȯ{ALBKj5Rw1ҹK7YD/zJ>Ӎx L&d2LH@4zpa` H<hֱvZu]%O_,0' Hm@3>A_Zr"'S|Q 9K~$qn=z: (H @>9W8 8*iIx̙ 4UPA{] 9@^\!l7K/7\# ΁ڨ8kv >R>3`}RZׯߠFW2ts2h24L&d2bڭ5yùGs ~ ل0\PG-;V.@+P W`,"cUK'ӽ ݻW8\AyGp p@$IH*mf;CyCBB#jRX($r $IJ.]X3X d= ݺ q_ƽևs(s8iTYQchd2L& D9"ۙ׭['SLQ8F0q2:Y/ $0$`j@ L&d2LG8jpѵ+ 7ڭTZ)0C؄g ja\s @! 5p@:%f8*vdeܹ6L-?bŀR)P0GD}Bj\j/[A#p%:Ź;qD=-[Jݺuu]}7zh$% d=pN I CmX&%.u 6cL- 44L&d2LJL|2GQpc 7up9߀^<:Li&uX6pwQqp7#Fh,zu1ij ?Gۧ GٳG?ujAbl3`:zsV8J0\QO p nOD8Ⱥ7j94i :Tcg-p0 tŹu#:n7xC!"xh$*9;1 @%`ROXG+j3r\X4hdd2L&d2Z|.1j%JH!\R4bz`',D\ZJI&O]@j$k֬H8Av]nа]9"0`ׯ_?=+`u'p *s V^]"PϹ'+ Tr\qnD|qicQ`9AdȬǒucPO{c <z?:*]bŴQt phrX…F(*g Uy+ P : i¶1x+QT\Yk>n;v( ;<^tŋ5KF==y!9[dɮ7*d q9.ƜDqE2PDz4pnA L&d2L8!϶mШڀ)RP3rHF O@:Y@ RsH5j$Z)U8`1b@[4E `@0[4quRy NqpX;3Wg֬Yi 6MDZݦ10N[A@!I$pB3,0i, d]p3'.Qרu){zČq8 `'j>〼@&&d2L&)NPdl@` xg3 vPGk4gX?[BA#8" jР'" $R k> HK! Pč8ō#;&#&$k`йĉbQup]9@u0$zR3nRܠt1:r\q d]=BW'c:r;"4< &%8Iy  u笗3g6hd2L&d͛3`C 7V0y ^\c<7DA;D=8hNb:~||jժO%J򅺁w"Ņq `roxsԩ6b3[vdUiB$$rM4-u28V[?^}z:B;Q^`""87nܨӧ kH'O]7n4)! 6xJ1B8?gQe 44L&d2LqG BH0,"ȈW8وG4C;^@!\4/N5-DYW<胣ebSDVb[n܀m#*C```?\1Y>L$6 Ոk$ݻwo,v)CuSnRd@_8]Wj$Ds ]#_}UT l6dedd2L&d2]d]aG=_ M\y@ri%" :okxذa pL$KG}TԎ]F5Ip%J d d~i81s% =kc Dwdx5%i>gDq)F4ͺ qz+e 0@0r8 BMÔkN$KwAoC}_FLM&d2L&ӿDI>,.1v@jQ.2&ծC60 wups6/Ȃ8Ā:i>s=ԒZQHL7!Bȼn\@$q]A @M@C{-N@@4c>+18qG"8pXbݽ54I8Yqe#EmذZ <4N@5q@l@&&d2L&) G7 pQw 8f"ˉDjpxQt^T3{ԓ#V #e1qqlb15HĀA:2Sg$4k#P:|nj/r|psnjǍsHwjK@'cCa^2WJ 0y[ U ?.Kfۍm}24L&d2L7]{ @@Qd {+po:<32Qk,h f5Wq2rR 4qdD'89.@Zerd_9mB : .eʔ&MrD"`y H81~(Q"];59r̚]wנ蝜Dtc U4ˈd2L&d-T:uF.r;իW׸hTNy}ܖ-[4 `@"9w`"lŋk8T"Eԝ8qb pg5Nh2ݾ\b9>9?Rgvƍ5N_j@ ;8]MDs9NL3ŋpr>?}1Rͻ֙:^JV8l {#@ߗ/@d2L&tᨷG]V [ H 8p4n:J*a⪬3>'\e$ur!SV0rN4X8ޢBl ) )&G_U~~Dɓkb%c $ RpuuX`A8w &# `$v$ݦMuQg1& ;&LA@!A@+}RJk4rpQog .+@gHH6`ŎQN@Avp|86.6/BǒF HMdF2lp?ox`ZxgZsuXXh$~q5~_М;\iu Gr#F^ L&d2L\8NvԕBo>'i|XND'N*xL}3)#Hf S-oZT2>|*|\}NЋGiҤЛ-] yd$qb:< |U$m_g꟫Y䃗Sҿ/|[B1qlnÆ ՍD.-܀@{gF9ցLl7(m޼z3~p I=zmVʗ/OpK=dY6QKiĄq%Nc݉69,FYf h^zҽ{wӌ_2dpݷ?ٹ'qu $:c |tXHťEK?iݶ4oJ " ѭ{/ܵOld.afҵ[OMM&d2L]+'e~_}I^&ɘaX}\ ;V*Wįx&y偄%a`h%9bLV5~eёrSrx\ ' 6VdCPSr_\&Rjq찜8LZVU$]FKˋ7ɖR㽧J 8pgÁDb8(HiΉ8# D7%xUWk;Yoq5d+XdIm6̙St?e<ѭ}>ˍ>/ڤI4+nGf\0<}λޝĩXËLО}JSj.RgϞҷ_ҵE&M)S#mtUZj+IFY.@@d2L&=W12yay5q"I'R8`,^0GNsʚGً%99f{-}PҗW/ɡC|ߓ!dlֳy6m&)To[,%="Y~*-/KȢyeĹh}Oyce2ƧJ)닒%ٓhqm4L4i$K,+/ʣƍeʔ)9O?UGH]9Sj̒rdqm%K*00 5h B[btgŚ5k1e_jEC@Y˖-y9}t[G`- |9Gw O@bى,Yɵ~_xL$|s:u: 2hpYd+=xpջ@&&d2L{^2ZNyD0e~)Q-W\B (M/Nȟq\KLݻJ9`RD@Y"Sۖ2?,/,ߊe<*~. 'mU{bR/7xRha)Pk8DFog._kY$M)))<4<@o8n@\p@1N-ǥa0z/<@Sk-U(od.'Kɜi'ߒ:Ⱥ+r D؀?VFGRzޘ 8h/"6m_|Q~LRr!eʔш0Qm۶ PԖ-[&ҩS')[|Ģ0A [cp 4 w Ndd2L&tWee$y?+R ]IBL-s$ѕ$3J/+oIn$ OIEǎ$`Isd{HS\>rj^X)֐"KBg@IDAT$MR{"Y}lJfH.7 Ք{eN,-Mry<۞UV=zKy$W'%Ci׿'ʼ]=#:{RL2)#K3&x/ qF` sD&F]C\4bwl;h 1aY'|5GfĂ#NkL$u  ng3eYn@&&d2LEMCdl&aBI,TJK."G,'G+^JTH|$M<9y䕗SˣǓ IGRJ,X9x\=L,%ʽ"I/'E;zS55Fcט{XKF[W^( X芅b*R76'8(׵rAf92|-5!0g̵ KC)my<W|ƳU#,s;w,d a P~pP+3>xj\t6~J|:cL Nܼ?Y<~[ިydL9)fLnrlܯb?=r;tz=+,K.O+^+*kt/|˶\>p衇K,QZaw23/+ oAGV1:/esLbۀ=26k̥"x!gg[q9/qV('I8}ˠy';5|@$B [pGkov_9~ecCO(.8}cŗ5 θc {{m:TjݪaI䲱? 
Þk\<8g[iXa]cFM=Fm>/7tXzbŭO;</QO@?Ƴ.2 Us6`n}'l0`@ rqHtY+jVB5kW_mڴ)7_lkVsOwyM8( µZT sf`׮]aJu!;3||%BLo>/=)pXycUW5XycGh}i\LF11m#qbYQlMqϰb̏.755WY1[e8ӯSt񳟯uGo ㌽6W].V\qXyubeV4>ڸexwz|??3gLO:'nVWϊOu6|_~˖eY\Ya82[1Yɗ2`.7f+onP3g~ѠA޽{i9t 򚯾ۄO=8c뭷_9?0Fl?"sgm$Bdf̘2&&-^zL.;N8'36kb̘+n?qA'ę7NY3'kZg[Ƈ[Dkƾ󦸷KO㳏ߊލ.q`箱NλwyآWq|un53;n98Se[3#i{(dۓO>\rIYn;C}%u F G\ѡCRxAY~9/0}dc.[\~aWe$/33 =ܳ\oM7uY'fRqxWD&M,^{T(<;ܙ`W@193M >]v7VO</1:>xׇo1=f#}_Wx/FOYG^a/>}EߗߎI3*~z֌:it^8/} 7?MjbHnT&͢q'ف1GӾbđk_1`OռɊ -ove+nXZ`s F.lAIVѳgϨSNr)%YYwm8sm۶%4̐3du1+!s>bއ siM7{lL UڕKIr`gzN6΂8c$;3|ɮp  #@bIb1iʌMy۱7Ի[fekYK.d+KV~|Ym"\V7^z8$lmZ*[~5fۉsaVOykVu]Tb !5jT.c,ʁpKh٪u @`-lqᇗp 6(뮻VV^zEfJua믿~*y]vO>?ԩSülybkrǗE!so~=Cz(^x5jԷ6g&ސ!C[o58eko.wYyaYCpU'$^V902c=J[kX $9ru2|m/ƃ>_|qN%xK tVuYep@g^nBʙRv7xc?1r9I>Nn΋-[C-l%pqa";3ҵ`?$B,_/m:X|˒m#C 벅7tR^Sǹ.۟b/}dHw?^f3';+31bDY u֍OJ0kTAf:7}3{/00 #(f(x}_ytС}p`_|%BT+YVs1ME/oˊA"j{79眲#[[nޥ:Cݻ* i*sJ.00 _~hb}-ۄsp}-EukΝ>yاo?|$BT+rYElwܱT%$J {Zk}WJU`\] gowz/R=,.eA`9/+*s 駟 3|j6 &0[sa. Y xk}}xQ1jԻ yO8⭷^hOu<#$^jM0̨ـ[mU7)K-r6`.M9?pa _rG.ygJ@yڳp 6(!nVfm 07g :'"å rkV r^{Uff՗9C0d`墒lIɓ'벒ˇnݟ+>j*:u~2:?e?-Z.XVuXm2|MQ:>^*1{x33JݤiN= Pe4vRR,LYfq5ה9nrqH.ɀ/+8_j V+"ڵkWy 1x/e0uG޽KUݖV̉|M3ݎ;N8Te }s_׸ˢEB2܄$eU970sQI.f\f8 36:[g} Xnj3cܫ|6?WyΗV`tL3)!̐/R\tE%xJxꩧV7|̧[U.#͛\Kٗm9+1{d(-=z(Uyfn_e[j$><20g6۔لoyyON:餸JpmVV. ʀZ _!B,!XQFqGJ+Tsn]VU-+rpܚ|7R˗`t-,A].@.+syJ{s{2(Y-V930K.$c@dW.U q'mV||oP5$^ڇ~XZSVs15[fE:g%>裥E/KYrŚk{Yq 7DJ{9!bn ݻw/a`>qWdI |k6Zl (@`W)^zqQGņnXlE;_~%Zd{E.]ʒs=7:R+"O J#G~9707[4iR6g^{lP6% +eE_V$F [jUڄ;찲Ax%W_s0#(3yx!YYRmꪫZkUu+W=z뭷 MUW]U6g;rovieO>Y~**(++52 m'pBi_|_|qԩSgUa>,$^EZ2 xy* r݅^X*[\UϪȼW90gWFRy=и_|1}U>~^'ߋn.!?OSWym9+;spwڵ E}d%B,rKKߕW^YZK_2.+KsVݢrGޫ\ޑ87o^6eJf .spUECnuJlS~Kx.R|9pw5k0C%<6lՁf(x!í 3jԨQ 5>q= DhAګZBқo9mV+![op@ oȑ/7XU |m۶e[!R5\,z lm90 gkq.4}ea%B(=% eYfkV WzYve%e`ee-ajV fkntlɭ3ySO=T$⊱J+zWb6ܰaY}ea$B5",e{iVe\Vծ]N4ɍCr!HVNs9?9^{2_1+w2cAѣG*[瑭^=#q700lK mݶ|ׯ]t_0x!]5[W/袲i6+r]Eڵ+-Y'Ƙ1cJׯh֬Y~q6\rǗj\./ˊªM^+SVvf裏c- Dn2/p5(a`_~y ޽{ EՄ cȐaѯ_?`` Kg@sС*BfepTN8K%[V҉p:th ~ 0{1襗W_] W^F#><@\{u,9}lI/W\mڴVެZxyl߮\ m 6/쮳o׬̠G ;txU5ƏP|T y*}O嵧L^yhبiq=qzQUrn;;:ԯv]ѬCѧO Awߍnݺ6\ꪫN;?omi7T'o@?j|K4j$[*(sV`V%X|-\wuOEUx۴{]8裏8(vuV[λ8/,?WѢeh߾ct~NѡCXS'b7?Aۓ{~>jվ5:<1{xO*9M=T>6UxV ?7O ,b(+8蠃J5`.ȶќ?TuVnQ~x> ҵ[Ժh^&ޞ={FӦMK_n\έyOsb5J;[3kԩQ;A&Ѻ#ѾcѮl:5nW7nΊ-j׾[o=hԨiGi_~}_Gm_VqF5sZn[*x~mm}^qmwoħSg?ydzuuGL*fA/-*ޣ:vvr @EHPڱc8F%\2vi2+0>[Ww@`^ciy-'b7llE[f:|nl\k_ _]vYn:S̜,zTVe+^X^\7bذo>?~^{}H@OvC8hhаqu`V;b[UR6mE{9'^y,fP`d fo^mۖ!-2iU/9A*=5jTѼy83K[YY{g] f!CbΝ; \]=|^ˠI}oN_1yUr<ĤknѸI*lp~x!r.-zhY ZkŮ{o 8N#Y/\֑6Zjͭ9g2w~W 粔\-Rd5E;W_U}WK/3} ƞ=- g'BPGq{-X${'G-Je[U[X@/>eCpnw}c֋eYT^fU?ϲ'b{pz㪫*K[6l8묳[qzx!"89wqRm5׌v-./Oʢ~YVeO<w}wi!yWYe` J8!aÆ K?lM lp>C#+ 7@*HV eA^e BڷoofbXy*pܸq1thժU~xαjv!98K?i,ұ&E:uc͜3}?cFcTċcc1kW8sZNck5ǎ'VqYM;sN8;@*Ujce`Sٮ'rNW?Z|o21 ݺu/N9j'%<.ReaK&MʖbĘ:zPlz۪̐E,?Oق!P9|'N}9{WYfـ׏?rXb%be5j֯ 3lШIt) /r{{}Ѹifx7Y bj\pA<ݲK^:x[<ݽkϱ֏Mw?>7$>6c_nGX4=u#{Xc?Q7oOϾ!<ȿ;K=SW̌?F3dmW^ye#>9 @̅U7rh׮]r!& vUnڵkKu6˙5+Sp2]lb-~k,tr]3o)N3x 6eX>~֏ \)V\ױz>5>>K@nU;jպ+q8C71cYO=խTcf+[f0[1b`ɱ/<#N언q>ʿ:tNx1񳙳usVE,qmWN Q/}y cĈe`Ή{te 6ؠw\\wu% kJy\Ĝ}駟^7_7PXI2hժUiO~3m5[6_x/ N[gO‹}-_]X2}Lx͏w[{rl}861K/Yn8{17bcm묰Nqvh`o!CK`'C{|67o^;c=b-_ץ :L<۠+?kYma`~=ز&[7@Q^y晱kOO4niK2׿pehv%E~LƯ5nPra qS'ŧ;C-_Sf~Q7gVr[ό&L}(tXSO(Ϝ=r8t} 37Ige`޽qq饗007|է2l2dHL2 7@Q JV.&+)\j={ƤIo?pK1KbUWOlw{7>^qg~ڞEl⹱b yv{;%=ǐIbƴ1~Hhuѯ77uS+ۚ_{&kַV2lSCMGZ"Q-[ks37J/S?0OՍO/y3 :=>#>pV|VxQ^Ԙ0-}̊韎C Əg3|<9N>2<Fbl9KXI28qb޽{<eKI'T*vq,ҳ79AFѨIS<p}{kXqsm_]>ErFeӊǼ;<ٳwjHg21gӿo8~~LZy~2eji]0pPh*Y@?Fn`7n\Y0xҥKԭ[7?8#JV[m뮻n s<{JpdK@2<͍g qeWƋ/-Uq<۵SѸI2tL\J+ Ӝ9oW/KBrk]dfm .k>j|k]t4f>}ŀOe__Z??>GG]!Dɪ 2pԨQꫯ :t(guVԬY3;糽JW7.hgqQAhּ^pѥѮ}`i} ΠA/G޽ /)gR%͓Պ-[9 
3V[Fg}JzhyqGW|O<1N9r>r-q㍵ڷUyؾ}Ǹ?o ԭ_N||~5iũWQԫڷ7պ%ڴ}T(DP=dؓY8hР֭[ԫ nx[=zL\ψ#KӴC Dxǭ 6.mUq5jZ*N;23/euׇqk=z(U]v-gy,իWlW\u)sTř\qw{/."['L1K@$Vk/T:fiV1ḟ1 fܻ/*[zx!20(Cm;Ϊ̯_h2sfY2k~Yל9떤T7\p g'BPM w3ƌ[Æ!CbЊy#F g_}W811lh5+[y7Fwf|s(sITSy]ɣ?398OO?#ξW͏g/%vΌ s<-N=55<¸Q;csN8;@jJ83?8>|Uc -w3ӟb}ݷ06j8̻~1u=uN.^mtƾ{Slz!-Őfp g'BPM w..wF qQc֍xRhX|No?,{ct|ַ}=|Xccæ4<NλvG_O2n渥m6$sGߎɟ͚pXn㏧_P;n梓V7>"N1泘x g'BPMeظI:7~>#F 6{Jklw6u\%V\ur;;DwQxƱڿ>|7_Xu=KZDof6`N~h _XSOuo%5iV*Q㐿 Gֈ]7X)[{8Ѹ׈AݜUn.Flq~mX{s'īc>i/ITS֭ 5(jpi@Fe_ 6F_Ko]ՋzYcضqmA>q\Ѷs˨?m֍U֭g58~ITS6h$)^zy>{?o#7i^ ➧ގIߊ1/'ljlnQԻ 1Kaݜ[ m?3>^7>Fmm806_ak^aψf3$^)K@S#f:$oc-]1^4=7pҬ[ދ׋5*>NnLNλ ?4<,θkt860oF|3ώj7pf|طuty9`1j92vxEq1GIW6OgsN8;@jJ8+Ik+#ƘI}ђ;WAcbg3cUΊiߏo|<=;3OO֘p^ g'BPM wG8x!N8w՛  @5%wӣgueرbIUrty*|x.c=B4kP46mGm@fznj3~{P?ѥ' 1 L'Y@IDATx |ՙ@6*uR[xԊumuj+JjZEֻ b ^ `=q2$͛7|~33g9'g-l&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&`&}>՚IUR[dl&`E нu 00KQ5}PNJ60NA`?n @'P=!Ѹ0A\4lS>o O֝/5'ƬJ GJLLL:-ڳ MS Zj q?%{BZ.|T%a{V.#Khչ*M6>)#x,Q&MYS/me @t/BLLe~fI`)MPFFض*1R0FfHJ3%ۺIa2U jiM%ھunӕ>T$Fm&`&`&`&Х IU \ReYunYr0YĘ cU.̈́ퟗ&3##q?+YlN!''.V:|aJЎŠMCRRZ,ٯOS!D$PFfJs7R8-{,Z+&l&`E""v5&`&L(?rq\2pb}Ub=#sU*N kT#|@ HuI{Z3RmB+ͽSPAy'Mژt600#Y%;qRfhriT+-h!kRVn7K'6bYDFϑ%d4v?~+҃$ @(R=LLx}˟C-%4U-|[S" K%'A 8Hn('j% 9qe AO)ͺ{}$}O'hϥHH60000N;i&`-"нE[y#0000M;sM(.rf&`&`&`&`&`&`&`&Ph\_UB] @ ,TcrLд&cUWț,'v"+#lC .aa Nicu M20(Ad+*ël'C۩PJ\"m* =5.D#]`{WbZbژaC4?JW7bMM- FqO-O^sS i[DGؔQ)Ԧzy T#jTuM8~S;HJk%,EMeRe1bJj(??miɅNw*70ڱPK<^G8Dx*Q% 2g`E_,= ĩu}ܔm)cTBM1cԺJS?!$ aҬw$QzUD]NǘN8# F$ֳ=K8V`](?T4vW5*knJ{s4e;J si$$l7Z ze=&QFd<&kWP{z?ӞinSdڑBI4ZI81Ϩ 8I3UKlGzJ +M0F&H1ú<&fd#JL֔mh;2pC~u҉6@" Bc-TbĈ* R0:H0FaFSV橓O>N[$HjY-|䯔J;tq֑7ҚDߒVIC=/#:%[!0YVX_K>?_2mml$1 eM|Gkgc20"XL7S!PQQ`UJLDIR LДr$ƤqAa#JT9Nڹᄏ&mfH.pU-T0)ljcI)XuSFw1[@@RJZ,6OtCPYK4XO0iL*\:}2IϔBdy_$۩Y['0Y瘢}a__&)X?y؞{s|}u$((*J")ysib/X~0fIR0ߑv`^֣ieTK60glsS,|4Xrb}U.S4pb?)i2ɟ^7_mJnCѿrufFY,]69!ڎDk8=/qᄐ%l4%'$N6Rc6D ]-/`B"1]HVKrr ' _V:e׆O;Sr:GJ!.JI?N`Pbr @ $'Q bV&a{4RӛuL\ FtpX $9CDY&Β>SC$l4_b=J d!?[1l@:wSl89ԆjMC+3~%itF60'ީO;lAMtI'9 6[9a J89?P26N6ib@).5ڮBh$o['pvnWg~Ia=Y~Gjʗs'*C.ȏ6=elCJUҏЎە>Ob4!it_鱒L5xPM@G˒ɹSU͉%t._ VN1'5k)Xr]XshX2϶T5 jl;O%-nNH$db}( Z9d\&Tӻ> }E+q"lŒޭU_|^Z.ME.G OM(*Y-q:I/ƌy%'a1$/T,.Jc%o̦jHUߐEimҨ܂9$UK{KXj&Kgi#`vı^XWg+attR6Dmt4@O*@P%:%:M5u! _w˻q@ApR)q¢t"N"aW( ##= DYKBF=KQ(ΖsR%32Na1ұ@’[.-`1Jj9HiM>,?W^ 6Q EߥC%7T(Jj(.u$-l|G+gI1J!ӾZxiDZu$GE jO=%ѿΖ۞U(fΓHT\jT.Iݿhh ! Sc&PR5Q600Rߑ&6?K3!J,HmXPCB6Mb6'J(Xչg%}YV[ViJ`UJS&KJ~3ՒL #&xFZS50D~K>0FX%iO]QZiqnI֒H,mcw~+q u{\ by~f%RGC~1eYd+$rƺ*v~/t'aK%l&`%Btcx000000000000000000000000hG<pMXJ/?] LLhv{Kdb&`&U юVvvy;/ M-Ql[-ҖRcmb[;pG;aN@!$`_' x;h)o_%-x1(/ҝy)wO.ItRg39֔eh oЖ#JO۝ $c6/f&`!@xat#.GKi* Nbcto4&N#77_)eSV%Q`J Wɹ&m,#䷵-%-g^<^ǭ3h'@N"KΔKF%Z,'H5$dJ!/ПeaUR]nIl2Z"=HJANv&'WJ L C{a[ʂDpqN-Ji~.N# )YosɺR֛lnbMC;R.m1 ~`)cV=TsZLQi i+itT%$^ /m-\:YJ[L(1BjelO@=l*ݡ఩9t_$l)itpIK'בX6NӐC[¼CS:-i{j2Xە@:R%[?Η[I#lw^'盌R!r-?҅L k$0T("I%:%=RebAt'U'ҴY]Myd/Vz?wLܢ% Q\# (N @Y6kV/k뛥ۤK$my\b4A\:O";N~. ([$͕nI riD:V([1qK>WRܵ ڎ,m\4]`YF Hb9)ygJK$ 6H$֓o4AJOkK2'`)>sX:)uWMh)y%%dw%,_,U*-WR}j0PaC$G23z^J. 
<5000HlL*M'/(=+$ Ohʺ]46ASe{gD(dJuR˔`s%ʠ7zLJ@Rld&`&PO`U%Jp>WY~/'R-1wtM^{tT+M*Q'vD@G] ӤpL?JEzZ:AzW 4FW^iy]>Ogϲ+//_XQQ^%o=^U>[#ЫWl|}gYY^Q+y?zib6?e=g}j 'f@"O+o[vd͛56#o E#Pʟ[oɚ7k> mF^g#2=ywjϕy͚BH9 LLLLX1000" b&`&`&`" X] %t0000("zLLLL8,ᦘ @8,ic&`&`&`]@jq|77e,eL5oYVT^|.F_el|n??fl?{W?odOm2n5x`&]ӻܱ ݻw_z*E>mϗ[n+OeCy7WfuoNlԦn:nWZ>Zz9z굺wޫmڵk_\zӂ;iv!v?EBVU<얱춬{(scYO}*9j-)}R{Nі yl5ƹ:b}K~6nQw&lRyA6lXfE!=|-]4zף{{c-[JFQZlunSLlg2-VSjSWXqo>y>|{K/E?s̩TO>dJU@a]--O?Ço*gG`o?~>֯Kɳ<њmmEs 3l67ވ}ٿiT`]02?RߧO]ਣv ̗FO3g446G=UetmrG/Kyx5wFWj~={|so%zQv5@U{hHOѿ{37̪knj{ܸqWVVRY-w wt N8>SR] \y啣7h#^ilnz`x}9l1}J\`MA`2dȠc=@%,q.%Wcl#ol[t4X*{X[2;9^z*zKGs)AK> $^lٷ6<CvmW>!bp~XK80EzYeK9rVzI-ݾ3oglk\{[!2w}FRDeYY'C7)oD=wS޼ѣ{|y,D?7oX/[f:^f;^"kzc^_bive=䟟<O覼SYPm)+?3őd3?{_sy|8R7|F ~}XD'roP9e¨oۺ(|MWޞq^ƼYŝg2{md#i~| ~t3[7iI ~,[y7ŏFfͷ?o{J3Ckc_c}@'cެbXffWArWOKuƌh]wIO]^N;ow"efm" zl]\MD]RSOn>çOmn?6φ@(>OH_-Z Eѥ^͚5+z" 6zTsO=TDy4}.nWK /pn:vTދR&IVNE)mo.(b;3?>:Sy6kt?\8qbDUHկ~ժr?L^?& LL w}^[GD0(7.8X ;(>t:+ҷ{,0}ʮ|73gNtI'Ezh6Z/e?>=zt}@hr9'7@vWUUQA;#ԧ2{ iۼy?âPv{2= 9 @qGŽvoF4wx=mI&ESLSO}8岎 』nH r؞zC,K*m._K]=P4&{h?ǣ]w] MYPI  OMLJb\徼pOΝq>o~s[ݻk4cm}\~ۧ2g׫w/>^ڊ>>ׯFt&Pf^?^њK@>G^\^,fl\eq^?h6wSL:53mڴW^_C`eVsOF`SK_R9h4(n|vE?}7~X /CDYYKoVU- b|@Q+>ٸчY?5_!g:ktZ̟[>koYle:G{|rE)oeg?9S Z^rwƯL+<=y@׸-}ڕt,>#.Ƕ\(va .M;s7Q@/4<̾6v?d\tBuf͛56#oK5BtY 1L-:~Y;*Ț7kB}51w0kެ(y$ /P|92i'gQ\bf F0IF`^^P&+t<̺ ,ko(#"ho|7ʠl~ mc Z887LL klml<\VMw/=X`Tgh\ w A'4}<ϛ@SVz @$ sW l J000v2000 pA<^i&`&`&`y\[Wlw4[T72f=\fn&` 5b%iHo=){c+Y>IzUJ)ilC٬G@-\ -ĺaZN(YRwrre[򊘊ˇnӧ/Tjboȧ2Vu/gj"/;뻲G=|ūTCb=oT(e.+>{/U*VQwؑ?/o??y~o~~R:7'?I;xI0#^ۙrؑ?+TywyE>}7Z٣Gʛ?C uye];vW&PKm F)~ẕ ]+M&aGKIyEX"hV wIH T)ҳ2iT+"MhRYQFW\g"߲ΗW,+`,yn|E3FJ&okWZ?oٲ +W~|YIg)ehԚ5kV6qKNXjEF\kV~dE?[zyA?ͽ] h. ,'] %#{^!4,'͒Hu4DXDz%G0j]5҂ 3H =JK؁Rؿ(}DԊ𼿆6Kk׮VX {e|?|C/bO!Ѿz*.;>=ۄHnԛ&c?ͤ\:WR\#Y؇ Јe=ϔeH>Ňy(v`_;Bxy.u6/OG yBqRd??$ua_H<1wX7o޼eS f]BJү(?'lÔ-OdY2{\i$KwH_VHI3<Ji4K F3WhJ]WbYHRaAbJeͳ~[p&åfoYiK%@"_ݚ<%# LL.>;/%Fg9c=ohv{>N'ϟO|?hܸqq=&y{=M4)E邏N/q!-H m:E.{7#Dُ#IyqY~o?!Wrj5ӧO]a!V~M~Q8zIo~/9+c |˕r"EݔM=rp3%$}ꫯg‰\'}w{l>T$;N9+lQ+eG}4%.(YA _?0aBKS Sn?'|2oʡ|/,SBWiNw<)믏C}oiNog'ǟ@=6x7i='VFNģ|?״y89A.|6G(;B}s$tiՏqh# Fw=:W^y%z(:th3×On!0Av?GOم[NJZ 6iə&YْyS5:{Yj>ݖmf o@GgT-xQGſ" 9х_T/Tx6:_q/^Ofz:Hΐ{S0 TVV6lω'2Fw1>/2ECF2URWQ@2l__j6s+A`38#|){WԏM0kGqD\|:O:}81$"-=.O<܇k[Xa-mt*t@l4~ Ӂqٌ2B '尜*'lX]?b-!7r!OKNZZF'{XzHz\jgnp>01NmSv֟IeY?X) s$.d(oms3,CDy`ퟤR0|X)W624<6GL RJ)3եmkA83f"OKkk3I'%k2jpI`3f̈F`nxs@y[ _.  
M+؁Rf%-ʢL~@ct fc6K_"KaTfHm@Nډi)gY%v8[A 0ťͽ/Kw_|ɂK#QocA #"\&#_rT Tl%E`(0T;ɏc~/_ȍI[ HvX4_I1*ÈwRKy(%ܤ|A'J&B^wp9 r!mq"2:H]ZsA H~h$oۡlL):N45FGʔؒ'Zʤcyu\b #2tf8'O;X6lS$u0ü.ym?'MN<6tɎ0HhGkl~<_iM1q' -?A }_/ ‰σIXהq_^cK=Kl/P>^15ery0W_}|wnFKmaT[쓡Og8cN|,ۧ'[#gͦ.XI,EĖ'KH_;};TR I?(t9A`zE8 J-Ll r6p_}TDv&`&`&y 0Zo}+y{ 8Gm6v" JvjKTø%{X^ZT^/7 =N/`f@d|` ("@ ] t$;r[MLLL@10xx/{u1x;@ 8l4ob&`&9~T^&p=αs C&`&`]_2dHai& K~ t,|/uR{s7|a)|9/K-ᐇ']v٥8aP/:|6hB@k8l =ok&`&P4w_ ޻'Dߦ>ꨣ6}daXqEƍou]M2%WXʰ~ӦM$|Nj^5A]6uR__?w Cs7?&P|O`)LL{E#<!Sl͝;7^ I&!6~81cFTV,k}@oԩq|dq=S c%LA` 7LL`8FN9x{lWlcpkm/RĶضn?p?ak2y$]N}lMښ&MLZM_Ƞ@%/%sO>~qGyd bꫯ-{ @?nрux;h+j] [vL0쟭~<{GwygԻw}?>;ꪫ_WC'xG׿BfԨQщ'M81x׿Ƹ/'i!E(9jٚ:b[|RWrb]fCz%|߃>9{̘1o\rptk6<I^;2aÆ2iZBp8xO2σ<-L`jժ `Bm=0'`\I?W8O}W:CK; 1 «g8∆;^BsYBMrSځ ){on4E[WiZy,[3gvx'?I1O >%R5}1g*R<X=sdZge[$+E, +Uڡj_5gΜkgVv>yujL~8 ۸:mXA)b߻s>:jмiTi5َ(to}z_FsxaЧT61 '`{ٳ?C T@{kѢE ٻlgzgWXqO7 ]ZV::_.|[/ޜ_:thyawiW^+x;낰 nOV?3|[?UP*FK܃1B$2 dʷDzZKzD7P u>Y9 @&/TAn*IENDB`aom-3.12.1/doc/dev_guide/filter_flow.png000066400000000000000000000736301477627663500201340ustar00rootroot00000000000000PNG  IHDR@#zTXtRaw profile type exifxm& sIYyM[c@ Yzda "sĕJ*\1x]UCg F^#Uぇj m^Gٞ| 9_rJ55v 3kؿÛp$SH\wŝO#QUJ8ǖ=_,|ξgޒ֐Ow0'Sȅ +P;-9k`Vw6mзZasuYPgiCCPICC profilex}=H@_S"8dNEE EjVL.4$).kŪ "%/)=B4m tL%b&"^}EXf1+II{zYjb@@$aiOm}+*9I$~όqX,¬hjēQU)_xr⬕*qOpN_^:A$EH `#FN}_"B 0ṛ Zq/)_c|;N>Wz_ӟWZ.\Olʮ)}SV8}U88 =Hr 7 pHYs  tIME;G<: IDATxy-uӻtr(m[K(mh"^&b"Ƌ#[8"Hx5p-|#Y|8[3^e,&bY15  +{{xʩ>}:HGWUTyߧyA$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$I$Iu.@$IRM]&-[{,I$I7o}P*w[$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*V$I$I*@$I*cx";|+1_$I$IWl{3fzWc{]q?۞'%9ң? :s^WK*sx$I$ Ɗlp}-ؕI$I$i U5 Ni}xZ~8{n40O(p[P+;x-3E^iLlCSM{lwɞV$ITnǁ'xݸAJzN?E/u+1)7_I=&W\& ̺oe/dV;7<>\~ϰ~G$I$UMz0V89HkkuH>{x aqVϑzYe[^SCLJHޯq2w2]PWU]$IRm6z7}y돀y<ߠk Ro"hB9l!;L=yHJ>A:{~Wo*J$IQ@[)@^Fw.,48)Ԍ}o6yFl)9|'eA>r7Ϡ$I,s%m@4)p;Vۀoݽ*0|zٍQ:j'ͥ3h%I$NFU`;ڏ3 Tp%3D9Bwz4e2b!?TR2$I::/*xwZcӫ,i|!yC7hNn?$>uHI7kKǠ$I;kH{Hn9`;iFpÙxhaӤZSIC ?աm~'il Rgc;IELjI$I$u'I.-~'`\=FP p<;ϟ.,&;8ߕ{lW<6S6Rܶ gP_^d$I$iNzeeqn R%3et0OҘɯcg ajV$I$)0h3h%-ÜV$Ibx %I%dJ$IR]l^Z4[B$I$sz[e6^sx$I$I57ܱ11h%-ၒ$IT1$I$s$I$Ibx wؔz2i%I$=ܷ($: ZI$Iwa>+I$I$0qM(I$IRMMV6UFe8mI$I${E!-ބ$I$u ) Re"$ITQ[H$I$IҸL[ Ҳi%I$IR,iETY$ITE[qh$5V$I9eA$I$Ie1<, iETeJ$IRlZԿ ZI$IJ$I$tnJ$I2V,J$Im+$I$I")J$I2V,J$IkcBA+I$IU,A$I$Ie1, iU(I$IRmf-i Z(I$ /[ $I$I*YRo+IcO+I$I:dJRe95V2(I$,^$I$IT&ׁ]\=Pk$I$&]^yA+i$IT[p@IRA+I$Ie<$I$IT"CMR9~rx$In l1H$I$L.[J=@QAsI$I1 <KM6U}\@I$IݰxxlQHZaJ$IR7<$I$IT2wmæT}\5`H$IMmb6S+kh=v{~ܟ?k$I$It(neS([d?fz\lM+gۓߕ{vTM {ZI$ICIʦ s#`kNĶoo Q7Mź;Hꁕ߾|jPcà K$I1-U>|/{ | KoQ1=vS_$I$IݔwW{mYbsSq&{kFI=#! 
OwZm M eRN1c;HD/;/G\'+1?`L~q?{fR02ϋfXc^w8^wN{j͖쁒$+7[@n"]$I}j)$$HAxhfig0)zށ\=};+yRo2G+m==.^Sݡ=>̑X^U$Iԃj"]$I}h;GLLoh h9\*xJPc+lw$I5Ja};LI1$IK'!E6dA+A+Ie5,)$I)c[w1hi$IRG )/S'6rVRnak6mEVhpIPT{x&[w-VI0Loof,Wba%x'w68nЯ6fJ.1AR <7pq iVXViIihns簇[eD] mCv# agx!I.@Vm3MQ24z*HRiY *S/fd|+O7YuiH޿wt>^cUY:%ව:H=ޏCi%ⶁy x w2>7Kmhc\6k$IRs+t 8mqK3LJv|iaRO 5,ARkpr4 *}V7 k&b$Imi8];S>ĺYRߟWfHAK`9`?)PPXn=7< ]}O&oyA+Innn]HѸς38AZ ڝ8_flRШ&11h%IP4,=%o+*mnI涅lQ":-;Bwz$ Ƽ*Ƞ$IZuz7aK4r*J1T)PeIΒXgĨ׀=X!R7"V$iM6z:Mx;WTv#!iq،bt!<)u吝In⒍yA+IjQ \9NJ4lJ*4z"X-R8^d9D Ƽ*̠ICg6a4fzLwQq5~v㐳#ɚc}8e6k>-~$/ihn)ǯIQR[U-1D-tl[M=طf13h%/h(FR/;3amޭ7?0FyuH|736 ZI}nqe O=yYKui_"ه"]̸G VZ$M  iHyVR&uEv=e]!vAj[ɮhDI1M*f>z [m8 K3I"2k4ѱ>,bݑVg/i1LNuo]MƱډ1^3h%q|VYi}G_IÚj,j zwvIrؘW2h%qsP[I9Oyh>v5mu *&I.N;CL6կ ZI}( T7qzk)3֧2k[n÷Y4K?ЧNO};y1Ƽz͠ԇFIUO|V6ʵF-P~v 8-oơh\8e m迡HI:e'ly9VRAϫf'FrkWm$>CJ-J}C910h%!Y՛ZCaAjG9R\AJbtA%URoNa6U>d>z;Ժh gϦv7P zgb*7Xe7TsK'V3}0zK]d>++{Ε7@@M?ҰH廙qR=T=q g#}0zK]d>zjWq؈W?A*b#!wa奨Ψ Y@>~.uJ&0aQ%L87YRosT iC7r 6k>c>z;G=fl ؆hU?"c/HU=?rRBv@ofW1^3h%YY3Z9Oܤ3 mQO狤^ T?_Khlxűlc^fJ#泪J+7b/ ٢PsPf;m{ ޙn8` zt?|V/i)glFdS4&- U(aB}j;BWU .HyA+Ϫ/-0ZPE:bQBc&O{/ "}\Y&Us`0)>Ud4 X.ՖmӅ6nt|^vۘ5ϥ.0UMRLTz6gJnIT7H`[H iY)njзA[n|Vf>+-ՐC \i Y?fIR/dy{=\u 8W+uA+ gUoR(*.-eyjQdFVU/{a9[I܇Z1h%u86EBV (fcg^P BE.ԝ6FHګ IDATN Z8.wʽ6o >c¯}?ҧ-jʪS:}'ҧjrh*vU8泪;Y ',gAB]6դVѽ 4FII3B߯MV^Df]B}{/p)^?Ki>XXXx9 I=ok60p4^w46 Z|V4&NFiQK8fQH-=W{u}X3?;Vg5ArZ-:%NOuZ<7uh'"]T8˸:{ R~>VmҮuUM-5'Z7 iB]ptORKp_~G2h:sR,mϰ883+jPfF@)2Nǁo.Q+)׏e? M~ɯGzH 7Zxozg׿E#52ϷC|V3'%zy_[B%s,qFV TA*NCၳ|KCci+=,gn#jb>2hYONy_^#'inGTTjm$#q_jhiy̽uIKRoZNcۛԍr濻9u Y_X߱_qV6Y5 \XTZ5"4]7\b38ɂnI9aiz؊*Hedju@1H RpRv[kv!`Oh-z zj!$[[?9~' !]#)# |Ooz _>Q흈?.<~ {m~;y)uMR8Bf0CqlȋT܆²IJoĺo>/?Lo6-{  FlÕ8=؆}؟o?Vm?MFy|+KDf 8tnس>O0;kz"Q($w¦ ,"?n\ tzm\[Ϥ Xގ:VT/pޚ#*;v3$-LGl3q^9Z8oZe>Y]jss,~ﲋb6zNZ)UYM74󷤼U}|">o$ >\_9=\8Px1 {.<\fX8.?si?߬?$?V>qZg;M޻>>>J7[&Ve%9T4(񛤠kM']s;c>=O]ÞꭷX ,7sWAaq*ç?|4˝߰CmR0U[w.+S'!mm,tR%~ Z uω})@>|BJ| & ^jC#F *$;lN/Tl/.h}ְ컹߼f 4y <`j\V ~o7\Eir 9Yr75bb:0wZޖطHj>@ I]C*mɟ00ut{Ѯ7Y8q; wT.ѶЙUä 3u"SM@_tsO+Xƭ7ccM=z@x&X8P7\1k{ߢ?gca|Eb8L!sǑ&+3q4H;DU0 YR,r1b 琝Q1Gq-lUƧc=cl&癱&۷5yRosq<+__ʽg6lI/Υ;թ*fב\i Mruy.ST8$[1'̟k,at?_4$ǹ4kjc4翗*ߦROl.[=tnV;X<=t~) @T,Žsljǣbvȭ{w.ŁX*u_]O5iL6YFbsؗZ+oUl'y8m14.6U}|}|v|(WWf.fYxQd0r6`u7 1i:$ɂVnr;I~ȦB>VXd쌗r6hu!{@뚍0ĺ .ٺML\U-ݞZ6`Q{ftB}6A;Otr(Ι\Yݳ[hAwX8^Χ;t!p>bXXOkV у#nul|-(LmXG~5Ajz!O7 <(kx2+v;ֽk/"Y1tAvtEAZ=aQITnG/zy6U}|Κ c:mr=^xB;8{^ASVϯgga=4F޷}8~.7|œC=KutE~tGE}R(KG=Z +ڴICmR0)$kI姰$^cap7}Gczꍓ| II+G-3h%èmBǰ$h߰(z=c)ӍX8C1?0i5?~,) 2^& m;?zYe`^Jb[wc[o{-?$rP1D+ܰ;*1=ۤ gx4;簯/}FSf>;Du8xQz6iJd_PvzsK.0))#/HZd HS`gL,~qVx~uscZ.cius=DߤCqGO 2:ݛElH?&Xyx1DŸHwFMЛ|A`'7CҰW A "*,c1%RoyV gU:ַ.:ga}׀g_$?e!9*˷8IJH,÷Xp;Xdφ +y8fbؘA+u,%6+{,pSV[a.~ IL 0m)? |lQ凁Wh']~HCl'H=#I9H~ƐQ̭YFvۤamPߡKy K 6_ |ƐqS{#, ~YY܍Yb._i(0 S$Lzx+ Z)6yh e(f_KR /?xΟ'LcHJW R/xhڿ~_3a4q.RϫW4v|Ջ׮%SRox> 7y xwE7Ikh$zJ & SrH }LTOiOΨc97rͯzϑzu-R0gC WARy?3LmRϡjF A:87m,|ڞLf7XARr u^.w[D}H(yR]NɅQ IDATC ZA!@wB65t?6, waaRxPEâ ),ӤV:FcqR;e;X(G[-RO?& ռI n [tU'3GY8tݣ?/yYvh,93}PV{X~#ۦ8'+Y8Mz3IP8DCs[&+WX's,쥵=N,l IҬXO4fj*H)М*wPr !q=BUtFi. 
yGH(iד$q⼖q FOqTx\&}3M;9i71rtѦΪ:t׷l=')i @J:mcvf(&fQcVLiVG~^ȜRpJw k;D .{0)uFx߅&H;ݤVۙbj* {i)nXCf{ֆ:V=Xs;5_e퓬 VH=;ۘWAsEӤs֠g,Yθ?D[3ѸF Npfy{)ACqPiJ'ZV')wPM7J n(J;b17ZE*6KE{HMk&stA >~.iX]dAbPil~i>{Q]Y{8;D"bj_?Na?;0ԍZL"k!3v.Ҥ^{$]uyat?Ztw?A>Gcq5ٔiL{=WeS6g=AaR,UHUچfg #8tX*c4Ov)-E/.ƹ"m/TؘWAsUֻ0?Y·Won_>|g~ ǁPޜQz_!,Fp!nG]0D+Ϩ/h[-~M]vHo~x#$?UqL~c/g'WszXؕj]'III 泪q\90^R[r#P5 s:_9 R[,:Y܆]&% Wu}(Ԃ**w;qAΒXrFXR;ޛ],i-G)ztwoc^fJ*WiT5.,W!7Yz&:L6D rgj}F0UTg< :~vg H-1@M / A [+VZb{ǣnO>ANsfo9Ҙc )ZuJU0N|2)=ݟ*vA+ǪJ}@=h-'GY8#V?s@-:Zh=RO|ޡY^ Wf s 2 q)i̧$jd)`lmom|?Y>kG䗓$0Ou`<~)CI~;%z>w?$:v~ (\|4s:O_HW87wK~. 74ZO\~$oj/ցiЍ$O)aH$N;S|u?IdJxyWF3z=|,e[Pop`#YeO+K$qV)ߓڝpcz|x1ɇbf˞)K)5Ѿ9`ϲVJp3j 6G[ç/կ\7lP&ZJxzrʌ7|_|}CWh525qoݗruMb ? 64(M=S)5U,ݘےrؿF CI~Eso̕;t>s/%y+ώ>F54q3f4Y?~;ٱ% t>i?sa}h΃)@Uܻ良 dJٜF,ܚ[ͧ\r7W5VL&yCY>ak唙`0w։Ϥ0 nGҝ)n̥,h Ʌ$5ܓ=)mƪϥAv_LYq$$Գړ,_o!K+T۟ꁧhTJ@V |n'ҭ &prmNL)=)&ni `wn]JXjV̔_NRj~镬幔f} =ߵz {lof:=oߜ_=6{\pxnsXi l\Ҋ1Ld$Oh xJ-;=2d3ѥi#C [mtdhb6v-F`w.e# k8Sa\&w?Q6Y7鷐9 XR6o:p4əLϥ]~6 ¸ .Plef2=̯m!Bi9 QϪfR. l}`=! HUx,O`m/u-sԳbwJaFhj+m+2IE}+c)oOr֔]_D<}9TQϪOYya>e7mc+2I3))[c?sԳ%'ٮP>Mf)zh XUuw嚓NCϡRϪԳbzK`@Z3ɕz$걯t{ysHyjԳ]Qϊxol +Ң=$}AS`siR)9ԁik<D&y!ݾ,V ct3ܭz;]Lwp2O~)h^{fOG-{a@$"Xǹ7hKw//w2O~),j^ꄂ ZOfj;f@xWKW38<}97R[-UO&`vl^wJt.Vtd>AS ŔYC;4źX).o.Dqv\JɅ.VIYRWg<}9'9M&+u#f:ͥ׵jTW<}9]op(2+2-׾,bZt=JJijhUHYZ0$j[X)i47RgdL'0~Ng#INh2\,Y)i2;]iYtfyszM=+N&9h̶zWSX)^RO4p{gڒjsyszM=+ԳUkvR }02c0䩔C.SI >[Y=e6 ǭH:Wm>Ƌޙ;a=NC-82ZDJpӷˆH_-F+ڴ+P/%9 0&~NogʼnBкүJݠՋsJʪ} `}P?Գbg6ə#hR ,GjP`&eN^O/&szI=+1t|Jz`ٔ7Kvh76v??<}9YE;Rf]??sR Ĭ+6B'qvհ-IڊNC%x6}nۓRyN$̀'f]1.}]5ɴ]x<}9t._x]"t|gE}R נ( X$l)3FaSf8yszgKJ=+O ]Lʒ]_+V #ɩ7a,\I f+{1= `}P?w=$/h:n/{'K?r 䁔c97:iSee{X_t'<z화aRfY<&+9)4Jfj_ł&Y is'zVg4ٞ2[過aͦԻ2 S8Ix8J ~Ng֨gϵR s+Y0OrR LLʬs)tBsWRzs`<^H[8}P?WޯޭK1=6䕴*=|Pj&L ť@Jۧ&s2YM+BԽr9)hJ)iMs>d>AР~}+B7̦\{2p )6lK ^IySx\8)Կ0>J}V nK}u*%:2{8XWRV|@Ll},$0IA)E7+tؚ2LJLJ XHD,x3Zs&Rԫu|qZ{l, +tڞ`T)Ecݚ+Ir>2J0خVD?L$SV7+B?̦UG$/Z;4Ϛw}g\K KSt )Z@lMy_JO[R ?ݥKܞH7xmr,%ԻRLXJgNw-$Њ&#;RI^?|$u gRV0ݙ2(I_ϧl3)C?I{uq.g|GStOWwn1eI$ymw_l=I5?O wvY$yLWw$ඳLS&p_JFhEgw?Lm ծ,'i5DJ m60khFʌs=6fSFg^x]a?J1Z0ٔG> c!t\w<%l*'v5R$Y 6,H {[ja԰­wFBf)JcZ@ͤ~Jݶ$WK!v詹"eCBh=-;|),@ݟfZ@Nr)zK 5$)3hpB+ f}AԮk&s e%]`c $KVLv>%ڧ`ٔ7uvh PB+<*q+&yJ3Zg6ə$GFR m>$[4lpˠbJ}J^' aV-v$2J>`I^w${5l`@4 В$R.Ej1ד<`wn>$[4OfZ-䏓<)`<w$h 0 %g `, (@KLrJ3(`-Ir)NMƢB@n$y:!M˫AIDAT[po&Z?ƢV@N$k 8{+ɏRj\m 86cQ` `cI1D0PO 1NhL%1Nϙ~ ]q#Ii w51D)f%o'f5nڙRJ+hs@q*Ƀ'd_'d&$9Fc Вǒ<q~(@K$d1DAMt;IaMwλ[0>;\Jqq~(@3^Jr1D0PMc4Ɖ9`y=nhs@Zr8Ihs@Z%$'Lj` kIi XnؑRYhs@ZR'0I'W Z=$8FB+XV@ͤWH'Iܯ)Mht$ۓ@tÚۄV@׽ I~5+_&8l!)a_Kyq&$Ii ZsIڗ '9\}"ǒHr:+6&ZCI8:%eIׯєU[?|ʌqR6$WvKh&M!vXV^־Lpw&ygI87&0MֺLp&hY> O'yT3t &ד|*+&n+ɛwRȺ_St 6?>r쪷ݟj6Z_Hf`3 b6BbIr&$O&9bn~%LlJAh;})vQW}3)!GKY^}\o`r c4ٌ>kYHm5Mlj90L, -hI;Fq~(@ΧWi иGMƉ9`M)ȾPoa ǒ$6~IDJv;IsטbIvU~8YDcQ@O$I_=>|~:}MJ(\&⩺\H3vshGs׍E6I%,V-g<3l;!$FF~7l_/ĔVj.'y(;I}| ɹ$Nul3&~on `syw ڱ?isSf5{j&%:mY_\6x~$g]a?l{>ɃxuKCϳ~n3Noz])G^?[g[L$Ohh,JghGU?~#kl~=ɫInK w~1ezƮ$ϓrޙo{ے<t/&Q?͔YSIr}Sjl-&*^}SBV y'6.ShfO-̂zpfǏִ:Z=zᑱ)' f~]WRjYjZ ^o~uFŞE,5>VG{7Sfm%ǹ_ogiURf}9|b_˭~/%D7rt{t3f=I>;r߷䱔0h{t[V\LJ*;F?t %D O&o]fRV#|56ԛ)aרY9@!YX$4vʝ]ytfL%y߱}?܄x~|#\J.&膷|jV2nl_eO)~y݄x5eO֟N?mb藷Ϥ%$ǒܗR,Vs#1t*{R 3ϥU')$RV!2iֱ}9sg߭Ike? 
Zp'Y!1SGaR}P?̴9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#93Inj&&`cQC.9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#9B+#93 Ivh&$?I 1&s:4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4Gh@sV4GhЎ$G\c]u |$O%9$kR^ycpSc:-Ճ8U|IԯBcHrq^G4f4@N$$nk-&y9GIJk<^Iv}ݿ\ОRcz3Ir&ycw뾪 B+|%eӣYe$ 'y69K>^??RbuC߻/r~(c#_~f̲mC?$Wk^='ɹI^H>n `9u`:Nӯex,tZH g.(Xzߩ$lkw j5_ߝ$'y8%znw|R}Sv6'yvU]8XXե,6P*5pBZ-Zaͩ,y4Sl]GqbX\̏dvvVFs}2Cq.U.K777777݆N')u>2ki%j@$7$)͍B,oqg:|7¸m)%~=;#}UVߤ0 >2[K) ,q&B$ʹ玑j ]|vnZͰj$M5dl$8ɟ5מs /0.OŔʭ`?K x&;Yp`.e՛czԏ;W}ԴOhRWsk!O\FƸ_|>Rcjc)U[^Kk$7S.\N$/%ٟ@JI>6<)TR$gN)TL$cZ,WXuFm9I>Rpw}׏_ֵ zYb}tb]!0.b璜Lr%ɵ>cMMDSgwJ)^M{)ArA}PZZZZZZZZZZZZZZZZZМM$75tDGWIENDB`aom-3.12.1/doc/dev_guide/filter_thr.png000066400000000000000000000312511477627663500177530ustar00rootroot00000000000000PNG  IHDR1j zTXtRaw profile type exifxڭm8D{$8AvTuOlUdY@d"pKZPZF^._|XfoI8W}//O޿k9d?_似ΧA;:O(w #K[yP99|W)\r\rϩ|>Ư 䏣c//}E!#?ϟϗΟ$߻۽5Q*oF=Ȑ_8\~nl[9gl-|'J= ܐJiγ_ib#^dI~εle90`ĖXyKNșKz۟xN\*r ?½SތM/yMrMlobAU'͍ 8_CLMp+?8gSJ{J0)@?MR" K.2A &H)9W &McVTi 2yV) 4844kQժ-hQs-UkV]e+VͬYr+M[mZmt ^{CALyNm9YeV[}-;odbmI=iqͷ\m՟Zz&R~}`1Dr9Q Ĥ$7GBc[*E9,v(TR#$ћ>-h- t?'msAUӘ>ucez);Gt,:}n޵ݨmMKQޝEդLɺ(=2 /6T剣MJFϞlNKϧ J~o skeene xvJ&vf^rF9mmUaԅ+ieNx=n8ie8'>(_q=*HT2n@ " f U9&pHtZU;vƺ6UGB~;h@NɻyJ43Dth4Flg`D+@WLQ ']nz< A]MsbrzWs3Q!/#G*mCܼsW;U 驅Fua)6np,{`OHM V5ƅ6 UXP۳4תjQ6\L[ {.D 0[E a*5 K٘GfA= V&a9O4seòXQrwo#Lk VݕۭBecŊ,gV0Ydi2ufF~%Ґ n]7Do֡=YHv''m#wPpИIכ4~;G \8E1VtvwNc B/%JvHߎ=%Gza_  WlSi$ =bqlyPT4y'b6)!کtJ%/6OtDF 󺩉r,^4 lKҕ# -.UuQPSgfB<€iacѰ& Rm`QEoOA,EjFk%I7m?,1gԔ7ۉ+2<4' __舑CzU@7`ڻ%*HRJ$.KPN9y2 Hp#S9FG;;Dl9Ktftܤ)Ŷ.,><% EKϲr(m%^Jq4UeϲuX2&2rɮ68ӵ6ZD_̈́"tB|s{JS/H5҄WJq+ eiCCPICC profilex}=H@_S"8dNEE EjVL.4$).kŪ "%/)=B4m tL%b&"^}EXf1+II{zYjb@@$aiOm}+*9I$~όqX,¬hjēQU)_xr⬕*qOpN_^:A$EH `#FN}_"B 0ṛ Zq/)_c|;N>Wz_ӟWZ.\Olʮ)}SV8}U88 =Hr 7 pHYs  tIME1SݳZ IDATxPw7EFX3Pυ6^r` :ő{+`Syp8Έ'>+-X@PPxGCvx,YRPP^677[VݳgO DGG;NstE].WNNVyyycպn:ٜjsss)eҥ|ʕQL*x MtTTl޲eb9tEQ]]]FjZ,PhZaqqo! 
B!@V+T*n7 ͧN(tFbXbq[n]bX,nhh@@omZv}3tvv](zZrlrW\n;R6l0\y<ټk.LnWեDdr9;`٤RiѽHfV zKb1EQ#[oE3EQIII$+&''Kґ24%H|'xdR*#:@B1@xu„ #:r2424 C'OjǏF]^=[|9MӅBA.a߯&'y<9d)FG* `4-| ..nZlŋO"dhrz?XpFhF fBr C24p4/Oj<dhdh՛@\ Bn ܢ( B)ݾIQ7}= BHo?4v!vxԞ}ݳ)Ӎ@><~츊wy`U?](=](F\@)H6vYʦ^5/qYn5z"G>9 C<+3|ʯYf\a s$0x${= ΍&L `eYYY;vDFF.Xa6d'OD]FX|ҥx[ZZSRR233ys)))l7JL&㡬ԁX.^(H.^800e˖X~i~tppPRW\h0쁁(bKJJ(qq DR__?88X^^.HWil(GnOe/^>CcA&]zu0MMM7oloow8III˖-}_LIIٰa%)JTyfXc˵w^rt:8,g>r0k֬h4Ek)))+VF-\ȑ#"hÆ 2= UUUܞ`vuu9frxbRYjjjT*.^?D7oJ'ODDDpuQkkk333˹Rkkk=GRiZZmoo/EQ'zX,doaaǏL&\ROP(LKJ8z(ӳo>_paͤx!NWYYYssFH$ .xt2\rUUUt:J })咒BQ__{nHd4;;;R?룢:r .d~!33S$ )A,s{ hBaww7~r\x#ҥKRUUE[__5tBBBjjjcccRR@ @Vwvvr[b}}=MH$O(L&ZV9?~ F`0@`NII |d贴4򺲲P(z{{kTz%NGtCCCggg||ƍ9۷og'H!wrΜ9Î(۫P(\ 6\[[ CBBNc뜫[ZZش{bWUUY/?5x޸q-[|asX ZT;$fHh:h˻MBBy}̙!u -رP(| #sh&[z}}}ffR+33s˖-) yD TKl6`ܞ- :;;M&X,vj]"JŖN l6[dh>Vs#3ga52 L&Hk\~ePP({=y$T*Z1..8Ju ժD"Y~}||n?wijjx<*[l#R4vU&Y,vJfffcc#⡣t:D.ycۑXcNq#1xw91Nן{q:gm@B{C*PKD r.++b%I$YYYl2].3YYY---.#;FߎT7odbx0N=:su*y!\.xOOsq+bطsD.kȨ000p2!CV'%%eggs>uU&M&Y>J4Rۥ2,99A".; ,P(L jmqζmۢr9QiiM͛P(>aq4M\!~V՜ ߟVI^pAVWUU'Qw^aAMMMVVV~~~JJ ۑv޽j*abccFr2hYndh@@БaG6a#6b#6b#6|fhnb)A^^?ȍ #7Zϫ~aޙfFr54ϧ+pj`yY0\C_v??1m4V3v ExڵƮk׮M6M vn:qnNh,YѷmnݺUWWg~I&EFFtp~ o؛/":T.9Ё/^hѸqzzznܸ?W}oqϟ'͞=;zj_ZiӦ4FF<<0Am/^x„ }wws"""|+7 N9C/0}tNjnn޳gϸq8,ѣfyݺuw{ 5 ޓO>aq筷Z|yttH$7Ԉ5ng#8q/ JG۱cOʺ7w\rϟUqv,33&饗^3f %xɤj9,ۯ=6@҈eee<7͛׈H~&X__F7nܳ>Ryyyˡ|| WSS:ځ:Hcρz>oEmӧOs'|⧸%K7dɒǟ={[[[u:Μ9mw9~8ύꫯRnc 65"EQ:* v ?"9?P믿7oޝ8qY,\?|GG%0%s˄i۷osۈsJ+ӈ܎l><қ5#8Ё駟&Mx8,3v>K;wD"?vuuqۈc4iO?s#rc5":uj; ߶/C?S~?( 9,n̙/z{{]?@#%D;w` 8an ;rlOF9sxCy f#"".^xw]Os3<oU(ܖ'@R"+,HWs8{4"1Ν;kwwml>nkgyD===F5GЁVǟ8q={0_~~㏍WWWw-q_|.]mwcss /m#Lͪ㏟91~z?>Fh4[FW^}D8P?P#rϟ?_V]5.\p/"Ž vFҥK̙O׿ ;݃wNIIz_& Geeƍ ƴ<^;d#~`0.]__xbжmۚ:;;[ZZl6`HJJX,B𘊉anj-#ɛa ,Yiȇ322GnV;m"q>䰑~ 틦iRr|7ZUVEFF^nG-ڱcq8K.Za <΁Y&55]qЏ"XLQTSSE"{9r`bssswڵi&{̙ .Z|رcNۑ?r ȭ=99ԉg\+Ǝ(5칩tk֬arz>??Bl욚T(HȚ ӹtR;(ba24<\.mjjڰa|˖-dΝ;Z]*++;;;bqLLL]]YS]]M&^A2'9994mmON{e M?:<v8fY{rڵk)ڰaÁjkkӣ(:z(7n\bŐ=߽rڴi߁dhu7ofm/X"..0ܹskXL֯z<*Z27Lݍz҅sY\:jIDATJ۞={rrrZZZQ#Orj5"JQTUU{ZLQM鯾jD/Џ9@{Z"\pv8!{f999f-[fgg{̣ac,W̙< QS[[[TTDQF9|00ݡB"wILLljj*))IMMEm/}ht:_upaÆիWKRXIQƍׯ_o0VkUU{%R֭[؈Z.1ݙ8> UT}haJJJٷbbbw9" cbbfϞjJD"ܽ{&ٽTx>t^w:4Mzv[cCa aaaLZTT޽{Ʌĉu:L&S555f7\~=EQ'N}}zꩰ 4K/]v駟&)x'=\ߎ7IQ3um۶%$$ .JmmmgΜd4iSO=jjj,Kfffbb"EQf͊iz}}}Z"loǏߺumۖ|;  0u|wWxBv- 2p `dLť #1Y(L dQBmҬVtasι\sO1v322@s[Ď;f̘=o<n24CQQQIIInݺ5>>fL&+< @QRR"J=*˅BeN#GP9poE6jz>// EQVuݺuf9##C>ZE9N?@HJ1@ Je CssSHZ5b@prY$24oxf]_$711d2vfJp/jrH D``x뭷y餤$w/~I,?|/24G|o!JڪWQܝ***z@ hkkh4adh~EEE)))QQQmmmWZ3335M]]~EV{t˗ssǓNQJڳg[o97 C˗//,,4}}}} Ϫjn8k>v6YF(:vӆϵd|fPq.| c ׉*+م'cَ(ie}줭-|s#Y #NəMY(i;!Q8^>?N%;;[KmN0 qe逸p>v&hL]W8|-51H-/qרϷ5٭H7s,M O2+FQyrE$&I'BRd[6UBH|YB$y3&J:ЗҒ{d71;Rv6ٞ%6ezFG[:ǪiW<Xu 3 +nU/kVN%N'rkp_?D8Zw1wg|Qt.h&|z~I:؟57 0?fڇd#qf&|ZfCyPs5Oj 9MGɹD82#>eoԇχӄ$YLGl oakxtzN,㥻p7lM^k0uLEj0CMn]Wr_HHs'??) 
{,ޟ~vWZG {ɷJsc|^Is ^'OlD6Iשq]È}v&>w{az:xh>:|-s)sjS$9>ρx_EY5Z>ْXߢ{,vN{3׎xܝN'?nh;h_+DΚS<^.D{֟=ۖ:$ w.~|>|bGxQyƶ9L{p|[~lˏMØ0YlwX|Ux; `IlHOg?JKCfkg*&ߖslpcVt7\4` nG^$$ I'B$}luQ:ɷ9I5阞mQxHI",2y=%SiM:ńP^kv>9Igq}IPґqS+aw$;tRWi[ɏ]PJ:5>}#E[ g~NnOWOZ8Zb3O嗩wI.S-Okh†d:etV9'æsgr^\WuG* {I?'G[t`^]G(9ϏVL+`DpWVHc5:e{j$ c8r+Z6NCO~Zi QXupg*978$Q,yp]XSl[r IYY9rے*ӕt]q-ץ p| o К|uyޒd}Vh{֟_r<պ ,'7xޛY`ge!O`S5$dLߖ ~.E+$ (jI_l?w^Ԩo68N$T`AK[sƋ餣ۋ˹H^-+yͲp{JKK?Bi\Xc/IKƼ|ǦAwrΓ]ؠ:PR~ק;KXKҮJGtsd'ubUN@@htleNV&!HȆ/e,/*k"q乖xL:wbjsV>a1ё|ZƮ|UIGM;ȴ؁>57ZJΓ>n:Q5&4 4w>K+}~IgɬHZ]W%L8[{9Urޗ% z㻣xZ S8^xWV֛3+On|8us|柳Ļ ˚Hr5'!s^xXU8 ܄%$U7[K] bΙY.;:tsgr+IvCt<GӅ`j1\;D[h0 t4PdGﯳMk-1{+L}G qMiaKk0 :]ÙLKZ:M |teFl]E|ZlGsL Lry.?tr(tvd/el^"k(MgvJ=y{(;76iPh5c b`F.L^wACNw1upe}$̹lp+Ngu Mn͖{p$spSvy8F9=>tlghʪH鈹q^\=|t1|%)7'(+ycfÛlr"=^o3MO&B*0F *y%?'['O`ov$ZK=]oỪZ!C=u; juxIa'I<;ez<&J9g~M:fu> G%a+M&"FG}'WG 'ʤ\ EPxle]z mh|IGM/1n/\<;?qz]ՖרwszwMzv? eC[D]W_Ͳy 1a87h@'':HX|ّJ.K:A6~N_(giKZk ,;?D¦=k|m{<g/(:'WtZQw4ѡ[wܒDLlgI:͜%b>GUUzhJ0?w48wʊ&ZqMØ0I˺Eu[r>,h")3呒;tvd̿m MVyi)K.KΗZVfd~4&KQgZ[^8?j/&S* ]\҅9ZK(_48oW$eJzudcH,ȚYڤVҹ<'Of/ܜ >&gjuvdV|/<P+q;\a:zMґ謑)K5 E{=% Zs`+4x,,9]:{$Ov_q Y:sQ0ɿ}Y9'$Az.%Ʊwc=`='tgYp{6pj3 f4b3 ?̪o\쿮:d%oWr1߯eXz% 8_F[%kN'V#LEPҹlW$H.蕏b;hؙ >XXktܤ.Қ<~ ׼^;T򘶬ݝ%DYNs`j<&?/tp&sdWOg'ƎlJ'f9[s43qN򤃕'ZJ!+}њϯ^{aa杆5)ުeϷ0v%1:W'!~bsI= f>hϙI&$۶$Agk}w&+v_6pQ{Xc39:CIfY($KqDP=ItdsEϚgrm,pmB֑+%Ŏ&XF&p x8uplu%r y_Pcx=9&9d<񜎐^/35ߛw}(;ǡ$N\u_њ4A$j],}:i))umrteHێ$q+)sqN)?o&D60Kύ.wΟ;47H_4-Iu0 ,6`CtDlK:8ANx tu%IfY.| Ykt:ɵCCfۧ qGѵ$]p{|ɘ# ux4z.v˚L53q?w:C8>ָQ?^8lke%CZ.H:5O6Nq܅& FPʿ.| ݑ Im.|W' A/"Xin*orwMԙ$'طuy_ep~L꘭Z\8?H2eSDryQaΙYvϿ9Ft.yՙ0* ^,>NwCfULf-j_bc>ty6fibPL (h?י?4Q"7~WG+k$c$nc:h}֛+%R6'x!~?(}yϦhO|$j?wau\YʀZ>nvC48V&t)1UfioC6Q ?O~̭*~ec]c2kn3@P @ϾTΊVB|{!~yoV^7B{!?zW嵋<B :mrVT|'g63f)gŸp,O:$ZN#_yF9+Eլ;f_Cg6Ѥ.N)gŸ3@P @u׿]6}95NZ@D@]+!thuKtJСԵB/)BP )B@]+@DuP_S" B} N:4V%:%BZ!ԗCkP_S" B} S" V%N:4Z@D@]+!thuKtJСԵB/)BP )B@]+@DuP_S" V/)BP @D@]+!thuKtJСԵB/tJСP )B@]+ԗCkP_S" B} N:4V%:%BZ!ԗ:%qo\u〘+oՕ_ =/'~uMob_wfWZ@]+$)կM}!k*W̘YywVӽz|a}׿ie!<&}!n}_ا:4Z!NɨH(>p'0}a](CkP_SD6\?{_00$h':/sa0pk&k=yy 0"0!~u]{%<.?|9O7FE}[n&\G@]+$:%&^՛ cρ'*ބWKÜ{a$ +7Շ{Yzm~*&ъ 0p6 M5k 7$nm |o^H.?cx9za +NkQ 1yJ \{}"raoMߐ `{K"pɿK9C-!?rkzG/A5$2|<^3$:C"3\.pG oY՛$#V=죰oe!P @5Nɨ0d څ˃ ppVc.Gڦ80$ٲh>̓_^a~w猍KxHB2Gi/q..FJk@dT'9 I0/,Qk`H n0`xlU%(, #ſ^EA҄]Hs01$gGa!FkP_蔌?c9sQsautE|D\}opVM s9j%#%{ pC ɻtQ䟡qaH~stQp[x VH:tJFM0$[e3$V>/^1ah{uE3e0/ ɿ|4"^#_%VxYnHڅ{HW ss^xl>CHQ?cH OQsm%aJk@dT@o*0/^L7]P#$}a޼r0.$zKqa8 # yaD]:ۆI7D7ܗ00<&A. 
]ؾGC0$-$ C3$˞d @D@]KtJСԵB/)BP istJ8C@P @nE5^D:4;q[ѩ$kZO8S"Di3&Oń\E ( X.z5nQC.P[@P/%Fӱ\5:;%xի+ux^_l}{%ckЧf<^ CFήƱjRE (ԗFaj|?Ƽ)~x{zUo|lD^w_[yVzm=`/.CӒ͟y$xP_0j?/Ӑ(G`H}1y0<.<&'.ki?aܟyۇ_||xW;f Bkz~MfJa0fXASk)dWoݽ¾7݆dž[o/ M::O<[|l6C4 F$pI批pys6߬0/V ۆ}a gܻ}L8>$C0<>C9\7XXpTthf}yB=-B} @Jaᏹ:%#$~nO~7a.-r!jta$^Hą]X\$ '|ÿca$ax_}ox|k}_cBҮbEA/~FK";95KQ:%BMWL 92B} @pYaMD1١ oFK+@P%Yj:%BMX$V,VKVWNCV^U#1 @!ԗLl0\;5d[eYhcX$P_Lc1;4C}B} 0͋&4dpÂ![ѩ8 @!ԗSoNCU8]՘X @!ԗGX7;E4d_jlFK+"AP%֖]PDDXd]5Nd}(.AP%ش9KD2adj<]#Y*-NB} 06.I5d8  mUKmwN:4ѓ'်V/HpI߅^ʧ!+ЄBa$qNS B/Fpj,)BftWck5NWc{5(fAP%uJСf| #d}#uK;vλtJС!mYaYߜV%E&ߠS"E]U8= kR} 0TX7N:4SH=ˠU\K8GcPDK˃ev k᱾uJС%!9I7U.uP_0t=GDQ&K[qF @!ԗ^eIDQCco K'p_CVԖ  %Kj <]:%BЌ1GZpX5pַ0H `fiM/~k?~UOz6BF8F1+g_:f_{嗿zlF7kq@D1nYLI @!ԗ\Z]7v/noM~^k+wՇ+;>ރOVU~nc䑯<{|dʆW~^/ӫ:f?N 0\!thƉpypL.AP%Om/}g_s+f]UG'mi#dIRpk1'{ҨrsO;X>B:kQkB ?о\d ( pi; ]8Mj.럋q,8O e} boXASgo9[]8]2mY8}3#1#,umsfׄ5~ ^f ( py-$|{crou7`kL =Jn{>߭26>{_n-QաY9\3>ʼi.by<5C.H !L`tar ܓR,<4œ\)G?.ag{5ujT Ǒq?Ty+kvQFnu[u @!$1 '_h.8]oK;m[skΩDΛ+Qz$n›o'$>CbGI^Q.jX<ey*.APH C1W& {8F/x,Ɔڼ\f tiY'1x*&%H @FL۾|Ą>\yw>;N0hb4#Wvs }KA!o @!8rm˂T Ds]׍ B|_\wMehzh/Gn~9ĄOhUp´4ajcYߢ!K$ W҄Qgk F"fh#g]5Kb;ͷ#\'̼UBH6+8.Ug0zNj<_&| PSHo/)t2:m_ֿ˜, #OO-۬/ǭ)|5񱕘ܓ \euv$XR @РBc…yVsf%1~8bs;[ 7ׅ>Zk~QLv^&åqqp{A,&Nkzb,$6%IJ,]ymK6ߺ`bdLv٬E11ئY?x>5O$9 @.uhQK@m;6s5(V u̹wxV:ĄZ> %&OǶz IF¨#cy%IV ?۴~[xCB@!(lIqD&/.B+b1'%Ou9_'C-ݣA!EpW̘y9ĸ/>i~ť36z#t2%Bhs~TKɺ<xpΘ+&lKD]wrxָm{ms 5\7$4[c2p~9*ȥ5fdMI 1rǎ\v/I?篻bFg%vܽB^ \6aV#&j%6l+IV$ѝ.G!rϓ^#\QKf^ 0JO; [& !4F8O;ͼ[fuGvs\qyo8L:M.FYaVDP S( cR\WM/͓wk!`K/9.KvĜsɷ֘k8 oYkt,I$?l;]KdMŎ9? T>/Is u1x$><B}GVOk#iΝ $'?ӿA֕_ⶳFu!{[O61KbQ<¯r%4x*;eIz.?;=#sb5㖛o't9{?l&Y8uPJf]"( U0&_# #WNCY߹pooy+^J0>r !4F fwzau0KFW~ӦO"3)\~.e W%BH|3a9YD`H҅~;IuUm‰;/>&$%<*ˉ,ur1$@2Q 宮ߘuU w1-S?1S&.>c^{] @`IkfgG- JLڭ,,ls8naq%aУq1ءA!H5&LyWr쫿sNʕ~{֟]1ZZP`d'G{] @ !4F.;˝o]mWoq|ǟCzhmhfN4X몓rBh4), @^uW̘ykU>ijCqIOWLif7}/L}3][o^B@B´=tpGi?֗]{*.#}G?o;}Ō?:5L634kk?ōׅo |!bʎO|.~)ǫhzޗ;fm !Yc$$¯;M {fκ]5_1mm.ki9:+Āhi'ӦOΌ3vO},.[ +wmOǶ'ud}ӔvDߺxk|}18'&+yJ|=Iŧc9eY'=BȎp @`"ɽyOܢ.X]˴ -_ɻ%k--SIrog7u8y%gis֦dvtNƓ[ !FQcda,t~&z*vX;u!@pEL#V|-D0o{<^V䶭qs6$Wt&%P Uqq*y'|mkL=oIDs&n& ƆKnqPoWeY `u;;?H2.::Dzo[IC#Kx/)9989ĿWfȇ.s$ whP1n#2  03xlvv]./6B%Gߐ 8zs{nҩmYTea~Zsn_4z¨4s pYNސ??.&ϳ'Xr{/ՠb5FeAyFЄe~L^܀=cey& A5M|nӠb7FV2EQ (sBhs&g .$y ϝ6*$dkP1#aʁ0B4bNlÇeӖB,MKdɼp]1Yֿ0 pr.y1//*!!1إF`؅rTlϗB¼}[l7;}#HL4F=zxт WN(a`"D0RN(%{٩~7f:BE<Q?B*0҆QƫL8 NC@@P+#<|)?hGC{*%^G y:^Em~C}魃u{~N`dw<i2tJOo{ݸQO?;ee;^oں3t&Amܲ `{Os8Jz/lTl۴Y׺V!  ` Jׅ^z"i_Еt|¬v9.4'_Tzo{!6'0"i_:գN)qU j*VTEȠjhjShwy(F{f^ۚ~lbXeK]1ٯT*{㯜ztگcWm3m Zޞz:v}揷`o2X'˘5LyPk'=ELEt.+hL>s)e^k{o͟e5;31F9 YADq}}4kl[% z>y==/ *;`N-Qٻ6rD}%*жtWV\ ԐUc{W=MvbEő)+~V)dٱIųOoj`kbTŎAiĩ3m]_j>śL?꡶bDAm›kS37R[̾m*cPko>AqW5]L>о0Z/ ` (d8e>꛵ʤMOLada6*4=4.ZyD1>|h?i mH梯'ہlNeSE"v/ y$Ql͜c:rS=G ?1ikM0۩ (c/VLm`+kč2+V|+V,)V;.}5iS4R2H?^ } 6t+o{Ɛ4הi4TP}7ُ&8ѶkD0T`@"4@T!{xb J/vX^^ơmn3xTex^e*.}~3NfA0ےL12a\1Q€/%:>Kd8؟a2MףHwhm 1Z,@¤ @;@4gL8# `bT2b&fϣu"f̾+`R.n3OSMk\c`@TyA j&[g*ִ{ƚ޴+hi}2+mcTSjMc'`ACZ٫ ^ܮD>=HP;?CiZ4"un PO oS ^c @3nD#2 Fb A^V`IԓE'P^P(5XjiO0F^;5Ke)kϤK>$cjщ%JCb} 4=T\iS]"<" dKy(T`_1z|/}=ZeJ31V I heɼwc*ZwTyoC @ujTa 8s5: 47Y.r&L+o10L\(,X5hL%Ŝ3s64XSk 3n}Rז @{cSrAQpcΙ Xdq֩6'ѽ|jbwScg{ -k쮒0YnZl 5M0CgB7(]2A-Lfk2y%3OkZ,&3 )!_ӾL:gڡyt=q mW5x,t^mGAH}z 6pM0rWkq @hI8ـ{=mO|<&M$sƊڕ$dzߎQZck`?ۚ'^w:5IT&_پP|+ūjQz :f1H&7v&}Q;>޷ۇ7CgFYIRAJ*c,O3`@+A_k~m02X&&Ng˾[U9MbnDlbHzT|=__'(`!~ZY~&UTŒ.X^~vuO0QK?L?POodwd%2`@xZTFTnRDȨV1#$P~&WOmib{wMqs?v/eAn0(?ˀ)d ! @H}L@ @z BG#0N0!Mc11QT۔ І\+uibbV40ct<Ԧ;>Ы^ݺm_?suFj?|RF#w0-/Mr*Ú4Ju|Ywk6t5pcQ"##cGIcY_haƾ]C%K#zF,ӴBcYѽ10Y#Y^4_iVa;MxK'1yA3cnyf꡶j'#SNFOp7i0xD0H#y+oe3AF,kTƧ}! 
Fx:;?52wy:`0hwat=VLj^ˈ!v ;Vh?WV*{.buƑ`EG~ harj'מg~P[Uki׺Pj~O +0A 'lWky'9$41Z>n0R 2==zxx”nIuaŤ&64toچLž8WMlgXZqzӃ1 83j݊nvbDNdۗI*#0XIxŊѵ}IId f==}gv%J1& @35L2>b{Ұ^z?YV]* _\hڤk`]ͣ҃]O:|TҔ!+1bB)0`@[+OcRTLX%&u6LCOŧ&~5R\k5Jz-#Oq\''7rd~=3M>O&S<#ǜ?Wllb蠁(i_ML6z/MmtsOУwY8u?m}m>+"Ŭ㺥kզfBdD0H.F 8tqgگM^m\K`4Yt*6uf]Ov5S7@&cR2Pƚ,;ֵˀ2X񪉑*U1 A;^2f=c1h2 ZoRهA jYrf=G,/ ?{sgfAD)#:MOv <(;Oӷ^s%̩)`Iӵ]] vlK:`g:?-O+V2Tjdg曉ebn;CrXʼQEe9V;74{Xmejv3 s4Y2!Ug942~,0vs nGA>kV:V|s&_Œm/1' FJ+ @h!N-aUM})UW42L1b*/M2djzh/3Ni51tnz_#&KflLGf22c*,Ca`b֤7V{S<{;,WPiX4ԅ%' # @h9pI9Pxd.QL@h؞+Θj vȺܠ!Dr o2%â 0P mNjeRTljU\*CN1VjM2XgJ}e\22N,3[4jڥߔ˴4LS{4,6QʹxU[gN @h#m gPz2k^4XODOO75yʩ mʙ'vʽ_$H ΣizdtӏK0! 479KxXӂ](UliL7Ŝ2cŹKM,mUh}y֣mcЎ,^}m08>i48lmFTuO`:0ߏvo>?mWyts @h5TOCBO1 4'E>TJZJHYv}d>bB+Ǯ'42NQJS~ 462T`OuO`UĻ=SRMgèI2 O `@x먥Rd@}GOD06 p?%Ty:#O%PQT#bnD(5JzT`J-xj{Z!k~ω17`"V( Úa7#bnD(J+O+8Crq?D$NCP#v78dNp17`"BdR`hmN#bnD(\fRO~[ʺzNsr>w~P;RXTt}q{;*.ީh_lqz9\NEEJ E H})/)y{?ؾui~Ύ*:" @twKҸWѽc|=oN:ӝf;mw%#O}YaU} (;;%:,ko쒂2pFpG8 _1ei\F`'i9gjqb^,X\2Uf.#=vUu=dOS17`"`dC)pUyqGTTz~5?(;/*NY$/;c:vXeO(դƹ%G ,$UNFt+?5>'W|s)E)Gv¢57Q_nv[7fGT׶ŋݾ;wBpΪM눘0J`DOy:5?4} PXTtD%|Pwe^IvIc.0Bzvo x\ _yxs B(T9xKʏ f/\E^~׹_DNyy n?֩|~te^s6+3s߿pC(Jcova`"`$-JN{"6;u~#QBQ{w,-=ҩCZ|{iZw"bnD(HZw(xm=BKot;v.; tfddoCEQ+&Nt+KKN7e#P{xsf֣C(?:¼0PTwj 7K\(mMD`D 09aЃf]wEVk7lvwv饧[ e]~_ulV8a`"D0"j ۏHYyşYC(ڱ瀛} hn7#L e=|5nג]-|io-" @(TR]~<5 lnV@2nNV֗-l/t:B U)pE/b2yM2=aഝ=7@uOwNieO#D`$HJ8e|P5vѣZwg)B:x! @ @Fb2 ɨw۲Uk庍nCK/@bDp} kzZJ  B# tZ0RN͜##zWuoHOaL @D b>iZiTJ @D /DgdiR`wa"p-mu^@[lYY?B:ok͓[kwj04pڶʀX)>5nJƞ?=o$T[^ 3]/v=[dDo{]wݾ~t<}nG›݉Sg־gM`-^>=v//wZM۵}wʕs]6IDATX~OomlY%ApQ6=o1Kh M-N`rN&C˞i6m[Wٶ|̓)qܞ[ѽ[ҩ;mֵ?ǝmnUnNn^u:2km fSY=ܛƏ_?(D %hJ)pJߝjڣTjSC}C;<N͑4l?lN?}뼧̻!zjZSl)ӡĻ Sj 6?o1Kh*M)(3'_pϹx{o??G5̘nv2֕6{#ǜϣ(3h~e"#FufZޔj;ZYFehSiYe2*t폲d䍿rjgkq~{:NZce.J( zm?}&f]1ՠQ:??z_d_|yg,<2MI7n|fg ʴqY~9j_?c{W<;.׺0yCkGm0ی%j?}UAP~h* @0҉<aՙ&CKƚ=Mf^K2ѧ)9;[N6Yjj-:cv#M&Y&^--oS&4-^ 1Ғ92 .{0IFkoنW}^@i k}^LYA#GWydJQ&*TFQ1Z&]?>^p[AЖMeR橣G{|O^ghFc2eX3d*2XOLa@sQOTYd(U娶$Kx21&P+Cd}2Lcϭ2d vFMG5gT9QTԔ/[?XkJ{O$Q+0XPd%kRٮ ^vYyh45e &)P|255몫ڞ2qӶUlϣt9W4! @tD{KCi6T^L6RZ @f @iyg#snC4fa,c\&h @Tce9%SSTߺ15|?Id&?3311[XpʀӺ5]p2ZT->cW%ZNfLTNC_E✷tD -iH)pZ2'LxA+6,B Kg4G0O},7G> E3(Ic @}NA2Q~e2\325e)CNs&#Of\KsO24?e&k6l8qmcaͩtb! @hnR zИMS3E*edxiS2猁f 2{D_3͘{ZFo^sdUzYx8 P,5}LÖ! @&R2dr)OolCkȄӼpKI(sOsZYGy2d)N)Q1ZT۴=C-Όīۃsh>k bl4J5OS @ u|>1d)[NYm2.6|6`~ӈC}ݩVƈ浌4ͣA2 ,DB-^gрOghjc aI iAм}kUJl7! @(2zS")Viۃg>e*Ɉ7d*#d=ؒA=9u׌dk 9 86;/25X55QشϞK4}o>2Cc\MOOl?cqhk=>Je.*Pƣ)O ]eأoWf /32d04q2 e|G;dZn46dکWK?>)Q8㬳uf4be=چ=^S6lWEmG8Vm,ܿ47g;yi0PdڙGe`'f1@4! @(1ʩVR2e*Oك*QV%745Xi, @0JT=,>42/S4B\/̿=}XtIoJdt>6oS/' ;H朙1㏪PwMeq?ߤk'3v6mϫzgο f*wĉ3:˝p~ ѣδKT}.s{tv5umyPj{X>}>id>i: B\/z e}sO+|S2ʩ~\*2F<;O.bQ{{lv|$9 `S~ }6;/ffZ?1`!PMxuToJz7n\gͺ>bY ovw9p]24?}7kK0%jigR5x;ʀؼKd$2Ub{~i`\TƠ];}ޑ'.w .,3 ۨ+'MeԴ5uhۉ _WiҽӦ-m=|5Ge)jzȑ~c`>:ZnٷR4B\/Qgz:՛s29i*Uz*Vѽ_._<~;2z- @e?6TۜF4qz٩.ׯ @\C @R,u߸v-g;wVtV*7ix?PVm7eyvYeQe O몫jyf}oiږJVw_En^mG&i=Yg?mƦˠu CVcy{9 BQQ)QFݠLS柌;owAchgez_{”-bQYIE}YWV*vjq&CK>(o4.X=L;i˂uAOݻvD17-OrKSސ$>&hbgxOn^%+5ex1d"цn-ej A(N%NEE1k+V1T`Z pSRW* ttW{:~i*Ok7OYH4t䆦M9iqs6 `}: $hLW}%c6fdg 4R"cgOiEKNL~ @a`20hTg%/STڎP#&D$cdL 㝳a0ZdUk ]1/+ @_1k-BJz}ro٦`k=*1\Ц:&[s_9?ĶWvYYD=\n?O BnhRMuCFOFm.Ppl$7Tdkhp @a`FJ-Qe??ej 7 53*h"Gwfס׶(N gjS>e2,Fߢ[WF.z4MyPl4Q5eӈ2dUv>1k$`-L;{ 4k[+9sQzFmk;NsǍ3@*\kLkȓQB =`M[w־SNFMƜ7j~ei9ج1g#ʈگ Tb7;c1(C#"sCiZOXƗ X9-gfOaKvS7ڐ^jg~:4ZڎG%-P ;{Kwچ7}Gf ]/6o'N=}mKԾy1"RdM5ιxoߝfacԋm=\Fߙ1;`pm;s' ;? 
xk{ڮYԤ鳹lQs)SB;eOݬLwӼyM*35؊y| @qԼ)I3xe24r̺N$n+PDŽ @ @eY7d56lPe{ZKh{٤̺N2T+Cj庍1KU;wϳu۵ǟ3ԴWNuQބ)G9^-()0xTl̴be#Ff]n{?3muڤifM۴ug4-cbJvul39 @K%{o=Gk|-u9o}:c֧y2_e-V;n?͞G8k`ߔS&^0K.o~{w-_OKp1Ç;oŝqY~,XP3#G ss}Oy0DA.+7 fޗ Ծo8}2 53Ak+S=PO&2eRɄvf2ev]ڬmt^uGkߴ~mG/MƘnsdv|vݱ@OԾjj֯PƤQ?}Wm>jژڮ*qC;7n?ޗ4aGd$Ժ:7ɞz_Eߧ]-ө!~mŊڌ'˯Oۗ1hPPoO&ڢL?M6Y'^9b  Bz7%2d\t2dNc%ڇ|v51=6h:ccP2dJyQ֡_ *Z2eSF @-kg2j(#RxA=c(Pm PԴ֯Ol{>:2lW `'NyfL5cʒ3(.̺dޗٗ]}eM3&ayQ&؜9I3 O{lYYu<2ߏ$*4LF @ @0nJdzi;F چVc @S<2 &QF2H{'ޠHS֠BmӴ`鮙Wf clTƛLCcYmP`߃ڗd&jJy55R= -sevd v`nh0R2bkL0uʨSVܽӦU\2 <(jg|H􋵌2U,XHabb"vS8%nL6!+'JvơٞX0#k*ϴIp @e34:~,>1-6茷`2>@Ϟf}?'sLx;.M}IO+ D}Ҷ>e,j+C3 &0J @0 nJtMɧo26^sl4c) y~ēS9&C0h*;.Vw2*=GL֢}|T02`|&PvCr2ڌ*U[/B㮫3 )PmD%M /eܱ˔͈-aP2#e43 zJi4K^0QwږJg駌BnjPf]jʖ=T{ߝ%( @3x}=i0nsLba`dhi[yWF1A1UB+#>Pof$\l:TkFmhڡuA=Nͯ 2%TfJ{g2u Q(_vڨ6h&cOr_C4_yԆ:~?Ġ o@( b a @ @ch\5>(LfFѕI,;ei@gFM}XS{T\_55`Qڶڠk2gȤd` @0 enJdȐQy5J= `]Pl1XhIk!Vۖ$.;O0&Y5Tªأjc.*Δ \+NPۨ Gm >Iy6i9?g>~+kQyͶd3GBJ1A4ڥccc?{D4rd'CPf$˘&{/˅e ɗFC_&c}V`QlEREdTa2\kD @dԫr[j;WLT\kD @c/S<X)A @0z ܔ  B\/0Z pS B\kD @0"74arE\/D!M B`"K! @0")A! @a`"  B\kD @0"74arE\/D!M B`"K! @0")A! @a`"  B\kD @0"M B`"K!nh0D%pS B!M BarE@! @0"74arE!nh0Z^7%qC B\/D!M BarE%pS B\kD(ھ{hnVf\kdeeٱ,[ڥ_Bp~ΝwAh}0/a@;| fQu#O#aKAhi<}~b" (Yu-_w/̙Z)S!ty 2>hQ5ACm ia]wNܚ'mFFƑ:o- hz2>hQ5mֵO^vg #?Ή٩~ Dft*-qԙ({xG3Bpѯ߿c z>{%~ǞZ0褗ŐТ}q= lNvoc;ts>ΉT?irr6[Bs_x' BF޸e;A-?[ZCtru{0P~ (/o{H;/ޢtФ!N dt)+` jQu\ս!:mW^2|VL e]}脠\6E3]5DcD">UA٢BQ{w|ꄣl_ZV^g~oQإѬ krg_3; EG,+d큝f6ϟ߅{mB63@HQ0-YU.e]yF/`(j 'pSٮ?Y(̚8u{ !9=(2ڻnAa*lFFBQҖ ܼ}9svlUEaVyE#E: l1zJJ)ħ=zSEIwWyYYهz_Jmܲ\Z13@+(*D.JwbnNnܑ2m=C( a=asr6Z/0j”n߁''\H {ʩ#Di'g48mu2ȿ];t8}DŽ|?Fa),jW;\B dow-/JWy7;;GeLpzz{֋D(wganvlneIg=UEeaQ6,-j_,ol:wE(&}'=3;7B?ww FitT|tKot5 ^lfF/.Y^Hvvx4:_]Faÿq}#4HSegO_?+'gKSrr]JZm^$775(Gk.]Tw7J0x觘(7l-';0_٥xի?@Bak+Vexizɦ;F)y-PEҾDOVn~?U_:{(2 7¨g.Ϙ{ %&1eʧک(YY <G)(UV=һOɨ.(K{u}>DQxθqXBaw<9ڽ}wF<.AL@ ߨ[^$6ލ?- QJ;seeeh >g=Zѱ=p1P*_kNծ6Z(//C̎7 8\ѣràPN`g =P}.nw65{0~fdd8?͉#FxvS%gWFrKK8crjn^q^PKI9HiY-i$̗2eg-j_:wf-M+ʅw9@~?}7&M{iSgff~}^S^Yy998p̳ڹb5kK;wމիKwOgYgzgg9##5{S8$''oiNn߾z K{݂{V^Tg46\iw/]'NuvVz4[L^sz9]98gWxQ$ ?_PٜE?RTtuKow|{Uw+,R*{iw]LwˎKV瑼1SKJnޫzVnZmOq_9GBYYG:t߬b=ڣױ˼rz@\]IENDB`aom-3.12.1/doc/dev_guide/gf_group.png000066400000000000000000003550721477627663500174330ustar00rootroot00000000000000PNG  IHDRpVד pHYs.#.#x?v cHRMz%u0`:o_FIDATxwEևOu99aC"wuM9>*bV@9oChf3թS 1(((<6l0o޼/Mޒt%r-999G^ϒ%KK._>`p8_nn.+D`($H,A$I/gAD}ytQFqw2$u*_N_9} \Zׯp8 +77|}ٮ-֭STA̝;w[n5ou{?!iӦظq#B---e֬Y#mW+ "bBWFWj4x &X֮UUU׿233 Cll)S,XpCY /5 G7oA*$?7nh4 O O> ` /_z{W^s5˗/6mdlƍ۸qc(;fs~~ᅬ- ;322 CLL\bŊC}qSNEm[[۔)Sn?+++66kD"`)S]"##GUUUW=<@߾}-[ֵEkflٲ^{JOOIII?'ZAAAUP85kL]},piii:gyQIk׮5LPTTtW~:<:u9sd|EDڷo_^o6i:ujqqx7k=z!ʬ\j5Ba}Dz 99yٳgϖ/h4VUU.a̙)))!7Yg5be~箦+bѲO?t[v-L0RW%uV=4f9''vkggg!jnj Ag-I[oEQ3fΜ9VZZ*8c ?~j+/X 1c~텅pKKP+((,9pD"o`H8oe9믿@llTUU;#3YVTT[n7n>j{9?䓏G2WVղhGG͛].׫*o%xxGW\E=VUR;пҒ?sS>$]t>㸁ƍ,|rprs=spI|ͼyo{ꩧ~_~b&K/t…sΝ;w?~a !!_2d$I?3ǎ+ /L4O?yّ9sKP8Fs ljjw / <fC=d0X~^hٺO0aʕ]OȊ-99yذa_ /Weg;;;;;;SRR{t/9|py!c=&?^z]kX|m۶o[n6lvܧz$IRc_~OcYvݣ?== !$IJJJye eS˲* cLĠA*X,VU$:;;/_A6??O>999<M](bw(IRfffrrrז;\jnjjڶm,'şLBB6Wwi&Yj"\[[Ep@H6 !$1]?ᄏ$I/|ƌs̑[k.QqVh4˭Y>O?+VL(,z{bYsw%dRRR,Xtr4hnojjjhhBAAAA }1 P]]}_/;n}a#O_XO1!$Ъ{($IvW];wΜ9OzIC@Ӵų>w; 8_>ҳ A>۶?_iggs'[v$p8v!{>r?͛7쓢b\> )))iii|s e,FXVr'z*br 96"g$9jԨ%K̟?{R ]å>, d dcaw'?@4EaIX,qQQ ]"IbjuWk.(̙('%IF>]SO?a/0aBzzyMz( ɏ?_Mh貳#seI*.=~O)>?Lyy>uYJIMm˿[QU&L*ظ Y(yO` BlG"dW FAAAA g W\q?e˖_|{ٿ=x vxm۶@bb"mCUVVY뭫#Bv!HJJڹs'qn^kܕ`pTpO?/|뭷vm_wvW̟?3gĉ?qd ;vg?ܹsg͚ug_@>CR2mv%tmx{8 ollٳgBٳgkݽu^xЭ${֯_ߵ_۷p̚5Kg?cǎ}nݺ뒢zH$vm?|~4ߺus9~b]"!!⭽ F~;[TT$/h۴i9쳻hСCmllo8{@2Æ [pa2>`uf=cv[zz;2+dGWAAA@0*((6mmm:.66Q]CPiii[[N+((s  
555Y|>͛}>_zz#HZZZc1$%%۷ku<p׽ $I&&&  577 aYEV_frVZЀ1NLLd5&%%u-xOnof֬Ys׮]֖4dȐcvQPPP9p*++ Կ?a;P]]]AAѣ,Y=,g/袮` ʐZAAA~k׮]lY^cY?ŵ,q=`41E*(((r<߻YVZ+|u!.:Ix;)Mp@z/v;D\\NzZ4Ș1c,YOV0;p8(((((((()GLZs DQlnn6lhT]AAAAAAt_~1bDvvvLL gu\6l())4L<-c|lN)$IRf }F7o޼m۶la!BOы[ljjjX$iFRvS$N An;qszRz# =H'889u"q+"##C)'^ΪTdr$Iy^tSTplF 5caVEkEbŊe˖$0Lvvvqqqnnn 'EfKKKnݺk.xQAAAAAAAdѢEW۷a3Bvt:[[[NXJAAAAAAAGdF3gUPPPPPPPPPq UPPPPPPPPPP䬂"g9pC)M!F9F1* "gxwWXvh44o[aaWrW_}u̘1wqDZFii'|yfZO>s?~rz%gΜY[[ 6$I^k>뮻 =Qy䑗_~9n)wgiZY 2|nɓ'/[lׯ_f͵^Ί#ajp>SO=oUV_~Æ 6lxi~|Mf)((((((iLEEիTyczzaÚ.]駟>#.]c?3gN⤤[o矿 Һ(nB7Z>Ζ &N;i ˲$YVy~ϛJW^yQ䫯:k֬+Wvy5kVw .I .1cٳk:tJ _|… :묉';rM6x#F6ly't׸sϬYjjj6:nn+V\9k֬k~'cǎ:t;o<(((((( -IIIp׮[e \~aזH$RPP˗/ϝ;`0 0@3g ;e0B/>X%IHOOWTr^z)wޑ})`9uuU[]]t:]uu5?<HLL:J?QPPPPPkXgOc~a裏Ǝ;a„oyܹ ,e. Pԟ'SNݸqoV`0RVjj4٩!~zwiiw~i_MMM/^{5y/T_AFh4K馛6m$BB+7(]EAAAAA/;{zs7LzkÆ W^zᆱV>|pGUnccc`„ /_gxg'%%rSFs666vŝ$i„ GgBoƘ1c_oH$r7I&SLyovNw矟+ȧ:a„xnOgΜ"gNfϞ=k֬-[^zɒ%uuu ,Xj7|3a„#jܸq4hPllleee}}nRL&|$us=W^^޽fAd9+IZrEqӦM0k֬'NT__~pX;{ܸqjz׮]NsbE*N$ 2dȐ*.[ضm=ܳq#%{vnݺP('y]?$e˖ @wYSSC4IUDQ,,, Ifu?4Bn755qw!va0Vk UPPPPPP©$I=К5kz&XZ=eʔ}nٲ=11 |>JRvO>*4Rӓ`]w.EQ7xcW%4MZ{A쟼cgl=4aE8t ai{?Crr2A4M2QSs9}'B{{{SSSBBjKl"gjtsssIk_zJZNuf4Eq\ױ]r&$$(FAAAAAA \r |7|(]9{GX0`@||<}vW{K.moo'~~`pĈA0 CwC.-->}Ν;Ʋ~(Y &O |V7| 9ۻ |<#gk9o ƍ Cq68袋.>ꫯ~'Mh… 7oެ|IYt??饗4MVVVIIɳ>+bW+YnF"3f裕+W^pwuF_$裏Z|̘1ׯ93˗/7ںWp8?'xBϝ;JKK۔WF {ZѸm矻yK/*'N0̚5k0Ɵ9EQ_|qaaaW iTVV^}zn\, j4!Cp}[v풉裏 ?{o}uuu]i Ə/[ea((((((AGFԤ}%%%^l62dȑ]Xjy?~;:: zyjhhӧϹ=naikk[reEEE$lÆ 2dH~Inݪh +{+/t8œ'OJPPPPPPkY}z!5(` Q26SPPPPPP䬂ĉs6p---;BSWW|Oi$l6+M𗒳jXn0WƘazP Jj#a#Q:BhΝ$Ij:))iذa999dPPPPPP8m ~:Yʩf3EQʒCOvӯW!$IaY/OFR,.B4'''Ϟ={]VPPPP8-Lkk-[JJJM&}(ʸ_6mGqgZ333jɥг$`bbbbbɓZQw3 Z6tЂtAN JAW1 UPPPPPPPPP8R 1JSEr %n*{~AB7"D{I)H>( JJZ}I>Ѿ[NgA^cN"D*wE*((2Ȋ|H`XG] 4"DQ7n  QB 4H%h{MCX6Zc`q1$H"cv@PR(ƀi i#H"Dj+"Zm%( AyPȅpK";E. BlB'nBEMB0cpG {ChUY!},kl)X" 0HښBLRIF*- D B  $hF$Q:ڹ+jd}OEMK|'u1ƨv@HF [ڒIc6FoS[hYi[$flBQ%l4F.Gao`I`ĉ\0NH!3&%J6#s6 !cfZm!(-s- |ԍ%KRXl> 4H $D%17"JeĀ)Pj+"iZmE$1X.VUFG1KXE.q7l8n.V.lć} lh>Gj>ب2 @ƖB|W!B[,e+^s U@r!66Kc$"] l $rᰫ0$Ez+y (FoŒH4H"O $XĤR*=ce vQkb e$EP Rh %Q{| }[| ۢV>)D\%A0:&&dtަ2с`,q'w l(!Gm@ubiIg2 7 0$!h U8jTT?`@_88&ꯏj`3vRd hbԆ$ZҧP*ʐLRz!RAv 'vb! 5|߼!Ytv>̂AhUqfD$`VFdZ$ :{ D,js<ɔ[*!LSYu-$bIh cuh n,E\ЍA(J+F sgTJ&Zg4Zk\,ZYSW1]mWYi\N1{`.n wTKͥ!G-wsLE"ƠWP!b@?{t-Jix$v$,q p X( @@R@ЈdI 8i0Ȳkk[tnXhڴ\1COV3]VS* IN呆$B4q5FMOk)n^ $\xIR%38ZoQ[hA:h;ib <Ę4FyZkƔ6ejMBWǒ 4;6Eu%֒m"Ԁ(`}XwE^^D A7tc!yV?k^.Ʌ<р;HtH0ZJ'֩ sڒv9PI$ԡ$Dž-k. ]X Ik0EdjkBp5P!$a@ $BhĪt >Iv\Ϡ~\ΝKîzIDJId Ƥako G<-BgJ`m&kl)*?쬏v6Z˼;~ႝ:{.1gr T8P9*(rV@ HB? Abs y8 m@8 DK< DXH"=P$A!D#1Xۭ1IdRZ(4"io$][tm u\(ZkU6Refq:k֒6eꬅT$sO XpG:#.l9kHYhTidtjkڒjK6&d4]H_ֆ=A6"D=" ZkV%Ɍ6dzA^(# $Q#;w訜rTicT;AjhĨm(y 65񶤉đƸac.5@2{fl=@P{wH !@  G@A1$bQ$!n|owWGJ<+rQ!>JSb4$-YmMv`5f `;#.QM*DA$E묈)1ra'jaFE;0`@DfIByχ="B5j T@h1"]mMf 6 ,HG\9ȅ%՘t1Ck)P3['No;.ؾy\_hMAi*sIz1z"FKȴn2 u[mW3MqZGPZ:yR7Y2VlN:+$}B1WP왊 1@P:H`ỹ1e&H)}J\ Q"D2@kR* $Ȏ!K@v/lx, E(A!B h=b@简%>x[ܥr..^6ܡ'l}EZKژ]O( PkE HPdCB/AJmIT-Yh֙IF-A$&>%ǢX%> A#RB""vD}_mȵ#D *FmHR3 C|94G<@ 0 BO9)tXd.F1cQ247hY;c"Ss֍ߴo*ႝ~\hH-2 <_!G~k֟7~K ԢA*ۮ3) \+ e<"@ĩ䡏%G@b.H<" ) U@EG0` [nXI|ay{vI=u@0 m= H;vK ⫉|ZHiC>nOo$ϖ:f"yy<$#>…l8;-`kYygFkjs" j``656-~.*a |7/j`Ii1^6c>=)O K"fJeA)\CBlDd)W鯳՘sDF $ BXhD.̇XK"|D⹰!"vJeE)n JA ڜ$ QQb@kX$G#QH,  vp! 
>@$b8 !$a>uIbD6;eKP*a/ HFH66RNk]tm9YJ QbxC(AX4mlD:[Žب F= !ml`hAkͰ6'k )Zm*A I@kww* BÒ( k U[B|$:H@gaĤL sGй=б1"|IЌ>H3s]V] "I,cȅv=+Ę>-Ԗa {cSsοߜ5m|iFQe0ڜ54n0kƇ}l]?WT-~LcJ;ٿ:@РJ|̪-EP%c?% bB `RwO?^3)Es q%a2 N |[ew7 $  H`Q$mtlpvLjS֜%zqQ`CR4(CKY9N86ESqڄ,L$J% A!G"^9V}5ʤ'#D8$1h5Zd`KGܒ ȇ) !zI`S|\QID!"HWATM9 }:06, ,r"!gXژ4VjږF*_VgK{)`hu Sj1$6HF uuæ= ^9iF+ Q(\{l悝!B^Y ~'t!DH\] h5E&>XV"1+0jٖ5)6Bc`DYuG7M(nlz4 `5-٘4&" H %.lo&@@2@J(&C;쬋;5<$, [9k9̡{y;떗{OV 019S]G1D@{U,]^NH?VS^4ʮ;5t\ռjnaG..;9sKŧWcO Y6p\Z(|E *~G%YV:6: ,!R# L@j+ B EA^$  DkoJ"Xd F' "PNoO&61d)$ۇQ$DA %"i8%"Rg}QE:[-*_.!ة2'3F;c#h̘MDk)!RTGB͒Ř LRZ>Em3T4>R[D=,ʅ:vyZ5b$( HxQF` *#aHΚ%h n5H7n|nC1Y3Ǽҧ鶖[1Zc-'>Yc;H`$aH 3cs칔D2jDҬwpAwe'䂝!D|jK9Xk4W4)4|0X@ @hHf;;[K⣁2OjwPGuIq'*H xܻ"jRe^bMlN,0;[Vma[9937oo-J}Ϣ@siKߢTgOɜ=;{ײMriP/{߸ݒ;&{]sR+9{kY" A'kPg?A\!" EBh@@,֘ bn!%$?W!ꀞm_f+ܵ?hɨ͖ᖴIđc >CcIIb`!KGk|+&ᐄAO7FK4\F0 ژB}L@QHlj{p>,Ȧ-agmq7 l%pQ K'FԖ$-UmNP[UX%Ҙh ph.⫎*BR6$IFmLgZkϔ8FmHC{f $XrC[YcHt/A̘0,_w_ $g߱ncβ|'PD\mf/flHCQ]\{4 G8g {ao ;XG)Ţ@iBiDiM*C,55Il0ؼ3Qꨦuf9RTDCJ_}RS) ؅0²'9/B쌄h-Pj5 ܱؐeX(zdL^o'k]QYS1EW$ouτ;2ι%g=ڬ;~}odLKzk7sbRIakݵy4Q/8M7kҪTʜH5^YyԃM&P^{EA}9a 4@6A  ںWf E@ `vv u{$,{#{IdmOΆV4LKYԌ %.oeE J]p^g D\ r1C\X"b4(r%QmI6 wvDRňҚiڸ,ml66֚z,fXdA$$`IID$h ԈOPٗ,|!~t=߸߸' ?x)}Pkǒv6tf[4Shv?WmK.ۖoͫPo/mژ: cߓF!A Kt CfVOou,ϻ![8E((rE C9%`s:]i% sĻtD\ t06b?A84FT@z{ZV: ",ִɖ q:k-x$O"ۑގ"X)  bDHD0FCp,h^Xjd}Q֒>9Ԑ6POl߾&о)hxHZeJhR4t:H\{%LH3c6 ݁@ˮ@{e\|+p>tVml>>GKQb[i\DȆ;Ce55< Ak12;[ˎ7+GR-:XWoK^rX\75 K(ڜ|gLrٿ\5?JR4Q>n.>n uTG\Qo;3*hK $a! *M%sYUe(rXABԚÝ-ۿxS`CSA$S|eͻ㮳'mra>О;;ϵq6o׮~!mggoo 燐꟞m^.>+wq/8tfcְQnyl8%wE((r{9U"zWv T`zfU,Xzs"" 8`bRa"5 x4^%`)F OPnk{}[oo՘2#mG~ b3  `0a'DHmA% A1 Rف4a> UKe=D6$q>ּ1~: 58 ֻZWAMTSgYQ3!p] !0BQoknKCZ6ftKjM*S>!Og'(Q#ߞˆ,)c2IfJmIZ X ljf'6g`oqisW<Άގx˵|[~7 [U5Oߜ3 tC/," ;!G]Yz۸;#&.3Nx"IZI3'Y'[޼Z׼b"}X@l@k$ 9j[6|,]h.RS_g/bHw_[vk 7&N3gʾz0[\ҾȨo\x\U_X?2#"E޺ͣ^W [~Qݓ!GMMEunPP!T 6ĭ: 4F(9""T0TbA UIQ%!YQ>۲2Y-<[t}l>f"akD̩He׆qЁ0B^ fAb P @ZT4G<-]M#jP1}& T;ZgGKg$B1YRZ|&xSš5%Q#杁G-p!w*!1>1_cKQdNv?o[W4s>8ZOboZ=ccϲNbI8L쮚oK(ܳYߖ;є1mbX>Y`~C[sO#h᷷+xY /{) EΞRZVH Hl+͠X֜a.8@Tb $u4Y< `evcz&l|߱۴QUS1edNNTC5߂ΎL:Cqȁ9?cz7YP3)D@0؁2P DֲP[fרIe-BK[=sO4x߱ӰQ9?ܡ6kb̉cmӴ| aOaJant6E\ ;+Q[%l66TVL,U-y/՜4~_7x4ot6H"* Ƙ0<&c!?<&oG%?HGzJQg+or4o͜X8uIto;l }]8W;~8&kri_l) \Qj W:j3V1"qٶ/#0BX@$ |Z,jٺZΚrhʜ~wk h|p|o*Y褑_~mQ6GԱW^Ɩzl-߲)c{]6DMe7 fzTdSP)Kĉ݅!@E=Bև(-Ry*R0 Ȯ<]#NiMVh7kYTк |@%Ab6HӐfNr"@CGWpa0!IᦰUU5Qq7 р%{Xl߉̡ }|n @ǖ֤b.' 
SRO@;v @0oq7v Hƒ3ܐהVR41'zhZV}|WxsKgRwOQ_$JaYz д兪?w9uY{+SϺ6R;~N 尻va|-Sa;pWYq7[˽uͥ\#DCBmn,O'H-r rXRYyO[~RYG\u)~_غm'O9O4 inwG$uϜ-rQ U}${ϯ!)gbLؙgߪHEΞp`A夝C h 8 QaPf`J9Rxn@6-% \co_mYVQƒgJ!~qٹD^rWa'lو9$H0>R%`SQ8Ƥjvkv5w9ZkGfW$Z-)gďҩ([.h̽ۅΐ6ZoFjK졌q97; Q hBk+͚eMdJq䣲%u+lַ@SI㊏W~D0}6&U|?-5ցpqI-I#S﷥O9懒f 9{+'z%, K7t>1Gho9ԠM)k+hHcLw'C-;V|ҼvȆ2'-y I@:lZGVW[2?vءoW/|~ {9ZvVOƜwx\ا-BʿOE5mL9{*!!\LNi`(JYJl!'"hdLDj nԃ: w) #D6D|ΪΪoBI'q1nI"Uو:){.@EAc?|tq{U/D S4- iwlj+?_󪈿Fg-˿ܜ<Θ0x+t=_$ =5:+WGMQoE-YMjFkUg~-_{_Vv8)t<FϘG2\6c,-_Fp/wk HBdwXˠ+U}];<[bJ6^kl֒w~%oKDР6xo[cSrQcs.޿Oaαz6,}ͅs%.kg %?B_KIwm9{M@ {@_xR-y@N>) |޾{r\ @XP# , )t9 ֵ%z[VsfK9qƨ)~(+$1Rb9_GEΒŁg# 0dg^;JΪ;> 8v J(̚dHr Bb SqI]:W+YO4&X2ht9 d^e֗+k̅=UgUQYCs<3`Jaϻ$.CY7mzih"pִf {v/ݴo%e/Y5Ywhg!fTΕ.׶xfů|ϡ>~;Cm޺]wV&6&͒54x%g5d_ߨ_>!'}'IA悈K+=ez֘0k_mys/)Wy3;+Ѷwڒ0OŊPPIcz!\ Nc{|&Tml8Fj2&!zQb!\4YL\ h0񀨞h4?D*AUiSmo֒}A|Ԇo]a>7xwl]Y$iYQ c·@m 6_RsS[هP_sn3x Q*–}lQ$7nrp-|ch3)oA Iv&c c\7Z|Wg{Ά_;?lXAT؜YZK^76ܴE\ՆaX|Խ~0_[_qsF=uyjcfň&<5>gƃy=vH>Tm~[J))bc&ZsK/y\kK,ad}} O7GcGic3\ko[޺Jex\cJ^mpG <Σnӊ`{wn{ig =pPgZ'&E!TL,p%Z"1iXJ,h  R\;JögZdayo-%W/.J{#yD o6"&^8v,vl[q5zK\T[߳MiuqYݯA 9h aJA$uT|ѶzFmNx-k>kv0x7,ȘGcÒć܎Ke*1e 7'$1 k6h=\biQ"|c(qĄ\;5?QN5'L.Mcml+8A˗_=Մ>7SO;[KvuElOG^{šײrW2G=6?kWgږ:'0DoFo͆ _uVz}bAlI R+}ӷ%\˞Y{W]LR{u^13n{13m~m}9nLW~x͂=M\뇫z1}}|5JͲ?n6JQB r6( l+`;0quurd1]!j  4c Φ=a%,H{} 6~,YBL:{-l )XHpt6/iLCdg҆tbiD`8mU[;+W;-oVs))7&%{cJD$,y!%b:#4myz}f0%|uT~^y"Ae0$˷}O>/yʘ*~Uk*UJ7~/{ܫnkY@_3E7=[AW^aI78才؞`,E;[~f S͎0՜9kC˾x~{/sKjKGkC-`̺&c3tJwWִk+gk'u^h#϶ ^SsNtwƒ{+l!i]Sawy\ur^q"tcqX:.Yz~_[Tn$T9sHBakbҺN䣿A SkhضpK3߷+plݭ3$f͔(.1{4$*bȅ%K:le :^4ײi+Κou1sg%MT,`N=\-/:k‡?ڷh-Sb#Izm!NGsע֕î É?c^+8#c0ƸY1&T6WsКoňs?˿Gٴg|}쀓؆aw醹҆ޟ1≞D+SlJ0_ݖ՟o!4 3F]S8Uc׸sin5OE^(+ަ/_i(kE,5#ƈ !ORW#p׶u.)1(ZCA̵>n<7x!A4Dk!R m/A &! $-йP{k{/ClSβ^C01wD9-[ʒDWy+CU*sBⰋmyc-9iQޱnQ|twd i=`)$ IȊr‹b/+ kRom7/5 *=h-1.S h-P4:$3h)\v6,}ͳ^z;y\ɹ=K 6m$P鍩wGr'y'_+ӼbeK~cK·6dڇ7??%gk.o}b.>obeضD 8eU)g@SS3/c_15`k'=Z {k\e& l|QjHcF*3fBڎ 6~[ _Wgwͻf?3~En((r6BLE$_#}<2$* \,2 ?dNԩ΃;Z&Ib%ݦbp>PU$ڣP`\@rV s:w,.[lYo.>ǚ7&ye!aޖ)eLE{_vHf!@RV"6ŒQ  Uz6ulѱc1A2B[fa€DQc~p6/W%`ɃSV$ _!U =v-?~|$psMI@í;jRb?=1|ɵ!wK7 og'nJ|rzNu0Ǿ kYeoLIKxsʸm֌snsKeLw~{Gz ' ߵ3:{LA`p' RgXǚ3:k?=s>_+ݙP09REٸRiR+&i?f! XP%%w4a-߱(xk7t',$s98wMx[֘ƥǖ6`G5 ;JB !7) 6|վ@6wN$9 @ǃ&$6>":w!C"q|Wl2ry5-$a}Ԁ6H@&0J8des/ji7ymK w@O쀐th\qYK8.[mu㦗ܵߛG-saB]fKe3g|vS[WR`5$/0aLFk! !$pΎ(6`@@_l^G v` i/T #r˳e7vЎm J>M>?w\կ{2*W,$ E`t1/EStv|l|;vS߳('8n[ K;GNm˺)Jy(.y 2=qHԸUFOzKc_t|E61ۿA_~cǵͯ1 xˇjK(((rW-<%IzȘDo T) E T@%, ohL6nKd`3qXp9 K;"pk-?KIJ4ӏx*6<پK]La֨mia]Eh,`JJ^Bk[ o>βɑ&-R .I7 qJ%W/Y$e>4moGl˿z7^Tp3S_^F ػ`0-C;)[+=>ȳkH CI4 ;j떼Y%Dp{cQo2q%)F6ؑ? {}^va>H?NUQ6<>_MYBAALJKiY6=u% j7P%̃*@ļqV[鈧ښ19uчW{•#q7Լzvڒh< kޘQGM[_kʐ6«1p=m[mƄāŏRGat"Zv[9mD:DvC;ƍ L{mNk* >=vG6):L@'@Lߏ͞)gxfW͏î.'/n/7q)p'*~ }}0X>[^*|6쮌O0[9r 46yUi).'^4UFհi{hl)CC=rH&64 CDm1  B @DPĒ^I󖗃RiJu4/+<UeXOkLI#/R[{%V~psƈ\{ 4@siò%CU”!3,I%GDhHtu$f;u 3JzH{&~Í_q}=K޺ѸO6p{GwvtN/b3";71rA-`>1Nh~}Cby!c!'p-P oW.՜<>Bf}vcI@Tfâ__+/py\sȝ_3(rDAǁPhOiK !ZCWa4@u"Td6?ƄaG_KBJXDvc/|\~qCGѤ ]qi|+3F>6WY*]왱ك7&$IXBHg'G'yD AԉH%Vrz;2|cNx(-->ϜrG%OώGukm6@%Rb?N7_< K/B1^ӪOvGV3OPS ?>ضPyf@8[ȡw/t|,/[|5ƨMIgz,  aC* FGl^Ȱ7\v6{~ u"S E((rX W S"=u #G $`̌о8,AȣF!O놯}HҪ >ߜOD`JRu(p O7m~]'k3Գr!vѼwrߑ8Ҥs,9)) c)À I{d @^ U\-RI {)>ёP޶r6cOtWG\ o? *W՗+9ojr'}_xuyv͓cT~{*'pl[PS b]r{%p[7zVNo|uGXc.nAڈ1xLJ7y֎zd uFƮz[?w~a}?3"gE^'u`5n1ac#fܵ?լ~8y}?}Q802]}ͫ>A$c4A ZU ooWU#̑$݂Ƙv9K-y=^mL4QW0F{D.o m 2&$>5h2{=EN6ԓLy9gyҾm\Gʜu>ց "HZtlQ0Kq Κz3pk}m^|NǶE^pëzU\b1EBIj6 nes.ҦȑKRhLtJeXD$FQا!Rą#>" m :ߴc#3OqӼiQoǨ m٣btY`tȒ:jd`7퐻vAg}mkm˔0.m~kjhY5uWm̴7N0$YK 8G{J?^9QcY`oͦql[,p&ܚ;C <]KR1xKaփuнQAm?Z^kft-ᔱUoxBJIc,I pYZ!!Ei__{1G68CF>G_x~l1AB"/J#D"R%,! aǞs%_t18҉#H. 
;g&*p~uϟ~Ub( {=ChԉEZ-TfR%@dWl׀ccӦ;*4'7G[ƴX\'nrÒYe-k%gxO '1!xewWx :n :w.mXFa9pƿ&A[4ز` iPAc\ -hmH~P0ې%D?״奄%-ˇ}%Һ˟ɚz,+=PD{KXWQ1-_ewTM%׽o~!j})1GU ~3GHe@*pSxpg ""q7l}ʈ^sS%q=?/L~B59{d5PL~wHV1-}vA/$^8\Q9ۿv(J@ 1 @*/l =kT{y*D)>~iXsÆ Wծ< y_fk&4@stY>i]? 4jb f?c/8'%B0]kl=Ϩ+O d"؟Ė;ևZJT÷j%T4M17?=xȐADuU?V$)s@%::qs(Gc9@HC 1@*/Xs4al~A E*i5sNFFuߝa!Zb4@ND }أ9n`SB߿Fag7/ 4 +iDa=و :PB*\tn7؏bNҷI}>AjxV< E^#(:;B8҉!icJeL3Qkh\QEd5y}>}N_U8PPP>a:)g;[8/.GY6D5ւ 5!IG ( $J(JZ]A8̓Hu02Qi$ɬ @xd]8 YR2aۇڼ3IdB/iYU" uZٰHFg+Spɳq02"XtmMْYSᔏꥵùdIbU+Ծ>oD og':FTG۱  hD0I_=fXtl&FsFî LS-4#!%I9sfBm6:&Ԃ{-4e{Y>}wY "u.uoCZ+;pY2Oأ%K$6mxiь|wS`-V礼wr6ܱmAڄIՙn 0y~E{wy4zlϽA3 yo3> a>6myzŃa}}>h<^Һ&Mg\$= T3LySw]m!9UP <Of%g@^oJA1(Ff% ^~J<5mz,K9'cM*sH܅mo٪Î/}Y2yJLvBWt}>QOCSW)Lc-.yt1$Qdr;@S,f ¾,D8I.p1߁$3 RTizFj8؎'heFk{s%uRV۴?7t zf_ش9gWIދ/2ǟE "EsrΡby: !`JBxNѨMo.g1xЭn|uvw9{po W<ԃ"bv‰0A@2Ί-W-$ t[ p ?iG[`_lN1 p\$G܄)V;6z%r]-Cniź܁|]' !Ky&H-DjAuOBڤ@ǟ-記3՞ [˫;>g政S9BBcw=9h.o>30Hb4I J~77WEA长_yw48jܠM_fM[yN9{㦬hYw+Z&z{Z#D_@j?O(eɏЌƣmD!\ވ tRH/,i=;k~dhC}Q[-ghY$ٴr$rƿ*@&OO,h+_|qgt"NptSiD\66Ò3RyT9]κ2yd,@1y' BV7=6YYENzIkI~8 ~}^±}ۜuLHpn^b.aN9B=T⮖>뗼UjkJH9{[$5j)UB;vUs#gѠ"ՠJp{ !'bڐ{Ww{J?]e= cLe$|[-eySi1@{[D=`@?hzDC!/;ˑʄCda S0`[W. =Kx %O hNP!,r "l9w/AzVc6мs^IZwi 040=䨩_V˚/D.~?F^jL>h*~\`~y.; n—\BKw?tC$^N>R1z> T@4Ds8bڌ#H{ZG|0Gj+++E!Yv|q_6f%Q4W9^ʗE}yjMHCjCJKhcvFZ 5OMe#;^Y }bR'b eBq6-A[W{,O.0fJ|$P:AvCӊ5-@mMI;ooUG#@})K,A"wl|Zc?w>s.|1xݛFNЩF%("#`(@3H#*0D*c<1/\Aӕ7bJ%9*L-AZm&iĜC>mŨ[8bFaݶ!3 ͘ `kD#划%`oߚ'ƕ~xK *=P጗X L‰bIA"O~~*sˍh=gD(_RS|Cl4%.xs8 0G[gۗ /9q-j t tZb.1&ߝ 1u1 ׾eA ?cj!jͽhNX(Ww1v hBDukD9<9{I:"h5mW]VѲGetz OoҰޚ?Vi*X~98kK:T7,5{AKb{/5J4|CjJ3\ K)B%O-r)Od@0c Hެa(^ fF&爔Gɯ ױ{CR:Fk e0q؍h-tl)i6A.U)DK߮15pRb}*tXII\qd26[,TADM6V6ebI7Dc9zaM;qQژfZlj|:*w;"/YaJs< "U:wqucDIuq8!@XBO)}ys5^Cr?{s^p]@h{80>Β{b'," w|Ɛ4S!ri "he:-\ԄnG_y> $!Y4M[>fXq<}G?H|({j @RfH- fRaW_`[qֳs+c_@˞o[h=.9|Zo3:)8ͤܩf:6O\mlzۦ=(aJ%b.;$goI˷|;/ΕaGT)^ֲX6ww[oΝ5m@Jzq4 pU/\_I>;(5𡣭) 5$ӏc ^,Ö?z]?ڋR`!0KJVf.PJU?hTC& 9{VqPrkZ+RQ~6g~s`_B6#.Ek-u<-tU%NzbbnBXWuږ tq >T%CW]3aGh?GAwU*CQ,ֆ";]+&fj1eO4 V|= d{)4`SكeY.(*x@Z)zrV?6qC C<e[@V:{oemJ6 QdLxMI{hUU+uY}~"֏G܁p˕WXGcbyA\tѸ[z3{DA"LmBC" v18pKFu*06 <-W!P;$/uBF.gJ"9H6 y&DD5/h "ADžSچ^ %-,FUQ&!#@&KZމ8B98 x?hOb@A/$H(HnT!,ZfɘpF$yQ8I^ty?ի9Bм@X/["ZB|6m%?NY`]Pԩ9AaFVl 52NI14jW*ջæs-`"@mݵY%j@5!j1- >.3t8o3Z`5LvUg%J@sfZ>(u.#M j)7Cfs{w:#3- @i/MT$8-B$bYYsyss~ӄjhsV[mvWe9%U(Wj'F HI DtJZ!^uL{>zʤe0>q-մ, 7/ F(K2B#5" l.ddky/#%,Y;GgWC"K6iePYASmfU jdjZV/j>@ۄeTi/}$#@F>rkKDP%H? 6s0E_oASX/x,-:{rguJe=E,5Go'w:pd =xꉵb29>:YF)]oYh2Iژ@In)'EaDu7e/ȞF*pYmDǜ=9 cGW5m=p+!,˼V !@Q@![a- 5RIVPSHaIK4 (O=\L.Ĵ"e!@~[⟟u56R/ ndy^mB]:Z.y RL[wꠗsSSjDžr*GX6HqM s 7cb'-`vZ /&FUP* So;YqBA腒}ė[,P_m\!\RC*ዪ}K72jh+{߻\PLH|D>{Vj+/eB fb$ũUlˠ'@+{o~m+@dkdZkORL Cc܊% y; VWgqPȞL0VF= 6ϩ/Zs:upVW9}Ƞ%iKt:]xP= (()e&&2Lk2nO+ xwGލƨ(5@,8sn]"ux*]&ux 2B!+DoSN@ B|'< H2LD!F[Чτ&=GD;>2!,Ma)DcD)k2EFFZ>qȇr~q:-]S_\ϐ8}8ڎSbڨa]^?iU2'ܺpc6PLFl.?s]?™% Wwُ'~l,LDPe1C;466 J+Y D)ԛ_sTb.F/7Q쓤0VyVxzuTȎrBp tIw͆m?e1oD< f"ѷ;x-,`~oU-JU5 KMP;҈&_ I}}Y$ @+1E:Jf^Qyr^W=q#ÛljT*?BSPγci|XjuG :݊ҁ\iPg>e ^u{*}na9Br SVgzC[v폎~Ғ՛vx3gLZ `C9/ '6(!%zxbsKe>B>[r 8JG|cʽ~j|F5bOJbA *H\I x!L 02Ȩeb A=2뉠 +(* G#-DReT)+-8EAD[2=Wx/4B"BId0kP2@1j(BfNIic`BW R+CW_aPXR*>OK,' ! a}BXt|j%kI^0B4bSh44 Dh}/]ܟBuPkNeP:4'on!k2GgRzOQ1:VT R$Kl7gx{f'*9}#@Uytgn?99fOPc)#I&6t@{ᦺ~i4nX7mFKB8E飉*8*dk1#MyJfL6c'(uxpx@)ϋ,|"؜"WIyQY#p* 3pm7D3^?0čR7>߫_ D8,f`&r~Xq1f>>1mb@vs?#*c؅rsj3+Li#ګD.:82X To]'${yO2練޻bHR!EaQ 'W.RePi*P|(#Da$4FAbd$!FUSá >B*P1HJ  s(3R}axF:Dhd9w/K?rOH{B J 8#mbp[{8=na8{is Z<e:<Ͽw4215xiI`ϼYF ~F r2 bB  @@&@91B*%dœ_ɂ/fՉc/AAqe\lY@@kApr Nu%nH1B4do4EP۠ Okg;wS-:vXt*g O'VDӏ{˖]vT-IJco JJVE41VRJdb,4bG"XÑ 6kd$,,R RӒR ;q1`f@SHay7ֶiX ڲUc8s釧3WyΒS!8SO ! 
$RT rMN\^yj^ȭz|tyREDKc8á:;}@!me`w LZ0RT> H>%"@@A! D&DBHb'.S'z2F:&󶪰?.`P{p|L1.Bƥ☓d90Iho- VmOHⰷ[[Ke>wVdIGii`v<ת]Yg>QѠ.tv{Aq)c& &]ҭg}陾?>q05k@̺ ţV$ ,!U{,k\G]fG#sFU]S_`/` rUWL(R gòؽ2]Uуs::xϯ_/6:|6ҀP@N.لw"N{*TvÚ=Thrҽuxk;.H!(vﮧq+/.ҿ! T.[qԲ! 7$]1jY#i( jaԴU27@qZmbYַb~Z)=si fsb+.{s]kmj"cŤV+(()mN3`} _3Qc┈ۏCc7WXRh5f$5J9dwGo9mgo;]Z-Zu]4]c0VH앲.):>`FEGטЏӇp^^rF\On(D ֆf b4/w۫ŇK+OYY$"i_ˮ[Lgrϯ{߿1vn~ L鑎xwwrU6 *hR3GuUŇoDKʪ`c δڝ!5{e\mWo!y!Y&6L(3Siwl5QƁ+ <Jh.GX۩Y[oN'6JI3hW mr 7jhc*Fg_EkEV8Ξw88`OoY JGEBke7'>A"Z 4`{ӶGy|64WZu<'{Ir $Y.qbyk8=k5wd\ f2`Dղ[.k.,p/ Ǩ 4\1h3HAiMBG"{:viƧ2Lmp^^r&9=L >055pFF mxZ~ ecnB@<!Qh*bqA' W.\1a8\ kd/Vh]p̰kBif 70 t0u2x/ezMJ1?{F{^3J|Qogs{@K,zƬ$Xz$MF*za4n!>A{vKM\܌] KzՕ ~RPrg}4q @LKkA'4[1=nч8(6ŷ@.{en!U]1 }~ߔDž vcTc]^Y J2JS qhc>_ulmD=7XGЍ=wC{m?omQg 7 q &26MX  )a]o|{߼BzL ɀlǙJո_`kM2-y/4ӨZP!s\L|"?}}wNOM,Hcj}n3v^|سc_ V!#^50M{eP.šM]HhUSIV4&w;/8)_|ƨigd !2MkGw;-@O`%0=@Zgm1҆iWd/yۍot.[' ź(Ƶ|}1  4nE+|#G&Qu CY 0 X-ڿK# D@ă.Љ 3DaheN `E=6Kr7 NӨ:2 I&IGꪪ!Aj/ fRI"U |PÇ$ }1o3gW=idePdu'Mt͐ ~agoORNg5b N+=V EK ԩ,U{;Lv#g6vg\$A١//mċ6o,t1vp$őC{69 Z#eVX}X;^D$Ip3(.LCQJ@$@^)_!~+{ߟ>hzoBVpg!l'̆1zi$j*e2 %1.=- \nI& 5T oa6B!ɘ@y$c "Uk_Lq1,8[+|rKqd  U/|oI0Ψ N߼r@ː&-CRot͈W&+ҢV gb8(8+㛪v B$x|[e}Ej20:f W {3-CE_҃#f;$d&od]'<۞ IIzߺ⹬h-,MQU`%ir0FtځsEp,g/,mNL($loe64;)GmVѯX&iz.ٜ==SJܰz9vVGmMď4Z(Gw!/'$pKM{ޘTk9(uu1Ի?ϵ}&6.2*Cµ$;J>ߓڞ8XiccY{;\ x|BRlTp!3 +(Or0Ex#-ΡQZzkrO| PĂ;ELh`4);Bߢ'+wҬ^mќWWwrNyK{U>\cs ~>}l pWOw-庎zr+˪^l`huKWOp*uւ6յ,vWsꚅ룂ٛF6.beT"E"D/\,,!V *j31=0;ϋ#=}+?ٗ[pYL<@т(wLQFs|lSDcs^4|Aq1=OWW95;2KLK7|KV<_) ۟~{x^F>=ek3BrX6,Ŷ"J&RML,cSlŦQ8wmUڧ@]i1WmA)/\"4 hvW%T{? 5FW2OV8H =NJ<86"011!ܴ!AӴjj)^B(B^~}fsN=hV{5jQu'pGZ`Á/\="EȂ|:I9&n>/F%NzO8%⒃@Aox$Ӓ(#30UsH(zaOk!Bk_vT_K2ئb.Nւ4+|Xcf 7-eD1@կV0`^`i# I /0<]*z~1h"rtH1ʮ1O 5 u' P;ݹզ|ܬB  ً$Hmnbu[z\0IVi|k}_FjFΌ}#Ŵ$ IJ"yewqkW0_ՉWsϣ\XP˖Ͻ5l|~Wݴ̈JT4\sчY,ګ8@bb"Wѫ~U_c<~Æ[%: $BޫƆoιn!x4<|Y>W!wǐJ+#%D4lr˼NJ=Y?[Yh %Z5p i.N]S|VO߮2s_>?AɵyP43^;p$Oxʞm`HLB9iJRDuMz~A3J6`ÁC2~[᱾}8{1gHC4EWD`4î~F noZ^ > bq'QhwAcfk_*)|ކ` j[1|}r4ly}\׊Mf&.K?wLD5w :Ix'ss:4]A ΣJApC l1Fy^r§=E;~|/}jKXpك49ˊuL.KK»]mΆz\>[HbUYBfH?mjxY7O_n !wNڴo~sje!=op"5"W20UC|*8p\VP۸ya -G7}=9J*[ dS&eB5*~^ۚ}k NH#k7u_MձQwL,+H5mUEo`I[^ .,I@냀89/9\ ~5!4*F VcYYi2F޵CG@|u@Ύ}BGvtJ^ⳛr1g0k&N~n|1ht:7/u=haCCX#LlXՈ uMQs i#Mw8|뒃ƛab!`-4LqÐc_Fs+aIaWU. ]7"_Y?XQ&X(#w 5%Gutod7!m!iоrKVU-KzJ=uDlZ2>wo>Xۺ;mf:lj٤+1)go=.Z_s={s\nisCMH WOѼrktT GNk#>794Y,6&!Djl D D+xA>nGysA i(^35dWfSÿHm&%UU;i[:Vjmk?Ua3j)bMELA I",x>I/IWQ;>|pT =cNU72ZeIP#곶\87t =66_.g02*yugzz7q#-%ZքKU;-ĪzJka aflGqŮ?ĻzRĀ,Zݽ>^;/I\M{sy<ǢO 靤>A'8rURW<|g?l]>7hLzz'(UN"D!N!Q}[ۢYWb&pon/)G"/سnF% Ca:5ކwPҠ4ݎ]c-mt~foeJ W48e$Ѫv>.ڑ|C8Af1}>}gSFX4r[ ^Zf fᗭ oQeMԅU;Cbޛ3@g ًk@vo6W5ؒ.\yK~>|p˿gUanwC{Fa|SzkJ~\\24|h!7? 
nՎMJ{sm ኁggF ;s$#ym eE;h>gܙqcZ5 M䌨QZ<*,"Z b[~OPD'hY\@Gx[sଣr;ͪA]΂/z\ҔNtC-~b Igbt:d+n{;?,YRZm\ûߥ4Twu?7/Nocd֖gVV*z&]Los>@g ًd/>۴S$Kvc]$jo2WaN:ugTp94 VuI{B)ۍ=i>A> q.T;ەW2s׍M;Ij*@L{q.bǞ.}y˪vwŅr/3:Cmf@=_D!F`jlJQTx|(^(lbxKADb (|ꐶ&Loꂺ̍%qgkGG)!n~U߯~鑫’#Nt~u.+ՏYhF3ĸm\%y>I]~WZTMc9Bh@J&2!i;oAEShBA1U״O=9wm:Q51EOw5|<[{pjbUͧ˪hUazX\-p)Bᦷz77 s Ty8~_VWx!Z)(R $pU'Х%?C{u 5NQx /F e]y_Z ʰvmIQooy6/X[{ě}^jn\@, 5j⛋+z@nѓѣMh%pNMIy򾛱 q`Pz zX|1R3QTk]ud5oPfN j&9UUXٮߖ||_\J_s̵(lvۯ.%ef#PFKn,4K3@ZK^S3xu 28MrB[Dl+F۪YvEhƏM1ǁd10 { taW#iwO3@7?JhuK7,R-p0Ӌ7EOѴq%^|ܔ6nאU}v- r # ĜCcFI3tgBE+m0pQ>B=0)#8-Hu6`-w6mњW}b YV-UQGs ݀ 2[vxE, Lx;8dN9=l%mH߰7m}?!]tGN1X7o׍HJ^Sв #_8Mbs5q E@WUW5SjȰ@ /<@>:AVhtym[VVPկ}y_7]NlMbU옛4"Aeք#m󼭲&H;щ(^2P`/x[N 곇];k(1QQ[ `¸_5x0/$=) ][޸-Z"#MY Ւ1J Fm88tuCE~ꢺ/"  1dױy?ٻ$gj]1Л ɸ:rvnL4% iB02IpcfM'>|WM ʎz?ޔ"xY(hq:P[2\Wvp^h[噙a)ކ cg/ 4RK/M {sUmOs!X'J,(C{!`8yM nՏ LaM%qL+0D@0;iw{!ƀ(=W @߷{&l<䘿f{3Lλ'6#^E8}~|MyY =^oEu]'I%_V C\hR*`cM9@4Hill;zW%J"3_)PIKQ@aQ V*lX< XQ+ u=*&uP*0ƭ(I;%dluv|ZfrE:F˼vU rDs#N"~OUf~[Sxƙio9}68JR]2ł6^hVڇ>p^94|`~CGޑȡ7 ,g;+Zך ww=Jh5xߧ6yٱ)`ÁK>}ʐXy$4zYSz[ûkϷҤVÆMJN2 Ne1 ݹƈ$I9jWhKDqn ՂFcWEk>_5bVs/d_H8 D/5`Ν_|PA13S"/{6leI{+L) J!Eu] VY P[C|4]zП+»*VX;2(Ze.87(5KzD:\X"Ң. M7m,p0g;I. @-["2 ePS+q33m{zNۨ8DCxyf_T{dk'ğF@2} KSpr_+8\|"7Ʃ7!( YҒm/r̞zHMc,vR~Z@4V*@Ǝ$ =/`:$J S4L$I:e&S c&%߼M$$1$.%Z Dh&c5XS-^y߼d MO<f:G4:a3m8'<|(#G^O+~/tӒ0̹ zFK-Cܸ0g;I@\[8+Q4hU>[Ypcfm+5M~* ̪VtШH޹daeaTmX$ҁGv9ʲ I Q]aʍc$Qb"A&eukIB@c0GGۿ &[IZv,ðZ݀F҂C~XfLduUa;}Za,"8Bu E1,2ZP( U1@,ژ)Nߐ@+X#0,1qjA6lsd^e/:.Ar^leC}98hΜ9FckW_ ێ"@ST\r7>(@=*(I1}"ogh'}{tKعl!7[ Q IyKo;tiճ}"s5_Ҧ5aGE! ZJOY0ƄBHo024zBaouJvI89*(-X_zGHvi|}8|է" >P4  n 4:# x\vQT iF& !R)T{=uAH K0Jsux<Kr>n!B`0xL30$6k$")JVsj;`X_LѴ`Z fRvΩ`%(OWÕEJ@3{}h,=RF}rYgݥʪHCw/py:Get6J6~QEkbZ"ܲhi [7^RƼoN,,Yywzky aoMZbiP ?k tcpCm}{y kLDPXsXm>%gmz{ٱ *2`kt2jDpr].4䫃"ܵJ MRDۥ*7=Y_kQ]v):/܂ 0Y@>:s=\~sY2.ȫ.\JɉIERWWcuh() 1F]nP /NDэ܋]uG5 g~6\3P XSƪC-)32 06V%U_9T&`9qMdQңfTւ%EWEgоWS".:Y Ps_HF`D <^w|Y-s"}nK` }+w9OX[輲됭7A0(>Kfʜ^4` bi7:*wCE2( hWzd4 j,) TeI0TavvӅP'XWu!E`K7lLB:"7 t*BHJ fs=U_pyy}>)ehw=dT.94+̷@eg%ք6)%OUw2;tܥ&Ly4mˈ:u'撊{laܼٱ{MgىJDW4B v  (ospWCվe!P 9XV;:g[O+#vj’҈FyW\hXҘ)3:)Yljdc_Vx_--~{޻Z'p73HmnD+,v{=ĻϿdg};껲VlGCcf 7Tc U$$11!F6c[%ZjNY`ു,_.(BϪNgonj#Py Xs,۬o4n0'^Ĩ I4`"zv1j6#NEE8 1$![㝉!>-?7b`セԄV{t.9gmf"]Pʛ=)9m/֗|cb,[!9.VY"Ďmͦ9=QSﷴS @:CݔgY WM~ BzEʞz4uyb(%!5xacN#^|o6m̬>}tݻww%--<=B~ѣ,s8QF$:q-OܵX| ʣ_z;uA1~57&) nUKjr;V!~p¤~%w$I~op>scF UT=4PzmˆHY{lE S @uf4C3($ Cf [R GsoCYs!քBxR@b5H8E 06ɶu3}C+g0G{sXu:UW] /DFFXvY|ɗ_~<=_{̘18ۑpD8-qYuT""esS6\=+LPn6$qW CxeIﳕ5UF.f,3B,uS奇طWv1&jArłt"!QcW[֙bX`ߺʼC[%[d}$j Ψ^hI!̨ |CEQ!4Ӏ.dO=E4j3V;g-hbCIf}n_ޗ_1ؾ ?=v,|@uޖY{MoWIj8K bWbU!JS`,%XOn4k偩!v ô+*xR[0+&UD`9Om<5P>{Y@muRЀ!d/g2` O,~~zHBzdZ?;-$s핻'+MQu7J^'n_d6\voD͈:iԄ@ŒDnT;|`!Зy敔~Pէ~եK8p͚5-1cƶm ##+y߻w/M}*--ݺuknnJ9rI"p:t(11ro:n ӈΝ; JejjAL&TTT;v,"" ¸ ...&&7STk׮U}9v [IIIT*EQ$(((..nذaz⼼Ԭ^">>~&v]veذagNsƍG% >=pVv7@kÛ5gnfg`廇~8aҜ.3_k}K72;9伦jވr>_oŸ\#Ij n-(lw :ŁIo۔Xܷ}jsYkyAݬR=ֹi}k Oتm\\rd{dZp ũYYtg]ě)k@Bxq @ijKD;Zŀk-Aя~Y hCzM a'u:;qg`YuO~3f<6lnƓ,^x֬YFZ|9,[;y)S{=?~#Gdff?>tSZQQ/)OFFާOcvmڵfD߿lٲ_~ 8+|~2h„ Fw.\FC(;rW_u{n֬YׯCd1bܹs_|ŵkמ/bĉٳ{ٽ{{O?ݻwS%4 H.YAf_~˴_Mʀȹ^=n77Q Åk_͝jnYb;9Em霘hc_|KO}:xÆ 3f{ᇷlo;~?e,Zo3fرc޿Bz꩟9##oٻw~۫W>Ӓ$իW^޺u_._<<<|„ gnaf޼y>˲>p…Ǐꪫ[o;vҥKwweݯV>j¾⋏?8>>~Æ &Lطo+wߍ5_;mڴݻwO:uŊ7n7cƌB"9ܢ,: FԹEm[sω3ǒ<,tNm?_S-~:i\/E@d]Jj5(ʶpE+9 Z@#J!x%ikhN#PUwB%Ѣ )|x|||VV ,~gӧO0l0a/_l|r n s4G؊ C/ez'σGloxƈ-r`YIV._ypM>_ qu5/EKZNWbkr* @>&kO޻VicqXg|pW;KyqpcB 2~p1ƸIu+-׭[g'MuVFӿ₂I&I0ӪL&ӡCvDz$;v8A 70oV]] n{ٲeM2{3/_;~-[XGkTk= %%ŏe <<AZ31!1~ygrצ5+nM;-k _wv򀵿a% $;f>`A*tQc+gLYeΓ "{jqq@jDCz 6ra):2Joj]B*7g㌲V(8:iFE2$Z6E~Qۗ=єJi >qkLU"iC:[-R!Xn$y',^9w|Ñ8ʎΌ@4MNir8N`Yi替 } 
̙3t8uT('biU}DDġCx≣GVVV3;Fcbb.K.[n۽{}xo.70Lll_{Ҫ*s ?u QR)?MERIA PZZ w}70~ {<ߨ%g%/s|`]TYWjre\\m|HIU+I@ {wf9Kվ o^(fTKɭ:뫖v{U؞#zOC1ٺcѻ?~\7B+"}NJ{>NTJTǩل0E  * !UsQ#F4FvׂHeB̅ߗ"Ґ?{4PCGm {op6666>>>;;;;;)e :k׮۷oEqP5((hM4q\rr2t\o$I**<dt 7'ni$)44/q8Yw;w,˾꫓'OX,F򫯾:<8&&fΝZjJkZ N?_|R}ÇG~'!Ei4ʯ2ƺD,39A M=O7#|HIߛ'LjVF㋖ Dwꌭc{}cՕ=bJvIKBdYcqI*[vz /Q Hwa% d`H+1?)6 12s=U ,!,JVS8N$y]'Bc5eueyUg^<ݻbjJbBٖaUwU]Ui-"k2 .0[cnzOWT񞡽55<|%KgBAxdz㎓O8QVϟ?ST'=/z~`01{v9?OJJ?mڵ ~<72e>w)SL:۷o\~{RRb0L`IJl޽%Iz뭷N|!!!vRD8';K\5Q r6>( qj̅OAYM.(T(/q[ŤLƥ!gU WJ֯Z6u|$r!qL"GL j;d|˄eR_Aβ9@q@xQXvM{U XEb`aA mE=0^ޚ$I&4.phʖ7Ӈ tT璉jrA³Fj3eN8rm\G jxPuX+Bx[^S?ӧos̹馛6op8V1qy4Z,jweǎ;z1cN 0L7p裏l6[^SȰ0(((ϗ$^2zJ ײeDQ1cFkϞ:t\$.\O;#ޗ_~۰aO'aiPADI[tؼu%>i$ﻬ:ec{ ]4oSދYEHbX!xRMѬB/:ZYBHk=3ē'7XH(fA4k V(kZs9*% c@嚬 "4`|e{e׽֋" {u}6oC}QHHȂ f2fĉW^ߟqOQu]k=uΜ9^{mIIɈ#w:sLy3䝁N I훝ݣGC7.**d2No߾]hʼ^+ݹsgZZСCSRRf̘ѽ{wFYO'kx ym9|ƌ>ؤL*Dܔ#ɇgXs-"M}hU]0MoE.%Cj?0.E5Be̋`R{j6 !ݯ?پq޲~¯?]ݗ+d5^Ϻ3;JZꨇ)ʬT *C ( H4ЂPC E쭫;r[vff+|wiTg<1GAA'Nsr][Qh߻huȻlY~K=z2"R}lȠhPmT؞iCB,Z.HKǩ۬MZrcF݅bTHXxQ hQ@jrmK/ 4ѣ?|0SLҥ !fTcǾ'}x>|b.r;655㸉'2 cGRRҜ9s|͈az왐pr-..'&:uYgϞSLiRSS9qē.AAA]v˲ n;99駟~z=EQqqqV2~as>vF:IVUU`̘1 R<#>lS. (b%8R>ɖC^V4ЖnY罛r'[]e;~Qɯ5sbZV pIw9Sȉxo,Ͱbw_ cs &.{>OP(GB'%.aOr@ *$H SJ.jy( #)pTf@c TOXe߬E5agcҟ7Z*L\E΁=^SxTg]gn1HR]t3'&6WY+v] K7Gv H\ǎ%]+nnA#^qV ) H p)JʭeuE[hwo"`ߜ_@McErXg#GܰaÞ={/՜bqi/˸F|ui {55ojYnQvWnV_vى|NXc~[:M5^)*^L[]K0}T g !xAI,Ws>6.9rKQ4Syl|d29J׈a,BPcaA AP |rD(Om9w˶jۿT|gMꑄ+v W?mѡ֊BY9>7!AO/XW>rUŁvo8D1`gww*r )H',#m(b:b} DͿ7fKrM6i[{<>{5kerWg6ާQDzeee:O>ٰaѣ6UlgͯK p)@7y`c>io볷~sjЛX-smeࡸ夁8=2뉧NvTHeFjs"|!twGNx |"ҍAɃXMPŞ_cFyv'|g.xg/pIze p|'=@&A,VG{ ~k97ۊϊ/YK@QXe-t|FH&{(",˒($B!$IR+y fS&2 CFz|D(B@}Qݢ# >&#P+-ԏx<%Ӈֽv'Ʀ(Ia 4iivg;Vؙ#/~S^/U4H Q@p2HnT%Ko,\$Iz$T#LEOLmY xG!Ťjvd&3|X|Jrl~m-%R٤=}syt0dE94CyY;o)F 0wW҅%o) X=Q?* `i!3YWuڒp1o[n%sgΞoDnBXss7>Q}B׹w 8wSK,|Ỳݴ?<:MU*BhZ(idV vTj+ʭbg%IILK }Rgj:!UY:aA m,`( O)6P+ͱ2+?fݵz~it*hWo jea !JU4aH}޹DȘHoT>yIjrh WOC".5bOwGG Ŝ rZKTt;_ejd޺3z]^ic/Po*Q <ǔ@1XNӀb:&0āW~ƛϯl?w]@ f(Ob5 L8ӥ5k;~u}LV~0̒ߊR*;kpBxfE嶆 }xz 9CzkJ=R 0 @<>ݥ#bU7E}%Qjۈt݊R_1c:nkNcZA4(ÆXlNr<"_v)6rW~p֤c~kyx5U^ |7EUy ~1z_n:knsR 姿.jtWi.+cn]Dq_߱a:i2Tuyɖ k=wk~G1S6@a@md@}4&t{ð{+|I!FM>T~`J׻$+y)wẔ]ĘtD"VMIؘ@D\}L(16˜9.|vBSK[t<Ӛ~%USSS[[(^Lg \IDQ.S0p{-L|q?l;ǍiAےZlt|pO̭̗^/rbg3$*?g.*-:eUJe+LIG58z.ipwX\%kVT-x5j[9_lqƯ%:>l%dcofYico}EZ~4P #rSaMP0#:oFhNـTÚ84ǪC.>0555555uuuC۽{AQ!ٳgGDDerrr-K\\\\\\Ξ8KFYBkNŝX6"y6{lG^^b  oz Y/ Й"x&m~T>0S?fQ]_co.|ҩom_aN/*4 4v43 + j8a$2f_Vc_[T`mm?W ~B o=mh_|")_QSS?TQ,2e0ʵYXcFp@T{?},>yo |1UjS^y'/8tҭ[z<J5h Q1L>J:#NbX,,|hҤI&M"l6Ij555}q%##cȐ!Z테X m0:*|{Nt:>I[Al:wZա,HVOpS%MӗMRcDSBXNi 6=vsH b!(ԞkDQXu΂)~U~zf*.yZoR4k_I\mSU5@sFD<A)]#;*NJtHilGh&J)pV}WZusN$h_~yϞ= )))44455`08k4JB#+66W^9~]֯_m0ۘݵH }φvYK6~;ק9=V:a3cz\烼@FIatsnٕQ(&MOۿ` 02J]bբǍu/(;re oL\9(#BӜbTRjQ`NOr}>R> ٠H, Ts{ϚW9f d׿p'o ڵ|"'=|t~n+ |_}U.];bd R%ޔoOVtЗw_@Л-G.0u&]R~d tsϰ0:N'nLv.twIH7؝/a* !H9yeY-~sS<Mwk~kקTKmM(N3@x㾙G+9g_}Lwˊgbya4,qsb&mV_2}?6L(#YwAHLZpBޯd Ivo`UFX]Jik޲f0̙3;wfYV 2$11{\H$?~Ƿnݺr7xP:J\ungYr{^> J~:%q "}B/w 150:29Va\͵,Q-[ّ!EEi>Ep-]Eo')-hޭ ?]p7!jޭK^ګe3OB M,xaNOl"Sq"Sإ{],HP8ˮS_f|.?d m w,'''o۶ԩSҸ8___6'1_AZhQhhѣt7|xʔ)5O8CYhM K7#Ҷ X4[yؒb??L9>؛;z5l\j'-8. 
_JR-k=p/a b?9KW\ 6"!"* J(]J{{Zn=Q ixR &s``#xP/|5+w|{h|Hs;!fy Nu(EI)h F)&=7BrUyqW4|[tdCAfggo޼y֭ E5}-[={~aokt9+\Rg1T U,kGqk+?W$E&~>iA"69g9[ V`֨q TQ*>D CnZs;8R!ȭV/-R((+.[j$/ j]B -yn_*7co]im(o#ӌG%T deݘt|WGeaE}||u Xg7B %6.2ʛp1Aw%y-7O-% ,3dR^`YO[oƌjVcDǣySBFu[9Ť;85ķYxEUd'XƩs޴9 +&4Ôg/s5#w>|QkP>|x̙iO\`I$ )G`}pa,L,̑UQywm>GDvY=uW)=w6Q1TI/#@s n-w@oKmI}6w9ks]E_c33;ԿT/_-\"@xzmJNotn܀I]4 -{*ڡ=C[?q|?7jT+&Vpݽ`PMor]<+^;dY>|pTTTz.lPI EOY0OVZw"ʔreѢP%AnrŸ.JO[,)pP@ ma~#zcy[w`!šBIYRL!PdzSօh]ӫxK[BNni(9p8֬YӮ]Ç4",[, `֬Y xP,Y ٫9*=c|^A=ܟpzmfnq?=֗鏍 \xOxJllAU6s>A4YQ+IRYY^t!Ibeb3`&#@VQVdApDUXuQ1Wo>Rڏ9ا31(6H呄p=933{>ĠGFvפNĬS**ȇ^(y#!8j#P#p>_tMo|y9sɩmDr! zl~g-Kww}o毿:v[x\о4)9 AXoc:Ok PDǎzhy%(xӳO|sz8j;6+xH\o$D ISN}W7`㺿s2r&1Xõee?#Ǐ`~::F?n-Cy 3lzmKXtEpxKѭ{N 9vO9kkcz 6lNɚVi6}㉹韽{5(> (Ȕn"}+AsPJKP%ru:'T3/}?1 ah q{?ֺncǎ+V9rРA7Z۰a?wVZ5HmuGB4UN3}/n[R~oOXRRv mW1 c)B{AA o~ ,)JA^P_G Hw(axvIηs.o%;m g/_~>}tt.((.X'K"΋[jʲẉikj1e}`eς8feM͕-j٦YzԞ̒ dfuEFKݻ8>8,*B)!V({ B>aJ%`2rmfs??-cт-ʵEfsq+?4ko k5O0 C*à~n J}Fy, =< #lRvIVpZ`b}q|˧eۏY/kgfst' YflqSNmȮrVꙺJ!WT>99PrlLSXs/[/;K"}ENeKF- d:)7I*-9%JRdԢ`#j].Wa~![\閛c?sy4I!U.UjUR*K2hLW*yRr'6}*I}}zǾ8ˎW k7"1rBd C-܎ k}vD9ۚ'_5eYL88 0+\9C.Hvh+ +#KKRv@Y忠8Ez*[(6EH(Ԥ1Q06!$eK'2;igHJJՀ`*cZv/B=ʴU 衈5/()9#5$E$IT ЛL}":rm۶?НGd2Zm~Xzk:R KPsa; WH؉Ztjż}\Z̺.HfhOirb`.!xQbۢRf6 ګ{x_aO\LsL{;m4oyg3"=:JڸySB>V#-IAuj N֮yxy[p^j-Z\HiӦzJ g좊·yy'}"ń.;oM.fKl ${qq3g.7rAJillle%7yyyf9>>aO'O ЪU+ɓ'n1B/Mwq_#UInnnff敲 ԩS6JǨ&ҧ7O_meM@E!Z~  YEe9?n) ,6vpelɗJ'$$T999YYY,˶o߾Z>~HLL8qjpJ(--MII֧!f9$$D(UxvvvNNU,=+--]w}1>>6pxe}%yySBB}U9i s'gӈJ;kP/o:m+y`kxҔF]O>$==޽{|SCΝO6Ɣ P9]evC k3>i9]+Z9q%w3+y+$eKB[q5Z~; #0 }}}}7mTѣ322p8!;f̘Yf%&&V{EQuKlp/¥8ի,w}{Õ~tjFMÚRʲLdLΫ /Y_ު\youL8z!CqHHC=CyyyA}5kּ+DƬFQ} :~3Tcr_Gf)-^wuW \sӦM#Gs :)heK”9]OZ#T}aZ]<+y%J B9isVEe۶m>*((8v̙3˟yq=nݺm۶r6##e9//UcǬVkn|||RRRN'0mڴAop8NC@=V^k׮ڳgU;l6[֭#""8r!ڵoaSSS,YzovԨQY}|| 01 Μ9gϞ%K{+vvGXx=LfIYHU#E9ƷZʴjI藱+Zę5༼pСCONOOO:5džrمI%%U7[OgbD"RY@칑4c,5£ :O_wfw?:ST!=׫9PH͕l.Z, ͖-[ܹeYSgy7タniǏlҐp8:tpM6U Ʊc@+':} B}L6m~~~ӷosʲ/{7n\~ */_yKgSaRmvM95kXjA(k*.㲒6xDǎ FaW~>z9bR~K{(??_y睝:u曟zꩽ{ZEÇϞ=TTTsNBHn֭[U'WVuV^_7oNHHS K.(ǍwȑxذaK.*g\`]I)e٦5Ο6eϝ||l#GS!!ũp /9ȾpCC}C.61tIYlU9[ZZ_0gΜj8 !2m!I3ZYQMLaxUJ w>0* f!sn=>Xk&|ٯm/^HOF6g 2;i4ufr0 ˲ h4'|2<١C 8p`=?o߾j:EDD<#խw7|MȒD~&!D{3<^7 L|RnӱwnCt:ݦ֝;wV*sw=nݺСC?={㾾ղJ^M&̙3KM8*Az|Uݔ7Sx;-%ǥDOR%Phq kPw3 ~Z\hJb4FcXZ^^[,'|2&&F}5Dn޽7 @)պgHgD &€.Je,|q=Gez_CkYQ yg̘QPP0v-Zݻ7`=qٳg;t6_;wRJ\H%={dff^Ml~6 UKv.FKہe,75ZXjm6[vfN0SEE9+랰 'jnuzi3gѣG ЫW/5nS}Bedd߿cǎl۶GEE筻t1>}tffU$I? .aRtw쀒$]Y!*vng9nzG{ژqc+ZG>Rx"͜eύfwc5)0Ư?眸qVJ5 mKy]zOO j-#D-UₘhY#GO$h⫯o߾RB{nBȠA3"`T(׭qj;vTz+{U}-Z~}Ir1b…3^+$h4/]vW Srw>z[S8 !\c(w{T@~Y9vr0ehh4V뀡K,tUСV=zhEEdZv{0`x͚5oqnGϞ=kEq EEE7));psG9~cǎ2eJ֭[׵kKz6m/^\ՔO[o >]ژ,ϙmݥN ҠP>q8@x^5)O[㥻D?7I'b[BQ!VZw+cu*x&؇}fb6T*_n'IR~~… ϟ۷o{Aj2ܶXmٲettO>wNWuL֠jWgڴiURPPP\|T.ծɩ$JCE?3iTE .a_{/9}#=5' .4L-[9r$55cǎ7nLbbbڴisС'No>rޫjOtSTT6Vs&Lxᇫ9;vҒu:]攑UrO:-t|Oo2;0go 69Ī]1OQ,V ֠KY_JKJJ_^Rs"##|3UTEi)UhE&.ikk֜Uș|)؛ 4]cl26{キlܹW~ݛ}>t:eYl NkpPȹ}aڨW &%dEL N4CC4ɚC;%d+]X]*ctvsZhqFV aΰ,;k֬:CPz.h`@a h#5R5BvmE5aŊΝ;W \ڽ{_~9k֬nݺm۶״=5C'Sks%%GkjS~뺁 бz!@2\C"Rݒ #x3Wq*(Wᇿ{DL)-ӝ Pc>}8۱c; A*uLvRRRVZ^^ PUhBDQ|cǎ}g_{W_}[n՜ohԨnyK>~֭€1۴i{"p>Ri?Y9f_y啿;55uΜ9;_ZC+ AZ#Rj}|u'C~@/"+ʮ{M,1(uCHΝ.@! ]Wfg fj)=7RJ+.]v5pʋڷo{t|:Rƍ-[vR~M6޽{Μ9;w<n]exsDǿ6m%ڿmCDֹۜw4k" rs~a|jF2WS99OIq2􉃘M SCnkp+s:M֙򊊊:H[ D+`XE~Zt|#?_W^`y4:.RM͕(ҏiͥm۶%wڵk׾[nr2]v]ho!4bĈ7xcǎBXVVe-e0 .[6l_պ9}={.5Ȳ] (jリ[jtܻQV1K߬Tl/dLK14uIX^w94,{ }I; c۶mxW. 
Ç߹s={Zn]=!4p 6l=zT\w7mTTVVk׮wyrM:uܸqWH^;ȑ# ,x*[-I+#5c,fޔ1,RC%(0YX;pr/!J?,4H旖cJ- |bI)Pݩ{ۜǂJ~>ts[Ɛ̭82 k3'NXl[oUk ,u<{"ٽ#{gw|/sGxky[%kk]TdТ ;WyqH1iҤO?( f=z<믿޻wC;+cNWcԨQnW\i<4M zS鵆/^ݩ 5we֒>yHiJ-|HQ)@DhNOO`/Yc ,U֫UvR+gNVdkfRɭf5ku*3?둑͵"=%x6`ll jU:qDw٪ҤI\jرw]^^^q6 `ʔ)(V8>>[n}:66>*-9%|)xcSqq!1u &+C#! U\ä1s _g7)9~vSQXgct27zl@ 2dծoӦͽ[ZZzmUSm>EEE<|}}'ONaPFDD3:622r„ -Zlgyp%%% :SN&Lp;^Vg8p^zڬYzϠƷV=U/;^#ex&T%"JBLhYg>xtɽFzʾ!K)I#C >S?R@^+qZgQNjU״֨'1Tq(DHvzʂݻÐa,\p[-C U, "²zU] T.*:Z_ڿo?{gj7J)@QW3uE٬O B4jO͡Sgܯ#l/^ea.k?m+RZ UЛ<Ĭ!enL#s8W~xٵkN4Z5s r//vq%k┷ԯX atfPyGFTe,ߩ|Yk[ZgLx9}}2-[M}G{Ûe#AT- -a畂%uxxY:@!2&bL^*C.$=eiNa7{?:bT*\y<0G4*Mgs3}T:.ه[lIK)V Ώ jU,ÖyuE^uߌ}-.("9-s'2 :%Kr`Qk 삶Lݺ]4!׽OU08#-B~&2r[A7$I&JZعs:>GfمXh˶ jyO_}j}QpcDB@j,41Tղ* XRAv@ ! Y. AUaά` mu(T`hۛI !R@D,2eй }{**K!t&;553ASp[zEupҮ%:]ul݆:]ſ}.2=8sNoٮ9B.NO2 [ g!ePw/o$SvYYǯ4:Msn{SvO=saYZ+8.@xl;8rVKeXl%$g%ܲQ Kсma'д\" !WJz":$SKD.k 0$K}IL@("?+f;D"Br ~y!3զ;K@b`1bF!գVy_6!Ⱦu:#БNW'&KC[S7 ,KQl{_lNohcDTHl}TB i#'񀯯ʒCy+{; xy3Ig>qEV`]N>߹`Q q9? _ NaCvW=nӾ}:n{XO@rWyv^\tH \eJ˓UWJFgJDYC;DB,2g1XoBإWp>WmU,B 8ZU(ȾFtU+-ƨXiֽ{18SdzWoݹ׾~&". !ԩw{o7%82Ht~A,pPHR|O_ի= :tڱdcgFZ *s@.`Vղ*$EKvdAR$ ՛VaA,&Z]3;סs2{Ym,To(\PZG+E!Z~ ";jXmWDoCy{@&D$Y咝VG!A?c4Fc-Y[kOLh\Lh\N Т;n€*-5_>gЛmeC;yЍYkaaa3g<ҩKZNJӑ2U^21xL h Evok59{q 17%( ^ĸS} CFU*w2wLGmڴE5d !grz8Iﯵe*^fd|wO>c h5)'PJ#B9&^ZZJ) 6:P6k (>_lta{46-"D2ּ. yiŧ 9.caxҲD^(aS:t 0 O18nXMrO'.N*?[dϯp9%TTZw:;uS'?g޽}]QQQMpzZ} !ҬfhqTp`7zټOVk9.=~WC<"=]{oE5! CdH0!DqhdL$IY9ϽsݤtRI@9"E\cD&"[E\ cr2:պӞGI>/l>5l̏?o8p`kޛ)R y!x[޴4T"+|/J,C&8ML}Ux =$Q= $I6˲]vX,7@VzJ9bWqUzM͠H2MɕXD0XղIEEtIdsJIVyoszYTOߺn~p ,x34 m}JrJNI습U#atQB'[(qQ UA{9q0B"Q}L0HܴM6mڴ=:KdE'Rmw 1<^2=%ABXN2|.geѬiV+j}|Z1(uCHΝeiѺϖHDITDAvE[V1!|r`ߍ Ox'S2O͟Y?e˖uHFuB7`M>ZRQhݺSfȒk6 d6.ZڡY (N=@mSƌf@!T2S"ITp6D+ k](k+k.: - Acc{RB%":% "TeSQ0:f@]Xt+%JؑRzjxփM EB8=N6 \,Zk0ltCqq!1Mfȫ؀3-R*(.Q\]TDAr:"30:5X5ߪ48/X3YDa%#G~WT'=ԜDIng\,Jpm}< 2i5m\ u!a {j3-m/<0FcTQj]b[jˌBN輀E&D$\b^kA #A-ByUꍖ׽=׿z&53?lyV?ܩShsNOJӨ"q:׬;f"*-pȑI&qr*sE8myhQ R;i̪ZiEP\.Y ˱k~V+ѩS'(/ jV#|kZ-A.%+bPE&2Ykü;uQWi:<0ذ߼lw~۴,_e1c42Rc kn1 jHzzFO!xLtdJ 8%{ͦJ|k֫A2 !]VDH"#6K+Ft6 KI]jC DXtZVw~&DQ]"]"b1d,fyFa4V6 6|DŽك>|jqγ=˫!6x!Z3uB(Kj&Z9Y(q9Ôسƞ+Uʔ bPlSD].F)e1 ֤857EQAt|`\N!,ɲ,XE  @<' (Bc)(TA+`9[wwp - %!g4mjiw THІ<>ጻ}ၷs2go (uZduA{^ڰkN g!gDr qaiW-#oz:J VXBdQ)zV c4N<5ch4vl=ixVSVRy-%h g>1xTYV 7oH@YV6ա[;Q,!dN{}&Ikc0b-ײbp*D(Y,JebbgFa^(o{SX3SJ1Հe?ԌSaDPjY k (%jۮDg89K,ŹY{XfO+S H&Ьo=D] +(8g&x#[aW.-[?9+E@,B;Sd>eTxd/b8e>o@>kIR @F#-pF"N[NAhY-y b1RGm6B6T%o2 1O1/&,gNyk灾!:t} fPi̤<N+Tm^:;;{̙}vƒ8<ڊJr˄g, !]$ױn*^c0 sV2LvK$YV$AvӈDˆc9gxBUє/0]Z.ϰiӦ~7hղ@ư[Χ?;_ at0꣏?#*: (ʜ9s7vzy8@1aZFRs oU^E%!m5pF=g0Z\|o3ˠkթyw*g/? A,`o)%%9%(Sr(DQN,Z-YXLU+ &,8}U @(a^LHJ[6@ϓVBi^i-b&H=WlKXQ*kVثm 9O3e纭B DlE|B;h83e1BᐎGm.˼v~SG)4hP(QID.g1 ((6-zd؃b--a!6=-u(&P]5Đ쒉E輴>F35uJ^@2 gs̷")DrNAJ Q0b 9irjh0z4Sٵh* @M燳[F89HvUhr20.Hղ-5ŜTTͷF=w.\ EE%; ݁0û<~a&ٽ{馛o V,ًYR_(WXcjȷfJyœ씴$C֋5.d6:u^,2W o(2Mʑ0 !zMv&+EVXF0**7Hy) ]SAv:%]eB)F1YLx,رcÇo։>Yx߮r_8wZC&pcǎΚ5Aye.g)ȟkQ!t, 1LղWV ʂKvE2 byp>ZVTTTjn޷W& .!. 
Vʀ ϳZq fYĩ"p\iCߞUNXY^vm` M8OK=4mi ȭsvvڸ-[6`&g(|B+]$!(yKBezs a^}K*** "p ޽GEER$) ;E)aX jyѰZz!rLh+SV;NuNDϯ1 *D[h/сOyk|cۯxgcthCӴj}-nשS-|g&{(yr u%VEEz[ssZ`$d Fe*5Ѳe0yH fS{sgO>zetqgzR@SۼS-ykgڙRBȵirV,zೝ-˕`b$w mNىb1Ƣc fɠ.)^n(BU{r.\,nzWRp/ߢr[s`/L٥P'N5>z {ty{ZC!III]tid=)/>~v鐞~a\\ܵ8iqg)YtMNeɹJ\ЍeZaeE`V:3Mz @YB@]f_L8J!J^L06Hw9JaIa0"ZމLAZ H`IfN(ϊ())h0&`wV?{+Ȥ0$PK~yw3<:ohҐ;j[ªUL&ScYOfGGO/n4-{#GZyYvEӤt*9%JTkjZftֳ ]v;RbjMBHSo^]Ö,#]ten@=Vթ!e!b27 oPQ2 s2gcr߫=l0vQIûn+~( [ #n >kJ%BM:uo*PA.-qrXuWBjsEr.]c)[8F2hNAˮZ,bBwiE1"#PT0^)Kgu \"/)b+bށ߼E/}ONckVTT6rzC?-fޛ>{LQիWiӦ]v !f9HyeJzqJKvEKrJDEXt<֨VXODVm: 7͞S4XhXydw aw蔜]MκDnx-wIɎ:zC#orN~1tO;-gė~xd8ǜxvgW, #lc챷{gw^n&6ZkK{m^p±ND֊Ndĺ}A:sp]#}Me? : NJ]j.<+>B02﷌VE91Re1l.2>+wr+^6R~K!e[e }>OųPb5 C;o}sϩZ /awFxͻ'3[Ƕϯmd2bez ̹]dz~d0C|6Y=?{"@Id0,Sp 0|PQ)y[QBeEZlCQf-1G<:>hddZE4ؿe"5Ya.ǟ|xa(C)˲ӦMϺFNLlU+L$Iz` 015]df,#ea7w1zgjNeh/E ^MJ!%0'?1,m~-I# nU~[5v@!oY<ߟW 2;+z[@vqe"( Qn7k[xÄ Ұ"ڻտvxs际AE@^4 e\,'eE y~hwtOwԢFݽx+(A2 Xi + 9iui&ďŲ s~=sAH&ďcDJShXaNfTu*;#?dnH~A5v{_nl^ֹe ?}SOwLRSRRV!%9(4p,ˈ|jYڦsԓi;69}1:n_/fY?rp α// ,+-czpLک~@oI& 9Ⱦ UƌӰiޘa#wbX]$gsssKKKtRgPJ!c!9ǏYΖHj#pR+YkRP߼q=7ڏxlûlnwRftW>gEL)=8Fbt6D \\˳";uUL5]7?6fj1)Qk`D.{ S/"F]/sXEfedA^k6^j 0ZJm&F(r-;} ԓQ$D:],>՞'2b]l= HpL H'_N9V ˙{B쯞9?hnbNt;ޛ [4-ym_<ٙSε^_ HEEEg2b;k Yv$wˆDF&@PZ!t#Aj,G:lYB"#ۋY$d(,"HBUl]#Bh)*7DÛΞ=ۀr?޾wF#,چXm$)2 Ðf~PxAE10eeegΜ]9ـw.0CHW!˧_pBp _O^ă,+]kGC]fڶ/y}lVձHz?BHGv24Ch #ײ>k0߼SHku;_2ccg{7Y4BHPPPm˔%zFdzLXg?aAAIޗ/ܘϟrlRMjMs$1LղwVXNª FʈyV|{Nԯ?+&"Df,[5{[zA[ȞȈ}G65<#'BBCdW 6s˰ a牎^@ܗ(,7YxlN?L|J#lLPƈ[e=}Cjhmm"Odj7(tc79yvȕûle˥?*,Ҡx%V.)za$V eqԿãe6{ A?,߿綳?\RSRlއ5Dikݲ ͠7Գf?$/1ԨHӠٚOTHaYi)BJ"Բz R}CIg:y;BX,wq,=lzw{1hլ)vA{!J Tىla:;JO܈1/.rcd~~ΦA`)z7ogO^Z{W5Dn 3Jʊ7m);k`<{<=@8gڳg}+{۞Od/;!}Cej6L* tL%V3 dB`.|_ڵ' oWϤdz7Czzz```eЫt8%BuB߫vԧ(⧳ 4Fڵtm* 8- CXihYB( rHvhS Ŝ3sѪǹT.3ˊ^rQ]#rܚO&fj US 1Oo^ϋg?5骫b+*` u7)A668}cێt3hʵ/@-@I_ vSVS_g:zͯωӋg?vzb[v_ҩWhD%K;+\JMs_eB1s>CX ykM\?z姭#^VȭOƆ'~f u@1,ӔZŠRfTx~Ѳak)(V^ϖ^ +)@Z_ 0 V/̲q=7~⇿sAK ]ƒai}Z]sHOE) V0RSWxo^Q/}it4pD n1''3:T)2wE)\{!!eLOk|n7t Ӊ~g:Aժa ,#qP-,Rr5x!unW\d@׏=۫G?}qF'!=Gao=ЭHn&ƨ9XU_ݘO_۷jJv$rZ:hYJ "8$[5+^ªȷJrSWr*#Fu_{er?WﺹrsZa U'g͎5gł27ws3:{l쒗Q`uꭎixWM,EjsrŎ~zy/qXz@BT;$*7~{ ݻ@sij_ۯ2:e,Ԩo=Uwv{u݁˷ֽ.¿G >![p4=Πq i퟿$FYp κONf9$"16Ab-z9w}g6lعsgJrZKI+ٜ$1e)"EC KVdѪVXz4fHqlZ V|!;tvh rAQk):mj }vU^ICЯRys0Y)q>§OI{ꮗmm٪dȶ! RJm^Ǧ(Ks}0"8ISRR-[6vVZrhٳraARJEE)URd3 buA7hRsBa?߷aN1#3 rf$OҴT@˙l7brl (ʛvoL`g8% w^8hjkl]#B!H;o}iWU}J}U9Ұ"sǝO{o?crsSs"bGvٲH7s}|٪ #s;+_VNtV_"Rn7-:1o=zF~]"rA̚5 VXQd"+DѲZoѰ:s,ԊPiI~#{AޅeOof;UӗmU5.FDR96.o3VXZ?c^N=hu)zeݥedﴻtW:|s(aCH2{K(ؠu0X'CЈ2 !5h *.R`Y8H1hW*Z-U:fR܍h(ǾY_ MKBVa`vqsq,b(D9Sc*4,F+-7#H!Xrܻ2C|C|3seUB˰TsKe:FJZNx2RdhrVͥ\!(//Rt٢Ηc5ȝYʲlf͚1cF۶mYyPҾyۮݽ]ښd(DL>޾bS6yOOUe-UrJ,U]"T0 Ҹfj:Jkʪ;U_VWW\' wU5ޘjyR2V** 9@ Ę\eKW)oH9YC!ZHͼxg* <r2v䶀+ia)W􏥐adBо6D3 \s4.lAAAùA~߼0cM}*ߕeY&buǝݸzc>bxͿWS9wˍǏ?yA F^Q(G3ZIEEe);U1 f5Fӱzԗ` :{`D*EPcnɕ_bt![S17O߷P̛7 u/=vLh_Xamr[ay[:V@Ar("(2QxѲc`v$I^^^|_v<=FU u,fQLB^eMШ IUDA ¬{_EQo^^ژnݺ~CB`0,^O>~aQʲ]j#ɹRG8e] KRDH, GZaUTTTTTjTBBe"(* ; Q0a 9Ѳ:sL߼ysVV"o?67Mʿ* 0,^^[0yMS3Z~E޹$Q2"80jDY'?.{̋-4h_g׽xWTT|˗/om.#(rZV&;5>975qf"o!Đ1p&p>L$Id" %;m (1 btZ]^Q.Pc\&<d|E NMžkAd-Jf^õ钸`ևk_?ptߨP?U8{߽~:+OzW#W<8m4o gzꩧBBB,˱cA )[2h`|0[KYaeC [jUQQQQQN4F @)> *(yL3 | =Fńxiyoz뗚}&uxE L*-*6iw* \bcZF::rtd sz[K9s_K9c@ס"2$kٳ_}U۶m/{NW\7nc_,%CPϢ @├veEfsVEEEEE*s,ЭS~ r6P- 0Zղ9Yp+_t]Ϗ{wb-mqɲ8!M"rK0 A!A2K=SgpӒVwL<%%壏>JHH>}2rV吐?<**Jӹ\ .b PQvx;JSvAY8X:L4}0䁄DI$wG!B:A0Pkj!C\.) 
%.TD汮"cy3Gy:G aj2t괻Wo=.J؅!#,1v\goxQA!@&BdI̙5ū6Nl~}a:ܗ_Hk:aG:J0[&_aԙ{M=Rڤ,` 1zj_A/)RUGMJJJdYe=[^7:{*{ZR^}ik7~.2<5&+*_oj:5jukwt~ꝙ %ֆ}wٛjպ)w}7::{iUܰ3 POw4&*&6e"'98Q5uAW 4a>9M%A6b2诿)>5V=~fZW bU v˦:[[ q?K̙:1!I2#BdEYY|@)={lV嬊ʵeee6O>BXs༆__7g7.Qc,R@]aߧoJu:]ǎ{ٔ*gUTTTTTT >hHHH]pjm(߿@5Ъ\LffuԹ(*rܻ°Ƀ:Q()P@cZF5 \~w}w7z 1*M_EEEEEEy kE1BxN^sZR Ȁ}:w/tرviYe 8tP|||H:_#e"24Jz;<6.^)YJi۽e|B|Ö++^}՜TݪUQQQQQQin5*335 d|g)N}&&$6TwygFoNխ:x(*Rͧ !("8k[ mp8N>]^^PdYE8 Rg"##trD[9 e{˵tRYfԑ]E̙39"˲,nQ !X,;vbcc+/NNN{+99|pyMVTTk֬9qncm۶C-Zu]jRQA:tel6[M'֮c\{W,g!ێ7Vu]&oC***B޽{|}۷ЦM]v^z?pmLMMݳgO9R"##Ν[ȢdB9G aÆ[oeٰZתUQ<ܖѷ~{ȑ,lVVֻᄏqӧвeիW{yy5&e]iӦ=zcǎj^O-Z4cƌ#G֪XECy\x n-( ?6 Ǿ:OJJ_222F+i&Ų,`\ţ nӦM߾}Ǐ~g<8>>{APv5cXP1V4-; Q!DHU­f}ELٙ9?xyyq\MM1cȜڊѡ!BUUU@UU7kzG XAAAsz#޽{GEEٓ:~)qԩիW{{{-\.,++cXR[[z.5_YY)L&^W_[eeePXQQr1}}}j٪K$P $ dpؔT%A2f1L!B$IJ$R  B!%$b)b &ۑJA *kD$1X I@ BJ D'ggg8p$ɐ?ǃ_GEGG$233 PԔf̙3P[X? }73f秤GEE!lmmsssᔣGB0$b1 B رcB޾$P8|aagn{̇@ ػwҥK|dX$R(Xi!vy$ !%D,D*feeر믿>|pss'~94DOH b) ϗJ)))!;;7o&%%S%b hׯSkX^^^ߺu EI$XQQ/@ʀ. $!--n>Osss۷oGֻwoooo]] . ,Y!7CRSS322lll:5g0V0`@^^^ff&QL6I`q…{)//0%4W/װH  &ˋج?O#""?'~>2A1Ǐӧ˰ax&pa;zɱcǎ;iii)))hb6W_\ti+W &O>kػ [*a‚WD!Yq;=}jJp˗#F7' 棁Λ7ԩS;w3gN7~~`5LOOO[[[MMmɒ%=zcܱ`>*x<^UUBT#A,ԩS[l矟={ٳcǎ/_\AA [ZZ/ 1d---6m:qǃ#2>G޲e˝;w|~BB_@uvvR_@BI@EEX,.//g22ՠ;CZIIɶmd1l6ߝ hhh,Z觟~rqqP(ϏoL&rQVVr5+WINNyf~~~hh9sg*H`eBUV?~\CCcŊ}qqqo(\cbby<^ccc``໒GWW۷=2@SSSrr2B=Ɣf~Zfd >nL!h_`#\.w…A|ZԩS999vvv˗//Y F/&Ir`Ĉ22sN<< `˖-駟̙%srr'>|P,{yyq8`‹/2Hw Tvvv_ʢ$fddPO!TUUE?722ӊ`>UEKW/^߿?ļk)..622RVVVUU780ɯB>>{ׯ#VZF U]]q˗As)ؤ$(|!PPPɤ[g!R:x"=z~ll\]]͏שּׁ>}:jԨ<ʋ}Æ aÆw1#KJJ`jj*eeeܹ:1))iΜ9EEEia0zuGdhkkKLLܶmݻ01l0eY ܐpPkk+lRUU 8p ͥR\\KMM͌+WRYȑ#ϟ__au EEÇ߾};((hٲeKm6uԽ{"M.]4iRRRAL&sgϖu+L4ի&MG^(k׮YfZ*--۷'O7oސ!Cx F]]}nnn۷o_~g͛7s\KKˑ#G:::8}:=L޽{Pի'62vΘ1 x<HZVV6zh*(Ǐ=Z"PHJJbjjj9997,[ Q̘1XWX}v˗ZZZk֬iiinڴIEErb0 HVZZ''޽{ϟokkܷiiiTpL0_SPP߹ N&D|Mee%h8 k׮M8q֭2!e|>!DO;PXXfA| TUU);6::LTD"066'Q}f;99q8>ҟ*  y<Ų[e ùLy )g///kkk333߭Nnn˗߼yiii9eʔO3w_;`0].a2޹qP(d0]Uj*#JY,X,>|0B(((菮~gQٓK*((P[-r`>MzܣGX,f0ekVUU8::BT+ i+ry{v'N?zcر!c0~ĉn#H|LaSYraU vD,}׮] Tg OY p8***qEEň}; y(((POL&s555ϟ?'lqq3g`cǂ0MMMeX<O__ L2w$R_|`Y !\%BÇ[[[}}}`0<˗V޽[l6L}/455>}ڿ*g 10L&blU*B0~ $C&D"dBwwl1Դo߾r >}hiiqعsg7 `0!7AB  &8X1_~}Rt C:R40w}&L&S(x111uuuGIIu<>51nhmmH$*0̿;wX,WWWOOO##OYLcc_|amm})O<{pjhhxxx|jgeem޼`9r$!!a޼yC?󨨨)S(ϩSn߾=k֬yeXzuuuٳg/\ jjjLLL`Sq Bx[ҥKB>BWW.? 
.?"+Hһwo۷?g>zhٳgqG*!tڵC}ZZZX,x򥥥 l`0(8c#33'NVjΜ93gSSSzUBC ټy:::|||\$#@aM)^p86HR777GGǰGX)̧CDDDFFidTQQq1 ZKKKwYFIrs8rHCC=jff֯_?[[[ u1DҥۀD"2d ]Ȋb'b,a#HƏfSRRB]G!􊋋<})RSS߾}kll\.w/tR(jժn=[g?.((pwwoɓ'*..RTCCcС/^144r:\uuqH*~6l&:B0999--UWW{ݻٕ:::fff2</>>cذa:IOO$J]6>|XRRI$I2Çx#G7I2..w Ԧg/^BY[[l`0VX7oP]ORRҡC^zE '644߿?!!MGGgذaV^@ 8zhttt}}=uvv^j /))QWW;wn as/7.(('N\zȑ#mmmZUUU>۷omxxヂ:P__t^zQ (++/_|۶mL&ĉ/^zӦMAӇ2W~톆ӦM={61ԩS`ȩS~5wޙ3g D"_pp09 /(,,$000p…j=ڷo_mccut%%ABuֵn۶<_zn:[[۰0s=z\r v3fw}WxBq/^촶޸q#z7ocǎH.{A''#GIRcc9s|2oy޽trr:tPvv6B!d25k\|yѢEh۷oUUU"""-[F+hwW͟??Tzȑ7oL0R$1 MMͭ[Gj@p[n޽cpSahh8|Ew\I 4Γ&M)F?k׮9~5 =jW7uyͣG`َ;B7n9W_!V^ݥQII`<{ U۷oA 'ON̄MLL\\\>}:̳kjj|}}BZZZ k``@o$wB#1|D$Id9RQQ !VTTLKKٳ'B---5""] wwwAٓjpAP] c̘1쥠ꪪj``ڹs'ⴱ2d*Bh;vLwocc G>}jkk/0`W 2\4iddd ꨳ>/s)'Ͷ@+445o 3f@ihh(**~g0kχBpɒ%p_wJ}A2"USSknn>|BevAUŅjPs[[[C5Gl4c0}UPPXv-Yy1BԴ\ 'W\9^RRbdddbb BC*-- !f=7 QZZZ`l``?ٴidpp0BH__)`חo>Pll,ܹs`uqqٹskbbb9cڣGܸq#+Æ ;}tllիW.\Rt7 g7o޼e˖ +++#333ǏG 4h۶m[ly aaa555p_.!0k׮!ƏfT E 0X k!_ gLfZZI G n޼affVUUEd{{;蕅 $I\y<ڵkB Vd2/]YSS3sLrRGG咒@겵_:::@gT7o\WW'n߾  ]:9@07|SVVFdccܹs)Fdtt4Ϟ=[nB-++K$:;;CoݺE$,omm#P=qЬY222+SX:|KKH$z5#Aϝ;ܿ2&K7n,//H$O #͛7B_|\㰦%%~͛7܆ jjjx<ެYB=tt0166NHHDÆ C{ ''5; CAAѣYYYى JJJgϞ%h``khcRMMH$qH^ x$IygϞxP|1L J'''u?-gb1(yY##.M>}@Luuuw%RN:ʿ]N:!JX7fJwRrV~N#C[[Bh„ 7}Efnn.ݶ:rH ~xb/7>ê'e7fw%LX/XlXMMMaaa`1gffe+]RRR6!0Zk`o޼p8 իWg(**2t@`iid2ܹ#/h@ad~ѰL766Ջfgg B Y~~)YhDɴ˗]K\ח2[ZZmll\QQAV=JTTT(**C"oذ~Ǐچ)St6O<xիW_'D"08q*2r.7o@`jj*šXf UOn \d2/]DN_ Zx1Bh߾}$I~А^~*s$I+))dgg+ ӧ$I>|T=zT3駟`OwG5 H4p@L`>JiʕT2cƌ?-g)%Ӎ-//?qӦM/䬛B ?͛3g LXtWPSS۽{Qn޼yeeeefooO۷/MG7p{l`cǎ9;;ںsΝ`Q?tĉ  Je\6]]]/\Y N J$]]]UUղK^v gQx2bښ qnD\.Ν;%%%VVV$I-4TPoͣ_d˖-l6[MMmرoӠ~qq1A/_LKKsppf hhh PZZZ<<< $33ZH$:wɔLeGGGyW˗/1 /^}<,CBB(/SN 3fН>Zifffkk[XXx555''/^@KVTTliiA#JKK A;ٳgϞ=͛7IIIb4""f@6hР߿ܹ~~~3zQ}}=K:OQ΂9s&[t)8uQ߱'СC/^{]jر ,ήUŚ۷o3Kv `>777##CUUUFPL>~֬Y<oԩ]jYx!>g_>...//oIpM+/////חt]/_|/Oڵk _x8===FÇ]6$$dӦM C㏰5!ה1dٳ!=T?˫M&D"aX|>Cjii566"@"c¥ZM/_L !#!4[466677{xxЏVTTظɉH$o9e,..믿&I>p6 !`Gu9aHMM7n\{{̙3 YXX8m4[[[JJJӧO|#G^xʌMZˣ[KJJ |8%%%{g)9I +y3+Kƴ4H6Q8l22N*!nJ6: _~pP` ˫S}ziF|ѣY,X,f0&MJLLyfpp0&G`hkk(Ja¿lٲ].ә$(cٳ'xOnAX%n޼gxs }%}knݺGuو&N7[o zIOS JɊRYII (H#X8L1ev@@->|Ad:u*,ҭz/qƓ'O[n*}X[[s8jplH^xe˖y{{rhxxxЍѮzzz_6g9~ɓ'/^<`!d|K_>p=zBxgΜ2 D~VSSAJ!  2qjjGtuu\x02YыbOOO%%ׯ_D"{{{*{|ny;::8`hh[P.3P\p}ʕb%'':t_~/vttttt3gĉ/]$ߟ.+** Agc^nGGTVV@\ e*3֬`jj****`ӣB0==}?ڡC$/4|̘1!!!`dd[RRRUUʶnS }}}CCÆ>-- N&tap5ʕ+ w...655%e6NTt,--UTT oݺEEuvvٓ.)z葟K& *Ԑrᓤ޸qcqq,AoOusYz5I$46-c WCQ:ҥK{|hmm $vD;v͚5ВKKK[[[{EwPsիW)6cƌUVĠSqAۗM 5  .D"6]ZZ˜첲2XGErl6133;| F٥{"wėqJJJ{=_xgϊb2;MMM \.W&z!r ݻwB=z'vttӧO>}GH+ۻ Q ޕg`׮]#F3l޼BeӔO~z! 
O2ԑ+W 444M6mڴS oߦ WTT>s~ JN4zCtС߿uxjR{ ɹs?~|ARz L w[/_S;@*VIYY \eekR$&\r%Bի?ޱcA PZzҥKi8p޽{# |>t[n]~\o=<< c_7e{g3=z;w=jhhd2eާO}\d ;88ܾ};**OOOdB v_T"6ggg[`ѣ^vm\\Y,ʽ{t!W^}޽}A&IrϞ=M6LwihÇ.A KLLLll,J@r!U11vFNjkkQY}"($$ 8~hkkرC>}-ts522ruuߵkײlHt% !ϩӟ={Ij!!!̔)S Rg~gGG6TB,ӗY /^f[TTT'1Iq͞7o^ﱹ ꅆ5fSs .cE(SO$LfuUEEQFA%KC=Ţ+:::6660߆ aʊ ;w᪤i&*֜9sl|OD̙3DUU] |HE-Ydԩ<~obb/}ÇϦN:qL++#GRꫯPPP`mmME9`2$MIIyQVVN~dMLLܹsݼ</*44R~ r%#!COrLfssΝ;$ 7ṅCvFyݺu+..f՛6m ssiӦnn ,}$333aԜFT=<<OO 4)I> 7o2C\bUUUZR9Ps/`ɓe744@d6 vYiZg_kg׏s H,&](w nZg.t),0?^~600000r;tlzzz`NNNLYgepuu [l MOD]X *G 4hϞ=ׯuܹsmllbbb.\ XzȑI&-X`رMMMO> {WBcbb ]vBYJKK!BUoݺUTTo>/'==*,i|/{n||!_Y~ c0 ``0 `~L0 `0Xb0 `0Xb0 `0Xb0 ``0 ``0 ``0 `duHDIENDB`aom-3.12.1/doc/dev_guide/partition.png000066400000000000000000000772541477627663500176370ustar00rootroot00000000000000PNG  IHDR ^iD> pHYs.#.#x?v cHRMz%u0`:o_F~2IDATxu`ɕ_5 󈙙lL뵗lr/]`l \dɸfX,y$I#?zjzӯ^իWxH@ DD""BDD!""@ DD""hČ:hу`"4ǔ'ѢI>pM*Uc;\1no]U9{I-&Q ט+4G:/hAD1/~hۛH-jygN?pbTiJUj4܏R醧w8MhQw`GbԳߔ{7{0 9JOo^x㿳E+˗&Z%Z̀ĭjuڽEwߓ2>+_«iկ-8_+c'7dդV5 [4]F1KBwwaCVyI-j@ݒ,*7w1 |ŤV5 ;}ü?V17Fv#>\@upZ5 )}9g,Vh?vVE TZz{菷};Jě L2O7:ZY J=q`?7_)AXs~P1Y@&yo2@T+b3QǾ(!#u84c}`{l#C? gn=C ɟhʞn/RD5'>#x9LiU<򳊪/(bCI-r@}-jqWW SF TsT^uzRD8bO]a4nto|Q8OɄUj%Z Z0;\c߻dc"@ӤZ; _Zc.Ku"U$K *2·_uA/ +Dnxލ3˚c)@ |o.r/JEdM1cWdjJ4[j#w&Qbn^s3 ;ZTG9.9w/-xt8o%_륪_P_6YFW|S~:0az~f1P[C)wQ}iǬ7S^V+~`՟6%etiq+P(O%^ɓ{Bz2~`3$dHiwgwd(_g~ݒc(=/w٠-Å C7On,/(N@I|̇ $A38d%P(O;1RٓMַwhLg.c/ "SE WewXL>8؎vvKw Nr8^3^"!|hZ2$O LEnO~6vm:-so279 DD OEمbLq$N7YVDB[ r[;EPAqCWF.2t?ko=a]HY ;&"/+*<  [g%R3|:zg}ʾT("eoj(M ~kش"xVCVo)Q}qyͫ]Y{9l"E)G}OJQqW:7n쿬ω }1:IXN@g62~G৻=ᑿ)wkc}kĂLFU}_gKտK LGuEs!3x̗E6+d!ݵ1c傳UmZ-WɓDž|)޸GYj>]y;3 +}i9$~6iPQ-SwЫ<8AɏyR\uOX֊ -y W`v_w1+s^  w_hHa7-pF3f?=:(+Ǥ}Iyn0)_\(׋#ƫ:g?K~KVc5Z>|uK$}Sţ3i=qLé*dp[d^zd:*daZ _ (j`~[Z:#4v9}wgݲ`]|-,ςD9,Q#\IusȘƒ1C^ϏLܚA'z#9!^_ ´_ꪅUWfۿ##M{Dϼ@'ߓ{> ,Ͼv}=yWK>/+v ΏUaRO衍c}yN?_Qz\M΋;ܛ l૗*e~M}_~ymPݹ^ug%)j?@=V@rKQgGcV~쇚Φ R4'J7WO1.L+&]X9u kW.gHx9z=d}9͛aٌn4%~uYh v}̡lsy|NW_7BXϟhlI",)Z4 zwŔP ~v^NB i#:Be:k {?_ CU<ҧ={_rUT= -z[o[MG^Cp̡_ t^p?G/pwט )aGjυ516iã3}nãڽAÀFEq"P 5POjuwȐx߁0/a=:#G&Y5L+cb,%, 5A6,ȕ~ Z+aդ:M #z!O1( 4Ȓ6ۭ&5ADzn 5ADzl!5AD%"DsB%`(N\.򕜮A4gfC{8)3SRW2LE<@+A4W7`63uK#3ku>*}SC_?fΆ0ފU;cI; +@7.Ke׫c3VGW7<{ݟ"y*؅Q, m t)s٪MV  ? 2rvW^3$nm!փhN>-؇xէ?W^?0'!Aŵ._*K/nB@.ߪ"?2'D&! 
#%v{%"wև^`;]dT`" 6'ϫ ܅ Xƺr'EQHGJ9wk[=@q{v3xCx0<c1,- ޱbzt5~Qͼ ϲ?C앜]WSw>%fnTA< cX"N2;g>ˇЫr?{ݻ OvQyqڷ~'L7C]ť (aO@HȘ}}@}>zKq)d* Xt-qH ;ZvQU2h[q/vyZ "L3opNLncKJX0DJ*À›|m.sL@|~wzbV$8kTV4QO:ߊA{2N`# RuItѠH>ȉ lZc8D)mϚl (nIbnj'-qhTt>,smn @T>aZTOYXi箄ui?FS/c@aWeDžF~㏌7k"eeLw No U)N$OhYE}_\&ء V("ƲWkͅ{|EA(0Eq:TOY=|cX x6OP2?>铬O״ÿ*c#>+ZQ=bcD,&i_h #/J>?Pe M1";BSߘ ǀ#h` R1pȸDȧ0Xy`q`I  #ۧWS@E@Ю\9Dپ%RZ#S/ADzl`AѴ?hGaB)ݒ= s.ګto=j1Hj= $QHk(.RmϬNߎC#0\'̭(RYgz*tO(({NEk^*Ψޮ`dyG;k"{@z8+kᲘ;:>lDHpL'6\b<59"?3f1`D"2X~!xZ( 2ES}E:C9`jyh!%<&4 زc C:X9[$#qj9A&w?ۼ7BHa5Y<^֘e`Gx+I!,<[$"QnkpuZHޠ#;@f=[п:bFIN~п&kOg[5aQQѬ<' iZpΎ~S"A*l¶]1~?xߩUqS(6ny5ڠP5KZќBu~xb1}*9EQmm+ <ќv`?&qD] VN6.(]D`H3 {@鿤[ w0OSkvZ" KФ@qJH@E .2@0, I-"1V/vux BX~%L>XR*YQxH6E#Z,0"I:!(t XR+j PöhZIhGDA M6 PCZPQ(<DY]$}16g|ZhQg(SX2g܊$ {`4`{(VdԪJ$`fr .r`$m ̐)-Lz @Qz\P"0'\ݤb@/6g֗\* Q%n2T5ujf(NMf` # _ݫ_YUt8mY$aQ ^bZݢ̠ BE4[ +ҕE%Ui,(Vv-:IjQP4EhCd(I.H/>eNJq,`X‚ r~IGdh5D {@J@g%'v(upKi2D3(Y"{@Ҋ -kcV$ɸ[6+69/VMyfc gx@W_E4-"=@O[IoS_*Qb \   *,QVPR\kiMc̊%q6cjrmTphZ{hp&<H`k:Ks(>툜L4M}]7 Su!ã]UYH`]@ہye:~&GPȐp^⁂ [#'ȼ "jyi< rHb06[_o N1pd2%;U/],9R@Io_=@8oͩ(:{3cYgzio.A )-9';Majʤājsf}2N3ݑ%Dbג̞)zۧ["bA@*+7F@D_W+ݙAd.Rk۰i'""<]qNμ6Ga: ,|򦩝"( 9Ra%EUAef6\(yUtէI>.p?L+M/]a%&8s7w/mҭ1I:LAh1 ީk]1d ձD1H  \u>voϬު^(WLjG?k+gŴQǁY+?YS BOclSzШ{e޽Ƞr+ Z(W,*GœJN ~ jm"x!kMw)cFdګ__5W| \mJN_(X%O ,)$,IVqeS'@alt##AY$֬8[|'m몉,ҫ05T̀ !< MQRţ+7+we'7)ݡ(:0IYGY\y)O8ԿF{ӆ'(ϟ<*E ,Ftߟߞkv.{R4 ^#uIY~"7myx1|@gηW/! hE<]ؾ"UJ3yJ\"AB^ ,@C5B *{O,1k#HXG6]|UNKaabcoDfUG',rivu5;1 0ڝX -A$k5dKeݝ WmJgUryDߪjb;&?Ks$k^EΏzRo[w8M!dI]|QׄWaB0b0]w-=̃ 4Dg_~!.O/op} eȒQƯWyP)Q[D6!]%$k!k'}F1B.2&2BYBB𗕬gpjpiGB`DM{/"B;vj(pJȀNY"$D7.)qE1zjL=LY2}ܨc "ȃIBމYr(N>*iC6u$$k"O.;G)ܕQNz'V7IIc[Pq=KI(j0YM("}tl-dlޔ n&'ǢbQ$yT*q( ؝>" `jՄ8K\YMn ǥcȒpVBG؈&LKU+HVYx_ !4jA,j2+O@-@؜ x<ˣ`q{ DΛ\f @k2x)l#yH@%VnՄv}̀ 2f;BQ`G7Bg3_  ` íIO_L(mW=*YMDwo{Ktݚce"p!k] rMrz.ZEe=gNqVvd$a!,=sӆ:'$ҀJw Adl߃G>y+> (y{{;fA_ǴQZf0$`>hߟFkFd5X&Ia ϼ6S+E)G91kk(7AdvC0 s\@B/wE{gj#Q@=;s(Rt{jd33N'>Β:,fcJ?#^>F `sx&KJ{3ނQ6n뮽Zjس6}nˬգMLˍ K|(@hNhI1140uhD1qQVQ^}fmբziʆt#Xv 2Cv@2. 
.Ayx{D` #)m{ѷ0ا{d<;=GTfǹ*;'3Yqa2-!28*Ǩ|Ds<HLauiAs\mEmK#&LQ+ 13VD ׻:|Gmk悏{VQÇ.K6@7'3gELwl'xJ.nꕆ̌ {'LC<7)ǛMZyn|.~z<;Pz~<ȼY1K~%z<<.<ڐuKwT$@&]yjvx@ڻ:{ʶ`([c2uhu҇W?:O|jlq"[e=.w- ͩA!Xߩ}f0>VǑk{st82<5@\^ƺI7KٮNSlFSgա].66ÖZb^3s+Z+*!:B228'}4|fds[n&433*t?0lwXgڡVy3^_PFed-,"-g6ݧM̋b3#̚LgOͤZjțV-e5^:KW $|h[9 :K*[QY% ÷wHf3yX:*+݆%S28]uhϠ- ]157&b)>:K(8Қ،m6T\o&ȗ#̸2S }f6Դ;UbG zsLfi//RK;#)dfUp1)C,oElT;{He<GGMIKbL@JӧG- V_Z!2cH; mH8vxbk{IbsU@fZMǮkY<2:7)VIK)̜v]ۚGV[ K%/Lrj)sL ~7STNH0ۤ2myΟXyZʛ yQ3fKu`FXS> LWoWD|b}\yA7SX!SE[`!t Sω}٦6`h2CT5( XozW,|"̇ xio?ϣ\'#V |D{c{K+Ze{3|QL/Z]Goύ$O2_d:q4y`{MY};3#INJ2o-8ڕz0on=Z\ocwf+|R[E2%<5MNuܮy $}[ջQY M +'ӧջ!J˛Э$"dk?m*y8;KʸI$dvgo:-]UYSgĒT=0_k{tkN$\=ޜv N<>-,X K>J$<1rz3Hx8ߢڏ<r+adž#tix|[yMU%)FC"^>ٍ>@iS>&}_7!Sgs% M'k--Q YϞqlxtNWwt)cxAwP_RI'7#7dǾo >~C.Q'I7Xp>;G{a];֔y`KEeC' ޘIb[JH/Ե1'TБO)(I\HB(oiiS^{?ׁGϯC>i>Zynǔ櫧ypm6von(_dHs0={ۖhF /ծ̄OQ X{wmJ3M1B^(v>'xzq(n Evy0HVQzx&{mTIԼ} O<K0(z@RH%KpvHYa9Ξ:7KͩߚE&1 OvAOL/ߎ9>?/ 3oFB6{_EO 祎܏3Y[G>yL=-Zwdz<"t3'*UJʧLvfp*nm6ӑsTdt3= -^a}bsA׿/T?n08?y(%/ϵ\iu}׺d{WSڔn Gg23Žڐ3jBLBߌj~ /'.r1D{s?08_n>opWw zϟ0/;8<$Z[ =Ut!A1^ , EўǞLt]0Ǔ\7B.DafCD3LJ&77X%1rRn>Q֦Ȫӯ߫ŠrkyX;wnb<=>jxrn$`mx@N-x%k7"9nưJ}bCبQewu7F/`: hLs3]}p{=40`ݸNsl0UM59D=(Q=7\# Eyz~f'yWTayy|H2CC]'*x v~j.eZ 5 ^htioK(5kYB,[Uҕ6~{B_eIK4|GfqH(u#Z L!g6 5!zZR)ގ.ь}߉ ~;n;>՟ߞ٥ېvk{1mk{A&Q)CTzZ53=te8#DCyK:]Nv|B yG@_juzx6c86JB16dae-ݔ`v ٌnj}1,h|1[%yv% W{~7^\dULU.f"8Ө7̝uu@cV&%0ӒҎm#i#CnL[<$2hflC@XOozGG[eE6#S7slFW2J6zlv[ldj]@W?mߍA5;6< lr.iwayijr2XcW= ݙ26pa 1Tӷ 9fǻ Z z'cC0w(-)=n/nrbgɚmT ӅșuNlX JZ V]-@$k]Ri`ˮ2c{v!3/&f;0fdb7*]NQ %1KnZ+ߺR!]!+ծucxUf6;cvs z@__/06pb2V25z H8bl7o{ׅ]4od#2?8QRjRDmNڥB@_b( I ͸]xl.XVc:F!W*d+ ׺ץH8k܆lK#Zc$'dvjff{9QІj3.AZ^Bw #^P-ߔf7Uv11qFmT73Akb"ff I$ԠPJ5A {MW(7htVfwߎ<֦$~߲{ftQ0CQFOf2 B宰]47kKMZؼ۵ V.,]@]ת%WjydI]A@w[yy?"3[7C.:/f3.1fUwA)#h#<;i졛/ ۛ 2۬'+au X,yp2wA :eS<#rx D#k{"ٚ އ*Mdlu9fl8^I C֠S ,1izTI6,7[r>yБ[₰bw5u!td{6ju9YVb 6J?l(l6\6׍YcAZRˉwA }ә>f<ꝆH QI%!(*6C$HB&]BO HE-]cs  <}M=YX~dxb5`a3TZljޅJ%+U@f9V]bň&Oc(އa{z-~یE]PL q߹iK^7ܻyP4tYh]n\n:03ume%Aٌ]wඁ~x:?T_ ܢz *57m<>&"X!xtܳN/JBtZj|(v=>a{#6ҦuLAs}Ö>|z2,2Bgs Yq*I>f$>rN/R鴋~= lnEN*K! ݳb 7\i6ϒqxx@Sk ͌cs?$gf"ovVt"K C֨)eJt ҆=7.BXْ1o\, ֬2=#;BF ޶^~{6mrڌޅj3J.Rkz9M99P3Elɂa׭ ⱏ {kjnMf\H0ˋ)EgX<+aS ZfL:Tls֮%Xc8I?K!"#Zllmq3!b8ہ3d3Vj4`&. b.0 V ƀLtTvF$q+D! DA]GcYEXzpNJ(fǒ.4 V9~m-Ne!+-fAef 6i3n^:&:8$$tp#N EMW{T.Itz4\2K+ ((`j8N@3rV99 `67MqĻ Z tV0V6 kޓ]YhNb0ˈw` v6iR3 'N01-Uxe2Aq(#hB1e(/N Ί bblr\z]ue]5WJà x@ BH}ax<^H(AC؋vMѴD)*\dU,y/KLS~%L9Sr:[|Wk3 x0Bz<^as:r $jRXFj}]IhD#QĮHb =h55jP@)B.X@O!z 5 ƴM2YhLQG 3)RB(఻u)/b;.K24CIĮL( B0Rk"@͸vl}Š̮k1k$vl1ahE19^I_7`P4ML)Urᱹ{]m/2bW2,TLn@ 4rw@@)goj9^rfv\iHF= vX 0&FjTUr:`OL´ޅgQ+n׮^5Td{{ڮ!<.ZR1< 6rMK[dRJ1Iz/xyX&wK/Į̫&(|p/@O3ꥊiH^Yp06=.16Y[. M0@; oq#.rt< KKCAAh&'+K.E D>Ysq ! 
!zCъqjBjcNa0[:d R C3Npl.bW <ʌs/+9 ̞kwbSju wkkgM<46"X+Jޘ&sax02A:ި([7Į@W?K!ƍ5J*2fUkMc!1=B(2͝-3GpbrXl FT e |.C˼rb}cAP4.̌ Ƽ,6fLiN͐W_:h`N{- ZRTȤһݰ2&=Ax uvHFJhQhr9\f%W*jިjizS@ d|b},c$1n]}UCuPZbؘ|Wkkge,*&¨rqxXdVbhuZR[Į@fR 08g$@PuFlrEHk5u7P(<-XEt9m&T<*ƨ7jh>@ d9ٚڵ9  5͵91"Įt4uKO&Fg$#_&o;{eM^g*]Qr%" ~J{3.ߘp Hs3o܌NMұ yy[e3ճwA  Nslh9^&UijcVk3# +ŲRkz)MBU^w}pCQ+ZN]KLT .;"(7W5^:J͈P5wGx<`p:>Ъ &HT1ǮX܂eh1tJRK a,,֍e5w0LZj_CuyiXJAEƊ[,;4=r\">VTT`Ψdn< O 䡈3֭ΐqs3{FlFZ&ԩ-UUir"esqa"3[,6G_C,3t*R"B+hbAI,eXP^W$NHSمS*p;VT%pJ+nyURqvsgMEUy\Bfb"%haprZ,.(\Tk Za8x C@8"b''ʹE,KM=xfɍhDz}Ɠ'uQ켾Saxo01 k0RTHeW",xW$1̵!⨥"zƢ·OWm5u4\UE%$R5xNi- ω2Z5KOY?ԮHYű)wjGJ9Ŏ SX~URU+ ΜMY0͝EWTqIÄ`p-&%rJiA2z|Ȯ\k#LE8цY/m Z4@D]y7Sv87'߯\aa+򚦢áia v7`oܵ~l|ʳǃSr#I}@$[Wp{fB#~8&Y"#WsMEC3#I}@4_ 4:PuzձEZ))^Yu]c|RVKjoj8 [ ʎN[쯳V^_KB<ԊTl۫8_SP^!diRtה9NjڷȩmzuOGF5T5ޖGdFj',t[ơ]w^6d.%鸣ԱlbH G=2(*EVc7o\]PT:osymM,"'92<dam9{xL}vo/걬 T۪+ 1dȂGZ=lz[ C$Nf^kH0',}_]=49˖oOϟ8wG"tT?J d+/±>I 1@Iae;$Nn*n.//Fg'E@={\`.]"~<=]ho8"fdCB #)|4oߋU}/F>`Hbc7*++#GBJ|Iwbj.g$+Kh<~8~etfji?pɦaqԗϯbYI}SiS}@\G~ѵS`0v꼤)mybGue˪جD@><5uƶli(}dzi4mU~҉!!e\>]H>[tli#%&nV4\LLP'C'5̹wLqF?z:HrN!2?dxzׯ>?]Յ=ler:עLNilvQ$OgV3w(LSJ퇎l)TZs6;ykçۃ4Wʨ9HP>ijr1} -(Sԏ %-:{{[LoFh7NKf<9y%ebn|,,:r. @^w>}d ׯwǫ>,I8~h%٥]&~N]XQ_"HLb)2xWN~EOx!t'h? ~nR^;\y_f 9ϯ=Dzyquۍ;'ΥIvn,UjnKUUsĬ(!nE~"8[߄ 񶉃iί= UKÁykN7zDlR?gpTpGM7 5]sNxֺig} ;H33nHe^-t-`@F~@O+!:va@gيO+y[98 ϒeݴą}[=ZrQӿ,eZKmYӥԌHv-s:lVe;ٻLn}+eM{O^ˏ!vIːc˛KoF<6M[wY*'@;+lTbNzs11S)_?'+{ÇKb_ 8w>][\zUݰ䀜Vpzlji]bNlze/?޸ ȾN}l(u0* g~텥e)׎`@Vpִ"~qmdko(%NHS-gFԙo_J%/D0 La74Z6f~+h0i}$.^c#4ފ7>0"ip QWXN6  ?)_ u5jpK$1ꄝz}S+KynFJm`Ĭ y*̍p,(kDڽu7ҳɧI" /_kdw7mx RA t|,`[r<ߑ̕UK̉'NׂFXkllÃ/?0kAE_⊕tL0Sѕ*AK+Üm!L ޏqLo͑^g?mi&OaZ vPI.RaM$[|rSTSϐ)k- /;n? %/U~rzcC/]0T(e g"1$( mP7?Vu pή"7BYMCvE> >`_UuɓԜ(LIoh!Q-.[k{۽8yAo$Z @(:ʘer$XoOOB91o*^_oAZ],Pd.Յp,U6OsZ l`<4mٍTdѮ],|Po>5N ሗIoL?[q0>- Zk%MLRF"1$t _?? q菞G+>Y5g }WKP_SYC3ӣD]Wʤ/FRCGjCG^6".]|g1(֦ 7ۑ!LVUu*?̂xԊ{ZJwmx۝ķr SҜ?Z>jMNL\SVMS3#I@xoi.]5h_!,Ӷ SȪТ3=Q&Tݼ^ _3NZ"Kg)U֍5 ⏊ kUGݼA e6#u|E t;G*g5޴"Bs D9|?ro+:$ 9i1o\ryjIڬ;5Rڤ˳kzi5C_{<0INwxxh}ʝ煳 H:lS xnHېj {h݌Թ2IEb2>W_Ajb$ӛS%~s?k!A,Z~wא %  ],RRO~"HhLMY޵{1$WgX d H_2<;W/1TWFq 5,J@SJՃI 67R!`d O!򳟕q vII,|p#gƋ@aφ pⱏSB]ؿuzEQxT\Oy_.`NF!_Zy/rg~Msx!R$ ~2ߔrVlz)1w} zww@K3CƏ] PނRL!\~m3j'lZXs;.v2wezae2C˰4MtJco^{dۿP@$1+WEO;Cp%[Gca DUԧ/" tSi듍dm[1[ffƊ+p5j"=T9)m҈nzF%&V}+2om4R|DLeB1߼ DKmى7mR۪e@{W*(>y.qCz4NډIl9rcdfR|.*3"q꣯X9 "Mi0ݏK{jn6$krn|k+o?rns+o{$6U֔)slH YI_/W?3{_?􈷸r[]_= HňUJ JI\68xPbQYg 3&'WU6]JV@֪GKvg3Ǹo{;f̝lG.Ӣwl>0l.xǹ YXPJg.y.Gxdo|&xVA|\q?\s ].R3# ĺw~`\йs{}qG3Vո+ҬYcl%HL Hee\8⦽sTJ#Ņw̛mթ-UUΩtSn9'ֵ{2uJՀazg[cyiT|,z$!3k~/YZqdŪŠ=.2-myuJFbeK!/>&~sU.]_>N92$kX;J+oI34Pv03'I<}.~>*jKZ90WxZm^seUSziQS D+NdAslzzo aV幹6m7WޖHOb6{HQj{0_dst!!f)tA\V&#c40⩶~D{-9dJ|Ic7G@%&M#MۧG=@ ޓU2p, 5>x<҄@B楩s@,ت?64Y@LXM&eтL"5:oxuz~ [{ms0?? (| i{ې>l "KYEȗЙVir=3X^P H}!ʓV% 0mhR~KaL>ö~$,` i&مBijffx*/(1`Zj? T602e@0=f'QB4}@$ JkK$,>WNm. * ""@ DD""Ѽ% t N ɚ(ahi."Ҙ+3b=XB:]qRzo}r6fޞKf= !gyTθ VX3rWdf~'(b%V&ԃ^F.o. B47Hym]%Jy-f}҉ffT@WVV=2B4B\#/@rc$> {r &hfMʱ@QC=sk;E D3H0 Ɖ'I>Xfb zi Z.ů>^ьB+夻:N4$EzXD$͈D!@m+Cܱg;3[[u=K! CZ a_0R1,bLxY -!̋k㚸2a&-)B}swb@ DD""<"Z, ú(ua(B~;,SWLxn' {&o"(.B>?5lS p7d$tM@Ez9 vDdOq֕`װe [ z:[<^$2~E[ [ QLM/nQqNmu ͥr!ȩoM_7zc>"I&T߷IENDB`aom-3.12.1/doc/dev_guide/tplgfgroupdiagram.png000066400000000000000000000755561477627663500213470ustar00rootroot00000000000000PNG  IHDR IDATx^Eֆ I1 VYQ~`EYbaׄ0 Y`̒L$AfoÁo=޾Uշ]vQ @*UB.." 
[binary PNG image data from a documentation figure omitted]

aom-3.12.1/doc/img/ (developer-guide figures, SVG): vector markup omitted; recoverable captions and text labels per file:
  edge_direction.svg: per-direction sample grids for edge directions d = 0 .. d = 7
  equ_dir_search.svg, equ_dual_self_guided.svg, equ_dual_self_para.svg, equ_edge_direction.svg, equ_guided_filter.svg, equ_wiener_filter.svg: rendered equations, no recoverable text
  inter_motion_field.svg: motion field projection; labels MVref, MV0, Current frame, Reference frame 0, Reference frame 1 (R1), Reference frame of R1
  inter_obmc.svg: OBMC neighbour blocks labeled 0-4
  inter_spatial_mvp.svg: spatial MVP candidate grid, no text labels
  inter_tmvp_positions.svg: temporal MVP candidate positions B0-B6
  inter_tx_partition.svg: inter transform block partition diagram, no text labels
  intra_cfl.svg: CfL prediction flow; labels Sub-Sample, Average, Luma reconstructed samples, "AC" contribution, Scaling parameter α, Chroma DC Prediction, CfL Prediction
  intra_directional.svg: directional modes V_PRED, H_PRED, D45_PRED, D67_PRED, D113_PRED, D135_PRED, D157_PRED, D203_PRED with angle offsets +1, +2, +3, -1, -2, -3
  intra_paeth.svg: Paeth predictor references L, T, TL and Current Pixel
  intra_recursive.svg: recursive-filter intra prediction diagram, no text labels
  intra_tx_partition.svg: intra transform block partition diagram, no text labels
  loop_restoration.svg: dual self-guided loop restoration; Xr = X + α(X1 - X) + β(X2 - X)
  partition_codingblock.svg: coding block partition types PARTITION_HORZ, PARTITION_VERT, PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_VERT_A, PARTITION_VERT_B, PARTITION_HORZ_4, PARTITION_VERT_4, PARTITION_SPLIT
  primary_tap.svg: CDEF primary filter tap weights a/16 and b/16 for d = 0 .. d = 7
  quant_ac.svg, quant_dc.svg: Qstep versus Q_index curves for 8-bit, 10-bit and 12-bit AC and DC
  scc_intrabc.svg: IntraBC; Current processing block, Allowed prediction block, Restricted immediate blocks
  secondary_tap.svg: CDEF secondary filter tap weights 1/16 and 2/16 for d = 0,4 / 1,5 / 2,6 / 3,7
  tx_basis.svg: basis functions Ti(j) for transform types DCT-2, DST-4, DST-7, IDT
  tx_cands_large.svg: transform candidates by Max(width, height): 32 -> Intra DCTOnly, Inter DCTOnly, IDTX; 64 -> Intra DCTOnly, Inter DCTOnly
  tx_cands_small.svg: transform candidates by Min(width, height): 4 -> Intra DTT4, IDTX, 1DDCT, Inter ALL16; 8 -> Intra DTT4, IDTX, 1DDCT, Inter ALL16; 16 -> Intra DTT4, IDTX, Inter DTT9, IDTX, 1DDCT
  tx_chroma.svg: chroma transform (vertical/horizontal) by intra mode: DC_PRED DCT/DCT, V_PRED ADST/DCT, H_PRED DCT/ADST, D45_PRED DCT/DCT, D135_PRED ADST/ADST, D113_PRED ADST/DCT, D157_PRED DCT/ADST, D203_PRED DCT/ADST, D67_PRED ADST/DCT, SMOOTH_PRED ADST/ADST, SMOOTH_V_PRED ADST/DCT, SMOOTH_H_PRED DCT/ADST, PAETH_PRED ADST/ADST
  tx_partition.svg: transform size at current depth -> next depth: TX_4X4->TX_4X4, TX_8X8->TX_4X4, TX_16X16->TX_8X8, TX_32X32->TX_16X16, TX_64X64->TX_32X32, TX_4X8->TX_4X4, TX_8X4->TX_4X4, TX_8X16->TX_8X8, TX_16X8->TX_8X8, TX_16X32->TX_16X16, TX_32X16->TX_16X16, TX_32X64->TX_32X32, TX_64X32->TX_32X32, TX_4X16->TX_4X8, TX_16X4->TX_8X4, TX_8X32->TX_8X16, TX_32X8->TX_16X8, TX_16X64->TX_16X32, TX_64X16->TX_32X16
  tx_set.svg: vertical/horizontal transform pairs making up the sets DCTOnly, IDTX, 1DDCT, DTT4, DTT9, ALL16

aom-3.12.1/docs.cmake
#
# Copyright (c) 2017, Alliance for Open Media. All rights reserved.
#
# This source code is subject to the terms of the BSD 2 Clause License and the
# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
# not distributed with this source code in the LICENSE file, you can obtain it
# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
# License 1.0 was not distributed with this source code in the PATENTS file, you
# can obtain it at www.aomedia.org/license/patent.
#
if(AOM_DOCS_CMAKE_)
  return()
endif() # AOM_DOCS_CMAKE_
set(AOM_DOCS_CMAKE_ 1)

cmake_minimum_required(VERSION 3.16)

set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile")
set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
set(AOM_DOXYGEN_SECTIONS "av1")

set(AOM_DOXYGEN_SOURCES
    "${AOM_ROOT}/aom/aom.h"
    "${AOM_ROOT}/aom/aom_codec.h"
    "${AOM_ROOT}/aom/aom_decoder.h"
    "${AOM_ROOT}/aom/aom_encoder.h"
    "${AOM_ROOT}/aom/aom_external_partition.h"
    "${AOM_ROOT}/aom/aom_frame_buffer.h"
    "${AOM_ROOT}/aom/aom_image.h"
    "${AOM_ROOT}/aom/aom_integer.h"
    "${AOM_ROOT}/av1/common/av1_common_int.h"
    "${AOM_ROOT}/av1/common/av1_loopfilter.h"
    "${AOM_ROOT}/av1/common/blockd.h"
    "${AOM_ROOT}/av1/common/cdef.h"
    "${AOM_ROOT}/av1/common/enums.h"
    "${AOM_ROOT}/av1/common/restoration.h"
    "${AOM_ROOT}/keywords.dox"
    "${AOM_ROOT}/mainpage.dox"
    "${AOM_ROOT}/usage.dox")

if(CONFIG_AV1_DECODER)
  set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
      "${AOM_ROOT}/apps/aomdec.c"
      "${AOM_ROOT}/examples/decode_to_md5.c"
      "${AOM_ROOT}/examples/decode_with_drops.c"
      "${AOM_ROOT}/examples/simple_decoder.c")

  set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
      "Full featured decoder."
      "Frame by frame MD5 checksum."
      "Drops frames while decoding."
"Simplified decoder loop.") set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder") set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h" "${AOM_ROOT}/usage_dx.dox" "${AOM_ROOT}/av1/decoder/decoder.h") if(CONFIG_ANALYZER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/analyzer.cc") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Bitstream analyzer.") endif() if(CONFIG_INSPECTION) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/inspect.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Bitstream inspector.") endif() set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/doc/dev_guide/av1_decoder.dox") endif() if(CONFIG_AV1_ENCODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/apps/aomenc.c" "${AOM_ROOT}/examples/lossless_encoder.c" "${AOM_ROOT}/examples/set_maps.c" "${AOM_ROOT}/examples/simple_encoder.c" "${AOM_ROOT}/examples/twopass_encoder.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Full featured encoder." "Simplified lossless encoder." "Set active and ROI maps." "Simplified encoder loop." "Two-pass encoder loop.") set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/scalable_encoder.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Scalable encoder loop.") set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/svc_encoder_rtc.cc") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Layered encoder for RTC.") set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder") set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h" "${AOM_ROOT}/usage_cx.dox") set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/doc/dev_guide/av1_encoder.dox") set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom_scale/yv12config.h" "${AOM_ROOT}/av1/encoder/bitstream.h" "${AOM_ROOT}/av1/encoder/block.h" "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" "${AOM_ROOT}/av1/encoder/encode_strategy.c" "${AOM_ROOT}/av1/encoder/encode_strategy.h" "${AOM_ROOT}/av1/encoder/encodeframe.c" "${AOM_ROOT}/av1/encoder/encoder.c" "${AOM_ROOT}/av1/encoder/encoder.h" "${AOM_ROOT}/av1/encoder/encodetxb.h" "${AOM_ROOT}/av1/encoder/firstpass.h" "${AOM_ROOT}/av1/encoder/gop_structure.h" "${AOM_ROOT}/av1/encoder/interp_search.c" "${AOM_ROOT}/av1/encoder/intra_mode_search.h" "${AOM_ROOT}/av1/encoder/intra_mode_search.c" "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h" "${AOM_ROOT}/av1/encoder/lookahead.h" "${AOM_ROOT}/av1/encoder/palette.h" "${AOM_ROOT}/av1/encoder/palette.c" "${AOM_ROOT}/av1/encoder/partition_search.h" "${AOM_ROOT}/av1/encoder/partition_search.c" "${AOM_ROOT}/av1/encoder/pass2_strategy.h" "${AOM_ROOT}/av1/encoder/pass2_strategy.c" "${AOM_ROOT}/av1/encoder/pickcdef.h" "${AOM_ROOT}/av1/encoder/picklpf.h" "${AOM_ROOT}/av1/encoder/pickrst.h" "${AOM_ROOT}/av1/encoder/ratectrl.c" "${AOM_ROOT}/av1/encoder/ratectrl.h" "${AOM_ROOT}/av1/encoder/rc_utils.h" "${AOM_ROOT}/av1/encoder/rdopt.h" "${AOM_ROOT}/av1/encoder/rdopt.c" "${AOM_ROOT}/av1/encoder/speed_features.h" "${AOM_ROOT}/av1/encoder/svc_layercontext.c" "${AOM_ROOT}/av1/encoder/svc_layercontext.h" "${AOM_ROOT}/av1/encoder/temporal_filter.h" "${AOM_ROOT}/av1/encoder/temporal_filter.c" "${AOM_ROOT}/av1/encoder/tpl_model.h" "${AOM_ROOT}/av1/encoder/tx_search.h" 
"${AOM_ROOT}/av1/encoder/txb_rdopt.h" "${AOM_ROOT}/av1/encoder/var_based_part.h" "${AOM_ROOT}/av1/encoder/nonrd_opt.h" "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c") endif() if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/aom_cx_set_ref.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Set encoder reference frame.") endif() if(CONFIG_AV1_ENCODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/lightfield_encoder.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Lightfield encoder example.") endif() if(CONFIG_AV1_DECODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Lightfield tile list decoder example.") endif() if(CONFIG_AV1_DECODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/lightfield_decoder.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Lightfield decoder example.") endif() if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c") set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} "Lightfield bitstream parsing example.") endif() # Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE # as values assigned to $var_name with no line breaks between list items. # Appends a new line after the entire config variable is expanded. function(write_cmake_list_to_doxygen_config_var var_name list_name) unset(output_string) foreach(list_item ${${list_name}}) set(output_string "${output_string} ${list_item} ") endforeach() string(STRIP "${output_string}" output_string) file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n") endfunction() function(get_name file_path name_var) get_filename_component(file_basename ${file_path} NAME) get_filename_component(${name_var} ${file_basename} NAME_WE) set(${name_var} ${${name_var}} PARENT_SCOPE) endfunction() function(setup_documentation_targets) # Sanity check: the lengths of these lists must match. list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources) list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs) if(NOT ${num_sources} EQUAL ${num_descs}) message(FATAL_ERROR "Unequal example and description totals.") endif() # Take the list of examples and produce example_basename.dox for each file in # the list. file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}") foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES}) unset(example_basename) get_name("${example_file}" "example_name") set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox") set(dox_string "/*!\\page example_${example_name} ${example_name}\n") set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n") file(WRITE "${example_dox}" ${dox_string}) set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}") endforeach() # Generate samples.dox, an index page that refers to the example_basename.dox # files that were just created. set(samples_header " /*!\\page samples Sample Code This SDK includes a number of sample applications. Each sample documents a feature of the SDK in both prose and the associated C code. The following samples are included: ") set(utils_desc " In addition, the SDK contains a number of utilities. 
Since these utilities are built upon the concepts described in the sample code listed above, they are not documented in pieces like the samples are. Their source is included here for reference. The following utilities are included: ") # Write the description for the samples section. set(samples_dox "${AOM_CONFIG_DIR}/samples.dox") file(WRITE "${samples_dox}" "${samples_header}\n") # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by # AV1's doxygen setup. math(EXPR max_example_index "${num_sources} - 1") foreach(NUM RANGE ${max_example_index}) list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name) get_name("${ex_name}" "ex_name") # AV1's doxygen lists aomdec and aomenc as utils apart from the examples. # Save the indexes for another pass. if("${ex_name}" MATCHES "aomdec\|aomenc") set(util_indexes "${util_indexes}" "${NUM}") continue() endif() list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc) file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") endforeach() # Write the description and index for the utils. file(APPEND "${samples_dox}" "${utils_desc}\n") foreach(util_index ${util_indexes}) list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name) get_name("${ex_name}" "ex_name") list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc) file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") endforeach() file(APPEND "${samples_dox}" "*/") # Add $samples_dox to the doxygen inputs. get_filename_component(samples_dox ${samples_dox} NAME) set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox}) # There are issues to show Markdown file for old Doxygen version. Here, only # enable Markdown support for 1.8.16 or newer. if(${DOXYGEN_VERSION_VALUE} GREATER_EQUAL 1008016) set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_md_support") set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/README.md") # Uncomment and add AlgorithmDescription.md in result page when it is done. # set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} # "${AOM_ROOT}/doc/AlgorithmDescription.md") endif() # Generate libaom's doxyfile. file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n") file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data) file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data}) file(APPEND "${AOM_DOXYFILE}" "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n") file(APPEND "${AOM_DOXYFILE}" "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n") file(APPEND "${AOM_DOXYFILE}" "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n") write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES") write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS" "AOM_DOXYGEN_SECTIONS") # Add AOMedia logo. set(aom_logo "aomedia_logo_200.png") configure_file(${AOM_ROOT}/${aom_logo} ${AOM_CONFIG_DIR}/${aom_logo} COPYONLY) file(APPEND "${AOM_DOXYFILE}" "PROJECT_LOGO = ${AOM_CONFIG_DIR}/${aom_logo}\n") # Only set HAVE_DOT to YES if dot tool is found. if(DOXYGEN_DOT_FOUND) file(APPEND "${AOM_DOXYFILE}" "HAVE_DOT = YES\n") file(APPEND "${AOM_DOXYFILE}" "DOT_GRAPH_MAX_NODES = 10000\n") endif() # Add image path. file(APPEND "${AOM_DOXYFILE}" "IMAGE_PATH += ${AOM_ROOT}/doc/dev_guide\n") # Allow banner style comments file(APPEND "${AOM_DOXYFILE}" "JAVADOC_BANNER = YES") # Add the doxygen generation rule. 
add_custom_target(docs ALL COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}" DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES} ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_DOXYGEN_CONFIG_TEMPLATE}" SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES} ${AOM_DOXYGEN_EXAMPLE_SOURCES} "${AOM_DOXYGEN_CONFIG_TEMPLATE}") endfunction() aom-3.12.1/examples/000077500000000000000000000000001477627663500142175ustar00rootroot00000000000000aom-3.12.1/examples/analyzer.cc000066400000000000000000000514441477627663500163630ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "av1/common/av1_common_int.h" #include "av1/decoder/accounting.h" #include "av1/decoder/inspection.h" #include "common/tools_common.h" #include "common/video_reader.h" #define OD_SIGNMASK(a) (-((a) < 0)) #define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) #define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y)) enum { OD_LUMA_MASK = 1 << 0, OD_CB_MASK = 1 << 1, OD_CR_MASK = 1 << 2, OD_ALL_MASK = OD_LUMA_MASK | OD_CB_MASK | OD_CR_MASK }; class AV1Decoder { private: FILE *input; wxString path; AvxVideoReader *reader; const AvxVideoInfo *info; insp_frame_data frame_data; aom_codec_ctx_t codec; bool show_padding; public: aom_image_t *image; int frame; int plane_mask; AV1Decoder(); ~AV1Decoder(); bool open(const wxString &path); void close(); bool step(); int getWidthPadding() const; int getHeightPadding() const; void togglePadding(); int getWidth() const; int getHeight() const; bool getAccountingStruct(Accounting **acct); bool setInspectionCallback(); static void inspect(void *decoder, void *data); }; AV1Decoder::AV1Decoder() : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL), frame(0) {} AV1Decoder::~AV1Decoder() {} void AV1Decoder::togglePadding() { show_padding = !show_padding; } bool AV1Decoder::open(const wxString &path) { reader = aom_video_reader_open(path.mb_str()); if (!reader) { fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data()); return false; } this->path = path; info = aom_video_reader_get_info(reader); decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) { fprintf(stderr, "Unknown input codec."); return false; } printf("Using %s\n", aom_codec_iface_name(decoder)); if (aom_codec_dec_init(&codec, decoder, NULL, 0)) { fprintf(stderr, "Failed to initialize decoder."); return false; } ifd_init(&frame_data, info->frame_width, info->frame_height); setInspectionCallback(); return true; } void AV1Decoder::close() {} bool AV1Decoder::step() { if (aom_video_reader_read_frame(reader)) { size_t frame_size; const unsigned char *frame_data; frame_data = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) { fprintf(stderr, "Failed to decode frame."); return false; } else { aom_codec_iter_t iter = NULL; image = aom_codec_get_frame(&codec, &iter); if (image != NULL) { frame++; return true; } return false; } } return false; } 
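// A minimal usage sketch for the AV1Decoder wrapper defined above: the
// wxWidgets panel below drives these calls from menu events, but the same
// open-then-step sequence in a plain loop would be
//
//   AV1Decoder dec;
//   if (dec.open(path)) {      // opens the IVF reader and the AV1 decoder
//     while (dec.step()) {     // decodes one frame into dec.image
//       // examine dec.image / dec.frame, or query the accounting data here
//     }
//   }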
int AV1Decoder::getWidth() const { return info->frame_width + 2 * getWidthPadding(); } int AV1Decoder::getWidthPadding() const { return show_padding ? AOMMAX(info->frame_width + 16, ALIGN_POWER_OF_TWO(info->frame_width, 6)) - info->frame_width : 0; } int AV1Decoder::getHeight() const { return info->frame_height + 2 * getHeightPadding(); } int AV1Decoder::getHeightPadding() const { return show_padding ? AOMMAX(info->frame_height + 16, ALIGN_POWER_OF_TWO(info->frame_height, 6)) - info->frame_height : 0; } bool AV1Decoder::getAccountingStruct(Accounting **accounting) { return aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) == AOM_CODEC_OK; } bool AV1Decoder::setInspectionCallback() { aom_inspect_init ii; ii.inspect_cb = AV1Decoder::inspect; ii.inspect_ctx = (void *)this; return aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii) == AOM_CODEC_OK; } void AV1Decoder::inspect(void *pbi, void *data) { AV1Decoder *decoder = (AV1Decoder *)data; ifd_inspect(&decoder->frame_data, pbi, 0); } #define MIN_ZOOM (1) #define MAX_ZOOM (4) class AnalyzerPanel : public wxPanel { DECLARE_EVENT_TABLE() private: AV1Decoder decoder; const wxString path; int zoom; unsigned char *pixels; const bool bit_accounting; double *bpp_q3; int plane_mask; // The display size is the decode size, scaled by the zoom. int getDisplayWidth() const; int getDisplayHeight() const; bool updateDisplaySize(); void computeBitsPerPixel(); public: AnalyzerPanel(wxWindow *parent, const wxString &path, const bool bit_accounting); ~AnalyzerPanel(); bool open(const wxString &path); void close(); void render(); void togglePadding(); bool nextFrame(); void refresh(); int getZoom() const; bool setZoom(int zoom); void setShowPlane(bool show_plane, int mask); void onPaint(wxPaintEvent &event); // NOLINT }; BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel) EVT_PAINT(AnalyzerPanel::onPaint) END_EVENT_TABLE() AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path, const bool bit_accounting) : wxPanel(parent), path(path), zoom(0), pixels(NULL), bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {} AnalyzerPanel::~AnalyzerPanel() { close(); } void AnalyzerPanel::setShowPlane(bool show_plane, int mask) { if (show_plane) { plane_mask |= mask; } else { plane_mask &= ~mask; } } void AnalyzerPanel::render() { aom_image_t *img = decoder.image; const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH); int y_stride = img->stride[0] >> hbd; int cb_stride = img->stride[1] >> hbd; int cr_stride = img->stride[2] >> hbd; int p_stride = 3 * getDisplayWidth(); unsigned char *y_row = img->planes[0]; unsigned char *cb_row = img->planes[1]; unsigned char *cr_row = img->planes[2]; uint16_t *y_row16 = reinterpret_cast(y_row); uint16_t *cb_row16 = reinterpret_cast(cb_row); uint16_t *cr_row16 = reinterpret_cast(cr_row); unsigned char *p_row = pixels; int y_width_padding = decoder.getWidthPadding(); int cb_width_padding = y_width_padding >> 1; int cr_width_padding = y_width_padding >> 1; int y_height_padding = decoder.getHeightPadding(); int cb_height_padding = y_height_padding >> 1; int cr_height_padding = y_height_padding >> 1; for (int j = 0; j < decoder.getHeight(); j++) { unsigned char *y = y_row - y_stride * y_height_padding; unsigned char *cb = cb_row - cb_stride * cb_height_padding; unsigned char *cr = cr_row - cr_stride * cr_height_padding; uint16_t *y16 = y_row16 - y_stride * y_height_padding; uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding; uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding; unsigned char *p 
= p_row; for (int i = 0; i < decoder.getWidth(); i++) { int64_t yval; int64_t cbval; int64_t crval; int pmask; unsigned rval; unsigned gval; unsigned bval; if (hbd) { yval = *(y16 - y_width_padding); cbval = *(cb16 - cb_width_padding); crval = *(cr16 - cr_width_padding); } else { yval = *(y - y_width_padding); cbval = *(cb - cb_width_padding); crval = *(cr - cr_width_padding); } pmask = plane_mask; if (pmask & OD_LUMA_MASK) { yval -= 16; } else { yval = 128; } cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128); crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128); /*This is intentionally slow and very accurate.*/ rval = OD_CLAMPI( 0, (int32_t)OD_DIV_ROUND( 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL), 65535); gval = OD_CLAMPI(0, (int32_t)OD_DIV_ROUND(2916394880000LL * yval - 534117096223LL * cbval - 1334761232047LL * crval, 9745792000LL), 65535); bval = OD_CLAMPI( 0, (int32_t)OD_DIV_ROUND( 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL), 65535); unsigned char *px_row = p; for (int v = 0; v < zoom; v++) { unsigned char *px = px_row; for (int u = 0; u < zoom; u++) { *(px + 0) = (unsigned char)(rval >> 8); *(px + 1) = (unsigned char)(gval >> 8); *(px + 2) = (unsigned char)(bval >> 8); px += 3; } px_row += p_stride; } if (hbd) { int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift); y16++; cb16 += dc; cr16 += dc; } else { int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift); y++; cb += dc; cr += dc; } p += zoom * 3; } int dc = -((j & 1) | (1 - img->y_chroma_shift)); if (hbd) { y_row16 += y_stride; cb_row16 += dc & cb_stride; cr_row16 += dc & cr_stride; } else { y_row += y_stride; cb_row += dc & cb_stride; cr_row += dc & cr_stride; } p_row += zoom * p_stride; } } void AnalyzerPanel::computeBitsPerPixel() { Accounting *acct; double bpp_total; int totals_q3[MAX_SYMBOL_TYPES] = { 0 }; int sym_count[MAX_SYMBOL_TYPES] = { 0 }; decoder.getAccountingStruct(&acct); for (int j = 0; j < decoder.getHeight(); j++) { for (int i = 0; i < decoder.getWidth(); i++) { bpp_q3[j * decoder.getWidth() + i] = 0.0; } } bpp_total = 0; for (int i = 0; i < acct->syms.num_syms; i++) { AccountingSymbol *s; s = &acct->syms.syms[i]; totals_q3[s->id] += s->bits; sym_count[s->id] += s->samples; } printf("=== Frame: %-3i ===\n", decoder.frame - 1); for (int i = 0; i < acct->syms.dictionary.num_strs; i++) { if (totals_q3[i]) { printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i], (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]); } } printf("\n"); } void AnalyzerPanel::togglePadding() { decoder.togglePadding(); updateDisplaySize(); } bool AnalyzerPanel::nextFrame() { if (decoder.step()) { refresh(); return true; } return false; } void AnalyzerPanel::refresh() { if (bit_accounting) { computeBitsPerPixel(); } render(); } int AnalyzerPanel::getDisplayWidth() const { return zoom * decoder.getWidth(); } int AnalyzerPanel::getDisplayHeight() const { return zoom * decoder.getHeight(); } bool AnalyzerPanel::updateDisplaySize() { unsigned char *p = (unsigned char *)malloc( sizeof(*p) * 3 * getDisplayWidth() * getDisplayHeight()); if (p == NULL) { return false; } free(pixels); pixels = p; SetSize(getDisplayWidth(), getDisplayHeight()); return true; } bool AnalyzerPanel::open(const wxString &path) { if (!decoder.open(path)) { return false; } if (!setZoom(MIN_ZOOM)) { return false; } if (bit_accounting) { bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() * decoder.getHeight()); if (bpp_q3 == NULL) { fprintf(stderr, "Could not allocate memory for bit 
accounting\n"); close(); return false; } } if (!nextFrame()) { close(); return false; } SetFocus(); return true; } void AnalyzerPanel::close() { decoder.close(); free(pixels); pixels = NULL; free(bpp_q3); bpp_q3 = NULL; } int AnalyzerPanel::getZoom() const { return zoom; } bool AnalyzerPanel::setZoom(int z) { if (z <= MAX_ZOOM && z >= MIN_ZOOM && zoom != z) { int old_zoom = zoom; zoom = z; if (!updateDisplaySize()) { zoom = old_zoom; return false; } return true; } return false; } void AnalyzerPanel::onPaint(wxPaintEvent &) { wxBitmap bmp(wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true)); wxBufferedPaintDC dc(this, bmp); } class AnalyzerFrame : public wxFrame { DECLARE_EVENT_TABLE() private: AnalyzerPanel *panel; const bool bit_accounting; wxMenu *fileMenu; wxMenu *viewMenu; wxMenu *playbackMenu; public: AnalyzerFrame(const bool bit_accounting); // NOLINT void onOpen(wxCommandEvent &event); // NOLINT void onClose(wxCommandEvent &event); // NOLINT void onQuit(wxCommandEvent &event); // NOLINT void onTogglePadding(wxCommandEvent &event); // NOLINT void onZoomIn(wxCommandEvent &event); // NOLINT void onZoomOut(wxCommandEvent &event); // NOLINT void onActualSize(wxCommandEvent &event); // NOLINT void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT void onNextFrame(wxCommandEvent &event); // NOLINT void onGotoFrame(wxCommandEvent &event); // NOLINT void onRestart(wxCommandEvent &event); // NOLINT void onAbout(wxCommandEvent &event); // NOLINT bool open(const wxString &path); bool setZoom(int zoom); void updateViewMenu(); }; enum { wxID_NEXT_FRAME = 6000, wxID_SHOW_Y, wxID_SHOW_U, wxID_SHOW_V, wxID_GOTO_FRAME, wxID_RESTART, wxID_ACTUAL_SIZE, wxID_PADDING }; BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame) EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen) EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose) EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit) EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding) EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn) EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut) EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize) EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame) EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame) EVT_MENU(wxID_RESTART, AnalyzerFrame::onRestart) EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout) END_EVENT_TABLE() AnalyzerFrame::AnalyzerFrame(const bool bit_accounting) : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition, wxDefaultSize, wxDEFAULT_FRAME_STYLE), panel(NULL), bit_accounting(bit_accounting) { wxMenuBar *mb = new wxMenuBar(); fileMenu = new wxMenu(); fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file")); fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file")); fileMenu->Enable(wxID_CLOSE, false); fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program")); mb->Append(fileMenu, _("&File")); wxAcceleratorEntry entries[2]; entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN); entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT); wxAcceleratorTable accel(2, entries); this->SetAcceleratorTable(accel); viewMenu = new wxMenu(); +viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"), _("Show padding")); viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image 
size")); viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size")); viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"), _("Actual size of the frame")); viewMenu->AppendSeparator(); viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"), _("Show Y plane")); viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"), _("Show U plane")); viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"), _("Show V plane")); mb->Append(viewMenu, _("&View")); playbackMenu = new wxMenu(); playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."), _("Go to next frame")); /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"), _("Set video to frame 0")); playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"), _("Go to frame number"));*/ mb->Append(playbackMenu, _("&Playback")); wxMenu *helpMenu = new wxMenu(); helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog")); mb->Append(helpMenu, _("&Help")); SetMenuBar(mb); CreateStatusBar(1); } void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) { wxFileDialog openFileDialog(this, _("Open file"), wxEmptyString, wxEmptyString, _("AV1 files (*.ivf)|*.ivf"), wxFD_OPEN | wxFD_FILE_MUST_EXIST); if (openFileDialog.ShowModal() != wxID_CANCEL) { open(openFileDialog.GetPath()); } } void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {} void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); } void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) { panel->togglePadding(); SetClientSize(panel->GetSize()); panel->render(); panel->Refresh(); } void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) { setZoom(panel->getZoom() + 1); } void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) { setZoom(panel->getZoom() - 1); } void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) { setZoom(MIN_ZOOM); } void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) { // NOLINT GetMenuBar()->Check(event.GetId(), event.IsChecked()); updateViewMenu(); } void AnalyzerFrame::onResetAndToggleViewMenuCheckBox( wxCommandEvent &event) { // NOLINT int id = event.GetId(); if (id != wxID_SHOW_Y && id != wxID_SHOW_U && id != wxID_SHOW_V) { GetMenuBar()->Check(wxID_SHOW_Y, true); GetMenuBar()->Check(wxID_SHOW_U, true); GetMenuBar()->Check(wxID_SHOW_V, true); } onToggleViewMenuCheckBox(event); } void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) { panel->nextFrame(); panel->Refresh(false); } void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {} void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {} void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) { wxAboutDialogInfo info; info.SetName(_("AV1 Bitstream Analyzer")); info.SetVersion(_("0.1-beta")); info.SetDescription( _("This program implements a bitstream analyzer for AV1")); info.SetCopyright( wxT("(C) 2017 Alliance for Open Media ")); wxAboutBox(info); } bool AnalyzerFrame::open(const wxString &path) { panel = new AnalyzerPanel(this, path, bit_accounting); if (panel->open(path)) { SetClientSize(panel->GetSize()); return true; } else { delete panel; return false; } } bool AnalyzerFrame::setZoom(int zoom) { if (panel->setZoom(zoom)) { GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM); GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM); GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM); SetClientSize(panel->GetSize()); panel->render(); panel->Refresh(); return true; } return false; } void AnalyzerFrame::updateViewMenu() { 
panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK); panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK); panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK); SetClientSize(panel->GetSize()); panel->render(); panel->Refresh(false); } class Analyzer : public wxApp { private: AnalyzerFrame *frame; public: void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT }; static const wxCmdLineEntryDesc CMD_LINE_DESC[] = { { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."), wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP }, { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"), wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL }, { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING, wxCMD_LINE_PARAM_OPTIONAL }, { wxCMD_LINE_NONE } }; void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT parser.SetDesc(CMD_LINE_DESC); parser.SetSwitchChars(_("-")); } bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) { // NOLINT bool bit_accounting = parser.Found(_("a")); if (bit_accounting && !CONFIG_ACCOUNTING) { fprintf(stderr, "Bit accounting support not found. " "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n"); return false; } frame = new AnalyzerFrame(parser.Found(_("a"))); frame->Show(); if (parser.GetParamCount() > 0) { return frame->open(parser.GetParam(0)); } return true; } void usage_exit(void) { fprintf(stderr, "uhh\n"); exit(EXIT_FAILURE); } IMPLEMENT_APP(Analyzer) aom-3.12.1/examples/aom_cx_set_ref.c000066400000000000000000000311501477627663500173400ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // AV1 Set Reference Frame // ============================ // // This is an example demonstrating how to overwrite the AV1 encoder's // internal reference frame. In the sample we set the last frame to the // current frame. This technique could be used to bounce between two cameras. // // The decoder would also have to set the reference frame to the same value // on the same frame, or the video will become corrupt. The 'test_decode' // variable is set to 1 in this example that tests if the encoder and decoder // results are matching. // // Usage // ----- // This example encodes a raw video. And the last argument passed in specifies // the frame number to update the reference frame on. For example, run // examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30 // The parameter is parsed as follows: // // // Extra Variables // --------------- // This example maintains the frame number passed on the command line // in the `update_frame_num` variable. // // // Configuration // ------------- // // The reference frame is updated on the frame specified on the command // line. // // Observing The Effects // --------------------- // The encoder and decoder results should be matching when the same reference // frame setting operation is done in both encoder and decoder. Otherwise, // the encoder/decoder mismatch would be seen. 
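//
// Reference Update Sketch
// -----------------------
// A minimal sketch (kept in comments) of the codec control this example is
// built around, assuming the AV1_SET_REFERENCE control and the
// av1_ref_frame_t struct declared in aom/aom.h; the update itself happens
// later, on the frame selected by update_frame_num:
//
//   av1_ref_frame_t ref;
//   ref.idx = 0;               /* reference slot to overwrite */
//   ref.use_external_ref = 0;  /* encoder side: use the internal buffer */
//   ref.img = ext_ref;         /* frame content to place in that slot */
//   if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
//     die_codec(&ecodec, "Failed to set encoder reference frame");
//
// The decoder has to issue the matching control on the same frame number,
// otherwise the encode/decode mismatch check in this example reports an
// error.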
#include #include #include #include "aom/aom_decoder.h" #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "aom_scale/yv12config.h" #include "common/tools_common.h" #include "common/video_writer.h" #include "examples/encoder_util.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n", exec_name); exit(EXIT_FAILURE); } static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, unsigned int frame_out, int *mismatch_seen) { aom_image_t enc_img, dec_img; if (*mismatch_seen) return; /* Get the internal reference frame */ if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img)) die_codec(encoder, "Failed to get encoder reference frame"); if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img)) die_codec(decoder, "Failed to get decoder reference frame"); if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t enc_hbd_img; aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, enc_img.d_w, enc_img.d_h, 16); aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); enc_img = enc_hbd_img; } if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t dec_hbd_img; aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, dec_img.d_w, dec_img.d_h, 16); aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); dec_img = dec_hbd_img; } } if (!aom_compare_img(&enc_img, &dec_img)) { int y[4], u[4], v[4]; if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); } else { aom_find_mismatch(&enc_img, &dec_img, y, u, v); } printf( "Encode/decode mismatch on frame %u at" " Y[%d, %d] {%d/%d}," " U[%d, %d] {%d/%d}," " V[%d, %d] {%d/%d}", frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); *mismatch_seen = 1; } aom_img_free(&enc_img); aom_img_free(&dec_img); } static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img, unsigned int frame_in, AvxVideoWriter *writer, int test_decode, aom_codec_ctx_t *dcodec, unsigned int *frame_out, int *mismatch_seen, aom_image_t *ext_ref) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; int got_data; const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0); if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); got_data = 0; while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; ++*frame_out; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(ecodec, "Failed to write compressed frame"); } printf(keyframe ? "K" : "."); fflush(stdout); got_data = 1; // Decode 1 frame. if (test_decode) { if (aom_codec_decode(dcodec, pkt->data.frame.buf, (unsigned int)pkt->data.frame.sz, NULL)) die_codec(dcodec, "Failed to decode frame."); // Copy out first decoded frame, and use it as reference later. 
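      // AV1_COPY_NEW_FRAME_IMAGE snapshots the decoder's first output into
      // ext_ref; main() later hands this same image to AV1_SET_REFERENCE on
      // both the encoder and the decoder so the two sides stay in sync.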
if (*frame_out == 1 && ext_ref != NULL) if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref)) die_codec(dcodec, "Failed to get decoder new frame"); } } } // Mismatch checking if (got_data && test_decode) { testing_decode(ecodec, dcodec, *frame_out, mismatch_seen); } return got_pkts; } int main(int argc, char **argv) { FILE *infile = NULL; // Encoder aom_codec_ctx_t ecodec; aom_codec_enc_cfg_t cfg; unsigned int frame_in = 0; aom_image_t raw; aom_image_t raw_shift; aom_image_t ext_ref; aom_codec_err_t res; AvxVideoInfo info; AvxVideoWriter *writer = NULL; int flags = 0; int allocated_raw_shift = 0; aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420; aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; // Test encoder/decoder mismatch. int test_decode = 1; // Decoder aom_codec_ctx_t dcodec; unsigned int frame_out = 0; // The frame number to set reference frame on unsigned int update_frame_num = 0; int mismatch_seen = 0; const int fps = 30; const int bitrate = 500; const char *codec_arg = NULL; const char *width_arg = NULL; const char *height_arg = NULL; const char *infile_arg = NULL; const char *outfile_arg = NULL; const char *update_frame_num_arg = NULL; unsigned int limit = 0; exec_name = argv[0]; // Clear explicitly, as simply assigning "{ 0 }" generates // "missing-field-initializers" warning in some compilers. memset(&ecodec, 0, sizeof(ecodec)); memset(&cfg, 0, sizeof(cfg)); memset(&info, 0, sizeof(info)); if (argc < 7) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; height_arg = argv[3]; infile_arg = argv[4]; outfile_arg = argv[5]; update_frame_num_arg = argv[6]; aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); if (!encoder) die("Unsupported codec."); update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0); // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are // allocated while calling aom_codec_encode(), thus, setting reference for // 1st frame isn't supported. if (update_frame_num <= 1) { die("Couldn't parse frame number '%s'\n", update_frame_num_arg); } if (argc > 7) { limit = (unsigned int)strtoul(argv[7], NULL, 0); if (update_frame_num > limit) die("Update frame number couldn't larger than limit\n"); } info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = (int)strtol(width_arg, NULL, 0); info.frame_height = (int)strtol(height_arg, NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; if (info.frame_width <= 0 || info.frame_height <= 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } // In this test, the bit depth of input video is 8-bit, and the input format // is AOM_IMG_FMT_I420. if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) { die("Failed to allocate image."); } if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; // Allocate memory with the border so that it can be used as a reference. 
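  // The border matches AOM_DEC_BORDER_IN_PIXELS, the padding the decoder
  // keeps around its own reference buffers, so this externally allocated
  // image can be passed to AV1_SET_REFERENCE later on.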
if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width, info.frame_height, 32, 8, AOM_DEC_BORDER_IN_PIXELS)) { die("Failed to allocate image."); } printf("Using %s\n", aom_codec_iface_name(encoder)); #if CONFIG_REALTIME_ONLY res = aom_codec_enc_config_default(encoder, &cfg, 1); #else res = aom_codec_enc_config_default(encoder, &cfg, 0); #endif if (res) die_codec(&ecodec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; cfg.rc_target_bitrate = bitrate; cfg.g_lag_in_frames = 3; cfg.g_bit_depth = AOM_BITS_8; flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING) ? AOM_CODEC_USE_HIGHBITDEPTH : 0; writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); if (aom_codec_enc_init(&ecodec, encoder, &cfg, flags)) die("Failed to initialize encoder"); // Disable alt_ref. if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0)) die_codec(&ecodec, "Failed to set enable auto alt ref"); if (test_decode) { aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(codec_arg); if (aom_codec_dec_init(&dcodec, decoder, NULL, 0)) die("Failed to initialize decoder."); } // Encode frames. while (aom_img_read(&raw, infile)) { if (limit && frame_in >= limit) break; aom_image_t *frame_to_encode; if (FORCE_HIGHBITDEPTH_DECODING) { // Need to allocate larger buffer to use hbd internal. int input_shift = 0; if (!allocated_raw_shift) { aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH, info.frame_width, info.frame_height, 32); allocated_raw_shift = 1; } aom_img_upshift(&raw_shift, &raw, input_shift); frame_to_encode = &raw_shift; } else { frame_to_encode = &raw; } if (update_frame_num > 1 && frame_out + 1 == update_frame_num) { av1_ref_frame_t ref; ref.idx = 0; ref.use_external_ref = 0; ref.img = ext_ref; // Set reference frame in encoder. if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref)) die_codec(&ecodec, "Failed to set encoder reference frame"); printf(" "); #if CONFIG_REALTIME_ONLY // Set cpu speed in encoder. if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7)) die_codec(&ecodec, "Failed to set cpu speed"); #endif // If set_reference in decoder is commented out, the enc/dec mismatch // would be seen. if (test_decode) { ref.use_external_ref = 1; if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref)) die_codec(&dcodec, "Failed to set decoder reference frame"); } } encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode, &dcodec, &frame_out, &mismatch_seen, &ext_ref); frame_in++; if (mismatch_seen) break; } // Flush encoder. 
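  // Passing a NULL image drains any pictures still buffered by
  // g_lag_in_frames; keep calling until the encoder stops returning packets.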
if (!mismatch_seen) while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec, &frame_out, &mismatch_seen, NULL)) { } printf("\n"); fclose(infile); printf("Processed %u frames.\n", frame_out); if (test_decode) { if (!mismatch_seen) printf("Encoder/decoder results are matching.\n"); else printf("Encoder/decoder results are NOT matching.\n"); } if (test_decode) if (aom_codec_destroy(&dcodec)) die_codec(&dcodec, "Failed to destroy decoder"); if (allocated_raw_shift) aom_img_free(&raw_shift); aom_img_free(&ext_ref); aom_img_free(&raw); if (aom_codec_destroy(&ecodec)) die_codec(&ecodec, "Failed to destroy encoder."); aom_video_writer_close(writer); return EXIT_SUCCESS; } aom-3.12.1/examples/av1_dec_fuzzer.cc000066400000000000000000000060761477627663500174460ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* * See build_av1_dec_fuzzer.sh for building instructions. */ #include #include #include #include #include #include #include "config/aom_config.h" #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "aom_ports/mem_ops.h" #define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ #define IVF_FILE_HDR_SZ 32 extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { if (size <= IVF_FILE_HDR_SZ) { return 0; } // Abusing the four unused bytes at the end of the IVF file header as a source // of random bits. unsigned int tile_mode = (data[IVF_FILE_HDR_SZ - 1] & 2) != 0; unsigned int ext_tile_debug = (data[IVF_FILE_HDR_SZ - 1] & 4) != 0; unsigned int is_annexb = (data[IVF_FILE_HDR_SZ - 1] & 8) != 0; int output_all_layers = (data[IVF_FILE_HDR_SZ - 1] & 0x10) != 0; int operating_point = data[IVF_FILE_HDR_SZ - 2] & 0x1F; aom_codec_iface_t *codec_interface = aom_codec_av1_dx(); aom_codec_ctx_t codec; // Set thread count in the range [1, 64]. 
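  // The low six bits of the first byte past the IVF header pick the count,
  // e.g. a byte of 0x27 yields (0x27 & 0x3f) + 1 = 40 threads, so the fuzzer
  // also exercises the multi-threaded decode paths.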
const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { return 0; } AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, tile_mode); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, ext_tile_debug); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, is_annexb); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, output_all_layers); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OPERATING_POINT, operating_point); data += IVF_FILE_HDR_SZ; size -= IVF_FILE_HDR_SZ; while (size > IVF_FRAME_HDR_SZ) { size_t frame_size = mem_get_le32(data); size -= IVF_FRAME_HDR_SZ; data += IVF_FRAME_HDR_SZ; frame_size = std::min(size, frame_size); aom_codec_stream_info_t stream_info; stream_info.is_annexb = is_annexb; aom_codec_err_t err = aom_codec_peek_stream_info(codec_interface, data, size, &stream_info); static_cast(err); err = aom_codec_decode(&codec, data, frame_size, nullptr); static_cast(err); aom_codec_iter_t iter = nullptr; aom_image_t *img = nullptr; while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) { } data += frame_size; size -= frame_size; } aom_codec_destroy(&codec); return 0; } aom-3.12.1/examples/av1_dec_fuzzer.dict000066400000000000000000000001451477627663500177730ustar00rootroot00000000000000# IVF Signature + version (bytes 0-5) kw1="DKIF\x00\x00" # AV1 codec fourCC (bytes 8-11) kw2="AV01" aom-3.12.1/examples/build_av1_dec_fuzzer.sh000077500000000000000000000050361477627663500206500ustar00rootroot00000000000000#!/bin/bash # # Copyright (c) 2019, Alliance for Open Media. All rights reserved. # # This source code is subject to the terms of the BSD 2 Clause License and # the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License # was not distributed with this source code in the LICENSE file, you can # obtain it at www.aomedia.org/license/software. If the Alliance for Open # Media Patent License 1.0 was not distributed with this source code in the # PATENTS file, you can obtain it at www.aomedia.org/license/patent. # ############################################################################### # Fuzzer for libaom decoder. # ========================== # Requirements # --------------------- # Clang6.0 or above (must support -fsanitize=fuzzer -fsanitize=fuzzer-no-link) # # References: # --------------------- # http://llvm.org/docs/LibFuzzer.html # https://github.com/google/oss-fuzz # # Steps to build / run # --------------------- set -eu # Have a copy of AOM and a build directory ready. if [[ $# -ne 2 ]]; then echo "Pass in the AOM source tree as first argument, and a build directory " echo "as the second argument. The AOM source tree can be obtained via: " echo " git clone https://aomedia.googlesource.com/aom" exit 2 fi if [[ -z "${CC:-}" ]]; then echo "Set the CC environment variable to point to your C compiler." exit 2 fi if [[ -z "${CXX:-}" ]]; then echo "Set the CXX environment variable to point to your C++ compiler." exit 2 fi AOM_DIR=$1 BUILD_DIR=$2 # Run CMake with address sanitizer enabled and build the codec. # Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows # in the transform functions. Also set memory limits. 
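# AOM_MAX_ALLOCABLE_MEMORY=1073741824 caps a single libaom allocation at 1 GiB
# so pathological inputs fail quickly instead of exhausting the fuzzing host.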
EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824' cd "${BUILD_DIR}" cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \ -DFORCE_HIGHBITDEPTH_DECODING=0 \ -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \ -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \ -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \ -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address # Build the codec. make -j$(nproc) # Build the av1 fuzzer $CXX -std=c++11 -I${AOM_DIR} -I${BUILD_DIR} \ -g -fsanitize=fuzzer,address \ ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \ ${BUILD_DIR}/libaom.a echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer." echo "Create a corpus directory, copy IVF files in there, and run:" echo " av1_dec_fuzzer CORPUS_DIR" aom-3.12.1/examples/decode_to_md5.c000066400000000000000000000076571477627663500170740ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Frame-by-frame MD5 Checksum // =========================== // // This example builds upon the simple decoder loop to show how checksums // of the decoded output can be generated. These are used for validating // decoder implementations against the reference implementation, for example. // // MD5 algorithm // ------------- // The Message-Digest 5 (MD5) is a well known hash function. We have provided // an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest // Algorithm for your use. Our implmentation only changes the interface of this // reference code. You must include the `md5_utils.h` header for access to these // functions. // // Processing The Decoded Data // --------------------------- // Each row of the image is passed to the MD5 accumulator. First the Y plane // is processed, then U, then V. It is important to honor the image's `stride` // values. #include #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "common/md5_utils.h" #include "common/tools_common.h" #include "common/video_reader.h" static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) { int plane, y; MD5Context md5; MD5Init(&md5); for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = plane ? (img->d_w + 1) >> 1 : img->d_w; const int h = plane ? 
(img->d_h + 1) >> 1 : img->d_h; for (y = 0; y < h; ++y) { MD5Update(&md5, buf, w); buf += stride; } } MD5Final(digest, &md5); } static void print_md5(FILE *stream, unsigned char digest[16]) { int i; for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); } static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } int main(int argc, char **argv) { int frame_cnt = 0; FILE *outfile = NULL; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; exec_name = argv[0]; if (argc != 3) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder"); while (aom_video_reader_read_frame(reader)) { aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; size_t frame_size = 0; const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame"); while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { unsigned char digest[16]; get_image_md5(img, digest); print_md5(outfile, digest); fprintf(outfile, " img-%ux%u-%04d.i420\n", img->d_w, img->d_h, ++frame_cnt); } } printf("Processed %d frames.\n", frame_cnt); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); aom_video_reader_close(reader); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/decode_with_drops.c000066400000000000000000000105061477627663500200520ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Decode With Drops Example // ========================= // // This is an example utility which drops a series of frames, as specified // on the command line. This is useful for observing the error recovery // features of the codec. // // Usage // ----- // This example adds a single argument to the `simple_decoder` example, // which specifies the range or pattern of frames to drop. The parameter is // parsed as follows: // // Dropping A Range Of Frames // -------------------------- // To drop a range of frames, specify the starting frame and the ending // frame to drop, separated by a dash. The following command will drop // frames 5 through 10 (base 1). // // $ ./decode_with_drops in.ivf out.i420 5-10 // // // Dropping A Pattern Of Frames // ---------------------------- // To drop a pattern of frames, specify the number of frames to drop and // the number of frames after which to repeat the pattern, separated by // a forward-slash. The following command will drop 3 of 7 frames. // Specifically, it will decode 4 frames, then drop 3 frames, and then // repeat. 
// // $ ./decode_with_drops in.ivf out.i420 3/7 // // // Extra Variables // --------------- // This example maintains the pattern passed on the command line in the // `n`, `m`, and `is_range` variables: // // // Making The Drop Decision // ------------------------ // The example decides whether to drop the frame based on the current // frame number, immediately before decoding the frame. #include #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "common/tools_common.h" #include "common/video_reader.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } int main(int argc, char **argv) { int frame_cnt = 0; FILE *outfile = NULL; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; int n = 0; int m = 0; int is_range = 0; char *nptr = NULL; exec_name = argv[0]; if (argc != 4) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); n = (int)strtol(argv[3], &nptr, 0); m = (int)strtol(nptr + 1, NULL, 0); is_range = (*nptr == '-'); if (!n || !m || (*nptr != '-' && *nptr != '/')) die("Couldn't parse pattern %s.\n", argv[3]); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); while (aom_video_reader_read_frame(reader)) { aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; size_t frame_size = 0; int skip; const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size); ++frame_cnt; skip = (is_range && frame_cnt >= n && frame_cnt <= m) || (!is_range && m - (frame_cnt - 1) % m <= n); if (!skip) { putc('.', stdout); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) aom_img_write(img, outfile); } else { putc('X', stdout); } fflush(stdout); } printf("Processed %d frames.\n", frame_cnt); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); aom_video_reader_close(reader); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/encoder_util.c000066400000000000000000000120131477627663500170340ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Utility functions used by encoder binaries. #include "examples/encoder_util.h" #include #include #include "aom/aom_integer.h" #define mmin(a, b) ((a) < (b) ? 
(a) : (b)) static void find_mismatch_plane(const aom_image_t *const img1, const aom_image_t *const img2, int plane, int use_highbitdepth, int loc[4]) { const unsigned char *const p1 = img1->planes[plane]; const int p1_stride = img1->stride[plane] >> use_highbitdepth; const unsigned char *const p2 = img2->planes[plane]; const int p2_stride = img2->stride[plane] >> use_highbitdepth; const uint32_t bsize = 64; const int is_y_plane = (plane == AOM_PLANE_Y); const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift; const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift; const uint32_t c_w = is_y_plane ? img1->d_w : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = is_y_plane ? img1->d_h : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h); assert(img1->x_chroma_shift == img2->x_chroma_shift && img1->y_chroma_shift == img2->y_chroma_shift); loc[0] = loc[1] = loc[2] = loc[3] = -1; if (img1->monochrome && img2->monochrome && plane) return; int match = 1; uint32_t i, j; for (i = 0; match && i < c_h; i += bsizey) { for (j = 0; match && j < c_w; j += bsizex) { const int si = is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i); const int sj = is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j); int k, l; for (k = 0; match && k < si; ++k) { for (l = 0; match && l < sj; ++l) { const int row = i + k; const int col = j + l; const int offset1 = row * p1_stride + col; const int offset2 = row * p2_stride + col; const int val1 = use_highbitdepth ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8) : p1[offset1]; const int val2 = use_highbitdepth ? p2[2 * offset2] | (p2[2 * offset2 + 1] << 8) : p2[offset2]; if (val1 != val2) { loc[0] = row; loc[1] = col; loc[2] = val1; loc[3] = val2; match = 0; break; } } } } } } static void find_mismatch_helper(const aom_image_t *const img1, const aom_image_t *const img2, int use_highbitdepth, int yloc[4], int uloc[4], int vloc[4]) { find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc); find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc); find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc); } void aom_find_mismatch_high(const aom_image_t *const img1, const aom_image_t *const img2, int yloc[4], int uloc[4], int vloc[4]) { find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc); } void aom_find_mismatch(const aom_image_t *const img1, const aom_image_t *const img2, int yloc[4], int uloc[4], int vloc[4]) { find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc); } int aom_compare_img(const aom_image_t *const img1, const aom_image_t *const img2) { assert(img1->cp == img2->cp); assert(img1->tc == img2->tc); assert(img1->mc == img2->mc); assert(img1->monochrome == img2->monochrome); int num_planes = img1->monochrome ? 1 : 3; uint32_t l_w = img1->d_w; uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; int match = 1; match &= (img1->fmt == img2->fmt); match &= (img1->d_w == img2->d_w); match &= (img1->d_h == img2->d_h); if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { l_w *= 2; c_w *= 2; } for (int plane = 0; plane < num_planes; ++plane) { uint32_t height = plane ? c_h : img1->d_h; uint32_t width = plane ? 
c_w : l_w; for (uint32_t i = 0; i < height; ++i) { match &= (memcmp(img1->planes[plane] + i * img1->stride[plane], img2->planes[plane] + i * img2->stride[plane], width) == 0); } } return match; } aom-3.12.1/examples/encoder_util.h000066400000000000000000000026031477627663500170450ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Utility functions used by encoder binaries. #ifndef AOM_EXAMPLES_ENCODER_UTIL_H_ #define AOM_EXAMPLES_ENCODER_UTIL_H_ #ifdef __cplusplus extern "C" { #endif #include "aom/aom_image.h" // Returns mismatch location (?loc[0],?loc[1]) and the values at that location // in img1 (?loc[2]) and img2 (?loc[3]). void aom_find_mismatch_high(const aom_image_t *const img1, const aom_image_t *const img2, int yloc[4], int uloc[4], int vloc[4]); void aom_find_mismatch(const aom_image_t *const img1, const aom_image_t *const img2, int yloc[4], int uloc[4], int vloc[4]); // Returns 1 if the two images match. int aom_compare_img(const aom_image_t *const img1, const aom_image_t *const img2); #ifdef __cplusplus } #endif #endif // AOM_EXAMPLES_ENCODER_UTIL_H_ aom-3.12.1/examples/inspect.c000066400000000000000000001014741477627663500160370ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Inspect Decoder // ================ // // This is a simple decoder loop that writes JSON stats to stdout. This tool // can also be compiled with Emscripten and used as a library. #include #include #include #ifdef __EMSCRIPTEN__ #include #else #define EMSCRIPTEN_KEEPALIVE #endif #include "config/aom_config.h" #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "av1/common/av1_common_int.h" #if CONFIG_ACCOUNTING #include "av1/decoder/accounting.h" #endif #include "av1/decoder/inspection.h" #include "common/args.h" #include "common/tools_common.h" #include "common/video_common.h" #include "common/video_reader.h" // Max JSON buffer size. 
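// 1024 * 1024 * 256 bytes = 256 MiB; one such buffer is allocated per decoded
// frame and, as noted in inspect() below, the code simply assumes it is large
// enough rather than bounds-checking each write.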
const int MAX_BUFFER = 1024 * 1024 * 256; typedef enum { ACCOUNTING_LAYER = 1, BLOCK_SIZE_LAYER = 1 << 1, TRANSFORM_SIZE_LAYER = 1 << 2, TRANSFORM_TYPE_LAYER = 1 << 3, MODE_LAYER = 1 << 4, SKIP_LAYER = 1 << 5, FILTER_LAYER = 1 << 6, CDEF_LAYER = 1 << 7, REFERENCE_FRAME_LAYER = 1 << 8, MOTION_VECTORS_LAYER = 1 << 9, UV_MODE_LAYER = 1 << 10, CFL_LAYER = 1 << 11, DUAL_FILTER_LAYER = 1 << 12, Q_INDEX_LAYER = 1 << 13, SEGMENT_ID_LAYER = 1 << 14, MOTION_MODE_LAYER = 1 << 15, COMPOUND_TYPE_LAYER = 1 << 16, INTRABC_LAYER = 1 << 17, PALETTE_LAYER = 1 << 18, UV_PALETTE_LAYER = 1 << 19, ALL_LAYERS = (1 << 20) - 1 } LayerType; static LayerType layers = 0; static int stop_after = 0; static int compress = 0; static const arg_def_t limit_arg = ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All"); static const arg_def_t compress_arg = ARG_DEF("x", "compress", 0, "Compress JSON using RLE"); static const arg_def_t dump_accounting_arg = ARG_DEF("a", "accounting", 0, "Dump Accounting"); static const arg_def_t dump_block_size_arg = ARG_DEF("bs", "blockSize", 0, "Dump Block Size"); static const arg_def_t dump_motion_vectors_arg = ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors"); static const arg_def_t dump_transform_size_arg = ARG_DEF("ts", "transformSize", 0, "Dump Transform Size"); static const arg_def_t dump_transform_type_arg = ARG_DEF("tt", "transformType", 0, "Dump Transform Type"); static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode"); static const arg_def_t dump_motion_mode_arg = ARG_DEF("mm", "motion_mode", 0, "Dump Motion Modes"); static const arg_def_t dump_compound_type_arg = ARG_DEF("ct", "compound_type", 0, "Dump Compound Types"); static const arg_def_t dump_uv_mode_arg = ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction Modes"); static const arg_def_t dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip"); static const arg_def_t dump_filter_arg = ARG_DEF("f", "filter", 0, "Dump Filter"); static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF"); static const arg_def_t dump_cfl_arg = ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas"); static const arg_def_t dump_dual_filter_type_arg = ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type"); static const arg_def_t dump_reference_frame_arg = ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame"); static const arg_def_t dump_delta_q_arg = ARG_DEF("dq", "delta_q", 0, "Dump QIndex"); static const arg_def_t dump_seg_id_arg = ARG_DEF("si", "seg_id", 0, "Dump Segment ID"); static const arg_def_t dump_intrabc_arg = ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used"); static const arg_def_t dump_palette_arg = ARG_DEF("plt", "palette", 0, "Dump Palette Size"); static const arg_def_t dump_uv_palette_arg = ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size"); static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help"); static const arg_def_t skip_non_transform_arg = ARG_DEF( "snt", "skip_non_transform", 1, "Skip is counted as a non transform."); static const arg_def_t combined_arg = ARG_DEF("comb", "combined", 1, "combinining parameters into one output."); int combined_parm_list[15]; int combined_parm_count = 0; static const arg_def_t *main_args[] = { &limit_arg, &dump_all_arg, &compress_arg, #if CONFIG_ACCOUNTING &dump_accounting_arg, #endif &dump_block_size_arg, &dump_transform_size_arg, &dump_transform_type_arg, &dump_mode_arg, &dump_uv_mode_arg, &dump_motion_mode_arg, 
&dump_compound_type_arg, &dump_skip_arg, &dump_filter_arg, &dump_cdef_arg, &dump_dual_filter_type_arg, &dump_cfl_arg, &dump_reference_frame_arg, &dump_motion_vectors_arg, &dump_delta_q_arg, &dump_seg_id_arg, &dump_intrabc_arg, &dump_palette_arg, &dump_uv_palette_arg, &usage_arg, &skip_non_transform_arg, &combined_arg, NULL }; #define ENUM(name) \ { #name, name } #define LAST_ENUM \ { NULL, 0 } typedef struct map_entry { const char *name; int value; } map_entry; const map_entry refs_map[] = { ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME), ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME), ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM }; const map_entry block_size_map[] = { ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4), ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8), ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16), ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32), ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64), ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4), ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64), ENUM(BLOCK_64X16), LAST_ENUM }; #define TX_SKIP -1 const map_entry tx_size_map[] = { ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32), ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16), ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64), ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32), ENUM(TX_32X8), ENUM(TX_16X64), ENUM(TX_64X16), LAST_ENUM }; const map_entry tx_type_map[] = { ENUM(DCT_DCT), ENUM(ADST_DCT), ENUM(DCT_ADST), ENUM(ADST_ADST), ENUM(FLIPADST_DCT), ENUM(DCT_FLIPADST), ENUM(FLIPADST_FLIPADST), ENUM(ADST_FLIPADST), ENUM(FLIPADST_ADST), ENUM(IDTX), ENUM(V_DCT), ENUM(H_DCT), ENUM(V_ADST), ENUM(H_ADST), ENUM(V_FLIPADST), ENUM(H_FLIPADST), LAST_ENUM }; const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH), ENUM(REG_SHARP), ENUM(SMOOTH_REG), ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP), ENUM(SHARP_REG), ENUM(SHARP_SMOOTH), ENUM(SHARP_SHARP), LAST_ENUM }; const map_entry prediction_mode_map[] = { ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED), ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED), ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED), ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED), ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV), ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV), ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV), ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV), ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM }; const map_entry motion_mode_map[] = { ENUM(SIMPLE_TRANSLATION), ENUM(OBMC_CAUSAL), // 2-sided OBMC ENUM(WARPED_CAUSAL), // 2-sided WARPED LAST_ENUM }; const map_entry compound_type_map[] = { ENUM(COMPOUND_AVERAGE), ENUM(COMPOUND_WEDGE), ENUM(COMPOUND_DIFFWTD), LAST_ENUM }; const map_entry uv_prediction_mode_map[] = { ENUM(UV_DC_PRED), ENUM(UV_V_PRED), ENUM(UV_H_PRED), ENUM(UV_D45_PRED), ENUM(UV_D135_PRED), ENUM(UV_D113_PRED), ENUM(UV_D157_PRED), ENUM(UV_D203_PRED), ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED), ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED), ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED), ENUM(UV_MODE_INVALID), LAST_ENUM }; #define NO_SKIP 0 #define SKIP 1 const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM }; const map_entry intrabc_map[] = { { "INTRABC", 1 }, { "NO_INTRABC", 0 }, LAST_ENUM }; const map_entry palette_map[] = { { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 }, { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 }, { "SEVEN_COLORS", 7 }, { 
"EIGHT_COLORS", 8 }, LAST_ENUM }; const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM }; static const char *exec_name; struct parm_offset { char parm[60]; char offset; }; struct parm_offset parm_offsets[] = { { "blockSize", offsetof(insp_mi_data, bsize) }, { "transformSize", offsetof(insp_mi_data, tx_size) }, { "transformType", offsetof(insp_mi_data, tx_type) }, { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) }, { "mode", offsetof(insp_mi_data, mode) }, { "uv_mode", offsetof(insp_mi_data, uv_mode) }, { "motion_mode", offsetof(insp_mi_data, motion_mode) }, { "compound_type", offsetof(insp_mi_data, compound_type) }, { "referenceFrame", offsetof(insp_mi_data, ref_frame) }, { "skip", offsetof(insp_mi_data, skip) }, }; int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); static int convert_to_indices(char *str, int *indices, int maxCount, int *count) { *count = 0; do { char *comma = strchr(str, ','); int length = (comma ? (int)(comma - str) : (int)strlen(str)); int i; for (i = 0; i < parm_count; ++i) { if (!strncmp(str, parm_offsets[i].parm, length)) { break; } } if (i == parm_count) return 0; indices[(*count)++] = i; if (*count > maxCount) return 0; str += length + 1; } while (strlen(str) > 0); return 1; } insp_frame_data frame_data; int frame_count = 0; int decoded_frame_count = 0; aom_codec_ctx_t codec; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; aom_image_t *img = NULL; static void on_frame_decoded_dump(char *json) { #ifdef __EMSCRIPTEN__ EM_ASM_({ Module.on_frame_decoded_json($0); }, json); #else printf("%s", json); #endif } // Writing out the JSON buffer using snprintf is very slow, especially when // compiled with emscripten, these functions speed things up quite a bit. static int put_str(char *buffer, const char *str) { int i; for (i = 0; str[i] != '\0'; i++) { buffer[i] = str[i]; } return i; } static int put_str_with_escape(char *buffer, const char *str) { int i; int j = 0; for (i = 0; str[i] != '\0'; i++) { if (str[i] < ' ') { continue; } else if (str[i] == '"' || str[i] == '\\') { buffer[j++] = '\\'; } buffer[j++] = str[i]; } return j; } static int put_num(char *buffer, char prefix, int num, char suffix) { int i = 0; char *buf = buffer; int is_neg = 0; if (prefix) { buf[i++] = prefix; } if (num == 0) { buf[i++] = '0'; } else { if (num < 0) { num = -num; is_neg = 1; } int s = i; while (num != 0) { buf[i++] = '0' + (num % 10); num = num / 10; } if (is_neg) { buf[i++] = '-'; } int e = i - 1; while (s < e) { int t = buf[s]; buf[s] = buf[e]; buf[e] = t; s++; e--; } } if (suffix) { buf[i++] = suffix; } return i; } static int put_map(char *buffer, const map_entry *map) { char *buf = buffer; const map_entry *entry = map; while (entry->name != NULL) { *(buf++) = '"'; buf += put_str(buf, entry->name); *(buf++) = '"'; buf += put_num(buf, ':', entry->value, 0); entry++; if (entry->name != NULL) { *(buf++) = ','; } } return (int)(buf - buffer); } #if 0 static int put_reference_frame(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; int r, c, t; buf += put_str(buf, " \"referenceFrameMap\": {"); buf += put_map(buf, refs_map); buf += put_str(buf, "},\n"); buf += put_str(buf, " \"referenceFrame\": ["); for (r = 0; r < mi_rows; ++r) { *(buf++) = '['; for (c = 0; c < mi_cols; ++c) { insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; buf += put_num(buf, '[', mi->ref_frame[0], 0); buf += put_num(buf, ',', mi->ref_frame[1], ']'); if (compress) { // RLE for (t = c + 1; t < mi_cols; 
++t) { insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; if (mi->ref_frame[0] != next_mi->ref_frame[0] || mi->ref_frame[1] != next_mi->ref_frame[1]) { break; } } if (t - c > 1) { *(buf++) = ','; buf += put_num(buf, '[', t - c - 1, ']'); c = t - 1; } } if (c < mi_cols - 1) *(buf++) = ','; } *(buf++) = ']'; if (r < mi_rows - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); return (int)(buf - buffer); } #endif static int put_motion_vectors(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; int r, c, t; buf += put_str(buf, " \"motionVectors\": ["); for (r = 0; r < mi_rows; ++r) { *(buf++) = '['; for (c = 0; c < mi_cols; ++c) { insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; buf += put_num(buf, '[', mi->mv[0].col, 0); buf += put_num(buf, ',', mi->mv[0].row, 0); buf += put_num(buf, ',', mi->mv[1].col, 0); buf += put_num(buf, ',', mi->mv[1].row, ']'); if (compress) { // RLE for (t = c + 1; t < mi_cols; ++t) { insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; if (mi->mv[0].col != next_mi->mv[0].col || mi->mv[0].row != next_mi->mv[0].row || mi->mv[1].col != next_mi->mv[1].col || mi->mv[1].row != next_mi->mv[1].row) { break; } } if (t - c > 1) { *(buf++) = ','; buf += put_num(buf, '[', t - c - 1, ']'); c = t - 1; } } if (c < mi_cols - 1) *(buf++) = ','; } *(buf++) = ']'; if (r < mi_rows - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); return (int)(buf - buffer); } static int put_combined(char *buffer) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; int r, c, p; buf += put_str(buf, " \""); for (p = 0; p < combined_parm_count; ++p) { if (p) buf += put_str(buf, "&"); buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm); } buf += put_str(buf, "\": ["); for (r = 0; r < mi_rows; ++r) { *(buf++) = '['; for (c = 0; c < mi_cols; ++c) { insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; *(buf++) = '['; for (p = 0; p < combined_parm_count; ++p) { if (p) *(buf++) = ','; int16_t *v = (int16_t *)(((int8_t *)mi) + parm_offsets[combined_parm_list[p]].offset); buf += put_num(buf, 0, v[0], 0); } *(buf++) = ']'; if (c < mi_cols - 1) *(buf++) = ','; } *(buf++) = ']'; if (r < mi_rows - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); return (int)(buf - buffer); } static int put_block_info(char *buffer, const map_entry *map, const char *name, size_t offset, int len) { const int mi_rows = frame_data.mi_rows; const int mi_cols = frame_data.mi_cols; char *buf = buffer; int r, c, t, i; if (compress && len == 1) { die("Can't encode scalars as arrays when RLE compression is enabled."); } if (map) { buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name); buf += put_map(buf, map); buf += put_str(buf, "},\n"); } buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name); for (r = 0; r < mi_rows; ++r) { *(buf++) = '['; for (c = 0; c < mi_cols; ++c) { insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; int16_t *v = (int16_t *)(((int8_t *)mi) + offset); if (len == 0) { buf += put_num(buf, 0, v[0], 0); } else { buf += put_str(buf, "["); for (i = 0; i < len; i++) { buf += put_num(buf, 0, v[i], 0); if (i < len - 1) { buf += put_str(buf, ","); } } buf += put_str(buf, "]"); } if (compress) { // RLE for (t = c + 1; t < mi_cols; ++t) { insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset); int same = 0; if (len == 0) { same = v[0] == nv[0]; } else { for (i = 0; i < len; i++) { same = v[i] == nv[i]; if 
(!same) { break; } } } if (!same) { break; } } if (t - c > 1) { *(buf++) = ','; buf += put_num(buf, '[', t - c - 1, ']'); c = t - 1; } } if (c < mi_cols - 1) *(buf++) = ','; } *(buf++) = ']'; if (r < mi_rows - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); return (int)(buf - buffer); } #if CONFIG_ACCOUNTING static int put_accounting(char *buffer) { char *buf = buffer; int i; const Accounting *accounting = frame_data.accounting; if (accounting == NULL) { printf("XXX\n"); return 0; } const int num_syms = accounting->syms.num_syms; const int num_strs = accounting->syms.dictionary.num_strs; buf += put_str(buf, " \"symbolsMap\": ["); for (i = 0; i < num_strs; i++) { buf += snprintf(buf, MAX_BUFFER, "\"%s\"", accounting->syms.dictionary.strs[i]); if (i < num_strs - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); buf += put_str(buf, " \"symbols\": [\n "); AccountingSymbolContext context; context.x = -2; context.y = -2; AccountingSymbol *sym; for (i = 0; i < num_syms; i++) { sym = &accounting->syms.syms[i]; if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) { buf += put_num(buf, '[', sym->context.x, 0); buf += put_num(buf, ',', sym->context.y, ']'); } else { buf += put_num(buf, '[', sym->id, 0); buf += put_num(buf, ',', sym->bits, 0); buf += put_num(buf, ',', sym->samples, ']'); } context = sym->context; if (i < num_syms - 1) *(buf++) = ','; } buf += put_str(buf, "],\n"); return (int)(buf - buffer); } #endif int skip_non_transform = 0; static void inspect(void *pbi, void *data) { /* Fetch frame data. */ ifd_inspect(&frame_data, pbi, skip_non_transform); // Show existing frames just show a reference buffer we've already decoded. // There's no information to show. if (frame_data.show_existing_frame) return; (void)data; // We allocate enough space and hope we don't write out of bounds. Totally // unsafe but this speeds things up, especially when compiled to Javascript. 
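  // All of the put_* helpers advance the write pointer manually instead of
  // tracking the remaining space, which is why the buffer has to be this big.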
char *buffer = aom_malloc(MAX_BUFFER); if (!buffer) { fprintf(stderr, "Error allocating inspect info buffer\n"); abort(); } char *buf = buffer; buf += put_str(buf, "{\n"); if (layers & BLOCK_SIZE_LAYER) { buf += put_block_info(buf, block_size_map, "blockSize", offsetof(insp_mi_data, bsize), 0); } if (layers & TRANSFORM_SIZE_LAYER) { buf += put_block_info(buf, tx_size_map, "transformSize", offsetof(insp_mi_data, tx_size), 0); } if (layers & TRANSFORM_TYPE_LAYER) { buf += put_block_info(buf, tx_type_map, "transformType", offsetof(insp_mi_data, tx_type), 0); } if (layers & DUAL_FILTER_LAYER) { buf += put_block_info(buf, dual_filter_map, "dualFilterType", offsetof(insp_mi_data, dual_filter_type), 0); } if (layers & MODE_LAYER) { buf += put_block_info(buf, prediction_mode_map, "mode", offsetof(insp_mi_data, mode), 0); } if (layers & UV_MODE_LAYER) { buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode", offsetof(insp_mi_data, uv_mode), 0); } if (layers & MOTION_MODE_LAYER) { buf += put_block_info(buf, motion_mode_map, "motion_mode", offsetof(insp_mi_data, motion_mode), 0); } if (layers & COMPOUND_TYPE_LAYER) { buf += put_block_info(buf, compound_type_map, "compound_type", offsetof(insp_mi_data, compound_type), 0); } if (layers & SKIP_LAYER) { buf += put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0); } if (layers & FILTER_LAYER) { buf += put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2); } if (layers & CDEF_LAYER) { buf += put_block_info(buf, NULL, "cdef_level", offsetof(insp_mi_data, cdef_level), 0); buf += put_block_info(buf, NULL, "cdef_strength", offsetof(insp_mi_data, cdef_strength), 0); } if (layers & CFL_LAYER) { buf += put_block_info(buf, NULL, "cfl_alpha_idx", offsetof(insp_mi_data, cfl_alpha_idx), 0); buf += put_block_info(buf, NULL, "cfl_alpha_sign", offsetof(insp_mi_data, cfl_alpha_sign), 0); } if (layers & Q_INDEX_LAYER) { buf += put_block_info(buf, NULL, "delta_q", offsetof(insp_mi_data, current_qindex), 0); } if (layers & SEGMENT_ID_LAYER) { buf += put_block_info(buf, NULL, "seg_id", offsetof(insp_mi_data, segment_id), 0); } if (layers & MOTION_VECTORS_LAYER) { buf += put_motion_vectors(buf); } if (layers & INTRABC_LAYER) { buf += put_block_info(buf, intrabc_map, "intrabc", offsetof(insp_mi_data, intrabc), 0); } if (layers & PALETTE_LAYER) { buf += put_block_info(buf, palette_map, "palette", offsetof(insp_mi_data, palette), 0); } if (layers & UV_PALETTE_LAYER) { buf += put_block_info(buf, palette_map, "uv_palette", offsetof(insp_mi_data, uv_palette), 0); } if (combined_parm_count > 0) buf += put_combined(buf); if (layers & REFERENCE_FRAME_LAYER) { buf += put_block_info(buf, refs_map, "referenceFrame", offsetof(insp_mi_data, ref_frame), 2); } #if CONFIG_ACCOUNTING if (layers & ACCOUNTING_LAYER) { buf += put_accounting(buf); } #endif buf += snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", frame_data.frame_number); buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n", frame_data.show_frame); buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n", frame_data.frame_type); buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n", frame_data.base_qindex); buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n", frame_data.tile_mi_cols); buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n", frame_data.tile_mi_rows); buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n", frame_data.delta_q_present_flag); buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n", frame_data.delta_q_res); buf += put_str(buf, " \"config\": 
{"); buf += put_map(buf, config_map); buf += put_str(buf, "},\n"); buf += put_str(buf, " \"configString\": \""); buf += put_str_with_escape(buf, aom_codec_build_config()); buf += put_str(buf, "\"\n"); decoded_frame_count++; buf += put_str(buf, "},\n"); *(buf++) = 0; on_frame_decoded_dump(buffer); aom_free(buffer); } static void ifd_init_cb(void) { aom_inspect_init ii; ii.inspect_cb = inspect; ii.inspect_ctx = NULL; aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii); } EMSCRIPTEN_KEEPALIVE int open_file(char *file); EMSCRIPTEN_KEEPALIVE int open_file(char *file) { if (file == NULL) { // The JS analyzer puts the .ivf file at this location. file = "/tmp/input.ivf"; } reader = aom_video_reader_open(file); if (!reader) die("Failed to open %s for reading.", file); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); fprintf(stderr, "Using %s\n", aom_codec_iface_name(decoder)); if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); ifd_init(&frame_data, info->frame_width, info->frame_height); ifd_init_cb(); return EXIT_SUCCESS; } Av1DecodeReturn adr; int have_frame = 0; const unsigned char *frame; const unsigned char *end_frame; size_t frame_size = 0; struct av1_ref_frame ref_dec; EMSCRIPTEN_KEEPALIVE int read_frame(void); EMSCRIPTEN_KEEPALIVE int read_frame(void) { img = NULL; // This loop skips over any frames that are show_existing_frames, as // there is nothing to analyze. do { if (!have_frame) { if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE; frame = aom_video_reader_get_frame(reader, &frame_size); have_frame = 1; end_frame = frame + frame_size; } if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, &adr) != AOM_CODEC_OK) { die_codec(&codec, "Failed to decode frame."); } frame = adr.buf; frame_size = end_frame - frame; if (frame == end_frame) have_frame = 0; } while (adr.show_existing); int got_any_frames = 0; aom_image_t *frame_img; ref_dec.idx = adr.idx; // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE // if its -1 the decoder didn't update any reference buffer and the only // way to see the frame is aom_codec_get_frame. 
if (ref_dec.idx == -1) { aom_codec_iter_t iter = NULL; img = frame_img = aom_codec_get_frame(&codec, &iter); ++frame_count; got_any_frames = 1; } else if (!aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_dec)) { img = frame_img = &ref_dec.img; ++frame_count; got_any_frames = 1; } if (!got_any_frames) { return EXIT_FAILURE; } return EXIT_SUCCESS; } EMSCRIPTEN_KEEPALIVE const char *get_aom_codec_build_config(void); EMSCRIPTEN_KEEPALIVE const char *get_aom_codec_build_config(void) { return aom_codec_build_config(); } EMSCRIPTEN_KEEPALIVE int get_bit_depth(void); EMSCRIPTEN_KEEPALIVE int get_bit_depth(void) { return img->bit_depth; } EMSCRIPTEN_KEEPALIVE int get_bits_per_sample(void); EMSCRIPTEN_KEEPALIVE int get_bits_per_sample(void) { return img->bps; } EMSCRIPTEN_KEEPALIVE int get_image_format(void); EMSCRIPTEN_KEEPALIVE int get_image_format(void) { return img->fmt; } EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane); EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane) { return img->planes[plane]; } EMSCRIPTEN_KEEPALIVE int get_plane_stride(int plane); EMSCRIPTEN_KEEPALIVE int get_plane_stride(int plane) { return img->stride[plane]; } EMSCRIPTEN_KEEPALIVE int get_plane_width(int plane); EMSCRIPTEN_KEEPALIVE int get_plane_width(int plane) { return aom_img_plane_width(img, plane); } EMSCRIPTEN_KEEPALIVE int get_plane_height(int plane); EMSCRIPTEN_KEEPALIVE int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } EMSCRIPTEN_KEEPALIVE int get_frame_width(void); EMSCRIPTEN_KEEPALIVE int get_frame_width(void) { return info->frame_width; } EMSCRIPTEN_KEEPALIVE int get_frame_height(void); EMSCRIPTEN_KEEPALIVE int get_frame_height(void) { return info->frame_height; } static void parse_args(char **argv) { char **argi, **argj; struct arg arg; (void)dump_accounting_arg; (void)dump_cdef_arg; for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER; #if CONFIG_ACCOUNTING else if (arg_match(&arg, &dump_accounting_arg, argi)) layers |= ACCOUNTING_LAYER; #endif else if (arg_match(&arg, &dump_transform_size_arg, argi)) layers |= TRANSFORM_SIZE_LAYER; else if (arg_match(&arg, &dump_transform_type_arg, argi)) layers |= TRANSFORM_TYPE_LAYER; else if (arg_match(&arg, &dump_mode_arg, argi)) layers |= MODE_LAYER; else if (arg_match(&arg, &dump_uv_mode_arg, argi)) layers |= UV_MODE_LAYER; else if (arg_match(&arg, &dump_motion_mode_arg, argi)) layers |= MOTION_MODE_LAYER; else if (arg_match(&arg, &dump_compound_type_arg, argi)) layers |= COMPOUND_TYPE_LAYER; else if (arg_match(&arg, &dump_skip_arg, argi)) layers |= SKIP_LAYER; else if (arg_match(&arg, &dump_filter_arg, argi)) layers |= FILTER_LAYER; else if (arg_match(&arg, &dump_cdef_arg, argi)) layers |= CDEF_LAYER; else if (arg_match(&arg, &dump_cfl_arg, argi)) layers |= CFL_LAYER; else if (arg_match(&arg, &dump_reference_frame_arg, argi)) layers |= REFERENCE_FRAME_LAYER; else if (arg_match(&arg, &dump_motion_vectors_arg, argi)) layers |= MOTION_VECTORS_LAYER; else if (arg_match(&arg, &dump_dual_filter_type_arg, argi)) layers |= DUAL_FILTER_LAYER; else if (arg_match(&arg, &dump_delta_q_arg, argi)) layers |= Q_INDEX_LAYER; else if (arg_match(&arg, &dump_seg_id_arg, argi)) layers |= SEGMENT_ID_LAYER; else if (arg_match(&arg, &dump_intrabc_arg, argi)) layers |= INTRABC_LAYER; else if (arg_match(&arg, &dump_palette_arg, argi)) layers |= PALETTE_LAYER; else if (arg_match(&arg, &dump_uv_palette_arg, argi)) layers |= 
UV_PALETTE_LAYER; else if (arg_match(&arg, &dump_all_arg, argi)) layers |= ALL_LAYERS; else if (arg_match(&arg, &compress_arg, argi)) compress = 1; else if (arg_match(&arg, &usage_arg, argi)) usage_exit(); else if (arg_match(&arg, &limit_arg, argi)) stop_after = arg_parse_uint(&arg); else if (arg_match(&arg, &skip_non_transform_arg, argi)) skip_non_transform = arg_parse_uint(&arg); else if (arg_match(&arg, &combined_arg, argi)) convert_to_indices( (char *)arg.val, combined_parm_list, sizeof(combined_parm_list) / sizeof(combined_parm_list[0]), &combined_parm_count); else argj++; } } static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s src_filename \n", exec_name); fprintf(stderr, "\nOptions:\n"); arg_show_usage(stderr, main_args); exit(EXIT_FAILURE); } EMSCRIPTEN_KEEPALIVE int main(int argc, char **argv) { exec_name = argv[0]; parse_args(argv); if (argc >= 2) { open_file(argv[1]); printf("[\n"); while (1) { if (stop_after && (decoded_frame_count >= stop_after)) break; if (read_frame()) break; } printf("null\n"); printf("]"); } else { usage_exit(); } } EMSCRIPTEN_KEEPALIVE void quit(void); EMSCRIPTEN_KEEPALIVE void quit(void) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_reader_close(reader); } EMSCRIPTEN_KEEPALIVE void set_layers(LayerType v); EMSCRIPTEN_KEEPALIVE void set_layers(LayerType v) { layers = v; } EMSCRIPTEN_KEEPALIVE void set_compress(int v); EMSCRIPTEN_KEEPALIVE void set_compress(int v) { compress = v; } aom-3.12.1/examples/lightfield_bitstream_parsing.c000066400000000000000000000350601477627663500222770ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Lightfield Bitstream Parsing // ============================ // // This is a lightfield bitstream parsing example. It takes an input file // containing the whole compressed lightfield bitstream(ivf file) and a text // file containing a stream of tiles to decode and then constructs and outputs // a new bitstream that can be decoded by an AV1 decoder. The output bitstream // contains reference frames(i.e. anchor frames), camera frame header, and // tile list OBUs. num_references is the number of anchor frames coded at the // beginning of the light field file. After running the lightfield encoder, // run lightfield bitstream parsing: // examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4 // tile_list.txt // // The tile_list.txt is expected to be of the form: // Frame // // // ... 
// Frame #include #include #include "aom/aom_codec.h" #include "aom/aom_decoder.h" #include "aom/aom_encoder.h" #include "aom/aom_integer.h" #include "aom/aomdx.h" #include "common/tools_common.h" #include "common/video_reader.h" #include "common/video_writer.h" #define MAX_TILES 512 static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } #define ALIGN_POWER_OF_TWO(value, n) \ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) const int output_frame_width = 512; const int output_frame_height = 512; // Spec: // typedef struct { // uint8_t anchor_frame_idx; // uint8_t tile_row; // uint8_t tile_col; // uint16_t coded_tile_data_size_minus_1; // uint8_t *coded_tile_data; // } TILE_LIST_ENTRY; // Tile list entry provided by the application typedef struct { int image_idx; int reference_idx; int tile_col; int tile_row; } TILE_LIST_INFO; static int get_image_bps(aom_img_fmt_t fmt) { switch (fmt) { case AOM_IMG_FMT_I420: return 12; case AOM_IMG_FMT_I422: return 16; case AOM_IMG_FMT_I444: return 24; case AOM_IMG_FMT_I42016: return 24; case AOM_IMG_FMT_I42216: return 32; case AOM_IMG_FMT_I44416: return 48; default: die("Invalid image format"); } } static void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles, aom_codec_pts_t tl_pts, unsigned char **frames, const size_t *frame_sizes, aom_codec_ctx_t *codec, unsigned char *tl_buf, AvxVideoWriter *writer, uint8_t output_frame_width_in_tiles_minus_1, uint8_t output_frame_height_in_tiles_minus_1) { unsigned char *tl = tl_buf; unsigned char *saved_obu_size_loc = NULL; uint32_t tile_list_obu_header_size = 0; uint32_t tile_list_obu_size = 0; int num_tiles_minus_1 = num_tiles - 1; int i; // Write the tile list OBU header that is 1 byte long. int obu_type = OBU_TILE_LIST; int obu_has_size_field = 1; *tl++ = (obu_type << 3) | (obu_has_size_field << 1); tile_list_obu_header_size++; // Write the OBU size using a fixed length_field_size of 4 bytes. saved_obu_size_loc = tl; for (i = 0; i < 4; i++) { *tl++ = 0; } tile_list_obu_header_size += 4; // write_tile_list_obu() *tl++ = output_frame_width_in_tiles_minus_1; *tl++ = output_frame_height_in_tiles_minus_1; *tl++ = (num_tiles_minus_1 >> 8) & 0xff; *tl++ = num_tiles_minus_1 & 0xff; tile_list_obu_size += 4; // Write each tile's data for (i = 0; i <= num_tiles_minus_1; i++) { aom_tile_data tile_data = { 0, NULL, 0 }; int image_idx = tiles[i].image_idx; int ref_idx = tiles[i].reference_idx; int tc = tiles[i].tile_col; int tr = tiles[i].tile_row; size_t frame_size = frame_sizes[image_idx]; const unsigned char *frame = frames[image_idx]; AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL); if (aom_status) die_codec(codec, "Failed to decode tile."); AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data); // Copy over tile info. 
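// Each entry in the tile list payload begins with a fixed 5-byte header
// (anchor_frame_idx, tile_row, tile_col, and the 16-bit
// coded_tile_data_size_minus_1), followed by the coded tile data itself;
// tile_info_bytes below accounts for those 5 header bytes.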
// uint8_t anchor_frame_idx; // uint8_t tile_row; // uint8_t tile_col; // uint16_t coded_tile_data_size_minus_1; // uint8_t *coded_tile_data; uint32_t tile_info_bytes = 5; *tl++ = ref_idx; *tl++ = tr; *tl++ = tc; int coded_tile_data_size_minus_1 = (int)tile_data.coded_tile_data_size - 1; *tl++ = (coded_tile_data_size_minus_1 >> 8) & 0xff; *tl++ = coded_tile_data_size_minus_1 & 0xff; memcpy(tl, (uint8_t *)tile_data.coded_tile_data, tile_data.coded_tile_data_size); tl += tile_data.coded_tile_data_size; tile_list_obu_size += tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size; } // Write tile list OBU size. size_t bytes_written = 0; if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc, &bytes_written)) die_codec(codec, "Failed to encode the tile list obu size."); // Copy the tile list. if (!aom_video_writer_write_frame( writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size, tl_pts)) die_codec(codec, "Failed to copy compressed tile list."); } int main(int argc, char **argv) { AvxVideoReader *reader = NULL; AvxVideoWriter *writer = NULL; const AvxVideoInfo *info = NULL; int num_references; int i; aom_codec_pts_t pts; const char *tile_list_file = NULL; exec_name = argv[0]; if (argc != 5) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); num_references = (int)strtol(argv[3], NULL, 0); info = aom_video_reader_get_info(reader); aom_video_reader_set_fourcc(reader, AV1_FOURCC); // The writer to write out ivf file in tile list OBU, which can be decoded by // AV1 decoder. writer = aom_video_writer_open(argv[2], kContainerIVF, info); if (!writer) die("Failed to open %s for writing", argv[2]); tile_list_file = argv[4]; aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); // Decode anchor frames. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); printf("Reading %d reference images.\n", num_references); for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); size_t frame_size = 0; const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size); pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader); // Copy references bitstream directly. if (!aom_video_writer_write_frame(writer, frame, frame_size, pts)) die_codec(&codec, "Failed to copy compressed anchor frame."); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); } // Decode camera frames. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1); FILE *infile = aom_video_reader_get_file(reader); // Record the offset of the first camera image. const FileOffset camera_frame_pos = ftello(infile); printf("Loading compressed frames into memory.\n"); // Count the frames in the lightfield. int num_frames = 0; while (aom_video_reader_read_frame(reader)) { ++num_frames; } if (num_frames < 1) die("Input light field has no frames."); // Read all of the lightfield frames into memory. unsigned char **frames = (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); if (!(frames && frame_sizes)) die("Failed to allocate frame data."); // Seek to the first camera image. 
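// The frame-counting loop above advanced the reader to the end of the file,
// so rewind to the saved offset of the first camera frame before reading the
// compressed frames into memory.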
fseeko(infile, camera_frame_pos, SEEK_SET); for (int f = 0; f < num_frames; ++f) { aom_video_reader_read_frame(reader); size_t frame_size = 0; const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size); frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); if (!frames[f]) die("Failed to allocate frame data."); memcpy(frames[f], frame, frame_size); frame_sizes[f] = frame_size; } printf("Read %d frames.\n", num_frames); // Copy first camera frame for getting camera frame header. This is done // only once. { size_t frame_size = frame_sizes[0]; const unsigned char *frame = frames[0]; pts = num_references; aom_tile_data frame_header_info = { 0, NULL, 0 }; // Need to decode frame header to get camera frame header info. So, here // decoding 1 tile is enough. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0); aom_codec_err_t aom_status = aom_codec_decode(&codec, frame, frame_size, NULL); if (aom_status) die_codec(&codec, "Failed to decode tile."); AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO, &frame_header_info); size_t obu_size_offset = (uint8_t *)frame_header_info.coded_tile_data - frame; size_t length_field_size = frame_header_info.coded_tile_data_size; // Remove ext-tile tile info. uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1; size_t bytes_to_copy = obu_size_offset + length_field_size + frame_header_size; unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy); if (frame_hdr_buf == NULL) die_codec(&codec, "Failed to allocate frame header buffer."); memcpy(frame_hdr_buf, frame, bytes_to_copy); // Update frame header OBU size. size_t bytes_written = 0; if (aom_uleb_encode_fixed_size( frame_header_size, length_field_size, length_field_size, frame_hdr_buf + obu_size_offset, &bytes_written)) die_codec(&codec, "Failed to encode the tile list obu size."); // Copy camera frame header bitstream. if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy, pts)) die_codec(&codec, "Failed to copy compressed camera frame header."); free(frame_hdr_buf); } // Read out the image format. aom_img_fmt_t ref_fmt = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) die_codec(&codec, "Failed to get the image format"); const int bps = get_image_bps(ref_fmt); if (!bps) die_codec(&codec, "Invalid image format."); // read out the tile size. unsigned int tile_size = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size)) die_codec(&codec, "Failed to get the tile size"); const unsigned int tile_width = tile_size >> 16; const unsigned int tile_height = tile_size & 65535; // Allocate a buffer to store tile list bitstream. 
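// Size the tile list buffer for the worst case: up to MAX_TILES tiles, with
// each tile dimension rounded up to a multiple of 32 via
// ALIGN_POWER_OF_TWO(x, 5), times bps bits per pixel, converted to bytes.
// For example, a 64x64 tile at 12 bits per pixel (I420) contributes
// 64 * 64 * 12 / 8 = 6144 bytes to this bound.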
const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) * ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8; unsigned char *tl_buf = (unsigned char *)malloc(data_sz); if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer."); aom_codec_pts_t tl_pts = num_references; const uint8_t output_frame_width_in_tiles_minus_1 = output_frame_width / tile_width - 1; const uint8_t output_frame_height_in_tiles_minus_1 = output_frame_height / tile_height - 1; printf("Reading tile list from file.\n"); char line[1024]; FILE *tile_list_fptr = fopen(tile_list_file, "r"); if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); int num_tiles = 0; TILE_LIST_INFO tiles[MAX_TILES]; while ((fgets(line, 1024, tile_list_fptr)) != NULL) { if (line[0] == 'F' || num_tiles >= MAX_TILES) { // Flush existing tile list and start another, either because we hit a // new render frame or because we've hit our max number of tiles per list. if (num_tiles > 0) { process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, tl_buf, writer, output_frame_width_in_tiles_minus_1, output_frame_height_in_tiles_minus_1); ++tl_pts; } num_tiles = 0; } if (line[0] == 'F') { continue; } if (sscanf(line, "%d %d %d %d", &tiles[num_tiles].image_idx, &tiles[num_tiles].reference_idx, &tiles[num_tiles].tile_col, &tiles[num_tiles].tile_row) == 4) { if (tiles[num_tiles].image_idx >= num_frames) { die("Tile list image_idx out of bounds: %d >= %d.", tiles[num_tiles].image_idx, num_frames); } if (tiles[num_tiles].reference_idx >= num_references) { die("Tile list reference_idx out of bounds: %d >= %d.", tiles[num_tiles].reference_idx, num_references); } ++num_tiles; } } if (num_tiles > 0) { // Flush out the last tile list. process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, tl_buf, writer, output_frame_width_in_tiles_minus_1, output_frame_height_in_tiles_minus_1); ++tl_pts; } const int num_tile_lists = (int)(tl_pts - pts); printf("Finished processing tile lists. Num tile lists: %d.\n", num_tile_lists); free(tl_buf); for (int f = 0; f < num_frames; ++f) { free(frames[f]); } free(frame_sizes); free(frames); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_writer_close(writer); aom_video_reader_close(reader); return EXIT_SUCCESS; } aom-3.12.1/examples/lightfield_decoder.c000066400000000000000000000322241477627663500201660ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Lightfield Decoder // ================== // // This is an example of a simple lightfield decoder. It builds upon the // simple_decoder.c example. It takes an input file containing the compressed // data (in ivf format), treating it as a lightfield instead of a video; and a // text file with a list of tiles to decode. There is an optional parameter // allowing to choose the output format, and the supported formats are // YUV1D(default), YUV, and NV12. 
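// YUV1D writes each decoded tile back-to-back (used for conformance checks),
// while YUV and NV12 compose the requested tiles into 512x512 output frames
// before writing them.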
// After running the lightfield encoder, run lightfield decoder to decode a // batch of tiles: // examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 tile_list.txt // 0(optional) // The tile_list.txt is expected to be of the form: // Frame // // // ... // Frame #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "aom_scale/yv12config.h" #include "av1/common/enums.h" #include "common/tools_common.h" #include "common/video_reader.h" enum { YUV1D, // 1D tile output for conformance test. YUV, // Tile output in YUV format. NV12, // Tile output in NV12 format. } UENUM1BYTE(OUTPUT_FORMAT); static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } // Output frame size static const int output_frame_width = 512; static const int output_frame_height = 512; static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst, int dst_row_offset, int dst_col_offset) { const int shift = (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; int plane; for (plane = 0; plane < 3; ++plane) { const unsigned char *src_buf = src->planes[plane]; const int src_stride = src->stride[plane]; unsigned char *dst_buf = dst->planes[plane]; const int dst_stride = dst->stride[plane]; const int roffset = (plane > 0) ? dst_row_offset >> dst->y_chroma_shift : dst_row_offset; const int coffset = (plane > 0) ? dst_col_offset >> dst->x_chroma_shift : dst_col_offset; // col offset needs to be adjusted for HBD. dst_buf += roffset * dst_stride + (coffset << shift); const int w = (aom_img_plane_width(src, plane) << shift); const int h = aom_img_plane_height(src, plane); int y; for (y = 0; y < h; ++y) { memcpy(dst_buf, src_buf, w); src_buf += src_stride; dst_buf += dst_stride; } } } static void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame, size_t frame_size, int tr, int tc, int ref_idx, aom_image_t *reference_images, aom_image_t *output, int *tile_idx, unsigned int *output_bit_depth, aom_image_t **img_ptr, int output_format) { AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1); AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1); AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); av1_ref_frame_t ref; ref.idx = 0; ref.use_external_ref = 1; ref.img = reference_images[ref_idx]; if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) { die_codec(codec, "Failed to set reference frame."); } aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL); if (aom_status) die_codec(codec, "Failed to decode tile."); aom_codec_iter_t iter = NULL; aom_image_t *img = aom_codec_get_frame(codec, &iter); if (!img) die_codec(codec, "Failed to get frame."); *img_ptr = img; // aom_img_alloc() sets bit_depth as follows: // output->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; // Use img->bit_depth(read from bitstream), so that aom_shift_img() // works as expected. output->bit_depth = img->bit_depth; *output_bit_depth = img->bit_depth; if (output_format != YUV1D) { // read out the tile size. unsigned int tile_size = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) die_codec(codec, "Failed to get the tile size"); const unsigned int tile_width = tile_size >> 16; const unsigned int tile_height = tile_size & 65535; const uint32_t output_frame_width_in_tiles = output_frame_width / tile_width; // Copy the tile to the output frame. 
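// Tiles are packed into the output frame in raster order: tile_idx divided by
// the number of tiles per output row gives the tile row, the remainder gives
// the tile column, and each is scaled by the tile dimensions. For example,
// with 64x64 tiles and a 512-wide output frame (8 tiles per row), tile_idx 10
// lands at row 1, column 2.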
const int row_offset = (*tile_idx / output_frame_width_in_tiles) * tile_height; const int col_offset = (*tile_idx % output_frame_width_in_tiles) * tile_width; aom_img_copy_tile(img, output, row_offset, col_offset); (*tile_idx)++; } } static void img_write_to_file(const aom_image_t *img, FILE *file, int output_format) { if (output_format == YUV) aom_img_write(img, file); else if (output_format == NV12) aom_img_write_nv12(img, file); else die("Invalid output format"); } int main(int argc, char **argv) { FILE *outfile = NULL; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; int num_references; aom_img_fmt_t ref_fmt = 0; aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; aom_image_t output; aom_image_t *output_shifted = NULL; size_t frame_size = 0; const unsigned char *frame = NULL; int i, j; const char *tile_list_file = NULL; int output_format = YUV1D; exec_name = argv[0]; if (argc < 5) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); num_references = (int)strtol(argv[3], NULL, 0); tile_list_file = argv[4]; if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); if (output_format < YUV1D || output_format > NV12) die("Output format out of range [0, 2]"); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder; if (info->codec_fourcc == LST_FOURCC) decoder = get_aom_decoder_by_fourcc(AV1_FOURCC); else die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die_codec(&codec, "Failed to initialize decoder."); if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) { die("Failed to set annex b status"); } // Decode anchor frames. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); if (i == 0) { if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) die_codec(&codec, "Failed to get the image format"); int frame_res[2]; if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) die_codec(&codec, "Failed to get the image frame size"); // Allocate memory to store decoded references. Allocate memory with the // border so that it can be used as a reference. for (j = 0; j < num_references; j++) { unsigned int border = AOM_DEC_BORDER_IN_PIXELS; if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, frame_res[0], frame_res[1], 32, 8, border)) { die("Failed to allocate references."); } } } if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, &reference_images[i])) die_codec(&codec, "Failed to copy decoded reference frame"); aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { char name[1024]; snprintf(name, sizeof(name), "ref_%d.yuv", i); printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h); FILE *ref_file = fopen(name, "wb"); aom_img_write(img, ref_file); fclose(ref_file); } } FILE *infile = aom_video_reader_get_file(reader); // Record the offset of the first camera image. 
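// At this point all anchor frames have been consumed from the reader, so the
// current file position marks the first camera frame. Save it so the reader
// can seek back here after the frame-counting pass that follows.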
const FileOffset camera_frame_pos = ftello(infile); printf("Loading compressed frames into memory.\n"); // Count the frames in the lightfield. int num_frames = 0; while (aom_video_reader_read_frame(reader)) { ++num_frames; } if (num_frames < 1) die("Input light field has no frames."); // Read all of the lightfield frames into memory. unsigned char **frames = (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); if (!(frames && frame_sizes)) die("Failed to allocate frame data."); // Seek to the first camera image. fseeko(infile, camera_frame_pos, SEEK_SET); for (int f = 0; f < num_frames; ++f) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); if (!frames[f]) die("Failed to allocate frame data."); memcpy(frames[f], frame, frame_size); frame_sizes[f] = frame_size; } printf("Read %d frames.\n", num_frames); if (output_format != YUV1D) { // Allocate the output frame. aom_img_fmt_t out_fmt = ref_fmt; if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; if (!aom_img_alloc(&output, out_fmt, output_frame_width, output_frame_height, 32)) die("Failed to allocate output image."); } printf("Decoding tile list from file.\n"); char line[1024]; FILE *tile_list_fptr = fopen(tile_list_file, "r"); if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); int tile_list_cnt = 0; int tile_list_writes = 0; int tile_idx = 0; aom_image_t *out = NULL; unsigned int output_bit_depth = 0; while ((fgets(line, 1024, tile_list_fptr)) != NULL) { if (line[0] == 'F') { if (output_format != YUV1D) { // Write out the tile list. if (tile_list_cnt) { out = &output; if (output_bit_depth != 0) { if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { die("Error allocating image"); } } img_write_to_file(out, outfile, output_format); tile_list_writes++; } tile_list_cnt++; tile_idx = 0; // Then memset the frame. memset(output.img_data, 0, output.sz); } continue; } int image_idx, ref_idx, tc, tr; sscanf(line, "%d %d %d %d", &image_idx, &ref_idx, &tc, &tr); if (image_idx >= num_frames) { die("Tile list image_idx out of bounds: %d >= %d.", image_idx, num_frames); } if (ref_idx >= num_references) { die("Tile list ref_idx out of bounds: %d >= %d.", ref_idx, num_references); } frame = frames[image_idx]; frame_size = frame_sizes[image_idx]; aom_image_t *img = NULL; decode_tile(&codec, frame, frame_size, tr, tc, ref_idx, reference_images, &output, &tile_idx, &output_bit_depth, &img, output_format); if (output_format == YUV1D) { out = img; if (output_bit_depth != 0) { if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { die("Error allocating image"); } } aom_img_write(out, outfile); } } if (output_format != YUV1D) { // Write out the last tile list. 
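// The loop above only flushes a composed frame when it encounters the next
// 'F' marker, so the final tile list is still pending and is written here.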
if (tile_list_writes < tile_list_cnt) { out = &output; if (output_bit_depth != 0) { if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { die("Error allocating image"); } } img_write_to_file(out, outfile, output_format); } } if (output_shifted) aom_img_free(output_shifted); if (output_format != YUV1D) aom_img_free(&output); for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); for (int f = 0; f < num_frames; ++f) { free(frames[f]); } free(frame_sizes); free(frames); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_reader_close(reader); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/lightfield_encoder.c000066400000000000000000000513241477627663500202020ustar00rootroot00000000000000/* * Copyright (c) 2017, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Lightfield Encoder // ================== // // This is an example of a simple lightfield encoder. It builds upon the // twopass_encoder.c example. It takes an input file in YV12 format, // treating it as a planar lightfield instead of a video. The img_width // and img_height arguments are the dimensions of the lightfield images, // while the lf_width and lf_height arguments are the number of // lightfield images in each dimension. The lf_blocksize determines the // number of reference images used for MCP. For example, 5 means that there // is a reference image for every 5x5 lightfield image block. All images // within a block will use the center image in that block as the reference // image for MCP. // Run "make test" to download lightfield test data: vase10x10.yuv. // Run lightfield encoder to encode whole lightfield: // examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5 // Note: In bitstream.c and encoder.c, define EXT_TILE_DEBUG as 1 will print // out the uncompressed header and the frame contexts, which can be used to // test the bit exactness of the headers and the frame contexts for large scale // tile coded frames. #include #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "aom_scale/yv12config.h" #include "av1/common/enums.h" #include "av1/encoder/encoder_utils.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n", exec_name); exit(EXIT_FAILURE); } static int img_size_bytes(aom_image_t *img) { int image_size_bytes = 0; int plane; for (plane = 0; plane < 3; ++plane) { const int w = aom_img_plane_width(img, plane) * ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); const int h = aom_img_plane_height(img, plane); image_size_bytes += w * h; } return image_size_bytes; } static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned int duration, aom_enc_frame_flags_t flags, aom_fixed_buf_t *stats) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_STATS_PKT) { const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; stats->buf = realloc(stats->buf, stats->sz + pkt_size); if (!stats->buf) die("Failed to allocate frame stats buffer."); memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); stats->sz += pkt_size; } } return got_pkts; } static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned int duration, aom_enc_frame_flags_t flags, AvxVideoWriter *writer) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame."); while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) die_codec(ctx, "Failed to write compressed frame."); printf(keyframe ? "K" : "."); fflush(stdout); } } return got_pkts; } static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw, aom_image_t *raw_shift) { if (FORCE_HIGHBITDEPTH_DECODING) { // Need to allocate larger buffer to use hbd internal. int input_shift = 0; aom_img_upshift(raw_shift, raw, input_shift); *frame_to_encode = raw_shift; } else { *frame_to_encode = raw; } } static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile, aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg, int lf_width, int lf_height, int lf_blocksize, int flags, aom_image_t *raw_shift) { aom_codec_ctx_t codec; int frame_count = 0; int image_size_bytes = img_size_bytes(raw); int u_blocks, v_blocks; int bu, bv; aom_fixed_buf_t stats = { NULL, 0 }; aom_image_t *frame_to_encode; if (aom_codec_enc_init(&codec, encoder, cfg, flags)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) die_codec(&codec, "Failed to turn off auto altref"); if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) die_codec(&codec, "Failed to set frame parallel decoding"); // How many reference images we need to encode. u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; printf("\n First pass: "); for (bv = 0; bv < v_blocks; ++bv) { for (bu = 0; bu < u_blocks; ++bu) { const int block_u_min = bu * lf_blocksize; const int block_v_min = bv * lf_blocksize; int block_u_end = (bu + 1) * lf_blocksize; int block_v_end = (bv + 1) * lf_blocksize; int u_block_size, v_block_size; int block_ref_u, block_ref_v; block_u_end = block_u_end < lf_width ? block_u_end : lf_width; block_v_end = block_v_end < lf_height ? 
block_v_end : lf_height; u_block_size = block_u_end - block_u_min; v_block_size = block_v_end - block_v_min; block_ref_u = block_u_min + u_block_size / 2; block_ref_v = block_v_min + v_block_size / 2; printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, SEEK_SET); aom_img_read(raw, infile); get_raw_image(&frame_to_encode, raw, raw_shift); // Reference frames can be encoded without tiles. ++frame_count; get_frame_stats(&codec, frame_to_encode, frame_count, 1, AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF, &stats); } } if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) die_codec(&codec, "Failed to set frame parallel decoding"); for (bv = 0; bv < v_blocks; ++bv) { for (bu = 0; bu < u_blocks; ++bu) { const int block_u_min = bu * lf_blocksize; const int block_v_min = bv * lf_blocksize; int block_u_end = (bu + 1) * lf_blocksize; int block_v_end = (bv + 1) * lf_blocksize; int u, v; block_u_end = block_u_end < lf_width ? block_u_end : lf_width; block_v_end = block_v_end < lf_height ? block_v_end : lf_height; for (v = block_v_min; v < block_v_end; ++v) { for (u = block_u_min; u < block_u_end; ++u) { printf("C%d, ", (u + v * lf_width)); fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); aom_img_read(raw, infile); get_raw_image(&frame_to_encode, raw, raw_shift); ++frame_count; get_frame_stats(&codec, frame_to_encode, frame_count, 1, AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, &stats); } } } } // Flush encoder. // No ARF, this should not be needed. while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) { } if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); printf("\nFirst pass complete. 
Processed %d frames.\n", frame_count); return stats; } static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, aom_codec_iface_t *encoder, aom_codec_enc_cfg_t *cfg, int lf_width, int lf_height, int lf_blocksize, int flags, aom_image_t *raw_shift) { AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder), cfg->g_w, cfg->g_h, { cfg->g_timebase.num, cfg->g_timebase.den }, 0 }; AvxVideoWriter *writer = NULL; aom_codec_ctx_t codec; int frame_count = 0; int image_size_bytes = img_size_bytes(raw); int bu, bv; int u_blocks, v_blocks; aom_image_t *frame_to_encode; aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; int reference_image_num = 0; int i; writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); if (!writer) die("Failed to open %s for writing", outfile_name); if (aom_codec_enc_init(&codec, encoder, cfg, flags)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) die_codec(&codec, "Failed to turn off auto altref"); if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) die_codec(&codec, "Failed to set frame parallel decoding"); if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1)) die_codec(&codec, "Failed to enable encoder ext_tile debug"); if (aom_codec_control(&codec, AOME_SET_CPUUSED, 3)) die_codec(&codec, "Failed to set cpu-used"); // Note: The superblock is a sequence parameter and has to be the same for 1 // sequence. In lightfield application, must choose the superblock size(either // 64x64 or 128x128) before the encoding starts. Otherwise, the default is // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64 // internally. if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64)) die_codec(&codec, "Failed to set SB size"); u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; reference_image_num = u_blocks * v_blocks; // Set the max gf group length so the references are guaranteed to be in // a different gf group than any of the regular frames. This avoids using // both vbr and constant quality mode in a single group. The number of // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of // 16. If it is necessary to exceed this reference frame limit, one will have // to do some additional handling to ensure references are in separate gf // groups from the regular frames. if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL, reference_image_num - 1)) die_codec(&codec, "Failed to set max gf interval"); aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; // Allocate memory with the border so that it can be used as a reference. const bool resize = codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode; const bool all_intra = reference_image_num - 1 == 0; int border_in_pixels = av1_get_enc_border_size(resize, all_intra, BLOCK_64X64); for (i = 0; i < reference_image_num; i++) { if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w, cfg->g_h, 32, 8, border_in_pixels)) { die("Failed to allocate image."); } } printf("\n Second pass: "); // Encode reference images first. 
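// The anchor images are coded before any camera frames so that each camera
// frame can later use the center image of its lf_blocksize x lf_blocksize
// block as its external prediction reference (see AV1_SET_REFERENCE below).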
printf("Encoding Reference Images\n"); for (bv = 0; bv < v_blocks; ++bv) { for (bu = 0; bu < u_blocks; ++bu) { const int block_u_min = bu * lf_blocksize; const int block_v_min = bv * lf_blocksize; int block_u_end = (bu + 1) * lf_blocksize; int block_v_end = (bv + 1) * lf_blocksize; int u_block_size, v_block_size; int block_ref_u, block_ref_v; block_u_end = block_u_end < lf_width ? block_u_end : lf_width; block_v_end = block_v_end < lf_height ? block_v_end : lf_height; u_block_size = block_u_end - block_u_min; v_block_size = block_v_end - block_v_min; block_ref_u = block_u_min + u_block_size / 2; block_ref_v = block_v_min + v_block_size / 2; printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, SEEK_SET); aom_img_read(raw, infile); get_raw_image(&frame_to_encode, raw, raw_shift); // Reference frames may be encoded without tiles. ++frame_count; printf("Encoding reference image %d of %d\n", bv * u_blocks + bu, u_blocks * v_blocks); encode_frame(&codec, frame_to_encode, frame_count, 1, AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, writer); if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, &reference_images[frame_count - 1])) die_codec(&codec, "Failed to copy decoder reference frame"); } } cfg->large_scale_tile = 1; // Fixed q encoding for camera frames. cfg->rc_end_usage = AOM_Q; if (aom_codec_enc_config_set(&codec, cfg)) die_codec(&codec, "Failed to configure encoder"); // The fixed q value used in encoding. if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36)) die_codec(&codec, "Failed to set cq level"); if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) die_codec(&codec, "Failed to set frame parallel decoding"); if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1)) die_codec(&codec, "Failed to turn on single tile decoding"); // Set tile_columns and tile_rows to MAX values, which guarantees the tile // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6)) die_codec(&codec, "Failed to set tile width"); if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6)) die_codec(&codec, "Failed to set tile height"); for (bv = 0; bv < v_blocks; ++bv) { for (bu = 0; bu < u_blocks; ++bu) { const int block_u_min = bu * lf_blocksize; const int block_v_min = bv * lf_blocksize; int block_u_end = (bu + 1) * lf_blocksize; int block_v_end = (bv + 1) * lf_blocksize; int u, v; block_u_end = block_u_end < lf_width ? block_u_end : lf_width; block_v_end = block_v_end < lf_height ? 
block_v_end : lf_height; for (v = block_v_min; v < block_v_end; ++v) { for (u = block_u_min; u < block_u_end; ++u) { av1_ref_frame_t ref; ref.idx = 0; ref.use_external_ref = 1; ref.img = reference_images[bv * u_blocks + bu]; if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) die_codec(&codec, "Failed to set reference frame"); printf("C%d, ", (u + v * lf_width)); fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); aom_img_read(raw, infile); get_raw_image(&frame_to_encode, raw, raw_shift); ++frame_count; printf("Encoding image %d of %d\n", frame_count - (u_blocks * v_blocks), lf_width * lf_height); encode_frame(&codec, frame_to_encode, frame_count, 1, AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, writer); } } } } // Flush encoder. // No ARF, this should not be needed. while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { } for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); // Modify large_scale_file fourcc. if (cfg->large_scale_tile == 1) aom_video_writer_set_fourcc(writer, LST_FOURCC); aom_video_writer_close(writer); printf("\nSecond pass complete. Processed %d frames.\n", frame_count); } int main(int argc, char **argv) { FILE *infile = NULL; int w, h; // The number of lightfield images in the u and v dimensions. int lf_width, lf_height; // Defines how many images refer to the same reference image for MCP. // lf_blocksize X lf_blocksize images will all use the reference image // in the middle of the block of images. int lf_blocksize; aom_codec_ctx_t codec; aom_codec_enc_cfg_t cfg; aom_image_t raw; aom_image_t raw_shift; aom_codec_err_t res; aom_fixed_buf_t stats; int flags = 0; const int fps = 30; const int bitrate = 200; // kbit/s const char *const width_arg = argv[1]; const char *const height_arg = argv[2]; const char *const infile_arg = argv[3]; const char *const outfile_arg = argv[4]; const char *const lf_width_arg = argv[5]; const char *const lf_height_arg = argv[6]; const char *lf_blocksize_arg = argv[7]; exec_name = argv[0]; if (argc < 8) die("Invalid number of arguments"); aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1"); if (!encoder) die("Unsupported codec."); w = (int)strtol(width_arg, NULL, 0); h = (int)strtol(height_arg, NULL, 0); lf_width = (int)strtol(lf_width_arg, NULL, 0); lf_height = (int)strtol(lf_height_arg, NULL, 0); lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width; lf_blocksize = lf_blocksize < lf_height ? lf_blocksize : lf_height; if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) die("Invalid frame size: %dx%d", w, h); if (lf_width <= 0 || lf_height <= 0) die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height); if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize); if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) { die("Failed to allocate image."); } if (FORCE_HIGHBITDEPTH_DECODING) { // Need to allocate larger buffer to use hbd internal. 
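// When FORCE_HIGHBITDEPTH_DECODING is enabled, get_raw_image() upshifts the
// 8-bit input into this 16-bit-per-sample buffer before it is passed to the
// encoder, so the high-bit-depth variant of the I420 format is allocated here.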
aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w, h, 32); } printf("Using %s\n", aom_codec_iface_name(encoder)); // Configuration res = aom_codec_enc_config_default(encoder, &cfg, 0); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = w; cfg.g_h = h; cfg.g_timebase.num = 1; cfg.g_timebase.den = fps; cfg.rc_target_bitrate = bitrate; cfg.g_error_resilient = 0; // This is required. cfg.g_lag_in_frames = 0; // need to set this since default is 19. cfg.kf_mode = AOM_KF_DISABLED; cfg.large_scale_tile = 0; // Only set it to 1 for camera frame encoding. cfg.g_bit_depth = AOM_BITS_8; flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING) ? AOM_CODEC_USE_HIGHBITDEPTH : 0; if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading", infile_arg); // Pass 0 cfg.g_pass = AOM_RC_FIRST_PASS; stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize, flags, &raw_shift); // Pass 1 rewind(infile); cfg.g_pass = AOM_RC_LAST_PASS; cfg.rc_twopass_stats_in = stats; pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height, lf_blocksize, flags, &raw_shift); free(stats.buf); if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift); aom_img_free(&raw); fclose(infile); return EXIT_SUCCESS; } aom-3.12.1/examples/lightfield_tile_list_decoder.c000066400000000000000000000206701477627663500222400ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Lightfield Tile List Decoder // ============================ // // This is a lightfield tile list decoder example. It takes an input file that // contains the anchor frames that are references of the coded tiles, the camera // frame header, and tile list OBUs that include the tile information and the // compressed tile data. This input file is reconstructed from the encoded // lightfield ivf file, and is decodable by AV1 decoder. num_references is // the number of anchor frames coded at the beginning of the light field file. // num_tile_lists is the number of tile lists need to be decoded. There is an // optional parameter allowing to choose the output format, and the supported // formats are YUV1D(default), YUV, and NV12. // Run lightfield tile list decoder to decode an AV1 tile list file: // examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv // 4 2 0(optional) #include #include #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "aom_scale/yv12config.h" #include "av1/common/enums.h" #include "common/tools_common.h" #include "common/video_reader.h" enum { YUV1D, // 1D tile output for conformance test. YUV, // Tile output in YUV format. NV12, // Tile output in NV12 format. } UENUM1BYTE(OUTPUT_FORMAT); static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s " "\n", exec_name); exit(EXIT_FAILURE); } static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img, FILE *file) { // read out the tile size. 
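// AV1D_GET_TILE_SIZE packs both dimensions into one 32-bit value: the tile
// width in the upper 16 bits and the tile height in the lower 16 bits, which
// is why the result is split with a shift and a mask below (e.g. 0x00400040
// corresponds to 64x64 tiles).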
unsigned int tile_size = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) die_codec(codec, "Failed to get the tile size"); const unsigned int tile_width = tile_size >> 16; const unsigned int tile_height = tile_size & 65535; const uint32_t output_frame_width_in_tiles = img->d_w / tile_width; unsigned int tile_count = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count)) die_codec(codec, "Failed to get the tile size"); // Write tile to file. const int shift = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; unsigned int tile_idx; for (tile_idx = 0; tile_idx < tile_count; ++tile_idx) { const int row_offset = (tile_idx / output_frame_width_in_tiles) * tile_height; const int col_offset = (tile_idx % output_frame_width_in_tiles) * tile_width; int plane; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int roffset = (plane > 0) ? row_offset >> img->y_chroma_shift : row_offset; const int coffset = (plane > 0) ? col_offset >> img->x_chroma_shift : col_offset; const int w = (plane > 0) ? ((tile_width >> img->x_chroma_shift) << shift) : (tile_width << shift); const int h = (plane > 0) ? (tile_height >> img->y_chroma_shift) : tile_height; int y; // col offset needs to be adjusted for HBD. buf += roffset * stride + (coffset << shift); for (y = 0; y < h; ++y) { fwrite(buf, 1, w, file); buf += stride; } } } } int main(int argc, char **argv) { FILE *outfile = NULL; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; int num_references; int num_tile_lists; aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; size_t frame_size = 0; const unsigned char *frame = NULL; int output_format = YUV1D; int i, j, n; exec_name = argv[0]; if (argc < 5) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); num_references = (int)strtol(argv[3], NULL, 0); num_tile_lists = (int)strtol(argv[4], NULL, 0); if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); if (output_format < YUV1D || output_format > NV12) die("Output format out of range [0, 2]"); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) { die_codec(&codec, "Failed to set annex b status"); } // Decode anchor frames. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); for (i = 0; i < num_references; ++i) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); if (i == 0) { aom_img_fmt_t ref_fmt = 0; if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) die_codec(&codec, "Failed to get the image format"); int frame_res[2]; if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) die_codec(&codec, "Failed to get the image frame size"); // Allocate memory to store decoded references. Allocate memory with the // border so that it can be used as a reference. 
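// aom_img_alloc_with_border() is used instead of aom_img_alloc() so each
// stored reference carries AOM_DEC_BORDER_IN_PIXELS of padding; these images
// are later handed back to the decoder via AV1D_SET_EXT_REF_PTR, so they need
// the border that reference frames are expected to have.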
for (j = 0; j < num_references; j++) { unsigned int border = AOM_DEC_BORDER_IN_PIXELS; if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, frame_res[0], frame_res[1], 32, 8, border)) { fatal("Failed to allocate references."); } } } if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, &reference_images[i])) die_codec(&codec, "Failed to copy decoded reference frame"); aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { char name[1024]; snprintf(name, sizeof(name), "ref_%d.yuv", i); printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h); FILE *ref_file = fopen(name, "wb"); aom_img_write(img, ref_file); fclose(ref_file); } } // Decode the lightfield. AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); // Set external references. av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references }; AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref); // Must decode the camera frame header first. aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode the frame."); // Decode tile lists one by one. for (n = 0; n < num_tile_lists; n++) { aom_video_reader_read_frame(reader); frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode the tile list."); aom_codec_iter_t iter = NULL; aom_image_t *img = aom_codec_get_frame(&codec, &iter); if (!img) die_codec(&codec, "Failed to get frame."); if (output_format == YUV1D) // write the tile to the output file in 1D format. write_tile_yuv1d(&codec, img, outfile); else if (output_format == YUV) aom_img_write(img, outfile); else // NV12 output format aom_img_write_nv12(img, outfile); } for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); aom_video_reader_close(reader); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/lossless_encoder.c000066400000000000000000000104411477627663500177310ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "lossless_encoder: Example demonstrating lossless " "encoding feature. 
Supports raw input only.\n"); fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, int frame_index, int flags, AvxVideoWriter *writer) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, flags); if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); } printf(keyframe ? "K" : "."); fflush(stdout); } } return got_pkts; } int main(int argc, char **argv) { FILE *infile = NULL; aom_codec_enc_cfg_t cfg; int frame_count = 0; aom_image_t raw; aom_codec_err_t res; AvxVideoInfo info; AvxVideoWriter *writer = NULL; const int fps = 30; exec_name = argv[0]; // Clear explicitly, as simply assigning "{ 0 }" generates // "missing-field-initializers" warning in some compilers. memset(&info, 0, sizeof(info)); if (argc < 5) die("Invalid number of arguments"); aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1"); if (!encoder) die("Unsupported codec."); info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = (int)strtol(argv[1], NULL, 0); info.frame_height = (int)strtol(argv[2], NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; if (info.frame_width <= 0 || info.frame_height <= 0 || (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", aom_codec_iface_name(encoder)); aom_codec_ctx_t codec; res = aom_codec_enc_config_default(encoder, &cfg, 0); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; writer = aom_video_writer_open(argv[4], kContainerIVF, &info); if (!writer) die("Failed to open %s for writing.", argv[4]); if (!(infile = fopen(argv[3], "rb"))) die("Failed to open %s for reading.", argv[3]); if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1)) die_codec(&codec, "Failed to use lossless mode"); // Encode frames. while (aom_img_read(&raw, infile)) { encode_frame(&codec, &raw, frame_count++, 0, writer); } // Flush encoder. while (encode_frame(&codec, NULL, -1, 0, writer)) { } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); aom_img_free(&raw); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); aom_video_writer_close(writer); return EXIT_SUCCESS; } aom-3.12.1/examples/multilayer_metadata.cc000066400000000000000000001131161477627663500205600ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "examples/multilayer_metadata.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "aom/aom_integer.h" #include "examples/multilayer_metadata.h" namespace libaom_examples { namespace { #define RETURN_IF_FALSE(A) \ do { \ if (!(A)) { \ return false; \ } \ } while (0) constexpr int kMaxNumSpatialLayers = 4; // Removes comments and trailing spaces from the line. void cleanup_line(std::string &line) { // Remove everything after the first '#'. std::size_t comment_pos = line.find('#'); if (comment_pos != std::string::npos) { line.resize(comment_pos); } // Remove spaces at the end of the line. while (!line.empty() && line.back() == ' ') { line.resize(line.length() - 1); } } // Finds the indentation level of the line, and sets 'has_list_prefix' to true // if the line has a '-' indicating a new item in a list. void get_indent(const std::string &line, int *indent, bool *has_list_prefix) { *indent = 0; *has_list_prefix = false; while ( *indent < static_cast(line.length()) && (line[*indent] == ' ' || line[*indent] == '\t' || line[*indent] == '-')) { if (line[*indent] == '-') { *has_list_prefix = true; } ++(*indent); } } class ParsedValue { public: enum class Type { kNone, kInteger, kFloatingPoint }; void SetIntegerValue(int64_t v) { type_ = Type::kInteger; int_value_ = v; } void SetFloatingPointValue(double v) { type_ = Type::kFloatingPoint; double_value_ = v; } void Clear() { type_ = Type::kNone; } bool ValueAsFloatingPoint(int line_idx, double *v) { if (type_ == Type::kNone) { fprintf( stderr, "No value found where floating point value was expected at line %d\n", line_idx); return false; } *v = (type_ == Type::kFloatingPoint) ? double_value_ : static_cast(int_value_); return true; } template bool IntegerValueInRange(int64_t min, int64_t max, int line_idx, T *v) { switch (type_) { case Type::kInteger: if (int_value_ < min || int_value_ > max) { fprintf(stderr, "Integer value %" PRId64 " out of range [%" PRId64 ", %" PRId64 "] at line %d\n", int_value_, min, max, line_idx); return false; } *v = static_cast(int_value_); return true; case Type::kFloatingPoint: fprintf(stderr, "Floating point value found where integer was expected at line " "%d\n", line_idx); return false; case Type::kNone: default: fprintf(stderr, "No value found where integer was expected at line %d\n", line_idx); return false; } } private: Type type_ = Type::kNone; int64_t int_value_ = 0; double double_value_ = 0.0f; }; /* * Parses the next line from the file, skipping empty lines. * Returns false if the end of the file was reached, or if the line was indented * less than 'min_indent', meaning that parsing should go back to the previous * function in the stack. * * 'min_indent' is the minimum indentation expected for the next line. * 'is_list' must be true if the line is allowed to contain list items ('-'). * 'indent' MUST be initialized to -1 before the first call, and is then set to * the indentation of the line. * 'has_list_prefix' is set to true if the line starts a new list item with '-'. * 'line_idx' is set to the index of the last line read. 
* 'field_name' is set to the field name if the line contains a colon, or to an * empty string otherwise. * 'value' is set to the value on the line if present. * In case of syntax error, 'syntax_error' is set to true and the function * returns false. */ bool parse_line(std::ifstream &file, int min_indent, bool is_list, int *indent, bool *has_list_prefix, int *line_idx, std::string *field_name, ParsedValue *value, bool *syntax_error) { *field_name = ""; *syntax_error = false; value->Clear(); std::string line; std::ifstream::pos_type prev_file_position; const int prev_indent = *indent; while (prev_file_position = file.tellg(), std::getline(file, line)) { cleanup_line(line); get_indent(line, indent, has_list_prefix); line = line.substr(*indent); // skip indentation // If the line is indented less than 'min_indent', it belongs to the outer // object, and parsing should go back to the previous function in the stack. if (!line.empty() && (*indent < min_indent || (prev_indent > 0 && *indent < prev_indent))) { // Undo reading the last line. if (!file.seekg(prev_file_position, std::ios::beg)) { fprintf(stderr, "Failed to seek to previous file position\n"); *syntax_error = true; return false; } return false; } ++(*line_idx); if (line.empty()) continue; if (prev_indent >= 0 && prev_indent != *indent) { fprintf(stderr, "Error: Bad indentation at line %d\n", *line_idx); *syntax_error = true; return false; } if (*has_list_prefix && !is_list) { fprintf(stderr, "Error: Unexpected list item at line %d\n", *line_idx); *syntax_error = true; return false; } std::string value_str = line; size_t colon_pos = line.find(':'); if (colon_pos != std::string::npos) { *field_name = line.substr(0, colon_pos); value_str = line.substr(colon_pos + 1); } if (!value_str.empty()) { char *endptr; if (line.find('.') != std::string::npos) { value->SetFloatingPointValue(strtod(value_str.c_str(), &endptr)); if (*endptr != '\0') { fprintf(stderr, "Error: Failed to parse floating point value from '%s' at " "line %d\n", value_str.c_str(), *line_idx); *syntax_error = true; return false; } } else { value->SetIntegerValue(strtol(value_str.c_str(), &endptr, 10)); if (*endptr != '\0') { fprintf(stderr, "Error: Failed to parse integer from '%s' at line %d\n", value_str.c_str(), *line_idx); *syntax_error = true; return false; } } } return true; } return false; // Reached the end of the file. 
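  // For reference, a hypothetical metadata file in the YAML subset handled by
  // parse_line() and the parse_* functions below (indentation-scoped maps,
  // '-' list items, '#' comments). The field names and value ranges follow
  // the parsers and validators in this file; the concrete values are purely
  // illustrative:
  //
  //   use_case: 3              # MULTILAYER_USE_CASE_ALPHA
  //   layers:
  //     - layer_type: 1        # MULTILAYER_LAYER_TYPE_TEXTURE
  //       layer_metadata_scope: 2
  //     - layer_type: 5        # MULTILAYER_LAYER_TYPE_ALPHA
  //       layer_metadata_scope: 2
  //       alpha:
  //         alpha_use_idc: 0
  //         alpha_bit_depth: 8
  //         alpha_transparent_value: 0
  //         alpha_opaque_value: 255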
} template bool parse_integer_list(std::ifstream &file, int min_indent, int *line_idx, std::vector *result) { bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; while (parse_line(file, min_indent, /*is_list=*/true, &indent, &has_list_prefix, line_idx, &field_name, &value, &syntax_error)) { if (!field_name.empty()) { fprintf( stderr, "Error: Unexpected field name '%s' at line %d, expected a number\n", field_name.c_str(), *line_idx); return false; } else if (!has_list_prefix) { fprintf(stderr, "Error: Missing list prefix '-' at line %d\n", *line_idx); return false; } else { T v; RETURN_IF_FALSE(value.IntegerValueInRange( static_cast(std::numeric_limits::min()), static_cast(std::numeric_limits::max()), *line_idx, &v)); result->push_back(v); } } if (syntax_error) return false; return true; } template std::pair value_present(const T &v) { return std::make_pair(v, true); } bool parse_color_properties(std::ifstream &file, int min_indent, int *line_idx, ColorProperties *color) { bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; *color = {}; while (parse_line(file, min_indent, /*is_list=*/false, &indent, &has_list_prefix, line_idx, &field_name, &value, &syntax_error)) { if (field_name == "color_range") { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/1, *line_idx, &color->color_range)); } else if (field_name == "color_primaries") { if (!value.IntegerValueInRange(/*min=*/0, /*max=*/255, *line_idx, &color->color_primaries)) { return false; } } else if (field_name == "transfer_characteristics") { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/255, *line_idx, &color->transfer_characteristics)); } else if (field_name == "matrix_coefficients") { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/255, *line_idx, &color->matrix_coefficients)); } else { fprintf(stderr, "Error: Unknown field '%s' at line %d\n", field_name.c_str(), *line_idx); return false; } } if (syntax_error) return false; return true; } bool parse_multilayer_layer_alpha(std::ifstream &file, int min_indent, int *line_idx, AlphaInformation *alpha_info) { bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; *alpha_info = {}; while (parse_line(file, min_indent, /*is_list=*/false, &indent, &has_list_prefix, line_idx, &field_name, &value, &syntax_error)) { if (field_name == "alpha_use_idc") { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/7, *line_idx, &alpha_info->alpha_use_idc)); } else if (field_name == "alpha_bit_depth") { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/8, /*max=*/15, *line_idx, &alpha_info->alpha_bit_depth)); } else if (field_name == "alpha_clip_idc") { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/3, *line_idx, &alpha_info->alpha_clip_idc)); } else if (field_name == "alpha_incr_flag") { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/1, *line_idx, &alpha_info->alpha_incr_flag)); } else if (field_name == "alpha_transparent_value") { // At this point we may not have parsed 'alpha_bit_depth' yet, so the // exact range is checked later. RETURN_IF_FALSE(value.IntegerValueInRange( std::numeric_limits::min(), std::numeric_limits::max(), *line_idx, &alpha_info->alpha_transparent_value)); } else if (field_name == "alpha_opaque_value") { // At this point we may not have parsed 'alpha_bit_depth' yet, so the // exact range is checked later. 
RETURN_IF_FALSE(value.IntegerValueInRange( std::numeric_limits::min(), std::numeric_limits::max(), *line_idx, &alpha_info->alpha_opaque_value)); } else if (field_name == "alpha_color_description") { ColorProperties color; RETURN_IF_FALSE(parse_color_properties(file, indent, line_idx, &color)); alpha_info->alpha_color_description = value_present(color); } else if (field_name == "label_type_id") { RETURN_IF_FALSE( parse_integer_list(file, /*min_indent=*/indent + 1, line_idx, &alpha_info->label_type_id)); } else { fprintf(stderr, "Error: Unknown field '%s' at line %d\n", field_name.c_str(), *line_idx); return false; } } if (syntax_error) return false; // Validation. if (alpha_info->alpha_bit_depth == 0) { fprintf(stderr, "Error: alpha_bit_depth must be specified (in range [8, 15]) for " "alpha info\n"); return false; } const int alpha_max = (1 << (alpha_info->alpha_bit_depth + 1)) - 1; if (alpha_info->alpha_transparent_value > alpha_max) { fprintf(stderr, "Error: alpha_transparent_value %d out of range [0, %d]\n", alpha_info->alpha_transparent_value, alpha_max); return false; } if (alpha_info->alpha_opaque_value > alpha_max) { fprintf(stderr, "Error: alpha_opaque_value %d out of range [0, %d]\n", alpha_info->alpha_opaque_value, alpha_max); return false; } if ((!alpha_info->label_type_id.empty()) && (alpha_info->alpha_use_idc != ALPHA_SEGMENTATION)) { fprintf(stderr, "Error: label_type_id can only be set if alpha_use_idc is %d\n", ALPHA_SEGMENTATION); return false; } const int alpha_range = (std::abs(alpha_info->alpha_opaque_value - alpha_info->alpha_transparent_value) + 1); if (!alpha_info->label_type_id.empty() && static_cast(alpha_info->label_type_id.size()) != alpha_range) { fprintf(stderr, "Error: if present, label_type_id size must be " "equal to the range of alpha values between " "alpha_transparent_value and alpha_opaque_value (expected " "%d values, found %d values)\n", alpha_range, static_cast(alpha_info->label_type_id.size())); return false; } if (alpha_info->alpha_color_description.second && (alpha_info->alpha_use_idc != ALPHA_STRAIGHT)) { fprintf(stderr, "Error: alpha_color_description can only be set if alpha_use_idc " "is %d\n", ALPHA_STRAIGHT); return false; } return true; } bool parse_multilayer_layer_depth(std::ifstream &file, int min_indent, int *line_idx, DepthInformation *depth_info) { bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; *depth_info = {}; while (parse_line(file, min_indent, /*is_list=*/false, &indent, &has_list_prefix, line_idx, &field_name, &value, &syntax_error)) { if (field_name == "z_near") { double tmp; RETURN_IF_FALSE(value.ValueAsFloatingPoint(*line_idx, &tmp)); DepthRepresentationElement el; RETURN_IF_FALSE(double_to_depth_representation_element(tmp, &el)); depth_info->z_near = value_present(el); } else if (field_name == "z_far") { double tmp; RETURN_IF_FALSE(value.ValueAsFloatingPoint(*line_idx, &tmp)); DepthRepresentationElement el; RETURN_IF_FALSE(double_to_depth_representation_element(tmp, &el)); depth_info->z_far = value_present(el); } else if (field_name == "d_min") { double tmp; RETURN_IF_FALSE(value.ValueAsFloatingPoint(*line_idx, &tmp)); DepthRepresentationElement el; RETURN_IF_FALSE(double_to_depth_representation_element(tmp, &el)); depth_info->d_min = value_present(el); } else if (field_name == "d_max") { double tmp; RETURN_IF_FALSE(value.ValueAsFloatingPoint(*line_idx, &tmp)); DepthRepresentationElement el; RETURN_IF_FALSE(double_to_depth_representation_element(tmp, &el)); 
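      // Note on the conversion used for z_near/z_far/d_min/d_max: the
      // floating point value read from the file is re-encoded into the custom
      // sign/exponent/mantissa format of DepthRepresentationElement (see
      // double_to_depth_representation_element() below). As a worked example,
      // 0.5 becomes sign_flag=0, exponent=30 (biased), mantissa=0,
      // mantissa_len=1, since 0.5 = (-1)^0 * 2^(30-31) * (1 + 0/2^1).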
depth_info->d_max = value_present(el); } else if (field_name == "depth_representation_type") { RETURN_IF_FALSE( value.IntegerValueInRange(/*min=*/0, /*max=*/15, *line_idx, &depth_info->depth_representation_type)); } else if (field_name == "disparity_ref_view_id") { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/3, *line_idx, &depth_info->disparity_ref_view_id)); } else if (field_name == "depth_nonlinear_precision") { RETURN_IF_FALSE( value.IntegerValueInRange(/*min=*/8, /*max=*/23, *line_idx, &depth_info->depth_nonlinear_precision)); } else if (field_name == "depth_nonlinear_representation_model") { RETURN_IF_FALSE(parse_integer_list( file, /*min_indent=*/indent + 1, line_idx, &depth_info->depth_nonlinear_representation_model)); } else { fprintf(stderr, "Error: Unknown field '%s' at line %d\n", field_name.c_str(), *line_idx); return false; } } if (syntax_error) return false; // Validation. if (depth_info->depth_representation_type == 3 && depth_info->depth_nonlinear_precision == 0) { fprintf(stderr, "Error: depth_nonlinear_precision must be specified (in range [8, " "23]) when " "depth_representation_type is 3\n"); return false; } if ((depth_info->depth_representation_type == 3) != (!depth_info->depth_nonlinear_representation_model.empty())) { fprintf(stderr, "Error: depth_nonlinear_representation_model must be set if and " "only if depth_representation_type is 3\n"); return false; } const uint32_t depth_max = (1 << depth_info->depth_nonlinear_precision) - 1; for (uint32_t v : depth_info->depth_nonlinear_representation_model) { if (v > depth_max) { fprintf(stderr, "Error: depth_nonlinear_representation_model value %d out of " "range [0, %d]\n", v, depth_max); return false; } } return true; } bool validate_layer(const LayerMetadata &layer, bool layer_has_alpha, bool layer_has_depth) { if (layer_has_alpha != (layer.layer_type == MULTILAYER_LAYER_TYPE_ALPHA && layer.layer_metadata_scope >= SCOPE_GLOBAL)) { fprintf(stderr, "Error: alpha info must be set if and only if layer_type is " "%d and layer_metadata_scpoe is >= %d\n", MULTILAYER_LAYER_TYPE_ALPHA, SCOPE_GLOBAL); return false; } if (layer_has_depth != (layer.layer_type == MULTILAYER_LAYER_TYPE_DEPTH && layer.layer_metadata_scope >= SCOPE_GLOBAL)) { fprintf(stderr, "Error: depth info must be set if and only if layer_type is " "%d and layer_metadata_scpoe is >= %d\n", MULTILAYER_LAYER_TYPE_DEPTH, SCOPE_GLOBAL); return false; } return true; } bool parse_multilayer_layer_metadata(std::ifstream &file, int min_indent, int *line_idx, std::vector &layers) { bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; bool layer_has_alpha = false; bool layer_has_depth = false; while (parse_line(file, min_indent, /*is_list=*/true, &indent, &has_list_prefix, line_idx, &field_name, &value, &syntax_error)) { if (has_list_prefix) { // Start of a new layer. if (layers.size() >= kMaxNumSpatialLayers) { fprintf(stderr, "Error: Too many layers at line %d, the maximum is %d\n", *line_idx, kMaxNumSpatialLayers); return false; } // Validate the previous layer. 
if (!layers.empty()) { validate_layer(layers.back(), layer_has_alpha, layer_has_depth); } if (layers.size() == 1 && layers.back().layer_color_description.second) { fprintf(stderr, "Error: layer_color_description cannot be specified for the " "first layer\n"); return false; } layers.push_back({}); layer_has_alpha = false; layer_has_depth = false; } if (layers.empty()) { fprintf(stderr, "Error: Missing list prefix '-' at line %d\n", *line_idx); return false; } LayerMetadata *layer = &layers.back(); // Check if string starts with field name. if ((field_name == "layer_type")) { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/31, *line_idx, &layer->layer_type)); } else if ((field_name == "luma_plane_only_flag")) { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/1, *line_idx, &layer->luma_plane_only_flag)); } else if ((field_name == "layer_view_type")) { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/7, *line_idx, &layer->layer_view_type)); } else if ((field_name == "group_id")) { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/3, *line_idx, &layer->group_id)); } else if ((field_name == "layer_dependency_idc")) { RETURN_IF_FALSE(value.IntegerValueInRange(/*min=*/0, /*max=*/7, *line_idx, &layer->layer_dependency_idc)); } else if ((field_name == "layer_metadata_scope")) { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/3, *line_idx, &layer->layer_metadata_scope)); } else if ((field_name == "layer_color_description")) { ColorProperties color_properties; RETURN_IF_FALSE( parse_color_properties(file, indent, line_idx, &color_properties)); layer->layer_color_description = value_present(color_properties); } else if ((field_name == "alpha")) { layer_has_alpha = true; RETURN_IF_FALSE(parse_multilayer_layer_alpha( file, /*min_indent=*/indent + 1, line_idx, &layer->global_alpha_info)); } else if (field_name == "depth") { layer_has_depth = true; RETURN_IF_FALSE(parse_multilayer_layer_depth( file, /*min_indent=*/indent + 1, line_idx, &layer->global_depth_info)); if ((layer->global_depth_info.d_min.second || layer->global_depth_info.d_max.second) && layer->global_depth_info.disparity_ref_view_id == (layers.size() - 1)) { fprintf(stderr, "disparity_ref_view_id must be different from the layer's id " "for layer %d (zero-based index)\n", static_cast(layers.size()) - 1); return false; } } else { fprintf(stderr, "Error: Unknown field %s at line %d\n", field_name.c_str(), *line_idx); return false; } } if (syntax_error) return false; validate_layer(layers.back(), layer_has_alpha, layer_has_depth); return true; } bool parse_multilayer_metadata(std::ifstream &file, MultilayerMetadata *multilayer) { int line_idx = 0; bool has_list_prefix; int indent = -1; std::string field_name; ParsedValue value; bool syntax_error; *multilayer = {}; while (parse_line(file, /*min_indent=*/0, /*is_list=*/false, &indent, &has_list_prefix, &line_idx, &field_name, &value, &syntax_error)) { // Check if string starts with field name. 
if ((field_name == "use_case")) { RETURN_IF_FALSE(value.IntegerValueInRange( /*min=*/0, /*max=*/63, line_idx, &multilayer->use_case)); } else if ((field_name == "layers")) { RETURN_IF_FALSE(parse_multilayer_layer_metadata( file, /*min_indent=*/indent + 1, &line_idx, multilayer->layers)); } else { fprintf(stderr, "Error: Unknown field %s at line %d\n", field_name.c_str(), line_idx); return false; } } if (syntax_error) return false; return true; } std::string format_depth_representation_element( const std::pair &element) { if (!element.second) { return "absent"; } else { return std::to_string( depth_representation_element_to_double(element.first)) + " (sign " + std::to_string(element.first.sign_flag) + " exponent " + std::to_string(element.first.exponent) + " mantissa " + std::to_string(element.first.mantissa) + " mantissa_len " + std::to_string(element.first.mantissa_len) + ")"; } } std::string format_color_properties( const std::pair &color_properties) { if (!color_properties.second) { return "absent"; } else { return std::to_string(color_properties.first.color_primaries) + "/" + std::to_string(color_properties.first.transfer_characteristics) + "/" + std::to_string(color_properties.first.matrix_coefficients) + (color_properties.first.color_range ? "F" : "L"); } } bool validate_multilayer_metadata(const MultilayerMetadata &multilayer) { if (multilayer.layers.empty()) { fprintf(stderr, "Error: No layers found, there must be at least one\n"); return false; } if (multilayer.layers.size() > 4) { fprintf(stderr, "Error: Too many layers, found %d, max 4\n", static_cast(multilayer.layers.size())); return false; } bool same_view_type = true; MultilayerViewType view_type = multilayer.layers[0].layer_view_type; for (const LayerMetadata &layer : multilayer.layers) { if (layer.layer_view_type != view_type) { same_view_type = false; break; } } for (int i = 0; i < static_cast(multilayer.layers.size()); ++i) { const LayerMetadata &layer = multilayer.layers[i]; switch (multilayer.use_case) { case MULTILAYER_USE_CASE_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_STEREO: case MULTILAYER_USE_CASE_STEREO_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_STEREO_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_444_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_444_GLOBAL_DEPTH: if (layer.layer_metadata_scope != SCOPE_GLOBAL) { fprintf( stderr, "Error: for use_case %d, all layers must have scope %d, found %d " "instead for layer %d (zero-based index)\n", multilayer.use_case, SCOPE_GLOBAL, layer.layer_metadata_scope, i); return false; } break; default: break; } switch (multilayer.use_case) { case MULTILAYER_USE_CASE_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_ALPHA: case MULTILAYER_USE_CASE_DEPTH: case MULTILAYER_USE_CASE_444_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_444_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_444: case MULTILAYER_USE_CASE_420_444: if (!same_view_type) { fprintf(stderr, "Error: for use_case %d, all layers must have the same view " "type, found different view_type for layer %d (zero-based " "index)\n", multilayer.use_case, i); return false; } default: break; } if (layer.layer_type != MULTILAYER_LAYER_TYPE_UNSPECIFIED) switch (multilayer.use_case) { case MULTILAYER_USE_CASE_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_ALPHA: case MULTILAYER_USE_CASE_STEREO_GLOBAL_ALPHA: case MULTILAYER_USE_CASE_STEREO_ALPHA: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE && layer.layer_type != MULTILAYER_LAYER_TYPE_ALPHA) { fprintf(stderr, "Error: for use_case %d, all layers must be 
of type %d or " "%d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE, MULTILAYER_LAYER_TYPE_ALPHA, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_DEPTH: case MULTILAYER_USE_CASE_STEREO_GLOBAL_DEPTH: case MULTILAYER_USE_CASE_STEREO_DEPTH: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE && layer.layer_type != MULTILAYER_LAYER_TYPE_DEPTH) { fprintf(stderr, "Error: for use_case %d, all layers must be of type %d or " "%d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE, MULTILAYER_LAYER_TYPE_DEPTH, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_STEREO: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE) { fprintf(stderr, "Error: for use_case %d, all layers must be of type %d, " "found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_444_GLOBAL_ALPHA: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_1 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_2 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_3 && layer.layer_type != MULTILAYER_LAYER_TYPE_ALPHA) { fprintf(stderr, "Error: for use_case %d, all layers must be of type %d, " "%d, %d, or %d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE_1, MULTILAYER_LAYER_TYPE_TEXTURE_2, MULTILAYER_LAYER_TYPE_TEXTURE_3, MULTILAYER_LAYER_TYPE_ALPHA, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_444_GLOBAL_DEPTH: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_1 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_2 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_3 && layer.layer_type != MULTILAYER_LAYER_TYPE_DEPTH) { fprintf(stderr, "Error: for use_case %d, all layers must be of type %d, " "%d, %d, or %d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE_1, MULTILAYER_LAYER_TYPE_TEXTURE_2, MULTILAYER_LAYER_TYPE_TEXTURE_3, MULTILAYER_LAYER_TYPE_DEPTH, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_444: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_1 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_2 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_3) { fprintf( stderr, "Error: for use_case %d, all layers must be of type %d, %d, or " "%d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE_1, MULTILAYER_LAYER_TYPE_TEXTURE_2, MULTILAYER_LAYER_TYPE_TEXTURE_3, layer.layer_type, i); return false; } break; case MULTILAYER_USE_CASE_420_444: if (layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_1 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_2 && layer.layer_type != MULTILAYER_LAYER_TYPE_TEXTURE_3) { fprintf(stderr, "Error: for use_case %d, all layers must be of type %d, " "%d, %d, or %d, found %d for layer %d (zero-based index)\n", multilayer.use_case, MULTILAYER_LAYER_TYPE_TEXTURE, MULTILAYER_LAYER_TYPE_TEXTURE_1, MULTILAYER_LAYER_TYPE_TEXTURE_2, MULTILAYER_LAYER_TYPE_TEXTURE_3, layer.layer_type, i); return false; } break; default: break; } if (layer.layer_dependency_idc >= (1 << i)) { fprintf(stderr, "Error: layer_dependency_idc of layer %d (zero-based index) must " "be in [0, %d], found %d for layer %d (zero-based index)\n", i, (1 << i) - 1, layer.layer_dependency_idc, i); return false; } 
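    // The check above only allows bits that correspond to earlier layers
    // (values in [0, 2^i - 1] for layer i), which is consistent with
    // layer_dependency_idc being a bitmask of the lower layers this layer
    // depends on; e.g. a value of 3 on layer 2 would indicate dependencies on
    // layers 0 and 1. (Interpretation inferred from the range check; the
    // individual bits are not otherwise used in this file.)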
if ((layer.layer_type == MULTILAYER_LAYER_TYPE_ALPHA || layer.layer_type == MULTILAYER_LAYER_TYPE_DEPTH) && layer.layer_color_description.second) { fprintf(stderr, "Error: alpha or depth layers cannot have " "layer_color_description for layer %d (zero-based index)\n", i); return false; } } return true; } } // namespace double depth_representation_element_to_double( const DepthRepresentationElement &e) { // Let x be a variable that is computed using four variables s, e, m, and n, // as follows: If e is greater than 0 and less than 127, x is set equal to // (−1)^s*2^(e−31) * (1+m÷2^n). // Otherwise (e is equal to 0), x is set equal to (−1)^s*2^−(30+n)*m. if (e.exponent > 0) { return (e.sign_flag ? -1 : 1) * std::pow(2.0, e.exponent - 31) * (1 + static_cast(e.mantissa) / (static_cast(1) << e.mantissa_len)); } else { return (e.sign_flag ? -1 : 1) * e.mantissa * std::pow(2.0, -30 + e.mantissa_len); } } bool double_to_depth_representation_element( double v, DepthRepresentationElement *element) { const double orig = v; if (v == 0.0) { *element = { 0, 0, 0, 1 }; return true; } const bool sign = v < 0.0; if (sign) { v = -v; } int exp = 0; if (v >= 1.0) { while (v >= 2.0) { ++exp; v /= 2; } } else { while (v < 1.0) { ++exp; v *= 2.0; } exp = -exp; } if ((exp + 31) <= 0 || (exp + 31) > 126) { fprintf(stderr, "Error: Floating point value %f out of range (too large or too " "small)\n", orig); return false; } assert(v >= 1.0 && v < 2.0); v -= 1.0; uint32_t mantissa = 0; uint8_t mantissa_len = 0; constexpr uint8_t kMaxMantissaLen = 32; do { const int bit = (v >= 0.5); mantissa = (mantissa << 1) + bit; v -= bit * 0.5; ++mantissa_len; v *= 2.0; } while (mantissa_len < kMaxMantissaLen && v > 0.0); *element = { sign, static_cast(exp + 31), mantissa_len, mantissa }; return true; } bool parse_multilayer_file(const char *metadata_path, MultilayerMetadata *multilayer) { std::ifstream file(metadata_path); if (!file.is_open()) { fprintf(stderr, "Error: Failed to open %s\n", metadata_path); return false; } if (!parse_multilayer_metadata(file, multilayer) || !validate_multilayer_metadata(*multilayer)) { return false; } return multilayer; } void print_multilayer_metadata(const MultilayerMetadata &multilayer) { printf("=== Multilayer metadata ===\n"); printf("use_case: %d\n", multilayer.use_case); for (size_t i = 0; i < multilayer.layers.size(); ++i) { const LayerMetadata &layer = multilayer.layers[i]; printf("layer %zu\n", i); printf(" layer_type: %d\n", layer.layer_type); printf(" luma_plane_only_flag: %d\n", layer.luma_plane_only_flag); printf(" layer_view_type: %d\n", layer.layer_view_type); printf(" group_id: %d\n", layer.group_id); printf(" layer_dependency_idc: %d\n", layer.layer_dependency_idc); printf(" layer_metadata_scope: %d\n", layer.layer_metadata_scope); printf(" layer_color_description: %s\n", format_color_properties(layer.layer_color_description).c_str()); if (layer.layer_type == MULTILAYER_LAYER_TYPE_ALPHA) { printf(" alpha:\n"); printf(" alpha_use_idc: %d\n", layer.global_alpha_info.alpha_use_idc); printf(" alpha_bit_depth: %d\n", layer.global_alpha_info.alpha_bit_depth); printf(" alpha_clip_idc: %d\n", layer.global_alpha_info.alpha_clip_idc); printf(" alpha_incr_flag: %d\n", layer.global_alpha_info.alpha_incr_flag); printf(" alpha_transparent_value: %hu\n", layer.global_alpha_info.alpha_transparent_value); printf(" alpha_opaque_value: %hu\n", layer.global_alpha_info.alpha_opaque_value); printf(" alpha_color_description: %s\n", format_color_properties( 
layer.global_alpha_info.alpha_color_description) .c_str()); printf(" label_type_id:"); for (uint16_t label_type_id : layer.global_alpha_info.label_type_id) { printf(" %d", label_type_id); } printf("\n"); } else if (layer.layer_type == MULTILAYER_LAYER_TYPE_DEPTH) { printf(" depth:\n"); printf(" z_near: %s\n", format_depth_representation_element(layer.global_depth_info.z_near) .c_str()); printf(" z_far: %s\n", format_depth_representation_element(layer.global_depth_info.z_far) .c_str()); printf(" d_min: %s\n", format_depth_representation_element(layer.global_depth_info.d_min) .c_str()); printf(" d_max: %s\n", format_depth_representation_element(layer.global_depth_info.d_max) .c_str()); printf(" depth_representation_type: %d\n", layer.global_depth_info.depth_representation_type); printf(" disparity_ref_view_id: %d\n", layer.global_depth_info.disparity_ref_view_id); printf(" depth_nonlinear_precision: %d\n", layer.global_depth_info.depth_nonlinear_precision); printf(" depth_nonlinear_representation_model:"); for (uint32_t depth_nonlinear_representation_model : layer.global_depth_info.depth_nonlinear_representation_model) { printf(" %d", depth_nonlinear_representation_model); } printf("\n"); } } printf("\n"); } } // namespace libaom_examples aom-3.12.1/examples/multilayer_metadata.h000066400000000000000000000122121477627663500204150ustar00rootroot00000000000000/* * Copyright (c) 2024, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Experimental multilayer metadata defined in CWG-E050. #ifndef AOM_EXAMPLES_MULTILAYER_METADATA_H_ #define AOM_EXAMPLES_MULTILAYER_METADATA_H_ #include #include #include namespace libaom_examples { // std::pair is used to indicate presence of a field, // like an std::optional (which cannot be used because it's C++17). // If the boolean is true, then the value is present. struct ColorProperties { bool color_range; // true for full range values uint8_t color_primaries; uint8_t transfer_characteristics; uint8_t matrix_coefficients; }; enum AlphaUse { ALPHA_STRAIGHT = 0, ALPHA_PREMULTIPLIED = 1, ALPHA_SEGMENTATION = 2, ALPHA_UNSPECIFIED = 3, }; struct AlphaInformation { AlphaUse alpha_use_idc; // [0, 7] uint8_t alpha_bit_depth; // [8, 15] uint8_t alpha_clip_idc; // [0, 3] bool alpha_incr_flag; uint16_t alpha_transparent_value; // [0, 1<<(alpha_bit_depth+1)) uint16_t alpha_opaque_value; // [0, 1<<(alpha_bit_depth+1)) // Relevant for ALPHA_STRAIGHT only. std::pair alpha_color_description; // Relevant for ALPHA_SEGMENTATION only. // Must be either empty or have the same size as the number of values between // alpha_transparent_value and alpha_opaque_value, inclusively. 
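  // For example, with alpha_transparent_value 10 and alpha_opaque_value 13, a
  // non-empty label_type_id must contain exactly 4 entries, one for each
  // alpha value from 10 to 13.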
std::vector label_type_id; }; struct DepthRepresentationElement { bool sign_flag; uint8_t exponent; // [0, 126] (biased exponent) uint8_t mantissa_len; // [1, 32] uint32_t mantissa; }; struct DepthInformation { std::pair z_near; std::pair z_far; std::pair d_min; std::pair d_max; uint8_t depth_representation_type; // [0, 15] uint8_t disparity_ref_view_id; // [0, 3] uint8_t depth_nonlinear_precision; // [8, 23] // [0, 1< depth_nonlinear_representation_model; }; enum MultilayerUseCase { MULTILAYER_USE_CASE_UNSPECIFIED = 0, MULTILAYER_USE_CASE_GLOBAL_ALPHA = 1, MULTILAYER_USE_CASE_GLOBAL_DEPTH = 2, MULTILAYER_USE_CASE_ALPHA = 3, MULTILAYER_USE_CASE_DEPTH = 4, MULTILAYER_USE_CASE_STEREO = 5, MULTILAYER_USE_CASE_STEREO_GLOBAL_ALPHA = 6, MULTILAYER_USE_CASE_STEREO_GLOBAL_DEPTH = 7, MULTILAYER_USE_CASE_STEREO_ALPHA = 8, MULTILAYER_USE_CASE_STEREO_DEPTH = 9, MULTILAYER_USE_CASE_444_GLOBAL_ALPHA = 10, MULTILAYER_USE_CASE_444_GLOBAL_DEPTH = 11, MULTILAYER_USE_CASE_444 = 12, MULTILAYER_USE_CASE_420_444 = 13, }; enum LayerType { MULTILAYER_LAYER_TYPE_UNSPECIFIED = 0, MULTILAYER_LAYER_TYPE_TEXTURE = 1, MULTILAYER_LAYER_TYPE_TEXTURE_1 = 2, MULTILAYER_LAYER_TYPE_TEXTURE_2 = 3, MULTILAYER_LAYER_TYPE_TEXTURE_3 = 4, MULTILAYER_LAYER_TYPE_ALPHA = 5, MULTILAYER_LAYER_TYPE_DEPTH = 6, }; enum MultilayerMetadataScope { SCOPE_UNSPECIFIED = 0, SCOPE_LOCAL = 1, SCOPE_GLOBAL = 2, SCOPE_MIXED = 3, }; enum MultilayerViewType { VIEW_UNSPECIFIED = 0, VIEW_CENTER = 1, VIEW_LEFT = 2, VIEW_RIGHT = 3, }; struct LayerMetadata { LayerType layer_type; // [0, 31] bool luma_plane_only_flag; MultilayerViewType layer_view_type; // [0, 7] uint8_t group_id; // [0, 3] uint8_t layer_dependency_idc; // [0, 7] MultilayerMetadataScope layer_metadata_scope; // [0, 3] std::pair layer_color_description; // Relevant for MULTILAYER_LAYER_TYPE_ALPHA with scope >= SCOPE_GLOBAL. AlphaInformation global_alpha_info; // Relevant for MULTILAYER_LAYER_TYPE_DEPTH with scope >= SCOPE_GLOBAL. DepthInformation global_depth_info; }; struct MultilayerMetadata { MultilayerUseCase use_case; // [0, 63] std::vector layers; }; // Parses a multilayer metadata file. // The metadata is expected to be in a subset of the YAML format supporting // simple lists and maps with integer values, and comments. // Checks that the metadata is valid and terminates the process in case of // error. bool parse_multilayer_file(const char *metadata_path, MultilayerMetadata *multilayer); // Prints the multilayer metadata to stdout for debugging. void print_multilayer_metadata(const MultilayerMetadata &multilayer); // Converts a double value to a DepthRepresentationElement struct. bool double_to_depth_representation_element( double v, DepthRepresentationElement *element); // Converts a DepthRepresentationElement struct to a double value. double depth_representation_element_to_double( const DepthRepresentationElement &e); } // namespace libaom_examples #endif // AOM_EXAMPLES_MULTILAYER_METADATA_H_ aom-3.12.1/examples/noise_model.c000066400000000000000000000414541477627663500166700ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /*!\file * \brief This is an sample binary to create noise params from input video. * * To allow for external denoising applications, this sample binary illustrates * how to create a film grain table (film grain params as a function of time) * from an input video and its corresponding denoised source. * * The --output-grain-table file can be passed as input to the encoder (in * aomenc this is done through the "--film-grain-table" parameter). * * As an example, where the input source is an 854x480 yuv420p 8-bit video * named "input.854_480.yuv" you would use steps similar to the following: * * # Run your denoiser (e.g, using hqdn3d filter): * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \ * -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \ * denoised.854_480.yuv * * # Model the noise between the denoised version and original source: * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \ * --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \ * --output-grain-table=film_grain.tbl * * # Encode with your favorite settings (including the grain table): * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \ * --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \ * --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \ * -o denoised_with_grain_params.ivf denoised.854_480.yuv */ #include #include #include #include #include "aom/aom_encoder.h" #include "aom_dsp/aom_dsp_common.h" #if CONFIG_AV1_DECODER #include "av1/decoder/grain_synthesis.h" #endif #include "aom_dsp/grain_table.h" #include "aom_dsp/noise_model.h" #include "aom_dsp/noise_util.h" #include "aom_mem/aom_mem.h" #include "common/args.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s --input= --input-denoised= " "--output-grain-table= " "See comments in noise_model.c for more information.\n", exec_name); exit(EXIT_FAILURE); } static const arg_def_t help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Input width (if rawvideo)"); static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Input height (if rawvideo)"); static const arg_def_t skip_frames_arg = ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)"); static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate"); static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename"); static const arg_def_t output_grain_table_arg = ARG_DEF("n", "output-grain-table", 1, "Output noise file"); static const arg_def_t input_denoised_arg = ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only"); static const arg_def_t flat_block_finder_arg = ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder"); static const arg_def_t block_size_arg = ARG_DEF("b", "block-size", 1, "Block size"); static const arg_def_t bit_depth_arg = ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input"); static const arg_def_t use_i420 = ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)"); static const arg_def_t use_i422 = ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422"); static const arg_def_t use_i444 = ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444"); static const arg_def_t debug_file_arg 
= ARG_DEF(NULL, "debug-file", 1, "File to output debug info"); typedef struct { int width; int height; struct aom_rational fps; const char *input; const char *input_denoised; const char *output_grain_table; int img_fmt; int block_size; int bit_depth; int run_flat_block_finder; int force_flat_psd; int skip_frames; const char *debug_file; } noise_model_args_t; static void parse_args(noise_model_args_t *noise_args, char **argv) { struct arg arg; static const arg_def_t *main_args[] = { &help, &input_arg, &fps_arg, &width_arg, &height_arg, &block_size_arg, &output_grain_table_arg, &input_denoised_arg, &use_i420, &use_i422, &use_i444, &debug_file_arg, NULL }; for (; *argv; argv++) { if (arg_match(&arg, &help, argv)) { fprintf(stdout, "\nOptions:\n"); arg_show_usage(stdout, main_args); exit(0); } else if (arg_match(&arg, &width_arg, argv)) { noise_args->width = atoi(arg.val); } else if (arg_match(&arg, &height_arg, argv)) { noise_args->height = atoi(arg.val); } else if (arg_match(&arg, &input_arg, argv)) { noise_args->input = arg.val; } else if (arg_match(&arg, &input_denoised_arg, argv)) { noise_args->input_denoised = arg.val; } else if (arg_match(&arg, &output_grain_table_arg, argv)) { noise_args->output_grain_table = arg.val; } else if (arg_match(&arg, &block_size_arg, argv)) { noise_args->block_size = atoi(arg.val); } else if (arg_match(&arg, &bit_depth_arg, argv)) { noise_args->bit_depth = atoi(arg.val); } else if (arg_match(&arg, &flat_block_finder_arg, argv)) { noise_args->run_flat_block_finder = atoi(arg.val); } else if (arg_match(&arg, &fps_arg, argv)) { noise_args->fps = arg_parse_rational(&arg); } else if (arg_match(&arg, &use_i420, argv)) { noise_args->img_fmt = AOM_IMG_FMT_I420; } else if (arg_match(&arg, &use_i422, argv)) { noise_args->img_fmt = AOM_IMG_FMT_I422; } else if (arg_match(&arg, &use_i444, argv)) { noise_args->img_fmt = AOM_IMG_FMT_I444; } else if (arg_match(&arg, &skip_frames_arg, argv)) { noise_args->skip_frames = atoi(arg.val); } else if (arg_match(&arg, &debug_file_arg, argv)) { noise_args->debug_file = arg.val; } else { fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv); arg_show_usage(stdout, main_args); exit(0); } } if (noise_args->bit_depth > 8) { noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; } } #if CONFIG_AV1_DECODER static void print_variance_y(FILE *debug_file, aom_image_t *raw, aom_image_t *denoised, const uint8_t *flat_blocks, int block_size, aom_film_grain_t *grain) { aom_image_t renoised; grain->apply_grain = 1; grain->random_seed = 7391; grain->bit_depth = raw->bit_depth; aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1); if (av1_add_film_grain(grain, denoised, &renoised)) { fprintf(stderr, "Internal failure in av1_add_film_grain().\n"); aom_img_free(&renoised); return; } const int num_blocks_w = (raw->w + block_size - 1) / block_size; const int num_blocks_h = (raw->h + block_size - 1) / block_size; fprintf(debug_file, "x = ["); for (int by = 0; by < num_blocks_h; by++) { for (int bx = 0; bx < num_blocks_w; bx++) { double block_mean = 0; double noise_std = 0, noise_mean = 0; double renoise_std = 0, renoise_mean = 0; for (int yi = 0; yi < block_size; ++yi) { const int y = by * block_size + yi; for (int xi = 0; xi < block_size; ++xi) { const int x = bx * block_size + xi; const double noise_v = (raw->planes[0][y * raw->stride[0] + x] - denoised->planes[0][y * raw->stride[0] + x]); noise_mean += noise_v; noise_std += noise_v * noise_v; block_mean += raw->planes[0][y * raw->stride[0] + x]; const double renoise_v = (renoised.planes[0][y * 
raw->stride[0] + x] - denoised->planes[0][y * raw->stride[0] + x]); renoise_mean += renoise_v; renoise_std += renoise_v * renoise_v; } } int n = (block_size * block_size); block_mean /= n; noise_mean /= n; renoise_mean /= n; noise_std = sqrt(noise_std / n - noise_mean * noise_mean); renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean); fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ", flat_blocks[by * num_blocks_w + bx], block_mean, noise_std, renoise_std); } fprintf(debug_file, "\n"); } fprintf(debug_file, "];\n"); if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { fprintf(stderr, "Detailed debug info not supported for high bit" "depth formats\n"); } else { fprintf(debug_file, "figure(2); clf;\n"); fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n"); fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n"); fprintf(debug_file, "plot(linspace(0, 255, length(noise_strength_0)), " "noise_strength_0, 'b');\n"); fprintf(debug_file, "title('Scatter plot of intensity vs noise strength');\n"); fprintf(debug_file, "legend('Actual', 'Estimated', 'Estimated strength');\n"); fprintf(debug_file, "figure(3); clf;\n"); fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n"); fprintf(debug_file, "title('Actual vs Estimated');\n"); fprintf(debug_file, "pause(3);\n"); } aom_img_free(&renoised); } #endif static void print_debug_info(FILE *debug_file, aom_image_t *raw, aom_image_t *denoised, uint8_t *flat_blocks, int block_size, aom_noise_model_t *noise_model) { (void)raw; (void)denoised; (void)flat_blocks; (void)block_size; fprintf(debug_file, "figure(3); clf;\n"); fprintf(debug_file, "figure(2); clf;\n"); fprintf(debug_file, "figure(1); clf;\n"); for (int c = 0; c < 3; ++c) { fprintf(debug_file, "noise_strength_%d = [\n", c); const aom_equation_system_t *eqns = &noise_model->combined_state[c].strength_solver.eqns; for (int k = 0; k < eqns->n; ++k) { fprintf(debug_file, "%lf ", eqns->x[k]); } fprintf(debug_file, "];\n"); fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", c); } fprintf(debug_file, "legend('Y', 'cb', 'cr');\n"); fprintf(debug_file, "title('Noise strength function');\n"); #if CONFIG_AV1_DECODER aom_film_grain_t grain; aom_noise_model_get_grain_parameters(noise_model, &grain); print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain); #endif fflush(debug_file); } int main(int argc, char *argv[]) { noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420, 32, 8, 1, 0, 1, NULL }; aom_image_t raw, denoised; FILE *infile = NULL; AvxVideoInfo info; memset(&info, 0, sizeof(info)); (void)argc; exec_name = argv[0]; parse_args(&args, argv + 1); info.frame_width = args.width; info.frame_height = args.height; info.time_base.numerator = args.fps.den; info.time_base.denominator = args.fps.num; if (info.frame_width <= 0 || info.frame_height <= 0 || (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image."); } if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image."); } infile = fopen(args.input, "rb"); if (!infile) { die("Failed to open input file: %s", args.input); } fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]); const int high_bd = args.bit_depth > 8; const int block_size = args.block_size; aom_flat_block_finder_t block_finder; 
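  // When --flat-block-finder is 1 (the default), aom_flat_block_finder_run()
  // is used in the loop below to mark which block_size x block_size blocks of
  // the source look flat, and the resulting mask is passed to
  // aom_noise_model_update() so that noise statistics are estimated from flat
  // regions; when it is 0, every block is treated as flat.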
aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth, high_bd); const int num_blocks_w = (info.frame_width + block_size - 1) / block_size; const int num_blocks_h = (info.frame_height + block_size - 1) / block_size; uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h); if (!flat_blocks) die("Failed to allocate block data."); // Sets the random seed on the first entry in the output table int16_t random_seed = 7391; aom_noise_model_t noise_model; aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth, high_bd }; aom_noise_model_init(&noise_model, params); FILE *denoised_file = 0; if (args.input_denoised) { denoised_file = fopen(args.input_denoised, "rb"); if (!denoised_file) die("Unable to open input_denoised: %s", args.input_denoised); } else { die("--input-denoised file must be specified"); } FILE *debug_file = 0; if (args.debug_file) { debug_file = fopen(args.debug_file, "w"); } aom_film_grain_table_t grain_table = { 0, 0 }; int64_t prev_timestamp = 0; int frame_count = 0; while (aom_img_read(&raw, infile)) { if (args.input_denoised) { if (!aom_img_read(&denoised, denoised_file)) { die("Unable to read input denoised file"); } } if (frame_count % args.skip_frames == 0) { int num_flat_blocks = num_blocks_w * num_blocks_h; memset(flat_blocks, 1, num_flat_blocks); if (args.run_flat_block_finder) { memset(flat_blocks, 0, num_flat_blocks); num_flat_blocks = aom_flat_block_finder_run( &block_finder, raw.planes[0], info.frame_width, info.frame_height, info.frame_width, flat_blocks); fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks); } const uint8_t *planes[3] = { raw.planes[0], raw.planes[1], raw.planes[2] }; uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1], denoised.planes[2] }; int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd, raw.stride[2] >> high_bd }; int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 }; fprintf(stdout, "Updating noise model...\n"); aom_noise_status_t status = aom_noise_model_update( &noise_model, (const uint8_t *const *)planes, (const uint8_t *const *)denoised_planes, info.frame_width, info.frame_height, strides, chroma_sub, flat_blocks, block_size); int64_t cur_timestamp = frame_count * 10000000ULL * args.fps.den / args.fps.num; if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { fprintf(stdout, "Noise type is different, updating parameters for time " "[ %" PRId64 ", %" PRId64 ")\n", prev_timestamp, cur_timestamp); aom_film_grain_t grain; aom_noise_model_get_grain_parameters(&noise_model, &grain); grain.random_seed = random_seed; random_seed = 0; aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp, &grain); aom_noise_model_save_latest(&noise_model); prev_timestamp = cur_timestamp; } if (debug_file) { print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size, &noise_model); } fprintf(stdout, "Done noise model update, status = %d\n", status); } frame_count++; } aom_film_grain_t grain; aom_noise_model_get_grain_parameters(&noise_model, &grain); grain.random_seed = random_seed; aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain); if (args.output_grain_table) { struct aom_internal_error_info error_info; if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table, args.output_grain_table, &error_info)) { die("Unable to write output film grain table"); } } aom_film_grain_table_free(&grain_table); if (infile) fclose(infile); if (denoised_file) fclose(denoised_file); if (debug_file) fclose(debug_file); 
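  // For reference, a minimal sketch of loading the table written above back
  // into memory with the same grain_table API, e.g. for inspection (this
  // assumes aom_film_grain_table_read() from aom_dsp/grain_table.h; error
  // details omitted):
  //
  //   aom_film_grain_table_t table = { 0, 0 };
  //   struct aom_internal_error_info error;
  //   if (aom_film_grain_table_read(&table, args.output_grain_table, &error) !=
  //       AOM_CODEC_OK) {
  //     die("Unable to read film grain table");
  //   }
  //   // ... inspect the entries or hand the table to an encoder ...
  //   aom_film_grain_table_free(&table);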
aom_img_free(&raw); aom_img_free(&denoised); return EXIT_SUCCESS; } aom-3.12.1/examples/photon_noise_table.c000066400000000000000000000417021477627663500202420ustar00rootroot00000000000000/* * Copyright (c) 2021, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // This tool creates a film grain table, for use in stills and videos, // representing the noise that one would get by shooting with a digital camera // at a given light level. Much of the noise in digital images is photon shot // noise, which is due to the characteristics of photon arrival and grows in // standard deviation as the square root of the expected number of photons // captured. // https://www.photonstophotos.net/Emil%20Martinec/noise.html#shotnoise // // The proxy used by this tool for the amount of light captured is the ISO value // such that the focal plane exposure at the time of capture would have been // mapped by a 35mm camera to the output lightness observed in the image. That // is, if one were to shoot on a 35mm camera (36×24mm sensor) at the nominal // exposure for that ISO setting, the resulting image should contain noise of // the same order of magnitude as generated by this tool. // // Example usage: // // ./photon_noise_table --width=3840 --height=2160 --iso=25600 -o noise.tbl // # Then, for example: // aomenc --film-grain-table=noise.tbl ... // # Or: // avifenc -c aom -a film-grain-table=noise.tbl ... // // The (mostly) square-root relationship between light intensity and noise // amplitude holds in linear light, but AV1 streams are most often encoded // non-linearly, and the film grain is applied to those non-linear values. // Therefore, this tool must account for the non-linearity, and this is // controlled by the optional `--transfer-function` (or `-t`) parameter, which // specifies the tone response curve that will be used when encoding the actual // image. The default for this tool is sRGB, which is approximately similar to // an encoding gamma of 1/2.2 (i.e. a decoding gamma of 2.2) though not quite // identical. // // As alluded to above, the tool assumes that the image is taken from the // entirety of a 36×24mm (“35mm format”) sensor. If that assumption does not // hold, then a “35mm-equivalent ISO value” that can be passed to the tool can // be obtained by multiplying the true ISO value by the ratio of 36×24mm to the // area that was actually used. For formats that approximately share the same // aspect ratio, this is often expressed as the square of the “equivalence // ratio” which is the ratio of their diagonals. For example, APS-C (often // ~24×16mm) is said to have an equivalence ratio of 1.5 relative to the 35mm // format, and therefore ISO 1000 on APS-C and ISO 1000×1.5² = 2250 on 35mm // produce an image of the same lightness from the same amount of light spread // onto their respective surface areas (resulting in different focal plane // exposures), and those images will thus have similar amounts of noise if the // cameras are of similar technology. 
https://doi.org/10.1117/1.OE.57.11.110801 // // The tool needs to know the resolution of the images to which its grain tables // will be applied so that it can know how the light on the sensor was shared // between its pixels. As a general rule, while a higher pixel count will lead // to more noise per pixel, when the final image is viewed at the same physical // size, that noise will tend to “average out” to the same amount over a given // area, since there will be more pixels in it which, in aggregate, will have // received essentially as much light. Put differently, the amount of noise // depends on the scale at which it is measured, and the decision for this tool // was to make that scale relative to the image instead of its constituent // samples. For more on this, see: // // https://www.photonstophotos.net/Emil%20Martinec/noise-p3.html#pixelsize // https://www.dpreview.com/articles/5365920428/the-effect-of-pixel-and-sensor-sizes-on-noise/2 // https://www.dpreview.com/videos/7940373140/dpreview-tv-why-lower-resolution-sensors-are-not-better-in-low-light #include #include #include #include #include "aom_dsp/grain_table.h" #include "common/args.h" #include "common/tools_common.h" static const char *exec_name; static const struct arg_enum_list transfer_functions[] = { { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G }, { "srgb", AOM_CICP_TC_SRGB }, { "smpte2084", AOM_CICP_TC_SMPTE_2084 }, { "hlg", AOM_CICP_TC_HLG }, ARG_ENUM_LIST_END }; static arg_def_t help_arg = ARG_DEF("h", "help", 0, "Show the available options"); static arg_def_t width_arg = ARG_DEF("w", "width", 1, "Width of the image in pixels (required)"); static arg_def_t height_arg = ARG_DEF("l", "height", 1, "Height of the image in pixels (required)"); static arg_def_t iso_arg = ARG_DEF( "i", "iso", 1, "ISO setting indicative of the light level (required)"); static arg_def_t output_arg = ARG_DEF("o", "output", 1, "Output file to which to write the film grain table (required)"); static arg_def_t transfer_function_arg = ARG_DEF_ENUM("t", "transfer-function", 1, "Transfer function used by the encoded image (default = sRGB)", transfer_functions); void usage_exit(void) { fprintf(stderr, "Usage: %s [--transfer-function=] --width= " "--height= --iso= --output=\n", exec_name); exit(EXIT_FAILURE); } typedef struct { float (*to_linear)(float); float (*from_linear)(float); // In linear output light. This would typically be 0.18 for SDR (this matches // the definition of Standard Output Sensitivity from ISO 12232:2019), but in // HDR, we certainly do not want to consider 18% of the maximum output a // “mid-tone”, as it would be e.g. 1800 cd/m² for SMPTE ST 2084 (PQ). 
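  // The concrete values used by find_transfer_function() below are 0.18 for
  // the SDR curves (gamma 2.2, gamma 2.8 and sRGB), 26/10000 for SMPTE ST
  // 2084 (PQ) and 26/1000 for HLG.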
float mid_tone; } transfer_function_t; static const transfer_function_t *find_transfer_function( aom_transfer_characteristics_t tc); typedef struct { int width; int height; int iso_setting; const transfer_function_t *transfer_function; const char *output_filename; } photon_noise_args_t; static void parse_args(int argc, char **argv, photon_noise_args_t *photon_noise_args) { static const arg_def_t *args[] = { &help_arg, &width_arg, &height_arg, &iso_arg, &output_arg, &transfer_function_arg, NULL }; struct arg arg; int width_set = 0, height_set = 0, iso_set = 0, output_set = 0, i; photon_noise_args->transfer_function = find_transfer_function(AOM_CICP_TC_SRGB); for (i = 1; i < argc; i += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &help_arg, argv + i)) { arg_show_usage(stdout, args); exit(EXIT_SUCCESS); } else if (arg_match(&arg, &width_arg, argv + i)) { photon_noise_args->width = arg_parse_int(&arg); width_set = 1; } else if (arg_match(&arg, &height_arg, argv + i)) { photon_noise_args->height = arg_parse_int(&arg); height_set = 1; } else if (arg_match(&arg, &iso_arg, argv + i)) { photon_noise_args->iso_setting = arg_parse_int(&arg); iso_set = 1; } else if (arg_match(&arg, &output_arg, argv + i)) { photon_noise_args->output_filename = arg.val; output_set = 1; } else if (arg_match(&arg, &transfer_function_arg, argv + i)) { const aom_transfer_characteristics_t tc = arg_parse_enum(&arg); photon_noise_args->transfer_function = find_transfer_function(tc); } else { fatal("unrecognized argument \"%s\", see --help for available options", argv[i]); } } if (!width_set) { fprintf(stderr, "Missing required parameter --width\n"); exit(EXIT_FAILURE); } if (!height_set) { fprintf(stderr, "Missing required parameter --height\n"); exit(EXIT_FAILURE); } if (!iso_set) { fprintf(stderr, "Missing required parameter --iso\n"); exit(EXIT_FAILURE); } if (!output_set) { fprintf(stderr, "Missing required parameter --output\n"); exit(EXIT_FAILURE); } } static float maxf(float a, float b) { return a > b ? a : b; } static float minf(float a, float b) { return a < b ? a : b; } static float gamma22_to_linear(float g) { return powf(g, 2.2f); } static float gamma22_from_linear(float l) { return powf(l, 1 / 2.2f); } static float gamma28_to_linear(float g) { return powf(g, 2.8f); } static float gamma28_from_linear(float l) { return powf(l, 1 / 2.8f); } static float srgb_to_linear(float srgb) { return srgb <= 0.04045f ? srgb / 12.92f : powf((srgb + 0.055f) / 1.055f, 2.4f); } static float srgb_from_linear(float linear) { return linear <= 0.0031308f ? 12.92f * linear : 1.055f * powf(linear, 1 / 2.4f) - 0.055f; } static const float kPqM1 = 2610.f / 16384; static const float kPqM2 = 128 * 2523.f / 4096; static const float kPqC1 = 3424.f / 4096; static const float kPqC2 = 32 * 2413.f / 4096; static const float kPqC3 = 32 * 2392.f / 4096; static float pq_to_linear(float pq) { const float pq_pow_inv_m2 = powf(pq, 1.f / kPqM2); return powf(maxf(0, pq_pow_inv_m2 - kPqC1) / (kPqC2 - kPqC3 * pq_pow_inv_m2), 1.f / kPqM1); } static float pq_from_linear(float linear) { const float linear_pow_m1 = powf(linear, kPqM1); return powf((kPqC1 + kPqC2 * linear_pow_m1) / (1 + kPqC3 * linear_pow_m1), kPqM2); } // Note: it is perhaps debatable whether “linear” for HLG should be scene light // or display light. Here, it is implemented in terms of display light assuming // a nominal peak display luminance of 1000 cd/m², hence the system γ of 1.2. 
To // make it scene light instead, the OOTF (powf(x, 1.2f)) and its inverse should // be removed from the functions below, and the .mid_tone should be replaced // with powf(26.f / 1000, 1 / 1.2f). static const float kHlgA = 0.17883277f; static const float kHlgB = 0.28466892f; static const float kHlgC = 0.55991073f; static float hlg_to_linear(float hlg) { // EOTF = OOTF ∘ OETF⁻¹ const float linear = hlg <= 0.5f ? hlg * hlg / 3 : (expf((hlg - kHlgC) / kHlgA) + kHlgB) / 12; return powf(linear, 1.2f); } static float hlg_from_linear(float linear) { // EOTF⁻¹ = OETF ∘ OOTF⁻¹ linear = powf(linear, 1.f / 1.2f); return linear <= 1.f / 12 ? sqrtf(3 * linear) : kHlgA * logf(12 * linear - kHlgB) + kHlgC; } static const transfer_function_t *find_transfer_function( aom_transfer_characteristics_t tc) { static const transfer_function_t kGamma22TransferFunction = { .to_linear = &gamma22_to_linear, .from_linear = &gamma22_from_linear, .mid_tone = 0.18f }, kGamma28TransferFunction = { .to_linear = &gamma28_to_linear, .from_linear = &gamma28_from_linear, .mid_tone = 0.18f }, kSRgbTransferFunction = { .to_linear = &srgb_to_linear, .from_linear = &srgb_from_linear, .mid_tone = 0.18f }, kPqTransferFunction = { .to_linear = &pq_to_linear, .from_linear = &pq_from_linear, // https://www.itu.int/pub/R-REP-BT.2408-4-2021 // page 6 (PDF page 8) .mid_tone = 26.f / 10000 }, kHlgTransferFunction = { .to_linear = &hlg_to_linear, .from_linear = &hlg_from_linear, .mid_tone = 26.f / 1000 }; switch (tc) { case AOM_CICP_TC_BT_470_M: return &kGamma22TransferFunction; case AOM_CICP_TC_BT_470_B_G: return &kGamma28TransferFunction; case AOM_CICP_TC_SRGB: return &kSRgbTransferFunction; case AOM_CICP_TC_SMPTE_2084: return &kPqTransferFunction; case AOM_CICP_TC_HLG: return &kHlgTransferFunction; default: fatal("unimplemented transfer function %d", tc); } } static void generate_photon_noise(const photon_noise_args_t *photon_noise_args, aom_film_grain_t *film_grain) { // Assumes a daylight-like spectrum. // https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s static const float kPhotonsPerLxSPerUm2 = 11260; // Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into // account. static const float kEffectiveQuantumEfficiency = 0.20f; // Also reasonable values for current cameras. The read noise is typically // higher than this at low ISO settings but it matters less there. static const float kPhotoResponseNonUniformity = 0.005f; static const float kInputReferredReadNoise = 1.5f; // Focal plane exposure for a mid-tone (typically a 18% reflectance card), in // lx·s. const float mid_tone_exposure = 10.f / photon_noise_args->iso_setting; // In microns. Assumes a 35mm sensor (36mm × 24mm). const float pixel_area_um2 = (36000 * 24000.f) / (photon_noise_args->width * photon_noise_args->height); const float mid_tone_electrons_per_pixel = kEffectiveQuantumEfficiency * kPhotonsPerLxSPerUm2 * mid_tone_exposure * pixel_area_um2; const float max_electrons_per_pixel = mid_tone_electrons_per_pixel / photon_noise_args->transfer_function->mid_tone; int i; film_grain->num_y_points = 14; for (i = 0; i < film_grain->num_y_points; ++i) { float x = i / (film_grain->num_y_points - 1.f); const float linear = photon_noise_args->transfer_function->to_linear(x); const float electrons_per_pixel = max_electrons_per_pixel * linear; // Quadrature sum of the relevant sources of noise, in electrons rms. 
Photon // shot noise is sqrt(electrons) so we can skip the square root and the // squaring. // https://en.wikipedia.org/wiki/Addition_in_quadrature // https://doi.org/10.1117/3.725073 const float noise_in_electrons = sqrtf(kInputReferredReadNoise * kInputReferredReadNoise + electrons_per_pixel + (kPhotoResponseNonUniformity * kPhotoResponseNonUniformity * electrons_per_pixel * electrons_per_pixel)); const float linear_noise = noise_in_electrons / max_electrons_per_pixel; const float linear_range_start = maxf(0.f, linear - 2 * linear_noise); const float linear_range_end = minf(1.f, linear + 2 * linear_noise); const float tf_slope = (photon_noise_args->transfer_function->from_linear(linear_range_end) - photon_noise_args->transfer_function->from_linear( linear_range_start)) / (linear_range_end - linear_range_start); float encoded_noise = linear_noise * tf_slope; x = roundf(255 * x); encoded_noise = minf(255.f, roundf(255 * 7.88f * encoded_noise)); film_grain->scaling_points_y[i][0] = (int)x; film_grain->scaling_points_y[i][1] = (int)encoded_noise; } film_grain->apply_grain = 1; film_grain->update_parameters = 1; film_grain->num_cb_points = 0; film_grain->num_cr_points = 0; film_grain->scaling_shift = 8; film_grain->ar_coeff_lag = 0; film_grain->ar_coeffs_cb[0] = 0; film_grain->ar_coeffs_cr[0] = 0; film_grain->ar_coeff_shift = 6; film_grain->cb_mult = 0; film_grain->cb_luma_mult = 0; film_grain->cb_offset = 0; film_grain->cr_mult = 0; film_grain->cr_luma_mult = 0; film_grain->cr_offset = 0; film_grain->overlap_flag = 1; film_grain->random_seed = 7391; film_grain->chroma_scaling_from_luma = 0; } int main(int argc, char **argv) { photon_noise_args_t photon_noise_args; aom_film_grain_table_t film_grain_table; aom_film_grain_t film_grain; struct aom_internal_error_info error_info; memset(&photon_noise_args, 0, sizeof(photon_noise_args)); memset(&film_grain_table, 0, sizeof(film_grain_table)); memset(&film_grain, 0, sizeof(film_grain)); memset(&error_info, 0, sizeof(error_info)); exec_name = argv[0]; parse_args(argc, argv, &photon_noise_args); generate_photon_noise(&photon_noise_args, &film_grain); aom_film_grain_table_append(&film_grain_table, 0, 9223372036854775807ull, &film_grain); if (aom_film_grain_table_write(&film_grain_table, photon_noise_args.output_filename, &error_info) != AOM_CODEC_OK) { aom_film_grain_table_free(&film_grain_table); fprintf(stderr, "Failed to write film grain table"); if (error_info.has_detail) { fprintf(stderr, ": %s", error_info.detail); } fprintf(stderr, "\n"); return EXIT_FAILURE; } aom_film_grain_table_free(&film_grain_table); return EXIT_SUCCESS; } aom-3.12.1/examples/scalable_decoder.c000066400000000000000000000154711477627663500176260ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Scalable Decoder // ============== // // This is an example of a scalable decoder loop. It takes a 2-spatial-layer // input file // containing the compressed data (in OBU format), passes it through the // decoder, and writes the decompressed frames to disk. 
The base layer and // enhancement layers are stored as separate files, out_lyr0.yuv and // out_lyr1.yuv, respectively. // // Standard Includes // ----------------- // For decoders, you only have to include `aom_decoder.h` and then any // header files for the specific codecs you use. In this case, we're using // av1. // // Initializing The Codec // ---------------------- // The libaom decoder is initialized by the call to aom_codec_dec_init(). // Determining the codec interface to use is handled by AvxVideoReader and the // functions prefixed with aom_video_reader_. Discussion of those functions is // beyond the scope of this example, but the main gist is to open the input file // and parse just enough of it to determine if it's a AVx file and which AVx // codec is contained within the file. // Note the NULL pointer passed to aom_codec_dec_init(). We do that in this // example because we want the algorithm to determine the stream configuration // (width/height) and allocate memory automatically. // // Decoding A Frame // ---------------- // Once the frame has been read into memory, it is decoded using the // `aom_codec_decode` function. The call takes a pointer to the data // (`frame`) and the length of the data (`frame_size`). No application data // is associated with the frame in this example, so the `user_priv` // parameter is NULL. The `deadline` parameter is left at zero for this // example. This parameter is generally only used when doing adaptive post // processing. // // Codecs may produce a variable number of output frames for every call to // `aom_codec_decode`. These frames are retrieved by the // `aom_codec_get_frame` iterator function. The iterator variable `iter` is // initialized to NULL each time `aom_codec_decode` is called. // `aom_codec_get_frame` is called in a loop, returning a pointer to a // decoded image or NULL to indicate the end of list. // // Processing The Decoded Data // --------------------------- // In this example, we simply write the encoded data to disk. It is // important to honor the image's `stride` values. // // Cleanup // ------- // The `aom_codec_destroy` call frees any memory allocated by the codec. // // Error Handling // -------------- // This example does not special case any error return codes. If there was // an error, a descriptive message is printed and the program exits. With // few exceptions, aom_codec functions return an enumerated error status, // with the value `0` indicating success. 
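//
// Illustrative sketch (not part of the original example, fenced off with
// "#if 0" so it is never compiled): the decode/iterate pattern described
// above, reduced to its core. It assumes `codec`, `buf` and `bytes_in_buffer`
// as they are set up in main() below.
#if 0
  if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL))
    die_codec(&codec, "Failed to decode frame.");
  aom_codec_iter_t iter = NULL;  // must be reset before each decode call
  aom_image_t *img;
  while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
    // Each decoded image carries its spatial layer in img->spatial_id;
    // route it to the matching output file here.
  }
#endif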
#include #include #include #include "aom/aom_decoder.h" #include "aom/aomdx.h" #include "common/obudec.h" #include "common/tools_common.h" #include "common/video_reader.h" static const char *exec_name; #define MAX_LAYERS 5 void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } int main(int argc, char **argv) { int frame_cnt = 0; FILE *outfile[MAX_LAYERS]; char filename[80]; FILE *inputfile = NULL; uint8_t *buf = NULL; size_t bytes_in_buffer = 0; size_t buffer_size = 0; struct AvxInputContext aom_input_ctx; struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 }; aom_codec_stream_info_t si; uint8_t tmpbuf[32]; unsigned int i; exec_name = argv[0]; if (argc != 2) die("Invalid number of arguments."); if (!(inputfile = fopen(argv[1], "rb"))) die("Failed to open %s for read.", argv[1]); obu_ctx.avx_ctx->file = inputfile; obu_ctx.avx_ctx->filename = argv[1]; aom_codec_iface_t *decoder = get_aom_decoder_by_index(0); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) { die_codec(&codec, "Failed to set output_all_layers control."); } // peak sequence header OBU to get number of spatial layers const size_t ret = fread(tmpbuf, 1, 32, inputfile); if (ret != 32) die_codec(&codec, "Input is not a valid obu file"); si.is_annexb = 0; if (aom_codec_peek_stream_info(decoder, tmpbuf, 32, &si)) { die_codec(&codec, "Input is not a valid obu file"); } fseek(inputfile, -32, SEEK_CUR); if (!file_is_obu(&obu_ctx)) die_codec(&codec, "Input is not a valid obu file"); // open base layer output yuv file snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0); if (!(outfile[0] = fopen(filename, "wb"))) die("Failed top open output for writing."); // open any enhancement layer output yuv files for (i = 1; i < si.number_spatial_layers; i++) { snprintf(filename, sizeof(filename), "out_lyr%u.yuv", i); if (!(outfile[i] = fopen(filename, "wb"))) die("Failed to open output for writing."); } while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer, &buffer_size)) { aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL)) die_codec(&codec, "Failed to decode frame."); while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { aom_image_t *img_shifted = aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16); img_shifted->bit_depth = 8; aom_img_downshift(img_shifted, img, img->bit_depth - img_shifted->bit_depth); if (img->spatial_id == 0) { printf("Writing base layer 0 %d\n", frame_cnt); aom_img_write(img_shifted, outfile[0]); } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) { printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt); aom_img_write(img_shifted, outfile[img->spatial_id]); } else { die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count"); } if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt; } } printf("Processed %d frames.\n", frame_cnt); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]); fclose(inputfile); return EXIT_SUCCESS; } aom-3.12.1/examples/scalable_encoder.c000066400000000000000000000246551477627663500176440ustar00rootroot00000000000000/* * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 
* * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Scalable Encoder // ============== // // This is an example of a scalable encoder loop. It takes two input files in // YV12 format, passes it through the encoder, and writes the compressed // frames to disk in OBU format. // // Getting The Default Configuration // --------------------------------- // Encoders have the notion of "usage profiles." For example, an encoder // may want to publish default configurations for both a video // conferencing application and a best quality offline encoder. These // obviously have very different default settings. Consult the // documentation for your codec to see if it provides any default // configurations. All codecs provide a default configuration, number 0, // which is valid for material in the vacinity of QCIF/QVGA. // // Updating The Configuration // --------------------------------- // Almost all applications will want to update the default configuration // with settings specific to their usage. Here we set the width and height // of the video file to that specified on the command line. We also scale // the default bitrate based on the ratio between the default resolution // and the resolution specified on the command line. // // Encoding A Frame // ---------------- // The frame is read as a continuous block (size = width * height * 3 / 2) // from the input file. If a frame was read (the input file has not hit // EOF) then the frame is passed to the encoder. Otherwise, a NULL // is passed, indicating the End-Of-Stream condition to the encoder. The // `frame_cnt` is reused as the presentation time stamp (PTS) and each // frame is shown for one frame-time in duration. The flags parameter is // unused in this example. // Forced Keyframes // ---------------- // Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the // flags passed to `aom_codec_control()`. In this example, we force a // keyframe every frames. Note, the output stream can // contain additional keyframes beyond those that have been forced using the // AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the // encoder. // // Processing The Encoded Data // --------------------------- // Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data // for this frame. We write a IVF frame header, followed by the raw data. // // Cleanup // ------- // The `aom_codec_destroy` call frees any memory allocated by the codec. // // Error Handling // -------------- // This example does not special case any error return codes. If there was // an error, a descriptive message is printed and the program exits. With // few exeptions, aom_codec functions return an enumerated error status, // with the value `0` indicating success. 
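//
// Illustrative sketch (not part of the original example, fenced off with
// "#if 0"): the forced-keyframe logic described above. Note that the
// AOM_EFLAG_FORCE_KF bit is passed in the flags argument of
// aom_codec_encode(), as done by encode_frame() below; `keyframe_interval`,
// `frames_encoded`, `frame_count`, `codec` and `raw0` are the variables
// declared in main().
#if 0
  int flags = 0;
  if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0)
    flags |= AOM_EFLAG_FORCE_KF;  // request a keyframe for this frame
  if (aom_codec_encode(&codec, &raw0, frame_count, 1, flags) != AOM_CODEC_OK)
    die_codec(&codec, "Failed to encode frame");
#endif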
#include #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "av1/common/enums.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n" "See comments in scalable_encoder.c for more information.\n", exec_name); exit(EXIT_FAILURE); } static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, int frame_index, int flags, FILE *outfile) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, flags); if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) != pkt->data.frame.sz) { die_codec(codec, "Failed to write compressed frame"); } printf(keyframe ? "K" : "."); printf(" %6d\n", (int)pkt->data.frame.sz); fflush(stdout); } } return got_pkts; } int main(int argc, char **argv) { FILE *infile0 = NULL; FILE *infile1 = NULL; aom_codec_enc_cfg_t cfg; int frame_count = 0; aom_image_t raw0, raw1; aom_codec_err_t res; AvxVideoInfo info; const int fps = 30; const int bitrate = 200; int keyframe_interval = 0; int max_frames = 0; int frames_encoded = 0; const char *codec_arg = NULL; const char *width_arg = NULL; const char *height_arg = NULL; const char *infile0_arg = NULL; const char *infile1_arg = NULL; const char *outfile_arg = NULL; // const char *keyframe_interval_arg = NULL; FILE *outfile = NULL; exec_name = argv[0]; // Clear explicitly, as simply assigning "{ 0 }" generates // "missing-field-initializers" warning in some compilers. 
memset(&info, 0, sizeof(info)); if (argc != 8) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; height_arg = argv[3]; infile0_arg = argv[4]; infile1_arg = argv[5]; outfile_arg = argv[6]; max_frames = (int)strtol(argv[7], NULL, 0); aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); if (!encoder) die("Unsupported codec."); info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = (int)strtol(width_arg, NULL, 0); info.frame_height = (int)strtol(height_arg, NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; if (info.frame_width <= 0 || info.frame_height <= 0 || (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image for layer 0."); } if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image for layer 1."); } // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); keyframe_interval = 100; if (keyframe_interval < 0) die("Invalid keyframe interval value."); printf("Using %s\n", aom_codec_iface_name(encoder)); aom_codec_ctx_t codec; res = aom_codec_enc_config_default(encoder, &cfg, 0); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; cfg.rc_target_bitrate = bitrate; cfg.g_error_resilient = 0; cfg.g_lag_in_frames = 0; cfg.rc_end_usage = AOM_Q; cfg.save_as_annexb = 0; outfile = fopen(outfile_arg, "wb"); if (!outfile) die("Failed to open %s for writing.", outfile_arg); if (!(infile0 = fopen(infile0_arg, "rb"))) die("Failed to open %s for reading.", infile0_arg); if (!(infile1 = fopen(infile1_arg, "rb"))) die("Failed to open %s for reading.", infile0_arg); if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8)) die_codec(&codec, "Failed to set cpu to 8"); if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2)) die_codec(&codec, "Failed to set tile columns to 2"); if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3)) die_codec(&codec, "Failed to set num of tile groups to 3"); if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2)) die_codec(&codec, "Failed to set number of spatial layers to 2"); // Encode frames. 
while (aom_img_read(&raw0, infile0)) { int flags = 0; // configure and encode base layer if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0) flags |= AOM_EFLAG_FORCE_KF; else // use previous base layer (LAST) as sole reference // save this frame as LAST to be used as reference by enhanmcent layer // and next base layer flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY; cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; if (aom_codec_enc_config_set(&codec, &cfg)) die_codec(&codec, "Failed to set enc cfg for layer 0"); if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0)) die_codec(&codec, "Failed to set layer id to 0"); if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62)) die_codec(&codec, "Failed to set cq level"); encode_frame(&codec, &raw0, frame_count++, flags, outfile); // configure and encode enhancement layer // use LAST (base layer) as sole reference flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY; cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; aom_img_read(&raw1, infile1); if (aom_codec_enc_config_set(&codec, &cfg)) die_codec(&codec, "Failed to set enc cfg for layer 1"); if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1)) die_codec(&codec, "Failed to set layer id to 1"); if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10)) die_codec(&codec, "Failed to set cq level"); encode_frame(&codec, &raw1, frame_count++, flags, outfile); frames_encoded++; if (max_frames > 0 && frames_encoded >= max_frames) break; } // Flush encoder. while (encode_frame(&codec, NULL, -1, 0, outfile)) continue; printf("\n"); fclose(infile0); fclose(infile1); printf("Processed %d frames.\n", frame_count / 2); aom_img_free(&raw0); aom_img_free(&raw1); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/set_maps.c000066400000000000000000000153631477627663500162060ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // AOM Set Active and ROI Maps // =========================== // // This is an example demonstrating how to control the AOM encoder's // ROI and Active maps. // // ROI (Region of Interest) maps are a way for the application to assign // each macroblock in the image to a region, and then set quantizer and // filtering parameters on that image. // // Active maps are a way for the application to specify on a // macroblock-by-macroblock basis whether there is any activity in that // macroblock. // // // Configuration // ------------- // An ROI map is set on frame 22. 
If the width of the image in macroblocks // is evenly divisible by 4, then the output will appear to have distinct // columns, where the quantizer, loopfilter, and static threshold differ // from column to column. // // An active map is set on frame 33. If the width of the image in macroblocks // is evenly divisible by 4, then the output will appear to have distinct // columns, where one column will have motion and the next will not. // // The active map is cleared on frame 44. // // Observing The Effects // --------------------- // Use the `simple_decoder` example to decode this sample, and observe // the change in the image at frames 22, 33, and 44. #include #include #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } static void set_active_map(const aom_codec_enc_cfg_t *cfg, aom_codec_ctx_t *codec) { unsigned int i; aom_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; map.active_map = (uint8_t *)malloc(map.rows * map.cols); if (!map.active_map) die("Failed to allocate active map"); for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) die_codec(codec, "Failed to set active map"); free(map.active_map); } static void unset_active_map(const aom_codec_enc_cfg_t *cfg, aom_codec_ctx_t *codec) { aom_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; map.active_map = NULL; if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) die_codec(codec, "Failed to set active map"); } static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, int frame_index, AvxVideoWriter *writer) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0); if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); } printf(keyframe ? 
"K" : "."); fflush(stdout); } } return got_pkts; } int main(int argc, char **argv) { FILE *infile = NULL; aom_codec_ctx_t codec; aom_codec_enc_cfg_t cfg; int frame_count = 0; const int limit = 10; aom_image_t raw; aom_codec_err_t res; AvxVideoInfo info; AvxVideoWriter *writer = NULL; const int fps = 2; // TODO(dkovalev) add command line argument const double bits_per_pixel_per_frame = 0.067; #if CONFIG_REALTIME_ONLY const int usage = 1; const int speed = 7; #else const int usage = 0; const int speed = 2; #endif exec_name = argv[0]; if (argc != 6) die("Invalid number of arguments"); memset(&info, 0, sizeof(info)); aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(argv[1]); if (encoder == NULL) { die("Unsupported codec."); } assert(encoder != NULL); info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = (int)strtol(argv[2], NULL, 0); info.frame_height = (int)strtol(argv[3], NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; if (info.frame_width <= 0 || info.frame_height <= 0 || (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", aom_codec_iface_name(encoder)); res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; cfg.rc_target_bitrate = (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); cfg.g_lag_in_frames = 0; writer = aom_video_writer_open(argv[5], kContainerIVF, &info); if (!writer) die("Failed to open %s for writing.", argv[5]); if (!(infile = fopen(argv[4], "rb"))) die("Failed to open %s for reading.", argv[4]); if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. while (aom_img_read(&raw, infile) && frame_count < limit) { ++frame_count; if (frame_count == 5) { set_active_map(&cfg, &codec); } else if (frame_count == 9) { unset_active_map(&cfg, &codec); } encode_frame(&codec, &raw, frame_count, writer); } // Flush encoder. while (encode_frame(&codec, NULL, -1, writer)) { } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); aom_img_free(&raw); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); aom_video_writer_close(writer); return EXIT_SUCCESS; } aom-3.12.1/examples/simple_decoder.c000066400000000000000000000124211477627663500173410ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Simple Decoder // ============== // // This is an example of a simple decoder loop. 
It takes an input file // containing the compressed data (in IVF format), passes it through the // decoder, and writes the decompressed frames to disk. Other decoder // examples build upon this one. // // The details of the IVF format have been elided from this example for // simplicity of presentation, as IVF files will not generally be used by // your application. In general, an IVF file consists of a file header, // followed by a variable number of frames. Each frame consists of a frame // header followed by a variable length payload. The length of the payload // is specified in the first four bytes of the frame header. The payload is // the raw compressed data. // // Standard Includes // ----------------- // For decoders, you only have to include `aom_decoder.h` and then any // header files for the specific codecs you use. In this case, we're using // aom. // // Initializing The Codec // ---------------------- // The libaom decoder is initialized by the call to aom_codec_dec_init(). // Determining the codec interface to use is handled by AvxVideoReader and the // functions prefixed with aom_video_reader_. Discussion of those functions is // beyond the scope of this example, but the main gist is to open the input file // and parse just enough of it to determine if it's a AVx file and which AVx // codec is contained within the file. // Note the NULL pointer passed to aom_codec_dec_init(). We do that in this // example because we want the algorithm to determine the stream configuration // (width/height) and allocate memory automatically. // // Decoding A Frame // ---------------- // Once the frame has been read into memory, it is decoded using the // `aom_codec_decode` function. The call takes a pointer to the data // (`frame`) and the length of the data (`frame_size`). No application data // is associated with the frame in this example, so the `user_priv` // parameter is NULL. // // Codecs may produce a variable number of output frames for every call to // `aom_codec_decode`. These frames are retrieved by the // `aom_codec_get_frame` iterator function. The iterator variable `iter` is // initialized to NULL each time `aom_codec_decode` is called. // `aom_codec_get_frame` is called in a loop, returning a pointer to a // decoded image or NULL to indicate the end of list. // // Processing The Decoded Data // --------------------------- // In this example, we simply write the encoded data to disk. It is // important to honor the image's `stride` values. // // Cleanup // ------- // The `aom_codec_destroy` call frees any memory allocated by the codec. // // Error Handling // -------------- // This example does not special case any error return codes. If there was // an error, a descriptive message is printed and the program exits. With // few exceptions, aom_codec functions return an enumerated error status, // with the value `0` indicating success. 
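//
// Illustrative sketch (not part of the original example, fenced off with
// "#if 0"): what "honoring the stride" means for one 8-bit plane. The actual
// writing is done by aom_img_write() in common/tools_common.c; `img` and
// `outfile` are the variables used in main() below.
#if 0
  const unsigned char *plane = img->planes[AOM_PLANE_Y];
  const int stride = img->stride[AOM_PLANE_Y];
  const int w = aom_img_plane_width(img, AOM_PLANE_Y);
  const int h = aom_img_plane_height(img, AOM_PLANE_Y);
  for (int y = 0; y < h; ++y) {
    fwrite(plane, 1, w, outfile);  // write only the visible samples ...
    plane += stride;               // ... but advance by the padded row size
  }
#endif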
#include #include #include #include "aom/aom_decoder.h" #include "common/tools_common.h" #include "common/video_reader.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s \n", exec_name); exit(EXIT_FAILURE); } int main(int argc, char **argv) { int frame_cnt = 0; FILE *outfile = NULL; AvxVideoReader *reader = NULL; const AvxVideoInfo *info = NULL; exec_name = argv[0]; if (argc != 3) die("Invalid number of arguments."); reader = aom_video_reader_open(argv[1]); if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); info = aom_video_reader_get_info(reader); aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); if (!decoder) die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder)); aom_codec_ctx_t codec; if (aom_codec_dec_init(&codec, decoder, NULL, 0)) die("Failed to initialize decoder."); while (aom_video_reader_read_frame(reader)) { aom_codec_iter_t iter = NULL; aom_image_t *img = NULL; size_t frame_size = 0; const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size); if (aom_codec_decode(&codec, frame, frame_size, NULL)) die_codec(&codec, "Failed to decode frame."); while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { aom_img_write(img, outfile); ++frame_cnt; } } printf("Processed %d frames.\n", frame_cnt); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); aom_video_reader_close(reader); fclose(outfile); return EXIT_SUCCESS; } aom-3.12.1/examples/simple_encoder.c000066400000000000000000000224501477627663500173560ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Simple Encoder // ============== // // This is an example of a simple encoder loop. It takes an input file in // YV12 format, passes it through the encoder, and writes the compressed // frames to disk in IVF format. Other decoder examples build upon this // one. // // The details of the IVF format have been elided from this example for // simplicity of presentation, as IVF files will not generally be used by // your application. In general, an IVF file consists of a file header, // followed by a variable number of frames. Each frame consists of a frame // header followed by a variable length payload. The length of the payload // is specified in the first four bytes of the frame header. The payload is // the raw compressed data. // // Standard Includes // ----------------- // For encoders, you only have to include `aom_encoder.h` and then any // header files for the specific codecs you use. In this case, we're using // aom. // // Getting The Default Configuration // --------------------------------- // Encoders have the notion of "usage profiles." For example, an encoder // may want to publish default configurations for both a video // conferencing application and a best quality offline encoder. 
These // obviously have very different default settings. Consult the // documentation for your codec to see if it provides any default // configurations. All codecs provide a default configuration, number 0, // which is valid for material in the vacinity of QCIF/QVGA. // // Updating The Configuration // --------------------------------- // Almost all applications will want to update the default configuration // with settings specific to their usage. Here we set the width and height // of the video file to that specified on the command line. We also scale // the default bitrate based on the ratio between the default resolution // and the resolution specified on the command line. // // Initializing The Codec // ---------------------- // The encoder is initialized by the following code. // // Encoding A Frame // ---------------- // The frame is read as a continuous block (size width * height * 3 / 2) // from the input file. If a frame was read (the input file has not hit // EOF) then the frame is passed to the encoder. Otherwise, a NULL // is passed, indicating the End-Of-Stream condition to the encoder. The // `frame_cnt` is reused as the presentation time stamp (PTS) and each // frame is shown for one frame-time in duration. The flags parameter is // unused in this example. // Forced Keyframes // ---------------- // Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the // flags passed to `aom_codec_control()`. In this example, we force a // keyframe every frames. Note, the output stream can // contain additional keyframes beyond those that have been forced using the // AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the // encoder. // // Processing The Encoded Data // --------------------------- // Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data // for this frame. We write a IVF frame header, followed by the raw data. // // Cleanup // ------- // The `aom_codec_destroy` call frees any memory allocated by the codec. // // Error Handling // -------------- // This example does not special case any error return codes. If there was // an error, a descriptive message is printed and the program exits. With // few exeptions, aom_codec functions return an enumerated error status, // with the value `0` indicating success. // // Error Resiliency Features // ------------------------- // Error resiliency is controlled by the g_error_resilient member of the // configuration structure. Use the `decode_with_drops` example to decode with // frames 5-10 dropped. Compare the output for a file encoded with this example // versus one encoded with the `simple_encoder` example. 
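//
// Illustrative sketch (not part of the original example, fenced off with
// "#if 0"): the end-of-stream flush described above. Passing a NULL image
// tells the encoder no more input is coming; the call is repeated until no
// more packets are returned. encode_frame() below wraps exactly this pattern.
#if 0
  int got_pkts;
  do {
    got_pkts = 0;
    if (aom_codec_encode(&codec, NULL, -1, 1, 0) != AOM_CODEC_OK)
      die_codec(&codec, "Failed to flush encoder");
    aom_codec_iter_t iter = NULL;
    const aom_codec_cx_pkt_t *pkt;
    while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
      got_pkts = 1;  // AOM_CODEC_CX_FRAME_PKT packets still carry frame data
    }
  } while (got_pkts);
#endif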
#include #include #include #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n" "See comments in simple_encoder.c for more information.\n", exec_name); exit(EXIT_FAILURE); } static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, int frame_index, int flags, AvxVideoWriter *writer) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, flags); if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); } printf(keyframe ? "K" : "."); fflush(stdout); } } return got_pkts; } // TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps. int main(int argc, char **argv) { FILE *infile = NULL; aom_codec_ctx_t codec; aom_codec_enc_cfg_t cfg; int frame_count = 0; aom_image_t raw; aom_codec_err_t res; AvxVideoInfo info; AvxVideoWriter *writer = NULL; const int fps = 30; const int bitrate = 200; int keyframe_interval = 0; int max_frames = 0; int frames_encoded = 0; const char *codec_arg = NULL; const char *width_arg = NULL; const char *height_arg = NULL; const char *infile_arg = NULL; const char *outfile_arg = NULL; const char *keyframe_interval_arg = NULL; #if CONFIG_REALTIME_ONLY const int usage = 1; const int speed = 7; #else const int usage = 0; const int speed = 2; #endif exec_name = argv[0]; // Clear explicitly, as simply assigning "{ 0 }" generates // "missing-field-initializers" warning in some compilers. 
memset(&info, 0, sizeof(info)); if (argc != 9) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; height_arg = argv[3]; infile_arg = argv[4]; outfile_arg = argv[5]; keyframe_interval_arg = argv[6]; max_frames = (int)strtol(argv[8], NULL, 0); aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); if (!encoder) die("Unsupported codec."); info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = (int)strtol(width_arg, NULL, 0); info.frame_height = (int)strtol(height_arg, NULL, 0); info.time_base.numerator = 1; info.time_base.denominator = fps; if (info.frame_width <= 0 || info.frame_height <= 0 || (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, info.frame_height, 1)) { die("Failed to allocate image."); } keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); if (keyframe_interval < 0) die("Invalid keyframe interval value."); printf("Using %s\n", aom_codec_iface_name(encoder)); res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; cfg.rc_target_bitrate = bitrate; cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0); writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. while (aom_img_read(&raw, infile)) { int flags = 0; if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) flags |= AOM_EFLAG_FORCE_KF; encode_frame(&codec, &raw, frame_count++, flags, writer); frames_encoded++; if (max_frames > 0 && frames_encoded >= max_frames) break; } // Flush encoder. while (encode_frame(&codec, NULL, -1, 0, writer)) continue; printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); aom_img_free(&raw); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); aom_video_writer_close(writer); return EXIT_SUCCESS; } aom-3.12.1/examples/svc_encoder_rtc.cc000066400000000000000000003041521477627663500176750ustar00rootroot00000000000000/* * Copyright (c) 2019, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // This is an example demonstrating how to implement a multi-layer AOM // encoding scheme for RTC video applications. 
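//
// Illustrative sketch (not part of the original example, fenced off with
// "#if 0"): the per-frame SVC signalling this example is built around. The
// exact layer ids, reference slots and refresh decisions come from
// set_layer_pattern() further below and depend on the layering mode; the
// names `codec`, `raw`, `pts`, `slx` and `superframe_cnt` are placeholders
// for the variables used in main().
#if 0
  aom_svc_layer_id_t layer_id = {};
  layer_id.spatial_layer_id = slx;                  // current spatial layer
  layer_id.temporal_layer_id = superframe_cnt % 2;  // e.g. 2 temporal layers
  aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);

  aom_svc_ref_frame_config_t ref_cfg = {};
  ref_cfg.reference[SVC_LAST_FRAME] = 1;  // predict only from LAST
  ref_cfg.refresh[0] = 1;                 // refresh buffer slot 0
  aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);

  aom_codec_encode(&codec, &raw, pts, 1, 0);
#endif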
#include #include #include #include #include #include #include #include #include "config/aom_config.h" #if CONFIG_AV1_DECODER #include "aom/aom_decoder.h" #endif #include "aom/aom_encoder.h" #include "aom/aom_image.h" #include "aom/aom_integer.h" #include "aom/aomcx.h" #include "aom_dsp/bitwriter_buffer.h" #include "aom_ports/aom_timer.h" #include "av1/ratectrl_rtc.h" #include "common/args.h" #include "common/tools_common.h" #include "common/video_writer.h" #include "examples/encoder_util.h" #include "examples/multilayer_metadata.h" #define OPTION_BUFFER_SIZE 1024 #define MAX_NUM_SPATIAL_LAYERS 4 typedef struct { const char *output_filename; char options[OPTION_BUFFER_SIZE]; struct AvxInputContext input_ctx[MAX_NUM_SPATIAL_LAYERS]; int speed; int aq_mode; int layering_mode; int output_obu; int decode; int tune_content; int show_psnr; bool use_external_rc; bool scale_factors_explicitly_set; const char *multilayer_metadata_file; } AppInput; typedef enum { QUANTIZER = 0, BITRATE, SCALE_FACTOR, AUTO_ALT_REF, ALL_OPTION_TYPES } LAYER_OPTION_TYPE; static const arg_def_t outputfile = ARG_DEF("o", "output", 1, "Output filename"); static const arg_def_t frames_arg = ARG_DEF("f", "frames", 1, "Number of frames to encode"); static const arg_def_t threads_arg = ARG_DEF("th", "threads", 1, "Number of threads to use"); static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Source width"); static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Source height"); static const arg_def_t timebase_arg = ARG_DEF("t", "timebase", 1, "Timebase (num/den)"); static const arg_def_t bitrate_arg = ARG_DEF( "b", "target-bitrate", 1, "Encoding bitrate, in kilobits per second"); static const arg_def_t spatial_layers_arg = ARG_DEF("sl", "spatial-layers", 1, "Number of spatial SVC layers"); static const arg_def_t temporal_layers_arg = ARG_DEF("tl", "temporal-layers", 1, "Number of temporal SVC layers"); static const arg_def_t layering_mode_arg = ARG_DEF("lm", "layering-mode", 1, "Temporal layering scheme."); static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "Number of frames between keyframes"); static const arg_def_t scale_factors_arg = ARG_DEF("r", "scale-factors", 1, "Scale factors (lowest to highest layer)"); static const arg_def_t min_q_arg = ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); static const arg_def_t max_q_arg = ARG_DEF(NULL, "max-q", 1, "Maximum quantizer"); static const arg_def_t speed_arg = ARG_DEF("sp", "speed", 1, "Speed configuration"); static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "AQ mode off/on"); static const arg_def_t bitrates_arg = ARG_DEF("bl", "bitrates", 1, "Bitrates[spatial_layer * num_temporal_layer + temporal_layer]"); static const arg_def_t dropframe_thresh_arg = ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); static const arg_def_t error_resilient_arg = ARG_DEF(NULL, "error-resilient", 1, "Error resilient flag"); static const arg_def_t output_obu_arg = ARG_DEF(NULL, "output-obu", 1, "Write OBUs when set to 1. Otherwise write IVF files."); static const arg_def_t test_decode_arg = ARG_DEF(NULL, "test-decode", 1, "Attempt to test decoding the output when set to 1. 
Default is 1."); static const arg_def_t psnr_arg = ARG_DEF(NULL, "psnr", -1, "Show PSNR in status line."); static const arg_def_t ext_rc_arg = ARG_DEF(NULL, "use-ext-rc", 0, "Use external rate control."); static const struct arg_enum_list tune_content_enum[] = { { "default", AOM_CONTENT_DEFAULT }, { "screen", AOM_CONTENT_SCREEN }, { "film", AOM_CONTENT_FILM }, { NULL, 0 } }; static const arg_def_t tune_content_arg = ARG_DEF_ENUM( NULL, "tune-content", 1, "Tune content type", tune_content_enum); #if CONFIG_CWG_E050 static const arg_def_t multilayer_metadata_file_arg = ARG_DEF("ml", "multilayer_metadata_file", 1, "Experimental: path to multilayer metadata file"); #endif #if CONFIG_AV1_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { NULL, 0 } }; static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( "d", "bit-depth", 1, "Bit depth for codec 8 or 10. ", bitdepth_enum); #endif // CONFIG_AV1_HIGHBITDEPTH static const arg_def_t *svc_args[] = { &frames_arg, &outputfile, &width_arg, &height_arg, &timebase_arg, &bitrate_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, &min_q_arg, &max_q_arg, &temporal_layers_arg, &layering_mode_arg, &threads_arg, &aqmode_arg, #if CONFIG_AV1_HIGHBITDEPTH &bitdepth_arg, #endif &speed_arg, &bitrates_arg, &dropframe_thresh_arg, &error_resilient_arg, &output_obu_arg, &test_decode_arg, &tune_content_arg, &psnr_arg, #if CONFIG_CWG_E050 &multilayer_metadata_file_arg, #endif NULL, }; #define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s input_filename [input_filename ...] -o " "output_filename\n", exec_name); fprintf(stderr, "Options:\n"); arg_show_usage(stderr, svc_args); fprintf( stderr, "Input files must be y4m or yuv.\n" "If multiple input files are specified, they correspond to spatial " "layers, and there should be as many as there are spatial layers.\n" "All input files must have the same width, height, frame rate and number " "of frames.\n" "If only one file is specified, it is used for all spatial layers.\n"); exit(EXIT_FAILURE); } static int file_is_y4m(const char detect[4]) { return memcmp(detect, "YUV4", 4) == 0; } static int fourcc_is_ivf(const char detect[4]) { if (memcmp(detect, "DKIF", 4) == 0) { return 1; } return 0; } static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX, 1 }; static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 }; static void open_input_file(struct AvxInputContext *input, aom_chroma_sample_position_t csp) { /* Parse certain options from the input file, if possible */ input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") : set_binary_mode(stdin); if (!input->file) fatal("Failed to open input file"); if (!fseeko(input->file, 0, SEEK_END)) { /* Input file is seekable. Figure out how long it is, so we can get * progress info. */ input->length = ftello(input->file); rewind(input->file); } /* Default to 1:1 pixel aspect ratio. */ input->pixel_aspect_ratio.numerator = 1; input->pixel_aspect_ratio.denominator = 1; /* For RAW input sources, these bytes will applied on the first frame * in read_frame(). 
*/ input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); input->detect.position = 0; if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, input->only_i420) >= 0) { input->file_type = FILE_TYPE_Y4M; input->width = input->y4m.pic_w; input->height = input->y4m.pic_h; input->pixel_aspect_ratio.numerator = input->y4m.par_n; input->pixel_aspect_ratio.denominator = input->y4m.par_d; input->framerate.numerator = input->y4m.fps_n; input->framerate.denominator = input->y4m.fps_d; input->fmt = input->y4m.aom_fmt; input->bit_depth = static_cast(input->y4m.bit_depth); } else { fatal("Unsupported Y4M stream."); } } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { fatal("IVF is not supported as input."); } else { input->file_type = FILE_TYPE_RAW; } } static aom_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return AOM_CODEC_INVALID_PARAM; *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || *value0 > option_max_values[SCALE_FACTOR] || *value1 > option_max_values[SCALE_FACTOR] || *value0 > *value1) // num shouldn't be greater than den return AOM_CODEC_INVALID_PARAM; } else { *value0 = atoi(input); if (*value0 < option_min_values[type] || *value0 > option_max_values[type]) return AOM_CODEC_INVALID_PARAM; } return AOM_CODEC_OK; } static aom_codec_err_t parse_layer_options_from_string( aom_svc_params_t *svc_params, LAYER_OPTION_TYPE type, const char *input, int *option0, int *option1) { aom_codec_err_t res = AOM_CODEC_OK; char *input_string; char *token; const char *delim = ","; int num_layers = svc_params->number_spatial_layers; int i = 0; if (type == BITRATE) num_layers = svc_params->number_spatial_layers * svc_params->number_temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) return AOM_CODEC_INVALID_PARAM; const size_t input_length = strlen(input); input_string = reinterpret_cast(malloc(input_length + 1)); if (input_string == NULL) return AOM_CODEC_MEM_ERROR; memcpy(input_string, input, input_length + 1); token = strtok(input_string, delim); // NOLINT for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != AOM_CODEC_OK) break; token = strtok(NULL, delim); // NOLINT } else { res = AOM_CODEC_INVALID_PARAM; break; } } free(input_string); return res; } static void parse_command_line(int argc, const char **argv_, AppInput *app_input, aom_svc_params_t *svc_params, aom_codec_enc_cfg_t *enc_cfg) { struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; char string_options[1024] = { 0 }; // Default settings svc_params->number_spatial_layers = 1; svc_params->number_temporal_layers = 1; app_input->layering_mode = 0; app_input->output_obu = 0; app_input->decode = 1; enc_cfg->g_threads = 1; enc_cfg->rc_end_usage = AOM_CBR; // process command line options argv = argv_dup(argc - 1, argv_ + 1); if (!argv) { fprintf(stderr, "Error allocating argument list\n"); exit(EXIT_FAILURE); } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &outputfile, argi)) { app_input->output_filename = arg.val; } else if (arg_match(&arg, &width_arg, argi)) { enc_cfg->g_w = arg_parse_uint(&arg); } else if 
(arg_match(&arg, &height_arg, argi)) { enc_cfg->g_h = arg_parse_uint(&arg); } else if (arg_match(&arg, &timebase_arg, argi)) { enc_cfg->g_timebase = arg_parse_rational(&arg); } else if (arg_match(&arg, &bitrate_arg, argi)) { enc_cfg->rc_target_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &spatial_layers_arg, argi)) { svc_params->number_spatial_layers = arg_parse_uint(&arg); } else if (arg_match(&arg, &temporal_layers_arg, argi)) { svc_params->number_temporal_layers = arg_parse_uint(&arg); } else if (arg_match(&arg, &speed_arg, argi)) { app_input->speed = arg_parse_uint(&arg); if (app_input->speed > 11) { aom_tools_warn("Mapping speed %d to speed 11.\n", app_input->speed); } } else if (arg_match(&arg, &aqmode_arg, argi)) { app_input->aq_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { enc_cfg->g_threads = arg_parse_uint(&arg); } else if (arg_match(&arg, &layering_mode_arg, argi)) { app_input->layering_mode = arg_parse_int(&arg); } else if (arg_match(&arg, &kf_dist_arg, argi)) { enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; } else if (arg_match(&arg, &scale_factors_arg, argi)) { aom_codec_err_t res = parse_layer_options_from_string( svc_params, SCALE_FACTOR, arg.val, svc_params->scaling_factor_num, svc_params->scaling_factor_den); app_input->scale_factors_explicitly_set = true; if (res != AOM_CODEC_OK) { die("Failed to parse scale factors: %s\n", aom_codec_err_to_string(res)); } } else if (arg_match(&arg, &min_q_arg, argi)) { enc_cfg->rc_min_quantizer = arg_parse_uint(&arg); } else if (arg_match(&arg, &max_q_arg, argi)) { enc_cfg->rc_max_quantizer = arg_parse_uint(&arg); #if CONFIG_AV1_HIGHBITDEPTH } else if (arg_match(&arg, &bitdepth_arg, argi)) { enc_cfg->g_bit_depth = static_cast(arg_parse_enum_or_int(&arg)); switch (enc_cfg->g_bit_depth) { case AOM_BITS_8: enc_cfg->g_input_bit_depth = 8; enc_cfg->g_profile = 0; break; case AOM_BITS_10: enc_cfg->g_input_bit_depth = 10; enc_cfg->g_profile = 0; break; default: die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); } #endif // CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); } else if (arg_match(&arg, &error_resilient_arg, argi)) { enc_cfg->g_error_resilient = arg_parse_uint(&arg); if (enc_cfg->g_error_resilient != 0 && enc_cfg->g_error_resilient != 1) die("Invalid value for error resilient (0, 1): %d.", enc_cfg->g_error_resilient); } else if (arg_match(&arg, &output_obu_arg, argi)) { app_input->output_obu = arg_parse_uint(&arg); if (app_input->output_obu != 0 && app_input->output_obu != 1) die("Invalid value for obu output flag (0, 1): %d.", app_input->output_obu); } else if (arg_match(&arg, &test_decode_arg, argi)) { app_input->decode = arg_parse_uint(&arg); if (app_input->decode != 0 && app_input->decode != 1) die("Invalid value for test decode flag (0, 1): %d.", app_input->decode); } else if (arg_match(&arg, &tune_content_arg, argi)) { app_input->tune_content = arg_parse_enum_or_int(&arg); printf("tune content %d\n", app_input->tune_content); } else if (arg_match(&arg, &psnr_arg, argi)) { app_input->show_psnr = 1; } else if (arg_match(&arg, &ext_rc_arg, argi)) { app_input->use_external_rc = true; #if CONFIG_CWG_E050 } else if (arg_match(&arg, &multilayer_metadata_file_arg, argi)) { app_input->multilayer_metadata_file = arg.val; #endif } else { ++argj; } } // Total bitrate needs to be parsed after the number of layers. 
for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &bitrates_arg, argi)) { aom_codec_err_t res = parse_layer_options_from_string( svc_params, BITRATE, arg.val, svc_params->layer_target_bitrate, NULL); if (res != AOM_CODEC_OK) { die("Failed to parse bitrates: %s\n", aom_codec_err_to_string(res)); } } else { ++argj; } } // There will be a space in front of the string options if (strlen(string_options) > 0) strncpy(app_input->options, string_options, OPTION_BUFFER_SIZE); // Check for unrecognized options for (argi = argv; *argi; ++argi) if (argi[0][0] == '-' && strlen(argi[0]) > 1) die("Error: Unrecognized option %s\n", *argi); if (argv[0] == NULL) { usage_exit(); } int input_count = 0; while (argv[input_count] != NULL && input_count < MAX_NUM_SPATIAL_LAYERS) { app_input->input_ctx[input_count].filename = argv[input_count]; ++input_count; } if (input_count > 1 && input_count != svc_params->number_spatial_layers) { die("Error: Number of input files does not match number of spatial layers"); } if (argv[input_count] != NULL) { die("Error: Too many input files specified, there should be at most %d", MAX_NUM_SPATIAL_LAYERS); } free(argv); for (int i = 0; i < input_count; ++i) { open_input_file(&app_input->input_ctx[i], AOM_CSP_UNKNOWN); if (app_input->input_ctx[i].file_type == FILE_TYPE_Y4M) { if (enc_cfg->g_w == 0 || enc_cfg->g_h == 0) { // Override these settings with the info from Y4M file. enc_cfg->g_w = app_input->input_ctx[i].width; enc_cfg->g_h = app_input->input_ctx[i].height; // g_timebase is the reciprocal of frame rate. enc_cfg->g_timebase.num = app_input->input_ctx[i].framerate.denominator; enc_cfg->g_timebase.den = app_input->input_ctx[i].framerate.numerator; } else if (enc_cfg->g_w != app_input->input_ctx[i].width || enc_cfg->g_h != app_input->input_ctx[i].height || enc_cfg->g_timebase.num != app_input->input_ctx[i].framerate.denominator || enc_cfg->g_timebase.den != app_input->input_ctx[i].framerate.numerator) { die("Error: Input file dimensions and/or frame rate mismatch"); } } } if (enc_cfg->g_w == 0 || enc_cfg->g_h == 0) { die("Error: Input file dimensions not set, use -w and -h"); } if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || enc_cfg->g_h % 2) die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); printf( "Codec %s\n" "layers: %d\n" "width %u, height: %u\n" "num: %d, den: %d, bitrate: %u\n" "gop size: %u\n", aom_codec_iface_name(aom_codec_av1_cx()), svc_params->number_spatial_layers, enc_cfg->g_w, enc_cfg->g_h, enc_cfg->g_timebase.num, enc_cfg->g_timebase.den, enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist); } static const int mode_to_num_temporal_layers[12] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3, }; static const int mode_to_num_spatial_layers[12] = { 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 3, }; // For rate control encoding stats. struct RateControlMetrics { // Number of input frames per layer. int layer_input_frames[AOM_MAX_TS_LAYERS]; // Number of encoded non-key frames per layer. int layer_enc_frames[AOM_MAX_TS_LAYERS]; // Framerate per layer layer (cumulative). double layer_framerate[AOM_MAX_TS_LAYERS]; // Target average frame size per layer (per-frame-bandwidth per layer). double layer_pfb[AOM_MAX_LAYERS]; // Actual average frame size per layer. double layer_avg_frame_size[AOM_MAX_LAYERS]; // Average rate mismatch per layer (|target - actual| / target). double layer_avg_rate_mismatch[AOM_MAX_LAYERS]; // Actual encoding bitrate per layer (cumulative across temporal layers). 
double layer_encoding_bitrate[AOM_MAX_LAYERS]; // Average of the short-time encoder actual bitrate. // TODO(marpan): Should we add these short-time stats for each layer? double avg_st_encoding_bitrate; // Variance of the short-time encoder actual bitrate. double variance_st_encoding_bitrate; // Window (number of frames) for computing short-timee encoding bitrate. int window_size; // Number of window measurements. int window_count; int layer_target_bitrate[AOM_MAX_LAYERS]; }; static const int REF_FRAMES = 8; static const int INTER_REFS_PER_FRAME = 7; // Reference frames used in this example encoder. enum { SVC_LAST_FRAME = 0, SVC_LAST2_FRAME, SVC_LAST3_FRAME, SVC_GOLDEN_FRAME, SVC_BWDREF_FRAME, SVC_ALTREF2_FRAME, SVC_ALTREF_FRAME }; static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { FILE *f = input_ctx->file; y4m_input *y4m = &input_ctx->y4m; int shortread = 0; if (input_ctx->file_type == FILE_TYPE_Y4M) { if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; } else { shortread = read_yuv_frame(input_ctx, img); } return !shortread; } static void close_input_file(struct AvxInputContext *input) { fclose(input->file); if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); } // Note: these rate control metrics assume only 1 key frame in the // sequence (i.e., first frame only). So for temporal pattern# 7 // (which has key frame for every frame on base layer), the metrics // computation will be off/wrong. // TODO(marpan): Update these metrics to account for multiple key frames // in the stream. static void set_rate_control_metrics(struct RateControlMetrics *rc, double framerate, int ss_number_layers, int ts_number_layers) { int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 }; ts_rate_decimator[0] = 1; if (ts_number_layers == 2) { ts_rate_decimator[0] = 2; ts_rate_decimator[1] = 1; } if (ts_number_layers == 3) { ts_rate_decimator[0] = 4; ts_rate_decimator[1] = 2; ts_rate_decimator[2] = 1; } // Set the layer (cumulative) framerate and the target layer (non-cumulative) // per-frame-bandwidth, for the rate control encoding stats below. for (int sl = 0; sl < ss_number_layers; ++sl) { int i = sl * ts_number_layers; rc->layer_framerate[0] = framerate / ts_rate_decimator[0]; rc->layer_pfb[i] = 1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0]; for (int tl = 0; tl < ts_number_layers; ++tl) { i = sl * ts_number_layers + tl; if (tl > 0) { rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl]; rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]); } rc->layer_input_frames[tl] = 0; rc->layer_enc_frames[tl] = 0; rc->layer_encoding_bitrate[i] = 0.0; rc->layer_avg_frame_size[i] = 0.0; rc->layer_avg_rate_mismatch[i] = 0.0; } } rc->window_count = 0; rc->window_size = 15; rc->avg_st_encoding_bitrate = 0.0; rc->variance_st_encoding_bitrate = 0.0; } static void printout_rate_control_summary(struct RateControlMetrics *rc, int frame_cnt, int ss_number_layers, int ts_number_layers) { int tot_num_frames = 0; double perc_fluctuation = 0.0; printf("Total number of processed frames: %d\n\n", frame_cnt - 1); printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers); for (int sl = 0; sl < ss_number_layers; ++sl) { tot_num_frames = 0; for (int tl = 0; tl < ts_number_layers; ++tl) { int i = sl * ts_number_layers + tl; const int num_dropped = tl > 0 ? 
rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1; tot_num_frames += rc->layer_input_frames[tl]; rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] * rc->layer_encoding_bitrate[i] / tot_num_frames; rc->layer_avg_frame_size[i] = rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl]; rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl]; printf("For layer#: %d %d \n", sl, tl); printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i], rc->layer_encoding_bitrate[i]); printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i], rc->layer_avg_frame_size[i]); printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]); printf( "Number of input frames, encoded (non-key) frames, " "and perc dropped frames: %d %d %f\n", rc->layer_input_frames[tl], rc->layer_enc_frames[tl], 100.0 * num_dropped / rc->layer_input_frames[tl]); printf("\n"); } } rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; rc->variance_st_encoding_bitrate = rc->variance_st_encoding_bitrate / rc->window_count - (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / rc->avg_st_encoding_bitrate; printf("Short-time stats, for window of %d frames:\n", rc->window_size); printf("Average, rms-variance, and percent-fluct: %f %f %f\n", rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), perc_fluctuation); if (frame_cnt - 1 != tot_num_frames) die("Error: Number of input frames not equal to output!\n"); } // Layer pattern configuration. static void set_layer_pattern( int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id, aom_svc_ref_frame_config_t *ref_frame_config, aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control, int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) { // Setting this flag to 1 enables simplex example of // RPS (Reference Picture Selection) for 1 layer. int use_rps_example = 0; int i; int enable_longterm_temporal_ref = 1; int shift = (layering_mode == 8) ? 2 : 0; int simulcast_mode = (layering_mode == 11); *use_svc_control = 1; layer_id->spatial_layer_id = spatial_layer_id; int lag_index = 0; int base_count = superframe_cnt >> 2; ref_frame_comp_pred->use_comp_pred[0] = 0; // GOLDEN_LAST ref_frame_comp_pred->use_comp_pred[1] = 0; // LAST2_LAST ref_frame_comp_pred->use_comp_pred[2] = 0; // ALTREF_LAST // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0; for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; if (ksvc_mode) { // Same pattern as case 9, but the reference strucutre will be constrained // below. layering_mode = 9; } switch (layering_mode) { case 0: if (use_rps_example == 0) { // 1-layer: update LAST on every frame, reference LAST. layer_id->temporal_layer_id = 0; layer_id->spatial_layer_id = 0; ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else { // Pattern of 2 references (ALTREF and GOLDEN) trailing // LAST by 4 and 8 frames, with some switching logic to // sometimes only predict from the longer-term reference //(golden here). 
This is simple example to test RPS // (reference picture selection). int last_idx = 0; int last_idx_refresh = 0; int gld_idx = 0; int alt_ref_idx = 0; int lag_alt = 4; int lag_gld = 8; layer_id->temporal_layer_id = 0; layer_id->spatial_layer_id = 0; int sh = 8; // slots 0 - 7. // Moving index slot for last: 0 - (sh - 1) if (superframe_cnt > 1) last_idx = (superframe_cnt - 1) % sh; // Moving index for refresh of last: one ahead for next frame. last_idx_refresh = superframe_cnt % sh; // Moving index for gld_ref, lag behind current by lag_gld if (superframe_cnt > lag_gld) gld_idx = (superframe_cnt - lag_gld) % sh; // Moving index for alt_ref, lag behind LAST by lag_alt frames. if (superframe_cnt > lag_alt) alt_ref_idx = (superframe_cnt - lag_alt) % sh; // Set the ref_idx. // Default all references to slot for last. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = last_idx; // Set the ref_idx for the relevant references. ref_frame_config->ref_idx[SVC_LAST_FRAME] = last_idx; ref_frame_config->ref_idx[SVC_LAST2_FRAME] = last_idx_refresh; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = gld_idx; ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = alt_ref_idx; // Refresh this slot, which will become LAST on next frame. ref_frame_config->refresh[last_idx_refresh] = 1; // Reference LAST, ALTREF, and GOLDEN ref_frame_config->reference[SVC_LAST_FRAME] = 1; ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; // Switch to only GOLDEN every 300 frames. if (superframe_cnt % 200 == 0 && superframe_cnt > 0) { ref_frame_config->reference[SVC_LAST_FRAME] = 0; ref_frame_config->reference[SVC_ALTREF_FRAME] = 0; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; // Test if the long-term is LAST instead, this is just a renaming // but its tests if encoder behaves the same, whether its // LAST or GOLDEN. if (superframe_cnt % 400 == 0 && superframe_cnt > 0) { ref_frame_config->ref_idx[SVC_LAST_FRAME] = gld_idx; ref_frame_config->reference[SVC_LAST_FRAME] = 1; ref_frame_config->reference[SVC_ALTREF_FRAME] = 0; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0; } } } break; case 1: // 2-temporal layer. // 1 3 5 // 0 2 4 // Keep golden fixed at slot 3. base_count = superframe_cnt >> 1; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; // Cyclically refresh slots 5, 6, 7, for lag alt ref. lag_index = 5; if (base_count > 0) { lag_index = 5 + (base_count % 3); if (superframe_cnt % 2 != 0) lag_index = 5 + ((base_count + 1) % 3); } // Set the altref slot to lag_index. ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index; if (superframe_cnt % 2 == 0) { layer_id->temporal_layer_id = 0; // Update LAST on layer 0, reference LAST. ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; // Refresh lag_index slot, needed for lagging golen. ref_frame_config->refresh[lag_index] = 1; // Refresh GOLDEN every x base layer frames. if (base_count % 32 == 0) ref_frame_config->refresh[3] = 1; } else { layer_id->temporal_layer_id = 1; // No updates on layer 1, reference LAST (TL0). ref_frame_config->reference[SVC_LAST_FRAME] = 1; } // Always reference golden and altref on TL0. if (layer_id->temporal_layer_id == 0) { ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; } break; case 2: // 3-temporal layer: // 1 3 5 7 // 2 6 // 0 4 8 if (superframe_cnt % 4 == 0) { // Base layer. layer_id->temporal_layer_id = 0; // Update LAST on layer 0, reference LAST. 
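// (Worked out over the 4-frame cycle: frames 0, 4, 8, ... are TL0, frames 2, // 6, ... are TL1, and the odd frames are TL2. TL0 refreshes slot 0, TL1 // refreshes slot 1 and TL2 refreshes nothing, so no frame depends on a higher // temporal layer and the upper layers can be dropped freely.)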
ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 1) % 4 == 0) { layer_id->temporal_layer_id = 2; // First top layer: no updates, only reference LAST (TL0). ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 2) % 4 == 0) { layer_id->temporal_layer_id = 1; // Middle layer (TL1): update LAST2, only reference LAST (TL0). ref_frame_config->refresh[1] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 3) % 4 == 0) { layer_id->temporal_layer_id = 2; // Second top layer: no updates, only reference LAST. // Set buffer idx for LAST to slot 1, since that was the slot // updated in previous frame. So LAST is TL1 frame. ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } break; case 3: // 3 TL, same as above, except allow for predicting // off 2 more references (GOLDEN and ALTREF), with // GOLDEN updated periodically, and ALTREF lagging from // LAST from ~4 frames. Both GOLDEN and ALTREF // can only be updated on base temporal layer. // Keep golden fixed at slot 3. ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; // Cyclically refresh slots 5, 6, 7, for lag altref. lag_index = 5; if (base_count > 0) { lag_index = 5 + (base_count % 3); if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3); } // Set the altref slot to lag_index. ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index; if (superframe_cnt % 4 == 0) { // Base layer. layer_id->temporal_layer_id = 0; // Update LAST on layer 0, reference LAST. ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; // Refresh GOLDEN every x ~10 base layer frames. if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1; // Refresh lag_index slot, needed for lagging altref. ref_frame_config->refresh[lag_index] = 1; } else if ((superframe_cnt - 1) % 4 == 0) { layer_id->temporal_layer_id = 2; // First top layer: no updates, only reference LAST (TL0). ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 2) % 4 == 0) { layer_id->temporal_layer_id = 1; // Middle layer (TL1): update LAST2, only reference LAST (TL0). ref_frame_config->refresh[1] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 3) % 4 == 0) { layer_id->temporal_layer_id = 2; // Second top layer: no updates, only reference LAST. // Set buffer idx for LAST to slot 1, since that was the slot // updated in previous frame. So LAST is TL1 frame. ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } // Every frame can reference GOLDEN AND ALTREF. ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; // Allow for compound prediction for LAST-ALTREF and LAST-GOLDEN. if (speed >= 7) { ref_frame_comp_pred->use_comp_pred[2] = 1; ref_frame_comp_pred->use_comp_pred[0] = 1; } break; case 4: // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will // only reference GF (not LAST). Other frames only reference LAST. // 1 3 5 7 // 2 6 // 0 4 8 if (superframe_cnt % 4 == 0) { // Base layer. layer_id->temporal_layer_id = 0; // Update LAST on layer 0, only reference LAST. 
ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 1) % 4 == 0) { layer_id->temporal_layer_id = 2; // First top layer: no updates, only reference LAST (TL0). ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 2) % 4 == 0) { layer_id->temporal_layer_id = 1; // Middle layer (TL1): update GF, only reference LAST (TL0). ref_frame_config->refresh[3] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if ((superframe_cnt - 3) % 4 == 0) { layer_id->temporal_layer_id = 2; // Second top layer: no updates, only reference GF. ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; } break; case 5: // 2 spatial layers, 1 temporal. layer_id->temporal_layer_id = 0; if (layer_id->spatial_layer_id == 0) { // Reference LAST, update LAST. ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 // and GOLDEN to slot 0. Update slot 1 (LAST). ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 0; ref_frame_config->refresh[1] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; } break; case 6: // 3 spatial layers, 1 temporal. // Note for this case, we set the buffer idx for all references to be // either LAST or GOLDEN, which are always valid references, since decoder // will check if any of the 7 references is valid scale in // valid_ref_frame_size(). layer_id->temporal_layer_id = 0; if (layer_id->spatial_layer_id == 0) { // Reference LAST, update LAST. Set all buffer_idx to 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->refresh[0] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 // and GOLDEN (and all other refs) to slot 0. // Update slot 1 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->refresh[1] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; } else if (layer_id->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 // and GOLDEN (and all other refs) to slot 1. // Update slot 2 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 1; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->refresh[2] = 1; ref_frame_config->reference[SVC_LAST_FRAME] = 1; ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; // For 3 spatial layer case: allow for top spatial layer to use // additional temporal reference. Update every 10 frames. if (enable_longterm_temporal_ref) { ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1; ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; if (base_count % 10 == 0) ref_frame_config->refresh[REF_FRAMES - 1] = 1; } } break; case 7: // 2 spatial and 3 temporal layer. 
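// Sketch of the slot usage below: TL0 superframes refresh slot 0 (SL0) and // slot 1 (SL1); the first TL2 superframe refreshes slot 3; TL1 refreshes // slots 5 and 6; the second TL2 superframe refreshes slot 3 again. In every // superframe SL1 maps GOLDEN to the slot its base layer just wrote, which is // what carries the inter-layer prediction.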
ref_frame_config->reference[SVC_LAST_FRAME] = 1; if (superframe_cnt % 4 == 0) { // Base temporal layer layer_id->temporal_layer_id = 0; if (layer_id->spatial_layer_id == 0) { // Reference LAST, update LAST // Set all buffer_idx to 0 for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->refresh[0] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->refresh[1] = 1; } } else if ((superframe_cnt - 1) % 4 == 0) { // First top temporal enhancement layer. layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 3. // No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; } } else if ((superframe_cnt - 2) % 4 == 0) { // Middle temporal enhancement layer. layer_id->temporal_layer_id = 1; if (layer_id->spatial_layer_id == 0) { // Reference LAST. // Set all buffer_idx to 0. // Set GOLDEN to slot 5 and update slot 5. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift; ref_frame_config->refresh[5 - shift] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 5. // Set LAST3 to slot 6 and update slot 6. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 5 - shift; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift; ref_frame_config->refresh[6 - shift] = 1; } } else if ((superframe_cnt - 3) % 4 == 0) { // Second top temporal enhancement layer. layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { // Set LAST to slot 5 and reference LAST. // Set GOLDEN to slot 3 and update slot 3. // Set all other buffer_idx to 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, // GOLDEN to slot 3. No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; } } break; case 8: // 3 spatial and 3 temporal layer. // Same as case 9 but overalap in the buffer slot updates. // (shift = 2). The slots 3 and 4 updated by first TL2 are // reused for update in TL1 superframe. // Note for this case, frame order hint must be disabled for // lower resolutios (operating points > 0) to be decoedable. case 9: // 3 spatial and 3 temporal layer. // No overlap in buffer updates between TL2 and TL1. // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7. // Set the references via the svc_ref_frame_config control. // Always reference LAST. ref_frame_config->reference[SVC_LAST_FRAME] = 1; if (superframe_cnt % 4 == 0) { // Base temporal layer. 
layer_id->temporal_layer_id = 0; if (layer_id->spatial_layer_id == 0) { // Reference LAST, update LAST. // Set all buffer_idx to 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->refresh[0] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 0. // Update slot 1 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->refresh[1] = 1; } else if (layer_id->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, // GOLDEN (and all other refs) to slot 1. // Update slot 2 (LAST). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 1; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->refresh[2] = 1; } } else if ((superframe_cnt - 1) % 4 == 0) { // First top temporal enhancement layer. layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { // Reference LAST (slot 0). // Set GOLDEN to slot 3 and update slot 3. // Set all other buffer_idx to slot 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 3. // Set LAST2 to slot 4 and Update slot 4. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; ref_frame_config->refresh[4] = 1; } else if (layer_id->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, // GOLDEN (and all other refs) to slot 4. // No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 4; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; } } else if ((superframe_cnt - 2) % 4 == 0) { // Middle temporal enhancement layer. layer_id->temporal_layer_id = 1; if (layer_id->spatial_layer_id == 0) { // Reference LAST. // Set all buffer_idx to 0. // Set GOLDEN to slot 5 and update slot 5. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift; ref_frame_config->refresh[5 - shift] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 5. // Set LAST3 to slot 6 and update slot 6. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 5 - shift; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift; ref_frame_config->refresh[6 - shift] = 1; } else if (layer_id->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, // GOLDEN (and all other refs) to slot 6. // Set LAST3 to slot 7 and update slot 7. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 6 - shift; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 7 - shift; ref_frame_config->refresh[7 - shift] = 1; } } else if ((superframe_cnt - 3) % 4 == 0) { // Second top temporal enhancement layer. layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { // Set LAST to slot 5 and reference LAST. // Set GOLDEN to slot 3 and update slot 3. 
// Set all other buffer_idx to 0. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; ref_frame_config->refresh[4] = 1; } else if (layer_id->spatial_layer_id == 2) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, // GOLDEN to slot 4. No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 7 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4; } } break; case 11: // Simulcast mode for 3 spatial and 3 temporal layers. // No inter-layer predicton, only prediction is temporal and single // reference (LAST). // No overlap in buffer slots between spatial layers. So for example, // SL0 only uses slots 0 and 1. // SL1 only uses slots 2 and 3. // SL2 only uses slots 4 and 5. // All 7 references for each inter-frame must only access buffer slots // for that spatial layer. // On key (super)frames: SL1 and SL2 must have no references set // and must refresh all the slots for that layer only (so 2 and 3 // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally // as a Key frame (refresh all slots). SL1/SL2 will be labelled // internally as Intra-only frames that allow that stream to be decoded. // These conditions will allow for each spatial stream to be // independently decodeable. // Initialize all references to 0 (don't use reference). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0; // Initialize as no refresh/update for all slots. for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; if (is_key_frame) { if (layer_id->spatial_layer_id == 0) { // Assign LAST/GOLDEN to slot 0/1. // Refesh slots 0 and 1 for SL0. // SL0: this will get set to KEY frame internally. ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 1; ref_frame_config->refresh[0] = 1; ref_frame_config->refresh[1] = 1; } else if (layer_id->spatial_layer_id == 1) { // Assign LAST/GOLDEN to slot 2/3. // Refesh slots 2 and 3 for SL1. // This will get set to Intra-only frame internally. ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; ref_frame_config->refresh[2] = 1; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 2) { // Assign LAST/GOLDEN to slot 4/5. // Refresh slots 4 and 5 for SL2. // This will get set to Intra-only frame internally. ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5; ref_frame_config->refresh[4] = 1; ref_frame_config->refresh[5] = 1; } } else if (superframe_cnt % 4 == 0) { // Base temporal layer: TL0 layer_id->temporal_layer_id = 0; if (layer_id->spatial_layer_id == 0) { // SL0 // Reference LAST. Assign all references to either slot // 0 or 1. Here we assign LAST to slot 0, all others to 1. // Update slot 0 (LAST). 
ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 1; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; ref_frame_config->refresh[0] = 1; } else if (layer_id->spatial_layer_id == 1) { // SL1 // Reference LAST. Assign all references to either slot // 2 or 3. Here we assign LAST to slot 2, all others to 3. // Update slot 2 (LAST). ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->refresh[2] = 1; } else if (layer_id->spatial_layer_id == 2) { // SL2 // Reference LAST. Assign all references to either slot // 4 or 5. Here we assign LAST to slot 4, all others to 5. // Update slot 4 (LAST). ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 5; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; ref_frame_config->refresh[4] = 1; } } else if ((superframe_cnt - 1) % 4 == 0) { // First top temporal enhancement layer: TL2 layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { // SL0 // Reference LAST (slot 0). Assign other references to slot 1. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 1; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; } else if (layer_id->spatial_layer_id == 1) { // SL1 // Reference LAST (slot 2). Assign other references to slot 3. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; } else if (layer_id->spatial_layer_id == 2) { // SL2 // Reference LAST (slot 4). Assign other references to slot 4. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 5; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; } } else if ((superframe_cnt - 2) % 4 == 0) { // Middle temporal enhancement layer: TL1 layer_id->temporal_layer_id = 1; if (layer_id->spatial_layer_id == 0) { // SL0 // Reference LAST (slot 0). // Set GOLDEN to slot 1 and update slot 1. // This will be used as reference for next TL2. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 1; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; ref_frame_config->refresh[1] = 1; } else if (layer_id->spatial_layer_id == 1) { // SL1 // Reference LAST (slot 2). // Set GOLDEN to slot 3 and update slot 3. // This will be used as reference for next TL2. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 2) { // SL2 // Reference LAST (slot 4). // Set GOLDEN to slot 5 and update slot 5. // This will be used as reference for next TL2. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 5; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; ref_frame_config->refresh[5] = 1; } } else if ((superframe_cnt - 3) % 4 == 0) { // Second top temporal enhancement layer: TL2 layer_id->temporal_layer_id = 2; if (layer_id->spatial_layer_id == 0) { // SL0 // Reference LAST (slot 1). 
Assign other references to slot 0. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; } else if (layer_id->spatial_layer_id == 1) { // SL1 // Reference LAST (slot 3). Assign other references to slot 2. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 2; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 3; } else if (layer_id->spatial_layer_id == 2) { // SL2 // Reference LAST (slot 5). Assign other references to slot 4. // No update/refresh on any slots. ref_frame_config->reference[SVC_LAST_FRAME] = 1; for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 4; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5; } } if (!simulcast_mode && layer_id->spatial_layer_id > 0) { // Always reference GOLDEN (inter-layer prediction). ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; if (ksvc_mode) { // KSVC: only keep the inter-layer reference (GOLDEN) for // superframes whose base is key. if (!is_key_frame) ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0; } if (is_key_frame && layer_id->spatial_layer_id > 1) { // On superframes whose base is key: remove LAST to avoid prediction // off layer two levels below. ref_frame_config->reference[SVC_LAST_FRAME] = 0; } } // For 3 spatial layer case 8 (where there is a free buffer slot): // allow for top spatial layer to use additional temporal reference. // Additional reference is only updated on base temporal layer, every // 10 TL0 frames here. if (!simulcast_mode && enable_longterm_temporal_ref && layer_id->spatial_layer_id == 2 && layering_mode == 8) { ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1; if (!is_key_frame) ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0) ref_frame_config->refresh[REF_FRAMES - 1] = 1; } break; default: assert(0); die("Error: Unsupported temporal layering mode!\n"); } } static void write_literal(struct aom_write_bit_buffer *wb, uint32_t data, uint8_t bits, uint32_t offset = 0) { if (bits > 32) { die("Invalid bits value %d > 32\n", bits); } const uint32_t max = static_cast<uint32_t>(((uint64_t)1 << bits) - 1); if (data < offset || (data - offset) > max) { die("Invalid data, value %u out of range [%u, %" PRIu64 "]\n", data, offset, (uint64_t)max + offset); } aom_wb_write_unsigned_literal(wb, data - offset, bits); } static void write_depth_representation_element( struct aom_write_bit_buffer *buffer, const std::pair<libaom_examples::DepthRepresentationElement, bool> &element) { if (!element.second) { return; } write_literal(buffer, element.first.sign_flag, 1); write_literal(buffer, element.first.exponent, 7); if (element.first.mantissa_len == 0 || element.first.mantissa_len > 32) { die("Invalid mantissa_len %d\n", element.first.mantissa_len); } write_literal(buffer, element.first.mantissa_len - 1, 5); write_literal(buffer, element.first.mantissa, element.first.mantissa_len); } static void write_color_properties( struct aom_write_bit_buffer *buffer, const std::pair<libaom_examples::ColorProperties, bool> &color_properties) { write_literal(buffer, color_properties.second, 1); if (color_properties.second) { write_literal(buffer, color_properties.first.color_range, 1); write_literal(buffer, color_properties.first.color_primaries, 8); write_literal(buffer, color_properties.first.transfer_characteristics, 8); write_literal(buffer, color_properties.first.matrix_coefficients, 8); } else {
write_literal(buffer, 0, 1); // reserved_1bit } } static void add_multilayer_metadata( aom_image_t *frame, const libaom_examples::MultilayerMetadata &multilayer) { // Pretty large buffer to accommodate the largest multilayer metadata // possible, with 4 alpha segmentation layers (each can be up to about 66kB). std::vector data(66000 * multilayer.layers.size()); struct aom_write_bit_buffer buffer = { data.data(), 0 }; write_literal(&buffer, multilayer.use_case, 6); if (multilayer.layers.empty()) { die("Invalid multilayer metadata, no layers found\n"); } else if (multilayer.layers.size() > MAX_NUM_SPATIAL_LAYERS) { die("Invalid multilayer metadata, too many layers (max is %d)\n", MAX_NUM_SPATIAL_LAYERS); } write_literal(&buffer, (int)multilayer.layers.size() - 1, 2); assert(buffer.bit_offset % 8 == 0); for (size_t i = 0; i < multilayer.layers.size(); ++i) { const libaom_examples::LayerMetadata &layer = multilayer.layers[i]; // Alpha info with segmentation with labels can be up to about 66k bytes, // which requires 3 bytes to encode in leb128. const int bytes_reserved_for_size = 3; // Placeholder for layer_metadata_size which will be written later. write_literal(&buffer, 0, bytes_reserved_for_size * 8); const uint32_t metadata_start = buffer.bit_offset; write_literal(&buffer, (int)i, 2); // ml_spatial_id write_literal(&buffer, layer.layer_type, 5); write_literal(&buffer, layer.luma_plane_only_flag, 1); write_literal(&buffer, layer.layer_view_type, 3); write_literal(&buffer, layer.group_id, 2); write_literal(&buffer, layer.layer_dependency_idc, 3); write_literal(&buffer, layer.layer_metadata_scope, 2); write_literal(&buffer, 0, 4); // ml_reserved_4bits if (i > 0) { write_color_properties(&buffer, layer.layer_color_description); } else { write_literal(&buffer, 0, 2); // ml_reserved_2bits } assert(buffer.bit_offset % 8 == 0); if (layer.layer_type == libaom_examples::MULTILAYER_LAYER_TYPE_ALPHA && layer.layer_metadata_scope >= libaom_examples::SCOPE_GLOBAL) { const libaom_examples::AlphaInformation &alpha_info = layer.global_alpha_info; write_literal(&buffer, alpha_info.alpha_use_idc, 3); write_literal(&buffer, alpha_info.alpha_bit_depth, 3, /*offset=*/8); write_literal(&buffer, alpha_info.alpha_clip_idc, 2); write_literal(&buffer, alpha_info.alpha_incr_flag, 1); write_literal(&buffer, alpha_info.alpha_transparent_value, alpha_info.alpha_bit_depth + 1); write_literal(&buffer, alpha_info.alpha_opaque_value, alpha_info.alpha_bit_depth + 1); if (buffer.bit_offset % 8 != 0) { // ai_byte_alignment_bits write_literal(&buffer, 0, 8 - (buffer.bit_offset % 8)); } assert(buffer.bit_offset % 8 == 0); if (alpha_info.alpha_use_idc == libaom_examples::ALPHA_STRAIGHT) { write_literal(&buffer, 0, 6); // ai_reserved_6bits write_color_properties(&buffer, alpha_info.alpha_color_description); } else if (alpha_info.alpha_use_idc == libaom_examples::ALPHA_SEGMENTATION) { write_literal(&buffer, 0, 7); // ai_reserved_7bits write_literal(&buffer, !alpha_info.label_type_id.empty(), 1); if (!alpha_info.label_type_id.empty()) { const size_t num_values = std::abs(alpha_info.alpha_transparent_value - alpha_info.alpha_opaque_value) + 1; if (!alpha_info.label_type_id.empty() && alpha_info.label_type_id.size() != num_values) { die("Invalid multilayer metadata, label_type_id size must be " "equal to the range of alpha values between " "alpha_transparent_value and alpha_opaque_value (expected " "%d values, found %d values)\n", (int)num_values, (int)alpha_info.label_type_id.size()); } for (size_t j = 0; j < num_values; ++j) { 
write_literal(&buffer, alpha_info.label_type_id[j], 16); } } } assert(buffer.bit_offset % 8 == 0); } else if (layer.layer_type == libaom_examples::MULTILAYER_LAYER_TYPE_DEPTH && layer.layer_metadata_scope >= libaom_examples::SCOPE_GLOBAL) { const libaom_examples::DepthInformation &depth_info = layer.global_depth_info; write_literal(&buffer, depth_info.z_near.second, 1); write_literal(&buffer, depth_info.z_far.second, 1); write_literal(&buffer, depth_info.d_min.second, 1); write_literal(&buffer, depth_info.d_max.second, 1); write_literal(&buffer, depth_info.depth_representation_type, 4); if (depth_info.d_min.second || depth_info.d_max.second) { write_literal(&buffer, depth_info.disparity_ref_view_id, 2); } write_depth_representation_element(&buffer, depth_info.z_near); write_depth_representation_element(&buffer, depth_info.z_far); write_depth_representation_element(&buffer, depth_info.d_min); write_depth_representation_element(&buffer, depth_info.d_max); if (depth_info.depth_representation_type == 3) { write_literal(&buffer, depth_info.depth_nonlinear_precision, 4, /*offset=*/8); if (depth_info.depth_nonlinear_representation_model.empty() || depth_info.depth_nonlinear_representation_model.size() > (1 << 6)) { die("Invalid multilayer metadata, if depth_nonlinear_precision " "== 3, depth_nonlinear_representation_model must have 1 to " "%d elements, found %d elements\n", 1 << 6, (int)depth_info.depth_nonlinear_representation_model.size()); } write_literal( &buffer, (int)depth_info.depth_nonlinear_representation_model.size() - 1, 6); const int bit_depth = depth_info.depth_nonlinear_precision; for (const uint32_t v : depth_info.depth_nonlinear_representation_model) { write_literal(&buffer, v, bit_depth); } } if (buffer.bit_offset % 8 != 0) { write_literal(&buffer, 0, 8 - (buffer.bit_offset % 8)); } assert(buffer.bit_offset % 8 == 0); } assert(buffer.bit_offset % 8 == 0); const int metadata_size_bytes = (buffer.bit_offset - metadata_start) / 8; const uint8_t size_pos = metadata_start / 8 - bytes_reserved_for_size; size_t coded_size; if (aom_uleb_encode_fixed_size(metadata_size_bytes, bytes_reserved_for_size, bytes_reserved_for_size, &buffer.bit_buffer[size_pos], &coded_size)) { // Need to increase bytes_reserved_for_size in the code above. die("Error: Failed to write metadata size\n"); } } assert(buffer.bit_offset % 8 == 0); if (aom_img_add_metadata(frame, 33 /*METADATA_TYPE_MULTILAYER*/, buffer.bit_buffer, buffer.bit_offset / 8, AOM_MIF_KEY_FRAME)) { die("Error: Failed to add metadata\n"); } } #if CONFIG_AV1_DECODER // Returns whether there is a mismatch between the encoder's new frame and the // decoder's new frame. 
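// Both images are fetched with AV1_GET_NEW_FRAME_IMAGE; if only one of them // is high bitdepth it is truncated to 8 bits first so that aom_compare_img() // compares like with like, and the position and values of a mismatch are // printed to stderr.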
static int test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, const int frames_out) { aom_image_t enc_img, dec_img; int mismatch = 0; /* Get the internal new frame */ AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img); AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img); #if CONFIG_AV1_HIGHBITDEPTH if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t enc_hbd_img; aom_img_alloc( &enc_hbd_img, static_cast<aom_img_fmt_t>(enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH), enc_img.d_w, enc_img.d_h, 16); aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); enc_img = enc_hbd_img; } if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_image_t dec_hbd_img; aom_img_alloc( &dec_hbd_img, static_cast<aom_img_fmt_t>(dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH), dec_img.d_w, dec_img.d_h, 16); aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); dec_img = dec_hbd_img; } } #endif if (!aom_compare_img(&enc_img, &dec_img)) { int y[4], u[4], v[4]; #if CONFIG_AV1_HIGHBITDEPTH if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); } else { aom_find_mismatch(&enc_img, &dec_img, y, u, v); } #else aom_find_mismatch(&enc_img, &dec_img, y, u, v); #endif fprintf(stderr, "Encode/decode mismatch on frame %d at" " Y[%d, %d] {%d/%d}," " U[%d, %d] {%d/%d}," " V[%d, %d] {%d/%d}\n", frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); mismatch = 1; } aom_img_free(&enc_img); aom_img_free(&dec_img); return mismatch; } #endif // CONFIG_AV1_DECODER struct psnr_stats { // The second element of these arrays is reserved for high bitdepth. uint64_t psnr_sse_total[2]; uint64_t psnr_samples_total[2]; double psnr_totals[2][4]; int psnr_count[2]; }; static void show_psnr(struct psnr_stats *psnr_stream, double peak) { double ovpsnr; if (!psnr_stream->psnr_count[0]) return; fprintf(stderr, "\nPSNR (Overall/Avg/Y/U/V)"); ovpsnr = sse_to_psnr((double)psnr_stream->psnr_samples_total[0], peak, (double)psnr_stream->psnr_sse_total[0]); fprintf(stderr, " %.3f", ovpsnr); for (int i = 0; i < 4; i++) { fprintf(stderr, " %.3f", psnr_stream->psnr_totals[0][i] / psnr_stream->psnr_count[0]); } fprintf(stderr, "\n"); } static aom::AV1RateControlRtcConfig create_rtc_rc_config( const aom_codec_enc_cfg_t &cfg, const AppInput &app_input) { aom::AV1RateControlRtcConfig rc_cfg; rc_cfg.width = cfg.g_w; rc_cfg.height = cfg.g_h; rc_cfg.max_quantizer = cfg.rc_max_quantizer; rc_cfg.min_quantizer = cfg.rc_min_quantizer; rc_cfg.target_bandwidth = cfg.rc_target_bitrate; rc_cfg.buf_initial_sz = cfg.rc_buf_initial_sz; rc_cfg.buf_optimal_sz = cfg.rc_buf_optimal_sz; rc_cfg.buf_sz = cfg.rc_buf_sz; rc_cfg.overshoot_pct = cfg.rc_overshoot_pct; rc_cfg.undershoot_pct = cfg.rc_undershoot_pct; // This is hardcoded as AOME_SET_MAX_INTRA_BITRATE_PCT rc_cfg.max_intra_bitrate_pct = 300; rc_cfg.framerate = cfg.g_timebase.den; // TODO(jianj): Add support for SVC. rc_cfg.ss_number_layers = 1; rc_cfg.ts_number_layers = 1; rc_cfg.scaling_factor_num[0] = 1; rc_cfg.scaling_factor_den[0] = 1; rc_cfg.layer_target_bitrate[0] = static_cast<int>(rc_cfg.target_bandwidth); rc_cfg.max_quantizers[0] = rc_cfg.max_quantizer; rc_cfg.min_quantizers[0] = rc_cfg.min_quantizer; rc_cfg.aq_mode = app_input.aq_mode; return rc_cfg; } static int qindex_to_quantizer(int qindex) { // Table that converts 0-63 range Q values passed in outside to the 0-255 // range Qindex used internally.
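// For example, qindex 100 maps back to quantizer 25 (quantizer_to_qindex[25] // == 100), and any qindex above 249 maps to 63.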
static const int quantizer_to_qindex[] = { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, }; for (int quantizer = 0; quantizer < 64; ++quantizer) if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; return 63; } static void set_active_map(const aom_codec_enc_cfg_t *cfg, aom_codec_ctx_t *codec, int frame_cnt) { aom_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; map.active_map = (uint8_t *)malloc(map.rows * map.cols); if (!map.active_map) die("Failed to allocate active map"); // Example map for testing. for (unsigned int i = 0; i < map.rows; ++i) { for (unsigned int j = 0; j < map.cols; ++j) { int index = map.cols * i + j; map.active_map[index] = 1; if (frame_cnt < 300) { if (i < map.rows / 2 && j < map.cols / 2) map.active_map[index] = 0; } else if (frame_cnt >= 300) { if (i < map.rows / 2 && j >= map.cols / 2) map.active_map[index] = 0; } } } if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) die_codec(codec, "Failed to set active map"); free(map.active_map); } int main(int argc, const char **argv) { AppInput app_input; AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; FILE *obu_files[AOM_MAX_LAYERS] = { NULL }; AvxVideoWriter *total_layer_file = NULL; FILE *total_layer_obu_file = NULL; aom_codec_enc_cfg_t cfg; int frame_cnt = 0; aom_image_t raw; int frame_avail; int got_data = 0; int flags = 0; int i; int pts = 0; // PTS starts at 0. int frame_duration = 1; // 1 timebase tick per frame. aom_svc_layer_id_t layer_id; aom_svc_params_t svc_params; aom_svc_ref_frame_config_t ref_frame_config; aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred; #if CONFIG_INTERNAL_STATS FILE *stats_file = fopen("opsnr.stt", "a"); if (stats_file == NULL) { die("Cannot open opsnr.stt\n"); } #endif #if CONFIG_AV1_DECODER aom_codec_ctx_t decoder; #endif struct RateControlMetrics rc; int64_t cx_time = 0; int64_t cx_time_layer[AOM_MAX_LAYERS]; // max number of layers. int frame_cnt_layer[AOM_MAX_LAYERS]; double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; int use_svc_control = 1; int set_err_resil_frame = 0; int test_changing_bitrate = 0; zero(rc.layer_target_bitrate); memset(&layer_id, 0, sizeof(aom_svc_layer_id_t)); memset(&app_input, 0, sizeof(AppInput)); memset(&svc_params, 0, sizeof(svc_params)); // Flag to test dynamic scaling of source frames for single // spatial stream, using the scaling_mode control. const int test_dynamic_scaling_single_layer = 0; // Flag to test setting speed per layer. const int test_speed_per_layer = 0; // Flag for testing active maps. const int test_active_maps = 0; /* Setup default input stream settings */ for (i = 0; i < MAX_NUM_SPATIAL_LAYERS; ++i) { app_input.input_ctx[i].framerate.numerator = 30; app_input.input_ctx[i].framerate.denominator = 1; app_input.input_ctx[i].only_i420 = 0; app_input.input_ctx[i].bit_depth = AOM_BITS_8; } app_input.speed = 7; exec_name = argv[0]; // start with default encoder configuration aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_REALTIME); if (res != AOM_CODEC_OK) { die("Failed to get config: %s\n", aom_codec_err_to_string(res)); } // Real time parameters. 
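// CBR at low latency: no lagged frames, automatic keyframe placement, and a // decoder buffer model of 600/600/1000 (initial/optimal/full), which the // aom_codec_enc_cfg_t fields express in milliseconds of data at the target // bitrate.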
cfg.g_usage = AOM_USAGE_REALTIME; cfg.rc_end_usage = AOM_CBR; cfg.rc_min_quantizer = 2; cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; cfg.rc_resize_mode = 0; // Set to RESIZE_DYNAMIC for dynamic resize. cfg.g_lag_in_frames = 0; cfg.kf_mode = AOM_KF_AUTO; cfg.g_w = 0; // Force user to specify width and height for raw input. cfg.g_h = 0; parse_command_line(argc, argv, &app_input, &svc_params, &cfg); int ts_number_layers = svc_params.number_temporal_layers; int ss_number_layers = svc_params.number_spatial_layers; unsigned int width = cfg.g_w; unsigned int height = cfg.g_h; if (app_input.layering_mode >= 0) { if (ts_number_layers != mode_to_num_temporal_layers[app_input.layering_mode] || ss_number_layers != mode_to_num_spatial_layers[app_input.layering_mode]) { die("Number of layers doesn't match layering mode."); } } bool has_non_y4m_input = false; for (i = 0; i < AOM_MAX_LAYERS; ++i) { if (app_input.input_ctx[i].file_type != FILE_TYPE_Y4M) { has_non_y4m_input = true; break; } } // Y4M reader has its own allocation. if (has_non_y4m_input) { if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) { die("Failed to allocate image (%dx%d)", width, height); } } aom_codec_iface_t *encoder = aom_codec_av1_cx(); memcpy(&rc.layer_target_bitrate[0], &svc_params.layer_target_bitrate[0], sizeof(svc_params.layer_target_bitrate)); unsigned int total_rate = 0; for (i = 0; i < ss_number_layers; i++) { total_rate += svc_params .layer_target_bitrate[i * ts_number_layers + ts_number_layers - 1]; } if (total_rate != cfg.rc_target_bitrate) { die("Incorrect total target bitrate, expected: %d", total_rate); } svc_params.framerate_factor[0] = 1; if (ts_number_layers == 2) { svc_params.framerate_factor[0] = 2; svc_params.framerate_factor[1] = 1; } else if (ts_number_layers == 3) { svc_params.framerate_factor[0] = 4; svc_params.framerate_factor[1] = 2; svc_params.framerate_factor[2] = 1; } libaom_examples::MultilayerMetadata multilayer_metadata; if (app_input.multilayer_metadata_file != NULL) { if (!libaom_examples::parse_multilayer_file( app_input.multilayer_metadata_file, &multilayer_metadata)) { die("Failed to parse multilayer metadata"); } libaom_examples::print_multilayer_metadata(multilayer_metadata); } framerate = cfg.g_timebase.den / cfg.g_timebase.num; set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); AvxVideoInfo info; info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); info.frame_width = cfg.g_w; info.frame_height = cfg.g_h; info.time_base.numerator = cfg.g_timebase.num; info.time_base.denominator = cfg.g_timebase.den; // Open an output file for each stream. 
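// One output file per (spatial, temporal) layer pair, named // <output>_<index>.av1 with index = sl * ts_number_layers + tl, plus one // file under the name given on the command line that receives every layer // (the full superframes).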
for (int sl = 0; sl < ss_number_layers; ++sl) { for (int tl = 0; tl < ts_number_layers; ++tl) { i = sl * ts_number_layers + tl; char file_name[PATH_MAX]; snprintf(file_name, sizeof(file_name), "%s_%d.av1", app_input.output_filename, i); if (app_input.output_obu) { obu_files[i] = fopen(file_name, "wb"); if (!obu_files[i]) die("Failed to open %s for writing", file_name); } else { outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info); if (!outfile[i]) die("Failed to open %s for writing", file_name); } } } if (app_input.output_obu) { total_layer_obu_file = fopen(app_input.output_filename, "wb"); if (!total_layer_obu_file) die("Failed to open %s for writing", app_input.output_filename); } else { total_layer_file = aom_video_writer_open(app_input.output_filename, kContainerIVF, &info); if (!total_layer_file) die("Failed to open %s for writing", app_input.output_filename); } // Initialize codec. aom_codec_ctx_t codec; aom_codec_flags_t flag = 0; flag |= cfg.g_input_bit_depth == AOM_BITS_8 ? 0 : AOM_CODEC_USE_HIGHBITDEPTH; flag |= app_input.show_psnr ? AOM_CODEC_USE_PSNR : 0; if (aom_codec_enc_init(&codec, encoder, &cfg, flag)) die_codec(&codec, "Failed to initialize encoder"); #if CONFIG_AV1_DECODER if (app_input.decode) { if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) die_codec(&decoder, "Failed to initialize decoder"); } #endif aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed); aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0); aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1); aom_codec_control(&codec, AV1E_SET_LOOPFILTER_CONTROL, 1); aom_codec_control(&codec, AV1E_SET_ENABLE_WARPED_MOTION, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_GLOBAL_MOTION, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0); aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0); aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1); // Settings to reduce key frame encoding time. aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_SMOOTH_INTRA, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_ANGLE_DELTA, 0); aom_codec_control(&codec, AV1E_SET_ENABLE_FILTER_INTRA, 0); aom_codec_control(&codec, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1); aom_codec_control(&codec, AV1E_SET_AUTO_TILES, 1); aom_codec_control(&codec, AV1E_SET_TUNE_CONTENT, app_input.tune_content); if (app_input.tune_content == AOM_CONTENT_SCREEN) { aom_codec_control(&codec, AV1E_SET_ENABLE_PALETTE, 1); // INTRABC is currently disabled for rt mode, as it's too slow. 
aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0); } if (app_input.use_external_rc) { aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1); } aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_MS_CBR, INT_MAX); aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE, AOM_FULL_SUPERFRAME_DROP); aom_codec_control(&codec, AV1E_SET_POSTENCODE_DROP_RTC, 1); svc_params.number_spatial_layers = ss_number_layers; svc_params.number_temporal_layers = ts_number_layers; for (i = 0; i < ss_number_layers * ts_number_layers; ++i) { svc_params.max_quantizers[i] = cfg.rc_max_quantizer; svc_params.min_quantizers[i] = cfg.rc_min_quantizer; } if (!app_input.scale_factors_explicitly_set) { for (i = 0; i < ss_number_layers; ++i) { svc_params.scaling_factor_num[i] = 1; svc_params.scaling_factor_den[i] = 1; } if (ss_number_layers == 2) { svc_params.scaling_factor_num[0] = 1; svc_params.scaling_factor_den[0] = 2; } else if (ss_number_layers == 3) { svc_params.scaling_factor_num[0] = 1; svc_params.scaling_factor_den[0] = 4; svc_params.scaling_factor_num[1] = 1; svc_params.scaling_factor_den[1] = 2; } } aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params); // TODO(aomedia:3032): Configure KSVC in fixed mode. // This controls the maximum target size of the key frame. // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { const int max_intra_size_pct = 300; aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } for (int lx = 0; lx < ts_number_layers * ss_number_layers; lx++) { cx_time_layer[lx] = 0; frame_cnt_layer[lx] = 0; } std::unique_ptr rc_api; if (app_input.use_external_rc) { const aom::AV1RateControlRtcConfig rc_cfg = create_rtc_rc_config(cfg, app_input); rc_api = aom::AV1RateControlRTC::Create(rc_cfg); } frame_avail = 1; struct psnr_stats psnr_stream; memset(&psnr_stream, 0, sizeof(psnr_stream)); while (frame_avail || got_data) { struct aom_usec_timer timer; frame_avail = read_frame(&(app_input.input_ctx[0]), &raw); // Loop over spatial layers. for (int slx = 0; slx < ss_number_layers; slx++) { if (slx > 0 && app_input.input_ctx[slx].filename != NULL) { const int previous_layer_frame_avail = frame_avail; frame_avail = read_frame(&(app_input.input_ctx[slx]), &raw); if (previous_layer_frame_avail != frame_avail) { die("Mismatch in number of frames between spatial layer input files"); } } aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt; int layer = 0; // Flag for superframe whose base is key. int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0; // For flexible mode: if (app_input.layering_mode >= 0) { // Set the reference/update flags, layer_id, and reference_map // buffer index. set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id, &ref_frame_config, &ref_frame_comp_pred, &use_svc_control, slx, is_key_frame, (app_input.layering_mode == 10), app_input.speed); aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); if (use_svc_control) { aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED, &ref_frame_comp_pred); } if (app_input.multilayer_metadata_file != NULL) { add_multilayer_metadata(&raw, multilayer_metadata); } // Set the speed per layer. 
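// (Only exercised when test_speed_per_layer is set: lower layers get a lower // cpu-used value, e.g. 6 for SL0/TL0 up to 10 for SL2/TL2, since base-layer // frames are referenced by more of the stream.)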
if (test_speed_per_layer) { int speed_per_layer = 10; if (layer_id.spatial_layer_id == 0) { if (layer_id.temporal_layer_id == 0) speed_per_layer = 6; if (layer_id.temporal_layer_id == 1) speed_per_layer = 7; if (layer_id.temporal_layer_id == 2) speed_per_layer = 8; } else if (layer_id.spatial_layer_id == 1) { if (layer_id.temporal_layer_id == 0) speed_per_layer = 7; if (layer_id.temporal_layer_id == 1) speed_per_layer = 8; if (layer_id.temporal_layer_id == 2) speed_per_layer = 9; } else if (layer_id.spatial_layer_id == 2) { if (layer_id.temporal_layer_id == 0) speed_per_layer = 8; if (layer_id.temporal_layer_id == 1) speed_per_layer = 9; if (layer_id.temporal_layer_id == 2) speed_per_layer = 10; } aom_codec_control(&codec, AOME_SET_CPUUSED, speed_per_layer); } } else { // Only up to 3 temporal layers supported in fixed mode. // Only need to set spatial and temporal layer_id: reference // prediction, refresh, and buffer_idx are set internally. layer_id.spatial_layer_id = slx; layer_id.temporal_layer_id = 0; if (ts_number_layers == 2) { layer_id.temporal_layer_id = (frame_cnt % 2) != 0; } else if (ts_number_layers == 3) { if (frame_cnt % 2 != 0) layer_id.temporal_layer_id = 2; else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) layer_id.temporal_layer_id = 1; } aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); } if (set_err_resil_frame && cfg.g_error_resilient == 0) { // Set error_resilient per frame: off/0 for base layer and // on/1 for enhancement layer frames. // Note that this is can only be done on the fly/per-frame/layer // if the config error_resilience is off/0. See the logic for updating // in set_encoder_config(): // tool_cfg->error_resilient_mode = // cfg->g_error_resilient | extra_cfg->error_resilient_mode; const int err_resil_mode = layer_id.spatial_layer_id > 0 || layer_id.temporal_layer_id > 0; aom_codec_control(&codec, AV1E_SET_ERROR_RESILIENT_MODE, err_resil_mode); } layer = slx * ts_number_layers + layer_id.temporal_layer_id; if (frame_avail && slx == 0) ++rc.layer_input_frames[layer]; if (test_dynamic_scaling_single_layer) { // Example to scale source down by 2x2, then 4x4, and then back up to // 2x2, and then back to original. int frame_2x2 = 200; int frame_4x4 = 400; int frame_2x2up = 600; int frame_orig = 800; if (frame_cnt >= frame_2x2 && frame_cnt < frame_4x4) { // Scale source down by 2x2. struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); } else if (frame_cnt >= frame_4x4 && frame_cnt < frame_2x2up) { // Scale source down by 4x4. struct aom_scaling_mode mode = { AOME_ONEFOUR, AOME_ONEFOUR }; aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); } else if (frame_cnt >= frame_2x2up && frame_cnt < frame_orig) { // Source back up to 2x2. struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); } else if (frame_cnt >= frame_orig) { // Source back up to original resolution (no scaling). struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); } if (frame_cnt == frame_2x2 || frame_cnt == frame_4x4 || frame_cnt == frame_2x2up || frame_cnt == frame_orig) { // For dynamic resize testing on single layer: refresh all references // on the resized frame: this is to avoid decode error: // if resize goes down by >= 4x4 then libaom decoder will throw an // error that some reference (even though not used) is beyond the // limit size (must be smaller than 4x4). 
for (i = 0; i < REF_FRAMES; i++) ref_frame_config.refresh[i] = 1; if (use_svc_control) { aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED, &ref_frame_comp_pred); } } } // Change target_bitrate every other frame. if (test_changing_bitrate && frame_cnt % 2 == 0) { if (frame_cnt < 500) cfg.rc_target_bitrate += 10; else cfg.rc_target_bitrate -= 10; // Do big increase and decrease. if (frame_cnt == 100) cfg.rc_target_bitrate <<= 1; if (frame_cnt == 600) cfg.rc_target_bitrate >>= 1; if (cfg.rc_target_bitrate < 100) cfg.rc_target_bitrate = 100; // Call change_config, or bypass with new control. // res = aom_codec_enc_config_set(&codec, &cfg); if (aom_codec_control(&codec, AV1E_SET_BITRATE_ONE_PASS_CBR, cfg.rc_target_bitrate)) die_codec(&codec, "Failed to SET_BITRATE_ONE_PASS_CBR"); } if (rc_api) { aom::AV1FrameParamsRTC frame_params; // TODO(jianj): Add support for SVC. frame_params.spatial_layer_id = 0; frame_params.temporal_layer_id = 0; frame_params.frame_type = is_key_frame ? aom::kKeyFrame : aom::kInterFrame; rc_api->ComputeQP(frame_params); const int current_qp = rc_api->GetQP(); if (aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS, qindex_to_quantizer(current_qp))) { die_codec(&codec, "Failed to SET_QUANTIZER_ONE_PASS"); } } if (test_active_maps) set_active_map(&cfg, &codec, frame_cnt); // Do the layer encode. aom_usec_timer_start(&timer); if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags)) die_codec(&codec, "Failed to encode frame"); aom_usec_timer_mark(&timer); cx_time += aom_usec_timer_elapsed(&timer); cx_time_layer[layer] += aom_usec_timer_elapsed(&timer); frame_cnt_layer[layer] += 1; // Get the high motion content flag. int content_flag = 0; if (aom_codec_control(&codec, AV1E_GET_HIGH_MOTION_CONTENT_SCREEN_RTC, &content_flag)) { die_codec(&codec, "Failed to GET_HIGH_MOTION_CONTENT_SCREEN_RTC"); } got_data = 0; // For simulcast (mode 11): write out each spatial layer to the file. int ss_layers_write = (app_input.layering_mode == 11) ? layer_id.spatial_layer_id + 1 : ss_number_layers; while ((pkt = aom_codec_get_cx_data(&codec, &iter))) { switch (pkt->kind) { case AOM_CODEC_CX_FRAME_PKT: for (int sl = layer_id.spatial_layer_id; sl < ss_layers_write; ++sl) { for (int tl = layer_id.temporal_layer_id; tl < ts_number_layers; ++tl) { int j = sl * ts_number_layers + tl; if (app_input.output_obu) { fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, obu_files[j]); } else { aom_video_writer_write_frame( outfile[j], reinterpret_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz, pts); } if (sl == layer_id.spatial_layer_id) rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz; } } got_data = 1; // Write everything into the top layer. if (app_input.output_obu) { fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, total_layer_obu_file); } else { aom_video_writer_write_frame( total_layer_file, reinterpret_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz, pts); } // Keep count of rate control stats per layer (for non-key).
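// (These per-layer sums feed printout_rate_control_summary() after the
// encode loop; the key-frame check below keeps intra frames out of the
// averages.)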
if (!(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) { int j = layer_id.spatial_layer_id * ts_number_layers + layer_id.temporal_layer_id; assert(j >= 0); rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz; rc.layer_avg_rate_mismatch[j] += fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) / rc.layer_pfb[j]; if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id]; } if (rc_api) { rc_api->PostEncodeUpdate(pkt->data.frame.sz); } // Update for short-time encoding bitrate states, for moving window // of size rc->window, shifted by rc->window / 2. // Ignore first window segment, due to key frame. // For spatial layers: only do this for top/highest SL. if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) { sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size; if (frame_cnt % rc.window_size == 0) { rc.window_count += 1; rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; rc.variance_st_encoding_bitrate += (sum_bitrate / rc.window_size) * (sum_bitrate / rc.window_size); sum_bitrate = 0.0; } } // Second shifted window. if (frame_cnt > rc.window_size + rc.window_size / 2 && slx == ss_number_layers - 1) { sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate; if (frame_cnt > 2 * rc.window_size && frame_cnt % rc.window_size == 0) { rc.window_count += 1; rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; rc.variance_st_encoding_bitrate += (sum_bitrate2 / rc.window_size) * (sum_bitrate2 / rc.window_size); sum_bitrate2 = 0.0; } } #if CONFIG_AV1_DECODER if (app_input.decode) { if (aom_codec_decode( &decoder, reinterpret_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz, NULL)) die_codec(&decoder, "Failed to decode frame"); } #endif break; case AOM_CODEC_PSNR_PKT: if (app_input.show_psnr) { psnr_stream.psnr_sse_total[0] += pkt->data.psnr.sse[0]; psnr_stream.psnr_samples_total[0] += pkt->data.psnr.samples[0]; for (int plane = 0; plane < 4; plane++) { psnr_stream.psnr_totals[0][plane] += pkt->data.psnr.psnr[plane]; } psnr_stream.psnr_count[0]++; } break; default: break; } } #if CONFIG_AV1_DECODER if (got_data && app_input.decode) { // Don't look for mismatch on top spatial and top temporal layers as // they are non reference frames.
if ((ss_number_layers > 1 || ts_number_layers > 1) && !(layer_id.temporal_layer_id > 0 && layer_id.temporal_layer_id == ts_number_layers - 1)) { if (test_decode(&codec, &decoder, frame_cnt)) { #if CONFIG_INTERNAL_STATS fprintf(stats_file, "First mismatch occurred in frame %d\n", frame_cnt); fclose(stats_file); #endif fatal("Mismatch seen"); } } } #endif } // loop over spatial layers ++frame_cnt; pts += frame_duration; } for (i = 0; i < MAX_NUM_SPATIAL_LAYERS; ++i) { if (app_input.input_ctx[i].filename == NULL) { break; } close_input_file(&(app_input.input_ctx[i])); } printout_rate_control_summary(&rc, frame_cnt, ss_number_layers, ts_number_layers); printf("\n"); for (int slx = 0; slx < ss_number_layers; slx++) for (int tlx = 0; tlx < ts_number_layers; tlx++) { int lx = slx * ts_number_layers + tlx; printf("Per layer encoding time/FPS stats for encoder: %d %d %d %f %f \n", slx, tlx, frame_cnt_layer[lx], (float)cx_time_layer[lx] / (double)(frame_cnt_layer[lx] * 1000), 1000000 * (double)frame_cnt_layer[lx] / (double)cx_time_layer[lx]); } printf("\n"); printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n", frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), 1000000 * (double)frame_cnt / (double)cx_time); if (app_input.show_psnr) { show_psnr(&psnr_stream, 255.0); } if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy encoder"); #if CONFIG_AV1_DECODER if (app_input.decode) { if (aom_codec_destroy(&decoder)) die_codec(&decoder, "Failed to destroy decoder"); } #endif #if CONFIG_INTERNAL_STATS fprintf(stats_file, "No mismatch detected in recon buffers\n"); fclose(stats_file); #endif // Try to rewrite the output file headers with the actual frame count. for (i = 0; i < ss_number_layers * ts_number_layers; ++i) aom_video_writer_close(outfile[i]); aom_video_writer_close(total_layer_file); if (has_non_y4m_input) { aom_img_free(&raw); } return EXIT_SUCCESS; } aom-3.12.1/examples/twopass_encoder.c000066400000000000000000000177721477627663500176000ustar00rootroot00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ // Two Pass Encoder // ================ // // This is an example of a two pass encoder loop. It takes an input file in // YV12 format, passes it through the encoder twice, and writes the compressed // frames to disk in IVF format. It builds upon the simple_encoder example. // // Twopass Variables // ----------------- // Twopass mode needs to track the current pass number and the buffer of // statistics packets. // // Updating The Configuration // --------------------------------- // In two pass mode, the configuration has to be updated on each pass. The // statistics buffer is passed on the last pass. // // Encoding A Frame // ---------------- // Encoding a frame in two pass mode is identical to the simple encoder // example. // // Processing Statistics Packets // ----------------------------- // Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data // for this frame. We write a IVF frame header, followed by the raw data. 
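//
// In outline, main() below drives the two passes roughly like this:
//
//   cfg.g_pass = AOM_RC_FIRST_PASS;   // pass 0: gather stats packets
//   stats = pass0(&raw, infile, encoder, &cfg, limit);
//   rewind(infile);
//   cfg.g_pass = AOM_RC_LAST_PASS;    // pass 1: consume the stats
//   cfg.rc_twopass_stats_in = stats;
//   pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
//   free(stats.buf);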
// //  // Pass Progress Reporting // ----------------------------- // It's sometimes helpful to see when each pass completes. // // // Clean-up // ----------------------------- // Destruction of the encoder instance must be done on each pass. The // raw image should be destroyed at the end as usual. #include <stdio.h> #include <stdlib.h> #include <string.h> #include "aom/aom_encoder.h" #include "aom/aomcx.h" #include "common/tools_common.h" #include "common/video_writer.h" static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " "<limit(optional)>\n", exec_name); exit(EXIT_FAILURE); } static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned int duration, aom_enc_frame_flags_t flags, aom_fixed_buf_t *stats) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_STATS_PKT) { const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; stats->buf = realloc(stats->buf, stats->sz + pkt_size); if (!stats->buf) die("Failed to allocate frame stats buffer."); memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); stats->sz += pkt_size; } } return got_pkts; } static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned int duration, aom_enc_frame_flags_t flags, AvxVideoWriter *writer) { int got_pkts = 0; aom_codec_iter_t iter = NULL; const aom_codec_cx_pkt_t *pkt = NULL; const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame."); while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) die_codec(ctx, "Failed to write compressed frame."); printf(keyframe ? "K" : "."); fflush(stdout); } } return got_pkts; } static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile, aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg, int limit) { aom_codec_ctx_t codec; int frame_count = 0; aom_fixed_buf_t stats = { NULL, 0 }; if (aom_codec_enc_init(&codec, encoder, cfg, 0)) die("Failed to initialize encoder"); // Calculate frame statistics. while (aom_img_read(raw, infile) && frame_count < limit) { ++frame_count; get_frame_stats(&codec, raw, frame_count, 1, 0, &stats); } // Flush encoder. while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) { } printf("Pass 0 complete. 
Processed %d frames.\n", frame_count); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); return stats; } static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg, int limit) { AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder), cfg->g_w, cfg->g_h, { cfg->g_timebase.num, cfg->g_timebase.den }, 0 }; AvxVideoWriter *writer = NULL; aom_codec_ctx_t codec; int frame_count = 0; writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); if (!writer) die("Failed to open %s for writing", outfile_name); if (aom_codec_enc_init(&codec, encoder, cfg, 0)) die("Failed to initialize encoder"); if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. while (aom_img_read(raw, infile) && frame_count < limit) { ++frame_count; encode_frame(&codec, raw, frame_count, 1, 0, writer); } // Flush encoder. while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { } printf("\n"); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); aom_video_writer_close(writer); printf("Pass 1 complete. Processed %d frames.\n", frame_count); } int main(int argc, char **argv) { FILE *infile = NULL; int w, h; aom_codec_ctx_t codec; aom_codec_enc_cfg_t cfg; aom_image_t raw; aom_codec_err_t res; aom_fixed_buf_t stats; const int fps = 30; // TODO(dkovalev) add command line argument const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument const char *const codec_arg = argv[1]; const char *const width_arg = argv[2]; const char *const height_arg = argv[3]; const char *const infile_arg = argv[4]; const char *const outfile_arg = argv[5]; int limit = 0; exec_name = argv[0]; if (argc < 6) die("Invalid number of arguments"); if (argc > 6) limit = (int)strtol(argv[6], NULL, 0); if (limit == 0) limit = 100; aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); if (!encoder) die("Unsupported codec."); w = (int)strtol(width_arg, NULL, 0); h = (int)strtol(height_arg, NULL, 0); if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) die("Invalid frame size: %dx%d", w, h); if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1)) die("Failed to allocate image (%dx%d)", w, h); printf("Using %s\n", aom_codec_iface_name(encoder)); // Configuration res = aom_codec_enc_config_default(encoder, &cfg, 0); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = w; cfg.g_h = h; cfg.g_timebase.num = 1; cfg.g_timebase.den = fps; cfg.rc_target_bitrate = bitrate; if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading", infile_arg); // Pass 0 cfg.g_pass = AOM_RC_FIRST_PASS; stats = pass0(&raw, infile, encoder, &cfg, limit); // Pass 1 rewind(infile); cfg.g_pass = AOM_RC_LAST_PASS; cfg.rc_twopass_stats_in = stats; pass1(&raw, infile, outfile_arg, encoder, &cfg, limit); free(stats.buf); aom_img_free(&raw); fclose(infile); return EXIT_SUCCESS; } aom-3.12.1/keywords.dox000066400000000000000000000041241477627663500147650ustar00rootroot00000000000000/*!\page rfc2119 RFC2119 Keywords The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119. Specifically, the following definitions are used: \section MUST \anchor REQUIRED \anchor SHALL This word, or the terms "REQUIRED" or "SHALL", mean that the definition is an absolute requirement of the specification. 
\section MUSTNOT MUST NOT \anchor SHALLNOT This phrase, or the phrase "SHALL NOT", mean that the definition is an absolute prohibition of the specification. \section SHOULD \anchor RECOMMENDED This word, or the adjective "RECOMMENDED", mean that there may exist valid reasons in particular circumstances to ignore a particular item, but the full implications must be understood and carefully weighed before choosing a different course. \section SHOULDNOT SHOULD NOT \anchor NOTRECOMMENDED This phrase, or the phrase "NOT RECOMMENDED" mean that there may exist valid reasons in particular circumstances when the particular behavior is acceptable or even useful, but the full implications should be understood and the case carefully weighed before implementing any behavior described with this label. \section MAY \anchor OPTIONAL This word, or the adjective "OPTIONAL", mean that an item is truly optional. One vendor may choose to include the item because a particular marketplace requires it or because the vendor feels that it enhances the product while another vendor may omit the same item. An implementation which does not include a particular option \ref MUST be prepared to interoperate with another implementation which does include the option, though perhaps with reduced functionality. In the same vein an implementation which does include a particular option \ref MUST be prepared to interoperate with another implementation which does not include the option (except, of course, for the feature the option provides.) */ aom-3.12.1/libs.doxy_template000066400000000000000000003161071477627663500161420ustar00rootroot00000000000000## Copyright (c) 2020, Alliance for Open Media. All rights reserved. ## ## This source code is subject to the terms of the BSD 2 Clause License and ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ## was not distributed with this source code in the LICENSE file, you can ## obtain it at www.aomedia.org/license/software. If the Alliance for Open ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## # Doxyfile 1.8.16 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the configuration # file that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. 
This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "AOMedia AV1 Codec" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docs # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. 
Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line # such as # /*************** # as being the beginning of a Javadoc-style comment "banner". If set to NO, the # Javadoc-style will behave just like regular comments and it will not be # interpreted by doxygen. # The default value is: NO. JAVADOC_BANNER = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. 
If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines (in the resulting output). You can put ^^ in the value part of an # alias to insert a newline as if a physical newline was in the original file. # When you need a literal { or } or , in the value part of an alias you have to # escape them by means of a backslash (\), this can lead to conflicts with the # commands \{ and \} for these it is advised to use the version @{ and @} or use # a double escape (\\{ and \\}) ALIASES = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice # sources only. Doxygen will then generate output that is more tailored for that # language. 
For instance, namespaces will be presented as modules, types will be # separated into more groups, etc. # The default value is: NO. OPTIMIZE_OUTPUT_SLICE = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, # Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser # tries to guess whether the code is fixed or free formatted code, this is the # default for Fortran type files), VHDL, tcl. For instance to make doxygen treat # .inc files as Fortran files (default is PHP), and .f files as C (default is # Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 5. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 5 # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. 
SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. 
At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. # The default value is: NO. EXTRACT_PRIV_VIRTUAL = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = YES # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. 
HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # (including Cygwin) ands Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. 
SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). 
# The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = YES # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. If # EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. 
WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. 
# # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # # # where is the value of the INPUT_FILTER tag, and is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. 
A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = YES # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = YES # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). 
# # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the # path to the compilation database (see: # http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files # were built. This is equivalent to specifying the "-p" option to a clang tool, # such as clang-check. These options will then be passed to the parser. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. CLANG_DATABASE_PATH = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = NO # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. 
# This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. 
Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via Javascript. If disabled, the navigation index will # consists of multiple levels of tabs that are statically embedded in every HTML # page. Disable this option to support browsers that do not have Javascript, # like the Qt help browser. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_MENUS = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: https://developer.apple.com/xcode/), introduced with OSX # 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. 
See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. 
The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = YES # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_FONTSIZE = 10

# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
# https://www.mathjax.org) which uses client side Javascript for the rendering
# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
# installed or if you want the formulas to look prettier in the HTML output.
# When enabled you may also need to install MathJax separately and configure the
# path to it using the MATHJAX_RELPATH option.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

USE_MATHJAX = YES

# When MathJax is enabled you can set the default output format to be used for
# the MathJax output. See the MathJax site (see:
# http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best
# compatibility), NativeMML (i.e. MathML) and SVG.
# The default value is: HTML-CSS.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_FORMAT = HTML-CSS

# When MathJax is enabled you need to specify the location relative to the HTML
# output directory using the MATHJAX_RELPATH option. The destination directory
# should contain the MathJax.js script. For instance, if the mathjax directory
# is located at the same level as the HTML output directory, then
# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
# Content Delivery Network so you can quickly see the result without installing
# MathJax. However, it is strongly recommended to install a local copy of
# MathJax from https://www.mathjax.org before deployment.
# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/

# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
# extension names that should be enabled during MathJax rendering. For example
# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_EXTENSIONS =

# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
# of code that will be used on startup of the MathJax code. See the MathJax site
# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
# example see the documentation.
# This tag requires that the tag USE_MATHJAX is set to YES.

MATHJAX_CODEFILE =

# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
# the HTML output. The underlying search engine uses javascript and DHTML and
# should work on any modern browser. Note that when using HTML help
# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
# there is already a search function so this one should typically be disabled.
# For large projects the javascript based search engine can be slow, then
# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
# search using the keyboard; to jump to the search box use <access key> + S
# (what the <access key> is depends on the OS and browser, but it is typically
# <CTRL>, <ALT>/